Diffstat (limited to 'fs')
480 files changed, 11387 insertions, 16598 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 6181ad79e1a5..adaf6f6dd858 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -34,6 +34,7 @@
 #include <linux/idr.h>
 #include <linux/sched.h>
 #include <linux/uio.h>
+#include <linux/bvec.h>
 #include <net/9p/9p.h>
 #include <net/9p/client.h>
@@ -309,18 +310,10 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
 	p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping);
 
-	if (unlikely(copied < len)) {
-		/*
-		 * zero out the rest of the area
-		 */
-		unsigned from = pos & (PAGE_SIZE - 1);
-
-		zero_user(page, from + copied, len - copied);
-		flush_dcache_page(page);
+	if (unlikely(copied < len && !PageUptodate(page))) {
+		copied = 0;
+		goto out;
 	}
-
-	if (!PageUptodate(page))
-		SetPageUptodate(page);
 	/*
 	 * No need to use i_size_read() here, the i_size
 	 * cannot change under us because we hold the i_mutex.
@@ -330,6 +323,7 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping,
 		i_size_write(inode, last_pos);
 	}
 	set_page_dirty(page);
+out:
 	unlock_page(page);
 	put_page(page);
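[Editor's note] The v9fs_write_end() change follows the address_space_operations contract for short copies: if the copy from userspace transferred fewer bytes than requested and the page was never read up to date, the safe response is to report zero bytes written so generic_perform_write() re-faults the page and retries, rather than zero-filling the gap. A minimal sketch of that rule, assuming the usual ->write_end() signature (hypothetical my_write_end(), not the 9p code):

static int my_write_end(struct file *file, struct address_space *mapping,
			loff_t pos, unsigned len, unsigned copied,
			struct page *page, void *fsdata)
{
	struct inode *inode = mapping->host;

	/* Short copy into a page we never brought uptodate: claim that
	 * nothing landed, so the caller faults the source in and retries. */
	if (unlikely(copied < len && !PageUptodate(page)))
		copied = 0;

	if (copied) {
		if (pos + copied > inode->i_size)
			i_size_write(inode, pos + copied);
		set_page_dirty(page);
	}
	unlock_page(page);
	put_page(page);
	return copied;
}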
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index d7b78d531e63..6a0f3fa85ef7 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -34,7 +34,7 @@
 #include <linux/list.h>
 #include <linux/pagemap.h>
 #include <linux/utsname.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <linux/idr.h>
 #include <linux/uio.h>
 #include <linux/slab.h>
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 30ca770c5e0b..f4f4450119e4 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -1464,7 +1464,6 @@ static const struct inode_operations v9fs_file_inode_operations = {
 };
 
 static const struct inode_operations v9fs_symlink_inode_operations = {
-	.readlink = generic_readlink,
 	.get_link = v9fs_vfs_get_link,
 	.getattr = v9fs_vfs_getattr,
 	.setattr = v9fs_vfs_setattr,
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index afaa4b6de801..5999bd050678 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -979,7 +979,6 @@ const struct inode_operations v9fs_file_inode_operations_dotl = {
 };
 
 const struct inode_operations v9fs_symlink_inode_operations_dotl = {
-	.readlink = generic_readlink,
 	.get_link = v9fs_vfs_get_link_dotl,
 	.getattr = v9fs_vfs_getattr_dotl,
 	.setattr = v9fs_vfs_setattr_dotl,
diff --git a/fs/Kconfig b/fs/Kconfig
index 4bd03a2b0518..83eab52fb3f6 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -38,6 +38,7 @@ config FS_DAX
 	bool "Direct Access (DAX) support"
 	depends on MMU
 	depends on !(ARM || MIPS || SPARC)
+	select FS_IOMAP
 	help
 	  Direct Access (DAX) can be used on memory-backed block devices.
 	  If the block device supports DAX and the filesystem supports DAX,
@@ -55,7 +56,6 @@ config FS_DAX_PMD
 	depends on FS_DAX
 	depends on ZONE_DEVICE
 	depends on TRANSPARENT_HUGEPAGE
-	depends on BROKEN
 
 endif # BLOCK
@@ -235,7 +235,6 @@ source "fs/efs/Kconfig"
 source "fs/jffs2/Kconfig"
 # UBIFS File system configuration
 source "fs/ubifs/Kconfig"
-source "fs/logfs/Kconfig"
 source "fs/cramfs/Kconfig"
 source "fs/squashfs/Kconfig"
 source "fs/freevxfs/Kconfig"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index 4c09d93d9569..b2f82cf6bf86 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -170,8 +170,8 @@ config BINFMT_MISC
 
 	  You can do other nice things, too. Read the file
 	  <file:Documentation/binfmt_misc.txt> to learn how to use this
-	  feature, <file:Documentation/java.txt> for information about how
-	  to include Java support. and <file:Documentation/mono.txt> for
+	  feature, <file:Documentation/admin-guide/java.rst> for information about how
+	  to include Java support. and <file:Documentation/admin-guide/mono.rst> for
 	  information about how to include Mono-based .NET support.
 
 	  To use binfmt_misc, you will need to mount it:
diff --git a/fs/Makefile b/fs/Makefile
index ed2b63257ba9..7bbaca9c67b1 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -97,7 +97,6 @@ obj-$(CONFIG_NTFS_FS)		+= ntfs/
 obj-$(CONFIG_UFS_FS)		+= ufs/
 obj-$(CONFIG_EFS_FS)		+= efs/
 obj-$(CONFIG_JFFS2_FS)		+= jffs2/
-obj-$(CONFIG_LOGFS)		+= logfs/
 obj-$(CONFIG_UBIFS_FS)		+= ubifs/
 obj-$(CONFIG_AFFS_FS)		+= affs/
 obj-$(CONFIG_ROMFS_FS)		+= romfs/
diff --git a/fs/affs/symlink.c b/fs/affs/symlink.c
index 69b03dbb792f..ae622cdce142 100644
--- a/fs/affs/symlink.c
+++ b/fs/affs/symlink.c
@@ -70,7 +70,6 @@ const struct address_space_operations affs_symlink_aops = {
 };
 
 const struct inode_operations affs_symlink_inode_operations = {
-	.readlink	= generic_readlink,
 	.get_link	= page_get_link,
 	.setattr	= affs_notify_change,
 };
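[Editor's note] Several hunks in this diff (9p, affs, and autofs4 further down) drop ".readlink = generic_readlink" from symlink inode_operations. The apparent rationale: in this series readlink(2) falls back to the generic implementation whenever ->readlink is NULL and the inode provides ->get_link(), so the explicit initializer became redundant. Under that assumption, a symlink ops table now only needs the link resolver; a sketch with placeholder names (not any real filesystem's code):

/* Sketch: symlink inode_operations after the cleanup. readlink(2) is
 * served by the VFS default because ->get_link() is provided. */
static const char *myfs_get_link(struct dentry *dentry, struct inode *inode,
				 struct delayed_call *done)
{
	return inode->i_link;	/* link body cached at inode setup time */
}

static const struct inode_operations myfs_symlink_iops = {
	/* .readlink = generic_readlink,  -- no longer needed */
	.get_link = myfs_get_link,
};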
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index 2853b4095344..35efb9a31dd7 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -14,7 +14,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/sched.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include "internal.h"
 
 static struct proc_dir_entry *proc_afs;
diff --git a/fs/aio.c b/fs/aio.c
@@ -42,7 +42,7 @@
 #include <linux/mount.h>
 
 #include <asm/kmap_types.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 #include "internal.h"
@@ -277,10 +277,10 @@ static void put_aio_ring_file(struct kioctx *ctx)
 	struct address_space *i_mapping;
 
 	if (aio_ring_file) {
-		truncate_setsize(aio_ring_file->f_inode, 0);
+		truncate_setsize(file_inode(aio_ring_file), 0);
 
 		/* Prevent further access to the kioctx from migratepages */
-		i_mapping = aio_ring_file->f_inode->i_mapping;
+		i_mapping = aio_ring_file->f_mapping;
 		spin_lock(&i_mapping->private_lock);
 		i_mapping->private_data = NULL;
 		ctx->aio_ring_file = NULL;
@@ -483,7 +483,7 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	for (i = 0; i < nr_pages; i++) {
 		struct page *page;
-		page = find_or_create_page(file->f_inode->i_mapping,
+		page = find_or_create_page(file->f_mapping,
 					   i, GFP_HIGHUSER | __GFP_ZERO);
 		if (!page)
 			break;
@@ -1085,7 +1085,8 @@ static void aio_complete(struct kiocb *kiocb, long res, long res2)
 		 * Tell lockdep we inherited freeze protection from submission
 		 * thread.
 		 */
-		__sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE);
+		if (S_ISREG(file_inode(file)->i_mode))
+			__sb_writers_acquired(file_inode(file)->i_sb, SB_FREEZE_WRITE);
 		file_end_write(file);
 	}
@@ -1285,7 +1286,7 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
 			struct io_event __user *event,
 			struct timespec __user *timeout)
 {
-	ktime_t until = { .tv64 = KTIME_MAX };
+	ktime_t until = KTIME_MAX;
 	long ret = 0;
 
 	if (timeout) {
@@ -1311,7 +1312,7 @@ static long read_events(struct kioctx *ctx, long min_nr, long nr,
 	 * the ringbuffer empty. So in practice we should be ok, but it's
 	 * something to be aware of when touching this code.
 	 */
-	if (until.tv64 == 0)
+	if (until == 0)
 		aio_read_events(ctx, min_nr, nr, event, &ret);
 	else
 		wait_event_interruptible_hrtimeout(ctx->wait,
@@ -1367,6 +1368,39 @@ out:
 	return ret;
 }
 
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p)
+{
+	struct kioctx *ioctx = NULL;
+	unsigned long ctx;
+	long ret;
+
+	ret = get_user(ctx, ctx32p);
+	if (unlikely(ret))
+		goto out;
+
+	ret = -EINVAL;
+	if (unlikely(ctx || nr_events == 0)) {
+		pr_debug("EINVAL: ctx %lu nr_events %u\n",
+			 ctx, nr_events);
+		goto out;
+	}
+
+	ioctx = ioctx_alloc(nr_events);
+	ret = PTR_ERR(ioctx);
+	if (!IS_ERR(ioctx)) {
+		/* truncating is ok because it's a user address */
+		ret = put_user((u32)ioctx->user_id, ctx32p);
+		if (ret)
+			kill_ioctx(current->mm, ioctx, NULL);
+		percpu_ref_put(&ioctx->users);
+	}
+
+out:
+	return ret;
+}
+#endif
+
 /* sys_io_destroy:
  *	Destroy the aio_context specified.  May cancel any outstanding
  *	AIOs and block on completion.  Will fail with -ENOSYS if not
@@ -1492,7 +1526,8 @@ static ssize_t aio_write(struct kiocb *req, struct iocb *iocb, bool vectored,
 		 * by telling it the lock got released so that it doesn't
 		 * complain about held lock when we return to userspace.
 		 */
-		__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
+		if (S_ISREG(file_inode(file)->i_mode))
+			__sb_writers_release(file_inode(file)->i_sb, SB_FREEZE_WRITE);
 	}
 	kfree(iovec);
 	return ret;
@@ -1591,8 +1626,8 @@ out_put_req:
 	return ret;
 }
 
-long do_io_submit(aio_context_t ctx_id, long nr,
-		  struct iocb __user *__user *iocbpp, bool compat)
+static long do_io_submit(aio_context_t ctx_id, long nr,
+			 struct iocb __user *__user *iocbpp, bool compat)
 {
 	struct kioctx *ctx;
 	long ret = 0;
@@ -1662,6 +1697,44 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr,
 	return do_io_submit(ctx_id, nr, iocbpp, 0);
 }
 
+#ifdef CONFIG_COMPAT
+static inline long
+copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64)
+{
+	compat_uptr_t uptr;
+	int i;
+
+	for (i = 0; i < nr; ++i) {
+		if (get_user(uptr, ptr32 + i))
+			return -EFAULT;
+		if (put_user(compat_ptr(uptr), ptr64 + i))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+#define MAX_AIO_SUBMITS	(PAGE_SIZE/sizeof(struct iocb *))
+
+COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id,
+		       int, nr, u32 __user *, iocb)
+{
+	struct iocb __user * __user *iocb64;
+	long ret;
+
+	if (unlikely(nr < 0))
+		return -EINVAL;
+
+	if (nr > MAX_AIO_SUBMITS)
+		nr = MAX_AIO_SUBMITS;
+
+	iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64));
+	ret = copy_iocb(nr, iocb, iocb64);
+	if (!ret)
+		ret = do_io_submit(ctx_id, nr, iocb64, 1);
+	return ret;
+}
+#endif
+
 /* lookup_kiocb
  *	Finds a given iocb for cancellation.
  */
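[Editor's note] The new COMPAT_SYSCALL_DEFINE wrappers exist because a 32-bit task hands the kernel 32-bit pointers: io_submit() takes an array of iocb pointers, so each entry has to be widened before the native do_io_submit() can walk it, which is exactly what copy_iocb() above does. For reference, the native call sequence these entry points serve looks roughly like this from userspace (a sketch using raw syscall(2); error handling is minimal and not production quality):

/* Sketch: one synchronous AIO read via the raw syscalls.
 * Assumes <linux/aio_abi.h> for aio_context_t / struct iocb. */
#include <linux/aio_abi.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <string.h>

int submit_one_read(int fd, void *buf, size_t len)
{
	aio_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;

	if (syscall(SYS_io_setup, 128, &ctx) < 0)
		return -1;

	memset(&cb, 0, sizeof(cb));
	cb.aio_fildes = fd;
	cb.aio_lio_opcode = IOCB_CMD_PREAD;
	cb.aio_buf = (__u64)(unsigned long)buf;	/* 32-bit task: zero-extended */
	cb.aio_nbytes = len;

	if (syscall(SYS_io_submit, ctx, 1, cbs) != 1)
		return -1;
	if (syscall(SYS_io_getevents, ctx, 1, 1, &ev, NULL) != 1)
		return -1;
	return (int)syscall(SYS_io_destroy, ctx);
}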
@@ -1761,3 +1834,25 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 	}
 	return ret;
 }
+
+#ifdef CONFIG_COMPAT
+COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id,
+		       compat_long_t, min_nr,
+		       compat_long_t, nr,
+		       struct io_event __user *, events,
+		       struct compat_timespec __user *, timeout)
+{
+	struct timespec t;
+	struct timespec __user *ut = NULL;
+
+	if (timeout) {
+		if (compat_get_timespec(&t, timeout))
+			return -EFAULT;
+
+		ut = compat_alloc_user_space(sizeof(*ut));
+		if (copy_to_user(ut, &t, sizeof(t)))
+			return -EFAULT;
+	}
+	return sys_io_getevents(ctx_id, min_nr, nr, events, ut);
+}
+#endif
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 80ef38c73e5a..3168ee4e77f4 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -20,7 +20,7 @@
 #include <linux/magic.h>
 #include <linux/anon_inodes.h>
 
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 
 static struct vfsmount *anon_inode_mnt __read_mostly;
 static struct inode *anon_inode_inode;
diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h
index a1fba4285277..c885daae68c8 100644
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -145,7 +145,7 @@ void autofs4_free_ino(struct autofs_info *);
 
 /* Expiration */
 int is_autofs4_dentry(struct dentry *);
-int autofs4_expire_wait(struct dentry *dentry, int rcu_walk);
+int autofs4_expire_wait(const struct path *path, int rcu_walk);
 int autofs4_expire_run(struct super_block *, struct vfsmount *,
 		       struct autofs_sb_info *,
 		       struct autofs_packet_expire __user *);
@@ -217,7 +217,8 @@ static inline int autofs_prepare_pipe(struct file *pipe)
 
 /* Queue management functions */
 
-int autofs4_wait(struct autofs_sb_info *, struct dentry *, enum autofs_notify);
+int autofs4_wait(struct autofs_sb_info *,
+		 const struct path *, enum autofs_notify);
 int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int);
 void autofs4_catatonic_mode(struct autofs_sb_info *);
diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c
index fc09eb77ddf3..6f48d670c941 100644
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -204,7 +204,7 @@ static int autofs_dev_ioctl_protosubver(struct file *fp,
 /* Find the topmost mount satisfying test() */
 static int find_autofs_mount(const char *pathname,
 			     struct path *res,
-			     int test(struct path *path, void *data),
+			     int test(const struct path *path, void *data),
 			     void *data)
 {
 	struct path path;
@@ -230,12 +230,12 @@ static int find_autofs_mount(const char *pathname,
 	return err;
 }
 
-static int test_by_dev(struct path *path, void *p)
+static int test_by_dev(const struct path *path, void *p)
 {
 	return path->dentry->d_sb->s_dev == *(dev_t *)p;
 }
 
-static int test_by_type(struct path *path, void *p)
+static int test_by_type(const struct path *path, void *p)
 {
 	struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
 
@@ -468,7 +468,7 @@ static int autofs_dev_ioctl_requester(struct file *fp,
 	ino = autofs4_dentry_ino(path.dentry);
 	if (ino) {
 		err = 0;
-		autofs4_expire_wait(path.dentry, 0);
+		autofs4_expire_wait(&path, 0);
 		spin_lock(&sbi->fs_lock);
 		param->requester.uid =
 			from_kuid_munged(current_user_ns(), ino->uid);
@@ -575,7 +575,7 @@ static int autofs_dev_ioctl_ismountpoint(struct file *fp,
 
 		devid = new_encode_dev(dev);
 
-		err = have_submounts(path.dentry);
+		err = path_has_submounts(&path);
 
 		if (follow_down_one(&path))
 			magic = path.dentry->d_sb->s_magic;
diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c
index d8e6d421c27f..57725d4a8c59 100644
--- a/fs/autofs4/expire.c
+++ b/fs/autofs4/expire.c
@@ -310,26 +310,29 @@ struct dentry *autofs4_expire_direct(struct super_block *sb,
 	now = jiffies;
 	timeout = sbi->exp_timeout;
 
-	spin_lock(&sbi->fs_lock);
-	ino = autofs4_dentry_ino(root);
-	/* No point expiring a pending mount */
-	if (ino->flags & AUTOFS_INF_PENDING)
-		goto out;
 	if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
+		spin_lock(&sbi->fs_lock);
+		ino = autofs4_dentry_ino(root);
+		/* No point expiring a pending mount */
+		if (ino->flags & AUTOFS_INF_PENDING) {
+			spin_unlock(&sbi->fs_lock);
+			goto out;
+		}
 		ino->flags |= AUTOFS_INF_WANT_EXPIRE;
 		spin_unlock(&sbi->fs_lock);
 		synchronize_rcu();
-		spin_lock(&sbi->fs_lock);
 		if (!autofs4_direct_busy(mnt, root, timeout, do_now)) {
+			spin_lock(&sbi->fs_lock);
 			ino->flags |= AUTOFS_INF_EXPIRING;
 			init_completion(&ino->expire_complete);
 			spin_unlock(&sbi->fs_lock);
 			return root;
 		}
+		spin_lock(&sbi->fs_lock);
 		ino->flags &= ~AUTOFS_INF_WANT_EXPIRE;
+		spin_unlock(&sbi->fs_lock);
 	}
 out:
-	spin_unlock(&sbi->fs_lock);
 	dput(root);
 
 	return NULL;
@@ -495,8 +498,9 @@ found:
 	return expired;
 }
 
-int autofs4_expire_wait(struct dentry *dentry, int rcu_walk)
+int autofs4_expire_wait(const struct path *path, int rcu_walk)
 {
+	struct dentry *dentry = path->dentry;
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 	int status;
@@ -525,7 +529,7 @@ retry:
 
 		pr_debug("waiting for expire %p name=%pd\n", dentry, dentry);
 
-		status = autofs4_wait(sbi, dentry, NFY_NONE);
+		status = autofs4_wait(sbi, path, NFY_NONE);
 		wait_for_completion(&ino->expire_complete);
 
 		pr_debug("expire done status=%d\n", status);
@@ -592,11 +596,12 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt,
 
 	if (dentry) {
 		struct autofs_info *ino = autofs4_dentry_ino(dentry);
+		const struct path path = { .mnt = mnt, .dentry = dentry };
 
 		/* This is synchronous because it makes the daemon a
 		 * little easier */
-		ret = autofs4_wait(sbi, dentry, NFY_EXPIRE);
+		ret = autofs4_wait(sbi, &path, NFY_EXPIRE);
 
 		spin_lock(&sbi->fs_lock);
 		/* avoid rapid-fire expire attempts if expiry fails */
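[Editor's note] A pattern repeated throughout the autofs4 changes: APIs that used to take a bare dentry now take a const struct path *, and call sites that only hold a (vfsmount, dentry) pair build one on the stack, as autofs4_do_expire_multi() does above. A struct path is exactly that pair, which is what lets helpers like path_is_mountpoint() and path_has_submounts() answer mount questions relative to a specific mount tree rather than for the dentry globally. Sketch of the composition idiom (wait_on_it() is a hypothetical path-taking helper, not a kernel function):

#include <linux/path.h>

int wait_on_it(const struct path *path);	/* hypothetical callee */

static int notify_expire(struct vfsmount *mnt, struct dentry *dentry)
{
	const struct path path = { .mnt = mnt, .dentry = dentry };

	/* The callee can now reason about this dentry as seen through
	 * this particular mount, not the dentry in isolation. */
	return wait_on_it(&path);
}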
diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c
index 438b5bf675b6..09e7d68dff02 100644
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -94,7 +94,7 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
 		seq_printf(m, ",indirect");
 #ifdef CONFIG_CHECKPOINT_RESTORE
 	if (sbi->pipe)
-		seq_printf(m, ",pipe_ino=%ld", sbi->pipe->f_inode->i_ino);
+		seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino);
 	else
 		seq_printf(m, ",pipe_ino=-1");
 #endif
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index a11f73174877..82e8f6edfb48 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -32,7 +32,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file);
 static struct dentry *autofs4_lookup(struct inode *,
 				     struct dentry *, unsigned int);
 static struct vfsmount *autofs4_d_automount(struct path *);
-static int autofs4_d_manage(struct dentry *, bool);
+static int autofs4_d_manage(const struct path *, bool);
 static void autofs4_dentry_release(struct dentry *);
 
 const struct file_operations autofs4_root_operations = {
@@ -123,7 +123,7 @@ static int autofs4_dir_open(struct inode *inode, struct file *file)
 	 * it.
 	 */
 	spin_lock(&sbi->lookup_lock);
-	if (!d_mountpoint(dentry) && simple_empty(dentry)) {
+	if (!path_is_mountpoint(&file->f_path) && simple_empty(dentry)) {
 		spin_unlock(&sbi->lookup_lock);
 		return -ENOENT;
 	}
@@ -269,39 +269,41 @@ next:
 	return NULL;
 }
 
-static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk)
+static int autofs4_mount_wait(const struct path *path, bool rcu_walk)
 {
-	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
-	struct autofs_info *ino = autofs4_dentry_ino(dentry);
+	struct autofs_sb_info *sbi = autofs4_sbi(path->dentry->d_sb);
+	struct autofs_info *ino = autofs4_dentry_ino(path->dentry);
 	int status = 0;
 
 	if (ino->flags & AUTOFS_INF_PENDING) {
 		if (rcu_walk)
 			return -ECHILD;
-		pr_debug("waiting for mount name=%pd\n", dentry);
-		status = autofs4_wait(sbi, dentry, NFY_MOUNT);
+		pr_debug("waiting for mount name=%pd\n", path->dentry);
+		status = autofs4_wait(sbi, path, NFY_MOUNT);
 		pr_debug("mount wait done status=%d\n", status);
 	}
 	ino->last_used = jiffies;
 	return status;
 }
 
-static int do_expire_wait(struct dentry *dentry, bool rcu_walk)
+static int do_expire_wait(const struct path *path, bool rcu_walk)
 {
+	struct dentry *dentry = path->dentry;
 	struct dentry *expiring;
 
 	expiring = autofs4_lookup_expiring(dentry, rcu_walk);
 	if (IS_ERR(expiring))
 		return PTR_ERR(expiring);
 	if (!expiring)
-		return autofs4_expire_wait(dentry, rcu_walk);
+		return autofs4_expire_wait(path, rcu_walk);
 	else {
+		const struct path this = { .mnt = path->mnt, .dentry = expiring };
 		/*
 		 * If we are racing with expire the request might not
 		 * be quite complete, but the directory has been removed
 		 * so it must have been successful, just wait for it.
 		 */
-		autofs4_expire_wait(expiring, 0);
+		autofs4_expire_wait(&this, 0);
 		autofs4_del_expiring(expiring);
 		dput(expiring);
 	}
@@ -354,7 +356,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 	 * and the directory was removed, so just go ahead and try
 	 * the mount.
 	 */
-	status = do_expire_wait(dentry, 0);
+	status = do_expire_wait(path, 0);
 	if (status && status != -EAGAIN)
 		return NULL;
 
@@ -362,7 +364,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 	spin_lock(&sbi->fs_lock);
 	if (ino->flags & AUTOFS_INF_PENDING) {
 		spin_unlock(&sbi->fs_lock);
-		status = autofs4_mount_wait(dentry, 0);
+		status = autofs4_mount_wait(path, 0);
 		if (status)
 			return ERR_PTR(status);
 		goto done;
@@ -370,28 +372,28 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 
 	/*
 	 * If the dentry is a symlink it's equivalent to a directory
-	 * having d_mountpoint() true, so there's no need to call back
-	 * to the daemon.
+	 * having path_is_mountpoint() true, so there's no need to call
+	 * back to the daemon.
 	 */
 	if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
 		spin_unlock(&sbi->fs_lock);
 		goto done;
 	}
 
-	if (!d_mountpoint(dentry)) {
+	if (!path_is_mountpoint(path)) {
 		/*
 		 * It's possible that user space hasn't removed directories
 		 * after umounting a rootless multi-mount, although it
-		 * should. For v5 have_submounts() is sufficient to handle
-		 * this because the leaves of the directory tree under the
-		 * mount never trigger mounts themselves (they have an autofs
-		 * trigger mount mounted on them). But v4 pseudo direct mounts
-		 * do need the leaves to trigger mounts. In this case we
-		 * have no choice but to use the list_empty() check and
+		 * should. For v5 path_has_submounts() is sufficient to
+		 * handle this because the leaves of the directory tree under
+		 * the mount never trigger mounts themselves (they have an
+		 * autofs trigger mount mounted on them). But v4 pseudo direct
+		 * mounts do need the leaves to trigger mounts. In this case
+		 * we have no choice but to use the list_empty() check and
 		 * require user space behave.
 		 */
 		if (sbi->version > 4) {
-			if (have_submounts(dentry)) {
+			if (path_has_submounts(path)) {
 				spin_unlock(&sbi->fs_lock);
 				goto done;
 			}
@@ -403,7 +405,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path)
 	}
 	ino->flags |= AUTOFS_INF_PENDING;
 	spin_unlock(&sbi->fs_lock);
-	status = autofs4_mount_wait(dentry, 0);
+	status = autofs4_mount_wait(path, 0);
 	spin_lock(&sbi->fs_lock);
 	ino->flags &= ~AUTOFS_INF_PENDING;
 	if (status) {
@@ -421,8 +423,9 @@ done:
 	return NULL;
 }
 
-static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
+static int autofs4_d_manage(const struct path *path, bool rcu_walk)
 {
+	struct dentry *dentry = path->dentry;
 	struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb);
 	struct autofs_info *ino = autofs4_dentry_ino(dentry);
 	int status;
@@ -431,20 +434,20 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 
 	/* The daemon never waits. */
 	if (autofs4_oz_mode(sbi)) {
-		if (!d_mountpoint(dentry))
+		if (!path_is_mountpoint(path))
 			return -EISDIR;
 		return 0;
 	}
 
 	/* Wait for pending expires */
-	if (do_expire_wait(dentry, rcu_walk) == -ECHILD)
+	if (do_expire_wait(path, rcu_walk) == -ECHILD)
 		return -ECHILD;
 
 	/*
 	 * This dentry may be under construction so wait on mount
 	 * completion.
 	 */
-	status = autofs4_mount_wait(dentry, rcu_walk);
+	status = autofs4_mount_wait(path, rcu_walk);
 	if (status)
 		return status;
 
@@ -460,7 +463,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 
 		if (ino->flags & AUTOFS_INF_WANT_EXPIRE)
 			return 0;
-		if (d_mountpoint(dentry))
+		if (path_is_mountpoint(path))
 			return 0;
 		inode = d_inode_rcu(dentry);
 		if (inode && S_ISLNK(inode->i_mode))
@@ -487,7 +490,7 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk)
 		 * we can avoid needless calls ->d_automount() and avoid
 		 * an incorrect ELOOP error return.
 		 */
-		if ((!d_mountpoint(dentry) && !simple_empty(dentry)) ||
+		if ((!path_is_mountpoint(path) && !simple_empty(dentry)) ||
 		    (d_really_is_positive(dentry) && d_is_symlink(dentry)))
 			status = -EISDIR;
 	}
diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c
index 99aab00dc217..ab0b4285a202 100644
--- a/fs/autofs4/symlink.c
+++ b/fs/autofs4/symlink.c
@@ -25,6 +25,5 @@ static const char *autofs4_get_link(struct dentry *dentry,
 }
 
 const struct inode_operations autofs4_symlink_inode_operations = {
-	.readlink	= generic_readlink,
 	.get_link	= autofs4_get_link
 };
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index e44271dfceb6..1278335ce366 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -250,8 +250,9 @@ autofs4_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr)
 static int validate_request(struct autofs_wait_queue **wait,
 			    struct autofs_sb_info *sbi,
 			    const struct qstr *qstr,
-			    struct dentry *dentry, enum autofs_notify notify)
+			    const struct path *path, enum autofs_notify notify)
 {
+	struct dentry *dentry = path->dentry;
 	struct autofs_wait_queue *wq;
 	struct autofs_info *ino;
 
@@ -314,6 +315,7 @@ static int validate_request(struct autofs_wait_queue **wait,
 	 */
 	if (notify == NFY_MOUNT) {
 		struct dentry *new = NULL;
+		struct path this;
 		int valid = 1;
 
 		/*
@@ -333,7 +335,9 @@ static int validate_request(struct autofs_wait_queue **wait,
 				dentry = new;
 			}
 		}
-		if (have_submounts(dentry))
+		this.mnt = path->mnt;
+		this.dentry = dentry;
+		if (path_has_submounts(&this))
 			valid = 0;
 
 		if (new)
@@ -345,8 +349,9 @@ static int validate_request(struct autofs_wait_queue **wait,
 }
 
 int autofs4_wait(struct autofs_sb_info *sbi,
-		 struct dentry *dentry, enum autofs_notify notify)
+		 const struct path *path, enum autofs_notify notify)
 {
+	struct dentry *dentry = path->dentry;
 	struct autofs_wait_queue *wq;
 	struct qstr qstr;
 	char *name;
@@ -405,7 +410,7 @@ int autofs4_wait(struct autofs_sb_info *sbi,
 		return -EINTR;
 	}
 
-	ret = validate_request(&wq, sbi, &qstr, dentry, notify);
+	ret = validate_request(&wq, sbi, &qstr, path, notify);
 	if (ret <= 0) {
 		if (ret != -EINTR)
 			mutex_unlock(&sbi->wq_mutex);
diff --git a/fs/bad_inode.c b/fs/bad_inode.c
index 8712062275b8..5f685c819298 100644
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -106,6 +106,50 @@ static ssize_t bad_inode_listxattr(struct dentry *dentry, char *buffer,
 	return -EIO;
 }
 
+static const char *bad_inode_get_link(struct dentry *dentry,
+				      struct inode *inode,
+				      struct delayed_call *done)
+{
+	return ERR_PTR(-EIO);
+}
+
+static struct posix_acl *bad_inode_get_acl(struct inode *inode, int type)
+{
+	return ERR_PTR(-EIO);
+}
+
+static int bad_inode_fiemap(struct inode *inode,
+			    struct fiemap_extent_info *fieinfo, u64 start,
+			    u64 len)
+{
+	return -EIO;
+}
+
+static int bad_inode_update_time(struct inode *inode, struct timespec *time,
+				 int flags)
+{
+	return -EIO;
+}
+
+static int bad_inode_atomic_open(struct inode *inode, struct dentry *dentry,
+				 struct file *file, unsigned int open_flag,
+				 umode_t create_mode, int *opened)
+{
+	return -EIO;
+}
+
+static int bad_inode_tmpfile(struct inode *inode, struct dentry *dentry,
+			     umode_t mode)
+{
+	return -EIO;
+}
+
+static int bad_inode_set_acl(struct inode *inode, struct posix_acl *acl,
+			     int type)
+{
+	return -EIO;
+}
+
 static const struct inode_operations bad_inode_ops =
 {
 	.create		= bad_inode_create,
@@ -118,14 +162,17 @@ static const struct inode_operations bad_inode_ops =
 	.mknod		= bad_inode_mknod,
 	.rename		= bad_inode_rename2,
 	.readlink	= bad_inode_readlink,
-	/* follow_link must be no-op, otherwise unmounting this inode
-	   won't work */
-	/* put_link returns void */
-	/* truncate returns void */
 	.permission	= bad_inode_permission,
 	.getattr	= bad_inode_getattr,
 	.setattr	= bad_inode_setattr,
 	.listxattr	= bad_inode_listxattr,
+	.get_link	= bad_inode_get_link,
+	.get_acl	= bad_inode_get_acl,
+	.fiemap		= bad_inode_fiemap,
+	.update_time	= bad_inode_update_time,
+	.atomic_open	= bad_inode_atomic_open,
+	.tmpfile	= bad_inode_tmpfile,
+	.set_acl	= bad_inode_set_acl,
 };
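[Editor's note] Context for the bad_inode hunk: when an inode_operations slot is NULL, the VFS does not fail the call; it applies a default (a NULL ->update_time falls back to updating the timestamps generically, a NULL ->tmpfile yields -EOPNOTSUPP, and so on), so a corrupted inode could still be operated on through those fallbacks. Pinning every method to an explicit -EIO makes the failure immediate and uniform. The same fail-fast pattern, reduced to one illustrative stub (hypothetical, not kernel code):

static int broken_update_time(struct inode *inode, struct timespec *time,
			      int flags)
{
	/* A NULL slot would let the VFS touch timestamps on a bad
	 * inode via its generic fallback; an explicit stub cannot. */
	return -EIO;
}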
diff --git a/fs/befs/befs.h b/fs/befs/befs.h
index c6bad51d8ec7..b914cfb03820 100644
--- a/fs/befs/befs.h
+++ b/fs/befs/befs.h
@@ -129,6 +129,7 @@ static inline befs_inode_addr
 blockno2iaddr(struct super_block *sb, befs_blocknr_t blockno)
 {
 	befs_inode_addr iaddr;
+
 	iaddr.allocation_group = blockno >> BEFS_SB(sb)->ag_shift;
 	iaddr.start =
 	    blockno - (iaddr.allocation_group << BEFS_SB(sb)->ag_shift);
@@ -140,7 +141,7 @@ blockno2iaddr(struct super_block *sb, befs_blocknr_t blockno)
 static inline unsigned int
 befs_iaddrs_per_block(struct super_block *sb)
 {
-	return BEFS_SB(sb)->block_size / sizeof (befs_disk_inode_addr);
+	return BEFS_SB(sb)->block_size / sizeof(befs_disk_inode_addr);
 }
 
 #include "endian.h"
diff --git a/fs/befs/befs_fs_types.h b/fs/befs/befs_fs_types.h
index eb557d9dc8be..69c9d8cde955 100644
--- a/fs/befs/befs_fs_types.h
+++ b/fs/befs/befs_fs_types.h
@@ -55,12 +55,12 @@ enum super_flags {
 };
 
 #define BEFS_BYTEORDER_NATIVE 0x42494745
-#define BEFS_BYTEORDER_NATIVE_LE (__force fs32)cpu_to_le32(BEFS_BYTEORDER_NATIVE)
-#define BEFS_BYTEORDER_NATIVE_BE (__force fs32)cpu_to_be32(BEFS_BYTEORDER_NATIVE)
+#define BEFS_BYTEORDER_NATIVE_LE ((__force fs32)cpu_to_le32(BEFS_BYTEORDER_NATIVE))
+#define BEFS_BYTEORDER_NATIVE_BE ((__force fs32)cpu_to_be32(BEFS_BYTEORDER_NATIVE))
 
 #define BEFS_SUPER_MAGIC BEFS_SUPER_MAGIC1
-#define BEFS_SUPER_MAGIC1_LE (__force fs32)cpu_to_le32(BEFS_SUPER_MAGIC1)
-#define BEFS_SUPER_MAGIC1_BE (__force fs32)cpu_to_be32(BEFS_SUPER_MAGIC1)
+#define BEFS_SUPER_MAGIC1_LE ((__force fs32)cpu_to_le32(BEFS_SUPER_MAGIC1))
+#define BEFS_SUPER_MAGIC1_BE ((__force fs32)cpu_to_be32(BEFS_SUPER_MAGIC1))
 
 /*
  * Flags of inode
@@ -79,7 +79,7 @@ enum inode_flags {
 	BEFS_INODE_WAS_WRITTEN = 0x00020000,
 	BEFS_NO_TRANSACTION = 0x00040000,
 };
-/* 
+/*
  * On-Disk datastructures of BeFS
  */
@@ -139,7 +139,7 @@ typedef struct {
 } PACKED befs_super_block;
 
-/* 
+/*
  * Note: the indirect and dbl_indir block_runs may
  * be longer than one block!
  */
diff --git a/fs/befs/btree.c b/fs/befs/btree.c
index 7e135ea73fdd..d509887c580c 100644
--- a/fs/befs/btree.c
+++ b/fs/befs/btree.c
@@ -12,8 +12,8 @@
  *
  * Dominic Giampaolo, author of "Practical File System
  * Design with the Be File System", for such a helpful book.
- * 
- * Marcus J. Ranum, author of the b+tree package in 
+ *
+ * Marcus J. Ranum, author of the b+tree package in
  * comp.sources.misc volume 10. This code is not copied from that
  * work, but it is partially based on it.
  *
@@ -38,38 +38,38 @@
 */

/* Befs B+tree structure:
- * 
+ *
 * The first thing in the tree is the tree superblock. It tells you
 * all kinds of useful things about the tree, like where the rootnode
 * is located, and the size of the nodes (always 1024 with current version
 * of BeOS).
 *
 * The rest of the tree consists of a series of nodes. Nodes contain a header
- * (struct befs_btree_nodehead), the packed key data, an array of shorts 
+ * (struct befs_btree_nodehead), the packed key data, an array of shorts
 * containing the ending offsets for each of the keys, and an array of
- * befs_off_t values. In interior nodes, the keys are the ending keys for 
- * the childnode they point to, and the values are offsets into the 
- * datastream containing the tree. 
+ * befs_off_t values. In interior nodes, the keys are the ending keys for
+ * the childnode they point to, and the values are offsets into the
+ * datastream containing the tree.
 */

/* Note:
- * 
- * The book states 2 confusing things about befs b+trees. First, 
+ *
+ * The book states 2 confusing things about befs b+trees. First,
 * it states that the overflow field of node headers is used by internal nodes
 * to point to another node that "effectively continues this one". Here is what
 * I believe that means. Each key in internal nodes points to another node that
- * contains key values less than itself. Inspection reveals that the last key 
- * in the internal node is not the last key in the index. Keys that are 
- * greater than the last key in the internal node go into the overflow node. 
+ * contains key values less than itself. Inspection reveals that the last key
+ * in the internal node is not the last key in the index. Keys that are
+ * greater than the last key in the internal node go into the overflow node.
 * I imagine there is a performance reason for this.
 *
- * Second, it states that the header of a btree node is sufficient to 
- * distinguish internal nodes from leaf nodes. Without saying exactly how. 
+ * Second, it states that the header of a btree node is sufficient to
+ * distinguish internal nodes from leaf nodes. Without saying exactly how.
 * After figuring out the first, it becomes obvious that internal nodes have
 * overflow nodes and leafnodes do not.
 */

-/* 
+/*
 * Currently, this code is only good for directory B+trees.
 * In order to be used for other BFS indexes, it needs to be extended to handle
 * duplicate keys and non-string keytypes (int32, int64, float, double).
@@ -237,8 +237,8 @@ befs_bt_read_node(struct super_block *sb, const befs_data_stream *ds,
 * with @key (usually the disk block number of an inode).
 *
 * On failure, returns BEFS_ERR or BEFS_BT_NOT_FOUND.
- * 
- * Algorithm: 
+ *
+ * Algorithm:
 *   Read the superblock and rootnode of the b+tree.
 *   Drill down through the interior nodes using befs_find_key().
 *   Once at the correct leaf node, use befs_find_key() again to get the
@@ -402,12 +402,12 @@ befs_find_key(struct super_block *sb, struct befs_btree_node *node,
 *
 * Here's how it works: Key_no is the index of the key/value pair to
 * return in keybuf/value.
- * Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is 
+ * Bufsize is the size of keybuf (BEFS_NAME_LEN+1 is a good size). Keysize is
 * the number of characters in the key (just a convenience).
 *
 * Algorithm:
 *   Get the first leafnode of the tree. See if the requested key is in that
- *   node. If not, follow the node->right link to the next leafnode. Repeat 
+ *   node. If not, follow the node->right link to the next leafnode. Repeat
 *   until the (key_no)th key is found or the tree is out of keys.
 */
int
@@ -536,7 +536,7 @@ befs_btree_read(struct super_block *sb, const befs_data_stream *ds,
 * @node_off: Pointer to offset of current node within datastream. Modified
 * 		by the function.
 *
- * Helper function for btree traverse. Moves the current position to the 
+ * Helper function for btree traverse. Moves the current position to the
 * start of the first leaf node.
 *
 * Also checks for an empty tree. If there are no keys, returns BEFS_BT_EMPTY.
@@ -592,10 +592,10 @@ befs_btree_seekleaf(struct super_block *sb, const befs_data_stream *ds,
 }
 
 /**
- * befs_leafnode - Determine if the btree node is a leaf node or an 
+ * befs_leafnode - Determine if the btree node is a leaf node or an
  * interior node
  * @node: Pointer to node structure to test
- * 
+ *
  * Return 1 if leaf, 0 if interior
  */
 static int
@@ -656,7 +656,7 @@ befs_bt_valarray(struct befs_btree_node *node)
  * @node: Pointer to the node structure to find the keydata array within
  *
  * Returns a pointer to the start of the keydata array
- * of the node pointed to by the node header 
+ * of the node pointed to by the node header
  */
 static char *
 befs_bt_keydata(struct befs_btree_node *node)
@@ -702,7 +702,7 @@ befs_bt_get_key(struct super_block *sb, struct befs_btree_node *node,
 
 /**
  * befs_compare_strings - compare two strings
- * @key1: pointer to the first key to be compared 
+ * @key1: pointer to the first key to be compared
  * @keylen1: length in bytes of key1
  * @key2: pointer to the second key to be compared
  * @keylen2: length in bytes of key2
diff --git a/fs/befs/btree.h b/fs/befs/btree.h
index f2a8f637e9e0..60c6c728e64e 100644
--- a/fs/befs/btree.h
+++ b/fs/befs/btree.h
@@ -1,13 +1,11 @@
 /*
  * btree.h
- * 
+ *
  */
 
-
 int befs_btree_find(struct super_block *sb, const befs_data_stream *ds,
-		    const char *key, befs_off_t * value);
+		    const char *key, befs_off_t *value);
 
 int befs_btree_read(struct super_block *sb, const befs_data_stream *ds,
 		    loff_t key_no, size_t bufsize, char *keybuf,
-		    size_t * keysize, befs_off_t * value);
-
+		    size_t *keysize, befs_off_t *value);
diff --git a/fs/befs/datastream.c b/fs/befs/datastream.c
index b4c7ba013c0d..720b3bc5c16a 100644
--- a/fs/befs/datastream.c
+++ b/fs/befs/datastream.c
@@ -84,13 +84,11 @@ befs_read_datastream(struct super_block *sb, const befs_data_stream *ds,
  *
  * Takes a file position and gives back a brun who's starting block
  * is block number fblock of the file.
- * 
+ *
  * Returns BEFS_OK or BEFS_ERR.
- * 
+ *
  * Calls specialized functions for each of the three possible
  * datastream regions.
- *
- * 2001-11-15 Will Dyson
  */
 int
 befs_fblock2brun(struct super_block *sb, const befs_data_stream *data,
@@ -120,7 +118,7 @@ befs_fblock2brun(struct super_block *sb, const befs_data_stream *data,
 
 /**
  * befs_read_lsmylink - read long symlink from datastream.
- * @sb: Filesystem superblock 
+ * @sb: Filesystem superblock
  * @ds: Datastream to read from
  * @buff: Buffer in which to place long symlink data
  * @len: Length of the long symlink in bytes
diff --git a/fs/befs/datastream.h b/fs/befs/datastream.h
index 91ba8203d83f..7ff9ff09ec6e 100644
--- a/fs/befs/datastream.h
+++ b/fs/befs/datastream.h
@@ -5,10 +5,10 @@
 
 struct buffer_head *befs_read_datastream(struct super_block *sb,
 					 const befs_data_stream *ds,
-					 befs_off_t pos, uint * off);
+					 befs_off_t pos, uint *off);
 
 int befs_fblock2brun(struct super_block *sb, const befs_data_stream *data,
-		     befs_blocknr_t fblock, befs_block_run * run);
+		     befs_blocknr_t fblock, befs_block_run *run);
 
 size_t befs_read_lsymlink(struct super_block *sb, const befs_data_stream *data,
 			  void *buff, befs_off_t len);
@@ -17,4 +17,3 @@ befs_blocknr_t befs_count_blocks(struct super_block *sb,
 			  const befs_data_stream *ds);
 
 extern const befs_inode_addr BAD_IADDR;
-
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index 85c13392e9e8..36656c86f50e 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -1,6 +1,6 @@
 /*
  * linux/fs/befs/debug.c
- * 
+ *
  * Copyright (C) 2001 Will Dyson (will_dyson at pobox.com)
  *
  * With help from the ntfs-tng driver by Anton Altparmakov
@@ -57,6 +57,7 @@ befs_debug(const struct super_block *sb, const char *fmt, ...)
 
 	struct va_format vaf;
 	va_list args;
+
 	va_start(args, fmt);
 	vaf.fmt = fmt;
 	vaf.va = &args;
@@ -67,7 +68,7 @@ befs_debug(const struct super_block *sb, const char *fmt, ...)
 }
 
 void
-befs_dump_inode(const struct super_block *sb, befs_inode * inode)
+befs_dump_inode(const struct super_block *sb, befs_inode *inode)
 {
 #ifdef CONFIG_BEFS_DEBUG
@@ -151,7 +152,7 @@ befs_dump_inode(const struct super_block *sb, befs_inode * inode)
 */
 
 void
-befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
+befs_dump_super_block(const struct super_block *sb, befs_super_block *sup)
 {
 #ifdef CONFIG_BEFS_DEBUG
@@ -202,7 +203,7 @@ befs_dump_super_block(const struct super_block *sb, befs_super_block * sup)
 
 #if 0
 /* unused */
 void
-befs_dump_small_data(const struct super_block *sb, befs_small_data * sd)
+befs_dump_small_data(const struct super_block *sb, befs_small_data *sd)
 {
 }
@@ -221,7 +222,8 @@ befs_dump_run(const struct super_block *sb, befs_disk_block_run run)
 #endif  /*  0  */
 
 void
-befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super * super)
+befs_dump_index_entry(const struct super_block *sb,
+		      befs_disk_btree_super *super)
 {
 #ifdef CONFIG_BEFS_DEBUG
@@ -242,7 +244,7 @@ befs_dump_index_entry(const struct super_block *sb, befs_disk_btree_super * supe
 }
 
 void
-befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead * node)
+befs_dump_index_node(const struct super_block *sb, befs_btree_nodehead *node)
 {
 #ifdef CONFIG_BEFS_DEBUG
diff --git a/fs/befs/inode.c b/fs/befs/inode.c
index fa4b718de597..5367a6470a69 100644
--- a/fs/befs/inode.c
+++ b/fs/befs/inode.c
@@ -1,6 +1,6 @@
 /*
  * inode.c
- * 
+ *
  * Copyright (C) 2001 Will Dyson <will_dyson@pobox.com>
  */
 
@@ -10,12 +10,12 @@
 #include "inode.h"
 
 /*
-	Validates the correctness of the befs inode
-	Returns BEFS_OK if the inode should be used, otherwise
-	returns BEFS_BAD_INODE
-*/
+ * Validates the correctness of the befs inode
+ * Returns BEFS_OK if the inode should be used, otherwise
+ * returns BEFS_BAD_INODE
+ */
 int
-befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
+befs_check_inode(struct super_block *sb, befs_inode *raw_inode,
 		 befs_blocknr_t inode)
 {
 	u32 magic1 = fs32_to_cpu(sb, raw_inode->magic1);
diff --git a/fs/befs/inode.h b/fs/befs/inode.h
index 9dc7fd9b7570..2219e412f49b 100644
--- a/fs/befs/inode.h
+++ b/fs/befs/inode.h
@@ -1,8 +1,7 @@
 /*
  * inode.h
- * 
+ *
  */
 
-int befs_check_inode(struct super_block *sb, befs_inode * raw_inode,
+int befs_check_inode(struct super_block *sb, befs_inode *raw_inode,
 		     befs_blocknr_t inode);
-
diff --git a/fs/befs/io.c b/fs/befs/io.c
index b4a558126ee1..227cb86e07fe 100644
--- a/fs/befs/io.c
+++ b/fs/befs/io.c
@@ -3,7 +3,7 @@
  *
  * Copyright (C) 2001 Will Dyson <will_dyson@pobox.com
  *
- * Based on portions of file.c and inode.c 
+ * Based on portions of file.c and inode.c
  * by Makoto Kato (m_kato@ga2.so-net.ne.jp)
  *
  * Many thanks to Dominic Giampaolo, author of Practical File System
@@ -19,8 +19,7 @@
 /*
  * Converts befs notion of disk addr to a disk offset and uses
  * linux kernel function sb_bread() to get the buffer containing
- * the offset. -Will Dyson
- *
+ * the offset.
  */
 
 struct buffer_head *
@@ -55,7 +54,7 @@ befs_bread_iaddr(struct super_block *sb, befs_inode_addr iaddr)
 	befs_debug(sb, "<--- %s", __func__);
 	return bh;
 
-  error:
+error:
 	befs_debug(sb, "<--- %s ERROR", __func__);
 	return NULL;
 }
diff --git a/fs/befs/io.h b/fs/befs/io.h
index 78d7bc6e60de..9b3e1967cb31 100644
--- a/fs/befs/io.h
+++ b/fs/befs/io.h
@@ -4,4 +4,3 @@
 
 struct buffer_head *befs_bread_iaddr(struct super_block *sb,
 				     befs_inode_addr iaddr);
-
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c
index 647a276eba56..19407165f4aa 100644
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -18,6 +18,7 @@
 #include <linux/parser.h>
 #include <linux/namei.h>
 #include <linux/sched.h>
+#include <linux/exportfs.h>
 
 #include "befs.h"
 #include "btree.h"
@@ -37,7 +38,8 @@ static int befs_readdir(struct file *, struct dir_context *);
 static int befs_get_block(struct inode *, sector_t, struct buffer_head *, int);
 static int befs_readpage(struct file *file, struct page *page);
 static sector_t befs_bmap(struct address_space *mapping, sector_t block);
-static struct dentry *befs_lookup(struct inode *, struct dentry *, unsigned int);
+static struct dentry *befs_lookup(struct inode *, struct dentry *,
+				  unsigned int);
 static struct inode *befs_iget(struct super_block *, unsigned long);
 static struct inode *befs_alloc_inode(struct super_block *sb);
 static void befs_destroy_inode(struct inode *inode);
@@ -51,6 +53,10 @@ static void befs_put_super(struct super_block *);
 static int befs_remount(struct super_block *, int *, char *);
 static int befs_statfs(struct dentry *, struct kstatfs *);
 static int parse_options(char *, struct befs_mount_options *);
+static struct dentry *befs_fh_to_dentry(struct super_block *sb,
+				struct fid *fid, int fh_len, int fh_type);
+static struct dentry *befs_fh_to_parent(struct super_block *sb,
+				struct fid *fid, int fh_len, int fh_type);
 
 static const struct super_operations befs_sops = {
 	.alloc_inode	= befs_alloc_inode,	/* allocate a new inode */
@@ -83,9 +89,14 @@ static const struct address_space_operations befs_symlink_aops = {
 	.readpage	= befs_symlink_readpage,
 };
 
-/* 
+static const struct export_operations befs_export_operations = {
+	.fh_to_dentry	= befs_fh_to_dentry,
+	.fh_to_parent	= befs_fh_to_parent,
+};
+
+/*
  * Called by generic_file_read() to read a page of data
- * 
+ *
  * In turn, simply calls a generic block read function and
  * passes it the address of befs_get_block, for mapping file
  * positions to disk blocks.
@@ -102,15 +113,13 @@ befs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, befs_get_block);
 }
 
-/* 
- * Generic function to map a file position (block) to a 
+/*
+ * Generic function to map a file position (block) to a
  * disk offset (passed back in bh_result).
  *
  * Used by many higher level functions.
  *
  * Calls befs_fblock2brun() in datastream.c to do the real work.
- *
- * -WD 10-26-01
  */
 
 static int
@@ -269,15 +278,15 @@ befs_alloc_inode(struct super_block *sb)
 	struct befs_inode_info *bi;
 
 	bi = kmem_cache_alloc(befs_inode_cachep, GFP_KERNEL);
-        if (!bi)
-                return NULL;
-        return &bi->vfs_inode;
+	if (!bi)
+		return NULL;
+	return &bi->vfs_inode;
 }
 
 static void befs_i_callback(struct rcu_head *head)
 {
 	struct inode *inode = container_of(head, struct inode, i_rcu);
-        kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
+	kmem_cache_free(befs_inode_cachep, BEFS_I(inode));
 }
 
 static void befs_destroy_inode(struct inode *inode)
@@ -287,7 +296,7 @@ static void befs_destroy_inode(struct inode *inode)
 
 static void init_once(void *foo)
 {
-	struct befs_inode_info *bi = (struct befs_inode_info *) foo;
+	struct befs_inode_info *bi = (struct befs_inode_info *)foo;
 
 	inode_init_once(&bi->vfs_inode);
 }
@@ -338,7 +347,7 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
 	/*
 	 * set uid and gid.  But since current BeOS is single user OS, so
 	 * you can change by "uid" or "gid" options.
-	 */   
+	 */
 
 	inode->i_uid = befs_sb->mount_opts.use_uid ?
 		befs_sb->mount_opts.uid :
@@ -353,14 +362,14 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
 	 * BEFS's time is 64 bits, but current VFS is 32 bits...
 	 * BEFS don't have access time. Nor inode change time. VFS
 	 * doesn't have creation time.
-	 * Also, the lower 16 bits of the last_modified_time and 
+	 * Also, the lower 16 bits of the last_modified_time and
 	 * create_time are just a counter to help ensure uniqueness
 	 * for indexing purposes. (PFD, page 54)
 	 */
 
 	inode->i_mtime.tv_sec =
 	    fs64_to_cpu(sb, raw_inode->last_modified_time) >> 16;
-	inode->i_mtime.tv_nsec = 0;   /* lower 16 bits are not a time */	
+	inode->i_mtime.tv_nsec = 0;	/* lower 16 bits are not a time */
 	inode->i_ctime = inode->i_mtime;
 	inode->i_atime = inode->i_mtime;
 
@@ -414,10 +423,10 @@ static struct inode *befs_iget(struct super_block *sb, unsigned long ino)
 	unlock_new_inode(inode);
 	return inode;
 
-      unacquire_bh:
+unacquire_bh:
 	brelse(bh);
 
-      unacquire_none:
+unacquire_none:
 	iget_failed(inode);
 	befs_debug(sb, "<--- %s - Bad inode", __func__);
 	return ERR_PTR(-EIO);
@@ -442,7 +451,7 @@ befs_init_inodecache(void)
 }
 
 /* Called at fs teardown.
- * 
+ *
  * Taken from NFS implementation by Al Viro.
  */
 static void
@@ -491,13 +500,10 @@ fail:
 }
 
 /*
- * UTF-8 to NLS charset convert routine
- * 
+ * UTF-8 to NLS charset convert routine
  *
- * Changed 8/10/01 by Will Dyson. Now use uni2char() / char2uni() rather than
- * the nls tables directly
+ * Uses uni2char() / char2uni() rather than the nls tables directly
  */
-
 static int
 befs_utf2nls(struct super_block *sb, const char *in,
 	     int in_len, char **out, int *out_len)
@@ -521,9 +527,8 @@ befs_utf2nls(struct super_block *sb, const char *in,
 	}
 
 	*out = result = kmalloc(maxlen, GFP_NOFS);
-	if (!*out) {
+	if (!*out)
 		return -ENOMEM;
-	}
 
 	for (i = o = 0; i < in_len; i += utflen, o += unilen) {
 
@@ -546,7 +551,7 @@ befs_utf2nls(struct super_block *sb, const char *in,
 
 	return o;
 
- conv_err:
+conv_err:
 	befs_error(sb, "Name using character set %s contains a character that "
 		   "cannot be converted to unicode.", nls->charset);
 	befs_debug(sb, "<--- %s", __func__);
@@ -561,18 +566,18 @@ befs_utf2nls(struct super_block *sb, const char *in,
  * @in_len: Length of input string in bytes
  * @out: The output string in UTF-8 format
  * @out_len: Length of the output buffer
- * 
+ *
  * Converts input string @in, which is in the format of the loaded NLS map,
  * into a utf8 string.
- * 
+ *
  * The destination string @out is allocated by this function and the caller is
  * responsible for freeing it with kfree()
- * 
+ *
  * On return, *@out_len is the length of @out in bytes.
 *
 * On success, the return value is the number of utf8 characters written to
 * the output buffer @out.
- * 
+ *
 * On Failure, a negative number coresponding to the error code is returned.
 */
 
@@ -585,9 +590,11 @@ befs_nls2utf(struct super_block *sb, const char *in,
 	wchar_t uni;
 	int unilen, utflen;
 	char *result;
-	/* There're nls characters that will translate to 3-chars-wide UTF-8
-	 * characters, a additional byte is needed to save the final \0
-	 * in special cases */
+	/*
+	 * There are nls characters that will translate to 3-chars-wide UTF-8
+	 * characters, an additional byte is needed to save the final \0
+	 * in special cases
+	 */
 	int maxlen = (3 * in_len) + 1;
 
 	befs_debug(sb, "---> %s\n", __func__);
@@ -624,14 +631,41 @@ befs_nls2utf(struct super_block *sb, const char *in,
 
 	return i;
 
- conv_err:
-	befs_error(sb, "Name using charecter set %s contains a charecter that "
+conv_err:
+	befs_error(sb, "Name using character set %s contains a character that "
 		   "cannot be converted to unicode.", nls->charset);
 	befs_debug(sb, "<--- %s", __func__);
 	kfree(result);
 	return -EILSEQ;
 }
 
+static struct inode *befs_nfs_get_inode(struct super_block *sb, uint64_t ino,
+					 uint32_t generation)
+{
+	/* No need to handle i_generation */
+	return befs_iget(sb, ino);
+}
+
+/*
+ * Map a NFS file handle to a corresponding dentry
+ */
+static struct dentry *befs_fh_to_dentry(struct super_block *sb,
+				struct fid *fid, int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    befs_nfs_get_inode);
+}
+
+/*
+ * Find the parent for a file specified by NFS handle
+ */
+static struct dentry *befs_fh_to_parent(struct super_block *sb,
+				struct fid *fid, int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    befs_nfs_get_inode);
+}
+
 enum {
 	Opt_uid, Opt_gid, Opt_charset, Opt_debug, Opt_err,
 };
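[Editor's note] befs gains NFS export support here almost entirely from library code: generic_fh_to_dentry() and generic_fh_to_parent() decode the file handle, and the filesystem only supplies an inode-lookup callback plus the sb->s_export_op wiring done later in befs_fill_super(). The same three-piece recipe should work for most simple read-only filesystems; a sketch with placeholder myfs_* names (myfs_iget() stands in for the filesystem's own inode lookup):

#include <linux/exportfs.h>

struct inode *myfs_iget(struct super_block *sb, unsigned long ino);

static struct inode *myfs_nfs_get_inode(struct super_block *sb,
					uint64_t ino, uint32_t generation)
{
	return myfs_iget(sb, ino);	/* i_generation unused here */
}

static struct dentry *myfs_fh_to_dentry(struct super_block *sb,
				struct fid *fid, int fh_len, int fh_type)
{
	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
				    myfs_nfs_get_inode);
}

static const struct export_operations myfs_export_ops = {
	.fh_to_dentry	= myfs_fh_to_dentry,
};
/* ...and in fill_super():  sb->s_export_op = &myfs_export_ops; */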
@@ -666,6 +700,7 @@ parse_options(char *options, struct befs_mount_options *opts)
 
 	while ((p = strsep(&options, ",")) != NULL) {
 		int token;
+
 		if (!*p)
 			continue;
 
@@ -721,7 +756,7 @@ parse_options(char *options, struct befs_mount_options *opts)
 }
 
 /* This function has the responsibiltiy of getting the
- * filesystem ready for unmounting. 
+ * filesystem ready for unmounting.
  * Basically, we free everything that we allocated in
  * befs_read_inode
  */
@@ -782,8 +817,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
 	 * Linux 2.4.10 and later refuse to read blocks smaller than
 	 * the logical block size for the device. But we also need to read at
 	 * least 1k to get the second 512 bytes of the volume.
-	 * -WD 10-26-01
-	 */ 
+	 */
 	blocksize = sb_min_blocksize(sb, 1024);
 	if (!blocksize) {
 		if (!silent)
@@ -791,7 +825,8 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
 		goto unacquire_priv_sbp;
 	}
 
-	if (!(bh = sb_bread(sb, sb_block))) {
+	bh = sb_bread(sb, sb_block);
+	if (!bh) {
 		if (!silent)
 			befs_error(sb, "unable to read superblock");
 		goto unacquire_priv_sbp;
@@ -816,7 +851,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
 
 	brelse(bh);
 
-	if( befs_sb->num_blocks > ~((sector_t)0) ) {
+	if (befs_sb->num_blocks > ~((sector_t)0)) {
 		if (!silent)
 			befs_error(sb, "blocks count: %llu is larger than the host can use",
 				   befs_sb->num_blocks);
@@ -831,6 +866,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
 	/* Set real blocksize of fs */
 	sb_set_blocksize(sb, (ulong) befs_sb->block_size);
 	sb->s_op = &befs_sops;
+	sb->s_export_op = &befs_export_operations;
 	root = befs_iget(sb, iaddr2blockno(sb, &(befs_sb->root_dir)));
 	if (IS_ERR(root)) {
 		ret = PTR_ERR(root);
@@ -861,16 +897,16 @@ befs_fill_super(struct super_block *sb, void *data, int silent)
 	}
 
 	return 0;
-/*****************/
-      unacquire_bh:
+
+unacquire_bh:
 	brelse(bh);
 
-      unacquire_priv_sbp:
+unacquire_priv_sbp:
 	kfree(befs_sb->mount_opts.iocharset);
 	kfree(sb->s_fs_info);
 	sb->s_fs_info = NULL;
 
-      unacquire_none:
+unacquire_none:
 	return ret;
 }
 
@@ -919,7 +955,7 @@ static struct file_system_type befs_fs_type = {
 	.name		= "befs",
 	.mount		= befs_mount,
 	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,	
+	.fs_flags	= FS_REQUIRES_DEV,
 };
 MODULE_ALIAS_FS("befs");
 
@@ -956,9 +992,9 @@ exit_befs_fs(void)
 }
 
 /*
-Macros that typecheck the init and exit functions,
-ensures that they are called at init and cleanup,
-and eliminates warnings about unused functions.
-*/
+ * Macros that typecheck the init and exit functions,
+ * ensures that they are called at init and cleanup,
+ * and eliminates warnings about unused functions.
+ */
 module_init(init_befs_fs)
 module_exit(exit_befs_fs)
diff --git a/fs/befs/super.h b/fs/befs/super.h
index dc4556376a22..ec1df30a7e9a 100644
--- a/fs/befs/super.h
+++ b/fs/befs/super.h
@@ -2,7 +2,5 @@
  * super.h
  */
 
-int befs_load_sb(struct super_block *sb, befs_super_block * disk_sb);
-
+int befs_load_sb(struct super_block *sb, befs_super_block *disk_sb);
 int befs_check_sb(struct super_block *sb);
-
diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c
index 1e5c896f6b79..f2deec0a62f0 100644
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -16,7 +16,7 @@
 #include <linux/vfs.h>
 #include <linux/writeback.h>
 #include <linux/uio.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include "bfs.h"
 
 MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c
index ae1b5404fced..2a59139f520b 100644
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -26,7 +26,7 @@
 #include <linux/coredump.h>
 #include <linux/slab.h>
 
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/cacheflush.h>
 #include <asm/a.out-core.h>
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 2472af2798c7..422370293cfd 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -36,7 +36,7 @@
 #include <linux/coredump.h>
 #include <linux/sched.h>
 #include <linux/dax.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/param.h>
 #include <asm/page.h>
 
@@ -2204,7 +2204,9 @@ static int elf_core_dump(struct coredump_params *cprm)
 
 	dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
 
-	vma_filesz = kmalloc_array(segs - 1, sizeof(*vma_filesz), GFP_KERNEL);
+	if (segs - 1 > ULONG_MAX / sizeof(*vma_filesz))
+		goto end_coredump;
+	vma_filesz = vmalloc((segs - 1) * sizeof(*vma_filesz));
 	if (!vma_filesz)
 		goto end_coredump;
 
@@ -2296,6 +2298,7 @@ static int elf_core_dump(struct coredump_params *cprm)
 				goto end_coredump;
 		}
 	}
+	dump_truncate(cprm);
 
 	if (!elf_core_write_extra_data(cprm))
 		goto end_coredump;
@@ -2311,7 +2314,7 @@ end_coredump:
 cleanup:
 	free_note_info(&info);
 	kfree(shdr4extnum);
-	kfree(vma_filesz);
+	vfree(vma_filesz);
 	kfree(phdr4note);
 	kfree(elf);
 out:
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 464a972e88c1..d2e36f82c35d 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -37,7 +37,7 @@
 #include <linux/coredump.h>
 #include <linux/dax.h>
 
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <asm/param.h>
 #include <asm/pgalloc.h>
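[Editor's note] The elf_core_dump() hunk swaps kmalloc_array() for vmalloc() because the per-VMA size table can exceed what kmalloc can serve on processes with very many mappings. Since plain vmalloc() lacks the multiply-overflow check that kmalloc_array() performs internally, the patch adds it by hand before the multiplication. The guard pattern in isolation (an illustrative sketch, not the kernel code):

#include <linux/kernel.h>
#include <linux/vmalloc.h>

static u64 *alloc_size_table(size_t nr_entries)
{
	u64 *tbl;

	/* kmalloc_array() checks n * size for overflow internally;
	 * with plain vmalloc() the caller must do it explicitly. */
	if (nr_entries > ULONG_MAX / sizeof(*tbl))
		return NULL;

	tbl = vmalloc(nr_entries * sizeof(*tbl));
	return tbl;	/* caller releases with vfree() */
}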
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 05b553368bb4..3c47614a4b32 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -30,8 +30,9 @@
 #include <linux/cleancache.h>
 #include <linux/dax.h>
 #include <linux/badblocks.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/falloc.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include "internal.h"
 
 struct bdev_inode {
@@ -175,17 +176,273 @@ static struct inode *bdev_file_inode(struct file *file)
 	return file->f_mapping->host;
 }
 
+static unsigned int dio_bio_write_op(struct kiocb *iocb)
+{
+	unsigned int op = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+
+	/* avoid the need for a I/O completion work item */
+	if (iocb->ki_flags & IOCB_DSYNC)
+		op |= REQ_FUA;
+	return op;
+}
+
+#define DIO_INLINE_BIO_VECS 4
+
+static void blkdev_bio_end_io_simple(struct bio *bio)
+{
+	struct task_struct *waiter = bio->bi_private;
+
+	WRITE_ONCE(bio->bi_private, NULL);
+	wake_up_process(waiter);
+}
+
 static ssize_t
-blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+__blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
+		int nr_pages)
+{
+	struct file *file = iocb->ki_filp;
+	struct block_device *bdev = I_BDEV(bdev_file_inode(file));
+	struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs, *bvec;
+	loff_t pos = iocb->ki_pos;
+	bool should_dirty = false;
+	struct bio bio;
+	ssize_t ret;
+	blk_qc_t qc;
+	int i;
+
+	if ((pos | iov_iter_alignment(iter)) &
+	    (bdev_logical_block_size(bdev) - 1))
+		return -EINVAL;
+
+	if (nr_pages <= DIO_INLINE_BIO_VECS)
+		vecs = inline_vecs;
+	else {
+		vecs = kmalloc(nr_pages * sizeof(struct bio_vec), GFP_KERNEL);
+		if (!vecs)
+			return -ENOMEM;
+	}
+
+	bio_init(&bio, vecs, nr_pages);
+	bio.bi_bdev = bdev;
+	bio.bi_iter.bi_sector = pos >> 9;
+	bio.bi_private = current;
+	bio.bi_end_io = blkdev_bio_end_io_simple;
+
+	ret = bio_iov_iter_get_pages(&bio, iter);
+	if (unlikely(ret))
+		return ret;
+	ret = bio.bi_iter.bi_size;
+
+	if (iov_iter_rw(iter) == READ) {
+		bio.bi_opf = REQ_OP_READ;
+		if (iter_is_iovec(iter))
+			should_dirty = true;
+	} else {
+		bio.bi_opf = dio_bio_write_op(iocb);
+		task_io_account_write(ret);
+	}
+
+	qc = submit_bio(&bio);
+	for (;;) {
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (!READ_ONCE(bio.bi_private))
+			break;
+		if (!(iocb->ki_flags & IOCB_HIPRI) ||
+		    !blk_mq_poll(bdev_get_queue(bdev), qc))
+			io_schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+
+	bio_for_each_segment_all(bvec, &bio, i) {
+		if (should_dirty && !PageCompound(bvec->bv_page))
+			set_page_dirty_lock(bvec->bv_page);
+		put_page(bvec->bv_page);
+	}
+
+	if (vecs != inline_vecs)
+		kfree(vecs);
+
+	if (unlikely(bio.bi_error))
+		return bio.bi_error;
+	return ret;
+}
+
+struct blkdev_dio {
+	union {
+		struct kiocb		*iocb;
+		struct task_struct	*waiter;
+	};
+	size_t			size;
+	atomic_t		ref;
+	bool			multi_bio : 1;
+	bool			should_dirty : 1;
+	bool			is_sync : 1;
+	struct bio		bio;
+};
+
+static struct bio_set *blkdev_dio_pool __read_mostly;
+
+static void blkdev_bio_end_io(struct bio *bio)
+{
+	struct blkdev_dio *dio = bio->bi_private;
+	bool should_dirty = dio->should_dirty;
+
+	if (dio->multi_bio && !atomic_dec_and_test(&dio->ref)) {
+		if (bio->bi_error && !dio->bio.bi_error)
+			dio->bio.bi_error = bio->bi_error;
+	} else {
+		if (!dio->is_sync) {
+			struct kiocb *iocb = dio->iocb;
+			ssize_t ret = dio->bio.bi_error;
+
+			if (likely(!ret)) {
+				ret = dio->size;
+				iocb->ki_pos += ret;
+			}
+
+			dio->iocb->ki_complete(iocb, ret, 0);
+			bio_put(&dio->bio);
+		} else {
+			struct task_struct *waiter = dio->waiter;
+
+			WRITE_ONCE(dio->waiter, NULL);
+			wake_up_process(waiter);
+		}
+	}
+
+	if (should_dirty) {
+		bio_check_pages_dirty(bio);
+	} else {
+		struct bio_vec *bvec;
+		int i;
+
+		bio_for_each_segment_all(bvec, bio, i)
+			put_page(bvec->bv_page);
+		bio_put(bio);
+	}
+}
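[Editor's note] Both new block-device direct I/O paths reject requests whose file position or memory layout is not a multiple of the logical block size: (pos | iov_iter_alignment(iter)) & (bdev_logical_block_size(bdev) - 1) folds the position and every segment's address and length into a single mask test. That is the contract O_DIRECT callers must satisfy; from userspace it usually means allocating with posix_memalign(). A hedged sketch, assuming a 512-byte logical block size (real code should query it with ioctl(fd, BLKSSZGET, ...)) and a caller that passes block-aligned len and off:

/* Build with _GNU_SOURCE defined so <fcntl.h> exposes O_DIRECT. */
#include <fcntl.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

ssize_t direct_read(const char *dev, void *out, size_t len, off_t off)
{
	void *buf = NULL;
	ssize_t n = -1;
	int fd = open(dev, O_RDONLY | O_DIRECT);

	if (fd < 0)
		return -1;
	/* buffer address, length, and file offset must all be
	 * multiples of the device's logical block size */
	if (posix_memalign(&buf, 512, len) == 0) {
		n = pread(fd, buf, len, off);
		if (n > 0)
			memcpy(out, buf, (size_t)n);
		free(buf);
	}
	close(fd);
	return n;
}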
dio->should_dirty = is_read && (iter->type == ITER_IOVEC); + + blk_start_plug(&plug); + for (;;) { + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = pos >> 9; + bio->bi_private = dio; + bio->bi_end_io = blkdev_bio_end_io; + + ret = bio_iov_iter_get_pages(bio, iter); + if (unlikely(ret)) { + bio->bi_error = ret; + bio_endio(bio); + break; + } + + if (is_read) { + bio->bi_opf = REQ_OP_READ; + if (dio->should_dirty) + bio_set_pages_dirty(bio); + } else { + bio->bi_opf = dio_bio_write_op(iocb); + task_io_account_write(bio->bi_iter.bi_size); + } + + dio->size += bio->bi_iter.bi_size; + pos += bio->bi_iter.bi_size; + + nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES); + if (!nr_pages) { + qc = submit_bio(bio); + break; + } + + if (!dio->multi_bio) { + dio->multi_bio = true; + atomic_set(&dio->ref, 2); + } else { + atomic_inc(&dio->ref); + } + + submit_bio(bio); + bio = bio_alloc(GFP_KERNEL, nr_pages); + } + blk_finish_plug(&plug); + + if (!is_sync) + return -EIOCBQUEUED; + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->waiter)) + break; + + if (!(iocb->ki_flags & IOCB_HIPRI) || + !blk_mq_poll(bdev_get_queue(bdev), qc)) + io_schedule(); + } + __set_current_state(TASK_RUNNING); + + ret = dio->bio.bi_error; + if (likely(!ret)) + ret = dio->size; + + bio_put(&dio->bio); + return ret; +} - return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, - blkdev_get_block, NULL, NULL, - DIO_SKIP_DIO_COUNT); +static ssize_t +blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + int nr_pages; + + nr_pages = iov_iter_npages(iter, BIO_MAX_PAGES + 1); + if (!nr_pages) + return 0; + if (is_sync_kiocb(iocb) && nr_pages <= BIO_MAX_PAGES) + return __blkdev_direct_IO_simple(iocb, iter, nr_pages); + + return __blkdev_direct_IO(iocb, iter, min(nr_pages, BIO_MAX_PAGES)); } +static __init int blkdev_init(void) +{ + blkdev_dio_pool = bioset_create(4, offsetof(struct blkdev_dio, bio)); + if (!blkdev_dio_pool) + return -ENOMEM; + return 0; +} +module_init(blkdev_init); + int __sync_blockdev(struct block_device *bdev, int wait) { if (!bdev) @@ -832,7 +1089,7 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, return true; /* already a holder */ else if (bdev->bd_holder != NULL) return false; /* held by someone else */ - else if (bdev->bd_contains == bdev) + else if (whole == bdev) return true; /* is a whole device which isn't held */ else if (whole->bd_holder == bd_may_claim) @@ -1950,6 +2207,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { struct address_space *mapping = inode->i_mapping; + struct block_device *bdev; spin_lock(&inode->i_lock); if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || @@ -1970,8 +2228,12 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) */ iput(old_inode); old_inode = inode; + bdev = I_BDEV(inode); - func(I_BDEV(inode), arg); + mutex_lock(&bdev->bd_mutex); + if (bdev->bd_openers) + func(bdev, arg); + mutex_unlock(&bdev->bd_mutex); spin_lock(&blockdev_superblock->s_inode_list_lock); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 50bcfb80d33a..6a823719b6c5 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3232,9 +3232,6 @@ int btrfs_dirty_pages(struct inode *inode, struct page **pages, size_t num_pages, loff_t pos, size_t write_bytes, struct extent_state **cached); int btrfs_fdatawrite_range(struct inode *inode, loff_t start, 
loff_t end); -ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags); int btrfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 066d9b929a0c..18004169552c 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -946,7 +946,7 @@ int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, atomic_inc(&fs_info->nr_async_submits); - if (bio->bi_opf & REQ_SYNC) + if (op_is_sync(bio->bi_opf)) btrfs_set_work_high_priority(&async->work); btrfs_queue_work(fs_info->workers, &async->work); @@ -3486,9 +3486,9 @@ static int write_dev_supers(struct btrfs_device *device, * to go down lazy. */ if (i == 0) - ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_FUA, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_FUA, bh); else - ret = btrfsic_submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); + ret = btrfsic_submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); if (ret) errors++; } @@ -3552,7 +3552,7 @@ static int write_dev_flush(struct btrfs_device *device, int wait) bio->bi_end_io = btrfs_end_empty_barrier; bio->bi_bdev = device->bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; init_completion(&device->flush_wait); bio->bi_private = &device->flush_wait; device->flush_bio = bio; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bf2defa8ebe0..4ac383a3a649 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -127,7 +127,7 @@ struct extent_page_data { */ unsigned int extent_locked:1; - /* tells the submit_bio code to use a WRITE_SYNC */ + /* tells the submit_bio code to use REQ_SYNC */ unsigned int sync_io:1; }; @@ -2047,7 +2047,7 @@ int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, return -EIO; } bio->bi_bdev = dev->bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; bio_add_page(bio, page, length, pg_offset); if (btrfsic_submit_bio_wait(bio)) { @@ -2388,7 +2388,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, struct inode *inode = page->mapping->host; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct bio *bio; - int read_mode; + int read_mode = 0; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -2404,9 +2404,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, } if (failed_bio->bi_vcnt > 1) - read_mode = READ_SYNC | REQ_FAILFAST_DEV; - else - read_mode = READ_SYNC; + read_mode |= REQ_FAILFAST_DEV; phy_offset >>= inode->i_sb->s_blocksize_bits; bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, @@ -3486,7 +3484,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, unsigned long nr_written = 0; if (wbc->sync_mode == WB_SYNC_ALL) - write_flags = WRITE_SYNC; + write_flags = REQ_SYNC; trace___extent_writepage(page, inode, wbc); @@ -3731,7 +3729,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, unsigned long i, num_pages; unsigned long bio_flags = 0; unsigned long start, end; - int write_flags = (epd->sync_io ? WRITE_SYNC : 0) | REQ_META; + int write_flags = (epd->sync_io ? REQ_SYNC : 0) | REQ_META; int ret = 0; clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); @@ -4077,7 +4075,7 @@ static void flush_epd_write_bio(struct extent_page_data *epd) int ret; bio_set_op_attrs(epd->bio, REQ_OP_WRITE, - epd->sync_io ? WRITE_SYNC : 0); + epd->sync_io ? 
REQ_SYNC : 0); ret = submit_one_bio(epd->bio, 0, epd->bio_flags); BUG_ON(ret < 0); /* -ENOMEM */ diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 448f57d108d1..b5c5da215d05 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3033,7 +3033,6 @@ const struct file_operations btrfs_file_operations = { #ifdef CONFIG_COMPAT .compat_ioctl = btrfs_compat_ioctl, #endif - .copy_file_range = btrfs_copy_file_range, .clone_file_range = btrfs_clone_file_range, .dedupe_file_range = btrfs_dedupe_file_range, }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8a8e719778bd..1e861a063721 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7912,7 +7912,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, struct io_failure_record *failrec; struct bio *bio; int isector; - int read_mode; + int read_mode = 0; int ret; BUG_ON(bio_op(failed_bio) == REQ_OP_WRITE); @@ -7931,9 +7931,7 @@ static int dio_read_error(struct inode *inode, struct bio *failed_bio, if ((failed_bio->bi_vcnt > 1) || (failed_bio->bi_io_vec->bv_len > btrfs_inode_sectorsize(inode))) - read_mode = READ_SYNC | REQ_FAILFAST_DEV; - else - read_mode = READ_SYNC; + read_mode |= REQ_FAILFAST_DEV; isector = start - btrfs_io_bio(failed_bio)->logical; isector >>= inode->i_sb->s_blocksize_bits; @@ -8419,7 +8417,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; - bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio)); + bio->bi_opf = orig_bio->bi_opf; bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8457,8 +8455,7 @@ next_block: start_sector, GFP_NOFS); if (!bio) goto out_err; - bio_set_op_attrs(bio, bio_op(orig_bio), - bio_flags(orig_bio)); + bio->bi_opf = orig_bio->bi_opf; bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -10671,7 +10668,6 @@ static const struct inode_operations btrfs_special_inode_operations = { .update_time = btrfs_update_time, }; static const struct inode_operations btrfs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .getattr = btrfs_getattr, .setattr = btrfs_setattr, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0a6902555e65..33f967d30b2a 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -834,7 +834,7 @@ static inline int btrfs_may_create(struct inode *dir, struct dentry *child) * sys_mkdirat and vfs_mkdir, but we only do a single component lookup * inside this filesystem so it's quite a bit simpler. 
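
Several btrfs hunks in this region replace a direct test of the REQ_SYNC bit with op_is_sync(). The distinction matters for the priority decisions being made here: the helper also treats reads as synchronous, not just writes carrying the sync flag. A paraphrase of the block-layer helper these call sites now rely on (exact flag coverage varies by kernel version; later kernels fold REQ_FUA/REQ_PREFLUSH into the test as well):

	static inline bool op_is_sync(unsigned int op)
	{
		/* reads are always synchronous; writes only when flagged */
		return (op & REQ_OP_MASK) == REQ_OP_READ || (op & REQ_SYNC);
	}
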
*/ -static noinline int btrfs_mksubvol(struct path *parent, +static noinline int btrfs_mksubvol(const struct path *parent, char *name, int namelen, struct btrfs_root *snap_src, u64 *async_transid, bool readonly, @@ -3987,18 +3987,6 @@ out_unlock: return ret; } -ssize_t btrfs_copy_file_range(struct file *file_in, loff_t pos_in, - struct file *file_out, loff_t pos_out, - size_t len, unsigned int flags) -{ - ssize_t ret; - - ret = btrfs_clone_files(file_out, file_in, pos_in, len, pos_out); - if (ret == 0) - ret = len; - return ret; -} - int btrfs_clone_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, u64 len) { diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0d63d994be10..9a94670536a6 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -4431,7 +4431,7 @@ static int write_page_nocow(struct scrub_ctx *sctx, bio->bi_iter.bi_size = 0; bio->bi_iter.bi_sector = physical_for_dev_replace >> 9; bio->bi_bdev = dev->bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; ret = bio_add_page(bio, page, PAGE_SIZE, 0); if (ret != PAGE_SIZE) { leave_with_eio: diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 4a151930f201..b5ae7d3d1896 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -202,27 +202,25 @@ static struct ratelimit_state printk_limits[] = { void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...) { struct super_block *sb = fs_info->sb; - char lvl[4]; + char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0"; struct va_format vaf; va_list args; - const char *type = logtypes[4]; int kern_level; - struct ratelimit_state *ratelimit; + const char *type = logtypes[4]; + struct ratelimit_state *ratelimit = &printk_limits[4]; va_start(args, fmt); - kern_level = printk_get_level(fmt); - if (kern_level) { + while ((kern_level = printk_get_level(fmt)) != 0) { size_t size = printk_skip_level(fmt) - fmt; - memcpy(lvl, fmt, size); - lvl[size] = '\0'; + + if (kern_level >= '0' && kern_level <= '7') { + memcpy(lvl, fmt, size); + lvl[size] = '\0'; + type = logtypes[kern_level - '0']; + ratelimit = &printk_limits[kern_level - '0']; + } fmt += size; - type = logtypes[kern_level - '0']; - ratelimit = &printk_limits[kern_level - '0']; - } else { - *lvl = '\0'; - /* Default to debug output */ - ratelimit = &printk_limits[7]; } vaf.fmt = fmt; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index cfabeeb16368..ea272432c930 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -165,6 +165,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) slot = radix_tree_iter_retry(&iter); continue; } + slot = radix_tree_iter_resume(slot, &iter); spin_unlock(&fs_info->buffer_lock); free_extent_buffer_stale(eb); spin_lock(&fs_info->buffer_lock); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bb8592e1a364..3c3c69c0eee4 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6012,7 +6012,7 @@ static void btrfs_end_bio(struct bio *bio) else btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_READ_ERRS); - if ((bio->bi_opf & WRITE_FLUSH) == WRITE_FLUSH) + if (bio->bi_opf & REQ_PREFLUSH) btrfs_dev_stat_inc(dev, BTRFS_DEV_STAT_FLUSH_ERRS); btrfs_dev_stat_print_on_error(dev); @@ -6089,7 +6089,7 @@ static noinline void btrfs_schedule_bio(struct btrfs_device *device, bio->bi_next = NULL; spin_lock(&device->io_lock); - if (bio->bi_opf & REQ_SYNC) + if (op_is_sync(bio->bi_opf)) pending_bios = &device->pending_sync_bios; else pending_bios = &device->pending_bios; diff 
--git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 811328984702..24ba6bc3ec34 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -61,7 +61,7 @@ struct btrfs_device { int running_pending; /* regular prio bios */ struct btrfs_pending_bios pending_bios; - /* WRITE_SYNC bios */ + /* sync bios */ struct btrfs_pending_bios pending_sync_bios; struct block_device *bdev; diff --git a/fs/buffer.c b/fs/buffer.c index b205a629001d..0e87401cf335 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -43,6 +43,7 @@ #include <linux/bitops.h> #include <linux/mpage.h> #include <linux/bit_spinlock.h> +#include <linux/pagevec.h> #include <trace/events/block.h> static int fsync_buffers_list(spinlock_t *lock, struct list_head *list); @@ -753,7 +754,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list) * still in flight on potentially older * contents. */ - write_dirty_buffer(bh, WRITE_SYNC); + write_dirty_buffer(bh, REQ_SYNC); /* * Kick off IO for the previous mapping. Note @@ -1604,37 +1605,80 @@ void create_empty_buffers(struct page *page, } EXPORT_SYMBOL(create_empty_buffers); -/* - * We are taking a block for data and we don't want any output from any - * buffer-cache aliases starting from return from that function and - * until the moment when something will explicitly mark the buffer - * dirty (hopefully that will not happen until we will free that block ;-) - * We don't even need to mark it not-uptodate - nobody can expect - * anything from a newly allocated buffer anyway. We used to used - * unmap_buffer() for such invalidation, but that was wrong. We definitely - * don't want to mark the alias unmapped, for example - it would confuse - * anyone who might pick it with bread() afterwards... - * - * Also.. Note that bforget() doesn't lock the buffer. So there can - * be writeout I/O going on against recently-freed buffers. We don't - * wait on that I/O in bforget() - it's more efficient to wait on the I/O - * only if we really need to. That happens here. - */ -void unmap_underlying_metadata(struct block_device *bdev, sector_t block) +/** + * clean_bdev_aliases: clean a range of buffers in block device + * @bdev: Block device to clean buffers in + * @block: Start of a range of blocks to clean + * @len: Number of blocks to clean + * + * We are taking a range of blocks for data and we don't want writeback of any + * buffer-cache aliases starting from return from this function and until the + * moment when something will explicitly mark the buffer dirty (hopefully that + * will not happen until we will free that block ;-) We don't even need to mark + * it not-uptodate - nobody can expect anything from a newly allocated buffer + * anyway. We used to use unmap_buffer() for such invalidation, but that was + * wrong. We definitely don't want to mark the alias unmapped, for example - it + * would confuse anyone who might pick it with bread() afterwards... + * + * Also.. Note that bforget() doesn't lock the buffer. So there can be + * writeout I/O going on against recently-freed buffers. We don't wait on that + * I/O in bforget() - it's more efficient to wait on the I/O only if we really + * need to. That happens here. 
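
The buffer.c changes below convert unmap_underlying_metadata(), which took a single (bdev, block) pair, into the ranged clean_bdev_aliases(). The single-buffer call sites that follow go through a one-block convenience wrapper defined outside this file; its likely shape:

	static inline void clean_bdev_bh_alias(struct buffer_head *bh)
	{
		clean_bdev_aliases(bh->b_bdev, bh->b_blocknr, 1);
	}
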
+ */ +void clean_bdev_aliases(struct block_device *bdev, sector_t block, sector_t len) { - struct buffer_head *old_bh; + struct inode *bd_inode = bdev->bd_inode; + struct address_space *bd_mapping = bd_inode->i_mapping; + struct pagevec pvec; + pgoff_t index = block >> (PAGE_SHIFT - bd_inode->i_blkbits); + pgoff_t end; + int i; + struct buffer_head *bh; + struct buffer_head *head; - might_sleep(); + end = (block + len - 1) >> (PAGE_SHIFT - bd_inode->i_blkbits); + pagevec_init(&pvec, 0); + while (index <= end && pagevec_lookup(&pvec, bd_mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; - old_bh = __find_get_block_slow(bdev, block); - if (old_bh) { - clear_buffer_dirty(old_bh); - wait_on_buffer(old_bh); - clear_buffer_req(old_bh); - __brelse(old_bh); + index = page->index; + if (index > end) + break; + if (!page_has_buffers(page)) + continue; + /* + * We use page lock instead of bd_mapping->private_lock + * to pin buffers here since we can afford to sleep and + * it scales better than a global spinlock lock. + */ + lock_page(page); + /* Recheck when the page is locked which pins bhs */ + if (!page_has_buffers(page)) + goto unlock_page; + head = page_buffers(page); + bh = head; + do { + if (!buffer_mapped(bh) || (bh->b_blocknr < block)) + goto next; + if (bh->b_blocknr >= block + len) + break; + clear_buffer_dirty(bh); + wait_on_buffer(bh); + clear_buffer_req(bh); +next: + bh = bh->b_this_page; + } while (bh != head); +unlock_page: + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + index++; } } -EXPORT_SYMBOL(unmap_underlying_metadata); +EXPORT_SYMBOL(clean_bdev_aliases); /* * Size is a power-of-two in the range 512..PAGE_SIZE, @@ -1684,7 +1728,7 @@ static struct buffer_head *create_page_buffers(struct page *page, struct inode * * prevents this contention from occurring. * * If block_write_full_page() is called with wbc->sync_mode == - * WB_SYNC_ALL, the writes are posted using WRITE_SYNC; this + * WB_SYNC_ALL, the writes are posted using REQ_SYNC; this * causes the writes to be flagged as synchronous writes. */ int __block_write_full_page(struct inode *inode, struct page *page, @@ -1697,7 +1741,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, struct buffer_head *bh, *head; unsigned int blocksize, bbits; int nr_underway = 0; - int write_flags = (wbc->sync_mode == WB_SYNC_ALL ? 
WRITE_SYNC : 0); + int write_flags = wbc_to_write_flags(wbc); head = create_page_buffers(page, inode, (1 << BH_Dirty)|(1 << BH_Uptodate)); @@ -1745,8 +1789,7 @@ int __block_write_full_page(struct inode *inode, struct page *page, if (buffer_new(bh)) { /* blockdev mappings never come here */ clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); } } bh = bh->b_this_page; @@ -1992,8 +2035,7 @@ int __block_write_begin_int(struct page *page, loff_t pos, unsigned len, } if (buffer_new(bh)) { - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); if (PageUptodate(page)) { clear_buffer_new(bh); set_buffer_uptodate(bh); @@ -2633,7 +2675,7 @@ int nobh_write_begin(struct address_space *mapping, if (!buffer_mapped(bh)) is_mapped_to_disk = 0; if (buffer_new(bh)) - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); if (PageUptodate(page)) { set_buffer_uptodate(bh); continue; @@ -3118,7 +3160,7 @@ EXPORT_SYMBOL(submit_bh); /** * ll_rw_block: low-level access to block devices (DEPRECATED) * @op: whether to %READ or %WRITE - * @op_flags: rq_flag_bits + * @op_flags: req_flag_bits * @nr: number of &struct buffer_heads in the array * @bhs: array of pointers to &struct buffer_head * @@ -3210,7 +3252,7 @@ EXPORT_SYMBOL(__sync_dirty_buffer); int sync_dirty_buffer(struct buffer_head *bh) { - return __sync_dirty_buffer(bh, WRITE_SYNC); + return __sync_dirty_buffer(bh, REQ_SYNC); } EXPORT_SYMBOL(sync_dirty_buffer); @@ -3403,7 +3445,7 @@ void free_buffer_head(struct buffer_head *bh) } EXPORT_SYMBOL(free_buffer_head); -static void buffer_exit_cpu(int cpu) +static int buffer_exit_cpu_dead(unsigned int cpu) { int i; struct bh_lru *b = &per_cpu(bh_lrus, cpu); @@ -3414,14 +3456,7 @@ static void buffer_exit_cpu(int cpu) } this_cpu_add(bh_accounting.nr, per_cpu(bh_accounting, cpu).nr); per_cpu(bh_accounting, cpu).nr = 0; -} - -static int buffer_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) - buffer_exit_cpu((unsigned long)hcpu); - return NOTIFY_OK; + return 0; } /** @@ -3471,6 +3506,7 @@ EXPORT_SYMBOL(bh_submit_read); void __init buffer_init(void) { unsigned long nrpages; + int ret; bh_cachep = kmem_cache_create("buffer_head", sizeof(struct buffer_head), 0, @@ -3483,5 +3519,7 @@ void __init buffer_init(void) */ nrpages = (nr_free_buffer_pages() * 10) / 100; max_buffer_heads = nrpages * (PAGE_SIZE / sizeof(struct buffer_head)); - hotcpu_notifier(buffer_cpu_notify, 0); + ret = cpuhp_setup_state_nocalls(CPUHP_FS_BUFF_DEAD, "fs/buffer:dead", + NULL, buffer_exit_cpu_dead); + WARN_ON(ret < 0); } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index ef3ebd780aff..e4b066cd912a 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -315,7 +315,32 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) struct page **pages; pgoff_t next_index; int nr_pages = 0; - int ret; + int got = 0; + int ret = 0; + + if (!current->journal_info) { + /* caller of readpages does not hold buffer and read caps + * (fadvise, madvise and readahead cases) */ + int want = CEPH_CAP_FILE_CACHE; + ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got); + if (ret < 0) { + dout("start_read %p, error getting cap\n", inode); + } else if (!(got & want)) { + dout("start_read %p, no cache cap\n", inode); + ret = 0; + } + if (ret <= 0) { + if (got) + ceph_put_cap_refs(ci, got); + while (!list_empty(page_list)) { + page = 
list_entry(page_list->prev, + struct page, lru); + list_del(&page->lru); + put_page(page); + } + return ret; + } + } off = (u64) page_offset(page); @@ -338,15 +363,18 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) CEPH_OSD_FLAG_READ, NULL, ci->i_truncate_seq, ci->i_truncate_size, false); - if (IS_ERR(req)) - return PTR_ERR(req); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + goto out; + } /* build page vector */ nr_pages = calc_pages_for(0, len); pages = kmalloc(sizeof(*pages) * nr_pages, GFP_KERNEL); - ret = -ENOMEM; - if (!pages) - goto out; + if (!pages) { + ret = -ENOMEM; + goto out_put; + } for (i = 0; i < nr_pages; ++i) { page = list_entry(page_list->prev, struct page, lru); BUG_ON(PageLocked(page)); @@ -378,6 +406,12 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) if (ret < 0) goto out_pages; ceph_osdc_put_request(req); + + /* After adding locked pages to page cache, the inode holds cache cap. + * So we can drop our cap refs. */ + if (got) + ceph_put_cap_refs(ci, got); + return nr_pages; out_pages: @@ -386,8 +420,11 @@ out_pages: unlock_page(pages[i]); } ceph_put_page_vector(pages, nr_pages, false); -out: +out_put: ceph_osdc_put_request(req); +out: + if (got) + ceph_put_cap_refs(ci, got); return ret; } @@ -424,7 +461,6 @@ static int ceph_readpages(struct file *file, struct address_space *mapping, rc = start_read(inode, page_list, max); if (rc < 0) goto out; - BUG_ON(rc == 0); } out: ceph_fscache_readpages_cancel(inode, page_list); @@ -438,7 +474,9 @@ out: * only snap context we are allowed to write back. */ static struct ceph_snap_context *get_oldest_context(struct inode *inode, - loff_t *snap_size) + loff_t *snap_size, + u64 *truncate_size, + u32 *truncate_seq) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc = NULL; @@ -452,6 +490,10 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, snapc = ceph_get_snap_context(capsnap->context); if (snap_size) *snap_size = capsnap->size; + if (truncate_size) + *truncate_size = capsnap->truncate_size; + if (truncate_seq) + *truncate_seq = capsnap->truncate_seq; break; } } @@ -459,6 +501,10 @@ static struct ceph_snap_context *get_oldest_context(struct inode *inode, snapc = ceph_get_snap_context(ci->i_head_snapc); dout(" head snapc %p has %d dirty pages\n", snapc, ci->i_wrbuffer_ref_head); + if (truncate_size) + *truncate_size = ci->i_truncate_size; + if (truncate_seq) + *truncate_seq = ci->i_truncate_seq; } spin_unlock(&ci->i_ceph_lock); return snapc; @@ -501,7 +547,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage %p page %p not dirty?\n", inode, page); goto out; } - oldest = get_oldest_context(inode, &snap_size); + oldest = get_oldest_context(inode, &snap_size, + &truncate_size, &truncate_seq); if (snapc->seq > oldest->seq) { dout("writepage %p page %p snapc %p not writeable - noop\n", inode, page, snapc); @@ -512,12 +559,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } ceph_put_snap_context(oldest); - spin_lock(&ci->i_ceph_lock); - truncate_seq = ci->i_truncate_seq; - truncate_size = ci->i_truncate_size; if (snap_size == -1) snap_size = i_size_read(inode); - spin_unlock(&ci->i_ceph_lock); /* is this a partial page at end of file? 
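
The start_read() hunk above keys off current->journal_info to decide whether the caller already holds buffer/read caps: the read and fault paths set the marker around generic_file_read_iter()/filemap_fault() (see the later ceph hunks in this series), while readahead, fadvise and madvise arrive without it and must take CEPH_CAP_FILE_CACHE themselves. A sketch of the bracketing side, with a hypothetical function name:

	static ssize_t ceph_read_holding_caps(struct kiocb *iocb,
					      struct iov_iter *to)
	{
		ssize_t ret;

		current->journal_info = iocb->ki_filp;	/* "caps held" marker */
		ret = generic_file_read_iter(iocb, to);
		current->journal_info = NULL;
		return ret;
	}
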
*/ if (page_off >= snap_size) { @@ -764,7 +807,8 @@ retry: /* find oldest snap context with dirty data */ ceph_put_snap_context(snapc); snap_size = -1; - snapc = get_oldest_context(inode, &snap_size); + snapc = get_oldest_context(inode, &snap_size, + &truncate_size, &truncate_seq); if (!snapc) { /* hmm, why does writepages get called when there is no dirty data? */ @@ -774,11 +818,7 @@ retry: dout(" oldest snapc is %p seq %lld (%d snaps)\n", snapc, snapc->seq, snapc->num_snaps); - spin_lock(&ci->i_ceph_lock); - truncate_seq = ci->i_truncate_seq; - truncate_size = ci->i_truncate_size; i_size = i_size_read(inode); - spin_unlock(&ci->i_ceph_lock); if (last_snapc && snapc != last_snapc) { /* if we switched to a newer snapc, restart our scan at the @@ -1124,7 +1164,8 @@ out: static int context_is_writeable_or_written(struct inode *inode, struct ceph_snap_context *snapc) { - struct ceph_snap_context *oldest = get_oldest_context(inode, NULL); + struct ceph_snap_context *oldest = get_oldest_context(inode, NULL, + NULL, NULL); int ret = !oldest || snapc->seq <= oldest->seq; ceph_put_snap_context(oldest); @@ -1169,7 +1210,7 @@ retry_locked: * this page is already dirty in another (older) snap * context! is it writeable now? */ - oldest = get_oldest_context(inode, NULL); + oldest = get_oldest_context(inode, NULL, NULL, NULL); if (snapc->seq > oldest->seq) { ceph_put_snap_context(oldest); @@ -1276,25 +1317,27 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = file_inode(file); - unsigned from = pos & (PAGE_SIZE - 1); int check_cap = 0; dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, inode, page, (int)pos, (int)copied, (int)len); /* zero the stale part of the page if we did a short copy */ - if (copied < len) - zero_user_segment(page, from+copied, len); + if (!PageUptodate(page)) { + if (copied < len) { + copied = 0; + goto out; + } + SetPageUptodate(page); + } /* did file size increase? */ if (pos+copied > i_size_read(inode)) check_cap = ceph_inode_set_size(inode, pos+copied); - if (!PageUptodate(page)) - SetPageUptodate(page); - set_page_dirty(page); +out: unlock_page(page); put_page(page); @@ -1371,9 +1414,11 @@ static int ceph_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || - ci->i_inline_version == CEPH_INLINE_NONE) + ci->i_inline_version == CEPH_INLINE_NONE) { + current->journal_info = vma->vm_file; ret = filemap_fault(vma, vmf); - else + current->journal_info = NULL; + } else ret = -EAGAIN; dout("filemap_fault %p %llu~%zd dropping cap refs on %s ret %d\n", @@ -1905,6 +1950,15 @@ int ceph_pool_perm_check(struct ceph_inode_info *ci, int need) struct ceph_string *pool_ns; int ret, flags; + if (ci->i_vino.snap != CEPH_NOSNAP) { + /* + * Pool permission check needs to write to the first object. + * But for snapshot, head of the first object may have alread + * been deleted. Skip check to avoid creating orphan object. 
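
The send_cap_msg() rework just below replaces a roughly twenty-argument parameter list with a single struct cap_msg_args. Beyond readability, the struct lets __send_cap() and __send_flush_snap() share one marshalling routine while each fills only the fields it has; a rough shape of a call site:

	struct cap_msg_args arg;

	arg.session = session;
	arg.op = CEPH_CAP_OP_FLUSHSNAP;
	arg.follows = capsnap->follows;
	arg.flush_tid = capsnap->cap_flush.tid;
	/* ...remaining fields exactly as in __send_flush_snap() below... */
	return send_cap_msg(&arg);
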
+ */ + return 0; + } + if (ceph_test_mount_opt(ceph_inode_to_client(&ci->vfs_inode), NOPOOLPERM)) return 0; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 16e6ded0b7f2..94fd76d04683 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -987,96 +987,127 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release) __cap_delay_cancel(mdsc, ci); } +struct cap_msg_args { + struct ceph_mds_session *session; + u64 ino, cid, follows; + u64 flush_tid, oldest_flush_tid, size, max_size; + u64 xattr_version; + struct ceph_buffer *xattr_buf; + struct timespec atime, mtime, ctime; + int op, caps, wanted, dirty; + u32 seq, issue_seq, mseq, time_warp_seq; + u32 flags; + kuid_t uid; + kgid_t gid; + umode_t mode; + bool inline_data; +}; + /* * Build and send a cap message to the given MDS. * * Caller should be holding s_mutex. */ -static int send_cap_msg(struct ceph_mds_session *session, - u64 ino, u64 cid, int op, - int caps, int wanted, int dirty, - u32 seq, u64 flush_tid, u64 oldest_flush_tid, - u32 issue_seq, u32 mseq, u64 size, u64 max_size, - struct timespec *mtime, struct timespec *atime, - struct timespec *ctime, u32 time_warp_seq, - kuid_t uid, kgid_t gid, umode_t mode, - u64 xattr_version, - struct ceph_buffer *xattrs_buf, - u64 follows, bool inline_data) +static int send_cap_msg(struct cap_msg_args *arg) { struct ceph_mds_caps *fc; struct ceph_msg *msg; void *p; size_t extra_len; + struct timespec zerotime = {0}; dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s" " seq %u/%u tid %llu/%llu mseq %u follows %lld size %llu/%llu" - " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op), - cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted), - ceph_cap_string(dirty), - seq, issue_seq, flush_tid, oldest_flush_tid, - mseq, follows, size, max_size, - xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0); + " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(arg->op), + arg->cid, arg->ino, ceph_cap_string(arg->caps), + ceph_cap_string(arg->wanted), ceph_cap_string(arg->dirty), + arg->seq, arg->issue_seq, arg->flush_tid, arg->oldest_flush_tid, + arg->mseq, arg->follows, arg->size, arg->max_size, + arg->xattr_version, + arg->xattr_buf ? 
(int)arg->xattr_buf->vec.iov_len : 0); /* flock buffer size + inline version + inline data size + * osd_epoch_barrier + oldest_flush_tid */ - extra_len = 4 + 8 + 4 + 4 + 8; + extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4; msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc) + extra_len, GFP_NOFS, false); if (!msg) return -ENOMEM; - msg->hdr.version = cpu_to_le16(6); - msg->hdr.tid = cpu_to_le64(flush_tid); + msg->hdr.version = cpu_to_le16(10); + msg->hdr.tid = cpu_to_le64(arg->flush_tid); fc = msg->front.iov_base; memset(fc, 0, sizeof(*fc)); - fc->cap_id = cpu_to_le64(cid); - fc->op = cpu_to_le32(op); - fc->seq = cpu_to_le32(seq); - fc->issue_seq = cpu_to_le32(issue_seq); - fc->migrate_seq = cpu_to_le32(mseq); - fc->caps = cpu_to_le32(caps); - fc->wanted = cpu_to_le32(wanted); - fc->dirty = cpu_to_le32(dirty); - fc->ino = cpu_to_le64(ino); - fc->snap_follows = cpu_to_le64(follows); - - fc->size = cpu_to_le64(size); - fc->max_size = cpu_to_le64(max_size); - if (mtime) - ceph_encode_timespec(&fc->mtime, mtime); - if (atime) - ceph_encode_timespec(&fc->atime, atime); - if (ctime) - ceph_encode_timespec(&fc->ctime, ctime); - fc->time_warp_seq = cpu_to_le32(time_warp_seq); - - fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); - fc->gid = cpu_to_le32(from_kgid(&init_user_ns, gid)); - fc->mode = cpu_to_le32(mode); + fc->cap_id = cpu_to_le64(arg->cid); + fc->op = cpu_to_le32(arg->op); + fc->seq = cpu_to_le32(arg->seq); + fc->issue_seq = cpu_to_le32(arg->issue_seq); + fc->migrate_seq = cpu_to_le32(arg->mseq); + fc->caps = cpu_to_le32(arg->caps); + fc->wanted = cpu_to_le32(arg->wanted); + fc->dirty = cpu_to_le32(arg->dirty); + fc->ino = cpu_to_le64(arg->ino); + fc->snap_follows = cpu_to_le64(arg->follows); + + fc->size = cpu_to_le64(arg->size); + fc->max_size = cpu_to_le64(arg->max_size); + ceph_encode_timespec(&fc->mtime, &arg->mtime); + ceph_encode_timespec(&fc->atime, &arg->atime); + ceph_encode_timespec(&fc->ctime, &arg->ctime); + fc->time_warp_seq = cpu_to_le32(arg->time_warp_seq); + + fc->uid = cpu_to_le32(from_kuid(&init_user_ns, arg->uid)); + fc->gid = cpu_to_le32(from_kgid(&init_user_ns, arg->gid)); + fc->mode = cpu_to_le32(arg->mode); + + fc->xattr_version = cpu_to_le64(arg->xattr_version); + if (arg->xattr_buf) { + msg->middle = ceph_buffer_get(arg->xattr_buf); + fc->xattr_len = cpu_to_le32(arg->xattr_buf->vec.iov_len); + msg->hdr.middle_len = cpu_to_le32(arg->xattr_buf->vec.iov_len); + } p = fc + 1; - /* flock buffer size */ + /* flock buffer size (version 2) */ ceph_encode_32(&p, 0); - /* inline version */ - ceph_encode_64(&p, inline_data ? 0 : CEPH_INLINE_NONE); + /* inline version (version 4) */ + ceph_encode_64(&p, arg->inline_data ? 0 : CEPH_INLINE_NONE); /* inline data size */ ceph_encode_32(&p, 0); - /* osd_epoch_barrier */ + /* osd_epoch_barrier (version 5) */ ceph_encode_32(&p, 0); - /* oldest_flush_tid */ - ceph_encode_64(&p, oldest_flush_tid); + /* oldest_flush_tid (version 6) */ + ceph_encode_64(&p, arg->oldest_flush_tid); - fc->xattr_version = cpu_to_le64(xattr_version); - if (xattrs_buf) { - msg->middle = ceph_buffer_get(xattrs_buf); - fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len); - msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len); - } + /* + * caller_uid/caller_gid (version 7) + * + * Currently, we don't properly track which caller dirtied the caps + * last, and force a flush of them when there is a conflict. For now, + * just set this to 0:0, to emulate how the MDS has worked up to now. 
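
The cap message header version jumps from 6 to 10 in this hunk, and extra_len must reserve room for every field appended since the fixed-size struct ceph_mds_caps; field by field, the tail the new code encodes is:

	/*
	 *   4  flock buffer size        (v2)
	 *   8  inline version           (v4)
	 *   4  inline data size         (v4)
	 *   4  osd_epoch_barrier        (v5)
	 *   8  oldest_flush_tid         (v6)
	 *   4  caller_uid               (v7)
	 *   4  caller_gid               (v7)
	 *   4  pool namespace length    (v8)
	 *   8  btime (ceph_timespec)    (v9)
	 *   8  change_attr              (v9)
	 *   4  advisory flags           (v10)
	 */
	extra_len = 4 + 8 + 4 + 4 + 8 + 4 + 4 + 4 + 8 + 8 + 4;	/* 60 bytes */
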
+ */ + ceph_encode_32(&p, 0); + ceph_encode_32(&p, 0); + + /* pool namespace (version 8) (mds always ignores this) */ + ceph_encode_32(&p, 0); - ceph_con_send(&session->s_con, msg); + /* + * btime and change_attr (version 9) + * + * We just zero these out for now, as the MDS ignores them unless + * the requisite feature flags are set (which we don't do yet). + */ + ceph_encode_timespec(p, &zerotime); + p += sizeof(struct ceph_timespec); + ceph_encode_64(&p, 0); + + /* Advisory flags (version 10) */ + ceph_encode_32(&p, arg->flags); + + ceph_con_send(&arg->session->s_con, msg); return 0; } @@ -1115,27 +1146,17 @@ void ceph_queue_caps_release(struct inode *inode) * caller should hold snap_rwsem (read), s_mutex. */ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, - int op, int used, int want, int retain, int flushing, - u64 flush_tid, u64 oldest_flush_tid) + int op, bool sync, int used, int want, int retain, + int flushing, u64 flush_tid, u64 oldest_flush_tid) __releases(cap->ci->i_ceph_lock) { struct ceph_inode_info *ci = cap->ci; struct inode *inode = &ci->vfs_inode; - u64 cap_id = cap->cap_id; - int held, revoking, dropping, keep; - u64 follows, size, max_size; - u32 seq, issue_seq, mseq, time_warp_seq; - struct timespec mtime, atime, ctime; + struct cap_msg_args arg; + int held, revoking, dropping; int wake = 0; - umode_t mode; - kuid_t uid; - kgid_t gid; - struct ceph_mds_session *session; - u64 xattr_version = 0; - struct ceph_buffer *xattr_blob = NULL; int delayed = 0; int ret; - bool inline_data; held = cap->issued | cap->implemented; revoking = cap->implemented & ~cap->issued; @@ -1148,7 +1169,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ceph_cap_string(revoking)); BUG_ON((retain & CEPH_CAP_PIN) == 0); - session = cap->session; + arg.session = cap->session; /* don't release wanted unless we've waited a bit. */ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 && @@ -1177,40 +1198,51 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, cap->implemented &= cap->issued | used; cap->mds_wanted = want; - follows = flushing ? ci->i_head_snapc->seq : 0; - - keep = cap->implemented; - seq = cap->seq; - issue_seq = cap->issue_seq; - mseq = cap->mseq; - size = inode->i_size; - ci->i_reported_size = size; - max_size = ci->i_wanted_max_size; - ci->i_requested_max_size = max_size; - mtime = inode->i_mtime; - atime = inode->i_atime; - ctime = inode->i_ctime; - time_warp_seq = ci->i_time_warp_seq; - uid = inode->i_uid; - gid = inode->i_gid; - mode = inode->i_mode; + arg.ino = ceph_vino(inode).ino; + arg.cid = cap->cap_id; + arg.follows = flushing ? 
ci->i_head_snapc->seq : 0; + arg.flush_tid = flush_tid; + arg.oldest_flush_tid = oldest_flush_tid; + + arg.size = inode->i_size; + ci->i_reported_size = arg.size; + arg.max_size = ci->i_wanted_max_size; + ci->i_requested_max_size = arg.max_size; if (flushing & CEPH_CAP_XATTR_EXCL) { __ceph_build_xattrs_blob(ci); - xattr_blob = ci->i_xattrs.blob; - xattr_version = ci->i_xattrs.version; + arg.xattr_version = ci->i_xattrs.version; + arg.xattr_buf = ci->i_xattrs.blob; + } else { + arg.xattr_buf = NULL; } - inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + arg.mtime = inode->i_mtime; + arg.atime = inode->i_atime; + arg.ctime = inode->i_ctime; + + arg.op = op; + arg.caps = cap->implemented; + arg.wanted = want; + arg.dirty = flushing; + + arg.seq = cap->seq; + arg.issue_seq = cap->issue_seq; + arg.mseq = cap->mseq; + arg.time_warp_seq = ci->i_time_warp_seq; + + arg.uid = inode->i_uid; + arg.gid = inode->i_gid; + arg.mode = inode->i_mode; + + arg.inline_data = ci->i_inline_version != CEPH_INLINE_NONE; + arg.flags = 0; + if (sync) + arg.flags |= CEPH_CLIENT_CAPS_SYNC; spin_unlock(&ci->i_ceph_lock); - ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, - op, keep, want, flushing, seq, - flush_tid, oldest_flush_tid, issue_seq, mseq, - size, max_size, &mtime, &atime, &ctime, time_warp_seq, - uid, gid, mode, xattr_version, xattr_blob, - follows, inline_data); + ret = send_cap_msg(&arg); if (ret < 0) { dout("error sending cap msg, must requeue %p\n", inode); delayed = 1; @@ -1227,15 +1259,42 @@ static inline int __send_flush_snap(struct inode *inode, struct ceph_cap_snap *capsnap, u32 mseq, u64 oldest_flush_tid) { - return send_cap_msg(session, ceph_vino(inode).ino, 0, - CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0, - capsnap->dirty, 0, capsnap->cap_flush.tid, - oldest_flush_tid, 0, mseq, capsnap->size, 0, - &capsnap->mtime, &capsnap->atime, - &capsnap->ctime, capsnap->time_warp_seq, - capsnap->uid, capsnap->gid, capsnap->mode, - capsnap->xattr_version, capsnap->xattr_blob, - capsnap->follows, capsnap->inline_data); + struct cap_msg_args arg; + + arg.session = session; + arg.ino = ceph_vino(inode).ino; + arg.cid = 0; + arg.follows = capsnap->follows; + arg.flush_tid = capsnap->cap_flush.tid; + arg.oldest_flush_tid = oldest_flush_tid; + + arg.size = capsnap->size; + arg.max_size = 0; + arg.xattr_version = capsnap->xattr_version; + arg.xattr_buf = capsnap->xattr_blob; + + arg.atime = capsnap->atime; + arg.mtime = capsnap->mtime; + arg.ctime = capsnap->ctime; + + arg.op = CEPH_CAP_OP_FLUSHSNAP; + arg.caps = capsnap->issued; + arg.wanted = 0; + arg.dirty = capsnap->dirty; + + arg.seq = 0; + arg.issue_seq = 0; + arg.mseq = mseq; + arg.time_warp_seq = capsnap->time_warp_seq; + + arg.uid = capsnap->uid; + arg.gid = capsnap->gid; + arg.mode = capsnap->mode; + + arg.inline_data = capsnap->inline_data; + arg.flags = 0; + + return send_cap_msg(&arg); } /* @@ -1858,9 +1917,9 @@ ack: sent++; /* __send_cap drops i_ceph_lock */ - delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used, - want, retain, flushing, - flush_tid, oldest_flush_tid); + delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, false, + cap_used, want, retain, flushing, + flush_tid, oldest_flush_tid); goto retry; /* retake i_ceph_lock and restart our cap scan. 
*/ } @@ -1924,9 +1983,9 @@ retry: &flush_tid, &oldest_flush_tid); /* __send_cap drops i_ceph_lock */ - delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want, - (cap->issued | cap->implemented), - flushing, flush_tid, oldest_flush_tid); + delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, true, + used, want, (cap->issued | cap->implemented), + flushing, flush_tid, oldest_flush_tid); if (delayed) { spin_lock(&ci->i_ceph_lock); @@ -1996,7 +2055,7 @@ static int unsafe_request_wait(struct inode *inode) } spin_unlock(&ci->i_unsafe_lock); - dout("unsafe_requeset_wait %p wait on tid %llu %llu\n", + dout("unsafe_request_wait %p wait on tid %llu %llu\n", inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL); if (req1) { ret = !wait_for_completion_timeout(&req1->r_safe_completion, @@ -2119,7 +2178,7 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc, inode, cap, cf->tid, ceph_cap_string(cf->caps)); ci->i_ceph_flags |= CEPH_I_NODELAY; ret = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, - __ceph_caps_used(ci), + false, __ceph_caps_used(ci), __ceph_caps_wanted(ci), cap->issued | cap->implemented, cf->caps, cf->tid, oldest_flush_tid); @@ -2479,6 +2538,27 @@ static void check_max_size(struct inode *inode, loff_t endoff) ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); } +int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got) +{ + int ret, err = 0; + + BUG_ON(need & ~CEPH_CAP_FILE_RD); + BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)); + ret = ceph_pool_perm_check(ci, need); + if (ret < 0) + return ret; + + ret = try_get_cap_refs(ci, need, want, 0, true, got, &err); + if (ret) { + if (err == -EAGAIN) { + ret = 0; + } else if (err < 0) { + ret = err; + } + } + return ret; +} + /* * Wait for caps, and take cap references. If we can't get a WR cap * due to a small max_size, make sure we check_max_size (and possibly @@ -2507,9 +2587,20 @@ int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, if (err < 0) ret = err; } else { - ret = wait_event_interruptible(ci->i_cap_wq, - try_get_cap_refs(ci, need, want, endoff, - true, &_got, &err)); + DEFINE_WAIT_FUNC(wait, woken_wake_function); + add_wait_queue(&ci->i_cap_wq, &wait); + + while (!try_get_cap_refs(ci, need, want, endoff, + true, &_got, &err)) { + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); + } + + remove_wait_queue(&ci->i_cap_wq, &wait); + if (err == -EAGAIN) continue; if (err < 0) @@ -3570,6 +3661,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, cap->cap_id = le64_to_cpu(h->cap_id); cap->mseq = mseq; cap->seq = seq; + cap->issue_seq = seq; spin_lock(&session->s_cap_lock); list_add_tail(&cap->session_caps, &session->s_cap_releases); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 78180d151730..8ab1fdf0bd49 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -32,40 +32,19 @@ const struct dentry_operations ceph_dentry_ops; /* * Initialize ceph dentry state. 
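
The ceph_get_caps() hunk above trades wait_event_interruptible() for an open-coded wait_woken() loop. woken_wake_function() records the wakeup in the wait entry itself, so a wakeup landing between the condition check and the sleep is not lost, and the condition can be re-evaluated with the entry still queued. The generic shape of the loop, assuming a wait queue head wq and a predicate done():

	DEFINE_WAIT_FUNC(wait, woken_wake_function);
	long ret = 0;

	add_wait_queue(&wq, &wait);
	while (!done()) {
		if (signal_pending(current)) {
			ret = -ERESTARTSYS;
			break;
		}
		/* returns immediately if a wakeup already arrived */
		wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT);
	}
	remove_wait_queue(&wq, &wait);
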
*/ -int ceph_init_dentry(struct dentry *dentry) +static int ceph_d_init(struct dentry *dentry) { struct ceph_dentry_info *di; - if (dentry->d_fsdata) - return 0; - di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL); if (!di) return -ENOMEM; /* oh well */ - spin_lock(&dentry->d_lock); - if (dentry->d_fsdata) { - /* lost a race */ - kmem_cache_free(ceph_dentry_cachep, di); - goto out_unlock; - } - - if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_NOSNAP) - d_set_d_op(dentry, &ceph_dentry_ops); - else if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR) - d_set_d_op(dentry, &ceph_snapdir_dentry_ops); - else - d_set_d_op(dentry, &ceph_snap_dentry_ops); - di->dentry = dentry; di->lease_session = NULL; di->time = jiffies; - /* avoid reordering d_fsdata setup so that the check above is safe */ - smp_mb(); dentry->d_fsdata = di; ceph_dentry_lru_add(dentry); -out_unlock: - spin_unlock(&dentry->d_lock); return 0; } @@ -737,10 +716,6 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - err = ceph_init_dentry(dentry); - if (err < 0) - return ERR_PTR(err); - /* can we conclude ENOENT locally? */ if (d_really_is_negative(dentry)) { struct ceph_inode_info *ci = ceph_inode(dir); @@ -1255,32 +1230,37 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) struct ceph_mds_client *mdsc = ceph_sb_to_client(dir->i_sb)->mdsc; struct ceph_mds_request *req; - int op, mask, err; + int op, err; + u32 mask; if (flags & LOOKUP_RCU) return -ECHILD; op = ceph_snap(dir) == CEPH_SNAPDIR ? - CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP; + CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_GETATTR; req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS); if (!IS_ERR(req)) { req->r_dentry = dget(dentry); - req->r_num_caps = 2; + req->r_num_caps = op == CEPH_MDS_OP_GETATTR ? 1 : 2; mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED; if (ceph_security_xattr_wanted(dir)) mask |= CEPH_CAP_XATTR_SHARED; - req->r_args.getattr.mask = mask; + req->r_args.getattr.mask = cpu_to_le32(mask); - req->r_locked_dir = dir; err = ceph_mdsc_do_request(mdsc, NULL, req); - if (err == 0 || err == -ENOENT) { - if (dentry == req->r_dentry) { - valid = !d_unhashed(dentry); - } else { - d_invalidate(req->r_dentry); - err = -EAGAIN; - } + switch (err) { + case 0: + if (d_really_is_positive(dentry) && + d_inode(dentry) == req->r_target_inode) + valid = 1; + break; + case -ENOENT: + if (d_really_is_negative(dentry)) + valid = 1; + /* Fallthrough */ + default: + break; } ceph_mdsc_put_request(req); dout("d_revalidate %p lookup result=%d\n", @@ -1319,16 +1299,6 @@ static void ceph_d_release(struct dentry *dentry) kmem_cache_free(ceph_dentry_cachep, di); } -static int ceph_snapdir_d_revalidate(struct dentry *dentry, - unsigned int flags) -{ - /* - * Eventually, we'll want to revalidate snapped metadata - * too... probably... - */ - return 1; -} - /* * When the VFS prunes a dentry from the cache, we need to clear the * complete flag on the parent directory. 
@@ -1347,6 +1317,9 @@ static void ceph_d_prune(struct dentry *dentry) if (d_unhashed(dentry)) return; + if (ceph_snap(d_inode(dentry->d_parent)) == CEPH_SNAPDIR) + return; + /* * we hold d_lock, so d_parent is stable, and d_fsdata is never * cleared until d_release @@ -1517,14 +1490,5 @@ const struct dentry_operations ceph_dentry_ops = { .d_revalidate = ceph_d_revalidate, .d_release = ceph_d_release, .d_prune = ceph_d_prune, -}; - -const struct dentry_operations ceph_snapdir_dentry_ops = { - .d_revalidate = ceph_snapdir_d_revalidate, - .d_release = ceph_d_release, -}; - -const struct dentry_operations ceph_snap_dentry_ops = { - .d_release = ceph_d_release, - .d_prune = ceph_d_prune, + .d_init = ceph_d_init, }; diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 1780218a48f0..180bbef760f2 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -62,7 +62,6 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) { struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct inode *inode; - struct dentry *dentry; struct ceph_vino vino; int err; @@ -94,16 +93,7 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) return ERR_PTR(-ESTALE); } - dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) - return dentry; - err = ceph_init_dentry(dentry); - if (err < 0) { - dput(dentry); - return ERR_PTR(err); - } - dout("__fh_to_dentry %llx %p dentry %p\n", ino, inode, dentry); - return dentry; + return d_obtain_alias(inode); } /* @@ -131,7 +121,6 @@ static struct dentry *__get_parent(struct super_block *sb, struct ceph_mds_client *mdsc = ceph_sb_to_client(sb)->mdsc; struct ceph_mds_request *req; struct inode *inode; - struct dentry *dentry; int mask; int err; @@ -164,18 +153,7 @@ static struct dentry *__get_parent(struct super_block *sb, if (!inode) return ERR_PTR(-ENOENT); - dentry = d_obtain_alias(inode); - if (IS_ERR(dentry)) - return dentry; - err = ceph_init_dentry(dentry); - if (err < 0) { - dput(dentry); - return ERR_PTR(err); - } - dout("__get_parent ino %llx parent %p ino %llx.%llx\n", - child ? ceph_ino(d_inode(child)) : ino, - dentry, ceph_vinop(inode)); - return dentry; + return d_obtain_alias(inode); } static struct dentry *ceph_get_parent(struct dentry *child) diff --git a/fs/ceph/file.c b/fs/ceph/file.c index f995e3528a33..045d30d26624 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -351,10 +351,6 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, if (dentry->d_name.len > NAME_MAX) return -ENAMETOOLONG; - err = ceph_init_dentry(dentry); - if (err < 0) - return err; - if (flags & O_CREAT) { err = ceph_pre_init_acls(dir, &mode, &acls); if (err < 0) @@ -458,71 +454,60 @@ enum { * only return a short read to the caller if we hit EOF. */ static int striped_read(struct inode *inode, - u64 off, u64 len, + u64 pos, u64 len, struct page **pages, int num_pages, - int *checkeof) + int page_align, int *checkeof) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - u64 pos, this_len, left; + u64 this_len; loff_t i_size; - int page_align, pages_left; - int read, ret; - struct page **page_pos; + int page_idx; + int ret, read = 0; bool hit_stripe, was_short; /* * we may need to do multiple reads. not atomic, unfortunately. 
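
The ceph_d_init conversion above illustrates a broader pattern: with the ->d_init dentry operation, the VFS allocates per-dentry state before the dentry becomes visible, so the lazy, lock-and-barrier ceph_init_dentry() calls can simply be deleted from the lookup, atomic_open and export paths (as the surrounding hunks do). A minimal sketch with hypothetical names:

	struct example_dentry_info {
		unsigned long time;
	};

	static struct kmem_cache *example_cachep;

	static int example_d_init(struct dentry *dentry)
	{
		struct example_dentry_info *di;

		di = kmem_cache_zalloc(example_cachep, GFP_KERNEL);
		if (!di)
			return -ENOMEM;
		/* runs before the dentry is visible: no d_lock, no smp_mb() */
		dentry->d_fsdata = di;
		return 0;
	}

	static void example_d_release(struct dentry *dentry)
	{
		kmem_cache_free(example_cachep, dentry->d_fsdata);
	}

	static const struct dentry_operations example_dentry_ops = {
		.d_init		= example_d_init,
		.d_release	= example_d_release,
	};
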
*/ - pos = off; - left = len; - page_pos = pages; - pages_left = num_pages; - read = 0; - more: - page_align = pos & ~PAGE_MASK; - this_len = left; + this_len = len; + page_idx = (page_align + read) >> PAGE_SHIFT; ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, - ci->i_truncate_seq, - ci->i_truncate_size, - page_pos, pages_left, page_align); + ci->i_truncate_seq, ci->i_truncate_size, + pages + page_idx, num_pages - page_idx, + ((page_align + read) & ~PAGE_MASK)); if (ret == -ENOENT) ret = 0; - hit_stripe = this_len < left; + hit_stripe = this_len < len; was_short = ret >= 0 && ret < this_len; - dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read, + dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, len, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); i_size = i_size_read(inode); if (ret >= 0) { - int didpages; if (was_short && (pos + ret < i_size)) { int zlen = min(this_len - ret, i_size - pos - ret); - int zoff = (off & ~PAGE_MASK) + read + ret; + int zoff = page_align + read + ret; dout(" zero gap %llu to %llu\n", - pos + ret, pos + ret + zlen); + pos + ret, pos + ret + zlen); ceph_zero_page_vector_range(zoff, zlen, pages); ret += zlen; } - didpages = (page_align + ret) >> PAGE_SHIFT; + read += ret; pos += ret; - read = pos - off; - left -= ret; - page_pos += didpages; - pages_left -= didpages; + len -= ret; /* hit stripe and need continue*/ - if (left && hit_stripe && pos < i_size) + if (len && hit_stripe && pos < i_size) goto more; } if (read > 0) { ret = read; /* did we bounce off eof? */ - if (pos + left > i_size) + if (pos + len > i_size) *checkeof = CHECK_EOF; } @@ -536,15 +521,16 @@ more: * * If the read spans object boundary, just do multiple reads. 
*/ -static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, - int *checkeof) +static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, + int *checkeof) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); struct page **pages; u64 off = iocb->ki_pos; - int num_pages, ret; - size_t len = iov_iter_count(i); + int num_pages; + ssize_t ret; + size_t len = iov_iter_count(to); dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, @@ -563,35 +549,56 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, if (ret < 0) return ret; - num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); - if (IS_ERR(pages)) - return PTR_ERR(pages); - ret = striped_read(inode, off, len, pages, - num_pages, checkeof); - if (ret > 0) { - int l, k = 0; - size_t left = ret; - - while (left) { - size_t page_off = off & ~PAGE_MASK; - size_t copy = min_t(size_t, left, - PAGE_SIZE - page_off); - l = copy_page_to_iter(pages[k++], page_off, copy, i); - off += l; - left -= l; - if (l < copy) - break; + if (unlikely(to->type & ITER_PIPE)) { + size_t page_off; + ret = iov_iter_get_pages_alloc(to, &pages, len, + &page_off); + if (ret <= 0) + return -ENOMEM; + num_pages = DIV_ROUND_UP(ret + page_off, PAGE_SIZE); + + ret = striped_read(inode, off, ret, pages, num_pages, + page_off, checkeof); + if (ret > 0) { + iov_iter_advance(to, ret); + off += ret; + } else { + iov_iter_advance(to, 0); } + ceph_put_page_vector(pages, num_pages, false); + } else { + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + + ret = striped_read(inode, off, len, pages, num_pages, + (off & ~PAGE_MASK), checkeof); + if (ret > 0) { + int l, k = 0; + size_t left = ret; + + while (left) { + size_t page_off = off & ~PAGE_MASK; + size_t copy = min_t(size_t, left, + PAGE_SIZE - page_off); + l = copy_page_to_iter(pages[k++], page_off, + copy, to); + off += l; + left -= l; + if (l < copy) + break; + } + } + ceph_release_page_vector(pages, num_pages); } - ceph_release_page_vector(pages, num_pages); if (off > iocb->ki_pos) { ret = off - iocb->ki_pos; iocb->ki_pos = off; } - dout("sync_read result %d\n", ret); + dout("sync_read result %zd\n", ret); return ret; } @@ -853,7 +860,7 @@ void ceph_sync_write_wait(struct inode *inode) dout("sync_write_wait on tid %llu (until %llu)\n", req->r_tid, last_tid); - wait_for_completion(&req->r_safe_completion); + wait_for_completion(&req->r_done_completion); ceph_osdc_put_request(req); spin_lock(&ci->i_unsafe_lock); @@ -906,7 +913,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, pos >> PAGE_SHIFT, (pos + count) >> PAGE_SHIFT); if (ret2 < 0) - dout("invalidate_inode_pages2_range returned %d\n", ret); + dout("invalidate_inode_pages2_range returned %d\n", ret2); flags = CEPH_OSD_FLAG_ORDERSNAP | CEPH_OSD_FLAG_ONDISK | @@ -1249,8 +1256,9 @@ again: dout("aio_read %p %llx.%llx %llu~%u got cap refs on %s\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, ceph_cap_string(got)); - + current->journal_info = filp; ret = generic_file_read_iter(iocb, to); + current->journal_info = NULL; } dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); @@ -1770,6 +1778,7 @@ const struct file_operations ceph_file_fops = { .fsync = ceph_fsync, .lock = ceph_lock, .flock = ceph_flock, + .splice_read = generic_file_splice_read, .splice_write = 
iter_file_splice_write, .unlocked_ioctl = ceph_ioctl, .compat_ioctl = ceph_ioctl, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index ef4d04647325..5e659d054b40 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -305,7 +305,8 @@ static int frag_tree_split_cmp(const void *l, const void *r) { struct ceph_frag_tree_split *ls = (struct ceph_frag_tree_split*)l; struct ceph_frag_tree_split *rs = (struct ceph_frag_tree_split*)r; - return ceph_frag_compare(ls->frag, rs->frag); + return ceph_frag_compare(le32_to_cpu(ls->frag), + le32_to_cpu(rs->frag)); } static bool is_frag_child(u32 f, struct ceph_inode_frag *frag) @@ -1023,16 +1024,17 @@ static void update_dentry_lease(struct dentry *dentry, long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; struct inode *dir; - /* only track leases on regular dentries */ - if (dentry->d_op != &ceph_dentry_ops) - return; - spin_lock(&dentry->d_lock); dout("update_dentry_lease %p duration %lu ms ttl %lu\n", dentry, duration, ttl); /* make lease_rdcache_gen match directory */ dir = d_inode(dentry->d_parent); + + /* only track leases on regular dentries */ + if (ceph_snap(dir) != CEPH_NOSNAP) + goto out_unlock; + di->lease_shared_gen = ceph_inode(dir)->i_shared_gen; if (duration == 0) @@ -1202,12 +1204,7 @@ retry_lookup: err = -ENOMEM; goto done; } - err = ceph_init_dentry(dn); - if (err < 0) { - dput(dn); - dput(parent); - goto done; - } + err = 0; } else if (d_really_is_positive(dn) && (ceph_ino(d_inode(dn)) != vino.ino || ceph_snap(d_inode(dn)) != vino.snap)) { @@ -1561,12 +1558,6 @@ retry_lookup: err = -ENOMEM; goto out; } - ret = ceph_init_dentry(dn); - if (ret < 0) { - dput(dn); - err = ret; - goto out; - } } else if (d_really_is_positive(dn) && (ceph_ino(d_inode(dn)) != vino.ino || ceph_snap(d_inode(dn)) != vino.snap)) { @@ -1879,7 +1870,6 @@ retry: * symlinks */ static const struct inode_operations ceph_symlink_iops = { - .readlink = generic_readlink, .get_link = simple_get_link, .setattr = ceph_setattr, .getattr = ceph_getattr, diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 815acd1a56d4..c9d2e553a6c4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -288,12 +288,13 @@ static int parse_reply_info_extra(void **p, void *end, struct ceph_mds_reply_info_parsed *info, u64 features) { - if (info->head->op == CEPH_MDS_OP_GETFILELOCK) + u32 op = le32_to_cpu(info->head->op); + + if (op == CEPH_MDS_OP_GETFILELOCK) return parse_reply_info_filelock(p, end, info, features); - else if (info->head->op == CEPH_MDS_OP_READDIR || - info->head->op == CEPH_MDS_OP_LSSNAP) + else if (op == CEPH_MDS_OP_READDIR || op == CEPH_MDS_OP_LSSNAP) return parse_reply_info_dir(p, end, info, features); - else if (info->head->op == CEPH_MDS_OP_CREATE) + else if (op == CEPH_MDS_OP_CREATE) return parse_reply_info_create(p, end, info, features); else return -EIO; @@ -2100,17 +2101,31 @@ static int __do_request(struct ceph_mds_client *mdsc, err = -EIO; goto finish; } + if (ACCESS_ONCE(mdsc->fsc->mount_state) == CEPH_MOUNT_MOUNTING) { + if (mdsc->mdsmap_err) { + err = mdsc->mdsmap_err; + dout("do_request mdsmap err %d\n", err); + goto finish; + } + if (mdsc->mdsmap->m_epoch == 0) { + dout("do_request no mdsmap, waiting for map\n"); + list_add(&req->r_wait, &mdsc->waiting_for_map); + goto finish; + } + if (!(mdsc->fsc->mount_options->flags & + CEPH_MOUNT_OPT_MOUNTWAIT) && + !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) { + err = -ENOENT; + pr_info("probably no mds server is up\n"); + goto finish; + } + } put_request_session(req); mds = 
__choose_mds(mdsc, req); if (mds < 0 || ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) { - if (mdsc->mdsmap_err) { - err = mdsc->mdsmap_err; - dout("do_request mdsmap err %d\n", err); - goto finish; - } dout("do_request no mds or not active, waiting for map\n"); list_add(&req->r_wait, &mdsc->waiting_for_map); goto out; @@ -3943,13 +3958,13 @@ static struct ceph_auth_handshake *get_authorizer(struct ceph_connection *con, } -static int verify_authorizer_reply(struct ceph_connection *con, int len) +static int verify_authorizer_reply(struct ceph_connection *con) { struct ceph_mds_session *s = con->private; struct ceph_mds_client *mdsc = s->s_mdsc; struct ceph_auth_client *ac = mdsc->fsc->client->monc.auth; - return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer, len); + return ceph_auth_verify_authorizer_reply(ac, s->s_auth.authorizer); } static int invalidate_authorizer(struct ceph_connection *con) diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 8c3591a7fbae..5454e2327a5f 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -42,6 +42,60 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m) return i; } +#define __decode_and_drop_type(p, end, type, bad) \ + do { \ + if (*p + sizeof(type) > end) \ + goto bad; \ + *p += sizeof(type); \ + } while (0) + +#define __decode_and_drop_set(p, end, type, bad) \ + do { \ + u32 n; \ + size_t need; \ + ceph_decode_32_safe(p, end, n, bad); \ + need = sizeof(type) * n; \ + ceph_decode_need(p, end, need, bad); \ + *p += need; \ + } while (0) + +#define __decode_and_drop_map(p, end, ktype, vtype, bad) \ + do { \ + u32 n; \ + size_t need; \ + ceph_decode_32_safe(p, end, n, bad); \ + need = (sizeof(ktype) + sizeof(vtype)) * n; \ + ceph_decode_need(p, end, need, bad); \ + *p += need; \ + } while (0) + + +static int __decode_and_drop_compat_set(void **p, void* end) +{ + int i; + /* compat, ro_compat, incompat*/ + for (i = 0; i < 3; i++) { + u32 n; + ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad); + /* mask */ + *p += sizeof(u64); + /* names (map<u64, string>) */ + n = ceph_decode_32(p); + while (n-- > 0) { + u32 len; + ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), + bad); + *p += sizeof(u64); + len = ceph_decode_32(p); + ceph_decode_need(p, end, len, bad); + *p += len; + } + } + return 0; +bad: + return -1; +} + /* * Decode an MDS map * @@ -55,6 +109,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) int i, j, n; int err = -EINVAL; u8 mdsmap_v, mdsmap_cv; + u16 mdsmap_ev; m = kzalloc(sizeof(*m), GFP_NOFS); if (m == NULL) @@ -83,7 +138,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_info = kcalloc(m->m_max_mds, sizeof(*m->m_info), GFP_NOFS); if (m->m_info == NULL) - goto badmem; + goto nomem; /* pick out active nodes from mds_info (state > 0) */ n = ceph_decode_32(p); @@ -166,7 +221,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) info->export_targets = kcalloc(num_export_targets, sizeof(u32), GFP_NOFS); if (info->export_targets == NULL) - goto badmem; + goto nomem; for (j = 0; j < num_export_targets; j++) info->export_targets[j] = ceph_decode_32(&pexport_targets); @@ -180,24 +235,104 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) m->m_num_data_pg_pools = n; m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS); if (!m->m_data_pg_pools) - goto badmem; + goto nomem; ceph_decode_need(p, end, sizeof(u64)*(n+1), bad); for (i = 0; i < n; i++) m->m_data_pg_pools[i] = ceph_decode_64(p); m->m_cas_pg_pool = ceph_decode_64(p); + m->m_enabled = 
m->m_epoch > 1; + + mdsmap_ev = 1; + if (mdsmap_v >= 2) { + ceph_decode_16_safe(p, end, mdsmap_ev, bad_ext); + } + if (mdsmap_ev >= 3) { + if (__decode_and_drop_compat_set(p, end) < 0) + goto bad_ext; + } + /* metadata_pool */ + if (mdsmap_ev < 5) { + __decode_and_drop_type(p, end, u32, bad_ext); + } else { + __decode_and_drop_type(p, end, u64, bad_ext); + } - /* ok, we don't care about the rest. */ + /* created + modified + tableserver */ + __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext); + __decode_and_drop_type(p, end, struct ceph_timespec, bad_ext); + __decode_and_drop_type(p, end, u32, bad_ext); + + /* in */ + { + int num_laggy = 0; + ceph_decode_32_safe(p, end, n, bad_ext); + ceph_decode_need(p, end, sizeof(u32) * n, bad_ext); + + for (i = 0; i < n; i++) { + s32 mds = ceph_decode_32(p); + if (mds >= 0 && mds < m->m_max_mds) { + if (m->m_info[mds].laggy) + num_laggy++; + } + } + m->m_num_laggy = num_laggy; + } + + /* inc */ + __decode_and_drop_map(p, end, u32, u32, bad_ext); + /* up */ + __decode_and_drop_map(p, end, u32, u64, bad_ext); + /* failed */ + __decode_and_drop_set(p, end, u32, bad_ext); + /* stopped */ + __decode_and_drop_set(p, end, u32, bad_ext); + + if (mdsmap_ev >= 4) { + /* last_failure_osd_epoch */ + __decode_and_drop_type(p, end, u32, bad_ext); + } + if (mdsmap_ev >= 6) { + /* ever_allowed_snaps */ + __decode_and_drop_type(p, end, u8, bad_ext); + /* explicitly_allowed_snaps */ + __decode_and_drop_type(p, end, u8, bad_ext); + } + if (mdsmap_ev >= 7) { + /* inline_data_enabled */ + __decode_and_drop_type(p, end, u8, bad_ext); + } + if (mdsmap_ev >= 8) { + u32 name_len; + /* enabled */ + ceph_decode_8_safe(p, end, m->m_enabled, bad_ext); + ceph_decode_32_safe(p, end, name_len, bad_ext); + ceph_decode_need(p, end, name_len, bad_ext); + *p += name_len; + } + /* damaged */ + if (mdsmap_ev >= 9) { + size_t need; + ceph_decode_32_safe(p, end, n, bad_ext); + need = sizeof(u32) * n; + ceph_decode_need(p, end, need, bad_ext); + *p += need; + m->m_damaged = n > 0; + } else { + m->m_damaged = false; + } +bad_ext: *p = end; dout("mdsmap_decode success epoch %u\n", m->m_epoch); return m; - -badmem: +nomem: err = -ENOMEM; + goto out_err; bad: pr_err("corrupt mdsmap\n"); print_hex_dump(KERN_DEBUG, "mdsmap: ", DUMP_PREFIX_OFFSET, 16, 1, start, end - start, true); +out_err: ceph_mdsmap_destroy(m); return ERR_PTR(err); } @@ -212,3 +347,19 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m) kfree(m->m_data_pg_pools); kfree(m); } + +bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m) +{ + int i, nr_active = 0; + if (!m->m_enabled) + return false; + if (m->m_damaged) + return false; + if (m->m_num_laggy > 0) + return false; + for (i = 0; i < m->m_max_mds; i++) { + if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE) + nr_active++; + } + return nr_active > 0; +} diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 9ff5219d849e..8f8b41c2ef0f 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -593,6 +593,8 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, capsnap->atime = inode->i_atime; capsnap->ctime = inode->i_ctime; capsnap->time_warp_seq = ci->i_time_warp_seq; + capsnap->truncate_size = ci->i_truncate_size; + capsnap->truncate_seq = ci->i_truncate_seq; if (capsnap->dirty_pages) { dout("finish_cap_snap %p cap_snap %p snapc %p %llu %s s=%llu " "still has %d dirty pages\n", inode, capsnap, diff --git a/fs/ceph/super.c b/fs/ceph/super.c index b382e5910eea..6bd20d707bfd 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -137,6 +137,8 @@ enum { Opt_nofscache, 
Opt_poolperm, Opt_nopoolperm, + Opt_require_active_mds, + Opt_norequire_active_mds, #ifdef CONFIG_CEPH_FS_POSIX_ACL Opt_acl, #endif @@ -171,6 +173,8 @@ static match_table_t fsopt_tokens = { {Opt_nofscache, "nofsc"}, {Opt_poolperm, "poolperm"}, {Opt_nopoolperm, "nopoolperm"}, + {Opt_require_active_mds, "require_active_mds"}, + {Opt_norequire_active_mds, "norequire_active_mds"}, #ifdef CONFIG_CEPH_FS_POSIX_ACL {Opt_acl, "acl"}, #endif @@ -287,6 +291,12 @@ static int parse_fsopt_token(char *c, void *private) case Opt_nopoolperm: fsopt->flags |= CEPH_MOUNT_OPT_NOPOOLPERM; break; + case Opt_require_active_mds: + fsopt->flags &= ~CEPH_MOUNT_OPT_MOUNTWAIT; + break; + case Opt_norequire_active_mds: + fsopt->flags |= CEPH_MOUNT_OPT_MOUNTWAIT; + break; #ifdef CONFIG_CEPH_FS_POSIX_ACL case Opt_acl: fsopt->sb_flags |= MS_POSIXACL; @@ -795,7 +805,6 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc, root = ERR_PTR(-ENOMEM); goto out; } - ceph_init_dentry(root); dout("open_root_inode success, root dentry is %p\n", root); } else { root = ERR_PTR(err); @@ -879,6 +888,7 @@ static int ceph_set_super(struct super_block *s, void *data) fsc->sb = s; s->s_op = &ceph_super_ops; + s->s_d_op = &ceph_dentry_ops; s->s_export_op = &ceph_export_ops; s->s_time_gran = 1000; /* 1000 ns == 1 us */ diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 3e3fa9163059..3373b61faefd 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -36,6 +36,7 @@ #define CEPH_MOUNT_OPT_DCACHE (1<<9) /* use dcache for readdir etc */ #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ #define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ +#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */ #define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE @@ -180,6 +181,8 @@ struct ceph_cap_snap { u64 size; struct timespec mtime, atime, ctime; u64 time_warp_seq; + u64 truncate_size; + u32 truncate_seq; int writing; /* a sync write is still in progress */ int dirty_pages; /* dirty pages awaiting writeback */ bool inline_data; @@ -905,6 +908,8 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, loff_t endoff, int *got, struct page **pinned_page); +extern int ceph_try_get_caps(struct ceph_inode_info *ci, + int need, int want, int *got); /* for counting open files by mode */ extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode); @@ -934,8 +939,7 @@ extern const struct file_operations ceph_dir_fops; extern const struct file_operations ceph_snapdir_fops; extern const struct inode_operations ceph_dir_iops; extern const struct inode_operations ceph_snapdir_iops; -extern const struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops, - ceph_snapdir_dentry_ops; +extern const struct dentry_operations ceph_dentry_ops; extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); @@ -951,13 +955,6 @@ extern void ceph_invalidate_dentry_lease(struct dentry *dentry); extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); -/* - * our d_ops vary depending on whether the inode is live, - * snapshotted (read-only), or a virtual ".snap" directory. 
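The require_active_mds / norequire_active_mds pair added above follows the usual match_table_t idiom: a token table maps option strings to enum values, and paired tokens set and clear the same bit in the mount-flags word. A minimal sketch of that pattern, with invented names (my_tokens, MY_OPT_MOUNTWAIT) rather than the actual ceph tables:

#include <linux/parser.h>

#define MY_OPT_MOUNTWAIT (1 << 0)	/* hypothetical flag bit */

enum { Opt_wait, Opt_nowait, Opt_err };

static const match_table_t my_tokens = {
	{ Opt_wait,   "wait" },
	{ Opt_nowait, "nowait" },
	{ Opt_err,    NULL }
};

/* Parse one comma-separated mount option into *flags. */
static int my_parse_opt(char *p, unsigned long *flags)
{
	substring_t args[MAX_OPT_ARGS];

	switch (match_token(p, my_tokens, args)) {
	case Opt_wait:
		*flags |= MY_OPT_MOUNTWAIT;
		break;
	case Opt_nowait:
		*flags &= ~MY_OPT_MOUNTWAIT;	/* sibling clears the bit */
		break;
	default:
		return -EINVAL;
	}
	return 0;
}

Spelling the negative option as its own token, instead of parsing a "no" prefix, keeps the table greppable and lets whichever spelling appears last win.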
- */ -int ceph_init_dentry(struct dentry *dentry); - - /* ioctl.c */ extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 3d03e48a9213..9727e1dcacd5 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -24,7 +24,7 @@ #include <linux/ctype.h> #include <linux/module.h> #include <linux/proc_fs.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 8347c90cf483..66bd7fa9b7a6 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -699,11 +699,15 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) { if (!ses->domainName) { - rc = find_domain_name(ses, nls_cp); - if (rc) { - cifs_dbg(VFS, "error %d finding domain name\n", - rc); - goto setup_ntlmv2_rsp_ret; + if (ses->domainAuto) { + rc = find_domain_name(ses, nls_cp); + if (rc) { + cifs_dbg(VFS, "error %d finding domain name\n", + rc); + goto setup_ntlmv2_rsp_ret; + } + } else { + ses->domainName = kstrdup("", GFP_KERNEL); } } } else { @@ -808,7 +812,11 @@ calc_seckey(struct cifs_ses *ses) struct crypto_skcipher *tfm_arc4; struct scatterlist sgin, sgout; struct skcipher_request *req; - unsigned char sec_key[CIFS_SESS_KEY_SIZE]; /* a nonce */ + unsigned char *sec_key; + + sec_key = kmalloc(CIFS_SESS_KEY_SIZE, GFP_KERNEL); + if (sec_key == NULL) + return -ENOMEM; get_random_bytes(sec_key, CIFS_SESS_KEY_SIZE); @@ -816,7 +824,7 @@ calc_seckey(struct cifs_ses *ses) if (IS_ERR(tfm_arc4)) { rc = PTR_ERR(tfm_arc4); cifs_dbg(VFS, "could not allocate crypto API arc4\n"); - return rc; + goto out; } rc = crypto_skcipher_setkey(tfm_arc4, ses->auth_key.response, @@ -854,7 +862,8 @@ calc_seckey(struct cifs_ses *ses) out_free_cipher: crypto_free_skcipher(tfm_arc4); - +out: + kfree(sec_key); return rc; } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 15261ba464c5..70f4e65fced2 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -615,7 +615,7 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) return dget(sb->s_root); full_path = cifs_build_path_to_root(vol, cifs_sb, - cifs_sb_master_tcon(cifs_sb)); + cifs_sb_master_tcon(cifs_sb), 0); if (full_path == NULL) return ERR_PTR(-ENOMEM); @@ -914,7 +914,6 @@ const struct inode_operations cifs_file_inode_ops = { }; const struct inode_operations cifs_symlink_inode_ops = { - .readlink = generic_readlink, .get_link = cifs_get_link, .permission = cifs_permission, .listxattr = cifs_listxattr, diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 1f17f6bd7a60..7ea8a3393936 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -514,6 +514,7 @@ struct smb_vol { bool persistent:1; bool nopersistent:1; bool resilient:1; /* noresilient not required since not fored for CA */ + bool domainauto:1; unsigned int rsize; unsigned int wsize; bool sockopt_tcp_nodelay:1; @@ -525,6 +526,7 @@ struct smb_vol { struct sockaddr_storage srcaddr; /* allow binding to a local IP */ struct nls_table *local_nls; unsigned int echo_interval; /* echo interval in secs */ + __u64 snapshot_time; /* needed for timewarp tokens */ unsigned int max_credits; /* smb3 max_credits 10 < credits < 60000 */ }; @@ -646,6 +648,8 @@ struct TCP_Server_Info { unsigned int max_read; unsigned int max_write; __u8 preauth_hash[512]; + struct delayed_work reconnect; /* reconnect workqueue job */ + struct mutex reconnect_mutex; /* 
prevent simultaneous reconnects */ #endif /* CONFIG_CIFS_SMB2 */ unsigned long echo_interval; }; @@ -827,6 +831,7 @@ struct cifs_ses { enum securityEnum sectype; /* what security flavor was specified? */ bool sign; /* is signing required? */ bool need_reconnect:1; /* connection reset, uid now invalid */ + bool domainAuto:1; #ifdef CONFIG_CIFS_SMB2 __u16 session_flags; __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; @@ -849,6 +854,7 @@ cap_unix(struct cifs_ses *ses) struct cifs_tcon { struct list_head tcon_list; int tc_count; + struct list_head rlist; /* reconnect list */ struct list_head openFileList; spinlock_t open_file_lock; /* protects list above */ struct cifs_ses *ses; /* pointer to session associated with */ @@ -922,6 +928,7 @@ struct cifs_tcon { bool broken_posix_open; /* e.g. Samba server versions < 3.3.2, 3.2.9 */ bool broken_sparse_sup; /* if server or share does not support sparse */ bool need_reconnect:1; /* connection reset, tid now invalid */ + bool need_reopen_files:1; /* need to reopen tcon file handles */ bool use_resilient:1; /* use resilient instead of durable handles */ bool use_persistent:1; /* use persistent instead of durable handles */ #ifdef CONFIG_CIFS_SMB2 @@ -932,6 +939,7 @@ struct cifs_tcon { __u32 maximal_access; __u32 vol_serial_number; __le64 vol_create_time; + __u64 snapshot_time; /* for timewarp tokens - timestamp of snapshot */ __u32 ss_flags; /* sector size flags */ __u32 perf_sector_size; /* best sector size for perf */ __u32 max_chunks; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index ced0e42ce460..c7b3c841e660 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -63,7 +63,8 @@ extern void exit_cifs_spnego(void); extern char *build_path_from_dentry(struct dentry *); extern char *cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, - struct cifs_tcon *tcon); + struct cifs_tcon *tcon, + int add_treename); extern char *build_wildcard_path_from_dentry(struct dentry *direntry); extern char *cifs_compose_mount_options(const char *sb_mountdata, const char *fullpath, const struct dfs_info3_param *ref, @@ -206,6 +207,9 @@ extern void cifs_add_pending_open_locked(struct cifs_fid *fid, struct tcon_link *tlink, struct cifs_pending_open *open); extern void cifs_del_pending_open(struct cifs_pending_open *open); +extern void cifs_put_tcp_session(struct TCP_Server_Info *server, + int from_reconnect); +extern void cifs_put_tcon(struct cifs_tcon *tcon); #if IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) extern void cifs_dfs_release_automount_timer(void); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 3f3185febc58..b47261858e6d 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -35,7 +35,7 @@ #include <linux/pagemap.h> #include <linux/swap.h> #include <linux/task_io_accounting_ops.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsacl.h" @@ -3427,6 +3427,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, __u16 rc = 0; struct cifs_posix_acl *cifs_acl = (struct cifs_posix_acl *)parm_data; struct posix_acl_xattr_header *local_acl = (void *)pACL; + struct posix_acl_xattr_entry *ace = (void *)(local_acl + 1); int count; int i; @@ -3453,8 +3454,7 @@ static __u16 ACL_to_cifs_posix(char *parm_data, const char *pACL, return 0; } for (i = 0; i < count; i++) { - rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], - (struct posix_acl_xattr_entry *)(local_acl + 1)); + rc = convert_ace_to_cifs_ace(&cifs_acl->ace_array[i], &ace[i]); if (rc != 0) { /* ACE not 
converted */ break; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index aab5227979e2..35ae49ed1f76 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -34,13 +34,14 @@ #include <linux/pagevec.h> #include <linux/freezer.h> #include <linux/namei.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/processor.h> #include <linux/inet.h> #include <linux/module.h> #include <keys/user-type.h> #include <net/ipv6.h> #include <linux/parser.h> +#include <linux/bvec.h> #include "cifspdu.h" #include "cifsglob.h" @@ -52,6 +53,9 @@ #include "nterr.h" #include "rfc1002pdu.h" #include "fscache.h" +#ifdef CONFIG_CIFS_SMB2 +#include "smb2proto.h" +#endif #define CIFS_PORT 445 #define RFC1001_PORT 139 @@ -88,6 +92,7 @@ enum { Opt_multiuser, Opt_sloppy, Opt_nosharesock, Opt_persistent, Opt_nopersistent, Opt_resilient, Opt_noresilient, + Opt_domainauto, /* Mount options which take numeric value */ Opt_backupuid, Opt_backupgid, Opt_uid, @@ -95,6 +100,7 @@ enum { Opt_dirmode, Opt_port, Opt_rsize, Opt_wsize, Opt_actimeo, Opt_echo_interval, Opt_max_credits, + Opt_snapshot, /* Mount options which take string value */ Opt_user, Opt_pass, Opt_ip, @@ -176,6 +182,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_nopersistent, "nopersistenthandles"}, { Opt_resilient, "resilienthandles"}, { Opt_noresilient, "noresilienthandles"}, + { Opt_domainauto, "domainauto"}, { Opt_backupuid, "backupuid=%s" }, { Opt_backupgid, "backupgid=%s" }, @@ -191,6 +198,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_actimeo, "actimeo=%s" }, { Opt_echo_interval, "echo_interval=%s" }, { Opt_max_credits, "max_credits=%s" }, + { Opt_snapshot, "snapshot=%s" }, { Opt_blank_user, "user=" }, { Opt_blank_user, "username=" }, @@ -412,6 +420,9 @@ cifs_reconnect(struct TCP_Server_Info *server) } } while (server->tcpStatus == CifsNeedReconnect); + if (server->tcpStatus == CifsNeedNegotiate) + mod_delayed_work(cifsiod_wq, &server->echo, 0); + return rc; } @@ -421,17 +432,25 @@ cifs_echo_request(struct work_struct *work) int rc; struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, echo.work); - unsigned long echo_interval = server->echo_interval; + unsigned long echo_interval; /* - * We cannot send an echo if it is disabled or until the - * NEGOTIATE_PROTOCOL request is done, which is indicated by - * server->ops->need_neg() == true. Also, no need to ping if - * we got a response recently. + * If we need to renegotiate, set echo interval to zero to + * immediately call echo service where we can renegotiate. + */ + if (server->tcpStatus == CifsNeedNegotiate) + echo_interval = 0; + else + echo_interval = server->echo_interval; + + /* + * We cannot send an echo if it is disabled. + * Also, no need to ping if we got a response recently. 
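The echo rework above turns the periodic echo into a self-rearming delayed work whose behaviour depends on connection state: when a renegotiate is pending the effective interval drops to zero so the work runs (and reconnects) immediately, while the requeue at the end always uses the configured interval. A generic sketch of that pattern, with invented names (struct my_dev, needs_kick) standing in for the cifs structures:

#include <linux/workqueue.h>

struct my_dev {
	struct delayed_work ping;
	unsigned long interval;	/* normal period, in jiffies */
	bool needs_kick;	/* state that wants immediate service */
};

static void my_ping_fn(struct work_struct *work)
{
	struct my_dev *dev = container_of(work, struct my_dev, ping.work);

	if (dev->needs_kick) {
		/* ... handle the urgent state (e.g. renegotiate) ... */
	} else {
		/* ... ordinary periodic work (e.g. send an echo) ... */
	}

	/* Self-rearm; the job lives until someone cancels it. */
	queue_delayed_work(system_wq, &dev->ping, dev->interval);
}

static void my_dev_start(struct my_dev *dev)
{
	INIT_DELAYED_WORK(&dev->ping, my_ping_fn);
	queue_delayed_work(system_wq, &dev->ping, dev->interval);
}

The mod_delayed_work(cifsiod_wq, &server->echo, 0) call added to cifs_reconnect() is the matching "kick now" operation: if the work is already queued it simply moves its timer forward rather than queueing a duplicate instance.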
*/ if (server->tcpStatus == CifsNeedReconnect || - server->tcpStatus == CifsExiting || server->tcpStatus == CifsNew || + server->tcpStatus == CifsExiting || + server->tcpStatus == CifsNew || (server->ops->can_echo && !server->ops->can_echo(server)) || time_before(jiffies, server->lstrp + echo_interval - HZ)) goto requeue_echo; @@ -442,7 +461,7 @@ cifs_echo_request(struct work_struct *work) server->hostname); requeue_echo: - queue_delayed_work(cifsiod_wq, &server->echo, echo_interval); + queue_delayed_work(cifsiod_wq, &server->echo, server->echo_interval); } static bool @@ -1488,6 +1507,9 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, case Opt_noresilient: vol->resilient = false; /* already the default */ break; + case Opt_domainauto: + vol->domainauto = true; + break; /* Numeric Values */ case Opt_backupuid: @@ -1590,6 +1612,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, } vol->echo_interval = option; break; + case Opt_snapshot: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid snapshot time\n", + __func__); + goto cifs_parse_mount_err; + } + vol->snapshot_time = option; + break; case Opt_max_credits: if (get_option_ul(args, &option) || (option < 20) || (option > 60000)) { @@ -2089,8 +2119,8 @@ cifs_find_tcp_session(struct smb_vol *vol) return NULL; } -static void -cifs_put_tcp_session(struct TCP_Server_Info *server) +void +cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect) { struct task_struct *task; @@ -2107,6 +2137,19 @@ cifs_put_tcp_session(struct TCP_Server_Info *server) cancel_delayed_work_sync(&server->echo); +#ifdef CONFIG_CIFS_SMB2 + if (from_reconnect) + /* + * Avoid deadlock here: reconnect work calls + * cifs_put_tcp_session() at its end. Need to be sure + * that reconnect work does nothing with server pointer after + * that step. 
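The deadlock note above generalizes: a delayed work must never be cancelled with cancel_delayed_work_sync() on a path the work function itself can reach, because the _sync variant waits for the running instance to finish and would end up waiting on itself. A sketch of the two teardown flavours (function names invented):

#include <linux/workqueue.h>

/*
 * Teardown called from "outside" the work: waiting is safe and
 * guarantees no instance is still running afterwards.
 */
static void my_teardown(struct delayed_work *dw)
{
	cancel_delayed_work_sync(dw);
}

/*
 * Teardown reachable from the work function itself (here, the reconnect
 * work ends by dropping the last server reference): only the
 * non-waiting cancel is safe; it merely deactivates a pending instance.
 */
static void my_teardown_from_work(struct delayed_work *dw)
{
	cancel_delayed_work(dw);
}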
+ */ + cancel_delayed_work(&server->reconnect); + else + cancel_delayed_work_sync(&server->reconnect); +#endif + spin_lock(&GlobalMid_Lock); server->tcpStatus = CifsExiting; spin_unlock(&GlobalMid_Lock); @@ -2171,6 +2214,10 @@ cifs_get_tcp_session(struct smb_vol *volume_info) INIT_LIST_HEAD(&tcp_ses->tcp_ses_list); INIT_LIST_HEAD(&tcp_ses->smb_ses_list); INIT_DELAYED_WORK(&tcp_ses->echo, cifs_echo_request); +#ifdef CONFIG_CIFS_SMB2 + INIT_DELAYED_WORK(&tcp_ses->reconnect, smb2_reconnect_server); + mutex_init(&tcp_ses->reconnect_mutex); +#endif memcpy(&tcp_ses->srcaddr, &volume_info->srcaddr, sizeof(tcp_ses->srcaddr)); memcpy(&tcp_ses->dstaddr, &volume_info->dstaddr, @@ -2329,7 +2376,7 @@ cifs_put_smb_ses(struct cifs_ses *ses) spin_unlock(&cifs_tcp_ses_lock); sesInfoFree(ses); - cifs_put_tcp_session(server); + cifs_put_tcp_session(server, 0); } #ifdef CONFIG_KEYS @@ -2503,7 +2550,7 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) mutex_unlock(&ses->session_mutex); /* existing SMB ses has a server reference already */ - cifs_put_tcp_session(server); + cifs_put_tcp_session(server, 0); free_xid(xid); return ses; } @@ -2537,6 +2584,8 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb_vol *volume_info) if (!ses->domainName) goto get_ses_fail; } + if (volume_info->domainauto) + ses->domainAuto = volume_info->domainauto; ses->cred_uid = volume_info->cred_uid; ses->linux_uid = volume_info->linux_uid; @@ -2575,7 +2624,7 @@ static int match_tcon(struct cifs_tcon *tcon, const char *unc) } static struct cifs_tcon * -cifs_find_tcon(struct cifs_ses *ses, const char *unc) +cifs_find_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) { struct list_head *tmp; struct cifs_tcon *tcon; @@ -2583,8 +2632,14 @@ cifs_find_tcon(struct cifs_ses *ses, const char *unc) spin_lock(&cifs_tcp_ses_lock); list_for_each(tmp, &ses->tcon_list) { tcon = list_entry(tmp, struct cifs_tcon, tcon_list); - if (!match_tcon(tcon, unc)) + if (!match_tcon(tcon, volume_info->UNC)) + continue; + +#ifdef CONFIG_CIFS_SMB2 + if (tcon->snapshot_time != volume_info->snapshot_time) continue; +#endif /* CONFIG_CIFS_SMB2 */ + ++tcon->tc_count; spin_unlock(&cifs_tcp_ses_lock); return tcon; @@ -2593,7 +2648,7 @@ cifs_find_tcon(struct cifs_ses *ses, const char *unc) return NULL; } -static void +void cifs_put_tcon(struct cifs_tcon *tcon) { unsigned int xid; @@ -2625,7 +2680,7 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) int rc, xid; struct cifs_tcon *tcon; - tcon = cifs_find_tcon(ses, volume_info->UNC); + tcon = cifs_find_tcon(ses, volume_info); if (tcon) { cifs_dbg(FYI, "Found match on UNC path\n"); /* existing tcon already has a reference */ @@ -2646,6 +2701,22 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb_vol *volume_info) goto out_fail; } + if (volume_info->snapshot_time) { +#ifdef CONFIG_CIFS_SMB2 + if (ses->server->vals->protocol_id == 0) { + cifs_dbg(VFS, + "Use SMB2 or later for snapshot mount option\n"); + rc = -EOPNOTSUPP; + goto out_fail; + } else + tcon->snapshot_time = volume_info->snapshot_time; +#else + cifs_dbg(VFS, "Snapshot mount option requires SMB2 support\n"); + rc = -EOPNOTSUPP; + goto out_fail; +#endif /* CONFIG_CIFS_SMB2 */ + } + tcon->ses = ses; if (volume_info->password) { tcon->password = kstrdup(volume_info->password, GFP_KERNEL); @@ -3695,7 +3766,8 @@ remote_path_check: /* * cifs_build_path_to_root works only when we have a valid tcon */ - full_path = cifs_build_path_to_root(volume_info, cifs_sb, tcon); + full_path = 
cifs_build_path_to_root(volume_info, cifs_sb, tcon, + tcon->Flags & SMB_SHARE_IS_IN_DFS); if (full_path == NULL) { rc = -ENOMEM; goto mount_fail_check; @@ -3781,7 +3853,7 @@ mount_fail_check: else if (ses) cifs_put_smb_ses(ses); else - cifs_put_tcp_session(server); + cifs_put_tcp_session(server, 0); bdi_destroy(&cifs_sb->bdi); } @@ -4092,7 +4164,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) ses = cifs_get_smb_ses(master_tcon->ses->server, vol_info); if (IS_ERR(ses)) { tcon = (struct cifs_tcon *)ses; - cifs_put_tcp_session(master_tcon->ses->server); + cifs_put_tcp_session(master_tcon->ses->server, 0); goto out; } diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 789ff1df2d8d..2c227a99f369 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -47,7 +47,7 @@ renew_parental_timestamps(struct dentry *direntry) char * cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, - struct cifs_tcon *tcon) + struct cifs_tcon *tcon, int add_treename) { int pplen = vol->prepath ? strlen(vol->prepath) + 1 : 0; int dfsplen; @@ -59,7 +59,7 @@ cifs_build_path_to_root(struct smb_vol *vol, struct cifs_sb_info *cifs_sb, return full_path; } - if (tcon->Flags & SMB_SHARE_IS_IN_DFS) + if (add_treename) dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); else dfsplen = 0; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 7f5f6176c6f1..18a1e1d6671f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -777,6 +777,11 @@ cifs_reopen_persistent_handles(struct cifs_tcon *tcon) struct list_head *tmp1; struct list_head tmp_list; + if (!tcon->use_persistent || !tcon->need_reopen_files) + return; + + tcon->need_reopen_files = false; + cifs_dbg(FYI, "Reopen persistent handles"); INIT_LIST_HEAD(&tmp_list); @@ -793,7 +798,8 @@ cifs_reopen_persistent_handles(struct cifs_tcon *tcon) list_for_each_safe(tmp, tmp1, &tmp_list) { open_file = list_entry(tmp, struct cifsFileInfo, rlist); - cifs_reopen_file(open_file, false /* do not flush */); + if (cifs_reopen_file(open_file, false /* do not flush */)) + tcon->need_reopen_files = true; list_del_init(&open_file->rlist); cifsFileInfo_put(open_file); } diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 9f51b81119f2..001528781b6b 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -189,7 +189,7 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) xid = get_xid(); cifs_sb = CIFS_SB(inode->i_sb); - cifs_dbg(VFS, "cifs ioctl 0x%x\n", command); + cifs_dbg(FYI, "cifs ioctl 0x%x\n", command); switch (command) { case FS_IOC_GETFLAGS: if (pSMBFile == NULL) diff --git a/fs/cifs/link.c b/fs/cifs/link.c index d031af8d3d4d..c4d996f78e1c 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -45,13 +45,8 @@ (CIFS_MF_SYMLINK_LINK_OFFSET + CIFS_MF_SYMLINK_LINK_MAXLEN) #define CIFS_MF_SYMLINK_LEN_FORMAT "XSym\n%04u\n" -#define CIFS_MF_SYMLINK_MD5_FORMAT \ - "%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n" -#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) \ - md5_hash[0], md5_hash[1], md5_hash[2], md5_hash[3], \ - md5_hash[4], md5_hash[5], md5_hash[6], md5_hash[7], \ - md5_hash[8], md5_hash[9], md5_hash[10], md5_hash[11],\ - md5_hash[12], md5_hash[13], md5_hash[14], md5_hash[15] +#define CIFS_MF_SYMLINK_MD5_FORMAT "%16phN\n" +#define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) md5_hash static int symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index f9e766f464be..b2aff0c6f22c 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -260,7 +260,7 @@ 
smb2_push_mandatory_locks(struct cifsFileInfo *cfile) * and check it for zero before using. */ max_buf = tlink_tcon(cfile->tlink)->ses->server->maxBuf; - if (!max_buf) { + if (max_buf < sizeof(struct smb2_lock_element)) { free_xid(xid); return -EINVAL; } diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 5ca5ea4668a1..87457227812c 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -250,16 +250,19 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon) } cifs_mark_open_files_invalid(tcon); + if (tcon->use_persistent) + tcon->need_reopen_files = true; rc = SMB2_tcon(0, tcon->ses, tcon->treeName, tcon, nls_codepage); mutex_unlock(&tcon->ses->session_mutex); - if (tcon->use_persistent) - cifs_reopen_persistent_handles(tcon); - cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); if (rc) goto out; + + if (smb2_command != SMB2_INTERNAL_CMD) + queue_delayed_work(cifsiod_wq, &server->reconnect, 0); + atomic_inc(&tconInfoReconnectCount); out: /* @@ -280,7 +283,7 @@ out: case SMB2_CHANGE_NOTIFY: case SMB2_QUERY_INFO: case SMB2_SET_INFO: - return -EAGAIN; + rc = -EAGAIN; } unload_nls(nls_codepage); return rc; @@ -1972,6 +1975,55 @@ smb2_echo_callback(struct mid_q_entry *mid) add_credits(server, credits_received, CIFS_ECHO_OP); } +void smb2_reconnect_server(struct work_struct *work) +{ + struct TCP_Server_Info *server = container_of(work, + struct TCP_Server_Info, reconnect.work); + struct cifs_ses *ses; + struct cifs_tcon *tcon, *tcon2; + struct list_head tmp_list; + int tcon_exist = false; + + /* Prevent simultaneous reconnects that can corrupt tcon->rlist list */ + mutex_lock(&server->reconnect_mutex); + + INIT_LIST_HEAD(&tmp_list); + cifs_dbg(FYI, "Need negotiate, reconnecting tcons\n"); + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) { + if (tcon->need_reconnect || tcon->need_reopen_files) { + tcon->tc_count++; + list_add_tail(&tcon->rlist, &tmp_list); + tcon_exist = true; + } + } + } + /* + * Get the reference to server struct to be sure that the last call of + * cifs_put_tcon() in the loop below won't release the server pointer. 
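smb2_reconnect_server() above uses a standard pattern for doing sleeping work on objects discovered under a spinlock: pin each match with a reference and collect it on a private list while the lock is held, then drop the lock and walk the private list with the _safe iterator. A generic sketch (struct item and the callbacks are invented; a real implementation would likely use kref for the pin):

#include <linux/list.h>
#include <linux/spinlock.h>

struct item {
	struct list_head link;	/* membership on the shared list */
	struct list_head rlist;	/* membership on our private snapshot */
	int refcount;		/* sketch only */
};

static void process_matching(spinlock_t *lock, struct list_head *shared,
			     bool (*match)(struct item *),
			     void (*slow_op)(struct item *),
			     void (*put)(struct item *))
{
	LIST_HEAD(snapshot);
	struct item *it, *next;

	spin_lock(lock);
	list_for_each_entry(it, shared, link) {
		if (match(it)) {
			it->refcount++;	/* pin: survives the unlock */
			list_add_tail(&it->rlist, &snapshot);
		}
	}
	spin_unlock(lock);

	/* _safe because put() may free the entry just visited. */
	list_for_each_entry_safe(it, next, &snapshot, rlist) {
		slow_op(it);		/* may sleep; lock is not held */
		list_del_init(&it->rlist);
		put(it);
	}
}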
+ */ + if (tcon_exist) + server->srv_count++; + + spin_unlock(&cifs_tcp_ses_lock); + + list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) { + if (!smb2_reconnect(SMB2_INTERNAL_CMD, tcon)) + cifs_reopen_persistent_handles(tcon); + list_del_init(&tcon->rlist); + cifs_put_tcon(tcon); + } + + cifs_dbg(FYI, "Reconnecting tcons finished\n"); + mutex_unlock(&server->reconnect_mutex); + + /* now we can safely release srv struct */ + if (tcon_exist) + cifs_put_tcp_session(server, 1); +} + int SMB2_echo(struct TCP_Server_Info *server) { @@ -1984,32 +2036,11 @@ SMB2_echo(struct TCP_Server_Info *server) cifs_dbg(FYI, "In echo request\n"); if (server->tcpStatus == CifsNeedNegotiate) { - struct list_head *tmp, *tmp2; - struct cifs_ses *ses; - struct cifs_tcon *tcon; - - cifs_dbg(FYI, "Need negotiate, reconnecting tcons\n"); - spin_lock(&cifs_tcp_ses_lock); - list_for_each(tmp, &server->smb_ses_list) { - ses = list_entry(tmp, struct cifs_ses, smb_ses_list); - list_for_each(tmp2, &ses->tcon_list) { - tcon = list_entry(tmp2, struct cifs_tcon, - tcon_list); - /* add check for persistent handle reconnect */ - if (tcon && tcon->need_reconnect) { - spin_unlock(&cifs_tcp_ses_lock); - rc = smb2_reconnect(SMB2_ECHO, tcon); - spin_lock(&cifs_tcp_ses_lock); - } - } - } - spin_unlock(&cifs_tcp_ses_lock); + /* No need to send echo on newly established connections */ + queue_delayed_work(cifsiod_wq, &server->reconnect, 0); + return rc; } - /* if no session, renegotiate failed above */ - if (server->tcpStatus == CifsNeedNegotiate) - return -EIO; - rc = small_smb2_init(SMB2_ECHO, NULL, (void **)&req); if (rc) return rc; diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index fd3709e8de33..dc0d141f33e2 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -80,6 +80,8 @@ #define SMB2_SET_INFO cpu_to_le16(SMB2_SET_INFO_HE) #define SMB2_OPLOCK_BREAK cpu_to_le16(SMB2_OPLOCK_BREAK_HE) +#define SMB2_INTERNAL_CMD cpu_to_le16(0xFFFF) + #define NUMBER_OF_SMB2_COMMANDS 0x0013 /* BB FIXME - analyze following length BB */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index eb2cde2f64ba..f2d511a6971b 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -96,6 +96,7 @@ extern int smb2_open_file(const unsigned int xid, extern int smb2_unlock_range(struct cifsFileInfo *cfile, struct file_lock *flock, const unsigned int xid); extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile); +extern void smb2_reconnect_server(struct work_struct *work); /* * SMB2 Worker functions - most of protocol specific implementation details diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 699b7868108f..c12bffefa3c9 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -23,7 +23,7 @@ Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
*/ -#include <crypto/skcipher.h> +#include <linux/crypto.h> #include <linux/module.h> #include <linux/slab.h> #include <linux/fs.h> @@ -69,46 +69,22 @@ str_to_key(unsigned char *str, unsigned char *key) static int smbhash(unsigned char *out, const unsigned char *in, unsigned char *key) { - int rc; unsigned char key2[8]; - struct crypto_skcipher *tfm_des; - struct scatterlist sgin, sgout; - struct skcipher_request *req; + struct crypto_cipher *tfm_des; str_to_key(key, key2); - tfm_des = crypto_alloc_skcipher("ecb(des)", 0, CRYPTO_ALG_ASYNC); + tfm_des = crypto_alloc_cipher("des", 0, 0); if (IS_ERR(tfm_des)) { - rc = PTR_ERR(tfm_des); - cifs_dbg(VFS, "could not allocate des crypto API\n"); - goto smbhash_err; - } - - req = skcipher_request_alloc(tfm_des, GFP_KERNEL); - if (!req) { - rc = -ENOMEM; cifs_dbg(VFS, "could not allocate des crypto API\n"); - goto smbhash_free_skcipher; + return PTR_ERR(tfm_des); } - crypto_skcipher_setkey(tfm_des, key2, 8); - - sg_init_one(&sgin, in, 8); - sg_init_one(&sgout, out, 8); + crypto_cipher_setkey(tfm_des, key2, 8); + crypto_cipher_encrypt_one(tfm_des, out, in); + crypto_free_cipher(tfm_des); - skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, &sgin, &sgout, 8, NULL); - - rc = crypto_skcipher_encrypt(req); - if (rc) - cifs_dbg(VFS, "could not encrypt crypt key rc: %d\n", rc); - - skcipher_request_free(req); - -smbhash_free_skcipher: - crypto_free_skcipher(tfm_des); -smbhash_err: - return rc; + return 0; } static int diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 206a597b2293..fbb84c08e3cd 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -28,8 +28,9 @@ #include <linux/delay.h> #include <linux/freezer.h> #include <linux/tcp.h> +#include <linux/bvec.h> #include <linux/highmem.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/processor.h> #include <linux/mempool.h> #include "cifspdu.h" diff --git a/fs/coda/cnode.c b/fs/coda/cnode.c index 1bfb7ba4e85e..f13e09057c6b 100644 --- a/fs/coda/cnode.c +++ b/fs/coda/cnode.c @@ -17,7 +17,6 @@ static inline int coda_fideq(struct CodaFid *fid1, struct CodaFid *fid2) } static const struct inode_operations coda_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .setattr = coda_setattr, }; diff --git a/fs/compat.c b/fs/compat.c index bd064a2c3550..e50a2114f474 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -49,7 +49,7 @@ #include <linux/pagemap.h> #include <linux/aio.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/ioctls.h> #include "internal.h" @@ -253,9 +253,9 @@ COMPAT_SYSCALL_DEFINE2(fstatfs, unsigned int, fd, struct compat_statfs __user *, static int put_compat_statfs64(struct compat_statfs64 __user *ubuf, struct kstatfs *kbuf) { - if (sizeof ubuf->f_blocks == 4) { - if ((kbuf->f_blocks | kbuf->f_bfree | kbuf->f_bavail | - kbuf->f_bsize | kbuf->f_frsize) & 0xffffffff00000000ULL) + if (sizeof(ubuf->f_bsize) == 4) { + if ((kbuf->f_type | kbuf->f_bsize | kbuf->f_namelen | + kbuf->f_frsize | kbuf->f_flags) & 0xffffffff00000000ULL) return -EOVERFLOW; /* f_files and f_ffree may be -1; it's okay * to stuff that into 32 bits */ @@ -487,45 +487,6 @@ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, return compat_sys_fcntl64(fd, cmd, arg); } -COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p) -{ - long ret; - aio_context_t ctx64; - - mm_segment_t oldfs = get_fs(); - if (unlikely(get_user(ctx64, ctx32p))) - return -EFAULT; 
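The put_compat_statfs64() fix earlier in this hunk also shows the standard test for narrowing 64-bit values into 32-bit compat fields: OR the candidates together and test the high 32 bits once, instead of comparing field by field. A tiny self-contained sketch (the helper name is invented):

#include <linux/types.h>
#include <linux/errno.h>

/* Return -EOVERFLOW if any of the values needs more than 32 bits. */
static int my_check_fits_u32(u64 a, u64 b, u64 c)
{
	if ((a | b | c) & 0xffffffff00000000ULL)
		return -EOVERFLOW;
	return 0;
}

As the surviving comment in that hunk notes, fields that may legitimately hold -1 (f_files, f_ffree) must stay out of the OR, which is why the fix enumerates the checked fields explicitly.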
- - set_fs(KERNEL_DS); - /* The __user pointer cast is valid because of the set_fs() */ - ret = sys_io_setup(nr_reqs, (aio_context_t __user *) &ctx64); - set_fs(oldfs); - /* truncating is ok because it's a user address */ - if (!ret) - ret = put_user((u32) ctx64, ctx32p); - return ret; -} - -COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, - compat_long_t, min_nr, - compat_long_t, nr, - struct io_event __user *, events, - struct compat_timespec __user *, timeout) -{ - struct timespec t; - struct timespec __user *ut = NULL; - - if (timeout) { - if (compat_get_timespec(&t, timeout)) - return -EFAULT; - - ut = compat_alloc_user_space(sizeof(*ut)); - if (copy_to_user(ut, &t, sizeof(t)) ) - return -EFAULT; - } - return sys_io_getevents(ctx_id, min_nr, nr, events, ut); -} - /* A write operation does a read from user space and vice versa */ #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) @@ -602,42 +563,6 @@ out: return ret; } -static inline long -copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) -{ - compat_uptr_t uptr; - int i; - - for (i = 0; i < nr; ++i) { - if (get_user(uptr, ptr32 + i)) - return -EFAULT; - if (put_user(compat_ptr(uptr), ptr64 + i)) - return -EFAULT; - } - return 0; -} - -#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) - -COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, - int, nr, u32 __user *, iocb) -{ - struct iocb __user * __user *iocb64; - long ret; - - if (unlikely(nr < 0)) - return -EINVAL; - - if (nr > MAX_AIO_SUBMITS) - nr = MAX_AIO_SUBMITS; - - iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); - ret = copy_iocb(nr, iocb, iocb64); - if (!ret) - ret = do_io_submit(ctx_id, nr, iocb64, 1); - return ret; -} - struct compat_ncp_mount_data { compat_int_t version; compat_uint_t ncp_fd; diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index f2d7402abe02..11d087b2b28e 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -76,7 +76,7 @@ #include <scsi/sg.h> #endif -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/ethtool.h> #include <linux/mii.h> #include <linux/if_bonding.h> diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 2c6312db8516..39da1103d341 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -29,7 +29,7 @@ #include <linux/slab.h> #include <linux/mutex.h> #include <linux/vmalloc.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/configfs.h> #include "configfs_internal.h" diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index db6d69289608..a6ab012a2c6a 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -305,7 +305,6 @@ static const char *configfs_get_link(struct dentry *dentry, const struct inode_operations configfs_symlink_inode_operations = { .get_link = configfs_get_link, - .readlink = generic_readlink, .setattr = configfs_setattr, }; diff --git a/fs/coredump.c b/fs/coredump.c index eb9c92c9b20f..ae6b05629ca1 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -38,7 +38,7 @@ #include <linux/path.h> #include <linux/timekeeping.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> #include <asm/exec.h> @@ -833,3 +833,21 @@ int dump_align(struct coredump_params *cprm, int align) return mod ? dump_skip(cprm, align - mod) : 1; } EXPORT_SYMBOL(dump_align); + +/* + * Ensures that file size is big enough to contain the current file + * position.
This prevents gdb from complaining about a truncated file + * if the last "write" to the file was dump_skip. + */ +void dump_truncate(struct coredump_params *cprm) +{ + struct file *file = cprm->file; + loff_t offset; + + if (file->f_op->llseek && file->f_op->llseek != no_llseek) { + offset = file->f_op->llseek(file, 0, SEEK_CUR); + if (i_size_read(file->f_mapping->host) < offset) + do_truncate(file->f_path.dentry, offset, 0, file); + } +} +EXPORT_SYMBOL(dump_truncate); diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 92348faf9865..f514978f6688 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -8,9 +8,7 @@ config FS_ENCRYPTION select CRYPTO_XTS select CRYPTO_CTS select CRYPTO_CTR - select CRYPTO_SHA256 select KEYS - select ENCRYPTED_KEYS help Enable encryption of files and directories. This feature is similar to ecryptfs, but it is more memory diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 98f87fe8f186..ac8e4f6a3773 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -27,7 +27,7 @@ #include <linux/bio.h> #include <linux/dcache.h> #include <linux/namei.h> -#include <linux/fscrypto.h> +#include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -63,7 +63,7 @@ void fscrypt_release_ctx(struct fscrypt_ctx *ctx) { unsigned long flags; - if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { + if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) { mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); ctx->w.bounce_page = NULL; } @@ -88,7 +88,7 @@ EXPORT_SYMBOL(fscrypt_release_ctx); * Return: An allocated and initialized encryption context on success; error * value or NULL otherwise. */ -struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) +struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) { struct fscrypt_ctx *ctx = NULL; struct fscrypt_info *ci = inode->i_crypt_info; @@ -121,7 +121,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) } else { ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; } - ctx->flags &= ~FS_WRITE_PATH_FL; + ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx; } EXPORT_SYMBOL(fscrypt_get_ctx); @@ -146,9 +146,10 @@ typedef enum { FS_ENCRYPT, } fscrypt_direction_t; -static int do_page_crypto(struct inode *inode, - fscrypt_direction_t rw, pgoff_t index, +static int do_page_crypto(const struct inode *inode, + fscrypt_direction_t rw, u64 lblk_num, struct page *src_page, struct page *dest_page, + unsigned int len, unsigned int offs, gfp_t gfp_flags) { struct { @@ -162,6 +163,8 @@ static int do_page_crypto(struct inode *inode, struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; + BUG_ON(len == 0); + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR @@ -175,14 +178,14 @@ static int do_page_crypto(struct inode *inode, page_crypt_complete, &ecr); BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); - xts_tweak.index = cpu_to_le64(index); + xts_tweak.index = cpu_to_le64(lblk_num); memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_SIZE, 0); + sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); + sg_set_page(&src, src_page, len, offs); + skcipher_request_set_crypt(req, &src, &dst, len, &xts_tweak); if (rw == FS_DECRYPT) res = 
crypto_skcipher_decrypt(req); else @@ -207,34 +210,66 @@ static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) return ERR_PTR(-ENOMEM); - ctx->flags |= FS_WRITE_PATH_FL; + ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx->w.bounce_page; } /** * fscypt_encrypt_page() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * @gfp_flags: The gfp flag for memory allocation + * @inode: The inode for which the encryption should take place + * @page: The page to encrypt. Must be locked for bounce-page + * encryption. + * @len: Length of data to encrypt in @page and encrypted + * data in returned page. + * @offs: Offset of data within @page and returned + * page holding encrypted data. + * @lblk_num: Logical block number. This must be unique for multiple + * calls with same inode, except when overwriting + * previously written data. + * @gfp_flags: The gfp flag for memory allocation * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. + * Encrypts @page using the ctx encryption context. Performs encryption + * either in-place or into a newly allocated bounce page. + * Called on the page write path. * - * Called on the page write path. The caller must call + * Bounce page allocation is the default. + * In this case, the contents of @page are encrypted and stored in an + * allocated bounce page. @page has to be locked and the caller must call * fscrypt_restore_control_page() on the returned ciphertext page to * release the bounce buffer and the encryption context. * - * Return: An allocated page with the encrypted content on success. Else, an + * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in + * fscrypt_operations. Here, the input-page is returned with its content + * encrypted. + * + * Return: A page with the encrypted content on success. Else, an * error value or NULL. */ -struct page *fscrypt_encrypt_page(struct inode *inode, - struct page *plaintext_page, gfp_t gfp_flags) +struct page *fscrypt_encrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) + { struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; + struct page *ciphertext_page = page; int err; - BUG_ON(!PageLocked(plaintext_page)); + BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0); + + if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) { + /* with inplace-encryption we just encrypt the page */ + err = do_page_crypto(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, + len, offs, gfp_flags); + if (err) + return ERR_PTR(err); + + return ciphertext_page; + } + + BUG_ON(!PageLocked(page)); ctx = fscrypt_get_ctx(inode, gfp_flags); if (IS_ERR(ctx)) @@ -245,10 +280,10 @@ struct page *fscrypt_encrypt_page(struct inode *inode, if (IS_ERR(ciphertext_page)) goto errout; - ctx->w.control_page = plaintext_page; - err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page, - gfp_flags); + ctx->w.control_page = page; + err = do_page_crypto(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, + len, offs, gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); goto errout; @@ -265,8 +300,13 @@ errout: EXPORT_SYMBOL(fscrypt_encrypt_page); /** - * f2crypt_decrypt_page() - Decrypts a page in-place - * @page: The page to decrypt. 
Must be locked. + * fscrypt_decrypt_page() - Decrypts a page in-place + * @inode: The corresponding inode for the page to decrypt. + * @page: The page to decrypt. Must be locked in case + * it is a writeback page (FS_CFLG_OWN_PAGES unset). + * @len: Number of bytes in @page to be decrypted. + * @offs: Start of data in @page. + * @lblk_num: Logical block number. * * Decrypts page in-place using the ctx encryption context. * @@ -274,16 +314,18 @@ EXPORT_SYMBOL(fscrypt_encrypt_page); * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_decrypt_page(struct page *page) +int fscrypt_decrypt_page(const struct inode *inode, struct page *page, + unsigned int len, unsigned int offs, u64 lblk_num) { - BUG_ON(!PageLocked(page)); + if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES)) + BUG_ON(!PageLocked(page)); - return do_page_crypto(page->mapping->host, - FS_DECRYPT, page->index, page, page, GFP_NOFS); + return do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, len, + offs, GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_page); -int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, +int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, sector_t pblk, unsigned int len) { struct fscrypt_ctx *ctx; @@ -306,7 +348,7 @@ int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, while (len--) { err = do_page_crypto(inode, FS_ENCRYPT, lblk, ZERO_PAGE(0), ciphertext_page, - GFP_NOFS); + PAGE_SIZE, 0, GFP_NOFS); if (err) goto errout; @@ -414,7 +456,8 @@ static void completion_pages(struct work_struct *work) bio_for_each_segment_all(bv, bio, i) { struct page *page = bv->bv_page; - int ret = fscrypt_decrypt_page(page); + int ret = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); if (ret) { WARN_ON_ONCE(1); @@ -482,17 +525,22 @@ static void fscrypt_destroy(void) /** * fscrypt_initialize() - allocate major buffers for fs encryption. + * @cop_flags: fscrypt operations flags * * We only call this when we start accessing encrypted files, since it * results in memory getting allocated that wouldn't otherwise be used. * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_initialize(void) +int fscrypt_initialize(unsigned int cop_flags) { int i, res = -ENOMEM; - if (fscrypt_bounce_page_pool) + /* + * No need to allocate a bounce page pool if there already is one or + * this FS won't use it. + */ + if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool) return 0; mutex_lock(&fscrypt_init_mutex); @@ -521,7 +569,6 @@ fail: mutex_unlock(&fscrypt_init_mutex); return res; } -EXPORT_SYMBOL(fscrypt_initialize); /** * fscrypt_init() - Set up for fs encryption. diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 9b774f4b50c8..56ad9d195f18 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -12,7 +12,7 @@ #include <linux/scatterlist.h> #include <linux/ratelimit.h> -#include <linux/fscrypto.h> +#include "fscrypt_private.h" /** * fname_crypt_complete() - completion callback for filename crypto @@ -209,7 +209,7 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) +u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen) { int padding = 32; struct fscrypt_info *ci = inode->i_crypt_info; @@ -227,7 +227,7 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. 
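fscrypt_encrypt_page(), reworked earlier in this hunk, now serves two calling conventions: filesystems that set FS_CFLG_OWN_PAGES get their page encrypted in place and returned as-is, while everyone else gets a bounce page back that must be released once the I/O is done. A hedged caller sketch built only from the signatures shown above (my_encrypt_for_write and the elided I/O step are invented; @page is assumed locked, as the bounce-page path requires):

#include <linux/err.h>
#include <linux/gfp.h>

static int my_encrypt_for_write(struct inode *inode, struct page *page)
{
	struct page *cipher;

	cipher = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0,
				      page->index, GFP_NOFS);
	if (IS_ERR(cipher))
		return PTR_ERR(cipher);

	/* ... submit the write I/O against "cipher" and wait ... */

	/*
	 * In-place users get the same page back; only a distinct bounce
	 * page carries a context that must be released.
	 */
	if (cipher != page)
		fscrypt_restore_control_page(cipher);
	return 0;
}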
*/ -int fscrypt_fname_alloc_buffer(struct inode *inode, +int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); @@ -350,7 +350,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; return 0; } - ret = get_crypt_info(dir); + ret = fscrypt_get_crypt_info(dir); if (ret && ret != -EOPNOTSUPP) return ret; diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h new file mode 100644 index 000000000000..aeab032d7d35 --- /dev/null +++ b/fs/crypto/fscrypt_private.h @@ -0,0 +1,93 @@ +/* + * fscrypt_private.h + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions. + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ + +#ifndef _FSCRYPT_PRIVATE_H +#define _FSCRYPT_PRIVATE_H + +#include <linux/fscrypto.h> + +#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 + +/* Encryption parameters */ +#define FS_XTS_TWEAK_SIZE 16 +#define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_256_GCM_KEY_SIZE 32 +#define FS_AES_256_CBC_KEY_SIZE 32 +#define FS_AES_256_CTS_KEY_SIZE 32 +#define FS_AES_256_XTS_KEY_SIZE 64 +#define FS_MAX_KEY_SIZE 64 + +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +} __packed; + +#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 + +/* This is passed in from userspace into the kernel keyring */ +struct fscrypt_key { + u32 mode; + u8 raw[FS_MAX_KEY_SIZE]; + u32 size; +} __packed; + +/* + * A pointer to this structure is stored in the file system's in-core + * representation of an inode. 
+ */ +struct fscrypt_info { + u8 ci_data_mode; + u8 ci_filename_mode; + u8 ci_flags; + struct crypto_skcipher *ci_ctfm; + struct key *ci_keyring_key; + u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; +}; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 + +struct fscrypt_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_FS_COMPLETION_RESULT(ecr) \ + struct fscrypt_completion_result ecr = { \ + COMPLETION_INITIALIZER((ecr).completion), 0 } + + +/* crypto.c */ +int fscrypt_initialize(unsigned int cop_flags); + +/* keyinfo.c */ +extern int fscrypt_get_crypt_info(struct inode *); + +#endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 67fb6d8876d0..95cd4c3b06c3 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -10,7 +10,7 @@ #include <keys/user-type.h> #include <linux/scatterlist.h> -#include <linux/fscrypto.h> +#include "fscrypt_private.h" static void derive_crypt_complete(struct crypto_async_request *req, int rc) { @@ -178,7 +178,7 @@ static void put_crypt_info(struct fscrypt_info *ci) kmem_cache_free(fscrypt_info_cachep, ci); } -int get_crypt_info(struct inode *inode) +int fscrypt_get_crypt_info(struct inode *inode) { struct fscrypt_info *crypt_info; struct fscrypt_context ctx; @@ -188,7 +188,7 @@ int get_crypt_info(struct inode *inode) u8 *raw_key = NULL; int res; - res = fscrypt_initialize(); + res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; @@ -248,7 +248,8 @@ retry: goto out; if (fscrypt_dummy_context_enabled(inode)) { - memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); + memset(raw_key, 0x42, keysize/2); + memset(raw_key+keysize/2, 0x24, keysize - (keysize/2)); goto got_key; } @@ -327,7 +328,7 @@ int fscrypt_get_encryption_info(struct inode *inode) (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | (1 << KEY_FLAG_REVOKED) | (1 << KEY_FLAG_DEAD))))) - return get_crypt_info(inode); + return fscrypt_get_crypt_info(inode); return 0; } EXPORT_SYMBOL(fscrypt_get_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 6865663aac69..d6cd7ea4851d 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -10,8 +10,8 @@ #include <linux/random.h> #include <linux/string.h> -#include <linux/fscrypto.h> #include <linux/mount.h> +#include "fscrypt_private.h" static int inode_has_encryption_context(struct inode *inode) { @@ -93,16 +93,19 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct file *filp, - const struct fscrypt_policy *policy) +int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int ret; + if (copy_from_user(&policy, arg, sizeof(policy))) + return -EFAULT; + if (!inode_owner_or_capable(inode)) return -EACCES; - if (policy->version != 0) + if (policy.version != 0) return -EINVAL; ret = mnt_want_write_file(filp); @@ -120,9 +123,9 @@ int fscrypt_process_policy(struct file *filp, ret = -ENOTEMPTY; else ret = create_encryption_context_from_policy(inode, - policy); + &policy); } else if (!is_encryption_context_consistent_with_policy(inode, - policy)) { + &policy)) { printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", __func__); @@ -134,11 +137,13 @@ int fscrypt_process_policy(struct file *filp, mnt_drop_write_file(filp); return ret; } 
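The policy rework above moves the copy_from_user()/copy_to_user() of struct fscrypt_policy out of each filesystem and into the shared helpers, so a filesystem's ioctl handler only forwards the raw user pointer. A minimal dispatch sketch (my_fs_ioctl is invented; the FS_IOC_* numbers are the real uapi ones, and the helper declarations live in linux/fscrypto.h in this series):

#include <linux/fs.h>
#include <linux/fscrypto.h>

long my_fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case FS_IOC_SET_ENCRYPTION_POLICY:
		return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
	case FS_IOC_GET_ENCRYPTION_POLICY:
		return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
	default:
		return -ENOTTY;
	}
}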
-EXPORT_SYMBOL(fscrypt_process_policy); +EXPORT_SYMBOL(fscrypt_ioctl_set_policy); -int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) +int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { + struct inode *inode = file_inode(filp); struct fscrypt_context ctx; + struct fscrypt_policy policy; int res; if (!inode->i_sb->s_cop->get_context || @@ -151,15 +156,18 @@ int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) return -EINVAL; - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + policy.version = 0; + policy.contents_encryption_mode = ctx.contents_encryption_mode; + policy.filenames_encryption_mode = ctx.filenames_encryption_mode; + policy.flags = ctx.flags; + memcpy(policy.master_key_descriptor, ctx.master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); + + if (copy_to_user(arg, &policy, sizeof(policy))) + return -EFAULT; return 0; } -EXPORT_SYMBOL(fscrypt_get_policy); +EXPORT_SYMBOL(fscrypt_ioctl_get_policy); int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { @@ -171,6 +179,11 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) BUG_ON(1); } + /* No restrictions on file types which are never encrypted */ + if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && + !S_ISLNK(child->i_mode)) + return 1; + /* no restrictions if the parent directory is not encrypted */ if (!parent->i_sb->s_cop->is_encrypted(parent)) return 1; @@ -31,28 +31,15 @@ #include <linux/vmstat.h> #include <linux/pfn_t.h> #include <linux/sizes.h> +#include <linux/mmu_notifier.h> #include <linux/iomap.h> #include "internal.h" -/* - * We use lowest available bit in exceptional entry for locking, other two - * bits to determine entry type. In total 3 special bits. - */ -#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 3) -#define RADIX_DAX_PTE (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1)) -#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2)) -#define RADIX_DAX_TYPE_MASK (RADIX_DAX_PTE | RADIX_DAX_PMD) -#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_TYPE_MASK) -#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) -#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ - RADIX_DAX_SHIFT | (pmd ? 
RADIX_DAX_PMD : RADIX_DAX_PTE) | \ - RADIX_TREE_EXCEPTIONAL_ENTRY)) - /* We choose 4096 entries - same as per-zone page wait tables */ #define DAX_WAIT_TABLE_BITS 12 #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS) -wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; +static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; static int __init init_dax_wait_table(void) { @@ -64,14 +51,6 @@ static int __init init_dax_wait_table(void) } fs_initcall(init_dax_wait_table); -static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, - pgoff_t index) -{ - unsigned long hash = hash_long((unsigned long)mapping ^ index, - DAX_WAIT_TABLE_BITS); - return wait_table + hash; -} - static long dax_map_atomic(struct block_device *bdev, struct blk_dax_ctl *dax) { struct request_queue *q = bdev->bd_queue; @@ -98,209 +77,52 @@ static void dax_unmap_atomic(struct block_device *bdev, blk_queue_exit(bdev->bd_queue); } -struct page *read_dax_sector(struct block_device *bdev, sector_t n) +static int dax_is_pmd_entry(void *entry) { - struct page *page = alloc_pages(GFP_KERNEL, 0); - struct blk_dax_ctl dax = { - .size = PAGE_SIZE, - .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), - }; - long rc; - - if (!page) - return ERR_PTR(-ENOMEM); - - rc = dax_map_atomic(bdev, &dax); - if (rc < 0) - return ERR_PTR(rc); - memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); - dax_unmap_atomic(bdev, &dax); - return page; + return (unsigned long)entry & RADIX_DAX_PMD; } -static bool buffer_written(struct buffer_head *bh) +static int dax_is_pte_entry(void *entry) { - return buffer_mapped(bh) && !buffer_unwritten(bh); + return !((unsigned long)entry & RADIX_DAX_PMD); } -/* - * When ext4 encounters a hole, it returns without modifying the buffer_head - * which means that we can't trust b_size. To cope with this, we set b_state - * to 0 before calling get_block and, if any bit is set, we know we can trust - * b_size. Unfortunate, really, since ext4 knows precisely how long a hole is - * and would save us time calling get_block repeatedly. 
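The DAX wait table above has 4096 queues (1 << DAX_WAIT_TABLE_BITS), selected by hashing the mapping pointer XOR-ed with the page index. A userspace sketch of that bucketing; the multiplicative constant is an assumption copied from the kernel's 64-bit hash_long()/hash_64() (linux/hash.h), and the mapping address is made up:

        /* Approximation of dax_entry_waitqueue()'s bucket selection. */
        #include <stdint.h>
        #include <stdio.h>

        #define DAX_WAIT_TABLE_BITS 12
        #define DAX_WAIT_TABLE_ENTRIES (1 << DAX_WAIT_TABLE_BITS)
        /* assumed from linux/hash.h; illustrative only */
        #define GOLDEN_RATIO_64 0x61C8864680B583EBull

        static uint64_t hash64(uint64_t val, unsigned int bits)
        {
                return (val * GOLDEN_RATIO_64) >> (64 - bits);
        }

        int main(void)
        {
                uint64_t mapping = 0xffff880123456780ull; /* made-up address */

                for (uint64_t index = 0; index < 4; index++)
                        printf("index %llu -> bucket %llu of %d\n",
                               (unsigned long long)index,
                               (unsigned long long)hash64(mapping ^ index,
                                                          DAX_WAIT_TABLE_BITS),
                               DAX_WAIT_TABLE_ENTRIES);
                return 0;
        }
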
- */ -static bool buffer_size_valid(struct buffer_head *bh) +static int dax_is_zero_entry(void *entry) { - return bh->b_state != 0; + return (unsigned long)entry & RADIX_DAX_HZP; } - -static sector_t to_sector(const struct buffer_head *bh, - const struct inode *inode) +static int dax_is_empty_entry(void *entry) { - sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9); - - return sector; + return (unsigned long)entry & RADIX_DAX_EMPTY; } -static ssize_t dax_io(struct inode *inode, struct iov_iter *iter, - loff_t start, loff_t end, get_block_t get_block, - struct buffer_head *bh) +struct page *read_dax_sector(struct block_device *bdev, sector_t n) { - loff_t pos = start, max = start, bh_max = start; - bool hole = false; - struct block_device *bdev = NULL; - int rw = iov_iter_rw(iter), rc; - long map_len = 0; + struct page *page = alloc_pages(GFP_KERNEL, 0); struct blk_dax_ctl dax = { - .addr = ERR_PTR(-EIO), + .size = PAGE_SIZE, + .sector = n & ~((((int) PAGE_SIZE) / 512) - 1), }; - unsigned blkbits = inode->i_blkbits; - sector_t file_blks = (i_size_read(inode) + (1 << blkbits) - 1) - >> blkbits; - - if (rw == READ) - end = min(end, i_size_read(inode)); - - while (pos < end) { - size_t len; - if (pos == max) { - long page = pos >> PAGE_SHIFT; - sector_t block = page << (PAGE_SHIFT - blkbits); - unsigned first = pos - (block << blkbits); - long size; - - if (pos == bh_max) { - bh->b_size = PAGE_ALIGN(end - pos); - bh->b_state = 0; - rc = get_block(inode, block, bh, rw == WRITE); - if (rc) - break; - if (!buffer_size_valid(bh)) - bh->b_size = 1 << blkbits; - bh_max = pos - first + bh->b_size; - bdev = bh->b_bdev; - /* - * We allow uninitialized buffers for writes - * beyond EOF as those cannot race with faults - */ - WARN_ON_ONCE( - (buffer_new(bh) && block < file_blks) || - (rw == WRITE && buffer_unwritten(bh))); - } else { - unsigned done = bh->b_size - - (bh_max - (pos - first)); - bh->b_blocknr += done >> blkbits; - bh->b_size -= done; - } - - hole = rw == READ && !buffer_written(bh); - if (hole) { - size = bh->b_size - first; - } else { - dax_unmap_atomic(bdev, &dax); - dax.sector = to_sector(bh, inode); - dax.size = bh->b_size; - map_len = dax_map_atomic(bdev, &dax); - if (map_len < 0) { - rc = map_len; - break; - } - dax.addr += first; - size = map_len - first; - } - /* - * pos + size is one past the last offset for IO, - * so pos + size can overflow loff_t at extreme offsets. - * Cast to u64 to catch this and get the true minimum. - */ - max = min_t(u64, pos + size, end); - } - - if (iov_iter_rw(iter) == WRITE) { - len = copy_from_iter_pmem(dax.addr, max - pos, iter); - } else if (!hole) - len = copy_to_iter((void __force *) dax.addr, max - pos, - iter); - else - len = iov_iter_zero(max - pos, iter); - - if (!len) { - rc = -EFAULT; - break; - } + long rc; - pos += len; - if (!IS_ERR(dax.addr)) - dax.addr += len; - } + if (!page) + return ERR_PTR(-ENOMEM); + rc = dax_map_atomic(bdev, &dax); + if (rc < 0) + return ERR_PTR(rc); + memcpy_from_pmem(page_address(page), dax.addr, PAGE_SIZE); dax_unmap_atomic(bdev, &dax); - - return (pos == start) ? 
rc : pos - start; -} - -/** - * dax_do_io - Perform I/O to a DAX file - * @iocb: The control block for this I/O - * @inode: The file which the I/O is directed at - * @iter: The addresses to do I/O from or to - * @get_block: The filesystem method used to translate file offsets to blocks - * @end_io: A filesystem callback for I/O completion - * @flags: See below - * - * This function uses the same locking scheme as do_blockdev_direct_IO: - * If @flags has DIO_LOCKING set, we assume that the i_mutex is held by the - * caller for writes. For reads, we take and release the i_mutex ourselves. - * If DIO_LOCKING is not set, the filesystem takes care of its own locking. - * As with do_blockdev_direct_IO(), we increment i_dio_count while the I/O - * is in progress. - */ -ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, - struct iov_iter *iter, get_block_t get_block, - dio_iodone_t end_io, int flags) -{ - struct buffer_head bh; - ssize_t retval = -EINVAL; - loff_t pos = iocb->ki_pos; - loff_t end = pos + iov_iter_count(iter); - - memset(&bh, 0, sizeof(bh)); - bh.b_bdev = inode->i_sb->s_bdev; - - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) - inode_lock(inode); - - /* Protects against truncate */ - if (!(flags & DIO_SKIP_DIO_COUNT)) - inode_dio_begin(inode); - - retval = dax_io(inode, iter, pos, end, get_block, &bh); - - if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) - inode_unlock(inode); - - if (end_io) { - int err; - - err = end_io(iocb, pos, retval, bh.b_private); - if (err) - retval = err; - } - - if (!(flags & DIO_SKIP_DIO_COUNT)) - inode_dio_end(inode); - return retval; + return page; } -EXPORT_SYMBOL_GPL(dax_do_io); /* * DAX radix tree locking */ struct exceptional_entry_key { struct address_space *mapping; - unsigned long index; + pgoff_t entry_start; }; struct wait_exceptional_entry_queue { @@ -308,6 +130,26 @@ struct wait_exceptional_entry_queue { struct exceptional_entry_key key; }; +static wait_queue_head_t *dax_entry_waitqueue(struct address_space *mapping, + pgoff_t index, void *entry, struct exceptional_entry_key *key) +{ + unsigned long hash; + + /* + * If 'entry' is a PMD, align the 'index' that we use for the wait + * queue to the start of that PMD. This ensures that all offsets in + * the range covered by the PMD map to the same bit lock. 
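Concretely, the alignment described just above (the hunk continues below) rounds a page index down to the first index its PMD covers. Assuming the usual x86-64 values of PAGE_SHIFT = 12 and PMD_SHIFT = 21, the mask clears nine low bits, so all 512 indexes sharing a 2MiB PMD land on one wait queue:

        /* Sketch of the PMD index alignment used for the DAX wait queue key. */
        #include <stdio.h>

        #define PAGE_SHIFT 12 /* 4K pages (arch-specific assumption) */
        #define PMD_SHIFT  21 /* 2M PMDs (arch-specific assumption) */

        int main(void)
        {
                unsigned long index = 0x12345;
                unsigned long aligned =
                        index & ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1);

                /* 0x12345 & ~0x1ff == 0x12200: first of the 512 indexes */
                printf("index 0x%lx -> PMD-aligned 0x%lx\n", index, aligned);
                return 0;
        }
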
+ */ + if (dax_is_pmd_entry(entry)) + index &= ~((1UL << (PMD_SHIFT - PAGE_SHIFT)) - 1); + + key->mapping = mapping; + key->entry_start = index; + + hash = hash_long((unsigned long)mapping ^ index, DAX_WAIT_TABLE_BITS); + return wait_table + hash; +} + static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, int sync, void *keyp) { @@ -316,7 +158,7 @@ static int wake_exceptional_entry_func(wait_queue_t *wait, unsigned int mode, container_of(wait, struct wait_exceptional_entry_queue, wait); if (key->mapping != ewait->key.mapping || - key->index != ewait->key.index) + key->entry_start != ewait->key.entry_start) return 0; return autoremove_wake_function(wait, mode, sync, NULL); } @@ -342,7 +184,7 @@ static inline void *lock_slot(struct address_space *mapping, void **slot) radix_tree_deref_slot_protected(slot, &mapping->tree_lock); entry |= RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(slot, (void *)entry); + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); return (void *)entry; } @@ -356,7 +198,7 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) radix_tree_deref_slot_protected(slot, &mapping->tree_lock); entry &= ~(unsigned long)RADIX_DAX_ENTRY_LOCK; - radix_tree_replace_slot(slot, (void *)entry); + radix_tree_replace_slot(&mapping->page_tree, slot, (void *)entry); return (void *)entry; } @@ -372,24 +214,24 @@ static inline void *unlock_slot(struct address_space *mapping, void **slot) static void *get_unlocked_mapping_entry(struct address_space *mapping, pgoff_t index, void ***slotp) { - void *ret, **slot; + void *entry, **slot; struct wait_exceptional_entry_queue ewait; - wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + wait_queue_head_t *wq; init_wait(&ewait.wait); ewait.wait.func = wake_exceptional_entry_func; - ewait.key.mapping = mapping; - ewait.key.index = index; for (;;) { - ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, + entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (!ret || !radix_tree_exceptional_entry(ret) || + if (!entry || !radix_tree_exceptional_entry(entry) || !slot_locked(mapping, slot)) { if (slotp) *slotp = slot; - return ret; + return entry; } + + wq = dax_entry_waitqueue(mapping, index, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); spin_unlock_irq(&mapping->tree_lock); @@ -399,52 +241,173 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, } } +static void dax_unlock_mapping_entry(struct address_space *mapping, + pgoff_t index) +{ + void *entry, **slot; + + spin_lock_irq(&mapping->tree_lock); + entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || + !slot_locked(mapping, slot))) { + spin_unlock_irq(&mapping->tree_lock); + return; + } + unlock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, entry, false); +} + +static void put_locked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) { + unlock_page(entry); + put_page(entry); + } else { + dax_unlock_mapping_entry(mapping, index); + } +} + +/* + * Called when we are done with radix tree entry we looked up via + * get_unlocked_mapping_entry() and which we didn't lock in the end. 
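lock_slot() and unlock_slot() above implement a bit lock stored in the radix tree slot value itself: OR the lock bit in to lock, mask it off to unlock, always under tree_lock. A userspace sketch of the tagging; RADIX_DAX_ENTRY_LOCK's real value lives in the dax headers, so the bit used here is a placeholder:

        /* Tagged-value bit lock in the style of lock_slot()/unlock_slot(). */
        #include <assert.h>
        #include <stdio.h>

        #define RADIX_DAX_ENTRY_LOCK 0x10ul /* placeholder bit for the sketch */

        static unsigned long lock_entry(unsigned long entry)
        {
                return entry | RADIX_DAX_ENTRY_LOCK;
        }

        static unsigned long unlock_entry(unsigned long entry)
        {
                return entry & ~RADIX_DAX_ENTRY_LOCK;
        }

        int main(void)
        {
                unsigned long entry = 0x2a00; /* pretend radix tree value */
                unsigned long locked = lock_entry(entry);

                assert(locked & RADIX_DAX_ENTRY_LOCK);
                assert(unlock_entry(locked) == entry);
                printf("entry %#lx locked as %#lx\n", entry, locked);
                return 0;
        }
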
+ */ +static void put_unlocked_mapping_entry(struct address_space *mapping, + pgoff_t index, void *entry) +{ + if (!radix_tree_exceptional_entry(entry)) + return; + + /* We have to wake up next waiter for the radix tree entry lock */ + dax_wake_mapping_entry_waiter(mapping, index, entry, false); +} + /* * Find radix tree entry at given index. If it points to a page, return with * the page locked. If it points to the exceptional entry, return with the * radix tree entry locked. If the radix tree doesn't contain given index, * create empty exceptional entry for the index and return with it locked. * + * When requesting an entry with size RADIX_DAX_PMD, grab_mapping_entry() will + * either return that locked entry or will return an error. This error will + * happen if there are any 4k entries (either zero pages or DAX entries) + * within the 2MiB range that we are requesting. + * + * We always favor 4k entries over 2MiB entries. There isn't a flow where we + * evict 4k entries in order to 'upgrade' them to a 2MiB entry. A 2MiB + * insertion will fail if it finds any 4k entries already in the tree, and a + * 4k insertion will cause an existing 2MiB entry to be unmapped and + * downgraded to 4k entries. This happens for both 2MiB huge zero pages as + * well as 2MiB empty entries. + * + * The exception to this downgrade path is for 2MiB DAX PMD entries that have + * real storage backing them. We will leave these real 2MiB DAX entries in + * the tree, and PTE writes will simply dirty the entire 2MiB DAX entry. + * * Note: Unlike filemap_fault() we don't honor FAULT_FLAG_RETRY flags. For * persistent memory the benefit is doubtful. We can add that later if we can * show it helps. */ -static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index) +static void *grab_mapping_entry(struct address_space *mapping, pgoff_t index, + unsigned long size_flag) { - void *ret, **slot; + bool pmd_downgrade = false; /* splitting 2MiB entry into 4k entries? */ + void *entry, **slot; restart: spin_lock_irq(&mapping->tree_lock); - ret = get_unlocked_mapping_entry(mapping, index, &slot); + entry = get_unlocked_mapping_entry(mapping, index, &slot); + + if (entry) { + if (size_flag & RADIX_DAX_PMD) { + if (!radix_tree_exceptional_entry(entry) || + dax_is_pte_entry(entry)) { + put_unlocked_mapping_entry(mapping, index, + entry); + entry = ERR_PTR(-EEXIST); + goto out_unlock; + } + } else { /* trying to grab a PTE entry */ + if (radix_tree_exceptional_entry(entry) && + dax_is_pmd_entry(entry) && + (dax_is_zero_entry(entry) || + dax_is_empty_entry(entry))) { + pmd_downgrade = true; + } + } + } + /* No entry for given index? Make sure radix tree is big enough. */ - if (!ret) { + if (!entry || pmd_downgrade) { int err; + if (pmd_downgrade) { + /* + * Make sure 'entry' remains valid while we drop + * mapping->tree_lock. + */ + entry = lock_slot(mapping, slot); + } + spin_unlock_irq(&mapping->tree_lock); + /* + * Besides huge zero pages the only other thing that gets + * downgraded are empty entries which don't need to be + * unmapped. 
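The policy in the long grab_mapping_entry() comment above reduces to a small decision table, sketched below. The flag names mirror the hunk, but this is a distillation for illustration, not the kernel function:

        /* Decision table distilled from grab_mapping_entry()'s PMD/PTE policy. */
        #include <stdbool.h>
        #include <stdio.h>

        enum action { LOCK_EXISTING, RETURN_EEXIST, DOWNGRADE_TO_PTES };

        struct entry { bool exceptional, pmd, zero_or_empty; };

        static enum action grab_policy(bool want_pmd, const struct entry *e)
        {
                if (want_pmd && (!e->exceptional || !e->pmd))
                        return RETURN_EEXIST;     /* 4k entries block a 2MiB grab */
                if (!want_pmd && e->exceptional && e->pmd && e->zero_or_empty)
                        return DOWNGRADE_TO_PTES; /* split HZP/empty PMD into 4k */
                return LOCK_EXISTING;             /* real PMDs stay; sizes match */
        }

        int main(void)
        {
                struct entry pte = { true, false, false };
                struct entry hzp = { true, true, true };

                printf("PMD grab over PTE entry -> %d (EEXIST)\n",
                       grab_policy(true, &pte));
                printf("PTE grab over huge zero page -> %d (downgrade)\n",
                       grab_policy(false, &hzp));
                return 0;
        }
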
+ */ + if (pmd_downgrade && dax_is_zero_entry(entry)) + unmap_mapping_range(mapping, + (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); + err = radix_tree_preload( mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); - if (err) + if (err) { + if (pmd_downgrade) + put_locked_mapping_entry(mapping, index, entry); return ERR_PTR(err); - ret = (void *)(RADIX_TREE_EXCEPTIONAL_ENTRY | - RADIX_DAX_ENTRY_LOCK); + } spin_lock_irq(&mapping->tree_lock); - err = radix_tree_insert(&mapping->page_tree, index, ret); + + if (pmd_downgrade) { + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + dax_wake_mapping_entry_waiter(mapping, index, entry, + true); + } + + entry = dax_radix_locked_entry(0, size_flag | RADIX_DAX_EMPTY); + + err = __radix_tree_insert(&mapping->page_tree, index, + dax_radix_order(entry), entry); radix_tree_preload_end(); if (err) { spin_unlock_irq(&mapping->tree_lock); - /* Someone already created the entry? */ - if (err == -EEXIST) + /* + * Someone already created the entry? This is a + * normal failure when inserting PMDs in a range + * that already contains PTEs. In that case we want + * to return -EEXIST immediately. + */ + if (err == -EEXIST && !(size_flag & RADIX_DAX_PMD)) goto restart; + /* + * Our insertion of a DAX PMD entry failed, most + * likely because it collided with a PTE sized entry + * at a different index in the PMD range. We haven't + * inserted anything into the radix tree and have no + * waiters to wake. + */ return ERR_PTR(err); } /* Good, we have inserted empty locked entry into the tree. */ mapping->nrexceptional++; spin_unlock_irq(&mapping->tree_lock); - return ret; + return entry; } /* Normal page in radix tree? */ - if (!radix_tree_exceptional_entry(ret)) { - struct page *page = ret; + if (!radix_tree_exceptional_entry(entry)) { + struct page *page = entry; get_page(page); spin_unlock_irq(&mapping->tree_lock); @@ -457,15 +420,26 @@ restart: } return page; } - ret = lock_slot(mapping, slot); + entry = lock_slot(mapping, slot); + out_unlock: spin_unlock_irq(&mapping->tree_lock); - return ret; + return entry; } +/* + * We do not necessarily hold the mapping->tree_lock when we call this + * function so it is possible that 'entry' is no longer a valid item in the + * radix tree. This is okay because all we really need to do is to find the + * correct waitqueue where tasks might be waiting for that old 'entry' and + * wake them. + */ void dax_wake_mapping_entry_waiter(struct address_space *mapping, - pgoff_t index, bool wake_all) + pgoff_t index, void *entry, bool wake_all) { - wait_queue_head_t *wq = dax_entry_waitqueue(mapping, index); + struct exceptional_entry_key key; + wait_queue_head_t *wq; + + wq = dax_entry_waitqueue(mapping, index, entry, &key); /* * Checking for locked entry and prepare_to_wait_exclusive() happens @@ -473,66 +447,41 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, * So at this point all tasks that could have seen our entry locked * must be in the waitqueue and the following check will see them. */ - if (waitqueue_active(wq)) { - struct exceptional_entry_key key; - - key.mapping = mapping; - key.index = index; + if (waitqueue_active(wq)) __wake_up(wq, TASK_NORMAL, wake_all ? 
0 : 1, &key); - } } -void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) +static int __dax_invalidate_mapping_entry(struct address_space *mapping, + pgoff_t index, bool trunc) { - void *ret, **slot; + int ret = 0; + void *entry; + struct radix_tree_root *page_tree = &mapping->page_tree; spin_lock_irq(&mapping->tree_lock); - ret = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (WARN_ON_ONCE(!ret || !radix_tree_exceptional_entry(ret) || - !slot_locked(mapping, slot))) { - spin_unlock_irq(&mapping->tree_lock); - return; - } - unlock_slot(mapping, slot); + entry = get_unlocked_mapping_entry(mapping, index, NULL); + if (!entry || !radix_tree_exceptional_entry(entry)) + goto out; + if (!trunc && + (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || + radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE))) + goto out; + radix_tree_delete(page_tree, index); + mapping->nrexceptional--; + ret = 1; +out: + put_unlocked_mapping_entry(mapping, index, entry); spin_unlock_irq(&mapping->tree_lock); - dax_wake_mapping_entry_waiter(mapping, index, false); -} - -static void put_locked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) -{ - if (!radix_tree_exceptional_entry(entry)) { - unlock_page(entry); - put_page(entry); - } else { - dax_unlock_mapping_entry(mapping, index); - } -} - -/* - * Called when we are done with radix tree entry we looked up via - * get_unlocked_mapping_entry() and which we didn't lock in the end. - */ -static void put_unlocked_mapping_entry(struct address_space *mapping, - pgoff_t index, void *entry) -{ - if (!radix_tree_exceptional_entry(entry)) - return; - - /* We have to wake up next waiter for the radix tree entry lock */ - dax_wake_mapping_entry_waiter(mapping, index, false); + return ret; } - /* * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree * entry to get unlocked before deleting it. */ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) { - void *entry; + int ret = __dax_invalidate_mapping_entry(mapping, index, true); - spin_lock_irq(&mapping->tree_lock); - entry = get_unlocked_mapping_entry(mapping, index, NULL); /* * This gets called from truncate / punch_hole path. As such, the caller * must hold locks protecting against concurrent modifications of the @@ -540,16 +489,46 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) * caller has seen exceptional entry for this index, we better find it * at that index as well... */ - if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry))) { - spin_unlock_irq(&mapping->tree_lock); - return 0; - } - radix_tree_delete(&mapping->page_tree, index); + WARN_ON_ONCE(!ret); + return ret; +} + +/* + * Invalidate exceptional DAX entry if easily possible. This handles DAX + * entries for invalidate_inode_pages() so we evict the entry only if we can + * do so without blocking. 
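__dax_invalidate_mapping_entry() above evicts an entry only when 'trunc' forces the issue or when neither writeback tag is set, so invalidation cannot drop DAX state that still needs flushing. The predicate on its own, as a sketch:

        /* The "evict only if clean" test from __dax_invalidate_mapping_entry(). */
        #include <stdbool.h>
        #include <stdio.h>

        struct tags { bool dirty, towrite; };

        static bool may_invalidate(bool trunc, const struct tags *t)
        {
                /* truncate/punch-hole always wins; otherwise skip dirty entries */
                return trunc || (!t->dirty && !t->towrite);
        }

        int main(void)
        {
                struct tags dirty = { .dirty = true };
                struct tags clean = { 0 };

                printf("clean, no trunc: %d\n", may_invalidate(false, &clean));
                printf("dirty, no trunc: %d\n", may_invalidate(false, &dirty));
                printf("dirty, trunc:    %d\n", may_invalidate(true, &dirty));
                return 0;
        }
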
+ */ +int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index) +{ + int ret = 0; + void *entry, **slot; + struct radix_tree_root *page_tree = &mapping->page_tree; + + spin_lock_irq(&mapping->tree_lock); + entry = __radix_tree_lookup(page_tree, index, NULL, &slot); + if (!entry || !radix_tree_exceptional_entry(entry) || + slot_locked(mapping, slot)) + goto out; + if (radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_DIRTY) || + radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + goto out; + radix_tree_delete(page_tree, index); mapping->nrexceptional--; + ret = 1; +out: spin_unlock_irq(&mapping->tree_lock); - dax_wake_mapping_entry_waiter(mapping, index, true); + if (ret) + dax_wake_mapping_entry_waiter(mapping, index, entry, true); + return ret; +} - return 1; +/* + * Invalidate exceptional DAX entry if it is clean. + */ +int dax_invalidate_mapping_entry_sync(struct address_space *mapping, + pgoff_t index) +{ + return __dax_invalidate_mapping_entry(mapping, index, false); } /* @@ -560,26 +539,34 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) * otherwise it will simply fall out of the page cache under memory * pressure without ever having been dirtied. */ -static int dax_load_hole(struct address_space *mapping, void *entry, +static int dax_load_hole(struct address_space *mapping, void **entry, struct vm_fault *vmf) { struct page *page; + int ret; /* Hole page already exists? Return it... */ - if (!radix_tree_exceptional_entry(entry)) { - vmf->page = entry; - return VM_FAULT_LOCKED; + if (!radix_tree_exceptional_entry(*entry)) { + page = *entry; + goto out; } /* This will replace locked radix tree entry with a hole page */ page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask | __GFP_ZERO); - if (!page) { - put_locked_mapping_entry(mapping, vmf->pgoff, entry); + if (!page) return VM_FAULT_OOM; - } + out: vmf->page = page; - return VM_FAULT_LOCKED; + ret = finish_fault(vmf); + vmf->page = NULL; + *entry = page; + if (!ret) { + /* Grab reference for PTE that is now referencing the page */ + get_page(page); + return VM_FAULT_NOPAGE; + } + return ret; } static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size, @@ -600,11 +587,17 @@ static int copy_user_dax(struct block_device *bdev, sector_t sector, size_t size return 0; } -#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_SHIFT)) - +/* + * By this point grab_mapping_entry() has ensured that we have a locked entry + * of the appropriate size so we don't have to worry about downgrading PMDs to + * PTEs. If we happen to be trying to insert a PTE and there is a PMD + * already in the tree, we will skip the insertion and just dirty the PMD as + * appropriate. 
+ */ static void *dax_insert_mapping_entry(struct address_space *mapping, struct vm_fault *vmf, - void *entry, sector_t sector) + void *entry, sector_t sector, + unsigned long flags) { struct radix_tree_root *page_tree = &mapping->page_tree; int error = 0; @@ -627,28 +620,43 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, error = radix_tree_preload(vmf->gfp_mask & ~__GFP_HIGHMEM); if (error) return ERR_PTR(error); + } else if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_HZP)) { + /* replacing huge zero page with PMD block mapping */ + unmap_mapping_range(mapping, + (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); } spin_lock_irq(&mapping->tree_lock); - new_entry = (void *)((unsigned long)RADIX_DAX_ENTRY(sector, false) | - RADIX_DAX_ENTRY_LOCK); + new_entry = dax_radix_locked_entry(sector, flags); + if (hole_fill) { __delete_from_page_cache(entry, NULL); /* Drop pagecache reference */ put_page(entry); - error = radix_tree_insert(page_tree, index, new_entry); + error = __radix_tree_insert(page_tree, index, + dax_radix_order(new_entry), new_entry); if (error) { new_entry = ERR_PTR(error); goto unlock; } mapping->nrexceptional++; - } else { + } else if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) { + /* + * Only swap our new entry into the radix tree if the current + * entry is a zero page or an empty entry. If a normal PTE or + * PMD entry is already in the tree, we leave it alone. This + * means that if we are trying to insert a PTE and the + * existing entry is a PMD, we will just leave the PMD in the + * tree and dirty it if necessary. + */ + struct radix_tree_node *node; void **slot; void *ret; - ret = __radix_tree_lookup(page_tree, index, NULL, &slot); + ret = __radix_tree_lookup(page_tree, index, &node, &slot); WARN_ON_ONCE(ret != entry); - radix_tree_replace_slot(slot, new_entry); + __radix_tree_replace(page_tree, node, slot, + new_entry, NULL, NULL); } if (vmf->flags & FAULT_FLAG_WRITE) radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); @@ -668,63 +676,171 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, return new_entry; } +static inline unsigned long +pgoff_address(pgoff_t pgoff, struct vm_area_struct *vma) +{ + unsigned long address; + + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma); + return address; +} + +/* Walk all mappings of a given index of a file and writeprotect them */ +static void dax_mapping_entry_mkclean(struct address_space *mapping, + pgoff_t index, unsigned long pfn) +{ + struct vm_area_struct *vma; + pte_t pte, *ptep = NULL; + pmd_t *pmdp = NULL; + spinlock_t *ptl; + bool changed; + + i_mmap_lock_read(mapping); + vma_interval_tree_foreach(vma, &mapping->i_mmap, index, index) { + unsigned long address; + + cond_resched(); + + if (!(vma->vm_flags & VM_SHARED)) + continue; + + address = pgoff_address(index, vma); + changed = false; + if (follow_pte_pmd(vma->vm_mm, address, &ptep, &pmdp, &ptl)) + continue; + + if (pmdp) { +#ifdef CONFIG_FS_DAX_PMD + pmd_t pmd; + + if (pfn != pmd_pfn(*pmdp)) + goto unlock_pmd; + if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp)) + goto unlock_pmd; + + flush_cache_page(vma, address, pfn); + pmd = pmdp_huge_clear_flush(vma, address, pmdp); + pmd = pmd_wrprotect(pmd); + pmd = pmd_mkclean(pmd); + set_pmd_at(vma->vm_mm, address, pmdp, pmd); + changed = true; +unlock_pmd: + spin_unlock(ptl); +#endif + } else { + if (pfn != pte_pfn(*ptep)) + goto unlock_pte; + if (!pte_dirty(*ptep) && 
!pte_write(*ptep)) + goto unlock_pte; + + flush_cache_page(vma, address, pfn); + pte = ptep_clear_flush(vma, address, ptep); + pte = pte_wrprotect(pte); + pte = pte_mkclean(pte); + set_pte_at(vma->vm_mm, address, ptep, pte); + changed = true; +unlock_pte: + pte_unmap_unlock(ptep, ptl); + } + + if (changed) + mmu_notifier_invalidate_page(vma->vm_mm, address); + } + i_mmap_unlock_read(mapping); +} + static int dax_writeback_one(struct block_device *bdev, struct address_space *mapping, pgoff_t index, void *entry) { struct radix_tree_root *page_tree = &mapping->page_tree; - int type = RADIX_DAX_TYPE(entry); - struct radix_tree_node *node; struct blk_dax_ctl dax; - void **slot; + void *entry2, **slot; int ret = 0; - spin_lock_irq(&mapping->tree_lock); /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. + * A page got tagged dirty in DAX mapping? Something is seriously + * wrong. */ - if (!__radix_tree_lookup(page_tree, index, &node, &slot)) - goto unlock; - if (*slot != entry) - goto unlock; - - /* another fsync thread may have already written back this entry */ - if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) - goto unlock; + if (WARN_ON(!radix_tree_exceptional_entry(entry))) + return -EIO; - if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { + spin_lock_irq(&mapping->tree_lock); + entry2 = get_unlocked_mapping_entry(mapping, index, &slot); + /* Entry got punched out / reallocated? */ + if (!entry2 || !radix_tree_exceptional_entry(entry2)) + goto put_unlocked; + /* + * Entry got reallocated elsewhere? No need to writeback. We have to + * compare sectors as we must not bail out due to difference in lockbit + * or entry type. + */ + if (dax_radix_sector(entry2) != dax_radix_sector(entry)) + goto put_unlocked; + if (WARN_ON_ONCE(dax_is_empty_entry(entry) || + dax_is_zero_entry(entry))) { ret = -EIO; - goto unlock; + goto put_unlocked; } - dax.sector = RADIX_DAX_SECTOR(entry); - dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); + /* Another fsync thread may have already written back this entry */ + if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + goto put_unlocked; + /* Lock the entry to serialize with page faults */ + entry = lock_slot(mapping, slot); + /* + * We can clear the tag now but we have to be careful so that concurrent + * dax_writeback_one() calls for the same index cannot finish before we + * actually flush the caches. This is achieved as the calls will look + * at the entry only under tree_lock and once they do that they will + * see the entry locked and wait for it to unlock. + */ + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); spin_unlock_irq(&mapping->tree_lock); /* + * Even if dax_writeback_mapping_range() was given a wbc->range_start + * in the middle of a PMD, the 'index' we are given will be aligned to + * the start index of the PMD, as will the sector we pull from + * 'entry'. This allows us to flush for PMD_SIZE and not have to + * worry about partial PMD writebacks. + */ + dax.sector = dax_radix_sector(entry); + dax.size = PAGE_SIZE << dax_radix_order(entry); + + /* * We cannot hold tree_lock while calling dax_map_atomic() because it * eventually calls cond_resched(). 
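dax_writeback_one() above now sizes the flush from the entry itself: dax.size = PAGE_SIZE << dax_radix_order(entry). Assuming dax_radix_order() returns 0 for PTE entries and PMD_SHIFT - PAGE_SHIFT for PMD entries (its definition sits in the dax header, not in this hunk), the two cases work out to 4KiB and 2MiB:

        /* Flush sizes implied by dax.size = PAGE_SIZE << dax_radix_order(). */
        #include <stdio.h>

        #define PAGE_SHIFT 12
        #define PMD_SHIFT  21
        #define PAGE_SIZE  (1UL << PAGE_SHIFT)

        int main(void)
        {
                unsigned int pte_order = 0;
                unsigned int pmd_order = PMD_SHIFT - PAGE_SHIFT; /* 9, assumed */

                printf("PTE entry flush: %lu bytes\n", PAGE_SIZE << pte_order);
                printf("PMD entry flush: %lu bytes\n", PAGE_SIZE << pmd_order);
                return 0;
        }
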
*/ ret = dax_map_atomic(bdev, &dax); - if (ret < 0) + if (ret < 0) { + put_locked_mapping_entry(mapping, index, entry); return ret; + } if (WARN_ON_ONCE(ret < dax.size)) { ret = -EIO; goto unmap; } + dax_mapping_entry_mkclean(mapping, index, pfn_t_to_pfn(dax.pfn)); wb_cache_pmem(dax.addr, dax.size); - + /* + * After we have flushed the cache, we can clear the dirty tag. There + * cannot be new dirty data in the pfn after the flush has completed as + * the pfn mappings are writeprotected and fault waits for mapping + * entry lock. + */ spin_lock_irq(&mapping->tree_lock); - radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_DIRTY); spin_unlock_irq(&mapping->tree_lock); unmap: dax_unmap_atomic(bdev, &dax); + put_locked_mapping_entry(mapping, index, entry); return ret; - unlock: + put_unlocked: + put_unlocked_mapping_entry(mapping, index, entry2); spin_unlock_irq(&mapping->tree_lock); return ret; } @@ -738,12 +854,11 @@ int dax_writeback_mapping_range(struct address_space *mapping, struct block_device *bdev, struct writeback_control *wbc) { struct inode *inode = mapping->host; - pgoff_t start_index, end_index, pmd_index; + pgoff_t start_index, end_index; pgoff_t indices[PAGEVEC_SIZE]; struct pagevec pvec; bool done = false; int i, ret = 0; - void *entry; if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; @@ -753,15 +868,6 @@ int dax_writeback_mapping_range(struct address_space *mapping, start_index = wbc->range_start >> PAGE_SHIFT; end_index = wbc->range_end >> PAGE_SHIFT; - pmd_index = DAX_PMD_INDEX(start_index); - - rcu_read_lock(); - entry = radix_tree_lookup(&mapping->page_tree, pmd_index); - rcu_read_unlock(); - - /* see if the start of our range is covered by a PMD entry */ - if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) - start_index = pmd_index; tag_pages_for_writeback(mapping, start_index, end_index); @@ -794,7 +900,7 @@ static int dax_insert_mapping(struct address_space *mapping, struct block_device *bdev, sector_t sector, size_t size, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; struct blk_dax_ctl dax = { .sector = sector, .size = size, @@ -806,7 +912,7 @@ static int dax_insert_mapping(struct address_space *mapping, return PTR_ERR(dax.addr); dax_unmap_atomic(bdev, &dax); - ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector); + ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector, 0); if (IS_ERR(ret)) return PTR_ERR(ret); *entryp = ret; @@ -815,323 +921,6 @@ static int dax_insert_mapping(struct address_space *mapping, } /** - * dax_fault - handle a page fault on a DAX file - * @vma: The virtual memory area where the fault occurred - * @vmf: The description of the fault - * @get_block: The filesystem method used to translate file offsets to blocks - * - * When a page fault occurs, filesystems may call this helper in their - * fault handler for DAX files. dax_fault() assumes the caller has done all - * the necessary locking for the page fault to proceed successfully. 
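The pgoff_address() helper added earlier in this hunk recovers the user virtual address for a file page inside a VMA; dax_mapping_entry_mkclean() uses it to locate the PTE to write-protect. Worked arithmetic with hypothetical numbers:

        /* pgoff -> user address arithmetic from pgoff_address(). */
        #include <stdio.h>

        #define PAGE_SHIFT 12

        int main(void)
        {
                unsigned long vm_start = 0x7f0000000000ul; /* hypothetical VMA */
                unsigned long vm_pgoff = 0x100; /* VMA begins at file page 0x100 */
                unsigned long pgoff = 0x123;    /* page being cleaned */
                unsigned long address =
                        vm_start + ((pgoff - vm_pgoff) << PAGE_SHIFT);

                /* 0x23 pages past vm_start -> 0x7f0000023000 */
                printf("pgoff %#lx maps at %#lx\n", pgoff, address);
                return 0;
        }
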
- */ -int dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, - get_block_t get_block) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - void *entry; - struct buffer_head bh; - unsigned long vaddr = (unsigned long)vmf->virtual_address; - unsigned blkbits = inode->i_blkbits; - sector_t block; - pgoff_t size; - int error; - int major = 0; - - /* - * Check whether offset isn't beyond end of file now. Caller is supposed - * to hold locks serializing us with truncate / punch hole so this is - * a reliable test. - */ - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (vmf->pgoff >= size) - return VM_FAULT_SIGBUS; - - memset(&bh, 0, sizeof(bh)); - block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); - bh.b_bdev = inode->i_sb->s_bdev; - bh.b_size = PAGE_SIZE; - - entry = grab_mapping_entry(mapping, vmf->pgoff); - if (IS_ERR(entry)) { - error = PTR_ERR(entry); - goto out; - } - - error = get_block(inode, block, &bh, 0); - if (!error && (bh.b_size < PAGE_SIZE)) - error = -EIO; /* fs corruption? */ - if (error) - goto unlock_entry; - - if (vmf->cow_page) { - struct page *new_page = vmf->cow_page; - if (buffer_written(&bh)) - error = copy_user_dax(bh.b_bdev, to_sector(&bh, inode), - bh.b_size, new_page, vaddr); - else - clear_user_highpage(new_page, vaddr); - if (error) - goto unlock_entry; - if (!radix_tree_exceptional_entry(entry)) { - vmf->page = entry; - return VM_FAULT_LOCKED; - } - vmf->entry = entry; - return VM_FAULT_DAX_LOCKED; - } - - if (!buffer_mapped(&bh)) { - if (vmf->flags & FAULT_FLAG_WRITE) { - error = get_block(inode, block, &bh, 1); - count_vm_event(PGMAJFAULT); - mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); - major = VM_FAULT_MAJOR; - if (!error && (bh.b_size < PAGE_SIZE)) - error = -EIO; - if (error) - goto unlock_entry; - } else { - return dax_load_hole(mapping, entry, vmf); - } - } - - /* Filesystem should not return unwritten buffers to us! */ - WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); - error = dax_insert_mapping(mapping, bh.b_bdev, to_sector(&bh, inode), - bh.b_size, &entry, vma, vmf); - unlock_entry: - put_locked_mapping_entry(mapping, vmf->pgoff, entry); - out: - if (error == -ENOMEM) - return VM_FAULT_OOM | major; - /* -EBUSY is fine, somebody else faulted on the same PTE */ - if ((error < 0) && (error != -EBUSY)) - return VM_FAULT_SIGBUS | major; - return VM_FAULT_NOPAGE | major; -} -EXPORT_SYMBOL_GPL(dax_fault); - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) -/* - * The 'colour' (ie low bits) within a PMD of a page offset. This comes up - * more often than one might expect in the below function. 
- */ -#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) - -static void __dax_dbg(struct buffer_head *bh, unsigned long address, - const char *reason, const char *fn) -{ - if (bh) { - char bname[BDEVNAME_SIZE]; - bdevname(bh->b_bdev, bname); - pr_debug("%s: %s addr: %lx dev %s state %lx start %lld " - "length %zd fallback: %s\n", fn, current->comm, - address, bname, bh->b_state, (u64)bh->b_blocknr, - bh->b_size, reason); - } else { - pr_debug("%s: %s addr: %lx fallback: %s\n", fn, - current->comm, address, reason); - } -} - -#define dax_pmd_dbg(bh, address, reason) __dax_dbg(bh, address, reason, "dax_pmd") - -/** - * dax_pmd_fault - handle a PMD fault on a DAX file - * @vma: The virtual memory area where the fault occurred - * @vmf: The description of the fault - * @get_block: The filesystem method used to translate file offsets to blocks - * - * When a page fault occurs, filesystems may call this helper in their - * pmd_fault handler for DAX files. - */ -int dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmd, unsigned int flags, get_block_t get_block) -{ - struct file *file = vma->vm_file; - struct address_space *mapping = file->f_mapping; - struct inode *inode = mapping->host; - struct buffer_head bh; - unsigned blkbits = inode->i_blkbits; - unsigned long pmd_addr = address & PMD_MASK; - bool write = flags & FAULT_FLAG_WRITE; - struct block_device *bdev; - pgoff_t size, pgoff; - sector_t block; - int result = 0; - bool alloc = false; - - /* dax pmd mappings require pfn_t_devmap() */ - if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) - return VM_FAULT_FALLBACK; - - /* Fall back to PTEs if we're going to COW */ - if (write && !(vma->vm_flags & VM_SHARED)) { - split_huge_pmd(vma, pmd, address); - dax_pmd_dbg(NULL, address, "cow write"); - return VM_FAULT_FALLBACK; - } - /* If the PMD would extend outside the VMA */ - if (pmd_addr < vma->vm_start) { - dax_pmd_dbg(NULL, address, "vma start unaligned"); - return VM_FAULT_FALLBACK; - } - if ((pmd_addr + PMD_SIZE) > vma->vm_end) { - dax_pmd_dbg(NULL, address, "vma end unaligned"); - return VM_FAULT_FALLBACK; - } - - pgoff = linear_page_index(vma, pmd_addr); - size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (pgoff >= size) - return VM_FAULT_SIGBUS; - /* If the PMD would cover blocks out of the file */ - if ((pgoff | PG_PMD_COLOUR) >= size) { - dax_pmd_dbg(NULL, address, - "offset + huge page size > file size"); - return VM_FAULT_FALLBACK; - } - - memset(&bh, 0, sizeof(bh)); - bh.b_bdev = inode->i_sb->s_bdev; - block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); - - bh.b_size = PMD_SIZE; - - if (get_block(inode, block, &bh, 0) != 0) - return VM_FAULT_SIGBUS; - - if (!buffer_mapped(&bh) && write) { - if (get_block(inode, block, &bh, 1) != 0) - return VM_FAULT_SIGBUS; - alloc = true; - WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); - } - - bdev = bh.b_bdev; - - /* - * If the filesystem isn't willing to tell us the length of a hole, - * just fall back to PTEs. Calling get_block 512 times in a loop - * would be silly. 
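PG_PMD_COLOUR above evaluates to 511 with 4K pages and 2M PMDs: the low bits of a page offset within its PMD. The fault path uses it twice, as an alignment test on the pfn and, OR-ed into pgoff, as the last file index the PMD would map. Both tests in a sketch (values made up):

        /* The two PG_PMD_COLOUR tests from the DAX PMD fault path. */
        #include <stdbool.h>
        #include <stdio.h>

        #define PAGE_SHIFT 12
        #define PMD_SIZE   (1UL << 21)
        #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) /* 511 */

        int main(void)
        {
                unsigned long pfn = 0x10200; /* 512-aligned: usable for a PMD */
                unsigned long pgoff = 0x200; /* first index the PMD covers */
                unsigned long size = 0x380;  /* file size in pages */

                bool pfn_ok = !(pfn & PG_PMD_COLOUR);
                /* pgoff | 511 is the last index the PMD maps; must stay in file */
                bool fits = (pgoff | PG_PMD_COLOUR) < size;

                /* here 0x3ff >= 0x380, so the PMD would pass EOF: fall back */
                printf("pfn aligned: %d, PMD within file: %d\n", pfn_ok, fits);
                return 0;
        }
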
- */ - if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { - dax_pmd_dbg(&bh, address, "allocated block too small"); - return VM_FAULT_FALLBACK; - } - - /* - * If we allocated new storage, make sure no process has any - * zero pages covering this hole - */ - if (alloc) { - loff_t lstart = pgoff << PAGE_SHIFT; - loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ - - truncate_pagecache_range(inode, lstart, lend); - } - - if (!write && !buffer_mapped(&bh)) { - spinlock_t *ptl; - pmd_t entry; - struct page *zero_page = mm_get_huge_zero_page(vma->vm_mm); - - if (unlikely(!zero_page)) { - dax_pmd_dbg(&bh, address, "no zero page"); - goto fallback; - } - - ptl = pmd_lock(vma->vm_mm, pmd); - if (!pmd_none(*pmd)) { - spin_unlock(ptl); - dax_pmd_dbg(&bh, address, "pmd already present"); - goto fallback; - } - - dev_dbg(part_to_dev(bdev->bd_part), - "%s: %s addr: %lx pfn: <zero> sect: %llx\n", - __func__, current->comm, address, - (unsigned long long) to_sector(&bh, inode)); - - entry = mk_pmd(zero_page, vma->vm_page_prot); - entry = pmd_mkhuge(entry); - set_pmd_at(vma->vm_mm, pmd_addr, pmd, entry); - result = VM_FAULT_NOPAGE; - spin_unlock(ptl); - } else { - struct blk_dax_ctl dax = { - .sector = to_sector(&bh, inode), - .size = PMD_SIZE, - }; - long length = dax_map_atomic(bdev, &dax); - - if (length < 0) { - dax_pmd_dbg(&bh, address, "dax-error fallback"); - goto fallback; - } - if (length < PMD_SIZE) { - dax_pmd_dbg(&bh, address, "dax-length too small"); - dax_unmap_atomic(bdev, &dax); - goto fallback; - } - if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) { - dax_pmd_dbg(&bh, address, "pfn unaligned"); - dax_unmap_atomic(bdev, &dax); - goto fallback; - } - - if (!pfn_t_devmap(dax.pfn)) { - dax_unmap_atomic(bdev, &dax); - dax_pmd_dbg(&bh, address, "pfn not in memmap"); - goto fallback; - } - dax_unmap_atomic(bdev, &dax); - - /* - * For PTE faults we insert a radix tree entry for reads, and - * leave it clean. Then on the first write we dirty the radix - * tree entry via the dax_pfn_mkwrite() path. This sequence - * allows the dax_pfn_mkwrite() call to be simpler and avoid a - * call into get_block() to translate the pgoff to a sector in - * order to be able to create a new radix tree entry. - * - * The PMD path doesn't have an equivalent to - * dax_pfn_mkwrite(), though, so for a read followed by a - * write we traverse all the way through dax_pmd_fault() - * twice. This means we can just skip inserting a radix tree - * entry completely on the initial read and just wait until - * the write to insert a dirty entry. - */ - if (write) { - /* - * We should insert radix-tree entry and dirty it here. - * For now this is broken... 
- */ - } - - dev_dbg(part_to_dev(bdev->bd_part), - "%s: %s addr: %lx pfn: %lx sect: %llx\n", - __func__, current->comm, address, - pfn_t_to_pfn(dax.pfn), - (unsigned long long) dax.sector); - result |= vmf_insert_pfn_pmd(vma, address, pmd, - dax.pfn, write); - } - - out: - return result; - - fallback: - count_vm_event(THP_FAULT_FALLBACK); - result = VM_FAULT_FALLBACK; - goto out; -} -EXPORT_SYMBOL_GPL(dax_pmd_fault); -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ - -/** * dax_pfn_mkwrite - handle first write to DAX page * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault @@ -1140,17 +929,27 @@ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { struct file *file = vma->vm_file; struct address_space *mapping = file->f_mapping; - void *entry; + void *entry, **slot; pgoff_t index = vmf->pgoff; spin_lock_irq(&mapping->tree_lock); - entry = get_unlocked_mapping_entry(mapping, index, NULL); - if (!entry || !radix_tree_exceptional_entry(entry)) - goto out; + entry = get_unlocked_mapping_entry(mapping, index, &slot); + if (!entry || !radix_tree_exceptional_entry(entry)) { + if (entry) + put_unlocked_mapping_entry(mapping, index, entry); + spin_unlock_irq(&mapping->tree_lock); + return VM_FAULT_NOPAGE; + } radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY); - put_unlocked_mapping_entry(mapping, index, entry); -out: + entry = lock_slot(mapping, slot); spin_unlock_irq(&mapping->tree_lock); + /* + * If we race with somebody updating the PTE and finish_mkwrite_fault() + * fails, we don't care. We need to return VM_FAULT_NOPAGE and retry + * the fault in either case. + */ + finish_mkwrite_fault(vmf); + put_locked_mapping_entry(mapping, index, entry); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -1191,62 +990,13 @@ int __dax_zero_page_range(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(__dax_zero_page_range); -/** - * dax_zero_page_range - zero a range within a page of a DAX file - * @inode: The file being truncated - * @from: The file offset that is being truncated to - * @length: The number of bytes to zero - * @get_block: The filesystem method used to translate file offsets to blocks - * - * This function can be called by a filesystem when it is zeroing part of a - * page in a DAX file. This is intended for hole-punch operations. If - * you are truncating a file, the helper function dax_truncate_page() may be - * more convenient. - */ -int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, - get_block_t get_block) -{ - struct buffer_head bh; - pgoff_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE-1); - int err; - - /* Block boundary? Nothing to do */ - if (!length) - return 0; - BUG_ON((offset + length) > PAGE_SIZE); - - memset(&bh, 0, sizeof(bh)); - bh.b_bdev = inode->i_sb->s_bdev; - bh.b_size = PAGE_SIZE; - err = get_block(inode, index, &bh, 0); - if (err < 0 || !buffer_written(&bh)) - return err; - - return __dax_zero_page_range(bh.b_bdev, to_sector(&bh, inode), - offset, length); -} -EXPORT_SYMBOL_GPL(dax_zero_page_range); - -/** - * dax_truncate_page - handle a partial page being truncated in a DAX file - * @inode: The file being truncated - * @from: The file offset that is being truncated to - * @get_block: The filesystem method used to translate file offsets to blocks - * - * Similar to block_truncate_page(), this function can be called by a - * filesystem when it is truncating a DAX file to handle the partial page. 
- */ -int dax_truncate_page(struct inode *inode, loff_t from, get_block_t get_block) +static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos) { - unsigned length = PAGE_ALIGN(from) - from; - return dax_zero_page_range(inode, from, length, get_block); + return iomap->blkno + (((pos & PAGE_MASK) - iomap->offset) >> 9); } -EXPORT_SYMBOL_GPL(dax_truncate_page); -#ifdef CONFIG_FS_IOMAP static loff_t -iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, +dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, struct iomap *iomap) { struct iov_iter *iter = data; @@ -1265,13 +1015,23 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, if (WARN_ON_ONCE(iomap->type != IOMAP_MAPPED)) return -EIO; + /* + * Write can allocate block for an area which has a hole page mapped + * into page tables. We have to tear down these mappings so that data + * written by write(2) is visible in mmap. + */ + if ((iomap->flags & IOMAP_F_NEW) && inode->i_mapping->nrpages) { + invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_SHIFT, + (end - 1) >> PAGE_SHIFT); + } + while (pos < end) { unsigned offset = pos & (PAGE_SIZE - 1); struct blk_dax_ctl dax = { 0 }; ssize_t map_len; - dax.sector = iomap->blkno + - (((pos & PAGE_MASK) - iomap->offset) >> 9); + dax.sector = dax_iomap_sector(iomap, pos); dax.size = (length + offset + PAGE_SIZE - 1) & PAGE_MASK; map_len = dax_map_atomic(iomap->bdev, &dax); if (map_len < 0) { @@ -1303,7 +1063,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, } /** - * iomap_dax_rw - Perform I/O to a DAX file + * dax_iomap_rw - Perform I/O to a DAX file * @iocb: The control block for this I/O * @iter: The addresses to do I/O from or to * @ops: iomap ops passed from the file system @@ -1313,7 +1073,7 @@ iomap_dax_actor(struct inode *inode, loff_t pos, loff_t length, void *data, * and evicting any page cache pages in the region under I/O. */ ssize_t -iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, +dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops) { struct address_space *mapping = iocb->ki_filp->f_mapping; @@ -1324,26 +1084,9 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, if (iov_iter_rw(iter) == WRITE) flags |= IOMAP_WRITE; - /* - * Yes, even DAX files can have page cache attached to them: A zeroed - * page is inserted into the pagecache when we have to serve a write - * fault on a hole. It should never be dirtied and can simply be - * dropped from the pagecache once we get real data for the page. - * - * XXX: This is racy against mmap, and there's nothing we can do about - * it. We'll eventually need to shift this down even further so that - * we can check if we allocated blocks over a hole first. - */ - if (mapping->nrpages) { - ret = invalidate_inode_pages2_range(mapping, - pos >> PAGE_SHIFT, - (pos + iov_iter_count(iter) - 1) >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - } - while (iov_iter_count(iter)) { ret = iomap_apply(inode, pos, iov_iter_count(iter), flags, ops, - iter, iomap_dax_actor); + iter, dax_iomap_actor); if (ret <= 0) break; pos += ret; @@ -1353,10 +1096,19 @@ iomap_dax_rw(struct kiocb *iocb, struct iov_iter *iter, iocb->ki_pos += done; return done ? 
done : ret; } -EXPORT_SYMBOL_GPL(iomap_dax_rw); +EXPORT_SYMBOL_GPL(dax_iomap_rw); + +static int dax_fault_return(int error) +{ + if (error == 0) + return VM_FAULT_NOPAGE; + if (error == -ENOMEM) + return VM_FAULT_OOM; + return VM_FAULT_SIGBUS; +} /** - * iomap_dax_fault - handle a page fault on a DAX file + * dax_iomap_fault - handle a page fault on a DAX file * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault * @ops: iomap ops passed from the file system @@ -1365,17 +1117,18 @@ EXPORT_SYMBOL_GPL(iomap_dax_rw); * or mkwrite handler for DAX files. Assumes the caller has done all the * necessary locking for the page fault to proceed successfully. */ -int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, +int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, struct iomap_ops *ops) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; sector_t sector; struct iomap iomap = { 0 }; - unsigned flags = 0; + unsigned flags = IOMAP_FAULT; int error, major = 0; + int vmf_ret = 0; void *entry; /* @@ -1386,12 +1139,6 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (pos >= i_size_read(inode)) return VM_FAULT_SIGBUS; - entry = grab_mapping_entry(mapping, vmf->pgoff); - if (IS_ERR(entry)) { - error = PTR_ERR(entry); - goto out; - } - if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) flags |= IOMAP_WRITE; @@ -1402,13 +1149,19 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, */ error = ops->iomap_begin(inode, pos, PAGE_SIZE, flags, &iomap); if (error) - goto unlock_entry; + return dax_fault_return(error); if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { - error = -EIO; /* fs corruption? */ - goto unlock_entry; + vmf_ret = dax_fault_return(-EIO); /* fs corruption? 
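dax_fault_return() above centralizes the errno translation that the removed handlers open-coded at their 'out' labels: 0 becomes VM_FAULT_NOPAGE, -ENOMEM becomes VM_FAULT_OOM, anything else VM_FAULT_SIGBUS. A userspace sketch; the VM_FAULT_* values are stand-ins for the ones in linux/mm.h:

        /* errno -> VM_FAULT_* translation in the style of dax_fault_return(). */
        #include <errno.h>
        #include <stdio.h>

        #define VM_FAULT_NOPAGE 0x0100 /* placeholder values for the sketch */
        #define VM_FAULT_OOM    0x0001
        #define VM_FAULT_SIGBUS 0x0002

        static int fault_return(int error)
        {
                if (error == 0)
                        return VM_FAULT_NOPAGE;
                if (error == -ENOMEM)
                        return VM_FAULT_OOM;
                return VM_FAULT_SIGBUS;
        }

        int main(void)
        {
                printf("0       -> %#x\n", fault_return(0));
                printf("-ENOMEM -> %#x\n", fault_return(-ENOMEM));
                printf("-EIO    -> %#x\n", fault_return(-EIO));
                return 0;
        }
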
*/ + goto finish_iomap; + } + + entry = grab_mapping_entry(mapping, vmf->pgoff, 0); + if (IS_ERR(entry)) { + vmf_ret = dax_fault_return(PTR_ERR(entry)); + goto finish_iomap; } - sector = iomap.blkno + (((pos & PAGE_MASK) - iomap.offset) >> 9); + sector = dax_iomap_sector(&iomap, pos); if (vmf->cow_page) { switch (iomap.type) { @@ -1427,13 +1180,13 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } if (error) - goto unlock_entry; - if (!radix_tree_exceptional_entry(entry)) { - vmf->page = entry; - return VM_FAULT_LOCKED; - } - vmf->entry = entry; - return VM_FAULT_DAX_LOCKED; + goto error_unlock_entry; + + __SetPageUptodate(vmf->cow_page); + vmf_ret = finish_fault(vmf); + if (!vmf_ret) + vmf_ret = VM_FAULT_DONE_COW; + goto unlock_entry; } switch (iomap.type) { @@ -1445,11 +1198,16 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } error = dax_insert_mapping(mapping, iomap.bdev, sector, PAGE_SIZE, &entry, vma, vmf); + /* -EBUSY is fine, somebody else faulted on the same PTE */ + if (error == -EBUSY) + error = 0; break; case IOMAP_UNWRITTEN: case IOMAP_HOLE: - if (!(vmf->flags & FAULT_FLAG_WRITE)) - return dax_load_hole(mapping, entry, vmf); + if (!(vmf->flags & FAULT_FLAG_WRITE)) { + vmf_ret = dax_load_hole(mapping, &entry, vmf); + goto unlock_entry; + } /*FALLTHRU*/ default: WARN_ON_ONCE(1); @@ -1457,15 +1215,215 @@ int iomap_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, break; } + error_unlock_entry: + vmf_ret = dax_fault_return(error) | major; unlock_entry: put_locked_mapping_entry(mapping, vmf->pgoff, entry); - out: - if (error == -ENOMEM) - return VM_FAULT_OOM | major; - /* -EBUSY is fine, somebody else faulted on the same PTE */ - if (error < 0 && error != -EBUSY) - return VM_FAULT_SIGBUS | major; - return VM_FAULT_NOPAGE | major; + finish_iomap: + if (ops->iomap_end) { + int copied = PAGE_SIZE; + + if (vmf_ret & VM_FAULT_ERROR) + copied = 0; + /* + * The fault is done by now and there's no way back (other + * thread may be already happily using PTE we have installed). + * Just ignore error from ->iomap_end since we cannot do much + * with it. + */ + ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); + } + return vmf_ret; +} +EXPORT_SYMBOL_GPL(dax_iomap_fault); + +#ifdef CONFIG_FS_DAX_PMD +/* + * The 'colour' (ie low bits) within a PMD of a page offset. This comes up + * more often than one might expect in the below functions. 
+ */ +#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) + +static int dax_pmd_insert_mapping(struct vm_area_struct *vma, pmd_t *pmd, + struct vm_fault *vmf, unsigned long address, + struct iomap *iomap, loff_t pos, bool write, void **entryp) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + struct block_device *bdev = iomap->bdev; + struct blk_dax_ctl dax = { + .sector = dax_iomap_sector(iomap, pos), + .size = PMD_SIZE, + }; + long length = dax_map_atomic(bdev, &dax); + void *ret; + + if (length < 0) /* dax_map_atomic() failed */ + return VM_FAULT_FALLBACK; + if (length < PMD_SIZE) + goto unmap_fallback; + if (pfn_t_to_pfn(dax.pfn) & PG_PMD_COLOUR) + goto unmap_fallback; + if (!pfn_t_devmap(dax.pfn)) + goto unmap_fallback; + + dax_unmap_atomic(bdev, &dax); + + ret = dax_insert_mapping_entry(mapping, vmf, *entryp, dax.sector, + RADIX_DAX_PMD); + if (IS_ERR(ret)) + return VM_FAULT_FALLBACK; + *entryp = ret; + + return vmf_insert_pfn_pmd(vma, address, pmd, dax.pfn, write); + + unmap_fallback: + dax_unmap_atomic(bdev, &dax); + return VM_FAULT_FALLBACK; +} + +static int dax_pmd_load_hole(struct vm_area_struct *vma, pmd_t *pmd, + struct vm_fault *vmf, unsigned long address, + struct iomap *iomap, void **entryp) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + unsigned long pmd_addr = address & PMD_MASK; + struct page *zero_page; + spinlock_t *ptl; + pmd_t pmd_entry; + void *ret; + + zero_page = mm_get_huge_zero_page(vma->vm_mm); + + if (unlikely(!zero_page)) + return VM_FAULT_FALLBACK; + + ret = dax_insert_mapping_entry(mapping, vmf, *entryp, 0, + RADIX_DAX_PMD | RADIX_DAX_HZP); + if (IS_ERR(ret)) + return VM_FAULT_FALLBACK; + *entryp = ret; + + ptl = pmd_lock(vma->vm_mm, pmd); + if (!pmd_none(*pmd)) { + spin_unlock(ptl); + return VM_FAULT_FALLBACK; + } + + pmd_entry = mk_pmd(zero_page, vma->vm_page_prot); + pmd_entry = pmd_mkhuge(pmd_entry); + set_pmd_at(vma->vm_mm, pmd_addr, pmd, pmd_entry); + spin_unlock(ptl); + return VM_FAULT_NOPAGE; +} + +int dax_iomap_pmd_fault(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmd, unsigned int flags, struct iomap_ops *ops) +{ + struct address_space *mapping = vma->vm_file->f_mapping; + unsigned long pmd_addr = address & PMD_MASK; + bool write = flags & FAULT_FLAG_WRITE; + unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; + struct inode *inode = mapping->host; + int result = VM_FAULT_FALLBACK; + struct iomap iomap = { 0 }; + pgoff_t max_pgoff, pgoff; + struct vm_fault vmf; + void *entry; + loff_t pos; + int error; + + /* Fall back to PTEs if we're going to COW */ + if (write && !(vma->vm_flags & VM_SHARED)) + goto fallback; + + /* If the PMD would extend outside the VMA */ + if (pmd_addr < vma->vm_start) + goto fallback; + if ((pmd_addr + PMD_SIZE) > vma->vm_end) + goto fallback; + + /* + * Check whether offset isn't beyond end of file now. Caller is + * supposed to hold locks serializing us with truncate / punch hole so + * this is a reliable test. + */ + pgoff = linear_page_index(vma, pmd_addr); + max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; + + if (pgoff > max_pgoff) + return VM_FAULT_SIGBUS; + + /* If the PMD would extend beyond the file size */ + if ((pgoff | PG_PMD_COLOUR) > max_pgoff) + goto fallback; + + /* + * Note that we don't use iomap_apply here. We aren't doing I/O, only + * setting up a mapping, so really we're using iomap_begin() as a way + * to look up our filesystem block. 
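The bounds logic above distinguishes a fault entirely past EOF (SIGBUS) from a PMD that would merely spill past EOF (fall back to PTE faults). Worked numbers for a 3MiB file, with the usual 4K/2M shift values assumed:

        /* max_pgoff checks from dax_iomap_pmd_fault(), with sample numbers. */
        #include <stdio.h>

        #define PAGE_SHIFT 12
        #define PG_PMD_COLOUR ((1UL << (21 - PAGE_SHIFT)) - 1) /* 511 */

        int main(void)
        {
                unsigned long long i_size = 3ull << 20;               /* 3MiB */
                unsigned long max_pgoff = (i_size - 1) >> PAGE_SHIFT; /* 0x2ff */
                unsigned long pgoff = 0x200; /* second 2MiB PMD of the file */

                if (pgoff > max_pgoff)
                        printf("SIGBUS: fault past EOF\n");
                else if ((pgoff | PG_PMD_COLOUR) > max_pgoff)
                        printf("fallback: PMD at %#lx would pass index %#lx\n",
                               pgoff, max_pgoff);
                else
                        printf("PMD fits entirely inside the file\n");
                return 0;
        }
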
+ */ + pos = (loff_t)pgoff << PAGE_SHIFT; + error = ops->iomap_begin(inode, pos, PMD_SIZE, iomap_flags, &iomap); + if (error) + goto fallback; + + if (iomap.offset + iomap.length < pos + PMD_SIZE) + goto finish_iomap; + + /* + * grab_mapping_entry() will make sure we get a 2M empty entry, a DAX + * PMD or a HZP entry. If it can't (because a 4k page is already in + * the tree, for instance), it will return -EEXIST and we just fall + * back to 4k entries. + */ + entry = grab_mapping_entry(mapping, pgoff, RADIX_DAX_PMD); + if (IS_ERR(entry)) + goto finish_iomap; + + vmf.pgoff = pgoff; + vmf.flags = flags; + vmf.gfp_mask = mapping_gfp_mask(mapping) | __GFP_IO; + + switch (iomap.type) { + case IOMAP_MAPPED: + result = dax_pmd_insert_mapping(vma, pmd, &vmf, address, + &iomap, pos, write, &entry); + break; + case IOMAP_UNWRITTEN: + case IOMAP_HOLE: + if (WARN_ON_ONCE(write)) + goto unlock_entry; + result = dax_pmd_load_hole(vma, pmd, &vmf, address, &iomap, + &entry); + break; + default: + WARN_ON_ONCE(1); + break; + } + + unlock_entry: + put_locked_mapping_entry(mapping, pgoff, entry); + finish_iomap: + if (ops->iomap_end) { + int copied = PMD_SIZE; + + if (result == VM_FAULT_FALLBACK) + copied = 0; + /* + * The fault is done by now and there's no way back (other + * thread may be already happily using PMD we have installed). + * Just ignore error from ->iomap_end since we cannot do much + * with it. + */ + ops->iomap_end(inode, pos, PMD_SIZE, copied, iomap_flags, + &iomap); + } + fallback: + if (result == VM_FAULT_FALLBACK) { + split_huge_pmd(vma, pmd, address); + count_vm_event(THP_FAULT_FALLBACK); + } + return result; } -EXPORT_SYMBOL_GPL(iomap_dax_fault); -#endif /* CONFIG_FS_IOMAP */ +EXPORT_SYMBOL_GPL(dax_iomap_pmd_fault); +#endif /* CONFIG_FS_DAX_PMD */ diff --git a/fs/dcache.c b/fs/dcache.c index 5c7cc953ac81..95d71eda8142 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -26,7 +26,7 @@ #include <linux/export.h> #include <linux/mount.h> #include <linux/file.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/security.h> #include <linux/seqlock.h> #include <linux/swap.h> @@ -1273,38 +1273,44 @@ rename_retry: goto again; } -/* - * Search for at least 1 mount point in the dentry's subdirs. - * We descend to the next level whenever the d_subdirs - * list is non-empty and continue searching. - */ +struct check_mount { + struct vfsmount *mnt; + unsigned int mounted; +}; -static enum d_walk_ret check_mount(void *data, struct dentry *dentry) +static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) { - int *ret = data; - if (d_mountpoint(dentry)) { - *ret = 1; + struct check_mount *info = data; + struct path path = { .mnt = info->mnt, .dentry = dentry }; + + if (likely(!d_mountpoint(dentry))) + return D_WALK_CONTINUE; + if (__path_is_mountpoint(&path)) { + info->mounted = 1; return D_WALK_QUIT; } return D_WALK_CONTINUE; } /** - * have_submounts - check for mounts over a dentry - * @parent: dentry to check. + * path_has_submounts - check for mounts over a dentry in the + * current namespace. + * @parent: path to check. * * Return true if the parent or its subdirectories contain - * a mount point + * a mount point in the current namespace. 
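 *
 * A hypothetical caller (sketch; mnt and dentry assumed to be in hand)
 * deciding whether a subtree may be expired could do:
 *
 *	struct path path = { .mnt = mnt, .dentry = dentry };
 *	if (path_has_submounts(&path))
 *		return -EBUSY;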
*/ -int have_submounts(struct dentry *parent) +int path_has_submounts(const struct path *parent) { - int ret = 0; + struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; - d_walk(parent, &ret, check_mount, NULL); + read_seqlock_excl(&mount_lock); + d_walk(parent->dentry, &data, path_check_mount, NULL); + read_sequnlock_excl(&mount_lock); - return ret; + return data.mounted; } -EXPORT_SYMBOL(have_submounts); +EXPORT_SYMBOL(path_has_submounts); /* * Called by mount code to set a mountpoint and check if the mountpoint is @@ -1330,8 +1336,11 @@ int d_set_mounted(struct dentry *dentry) } spin_lock(&dentry->d_lock); if (!d_unlinked(dentry)) { - dentry->d_flags |= DCACHE_MOUNTED; - ret = 0; + ret = -EBUSY; + if (!d_mountpoint(dentry)) { + dentry->d_flags |= DCACHE_MOUNTED; + ret = 0; + } } spin_unlock(&dentry->d_lock); out: diff --git a/fs/dcookies.c b/fs/dcookies.c index ac44a69fbea9..0d0461cf2431 100644 --- a/fs/dcookies.c +++ b/fs/dcookies.c @@ -26,7 +26,7 @@ #include <linux/mutex.h> #include <linux/path.h> #include <linux/compat.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* The dcookies are allocated from a kmem_cache and * hashed onto a small number of lists. None of the @@ -90,7 +90,7 @@ static void hash_dcookie(struct dcookie_struct * dcs) } -static struct dcookie_struct *alloc_dcookie(struct path *path) +static struct dcookie_struct *alloc_dcookie(const struct path *path) { struct dcookie_struct *dcs = kmem_cache_alloc(dcookie_cache, GFP_KERNEL); @@ -113,7 +113,7 @@ static struct dcookie_struct *alloc_dcookie(struct path *path) /* This is the main kernel-side routine that retrieves the cookie * value for a dentry/vfsmnt pair. */ -int get_dcookie(struct path *path, unsigned long *cookie) +int get_dcookie(const struct path *path, unsigned long *cookie) { int err = 0; struct dcookie_struct * dcs; diff --git a/fs/direct-io.c b/fs/direct-io.c index fb9aa16a7727..c87bae4376b8 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -457,7 +457,7 @@ static struct bio *dio_await_one(struct dio *dio) dio->waiter = current; spin_unlock_irqrestore(&dio->bio_lock, flags); if (!(dio->iocb->ki_flags & IOCB_HIPRI) || - !blk_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) + !blk_mq_poll(bdev_get_queue(dio->bio_bdev), dio->bio_cookie)) io_schedule(); /* wake up sets us TASK_RUNNING */ spin_lock_irqsave(&dio->bio_lock, flags); @@ -554,7 +554,7 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio) * filesystems that don't need it and also allows us to create the workqueue * late enough so that we can include s_id in the name of the workqueue. */ -static int sb_init_dio_done_wq(struct super_block *sb) +int sb_init_dio_done_wq(struct super_block *sb) { struct workqueue_struct *old; struct workqueue_struct *wq = alloc_workqueue("dio/%s", @@ -843,24 +843,6 @@ out: } /* - * Clean any dirty buffers in the blockdev mapping which alias newly-created - * file blocks. Only called for S_ISREG files - blockdevs do not set - * buffer_new - */ -static void clean_blockdev_aliases(struct dio *dio, struct buffer_head *map_bh) -{ - unsigned i; - unsigned nblocks; - - nblocks = map_bh->b_size >> dio->inode->i_blkbits; - - for (i = 0; i < nblocks; i++) { - unmap_underlying_metadata(map_bh->b_bdev, - map_bh->b_blocknr + i); - } -} - -/* * If we are not writing the entire block and get_block() allocated * the block for us, we need to fill in the unused portion of the * block with zeros.
This happens only if user-buffer, fileoffset or @@ -924,6 +906,7 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, struct buffer_head *map_bh) { const unsigned blkbits = sdio->blkbits; + const unsigned i_blkbits = blkbits + sdio->blkfactor; int ret = 0; while (sdio->block_in_file < sdio->final_block_in_request) { @@ -960,11 +943,15 @@ static int do_direct_IO(struct dio *dio, struct dio_submit *sdio, goto do_holes; sdio->blocks_available = - map_bh->b_size >> sdio->blkbits; + map_bh->b_size >> blkbits; sdio->next_block_for_io = map_bh->b_blocknr << sdio->blkfactor; - if (buffer_new(map_bh)) - clean_blockdev_aliases(dio, map_bh); + if (buffer_new(map_bh)) { + clean_bdev_aliases( + map_bh->b_bdev, + map_bh->b_blocknr, + map_bh->b_size >> i_blkbits); + } if (!sdio->blkfactor) goto do_holes; @@ -1209,7 +1196,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->inode = inode; if (iov_iter_rw(iter) == WRITE) { dio->op = REQ_OP_WRITE; - dio->op_flags = WRITE_ODIRECT; + dio->op_flags = REQ_SYNC | REQ_IDLE; } else { dio->op = REQ_OP_READ; } diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c index dcea1e37a1b7..07fed838d8fd 100644 --- a/fs/dlm/ast.c +++ b/fs/dlm/ast.c @@ -268,7 +268,7 @@ void dlm_callback_work(struct work_struct *work) int dlm_callback_start(struct dlm_ls *ls) { ls->ls_callback_wq = alloc_workqueue("dlm_callback", - WQ_UNBOUND | WQ_MEM_RECLAIM, 0); + WQ_HIGHPRI | WQ_MEM_RECLAIM, 0); if (!ls->ls_callback_wq) { log_print("can't start dlm_callback workqueue"); return -ENOMEM; diff --git a/fs/dlm/config.c b/fs/dlm/config.c index df955d2209ce..7211e826d90d 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -12,7 +12,7 @@ ******************************************************************************/ #include <linux/kernel.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/configfs.h> #include <linux/slab.h> #include <linux/in.h> diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index 466f7d60edc2..ca7089aeadab 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -12,7 +12,7 @@ #include <linux/pagemap.h> #include <linux/seq_file.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/ctype.h> #include <linux/debugfs.h> #include <linux/slab.h> diff --git a/fs/dlm/dlm_internal.h b/fs/dlm/dlm_internal.h index 216b61604ef9..748e8d59e611 100644 --- a/fs/dlm/dlm_internal.h +++ b/fs/dlm/dlm_internal.h @@ -18,7 +18,6 @@ * This is the main header file to be included in each DLM source file. 
*/ -#include <linux/module.h> #include <linux/slab.h> #include <linux/sched.h> #include <linux/types.h> @@ -39,7 +38,7 @@ #include <linux/mutex.h> #include <linux/idr.h> #include <linux/ratelimit.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/dlm.h> #include "config.h" diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 35502d4046f5..6df332296c66 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -1395,7 +1395,6 @@ static int nodeid_warned(int nodeid, int num_nodes, int *warned) void dlm_scan_waiters(struct dlm_ls *ls) { struct dlm_lkb *lkb; - ktime_t zero = ktime_set(0, 0); s64 us; s64 debug_maxus = 0; u32 debug_scanned = 0; @@ -1409,7 +1408,7 @@ void dlm_scan_waiters(struct dlm_ls *ls) mutex_lock(&ls->ls_waiters_mutex); list_for_each_entry(lkb, &ls->ls_waiters, lkb_wait_reply) { - if (ktime_equal(lkb->lkb_wait_time, zero)) + if (!lkb->lkb_wait_time) continue; debug_scanned++; @@ -1419,7 +1418,7 @@ void dlm_scan_waiters(struct dlm_ls *ls) if (us < dlm_config.ci_waitwarn_us) continue; - lkb->lkb_wait_time = zero; + lkb->lkb_wait_time = 0; debug_expired++; if (us > debug_maxus) diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index f3e72787e7f9..91592b75c309 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -11,6 +11,8 @@ ******************************************************************************* ******************************************************************************/ +#include <linux/module.h> + #include "dlm_internal.h" #include "lockspace.h" #include "member.h" diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 609998de533e..7d398d300e97 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -519,29 +519,25 @@ out: /* Note: sk_callback_lock must be locked before calling this function. */ static void save_callbacks(struct connection *con, struct sock *sk) { - lock_sock(sk); con->orig_data_ready = sk->sk_data_ready; con->orig_state_change = sk->sk_state_change; con->orig_write_space = sk->sk_write_space; con->orig_error_report = sk->sk_error_report; - release_sock(sk); } static void restore_callbacks(struct connection *con, struct sock *sk) { write_lock_bh(&sk->sk_callback_lock); - lock_sock(sk); sk->sk_user_data = NULL; sk->sk_data_ready = con->orig_data_ready; sk->sk_state_change = con->orig_state_change; sk->sk_write_space = con->orig_write_space; sk->sk_error_report = con->orig_error_report; - release_sock(sk); write_unlock_bh(&sk->sk_callback_lock); } /* Make a socket active */ -static void add_sock(struct socket *sock, struct connection *con) +static void add_sock(struct socket *sock, struct connection *con, bool save_cb) { struct sock *sk = sock->sk; @@ -549,7 +545,7 @@ static void add_sock(struct socket *sock, struct connection *con) con->sock = sock; sk->sk_user_data = con; - if (!test_bit(CF_IS_OTHERCON, &con->flags)) + if (save_cb) save_callbacks(con, sk); /* Install a data_ready callback */ sk->sk_data_ready = lowcomms_data_ready; @@ -806,7 +802,7 @@ static int tcp_accept_from_sock(struct connection *con) newcon->othercon = othercon; othercon->sock = newsock; newsock->sk->sk_user_data = othercon; - add_sock(newsock, othercon); + add_sock(newsock, othercon, false); addcon = othercon; } else { @@ -819,7 +815,10 @@ static int tcp_accept_from_sock(struct connection *con) else { newsock->sk->sk_user_data = newcon; newcon->rx_action = receive_from_sock; - add_sock(newsock, newcon); + /* accept copies the sk after we've saved the callbacks, so we + don't want to save them a second time or comm errors will + result in calling 
sk_error_report recursively. */ + add_sock(newsock, newcon, false); addcon = newcon; } @@ -880,7 +879,8 @@ static int sctp_accept_from_sock(struct connection *con) } make_sockaddr(&prim.ssp_addr, 0, &addr_len); - if (addr_to_nodeid(&prim.ssp_addr, &nodeid)) { + ret = addr_to_nodeid(&prim.ssp_addr, &nodeid); + if (ret) { unsigned char *b = (unsigned char *)&prim.ssp_addr; log_print("reject connect from unknown addr"); @@ -919,7 +919,7 @@ static int sctp_accept_from_sock(struct connection *con) newcon->othercon = othercon; othercon->sock = newsock; newsock->sk->sk_user_data = othercon; - add_sock(newsock, othercon); + add_sock(newsock, othercon, false); addcon = othercon; } else { printk("Extra connection from node %d attempted\n", nodeid); @@ -930,7 +930,7 @@ static int sctp_accept_from_sock(struct connection *con) } else { newsock->sk->sk_user_data = newcon; newcon->rx_action = receive_from_sock; - add_sock(newsock, newcon); + add_sock(newsock, newcon, false); addcon = newcon; } @@ -1058,7 +1058,7 @@ static void sctp_connect_to_sock(struct connection *con) sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; con->connect_action = sctp_connect_to_sock; - add_sock(sock, con); + add_sock(sock, con, true); /* Bind to all addresses. */ if (sctp_bind_addrs(con, 0)) @@ -1146,7 +1146,7 @@ static void tcp_connect_to_sock(struct connection *con) sock->sk->sk_user_data = con; con->rx_action = receive_from_sock; con->connect_action = tcp_connect_to_sock; - add_sock(sock, con); + add_sock(sock, con, true); /* Bind to our cluster-known address connecting to avoid routing problems */ @@ -1366,7 +1366,7 @@ static int tcp_listen_for_all(void) sock = tcp_create_listen_sock(con, dlm_local_addr[0]); if (sock) { - add_sock(sock, con); + add_sock(sock, con, true); result = 0; } else { diff --git a/fs/dlm/main.c b/fs/dlm/main.c index 079c0bd71ab7..8e1b618891be 100644 --- a/fs/dlm/main.c +++ b/fs/dlm/main.c @@ -11,6 +11,8 @@ ******************************************************************************* ******************************************************************************/ +#include <linux/module.h> + #include "dlm_internal.h" #include "lockspace.h" #include "lock.h" diff --git a/fs/dlm/netlink.c b/fs/dlm/netlink.c index 1e6e227134d7..43a96c330570 100644 --- a/fs/dlm/netlink.c +++ b/fs/dlm/netlink.c @@ -16,11 +16,7 @@ static uint32_t dlm_nl_seqnum; static uint32_t listener_nlportid; -static struct genl_family family = { - .id = GENL_ID_GENERATE, - .name = DLM_GENL_NAME, - .version = DLM_GENL_VERSION, -}; +static struct genl_family family; static int prepare_data(u8 cmd, struct sk_buff **skbp, size_t size) { @@ -69,16 +65,24 @@ static int user_cmd(struct sk_buff *skb, struct genl_info *info) return 0; } -static struct genl_ops dlm_nl_ops[] = { +static const struct genl_ops dlm_nl_ops[] = { { .cmd = DLM_CMD_HELLO, .doit = user_cmd, }, }; +static struct genl_family family __ro_after_init = { + .name = DLM_GENL_NAME, + .version = DLM_GENL_VERSION, + .ops = dlm_nl_ops, + .n_ops = ARRAY_SIZE(dlm_nl_ops), + .module = THIS_MODULE, +}; + int __init dlm_netlink_init(void) { - return genl_register_family_with_ops(&family, dlm_nl_ops); + return genl_register_family(&family); } void dlm_netlink_exit(void) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 58c2f4a21b7f..1ce908c2232c 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -9,7 +9,6 @@ #include <linux/miscdevice.h> #include <linux/init.h> #include <linux/wait.h> -#include <linux/module.h> #include <linux/file.h> #include <linux/fs.h> #include 
<linux/poll.h> diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index cf390dceddd2..e7413f82d27b 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -631,28 +631,23 @@ out_lock: static char *ecryptfs_readlink_lower(struct dentry *dentry, size_t *bufsiz) { + DEFINE_DELAYED_CALL(done); struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - char *lower_buf; + const char *link; char *buf; - mm_segment_t old_fs; int rc; - lower_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!lower_buf) - return ERR_PTR(-ENOMEM); - old_fs = get_fs(); - set_fs(get_ds()); - rc = d_inode(lower_dentry)->i_op->readlink(lower_dentry, - (char __user *)lower_buf, - PATH_MAX); - set_fs(old_fs); - if (rc < 0) - goto out; + link = vfs_get_link(lower_dentry, &done); + if (IS_ERR(link)) + return ERR_CAST(link); + rc = ecryptfs_decode_and_decrypt_filename(&buf, bufsiz, dentry->d_sb, - lower_buf, rc); -out: - kfree(lower_buf); - return rc ? ERR_PTR(rc) : buf; + link, strlen(link)); + do_delayed_call(&done); + if (rc) + return ERR_PTR(rc); + + return buf; } static const char *ecryptfs_get_link(struct dentry *dentry, @@ -1089,7 +1084,6 @@ out: } const struct inode_operations ecryptfs_symlink_iops = { - .readlink = generic_readlink, .get_link = ecryptfs_get_link, .permission = ecryptfs_permission, .setattr = ecryptfs_setattr, diff --git a/fs/efs/efs.h b/fs/efs/efs.h index 5bbf9612140c..70f5d4f9a945 100644 --- a/fs/efs/efs.h +++ b/fs/efs/efs.h @@ -14,7 +14,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/fs.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define EFS_VERSION "1.0a" diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 10db91218933..bcb68fcc8445 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -34,7 +34,7 @@ #include <linux/mutex.h> #include <linux/anon_inodes.h> #include <linux/device.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/io.h> #include <asm/mman.h> #include <linux/atomic.h> diff --git a/fs/exec.c b/fs/exec.c index 4e497b9ee71e..e57946610733 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -19,7 +19,7 @@ * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary - * formats. + * formats. */ #include <linux/slab.h> @@ -58,7 +58,7 @@ #include <linux/compat.h> #include <linux/vmalloc.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/mmu_context.h> #include <asm/tlb.h> @@ -209,7 +209,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * doing the exec and bprm->mm is the new process's mm. */ ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, - &page, NULL); + &page, NULL, NULL); if (ret <= 0) return NULL; @@ -1169,8 +1169,10 @@ no_thread_group: /* we have changed execution domain */ tsk->exit_signal = SIGCHLD; +#ifdef CONFIG_POSIX_TIMERS exit_itimers(sig); flush_itimer_signals(); +#endif if (atomic_read(&oldsighand->count) != 1) { struct sighand_struct *newsighand; @@ -1266,6 +1268,13 @@ int flush_old_exec(struct linux_binprm * bprm) flush_thread(); current->personality &= ~bprm->per_clear; + /* + * We have to apply CLOEXEC before we change whether the process is + * dumpable (in setup_new_exec) to avoid a race with a process in userspace + * trying to access the should-be-closed file descriptors of a process + * undergoing exec(2). 
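+ *
+ * One interleaving this ordering rules out (hypothetical): a
+ * non-dumpable task execs an ordinary binary, setup_new_exec() makes
+ * it dumpable again, and before close-on-exec has run an attacker
+ * with the same uid ptrace-attaches and inspects file descriptors
+ * that were about to be closed.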
+ */ + do_close_on_exec(current->files); return 0; out: @@ -1275,8 +1284,22 @@ EXPORT_SYMBOL(flush_old_exec); void would_dump(struct linux_binprm *bprm, struct file *file) { - if (inode_permission(file_inode(file), MAY_READ) < 0) + struct inode *inode = file_inode(file); + if (inode_permission(inode, MAY_READ) < 0) { + struct user_namespace *old, *user_ns; bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP; + + /* Ensure mm->user_ns contains the executable */ + user_ns = old = bprm->mm->user_ns; + while ((user_ns != &init_user_ns) && + !privileged_wrt_inode_uidgid(user_ns, inode)) + user_ns = user_ns->parent; + + if (old != user_ns) { + bprm->mm->user_ns = get_user_ns(user_ns); + put_user_ns(old); + } + } } EXPORT_SYMBOL(would_dump); @@ -1306,7 +1329,6 @@ void setup_new_exec(struct linux_binprm * bprm) !gid_eq(bprm->cred->gid, current_egid())) { current->pdeath_signal = 0; } else { - would_dump(bprm, bprm->file); if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP) set_dumpable(current->mm, suid_dumpable); } @@ -1315,7 +1337,6 @@ void setup_new_exec(struct linux_binprm * bprm) group */ current->self_exec_id++; flush_signal_handlers(current, 0); - do_close_on_exec(current->files); } EXPORT_SYMBOL(setup_new_exec); @@ -1406,7 +1427,7 @@ static void check_unsafe_exec(struct linux_binprm *bprm) unsigned n_fs; if (p->ptrace) { - if (p->ptrace & PT_PTRACE_CAP) + if (ptracer_capable(p, current_user_ns())) bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP; else bprm->unsafe |= LSM_UNSAFE_PTRACE; @@ -1741,6 +1762,8 @@ static int do_execveat_common(int fd, struct filename *filename, if (retval < 0) goto out; + would_dump(bprm, bprm->file); + retval = exec_binprm(bprm); if (retval < 0) goto out; diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index d8072bc074a4..0ac62811b341 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -870,46 +870,31 @@ int exofs_write_begin(struct file *file, struct address_space *mapping, page = *pagep; if (page == NULL) { - ret = simple_write_begin(file, mapping, pos, len, flags, pagep, - fsdata); - if (ret) { - EXOFS_DBGMSG("simple_write_begin failed\n"); - goto out; + page = grab_cache_page_write_begin(mapping, pos >> PAGE_SHIFT, + flags); + if (!page) { + EXOFS_DBGMSG("grab_cache_page_write_begin failed\n"); + return -ENOMEM; } - - page = *pagep; + *pagep = page; } /* read modify write */ if (!PageUptodate(page) && (len != PAGE_SIZE)) { loff_t i_size = i_size_read(mapping->host); pgoff_t end_index = i_size >> PAGE_SHIFT; - size_t rlen; - if (page->index < end_index) - rlen = PAGE_SIZE; - else if (page->index == end_index) - rlen = i_size & ~PAGE_MASK; - else - rlen = 0; - - if (!rlen) { + if (page->index > end_index) { clear_highpage(page); SetPageUptodate(page); - goto out; - } - - ret = _readpage(page, true); - if (ret) { - /*SetPageError was done by _readpage. 
Is it ok?*/ - unlock_page(page); - EXOFS_DBGMSG("__readpage failed\n"); + } else { + ret = _readpage(page, true); + if (ret) { + unlock_page(page); + EXOFS_DBGMSG("__readpage failed\n"); + } } } -out: - if (unlikely(ret)) - _write_failed(mapping->host, pos + len); - return ret; } @@ -929,18 +914,25 @@ static int exofs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - /* According to comment in simple_write_end i_mutex is held */ - loff_t i_size = inode->i_size; - int ret; - - ret = simple_write_end(file, mapping,pos, len, copied, page, fsdata); - if (unlikely(ret)) - _write_failed(inode, pos + len); + loff_t last_pos = pos + copied; - /* TODO: once simple_write_end marks inode dirty remove */ - if (i_size != inode->i_size) + if (!PageUptodate(page)) { + if (copied < len) { + _write_failed(inode, pos + len); + copied = 0; + goto out; + } + SetPageUptodate(page); + } + if (last_pos > inode->i_size) { + i_size_write(inode, last_pos); mark_inode_dirty(inode); - return ret; + } + set_page_dirty(page); +out: + unlock_page(page); + put_page(page); + return copied; } static int exofs_releasepage(struct page *page, gfp_t gfp) diff --git a/fs/ext2/Kconfig b/fs/ext2/Kconfig index 36bea5adcaba..c634874e12d9 100644 --- a/fs/ext2/Kconfig +++ b/fs/ext2/Kconfig @@ -1,6 +1,5 @@ config EXT2_FS tristate "Second extended fs support" - select FS_IOMAP if FS_DAX help Ext2 is a standard Linux file system for hard disks. diff --git a/fs/ext2/file.c b/fs/ext2/file.c index a0e1478dfd04..b0f241528a30 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -38,7 +38,7 @@ static ssize_t ext2_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) return 0; /* skip atime */ inode_lock_shared(inode); - ret = iomap_dax_rw(iocb, to, &ext2_iomap_ops); + ret = dax_iomap_rw(iocb, to, &ext2_iomap_ops); inode_unlock_shared(inode); file_accessed(iocb->ki_filp); @@ -62,7 +62,7 @@ static ssize_t ext2_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret) goto out_unlock; - ret = iomap_dax_rw(iocb, from, &ext2_iomap_ops); + ret = dax_iomap_rw(iocb, from, &ext2_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); mark_inode_dirty(inode); @@ -99,7 +99,7 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } down_read(&ei->dax_sem); - ret = iomap_dax_fault(vma, vmf, &ext2_iomap_ops); + ret = dax_iomap_fault(vma, vmf, &ext2_iomap_ops); up_read(&ei->dax_sem); if (vmf->flags & FAULT_FLAG_WRITE) @@ -107,27 +107,6 @@ static int ext2_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return ret; } -static int ext2_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, - pmd_t *pmd, unsigned int flags) -{ - struct inode *inode = file_inode(vma->vm_file); - struct ext2_inode_info *ei = EXT2_I(inode); - int ret; - - if (flags & FAULT_FLAG_WRITE) { - sb_start_pagefault(inode->i_sb); - file_update_time(vma->vm_file); - } - down_read(&ei->dax_sem); - - ret = dax_pmd_fault(vma, addr, pmd, flags, ext2_get_block); - - up_read(&ei->dax_sem); - if (flags & FAULT_FLAG_WRITE) - sb_end_pagefault(inode->i_sb); - return ret; -} - static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -154,7 +133,11 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, static const struct vm_operations_struct ext2_dax_vm_ops = { .fault = ext2_dax_fault, - .pmd_fault = ext2_dax_pmd_fault, + /* + * .pmd_fault is not supported for DAX because allocation in 
ext2 + * cannot be reliably aligned to huge page sizes and so pmd faults + * will always fail and fail back to regular faults. + */ .page_mkwrite = ext2_dax_fault, .pfn_mkwrite = ext2_dax_pfn_mkwrite, }; @@ -166,7 +149,7 @@ static int ext2_file_mmap(struct file *file, struct vm_area_struct *vma) file_accessed(file); vma->vm_ops = &ext2_dax_vm_ops; - vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE; + vma->vm_flags |= VM_MIXEDMAP; return 0; } #else diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 41b8b44a391c..f073bfca694b 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -732,16 +732,13 @@ static int ext2_get_blocks(struct inode *inode, } if (IS_DAX(inode)) { - int i; - /* * We must unmap blocks before zeroing so that writeback cannot * overwrite zeros with stale data from block device page cache. */ - for (i = 0; i < count; i++) { - unmap_underlying_metadata(inode->i_sb->s_bdev, - le32_to_cpu(chain[depth-1].key) + i); - } + clean_bdev_aliases(inode->i_sb->s_bdev, + le32_to_cpu(chain[depth-1].key), + count); /* * block must be initialised before we put it in the tree * so that it's not found by another thread before it's @@ -754,9 +751,8 @@ static int ext2_get_blocks(struct inode *inode, mutex_unlock(&ei->truncate_mutex); goto cleanup; } - } else { - *new = true; } + *new = true; ext2_splice_branch(inode, iblock, partial, indirect_blks, count); mutex_unlock(&ei->truncate_mutex); @@ -850,6 +846,9 @@ struct iomap_ops ext2_iomap_ops = { .iomap_begin = ext2_iomap_begin, .iomap_end = ext2_iomap_end, }; +#else +/* Define empty ops for !CONFIG_FS_DAX case to avoid ugly ifdefs */ +struct iomap_ops ext2_iomap_ops; #endif /* CONFIG_FS_DAX */ int ext2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -1293,9 +1292,11 @@ static int ext2_setsize(struct inode *inode, loff_t newsize) inode_dio_wait(inode); - if (IS_DAX(inode)) - error = dax_truncate_page(inode, newsize, ext2_get_block); - else if (test_opt(inode->i_sb, NOBH)) + if (IS_DAX(inode)) { + error = iomap_zero_range(inode, newsize, + PAGE_ALIGN(newsize) - newsize, NULL, + &ext2_iomap_ops); + } else if (test_opt(inode->i_sb, NOBH)) error = nobh_truncate_page(inode->i_mapping, newsize, ext2_get_block); else @@ -1476,6 +1477,10 @@ struct inode *ext2_iget (struct super_block *sb, unsigned long ino) inode->i_size |= ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32; else ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl); + if (i_size_read(inode) < 0) { + ret = -EFSCORRUPTED; + goto bad_inode; + } ei->i_dtime = 0; inode->i_generation = le32_to_cpu(raw_inode->i_generation); ei->i_state = 0; diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 9d617423e936..191e02b28ce8 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -14,7 +14,7 @@ #include <linux/compat.h> #include <linux/mount.h> #include <asm/current.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 6cb042b53b5b..9e25a71fe1a2 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -31,7 +31,7 @@ #include <linux/mount.h> #include <linux/log2.h> #include <linux/quotaops.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "ext2.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext2/symlink.c b/fs/ext2/symlink.c index 8437b191bf5d..eeffb0138a17 100644 --- a/fs/ext2/symlink.c +++ b/fs/ext2/symlink.c @@ -21,7 +21,6 @@ #include "xattr.h" const struct inode_operations ext2_symlink_inode_operations = { - .readlink = generic_readlink, 
.get_link = page_get_link, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR @@ -30,7 +29,6 @@ const struct inode_operations ext2_symlink_inode_operations = { }; const struct inode_operations ext2_fast_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = simple_get_link, .setattr = ext2_setattr, #ifdef CONFIG_EXT2_FS_XATTR diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index dfa519979038..fd389935ecd1 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -196,7 +196,7 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, error = posix_acl_update_mode(inode, &inode->i_mode, &acl); if (error) return error; - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); } break; diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index a8a750f59621..2163c1e69f2a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -397,8 +397,9 @@ struct flex_groups { #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */ +#define EXT4_FL_USER_MODIFIABLE 0x204BC0FF /* User modifiable flags */ +/* Flags we can manipulate through EXT4_IOC_FSSETXATTR */ #define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ EXT4_IMMUTABLE_FL | \ EXT4_APPEND_FL | \ @@ -1533,12 +1534,6 @@ static inline struct ext4_inode_info *EXT4_I(struct inode *inode) return container_of(inode, struct ext4_inode_info, vfs_inode); } -static inline struct timespec ext4_current_time(struct inode *inode) -{ - return (inode->i_sb->s_time_gran < NSEC_PER_SEC) ? - current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; -} - static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || @@ -2277,11 +2272,6 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, struct ext4_group_desc *gdp); ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); -static inline int ext4_sb_has_crypto(struct super_block *sb) -{ - return ext4_has_feature_encrypt(sb); -} - static inline bool ext4_encrypted_inode(struct inode *inode) { return ext4_test_inode_flag(inode, EXT4_INODE_ENCRYPT); @@ -2339,8 +2329,8 @@ static inline void ext4_fname_free_filename(struct ext4_filename *fname) { } #define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page #define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page #define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_process_policy fscrypt_notsupp_process_policy -#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy +#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy #define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context #define fscrypt_inherit_context fscrypt_notsupp_inherit_context #define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info @@ -2458,8 +2448,6 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_unwritten(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_dax_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_dio_get_block(struct inode *inode, sector_t iblock, @@ -2492,7
+2480,7 @@ extern int ext4_change_inode_journal_flag(struct inode *, int); extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *); extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); -extern void ext4_truncate(struct inode *); +extern int ext4_truncate(struct inode *); extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length); extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks); extern void ext4_set_inode_flags(struct inode *); @@ -3129,7 +3117,7 @@ extern int ext4_ext_writepage_trans_blocks(struct inode *, int); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); -extern void ext4_ext_truncate(handle_t *, struct inode *); +extern int ext4_ext_truncate(handle_t *, struct inode *); extern int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end); extern void ext4_ext_init(struct super_block *); @@ -3265,12 +3253,7 @@ static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) } } -static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len) -{ - int blksize = 1 << inode->i_blkbits; - - return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize); -} +extern struct iomap_ops ext4_iomap_ops; #endif /* __KERNEL__ */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index b1d52c14098e..f97611171023 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -414,17 +414,19 @@ static inline int ext4_inode_journal_mode(struct inode *inode) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ /* We do not support data journalling with delayed allocation */ if (!S_ISREG(inode->i_mode) || - test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) - return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ - if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC)) + test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && + !test_opt(inode->i_sb, DELALLOC))) { + /* We do not support data journalling for encrypted data */ + if (S_ISREG(inode->i_mode) && ext4_encrypted_inode(inode)) + return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ return EXT4_INODE_JOURNAL_DATA_MODE; /* journal data */ + } if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA) return EXT4_INODE_WRITEBACK_DATA_MODE; /* writeback */ - else - BUG(); + BUG(); } static inline int ext4_should_journal_data(struct inode *inode) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index c930a0110fb4..3e295d3350a9 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -37,7 +37,7 @@ #include <linux/quotaops.h> #include <linux/string.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/fiemap.h> #include <linux/backing-dev.h> #include "ext4_jbd2.h" @@ -3777,14 +3777,6 @@ out: return err; } -static void unmap_underlying_metadata_blocks(struct block_device *bdev, - sector_t block, int count) -{ - int i; - for (i = 0; i < count; i++) - unmap_underlying_metadata(bdev, block + i); -} - /* * Handle EOFBLOCKS_FL flag, clearing it if necessary */ @@ -4121,9 +4113,8 @@ out: * new. 
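 *
 * (The helper removed above did literally
 *
 *	for (i = 0; i < count; i++)
 *		unmap_underlying_metadata(bdev, block + i);
 *
 * clean_bdev_aliases() below is its batched replacement: one call
 * drops all stale block-device page cache buffers aliasing the newly
 * allocated blocks.)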
*/ if (allocated > map->m_len) { - unmap_underlying_metadata_blocks(inode->i_sb->s_bdev, - newblock + map->m_len, - allocated - map->m_len); + clean_bdev_aliases(inode->i_sb->s_bdev, newblock + map->m_len, + allocated - map->m_len); allocated = map->m_len; } map->m_len = allocated; @@ -4631,7 +4622,7 @@ out2: return err ? err : allocated; } -void ext4_ext_truncate(handle_t *handle, struct inode *inode) +int ext4_ext_truncate(handle_t *handle, struct inode *inode) { struct super_block *sb = inode->i_sb; ext4_lblk_t last_block; @@ -4645,7 +4636,9 @@ void ext4_ext_truncate(handle_t *handle, struct inode *inode) /* we have to know where to truncate from in crash case */ EXT4_I(inode)->i_disksize = inode->i_size; - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); + if (err) + return err; last_block = (inode->i_size + sb->s_blocksize - 1) >> EXT4_BLOCK_SIZE_BITS(sb); @@ -4657,12 +4650,9 @@ retry: congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - if (err) { - ext4_std_error(inode->i_sb, err); - return; - } - err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); - ext4_std_error(inode->i_sb, err); + if (err) + return err; + return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); } static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, @@ -4701,7 +4691,7 @@ retry: /* * Recalculate credits when extent tree depth changes. */ - if (depth >= 0 && depth != ext_depth(inode)) { + if (depth != ext_depth(inode)) { credits = ext4_chunk_trans_blocks(inode, len); depth = ext_depth(inode); } @@ -4725,7 +4715,7 @@ retry: map.m_lblk += ret; map.m_len = len = len - ret; epos = (loff_t)map.m_lblk << inode->i_blkbits; - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); if (new_size) { if (epos > new_size) epos = new_size; @@ -4853,7 +4843,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, } /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); @@ -4878,7 +4868,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, goto out_dio; } - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); if (new_size) { ext4_update_inode_size(inode, new_size); } else { @@ -5568,7 +5558,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) up_write(&EXT4_I(inode)->i_data_sem); if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: @@ -5678,7 +5668,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ret = ext4_mark_inode_dirty(handle, inode); if (ret) goto out_stop; diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 2a822d30e73f..d663d3d7c81c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -31,6 +31,42 @@ #include "xattr.h" #include "acl.h" +#ifdef CONFIG_FS_DAX +static ssize_t ext4_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) 
+{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + inode_lock_shared(inode); + /* + * Recheck under inode lock - at this point we are sure it cannot + * change anymore + */ + if (!IS_DAX(inode)) { + inode_unlock_shared(inode); + /* Fallback to buffered IO in case we cannot support DAX */ + return generic_file_read_iter(iocb, to); + } + ret = dax_iomap_rw(iocb, to, &ext4_iomap_ops); + inode_unlock_shared(inode); + + file_accessed(iocb->ki_filp); + return ret; +} +#endif + +static ssize_t ext4_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + if (!iov_iter_count(to)) + return 0; /* skip atime */ + +#ifdef CONFIG_FS_DAX + if (IS_DAX(file_inode(iocb->ki_filp))) + return ext4_dax_read_iter(iocb, to); +#endif + return generic_file_read_iter(iocb, to); +} + /* * Called when an inode is released. Note that this is different * from ext4_file_open: open gets called at every open, but release @@ -88,6 +124,86 @@ ext4_unaligned_aio(struct inode *inode, struct iov_iter *from, loff_t pos) return 0; } +/* Is IO overwriting allocated and initialized blocks? */ +static bool ext4_overwrite_io(struct inode *inode, loff_t pos, loff_t len) +{ + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + int err, blklen; + + if (pos + len > i_size_read(inode)) + return false; + + map.m_lblk = pos >> blkbits; + map.m_len = EXT4_MAX_BLOCKS(len, pos, blkbits); + blklen = map.m_len; + + err = ext4_map_blocks(NULL, inode, &map, 0); + /* + * 'err==len' means that all of the blocks have been preallocated, + * regardless of whether they have been initialized or not. To exclude + * unwritten extents, we need to check m_flags. + */ + return err == blklen && (map.m_flags & EXT4_MAP_MAPPED); +} + +static ssize_t ext4_write_checks(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + + ret = generic_write_checks(iocb, from); + if (ret <= 0) + return ret; + /* + * If we have encountered a bitmap-format file, the size limit + * is smaller than s_maxbytes, which is for extent-mapped files. 
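+ *
+ * e.g. a write whose ki_pos lies just below s_bitmap_maxbytes is
+ * truncated below to the bytes that still fit, while one starting at
+ * or beyond s_bitmap_maxbytes fails outright with -EFBIG.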
+ */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) + return -EFBIG; + iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); + } + return iov_iter_count(from); +} + +#ifdef CONFIG_FS_DAX +static ssize_t +ext4_dax_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct inode *inode = file_inode(iocb->ki_filp); + ssize_t ret; + bool overwrite = false; + + inode_lock(inode); + ret = ext4_write_checks(iocb, from); + if (ret <= 0) + goto out; + ret = file_remove_privs(iocb->ki_filp); + if (ret) + goto out; + ret = file_update_time(iocb->ki_filp); + if (ret) + goto out; + + if (ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) { + overwrite = true; + downgrade_write(&inode->i_rwsem); + } + ret = dax_iomap_rw(iocb, from, &ext4_iomap_ops); +out: + if (!overwrite) + inode_unlock(inode); + else + inode_unlock_shared(inode); + if (ret > 0) + ret = generic_write_sync(iocb, ret); + return ret; +} +#endif + static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { @@ -97,8 +213,13 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) int overwrite = 0; ssize_t ret; +#ifdef CONFIG_FS_DAX + if (IS_DAX(inode)) + return ext4_dax_write_iter(iocb, from); +#endif + inode_lock(inode); - ret = generic_write_checks(iocb, from); + ret = ext4_write_checks(iocb, from); if (ret <= 0) goto out; @@ -114,53 +235,11 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_unwritten_wait(inode); } - /* - * If we have encountered a bitmap-format file, the size limit - * is smaller than s_maxbytes, which is for extent-mapped files. - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - - if (iocb->ki_pos >= sbi->s_bitmap_maxbytes) { - ret = -EFBIG; - goto out; - } - iov_iter_truncate(from, sbi->s_bitmap_maxbytes - iocb->ki_pos); - } - iocb->private = &overwrite; - if (o_direct) { - size_t length = iov_iter_count(from); - loff_t pos = iocb->ki_pos; - - /* check whether we do a DIO overwrite or not */ - if (ext4_should_dioread_nolock(inode) && !unaligned_aio && - pos + length <= i_size_read(inode)) { - struct ext4_map_blocks map; - unsigned int blkbits = inode->i_blkbits; - int err, len; - - map.m_lblk = pos >> blkbits; - map.m_len = EXT4_MAX_BLOCKS(length, pos, blkbits); - len = map.m_len; - - err = ext4_map_blocks(NULL, inode, &map, 0); - /* - * 'err==len' means that all of blocks has - * been preallocated no matter they are - * initialized or not. For excluding - * unwritten extents, we need to check - * m_flags. There are two conditions that - * indicate for initialized extents. 1) If we - * hit extent cache, EXT4_MAP_MAPPED flag is - * returned; 2) If we do a real lookup, - * non-flags are returned. So we should check - * these two conditions. 
- */ - if (err == len && (map.m_flags & EXT4_MAP_MAPPED)) - overwrite = 1; - } - } + /* Check whether we do a DIO overwrite or not */ + if (o_direct && ext4_should_dioread_nolock(inode) && !unaligned_aio && + ext4_overwrite_io(inode, iocb->ki_pos, iov_iter_count(from))) + overwrite = 1; ret = __generic_file_write_iter(iocb, from); inode_unlock(inode); @@ -179,7 +258,6 @@ out: static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int result; - handle_t *handle = NULL; struct inode *inode = file_inode(vma->vm_file); struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; @@ -187,24 +265,12 @@ static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, - EXT4_DATA_TRANS_BLOCKS(sb)); - } else - down_read(&EXT4_I(inode)->i_mmap_sem); - - if (IS_ERR(handle)) - result = VM_FAULT_SIGBUS; - else - result = dax_fault(vma, vmf, ext4_dax_get_block); - - if (write) { - if (!IS_ERR(handle)) - ext4_journal_stop(handle); - up_read(&EXT4_I(inode)->i_mmap_sem); + } + down_read(&EXT4_I(inode)->i_mmap_sem); + result = dax_iomap_fault(vma, vmf, &ext4_iomap_ops); + up_read(&EXT4_I(inode)->i_mmap_sem); + if (write) sb_end_pagefault(sb); - } else - up_read(&EXT4_I(inode)->i_mmap_sem); return result; } @@ -213,7 +279,6 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, unsigned int flags) { int result; - handle_t *handle = NULL; struct inode *inode = file_inode(vma->vm_file); struct super_block *sb = inode->i_sb; bool write = flags & FAULT_FLAG_WRITE; @@ -221,26 +286,13 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); - down_read(&EXT4_I(inode)->i_mmap_sem); - handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, - ext4_chunk_trans_blocks(inode, - PMD_SIZE / PAGE_SIZE)); - } else - down_read(&EXT4_I(inode)->i_mmap_sem); - - if (IS_ERR(handle)) - result = VM_FAULT_SIGBUS; - else - result = dax_pmd_fault(vma, addr, pmd, flags, - ext4_dax_get_block); - - if (write) { - if (!IS_ERR(handle)) - ext4_journal_stop(handle); - up_read(&EXT4_I(inode)->i_mmap_sem); + } + down_read(&EXT4_I(inode)->i_mmap_sem); + result = dax_iomap_pmd_fault(vma, addr, pmd, flags, + &ext4_iomap_ops); + up_read(&EXT4_I(inode)->i_mmap_sem); + if (write) sb_end_pagefault(sb); - } else - up_read(&EXT4_I(inode)->i_mmap_sem); return result; } @@ -687,7 +739,7 @@ loff_t ext4_llseek(struct file *file, loff_t offset, int whence) const struct file_operations ext4_file_operations = { .llseek = ext4_llseek, - .read_iter = generic_file_read_iter, + .read_iter = ext4_file_read_iter, .write_iter = ext4_file_write_iter, .unlocked_ioctl = ext4_ioctl, #ifdef CONFIG_COMPAT diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 170421edfdfe..e57e8d90ea54 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1039,7 +1039,7 @@ got: /* This is the optimal IO size (for stat), not the fs block size */ inode->i_blocks = 0; inode->i_mtime = inode->i_atime = inode->i_ctime = ei->i_crtime = - ext4_current_time(inode); + current_time(inode); memset(ei->i_data, 0, sizeof(ei->i_data)); ei->i_dir_start_lookup = 0; @@ -1115,8 +1115,7 @@ got: } if (encrypt) { - /* give pointer to avoid set_context with journal ops. 
*/ - err = fscrypt_inherit_context(dir, inode, &encrypt, true); + err = fscrypt_inherit_context(dir, inode, handle, true); if (err) goto fail_free_drop; } diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index f74d5ee2cdec..437df6a1a841 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -299,6 +299,11 @@ static int ext4_create_inline_data(handle_t *handle, EXT4_I(inode)->i_inline_size = len + EXT4_MIN_INLINE_DATA_SIZE; ext4_clear_inode_flag(inode, EXT4_INODE_EXTENTS); ext4_set_inode_flag(inode, EXT4_INODE_INLINE_DATA); + /* + * Propagate changes to inode->i_flags as well - e.g. S_DAX may + * get cleared + */ + ext4_set_inode_flags(inode); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -336,8 +341,10 @@ static int ext4_update_inline_data(handle_t *handle, struct inode *inode, len -= EXT4_MIN_INLINE_DATA_SIZE; value = kzalloc(len, GFP_NOFS); - if (!value) + if (!value) { + error = -ENOMEM; goto out; + } error = ext4_xattr_ibody_get(inode, i.name_index, i.name, value, len); @@ -442,6 +449,11 @@ static int ext4_destroy_inline_data_nolock(handle_t *handle, } } ext4_clear_inode_flag(inode, EXT4_INODE_INLINE_DATA); + /* + * Propagate changes to inode->i_flags as well - e.g. S_DAX may + * get set. + */ + ext4_set_inode_flags(inode); get_bh(is.iloc.bh); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -1028,7 +1040,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); dir->i_version++; ext4_mark_inode_dirty(handle, dir); @@ -1971,7 +1983,7 @@ out: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); if (IS_SYNC(inode)) ext4_handle_sync(handle); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 9c064727ed62..88d57af1b516 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -37,6 +37,7 @@ #include <linux/printk.h> #include <linux/slab.h> #include <linux/bitops.h> +#include <linux/iomap.h> #include "ext4_jbd2.h" #include "xattr.h" @@ -71,10 +72,9 @@ static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, - EXT4_INODE_SIZE(inode->i_sb) - - offset); } + csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + EXT4_INODE_SIZE(inode->i_sb) - offset); } return csum; @@ -261,8 +261,15 @@ void ext4_evict_inode(struct inode *inode) "couldn't mark inode dirty (err %d)", err); goto stop_handle; } - if (inode->i_blocks) - ext4_truncate(inode); + if (inode->i_blocks) { + err = ext4_truncate(inode); + if (err) { + ext4_error(inode->i_sb, + "couldn't truncate inode %lu (err %d)", + inode->i_ino, err); + goto stop_handle; + } + } /* * ext4_ext_truncate() doesn't reserve any slop when it @@ -654,12 +661,8 @@ found: if (flags & EXT4_GET_BLOCKS_ZERO && map->m_flags & EXT4_MAP_MAPPED && map->m_flags & EXT4_MAP_NEW) { - ext4_lblk_t i; - - for (i = 0; i < map->m_len; i++) { - unmap_underlying_metadata(inode->i_sb->s_bdev, - map->m_pblk + i); - } + clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, + map->m_len); ret = ext4_issue_zeroout(inode, map->m_lblk, map->m_pblk, map->m_len); if (ret) { @@ -767,6 +770,9 @@ static int 
_ext4_get_block(struct inode *inode, sector_t iblock, ext4_update_bh_state(bh, map.m_flags); bh->b_size = inode->i_sb->s_blocksize * map.m_len; ret = 0; + } else if (ret == 0) { + /* hole case, need to fill in bh->b_size */ + bh->b_size = inode->i_sb->s_blocksize * map.m_len; } return ret; } @@ -1127,8 +1133,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (err) break; if (buffer_new(bh)) { - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); if (PageUptodate(page)) { clear_buffer_new(bh); set_buffer_uptodate(bh); @@ -1166,7 +1171,8 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, if (unlikely(err)) page_zero_new_buffers(page, from, to); else if (decrypt) - err = fscrypt_decrypt_page(page); + err = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); return err; } #endif @@ -2360,11 +2366,8 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) BUG_ON(map->m_len == 0); if (map->m_flags & EXT4_MAP_NEW) { - struct block_device *bdev = inode->i_sb->s_bdev; - int i; - - for (i = 0; i < map->m_len; i++) - unmap_underlying_metadata(bdev, map->m_pblk + i); + clean_bdev_aliases(inode->i_sb->s_bdev, map->m_pblk, + map->m_len); } return 0; } @@ -2891,7 +2894,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, index = pos >> PAGE_SHIFT; - if (ext4_nonda_switch(inode->i_sb)) { + if (ext4_nonda_switch(inode->i_sb) || + S_ISLNK(inode->i_mode)) { *fsdata = (void *)FALL_BACK_TO_NONDELALLOC; return ext4_write_begin(file, mapping, pos, len, flags, pagep, fsdata); @@ -3268,53 +3272,159 @@ static int ext4_releasepage(struct page *page, gfp_t wait) } #ifdef CONFIG_FS_DAX -/* - * Get block function for DAX IO and mmap faults. It takes care of converting - * unwritten extents to written ones and initializes new / converted blocks - * to zeros. - */ -int ext4_dax_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned flags, struct iomap *iomap) { + unsigned int blkbits = inode->i_blkbits; + unsigned long first_block = offset >> blkbits; + unsigned long last_block = (offset + length - 1) >> blkbits; + struct ext4_map_blocks map; int ret; - ext4_debug("inode %lu, create flag %d\n", inode->i_ino, create); - if (!create) - return _ext4_get_block(inode, iblock, bh_result, 0); + if (WARN_ON_ONCE(ext4_has_inline_data(inode))) + return -ERANGE; - ret = ext4_get_block_trans(inode, iblock, bh_result, - EXT4_GET_BLOCKS_PRE_IO | - EXT4_GET_BLOCKS_CREATE_ZERO); - if (ret < 0) - return ret; + map.m_lblk = first_block; + map.m_len = last_block - first_block + 1; - if (buffer_unwritten(bh_result)) { + if (!(flags & IOMAP_WRITE)) { + ret = ext4_map_blocks(NULL, inode, &map, 0); + } else { + int dio_credits; + handle_t *handle; + int retries = 0; + + /* Trim mapping request to maximum we can map at once for DIO */ + if (map.m_len > DIO_MAX_BLOCKS) + map.m_len = DIO_MAX_BLOCKS; + dio_credits = ext4_chunk_trans_blocks(inode, map.m_len); +retry: /* - * We are protected by i_mmap_sem or i_mutex so we know block - * cannot go away from under us even though we dropped - * i_data_sem. Convert extent to written and write zeros there. 
+ * Either we allocate blocks and then we don't get an unwritten + * extent so we have reserved enough credits, or the blocks + * are already allocated and unwritten and in that case + * extent conversion fits in the credits as well. + */ + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, + dio_credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + ret = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_CREATE_ZERO); + if (ret < 0) { + ext4_journal_stop(handle); + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; return ret; + } + + /* + * If we added blocks beyond i_size, we need to make sure they + * will get truncated if we crash before updating i_size in + * ext4_iomap_end(). For faults we don't need to do that (and + * even cannot because for orphan list operations inode_lock is + * required) - if we happen to instantiate a block beyond i_size, + * it is because we race with truncate which has already added + * the inode to the orphan list. + */ + if (!(flags & IOMAP_FAULT) && first_block + map.m_len > + (i_size_read(inode) + (1 << blkbits) - 1) >> blkbits) { + int err; + + err = ext4_orphan_add(handle, inode); + if (err < 0) { + ext4_journal_stop(handle); + return err; + } + } + ext4_journal_stop(handle); } - /* - * At least for now we have to clear BH_New so that DAX code - * doesn't attempt to zero blocks again in a racy way. - */ - clear_buffer_new(bh_result); + + iomap->flags = 0; + iomap->bdev = inode->i_sb->s_bdev; + iomap->offset = first_block << blkbits; + + if (ret == 0) { + iomap->type = IOMAP_HOLE; + iomap->blkno = IOMAP_NULL_BLOCK; + iomap->length = (u64)map.m_len << blkbits; + } else { + if (map.m_flags & EXT4_MAP_MAPPED) { + iomap->type = IOMAP_MAPPED; + } else if (map.m_flags & EXT4_MAP_UNWRITTEN) { + iomap->type = IOMAP_UNWRITTEN; + } else { + WARN_ON_ONCE(1); + return -EIO; + } + iomap->blkno = (sector_t)map.m_pblk << (blkbits - 9); + iomap->length = (u64)map.m_len << blkbits; + } + + if (map.m_flags & EXT4_MAP_NEW) + iomap->flags |= IOMAP_F_NEW; return 0; } -#else -/* Just define empty function, it will never get called. */ -int ext4_dax_get_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) + +static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, + ssize_t written, unsigned flags, struct iomap *iomap) { - BUG(); - return 0; + int ret = 0; + handle_t *handle; + int blkbits = inode->i_blkbits; + bool truncate = false; + + if (!(flags & IOMAP_WRITE) || (flags & IOMAP_FAULT)) + return 0; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto orphan_del; + } + if (ext4_update_inode_size(inode, offset + written)) + ext4_mark_inode_dirty(handle, inode); + /* + * We may need to truncate allocated but not written blocks beyond EOF. + */ + if (iomap->offset + iomap->length > + ALIGN(inode->i_size, 1 << blkbits)) { + ext4_lblk_t written_blk, end_blk; + + written_blk = (offset + written) >> blkbits; + end_blk = (offset + length) >> blkbits; + if (written_blk < end_blk && ext4_can_truncate(inode)) + truncate = true; + } + /* + * Remove inode from orphan list if we were extending an inode and + * everything went fine.
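+ *
+ * (This pairs with the ext4_orphan_add() done in ext4_iomap_begin()
+ * when a write may extend i_size: once the new i_size has been
+ * recorded above, crash recovery no longer needs the orphan entry.)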
+ */ + if (!truncate && inode->i_nlink && + !list_empty(&EXT4_I(inode)->i_orphan)) + ext4_orphan_del(handle, inode); + ext4_journal_stop(handle); + if (truncate) { + ext4_truncate_failed_write(inode); +orphan_del: + /* + * If truncate failed early the inode might still be on the + * orphan list; we need to make sure the inode is removed from + * the orphan list in that case. + */ + if (inode->i_nlink) + ext4_orphan_del(NULL, inode); + } + return ret; } + +struct iomap_ops ext4_iomap_ops = { + .iomap_begin = ext4_iomap_begin, + .iomap_end = ext4_iomap_end, +}; + #endif static int ext4_end_io_dio(struct kiocb *iocb, loff_t offset, @@ -3436,19 +3546,7 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) iocb->private = NULL; if (overwrite) get_block_func = ext4_dio_get_block_overwrite; - else if (IS_DAX(inode)) { - /* - * We can avoid zeroing for aligned DAX writes beyond EOF. Other - * writes need zeroing either because they can race with page - * faults or because they use partial blocks. - */ - if (round_down(offset, 1<<inode->i_blkbits) >= inode->i_size && - ext4_aligned_io(inode, offset, count)) - get_block_func = ext4_dio_get_block; - else - get_block_func = ext4_dax_get_block; - dio_flags = DIO_LOCKING; - } else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || + else if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) || round_down(offset, 1 << inode->i_blkbits) >= inode->i_size) { get_block_func = ext4_dio_get_block; dio_flags = DIO_LOCKING | DIO_SKIP_HOLES; @@ -3462,14 +3560,9 @@ static ssize_t ext4_direct_IO_write(struct kiocb *iocb, struct iov_iter *iter) #ifdef CONFIG_EXT4_FS_ENCRYPTION BUG_ON(ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode)); #endif - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, iter, get_block_func, - ext4_end_io_dio, dio_flags); - } else - ret = __blockdev_direct_IO(iocb, inode, - inode->i_sb->s_bdev, iter, - get_block_func, - ext4_end_io_dio, NULL, dio_flags); + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, iter, + get_block_func, ext4_end_io_dio, NULL, + dio_flags); if (ret > 0 && !overwrite && ext4_test_inode_state(inode, EXT4_STATE_DIO_UNWRITTEN)) { @@ -3538,6 +3631,7 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) { struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; + size_t count = iov_iter_count(iter); ssize_t ret; /* @@ -3546,19 +3640,12 @@ static ssize_t ext4_direct_IO_read(struct kiocb *iocb, struct iov_iter *iter) * we are protected against page writeback as well. 
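
With ext4_iomap_ops registered, generic helpers such as iomap_zero_range() drive the filesystem through a begin/process/end loop, one extent at a time. The following is a simplified model of that calling pattern, not the kernel's iomap_apply() implementation; toy_begin and toy_end are invented stand-ins for the two ops:

#include <stdint.h>
#include <stdio.h>

struct iomap_sketch {
	uint64_t offset;	/* start of the extent, bytes */
	uint64_t length;	/* extent size, bytes */
	int type;		/* hole, mapped, unwritten, ... */
};

/* Toy filesystem: pretend every extent is 8KiB long and mapped. */
static int toy_begin(uint64_t pos, uint64_t len, struct iomap_sketch *im)
{
	im->offset = pos;
	im->length = 8192;
	im->type = 1;
	return 0;
}

static int toy_end(uint64_t pos, uint64_t len, uint64_t processed)
{
	return 0;	/* ext4 uses this hook to trim over-allocations */
}

/* The begin/process/end pattern the generic iomap helpers follow. */
static int apply_sketch(uint64_t pos, uint64_t length)
{
	while (length > 0) {
		struct iomap_sketch im;
		uint64_t chunk;
		int ret = toy_begin(pos, length, &im);

		if (ret)
			return ret;
		chunk = im.length < length ? im.length : length;
		printf("process [%llu, %llu)\n",
		       (unsigned long long)pos,
		       (unsigned long long)(pos + chunk));
		ret = toy_end(pos, chunk, chunk);
		if (ret)
			return ret;
		pos += chunk;
		length -= chunk;
	}
	return 0;
}

int main(void)
{
	return apply_sketch(0, 20000);	/* three extents: 8K + 8K + ~4K */
}
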
*/ inode_lock_shared(inode); - if (IS_DAX(inode)) { - ret = dax_do_io(iocb, inode, iter, ext4_dio_get_block, NULL, 0); - } else { - size_t count = iov_iter_count(iter); - - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, - iocb->ki_pos + count); - if (ret) - goto out_unlock; - ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, - iter, ext4_dio_get_block, - NULL, NULL, 0); - } + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, + iocb->ki_pos + count); + if (ret) + goto out_unlock; + ret = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev, + iter, ext4_dio_get_block, NULL, NULL, 0); out_unlock: inode_unlock_shared(inode); return ret; @@ -3587,6 +3674,10 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (ext4_has_inline_data(inode)) return 0; + /* DAX uses iomap path now */ + if (WARN_ON_ONCE(IS_DAX(inode))) + return 0; + trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); if (iov_iter_rw(iter) == READ) ret = ext4_direct_IO_read(iocb, iter); @@ -3615,6 +3706,13 @@ static int ext4_journalled_set_page_dirty(struct page *page) return __set_page_dirty_nobuffers(page); } +static int ext4_set_page_dirty(struct page *page) +{ + WARN_ON_ONCE(!PageLocked(page) && !PageDirty(page)); + WARN_ON_ONCE(!page_has_buffers(page)); + return __set_page_dirty_buffers(page); +} + static const struct address_space_operations ext4_aops = { .readpage = ext4_readpage, .readpages = ext4_readpages, @@ -3622,6 +3720,7 @@ static const struct address_space_operations ext4_aops = { .writepages = ext4_writepages, .write_begin = ext4_write_begin, .write_end = ext4_write_end, + .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, .invalidatepage = ext4_invalidatepage, .releasepage = ext4_releasepage, @@ -3654,6 +3753,7 @@ static const struct address_space_operations ext4_da_aops = { .writepages = ext4_writepages, .write_begin = ext4_da_write_begin, .write_end = ext4_da_write_end, + .set_page_dirty = ext4_set_page_dirty, .bmap = ext4_bmap, .invalidatepage = ext4_da_invalidatepage, .releasepage = ext4_releasepage, @@ -3743,7 +3843,8 @@ static int __ext4_block_zero_page_range(handle_t *handle, /* We expect the key to be set. */ BUG_ON(!fscrypt_has_encryption_key(inode)); BUG_ON(blocksize != PAGE_SIZE); - WARN_ON_ONCE(fscrypt_decrypt_page(page)); + WARN_ON_ONCE(fscrypt_decrypt_page(page->mapping->host, + page, PAGE_SIZE, 0, page->index)); } } if (ext4_should_journal_data(inode)) { @@ -3792,8 +3893,10 @@ static int ext4_block_zero_page_range(handle_t *handle, if (length > max || length < 0) length = max; - if (IS_DAX(inode)) - return dax_zero_page_range(inode, from, length, ext4_get_block); + if (IS_DAX(inode)) { + return iomap_zero_range(inode, from, length, NULL, + &ext4_iomap_ops); + } return __ext4_block_zero_page_range(handle, mapping, from, length); } @@ -4026,7 +4129,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: ext4_journal_stop(handle); @@ -4091,10 +4194,11 @@ int ext4_inode_attach_jinode(struct inode *inode) * that's fine - as long as they are linked from the inode, the post-crash * ext4_truncate() run will find them and release them. 
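
ext4_block_zero_page_range() above (now routed through iomap_zero_range() for DAX inodes) clamps the zeroing length so it never crosses the end of the starting block, which is what the "length > max" test implements. The clamp in isolation, assuming a power-of-two block size and max computed as the remainder of the starting block, as in the kernel:

#include <stdint.h>
#include <stdio.h>

/*
 * Mirror of the length clamp in ext4_block_zero_page_range(): 'from' is a
 * byte offset in the file, and zeroing must stay within its block.
 */
static uint64_t clamp_to_block(uint64_t from, int64_t length,
			       unsigned int blocksize)
{
	uint64_t offset_in_block = from & (blocksize - 1);
	uint64_t max = blocksize - offset_in_block;

	if (length > (int64_t)max || length < 0)
		return max;
	return (uint64_t)length;
}

int main(void)
{
	/* 1000 bytes into a 4096-byte block leaves 3096 zeroable bytes. */
	printf("%llu\n", (unsigned long long)clamp_to_block(5096, 8192, 4096));
	return 0;
}
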
*/ -void ext4_truncate(struct inode *inode) +int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; + int err = 0; handle_t *handle; struct address_space *mapping = inode->i_mapping; @@ -4108,7 +4212,7 @@ void ext4_truncate(struct inode *inode) trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) - return; + return 0; ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS); @@ -4120,13 +4224,13 @@ void ext4_truncate(struct inode *inode) ext4_inline_data_truncate(inode, &has_inline); if (has_inline) - return; + return 0; } /* If we zero-out tail of the page, we have to create jinode for jbd2 */ if (inode->i_size & (inode->i_sb->s_blocksize - 1)) { if (ext4_inode_attach_jinode(inode) < 0) - return; + return 0; } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -4135,10 +4239,8 @@ void ext4_truncate(struct inode *inode) credits = ext4_blocks_for_truncate(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ext4_std_error(inode->i_sb, PTR_ERR(handle)); - return; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); if (inode->i_size & (inode->i_sb->s_blocksize - 1)) ext4_block_truncate_page(handle, mapping, inode->i_size); @@ -4152,7 +4254,8 @@ void ext4_truncate(struct inode *inode) * Implication: the file must always be in a sane, consistent * truncatable state while each transaction commits. */ - if (ext4_orphan_add(handle, inode)) + err = ext4_orphan_add(handle, inode); + if (err) goto out_stop; down_write(&EXT4_I(inode)->i_data_sem); @@ -4160,11 +4263,13 @@ void ext4_truncate(struct inode *inode) ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ext4_ext_truncate(handle, inode); + err = ext4_ext_truncate(handle, inode); else ext4_ind_truncate(handle, inode); up_write(&ei->i_data_sem); + if (err) + goto out_stop; if (IS_SYNC(inode)) ext4_handle_sync(handle); @@ -4180,11 +4285,12 @@ out_stop: if (inode->i_nlink) ext4_orphan_del(handle, inode); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); trace_ext4_truncate_exit(inode); + return err; } /* @@ -4352,7 +4458,9 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; - if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode)) + if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode) && + !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) && + !ext4_encrypted_inode(inode)) new_fl |= S_DAX; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); @@ -4411,7 +4519,9 @@ static inline void ext4_iget_extra_inode(struct inode *inode, { __le32 *magic = (void *)raw_inode + EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize; - if (*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { + if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <= + EXT4_INODE_SIZE(inode->i_sb) && + *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { ext4_set_inode_state(inode, EXT4_STATE_XATTR); ext4_find_inline_data_nolock(inode); } else @@ -4434,6 +4544,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) struct inode *inode; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; + loff_t size; int block; uid_t i_uid; gid_t i_gid; @@ -4456,10 +4567,12 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) 
{ ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize); if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize > - EXT4_INODE_SIZE(inode->i_sb)) { - EXT4_ERROR_INODE(inode, "bad extra_isize (%u != %u)", - EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize, - EXT4_INODE_SIZE(inode->i_sb)); + EXT4_INODE_SIZE(inode->i_sb) || + (ei->i_extra_isize & 3)) { + EXT4_ERROR_INODE(inode, + "bad extra_isize %u (inode size %u)", + ei->i_extra_isize, + EXT4_INODE_SIZE(inode->i_sb)); ret = -EFSCORRUPTED; goto bad_inode; } @@ -4534,6 +4647,11 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(raw_inode); + if ((size = i_size_read(inode)) < 0) { + EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); + ret = -EFSCORRUPTED; + goto bad_inode; + } ei->i_disksize = inode->i_size; #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; @@ -4577,6 +4695,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { if (ei->i_extra_isize == 0) { /* The extra space is currently unused. Use it. */ + BUILD_BUG_ON(sizeof(struct ext4_inode) & 3); ei->i_extra_isize = sizeof(struct ext4_inode) - EXT4_GOOD_OLD_INODE_SIZE; } else { @@ -5154,7 +5273,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * update c/mtime in shrink case below */ if (!shrink) { - inode->i_mtime = ext4_current_time(inode); + inode->i_mtime = current_time(inode); inode->i_ctime = inode->i_mtime; } down_write(&EXT4_I(inode)->i_data_sem); @@ -5199,12 +5318,15 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * in data=journal mode to make pages freeable. */ truncate_pagecache(inode, inode->i_size); - if (shrink) - ext4_truncate(inode); + if (shrink) { + rc = ext4_truncate(inode); + if (rc) + error = rc; + } up_write(&EXT4_I(inode)->i_mmap_sem); } - if (!rc) { + if (!error) { setattr_copy(inode, attr); mark_inode_dirty(inode); } @@ -5216,7 +5338,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) if (orphan && inode->i_nlink) ext4_orphan_del(NULL, inode); - if (!rc && (ia_valid & ATTR_MODE)) + if (!error && (ia_valid & ATTR_MODE)) rc = posix_acl_chmod(inode, inode->i_mode); err_out: @@ -5455,18 +5577,20 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) return err; - if (ext4_handle_valid(handle) && - EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && + if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize && !ext4_test_inode_state(inode, EXT4_STATE_NO_EXPAND)) { /* - * We need extra buffer credits since we may write into EA block + * In nojournal mode, we can immediately attempt to expand + * the inode. When journaled, we first need to obtain extra + * buffer credits since we may write into the EA block * with this same handle. If journal_extend fails, then it will * only result in a minor loss of functionality for that inode. * If this is felt to be critical, then e2fsck should be run to * force a large enough s_min_extra_isize. 
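
The tightened ext4_iget() check earlier in this hunk rejects an i_extra_isize that either overflows the on-disk inode or is not 4-byte aligned; the new BUILD_BUG_ON documents the same alignment assumption for struct ext4_inode itself. The validation as a standalone predicate, with constants inlined (128 is EXT4_GOOD_OLD_INODE_SIZE):

#include <stdbool.h>
#include <stdio.h>

#define GOOD_OLD_INODE_SIZE 128	/* EXT4_GOOD_OLD_INODE_SIZE */

/* Mirror of the ext4_iget() sanity check on i_extra_isize. */
static bool extra_isize_valid(unsigned int extra_isize,
			      unsigned int inode_size)
{
	if (GOOD_OLD_INODE_SIZE + extra_isize > inode_size)
		return false;		/* overflows the on-disk inode */
	if (extra_isize & 3)
		return false;		/* xattrs after it need 4-byte alignment */
	return true;
}

int main(void)
{
	printf("%d\n", extra_isize_valid(32, 256));	/* 1: fits, aligned */
	printf("%d\n", extra_isize_valid(30, 256));	/* 0: misaligned */
	printf("%d\n", extra_isize_valid(160, 256));	/* 0: 128+160 > 256 */
	return 0;
}
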
*/ - if ((jbd2_journal_extend(handle, - EXT4_DATA_TRANS_BLOCKS(inode->i_sb))) == 0) { + if (!ext4_handle_valid(handle) || + jbd2_journal_extend(handle, + EXT4_DATA_TRANS_BLOCKS(inode->i_sb)) == 0) { ret = ext4_expand_extra_isize(inode, sbi->s_want_extra_isize, iloc, handle); @@ -5620,6 +5744,11 @@ int ext4_change_inode_journal_flag(struct inode *inode, int val) ext4_clear_inode_flag(inode, EXT4_INODE_JOURNAL_DATA); } ext4_set_aops(inode); + /* + * Update inode->i_flags after EXT4_INODE_JOURNAL_DATA was updated. + * E.g. S_DAX may get cleared / set. + */ + ext4_set_inode_flags(inode); jbd2_journal_unlock_updates(journal); percpu_up_write(&sbi->s_journal_flag_rwsem); diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bf5ae8ebbc97..d534399cf607 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -15,7 +15,7 @@ #include <linux/file.h> #include <linux/quotaops.h> #include <linux/uuid.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -153,7 +153,7 @@ static long swap_inode_boot_loader(struct super_block *sb, swap_inode_data(inode, inode_bl); - inode->i_ctime = inode_bl->i_ctime = ext4_current_time(inode); + inode->i_ctime = inode_bl->i_ctime = current_time(inode); spin_lock(&sbi->s_next_gen_lock); inode->i_generation = sbi->s_next_generation++; @@ -191,6 +191,7 @@ journal_err_out: return err; } +#ifdef CONFIG_EXT4_FS_ENCRYPTION static int uuid_is_zero(__u8 u[16]) { int i; @@ -200,6 +201,7 @@ static int uuid_is_zero(__u8 u[16]) return 0; return 1; } +#endif static int ext4_ioctl_setflags(struct inode *inode, unsigned int flags) @@ -248,8 +250,11 @@ static int ext4_ioctl_setflags(struct inode *inode, err = -EOPNOTSUPP; goto flags_out; } - } else if (oldflags & EXT4_EOFBLOCKS_FL) - ext4_truncate(inode); + } else if (oldflags & EXT4_EOFBLOCKS_FL) { + err = ext4_truncate(inode); + if (err) + goto flags_out; + } handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { @@ -265,6 +270,9 @@ static int ext4_ioctl_setflags(struct inode *inode, for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { if (!(mask & EXT4_FL_USER_MODIFIABLE)) continue; + /* These flags get special treatment later */ + if (mask == EXT4_JOURNAL_DATA_FL || mask == EXT4_EXTENTS_FL) + continue; if (mask & flags) ext4_set_inode_flag(inode, i); else @@ -272,7 +280,7 @@ static int ext4_ioctl_setflags(struct inode *inode, } ext4_set_inode_flags(inode); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); err = ext4_mark_iloc_dirty(handle, inode, &iloc); flags_err: @@ -368,7 +376,7 @@ static int ext4_ioctl_setproject(struct file *filp, __u32 projid) } EXT4_I(inode)->i_projid = kprojid; - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); out_dirty: rc = ext4_mark_iloc_dirty(handle, inode, &iloc); if (!err) @@ -409,6 +417,10 @@ static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) return xflags; } +#define EXT4_SUPPORTED_FS_XFLAGS (FS_XFLAG_SYNC | FS_XFLAG_IMMUTABLE | \ + FS_XFLAG_APPEND | FS_XFLAG_NODUMP | \ + FS_XFLAG_NOATIME | FS_XFLAG_PROJINHERIT) + /* Transfer xflags flags to internal */ static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) { @@ -453,12 +465,22 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (get_user(flags, (int __user *) arg)) return -EFAULT; + if (flags & ~EXT4_FL_USER_VISIBLE) + return -EOPNOTSUPP; + /* + * chattr(1) grabs flags via GETFLAGS, modifies the result and + * passes that to SETFLAGS. 
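
S_DAX is no longer implied by the dax mount option alone: per the ext4_set_inode_flags() hunk above, journalled data, inline data, and encryption each veto it, and ext4_change_inode_journal_flag() now re-runs ext4_set_inode_flags() so toggling data journalling updates S_DAX coherently. The eligibility test as a plain predicate; struct inode_state is an illustrative stand-in for the kernel-side checks named in the comments:

#include <stdbool.h>
#include <stdio.h>

struct inode_state {
	bool mount_dax;		/* test_opt(sb, DAX) */
	bool is_reg;		/* S_ISREG(inode->i_mode) */
	bool journal_data;	/* ext4_should_journal_data() */
	bool inline_data;	/* ext4_has_inline_data() */
	bool encrypted;		/* ext4_encrypted_inode() */
};

/* Mirror of the new S_DAX condition in ext4_set_inode_flags(). */
static bool wants_dax(const struct inode_state *st)
{
	return st->mount_dax && st->is_reg &&
	       !st->journal_data && !st->inline_data && !st->encrypted;
}

int main(void)
{
	struct inode_state st = { .mount_dax = true, .is_reg = true };

	printf("%d\n", wants_dax(&st));		/* 1 */
	st.journal_data = true;			/* chattr +j equivalent */
	printf("%d\n", wants_dax(&st));		/* 0: DAX dropped */
	return 0;
}
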
So we cannot easily make SETFLAGS + * more restrictive than just silently masking off visible but + * not settable flags as we always did. + */ + flags &= EXT4_FL_USER_MODIFIABLE; + if (ext4_mask_flags(inode->i_mode, flags) != flags) + return -EOPNOTSUPP; + err = mnt_want_write_file(filp); if (err) return err; - flags = ext4_mask_flags(inode->i_mode, flags); - inode_lock(inode); err = ext4_ioctl_setflags(inode, flags); inode_unlock(inode); @@ -500,7 +522,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } err = ext4_reserve_inode_write(handle, inode, &iloc); if (err == 0) { - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); inode->i_generation = generation; err = ext4_mark_iloc_dirty(handle, inode, &iloc); } @@ -765,28 +787,19 @@ resizefs_out: } case EXT4_IOC_PRECACHE_EXTENTS: return ext4_ext_precache(inode); - case EXT4_IOC_SET_ENCRYPTION_POLICY: { -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct fscrypt_policy policy; + case EXT4_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); - if (copy_from_user(&policy, - (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - return fscrypt_process_policy(filp, &policy); -#else - return -EOPNOTSUPP; -#endif - } case EXT4_IOC_GET_ENCRYPTION_PWSALT: { +#ifdef CONFIG_EXT4_FS_ENCRYPTION int err, err2; struct ext4_sb_info *sbi = EXT4_SB(sb); handle_t *handle; - if (!ext4_sb_has_crypto(sb)) + if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; if (uuid_is_zero(sbi->s_es->s_encrypt_pw_salt)) { err = mnt_want_write_file(filp); @@ -816,24 +829,13 @@ resizefs_out: sbi->s_es->s_encrypt_pw_salt, 16)) return -EFAULT; return 0; - } - case EXT4_IOC_GET_ENCRYPTION_POLICY: { -#ifdef CONFIG_EXT4_FS_ENCRYPTION - struct fscrypt_policy policy; - int err = 0; - - if (!ext4_encrypted_inode(inode)) - return -ENOENT; - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - if (copy_to_user((void __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; #else return -EOPNOTSUPP; #endif } + case EXT4_IOC_GET_ENCRYPTION_POLICY: + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); + case EXT4_IOC_FSGETXATTR: { struct fsxattr fa; @@ -865,13 +867,17 @@ resizefs_out: if (!inode_owner_or_capable(inode)) return -EACCES; + if (fa.fsx_xflags & ~EXT4_SUPPORTED_FS_XFLAGS) + return -EOPNOTSUPP; + + flags = ext4_xflags_to_iflags(fa.fsx_xflags); + if (ext4_mask_flags(inode->i_mode, flags) != flags) + return -EOPNOTSUPP; + err = mnt_want_write_file(filp); if (err) return err; - flags = ext4_xflags_to_iflags(fa.fsx_xflags); - flags = ext4_mask_flags(inode->i_mode, flags); - inode_lock(inode); flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | (flags & EXT4_FL_XFLAG_VISIBLE); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f418f55c2bbe..7ae43c59bc79 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -669,7 +669,7 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, ext4_grpblk_t min; ext4_grpblk_t max; ext4_grpblk_t chunk; - unsigned short border; + unsigned int border; BUG_ON(len > EXT4_CLUSTERS_PER_GROUP(sb)); @@ -2287,7 +2287,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) struct ext4_group_info *grinfo; struct sg { struct ext4_group_info info; - ext4_grpblk_t counters[16]; + ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; } sg; group--; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index d89754ef1aab..eb9835638680 100644 --- 
a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -35,7 +35,7 @@ static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) } /* - * Write the MMP block using WRITE_SYNC to try to get the block on-disk + * Write the MMP block using REQ_SYNC to try to get the block on-disk * faster. */ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) @@ -52,7 +52,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) lock_buffer(bh); bh->b_end_io = end_buffer_write_sync; get_bh(bh); - submit_bh(REQ_OP_WRITE, WRITE_SYNC | REQ_META | REQ_PRIO, bh); + submit_bh(REQ_OP_WRITE, REQ_SYNC | REQ_META | REQ_PRIO, bh); wait_on_buffer(bh); sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) @@ -88,7 +88,7 @@ static int read_mmp_block(struct super_block *sb, struct buffer_head **bh, get_bh(*bh); lock_buffer(*bh); (*bh)->b_end_io = end_buffer_read_sync; - submit_bh(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, *bh); + submit_bh(REQ_OP_READ, REQ_META | REQ_PRIO, *bh); wait_on_buffer(*bh); if (!buffer_uptodate(*bh)) { ret = -EIO; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 104f8bfba718..eadba919f26b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1941,7 +1941,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * happen is that the times are slightly out of date * and/or different from the directory change time. */ - dir->i_mtime = dir->i_ctime = ext4_current_time(dir); + dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); dir->i_version++; ext4_mark_inode_dirty(handle, dir); @@ -2987,7 +2987,7 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) * recovery. */ inode->i_size = 0; ext4_orphan_add(handle, inode); - inode->i_ctime = dir->i_ctime = dir->i_mtime = ext4_current_time(inode); + inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_dec_count(handle, dir); ext4_update_dx_flag(dir); @@ -3050,13 +3050,13 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) retval = ext4_delete_entry(handle, dir, de, bh); if (retval) goto end_unlink; - dir->i_ctime = dir->i_mtime = ext4_current_time(dir); + dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); ext4_mark_inode_dirty(handle, dir); drop_nlink(inode); if (!inode->i_nlink) ext4_orphan_add(handle, inode); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); end_unlink: @@ -3254,7 +3254,7 @@ retry: if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); ext4_inc_count(handle, inode); ihold(inode); @@ -3381,7 +3381,7 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, ent->de->file_type = file_type; ent->dir->i_version++; ent->dir->i_ctime = ent->dir->i_mtime = - ext4_current_time(ent->dir); + current_time(ent->dir); ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { @@ -3651,7 +3651,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. 
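
The ext4_current_time() to current_time() conversions running through these hunks are not purely cosmetic: current_time() truncates the timestamp to the superblock's s_time_gran. A small model of that truncation, following the kernel's timespec_trunc() rules as understood here (treat the exact rules as an assumption):

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000L

struct ts { int64_t tv_sec; long tv_nsec; };

/*
 * Model of why current_time(inode) can differ from a raw clock read:
 * the result is rounded down to the filesystem's timestamp granularity.
 */
static struct ts trunc_to_gran(struct ts t, unsigned long gran)
{
	if (gran == NSEC_PER_SEC)
		t.tv_nsec = 0;			/* 1-second granularity */
	else if (gran > 1)
		t.tv_nsec -= t.tv_nsec % gran;
	return t;				/* gran == 1: keep full resolution */
}

int main(void)
{
	struct ts t = { 1480000000, 123456789 };

	printf("%ld\n", trunc_to_gran(t, 1).tv_nsec);		/* 123456789 */
	printf("%ld\n", trunc_to_gran(t, NSEC_PER_SEC).tv_nsec);	/* 0 */
	return 0;
}
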
*/ - old.inode->i_ctime = ext4_current_time(old.inode); + old.inode->i_ctime = current_time(old.inode); ext4_mark_inode_dirty(handle, old.inode); if (!whiteout) { @@ -3663,9 +3663,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (new.inode) { ext4_dec_count(handle, new.inode); - new.inode->i_ctime = ext4_current_time(new.inode); + new.inode->i_ctime = current_time(new.inode); } - old.dir->i_ctime = old.dir->i_mtime = ext4_current_time(old.dir); + old.dir->i_ctime = old.dir->i_mtime = current_time(old.dir); ext4_update_dx_flag(old.dir); if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); @@ -3723,6 +3723,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, }; u8 new_file_type; int retval; + struct timespec ctime; if ((ext4_encrypted_inode(old_dir) || ext4_encrypted_inode(new_dir)) && @@ -3823,8 +3824,9 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, * Like most other Unix systems, set the ctime for inodes on a * rename. */ - old.inode->i_ctime = ext4_current_time(old.inode); - new.inode->i_ctime = ext4_current_time(new.inode); + ctime = current_time(old.inode); + old.inode->i_ctime = ctime; + new.inode->i_ctime = ctime; ext4_mark_inode_dirty(handle, old.inode); ext4_mark_inode_dirty(handle, new.inode); diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 0094923e5ebf..d83b0f3c5fe9 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -340,7 +340,7 @@ void ext4_io_submit(struct ext4_io_submit *io) if (bio) { int io_op_flags = io->io_wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : 0; + REQ_SYNC : 0; bio_set_op_attrs(io->io_bio, REQ_OP_WRITE, io_op_flags); submit_bio(io->io_bio); } @@ -457,7 +457,7 @@ int ext4_bio_write_page(struct ext4_io_submit *io, } if (buffer_new(bh)) { clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); } set_buffer_async_write(bh); nr_to_submit++; @@ -470,7 +470,8 @@ int ext4_bio_write_page(struct ext4_io_submit *io, gfp_t gfp_flags = GFP_NOFS; retry_encrypt: - data_page = fscrypt_encrypt_page(inode, page, gfp_flags); + data_page = fscrypt_encrypt_page(inode, page, PAGE_SIZE, 0, + page->index, gfp_flags); if (IS_ERR(data_page)) { ret = PTR_ERR(data_page); if (ret == -ENOMEM && wbc->sync_mode == WB_SYNC_ALL) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 52b0530c5d65..66845a08a87a 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -38,7 +38,7 @@ #include <linux/log2.h> #include <linux/crc16.h> #include <linux/cleancache.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/kthread.h> #include <linux/freezer.h> @@ -863,7 +863,6 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); percpu_free_rwsem(&sbi->s_journal_flag_rwsem); - brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); @@ -895,6 +894,7 @@ static void ext4_put_super(struct super_block *sb) } if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); + brelse(sbi->s_sbh); sb->s_fs_info = NULL; /* * Now that we are completely done shutting down the @@ -1114,37 +1114,55 @@ static int ext4_prepare_context(struct inode *inode) static int ext4_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { - handle_t *handle; - int res, res2; + handle_t *handle = fs_data; + int res, res2, retries = 0; + + /* + * If a journal handle was specified, 
then the encryption context is + * being set on a new inode via inheritance and is part of a larger + * transaction to create the inode. Otherwise the encryption context is + * being set on an existing inode in its own transaction. Only in the + * latter case should the "retry on ENOSPC" logic be used. + */ - /* fs_data is null when internally used. */ - if (fs_data) { - res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, - len, 0); + if (handle) { + res = ext4_xattr_set_handle(handle, inode, + EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); + /* + * Update inode->i_flags - e.g. S_DAX may get disabled + */ + ext4_set_inode_flags(inode); } return res; } +retry: handle = ext4_journal_start(inode, EXT4_HT_MISC, ext4_jbd2_credits_xattr(inode)); if (IS_ERR(handle)) return PTR_ERR(handle); - res = ext4_xattr_set(inode, EXT4_XATTR_INDEX_ENCRYPTION, - EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, ctx, - len, 0); + res = ext4_xattr_set_handle(handle, inode, EXT4_XATTR_INDEX_ENCRYPTION, + EXT4_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); if (!res) { ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT); + /* Update inode->i_flags - e.g. S_DAX may get disabled */ + ext4_set_inode_flags(inode); res = ext4_mark_inode_dirty(handle, inode); if (res) EXT4_ERROR_INODE(inode, "Failed to mark inode dirty"); } res2 = ext4_journal_stop(handle); + + if (res == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry; if (!res) res = res2; return res; @@ -1187,7 +1205,7 @@ static int ext4_release_dquot(struct dquot *dquot); static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, - struct path *path); + const struct path *path); static int ext4_quota_off(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, @@ -1883,12 +1901,6 @@ static int parse_options(char *options, struct super_block *sb, return 0; } } - if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && - test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit " - "in data=ordered mode"); - return 0; - } return 1; } @@ -2330,7 +2342,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, struct ext4_super_block *es) { unsigned int s_flags = sb->s_flags; - int nr_orphans = 0, nr_truncates = 0; + int ret, nr_orphans = 0, nr_truncates = 0; #ifdef CONFIG_QUOTA int i; #endif @@ -2412,7 +2424,9 @@ static void ext4_orphan_cleanup(struct super_block *sb, inode->i_ino, inode->i_size); inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); - ext4_truncate(inode); + ret = ext4_truncate(inode); + if (ret) + ext4_std_error(inode->i_sb, ret); inode_unlock(inode); nr_truncates++; } else { @@ -3193,10 +3207,15 @@ static int count_overhead(struct super_block *sb, ext4_group_t grp, ext4_set_bit(s++, buf); count++; } - for (j = ext4_bg_num_gdb(sb, grp); j > 0; j--) { - ext4_set_bit(EXT4_B2C(sbi, s++), buf); - count++; + j = ext4_bg_num_gdb(sb, grp); + if (s + j > EXT4_BLOCKS_PER_GROUP(sb)) { + ext4_error(sb, "Invalid number of block group " + "descriptor blocks: %d", j); + j = EXT4_BLOCKS_PER_GROUP(sb) - s; } + count += j; + for (; j > 0; j--) + 
ext4_set_bit(EXT4_B2C(sbi, s++), buf); } if (!count) return 0; @@ -3301,7 +3320,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) char *orig_data = kstrdup(data, GFP_KERNEL); struct buffer_head *bh; struct ext4_super_block *es = NULL; - struct ext4_sb_info *sbi; + struct ext4_sb_info *sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); ext4_fsblk_t block; ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; @@ -3320,16 +3339,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ext4_group_t first_not_zeroed; - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - goto out_free_orig; + if ((data && !orig_data) || !sbi) + goto out_free_base; sbi->s_blockgroup_lock = kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL); - if (!sbi->s_blockgroup_lock) { - kfree(sbi); - goto out_free_orig; - } + if (!sbi->s_blockgroup_lock) + goto out_free_base; + sb->s_fs_info = sbi; sbi->s_sb = sb; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; @@ -3475,11 +3492,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) */ sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT; - if (!parse_options((char *) sbi->s_es->s_mount_opts, sb, - &journal_devnum, &journal_ioprio, 0)) { - ext4_msg(sb, KERN_WARNING, - "failed to parse options in superblock: %s", - sbi->s_es->s_mount_opts); + if (sbi->s_es->s_mount_opts[0]) { + char *s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, + sizeof(sbi->s_es->s_mount_opts), + GFP_KERNEL); + if (!s_mount_opts) + goto failed_mount; + if (!parse_options(s_mount_opts, sb, &journal_devnum, + &journal_ioprio, 0)) { + ext4_msg(sb, KERN_WARNING, + "failed to parse options in superblock: %s", + s_mount_opts); + } + kfree(s_mount_opts); } sbi->s_def_mount_opt = sbi->s_mount_opt; if (!parse_options((char *) data, sb, &journal_devnum, @@ -3505,6 +3530,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dax"); goto failed_mount; } + if (ext4_has_feature_encrypt(sb)) { + ext4_msg(sb, KERN_WARNING, + "encrypted files will use data=ordered " + "instead of data journaling mode"); + } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } else { @@ -3660,12 +3690,16 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group); sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group); - if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0) - goto cantfind_ext4; sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb); if (sbi->s_inodes_per_block == 0) goto cantfind_ext4; + if (sbi->s_inodes_per_group < sbi->s_inodes_per_block || + sbi->s_inodes_per_group > blocksize * 8) { + ext4_msg(sb, KERN_ERR, "invalid inodes per group: %lu\n", + sbi->s_blocks_per_group); + goto failed_mount; + } sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb); @@ -3748,13 +3782,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } sbi->s_cluster_ratio = clustersize / blocksize; - if (sbi->s_inodes_per_group > blocksize * 8) { - ext4_msg(sb, KERN_ERR, - "#inodes per group too big: %lu", - sbi->s_inodes_per_group); - goto failed_mount; - } - /* Do we have standard group size of clustersize * 8 blocks ? 
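
The relocated mount-time check above bounds s_inodes_per_group in both directions: a group must hold at least one full inode-table block, and a one-block inode bitmap can only track blocksize * 8 inodes. The same check as a standalone function, with invented names for illustration:

#include <stdbool.h>
#include <stdio.h>

/* Mirror of the ext4_fill_super() sanity check on inodes per group. */
static bool inodes_per_group_valid(unsigned long inodes_per_group,
				   unsigned int blocksize,
				   unsigned int inode_size)
{
	unsigned int inodes_per_block = blocksize / inode_size;

	return inodes_per_group >= inodes_per_block &&
	       inodes_per_group <= (unsigned long)blocksize * 8;
}

int main(void)
{
	/* 4096-byte blocks, 256-byte inodes: 16..32768 is acceptable. */
	printf("%d\n", inodes_per_group_valid(8192, 4096, 256));	/* 1 */
	printf("%d\n", inodes_per_group_valid(8, 4096, 256));		/* 0 */
	printf("%d\n", inodes_per_group_valid(40000, 4096, 256));	/* 0 */
	return 0;
}
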
*/ if (sbi->s_blocks_per_group == clustersize << 3) set_opt2(sb, STD_GROUP_SIZE); @@ -3814,6 +3841,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) (EXT4_MAX_BLOCK_FILE_PHYS / EXT4_BLOCKS_PER_GROUP(sb))); db_count = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - 1) / EXT4_DESC_PER_BLOCK(sb); + if (ext4_has_feature_meta_bg(sb)) { + if (le32_to_cpu(es->s_first_meta_bg) >= db_count) { + ext4_msg(sb, KERN_WARNING, + "first meta block group too large: %u " + "(group descriptor block count %u)", + le32_to_cpu(es->s_first_meta_bg), db_count); + goto failed_mount; + } + } sbi->s_group_desc = ext4_kvmalloc(db_count * sizeof(struct buffer_head *), GFP_KERNEL); @@ -3967,6 +4003,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) default: break; } + + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && + test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_async_commit in data=ordered mode"); + goto failed_mount_wq; + } + set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); sbi->s_journal->j_commit_callback = ext4_journal_commit_callback; @@ -4160,7 +4204,9 @@ no_journal: if (___ratelimit(&ext4_mount_msg_ratelimit, "EXT4-fs mount")) ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. " - "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts, + "Opts: %.*s%s%s", descr, + (int) sizeof(sbi->s_es->s_mount_opts), + sbi->s_es->s_mount_opts, *sbi->s_es->s_mount_opts ? "; " : "", orig_data); if (es->s_error_count) @@ -4239,8 +4285,8 @@ failed_mount: out_fail: sb->s_fs_info = NULL; kfree(sbi->s_blockgroup_lock); +out_free_base: kfree(sbi); -out_free_orig: kfree(orig_data); return err ? err : ret; } @@ -4550,7 +4596,8 @@ static int ext4_commit_super(struct super_block *sb, int sync) &EXT4_SB(sb)->s_freeinodes_counter)); BUFFER_TRACE(sbh, "marking dirty"); ext4_superblock_csum_set(sb); - lock_buffer(sbh); + if (sync) + lock_buffer(sbh); if (buffer_write_io_error(sbh)) { /* * Oh, dear. A previous attempt to write the @@ -4566,10 +4613,10 @@ static int ext4_commit_super(struct super_block *sb, int sync) set_buffer_uptodate(sbh); } mark_buffer_dirty(sbh); - unlock_buffer(sbh); if (sync) { + unlock_buffer(sbh); error = __sync_dirty_buffer(sbh, - test_opt(sb, BARRIER) ? WRITE_FUA : WRITE_SYNC); + test_opt(sb, BARRIER) ? 
REQ_FUA : REQ_SYNC); if (error) return error; @@ -4857,6 +4904,13 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } + } else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA) { + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "journal_async_commit in data=ordered mode"); + err = -EINVAL; + goto restore_opts; + } } if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { @@ -5239,7 +5293,7 @@ static void lockdep_set_quota_inode(struct inode *inode, int subclass) * Standard function to be called on quota_on */ static int ext4_quota_on(struct super_block *sb, int type, int format_id, - struct path *path) + const struct path *path) { int err; @@ -5366,7 +5420,7 @@ static int ext4_quota_off(struct super_block *sb, int type) handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); if (IS_ERR(handle)) goto out; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); diff --git a/fs/ext4/symlink.c b/fs/ext4/symlink.c index 557b3b0d668c..73b184d161fc 100644 --- a/fs/ext4/symlink.c +++ b/fs/ext4/symlink.c @@ -83,21 +83,18 @@ errout: } const struct inode_operations ext4_encrypted_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = ext4_encrypted_get_link, .setattr = ext4_setattr, .listxattr = ext4_listxattr, }; const struct inode_operations ext4_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .setattr = ext4_setattr, .listxattr = ext4_listxattr, }; const struct inode_operations ext4_fast_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = simple_get_link, .setattr = ext4_setattr, .listxattr = ext4_listxattr, diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index d77be9e9f535..5a94fa52b74f 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -185,6 +185,7 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, { struct ext4_xattr_entry *e = entry; + /* Find the end of the names list */ while (!IS_LAST_ENTRY(e)) { struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next >= end) @@ -192,15 +193,29 @@ ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, e = next; } + /* Check the values */ while (!IS_LAST_ENTRY(entry)) { if (entry->e_value_block != 0) return -EFSCORRUPTED; - if (entry->e_value_size != 0 && - (value_start + le16_to_cpu(entry->e_value_offs) < - (void *)e + sizeof(__u32) || - value_start + le16_to_cpu(entry->e_value_offs) + - le32_to_cpu(entry->e_value_size) > end)) - return -EFSCORRUPTED; + if (entry->e_value_size != 0) { + u16 offs = le16_to_cpu(entry->e_value_offs); + u32 size = le32_to_cpu(entry->e_value_size); + void *value; + + /* + * The value cannot overlap the names, and the value + * with padding cannot extend beyond 'end'. Check both + * the padded and unpadded sizes, since the size may + * overflow to 0 when adding padding. 
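
The comment above deserves a concrete demonstration: e_value_size is a 32-bit field and the xattr format pads values to 4-byte multiples, so a size near UINT32_MAX wraps to almost nothing once padded, and a check against only the padded size would pass. A standalone reproduction; XATTR_SIZE is re-derived here from the 4-byte padding rule and should be treated as an assumption about the on-disk format:

#include <stdint.h>
#include <stdio.h>

/* Re-derived from ext4's xattr padding rule: round up to 4 bytes. */
#define XATTR_PAD	4U
#define XATTR_ROUND	(XATTR_PAD - 1)
#define XATTR_SIZE(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND)

int main(void)
{
	uint32_t evil = 0xfffffffeU;	/* attacker-supplied e_value_size */

	/* 0xfffffffe + 3 wraps to 1 in 32 bits, which rounds down to 0. */
	printf("padded size of 0x%x = 0x%x\n",
	       (unsigned)evil, (unsigned)XATTR_SIZE(evil));
	/*
	 * Hence the new code tests both 'size > end - value' and
	 * 'XATTR_SIZE(size) > end - value': neither check alone suffices.
	 */
	return 0;
}
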
+ */ + if (offs > end - value_start) + return -EFSCORRUPTED; + value = value_start + offs; + if (value < (void *)e + sizeof(u32) || + size > end - value || + EXT4_XATTR_SIZE(size) > end - value) + return -EFSCORRUPTED; + } entry = EXT4_XATTR_NEXT(entry); } @@ -231,13 +246,12 @@ static int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { - struct ext4_xattr_entry *entry = IFIRST(header); int error = -EFSCORRUPTED; - if (((void *) header >= end) || + if (end - (void *)header < sizeof(*header) + sizeof(u32) || (header->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC))) goto errout; - error = ext4_xattr_check_names(entry, end, entry); + error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); errout: if (error) __ext4_error_inode(inode, function, line, 0, @@ -1109,7 +1123,7 @@ int ext4_xattr_ibody_inline_set(handle_t *handle, struct inode *inode, return 0; } -static int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, +static int ext4_xattr_ibody_set(struct inode *inode, struct ext4_xattr_info *i, struct ext4_xattr_ibody_find *is) { @@ -1216,7 +1230,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, } if (!value) { if (!is.s.not_found) - error = ext4_xattr_ibody_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_set(inode, &i, &is); else if (!bs.s.not_found) error = ext4_xattr_block_set(handle, inode, &i, &bs); } else { @@ -1227,7 +1241,7 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, if (!bs.s.not_found && ext4_xattr_value_same(&bs.s, &i)) goto cleanup; - error = ext4_xattr_ibody_set(handle, inode, &i, &is); + error = ext4_xattr_ibody_set(inode, &i, &is); if (!error && !bs.s.not_found) { i.value = NULL; error = ext4_xattr_block_set(handle, inode, &i, &bs); @@ -1242,14 +1256,13 @@ ext4_xattr_set_handle(handle_t *handle, struct inode *inode, int name_index, goto cleanup; if (!is.s.not_found) { i.value = NULL; - error = ext4_xattr_ibody_set(handle, inode, &i, - &is); + error = ext4_xattr_ibody_set(inode, &i, &is); } } } if (!error) { ext4_xattr_update_super_block(handle, inode->i_sb); - inode->i_ctime = ext4_current_time(inode); + inode->i_ctime = current_time(inode); if (!value) ext4_clear_inode_state(inode, EXT4_STATE_NO_EXPAND); error = ext4_mark_iloc_dirty(handle, inode, &is.iloc); @@ -1384,7 +1397,7 @@ static int ext4_xattr_move_to_block(handle_t *handle, struct inode *inode, goto out; /* Remove the chosen entry from the inode */ - error = ext4_xattr_ibody_set(handle, inode, &i, is); + error = ext4_xattr_ibody_set(inode, &i, is); if (error) goto out; diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 6fe23af509e1..8f487692c21f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -384,7 +384,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 7e9b504bd8b2..f73ee9534d83 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -65,7 +65,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .sbi = sbi, .type = META, .op = REQ_OP_READ, - .op_flags = READ_SYNC | REQ_META | REQ_PRIO, + .op_flags = REQ_META | REQ_PRIO, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, @@ -160,7 +160,7 @@ int 
ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .sbi = sbi, .type = META, .op = REQ_OP_READ, - .op_flags = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD, + .op_flags = sync ? (REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, }; struct blk_plug plug; @@ -228,7 +228,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true); + ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } static int f2fs_write_meta_page(struct page *page, @@ -770,7 +770,12 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) /* Sanity checking of checkpoint */ if (sanity_check_ckpt(sbi)) - goto fail_no_cp; + goto free_fail_no_cp; + + if (cur_page == cp1) + sbi->cur_cp_pack = 1; + else + sbi->cur_cp_pack = 2; if (cp_blks <= 1) goto done; @@ -793,6 +798,9 @@ done: f2fs_put_page(cp2, 1); return 0; +free_fail_no_cp: + f2fs_put_page(cp1, 1); + f2fs_put_page(cp2, 1); fail_no_cp: kfree(sbi->ckpt); return -EINVAL; @@ -921,7 +929,11 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); if (inode) { - update_inode_page(inode); + sync_inode_metadata(inode, 0); + + /* it's on eviction */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) + update_inode_page(inode); iput(inode); } }; @@ -987,7 +999,7 @@ static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - build_free_nids(sbi); + build_free_nids(sbi, false); f2fs_unlock_all(sbi); } @@ -998,7 +1010,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&sbi->nr_wb_bios)) + if (!get_pages(sbi, F2FS_WB_CP_DATA)) break; io_schedule_timeout(5*HZ); @@ -1123,7 +1135,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); - start_blk = __start_cp_addr(sbi); + start_blk = __start_cp_next_addr(sbi); /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); @@ -1184,9 +1196,9 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return -EIO; - clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); + __set_cp_next_pack(sbi); /* * redirty superblock if metadata like node page or inode cache is @@ -1261,8 +1273,12 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); - - f2fs_wait_all_discard_bio(sbi); + if (err) { + release_discard_addrs(sbi); + } else { + clear_prefree_segments(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); + } unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 9ae194fd2fdb..9ac262564fa6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -29,6 +29,26 @@ #include "trace.h" #include <trace/events/f2fs.h> +static bool __is_cp_guaranteed(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct f2fs_sb_info *sbi; + + if (!mapping) + return false; + + inode = mapping->host; + sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_NODE_INO(sbi) || + S_ISDIR(inode->i_mode) || + is_cold_data(page)) + return true; + return false; +} + static void f2fs_read_end_io(struct bio *bio) 
{ struct bio_vec *bvec; @@ -71,6 +91,7 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; + enum count_type type = WB_DATA_TYPE(page); fscrypt_pullback_bio_page(&page, true); @@ -78,9 +99,11 @@ static void f2fs_write_end_io(struct bio *bio) mapping_set_error(page->mapping, -EIO); f2fs_stop_checkpoint(sbi, true); } + dec_page_count(sbi, type); + clear_cold_data(page); end_page_writeback(page); } - if (atomic_dec_and_test(&sbi->nr_wb_bios) && + if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); @@ -88,6 +111,46 @@ static void f2fs_write_end_io(struct bio *bio) } /* + * Return true, if pre_bio's bdev is same as its target device. + */ +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + struct block_device *bdev = sbi->sb->s_bdev; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (FDEV(i).start_blk <= blk_addr && + FDEV(i).end_blk >= blk_addr) { + blk_addr -= FDEV(i).start_blk; + bdev = FDEV(i).bdev; + break; + } + } + if (bio) { + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + } + return bdev; +} + +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) + return i; + return 0; +} + +static bool __same_bdev(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev; +} + +/* * Low-level block read/write IO operations. */ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, @@ -97,8 +160,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio = f2fs_bio_alloc(npages); - bio->bi_bdev = sbi->sb->s_bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; bio->bi_private = is_read ? NULL : sbi; @@ -109,8 +171,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { if (!is_read_io(bio_op(bio))) { - atomic_inc(&sbi->nr_wb_bios); - if (f2fs_sb_mounted_hmsmr(sbi->sb) && + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } @@ -198,11 +259,9 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->fio.op = REQ_OP_WRITE; - if (test_opt(sbi, NOBARRIER)) - io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; - else - io->fio.op_flags = WRITE_FLUSH_FUA | REQ_META | - REQ_PRIO; + io->fio.op_flags = REQ_PREFLUSH | REQ_META | REQ_PRIO; + if (!test_opt(sbi, NOBARRIER)) + io->fio.op_flags |= REQ_FUA; } __submit_merged_bio(io); out: @@ -270,22 +329,24 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) verify_block_addr(sbi, fio->old_blkaddr); verify_block_addr(sbi, fio->new_blkaddr); + bio_page = fio->encrypted_page ? 
fio->encrypted_page : fio->page; + + if (!is_read) + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags))) + (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || + !__same_bdev(sbi, fio->new_blkaddr, io->bio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { - int bio_blocks = MAX_BIO_BLOCKS(sbi); - io->bio = __bio_alloc(sbi, fio->new_blkaddr, - bio_blocks, is_read); + BIO_MAX_PAGES, is_read); io->fio = *fio; } - bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); @@ -483,7 +544,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, READ_SYNC, false); + page = get_read_data_page(inode, index, 0, false); if (IS_ERR(page)) return page; @@ -509,7 +570,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, READ_SYNC, for_write); + page = get_read_data_page(inode, index, 0, for_write); if (IS_ERR(page)) return page; @@ -590,7 +651,6 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; - int seg = CURSEG_WARM_DATA; pgoff_t fofs; blkcnt_t count = 1; @@ -608,11 +668,8 @@ alloc: get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page) - seg = CURSEG_DIRECT_IO; - allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, seg); + &sum, CURSEG_WARM_DATA); set_data_blkaddr(dn); /* update i_size */ @@ -624,11 +681,18 @@ alloc: return 0; } -ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) +static inline bool __force_buffered_io(struct inode *inode, int rw) +{ + return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || + F2FS_I_SB(inode)->s_ndevs); +} + +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct f2fs_map_blocks map; - ssize_t ret = 0; + int err = 0; map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); @@ -640,19 +704,22 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; if (iocb->ki_flags & IOCB_DIRECT) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + return f2fs_map_blocks(inode, &map, 1, + __force_buffered_io(inode, WRITE) ? 
+ F2FS_GET_BLOCK_PRE_AIO : + F2FS_GET_BLOCK_PRE_DIO); } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; } if (!f2fs_has_inline_data(inode)) return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - return ret; + return err; } /* @@ -676,7 +743,6 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; struct extent_info ei; - bool allocated = false; block_t blkaddr; if (!maxblocks) @@ -716,7 +782,7 @@ next_dnode: } prealloc = 0; - ofs_in_node = dn.ofs_in_node; + last_ofs_in_node = ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: @@ -735,10 +801,8 @@ next_block: } } else { err = __allocate_data_block(&dn); - if (!err) { + if (!err) set_inode_flag(inode, FI_APPEND_WRITE); - allocated = true; - } } if (err) goto sync_out; @@ -793,7 +857,6 @@ skip: err = reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; - allocated = dn.node_changed; map->m_len += dn.ofs_in_node - ofs_in_node; if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { @@ -812,9 +875,8 @@ skip: if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + f2fs_balance_fs(sbi, dn.node_changed); } - allocated = false; goto next_dnode; sync_out: @@ -822,7 +884,7 @@ sync_out: unlock_out: if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + f2fs_balance_fs(sbi, dn.node_changed); } out: trace_f2fs_map_blocks(inode, map, err); @@ -834,19 +896,19 @@ static int __get_data_block(struct inode *inode, sector_t iblock, pgoff_t *next_pgofs) { struct f2fs_map_blocks map; - int ret; + int err; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; map.m_next_pgofs = next_pgofs; - ret = f2fs_map_blocks(inode, &map, create, flag); - if (!ret) { + err = f2fs_map_blocks(inode, &map, create, flag); + if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; bh->b_size = map.m_len << inode->i_blkbits; } - return ret; + return err; } static int get_data_block(struct inode *inode, sector_t iblock, @@ -891,7 +953,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head map_bh; sector_t start_blk, last_blk; pgoff_t next_pgofs; - loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; @@ -908,13 +969,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, inode_lock(inode); - isize = i_size_read(inode); - if (start >= isize) - goto out; - - if (start + len > isize) - len = isize - start; - if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); @@ -933,13 +987,11 @@ next: /* HOLE */ if (!buffer_mapped(&map_bh)) { start_blk = next_pgofs; - /* Go through holes util pass the EOF */ - if (blk_to_logical(inode, start_blk) < isize) + + if (blk_to_logical(inode, start_blk) < blk_to_logical(inode, + F2FS_I_SB(inode)->max_file_blocks)) goto prep_next; - /* Found a hole beyond isize means no more extents. - * Note that the premise is that filesystems don't - * punch holes beyond isize and keep size unchanged. 
- */ + flags |= FIEMAP_EXTENT_LAST; } @@ -982,7 +1034,6 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct fscrypt_ctx *ctx = NULL; - struct block_device *bdev = sbi->sb->s_bdev; struct bio *bio; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { @@ -1000,8 +1051,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, fscrypt_release_ctx(ctx); return ERR_PTR(-ENOMEM); } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); + f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; bio->bi_private = ctx; @@ -1096,7 +1146,8 @@ got_it: * This page will go to BIO. Do we need to send this * BIO off first? */ - if (bio && (last_block_in_bio != block_nr - 1)) { + if (bio && (last_block_in_bio != block_nr - 1 || + !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { submit_and_realloc: __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; @@ -1195,7 +1246,9 @@ int do_write_data_page(struct f2fs_io_info *fio) fio->old_blkaddr); retry_encrypt: fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, - gfp_flags); + PAGE_SIZE, 0, + fio->page->index, + gfp_flags); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); if (err == -ENOMEM) { @@ -1251,7 +1304,7 @@ static int f2fs_write_data_page(struct page *page, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0, + .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, }; @@ -1311,7 +1364,6 @@ done: if (err && err != -ENOENT) goto redirty_out; - clear_cold_data(page); out: inode_dec_dirty_pages(inode); if (err) @@ -1332,6 +1384,8 @@ out: redirty_out: redirty_page_for_writepage(wbc, page); + if (!err) + return AOP_WRITEPAGE_ACTIVATE; unlock_page(page); return err; } @@ -1427,6 +1481,15 @@ continue_unlock: ret = mapping->a_ops->writepage(page, wbc); if (unlikely(ret)) { + /* + * keep nr_to_write, since vfs uses this to + * get # of written pages. 
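
The writepages loop must treat AOP_WRITEPAGE_ACTIVATE from ->writepage as "page skipped this round" rather than a failure, and per the comment above it leaves nr_to_write alone so the VFS sees an accurate written-page count. A toy version of that control flow; toy_writepage and the sentinel value are invented for illustration:

#include <stdio.h>

#define WRITEPAGE_ACTIVATE 0x80000	/* stand-in for AOP_WRITEPAGE_ACTIVATE */

/* Toy ->writepage: declines every third page, hard-fails page 7. */
static int toy_writepage(int index)
{
	if (index == 7)
		return -5;		/* -EIO */
	if (index % 3 == 0)
		return WRITEPAGE_ACTIVATE;
	return 0;
}

int main(void)
{
	int written = 0, index;

	for (index = 0; index < 10; index++) {
		int ret = toy_writepage(index);

		if (ret == WRITEPAGE_ACTIVATE)
			continue;	/* skipped, not failed: keep scanning */
		if (ret) {
			printf("stop at page %d, err %d\n", index, ret);
			break;
		}
		written++;
	}
	printf("wrote %d pages\n", written);
	return 0;
}
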
+ */ + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + continue; + } done_index = page->index + 1; done = 1; break; @@ -1663,7 +1726,7 @@ repeat: err = PTR_ERR(bio); goto fail; } - bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC); + bio->bi_opf = REQ_OP_READ; if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); err = -EFAULT; @@ -1714,7 +1777,6 @@ static int f2fs_write_end(struct file *file, goto unlock_out; set_page_dirty(page); - clear_cold_data(page); if (pos + copied > i_size_read(inode)) f2fs_i_size_write(inode, pos + copied); @@ -1751,9 +1813,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) if (err) return err; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return 0; - if (test_opt(F2FS_I_SB(inode), LFS)) + if (__force_buffered_io(inode, rw)) return 0; trace_f2fs_direct_IO_enter(inode, offset, count, rw); @@ -1785,12 +1845,14 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, return; if (PageDirty(page)) { - if (inode->i_ino == F2FS_META_INO(sbi)) + if (inode->i_ino == F2FS_META_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_META); - else if (inode->i_ino == F2FS_NODE_INO(sbi)) + } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_NODES); - else + } else { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } } /* This is atomic written page, keep Private */ diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fb245bd302e4..fbd5184140d0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -50,7 +50,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_bios = atomic_read(&sbi->nr_wb_bios); + si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); + si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -74,7 +75,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; - si->fnids = NM_I(sbi)->fcnt; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) @@ -194,7 +196,9 @@ get_cache: si->cache_mem += sizeof(struct flush_cmd_control); /* free nids */ - si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid); + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + + NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) * + sizeof(struct free_nid); si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); @@ -310,22 +314,22 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", - si->inmem_pages, si->wb_bios); - seq_printf(s, " - nodes: %4lld in %4d\n", + seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n", + si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data); + seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); - 
seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n", + seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); - seq_printf(s, " - datas: %4lld in files:%4d\n", + seq_printf(s, " - datas: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); - seq_printf(s, " - meta: %4lld in %4d\n", + seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - imeta: %4lld\n", + seq_printf(s, " - imeta: %4d\n", si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d\n", - si->fnids); + seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", + si->free_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); @@ -373,6 +377,7 @@ static int stat_open(struct inode *inode, struct file *file) } static const struct file_operations stat_fops = { + .owner = THIS_MODULE, .open = stat_open, .read = seq_read, .llseek = seq_lseek, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 369f4513be37..827c5daef4fc 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -136,7 +136,7 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, /* show encrypted name */ if (fname->hash) { - if (de->hash_code == fname->hash) + if (de->hash_code == cpu_to_le32(fname->hash)) goto found; } else if (de_name.len == name->len && de->hash_code == namehash && @@ -313,7 +313,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, set_page_dirty(page); dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); f2fs_put_page(page, 1); } @@ -466,7 +466,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (F2FS_I(dir)->i_current_depth != current_depth) f2fs_i_depth_write(dir, current_depth); @@ -731,7 +731,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); @@ -742,6 +742,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, ClearPagePrivate(page); ClearPageUptodate(page); inode_dec_dirty_pages(dir); + remove_dirty_inode(dir); } f2fs_put_page(page, 1); } @@ -784,7 +785,7 @@ bool f2fs_empty_dir(struct inode *dir) return true; } -bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; @@ -819,7 +820,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, (u32)de->hash_code, 0, &de_name, fstr); if (err) - return true; + return err; de_name = *fstr; fstr->len = save_len; @@ -827,12 +828,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->ino), d_type)) - return true; + return 1; bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } - return false; + return 0; } static int f2fs_readdir(struct file *file, struct dir_context *ctx) @@ -871,17 +872,21 @@ 
static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); - if (err == -ENOENT) + if (err == -ENOENT) { + err = 0; continue; - else + } else { goto out; + } } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + err = f2fs_fill_dentries(ctx, &d, + n * NR_DENTRY_IN_BLOCK, &fstr); + if (err) { kunmap(dentry_page); f2fs_put_page(dentry_page, 1); break; @@ -891,10 +896,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } - err = 0; out: fscrypt_fname_free_buffer(&fstr); - return err; + return err < 0 ? err : 0; } static int f2fs_dir_open(struct inode *inode, struct file *filp) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2b06d4fcd954..4db44da7ef69 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -172,7 +172,7 @@ static void __drop_largest_extent(struct inode *inode, if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9e8de18a168a..2da8c3aa0ce5 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -103,7 +103,7 @@ struct f2fs_mount_info { }; #define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_HMSMR 0x0002 +#define F2FS_FEATURE_BLKZONED 0x0002 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -401,6 +401,7 @@ struct f2fs_map_blocks { #define FADVISE_LOST_PINO_BIT 0x02 #define FADVISE_ENCRYPT_BIT 0x04 #define FADVISE_ENC_NAME_BIT 0x08 +#define FADVISE_KEEP_SIZE_BIT 0x10 #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) @@ -413,6 +414,8 @@ struct f2fs_map_blocks { #define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) +#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) #define DEF_DIR_LEVEL 0 @@ -428,7 +431,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - struct percpu_counter dirty_pages; /* # of dirty pages */ + atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ @@ -493,20 +496,26 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *); +extern void f2fs_mark_inode_dirty_sync(struct inode *, bool); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { if (en->ei.len > et->largest.len) { et->largest = en->ei; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } +enum nid_list { + FREE_NID_LIST, + ALLOC_NID_LIST, + MAX_NID_LIST, +}; + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node 
ids */ - nid_t available_nids; /* maximum available node ids */ + nid_t available_nids; /* # of available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ @@ -522,9 +531,9 @@ struct f2fs_nm_info { /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ - struct list_head free_nid_list; /* a list for free nids */ - spinlock_t free_nid_list_lock; /* protect free nid list */ - unsigned int fcnt; /* the number of free node id */ + struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */ + unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ + spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ /* for checkpoint */ @@ -585,7 +594,6 @@ enum { CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ NO_CHECK_TYPE, - CURSEG_DIRECT_IO, /* to use for the direct IO path */ }; struct flush_cmd { @@ -649,6 +657,7 @@ struct f2fs_sm_info { * f2fs monitors the number of several block types such as on-writeback, * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ +#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, @@ -656,6 +665,8 @@ enum count_type { F2FS_DIRTY_META, F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, + F2FS_WB_CP_DATA, + F2FS_WB_DATA, NR_COUNT_TYPE, }; @@ -688,7 +699,7 @@ struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ int op; /* contains REQ_OP_ */ - int op_flags; /* rq_flag_bits */ + int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ @@ -704,6 +715,20 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +#define FDEV(i) (sbi->devs[i]) +#define RDEV(i) (raw_super->devs[i]) +struct f2fs_dev_info { + struct block_device *bdev; + char path[MAX_PATH_LEN]; + unsigned int total_segments; + block_t start_blk; + block_t end_blk; +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + u8 *blkz_type; /* Array of zones type */ +#endif +}; + enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ @@ -750,6 +775,12 @@ struct f2fs_sb_info { u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; u8 key_prefix_size; #endif + +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int blocks_per_blkz; /* F2FS blocks per zone */ + unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ +#endif + /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ @@ -764,6 +795,7 @@ struct f2fs_sb_info { /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + int cur_cp_pack; /* remain current cp pack */ spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ @@ -815,10 +847,9 @@ struct f2fs_sb_info { block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ - atomic_t nr_wb_bios; /* # of writeback bios */ /* # of pages, see 
count_type */ - struct percpu_counter nr_pages[NR_COUNT_TYPE]; + atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; @@ -863,6 +894,8 @@ struct f2fs_sb_info { /* For shrinker support */ struct list_head s_list; + int s_ndevs; /* number of devices */ + struct f2fs_dev_info *devs; /* for device list */ struct mutex umount_mutex; unsigned int shrinker_run_no; @@ -1105,13 +1138,6 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock(&sbi->cp_lock); } -static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) -{ - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); - - return blk_queue_discard(q); -} - static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -1232,9 +1258,10 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_inc(&sbi->nr_pages[count_type]); + atomic_inc(&sbi->nr_pages[count_type]); - if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || + count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA) return; set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -1242,14 +1269,14 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_pages(struct inode *inode) { - percpu_counter_inc(&F2FS_I(inode)->dirty_pages); + atomic_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_dec(&sbi->nr_pages[count_type]); + atomic_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1258,19 +1285,19 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - percpu_counter_dec(&F2FS_I(inode)->dirty_pages); + atomic_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? 
F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); + return atomic_read(&sbi->nr_pages[count_type]); } -static inline s64 get_dirty_pages(struct inode *inode) +static inline int get_dirty_pages(struct inode *inode) { - return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); + return atomic_read(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) @@ -1329,22 +1356,27 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) { - block_t start_addr; - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - unsigned long long ckpt_version = cur_cp_version(ckpt); - - start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); - /* - * odd numbered checkpoint should at cp segment 0 - * and even segment must be at cp segment 1 - */ - if (!(ckpt_version & 1)) + if (sbi->cur_cp_pack == 2) start_addr += sbi->blocks_per_seg; + return start_addr; +} + +static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi) +{ + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + if (sbi->cur_cp_pack == 1) + start_addr += sbi->blocks_per_seg; return start_addr; } +static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi) +{ + sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1; +} + static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) { return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); @@ -1621,7 +1653,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, return; case FI_DATA_EXIST: case FI_INLINE_DOTS: - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } @@ -1648,7 +1680,7 @@ static inline void set_acl_inode(struct inode *inode, umode_t mode) { F2FS_I(inode)->i_acl_mode = mode; set_inode_flag(inode, FI_ACL_MODE); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static inline void f2fs_i_links_write(struct inode *inode, bool inc) @@ -1657,7 +1689,7 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) inc_nlink(inode); else drop_nlink(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_blocks_write(struct inode *inode, @@ -1668,7 +1700,7 @@ static inline void f2fs_i_blocks_write(struct inode *inode, inode->i_blocks = add ? 
inode->i_blocks + diff : inode->i_blocks - diff; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } @@ -1682,34 +1714,27 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline bool f2fs_skip_inode_update(struct inode *inode) -{ - if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) - return false; - return F2FS_I(inode)->last_disk_size == i_size_read(inode); -} - static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { F2FS_I(inode)->i_xattr_nid = xnid; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) { F2FS_I(inode)->i_pino = pino; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) @@ -1837,13 +1862,31 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); +} + +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) +{ + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool ret; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + file_keep_isize(inode) || + i_size_read(inode) & PAGE_MASK) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); } static inline int f2fs_readonly(struct super_block *sb) @@ -1955,7 +1998,7 @@ void set_de_type(struct f2fs_dir_entry *, umode_t); unsigned char get_de_type(struct f2fs_dir_entry *); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, +int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, unsigned int, struct fscrypt_str *); void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); @@ -1995,7 +2038,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *); +int f2fs_inode_dirtied(struct inode *, bool); void f2fs_inode_synced(struct inode *); int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); @@ -2034,7 +2077,7 @@ void move_node_page(struct page *, int); int fsync_node_pages(struct f2fs_sb_info *, struct inode *, struct writeback_control *, bool); int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *); +void build_free_nids(struct f2fs_sb_info *, bool); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void 
alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); @@ -2060,7 +2103,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *); +void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); @@ -2132,12 +2175,15 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); +struct block_device *f2fs_target_device(struct f2fs_sb_info *, + block_t, struct bio *); +int f2fs_target_device_index(struct f2fs_sb_info *, block_t); void set_data_blkaddr(struct dnode_of_data *); void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); -ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); +int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); @@ -2160,7 +2206,7 @@ int f2fs_migrate_page(struct address_space *, struct page *, struct page *, int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool); +int f2fs_gc(struct f2fs_sb_info *, bool, bool); void build_gc_manager(struct f2fs_sb_info *); /* @@ -2181,12 +2227,12 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; - s64 inmem_pages; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, fnids; + int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; - int bg_gc, wb_bios; + int bg_gc, nr_wb_cp_data, nr_wb_data; int inline_xattr, inline_inode, inline_dir, orphans; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; @@ -2412,9 +2458,30 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); } -static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); +} + +#ifdef CONFIG_BLK_DEV_ZONED +static inline int get_blkz_type(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkaddr) +{ + unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).bdev == bdev) + return FDEV(i).blkz_type[zno]; + return -EINVAL; +} +#endif + +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) { - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); 
+ struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + + return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) @@ -2453,8 +2520,8 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page #define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page #define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_process_policy fscrypt_notsupp_process_policy -#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy +#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy #define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context #define fscrypt_inherit_context fscrypt_notsupp_inherit_context #define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c7865073cd26..49f10dce817d 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -94,8 +94,6 @@ mapped: if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); - /* if gced page is attached, don't write to cold segment */ - clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); @@ -210,7 +208,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, } /* if the inode is dirty, let's recover all the time */ - if (!datasync && !f2fs_skip_inode_update(inode)) { + if (!f2fs_skip_inode_update(inode, datasync)) { f2fs_write_inode(inode, NULL); goto go_write; } @@ -264,7 +262,7 @@ sync_nodes: } if (need_inode_block_update(sbi, ino)) { - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); f2fs_write_inode(inode, NULL); goto sync_nodes; } @@ -632,7 +630,7 @@ int f2fs_truncate(struct inode *inode) return err; inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); return 0; } @@ -679,6 +677,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; + bool size_changed = false; err = setattr_prepare(dentry, attr); if (err) @@ -694,7 +693,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) err = f2fs_truncate(inode); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is @@ -710,6 +708,8 @@ } inode->i_mtime = inode->i_ctime = current_time(inode); } + + size_changed = true; } __setattr_copy(inode, attr); @@ -722,7 +722,12 @@ } } - f2fs_mark_inode_dirty_sync(inode); + /* file size may have changed here */ + f2fs_mark_inode_dirty_sync(inode, size_changed); + + /* inode change will produce dirty node pages flushed by checkpoint */ + f2fs_balance_fs(F2FS_I_SB(inode), true); + return err; } @@ -967,7 +972,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, new_size = (dst + i) << PAGE_SHIFT; if (dst_inode->i_size < new_size) f2fs_i_size_write(dst_inode, new_size); - } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen); + } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR)); f2fs_put_dnode(&dn); } else { @@ -1218,6 +1223,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t
len, ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + + f2fs_balance_fs(sbi, dn.node_changed); + if (ret) goto out; @@ -1313,15 +1321,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset, pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; - int ret; + int err; - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) - return ret; + err = inode_newsize_ok(inode, (len + offset)); + if (err) + return err; - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; f2fs_balance_fs(sbi, true); @@ -1333,12 +1341,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (off_end) map.m_len++; - ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - if (ret) { + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (err) { pgoff_t last_off; if (!map.m_len) - return ret; + return err; last_off = map.m_lblk + map.m_len - 1; @@ -1352,7 +1360,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) f2fs_i_size_write(inode, new_size); - return ret; + return err; } static long f2fs_fallocate(struct file *file, int mode, @@ -1393,7 +1401,9 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1526,7 +1536,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, - "Unexpected flush for atomic writes: ino=%lu, npages=%lld", + "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) @@ -1752,31 +1762,16 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return fscrypt_process_policy(filp, &policy); + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; - struct inode *inode = file_inode(filp); - int err; - - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - - if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) @@ -1842,7 +1837,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync, true); out: mnt_drop_write_file(filp); return ret; @@ -2256,12 +2251,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - ret = f2fs_preallocate_blocks(iocb, from); - if (!ret) { - blk_start_plug(&plug); - ret = __generic_file_write_iter(iocb, from); - blk_finish_plug(&plug); + int err = 
f2fs_preallocate_blocks(iocb, from); + + if (err) { + inode_unlock(inode); + return err; } + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); } inode_unlock(inode); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6f14ee923acd..88bfc3dff496 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -82,7 +82,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -544,13 +544,14 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx) +static void move_encrypted_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, .op = REQ_OP_READ, - .op_flags = READ_SYNC, + .op_flags = 0, .encrypted_page = NULL, }; struct dnode_of_data dn; @@ -565,6 +566,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) if (!page) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -625,7 +629,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) f2fs_wait_on_page_writeback(dn.node_page, NODE, true); fio.op = REQ_OP_WRITE; - fio.op_flags = WRITE_SYNC; + fio.op_flags = REQ_SYNC; fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); @@ -645,7 +649,8 @@ out: f2fs_put_page(page, 1); } -static void move_data_page(struct inode *inode, block_t bidx, int gc_type) +static void move_data_page(struct inode *inode, block_t bidx, int gc_type, + unsigned int segno, int off) { struct page *page; @@ -653,6 +658,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) if (IS_ERR(page)) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; @@ -663,7 +671,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) .sbi = F2FS_I_SB(inode), .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC, + .op_flags = REQ_SYNC, .page = page, .encrypted_page = NULL, }; @@ -673,8 +681,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) retry: set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } set_cold_data(page); @@ -683,8 +693,6 @@ retry: congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - - clear_cold_data(page); } out: f2fs_put_page(page, 1); @@ -794,9 +802,9 @@ next_step: start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx); + move_encrypted_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type); + move_data_page(inode, start_bidx, gc_type, segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); @@ -899,7 +907,7 @@ next: return sec_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) { unsigned int segno; int gc_type = sync ? 
FG_GC : BG_GC; @@ -940,6 +948,9 @@ gc_more: if (ret) goto stop; } + } else if (gc_type == BG_GC && !background) { + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + goto stop; } if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 5f1a67f756af..e32a9e527968 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -111,7 +111,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .sbi = F2FS_I_SB(dn->inode), .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_PRIO, .page = page, .encrypted_page = NULL, }; @@ -137,8 +137,10 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) fio.old_blkaddr = dn->data_blkaddr; write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); - if (dirty) + if (dirty) { inode_dec_dirty_pages(dn->inode); + remove_dirty_inode(dn->inode); + } /* this converted inline_data should be recovered. */ set_inode_flag(dn->inode, FI_APPEND_WRITE); @@ -419,7 +421,7 @@ static int f2fs_add_inline_entries(struct inode *dir, } new_name.name = d.filename[bit_pos]; - new_name.len = de->name_len; + new_name.len = le16_to_cpu(de->name_len); ino = le32_to_cpu(de->ino); fake_mode = get_de_type(de) << S_SHIFT; @@ -573,7 +575,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); @@ -610,6 +612,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + int err; if (ctx->pos == NR_INLINE_DENTRY) return 0; @@ -622,11 +625,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); - if (!f2fs_fill_dentries(ctx, &d, 0, fstr)) + err = f2fs_fill_dentries(ctx, &d, 0, fstr); + if (!err) ctx->pos = NR_INLINE_DENTRY; f2fs_put_page(ipage, 1); - return 0; + return err < 0 ? 
err : 0; } int f2fs_inline_data_fiemap(struct inode *inode, diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d7369895a78a..af06bda51a54 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -19,10 +19,11 @@ #include <trace/events/f2fs.h> -void f2fs_mark_inode_dirty_sync(struct inode *inode) +void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) { - if (f2fs_inode_dirtied(inode)) + if (f2fs_inode_dirtied(inode, sync)) return; + mark_inode_dirty_sync(inode); } @@ -43,7 +44,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -252,6 +253,7 @@ retry: int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; + struct extent_tree *et = F2FS_I(inode)->extent_tree; f2fs_inode_synced(inode); @@ -267,11 +269,13 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); - if (F2FS_I(inode)->extent_tree) - set_raw_extent(&F2FS_I(inode)->extent_tree->largest, - &ri->i_ext); - else + if (et) { + read_lock(&et->lock); + set_raw_extent(&et->largest, &ri->i_ext); + read_unlock(&et->lock); + } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); + } set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); @@ -335,7 +339,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - if (update_inode_page(inode)) + if (update_inode_page(inode) && wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } @@ -373,6 +377,9 @@ void f2fs_evict_inode(struct inode *inode) goto no_delete; #endif + remove_ino_entry(sbi, inode->i_ino, APPEND_INO); + remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); @@ -384,6 +391,8 @@ retry: f2fs_lock_op(sbi); err = remove_inode_page(inode); f2fs_unlock_op(sbi); + if (err == -ENOENT) + err = 0; } /* give more chances, if ENOMEM case */ @@ -403,10 +412,12 @@ no_delete: invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(inode, FI_APPEND_WRITE)) - add_ino_entry(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) - add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + if (inode->i_nlink) { + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + } if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); @@ -424,6 +435,18 @@ void handle_failed_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; + /* + * clear nlink of the inode in order to release its resources + * immediately. + */ + clear_nlink(inode); + + /* + * we must call this to avoid the inode remaining dirty, resulting + * in a panic when flushing dirty inodes in gdirty_list. + */ + update_inode_page(inode); + /* don't make bad inode, since it becomes a regular file.
*/ unlock_new_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 489fa0d5f914..56c19b0610a8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -778,7 +778,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); - f2fs_mark_inode_dirty_sync(old_inode); + f2fs_mark_inode_dirty_sync(old_inode, false); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); @@ -938,7 +938,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(old_dir); + f2fs_mark_inode_dirty_sync(old_dir, false); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -953,7 +953,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(new_dir); + f2fs_mark_inode_dirty_sync(new_dir, false); f2fs_unlock_op(sbi); @@ -1075,7 +1075,6 @@ errout: } const struct inode_operations f2fs_encrypted_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = f2fs_encrypted_get_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, @@ -1105,7 +1104,6 @@ const struct inode_operations f2fs_dir_inode_operations = { }; const struct inode_operations f2fs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = f2fs_get_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 01177ecdeab8..b9078fdb3743 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -45,8 +45,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) * give 25%, 25%, 50%, 50%, 50% memory for each components respectively */ if (type == FREE_NIDS) { - mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> - PAGE_SHIFT; + mem_size = (nm_i->nid_cnt[FREE_NID_LIST] * + sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> @@ -270,8 +270,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); } else { - f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino || - nat_get_blkaddr(e) != ne->block_addr || + f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || + nat_get_blkaddr(e) != + le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); } } @@ -1134,7 +1135,7 @@ repeat: if (!page) return ERR_PTR(-ENOMEM); - err = read_node_page(page, READ_SYNC); + err = read_node_page(page, 0); if (err < 0) { f2fs_put_page(page, 1); return ERR_PTR(err); @@ -1204,6 +1205,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) ret = f2fs_write_inline_data(inode, page); inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); if (ret) set_page_dirty(page); page_out: @@ -1338,7 +1340,8 @@ retry: if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); pagevec_release(&pvec); - return -EIO; + ret = -EIO; + goto out; } if (!IS_DNODE(page) || !is_cold_node(page)) @@ -1407,11 +1410,12 @@ continue_unlock: "Retry to write fsync mark: ino=%u, idx=%lx", ino, last_page->index); lock_page(last_page); + f2fs_wait_on_page_writeback(last_page, NODE, true); set_page_dirty(last_page); unlock_page(last_page); goto retry; } - +out: if 
(nwritten) f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); return ret ? -EIO: 0; @@ -1570,7 +1574,7 @@ static int f2fs_write_node_page(struct page *page, .sbi = sbi, .type = NODE, .op = REQ_OP_WRITE, - .op_flags = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0, + .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, }; @@ -1692,11 +1696,35 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, return radix_tree_lookup(&nm_i->free_nid_root, n); } -static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, - struct free_nid *i) +static int __insert_nid_to_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list, bool new) { + struct f2fs_nm_info *nm_i = NM_I(sbi); + + if (new) { + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) + return err; + } + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]++; + list_add_tail(&i->list, &nm_i->nid_list[list]); + return 0; +} + +static void __remove_nid_from_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list, bool reuse) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]--; list_del(&i->list); - radix_tree_delete(&nm_i->free_nid_root, i->nid); + if (!reuse) + radix_tree_delete(&nm_i->free_nid_root, i->nid); } static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) @@ -1704,9 +1732,7 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; - - if (!available_free_memory(sbi, FREE_NIDS)) - return -1; + int err; /* 0 nid should not be used */ if (unlikely(nid == 0)) @@ -1729,33 +1755,30 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) return 0; } - spin_lock(&nm_i->free_nid_list_lock); - if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { - spin_unlock(&nm_i->free_nid_list_lock); - radix_tree_preload_end(); + spin_lock(&nm_i->nid_list_lock); + err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); + spin_unlock(&nm_i->nid_list_lock); + radix_tree_preload_end(); + if (err) { kmem_cache_free(free_nid_slab, i); return 0; } - list_add_tail(&i->list, &nm_i->free_nid_list); - nm_i->fcnt++; - spin_unlock(&nm_i->free_nid_list_lock); - radix_tree_preload_end(); return 1; } -static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; bool need_free = false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { - __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); need_free = true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1778,14 +1801,12 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) { - if (add_free_nid(sbi, start_nid, true) < 0) - break; - } + if (blk_addr == NULL_ADDR) + add_free_nid(sbi, start_nid, true); } } -void build_free_nids(struct f2fs_sb_info *sbi) +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) 
{ struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1794,7 +1815,10 @@ void build_free_nids(struct f2fs_sb_info *sbi) nid_t nid = nm_i->next_scan_nid; /* Enough entries */ - if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) + if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) + return; + + if (!sync && !available_free_memory(sbi, FREE_NIDS)) return; /* readahead nat pages to be scanned */ @@ -1830,7 +1854,7 @@ void build_free_nids(struct f2fs_sb_info *sbi) if (addr == NULL_ADDR) add_free_nid(sbi, nid, true); else - remove_free_nid(nm_i, nid); + remove_free_nid(sbi, nid); } up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); @@ -1839,6 +1863,13 @@ void build_free_nids(struct f2fs_sb_info *sbi) nm_i->ra_nid_pages, META_NAT, false); } +void build_free_nids(struct f2fs_sb_info *sbi, bool sync) +{ + mutex_lock(&NM_I(sbi)->build_lock); + __build_free_nids(sbi, sync); + mutex_unlock(&NM_I(sbi)->build_lock); +} + /* * If this function returns success, caller can obtain a new nid * from second parameter of this function. @@ -1853,31 +1884,31 @@ retry: if (time_to_inject(sbi, FAULT_ALLOC_NID)) return false; #endif - if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) - return false; + spin_lock(&nm_i->nid_list_lock); - spin_lock(&nm_i->free_nid_list_lock); + if (unlikely(nm_i->available_nids == 0)) { + spin_unlock(&nm_i->nid_list_lock); + return false; + } /* We should not use stale free nids created by build_free_nids */ - if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); - list_for_each_entry(i, &nm_i->free_nid_list, list) - if (i->state == NID_NEW) - break; - - f2fs_bug_on(sbi, i->state != NID_NEW); + if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); + i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], + struct free_nid, list); *nid = i->nid; + + __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); i->state = NID_ALLOC; - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); + nm_i->available_nids--; + spin_unlock(&nm_i->nid_list_lock); return true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - mutex_lock(&nm_i->build_lock); - build_free_nids(sbi); - mutex_unlock(&nm_i->build_lock); + build_free_nids(sbi, true); goto retry; } @@ -1889,11 +1920,11 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); - __del_from_free_nid_list(nm_i, i); - spin_unlock(&nm_i->free_nid_list_lock); + f2fs_bug_on(sbi, !i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); } @@ -1910,17 +1941,22 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) if (!nid) return; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i); + if (!available_free_memory(sbi, FREE_NIDS)) { - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); need_free = true; } else { + __remove_nid_from_list(sbi, i, 
ALLOC_NID_LIST, true); i->state = NID_NEW; - nm_i->fcnt++; + __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); } - spin_unlock(&nm_i->free_nid_list_lock); + + nm_i->available_nids++; + + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1932,24 +1968,24 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; - if (nm_i->fcnt <= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) return 0; if (!mutex_trylock(&nm_i->build_lock)) return 0; - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST], + list) { + if (nr_shrink <= 0 || + nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) break; - if (i->state == NID_ALLOC) - continue; - __del_from_free_nid_list(nm_i, i); + + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); kmem_cache_free(free_nid_slab, i); - nm_i->fcnt--; nr_shrink--; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); mutex_unlock(&nm_i->build_lock); return nr - nr_shrink; @@ -2005,7 +2041,7 @@ recover_xnid: if (unlikely(!inc_valid_node_count(sbi, inode))) f2fs_bug_on(sbi, 1); - remove_free_nid(NM_I(sbi), new_xnid); + remove_free_nid(sbi, new_xnid); get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); @@ -2035,7 +2071,7 @@ retry: } /* Should not use this inode from free nid list */ - remove_free_nid(NM_I(sbi), ino); + remove_free_nid(sbi, ino); if (!PageUptodate(ipage)) SetPageUptodate(ipage); @@ -2069,7 +2105,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi, struct f2fs_node *rn; struct f2fs_summary *sum_entry; block_t addr; - int bio_blocks = MAX_BIO_BLOCKS(sbi); int i, idx, last_offset, nrpages; /* scan the node segment */ @@ -2078,7 +2113,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { - nrpages = min(last_offset - i, bio_blocks); + nrpages = min(last_offset - i, BIO_MAX_PAGES); /* readahead node pages */ ra_meta_pages(sbi, addr, nrpages, META_POR, true); @@ -2120,6 +2155,19 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } + + /* + * if a free nat in journal has not been used after last + * checkpoint, we should remove it from available nids, + * since later we will add it again. 
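+ * (the matching available_nids++ is done in __flush_nat_entry_set + * once the entry is written back with NULL_ADDR)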
+ */ + if (!get_nat_flag(ne, IS_DIRTY) && + le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) { + spin_lock(&nm_i->nid_list_lock); + nm_i->available_nids--; + spin_unlock(&nm_i->nid_list_lock); + } + __set_nat_cache_dirty(nm_i, ne); } update_nats_in_cursum(journal, -i); @@ -2192,8 +2240,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - if (nat_get_blkaddr(ne) == NULL_ADDR) + if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); + spin_lock(&NM_I(sbi)->nid_list_lock); + NM_I(sbi)->available_nids++; + spin_unlock(&NM_I(sbi)->nid_list_lock); + } } if (to_journal) @@ -2268,21 +2320,24 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ - nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; - nm_i->fcnt = 0; + nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - + F2FS_RESERVED_NODE_NUM; + nm_i->nid_cnt[FREE_NID_LIST] = 0; + nm_i->nid_cnt[ALLOC_NID_LIST] = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); - INIT_LIST_HEAD(&nm_i->free_nid_list); + INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]); + INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]); INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); mutex_init(&nm_i->build_lock); - spin_lock_init(&nm_i->free_nid_list_lock); + spin_lock_init(&nm_i->nid_list_lock); init_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); @@ -2310,7 +2365,7 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi); + build_free_nids(sbi, true); return 0; } @@ -2327,17 +2382,18 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) return; /* destroy free nid list */ - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - f2fs_bug_on(sbi, i->state == NID_ALLOC); - __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], + list) { + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); } - f2fs_bug_on(sbi, nm_i->fcnt); - spin_unlock(&nm_i->free_nid_list_lock); + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]); + f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]); + f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST])); + spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ down_write(&nm_i->nat_tree_lock); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 868bec65e51c..e7997e240366 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -169,14 +169,15 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *fnid; - spin_lock(&nm_i->free_nid_list_lock); - if (nm_i->fcnt <= 0) { - spin_unlock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); + if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) { + spin_unlock(&nm_i->nid_list_lock); return; } - fnid = 
list_entry(nm_i->free_nid_list.next, struct free_nid, list); + fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next, + struct free_nid, list); *nid = fnid->nid; - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); } /* @@ -313,7 +314,7 @@ static inline bool is_recoverable_dnode(struct page *page) ((unsigned char *)ckpt + crc_offset))); cp_ver |= (crc << 32); } - return cpu_to_le64(cp_ver) == cpver_of_node(page); + return cp_ver == cpver_of_node(page); } /* diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 2fc84a991325..981a9584b62f 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -180,13 +180,15 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_mode = le16_to_cpu(raw->i_mode); f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); - inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + F2FS_I(inode)->i_advise = raw->i_advise; + if (file_enc_name(inode)) name = "<encrypted>"; else @@ -196,32 +198,6 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static bool is_same_inode(struct inode *inode, struct page *ipage) -{ - struct f2fs_inode *ri = F2FS_INODE(ipage); - struct timespec disk; - - if (!IS_INODE(ipage)) - return true; - - disk.tv_sec = le64_to_cpu(ri->i_ctime); - disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); - if (timespec_compare(&inode->i_ctime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_atime); - disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); - if (timespec_compare(&inode->i_atime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_mtime); - disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); - if (timespec_compare(&inode->i_mtime, &disk) > 0) - return false; - - return true; -} - static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { struct curseg_info *curseg; @@ -248,10 +224,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (entry) { - if (!is_same_inode(entry->inode, page)) - goto next; - } else { + if (!entry) { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) @@ -454,7 +427,8 @@ retry_dn: continue; } - if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) + if (!file_keep_isize(inode) && + (i_size_read(inode) <= (start << PAGE_SHIFT))) f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); /* @@ -507,8 +481,10 @@ err: f2fs_put_dnode(&dn); out: f2fs_msg(sbi->sb, KERN_NOTICE, - "recover_data: ino = %lx, recovered = %d blocks, err = %d", - inode->i_ino, recovered, err); + "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d", + inode->i_ino, + file_keep_isize(inode) ? 
"keep" : "recover", + recovered, err); return err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fc886f008449..0d8802453758 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -259,7 +259,7 @@ static int __commit_inmem_pages(struct inode *inode, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_PRIO, .encrypted_page = NULL, }; bool submit_bio = false; @@ -274,8 +274,10 @@ static int __commit_inmem_pages(struct inode *inode, set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } fio.page = page; err = do_write_data_page(&fio); @@ -287,7 +289,6 @@ static int __commit_inmem_pages(struct inode *inode, /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - clear_cold_data(page); submit_bio = true; } unlock_page(page); @@ -363,7 +364,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false); + f2fs_gc(sbi, false, false); } } @@ -380,14 +381,17 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi); + build_free_nids(sbi, false); + + if (!is_idle(sbi)) + return; /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || !available_free_memory(sbi, INO_ENTRIES) || excess_prefree_segs(sbi) || excess_dirty_nats(sbi) || - (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + f2fs_time_over(sbi, CP_TIME)) { if (test_opt(sbi, DATA_FLUSH)) { struct blk_plug plug; @@ -400,6 +404,33 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } +static int __submit_flush_wait(struct block_device *bdev) +{ + struct bio *bio = f2fs_bio_alloc(0); + int ret; + + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + bio->bi_bdev = bdev; + ret = submit_bio_wait(bio); + bio_put(bio); + return ret; +} + +static int submit_flush_wait(struct f2fs_sb_info *sbi) +{ + int ret = __submit_flush_wait(sbi->sb->s_bdev); + int i; + + if (sbi->s_ndevs && !ret) { + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(FDEV(i).bdev); + if (ret) + break; + } + } + return ret; +} + static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; @@ -410,25 +441,18 @@ repeat: return 0; if (!llist_empty(&fcc->issue_list)) { - struct bio *bio; struct flush_cmd *cmd, *next; int ret; - bio = f2fs_bio_alloc(0); - fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); - bio->bi_bdev = sbi->sb->s_bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); - ret = submit_bio_wait(bio); - + ret = submit_flush_wait(sbi); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; complete(&cmd->wait); } - bio_put(bio); fcc->dispatch_list = NULL; } @@ -449,15 +473,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return 0; if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { - struct bio *bio = f2fs_bio_alloc(0); int ret; atomic_inc(&fcc->submit_flush); - bio->bi_bdev = sbi->sb->s_bdev; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_FLUSH); - ret = submit_bio_wait(bio); + ret = submit_flush_wait(sbi); atomic_dec(&fcc->submit_flush); - bio_put(bio); return ret; } @@ -469,8 +489,13 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) 
if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); - wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + if (fcc->f2fs_issue_flush) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->submit_flush); + } else { + llist_del_all(&fcc->issue_list); + atomic_set(&fcc->submit_flush, 0); + } return cmd.ret; } @@ -481,6 +506,11 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc; int err = 0; + if (SM_I(sbi)->cmd_control_info) { + fcc = SM_I(sbi)->cmd_control_info; + goto init_thread; + } + fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; @@ -488,6 +518,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->cmd_control_info = fcc; +init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { @@ -500,14 +531,20 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) return err; } -void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; - if (fcc && fcc->f2fs_issue_flush) - kthread_stop(fcc->f2fs_issue_flush); - kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + if (fcc && fcc->f2fs_issue_flush) { + struct task_struct *flush_thread = fcc->f2fs_issue_flush; + + fcc->f2fs_issue_flush = NULL; + kthread_stop(flush_thread); + } + if (free) { + kfree(fcc); + SM_I(sbi)->cmd_control_info = NULL; + } } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, @@ -633,15 +670,23 @@ static void f2fs_submit_bio_wait_endio(struct bio *bio) } /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) +static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { - struct block_device *bdev = sbi->sb->s_bdev; struct bio *bio = NULL; int err; - err = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags, - &bio); + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + + if (sbi->s_ndevs) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + err = __blkdev_issue_discard(bdev, + SECTOR_FROM_BLOCK(blkstart), + SECTOR_FROM_BLOCK(blklen), + GFP_NOFS, 0, &bio); if (!err && bio) { struct bio_entry *be = __add_bio_entry(sbi, bio); @@ -654,24 +699,101 @@ int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, sector_t sector, return err; } +#ifdef CONFIG_BLK_DEV_ZONED +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); + sector_t sector; + int devi = 0; + + if (sbi->s_ndevs) { + devi = f2fs_target_device_index(sbi, blkstart); + blkstart -= FDEV(devi).start_blk; + } + sector = SECTOR_FROM_BLOCK(blkstart); + + if (sector & (bdev_zone_sectors(bdev) - 1) || + nr_sects != bdev_zone_sectors(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ? FDEV(devi).path: "", + blkstart, blklen); + return -EIO; + } + + /* + * We need to know the type of the zone: for conventional zones, + * use regular discard if the drive supports it. 
For sequential + * zones, reset the zone write pointer. + */ + switch (get_blkz_type(sbi, bdev, blkstart)) { + + case BLK_ZONE_TYPE_CONVENTIONAL: + if (!blk_queue_discard(bdev_get_queue(bdev))) + return 0; + return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + trace_f2fs_issue_reset_zone(sbi->sb, blkstart); + return blkdev_reset_zones(bdev, sector, + nr_sects, GFP_NOFS); + default: + /* Unknown zone type: broken device ? */ + return -EIO; + } +} +#endif + +static int __issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sbi->sb) && + bdev_zoned_model(bdev) != BLK_ZONED_NONE) + return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); +#endif + return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); +} + static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { - sector_t start = SECTOR_FROM_BLOCK(blkstart); - sector_t len = SECTOR_FROM_BLOCK(blklen); + sector_t start = blkstart, len = 0; + struct block_device *bdev; struct seg_entry *se; unsigned int offset; block_t i; + int err = 0; + + bdev = f2fs_target_device(sbi, blkstart, NULL); + + for (i = blkstart; i < blkstart + blklen; i++, len++) { + if (i != start) { + struct block_device *bdev2 = + f2fs_target_device(sbi, i, NULL); + + if (bdev2 != bdev) { + err = __issue_discard_async(sbi, bdev, + start, len); + if (err) + return err; + bdev = bdev2; + start = i; + len = 0; + } + } - for (i = blkstart; i < blkstart + blklen; i++) { se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); if (!f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); - return __f2fs_issue_discard_async(sbi, start, len, GFP_NOFS, 0); + + if (len) + err = __issue_discard_async(sbi, bdev, start, len); + return err; } static void __add_discard_entry(struct f2fs_sb_info *sbi, @@ -1296,25 +1418,21 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, stat_inc_seg_type(sbi, curseg); } -static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type) -{ - struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int old_segno; - - old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); - locate_dirty_segment(sbi, old_segno); -} - void allocate_new_segments(struct f2fs_sb_info *sbi) { + struct curseg_info *curseg; + unsigned int old_segno; int i; if (test_opt(sbi, LFS)) return; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segments(sbi, i); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + curseg = CURSEG_I(sbi, i); + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); + locate_dirty_segment(sbi, old_segno); + } } static const struct segment_allocation default_salloc_ops = { @@ -1448,21 +1566,11 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_summary *sum, int type) { struct sit_info *sit_i = SIT_I(sbi); - struct curseg_info *curseg; - bool direct_io = (type == CURSEG_DIRECT_IO); - - type = direct_io ? 
CURSEG_WARM_DATA : type; - - curseg = CURSEG_I(sbi, type); + struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); - /* direct_io'ed data is aligned to the segment for better performance */ - if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0, 0)) - __allocate_new_segments(sbi, type); - *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* @@ -1515,7 +1623,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .sbi = sbi, .type = META, .op = REQ_OP_WRITE, - .op_flags = WRITE_SYNC | REQ_META | REQ_PRIO, + .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = page->index, .new_blkaddr = page->index, .page = page, @@ -2166,7 +2274,6 @@ out: static int build_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; char *src_bitmap, *dst_bitmap; @@ -2233,7 +2340,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; - sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count); + sit_i->written_valid_blocks = 0; sit_i->sit_bitmap = dst_bitmap; sit_i->bitmap_size = bitmap_size; sit_i->dirty_sentries = 0; @@ -2315,10 +2422,10 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(sbi) * 8; do { - readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); + readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; @@ -2387,6 +2494,9 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) struct seg_entry *sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); + else + SIT_I(sbi)->written_valid_blocks += + sentry->valid_blocks; } /* set use the current segments */ @@ -2645,7 +2755,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; - destroy_flush_cmd_control(sbi); + destroy_flush_cmd_control(sbi, true); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index fecb856ad874..9d44ce83acb2 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -18,6 +18,8 @@ #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ +#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ + /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) @@ -102,8 +104,6 @@ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) -#define MAX_BIO_BLOCKS(sbi) \ - ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES)) /* * indicate a block allocation direction: RIGHT and LEFT. 
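The SECTOR_FROM_BLOCK()/SECTOR_TO_BLOCK() macros kept in the hunk above are pure shifts by F2FS_LOG_SECTORS_PER_BLOCK, which is why the patch can drop the queue-derived MAX_BIO_BLOCKS() and size readahead with BIO_MAX_PAGES directly. A minimal userspace sketch of the round trip, assuming the common 4 KiB f2fs block over 512-byte sectors (so a shift of 3; the real constant lives in the f2fs headers):

#include <assert.h>
#include <stdint.h>

/* Assumed value: 4096-byte blocks over 512-byte sectors -> log2(8) == 3. */
#define F2FS_LOG_SECTORS_PER_BLOCK 3

#define SECTOR_FROM_BLOCK(blk_addr) \
        (((uint64_t)(blk_addr)) << F2FS_LOG_SECTORS_PER_BLOCK)
#define SECTOR_TO_BLOCK(sectors) \
        ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK)

int main(void)
{
        uint32_t blk = 1000;

        /* Block 1000 begins at sector 8000; the conversion is lossless
         * for block-aligned addresses. */
        assert(SECTOR_FROM_BLOCK(blk) == 8000);
        assert(SECTOR_TO_BLOCK(SECTOR_FROM_BLOCK(blk)) == blk);
        return 0;
}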
@@ -471,11 +471,12 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (test_opt(sbi, LFS)) return false; - return free_sections(sbi) <= (node_secs + 2 * dent_secs + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + reserved_sections(sbi) + 1); } @@ -484,14 +485,14 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - - node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); + (node_secs + 2 * dent_secs + imeta_secs + + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) @@ -695,13 +696,6 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) return false; } -static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - return SECTOR_TO_BLOCK(queue_max_sectors(q)); -} - /* * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. @@ -719,7 +713,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else if (type == NODE) return 8 * sbi->blocks_per_seg; else if (type == META) - return 8 * MAX_BIO_BLOCKS(sbi); + return 8 * BIO_MAX_PAGES; else return 0; } @@ -736,11 +730,9 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, return 0; nr_to_write = wbc->nr_to_write; - + desired = BIO_MAX_PAGES; if (type == NODE) - desired = 2 * max_hw_blocks(sbi); - else - desired = MAX_BIO_BLOCKS(sbi); + desired <<= 1; wbc->nr_to_write = desired; return desired - nr_to_write; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 46c915425923..5c60fc28ec75 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -21,14 +21,16 @@ static unsigned int shrinker_run_no; static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + + return count > 0 ? count : 0; } static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) - return NM_I(sbi)->fcnt - MAX_FREE_NIDS; - return 0; + long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; + + return count > 0 ? 
count : 0; } static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6132b4ce4e4c..46fd30d8af77 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -412,14 +412,20 @@ static int parse_options(struct super_block *sb, char *options) q = bdev_get_queue(sb->s_bdev); if (blk_queue_discard(q)) { set_opt(sbi, DISCARD); - } else { + } else if (!f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but " "the device does not support discard"); } break; case Opt_nodiscard: + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "discard is required for zoned block devices"); + return -EINVAL; + } clear_opt(sbi, DISCARD); + break; case Opt_noheap: set_opt(sbi, NOHEAP); break; @@ -512,6 +518,13 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 8 && !strncmp(name, "adaptive", 8)) { + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "adaptive mode is not allowed with " + "zoned block device feature"); + kfree(name); + return -EINVAL; + } set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); } else if (strlen(name) == 3 && !strncmp(name, "lfs", 3)) { @@ -558,13 +571,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); - if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { - kmem_cache_free(f2fs_inode_cachep, fi); - return NULL; - } - /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; + atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); @@ -620,24 +629,25 @@ static int f2fs_drop_inode(struct inode *inode) return generic_drop_inode(inode); } -int f2fs_inode_dirtied(struct inode *inode) +int f2fs_inode_dirtied(struct inode *inode, bool sync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret = 0; spin_lock(&sbi->inode_lock[DIRTY_META]); if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { - spin_unlock(&sbi->inode_lock[DIRTY_META]); - return 1; + ret = 1; + } else { + set_inode_flag(inode, FI_DIRTY_INODE); + stat_inc_dirty_inode(sbi, DIRTY_META); } - - set_inode_flag(inode, FI_DIRTY_INODE); - list_add_tail(&F2FS_I(inode)->gdirty_list, + if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) { + list_add_tail(&F2FS_I(inode)->gdirty_list, &sbi->inode_list[DIRTY_META]); - inc_page_count(sbi, F2FS_DIRTY_IMETA); - stat_inc_dirty_inode(sbi, DIRTY_META); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + } spin_unlock(&sbi->inode_lock[DIRTY_META]); - - return 0; + return ret; } void f2fs_inode_synced(struct inode *inode) @@ -649,10 +659,12 @@ void f2fs_inode_synced(struct inode *inode) spin_unlock(&sbi->inode_lock[DIRTY_META]); return; } - list_del_init(&F2FS_I(inode)->gdirty_list); + if (!list_empty(&F2FS_I(inode)->gdirty_list)) { + list_del_init(&F2FS_I(inode)->gdirty_list); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + } clear_inode_flag(inode, FI_DIRTY_INODE); clear_inode_flag(inode, FI_AUTO_RECOVER); - dec_page_count(sbi, F2FS_DIRTY_IMETA); stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); spin_unlock(&sbi->inode_lock[DIRTY_META]); } @@ -676,7 +688,7 @@ static void f2fs_dirty_inode(struct inode *inode, int flags) if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) clear_inode_flag(inode, FI_AUTO_RECOVER); - f2fs_inode_dirtied(inode); + f2fs_inode_dirtied(inode, false); } static void f2fs_i_callback(struct rcu_head *head) @@ -687,20 +699,28 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode 
*inode) { - percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } static void destroy_percpu_info(struct f2fs_sb_info *sbi) { - int i; - - for (i = 0; i < NR_COUNT_TYPE; i++) - percpu_counter_destroy(&sbi->nr_pages[i]); percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); } +static void destroy_device_list(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + blkdev_put(FDEV(i).bdev, FMODE_EXCL); +#ifdef CONFIG_BLK_DEV_ZONED + kfree(FDEV(i).blkz_type); +#endif + } + kfree(sbi->devs); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -738,7 +758,6 @@ static void f2fs_put_super(struct super_block *sb) * In addition, EIO will skip do checkpoint, we need this as well. */ release_ino_entry(sbi, true); - release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); @@ -762,6 +781,8 @@ static void f2fs_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->raw_super); + destroy_device_list(sbi); + destroy_percpu_info(sbi); kfree(sbi); } @@ -789,13 +810,17 @@ int f2fs_sync_fs(struct super_block *sb, int sync) static int f2fs_freeze(struct super_block *sb) { - int err; - if (f2fs_readonly(sb)) return 0; - err = f2fs_sync_fs(sb, 1); - return err; + /* IO error happened before */ + if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) + return -EIO; + + /* must be clean, since sync_filesystem() was already called */ + if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) + return -EINVAL; + return 0; } static int f2fs_unfreeze(struct super_block *sb) @@ -822,7 +847,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = user_block_count - valid_user_blocks(sbi); buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = buf->f_files - valid_inode_count(sbi); + buf->f_ffree = min(buf->f_files - valid_node_count(sbi), + buf->f_bavail); buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; @@ -974,7 +1000,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, EXTENT_CACHE); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + if (f2fs_sb_mounted_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); set_opt(sbi, DISCARD); } else { @@ -1076,8 +1102,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if flush_merge is not passed in mount option. 
*/ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { - destroy_flush_cmd_control(sbi); - } else if (!SM_I(sbi)->cmd_control_info) { + clear_opt(sbi, FLUSH_MERGE); + destroy_flush_cmd_control(sbi, false); + } else { err = create_flush_cmd_control(sbi); if (err) goto restore_gc; @@ -1238,7 +1265,7 @@ static int __f2fs_commit_super(struct buffer_head *bh, unlock_buffer(bh); /* it's rare case, we can do fua all the time */ - return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); + return __sync_dirty_buffer(bh, REQ_PREFLUSH | REQ_FUA); } static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, @@ -1426,6 +1453,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int ovp_segments, reserved_segments; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1437,6 +1465,16 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; + ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + + if (unlikely(fsmeta < F2FS_MIN_SEGMENTS || + ovp_segments == 0 || reserved_segments == 0)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong layout: check mkfs.f2fs version"); + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; @@ -1447,6 +1485,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; + int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1471,6 +1510,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); + for (i = 0; i < NR_COUNT_TYPE; i++) + atomic_set(&sbi->nr_pages[i], 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); mutex_init(&sbi->wio_mutex[NODE]); @@ -1486,13 +1528,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) static int init_percpu_info(struct f2fs_sb_info *sbi) { - int i, err; - - for (i = 0; i < NR_COUNT_TYPE; i++) { - err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); - if (err) - return err; - } + int err; err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); if (err) @@ -1502,6 +1538,71 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) GFP_KERNEL); } +#ifdef CONFIG_BLK_DEV_ZONED +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) +{ + struct block_device *bdev = FDEV(devi).bdev; + sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t sector = 0; + struct blk_zone *zones; + unsigned int i, nr_zones; + unsigned int n = 0; + int err = -EIO; + + if (!f2fs_sb_mounted_blkzoned(sbi->sb)) + return 0; + + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != + SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) + return -EINVAL; + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); + if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != + __ilog2_u32(sbi->blocks_per_blkz)) + return -EINVAL; + sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); + FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; + if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) + FDEV(devi).nr_blkz++; + + FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); + if (!FDEV(devi).blkz_type) + return -ENOMEM; + 
+#define F2FS_REPORT_NR_ZONES 4096 + + zones = kcalloc(F2FS_REPORT_NR_ZONES, sizeof(struct blk_zone), + GFP_KERNEL); + if (!zones) + return -ENOMEM; + + /* Get block zones type */ + while (zones && sector < nr_sectors) { + + nr_zones = F2FS_REPORT_NR_ZONES; + err = blkdev_report_zones(bdev, sector, + zones, &nr_zones, + GFP_KERNEL); + if (err) + break; + if (!nr_zones) { + err = -EIO; + break; + } + + for (i = 0; i < nr_zones; i++) { + FDEV(devi).blkz_type[n] = zones[i].type; + sector += zones[i].len; + n++; + } + } + + kfree(zones); + + return err; +} +#endif + /* * Read f2fs raw super block. * Because we have two copies of super block, so read both of them @@ -1594,6 +1695,77 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +static int f2fs_scan_devices(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int i; + + for (i = 0; i < MAX_DEVICES; i++) { + if (!RDEV(i).path[0]) + return 0; + + if (i == 0) { + sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) * + MAX_DEVICES, GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; + } + + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + if (IS_ERR(FDEV(i).bdev)) + return PTR_ERR(FDEV(i).bdev); + + /* to release errored devices */ + sbi->s_ndevs = i + 1; + +#ifdef CONFIG_BLK_DEV_ZONED + if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && + !f2fs_sb_mounted_blkzoned(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + return -EINVAL; + } + if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { + if (init_blkz_info(sbi, i)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); + return -EINVAL; + } + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk, + bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ? + "Host-aware" : "Host-managed"); + continue; + } +#endif + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk); + } + return 0; +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -1641,6 +1813,18 @@ try_onemore: sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. 
+ */ +#ifndef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device support is not enabled\n"); + goto free_sb_buf; + } +#endif default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1710,6 +1894,13 @@ try_onemore: goto free_meta_inode; } + /* Initialize device list */ + err = f2fs_scan_devices(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to find devices"); + goto free_devices; + } + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, @@ -1893,12 +2084,21 @@ free_node_inode: mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); free_nm: destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); +free_devices: + destroy_device_list(sbi); kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); @@ -2044,3 +2244,4 @@ module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); + diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3e1c0280f866..c47ce2f330a1 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -106,7 +106,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); return 0; } @@ -554,7 +554,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: diff --git a/fs/fcntl.c b/fs/fcntl.c index 350a2c8cfd28..e1c54f20325c 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -25,7 +25,7 @@ #include <asm/poll.h> #include <asm/siginfo.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME) @@ -52,7 +52,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg) arg |= O_NONBLOCK; /* Pipe packetized mode is controlled by O_DIRECT flag */ - if (!S_ISFIFO(filp->f_inode->i_mode) && (arg & O_DIRECT)) { + if (!S_ISFIFO(inode->i_mode) && (arg & O_DIRECT)) { if (!filp->f_mapping || !filp->f_mapping->a_ops || !filp->f_mapping->a_ops->direct_IO) return -EINVAL; diff --git a/fs/fhandle.c b/fs/fhandle.c index ca3c3dd01789..5559168d5637 100644 --- a/fs/fhandle.c +++ b/fs/fhandle.c @@ -8,7 +8,7 @@ #include <linux/fs_struct.h> #include <linux/fsnotify.h> #include <linux/personality.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" #include "mount.h" diff --git a/fs/file_table.c b/fs/file_table.c index ad17e05ebf95..6d982b57de92 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -155,7 +155,7 @@ over: * @mode: the mode with which the new file will be opened * @fop: the 'struct file_operations' for the new file */ -struct file *alloc_file(struct path 
*path, fmode_t mode, +struct file *alloc_file(const struct path *path, fmode_t mode, const struct file_operations *fop) { struct file *file; diff --git a/fs/filesystems.c b/fs/filesystems.c index c5618db110be..cac75547d35c 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -14,7 +14,7 @@ #include <linux/init.h> #include <linux/module.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* * Handling of filesystem drivers list. diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 05713a5da083..ef600591d96f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1769,15 +1769,13 @@ static long wb_writeback(struct bdi_writeback *wb, * become available for writeback. Otherwise * we'll just busyloop. */ - if (!list_empty(&wb->b_more_io)) { - trace_writeback_wait(wb, work); - inode = wb_inode(wb->b_more_io.prev); - spin_lock(&inode->i_lock); - spin_unlock(&wb->list_lock); - /* This function drops i_lock... */ - inode_sleep_on_writeback(inode); - spin_lock(&wb->list_lock); - } + trace_writeback_wait(wb, work); + inode = wb_inode(wb->b_more_io.prev); + spin_lock(&inode->i_lock); + spin_unlock(&wb->list_lock); + /* This function drops i_lock... */ + inode_sleep_on_writeback(inode); + spin_lock(&wb->list_lock); } spin_unlock(&wb->list_lock); blk_finish_plug(&plug); diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 70ea57c7b6bb..4e06a27ed7f8 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2025,7 +2025,6 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) struct fuse_req *req; req = list_entry(head->next, struct fuse_req, list); req->out.h.error = -ECONNABORTED; - clear_bit(FR_PENDING, &req->flags); clear_bit(FR_SENT, &req->flags); list_del_init(&req->list); request_end(fc, req); @@ -2103,6 +2102,8 @@ void fuse_abort_conn(struct fuse_conn *fc) spin_lock(&fiq->waitq.lock); fiq->connected = 0; list_splice_init(&fiq->pending, &to_end2); + list_for_each_entry(req, &to_end2, list) + clear_bit(FR_PENDING, &req->flags); while (forget_pending(fiq)) kfree(dequeue_forget(fiq, 1, NULL)); wake_up_all_locked(&fiq->waitq); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index b3ebe512d64c..811fd8929a18 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -68,7 +68,7 @@ static u64 time_to_jiffies(u64 sec, u32 nsec) if (sec || nsec) { struct timespec64 ts = { sec, - max_t(u32, nsec, NSEC_PER_SEC - 1) + min_t(u32, nsec, NSEC_PER_SEC - 1) }; return get_jiffies_64() + timespec64_to_jiffies(&ts); @@ -1739,8 +1739,6 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) * This should be done on write(), truncate() and chown(). */ if (!fc->handle_killpriv) { - int kill; - /* * ia_mode calculation may have used stale i_mode. * Refresh and recalculate. 
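The fuse_setattr() hunk that follows open-codes the decision should_remove_suid() used to make: when a write must drop privileges, S_ISUID is cleared unconditionally, while S_ISGID is cleared only if group-execute is also set. A standalone sketch of that rule (kill_priv() is a hypothetical helper for illustration, not a kernel function):

#include <stdio.h>
#include <sys/stat.h>

/* Sketch of the kill-priv rule: drop setuid always; drop setgid only
 * together with S_IXGRP, since setgid without group-exec conventionally
 * marks mandatory locking rather than a privilege. */
static mode_t kill_priv(mode_t mode)
{
        if (mode & S_ISUID)
                mode &= ~S_ISUID;
        if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP))
                mode &= ~S_ISGID;
        return mode;
}

int main(void)
{
        printf("%o\n", kill_priv(06755)); /* 755: both bits dropped */
        printf("%o\n", kill_priv(02644)); /* 2644: setgid kept, no group-exec */
        return 0;
}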
@@ -1750,12 +1748,11 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) return ret; attr->ia_mode = inode->i_mode; - kill = should_remove_suid(entry); - if (kill & ATTR_KILL_SUID) { + if (inode->i_mode & S_ISUID) { attr->ia_valid |= ATTR_MODE; attr->ia_mode &= ~S_ISUID; } - if (kill & ATTR_KILL_SGID) { + if ((inode->i_mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) { attr->ia_valid |= ATTR_MODE; attr->ia_mode &= ~S_ISGID; } @@ -1834,7 +1831,6 @@ static const struct inode_operations fuse_common_inode_operations = { static const struct inode_operations fuse_symlink_inode_operations = { .setattr = fuse_setattr, .get_link = fuse_get_link, - .readlink = generic_readlink, .getattr = fuse_getattr, .listxattr = fuse_listxattr, }; diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 5a6f52ea2722..6b039d7ce160 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -839,12 +839,10 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, BUG_ON((pos + len) > (dibh->b_size - sizeof(struct gfs2_dinode))); kaddr = kmap_atomic(page); memcpy(buf + pos, kaddr + pos, copied); - memset(kaddr + pos + copied, 0, len - copied); flush_dcache_page(page); kunmap_atomic(kaddr); - if (!PageUptodate(page)) - SetPageUptodate(page); + WARN_ON(!PageUptodate(page)); unlock_page(page); put_page(page); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 3cdde5f5d399..79113219be5f 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -62,6 +62,7 @@ #include <linux/gfs2_ondisk.h> #include <linux/crc32.h> #include <linux/vmalloc.h> +#include <linux/bio.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index e23ff70b3435..016c11eaca7c 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -22,7 +22,7 @@ #include <linux/swap.h> #include <linux/crc32.h> #include <linux/writeback.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/dlm.h> #include <linux/dlm_plock.h> #include <linux/delay.h> diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 14cbf60167a7..94f50cac91c6 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -21,7 +21,7 @@ #include <linux/list.h> #include <linux/wait.h> #include <linux/module.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/debugfs.h> #include <linux/kthread.h> @@ -695,7 +695,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, gl->gl_target = LM_ST_UNLOCKED; gl->gl_demote_state = LM_ST_EXCLUSIVE; gl->gl_ops = glops; - gl->gl_dstamp = ktime_set(0, 0); + gl->gl_dstamp = 0; preempt_disable(); /* We use the global stats to estimate the initial per-glock stats */ gl->gl_stats = this_cpu_ptr(sdp->sd_lkstats)->lkstats[glops->go_type]; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index fe3f84995c48..eb7724b8578a 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -19,7 +19,7 @@ #include <linux/crc32.h> #include <linux/fiemap.h> #include <linux/security.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "gfs2.h" #include "incore.h" @@ -2067,7 +2067,6 @@ const struct inode_operations gfs2_dir_iops = { }; const struct inode_operations gfs2_symlink_iops = { - .readlink = generic_readlink, .get_link = gfs2_get_link, .permission = gfs2_permission, .setattr = gfs2_setattr, diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index e58ccef09c91..27c00a16def0 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -657,7 +657,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) struct gfs2_log_header *lh; unsigned int tail; u32 hash; - int op_flags = 
WRITE_FLUSH_FUA | REQ_META; + int op_flags = REQ_PREFLUSH | REQ_FUA | REQ_META; struct page *page = mempool_alloc(gfs2_page_pool, GFP_NOIO); enum gfs2_freeze_state state = atomic_read(&sdp->sd_freeze_state); lh = page_address(page); @@ -682,7 +682,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) { gfs2_ordered_wait(sdp); log_flush_wait(sdp); - op_flags = WRITE_SYNC | REQ_META | REQ_PRIO; + op_flags = REQ_SYNC | REQ_META | REQ_PRIO; } sdp->sd_log_idle = (tail == sdp->sd_log_flush_head); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 49d5a1b61b06..b1f9144b42c7 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -231,7 +231,7 @@ static void gfs2_end_log_write(struct bio *bio) * gfs2_log_flush_bio - Submit any pending log bio * @sdp: The superblock * @op: REQ_OP - * @op_flags: rq_flag_bits + * @op_flags: req_flag_bits * * Submit any pending part-built or full bio to the block device. If * there is no pending bio, then this is a no-op. diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 373639a59782..49db8ef13fdf 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -37,8 +37,7 @@ static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wb { struct buffer_head *bh, *head; int nr_underway = 0; - int write_flags = REQ_META | REQ_PRIO | - (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); + int write_flags = REQ_META | REQ_PRIO | wbc_to_write_flags(wbc); BUG_ON(!PageLocked(page)); BUG_ON(!page_has_buffers(page)); @@ -285,7 +284,7 @@ int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, } } - gfs2_submit_bhs(REQ_OP_READ, READ_SYNC | REQ_META | REQ_PRIO, bhs, num); + gfs2_submit_bhs(REQ_OP_READ, REQ_META | REQ_PRIO, bhs, num); if (!(flags & DIO_WAIT)) return 0; @@ -453,7 +452,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(REQ_OP_READ, READ_SYNC | REQ_META, 1, &first_bh); + ll_rw_block(REQ_OP_READ, REQ_META, 1, &first_bh); dblock++; extlen--; diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index ff72ac6439c8..a34308df927f 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -246,7 +246,7 @@ static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) bio->bi_end_io = end_bio_io_page; bio->bi_private = page; - bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC | REQ_META); + bio_set_op_attrs(bio, REQ_OP_READ, REQ_META); submit_bio(bio); wait_on_page_locked(page); bio_put(bio); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index c9ff1cf7d4f3..f8d30e41d1d3 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -15,7 +15,7 @@ #include <linux/buffer_head.h> #include <linux/module.h> #include <linux/kobject.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/gfs2_ondisk.h> #include <linux/genhd.h> diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index aee4485ad8a9..763d659db91b 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -14,7 +14,7 @@ #include <linux/buffer_head.h> #include <linux/crc32.h> #include <linux/gfs2_ondisk.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "gfs2.h" #include "incore.h" diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index a4a577088d19..d87721aeb575 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -14,7 +14,7 @@ #include <linux/xattr.h> #include <linux/gfs2_ondisk.h> #include <linux/posix_acl_xattr.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "gfs2.h" #include 
"incore.h" diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 4cdec5a19347..6d0783e2e276 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -23,7 +23,7 @@ #include <linux/workqueue.h> #include <asm/byteorder.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "hfs.h" diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 99627f8a0a18..0a156d84e67d 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -16,7 +16,7 @@ #include <linux/fs.h> #include <linux/mount.h> #include <linux/sched.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "hfsplus_fs.h" /* diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c index 11854dd84572..67aedf4c2e7c 100644 --- a/fs/hfsplus/super.c +++ b/fs/hfsplus/super.c @@ -221,7 +221,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + HFSPLUS_VOLHEAD_SECTOR, sbi->s_vhdr_buf, NULL, REQ_OP_WRITE, - WRITE_SYNC); + REQ_SYNC); if (!error) error = error2; if (!write_backup) @@ -230,7 +230,7 @@ static int hfsplus_sync_fs(struct super_block *sb, int wait) error2 = hfsplus_submit_bio(sb, sbi->part_start + sbi->sect_count - 2, sbi->s_backup_vhdr_buf, NULL, REQ_OP_WRITE, - WRITE_SYNC); + REQ_SYNC); if (!error) error2 = error; out: diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 23e15ea53e45..e61261a7417e 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -920,7 +920,6 @@ static const char *hostfs_get_link(struct dentry *dentry, } static const struct inode_operations hostfs_link_iops = { - .readlink = generic_readlink, .get_link = hostfs_get_link, }; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 4fb7b10f3a05..54de77e78775 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -37,7 +37,7 @@ #include <linux/migrate.h> #include <linux/uio.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> static const struct super_operations hugetlbfs_ops; static const struct address_space_operations hugetlbfs_aops; diff --git a/fs/internal.h b/fs/internal.h index f4da3341b4a3..b63cf3af2dc2 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -62,7 +62,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, extern void *copy_mount_options(const void __user *); extern char *copy_mount_string(const void __user *); -extern struct vfsmount *lookup_mnt(struct path *); +extern struct vfsmount *lookup_mnt(const struct path *); extern int finish_automount(struct vfsmount *, struct path *); extern int sb_prepare_remount_readonly(struct super_block *); @@ -184,3 +184,6 @@ typedef loff_t (*iomap_actor_t)(struct inode *inode, loff_t pos, loff_t len, loff_t iomap_apply(struct inode *inode, loff_t pos, loff_t length, unsigned flags, struct iomap_ops *ops, void *data, iomap_actor_t actor); + +/* direct-io.c: */ +int sb_init_dio_done_wq(struct super_block *sb); diff --git a/fs/ioctl.c b/fs/ioctl.c index c415668c86d4..cb9b02940805 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -223,7 +223,11 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, if (!src_file.file) return -EBADF; - ret = vfs_clone_file_range(src_file.file, off, dst_file, destoff, olen); + ret = -EXDEV; + if (src_file.file->f_path.mnt != dst_file->f_path.mnt) + goto fdput; + ret = do_clone_file_range(src_file.file, off, dst_file, destoff, olen); +fdput: fdput(src_file); return ret; } diff --git a/fs/iomap.c b/fs/iomap.c index a8ee8c33ca78..354a123f170e 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -24,6 +24,7 @@ #include <linux/uio.h> #include 
<linux/backing-dev.h> #include <linux/buffer_head.h> +#include <linux/task_io_accounting_ops.h> #include <linux/dax.h> #include "internal.h" @@ -467,8 +468,9 @@ int iomap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, offset = page_offset(page); while (length > 0) { - ret = iomap_apply(inode, offset, length, IOMAP_WRITE, - ops, page, iomap_page_mkwrite_actor); + ret = iomap_apply(inode, offset, length, + IOMAP_WRITE | IOMAP_FAULT, ops, page, + iomap_page_mkwrite_actor); if (unlikely(ret <= 0)) goto out_unlock; offset += ret; @@ -583,3 +585,375 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, return 0; } EXPORT_SYMBOL_GPL(iomap_fiemap); + +/* + * Private flags for iomap_dio, must not overlap with the public ones in + * iomap.h: + */ +#define IOMAP_DIO_WRITE (1 << 30) +#define IOMAP_DIO_DIRTY (1 << 31) + +struct iomap_dio { + struct kiocb *iocb; + iomap_dio_end_io_t *end_io; + loff_t i_size; + loff_t size; + atomic_t ref; + unsigned flags; + int error; + + union { + /* used during submission and for synchronous completion: */ + struct { + struct iov_iter *iter; + struct task_struct *waiter; + struct request_queue *last_queue; + blk_qc_t cookie; + } submit; + + /* used for aio completion: */ + struct { + struct work_struct work; + } aio; + }; +}; + +static ssize_t iomap_dio_complete(struct iomap_dio *dio) +{ + struct kiocb *iocb = dio->iocb; + ssize_t ret; + + if (dio->end_io) { + ret = dio->end_io(iocb, + dio->error ? dio->error : dio->size, + dio->flags); + } else { + ret = dio->error; + } + + if (likely(!ret)) { + ret = dio->size; + /* check for short read */ + if (iocb->ki_pos + ret > dio->i_size && + !(dio->flags & IOMAP_DIO_WRITE)) + ret = dio->i_size - iocb->ki_pos; + iocb->ki_pos += ret; + } + + inode_dio_end(file_inode(iocb->ki_filp)); + kfree(dio); + + return ret; +} + +static void iomap_dio_complete_work(struct work_struct *work) +{ + struct iomap_dio *dio = container_of(work, struct iomap_dio, aio.work); + struct kiocb *iocb = dio->iocb; + bool is_write = (dio->flags & IOMAP_DIO_WRITE); + ssize_t ret; + + ret = iomap_dio_complete(dio); + if (is_write && ret > 0) + ret = generic_write_sync(iocb, ret); + iocb->ki_complete(iocb, ret, 0); +} + +/* + * Set an error in the dio if none is set yet. We have to use cmpxchg + * as the submission context and the completion context(s) can race to + * update the error. 
+ */ +static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) +{ + cmpxchg(&dio->error, 0, ret); +} + +static void iomap_dio_bio_end_io(struct bio *bio) +{ + struct iomap_dio *dio = bio->bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + + if (bio->bi_error) + iomap_dio_set_error(dio, bio->bi_error); + + if (atomic_dec_and_test(&dio->ref)) { + if (is_sync_kiocb(dio->iocb)) { + struct task_struct *waiter = dio->submit.waiter; + + WRITE_ONCE(dio->submit.waiter, NULL); + wake_up_process(waiter); + } else if (dio->flags & IOMAP_DIO_WRITE) { + struct inode *inode = file_inode(dio->iocb->ki_filp); + + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); + } else { + iomap_dio_complete_work(&dio->aio.work); + } + } + + if (should_dirty) { + bio_check_pages_dirty(bio); + } else { + struct bio_vec *bvec; + int i; + + bio_for_each_segment_all(bvec, bio, i) + put_page(bvec->bv_page); + bio_put(bio); + } +} + +static blk_qc_t +iomap_dio_zero(struct iomap_dio *dio, struct iomap *iomap, loff_t pos, + unsigned len) +{ + struct page *page = ZERO_PAGE(0); + struct bio *bio; + + bio = bio_alloc(GFP_KERNEL, 1); + bio->bi_bdev = iomap->bdev; + bio->bi_iter.bi_sector = + iomap->blkno + ((pos - iomap->offset) >> 9); + bio->bi_private = dio; + bio->bi_end_io = iomap_dio_bio_end_io; + + get_page(page); + if (bio_add_page(bio, page, len, 0) != len) + BUG(); + bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); + + atomic_inc(&dio->ref); + return submit_bio(bio); +} + +static loff_t +iomap_dio_actor(struct inode *inode, loff_t pos, loff_t length, + void *data, struct iomap *iomap) +{ + struct iomap_dio *dio = data; + unsigned blkbits = blksize_bits(bdev_logical_block_size(iomap->bdev)); + unsigned fs_block_size = (1 << inode->i_blkbits), pad; + unsigned align = iov_iter_alignment(dio->submit.iter); + struct iov_iter iter; + struct bio *bio; + bool need_zeroout = false; + int nr_pages, ret; + + if ((pos | length | align) & ((1 << blkbits) - 1)) + return -EINVAL; + + switch (iomap->type) { + case IOMAP_HOLE: + if (WARN_ON_ONCE(dio->flags & IOMAP_DIO_WRITE)) + return -EIO; + /*FALLTHRU*/ + case IOMAP_UNWRITTEN: + if (!(dio->flags & IOMAP_DIO_WRITE)) { + iov_iter_zero(length, dio->submit.iter); + dio->size += length; + return length; + } + dio->flags |= IOMAP_DIO_UNWRITTEN; + need_zeroout = true; + break; + case IOMAP_MAPPED: + if (iomap->flags & IOMAP_F_SHARED) + dio->flags |= IOMAP_DIO_COW; + if (iomap->flags & IOMAP_F_NEW) + need_zeroout = true; + break; + default: + WARN_ON_ONCE(1); + return -EIO; + } + + /* + * Operate on a partial iter trimmed to the extent we were called for. + * We'll update the iter in the dio once we're done with this extent. 
+ */ + iter = *dio->submit.iter; + iov_iter_truncate(&iter, length); + + nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); + if (nr_pages <= 0) + return nr_pages; + + if (need_zeroout) { + /* zero out from the start of the block to the write offset */ + pad = pos & (fs_block_size - 1); + if (pad) + iomap_dio_zero(dio, iomap, pos - pad, pad); + } + + do { + if (dio->error) + return 0; + + bio = bio_alloc(GFP_KERNEL, nr_pages); + bio->bi_bdev = iomap->bdev; + bio->bi_iter.bi_sector = + iomap->blkno + ((pos - iomap->offset) >> 9); + bio->bi_private = dio; + bio->bi_end_io = iomap_dio_bio_end_io; + + ret = bio_iov_iter_get_pages(bio, &iter); + if (unlikely(ret)) { + bio_put(bio); + return ret; + } + + if (dio->flags & IOMAP_DIO_WRITE) { + bio_set_op_attrs(bio, REQ_OP_WRITE, REQ_SYNC | REQ_IDLE); + task_io_account_write(bio->bi_iter.bi_size); + } else { + bio_set_op_attrs(bio, REQ_OP_READ, 0); + if (dio->flags & IOMAP_DIO_DIRTY) + bio_set_pages_dirty(bio); + } + + dio->size += bio->bi_iter.bi_size; + pos += bio->bi_iter.bi_size; + + nr_pages = iov_iter_npages(&iter, BIO_MAX_PAGES); + + atomic_inc(&dio->ref); + + dio->submit.last_queue = bdev_get_queue(iomap->bdev); + dio->submit.cookie = submit_bio(bio); + } while (nr_pages); + + if (need_zeroout) { + /* zero out from the end of the write to the end of the block */ + pad = pos & (fs_block_size - 1); + if (pad) + iomap_dio_zero(dio, iomap, pos, fs_block_size - pad); + } + + iov_iter_advance(dio->submit.iter, length); + return length; +} + +ssize_t +iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, struct iomap_ops *ops, + iomap_dio_end_io_t end_io) +{ + struct address_space *mapping = iocb->ki_filp->f_mapping; + struct inode *inode = file_inode(iocb->ki_filp); + size_t count = iov_iter_count(iter); + loff_t pos = iocb->ki_pos, end = iocb->ki_pos + count - 1, ret = 0; + unsigned int flags = IOMAP_DIRECT; + struct blk_plug plug; + struct iomap_dio *dio; + + lockdep_assert_held(&inode->i_rwsem); + + if (!count) + return 0; + + dio = kmalloc(sizeof(*dio), GFP_KERNEL); + if (!dio) + return -ENOMEM; + + dio->iocb = iocb; + atomic_set(&dio->ref, 1); + dio->size = 0; + dio->i_size = i_size_read(inode); + dio->end_io = end_io; + dio->error = 0; + dio->flags = 0; + + dio->submit.iter = iter; + if (is_sync_kiocb(iocb)) { + dio->submit.waiter = current; + dio->submit.cookie = BLK_QC_T_NONE; + dio->submit.last_queue = NULL; + } + + if (iov_iter_rw(iter) == READ) { + if (pos >= dio->i_size) + goto out_free_dio; + + if (iter->type == ITER_IOVEC) + dio->flags |= IOMAP_DIO_DIRTY; + } else { + dio->flags |= IOMAP_DIO_WRITE; + flags |= IOMAP_WRITE; + } + + if (mapping->nrpages) { + ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); + if (ret) + goto out_free_dio; + + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + ret = 0; + } + + inode_dio_begin(inode); + + blk_start_plug(&plug); + do { + ret = iomap_apply(inode, pos, count, flags, ops, dio, + iomap_dio_actor); + if (ret <= 0) { + /* magic error code to fall back to buffered I/O */ + if (ret == -ENOTBLK) + ret = 0; + break; + } + pos += ret; + } while ((count = iov_iter_count(iter)) > 0); + blk_finish_plug(&plug); + + if (ret < 0) + iomap_dio_set_error(dio, ret); + + if (ret >= 0 && iov_iter_rw(iter) == WRITE && !is_sync_kiocb(iocb) && + !inode->i_sb->s_dio_done_wq) { + ret = sb_init_dio_done_wq(inode->i_sb); + if (ret < 0) + iomap_dio_set_error(dio, ret); + } + + if (!atomic_dec_and_test(&dio->ref)) { + if 
(!is_sync_kiocb(iocb)) + return -EIOCBQUEUED; + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->submit.waiter)) + break; + + if (!(iocb->ki_flags & IOCB_HIPRI) || + !dio->submit.last_queue || + !blk_mq_poll(dio->submit.last_queue, + dio->submit.cookie)) + io_schedule(); + } + __set_current_state(TASK_RUNNING); + } + + /* + * Try again to invalidate clean pages which might have been cached by + * non-direct readahead, or faulted in by get_user_pages() if the source + * of the write was an mmap'ed region of the file we're writing. Either + * one is a pretty crazy thing to do, so we don't support it 100%. If + * this invalidation fails, tough, the write still worked... + */ + if (iov_iter_rw(iter) == WRITE && mapping->nrpages) { + ret = invalidate_inode_pages2_range(mapping, + iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); + WARN_ON_ONCE(ret); + } + + return iomap_dio_complete(dio); + +out_free_dio: + kfree(dio); + return ret; +} +EXPORT_SYMBOL_GPL(iomap_dio_rw); diff --git a/fs/isofs/compress.c b/fs/isofs/compress.c index 44af14b2e916..9bb2fe35799d 100644 --- a/fs/isofs/compress.c +++ b/fs/isofs/compress.c @@ -18,6 +18,7 @@ #include <linux/module.h> #include <linux/init.h> +#include <linux/bio.h> #include <linux/vmalloc.h> #include <linux/zlib.h> diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 98b3eb7d8eaf..0ec137310320 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -377,9 +377,9 @@ repeat: { int p; for (p = 0; p < rr->u.ER.len_id; p++) - printk("%c", rr->u.ER.data[p]); + printk(KERN_CONT "%c", rr->u.ER.data[p]); } - printk("\n"); + printk(KERN_CONT "\n"); break; case SIG('P', 'X'): inode->i_mode = isonum_733(rr->u.PX.mode); diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 684996c8a3a4..4055f51617ef 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -186,7 +186,7 @@ __flush_batch(journal_t *journal, int *batch_count) blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(journal->j_chkpt_bhs[i], WRITE_SYNC); + write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC); blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 31f8ca046639..8c514367ba5a 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -155,9 +155,10 @@ static int journal_submit_commit_record(journal_t *journal, if (journal->j_flags & JBD2_BARRIER && !jbd2_has_feature_async_commit(journal)) - ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC | WRITE_FLUSH_FUA, bh); + ret = submit_bh(REQ_OP_WRITE, + REQ_SYNC | REQ_PREFLUSH | REQ_FUA, bh); else - ret = submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); + ret = submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); *cbh = bh; return ret; @@ -402,7 +403,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, journal->j_tail, - WRITE_SYNC); + REQ_SYNC); mutex_unlock(&journal->j_checkpoint_mutex); } else { jbd_debug(3, "superblock not updated\n"); @@ -717,7 +718,7 @@ start_journal_io: clear_buffer_dirty(bh); set_buffer_uptodate(bh); bh->b_end_io = journal_end_buffer_io_sync; - submit_bh(REQ_OP_WRITE, WRITE_SYNC, bh); + submit_bh(REQ_OP_WRITE, REQ_SYNC, bh); } cond_resched(); stats.run.rs_blocks_logged += bufs; diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 927da4956a89..a097048ed1a3 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -47,7 +47,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/jbd2.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include 
<asm/page.h> #ifdef CONFIG_JBD2_DEBUG @@ -913,7 +913,7 @@ int __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block) * space and if we lose sb update during power failure we'd replay * old transaction with possibly newly overwritten data. */ - ret = jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA); + ret = jbd2_journal_update_sb_log_tail(journal, tid, block, REQ_FUA); if (ret) goto out; @@ -1306,7 +1306,7 @@ static int journal_reset(journal_t *journal) /* Lock here to make assertions happy... */ mutex_lock(&journal->j_checkpoint_mutex); /* - * Update log tail information. We use WRITE_FUA since new + * Update log tail information. We use REQ_FUA since new * transaction will start reusing journal space and so we * must make sure information about current log tail is on * disk before that. @@ -1314,7 +1314,7 @@ static int journal_reset(journal_t *journal) jbd2_journal_update_sb_log_tail(journal, journal->j_tail_sequence, journal->j_tail, - WRITE_FUA); + REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } return jbd2_journal_start_thread(journal); @@ -1454,7 +1454,7 @@ void jbd2_journal_update_sb_errno(journal_t *journal) sb->s_errno = cpu_to_be32(journal->j_errno); read_unlock(&journal->j_state_lock); - jbd2_write_superblock(journal, WRITE_FUA); + jbd2_write_superblock(journal, REQ_FUA); } EXPORT_SYMBOL(jbd2_journal_update_sb_errno); @@ -1720,7 +1720,8 @@ int jbd2_journal_destroy(journal_t *journal) ++journal->j_transaction_sequence; write_unlock(&journal->j_state_lock); - jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA); + jbd2_mark_journal_empty(journal, + REQ_PREFLUSH | REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } else err = -EIO; @@ -1979,7 +1980,7 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - jbd2_mark_journal_empty(journal, WRITE_FUA); + jbd2_mark_journal_empty(journal, REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); write_lock(&journal->j_state_lock); J_ASSERT(!journal->j_running_transaction); @@ -2025,7 +2026,7 @@ int jbd2_journal_wipe(journal_t *journal, int write) if (write) { /* Lock to make assertions happy... 
*/ mutex_lock(&journal->j_checkpoint_mutex); - jbd2_mark_journal_empty(journal, WRITE_FUA); + jbd2_mark_journal_empty(journal, REQ_FUA); mutex_unlock(&journal->j_checkpoint_mutex); } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index 91171dc352cb..cfc38b552118 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -648,7 +648,7 @@ static void flush_descriptor(journal_t *journal, set_buffer_jwrite(descriptor); BUFFER_TRACE(descriptor, "write"); set_buffer_dirty(descriptor); - write_dirty_buffer(descriptor, WRITE_SYNC); + write_dirty_buffer(descriptor, REQ_SYNC); } #endif diff --git a/fs/jffs2/symlink.c b/fs/jffs2/symlink.c index 8f3f0855fcd2..d2fa138a868c 100644 --- a/fs/jffs2/symlink.c +++ b/fs/jffs2/symlink.c @@ -13,7 +13,6 @@ const struct inode_operations jffs2_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = simple_get_link, .setattr = jffs2_setattr, .listxattr = jffs2_listxattr, diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 8653cac7e12e..fc89f9436784 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -13,7 +13,7 @@ #include <linux/sched.h> #include <linux/blkdev.h> #include <asm/current.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "jfs_filsys.h" #include "jfs_debug.h" @@ -121,7 +121,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) jfs_set_inode_flags(inode); inode_unlock(inode); - inode->i_ctime = CURRENT_TIME_SEC; + inode->i_ctime = current_time(inode); mark_inode_dirty(inode); setflags_out: mnt_drop_write_file(filp); diff --git a/fs/jfs/jfs_debug.c b/fs/jfs/jfs_debug.c index a37eb5f8cbc0..a70907606025 100644 --- a/fs/jfs/jfs_debug.c +++ b/fs/jfs/jfs_debug.c @@ -22,7 +22,7 @@ #include <linux/module.h> #include <linux/proc_fs.h> #include <linux/seq_file.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "jfs_incore.h" #include "jfs_filsys.h" #include "jfs_debug.h" diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index a21ea8b3e5fa..bb1da1feafeb 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -2002,7 +2002,7 @@ static int lbmRead(struct jfs_log * log, int pn, struct lbuf ** bpp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; - bio_set_op_attrs(bio, REQ_OP_READ, READ_SYNC); + bio->bi_opf = REQ_OP_READ; /*check if journaling to disk has been disabled*/ if (log->no_integrity) { bio->bi_iter.bi_size = 0; @@ -2146,7 +2146,7 @@ static void lbmStartIO(struct lbuf * bp) bio->bi_end_io = lbmIODone; bio->bi_private = bp; - bio_set_op_attrs(bio, REQ_OP_WRITE, WRITE_SYNC); + bio->bi_opf = REQ_OP_WRITE | REQ_SYNC; /* check if journaling to disk has been disabled */ if (log->no_integrity) { diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 85671f7f8518..2be7c9ce6663 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -31,7 +31,7 @@ #include <linux/exportfs.h> #include <linux/crc32.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/seq_file.h> #include <linux/blkdev.h> diff --git a/fs/jfs/symlink.c b/fs/jfs/symlink.c index c82404fee6cd..38320607993e 100644 --- a/fs/jfs/symlink.c +++ b/fs/jfs/symlink.c @@ -22,14 +22,12 @@ #include "jfs_xattr.h" const struct inode_operations jfs_fast_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = simple_get_link, .setattr = jfs_setattr, .listxattr = jfs_listxattr, }; const struct inode_operations jfs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .setattr = jfs_setattr, .listxattr = jfs_listxattr, diff --git a/fs/kernfs/inode.c 
b/fs/kernfs/inode.c index a1982118f92f..ac9e108ce1ea 100644 --- a/fs/kernfs/inode.c +++ b/fs/kernfs/inode.c @@ -335,7 +335,7 @@ static int kernfs_xattr_set(const struct xattr_handler *handler, return simple_xattr_set(&attrs->xattrs, name, value, size, flags); } -const struct xattr_handler kernfs_trusted_xattr_handler = { +static const struct xattr_handler kernfs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, .get = kernfs_xattr_get, .set = kernfs_xattr_set, @@ -372,7 +372,7 @@ static int kernfs_security_xattr_set(const struct xattr_handler *handler, return error; } -const struct xattr_handler kernfs_security_xattr_handler = { +static const struct xattr_handler kernfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, .get = kernfs_xattr_get, .set = kernfs_security_xattr_set, diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 9b43ca02b7ab..1684af4a8b9b 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -135,7 +135,6 @@ static const char *kernfs_iop_get_link(struct dentry *dentry, const struct inode_operations kernfs_symlink_iops = { .listxattr = kernfs_iop_listxattr, - .readlink = generic_readlink, .get_link = kernfs_iop_get_link, .setattr = kernfs_iop_setattr, .getattr = kernfs_iop_getattr, diff --git a/fs/libfs.c b/fs/libfs.c index 48826d4da189..28d6f35feed6 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -16,7 +16,7 @@ #include <linux/writeback.h> #include <linux/buffer_head.h> /* sync_mapping_buffers */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" @@ -245,7 +245,8 @@ struct dentry *mount_pseudo_xattr(struct file_system_type *fs_type, char *name, struct inode *root; struct qstr d_name = QSTR_INIT(name, strlen(name)); - s = sget(fs_type, NULL, set_anon_super, MS_NOUSER, NULL); + s = sget_userns(fs_type, NULL, set_anon_super, MS_KERNMOUNT|MS_NOUSER, + &init_user_ns, NULL); if (IS_ERR(s)) return ERR_CAST(s); @@ -465,6 +466,8 @@ EXPORT_SYMBOL(simple_write_begin); * is not called, so a filesystem that actually does store data in .write_inode * should extend on what's done here with a call to mark_inode_dirty() in the * case that i_size has changed. + * + * Use *ONLY* with simple_readpage() */ int simple_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, @@ -474,14 +477,14 @@ int simple_write_end(struct file *file, struct address_space *mapping, loff_t last_pos = pos + copied; /* zero the stale part of the page if we did a short copy */ - if (copied < len) { - unsigned from = pos & (PAGE_SIZE - 1); - - zero_user(page, from + copied, len - copied); - } + if (!PageUptodate(page)) { + if (copied < len) { + unsigned from = pos & (PAGE_SIZE - 1); - if (!PageUptodate(page)) + zero_user(page, from + copied, len - copied); + } SetPageUptodate(page); + } /* * No need to use i_size_read() here, the i_size * cannot change under us because we hold the i_mutex. 
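A note on the simple_write_end() change above: if ->write_begin did not bring the page uptodate, a short copy from userspace leaves an uninitialized gap between the end of the copied bytes and the end of the write, so the zeroing now happens only on the !PageUptodate path, before SetPageUptodate(). A condensed sketch of the resulting rule, compile-untested; example_write_end is an invented name, and the fsdata/return-value plumbing of a real ->write_end is elided:

#include <linux/fs.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>

static int example_write_end(struct inode *inode, struct page *page,
			     loff_t pos, unsigned len, unsigned copied)
{
	loff_t last_pos = pos + copied;

	if (!PageUptodate(page)) {
		if (copied < len) {
			/* Short copy into a never-read page: zero the
			 * uninitialized tail before declaring the whole
			 * page uptodate. */
			unsigned from = pos & (PAGE_SIZE - 1);

			zero_user(page, from + copied, len - copied);
		}
		SetPageUptodate(page);
	}
	if (last_pos > inode->i_size)	/* stable: caller holds the inode lock */
		i_size_write(inode, last_pos);
	set_page_dirty(page);
	unlock_page(page);
	put_page(page);
	return copied;
}

A write_end that cannot safely zero-fill (anything not paired with simple_readpage(), per the comment added above) can instead report copied == 0 for the short-copy case, which makes the generic write loop retry the segment.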
@@ -1129,7 +1132,6 @@ EXPORT_SYMBOL(simple_get_link); const struct inode_operations simple_symlink_inode_operations = { .get_link = simple_get_link, - .readlink = generic_readlink }; EXPORT_SYMBOL(simple_symlink_inode_operations); diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 5426189406c1..fb8cac88251a 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -15,6 +15,6 @@ struct lockd_net { struct list_head nsm_handles; }; -extern int lockd_net_id; +extern unsigned int lockd_net_id; #endif diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index fc4084ef4736..1c13dd80744f 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -57,7 +57,7 @@ static struct task_struct *nlmsvc_task; static struct svc_rqst *nlmsvc_rqst; unsigned long nlmsvc_timeout; -int lockd_net_id; +unsigned int lockd_net_id; /* * These can be set at insmod time (useful for NFS as root filesystem), diff --git a/fs/locks.c b/fs/locks.c index 22c5b4aa4961..26811321d39b 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -131,7 +131,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/filelock.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #define IS_POSIX(fl) (fl->fl_flags & FL_POSIX) #define IS_FLOCK(fl) (fl->fl_flags & FL_FLOCK) diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig deleted file mode 100644 index 2b4503163930..000000000000 --- a/fs/logfs/Kconfig +++ /dev/null @@ -1,17 +0,0 @@ -config LOGFS - tristate "LogFS file system" - depends on MTD || (!MTD && BLOCK) - select ZLIB_INFLATE - select ZLIB_DEFLATE - select CRC32 - select BTREE - help - Flash filesystem aimed to scale efficiently to large devices. - In comparison to JFFS2 it offers significantly faster mount - times and potentially less RAM usage, although the latter has - not been measured yet. - - In its current state it is still very experimental and should - not be used for other than testing purposes. - - If unsure, say N. 
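Stepping back to the iomap direct I/O helper added earlier in this series: filesystems are expected to call iomap_dio_rw() from their ->read_iter/->write_iter methods with i_rwsem held, which the helper lockdep-asserts. A minimal read-side caller is sketched below; it is illustrative and compile-untested, example_dio_read and example_iomap_ops are invented names, and the ops' ->iomap_begin/->iomap_end callbacks (the filesystem's own extent mapping) are not shown. For AIO the helper may return -EIOCBQUEUED and complete later through iomap_dio_complete_work(); passing a NULL end_io, as here, is allowed.

#include <linux/fs.h>
#include <linux/iomap.h>
#include <linux/uio.h>

/* Supplied by the filesystem: ->iomap_begin maps [pos, pos+length)
 * to an extent, ->iomap_end releases any reservation. */
static struct iomap_ops example_iomap_ops;

static ssize_t example_dio_read(struct kiocb *iocb, struct iov_iter *to)
{
	struct inode *inode = file_inode(iocb->ki_filp);
	ssize_t ret;

	if (!iov_iter_count(to))
		return 0;

	inode_lock_shared(inode);	/* shared i_rwsem suffices for reads */
	ret = iomap_dio_rw(iocb, to, &example_iomap_ops, NULL);
	inode_unlock_shared(inode);

	return ret;
}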
diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile deleted file mode 100644 index 4820027787ee..000000000000 --- a/fs/logfs/Makefile +++ /dev/null @@ -1,13 +0,0 @@ -obj-$(CONFIG_LOGFS) += logfs.o - -logfs-y += compr.o -logfs-y += dir.o -logfs-y += file.o -logfs-y += gc.o -logfs-y += inode.o -logfs-y += journal.o -logfs-y += readwrite.o -logfs-y += segment.o -logfs-y += super.o -logfs-$(CONFIG_BLOCK) += dev_bdev.o -logfs-$(CONFIG_MTD) += dev_mtd.o diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c deleted file mode 100644 index 961f02b86d97..000000000000 --- a/fs/logfs/compr.c +++ /dev/null @@ -1,95 +0,0 @@ -/* - * fs/logfs/compr.c - compression routines - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/vmalloc.h> -#include <linux/zlib.h> - -#define COMPR_LEVEL 3 - -static DEFINE_MUTEX(compr_mutex); -static struct z_stream_s stream; - -int logfs_compress(void *in, void *out, size_t inlen, size_t outlen) -{ - int err, ret; - - ret = -EIO; - mutex_lock(&compr_mutex); - err = zlib_deflateInit(&stream, COMPR_LEVEL); - if (err != Z_OK) - goto error; - - stream.next_in = in; - stream.avail_in = inlen; - stream.total_in = 0; - stream.next_out = out; - stream.avail_out = outlen; - stream.total_out = 0; - - err = zlib_deflate(&stream, Z_FINISH); - if (err != Z_STREAM_END) - goto error; - - err = zlib_deflateEnd(&stream); - if (err != Z_OK) - goto error; - - if (stream.total_out >= stream.total_in) - goto error; - - ret = stream.total_out; -error: - mutex_unlock(&compr_mutex); - return ret; -} - -int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen) -{ - int err, ret; - - ret = -EIO; - mutex_lock(&compr_mutex); - err = zlib_inflateInit(&stream); - if (err != Z_OK) - goto error; - - stream.next_in = in; - stream.avail_in = inlen; - stream.total_in = 0; - stream.next_out = out; - stream.avail_out = outlen; - stream.total_out = 0; - - err = zlib_inflate(&stream, Z_FINISH); - if (err != Z_STREAM_END) - goto error; - - err = zlib_inflateEnd(&stream); - if (err != Z_OK) - goto error; - - ret = 0; -error: - mutex_unlock(&compr_mutex); - return ret; -} - -int __init logfs_compr_init(void) -{ - size_t size = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), - zlib_inflate_workspacesize()); - stream.workspace = vmalloc(size); - if (!stream.workspace) - return -ENOMEM; - return 0; -} - -void logfs_compr_exit(void) -{ - vfree(stream.workspace); -} diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c deleted file mode 100644 index a8329cc47dec..000000000000 --- a/fs/logfs/dev_bdev.c +++ /dev/null @@ -1,322 +0,0 @@ -/* - * fs/logfs/dev_bdev.c - Device access methods for block devices - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/buffer_head.h> -#include <linux/gfp.h> -#include <linux/prefetch.h> - -#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) - -static int sync_request(struct page *page, struct block_device *bdev, int op) -{ - struct bio bio; - struct bio_vec bio_vec; - - bio_init(&bio); - bio.bi_max_vecs = 1; - bio.bi_io_vec = &bio_vec; - bio_vec.bv_page = page; - bio_vec.bv_len = PAGE_SIZE; - bio_vec.bv_offset = 0; - bio.bi_vcnt = 1; - bio.bi_bdev = bdev; - bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9); - bio.bi_iter.bi_size = PAGE_SIZE; - bio_set_op_attrs(&bio, op, 0); - - return 
submit_bio_wait(&bio); -} - -static int bdev_readpage(void *_sb, struct page *page) -{ - struct super_block *sb = _sb; - struct block_device *bdev = logfs_super(sb)->s_bdev; - int err; - - err = sync_request(page, bdev, READ); - if (err) { - ClearPageUptodate(page); - SetPageError(page); - } else { - SetPageUptodate(page); - ClearPageError(page); - } - unlock_page(page); - return err; -} - -static DECLARE_WAIT_QUEUE_HEAD(wq); - -static void writeseg_end_io(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - struct super_block *sb = bio->bi_private; - struct logfs_super *super = logfs_super(sb); - - BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */ - - bio_for_each_segment_all(bvec, bio, i) { - end_page_writeback(bvec->bv_page); - put_page(bvec->bv_page); - } - bio_put(bio); - if (atomic_dec_and_test(&super->s_pending_writes)) - wake_up(&wq); -} - -static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, - size_t nr_pages) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - struct bio *bio; - struct page *page; - unsigned int max_pages; - int i; - - max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES); - - bio = bio_alloc(GFP_NOFS, max_pages); - BUG_ON(!bio); - - for (i = 0; i < nr_pages; i++) { - if (i >= max_pages) { - /* Block layer cannot split bios :( */ - bio->bi_vcnt = i; - bio->bi_iter.bi_size = i * PAGE_SIZE; - bio->bi_bdev = super->s_bdev; - bio->bi_iter.bi_sector = ofs >> 9; - bio->bi_private = sb; - bio->bi_end_io = writeseg_end_io; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - atomic_inc(&super->s_pending_writes); - submit_bio(bio); - - ofs += i * PAGE_SIZE; - index += i; - nr_pages -= i; - i = 0; - - bio = bio_alloc(GFP_NOFS, max_pages); - BUG_ON(!bio); - } - page = find_lock_page(mapping, index + i); - BUG_ON(!page); - bio->bi_io_vec[i].bv_page = page; - bio->bi_io_vec[i].bv_len = PAGE_SIZE; - bio->bi_io_vec[i].bv_offset = 0; - - BUG_ON(PageWriteback(page)); - set_page_writeback(page); - unlock_page(page); - } - bio->bi_vcnt = nr_pages; - bio->bi_iter.bi_size = nr_pages * PAGE_SIZE; - bio->bi_bdev = super->s_bdev; - bio->bi_iter.bi_sector = ofs >> 9; - bio->bi_private = sb; - bio->bi_end_io = writeseg_end_io; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - atomic_inc(&super->s_pending_writes); - submit_bio(bio); - return 0; -} - -static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len) -{ - struct logfs_super *super = logfs_super(sb); - int head; - - BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO); - - if (len == 0) { - /* This can happen when the object fit perfectly into a - * segment, the segment gets written per sync and subsequently - * closed. 
- */ - return; - } - head = ofs & (PAGE_SIZE - 1); - if (head) { - ofs -= head; - len += head; - } - len = PAGE_ALIGN(len); - __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); -} - - -static void erase_end_io(struct bio *bio) -{ - struct super_block *sb = bio->bi_private; - struct logfs_super *super = logfs_super(sb); - - BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */ - BUG_ON(bio->bi_vcnt == 0); - bio_put(bio); - if (atomic_dec_and_test(&super->s_pending_writes)) - wake_up(&wq); -} - -static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index, - size_t nr_pages) -{ - struct logfs_super *super = logfs_super(sb); - struct bio *bio; - unsigned int max_pages; - int i; - - max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES); - - bio = bio_alloc(GFP_NOFS, max_pages); - BUG_ON(!bio); - - for (i = 0; i < nr_pages; i++) { - if (i >= max_pages) { - /* Block layer cannot split bios :( */ - bio->bi_vcnt = i; - bio->bi_iter.bi_size = i * PAGE_SIZE; - bio->bi_bdev = super->s_bdev; - bio->bi_iter.bi_sector = ofs >> 9; - bio->bi_private = sb; - bio->bi_end_io = erase_end_io; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - atomic_inc(&super->s_pending_writes); - submit_bio(bio); - - ofs += i * PAGE_SIZE; - index += i; - nr_pages -= i; - i = 0; - - bio = bio_alloc(GFP_NOFS, max_pages); - BUG_ON(!bio); - } - bio->bi_io_vec[i].bv_page = super->s_erase_page; - bio->bi_io_vec[i].bv_len = PAGE_SIZE; - bio->bi_io_vec[i].bv_offset = 0; - } - bio->bi_vcnt = nr_pages; - bio->bi_iter.bi_size = nr_pages * PAGE_SIZE; - bio->bi_bdev = super->s_bdev; - bio->bi_iter.bi_sector = ofs >> 9; - bio->bi_private = sb; - bio->bi_end_io = erase_end_io; - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); - atomic_inc(&super->s_pending_writes); - submit_bio(bio); - return 0; -} - -static int bdev_erase(struct super_block *sb, loff_t to, size_t len, - int ensure_write) -{ - struct logfs_super *super = logfs_super(sb); - - BUG_ON(to & (PAGE_SIZE - 1)); - BUG_ON(len & (PAGE_SIZE - 1)); - - if (super->s_flags & LOGFS_SB_FLAG_RO) - return -EROFS; - - if (ensure_write) { - /* - * Object store doesn't care whether erases happen or not. - * But for the journal they are required. Otherwise a scan - * can find an old commit entry and assume it is the current - * one, travelling back in time. - */ - do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT); - } - - return 0; -} - -static void bdev_sync(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - - wait_event(wq, atomic_read(&super->s_pending_writes) == 0); -} - -static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - filler_t *filler = bdev_readpage; - - *ofs = 0; - return read_cache_page(mapping, 0, filler, sb); -} - -static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - filler_t *filler = bdev_readpage; - u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000; - pgoff_t index = pos >> PAGE_SHIFT; - - *ofs = pos; - return read_cache_page(mapping, index, filler, sb); -} - -static int bdev_write_sb(struct super_block *sb, struct page *page) -{ - struct block_device *bdev = logfs_super(sb)->s_bdev; - - /* Nothing special to do for block devices. 
*/ - return sync_request(page, bdev, WRITE); -} - -static void bdev_put_device(struct logfs_super *s) -{ - blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); -} - -static int bdev_can_write_buf(struct super_block *sb, u64 ofs) -{ - return 0; -} - -static const struct logfs_device_ops bd_devops = { - .find_first_sb = bdev_find_first_sb, - .find_last_sb = bdev_find_last_sb, - .write_sb = bdev_write_sb, - .readpage = bdev_readpage, - .writeseg = bdev_writeseg, - .erase = bdev_erase, - .can_write_buf = bdev_can_write_buf, - .sync = bdev_sync, - .put_device = bdev_put_device, -}; - -int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type, - const char *devname) -{ - struct block_device *bdev; - - bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL, - type); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); - - if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) { - int mtdnr = MINOR(bdev->bd_dev); - blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); - return logfs_get_sb_mtd(p, mtdnr); - } - - p->s_bdev = bdev; - p->s_mtd = NULL; - p->s_devops = &bd_devops; - return 0; -} diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c deleted file mode 100644 index b76a62b1978f..000000000000 --- a/fs/logfs/dev_mtd.c +++ /dev/null @@ -1,274 +0,0 @@ -/* - * fs/logfs/dev_mtd.c - Device access methods for MTD - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/completion.h> -#include <linux/mount.h> -#include <linux/sched.h> -#include <linux/slab.h> - -#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1)) - -static int logfs_mtd_read(struct super_block *sb, loff_t ofs, size_t len, - void *buf) -{ - struct mtd_info *mtd = logfs_super(sb)->s_mtd; - size_t retlen; - int ret; - - ret = mtd_read(mtd, ofs, len, &retlen, buf); - BUG_ON(ret == -EINVAL); - if (ret) - return ret; - - /* Not sure if we should loop instead. */ - if (retlen != len) - return -EIO; - - return 0; -} - -static int loffs_mtd_write(struct super_block *sb, loff_t ofs, size_t len, - void *buf) -{ - struct logfs_super *super = logfs_super(sb); - struct mtd_info *mtd = super->s_mtd; - size_t retlen; - loff_t page_start, page_end; - int ret; - - if (super->s_flags & LOGFS_SB_FLAG_RO) - return -EROFS; - - BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs)); - BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift); - BUG_ON(len > PAGE_SIZE); - page_start = ofs & PAGE_MASK; - page_end = PAGE_ALIGN(ofs + len) - 1; - ret = mtd_write(mtd, ofs, len, &retlen, buf); - if (ret || (retlen != len)) - return -EIO; - - return 0; -} - -/* - * For as long as I can remember (since about 2001) mtd->erase has been an - * asynchronous interface lacking the first driver to actually use the - * asynchronous properties. So just to prevent the first implementor of such - * a thing from breaking logfs in 2350, we do the usual pointless dance to - * declare a completion variable and wait for completion before returning - * from logfs_mtd_erase(). What an exercise in futility! 
- */ -static void logfs_erase_callback(struct erase_info *ei) -{ - complete((struct completion *)ei->priv); -} - -static int logfs_mtd_erase_mapping(struct super_block *sb, loff_t ofs, - size_t len) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - struct page *page; - pgoff_t index = ofs >> PAGE_SHIFT; - - for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) { - page = find_get_page(mapping, index); - if (!page) - continue; - memset(page_address(page), 0xFF, PAGE_SIZE); - put_page(page); - } - return 0; -} - -static int logfs_mtd_erase(struct super_block *sb, loff_t ofs, size_t len, - int ensure_write) -{ - struct mtd_info *mtd = logfs_super(sb)->s_mtd; - struct erase_info ei; - DECLARE_COMPLETION_ONSTACK(complete); - int ret; - - BUG_ON(len % mtd->erasesize); - if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO) - return -EROFS; - - memset(&ei, 0, sizeof(ei)); - ei.mtd = mtd; - ei.addr = ofs; - ei.len = len; - ei.callback = logfs_erase_callback; - ei.priv = (long)&complete; - ret = mtd_erase(mtd, &ei); - if (ret) - return -EIO; - - wait_for_completion(&complete); - if (ei.state != MTD_ERASE_DONE) - return -EIO; - return logfs_mtd_erase_mapping(sb, ofs, len); -} - -static void logfs_mtd_sync(struct super_block *sb) -{ - struct mtd_info *mtd = logfs_super(sb)->s_mtd; - - mtd_sync(mtd); -} - -static int logfs_mtd_readpage(void *_sb, struct page *page) -{ - struct super_block *sb = _sb; - int err; - - err = logfs_mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE, - page_address(page)); - if (err == -EUCLEAN || err == -EBADMSG) { - /* -EBADMSG happens regularly on power failures */ - err = 0; - /* FIXME: force GC this segment */ - } - if (err) { - ClearPageUptodate(page); - SetPageError(page); - } else { - SetPageUptodate(page); - ClearPageError(page); - } - unlock_page(page); - return err; -} - -static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - filler_t *filler = logfs_mtd_readpage; - struct mtd_info *mtd = super->s_mtd; - - *ofs = 0; - while (mtd_block_isbad(mtd, *ofs)) { - *ofs += mtd->erasesize; - if (*ofs >= mtd->size) - return NULL; - } - BUG_ON(*ofs & ~PAGE_MASK); - return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); -} - -static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - filler_t *filler = logfs_mtd_readpage; - struct mtd_info *mtd = super->s_mtd; - - *ofs = mtd->size - mtd->erasesize; - while (mtd_block_isbad(mtd, *ofs)) { - *ofs -= mtd->erasesize; - if (*ofs <= 0) - return NULL; - } - *ofs = *ofs + mtd->erasesize - 0x1000; - BUG_ON(*ofs & ~PAGE_MASK); - return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb); -} - -static int __logfs_mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index, - size_t nr_pages) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - struct page *page; - int i, err; - - for (i = 0; i < nr_pages; i++) { - page = find_lock_page(mapping, index + i); - BUG_ON(!page); - - err = loffs_mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE, - page_address(page)); - unlock_page(page); - put_page(page); - if (err) - return err; - } - return 0; -} - -static void logfs_mtd_writeseg(struct 
super_block *sb, u64 ofs, size_t len) -{ - struct logfs_super *super = logfs_super(sb); - int head; - - if (super->s_flags & LOGFS_SB_FLAG_RO) - return; - - if (len == 0) { - /* This can happen when the object fit perfectly into a - * segment, the segment gets written per sync and subsequently - * closed. - */ - return; - } - head = ofs & (PAGE_SIZE - 1); - if (head) { - ofs -= head; - len += head; - } - len = PAGE_ALIGN(len); - __logfs_mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT); -} - -static void logfs_mtd_put_device(struct logfs_super *s) -{ - put_mtd_device(s->s_mtd); -} - -static int logfs_mtd_can_write_buf(struct super_block *sb, u64 ofs) -{ - struct logfs_super *super = logfs_super(sb); - void *buf; - int err; - - buf = kmalloc(super->s_writesize, GFP_KERNEL); - if (!buf) - return -ENOMEM; - err = logfs_mtd_read(sb, ofs, super->s_writesize, buf); - if (err) - goto out; - if (memchr_inv(buf, 0xff, super->s_writesize)) - err = -EIO; - kfree(buf); -out: - return err; -} - -static const struct logfs_device_ops mtd_devops = { - .find_first_sb = logfs_mtd_find_first_sb, - .find_last_sb = logfs_mtd_find_last_sb, - .readpage = logfs_mtd_readpage, - .writeseg = logfs_mtd_writeseg, - .erase = logfs_mtd_erase, - .can_write_buf = logfs_mtd_can_write_buf, - .sync = logfs_mtd_sync, - .put_device = logfs_mtd_put_device, -}; - -int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) -{ - struct mtd_info *mtd = get_mtd_device(NULL, mtdnr); - if (IS_ERR(mtd)) - return PTR_ERR(mtd); - - s->s_bdev = NULL; - s->s_mtd = mtd; - s->s_devops = &mtd_devops; - return 0; -} diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c deleted file mode 100644 index c87ea52de3d9..000000000000 --- a/fs/logfs/dir.c +++ /dev/null @@ -1,801 +0,0 @@ -/* - * fs/logfs/dir.c - directory-related code - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/slab.h> - -/* - * Atomic dir operations - * - * Directory operations are by default not atomic. Dentries and Inodes are - * created/removed/altered in separate operations. Therefore we need to do - * a small amount of journaling. - * - * Create, link, mkdir, mknod and symlink all share the same function to do - * the work: __logfs_create. This function works in two atomic steps: - * 1. allocate inode (remember in journal) - * 2. allocate dentry (clear journal) - * - * As we can only get interrupted between the two, when the inode we just - * created is simply stored in the anchor. On next mount, if we were - * interrupted, we delete the inode. From a users point of view the - * operation never happened. - * - * Unlink and rmdir also share the same function: unlink. Again, this - * function works in two atomic steps - * 1. remove dentry (remember inode in journal) - * 2. unlink inode (clear journal) - * - * And again, on the next mount, if we were interrupted, we delete the inode. - * From a users point of view the operation succeeded. - * - * Rename is the real pain to deal with, harder than all the other methods - * combined. Depending on the circumstances we can run into three cases. - * A "target rename" where the target dentry already existed, a "local - * rename" where both parent directories are identical or a "cross-directory - * rename" in the remaining case. - * - * Local rename is atomic, as the old dentry is simply rewritten with a new - * name. 
- * - * Cross-directory rename works in two steps, similar to __logfs_create and - * logfs_unlink: - * 1. Write new dentry (remember old dentry in journal) - * 2. Remove old dentry (clear journal) - * - * Here we remember a dentry instead of an inode. On next mount, if we were - * interrupted, we delete the dentry. From a users point of view, the - * operation succeeded. - * - * Target rename works in three atomic steps: - * 1. Attach old inode to new dentry (remember old dentry and new inode) - * 2. Remove old dentry (still remember the new inode) - * 3. Remove victim inode - * - * Here we remember both an inode an a dentry. If we get interrupted - * between steps 1 and 2, we delete both the dentry and the inode. If - * we get interrupted between steps 2 and 3, we delete just the inode. - * In either case, the remaining objects are deleted on next mount. From - * a users point of view, the operation succeeded. - */ - -static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd, - loff_t pos) -{ - return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL); -} - -static int write_inode(struct inode *inode) -{ - return __logfs_write_inode(inode, NULL, WF_LOCK); -} - -static s64 dir_seek_data(struct inode *inode, s64 pos) -{ - s64 new_pos = logfs_seek_data(inode, pos); - - return max(pos, new_pos - 1); -} - -static int beyond_eof(struct inode *inode, loff_t bix) -{ - loff_t pos = bix << inode->i_sb->s_blocksize_bits; - return pos >= i_size_read(inode); -} - -/* - * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11, - * so short names (len <= 9) don't even occupy the complete 32bit name - * space. A prime >256 ensures short names quickly spread the 32bit - * name space. Add about 26 for the estimated amount of information - * of each character and pick a prime nearby, preferably a bit-sparse - * one. - */ -static u32 logfs_hash_32(const char *s, int len, u32 seed) -{ - u32 hash = seed; - int i; - - for (i = 0; i < len; i++) - hash = hash * 293 + s[i]; - return hash; -} - -/* - * We have to satisfy several conflicting requirements here. Small - * directories should stay fairly compact and not require too many - * indirect blocks. The number of possible locations for a given hash - * should be small to make lookup() fast. And we should try hard not - * to overflow the 32bit name space or nfs and 32bit host systems will - * be unhappy. - * - * So we use the following scheme. First we reduce the hash to 0..15 - * and try a direct block. If that is occupied we reduce the hash to - * 16..255 and try an indirect block. Same for 2x and 3x indirect - * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff, - * but use buckets containing eight entries instead of a single one. - * - * Using 16 entries should allow for a reasonable amount of hash - * collisions, so the 32bit name space can be packed fairly tight - * before overflowing. Oh and currently we don't overflow but return - * and error. - * - * How likely are collisions? Doing the appropriate math is beyond me - * and the Bronstein textbook. But running a test program to brute - * force collisions for a couple of days showed that on average the - * first collision occurs after 598M entries, with 290M being the - * smallest result. Obviously 21 entries could already cause a - * collision if all entries are carefully chosen. 
- */ -static pgoff_t hash_index(u32 hash, int round) -{ - u32 i0_blocks = I0_BLOCKS; - u32 i1_blocks = I1_BLOCKS; - u32 i2_blocks = I2_BLOCKS; - u32 i3_blocks = I3_BLOCKS; - - switch (round) { - case 0: - return hash % i0_blocks; - case 1: - return i0_blocks + hash % (i1_blocks - i0_blocks); - case 2: - return i1_blocks + hash % (i2_blocks - i1_blocks); - case 3: - return i2_blocks + hash % (i3_blocks - i2_blocks); - case 4 ... 19: - return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16)) - + round - 4; - } - BUG(); -} - -static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry) -{ - const struct qstr *name = &dentry->d_name; - struct page *page; - struct logfs_disk_dentry *dd; - u32 hash = logfs_hash_32(name->name, name->len, 0); - pgoff_t index; - int round; - - if (name->len > LOGFS_MAX_NAMELEN) - return ERR_PTR(-ENAMETOOLONG); - - for (round = 0; round < 20; round++) { - index = hash_index(hash, round); - - if (beyond_eof(dir, index)) - return NULL; - if (!logfs_exist_block(dir, index)) - continue; - page = read_cache_page(dir->i_mapping, index, - (filler_t *)logfs_readpage, NULL); - if (IS_ERR(page)) - return page; - dd = kmap_atomic(page); - BUG_ON(dd->namelen == 0); - - if (name->len != be16_to_cpu(dd->namelen) || - memcmp(name->name, dd->name, name->len)) { - kunmap_atomic(dd); - put_page(page); - continue; - } - - kunmap_atomic(dd); - return page; - } - return NULL; -} - -static int logfs_remove_inode(struct inode *inode) -{ - int ret; - - drop_nlink(inode); - ret = write_inode(inode); - LOGFS_BUG_ON(ret, inode->i_sb); - return ret; -} - -static void abort_transaction(struct inode *inode, struct logfs_transaction *ta) -{ - if (logfs_inode(inode)->li_block) - logfs_inode(inode)->li_block->ta = NULL; - kfree(ta); -} - -static int logfs_unlink(struct inode *dir, struct dentry *dentry) -{ - struct logfs_super *super = logfs_super(dir->i_sb); - struct inode *inode = d_inode(dentry); - struct logfs_transaction *ta; - struct page *page; - pgoff_t index; - int ret; - - ta = kzalloc(sizeof(*ta), GFP_KERNEL); - if (!ta) - return -ENOMEM; - - ta->state = UNLINK_1; - ta->ino = inode->i_ino; - - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); - - page = logfs_get_dd_page(dir, dentry); - if (!page) { - kfree(ta); - return -ENOENT; - } - if (IS_ERR(page)) { - kfree(ta); - return PTR_ERR(page); - } - index = page->index; - put_page(page); - - mutex_lock(&super->s_dirop_mutex); - logfs_add_transaction(dir, ta); - - ret = logfs_delete(dir, index, NULL); - if (!ret) - ret = write_inode(dir); - - if (ret) { - abort_transaction(dir, ta); - printk(KERN_ERR"LOGFS: unable to delete inode\n"); - goto out; - } - - ta->state = UNLINK_2; - logfs_add_transaction(inode, ta); - ret = logfs_remove_inode(inode); -out: - mutex_unlock(&super->s_dirop_mutex); - return ret; -} - -static inline int logfs_empty_dir(struct inode *dir) -{ - u64 data; - - data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits; - return data >= i_size_read(dir); -} - -static int logfs_rmdir(struct inode *dir, struct dentry *dentry) -{ - struct inode *inode = d_inode(dentry); - - if (!logfs_empty_dir(inode)) - return -ENOTEMPTY; - - return logfs_unlink(dir, dentry); -} - -/* FIXME: readdir currently has it's own dir_walk code. 
I don't see a good - * way to combine the two copies */ -static int logfs_readdir(struct file *file, struct dir_context *ctx) -{ - struct inode *dir = file_inode(file); - loff_t pos; - struct page *page; - struct logfs_disk_dentry *dd; - - if (ctx->pos < 0) - return -EINVAL; - - if (!dir_emit_dots(file, ctx)) - return 0; - - pos = ctx->pos - 2; - BUG_ON(pos < 0); - for (;; pos++, ctx->pos++) { - bool full; - if (beyond_eof(dir, pos)) - break; - if (!logfs_exist_block(dir, pos)) { - /* deleted dentry */ - pos = dir_seek_data(dir, pos); - continue; - } - page = read_cache_page(dir->i_mapping, pos, - (filler_t *)logfs_readpage, NULL); - if (IS_ERR(page)) - return PTR_ERR(page); - dd = kmap(page); - BUG_ON(dd->namelen == 0); - - full = !dir_emit(ctx, (char *)dd->name, - be16_to_cpu(dd->namelen), - be64_to_cpu(dd->ino), dd->type); - kunmap(page); - put_page(page); - if (full) - break; - } - return 0; -} - -static void logfs_set_name(struct logfs_disk_dentry *dd, const struct qstr *name) -{ - dd->namelen = cpu_to_be16(name->len); - memcpy(dd->name, name->name, name->len); -} - -static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct page *page; - struct logfs_disk_dentry *dd; - pgoff_t index; - u64 ino = 0; - struct inode *inode; - - page = logfs_get_dd_page(dir, dentry); - if (IS_ERR(page)) - return ERR_CAST(page); - if (!page) { - d_add(dentry, NULL); - return NULL; - } - index = page->index; - dd = kmap_atomic(page); - ino = be64_to_cpu(dd->ino); - kunmap_atomic(dd); - put_page(page); - - inode = logfs_iget(dir->i_sb, ino); - if (IS_ERR(inode)) - printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n", - ino, dir->i_ino, index); - return d_splice_alias(inode, dentry); -} - -static void grow_dir(struct inode *dir, loff_t index) -{ - index = (index + 1) << dir->i_sb->s_blocksize_bits; - if (i_size_read(dir) < index) - i_size_write(dir, index); -} - -static int logfs_write_dir(struct inode *dir, struct dentry *dentry, - struct inode *inode) -{ - struct page *page; - struct logfs_disk_dentry *dd; - u32 hash = logfs_hash_32(dentry->d_name.name, dentry->d_name.len, 0); - pgoff_t index; - int round, err; - - for (round = 0; round < 20; round++) { - index = hash_index(hash, round); - - if (logfs_exist_block(dir, index)) - continue; - page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL); - if (!page) - return -ENOMEM; - - dd = kmap_atomic(page); - memset(dd, 0, sizeof(*dd)); - dd->ino = cpu_to_be64(inode->i_ino); - dd->type = logfs_type(inode); - logfs_set_name(dd, &dentry->d_name); - kunmap_atomic(dd); - - err = logfs_write_buf(dir, page, WF_LOCK); - unlock_page(page); - put_page(page); - if (!err) - grow_dir(dir, index); - return err; - } - /* FIXME: Is there a better return value? In most cases neither - * the filesystem nor the directory are full. But we have had - * too many collisions for this particular hash and no fallback. 
- */ - return -ENOSPC; -} - -static int __logfs_create(struct inode *dir, struct dentry *dentry, - struct inode *inode, const char *dest, long destlen) -{ - struct logfs_super *super = logfs_super(dir->i_sb); - struct logfs_inode *li = logfs_inode(inode); - struct logfs_transaction *ta; - int ret; - - ta = kzalloc(sizeof(*ta), GFP_KERNEL); - if (!ta) { - drop_nlink(inode); - iput(inode); - return -ENOMEM; - } - - ta->state = CREATE_1; - ta->ino = inode->i_ino; - mutex_lock(&super->s_dirop_mutex); - logfs_add_transaction(inode, ta); - - if (dest) { - /* symlink */ - ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL); - if (!ret) - ret = write_inode(inode); - } else { - /* creat/mkdir/mknod */ - ret = write_inode(inode); - } - if (ret) { - abort_transaction(inode, ta); - li->li_flags |= LOGFS_IF_STILLBORN; - /* FIXME: truncate symlink */ - drop_nlink(inode); - iput(inode); - goto out; - } - - ta->state = CREATE_2; - logfs_add_transaction(dir, ta); - ret = logfs_write_dir(dir, dentry, inode); - /* sync directory */ - if (!ret) - ret = write_inode(dir); - - if (ret) { - logfs_del_transaction(dir, ta); - ta->state = CREATE_2; - logfs_add_transaction(inode, ta); - logfs_remove_inode(inode); - iput(inode); - goto out; - } - d_instantiate(dentry, inode); -out: - mutex_unlock(&super->s_dirop_mutex); - return ret; -} - -static int logfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) -{ - struct inode *inode; - - /* - * FIXME: why do we have to fill in S_IFDIR, while the mode is - * correct for mknod, creat, etc.? Smells like the vfs *should* - * do it for us but for some reason fails to do so. - */ - inode = logfs_new_inode(dir, S_IFDIR | mode); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - inode->i_op = &logfs_dir_iops; - inode->i_fop = &logfs_dir_fops; - - return __logfs_create(dir, dentry, inode, NULL, 0); -} - -static int logfs_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) -{ - struct inode *inode; - - inode = logfs_new_inode(dir, mode); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - inode->i_op = &logfs_reg_iops; - inode->i_fop = &logfs_reg_fops; - inode->i_mapping->a_ops = &logfs_reg_aops; - - return __logfs_create(dir, dentry, inode, NULL, 0); -} - -static int logfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, - dev_t rdev) -{ - struct inode *inode; - - if (dentry->d_name.len > LOGFS_MAX_NAMELEN) - return -ENAMETOOLONG; - - inode = logfs_new_inode(dir, mode); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - init_special_inode(inode, mode, rdev); - - return __logfs_create(dir, dentry, inode, NULL, 0); -} - -static int logfs_symlink(struct inode *dir, struct dentry *dentry, - const char *target) -{ - struct inode *inode; - size_t destlen = strlen(target) + 1; - - if (destlen > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; - - inode = logfs_new_inode(dir, S_IFLNK | 0777); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - inode->i_op = &page_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &logfs_reg_aops; - - return __logfs_create(dir, dentry, inode, target, destlen); -} - -static int logfs_link(struct dentry *old_dentry, struct inode *dir, - struct dentry *dentry) -{ - struct inode *inode = d_inode(old_dentry); - - inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); - ihold(inode); - inc_nlink(inode); - mark_inode_dirty_sync(inode); - - return __logfs_create(dir, dentry, inode, NULL, 0); -} - -static int logfs_get_dd(struct inode *dir, struct dentry *dentry, - 
struct logfs_disk_dentry *dd, loff_t *pos) -{ - struct page *page; - void *map; - - page = logfs_get_dd_page(dir, dentry); - if (IS_ERR(page)) - return PTR_ERR(page); - *pos = page->index; - map = kmap_atomic(page); - memcpy(dd, map, sizeof(*dd)); - kunmap_atomic(map); - put_page(page); - return 0; -} - -static int logfs_delete_dd(struct inode *dir, loff_t pos) -{ - /* - * Getting called with pos somewhere beyond eof is either a goofup - * within this file or means someone maliciously edited the - * (crc-protected) journal. - */ - BUG_ON(beyond_eof(dir, pos)); - dir->i_ctime = dir->i_mtime = current_time(dir); - log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos); - return logfs_delete(dir, pos, NULL); -} - -/* - * Cross-directory rename, target does not exist. Just a little nasty. - * Create a new dentry in the target dir, then remove the old dentry, - * all the while taking care to remember our operation in the journal. - */ -static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - struct logfs_super *super = logfs_super(old_dir->i_sb); - struct logfs_disk_dentry dd; - struct logfs_transaction *ta; - loff_t pos; - int err; - - /* 1. locate source dd */ - err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); - if (err) - return err; - - ta = kzalloc(sizeof(*ta), GFP_KERNEL); - if (!ta) - return -ENOMEM; - - ta->state = CROSS_RENAME_1; - ta->dir = old_dir->i_ino; - ta->pos = pos; - - /* 2. write target dd */ - mutex_lock(&super->s_dirop_mutex); - logfs_add_transaction(new_dir, ta); - err = logfs_write_dir(new_dir, new_dentry, d_inode(old_dentry)); - if (!err) - err = write_inode(new_dir); - - if (err) { - super->s_rename_dir = 0; - super->s_rename_pos = 0; - abort_transaction(new_dir, ta); - goto out; - } - - /* 3. remove source dd */ - ta->state = CROSS_RENAME_2; - logfs_add_transaction(old_dir, ta); - err = logfs_delete_dd(old_dir, pos); - if (!err) - err = write_inode(old_dir); - LOGFS_BUG_ON(err, old_dir->i_sb); -out: - mutex_unlock(&super->s_dirop_mutex); - return err; -} - -static int logfs_replace_inode(struct inode *dir, struct dentry *dentry, - struct logfs_disk_dentry *dd, struct inode *inode) -{ - loff_t pos; - int err; - - err = logfs_get_dd(dir, dentry, dd, &pos); - if (err) - return err; - dd->ino = cpu_to_be64(inode->i_ino); - dd->type = logfs_type(inode); - - err = write_dir(dir, dd, pos); - if (err) - return err; - log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos, - dd->name, be64_to_cpu(dd->ino)); - return write_inode(dir); -} - -/* Target dentry exists - the worst case. We need to attach the source - * inode to the target dentry, then remove the orphaned target inode and - * source dentry. - */ -static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) -{ - struct logfs_super *super = logfs_super(old_dir->i_sb); - struct inode *old_inode = d_inode(old_dentry); - struct inode *new_inode = d_inode(new_dentry); - int isdir = S_ISDIR(old_inode->i_mode); - struct logfs_disk_dentry dd; - struct logfs_transaction *ta; - loff_t pos; - int err; - - BUG_ON(isdir != S_ISDIR(new_inode->i_mode)); - if (isdir) { - if (!logfs_empty_dir(new_inode)) - return -ENOTEMPTY; - } - - /* 1. 
locate source dd */ - err = logfs_get_dd(old_dir, old_dentry, &dd, &pos); - if (err) - return err; - - ta = kzalloc(sizeof(*ta), GFP_KERNEL); - if (!ta) - return -ENOMEM; - - ta->state = TARGET_RENAME_1; - ta->dir = old_dir->i_ino; - ta->pos = pos; - ta->ino = new_inode->i_ino; - - /* 2. attach source inode to target dd */ - mutex_lock(&super->s_dirop_mutex); - logfs_add_transaction(new_dir, ta); - err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode); - if (err) { - super->s_rename_dir = 0; - super->s_rename_pos = 0; - super->s_victim_ino = 0; - abort_transaction(new_dir, ta); - goto out; - } - - /* 3. remove source dd */ - ta->state = TARGET_RENAME_2; - logfs_add_transaction(old_dir, ta); - err = logfs_delete_dd(old_dir, pos); - if (!err) - err = write_inode(old_dir); - LOGFS_BUG_ON(err, old_dir->i_sb); - - /* 4. remove target inode */ - ta->state = TARGET_RENAME_3; - logfs_add_transaction(new_inode, ta); - err = logfs_remove_inode(new_inode); - -out: - mutex_unlock(&super->s_dirop_mutex); - return err; -} - -static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry, - unsigned int flags) -{ - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - - if (d_really_is_positive(new_dentry)) - return logfs_rename_target(old_dir, old_dentry, - new_dir, new_dentry); - return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry); -} - -/* No locking done here, as this is called before .get_sb() returns. */ -int logfs_replay_journal(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct inode *inode; - u64 ino, pos; - int err; - - if (super->s_victim_ino) { - /* delete victim inode */ - ino = super->s_victim_ino; - printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino); - inode = logfs_iget(sb, ino); - if (IS_ERR(inode)) - goto fail; - - LOGFS_BUG_ON(i_size_read(inode) > 0, sb); - super->s_victim_ino = 0; - err = logfs_remove_inode(inode); - iput(inode); - if (err) { - super->s_victim_ino = ino; - goto fail; - } - } - if (super->s_rename_dir) { - /* delete old dd from rename */ - ino = super->s_rename_dir; - pos = super->s_rename_pos; - printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n", - ino, pos); - inode = logfs_iget(sb, ino); - if (IS_ERR(inode)) - goto fail; - - super->s_rename_dir = 0; - super->s_rename_pos = 0; - err = logfs_delete_dd(inode, pos); - iput(inode); - if (err) { - super->s_rename_dir = ino; - super->s_rename_pos = pos; - goto fail; - } - } - return 0; -fail: - LOGFS_BUG(sb); - return -EIO; -} - -const struct inode_operations logfs_dir_iops = { - .create = logfs_create, - .link = logfs_link, - .lookup = logfs_lookup, - .mkdir = logfs_mkdir, - .mknod = logfs_mknod, - .rename = logfs_rename, - .rmdir = logfs_rmdir, - .symlink = logfs_symlink, - .unlink = logfs_unlink, -}; -const struct file_operations logfs_dir_fops = { - .fsync = logfs_fsync, - .unlocked_ioctl = logfs_ioctl, - .iterate_shared = logfs_readdir, - .read = generic_read_dir, - .llseek = generic_file_llseek, -}; diff --git a/fs/logfs/file.c b/fs/logfs/file.c deleted file mode 100644 index 1db04930ad57..000000000000 --- a/fs/logfs/file.c +++ /dev/null @@ -1,285 +0,0 @@ -/* - * fs/logfs/file.c - prepare_write, commit_write and friends - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/sched.h> -#include <linux/writeback.h> - -static int logfs_write_begin(struct file *file, 
struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, - struct page **pagep, void **fsdata) -{ - struct inode *inode = mapping->host; - struct page *page; - pgoff_t index = pos >> PAGE_SHIFT; - - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; - *pagep = page; - - if ((len == PAGE_SIZE) || PageUptodate(page)) - return 0; - if ((pos & PAGE_MASK) >= i_size_read(inode)) { - unsigned start = pos & (PAGE_SIZE - 1); - unsigned end = start + len; - - /* Reading beyond i_size is simple: memset to zero */ - zero_user_segments(page, 0, start, end, PAGE_SIZE); - return 0; - } - return logfs_readpage_nolock(page); -} - -static int logfs_write_end(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, struct page *page, - void *fsdata) -{ - struct inode *inode = mapping->host; - pgoff_t index = page->index; - unsigned start = pos & (PAGE_SIZE - 1); - unsigned end = start + copied; - int ret = 0; - - BUG_ON(PAGE_SIZE != inode->i_sb->s_blocksize); - BUG_ON(page->index > I3_BLOCKS); - - if (copied < len) { - /* - * Short write of a non-initialized paged. Just tell userspace - * to retry the entire page. - */ - if (!PageUptodate(page)) { - copied = 0; - goto out; - } - } - if (copied == 0) - goto out; /* FIXME: do we need to update inode? */ - - if (i_size_read(inode) < (index << PAGE_SHIFT) + end) { - i_size_write(inode, (index << PAGE_SHIFT) + end); - mark_inode_dirty_sync(inode); - } - - SetPageUptodate(page); - if (!PageDirty(page)) { - if (!get_page_reserve(inode, page)) - __set_page_dirty_nobuffers(page); - else - ret = logfs_write_buf(inode, page, WF_LOCK); - } -out: - unlock_page(page); - put_page(page); - return ret ? ret : copied; -} - -int logfs_readpage(struct file *file, struct page *page) -{ - int ret; - - ret = logfs_readpage_nolock(page); - unlock_page(page); - return ret; -} - -/* Clear the page's dirty flag in the radix tree. */ -/* TODO: mucking with PageWriteback is silly. Add a generic function to clear - * the dirty bit from the radix tree for filesystems that don't have to wait - * for page writeback to finish (i.e. any compressing filesystem). - */ -static void clear_radix_tree_dirty(struct page *page) -{ - BUG_ON(PagePrivate(page) || page->private); - set_page_writeback(page); - end_page_writeback(page); -} - -static int __logfs_writepage(struct page *page) -{ - struct inode *inode = page->mapping->host; - int err; - - err = logfs_write_buf(inode, page, WF_LOCK); - if (err) - set_page_dirty(page); - else - clear_radix_tree_dirty(page); - unlock_page(page); - return err; -} - -static int logfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned offset; - u64 bix; - level_t level; - - log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index, - page); - - logfs_unpack_index(page->index, &bix, &level); - - /* Indirect blocks are never truncated */ - if (level != 0) - return __logfs_writepage(page); - - /* - * TODO: everything below is a near-verbatim copy of nobh_writepage(). - * The relevant bits should be factored out after logfs is merged. - */ - - /* Is the page fully inside i_size? */ - if (bix < end_index) - return __logfs_writepage(page); - - /* Is the page fully outside i_size? 
-int logfs_readpage(struct file *file, struct page *page) -{ - int ret; - - ret = logfs_readpage_nolock(page); - unlock_page(page); - return ret; -} - -/* Clear the page's dirty flag in the radix tree. */ -/* TODO: mucking with PageWriteback is silly. Add a generic function to clear - * the dirty bit from the radix tree for filesystems that don't have to wait - * for page writeback to finish (i.e. any compressing filesystem). - */ -static void clear_radix_tree_dirty(struct page *page) -{ - BUG_ON(PagePrivate(page) || page->private); - set_page_writeback(page); - end_page_writeback(page); -} - -static int __logfs_writepage(struct page *page) -{ - struct inode *inode = page->mapping->host; - int err; - - err = logfs_write_buf(inode, page, WF_LOCK); - if (err) - set_page_dirty(page); - else - clear_radix_tree_dirty(page); - unlock_page(page); - return err; -} - -static int logfs_writepage(struct page *page, struct writeback_control *wbc) -{ - struct inode *inode = page->mapping->host; - loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; - unsigned offset; - u64 bix; - level_t level; - - log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index, - page); - - logfs_unpack_index(page->index, &bix, &level); - - /* Indirect blocks are never truncated */ - if (level != 0) - return __logfs_writepage(page); - - /* - * TODO: everything below is a near-verbatim copy of nobh_writepage(). - * The relevant bits should be factored out after logfs is merged. - */ - - /* Is the page fully inside i_size? */ - if (bix < end_index) - return __logfs_writepage(page); - - /* Is the page fully outside i_size? (truncate in progress) */ - offset = i_size & (PAGE_SIZE-1); - if (bix > end_index || offset == 0) { - unlock_page(page); - return 0; /* don't care */ - } - - /* - * The page straddles i_size. It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped - * in multiples of the page size. For a file that is not a multiple of - * the page size, the remaining memory is zeroed when mapped, and - * writes to that region are not written out to the file." - */ - zero_user_segment(page, offset, PAGE_SIZE); - return __logfs_writepage(page); -} - -static void logfs_invalidatepage(struct page *page, unsigned int offset, - unsigned int length) -{ - struct logfs_block *block = logfs_block(page); - - if (block->reserved_bytes) { - struct super_block *sb = page->mapping->host->i_sb; - struct logfs_super *super = logfs_super(sb); - - super->s_dirty_pages -= block->reserved_bytes; - block->ops->free_block(sb, block); - BUG_ON(bitmap_weight(block->alias_map, LOGFS_BLOCK_FACTOR)); - } else - move_page_to_btree(page); - BUG_ON(PagePrivate(page) || page->private); -} - -static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this) -{ - return 0; /* None of these are easy to release */ -} - - -long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) -{ - struct inode *inode = file_inode(file); - struct logfs_inode *li = logfs_inode(inode); - unsigned int oldflags, flags; - int err; - - switch (cmd) { - case FS_IOC_GETFLAGS: - flags = li->li_flags & LOGFS_FL_USER_VISIBLE; - return put_user(flags, (int __user *)arg); - case FS_IOC_SETFLAGS: - if (IS_RDONLY(inode)) - return -EROFS; - - if (!inode_owner_or_capable(inode)) - return -EACCES; - - err = get_user(flags, (int __user *)arg); - if (err) - return err; - - inode_lock(inode); - oldflags = li->li_flags; - flags &= LOGFS_FL_USER_MODIFIABLE; - flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE; - li->li_flags = flags; - inode_unlock(inode); - - inode->i_ctime = current_time(inode); - mark_inode_dirty_sync(inode); - return 0; - - default: - return -ENOTTY; - } -} - -int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) -{ - struct super_block *sb = file->f_mapping->host->i_sb; - struct inode *inode = file->f_mapping->host; - int ret; - - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (ret) - return ret; - - inode_lock(inode); - logfs_get_wblocks(sb, NULL, WF_LOCK); - logfs_write_anchor(sb); - logfs_put_wblocks(sb, NULL, WF_LOCK); - inode_unlock(inode); - - return 0; -} - -static int logfs_setattr(struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = d_inode(dentry); - int err = 0; - - err = setattr_prepare(dentry, attr); - if (err) - return err; - - if (attr->ia_valid & ATTR_SIZE) { - err = logfs_truncate(inode, attr->ia_size); - if (err) - return err; - } - - setattr_copy(inode, attr); - mark_inode_dirty(inode); - return 0; -} - -const struct inode_operations logfs_reg_iops = { - .setattr = logfs_setattr, -}; - -const struct file_operations logfs_reg_fops = { - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .fsync = logfs_fsync, - .unlocked_ioctl = logfs_ioctl, - .llseek = generic_file_llseek, - .mmap = generic_file_readonly_mmap, - .open = generic_file_open, -}; - -const struct address_space_operations logfs_reg_aops = { - .invalidatepage = logfs_invalidatepage, - .readpage = logfs_readpage, - .releasepage = logfs_releasepage, - .set_page_dirty = __set_page_dirty_nobuffers, - .writepage =
logfs_writepage, - .writepages = generic_writepages, - .write_begin = logfs_write_begin, - .write_end = logfs_write_end, -}; diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c deleted file mode 100644 index d4efb061bdc5..000000000000 --- a/fs/logfs/gc.c +++ /dev/null @@ -1,732 +0,0 @@ -/* - * fs/logfs/gc.c - garbage collection code - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/sched.h> -#include <linux/slab.h> - -/* - * Wear leveling needs to kick in when the difference between low erase - * counts and high erase counts gets too big. A good value for "too big" - * may be somewhat below 10% of maximum erase count for the device. - * Why not 397, to pick a nice round number with no specific meaning? :) - * - * WL_RATELIMIT is the minimum time between two wear level events. A huge - * number of segments may fulfil the requirements for wear leveling at the - * same time. If that happens we don't want to cause a latency from hell, - * but just gently pick one segment every so often and minimize overhead. - */ -#define WL_DELTA 397 -#define WL_RATELIMIT 100 -#define MAX_OBJ_ALIASES 2600 -#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */ -#define LIST_SIZE 64 /* base size of candidate lists */ -#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */ -#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */ - -static int no_free_segments(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - - return super->s_free_list.count; -} - -/* journal has distance -1, top-most ifile layer distance 0 */ -static u8 root_distance(struct super_block *sb, gc_level_t __gc_level) -{ - struct logfs_super *super = logfs_super(sb); - u8 gc_level = (__force u8)__gc_level; - - switch (gc_level) { - case 0: /* fall through */ - case 1: /* fall through */ - case 2: /* fall through */ - case 3: - /* file data or indirect blocks */ - return super->s_ifile_levels + super->s_iblock_levels - gc_level; - case 6: /* fall through */ - case 7: /* fall through */ - case 8: /* fall through */ - case 9: - /* inode file data or indirect blocks */ - return super->s_ifile_levels - (gc_level - 6); - default: - printk(KERN_ERR"LOGFS: segment of unknown level %x found\n", - gc_level); - WARN_ON(1); - return super->s_ifile_levels + super->s_iblock_levels; - } -} - -static int segment_is_reserved(struct super_block *sb, u32 segno) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_area *area; - void *reserved; - int i; - - /* Some segments are reserved. Just pretend they were all valid */ - reserved = btree_lookup32(&super->s_reserved_segments, segno); - if (reserved) - return 1; - - /* Currently open segments */ - for_each_area(i) { - area = super->s_area[i]; - if (area->a_is_open && area->a_segno == segno) - return 1; - } - - return 0; -} - -static void logfs_mark_segment_bad(struct super_block *sb, u32 segno) -{ - BUG(); -} - -/* - * Returns the bytes consumed by valid objects in this segment. Object headers - * are counted, the segment header is not. 
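For orientation, the segfile entry that logfs_valid_bytes() below decodes packs two values into a single 32-bit field. A standalone sketch of that packing (plain C, not from the original source; the BADSEG/RESERVED special cases are omitted):

#include <stdint.h>

/* ec_level layout: erase count in bits 31..4, GC level in bits 3..0 */
static inline uint32_t pack_ec_level(uint32_t ec, uint8_t gc_level)
{
        return (ec << 4) | (gc_level & 0xf);
}

static inline void unpack_ec_level(uint32_t ec_level, uint32_t *ec,
                                   uint8_t *gc_level)
{
        *ec = ec_level >> 4;
        *gc_level = ec_level & 0xf;
}
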
- */ -static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec, - gc_level_t *gc_level) -{ - struct logfs_segment_entry se; - u32 ec_level; - - logfs_get_segment_entry(sb, segno, &se); - if (se.ec_level == cpu_to_be32(BADSEG) || - se.valid == cpu_to_be32(RESERVED)) - return RESERVED; - - ec_level = be32_to_cpu(se.ec_level); - *ec = ec_level >> 4; - *gc_level = GC_LEVEL(ec_level & 0xf); - return be32_to_cpu(se.valid); -} - -static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino, - u64 bix, gc_level_t gc_level) -{ - struct inode *inode; - int err, cookie; - - inode = logfs_safe_iget(sb, ino, &cookie); - err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0); - BUG_ON(err); - logfs_safe_iput(inode, cookie); -} - -static u32 logfs_gc_segment(struct super_block *sb, u32 segno) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_segment_header sh; - struct logfs_object_header oh; - u64 ofs, ino, bix; - u32 seg_ofs, logical_segno, cleaned = 0; - int err, len, valid; - gc_level_t gc_level; - - LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb); - - btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS); - err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); - BUG_ON(err); - gc_level = GC_LEVEL(sh.level); - logical_segno = be32_to_cpu(sh.segno); - if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) { - logfs_mark_segment_bad(sb, segno); - cleaned = -1; - goto out; - } - - for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE; - seg_ofs + sizeof(oh) < super->s_segsize; ) { - ofs = dev_ofs(sb, logical_segno, seg_ofs); - err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh), - &oh); - BUG_ON(err); - - if (!memchr_inv(&oh, 0xff, sizeof(oh))) - break; - - if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) { - logfs_mark_segment_bad(sb, segno); - cleaned = super->s_segsize - 1; - goto out; - } - - ino = be64_to_cpu(oh.ino); - bix = be64_to_cpu(oh.bix); - len = sizeof(oh) + be16_to_cpu(oh.len); - valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level); - if (valid == 1) { - logfs_cleanse_block(sb, ofs, ino, bix, gc_level); - cleaned += len; - } else if (valid == 2) { - /* Will be invalid upon journal commit */ - cleaned += len; - } - seg_ofs += len; - } -out: - btree_remove32(&super->s_reserved_segments, segno); - return cleaned; -} - -static struct gc_candidate *add_list(struct gc_candidate *cand, - struct candidate_list *list) -{ - struct rb_node **p = &list->rb_tree.rb_node; - struct rb_node *parent = NULL; - struct gc_candidate *cur; - int comp; - - cand->list = list; - while (*p) { - parent = *p; - cur = rb_entry(parent, struct gc_candidate, rb_node); - - if (list->sort_by_ec) - comp = cand->erase_count < cur->erase_count; - else - comp = cand->valid < cur->valid; - - if (comp) - p = &parent->rb_left; - else - p = &parent->rb_right; - } - rb_link_node(&cand->rb_node, parent, p); - rb_insert_color(&cand->rb_node, &list->rb_tree); - - if (list->count <= list->maxcount) { - list->count++; - return NULL; - } - cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node); - rb_erase(&cand->rb_node, &list->rb_tree); - cand->list = NULL; - return cand; -} - -static void remove_from_list(struct gc_candidate *cand) -{ - struct candidate_list *list = cand->list; - - rb_erase(&cand->rb_node, &list->rb_tree); - list->count--; -} - -static void free_candidate(struct super_block *sb, struct gc_candidate *cand) -{ - struct logfs_super *super = logfs_super(sb); - - btree_remove32(&super->s_cand_tree, cand->segno); - kfree(cand); -} - -u32 
get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec) -{ - struct gc_candidate *cand; - u32 segno; - - BUG_ON(list->count == 0); - - cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); - remove_from_list(cand); - segno = cand->segno; - if (ec) - *ec = cand->erase_count; - free_candidate(sb, cand); - return segno; -} - -/* - * We have several lists to manage segments with. The reserve_list is used to - * deal with bad blocks. We try to keep the best (lowest ec) segments on this - * list. - * The free_list contains free segments for normal usage. It usually gets the - * second pick after the reserve_list. But when the free_list is running short - * it is more important to keep the free_list full than to keep a reserve. - * - * Segments that are not free are put onto a per-level low_list. If we have - * to run garbage collection, we pick a candidate from there. All segments on - * those lists should have at least some free space so GC will make progress. - * - * And last we have the ec_list, which is used to pick segments for wear - * leveling. - * - * If all appropriate lists are full, we simply free the candidate and forget - * about that segment for a while. We have better candidates for each purpose. - */ -static void __add_candidate(struct super_block *sb, struct gc_candidate *cand) -{ - struct logfs_super *super = logfs_super(sb); - u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE; - - if (cand->valid == 0) { - /* 100% free segments */ - log_gc_noisy("add reserve segment %x (ec %x) at %llx\n", - cand->segno, cand->erase_count, - dev_ofs(sb, cand->segno, 0)); - cand = add_list(cand, &super->s_reserve_list); - if (cand) { - log_gc_noisy("add free segment %x (ec %x) at %llx\n", - cand->segno, cand->erase_count, - dev_ofs(sb, cand->segno, 0)); - cand = add_list(cand, &super->s_free_list); - } - } else { - /* good candidates for Garbage Collection */ - if (cand->valid < full) - cand = add_list(cand, &super->s_low_list[cand->dist]); - /* good candidates for wear leveling, - * segments that were recently written get ignored */ - if (cand) - cand = add_list(cand, &super->s_ec_list); - } - if (cand) - free_candidate(sb, cand); -} - -static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec, - u8 dist) -{ - struct logfs_super *super = logfs_super(sb); - struct gc_candidate *cand; - - cand = kmalloc(sizeof(*cand), GFP_NOFS); - if (!cand) - return -ENOMEM; - - cand->segno = segno; - cand->valid = valid; - cand->erase_count = ec; - cand->dist = dist; - - btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS); - __add_candidate(sb, cand); - return 0; -} - -static void remove_segment_from_lists(struct super_block *sb, u32 segno) -{ - struct logfs_super *super = logfs_super(sb); - struct gc_candidate *cand; - - cand = btree_lookup32(&super->s_cand_tree, segno); - if (cand) { - remove_from_list(cand); - free_candidate(sb, cand); - } -} - -static void scan_segment(struct super_block *sb, u32 segno) -{ - u32 valid, ec = 0; - gc_level_t gc_level = 0; - u8 dist; - - if (segment_is_reserved(sb, segno)) - return; - - remove_segment_from_lists(sb, segno); - valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); - if (valid == RESERVED) - return; - - dist = root_distance(sb, gc_level); - add_candidate(sb, segno, valid, ec, dist); -} - -static struct gc_candidate *first_in_list(struct candidate_list *list) -{ - if (list->count == 0) - return NULL; - return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node); -} - -/* - * Find the 
best segment for garbage collection. Main criterion is - * the segment requiring the least effort to clean. Secondary - * criterion is to GC on the lowest level available. - * - * So we search the least effort segment on the lowest level first, - * then move up and pick another segment iff it requires significantly - * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison. - */ -static struct gc_candidate *get_candidate(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int i, max_dist; - struct gc_candidate *cand = NULL, *this; - - max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1); - - for (i = max_dist; i >= 0; i--) { - this = first_in_list(&super->s_low_list[i]); - if (!this) - continue; - if (!cand) - cand = this; - if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid) - cand = this; - } - return cand; -} - -static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand) -{ - struct logfs_super *super = logfs_super(sb); - gc_level_t gc_level; - u32 cleaned, valid, segno, ec; - u8 dist; - - if (!cand) { - log_gc("GC attempted, but no candidate found\n"); - return 0; - } - - segno = cand->segno; - dist = cand->dist; - valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); - free_candidate(sb, cand); - log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n", - segno, (u64)segno << super->s_segshift, - dist, no_free_segments(sb), valid, - super->s_free_bytes); - cleaned = logfs_gc_segment(sb, segno); - log_gc("GC segment #%02x complete - now %x valid\n", segno, - valid - cleaned); - BUG_ON(cleaned != valid); - return 1; -} - -static int logfs_gc_once(struct super_block *sb) -{ - struct gc_candidate *cand; - - cand = get_candidate(sb); - if (cand) - remove_from_list(cand); - return __logfs_gc_once(sb, cand); -} - -/* returns 1 if a wrap occurs, 0 otherwise */ -static int logfs_scan_some(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - u32 segno; - int i, ret = 0; - - segno = super->s_sweeper; - for (i = SCAN_RATIO; i > 0; i--) { - segno++; - if (segno >= super->s_no_segs) { - segno = 0; - ret = 1; - /* Break out of the loop. We want to read a single - * block from the segment size on next invocation if - * SCAN_RATIO is set to match block size - */ - break; - } - - scan_segment(sb, segno); - } - super->s_sweeper = segno; - return ret; -} - -/* - * In principle, this function should loop forever, looking for GC candidates - * and moving data. LogFS is designed in such a way that this loop is - * guaranteed to terminate. - * - * Limiting the loop to some iterations serves purely to catch cases when - * these guarantees have failed. An actual endless loop is an obvious bug - * and should be reported as such. - */ -static void __logfs_gc_pass(struct super_block *sb, int target) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_block *block; - int round, progress, last_progress = 0; - - /* - * Doing too many changes to the segfile at once would result - * in a large number of aliases. Write the journal before - * things get out of hand.
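Stepping back to get_candidate() above, its selection rule is worth seeing in isolation: a later-scanned candidate only displaces the incumbent when it is cheaper to clean by at least one maximum-sized object. A self-contained sketch (plain C, not original code; MAX_OBJECTSIZE and the array argument are stand-ins for LOGFS_MAX_OBJECTSIZE and the per-level low lists):

#include <stdint.h>
#include <stddef.h>

#define MAX_OBJECTSIZE 4096     /* stand-in for LOGFS_MAX_OBJECTSIZE */

struct cand { uint32_t valid; };        /* bytes that must be moved */

static struct cand *pick_candidate(struct cand *best_per_level[], int max_dist)
{
        struct cand *best = NULL;

        for (int i = max_dist; i >= 0; i--) {
                struct cand *c = best_per_level[i];

                if (!c)
                        continue;
                /* switch levels only for a clear win, mirroring the
                 * "significantly less effort" rule quoted above */
                if (!best || c->valid + MAX_OBJECTSIZE <= best->valid)
                        best = c;
        }
        return best;
}
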
- */ - if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES) - logfs_write_anchor(sb); - - if (no_free_segments(sb) >= target && - super->s_no_object_aliases < MAX_OBJ_ALIASES) - return; - - log_gc("__logfs_gc_pass(%x)\n", target); - for (round = 0; round < SCAN_ROUNDS; ) { - if (no_free_segments(sb) >= target) - goto write_alias; - - /* Sync in-memory state with on-medium state in case they - * diverged */ - logfs_write_anchor(sb); - round += logfs_scan_some(sb); - if (no_free_segments(sb) >= target) - goto write_alias; - progress = logfs_gc_once(sb); - if (progress) - last_progress = round; - else if (round - last_progress > 2) - break; - continue; - - /* - * The goto logic is nasty, I just don't know a better way to - * code it. GC is supposed to ensure two things: - * 1. Enough free segments are available. - * 2. The number of aliases is bounded. - * When 1. is achieved, we take a look at 2. and write back - * some alias-containing blocks, if necessary. However, after - * each such write we need to go back to 1., as writes can - * consume free segments. - */ -write_alias: - if (super->s_no_object_aliases < MAX_OBJ_ALIASES) - return; - if (list_empty(&super->s_object_alias)) { - /* All aliases are still in btree */ - return; - } - log_gc("Write back one alias\n"); - block = list_entry(super->s_object_alias.next, - struct logfs_block, alias_list); - block->ops->write_block(block); - /* - * To round off the nasty goto logic, we reset round here. It - * is a safety-net for GC not making any progress and limited - * to something reasonably small. If it were incremented for every - * single alias, the loop could terminate rather quickly. - */ - round = 0; - } - LOGFS_BUG(sb); -} - -static int wl_ratelimit(struct super_block *sb, u64 *next_event) -{ - struct logfs_super *super = logfs_super(sb); - - if (*next_event < super->s_gec) { - *next_event = super->s_gec + WL_RATELIMIT; - return 0; - } - return 1; -} - -static void logfs_wl_pass(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct gc_candidate *wl_cand, *free_cand; - - if (wl_ratelimit(sb, &super->s_wl_gec_ostore)) - return; - - wl_cand = first_in_list(&super->s_ec_list); - if (!wl_cand) - return; - free_cand = first_in_list(&super->s_free_list); - if (!free_cand) - return; - - if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) { - remove_from_list(wl_cand); - __logfs_gc_once(sb, wl_cand); - } -} - -/* - * The journal needs wear leveling as well. But moving the journal is an - * expensive operation so we try to avoid it as much as possible. And if we - * have to do it, we move the whole journal, not individual segments. - * - * Ratelimiting is not strictly necessary here, it mainly serves to avoid the - * calculations. First we check whether moving the journal would be a - * significant improvement. That means that a) the current journal segments - * have more wear than the future journal segments and b) the current journal - * segments have more wear than normal ostore segments. - * Rationale for b) is that we don't have to move the journal if it is aging - * less than the ostore, even if the reserve segments age even less (they are - * excluded from wear leveling, after all). - * Next we check that the superblocks have less wear than the journal. Since - * moving the journal requires writing the superblocks, we have to protect the - * superblocks even more than the journal. - * - * Also we double the acceptable wear difference, compared to ostore wear - * leveling.
Journal data is read and rewritten rapidly, comparatively. So - * soft errors have much less time to accumulate and we allow the journal to - * be a bit worse than the ostore. - */ -static void logfs_journal_wl_pass(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct gc_candidate *cand; - u32 min_journal_ec = -1, max_reserve_ec = 0; - int i; - - if (wl_ratelimit(sb, &super->s_wl_gec_journal)) - return; - - if (super->s_reserve_list.count < super->s_no_journal_segs) { - /* Reserve is not full enough to move complete journal */ - return; - } - - journal_for_each(i) - if (super->s_journal_seg[i]) - min_journal_ec = min(min_journal_ec, - super->s_journal_ec[i]); - cand = rb_entry(rb_first(&super->s_free_list.rb_tree), - struct gc_candidate, rb_node); - max_reserve_ec = cand->erase_count; - for (i = 0; i < 2; i++) { - struct logfs_segment_entry se; - u32 segno = seg_no(sb, super->s_sb_ofs[i]); - u32 ec; - - logfs_get_segment_entry(sb, segno, &se); - ec = be32_to_cpu(se.ec_level) >> 4; - max_reserve_ec = max(max_reserve_ec, ec); - } - - if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) { - do_logfs_journal_wl_pass(sb); - } -} - -void logfs_gc_pass(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - - //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex)); - /* Write journal before free space is getting saturated with dirty - * objects. - */ - if (super->s_dirty_used_bytes + super->s_dirty_free_bytes - + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes) - logfs_write_anchor(sb); - __logfs_gc_pass(sb, super->s_total_levels); - logfs_wl_pass(sb); - logfs_journal_wl_pass(sb); -} - -static int check_area(struct super_block *sb, int i) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_area *area = super->s_area[i]; - gc_level_t gc_level; - u32 cleaned, valid, ec; - u32 segno = area->a_segno; - u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); - - if (!area->a_is_open) - return 0; - - if (super->s_devops->can_write_buf(sb, ofs) == 0) - return 0; - - printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs); - /* - * The device cannot write back the write buffer. Most likely the - * wbuf was already written out and the system crashed at some point - * before the journal commit happened. In that case we wouldn't have - * to do anything. But if the crash happened before the wbuf was - * written out correctly, we must GC this segment. So assume the - * worst and always do the GC run. 
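The two comparisons that gate a journal move above reduce to a few lines. A sketch (plain C, not original code) of wl_ratelimit() and the doubled-delta test from logfs_journal_wl_pass():

#include <stdint.h>
#include <stdbool.h>

#define WL_DELTA     397
#define WL_RATELIMIT 100

/* at most one wear-level event per WL_RATELIMIT global erase counts */
static bool wl_allowed(uint64_t gec, uint64_t *next_event)
{
        if (*next_event < gec) {
                *next_event = gec + WL_RATELIMIT;
                return true;
        }
        return false;
}

/* move the journal only when even its least-worn segment is far ahead
 * of the most-worn reserve/superblock segment */
static bool journal_needs_move(uint32_t min_journal_ec,
                               uint32_t max_reserve_ec)
{
        return min_journal_ec > max_reserve_ec + 2 * WL_DELTA;
}
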
- */ - area->a_is_open = 0; - valid = logfs_valid_bytes(sb, segno, &ec, &gc_level); - cleaned = logfs_gc_segment(sb, segno); - if (cleaned != valid) - return -EIO; - return 0; -} - -int logfs_check_areas(struct super_block *sb) -{ - int i, err; - - for_each_area(i) { - err = check_area(sb, i); - if (err) - return err; - } - return 0; -} - -static void logfs_init_candlist(struct candidate_list *list, int maxcount, - int sort_by_ec) -{ - list->count = 0; - list->maxcount = maxcount; - list->sort_by_ec = sort_by_ec; - list->rb_tree = RB_ROOT; -} - -int logfs_init_gc(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int i; - - btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool); - logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1); - logfs_init_candlist(&super->s_reserve_list, - super->s_bad_seg_reserve, 1); - for_each_area(i) - logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0); - logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1); - return 0; -} - -static void logfs_cleanup_list(struct super_block *sb, - struct candidate_list *list) -{ - struct gc_candidate *cand; - - while (list->count) { - cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate, - rb_node); - remove_from_list(cand); - free_candidate(sb, cand); - } - BUG_ON(list->rb_tree.rb_node); -} - -void logfs_cleanup_gc(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int i; - - if (!super->s_free_list.count) - return; - - /* - * FIXME: The btree may still contain a single empty node. So we - * call the grim visitor to clean up that mess. Btree code should - * do it for us, really. - */ - btree_grim_visitor32(&super->s_cand_tree, 0, NULL); - logfs_cleanup_list(sb, &super->s_free_list); - logfs_cleanup_list(sb, &super->s_reserve_list); - for_each_area(i) - logfs_cleanup_list(sb, &super->s_low_list[i]); - logfs_cleanup_list(sb, &super->s_ec_list); -} diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c deleted file mode 100644 index f440a1525da8..000000000000 --- a/fs/logfs/inode.c +++ /dev/null @@ -1,428 +0,0 @@ -/* - * fs/logfs/inode.c - inode handling code - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/slab.h> -#include <linux/writeback.h> -#include <linux/backing-dev.h> - -/* - * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes - * on the medium. It therefore also lacks a method to store the previous - * generation number for deleted inodes. Instead a single generation number - * is stored which will be used for new inodes. Being just a 32bit counter, - * this can obviously wrap relatively quickly. So we only reuse inodes if we - * know that a fair number of inodes can be created before we have to increment - * the generation again - effectively adding some bits to the counter. - * But being too aggressive here means we keep a very large and very sparse - * inode file, wasting space on indirect blocks. - * So what is a good value? Beats me. 64k seems moderately bad on both - * fronts, so let's use that for now... - * - * NFS sucks, as everyone already knows. - */ -#define INOS_PER_WRAP (0x10000) - -/* - * Logfs' requirement to read inodes for garbage collection makes life a bit - * harder. GC may have to read inodes that are in I_FREEING state, when they - * are being written out - and waiting for GC to make progress, naturally.
- * - * So we cannot just call iget() or some variant of it, but first have to check - * whether the inode in question might be in I_FREEING state. Therefore we - * maintain our own per-sb list of "almost deleted" inodes and check against - * that list first. Normally this should be at most 1-2 entries long. - * - * Also, inodes have logfs-specific reference counting on top of what the vfs - * does. When .destroy_inode is called, normally the reference count will drop - * to zero and the inode gets deleted. But if GC accessed the inode, its - * refcount will remain nonzero and final deletion will have to wait. - * - * As a result we have two sets of functions to get/put inodes: - * logfs_safe_iget/logfs_safe_iput - safe to call from GC context - * logfs_iget/iput - normal version - */ -static struct kmem_cache *logfs_inode_cache; - -static DEFINE_SPINLOCK(logfs_inode_lock); - -static void logfs_inode_setops(struct inode *inode) -{ - switch (inode->i_mode & S_IFMT) { - case S_IFDIR: - inode->i_op = &logfs_dir_iops; - inode->i_fop = &logfs_dir_fops; - inode->i_mapping->a_ops = &logfs_reg_aops; - break; - case S_IFREG: - inode->i_op = &logfs_reg_iops; - inode->i_fop = &logfs_reg_fops; - inode->i_mapping->a_ops = &logfs_reg_aops; - break; - case S_IFLNK: - inode->i_op = &page_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &logfs_reg_aops; - break; - case S_IFSOCK: /* fall through */ - case S_IFBLK: /* fall through */ - case S_IFCHR: /* fall through */ - case S_IFIFO: - init_special_inode(inode, inode->i_mode, inode->i_rdev); - break; - default: - BUG(); - } -} - -static struct inode *__logfs_iget(struct super_block *sb, ino_t ino) -{ - struct inode *inode = iget_locked(sb, ino); - int err; - - if (!inode) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) - return inode; - - err = logfs_read_inode(inode); - if (err || inode->i_nlink == 0) { - /* inode->i_nlink == 0 can be true when called from - * block validator */ - /* set i_nlink to 0 to prevent caching */ - clear_nlink(inode); - logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE; - iget_failed(inode); - if (!err) - err = -ENOENT; - return ERR_PTR(err); - } - - logfs_inode_setops(inode); - unlock_new_inode(inode); - return inode; -} - -struct inode *logfs_iget(struct super_block *sb, ino_t ino) -{ - BUG_ON(ino == LOGFS_INO_MASTER); - BUG_ON(ino == LOGFS_INO_SEGFILE); - return __logfs_iget(sb, ino); -} - -/* - * is_cached is set to 1 if we hand out a cached inode, 0 otherwise. 
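The freeing-list lookup that implements this is easy to model outside the kernel. A sketch (plain C; the list shape and field names are stand-ins for the logfs_inode fields walked by logfs_safe_iget() below, and the caller is assumed to hold the protecting lock):

#include <stdint.h>
#include <stddef.h>

struct freeing_inode {
        uint64_t ino;
        int refcount;                   /* logfs-private, on top of the VFS */
        struct freeing_inode *next;
};

/* Walk the per-sb "almost deleted" list before trying a normal iget. */
static struct freeing_inode *lookup_freeing(struct freeing_inode *head,
                                            uint64_t ino, int *is_cached)
{
        for (struct freeing_inode *li = head; li; li = li->next) {
                if (li->ino == ino) {
                        li->refcount++; /* dropped again by the safe iput */
                        *is_cached = 1;
                        return li;
                }
        }
        *is_cached = 0;                 /* caller falls back to iget */
        return NULL;
}
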
- * this allows logfs_iput to do the right thing later - */ -struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_inode *li; - - if (ino == LOGFS_INO_MASTER) - return super->s_master_inode; - if (ino == LOGFS_INO_SEGFILE) - return super->s_segfile_inode; - - spin_lock(&logfs_inode_lock); - list_for_each_entry(li, &super->s_freeing_list, li_freeing_list) - if (li->vfs_inode.i_ino == ino) { - li->li_refcount++; - spin_unlock(&logfs_inode_lock); - *is_cached = 1; - return &li->vfs_inode; - } - spin_unlock(&logfs_inode_lock); - - *is_cached = 0; - return __logfs_iget(sb, ino); -} - -static void logfs_i_callback(struct rcu_head *head) -{ - struct inode *inode = container_of(head, struct inode, i_rcu); - kmem_cache_free(logfs_inode_cache, logfs_inode(inode)); -} - -static void __logfs_destroy_inode(struct inode *inode) -{ - struct logfs_inode *li = logfs_inode(inode); - - BUG_ON(li->li_block); - list_del(&li->li_freeing_list); - call_rcu(&inode->i_rcu, logfs_i_callback); -} - -static void __logfs_destroy_meta_inode(struct inode *inode) -{ - struct logfs_inode *li = logfs_inode(inode); - BUG_ON(li->li_block); - call_rcu(&inode->i_rcu, logfs_i_callback); -} - -static void logfs_destroy_inode(struct inode *inode) -{ - struct logfs_inode *li = logfs_inode(inode); - - if (inode->i_ino < LOGFS_RESERVED_INOS) { - /* - * The reserved inodes are never destroyed unless we are in - * the unmount path. - */ - __logfs_destroy_meta_inode(inode); - return; - } - - BUG_ON(list_empty(&li->li_freeing_list)); - spin_lock(&logfs_inode_lock); - li->li_refcount--; - if (li->li_refcount == 0) - __logfs_destroy_inode(inode); - spin_unlock(&logfs_inode_lock); -} - -void logfs_safe_iput(struct inode *inode, int is_cached) -{ - if (inode->i_ino == LOGFS_INO_MASTER) - return; - if (inode->i_ino == LOGFS_INO_SEGFILE) - return; - - if (is_cached) { - logfs_destroy_inode(inode); - return; - } - - iput(inode); -} - -static void logfs_init_inode(struct super_block *sb, struct inode *inode) -{ - struct logfs_inode *li = logfs_inode(inode); - int i; - - li->li_flags = 0; - li->li_height = 0; - li->li_used_bytes = 0; - li->li_block = NULL; - i_uid_write(inode, 0); - i_gid_write(inode, 0); - inode->i_size = 0; - inode->i_blocks = 0; - inode->i_ctime = current_time(inode); - inode->i_mtime = current_time(inode); - li->li_refcount = 1; - INIT_LIST_HEAD(&li->li_freeing_list); - - for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) - li->li_data[i] = 0; - - return; -} - -static struct inode *logfs_alloc_inode(struct super_block *sb) -{ - struct logfs_inode *li; - - li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS); - if (!li) - return NULL; - logfs_init_inode(sb, &li->vfs_inode); - return &li->vfs_inode; -} - -/* - * In logfs inodes are written to an inode file. The inode file, like any - * other file, is managed with an inode. The inode file's inode, aka master - * inode, requires special handling in several respects. First, it cannot be - * written to the inode file, so it is stored in the journal instead. - * - * Secondly, this inode cannot be written back and destroyed before all other - * inodes have been written. The ordering is important. Linux' VFS is happily - * unaware of the ordering constraint and would ordinarily destroy the master - * inode at umount time while other inodes are still in use and dirty. Not - * good. - * - * So logfs makes sure the master inode is not written until all other inodes - * have been destroyed.
Sadly, this method has another side-effect. The VFS - * will notice one remaining inode and print a frightening warning message. - * Worse, it is impossible to judge whether such a warning was caused by the - * master inode or whether other inodes have leaked as well. - * - * Our attempt at solving this is with logfs_new_meta_inode() below. Its - * purpose is to create a new inode that will not trigger the warning if such - * an inode is still in use. An ugly hack, no doubt. Suggestions for - * improvement are welcome. - * - * AV: that's what ->put_super() is for... - */ -struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino) -{ - struct inode *inode; - - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - - inode->i_mode = S_IFREG; - inode->i_ino = ino; - inode->i_data.a_ops = &logfs_reg_aops; - mapping_set_gfp_mask(&inode->i_data, GFP_NOFS); - - return inode; -} - -struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino) -{ - struct inode *inode; - int err; - - inode = logfs_new_meta_inode(sb, ino); - if (IS_ERR(inode)) - return inode; - - err = logfs_read_inode(inode); - if (err) { - iput(inode); - return ERR_PTR(err); - } - logfs_inode_setops(inode); - return inode; -} - -static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - int ret; - long flags = WF_LOCK; - - /* Can only happen if creat() failed. Safe to skip. */ - if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN) - return 0; - - ret = __logfs_write_inode(inode, NULL, flags); - LOGFS_BUG_ON(ret, inode->i_sb); - return ret; -} - -/* called with inode->i_lock held */ -static int logfs_drop_inode(struct inode *inode) -{ - struct logfs_super *super = logfs_super(inode->i_sb); - struct logfs_inode *li = logfs_inode(inode); - - spin_lock(&logfs_inode_lock); - list_move(&li->li_freeing_list, &super->s_freeing_list); - spin_unlock(&logfs_inode_lock); - return generic_drop_inode(inode); -} - -static void logfs_set_ino_generation(struct super_block *sb, - struct inode *inode) -{ - struct logfs_super *super = logfs_super(sb); - u64 ino; - - mutex_lock(&super->s_journal_mutex); - ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino + 1); - super->s_last_ino = ino; - super->s_inos_till_wrap--; - if (super->s_inos_till_wrap < 0) { - super->s_last_ino = LOGFS_RESERVED_INOS; - super->s_generation++; - super->s_inos_till_wrap = INOS_PER_WRAP; - } - inode->i_ino = ino; - inode->i_generation = super->s_generation; - mutex_unlock(&super->s_journal_mutex); -} - -struct inode *logfs_new_inode(struct inode *dir, umode_t mode) -{ - struct super_block *sb = dir->i_sb; - struct inode *inode; - - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - - logfs_init_inode(sb, inode); - - /* inherit parent flags */ - logfs_inode(inode)->li_flags |= - logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED; - - inode->i_mode = mode; - logfs_set_ino_generation(sb, inode); - - inode_init_owner(inode, dir, mode); - logfs_inode_setops(inode); - insert_inode_hash(inode); - - return inode; -} - -static void logfs_init_once(void *_li) -{ - struct logfs_inode *li = _li; - int i; - - li->li_flags = 0; - li->li_used_bytes = 0; - li->li_refcount = 1; - for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) - li->li_data[i] = 0; - inode_init_once(&li->vfs_inode); -} - -static int logfs_sync_fs(struct super_block *sb, int wait) -{ - logfs_get_wblocks(sb, NULL, WF_LOCK); - logfs_write_anchor(sb); - logfs_put_wblocks(sb, NULL, WF_LOCK); - return 0; -} - -static void logfs_put_super(struct super_block *sb)
-{ - struct logfs_super *super = logfs_super(sb); - /* kill the meta-inodes */ - iput(super->s_segfile_inode); - iput(super->s_master_inode); - iput(super->s_mapping_inode); -} - -const struct super_operations logfs_super_operations = { - .alloc_inode = logfs_alloc_inode, - .destroy_inode = logfs_destroy_inode, - .evict_inode = logfs_evict_inode, - .drop_inode = logfs_drop_inode, - .put_super = logfs_put_super, - .write_inode = logfs_write_inode, - .statfs = logfs_statfs, - .sync_fs = logfs_sync_fs, -}; - -int logfs_init_inode_cache(void) -{ - logfs_inode_cache = kmem_cache_create("logfs_inode_cache", - sizeof(struct logfs_inode), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, - logfs_init_once); - if (!logfs_inode_cache) - return -ENOMEM; - return 0; -} - -void logfs_destroy_inode_cache(void) -{ - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - kmem_cache_destroy(logfs_inode_cache); -} diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c deleted file mode 100644 index 2a09b8d73989..000000000000 --- a/fs/logfs/journal.c +++ /dev/null @@ -1,894 +0,0 @@ -/* - * fs/logfs/journal.c - journal handling code - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - */ -#include "logfs.h" -#include <linux/slab.h> - -static void logfs_calc_free(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - u64 reserve, no_segs = super->s_no_segs; - s64 free; - int i; - - /* superblock segments */ - no_segs -= 2; - super->s_no_journal_segs = 0; - /* journal */ - journal_for_each(i) - if (super->s_journal_seg[i]) { - no_segs--; - super->s_no_journal_segs++; - } - - /* open segments plus one extra per level for GC */ - no_segs -= 2 * super->s_total_levels; - - free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE); - free -= super->s_used_bytes; - /* just a bit extra */ - free -= super->s_total_levels * 4096; - - /* Bad blocks are 'paid' for with speed reserve - the filesystem - * simply gets slower as bad blocks accumulate. Until the bad blocks - * exceed the speed reserve - then the filesystem gets smaller. 
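logfs_calc_free() here is a pure function at heart. A sketch of its arithmetic (plain C, not original code; seg_payload stands for segsize minus LOGFS_SEGMENT_RESERVE, and both reserves arrive precomputed in bytes):

#include <stdint.h>

static uint64_t calc_free_bytes(uint64_t no_segs, uint64_t seg_payload,
                                uint64_t journal_segs, uint64_t total_levels,
                                uint64_t used_bytes,
                                uint64_t bad_reserve_bytes,
                                uint64_t speed_reserve_bytes)
{
        /* the larger of the bad-block and speed reserves is withheld */
        uint64_t reserve = bad_reserve_bytes > speed_reserve_bytes ?
                                bad_reserve_bytes : speed_reserve_bytes;
        int64_t free;

        /* 2 superblock segments, the journal, 2 open segments per level */
        no_segs -= 2 + journal_segs + 2 * total_levels;

        free = (int64_t)(no_segs * seg_payload);
        free -= (int64_t)(used_bytes + total_levels * 4096 + reserve);

        return free < 0 ? 0 : (uint64_t)free;   /* clamp, as the code does */
}
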
- */ - reserve = super->s_bad_segments + super->s_bad_seg_reserve; - reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE; - reserve = max(reserve, super->s_speed_reserve); - free -= reserve; - if (free < 0) - free = 0; - - super->s_free_bytes = free; -} - -static void reserve_sb_and_journal(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct btree_head32 *head = &super->s_reserved_segments; - int i, err; - - err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1, - GFP_KERNEL); - BUG_ON(err); - - err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1, - GFP_KERNEL); - BUG_ON(err); - - journal_for_each(i) { - if (!super->s_journal_seg[i]) - continue; - err = btree_insert32(head, super->s_journal_seg[i], (void *)1, - GFP_KERNEL); - BUG_ON(err); - } -} - -static void read_dynsb(struct super_block *sb, - struct logfs_je_dynsb *dynsb) -{ - struct logfs_super *super = logfs_super(sb); - - super->s_gec = be64_to_cpu(dynsb->ds_gec); - super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper); - super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino); - super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir); - super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos); - super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes); - super->s_generation = be32_to_cpu(dynsb->ds_generation); -} - -static void read_anchor(struct super_block *sb, - struct logfs_je_anchor *da) -{ - struct logfs_super *super = logfs_super(sb); - struct inode *inode = super->s_master_inode; - struct logfs_inode *li = logfs_inode(inode); - int i; - - super->s_last_ino = be64_to_cpu(da->da_last_ino); - li->li_flags = 0; - li->li_height = da->da_height; - i_size_write(inode, be64_to_cpu(da->da_size)); - li->li_used_bytes = be64_to_cpu(da->da_used_bytes); - - for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) - li->li_data[i] = be64_to_cpu(da->da_data[i]); -} - -static void read_erasecount(struct super_block *sb, - struct logfs_je_journal_ec *ec) -{ - struct logfs_super *super = logfs_super(sb); - int i; - - journal_for_each(i) - super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]); -} - -static int read_area(struct super_block *sb, struct logfs_je_area *a) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_area *area = super->s_area[a->gc_level]; - u64 ofs; - u32 writemask = ~(super->s_writesize - 1); - - if (a->gc_level >= LOGFS_NO_AREAS) - return -EIO; - if (a->vim != VIM_DEFAULT) - return -EIO; /* TODO: close area and continue */ - - area->a_used_bytes = be32_to_cpu(a->used_bytes); - area->a_written_bytes = area->a_used_bytes & writemask; - area->a_segno = be32_to_cpu(a->segno); - if (area->a_segno) - area->a_is_open = 1; - - ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); - if (super->s_writesize > 1) - return logfs_buf_recover(area, ofs, a + 1, super->s_writesize); - else - return logfs_buf_recover(area, ofs, NULL, 0); -} - -static void *unpack(void *from, void *to) -{ - struct logfs_journal_header *jh = from; - void *data = from + sizeof(struct logfs_journal_header); - int err; - size_t inlen, outlen; - - inlen = be16_to_cpu(jh->h_len); - outlen = be16_to_cpu(jh->h_datalen); - - if (jh->h_compr == COMPR_NONE) - memcpy(to, data, inlen); - else { - err = logfs_uncompress(data, to, inlen, outlen); - BUG_ON(err); - } - return to; -} - -static int __read_je_header(struct super_block *sb, u64 ofs, - struct logfs_journal_header *jh) -{ - struct logfs_super *super = logfs_super(sb); - size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) - + 
MAX_JOURNAL_HEADER; - u16 type, len, datalen; - int err; - - /* read header only */ - err = wbuf_read(sb, ofs, sizeof(*jh), jh); - if (err) - return err; - type = be16_to_cpu(jh->h_type); - len = be16_to_cpu(jh->h_len); - datalen = be16_to_cpu(jh->h_datalen); - if (len > sb->s_blocksize) - return -EIO; - if ((type < JE_FIRST) || (type > JE_LAST)) - return -EIO; - if (datalen > bufsize) - return -EIO; - return 0; -} - -static int __read_je_payload(struct super_block *sb, u64 ofs, - struct logfs_journal_header *jh) -{ - u16 len; - int err; - - len = be16_to_cpu(jh->h_len); - err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1); - if (err) - return err; - if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) { - /* Old code was confused. It forgot about the header length - * and stopped calculating the crc 16 bytes before the end - * of data - ick! - * FIXME: Remove this hack once the old code is fixed. - */ - if (jh->h_crc == logfs_crc32(jh, len, 4)) - WARN_ON_ONCE(1); - else - return -EIO; - } - return 0; -} - -/* - * jh needs to be large enough to hold the complete entry, not just the header - */ -static int __read_je(struct super_block *sb, u64 ofs, - struct logfs_journal_header *jh) -{ - int err; - - err = __read_je_header(sb, ofs, jh); - if (err) - return err; - return __read_je_payload(sb, ofs, jh); -} - -static int read_je(struct super_block *sb, u64 ofs) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_journal_header *jh = super->s_compressed_je; - void *scratch = super->s_je; - u16 type, datalen; - int err; - - err = __read_je(sb, ofs, jh); - if (err) - return err; - type = be16_to_cpu(jh->h_type); - datalen = be16_to_cpu(jh->h_datalen); - - switch (type) { - case JE_DYNSB: - read_dynsb(sb, unpack(jh, scratch)); - break; - case JE_ANCHOR: - read_anchor(sb, unpack(jh, scratch)); - break; - case JE_ERASECOUNT: - read_erasecount(sb, unpack(jh, scratch)); - break; - case JE_AREA: - err = read_area(sb, unpack(jh, scratch)); - break; - case JE_OBJ_ALIAS: - err = logfs_load_object_aliases(sb, unpack(jh, scratch), - datalen); - break; - default: - WARN_ON_ONCE(1); - return -EIO; - } - return err; -} - -static int logfs_read_segment(struct super_block *sb, u32 segno) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_journal_header *jh = super->s_compressed_je; - u64 ofs, seg_ofs = dev_ofs(sb, segno, 0); - u32 h_ofs, last_ofs = 0; - u16 len, datalen, last_len = 0; - int i, err; - - /* search for most recent commit */ - for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) { - ofs = seg_ofs + h_ofs; - err = __read_je_header(sb, ofs, jh); - if (err) - continue; - if (jh->h_type != cpu_to_be16(JE_COMMIT)) - continue; - err = __read_je_payload(sb, ofs, jh); - if (err) - continue; - len = be16_to_cpu(jh->h_len); - datalen = be16_to_cpu(jh->h_datalen); - if ((datalen > sizeof(super->s_je_array)) || - (datalen % sizeof(__be64))) - continue; - last_ofs = h_ofs; - last_len = datalen; - h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh); - } - /* read commit */ - if (last_ofs == 0) - return -ENOENT; - ofs = seg_ofs + last_ofs; - log_journal("Read commit from %llx\n", ofs); - err = __read_je(sb, ofs, jh); - BUG_ON(err); /* We should have caught it in the scan loop already */ - if (err) - return err; - /* uncompress */ - unpack(jh, super->s_je_array); - super->s_no_je = last_len / sizeof(__be64); - /* iterate over array */ - for (i = 0; i < super->s_no_je; i++) { - err = read_je(sb, be64_to_cpu(super->s_je_array[i])); - if (err) - return err; - } - 
super->s_journal_area->a_segno = segno; - return 0; -} - -static u64 read_gec(struct super_block *sb, u32 segno) -{ - struct logfs_segment_header sh; - __be32 crc; - int err; - - if (!segno) - return 0; - err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh); - if (err) - return 0; - crc = logfs_crc32(&sh, sizeof(sh), 4); - if (crc != sh.crc) { - WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull)); - /* Most likely it was just erased */ - return 0; - } - return be64_to_cpu(sh.gec); -} - -static int logfs_read_journal(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - u64 gec[LOGFS_JOURNAL_SEGS], max; - u32 segno; - int i, max_i; - - max = 0; - max_i = -1; - journal_for_each(i) { - segno = super->s_journal_seg[i]; - gec[i] = read_gec(sb, super->s_journal_seg[i]); - if (gec[i] > max) { - max = gec[i]; - max_i = i; - } - } - if (max_i == -1) - return -EIO; - /* FIXME: Try older segments in case of error */ - return logfs_read_segment(sb, super->s_journal_seg[max_i]); -} - -/* - * First search the current segment (outer loop), then pick the next segment - * in the array, skipping any zero entries (inner loop). - */ -static void journal_get_free_segment(struct logfs_area *area) -{ - struct logfs_super *super = logfs_super(area->a_sb); - int i; - - journal_for_each(i) { - if (area->a_segno != super->s_journal_seg[i]) - continue; - - do { - i++; - if (i == LOGFS_JOURNAL_SEGS) - i = 0; - } while (!super->s_journal_seg[i]); - - area->a_segno = super->s_journal_seg[i]; - area->a_erase_count = ++(super->s_journal_ec[i]); - log_journal("Journal now at %x (ec %x)\n", area->a_segno, - area->a_erase_count); - return; - } - BUG(); -} - -static void journal_get_erase_count(struct logfs_area *area) -{ - /* erase count is stored globally and incremented in - * journal_get_free_segment() - nothing to do here */ -} - -static int journal_erase_segment(struct logfs_area *area) -{ - struct super_block *sb = area->a_sb; - union { - struct logfs_segment_header sh; - unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)]; - } u; - u64 ofs; - int err; - - err = logfs_erase_segment(sb, area->a_segno, 1); - if (err) - return err; - - memset(&u, 0, sizeof(u)); - u.sh.pad = 0; - u.sh.type = SEG_JOURNAL; - u.sh.level = 0; - u.sh.segno = cpu_to_be32(area->a_segno); - u.sh.ec = cpu_to_be32(area->a_erase_count); - u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); - u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4); - - /* This causes a bug in segment.c. Not yet. 
*/ - //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0); - - ofs = dev_ofs(sb, area->a_segno, 0); - area->a_used_bytes = sizeof(u); - logfs_buf_write(area, ofs, &u, sizeof(u)); - return 0; -} - -static size_t __logfs_write_header(struct logfs_super *super, - struct logfs_journal_header *jh, size_t len, size_t datalen, - u16 type, u8 compr) -{ - jh->h_len = cpu_to_be16(len); - jh->h_type = cpu_to_be16(type); - jh->h_datalen = cpu_to_be16(datalen); - jh->h_compr = compr; - jh->h_pad[0] = 'H'; - jh->h_pad[1] = 'E'; - jh->h_pad[2] = 'A'; - jh->h_pad[3] = 'D'; - jh->h_pad[4] = 'R'; - jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4); - return ALIGN(len, 16) + sizeof(*jh); -} - -static size_t logfs_write_header(struct logfs_super *super, - struct logfs_journal_header *jh, size_t datalen, u16 type) -{ - size_t len = datalen; - - return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE); -} - -static inline size_t logfs_journal_erasecount_size(struct logfs_super *super) -{ - return LOGFS_JOURNAL_SEGS * sizeof(__be32); -} - -static void *logfs_write_erasecount(struct super_block *sb, void *_ec, - u16 *type, size_t *len) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_je_journal_ec *ec = _ec; - int i; - - journal_for_each(i) - ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]); - *type = JE_ERASECOUNT; - *len = logfs_journal_erasecount_size(super); - return ec; -} - -static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore, - size_t ignore2) -{ - struct logfs_shadow *shadow = _shadow; - struct super_block *sb = (void *)_sb; - struct logfs_super *super = logfs_super(sb); - - /* consume new space */ - super->s_free_bytes -= shadow->new_len; - super->s_used_bytes += shadow->new_len; - super->s_dirty_used_bytes -= shadow->new_len; - - /* free up old space */ - super->s_free_bytes += shadow->old_len; - super->s_used_bytes -= shadow->old_len; - super->s_dirty_free_bytes -= shadow->old_len; - - logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len); - logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len); - - log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n", - shadow->ino, shadow->bix, shadow->gc_level, - shadow->old_ofs, shadow->new_ofs, - shadow->old_len, shadow->new_len); - mempool_free(shadow, super->s_shadow_pool); -} - -static void account_shadows(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct inode *inode = super->s_master_inode; - struct logfs_inode *li = logfs_inode(inode); - struct shadow_tree *tree = &super->s_shadow_tree; - - btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow); - btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow); - btree_grim_visitor32(&tree->segment_map, 0, NULL); - tree->no_shadowed_segments = 0; - - if (li->li_block) { - /* - * We never actually use the structure, when attached to the - * master inode. But it is easier to always free it here than - * to have checks in several places elsewhere when allocating - * it. 
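Each shadow committed by account_shadow() above performs one balanced transfer between the space counters: the new location becomes used for good and the old location finally becomes free. The update in isolation (plain C sketch, not original code):

#include <stdint.h>

struct space_counters {
        uint64_t free_bytes, used_bytes;
        uint64_t dirty_used_bytes, dirty_free_bytes;
};

static void commit_shadow(struct space_counters *s,
                          uint32_t old_len, uint32_t new_len)
{
        /* the rewritten block now occupies its new location for good */
        s->free_bytes       -= new_len;
        s->used_bytes       += new_len;
        s->dirty_used_bytes -= new_len;

        /* and the stale copy at the old location is released */
        s->free_bytes       += old_len;
        s->used_bytes       -= old_len;
        s->dirty_free_bytes -= old_len;
}
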
- */ - li->li_block->ops->free_block(sb, li->li_block); - } - BUG_ON((s64)li->li_used_bytes < 0); -} - -static void *__logfs_write_anchor(struct super_block *sb, void *_da, - u16 *type, size_t *len) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_je_anchor *da = _da; - struct inode *inode = super->s_master_inode; - struct logfs_inode *li = logfs_inode(inode); - int i; - - da->da_height = li->li_height; - da->da_last_ino = cpu_to_be64(super->s_last_ino); - da->da_size = cpu_to_be64(i_size_read(inode)); - da->da_used_bytes = cpu_to_be64(li->li_used_bytes); - for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) - da->da_data[i] = cpu_to_be64(li->li_data[i]); - *type = JE_ANCHOR; - *len = sizeof(*da); - return da; -} - -static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb, - u16 *type, size_t *len) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_je_dynsb *dynsb = _dynsb; - - dynsb->ds_gec = cpu_to_be64(super->s_gec); - dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper); - dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino); - dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir); - dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos); - dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes); - dynsb->ds_generation = cpu_to_be32(super->s_generation); - *type = JE_DYNSB; - *len = sizeof(*dynsb); - return dynsb; -} - -static void write_wbuf(struct super_block *sb, struct logfs_area *area, - void *wbuf) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - u64 ofs; - pgoff_t index; - int page_ofs; - struct page *page; - - ofs = dev_ofs(sb, area->a_segno, - area->a_used_bytes & ~(super->s_writesize - 1)); - index = ofs >> PAGE_SHIFT; - page_ofs = ofs & (PAGE_SIZE - 1); - - page = find_or_create_page(mapping, index, GFP_NOFS); - BUG_ON(!page); - memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize); - unlock_page(page); -} - -static void *logfs_write_area(struct super_block *sb, void *_a, - u16 *type, size_t *len) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_area *area = super->s_area[super->s_sum_index]; - struct logfs_je_area *a = _a; - - a->vim = VIM_DEFAULT; - a->gc_level = super->s_sum_index; - a->used_bytes = cpu_to_be32(area->a_used_bytes); - a->segno = cpu_to_be32(area->a_segno); - if (super->s_writesize > 1) - write_wbuf(sb, area, a + 1); - - *type = JE_AREA; - *len = sizeof(*a) + super->s_writesize; - return a; -} - -static void *logfs_write_commit(struct super_block *sb, void *h, - u16 *type, size_t *len) -{ - struct logfs_super *super = logfs_super(sb); - - *type = JE_COMMIT; - *len = super->s_no_je * sizeof(__be64); - return super->s_je_array; -} - -static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type, - size_t len) -{ - struct logfs_super *super = logfs_super(sb); - void *header = super->s_compressed_je; - void *data = header + sizeof(struct logfs_journal_header); - ssize_t compr_len, pad_len; - u8 compr = COMPR_ZLIB; - - if (len == 0) - return logfs_write_header(super, header, 0, type); - - compr_len = logfs_compress(buf, data, len, sb->s_blocksize); - if (compr_len < 0 || type == JE_ANCHOR) { - memcpy(data, buf, len); - compr_len = len; - compr = COMPR_NONE; - } - - pad_len = ALIGN(compr_len, 16); - memset(data + compr_len, 0, pad_len - compr_len); - - return __logfs_write_header(super, header, compr_len, len, type, compr); -} - -static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes, - int must_pad) -{ 
- u32 writesize = logfs_super(area->a_sb)->s_writesize; - s32 ofs; - int ret; - - ret = logfs_open_area(area, *bytes); - if (ret) - return -EAGAIN; - - ofs = area->a_used_bytes; - area->a_used_bytes += *bytes; - - if (must_pad) { - area->a_used_bytes = ALIGN(area->a_used_bytes, writesize); - *bytes = area->a_used_bytes - ofs; - } - - return dev_ofs(area->a_sb, area->a_segno, ofs); -} - -static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type, - size_t buf_len) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_area *area = super->s_journal_area; - struct logfs_journal_header *jh = super->s_compressed_je; - size_t len; - int must_pad = 0; - s64 ofs; - - len = __logfs_write_je(sb, buf, type, buf_len); - if (jh->h_type == cpu_to_be16(JE_COMMIT)) - must_pad = 1; - - ofs = logfs_get_free_bytes(area, &len, must_pad); - if (ofs < 0) - return ofs; - logfs_buf_write(area, ofs, super->s_compressed_je, len); - BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES); - super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs); - return 0; -} - -static int logfs_write_je(struct super_block *sb, - void* (*write)(struct super_block *sb, void *scratch, - u16 *type, size_t *len)) -{ - void *buf; - size_t len; - u16 type; - - buf = write(sb, logfs_super(sb)->s_je, &type, &len); - return logfs_write_je_buf(sb, buf, type, len); -} - -int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, - level_t level, int child_no, __be64 val) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_obj_alias *oa = super->s_je; - int err = 0, fill = super->s_je_fill; - - log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n", - fill, ino, bix, level, child_no, be64_to_cpu(val)); - oa[fill].ino = cpu_to_be64(ino); - oa[fill].bix = cpu_to_be64(bix); - oa[fill].val = val; - oa[fill].level = (__force u8)level; - oa[fill].child_no = cpu_to_be16(child_no); - fill++; - if (fill >= sb->s_blocksize / sizeof(*oa)) { - err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize); - fill = 0; - } - - super->s_je_fill = fill; - return err; -} - -static int logfs_write_obj_aliases(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int err; - - log_journal("logfs_write_obj_aliases: %d aliases to write\n", - super->s_no_object_aliases); - super->s_je_fill = 0; - err = logfs_write_obj_aliases_pagecache(sb); - if (err) - return err; - - if (super->s_je_fill) - err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS, - super->s_je_fill - * sizeof(struct logfs_obj_alias)); - return err; -} - -/* - * Write all journal entries. The goto logic ensures that all journal entries - * are written whenever a new segment is used. It is ugly and potentially a - * bit wasteful, but robustness is more important. With this we can *always* - * erase all journal segments except the one containing the most recent commit. 
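The commit scheme this relies on is compact: every entry's device offset is appended to an in-memory array, and the commit entry is nothing but that array, so replay can find the newest commit and revisit exactly the offsets it lists. A sketch (plain C, not original code; MAX_ENTRIES is a stand-in for MAX_JOURNAL_ENTRIES):

#include <stdint.h>
#include <stddef.h>

#define MAX_ENTRIES 64          /* stand-in for MAX_JOURNAL_ENTRIES */

struct commit_log {
        uint64_t ofs[MAX_ENTRIES];      /* device offset of each entry */
        size_t count;
};

static int remember_entry(struct commit_log *log, uint64_t dev_ofs)
{
        if (log->count >= MAX_ENTRIES)
                return -1;      /* the kernel code BUG()s here instead */
        log->ofs[log->count++] = dev_ofs;
        return 0;
}

/* replay: locate the newest commit, then revisit log->ofs[0..count) */
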
- */ -void logfs_write_anchor(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_area *area = super->s_journal_area; - int i, err; - - if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY)) - return; - super->s_flags &= ~LOGFS_SB_FLAG_DIRTY; - - BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); - mutex_lock(&super->s_journal_mutex); - - /* Do this first or suffer corruption */ - logfs_sync_segments(sb); - account_shadows(sb); - -again: - super->s_no_je = 0; - for_each_area(i) { - if (!super->s_area[i]->a_is_open) - continue; - super->s_sum_index = i; - err = logfs_write_je(sb, logfs_write_area); - if (err) - goto again; - } - err = logfs_write_obj_aliases(sb); - if (err) - goto again; - err = logfs_write_je(sb, logfs_write_erasecount); - if (err) - goto again; - err = logfs_write_je(sb, __logfs_write_anchor); - if (err) - goto again; - err = logfs_write_je(sb, logfs_write_dynsb); - if (err) - goto again; - /* - * Order is imperative. First we sync all writes, including the - * non-committed journal writes. Then we write the final commit and - * sync the current journal segment. - * There is a theoretical bug here. Syncing the journal segment will - * write a number of journal entries and the final commit. All these - * are written in a single operation. If the device layer writes the - * data back-to-front, the commit will precede the other journal - * entries, leaving a race window. - * Two fixes are possible. Preferred is to fix the device layer to - * ensure writes happen front-to-back. Alternatively we can insert - * another logfs_sync_area() super->s_devops->sync() combo before - * writing the commit. - */ - /* - * On another subject, super->s_devops->sync is usually not necessary. - * Unless called from sys_sync or friends, a barrier would suffice. 
- */ - super->s_devops->sync(sb); - err = logfs_write_je(sb, logfs_write_commit); - if (err) - goto again; - log_journal("Write commit to %llx\n", - be64_to_cpu(super->s_je_array[super->s_no_je - 1])); - logfs_sync_area(area); - BUG_ON(area->a_used_bytes != area->a_written_bytes); - super->s_devops->sync(sb); - - mutex_unlock(&super->s_journal_mutex); - return; -} - -void do_logfs_journal_wl_pass(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_area *area = super->s_journal_area; - struct btree_head32 *head = &super->s_reserved_segments; - u32 segno, ec; - int i, err; - - log_journal("Journal requires wear-leveling.\n"); - /* Drop old segments */ - journal_for_each(i) - if (super->s_journal_seg[i]) { - btree_remove32(head, super->s_journal_seg[i]); - logfs_set_segment_unreserved(sb, - super->s_journal_seg[i], - super->s_journal_ec[i]); - super->s_journal_seg[i] = 0; - super->s_journal_ec[i] = 0; - } - /* Get new segments */ - for (i = 0; i < super->s_no_journal_segs; i++) { - segno = get_best_cand(sb, &super->s_reserve_list, &ec); - super->s_journal_seg[i] = segno; - super->s_journal_ec[i] = ec; - logfs_set_segment_reserved(sb, segno); - err = btree_insert32(head, segno, (void *)1, GFP_NOFS); - BUG_ON(err); /* mempool should prevent this */ - err = logfs_erase_segment(sb, segno, 1); - BUG_ON(err); /* FIXME: remount-ro would be nicer */ - } - /* Manually move journal_area */ - freeseg(sb, area->a_segno); - area->a_segno = super->s_journal_seg[0]; - area->a_is_open = 0; - area->a_used_bytes = 0; - /* Write journal */ - logfs_write_anchor(sb); - /* Write superblocks */ - err = logfs_write_sb(sb); - BUG_ON(err); -} - -static const struct logfs_area_ops journal_area_ops = { - .get_free_segment = journal_get_free_segment, - .get_erase_count = journal_get_erase_count, - .erase_segment = journal_erase_segment, -}; - -int logfs_init_journal(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize) - + MAX_JOURNAL_HEADER; - int ret = -ENOMEM; - - mutex_init(&super->s_journal_mutex); - btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool); - - super->s_je = kzalloc(bufsize, GFP_KERNEL); - if (!super->s_je) - return ret; - - super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL); - if (!super->s_compressed_je) - return ret; - - super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER); - if (IS_ERR(super->s_master_inode)) - return PTR_ERR(super->s_master_inode); - - ret = logfs_read_journal(sb); - if (ret) - return -EIO; - - reserve_sb_and_journal(sb); - logfs_calc_free(sb); - - super->s_journal_area->a_ops = &journal_area_ops; - return 0; -} - -void logfs_cleanup_journal(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - - btree_grim_visitor32(&super->s_reserved_segments, 0, NULL); - - kfree(super->s_compressed_je); - kfree(super->s_je); -} diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h deleted file mode 100644 index 27d040e35faa..000000000000 --- a/fs/logfs/logfs.h +++ /dev/null @@ -1,735 +0,0 @@ -/* - * fs/logfs/logfs.h - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - * - * Private header for logfs. 
- */ -#ifndef FS_LOGFS_LOGFS_H -#define FS_LOGFS_LOGFS_H - -#undef __CHECK_ENDIAN__ -#define __CHECK_ENDIAN__ - -#include <linux/btree.h> -#include <linux/crc32.h> -#include <linux/fs.h> -#include <linux/kernel.h> -#include <linux/mempool.h> -#include <linux/pagemap.h> -#include <linux/mtd/mtd.h> -#include "logfs_abi.h" - -#define LOGFS_DEBUG_SUPER (0x0001) -#define LOGFS_DEBUG_SEGMENT (0x0002) -#define LOGFS_DEBUG_JOURNAL (0x0004) -#define LOGFS_DEBUG_DIR (0x0008) -#define LOGFS_DEBUG_FILE (0x0010) -#define LOGFS_DEBUG_INODE (0x0020) -#define LOGFS_DEBUG_READWRITE (0x0040) -#define LOGFS_DEBUG_GC (0x0080) -#define LOGFS_DEBUG_GC_NOISY (0x0100) -#define LOGFS_DEBUG_ALIASES (0x0200) -#define LOGFS_DEBUG_BLOCKMOVE (0x0400) -#define LOGFS_DEBUG_ALL (0xffffffff) - -#define LOGFS_DEBUG (0x01) -/* - * To enable specific log messages, simply define LOGFS_DEBUG to match any - * or all of the above. - */ -#ifndef LOGFS_DEBUG -#define LOGFS_DEBUG (0) -#endif - -#define log_cond(cond, fmt, arg...) do { \ - if (cond) \ - printk(KERN_DEBUG fmt, ##arg); \ -} while (0) - -#define log_super(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg) -#define log_segment(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg) -#define log_journal(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg) -#define log_dir(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg) -#define log_file(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg) -#define log_inode(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg) -#define log_readwrite(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg) -#define log_gc(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg) -#define log_gc_noisy(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg) -#define log_aliases(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg) -#define log_blockmove(fmt, arg...) \ - log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg) - -#define PG_pre_locked PG_owner_priv_1 -#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags) -#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags) -#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags) - -/* FIXME: This should really be somewhere in the 64bit area. 
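The log_cond() machinery above costs nothing at runtime when the corresponding LOGFS_DEBUG bit is clear, because the condition is a compile-time constant. A minimal userspace sketch of the same pattern, with invented class names and printf() in place of printk():

#include <stdio.h>

#define DEBUG_JOURNAL 0x0004
#define DEBUG_GC      0x0080
#define DEBUG_MASK    (DEBUG_JOURNAL)	/* enable journal messages only */

#define log_cond(cond, ...) do { \
	if (cond) \
		printf(__VA_ARGS__); \
} while (0)

#define log_journal(...) log_cond(DEBUG_MASK & DEBUG_JOURNAL, __VA_ARGS__)
#define log_gc(...)      log_cond(DEBUG_MASK & DEBUG_GC, __VA_ARGS__)

int main(void)
{
	log_journal("journal: printed, its bit is set in DEBUG_MASK\n");
	log_gc("gc: never printed, the branch is constant false\n");
	return 0;
}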
*/
-#define LOGFS_LINK_MAX (1<<30)
-
-/* Read-only filesystem */
-#define LOGFS_SB_FLAG_RO 0x0001
-#define LOGFS_SB_FLAG_DIRTY 0x0002
-#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004
-#define LOGFS_SB_FLAG_SHUTDOWN 0x0008
-
-/* Write Control Flags */
-#define WF_LOCK 0x01 /* take write lock */
-#define WF_WRITE 0x02 /* write block */
-#define WF_DELETE 0x04 /* delete old block */
-
-typedef u8 __bitwise level_t;
-typedef u8 __bitwise gc_level_t;
-
-#define LEVEL(level) ((__force level_t)(level))
-#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level))
-
-#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \
- (__force level_t)((__force u8)(level) - 1) )
-
-/**
- * struct logfs_area - area management information
- *
- * @a_sb: the superblock this area belongs to
- * @a_is_open: 1 if the area is currently open, else 0
- * @a_segno: segment number of area
- * @a_written_bytes: number of bytes already written back
- * @a_used_bytes: number of used bytes
- * @a_ops: area operations (either journal or ostore)
- * @a_erase_count: erase count
- * @a_level: GC level
- */
-struct logfs_area { /* a segment open for writing */
- struct super_block *a_sb;
- int a_is_open;
- u32 a_segno;
- u32 a_written_bytes;
- u32 a_used_bytes;
- const struct logfs_area_ops *a_ops;
- u32 a_erase_count;
- gc_level_t a_level;
-};
-
-/**
- * struct logfs_area_ops - area operations
- *
- * @get_free_segment: fill area->ofs with the offset of a free segment
- * @get_erase_count: fill area->erase_count (needs area->ofs)
- * @erase_segment: erase and setup segment
- */
-struct logfs_area_ops {
- void (*get_free_segment)(struct logfs_area *area);
- void (*get_erase_count)(struct logfs_area *area);
- int (*erase_segment)(struct logfs_area *area);
-};
-
-struct logfs_super; /* forward */
-/**
- * struct logfs_device_ops - device access operations
- *
- * @find_first_sb: find and read the first superblock
- * @find_last_sb: find and read the last superblock
- * @write_sb: write back a superblock page
- * @readpage: read one page (mm page)
- * @writeseg: write one segment. may be a partial segment
- * @erase: erase part of the device
- * @can_write_buf: decide whether wbuf can be written to ofs
- * @sync: wait for outstanding writes to reach the medium
- * @put_device: drop the reference to the underlying device
- */
-struct logfs_device_ops {
- struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
- struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
- int (*write_sb)(struct super_block *sb, struct page *page);
- int (*readpage)(void *_sb, struct page *page);
- void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
- int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
- int ensure_write);
- int (*can_write_buf)(struct super_block *sb, u64 ofs);
- void (*sync)(struct super_block *sb);
- void (*put_device)(struct logfs_super *s);
-};
-
-/**
- * struct candidate_list - list of similar candidates
- */
-struct candidate_list {
- struct rb_root rb_tree;
- int count;
- int maxcount;
- int sort_by_ec;
-};
-
-/**
- * struct gc_candidate - "candidate" segment to be garbage collected next
- *
- * @list: list (either free or low)
- * @segno: segment number
- * @valid: number of valid bytes
- * @erase_count: erase count of segment
- * @dist: distance from tree root
- *
- * Candidates can be on two lists. The free list contains electees rather
- * than candidates - segments that no longer contain any valid data. The
- * low list contains candidates to be picked for GC. It should be kept
- * short. It is not required to always pick a perfect candidate. In the
- * worst case GC will have to move more data than absolutely necessary.
- */
-struct gc_candidate {
- struct rb_node rb_node;
- struct candidate_list *list;
- u32 segno;
- u32 valid;
- u32 erase_count;
- u8 dist;
-};
-
-/**
- * struct logfs_journal_entry - temporary structure used during journal scan
- *
- * @used: non-zero if this entry is in use
- * @version: normalized version
- * @len: length of the compressed entry
- * @datalen: length of the uncompressed data
- * @offset: offset of the entry on the medium
- */
-struct logfs_journal_entry {
- int used;
- s16 version;
- u16 len;
- u16 datalen;
- u64 offset;
-};
-
-enum transaction_state {
- CREATE_1 = 1,
- CREATE_2,
- UNLINK_1,
- UNLINK_2,
- CROSS_RENAME_1,
- CROSS_RENAME_2,
- TARGET_RENAME_1,
- TARGET_RENAME_2,
- TARGET_RENAME_3
-};
-
-/**
- * struct logfs_transaction - essential fields to support atomic dirops
- *
- * @ino: target inode
- * @dir: inode of directory containing dentry
- * @pos: pos of dentry in directory
- */
-struct logfs_transaction {
- enum transaction_state state;
- u64 ino;
- u64 dir;
- u64 pos;
-};
-
-/**
- * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
- * @old_ofs: offset of old block on medium
- * @new_ofs: offset of new block on medium
- * @ino: inode number
- * @bix: block index
- * @old_len: size of old block, including header
- * @new_len: size of new block, including header
- * @gc_level: GC level of the block
- */
-struct logfs_shadow {
- u64 old_ofs;
- u64 new_ofs;
- u64 ino;
- u64 bix;
- int old_len;
- int new_len;
- gc_level_t gc_level;
-};
-
-/**
- * struct shadow_tree
- * @new: shadows where old_ofs==0, indexed by new_ofs
- * @old: shadows where old_ofs!=0, indexed by old_ofs
- * @segment_map: bitfield of segments containing shadows
- * @no_shadowed_segments: number of segments containing shadows
- */
-struct shadow_tree {
- struct btree_head64 new;
- struct btree_head64 old;
- struct btree_head32 segment_map;
- int no_shadowed_segments;
-};
-
-struct object_alias_item {
- struct list_head list;
- __be64 val;
- int child_no;
-};
-
-/**
- * struct logfs_block - contains any block state
- * @type: indirect block or inode
- * @full: number of fully populated children
- * @partial: number of partially populated children
- *
- * Most blocks are directly represented by page cache pages. But when a block
- * becomes dirty, is part of a transaction, contains aliases or is otherwise
- * special, a struct logfs_block is allocated to track the additional state.
- * Inodes are very similar to indirect blocks, so they can also get one of
- * these structures added when appropriate.
- */ -#define BLOCK_INDIRECT 1 /* Indirect block */ -#define BLOCK_INODE 2 /* Inode */ -struct logfs_block_ops; -struct logfs_block { - struct list_head alias_list; - struct list_head item_list; - struct super_block *sb; - u64 ino; - u64 bix; - level_t level; - struct page *page; - struct inode *inode; - struct logfs_transaction *ta; - unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG]; - const struct logfs_block_ops *ops; - int full; - int partial; - int reserved_bytes; -}; - -typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix, - level_t level, int child_no, __be64 val); -struct logfs_block_ops { - void (*write_block)(struct logfs_block *block); - void (*free_block)(struct super_block *sb, struct logfs_block*block); - int (*write_alias)(struct super_block *sb, - struct logfs_block *block, - write_alias_t *write_one_alias); -}; - -#define MAX_JOURNAL_ENTRIES 256 - -struct logfs_super { - struct mtd_info *s_mtd; /* underlying device */ - struct block_device *s_bdev; /* underlying device */ - const struct logfs_device_ops *s_devops;/* device access */ - struct inode *s_master_inode; /* inode file */ - struct inode *s_segfile_inode; /* segment file */ - struct inode *s_mapping_inode; /* device mapping */ - atomic_t s_pending_writes; /* outstanting bios */ - long s_flags; - mempool_t *s_btree_pool; /* for btree nodes */ - mempool_t *s_alias_pool; /* aliases in segment.c */ - u64 s_feature_incompat; - u64 s_feature_ro_compat; - u64 s_feature_compat; - u64 s_feature_flags; - u64 s_sb_ofs[2]; - struct page *s_erase_page; /* for dev_bdev.c */ - /* alias.c fields */ - struct btree_head32 s_segment_alias; /* remapped segments */ - int s_no_object_aliases; - struct list_head s_object_alias; /* remapped objects */ - struct btree_head128 s_object_alias_tree; /* remapped objects */ - struct mutex s_object_alias_mutex; - /* dir.c fields */ - struct mutex s_dirop_mutex; /* for creat/unlink/rename */ - u64 s_victim_ino; /* used for atomic dir-ops */ - u64 s_rename_dir; /* source directory ino */ - u64 s_rename_pos; /* position of source dd */ - /* gc.c fields */ - long s_segsize; /* size of a segment */ - int s_segshift; /* log2 of segment size */ - long s_segmask; /* 1 << s_segshift - 1 */ - long s_no_segs; /* segments on device */ - long s_no_journal_segs; /* segments used for journal */ - long s_no_blocks; /* blocks per segment */ - long s_writesize; /* minimum write size */ - int s_writeshift; /* log2 of write size */ - u64 s_size; /* filesystem size */ - struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */ - u64 s_gec; /* global erase count */ - u64 s_wl_gec_ostore; /* time of last wl event */ - u64 s_wl_gec_journal; /* time of last wl event */ - u64 s_sweeper; /* current sweeper pos */ - u8 s_ifile_levels; /* max level of ifile */ - u8 s_iblock_levels; /* max level of regular files */ - u8 s_data_levels; /* # of segments to leaf block*/ - u8 s_total_levels; /* sum of above three */ - struct btree_head32 s_cand_tree; /* all candidates */ - struct candidate_list s_free_list; /* 100% free segments */ - struct candidate_list s_reserve_list; /* Bad segment reserve */ - struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */ - struct candidate_list s_ec_list; /* wear level candidates */ - struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. 
*/ - /* inode.c fields */ - u64 s_last_ino; /* highest ino used */ - long s_inos_till_wrap; - u32 s_generation; /* i_generation for new files */ - struct list_head s_freeing_list; /* inodes being freed */ - /* journal.c fields */ - struct mutex s_journal_mutex; - void *s_je; /* journal entry to compress */ - void *s_compressed_je; /* block to write to journal */ - u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */ - u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */ - u64 s_last_version; - struct logfs_area *s_journal_area; /* open journal segment */ - __be64 s_je_array[MAX_JOURNAL_ENTRIES]; - int s_no_je; - - int s_sum_index; /* for the 12 summaries */ - struct shadow_tree s_shadow_tree; - int s_je_fill; /* index of current je */ - /* readwrite.c fields */ - struct mutex s_write_mutex; - int s_lock_count; - mempool_t *s_block_pool; /* struct logfs_block pool */ - mempool_t *s_shadow_pool; /* struct logfs_shadow pool */ - struct list_head s_writeback_list; /* writeback pages */ - /* - * Space accounting: - * - s_used_bytes specifies space used to store valid data objects. - * - s_dirty_used_bytes is space used to store non-committed data - * objects. Those objects have already been written themselves, - * but they don't become valid until all indirect blocks up to the - * journal have been written as well. - * - s_dirty_free_bytes is space used to store the old copy of a - * replaced object, as long as the replacement is non-committed. - * In other words, it is the amount of space freed when all dirty - * blocks are written back. - * - s_free_bytes is the amount of free space available for any - * purpose. - * - s_root_reserve is the amount of free space available only to - * the root user. Non-privileged users can no longer write once - * this watermark has been reached. - * - s_speed_reserve is space which remains unused to speed up - * garbage collection performance. - * - s_dirty_pages is the space reserved for currently dirty pages. - * It is a pessimistic estimate, so some/most will get freed on - * page writeback. - * - * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size - */ - u64 s_free_bytes; - u64 s_used_bytes; - u64 s_dirty_free_bytes; - u64 s_dirty_used_bytes; - u64 s_root_reserve; - u64 s_speed_reserve; - u64 s_dirty_pages; - /* Bad block handling: - * - s_bad_seg_reserve is a number of segments usually kept - * free. When encountering bad blocks, the affected segment's data - * is _temporarily_ moved to a reserved segment. - * - s_bad_segments is the number of known bad segments. 
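A minimal userspace sketch of the accounting rules spelled out above, not logfs code: it tracks only the identity used + free + speed_reserve = total, extended with the two dirty counters for bytes in flight between write and commit. Names and numbers are invented; root_reserve, dirty_pages and the bad-segment counters are left out.

#include <assert.h>
#include <stdint.h>

struct space {
	uint64_t total;		/* usable bytes on the medium */
	uint64_t used;		/* valid, committed objects */
	uint64_t dirty_used;	/* written, not yet committed */
	uint64_t dirty_free;	/* old copies, freed at commit */
	uint64_t free;
	uint64_t speed_reserve;	/* kept free for GC performance */
};

/* used + free + speed_reserve = total, with in-flight dirty
 * bytes counted on the used side until they commit */
static void check(const struct space *s)
{
	assert(s->used + s->dirty_used + s->free + s->speed_reserve
	       == s->total);
}

/* Replace an old_len-byte object with a new_len-byte one */
static void rewrite(struct space *s, uint64_t old_len, uint64_t new_len)
{
	s->free -= new_len;
	s->dirty_used += new_len;
	s->dirty_free += old_len;	/* old copy dies at commit */
	check(s);
}

static void commit(struct space *s)
{
	s->used += s->dirty_used;	/* new objects become valid */
	s->used -= s->dirty_free;	/* replaced objects are gone */
	s->free += s->dirty_free;
	s->dirty_used = s->dirty_free = 0;
	check(s);
}

int main(void)
{
	struct space s = { .total = 1000, .used = 100, .free = 800,
			   .speed_reserve = 100 };

	check(&s);
	rewrite(&s, 10, 10);
	commit(&s);
	return 0;
}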
- */ - u32 s_bad_seg_reserve; - u32 s_bad_segments; -}; - -/** - * struct logfs_inode - in-memory inode - * - * @vfs_inode: struct inode - * @li_data: data pointers - * @li_used_bytes: number of used bytes - * @li_freeing_list: used to track inodes currently being freed - * @li_flags: inode flags - * @li_refcount: number of internal (GC-induced) references - */ -struct logfs_inode { - struct inode vfs_inode; - u64 li_data[LOGFS_EMBEDDED_FIELDS]; - u64 li_used_bytes; - struct list_head li_freeing_list; - struct logfs_block *li_block; - u32 li_flags; - u8 li_height; - int li_refcount; -}; - -#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++) -#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++) -#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--) - -/* compr.c */ -int logfs_compress(void *in, void *out, size_t inlen, size_t outlen); -int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen); -int __init logfs_compr_init(void); -void logfs_compr_exit(void); - -/* dev_bdev.c */ -#ifdef CONFIG_BLOCK -int logfs_get_sb_bdev(struct logfs_super *s, - struct file_system_type *type, - const char *devname); -#else -static inline int logfs_get_sb_bdev(struct logfs_super *s, - struct file_system_type *type, - const char *devname) -{ - return -ENODEV; -} -#endif - -/* dev_mtd.c */ -#if IS_ENABLED(CONFIG_MTD) -int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr); -#else -static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr) -{ - return -ENODEV; -} -#endif - -/* dir.c */ -extern const struct inode_operations logfs_dir_iops; -extern const struct file_operations logfs_dir_fops; -int logfs_replay_journal(struct super_block *sb); - -/* file.c */ -extern const struct inode_operations logfs_reg_iops; -extern const struct file_operations logfs_reg_fops; -extern const struct address_space_operations logfs_reg_aops; -int logfs_readpage(struct file *file, struct page *page); -long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); -int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync); - -/* gc.c */ -u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec); -void logfs_gc_pass(struct super_block *sb); -int logfs_check_areas(struct super_block *sb); -int logfs_init_gc(struct super_block *sb); -void logfs_cleanup_gc(struct super_block *sb); - -/* inode.c */ -extern const struct super_operations logfs_super_operations; -struct inode *logfs_iget(struct super_block *sb, ino_t ino); -struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie); -void logfs_safe_iput(struct inode *inode, int cookie); -struct inode *logfs_new_inode(struct inode *dir, umode_t mode); -struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino); -struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino); -int logfs_init_inode_cache(void); -void logfs_destroy_inode_cache(void); -void logfs_set_blocks(struct inode *inode, u64 no); -/* these logically belong into inode.c but actually reside in readwrite.c */ -int logfs_read_inode(struct inode *inode); -int __logfs_write_inode(struct inode *inode, struct page *, long flags); -void logfs_evict_inode(struct inode *inode); - -/* journal.c */ -void logfs_write_anchor(struct super_block *sb); -int logfs_init_journal(struct super_block *sb); -void logfs_cleanup_journal(struct super_block *sb); -int write_alias_journal(struct super_block *sb, u64 ino, u64 bix, - level_t level, int child_no, __be64 val); -void 
do_logfs_journal_wl_pass(struct super_block *sb); - -/* readwrite.c */ -pgoff_t logfs_pack_index(u64 bix, level_t level); -void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level); -int logfs_inode_write(struct inode *inode, const void *buf, size_t count, - loff_t bix, long flags, struct shadow_tree *shadow_tree); -int logfs_readpage_nolock(struct page *page); -int logfs_write_buf(struct inode *inode, struct page *page, long flags); -int logfs_delete(struct inode *inode, pgoff_t index, - struct shadow_tree *shadow_tree); -int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, - gc_level_t gc_level, long flags); -int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, - gc_level_t gc_level); -int logfs_truncate(struct inode *inode, u64 size); -u64 logfs_seek_hole(struct inode *inode, u64 bix); -u64 logfs_seek_data(struct inode *inode, u64 bix); -int logfs_open_segfile(struct super_block *sb); -int logfs_init_rw(struct super_block *sb); -void logfs_cleanup_rw(struct super_block *sb); -void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta); -void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta); -void logfs_write_block(struct logfs_block *block, long flags); -int logfs_write_obj_aliases_pagecache(struct super_block *sb); -void logfs_get_segment_entry(struct super_block *sb, u32 segno, - struct logfs_segment_entry *se); -void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment); -void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, - gc_level_t gc_level); -void logfs_set_segment_reserved(struct super_block *sb, u32 segno); -void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec); -struct logfs_block *__alloc_block(struct super_block *sb, - u64 ino, u64 bix, level_t level); -void __free_block(struct super_block *sb, struct logfs_block *block); -void btree_write_block(struct logfs_block *block); -void initialize_block_counters(struct page *page, struct logfs_block *block, - __be64 *array, int page_is_empty); -int logfs_exist_block(struct inode *inode, u64 bix); -int get_page_reserve(struct inode *inode, struct page *page); -void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock); -void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock); -extern const struct logfs_block_ops indirect_block_ops; - -/* segment.c */ -int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase); -int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf); -int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix, - level_t level); -int logfs_segment_write(struct inode *inode, struct page *page, - struct logfs_shadow *shadow); -int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow); -int logfs_load_object_aliases(struct super_block *sb, - struct logfs_obj_alias *oa, int count); -void move_page_to_btree(struct page *page); -int logfs_init_mapping(struct super_block *sb); -void logfs_sync_area(struct logfs_area *area); -void logfs_sync_segments(struct super_block *sb); -void freeseg(struct super_block *sb, u32 segno); -void free_areas(struct super_block *sb); - -/* area handling */ -int logfs_init_areas(struct super_block *sb); -void logfs_cleanup_areas(struct super_block *sb); -int logfs_open_area(struct logfs_area *area, size_t bytes); -int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, - int use_filler); - -static inline int 
logfs_buf_write(struct logfs_area *area, u64 ofs, - void *buf, size_t len) -{ - return __logfs_buf_write(area, ofs, buf, len, 0); -} - -static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs, - void *buf, size_t len) -{ - return __logfs_buf_write(area, ofs, buf, len, 1); -} - -/* super.c */ -struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index); -void emergency_read_end(struct page *page); -void logfs_crash_dump(struct super_block *sb); -int logfs_statfs(struct dentry *dentry, struct kstatfs *stats); -int logfs_check_ds(struct logfs_disk_super *ds); -int logfs_write_sb(struct super_block *sb); - -static inline struct logfs_super *logfs_super(struct super_block *sb) -{ - return sb->s_fs_info; -} - -static inline struct logfs_inode *logfs_inode(struct inode *inode) -{ - return container_of(inode, struct logfs_inode, vfs_inode); -} - -static inline void logfs_set_ro(struct super_block *sb) -{ - logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO; -} - -#define LOGFS_BUG(sb) do { \ - struct super_block *__sb = sb; \ - logfs_crash_dump(__sb); \ - logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \ - BUG(); \ -} while (0) - -#define LOGFS_BUG_ON(condition, sb) \ - do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0) - -static inline __be32 logfs_crc32(void *data, size_t len, size_t skip) -{ - return cpu_to_be32(crc32(~0, data+skip, len-skip)); -} - -static inline u8 logfs_type(struct inode *inode) -{ - return (inode->i_mode >> 12) & 15; -} - -static inline pgoff_t logfs_index(struct super_block *sb, u64 pos) -{ - return pos >> sb->s_blocksize_bits; -} - -static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs) -{ - return ((u64)segno << logfs_super(sb)->s_segshift) + ofs; -} - -static inline u32 seg_no(struct super_block *sb, u64 ofs) -{ - return ofs >> logfs_super(sb)->s_segshift; -} - -static inline u32 seg_ofs(struct super_block *sb, u64 ofs) -{ - return ofs & logfs_super(sb)->s_segmask; -} - -static inline u64 seg_align(struct super_block *sb, u64 ofs) -{ - return ofs & ~logfs_super(sb)->s_segmask; -} - -static inline struct logfs_block *logfs_block(struct page *page) -{ - return (void *)page->private; -} - -static inline level_t shrink_level(gc_level_t __level) -{ - u8 level = (__force u8)__level; - - if (level >= LOGFS_MAX_LEVELS) - level -= LOGFS_MAX_LEVELS; - return (__force level_t)level; -} - -static inline gc_level_t expand_level(u64 ino, level_t __level) -{ - u8 level = (__force u8)__level; - - if (ino == LOGFS_INO_MASTER) { - /* ifile has separate areas */ - level += LOGFS_MAX_LEVELS; - } - return (__force gc_level_t)level; -} - -static inline int logfs_block_shift(struct super_block *sb, level_t level) -{ - level = shrink_level((__force gc_level_t)level); - return (__force int)level * (sb->s_blocksize_bits - 3); -} - -static inline u64 logfs_block_mask(struct super_block *sb, level_t level) -{ - return ~0ull << logfs_block_shift(sb, level); -} - -static inline struct logfs_area *get_area(struct super_block *sb, - gc_level_t gc_level) -{ - return logfs_super(sb)->s_area[(__force u8)gc_level]; -} - -static inline void logfs_mempool_destroy(mempool_t *pool) -{ - if (pool) - mempool_destroy(pool); -} - -#endif diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h deleted file mode 100644 index ae960519c54a..000000000000 --- a/fs/logfs/logfs_abi.h +++ /dev/null @@ -1,629 +0,0 @@ -/* - * fs/logfs/logfs_abi.h - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel 
<joern@logfs.org> - * - * Public header for logfs. - */ -#ifndef FS_LOGFS_LOGFS_ABI_H -#define FS_LOGFS_LOGFS_ABI_H - -/* For out-of-kernel compiles */ -#ifndef BUILD_BUG_ON -#define BUILD_BUG_ON(condition) /**/ -#endif - -#define SIZE_CHECK(type, size) \ -static inline void check_##type(void) \ -{ \ - BUILD_BUG_ON(sizeof(struct type) != (size)); \ -} - -/* - * Throughout the logfs code, we're constantly dealing with blocks at - * various positions or offsets. To remove confusion, we stricly - * distinguish between a "position" - the logical position within a - * file and an "offset" - the physical location within the device. - * - * Any usage of the term offset for a logical location or position for - * a physical one is a bug and should get fixed. - */ - -/* - * Block are allocated in one of several segments depending on their - * level. The following levels are used: - * 0 - regular data block - * 1 - i1 indirect blocks - * 2 - i2 indirect blocks - * 3 - i3 indirect blocks - * 4 - i4 indirect blocks - * 5 - i5 indirect blocks - * 6 - ifile data blocks - * 7 - ifile i1 indirect blocks - * 8 - ifile i2 indirect blocks - * 9 - ifile i3 indirect blocks - * 10 - ifile i4 indirect blocks - * 11 - ifile i5 indirect blocks - * Potential levels to be used in the future: - * 12 - gc recycled blocks, long-lived data - * 13 - replacement blocks, short-lived data - * - * Levels 1-11 are necessary for robust gc operations and help separate - * short-lived metadata from longer-lived file data. In the future, - * file data should get separated into several segments based on simple - * heuristics. Old data recycled during gc operation is expected to be - * long-lived. New data is of uncertain life expectancy. New data - * used to replace older blocks in existing files is expected to be - * short-lived. - */ - - -/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */ -#define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull -#define LOGFS_MAGIC_U32 0xc97e8168u - -/* - * Various blocksize related macros. Blocksize is currently fixed at 4KiB. - * Sooner or later that should become configurable and the macros replaced - * by something superblock-dependent. Pointers in indirect blocks are and - * will remain 64bit. - * - * LOGFS_BLOCKSIZE - self-explaining - * LOGFS_BLOCK_FACTOR - number of pointers per indirect block - * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts - */ -#define LOGFS_BLOCKSIZE (4096ull) -#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64)) -#define LOGFS_BLOCK_BITS (9) - -/* - * Number of blocks at various levels of indirection. There are 16 direct - * block pointers plus a single indirect pointer. - */ -#define I0_BLOCKS (16) -#define I1_BLOCKS LOGFS_BLOCK_FACTOR -#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS) -#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS) -#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS) -#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS) - -#define INDIRECT_INDEX I0_BLOCKS -#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1) - -/* - * Sizes at which files require another level of indirection. Files smaller - * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself, - * similar like ext2 fast symlinks. - * - * Data at a position smaller than LOGFS_I0_SIZE is accessed through the - * direct pointers, else through the 1x indirect pointer and so forth. 
- */ -#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64)) -#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE) -#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE) -#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE) -#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE) -#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE) -#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE) - -/* - * Each indirect block pointer must have this flag set, if all block pointers - * behind it are set, i.e. there is no hole hidden in the shadow of this - * indirect block pointer. - */ -#define LOGFS_FULLY_POPULATED (1ULL << 63) -#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED) - -/* - * LogFS needs to separate data into levels. Each level is defined as the - * maximal possible distance from the master inode (inode of the inode file). - * Data blocks reside on level 0, 1x indirect block on level 1, etc. - * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11. - * This effort is necessary to guarantee garbage collection to always make - * progress. - * - * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks, - * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is - * the maximal number of levels for one file. - * LOGFS_NO_AREAS is twice that, as the inode file and regular files are - * effectively stacked on top of each other. - */ -#define LOGFS_MAX_INDIRECT (5) -#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1) -#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS) - -/* Maximum size of filenames */ -#define LOGFS_MAX_NAMELEN (255) - -/* Number of segments in the primary journal. */ -#define LOGFS_JOURNAL_SEGS (16) - -/* Maximum number of free/erased/etc. segments in journal entries */ -#define MAX_CACHED_SEGS (64) - - -/* - * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store, - * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including - * its header, - * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for - * its segment header and the padded space at the end when no further objects - * fit. 
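A small sketch of the level-to-area mapping described above, assuming only the constants visible in this header; expand_level() here mirrors the helper of the same name defined further down in logfs.h:

#include <stdio.h>
#include <stdint.h>

#define LOGFS_MAX_INDIRECT	5
#define LOGFS_MAX_LEVELS	(LOGFS_MAX_INDIRECT + 1)
#define LOGFS_NO_AREAS		(2 * LOGFS_MAX_LEVELS)
#define LOGFS_INO_MASTER	0x01	/* inode of the inode file */

/* GC level: the file-internal level, shifted up for ifile blocks,
 * so ifile and regular-file data never share an area */
static unsigned expand_level(uint64_t ino, unsigned level)
{
	return ino == LOGFS_INO_MASTER ? level + LOGFS_MAX_LEVELS : level;
}

int main(void)
{
	printf("data block of a regular file  -> area %u\n",
	       expand_level(42, 0));			/* 0 */
	printf("i5 indirect of a regular file -> area %u\n",
	       expand_level(42, 5));			/* 5 */
	printf("ifile data block              -> area %u\n",
	       expand_level(LOGFS_INO_MASTER, 0));	/* 6 */
	printf("areas total: %d\n", LOGFS_NO_AREAS);	/* 12 */
	return 0;
}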
- */
-#define LOGFS_OBJECT_HEADERSIZE (0x1c)
-#define LOGFS_SEGMENT_HEADERSIZE (0x18)
-#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
-#define LOGFS_SEGMENT_RESERVE \
- (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
-
-/*
- * Segment types:
- * SEG_SUPER - segment holds a superblock
- * SEG_JOURNAL - segment belongs to the journal
- * SEG_OSTORE - segment belongs to the object store
- */
-enum {
- SEG_SUPER = 0x01,
- SEG_JOURNAL = 0x02,
- SEG_OSTORE = 0x03,
-};
-
-/**
- * struct logfs_segment_header - per-segment header in the ostore
- *
- * @crc: crc32 of header (there is no data)
- * @pad: unused, must be 0
- * @type: segment type, see above
- * @level: GC level for all objects in this segment
- * @segno: segment number
- * @ec: erase count for this segment
- * @gec: global erase count at time of writing
- */
-struct logfs_segment_header {
- __be32 crc;
- __be16 pad;
- __u8 type;
- __u8 level;
- __be32 segno;
- __be32 ec;
- __be64 gec;
-};
-
-SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
-
-#define LOGFS_FEATURES_INCOMPAT (0ull)
-#define LOGFS_FEATURES_RO_COMPAT (0ull)
-#define LOGFS_FEATURES_COMPAT (0ull)
-
-/**
- * struct logfs_disk_super - on-medium superblock
- *
- * @ds_magic: magic number, must equal LOGFS_MAGIC
- * @ds_crc: crc32 of structure starting with the next field
- * @ds_ifile_levels: maximum number of levels for ifile
- * @ds_iblock_levels: maximum number of levels for regular files
- * @ds_data_levels: number of separate levels for data
- * @pad0: reserved, must be 0
- * @ds_feature_incompat: incompatible filesystem features
- * @ds_feature_ro_compat: read-only compatible filesystem features
- * @ds_feature_compat: compatible filesystem features
- * @ds_feature_flags: feature flags
- * @ds_segment_shift: log2 of segment size
- * @ds_block_shift: log2 of block size
- * @ds_write_shift: log2 of write size
- * @ds_journal_seg: segments used by primary journal
- * @ds_root_reserve: bytes reserved for the superuser
- * @ds_speed_reserve: bytes reserved to speed up GC
- * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks
- * @pad3: reserved, must be 0
- *
- * Contains only read-only fields. Read-write fields like the amount of used
- * space are tracked in the dynamic superblock, which is stored in the journal.
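The SIZE_CHECK() calls sprinkled through this header pin every on-medium structure to its ABI size at compile time. In userspace C11 the same guard can be written with _Static_assert; the sketch below uses plain fixed-width stand-ins for the kernel's big-endian types, which have the same size:

#include <stdint.h>

/* Userspace stand-ins; in the kernel these are big-endian
 * types of identical width, so sizeof() is unaffected. */
typedef uint8_t  __u8;
typedef uint16_t __be16;
typedef uint32_t __be32;
typedef uint64_t __be64;

struct logfs_segment_header {
	__be32 crc;
	__be16 pad;
	__u8 type;
	__u8 level;
	__be32 segno;
	__be32 ec;
	__be64 gec;
};

/* Same guarantee as SIZE_CHECK(logfs_segment_header, 0x18) */
_Static_assert(sizeof(struct logfs_segment_header) == 0x18,
	       "on-medium ABI size must not change");

int main(void)
{
	return 0;
}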
- */ -struct logfs_disk_super { - struct logfs_segment_header ds_sh; - __be64 ds_magic; - - __be32 ds_crc; - __u8 ds_ifile_levels; - __u8 ds_iblock_levels; - __u8 ds_data_levels; - __u8 ds_segment_shift; - __u8 ds_block_shift; - __u8 ds_write_shift; - __u8 pad0[6]; - - __be64 ds_filesystem_size; - __be32 ds_segment_size; - __be32 ds_bad_seg_reserve; - - __be64 ds_feature_incompat; - __be64 ds_feature_ro_compat; - - __be64 ds_feature_compat; - __be64 ds_feature_flags; - - __be64 ds_root_reserve; - __be64 ds_speed_reserve; - - __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS]; - - __be64 ds_super_ofs[2]; - __be64 pad3[8]; -}; - -SIZE_CHECK(logfs_disk_super, 256); - -/* - * Object types: - * OBJ_BLOCK - Data or indirect block - * OBJ_INODE - Inode - * OBJ_DENTRY - Dentry - */ -enum { - OBJ_BLOCK = 0x04, - OBJ_INODE = 0x05, - OBJ_DENTRY = 0x06, -}; - -/** - * struct logfs_object_header - per-object header in the ostore - * - * @crc: crc32 of header, excluding data_crc - * @len: length of data - * @type: object type, see above - * @compr: compression type - * @ino: inode number - * @bix: block index - * @data_crc: crc32 of payload - */ -struct logfs_object_header { - __be32 crc; - __be16 len; - __u8 type; - __u8 compr; - __be64 ino; - __be64 bix; - __be32 data_crc; -} __attribute__((packed)); - -SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE); - -/* - * Reserved inode numbers: - * LOGFS_INO_MASTER - master inode (for inode file) - * LOGFS_INO_ROOT - root directory - * LOGFS_INO_SEGFILE - per-segment used bytes and erase count - */ -enum { - LOGFS_INO_MAPPING = 0x00, - LOGFS_INO_MASTER = 0x01, - LOGFS_INO_ROOT = 0x02, - LOGFS_INO_SEGFILE = 0x03, - LOGFS_RESERVED_INOS = 0x10, -}; - -/* - * Inode flags. High bits should never be written to the medium. They are - * reserved for in-memory usage. - * Low bits should either remain in sync with the corresponding FS_*_FL or - * reuse slots that obviously don't make sense for logfs. 
- * - * LOGFS_IF_DIRTY Inode must be written back - * LOGFS_IF_ZOMBIE Inode has been deleted - * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode - */ -#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */ -#define LOGFS_IF_DIRTY 0x20000000 -#define LOGFS_IF_ZOMBIE 0x40000000 -#define LOGFS_IF_STILLBORN 0x80000000 - -/* Flags available to chattr */ -#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED) -#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED) -/* Flags inherited from parent directory on file/directory creation */ -#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED) - -/** - * struct logfs_disk_inode - on-medium inode - * - * @di_mode: file mode - * @di_pad: reserved, must be 0 - * @di_flags: inode flags, see above - * @di_uid: user id - * @di_gid: group id - * @di_ctime: change time - * @di_mtime: modify time - * @di_refcount: reference count (aka nlink or link count) - * @di_generation: inode generation, for nfs - * @di_used_bytes: number of bytes used - * @di_size: file size - * @di_data: data pointers - */ -struct logfs_disk_inode { - __be16 di_mode; - __u8 di_height; - __u8 di_pad; - __be32 di_flags; - __be32 di_uid; - __be32 di_gid; - - __be64 di_ctime; - __be64 di_mtime; - - __be64 di_atime; - __be32 di_refcount; - __be32 di_generation; - - __be64 di_used_bytes; - __be64 di_size; - - __be64 di_data[LOGFS_EMBEDDED_FIELDS]; -}; - -SIZE_CHECK(logfs_disk_inode, 200); - -#define INODE_POINTER_OFS \ - (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64)) -#define INODE_USED_OFS \ - (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64)) -#define INODE_SIZE_OFS \ - (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64)) -#define INODE_HEIGHT_OFS (0) - -/** - * struct logfs_disk_dentry - on-medium dentry structure - * - * @ino: inode number - * @namelen: length of file name - * @type: file type, identical to bits 12..15 of mode - * @name: file name - */ -/* FIXME: add 6 bytes of padding to remove the __packed */ -struct logfs_disk_dentry { - __be64 ino; - __be16 namelen; - __u8 type; - __u8 name[LOGFS_MAX_NAMELEN]; -} __attribute__((packed)); - -SIZE_CHECK(logfs_disk_dentry, 266); - -#define RESERVED 0xffffffff -#define BADSEG 0xffffffff -/** - * struct logfs_segment_entry - segment file entry - * - * @ec_level: erase count and level - * @valid: number of valid bytes - * - * Segment file contains one entry for every segment. ec_level contains the - * erasecount in the upper 28 bits and the level in the lower 4 bits. An - * ec_level of BADSEG (-1) identifies bad segments. valid contains the number - * of valid bytes or RESERVED (-1 again) if the segment is used for either the - * superblock or the journal, or when the segment is bad. - */ -struct logfs_segment_entry { - __be32 ec_level; - __be32 valid; -}; - -SIZE_CHECK(logfs_segment_entry, 8); - -/** - * struct logfs_journal_header - header for journal entries (JEs) - * - * @h_crc: crc32 of journal entry - * @h_len: length of compressed journal entry, - * not including header - * @h_datalen: length of uncompressed data - * @h_type: JE type - * @h_compr: compression type - * @h_pad: reserved - */ -struct logfs_journal_header { - __be32 h_crc; - __be16 h_len; - __be16 h_datalen; - __be16 h_type; - __u8 h_compr; - __u8 h_pad[5]; -}; - -SIZE_CHECK(logfs_journal_header, 16); - -/* - * Life expectency of data. 
- * VIM_DEFAULT - default vim
- * VIM_SEGFILE - for segment file only - very short-living
- * VIM_GC - GC'd data - likely long-living (reserved, not yet defined below)
- */
-enum logfs_vim {
- VIM_DEFAULT = 0,
- VIM_SEGFILE = 1,
-};
-
-/**
- * struct logfs_je_area - wbuf header
- *
- * @segno: segment number of area
- * @used_bytes: number of bytes already used
- * @gc_level: GC level
- * @vim: life expectancy of data
- *
- * "Areas" are segments currently being used for writing. There is at least
- * one area per GC level. Several may be used to separate long-living from
- * short-living data. If an area with unknown vim is encountered, it can
- * simply be closed.
- * The write buffer immediately follows this header.
- */
-struct logfs_je_area {
- __be32 segno;
- __be32 used_bytes;
- __u8 gc_level;
- __u8 vim;
-} __attribute__((packed));
-
-SIZE_CHECK(logfs_je_area, 10);
-
-#define MAX_JOURNAL_HEADER \
- (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
-
-/**
- * struct logfs_je_dynsb - dynamic superblock
- *
- * @ds_gec: global erase count
- * @ds_sweeper: current position of GC "sweeper"
- * @ds_rename_dir: source directory ino (see dir.c documentation)
- * @ds_rename_pos: position of source dd (see dir.c documentation)
- * @ds_victim_ino: victims of incomplete dir operation (see dir.c)
- * @ds_victim_parent: parent inode of victim (see dir.c)
- * @ds_used_bytes: number of used bytes
- */
-struct logfs_je_dynsb {
- __be64 ds_gec;
- __be64 ds_sweeper;
-
- __be64 ds_rename_dir;
- __be64 ds_rename_pos;
-
- __be64 ds_victim_ino;
- __be64 ds_victim_parent; /* XXX */
-
- __be64 ds_used_bytes;
- __be32 ds_generation;
- __be32 pad;
-};
-
-SIZE_CHECK(logfs_je_dynsb, 64);
-
-/**
- * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
- *
- * @da_size: size of inode file
- * @da_last_ino: last created inode
- * @da_used_bytes: number of bytes used
- * @da_height: height of the inode file's block tree
- * @da_data: data pointers
- */
-struct logfs_je_anchor {
- __be64 da_size;
- __be64 da_last_ino;
-
- __be64 da_used_bytes;
- u8 da_height;
- u8 pad[7];
-
- __be64 da_data[LOGFS_EMBEDDED_FIELDS];
-};
-
-SIZE_CHECK(logfs_je_anchor, 168);
-
-/**
- * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
- *
- * @so_segment: segments used for 2nd journal
- *
- * Length of the array is given by h_len field in the header.
- */
-struct logfs_je_spillout {
- __be64 so_segment[0];
-};
-
-SIZE_CHECK(logfs_je_spillout, 0);
-
-/**
- * struct logfs_je_journal_ec - erase counts for all journal segments
- *
- * @ec: erase count
- *
- * Length of the array is given by h_len field in the header.
- */
-struct logfs_je_journal_ec {
- __be32 ec[0];
-};
-
-SIZE_CHECK(logfs_je_journal_ec, 0);
-
-/**
- * struct logfs_je_free_segments - list of free segments with erase counts
- */
-struct logfs_je_free_segments {
- __be32 segno;
- __be32 ec;
-};
-
-SIZE_CHECK(logfs_je_free_segments, 8);
-
-/**
- * struct logfs_seg_alias - list of segment aliases
- */
-struct logfs_seg_alias {
- __be32 old_segno;
- __be32 new_segno;
-};
-
-SIZE_CHECK(logfs_seg_alias, 8);
-
-/**
- * struct logfs_obj_alias - list of object aliases
- */
-struct logfs_obj_alias {
- __be64 ino;
- __be64 bix;
- __be64 val;
- u8 level;
- u8 pad[5];
- __be16 child_no;
-};
-
-SIZE_CHECK(logfs_obj_alias, 32);
-
-/**
- * Compression types.
- *
- * COMPR_NONE - uncompressed
- * COMPR_ZLIB - compressed with zlib
- */
-enum {
- COMPR_NONE = 0,
- COMPR_ZLIB = 1,
-};
-
-/*
- * Journal entries come in groups of 16. 
First group contains unique - * entries, next groups contain one entry per level - * - * JE_FIRST - smallest possible journal entry number - * - * JEG_BASE - base group, containing unique entries - * JE_COMMIT - commit entry, validates all previous entries - * JE_DYNSB - dynamic superblock, anything that ought to be in the - * superblock but cannot because it is read-write data - * JE_ANCHOR - anchor aka master inode aka inode file's inode - * JE_ERASECOUNT erasecounts for all journal segments - * JE_SPILLOUT - unused - * JE_SEG_ALIAS - aliases segments - * JE_AREA - area description - * - * JE_LAST - largest possible journal entry number - */ -enum { - JE_FIRST = 0x01, - - JEG_BASE = 0x00, - JE_COMMIT = 0x02, - JE_DYNSB = 0x03, - JE_ANCHOR = 0x04, - JE_ERASECOUNT = 0x05, - JE_SPILLOUT = 0x06, - JE_OBJ_ALIAS = 0x0d, - JE_AREA = 0x0e, - - JE_LAST = 0x0e, -}; - -#endif diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c deleted file mode 100644 index bf19bf4a243f..000000000000 --- a/fs/logfs/readwrite.c +++ /dev/null @@ -1,2298 +0,0 @@ -/* - * fs/logfs/readwrite.c - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - * - * - * Actually contains five sets of very similar functions: - * read read blocks from a file - * seek_hole find next hole - * seek_data find next data block - * valid check whether a block still belongs to a file - * write write blocks to a file - * delete delete a block (for directories and ifile) - * rewrite move existing blocks of a file to a new location (gc helper) - * truncate truncate a file - */ -#include "logfs.h" -#include <linux/sched.h> -#include <linux/slab.h> - -static u64 adjust_bix(u64 bix, level_t level) -{ - switch (level) { - case 0: - return bix; - case LEVEL(1): - return max_t(u64, bix, I0_BLOCKS); - case LEVEL(2): - return max_t(u64, bix, I1_BLOCKS); - case LEVEL(3): - return max_t(u64, bix, I2_BLOCKS); - case LEVEL(4): - return max_t(u64, bix, I3_BLOCKS); - case LEVEL(5): - return max_t(u64, bix, I4_BLOCKS); - default: - WARN_ON(1); - return bix; - } -} - -static inline u64 maxbix(u8 height) -{ - return 1ULL << (LOGFS_BLOCK_BITS * height); -} - -/** - * The inode address space is cut in two halves. Lower half belongs to data - * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is - * set, the actual block index (bix) and level can be derived from the page - * index. - * - * The lowest three bits of the block index are set to 0 after packing and - * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored - * anyway this is harmless. 
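A userspace model of the packing scheme just described, assuming a 64-bit pgoff_t (so ARCH_SHIFT is 32); it omits the adjust_bix() clamping that the kernel applies after unpacking:

#include <assert.h>
#include <stdint.h>

#define LOGFS_BLOCK_BITS 9
#define ARCH_SHIFT	 32	/* BITS_PER_LONG - 32 on 64-bit */
#define INDIRECT_BIT	 (0x80000000ULL << ARCH_SHIFT)
#define LEVEL_SHIFT	 (28 + ARCH_SHIFT)

static uint64_t pack_index(uint64_t bix, unsigned level)
{
	if (level == 0)
		return bix;
	return INDIRECT_BIT | ((uint64_t)level << LEVEL_SHIFT)
			    | (bix >> (level * LOGFS_BLOCK_BITS));
}

static void unpack_index(uint64_t index, uint64_t *bix, unsigned *level)
{
	if (!(index & INDIRECT_BIT)) {
		*bix = index;
		*level = 0;
		return;
	}
	*level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
	*bix = (index << (*level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
}

int main(void)
{
	uint64_t bix;
	unsigned level;

	/* 0x1200 has its low 9 bits clear, so it survives the
	 * lossy low-bit truncation exactly */
	unpack_index(pack_index(0x1200, 1), &bix, &level);
	assert(level == 1 && bix == 0x1200);
	return 0;
}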
- */ -#define ARCH_SHIFT (BITS_PER_LONG - 32) -#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT) -#define LEVEL_SHIFT (28 + ARCH_SHIFT) -static inline pgoff_t first_indirect_block(void) -{ - return INDIRECT_BIT | (1ULL << LEVEL_SHIFT); -} - -pgoff_t logfs_pack_index(u64 bix, level_t level) -{ - pgoff_t index; - - BUG_ON(bix >= INDIRECT_BIT); - if (level == 0) - return bix; - - index = INDIRECT_BIT; - index |= (__force long)level << LEVEL_SHIFT; - index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS); - return index; -} - -void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level) -{ - u8 __level; - - if (!(index & INDIRECT_BIT)) { - *bix = index; - *level = 0; - return; - } - - __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT; - *level = LEVEL(__level); - *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT; - *bix = adjust_bix(*bix, *level); - return; -} -#undef ARCH_SHIFT -#undef INDIRECT_BIT -#undef LEVEL_SHIFT - -/* - * Time is stored as nanoseconds since the epoch. - */ -static struct timespec be64_to_timespec(__be64 betime) -{ - return ns_to_timespec(be64_to_cpu(betime)); -} - -static __be64 timespec_to_be64(struct timespec tsp) -{ - return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec); -} - -static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode) -{ - struct logfs_inode *li = logfs_inode(inode); - int i; - - inode->i_mode = be16_to_cpu(di->di_mode); - li->li_height = di->di_height; - li->li_flags = be32_to_cpu(di->di_flags); - i_uid_write(inode, be32_to_cpu(di->di_uid)); - i_gid_write(inode, be32_to_cpu(di->di_gid)); - inode->i_size = be64_to_cpu(di->di_size); - logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes)); - inode->i_atime = be64_to_timespec(di->di_atime); - inode->i_ctime = be64_to_timespec(di->di_ctime); - inode->i_mtime = be64_to_timespec(di->di_mtime); - set_nlink(inode, be32_to_cpu(di->di_refcount)); - inode->i_generation = be32_to_cpu(di->di_generation); - - switch (inode->i_mode & S_IFMT) { - case S_IFSOCK: /* fall through */ - case S_IFBLK: /* fall through */ - case S_IFCHR: /* fall through */ - case S_IFIFO: - inode->i_rdev = be64_to_cpu(di->di_data[0]); - break; - case S_IFDIR: /* fall through */ - case S_IFREG: /* fall through */ - case S_IFLNK: - for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) - li->li_data[i] = be64_to_cpu(di->di_data[i]); - break; - default: - BUG(); - } -} - -static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di) -{ - struct logfs_inode *li = logfs_inode(inode); - int i; - - di->di_mode = cpu_to_be16(inode->i_mode); - di->di_height = li->li_height; - di->di_pad = 0; - di->di_flags = cpu_to_be32(li->li_flags); - di->di_uid = cpu_to_be32(i_uid_read(inode)); - di->di_gid = cpu_to_be32(i_gid_read(inode)); - di->di_size = cpu_to_be64(i_size_read(inode)); - di->di_used_bytes = cpu_to_be64(li->li_used_bytes); - di->di_atime = timespec_to_be64(inode->i_atime); - di->di_ctime = timespec_to_be64(inode->i_ctime); - di->di_mtime = timespec_to_be64(inode->i_mtime); - di->di_refcount = cpu_to_be32(inode->i_nlink); - di->di_generation = cpu_to_be32(inode->i_generation); - - switch (inode->i_mode & S_IFMT) { - case S_IFSOCK: /* fall through */ - case S_IFBLK: /* fall through */ - case S_IFCHR: /* fall through */ - case S_IFIFO: - di->di_data[0] = cpu_to_be64(inode->i_rdev); - break; - case S_IFDIR: /* fall through */ - case S_IFREG: /* fall through */ - case S_IFLNK: - for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++) - di->di_data[i] = cpu_to_be64(li->li_data[i]); - break; - default: - 
BUG(); - } -} - -static void __logfs_set_blocks(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct logfs_inode *li = logfs_inode(inode); - - inode->i_blocks = ULONG_MAX; - if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX) - inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9; -} - -void logfs_set_blocks(struct inode *inode, u64 bytes) -{ - struct logfs_inode *li = logfs_inode(inode); - - li->li_used_bytes = bytes; - __logfs_set_blocks(inode); -} - -static void prelock_page(struct super_block *sb, struct page *page, int lock) -{ - struct logfs_super *super = logfs_super(sb); - - BUG_ON(!PageLocked(page)); - if (lock) { - BUG_ON(PagePreLocked(page)); - SetPagePreLocked(page); - } else { - /* We are in GC path. */ - if (PagePreLocked(page)) - super->s_lock_count++; - else - SetPagePreLocked(page); - } -} - -static void preunlock_page(struct super_block *sb, struct page *page, int lock) -{ - struct logfs_super *super = logfs_super(sb); - - BUG_ON(!PageLocked(page)); - if (lock) - ClearPagePreLocked(page); - else { - /* We are in GC path. */ - BUG_ON(!PagePreLocked(page)); - if (super->s_lock_count) - super->s_lock_count--; - else - ClearPagePreLocked(page); - } -} - -/* - * Logfs is prone to an AB-BA deadlock where one task tries to acquire - * s_write_mutex with a locked page and GC tries to get that page while holding - * s_write_mutex. - * To solve this issue logfs will ignore the page lock iff the page in question - * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked - * in addition to PG_locked. - */ -void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock) -{ - struct logfs_super *super = logfs_super(sb); - - if (page) - prelock_page(sb, page, lock); - - if (lock) { - mutex_lock(&super->s_write_mutex); - logfs_gc_pass(sb); - /* FIXME: We also have to check for shadowed space - * and mempool fill grade */ - } -} - -void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock) -{ - struct logfs_super *super = logfs_super(sb); - - if (page) - preunlock_page(sb, page, lock); - /* Order matters - we must clear PG_pre_locked before releasing - * s_write_mutex or we could race against another task. */ - if (lock) - mutex_unlock(&super->s_write_mutex); -} - -static struct page *logfs_get_read_page(struct inode *inode, u64 bix, - level_t level) -{ - return find_or_create_page(inode->i_mapping, - logfs_pack_index(bix, level), GFP_NOFS); -} - -static void logfs_put_read_page(struct page *page) -{ - unlock_page(page); - put_page(page); -} - -static void logfs_lock_write_page(struct page *page) -{ - int loop = 0; - - while (unlikely(!trylock_page(page))) { - if (loop++ > 0x1000) { - /* Has been observed once so far... */ - printk(KERN_ERR "stack at %p\n", &loop); - BUG(); - } - if (PagePreLocked(page)) { - /* Holder of page lock is waiting for us, it - * is safe to use this page. */ - break; - } - /* Some other process has this page locked and has - * nothing to do with us. Wait for it to finish. 
- */ - schedule(); - } - BUG_ON(!PageLocked(page)); -} - -static struct page *logfs_get_write_page(struct inode *inode, u64 bix, - level_t level) -{ - struct address_space *mapping = inode->i_mapping; - pgoff_t index = logfs_pack_index(bix, level); - struct page *page; - int err; - -repeat: - page = find_get_page(mapping, index); - if (!page) { - page = __page_cache_alloc(GFP_NOFS); - if (!page) - return NULL; - err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS); - if (unlikely(err)) { - put_page(page); - if (err == -EEXIST) - goto repeat; - return NULL; - } - } else logfs_lock_write_page(page); - BUG_ON(!PageLocked(page)); - return page; -} - -static void logfs_unlock_write_page(struct page *page) -{ - if (!PagePreLocked(page)) - unlock_page(page); -} - -static void logfs_put_write_page(struct page *page) -{ - logfs_unlock_write_page(page); - put_page(page); -} - -static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level, - int rw) -{ - if (rw == READ) - return logfs_get_read_page(inode, bix, level); - else - return logfs_get_write_page(inode, bix, level); -} - -static void logfs_put_page(struct page *page, int rw) -{ - if (rw == READ) - logfs_put_read_page(page); - else - logfs_put_write_page(page); -} - -static unsigned long __get_bits(u64 val, int skip, int no) -{ - u64 ret = val; - - ret >>= skip * no; - ret <<= 64 - no; - ret >>= 64 - no; - return ret; -} - -static unsigned long get_bits(u64 val, level_t skip) -{ - return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS); -} - -static inline void init_shadow_tree(struct super_block *sb, - struct shadow_tree *tree) -{ - struct logfs_super *super = logfs_super(sb); - - btree_init_mempool64(&tree->new, super->s_btree_pool); - btree_init_mempool64(&tree->old, super->s_btree_pool); -} - -static void indirect_write_block(struct logfs_block *block) -{ - struct page *page; - struct inode *inode; - int ret; - - page = block->page; - inode = page->mapping->host; - logfs_lock_write_page(page); - ret = logfs_write_buf(inode, page, 0); - logfs_unlock_write_page(page); - /* - * This needs some rework. Unless you want your filesystem to run - * completely synchronously (you don't), the filesystem will always - * report writes as 'successful' before the actual work has been - * done. The actual work gets done here and this is where any errors - * will show up. And there isn't much we can do about it, really. - * - * Some attempts to fix the errors (move from bad blocks, retry io,...) - * have already been done, so anything left should be either a broken - * device or a bug somewhere in logfs itself. Being relatively new, - * the odds currently favor a bug, so for now the line below isn't - * entirely tasteles. - */ - BUG_ON(ret); -} - -static void inode_write_block(struct logfs_block *block) -{ - struct inode *inode; - int ret; - - inode = block->inode; - if (inode->i_ino == LOGFS_INO_MASTER) - logfs_write_anchor(inode->i_sb); - else { - ret = __logfs_write_inode(inode, NULL, 0); - /* see indirect_write_block comment */ - BUG_ON(ret); - } -} - -/* - * This silences a false, yet annoying gcc warning. I hate it when my editor - * jumps into bitops.h each time I recompile this file. - * TODO: Complain to gcc folks about this and upgrade compiler. 
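get_bits() above selects one LOGFS_BLOCK_BITS-wide slot number out of a block index, one slot per level of indirection. The double-shift in __get_bits() is equivalent (for field widths below 64) to the shift-and-mask form used in this standalone sketch:

#include <assert.h>
#include <stdint.h>

#define LOGFS_BLOCK_BITS 9

/* Pointer slot to follow at the given indirection level */
static unsigned long get_bits(uint64_t bix, int level)
{
	return (bix >> (level * LOGFS_BLOCK_BITS))
		& ((1ULL << LOGFS_BLOCK_BITS) - 1);
}

int main(void)
{
	/* block index built from slot 0x25 at i1 and slot 0x13 at leaf */
	uint64_t bix = ((uint64_t)0x25 << LOGFS_BLOCK_BITS) | 0x13;

	assert(get_bits(bix, 0) == 0x13);
	assert(get_bits(bix, 1) == 0x25);
	return 0;
}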
- */ -static unsigned long fnb(const unsigned long *addr, - unsigned long size, unsigned long offset) -{ - return find_next_bit(addr, size, offset); -} - -static __be64 inode_val0(struct inode *inode) -{ - struct logfs_inode *li = logfs_inode(inode); - u64 val; - - /* - * Explicit shifting generates good code, but must match the format - * of the structure. Add some paranoia just in case. - */ - BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0); - BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2); - BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4); - - val = (u64)inode->i_mode << 48 | - (u64)li->li_height << 40 | - (u64)li->li_flags; - return cpu_to_be64(val); -} - -static int inode_write_alias(struct super_block *sb, - struct logfs_block *block, write_alias_t *write_one_alias) -{ - struct inode *inode = block->inode; - struct logfs_inode *li = logfs_inode(inode); - unsigned long pos; - u64 ino , bix; - __be64 val; - level_t level; - int err; - - for (pos = 0; ; pos++) { - pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); - if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS) - return 0; - - switch (pos) { - case INODE_HEIGHT_OFS: - val = inode_val0(inode); - break; - case INODE_USED_OFS: - val = cpu_to_be64(li->li_used_bytes); - break; - case INODE_SIZE_OFS: - val = cpu_to_be64(i_size_read(inode)); - break; - case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1: - val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]); - break; - default: - BUG(); - } - - ino = LOGFS_INO_MASTER; - bix = inode->i_ino; - level = LEVEL(0); - err = write_one_alias(sb, ino, bix, level, pos, val); - if (err) - return err; - } -} - -static int indirect_write_alias(struct super_block *sb, - struct logfs_block *block, write_alias_t *write_one_alias) -{ - unsigned long pos; - struct page *page = block->page; - u64 ino , bix; - __be64 *child, val; - level_t level; - int err; - - for (pos = 0; ; pos++) { - pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); - if (pos >= LOGFS_BLOCK_FACTOR) - return 0; - - ino = page->mapping->host->i_ino; - logfs_unpack_index(page->index, &bix, &level); - child = kmap_atomic(page); - val = child[pos]; - kunmap_atomic(child); - err = write_one_alias(sb, ino, bix, level, pos, val); - if (err) - return err; - } -} - -int logfs_write_obj_aliases_pagecache(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_block *block; - int err; - - list_for_each_entry(block, &super->s_object_alias, alias_list) { - err = block->ops->write_alias(sb, block, write_alias_journal); - if (err) - return err; - } - return 0; -} - -void __free_block(struct super_block *sb, struct logfs_block *block) -{ - BUG_ON(!list_empty(&block->item_list)); - list_del(&block->alias_list); - mempool_free(block, logfs_super(sb)->s_block_pool); -} - -static void inode_free_block(struct super_block *sb, struct logfs_block *block) -{ - struct inode *inode = block->inode; - - logfs_inode(inode)->li_block = NULL; - __free_block(sb, block); -} - -static void indirect_free_block(struct super_block *sb, - struct logfs_block *block) -{ - struct page *page = block->page; - - if (PagePrivate(page)) { - ClearPagePrivate(page); - put_page(page); - set_page_private(page, 0); - } - __free_block(sb, block); -} - - -static const struct logfs_block_ops inode_block_ops = { - .write_block = inode_write_block, - .free_block = inode_free_block, - .write_alias = inode_write_alias, -}; - -const struct logfs_block_ops indirect_block_ops = { - 
.write_block = indirect_write_block, - .free_block = indirect_free_block, - .write_alias = indirect_write_alias, -}; - -struct logfs_block *__alloc_block(struct super_block *sb, - u64 ino, u64 bix, level_t level) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_block *block; - - block = mempool_alloc(super->s_block_pool, GFP_NOFS); - memset(block, 0, sizeof(*block)); - INIT_LIST_HEAD(&block->alias_list); - INIT_LIST_HEAD(&block->item_list); - block->sb = sb; - block->ino = ino; - block->bix = bix; - block->level = level; - return block; -} - -static void alloc_inode_block(struct inode *inode) -{ - struct logfs_inode *li = logfs_inode(inode); - struct logfs_block *block; - - if (li->li_block) - return; - - block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0); - block->inode = inode; - li->li_block = block; - block->ops = &inode_block_ops; -} - -void initialize_block_counters(struct page *page, struct logfs_block *block, - __be64 *array, int page_is_empty) -{ - u64 ptr; - int i, start; - - block->partial = 0; - block->full = 0; - start = 0; - if (page->index < first_indirect_block()) { - /* Counters are pointless on level 0 */ - return; - } - if (page->index == first_indirect_block()) { - /* Skip unused pointers */ - start = I0_BLOCKS; - block->full = I0_BLOCKS; - } - if (!page_is_empty) { - for (i = start; i < LOGFS_BLOCK_FACTOR; i++) { - ptr = be64_to_cpu(array[i]); - if (ptr) - block->partial++; - if (ptr & LOGFS_FULLY_POPULATED) - block->full++; - } - } -} - -static void alloc_data_block(struct inode *inode, struct page *page) -{ - struct logfs_block *block; - u64 bix; - level_t level; - - if (PagePrivate(page)) - return; - - logfs_unpack_index(page->index, &bix, &level); - block = __alloc_block(inode->i_sb, inode->i_ino, bix, level); - block->page = page; - - SetPagePrivate(page); - get_page(page); - set_page_private(page, (unsigned long) block); - - block->ops = &indirect_block_ops; -} - -static void alloc_indirect_block(struct inode *inode, struct page *page, - int page_is_empty) -{ - struct logfs_block *block; - __be64 *array; - - if (PagePrivate(page)) - return; - - alloc_data_block(inode, page); - - block = logfs_block(page); - array = kmap_atomic(page); - initialize_block_counters(page, block, array, page_is_empty); - kunmap_atomic(array); -} - -static void block_set_pointer(struct page *page, int index, u64 ptr) -{ - struct logfs_block *block = logfs_block(page); - __be64 *array; - u64 oldptr; - - BUG_ON(!block); - array = kmap_atomic(page); - oldptr = be64_to_cpu(array[index]); - array[index] = cpu_to_be64(ptr); - kunmap_atomic(array); - SetPageUptodate(page); - - block->full += !!(ptr & LOGFS_FULLY_POPULATED) - - !!(oldptr & LOGFS_FULLY_POPULATED); - block->partial += !!ptr - !!oldptr; -} - -static u64 block_get_pointer(struct page *page, int index) -{ - __be64 *block; - u64 ptr; - - block = kmap_atomic(page); - ptr = be64_to_cpu(block[index]); - kunmap_atomic(block); - return ptr; -} - -static int logfs_read_empty(struct page *page) -{ - zero_user_segment(page, 0, PAGE_SIZE); - return 0; -} - -static int logfs_read_direct(struct inode *inode, struct page *page) -{ - struct logfs_inode *li = logfs_inode(inode); - pgoff_t index = page->index; - u64 block; - - block = li->li_data[index]; - if (!block) - return logfs_read_empty(page); - - return logfs_segment_read(inode, page, block, index, 0); -} - -static int logfs_read_loop(struct inode *inode, struct page *page, - int rw_context) -{ - struct logfs_inode *li = logfs_inode(inode); - u64 bix, bofs 
= li->li_data[INDIRECT_INDEX]; - level_t level, target_level; - int ret; - struct page *ipage; - - logfs_unpack_index(page->index, &bix, &target_level); - if (!bofs) - return logfs_read_empty(page); - - if (bix >= maxbix(li->li_height)) - return logfs_read_empty(page); - - for (level = LEVEL(li->li_height); - (__force u8)level > (__force u8)target_level; - level = SUBLEVEL(level)){ - ipage = logfs_get_page(inode, bix, level, rw_context); - if (!ipage) - return -ENOMEM; - - ret = logfs_segment_read(inode, ipage, bofs, bix, level); - if (ret) { - logfs_put_read_page(ipage); - return ret; - } - - bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); - logfs_put_page(ipage, rw_context); - if (!bofs) - return logfs_read_empty(page); - } - - return logfs_segment_read(inode, page, bofs, bix, 0); -} - -static int logfs_read_block(struct inode *inode, struct page *page, - int rw_context) -{ - pgoff_t index = page->index; - - if (index < I0_BLOCKS) - return logfs_read_direct(inode, page); - return logfs_read_loop(inode, page, rw_context); -} - -static int logfs_exist_loop(struct inode *inode, u64 bix) -{ - struct logfs_inode *li = logfs_inode(inode); - u64 bofs = li->li_data[INDIRECT_INDEX]; - level_t level; - int ret; - struct page *ipage; - - if (!bofs) - return 0; - if (bix >= maxbix(li->li_height)) - return 0; - - for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { - ipage = logfs_get_read_page(inode, bix, level); - if (!ipage) - return -ENOMEM; - - ret = logfs_segment_read(inode, ipage, bofs, bix, level); - if (ret) { - logfs_put_read_page(ipage); - return ret; - } - - bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level))); - logfs_put_read_page(ipage); - if (!bofs) - return 0; - } - - return 1; -} - -int logfs_exist_block(struct inode *inode, u64 bix) -{ - struct logfs_inode *li = logfs_inode(inode); - - if (bix < I0_BLOCKS) - return !!li->li_data[bix]; - return logfs_exist_loop(inode, bix); -} - -static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data) -{ - struct logfs_inode *li = logfs_inode(inode); - - for (; bix < I0_BLOCKS; bix++) - if (data ^ (li->li_data[bix] == 0)) - return bix; - return I0_BLOCKS; -} - -static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data) -{ - struct logfs_inode *li = logfs_inode(inode); - __be64 *rblock; - u64 increment, bofs = li->li_data[INDIRECT_INDEX]; - level_t level; - int ret, slot; - struct page *page; - - BUG_ON(!bofs); - - for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) { - increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1)); - page = logfs_get_read_page(inode, bix, level); - if (!page) - return bix; - - ret = logfs_segment_read(inode, page, bofs, bix, level); - if (ret) { - logfs_put_read_page(page); - return bix; - } - - slot = get_bits(bix, SUBLEVEL(level)); - rblock = kmap_atomic(page); - while (slot < LOGFS_BLOCK_FACTOR) { - if (data && (rblock[slot] != 0)) - break; - if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED)) - break; - slot++; - bix += increment; - bix &= ~(increment - 1); - } - if (slot >= LOGFS_BLOCK_FACTOR) { - kunmap_atomic(rblock); - logfs_put_read_page(page); - return bix; - } - bofs = be64_to_cpu(rblock[slot]); - kunmap_atomic(rblock); - logfs_put_read_page(page); - if (!bofs) { - BUG_ON(data); - return bix; - } - } - return bix; -} - -/** - * logfs_seek_hole - find next hole starting at a given block index - * @inode: inode to search in - * @bix: block index to start searching - * - * Returns next hole. 
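(Both seek helpers use the same top-down walk as logfs_read_loop() above: start at the inode's top level, pick a slot with get_bits(bix, SUBLEVEL(level)), follow the pointer, descend. Assuming 512 pointers per block, the slot computation reduces to a shift and a mask:

	slot = (bix >> (9 * ((u8)level - 1))) & 0x1ff;

so bix 0x12345 sits in slot 0x145 of its level-1 block, which in turn is slot 0x91 of the level-2 block above it.)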
If the file doesn't contain any further holes, the - * block address next to eof is returned instead. - */ -u64 logfs_seek_hole(struct inode *inode, u64 bix) -{ - struct logfs_inode *li = logfs_inode(inode); - - if (bix < I0_BLOCKS) { - bix = seek_holedata_direct(inode, bix, 0); - if (bix < I0_BLOCKS) - return bix; - } - - if (!li->li_data[INDIRECT_INDEX]) - return bix; - else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED) - bix = maxbix(li->li_height); - else if (bix >= maxbix(li->li_height)) - return bix; - else { - bix = seek_holedata_loop(inode, bix, 0); - if (bix < maxbix(li->li_height)) - return bix; - /* Should not happen anymore. But if some port writes semi- - * corrupt images (as this one used to) we might run into it. - */ - WARN_ON_ONCE(bix == maxbix(li->li_height)); - } - - return bix; -} - -static u64 __logfs_seek_data(struct inode *inode, u64 bix) -{ - struct logfs_inode *li = logfs_inode(inode); - - if (bix < I0_BLOCKS) { - bix = seek_holedata_direct(inode, bix, 1); - if (bix < I0_BLOCKS) - return bix; - } - - if (bix < maxbix(li->li_height)) { - if (!li->li_data[INDIRECT_INDEX]) - bix = maxbix(li->li_height); - else - return seek_holedata_loop(inode, bix, 1); - } - - return bix; -} - -/** - * logfs_seek_data - find next data block after a given block index - * @inode: inode to search in - * @bix: block index to start searching - * - * Returns next data block. If the file doesn't contain any further data - * blocks, the last block in the file is returned instead. - */ -u64 logfs_seek_data(struct inode *inode, u64 bix) -{ - struct super_block *sb = inode->i_sb; - u64 ret, end; - - ret = __logfs_seek_data(inode, bix); - end = i_size_read(inode) >> sb->s_blocksize_bits; - if (ret >= end) - ret = max(bix, end); - return ret; -} - -static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs) -{ - return pure_ofs(li->li_data[bix]) == ofs; -} - -static int __logfs_is_valid_loop(struct inode *inode, u64 bix, - u64 ofs, u64 bofs) -{ - struct logfs_inode *li = logfs_inode(inode); - level_t level; - int ret; - struct page *page; - - for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){ - page = logfs_get_write_page(inode, bix, level); - BUG_ON(!page); - - ret = logfs_segment_read(inode, page, bofs, bix, level); - if (ret) { - logfs_put_write_page(page); - return 0; - } - - bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level))); - logfs_put_write_page(page); - if (!bofs) - return 0; - - if (pure_ofs(bofs) == ofs) - return 1; - } - return 0; -} - -static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs) -{ - struct logfs_inode *li = logfs_inode(inode); - u64 bofs = li->li_data[INDIRECT_INDEX]; - - if (!bofs) - return 0; - - if (bix >= maxbix(li->li_height)) - return 0; - - if (pure_ofs(bofs) == ofs) - return 1; - - return __logfs_is_valid_loop(inode, bix, ofs, bofs); -} - -static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs) -{ - struct logfs_inode *li = logfs_inode(inode); - - if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1) - return 0; - - if (bix < I0_BLOCKS) - return logfs_is_valid_direct(li, bix, ofs); - return logfs_is_valid_loop(inode, bix, ofs); -} - -/** - * logfs_is_valid_block - check whether this block is still valid - * - * @sb: superblock - * @ofs: block physical offset - * @ino: block inode number - * @bix: block index - * @gc_level: block level - * - * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will - * become invalid once the journal is written. 
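A sketch of how a GC pass would act on the tri-state result (illustrative only; the actual consumer is logfs's GC code, which is not part of this file):

	ret = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
	if (ret == 1)
		err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);

With ret == 0 the space is reclaimed as soon as the segment is erased; with ret == 2 nothing needs doing, the pending journal commit will invalidate the block by itself.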
- */ -int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix, - gc_level_t gc_level) -{ - struct logfs_super *super = logfs_super(sb); - struct inode *inode; - int ret, cookie; - - /* Umount closes a segment with free blocks remaining. Those - * blocks are by definition invalid. */ - if (ino == -1) - return 0; - - LOGFS_BUG_ON((u64)(u_long)ino != ino, sb); - - inode = logfs_safe_iget(sb, ino, &cookie); - if (IS_ERR(inode)) - goto invalid; - - ret = __logfs_is_valid_block(inode, bix, ofs); - logfs_safe_iput(inode, cookie); - if (ret) - return ret; - -invalid: - /* Block is nominally invalid, but may still sit in the shadow tree, - * waiting for a journal commit. - */ - if (btree_lookup64(&super->s_shadow_tree.old, ofs)) - return 2; - return 0; -} - -int logfs_readpage_nolock(struct page *page) -{ - struct inode *inode = page->mapping->host; - int ret = -EIO; - - ret = logfs_read_block(inode, page, READ); - - if (ret) { - ClearPageUptodate(page); - SetPageError(page); - } else { - SetPageUptodate(page); - ClearPageError(page); - } - flush_dcache_page(page); - - return ret; -} - -static int logfs_reserve_bytes(struct inode *inode, int bytes) -{ - struct logfs_super *super = logfs_super(inode->i_sb); - u64 available = super->s_free_bytes + super->s_dirty_free_bytes - - super->s_dirty_used_bytes - super->s_dirty_pages; - - if (!bytes) - return 0; - - if (available < bytes) - return -ENOSPC; - - if (available < bytes + super->s_root_reserve && - !capable(CAP_SYS_RESOURCE)) - return -ENOSPC; - - return 0; -} - -int get_page_reserve(struct inode *inode, struct page *page) -{ - struct logfs_super *super = logfs_super(inode->i_sb); - struct logfs_block *block = logfs_block(page); - int ret; - - if (block && block->reserved_bytes) - return 0; - - logfs_get_wblocks(inode->i_sb, page, WF_LOCK); - while ((ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE)) && - !list_empty(&super->s_writeback_list)) { - block = list_entry(super->s_writeback_list.next, - struct logfs_block, alias_list); - block->ops->write_block(block); - } - if (!ret) { - alloc_data_block(inode, page); - block = logfs_block(page); - block->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE; - super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE; - list_move_tail(&block->alias_list, &super->s_writeback_list); - } - logfs_put_wblocks(inode->i_sb, page, WF_LOCK); - return ret; -} - -/* - * We are protected by write lock. Push victims up to superblock level - * and release transaction when appropriate. - */ -/* FIXME: This is currently called from the wrong spots. 
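(What this amounts to is a two-phase commit for directory operations: CREATE_1/UNLINK_1 park the inode number in s_victim_ino, and only after both dentry and inode have reached the medium does the matching *_2 event clear it and free the transaction. A crash between the phases leaves s_victim_ino set in the journal, telling mount-time recovery which half-finished inode to clean up; the rename states do the same with s_rename_dir and s_rename_pos.)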
*/ -static void logfs_handle_transaction(struct inode *inode, - struct logfs_transaction *ta) -{ - struct logfs_super *super = logfs_super(inode->i_sb); - - if (!ta) - return; - logfs_inode(inode)->li_block->ta = NULL; - - if (inode->i_ino != LOGFS_INO_MASTER) { - BUG(); /* FIXME: Yes, this needs more thought */ - /* just remember the transaction until inode is written */ - //BUG_ON(logfs_inode(inode)->li_transaction); - //logfs_inode(inode)->li_transaction = ta; - return; - } - - switch (ta->state) { - case CREATE_1: /* fall through */ - case UNLINK_1: - BUG_ON(super->s_victim_ino); - super->s_victim_ino = ta->ino; - break; - case CREATE_2: /* fall through */ - case UNLINK_2: - BUG_ON(super->s_victim_ino != ta->ino); - super->s_victim_ino = 0; - /* transaction ends here - free it */ - kfree(ta); - break; - case CROSS_RENAME_1: - BUG_ON(super->s_rename_dir); - BUG_ON(super->s_rename_pos); - super->s_rename_dir = ta->dir; - super->s_rename_pos = ta->pos; - break; - case CROSS_RENAME_2: - BUG_ON(super->s_rename_dir != ta->dir); - BUG_ON(super->s_rename_pos != ta->pos); - super->s_rename_dir = 0; - super->s_rename_pos = 0; - kfree(ta); - break; - case TARGET_RENAME_1: - BUG_ON(super->s_rename_dir); - BUG_ON(super->s_rename_pos); - BUG_ON(super->s_victim_ino); - super->s_rename_dir = ta->dir; - super->s_rename_pos = ta->pos; - super->s_victim_ino = ta->ino; - break; - case TARGET_RENAME_2: - BUG_ON(super->s_rename_dir != ta->dir); - BUG_ON(super->s_rename_pos != ta->pos); - BUG_ON(super->s_victim_ino != ta->ino); - super->s_rename_dir = 0; - super->s_rename_pos = 0; - break; - case TARGET_RENAME_3: - BUG_ON(super->s_rename_dir); - BUG_ON(super->s_rename_pos); - BUG_ON(super->s_victim_ino != ta->ino); - super->s_victim_ino = 0; - kfree(ta); - break; - default: - BUG(); - } -} - -/* - * Not strictly a reservation, but rather a check that we still have enough - * space to satisfy the write. - */ -static int logfs_reserve_blocks(struct inode *inode, int blocks) -{ - return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE); -} - -struct write_control { - u64 ofs; - long flags; -}; - -static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix, - level_t level, u64 old_ofs) -{ - struct logfs_super *super = logfs_super(inode->i_sb); - struct logfs_shadow *shadow; - - shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS); - memset(shadow, 0, sizeof(*shadow)); - shadow->ino = inode->i_ino; - shadow->bix = bix; - shadow->gc_level = expand_level(inode->i_ino, level); - shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED; - return shadow; -} - -static void free_shadow(struct inode *inode, struct logfs_shadow *shadow) -{ - struct logfs_super *super = logfs_super(inode->i_sb); - - mempool_free(shadow, super->s_shadow_pool); -} - -static void mark_segment(struct shadow_tree *tree, u32 segno) -{ - int err; - - if (!btree_lookup32(&tree->segment_map, segno)) { - err = btree_insert32(&tree->segment_map, segno, (void *)1, - GFP_NOFS); - BUG_ON(err); - tree->no_shadowed_segments++; - } -} - -/** - * fill_shadow_tree - Propagate shadow tree changes due to a write - * @inode: Inode owning the page - * @page: Struct page that was written - * @shadow: Shadow for the current write - * - * Writes in logfs can result in two semi-valid objects. The old object - * is still valid as long as it can be reached by following pointers on - * the medium. Only when writes propagate all the way up to the journal - * has the new object safely replaced the old one. 
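In code, one such write is bracketed roughly as follows, condensed from logfs_write_i0() further down:

	shadow = alloc_shadow(inode, bix, level, old_ofs);
	err = logfs_segment_write(inode, page, shadow);
	fill_shadow_tree(inode, page, shadow);

logfs_segment_write() fills in new_ofs and new_len; fill_shadow_tree() then keeps the shadow in the superblock's shadow tree until the journal commit frees it.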
- * - * To handle this problem, a struct logfs_shadow is used to represent - * every single write. It is attached to the indirect block, which is - * marked dirty. When the indirect block is written, its shadows are - * handed up to the next indirect block (or inode). Ultimately they - * will reach the master inode and be freed upon journal commit. - * - * This function handles a single step in the propagation. It adds the - * shadow for the current write to the tree, along with any shadows in - * the page's tree, in case it was an indirect block. If a page is - * written, the inode parameter is left NULL, if an inode is written, - * the page parameter is left NULL. - */ -static void fill_shadow_tree(struct inode *inode, struct page *page, - struct logfs_shadow *shadow) -{ - struct logfs_super *super = logfs_super(inode->i_sb); - struct logfs_block *block = logfs_block(page); - struct shadow_tree *tree = &super->s_shadow_tree; - - if (PagePrivate(page)) { - if (block->alias_map) - super->s_no_object_aliases -= bitmap_weight( - block->alias_map, LOGFS_BLOCK_FACTOR); - logfs_handle_transaction(inode, block->ta); - block->ops->free_block(inode->i_sb, block); - } - if (shadow) { - if (shadow->old_ofs) - btree_insert64(&tree->old, shadow->old_ofs, shadow, - GFP_NOFS); - else - btree_insert64(&tree->new, shadow->new_ofs, shadow, - GFP_NOFS); - - super->s_dirty_used_bytes += shadow->new_len; - super->s_dirty_free_bytes += shadow->old_len; - mark_segment(tree, shadow->old_ofs >> super->s_segshift); - mark_segment(tree, shadow->new_ofs >> super->s_segshift); - } -} - -static void logfs_set_alias(struct super_block *sb, struct logfs_block *block, - long child_no) -{ - struct logfs_super *super = logfs_super(sb); - - if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) { - /* Aliases in the master inode are pointless. */ - return; - } - - if (!test_bit(child_no, block->alias_map)) { - set_bit(child_no, block->alias_map); - super->s_no_object_aliases++; - } - list_move_tail(&block->alias_list, &super->s_object_alias); -} - -/* - * Object aliases can and often do change the size and occupied space of a - * file. So not only do we have to change the pointers, we also have to - * change inode->i_size and li->li_used_bytes. Which is done by setting - * another two object aliases for the inode itself.
- */ -static void set_iused(struct inode *inode, struct logfs_shadow *shadow) -{ - struct logfs_inode *li = logfs_inode(inode); - - if (shadow->new_len == shadow->old_len) - return; - - alloc_inode_block(inode); - li->li_used_bytes += shadow->new_len - shadow->old_len; - __logfs_set_blocks(inode); - logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS); - logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS); -} - -static int logfs_write_i0(struct inode *inode, struct page *page, - struct write_control *wc) -{ - struct logfs_shadow *shadow; - u64 bix; - level_t level; - int full, err = 0; - - logfs_unpack_index(page->index, &bix, &level); - if (wc->ofs == 0) - if (logfs_reserve_blocks(inode, 1)) - return -ENOSPC; - - shadow = alloc_shadow(inode, bix, level, wc->ofs); - if (wc->flags & WF_WRITE) - err = logfs_segment_write(inode, page, shadow); - if (wc->flags & WF_DELETE) - logfs_segment_delete(inode, shadow); - if (err) { - free_shadow(inode, shadow); - return err; - } - - set_iused(inode, shadow); - full = 1; - if (level != 0) { - alloc_indirect_block(inode, page, 0); - full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR; - } - fill_shadow_tree(inode, page, shadow); - wc->ofs = shadow->new_ofs; - if (wc->ofs && full) - wc->ofs |= LOGFS_FULLY_POPULATED; - return 0; -} - -static int logfs_write_direct(struct inode *inode, struct page *page, - long flags) -{ - struct logfs_inode *li = logfs_inode(inode); - struct write_control wc = { - .ofs = li->li_data[page->index], - .flags = flags, - }; - int err; - - alloc_inode_block(inode); - - err = logfs_write_i0(inode, page, &wc); - if (err) - return err; - - li->li_data[page->index] = wc.ofs; - logfs_set_alias(inode->i_sb, li->li_block, - page->index + INODE_POINTER_OFS); - return 0; -} - -static int ptr_change(u64 ofs, struct page *page) -{ - struct logfs_block *block = logfs_block(page); - int empty0, empty1, full0, full1; - - empty0 = ofs == 0; - empty1 = block->partial == 0; - if (empty0 != empty1) - return 1; - - /* The !! 
is necessary to shrink result to int */ - full0 = !!(ofs & LOGFS_FULLY_POPULATED); - full1 = block->full == LOGFS_BLOCK_FACTOR; - if (full0 != full1) - return 1; - return 0; -} - -static int __logfs_write_rec(struct inode *inode, struct page *page, - struct write_control *this_wc, - pgoff_t bix, level_t target_level, level_t level) -{ - int ret, page_empty = 0; - int child_no = get_bits(bix, SUBLEVEL(level)); - struct page *ipage; - struct write_control child_wc = { - .flags = this_wc->flags, - }; - - ipage = logfs_get_write_page(inode, bix, level); - if (!ipage) - return -ENOMEM; - - if (this_wc->ofs) { - ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); - if (ret) - goto out; - } else if (!PageUptodate(ipage)) { - page_empty = 1; - logfs_read_empty(ipage); - } - - child_wc.ofs = block_get_pointer(ipage, child_no); - - if ((__force u8)level-1 > (__force u8)target_level) - ret = __logfs_write_rec(inode, page, &child_wc, bix, - target_level, SUBLEVEL(level)); - else - ret = logfs_write_i0(inode, page, &child_wc); - - if (ret) - goto out; - - alloc_indirect_block(inode, ipage, page_empty); - block_set_pointer(ipage, child_no, child_wc.ofs); - /* FIXME: first condition seems superfluous */ - if (child_wc.ofs || logfs_block(ipage)->partial) - this_wc->flags |= WF_WRITE; - /* the condition on this_wc->ofs ensures that we won't consume extra - * space for indirect blocks in the future, which we cannot reserve */ - if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage)) - ret = logfs_write_i0(inode, ipage, this_wc); - else - logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no); -out: - logfs_put_write_page(ipage); - return ret; -} - -static int logfs_write_rec(struct inode *inode, struct page *page, - pgoff_t bix, level_t target_level, long flags) -{ - struct logfs_inode *li = logfs_inode(inode); - struct write_control wc = { - .ofs = li->li_data[INDIRECT_INDEX], - .flags = flags, - }; - int ret; - - alloc_inode_block(inode); - - if (li->li_height > (__force u8)target_level) - ret = __logfs_write_rec(inode, page, &wc, bix, target_level, - LEVEL(li->li_height)); - else - ret = logfs_write_i0(inode, page, &wc); - if (ret) - return ret; - - if (li->li_data[INDIRECT_INDEX] != wc.ofs) { - li->li_data[INDIRECT_INDEX] = wc.ofs; - logfs_set_alias(inode->i_sb, li->li_block, - INDIRECT_INDEX + INODE_POINTER_OFS); - } - return ret; -} - -void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta) -{ - alloc_inode_block(inode); - logfs_inode(inode)->li_block->ta = ta; -} - -void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta) -{ - struct logfs_block *block = logfs_inode(inode)->li_block; - - if (block && block->ta) - block->ta = NULL; -} - -static int grow_inode(struct inode *inode, u64 bix, level_t level) -{ - struct logfs_inode *li = logfs_inode(inode); - u8 height = (__force u8)level; - struct page *page; - struct write_control wc = { - .flags = WF_WRITE, - }; - int err; - - BUG_ON(height > 5 || li->li_height > 5); - while (height > li->li_height || bix >= maxbix(li->li_height)) { - page = logfs_get_write_page(inode, I0_BLOCKS + 1, - LEVEL(li->li_height + 1)); - if (!page) - return -ENOMEM; - logfs_read_empty(page); - alloc_indirect_block(inode, page, 1); - block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]); - err = logfs_write_i0(inode, page, &wc); - logfs_put_write_page(page); - if (err) - return err; - li->li_data[INDIRECT_INDEX] = wc.ofs; - wc.ofs = 0; - li->li_height++; - logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS); 
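	/*
	 * One pass of the loop above per missing level: the old top block
	 * drops into slot 0 of a fresh indirect block and the tree grows
	 * by one level.  Assuming 512 pointers per block, a height-1 tree
	 * covers block indexes below 512, a height-2 tree below 512^2,
	 * and so on.
	 */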
- } - return 0; -} - -static int __logfs_write_buf(struct inode *inode, struct page *page, long flags) -{ - struct logfs_super *super = logfs_super(inode->i_sb); - pgoff_t index = page->index; - u64 bix; - level_t level; - int err; - - flags |= WF_WRITE | WF_DELETE; - inode->i_ctime = inode->i_mtime = current_time(inode); - - logfs_unpack_index(index, &bix, &level); - if (logfs_block(page) && logfs_block(page)->reserved_bytes) - super->s_dirty_pages -= logfs_block(page)->reserved_bytes; - - if (index < I0_BLOCKS) - return logfs_write_direct(inode, page, flags); - - bix = adjust_bix(bix, level); - err = grow_inode(inode, bix, level); - if (err) - return err; - return logfs_write_rec(inode, page, bix, level, flags); -} - -int logfs_write_buf(struct inode *inode, struct page *page, long flags) -{ - struct super_block *sb = inode->i_sb; - int ret; - - logfs_get_wblocks(sb, page, flags & WF_LOCK); - ret = __logfs_write_buf(inode, page, flags); - logfs_put_wblocks(sb, page, flags & WF_LOCK); - return ret; -} - -static int __logfs_delete(struct inode *inode, struct page *page) -{ - long flags = WF_DELETE; - int err; - - inode->i_ctime = inode->i_mtime = current_time(inode); - - if (page->index < I0_BLOCKS) - return logfs_write_direct(inode, page, flags); - err = grow_inode(inode, page->index, 0); - if (err) - return err; - return logfs_write_rec(inode, page, page->index, 0, flags); -} - -int logfs_delete(struct inode *inode, pgoff_t index, - struct shadow_tree *shadow_tree) -{ - struct super_block *sb = inode->i_sb; - struct page *page; - int ret; - - page = logfs_get_read_page(inode, index, 0); - if (!page) - return -ENOMEM; - - logfs_get_wblocks(sb, page, 1); - ret = __logfs_delete(inode, page); - logfs_put_wblocks(sb, page, 1); - - logfs_put_read_page(page); - - return ret; -} - -int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs, - gc_level_t gc_level, long flags) -{ - level_t level = shrink_level(gc_level); - struct page *page; - int err; - - page = logfs_get_write_page(inode, bix, level); - if (!page) - return -ENOMEM; - - err = logfs_segment_read(inode, page, ofs, bix, level); - if (!err) { - if (level != 0) - alloc_indirect_block(inode, page, 0); - err = logfs_write_buf(inode, page, flags); - if (!err && shrink_level(gc_level) == 0) { - /* Rewrite cannot mark the inode dirty but has to - * write it immediately. - * Q: Can't we just create an alias for the inode - * instead? And if not, why not? - */ - if (inode->i_ino == LOGFS_INO_MASTER) - logfs_write_anchor(inode->i_sb); - else { - err = __logfs_write_inode(inode, page, flags); - } - } - } - logfs_put_write_page(page); - return err; -} - -static int truncate_data_block(struct inode *inode, struct page *page, - u64 ofs, struct logfs_shadow *shadow, u64 size) -{ - loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits; - u64 bix; - level_t level; - int err; - - /* Does truncation happen within this page? 
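Only if the new size ends strictly inside it. E.g. with 4 KiB blocks, truncating to size 0x1200 touches only the page at index 1: bytes 0x200 through 0xfff are zeroed below and the shortened block is rewritten, while pages lying entirely beyond the new size are simply deleted by the callers.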
*/ - if (size <= pageofs || size - pageofs >= PAGE_SIZE) - return 0; - - logfs_unpack_index(page->index, &bix, &level); - BUG_ON(level != 0); - - err = logfs_segment_read(inode, page, ofs, bix, level); - if (err) - return err; - - zero_user_segment(page, size - pageofs, PAGE_SIZE); - return logfs_segment_write(inode, page, shadow); -} - -static int logfs_truncate_i0(struct inode *inode, struct page *page, - struct write_control *wc, u64 size) -{ - struct logfs_shadow *shadow; - u64 bix; - level_t level; - int err = 0; - - logfs_unpack_index(page->index, &bix, &level); - BUG_ON(level != 0); - shadow = alloc_shadow(inode, bix, level, wc->ofs); - - err = truncate_data_block(inode, page, wc->ofs, shadow, size); - if (err) { - free_shadow(inode, shadow); - return err; - } - - logfs_segment_delete(inode, shadow); - set_iused(inode, shadow); - fill_shadow_tree(inode, page, shadow); - wc->ofs = shadow->new_ofs; - return 0; -} - -static int logfs_truncate_direct(struct inode *inode, u64 size) -{ - struct logfs_inode *li = logfs_inode(inode); - struct write_control wc; - struct page *page; - int e; - int err; - - alloc_inode_block(inode); - - for (e = I0_BLOCKS - 1; e >= 0; e--) { - if (size > (e+1) * LOGFS_BLOCKSIZE) - break; - - wc.ofs = li->li_data[e]; - if (!wc.ofs) - continue; - - page = logfs_get_write_page(inode, e, 0); - if (!page) - return -ENOMEM; - err = logfs_segment_read(inode, page, wc.ofs, e, 0); - if (err) { - logfs_put_write_page(page); - return err; - } - err = logfs_truncate_i0(inode, page, &wc, size); - logfs_put_write_page(page); - if (err) - return err; - - li->li_data[e] = wc.ofs; - } - return 0; -} - -/* FIXME: these need to become per-sb once we support different blocksizes */ -static u64 __logfs_step[] = { - 1, - I1_BLOCKS, - I2_BLOCKS, - I3_BLOCKS, -}; - -static u64 __logfs_start_index[] = { - I0_BLOCKS, - I1_BLOCKS, - I2_BLOCKS, - I3_BLOCKS -}; - -static inline u64 logfs_step(level_t level) -{ - return __logfs_step[(__force u8)level]; -} - -static inline u64 logfs_factor(u8 level) -{ - return __logfs_step[level] * LOGFS_BLOCKSIZE; -} - -static inline u64 logfs_start_index(level_t level) -{ - return __logfs_start_index[(__force u8)level]; -} - -static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level) -{ - logfs_unpack_index(index, bix, level); - if (*bix <= logfs_start_index(SUBLEVEL(*level))) - *bix = 0; -} - -static int __logfs_truncate_rec(struct inode *inode, struct page *ipage, - struct write_control *this_wc, u64 size) -{ - int truncate_happened = 0; - int e, err = 0; - u64 bix, child_bix, next_bix; - level_t level; - struct page *page; - struct write_control child_wc = { /* FIXME: flags */ }; - - logfs_unpack_raw_index(ipage->index, &bix, &level); - err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level); - if (err) - return err; - - for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) { - child_bix = bix + e * logfs_step(SUBLEVEL(level)); - next_bix = child_bix + logfs_step(SUBLEVEL(level)); - if (size > next_bix * LOGFS_BLOCKSIZE) - break; - - child_wc.ofs = pure_ofs(block_get_pointer(ipage, e)); - if (!child_wc.ofs) - continue; - - page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level)); - if (!page) - return -ENOMEM; - - if ((__force u8)level > 1) - err = __logfs_truncate_rec(inode, page, &child_wc, size); - else - err = logfs_truncate_i0(inode, page, &child_wc, size); - logfs_put_write_page(page); - if (err) - return err; - - truncate_happened = 1; - alloc_indirect_block(inode, ipage, 0); - block_set_pointer(ipage, e, child_wc.ofs); 
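	/*
	 * The slot loop above runs from the highest index downward and
	 * stops at the first child that lies entirely below the new size;
	 * a child straddling the boundary recurses, or is trimmed via
	 * logfs_truncate_i0() at level 1, and children wholly above the
	 * new size are released.
	 */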
- } - - if (!truncate_happened) { - printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size); - return 0; - } - - this_wc->flags = WF_DELETE; - if (logfs_block(ipage)->partial) - this_wc->flags |= WF_WRITE; - - return logfs_write_i0(inode, ipage, this_wc); -} - -static int logfs_truncate_rec(struct inode *inode, u64 size) -{ - struct logfs_inode *li = logfs_inode(inode); - struct write_control wc = { - .ofs = li->li_data[INDIRECT_INDEX], - }; - struct page *page; - int err; - - alloc_inode_block(inode); - - if (!wc.ofs) - return 0; - - page = logfs_get_write_page(inode, 0, LEVEL(li->li_height)); - if (!page) - return -ENOMEM; - - err = __logfs_truncate_rec(inode, page, &wc, size); - logfs_put_write_page(page); - if (err) - return err; - - if (li->li_data[INDIRECT_INDEX] != wc.ofs) - li->li_data[INDIRECT_INDEX] = wc.ofs; - return 0; -} - -static int __logfs_truncate(struct inode *inode, u64 size) -{ - int ret; - - if (size >= logfs_factor(logfs_inode(inode)->li_height)) - return 0; - - ret = logfs_truncate_rec(inode, size); - if (ret) - return ret; - - return logfs_truncate_direct(inode, size); -} - -/* - * Truncate, by changing the segment file, can consume a fair amount - * of resources. So back off from time to time and do some GC. - * 8 or 2048 blocks should be well within safety limits even if - * every single block resided in a different segment. - */ -#define TRUNCATE_STEP (8 * 1024 * 1024) -int logfs_truncate(struct inode *inode, u64 target) -{ - struct super_block *sb = inode->i_sb; - u64 size = i_size_read(inode); - int err = 0; - - size = ALIGN(size, TRUNCATE_STEP); - while (size > target) { - if (size > TRUNCATE_STEP) - size -= TRUNCATE_STEP; - else - size = 0; - if (size < target) - size = target; - - logfs_get_wblocks(sb, NULL, 1); - err = __logfs_truncate(inode, size); - if (!err) - err = __logfs_write_inode(inode, NULL, 0); - logfs_put_wblocks(sb, NULL, 1); - } - - if (!err) { - err = inode_newsize_ok(inode, target); - if (err) - goto out; - - truncate_setsize(inode, target); - } - - out: - /* I don't trust error recovery yet. 
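(The stepping means that, for example, a 20 MiB file shrinks to zero via the intermediate sizes 16 MiB and 8 MiB: three passes, each freeing at most TRUNCATE_STEP worth of blocks, with the write lock dropped and GC given a chance to run in between.)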
*/ - WARN_ON(err); - return err; -} - -static void move_page_to_inode(struct inode *inode, struct page *page) -{ - struct logfs_inode *li = logfs_inode(inode); - struct logfs_block *block = logfs_block(page); - - if (!block) - return; - - log_blockmove("move_page_to_inode(%llx, %llx, %x)\n", - block->ino, block->bix, block->level); - BUG_ON(li->li_block); - block->ops = &inode_block_ops; - block->inode = inode; - li->li_block = block; - - block->page = NULL; - if (PagePrivate(page)) { - ClearPagePrivate(page); - put_page(page); - set_page_private(page, 0); - } -} - -static void move_inode_to_page(struct page *page, struct inode *inode) -{ - struct logfs_inode *li = logfs_inode(inode); - struct logfs_block *block = li->li_block; - - if (!block) - return; - - log_blockmove("move_inode_to_page(%llx, %llx, %x)\n", - block->ino, block->bix, block->level); - BUG_ON(PagePrivate(page)); - block->ops = &indirect_block_ops; - block->page = page; - - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - set_page_private(page, (unsigned long) block); - } - - block->inode = NULL; - li->li_block = NULL; -} - -int logfs_read_inode(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct logfs_super *super = logfs_super(sb); - struct inode *master_inode = super->s_master_inode; - struct page *page; - struct logfs_disk_inode *di; - u64 ino = inode->i_ino; - - if (ino << sb->s_blocksize_bits > i_size_read(master_inode)) - return -ENODATA; - if (!logfs_exist_block(master_inode, ino)) - return -ENODATA; - - page = read_cache_page(master_inode->i_mapping, ino, - (filler_t *)logfs_readpage, NULL); - if (IS_ERR(page)) - return PTR_ERR(page); - - di = kmap_atomic(page); - logfs_disk_to_inode(di, inode); - kunmap_atomic(di); - move_page_to_inode(inode, page); - put_page(page); - return 0; -} - -/* Caller must logfs_put_write_page(page); */ -static struct page *inode_to_page(struct inode *inode) -{ - struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode; - struct logfs_disk_inode *di; - struct page *page; - - BUG_ON(inode->i_ino == LOGFS_INO_MASTER); - - page = logfs_get_write_page(master_inode, inode->i_ino, 0); - if (!page) - return NULL; - - di = kmap_atomic(page); - logfs_inode_to_disk(inode, di); - kunmap_atomic(di); - move_inode_to_page(page, inode); - return page; -} - -static int do_write_inode(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct inode *master_inode = logfs_super(sb)->s_master_inode; - loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits; - struct page *page; - int err; - - BUG_ON(inode->i_ino == LOGFS_INO_MASTER); - /* FIXME: lock inode */ - - if (i_size_read(master_inode) < size) - i_size_write(master_inode, size); - - /* TODO: Tell vfs this inode is clean now */ - - page = inode_to_page(inode); - if (!page) - return -ENOMEM; - - /* FIXME: transaction is part of logfs_block now. Is that enough? 
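(Recall the ifile layout: inode #N occupies block N of the master inode, which is why inode_to_page() above simply stamps page N of the master inode's mapping and the write below goes through the ordinary data path.)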
*/ - err = logfs_write_buf(master_inode, page, 0); - if (err) - move_page_to_inode(inode, page); - - logfs_put_write_page(page); - return err; -} - -static void logfs_mod_segment_entry(struct super_block *sb, u32 segno, - int write, - void (*change_se)(struct logfs_segment_entry *, long), - long arg) -{ - struct logfs_super *super = logfs_super(sb); - struct inode *inode; - struct page *page; - struct logfs_segment_entry *se; - pgoff_t page_no; - int child_no; - - page_no = segno >> (sb->s_blocksize_bits - 3); - child_no = segno & ((sb->s_blocksize >> 3) - 1); - - inode = super->s_segfile_inode; - page = logfs_get_write_page(inode, page_no, 0); - BUG_ON(!page); /* FIXME: We need some reserve page for this case */ - if (!PageUptodate(page)) - logfs_read_block(inode, page, WRITE); - - if (write) - alloc_indirect_block(inode, page, 0); - se = kmap_atomic(page); - change_se(se + child_no, arg); - if (write) { - logfs_set_alias(sb, logfs_block(page), child_no); - BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize); - } - kunmap_atomic(se); - - logfs_put_write_page(page); -} - -static void __get_segment_entry(struct logfs_segment_entry *se, long _target) -{ - struct logfs_segment_entry *target = (void *)_target; - - *target = *se; -} - -void logfs_get_segment_entry(struct super_block *sb, u32 segno, - struct logfs_segment_entry *se) -{ - logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se); -} - -static void __set_segment_used(struct logfs_segment_entry *se, long increment) -{ - u32 valid; - - valid = be32_to_cpu(se->valid); - valid += increment; - se->valid = cpu_to_be32(valid); -} - -void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment) -{ - struct logfs_super *super = logfs_super(sb); - u32 segno = ofs >> super->s_segshift; - - if (!increment) - return; - - logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment); -} - -static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level) -{ - se->ec_level = cpu_to_be32(ec_level); -} - -void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec, - gc_level_t gc_level) -{ - u32 ec_level = ec << 4 | (__force u8)gc_level; - - logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level); -} - -static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore) -{ - se->valid = cpu_to_be32(RESERVED); -} - -void logfs_set_segment_reserved(struct super_block *sb, u32 segno) -{ - logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0); -} - -static void __set_segment_unreserved(struct logfs_segment_entry *se, - long ec_level) -{ - se->valid = 0; - se->ec_level = cpu_to_be32(ec_level); -} - -void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec) -{ - u32 ec_level = ec << 4; - - logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved, - ec_level); -} - -int __logfs_write_inode(struct inode *inode, struct page *page, long flags) -{ - struct super_block *sb = inode->i_sb; - int ret; - - logfs_get_wblocks(sb, page, flags & WF_LOCK); - ret = do_write_inode(inode); - logfs_put_wblocks(sb, page, flags & WF_LOCK); - return ret; -} - -static int do_delete_inode(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct inode *master_inode = logfs_super(sb)->s_master_inode; - struct page *page; - int ret; - - page = logfs_get_write_page(master_inode, inode->i_ino, 0); - if (!page) - return -ENOMEM; - - move_inode_to_page(page, inode); - - logfs_get_wblocks(sb, page, 1); - ret = __logfs_delete(master_inode, 
page); - logfs_put_wblocks(sb, page, 1); - - logfs_put_write_page(page); - return ret; -} - -/* - * ZOMBIE inodes have already been deleted before and should remain dead, - * if it weren't for valid checking. No need to kill them again here. - */ -void logfs_evict_inode(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct logfs_inode *li = logfs_inode(inode); - struct logfs_block *block = li->li_block; - struct page *page; - - if (!inode->i_nlink) { - if (!(li->li_flags & LOGFS_IF_ZOMBIE)) { - li->li_flags |= LOGFS_IF_ZOMBIE; - if (i_size_read(inode) > 0) - logfs_truncate(inode, 0); - do_delete_inode(inode); - } - } - truncate_inode_pages_final(&inode->i_data); - clear_inode(inode); - - /* Cheaper version of write_inode. All changes are concealed in - * aliases, which are moved back. No write to the medium happens. - */ - /* Only deleted files may be dirty at this point */ - BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink); - if (!block) - return; - if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) { - block->ops->free_block(inode->i_sb, block); - return; - } - - page = inode_to_page(inode); - BUG_ON(!page); /* FIXME: Use emergency page */ - logfs_put_write_page(page); -} - -void btree_write_block(struct logfs_block *block) -{ - struct inode *inode; - struct page *page; - int err, cookie; - - inode = logfs_safe_iget(block->sb, block->ino, &cookie); - page = logfs_get_write_page(inode, block->bix, block->level); - - err = logfs_readpage_nolock(page); - BUG_ON(err); - BUG_ON(!PagePrivate(page)); - BUG_ON(logfs_block(page) != block); - err = __logfs_write_buf(inode, page, 0); - BUG_ON(err); - BUG_ON(PagePrivate(page) || page->private); - - logfs_put_write_page(page); - logfs_safe_iput(inode, cookie); -} - -/** - * logfs_inode_write - write inode or dentry objects - * - * @inode: parent inode (ifile or directory) - * @buf: object to write (inode or dentry) - * @count: object size - * @bix: block index - * @flags: write flags - * @shadow_tree: shadow below this inode - * - * FIXME: All callers of this put a 200-300 byte variable on the stack, - * only to call here and do a memcpy from that stack variable. A good - * example of wasted performance and stack space.
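A typical call, roughly how the directory code would stamp a dentry block once the dentry has been filled in (a sketch; the flags and the NULL shadow tree are assumptions, the real callers are not part of this file):

	struct logfs_disk_dentry dd;

	err = logfs_inode_write(dir, &dd, sizeof(dd), bix, WF_LOCK, NULL);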
- */ -int logfs_inode_write(struct inode *inode, const void *buf, size_t count, - loff_t bix, long flags, struct shadow_tree *shadow_tree) -{ - loff_t pos = bix << inode->i_sb->s_blocksize_bits; - int err; - struct page *page; - void *pagebuf; - - BUG_ON(pos & (LOGFS_BLOCKSIZE-1)); - BUG_ON(count > LOGFS_BLOCKSIZE); - page = logfs_get_write_page(inode, bix, 0); - if (!page) - return -ENOMEM; - - pagebuf = kmap_atomic(page); - memcpy(pagebuf, buf, count); - flush_dcache_page(page); - kunmap_atomic(pagebuf); - - if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE) - i_size_write(inode, pos + LOGFS_BLOCKSIZE); - - err = logfs_write_buf(inode, page, flags); - logfs_put_write_page(page); - return err; -} - -int logfs_open_segfile(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct inode *inode; - - inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE); - if (IS_ERR(inode)) - return PTR_ERR(inode); - super->s_segfile_inode = inode; - return 0; -} - -int logfs_init_rw(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int min_fill = 3 * super->s_no_blocks; - - INIT_LIST_HEAD(&super->s_object_alias); - INIT_LIST_HEAD(&super->s_writeback_list); - mutex_init(&super->s_write_mutex); - super->s_block_pool = mempool_create_kmalloc_pool(min_fill, - sizeof(struct logfs_block)); - super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill, - sizeof(struct logfs_shadow)); - return 0; -} - -void logfs_cleanup_rw(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - - logfs_mempool_destroy(super->s_block_pool); - logfs_mempool_destroy(super->s_shadow_pool); -} diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c deleted file mode 100644 index 1efd6055f4b0..000000000000 --- a/fs/logfs/segment.c +++ /dev/null @@ -1,961 +0,0 @@ -/* - * fs/logfs/segment.c - Handling the Object Store - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - * - * Object store or ostore makes up the complete device with exception of - * the superblock and journal areas. Apart from its own metadata it stores - * three kinds of objects: inodes, dentries and blocks, both data and indirect. 
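Every object reaches the medium as a fixed-size header (ino, bix, length, type, compression flag and two crcs) followed by its payload; see __logfs_segment_write() below. With 4 KiB blocks, an uncompressed data block therefore consumes LOGFS_OBJECT_HEADERSIZE + 4096 bytes of segment space.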
- */ -#include "logfs.h" -#include <linux/slab.h> - -static int logfs_mark_segment_bad(struct super_block *sb, u32 segno) -{ - struct logfs_super *super = logfs_super(sb); - struct btree_head32 *head = &super->s_reserved_segments; - int err; - - err = btree_insert32(head, segno, (void *)1, GFP_NOFS); - if (err) - return err; - logfs_super(sb)->s_bad_segments++; - /* FIXME: write to journal */ - return 0; -} - -int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase) -{ - struct logfs_super *super = logfs_super(sb); - - super->s_gec++; - - return super->s_devops->erase(sb, (u64)segno << super->s_segshift, - super->s_segsize, ensure_erase); -} - -static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes) -{ - s32 ofs; - - logfs_open_area(area, bytes); - - ofs = area->a_used_bytes; - area->a_used_bytes += bytes; - BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize); - - return dev_ofs(area->a_sb, area->a_segno, ofs); -} - -static struct page *get_mapping_page(struct super_block *sb, pgoff_t index, - int use_filler) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - filler_t *filler = super->s_devops->readpage; - struct page *page; - - BUG_ON(mapping_gfp_constraint(mapping, __GFP_FS)); - if (use_filler) - page = read_cache_page(mapping, index, filler, sb); - else { - page = find_or_create_page(mapping, index, GFP_NOFS); - if (page) - unlock_page(page); - } - return page; -} - -int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len, - int use_filler) -{ - pgoff_t index = ofs >> PAGE_SHIFT; - struct page *page; - long offset = ofs & (PAGE_SIZE-1); - long copylen; - - /* Only logfs_wbuf_recover may use len==0 */ - BUG_ON(!len && !use_filler); - do { - copylen = min((ulong)len, PAGE_SIZE - offset); - - page = get_mapping_page(area->a_sb, index, use_filler); - if (IS_ERR(page)) - return PTR_ERR(page); - BUG_ON(!page); /* FIXME: reserve a pool */ - SetPageUptodate(page); - memcpy(page_address(page) + offset, buf, copylen); - - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - } - put_page(page); - - buf += copylen; - len -= copylen; - offset = 0; - index++; - } while (len); - return 0; -} - -static void pad_partial_page(struct logfs_area *area) -{ - struct super_block *sb = area->a_sb; - struct page *page; - u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); - pgoff_t index = ofs >> PAGE_SHIFT; - long offset = ofs & (PAGE_SIZE-1); - u32 len = PAGE_SIZE - offset; - - if (len % PAGE_SIZE) { - page = get_mapping_page(sb, index, 0); - BUG_ON(!page); /* FIXME: reserve a pool */ - memset(page_address(page) + offset, 0xff, len); - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - } - put_page(page); - } -} - -static void pad_full_pages(struct logfs_area *area) -{ - struct super_block *sb = area->a_sb; - struct logfs_super *super = logfs_super(sb); - u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes); - u32 len = super->s_segsize - area->a_used_bytes; - pgoff_t index = PAGE_ALIGN(ofs) >> PAGE_SHIFT; - pgoff_t no_indizes = len >> PAGE_SHIFT; - struct page *page; - - while (no_indizes) { - page = get_mapping_page(sb, index, 0); - BUG_ON(!page); /* FIXME: reserve a pool */ - SetPageUptodate(page); - memset(page_address(page), 0xff, PAGE_SIZE); - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - } - put_page(page); - index++; - no_indizes--; - } -} - -/* - * bdev_writeseg will write full pages. 
Memset the tail to prevent data leaks. - * Also make sure we allocate (and memset) all pages for final writeout. - */ -static void pad_wbuf(struct logfs_area *area, int final) -{ - pad_partial_page(area); - if (final) - pad_full_pages(area); -} - -/* - * We have to be careful with the alias tree. Since lookup is done by bix, - * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with - * indirect blocks. So always use it through accessor functions. - */ -static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix, - level_t level) -{ - struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; - pgoff_t index = logfs_pack_index(bix, level); - - return btree_lookup128(head, ino, index); -} - -static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix, - level_t level, void *val) -{ - struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree; - pgoff_t index = logfs_pack_index(bix, level); - - return btree_insert128(head, ino, index, val, GFP_NOFS); -} - -static int btree_write_alias(struct super_block *sb, struct logfs_block *block, - write_alias_t *write_one_alias) -{ - struct object_alias_item *item; - int err; - - list_for_each_entry(item, &block->item_list, list) { - err = write_alias_journal(sb, block->ino, block->bix, - block->level, item->child_no, item->val); - if (err) - return err; - } - return 0; -} - -static const struct logfs_block_ops btree_block_ops = { - .write_block = btree_write_block, - .free_block = __free_block, - .write_alias = btree_write_alias, -}; - -int logfs_load_object_aliases(struct super_block *sb, - struct logfs_obj_alias *oa, int count) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_block *block; - struct object_alias_item *item; - u64 ino, bix; - level_t level; - int i, err; - - super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; - count /= sizeof(*oa); - for (i = 0; i < count; i++) { - item = mempool_alloc(super->s_alias_pool, GFP_NOFS); - if (!item) - return -ENOMEM; - memset(item, 0, sizeof(*item)); - - super->s_no_object_aliases++; - item->val = oa[i].val; - item->child_no = be16_to_cpu(oa[i].child_no); - - ino = be64_to_cpu(oa[i].ino); - bix = be64_to_cpu(oa[i].bix); - level = LEVEL(oa[i].level); - - log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n", - ino, bix, level, item->child_no, - be64_to_cpu(item->val)); - block = alias_tree_lookup(sb, ino, bix, level); - if (!block) { - block = __alloc_block(sb, ino, bix, level); - block->ops = &btree_block_ops; - err = alias_tree_insert(sb, ino, bix, level, block); - BUG_ON(err); /* mempool empty */ - } - if (test_and_set_bit(item->child_no, block->alias_map)) { - printk(KERN_ERR"LogFS: Alias collision detected\n"); - return -EIO; - } - list_move_tail(&block->alias_list, &super->s_object_alias); - list_add(&item->list, &block->item_list); - } - return 0; -} - -static void kill_alias(void *_block, unsigned long ignore0, - u64 ignore1, u64 ignore2, size_t ignore3) -{ - struct logfs_block *block = _block; - struct super_block *sb = block->sb; - struct logfs_super *super = logfs_super(sb); - struct object_alias_item *item; - - while (!list_empty(&block->item_list)) { - item = list_entry(block->item_list.next, typeof(*item), list); - list_del(&item->list); - mempool_free(item, super->s_alias_pool); - } - block->ops->free_block(sb, block); -} - -static int obj_type(struct inode *inode, level_t level) -{ - if (level == 0) { - if (S_ISDIR(inode->i_mode)) - return OBJ_DENTRY; - if (inode->i_ino == LOGFS_INO_MASTER) - return OBJ_INODE; 
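	/*
	 * Only level-0 blocks get a distinct type: a directory's data
	 * blocks hold dentries and the master inode's data blocks hold
	 * inodes.  Indirect blocks of any level are written as OBJ_BLOCK.
	 */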
- } - return OBJ_BLOCK; -} - -static int obj_len(struct super_block *sb, int obj_type) -{ - switch (obj_type) { - case OBJ_DENTRY: - return sizeof(struct logfs_disk_dentry); - case OBJ_INODE: - return sizeof(struct logfs_disk_inode); - case OBJ_BLOCK: - return sb->s_blocksize; - default: - BUG(); - } -} - -static int __logfs_segment_write(struct inode *inode, void *buf, - struct logfs_shadow *shadow, int type, int len, int compr) -{ - struct logfs_area *area; - struct super_block *sb = inode->i_sb; - s64 ofs; - struct logfs_object_header h; - int acc_len; - - if (shadow->gc_level == 0) - acc_len = len; - else - acc_len = obj_len(sb, type); - - area = get_area(sb, shadow->gc_level); - ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE); - LOGFS_BUG_ON(ofs <= 0, sb); - /* - * Order is important. logfs_get_free_bytes(), by modifying the - * segment file, may modify the content of the very page we're about - * to write now. Which is fine, as long as the calculated crc and - * written data still match. So do the modifications _before_ - * calculating the crc. - */ - - h.len = cpu_to_be16(len); - h.type = type; - h.compr = compr; - h.ino = cpu_to_be64(inode->i_ino); - h.bix = cpu_to_be64(shadow->bix); - h.crc = logfs_crc32(&h, sizeof(h) - 4, 4); - h.data_crc = logfs_crc32(buf, len, 0); - - logfs_buf_write(area, ofs, &h, sizeof(h)); - logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len); - - shadow->new_ofs = ofs; - shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE; - - return 0; -} - -static s64 logfs_segment_write_compress(struct inode *inode, void *buf, - struct logfs_shadow *shadow, int type, int len) -{ - struct super_block *sb = inode->i_sb; - void *compressor_buf = logfs_super(sb)->s_compressed_je; - ssize_t compr_len; - int ret; - - mutex_lock(&logfs_super(sb)->s_journal_mutex); - compr_len = logfs_compress(buf, compressor_buf, len, len); - - if (compr_len >= 0) { - ret = __logfs_segment_write(inode, compressor_buf, shadow, - type, compr_len, COMPR_ZLIB); - } else { - ret = __logfs_segment_write(inode, buf, shadow, type, len, - COMPR_NONE); - } - mutex_unlock(&logfs_super(sb)->s_journal_mutex); - return ret; -} - -/** - * logfs_segment_write - write data block to object store - * @inode: inode containing data - * - * Returns an errno or zero. - */ -int logfs_segment_write(struct inode *inode, struct page *page, - struct logfs_shadow *shadow) -{ - struct super_block *sb = inode->i_sb; - struct logfs_super *super = logfs_super(sb); - int do_compress, type, len; - int ret; - void *buf; - - super->s_flags |= LOGFS_SB_FLAG_DIRTY; - BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); - do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED; - if (shadow->gc_level != 0) { - /* temporarily disable compression for indirect blocks */ - do_compress = 0; - } - - type = obj_type(inode, shrink_level(shadow->gc_level)); - len = obj_len(sb, type); - buf = kmap(page); - if (do_compress) - ret = logfs_segment_write_compress(inode, buf, shadow, type, - len); - else - ret = __logfs_segment_write(inode, buf, shadow, type, len, - COMPR_NONE); - kunmap(page); - - log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n", - shadow->ino, shadow->bix, shadow->gc_level, - shadow->old_ofs, shadow->new_ofs, - shadow->old_len, shadow->new_len); - /* this BUG_ON did catch a locking bug. 
useful */ - BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1))); - return ret; -} - -int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf) -{ - pgoff_t index = ofs >> PAGE_SHIFT; - struct page *page; - long offset = ofs & (PAGE_SIZE-1); - long copylen; - - while (len) { - copylen = min((ulong)len, PAGE_SIZE - offset); - - page = get_mapping_page(sb, index, 1); - if (IS_ERR(page)) - return PTR_ERR(page); - memcpy(buf, page_address(page) + offset, copylen); - put_page(page); - - buf += copylen; - len -= copylen; - offset = 0; - index++; - } - return 0; -} - -/* - * The "position" of indirect blocks is ambiguous. It can be the position - * of any data block somewhere behind this indirect block. So we need to - * normalize the positions through logfs_block_mask() before comparing. - */ -static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level) -{ - return (pos1 & logfs_block_mask(sb, level)) != - (pos2 & logfs_block_mask(sb, level)); -} - -#if 0 -static int read_seg_header(struct super_block *sb, u64 ofs, - struct logfs_segment_header *sh) -{ - __be32 crc; - int err; - - err = wbuf_read(sb, ofs, sizeof(*sh), sh); - if (err) - return err; - crc = logfs_crc32(sh, sizeof(*sh), 4); - if (crc != sh->crc) { - printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " - "got %x\n", ofs, be32_to_cpu(sh->crc), - be32_to_cpu(crc)); - return -EIO; - } - return 0; -} -#endif - -static int read_obj_header(struct super_block *sb, u64 ofs, - struct logfs_object_header *oh) -{ - __be32 crc; - int err; - - err = wbuf_read(sb, ofs, sizeof(*oh), oh); - if (err) - return err; - crc = logfs_crc32(oh, sizeof(*oh) - 4, 4); - if (crc != oh->crc) { - printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, " - "got %x\n", ofs, be32_to_cpu(oh->crc), - be32_to_cpu(crc)); - return -EIO; - } - return 0; -} - -static void move_btree_to_page(struct inode *inode, struct page *page, - __be64 *data) -{ - struct super_block *sb = inode->i_sb; - struct logfs_super *super = logfs_super(sb); - struct btree_head128 *head = &super->s_object_alias_tree; - struct logfs_block *block; - struct object_alias_item *item, *next; - - if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS)) - return; - - block = btree_remove128(head, inode->i_ino, page->index); - if (!block) - return; - - log_blockmove("move_btree_to_page(%llx, %llx, %x)\n", - block->ino, block->bix, block->level); - list_for_each_entry_safe(item, next, &block->item_list, list) { - data[item->child_no] = item->val; - list_del(&item->list); - mempool_free(item, super->s_alias_pool); - } - block->page = page; - - if (!PagePrivate(page)) { - SetPagePrivate(page); - get_page(page); - set_page_private(page, (unsigned long) block); - } - block->ops = &indirect_block_ops; - initialize_block_counters(page, block, data, 0); -} - -/* - * This silences a false, yet annoying gcc warning. I hate it when my editor - * jumps into bitops.h each time I recompile this file. - * TODO: Complain to gcc folks about this and upgrade compiler. 
- */ -static unsigned long fnb(const unsigned long *addr, - unsigned long size, unsigned long offset) -{ - return find_next_bit(addr, size, offset); -} - -void move_page_to_btree(struct page *page) -{ - struct logfs_block *block = logfs_block(page); - struct super_block *sb = block->sb; - struct logfs_super *super = logfs_super(sb); - struct object_alias_item *item; - unsigned long pos; - __be64 *child; - int err; - - if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) { - block->ops->free_block(sb, block); - return; - } - log_blockmove("move_page_to_btree(%llx, %llx, %x)\n", - block->ino, block->bix, block->level); - super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS; - - for (pos = 0; ; pos++) { - pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos); - if (pos >= LOGFS_BLOCK_FACTOR) - break; - - item = mempool_alloc(super->s_alias_pool, GFP_NOFS); - BUG_ON(!item); /* mempool empty */ - memset(item, 0, sizeof(*item)); - - child = kmap_atomic(page); - item->val = child[pos]; - kunmap_atomic(child); - item->child_no = pos; - list_add(&item->list, &block->item_list); - } - block->page = NULL; - - if (PagePrivate(page)) { - ClearPagePrivate(page); - put_page(page); - set_page_private(page, 0); - } - block->ops = &btree_block_ops; - err = alias_tree_insert(block->sb, block->ino, block->bix, block->level, - block); - BUG_ON(err); /* mempool empty */ - ClearPageUptodate(page); -} - -static int __logfs_segment_read(struct inode *inode, void *buf, - u64 ofs, u64 bix, level_t level) -{ - struct super_block *sb = inode->i_sb; - void *compressor_buf = logfs_super(sb)->s_compressed_je; - struct logfs_object_header oh; - __be32 crc; - u16 len; - int err, block_len; - - block_len = obj_len(sb, obj_type(inode, level)); - err = read_obj_header(sb, ofs, &oh); - if (err) - goto out_err; - - err = -EIO; - if (be64_to_cpu(oh.ino) != inode->i_ino - || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) { - printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: " - "expected (%lx, %llx), got (%llx, %llx)\n", - ofs, inode->i_ino, bix, - be64_to_cpu(oh.ino), be64_to_cpu(oh.bix)); - goto out_err; - } - - len = be16_to_cpu(oh.len); - - switch (oh.compr) { - case COMPR_NONE: - err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf); - if (err) - goto out_err; - crc = logfs_crc32(buf, len, 0); - if (crc != oh.data_crc) { - printk(KERN_ERR"LOGFS: uncompressed data crc error at " - "%llx: expected %x, got %x\n", ofs, - be32_to_cpu(oh.data_crc), - be32_to_cpu(crc)); - goto out_err; - } - break; - case COMPR_ZLIB: - mutex_lock(&logfs_super(sb)->s_journal_mutex); - err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, - compressor_buf); - if (err) { - mutex_unlock(&logfs_super(sb)->s_journal_mutex); - goto out_err; - } - crc = logfs_crc32(compressor_buf, len, 0); - if (crc != oh.data_crc) { - printk(KERN_ERR"LOGFS: compressed data crc error at " - "%llx: expected %x, got %x\n", ofs, - be32_to_cpu(oh.data_crc), - be32_to_cpu(crc)); - mutex_unlock(&logfs_super(sb)->s_journal_mutex); - goto out_err; - } - err = logfs_uncompress(compressor_buf, buf, len, block_len); - mutex_unlock(&logfs_super(sb)->s_journal_mutex); - if (err) { - printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs); - goto out_err; - } - break; - default: - LOGFS_BUG(sb); - err = -EIO; - goto out_err; - } - return 0; - -out_err: - logfs_set_ro(sb); - printk(KERN_ERR"LOGFS: device is read-only now\n"); - LOGFS_BUG(sb); - return err; -} - -/** - * logfs_segment_read - read data block from object store - * @inode: inode containing data - * @buf: data buffer - * @ofs: 
physical data offset - * @bix: block index - * @level: block level - * - * Returns 0 on success or a negative errno. - */ -int logfs_segment_read(struct inode *inode, struct page *page, - u64 ofs, u64 bix, level_t level) -{ - int err; - void *buf; - - if (PageUptodate(page)) - return 0; - - ofs &= ~LOGFS_FULLY_POPULATED; - - buf = kmap(page); - err = __logfs_segment_read(inode, buf, ofs, bix, level); - if (!err) { - move_btree_to_page(inode, page, buf); - SetPageUptodate(page); - } - kunmap(page); - log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n", - inode->i_ino, bix, level, ofs, err); - return err; -} - -int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow) -{ - struct super_block *sb = inode->i_sb; - struct logfs_super *super = logfs_super(sb); - struct logfs_object_header h; - u16 len; - int err; - - super->s_flags |= LOGFS_SB_FLAG_DIRTY; - BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN); - BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED); - if (!shadow->old_ofs) - return 0; - - log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n", - shadow->ino, shadow->bix, shadow->gc_level, - shadow->old_ofs, shadow->new_ofs, - shadow->old_len, shadow->new_len); - err = read_obj_header(sb, shadow->old_ofs, &h); - LOGFS_BUG_ON(err, sb); - LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb); - LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix), - shrink_level(shadow->gc_level)), sb); - - if (shadow->gc_level == 0) - len = be16_to_cpu(h.len); - else - len = obj_len(sb, h.type); - shadow->old_len = len + sizeof(h); - return 0; -} - -void freeseg(struct super_block *sb, u32 segno) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping = super->s_mapping_inode->i_mapping; - struct page *page; - u64 ofs, start, end; - - start = dev_ofs(sb, segno, 0); - end = dev_ofs(sb, segno + 1, 0); - for (ofs = start; ofs < end; ofs += PAGE_SIZE) { - page = find_get_page(mapping, ofs >> PAGE_SHIFT); - if (!page) - continue; - if (PagePrivate(page)) { - ClearPagePrivate(page); - put_page(page); - } - put_page(page); - } -} - -int logfs_open_area(struct logfs_area *area, size_t bytes) -{ - struct super_block *sb = area->a_sb; - struct logfs_super *super = logfs_super(sb); - int err, closed = 0; - - if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize) - return 0; - - if (area->a_is_open) { - u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); - u32 len = super->s_segsize - area->a_written_bytes; - - log_gc("logfs_close_area(%x)\n", area->a_segno); - pad_wbuf(area, 1); - super->s_devops->writeseg(area->a_sb, ofs, len); - freeseg(sb, area->a_segno); - closed = 1; - } - - area->a_used_bytes = 0; - area->a_written_bytes = 0; -again: - area->a_ops->get_free_segment(area); - area->a_ops->get_erase_count(area); - - log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level); - err = area->a_ops->erase_segment(area); - if (err) { - printk(KERN_WARNING "LogFS: Error erasing segment %x\n", - area->a_segno); - logfs_mark_segment_bad(sb, area->a_segno); - goto again; - } - area->a_is_open = 1; - return closed; -} - -void logfs_sync_area(struct logfs_area *area) -{ - struct super_block *sb = area->a_sb; - struct logfs_super *super = logfs_super(sb); - u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes); - u32 len = (area->a_used_bytes - area->a_written_bytes); - - if (super->s_writesize) - len &= ~(super->s_writesize - 1); - if (len == 0) - return; - pad_wbuf(area, 0); - super->s_devops->writeseg(sb, ofs, len); 
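logfs_sync_area() here trims the flush length with len &= ~(super->s_writesize - 1), rounding it down to a multiple of the device write size; the trick is only valid when the write size is a power of two. A standalone check of that identity:

#include <assert.h>
#include <stdint.h>

/* Round len down to a multiple of writesize; writesize must be 2^n. */
static uint32_t round_down_pow2(uint32_t len, uint32_t writesize)
{
	return len & ~(writesize - 1);
}

int main(void)
{
	assert(round_down_pow2(5000, 4096) == 4096);	/* partial tail held back */
	assert(round_down_pow2(4096, 4096) == 4096);
	assert(round_down_pow2(123, 4096) == 0);	/* nothing flushable yet */
	return 0;
}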
- area->a_written_bytes += len; -} - -void logfs_sync_segments(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int i; - - for_each_area(i) - logfs_sync_area(super->s_area[i]); -} - -/* - * Pick a free segment to be used for this area. Effectively takes a - * candidate from the free list (not really a candidate anymore). - */ -static void ostore_get_free_segment(struct logfs_area *area) -{ - struct super_block *sb = area->a_sb; - struct logfs_super *super = logfs_super(sb); - - if (super->s_free_list.count == 0) { - printk(KERN_ERR"LOGFS: ran out of free segments\n"); - LOGFS_BUG(sb); - } - - area->a_segno = get_best_cand(sb, &super->s_free_list, NULL); -} - -static void ostore_get_erase_count(struct logfs_area *area) -{ - struct logfs_segment_entry se; - u32 ec_level; - - logfs_get_segment_entry(area->a_sb, area->a_segno, &se); - BUG_ON(se.ec_level == cpu_to_be32(BADSEG) || - se.valid == cpu_to_be32(RESERVED)); - - ec_level = be32_to_cpu(se.ec_level); - area->a_erase_count = (ec_level >> 4) + 1; -} - -static int ostore_erase_segment(struct logfs_area *area) -{ - struct super_block *sb = area->a_sb; - struct logfs_segment_header sh; - u64 ofs; - int err; - - err = logfs_erase_segment(sb, area->a_segno, 0); - if (err) - return err; - - sh.pad = 0; - sh.type = SEG_OSTORE; - sh.level = (__force u8)area->a_level; - sh.segno = cpu_to_be32(area->a_segno); - sh.ec = cpu_to_be32(area->a_erase_count); - sh.gec = cpu_to_be64(logfs_super(sb)->s_gec); - sh.crc = logfs_crc32(&sh, sizeof(sh), 4); - - logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, - area->a_level); - - ofs = dev_ofs(sb, area->a_segno, 0); - area->a_used_bytes = sizeof(sh); - logfs_buf_write(area, ofs, &sh, sizeof(sh)); - return 0; -} - -static const struct logfs_area_ops ostore_area_ops = { - .get_free_segment = ostore_get_free_segment, - .get_erase_count = ostore_get_erase_count, - .erase_segment = ostore_erase_segment, -}; - -static void free_area(struct logfs_area *area) -{ - if (area) - freeseg(area->a_sb, area->a_segno); - kfree(area); -} - -void free_areas(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int i; - - for_each_area(i) - free_area(super->s_area[i]); - free_area(super->s_journal_area); -} - -static struct logfs_area *alloc_area(struct super_block *sb) -{ - struct logfs_area *area; - - area = kzalloc(sizeof(*area), GFP_KERNEL); - if (!area) - return NULL; - - area->a_sb = sb; - return area; -} - -static void map_invalidatepage(struct page *page, unsigned int o, - unsigned int l) -{ - return; -} - -static int map_releasepage(struct page *page, gfp_t g) -{ - /* Don't release these pages */ - return 0; -} - -static const struct address_space_operations mapping_aops = { - .invalidatepage = map_invalidatepage, - .releasepage = map_releasepage, - .set_page_dirty = __set_page_dirty_nobuffers, -}; - -int logfs_init_mapping(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct address_space *mapping; - struct inode *inode; - - inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING); - if (IS_ERR(inode)) - return PTR_ERR(inode); - super->s_mapping_inode = inode; - mapping = inode->i_mapping; - mapping->a_ops = &mapping_aops; - /* Would it be possible to use __GFP_HIGHMEM as well? 
*/ - mapping_set_gfp_mask(mapping, GFP_NOFS); - return 0; -} - -int logfs_init_areas(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int i = -1; - - super->s_alias_pool = mempool_create_kmalloc_pool(600, - sizeof(struct object_alias_item)); - if (!super->s_alias_pool) - return -ENOMEM; - - super->s_journal_area = alloc_area(sb); - if (!super->s_journal_area) - goto err; - - for_each_area(i) { - super->s_area[i] = alloc_area(sb); - if (!super->s_area[i]) - goto err; - super->s_area[i]->a_level = GC_LEVEL(i); - super->s_area[i]->a_ops = &ostore_area_ops; - } - btree_init_mempool128(&super->s_object_alias_tree, - super->s_btree_pool); - return 0; - -err: - for (i--; i >= 0; i--) - free_area(super->s_area[i]); - free_area(super->s_journal_area); - logfs_mempool_destroy(super->s_alias_pool); - return -ENOMEM; -} - -void logfs_cleanup_areas(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - - btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias); -} diff --git a/fs/logfs/super.c b/fs/logfs/super.c deleted file mode 100644 index 5751082dba52..000000000000 --- a/fs/logfs/super.c +++ /dev/null @@ -1,653 +0,0 @@ -/* - * fs/logfs/super.c - * - * As should be obvious for Linux kernel code, license is GPLv2 - * - * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org> - * - * Generally contains mount/umount code and also serves as a dump area for - * any functions that don't fit elsewhere and neither justify a file of their - * own. - */ -#include "logfs.h" -#include <linux/bio.h> -#include <linux/slab.h> -#include <linux/blkdev.h> -#include <linux/module.h> -#include <linux/mtd/mtd.h> -#include <linux/statfs.h> -#include <linux/buffer_head.h> - -static DEFINE_MUTEX(emergency_mutex); -static struct page *emergency_page; - -struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index) -{ - filler_t *filler = (filler_t *)mapping->a_ops->readpage; - struct page *page; - int err; - - page = read_cache_page(mapping, index, filler, NULL); - if (page) - return page; - - /* No more pages available, switch to emergency page */ - printk(KERN_INFO"Logfs: Using emergency page\n"); - mutex_lock(&emergency_mutex); - err = filler(NULL, emergency_page); - if (err) { - mutex_unlock(&emergency_mutex); - printk(KERN_EMERG"Logfs: Error reading emergency page\n"); - return ERR_PTR(err); - } - return emergency_page; -} - -void emergency_read_end(struct page *page) -{ - if (page == emergency_page) - mutex_unlock(&emergency_mutex); - else - put_page(page); -} - -static void dump_segfile(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_segment_entry se; - u32 segno; - - for (segno = 0; segno < super->s_no_segs; segno++) { - logfs_get_segment_entry(sb, segno, &se); - printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level), - be32_to_cpu(se.valid)); - if (++segno < super->s_no_segs) { - logfs_get_segment_entry(sb, segno, &se); - printk(" %6x %8x", be32_to_cpu(se.ec_level), - be32_to_cpu(se.valid)); - } - if (++segno < super->s_no_segs) { - logfs_get_segment_entry(sb, segno, &se); - printk(" %6x %8x", be32_to_cpu(se.ec_level), - be32_to_cpu(se.valid)); - } - if (++segno < super->s_no_segs) { - logfs_get_segment_entry(sb, segno, &se); - printk(" %6x %8x", be32_to_cpu(se.ec_level), - be32_to_cpu(se.valid)); - } - printk("\n"); - } -} - -/* - * logfs_crash_dump - dump debug information to device - * - * The LogFS superblock only occupies part of a segment. 
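The emergency_read_begin()/emergency_read_end() pair above is a last-resort allocator: when the page cache cannot supply a page, all callers serialize on a single preallocated page until its current user hands it back. A userspace model of the same hand-out/release pairing (buffer size and names are illustrative):

#include <pthread.h>
#include <stdlib.h>

static pthread_mutex_t emergency_mutex = PTHREAD_MUTEX_INITIALIZER;
static char emergency_buf[4096];

static void *read_begin(void)
{
	void *buf = malloc(sizeof(emergency_buf));	/* stand-in for read_cache_page() */

	if (buf)
		return buf;
	pthread_mutex_lock(&emergency_mutex);	/* serialize every faller-back */
	return emergency_buf;
}

static void read_end(void *buf)
{
	if (buf == emergency_buf)
		pthread_mutex_unlock(&emergency_mutex);
	else
		free(buf);
}

int main(void)
{
	void *p = read_begin();

	read_end(p);
	return 0;
}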
This function will - * write as much debug information as it can gather into the spare space. - */ -void logfs_crash_dump(struct super_block *sb) -{ - dump_segfile(sb); -} - -/* - * FIXME: There should be a reserve for root, similar to ext2. - */ -int logfs_statfs(struct dentry *dentry, struct kstatfs *stats) -{ - struct super_block *sb = dentry->d_sb; - struct logfs_super *super = logfs_super(sb); - - stats->f_type = LOGFS_MAGIC_U32; - stats->f_bsize = sb->s_blocksize; - stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3; - stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits; - stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits; - stats->f_files = 0; - stats->f_ffree = 0; - stats->f_namelen = LOGFS_MAX_NAMELEN; - return 0; -} - -static int logfs_sb_set(struct super_block *sb, void *_super) -{ - struct logfs_super *super = _super; - - sb->s_fs_info = super; - sb->s_mtd = super->s_mtd; - sb->s_bdev = super->s_bdev; -#ifdef CONFIG_BLOCK - if (sb->s_bdev) - sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info; -#endif -#ifdef CONFIG_MTD - if (sb->s_mtd) - sb->s_bdi = sb->s_mtd->backing_dev_info; -#endif - return 0; -} - -static int logfs_sb_test(struct super_block *sb, void *_super) -{ - struct logfs_super *super = _super; - struct mtd_info *mtd = super->s_mtd; - - if (mtd && sb->s_mtd == mtd) - return 1; - if (super->s_bdev && sb->s_bdev == super->s_bdev) - return 1; - return 0; -} - -static void set_segment_header(struct logfs_segment_header *sh, u8 type, - u8 level, u32 segno, u32 ec) -{ - sh->pad = 0; - sh->type = type; - sh->level = level; - sh->segno = cpu_to_be32(segno); - sh->ec = cpu_to_be32(ec); - sh->gec = cpu_to_be64(segno); - sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4); -} - -static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds, - u32 segno, u32 ec) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_segment_header *sh = &ds->ds_sh; - int i; - - memset(ds, 0, sizeof(*ds)); - set_segment_header(sh, SEG_SUPER, 0, segno, ec); - - ds->ds_ifile_levels = super->s_ifile_levels; - ds->ds_iblock_levels = super->s_iblock_levels; - ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */ - ds->ds_segment_shift = super->s_segshift; - ds->ds_block_shift = sb->s_blocksize_bits; - ds->ds_write_shift = super->s_writeshift; - ds->ds_filesystem_size = cpu_to_be64(super->s_size); - ds->ds_segment_size = cpu_to_be32(super->s_segsize); - ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve); - ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat); - ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat); - ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat); - ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags); - ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve); - ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve); - journal_for_each(i) - ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]); - ds->ds_magic = cpu_to_be64(LOGFS_MAGIC); - ds->ds_crc = logfs_crc32(ds, sizeof(*ds), - LOGFS_SEGMENT_HEADERSIZE + 12); -} - -static int write_one_sb(struct super_block *sb, - struct page *(*find_sb)(struct super_block *sb, u64 *ofs)) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_disk_super *ds; - struct logfs_segment_entry se; - struct page *page; - u64 ofs; - u32 ec, segno; - int err; - - page = find_sb(sb, &ofs); - if (!page) - return -EIO; - ds = page_address(page); - segno = seg_no(sb, ofs); - logfs_get_segment_entry(sb, segno, &se); 
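The lines that follow unpack se.ec_level with a shift by 4, mirroring ostore_get_erase_count() earlier in this file: the erase count appears to occupy the upper 28 bits and the GC level the low nibble. A sketch of that packing, inferred from the two call sites rather than from the on-disk format definition:

#include <assert.h>
#include <stdint.h>

/* Inferred layout: erase count in bits 4..31, level in bits 0..3. */
static uint32_t pack_ec_level(uint32_t ec, uint32_t level)
{
	assert(level < 16);
	return (ec << 4) | level;
}

int main(void)
{
	uint32_t ec_level = pack_ec_level(7, 2);

	assert((ec_level >> 4) == 7);	/* erase count, as unpacked below */
	assert((ec_level & 0xf) == 2);	/* GC level nibble */
	return 0;
}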
- ec = be32_to_cpu(se.ec_level) >> 4; - ec++; - logfs_set_segment_erased(sb, segno, ec, 0); - logfs_write_ds(sb, ds, segno, ec); - err = super->s_devops->write_sb(sb, page); - put_page(page); - return err; -} - -int logfs_write_sb(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - int err; - - /* First superblock */ - err = write_one_sb(sb, super->s_devops->find_first_sb); - if (err) - return err; - - /* Last superblock */ - err = write_one_sb(sb, super->s_devops->find_last_sb); - if (err) - return err; - return 0; -} - -static int ds_cmp(const void *ds0, const void *ds1) -{ - size_t len = sizeof(struct logfs_disk_super); - - /* We know the segment headers differ, so ignore them */ - len -= LOGFS_SEGMENT_HEADERSIZE; - ds0 += LOGFS_SEGMENT_HEADERSIZE; - ds1 += LOGFS_SEGMENT_HEADERSIZE; - return memcmp(ds0, ds1, len); -} - -static int logfs_recover_sb(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct logfs_disk_super _ds0, *ds0 = &_ds0; - struct logfs_disk_super _ds1, *ds1 = &_ds1; - int err, valid0, valid1; - - /* read first superblock */ - err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0); - if (err) - return err; - /* read last superblock */ - err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1); - if (err) - return err; - valid0 = logfs_check_ds(ds0) == 0; - valid1 = logfs_check_ds(ds1) == 0; - - if (!valid0 && valid1) { - printk(KERN_INFO"First superblock is invalid - fixing.\n"); - return write_one_sb(sb, super->s_devops->find_first_sb); - } - if (valid0 && !valid1) { - printk(KERN_INFO"Last superblock is invalid - fixing.\n"); - return write_one_sb(sb, super->s_devops->find_last_sb); - } - if (valid0 && valid1 && ds_cmp(ds0, ds1)) { - printk(KERN_INFO"Superblocks don't match - fixing.\n"); - return logfs_write_sb(sb); - } - /* If neither is valid now, something's wrong. Didn't we properly - * check them before?!? 
*/ - BUG_ON(!valid0 && !valid1); - return 0; -} - -static int logfs_make_writeable(struct super_block *sb) -{ - int err; - - err = logfs_open_segfile(sb); - if (err) - return err; - - /* Repair any broken superblock copies */ - err = logfs_recover_sb(sb); - if (err) - return err; - - /* Check areas for trailing unaccounted data */ - err = logfs_check_areas(sb); - if (err) - return err; - - /* Do one GC pass before any data gets dirtied */ - logfs_gc_pass(sb); - - /* after all initializations are done, replay the journal - * for rw-mounts, if necessary */ - err = logfs_replay_journal(sb); - if (err) - return err; - - return 0; -} - -static int logfs_get_sb_final(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct inode *rootdir; - int err; - - /* root dir */ - rootdir = logfs_iget(sb, LOGFS_INO_ROOT); - if (IS_ERR(rootdir)) - goto fail; - - sb->s_root = d_make_root(rootdir); - if (!sb->s_root) - goto fail; - - /* at that point we know that ->put_super() will be called */ - super->s_erase_page = alloc_pages(GFP_KERNEL, 0); - if (!super->s_erase_page) - return -ENOMEM; - memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE); - - /* FIXME: check for read-only mounts */ - err = logfs_make_writeable(sb); - if (err) { - __free_page(super->s_erase_page); - return err; - } - - log_super("LogFS: Finished mounting\n"); - return 0; - -fail: - iput(super->s_master_inode); - iput(super->s_segfile_inode); - iput(super->s_mapping_inode); - return -EIO; -} - -int logfs_check_ds(struct logfs_disk_super *ds) -{ - struct logfs_segment_header *sh = &ds->ds_sh; - - if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC)) - return -EINVAL; - if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4)) - return -EINVAL; - if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds), - LOGFS_SEGMENT_HEADERSIZE + 12)) - return -EINVAL; - return 0; -} - -static struct page *find_super_block(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct page *first, *last; - - first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]); - if (!first || IS_ERR(first)) - return NULL; - last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]); - if (!last || IS_ERR(last)) { - put_page(first); - return NULL; - } - - if (!logfs_check_ds(page_address(first))) { - put_page(last); - return first; - } - - /* First one didn't work, try the second superblock */ - if (!logfs_check_ds(page_address(last))) { - put_page(first); - return last; - } - - /* Neither worked, sorry folks */ - put_page(first); - put_page(last); - return NULL; -} - -static int __logfs_read_sb(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - struct page *page; - struct logfs_disk_super *ds; - int i; - - page = find_super_block(sb); - if (!page) - return -EINVAL; - - ds = page_address(page); - super->s_size = be64_to_cpu(ds->ds_filesystem_size); - super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve); - super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve); - super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve); - super->s_segsize = 1 << ds->ds_segment_shift; - super->s_segmask = (1 << ds->ds_segment_shift) - 1; - super->s_segshift = ds->ds_segment_shift; - sb->s_blocksize = 1 << ds->ds_block_shift; - sb->s_blocksize_bits = ds->ds_block_shift; - super->s_writesize = 1 << ds->ds_write_shift; - super->s_writeshift = ds->ds_write_shift; - super->s_no_segs = super->s_size >> super->s_segshift; - super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits; - 
super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat); - super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat); - super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat); - super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags); - - journal_for_each(i) - super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]); - - super->s_ifile_levels = ds->ds_ifile_levels; - super->s_iblock_levels = ds->ds_iblock_levels; - super->s_data_levels = ds->ds_data_levels; - super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels - + super->s_data_levels; - put_page(page); - return 0; -} - -static int logfs_read_sb(struct super_block *sb, int read_only) -{ - struct logfs_super *super = logfs_super(sb); - int ret; - - super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL); - if (!super->s_btree_pool) - return -ENOMEM; - - btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool); - btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool); - btree_init_mempool32(&super->s_shadow_tree.segment_map, - super->s_btree_pool); - - ret = logfs_init_mapping(sb); - if (ret) - return ret; - - ret = __logfs_read_sb(sb); - if (ret) - return ret; - - if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT) - return -EIO; - if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) && - !read_only) - return -EIO; - - ret = logfs_init_rw(sb); - if (ret) - return ret; - - ret = logfs_init_areas(sb); - if (ret) - return ret; - - ret = logfs_init_gc(sb); - if (ret) - return ret; - - ret = logfs_init_journal(sb); - if (ret) - return ret; - - return 0; -} - -static void logfs_kill_sb(struct super_block *sb) -{ - struct logfs_super *super = logfs_super(sb); - - log_super("LogFS: Start unmounting\n"); - /* Alias entries slow down mount, so evict as many as possible */ - sync_filesystem(sb); - logfs_write_anchor(sb); - free_areas(sb); - - /* - * From this point on alias entries are simply dropped - and any - * writes to the object store are considered bugs. - */ - log_super("LogFS: Now in shutdown\n"); - generic_shutdown_super(sb); - super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN; - - BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes); - - logfs_cleanup_gc(sb); - logfs_cleanup_journal(sb); - logfs_cleanup_areas(sb); - logfs_cleanup_rw(sb); - if (super->s_erase_page) - __free_page(super->s_erase_page); - super->s_devops->put_device(super); - logfs_mempool_destroy(super->s_btree_pool); - logfs_mempool_destroy(super->s_alias_pool); - kfree(super); - log_super("LogFS: Finished unmounting\n"); -} - -static struct dentry *logfs_get_sb_device(struct logfs_super *super, - struct file_system_type *type, int flags) -{ - struct super_block *sb; - int err = -ENOMEM; - static int mount_count; - - log_super("LogFS: Start mount %x\n", mount_count++); - - err = -EINVAL; - sb = sget(type, logfs_sb_test, logfs_sb_set, flags | MS_NOATIME, super); - if (IS_ERR(sb)) { - super->s_devops->put_device(super); - kfree(super); - return ERR_CAST(sb); - } - - if (sb->s_root) { - /* Device is already in use */ - super->s_devops->put_device(super); - kfree(super); - return dget(sb->s_root); - } - - /* - * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache - * only covers 16TB and the upper 8TB are used for indirect blocks. - * On 64bit system we could bump up the limit, but that would make - * the filesystem incompatible with 32bit systems. 
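The 8TB in the comment matches the constant assigned just below: 2^43 bytes is exactly 8 TiB, so (1ull << 43) - 1 is the largest representable offset. A one-line sanity check:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* 2^43 bytes == 8 * 2^40 bytes == 8 TiB */
	assert((UINT64_C(1) << 43) == UINT64_C(8) * (UINT64_C(1) << 40));
	return 0;
}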
- */ - sb->s_maxbytes = (1ull << 43) - 1; - sb->s_max_links = LOGFS_LINK_MAX; - sb->s_op = &logfs_super_operations; - - err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY); - if (err) - goto err1; - - sb->s_flags |= MS_ACTIVE; - err = logfs_get_sb_final(sb); - if (err) { - deactivate_locked_super(sb); - return ERR_PTR(err); - } - return dget(sb->s_root); - -err1: - /* no ->s_root, no ->put_super() */ - iput(super->s_master_inode); - iput(super->s_segfile_inode); - iput(super->s_mapping_inode); - deactivate_locked_super(sb); - return ERR_PTR(err); -} - -static struct dentry *logfs_mount(struct file_system_type *type, int flags, - const char *devname, void *data) -{ - ulong mtdnr; - struct logfs_super *super; - int err; - - super = kzalloc(sizeof(*super), GFP_KERNEL); - if (!super) - return ERR_PTR(-ENOMEM); - - mutex_init(&super->s_dirop_mutex); - mutex_init(&super->s_object_alias_mutex); - INIT_LIST_HEAD(&super->s_freeing_list); - - if (!devname) - err = logfs_get_sb_bdev(super, type, devname); - else if (strncmp(devname, "mtd", 3)) - err = logfs_get_sb_bdev(super, type, devname); - else { - char *garbage; - mtdnr = simple_strtoul(devname+3, &garbage, 0); - if (*garbage) - err = -EINVAL; - else - err = logfs_get_sb_mtd(super, mtdnr); - } - - if (err) { - kfree(super); - return ERR_PTR(err); - } - - return logfs_get_sb_device(super, type, flags); -} - -static struct file_system_type logfs_fs_type = { - .owner = THIS_MODULE, - .name = "logfs", - .mount = logfs_mount, - .kill_sb = logfs_kill_sb, - .fs_flags = FS_REQUIRES_DEV, - -}; -MODULE_ALIAS_FS("logfs"); - -static int __init logfs_init(void) -{ - int ret; - - emergency_page = alloc_pages(GFP_KERNEL, 0); - if (!emergency_page) - return -ENOMEM; - - ret = logfs_compr_init(); - if (ret) - goto out1; - - ret = logfs_init_inode_cache(); - if (ret) - goto out2; - - ret = register_filesystem(&logfs_fs_type); - if (!ret) - return 0; - logfs_destroy_inode_cache(); -out2: - logfs_compr_exit(); -out1: - __free_pages(emergency_page, 0); - return ret; -} - -static void __exit logfs_exit(void) -{ - unregister_filesystem(&logfs_fs_type); - logfs_destroy_inode_cache(); - logfs_compr_exit(); - __free_pages(emergency_page, 0); -} - -module_init(logfs_init); -module_exit(logfs_exit); - -MODULE_LICENSE("GPL v2"); -MODULE_AUTHOR("Joern Engel <joern@logfs.org>"); -MODULE_DESCRIPTION("scalable flash filesystem"); diff --git a/fs/mbcache.c b/fs/mbcache.c index c5bd19ffa326..b19be429d655 100644 --- a/fs/mbcache.c +++ b/fs/mbcache.c @@ -29,7 +29,7 @@ struct mb_cache { /* log2 of hash table size */ int c_bucket_bits; /* Maximum entries in cache to avoid degrading hash too much */ - int c_max_entries; + unsigned long c_max_entries; /* Protects c_list, c_entry_count */ spinlock_t c_list_lock; struct list_head c_list; @@ -43,7 +43,7 @@ struct mb_cache { static struct kmem_cache *mb_entry_cache; static unsigned long mb_cache_shrink(struct mb_cache *cache, - unsigned int nr_to_scan); + unsigned long nr_to_scan); static inline struct hlist_bl_head *mb_cache_entry_head(struct mb_cache *cache, u32 key) @@ -155,12 +155,12 @@ out: } /* - * mb_cache_entry_find_first - find the first entry in cache with given key + * mb_cache_entry_find_first - find the first reusable entry with the given key * @cache: cache where we should search * @key: key to look for * - * Search in @cache for entry with key @key. Grabs reference to the first - * entry found and returns the entry. + * Search in @cache for a reusable entry with key @key. 
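Worth noting in the shrinker hunk further down: it fixes a reversed list_move_tail(), where the entry being rotated must be the first argument and the list head the second; swapped, the call splices the head itself out of the list. A compact userspace rendition of the corrected second-chance rotation, with the primitives modeled on include/linux/list.h:

#include <stdbool.h>

struct list_head {
	struct list_head *prev, *next;
};

static void list_init(struct list_head *h)
{
	h->prev = h->next = h;
}

static void list_del_entry(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
}

static void list_add_tail(struct list_head *e, struct list_head *h)
{
	e->prev = h->prev;
	e->next = h;
	h->prev->next = e;
	h->prev = e;
}

/* Entry first, head second - the argument order the fix restores. */
static void list_move_tail(struct list_head *e, struct list_head *h)
{
	list_del_entry(e);
	list_add_tail(e, h);
}

struct entry {
	struct list_head e_list;
	bool e_referenced;
};

int main(void)
{
	struct list_head lru;
	struct entry a = { .e_referenced = true };

	list_init(&lru);
	list_add_tail(&a.e_list, &lru);
	if (a.e_referenced) {
		a.e_referenced = false;
		list_move_tail(&a.e_list, &lru);	/* rotate, don't reclaim yet */
	}
	return lru.next == &a.e_list ? 0 : 1;
}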
Grabs reference to the + * first reusable entry found and returns the entry. */ struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, u32 key) @@ -170,14 +170,14 @@ struct mb_cache_entry *mb_cache_entry_find_first(struct mb_cache *cache, EXPORT_SYMBOL(mb_cache_entry_find_first); /* - * mb_cache_entry_find_next - find next entry in cache with the same + * mb_cache_entry_find_next - find next reusable entry with the same key * @cache: cache where we should search * @entry: entry to start search from * - * Finds next entry in the hash chain which has the same key as @entry. - * If @entry is unhashed (which can happen when deletion of entry races - * with the search), finds the first entry in the hash chain. The function - * drops reference to @entry and returns with a reference to the found entry. + * Finds next reusable entry in the hash chain which has the same key as @entry. + * If @entry is unhashed (which can happen when deletion of entry races with the + * search), finds the first reusable entry in the hash chain. The function drops + * reference to @entry and returns with a reference to the found entry. */ struct mb_cache_entry *mb_cache_entry_find_next(struct mb_cache *cache, struct mb_cache_entry *entry) @@ -274,11 +274,11 @@ static unsigned long mb_cache_count(struct shrinker *shrink, /* Shrink number of entries in cache */ static unsigned long mb_cache_shrink(struct mb_cache *cache, - unsigned int nr_to_scan) + unsigned long nr_to_scan) { struct mb_cache_entry *entry; struct hlist_bl_head *head; - unsigned int shrunk = 0; + unsigned long shrunk = 0; spin_lock(&cache->c_list_lock); while (nr_to_scan-- && !list_empty(&cache->c_list)) { @@ -286,7 +286,7 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, struct mb_cache_entry, e_list); if (entry->e_referenced) { entry->e_referenced = 0; - list_move_tail(&cache->c_list, &entry->e_list); + list_move_tail(&entry->e_list, &cache->c_list); continue; } list_del_init(&entry->e_list); @@ -316,10 +316,9 @@ static unsigned long mb_cache_shrink(struct mb_cache *cache, static unsigned long mb_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - int nr_to_scan = sc->nr_to_scan; struct mb_cache *cache = container_of(shrink, struct mb_cache, c_shrink); - return mb_cache_shrink(cache, nr_to_scan); + return mb_cache_shrink(cache, sc->nr_to_scan); } /* We shrink 1/X of the cache when we have too many entries in it */ @@ -341,11 +340,8 @@ static void mb_cache_shrink_worker(struct work_struct *work) struct mb_cache *mb_cache_create(int bucket_bits) { struct mb_cache *cache; - int bucket_count = 1 << bucket_bits; - int i; - - if (!try_module_get(THIS_MODULE)) - return NULL; + unsigned long bucket_count = 1UL << bucket_bits; + unsigned long i; cache = kzalloc(sizeof(struct mb_cache), GFP_KERNEL); if (!cache) @@ -377,7 +373,6 @@ struct mb_cache *mb_cache_create(int bucket_bits) return cache; err_out: - module_put(THIS_MODULE); return NULL; } EXPORT_SYMBOL(mb_cache_create); @@ -411,7 +406,6 @@ void mb_cache_destroy(struct mb_cache *cache) } kfree(cache->c_hash); kfree(cache); - module_put(THIS_MODULE); } EXPORT_SYMBOL(mb_cache_destroy); @@ -420,7 +414,8 @@ static int __init mbcache_init(void) mb_entry_cache = kmem_cache_create("mbcache", sizeof(struct mb_cache_entry), 0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD, NULL); - BUG_ON(!mb_entry_cache); + if (!mb_entry_cache) + return -ENOMEM; return 0; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index f975d667c539..e7d9bf86d975 100644 --- a/fs/minix/inode.c +++ 
b/fs/minix/inode.c @@ -434,7 +434,6 @@ static const struct address_space_operations minix_aops = { }; static const struct inode_operations minix_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .getattr = minix_getattr, }; diff --git a/fs/mount.h b/fs/mount.h index d2e25d7b64b3..2c856fc47ae3 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -94,6 +94,12 @@ extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); extern int __legitimize_mnt(struct vfsmount *, unsigned); extern bool legitimize_mnt(struct vfsmount *, unsigned); +static inline bool __path_is_mountpoint(const struct path *path) +{ + struct mount *m = __lookup_mnt(path->mnt, path->dentry); + return m && likely(!(m->mnt.mnt_flags & MNT_SYNC_UMOUNT)); +} + extern void __detach_mounts(struct dentry *dentry); static inline void detach_mounts(struct dentry *dentry) diff --git a/fs/mpage.c b/fs/mpage.c index d2413af0823a..28af984a3d96 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -489,7 +489,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, struct buffer_head map_bh; loff_t i_size = i_size_read(inode); int ret = 0; - int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : 0); + int op_flags = wbc_to_write_flags(wbc); if (page_has_buffers(page)) { struct buffer_head *head = page_buffers(page); @@ -555,8 +555,7 @@ static int __mpage_writepage(struct page *page, struct writeback_control *wbc, if (mpd->get_block(inode, block_in_file, &map_bh, 1)) goto confused; if (buffer_new(&map_bh)) - unmap_underlying_metadata(map_bh.b_bdev, - map_bh.b_blocknr); + clean_bdev_bh_alias(&map_bh); if (buffer_boundary(&map_bh)) { boundary_block = map_bh.b_blocknr; boundary_bdev = map_bh.b_bdev; @@ -705,7 +704,7 @@ mpage_writepages(struct address_space *mapping, ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); if (mpd.bio) { int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : 0); + REQ_SYNC : 0); mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); } } @@ -726,7 +725,7 @@ int mpage_writepage(struct page *page, get_block_t get_block, int ret = __mpage_writepage(page, wbc, &mpd); if (mpd.bio) { int op_flags = (wbc->sync_mode == WB_SYNC_ALL ? - WRITE_SYNC : 0); + REQ_SYNC : 0); mpage_bio_submit(REQ_OP_WRITE, op_flags, mpd.bio); } return ret; diff --git a/fs/namei.c b/fs/namei.c index 5b4eed221530..ad74877e1442 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -37,7 +37,7 @@ #include <linux/hash.h> #include <linux/bitops.h> #include <linux/init_task.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" #include "mount.h" @@ -1200,7 +1200,7 @@ static int follow_managed(struct path *path, struct nameidata *nd) if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage(path->dentry, false); + ret = path->dentry->d_op->d_manage(path, false); if (ret < 0) break; } @@ -1263,10 +1263,10 @@ int follow_down_one(struct path *path) } EXPORT_SYMBOL(follow_down_one); -static inline int managed_dentry_rcu(struct dentry *dentry) +static inline int managed_dentry_rcu(const struct path *path) { - return (dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? - dentry->d_op->d_manage(dentry, true) : 0; + return (path->dentry->d_flags & DCACHE_MANAGE_TRANSIT) ? 
+ path->dentry->d_op->d_manage(path, true) : 0; } /* @@ -1282,7 +1282,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, * Don't forget we might have a non-mountpoint managed dentry * that wants to block transit. */ - switch (managed_dentry_rcu(path->dentry)) { + switch (managed_dentry_rcu(path)) { case -ECHILD: default: return false; @@ -1392,8 +1392,7 @@ int follow_down(struct path *path) if (managed & DCACHE_MANAGE_TRANSIT) { BUG_ON(!path->dentry->d_op); BUG_ON(!path->dentry->d_op->d_manage); - ret = path->dentry->d_op->d_manage( - path->dentry, false); + ret = path->dentry->d_op->d_manage(path, false); if (ret < 0) return ret == -EISDIR ? 0 : ret; } @@ -1725,30 +1724,35 @@ static int pick_link(struct nameidata *nd, struct path *link, return 1; } +enum {WALK_FOLLOW = 1, WALK_MORE = 2}; + /* * Do we need to follow links? We _really_ want to be able * to do this check without having to look at inode->i_op, * so we keep a cache of "no, this doesn't need follow_link" * for the common case. */ -static inline int should_follow_link(struct nameidata *nd, struct path *link, - int follow, - struct inode *inode, unsigned seq) +static inline int step_into(struct nameidata *nd, struct path *path, + int flags, struct inode *inode, unsigned seq) { - if (likely(!d_is_symlink(link->dentry))) - return 0; - if (!follow) + if (!(flags & WALK_MORE) && nd->depth) + put_link(nd); + if (likely(!d_is_symlink(path->dentry)) || + !(flags & WALK_FOLLOW || nd->flags & LOOKUP_FOLLOW)) { + /* not a symlink or should not follow */ + path_to_nameidata(path, nd); + nd->inode = inode; + nd->seq = seq; return 0; + } /* make sure that d_is_symlink above matches inode */ if (nd->flags & LOOKUP_RCU) { - if (read_seqcount_retry(&link->dentry->d_seq, seq)) + if (read_seqcount_retry(&path->dentry->d_seq, seq)) return -ECHILD; } - return pick_link(nd, link, inode, seq); + return pick_link(nd, path, inode, seq); } -enum {WALK_GET = 1, WALK_PUT = 2}; - static int walk_component(struct nameidata *nd, int flags) { struct path path; @@ -1762,7 +1766,7 @@ static int walk_component(struct nameidata *nd, int flags) */ if (unlikely(nd->last_type != LAST_NORM)) { err = handle_dots(nd, nd->last_type); - if (flags & WALK_PUT) + if (!(flags & WALK_MORE) && nd->depth) put_link(nd); return err; } @@ -1789,15 +1793,7 @@ static int walk_component(struct nameidata *nd, int flags) inode = d_backing_inode(path.dentry); } - if (flags & WALK_PUT) - put_link(nd); - err = should_follow_link(nd, &path, flags & WALK_GET, inode, seq); - if (unlikely(err)) - return err; - path_to_nameidata(&path, nd); - nd->inode = inode; - nd->seq = seq; - return 0; + return step_into(nd, &path, flags, inode, seq); } /* @@ -2104,9 +2100,10 @@ OK: if (!name) return 0; /* last component of nested symlink */ - err = walk_component(nd, WALK_GET | WALK_PUT); + err = walk_component(nd, WALK_FOLLOW); } else { - err = walk_component(nd, WALK_GET); + /* not the last component */ + err = walk_component(nd, WALK_FOLLOW | WALK_MORE); } if (err < 0) return err; @@ -2248,12 +2245,7 @@ static inline int lookup_last(struct nameidata *nd) nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY; nd->flags &= ~LOOKUP_PARENT; - return walk_component(nd, - nd->flags & LOOKUP_FOLLOW - ? nd->depth - ? WALK_PUT | WALK_GET - : WALK_GET - : 0); + return walk_component(nd, 0); } /* Returns 0 and nd will be valid on success; Returns error, otherwise. 
*/ @@ -2558,28 +2550,9 @@ int user_path_at_empty(int dfd, const char __user *name, unsigned flags, } EXPORT_SYMBOL(user_path_at_empty); -/* - * NB: most callers don't do anything directly with the reference to the - * to struct filename, but the nd->last pointer points into the name string - * allocated by getname. So we must hold the reference to it until all - * path-walking is complete. - */ -static inline struct filename * -user_path_parent(int dfd, const char __user *path, - struct path *parent, - struct qstr *last, - int *type, - unsigned int flags) -{ - /* only LOOKUP_REVAL is allowed in extra flags */ - return filename_parentat(dfd, getname(path), flags & LOOKUP_REVAL, - parent, last, type); -} - /** * mountpoint_last - look up last component for umount * @nd: pathwalk nameidata - currently pointing at parent directory of "last" - * @path: pointer to container for result * * This is a special lookup_last function just for umount. In this case, we * need to resolve the path without doing any revalidation. @@ -2592,23 +2565,20 @@ user_path_parent(int dfd, const char __user *path, * * Returns: * -error: if there was an error during lookup. This includes -ENOENT if the - * lookup found a negative dentry. The nd->path reference will also be - * put in this case. + * lookup found a negative dentry. * - * 0: if we successfully resolved nd->path and found it to not to be a - * symlink that needs to be followed. "path" will also be populated. - * The nd->path reference will also be put. + * 0: if we successfully resolved nd->last and found it not to be a + * symlink that needs to be followed. * * 1: if we successfully resolved nd->last and found it to be a symlink - * that needs to be followed. "path" will be populated with the path - * to the link, and nd->path will *not* be put. + * that needs to be followed. */ static int -mountpoint_last(struct nameidata *nd, struct path *path) +mountpoint_last(struct nameidata *nd) { int error = 0; - struct dentry *dentry; struct dentry *dir = nd->path.dentry; + struct path path; /* If we're in rcuwalk, drop out of it to handle last component */ if (nd->flags & LOOKUP_RCU) { @@ -2622,37 +2592,28 @@ mountpoint_last(struct nameidata *nd, struct path *path) error = handle_dots(nd, nd->last_type); if (error) return error; - dentry = dget(nd->path.dentry); + path.dentry = dget(nd->path.dentry); } else { - dentry = d_lookup(dir, &nd->last); - if (!dentry) { + path.dentry = d_lookup(dir, &nd->last); + if (!path.dentry) { /* * No cached dentry. Mounted dentries are pinned in the * cache, so that means that this dentry is probably * a symlink or the path doesn't actually point * to a mounted dentry. 
*/ - dentry = lookup_slow(&nd->last, dir, + path.dentry = lookup_slow(&nd->last, dir, nd->flags | LOOKUP_NO_REVAL); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); + if (IS_ERR(path.dentry)) + return PTR_ERR(path.dentry); } } - if (d_is_negative(dentry)) { - dput(dentry); + if (d_is_negative(path.dentry)) { + dput(path.dentry); return -ENOENT; } - if (nd->depth) - put_link(nd); - path->dentry = dentry; - path->mnt = nd->path.mnt; - error = should_follow_link(nd, path, nd->flags & LOOKUP_FOLLOW, - d_backing_inode(dentry), 0); - if (unlikely(error)) - return error; - mntget(path->mnt); - follow_mount(path); - return 0; + path.mnt = nd->path.mnt; + return step_into(nd, &path, 0, d_backing_inode(path.dentry), 0); } /** @@ -2672,13 +2633,19 @@ path_mountpoint(struct nameidata *nd, unsigned flags, struct path *path) if (IS_ERR(s)) return PTR_ERR(s); while (!(err = link_path_walk(s, nd)) && - (err = mountpoint_last(nd, path)) > 0) { + (err = mountpoint_last(nd)) > 0) { s = trailing_symlink(nd); if (IS_ERR(s)) { err = PTR_ERR(s); break; } } + if (!err) { + *path = nd->path; + nd->path.mnt = NULL; + nd->path.dentry = NULL; + follow_mount(path); + } terminate_walk(nd); return err; } @@ -2895,7 +2862,7 @@ bool may_open_dev(const struct path *path) !(path->mnt->mnt_sb->s_iflags & SB_I_NODEV); } -static int may_open(struct path *path, int acc_mode, int flag) +static int may_open(const struct path *path, int acc_mode, int flag) { struct dentry *dentry = path->dentry; struct inode *inode = dentry->d_inode; @@ -2945,7 +2912,7 @@ static int may_open(struct path *path, int acc_mode, int flag) static int handle_truncate(struct file *filp) { - struct path *path = &filp->f_path; + const struct path *path = &filp->f_path; struct inode *inode = path->dentry->d_inode; int error = get_write_access(inode); if (error) @@ -3335,18 +3302,11 @@ static int do_last(struct nameidata *nd, seq = 0; /* out of RCU mode, so the value doesn't matter */ inode = d_backing_inode(path.dentry); finish_lookup: - if (nd->depth) - put_link(nd); - error = should_follow_link(nd, &path, nd->flags & LOOKUP_FOLLOW, - inode, seq); + error = step_into(nd, &path, 0, inode, seq); if (unlikely(error)) return error; - - path_to_nameidata(&path, nd); - nd->inode = inode; - nd->seq = seq; - /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ finish_open: + /* Why this, you ask? _Now_ we might have grown LOOKUP_JUMPED... */ error = complete_walk(nd); if (error) return error; @@ -3861,8 +3821,8 @@ static long do_rmdir(int dfd, const char __user *pathname) int type; unsigned int lookup_flags = 0; retry: - name = user_path_parent(dfd, pathname, - &path, &last, &type, lookup_flags); + name = filename_parentat(dfd, getname(pathname), lookup_flags, + &path, &last, &type); if (IS_ERR(name)) return PTR_ERR(name); @@ -3991,8 +3951,8 @@ static long do_unlinkat(int dfd, const char __user *pathname) struct inode *delegated_inode = NULL; unsigned int lookup_flags = 0; retry: - name = user_path_parent(dfd, pathname, - &path, &last, &type, lookup_flags); + name = filename_parentat(dfd, getname(pathname), lookup_flags, + &path, &last, &type); if (IS_ERR(name)) return PTR_ERR(name); @@ -4345,11 +4305,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, bool new_is_dir = false; unsigned max_links = new_dir->i_sb->s_max_links; - /* - * Check source == target. - * On overlayfs need to look at underlying inodes. 
- */ - if (d_real_inode(old_dentry) == d_real_inode(new_dentry)) + if (source == target) return 0; error = may_delete(old_dir, old_dentry, is_dir); @@ -4491,15 +4447,15 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, target_flags = 0; retry: - from = user_path_parent(olddfd, oldname, - &old_path, &old_last, &old_type, lookup_flags); + from = filename_parentat(olddfd, getname(oldname), lookup_flags, + &old_path, &old_last, &old_type); if (IS_ERR(from)) { error = PTR_ERR(from); goto exit; } - to = user_path_parent(newdfd, newname, - &new_path, &new_last, &new_type, lookup_flags); + to = filename_parentat(newdfd, getname(newname), lookup_flags, + &new_path, &new_last, &new_type); if (IS_ERR(to)) { error = PTR_ERR(to); goto exit1; @@ -4650,7 +4606,8 @@ out: * have ->get_link() not calling nd_jump_link(). Using (or not using) it * for any given inode is up to filesystem. */ -int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) +static int generic_readlink(struct dentry *dentry, char __user *buffer, + int buflen) { DEFINE_DELAYED_CALL(done); struct inode *inode = d_inode(dentry); @@ -4666,7 +4623,36 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen) do_delayed_call(&done); return res; } -EXPORT_SYMBOL(generic_readlink); + +/** + * vfs_readlink - copy symlink body into userspace buffer + * @dentry: dentry on which to get symbolic link + * @buffer: user memory pointer + * @buflen: size of buffer + * + * Does not touch atime. That's up to the caller if necessary + * + * Does not call security hook. + */ +int vfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) +{ + struct inode *inode = d_inode(dentry); + + if (unlikely(!(inode->i_opflags & IOP_DEFAULT_READLINK))) { + if (unlikely(inode->i_op->readlink)) + return inode->i_op->readlink(dentry, buffer, buflen); + + if (!d_is_symlink(dentry)) + return -EINVAL; + + spin_lock(&inode->i_lock); + inode->i_opflags |= IOP_DEFAULT_READLINK; + spin_unlock(&inode->i_lock); + } + + return generic_readlink(dentry, buffer, buflen); +} +EXPORT_SYMBOL(vfs_readlink); /** * vfs_get_link - get symlink body @@ -4783,7 +4769,6 @@ int page_symlink(struct inode *inode, const char *symname, int len) EXPORT_SYMBOL(page_symlink); const struct inode_operations page_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, }; EXPORT_SYMBOL(page_symlink_inode_operations); diff --git a/fs/namespace.c b/fs/namespace.c index e6c234b1a645..487ba30bb5c6 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -96,10 +96,6 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry) return &mountpoint_hashtable[tmp & mp_hash_mask]; } -/* - * allocation is serialized by namespace_sem, but we need the spinlock to - * serialize with freeing. - */ static int mnt_alloc_id(struct mount *mnt) { int res; @@ -678,7 +674,7 @@ out: * * lookup_mnt takes a reference to the found vfsmount. 
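vfs_readlink(), added above, caches the fact that an inode has no ->readlink method of its own as IOP_DEFAULT_READLINK in i_opflags; the flag is set under i_lock but tested locklessly, which works because it only ever transitions from clear to set. A userspace model of that one-way cached dispatch decision - the names and the use of C11 atomics are illustrative, not the kernel's own mechanism:

#include <stdatomic.h>
#include <stddef.h>

#define OP_DEFAULT 0x1u

struct node {
	atomic_uint opflags;
	int (*custom_op)(void);	/* NULL means "use the default" */
};

static int default_op(void)
{
	return 0;
}

static int dispatch(struct node *n)
{
	/* Fast path: the "no custom op" answer was cached earlier. */
	if (!(atomic_load(&n->opflags) & OP_DEFAULT)) {
		if (n->custom_op)
			return n->custom_op();
		/* One-way transition, so a racing setter is harmless. */
		atomic_fetch_or(&n->opflags, OP_DEFAULT);
	}
	return default_op();
}

int main(void)
{
	struct node n = { .custom_op = NULL };

	dispatch(&n);		/* slow path, caches the flag */
	return dispatch(&n);	/* fast path */
}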
*/ -struct vfsmount *lookup_mnt(struct path *path) +struct vfsmount *lookup_mnt(const struct path *path) { struct mount *child_mnt; struct vfsmount *m; @@ -746,26 +742,50 @@ static struct mountpoint *lookup_mountpoint(struct dentry *dentry) return NULL; } -static struct mountpoint *new_mountpoint(struct dentry *dentry) +static struct mountpoint *get_mountpoint(struct dentry *dentry) { - struct hlist_head *chain = mp_hash(dentry); - struct mountpoint *mp; + struct mountpoint *mp, *new = NULL; int ret; - mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); - if (!mp) + if (d_mountpoint(dentry)) { +mountpoint: + read_seqlock_excl(&mount_lock); + mp = lookup_mountpoint(dentry); + read_sequnlock_excl(&mount_lock); + if (mp) + goto done; + } + + if (!new) + new = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); + if (!new) return ERR_PTR(-ENOMEM); + + /* Exactly one process may set d_mounted */ ret = d_set_mounted(dentry); - if (ret) { - kfree(mp); - return ERR_PTR(ret); - } - mp->m_dentry = dentry; - mp->m_count = 1; - hlist_add_head(&mp->m_hash, chain); - INIT_HLIST_HEAD(&mp->m_list); + /* Someone else set d_mounted? */ + if (ret == -EBUSY) + goto mountpoint; + + /* The dentry is not available as a mountpoint? */ + mp = ERR_PTR(ret); + if (ret) + goto done; + + /* Add the new mountpoint to the hash table */ + read_seqlock_excl(&mount_lock); + new->m_dentry = dentry; + new->m_count = 1; + hlist_add_head(&new->m_hash, mp_hash(dentry)); + INIT_HLIST_HEAD(&new->m_list); + read_sequnlock_excl(&mount_lock); + + mp = new; + new = NULL; +done: + kfree(new); return mp; } @@ -1034,6 +1054,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if (IS_MNT_SLAVE(old)) list_add(&mnt->mnt_slave, &old->mnt_slave); mnt->mnt_master = old->mnt_master; + } else { + CLEAR_MNT_SHARED(mnt); } if (flag & CL_MAKE_SHARED) set_mnt_shared(mnt); @@ -1159,7 +1181,36 @@ struct vfsmount *mntget(struct vfsmount *mnt) } EXPORT_SYMBOL(mntget); -struct vfsmount *mnt_clone_internal(struct path *path) +/* path_is_mountpoint() - Check if path is a mount in the current + * namespace. + * + * d_mountpoint() can only be used reliably to establish if a dentry is + * not mounted in any namespace and that common case is handled inline. + * d_mountpoint() isn't aware of the possibility there may be multiple + * mounts using a given dentry in a different namespace. This function + * checks if the passed in path is a mountpoint rather than the dentry + * alone. 
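The body of path_is_mountpoint(), which follows, samples mount_lock with read_seqbegin() and repeats the check until read_seqretry() reports a stable interval. A userspace model of that optimistic read loop; a real seqlock also needs write-side sequence bumps and stronger memory barriers, so treat this as a single-writer sketch:

#include <stdatomic.h>
#include <stdbool.h>

static atomic_uint seq;		/* even: stable, odd: writer in progress */
static int protected_value;

static unsigned int seq_read_begin(void)
{
	unsigned int s;

	while ((s = atomic_load(&seq)) & 1)
		;	/* spin while a write is in flight */
	return s;
}

static bool seq_read_retry(unsigned int s)
{
	atomic_thread_fence(memory_order_acquire);
	return atomic_load(&seq) != s;	/* changed: redo the read */
}

static int read_stable(void)
{
	unsigned int s;
	int v;

	do {
		s = seq_read_begin();
		v = protected_value;	/* the speculative read */
	} while (seq_read_retry(s));
	return v;
}

int main(void)
{
	return read_stable();
}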
+ */ +bool path_is_mountpoint(const struct path *path) +{ + unsigned seq; + bool res; + + if (!d_mountpoint(path->dentry)) + return false; + + rcu_read_lock(); + do { + seq = read_seqbegin(&mount_lock); + res = __path_is_mountpoint(path); + } while (read_seqretry(&mount_lock, seq)); + rcu_read_unlock(); + + return res; +} +EXPORT_SYMBOL(path_is_mountpoint); + +struct vfsmount *mnt_clone_internal(const struct path *path) { struct mount *p; p = clone_mnt(real_mount(path->mnt), path->dentry, CL_PRIVATE); @@ -1568,11 +1619,11 @@ void __detach_mounts(struct dentry *dentry) struct mount *mnt; namespace_lock(); + lock_mount_hash(); mp = lookup_mountpoint(dentry); if (IS_ERR_OR_NULL(mp)) goto out_unlock; - lock_mount_hash(); event++; while (!hlist_empty(&mp->m_list)) { mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); @@ -1582,9 +1633,9 @@ void __detach_mounts(struct dentry *dentry) } else umount_tree(mnt, UMOUNT_CONNECTED); } - unlock_mount_hash(); put_mountpoint(mp); out_unlock: + unlock_mount_hash(); namespace_unlock(); } @@ -1758,7 +1809,7 @@ out: /* Caller should check returned pointer for errors */ -struct vfsmount *collect_mounts(struct path *path) +struct vfsmount *collect_mounts(const struct path *path) { struct mount *tree; namespace_lock(); @@ -1791,7 +1842,7 @@ void drop_collected_mounts(struct vfsmount *mnt) * * Release with mntput(). */ -struct vfsmount *clone_private_mount(struct path *path) +struct vfsmount *clone_private_mount(const struct path *path) { struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; @@ -1799,9 +1850,7 @@ struct vfsmount *clone_private_mount(struct path *path) if (IS_MNT_UNBINDABLE(old_mnt)) return ERR_PTR(-EINVAL); - down_read(&namespace_sem); new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); - up_read(&namespace_sem); if (IS_ERR(new_mnt)) return ERR_CAST(new_mnt); @@ -2013,9 +2062,7 @@ retry: namespace_lock(); mnt = lookup_mnt(path); if (likely(!mnt)) { - struct mountpoint *mp = lookup_mountpoint(dentry); - if (!mp) - mp = new_mountpoint(dentry); + struct mountpoint *mp = get_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); inode_unlock(dentry->d_inode); @@ -2034,7 +2081,11 @@ retry: static void unlock_mount(struct mountpoint *where) { struct dentry *dentry = where->m_dentry; + + read_seqlock_excl(&mount_lock); put_mountpoint(where); + read_sequnlock_excl(&mount_lock); + namespace_unlock(); inode_unlock(dentry->d_inode); } @@ -2997,7 +3048,7 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, return &mnt->mnt == root->mnt && is_subdir(dentry, root->dentry); } -bool path_is_under(struct path *path1, struct path *path2) +bool path_is_under(const struct path *path1, const struct path *path2) { bool res; read_seqlock_excl(&mount_lock); @@ -3110,9 +3161,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, touch_mnt_namespace(current->nsproxy->mnt_ns); /* A moved mount should not expire automatically */ list_del_init(&new_mnt->mnt_expire); + put_mountpoint(root_mp); unlock_mount_hash(); chroot_fs_refs(&root, &new); - put_mountpoint(root_mp); error = 0; out4: unlock_mount(old_mp); diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 6df2a3827574..088f52484d6e 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -18,7 +18,7 @@ #include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/namei.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/byteorder.h> #include "ncp_fs.h" diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index dd38ca1f2ecb..76965e772264 100644 --- 
a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -8,7 +8,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/time.h> #include <linux/kernel.h> @@ -203,7 +203,7 @@ ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from) bufsize - (pos % bufsize), iov_iter_count(from)); - if (copy_from_iter(bouncebuffer, to_write, from) != to_write) { + if (!copy_from_iter_full(bouncebuffer, to_write, from)) { errno = -EFAULT; break; } diff --git a/fs/ncpfs/inode.c b/fs/ncpfs/inode.c index f6cf4c7e92b1..7eb89c23c847 100644 --- a/fs/ncpfs/inode.c +++ b/fs/ncpfs/inode.c @@ -13,7 +13,7 @@ #include <linux/module.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/byteorder.h> #include <linux/time.h> @@ -243,7 +243,6 @@ static void ncp_set_attr(struct inode *inode, struct ncp_entry_info *nwinfo) #if defined(CONFIG_NCPFS_EXTRAS) || defined(CONFIG_NCPFS_NFS_NS) static const struct inode_operations ncp_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .setattr = ncp_notify_change, }; diff --git a/fs/ncpfs/ioctl.c b/fs/ncpfs/ioctl.c index 0a3f9b594602..4434e4977cf3 100644 --- a/fs/ncpfs/ioctl.c +++ b/fs/ncpfs/ioctl.c @@ -20,7 +20,7 @@ #include <linux/vmalloc.h> #include <linux/sched.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "ncp_fs.h" diff --git a/fs/ncpfs/mmap.c b/fs/ncpfs/mmap.c index 33b873b259a8..39f57bef8531 100644 --- a/fs/ncpfs/mmap.c +++ b/fs/ncpfs/mmap.c @@ -18,7 +18,7 @@ #include <linux/fcntl.h> #include <linux/memcontrol.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "ncp_fs.h" diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 17cfb743b5bf..b4c87cfcee95 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -21,7 +21,7 @@ #include <linux/fcntl.h> #include <linux/pagemap.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/byteorder.h> #include <asm/unaligned.h> #include <asm/string.h> diff --git a/fs/ncpfs/sock.c b/fs/ncpfs/sock.c index 471bc3d1139e..f32f272ee501 100644 --- a/fs/ncpfs/sock.c +++ b/fs/ncpfs/sock.c @@ -16,7 +16,7 @@ #include <linux/fcntl.h> #include <linux/stat.h> #include <linux/string.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/in.h> #include <linux/net.h> #include <linux/mm.h> diff --git a/fs/ncpfs/symlink.c b/fs/ncpfs/symlink.c index 421b6f91e8ec..a6d26b46fc05 100644 --- a/fs/ncpfs/symlink.c +++ b/fs/ncpfs/symlink.c @@ -21,7 +21,7 @@ */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/errno.h> #include <linux/fs.h> diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index e9aa235e9d10..f073a6d2c6a5 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -110,20 +110,52 @@ out: #if defined(CONFIG_NFS_V4_1) /* - * Lookup a layout by filehandle. + * Lookup a layout inode by stateid * - * Note: gets a refcount on the layout hdr and on its respective inode. - * Caller must put the layout hdr and the inode. 
+ * Note: returns a refcount on the inode and superblock + */ +static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp, + const nfs4_stateid *stateid) +{ + struct nfs_server *server; + struct inode *inode; + struct pnfs_layout_hdr *lo; + +restart: + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + list_for_each_entry(lo, &server->layouts, plh_layouts) { + if (stateid != NULL && + !nfs4_stateid_match_other(stateid, &lo->plh_stateid)) + continue; + inode = igrab(lo->plh_inode); + if (!inode) + continue; + if (!nfs_sb_active(inode->i_sb)) { + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + iput(inode); + spin_lock(&clp->cl_lock); + rcu_read_lock(); + goto restart; + } + return inode; + } + } + + return NULL; +} + +/* + * Lookup a layout inode by filehandle. + * + * Note: returns a refcount on the inode and superblock + * - * TODO: keep track of all layouts (and delegations) in a hash table - * hashed by filehandle. */ -static struct pnfs_layout_hdr * get_layout_by_fh_locked(struct nfs_client *clp, - struct nfs_fh *fh) +static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp, + const struct nfs_fh *fh) { struct nfs_server *server; struct nfs_inode *nfsi; - struct inode *ino; + struct inode *inode; struct pnfs_layout_hdr *lo; restart: @@ -134,37 +166,38 @@ restart: continue; if (nfsi->layout != lo) continue; - ino = igrab(lo->plh_inode); - if (!ino) - break; - spin_lock(&ino->i_lock); - /* Is this layout in the process of being freed? */ - if (nfsi->layout != lo) { - spin_unlock(&ino->i_lock); - iput(ino); + inode = igrab(lo->plh_inode); + if (!inode) + continue; + if (!nfs_sb_active(inode->i_sb)) { + rcu_read_unlock(); + spin_unlock(&clp->cl_lock); + iput(inode); + spin_lock(&clp->cl_lock); + rcu_read_lock(); goto restart; } - pnfs_get_layout_hdr(lo); - spin_unlock(&ino->i_lock); - return lo; + return inode; } } return NULL; } -static struct pnfs_layout_hdr * get_layout_by_fh(struct nfs_client *clp, - struct nfs_fh *fh) +static struct inode *nfs_layout_find_inode(struct nfs_client *clp, + const struct nfs_fh *fh, + const nfs4_stateid *stateid) { - struct pnfs_layout_hdr *lo; + struct inode *inode; spin_lock(&clp->cl_lock); rcu_read_lock(); - lo = get_layout_by_fh_locked(clp, fh); + inode = nfs_layout_find_inode_by_stateid(clp, stateid); + if (!inode) + inode = nfs_layout_find_inode_by_fh(clp, fh); rcu_read_unlock(); spin_unlock(&clp->cl_lock); - return lo; + return inode; } /* @@ -213,18 +246,20 @@ static u32 initiate_file_draining(struct nfs_client *clp, u32 rv = NFS4ERR_NOMATCHING_LAYOUT; LIST_HEAD(free_me_list); - lo = get_layout_by_fh(clp, &args->cbl_fh); - if (!lo) { - trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, NULL, - &args->cbl_stateid, -rv); + ino = nfs_layout_find_inode(clp, &args->cbl_fh, &args->cbl_stateid); + if (!ino) goto out; - } - ino = lo->plh_inode; pnfs_layoutcommit_inode(ino, false); spin_lock(&ino->i_lock); + lo = NFS_I(ino)->layout; + if (!lo) { + spin_unlock(&ino->i_lock); + goto out; + } + pnfs_get_layout_hdr(lo); rv = pnfs_check_callback_stateid(lo, &args->cbl_stateid); if (rv != NFS_OK) goto unlock; @@ -258,10 +293,10 @@ unlock: /* Free all lsegs that are attached to commit buckets */ nfs_commit_inode(ino, 0); pnfs_put_layout_hdr(lo); +out: trace_nfs4_cb_layoutrecall_file(clp, &args->cbl_fh, ino, &args->cbl_stateid, -rv); - iput(ino); -out: + nfs_iput_and_deactive(ino); return rv; } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ebecfb8fba06..91a8d610ba0f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -369,9 +369,7 @@ 
nfs_found_client(const struct nfs_client_initdata *cl_init, * Look up a client by IP address and protocol version * - creates a new record if one doesn't yet exist */ -struct nfs_client * -nfs_get_client(const struct nfs_client_initdata *cl_init, - rpc_authflavor_t authflavour) +struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) { struct nfs_client *clp, *new = NULL; struct nfs_net *nn = net_generic(cl_init->net, nfs_net_id); @@ -655,7 +653,7 @@ static int nfs_init_server(struct nfs_server *server, set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, RPC_AUTH_UNIX); + clp = nfs_get_client(&cl_init); if (IS_ERR(clp)) { dprintk("<-- nfs_init_server() = error %ld\n", PTR_ERR(clp)); return PTR_ERR(clp); } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index dff600ae0d74..d7df5e67b0c1 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -391,10 +391,6 @@ int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; - /* Ensure we revalidate the attributes and page cache! */ - spin_lock(&inode->i_lock); - nfsi->cache_validity |= NFS_INO_REVAL_FORCED; - spin_unlock(&inode->i_lock); trace_nfs4_set_delegation(inode, res->delegation_type); out: diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 5f1af4cd1a33..fad81041f5ab 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -455,14 +455,17 @@ bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx) } /* - * This function is called by the lookup code to request the use of - * readdirplus to accelerate any future lookups in the same + * This function is called by the lookup and getattr code to request the + * use of readdirplus to accelerate any future lookups in the same * directory. */ -static void nfs_advise_use_readdirplus(struct inode *dir) +void nfs_advise_use_readdirplus(struct inode *dir) { - set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(dir)->flags); + struct nfs_inode *nfsi = NFS_I(dir); + + if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && + !list_empty(&nfsi->open_files)) + set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); } /* @@ -475,9 +478,12 @@ void nfs_advise_use_readdirplus(struct inode *dir) */ void nfs_force_use_readdirplus(struct inode *dir) { - if (!list_empty(&NFS_I(dir)->open_files)) { - nfs_advise_use_readdirplus(dir); - nfs_zap_mapping(dir, dir->i_mapping); + struct nfs_inode *nfsi = NFS_I(dir); + + if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) && + !list_empty(&nfsi->open_files)) { + set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); + invalidate_mapping_pages(dir->i_mapping, 0, -1); } } @@ -886,17 +892,6 @@ int uncached_readdir(nfs_readdir_descriptor_t *desc) goto out; } -static bool nfs_dir_mapping_need_revalidate(struct inode *dir) -{ - struct nfs_inode *nfsi = NFS_I(dir); - - if (nfs_attribute_cache_expired(dir)) - return true; - if (nfsi->cache_validity & NFS_INO_INVALID_DATA) - return true; - return false; -} - /* The file offset position represents the dirent entry number. A last cookie cache takes care of the common case of reading the whole directory. @@ -928,7 +923,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) desc->decode = NFS_PROTO(inode)->decode_dirent; desc->plus = nfs_use_readdirplus(inode, ctx) ? 
1 : 0; - if (ctx->pos == 0 || nfs_dir_mapping_need_revalidate(inode)) + if (ctx->pos == 0 || nfs_attribute_cache_expired(inode)) res = nfs_revalidate_mapping(inode, file->f_mapping); if (res < 0) goto out; @@ -1035,8 +1030,6 @@ EXPORT_SYMBOL_GPL(nfs_force_lookup_revalidate); static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, int rcu_walk) { - int ret; - if (IS_ROOT(dentry)) return 1; if (NFS_SERVER(dir)->flags & NFS_MOUNT_LOOKUP_CACHE_NONE) @@ -1044,12 +1037,12 @@ static int nfs_check_verifier(struct inode *dir, struct dentry *dentry, if (!nfs_verify_change_attribute(dir, dentry->d_time)) return 0; /* Revalidate nfsi->cache_change_attribute before we declare a match */ - if (rcu_walk) - ret = nfs_revalidate_inode_rcu(NFS_SERVER(dir), dir); - else - ret = nfs_revalidate_inode(NFS_SERVER(dir), dir); - if (ret < 0) - return 0; + if (nfs_mapping_need_revalidate_inode(dir)) { + if (rcu_walk) + return 0; + if (__nfs_revalidate_inode(NFS_SERVER(dir), dir) < 0) + return 0; + } if (!nfs_verify_change_attribute(dir, dentry->d_time)) return 0; return 1; @@ -1161,7 +1154,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; goto out_bad; } - goto out_valid_noent; + goto out_valid; } if (is_bad_inode(inode)) { @@ -1184,6 +1177,7 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) return -ECHILD; goto out_zap_parent; } + nfs_advise_use_readdirplus(dir); goto out_valid; } @@ -1219,12 +1213,12 @@ static int nfs_lookup_revalidate(struct dentry *dentry, unsigned int flags) nfs_free_fhandle(fhandle); nfs4_label_free(label); + /* set a readdirplus hint that we had a cache miss */ + nfs_force_use_readdirplus(dir); + out_set_verifier: nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); out_valid: - /* Success: notify readdir to use READDIRPLUS */ - nfs_advise_use_readdirplus(dir); - out_valid_noent: if (flags & LOOKUP_RCU) { if (parent != ACCESS_ONCE(dentry->d_parent)) return -ECHILD; @@ -1279,8 +1273,8 @@ out_error: */ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) { - int error; struct inode *inode = d_inode(dentry); + int error = 0; /* * I believe we can only get a negative dentry here in the case of a @@ -1299,7 +1293,8 @@ static int nfs_weak_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - error = nfs_revalidate_inode(NFS_SERVER(inode), inode); + if (nfs_mapping_need_revalidate_inode(inode)) + error = __nfs_revalidate_inode(NFS_SERVER(inode), inode); dfprintk(LOOKUPCACHE, "NFS: %s: inode %lu is %s\n", __func__, inode->i_ino, error ? 
"invalid" : "valid"); return !error; @@ -1424,8 +1419,8 @@ struct dentry *nfs_lookup(struct inode *dir, struct dentry * dentry, unsigned in if (IS_ERR(res)) goto out_label; - /* Success: notify readdir to use READDIRPLUS */ - nfs_advise_use_readdirplus(dir); + /* Notify readdir to use READDIRPLUS */ + nfs_force_use_readdirplus(dir); no_entry: res = d_splice_alias(inode, dentry); @@ -1467,9 +1462,9 @@ static fmode_t flags_to_mode(int flags) return res; } -static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags) +static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp) { - return alloc_nfs_open_context(dentry, flags_to_mode(open_flags)); + return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp); } static int do_open(struct inode *inode, struct file *filp) @@ -1535,8 +1530,13 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, return -ENAMETOOLONG; if (open_flags & O_CREAT) { + struct nfs_server *server = NFS_SERVER(dir); + + if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) + mode &= ~current_umask(); + attr.ia_valid |= ATTR_MODE; - attr.ia_mode = mode & ~current_umask(); + attr.ia_mode = mode; } if (open_flags & O_TRUNC) { attr.ia_valid |= ATTR_SIZE; @@ -1554,7 +1554,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, return finish_no_open(file, dentry); } - ctx = create_nfs_open_context(dentry, open_flags); + ctx = create_nfs_open_context(dentry, open_flags, file); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; @@ -2286,8 +2286,7 @@ static int nfs_access_get_cached(struct inode *inode, struct rpc_cred *cred, str if (cache == NULL) goto out; /* Found an entry, is our attribute cache valid? */ - if (!nfs_attribute_cache_expired(inode) && - !(nfsi->cache_validity & NFS_INO_INVALID_ATTR)) + if (!nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) break; err = -ECHILD; if (!may_block) @@ -2335,12 +2334,12 @@ static int nfs_access_get_cached_rcu(struct inode *inode, struct rpc_cred *cred, cache = NULL; if (cache == NULL) goto out; - err = nfs_revalidate_inode_rcu(NFS_SERVER(inode), inode); - if (err) + if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) goto out; res->jiffies = cache->jiffies; res->cred = cache->cred; res->mask = cache->mask; + err = 0; out: rcu_read_unlock(); return err; @@ -2492,12 +2491,13 @@ EXPORT_SYMBOL_GPL(nfs_may_open); static int nfs_execute_ok(struct inode *inode, int mask) { struct nfs_server *server = NFS_SERVER(inode); - int ret; + int ret = 0; - if (mask & MAY_NOT_BLOCK) - ret = nfs_revalidate_inode_rcu(server, inode); - else - ret = nfs_revalidate_inode(server, inode); + if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_ACCESS)) { + if (mask & MAY_NOT_BLOCK) + return -ECHILD; + ret = __nfs_revalidate_inode(server, inode); + } if (ret == 0 && !execute_ok(inode)) ret = -EACCES; return ret; diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index bd81bcf3ffcf..aab32fc3d6a8 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -52,7 +52,7 @@ #include <linux/nfs_page.h> #include <linux/sunrpc/clnt.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/atomic.h> #include "internal.h" @@ -105,7 +105,7 @@ struct nfs_direct_req { static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops; static const struct nfs_commit_completion_ops nfs_direct_commit_completion_ops; -static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode); +static void 
nfs_direct_write_complete(struct nfs_direct_req *dreq); static void nfs_direct_write_schedule_work(struct work_struct *work); static inline void get_dreq(struct nfs_direct_req *dreq) @@ -684,7 +684,7 @@ out_failed: } if (put_dreq(dreq)) - nfs_direct_write_complete(dreq, dreq->inode); + nfs_direct_write_complete(dreq); } static void nfs_direct_commit_complete(struct nfs_commit_data *data) @@ -717,7 +717,7 @@ static void nfs_direct_commit_complete(struct nfs_commit_data *data) } if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) - nfs_direct_write_complete(dreq, data->inode); + nfs_direct_write_complete(dreq); } static void nfs_direct_resched_write(struct nfs_commit_info *cinfo, @@ -768,7 +768,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work) } } -static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) +static void nfs_direct_write_complete(struct nfs_direct_req *dreq) { schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */ } @@ -824,7 +824,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) out_put: if (put_dreq(dreq)) - nfs_direct_write_complete(dreq, hdr->inode); + nfs_direct_write_complete(dreq); hdr->release(hdr); } @@ -953,7 +953,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, } if (put_dreq(dreq)) - nfs_direct_write_complete(dreq, dreq->inode); + nfs_direct_write_complete(dreq); return 0; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 9ea85ae23c32..26dbe8b0c10d 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -29,7 +29,7 @@ #include <linux/gfp.h> #include <linux/swap.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "delegation.h" #include "internal.h" @@ -101,18 +101,11 @@ EXPORT_SYMBOL_GPL(nfs_file_release); static int nfs_revalidate_file_size(struct inode *inode, struct file *filp) { struct nfs_server *server = NFS_SERVER(inode); - struct nfs_inode *nfsi = NFS_I(inode); - - if (nfs_have_delegated_attributes(inode)) - goto out_noreval; if (filp->f_flags & O_DIRECT) goto force_reval; - if (nfsi->cache_validity & NFS_INO_REVAL_PAGECACHE) - goto force_reval; - if (nfs_attribute_timeout(inode)) + if (nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE)) goto force_reval; -out_noreval: return 0; force_reval: return __nfs_revalidate_inode(server, inode); @@ -374,7 +367,7 @@ static int nfs_write_end(struct file *file, struct address_space *mapping, */ if (!PageUptodate(page)) { unsigned pglen = nfs_page_length(page); - unsigned end = offset + len; + unsigned end = offset + copied; if (pglen == 0) { zero_user_segments(page, 0, offset, diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 4946ef40ba87..f956ca20a8a3 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -279,11 +279,11 @@ nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx) nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, dataserver_retrans, 4, - s->nfs_client->cl_minorversion, - s->nfs_client->cl_rpcclient->cl_auth->au_flavor); + s->nfs_client->cl_minorversion); out_test_devid: - if (filelayout_test_devid_unavailable(devid)) + if (ret->ds_clp == NULL || + filelayout_test_devid_unavailable(devid)) ret = NULL; out: return ret; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 98ace127bf86..0ca4af8cca5d 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -25,9 +25,20 @@ #define NFSDBG_FACILITY 
NFSDBG_PNFS_LD #define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) +#define FF_LAYOUTRETURN_MAXERR 20 + static struct group_info *ff_zero_group; +static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, + struct nfs_pgio_header *hdr); +static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, + struct nfs42_layoutstat_devinfo *devinfo, + int dev_limit); +static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, + const struct nfs42_layoutstat_devinfo *devinfo, + struct nfs4_ff_layout_mirror *mirror); + static struct pnfs_layout_hdr * ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) { @@ -172,7 +183,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo, spin_lock(&inode->i_lock); list_for_each_entry(pos, &ff_layout->mirrors, mirrors) { - if (mirror->mirror_ds != pos->mirror_ds) + if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0) continue; if (!ff_mirror_match_fh(mirror, pos)) continue; @@ -349,19 +360,6 @@ static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) } } -static void ff_layout_mark_devices_valid(struct nfs4_ff_layout_segment *fls) -{ - struct nfs4_deviceid_node *node; - int i; - - if (!(fls->flags & FF_FLAGS_NO_IO_THRU_MDS)) - return; - for (i = 0; i < fls->mirror_array_cnt; i++) { - node = &fls->mirror_array[i]->mirror_ds->id_node; - clear_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags); - } -} - static struct pnfs_layout_segment * ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, struct nfs4_layoutget_res *lgr, @@ -415,8 +413,6 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, for (i = 0; i < fls->mirror_array_cnt; i++) { struct nfs4_ff_layout_mirror *mirror; - struct nfs4_deviceid devid; - struct nfs4_deviceid_node *idnode; struct auth_cred acred = { .group_info = ff_zero_group }; struct rpc_cred __rcu *cred; u32 ds_count, fh_count, id; @@ -441,24 +437,10 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array[i]->ds_count = ds_count; /* deviceid */ - rc = decode_deviceid(&stream, &devid); + rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid); if (rc) goto out_err_free; - idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode), - &devid, lh->plh_lc_cred, - gfp_flags); - /* - * upon success, mirror_ds is allocated by previous - * getdeviceinfo, or newly by .alloc_deviceid_node - * nfs4_find_get_deviceid failure is indeed getdeviceinfo falure - */ - if (idnode) - fls->mirror_array[i]->mirror_ds = - FF_LAYOUT_MIRROR_DS(idnode); - else - goto out_err_free; - /* efficiency */ rc = -EIO; p = xdr_inline_decode(&stream, 4); @@ -556,8 +538,6 @@ out_sort_mirrors: rc = ff_layout_check_layout(lgr); if (rc) goto out_err_free; - ff_layout_mark_devices_valid(fls); - ret = &fls->generic_hdr; dprintk("<-- %s (success)\n", __func__); out_free_page: @@ -639,12 +619,11 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, struct nfs4_ff_layoutstat *layoutstat, ktime_t now) { - static const ktime_t notime = {0}; s64 report_interval = FF_LAYOUTSTATS_REPORT_INTERVAL; struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout); nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now); - if (ktime_equal(mirror->start_time, notime)) + if (!mirror->start_time) mirror->start_time = now; if (mirror->report_interval != 0) report_interval = (s64)mirror->report_interval * 1000LL; @@ -702,6 +681,7 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode, spin_lock(&mirror->lock); report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now); 
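
For context: in nfs4_ff_layoutstat_start_io() above, the old ktime_equal() test against a zero-initialized ktime_t becomes a plain "if (!mirror->start_time)" because ktime_t is now a simple 64-bit scalar. The reporting decision itself reduces to "start the clock on the first I/O, then fire once per report interval". A minimal standalone sketch of that logic follows; all names are hypothetical and this is not the kernel's code:

#include <stdbool.h>
#include <stdint.h>

/* Sketch only: decide whether a layoutstats report is due.  A zero
 * window start stands in for the "never started" ktime_t value. */
static bool report_due(int64_t now_ms, int64_t *window_start_ms,
		       int64_t interval_ms)
{
	if (*window_start_ms == 0)	/* first I/O: start the clock */
		*window_start_ms = now_ms;
	if (now_ms - *window_start_ms < interval_ms)
		return false;
	*window_start_ms = now_ms;	/* interval elapsed: report now */
	return true;
}
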
nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested); + set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); if (report) @@ -718,6 +698,7 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat, requested, completed, ktime_get(), task->tk_start); + set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); } @@ -731,6 +712,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode, spin_lock(&mirror->lock); report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now); nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested); + set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); if (report) @@ -750,6 +732,7 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, spin_lock(&mirror->lock); nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat, requested, completed, ktime_get(), task->tk_start); + set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); } @@ -1142,7 +1125,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, case -EPIPE: dprintk("%s DS connection error %d\n", __func__, task->tk_status); - nfs4_mark_deviceid_unavailable(devid); + nfs4_delete_deviceid(devid->ld, devid->nfs_client, + &devid->deviceid); rpc_wake_up(&tbl->slot_tbl_waitq); /* fall through */ default: @@ -1191,7 +1175,8 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, default: dprintk("%s DS connection error %d\n", __func__, task->tk_status); - nfs4_mark_deviceid_unavailable(devid); + nfs4_delete_deviceid(devid->ld, devid->nfs_client, + &devid->deviceid); } /* FIXME: Need to prevent infinite looping here. */ return -NFS4ERR_RESET_TO_PNFS; @@ -1293,6 +1278,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task, hdr->pgio_mirror_idx + 1, &hdr->pgio_mirror_idx)) goto out_eagain; + ff_layout_read_record_layoutstats_done(task, hdr); pnfs_read_resend_pnfs(hdr); return task->tk_status; case -NFS4ERR_RESET_TO_MDS: @@ -1961,38 +1947,88 @@ ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d) id_node)); } -static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo, - struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args) +static int ff_layout_encode_ioerr(struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args, + const struct nfs4_flexfile_layoutreturn_args *ff_args) { - struct pnfs_layout_hdr *hdr = &flo->generic_hdr; __be32 *start; - int count = 0, ret = 0; start = xdr_reserve_space(xdr, 4); if (unlikely(!start)) return -E2BIG; + *start = cpu_to_be32(ff_args->num_errors); /* This assume we always return _ALL_ layouts */ - spin_lock(&hdr->plh_inode->i_lock); - ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range); - spin_unlock(&hdr->plh_inode->i_lock); + return ff_layout_encode_ds_ioerr(xdr, &ff_args->errors); +} - *start = cpu_to_be32(count); +static void +encode_opaque_fixed(struct xdr_stream *xdr, const void *buf, size_t len) +{ + __be32 *p; - return ret; + p = xdr_reserve_space(xdr, len); + xdr_encode_opaque_fixed(p, buf, len); +} + +static void +ff_layout_encode_ff_iostat_head(struct xdr_stream *xdr, + const nfs4_stateid *stateid, + const struct nfs42_layoutstat_devinfo *devinfo) +{ + __be32 *p; + + p = xdr_reserve_space(xdr, 8 + 8); + p = xdr_encode_hyper(p, devinfo->offset); + p = xdr_encode_hyper(p, devinfo->length); + encode_opaque_fixed(xdr, stateid->data, NFS4_STATEID_SIZE); + p = xdr_reserve_space(xdr, 4*8); + p 
= xdr_encode_hyper(p, devinfo->read_count); + p = xdr_encode_hyper(p, devinfo->read_bytes); + p = xdr_encode_hyper(p, devinfo->write_count); + p = xdr_encode_hyper(p, devinfo->write_bytes); + encode_opaque_fixed(xdr, devinfo->dev_id.data, NFS4_DEVICEID4_SIZE); +} + +static void +ff_layout_encode_ff_iostat(struct xdr_stream *xdr, + const nfs4_stateid *stateid, + const struct nfs42_layoutstat_devinfo *devinfo) +{ + ff_layout_encode_ff_iostat_head(xdr, stateid, devinfo); + ff_layout_encode_ff_layoutupdate(xdr, devinfo, + devinfo->ld_private.data); } /* report nothing for now */ -static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo, - struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args) +static void ff_layout_encode_iostats_array(struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args, + struct nfs4_flexfile_layoutreturn_args *ff_args) { __be32 *p; + int i; p = xdr_reserve_space(xdr, 4); - if (likely(p)) - *p = cpu_to_be32(0); + *p = cpu_to_be32(ff_args->num_dev); + for (i = 0; i < ff_args->num_dev; i++) + ff_layout_encode_ff_iostat(xdr, + &args->layout->plh_stateid, + &ff_args->devinfo[i]); +} + +static void +ff_layout_free_iostats_array(struct nfs42_layoutstat_devinfo *devinfo, + unsigned int num_entries) +{ + unsigned int i; + + for (i = 0; i < num_entries; i++) { + if (!devinfo[i].ld_private.ops) + continue; + if (!devinfo[i].ld_private.ops->free) + continue; + devinfo[i].ld_private.ops->free(&devinfo[i].ld_private); + } } static struct nfs4_deviceid_node * @@ -2008,24 +2044,91 @@ ff_layout_alloc_deviceid_node(struct nfs_server *server, } static void -ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo, - struct xdr_stream *xdr, - const struct nfs4_layoutreturn_args *args) -{ - struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo); +ff_layout_encode_layoutreturn(struct xdr_stream *xdr, + const void *voidargs, + const struct nfs4_xdr_opaque_data *ff_opaque) +{ + const struct nfs4_layoutreturn_args *args = voidargs; + struct nfs4_flexfile_layoutreturn_args *ff_args = ff_opaque->data; + struct xdr_buf tmp_buf = { + .head = { + [0] = { + .iov_base = page_address(ff_args->pages[0]), + }, + }, + .buflen = PAGE_SIZE, + }; + struct xdr_stream tmp_xdr; __be32 *start; dprintk("%s: Begin\n", __func__); - start = xdr_reserve_space(xdr, 4); - BUG_ON(!start); - ff_layout_encode_ioerr(flo, xdr, args); - ff_layout_encode_iostats(flo, xdr, args); + xdr_init_encode(&tmp_xdr, &tmp_buf, NULL); + + ff_layout_encode_ioerr(&tmp_xdr, args, ff_args); + ff_layout_encode_iostats_array(&tmp_xdr, args, ff_args); + + start = xdr_reserve_space(xdr, 4); + *start = cpu_to_be32(tmp_buf.len); + xdr_write_pages(xdr, ff_args->pages, 0, tmp_buf.len); - *start = cpu_to_be32((xdr->p - start - 1) * 4); dprintk("%s: Return\n", __func__); } +static void +ff_layout_free_layoutreturn(struct nfs4_xdr_opaque_data *args) +{ + struct nfs4_flexfile_layoutreturn_args *ff_args; + + if (!args->data) + return; + ff_args = args->data; + args->data = NULL; + + ff_layout_free_ds_ioerr(&ff_args->errors); + ff_layout_free_iostats_array(ff_args->devinfo, ff_args->num_dev); + + put_page(ff_args->pages[0]); + kfree(ff_args); +} + +const struct nfs4_xdr_opaque_ops layoutreturn_ops = { + .encode = ff_layout_encode_layoutreturn, + .free = ff_layout_free_layoutreturn, +}; + +static int +ff_layout_prepare_layoutreturn(struct nfs4_layoutreturn_args *args) +{ + struct nfs4_flexfile_layoutreturn_args *ff_args; + struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(args->layout); + + ff_args = 
kmalloc(sizeof(*ff_args), GFP_KERNEL); + if (!ff_args) + goto out_nomem; + ff_args->pages[0] = alloc_page(GFP_KERNEL); + if (!ff_args->pages[0]) + goto out_nomem_free; + + INIT_LIST_HEAD(&ff_args->errors); + ff_args->num_errors = ff_layout_fetch_ds_ioerr(args->layout, + &args->range, &ff_args->errors, + FF_LAYOUTRETURN_MAXERR); + + spin_lock(&args->inode->i_lock); + ff_args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, + &ff_args->devinfo[0], ARRAY_SIZE(ff_args->devinfo)); + spin_unlock(&args->inode->i_lock); + + args->ld_private->ops = &layoutreturn_ops; + args->ld_private->data = ff_args; + return 0; +out_nomem_free: + kfree(ff_args); +out_nomem: + return -ENOMEM; +} + static int ff_layout_ntop4(const struct sockaddr *sap, char *buf, const size_t buflen) { @@ -2146,21 +2249,18 @@ ff_layout_encode_io_latency(struct xdr_stream *xdr, } static void -ff_layout_encode_layoutstats(struct xdr_stream *xdr, - struct nfs42_layoutstat_args *args, - struct nfs42_layoutstat_devinfo *devinfo) +ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, + const struct nfs42_layoutstat_devinfo *devinfo, + struct nfs4_ff_layout_mirror *mirror) { - struct nfs4_ff_layout_mirror *mirror = devinfo->layout_private; struct nfs4_pnfs_ds_addr *da; struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds; struct nfs_fh *fh = &mirror->fh_versions[0]; - __be32 *p, *start; + __be32 *p; da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node); dprintk("%s: DS %s: encoding address %s\n", __func__, ds->ds_remotestr, da->da_remotestr); - /* layoutupdate length */ - start = xdr_reserve_space(xdr, 4); /* netaddr4 */ ff_layout_encode_netaddr(xdr, da); /* nfs_fh4 */ @@ -2177,42 +2277,71 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr, /* bool */ p = xdr_reserve_space(xdr, 4); *p = cpu_to_be32(false); +} + +static void +ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args, + const struct nfs4_xdr_opaque_data *opaque) +{ + struct nfs42_layoutstat_devinfo *devinfo = container_of(opaque, + struct nfs42_layoutstat_devinfo, ld_private); + __be32 *start; + + /* layoutupdate length */ + start = xdr_reserve_space(xdr, 4); + ff_layout_encode_ff_layoutupdate(xdr, devinfo, opaque->data); *start = cpu_to_be32((xdr->p - start - 1) * 4); } +static void +ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque) +{ + struct nfs4_ff_layout_mirror *mirror = opaque->data; + + ff_layout_put_mirror(mirror); +} + +static const struct nfs4_xdr_opaque_ops layoutstat_ops = { + .encode = ff_layout_encode_layoutstats, + .free = ff_layout_free_layoutstats, +}; + static int -ff_layout_mirror_prepare_stats(struct nfs42_layoutstat_args *args, - struct pnfs_layout_hdr *lo, +ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, + struct nfs42_layoutstat_devinfo *devinfo, int dev_limit) { struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *dev; - struct nfs42_layoutstat_devinfo *devinfo; int i = 0; list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { if (i >= dev_limit) break; - if (!mirror->mirror_ds) + if (IS_ERR_OR_NULL(mirror->mirror_ds)) + continue; + if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags)) continue; /* mirror refcount put in cleanup_layoutstats */ if (!atomic_inc_not_zero(&mirror->ref)) continue; dev = &mirror->mirror_ds->id_node; - devinfo = &args->devinfo[i]; memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); devinfo->offset = 0; devinfo->length = NFS4_MAX_UINT64; + 
spin_lock(&mirror->lock); devinfo->read_count = mirror->read_stat.io_stat.ops_completed; devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed; devinfo->write_count = mirror->write_stat.io_stat.ops_completed; devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed; + spin_unlock(&mirror->lock); devinfo->layout_type = LAYOUT_FLEX_FILES; - devinfo->layoutstats_encode = ff_layout_encode_layoutstats; - devinfo->layout_private = mirror; + devinfo->ld_private.ops = &layoutstat_ops; + devinfo->ld_private.data = mirror; + devinfo++; i++; } return i; @@ -2222,47 +2351,27 @@ static int ff_layout_prepare_layoutstats(struct nfs42_layoutstat_args *args) { struct nfs4_flexfile_layout *ff_layout; - struct nfs4_ff_layout_mirror *mirror; - int dev_count = 0; + const int dev_count = PNFS_LAYOUTSTATS_MAXDEV; - spin_lock(&args->inode->i_lock); - ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); - list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { - if (atomic_read(&mirror->ref) != 0) - dev_count ++; - } - spin_unlock(&args->inode->i_lock); /* For now, send at most PNFS_LAYOUTSTATS_MAXDEV statistics */ - if (dev_count > PNFS_LAYOUTSTATS_MAXDEV) { - dprintk("%s: truncating devinfo to limit (%d:%d)\n", - __func__, dev_count, PNFS_LAYOUTSTATS_MAXDEV); - dev_count = PNFS_LAYOUTSTATS_MAXDEV; - } args->devinfo = kmalloc_array(dev_count, sizeof(*args->devinfo), GFP_NOIO); if (!args->devinfo) return -ENOMEM; spin_lock(&args->inode->i_lock); - args->num_dev = ff_layout_mirror_prepare_stats(args, - &ff_layout->generic_hdr, dev_count); + ff_layout = FF_LAYOUT_FROM_HDR(NFS_I(args->inode)->layout); + args->num_dev = ff_layout_mirror_prepare_stats(&ff_layout->generic_hdr, + &args->devinfo[0], dev_count); spin_unlock(&args->inode->i_lock); + if (!args->num_dev) { + kfree(args->devinfo); + args->devinfo = NULL; + return -ENOENT; + } return 0; } -static void -ff_layout_cleanup_layoutstats(struct nfs42_layoutstat_data *data) -{ - struct nfs4_ff_layout_mirror *mirror; - int i; - - for (i = 0; i < data->args.num_dev; i++) { - mirror = data->args.devinfo[i].layout_private; - data->args.devinfo[i].layout_private = NULL; - ff_layout_put_mirror(mirror); - } -} - static struct pnfs_layoutdriver_type flexfilelayout_type = { .id = LAYOUT_FLEX_FILES, .name = "LAYOUT_FLEX_FILES", @@ -2284,10 +2393,9 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = { .read_pagelist = ff_layout_read_pagelist, .write_pagelist = ff_layout_write_pagelist, .alloc_deviceid_node = ff_layout_alloc_deviceid_node, - .encode_layoutreturn = ff_layout_encode_layoutreturn, + .prepare_layoutreturn = ff_layout_prepare_layoutreturn, .sync = pnfs_nfs_generic_sync, .prepare_layoutstats = ff_layout_prepare_layoutstats, - .cleanup_layoutstats = ff_layout_cleanup_layoutstats, }; static int __init nfs4flexfilelayout_init(void) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 3ee0c9fcea76..f4f39b0ab09b 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -21,6 +21,7 @@ /* LAYOUTSTATS report interval in ms */ #define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L) +#define FF_LAYOUTSTATS_MAXDEV 4 struct nfs4_ff_ds_version { u32 version; @@ -73,6 +74,7 @@ struct nfs4_ff_layout_mirror { struct list_head mirrors; u32 ds_count; u32 efficiency; + struct nfs4_deviceid devid; struct nfs4_ff_layout_ds *mirror_ds; u32 fh_versions_cnt; struct nfs_fh *fh_versions; @@ -81,12 +83,15 @@ struct nfs4_ff_layout_mirror { struct rpc_cred __rcu *rw_cred; 
atomic_t ref; spinlock_t lock; + unsigned long flags; struct nfs4_ff_layoutstat read_stat; struct nfs4_ff_layoutstat write_stat; ktime_t start_time; u32 report_interval; }; +#define NFS4_FF_MIRROR_STAT_AVAIL (0) + struct nfs4_ff_layout_segment { struct pnfs_layout_segment generic_hdr; u64 stripe_unit; @@ -103,6 +108,14 @@ struct nfs4_flexfile_layout { ktime_t last_report_time; /* Layoutstat report times */ }; +struct nfs4_flexfile_layoutreturn_args { + struct list_head errors; + struct nfs42_layoutstat_devinfo devinfo[FF_LAYOUTSTATS_MAXDEV]; + unsigned int num_errors; + unsigned int num_dev; + struct page *pages[1]; +}; + static inline struct nfs4_flexfile_layout * FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo) { @@ -180,9 +193,12 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, struct nfs4_ff_layout_mirror *mirror, u64 offset, u64 length, int status, enum nfs_opnum4 opnum, gfp_t gfp_flags); -int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, - struct xdr_stream *xdr, int *count, - const struct pnfs_layout_range *range); +int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head); +void ff_layout_free_ds_ioerr(struct list_head *head); +unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range, + struct list_head *head, + unsigned int maxnum); struct nfs_fh * nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx); @@ -197,7 +213,6 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, struct inode *inode); struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, struct rpc_cred *mdscred); -bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg); bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg); bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index f7a3f6b05369..e5a6f248697b 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -20,9 +20,11 @@ static unsigned int dataserver_timeo = NFS_DEF_TCP_RETRANS; static unsigned int dataserver_retrans; +static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg); + void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds) { - if (mirror_ds) + if (!IS_ERR_OR_NULL(mirror_ds)) nfs4_put_deviceid_node(&mirror_ds->id_node); } @@ -175,19 +177,36 @@ out_err: static void ff_layout_mark_devid_invalid(struct pnfs_layout_segment *lseg, struct nfs4_deviceid_node *devid) { - nfs4_mark_deviceid_unavailable(devid); + nfs4_delete_deviceid(devid->ld, devid->nfs_client, &devid->deviceid); if (!ff_layout_has_available_ds(lseg)) pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); } static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg, - struct nfs4_ff_layout_mirror *mirror) + struct nfs4_ff_layout_mirror *mirror, + bool create) { - if (mirror == NULL || mirror->mirror_ds == NULL) { - pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, - lseg); - return false; + if (mirror == NULL || IS_ERR(mirror->mirror_ds)) + goto outerr; + if (mirror->mirror_ds == NULL) { + if (create) { + struct nfs4_deviceid_node *node; + struct pnfs_layout_hdr *lh = lseg->pls_layout; + struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); + + node = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode), + &mirror->devid, lh->plh_lc_cred, + GFP_KERNEL); + if (node) + 
mirror_ds = FF_LAYOUT_MIRROR_DS(node); + + /* check for race with another call to this function */ + if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) && + mirror_ds != ERR_PTR(-ENODEV)) + nfs4_put_deviceid_node(node); + } else + goto outerr; } if (mirror->mirror_ds->ds == NULL) { struct nfs4_deviceid_node *devid; @@ -196,15 +215,9 @@ static bool ff_layout_mirror_valid(struct pnfs_layout_segment *lseg, return false; } return true; -} - -static u64 -end_offset(u64 start, u64 len) -{ - u64 end; - - end = start + len; - return end >= start ? end : NFS4_MAX_UINT64; +outerr: + pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg); + return false; } static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, @@ -212,8 +225,8 @@ static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, { u64 end; - end = max_t(u64, end_offset(err->offset, err->length), - end_offset(offset, length)); + end = max_t(u64, pnfs_end_offset(err->offset, err->length), + pnfs_end_offset(offset, length)); err->offset = min_t(u64, err->offset, offset); err->length = end - err->offset; } @@ -235,9 +248,9 @@ ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1, ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid)); if (ret != 0) return ret; - if (end_offset(e1->offset, e1->length) < e2->offset) + if (pnfs_end_offset(e1->offset, e1->length) < e2->offset) return -1; - if (e1->offset > end_offset(e2->offset, e2->length)) + if (e1->offset > pnfs_end_offset(e2->offset, e2->length)) return 1; /* If ranges overlap or are contiguous, they are the same */ return 0; @@ -263,8 +276,9 @@ ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, } /* Entries match, so merge "err" into "dserr" */ extend_ds_error(dserr, err->offset, err->length); - list_del(&err->list); + list_replace(&err->list, &dserr->list); kfree(err); + return; } list_add_tail(&dserr->list, head); @@ -331,7 +345,7 @@ nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx) struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); struct nfs_fh *fh = NULL; - if (!ff_layout_mirror_valid(lseg, mirror)) { + if (!ff_layout_mirror_valid(lseg, mirror, false)) { pr_err_ratelimited("NFS: %s: No data server for mirror offset index %d\n", __func__, mirror_idx); goto out; @@ -371,7 +385,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, struct nfs_server *s = NFS_SERVER(ino); unsigned int max_payload; - if (!ff_layout_mirror_valid(lseg, mirror)) { + if (!ff_layout_mirror_valid(lseg, mirror, true)) { pr_err_ratelimited("NFS: %s: No data server for offset index %d\n", __func__, ds_idx); goto out; @@ -393,8 +407,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, dataserver_retrans, mirror->mirror_ds->ds_versions[0].version, - mirror->mirror_ds->ds_versions[0].minor_version, - RPC_AUTH_UNIX); + mirror->mirror_ds->ds_versions[0].minor_version); /* connect success, check rsize/wsize limit */ if (ds->ds_clp) { @@ -457,28 +470,26 @@ nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx, } } -static bool is_range_intersecting(u64 offset1, u64 length1, - u64 offset2, u64 length2) +void ff_layout_free_ds_ioerr(struct list_head *head) { - u64 end1 = end_offset(offset1, length1); - u64 end2 = end_offset(offset2, length2); + struct nfs4_ff_layout_ds_err *err; - return (end1 == NFS4_MAX_UINT64 || end1 > offset2) && - (end2 == NFS4_MAX_UINT64 || end2 > offset1); + while (!list_empty(head)) { 
+ err = list_first_entry(head, + struct nfs4_ff_layout_ds_err, + list); + list_del(&err->list); + kfree(err); + } } /* called with inode i_lock held */ -int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, - struct xdr_stream *xdr, int *count, - const struct pnfs_layout_range *range) +int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head) { - struct nfs4_ff_layout_ds_err *err, *n; + struct nfs4_ff_layout_ds_err *err; __be32 *p; - list_for_each_entry_safe(err, n, &flo->error_list, list) { - if (!is_range_intersecting(err->offset, err->length, - range->offset, range->length)) - continue; + list_for_each_entry(err, head, list) { /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE) * + array length + deviceid(NFS4_DEVICEID4_SIZE) * + status(4) + opnum(4) @@ -497,17 +508,59 @@ int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, NFS4_DEVICEID4_SIZE); *p++ = cpu_to_be32(err->status); *p++ = cpu_to_be32(err->opnum); - *count += 1; - list_del(&err->list); - dprintk("%s: offset %llu length %llu status %d op %d count %d\n", + dprintk("%s: offset %llu length %llu status %d op %d\n", __func__, err->offset, err->length, err->status, - err->opnum, *count); - kfree(err); + err->opnum); } return 0; } +static +unsigned int do_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range, + struct list_head *head, + unsigned int maxnum) +{ + struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo); + struct inode *inode = lo->plh_inode; + struct nfs4_ff_layout_ds_err *err, *n; + unsigned int ret = 0; + + spin_lock(&inode->i_lock); + list_for_each_entry_safe(err, n, &flo->error_list, list) { + if (!pnfs_is_range_intersecting(err->offset, + pnfs_end_offset(err->offset, err->length), + range->offset, + pnfs_end_offset(range->offset, range->length))) + continue; + if (!maxnum) + break; + list_move(&err->list, head); + maxnum--; + ret++; + } + spin_unlock(&inode->i_lock); + return ret; +} + +unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, + const struct pnfs_layout_range *range, + struct list_head *head, + unsigned int maxnum) +{ + unsigned int ret; + + ret = do_layout_fetch_ds_ioerr(lo, range, head, maxnum); + /* If we're over the max, discard all remaining entries */ + if (ret == maxnum) { + LIST_HEAD(discard); + do_layout_fetch_ds_ioerr(lo, range, &discard, -1); + ff_layout_free_ds_ioerr(&discard); + } + return ret; +} + static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; @@ -516,7 +569,11 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (mirror && mirror->mirror_ds) { + if (mirror) { + if (!mirror->mirror_ds) + return true; + if (IS_ERR(mirror->mirror_ds)) + continue; devid = &mirror->mirror_ds->id_node; if (!ff_layout_test_devid_unavailable(devid)) return true; @@ -534,8 +591,10 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (!mirror || !mirror->mirror_ds) + if (!mirror || IS_ERR(mirror->mirror_ds)) return false; + if (!mirror->mirror_ds) + continue; devid = &mirror->mirror_ds->id_node; if (ff_layout_test_devid_unavailable(devid)) return false; @@ -544,7 +603,7 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) return FF_LAYOUT_MIRROR_COUNT(lseg) != 0; } 
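
For context: ff_layout_fetch_ds_ioerr() above separates harvesting I/O error records from encoding them. Entries are moved onto a caller-owned list under the inode lock, capped at maxnum, and a second pass drains and frees anything beyond the cap so stale reports cannot pile up. A condensed sketch of the pattern, with hypothetical names (not the kernel's code):

#include <linux/list.h>
#include <linux/slab.h>

struct err_rec {			/* stand-in for nfs4_ff_layout_ds_err */
	struct list_head list;
};

/* Move up to "max" records from src onto dst; returns the number moved. */
static unsigned int grab_errors(struct list_head *src, struct list_head *dst,
				unsigned int max)
{
	struct err_rec *err, *tmp;
	unsigned int ret = 0;

	list_for_each_entry_safe(err, tmp, src, list) {
		if (!max)
			break;
		list_move(&err->list, dst);
		max--;
		ret++;
	}
	return ret;
}

/* Free whatever the encoder will never get to see. */
static void drain_errors(struct list_head *head)
{
	struct err_rec *err;

	while (!list_empty(head)) {
		err = list_first_entry(head, struct err_rec, list);
		list_del(&err->list);
		kfree(err);
	}
}
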
-bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) +static bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) { if (lseg->pls_range.iomode == IOMODE_READ) return ff_read_layout_has_available_ds(lseg); diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c index a608ffd28acc..391dafaf9182 100644 --- a/fs/nfs/getroot.c +++ b/fs/nfs/getroot.c @@ -30,7 +30,7 @@ #include <linux/namei.h> #include <linux/security.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index bf4ec5ecc97e..5ca4d96b1942 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -39,7 +39,7 @@ #include <linux/compat.h> #include <linux/freezer.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "nfs4_fs.h" #include "callback.h" @@ -160,6 +160,43 @@ int nfs_sync_mapping(struct address_space *mapping) return ret; } +static int nfs_attribute_timeout(struct inode *inode) +{ + struct nfs_inode *nfsi = NFS_I(inode); + + return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); +} + +static bool nfs_check_cache_invalid_delegated(struct inode *inode, unsigned long flags) +{ + unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + + /* Special case for the pagecache or access cache */ + if (flags == NFS_INO_REVAL_PAGECACHE && + !(cache_validity & NFS_INO_REVAL_FORCED)) + return false; + return (cache_validity & flags) != 0; +} + +static bool nfs_check_cache_invalid_not_delegated(struct inode *inode, unsigned long flags) +{ + unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + + if ((cache_validity & flags) != 0) + return true; + if (nfs_attribute_timeout(inode)) + return true; + return false; +} + +bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) +{ + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + return nfs_check_cache_invalid_delegated(inode, flags); + + return nfs_check_cache_invalid_not_delegated(inode, flags); +} + static void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) { struct nfs_inode *nfsi = NFS_I(inode); @@ -634,15 +671,28 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, } EXPORT_SYMBOL_GPL(nfs_setattr_update_inode); -static void nfs_request_parent_use_readdirplus(struct dentry *dentry) +static void nfs_readdirplus_parent_cache_miss(struct dentry *dentry) { struct dentry *parent; + if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) + return; parent = dget_parent(dentry); nfs_force_use_readdirplus(d_inode(parent)); dput(parent); } +static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry) +{ + struct dentry *parent; + + if (!nfs_server_capable(d_inode(dentry), NFS_CAP_READDIRPLUS)) + return; + parent = dget_parent(dentry); + nfs_advise_use_readdirplus(d_inode(parent)); + dput(parent); +} + static bool nfs_need_revalidate_inode(struct inode *inode) { if (NFS_I(inode)->cache_validity & @@ -683,10 +733,10 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) if (need_atime || nfs_need_revalidate_inode(inode)) { struct nfs_server *server = NFS_SERVER(inode); - if (server->caps & NFS_CAP_READDIRPLUS) - nfs_request_parent_use_readdirplus(dentry); + nfs_readdirplus_parent_cache_miss(dentry); err = __nfs_revalidate_inode(server, inode); - } + } else + nfs_readdirplus_parent_cache_hit(dentry); if (!err) { generic_fillattr(inode, stat); stat->ino = nfs_compat_user_ino64(NFS_FILEID(inode)); @@ -702,8 +752,7 @@ 
EXPORT_SYMBOL_GPL(nfs_getattr); static void nfs_init_lock_context(struct nfs_lock_context *l_ctx) { atomic_set(&l_ctx->count, 1); - l_ctx->lockowner.l_owner = current->files; - l_ctx->lockowner.l_pid = current->tgid; + l_ctx->lockowner = current->files; INIT_LIST_HEAD(&l_ctx->list); atomic_set(&l_ctx->io_count, 0); } @@ -714,9 +763,7 @@ static struct nfs_lock_context *__nfs_find_lock_context(struct nfs_open_context struct nfs_lock_context *pos = head; do { - if (pos->lockowner.l_owner != current->files) - continue; - if (pos->lockowner.l_pid != current->tgid) + if (pos->lockowner != current->files) continue; atomic_inc(&pos->count); return pos; @@ -785,6 +832,8 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) if (!is_sync) return; inode = d_inode(ctx->dentry); + if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) + return; nfsi = NFS_I(inode); if (inode->i_mapping->nrpages == 0) return; @@ -799,7 +848,9 @@ void nfs_close_context(struct nfs_open_context *ctx, int is_sync) } EXPORT_SYMBOL_GPL(nfs_close_context); -struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode) +struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, + fmode_t f_mode, + struct file *filp) { struct nfs_open_context *ctx; struct rpc_cred *cred = rpc_lookup_cred(); @@ -818,6 +869,7 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f ctx->mode = f_mode; ctx->flags = 0; ctx->error = 0; + ctx->flock_owner = (fl_owner_t)filp; nfs_init_lock_context(&ctx->lock_context); ctx->lock_context.open_context = ctx; INIT_LIST_HEAD(&ctx->list); @@ -942,7 +994,7 @@ int nfs_open(struct inode *inode, struct file *filp) { struct nfs_open_context *ctx; - ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode); + ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp); if (IS_ERR(ctx)) return PTR_ERR(ctx); nfs_file_set_open_context(filp, ctx); @@ -1031,13 +1083,6 @@ out: return status; } -int nfs_attribute_timeout(struct inode *inode) -{ - struct nfs_inode *nfsi = NFS_I(inode); - - return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); -} - int nfs_attribute_cache_expired(struct inode *inode) { if (nfs_have_delegated_attributes(inode)) @@ -1060,15 +1105,6 @@ int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode) } EXPORT_SYMBOL_GPL(nfs_revalidate_inode); -int nfs_revalidate_inode_rcu(struct nfs_server *server, struct inode *inode) -{ - if (!(NFS_I(inode)->cache_validity & - (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) - && !nfs_attribute_cache_expired(inode)) - return NFS_STALE(inode) ? 
-ESTALE : 0; - return -ECHILD; -} - static int nfs_invalidate_mapping(struct inode *inode, struct address_space *mapping) { struct nfs_inode *nfsi = NFS_I(inode); @@ -1099,13 +1135,10 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map return 0; } -static bool nfs_mapping_need_revalidate_inode(struct inode *inode) +bool nfs_mapping_need_revalidate_inode(struct inode *inode) { - if (nfs_have_delegated_attributes(inode)) - return false; - return (NFS_I(inode)->cache_validity & NFS_INO_REVAL_PAGECACHE) - || nfs_attribute_timeout(inode) - || NFS_STALE(inode); + return nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE) || + NFS_STALE(inode); } int nfs_revalidate_mapping_rcu(struct inode *inode) @@ -1317,7 +1350,7 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat invalid |= NFS_INO_INVALID_ATIME; if (invalid != 0) - nfs_set_cache_invalid(inode, invalid); + nfs_set_cache_invalid(inode, invalid | NFS_INO_REVAL_FORCED); nfsi->read_cache_jiffies = fattr->time_start; return 0; @@ -1517,13 +1550,6 @@ static int nfs_post_op_update_inode_locked(struct inode *inode, struct nfs_fattr { unsigned long invalid = NFS_INO_INVALID_ATTR; - /* - * Don't revalidate the pagecache if we hold a delegation, but do - * force an attribute update - */ - if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) - invalid = NFS_INO_INVALID_ATTR|NFS_INO_REVAL_FORCED; - if (S_ISDIR(inode->i_mode)) invalid |= NFS_INO_INVALID_DATA; nfs_set_cache_invalid(inode, invalid); @@ -2015,7 +2041,7 @@ static void nfsiod_stop(void) destroy_workqueue(wq); } -int nfs_net_id; +unsigned int nfs_net_id; EXPORT_SYMBOL_GPL(nfs_net_id); static int nfs_net_init(struct net *net) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 80bcc0befb07..09ca5095c04e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -154,8 +154,7 @@ extern const struct rpc_program nfs_program; extern void nfs_clients_init(struct net *net); extern struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *); int nfs_create_rpc_client(struct nfs_client *, const struct nfs_client_initdata *, rpc_authflavor_t); -struct nfs_client *nfs_get_client(const struct nfs_client_initdata *, - rpc_authflavor_t); +struct nfs_client *nfs_get_client(const struct nfs_client_initdata *); int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *, struct nfs_fattr *); void nfs_server_insert_lists(struct nfs_server *); void nfs_server_remove_lists(struct nfs_server *); @@ -194,14 +193,13 @@ extern struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, - u32 minor_version, - rpc_authflavor_t au_flavor); + u32 minor_version); extern struct rpc_clnt *nfs4_find_or_create_ds_client(struct nfs_client *, struct inode *); extern struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, - unsigned int ds_retrans, rpc_authflavor_t au_flavor); + unsigned int ds_retrans); #ifdef CONFIG_PROC_FS extern int __init nfs_fs_proc_init(void); extern void nfs_fs_proc_exit(void); @@ -346,6 +344,7 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp, const struct nfs_client_initdata *); /* dir.c */ +extern void nfs_advise_use_readdirplus(struct inode *dir); extern void nfs_force_use_readdirplus(struct inode *dir); extern unsigned long nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc); @@ -382,6 +381,7 
@@ extern int nfs_drop_inode(struct inode *); extern void nfs_clear_inode(struct inode *); extern void nfs_evict_inode(struct inode *); void nfs_zap_acl_cache(struct inode *inode); +extern bool nfs_check_cache_invalid(struct inode *, unsigned long); extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); extern int nfs_wait_atomic_killable(atomic_t *p); diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h index fbce0d885d4c..5fbd2bde91ba 100644 --- a/fs/nfs/netns.h +++ b/fs/nfs/netns.h @@ -35,6 +35,6 @@ struct nfs_net { #endif }; -extern int nfs_net_id; +extern unsigned int nfs_net_id; #endif diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index ee753547fb0a..7879f2a0fcfd 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -78,8 +78,7 @@ struct nfs_server *nfs3_clone_server(struct nfs_server *source, */ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, - int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, - rpc_authflavor_t au_flavor) + int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans) { struct rpc_timeout ds_timeout; struct nfs_client *mds_clp = mds_srv->nfs_client; @@ -106,7 +105,7 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv, /* Use the MDS nfs_client cl_ipaddr. */ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, au_flavor); + clp = nfs_get_client(&cl_init); return clp; } diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 608501971fe0..d12ff9385f49 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -397,10 +397,13 @@ static void nfs42_layoutstat_release(void *calldata) { struct nfs42_layoutstat_data *data = calldata; - struct nfs_server *nfss = NFS_SERVER(data->args.inode); + struct nfs42_layoutstat_devinfo *devinfo = data->args.devinfo; + int i; - if (nfss->pnfs_curr_ld->cleanup_layoutstats) - nfss->pnfs_curr_ld->cleanup_layoutstats(data); + for (i = 0; i < data->args.num_dev; i++) { + if (devinfo[i].ld_private.ops && devinfo[i].ld_private.ops->free) + devinfo[i].ld_private.ops->free(&devinfo[i].ld_private); + } pnfs_put_layout_hdr(NFS_I(data->args.inode)->layout); smp_mb__before_atomic(); diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 8b2605882a20..6c7296454bbc 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -181,8 +181,9 @@ static void encode_layoutstats(struct xdr_stream *xdr, NFS4_DEVICEID4_SIZE); /* Encode layoutupdate4 */ *p++ = cpu_to_be32(devinfo->layout_type); - if (devinfo->layoutstats_encode != NULL) - devinfo->layoutstats_encode(xdr, args, devinfo); + if (devinfo->ld_private.ops) + devinfo->ld_private.ops->encode(xdr, args, + &devinfo->ld_private); else encode_uint32(xdr, 0); } diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 1452177c822d..665165833660 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -457,7 +457,7 @@ extern void nfs41_handle_server_scope(struct nfs_client *, extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp); extern int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl); extern int nfs4_select_rw_stateid(struct nfs4_state *, fmode_t, - const struct nfs_lockowner *, nfs4_stateid *, + const struct nfs_lock_context *, nfs4_stateid *, struct rpc_cred **); extern struct nfs_seqid *nfs_alloc_seqid(struct nfs_seqid_counter *counter, gfp_t gfp_mask); diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 074ac7131459..5ae9d64ea08b 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -464,6 
+464,11 @@ static bool nfs4_match_client_owner_id(const struct nfs_client *clp1, return strcmp(clp1->cl_owner_id, clp2->cl_owner_id) == 0; } +static bool nfs4_same_verifier(nfs4_verifier *v1, nfs4_verifier *v2) +{ + return memcmp(v1->data, v2->data, sizeof(v1->data)) == 0; +} + /** * nfs40_walk_client_list - Find server that recognizes a client ID * @@ -521,7 +526,21 @@ int nfs40_walk_client_list(struct nfs_client *new, if (!nfs4_match_client_owner_id(pos, new)) continue; - + /* + * We just sent a new SETCLIENTID, which should have + * caused the server to return a new cl_confirm. So if + * cl_confirm is the same, then this is a different + * server that just returned the same cl_confirm by + * coincidence: + */ + if ((new != pos) && nfs4_same_verifier(&pos->cl_confirm, + &new->cl_confirm)) + continue; + /* + * But if the cl_confirm's are different, then the only + * way that a SETCLIENTID_CONFIRM to pos can succeed is + * if new and pos point to the same server: + */ atomic_inc(&pos->cl_count); spin_unlock(&nn->nfs_client_lock); @@ -534,6 +553,7 @@ int nfs40_walk_client_list(struct nfs_client *new, break; case 0: nfs4_swap_callback_idents(pos, new); + pos->cl_confirm = new->cl_confirm; prev = NULL; *result = pos; @@ -881,7 +901,6 @@ static int nfs4_set_client(struct nfs_server *server, const struct sockaddr *addr, const size_t addrlen, const char *ip_addr, - rpc_authflavor_t authflavour, int proto, const struct rpc_timeout *timeparms, u32 minorversion, struct net *net) { @@ -907,7 +926,7 @@ static int nfs4_set_client(struct nfs_server *server, set_bit(NFS_CS_MIGRATION, &cl_init.init_flags); /* Allocate or find a client reference we can use */ - clp = nfs_get_client(&cl_init, authflavour); + clp = nfs_get_client(&cl_init); if (IS_ERR(clp)) { error = PTR_ERR(clp); goto error; @@ -948,7 +967,7 @@ error: struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, const struct sockaddr *ds_addr, int ds_addrlen, int ds_proto, unsigned int ds_timeo, unsigned int ds_retrans, - u32 minor_version, rpc_authflavor_t au_flavor) + u32 minor_version) { struct rpc_timeout ds_timeout; struct nfs_client *mds_clp = mds_srv->nfs_client; @@ -979,7 +998,7 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv, * (section 13.1 RFC 5661). 
*/ nfs_init_timeout_values(&ds_timeout, ds_proto, ds_timeo, ds_retrans); - clp = nfs_get_client(&cl_init, au_flavor); + clp = nfs_get_client(&cl_init); dprintk("<-- %s %p\n", __func__, clp); return clp; @@ -1103,7 +1122,6 @@ static int nfs4_init_server(struct nfs_server *server, (const struct sockaddr *)&data->nfs_server.address, data->nfs_server.addrlen, data->client_address, - data->selected_flavor, data->nfs_server.protocol, &timeparms, data->minorversion, @@ -1200,7 +1218,6 @@ struct nfs_server *nfs4_create_referral_server(struct nfs_clone_mount *data, data->addr, data->addrlen, parent_client->cl_ipaddr, - data->authflavor, rpc_protocol(parent_server->client), parent_server->client->cl_timeout, parent_client->cl_mvops->minor_version, @@ -1311,7 +1328,6 @@ int nfs4_update_server(struct nfs_server *server, const char *hostname, nfs_server_remove_lists(server); error = nfs4_set_client(server, hostname, sap, salen, buf, - clp->cl_rpcclient->cl_auth->au_flavor, clp->cl_proto, clnt->cl_timeout, clp->cl_minorversion, net); nfs_put_client(clp); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 89a77950e0b0..0efba77789b9 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -57,7 +57,7 @@ nfs4_file_open(struct inode *inode, struct file *filp) parent = dget_parent(dentry); dir = d_inode(parent); - ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode); + ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp); err = PTR_ERR(ctx); if (IS_ERR(ctx)) goto out; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 241da19b7da4..ecc151697fd4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -38,7 +38,6 @@ #include <linux/mm.h> #include <linux/delay.h> #include <linux/errno.h> -#include <linux/file.h> #include <linux/string.h> #include <linux/ratelimit.h> #include <linux/printk.h> @@ -94,7 +93,7 @@ static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fa static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs_open_context *ctx, struct nfs4_label *ilabel, struct nfs4_label *olabel); #ifdef CONFIG_NFS_V4_1 static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *, @@ -226,7 +225,6 @@ static const u32 nfs4_pnfs_open_bitmap[3] = { static const u32 nfs4_open_noattr_bitmap[3] = { FATTR4_WORD0_TYPE - | FATTR4_WORD0_CHANGE | FATTR4_WORD0_FILEID, }; @@ -817,6 +815,10 @@ static int nfs41_sequence_process(struct rpc_task *task, case -NFS4ERR_SEQ_FALSE_RETRY: ++slot->seq_nr; goto retry_nowait; + case -NFS4ERR_DEADSESSION: + case -NFS4ERR_BADSESSION: + nfs4_schedule_session_recovery(session, res->sr_status); + goto retry_nowait; default: /* Just update the slot sequence no. 
*/ slot->seq_done = 1; @@ -1080,15 +1082,24 @@ int nfs4_call_sync(struct rpc_clnt *clnt, return nfs4_call_sync_sequence(clnt, server, msg, args, res); } -static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo) +static void update_changeattr(struct inode *dir, struct nfs4_change_info *cinfo, + unsigned long timestamp) { struct nfs_inode *nfsi = NFS_I(dir); spin_lock(&dir->i_lock); nfsi->cache_validity |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA; - if (!cinfo->atomic || cinfo->before != dir->i_version) + if (cinfo->atomic && cinfo->before == dir->i_version) { + nfsi->cache_validity &= ~NFS_INO_REVAL_PAGECACHE; + nfsi->attrtimeo_timestamp = jiffies; + } else { nfs_force_lookup_revalidate(dir); + if (cinfo->before != dir->i_version) + nfsi->cache_validity |= NFS_INO_INVALID_ACCESS | + NFS_INO_INVALID_ACL; + } dir->i_version = cinfo->after; + nfsi->read_cache_jiffies = timestamp; nfsi->attr_gencount = nfs_inc_attr_generation_counter(); nfs_fscache_invalidate(dir); spin_unlock(&dir->i_lock); @@ -1221,6 +1232,8 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, atomic_inc(&sp->so_count); p->o_arg.open_flags = flags; p->o_arg.fmode = fmode & (FMODE_READ|FMODE_WRITE); + p->o_arg.umask = current_umask(); + p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); p->o_arg.share_access = nfs4_map_atomic_open_share(server, fmode, flags); /* don't put an ACCESS op in OPEN compound if O_EXCL, because ACCESS @@ -1228,8 +1241,16 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, if (!(flags & O_EXCL)) { /* ask server to check for all possible rights as results * are cached */ - p->o_arg.access = NFS4_ACCESS_READ | NFS4_ACCESS_MODIFY | - NFS4_ACCESS_EXTEND | NFS4_ACCESS_EXECUTE; + switch (p->o_arg.claim) { + default: + break; + case NFS4_OPEN_CLAIM_NULL: + case NFS4_OPEN_CLAIM_FH: + p->o_arg.access = NFS4_ACCESS_READ | + NFS4_ACCESS_MODIFY | + NFS4_ACCESS_EXTEND | + NFS4_ACCESS_EXECUTE; + } } p->o_arg.clientid = server->nfs_client->cl_clientid; p->o_arg.id.create_time = ktime_to_ns(sp->so_seqid.create_time); @@ -1239,7 +1260,6 @@ static struct nfs4_opendata *nfs4_opendata_alloc(struct dentry *dentry, p->o_arg.bitmask = nfs4_bitmask(server, label); p->o_arg.open_bitmap = &nfs4_fattr_bitmap[0]; p->o_arg.label = nfs4_label_copy(p->a_label, label); - p->o_arg.claim = nfs4_map_atomic_open_claim(server, claim); switch (p->o_arg.claim) { case NFS4_OPEN_CLAIM_NULL: case NFS4_OPEN_CLAIM_DELEGATE_CUR: @@ -2372,11 +2392,13 @@ static int _nfs4_proc_open(struct nfs4_opendata *data) nfs_fattr_map_and_free_names(server, &data->f_attr); if (o_arg->open_flags & O_CREAT) { - update_changeattr(dir, &o_res->cinfo); if (o_arg->open_flags & O_EXCL) data->file_created = 1; else if (o_res->cinfo.before != o_res->cinfo.after) data->file_created = 1; + if (data->file_created || dir->i_version != o_res->cinfo.after) + update_changeattr(dir, &o_res->cinfo, + o_res->f_attr->time_start); } if ((o_res->rflags & NFS4_OPEN_RESULT_LOCKTYPE_POSIX) == 0) server->caps &= ~NFS_CAP_POSIX_LOCK; @@ -2819,7 +2841,7 @@ static int _nfs4_do_open(struct inode *dir, nfs_fattr_init(opendata->o_res.f_attr); status = nfs4_do_setattr(state->inode, cred, opendata->o_res.f_attr, sattr, - state, label, olabel); + ctx, label, olabel); if (status == 0) { nfs_setattr_update_inode(state->inode, sattr, opendata->o_res.f_attr); @@ -2914,7 +2936,7 @@ static int _nfs4_do_setattr(struct inode *inode, struct nfs_setattrargs *arg, struct nfs_setattrres *res, struct rpc_cred *cred, - struct 
nfs4_state *state) + struct nfs_open_context *ctx) { struct nfs_server *server = NFS_SERVER(inode); struct rpc_message msg = { @@ -2937,15 +2959,17 @@ static int _nfs4_do_setattr(struct inode *inode, if (nfs4_copy_delegation_stateid(inode, fmode, &arg->stateid, &delegation_cred)) { /* Use that stateid */ - } else if (truncate && state != NULL) { - struct nfs_lockowner lockowner = { - .l_owner = current->files, - .l_pid = current->tgid, - }; - if (!nfs4_valid_open_stateid(state)) + } else if (truncate && ctx != NULL) { + struct nfs_lock_context *l_ctx; + if (!nfs4_valid_open_stateid(ctx->state)) return -EBADF; - if (nfs4_select_rw_stateid(state, FMODE_WRITE, &lockowner, - &arg->stateid, &delegation_cred) == -EIO) + l_ctx = nfs_get_lock_context(ctx); + if (IS_ERR(l_ctx)) + return PTR_ERR(l_ctx); + status = nfs4_select_rw_stateid(ctx->state, FMODE_WRITE, l_ctx, + &arg->stateid, &delegation_cred); + nfs_put_lock_context(l_ctx); + if (status == -EIO) return -EBADF; } else nfs4_stateid_copy(&arg->stateid, &zero_stateid); @@ -2955,7 +2979,7 @@ static int _nfs4_do_setattr(struct inode *inode, status = nfs4_call_sync(server->client, server, &msg, &arg->seq_args, &res->seq_res, 1); put_rpccred(delegation_cred); - if (status == 0 && state != NULL) + if (status == 0 && ctx != NULL) renew_lease(server, timestamp); trace_nfs4_setattr(inode, &arg->stateid, status); return status; @@ -2963,10 +2987,11 @@ static int _nfs4_do_setattr(struct inode *inode, static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, struct nfs_fattr *fattr, struct iattr *sattr, - struct nfs4_state *state, struct nfs4_label *ilabel, + struct nfs_open_context *ctx, struct nfs4_label *ilabel, struct nfs4_label *olabel) { struct nfs_server *server = NFS_SERVER(inode); + struct nfs4_state *state = ctx ? 
ctx->state : NULL; struct nfs_setattrargs arg = { .fh = NFS_FH(inode), .iap = sattr, @@ -2991,7 +3016,7 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred, arg.bitmask = nfs4_bitmask(server, olabel); do { - err = _nfs4_do_setattr(inode, &arg, &res, cred, state); + err = _nfs4_do_setattr(inode, &arg, &res, cred, ctx); switch (err) { case -NFS4ERR_OPENMODE: if (!(sattr->ia_valid & ATTR_SIZE)) { @@ -3028,10 +3053,15 @@ struct nfs4_closedata { struct nfs4_state *state; struct nfs_closeargs arg; struct nfs_closeres res; + struct { + struct nfs4_layoutreturn_args arg; + struct nfs4_layoutreturn_res res; + struct nfs4_xdr_opaque_data ld_private; + u32 roc_barrier; + bool roc; + } lr; struct nfs_fattr fattr; unsigned long timestamp; - bool roc; - u32 roc_barrier; }; static void nfs4_free_closedata(void *data) @@ -3040,8 +3070,9 @@ static void nfs4_free_closedata(void *data) struct nfs4_state_owner *sp = calldata->state->owner; struct super_block *sb = calldata->state->inode->i_sb; - if (calldata->roc) - pnfs_roc_release(calldata->state->inode); + if (calldata->lr.roc) + pnfs_roc_release(&calldata->lr.arg, &calldata->lr.res, + calldata->res.lr_ret); nfs4_put_open_state(calldata->state); nfs_free_seqid(calldata->arg.seqid); nfs4_put_state_owner(sp); @@ -3060,17 +3091,50 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (!nfs4_sequence_done(task, &calldata->res.seq_res)) return; trace_nfs4_close(state, &calldata->arg, &calldata->res, task->tk_status); + + /* Handle Layoutreturn errors */ + if (calldata->arg.lr_args && task->tk_status != 0) { + switch (calldata->res.lr_ret) { + default: + calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; + break; + case 0: + calldata->arg.lr_args = NULL; + calldata->res.lr_res = NULL; + break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_EXPIRED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_UNKNOWN_LAYOUTTYPE: + case -NFS4ERR_WRONG_CRED: + calldata->arg.lr_args = NULL; + calldata->res.lr_res = NULL; + calldata->res.lr_ret = 0; + rpc_restart_call_prepare(task); + return; + } + } + /* hmm. we are done with the inode, and in the process of freeing * the state_owner. 
we keep this around to process errors */ switch (task->tk_status) { case 0: res_stateid = &calldata->res.stateid; - if (calldata->roc) - pnfs_roc_set_barrier(state->inode, - calldata->roc_barrier); renew_lease(server, calldata->timestamp); break; + case -NFS4ERR_ACCESS: + if (calldata->arg.bitmask != NULL) { + calldata->arg.bitmask = NULL; + calldata->res.fattr = NULL; + task->tk_status = 0; + rpc_restart_call_prepare(task); + goto out_release; + + } + break; case -NFS4ERR_ADMIN_REVOKED: case -NFS4ERR_STALE_STATEID: case -NFS4ERR_EXPIRED: @@ -3096,7 +3160,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) res_stateid, calldata->arg.fmode); out_release: nfs_release_seqid(calldata->arg.seqid); - nfs_refresh_inode(calldata->inode, calldata->res.fattr); + nfs_refresh_inode(calldata->inode, &calldata->fattr); dprintk("%s: done, ret = %d!\n", __func__, task->tk_status); } @@ -3144,21 +3208,30 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) goto out_no_action; } - if (nfs4_wait_on_layoutreturn(inode, task)) { + if (!calldata->lr.roc && nfs4_wait_on_layoutreturn(inode, task)) { nfs_release_seqid(calldata->arg.seqid); goto out_wait; } if (calldata->arg.fmode == 0) task->tk_msg.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_CLOSE]; - if (calldata->roc) - pnfs_roc_get_barrier(inode, &calldata->roc_barrier); + + if (calldata->arg.fmode == 0 || calldata->arg.fmode == FMODE_READ) { + /* Close-to-open cache consistency revalidation */ + if (!nfs4_have_delegation(inode, FMODE_READ)) + calldata->arg.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask; + else + calldata->arg.bitmask = NULL; + } calldata->arg.share_access = nfs4_map_atomic_open_share(NFS_SERVER(inode), calldata->arg.fmode, 0); - nfs_fattr_init(calldata->res.fattr); + if (calldata->res.fattr == NULL) + calldata->arg.bitmask = NULL; + else if (calldata->arg.bitmask == NULL) + calldata->res.fattr = NULL; calldata->timestamp = jiffies; if (nfs4_setup_sequence(NFS_SERVER(inode), &calldata->arg.seq_args, @@ -3179,13 +3252,6 @@ static const struct rpc_call_ops nfs4_close_ops = { .rpc_release = nfs4_free_closedata, }; -static bool nfs4_roc(struct inode *inode) -{ - if (!nfs_have_layout(inode)) - return false; - return pnfs_roc(inode); -} - /* * It is possible for data to be read/written from a mem-mapped file * after the sys_close call (which hits the vfs layer as a flush). 
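
The CLOSE path above now threads an optional LAYOUTRETURN through the same compound (the new calldata->lr bundle) and has to unpick it when only that sub-operation fails. As a reading aid, here is a minimal, self-contained C sketch of that error-handling shape; every identifier in it is invented for illustration, and the error constants merely stand in for the NFS4ERR_* stateid cases handled in nfs4_close_done(). It is not the kernel code.

	#include <stdbool.h>
	#include <stddef.h>

	enum {
		LR_OK                = 0,
		LR_ERR_BAD_STATEID   = -1,  /* stand-in for the NFS4ERR_* cases */
		LR_NOMATCHING_LAYOUT = -2,
	};

	struct lr_piggyback {
		void *lr_args;   /* non-NULL while a LAYOUTRETURN op is attached */
		void *lr_res;
		int   lr_ret;    /* result of the LAYOUTRETURN sub-operation */
	};

	/* Returns true when the caller should restart the RPC without the
	 * sub-op, mirroring the rpc_restart_call_prepare() branch above. */
	static bool lr_handle_error(struct lr_piggyback *p, int task_status)
	{
		if (p->lr_args == NULL || task_status == 0)
			return false;
		switch (p->lr_ret) {
		case LR_OK:                  /* sub-op fine, main op failed */
			p->lr_args = NULL;
			p->lr_res = NULL;
			return false;
		case LR_ERR_BAD_STATEID:     /* stale stateid: retry without it */
			p->lr_args = NULL;
			p->lr_res = NULL;
			p->lr_ret = LR_OK;
			return true;
		default:                     /* anything else: act as if no layout */
			p->lr_ret = LR_NOMATCHING_LAYOUT;
			return false;
		}
	}

The same shape recurs in nfs4_delegreturn_done() further down in this patch.
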
@@ -3232,12 +3298,19 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait) calldata->arg.seqid = alloc_seqid(&state->owner->so_seqid, gfp_mask); if (IS_ERR(calldata->arg.seqid)) goto out_free_calldata; + nfs_fattr_init(&calldata->fattr); calldata->arg.fmode = 0; - calldata->arg.bitmask = server->cache_consistency_bitmask; + calldata->lr.arg.ld_private = &calldata->lr.ld_private; calldata->res.fattr = &calldata->fattr; calldata->res.seqid = calldata->arg.seqid; calldata->res.server = server; - calldata->roc = nfs4_roc(state->inode); + calldata->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; + calldata->lr.roc = pnfs_roc(state->inode, + &calldata->lr.arg, &calldata->lr.res, msg.rpc_cred); + if (calldata->lr.roc) { + calldata->arg.lr_args = &calldata->lr.arg; + calldata->res.lr_res = &calldata->lr.res; + } nfs_sb_active(calldata->inode->i_sb); msg.rpc_argp = &calldata->arg; @@ -3290,7 +3363,7 @@ static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync) #define FATTR4_WORD1_NFS40_MASK (2*FATTR4_WORD1_MOUNTED_ON_FILEID - 1UL) #define FATTR4_WORD2_NFS41_MASK (2*FATTR4_WORD2_SUPPATTR_EXCLCREAT - 1UL) -#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_SECURITY_LABEL - 1UL) +#define FATTR4_WORD2_NFS42_MASK (2*FATTR4_WORD2_MODE_UMASK - 1UL) static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle) { @@ -3687,7 +3760,7 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, { struct inode *inode = d_inode(dentry); struct rpc_cred *cred = NULL; - struct nfs4_state *state = NULL; + struct nfs_open_context *ctx = NULL; struct nfs4_label *label = NULL; int status; @@ -3708,20 +3781,17 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, /* Search for an existing open(O_WRITE) file */ if (sattr->ia_valid & ATTR_FILE) { - struct nfs_open_context *ctx; ctx = nfs_file_open_context(sattr->ia_file); - if (ctx) { + if (ctx) cred = ctx->cred; - state = ctx->state; - } } label = nfs4_label_alloc(NFS_SERVER(inode), GFP_KERNEL); if (IS_ERR(label)) return PTR_ERR(label); - status = nfs4_do_setattr(inode, cred, fattr, sattr, state, NULL, label); + status = nfs4_do_setattr(inode, cred, fattr, sattr, ctx, NULL, label); if (status == 0) { nfs_setattr_update_inode(inode, sattr, fattr); nfs_setsecurity(inode, fattr, label); @@ -3966,18 +4036,20 @@ static int nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr, int flags) { + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_label l, *ilabel = NULL; struct nfs_open_context *ctx; struct nfs4_state *state; int status = 0; - ctx = alloc_nfs_open_context(dentry, FMODE_READ); + ctx = alloc_nfs_open_context(dentry, FMODE_READ, NULL); if (IS_ERR(ctx)) return PTR_ERR(ctx); ilabel = nfs4_label_init_security(dir, dentry, sattr, &l); - sattr->ia_mode &= ~current_umask(); + if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) + sattr->ia_mode &= ~current_umask(); state = nfs4_do_open(dir, ctx, flags, sattr, ilabel, NULL); if (IS_ERR(state)) { status = PTR_ERR(state); @@ -4004,11 +4076,12 @@ static int _nfs4_proc_remove(struct inode *dir, const struct qstr *name) .rpc_argp = &args, .rpc_resp = &res, }; + unsigned long timestamp = jiffies; int status; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 1); if (status == 0) - update_changeattr(dir, &res.cinfo); + update_changeattr(dir, &res.cinfo, timestamp); return status; } @@ -4056,7 +4129,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) if 
(nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN) return 0; - update_changeattr(dir, &res->cinfo); + if (task->tk_status == 0) + update_changeattr(dir, &res->cinfo, res->dir_attr->time_start); return 1; } @@ -4090,8 +4164,11 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN) return 0; - update_changeattr(old_dir, &res->old_cinfo); - update_changeattr(new_dir, &res->new_cinfo); + if (task->tk_status == 0) { + update_changeattr(old_dir, &res->old_cinfo, res->old_fattr->time_start); + if (new_dir != old_dir) + update_changeattr(new_dir, &res->new_cinfo, res->new_fattr->time_start); + } return 1; } @@ -4128,7 +4205,7 @@ static int _nfs4_proc_link(struct inode *inode, struct inode *dir, const struct status = nfs4_call_sync(server->client, server, &msg, &arg.seq_args, &res.seq_res, 1); if (!status) { - update_changeattr(dir, &res.cinfo); + update_changeattr(dir, &res.cinfo, res.fattr->time_start); status = nfs_post_op_update_inode(inode, res.fattr); if (!status) nfs_setsecurity(inode, res.fattr, res.label); @@ -4185,6 +4262,7 @@ static struct nfs4_createdata *nfs4_alloc_createdata(struct inode *dir, data->arg.attrs = sattr; data->arg.ftype = ftype; data->arg.bitmask = nfs4_bitmask(server, data->label); + data->arg.umask = current_umask(); data->res.server = server; data->res.fh = &data->fh; data->res.fattr = &data->fattr; @@ -4202,7 +4280,8 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, &data->arg.seq_args, &data->res.seq_res, 1); if (status == 0) { - update_changeattr(dir, &data->res.dir_cinfo); + update_changeattr(dir, &data->res.dir_cinfo, + data->res.fattr->time_start); status = nfs_instantiate(dentry, data->res.fh, data->res.fattr, data->res.label); } return status; @@ -4282,13 +4361,15 @@ out: static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; struct nfs4_label l, *label = NULL; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); - sattr->ia_mode &= ~current_umask(); + if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) + sattr->ia_mode &= ~current_umask(); do { err = _nfs4_proc_mkdir(dir, dentry, sattr, label); trace_nfs4_mkdir(dir, &dentry->d_name, err); @@ -4391,13 +4472,15 @@ out: static int nfs4_proc_mknod(struct inode *dir, struct dentry *dentry, struct iattr *sattr, dev_t rdev) { + struct nfs_server *server = NFS_SERVER(dir); struct nfs4_exception exception = { }; struct nfs4_label l, *label = NULL; int err; label = nfs4_label_init_security(dir, dentry, sattr, &l); - sattr->ia_mode &= ~current_umask(); + if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) + sattr->ia_mode &= ~current_umask(); do { err = _nfs4_proc_mknod(dir, dentry, sattr, label, rdev); trace_nfs4_mknod(dir, &dentry->d_name, err); @@ -4541,11 +4624,7 @@ int nfs4_set_rw_stateid(nfs4_stateid *stateid, const struct nfs_lock_context *l_ctx, fmode_t fmode) { - const struct nfs_lockowner *lockowner = NULL; - - if (l_ctx != NULL) - lockowner = &l_ctx->lockowner; - return nfs4_select_rw_stateid(ctx->state, fmode, lockowner, stateid, NULL); + return nfs4_select_rw_stateid(ctx->state, fmode, l_ctx, stateid, NULL); } EXPORT_SYMBOL_GPL(nfs4_set_rw_stateid); @@ -5564,11 +5643,16 @@ struct nfs4_delegreturndata { struct 
nfs_fh fh; nfs4_stateid stateid; unsigned long timestamp; + struct { + struct nfs4_layoutreturn_args arg; + struct nfs4_layoutreturn_res res; + struct nfs4_xdr_opaque_data ld_private; + u32 roc_barrier; + bool roc; + } lr; struct nfs_fattr fattr; int rpc_status; struct inode *inode; - bool roc; - u32 roc_barrier; }; static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) @@ -5579,6 +5663,32 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) return; trace_nfs4_delegreturn_exit(&data->args, &data->res, task->tk_status); + + /* Handle Layoutreturn errors */ + if (data->args.lr_args && task->tk_status != 0) { + switch(data->res.lr_ret) { + default: + data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; + break; + case 0: + data->args.lr_args = NULL; + data->res.lr_res = NULL; + break; + case -NFS4ERR_ADMIN_REVOKED: + case -NFS4ERR_DELEG_REVOKED: + case -NFS4ERR_EXPIRED: + case -NFS4ERR_BAD_STATEID: + case -NFS4ERR_OLD_STATEID: + case -NFS4ERR_UNKNOWN_LAYOUTTYPE: + case -NFS4ERR_WRONG_CRED: + data->args.lr_args = NULL; + data->res.lr_res = NULL; + data->res.lr_ret = 0; + rpc_restart_call_prepare(task); + return; + } + } + switch (task->tk_status) { case 0: renew_lease(data->res.server, data->timestamp); @@ -5594,6 +5704,14 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) case -NFS4ERR_STALE_STATEID: task->tk_status = 0; break; + case -NFS4ERR_ACCESS: + if (data->args.bitmask) { + data->args.bitmask = NULL; + data->res.fattr = NULL; + task->tk_status = 0; + rpc_restart_call_prepare(task); + return; + } default: if (nfs4_async_handle_error(task, data->res.server, NULL, NULL) == -EAGAIN) { @@ -5602,8 +5720,6 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) } } data->rpc_status = task->tk_status; - if (data->roc && data->rpc_status == 0) - pnfs_roc_set_barrier(data->inode, data->roc_barrier); } static void nfs4_delegreturn_release(void *calldata) @@ -5612,8 +5728,10 @@ static void nfs4_delegreturn_release(void *calldata) struct inode *inode = data->inode; if (inode) { - if (data->roc) - pnfs_roc_release(inode); + if (data->lr.roc) + pnfs_roc_release(&data->lr.arg, &data->lr.res, + data->res.lr_ret); + nfs_post_op_update_inode_force_wcc(inode, &data->fattr); nfs_iput_and_deactive(inode); } kfree(calldata); @@ -5625,12 +5743,9 @@ static void nfs4_delegreturn_prepare(struct rpc_task *task, void *data) d_data = (struct nfs4_delegreturndata *)data; - if (nfs4_wait_on_layoutreturn(d_data->inode, task)) + if (!d_data->lr.roc && nfs4_wait_on_layoutreturn(d_data->inode, task)) return; - if (d_data->roc) - pnfs_roc_get_barrier(d_data->inode, &d_data->roc_barrier); - nfs4_setup_sequence(d_data->res.server, &d_data->args.seq_args, &d_data->res.seq_res, @@ -5676,12 +5791,22 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co nfs4_stateid_copy(&data->stateid, stateid); data->res.fattr = &data->fattr; data->res.server = server; + data->res.lr_ret = -NFS4ERR_NOMATCHING_LAYOUT; + data->lr.arg.ld_private = &data->lr.ld_private; nfs_fattr_init(data->res.fattr); data->timestamp = jiffies; data->rpc_status = 0; + data->lr.roc = pnfs_roc(inode, &data->lr.arg, &data->lr.res, cred); data->inode = nfs_igrab_and_active(inode); - if (data->inode) - data->roc = nfs4_roc(inode); + if (data->inode) { + if (data->lr.roc) { + data->args.lr_args = &data->lr.arg; + data->res.lr_res = &data->lr.res; + } + } else if (data->lr.roc) { + pnfs_roc_release(&data->lr.arg, &data->lr.res, 0); + data->lr.roc = false; + } 
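
Between pnfs_roc() and the RPC dispatch there is a prepare/undo pairing worth noting: the layoutreturn is attached to the DELEGRETURN arguments only once the inode has been pinned, and a return-on-close that was already prepared must be handed back when pinning fails, or its references would leak. A small self-contained sketch of that pairing follows; the names are invented (the real calls are pnfs_roc(), nfs_igrab_and_active() and pnfs_roc_release()), so treat it as an illustration of the shape only.

	#include <stdbool.h>
	#include <stddef.h>

	struct dr_setup {
		void *inode;           /* pinned inode, or NULL on failure */
		bool  roc;             /* a return-on-close was prepared */
		void *lr_args;         /* attached to the compound when non-NULL */
		void *lr_arg_storage;  /* where the prepared args live */
	};

	static void roc_release(struct dr_setup *d)
	{
		d->roc = false;  /* the real helper also drops references */
	}

	static void dr_attach_layoutreturn(struct dr_setup *d, void *pinned)
	{
		d->inode = pinned;
		if (d->inode != NULL) {
			if (d->roc)
				d->lr_args = d->lr_arg_storage;
		} else if (d->roc) {
			roc_release(d);  /* cannot send it: undo the preparation */
		}
	}
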
task_setup_data.callback_data = data; msg.rpc_argp = &data->args; @@ -5695,10 +5820,6 @@ static int _nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, co if (status != 0) goto out; status = data->rpc_status; - if (status == 0) - nfs_post_op_update_inode_force_wcc(inode, &data->fattr); - else - nfs_refresh_inode(inode, &data->fattr); out: rpc_put_task(task); return status; @@ -6015,7 +6136,6 @@ static struct nfs4_lockdata *nfs4_alloc_lockdata(struct file_lock *fl, p->server = server; atomic_inc(&lsp->ls_count); p->ctx = get_nfs_open_context(ctx); - get_file(fl->fl_file); memcpy(&p->fl, fl, sizeof(p->fl)); return p; out_free_seqid: @@ -6128,7 +6248,6 @@ static void nfs4_lock_release(void *calldata) nfs_free_seqid(data->arg.lock_seqid); nfs4_put_lock_state(data->lsp); put_nfs_open_context(data->ctx); - fput(data->fl.fl_file); kfree(data); dprintk("%s: done!\n", __func__); } @@ -8559,21 +8678,13 @@ static void nfs4_layoutreturn_release(void *calldata) { struct nfs4_layoutreturn *lrp = calldata; struct pnfs_layout_hdr *lo = lrp->args.layout; - LIST_HEAD(freeme); dprintk("--> %s\n", __func__); - spin_lock(&lo->plh_inode->i_lock); - if (lrp->res.lrs_present) { - pnfs_mark_matching_lsegs_invalid(lo, &freeme, - &lrp->args.range, - be32_to_cpu(lrp->args.stateid.seqid)); - pnfs_set_layout_stateid(lo, &lrp->res.stateid, true); - } else - pnfs_mark_layout_stateid_invalid(lo, &freeme); - pnfs_clear_layoutreturn_waitbit(lo); - spin_unlock(&lo->plh_inode->i_lock); + pnfs_layoutreturn_free_lsegs(lo, &lrp->args.stateid, &lrp->args.range, + lrp->res.lrs_present ? &lrp->res.stateid : NULL); nfs4_sequence_free_slot(&lrp->res.seq_res); - pnfs_free_lseg_list(&freeme); + if (lrp->ld_private.ops && lrp->ld_private.ops->free) + lrp->ld_private.ops->free(&lrp->ld_private); pnfs_put_layout_hdr(lrp->args.layout); nfs_iput_and_deactive(lrp->inode); kfree(calldata); diff --git a/fs/nfs/nfs4session.c b/fs/nfs/nfs4session.c index a61350f75c74..769b85655c4b 100644 --- a/fs/nfs/nfs4session.c +++ b/fs/nfs/nfs4session.c @@ -169,7 +169,7 @@ bool nfs4_try_to_lock_slot(struct nfs4_slot_table *tbl, struct nfs4_slot *slot) struct nfs4_slot *nfs4_lookup_slot(struct nfs4_slot_table *tbl, u32 slotid) { if (slotid <= tbl->max_slotid) - return nfs4_find_or_create_slot(tbl, slotid, 1, GFP_NOWAIT); + return nfs4_find_or_create_slot(tbl, slotid, 0, GFP_NOWAIT); return ERR_PTR(-E2BIG); } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 0959c9661662..90e6193ce6be 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -494,21 +494,18 @@ nfs4_alloc_state_owner(struct nfs_server *server, } static void -nfs4_drop_state_owner(struct nfs4_state_owner *sp) -{ - struct rb_node *rb_node = &sp->so_server_node; - - if (!RB_EMPTY_NODE(rb_node)) { - struct nfs_server *server = sp->so_server; - struct nfs_client *clp = server->nfs_client; - - spin_lock(&clp->cl_lock); - if (!RB_EMPTY_NODE(rb_node)) { - rb_erase(rb_node, &server->state_owners); - RB_CLEAR_NODE(rb_node); - } - spin_unlock(&clp->cl_lock); - } +nfs4_reset_state_owner(struct nfs4_state_owner *sp) +{ + /* This state_owner is no longer usable, but must + * remain in place so that state recovery can find it + * and the opens associated with it. + * It may also be used for new 'open' request to + * return a delegation to the server. + * So update the 'create_time' so that it looks like + * a new state_owner. This will cause the server to + * request an OPEN_CONFIRM to start a new sequence. 
+ */ + sp->so_seqid.create_time = ktime_get(); } static void nfs4_free_state_owner(struct nfs4_state_owner *sp) @@ -797,19 +794,33 @@ void nfs4_close_sync(struct nfs4_state *state, fmode_t fmode) /* * Search the state->lock_states for an existing lock_owner - * that is compatible with current->files + * that is compatible with either of the given owners. + * If the second is non-zero, then the first refers to a Posix-lock + * owner (current->files) and the second refers to a flock/OFD + * owner (struct file*). In that case, prefer a match for the first + * owner. + * If both sorts of locks are held on the one file we cannot know + * which stateid was intended to be used, so a "correct" choice cannot + * be made. Failing that, a "consistent" choice is preferable. The + * consistent choice we make is to prefer the first owner, that of a + * Posix lock. */ static struct nfs4_lock_state * -__nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) +__nfs4_find_lock_state(struct nfs4_state *state, + fl_owner_t fl_owner, fl_owner_t fl_owner2) { - struct nfs4_lock_state *pos; + struct nfs4_lock_state *pos, *ret = NULL; list_for_each_entry(pos, &state->lock_states, ls_locks) { - if (pos->ls_owner != fl_owner) - continue; - atomic_inc(&pos->ls_count); - return pos; + if (pos->ls_owner == fl_owner) { + ret = pos; + break; + } + if (pos->ls_owner == fl_owner2) + ret = pos; } - return NULL; + if (ret) + atomic_inc(&ret->ls_count); + return ret; } /* @@ -857,7 +868,7 @@ static struct nfs4_lock_state *nfs4_get_lock_state(struct nfs4_state *state, fl_ for(;;) { spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, owner); + lsp = __nfs4_find_lock_state(state, owner, 0); if (lsp != NULL) break; if (new != NULL) { @@ -939,22 +950,23 @@ int nfs4_set_lock_state(struct nfs4_state *state, struct file_lock *fl) static int nfs4_copy_lock_stateid(nfs4_stateid *dst, struct nfs4_state *state, - const struct nfs_lockowner *lockowner) + const struct nfs_lock_context *l_ctx) { struct nfs4_lock_state *lsp; - fl_owner_t fl_owner; + fl_owner_t fl_owner, fl_flock_owner; int ret = -ENOENT; - - if (lockowner == NULL) + if (l_ctx == NULL) goto out; if (test_bit(LK_STATE_IN_USE, &state->flags) == 0) goto out; - fl_owner = lockowner->l_owner; + fl_owner = l_ctx->lockowner; + fl_flock_owner = l_ctx->open_context->flock_owner; + spin_lock(&state->state_lock); - lsp = __nfs4_find_lock_state(state, fl_owner); + lsp = __nfs4_find_lock_state(state, fl_owner, fl_flock_owner); if (lsp && test_bit(NFS_LOCK_LOST, &lsp->ls_flags)) ret = -EIO; else if (lsp != NULL && test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) != 0) { @@ -986,7 +998,7 @@ static void nfs4_copy_open_stateid(nfs4_stateid *dst, struct nfs4_state *state) * requests. 
*/ int nfs4_select_rw_stateid(struct nfs4_state *state, - fmode_t fmode, const struct nfs_lockowner *lockowner, + fmode_t fmode, const struct nfs_lock_context *l_ctx, nfs4_stateid *dst, struct rpc_cred **cred) { int ret; @@ -995,7 +1007,7 @@ int nfs4_select_rw_stateid(struct nfs4_state *state, return -EIO; if (cred != NULL) *cred = NULL; - ret = nfs4_copy_lock_stateid(dst, state, lockowner); + ret = nfs4_copy_lock_stateid(dst, state, l_ctx); if (ret == -EIO) /* A lost lock - don't even consider delegations */ goto out; @@ -1098,7 +1110,7 @@ void nfs_increment_open_seqid(int status, struct nfs_seqid *seqid) sp = container_of(seqid->sequence, struct nfs4_state_owner, so_seqid); if (status == -NFS4ERR_BAD_SEQID) - nfs4_drop_state_owner(sp); + nfs4_reset_state_owner(sp); if (!nfs4_has_session(sp->so_server->nfs_client)) nfs_increment_seqid(status, seqid); } @@ -1717,7 +1729,6 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) break; case -NFS4ERR_STALE_CLIENTID: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); - nfs4_state_clear_reclaim_reboot(clp); nfs4_state_start_reclaim_reboot(clp); break; case -NFS4ERR_EXPIRED: @@ -2190,7 +2201,7 @@ void nfs4_schedule_session_recovery(struct nfs4_session *session, int err) case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: set_bit(NFS4CLNT_BIND_CONN_TO_SESSION, &clp->cl_state); } - nfs4_schedule_lease_recovery(clp); + nfs4_schedule_state_manager(clp); } EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery); diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index fc89e5ed07ee..e9255cb453e6 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -52,6 +52,7 @@ #include <linux/nfs.h> #include <linux/nfs4.h> #include <linux/nfs_fs.h> +#include <linux/fs_struct.h> #include "nfs4_fs.h" #include "internal.h" @@ -415,6 +416,8 @@ static int nfs4_stat_to_errno(int); #else /* CONFIG_NFS_V4_1 */ #define encode_sequence_maxsz 0 #define decode_sequence_maxsz 0 +#define encode_layoutreturn_maxsz 0 +#define decode_layoutreturn_maxsz 0 #endif /* CONFIG_NFS_V4_1 */ #define NFS4_enc_compound_sz (1024) /* XXX: large enough? 
*/ @@ -499,22 +502,24 @@ static int nfs4_stat_to_errno(int); (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ - encode_open_downgrade_maxsz + \ - encode_getattr_maxsz) + encode_layoutreturn_maxsz + \ + encode_open_downgrade_maxsz) #define NFS4_dec_open_downgrade_sz \ (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ - decode_open_downgrade_maxsz + \ - decode_getattr_maxsz) + decode_layoutreturn_maxsz + \ + decode_open_downgrade_maxsz) #define NFS4_enc_close_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ + encode_layoutreturn_maxsz + \ encode_close_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_close_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_putfh_maxsz + \ + decode_layoutreturn_maxsz + \ decode_close_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_setattr_sz (compound_encode_hdr_maxsz + \ @@ -708,10 +713,13 @@ static int nfs4_stat_to_errno(int); #define NFS4_enc_delegreturn_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ + encode_layoutreturn_maxsz + \ encode_delegreturn_maxsz + \ encode_getattr_maxsz) #define NFS4_dec_delegreturn_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_layoutreturn_maxsz + \ decode_delegreturn_maxsz + \ decode_getattr_maxsz) #define NFS4_enc_getacl_sz (compound_encode_hdr_maxsz + \ @@ -1003,7 +1011,7 @@ static void encode_nfs4_verifier(struct xdr_stream *xdr, const nfs4_verifier *ve static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, const struct nfs4_label *label, const struct nfs_server *server, - bool excl_check) + bool excl_check, const umode_t *umask) { char owner_name[IDMAP_NAMESZ]; char owner_group[IDMAP_NAMESZ]; @@ -1017,18 +1025,21 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, /* * We reserve enough space to write the entire attribute buffer at once. - * In the worst-case, this would be - * 16(bitmap) + 4(attrlen) + 8(size) + 4(mode) + 4(atime) + 4(mtime) - * = 40 bytes, plus any contribution from variable-length fields - * such as owner/group. 
*/ if (iap->ia_valid & ATTR_SIZE) { bmval[0] |= FATTR4_WORD0_SIZE; len += 8; } + if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) + umask = NULL; if (iap->ia_valid & ATTR_MODE) { - bmval[1] |= FATTR4_WORD1_MODE; - len += 4; + if (umask) { + bmval[2] |= FATTR4_WORD2_MODE_UMASK; + len += 8; + } else { + bmval[1] |= FATTR4_WORD1_MODE; + len += 4; + } } if (iap->ia_valid & ATTR_UID) { owner_namelen = nfs_map_uid_to_name(server, iap->ia_uid, owner_name, IDMAP_NAMESZ); @@ -1129,6 +1140,10 @@ static void encode_attrs(struct xdr_stream *xdr, const struct iattr *iap, *p++ = cpu_to_be32(label->len); p = xdr_encode_opaque_fixed(p, label->label, label->len); } + if (bmval[2] & FATTR4_WORD2_MODE_UMASK) { + *p++ = cpu_to_be32(iap->ia_mode & S_IALLUGO); + *p++ = cpu_to_be32(*umask); + } /* out: */ } @@ -1183,7 +1198,8 @@ static void encode_create(struct xdr_stream *xdr, const struct nfs4_create_arg * } encode_string(xdr, create->name->len, create->name->name); - encode_attrs(xdr, create->attrs, create->label, create->server, false); + encode_attrs(xdr, create->attrs, create->label, create->server, false, + &create->umask); } static void encode_getattr_one(struct xdr_stream *xdr, uint32_t bitmap, struct compound_hdr *hdr) @@ -1403,11 +1419,13 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op switch(arg->createmode) { case NFS4_CREATE_UNCHECKED: *p = cpu_to_be32(NFS4_CREATE_UNCHECKED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false, + &arg->umask); break; case NFS4_CREATE_GUARDED: *p = cpu_to_be32(NFS4_CREATE_GUARDED); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, false, + &arg->umask); break; case NFS4_CREATE_EXCLUSIVE: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE); @@ -1416,7 +1434,8 @@ static inline void encode_createmode(struct xdr_stream *xdr, const struct nfs_op case NFS4_CREATE_EXCLUSIVE4_1: *p = cpu_to_be32(NFS4_CREATE_EXCLUSIVE4_1); encode_nfs4_verifier(xdr, &arg->u.verifier); - encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true); + encode_attrs(xdr, arg->u.attrs, arg->label, arg->server, true, + &arg->umask); } } @@ -1672,7 +1691,7 @@ static void encode_setattr(struct xdr_stream *xdr, const struct nfs_setattrargs { encode_op_hdr(xdr, OP_SETATTR, decode_setattr_maxsz, hdr); encode_nfs4_stateid(xdr, &arg->stateid); - encode_attrs(xdr, arg->iap, arg->label, server, false); + encode_attrs(xdr, arg->iap, arg->label, server, false, NULL); } static void encode_setclientid(struct xdr_stream *xdr, const struct nfs4_setclientid *setclientid, struct compound_hdr *hdr) @@ -2015,6 +2034,7 @@ encode_layoutreturn(struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args, struct compound_hdr *hdr) { + const struct pnfs_layoutdriver_type *lr_ops = NFS_SERVER(args->inode)->pnfs_curr_ld; __be32 *p; encode_op_hdr(xdr, OP_LAYOUTRETURN, decode_layoutreturn_maxsz, hdr); @@ -2029,10 +2049,11 @@ encode_layoutreturn(struct xdr_stream *xdr, spin_lock(&args->inode->i_lock); encode_nfs4_stateid(xdr, &args->stateid); spin_unlock(&args->inode->i_lock); - if (NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn) { - NFS_SERVER(args->inode)->pnfs_curr_ld->encode_layoutreturn( - NFS_I(args->inode)->layout, xdr, args); - } else + if (args->ld_private->ops && args->ld_private->ops->encode) + args->ld_private->ops->encode(xdr, args, args->ld_private); + else if (lr_ops->encode_layoutreturn) + 
lr_ops->encode_layoutreturn(xdr, args); + else encode_uint32(xdr, 0); } @@ -2062,6 +2083,13 @@ static void encode_free_stateid(struct xdr_stream *xdr, encode_op_hdr(xdr, OP_FREE_STATEID, decode_free_stateid_maxsz, hdr); encode_nfs4_stateid(xdr, &args->stateid); } +#else +static inline void +encode_layoutreturn(struct xdr_stream *xdr, + const struct nfs4_layoutreturn_args *args, + struct compound_hdr *hdr) +{ +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -2249,8 +2277,11 @@ static void nfs4_xdr_enc_close(struct rpc_rqst *req, struct xdr_stream *xdr, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); + if (args->lr_args) + encode_layoutreturn(xdr, args->lr_args, &hdr); + if (args->bitmask != NULL) + encode_getfattr(xdr, args->bitmask, &hdr); encode_close(xdr, args, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2327,8 +2358,9 @@ static void nfs4_xdr_enc_open_downgrade(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fh, &hdr); + if (args->lr_args) + encode_layoutreturn(xdr, args->lr_args, &hdr); encode_open_downgrade(xdr, args, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); encode_nops(&hdr); } @@ -2671,7 +2703,10 @@ static void nfs4_xdr_enc_delegreturn(struct rpc_rqst *req, encode_compound_hdr(xdr, req, &hdr); encode_sequence(xdr, &args->seq_args, &hdr); encode_putfh(xdr, args->fhandle, &hdr); - encode_getfattr(xdr, args->bitmask, &hdr); + if (args->lr_args) + encode_layoutreturn(xdr, args->lr_args, &hdr); + if (args->bitmask) + encode_getfattr(xdr, args->bitmask, &hdr); encode_delegreturn(xdr, args->stateid, &hdr); encode_nops(&hdr); } @@ -6089,6 +6124,13 @@ static int decode_free_stateid(struct xdr_stream *xdr, res->status = decode_op_hdr(xdr, OP_FREE_STATEID); return res->status; } +#else +static inline +int decode_layoutreturn(struct xdr_stream *xdr, + struct nfs4_layoutreturn_res *res) +{ + return 0; +} #endif /* CONFIG_NFS_V4_1 */ /* @@ -6114,10 +6156,13 @@ static int nfs4_xdr_dec_open_downgrade(struct rpc_rqst *rqstp, status = decode_putfh(xdr); if (status) goto out; + if (res->lr_res) { + status = decode_layoutreturn(xdr, res->lr_res); + res->lr_ret = status; + if (status) + goto out; + } status = decode_open_downgrade(xdr, res); - if (status != 0) - goto out; - decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6444,16 +6489,18 @@ static int nfs4_xdr_dec_close(struct rpc_rqst *rqstp, struct xdr_stream *xdr, status = decode_putfh(xdr); if (status) goto out; + if (res->lr_res) { + status = decode_layoutreturn(xdr, res->lr_res); + res->lr_ret = status; + if (status) + goto out; + } + if (res->fattr != NULL) { + status = decode_getfattr(xdr, res->fattr, res->server); + if (status != 0) + goto out; + } status = decode_close(xdr, res); - if (status != 0) - goto out; - /* - * Note: Server may do delete on close for this file - * in which case the getattr call will fail with - * an ESTALE error. Shouldn't be a problem, - * though, since fattr->valid will remain unset. 
- */ - decode_getfattr(xdr, res->fattr, res->server); out: return status; } @@ -6920,9 +6967,17 @@ static int nfs4_xdr_dec_delegreturn(struct rpc_rqst *rqstp, status = decode_putfh(xdr); if (status != 0) goto out; - status = decode_getfattr(xdr, res->fattr, res->server); - if (status != 0) - goto out; + if (res->lr_res) { + status = decode_layoutreturn(xdr, res->lr_res); + res->lr_ret = status; + if (status) + goto out; + } + if (res->fattr) { + status = decode_getfattr(xdr, res->fattr, res->server); + if (status != 0) + goto out; + } status = decode_delegreturn(xdr); out: return status; diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 919efd4a1a23..2a4cdce939a0 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -504,10 +504,10 @@ encode_accumulated_error(struct objlayout *objlay, __be32 *p) } void -objlayout_encode_layoutreturn(struct pnfs_layout_hdr *pnfslay, - struct xdr_stream *xdr, +objlayout_encode_layoutreturn(struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args) { + struct pnfs_layout_hdr *pnfslay = args->layout; struct objlayout *objlay = OBJLAYOUT(pnfslay); struct objlayout_io_res *oir, *tmp; __be32 *start; diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index 2641dbad345c..fc94a5872ed4 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -175,7 +175,6 @@ extern void objlayout_encode_layoutcommit( const struct nfs4_layoutcommit_args *); extern void objlayout_encode_layoutreturn( - struct pnfs_layout_hdr *, struct xdr_stream *, const struct nfs4_layoutreturn_args *); diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 965db474f4b0..6e629b856a00 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -867,8 +867,7 @@ static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio) static bool nfs_match_lock_context(const struct nfs_lock_context *l1, const struct nfs_lock_context *l2) { - return l1->lockowner.l_owner == l2->lockowner.l_owner - && l1->lockowner.l_pid == l2->lockowner.l_pid; + return l1->lockowner == l2->lockowner; } /** diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 259ef85f435a..59554f3adf29 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -54,6 +54,12 @@ static DEFINE_SPINLOCK(pnfs_spinlock); static LIST_HEAD(pnfs_modules_tbl); static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo); +static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, + struct list_head *free_me, + const struct pnfs_layout_range *range, + u32 seq); +static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, + struct list_head *tmp_list); /* Return the registered pnfs layout driver module matching given id */ static struct pnfs_layoutdriver_type * @@ -299,6 +305,49 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo) } } +static void +pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, + u32 seq) +{ + if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode) + iomode = IOMODE_ANY; + lo->plh_return_iomode = iomode; + set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); + if (seq != 0) { + WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); + lo->plh_return_seq = seq; + } +} + +static void +pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) +{ + lo->plh_return_iomode = 0; + lo->plh_return_seq = 0; + clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); +} + +static void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) +{ + 
clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); + clear_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags); + smp_mb__after_atomic(); + wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); + rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); +} + +static void +pnfs_clear_lseg_state(struct pnfs_layout_segment *lseg, + struct list_head *free_me) +{ + clear_bit(NFS_LSEG_ROC, &lseg->pls_flags); + clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); + if (test_and_clear_bit(NFS_LSEG_VALID, &lseg->pls_flags)) + pnfs_lseg_dec_and_remove_zero(lseg, free_me); + if (test_and_clear_bit(NFS_LSEG_LAYOUTCOMMIT, &lseg->pls_flags)) + pnfs_lseg_dec_and_remove_zero(lseg, free_me); +} + /* * Mark a pnfs_layout_hdr and all associated layout segments as invalid * @@ -315,9 +364,17 @@ pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, .offset = 0, .length = NFS4_MAX_UINT64, }; + struct pnfs_layout_segment *lseg, *next; set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - return pnfs_mark_matching_lsegs_invalid(lo, lseg_list, &range, 0); + pnfs_clear_layoutreturn_info(lo); + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) + pnfs_clear_lseg_state(lseg, lseg_list); + pnfs_free_returned_lsegs(lo, lseg_list, &range, 0); + if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) && + !test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) + pnfs_clear_layoutreturn_waitbit(lo); + return !list_empty(&lo->plh_segs); } static int @@ -396,27 +453,42 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, static void pnfs_free_lseg(struct pnfs_layout_segment *lseg) { - struct inode *ino = lseg->pls_layout->plh_inode; - - NFS_SERVER(ino)->pnfs_curr_ld->free_lseg(lseg); + if (lseg != NULL) { + struct inode *inode = lseg->pls_layout->plh_inode; + NFS_SERVER(inode)->pnfs_curr_ld->free_lseg(lseg); + } } static void pnfs_layout_remove_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg) { - struct inode *inode = lo->plh_inode; - WARN_ON(test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); list_del_init(&lseg->pls_list); /* Matched by pnfs_get_layout_hdr in pnfs_layout_insert_lseg */ atomic_dec(&lo->plh_refcount); - if (list_empty(&lo->plh_segs)) { + if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags)) + return; + if (list_empty(&lo->plh_segs) && + !test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags) && + !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { if (atomic_read(&lo->plh_outstanding) == 0) set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); clear_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags); } - rpc_wake_up(&NFS_SERVER(inode)->roc_rpcwaitq); +} + +static bool +pnfs_cache_lseg_for_layoutreturn(struct pnfs_layout_hdr *lo, + struct pnfs_layout_segment *lseg) +{ + if (test_and_clear_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags) && + pnfs_layout_is_valid(lo)) { + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); + list_move_tail(&lseg->pls_list, &lo->plh_return_segs); + return true; + } + return false; } void @@ -442,6 +514,8 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) } pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); + if (pnfs_cache_lseg_for_layoutreturn(lo, lseg)) + lseg = NULL; spin_unlock(&inode->i_lock); pnfs_free_lseg(lseg); pnfs_put_layout_hdr(lo); @@ -482,22 +556,15 @@ pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) struct pnfs_layout_hdr *lo = lseg->pls_layout; if (test_bit(NFS_LSEG_VALID, &lseg->pls_flags)) return; - pnfs_get_layout_hdr(lo); pnfs_layout_remove_lseg(lo, lseg); - pnfs_free_lseg_async(lseg); + if 
(!pnfs_cache_lseg_for_layoutreturn(lo, lseg)) { + pnfs_get_layout_hdr(lo); + pnfs_free_lseg_async(lseg); + } } } EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked); -static u64 -end_offset(u64 start, u64 len) -{ - u64 end; - - end = start + len; - return end >= start ? end : NFS4_MAX_UINT64; -} - /* * is l2 fully contained in l1? * start1 end1 @@ -510,33 +577,13 @@ pnfs_lseg_range_contained(const struct pnfs_layout_range *l1, const struct pnfs_layout_range *l2) { u64 start1 = l1->offset; - u64 end1 = end_offset(start1, l1->length); + u64 end1 = pnfs_end_offset(start1, l1->length); u64 start2 = l2->offset; - u64 end2 = end_offset(start2, l2->length); + u64 end2 = pnfs_end_offset(start2, l2->length); return (start1 <= start2) && (end1 >= end2); } -/* - * is l1 and l2 intersecting? - * start1 end1 - * [----------------------------------) - * start2 end2 - * [----------------) - */ -static bool -pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, - const struct pnfs_layout_range *l2) -{ - u64 start1 = l1->offset; - u64 end1 = end_offset(start1, l1->length); - u64 start2 = l2->offset; - u64 end2 = end_offset(start2, l2->length); - - return (end1 == NFS4_MAX_UINT64 || end1 > start2) && - (end2 == NFS4_MAX_UINT64 || end2 > start1); -} - static bool pnfs_lseg_dec_and_remove_zero(struct pnfs_layout_segment *lseg, struct list_head *tmp_list) { @@ -637,6 +684,20 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return remaining; } +static void +pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, + struct list_head *free_me, + const struct pnfs_layout_range *range, + u32 seq) +{ + struct pnfs_layout_segment *lseg, *next; + + list_for_each_entry_safe(lseg, next, &lo->plh_return_segs, pls_list) { + if (pnfs_match_lseg_recall(lseg, range, seq)) + list_move_tail(&lseg->pls_list, free_me); + } +} + /* note free_me must contain lsegs from a single layout_hdr */ void pnfs_free_lseg_list(struct list_head *free_me) @@ -701,6 +762,8 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp, struct inode *inode; list_for_each_entry_safe(lo, next, &server->layouts, plh_layouts) { + if (test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) + continue; inode = igrab(lo->plh_inode); if (inode == NULL) continue; @@ -816,14 +879,6 @@ pnfs_destroy_all_layouts(struct nfs_client *clp) pnfs_destroy_layouts_byclid(clp, false); } -static void -pnfs_clear_layoutreturn_info(struct pnfs_layout_hdr *lo) -{ - lo->plh_return_iomode = 0; - lo->plh_return_seq = 0; - clear_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); -} - /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, @@ -941,12 +996,31 @@ static void pnfs_clear_layoutcommit(struct inode *inode, } } -void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo) +void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, + const nfs4_stateid *arg_stateid, + const struct pnfs_layout_range *range, + const nfs4_stateid *stateid) { - clear_bit_unlock(NFS_LAYOUT_RETURN, &lo->plh_flags); - smp_mb__after_atomic(); - wake_up_bit(&lo->plh_flags, NFS_LAYOUT_RETURN); - rpc_wake_up(&NFS_SERVER(lo->plh_inode)->roc_rpcwaitq); + struct inode *inode = lo->plh_inode; + LIST_HEAD(freeme); + + spin_lock(&inode->i_lock); + if (!pnfs_layout_is_valid(lo) || !arg_stateid || + !nfs4_stateid_match_other(&lo->plh_stateid, arg_stateid)) + goto out_unlock; + if (stateid) { + u32 seq = be32_to_cpu(arg_stateid->seqid); + + pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); + 
pnfs_free_returned_lsegs(lo, &freeme, range, seq); + pnfs_set_layout_stateid(lo, stateid, true); + } else + pnfs_mark_layout_stateid_invalid(lo, &freeme); +out_unlock: + pnfs_clear_layoutreturn_waitbit(lo); + spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&freeme); + } static bool @@ -957,8 +1031,9 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, /* Serialise LAYOUTGET/LAYOUTRETURN */ if (atomic_read(&lo->plh_outstanding) != 0) return false; - if (test_and_set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + if (test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) return false; + set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); pnfs_get_layout_hdr(lo); if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { if (stateid != NULL) { @@ -978,11 +1053,29 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo, return true; } +static void +pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args, + struct pnfs_layout_hdr *lo, + const nfs4_stateid *stateid, + enum pnfs_iomode iomode) +{ + struct inode *inode = lo->plh_inode; + + args->layout_type = NFS_SERVER(inode)->pnfs_curr_ld->id; + args->inode = inode; + args->range.iomode = iomode; + args->range.offset = 0; + args->range.length = NFS4_MAX_UINT64; + args->layout = lo; + nfs4_stateid_copy(&args->stateid, stateid); +} + static int pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, enum pnfs_iomode iomode, bool sync) { struct inode *ino = lo->plh_inode; + struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; struct nfs4_layoutreturn *lrp; int status = 0; @@ -996,15 +1089,12 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid, goto out; } - nfs4_stateid_copy(&lrp->args.stateid, stateid); - lrp->args.layout_type = NFS_SERVER(ino)->pnfs_curr_ld->id; - lrp->args.inode = ino; - lrp->args.range.iomode = iomode; - lrp->args.range.offset = 0; - lrp->args.range.length = NFS4_MAX_UINT64; - lrp->args.layout = lo; + pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode); + lrp->args.ld_private = &lrp->ld_private; lrp->clp = NFS_SERVER(ino)->nfs_client; lrp->cred = lo->plh_lc_cred; + if (ld->prepare_layoutreturn) + ld->prepare_layoutreturn(&lrp->args); status = nfs4_proc_layoutreturn(lrp, sync); out: @@ -1067,7 +1157,7 @@ _pnfs_return_layout(struct inode *ino) struct nfs_inode *nfsi = NFS_I(ino); LIST_HEAD(tmp_list); nfs4_stateid stateid; - int status = 0, empty; + int status = 0; bool send; dprintk("NFS: %s for inode %lu\n", __func__, ino->i_ino); @@ -1081,7 +1171,14 @@ _pnfs_return_layout(struct inode *ino) } /* Reference matched in nfs4_layoutreturn_release */ pnfs_get_layout_hdr(lo); - empty = list_empty(&lo->plh_segs); + /* Is there an outstanding layoutreturn ? 
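+ * If so, the code below must serialise with it: drop the inode
+ * spin lock, sleep until NFS_LAYOUT_RETURN clears, then retake
+ * the lock. A minimal sketch of that wait pattern (illustrative
+ * comment only, not an extra change in this patch):
+ *
+ * spin_unlock(&ino->i_lock);
+ * wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN,
+ * TASK_UNINTERRUPTIBLE);
+ * spin_lock(&ino->i_lock);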
*/ + if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { + spin_unlock(&ino->i_lock); + if (wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, + TASK_UNINTERRUPTIBLE)) + goto out_put_layout_hdr; + spin_lock(&ino->i_lock); + } pnfs_clear_layoutcommit(ino, &tmp_list); pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL, 0); @@ -1095,7 +1192,7 @@ _pnfs_return_layout(struct inode *ino) } /* Don't send a LAYOUTRETURN if list was initially empty */ - if (empty) { + if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) { spin_unlock(&ino->i_lock); dprintk("NFS: %s no layout segments to return\n", __func__); goto out_put_layout_hdr; @@ -1141,105 +1238,125 @@ pnfs_commit_and_return_layout(struct inode *inode) return ret; } -bool pnfs_roc(struct inode *ino) +bool pnfs_roc(struct inode *ino, + struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + const struct rpc_cred *cred) { struct nfs_inode *nfsi = NFS_I(ino); struct nfs_open_context *ctx; struct nfs4_state *state; struct pnfs_layout_hdr *lo; - struct pnfs_layout_segment *lseg, *tmp; + struct pnfs_layout_segment *lseg, *next; nfs4_stateid stateid; - LIST_HEAD(tmp_list); - bool found = false, layoutreturn = false, roc = false; + enum pnfs_iomode iomode = 0; + bool layoutreturn = false, roc = false; + bool skip_read = false; + if (!nfs_have_layout(ino)) + return false; +retry: spin_lock(&ino->i_lock); lo = nfsi->layout; - if (!lo || test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) + if (!lo || !pnfs_layout_is_valid(lo) || + test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) goto out_noroc; + if (test_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags)) { + pnfs_get_layout_hdr(lo); + spin_unlock(&ino->i_lock); + wait_on_bit(&lo->plh_flags, NFS_LAYOUT_RETURN, + TASK_UNINTERRUPTIBLE); + pnfs_put_layout_hdr(lo); + goto retry; + } /* no roc if we hold a delegation */ - if (nfs4_check_delegation(ino, FMODE_READ)) - goto out_noroc; + if (nfs4_check_delegation(ino, FMODE_READ)) { + if (nfs4_check_delegation(ino, FMODE_WRITE)) + goto out_noroc; + skip_read = true; + } list_for_each_entry(ctx, &nfsi->open_files, list) { state = ctx->state; + if (state == NULL) + continue; /* Don't return layout if there is open file state */ - if (state != NULL && state->state != 0) + if (state->state & FMODE_WRITE) goto out_noroc; + if (state->state & FMODE_READ) + skip_read = true; } - /* always send layoutreturn if being marked so */ - if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) - layoutreturn = pnfs_prepare_layoutreturn(lo, - &stateid, NULL); - list_for_each_entry_safe(lseg, tmp, &lo->plh_segs, pls_list) + list_for_each_entry_safe(lseg, next, &lo->plh_segs, pls_list) { + if (skip_read && lseg->pls_range.iomode == IOMODE_READ) + continue; /* If we are sending layoutreturn, invalidate all valid lsegs */ - if (layoutreturn || test_bit(NFS_LSEG_ROC, &lseg->pls_flags)) { - mark_lseg_invalid(lseg, &tmp_list); - found = true; - } + if (!test_and_clear_bit(NFS_LSEG_ROC, &lseg->pls_flags)) + continue; + /* + * Note: mark lseg for return so pnfs_layout_remove_lseg + * doesn't invalidate the layout for us. + */ + set_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags); + if (!mark_lseg_invalid(lseg, &lo->plh_return_segs)) + continue; + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); + } + + if (!test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) + goto out_noroc; + /* ROC in two conditions: * 1. there are ROC lsegs * 2. 
we don't send layoutreturn */ - if (found && !layoutreturn) { - /* lo ref dropped in pnfs_roc_release() */ - pnfs_get_layout_hdr(lo); - roc = true; - } + /* lo ref dropped in pnfs_roc_release() */ + layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &iomode); + /* If the creds don't match, we can't compound the layoutreturn */ + if (!layoutreturn || cred != lo->plh_lc_cred) + goto out_noroc; + + roc = layoutreturn; + pnfs_init_layoutreturn_args(args, lo, &stateid, iomode); + res->lrs_present = 0; + layoutreturn = false; out_noroc: spin_unlock(&ino->i_lock); - pnfs_free_lseg_list(&tmp_list); pnfs_layoutcommit_inode(ino, true); + if (roc) { + struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld; + if (ld->prepare_layoutreturn) + ld->prepare_layoutreturn(args); + return true; + } if (layoutreturn) - pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true); - return roc; -} - -void pnfs_roc_release(struct inode *ino) -{ - struct pnfs_layout_hdr *lo; - - spin_lock(&ino->i_lock); - lo = NFS_I(ino)->layout; - pnfs_clear_layoutreturn_waitbit(lo); - if (atomic_dec_and_test(&lo->plh_refcount)) { - pnfs_detach_layout_hdr(lo); - spin_unlock(&ino->i_lock); - pnfs_free_layout_hdr(lo); - } else - spin_unlock(&ino->i_lock); -} - -void pnfs_roc_set_barrier(struct inode *ino, u32 barrier) -{ - struct pnfs_layout_hdr *lo; - - spin_lock(&ino->i_lock); - lo = NFS_I(ino)->layout; - if (pnfs_seqid_is_newer(barrier, lo->plh_barrier)) - lo->plh_barrier = barrier; - spin_unlock(&ino->i_lock); - trace_nfs4_layoutreturn_on_close(ino, 0); + pnfs_send_layoutreturn(lo, &stateid, iomode, true); + return false; } -void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) +void pnfs_roc_release(struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + int ret) { - struct nfs_inode *nfsi = NFS_I(ino); - struct pnfs_layout_hdr *lo; - u32 current_seqid; - - spin_lock(&ino->i_lock); - lo = nfsi->layout; - current_seqid = be32_to_cpu(lo->plh_stateid.seqid); + struct pnfs_layout_hdr *lo = args->layout; + const nfs4_stateid *arg_stateid = NULL; + const nfs4_stateid *res_stateid = NULL; + struct nfs4_xdr_opaque_data *ld_private = args->ld_private; - /* Since close does not return a layout stateid for use as - * a barrier, we choose the worst-case barrier. 
- */ - *barrier = current_seqid + atomic_read(&lo->plh_outstanding); - spin_unlock(&ino->i_lock); + if (ret == 0) { + arg_stateid = &args->stateid; + if (res->lrs_present) + res_stateid = &res->stateid; + } + pnfs_layoutreturn_free_lsegs(lo, arg_stateid, &args->range, + res_stateid); + if (ld_private && ld_private->ops && ld_private->ops->free) + ld_private->ops->free(ld_private); + pnfs_put_layout_hdr(lo); + trace_nfs4_layoutreturn_on_close(args->inode, 0); } bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) @@ -1252,13 +1369,11 @@ bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task) * i_lock */ spin_lock(&ino->i_lock); lo = nfsi->layout; - if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) + if (lo && test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags)) { + rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); sleep = true; + } spin_unlock(&ino->i_lock); - - if (sleep) - rpc_sleep_on(&NFS_SERVER(ino)->roc_rpcwaitq, task, NULL); - return sleep; } @@ -1375,6 +1490,7 @@ alloc_init_layout_hdr(struct inode *ino, atomic_set(&lo->plh_refcount, 1); INIT_LIST_HEAD(&lo->plh_layouts); INIT_LIST_HEAD(&lo->plh_segs); + INIT_LIST_HEAD(&lo->plh_return_segs); INIT_LIST_HEAD(&lo->plh_bulk_destroy); lo->plh_inode = ino; lo->plh_lc_cred = get_rpccred(ctx->cred); @@ -1841,7 +1957,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out_forget; } - if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { + if (!pnfs_layout_is_valid(lo)) { + /* We have a completely new layout */ + pnfs_set_layout_stateid(lo, &res->stateid, true); + } else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { /* existing state ID, make sure the sequence number matches. */ if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { dprintk("%s forget reply due to sequence\n", __func__); @@ -1851,12 +1970,10 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) } else { /* * We got an entirely new state ID. Mark all segments for the - * inode invalid, and don't bother validating the stateid - * sequence number. + * inode invalid, and retry the layoutget */ pnfs_mark_layout_stateid_invalid(lo, &free_me); - - pnfs_set_layout_stateid(lo, &res->stateid, true); + goto out_forget; } pnfs_get_lseg(lseg); @@ -1877,20 +1994,6 @@ out_forget: return ERR_PTR(-EAGAIN); } -static void -pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode, - u32 seq) -{ - if (lo->plh_return_iomode != 0 && lo->plh_return_iomode != iomode) - iomode = IOMODE_ANY; - lo->plh_return_iomode = iomode; - set_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags); - if (seq != 0) { - WARN_ON_ONCE(lo->plh_return_seq != 0 && lo->plh_return_seq != seq); - lo->plh_return_seq = seq; - } -} - /** * pnfs_mark_matching_lsegs_return - Free or return matching layout segments * @lo: pointer to layout header @@ -1945,17 +2048,18 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, .offset = 0, .length = NFS4_MAX_UINT64, }; - LIST_HEAD(free_me); bool return_now = false; spin_lock(&inode->i_lock); pnfs_set_plh_return_info(lo, range.iomode, 0); + /* Block LAYOUTGET */ + set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags); /* * mark all matching lsegs so that we are sure to have no live * segments at hand when sending layoutreturn. See pnfs_put_lseg() * for how it works. 
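* Roughly, pnfs_put_lseg() now parks any lseg that still has
* NFS_LSEG_LAYOUTRETURN set on lo->plh_return_segs instead of
* freeing it; a simplified sketch of that hand-off (not the
* literal code):
*
* if (pnfs_cache_lseg_for_layoutreturn(lo, lseg))
* lseg = NULL; /* now owned by plh_return_segs */
* pnfs_free_lseg(lseg); /* tolerates NULL */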
*/ - if (!pnfs_mark_matching_lsegs_return(lo, &free_me, &range, 0)) { + if (!pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, &range, 0)) { nfs4_stateid stateid; enum pnfs_iomode iomode; @@ -1967,7 +2071,6 @@ void pnfs_error_mark_layout_for_return(struct inode *inode, spin_unlock(&inode->i_lock); nfs_commit_inode(inode, 0); } - pnfs_free_lseg_list(&free_me); } EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return); @@ -2063,7 +2166,7 @@ pnfs_generic_pg_test(struct nfs_pageio_descriptor *pgio, * */ if (pgio->pg_lseg) { - seg_end = end_offset(pgio->pg_lseg->pls_range.offset, + seg_end = pnfs_end_offset(pgio->pg_lseg->pls_range.offset, pgio->pg_lseg->pls_range.length); req_start = req_offset(req); WARN_ON_ONCE(req_start >= seg_end); @@ -2286,6 +2389,10 @@ void pnfs_read_resend_pnfs(struct nfs_pgio_header *hdr) struct nfs_pageio_descriptor pgio; if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { + /* Prevent deadlocks with layoutreturn! */ + pnfs_put_lseg(hdr->lseg); + hdr->lseg = NULL; + nfs_pageio_init_read(&pgio, hdr->inode, false, hdr->completion_ops); hdr->task.tk_status = nfs_pageio_resend(&pgio, hdr); diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 5c295512c967..63f77b49a586 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -96,6 +96,7 @@ enum { NFS_LAYOUT_RW_FAILED, /* get rw layout failed stop trying */ NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ NFS_LAYOUT_RETURN, /* layoutreturn in progress */ + NFS_LAYOUT_RETURN_LOCK, /* Serialise layoutreturn */ NFS_LAYOUT_RETURN_REQUESTED, /* Return this layout ASAP */ NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */ @@ -171,8 +172,8 @@ struct pnfs_layoutdriver_type { (struct nfs_server *server, struct pnfs_device *pdev, gfp_t gfp_flags); - void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, - struct xdr_stream *xdr, + int (*prepare_layoutreturn) (struct nfs4_layoutreturn_args *); + void (*encode_layoutreturn) (struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args); void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); @@ -181,7 +182,6 @@ struct pnfs_layoutdriver_type { struct xdr_stream *xdr, const struct nfs4_layoutcommit_args *args); int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args); - void (*cleanup_layoutstats) (struct nfs42_layoutstat_data *data); }; struct pnfs_layout_hdr { @@ -190,6 +190,7 @@ struct pnfs_layout_hdr { struct list_head plh_layouts; /* other client layouts */ struct list_head plh_bulk_destroy; struct list_head plh_segs; /* layout segments list */ + struct list_head plh_return_segs; /* invalid layout segments */ unsigned long plh_block_lgets; /* block LAYOUTGET if >0 */ unsigned long plh_retry_timestamp; unsigned long plh_flags; @@ -270,10 +271,13 @@ int pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo, u32 seq); int pnfs_mark_layout_stateid_invalid(struct pnfs_layout_hdr *lo, struct list_head *lseg_list); -bool pnfs_roc(struct inode *ino); -void pnfs_roc_release(struct inode *ino); -void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); -void pnfs_roc_get_barrier(struct inode *ino, u32 *barrier); +bool pnfs_roc(struct inode *ino, + struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + const struct rpc_cred *cred); +void pnfs_roc_release(struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + int ret); bool pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task); void pnfs_set_layoutcommit(struct inode *, struct 
pnfs_layout_segment *, loff_t); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); @@ -292,7 +296,10 @@ struct pnfs_layout_segment *pnfs_update_layout(struct inode *ino, enum pnfs_iomode iomode, bool strict_iomode, gfp_t gfp_flags); -void pnfs_clear_layoutreturn_waitbit(struct pnfs_layout_hdr *lo); +void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, + const nfs4_stateid *arg_stateid, + const struct pnfs_layout_range *range, + const nfs4_stateid *stateid); void pnfs_generic_layout_insert_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg, @@ -362,8 +369,7 @@ struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs, void nfs4_pnfs_v3_ds_connect_unload(void); void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, struct nfs4_deviceid_node *devid, unsigned int timeo, - unsigned int retrans, u32 version, u32 minor_version, - rpc_authflavor_t au_flavor); + unsigned int retrans, u32 version, u32 minor_version); struct nfs4_pnfs_ds_addr *nfs4_decode_mp_ds_addr(struct net *net, struct xdr_stream *xdr, gfp_t gfp_flags); @@ -559,6 +565,38 @@ pnfs_copy_range(struct pnfs_layout_range *dst, memcpy(dst, src, sizeof(*dst)); } +static inline u64 +pnfs_end_offset(u64 start, u64 len) +{ + if (NFS4_MAX_UINT64 - start <= len) + return NFS4_MAX_UINT64; + return start + len; +} + +/* + * Are 2 ranges intersecting? + * start1 end1 + * [----------------------------------) + * start2 end2 + * [----------------) + */ +static inline bool +pnfs_is_range_intersecting(u64 start1, u64 end1, u64 start2, u64 end2) +{ + return (end1 == NFS4_MAX_UINT64 || start2 < end1) && + (end2 == NFS4_MAX_UINT64 || start1 < end2); +} + +static inline bool +pnfs_lseg_range_intersecting(const struct pnfs_layout_range *l1, + const struct pnfs_layout_range *l2) +{ + u64 end1 = pnfs_end_offset(l1->offset, l1->length); + u64 end2 = pnfs_end_offset(l2->offset, l2->length); + + return pnfs_is_range_intersecting(l1->offset, end1, l2->offset, end2); +} + extern unsigned int layoutstats_timer; #ifdef NFS_DEBUG @@ -630,23 +668,18 @@ pnfs_layoutcommit_outstanding(struct inode *inode) static inline bool -pnfs_roc(struct inode *ino) +pnfs_roc(struct inode *ino, + struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + const struct rpc_cred *cred) { return false; } static inline void -pnfs_roc_release(struct inode *ino) -{ -} - -static inline void -pnfs_roc_set_barrier(struct inode *ino, u32 barrier) -{ -} - -static inline void -pnfs_roc_get_barrier(struct inode *ino, u32 *barrier) +pnfs_roc_release(struct nfs4_layoutreturn_args *args, + struct nfs4_layoutreturn_res *res, + int ret) { } diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 53b4705abcc7..9414b492439f 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -600,8 +600,7 @@ static struct nfs_client *(*get_v3_ds_connect)( int ds_addrlen, int ds_proto, unsigned int ds_timeo, - unsigned int ds_retrans, - rpc_authflavor_t au_flavor); + unsigned int ds_retrans); static bool load_v3_ds_connect(void) { @@ -625,15 +624,13 @@ EXPORT_SYMBOL_GPL(nfs4_pnfs_v3_ds_connect_unload); static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, unsigned int timeo, - unsigned int retrans, - rpc_authflavor_t au_flavor) + unsigned int retrans) { struct nfs_client *clp = ERR_PTR(-EIO); struct nfs4_pnfs_ds_addr *da; int status = 0; - dprintk("--> %s DS %s au_flavor %d\n", __func__, - ds->ds_remotestr, au_flavor); + dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); if 
(!load_v3_ds_connect()) goto out; @@ -657,7 +654,7 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, clp = get_v3_ds_connect(mds_srv, (struct sockaddr *)&da->da_addr, da->da_addrlen, IPPROTO_TCP, - timeo, retrans, au_flavor); + timeo, retrans); } if (IS_ERR(clp)) { @@ -676,15 +673,13 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, unsigned int timeo, unsigned int retrans, - u32 minor_version, - rpc_authflavor_t au_flavor) + u32 minor_version) { struct nfs_client *clp = ERR_PTR(-EIO); struct nfs4_pnfs_ds_addr *da; int status = 0; - dprintk("--> %s DS %s au_flavor %d\n", __func__, ds->ds_remotestr, - au_flavor); + dprintk("--> %s DS %s\n", __func__, ds->ds_remotestr); list_for_each_entry(da, &ds->ds_addrs, da_node) { dprintk("%s: DS %s: trying address %s\n", @@ -720,8 +715,7 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv, clp = nfs4_set_ds_client(mds_srv, (struct sockaddr *)&da->da_addr, da->da_addrlen, IPPROTO_TCP, - timeo, retrans, minor_version, - au_flavor); + timeo, retrans, minor_version); if (IS_ERR(clp)) continue; @@ -755,19 +749,17 @@ out: */ void nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, struct nfs4_deviceid_node *devid, unsigned int timeo, - unsigned int retrans, u32 version, - u32 minor_version, rpc_authflavor_t au_flavor) + unsigned int retrans, u32 version, u32 minor_version) { if (test_and_set_bit(NFS4DS_CONNECTING, &ds->ds_state) == 0) { int err = 0; if (version == 3) { err = _nfs4_pnfs_v3_ds_connect(mds_srv, ds, timeo, - retrans, au_flavor); + retrans); } else if (version == 4) { err = _nfs4_pnfs_v4_ds_connect(mds_srv, ds, timeo, - retrans, minor_version, - au_flavor); + retrans, minor_version); } else { dprintk("%s: unsupported DS version %d\n", __func__, version); diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 001796bcd6c8..6bca17883b93 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -55,7 +55,7 @@ #include <linux/nsproxy.h> #include <linux/rcupdate.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "nfs4_fs.h" #include "callback.h" @@ -2904,7 +2904,7 @@ module_param(max_session_slots, ushort, 0644); MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 " "requests the client will negotiate"); module_param(max_session_cb_slots, ushort, 0644); -MODULE_PARM_DESC(max_session_slots, "Maximum number of parallel NFSv4.1 " +MODULE_PARM_DESC(max_session_cb_slots, "Maximum number of parallel NFSv4.1 " "callbacks the client will process for a given server"); module_param(send_implementation_id, ushort, 0644); MODULE_PARM_DESC(send_implementation_id, diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 4fe3eead3868..5a1d0ded8979 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -77,7 +77,6 @@ static const char *nfs_get_link(struct dentry *dentry, * symlinks can't do much... 
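* With the explicit .readlink hook removed below, readlink(2) on an
* NFS symlink takes the generic VFS fallback, which resolves the
* target through ->get_link(). A rough, simplified sketch of that
* fallback (VFS side, not part of this patch):
*
* link = inode->i_op->get_link(dentry, inode, &done);
* if (!IS_ERR(link))
* res = readlink_copy(buffer, buflen, link);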
*/ const struct inode_operations nfs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = nfs_get_link, .getattr = nfs_getattr, .setattr = nfs_setattr, diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 53211838f72a..b00d53d13d47 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -24,7 +24,7 @@ #include <linux/freezer.h> #include <linux/wait.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "delegation.h" #include "internal.h" @@ -1151,8 +1151,7 @@ int nfs_flush_incompatible(struct file *file, struct page *page) if (l_ctx && flctx && !(list_empty_careful(&flctx->flc_posix) && list_empty_careful(&flctx->flc_flock))) { - do_flush |= l_ctx->lockowner.l_owner != current->files - || l_ctx->lockowner.l_pid != current->tgid; + do_flush |= l_ctx->lockowner != current->files; } nfs_release_request(req); if (!do_flush) diff --git a/fs/nfs_common/grace.c b/fs/nfs_common/grace.c index fd8c9a5bcac4..420d3a0ab258 100644 --- a/fs/nfs_common/grace.c +++ b/fs/nfs_common/grace.c @@ -9,7 +9,7 @@ #include <net/netns/generic.h> #include <linux/fs.h> -static int grace_net_id; +static unsigned int grace_net_id; static DEFINE_SPINLOCK(grace_lock); /** diff --git a/fs/nfsd/fault_inject.c b/fs/nfsd/fault_inject.c index c16bf5af6831..34c1c449fddf 100644 --- a/fs/nfsd/fault_inject.c +++ b/fs/nfsd/fault_inject.c @@ -10,7 +10,7 @@ #include <linux/module.h> #include <linux/nsproxy.h> #include <linux/sunrpc/addr.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "state.h" #include "netns.h" diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index ee36efd5aece..3714231a9d0f 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -124,5 +124,5 @@ struct nfsd_net { /* Simple check to find out if a given net was properly initialized */ #define nfsd_netns_ready(nn) ((nn)->sessionid_hashtbl) -extern int nfsd_net_id; +extern unsigned int nfsd_net_id; #endif /* __NFSD_NETNS_H__ */ diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 211dc2aed8e1..eb78109d666c 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -1061,7 +1061,7 @@ static const struct rpc_call_ops nfsd4_cb_ops = { int nfsd4_create_callback_queue(void) { - callback_wq = create_singlethread_workqueue("nfsd4_callbacks"); + callback_wq = alloc_ordered_workqueue("nfsd4_callbacks", 0); if (!callback_wq) return -ENOMEM; return 0; diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index 42aace4fc4c8..596205d939a1 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -686,10 +686,6 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) return 0; } /* Fallthrough */ - case -NFS4ERR_NOMATCHING_LAYOUT: - trace_layout_recall_done(&ls->ls_stid.sc_stateid); - task->tk_status = 0; - return 1; default: /* * Unknown error or non-responding client, we'll need to fence. @@ -702,6 +698,10 @@ nfsd4_cb_layout_done(struct nfsd4_callback *cb, struct rpc_task *task) else nfsd4_cb_layout_fail(ls); return -1; + case -NFS4ERR_NOMATCHING_LAYOUT: + trace_layout_recall_done(&ls->ls_stid.sc_stateid); + task->tk_status = 0; + return 1; } } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index abb09b580389..74a6e573e061 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -96,33 +96,15 @@ check_attr_support(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, { struct dentry *dentry = cstate->current_fh.fh_dentry; - /* - * Check about attributes are supported by the NFSv4 server or not. - * According to spec, unsupported attributes return ERR_ATTRNOTSUPP. 
- */ - if ((bmval[0] & ~nfsd_suppattrs0(cstate->minorversion)) || - (bmval[1] & ~nfsd_suppattrs1(cstate->minorversion)) || - (bmval[2] & ~nfsd_suppattrs2(cstate->minorversion))) + if (!nfsd_attrs_supported(cstate->minorversion, bmval)) return nfserr_attrnotsupp; - - /* - * Check FATTR4_WORD0_ACL can be supported - * in current environment or not. - */ - if (bmval[0] & FATTR4_WORD0_ACL) { - if (!IS_POSIXACL(d_inode(dentry))) - return nfserr_attrnotsupp; - } - - /* - * According to spec, read-only attributes return ERR_INVAL. - */ - if (writable) { - if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) || - (bmval[2] & ~writable[2])) - return nfserr_inval; - } - + if ((bmval[0] & FATTR4_WORD0_ACL) && !IS_POSIXACL(d_inode(dentry))) + return nfserr_attrnotsupp; + if (writable && !bmval_is_subset(bmval, writable)) + return nfserr_inval; + if (writable && (bmval[2] & FATTR4_WORD2_MODE_UMASK) && + (bmval[1] & FATTR4_WORD1_MODE)) + return nfserr_inval; return nfs_ok; } @@ -695,9 +677,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) return nfserr_inval; - getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); - getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); - getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); + getattr->ga_bmval[0] &= nfsd_suppattrs[cstate->minorversion][0]; + getattr->ga_bmval[1] &= nfsd_suppattrs[cstate->minorversion][1]; + getattr->ga_bmval[2] &= nfsd_suppattrs[cstate->minorversion][2]; getattr->ga_fhp = &cstate->current_fh; return nfs_ok; @@ -799,9 +781,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) return nfserr_inval; - readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion); - readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion); - readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion); + readdir->rd_bmval[0] &= nfsd_suppattrs[cstate->minorversion][0]; + readdir->rd_bmval[1] &= nfsd_suppattrs[cstate->minorversion][1]; + readdir->rd_bmval[2] &= nfsd_suppattrs[cstate->minorversion][2]; if ((cookie == 1) || (cookie == 2) || (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE))) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index c2d2895a1ec1..8fae53ce21d1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -33,6 +33,7 @@ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include <linux/fs_struct.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/namei.h> @@ -57,6 +58,20 @@ #define NFSDDBG_FACILITY NFSDDBG_XDR +u32 nfsd_suppattrs[3][3] = { + {NFSD4_SUPPORTED_ATTRS_WORD0, + NFSD4_SUPPORTED_ATTRS_WORD1, + NFSD4_SUPPORTED_ATTRS_WORD2}, + + {NFSD4_1_SUPPORTED_ATTRS_WORD0, + NFSD4_1_SUPPORTED_ATTRS_WORD1, + NFSD4_1_SUPPORTED_ATTRS_WORD2}, + + {NFSD4_1_SUPPORTED_ATTRS_WORD0, + NFSD4_1_SUPPORTED_ATTRS_WORD1, + NFSD4_2_SUPPORTED_ATTRS_WORD2}, +}; + /* * As per referral draft, the fsid for a referral MUST be different from the fsid of the containing * directory in order to indicate to the client that a filesystem boundary is present @@ -285,7 +300,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval) static __be32 nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, struct nfs4_acl **acl, - struct xdr_netobj *label) + struct xdr_netobj *label, int *umask) { int expected_len, len = 0; u32 dummy32; @@ -296,6 +311,14 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, if ((status = nfsd4_decode_bitmap(argp, bmval))) return status; + if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 + || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 + || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) { + if (nfsd_attrs_supported(argp->minorversion, bmval)) + return nfserr_inval; + return nfserr_attrnotsupp; + } + READ_BUF(4); expected_len = be32_to_cpup(p++); @@ -435,12 +458,18 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, return nfserr_jukebox; } #endif - - if (bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0 - || bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1 - || bmval[2] & ~NFSD_WRITEABLE_ATTRS_WORD2) - READ_BUF(expected_len - len); - else if (len != expected_len) + if (bmval[2] & FATTR4_WORD2_MODE_UMASK) { + if (!umask) + goto xdr_error; + READ_BUF(8); + len += 8; + dummy32 = be32_to_cpup(p++); + iattr->ia_mode = dummy32 & (S_IFMT | S_IALLUGO); + dummy32 = be32_to_cpup(p++); + *umask = dummy32 & S_IRWXUGO; + iattr->ia_valid |= ATTR_MODE; + } + if (len != expected_len) goto xdr_error; DECODE_TAIL; @@ -634,7 +663,8 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create return status; status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, - &create->cr_acl, &create->cr_label); + &create->cr_acl, &create->cr_label, + ¤t->fs->umask); if (status) goto out; @@ -879,13 +909,15 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) case NFS4_OPEN_NOCREATE: break; case NFS4_OPEN_CREATE: + current->fs->umask = 0; READ_BUF(4); open->op_createmode = be32_to_cpup(p++); switch (open->op_createmode) { case NFS4_CREATE_UNCHECKED: case NFS4_CREATE_GUARDED: status = nfsd4_decode_fattr(argp, open->op_bmval, - &open->op_iattr, &open->op_acl, &open->op_label); + &open->op_iattr, &open->op_acl, &open->op_label, + ¤t->fs->umask); if (status) goto out; break; @@ -899,7 +931,8 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open) READ_BUF(NFS4_VERIFIER_SIZE); COPYMEM(open->op_verf.data, NFS4_VERIFIER_SIZE); status = nfsd4_decode_fattr(argp, open->op_bmval, - &open->op_iattr, &open->op_acl, &open->op_label); + &open->op_iattr, &open->op_acl, &open->op_label, + ¤t->fs->umask); if (status) goto out; break; @@ -1136,7 +1169,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta if (status) return status; return nfsd4_decode_fattr(argp, setattr->sa_bmval, &setattr->sa_iattr, - &setattr->sa_acl, &setattr->sa_label); + 
&setattr->sa_acl, &setattr->sa_label, NULL); } static __be32 @@ -2340,9 +2373,7 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); - BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion)); - BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion)); - BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion)); + BUG_ON(!nfsd_attrs_supported(minorversion, bmval)); if (exp->ex_fslocs.migrated) { status = fattr_handle_absent_fs(&bmval0, &bmval1, &bmval2, &rdattr_err); @@ -2409,29 +2440,29 @@ nfsd4_encode_fattr(struct xdr_stream *xdr, struct svc_fh *fhp, p++; /* to be backfilled later */ if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { - u32 word0 = nfsd_suppattrs0(minorversion); - u32 word1 = nfsd_suppattrs1(minorversion); - u32 word2 = nfsd_suppattrs2(minorversion); + u32 supp[3]; + + memcpy(supp, nfsd_suppattrs[minorversion], sizeof(supp)); if (!IS_POSIXACL(dentry->d_inode)) - word0 &= ~FATTR4_WORD0_ACL; + supp[0] &= ~FATTR4_WORD0_ACL; if (!contextsupport) - word2 &= ~FATTR4_WORD2_SECURITY_LABEL; - if (!word2) { + supp[2] &= ~FATTR4_WORD2_SECURITY_LABEL; + if (!supp[2]) { p = xdr_reserve_space(xdr, 12); if (!p) goto out_resource; *p++ = cpu_to_be32(2); - *p++ = cpu_to_be32(word0); - *p++ = cpu_to_be32(word1); + *p++ = cpu_to_be32(supp[0]); + *p++ = cpu_to_be32(supp[1]); } else { p = xdr_reserve_space(xdr, 16); if (!p) goto out_resource; *p++ = cpu_to_be32(3); - *p++ = cpu_to_be32(word0); - *p++ = cpu_to_be32(word1); - *p++ = cpu_to_be32(word2); + *p++ = cpu_to_be32(supp[0]); + *p++ = cpu_to_be32(supp[1]); + *p++ = cpu_to_be32(supp[2]); } } if (bmval0 & FATTR4_WORD0_TYPE) { @@ -3576,10 +3607,10 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd if (!p) return nfserr_resource; /* - * XXX: By default, the ->readlink() VFS op will truncate symlinks - * if they would overflow the buffer. Is this kosher in NFSv4? If - * not, one easy fix is: if ->readlink() precisely fills the buffer, - * assume that truncation occurred, and return NFS4ERR_RESOURCE. + * XXX: By default, vfs_readlink() will truncate symlinks if they + * would overflow the buffer. Is this kosher in NFSv4? If not, one + * easy fix is: if vfs_readlink() precisely fills the buffer, assume + * that truncation occurred, and return NFS4ERR_RESOURCE. 
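+ * That fix would amount to something like the following, using
+ * the names already in scope in this function (hypothetical, not
+ * implemented by this patch):
+ *
+ * if (!nfserr && maxcount == PAGE_SIZE)
+ * nfserr = nfserr_resource;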
*/ nfserr = nfsd_readlink(readlink->rl_rqstp, readlink->rl_fhp, (char *)p, &maxcount); diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index 54cde9a5864e..d6b97b424ad1 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -9,6 +9,7 @@ */ #include <linux/slab.h> +#include <linux/vmalloc.h> #include <linux/sunrpc/addr.h> #include <linux/highmem.h> #include <linux/log2.h> @@ -174,8 +175,12 @@ int nfsd_reply_cache_init(void) goto out_nomem; drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL); - if (!drc_hashtbl) - goto out_nomem; + if (!drc_hashtbl) { + drc_hashtbl = vzalloc(hashsize * sizeof(*drc_hashtbl)); + if (!drc_hashtbl) + goto out_nomem; + } + for (i = 0; i < hashsize; i++) { INIT_LIST_HEAD(&drc_hashtbl[i].lru_head); spin_lock_init(&drc_hashtbl[i].cache_lock); @@ -204,7 +209,7 @@ void nfsd_reply_cache_shutdown(void) } } - kfree (drc_hashtbl); + kvfree(drc_hashtbl); drc_hashtbl = NULL; drc_hashsize = 0; diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 36b2af931e06..f3b2f34b10a3 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -217,7 +217,7 @@ static const struct file_operations pool_stats_operations = { .release = nfsd_pool_stats_release, }; -static struct file_operations reply_cache_stats_operations = { +static const struct file_operations reply_cache_stats_operations = { .open = nfsd_reply_cache_stats_open, .read = seq_read, .llseek = seq_lseek, @@ -1201,7 +1201,7 @@ static int create_proc_exports_entry(void) } #endif -int nfsd_net_id; +unsigned int nfsd_net_id; static __net_init int nfsd_init_net(struct net *net) { diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 9446849888d5..d74c8c44dc35 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -359,44 +359,46 @@ void nfsd_lockd_shutdown(void); #define NFSD4_2_SUPPORTED_ATTRS_WORD2 \ (NFSD4_1_SUPPORTED_ATTRS_WORD2 | \ + FATTR4_WORD2_MODE_UMASK | \ NFSD4_2_SECURITY_ATTRS) -static inline u32 nfsd_suppattrs0(u32 minorversion) -{ - return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD0 - : NFSD4_SUPPORTED_ATTRS_WORD0; -} +extern u32 nfsd_suppattrs[3][3]; -static inline u32 nfsd_suppattrs1(u32 minorversion) +static inline bool bmval_is_subset(u32 *bm1, u32 *bm2) { - return minorversion ? NFSD4_1_SUPPORTED_ATTRS_WORD1 - : NFSD4_SUPPORTED_ATTRS_WORD1; + return !((bm1[0] & ~bm2[0]) || + (bm1[1] & ~bm2[1]) || + (bm1[2] & ~bm2[2])); } -static inline u32 nfsd_suppattrs2(u32 minorversion) +static inline bool nfsd_attrs_supported(u32 minorversion, u32 *bmval) { - switch (minorversion) { - default: return NFSD4_2_SUPPORTED_ATTRS_WORD2; - case 1: return NFSD4_1_SUPPORTED_ATTRS_WORD2; - case 0: return NFSD4_SUPPORTED_ATTRS_WORD2; - } + return bmval_is_subset(bmval, nfsd_suppattrs[minorversion]); } /* These will return ERR_INVAL if specified in GETATTR or READDIR. */ #define NFSD_WRITEONLY_ATTRS_WORD1 \ (FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) -/* These are the only attrs allowed in CREATE/OPEN/SETATTR. */ +/* + * These are the only attrs allowed in CREATE/OPEN/SETATTR. Don't add + * a writeable attribute here without also adding code to parse it to + * nfsd4_decode_fattr(). 
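+ * FATTR4_WORD2_MODE_UMASK, added below, is the model: its decode
+ * side in nfsd4_decode_fattr() pulls the (mode, umask) pair off
+ * the wire, roughly:
+ *
+ * iattr->ia_mode = be32_to_cpup(p++) & (S_IFMT | S_IALLUGO);
+ * *umask = be32_to_cpup(p++) & S_IRWXUGO;
+ * iattr->ia_valid |= ATTR_MODE;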
+ */ #define NFSD_WRITEABLE_ATTRS_WORD0 \ (FATTR4_WORD0_SIZE | FATTR4_WORD0_ACL) #define NFSD_WRITEABLE_ATTRS_WORD1 \ (FATTR4_WORD1_MODE | FATTR4_WORD1_OWNER | FATTR4_WORD1_OWNER_GROUP \ | FATTR4_WORD1_TIME_ACCESS_SET | FATTR4_WORD1_TIME_MODIFY_SET) #ifdef CONFIG_NFSD_V4_SECURITY_LABEL -#define NFSD_WRITEABLE_ATTRS_WORD2 FATTR4_WORD2_SECURITY_LABEL +#define MAYBE_FATTR4_WORD2_SECURITY_LABEL \ + FATTR4_WORD2_SECURITY_LABEL #else -#define NFSD_WRITEABLE_ATTRS_WORD2 0 +#define MAYBE_FATTR4_WORD2_SECURITY_LABEL 0 #endif +#define NFSD_WRITEABLE_ATTRS_WORD2 \ + (FATTR4_WORD2_MODE_UMASK \ + | MAYBE_FATTR4_WORD2_SECURITY_LABEL) #define NFSD_SUPPATTR_EXCLCREAT_WORD0 \ NFSD_WRITEABLE_ATTRS_WORD0 diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index a2b65fc56dd6..e6bfd96734c0 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -661,8 +661,8 @@ nfsd(void *vrqstp) mutex_lock(&nfsd_mutex); /* At this point, the thread shares current->fs - * with the init process. We need to create files with a - * umask of 0 instead of init's umask. */ + * with the init process. We need to create files with the + * umask as defined by the client instead of init's umask. */ if (unshare_fs_struct() < 0) { printk("Unable to start nfsd thread: out of memory\n"); goto out; diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 8ca642fe9b21..26c6fdb4bf67 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -26,7 +26,7 @@ #include <linux/jhash.h> #include <linux/ima.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/exportfs.h> #include <linux/writeback.h> #include <linux/security.h> @@ -509,8 +509,7 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst, u64 dst_pos, u64 count) { - return nfserrno(vfs_clone_file_range(src, src_pos, dst, dst_pos, - count)); + return nfserrno(do_clone_file_range(src, src_pos, dst, dst_pos, count)); } ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst, @@ -1451,7 +1450,6 @@ do_nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) { - struct inode *inode; mm_segment_t oldfs; __be32 err; int host_err; @@ -1463,10 +1461,9 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) path.mnt = fhp->fh_export->ex_path.mnt; path.dentry = fhp->fh_dentry; - inode = d_inode(path.dentry); err = nfserr_inval; - if (!inode->i_op->readlink) + if (!d_is_symlink(path.dentry)) goto out; touch_atime(&path); @@ -1475,7 +1472,7 @@ nfsd_readlink(struct svc_rqst *rqstp, struct svc_fh *fhp, char *buf, int *lenp) */ oldfs = get_fs(); set_fs(KERNEL_DS); - host_err = inode->i_op->readlink(path.dentry, (char __user *)buf, *lenp); + host_err = vfs_readlink(path.dentry, (char __user *)buf, *lenp); set_fs(oldfs); if (host_err < 0) diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 2b71c60fe982..515d13c196da 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -568,7 +568,6 @@ const struct inode_operations nilfs_special_inode_operations = { }; const struct inode_operations nilfs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .permission = nilfs_permission, }; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index c95d369e90aa..12eeae62a2b1 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -189,7 +189,7 @@ static int nilfs_sync_super(struct super_block *sb, int flag) 
set_buffer_dirty(nilfs->ns_sbh[0]); if (nilfs_test_opt(nilfs, BARRIER)) { err = __sync_dirty_buffer(nilfs->ns_sbh[0], - WRITE_SYNC | WRITE_FLUSH_FUA); + REQ_SYNC | REQ_PREFLUSH | REQ_FUA); } else { err = sync_dirty_buffer(nilfs->ns_sbh[0]); } diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index 6faaf710e563..5a4ec309e283 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -85,7 +85,7 @@ static int dnotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { struct dnotify_mark *dn_mark; diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index e0e5f7c3c99f..bbc175d4213d 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -90,10 +90,10 @@ static int fanotify_get_response(struct fsnotify_group *group, static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmnt_mark, u32 event_mask, - void *data, int data_type) + const void *data, int data_type) { __u32 marks_mask, marks_ignored_mask; - struct path *path = data; + const struct path *path = data; pr_debug("%s: inode_mark=%p vfsmnt_mark=%p mask=%x data=%p" " data_type=%d\n", __func__, inode_mark, vfsmnt_mark, @@ -140,7 +140,7 @@ static bool fanotify_should_send_event(struct fsnotify_mark *inode_mark, } struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, - struct path *path) + const struct path *path) { struct fanotify_event_info *event; @@ -177,7 +177,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *fanotify_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { int ret = 0; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index 2a5fb14115df..4500a74f8d38 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -47,4 +47,4 @@ static inline struct fanotify_event_info *FANOTIFY_E(struct fsnotify_event *fse) } struct fanotify_event_info *fanotify_alloc_event(struct inode *inode, u32 mask, - struct path *path); + const struct path *path); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index db39de2dd4cb..b41515d3f081 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -86,7 +86,7 @@ void __fsnotify_update_child_dentry_flags(struct inode *inode) } /* Notify this dentry's parent about a child's events. */ -int __fsnotify_parent(struct path *path, struct dentry *dentry, __u32 mask) +int __fsnotify_parent(const struct path *path, struct dentry *dentry, __u32 mask) { struct dentry *parent; struct inode *p_inode; @@ -125,7 +125,7 @@ EXPORT_SYMBOL_GPL(__fsnotify_parent); static int send_to_group(struct inode *to_tell, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - __u32 mask, void *data, + __u32 mask, const void *data, int data_is, u32 cookie, const unsigned char *file_name) { @@ -187,7 +187,7 @@ static int send_to_group(struct inode *to_tell, * out to all of the registered fsnotify_group. Those groups can then use the * notification event in whatever means they feel necessary. 
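* With this change the event payload travels as a const pointer.
* A typical path-based caller now looks roughly like this
* (illustrative only):
*
* const struct path *path = &file->f_path;
* fsnotify(inode, FS_MODIFY, path, FSNOTIFY_EVENT_PATH, NULL, 0);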
*/ -int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, +int fsnotify(struct inode *to_tell, __u32 mask, const void *data, int data_is, const unsigned char *file_name, u32 cookie) { struct hlist_node *inode_node = NULL, *vfsmount_node = NULL; @@ -199,7 +199,7 @@ int fsnotify(struct inode *to_tell, __u32 mask, void *data, int data_is, __u32 test_mask = (mask & ~FS_EVENT_ON_CHILD); if (data_is == FSNOTIFY_EVENT_PATH) - mnt = real_mount(((struct path *)data)->mnt); + mnt = real_mount(((const struct path *)data)->mnt); else mnt = NULL; diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 741077deef3b..a3645249f7ec 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -150,12 +150,10 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, */ void fsnotify_unmount_inodes(struct super_block *sb) { - struct inode *inode, *next_i, *need_iput = NULL; + struct inode *inode, *iput_inode = NULL; spin_lock(&sb->s_inode_list_lock); - list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) { - struct inode *need_iput_tmp; - + list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We cannot __iget() an inode in state I_FREEING, * I_WILL_FREE, or I_NEW which is fine because by that point @@ -178,49 +176,24 @@ void fsnotify_unmount_inodes(struct super_block *sb) continue; } - need_iput_tmp = need_iput; - need_iput = NULL; - - /* In case fsnotify_inode_delete() drops a reference. */ - if (inode != need_iput_tmp) - __iget(inode); - else - need_iput_tmp = NULL; + __iget(inode); spin_unlock(&inode->i_lock); - - /* In case the dropping of a reference would nuke next_i. */ - while (&next_i->i_sb_list != &sb->s_inodes) { - spin_lock(&next_i->i_lock); - if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) && - atomic_read(&next_i->i_count)) { - __iget(next_i); - need_iput = next_i; - spin_unlock(&next_i->i_lock); - break; - } - spin_unlock(&next_i->i_lock); - next_i = list_next_entry(next_i, i_sb_list); - } - - /* - * We can safely drop s_inode_list_lock here because either - * we actually hold references on both inode and next_i or - * end of list. Also no new inodes will be added since the - * umount has begun. 
- */ spin_unlock(&sb->s_inode_list_lock); - if (need_iput_tmp) - iput(need_iput_tmp); + if (iput_inode) + iput(iput_inode); /* for each watch, send FS_UNMOUNT and then remove it */ fsnotify(inode, FS_UNMOUNT, inode, FSNOTIFY_EVENT_INODE, NULL, 0); fsnotify_inode_delete(inode); - iput(inode); + iput_inode = inode; spin_lock(&sb->s_inode_list_lock); } spin_unlock(&sb->s_inode_list_lock); + + if (iput_inode) + iput(iput_inode); } diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index ed855ef6f077..a6f5907a3fee 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -26,7 +26,7 @@ extern int inotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie); extern const struct fsnotify_ops inotify_fsnotify_ops; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 2cd900c2c737..19e7ec109a75 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -66,7 +66,7 @@ int inotify_handle_event(struct fsnotify_group *group, struct inode *inode, struct fsnotify_mark *inode_mark, struct fsnotify_mark *vfsmount_mark, - u32 mask, void *data, int data_type, + u32 mask, const void *data, int data_type, const unsigned char *file_name, u32 cookie) { struct inotify_inode_mark *i_mark; @@ -80,7 +80,7 @@ int inotify_handle_event(struct fsnotify_group *group, if ((inode_mark->mask & FS_EXCL_UNLINK) && (data_type == FSNOTIFY_EVENT_PATH)) { - struct path *path = data; + const struct path *path = data; if (d_unlinked(path->dentry)) return 0; diff --git a/fs/notify/mark.c b/fs/notify/mark.c index d3fea0bd89e2..6043306e8e21 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -510,18 +510,6 @@ void fsnotify_detach_group_marks(struct fsnotify_group *group) } } -void fsnotify_duplicate_mark(struct fsnotify_mark *new, struct fsnotify_mark *old) -{ - assert_spin_locked(&old->lock); - new->inode = old->inode; - new->mnt = old->mnt; - if (old->group) - fsnotify_get_group(old->group); - new->group = old->group; - new->mask = old->mask; - new->free_mark = old->free_mark; -} - /* * Nothing fancy, just initialize lists and locks and counters. */ diff --git a/fs/nsfs.c b/fs/nsfs.c index 8718af895eab..8c9fb29c6673 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -118,7 +118,7 @@ again: return ret; } -static int open_related_ns(struct ns_common *ns, +int open_related_ns(struct ns_common *ns, struct ns_common *(*get_ns)(struct ns_common *ns)) { struct path path = {}; diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index fe251f187ff8..cc91856b5e2d 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -29,6 +29,7 @@ #include <linux/buffer_head.h> #include <linux/writeback.h> #include <linux/bit_spinlock.h> +#include <linux/bio.h> #include "aops.h" #include "attrib.h" @@ -764,7 +765,7 @@ lock_retry_remap: } // TODO: Instantiate the hole. // clear_buffer_new(bh); - // unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + // clean_bdev_bh_alias(bh); ntfs_error(vol->sb, "Writing into sparse regions is " "not supported yet. 
Sorry."); err = -EOPNOTSUPP; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index bf72a2c58b75..358ed7e1195a 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -30,7 +30,7 @@ #include <linux/writeback.h> #include <asm/page.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "attrib.h" #include "bitmap.h" @@ -740,8 +740,7 @@ map_buffer_cached: set_buffer_uptodate(bh); if (unlikely(was_hole)) { /* We allocated the buffer. */ - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); if (bh_end <= pos || bh_pos >= end) mark_buffer_dirty(bh); else @@ -784,7 +783,7 @@ map_buffer_cached: continue; } /* We allocated the buffer. */ - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); /* * If the buffer is fully outside the write, zero it, * set it uptodate, and mark it dirty so it gets diff --git a/fs/ntfs/logfile.c b/fs/ntfs/logfile.c index 761f12f7f3ef..353379ff6057 100644 --- a/fs/ntfs/logfile.c +++ b/fs/ntfs/logfile.c @@ -27,6 +27,7 @@ #include <linux/buffer_head.h> #include <linux/bitops.h> #include <linux/log2.h> +#include <linux/bio.h> #include "attrib.h" #include "aops.h" diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index d3c009626032..b6f402194f02 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c @@ -23,6 +23,7 @@ #include <linux/buffer_head.h> #include <linux/slab.h> #include <linux/swap.h> +#include <linux/bio.h> #include "attrib.h" #include "aops.h" diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f72712f6c28d..d4ec0d8961a6 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5194,7 +5194,7 @@ int ocfs2_change_extent_flag(handle_t *handle, rec = &el->l_recs[index]; if (new_flags && (rec->e_flags & new_flags)) { mlog(ML_ERROR, "Owner %llu tried to set %d flags on an " - "extent that already had them", + "extent that already had them\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), new_flags); goto out; @@ -5202,7 +5202,7 @@ int ocfs2_change_extent_flag(handle_t *handle, if (clear_flags && !(rec->e_flags & clear_flags)) { mlog(ML_ERROR, "Owner %llu tried to clear %d flags on an " - "extent that didn't have them", + "extent that didn't have them\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci), clear_flags); goto out; @@ -5713,8 +5713,7 @@ int ocfs2_remove_btree_range(struct inode *inode, struct ocfs2_refcount_tree *ref_tree = NULL; if ((flags & OCFS2_EXT_REFCOUNTED) && len) { - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); if (!refcount_tree_locked) { ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index c5c5b9748ea3..11556b7d93ec 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -464,6 +464,15 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block) trace_ocfs2_bmap((unsigned long long)OCFS2_I(inode)->ip_blkno, (unsigned long long)block); + /* + * The swap code (ab-)uses ->bmap to get a block mapping and then + * bypasseÑ• the file system for actual I/O. We really can't allow + * that on refcounted inodes, so we have to skip out here. And yes, + * 0 is the magic code for a bmap error.. + */ + if (ocfs2_is_refcount_inode(inode)) + return 0; + /* We don't need to lock journal system files, since they aren't * accessed concurrently from multiple nodes. 
*/ @@ -630,7 +639,7 @@ int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno, if (!buffer_mapped(bh)) { map_bh(bh, inode->i_sb, *p_blkno); - unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr); + clean_bdev_bh_alias(bh); } if (PageUptodate(page)) { @@ -1950,8 +1959,7 @@ static void ocfs2_write_end_inline(struct inode *inode, loff_t pos, } int ocfs2_write_end_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + loff_t pos, unsigned len, unsigned copied, void *fsdata) { int i, ret; unsigned from, to, start = pos & (PAGE_SIZE - 1); @@ -2064,7 +2072,7 @@ static int ocfs2_write_end(struct file *file, struct address_space *mapping, int ret; struct inode *inode = mapping->host; - ret = ocfs2_write_end_nolock(mapping, pos, len, copied, page, fsdata); + ret = ocfs2_write_end_nolock(mapping, pos, len, copied, fsdata); up_write(&OCFS2_I(inode)->ip_alloc_sem); ocfs2_inode_unlock(inode, 1); @@ -2241,7 +2249,7 @@ static int ocfs2_dio_get_block(struct inode *inode, sector_t iblock, dwc->dw_zero_count++; } - ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, NULL, wc); + ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc); BUG_ON(ret != len); ret = 0; unlock: @@ -2254,10 +2262,10 @@ out: return ret; } -static void ocfs2_dio_end_io_write(struct inode *inode, - struct ocfs2_dio_write_ctxt *dwc, - loff_t offset, - ssize_t bytes) +static int ocfs2_dio_end_io_write(struct inode *inode, + struct ocfs2_dio_write_ctxt *dwc, + loff_t offset, + ssize_t bytes) { struct ocfs2_cached_dealloc_ctxt dealloc; struct ocfs2_extent_tree et; @@ -2308,7 +2316,7 @@ static void ocfs2_dio_end_io_write(struct inode *inode, mlog_errno(ret); } - di = (struct ocfs2_dinode *)di_bh; + di = (struct ocfs2_dinode *)di_bh->b_data; ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); @@ -2365,6 +2373,8 @@ out: if (locked) inode_unlock(inode); ocfs2_dio_free_write_ctx(inode, dwc); + + return ret; } /* @@ -2379,21 +2389,19 @@ static int ocfs2_dio_end_io(struct kiocb *iocb, { struct inode *inode = file_inode(iocb->ki_filp); int level; - - if (bytes <= 0) - return 0; + int ret = 0; /* this io's submitter should not have unlocked this before we could */ BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); - if (private) - ocfs2_dio_end_io_write(inode, private, offset, bytes); + if (bytes > 0 && private) + ret = ocfs2_dio_end_io_write(inode, private, offset, bytes); ocfs2_iocb_clear_rw_locked(iocb); level = ocfs2_iocb_rw_locked_level(iocb); ocfs2_rw_unlock(inode, level); - return 0; + return ret; } static ssize_t ocfs2_direct_IO(struct kiocb *iocb, struct iov_iter *iter) diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h index b1c9f28a57b1..8614ff069d99 100644 --- a/fs/ocfs2/aops.h +++ b/fs/ocfs2/aops.h @@ -44,8 +44,7 @@ int walk_page_buffers( handle_t *handle, struct buffer_head *bh)); int ocfs2_write_end_nolock(struct address_space *mapping, - loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata); + loff_t pos, unsigned len, unsigned copied, void *fsdata); typedef enum { OCFS2_WRITE_BUFFER = 0, diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c index 8f040f88ade4..d9ebe11c8990 100644 --- a/fs/ocfs2/buffer_head_io.c +++ b/fs/ocfs2/buffer_head_io.c @@ -26,6 +26,7 @@ #include <linux/fs.h> #include <linux/types.h> #include <linux/highmem.h> +#include <linux/bio.h> #include <cluster/masklog.h> diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 636abcbd4650..f6e871760f8d 100644 --- 
a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -627,7 +627,7 @@ static int o2hb_issue_node_write(struct o2hb_region *reg, slot = o2nm_this_node(); bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1, REQ_OP_WRITE, - WRITE_SYNC); + REQ_SYNC); if (IS_ERR(bio)) { status = PTR_ERR(bio); mlog_errno(status); @@ -741,7 +741,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg, hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block; memset(hb_block, 0, reg->hr_block_bytes); /* TODO: time stuff */ - cputime = CURRENT_TIME.tv_sec; + cputime = ktime_get_real_seconds(); if (!cputime) cputime = 1; @@ -1250,7 +1250,7 @@ static int o2hb_thread(void *data) mlog(ML_HEARTBEAT, "start = %lld, end = %lld, msec = %u, ret = %d\n", - before_hb.tv64, after_hb.tv64, elapsed_msec, ret); + before_hb, after_hb, elapsed_msec, ret); if (!kthread_should_stop() && elapsed_msec < reg->hr_timeout_ms) { diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index dfe162f5fd4c..d331c2386b94 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -24,7 +24,7 @@ #include <linux/proc_fs.h> #include <linux/seq_file.h> #include <linux/string.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "masklog.h" diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 8abab16b4602..d4b5c81f0445 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -62,7 +62,7 @@ #include <linux/export.h> #include <net/tcp.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "heartbeat.h" #include "tcp.h" diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 3f828a187049..a464c8088170 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1609,8 +1609,6 @@ way_up_top: __dlm_insert_mle(dlm, mle); response = DLM_MASTER_RESP_NO; } else { - // mlog(0, "mle was found\n"); - set_maybe = 1; spin_lock(&tmpmle->spinlock); if (tmpmle->master == dlm->node_num) { mlog(ML_ERROR, "no lockres, but an mle with this node as master!\n"); @@ -1625,8 +1623,7 @@ way_up_top: response = DLM_MASTER_RESP_NO; } else response = DLM_MASTER_RESP_MAYBE; - if (set_maybe) - set_bit(request->node_idx, tmpmle->maybe_map); + set_bit(request->node_idx, tmpmle->maybe_map); spin_unlock(&tmpmle->spinlock); } spin_unlock(&dlm->master_lock); @@ -1644,12 +1641,6 @@ send_response: * dlm_assert_master_worker() isn't called, we drop it here. 
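* The dispatch that follows boils down to queueing an assert-master
* worker and failing the response if that cannot be done; roughly
* (sketch only, the real code also unlocks and drops the ref on
* error):
*
* if (dlm_dispatch_assert_master(dlm, res, 0, request->node_idx,
* DLM_ASSERT_MASTER_MLE_CLEANUP) < 0)
* response = DLM_MASTER_RESP_ERROR;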
*/ if (dispatch_assert) { - if (response != DLM_MASTER_RESP_YES) - mlog(ML_ERROR, "invalid response %d\n", response); - if (!res) { - mlog(ML_ERROR, "bad lockres while trying to assert!\n"); - BUG(); - } mlog(0, "%u is the owner of %.*s, cleaning everyone else\n", dlm->node_num, res->lockname.len, res->lockname.name); spin_lock(&res->spinlock); diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index dd5cb8bcefd1..74407c6dd592 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -2966,8 +2966,6 @@ int dlm_finalize_reco_handler(struct o2net_msg *msg, u32 len, void *data, spin_unlock(&dlm->spinlock); dlm_kick_recovery_thread(dlm); break; - default: - BUG(); } mlog(0, "%s: recovery done, reco master was %u, dead now %u, master now %u\n", diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 1079fae5aa12..9ab9e1892b5f 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -45,7 +45,7 @@ #include <linux/backing-dev.h> #include <linux/poll.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "stackglue.h" #include "userdlm.h" diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 83d576f6a287..77d1632e905d 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -3303,6 +3303,16 @@ static int ocfs2_downconvert_lock(struct ocfs2_super *osb, mlog(ML_BASTS, "lockres %s, level %d => %d\n", lockres->l_name, lockres->l_level, new_level); + /* + * On DLM_LKF_VALBLK, fsdlm behaves differently from o2cb. It always + * expects DLM_LKF_VALBLK to be set if the LKB has an LVB, so that + * we can recover correctly from node failure. Otherwise, we may get + * an invalid LVB in the LKB without DLM_SBF_VALNOTVALID being set. + */ + if (!ocfs2_is_o2cb_active() && + lockres->l_ops->flags & LOCK_TYPE_USES_LVB) + lvb = 1; + if (lvb) dlm_flags |= DLM_LKF_VALBLK; diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 000c234d7bbd..c4889655d32b 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1030,7 +1030,7 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, * Only quota files call this without a bh, and they can't be * refcounted. */ - BUG_ON(!di_bh && (oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!di_bh && ocfs2_is_refcount_inode(inode)); BUG_ON(!di_bh && !(oi->ip_flags & OCFS2_INODE_SYSTEM_FILE)); clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size); @@ -1667,9 +1667,9 @@ static void ocfs2_calc_trunc_pos(struct inode *inode, *done = ret; } -static int ocfs2_remove_inode_range(struct inode *inode, - struct buffer_head *di_bh, u64 byte_start, - u64 byte_len) +int ocfs2_remove_inode_range(struct inode *inode, - struct buffer_head *di_bh, u64 byte_start, - u64 byte_len) { int ret = 0, flags = 0, done = 0, i; u32 trunc_start, trunc_len, trunc_end, trunc_cpos, phys_cpos; @@ -1719,8 +1719,7 @@ static int ocfs2_remove_inode_range(struct inode *inode, * within one cluster(means is not exactly aligned to clustersize).
*/ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { - + if (ocfs2_is_refcount_inode(inode)) { ret = ocfs2_cow_file_pos(inode, di_bh, byte_start); if (ret) { mlog_errno(ret); @@ -2036,7 +2035,7 @@ int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, struct super_block *sb = inode->i_sb; if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)) || - !(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) || + !ocfs2_is_refcount_inode(inode) || OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) return 0; @@ -2440,6 +2439,31 @@ out: return offset; } +static int ocfs2_file_clone_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len) +{ + return ocfs2_reflink_remap_range(file_in, pos_in, file_out, pos_out, + len, false); +} + +static ssize_t ocfs2_file_dedupe_range(struct file *src_file, + u64 loff, + u64 len, + struct file *dst_file, + u64 dst_loff) +{ + int error; + + error = ocfs2_reflink_remap_range(src_file, loff, dst_file, dst_loff, + len, true); + if (error) + return error; + return len; +} + const struct inode_operations ocfs2_file_iops = { .setattr = ocfs2_setattr, .getattr = ocfs2_getattr, @@ -2479,6 +2503,8 @@ const struct file_operations ocfs2_fops = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, + .clone_file_range = ocfs2_file_clone_range, + .dedupe_file_range = ocfs2_file_dedupe_range, }; const struct file_operations ocfs2_dops = { @@ -2524,6 +2550,8 @@ const struct file_operations ocfs2_fops_no_plocks = { .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, .fallocate = ocfs2_fallocate, + .clone_file_range = ocfs2_file_clone_range, + .dedupe_file_range = ocfs2_file_dedupe_range, }; const struct file_operations ocfs2_dops_no_plocks = { diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index e8c62f22215c..897fd9a2e51d 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -82,4 +82,7 @@ int ocfs2_change_file_space(struct file *file, unsigned int cmd, int ocfs2_check_range_for_refcount(struct inode *inode, loff_t pos, size_t count); +int ocfs2_remove_inode_range(struct inode *inode, + struct buffer_head *di_bh, u64 byte_start, + u64 byte_len); #endif /* OCFS2_FILE_H */ diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index c56a7679df93..382401d3e88f 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -703,7 +703,7 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail_commit; } - di->i_dtime = cpu_to_le64(CURRENT_TIME.tv_sec); + di->i_dtime = cpu_to_le64(ktime_get_real_seconds()); di->i_flags &= cpu_to_le32(~(OCFS2_VALID_FL | OCFS2_ORPHANED_FL)); ocfs2_journal_dirty(handle, di_bh); diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index 5af68fcdf9d3..9b955f732bca 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -181,4 +181,10 @@ static inline struct ocfs2_inode_info *cache_info_to_inode(struct ocfs2_caching_ return container_of(ci, struct ocfs2_inode_info, ip_metadata_cache); } +/* Does this inode have the reflink flag set? 
*/ +static inline bool ocfs2_is_refcount_inode(struct inode *inode) +{ + return (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); +} + #endif /* OCFS2_INODE_H */ diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index a244f14c6b87..d5e5fa7f0743 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1947,7 +1947,7 @@ static void ocfs2_queue_orphan_scan(struct ocfs2_super *osb) */ seqno++; os->os_count++; - os->os_scantime = CURRENT_TIME; + os->os_scantime = ktime_get_seconds(); unlock: ocfs2_orphan_scan_unlock(osb, seqno); out: @@ -2004,7 +2004,7 @@ void ocfs2_orphan_scan_start(struct ocfs2_super *osb) struct ocfs2_orphan_scan *os; os = &osb->osb_orphan_scan; - os->os_scantime = CURRENT_TIME; + os->os_scantime = ktime_get_seconds(); if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 71545ad4628c..429088786e93 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -120,8 +120,7 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, ret = VM_FAULT_NOPAGE; goto out; } - ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, - fsdata); + ret = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata); BUG_ON(ret != len); ret = VM_FAULT_LOCKED; out: diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 4e8f32eb0bdb..e52a2852d50d 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -235,10 +235,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); if ((ext_flags & OCFS2_EXT_REFCOUNTED) && *len) { - - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); - + BUG_ON(!ocfs2_is_refcount_inode(inode)); BUG_ON(!context->refcount_loc); ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, @@ -581,10 +578,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); if ((ext_flags & OCFS2_EXT_REFCOUNTED) && len) { - - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & - OCFS2_HAS_REFCOUNT_FL)); - + BUG_ON(!ocfs2_is_refcount_inode(inode)); BUG_ON(!context->refcount_loc); ret = ocfs2_lock_refcount_tree(osb, context->refcount_loc, 1, diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8d887c75765c..3b0a10d9b36f 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -516,6 +516,7 @@ static int __ocfs2_mknod_locked(struct inode *dir, struct ocfs2_extent_list *fel; u16 feat; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct timespec64 ts; *new_fe_bh = NULL; @@ -564,10 +565,11 @@ static int __ocfs2_mknod_locked(struct inode *dir, fe->i_last_eb_blk = 0; strcpy(fe->i_signature, OCFS2_INODE_SIGNATURE); fe->i_flags |= cpu_to_le32(OCFS2_VALID_FL); + ktime_get_real_ts64(&ts); fe->i_atime = fe->i_ctime = fe->i_mtime = - cpu_to_le64(CURRENT_TIME.tv_sec); + cpu_to_le64(ts.tv_sec); fe->i_mtime_nsec = fe->i_ctime_nsec = fe->i_atime_nsec = - cpu_to_le32(CURRENT_TIME.tv_nsec); + cpu_to_le32(ts.tv_nsec); fe->i_dtime = 0; /* diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index e63af7ddfe68..7e5958b0be6b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -224,7 +224,7 @@ struct ocfs2_orphan_scan { struct ocfs2_super *os_osb; struct ocfs2_lock_res os_lockres; /* lock to synchronize scans */ struct delayed_work os_orphan_scan_work; - struct timespec os_scantime; /* time this node ran the scan */ + time64_t os_scantime; /* time this 
node ran the scan */ u32 os_count; /* tracks node specific scans */ u32 os_seqno; /* tracks cluster wide scans */ atomic_t os_state; /* ACTIVE or INACTIVE */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 87e577a49b0d..cec495a921e3 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -634,7 +634,15 @@ static void qsync_work_fn(struct work_struct *work) dqi_sync_work.work); struct super_block *sb = oinfo->dqi_gqinode->i_sb; - dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); + /* + * We have to be careful here not to deadlock on s_umount as umount + * disabling quotas may be in progress and it waits for this work to + * complete. If trylock fails, we'll do the sync next time... + */ + if (down_read_trylock(&sb->s_umount)) { + dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); + up_read(&sb->s_umount); + } schedule_delayed_work(&oinfo->dqi_sync_work, msecs_to_jiffies(oinfo->dqi_syncms)); } diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 8a54fd8a4fa5..32c5a40c1257 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -454,7 +454,7 @@ out: /* Sync changes in local quota file into global quota file and * reinitialize local quota file. * The function expects local quota file to be already locked and - * dqonoff_mutex locked. */ + * s_umount locked in shared mode. */ static int ocfs2_recover_local_quota_file(struct inode *lqinode, int type, struct ocfs2_quota_recovery *rec) @@ -597,7 +597,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for " "slot %u\n", osb->dev_str, slot_num); - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + down_read(&sb->s_umount); for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (list_empty(&(rec->r_list[type]))) continue; @@ -674,7 +674,7 @@ out_put: break; } out: - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + up_read(&sb->s_umount); kfree(rec); return status; } @@ -840,7 +840,10 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) } ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); - /* dqonoff_mutex protects us against racing with recovery thread... */ + /* + * s_umount held in exclusive mode protects us against racing with + * recovery thread... 
+ */ if (oinfo->dqi_rec) { ocfs2_free_quota_recovery(oinfo->dqi_rec); mark_clean = 0; diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 19238512a324..f8933cb53d68 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -34,6 +34,7 @@ #include "xattr.h" #include "namei.h" #include "ocfs2_trace.h" +#include "file.h" #include <linux/bio.h> #include <linux/blkdev.h> @@ -410,7 +411,7 @@ static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno) goto out; } - BUG_ON(!(OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); di = (struct ocfs2_dinode *)di_bh->b_data; *ref_blkno = le64_to_cpu(di->i_refcount_loc); @@ -478,7 +479,6 @@ again: if (ret) { mlog_errno(ret); ocfs2_unlock_refcount_tree(osb, tree, rw); - ocfs2_refcount_tree_put(tree); goto out; } @@ -570,7 +570,7 @@ static int ocfs2_create_refcount_tree(struct inode *inode, u32 num_got; u64 suballoc_loc, first_blkno; - BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + BUG_ON(ocfs2_is_refcount_inode(inode)); trace_ocfs2_create_refcount_tree( (unsigned long long)OCFS2_I(inode)->ip_blkno); @@ -708,7 +708,7 @@ static int ocfs2_set_refcount_tree(struct inode *inode, struct ocfs2_refcount_block *rb; struct ocfs2_refcount_tree *ref_tree; - BUG_ON(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL); + BUG_ON(ocfs2_is_refcount_inode(inode)); ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1, &ref_tree, &ref_root_bh); @@ -775,7 +775,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc); u16 bit = 0; - if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) + if (!ocfs2_is_refcount_inode(inode)) return 0; BUG_ON(!ref_blkno); @@ -2299,11 +2299,10 @@ int ocfs2_decrease_refcount(struct inode *inode, { int ret; u64 ref_blkno; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_get_refcount_block(inode, &ref_blkno); if (ret) { @@ -2533,7 +2532,6 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, int *ref_blocks) { int ret; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *tree; u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno); @@ -2544,7 +2542,7 @@ int ocfs2_prepare_refcount_change_for_del(struct inode *inode, goto out; } - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), refcount_loc, &tree); @@ -3412,14 +3410,13 @@ static int ocfs2_refcount_cow_hunk(struct inode *inode, { int ret; u32 cow_start = 0, cow_len = 0; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; struct buffer_head *ref_root_bh = NULL; struct ocfs2_refcount_tree *ref_tree; struct ocfs2_cow_context *context = NULL; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list, cpos, write_len, max_cpos, @@ -3629,11 +3626,10 @@ int ocfs2_refcount_cow_xattr(struct inode *inode, { int ret; struct ocfs2_xattr_value_root *xv = vb->vb_xv; - struct ocfs2_inode_info *oi = OCFS2_I(inode); struct ocfs2_cow_context *context = NULL; 
u32 cow_start, cow_len; - BUG_ON(!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)); + BUG_ON(!ocfs2_is_refcount_inode(inode)); ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list, cpos, write_len, UINT_MAX, @@ -3696,6 +3692,9 @@ int ocfs2_add_refcount_flag(struct inode *inode, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct ocfs2_alloc_context *meta_ac = NULL; + /* We need to be able to handle at least an extent tree split. */ + ref_blocks = ocfs2_extend_meta_needed(data_et->et_root_el); + ret = ocfs2_calc_refcount_meta_credits(inode->i_sb, ref_ci, ref_root_bh, p_cluster, num_clusters, @@ -3807,7 +3806,7 @@ static int ocfs2_attach_refcount_tree(struct inode *inode, ocfs2_init_dealloc_ctxt(&dealloc); - if (!(oi->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL)) { + if (!ocfs2_is_refcount_inode(inode)) { ret = ocfs2_create_refcount_tree(inode, di_bh); if (ret) { mlog_errno(ret); @@ -3934,6 +3933,13 @@ static int ocfs2_add_refcounted_extent(struct inode *inode, ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh, p_cluster, num_clusters, meta_ac, dealloc); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + ret = dquot_alloc_space_nodirty(inode, + ocfs2_clusters_to_bytes(osb->sb, num_clusters)); if (ret) mlog_errno(ret); @@ -4442,3 +4448,434 @@ out: return error; } + +/* Update destination inode size, if necessary. */ +static int ocfs2_reflink_update_dest(struct inode *dest, + struct buffer_head *d_bh, + loff_t newlen) +{ + handle_t *handle; + int ret; + + dest->i_blocks = ocfs2_inode_sector_count(dest); + + if (newlen <= i_size_read(dest)) + return 0; + + handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + return ret; + } + + /* Extend i_size if needed. */ + spin_lock(&OCFS2_I(dest)->ip_lock); + if (newlen > i_size_read(dest)) + i_size_write(dest, newlen); + spin_unlock(&OCFS2_I(dest)->ip_lock); + dest->i_ctime = dest->i_mtime = current_time(dest); + + ret = ocfs2_mark_inode_dirty(handle, dest, d_bh); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + +out_commit: + ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle); + return ret; +} + +/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */ +static int ocfs2_reflink_remap_extent(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len, + struct ocfs2_cached_dealloc_ctxt *dealloc) +{ + struct ocfs2_extent_tree s_et; + struct ocfs2_extent_tree t_et; + struct ocfs2_dinode *dis; + struct buffer_head *ref_root_bh = NULL; + struct ocfs2_refcount_tree *ref_tree; + struct ocfs2_super *osb; + loff_t pstart, plen; + u32 p_cluster, num_clusters, slast, spos, tpos; + unsigned int ext_flags; + int ret = 0; + + osb = OCFS2_SB(s_inode->i_sb); + dis = (struct ocfs2_dinode *)s_bh->b_data; + ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh); + ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh); + + spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in); + tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out); + slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len); + + while (spos < slast) { + if (fatal_signal_pending(current)) { + ret = -EINTR; + goto out; + } + + /* Look up the extent. 
*/ + ret = ocfs2_get_clusters(s_inode, spos, &p_cluster, + &num_clusters, &ext_flags); + if (ret) { + mlog_errno(ret); + goto out; + } + + num_clusters = min_t(u32, num_clusters, slast - spos); + + /* Punch out the dest range. */ + pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos); + plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters); + ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (p_cluster == 0) + goto next_loop; + + /* Lock the refcount btree... */ + ret = ocfs2_lock_refcount_tree(osb, + le64_to_cpu(dis->i_refcount_loc), + 1, &ref_tree, &ref_root_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + + /* Mark s_inode's extent as refcounted. */ + if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) { + ret = ocfs2_add_refcount_flag(s_inode, &s_et, + &ref_tree->rf_ci, + ref_root_bh, spos, + p_cluster, num_clusters, + dealloc, NULL); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + } + + /* Map in the new extent. */ + ext_flags |= OCFS2_EXT_REFCOUNTED; + ret = ocfs2_add_refcounted_extent(t_inode, &t_et, + &ref_tree->rf_ci, + ref_root_bh, + tpos, p_cluster, + num_clusters, + ext_flags, + dealloc); + if (ret) { + mlog_errno(ret); + goto out_unlock_refcount; + } + + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); +next_loop: + spos += num_clusters; + tpos += num_clusters; + } + +out: + return ret; +out_unlock_refcount: + ocfs2_unlock_refcount_tree(osb, ref_tree, 1); + brelse(ref_root_bh); + return ret; +} + +/* Set up refcount tree and remap s_inode to t_inode. */ +static int ocfs2_reflink_remap_blocks(struct inode *s_inode, + struct buffer_head *s_bh, + loff_t pos_in, + struct inode *t_inode, + struct buffer_head *t_bh, + loff_t pos_out, + loff_t len) +{ + struct ocfs2_cached_dealloc_ctxt dealloc; + struct ocfs2_super *osb; + struct ocfs2_dinode *dis; + struct ocfs2_dinode *dit; + int ret; + + osb = OCFS2_SB(s_inode->i_sb); + dis = (struct ocfs2_dinode *)s_bh->b_data; + dit = (struct ocfs2_dinode *)t_bh->b_data; + ocfs2_init_dealloc_ctxt(&dealloc); + + /* + * If we're reflinking the entire file and the source is inline + * data, just copy the contents. + */ + if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) && + i_size_read(t_inode) <= len && + (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) { + ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh); + if (ret) + mlog_errno(ret); + goto out; + } + + /* + * If both inodes belong to two different refcount groups then + * forget it because we don't know how (or want) to go merging + * refcount trees. + */ + ret = -EOPNOTSUPP; + if (ocfs2_is_refcount_inode(s_inode) && + ocfs2_is_refcount_inode(t_inode) && + le64_to_cpu(dis->i_refcount_loc) != + le64_to_cpu(dit->i_refcount_loc)) + goto out; + + /* Neither inode has a refcount tree. Add one to s_inode. */ + if (!ocfs2_is_refcount_inode(s_inode) && + !ocfs2_is_refcount_inode(t_inode)) { + ret = ocfs2_create_refcount_tree(s_inode, s_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* Ensure that both inodes end up with the same refcount tree. 
*/ + if (!ocfs2_is_refcount_inode(s_inode)) { + ret = ocfs2_set_refcount_tree(s_inode, s_bh, + le64_to_cpu(dit->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + } + if (!ocfs2_is_refcount_inode(t_inode)) { + ret = ocfs2_set_refcount_tree(t_inode, t_bh, + le64_to_cpu(dis->i_refcount_loc)); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* Turn off inline data in the dest file. */ + if (OCFS2_I(t_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + ret = ocfs2_convert_inline_data_to_extents(t_inode, t_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } + + /* Actually remap extents now. */ + ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh, + pos_out, len, &dealloc); + if (ret) { + mlog_errno(ret); + goto out; + } + +out: + if (ocfs2_dealloc_has_cluster(&dealloc)) { + ocfs2_schedule_truncate_log_flush(osb, 1); + ocfs2_run_deallocs(osb, &dealloc); + } + + return ret; +} + +/* Lock an inode and grab a bh pointing to the inode. */ +static int ocfs2_reflink_inodes_lock(struct inode *s_inode, + struct buffer_head **bh1, + struct inode *t_inode, + struct buffer_head **bh2) +{ + struct inode *inode1; + struct inode *inode2; + struct ocfs2_inode_info *oi1; + struct ocfs2_inode_info *oi2; + bool same_inode = (s_inode == t_inode); + int status; + + /* First grab the VFS and rw locks. */ + lock_two_nondirectories(s_inode, t_inode); + inode1 = s_inode; + inode2 = t_inode; + if (inode1->i_ino > inode2->i_ino) + swap(inode1, inode2); + + status = ocfs2_rw_lock(inode1, 1); + if (status) { + mlog_errno(status); + goto out_i1; + } + if (!same_inode) { + status = ocfs2_rw_lock(inode2, 1); + if (status) { + mlog_errno(status); + goto out_i2; + } + } + + /* Now go for the cluster locks */ + oi1 = OCFS2_I(inode1); + oi2 = OCFS2_I(inode2); + + trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno, + (unsigned long long)oi2->ip_blkno); + + if (*bh1) + *bh1 = NULL; + if (*bh2) + *bh2 = NULL; + + /* We always want to lock the one with the lower lockid first. */ + if (oi1->ip_blkno > oi2->ip_blkno) + mlog_errno(-ENOLCK); + + /* lock id1 */ + status = ocfs2_inode_lock_nested(inode1, bh1, 1, OI_LS_REFLINK_TARGET); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto out_rw2; + } + + /* lock id2 */ + if (!same_inode) { + status = ocfs2_inode_lock_nested(inode2, bh2, 1, + OI_LS_REFLINK_TARGET); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + goto out_cl1; + } + } else + *bh2 = *bh1; + + trace_ocfs2_double_lock_end( + (unsigned long long)OCFS2_I(inode1)->ip_blkno, + (unsigned long long)OCFS2_I(inode2)->ip_blkno); + + return 0; + +out_cl1: + ocfs2_inode_unlock(inode1, 1); + brelse(*bh1); + *bh1 = NULL; +out_rw2: + ocfs2_rw_unlock(inode2, 1); +out_i2: + ocfs2_rw_unlock(inode1, 1); +out_i1: + unlock_two_nondirectories(s_inode, t_inode); + return status; +} + +/* Unlock both inodes and release buffers. */ +static void ocfs2_reflink_inodes_unlock(struct inode *s_inode, + struct buffer_head *s_bh, + struct inode *t_inode, + struct buffer_head *t_bh) +{ + ocfs2_inode_unlock(s_inode, 1); + ocfs2_rw_unlock(s_inode, 1); + brelse(s_bh); + if (s_inode != t_inode) { + ocfs2_inode_unlock(t_inode, 1); + ocfs2_rw_unlock(t_inode, 1); + brelse(t_bh); + } + unlock_two_nondirectories(s_inode, t_inode); +} + +/* Link a range of blocks from one file to another. 
*/ +int ocfs2_reflink_remap_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe) +{ + struct inode *inode_in = file_inode(file_in); + struct inode *inode_out = file_inode(file_out); + struct ocfs2_super *osb = OCFS2_SB(inode_in->i_sb); + struct buffer_head *in_bh = NULL, *out_bh = NULL; + bool same_inode = (inode_in == inode_out); + ssize_t ret; + + if (!ocfs2_refcount_tree(osb)) + return -EOPNOTSUPP; + if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) + return -EROFS; + + /* Lock both files against IO */ + ret = ocfs2_reflink_inodes_lock(inode_in, &in_bh, inode_out, &out_bh); + if (ret) + return ret; + + /* Check file eligibility and prepare for block sharing. */ + ret = -EINVAL; + if ((OCFS2_I(inode_in)->ip_flags & OCFS2_INODE_SYSTEM_FILE) || + (OCFS2_I(inode_out)->ip_flags & OCFS2_INODE_SYSTEM_FILE)) + goto out_unlock; + + ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, + &len, is_dedupe); + if (ret <= 0) + goto out_unlock; + + /* Lock out changes to the allocation maps and remap. */ + down_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + down_write_nested(&OCFS2_I(inode_out)->ip_alloc_sem, + SINGLE_DEPTH_NESTING); + + ret = ocfs2_reflink_remap_blocks(inode_in, in_bh, pos_in, inode_out, + out_bh, pos_out, len); + + /* Zap any page cache for the destination file's range. */ + if (!ret) + truncate_inode_pages_range(&inode_out->i_data, pos_out, + PAGE_ALIGN(pos_out + len) - 1); + + up_write(&OCFS2_I(inode_in)->ip_alloc_sem); + if (!same_inode) + up_write(&OCFS2_I(inode_out)->ip_alloc_sem); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + /* + * Empty the extent map so that we may get the right extent + * record from the disk. + */ + ocfs2_extent_map_trunc(inode_in, 0); + ocfs2_extent_map_trunc(inode_out, 0); + + ret = ocfs2_reflink_update_dest(inode_out, out_bh, pos_out + len); + if (ret) { + mlog_errno(ret); + goto out_unlock; + } + + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); + return 0; + +out_unlock: + ocfs2_reflink_inodes_unlock(inode_in, in_bh, inode_out, out_bh); + return ret; +} diff --git a/fs/ocfs2/refcounttree.h b/fs/ocfs2/refcounttree.h index 6422bbcdb525..4af55bf4b35b 100644 --- a/fs/ocfs2/refcounttree.h +++ b/fs/ocfs2/refcounttree.h @@ -115,4 +115,11 @@ int ocfs2_reflink_ioctl(struct inode *inode, const char __user *oldname, const char __user *newname, bool preserve); +int ocfs2_reflink_remap_range(struct file *file_in, + loff_t pos_in, + struct file *file_out, + loff_t pos_out, + u64 len, + bool is_dedupe); + #endif /* OCFS2_REFCOUNTTREE_H */ diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index c9e828ec3c8e..dae9eb7c441e 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -24,7 +24,7 @@ #include <linux/slab.h> #include <linux/reboot.h> #include <linux/sched.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "stackglue.h" diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index 52c07346bea3..820359096c7a 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -48,6 +48,12 @@ static char ocfs2_hb_ctl_path[OCFS2_MAX_HB_CTL_PATH] = "/sbin/ocfs2_hb_ctl"; */ static struct ocfs2_stack_plugin *active_stack; +inline int ocfs2_is_o2cb_active(void) +{ + return !strcmp(active_stack->sp_name, OCFS2_STACK_PLUGIN_O2CB); +} +EXPORT_SYMBOL_GPL(ocfs2_is_o2cb_active); + static struct ocfs2_stack_plugin *ocfs2_stack_lookup(const char *name) { struct ocfs2_stack_plugin *p; diff --git 
a/fs/ocfs2/stackglue.h b/fs/ocfs2/stackglue.h index f2dce10fae54..e3036e1790e8 100644 --- a/fs/ocfs2/stackglue.h +++ b/fs/ocfs2/stackglue.h @@ -298,6 +298,9 @@ void ocfs2_stack_glue_set_max_proto_version(struct ocfs2_protocol_version *max_p int ocfs2_stack_glue_register(struct ocfs2_stack_plugin *plugin); void ocfs2_stack_glue_unregister(struct ocfs2_stack_plugin *plugin); +/* In ocfs2_downconvert_lock(), we need to know which stack we are using */ +int ocfs2_is_o2cb_active(void); + extern struct kset *ocfs2_kset; #endif /* STACKGLUE_H */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f56fe39fab04..a24e42f95341 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -337,7 +337,7 @@ static int ocfs2_osb_dump(struct ocfs2_super *osb, char *buf, int len) out += snprintf(buf + out, len - out, "Disabled\n"); else out += snprintf(buf + out, len - out, "%lu seconds ago\n", - (get_seconds() - os->os_scantime.tv_sec)); + (unsigned long)(ktime_get_seconds() - os->os_scantime)); out += snprintf(buf + out, len - out, "%10s => %3s %10s\n", "Slots", "Num", "RecoGen"); @@ -985,7 +985,6 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!sb_has_quota_loaded(sb, type)) continue; - /* Cancel periodic syncing before we grab dqonoff_mutex */ oinfo = sb_dqinfo(sb, type)->dqi_priv; cancel_delayed_work_sync(&oinfo->dqi_sync_work); inode = igrab(sb->s_dquot.files[type]); diff --git a/fs/ocfs2/symlink.c b/fs/ocfs2/symlink.c index 6ad8eecefe21..94cfacc9bad7 100644 --- a/fs/ocfs2/symlink.c +++ b/fs/ocfs2/symlink.c @@ -87,7 +87,6 @@ const struct address_space_operations ocfs2_fast_symlink_aops = { }; const struct inode_operations ocfs2_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .getattr = ocfs2_getattr, .setattr = ocfs2_setattr, diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index cb157a34a656..3c5384d9b3a5 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2577,7 +2577,7 @@ int ocfs2_xattr_remove(struct inode *inode, struct buffer_head *di_bh) if (!(oi->ip_dyn_features & OCFS2_HAS_XATTR_FL)) return 0; - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL) { + if (ocfs2_is_refcount_inode(inode)) { ret = ocfs2_lock_refcount_tree(OCFS2_SB(inode->i_sb), le64_to_cpu(di->i_refcount_loc), 1, &ref_tree, &ref_root_bh); @@ -3608,7 +3608,7 @@ int ocfs2_xattr_set(struct inode *inode, } /* Check whether the value is refcounted and do some preparation. 
*/ - if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_REFCOUNT_FL && + if (ocfs2_is_refcount_inode(inode) && (!xis.not_found || !xbs.not_found)) { ret = ocfs2_prepare_refcount_xattr(inode, di, &xi, &xis, &xbs, &ref_tree, diff --git a/fs/open.c b/fs/open.c index d3ed8171e8e0..9921f70bc5ca 100644 --- a/fs/open.c +++ b/fs/open.c @@ -19,7 +19,7 @@ #include <linux/mount.h> #include <linux/fcntl.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/fs.h> #include <linux/personality.h> #include <linux/pagemap.h> diff --git a/fs/openpromfs/inode.c b/fs/openpromfs/inode.c index c003a667ed1a..13215f26e321 100644 --- a/fs/openpromfs/inode.c +++ b/fs/openpromfs/inode.c @@ -16,7 +16,7 @@ #include <asm/openprom.h> #include <asm/oplib.h> #include <asm/prom.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> static DEFINE_MUTEX(op_mutex); diff --git a/fs/orangefs/devorangefs-req.c b/fs/orangefs/devorangefs-req.c index 516ffb4dc9a0..b0ced669427e 100644 --- a/fs/orangefs/devorangefs-req.c +++ b/fs/orangefs/devorangefs-req.c @@ -355,7 +355,6 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, __u64 tag; } head; int total = ret = iov_iter_count(iter); - int n; int downcall_size = sizeof(struct orangefs_downcall_s); int head_size = sizeof(head); @@ -372,8 +371,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, return -EFAULT; } - n = copy_from_iter(&head, head_size, iter); - if (n < head_size) { + if (!copy_from_iter_full(&head, head_size, iter)) { gossip_err("%s: failed to copy head.\n", __func__); return -EFAULT; } @@ -407,8 +405,7 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, return ret; } - n = copy_from_iter(&op->downcall, downcall_size, iter); - if (n != downcall_size) { + if (!copy_from_iter_full(&op->downcall, downcall_size, iter)) { gossip_err("%s: failed to copy downcall.\n", __func__); goto Efault; } @@ -462,10 +459,8 @@ static ssize_t orangefs_devreq_write_iter(struct kiocb *iocb, goto Enomem; } memset(op->downcall.trailer_buf, 0, op->downcall.trailer_size); - n = copy_from_iter(op->downcall.trailer_buf, - op->downcall.trailer_size, - iter); - if (n != op->downcall.trailer_size) { + if (!copy_from_iter_full(op->downcall.trailer_buf, + op->downcall.trailer_size, iter)) { gossip_err("%s: failed to copy trailer.\n", __func__); vfree(op->downcall.trailer_buf); goto Efault; diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index 02cc6139ec90..e6bbc8083d77 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -724,7 +724,7 @@ static int orangefs_lock(struct file *filp, int cmd, struct file_lock *fl) { int rc = -EINVAL; - if (ORANGEFS_SB(filp->f_inode->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) { + if (ORANGEFS_SB(file_inode(filp)->i_sb)->flags & ORANGEFS_OPT_LOCAL_LOCK) { if (cmd == F_GETLK) { rc = 0; posix_test_lock(filp, fl); diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index ef3b4eb54cf2..551bc74ed2b8 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -8,6 +8,7 @@ * Linux VFS inode operations. 
*/ +#include <linux/bvec.h> #include "protocol.h" #include "orangefs-kernel.h" #include "orangefs-bufmap.h" diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 38887cc5577f..27e75cf28b3a 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -434,6 +434,7 @@ static ssize_t orangefs_debug_write(struct file *file, char *debug_string; struct orangefs_kernel_op_s *new_op = NULL; struct client_debug_mask c_mask = { NULL, 0, 0 }; + char *s; gossip_debug(GOSSIP_DEBUGFS_DEBUG, "orangefs_debug_write: %pD\n", @@ -521,8 +522,9 @@ static ssize_t orangefs_debug_write(struct file *file, } mutex_lock(&orangefs_debug_lock); - memset(file->f_inode->i_private, 0, ORANGEFS_MAX_DEBUG_STRING_LEN); - sprintf((char *)file->f_inode->i_private, "%s\n", debug_string); + s = file_inode(file)->i_private; + memset(s, 0, ORANGEFS_MAX_DEBUG_STRING_LEN); + sprintf(s, "%s\n", debug_string); mutex_unlock(&orangefs_debug_lock); *ppos += count; @@ -671,8 +673,10 @@ int orangefs_prepare_debugfs_help_string(int at_boot) */ cdm_element_count = orangefs_prepare_cdm_array(client_debug_array_string); - if (cdm_element_count <= 0) + if (cdm_element_count <= 0) { + kfree(new); goto out; + } for (i = 0; i < cdm_element_count; i++) { strlcat(new, "\t", string_size); diff --git a/fs/orangefs/orangefs-sysfs.c b/fs/orangefs/orangefs-sysfs.c index a799546a67f7..084954448f18 100644 --- a/fs/orangefs/orangefs-sysfs.c +++ b/fs/orangefs/orangefs-sysfs.c @@ -609,15 +609,6 @@ static ssize_t sysfs_service_op_store(struct kobject *kobj, new_op->upcall.req.param.u.value32[0] = val1; new_op->upcall.req.param.u.value32[1] = val2; goto value_set; - } else if (!strcmp(attr->attr.name, - "perf_counter_reset")) { - if ((val > 0)) { - new_op->upcall.req.param.op = - ORANGEFS_PARAM_REQUEST_OP_READAHEAD_COUNT_SIZE; - } else { - rc = 0; - goto out; - } } } else if (!strcmp(kobj->name, ACACHE_KOBJ_ID)) { diff --git a/fs/orangefs/symlink.c b/fs/orangefs/symlink.c index 10b0b06e075e..02b1bbdbcc42 100644 --- a/fs/orangefs/symlink.c +++ b/fs/orangefs/symlink.c @@ -9,7 +9,6 @@ #include "orangefs-bufmap.h" const struct inode_operations orangefs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = simple_get_link, .setattr = orangefs_setattr, .getattr = orangefs_getattr, diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig index 34355818a2e0..0daac5112f7a 100644 --- a/fs/overlayfs/Kconfig +++ b/fs/overlayfs/Kconfig @@ -8,3 +8,17 @@ config OVERLAY_FS merged with the 'upper' object. For more information see Documentation/filesystems/overlayfs.txt + +config OVERLAY_FS_REDIRECT_DIR + bool "Overlayfs: turn on redirect dir feature by default" + depends on OVERLAY_FS + help + If this config option is enabled then overlay filesystems will use + redirects when renaming directories by default. In this case it is + still possible to turn off redirects globally with the + "redirect_dir=off" module option or on a filesystem instance basis + with the "redirect_dir=off" mount option. + + Note that redirects are not backward compatible. That is, mounting + an overlay which has redirects on a kernel that doesn't support this + feature will have unexpected results.
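For illustration only (not part of the patch): the Kconfig help above describes per-instance control of the redirect feature via the "redirect_dir=off" mount option. A minimal user-space sketch of mounting one overlay instance with redirects disabled follows; the /lower, /upper, /work and /merged paths are placeholders that must already exist, with upperdir and workdir on the same filesystem.

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* placeholder layer paths; adjust to your layout */
		const char *opts = "lowerdir=/lower,upperdir=/upper,"
				   "workdir=/work,redirect_dir=off";

		/* overlay takes its layers through the mount data string */
		if (mount("overlay", "/merged", "overlay", 0, opts)) {
			perror("mount");
			return 1;
		}
		return 0;
	}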
diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile index 900daed3e91d..99373bbc1478 100644 --- a/fs/overlayfs/Makefile +++ b/fs/overlayfs/Makefile @@ -4,4 +4,4 @@ obj-$(CONFIG_OVERLAY_FS) += overlay.o -overlay-objs := super.o inode.o dir.o readdir.o copy_up.o +overlay-objs := super.o namei.o util.o inode.o dir.o readdir.o copy_up.o diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 36795eed40b0..f57043dace62 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -33,7 +33,7 @@ static int ovl_check_fd(const void *data, struct file *f, unsigned int fd) { const struct dentry *dentry = data; - if (f->f_inode == d_inode(dentry)) + if (file_inode(f) == d_inode(dentry)) pr_warn_ratelimited("overlayfs: Warning: Copying up %pD, but open R/O on fd %u which will cease to be coherent [pid=%d %s]\n", f, fd, current->pid, current->comm); return 0; @@ -153,6 +153,13 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) goto out_fput; } + /* Try to use clone_file_range to clone up within the same fs */ + error = vfs_clone_file_range(old_file, 0, new_file, 0, len); + if (!error) + goto out; + /* Couldn't clone, so now we try to copy the data */ + error = 0; + /* FIXME: copy up sparse files efficiently */ while (len) { size_t this_len = OVL_COPY_UP_CHUNK_SIZE; @@ -177,7 +184,7 @@ static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) len -= bytes; } - +out: if (!error) error = vfs_fsync(new_file, 0); fput(new_file); @@ -231,10 +238,15 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, struct inode *udir = upperdir->d_inode; struct dentry *newdentry = NULL; struct dentry *upper = NULL; - umode_t mode = stat->mode; int err; const struct cred *old_creds = NULL; struct cred *new_creds = NULL; + struct cattr cattr = { + /* Can't properly set mode on creation because of the umask */ + .mode = stat->mode & S_IFMT, + .rdev = stat->rdev, + .link = link + }; newdentry = ovl_lookup_temp(workdir, dentry); err = PTR_ERR(newdentry); @@ -254,10 +266,7 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (new_creds) old_creds = override_creds(new_creds); - /* Can't properly set mode on creation because of the umask */ - stat->mode &= S_IFMT; - err = ovl_create_real(wdir, newdentry, stat, link, NULL, true); - stat->mode = mode; + err = ovl_create_real(wdir, newdentry, &cattr, NULL, true); if (new_creds) { revert_creds(old_creds); @@ -296,12 +305,6 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, ovl_dentry_update(dentry, newdentry); ovl_inode_update(d_inode(dentry), d_inode(newdentry)); newdentry = NULL; - - /* - * Non-directores become opaque when copied up. - */ - if (!S_ISDIR(stat->mode)) - ovl_dentry_set_opaque(dentry, true); out2: dput(upper); out1: @@ -317,20 +320,14 @@ out_cleanup: /* * Copy up a single dentry * - * Directory renames only allowed on "pure upper" (already created on - * upper filesystem, never copied up). Directories which are on lower or - * are merged may not be renamed. For these -EXDEV is returned and - * userspace has to deal with it. This means, when copying up a - * directory we can rely on it and ancestors being stable. - * - * Non-directory renames start with copy up of source if necessary. The - * actual rename will only proceed once the copy up was successful. Copy - * up uses upper parent i_mutex for exclusion. Since rename can change - * d_parent it is possible that the copy up will lock the old parent. 
At - * that point the file will have already been copied up anyway. + * All renames start with copy up of source if necessary. The actual + * rename will only proceed once the copy up was successful. Copy up uses + * upper parent i_mutex for exclusion. Since rename can change d_parent it + * is possible that the copy up will lock the old parent. At that point + * the file will have already been copied up anyway. */ -int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, - struct path *lowerpath, struct kstat *stat) +static int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, + struct path *lowerpath, struct kstat *stat) { DEFINE_DELAYED_CALL(done); struct dentry *workdir = ovl_workdir(dentry); @@ -339,7 +336,6 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, struct path parentpath; struct dentry *lowerdentry = lowerpath->dentry; struct dentry *upperdir; - struct dentry *upperdentry; const char *link = NULL; if (WARN_ON(!workdir)) @@ -365,8 +361,7 @@ int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, pr_err("overlayfs: failed to lock workdir+upperdir\n"); goto out_unlock; } - upperdentry = ovl_dentry_upper(dentry); - if (upperdentry) { + if (ovl_dentry_upper(dentry)) { /* Raced with another copy-up? Nothing to do, then... */ err = 0; goto out_unlock; @@ -385,7 +380,7 @@ out_unlock: return err; } -int ovl_copy_up(struct dentry *dentry) +int ovl_copy_up_flags(struct dentry *dentry, int flags) { int err = 0; const struct cred *old_cred = ovl_override_creds(dentry->d_sb); @@ -415,6 +410,9 @@ int ovl_copy_up(struct dentry *dentry) ovl_path_lower(next, &lowerpath); err = vfs_getattr(&lowerpath, &stat); + /* maybe truncate regular file. this has no effect on dirs */ + if (flags & O_TRUNC) + stat.size = 0; if (!err) err = ovl_copy_up_one(parent, next, &lowerpath, &stat); @@ -425,3 +423,8 @@ int ovl_copy_up(struct dentry *dentry) return err; } + +int ovl_copy_up(struct dentry *dentry) +{ + return ovl_copy_up_flags(dentry, 0); +} diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 306b6c161840..16e06dd89457 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -12,11 +12,18 @@ #include <linux/xattr.h> #include <linux/security.h> #include <linux/cred.h> +#include <linux/module.h> #include <linux/posix_acl.h> #include <linux/posix_acl_xattr.h> #include <linux/atomic.h> +#include <linux/ratelimit.h> #include "overlayfs.h" +static unsigned short ovl_redirect_max = 256; +module_param_named(redirect_max, ovl_redirect_max, ushort, 0644); +MODULE_PARM_DESC(ovl_redirect_max, + "Maximum length of absolute redirect xattr value"); + void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) { int err; @@ -75,8 +82,7 @@ static struct dentry *ovl_whiteout(struct dentry *workdir, } int ovl_create_real(struct inode *dir, struct dentry *newdentry, - struct kstat *stat, const char *link, - struct dentry *hardlink, bool debug) + struct cattr *attr, struct dentry *hardlink, bool debug) { int err; @@ -86,13 +92,13 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry, if (hardlink) { err = ovl_do_link(hardlink, dir, newdentry, debug); } else { - switch (stat->mode & S_IFMT) { + switch (attr->mode & S_IFMT) { case S_IFREG: - err = ovl_do_create(dir, newdentry, stat->mode, debug); + err = ovl_do_create(dir, newdentry, attr->mode, debug); break; case S_IFDIR: - err = ovl_do_mkdir(dir, newdentry, stat->mode, debug); + err = ovl_do_mkdir(dir, newdentry, attr->mode, debug); break; case S_IFCHR: @@ -100,11 +106,11 @@ int 
ovl_create_real(struct inode *dir, struct dentry *newdentry, case S_IFIFO: case S_IFSOCK: err = ovl_do_mknod(dir, newdentry, - stat->mode, stat->rdev, debug); + attr->mode, attr->rdev, debug); break; case S_IFLNK: - err = ovl_do_symlink(dir, newdentry, link, debug); + err = ovl_do_symlink(dir, newdentry, attr->link, debug); break; default: @@ -121,20 +127,15 @@ int ovl_create_real(struct inode *dir, struct dentry *newdentry, return err; } -static int ovl_set_opaque(struct dentry *upperdentry) -{ - return ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0); -} - -static void ovl_remove_opaque(struct dentry *upperdentry) +static int ovl_set_opaque(struct dentry *dentry, struct dentry *upperdentry) { int err; - err = ovl_do_removexattr(upperdentry, OVL_XATTR_OPAQUE); - if (err) { - pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n", - upperdentry->d_name.name, err); - } + err = ovl_do_setxattr(upperdentry, OVL_XATTR_OPAQUE, "y", 1, 0); + if (!err) + ovl_dentry_set_opaque(dentry); + + return err; } static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, @@ -182,9 +183,13 @@ static void ovl_instantiate(struct dentry *dentry, struct inode *inode, d_instantiate(dentry, inode); } +static bool ovl_type_merge(struct dentry *dentry) +{ + return OVL_TYPE_MERGE(ovl_path_type(dentry)); +} + static int ovl_create_upper(struct dentry *dentry, struct inode *inode, - struct kstat *stat, const char *link, - struct dentry *hardlink) + struct cattr *attr, struct dentry *hardlink) { struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); struct inode *udir = upperdir->d_inode; @@ -192,7 +197,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, int err; if (!hardlink && !IS_POSIXACL(udir)) - stat->mode &= ~current_umask(); + attr->mode &= ~current_umask(); inode_lock_nested(udir, I_MUTEX_PARENT); newdentry = lookup_one_len(dentry->d_name.name, upperdir, @@ -200,10 +205,15 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, err = PTR_ERR(newdentry); if (IS_ERR(newdentry)) goto out_unlock; - err = ovl_create_real(udir, newdentry, stat, link, hardlink, false); + err = ovl_create_real(udir, newdentry, attr, hardlink, false); if (err) goto out_dput; + if (ovl_type_merge(dentry->d_parent)) { + /* Setting opaque here is just an optimization, allow to fail */ + ovl_set_opaque(dentry, newdentry); + } + ovl_instantiate(dentry, inode, newdentry, !!hardlink); newdentry = NULL; out_dput: @@ -270,7 +280,8 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (IS_ERR(opaquedir)) goto out_unlock; - err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true); + err = ovl_create_real(wdir, opaquedir, + &(struct cattr){.mode = stat.mode}, NULL, true); if (err) goto out_dput; @@ -278,7 +289,7 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (err) goto out_cleanup; - err = ovl_set_opaque(opaquedir); + err = ovl_set_opaque(dentry, opaquedir); if (err) goto out_cleanup; @@ -370,7 +381,7 @@ out_free: } static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, - struct kstat *stat, const char *link, + struct cattr *cattr, struct dentry *hardlink) { struct dentry *workdir = ovl_workdir(dentry); @@ -387,7 +398,7 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (!hardlink) { err = posix_acl_create(dentry->d_parent->d_inode, - &stat->mode, &default_acl, &acl); + &cattr->mode, &default_acl, &acl); if (err) return err; } @@ -407,7 +418,7 @@ static int 
ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, if (IS_ERR(upper)) goto out_dput; - err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true); + err = ovl_create_real(wdir, newdentry, cattr, hardlink, true); if (err) goto out_dput2; @@ -415,10 +426,11 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, * mode could have been mutilated due to umask (e.g. sgid directory) */ if (!hardlink && - !S_ISLNK(stat->mode) && newdentry->d_inode->i_mode != stat->mode) { + !S_ISLNK(cattr->mode) && + newdentry->d_inode->i_mode != cattr->mode) { struct iattr attr = { .ia_valid = ATTR_MODE, - .ia_mode = stat->mode, + .ia_mode = cattr->mode, }; inode_lock(newdentry->d_inode); err = notify_change(newdentry, &attr, NULL); @@ -438,8 +450,8 @@ static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, goto out_cleanup; } - if (!hardlink && S_ISDIR(stat->mode)) { - err = ovl_set_opaque(newdentry); + if (!hardlink && S_ISDIR(cattr->mode)) { + err = ovl_set_opaque(dentry, newdentry); if (err) goto out_cleanup; @@ -475,8 +487,7 @@ out_cleanup: } static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, - struct kstat *stat, const char *link, - struct dentry *hardlink) + struct cattr *attr, struct dentry *hardlink) { int err; const struct cred *old_cred; @@ -494,7 +505,7 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, override_cred->fsgid = inode->i_gid; if (!hardlink) { err = security_dentry_create_files_as(dentry, - stat->mode, &dentry->d_name, old_cred, + attr->mode, &dentry->d_name, old_cred, override_cred); if (err) { put_cred(override_cred); @@ -504,12 +515,12 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, put_cred(override_creds(override_cred)); put_cred(override_cred); - if (!ovl_dentry_is_opaque(dentry)) - err = ovl_create_upper(dentry, inode, stat, link, + if (!ovl_dentry_is_whiteout(dentry)) + err = ovl_create_upper(dentry, inode, attr, hardlink); else - err = ovl_create_over_whiteout(dentry, inode, stat, - link, hardlink); + err = ovl_create_over_whiteout(dentry, inode, attr, + hardlink); } out_revert_creds: revert_creds(old_cred); @@ -528,8 +539,9 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, { int err; struct inode *inode; - struct kstat stat = { + struct cattr attr = { .rdev = rdev, + .link = link, }; err = ovl_want_write(dentry); @@ -537,14 +549,14 @@ static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, goto out; err = -ENOMEM; - inode = ovl_new_inode(dentry->d_sb, mode); + inode = ovl_new_inode(dentry->d_sb, mode, rdev); if (!inode) goto out_drop_write; inode_init_owner(inode, dentry->d_parent->d_inode, mode); - stat.mode = inode->i_mode; + attr.mode = inode->i_mode; - err = ovl_create_or_link(dentry, inode, &stat, link, NULL); + err = ovl_create_or_link(dentry, inode, &attr, NULL); if (err) iput(inode); @@ -598,7 +610,7 @@ static int ovl_link(struct dentry *old, struct inode *newdir, inode = d_inode(old); ihold(inode); - err = ovl_create_or_link(new, inode, NULL, NULL, ovl_dentry_upper(old)); + err = ovl_create_or_link(new, inode, NULL, ovl_dentry_upper(old)); if (err) iput(inode); @@ -684,8 +696,17 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); struct inode *dir = upperdir->d_inode; struct dentry *upper; + struct dentry *opaquedir = NULL; int err; + /* Redirect dir can be !ovl_lower_positive && OVL_TYPE_MERGE */ 
+ if (is_dir && ovl_dentry_get_redirect(dentry)) { + opaquedir = ovl_check_empty_and_clear(dentry); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out; + } + inode_lock_nested(dir, I_MUTEX_PARENT); upper = lookup_one_len(dentry->d_name.name, upperdir, dentry->d_name.len); @@ -694,14 +715,15 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) goto out_unlock; err = -ESTALE; - if (upper == ovl_dentry_upper(dentry)) { - if (is_dir) - err = vfs_rmdir(dir, upper); - else - err = vfs_unlink(dir, upper, NULL); - ovl_dentry_version_inc(dentry->d_parent); - } - dput(upper); + if ((opaquedir && upper != opaquedir) || + (!opaquedir && upper != ovl_dentry_upper(dentry))) + goto out_dput_upper; + + if (is_dir) + err = vfs_rmdir(dir, upper); + else + err = vfs_unlink(dir, upper, NULL); + ovl_dentry_version_inc(dentry->d_parent); /* * Keeping this dentry hashed would mean having to release @@ -711,34 +733,21 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) */ if (!err) d_drop(dentry); +out_dput_upper: + dput(upper); out_unlock: inode_unlock(dir); - + dput(opaquedir); +out: return err; } -static inline int ovl_check_sticky(struct dentry *dentry) -{ - struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode; - struct inode *inode = ovl_dentry_real(dentry)->d_inode; - - if (check_sticky(dir, inode)) - return -EPERM; - - return 0; -} - static int ovl_do_remove(struct dentry *dentry, bool is_dir) { enum ovl_path_type type; int err; const struct cred *old_cred; - - err = ovl_check_sticky(dentry); - if (err) - goto out; - err = ovl_want_write(dentry); if (err) goto out; @@ -750,7 +759,7 @@ static int ovl_do_remove(struct dentry *dentry, bool is_dir) type = ovl_path_type(dentry); old_cred = ovl_override_creds(dentry->d_sb); - if (OVL_TYPE_PURE_UPPER(type)) + if (!ovl_lower_positive(dentry)) err = ovl_remove_upper(dentry, is_dir); else err = ovl_remove_and_whiteout(dentry, is_dir); @@ -777,13 +786,114 @@ static int ovl_rmdir(struct inode *dir, struct dentry *dentry) return ovl_do_remove(dentry, true); } -static int ovl_rename2(struct inode *olddir, struct dentry *old, - struct inode *newdir, struct dentry *new, - unsigned int flags) +static bool ovl_type_merge_or_lower(struct dentry *dentry) +{ + enum ovl_path_type type = ovl_path_type(dentry); + + return OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type); +} + +static bool ovl_can_move(struct dentry *dentry) +{ + return ovl_redirect_dir(dentry->d_sb) || + !d_is_dir(dentry) || !ovl_type_merge_or_lower(dentry); +} + +static char *ovl_get_redirect(struct dentry *dentry, bool samedir) +{ + char *buf, *ret; + struct dentry *d, *tmp; + int buflen = ovl_redirect_max + 1; + + if (samedir) { + ret = kstrndup(dentry->d_name.name, dentry->d_name.len, + GFP_KERNEL); + goto out; + } + + buf = ret = kmalloc(buflen, GFP_TEMPORARY); + if (!buf) + goto out; + + buflen--; + buf[buflen] = '\0'; + for (d = dget(dentry); !IS_ROOT(d);) { + const char *name; + int thislen; + + spin_lock(&d->d_lock); + name = ovl_dentry_get_redirect(d); + if (name) { + thislen = strlen(name); + } else { + name = d->d_name.name; + thislen = d->d_name.len; + } + + /* If path is too long, fall back to userspace move */ + if (thislen + (name[0] != '/') > buflen) { + ret = ERR_PTR(-EXDEV); + spin_unlock(&d->d_lock); + goto out_put; + } + + buflen -= thislen; + memcpy(&buf[buflen], name, thislen); + tmp = dget_dlock(d->d_parent); + spin_unlock(&d->d_lock); + + dput(d); + d = tmp; + + /* Absolute redirect: finished */ + if (buf[buflen] == '/') + break; + 
buflen--; + buf[buflen] = '/'; + } + ret = kstrdup(&buf[buflen], GFP_KERNEL); +out_put: + dput(d); + kfree(buf); +out: + return ret ? ret : ERR_PTR(-ENOMEM); +} + +static int ovl_set_redirect(struct dentry *dentry, bool samedir) +{ + int err; + const char *redirect = ovl_dentry_get_redirect(dentry); + + if (redirect && (samedir || redirect[0] == '/')) + return 0; + + redirect = ovl_get_redirect(dentry, samedir); + if (IS_ERR(redirect)) + return PTR_ERR(redirect); + + err = ovl_do_setxattr(ovl_dentry_upper(dentry), OVL_XATTR_REDIRECT, + redirect, strlen(redirect), 0); + if (!err) { + spin_lock(&dentry->d_lock); + ovl_dentry_set_redirect(dentry, redirect); + spin_unlock(&dentry->d_lock); + } else { + kfree(redirect); + if (err == -EOPNOTSUPP) + ovl_clear_redirect_dir(dentry->d_sb); + else + pr_warn_ratelimited("overlay: failed to set redirect (%i)\n", err); + /* Fall back to userspace copy-up */ + err = -EXDEV; + } + return err; +} + +static int ovl_rename(struct inode *olddir, struct dentry *old, + struct inode *newdir, struct dentry *new, + unsigned int flags) { int err; - enum ovl_path_type old_type; - enum ovl_path_type new_type; struct dentry *old_upperdir; struct dentry *new_upperdir; struct dentry *olddentry; @@ -794,7 +904,8 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, bool cleanup_whiteout = false; bool overwrite = !(flags & RENAME_EXCHANGE); bool is_dir = d_is_dir(old); - bool new_is_dir = false; + bool new_is_dir = d_is_dir(new); + bool samedir = olddir == newdir; struct dentry *opaquedir = NULL; const struct cred *old_cred = NULL; @@ -804,46 +915,12 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, flags &= ~RENAME_NOREPLACE; - err = ovl_check_sticky(old); - if (err) - goto out; - /* Don't copy up directory trees */ - old_type = ovl_path_type(old); err = -EXDEV; - if (OVL_TYPE_MERGE_OR_LOWER(old_type) && is_dir) + if (!ovl_can_move(old)) + goto out; + if (!overwrite && !ovl_can_move(new)) goto out; - - if (new->d_inode) { - err = ovl_check_sticky(new); - if (err) - goto out; - - if (d_is_dir(new)) - new_is_dir = true; - - new_type = ovl_path_type(new); - err = -EXDEV; - if (!overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) - goto out; - - err = 0; - if (!OVL_TYPE_UPPER(new_type) && !OVL_TYPE_UPPER(old_type)) { - if (ovl_dentry_lower(old)->d_inode == - ovl_dentry_lower(new)->d_inode) - goto out; - } - if (OVL_TYPE_UPPER(new_type) && OVL_TYPE_UPPER(old_type)) { - if (ovl_dentry_upper(old)->d_inode == - ovl_dentry_upper(new)->d_inode) - goto out; - } - } else { - if (ovl_dentry_is_opaque(new)) - new_type = __OVL_PATH_UPPER; - else - new_type = __OVL_PATH_UPPER | __OVL_PATH_PURE; - } err = ovl_want_write(old); if (err) @@ -862,12 +939,9 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, goto out_drop_write; } - old_opaque = !OVL_TYPE_PURE_UPPER(old_type); - new_opaque = !OVL_TYPE_PURE_UPPER(new_type); - old_cred = ovl_override_creds(old->d_sb); - if (overwrite && OVL_TYPE_MERGE_OR_LOWER(new_type) && new_is_dir) { + if (overwrite && new_is_dir && ovl_type_merge_or_lower(new)) { opaquedir = ovl_check_empty_and_clear(new); err = PTR_ERR(opaquedir); if (IS_ERR(opaquedir)) { @@ -877,15 +951,15 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, } if (overwrite) { - if (old_opaque) { - if (new->d_inode || !new_opaque) { + if (ovl_lower_positive(old)) { + if (!ovl_dentry_is_whiteout(new)) { /* Whiteout source */ flags |= RENAME_WHITEOUT; } else { /* Switch whiteouts */ flags |= RENAME_EXCHANGE; } - } else 
if (is_dir && !new->d_inode && new_opaque) { + } else if (is_dir && ovl_dentry_is_whiteout(new)) { flags |= RENAME_EXCHANGE; cleanup_whiteout = true; } @@ -896,7 +970,6 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, trap = lock_rename(new_upperdir, old_upperdir); - olddentry = lookup_one_len(old->d_name.name, old_upperdir, old->d_name.len); err = PTR_ERR(olddentry); @@ -913,6 +986,9 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, if (IS_ERR(newdentry)) goto out_dput_old; + old_opaque = ovl_dentry_is_opaque(old); + new_opaque = ovl_dentry_is_opaque(new); + err = -ESTALE; if (ovl_dentry_upper(new)) { if (opaquedir) { @@ -933,54 +1009,31 @@ static int ovl_rename2(struct inode *olddir, struct dentry *old, if (newdentry == trap) goto out_dput; - if (is_dir && !old_opaque && new_opaque) { - err = ovl_set_opaque(olddentry); + if (WARN_ON(olddentry->d_inode == newdentry->d_inode)) + goto out_dput; + + err = 0; + if (is_dir) { + if (ovl_type_merge_or_lower(old)) + err = ovl_set_redirect(old, samedir); + else if (!old_opaque && ovl_type_merge(new->d_parent)) + err = ovl_set_opaque(old, olddentry); if (err) goto out_dput; } - if (!overwrite && new_is_dir && old_opaque && !new_opaque) { - err = ovl_set_opaque(newdentry); + if (!overwrite && new_is_dir) { + if (ovl_type_merge_or_lower(new)) + err = ovl_set_redirect(new, samedir); + else if (!new_opaque && ovl_type_merge(old->d_parent)) + err = ovl_set_opaque(new, newdentry); if (err) goto out_dput; } - if (old_opaque || new_opaque) { - err = ovl_do_rename(old_upperdir->d_inode, olddentry, - new_upperdir->d_inode, newdentry, - flags); - } else { - /* No debug for the plain case */ - BUG_ON(flags & ~RENAME_EXCHANGE); - err = vfs_rename(old_upperdir->d_inode, olddentry, - new_upperdir->d_inode, newdentry, - NULL, flags); - } - - if (err) { - if (is_dir && !old_opaque && new_opaque) - ovl_remove_opaque(olddentry); - if (!overwrite && new_is_dir && old_opaque && !new_opaque) - ovl_remove_opaque(newdentry); + err = ovl_do_rename(old_upperdir->d_inode, olddentry, + new_upperdir->d_inode, newdentry, flags); + if (err) goto out_dput; - } - - if (is_dir && old_opaque && !new_opaque) - ovl_remove_opaque(olddentry); - if (!overwrite && new_is_dir && !old_opaque && new_opaque) - ovl_remove_opaque(newdentry); - - /* - * Old dentry now lives in different location. Dentries in - * lowerstack are stale. We cannot drop them here because - * access to them is lockless. This could be only pure upper - * or opaque directory - numlower is zero. Or upper non-dir - * entry - its pureness is tracked by flag opaque. 
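The overwrite branch above leans on two renameat2(2) flags: RENAME_WHITEOUT atomically leaves a whiteout at the source, and RENAME_EXCHANGE swaps source and target. A minimal userspace sketch of the same primitive, with hypothetical paths, using the raw syscall in case the libc lacks a wrapper (whiteouts also need filesystem support and CAP_MKNOD):

	#define _GNU_SOURCE
	#include <fcntl.h>		/* AT_FDCWD */
	#include <linux/fs.h>		/* RENAME_WHITEOUT, RENAME_EXCHANGE */
	#include <stdio.h>
	#include <sys/syscall.h>	/* SYS_renameat2 */
	#include <unistd.h>

	int main(void)
	{
		/* Move "src" to "dst" and leave a whiteout where "src" was.
		 * Paths are hypothetical; glibc >= 2.28 also has a
		 * renameat2() wrapper. */
		if (syscall(SYS_renameat2, AT_FDCWD, "src",
			    AT_FDCWD, "dst", RENAME_WHITEOUT) == -1)
			perror("renameat2");
		return 0;
	}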
- */ - if (old_opaque != new_opaque) { - ovl_dentry_set_opaque(old, new_opaque); - if (!overwrite) - ovl_dentry_set_opaque(new, old_opaque); - } if (cleanup_whiteout) ovl_cleanup(old_upperdir->d_inode, newdentry); @@ -1009,7 +1062,7 @@ const struct inode_operations ovl_dir_inode_operations = { .symlink = ovl_symlink, .unlink = ovl_unlink, .rmdir = ovl_rmdir, - .rename = ovl_rename2, + .rename = ovl_rename, .link = ovl_link, .setattr = ovl_setattr, .create = ovl_create, diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 7fb53d055537..08643ac44a02 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -13,34 +13,6 @@ #include <linux/posix_acl.h> #include "overlayfs.h" -static int ovl_copy_up_truncate(struct dentry *dentry) -{ - int err; - struct dentry *parent; - struct kstat stat; - struct path lowerpath; - const struct cred *old_cred; - - parent = dget_parent(dentry); - err = ovl_copy_up(parent); - if (err) - goto out_dput_parent; - - ovl_path_lower(dentry, &lowerpath); - - old_cred = ovl_override_creds(dentry->d_sb); - err = vfs_getattr(&lowerpath, &stat); - if (!err) { - stat.size = 0; - err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat); - } - revert_creds(old_cred); - -out_dput_parent: - dput(parent); - return err; -} - int ovl_setattr(struct dentry *dentry, struct iattr *attr) { int err; @@ -64,27 +36,10 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (err) goto out; - if (attr->ia_valid & ATTR_SIZE) { - struct inode *realinode = d_inode(ovl_dentry_real(dentry)); - - err = -ETXTBSY; - if (atomic_read(&realinode->i_writecount) < 0) - goto out_drop_write; - } - err = ovl_copy_up(dentry); if (!err) { - struct inode *winode = NULL; - upperdentry = ovl_dentry_upper(dentry); - if (attr->ia_valid & ATTR_SIZE) { - winode = d_inode(upperdentry); - err = get_write_access(winode); - if (err) - goto out_drop_write; - } - if (attr->ia_valid & (ATTR_KILL_SUID|ATTR_KILL_SGID)) attr->ia_valid &= ~ATTR_MODE; @@ -95,11 +50,7 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (!err) ovl_copyattr(upperdentry->d_inode, dentry->d_inode); inode_unlock(upperdentry->d_inode); - - if (winode) - put_write_access(winode); } -out_drop_write: ovl_drop_write(dentry); out: return err; @@ -302,10 +253,7 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags) if (ovl_open_need_copy_up(file_flags, type, realpath.dentry)) { err = ovl_want_write(dentry); if (!err) { - if (file_flags & O_TRUNC) - err = ovl_copy_up_truncate(dentry); - else - err = ovl_copy_up(dentry); + err = ovl_copy_up_flags(dentry, file_flags); ovl_drop_write(dentry); } } @@ -348,13 +296,12 @@ static const struct inode_operations ovl_file_inode_operations = { static const struct inode_operations ovl_symlink_inode_operations = { .setattr = ovl_setattr, .get_link = ovl_get_link, - .readlink = generic_readlink, .getattr = ovl_getattr, .listxattr = ovl_listxattr, .update_time = ovl_update_time, }; -static void ovl_fill_inode(struct inode *inode, umode_t mode) +static void ovl_fill_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_ino = get_next_ino(); inode->i_mode = mode; @@ -363,8 +310,11 @@ static void ovl_fill_inode(struct inode *inode, umode_t mode) inode->i_acl = inode->i_default_acl = ACL_DONT_CACHE; #endif - mode &= S_IFMT; - switch (mode) { + switch (mode & S_IFMT) { + case S_IFREG: + inode->i_op = &ovl_file_inode_operations; + break; + case S_IFDIR: inode->i_op = &ovl_dir_inode_operations; inode->i_fop = &ovl_dir_operations; @@ -375,26 +325,19 @@ static 
void ovl_fill_inode(struct inode *inode, umode_t mode) break; default: - WARN(1, "illegal file type: %i\n", mode); - /* Fall through */ - - case S_IFREG: - case S_IFSOCK: - case S_IFBLK: - case S_IFCHR: - case S_IFIFO: inode->i_op = &ovl_file_inode_operations; + init_special_inode(inode, mode, rdev); break; } } -struct inode *ovl_new_inode(struct super_block *sb, umode_t mode) +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev) { struct inode *inode; inode = new_inode(sb); if (inode) - ovl_fill_inode(inode, mode); + ovl_fill_inode(inode, mode, rdev); return inode; } @@ -418,7 +361,7 @@ struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode) inode = iget5_locked(sb, (unsigned long) realinode, ovl_inode_test, ovl_inode_set, realinode); if (inode && inode->i_state & I_NEW) { - ovl_fill_inode(inode, realinode->i_mode); + ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev); set_nlink(inode, realinode->i_nlink); unlock_new_inode(inode); } diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c new file mode 100644 index 000000000000..023bb0b03352 --- /dev/null +++ b/fs/overlayfs/namei.c @@ -0,0 +1,410 @@ +/* + * Copyright (C) 2011 Novell Inc. + * Copyright (C) 2016 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/xattr.h> +#include <linux/ratelimit.h> +#include "overlayfs.h" +#include "ovl_entry.h" + +struct ovl_lookup_data { + struct qstr name; + bool is_dir; + bool opaque; + bool stop; + bool last; + char *redirect; +}; + +static int ovl_check_redirect(struct dentry *dentry, struct ovl_lookup_data *d, + size_t prelen, const char *post) +{ + int res; + char *s, *next, *buf = NULL; + + res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, NULL, 0); + if (res < 0) { + if (res == -ENODATA || res == -EOPNOTSUPP) + return 0; + goto fail; + } + buf = kzalloc(prelen + res + strlen(post) + 1, GFP_TEMPORARY); + if (!buf) + return -ENOMEM; + + if (res == 0) + goto invalid; + + res = vfs_getxattr(dentry, OVL_XATTR_REDIRECT, buf, res); + if (res < 0) + goto fail; + if (res == 0) + goto invalid; + if (buf[0] == '/') { + for (s = buf; *s++ == '/'; s = next) { + next = strchrnul(s, '/'); + if (s == next) + goto invalid; + } + } else { + if (strchr(buf, '/') != NULL) + goto invalid; + + memmove(buf + prelen, buf, res); + memcpy(buf, d->name.name, prelen); + } + + strcat(buf, post); + kfree(d->redirect); + d->redirect = buf; + d->name.name = d->redirect; + d->name.len = strlen(d->redirect); + + return 0; + +err_free: + kfree(buf); + return 0; +fail: + pr_warn_ratelimited("overlayfs: failed to get redirect (%i)\n", res); + goto err_free; +invalid: + pr_warn_ratelimited("overlayfs: invalid redirect (%s)\n", buf); + goto err_free; +} + +static bool ovl_is_opaquedir(struct dentry *dentry) +{ + int res; + char val; + + if (!d_is_dir(dentry)) + return false; + + res = vfs_getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1); + if (res == 1 && val == 'y') + return true; + + return false; +} + +static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, + const char *name, unsigned int namelen, + size_t prelen, const char *post, + struct dentry **ret) +{ + struct dentry *this; + int err; + + this = lookup_one_len_unlocked(name, base, namelen); + if (IS_ERR(this)) { + err = PTR_ERR(this); + this = NULL; + if (err == -ENOENT || err == 
-ENAMETOOLONG) + goto out; + goto out_err; + } + if (!this->d_inode) + goto put_and_out; + + if (ovl_dentry_weird(this)) { + /* Don't support traversing automounts and other weirdness */ + err = -EREMOTE; + goto out_err; + } + if (ovl_is_whiteout(this)) { + d->stop = d->opaque = true; + goto put_and_out; + } + if (!d_can_lookup(this)) { + d->stop = true; + if (d->is_dir) + goto put_and_out; + goto out; + } + d->is_dir = true; + if (!d->last && ovl_is_opaquedir(this)) { + d->stop = d->opaque = true; + goto out; + } + err = ovl_check_redirect(this, d, prelen, post); + if (err) + goto out_err; +out: + *ret = this; + return 0; + +put_and_out: + dput(this); + this = NULL; + goto out; + +out_err: + dput(this); + return err; +} + +static int ovl_lookup_layer(struct dentry *base, struct ovl_lookup_data *d, + struct dentry **ret) +{ + /* Counting down from the end, since the prefix can change */ + size_t rem = d->name.len - 1; + struct dentry *dentry = NULL; + int err; + + if (d->name.name[0] != '/') + return ovl_lookup_single(base, d, d->name.name, d->name.len, + 0, "", ret); + + while (!IS_ERR_OR_NULL(base) && d_can_lookup(base)) { + const char *s = d->name.name + d->name.len - rem; + const char *next = strchrnul(s, '/'); + size_t thislen = next - s; + bool end = !next[0]; + + /* Verify we did not go off the rails */ + if (WARN_ON(s[-1] != '/')) + return -EIO; + + err = ovl_lookup_single(base, d, s, thislen, + d->name.len - rem, next, &base); + dput(dentry); + if (err) + return err; + dentry = base; + if (end) + break; + + rem -= thislen + 1; + + if (WARN_ON(rem >= d->name.len)) + return -EIO; + } + *ret = dentry; + return 0; +} + +/* + * Returns next layer in stack starting from top. + * Returns -1 if this is the last layer. + */ +int ovl_path_next(int idx, struct dentry *dentry, struct path *path) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + BUG_ON(idx < 0); + if (idx == 0) { + ovl_path_upper(dentry, path); + if (path->dentry) + return oe->numlower ? 1 : -1; + idx++; + } + BUG_ON(idx > oe->numlower); + *path = oe->lowerstack[idx - 1]; + + return (idx < oe->numlower) ? 
idx + 1 : -1; +} + +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct ovl_entry *oe; + const struct cred *old_cred; + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *poe = dentry->d_parent->d_fsdata; + struct path *stack = NULL; + struct dentry *upperdir, *upperdentry = NULL; + unsigned int ctr = 0; + struct inode *inode = NULL; + bool upperopaque = false; + char *upperredirect = NULL; + struct dentry *this; + unsigned int i; + int err; + struct ovl_lookup_data d = { + .name = dentry->d_name, + .is_dir = false, + .opaque = false, + .stop = false, + .last = !poe->numlower, + .redirect = NULL, + }; + + if (dentry->d_name.len > ofs->namelen) + return ERR_PTR(-ENAMETOOLONG); + + old_cred = ovl_override_creds(dentry->d_sb); + upperdir = ovl_upperdentry_dereference(poe); + if (upperdir) { + err = ovl_lookup_layer(upperdir, &d, &upperdentry); + if (err) + goto out; + + if (upperdentry && unlikely(ovl_dentry_remote(upperdentry))) { + dput(upperdentry); + err = -EREMOTE; + goto out; + } + + if (d.redirect) { + upperredirect = kstrdup(d.redirect, GFP_KERNEL); + if (!upperredirect) + goto out_put_upper; + if (d.redirect[0] == '/') + poe = dentry->d_sb->s_root->d_fsdata; + } + upperopaque = d.opaque; + } + + if (!d.stop && poe->numlower) { + err = -ENOMEM; + stack = kcalloc(ofs->numlower, sizeof(struct path), + GFP_TEMPORARY); + if (!stack) + goto out_put_upper; + } + + for (i = 0; !d.stop && i < poe->numlower; i++) { + struct path lowerpath = poe->lowerstack[i]; + + d.last = i == poe->numlower - 1; + err = ovl_lookup_layer(lowerpath.dentry, &d, &this); + if (err) + goto out_put; + + if (!this) + continue; + + stack[ctr].dentry = this; + stack[ctr].mnt = lowerpath.mnt; + ctr++; + + if (d.stop) + break; + + if (d.redirect && + d.redirect[0] == '/' && + poe != dentry->d_sb->s_root->d_fsdata) { + poe = dentry->d_sb->s_root->d_fsdata; + + /* Find the current layer on the root dentry */ + for (i = 0; i < poe->numlower; i++) + if (poe->lowerstack[i].mnt == lowerpath.mnt) + break; + if (WARN_ON(i == poe->numlower)) + break; + } + } + + oe = ovl_alloc_entry(ctr); + err = -ENOMEM; + if (!oe) + goto out_put; + + if (upperdentry || ctr) { + struct dentry *realdentry; + struct inode *realinode; + + realdentry = upperdentry ? 
upperdentry : stack[0].dentry; + realinode = d_inode(realdentry); + + err = -ENOMEM; + if (upperdentry && !d_is_dir(upperdentry)) { + inode = ovl_get_inode(dentry->d_sb, realinode); + } else { + inode = ovl_new_inode(dentry->d_sb, realinode->i_mode, + realinode->i_rdev); + if (inode) + ovl_inode_init(inode, realinode, !!upperdentry); + } + if (!inode) + goto out_free_oe; + ovl_copyattr(realdentry->d_inode, inode); + } + + revert_creds(old_cred); + oe->opaque = upperopaque; + oe->redirect = upperredirect; + oe->__upperdentry = upperdentry; + memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); + kfree(stack); + kfree(d.redirect); + dentry->d_fsdata = oe; + d_add(dentry, inode); + + return NULL; + +out_free_oe: + kfree(oe); +out_put: + for (i = 0; i < ctr; i++) + dput(stack[i].dentry); + kfree(stack); +out_put_upper: + dput(upperdentry); + kfree(upperredirect); +out: + kfree(d.redirect); + revert_creds(old_cred); + return ERR_PTR(err); +} + +bool ovl_lower_positive(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + struct ovl_entry *poe = dentry->d_parent->d_fsdata; + const struct qstr *name = &dentry->d_name; + unsigned int i; + bool positive = false; + bool done = false; + + /* + * If dentry is negative, then lower is positive iff this is a + * whiteout. + */ + if (!dentry->d_inode) + return oe->opaque; + + /* Negative upper -> positive lower */ + if (!oe->__upperdentry) + return true; + + /* Positive upper -> have to look up lower to see whether it exists */ + for (i = 0; !done && !positive && i < poe->numlower; i++) { + struct dentry *this; + struct dentry *lowerdir = poe->lowerstack[i].dentry; + + this = lookup_one_len_unlocked(name->name, lowerdir, + name->len); + if (IS_ERR(this)) { + switch (PTR_ERR(this)) { + case -ENOENT: + case -ENAMETOOLONG: + break; + + default: + /* + * Assume something is there, we just couldn't + * access it. + */ + positive = true; + break; + } + } else { + if (this->d_inode) { + positive = !ovl_is_whiteout(this); + done = true; + } + dput(this); + } + } + + return positive; +} diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index e218e741cb99..8af450b0e57a 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -9,23 +9,17 @@ #include <linux/kernel.h> -struct ovl_entry; - enum ovl_path_type { - __OVL_PATH_PURE = (1 << 0), - __OVL_PATH_UPPER = (1 << 1), - __OVL_PATH_MERGE = (1 << 2), + __OVL_PATH_UPPER = (1 << 0), + __OVL_PATH_MERGE = (1 << 1), }; #define OVL_TYPE_UPPER(type) ((type) & __OVL_PATH_UPPER) #define OVL_TYPE_MERGE(type) ((type) & __OVL_PATH_MERGE) -#define OVL_TYPE_PURE_UPPER(type) ((type) & __OVL_PATH_PURE) -#define OVL_TYPE_MERGE_OR_LOWER(type) \ - (OVL_TYPE_MERGE(type) || !OVL_TYPE_UPPER(type)) - #define OVL_XATTR_PREFIX XATTR_TRUSTED_PREFIX "overlay." 
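For reference, OVL_XATTR_REDIRECT, defined on the next line, expands to "trusted.overlay.redirect"; ovl_set_redirect() stores the old pathname of a renamed merge directory there so lookup can follow it back into the lower layers. A sketch of inspecting that xattr from userspace, assuming a hypothetical upper-layer path (reading trusted.* xattrs requires CAP_SYS_ADMIN):

	#include <stdio.h>
	#include <sys/types.h>
	#include <sys/xattr.h>

	int main(void)
	{
		char buf[4096];
		/* "/upper/renamed-dir" is a hypothetical upper-layer dir. */
		ssize_t n = getxattr("/upper/renamed-dir",
				     "trusted.overlay.redirect",
				     buf, sizeof(buf) - 1);

		if (n < 0) {
			perror("getxattr");
			return 1;
		}
		buf[n] = '\0';
		printf("redirect -> %s\n", buf);
		return 0;
	}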
#define OVL_XATTR_OPAQUE OVL_XATTR_PREFIX "opaque" +#define OVL_XATTR_REDIRECT OVL_XATTR_PREFIX "redirect" #define OVL_ISUPPER_MASK 1UL @@ -143,35 +137,43 @@ static inline struct inode *ovl_inode_real(struct inode *inode, bool *is_upper) return (struct inode *) (x & ~OVL_ISUPPER_MASK); } +/* util.c */ +int ovl_want_write(struct dentry *dentry); +void ovl_drop_write(struct dentry *dentry); +struct dentry *ovl_workdir(struct dentry *dentry); +const struct cred *ovl_override_creds(struct super_block *sb); +struct ovl_entry *ovl_alloc_entry(unsigned int numlower); +bool ovl_dentry_remote(struct dentry *dentry); +bool ovl_dentry_weird(struct dentry *dentry); enum ovl_path_type ovl_path_type(struct dentry *dentry); -u64 ovl_dentry_version_get(struct dentry *dentry); -void ovl_dentry_version_inc(struct dentry *dentry); void ovl_path_upper(struct dentry *dentry, struct path *path); void ovl_path_lower(struct dentry *dentry, struct path *path); enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); -int ovl_path_next(int idx, struct dentry *dentry, struct path *path); struct dentry *ovl_dentry_upper(struct dentry *dentry); struct dentry *ovl_dentry_lower(struct dentry *dentry); struct dentry *ovl_dentry_real(struct dentry *dentry); -struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode, - bool is_upper); struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); -struct dentry *ovl_workdir(struct dentry *dentry); -int ovl_want_write(struct dentry *dentry); -void ovl_drop_write(struct dentry *dentry); bool ovl_dentry_is_opaque(struct dentry *dentry); -void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque); -bool ovl_is_whiteout(struct dentry *dentry); -const struct cred *ovl_override_creds(struct super_block *sb); +bool ovl_dentry_is_whiteout(struct dentry *dentry); +void ovl_dentry_set_opaque(struct dentry *dentry); +bool ovl_redirect_dir(struct super_block *sb); +void ovl_clear_redirect_dir(struct super_block *sb); +const char *ovl_dentry_get_redirect(struct dentry *dentry); +void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect); void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); +void ovl_inode_init(struct inode *inode, struct inode *realinode, + bool is_upper); void ovl_inode_update(struct inode *inode, struct inode *upperinode); -struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags); +void ovl_dentry_version_inc(struct dentry *dentry); +u64 ovl_dentry_version_get(struct dentry *dentry); +bool ovl_is_whiteout(struct dentry *dentry); struct file *ovl_path_open(struct path *path, int flags); -struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry, - struct kstat *stat, const char *link); +/* namei.c */ +int ovl_path_next(int idx, struct dentry *dentry, struct path *path); +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags); +bool ovl_lower_positive(struct dentry *dentry); /* readdir.c */ extern const struct file_operations ovl_dir_operations; @@ -195,7 +197,7 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags); int ovl_update_time(struct inode *inode, struct timespec *ts, int flags); bool ovl_is_private_xattr(const char *name); -struct inode *ovl_new_inode(struct super_block *sb, umode_t mode); +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, dev_t rdev); struct inode 
*ovl_get_inode(struct super_block *sb, struct inode *realinode); static inline void ovl_copyattr(struct inode *from, struct inode *to) { @@ -210,14 +212,18 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to) /* dir.c */ extern const struct inode_operations ovl_dir_inode_operations; struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry); +struct cattr { + dev_t rdev; + umode_t mode; + const char *link; +}; int ovl_create_real(struct inode *dir, struct dentry *newdentry, - struct kstat *stat, const char *link, + struct cattr *attr, struct dentry *hardlink, bool debug); void ovl_cleanup(struct inode *dir, struct dentry *dentry); /* copy_up.c */ int ovl_copy_up(struct dentry *dentry); -int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, - struct path *lowerpath, struct kstat *stat); +int ovl_copy_up_flags(struct dentry *dentry, int flags); int ovl_copy_xattr(struct dentry *old, struct dentry *new); int ovl_set_attr(struct dentry *upper, struct kstat *stat); diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h new file mode 100644 index 000000000000..d14bca1850d9 --- /dev/null +++ b/fs/overlayfs/ovl_entry.h @@ -0,0 +1,53 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * Copyright (C) 2016 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +struct ovl_config { + char *lowerdir; + char *upperdir; + char *workdir; + bool default_permissions; + bool redirect_dir; +}; + +/* private information held for overlayfs's superblock */ +struct ovl_fs { + struct vfsmount *upper_mnt; + unsigned numlower; + struct vfsmount **lower_mnt; + struct dentry *workdir; + long namelen; + /* pathnames of lower and upper dirs, for show_options */ + struct ovl_config config; + /* creds of process who forced instantiation of super block */ + const struct cred *creator_cred; +}; + +/* private information held for every overlayfs dentry */ +struct ovl_entry { + struct dentry *__upperdentry; + struct ovl_dir_cache *cache; + union { + struct { + u64 version; + const char *redirect; + bool opaque; + }; + struct rcu_head rcu; + }; + unsigned numlower; + struct path lowerstack[]; +}; + +struct ovl_entry *ovl_alloc_entry(unsigned int numlower); + +static inline struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) +{ + return lockless_dereference(oe->__upperdentry); +} diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index edd46a0e951d..20f48abbb82f 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -9,280 +9,29 @@ #include <linux/fs.h> #include <linux/namei.h> -#include <linux/pagemap.h> #include <linux/xattr.h> -#include <linux/security.h> #include <linux/mount.h> -#include <linux/slab.h> #include <linux/parser.h> #include <linux/module.h> -#include <linux/sched.h> #include <linux/statfs.h> #include <linux/seq_file.h> #include <linux/posix_acl_xattr.h> #include "overlayfs.h" +#include "ovl_entry.h" MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Overlay filesystem"); MODULE_LICENSE("GPL"); -struct ovl_config { - char *lowerdir; - char *upperdir; - char *workdir; - bool default_permissions; -}; - -/* private information held for overlayfs's superblock */ -struct ovl_fs { - struct vfsmount *upper_mnt; - unsigned numlower; - struct vfsmount **lower_mnt; - struct dentry *workdir; - long lower_namelen; - /* pathnames of lower and upper dirs, for 
show_options */ - struct ovl_config config; - /* creds of process who forced instantiation of super block */ - const struct cred *creator_cred; -}; struct ovl_dir_cache; -/* private information held for every overlayfs dentry */ -struct ovl_entry { - struct dentry *__upperdentry; - struct ovl_dir_cache *cache; - union { - struct { - u64 version; - bool opaque; - }; - struct rcu_head rcu; - }; - unsigned numlower; - struct path lowerstack[]; -}; - #define OVL_MAX_STACK 500 -static struct dentry *__ovl_dentry_lower(struct ovl_entry *oe) -{ - return oe->numlower ? oe->lowerstack[0].dentry : NULL; -} - -enum ovl_path_type ovl_path_type(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - enum ovl_path_type type = 0; - - if (oe->__upperdentry) { - type = __OVL_PATH_UPPER; - - /* - * Non-dir dentry can hold lower dentry from previous - * location. Its purity depends only on opaque flag. - */ - if (oe->numlower && S_ISDIR(dentry->d_inode->i_mode)) - type |= __OVL_PATH_MERGE; - else if (!oe->opaque) - type |= __OVL_PATH_PURE; - } else { - if (oe->numlower > 1) - type |= __OVL_PATH_MERGE; - } - return type; -} - -static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) -{ - return lockless_dereference(oe->__upperdentry); -} - -void ovl_path_upper(struct dentry *dentry, struct path *path) -{ - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; - struct ovl_entry *oe = dentry->d_fsdata; - - path->mnt = ofs->upper_mnt; - path->dentry = ovl_upperdentry_dereference(oe); -} - -enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) -{ - enum ovl_path_type type = ovl_path_type(dentry); - - if (!OVL_TYPE_UPPER(type)) - ovl_path_lower(dentry, path); - else - ovl_path_upper(dentry, path); - - return type; -} - -struct dentry *ovl_dentry_upper(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - return ovl_upperdentry_dereference(oe); -} - -struct dentry *ovl_dentry_lower(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - return __ovl_dentry_lower(oe); -} - -struct dentry *ovl_dentry_real(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - struct dentry *realdentry; - - realdentry = ovl_upperdentry_dereference(oe); - if (!realdentry) - realdentry = __ovl_dentry_lower(oe); - - return realdentry; -} - -static void ovl_inode_init(struct inode *inode, struct inode *realinode, - bool is_upper) -{ - WRITE_ONCE(inode->i_private, (unsigned long) realinode | - (is_upper ? OVL_ISUPPER_MASK : 0)); -} - -struct vfsmount *ovl_entry_mnt_real(struct ovl_entry *oe, struct inode *inode, - bool is_upper) -{ - if (is_upper) { - struct ovl_fs *ofs = inode->i_sb->s_fs_info; - - return ofs->upper_mnt; - } else { - return oe->numlower ? oe->lowerstack[0].mnt : NULL; - } -} - -struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - return oe->cache; -} - -void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - oe->cache = cache; -} - -void ovl_path_lower(struct dentry *dentry, struct path *path) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - *path = oe->numlower ? 
oe->lowerstack[0] : (struct path) { NULL, NULL }; -} - -int ovl_want_write(struct dentry *dentry) -{ - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; - return mnt_want_write(ofs->upper_mnt); -} - -void ovl_drop_write(struct dentry *dentry) -{ - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; - mnt_drop_write(ofs->upper_mnt); -} - -struct dentry *ovl_workdir(struct dentry *dentry) -{ - struct ovl_fs *ofs = dentry->d_sb->s_fs_info; - return ofs->workdir; -} - -bool ovl_dentry_is_opaque(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - return oe->opaque; -} - -void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque) -{ - struct ovl_entry *oe = dentry->d_fsdata; - oe->opaque = opaque; -} - -void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode)); - WARN_ON(oe->__upperdentry); - /* - * Make sure upperdentry is consistent before making it visible to - * ovl_upperdentry_dereference(). - */ - smp_wmb(); - oe->__upperdentry = upperdentry; -} - -void ovl_inode_update(struct inode *inode, struct inode *upperinode) -{ - WARN_ON(!upperinode); - WARN_ON(!inode_unhashed(inode)); - WRITE_ONCE(inode->i_private, - (unsigned long) upperinode | OVL_ISUPPER_MASK); - if (!S_ISDIR(upperinode->i_mode)) - __insert_inode_hash(inode, (unsigned long) upperinode); -} - -void ovl_dentry_version_inc(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - WARN_ON(!inode_is_locked(dentry->d_inode)); - oe->version++; -} - -u64 ovl_dentry_version_get(struct dentry *dentry) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - WARN_ON(!inode_is_locked(dentry->d_inode)); - return oe->version; -} - -bool ovl_is_whiteout(struct dentry *dentry) -{ - struct inode *inode = dentry->d_inode; - - return inode && IS_WHITEOUT(inode); -} - -const struct cred *ovl_override_creds(struct super_block *sb) -{ - struct ovl_fs *ofs = sb->s_fs_info; - - return override_creds(ofs->creator_cred); -} - -static bool ovl_is_opaquedir(struct dentry *dentry) -{ - int res; - char val; - - if (!d_is_dir(dentry)) - return false; - - res = vfs_getxattr(dentry, OVL_XATTR_OPAQUE, &val, 1); - if (res == 1 && val == 'y') - return true; - - return false; -} +static bool ovl_redirect_dir_def = IS_ENABLED(CONFIG_OVERLAY_FS_REDIRECT_DIR); +module_param_named(redirect_dir, ovl_redirect_dir_def, bool, 0644); +MODULE_PARM_DESC(ovl_redirect_dir_def, + "Default to on or off for the redirect_dir feature"); static void ovl_dentry_release(struct dentry *dentry) { @@ -292,6 +41,7 @@ static void ovl_dentry_release(struct dentry *dentry) unsigned int i; dput(oe->__upperdentry); + kfree(oe->redirect); for (i = 0; i < oe->numlower; i++) dput(oe->lowerstack[i].dentry); kfree_rcu(oe, rcu); @@ -304,7 +54,7 @@ static struct dentry *ovl_d_real(struct dentry *dentry, { struct dentry *real; - if (d_is_dir(dentry)) { + if (!d_is_reg(dentry)) { if (!inode || inode == d_inode(dentry)) return dentry; goto bug; @@ -328,11 +78,11 @@ static struct dentry *ovl_d_real(struct dentry *dentry, if (!real) goto bug; + /* Handle recursion */ + real = d_real(real, inode, open_flags); + if (!inode || inode == d_inode(real)) return real; - - /* Handle recursion */ - return d_real(real, inode, open_flags); bug: WARN(1, "ovl_d_real(%pd4, %s:%lu): real dentry not found\n", dentry, inode ? inode->i_sb->s_id : "NULL", inode ? 
inode->i_ino : 0); @@ -392,226 +142,6 @@ static const struct dentry_operations ovl_reval_dentry_operations = { .d_weak_revalidate = ovl_dentry_weak_revalidate, }; -static struct ovl_entry *ovl_alloc_entry(unsigned int numlower) -{ - size_t size = offsetof(struct ovl_entry, lowerstack[numlower]); - struct ovl_entry *oe = kzalloc(size, GFP_KERNEL); - - if (oe) - oe->numlower = numlower; - - return oe; -} - -static bool ovl_dentry_remote(struct dentry *dentry) -{ - return dentry->d_flags & - (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | - DCACHE_OP_REAL); -} - -static bool ovl_dentry_weird(struct dentry *dentry) -{ - return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | - DCACHE_MANAGE_TRANSIT | - DCACHE_OP_HASH | - DCACHE_OP_COMPARE); -} - -static inline struct dentry *ovl_lookup_real(struct dentry *dir, - const struct qstr *name) -{ - struct dentry *dentry; - - dentry = lookup_one_len_unlocked(name->name, dir, name->len); - - if (IS_ERR(dentry)) { - if (PTR_ERR(dentry) == -ENOENT) - dentry = NULL; - } else if (!dentry->d_inode) { - dput(dentry); - dentry = NULL; - } else if (ovl_dentry_weird(dentry)) { - dput(dentry); - /* Don't support traversing automounts and other weirdness */ - dentry = ERR_PTR(-EREMOTE); - } - return dentry; -} - -/* - * Returns next layer in stack starting from top. - * Returns -1 if this is the last layer. - */ -int ovl_path_next(int idx, struct dentry *dentry, struct path *path) -{ - struct ovl_entry *oe = dentry->d_fsdata; - - BUG_ON(idx < 0); - if (idx == 0) { - ovl_path_upper(dentry, path); - if (path->dentry) - return oe->numlower ? 1 : -1; - idx++; - } - BUG_ON(idx > oe->numlower); - *path = oe->lowerstack[idx - 1]; - - return (idx < oe->numlower) ? idx + 1 : -1; -} - -struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, - unsigned int flags) -{ - struct ovl_entry *oe; - const struct cred *old_cred; - struct ovl_entry *poe = dentry->d_parent->d_fsdata; - struct path *stack = NULL; - struct dentry *upperdir, *upperdentry = NULL; - unsigned int ctr = 0; - struct inode *inode = NULL; - bool upperopaque = false; - struct dentry *this, *prev = NULL; - unsigned int i; - int err; - - old_cred = ovl_override_creds(dentry->d_sb); - upperdir = ovl_upperdentry_dereference(poe); - if (upperdir) { - this = ovl_lookup_real(upperdir, &dentry->d_name); - err = PTR_ERR(this); - if (IS_ERR(this)) - goto out; - - if (this) { - if (unlikely(ovl_dentry_remote(this))) { - dput(this); - err = -EREMOTE; - goto out; - } - if (ovl_is_whiteout(this)) { - dput(this); - this = NULL; - upperopaque = true; - } else if (poe->numlower && ovl_is_opaquedir(this)) { - upperopaque = true; - } - } - upperdentry = prev = this; - } - - if (!upperopaque && poe->numlower) { - err = -ENOMEM; - stack = kcalloc(poe->numlower, sizeof(struct path), GFP_KERNEL); - if (!stack) - goto out_put_upper; - } - - for (i = 0; !upperopaque && i < poe->numlower; i++) { - bool opaque = false; - struct path lowerpath = poe->lowerstack[i]; - - this = ovl_lookup_real(lowerpath.dentry, &dentry->d_name); - err = PTR_ERR(this); - if (IS_ERR(this)) { - /* - * If it's positive, then treat ENAMETOOLONG as ENOENT. - */ - if (err == -ENAMETOOLONG && (upperdentry || ctr)) - continue; - goto out_put; - } - if (!this) - continue; - if (ovl_is_whiteout(this)) { - dput(this); - break; - } - /* - * Only makes sense to check opaque dir if this is not the - * lowermost layer. 
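The loop being removed here is superseded by ovl_lookup_layer() in the new namei.c earlier in this patch, which additionally follows redirects: an absolute redirect such as "/a/b/c" is walked component by component with strchrnul(), and empty components ("//") are rejected as invalid. A standalone sketch of that walking pattern (GNU strchrnul(), hypothetical redirect value):

	#define _GNU_SOURCE
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		const char *redirect = "/a/b/c";	/* hypothetical value */
		const char *s = redirect + 1;		/* skip leading '/' */

		while (*s) {
			/* next '/' or the terminating NUL, so the last
			 * component needs no special case */
			const char *next = strchrnul(s, '/');

			printf("component: %.*s\n", (int)(next - s), s);
			s = *next ? next + 1 : next;
		}
		return 0;
	}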
- */ - if (i < poe->numlower - 1 && ovl_is_opaquedir(this)) - opaque = true; - - if (prev && (!S_ISDIR(prev->d_inode->i_mode) || - !S_ISDIR(this->d_inode->i_mode))) { - /* - * FIXME: check for upper-opaqueness maybe better done - * in remove code. - */ - if (prev == upperdentry) - upperopaque = true; - dput(this); - break; - } - /* - * If this is a non-directory then stop here. - */ - if (!S_ISDIR(this->d_inode->i_mode)) - opaque = true; - - stack[ctr].dentry = this; - stack[ctr].mnt = lowerpath.mnt; - ctr++; - prev = this; - if (opaque) - break; - } - - oe = ovl_alloc_entry(ctr); - err = -ENOMEM; - if (!oe) - goto out_put; - - if (upperdentry || ctr) { - struct dentry *realdentry; - struct inode *realinode; - - realdentry = upperdentry ? upperdentry : stack[0].dentry; - realinode = d_inode(realdentry); - - err = -ENOMEM; - if (upperdentry && !d_is_dir(upperdentry)) { - inode = ovl_get_inode(dentry->d_sb, realinode); - } else { - inode = ovl_new_inode(dentry->d_sb, realinode->i_mode); - if (inode) - ovl_inode_init(inode, realinode, !!upperdentry); - } - if (!inode) - goto out_free_oe; - ovl_copyattr(realdentry->d_inode, inode); - } - - revert_creds(old_cred); - oe->opaque = upperopaque; - oe->__upperdentry = upperdentry; - memcpy(oe->lowerstack, stack, sizeof(struct path) * ctr); - kfree(stack); - dentry->d_fsdata = oe; - d_add(dentry, inode); - - return NULL; - -out_free_oe: - kfree(oe); -out_put: - for (i = 0; i < ctr; i++) - dput(stack[i].dentry); - kfree(stack); -out_put_upper: - dput(upperdentry); -out: - revert_creds(old_cred); - return ERR_PTR(err); -} - -struct file *ovl_path_open(struct path *path, int flags) -{ - return dentry_open(path, flags | O_NOATIME, current_cred()); -} - static void ovl_put_super(struct super_block *sb) { struct ovl_fs *ufs = sb->s_fs_info; @@ -649,7 +179,7 @@ static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) err = vfs_statfs(&path, buf); if (!err) { - buf->f_namelen = max(buf->f_namelen, ofs->lower_namelen); + buf->f_namelen = ofs->namelen; buf->f_type = OVERLAYFS_SUPER_MAGIC; } @@ -674,6 +204,9 @@ static int ovl_show_options(struct seq_file *m, struct dentry *dentry) } if (ufs->config.default_permissions) seq_puts(m, ",default_permissions"); + if (ufs->config.redirect_dir != ovl_redirect_dir_def) + seq_printf(m, ",redirect_dir=%s", + ufs->config.redirect_dir ? 
"on" : "off"); return 0; } @@ -700,6 +233,8 @@ enum { OPT_UPPERDIR, OPT_WORKDIR, OPT_DEFAULT_PERMISSIONS, + OPT_REDIRECT_DIR_ON, + OPT_REDIRECT_DIR_OFF, OPT_ERR, }; @@ -708,6 +243,8 @@ static const match_table_t ovl_tokens = { {OPT_UPPERDIR, "upperdir=%s"}, {OPT_WORKDIR, "workdir=%s"}, {OPT_DEFAULT_PERMISSIONS, "default_permissions"}, + {OPT_REDIRECT_DIR_ON, "redirect_dir=on"}, + {OPT_REDIRECT_DIR_OFF, "redirect_dir=off"}, {OPT_ERR, NULL} }; @@ -772,6 +309,14 @@ static int ovl_parse_opt(char *opt, struct ovl_config *config) config->default_permissions = true; break; + case OPT_REDIRECT_DIR_ON: + config->redirect_dir = true; + break; + + case OPT_REDIRECT_DIR_OFF: + config->redirect_dir = false; + break; + default: pr_err("overlayfs: unrecognized mount option \"%s\" or missing value\n", p); return -EINVAL; @@ -809,12 +354,9 @@ retry: strlen(OVL_WORKDIR_NAME)); if (!IS_ERR(work)) { - struct kstat stat = { - .mode = S_IFDIR | 0, - }; struct iattr attr = { .ia_valid = ATTR_MODE, - .ia_mode = stat.mode, + .ia_mode = S_IFDIR | 0, }; if (work->d_inode) { @@ -828,7 +370,9 @@ retry: goto retry; } - err = ovl_create_real(dir, work, &stat, NULL, NULL, true); + err = ovl_create_real(dir, work, + &(struct cattr){.mode = S_IFDIR | 0}, + NULL, true); if (err) goto out_dput; @@ -903,7 +447,7 @@ static int ovl_mount_dir_noesc(const char *name, struct path *path) pr_err("overlayfs: filesystem on '%s' not supported\n", name); goto out_put; } - if (!S_ISDIR(path->dentry->d_inode->i_mode)) { + if (!d_is_dir(path->dentry)) { pr_err("overlayfs: '%s' not a directory\n", name); goto out_put; } @@ -936,22 +480,33 @@ static int ovl_mount_dir(const char *name, struct path *path) return err; } -static int ovl_lower_dir(const char *name, struct path *path, long *namelen, - int *stack_depth, bool *remote) +static int ovl_check_namelen(struct path *path, struct ovl_fs *ofs, + const char *name) { - int err; struct kstatfs statfs; + int err = vfs_statfs(path, &statfs); + + if (err) + pr_err("overlayfs: statfs failed on '%s'\n", name); + else + ofs->namelen = max(ofs->namelen, statfs.f_namelen); + + return err; +} + +static int ovl_lower_dir(const char *name, struct path *path, + struct ovl_fs *ofs, int *stack_depth, bool *remote) +{ + int err; err = ovl_mount_dir_noesc(name, path); if (err) goto out; - err = vfs_statfs(path, &statfs); - if (err) { - pr_err("overlayfs: statfs failed on '%s'\n", name); + err = ovl_check_namelen(path, ofs, name); + if (err) goto out_put; - } - *namelen = max(*namelen, statfs.f_namelen); + *stack_depth = max(*stack_depth, path->mnt->mnt_sb->s_stack_depth); if (ovl_dentry_remote(path->dentry)) @@ -1067,7 +622,7 @@ static int ovl_own_xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { - return -EPERM; + return -EOPNOTSUPP; } static int ovl_own_xattr_set(const struct xattr_handler *handler, @@ -1075,7 +630,7 @@ static int ovl_own_xattr_set(const struct xattr_handler *handler, const char *name, const void *value, size_t size, int flags) { - return -EPERM; + return -EOPNOTSUPP; } static int ovl_other_xattr_get(const struct xattr_handler *handler, @@ -1153,6 +708,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) if (!ufs) goto out; + ufs->config.redirect_dir = ovl_redirect_dir_def; err = ovl_parse_opt((char *) data, &ufs->config); if (err) goto out_free_config; @@ -1183,6 +739,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_put_upperpath; } + err 
= ovl_check_namelen(&upperpath, ufs, ufs->config.upperdir); + if (err) + goto out_put_upperpath; + err = ovl_mount_dir(ufs->config.workdir, &workpath); if (err) goto out_put_upperpath; @@ -1214,15 +774,16 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) goto out_free_lowertmp; } + err = -ENOMEM; stack = kcalloc(stacklen, sizeof(struct path), GFP_KERNEL); if (!stack) goto out_free_lowertmp; + err = -EINVAL; lower = lowertmp; for (numlower = 0; numlower < stacklen; numlower++) { - err = ovl_lower_dir(lower, &stack[numlower], - &ufs->lower_namelen, &sb->s_stack_depth, - &remote); + err = ovl_lower_dir(lower, &stack[numlower], ufs, + &sb->s_stack_depth, &remote); if (err) goto out_put_lowerpath; @@ -1324,7 +885,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = ufs; sb->s_flags |= MS_POSIXACL | MS_NOREMOTELOCK; - root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR)); + root_dentry = d_make_root(ovl_new_inode(sb, S_IFDIR, 0)); if (!root_dentry) goto out_free_oe; diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c new file mode 100644 index 000000000000..952286f4826c --- /dev/null +++ b/fs/overlayfs/util.c @@ -0,0 +1,265 @@ +/* + * Copyright (C) 2011 Novell Inc. + * Copyright (C) 2016 Red Hat, Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/mount.h> +#include <linux/slab.h> +#include <linux/xattr.h> +#include "overlayfs.h" +#include "ovl_entry.h" + +int ovl_want_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return mnt_want_write(ofs->upper_mnt); +} + +void ovl_drop_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + mnt_drop_write(ofs->upper_mnt); +} + +struct dentry *ovl_workdir(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return ofs->workdir; +} + +const struct cred *ovl_override_creds(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return override_creds(ofs->creator_cred); +} + +struct ovl_entry *ovl_alloc_entry(unsigned int numlower) +{ + size_t size = offsetof(struct ovl_entry, lowerstack[numlower]); + struct ovl_entry *oe = kzalloc(size, GFP_KERNEL); + + if (oe) + oe->numlower = numlower; + + return oe; +} + +bool ovl_dentry_remote(struct dentry *dentry) +{ + return dentry->d_flags & + (DCACHE_OP_REVALIDATE | DCACHE_OP_WEAK_REVALIDATE | + DCACHE_OP_REAL); +} + +bool ovl_dentry_weird(struct dentry *dentry) +{ + return dentry->d_flags & (DCACHE_NEED_AUTOMOUNT | + DCACHE_MANAGE_TRANSIT | + DCACHE_OP_HASH | + DCACHE_OP_COMPARE); +} + +enum ovl_path_type ovl_path_type(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + enum ovl_path_type type = 0; + + if (oe->__upperdentry) { + type = __OVL_PATH_UPPER; + + /* + * Non-dir dentry can hold lower dentry from previous + * location. 
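With __OVL_PATH_PURE gone, ovl_path_type() above is down to two bits, and "pure upper" is simply UPPER without MERGE. A small standalone model of the four resulting states, illustrative only:

	#include <stdio.h>

	#define UPPER (1 << 0)		/* models __OVL_PATH_UPPER */
	#define MERGE (1 << 1)		/* models __OVL_PATH_MERGE */

	static const char *describe(int type)
	{
		if (type & UPPER)
			return (type & MERGE) ? "merge dir (upper over lower)"
					      : "pure upper";
		return (type & MERGE) ? "merged lower dirs" : "single lower";
	}

	int main(void)
	{
		printf("%s\n", describe(UPPER | MERGE));
		return 0;
	}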
+ */ + if (oe->numlower && d_is_dir(dentry)) + type |= __OVL_PATH_MERGE; + } else { + if (oe->numlower > 1) + type |= __OVL_PATH_MERGE; + } + return type; +} + +void ovl_path_upper(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + + path->mnt = ofs->upper_mnt; + path->dentry = ovl_upperdentry_dereference(oe); +} + +void ovl_path_lower(struct dentry *dentry, struct path *path) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + *path = oe->numlower ? oe->lowerstack[0] : (struct path) { NULL, NULL }; +} + +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) +{ + enum ovl_path_type type = ovl_path_type(dentry); + + if (!OVL_TYPE_UPPER(type)) + ovl_path_lower(dentry, path); + else + ovl_path_upper(dentry, path); + + return type; +} + +struct dentry *ovl_dentry_upper(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return ovl_upperdentry_dereference(oe); +} + +static struct dentry *__ovl_dentry_lower(struct ovl_entry *oe) +{ + return oe->numlower ? oe->lowerstack[0].dentry : NULL; +} + +struct dentry *ovl_dentry_lower(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return __ovl_dentry_lower(oe); +} + +struct dentry *ovl_dentry_real(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + struct dentry *realdentry; + + realdentry = ovl_upperdentry_dereference(oe); + if (!realdentry) + realdentry = __ovl_dentry_lower(oe); + + return realdentry; +} + +struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->cache; +} + +void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + oe->cache = cache; +} + +bool ovl_dentry_is_opaque(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + return oe->opaque; +} + +bool ovl_dentry_is_whiteout(struct dentry *dentry) +{ + return !dentry->d_inode && ovl_dentry_is_opaque(dentry); +} + +void ovl_dentry_set_opaque(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + oe->opaque = true; +} + +bool ovl_redirect_dir(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + return ofs->config.redirect_dir; +} + +void ovl_clear_redirect_dir(struct super_block *sb) +{ + struct ovl_fs *ofs = sb->s_fs_info; + + ofs->config.redirect_dir = false; +} + +const char *ovl_dentry_get_redirect(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->redirect; +} + +void ovl_dentry_set_redirect(struct dentry *dentry, const char *redirect) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + kfree(oe->redirect); + oe->redirect = redirect; +} + +void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode)); + WARN_ON(oe->__upperdentry); + /* + * Make sure upperdentry is consistent before making it visible to + * ovl_upperdentry_dereference(). + */ + smp_wmb(); + oe->__upperdentry = upperdentry; +} + +void ovl_inode_init(struct inode *inode, struct inode *realinode, bool is_upper) +{ + WRITE_ONCE(inode->i_private, (unsigned long) realinode | + (is_upper ? 
OVL_ISUPPER_MASK : 0)); +} + +void ovl_inode_update(struct inode *inode, struct inode *upperinode) +{ + WARN_ON(!upperinode); + WARN_ON(!inode_unhashed(inode)); + WRITE_ONCE(inode->i_private, + (unsigned long) upperinode | OVL_ISUPPER_MASK); + if (!S_ISDIR(upperinode->i_mode)) + __insert_inode_hash(inode, (unsigned long) upperinode); +} + +void ovl_dentry_version_inc(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!inode_is_locked(dentry->d_inode)); + oe->version++; +} + +u64 ovl_dentry_version_get(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!inode_is_locked(dentry->d_inode)); + return oe->version; +} + +bool ovl_is_whiteout(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + return inode && IS_WHITEOUT(inode); +} + +struct file *ovl_path_open(struct path *path, int flags) +{ + return dentry_open(path, flags | O_NOATIME, current_cred()); +} diff --git a/fs/pipe.c b/fs/pipe.c index 8e0d9f26dfad..73b84baf58f8 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -23,7 +23,7 @@ #include <linux/fcntl.h> #include <linux/memcontrol.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/ioctls.h> #include "internal.h" diff --git a/fs/pnode.c b/fs/pnode.c index 234a9ac49958..06a793f4ae38 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -67,49 +67,47 @@ int get_dominating_id(struct mount *mnt, const struct path *root) static int do_make_slave(struct mount *mnt) { - struct mount *peer_mnt = mnt, *master = mnt->mnt_master; - struct mount *slave_mnt; + struct mount *master, *slave_mnt; - /* - * slave 'mnt' to a peer mount that has the - * same root dentry. If none is available then - * slave it to anything that is available. - */ - while ((peer_mnt = next_peer(peer_mnt)) != mnt && - peer_mnt->mnt.mnt_root != mnt->mnt.mnt_root) ; - - if (peer_mnt == mnt) { - peer_mnt = next_peer(mnt); - if (peer_mnt == mnt) - peer_mnt = NULL; - } - if (mnt->mnt_group_id && IS_MNT_SHARED(mnt) && - list_empty(&mnt->mnt_share)) - mnt_release_group_id(mnt); - - list_del_init(&mnt->mnt_share); - mnt->mnt_group_id = 0; - - if (peer_mnt) - master = peer_mnt; - - if (master) { - list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) - slave_mnt->mnt_master = master; - list_move(&mnt->mnt_slave, &master->mnt_slave_list); - list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); - INIT_LIST_HEAD(&mnt->mnt_slave_list); + if (list_empty(&mnt->mnt_share)) { + if (IS_MNT_SHARED(mnt)) { + mnt_release_group_id(mnt); + CLEAR_MNT_SHARED(mnt); + } + master = mnt->mnt_master; + if (!master) { + struct list_head *p = &mnt->mnt_slave_list; + while (!list_empty(p)) { + slave_mnt = list_first_entry(p, + struct mount, mnt_slave); + list_del_init(&slave_mnt->mnt_slave); + slave_mnt->mnt_master = NULL; + } + return 0; + } } else { - struct list_head *p = &mnt->mnt_slave_list; - while (!list_empty(p)) { - slave_mnt = list_first_entry(p, - struct mount, mnt_slave); - list_del_init(&slave_mnt->mnt_slave); - slave_mnt->mnt_master = NULL; + struct mount *m; + /* + * slave 'mnt' to a peer mount that has the + * same root dentry. If none is available then + * slave it to anything that is available. 
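do_make_slave() is the kernel side of mount(2) with MS_SLAVE: the mount leaves its peer group and becomes a slave of a remaining peer (preferably one with the same root dentry) or of its old master. The userspace trigger, sketched with a hypothetical mount point (needs CAP_SYS_ADMIN):

	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* Convert an existing shared mount into a slave;
		 * "/mnt/shared" is a hypothetical mount point. */
		if (mount(NULL, "/mnt/shared", NULL, MS_SLAVE, NULL) == -1) {
			perror("mount(MS_SLAVE)");
			return 1;
		}
		return 0;
	}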
+ */ + for (m = master = next_peer(mnt); m != mnt; m = next_peer(m)) { + if (m->mnt.mnt_root == mnt->mnt.mnt_root) { + master = m; + break; + } } + list_del_init(&mnt->mnt_share); + mnt->mnt_group_id = 0; + CLEAR_MNT_SHARED(mnt); } + list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) + slave_mnt->mnt_master = master; + list_move(&mnt->mnt_slave, &master->mnt_slave_list); + list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); + INIT_LIST_HEAD(&mnt->mnt_slave_list); mnt->mnt_master = master; - CLEAR_MNT_SHARED(mnt); return 0; } diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 595522022aca..c9d48dc78495 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -922,11 +922,10 @@ int simple_set_acl(struct inode *inode, struct posix_acl *acl, int type) int error; if (type == ACL_TYPE_ACCESS) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) - return 0; - if (error == 0) - acl = NULL; + error = posix_acl_update_mode(inode, + &inode->i_mode, &acl); + if (error) + return error; } inode->i_ctime = current_time(inode); diff --git a/fs/proc/array.c b/fs/proc/array.c index 81818adb8e9e..51a4213afa2e 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -245,7 +245,7 @@ void render_sigset_t(struct seq_file *m, const char *header, if (sigismember(set, i+2)) x |= 2; if (sigismember(set, i+3)) x |= 4; if (sigismember(set, i+4)) x |= 8; - seq_printf(m, "%x", x); + seq_putc(m, hex_asc[x]); } while (i >= 4); seq_putc(m, '\n'); @@ -342,10 +342,11 @@ static inline void task_cap(struct seq_file *m, struct task_struct *p) static inline void task_seccomp(struct seq_file *m, struct task_struct *p) { + seq_put_decimal_ull(m, "NoNewPrivs:\t", task_no_new_privs(p)); #ifdef CONFIG_SECCOMP - seq_put_decimal_ull(m, "Seccomp:\t", p->seccomp.mode); - seq_putc(m, '\n'); + seq_put_decimal_ull(m, "\nSeccomp:\t", p->seccomp.mode); #endif + seq_putc(m, '\n'); } static inline void task_context_switch_counts(struct seq_file *m, diff --git a/fs/proc/base.c b/fs/proc/base.c index ca651ac00660..87c9a9aacda3 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -47,7 +47,7 @@ * Overall revision about smaps. */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/errno.h> #include <linux/time.h> @@ -104,9 +104,12 @@ * in /proc for a task before it execs a suid executable. */ +static u8 nlink_tid; +static u8 nlink_tgid; + struct pid_entry { const char *name; - int len; + unsigned int len; umode_t mode; const struct inode_operations *iop; const struct file_operations *fop; @@ -139,13 +142,13 @@ struct pid_entry { * Count the number of hardlinks for the pid_entry table, excluding the . * and .. links. 
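The helper renamed below keeps the usual directory link-count convention, nlink = 2 (for "." and the entry in the parent) plus one ".." link per subdirectory; the change folds the former "2 +" at the call sites into the helper and caches the result in nlink_tid/nlink_tgid at boot. A quick standalone check of the convention against an arbitrary directory:

	#include <stdio.h>
	#include <sys/stat.h>

	int main(void)
	{
		struct stat st;

		/* Any directory works; for /proc/<pid> the value should
		 * match the cached nlink_tgid. */
		if (stat("/proc/1", &st) == 0)
			printf("nlink = %lu\n", (unsigned long)st.st_nlink);
		return 0;
	}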
*/ -static unsigned int pid_entry_count_dirs(const struct pid_entry *entries, +static unsigned int __init pid_entry_nlink(const struct pid_entry *entries, unsigned int n) { unsigned int i; unsigned int count; - count = 0; + count = 2; for (i = 0; i < n; ++i) { if (S_ISDIR(entries[i].mode)) ++count; @@ -1243,7 +1246,7 @@ static const struct file_operations proc_oom_score_adj_operations = { }; #ifdef CONFIG_AUDITSYSCALL -#define TMPBUFLEN 21 +#define TMPBUFLEN 11 static ssize_t proc_loginuid_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { @@ -1664,7 +1667,8 @@ const struct inode_operations proc_pid_link_inode_operations = { /* building an inode */ -struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *task) +struct inode *proc_pid_make_inode(struct super_block * sb, + struct task_struct *task, umode_t mode) { struct inode * inode; struct proc_inode *ei; @@ -1678,6 +1682,7 @@ struct inode *proc_pid_make_inode(struct super_block * sb, struct task_struct *t /* Common stuff */ ei = PROC_I(inode); + inode->i_mode = mode; inode->i_ino = get_next_ino(); inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_op = &proc_def_inode_operations; @@ -1967,7 +1972,7 @@ out: struct map_files_info { fmode_t mode; - unsigned long len; + unsigned int len; unsigned char name[4*sizeof(long)+2]; /* max: %lx-%lx\0 */ }; @@ -2004,7 +2009,9 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | + ((mode & FMODE_READ ) ? S_IRUSR : 0) | + ((mode & FMODE_WRITE) ? S_IWUSR : 0)); if (!inode) return -ENOENT; @@ -2013,12 +2020,6 @@ proc_map_files_instantiate(struct inode *dir, struct dentry *dentry, inode->i_op = &proc_map_files_link_inode_operations; inode->i_size = 64; - inode->i_mode = S_IFLNK; - - if (mode & FMODE_READ) - inode->i_mode |= S_IRUSR; - if (mode & FMODE_WRITE) - inode->i_mode |= S_IWUSR; d_set_d_op(dentry, &tid_map_files_dentry_operations); d_add(dentry, inode); @@ -2372,12 +2373,11 @@ static int proc_pident_instantiate(struct inode *dir, struct inode *inode; struct proc_inode *ei; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, p->mode); if (!inode) goto out; ei = PROC_I(inode); - inode->i_mode = p->mode; if (S_ISDIR(inode->i_mode)) set_nlink(inode, 2); /* Use getattr to fix if necessary */ if (p->iop) @@ -2412,14 +2412,14 @@ static struct dentry *proc_pident_lookup(struct inode *dir, * Yes, it does not scale. And it should not. Don't add * new entries into /proc/<tgid>/ without very good reasons. 
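The lookup hunk that follows switches the scan to the one-past-the-end pointer idiom: C guarantees that &arr[n] may be formed and compared (though not dereferenced), which avoids the old "p <= last" form and its extra "- 1". A minimal illustration:

	#include <stdio.h>

	int main(void)
	{
		int ents[4] = { 1, 2, 3, 4 };
		const int *last = &ents[4];	/* one past the end: valid
						 * to form and compare */

		for (const int *p = ents; p < last; p++)
			printf("%d\n", *p);
		return 0;
	}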
*/ - last = &ents[nents - 1]; - for (p = ents; p <= last; p++) { + last = &ents[nents]; + for (p = ents; p < last; p++) { if (p->len != dentry->d_name.len) continue; if (!memcmp(dentry->d_name.name, p->name, p->len)) break; } - if (p > last) + if (p >= last) goto out; error = proc_pident_instantiate(dir, dentry, task, p); @@ -2444,7 +2444,7 @@ static int proc_pident_readdir(struct file *file, struct dir_context *ctx, if (ctx->pos >= nents + 2) goto out; - for (p = ents + (ctx->pos - 2); p <= ents + nents - 1; p++) { + for (p = ents + (ctx->pos - 2); p < ents + nents; p++) { if (!proc_fill_cache(file, ctx, p->name, p->len, proc_pident_instantiate, task, p)) break; @@ -3059,17 +3059,15 @@ static int proc_pid_instantiate(struct inode *dir, { struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) goto out; - inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; inode->i_op = &proc_tgid_base_inode_operations; inode->i_fop = &proc_tgid_base_operations; inode->i_flags|=S_IMMUTABLE; - set_nlink(inode, 2 + pid_entry_count_dirs(tgid_base_stuff, - ARRAY_SIZE(tgid_base_stuff))); + set_nlink(inode, nlink_tgid); d_set_d_op(dentry, &pid_dentry_operations); @@ -3181,6 +3179,8 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) iter.tgid += 1, iter = next_tgid(ns, iter)) { char name[PROC_NUMBUF]; int len; + + cond_resched(); if (!has_pid_permissions(ns, iter.task, 2)) continue; @@ -3352,17 +3352,15 @@ static int proc_task_instantiate(struct inode *dir, struct dentry *dentry, struct task_struct *task, const void *ptr) { struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFDIR | S_IRUGO | S_IXUGO); if (!inode) goto out; - inode->i_mode = S_IFDIR|S_IRUGO|S_IXUGO; inode->i_op = &proc_tid_base_inode_operations; inode->i_fop = &proc_tid_base_operations; inode->i_flags|=S_IMMUTABLE; - set_nlink(inode, 2 + pid_entry_count_dirs(tid_base_stuff, - ARRAY_SIZE(tid_base_stuff))); + set_nlink(inode, nlink_tid); d_set_d_op(dentry, &pid_dentry_operations); @@ -3552,3 +3550,9 @@ static const struct file_operations proc_task_operations = { .iterate_shared = proc_task_readdir, .llseek = generic_file_llseek, }; + +void __init set_proc_pid_nlink(void) +{ + nlink_tid = pid_entry_nlink(tid_base_stuff, ARRAY_SIZE(tid_base_stuff)); + nlink_tgid = pid_entry_nlink(tgid_base_stuff, ARRAY_SIZE(tgid_base_stuff)); +} diff --git a/fs/proc/fd.c b/fs/proc/fd.c index d21dafef3102..4274f83bf100 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -183,14 +183,13 @@ proc_fd_instantiate(struct inode *dir, struct dentry *dentry, struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK); if (!inode) goto out; ei = PROC_I(inode); ei->fd = fd; - inode->i_mode = S_IFLNK; inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; @@ -322,14 +321,13 @@ proc_fdinfo_instantiate(struct inode *dir, struct dentry *dentry, struct proc_inode *ei; struct inode *inode; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFREG | S_IRUSR); if (!inode) goto out; ei = PROC_I(inode); ei->fd = fd; - inode->i_mode = S_IFREG | S_IRUSR; inode->i_fop = &proc_fdinfo_file_operations; d_set_d_op(dentry, &tid_fd_dentry_operations); diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 5f2dc2032c79..f6a01f09f79d 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c 
@@ -22,7 +22,7 @@ #include <linux/bitops.h> #include <linux/spinlock.h> #include <linux/completion.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" @@ -479,6 +479,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name) } return ent; } +EXPORT_SYMBOL(proc_create_mount_point); struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, struct proc_dir_entry *parent, diff --git a/fs/proc/inode.c b/fs/proc/inode.c index e69ebe648a34..842a5ff5b85c 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -24,7 +24,7 @@ #include <linux/mount.h> #include <linux/magic.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" @@ -138,6 +138,16 @@ static void unuse_pde(struct proc_dir_entry *pde) /* pde is locked */ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) { + /* + * close() (proc_reg_release()) can't delete an entry and proceed: + * ->release hook needs to be available at the right moment. + * + * rmmod (remove_proc_entry() et al) can't delete an entry and proceed: + * "struct file" needs to be available at the right moment. + * + * Therefore, first process to enter this function does ->release() and + * signals its completion to the other process which does nothing. + */ if (pdeo->closing) { /* somebody else is doing that, just wait */ DECLARE_COMPLETION_ONSTACK(c); @@ -147,12 +157,13 @@ static void close_pdeo(struct proc_dir_entry *pde, struct pde_opener *pdeo) spin_lock(&pde->pde_unload_lock); } else { struct file *file; - pdeo->closing = 1; + pdeo->closing = true; spin_unlock(&pde->pde_unload_lock); file = pdeo->file; pde->proc_fops->release(file_inode(file), file); spin_lock(&pde->pde_unload_lock); - list_del_init(&pdeo->lh); + /* After ->release. */ + list_del(&pdeo->lh); if (pdeo->c) complete(pdeo->c); kfree(pdeo); @@ -167,6 +178,8 @@ void proc_entry_rundown(struct proc_dir_entry *de) if (atomic_add_return(BIAS, &de->in_use) != BIAS) wait_for_completion(&c); + /* ->pde_openers list can't grow from now on. */ + spin_lock(&de->pde_unload_lock); while (!list_empty(&de->pde_openers)) { struct pde_opener *pdeo; @@ -312,16 +325,17 @@ static int proc_reg_open(struct inode *inode, struct file *file) struct pde_opener *pdeo; /* - * What for, you ask? Well, we can have open, rmmod, remove_proc_entry - * sequence. ->release won't be called because ->proc_fops will be - * cleared. Depending on complexity of ->release, consequences vary. + * Ensure that + * 1) PDE's ->release hook will be called no matter what + * either normally by close()/->release, or forcefully by + * rmmod/remove_proc_entry. + * + * 2) rmmod isn't blocked by opening file in /proc and sitting on + * the descriptor (including "rmmod foo </proc/foo" scenario). * - * We can't wait for mercy when close will be done for real, it's - * deadlockable: rmmod foo </proc/foo . So, we're going to do ->release - * by hand in remove_proc_entry(). For this, save opener's credentials - * for later. + * Save every "struct file" with custom ->release hook. */ - pdeo = kzalloc(sizeof(struct pde_opener), GFP_KERNEL); + pdeo = kmalloc(sizeof(struct pde_opener), GFP_KERNEL); if (!pdeo) return -ENOMEM; @@ -338,7 +352,8 @@ static int proc_reg_open(struct inode *inode, struct file *file) if (rv == 0 && release) { /* To know what to release. */ pdeo->file = file; - /* Strictly for "too late" ->release in proc_reg_release(). 
*/ + pdeo->closing = false; + pdeo->c = NULL; spin_lock(&pde->pde_unload_lock); list_add(&pdeo->lh, &pde->pde_openers); spin_unlock(&pde->pde_unload_lock); @@ -410,7 +425,6 @@ static const char *proc_get_link(struct dentry *dentry, } const struct inode_operations proc_link_inode_operations = { - .readlink = generic_readlink, .get_link = proc_get_link, }; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 5378441ec1b7..2de5194ba378 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -162,7 +162,7 @@ extern int proc_pid_statm(struct seq_file *, struct pid_namespace *, extern const struct dentry_operations pid_dentry_operations; extern int pid_getattr(struct vfsmount *, struct dentry *, struct kstat *); extern int proc_setattr(struct dentry *, struct iattr *); -extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *); +extern struct inode *proc_pid_make_inode(struct super_block *, struct task_struct *, umode_t); extern int pid_revalidate(struct dentry *, unsigned int); extern int pid_delete_dentry(const struct dentry *); extern int proc_pid_readdir(struct file *, struct dir_context *); @@ -195,7 +195,6 @@ static inline bool is_empty_pde(const struct proc_dir_entry *pde) { return S_ISDIR(pde->mode) && !pde->proc_iops; } -struct proc_dir_entry *proc_create_mount_point(const char *name); /* * inode.c @@ -203,7 +202,7 @@ struct proc_dir_entry *proc_create_mount_point(const char *name); struct pde_opener { struct file *file; struct list_head lh; - int closing; + bool closing; struct completion *c; }; extern const struct inode_operations proc_link_inode_operations; @@ -211,6 +210,7 @@ extern const struct inode_operations proc_link_inode_operations; extern const struct inode_operations proc_pid_link_inode_operations; extern void proc_init_inodecache(void); +void set_proc_pid_nlink(void); extern struct inode *proc_get_inode(struct super_block *, struct proc_dir_entry *); extern int proc_fill_super(struct super_block *, void *data, int flags); extern void proc_entry_rundown(struct proc_dir_entry *); diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 5c89a07e3d7f..0b80ad87b4d6 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -23,7 +23,7 @@ #include <linux/bootmem.h> #include <linux/init.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/io.h> #include <linux/list.h> #include <linux/ioport.h> diff --git a/fs/proc/kmsg.c b/fs/proc/kmsg.c index 05f8dcdb086e..f9387bb7631b 100644 --- a/fs/proc/kmsg.c +++ b/fs/proc/kmsg.c @@ -14,7 +14,7 @@ #include <linux/fs.h> #include <linux/syslog.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/io.h> extern wait_queue_head_t log_wait; diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 51b8b0a8ad91..766f0c637ad1 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -92,12 +92,11 @@ static int proc_ns_instantiate(struct inode *dir, struct inode *inode; struct proc_inode *ei; - inode = proc_pid_make_inode(dir->i_sb, task); + inode = proc_pid_make_inode(dir->i_sb, task, S_IFLNK | S_IRWXUGO); if (!inode) goto out; ei = PROC_I(inode); - inode->i_mode = S_IFLNK|S_IRWXUGO; inode->i_op = &proc_ns_link_inode_operations; ei->ns_ops = ns_ops; diff --git a/fs/proc/nommu.c b/fs/proc/nommu.c index f8595e8b5cd0..75634379f82e 100644 --- a/fs/proc/nommu.c +++ b/fs/proc/nommu.c @@ -25,7 +25,7 @@ #include <linux/seq_file.h> #include <linux/hugetlb.h> #include <linux/vmalloc.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/pgtable.h> 
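The close_pdeo() comment above describes a "first closer wins" protocol: whichever of close() and remove_proc_entry() arrives first runs ->release() with the lock dropped, and the loser just waits on a completion. A self-contained sketch of the pattern, with hypothetical names (struct opener, do_release) standing in for the pde_opener specifics:

#include <linux/completion.h>
#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct opener {
        struct list_head lh;
        bool closing;
        struct completion *c;
};

/* Called and returns with *lock held, mirroring close_pdeo(). */
static void close_opener(spinlock_t *lock, struct opener *o,
                         void (*do_release)(struct opener *))
{
        if (o->closing) {
                /* Loser: the winner is mid-->release(); wait for it. */
                DECLARE_COMPLETION_ONSTACK(c);

                o->c = &c;
                spin_unlock(lock);
                wait_for_completion(&c);
                spin_lock(lock);
        } else {
                o->closing = true;
                spin_unlock(lock);
                do_release(o);          /* runs without the lock held */
                spin_lock(lock);
                list_del(&o->lh);       /* only after ->release */
                if (o->c)
                        complete(o->c);
                kfree(o);
        }
}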
#include <asm/tlb.h> #include <asm/div64.h> diff --git a/fs/proc/page.c b/fs/proc/page.c index 3ecd445e830d..a2066e6dee90 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -13,7 +13,7 @@ #include <linux/mmu_notifier.h> #include <linux/page_idle.h> #include <linux/kernel-page-flags.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" #define KPMSIZE sizeof(u64) diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c index 7ae6b1da7cab..ffd72a6c6e04 100644 --- a/fs/proc/proc_net.c +++ b/fs/proc/proc_net.c @@ -8,7 +8,7 @@ * proc net directory handling functions */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/errno.h> #include <linux/time.h> diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index 55313d994895..d4e37acd4821 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -709,7 +709,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx) ctl_dir = container_of(head, struct ctl_dir, header); if (!dir_emit_dots(file, ctx)) - return 0; + goto out; pos = 2; @@ -719,6 +719,7 @@ static int proc_sys_readdir(struct file *file, struct dir_context *ctx) break; } } +out: sysctl_head_finish(head); return 0; } diff --git a/fs/proc/proc_tty.c b/fs/proc/proc_tty.c index 15f327bed8c6..901bd06f437d 100644 --- a/fs/proc/proc_tty.c +++ b/fs/proc/proc_tty.c @@ -4,7 +4,7 @@ * Copyright 1997, Theodore Ts'o */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/module.h> #include <linux/init.h> #include <linux/errno.h> diff --git a/fs/proc/root.c b/fs/proc/root.c index 8d3e484055a6..1988440b2049 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -6,7 +6,7 @@ * proc root directory handling functions */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/errno.h> #include <linux/time.h> @@ -122,6 +122,7 @@ void __init proc_root_init(void) int err; proc_init_inodecache(); + set_proc_pid_nlink(); err = register_filesystem(&proc_fs_type); if (err) return; diff --git a/fs/proc/self.c b/fs/proc/self.c index 40245954c450..39857f6db5cf 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -6,18 +6,6 @@ /* * /proc/self: */ -static int proc_self_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - char tmp[PROC_NUMBUF]; - if (!tgid) - return -ENOENT; - sprintf(tmp, "%d", tgid); - return readlink_copy(buffer, buflen, tmp); -} - static const char *proc_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) @@ -38,7 +26,6 @@ static const char *proc_self_get_link(struct dentry *dentry, } static const struct inode_operations proc_self_inode_operations = { - .readlink = proc_self_readlink, .get_link = proc_self_get_link, }; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 35b92d81692f..8f96a49178d0 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -17,7 +17,7 @@ #include <linux/shmem_fs.h> #include <asm/elf.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/tlbflush.h> #include "internal.h" @@ -1588,6 +1588,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); + cond_resched(); return 0; } #ifdef CONFIG_HUGETLB_PAGE diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 595b90a9766c..20614b62a9b7 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -6,19 +6,6 @@ /* * /proc/thread_self: */ -static int 
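The proc_sys_readdir() fix above is the single-exit cleanup pattern: the old early "return 0" when dir_emit_dots() failed skipped sysctl_head_finish() and leaked the head reference, so every path is now routed through the out label. The shape, with hypothetical get_head()/put_head() helpers:

#include <linux/types.h>

struct head;
struct dir_context;

extern struct head *get_head(void);             /* takes a reference */
extern void put_head(struct head *h);           /* drops it */
extern bool emit_dots(struct dir_context *ctx);

static int readdir_like(struct dir_context *ctx)
{
        struct head *h = get_head();    /* reference taken up front */

        if (!emit_dots(ctx))
                goto out;       /* was "return 0": the reference leaked */

        /* ... emit the remaining entries ... */
out:
        put_head(h);            /* single exit drops it on every path */
        return 0;
}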
proc_thread_self_readlink(struct dentry *dentry, char __user *buffer, - int buflen) -{ - struct pid_namespace *ns = dentry->d_sb->s_fs_info; - pid_t tgid = task_tgid_nr_ns(current, ns); - pid_t pid = task_pid_nr_ns(current, ns); - char tmp[PROC_NUMBUF + 6 + PROC_NUMBUF]; - if (!pid) - return -ENOENT; - sprintf(tmp, "%d/task/%d", tgid, pid); - return readlink_copy(buffer, buflen, tmp); -} - static const char *proc_thread_self_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) @@ -40,7 +27,6 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, } static const struct inode_operations proc_thread_self_inode_operations = { - .readlink = proc_thread_self_readlink, .get_link = proc_thread_self_get_link, }; diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 8ab782d8b33d..5105b1599981 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -22,7 +22,7 @@ #include <linux/list.h> #include <linux/vmalloc.h> #include <linux/pagemap.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/io.h> #include "internal.h" diff --git a/fs/pstore/Kconfig b/fs/pstore/Kconfig index be40813eff52..b42e5bd6d8ff 100644 --- a/fs/pstore/Kconfig +++ b/fs/pstore/Kconfig @@ -86,4 +86,4 @@ config PSTORE_RAM Note that for historical reasons, the module will be named "ramoops.ko". - For more information, see Documentation/ramoops.txt. + For more information, see Documentation/admin-guide/ramoops.rst. diff --git a/fs/pstore/ftrace.c b/fs/pstore/ftrace.c index d4887705bb61..899d0ba0bd6c 100644 --- a/fs/pstore/ftrace.c +++ b/fs/pstore/ftrace.c @@ -27,6 +27,9 @@ #include <asm/barrier.h> #include "internal.h" +/* This doesn't need to be atomic: speed is chosen over correctness here. */ +static u64 pstore_ftrace_stamp; + static void notrace pstore_ftrace_call(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, @@ -42,6 +45,7 @@ static void notrace pstore_ftrace_call(unsigned long ip, rec.ip = ip; rec.parent_ip = parent_ip; + pstore_ftrace_write_timestamp(&rec, pstore_ftrace_stamp++); pstore_ftrace_encode_cpu(&rec, raw_smp_processor_id()); psinfo->write_buf(PSTORE_TYPE_FTRACE, 0, NULL, 0, (void *)&rec, 0, sizeof(rec), psinfo); @@ -71,10 +75,13 @@ static ssize_t pstore_ftrace_knob_write(struct file *f, const char __user *buf, if (!on ^ pstore_ftrace_enabled) goto out; - if (on) + if (on) { + ftrace_ops_set_global_filter(&pstore_ftrace_ops); ret = register_ftrace_function(&pstore_ftrace_ops); - else + } else { ret = unregister_ftrace_function(&pstore_ftrace_ops); + } + if (ret) { pr_err("%s: unable to %sregister ftrace ops: %zd\n", __func__, on ? 
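pstore_ftrace_stamp above is deliberately a plain u64: as its comment says, speed is chosen over correctness, so concurrent CPUs can observe duplicated or skipped values. That is acceptable because the stamp only has to interleave per-cpu logs in roughly global order after a crash. Were strict ordering required, the obvious (and slower) alternative would be an atomic counter -- a sketch, not what the patch does:

#include <linux/atomic.h>
#include <linux/types.h>

static atomic64_t strict_stamp = ATOMIC64_INIT(0);

static inline u64 next_stamp(void)
{
        /* Serializes the counter across CPUs, at the cost of a
         * contended cache line in the ftrace hot path. */
        return atomic64_inc_return(&strict_stamp);
}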
"" : "un", ret); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 1781dc50762e..57c0646479f5 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -107,9 +107,11 @@ static int pstore_ftrace_seq_show(struct seq_file *s, void *v) struct pstore_ftrace_seq_data *data = v; struct pstore_ftrace_record *rec = (void *)(ps->data + data->off); - seq_printf(s, "%d %08lx %08lx %pf <- %pF\n", - pstore_ftrace_decode_cpu(rec), rec->ip, rec->parent_ip, - (void *)rec->ip, (void *)rec->parent_ip); + seq_printf(s, "CPU:%d ts:%llu %08lx %08lx %pf <- %pF\n", + pstore_ftrace_decode_cpu(rec), + pstore_ftrace_read_timestamp(rec), + rec->ip, rec->parent_ip, (void *)rec->ip, + (void *)rec->parent_ip); return 0; } @@ -197,11 +199,14 @@ static int pstore_unlink(struct inode *dir, struct dentry *dentry) if (err) return err; - if (p->psi->erase) + if (p->psi->erase) { + mutex_lock(&p->psi->read_mutex); p->psi->erase(p->type, p->id, p->count, d_inode(dentry)->i_ctime, p->psi); - else + mutex_unlock(&p->psi->read_mutex); + } else { return -EPERM; + } return simple_unlink(dir, dentry); } diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index e38a22b31282..da416e6591c9 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -5,40 +5,6 @@ #include <linux/time.h> #include <linux/pstore.h> -#if NR_CPUS <= 2 && defined(CONFIG_ARM_THUMB) -#define PSTORE_CPU_IN_IP 0x1 -#elif NR_CPUS <= 4 && defined(CONFIG_ARM) -#define PSTORE_CPU_IN_IP 0x3 -#endif - -struct pstore_ftrace_record { - unsigned long ip; - unsigned long parent_ip; -#ifndef PSTORE_CPU_IN_IP - unsigned int cpu; -#endif -}; - -static inline void -pstore_ftrace_encode_cpu(struct pstore_ftrace_record *rec, unsigned int cpu) -{ -#ifndef PSTORE_CPU_IN_IP - rec->cpu = cpu; -#else - rec->ip |= cpu; -#endif -} - -static inline unsigned int -pstore_ftrace_decode_cpu(struct pstore_ftrace_record *rec) -{ -#ifndef PSTORE_CPU_IN_IP - return rec->cpu; -#else - return rec->ip & PSTORE_CPU_IN_IP; -#endif -} - #ifdef CONFIG_PSTORE_FTRACE extern void pstore_register_ftrace(void); extern void pstore_unregister_ftrace(void); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index 14984d902a99..729677e18e36 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -493,6 +493,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, if (!is_locked) { pr_err("pstore dump routine blocked in %s path, may corrupt error record\n" , in_nmi() ? 
"NMI" : why); + return; } } else { spin_lock_irqsave(&psinfo->buf_lock, flags); @@ -584,8 +585,8 @@ static void pstore_console_write(struct console *con, const char *s, unsigned c) } else { spin_lock_irqsave(&psinfo->buf_lock, flags); } - memcpy(psinfo->buf, s, c); - psinfo->write(PSTORE_TYPE_CONSOLE, 0, &id, 0, 0, 0, c, psinfo); + psinfo->write_buf(PSTORE_TYPE_CONSOLE, 0, &id, 0, + s, 0, c, psinfo); spin_unlock_irqrestore(&psinfo->buf_lock, flags); s += c; c = e - s; diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index 6ad831b9d1b8..27c059e1760a 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -85,10 +85,10 @@ MODULE_PARM_DESC(ramoops_ecc, "bytes ECC)"); struct ramoops_context { - struct persistent_ram_zone **przs; - struct persistent_ram_zone *cprz; - struct persistent_ram_zone *fprz; - struct persistent_ram_zone *mprz; + struct persistent_ram_zone **dprzs; /* Oops dump zones */ + struct persistent_ram_zone *cprz; /* Console zone */ + struct persistent_ram_zone **fprzs; /* Ftrace zones */ + struct persistent_ram_zone *mprz; /* PMSG zone */ phys_addr_t phys_addr; unsigned long size; unsigned int memtype; @@ -97,12 +97,14 @@ struct ramoops_context { size_t ftrace_size; size_t pmsg_size; int dump_oops; + u32 flags; struct persistent_ram_ecc_info ecc_info; unsigned int max_dump_cnt; unsigned int dump_write_cnt; /* _read_cnt need clear on ramoops_pstore_open */ unsigned int dump_read_cnt; unsigned int console_read_cnt; + unsigned int max_ftrace_cnt; unsigned int ftrace_read_cnt; unsigned int pmsg_read_cnt; struct pstore_info pstore; @@ -180,16 +182,69 @@ static bool prz_ok(struct persistent_ram_zone *prz) persistent_ram_ecc_string(prz, NULL, 0)); } +static ssize_t ftrace_log_combine(struct persistent_ram_zone *dest, + struct persistent_ram_zone *src) +{ + size_t dest_size, src_size, total, dest_off, src_off; + size_t dest_idx = 0, src_idx = 0, merged_idx = 0; + void *merged_buf; + struct pstore_ftrace_record *drec, *srec, *mrec; + size_t record_size = sizeof(struct pstore_ftrace_record); + + dest_off = dest->old_log_size % record_size; + dest_size = dest->old_log_size - dest_off; + + src_off = src->old_log_size % record_size; + src_size = src->old_log_size - src_off; + + total = dest_size + src_size; + merged_buf = kmalloc(total, GFP_KERNEL); + if (!merged_buf) + return -ENOMEM; + + drec = (struct pstore_ftrace_record *)(dest->old_log + dest_off); + srec = (struct pstore_ftrace_record *)(src->old_log + src_off); + mrec = (struct pstore_ftrace_record *)(merged_buf); + + while (dest_size > 0 && src_size > 0) { + if (pstore_ftrace_read_timestamp(&drec[dest_idx]) < + pstore_ftrace_read_timestamp(&srec[src_idx])) { + mrec[merged_idx++] = drec[dest_idx++]; + dest_size -= record_size; + } else { + mrec[merged_idx++] = srec[src_idx++]; + src_size -= record_size; + } + } + + while (dest_size > 0) { + mrec[merged_idx++] = drec[dest_idx++]; + dest_size -= record_size; + } + + while (src_size > 0) { + mrec[merged_idx++] = srec[src_idx++]; + src_size -= record_size; + } + + kfree(dest->old_log); + dest->old_log = merged_buf; + dest->old_log_size = total; + + return 0; +} + static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, int *count, struct timespec *time, char **buf, bool *compressed, ssize_t *ecc_notice_size, struct pstore_info *psi) { - ssize_t size; + ssize_t size = 0; struct ramoops_context *cxt = psi->data; struct persistent_ram_zone *prz = NULL; int header_length = 0; + bool free_prz = false; /* Ramoops headers provide time stamps for PSTORE_TYPE_DMESG, but * 
PSTORE_TYPE_CONSOLE and PSTORE_TYPE_FTRACE don't currently have @@ -201,7 +256,7 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, /* Find the next valid persistent_ram_zone for DMESG */ while (cxt->dump_read_cnt < cxt->max_dump_cnt && !prz) { - prz = ramoops_get_next_prz(cxt->przs, &cxt->dump_read_cnt, + prz = ramoops_get_next_prz(cxt->dprzs, &cxt->dump_read_cnt, cxt->max_dump_cnt, id, type, PSTORE_TYPE_DMESG, 1); if (!prz_ok(prz)) @@ -219,14 +274,56 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->cprz, &cxt->console_read_cnt, 1, id, type, PSTORE_TYPE_CONSOLE, 0); - if (!prz_ok(prz)) - prz = ramoops_get_next_prz(&cxt->fprz, &cxt->ftrace_read_cnt, - 1, id, type, PSTORE_TYPE_FTRACE, 0); + if (!prz_ok(prz)) prz = ramoops_get_next_prz(&cxt->mprz, &cxt->pmsg_read_cnt, 1, id, type, PSTORE_TYPE_PMSG, 0); - if (!prz_ok(prz)) - return 0; + + /* ftrace is last since it may want to dynamically allocate memory. */ + if (!prz_ok(prz)) { + if (!(cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU)) { + prz = ramoops_get_next_prz(cxt->fprzs, + &cxt->ftrace_read_cnt, 1, id, type, + PSTORE_TYPE_FTRACE, 0); + } else { + /* + * Build a new dummy record which combines all the + * per-cpu records including metadata and ecc info. + */ + struct persistent_ram_zone *tmp_prz, *prz_next; + + tmp_prz = kzalloc(sizeof(struct persistent_ram_zone), + GFP_KERNEL); + if (!tmp_prz) + return -ENOMEM; + free_prz = true; + + while (cxt->ftrace_read_cnt < cxt->max_ftrace_cnt) { + prz_next = ramoops_get_next_prz(cxt->fprzs, + &cxt->ftrace_read_cnt, + cxt->max_ftrace_cnt, id, + type, PSTORE_TYPE_FTRACE, 0); + + if (!prz_ok(prz_next)) + continue; + + tmp_prz->ecc_info = prz_next->ecc_info; + tmp_prz->corrected_bytes += + prz_next->corrected_bytes; + tmp_prz->bad_blocks += prz_next->bad_blocks; + size = ftrace_log_combine(tmp_prz, prz_next); + if (size) + goto out; + } + *id = 0; + prz = tmp_prz; + } + } + + if (!prz_ok(prz)) { + size = 0; + goto out; + } size = persistent_ram_old_size(prz) - header_length; @@ -234,12 +331,21 @@ static ssize_t ramoops_pstore_read(u64 *id, enum pstore_type_id *type, *ecc_notice_size = persistent_ram_ecc_string(prz, NULL, 0); *buf = kmalloc(size + *ecc_notice_size + 1, GFP_KERNEL); - if (*buf == NULL) - return -ENOMEM; + if (*buf == NULL) { + size = -ENOMEM; + goto out; + } memcpy(*buf, (char *)persistent_ram_old(prz) + header_length, size); + persistent_ram_ecc_string(prz, *buf + size, *ecc_notice_size + 1); +out: + if (free_prz) { + kfree(prz->old_log); + kfree(prz); + } + return size; } @@ -283,15 +389,23 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, persistent_ram_write(cxt->cprz, buf, size); return 0; } else if (type == PSTORE_TYPE_FTRACE) { - if (!cxt->fprz) + int zonenum; + + if (!cxt->fprzs) return -ENOMEM; - persistent_ram_write(cxt->fprz, buf, size); + /* + * Choose zone by if we're using per-cpu buffers. 
+ */ + if (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) + zonenum = smp_processor_id(); + else + zonenum = 0; + + persistent_ram_write(cxt->fprzs[zonenum], buf, size); return 0; } else if (type == PSTORE_TYPE_PMSG) { - if (!cxt->mprz) - return -ENOMEM; - persistent_ram_write(cxt->mprz, buf, size); - return 0; + pr_warn_ratelimited("PMSG shouldn't call %s\n", __func__); + return -EINVAL; } if (type != PSTORE_TYPE_DMESG) @@ -316,10 +430,10 @@ static int notrace ramoops_pstore_write_buf(enum pstore_type_id type, if (part != 1) return -ENOSPC; - if (!cxt->przs) + if (!cxt->dprzs) return -ENOSPC; - prz = cxt->przs[cxt->dump_write_cnt]; + prz = cxt->dprzs[cxt->dump_write_cnt]; hlen = ramoops_write_kmsg_hdr(prz, compressed); if (size + hlen > prz->buffer_size) @@ -359,13 +473,15 @@ static int ramoops_pstore_erase(enum pstore_type_id type, u64 id, int count, case PSTORE_TYPE_DMESG: if (id >= cxt->max_dump_cnt) return -EINVAL; - prz = cxt->przs[id]; + prz = cxt->dprzs[id]; break; case PSTORE_TYPE_CONSOLE: prz = cxt->cprz; break; case PSTORE_TYPE_FTRACE: - prz = cxt->fprz; + if (id >= cxt->max_ftrace_cnt) + return -EINVAL; + prz = cxt->fprzs[id]; break; case PSTORE_TYPE_PMSG: prz = cxt->mprz; @@ -396,68 +512,113 @@ static void ramoops_free_przs(struct ramoops_context *cxt) { int i; - if (!cxt->przs) - return; + /* Free dump PRZs */ + if (cxt->dprzs) { + for (i = 0; i < cxt->max_dump_cnt; i++) + persistent_ram_free(cxt->dprzs[i]); - for (i = 0; i < cxt->max_dump_cnt; i++) - persistent_ram_free(cxt->przs[i]); + kfree(cxt->dprzs); + cxt->max_dump_cnt = 0; + } - kfree(cxt->przs); - cxt->max_dump_cnt = 0; + /* Free ftrace PRZs */ + if (cxt->fprzs) { + for (i = 0; i < cxt->max_ftrace_cnt; i++) + persistent_ram_free(cxt->fprzs[i]); + kfree(cxt->fprzs); + cxt->max_ftrace_cnt = 0; + } } -static int ramoops_init_przs(struct device *dev, struct ramoops_context *cxt, - phys_addr_t *paddr, size_t dump_mem_sz) +static int ramoops_init_przs(const char *name, + struct device *dev, struct ramoops_context *cxt, + struct persistent_ram_zone ***przs, + phys_addr_t *paddr, size_t mem_sz, + ssize_t record_size, + unsigned int *cnt, u32 sig, u32 flags) { int err = -ENOMEM; int i; + size_t zone_sz; + struct persistent_ram_zone **prz_ar; - if (!cxt->record_size) + /* Allocate nothing for 0 mem_sz or 0 record_size. */ + if (mem_sz == 0 || record_size == 0) { + *cnt = 0; return 0; + } - if (*paddr + dump_mem_sz - cxt->phys_addr > cxt->size) { - dev_err(dev, "no room for dumps\n"); - return -ENOMEM; + /* + * If we have a negative record size, calculate it based on + * mem_sz / *cnt. If we have a positive record size, calculate + * cnt from mem_sz / record_size. 
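ramoops_init_przs() above now covers both sizing conventions: a positive record_size fixes the zone size and derives the count (dump zones), while a negative record_size fixes the count and derives the size (one ftrace zone per CPU). Worked through: 64 KiB of ftrace memory split over *cnt = 4 zones gives a record size of 65536 / 4 = 16384, while 128 KiB of dump memory at record_size = 16384 gives *cnt = 8. The two modes, isolated and assuming exact division:

#include <linux/errno.h>
#include <linux/types.h>

static int size_zones(size_t mem_sz, ssize_t record_size,
                      unsigned int *cnt, size_t *zone_sz)
{
        if (record_size < 0) {          /* fixed count, derive size */
                if (*cnt == 0)
                        return -EINVAL;
                *zone_sz = mem_sz / *cnt;
        } else {                        /* fixed size, derive count */
                *cnt = mem_sz / record_size;
                if (*cnt == 0)
                        return -EINVAL;
                *zone_sz = mem_sz / *cnt;
        }
        return *zone_sz ? 0 : -EINVAL;
}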
+ */ + if (record_size < 0) { + if (*cnt == 0) + return 0; + record_size = mem_sz / *cnt; + if (record_size == 0) { + dev_err(dev, "%s record size == 0 (%zu / %u)\n", + name, mem_sz, *cnt); + goto fail; + } + } else { + *cnt = mem_sz / record_size; + if (*cnt == 0) { + dev_err(dev, "%s record count == 0 (%zu / %zu)\n", + name, mem_sz, record_size); + goto fail; + } } - cxt->max_dump_cnt = dump_mem_sz / cxt->record_size; - if (!cxt->max_dump_cnt) - return -ENOMEM; + if (*paddr + mem_sz - cxt->phys_addr > cxt->size) { + dev_err(dev, "no room for %s mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n", + name, + mem_sz, (unsigned long long)*paddr, + cxt->size, (unsigned long long)cxt->phys_addr); + goto fail; + } - cxt->przs = kzalloc(sizeof(*cxt->przs) * cxt->max_dump_cnt, - GFP_KERNEL); - if (!cxt->przs) { - dev_err(dev, "failed to initialize a prz array for dumps\n"); - goto fail_mem; + zone_sz = mem_sz / *cnt; + if (!zone_sz) { + dev_err(dev, "%s zone size == 0\n", name); + goto fail; } - for (i = 0; i < cxt->max_dump_cnt; i++) { - cxt->przs[i] = persistent_ram_new(*paddr, cxt->record_size, 0, + prz_ar = kcalloc(*cnt, sizeof(**przs), GFP_KERNEL); + if (!prz_ar) + goto fail; + + for (i = 0; i < *cnt; i++) { + prz_ar[i] = persistent_ram_new(*paddr, zone_sz, sig, &cxt->ecc_info, - cxt->memtype); - if (IS_ERR(cxt->przs[i])) { - err = PTR_ERR(cxt->przs[i]); - dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", - cxt->record_size, (unsigned long long)*paddr, err); + cxt->memtype, flags); + if (IS_ERR(prz_ar[i])) { + err = PTR_ERR(prz_ar[i]); + dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n", + name, record_size, + (unsigned long long)*paddr, err); while (i > 0) { i--; - persistent_ram_free(cxt->przs[i]); + persistent_ram_free(prz_ar[i]); } - goto fail_prz; + kfree(prz_ar); + goto fail; } - *paddr += cxt->record_size; + *paddr += zone_sz; } + *przs = prz_ar; return 0; -fail_prz: - kfree(cxt->przs); -fail_mem: - cxt->max_dump_cnt = 0; + +fail: + *cnt = 0; return err; } -static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, +static int ramoops_init_prz(const char *name, + struct device *dev, struct ramoops_context *cxt, struct persistent_ram_zone **prz, phys_addr_t *paddr, size_t sz, u32 sig) { @@ -465,18 +626,19 @@ static int ramoops_init_prz(struct device *dev, struct ramoops_context *cxt, return 0; if (*paddr + sz - cxt->phys_addr > cxt->size) { - dev_err(dev, "no room for mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n", - sz, (unsigned long long)*paddr, + dev_err(dev, "no room for %s mem region (0x%zx@0x%llx) in (0x%lx@0x%llx)\n", + name, sz, (unsigned long long)*paddr, cxt->size, (unsigned long long)cxt->phys_addr); return -ENOMEM; } - *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, cxt->memtype); + *prz = persistent_ram_new(*paddr, sz, sig, &cxt->ecc_info, + cxt->memtype, 0); if (IS_ERR(*prz)) { int err = PTR_ERR(*prz); - dev_err(dev, "failed to request mem region (0x%zx@0x%llx): %d\n", - sz, (unsigned long long)*paddr, err); + dev_err(dev, "failed to request %s mem region (0x%zx@0x%llx): %d\n", + name, sz, (unsigned long long)*paddr, err); return err; } @@ -543,6 +705,7 @@ static int ramoops_parse_dt(struct platform_device *pdev, parse_size("ftrace-size", pdata->ftrace_size); parse_size("pmsg-size", pdata->pmsg_size); parse_size("ecc-size", pdata->ecc_info.ecc_size); + parse_size("flags", pdata->flags); #undef parse_size @@ -561,6 +724,7 @@ static int ramoops_probe(struct platform_device *pdev) if (dev_of_node(dev) && !pdata) { 
pdata = devm_kzalloc(&pdev->dev, sizeof(*pdata), GFP_KERNEL); if (!pdata) { + pr_err("cannot allocate platform data buffer\n"); err = -ENOMEM; goto fail_out; } @@ -570,11 +734,20 @@ static int ramoops_probe(struct platform_device *pdev) goto fail_out; } - /* Only a single ramoops area allowed at a time, so fail extra + /* + * Only a single ramoops area allowed at a time, so fail extra * probes. */ - if (cxt->max_dump_cnt) + if (cxt->max_dump_cnt) { + pr_err("already initialized\n"); goto fail_out; + } + + /* Make sure we didn't get bogus platform data pointer. */ + if (!pdata) { + pr_err("NULL platform data\n"); + goto fail_out; + } if (!pdata->mem_size || (!pdata->record_size && !pdata->console_size && !pdata->ftrace_size && !pdata->pmsg_size)) { @@ -600,27 +773,37 @@ static int ramoops_probe(struct platform_device *pdev) cxt->ftrace_size = pdata->ftrace_size; cxt->pmsg_size = pdata->pmsg_size; cxt->dump_oops = pdata->dump_oops; + cxt->flags = pdata->flags; cxt->ecc_info = pdata->ecc_info; paddr = cxt->phys_addr; dump_mem_sz = cxt->size - cxt->console_size - cxt->ftrace_size - cxt->pmsg_size; - err = ramoops_init_przs(dev, cxt, &paddr, dump_mem_sz); + err = ramoops_init_przs("dump", dev, cxt, &cxt->dprzs, &paddr, + dump_mem_sz, cxt->record_size, + &cxt->max_dump_cnt, 0, 0); if (err) goto fail_out; - err = ramoops_init_prz(dev, cxt, &cxt->cprz, &paddr, + err = ramoops_init_prz("console", dev, cxt, &cxt->cprz, &paddr, cxt->console_size, 0); if (err) goto fail_init_cprz; - err = ramoops_init_prz(dev, cxt, &cxt->fprz, &paddr, cxt->ftrace_size, - LINUX_VERSION_CODE); + cxt->max_ftrace_cnt = (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) + ? nr_cpu_ids + : 1; + err = ramoops_init_przs("ftrace", dev, cxt, &cxt->fprzs, &paddr, + cxt->ftrace_size, -1, + &cxt->max_ftrace_cnt, LINUX_VERSION_CODE, + (cxt->flags & RAMOOPS_FLAG_FTRACE_PER_CPU) + ? PRZ_FLAG_NO_LOCK : 0); if (err) goto fail_init_fprz; - err = ramoops_init_prz(dev, cxt, &cxt->mprz, &paddr, cxt->pmsg_size, 0); + err = ramoops_init_prz("pmsg", dev, cxt, &cxt->mprz, &paddr, + cxt->pmsg_size, 0); if (err) goto fail_init_mprz; @@ -680,7 +863,6 @@ fail_clear: cxt->pstore.bufsize = 0; persistent_ram_free(cxt->mprz); fail_init_mprz: - persistent_ram_free(cxt->fprz); fail_init_fprz: persistent_ram_free(cxt->cprz); fail_init_cprz: @@ -699,7 +881,6 @@ static int ramoops_remove(struct platform_device *pdev) cxt->pstore.bufsize = 0; persistent_ram_free(cxt->mprz); - persistent_ram_free(cxt->fprz); persistent_ram_free(cxt->cprz); ramoops_free_przs(cxt); @@ -741,6 +922,8 @@ static void ramoops_register_dummy(void) dummy_data->ftrace_size = ramoops_ftrace_size; dummy_data->pmsg_size = ramoops_pmsg_size; dummy_data->dump_oops = dump_oops; + dummy_data->flags = RAMOOPS_FLAG_FTRACE_PER_CPU; + /* * For backwards compatibility ramoops.ecc=1 means 16 bytes ECC * (using 1 byte for ECC isn't much of use anyway). 
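With RAMOOPS_FLAG_FTRACE_PER_CPU, the probe above sizes the ftrace region for nr_cpu_ids zones and passes PRZ_FLAG_NO_LOCK, since a zone with exactly one writing CPU needs no cross-CPU lock (see the ram_core.c changes below). The decision, isolated from the probe:

#include <linux/cpumask.h>      /* nr_cpu_ids */
#include <linux/pstore_ram.h>   /* RAMOOPS_FLAG_FTRACE_PER_CPU, PRZ_FLAG_NO_LOCK */

static void plan_ftrace_zones(u32 flags, unsigned int *cnt, u32 *prz_flags)
{
        if (flags & RAMOOPS_FLAG_FTRACE_PER_CPU) {
                *cnt = nr_cpu_ids;              /* one zone per possible CPU */
                *prz_flags = PRZ_FLAG_NO_LOCK;  /* single writer per zone */
        } else {
                *cnt = 1;                       /* one shared zone... */
                *prz_flags = 0;                 /* ...so keep the lock */
        }
}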
diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index 3975deec02f8..a857338b7dab 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -48,16 +48,15 @@ static inline size_t buffer_start(struct persistent_ram_zone *prz) return atomic_read(&prz->buffer->start); } -static DEFINE_RAW_SPINLOCK(buffer_lock); - /* increase and wrap the start pointer, returning the old value */ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) { int old; int new; - unsigned long flags; + unsigned long flags = 0; - raw_spin_lock_irqsave(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_lock_irqsave(&prz->buffer_lock, flags); old = atomic_read(&prz->buffer->start); new = old + a; @@ -65,7 +64,8 @@ static size_t buffer_start_add(struct persistent_ram_zone *prz, size_t a) new -= prz->buffer_size; atomic_set(&prz->buffer->start, new); - raw_spin_unlock_irqrestore(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags); return old; } @@ -75,9 +75,10 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a) { size_t old; size_t new; - unsigned long flags; + unsigned long flags = 0; - raw_spin_lock_irqsave(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_lock_irqsave(&prz->buffer_lock, flags); old = atomic_read(&prz->buffer->size); if (old == prz->buffer_size) @@ -89,7 +90,8 @@ static void buffer_size_add(struct persistent_ram_zone *prz, size_t a) atomic_set(&prz->buffer->size, new); exit: - raw_spin_unlock_irqrestore(&buffer_lock, flags); + if (!(prz->flags & PRZ_FLAG_NO_LOCK)) + raw_spin_unlock_irqrestore(&prz->buffer_lock, flags); } static void notrace persistent_ram_encode_rs8(struct persistent_ram_zone *prz, @@ -465,7 +467,8 @@ static int persistent_ram_buffer_map(phys_addr_t start, phys_addr_t size, } static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, - struct persistent_ram_ecc_info *ecc_info) + struct persistent_ram_ecc_info *ecc_info, + unsigned long flags) { int ret; @@ -493,6 +496,8 @@ static int persistent_ram_post_init(struct persistent_ram_zone *prz, u32 sig, prz->buffer->sig = sig; persistent_ram_zap(prz); + prz->buffer_lock = __RAW_SPIN_LOCK_UNLOCKED(buffer_lock); + prz->flags = flags; return 0; } @@ -517,7 +522,7 @@ void persistent_ram_free(struct persistent_ram_zone *prz) struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, u32 sig, struct persistent_ram_ecc_info *ecc_info, - unsigned int memtype) + unsigned int memtype, u32 flags) { struct persistent_ram_zone *prz; int ret = -ENOMEM; @@ -532,7 +537,7 @@ struct persistent_ram_zone *persistent_ram_new(phys_addr_t start, size_t size, if (ret) goto err; - ret = persistent_ram_post_init(prz, sig, ecc_info); + ret = persistent_ram_post_init(prz, sig, ecc_info, flags); if (ret) goto err; diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 1bfac28b7e7d..406fed92362a 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -119,8 +119,7 @@ * spinlock to internal buffers before writing. * * Lock ordering (including related VFS locks) is the following: - * dqonoff_mutex > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex - * dqonoff_mutex > i_mutex comes from dquot_quota_sync, dquot_enable, etc. 
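The ram_core.c hunks above replace the file-wide buffer_lock with a per-zone raw spinlock, taken only when PRZ_FLAG_NO_LOCK is clear. Wrapping the conditional locking in helpers keeps buffer_start_add() and buffer_size_add() symmetric; a sketch using the per-zone fields this patch introduces:

#include <linux/pstore_ram.h>   /* struct persistent_ram_zone */
#include <linux/spinlock.h>

static unsigned long prz_lock(struct persistent_ram_zone *prz)
{
        unsigned long flags = 0;

        if (!(prz->flags & PRZ_FLAG_NO_LOCK))
                raw_spin_lock_irqsave(&prz->buffer_lock, flags);
        return flags;
}

static void prz_unlock(struct persistent_ram_zone *prz, unsigned long flags)
{
        if (!(prz->flags & PRZ_FLAG_NO_LOCK))
                raw_spin_unlock_irqrestore(&prz->buffer_lock, flags);
}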
+ * s_umount > i_mutex > journal_lock > dquot->dq_lock > dqio_mutex */ static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dq_list_lock); @@ -572,7 +571,8 @@ int dquot_scan_active(struct super_block *sb, struct dquot *dquot, *old_dquot = NULL; int ret = 0; - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); + spin_lock(&dq_list_lock); list_for_each_entry(dquot, &inuse_list, dq_inuse) { if (!test_bit(DQ_ACTIVE_B, &dquot->dq_flags)) @@ -603,7 +603,6 @@ int dquot_scan_active(struct super_block *sb, spin_unlock(&dq_list_lock); out: dqput(old_dquot); - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return ret; } EXPORT_SYMBOL(dquot_scan_active); @@ -617,7 +616,8 @@ int dquot_writeback_dquots(struct super_block *sb, int type) int cnt; int err, ret = 0; - mutex_lock(&dqopt->dqonoff_mutex); + WARN_ON_ONCE(!rwsem_is_locked(&sb->s_umount)); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; @@ -653,7 +653,6 @@ int dquot_writeback_dquots(struct super_block *sb, int type) && info_dirty(&dqopt->info[cnt])) sb->dq_op->write_info(sb, cnt); dqstats_inc(DQST_SYNCS); - mutex_unlock(&dqopt->dqonoff_mutex); return ret; } @@ -683,7 +682,6 @@ int dquot_quota_sync(struct super_block *sb, int type) * Now when everything is written we can discard the pagecache so * that userspace sees the changes. */ - mutex_lock(&dqopt->dqonoff_mutex); for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != type) continue; @@ -693,7 +691,6 @@ int dquot_quota_sync(struct super_block *sb, int type) truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); inode_unlock(dqopt->files[cnt]); } - mutex_unlock(&dqopt->dqonoff_mutex); return 0; } @@ -935,7 +932,7 @@ static int dqinit_needed(struct inode *inode, int type) return 0; } -/* This routine is guarded by dqonoff_mutex mutex */ +/* This routine is guarded by s_umount semaphore */ static void add_dquot_ref(struct super_block *sb, int type) { struct inode *inode, *old_inode = NULL; @@ -2050,21 +2047,13 @@ int dquot_get_next_id(struct super_block *sb, struct kqid *qid) struct quota_info *dqopt = sb_dqopt(sb); int err; - mutex_lock(&dqopt->dqonoff_mutex); - if (!sb_has_quota_active(sb, qid->type)) { - err = -ESRCH; - goto out; - } - if (!dqopt->ops[qid->type]->get_next_id) { - err = -ENOSYS; - goto out; - } + if (!sb_has_quota_active(sb, qid->type)) + return -ESRCH; + if (!dqopt->ops[qid->type]->get_next_id) + return -ENOSYS; mutex_lock(&dqopt->dqio_mutex); err = dqopt->ops[qid->type]->get_next_id(sb, qid); mutex_unlock(&dqopt->dqio_mutex); -out: - mutex_unlock(&dqopt->dqonoff_mutex); - return err; } EXPORT_SYMBOL(dquot_get_next_id); @@ -2107,6 +2096,10 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) struct quota_info *dqopt = sb_dqopt(sb); struct inode *toputinode[MAXQUOTAS]; + /* s_umount should be held in exclusive mode */ + if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) + up_read(&sb->s_umount); + /* Cannot turn off usage accounting without turning off limits, or * suspend quotas and simultaneously turn quotas off. */ if ((flags & DQUOT_USAGE_ENABLED && !(flags & DQUOT_LIMITS_ENABLED)) @@ -2114,18 +2107,14 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) DQUOT_USAGE_ENABLED))) return -EINVAL; - /* We need to serialize quota_off() for device */ - mutex_lock(&dqopt->dqonoff_mutex); - /* * Skip everything if there's nothing to do. 
We have to do this because * sometimes we are called when fill_super() failed and calling * sync_fs() in such cases does no good. */ - if (!sb_any_quota_loaded(sb)) { - mutex_unlock(&dqopt->dqonoff_mutex); + if (!sb_any_quota_loaded(sb)) return 0; - } + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { toputinode[cnt] = NULL; if (type != -1 && cnt != type) @@ -2179,7 +2168,6 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) dqopt->info[cnt].dqi_bgrace = 0; dqopt->ops[cnt] = NULL; } - mutex_unlock(&dqopt->dqonoff_mutex); /* Skip syncing and setting flags if quota files are hidden */ if (dqopt->flags & DQUOT_QUOTA_SYS_FILE) @@ -2196,20 +2184,14 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) * must also discard the blockdev buffers so that we see the * changes done by userspace on the next quotaon() */ for (cnt = 0; cnt < MAXQUOTAS; cnt++) - if (toputinode[cnt]) { - mutex_lock(&dqopt->dqonoff_mutex); - /* If quota was reenabled in the meantime, we have - * nothing to do */ - if (!sb_has_quota_loaded(sb, cnt)) { - inode_lock(toputinode[cnt]); - toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | + /* This can happen when suspending quotas on remount-ro... */ + if (toputinode[cnt] && !sb_has_quota_loaded(sb, cnt)) { + inode_lock(toputinode[cnt]); + toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); - truncate_inode_pages(&toputinode[cnt]->i_data, - 0); - inode_unlock(toputinode[cnt]); - mark_inode_dirty_sync(toputinode[cnt]); - } - mutex_unlock(&dqopt->dqonoff_mutex); + truncate_inode_pages(&toputinode[cnt]->i_data, 0); + inode_unlock(toputinode[cnt]); + mark_inode_dirty_sync(toputinode[cnt]); } if (sb->s_bdev) invalidate_bdev(sb->s_bdev); @@ -2281,6 +2263,10 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, error = -EINVAL; goto out_fmt; } + if (sb_has_quota_loaded(sb, type)) { + error = -EBUSY; + goto out_fmt; + } if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { /* As we bypass the pagecache we must now flush all the @@ -2292,11 +2278,6 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, sync_filesystem(sb); invalidate_bdev(sb->s_bdev); } - mutex_lock(&dqopt->dqonoff_mutex); - if (sb_has_quota_loaded(sb, type)) { - error = -EBUSY; - goto out_lock; - } if (!(dqopt->flags & DQUOT_QUOTA_SYS_FILE)) { /* We don't want quota and atime on quota files (deadlocks @@ -2317,7 +2298,7 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, error = -EIO; dqopt->files[type] = igrab(inode); if (!dqopt->files[type]) - goto out_lock; + goto out_file_flags; error = -EINVAL; if (!fmt->qf_ops->check_quota_file(sb, type)) goto out_file_init; @@ -2340,14 +2321,13 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, spin_unlock(&dq_state_lock); add_dquot_ref(sb, type); - mutex_unlock(&dqopt->dqonoff_mutex); return 0; out_file_init: dqopt->files[type] = NULL; iput(inode); -out_lock: +out_file_flags: if (oldflags != -1) { inode_lock(inode); /* Set the flags back (in the case of accidental quotaon() @@ -2356,7 +2336,6 @@ out_lock: inode->i_flags |= oldflags; inode_unlock(inode); } - mutex_unlock(&dqopt->dqonoff_mutex); out_fmt: put_quota_format(fmt); @@ -2371,15 +2350,16 @@ int dquot_resume(struct super_block *sb, int type) int ret = 0, cnt; unsigned int flags; + /* s_umount should be held in exclusive mode */ + if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) + up_read(&sb->s_umount); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { if (type != -1 && cnt != 
type) continue; - - mutex_lock(&dqopt->dqonoff_mutex); - if (!sb_has_quota_suspended(sb, cnt)) { - mutex_unlock(&dqopt->dqonoff_mutex); + if (!sb_has_quota_suspended(sb, cnt)) continue; - } + inode = dqopt->files[cnt]; dqopt->files[cnt] = NULL; spin_lock(&dq_state_lock); @@ -2388,7 +2368,6 @@ int dquot_resume(struct super_block *sb, int type) cnt); dqopt->flags &= ~dquot_state_flag(DQUOT_STATE_FLAGS, cnt); spin_unlock(&dq_state_lock); - mutex_unlock(&dqopt->dqonoff_mutex); flags = dquot_generic_flag(flags, cnt); ret = vfs_load_quota_inode(inode, cnt, @@ -2401,7 +2380,7 @@ int dquot_resume(struct super_block *sb, int type) EXPORT_SYMBOL(dquot_resume); int dquot_quota_on(struct super_block *sb, int type, int format_id, - struct path *path) + const struct path *path) { int error = security_quota_on(path->dentry); if (error) @@ -2424,42 +2403,30 @@ EXPORT_SYMBOL(dquot_quota_on); int dquot_enable(struct inode *inode, int type, int format_id, unsigned int flags) { - int ret = 0; struct super_block *sb = inode->i_sb; - struct quota_info *dqopt = sb_dqopt(sb); /* Just unsuspend quotas? */ BUG_ON(flags & DQUOT_SUSPENDED); + /* s_umount should be held in exclusive mode */ + if (WARN_ON_ONCE(down_read_trylock(&sb->s_umount))) + up_read(&sb->s_umount); if (!flags) return 0; /* Just updating flags needed? */ if (sb_has_quota_loaded(sb, type)) { - mutex_lock(&dqopt->dqonoff_mutex); - /* Now do a reliable test... */ - if (!sb_has_quota_loaded(sb, type)) { - mutex_unlock(&dqopt->dqonoff_mutex); - goto load_quota; - } if (flags & DQUOT_USAGE_ENABLED && - sb_has_quota_usage_enabled(sb, type)) { - ret = -EBUSY; - goto out_lock; - } + sb_has_quota_usage_enabled(sb, type)) + return -EBUSY; if (flags & DQUOT_LIMITS_ENABLED && - sb_has_quota_limits_enabled(sb, type)) { - ret = -EBUSY; - goto out_lock; - } + sb_has_quota_limits_enabled(sb, type)) + return -EBUSY; spin_lock(&dq_state_lock); sb_dqopt(sb)->flags |= dquot_state_flag(flags, type); spin_unlock(&dq_state_lock); -out_lock: - mutex_unlock(&dqopt->dqonoff_mutex); - return ret; + return 0; } -load_quota: return vfs_load_quota_inode(inode, type, format_id, flags); } EXPORT_SYMBOL(dquot_enable); @@ -2751,7 +2718,6 @@ int dquot_get_state(struct super_block *sb, struct qc_state *state) struct quota_info *dqopt = sb_dqopt(sb); int type; - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); memset(state, 0, sizeof(*state)); for (type = 0; type < MAXQUOTAS; type++) { if (!sb_has_quota_active(sb, type)) @@ -2773,7 +2739,6 @@ int dquot_get_state(struct super_block *sb, struct qc_state *state) tstate->nextents = 1; /* We don't know... 
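dquot_disable(), dquot_resume() and dquot_enable() above all open with the same assertion now that s_umount replaces dqonoff_mutex: if down_read_trylock(&sb->s_umount) succeeds, nobody held the rwsem for write, so the caller broke the locking contract -- warn once and undo the accidental read lock. The idiom generalizes to any rwsem a function requires its caller to hold exclusively:

#include <linux/bug.h>
#include <linux/rwsem.h>

/* Assert the caller holds @sem for write; cheap, warns only once. */
static inline void assert_held_exclusive(struct rw_semaphore *sem)
{
        if (WARN_ON_ONCE(down_read_trylock(sem)))
                up_read(sem);   /* trylock succeeded: nobody held it */
}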
*/ spin_unlock(&dq_data_lock); } - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return 0; } EXPORT_SYMBOL(dquot_get_state); @@ -2787,18 +2752,13 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii) if ((ii->i_fieldmask & QC_WARNS_MASK) || (ii->i_fieldmask & QC_RT_SPC_TIMER)) return -EINVAL; - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - if (!sb_has_quota_active(sb, type)) { - err = -ESRCH; - goto out; - } + if (!sb_has_quota_active(sb, type)) + return -ESRCH; mi = sb_dqopt(sb)->info + type; if (ii->i_fieldmask & QC_FLAGS) { if ((ii->i_flags & QCI_ROOT_SQUASH && - mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) { - err = -EINVAL; - goto out; - } + mi->dqi_format->qf_fmt_id != QFMT_VFS_OLD)) + return -EINVAL; } spin_lock(&dq_data_lock); if (ii->i_fieldmask & QC_SPC_TIMER) @@ -2815,8 +2775,6 @@ int dquot_set_dqinfo(struct super_block *sb, int type, struct qc_info *ii) mark_info_dirty(sb, type); /* Force write to disk */ sb->dq_op->write_info(sb, type); -out: - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); return err; } EXPORT_SYMBOL(dquot_set_dqinfo); diff --git a/fs/quota/netlink.c b/fs/quota/netlink.c index 8b252673d454..e99b1a72d9a7 100644 --- a/fs/quota/netlink.c +++ b/fs/quota/netlink.c @@ -12,14 +12,8 @@ static const struct genl_multicast_group quota_mcgrps[] = { }; /* Netlink family structure for quota */ -static struct genl_family quota_genl_family = { - /* - * Needed due to multicast group ID abuse - old code assumed - * the family ID was also a valid multicast group ID (which - * isn't true) and userspace might thus rely on it. Assign a - * static ID for this group to make dealing with that easier. - */ - .id = GENL_ID_VFS_DQUOT, +static struct genl_family quota_genl_family __ro_after_init = { + .module = THIS_MODULE, .hdrsize = 0, .name = "VFS_DQUOT", .version = 1, diff --git a/fs/quota/quota.c b/fs/quota/quota.c index 2d445425aad7..07e08c7d05ca 100644 --- a/fs/quota/quota.c +++ b/fs/quota/quota.c @@ -80,7 +80,7 @@ unsigned int qtype_enforce_flag(int type) } static int quota_quotaon(struct super_block *sb, int type, qid_t id, - struct path *path) + const struct path *path) { if (!sb->s_qcop->quota_on && !sb->s_qcop->quota_enable) return -ENOSYS; @@ -104,13 +104,9 @@ static int quota_getfmt(struct super_block *sb, int type, void __user *addr) { __u32 fmt; - mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - if (!sb_has_quota_active(sb, type)) { - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + if (!sb_has_quota_active(sb, type)) return -ESRCH; - } fmt = sb_dqopt(sb)->info[type].dqi_format->qf_fmt_id; - mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); if (copy_to_user(addr, &fmt, sizeof(fmt))) return -EFAULT; return 0; @@ -700,7 +696,7 @@ static int quota_rmxquota(struct super_block *sb, void __user *addr) /* Copy parameters and call proper function */ static int do_quotactl(struct super_block *sb, int type, int cmd, qid_t id, - void __user *addr, struct path *path) + void __user *addr, const struct path *path) { int ret; @@ -789,9 +785,14 @@ static int quotactl_cmd_write(int cmd) } return 1; } - #endif /* CONFIG_BLOCK */ +/* Return true if quotactl command is manipulating quota on/off state */ +static bool quotactl_cmd_onoff(int cmd) +{ + return (cmd == Q_QUOTAON) || (cmd == Q_QUOTAOFF); +} + /* * look up a superblock on which quota ops will be performed * - use the name of a block device to find the superblock thereon @@ -809,7 +810,9 @@ static struct super_block *quotactl_block(const char __user *special, int cmd) putname(tmp); if (IS_ERR(bdev)) return ERR_CAST(bdev); - if 
(quotactl_cmd_write(cmd)) + if (quotactl_cmd_onoff(cmd)) + sb = get_super_exclusive_thawed(bdev); + else if (quotactl_cmd_write(cmd)) sb = get_super_thawed(bdev); else sb = get_super(bdev); @@ -872,7 +875,10 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special, ret = do_quotactl(sb, type, cmds, id, addr, pathp); - drop_super(sb); + if (!quotactl_cmd_onoff(cmds)) + drop_super(sb); + else + drop_super_exclusive(sb); out: if (pathp && !IS_ERR(pathp)) path_put(pathp); diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 2bcbf4e77982..2ef7ce75c062 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -23,7 +23,7 @@ #include <linux/sched.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" static int ramfs_nommu_setattr(struct dentry *, struct iattr *); diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 8621c039b536..26e45863e499 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -35,7 +35,7 @@ #include <linux/parser.h> #include <linux/magic.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "internal.h" #define RAMFS_DEFAULT_MODE 0755 diff --git a/fs/read_write.c b/fs/read_write.c index 190e0d362581..5816d4c4cab0 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -20,7 +20,7 @@ #include <linux/fs.h> #include "internal.h" -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/unistd.h> typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *); @@ -1538,28 +1538,43 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, if (len == 0) return 0; - ret = mnt_want_write_file(file_out); - if (ret) - return ret; + sb_start_write(inode_out->i_sb); - ret = -EOPNOTSUPP; - if (file_out->f_op->copy_file_range) + /* + * Try cloning first, this is supported by more file systems, and + * more efficient if both clone and copy are supported (e.g. NFS). + */ + if (file_in->f_op->clone_file_range) { + ret = file_in->f_op->clone_file_range(file_in, pos_in, + file_out, pos_out, len); + if (ret == 0) { + ret = len; + goto done; + } + } + + if (file_out->f_op->copy_file_range) { ret = file_out->f_op->copy_file_range(file_in, pos_in, file_out, pos_out, len, flags); - if (ret == -EOPNOTSUPP) - ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, - len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); + if (ret != -EOPNOTSUPP) + goto done; + } + + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, + len > MAX_RW_COUNT ? MAX_RW_COUNT : len, 0); +done: if (ret > 0) { fsnotify_access(file_in); add_rchar(current, ret); fsnotify_modify(file_out); add_wchar(current, ret); } + inc_syscr(current); inc_syscw(current); - mnt_drop_write_file(file_out); + sb_end_write(inode_out->i_sb); return ret; } @@ -1650,6 +1665,115 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) return security_file_permission(file, write ? MAY_WRITE : MAY_READ); } +/* + * Check that the two inodes are eligible for cloning, the ranges make + * sense, and then flush all dirty data. Caller must ensure that the + * inodes have been locked against any other modifications. + * + * Returns: 0 for "nothing to clone", 1 for "something to clone", or + * the usual negative error code. 
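vfs_copy_file_range() above now tries the cheapest mechanism first: reflink via ->clone_file_range, then the filesystem's ->copy_file_range, then the do_splice_direct page-copy fallback, so the syscall always makes progress. From userspace the chain is invisible; a minimal copy program, assuming a libc that exposes the copy_file_range(2) wrapper:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char **argv)
{
        struct stat st;
        ssize_t n;
        int in, out;

        if (argc != 3) {
                fprintf(stderr, "usage: %s SRC DST\n", argv[0]);
                return 1;
        }
        in = open(argv[1], O_RDONLY);
        if (in < 0 || fstat(in, &st) < 0) {
                perror("src");
                return 1;
        }
        out = open(argv[2], O_WRONLY | O_CREAT | O_TRUNC, 0644);
        if (out < 0) {
                perror("dst");
                return 1;
        }
        for (off_t left = st.st_size; left > 0; left -= n) {
                /* Kernel picks clone, ->copy_file_range or splice. */
                n = copy_file_range(in, NULL, out, NULL, left, 0);
                if (n <= 0) {
                        perror("copy_file_range");
                        return 1;
                }
        }
        return 0;
}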
+ */ +int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, + struct inode *inode_out, loff_t pos_out, + u64 *len, bool is_dedupe) +{ + loff_t bs = inode_out->i_sb->s_blocksize; + loff_t blen; + loff_t isize; + bool same_inode = (inode_in == inode_out); + int ret; + + /* Don't touch certain kinds of inodes */ + if (IS_IMMUTABLE(inode_out)) + return -EPERM; + + if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) + return -ETXTBSY; + + /* Don't reflink dirs, pipes, sockets... */ + if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) + return -EISDIR; + if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) + return -EINVAL; + + /* Are we going all the way to the end? */ + isize = i_size_read(inode_in); + if (isize == 0) + return 0; + + /* Zero length dedupe exits immediately; reflink goes to EOF. */ + if (*len == 0) { + if (is_dedupe || pos_in == isize) + return 0; + if (pos_in > isize) + return -EINVAL; + *len = isize - pos_in; + } + + /* Ensure offsets don't wrap and the input is inside i_size */ + if (pos_in + *len < pos_in || pos_out + *len < pos_out || + pos_in + *len > isize) + return -EINVAL; + + /* Don't allow dedupe past EOF in the dest file */ + if (is_dedupe) { + loff_t disize; + + disize = i_size_read(inode_out); + if (pos_out >= disize || pos_out + *len > disize) + return -EINVAL; + } + + /* If we're linking to EOF, continue to the block boundary. */ + if (pos_in + *len == isize) + blen = ALIGN(isize, bs) - pos_in; + else + blen = *len; + + /* Only reflink if we're aligned to block boundaries */ + if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || + !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) + return -EINVAL; + + /* Don't allow overlapped reflink within the same file */ + if (same_inode) { + if (pos_out + blen > pos_in && pos_out < pos_in + blen) + return -EINVAL; + } + + /* Wait for the completion of any pending IOs on both files */ + inode_dio_wait(inode_in); + if (!same_inode) + inode_dio_wait(inode_out); + + ret = filemap_write_and_wait_range(inode_in->i_mapping, + pos_in, pos_in + *len - 1); + if (ret) + return ret; + + ret = filemap_write_and_wait_range(inode_out->i_mapping, + pos_out, pos_out + *len - 1); + if (ret) + return ret; + + /* + * Check that the extents are the same. + */ + if (is_dedupe) { + bool is_same = false; + + ret = vfs_dedupe_file_range_compare(inode_in, pos_in, + inode_out, pos_out, *len, &is_same); + if (ret) + return ret; + if (!is_same) + return -EBADE; + } + + return 1; +} +EXPORT_SYMBOL(vfs_clone_file_prep_inodes); + int vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, u64 len) { @@ -1657,15 +1781,19 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, struct inode *inode_out = file_inode(file_out); int ret; - if (inode_in->i_sb != inode_out->i_sb || - file_in->f_path.mnt != file_out->f_path.mnt) - return -EXDEV; - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) return -EISDIR; if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) return -EINVAL; + /* + * FICLONE/FICLONERANGE ioctls enforce that src and dest files are on + * the same mount. Practically, they only need to be on the same file + * system. 
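vfs_clone_file_prep_inodes() above concentrates the generic reflink sanity checks: regular files only, no swapfiles, offsets must not wrap, and ranges must be block-aligned -- except that a range ending exactly at EOF is rounded up to the next block boundary so a file's tail can be cloned. Worked through with a 4096-byte block size: cloning 10000 bytes at offset 0 from a 10000-byte file sets blen = ALIGN(10000, 4096) = 12288, which is aligned, so it passes; the same request against a 20000-byte file keeps blen = 10000, which is not aligned, and fails with -EINVAL. The alignment rule, isolated:

#include <linux/kernel.h>       /* ALIGN, IS_ALIGNED */
#include <linux/types.h>

/* True when the range is block-aligned under the EOF rounding rule. */
static bool clone_range_aligned(loff_t pos_in, loff_t pos_out, u64 len,
                                loff_t isize, loff_t bs)
{
        loff_t blen = (pos_in + len == isize) ? ALIGN(isize, bs) - pos_in
                                              : (loff_t)len;

        return IS_ALIGNED(pos_in, bs) && IS_ALIGNED(pos_in + blen, bs) &&
               IS_ALIGNED(pos_out, bs) && IS_ALIGNED(pos_out + blen, bs);
}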
+ */ + if (inode_in->i_sb != inode_out->i_sb) + return -EXDEV; + if (!(file_in->f_mode & FMODE_READ) || !(file_out->f_mode & FMODE_WRITE) || (file_out->f_flags & O_APPEND)) @@ -1685,10 +1813,6 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, if (pos_in + len > i_size_read(inode_in)) return -EINVAL; - ret = mnt_want_write_file(file_out); - if (ret) - return ret; - ret = file_in->f_op->clone_file_range(file_in, pos_in, file_out, pos_out, len); if (!ret) { @@ -1696,11 +1820,106 @@ int vfs_clone_file_range(struct file *file_in, loff_t pos_in, fsnotify_modify(file_out); } - mnt_drop_write_file(file_out); return ret; } EXPORT_SYMBOL(vfs_clone_file_range); +/* + * Read a page's worth of file data into the page cache. Return the page + * locked. + */ +static struct page *vfs_dedupe_get_page(struct inode *inode, loff_t offset) +{ + struct address_space *mapping; + struct page *page; + pgoff_t n; + + n = offset >> PAGE_SHIFT; + mapping = inode->i_mapping; + page = read_mapping_page(mapping, n, NULL); + if (IS_ERR(page)) + return page; + if (!PageUptodate(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + lock_page(page); + return page; +} + +/* + * Compare extents of two files to see if they are the same. + * Caller must have locked both inodes to prevent write races. + */ +int vfs_dedupe_file_range_compare(struct inode *src, loff_t srcoff, + struct inode *dest, loff_t destoff, + loff_t len, bool *is_same) +{ + loff_t src_poff; + loff_t dest_poff; + void *src_addr; + void *dest_addr; + struct page *src_page; + struct page *dest_page; + loff_t cmp_len; + bool same; + int error; + + error = -EINVAL; + same = true; + while (len) { + src_poff = srcoff & (PAGE_SIZE - 1); + dest_poff = destoff & (PAGE_SIZE - 1); + cmp_len = min(PAGE_SIZE - src_poff, + PAGE_SIZE - dest_poff); + cmp_len = min(cmp_len, len); + if (cmp_len <= 0) + goto out_error; + + src_page = vfs_dedupe_get_page(src, srcoff); + if (IS_ERR(src_page)) { + error = PTR_ERR(src_page); + goto out_error; + } + dest_page = vfs_dedupe_get_page(dest, destoff); + if (IS_ERR(dest_page)) { + error = PTR_ERR(dest_page); + unlock_page(src_page); + put_page(src_page); + goto out_error; + } + src_addr = kmap_atomic(src_page); + dest_addr = kmap_atomic(dest_page); + + flush_dcache_page(src_page); + flush_dcache_page(dest_page); + + if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) + same = false; + + kunmap_atomic(dest_addr); + kunmap_atomic(src_addr); + unlock_page(dest_page); + unlock_page(src_page); + put_page(dest_page); + put_page(src_page); + + if (!same) + break; + + srcoff += cmp_len; + destoff += cmp_len; + len -= cmp_len; + } + + *is_same = same; + return 0; + +out_error: + return error; +} +EXPORT_SYMBOL(vfs_dedupe_file_range_compare); + int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) { struct file_dedupe_range_info *info; @@ -1737,6 +1956,9 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) goto out; ret = 0; + if (off + len > i_size_read(src)) + return -EINVAL; + /* pre-format output fields to sane values */ for (i = 0; i < count; i++) { same->info[i].bytes_deduped = 0ULL; diff --git a/fs/readdir.c b/fs/readdir.c index 9d0212c374d6..0e8a7f355f7a 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -19,7 +19,7 @@ #include <linux/syscalls.h> #include <linux/unistd.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> int iterate_dir(struct file *file, struct dir_context *ctx) { diff --git a/fs/reiserfs/inode.c b/fs/reiserfs/inode.c index 
58b2dedb2a3a..cfeae9b0a2b7 100644 --- a/fs/reiserfs/inode.c +++ b/fs/reiserfs/inode.c @@ -19,6 +19,7 @@ #include <linux/quotaops.h> #include <linux/swap.h> #include <linux/uio.h> +#include <linux/bio.h> int reiserfs_commit_write(struct file *f, struct page *page, unsigned from, unsigned to); diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index bc2dde2423c2..aa40c242f1db 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -1111,7 +1111,8 @@ static int flush_commit_list(struct super_block *s, mark_buffer_dirty(jl->j_commit_bh) ; depth = reiserfs_write_unlock_nested(s); if (reiserfs_barrier_flush(s)) - __sync_dirty_buffer(jl->j_commit_bh, WRITE_FLUSH_FUA); + __sync_dirty_buffer(jl->j_commit_bh, + REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(jl->j_commit_bh); reiserfs_write_lock_nested(s, depth); @@ -1269,7 +1270,8 @@ static int _update_journal_header_block(struct super_block *sb, depth = reiserfs_write_unlock_nested(sb); if (reiserfs_barrier_flush(sb)) - __sync_dirty_buffer(journal->j_header_bh, WRITE_FLUSH_FUA); + __sync_dirty_buffer(journal->j_header_bh, + REQ_PREFLUSH | REQ_FUA); else sync_dirty_buffer(journal->j_header_bh); diff --git a/fs/reiserfs/namei.c b/fs/reiserfs/namei.c index e6a2b406af36..bd39a998843d 100644 --- a/fs/reiserfs/namei.c +++ b/fs/reiserfs/namei.c @@ -1665,7 +1665,6 @@ const struct inode_operations reiserfs_dir_inode_operations = { * stuff added */ const struct inode_operations reiserfs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .setattr = reiserfs_setattr, .listxattr = reiserfs_listxattr, diff --git a/fs/reiserfs/stree.c b/fs/reiserfs/stree.c index a97e352d05d3..0037aea97d39 100644 --- a/fs/reiserfs/stree.c +++ b/fs/reiserfs/stree.c @@ -11,6 +11,7 @@ #include <linux/time.h> #include <linux/string.h> #include <linux/pagemap.h> +#include <linux/bio.h> #include "reiserfs.h" #include <linux/buffer_head.h> #include <linux/quotaops.h> diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 0a6ad4e71e88..e314cb30a181 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -802,7 +802,7 @@ static int reiserfs_acquire_dquot(struct dquot *); static int reiserfs_release_dquot(struct dquot *); static int reiserfs_mark_dquot_dirty(struct dquot *); static int reiserfs_write_info(struct super_block *, int); -static int reiserfs_quota_on(struct super_block *, int, int, struct path *); +static int reiserfs_quota_on(struct super_block *, int, int, const struct path *); static const struct dquot_operations reiserfs_quota_operations = { .write_dquot = reiserfs_write_dquot, @@ -2348,7 +2348,7 @@ static int reiserfs_quota_on_mount(struct super_block *sb, int type) * Standard function to be called on quota_on */ static int reiserfs_quota_on(struct super_block *sb, int type, int format_id, - struct path *path) + const struct path *path) { int err; struct inode *inode; diff --git a/fs/romfs/super.c b/fs/romfs/super.c index d0f8a38dfafa..0186fe6d39f3 100644 --- a/fs/romfs/super.c +++ b/fs/romfs/super.c @@ -74,6 +74,7 @@ #include <linux/highmem.h> #include <linux/pagemap.h> #include <linux/uaccess.h> +#include <linux/major.h> #include "internal.h" static struct kmem_cache *romfs_inode_cachep; @@ -416,7 +417,22 @@ static void romfs_destroy_inode(struct inode *inode) static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; - u64 id = huge_encode_dev(sb->s_bdev->bd_dev); + u64 id = 0; + + /* When calling huge_encode_dev(), + * use sb->s_bdev->bd_dev when, + * - 
CONFIG_ROMFS_ON_BLOCK defined + * use sb->s_dev when, + * - CONFIG_ROMFS_ON_BLOCK undefined and + * - CONFIG_ROMFS_ON_MTD defined + * leave id as 0 when, + * - CONFIG_ROMFS_ON_BLOCK undefined and + * - CONFIG_ROMFS_ON_MTD undefined + */ + if (sb->s_bdev) + id = huge_encode_dev(sb->s_bdev->bd_dev); + else if (sb->s_dev) + id = huge_encode_dev(sb->s_dev); buf->f_type = ROMFS_MAGIC; buf->f_namelen = ROMFS_MAXFN; @@ -489,6 +505,11 @@ static int romfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_flags |= MS_RDONLY | MS_NOATIME; sb->s_op = &romfs_super_ops; +#ifdef CONFIG_ROMFS_ON_MTD + /* Use same dev ID from the underlying mtdblock device */ + if (sb->s_mtd) + sb->s_dev = MKDEV(MTD_BLOCK_MAJOR, sb->s_mtd->index); +#endif /* read the image superblock and check it */ rsb = kmalloc(512, GFP_KERNEL); if (!rsb) diff --git a/fs/select.c b/fs/select.c index 3d4f85defeab..305c0daf5d67 100644 --- a/fs/select.c +++ b/fs/select.c @@ -31,7 +31,7 @@ #include <net/busy_poll.h> #include <linux/vmalloc.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> /* diff --git a/fs/seq_file.c b/fs/seq_file.c index 368bfb92b115..ca69fb99e41a 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -15,7 +15,7 @@ #include <linux/printk.h> #include <linux/string_helpers.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/page.h> static void seq_set_overflow(struct seq_file *m) @@ -190,6 +190,13 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) */ m->version = file->f_version; + /* + * if request is to read from zero offset, reset iterator to first + * record as it might have been already advanced by previous requests + */ + if (*ppos == 0) + m->index = 0; + /* Don't assume *ppos is where we left it */ if (unlikely(*ppos != m->read_pos)) { while ((err = traverse(m, *ppos)) == -EAGAIN) diff --git a/fs/splice.c b/fs/splice.c index 5a7750bd2eea..873d83104e79 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -17,6 +17,7 @@ * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> * */ +#include <linux/bvec.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/pagemap.h> @@ -1086,7 +1087,13 @@ EXPORT_SYMBOL(do_splice_direct); static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) { - while (pipe->nrbufs == pipe->buffers) { + for (;;) { + if (unlikely(!pipe->readers)) { + send_sig(SIGPIPE, current, 0); + return -EPIPE; + } + if (pipe->nrbufs != pipe->buffers) + return 0; if (flags & SPLICE_F_NONBLOCK) return -EAGAIN; if (signal_pending(current)) @@ -1095,7 +1102,6 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) pipe_wait(pipe); pipe->waiting_writers--; } - return 0; } static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index ce62a380314f..2751476e6b6e 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -31,6 +31,7 @@ #include <linux/slab.h> #include <linux/string.h> #include <linux/buffer_head.h> +#include <linux/bio.h> #include "squashfs_fs.h" #include "squashfs_fs_sb.h" diff --git a/fs/squashfs/symlink.c b/fs/squashfs/symlink.c index 79b9c31a0c8f..befeba0fa70a 100644 --- a/fs/squashfs/symlink.c +++ b/fs/squashfs/symlink.c @@ -118,7 +118,6 @@ const struct address_space_operations squashfs_symlink_aops = { }; const struct inode_operations squashfs_symlink_inode_ops = { - .readlink = generic_readlink, .get_link = page_get_link, .listxattr = squashfs_listxattr }; diff --git a/fs/stat.c b/fs/stat.c index bc045c7994e1..a268b7f27adf 100644 --- 
a/fs/stat.c +++ b/fs/stat.c @@ -15,7 +15,7 @@ #include <linux/syscalls.h> #include <linux/pagemap.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/unistd.h> void generic_fillattr(struct inode *inode, struct kstat *stat) @@ -329,12 +329,14 @@ retry: struct inode *inode = d_backing_inode(path.dentry); error = empty ? -ENOENT : -EINVAL; - if (inode->i_op->readlink) { + /* + * AFS mountpoints allow readlink(2) but are not symlinks + */ + if (d_is_symlink(path.dentry) || inode->i_op->readlink) { error = security_inode_readlink(path.dentry); if (!error) { touch_atime(&path); - error = inode->i_op->readlink(path.dentry, - buf, bufsiz); + error = vfs_readlink(path.dentry, buf, bufsiz); } } path_put(&path); diff --git a/fs/statfs.c b/fs/statfs.c index 083dc0ac9140..13ae259d4879 100644 --- a/fs/statfs.c +++ b/fs/statfs.c @@ -63,7 +63,7 @@ static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf) return retval; } -int vfs_statfs(struct path *path, struct kstatfs *buf) +int vfs_statfs(const struct path *path, struct kstatfs *buf) { int error; diff --git a/fs/super.c b/fs/super.c index c183835566c1..1709ed029a2c 100644 --- a/fs/super.c +++ b/fs/super.c @@ -244,7 +244,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, mutex_init(&s->s_vfs_rename_mutex); lockdep_set_class(&s->s_vfs_rename_mutex, &type->s_vfs_rename_key); mutex_init(&s->s_dquot.dqio_mutex); - mutex_init(&s->s_dquot.dqonoff_mutex); s->s_maxbytes = MAX_NON_LFS; s->s_op = &default_op; s->s_time_gran = 1000000000; @@ -558,6 +557,13 @@ void drop_super(struct super_block *sb) EXPORT_SYMBOL(drop_super); +void drop_super_exclusive(struct super_block *sb) +{ + up_write(&sb->s_umount); + put_super(sb); +} +EXPORT_SYMBOL(drop_super_exclusive); + /** * iterate_supers - call function for all active superblocks * @f: function to call @@ -628,15 +634,7 @@ void iterate_supers_type(struct file_system_type *type, EXPORT_SYMBOL(iterate_supers_type); -/** - * get_super - get the superblock of a device - * @bdev: device to get the superblock for - * - * Scans the superblock list and finds the superblock of the file system - * mounted on the device given. %NULL is returned if no match is found. - */ - -struct super_block *get_super(struct block_device *bdev) +static struct super_block *__get_super(struct block_device *bdev, bool excl) { struct super_block *sb; @@ -651,11 +649,17 @@ rescan: if (sb->s_bdev == bdev) { sb->s_count++; spin_unlock(&sb_lock); - down_read(&sb->s_umount); + if (!excl) + down_read(&sb->s_umount); + else + down_write(&sb->s_umount); /* still alive? */ if (sb->s_root && (sb->s_flags & MS_BORN)) return sb; - up_read(&sb->s_umount); + if (!excl) + up_read(&sb->s_umount); + else + up_write(&sb->s_umount); /* nope, got unmounted */ spin_lock(&sb_lock); __put_super(sb); @@ -666,32 +670,67 @@ rescan: return NULL; } -EXPORT_SYMBOL(get_super); - /** - * get_super_thawed - get thawed superblock of a device + * get_super - get the superblock of a device * @bdev: device to get the superblock for * * Scans the superblock list and finds the superblock of the file system - * mounted on the device. The superblock is returned once it is thawed - * (or immediately if it was not frozen). %NULL is returned if no match - * is found. + * mounted on the device given. %NULL is returned if no match is found. 
*/ -struct super_block *get_super_thawed(struct block_device *bdev) +struct super_block *get_super(struct block_device *bdev) +{ + return __get_super(bdev, false); +} +EXPORT_SYMBOL(get_super); + +static struct super_block *__get_super_thawed(struct block_device *bdev, + bool excl) { while (1) { - struct super_block *s = get_super(bdev); + struct super_block *s = __get_super(bdev, excl); if (!s || s->s_writers.frozen == SB_UNFROZEN) return s; - up_read(&s->s_umount); + if (!excl) + up_read(&s->s_umount); + else + up_write(&s->s_umount); wait_event(s->s_writers.wait_unfrozen, s->s_writers.frozen == SB_UNFROZEN); put_super(s); } } + +/** + * get_super_thawed - get thawed superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device. The superblock is returned once it is thawed + * (or immediately if it was not frozen). %NULL is returned if no match + * is found. + */ +struct super_block *get_super_thawed(struct block_device *bdev) +{ + return __get_super_thawed(bdev, false); +} EXPORT_SYMBOL(get_super_thawed); /** + * get_super_exclusive_thawed - get thawed superblock of a device + * @bdev: device to get the superblock for + * + * Scans the superblock list and finds the superblock of the file system + * mounted on the device. The superblock is returned once it is thawed + * (or immediately if it was not frozen) and s_umount semaphore is held + * in exclusive mode. %NULL is returned if no match is found. + */ +struct super_block *get_super_exclusive_thawed(struct block_device *bdev) +{ + return __get_super_thawed(bdev, true); +} +EXPORT_SYMBOL(get_super_exclusive_thawed); + +/** * get_active_super - get an active reference to the superblock of a device * @bdev: device to get the superblock for * diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c index d62c423a5a2d..858fb72f9e0f 100644 --- a/fs/sysv/inode.c +++ b/fs/sysv/inode.c @@ -145,7 +145,6 @@ static inline void write3byte(struct sysv_sb_info *sbi, } static const struct inode_operations sysv_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = page_get_link, .getattr = sysv_getattr, }; diff --git a/fs/timerfd.c b/fs/timerfd.c index 9ae4abb4110b..c173cc196175 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -55,7 +55,7 @@ static inline bool isalarm(struct timerfd_ctx *ctx) /* * This gets called when the timer event triggers. We set the "expired" * flag, but we do not re-arm the timer (in case it's necessary, - * tintv.tv64 != 0) until the timer is accessed. + * tintv != 0) until the timer is accessed. 
*/ static void timerfd_triggered(struct timerfd_ctx *ctx) { @@ -93,7 +93,7 @@ static enum alarmtimer_restart timerfd_alarmproc(struct alarm *alarm, */ void timerfd_clock_was_set(void) { - ktime_t moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); + ktime_t moffs = ktime_mono_to_real(0); struct timerfd_ctx *ctx; unsigned long flags; @@ -102,8 +102,8 @@ void timerfd_clock_was_set(void) if (!ctx->might_cancel) continue; spin_lock_irqsave(&ctx->wqh.lock, flags); - if (ctx->moffs.tv64 != moffs.tv64) { - ctx->moffs.tv64 = KTIME_MAX; + if (ctx->moffs != moffs) { + ctx->moffs = KTIME_MAX; ctx->ticks++; wake_up_locked(&ctx->wqh); } @@ -124,9 +124,9 @@ static void timerfd_remove_cancel(struct timerfd_ctx *ctx) static bool timerfd_canceled(struct timerfd_ctx *ctx) { - if (!ctx->might_cancel || ctx->moffs.tv64 != KTIME_MAX) + if (!ctx->might_cancel || ctx->moffs != KTIME_MAX) return false; - ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); + ctx->moffs = ktime_mono_to_real(0); return true; } @@ -155,7 +155,7 @@ static ktime_t timerfd_get_remaining(struct timerfd_ctx *ctx) else remaining = hrtimer_expires_remaining_adjusted(&ctx->t.tmr); - return remaining.tv64 < 0 ? ktime_set(0, 0): remaining; + return remaining < 0 ? 0: remaining; } static int timerfd_setup(struct timerfd_ctx *ctx, int flags, @@ -184,7 +184,7 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, ctx->t.tmr.function = timerfd_tmrproc; } - if (texp.tv64 != 0) { + if (texp != 0) { if (isalarm(ctx)) { if (flags & TFD_TIMER_ABSTIME) alarm_start(&ctx->t.alarm, texp); @@ -261,9 +261,9 @@ static ssize_t timerfd_read(struct file *file, char __user *buf, size_t count, if (ctx->ticks) { ticks = ctx->ticks; - if (ctx->expired && ctx->tintv.tv64) { + if (ctx->expired && ctx->tintv) { /* - * If tintv.tv64 != 0, this is a periodic timer that + * If tintv != 0, this is a periodic timer that * needs to be re-armed. We avoid doing it in the timer * callback to avoid DoS attacks specifying a very * short timer period. @@ -410,7 +410,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) else hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS); - ctx->moffs = ktime_mono_to_real((ktime_t){ .tv64 = 0 }); + ctx->moffs = ktime_mono_to_real(0); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); @@ -469,7 +469,7 @@ static int do_timerfd_settime(int ufd, int flags, * We do not update "ticks" and "expired" since the timer will be * re-programmed again in the following timerfd_setup() call. */ - if (ctx->expired && ctx->tintv.tv64) { + if (ctx->expired && ctx->tintv) { if (isalarm(ctx)) alarm_forward_now(&ctx->t.alarm, ctx->tintv); else @@ -499,7 +499,7 @@ static int do_timerfd_gettime(int ufd, struct itimerspec *t) ctx = f.file->private_data; spin_lock_irq(&ctx->wqh.lock); - if (ctx->expired && ctx->tintv.tv64) { + if (ctx->expired && ctx->tintv) { ctx->expired = 0; if (isalarm(ctx)) { diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig index 7ff7712f284e..b0d0623c83ed 100644 --- a/fs/ubifs/Kconfig +++ b/fs/ubifs/Kconfig @@ -50,3 +50,14 @@ config UBIFS_ATIME_SUPPORT strictatime is the "heavy", relatime is "lighter", etc. If unsure, say 'N' + +config UBIFS_FS_ENCRYPTION + bool "UBIFS Encryption" + depends on UBIFS_FS && BLOCK + select FS_ENCRYPTION + default n + help + Enable encryption of UBIFS files and directories. This + feature is similar to ecryptfs, but it is more memory + efficient since it avoids caching the encrypted and + decrypted pages in the page cache. 
diff --git a/fs/ubifs/Makefile b/fs/ubifs/Makefile index c54a24360f85..6f3251c2bf08 100644 --- a/fs/ubifs/Makefile +++ b/fs/ubifs/Makefile @@ -5,3 +5,4 @@ ubifs-y += tnc.o master.o scan.o replay.o log.o commit.o gc.o orphan.o ubifs-y += budget.o find.o tnc_commit.o compress.o lpt.o lprops.o ubifs-y += recovery.o ioctl.o lpt_commit.o tnc_misc.o xattr.o debug.o ubifs-y += misc.o +ubifs-$(CONFIG_UBIFS_FS_ENCRYPTION) += crypto.o diff --git a/fs/ubifs/crypto.c b/fs/ubifs/crypto.c new file mode 100644 index 000000000000..3402720f2b28 --- /dev/null +++ b/fs/ubifs/crypto.c @@ -0,0 +1,97 @@ +#include "ubifs.h" + +static int ubifs_crypt_get_context(struct inode *inode, void *ctx, size_t len) +{ + return ubifs_xattr_get(inode, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len); +} + +static int ubifs_crypt_set_context(struct inode *inode, const void *ctx, + size_t len, void *fs_data) +{ + return ubifs_xattr_set(inode, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, 0); +} + +static bool ubifs_crypt_empty_dir(struct inode *inode) +{ + return ubifs_check_dir_empty(inode) == 0; +} + +static unsigned int ubifs_crypt_max_namelen(struct inode *inode) +{ + if (S_ISLNK(inode->i_mode)) + return UBIFS_MAX_INO_DATA; + else + return UBIFS_MAX_NLEN; +} + +static int ubifs_key_prefix(struct inode *inode, u8 **key) +{ + static char prefix[] = "ubifs:"; + + *key = prefix; + + return sizeof(prefix) - 1; +} + +int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, + unsigned int in_len, unsigned int *out_len, int block) +{ + struct ubifs_info *c = inode->i_sb->s_fs_info; + void *p = &dn->data; + struct page *ret; + unsigned int pad_len = round_up(in_len, UBIFS_CIPHER_BLOCK_SIZE); + + ubifs_assert(pad_len <= *out_len); + dn->compr_size = cpu_to_le16(in_len); + + /* pad to full block cipher length */ + if (pad_len != in_len) + memset(p + in_len, 0, pad_len - in_len); + + ret = fscrypt_encrypt_page(inode, virt_to_page(&dn->data), pad_len, + offset_in_page(&dn->data), block, GFP_NOFS); + if (IS_ERR(ret)) { + ubifs_err(c, "fscrypt_encrypt_page failed: %ld", PTR_ERR(ret)); + return PTR_ERR(ret); + } + *out_len = pad_len; + + return 0; +} + +int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, + unsigned int *out_len, int block) +{ + struct ubifs_info *c = inode->i_sb->s_fs_info; + int err; + unsigned int clen = le16_to_cpu(dn->compr_size); + unsigned int dlen = *out_len; + + if (clen <= 0 || clen > UBIFS_BLOCK_SIZE || clen > dlen) { + ubifs_err(c, "bad compr_size: %i", clen); + return -EINVAL; + } + + ubifs_assert(dlen <= UBIFS_BLOCK_SIZE); + err = fscrypt_decrypt_page(inode, virt_to_page(&dn->data), dlen, + offset_in_page(&dn->data), block); + if (err) { + ubifs_err(c, "fscrypt_decrypt_page failed: %i", err); + return err; + } + *out_len = clen; + + return 0; +} + +struct fscrypt_operations ubifs_crypt_operations = { + .flags = FS_CFLG_OWN_PAGES, + .get_context = ubifs_crypt_get_context, + .set_context = ubifs_crypt_set_context, + .is_encrypted = __ubifs_crypt_is_encrypted, + .empty_dir = ubifs_crypt_empty_dir, + .max_namelen = ubifs_crypt_max_namelen, + .key_prefix = ubifs_key_prefix, +}; diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 69e287e20732..1e712a364680 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -233,7 +233,7 @@ static void dump_ch(const struct ubifs_ch *ch) void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) { const struct ubifs_inode *ui = ubifs_inode(inode); - struct qstr nm = { .name = NULL }; + struct fscrypt_name nm = {0}; union 
ubifs_key key; struct ubifs_dent_node *dent, *pdent = NULL; int count = 2; @@ -289,8 +289,8 @@ void ubifs_dump_inode(struct ubifs_info *c, const struct inode *inode) pr_err("\t%d: %s (%s)\n", count++, dent->name, get_dent_type(dent->type)); - nm.name = dent->name; - nm.len = le16_to_cpu(dent->nlen); + fname_name(&nm) = dent->name; + fname_len(&nm) = le16_to_cpu(dent->nlen); kfree(pdent); pdent = dent; key_read(c, &dent->key, &key); @@ -1107,7 +1107,7 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) unsigned int nlink = 2; union ubifs_key key; struct ubifs_dent_node *dent, *pdent = NULL; - struct qstr nm = { .name = NULL }; + struct fscrypt_name nm = {0}; loff_t size = UBIFS_INO_NODE_SZ; if (!dbg_is_chk_gen(c)) @@ -1128,9 +1128,9 @@ int dbg_check_dir(struct ubifs_info *c, const struct inode *dir) return err; } - nm.name = dent->name; - nm.len = le16_to_cpu(dent->nlen); - size += CALC_DENT_SIZE(nm.len); + fname_name(&nm) = dent->name; + fname_len(&nm) = le16_to_cpu(dent->nlen); + size += CALC_DENT_SIZE(fname_len(&nm)); if (dent->type == UBIFS_ITYPE_DIR) nlink += 1; kfree(pdent); diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index ca16c5d7bab1..528369f3e472 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -85,11 +85,26 @@ static int inherit_flags(const struct inode *dir, umode_t mode) * initializes it. Returns new inode in case of success and an error code in * case of failure. */ -struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, +struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, umode_t mode) { + int err; struct inode *inode; struct ubifs_inode *ui; + bool encrypted = false; + + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err) { + ubifs_err(c, "fscrypt_get_encryption_info failed: %i", err); + return ERR_PTR(err); + } + + if (!fscrypt_has_encryption_key(dir)) + return ERR_PTR(-EPERM); + + encrypted = true; + } inode = new_inode(c->vfs_sb); ui = ubifs_inode(inode); @@ -165,18 +180,29 @@ struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, */ ui->creat_sqnum = ++c->max_sqnum; spin_unlock(&c->cnt_lock); + + if (encrypted) { + err = fscrypt_inherit_context(dir, inode, &encrypted, true); + if (err) { + ubifs_err(c, "fscrypt_inherit_context failed: %i", err); + make_bad_inode(inode); + iput(inode); + return ERR_PTR(err); + } + } + return inode; } static int dbg_check_name(const struct ubifs_info *c, const struct ubifs_dent_node *dent, - const struct qstr *nm) + const struct fscrypt_name *nm) { if (!dbg_is_chk_gen(c)) return 0; - if (le16_to_cpu(dent->nlen) != nm->len) + if (le16_to_cpu(dent->nlen) != fname_len(nm)) return -EINVAL; - if (memcmp(dent->name, nm->name, nm->len)) + if (memcmp(dent->name, fname_name(nm), fname_len(nm))) return -EINVAL; return 0; } @@ -189,30 +215,61 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode = NULL; struct ubifs_dent_node *dent; struct ubifs_info *c = dir->i_sb->s_fs_info; + struct fscrypt_name nm; dbg_gen("'%pd' in dir ino %lu", dentry, dir->i_ino); - if (dentry->d_name.len > UBIFS_MAX_NLEN) - return ERR_PTR(-ENAMETOOLONG); + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + + /* + * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is + * created while the directory was encrypted and we + * have access to the key. 
+ */ + if (fscrypt_has_encryption_key(dir)) + fscrypt_set_encrypted_dentry(dentry); + fscrypt_set_d_op(dentry); + if (err && err != -ENOKEY) + return ERR_PTR(err); + } + + err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm); + if (err) + return ERR_PTR(err); + + if (fname_len(&nm) > UBIFS_MAX_NLEN) { + err = -ENAMETOOLONG; + goto out_fname; + } dent = kmalloc(UBIFS_MAX_DENT_NODE_SZ, GFP_NOFS); - if (!dent) - return ERR_PTR(-ENOMEM); + if (!dent) { + err = -ENOMEM; + goto out_fname; + } - dent_key_init(c, &key, dir->i_ino, &dentry->d_name); + if (nm.hash) { + ubifs_assert(fname_len(&nm) == 0); + ubifs_assert(fname_name(&nm) == NULL); + dent_key_init_hash(c, &key, dir->i_ino, nm.hash); + err = ubifs_tnc_lookup_dh(c, &key, dent, nm.minor_hash); + } else { + dent_key_init(c, &key, dir->i_ino, &nm); + err = ubifs_tnc_lookup_nm(c, &key, dent, &nm); + } - err = ubifs_tnc_lookup_nm(c, &key, dent, &dentry->d_name); if (err) { if (err == -ENOENT) { dbg_gen("not found"); goto done; } - goto out; + goto out_dent; } - if (dbg_check_name(c, dent, &dentry->d_name)) { + if (dbg_check_name(c, dent, &nm)) { err = -EINVAL; - goto out; + goto out_dent; } inode = ubifs_iget(dir->i_sb, le64_to_cpu(dent->inum)); @@ -225,11 +282,12 @@ static struct dentry *ubifs_lookup(struct inode *dir, struct dentry *dentry, ubifs_err(c, "dead directory entry '%pd', error %d", dentry, err); ubifs_ro_mode(c, err); - goto out; + goto out_dent; } done: kfree(dent); + fscrypt_free_filename(&nm); /* * Note, d_splice_alias() would be required instead if we supported * NFS. @@ -237,8 +295,10 @@ done: d_add(dentry, inode); return NULL; -out: +out_dent: kfree(dent); +out_fname: + fscrypt_free_filename(&nm); return ERR_PTR(err); } @@ -247,10 +307,11 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, { struct inode *inode; struct ubifs_info *c = dir->i_sb->s_fs_info; - int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, .dirtied_ino = 1 }; struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct fscrypt_name nm; + int err, sz_change; /* * Budget request settings: new inode, new direntry, changing the @@ -264,10 +325,16 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (err) return err; + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + goto out_budg; + + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + inode = ubifs_new_inode(c, dir, mode); if (IS_ERR(inode)) { err = PTR_ERR(inode); - goto out_budg; + goto out_fname; } err = ubifs_init_security(dir, inode, &dentry->d_name); @@ -278,12 +345,13 @@ static int ubifs_create(struct inode *dir, struct dentry *dentry, umode_t mode, dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); ubifs_release_budget(c, &req); + fscrypt_free_filename(&nm); insert_inode_hash(inode); d_instantiate(dentry, inode); return 0; @@ -295,6 +363,8 @@ out_cancel: out_inode: make_bad_inode(inode); iput(inode); +out_fname: + fscrypt_free_filename(&nm); out_budg: ubifs_release_budget(c, &req); ubifs_err(c, "cannot create regular file, error %d", err); @@ -310,6 +380,7 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, struct ubifs_budget_req ino_req = { .dirtied_ino = 1 }; struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir); int 
err, instantiated = 0; + struct fscrypt_name nm; /* * Budget request settings: new dirty inode, new direntry, @@ -319,13 +390,20 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, dbg_gen("dent '%pd', mode %#hx in dir ino %lu", dentry, mode, dir->i_ino); - err = ubifs_budget_space(c, &req); + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); if (err) return err; + err = ubifs_budget_space(c, &req); + if (err) { + fscrypt_free_filename(&nm); + return err; + } + err = ubifs_budget_space(c, &ino_req); if (err) { ubifs_release_budget(c, &req); + fscrypt_free_filename(&nm); return err; } @@ -361,7 +439,7 @@ static int do_tmpfile(struct inode *dir, struct dentry *dentry, mutex_unlock(&ui->ui_mutex); mutex_lock(&dir_ui->ui_mutex); - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -380,6 +458,7 @@ out_budg: ubifs_release_budget(c, &req); if (!instantiated) ubifs_release_budget(c, &ino_req); + fscrypt_free_filename(&nm); ubifs_err(c, "cannot create temporary file, error %d", err); return err; } @@ -439,12 +518,14 @@ static unsigned int vfs_dent_type(uint8_t type) */ static int ubifs_readdir(struct file *file, struct dir_context *ctx) { - int err = 0; - struct qstr nm; + int fstr_real_len = 0, err = 0; + struct fscrypt_name nm; + struct fscrypt_str fstr = {0}; union ubifs_key key; struct ubifs_dent_node *dent; struct inode *dir = file_inode(file); struct ubifs_info *c = dir->i_sb->s_fs_info; + bool encrypted = ubifs_crypt_is_encrypted(dir); dbg_gen("dir ino %lu, f_pos %#llx", dir->i_ino, ctx->pos); @@ -455,6 +536,18 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) */ return 0; + if (encrypted) { + err = fscrypt_get_encryption_info(dir); + if (err && err != -ENOKEY) + return err; + + err = fscrypt_fname_alloc_buffer(dir, UBIFS_MAX_NLEN, &fstr); + if (err) + return err; + + fstr_real_len = fstr.len; + } + if (file->f_version == 0) { /* * The file was seek'ed, which means that @file->private_data @@ -476,12 +569,15 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) /* File positions 0 and 1 correspond to "." and ".." */ if (ctx->pos < 2) { ubifs_assert(!file->private_data); - if (!dir_emit_dots(file, ctx)) + if (!dir_emit_dots(file, ctx)) { + if (encrypted) + fscrypt_fname_free_buffer(&fstr); return 0; + } /* Find the first entry in TNC and save it */ lowest_dent_key(c, &key, dir->i_ino); - nm.name = NULL; + fname_len(&nm) = 0; dent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(dent)) { err = PTR_ERR(dent); @@ -499,7 +595,7 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) * Find the entry corresponding to @ctx->pos or the closest one. 
*/ dent_key_init_hash(c, &key, dir->i_ino, ctx->pos); - nm.name = NULL; + fname_len(&nm) = 0; dent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(dent)) { err = PTR_ERR(dent); @@ -516,15 +612,33 @@ static int ubifs_readdir(struct file *file, struct dir_context *ctx) ubifs_assert(le64_to_cpu(dent->ch.sqnum) > ubifs_inode(dir)->creat_sqnum); - nm.len = le16_to_cpu(dent->nlen); - if (!dir_emit(ctx, dent->name, nm.len, + fname_len(&nm) = le16_to_cpu(dent->nlen); + fname_name(&nm) = dent->name; + + if (encrypted) { + fstr.len = fstr_real_len; + + err = fscrypt_fname_disk_to_usr(dir, key_hash_flash(c, + &dent->key), + le32_to_cpu(dent->cookie), + &nm.disk_name, &fstr); + if (err) + goto out; + } else { + fstr.len = fname_len(&nm); + fstr.name = fname_name(&nm); + } + + if (!dir_emit(ctx, fstr.name, fstr.len, le64_to_cpu(dent->inum), - vfs_dent_type(dent->type))) + vfs_dent_type(dent->type))) { + if (encrypted) + fscrypt_fname_free_buffer(&fstr); return 0; + } /* Switch to the next entry */ key_read(c, &dent->key, &key); - nm.name = dent->name; dent = ubifs_tnc_next_ent(c, &key, &nm); if (IS_ERR(dent)) { err = PTR_ERR(dent); @@ -541,6 +655,9 @@ out: kfree(file->private_data); file->private_data = NULL; + if (encrypted) + fscrypt_fname_free_buffer(&fstr); + if (err != -ENOENT) ubifs_err(c, "cannot find next direntry, error %d", err); else @@ -601,6 +718,7 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); struct ubifs_budget_req req = { .new_dent = 1, .dirtied_ino = 2, .dirtied_ino_d = ALIGN(ui->data_len, 8) }; + struct fscrypt_name nm; /* * Budget request settings: new direntry, changing the target inode, @@ -613,13 +731,21 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, ubifs_assert(inode_is_locked(dir)); ubifs_assert(inode_is_locked(inode)); - err = dbg_check_synced_i_size(c, inode); + if (ubifs_crypt_is_encrypted(dir) && + !fscrypt_has_permitted_context(dir, inode)) + return -EPERM; + + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); if (err) return err; + err = dbg_check_synced_i_size(c, inode); + if (err) + goto out_fname; + err = ubifs_budget_space(c, &req); if (err) - return err; + goto out_fname; lock_2_inodes(dir, inode); inc_nlink(inode); @@ -628,13 +754,14 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); d_instantiate(dentry, inode); + fscrypt_free_filename(&nm); return 0; out_cancel: @@ -644,6 +771,8 @@ out_cancel: unlock_2_inodes(dir, inode); ubifs_release_budget(c, &req); iput(inode); +out_fname: + fscrypt_free_filename(&nm); return err; } @@ -652,10 +781,10 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) struct ubifs_info *c = dir->i_sb->s_fs_info; struct inode *inode = d_inode(dentry); struct ubifs_inode *dir_ui = ubifs_inode(dir); - int sz_change = CALC_DENT_SIZE(dentry->d_name.len); - int err, budgeted = 1; + int err, sz_change, budgeted = 1; struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; unsigned int saved_nlink = inode->i_nlink; + struct fscrypt_name nm; /* * Budget request settings: deletion direntry, deletion inode (+1 for @@ -667,16 +796,29 @@ static int ubifs_unlink(struct inode *dir, struct dentry 
*dentry) dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); + + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err && err != -ENOKEY) + return err; + } + + err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm); + if (err) + return err; + + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + ubifs_assert(inode_is_locked(dir)); ubifs_assert(inode_is_locked(inode)); err = dbg_check_synced_i_size(c, inode); if (err) - return err; + goto out_fname; err = ubifs_budget_space(c, &req); if (err) { if (err != -ENOSPC) - return err; + goto out_fname; budgeted = 0; } @@ -686,7 +828,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -698,6 +840,7 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } + fscrypt_free_filename(&nm); return 0; out_cancel: @@ -707,21 +850,23 @@ out_cancel: unlock_2_inodes(dir, inode); if (budgeted) ubifs_release_budget(c, &req); +out_fname: + fscrypt_free_filename(&nm); return err; } /** * check_dir_empty - check if a directory is empty or not. - * @c: UBIFS file-system description object * @dir: VFS inode object of the directory to check * * This function checks if directory @dir is empty. Returns zero if the * directory is empty, %-ENOTEMPTY if it is not, and other negative error codes * in case of errors. */ -static int check_dir_empty(struct ubifs_info *c, struct inode *dir) +int ubifs_check_dir_empty(struct inode *dir) { - struct qstr nm = { .name = NULL }; + struct ubifs_info *c = dir->i_sb->s_fs_info; + struct fscrypt_name nm = { 0 }; struct ubifs_dent_node *dent; union ubifs_key key; int err; @@ -743,10 +888,10 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) { struct ubifs_info *c = dir->i_sb->s_fs_info; struct inode *inode = d_inode(dentry); - int sz_change = CALC_DENT_SIZE(dentry->d_name.len); - int err, budgeted = 1; + int err, sz_change, budgeted = 1; struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_budget_req req = { .mod_dent = 1, .dirtied_ino = 2 }; + struct fscrypt_name nm; /* * Budget request settings: deletion direntry, deletion inode and @@ -758,14 +903,26 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) inode->i_ino, dir->i_ino); ubifs_assert(inode_is_locked(dir)); ubifs_assert(inode_is_locked(inode)); - err = check_dir_empty(c, d_inode(dentry)); + err = ubifs_check_dir_empty(d_inode(dentry)); + if (err) + return err; + + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err && err != -ENOKEY) + return err; + } + + err = fscrypt_setup_filename(dir, &dentry->d_name, 1, &nm); if (err) return err; + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + err = ubifs_budget_space(c, &req); if (err) { if (err != -ENOSPC) - return err; + goto out_fname; budgeted = 0; } @@ -776,7 +933,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) dir->i_size -= sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 1, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 1, 0); if (err) goto out_cancel; unlock_2_inodes(dir, inode); @@ -788,6
+945,7 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) c->bi.nospace = c->bi.nospace_rp = 0; smp_wmb(); } + fscrypt_free_filename(&nm); return 0; out_cancel: @@ -798,6 +956,8 @@ out_cancel: unlock_2_inodes(dir, inode); if (budgeted) ubifs_release_budget(c, &req); +out_fname: + fscrypt_free_filename(&nm); return err; } @@ -806,8 +966,9 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; - int err, sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int err, sz_change; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 }; + struct fscrypt_name nm; /* * Budget request settings: new inode, new direntry and changing parent @@ -821,10 +982,16 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (err) return err; + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + goto out_budg; + + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + inode = ubifs_new_inode(c, dir, S_IFDIR | mode); if (IS_ERR(inode)) { err = PTR_ERR(inode); - goto out_budg; + goto out_fname; } err = ubifs_init_security(dir, inode, &dentry->d_name); @@ -838,7 +1005,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) { ubifs_err(c, "cannot create directory, error %d", err); goto out_cancel; @@ -847,6 +1014,7 @@ static int ubifs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) ubifs_release_budget(c, &req); d_instantiate(dentry, inode); + fscrypt_free_filename(&nm); return 0; out_cancel: @@ -857,6 +1025,8 @@ out_cancel: out_inode: make_bad_inode(inode); iput(inode); +out_fname: + fscrypt_free_filename(&nm); out_budg: ubifs_release_budget(c, &req); return err; @@ -870,11 +1040,12 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; union ubifs_dev_desc *dev = NULL; - int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int sz_change; int err, devlen = 0; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, .new_ino_d = ALIGN(devlen, 8), .dirtied_ino = 1 }; + struct fscrypt_name nm; /* * Budget request settings: new inode, new direntry and changing parent @@ -896,11 +1067,17 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, return err; } + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + goto out_budg; + + sz_change = CALC_DENT_SIZE(fname_len(&nm)); + inode = ubifs_new_inode(c, dir, mode); if (IS_ERR(inode)) { kfree(dev); err = PTR_ERR(inode); - goto out_budg; + goto out_fname; } init_special_inode(inode, inode->i_mode, rdev); @@ -917,7 +1094,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -925,6 +1102,7 @@ static int ubifs_mknod(struct inode *dir, struct dentry *dentry, ubifs_release_budget(c, &req); insert_inode_hash(inode); d_instantiate(dentry, inode); + fscrypt_free_filename(&nm); return 0; 
out_cancel: @@ -934,6 +1112,8 @@ out_cancel: out_inode: make_bad_inode(inode); iput(inode); +out_fname: + fscrypt_free_filename(&nm); out_budg: ubifs_release_budget(c, &req); return err; @@ -947,10 +1127,27 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, struct ubifs_inode *dir_ui = ubifs_inode(dir); struct ubifs_info *c = dir->i_sb->s_fs_info; int err, len = strlen(symname); - int sz_change = CALC_DENT_SIZE(dentry->d_name.len); + int sz_change = CALC_DENT_SIZE(len); + struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1); + struct fscrypt_symlink_data *sd = NULL; struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1, .new_ino_d = ALIGN(len, 8), .dirtied_ino = 1 }; + struct fscrypt_name nm; + + if (ubifs_crypt_is_encrypted(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err) + goto out_budg; + + if (!fscrypt_has_encryption_key(dir)) { + err = -EPERM; + goto out_budg; + } + + disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + + sizeof(struct fscrypt_symlink_data)); + } /* * Budget request settings: new inode, new direntry and changing parent @@ -960,36 +1157,65 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, dbg_gen("dent '%pd', target '%s' in dir ino %lu", dentry, symname, dir->i_ino); - if (len > UBIFS_MAX_INO_DATA) + if (disk_link.len > UBIFS_MAX_INO_DATA) return -ENAMETOOLONG; err = ubifs_budget_space(c, &req); if (err) return err; + err = fscrypt_setup_filename(dir, &dentry->d_name, 0, &nm); + if (err) + goto out_budg; + inode = ubifs_new_inode(c, dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) { err = PTR_ERR(inode); - goto out_budg; + goto out_fname; } ui = ubifs_inode(inode); - ui->data = kmalloc(len + 1, GFP_NOFS); + ui->data = kmalloc(disk_link.len, GFP_NOFS); if (!ui->data) { err = -ENOMEM; goto out_inode; } - memcpy(ui->data, symname, len); - ((char *)ui->data)[len] = '\0'; - inode->i_link = ui->data; + if (ubifs_crypt_is_encrypted(dir)) { + struct qstr istr = QSTR_INIT(symname, len); + struct fscrypt_str ostr; + + sd = kzalloc(disk_link.len, GFP_NOFS); + if (!sd) { + err = -ENOMEM; + goto out_inode; + } + + ostr.name = sd->encrypted_path; + ostr.len = disk_link.len; + + err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); + if (err) { + kfree(sd); + goto out_inode; + } + + sd->len = cpu_to_le16(ostr.len); + disk_link.name = (char *)sd; + } else { + inode->i_link = ui->data; + } + + memcpy(ui->data, disk_link.name, disk_link.len); + ((char *)ui->data)[disk_link.len - 1] = '\0'; + /* * The terminating zero byte is not written to the flash media and it * is put just to make later in-memory string processing simpler. Thus, * data length is @len, not @len + %1. 
*/ - ui->data_len = len; - inode->i_size = ubifs_inode(inode)->ui_size = len; + ui->data_len = disk_link.len - 1; + inode->i_size = ubifs_inode(inode)->ui_size = disk_link.len - 1; err = ubifs_init_security(dir, inode, &dentry->d_name); if (err) @@ -999,7 +1225,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, dir->i_size += sz_change; dir_ui->ui_size = dir->i_size; dir->i_mtime = dir->i_ctime = inode->i_ctime; - err = ubifs_jnl_update(c, dir, &dentry->d_name, inode, 0, 0); + err = ubifs_jnl_update(c, dir, &nm, inode, 0, 0); if (err) goto out_cancel; mutex_unlock(&dir_ui->ui_mutex); @@ -1007,6 +1233,7 @@ static int ubifs_symlink(struct inode *dir, struct dentry *dentry, ubifs_release_budget(c, &req); insert_inode_hash(inode); d_instantiate(dentry, inode); + fscrypt_free_filename(&nm); return 0; out_cancel: @@ -1016,6 +1243,8 @@ out_cancel: out_inode: make_bad_inode(inode); iput(inode); +out_fname: + fscrypt_free_filename(&nm); out_budg: ubifs_release_budget(c, &req); return err; @@ -1078,15 +1307,14 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, struct ubifs_inode *whiteout_ui = NULL; int err, release, sync = 0, move = (new_dir != old_dir); int is_dir = S_ISDIR(old_inode->i_mode); - int unlink = !!new_inode; - int new_sz = CALC_DENT_SIZE(new_dentry->d_name.len); - int old_sz = CALC_DENT_SIZE(old_dentry->d_name.len); + int unlink = !!new_inode, new_sz, old_sz; struct ubifs_budget_req req = { .new_dent = 1, .mod_dent = 1, .dirtied_ino = 3 }; struct ubifs_budget_req ino_req = { .dirtied_ino = 1, .dirtied_ino_d = ALIGN(old_inode_ui->data_len, 8) }; struct timespec time; unsigned int uninitialized_var(saved_nlink); + struct fscrypt_name old_nm, new_nm; if (flags & ~RENAME_NOREPLACE) return -EINVAL; @@ -1107,17 +1335,41 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, if (unlink) ubifs_assert(inode_is_locked(new_inode)); + if (old_dir != new_dir) { + if (ubifs_crypt_is_encrypted(new_dir) && + !fscrypt_has_permitted_context(new_dir, old_inode)) + return -EPERM; + } + if (unlink && is_dir) { - err = check_dir_empty(c, new_inode); + err = ubifs_check_dir_empty(new_inode); if (err) return err; } - err = ubifs_budget_space(c, &req); + err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &old_nm); if (err) return err; + + err = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &new_nm); + if (err) { + fscrypt_free_filename(&old_nm); + return err; + } + + new_sz = CALC_DENT_SIZE(fname_len(&new_nm)); + old_sz = CALC_DENT_SIZE(fname_len(&old_nm)); + + err = ubifs_budget_space(c, &req); + if (err) { + fscrypt_free_filename(&old_nm); + fscrypt_free_filename(&new_nm); + return err; + } err = ubifs_budget_space(c, &ino_req); if (err) { + fscrypt_free_filename(&old_nm); + fscrypt_free_filename(&new_nm); ubifs_release_budget(c, &req); return err; } @@ -1239,8 +1491,8 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, iput(whiteout); } - err = ubifs_jnl_rename(c, old_dir, old_dentry, new_dir, new_dentry, whiteout, - sync); + err = ubifs_jnl_rename(c, old_dir, old_inode, &old_nm, new_dir, + new_inode, &new_nm, whiteout, sync); if (err) goto out_cancel; @@ -1256,6 +1508,9 @@ static int do_rename(struct inode *old_dir, struct dentry *old_dentry, ubifs_release_budget(c, &ino_req); if (IS_SYNC(old_inode)) err = old_inode->i_sb->s_op->write_inode(old_inode, NULL); + + fscrypt_free_filename(&old_nm); + fscrypt_free_filename(&new_nm); return err; out_cancel: @@ -1284,6 +1539,8 @@ out_cancel: 
unlock_4_inodes(old_dir, new_dir, new_inode, whiteout); ubifs_release_budget(c, &ino_req); ubifs_release_budget(c, &req); + fscrypt_free_filename(&old_nm); + fscrypt_free_filename(&new_nm); return err; } @@ -1298,9 +1555,27 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, struct inode *snd_inode = d_inode(new_dentry); struct timespec time; int err; + struct fscrypt_name fst_nm, snd_nm; ubifs_assert(fst_inode && snd_inode); + if ((ubifs_crypt_is_encrypted(old_dir) || + ubifs_crypt_is_encrypted(new_dir)) && + (old_dir != new_dir) && + (!fscrypt_has_permitted_context(new_dir, fst_inode) || + !fscrypt_has_permitted_context(old_dir, snd_inode))) + return -EPERM; + + err = fscrypt_setup_filename(old_dir, &old_dentry->d_name, 0, &fst_nm); + if (err) + return err; + + err = fscrypt_setup_filename(new_dir, &new_dentry->d_name, 0, &snd_nm); + if (err) { + fscrypt_free_filename(&fst_nm); + return err; + } + lock_4_inodes(old_dir, new_dir, NULL, NULL); time = ubifs_current_time(old_dir); @@ -1320,12 +1595,14 @@ static int ubifs_xrename(struct inode *old_dir, struct dentry *old_dentry, } } - err = ubifs_jnl_xrename(c, old_dir, old_dentry, new_dir, new_dentry, - sync); + err = ubifs_jnl_xrename(c, old_dir, fst_inode, &fst_nm, new_dir, + snd_inode, &snd_nm, sync); unlock_4_inodes(old_dir, new_dir, NULL, NULL); ubifs_release_budget(c, &req); + fscrypt_free_filename(&fst_nm); + fscrypt_free_filename(&snd_nm); return err; } @@ -1384,6 +1661,14 @@ int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, return 0; } +static int ubifs_dir_open(struct inode *dir, struct file *file) +{ + if (ubifs_crypt_is_encrypted(dir)) + return fscrypt_get_encryption_info(dir) ? -EACCES : 0; + + return 0; +} + const struct inode_operations ubifs_dir_inode_operations = { .lookup = ubifs_lookup, .create = ubifs_create, @@ -1410,6 +1695,7 @@ const struct file_operations ubifs_dir_operations = { .iterate_shared = ubifs_readdir, .fsync = ubifs_fsync, .unlocked_ioctl = ubifs_ioctl, + .open = ubifs_dir_open, #ifdef CONFIG_COMPAT .compat_ioctl = ubifs_compat_ioctl, #endif diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index b4fbeefba246..b0d783774c96 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -78,6 +78,13 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, goto dump; dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + + if (ubifs_crypt_is_encrypted(inode)) { + err = ubifs_decrypt(inode, dn, &dlen, block); + if (err) + goto dump; + } + out_len = UBIFS_BLOCK_SIZE; err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len, le16_to_cpu(dn->compr_type)); @@ -650,6 +657,13 @@ static int populate_page(struct ubifs_info *c, struct page *page, dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; out_len = UBIFS_BLOCK_SIZE; + + if (ubifs_crypt_is_encrypted(inode)) { + err = ubifs_decrypt(inode, dn, &dlen, page_block); + if (err) + goto out_err; + } + err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len, le16_to_cpu(dn->compr_type)); if (err || len != out_len) @@ -1594,6 +1608,15 @@ static const struct vm_operations_struct ubifs_file_vm_ops = { static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma) { int err; + struct inode *inode = file->f_mapping->host; + + if (ubifs_crypt_is_encrypted(inode)) { + err = fscrypt_get_encryption_info(inode); + if (err) + return -EACCES; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } err = generic_file_mmap(file, vma); if (err) @@ -1605,6 +1628,88 @@ static int ubifs_file_mmap(struct file *file, 
struct vm_area_struct *vma) return 0; } +static int ubifs_file_open(struct inode *inode, struct file *filp) +{ + int ret; + struct dentry *dir; + struct ubifs_info *c = inode->i_sb->s_fs_info; + + if (ubifs_crypt_is_encrypted(inode)) { + ret = fscrypt_get_encryption_info(inode); + if (ret) + return -EACCES; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } + + dir = dget_parent(file_dentry(filp)); + if (ubifs_crypt_is_encrypted(d_inode(dir)) && + !fscrypt_has_permitted_context(d_inode(dir), inode)) { + ubifs_err(c, "Inconsistent encryption contexts: %lu/%lu", + (unsigned long) d_inode(dir)->i_ino, + (unsigned long) inode->i_ino); + dput(dir); + ubifs_ro_mode(c, -EPERM); + return -EPERM; + } + dput(dir); + + return 0; +} + +static const char *ubifs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + int err; + struct fscrypt_symlink_data *sd; + struct ubifs_inode *ui = ubifs_inode(inode); + struct fscrypt_str cstr; + struct fscrypt_str pstr; + + if (!ubifs_crypt_is_encrypted(inode)) + return ui->data; + + if (!dentry) + return ERR_PTR(-ECHILD); + + err = fscrypt_get_encryption_info(inode); + if (err) + return ERR_PTR(err); + + sd = (struct fscrypt_symlink_data *)ui->data; + cstr.name = sd->encrypted_path; + cstr.len = le16_to_cpu(sd->len); + + if (cstr.len == 0) + return ERR_PTR(-ENOENT); + + if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > ui->data_len) + return ERR_PTR(-EIO); + + err = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); + if (err) + return ERR_PTR(err); + + err = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); + if (err) { + fscrypt_fname_free_buffer(&pstr); + return ERR_PTR(err); + } + + pstr.name[pstr.len] = '\0'; + + // XXX this probably won't happen anymore... + if (pstr.name[0] == '\0') { + fscrypt_fname_free_buffer(&pstr); + return ERR_PTR(-ENOENT); + } + + set_delayed_call(done, kfree_link, pstr.name); + return pstr.name; +} + + const struct address_space_operations ubifs_file_address_operations = { .readpage = ubifs_readpage, .writepage = ubifs_writepage, @@ -1628,8 +1733,7 @@ const struct inode_operations ubifs_file_inode_operations = { }; const struct inode_operations ubifs_symlink_inode_operations = { - .readlink = generic_readlink, - .get_link = simple_get_link, + .get_link = ubifs_get_link, .setattr = ubifs_setattr, .getattr = ubifs_getattr, .listxattr = ubifs_listxattr, @@ -1647,6 +1751,7 @@ const struct file_operations ubifs_file_operations = { .unlocked_ioctl = ubifs_ioctl, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, + .open = ubifs_file_open, #ifdef CONFIG_COMPAT .compat_ioctl = ubifs_compat_ioctl, #endif diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index e845c64b6ce1..7b35e3d6cde7 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -846,10 +846,6 @@ int ubifs_gc_start_commit(struct ubifs_info *c) */ while (1) { lp = ubifs_fast_find_freeable(c); - if (IS_ERR(lp)) { - err = PTR_ERR(lp); - goto out; - } if (!lp) break; ubifs_assert(!(lp->flags & LPROPS_TAKEN)); diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 97be41215332..3be28900bf37 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -452,16 +452,22 @@ static enum hrtimer_restart wbuf_timer_callback_nolock(struct hrtimer *timer) */ static void new_wbuf_timer_nolock(struct ubifs_wbuf *wbuf) { + ktime_t softlimit = ms_to_ktime(dirty_writeback_interval * 10); + unsigned long long delta = dirty_writeback_interval; + + /* centi to milli, milli to nano, then 10% */ + delta *= 10ULL * NSEC_PER_MSEC / 10ULL; + 
ubifs_assert(!hrtimer_active(&wbuf->timer)); + ubifs_assert(delta <= ULONG_MAX); if (wbuf->no_timer) return; dbg_io("set timer for jhead %s, %llu-%llu millisecs", dbg_jhead(wbuf->jhead), - div_u64(ktime_to_ns(wbuf->softlimit), USEC_PER_SEC), - div_u64(ktime_to_ns(wbuf->softlimit) + wbuf->delta, - USEC_PER_SEC)); - hrtimer_start_range_ns(&wbuf->timer, wbuf->softlimit, wbuf->delta, + div_u64(ktime_to_ns(softlimit), USEC_PER_SEC), + div_u64(ktime_to_ns(softlimit) + delta, USEC_PER_SEC)); + hrtimer_start_range_ns(&wbuf->timer, softlimit, delta, HRTIMER_MODE_REL); } @@ -1059,10 +1065,6 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); wbuf->timer.function = wbuf_timer_callback_nolock; - wbuf->softlimit = ktime_set(WBUF_TIMEOUT_SOFTLIMIT, 0); - wbuf->delta = WBUF_TIMEOUT_HARDLIMIT - WBUF_TIMEOUT_SOFTLIMIT; - wbuf->delta *= 1000000000ULL; - ubifs_assert(wbuf->delta <= ULONG_MAX); return 0; } diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c index 3c7b29de0ca7..da519ba205f6 100644 --- a/fs/ubifs/ioctl.c +++ b/fs/ubifs/ioctl.c @@ -181,6 +181,26 @@ long ubifs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) mnt_drop_write_file(file); return err; } + case FS_IOC_SET_ENCRYPTION_POLICY: { +#ifdef CONFIG_UBIFS_FS_ENCRYPTION + struct ubifs_info *c = inode->i_sb->s_fs_info; + + err = ubifs_enable_encryption(c); + if (err) + return err; + + return fscrypt_ioctl_set_policy(file, (const void __user *)arg); +#else + return -EOPNOTSUPP; +#endif + } + case FS_IOC_GET_ENCRYPTION_POLICY: { +#ifdef CONFIG_UBIFS_FS_ENCRYPTION + return fscrypt_ioctl_get_policy(file, (void __user *)arg); +#else + return -EOPNOTSUPP; +#endif + } default: return -ENOTTY; @@ -197,6 +217,9 @@ long ubifs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case FS_IOC32_SETFLAGS: cmd = FS_IOC_SETFLAGS; break; + case FS_IOC_SET_ENCRYPTION_POLICY: + case FS_IOC_GET_ENCRYPTION_POLICY: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 91bc76dc559e..294519b98874 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -78,16 +78,6 @@ static inline void zero_ino_node_unused(struct ubifs_ino_node *ino) static inline void zero_dent_node_unused(struct ubifs_dent_node *dent) { dent->padding1 = 0; - memset(dent->padding2, 0, 4); -} - -/** - * zero_data_node_unused - zero out unused fields of an on-flash data node. - * @data: the data node to zero out - */ -static inline void zero_data_node_unused(struct ubifs_data_node *data) -{ - memset(data->padding, 0, 2); } /** @@ -511,6 +501,14 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) ui->dirty = 0; } +static void set_dent_cookie(struct ubifs_info *c, struct ubifs_dent_node *dent) +{ + if (c->double_hash) + dent->cookie = prandom_u32(); + else + dent->cookie = 0; +} + /** * ubifs_jnl_update - update inode. * @c: UBIFS file-system description object @@ -539,7 +537,7 @@ static void mark_inode_clean(struct ubifs_info *c, struct ubifs_inode *ui) * success. In case of failure, a negative error code is returned. 
*/ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, - const struct qstr *nm, const struct inode *inode, + const struct fscrypt_name *nm, const struct inode *inode, int deletion, int xent) { int err, dlen, ilen, len, lnum, ino_offs, dent_offs; @@ -551,11 +549,11 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, struct ubifs_ino_node *ino; union ubifs_key dent_key, ino_key; - dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu", - inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino); + //dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu", + // inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino); ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); - dlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + dlen = UBIFS_DENT_NODE_SZ + fname_len(nm) + 1; ilen = UBIFS_INO_NODE_SZ; /* @@ -596,9 +594,11 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, key_write(c, &dent_key, dent->key); dent->inum = deletion ? 0 : cpu_to_le64(inode->i_ino); dent->type = get_dent_type(inode->i_mode); - dent->nlen = cpu_to_le16(nm->len); - memcpy(dent->name, nm->name, nm->len); - dent->name[nm->len] = '\0'; + dent->nlen = cpu_to_le16(fname_len(nm)); + memcpy(dent->name, fname_name(nm), fname_len(nm)); + dent->name[fname_len(nm)] = '\0'; + set_dent_cookie(c, dent); + zero_dent_node_unused(dent); ubifs_prep_grp_node(c, dent, dlen, 0); @@ -697,14 +697,18 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, int len) { struct ubifs_data_node *data; - int err, lnum, offs, compr_type, out_len; + int err, lnum, offs, compr_type, out_len, compr_len; int dlen = COMPRESSED_DATA_NODE_BUF_SZ, allocated = 1; struct ubifs_inode *ui = ubifs_inode(inode); + bool encrypted = ubifs_crypt_is_encrypted(inode); dbg_jnlk(key, "ino %lu, blk %u, len %d, key ", (unsigned long)key_inum(c, key), key_block(c, key), len); ubifs_assert(len <= UBIFS_BLOCK_SIZE); + if (encrypted) + dlen += UBIFS_CIPHER_BLOCK_SIZE; + data = kmalloc(dlen, GFP_NOFS | __GFP_NOWARN); if (!data) { /* @@ -722,7 +726,6 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, data->ch.node_type = UBIFS_DATA_NODE; key_write(c, key, &data->key); data->size = cpu_to_le32(len); - zero_data_node_unused(data); if (!(ui->flags & UBIFS_COMPR_FL)) /* Compression is disabled for this inode */ @@ -730,9 +733,19 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, else compr_type = ui->compr_type; - out_len = dlen - UBIFS_DATA_NODE_SZ; - ubifs_compress(c, buf, len, &data->data, &out_len, &compr_type); - ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); + out_len = compr_len = dlen - UBIFS_DATA_NODE_SZ; + ubifs_compress(c, buf, len, &data->data, &compr_len, &compr_type); + ubifs_assert(compr_len <= UBIFS_BLOCK_SIZE); + + if (encrypted) { + err = ubifs_encrypt(inode, data, compr_len, &out_len, key_block(c, key)); + if (err) + goto out_free; + + } else { + data->compr_size = 0; + out_len = compr_len; + } dlen = UBIFS_DATA_NODE_SZ + out_len; data->compr_type = cpu_to_le16(compr_type); @@ -911,9 +924,11 @@ int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) * ubifs_jnl_xrename - cross rename two directory entries. 
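 * (This is the journal half of a RENAME_EXCHANGE-style cross rename: both
 * names survive and only the inodes behind the two directory entries are
 * swapped, which is why one grouped write containing the two dent nodes
 * built below is sufficient.)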
* @c: UBIFS file-system description object * @fst_dir: parent inode of 1st directory entry to exchange - * @fst_dentry: 1st directory entry to exchange + * @fst_inode: 1st inode to exchange + * @fst_nm: name of 1st inode to exchange * @snd_dir: parent inode of 2nd directory entry to exchange - * @snd_dentry: 2nd directory entry to exchange + * @snd_inode: 2nd inode to exchange + * @snd_nm: name of 2nd inode to exchange * @sync: non-zero if the write-buffer has to be synchronized * * This function implements the cross rename operation which may involve @@ -922,29 +937,29 @@ int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode) * returned. */ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, - const struct dentry *fst_dentry, + const struct inode *fst_inode, + const struct fscrypt_name *fst_nm, const struct inode *snd_dir, - const struct dentry *snd_dentry, int sync) + const struct inode *snd_inode, + const struct fscrypt_name *snd_nm, int sync) { union ubifs_key key; struct ubifs_dent_node *dent1, *dent2; int err, dlen1, dlen2, lnum, offs, len, plen = UBIFS_INO_NODE_SZ; int aligned_dlen1, aligned_dlen2; int twoparents = (fst_dir != snd_dir); - const struct inode *fst_inode = d_inode(fst_dentry); - const struct inode *snd_inode = d_inode(snd_dentry); void *p; - dbg_jnl("dent '%pd' in dir ino %lu between dent '%pd' in dir ino %lu", - fst_dentry, fst_dir->i_ino, snd_dentry, snd_dir->i_ino); + //dbg_jnl("dent '%pd' in dir ino %lu between dent '%pd' in dir ino %lu", + // fst_dentry, fst_dir->i_ino, snd_dentry, snd_dir->i_ino); ubifs_assert(ubifs_inode(fst_dir)->data_len == 0); ubifs_assert(ubifs_inode(snd_dir)->data_len == 0); ubifs_assert(mutex_is_locked(&ubifs_inode(fst_dir)->ui_mutex)); ubifs_assert(mutex_is_locked(&ubifs_inode(snd_dir)->ui_mutex)); - dlen1 = UBIFS_DENT_NODE_SZ + snd_dentry->d_name.len + 1; - dlen2 = UBIFS_DENT_NODE_SZ + fst_dentry->d_name.len + 1; + dlen1 = UBIFS_DENT_NODE_SZ + fname_len(snd_nm) + 1; + dlen2 = UBIFS_DENT_NODE_SZ + fname_len(fst_nm) + 1; aligned_dlen1 = ALIGN(dlen1, 8); aligned_dlen2 = ALIGN(dlen2, 8); @@ -963,24 +978,24 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, /* Make new dent for 1st entry */ dent1->ch.node_type = UBIFS_DENT_NODE; - dent_key_init_flash(c, &dent1->key, snd_dir->i_ino, &snd_dentry->d_name); + dent_key_init_flash(c, &dent1->key, snd_dir->i_ino, snd_nm); dent1->inum = cpu_to_le64(fst_inode->i_ino); dent1->type = get_dent_type(fst_inode->i_mode); - dent1->nlen = cpu_to_le16(snd_dentry->d_name.len); - memcpy(dent1->name, snd_dentry->d_name.name, snd_dentry->d_name.len); - dent1->name[snd_dentry->d_name.len] = '\0'; + dent1->nlen = cpu_to_le16(fname_len(snd_nm)); + memcpy(dent1->name, fname_name(snd_nm), fname_len(snd_nm)); + dent1->name[fname_len(snd_nm)] = '\0'; zero_dent_node_unused(dent1); ubifs_prep_grp_node(c, dent1, dlen1, 0); /* Make new dent for 2nd entry */ dent2 = (void *)dent1 + aligned_dlen1; dent2->ch.node_type = UBIFS_DENT_NODE; - dent_key_init_flash(c, &dent2->key, fst_dir->i_ino, &fst_dentry->d_name); + dent_key_init_flash(c, &dent2->key, fst_dir->i_ino, fst_nm); dent2->inum = cpu_to_le64(snd_inode->i_ino); dent2->type = get_dent_type(snd_inode->i_mode); - dent2->nlen = cpu_to_le16(fst_dentry->d_name.len); - memcpy(dent2->name, fst_dentry->d_name.name, fst_dentry->d_name.len); - dent2->name[fst_dentry->d_name.len] = '\0'; + dent2->nlen = cpu_to_le16(fname_len(fst_nm)); + memcpy(dent2->name, fname_name(fst_nm), fname_len(fst_nm)); + 
dent2->name[fname_len(fst_nm)] = '\0'; zero_dent_node_unused(dent2); ubifs_prep_grp_node(c, dent2, dlen2, 0); @@ -1004,14 +1019,14 @@ int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, } release_head(c, BASEHD); - dent_key_init(c, &key, snd_dir->i_ino, &snd_dentry->d_name); - err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &snd_dentry->d_name); + dent_key_init(c, &key, snd_dir->i_ino, snd_nm); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, snd_nm); if (err) goto out_ro; offs += aligned_dlen1; - dent_key_init(c, &key, fst_dir->i_ino, &fst_dentry->d_name); - err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, &fst_dentry->d_name); + dent_key_init(c, &key, fst_dir->i_ino, fst_nm); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, fst_nm); if (err) goto out_ro; @@ -1063,31 +1078,31 @@ out_free: * returned. */ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, - const struct dentry *old_dentry, + const struct inode *old_inode, + const struct fscrypt_name *old_nm, const struct inode *new_dir, - const struct dentry *new_dentry, + const struct inode *new_inode, + const struct fscrypt_name *new_nm, const struct inode *whiteout, int sync) { void *p; union ubifs_key key; struct ubifs_dent_node *dent, *dent2; int err, dlen1, dlen2, ilen, lnum, offs, len; - const struct inode *old_inode = d_inode(old_dentry); - const struct inode *new_inode = d_inode(new_dentry); int aligned_dlen1, aligned_dlen2, plen = UBIFS_INO_NODE_SZ; int last_reference = !!(new_inode && new_inode->i_nlink == 0); int move = (old_dir != new_dir); struct ubifs_inode *uninitialized_var(new_ui); - dbg_jnl("dent '%pd' in dir ino %lu to dent '%pd' in dir ino %lu", - old_dentry, old_dir->i_ino, new_dentry, new_dir->i_ino); + //dbg_jnl("dent '%pd' in dir ino %lu to dent '%pd' in dir ino %lu", + // old_dentry, old_dir->i_ino, new_dentry, new_dir->i_ino); ubifs_assert(ubifs_inode(old_dir)->data_len == 0); ubifs_assert(ubifs_inode(new_dir)->data_len == 0); ubifs_assert(mutex_is_locked(&ubifs_inode(old_dir)->ui_mutex)); ubifs_assert(mutex_is_locked(&ubifs_inode(new_dir)->ui_mutex)); - dlen1 = UBIFS_DENT_NODE_SZ + new_dentry->d_name.len + 1; - dlen2 = UBIFS_DENT_NODE_SZ + old_dentry->d_name.len + 1; + dlen1 = UBIFS_DENT_NODE_SZ + fname_len(new_nm) + 1; + dlen2 = UBIFS_DENT_NODE_SZ + fname_len(old_nm) + 1; if (new_inode) { new_ui = ubifs_inode(new_inode); ubifs_assert(mutex_is_locked(&new_ui->ui_mutex)); @@ -1113,19 +1128,19 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, /* Make new dent */ dent->ch.node_type = UBIFS_DENT_NODE; - dent_key_init_flash(c, &dent->key, new_dir->i_ino, &new_dentry->d_name); + dent_key_init_flash(c, &dent->key, new_dir->i_ino, new_nm); dent->inum = cpu_to_le64(old_inode->i_ino); dent->type = get_dent_type(old_inode->i_mode); - dent->nlen = cpu_to_le16(new_dentry->d_name.len); - memcpy(dent->name, new_dentry->d_name.name, new_dentry->d_name.len); - dent->name[new_dentry->d_name.len] = '\0'; + dent->nlen = cpu_to_le16(fname_len(new_nm)); + memcpy(dent->name, fname_name(new_nm), fname_len(new_nm)); + dent->name[fname_len(new_nm)] = '\0'; + set_dent_cookie(c, dent); zero_dent_node_unused(dent); ubifs_prep_grp_node(c, dent, dlen1, 0); dent2 = (void *)dent + aligned_dlen1; dent2->ch.node_type = UBIFS_DENT_NODE; - dent_key_init_flash(c, &dent2->key, old_dir->i_ino, - &old_dentry->d_name); + dent_key_init_flash(c, &dent2->key, old_dir->i_ino, old_nm); if (whiteout) { dent2->inum = cpu_to_le64(whiteout->i_ino); @@ -1135,9 +1150,10 @@ int 
ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, dent2->inum = 0; dent2->type = DT_UNKNOWN; } - dent2->nlen = cpu_to_le16(old_dentry->d_name.len); - memcpy(dent2->name, old_dentry->d_name.name, old_dentry->d_name.len); - dent2->name[old_dentry->d_name.len] = '\0'; + dent2->nlen = cpu_to_le16(fname_len(old_nm)); + memcpy(dent2->name, fname_name(old_nm), fname_len(old_nm)); + dent2->name[fname_len(old_nm)] = '\0'; + set_dent_cookie(c, dent2); zero_dent_node_unused(dent2); ubifs_prep_grp_node(c, dent2, dlen2, 0); @@ -1178,15 +1194,15 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, } release_head(c, BASEHD); - dent_key_init(c, &key, new_dir->i_ino, &new_dentry->d_name); - err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, &new_dentry->d_name); + dent_key_init(c, &key, new_dir->i_ino, new_nm); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen1, new_nm); if (err) goto out_ro; offs += aligned_dlen1; if (whiteout) { - dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name); - err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, &old_dentry->d_name); + dent_key_init(c, &key, old_dir->i_ino, old_nm); + err = ubifs_tnc_add_nm(c, &key, lnum, offs, dlen2, old_nm); if (err) goto out_ro; @@ -1196,8 +1212,8 @@ int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, if (err) goto out_ro; - dent_key_init(c, &key, old_dir->i_ino, &old_dentry->d_name); - err = ubifs_tnc_remove_nm(c, &key, &old_dentry->d_name); + dent_key_init(c, &key, old_dir->i_ino, old_nm); + err = ubifs_tnc_remove_nm(c, &key, old_nm); if (err) goto out_ro; } @@ -1251,35 +1267,60 @@ out_free: } /** - * recomp_data_node - re-compress a truncated data node. + * truncate_data_node - re-compress/encrypt a truncated data node. + * @c: UBIFS file-system description object + * @inode: inode which referes to the data node + * @block: data block number * @dn: data node to re-compress * @new_len: new length * * This function is used when an inode is truncated and the last data node of - * the inode has to be re-compressed and re-written. + * the inode has to be re-compressed/encrypted and re-written. 
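+ *
+ * Sketch of the order of operations below for the encrypted and
+ * compressed case:
+ *
+ *   ubifs_decrypt(dn)            ciphertext -> compressed plaintext
+ *   ubifs_decompress(dn -> buf)  compressed -> raw block data
+ *   ubifs_compress(buf -> dn)    re-compress only the first *new_len bytes
+ *   ubifs_encrypt(dn)            re-encrypt the shortened payload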
*/ -static int recomp_data_node(const struct ubifs_info *c, - struct ubifs_data_node *dn, int *new_len) +static int truncate_data_node(const struct ubifs_info *c, const struct inode *inode, + unsigned int block, struct ubifs_data_node *dn, + int *new_len) { void *buf; - int err, len, compr_type, out_len; + int err, dlen, compr_type, out_len, old_dlen; out_len = le32_to_cpu(dn->size); buf = kmalloc(out_len * WORST_COMPR_FACTOR, GFP_NOFS); if (!buf) return -ENOMEM; - len = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; + dlen = old_dlen = le32_to_cpu(dn->ch.len) - UBIFS_DATA_NODE_SZ; compr_type = le16_to_cpu(dn->compr_type); - err = ubifs_decompress(c, &dn->data, len, buf, &out_len, compr_type); - if (err) - goto out; - ubifs_compress(c, buf, *new_len, &dn->data, &out_len, &compr_type); + if (ubifs_crypt_is_encrypted(inode)) { + err = ubifs_decrypt(inode, dn, &dlen, block); + if (err) + goto out; + } + + if (compr_type != UBIFS_COMPR_NONE) { + err = ubifs_decompress(c, &dn->data, dlen, buf, &out_len, compr_type); + if (err) + goto out; + + ubifs_compress(c, buf, *new_len, &dn->data, &out_len, &compr_type); + } + + if (ubifs_crypt_is_encrypted(inode)) { + err = ubifs_encrypt(inode, dn, out_len, &old_dlen, block); + if (err) + goto out; + + out_len = old_dlen; + } else { + dn->compr_size = 0; + } + ubifs_assert(out_len <= UBIFS_BLOCK_SIZE); dn->compr_type = cpu_to_le16(compr_type); dn->size = cpu_to_le32(*new_len); *new_len = UBIFS_DATA_NODE_SZ + out_len; + err = 0; out: kfree(buf); return err; @@ -1347,17 +1388,9 @@ int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, if (le32_to_cpu(dn->size) <= dlen) dlen = 0; /* Nothing to do */ else { - int compr_type = le16_to_cpu(dn->compr_type); - - if (compr_type != UBIFS_COMPR_NONE) { - err = recomp_data_node(c, dn, &dlen); - if (err) - goto out_free; - } else { - dn->size = cpu_to_le32(dlen); - dlen += UBIFS_DATA_NODE_SZ; - } - zero_data_node_unused(dn); + err = truncate_data_node(c, inode, blk, dn, &dlen); + if (err) + goto out_free; } } } @@ -1442,7 +1475,8 @@ out_free: * error code in case of failure. */ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, - const struct inode *inode, const struct qstr *nm) + const struct inode *inode, + const struct fscrypt_name *nm) { int err, xlen, hlen, len, lnum, xent_offs, aligned_xlen; struct ubifs_dent_node *xent; @@ -1451,9 +1485,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, int sync = IS_DIRSYNC(host); struct ubifs_inode *host_ui = ubifs_inode(host); - dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d", - host->i_ino, inode->i_ino, nm->name, - ubifs_inode(inode)->data_len); + //dbg_jnl("host %lu, xattr ino %lu, name '%s', data len %d", + // host->i_ino, inode->i_ino, nm->name, + // ubifs_inode(inode)->data_len); ubifs_assert(inode->i_nlink == 0); ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); @@ -1461,7 +1495,7 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, * Since we are deleting the inode, we do not bother to attach any data * to it and assume its length is %UBIFS_INO_NODE_SZ. 
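 *
 * For reference, the space reserved for the grouped write below works out
 * to roughly:
 *
 *   xlen = UBIFS_DENT_NODE_SZ + fname_len(nm) + 1             (deletion xentry)
 *   len  = ALIGN(xlen, 8) + UBIFS_INO_NODE_SZ                 (xattr inode)
 *          + ALIGN(host_ui->data_len + UBIFS_INO_NODE_SZ, 8)  (host inode)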
*/ - xlen = UBIFS_DENT_NODE_SZ + nm->len + 1; + xlen = UBIFS_DENT_NODE_SZ + fname_len(nm) + 1; aligned_xlen = ALIGN(xlen, 8); hlen = host_ui->data_len + UBIFS_INO_NODE_SZ; len = aligned_xlen + UBIFS_INO_NODE_SZ + ALIGN(hlen, 8); @@ -1482,9 +1516,9 @@ int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, key_write(c, &xent_key, xent->key); xent->inum = 0; xent->type = get_dent_type(inode->i_mode); - xent->nlen = cpu_to_le16(nm->len); - memcpy(xent->name, nm->name, nm->len); - xent->name[nm->len] = '\0'; + xent->nlen = cpu_to_le16(fname_len(nm)); + memcpy(xent->name, fname_name(nm), fname_len(nm)); + xent->name[fname_len(nm)] = '\0'; zero_dent_node_unused(xent); ubifs_prep_grp_node(c, xent, xlen, 0); diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h index c0a95e393347..7547be512db2 100644 --- a/fs/ubifs/key.h +++ b/fs/ubifs/key.h @@ -69,7 +69,7 @@ static inline uint32_t key_r5_hash(const char *s, int len) uint32_t a = 0; const signed char *str = (const signed char *)s; - while (*str) { + while (len--) { a += *str << 4; a += *str >> 4; a *= 11; @@ -153,13 +153,13 @@ static inline void highest_ino_key(const struct ubifs_info *c, * @c: UBIFS file-system description object * @key: key to initialize * @inum: parent inode number - * @nm: direntry name and length + * @nm: direntry name and length. Not a string when encrypted! */ static inline void dent_key_init(const struct ubifs_info *c, union ubifs_key *key, ino_t inum, - const struct qstr *nm) + const struct fscrypt_name *nm) { - uint32_t hash = c->key_hash(nm->name, nm->len); + uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm)); ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); key->u32[0] = inum; @@ -191,10 +191,11 @@ static inline void dent_key_init_hash(const struct ubifs_info *c, * @nm: direntry name and length */ static inline void dent_key_init_flash(const struct ubifs_info *c, void *k, - ino_t inum, const struct qstr *nm) + ino_t inum, + const struct fscrypt_name *nm) { union ubifs_key *key = k; - uint32_t hash = c->key_hash(nm->name, nm->len); + uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm)); ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); key->j32[0] = cpu_to_le32(inum); @@ -225,9 +226,9 @@ static inline void lowest_dent_key(const struct ubifs_info *c, */ static inline void xent_key_init(const struct ubifs_info *c, union ubifs_key *key, ino_t inum, - const struct qstr *nm) + const struct fscrypt_name *nm) { - uint32_t hash = c->key_hash(nm->name, nm->len); + uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm)); ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); key->u32[0] = inum; @@ -242,10 +243,10 @@ static inline void xent_key_init(const struct ubifs_info *c, * @nm: extended attribute entry name and length */ static inline void xent_key_init_flash(const struct ubifs_info *c, void *k, - ino_t inum, const struct qstr *nm) + ino_t inum, const struct fscrypt_name *nm) { union ubifs_key *key = k; - uint32_t hash = c->key_hash(nm->name, nm->len); + uint32_t hash = c->key_hash(fname_name(nm), fname_len(nm)); ubifs_assert(!(hash & ~UBIFS_S_KEY_HASH_MASK)); key->j32[0] = cpu_to_le32(inum); diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c index fb0f44cd1e28..ae5c02f22f3e 100644 --- a/fs/ubifs/replay.c +++ b/fs/ubifs/replay.c @@ -61,7 +61,7 @@ struct replay_entry { struct list_head list; union ubifs_key key; union { - struct qstr nm; + struct fscrypt_name nm; struct { loff_t old_size; loff_t new_size; @@ -327,7 +327,7 @@ static void destroy_replay_list(struct ubifs_info *c) list_for_each_entry_safe(r, 
tmp, &c->replay_list, list) { if (is_hash_key(c, &r->key)) - kfree(r->nm.name); + kfree(fname_name(&r->nm)); list_del(&r->list); kfree(r); } @@ -430,10 +430,10 @@ static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len, r->deletion = !!deletion; r->sqnum = sqnum; key_copy(c, key, &r->key); - r->nm.len = nlen; + fname_len(&r->nm) = nlen; memcpy(nbuf, name, nlen); nbuf[nlen] = '\0'; - r->nm.name = nbuf; + fname_name(&r->nm) = nbuf; list_add_tail(&r->list, &c->replay_list); return 0; @@ -456,7 +456,7 @@ int ubifs_validate_entry(struct ubifs_info *c, if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 || dent->type >= UBIFS_ITYPES_CNT || nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 || - strnlen(dent->name, nlen) != nlen || + (key_type == UBIFS_XENT_KEY && strnlen(dent->name, nlen) != nlen) || le64_to_cpu(dent->inum) > MAX_INUM) { ubifs_err(c, "bad %s node", key_type == UBIFS_DENT_KEY ? "directory entry" : "extended attribute entry"); diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c index 3cbb904a6d7d..7f1ead29e727 100644 --- a/fs/ubifs/sb.c +++ b/fs/ubifs/sb.c @@ -163,6 +163,7 @@ static int create_default_filesystem(struct ubifs_info *c) tmp64 = (long long)max_buds * c->leb_size; if (big_lpt) sup_flags |= UBIFS_FLG_BIGLPT; + sup_flags |= UBIFS_FLG_DOUBLE_HASH; sup->ch.node_type = UBIFS_SB_NODE; sup->key_hash = UBIFS_KEY_HASH_R5; @@ -465,6 +466,16 @@ static int validate_sb(struct ubifs_info *c, struct ubifs_sb_node *sup) goto failed; } + if (!c->double_hash && c->fmt_version >= 5) { + err = 16; + goto failed; + } + + if (c->encrypted && c->fmt_version < 5) { + err = 17; + goto failed; + } + return 0; failed: @@ -620,6 +631,24 @@ int ubifs_read_superblock(struct ubifs_info *c) memcpy(&c->uuid, &sup->uuid, 16); c->big_lpt = !!(sup_flags & UBIFS_FLG_BIGLPT); c->space_fixup = !!(sup_flags & UBIFS_FLG_SPACE_FIXUP); + c->double_hash = !!(sup_flags & UBIFS_FLG_DOUBLE_HASH); + c->encrypted = !!(sup_flags & UBIFS_FLG_ENCRYPTION); + + if ((sup_flags & ~UBIFS_FLG_MASK) != 0) { + ubifs_err(c, "Unknown feature flags found: %#x", + sup_flags & ~UBIFS_FLG_MASK); + err = -EINVAL; + goto out; + } + +#ifndef CONFIG_UBIFS_FS_ENCRYPTION + if (c->encrypted) { + ubifs_err(c, "file system contains encrypted files but UBIFS" + " was built without crypto support."); + err = -EINVAL; + goto out; + } +#endif /* Automatically increase file system size to the maximum size */ c->old_leb_cnt = c->leb_cnt; @@ -807,3 +836,33 @@ int ubifs_fixup_free_space(struct ubifs_info *c) ubifs_msg(c, "free space fixup complete"); return err; } + +int ubifs_enable_encryption(struct ubifs_info *c) +{ + int err; + struct ubifs_sb_node *sup; + + if (c->encrypted) + return 0; + + if (c->ro_mount || c->ro_media) + return -EROFS; + + if (c->fmt_version < 5) { + ubifs_err(c, "on-flash format version 5 is needed for encryption"); + return -EINVAL; + } + + sup = ubifs_read_sb_node(c); + if (IS_ERR(sup)) + return PTR_ERR(sup); + + sup->flags |= cpu_to_le32(UBIFS_FLG_ENCRYPTION); + + err = ubifs_write_sb_node(c, sup); + if (!err) + c->encrypted = 1; + kfree(sup); + + return err; +} diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 4ec051089186..e08aa04fc835 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -198,7 +198,6 @@ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum) } memcpy(ui->data, ino->data, ui->data_len); ((char *)ui->data)[ui->data_len] = '\0'; - inode->i_link = ui->data; break; case S_IFBLK: case S_IFCHR: @@ -380,6 +379,9 @@ out: } done: clear_inode(inode); +#ifdef 
CONFIG_UBIFS_FS_ENCRYPTION + fscrypt_put_encryption_info(inode, NULL); +#endif } static void ubifs_dirty_inode(struct inode *inode, int flags) @@ -1207,7 +1209,8 @@ static int mount_ubifs(struct ubifs_info *c) bu_init(c); if (!c->ro_mount) { - c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ + \ + UBIFS_CIPHER_BLOCK_SIZE, GFP_KERNEL); if (!c->write_reserve_buf) goto out_free; @@ -1620,7 +1623,8 @@ static int ubifs_remount_rw(struct ubifs_info *c) goto out; } - c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ, GFP_KERNEL); + c->write_reserve_buf = kmalloc(COMPRESSED_DATA_NODE_BUF_SZ + \ + UBIFS_CIPHER_BLOCK_SIZE, GFP_KERNEL); if (!c->write_reserve_buf) { err = -ENOMEM; goto out; @@ -1995,6 +1999,12 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi) return c; } +#ifndef CONFIG_UBIFS_FS_ENCRYPTION +struct fscrypt_operations ubifs_crypt_operations = { + .is_encrypted = __ubifs_crypt_is_encrypted, +}; +#endif + static int ubifs_fill_super(struct super_block *sb, void *data, int silent) { struct ubifs_info *c = sb->s_fs_info; @@ -2041,6 +2051,7 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent) sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE; sb->s_op = &ubifs_super_operations; sb->s_xattr = ubifs_xattr_handlers; + sb->s_cop = &ubifs_crypt_operations; mutex_lock(&c->umount_mutex); err = mount_ubifs(c); diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index fa9a20cc60d6..709aa098dd46 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -34,6 +34,11 @@ #include <linux/slab.h> #include "ubifs.h" +static int try_read_node(const struct ubifs_info *c, void *buf, int type, + int len, int lnum, int offs); +static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_zbranch *zbr, void *node); + /* * Returned codes of 'matches_name()' and 'fallible_matches_name()' functions. * @NAME_LESS: name corresponding to the first argument is less than second @@ -378,7 +383,7 @@ static void lnc_free(struct ubifs_zbranch *zbr) } /** - * tnc_read_node_nm - read a "hashed" leaf node. + * tnc_read_hashed_node - read a "hashed" leaf node. * @c: UBIFS file-system description object * @zbr: key and position of the node * @node: node is returned here @@ -388,8 +393,8 @@ static void lnc_free(struct ubifs_zbranch *zbr) * added to LNC. Returns zero in case of success or a negative negative error * code in case of failure. */ -static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, - void *node) +static int tnc_read_hashed_node(struct ubifs_info *c, struct ubifs_zbranch *zbr, + void *node) { int err; @@ -402,7 +407,19 @@ static int tnc_read_node_nm(struct ubifs_info *c, struct ubifs_zbranch *zbr, return 0; } - err = ubifs_tnc_read_node(c, zbr, node); + if (c->replaying) { + err = fallible_read_node(c, &zbr->key, zbr, node); + /* + * When the node was not found, return -ENOENT, 0 otherwise. + * Negative return codes stay as-is. + */ + if (err == 0) + err = -ENOENT; + else if (err == 1) + err = 0; + } else { + err = ubifs_tnc_read_node(c, zbr, node); + } if (err) return err; @@ -519,7 +536,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, * of failure, a negative error code is returned. 
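 *
 * The ordering below is a byte-wise memcmp() over the shorter length with
 * length as the tie-breaker, e.g. for plain (unencrypted) names:
 *
 *   dent "ab"  vs nm "abc" -> memcmp == 0, nlen <  fname_len -> NAME_LESS
 *   dent "abc" vs nm "abc" -> memcmp == 0, nlen == fname_len -> NAME_MATCHES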
*/ static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr, - const struct qstr *nm) + const struct fscrypt_name *nm) { struct ubifs_dent_node *dent; int nlen, err; @@ -542,11 +559,11 @@ static int matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr, dent = zbr->leaf; nlen = le16_to_cpu(dent->nlen); - err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + err = memcmp(dent->name, fname_name(nm), min_t(int, nlen, fname_len(nm))); if (err == 0) { - if (nlen == nm->len) + if (nlen == fname_len(nm)) return NAME_MATCHES; - else if (nlen < nm->len) + else if (nlen < fname_len(nm)) return NAME_LESS; else return NAME_GREATER; @@ -689,7 +706,7 @@ static int tnc_prev(struct ubifs_info *c, struct ubifs_znode **zn, int *n) */ static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode **zn, int *n, - const struct qstr *nm) + const struct fscrypt_name *nm) { int err; @@ -807,7 +824,7 @@ static int resolve_collision(struct ubifs_info *c, const union ubifs_key *key, */ static int fallible_matches_name(struct ubifs_info *c, struct ubifs_zbranch *zbr, - const struct qstr *nm) + const struct fscrypt_name *nm) { struct ubifs_dent_node *dent; int nlen, err; @@ -835,11 +852,11 @@ static int fallible_matches_name(struct ubifs_info *c, dent = zbr->leaf; nlen = le16_to_cpu(dent->nlen); - err = memcmp(dent->name, nm->name, min_t(int, nlen, nm->len)); + err = memcmp(dent->name, fname_name(nm), min_t(int, nlen, fname_len(nm))); if (err == 0) { - if (nlen == nm->len) + if (nlen == fname_len(nm)) return NAME_MATCHES; - else if (nlen < nm->len) + else if (nlen < fname_len(nm)) return NAME_LESS; else return NAME_GREATER; @@ -878,7 +895,8 @@ out_free: static int fallible_resolve_collision(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode **zn, int *n, - const struct qstr *nm, int adding) + const struct fscrypt_name *nm, + int adding) { struct ubifs_znode *o_znode = NULL, *znode = *zn; int uninitialized_var(o_n), err, cmp, unsure = 0, nn = *n; @@ -1453,7 +1471,7 @@ again: * In this case the leaf node cache gets used, so we pass the * address of the zbranch and keep the mutex locked */ - err = tnc_read_node_nm(c, zt, node); + err = tnc_read_hashed_node(c, zt, node); goto out; } if (safely) { @@ -1782,19 +1800,19 @@ int ubifs_tnc_bulk_read(struct ubifs_info *c, struct bu_info *bu) * @node: the node is returned here * @nm: node name * - * This function look up and reads a node which contains name hash in the key. + * This function looks up and reads a node which contains name hash in the key. * Since the hash may have collisions, there may be many nodes with the same * key, so we have to sequentially look to all of them until the needed one is * found. This function returns zero in case of success, %-ENOENT if the node * was not found, and a negative error code in case of failure. 
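 *
 * The collisions exist because the dent key carries only a truncated R5
 * hash of the name (29 bits in the simple key format), so several names in
 * one directory can share a key and must be disambiguated by comparing the
 * stored names themselves.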
*/ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, - void *node, const struct qstr *nm) + void *node, const struct fscrypt_name *nm) { int found, n, err; struct ubifs_znode *znode; - dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name); + //dbg_tnck(key, "name '%.*s' key ", nm->len, nm->name); mutex_lock(&c->tnc_mutex); found = ubifs_lookup_level0(c, key, &znode, &n); if (!found) { @@ -1816,7 +1834,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, goto out_unlock; } - err = tnc_read_node_nm(c, &znode->zbranch[n], node); + err = tnc_read_hashed_node(c, &znode->zbranch[n], node); out_unlock: mutex_unlock(&c->tnc_mutex); @@ -1830,14 +1848,14 @@ out_unlock: * @node: the node is returned here * @nm: node name * - * This function look up and reads a node which contains name hash in the key. + * This function looks up and reads a node which contains name hash in the key. * Since the hash may have collisions, there may be many nodes with the same * key, so we have to sequentially look to all of them until the needed one is * found. This function returns zero in case of success, %-ENOENT if the node * was not found, and a negative error code in case of failure. */ int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, - void *node, const struct qstr *nm) + void *node, const struct fscrypt_name *nm) { int err, len; const struct ubifs_dent_node *dent = node; @@ -1851,16 +1869,105 @@ int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, return err; len = le16_to_cpu(dent->nlen); - if (nm->len == len && !memcmp(dent->name, nm->name, len)) + if (fname_len(nm) == len && !memcmp(dent->name, fname_name(nm), len)) return 0; /* * Unluckily, there are hash collisions and we have to iterate over * them look at each direntry with colliding name hash sequentially. */ + return do_lookup_nm(c, key, node, nm); } +static int do_lookup_dh(struct ubifs_info *c, const union ubifs_key *key, + struct ubifs_dent_node *dent, uint32_t cookie) +{ + int n, err, type = key_type(c, key); + struct ubifs_znode *znode; + struct ubifs_zbranch *zbr; + union ubifs_key *dkey, start_key; + + ubifs_assert(is_hash_key(c, key)); + + lowest_dent_key(c, &start_key, key_inum(c, key)); + + mutex_lock(&c->tnc_mutex); + err = ubifs_lookup_level0(c, &start_key, &znode, &n); + if (unlikely(err < 0)) + goto out_unlock; + + for (;;) { + if (!err) { + err = tnc_next(c, &znode, &n); + if (err) + goto out_unlock; + } + + zbr = &znode->zbranch[n]; + dkey = &zbr->key; + + if (key_inum(c, dkey) != key_inum(c, key) || + key_type(c, dkey) != type) { + err = -ENOENT; + goto out_unlock; + } + + err = tnc_read_hashed_node(c, zbr, dent); + if (err) + goto out_unlock; + + if (key_hash(c, key) == key_hash(c, dkey) && + le32_to_cpu(dent->cookie) == cookie) + goto out_unlock; + } + +out_unlock: + mutex_unlock(&c->tnc_mutex); + return err; +} + +/** + * ubifs_tnc_lookup_dh - look up a "double hashed" node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * @cookie: node cookie for collision resolution + * + * This function looks up and reads a node which contains name hash in the key. + * Since the hash may have collisions, there may be many nodes with the same + * key, so we have to sequentially look to all of them until the needed one + * with the same cookie value is found. 
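+ *
+ * One way to read the double-hash scheme: the 64 bit position cookie
+ * handed to userspace combines the name hash from the key with the 32 bit
+ * random cookie stored in the dent node, so an entry can be re-found even
+ * when its (encrypted) name cannot be reproduced from the hash alone.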
+ * This function returns zero in case of success, %-ENOENT if the node + * was not found, and a negative error code in case of failure. + */ +int ubifs_tnc_lookup_dh(struct ubifs_info *c, const union ubifs_key *key, + void *node, uint32_t cookie) +{ + int err; + const struct ubifs_dent_node *dent = node; + + if (!c->double_hash) + return -EOPNOTSUPP; + + /* + * We assume that in most of the cases there are no name collisions and + * 'ubifs_tnc_lookup()' returns us the right direntry. + */ + err = ubifs_tnc_lookup(c, key, node); + if (err) + return err; + + if (le32_to_cpu(dent->cookie) == cookie) + return 0; + + /* + * Unluckily, there are hash collisions and we have to iterate over + * them look at each direntry with colliding name hash sequentially. + */ + return do_lookup_dh(c, key, node, cookie); +} + /** * correct_parent_keys - correct parent znodes' keys. * @c: UBIFS file-system description object @@ -2279,14 +2386,15 @@ out_unlock: * may have collisions, like directory entry keys. */ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, - int lnum, int offs, int len, const struct qstr *nm) + int lnum, int offs, int len, + const struct fscrypt_name *nm) { int found, n, err = 0; struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnck(key, "LEB %d:%d, name '%.*s', key ", - lnum, offs, nm->len, nm->name); + //dbg_tnck(key, "LEB %d:%d, name '%.*s', key ", + // lnum, offs, nm->len, nm->name); found = lookup_level0_dirty(c, key, &znode, &n); if (found < 0) { err = found; @@ -2344,7 +2452,7 @@ int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, * by passing 'ubifs_tnc_remove_nm()' the same key but * an unmatchable name. */ - struct qstr noname = { .name = "" }; + struct fscrypt_name noname = { .disk_name = { .name = "", .len = 1 } }; err = dbg_check_tnc(c, 0); mutex_unlock(&c->tnc_mutex); @@ -2514,13 +2622,13 @@ out_unlock: * Returns %0 on success or negative error code on failure. */ int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, - const struct qstr *nm) + const struct fscrypt_name *nm) { int n, err; struct ubifs_znode *znode; mutex_lock(&c->tnc_mutex); - dbg_tnck(key, "%.*s, key ", nm->len, nm->name); + //dbg_tnck(key, "%.*s, key ", nm->len, nm->name); err = lookup_level0_dirty(c, key, &znode, &n); if (err < 0) goto out_unlock; @@ -2669,7 +2777,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) { union ubifs_key key1, key2; struct ubifs_dent_node *xent, *pxent = NULL; - struct qstr nm = { .name = NULL }; + struct fscrypt_name nm = {0}; dbg_tnc("ino %lu", (unsigned long)inum); @@ -2694,8 +2802,8 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) dbg_tnc("xent '%s', ino %lu", xent->name, (unsigned long)xattr_inum); - nm.name = xent->name; - nm.len = le16_to_cpu(xent->nlen); + fname_name(&nm) = xent->name; + fname_len(&nm) = le16_to_cpu(xent->nlen); err = ubifs_tnc_remove_nm(c, &key1, &nm); if (err) { kfree(xent); @@ -2747,7 +2855,7 @@ int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum) */ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, union ubifs_key *key, - const struct qstr *nm) + const struct fscrypt_name *nm) { int n, err, type = key_type(c, key); struct ubifs_znode *znode; @@ -2755,7 +2863,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, struct ubifs_zbranch *zbr; union ubifs_key *dkey; - dbg_tnck(key, "%s ", nm->name ? (char *)nm->name : "(lowest)"); + //dbg_tnck(key, "%s ", nm->name ? 
(char *)nm->name : "(lowest)"); ubifs_assert(is_hash_key(c, key)); mutex_lock(&c->tnc_mutex); @@ -2763,10 +2871,14 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, if (unlikely(err < 0)) goto out_unlock; - if (nm->name) { + if (fname_len(nm) > 0) { if (err) { /* Handle collisions */ - err = resolve_collision(c, key, &znode, &n, nm); + if (c->replaying) + err = fallible_resolve_collision(c, key, &znode, &n, + nm, 0); + else + err = resolve_collision(c, key, &znode, &n, nm); dbg_tnc("rc returned %d, znode %p, n %d", err, znode, n); if (unlikely(err < 0)) @@ -2813,7 +2925,7 @@ struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, goto out_free; } - err = tnc_read_node_nm(c, zbr, dent); + err = tnc_read_hashed_node(c, zbr, dent); if (unlikely(err)) goto out_free; diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index e24380cf46ed..e8c23c9d4f4a 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -46,7 +46,7 @@ * UBIFS went into mainline kernel with format version 4. The older formats * were development formats. */ -#define UBIFS_FORMAT_VERSION 4 +#define UBIFS_FORMAT_VERSION 5 /* * Read-only compatibility version. If the UBIFS format is changed, older UBIFS @@ -301,6 +301,13 @@ enum { #define UBIFS_MAX_NODE_SZ UBIFS_MAX_INO_NODE_SZ /* + * xattr name of UBIFS encryption context, we don't use a prefix + * nor a long name to not waste space on the flash. + */ +#define UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT "c" + + +/* * On-flash inode flags. * * UBIFS_COMPR_FL: use compression for this inode @@ -309,6 +316,7 @@ enum { * UBIFS_APPEND_FL: writes to the inode may only append data * UBIFS_DIRSYNC_FL: I/O on this directory inode has to be synchronous * UBIFS_XATTR_FL: this inode is the inode for an extended attribute value + * UBIFS_CRYPT_FL: use encryption for this inode * * Note, these are on-flash flags which correspond to ioctl flags * (@FS_COMPR_FL, etc). They have the same values now, but generally, do not @@ -321,6 +329,7 @@ enum { UBIFS_APPEND_FL = 0x08, UBIFS_DIRSYNC_FL = 0x10, UBIFS_XATTR_FL = 0x20, + UBIFS_CRYPT_FL = 0x40, }; /* Inode flag bits used by UBIFS */ @@ -409,12 +418,19 @@ enum { * * UBIFS_FLG_BIGLPT: if "big" LPT model is used if set * UBIFS_FLG_SPACE_FIXUP: first-mount "fixup" of free space within LEBs needed + * UBIFS_FLG_DOUBLE_HASH: store a 32bit cookie in directory entry nodes to + * support 64bit cookies for lookups by hash + * UBIFS_FLG_ENCRYPTION: this filesystem contains encrypted files */ enum { UBIFS_FLG_BIGLPT = 0x02, UBIFS_FLG_SPACE_FIXUP = 0x04, + UBIFS_FLG_DOUBLE_HASH = 0x08, + UBIFS_FLG_ENCRYPTION = 0x10, }; +#define UBIFS_FLG_MASK (UBIFS_FLG_BIGLPT|UBIFS_FLG_SPACE_FIXUP|UBIFS_FLG_DOUBLE_HASH|UBIFS_FLG_ENCRYPTION) + /** * struct ubifs_ch - common header node. * @magic: UBIFS node magic number (%UBIFS_NODE_MAGIC) @@ -521,7 +537,8 @@ struct ubifs_ino_node { * @padding1: reserved for future, zeroes * @type: type of the target inode (%UBIFS_ITYPE_REG, %UBIFS_ITYPE_DIR, etc) * @nlen: name length - * @padding2: reserved for future, zeroes + * @cookie: A 32bits random number, used to construct a 64bits + * identifier. * @name: zero-terminated name * * Note, do not forget to amend 'zero_dent_node_unused()' function when @@ -534,7 +551,7 @@ struct ubifs_dent_node { __u8 padding1; __u8 type; __le16 nlen; - __u8 padding2[4]; /* Watch 'zero_dent_node_unused()' if changing! 
*/ + __le32 cookie; __u8 name[]; } __packed; @@ -544,18 +561,16 @@ struct ubifs_dent_node { * @key: node key * @size: uncompressed data size in bytes * @compr_type: compression type (%UBIFS_COMPR_NONE, %UBIFS_COMPR_LZO, etc) - * @padding: reserved for future, zeroes + * @compr_size: compressed data size in bytes, only valid when data is encrypted * @data: data * - * Note, do not forget to amend 'zero_data_node_unused()' function when - * changing the padding fields. */ struct ubifs_data_node { struct ubifs_ch ch; __u8 key[UBIFS_MAX_KEY_LEN]; __le32 size; __le16 compr_type; - __u8 padding[2]; /* Watch 'zero_data_node_unused()' if changing! */ + __le16 compr_size; __u8 data[]; } __packed; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 096035eb29d0..ca72382ce6cc 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -38,6 +38,8 @@ #include <linux/backing-dev.h> #include <linux/security.h> #include <linux/xattr.h> +#include <linux/fscrypto.h> +#include <linux/random.h> #include "ubifs-media.h" /* Version of this UBIFS implementation */ @@ -83,10 +85,6 @@ */ #define BGT_NAME_PATTERN "ubifs_bgt%d_%d" -/* Write-buffer synchronization timeout interval in seconds */ -#define WBUF_TIMEOUT_SOFTLIMIT 3 -#define WBUF_TIMEOUT_HARDLIMIT 5 - /* Maximum possible inode number (only 32-bit inodes are supported now) */ #define MAX_INUM 0xFFFFFFFF @@ -138,6 +136,12 @@ */ #define WORST_COMPR_FACTOR 2 +#ifdef CONFIG_UBIFS_FS_ENCRYPTION +#define UBIFS_CIPHER_BLOCK_SIZE FS_CRYPTO_BLOCK_SIZE +#else +#define UBIFS_CIPHER_BLOCK_SIZE 0 +#endif + /* * How much memory is needed for a buffer where we compress a data node. */ @@ -645,9 +649,6 @@ typedef int (*ubifs_lpt_scan_callback)(struct ubifs_info *c, * @io_mutex: serializes write-buffer I/O * @lock: serializes @buf, @lnum, @offs, @avail, @used, @next_ino and @inodes * fields - * @softlimit: soft write-buffer timeout interval - * @delta: hard and soft timeouts delta (the timer expire interval is @softlimit - * and @softlimit + @delta) * @timer: write-buffer timer * @no_timer: non-zero if this write-buffer does not have a timer * @need_sync: non-zero if the timer expired and the wbuf needs sync'ing @@ -676,8 +677,6 @@ struct ubifs_wbuf { int (*sync_callback)(struct ubifs_info *c, int lnum, int free, int pad); struct mutex io_mutex; spinlock_t lock; - ktime_t softlimit; - unsigned long long delta; struct hrtimer timer; unsigned int no_timer:1; unsigned int need_sync:1; @@ -1007,6 +1006,8 @@ struct ubifs_debug_info; * * @big_lpt: flag that LPT is too big to write whole during commit * @space_fixup: flag indicating that free space in LEBs needs to be cleaned up + * @double_hash: flag indicating that we can do lookups by hash + * @encrypted: flag indicating that this file system contains encrypted files * @no_chk_data_crc: do not check CRCs when reading data nodes (except during * recovery) * @bulk_read: enable bulk-reads @@ -1249,6 +1250,8 @@ struct ubifs_info { unsigned int big_lpt:1; unsigned int space_fixup:1; + unsigned int double_hash:1; + unsigned int encrypted:1; unsigned int no_chk_data_crc:1; unsigned int bulk_read:1; unsigned int default_compr:2; @@ -1515,25 +1518,29 @@ int ubifs_consolidate_log(struct ubifs_info *c); /* journal.c */ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, - const struct qstr *nm, const struct inode *inode, + const struct fscrypt_name *nm, const struct inode *inode, int deletion, int xent); int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, const union ubifs_key *key, const void *buf, 
int len); int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode); int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, - const struct dentry *fst_dentry, + const struct inode *fst_inode, + const struct fscrypt_name *fst_nm, const struct inode *snd_dir, - const struct dentry *snd_dentry, int sync); + const struct inode *snd_inode, + const struct fscrypt_name *snd_nm, int sync); int ubifs_jnl_rename(struct ubifs_info *c, const struct inode *old_dir, - const struct dentry *old_dentry, + const struct inode *old_inode, + const struct fscrypt_name *old_nm, const struct inode *new_dir, - const struct dentry *new_dentry, + const struct inode *new_inode, + const struct fscrypt_name *new_nm, const struct inode *whiteout, int sync); int ubifs_jnl_truncate(struct ubifs_info *c, const struct inode *inode, loff_t old_size, loff_t new_size); int ubifs_jnl_delete_xattr(struct ubifs_info *c, const struct inode *host, - const struct inode *inode, const struct qstr *nm); + const struct inode *inode, const struct fscrypt_name *nm); int ubifs_jnl_change_xattr(struct ubifs_info *c, const struct inode *inode1, const struct inode *inode2); @@ -1568,7 +1575,9 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c); int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode **zn, int *n); int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, - void *node, const struct qstr *nm); + void *node, const struct fscrypt_name *nm); +int ubifs_tnc_lookup_dh(struct ubifs_info *c, const union ubifs_key *key, + void *node, uint32_t secondary_hash); int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, void *node, int *lnum, int *offs); int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, @@ -1576,16 +1585,16 @@ int ubifs_tnc_add(struct ubifs_info *c, const union ubifs_key *key, int lnum, int ubifs_tnc_replace(struct ubifs_info *c, const union ubifs_key *key, int old_lnum, int old_offs, int lnum, int offs, int len); int ubifs_tnc_add_nm(struct ubifs_info *c, const union ubifs_key *key, - int lnum, int offs, int len, const struct qstr *nm); + int lnum, int offs, int len, const struct fscrypt_name *nm); int ubifs_tnc_remove(struct ubifs_info *c, const union ubifs_key *key); int ubifs_tnc_remove_nm(struct ubifs_info *c, const union ubifs_key *key, - const struct qstr *nm); + const struct fscrypt_name *nm); int ubifs_tnc_remove_range(struct ubifs_info *c, union ubifs_key *from_key, union ubifs_key *to_key); int ubifs_tnc_remove_ino(struct ubifs_info *c, ino_t inum); struct ubifs_dent_node *ubifs_tnc_next_ent(struct ubifs_info *c, union ubifs_key *key, - const struct qstr *nm); + const struct fscrypt_name *nm); void ubifs_tnc_close(struct ubifs_info *c); int ubifs_tnc_has_node(struct ubifs_info *c, union ubifs_key *key, int level, int lnum, int offs, int is_idx); @@ -1642,6 +1651,7 @@ int ubifs_read_superblock(struct ubifs_info *c); struct ubifs_sb_node *ubifs_read_sb_node(struct ubifs_info *c); int ubifs_write_sb_node(struct ubifs_info *c, struct ubifs_sb_node *sup); int ubifs_fixup_free_space(struct ubifs_info *c); +int ubifs_enable_encryption(struct ubifs_info *c); /* replay.c */ int ubifs_validate_entry(struct ubifs_info *c, @@ -1733,16 +1743,21 @@ int ubifs_update_time(struct inode *inode, struct timespec *time, int flags); #endif /* dir.c */ -struct inode *ubifs_new_inode(struct ubifs_info *c, const struct inode *dir, 
+struct inode *ubifs_new_inode(struct ubifs_info *c, struct inode *dir, umode_t mode); int ubifs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat); +int ubifs_check_dir_empty(struct inode *dir); /* xattr.c */ extern const struct xattr_handler *ubifs_xattr_handlers[]; ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size); int ubifs_init_security(struct inode *dentry, struct inode *inode, const struct qstr *qstr); +int ubifs_xattr_set(struct inode *host, const char *name, const void *value, + size_t size, int flags); +ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf, + size_t size); /* super.c */ struct inode *ubifs_iget(struct super_block *sb, unsigned long inum); @@ -1781,6 +1796,66 @@ int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len, #include "misc.h" #include "key.h" +#ifndef CONFIG_UBIFS_FS_ENCRYPTION +#define fscrypt_set_d_op(i) +#define fscrypt_get_ctx fscrypt_notsupp_get_ctx +#define fscrypt_release_ctx fscrypt_notsupp_release_ctx +#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page +#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page +#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages +#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page +#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page +#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range +#define fscrypt_ioctl_set_policy fscrypt_notsupp_ioctl_set_policy +#define fscrypt_ioctl_get_policy fscrypt_notsupp_ioctl_get_policy +#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context +#define fscrypt_inherit_context fscrypt_notsupp_inherit_context +#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info +#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info +#define fscrypt_setup_filename fscrypt_notsupp_setup_filename +#define fscrypt_free_filename fscrypt_notsupp_free_filename +#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size +#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer +#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer +#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr +#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk +static inline int ubifs_encrypt(const struct inode *inode, + struct ubifs_data_node *dn, + unsigned int in_len, unsigned int *out_len, + int block) +{ + ubifs_assert(0); + return -EOPNOTSUPP; +} +static inline int ubifs_decrypt(const struct inode *inode, + struct ubifs_data_node *dn, + unsigned int *out_len, int block) +{ + ubifs_assert(0); + return -EOPNOTSUPP; +} +#else +/* crypto.c */ +int ubifs_encrypt(const struct inode *inode, struct ubifs_data_node *dn, + unsigned int in_len, unsigned int *out_len, int block); +int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn, + unsigned int *out_len, int block); +#endif + +extern struct fscrypt_operations ubifs_crypt_operations; + +static inline bool __ubifs_crypt_is_encrypted(struct inode *inode) +{ + struct ubifs_inode *ui = ubifs_inode(inode); + + return ui->flags & UBIFS_CRYPT_FL; +} + +static inline bool ubifs_crypt_is_encrypted(const struct inode *inode) +{ + return __ubifs_crypt_is_encrypted((struct inode *)inode); +} + /* Normal UBIFS messages */ __printf(2, 3) void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...); diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index d9f9615bfd71..efe00fcb8b75 100644 --- 
a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -97,7 +97,7 @@ static const struct file_operations empty_fops; * of failure. */ static int create_xattr(struct ubifs_info *c, struct inode *host, - const struct qstr *nm, const void *value, int size) + const struct fscrypt_name *nm, const void *value, int size) { int err, names_len; struct inode *inode; @@ -117,7 +117,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, * extended attributes if the name list becomes larger. This limitation * is artificial for UBIFS, though. */ - names_len = host_ui->xattr_names + host_ui->xattr_cnt + nm->len + 1; + names_len = host_ui->xattr_names + host_ui->xattr_cnt + fname_len(nm) + 1; if (names_len > XATTR_LIST_MAX) { ubifs_err(c, "cannot add one more xattr name to inode %lu, total names length would become %d, max. is %d", host->i_ino, names_len, XATTR_LIST_MAX); @@ -154,9 +154,18 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, mutex_lock(&host_ui->ui_mutex); host->i_ctime = ubifs_current_time(host); host_ui->xattr_cnt += 1; - host_ui->xattr_size += CALC_DENT_SIZE(nm->len); + host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size += CALC_XATTR_BYTES(size); - host_ui->xattr_names += nm->len; + host_ui->xattr_names += fname_len(nm); + + /* + * We handle UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT here because we + * have to set the UBIFS_CRYPT_FL flag on the host inode. + * To avoid multiple updates of the same inode in the same operation, + * let's do it here. + */ + if (strcmp(fname_name(nm), UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) == 0) + host_ui->flags |= UBIFS_CRYPT_FL; err = ubifs_jnl_update(c, host, nm, inode, 0, 1); if (err) @@ -170,9 +179,10 @@ static int create_xattr(struct ubifs_info *c, struct inode *host, out_cancel: host_ui->xattr_cnt -= 1; - host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); + host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size -= CALC_XATTR_BYTES(size); - host_ui->xattr_names -= nm->len; + host_ui->xattr_names -= fname_len(nm); + host_ui->flags &= ~UBIFS_CRYPT_FL; mutex_unlock(&host_ui->ui_mutex); out_free: make_bad_inode(inode); @@ -269,22 +279,28 @@ static struct inode *iget_xattr(struct ubifs_info *c, ino_t inum) return ERR_PTR(-EINVAL); } -static int __ubifs_setxattr(struct inode *host, const char *name, - const void *value, size_t size, int flags) +int ubifs_xattr_set(struct inode *host, const char *name, const void *value, + size_t size, int flags) { struct inode *inode; struct ubifs_info *c = host->i_sb->s_fs_info; - struct qstr nm = QSTR_INIT(name, strlen(name)); + struct fscrypt_name nm = { .disk_name = FSTR_INIT((char *)name, strlen(name))}; struct ubifs_dent_node *xent; union ubifs_key key; int err; - ubifs_assert(inode_is_locked(host)); + /* + * Creating an encryption context is done unlocked since we + * operate on a new inode which is not visible to other users + * at this point. 
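+ *
+ * The encryption context is the case this exception exists for: fscrypt
+ * stores it under the deliberately short name "c"
+ * (UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) while the new inode is being set
+ * up, i.e. before it becomes reachable by any other task.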
+ */ + if (strcmp(name, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) != 0) + ubifs_assert(inode_is_locked(host)); if (size > UBIFS_MAX_INO_DATA) return -ERANGE; - if (nm.len > UBIFS_MAX_NLEN) + if (fname_len(&nm) > UBIFS_MAX_NLEN) return -ENAMETOOLONG; xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); @@ -329,18 +345,18 @@ out_free: return err; } -static ssize_t __ubifs_getxattr(struct inode *host, const char *name, - void *buf, size_t size) +ssize_t ubifs_xattr_get(struct inode *host, const char *name, void *buf, + size_t size) { struct inode *inode; struct ubifs_info *c = host->i_sb->s_fs_info; - struct qstr nm = QSTR_INIT(name, strlen(name)); + struct fscrypt_name nm = { .disk_name = FSTR_INIT((char *)name, strlen(name))}; struct ubifs_inode *ui; struct ubifs_dent_node *xent; union ubifs_key key; int err; - if (nm.len > UBIFS_MAX_NLEN) + if (fname_len(&nm) > UBIFS_MAX_NLEN) return -ENAMETOOLONG; xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); @@ -387,6 +403,20 @@ out_unlock: return err; } +static bool xattr_visible(const char *name) +{ + /* File encryption related xattrs are for internal use only */ + if (strcmp(name, UBIFS_XATTR_NAME_ENCRYPTION_CONTEXT) == 0) + return false; + + /* Show trusted namespace only for "power" users */ + if (strncmp(name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN) == 0 && !capable(CAP_SYS_ADMIN)) + return false; + + return true; +} + ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) { union ubifs_key key; @@ -395,7 +425,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) struct ubifs_inode *host_ui = ubifs_inode(host); struct ubifs_dent_node *xent, *pxent = NULL; int err, len, written = 0; - struct qstr nm = { .name = NULL }; + struct fscrypt_name nm = {0}; dbg_gen("ino %lu ('%pd'), buffer size %zd", host->i_ino, dentry, size); @@ -419,15 +449,12 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) break; } - nm.name = xent->name; - nm.len = le16_to_cpu(xent->nlen); + fname_name(&nm) = xent->name; + fname_len(&nm) = le16_to_cpu(xent->nlen); - /* Show trusted namespace only for "power" users */ - if (strncmp(xent->name, XATTR_TRUSTED_PREFIX, - XATTR_TRUSTED_PREFIX_LEN) || - capable(CAP_SYS_ADMIN)) { - memcpy(buffer + written, nm.name, nm.len + 1); - written += nm.len + 1; + if (xattr_visible(xent->name)) { + memcpy(buffer + written, fname_name(&nm), fname_len(&nm) + 1); + written += fname_len(&nm) + 1; } kfree(pxent); @@ -446,7 +473,7 @@ ssize_t ubifs_listxattr(struct dentry *dentry, char *buffer, size_t size) } static int remove_xattr(struct ubifs_info *c, struct inode *host, - struct inode *inode, const struct qstr *nm) + struct inode *inode, const struct fscrypt_name *nm) { int err; struct ubifs_inode *host_ui = ubifs_inode(host); @@ -463,9 +490,9 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, mutex_lock(&host_ui->ui_mutex); host->i_ctime = ubifs_current_time(host); host_ui->xattr_cnt -= 1; - host_ui->xattr_size -= CALC_DENT_SIZE(nm->len); + host_ui->xattr_size -= CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size -= CALC_XATTR_BYTES(ui->data_len); - host_ui->xattr_names -= nm->len; + host_ui->xattr_names -= fname_len(nm); err = ubifs_jnl_delete_xattr(c, host, inode, nm); if (err) @@ -477,27 +504,27 @@ static int remove_xattr(struct ubifs_info *c, struct inode *host, out_cancel: host_ui->xattr_cnt += 1; - host_ui->xattr_size += CALC_DENT_SIZE(nm->len); + host_ui->xattr_size += CALC_DENT_SIZE(fname_len(nm)); host_ui->xattr_size += 
CALC_XATTR_BYTES(ui->data_len); - host_ui->xattr_names += nm->len; + host_ui->xattr_names += fname_len(nm); mutex_unlock(&host_ui->ui_mutex); ubifs_release_budget(c, &req); make_bad_inode(inode); return err; } -static int __ubifs_removexattr(struct inode *host, const char *name) +static int ubifs_xattr_remove(struct inode *host, const char *name) { struct inode *inode; struct ubifs_info *c = host->i_sb->s_fs_info; - struct qstr nm = QSTR_INIT(name, strlen(name)); + struct fscrypt_name nm = { .disk_name = FSTR_INIT((char *)name, strlen(name))}; struct ubifs_dent_node *xent; union ubifs_key key; int err; ubifs_assert(inode_is_locked(host)); - if (nm.len > UBIFS_MAX_NLEN) + if (fname_len(&nm) > UBIFS_MAX_NLEN) return -ENAMETOOLONG; xent = kmalloc(UBIFS_MAX_XENT_NODE_SZ, GFP_NOFS); @@ -548,7 +575,8 @@ static int init_xattrs(struct inode *inode, const struct xattr *xattr_array, } strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); - err = __ubifs_setxattr(inode, name, xattr->value, xattr->value_len, 0); + err = ubifs_xattr_set(inode, name, xattr->value, + xattr->value_len, 0); kfree(name); if (err < 0) break; @@ -572,7 +600,7 @@ int ubifs_init_security(struct inode *dentry, struct inode *inode, return err; } -static int ubifs_xattr_get(const struct xattr_handler *handler, +static int xattr_get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { @@ -580,10 +608,10 @@ static int ubifs_xattr_get(const struct xattr_handler *handler, inode->i_ino, dentry, size); name = xattr_full_name(handler, name); - return __ubifs_getxattr(inode, name, buffer, size); + return ubifs_xattr_get(inode, name, buffer, size); } -static int ubifs_xattr_set(const struct xattr_handler *handler, +static int xattr_set(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, const void *value, size_t size, int flags) @@ -594,27 +622,27 @@ static int ubifs_xattr_set(const struct xattr_handler *handler, name = xattr_full_name(handler, name); if (value) - return __ubifs_setxattr(inode, name, value, size, flags); + return ubifs_xattr_set(inode, name, value, size, flags); else - return __ubifs_removexattr(inode, name); + return ubifs_xattr_remove(inode, name); } static const struct xattr_handler ubifs_user_xattr_handler = { .prefix = XATTR_USER_PREFIX, - .get = ubifs_xattr_get, - .set = ubifs_xattr_set, + .get = xattr_get, + .set = xattr_set, }; static const struct xattr_handler ubifs_trusted_xattr_handler = { .prefix = XATTR_TRUSTED_PREFIX, - .get = ubifs_xattr_get, - .set = ubifs_xattr_set, + .get = xattr_get, + .set = xattr_set, }; static const struct xattr_handler ubifs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, - .get = ubifs_xattr_get, - .set = ubifs_xattr_set, + .get = xattr_get, + .set = xattr_set, }; const struct xattr_handler *ubifs_xattr_handlers[] = { diff --git a/fs/udf/dir.c b/fs/udf/dir.c index aaec13c95253..2d0e028067eb 100644 --- a/fs/udf/dir.c +++ b/fs/udf/dir.c @@ -30,6 +30,7 @@ #include <linux/errno.h> #include <linux/mm.h> #include <linux/slab.h> +#include <linux/bio.h> #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/udf/directory.c b/fs/udf/directory.c index 988d5352bdb8..7aa48bd7cbaf 100644 --- a/fs/udf/directory.c +++ b/fs/udf/directory.c @@ -16,6 +16,7 @@ #include <linux/fs.h> #include <linux/string.h> +#include <linux/bio.h> struct fileIdentDesc *udf_fileident_read(struct inode *dir, loff_t *nf_pos, struct udf_fileident_bh 
*fibh, diff --git a/fs/udf/inode.c b/fs/udf/inode.c index aad46401ede5..0f3db71753aa 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -38,6 +38,7 @@ #include <linux/crc-itu-t.h> #include <linux/mpage.h> #include <linux/uio.h> +#include <linux/bio.h> #include "udf_i.h" #include "udf_sb.h" diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 67e085d591d8..a0376a2c1c29 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -15,6 +15,7 @@ #include <linux/buffer_head.h> #include <linux/capability.h> #include <linux/bitops.h> +#include <linux/bio.h> #include <asm/byteorder.h> #include "ufs_fs.h" @@ -306,8 +307,7 @@ static void ufs_change_blocknr(struct inode *inode, sector_t beg, (unsigned long long)(pos + newb), pos); bh->b_blocknr = newb + pos; - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); mark_buffer_dirty(bh); ++j; bh = bh->b_this_page; diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 190d64be22ed..7e41aee7b69a 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -25,7 +25,7 @@ * David S. Miller (davem@caip.rutgers.edu), 1995 */ -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/errno.h> #include <linux/fs.h> @@ -1070,8 +1070,7 @@ static int ufs_alloc_lastblock(struct inode *inode, loff_t size) if (buffer_new(bh)) { clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); + clean_bdev_bh_alias(bh); /* * we do not zeroize fragment, because of * if it maped to hole, it already contains zeroes @@ -1192,7 +1191,7 @@ out: return err; } -void ufs_truncate_blocks(struct inode *inode) +static void ufs_truncate_blocks(struct inode *inode) { if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index f04ab232d08d..131b2b77c818 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -71,7 +71,7 @@ #include <stdarg.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/errno.h> #include <linux/fs.h>
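The fs/userfaultfd.c changes that follow close a missed-wakeup window in handle_userfault(): the faulting task can be woken spuriously (for instance by rwsem activity before up_read()), so the patch adds a per-waiter waken flag that userfaultfd_wake_function() sets via WRITE_ONCE() before calling wake_up_state(), and the fault side re-blocks until it actually observes the flag. A distilled sketch of the wait loop the hunks below add, using the names from the diff; set_current_state() supplies the full smp_store_mb() barrier the comments rely on:

    /* sketch of the wait side, condensed from the hunks below */
    while (!READ_ONCE(uwq.waken)) {
            /* the state write must be visible before uwq.waken is re-read */
            set_current_state(blocking_state);
            if (READ_ONCE(uwq.waken) || READ_ONCE(ctx->released) ||
                (return_to_userland ? signal_pending(current) :
                 fatal_signal_pending(current)))
                    break;
            schedule();
    }
    __set_current_state(TASK_RUNNING);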
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 85959d8324df..43953e03c356 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -63,6 +63,7 @@ struct userfaultfd_wait_queue { struct uffd_msg msg; wait_queue_t wq; struct userfaultfd_ctx *ctx; + bool waken; }; struct userfaultfd_wake_range { @@ -86,6 +87,12 @@ static int userfaultfd_wake_function(wait_queue_t *wq, unsigned mode, if (len && (start > uwq->msg.arg.pagefault.address || start + len <= uwq->msg.arg.pagefault.address)) goto out; + WRITE_ONCE(uwq->waken, true); + /* + * The implicit smp_mb__before_spinlock in try_to_wake_up() + * renders uwq->waken visible to other CPUs before the task is + * woken. + */ ret = wake_up_state(wq->private, mode); if (ret) /* @@ -257,18 +264,19 @@ out: * fatal_signal_pending()s, and the mmap_sem must be released before * returning it. */ -int handle_userfault(struct fault_env *fe, unsigned long reason) +int handle_userfault(struct vm_fault *vmf, unsigned long reason) { - struct mm_struct *mm = fe->vma->vm_mm; + struct mm_struct *mm = vmf->vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; int ret; bool must_wait, return_to_userland; + long blocking_state; BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); ret = VM_FAULT_SIGBUS; - ctx = fe->vma->vm_userfaultfd_ctx.ctx; + ctx = vmf->vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; @@ -301,17 +309,18 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) * without first stopping userland access to the memory. For * VM_UFFD_MISSING userfaults this is enough for now. */ - if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { + if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) { /* * Validate the invariant that nowait must allow retry * to be sure not to return SIGBUS erroneously on * nowait invocations. */ - BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); + BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); #ifdef CONFIG_DEBUG_VM if (printk_ratelimit()) { printk(KERN_WARNING - "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); + "FAULT_FLAG_ALLOW_RETRY missing %x\n", + vmf->flags); dump_stack(); } #endif @@ -323,7 +332,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) * and wait. */ ret = VM_FAULT_RETRY; - if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; /* take the reference before dropping the mmap_sem */ @@ -331,12 +340,15 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(fe->address, fe->flags, reason); + uwq.msg = userfault_msg(vmf->address, vmf->flags, reason); uwq.ctx = ctx; + uwq.waken = false; return_to_userland = - (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == + (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); + blocking_state = return_to_userland ? TASK_INTERRUPTIBLE : + TASK_KILLABLE; spin_lock(&ctx->fault_pending_wqh.lock); /* @@ -349,11 +361,11 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) * following the spin_unlock to happen before the list_add in * __add_wait_queue. */ - set_current_state(return_to_userland ? TASK_INTERRUPTIBLE : - TASK_KILLABLE); + set_current_state(blocking_state); spin_unlock(&ctx->fault_pending_wqh.lock); - must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); + must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, + reason); up_read(&mm->mmap_sem); if (likely(must_wait && !ACCESS_ONCE(ctx->released) && @@ -362,6 +374,29 @@ wake_up_poll(&ctx->fd_wqh, POLLIN); schedule(); ret |= VM_FAULT_MAJOR; + + /* + * False wakeups can originate even from rwsem before + * up_read(); however, userfaults will wait either for a + * targeted wakeup on the specific uwq waitqueue from + * wake_userfault() or for signals or for uffd + * release. + */ + while (!READ_ONCE(uwq.waken)) { + /* + * This needs the full smp_store_mb() + * guarantee as the state write must be + * visible to other CPUs before reading + * uwq.waken from other CPUs. + */ + set_current_state(blocking_state); + if (READ_ONCE(uwq.waken) || + READ_ONCE(ctx->released) || + (return_to_userland ?
signal_pending(current) : + fatal_signal_pending(current))) + break; + schedule(); + } } __set_current_state(TASK_RUNNING); diff --git a/fs/utimes.c b/fs/utimes.c index 22307cdf7014..32b15b3f6629 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -8,7 +8,7 @@ #include <linux/stat.h> #include <linux/utime.h> #include <linux/syscalls.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/unistd.h> #ifdef __ARCH_WANT_SYS_UTIME @@ -48,7 +48,7 @@ static bool nsec_valid(long nsec) return nsec >= 0 && nsec <= 999999999; } -static int utimes_common(struct path *path, struct timespec *times) +static int utimes_common(const struct path *path, struct timespec *times) { int error; struct iattr newattrs; diff --git a/fs/xattr.c b/fs/xattr.c index 2d13b4e62fae..7e3317cf4045 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -22,7 +22,7 @@ #include <linux/vmalloc.h> #include <linux/posix_acl_xattr.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> static const char * strcmp_prefix(const char *a, const char *a_prefix) diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c index e5ebc3770460..d346d42c54d1 100644 --- a/fs/xfs/libxfs/xfs_ag_resv.c +++ b/fs/xfs/libxfs/xfs_ag_resv.c @@ -256,6 +256,9 @@ xfs_ag_resv_init( goto out; } + ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved + + xfs_perag_resv(pag, XFS_AG_RESV_AGFL)->ar_reserved <= + pag->pagf_freeblks + pag->pagf_flcount); out: return error; } diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index effb64cf714f..9f06a211e157 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -95,10 +95,7 @@ unsigned int xfs_alloc_set_aside( struct xfs_mount *mp) { - unsigned int blocks; - - blocks = 4 + (mp->m_sb.sb_agcount * XFS_ALLOC_AGFL_RESERVE); - return blocks; + return mp->m_sb.sb_agcount * (XFS_ALLOC_AGFL_RESERVE + 4); } /* @@ -365,36 +362,12 @@ xfs_alloc_fix_len( return; ASSERT(rlen >= args->minlen && rlen <= args->maxlen); ASSERT(rlen % args->prod == args->mod); + ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >= + rlen + args->minleft); args->len = rlen; } /* - * Fix up length if there is too little space left in the a.g. - * Return 1 if ok, 0 if too little, should give up. - */ -STATIC int -xfs_alloc_fix_minleft( - xfs_alloc_arg_t *args) /* allocation argument structure */ -{ - xfs_agf_t *agf; /* a.g. freelist header */ - int diff; /* free space difference */ - - if (args->minleft == 0) - return 1; - agf = XFS_BUF_TO_AGF(args->agbp); - diff = be32_to_cpu(agf->agf_freeblks) - - args->len - args->minleft; - if (diff >= 0) - return 1; - args->len += diff; /* shrink the allocated space */ - /* casts to (int) catch length underflows */ - if ((int)args->len >= (int)args->minlen) - return 1; - args->agbno = NULLAGBLOCK; - return 0; -} - -/* * Update the two btrees, logically removing from freespace the extent * starting at rbno, rlen blocks. The extent is contained within the * actual (current) free extent fbno for flen blocks. @@ -689,8 +662,6 @@ xfs_alloc_ag_vextent( xfs_alloc_arg_t *args) /* argument structure for allocation */ { int error=0; - xfs_extlen_t reservation; - xfs_extlen_t oldmax; ASSERT(args->minlen > 0); ASSERT(args->maxlen > 0); @@ -699,20 +670,6 @@ xfs_alloc_ag_vextent( ASSERT(args->alignment > 0); /* - * Clamp maxlen to the amount of free space minus any reservations - * that have been made. 
- */ - oldmax = args->maxlen; - reservation = xfs_ag_resv_needed(args->pag, args->resv); - if (args->maxlen > args->pag->pagf_freeblks - reservation) - args->maxlen = args->pag->pagf_freeblks - reservation; - if (args->maxlen == 0) { - args->agbno = NULLAGBLOCK; - args->maxlen = oldmax; - return 0; - } - - /* * Branch to correct routine based on the type. */ args->wasfromfl = 0; @@ -731,8 +688,6 @@ xfs_alloc_ag_vextent( /* NOTREACHED */ } - args->maxlen = oldmax; - if (error || args->agbno == NULLAGBLOCK) return error; @@ -841,9 +796,6 @@ xfs_alloc_ag_vextent_exact( args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen) - args->agbno; xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) - goto not_found; - ASSERT(args->agbno + args->len <= tend); /* @@ -1149,12 +1101,7 @@ restart: XFS_WANT_CORRUPTED_GOTO(args->mp, i == 1, error0); ASSERT(ltbno + ltlen <= be32_to_cpu(XFS_BUF_TO_AGF(args->agbp)->agf_length)); args->len = blen; - if (!xfs_alloc_fix_minleft(args)) { - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - trace_xfs_alloc_near_nominleft(args); - return 0; - } - blen = args->len; + /* * We are allocating starting at bnew for blen blocks. */ @@ -1346,12 +1293,6 @@ restart: */ args->len = XFS_EXTLEN_MIN(ltlena, args->maxlen); xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) { - trace_xfs_alloc_near_nominleft(args); - xfs_btree_del_cursor(bno_cur_lt, XFS_BTREE_NOERROR); - xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); - return 0; - } rlen = args->len; (void)xfs_alloc_compute_diff(args->agbno, rlen, args->alignment, args->datatype, ltbnoa, ltlena, &ltnew); @@ -1553,8 +1494,6 @@ restart: } xfs_alloc_fix_len(args); - if (!xfs_alloc_fix_minleft(args)) - goto out_nominleft; rlen = args->len; XFS_WANT_CORRUPTED_GOTO(args->mp, rlen <= flen, error0); /* @@ -2056,7 +1995,7 @@ xfs_alloc_space_available( int flags) { struct xfs_perag *pag = args->pag; - xfs_extlen_t longest; + xfs_extlen_t alloc_len, longest; xfs_extlen_t reservation; /* blocks that are still reserved */ int available; @@ -2066,17 +2005,28 @@ xfs_alloc_space_available( reservation = xfs_ag_resv_needed(pag, args->resv); /* do we have enough contiguous free space for the allocation? */ + alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop; longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free, reservation); - if ((args->minlen + args->alignment + args->minalignslop - 1) > longest) + if (longest < alloc_len) return false; /* do we have enough free space remaining for the allocation? */ available = (int)(pag->pagf_freeblks + pag->pagf_flcount - - reservation - min_free - args->total); - if (available < (int)args->minleft || available <= 0) + reservation - min_free - args->minleft); + if (available < (int)max(args->total, alloc_len)) return false; + /* + * Clamp maxlen to the amount of free space available for the actual + * extent allocation.
+ */ + if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) { + args->maxlen = available; + ASSERT(args->maxlen > 0); + ASSERT(args->maxlen >= args->minlen); + } + return true; } @@ -2122,7 +2072,8 @@ xfs_alloc_fix_freelist( } need = xfs_alloc_min_freelist(mp, pag); - if (!xfs_alloc_space_available(args, need, flags)) + if (!xfs_alloc_space_available(args, need, flags | + XFS_ALLOC_FLAG_CHECK)) goto out_agbp_relse; /* @@ -2455,12 +2406,15 @@ xfs_agf_verify( be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) return false; - if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) return false; if (xfs_sb_version_hasrmapbt(&mp->m_sb) && - be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS) + (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > XFS_BTREE_MAXLEVELS)) return false; /* @@ -2477,7 +2431,8 @@ xfs_agf_verify( return false; if (xfs_sb_version_hasreflink(&mp->m_sb) && - be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS) + (be32_to_cpu(agf->agf_refcount_level) < 1 || + be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)) return false; return true; @@ -2634,12 +2589,10 @@ xfs_alloc_vextent( xfs_agblock_t agsize; /* allocation group size */ int error; int flags; /* XFS_ALLOC_FLAG_... locking flags */ - xfs_extlen_t minleft;/* minimum left value, temp copy */ xfs_mount_t *mp; /* mount structure pointer */ xfs_agnumber_t sagno; /* starting allocation group number */ xfs_alloctype_t type; /* input allocation type */ int bump_rotor = 0; - int no_min = 0; xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */ mp = args->mp; @@ -2668,7 +2621,6 @@ xfs_alloc_vextent( trace_xfs_alloc_vextent_badargs(args); return 0; } - minleft = args->minleft; switch (type) { case XFS_ALLOCTYPE_THIS_AG: @@ -2679,9 +2631,7 @@ */ args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); args->pag = xfs_perag_get(mp, args->agno); - args->minleft = 0; error = xfs_alloc_fix_freelist(args, 0); - args->minleft = minleft; if (error) { trace_xfs_alloc_vextent_nofix(args); goto error0; @@ -2746,9 +2696,7 @@ */ for (;;) { args->pag = xfs_perag_get(mp, args->agno); - if (no_min) args->minleft = 0; error = xfs_alloc_fix_freelist(args, flags); - args->minleft = minleft; if (error) { trace_xfs_alloc_vextent_nofix(args); goto error0; @@ -2788,20 +2736,17 @@ * or switch to non-trylock mode. */ if (args->agno == sagno) { - if (no_min == 1) { + if (flags == 0) { args->agbno = NULLAGBLOCK; trace_xfs_alloc_vextent_allfailed(args); break; } - if (flags == 0) { - no_min = 1; - } else { - flags = 0; - if (type == XFS_ALLOCTYPE_START_BNO) { - args->agbno = XFS_FSB_TO_AGBNO(mp, - args->fsbno); - args->type = XFS_ALLOCTYPE_NEAR_BNO; - } + + flags = 0; + if (type == XFS_ALLOCTYPE_START_BNO) { + args->agbno = XFS_FSB_TO_AGBNO(mp, + args->fsbno); + args->type = XFS_ALLOCTYPE_NEAR_BNO; } } xfs_perag_put(args->pag);
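The fs/xfs/libxfs/xfs_alloc.c hunks above retire xfs_alloc_fix_minleft(): rather than shrinking an allocation after the fact, xfs_alloc_space_available() now charges args->minleft up front and clamps args->maxlen itself, while xfs_alloc_fix_freelist() passes the new XFS_ALLOC_FLAG_CHECK so that its probe leaves the arguments untouched. A condensed sketch of the check, taken from the hunks above (not the complete function):

    alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
    longest = xfs_alloc_longest_free_extent(args->mp, pag, min_free,
                                            reservation);
    if (longest < alloc_len)
            return false;   /* no single free extent is long enough */

    available = (int)(pag->pagf_freeblks + pag->pagf_flcount -
                      reservation - min_free - args->minleft);
    if (available < (int)max(args->total, alloc_len))
            return false;   /* too few blocks would remain */

    /* clamp maxlen so the allocation cannot eat into minleft */
    if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK))
            args->maxlen = available;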
diff --git a/fs/xfs/libxfs/xfs_alloc.h b/fs/xfs/libxfs/xfs_alloc.h index 7c404a6b0ae3..1d0f48a501a3 100644 --- a/fs/xfs/libxfs/xfs_alloc.h +++ b/fs/xfs/libxfs/xfs_alloc.h @@ -56,7 +56,7 @@ typedef unsigned int xfs_alloctype_t; #define XFS_ALLOC_FLAG_FREEING 0x00000002 /* indicate caller is freeing extents*/ #define XFS_ALLOC_FLAG_NORMAP 0x00000004 /* don't modify the rmapbt */ #define XFS_ALLOC_FLAG_NOSHRINK 0x00000008 /* don't shrink the freelist */ - +#define XFS_ALLOC_FLAG_CHECK 0x00000010 /* test only, don't modify args */ /* * Argument structure for xfs_alloc routines. diff --git a/fs/xfs/libxfs/xfs_alloc_btree.c b/fs/xfs/libxfs/xfs_alloc_btree.c index 5ba2dac5e67c..efb467b10a71 100644 --- a/fs/xfs/libxfs/xfs_alloc_btree.c +++ b/fs/xfs/libxfs/xfs_alloc_btree.c @@ -421,13 +421,17 @@ xfs_allocbt_init_cursor( ASSERT(btnum == XFS_BTNUM_BNO || btnum == XFS_BTNUM_CNT); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_btnum = btnum; cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_allocbt_ops; + if (btnum == XFS_BTNUM_BNO) + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtb_2); + else + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_abtc_2); if (btnum == XFS_BTNUM_CNT) { cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 8ea91f363093..2852521fc8ec 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -253,6 +253,7 @@ xfs_attr3_leaf_verify( { struct xfs_mount *mp = bp->b_target->bt_mount; struct xfs_attr_leafblock *leaf = bp->b_addr; + struct xfs_perag *pag = bp->b_pag; struct xfs_attr3_icleaf_hdr ichdr; xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &ichdr, leaf); @@ -273,7 +274,12 @@ xfs_attr3_leaf_verify( if (ichdr.magic != XFS_ATTR_LEAF_MAGIC) return false; } - if (ichdr.count == 0) + /* + * In recovery there is a transient state where count == 0 is valid + * because we may have transitioned an empty shortform attr to a leaf + * if the attr didn't fit in shortform.
+ */ + if (pag && pag->pagf_init && ichdr.count == 0) return false; /* XXX: need to range check rest of attr header values */ diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index 4f2aed04f827..f7dda0c237b0 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -51,7 +51,7 @@ int xfs_attr_shortform_getvalue(struct xfs_da_args *args); int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); -int xfs_attr_shortform_bytesfit(xfs_inode_t *dp, int bytes); +int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); void xfs_attr_fork_remove(struct xfs_inode *ip, struct xfs_trans *tp); /* @@ -77,7 +77,7 @@ int xfs_attr3_leaf_add(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); int xfs_attr3_leaf_remove(struct xfs_buf *leaf_buffer, struct xfs_da_args *args); -int xfs_attr3_leaf_list_int(struct xfs_buf *bp, +void xfs_attr3_leaf_list_int(struct xfs_buf *bp, struct xfs_attr_list_context *context); /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index c6eb21940783..44773c9eb957 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -49,6 +49,8 @@ #include "xfs_rmap.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" +#include "xfs_rmap_btree.h" +#include "xfs_icache.h" kmem_zone_t *xfs_bmap_free_item_zone; @@ -190,8 +192,12 @@ xfs_bmap_worst_indlen( int maxrecs; /* maximum record count at this level */ xfs_mount_t *mp; /* mount structure */ xfs_filblks_t rval; /* return value */ + xfs_filblks_t orig_len; mp = ip->i_mount; + + /* Calculate the worst-case size of the bmbt. */ + orig_len = len; maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); @@ -199,12 +205,20 @@ xfs_bmap_worst_indlen( len += maxrecs - 1; do_div(len, maxrecs); rval += len; - if (len == 1) - return rval + XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - + if (len == 1) { + rval += XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) - level - 1; + break; + } if (level == 0) maxrecs = mp->m_bmap_dmxr[1]; } + + /* Calculate the worst-case size of the rmapbt. 
*/ + if (xfs_sb_version_hasrmapbt(&mp->m_sb)) + rval += 1 + xfs_rmapbt_calc_size(mp, orig_len) + + mp->m_rmap_maxlevels; + return rval; } @@ -504,7 +518,7 @@ void xfs_bmap_trace_exlist( xfs_inode_t *ip, /* incore inode pointer */ xfs_extnum_t cnt, /* count of entries in the list */ - int whichfork, /* data or attr fork */ + int whichfork, /* data or attr or cow fork */ unsigned long caller_ip) { xfs_extnum_t idx; /* extent record index */ @@ -513,11 +527,13 @@ xfs_bmap_trace_exlist( if (whichfork == XFS_ATTR_FORK) state |= BMAP_ATTRFORK; + else if (whichfork == XFS_COW_FORK) + state |= BMAP_COWFORK; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT(cnt == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT(cnt == xfs_iext_count(ifp)); for (idx = 0; idx < cnt; idx++) - trace_xfs_extlist(ip, idx, whichfork, caller_ip); + trace_xfs_extlist(ip, idx, state, caller_ip); } /* @@ -811,7 +827,7 @@ try_another_ag: XFS_BTREE_LONG_PTRS); arp = XFS_BMBT_REC_ADDR(mp, ablock, 1); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); for (cnt = i = 0; i < nextents; i++) { ep = xfs_iext_get_ext(ifp, i); if (!isnullstartblock(xfs_bmbt_get_startblock(ep))) { @@ -1137,6 +1153,10 @@ xfs_bmap_add_attrfork( goto trans_cancel; if (XFS_IFORK_Q(ip)) goto trans_cancel; + if (ip->i_d.di_anextents != 0) { + error = -EFSCORRUPTED; + goto trans_cancel; + } if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS) { /* * For inodes coming from pre-6.2 filesystems. @@ -1144,7 +1164,6 @@ xfs_bmap_add_attrfork( ASSERT(ip->i_d.di_aformat == 0); ip->i_d.di_aformat = XFS_DINODE_FMT_EXTENTS; } - ASSERT(ip->i_d.di_anextents == 0); xfs_trans_ijoin(tp, ip, 0); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1296,7 +1315,7 @@ xfs_bmap_read_extents( /* * Here with bp and block set to the leftmost leaf node in the tree. */ - room = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + room = xfs_iext_count(ifp); i = 0; /* * Loop over all leaf nodes. Copy information to the extent records. @@ -1361,8 +1380,9 @@ xfs_bmap_read_extents( return error; block = XFS_BUF_TO_BLOCK(bp); } - ASSERT(i == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))); - ASSERT(i == XFS_IFORK_NEXTENTS(ip, whichfork)); + if (i != XFS_IFORK_NEXTENTS(ip, whichfork)) + return -EFSCORRUPTED; + ASSERT(i == xfs_iext_count(ifp)); XFS_BMAP_TRACE_EXLIST(ip, i, whichfork); return 0; error0: @@ -1370,97 +1390,6 @@ error0: return -EFSCORRUPTED; } - -/* - * Search the extent records for the entry containing block bno. - * If bno lies in a hole, point to the next entry. If bno lies - * past eof, *eofp will be set, and *prevp will contain the last - * entry (null if none). Else, *lastxp will be set to the index - * of the found entry; *gotp will contain the entry. - */ -STATIC xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_multi_extents( - xfs_ifork_t *ifp, /* inode fork pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - xfs_extnum_t lastx; /* last extent index */ - - /* - * Initialize the extent entry structure to catch access to - * uninitialized br_startblock field. 
- */ - gotp->br_startoff = 0xffa5a5a5a5a5a5a5LL; - gotp->br_blockcount = 0xa55a5a5a5a5a5a5aLL; - gotp->br_state = XFS_EXT_INVALID; - gotp->br_startblock = 0xffffa5a5a5a5a5a5LL; - prevp->br_startoff = NULLFILEOFF; - - ep = xfs_iext_bno_to_ext(ifp, bno, &lastx); - if (lastx > 0) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx - 1), prevp); - } - if (lastx < (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))) { - xfs_bmbt_get_all(ep, gotp); - *eofp = 0; - } else { - if (lastx > 0) { - *gotp = *prevp; - } - *eofp = 1; - ep = NULL; - } - *lastxp = lastx; - return ep; -} - -/* - * Search the extents list for the inode, for the extent containing bno. - * If bno lies in a hole, point to the next entry. If bno lies past eof, - * *eofp will be set, and *prevp will contain the last entry (null if none). - * Else, *lastxp will be set to the index of the found - * entry; *gotp will contain the entry. - */ -xfs_bmbt_rec_host_t * /* pointer to found extent entry */ -xfs_bmap_search_extents( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_fileoff_t bno, /* block number searched for */ - int fork, /* data or attr fork */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ - - XFS_STATS_INC(ip->i_mount, xs_look_exlist); - ifp = XFS_IFORK_PTR(ip, fork); - - ep = xfs_bmap_search_multi_extents(ifp, bno, eofp, lastxp, gotp, prevp); - - if (unlikely(!(gotp->br_startblock) && (*lastxp != NULLEXTNUM) && - !(XFS_IS_REALTIME_INODE(ip) && fork == XFS_DATA_FORK))) { - xfs_alert_tag(ip->i_mount, XFS_PTAG_FSBLOCK_ZERO, - "Access to block zero in inode %llu " - "start_block: %llx start_off: %llx " - "blkcnt: %llx extent-state: %x lastx: %x", - (unsigned long long)ip->i_ino, - (unsigned long long)gotp->br_startblock, - (unsigned long long)gotp->br_startoff, - (unsigned long long)gotp->br_blockcount, - gotp->br_state, *lastxp); - *lastxp = NULLEXTNUM; - *eofp = 1; - return NULL; - } - return ep; -} - /* * Returns the file-relative block number of the first unused block(s) * in the file with at least "len" logically contiguous blocks free. 
@@ -1497,7 +1426,7 @@ xfs_bmap_first_unused( (error = xfs_iread_extents(tp, ip, whichfork))) return error; lowest = *first_unused; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); for (idx = 0, lastaddr = 0, max = lowest; idx < nextents; idx++) { xfs_bmbt_rec_host_t *ep = xfs_iext_get_ext(ifp, idx); off = xfs_bmbt_get_startoff(ep); @@ -1523,44 +1452,44 @@ xfs_bmap_first_unused( */ int /* error */ xfs_bmap_last_before( - xfs_trans_t *tp, /* transaction pointer */ - xfs_inode_t *ip, /* incore inode */ - xfs_fileoff_t *last_block, /* last block */ - int whichfork) /* data or attr fork */ + struct xfs_trans *tp, /* transaction pointer */ + struct xfs_inode *ip, /* incore inode */ + xfs_fileoff_t *last_block, /* last block */ + int whichfork) /* data or attr fork */ { - xfs_fileoff_t bno; /* input file offset */ - int eof; /* hit end of file */ - xfs_bmbt_rec_host_t *ep; /* pointer to last extent */ - int error; /* error return value */ - xfs_bmbt_irec_t got; /* current extent value */ - xfs_ifork_t *ifp; /* inode fork pointer */ - xfs_extnum_t lastx; /* last extent used */ - xfs_bmbt_irec_t prev; /* previous extent value */ + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, whichfork); + struct xfs_bmbt_irec got; + xfs_extnum_t idx; + int error; - if (XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_BTREE && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS && - XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL) - return -EIO; - if (XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL) { + switch (XFS_IFORK_FORMAT(ip, whichfork)) { + case XFS_DINODE_FMT_LOCAL: *last_block = 0; return 0; + case XFS_DINODE_FMT_BTREE: + case XFS_DINODE_FMT_EXTENTS: + break; + default: + return -EIO; } - ifp = XFS_IFORK_PTR(ip, whichfork); - if (!(ifp->if_flags & XFS_IFEXTENTS) && - (error = xfs_iread_extents(tp, ip, whichfork))) - return error; - bno = *last_block - 1; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); - if (eof || xfs_bmbt_get_startoff(ep) > bno) { - if (prev.br_startoff == NULLFILEOFF) - *last_block = 0; - else - *last_block = prev.br_startoff + prev.br_blockcount; + + if (!(ifp->if_flags & XFS_IFEXTENTS)) { + error = xfs_iread_extents(tp, ip, whichfork); + if (error) + return error; } - /* - * Otherwise *last_block is already the right answer. - */ + + if (xfs_iext_lookup_extent(ip, ifp, *last_block - 1, &idx, &got)) { + if (got.br_startoff <= *last_block - 1) + return 0; + } + + if (xfs_iext_get_extent(ifp, idx - 1, &got)) { + *last_block = got.br_startoff + got.br_blockcount; + return 0; + } + + *last_block = 0; return 0; } @@ -1582,7 +1511,7 @@ xfs_bmap_last_extent( return error; } - nextents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (nextents == 0) { *is_empty = 1; return 0; @@ -1735,7 +1664,7 @@ xfs_bmap_add_extent_delay_real( &bma->ip->i_d.di_nextents); ASSERT(bma->idx >= 0); - ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(bma->idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || (bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); @@ -1794,7 +1723,7 @@ xfs_bmap_add_extent_delay_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. 
*/ - if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (bma->idx < xfs_iext_count(ifp) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx + 1), &RIGHT); @@ -2300,7 +2229,7 @@ xfs_bmap_add_extent_unwritten_real( ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); ASSERT(*idx >= 0); - ASSERT(*idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); XFS_STATS_INC(mp, xs_add_exlist); @@ -2356,7 +2285,7 @@ xfs_bmap_add_extent_unwritten_real( * Don't set contiguous if the combined extent would be too large. * Also check for all-three-contiguous being too large. */ - if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t) - 1) { + if (*idx < xfs_iext_count(&ip->i_df) - 1) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx + 1), &RIGHT); if (isnullstartblock(RIGHT.br_startblock)) @@ -2836,7 +2765,7 @@ xfs_bmap_add_extent_hole_delay( * Check and set flags if the current (right) segment exists. * If it doesn't exist, we're converting the hole at end-of-file. */ - if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (*idx < xfs_iext_count(ifp)) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right); @@ -2966,7 +2895,7 @@ xfs_bmap_add_extent_hole_real( ifp = XFS_IFORK_PTR(bma->ip, whichfork); ASSERT(bma->idx >= 0); - ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(bma->idx <= xfs_iext_count(ifp)); ASSERT(!isnullstartblock(new->br_startblock)); ASSERT(!bma->cur || !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL)); @@ -2992,7 +2921,7 @@ xfs_bmap_add_extent_hole_real( * Check and set flags if this segment has a current value. * Not true if we're inserting into the "hole" at eof. */ - if (bma->idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) { + if (bma->idx < xfs_iext_count(ifp)) { state |= BMAP_RIGHT_VALID; xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx), &right); if (isnullstartblock(right.br_startblock)) @@ -3883,7 +3812,6 @@ xfs_bmap_btalloc( args.fsbno = 0; args.type = XFS_ALLOCTYPE_FIRST_AG; args.total = ap->minlen; - args.minleft = 0; if ((error = xfs_alloc_vextent(&args))) return error; ap->dfops->dop_low = true; @@ -4145,12 +4073,11 @@ xfs_bmapi_read( struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; struct xfs_bmbt_irec got; - struct xfs_bmbt_irec prev; xfs_fileoff_t obno; xfs_fileoff_t end; - xfs_extnum_t lastx; + xfs_extnum_t idx; int error; - int eof; + bool eof = false; int n = 0; int whichfork = xfs_bmapi_whichfork(flags); @@ -4190,7 +4117,8 @@ xfs_bmapi_read( return error; } - xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, &prev); + if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got)) + eof = true; end = bno + len; obno = bno; @@ -4221,10 +4149,8 @@ xfs_bmapi_read( break; /* Else go on to the next record. 
*/ - if (++lastx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, lastx), &got); - else - eof = 1; + if (!xfs_iext_get_extent(ifp, ++idx, &got)) + eof = true; } *nmap = n; return 0; @@ -4234,10 +4160,10 @@ int xfs_bmapi_reserve_delalloc( struct xfs_inode *ip, int whichfork, - xfs_fileoff_t aoff, + xfs_fileoff_t off, xfs_filblks_t len, + xfs_filblks_t prealloc, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof) { @@ -4248,10 +4174,17 @@ xfs_bmapi_reserve_delalloc( char rt = XFS_IS_REALTIME_INODE(ip); xfs_extlen_t extsz; int error; + xfs_fileoff_t aoff = off; - alen = XFS_FILBLKS_MIN(len, MAXEXTLEN); + /* + * Cap the alloc length. Keep track of prealloc so we know whether to + * tag the inode before we return. + */ + alen = XFS_FILBLKS_MIN(len + prealloc, MAXEXTLEN); if (!eof) alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + if (prealloc && alen >= len) + prealloc = alen - len; /* Figure out the extent size, adjust alen */ if (whichfork == XFS_COW_FORK) @@ -4259,7 +4192,12 @@ xfs_bmapi_reserve_delalloc( else extsz = xfs_get_extsz_hint(ip); if (extsz) { - error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof, + struct xfs_bmbt_irec prev; + + if (!xfs_iext_get_extent(ifp, *lastx - 1, &prev)) + prev.br_startoff = NULLFILEOFF; + + error = xfs_bmap_extsize_align(mp, got, &prev, extsz, rt, eof, 1, 0, &aoff, &alen); ASSERT(!error); } @@ -4312,6 +4250,16 @@ xfs_bmapi_reserve_delalloc( */ xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *lastx), got); + /* + * Tag the inode if blocks were preallocated. Note that COW fork + * preallocation can occur at the start or end of the extent, even when + * prealloc == 0, so we must also check the aligned offset and length. + */ + if (whichfork == XFS_DATA_FORK && prealloc) + xfs_inode_set_eofblocks_tag(ip); + if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) + xfs_inode_set_cowblocks_tag(ip); + ASSERT(got->br_startoff <= aoff); ASSERT(got->br_startoff + got->br_blockcount >= aoff + alen); ASSERT(isnullstartblock(got->br_startblock)); @@ -4349,7 +4297,7 @@ xfs_bmapi_allocate( if (bma->wasdel) { bma->length = (xfs_extlen_t)bma->got.br_blockcount; bma->offset = bma->got.br_startoff; - if (bma->idx != NULLEXTNUM && bma->idx) { + if (bma->idx) { xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma->idx - 1), &bma->prev); } @@ -4395,8 +4343,6 @@ xfs_bmapi_allocate( if (error) return error; - if (bma->dfops->dop_low) - bma->minleft = 0; if (bma->cur) bma->cur->bc_private.b.firstblock = *bma->firstblock; if (bma->blkno == NULLFSBLOCK) @@ -4563,7 +4509,7 @@ xfs_bmapi_write( struct xfs_ifork *ifp; struct xfs_bmalloca bma = { NULL }; /* args for xfs_bmap_alloc */ xfs_fileoff_t end; /* end of mapped file region */ - int eof; /* after the end of extents */ + bool eof = false; /* after the end of extents */ int error; /* error return */ int n; /* current extent index */ xfs_fileoff_t obno; /* old block number (offset) */ @@ -4641,12 +4587,14 @@ xfs_bmapi_write( goto error0; } - xfs_bmap_search_extents(ip, bno, whichfork, &eof, &bma.idx, &bma.got, - &bma.prev); n = 0; end = bno + len; obno = bno; + if (!xfs_iext_lookup_extent(ip, ifp, bno, &bma.idx, &bma.got)) + eof = true; + if (!xfs_iext_get_extent(ifp, bma.idx - 1, &bma.prev)) + bma.prev.br_startoff = NULLFILEOFF; bma.tp = tp; bma.ip = ip; bma.total = total; @@ -4733,11 +4681,8 @@ xfs_bmapi_write( /* Else go on to the next record. 
*/ bma.prev = bma.got; - if (++bma.idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, bma.idx), - &bma.got); - } else - eof = 1; + if (!xfs_iext_get_extent(ifp, ++bma.idx, &bma.got)) + eof = true; } *nmap = n; @@ -4885,7 +4830,7 @@ xfs_bmap_del_extent_delay( da_new = 0; ASSERT(*idx >= 0); - ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(del->br_blockcount > 0); ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); @@ -4902,8 +4847,11 @@ xfs_bmap_del_extent_delay( * sb counters as we might have to borrow some blocks for the * indirect block accounting. */ - xfs_trans_reserve_quota_nblks(NULL, ip, -((long)del->br_blockcount), 0, + error = xfs_trans_reserve_quota_nblks(NULL, ip, + -((long)del->br_blockcount), 0, isrt ? XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS); + if (error) + return error; ip->i_delayed_blks -= del->br_blockcount; if (whichfork == XFS_COW_FORK) @@ -5013,7 +4961,7 @@ xfs_bmap_del_extent_cow( got_endoff = got->br_startoff + got->br_blockcount; ASSERT(*idx >= 0); - ASSERT(*idx < ifp->if_bytes / sizeof(struct xfs_bmbt_rec)); + ASSERT(*idx <= xfs_iext_count(ifp)); ASSERT(del->br_blockcount > 0); ASSERT(got->br_startoff <= del->br_startoff); ASSERT(got_endoff >= del_endoff); @@ -5119,8 +5067,7 @@ xfs_bmap_del_extent( state |= BMAP_COWFORK; ifp = XFS_IFORK_PTR(ip, whichfork); - ASSERT((*idx >= 0) && (*idx < ifp->if_bytes / - (uint)sizeof(xfs_bmbt_rec_t))); + ASSERT((*idx >= 0) && (*idx < xfs_iext_count(ifp))); ASSERT(del->br_blockcount > 0); ep = xfs_iext_get_ext(ifp, *idx); xfs_bmbt_get_all(ep, &got); @@ -5434,8 +5381,6 @@ __xfs_bunmapi( { xfs_btree_cur_t *cur; /* bmap btree cursor */ xfs_bmbt_irec_t del; /* extent being deleted */ - int eof; /* is deleting at eof */ - xfs_bmbt_rec_host_t *ep; /* extent record pointer */ int error; /* error return value */ xfs_extnum_t extno; /* extent number in list */ xfs_bmbt_irec_t got; /* current extent record */ @@ -5445,8 +5390,6 @@ __xfs_bunmapi( int logflags; /* transaction logging flags */ xfs_extlen_t mod; /* rt extent offset */ xfs_mount_t *mp; /* mount structure */ - xfs_extnum_t nextents; /* number of file extents */ - xfs_bmbt_irec_t prev; /* previous extent record */ xfs_fileoff_t start; /* first file offset deleted */ int tmp_logflags; /* partial logging flags */ int wasdel; /* was a delayed alloc extent */ @@ -5477,8 +5420,7 @@ __xfs_bunmapi( if (!(ifp->if_flags & XFS_IFEXTENTS) && (error = xfs_iread_extents(tp, ip, whichfork))) return error; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); - if (nextents == 0) { + if (xfs_iext_count(ifp) == 0) { *rlen = 0; return 0; } @@ -5486,18 +5428,17 @@ __xfs_bunmapi( isrt = (whichfork == XFS_DATA_FORK) && XFS_IS_REALTIME_INODE(ip); start = bno; bno = start + len - 1; - ep = xfs_bmap_search_extents(ip, bno, whichfork, &eof, &lastx, &got, - &prev); /* * Check to see if the given block number is past the end of the * file, back up to the last block if so... 
*/ - if (eof) { - ep = xfs_iext_get_ext(ifp, --lastx); - xfs_bmbt_get_all(ep, &got); + if (!xfs_iext_lookup_extent(ip, ifp, bno, &lastx, &got)) { + ASSERT(lastx > 0); + xfs_iext_get_extent(ifp, --lastx, &got); bno = got.br_startoff + got.br_blockcount - 1; } + logflags = 0; if (ifp->if_flags & XFS_IFBROOT) { ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE); @@ -5528,8 +5469,7 @@ __xfs_bunmapi( if (got.br_startoff > bno) { if (--lastx < 0) break; - ep = xfs_iext_get_ext(ifp, lastx); - xfs_bmbt_get_all(ep, &got); + xfs_iext_get_extent(ifp, lastx, &got); } /* * Is the last block of this extent before the range @@ -5543,7 +5483,6 @@ __xfs_bunmapi( * Then deal with the (possibly delayed) allocated space * we found. */ - ASSERT(ep != NULL); del = got; wasdel = isnullstartblock(del.br_startblock); if (got.br_startoff < start) { @@ -5624,15 +5563,12 @@ __xfs_bunmapi( */ ASSERT(bno >= del.br_blockcount); bno -= del.br_blockcount; - if (got.br_startoff > bno) { - if (--lastx >= 0) { - ep = xfs_iext_get_ext(ifp, - lastx); - xfs_bmbt_get_all(ep, &got); - } - } + if (got.br_startoff > bno && --lastx >= 0) + xfs_iext_get_extent(ifp, lastx, &got); continue; } else if (del.br_state == XFS_EXT_UNWRITTEN) { + struct xfs_bmbt_irec prev; + /* * This one is already unwritten. * It must have a written left neighbor. @@ -5640,8 +5576,7 @@ __xfs_bunmapi( * try again. */ ASSERT(lastx > 0); - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, - lastx - 1), &prev); + xfs_iext_get_extent(ifp, lastx - 1, &prev); ASSERT(prev.br_state == XFS_EXT_NORM); ASSERT(!isnullstartblock(prev.br_startblock)); ASSERT(del.br_startblock == @@ -5739,13 +5674,9 @@ nodelete: */ if (bno != (xfs_fileoff_t)-1 && bno >= start) { if (lastx >= 0) { - ep = xfs_iext_get_ext(ifp, lastx); - if (xfs_bmbt_get_startoff(ep) > bno) { - if (--lastx >= 0) - ep = xfs_iext_get_ext(ifp, - lastx); - } - xfs_bmbt_get_all(ep, &got); + xfs_iext_get_extent(ifp, lastx, &got); + if (got.br_startoff > bno && --lastx >= 0) + xfs_iext_get_extent(ifp, lastx, &got); } extno++; } @@ -5963,7 +5894,7 @@ xfs_bmse_shift_one( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + total_extents = xfs_iext_count(ifp); xfs_bmbt_get_all(gotp, &got); @@ -6140,7 +6071,7 @@ xfs_bmap_shift_extents( * are collapsing out, so we cannot use the count of real extents here. * Instead we have to calculate it from the incore fork. */ - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + total_extents = xfs_iext_count(ifp); if (total_extents == 0) { *done = 1; goto del_cursor; @@ -6200,7 +6131,7 @@ xfs_bmap_shift_extents( * count can change. Update the total and grade the next record. 
*/ if (direction == SHIFT_LEFT) { - total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + total_extents = xfs_iext_count(ifp); stop_extent = total_extents; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 7cae6ec27fa6..cecd094404cc 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -237,14 +237,9 @@ int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, struct xfs_defer_ops *dfops, enum shift_direction direction, int num_exts); int xfs_bmap_split_extent(struct xfs_inode *ip, xfs_fileoff_t split_offset); -struct xfs_bmbt_rec_host * - xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno, - int fork, int *eofp, xfs_extnum_t *lastxp, - struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp); int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, - xfs_fileoff_t aoff, xfs_filblks_t len, - struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev, - xfs_extnum_t *lastx, int eof); + xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, + struct xfs_bmbt_irec *got, xfs_extnum_t *lastx, int eof); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c index 8007d2ba9aef..d9be241fc86f 100644 --- a/fs/xfs/libxfs/xfs_bmap_btree.c +++ b/fs/xfs/libxfs/xfs_bmap_btree.c @@ -502,12 +502,11 @@ try_another_ag: if (args.fsbno == NULLFSBLOCK && args.minleft) { /* * Could not find an AG with enough free space to satisfy - * a full btree split. Try again without minleft and if + * a full btree split. Try again and if * successful activate the lowspace algorithm. */ args.fsbno = 0; args.type = XFS_ALLOCTYPE_FIRST_AG; - args.minleft = 0; error = xfs_alloc_vextent(&args); if (error) goto error0; @@ -796,13 +795,14 @@ xfs_bmbt_init_cursor( struct xfs_btree_cur *cur; ASSERT(whichfork != XFS_COW_FORK); - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; cur->bc_nlevels = be16_to_cpu(ifp->if_broot->bb_level) + 1; cur->bc_btnum = XFS_BTNUM_BMAP; cur->bc_blocklog = mp->m_sb.sb_blocklog; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_bmbt_2); cur->bc_ops = &xfs_bmbt_ops; cur->bc_flags = XFS_BTREE_LONG_PTRS | XFS_BTREE_ROOT_IN_INODE; diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index 0e80993c8a59..21e6a6ab6b9a 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -1769,8 +1769,28 @@ xfs_btree_lookup_get_block( if (error) return error; + /* Check the inode owner since the verifiers don't. */ + if (xfs_sb_version_hascrc(&cur->bc_mp->m_sb) && + (cur->bc_flags & XFS_BTREE_LONG_PTRS) && + be64_to_cpu((*blkp)->bb_u.l.bb_owner) != + cur->bc_private.b.ip->i_ino) + goto out_bad; + + /* Did we get the level we were looking for? */ + if (be16_to_cpu((*blkp)->bb_level) != level) + goto out_bad; + + /* Check that internal nodes have at least one record. 
*/ + if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0) + goto out_bad; + xfs_btree_setbuf(cur, level, bp); return 0; + +out_bad: + *blkp = NULL; + xfs_trans_brelse(cur->bc_tp, bp); + return -EFSCORRUPTED; } /* diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h index c2b01d1c79ee..b69b947c4c1b 100644 --- a/fs/xfs/libxfs/xfs_btree.h +++ b/fs/xfs/libxfs/xfs_btree.h @@ -96,46 +96,10 @@ union xfs_btree_rec { /* * Generic stats interface */ -#define __XFS_BTREE_STATS_INC(mp, type, stat) \ - XFS_STATS_INC(mp, xs_ ## type ## _2_ ## stat) #define XFS_BTREE_STATS_INC(cur, stat) \ -do { \ - struct xfs_mount *__mp = cur->bc_mp; \ - switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: __XFS_BTREE_STATS_INC(__mp, abtb, stat); break; \ - case XFS_BTNUM_CNT: __XFS_BTREE_STATS_INC(__mp, abtc, stat); break; \ - case XFS_BTNUM_BMAP: __XFS_BTREE_STATS_INC(__mp, bmbt, stat); break; \ - case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \ - case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \ - case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \ - case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \ - case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ - } \ -} while (0) - -#define __XFS_BTREE_STATS_ADD(mp, type, stat, val) \ - XFS_STATS_ADD(mp, xs_ ## type ## _2_ ## stat, val) -#define XFS_BTREE_STATS_ADD(cur, stat, val) \ -do { \ - struct xfs_mount *__mp = cur->bc_mp; \ - switch (cur->bc_btnum) { \ - case XFS_BTNUM_BNO: \ - __XFS_BTREE_STATS_ADD(__mp, abtb, stat, val); break; \ - case XFS_BTNUM_CNT: \ - __XFS_BTREE_STATS_ADD(__mp, abtc, stat, val); break; \ - case XFS_BTNUM_BMAP: \ - __XFS_BTREE_STATS_ADD(__mp, bmbt, stat, val); break; \ - case XFS_BTNUM_INO: \ - __XFS_BTREE_STATS_ADD(__mp, ibt, stat, val); break; \ - case XFS_BTNUM_FINO: \ - __XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \ - case XFS_BTNUM_RMAP: \ - __XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \ - case XFS_BTNUM_REFC: \ - __XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \ - case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \ - } \ -} while (0) + XFS_STATS_INC_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat) +#define XFS_BTREE_STATS_ADD(cur, stat, val) \ + XFS_STATS_ADD_OFF((cur)->bc_mp, (cur)->bc_statoff + __XBTS_ ## stat, val) #define XFS_BTREE_MAXLEVELS 9 /* max of all btrees */ @@ -253,6 +217,7 @@ typedef struct xfs_btree_cur __uint8_t bc_nlevels; /* number of levels in the tree */ __uint8_t bc_blocklog; /* log2(blocksize) of btree blocks */ xfs_btnum_t bc_btnum; /* identifies which btree type */ + int bc_statoff; /* offset of btree stats array */ union { struct { /* needed for BNO, CNT, INO */ struct xfs_buf *agbp; /* agf/agi buffer pointer */
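The fs/xfs/libxfs/xfs_cksum.h rework below splits CRC generation into a verification-safe variant and an in-place update variant: verification must not write to a buffer that other contexts may be reading, so the checksum field cannot simply be zeroed in place. The hunk shows only the local declarations of xfs_start_cksum_safe(); the elided body presumably feeds the CRC engine in three pieces, substituting zeroes for the embedded field, along these lines:

    /* hedged sketch; only the declarations appear in the hunk below */
    __uint32_t zero = 0;
    __uint32_t crc;

    /* bytes up to, but not including, the checksum field */
    crc = crc32c(XFS_CRC_SEED, buffer, cksum_offset);
    /* the checksum field itself, treated as zero */
    crc = crc32c(crc, &zero, sizeof(zero));
    /* the remainder of the buffer */
    return crc32c(crc, buffer + cksum_offset + sizeof(zero),
                  length - cksum_offset - sizeof(zero));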
diff --git a/fs/xfs/libxfs/xfs_cksum.h b/fs/xfs/libxfs/xfs_cksum.h index fad1676ad8cd..a416c7cb23ea 100644 --- a/fs/xfs/libxfs/xfs_cksum.h +++ b/fs/xfs/libxfs/xfs_cksum.h @@ -6,10 +6,11 @@ /* * Calculate the intermediate checksum for a buffer that has the CRC field * inside it. The offset of the 32bit crc fields is passed as the - * cksum_offset parameter. + * cksum_offset parameter. We do not modify the buffer during verification, + * hence we have to split the CRC calculation across the cksum_offset. */ static inline __uint32_t -xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) +xfs_start_cksum_safe(char *buffer, size_t length, unsigned long cksum_offset) { __uint32_t zero = 0; __uint32_t crc; @@ -26,6 +27,20 @@ xfs_start_cksum(char *buffer, size_t length, unsigned long cksum_offset) } /* + * Fast CRC method where the buffer is modified. Callers must have exclusive + * access to the buffer while the calculation takes place. + */ +static inline __uint32_t +xfs_start_cksum_update(char *buffer, size_t length, unsigned long cksum_offset) +{ + /* zero the CRC field */ + *(__le32 *)(buffer + cksum_offset) = 0; + + /* single pass CRC calculation for the entire buffer */ + return crc32c(XFS_CRC_SEED, buffer, length); +} + +/* * Convert the intermediate checksum to the final ondisk format. * * The CRC32c calculation uses LE format even on BE machines, but returns the @@ -40,11 +55,14 @@ xfs_end_cksum(__uint32_t crc) /* * Helper to generate the checksum for a buffer. + * + * This modifies the buffer temporarily - callers must have exclusive + * access to the buffer while the calculation takes place. */ static inline void xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + __uint32_t crc = xfs_start_cksum_update(buffer, length, cksum_offset); *(__le32 *)(buffer + cksum_offset) = xfs_end_cksum(crc); } @@ -55,7 +73,7 @@ xfs_update_cksum(char *buffer, size_t length, unsigned long cksum_offset) static inline int xfs_verify_cksum(char *buffer, size_t length, unsigned long cksum_offset) { - __uint32_t crc = xfs_start_cksum(buffer, length, cksum_offset); + __uint32_t crc = xfs_start_cksum_safe(buffer, length, cksum_offset); return *(__le32 *)(buffer + cksum_offset) == xfs_end_cksum(crc); } diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 20a96dd5af7e..2f389d366e93 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -36,21 +36,29 @@ struct xfs_name xfs_name_dotdot = { (unsigned char *)"..", 2, XFS_DIR3_FT_DIR }; /* - * @mode, if set, indicates that the type field needs to be set up. - * This uses the transformation from file mode to DT_* as defined in linux/fs.h - * for file type specification. This will be propagated into the directory - * structure if appropriate for the given operation and filesystem config. + * Convert inode mode to directory entry filetype */ -const unsigned char xfs_mode_to_ftype[S_IFMT >> S_SHIFT] = { - [0] = XFS_DIR3_FT_UNKNOWN, - [S_IFREG >> S_SHIFT] = XFS_DIR3_FT_REG_FILE, - [S_IFDIR >> S_SHIFT] = XFS_DIR3_FT_DIR, - [S_IFCHR >> S_SHIFT] = XFS_DIR3_FT_CHRDEV, - [S_IFBLK >> S_SHIFT] = XFS_DIR3_FT_BLKDEV, - [S_IFIFO >> S_SHIFT] = XFS_DIR3_FT_FIFO, - [S_IFSOCK >> S_SHIFT] = XFS_DIR3_FT_SOCK, - [S_IFLNK >> S_SHIFT] = XFS_DIR3_FT_SYMLINK, -}; +unsigned char xfs_mode_to_ftype(int mode) +{ + switch (mode & S_IFMT) { + case S_IFREG: + return XFS_DIR3_FT_REG_FILE; + case S_IFDIR: + return XFS_DIR3_FT_DIR; + case S_IFCHR: + return XFS_DIR3_FT_CHRDEV; + case S_IFBLK: + return XFS_DIR3_FT_BLKDEV; + case S_IFIFO: + return XFS_DIR3_FT_FIFO; + case S_IFSOCK: + return XFS_DIR3_FT_SOCK; + case S_IFLNK: + return XFS_DIR3_FT_SYMLINK; + default: + return XFS_DIR3_FT_UNKNOWN; + } +} /* * ASCII case-insensitive (ie.
A-Z) support for directories that was @@ -93,7 +101,7 @@ xfs_ascii_ci_compname( return result; } -static struct xfs_nameops xfs_ascii_ci_nameops = { +static const struct xfs_nameops xfs_ascii_ci_nameops = { .hashname = xfs_ascii_ci_hashname, .compname = xfs_ascii_ci_compname, }; @@ -631,7 +639,8 @@ xfs_dir2_isblock( if ((rval = xfs_bmap_last_offset(args->dp, &last, XFS_DATA_FORK))) return rval; rval = XFS_FSB_TO_B(args->dp->i_mount, last) == args->geo->blksize; - ASSERT(rval == 0 || args->dp->i_d.di_size == args->geo->blksize); + if (rval != 0 && args->dp->i_d.di_size != args->geo->blksize) + return -EFSCORRUPTED; *vp = rval; return 0; } diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index becc926c3e3d..d6e6d9d16f6c 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -18,6 +18,9 @@ #ifndef __XFS_DIR2_H__ #define __XFS_DIR2_H__ +#include "xfs_da_format.h" +#include "xfs_da_btree.h" + struct xfs_defer_ops; struct xfs_da_args; struct xfs_inode; @@ -32,10 +35,9 @@ struct xfs_dir2_data_unused; extern struct xfs_name xfs_name_dotdot; /* - * directory filetype conversion tables. + * Convert inode mode to directory entry filetype */ -#define S_SHIFT 12 -extern const unsigned char xfs_mode_to_ftype[]; +extern unsigned char xfs_mode_to_ftype(int mode); /* * directory operations vector for encode/decode routines @@ -157,6 +159,9 @@ extern int xfs_dir2_isleaf(struct xfs_da_args *args, int *r); extern int xfs_dir2_shrink_inode(struct xfs_da_args *args, xfs_dir2_db_t db, struct xfs_buf *bp); +extern void xfs_dir2_data_freescan_int(struct xfs_da_geometry *geo, + const struct xfs_dir_ops *ops, + struct xfs_dir2_data_hdr *hdr, int *loghead); extern void xfs_dir2_data_freescan(struct xfs_inode *dp, struct xfs_dir2_data_hdr *hdr, int *loghead); extern void xfs_dir2_data_log_entry(struct xfs_da_args *args, @@ -177,6 +182,8 @@ extern struct xfs_dir2_data_free *xfs_dir2_data_freefind( struct xfs_dir2_data_hdr *hdr, struct xfs_dir2_data_free *bf, struct xfs_dir2_data_unused *dup); +extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); + extern const struct xfs_buf_ops xfs_dir3_block_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leafn_buf_ops; extern const struct xfs_buf_ops xfs_dir3_leaf1_buf_ops; diff --git a/fs/xfs/libxfs/xfs_dir2_data.c b/fs/xfs/libxfs/xfs_dir2_data.c index 725fc7841fde..d478065b9544 100644 --- a/fs/xfs/libxfs/xfs_dir2_data.c +++ b/fs/xfs/libxfs/xfs_dir2_data.c @@ -329,7 +329,7 @@ xfs_dir3_data_read( err = xfs_da_read_buf(tp, dp, bno, mapped_bno, bpp, XFS_DATA_FORK, &xfs_dir3_data_buf_ops); - if (!err && tp) + if (!err && tp && *bpp) xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_DIR_DATA_BUF); return err; } @@ -505,8 +505,9 @@ xfs_dir2_data_freeremove( * Given a data block, reconstruct its bestfree map. */ void -xfs_dir2_data_freescan( - struct xfs_inode *dp, +xfs_dir2_data_freescan_int( + struct xfs_da_geometry *geo, + const struct xfs_dir_ops *ops, struct xfs_dir2_data_hdr *hdr, int *loghead) { @@ -516,7 +517,6 @@ xfs_dir2_data_freescan( struct xfs_dir2_data_free *bf; char *endp; /* end of block's data */ char *p; /* current entry pointer */ - struct xfs_da_geometry *geo = dp->i_mount->m_dir_geo; ASSERT(hdr->magic == cpu_to_be32(XFS_DIR2_DATA_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_DATA_MAGIC) || @@ -526,13 +526,13 @@ xfs_dir2_data_freescan( /* * Start by clearing the table. 
*/ - bf = dp->d_ops->data_bestfree_p(hdr); + bf = ops->data_bestfree_p(hdr); memset(bf, 0, sizeof(*bf) * XFS_DIR2_DATA_FD_COUNT); *loghead = 1; /* * Set up pointers. */ - p = (char *)dp->d_ops->data_entry_p(hdr); + p = (char *)ops->data_entry_p(hdr); if (hdr->magic == cpu_to_be32(XFS_DIR2_BLOCK_MAGIC) || hdr->magic == cpu_to_be32(XFS_DIR3_BLOCK_MAGIC)) { btp = xfs_dir2_block_tail_p(geo, hdr); @@ -559,12 +559,22 @@ xfs_dir2_data_freescan( else { dep = (xfs_dir2_data_entry_t *)p; ASSERT((char *)dep - (char *)hdr == - be16_to_cpu(*dp->d_ops->data_entry_tag_p(dep))); - p += dp->d_ops->data_entsize(dep->namelen); + be16_to_cpu(*ops->data_entry_tag_p(dep))); + p += ops->data_entsize(dep->namelen); } } } +void +xfs_dir2_data_freescan( + struct xfs_inode *dp, + struct xfs_dir2_data_hdr *hdr, + int *loghead) +{ + return xfs_dir2_data_freescan_int(dp->i_mount->m_dir_geo, dp->d_ops, + hdr, loghead); +} + /* * Initialize a data block at the given block number in the directory. * Give back the buffer for the created block. diff --git a/fs/xfs/libxfs/xfs_dir2_priv.h b/fs/xfs/libxfs/xfs_dir2_priv.h index ef9f6ead96a4..d04547fcf274 100644 --- a/fs/xfs/libxfs/xfs_dir2_priv.h +++ b/fs/xfs/libxfs/xfs_dir2_priv.h @@ -21,7 +21,6 @@ struct dir_context; /* xfs_dir2.c */ -extern int xfs_dir_ino_validate(struct xfs_mount *mp, xfs_ino_t ino); extern int xfs_dir2_grow_inode(struct xfs_da_args *args, int space, xfs_dir2_db_t *dbp); extern int xfs_dir_cilookup_result(struct xfs_da_args *args, diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index 51b4e0de1fdc..f272abff11e1 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -2344,7 +2344,8 @@ xfs_imap( imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, agbno); imap->im_len = XFS_FSB_TO_BB(mp, 1); - imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + imap->im_boffset = (unsigned short)(offset << + mp->m_sb.sb_inodelog); return 0; } @@ -2372,7 +2373,7 @@ out_map: imap->im_blkno = XFS_AGB_TO_DADDR(mp, agno, cluster_agbno); imap->im_len = XFS_FSB_TO_BB(mp, blks_per_cluster); - imap->im_boffset = (ushort)(offset << mp->m_sb.sb_inodelog); + imap->im_boffset = (unsigned short)(offset << mp->m_sb.sb_inodelog); /* * If the inode number maps to a block outside the bounds @@ -2450,8 +2451,6 @@ xfs_ialloc_log_agi( ASSERT(agi->agi_magicnum == cpu_to_be32(XFS_AGI_MAGIC)); #endif - xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGI_BUF); - /* * Compute byte offsets for the first and last fields in the first * region and log the agi buffer. This only logs up through @@ -2512,8 +2511,15 @@ xfs_agi_verify( if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return false; - if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + if (be32_to_cpu(agi->agi_level) < 1 || + be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + return false; + + if (xfs_sb_version_hasfinobt(&mp->m_sb) && + (be32_to_cpu(agi->agi_free_level) < 1 || + be32_to_cpu(agi->agi_free_level) > XFS_BTREE_MAXLEVELS)) return false; + /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. 
growfs ensures we can't @@ -2592,6 +2598,8 @@ xfs_read_agi( XFS_FSS_TO_BB(mp, 1), 0, bpp, &xfs_agi_buf_ops); if (error) return error; + if (tp) + xfs_trans_buf_set_type(tp, *bpp, XFS_BLFT_AGI_BUF); xfs_buf_set_ref(*bpp, XFS_AGI_REF); return 0; diff --git a/fs/xfs/libxfs/xfs_ialloc_btree.c b/fs/xfs/libxfs/xfs_ialloc_btree.c index eab68ae2e011..0fd086d03d41 100644 --- a/fs/xfs/libxfs/xfs_ialloc_btree.c +++ b/fs/xfs/libxfs/xfs_ialloc_btree.c @@ -357,7 +357,7 @@ xfs_inobt_init_cursor( struct xfs_agi *agi = XFS_BUF_TO_AGI(agbp); struct xfs_btree_cur *cur; - cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP); + cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS); cur->bc_tp = tp; cur->bc_mp = mp; @@ -365,9 +365,11 @@ xfs_inobt_init_cursor( if (btnum == XFS_BTNUM_INO) { cur->bc_nlevels = be32_to_cpu(agi->agi_level); cur->bc_ops = &xfs_inobt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_ibt_2); } else { cur->bc_nlevels = be32_to_cpu(agi->agi_free_level); cur->bc_ops = &xfs_finobt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_fibt_2); } cur->bc_blocklog = mp->m_sb.sb_blocklog; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 134424fac434..d93f9d918cfc 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -29,6 +29,7 @@ #include "xfs_icache.h" #include "xfs_trans.h" #include "xfs_ialloc.h" +#include "xfs_dir2.h" /* * Check that none of the inode's in the buffer have a next @@ -383,15 +384,28 @@ xfs_log_dinode_to_disk( static bool xfs_dinode_verify( struct xfs_mount *mp, - struct xfs_inode *ip, + xfs_ino_t ino, struct xfs_dinode *dip) { + uint16_t mode; uint16_t flags; uint64_t flags2; if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC)) return false; + /* don't allow invalid i_size */ + if (be64_to_cpu(dip->di_size) & (1ULL << 63)) + return false; + + mode = be16_to_cpu(dip->di_mode); + if (mode && xfs_mode_to_ftype(mode) == XFS_DIR3_FT_UNKNOWN) + return false; + + /* No zero-length symlinks/dirs. 
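The new xfs_dinode_verify() checks above run even for v1/v2 inodes: reject a di_size with the sign bit set, a nonzero mode whose type bits map to no known directory ftype, and (completed just below) zero-length symlinks and directories. A self-contained sketch of the same predicate, with a hypothetical mode_to_ftype() standing in for xfs_mode_to_ftype():

        #include <stdbool.h>
        #include <stdint.h>
        #include <sys/stat.h>

        /* Hypothetical analogue of xfs_mode_to_ftype(): 0 == unknown. */
        static int mode_to_ftype(uint16_t mode)
        {
                switch (mode & S_IFMT) {
                case S_IFREG: case S_IFDIR: case S_IFLNK: case S_IFCHR:
                case S_IFBLK: case S_IFIFO: case S_IFSOCK:
                        return (mode & S_IFMT) >> 12;
                default:
                        return 0;
                }
        }

        static bool dinode_sane(uint16_t mode, uint64_t size)
        {
                if (size & (1ULL << 63))        /* i_size must be non-negative */
                        return false;
                if (mode && mode_to_ftype(mode) == 0)
                        return false;           /* garbage S_IFMT bits */
                if ((S_ISLNK(mode) || S_ISDIR(mode)) && size == 0)
                        return false;           /* no zero-length symlinks/dirs */
                return true;
        }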
*/ + if ((S_ISLNK(mode) || S_ISDIR(mode)) && dip->di_size == 0) + return false; + /* only version 3 or greater inodes are extensively verified here */ if (dip->di_version < 3) return true; @@ -401,7 +415,7 @@ xfs_dinode_verify( if (!xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF)) return false; - if (be64_to_cpu(dip->di_ino) != ip->i_ino) + if (be64_to_cpu(dip->di_ino) != ino) return false; if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid)) return false; @@ -436,7 +450,7 @@ xfs_dinode_calc_crc( return; ASSERT(xfs_sb_version_hascrc(&mp->m_sb)); - crc = xfs_start_cksum((char *)dip, mp->m_sb.sb_inodesize, + crc = xfs_start_cksum_update((char *)dip, mp->m_sb.sb_inodesize, XFS_DINODE_CRC_OFF); dip->di_crc = xfs_end_cksum(crc); } @@ -493,7 +507,7 @@ xfs_iread( return error; /* even unallocated inodes are verified */ - if (!xfs_dinode_verify(mp, ip, dip)) { + if (!xfs_dinode_verify(mp, ip->i_ino, dip)) { xfs_alert(mp, "%s: validation failed for inode %lld failed", __func__, ip->i_ino); diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h index 3cfe12a4f58a..6848a0afbce7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.h +++ b/fs/xfs/libxfs/xfs_inode_buf.h @@ -58,8 +58,8 @@ struct xfs_icdinode { */ struct xfs_imap { xfs_daddr_t im_blkno; /* starting BB of inode chunk */ - ushort im_len; /* length in BBs of inode chunk */ - ushort im_boffset; /* inode offset in block in bytes */ + unsigned short im_len; /* length in BBs of inode chunk */ + unsigned short im_boffset; /* inode offset in block in bytes */ }; int xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *, diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c index 5dd56d3dbb3a..222e103356c6 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.c +++ b/fs/xfs/libxfs/xfs_inode_fork.c @@ -775,6 +775,13 @@ xfs_idestroy_fork( } } +/* Count number of incore extents based on if_bytes */ +xfs_extnum_t +xfs_iext_count(struct xfs_ifork *ifp) +{ + return ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); +} + /* * Convert in-core extents to on-disk form * @@ -803,7 +810,7 @@ xfs_iextents_copy( ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED)); ASSERT(ifp->if_bytes > 0); - nrecs = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nrecs = xfs_iext_count(ifp); XFS_BMAP_TRACE_EXLIST(ip, nrecs, whichfork); ASSERT(nrecs > 0); @@ -941,7 +948,7 @@ xfs_iext_get_ext( xfs_extnum_t idx) /* index of target extent */ { ASSERT(idx >= 0); - ASSERT(idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); + ASSERT(idx < xfs_iext_count(ifp)); if ((ifp->if_flags & XFS_IFEXTIREC) && (idx == 0)) { return ifp->if_u1.if_ext_irec->er_extbuf; @@ -1017,7 +1024,7 @@ xfs_iext_add( int new_size; /* size of extents after adding */ xfs_extnum_t nextents; /* number of extents in file */ - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT((idx >= 0) && (idx <= nextents)); byte_diff = ext_diff * sizeof(xfs_bmbt_rec_t); new_size = ifp->if_bytes + byte_diff; @@ -1241,7 +1248,7 @@ xfs_iext_remove( trace_xfs_iext_remove(ip, idx, state, _RET_IP_); ASSERT(ext_diff > 0); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); new_size = (nextents - ext_diff) * sizeof(xfs_bmbt_rec_t); if (new_size == 0) { @@ -1270,7 +1277,7 @@ xfs_iext_remove_inline( ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); ASSERT(idx < XFS_INLINE_EXTS); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT(((nextents - ext_diff) > 0) && (nextents - 
ext_diff) < XFS_INLINE_EXTS); @@ -1309,7 +1316,7 @@ xfs_iext_remove_direct( ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); new_size = ifp->if_bytes - (ext_diff * sizeof(xfs_bmbt_rec_t)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (new_size == 0) { xfs_iext_destroy(ifp); @@ -1546,7 +1553,7 @@ xfs_iext_indirect_to_direct( int size; /* size of file extents */ ASSERT(ifp->if_flags & XFS_IFEXTIREC); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT(nextents <= XFS_LINEAR_EXTS); size = nextents * sizeof(xfs_bmbt_rec_t); @@ -1620,7 +1627,7 @@ xfs_iext_bno_to_ext( xfs_extnum_t nextents; /* number of file extents */ xfs_fileoff_t startoff = 0; /* start offset of extent */ - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (nextents == 0) { *idxp = 0; return NULL; @@ -1733,8 +1740,8 @@ xfs_iext_idx_to_irec( ASSERT(ifp->if_flags & XFS_IFEXTIREC); ASSERT(page_idx >= 0); - ASSERT(page_idx <= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)); - ASSERT(page_idx < ifp->if_bytes / sizeof(xfs_bmbt_rec_t) || realloc); + ASSERT(page_idx <= xfs_iext_count(ifp)); + ASSERT(page_idx < xfs_iext_count(ifp) || realloc); nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; erp_idx = 0; @@ -1782,7 +1789,7 @@ xfs_iext_irec_init( xfs_extnum_t nextents; /* number of extents in file */ ASSERT(!(ifp->if_flags & XFS_IFEXTIREC)); - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); ASSERT(nextents <= XFS_LINEAR_EXTS); erp = kmem_alloc(sizeof(xfs_ext_irec_t), KM_NOFS); @@ -1906,7 +1913,7 @@ xfs_iext_irec_compact( ASSERT(ifp->if_flags & XFS_IFEXTIREC); nlists = ifp->if_real_bytes / XFS_IEXT_BUFSZ; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); if (nextents == 0) { xfs_iext_destroy(ifp); @@ -1996,3 +2003,49 @@ xfs_ifork_init_cow( ip->i_cformat = XFS_DINODE_FMT_EXTENTS; ip->i_cnextents = 0; } + +/* + * Lookup the extent covering bno. + * + * If there is an extent covering bno return the extent index, and store the + * expanded extent structure in *gotp, and the extent index in *idx. + * If there is no extent covering bno, but there is an extent after it (e.g. + * it lies in a hole) return that extent in *gotp and its index in *idx + * instead. + * If bno is beyond the last extent return false, and return the index after + * the last valid index in *idxp. + */ +bool +xfs_iext_lookup_extent( + struct xfs_inode *ip, + struct xfs_ifork *ifp, + xfs_fileoff_t bno, + xfs_extnum_t *idxp, + struct xfs_bmbt_irec *gotp) +{ + struct xfs_bmbt_rec_host *ep; + + XFS_STATS_INC(ip->i_mount, xs_look_exlist); + + ep = xfs_iext_bno_to_ext(ifp, bno, idxp); + if (!ep) + return false; + xfs_bmbt_get_all(ep, gotp); + return true; +} + +/* + * Return true if there is an extent at index idx, and return the expanded + * extent structure at idx in that case. Else return false. 
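xfs_iext_count() replaces the open-coded if_bytes / sizeof(xfs_bmbt_rec_t) division that was repeated at every call site, and xfs_iext_lookup_extent() above turns the common "find the extent covering this offset, or the hole it sits in" dance into one call. A kernel-context sketch of the hole test (fragment, not a standalone program):

        struct xfs_bmbt_irec    got;
        xfs_extnum_t            idx;

        /* A miss, or a hit whose extent starts past bno, means bno
         * lies in a hole; 'got' then holds the next extent, if any. */
        if (!xfs_iext_lookup_extent(ip, ifp, bno, &idx, &got) ||
            got.br_startoff > bno) {
                /* bno is unmapped */
        }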
+ */ +bool +xfs_iext_get_extent( + struct xfs_ifork *ifp, + xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp) +{ + if (idx < 0 || idx >= xfs_iext_count(ifp)) + return false; + xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), gotp); + return true; +} diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h index c9476f50e32d..7fb8365326d1 100644 --- a/fs/xfs/libxfs/xfs_inode_fork.h +++ b/fs/xfs/libxfs/xfs_inode_fork.h @@ -152,6 +152,7 @@ void xfs_init_local_fork(struct xfs_inode *, int, const void *, int); struct xfs_bmbt_rec_host * xfs_iext_get_ext(struct xfs_ifork *, xfs_extnum_t); +xfs_extnum_t xfs_iext_count(struct xfs_ifork *); void xfs_iext_insert(struct xfs_inode *, xfs_extnum_t, xfs_extnum_t, struct xfs_bmbt_irec *, int); void xfs_iext_add(struct xfs_ifork *, xfs_extnum_t, int); @@ -181,6 +182,12 @@ void xfs_iext_irec_compact_pages(struct xfs_ifork *); void xfs_iext_irec_compact_full(struct xfs_ifork *); void xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int); +bool xfs_iext_lookup_extent(struct xfs_inode *ip, + struct xfs_ifork *ifp, xfs_fileoff_t bno, + xfs_extnum_t *idxp, struct xfs_bmbt_irec *gotp); +bool xfs_iext_get_extent(struct xfs_ifork *ifp, xfs_extnum_t idx, + struct xfs_bmbt_irec *gotp); + extern struct kmem_zone *xfs_ifork_zone; extern void xfs_ifork_init_cow(struct xfs_inode *ip); diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index 083cdd6d6c28..7ae571f8e34a 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -481,8 +481,8 @@ static inline uint xfs_log_dinode_size(int version) typedef struct xfs_buf_log_format { unsigned short blf_type; /* buf log item type indicator */ unsigned short blf_size; /* size of this item */ - ushort blf_flags; /* misc state */ - ushort blf_len; /* number of blocks in this buf */ + unsigned short blf_flags; /* misc state */ + unsigned short blf_len; /* number of blocks in this buf */ __int64_t blf_blkno; /* starting blkno of this buf */ unsigned int blf_map_size; /* used size of data bitmap in words */ unsigned int blf_data_map[XFS_BLF_DATAMAP_SIZE]; /* dirty bitmap */ diff --git a/fs/xfs/libxfs/xfs_log_recover.h b/fs/xfs/libxfs/xfs_log_recover.h index 8e385f91d660..d9f65e2d5cc8 100644 --- a/fs/xfs/libxfs/xfs_log_recover.h +++ b/fs/xfs/libxfs/xfs_log_recover.h @@ -52,7 +52,7 @@ typedef struct xlog_recover { struct list_head r_itemq; /* q for items */ } xlog_recover_t; -#define ITEM_TYPE(i) (*(ushort *)(i)->ri_buf[0].i_addr) +#define ITEM_TYPE(i) (*(unsigned short *)(i)->ri_buf[0].i_addr) /* * This is the number of entries in the l_buf_cancel_table used during diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c index 453bb2757ec2..50add5272807 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.c +++ b/fs/xfs/libxfs/xfs_refcount_btree.c @@ -354,6 +354,7 @@ xfs_refcountbt_init_cursor( cur->bc_btnum = XFS_BTNUM_REFC; cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_refcountbt_ops; + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_refcbt_2); cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level); @@ -408,13 +409,14 @@ xfs_refcountbt_calc_size( */ xfs_extlen_t xfs_refcountbt_max_size( - struct xfs_mount *mp) + struct xfs_mount *mp, + xfs_agblock_t agblocks) { /* Bail out if we're uninitialized, which can happen in mkfs. 
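Passing the AG length into xfs_refcountbt_max_size() matters because the last AG of a filesystem is usually shorter than sb_agblocks, so sizing every AG's reservation from the superblock value over-reserved there. The reworked flow, distilled from the hunk that follows (kernel-context fragment), also defers bumping *ask until the AGF read has actually succeeded:

        agf = XFS_BUF_TO_AGF(agbp);
        agblocks = be32_to_cpu(agf->agf_length);   /* this AG's real size */
        tree_len = be32_to_cpu(agf->agf_refcount_blocks);
        xfs_buf_relse(agbp);

        *ask += xfs_refcountbt_max_size(mp, agblocks);
        *used += tree_len;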
*/ if (mp->m_refc_mxr[0] == 0) return 0; - return xfs_refcountbt_calc_size(mp, mp->m_sb.sb_agblocks); + return xfs_refcountbt_calc_size(mp, agblocks); } /* @@ -429,22 +431,24 @@ xfs_refcountbt_calc_reserves( { struct xfs_buf *agbp; struct xfs_agf *agf; + xfs_agblock_t agblocks; xfs_extlen_t tree_len; int error; if (!xfs_sb_version_hasreflink(&mp->m_sb)) return 0; - *ask += xfs_refcountbt_max_size(mp); error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); if (error) return error; agf = XFS_BUF_TO_AGF(agbp); + agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_refcount_blocks); xfs_buf_relse(agbp); + *ask += xfs_refcountbt_max_size(mp, agblocks); *used += tree_len; return error; diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h index 3be7768bd51a..9db008b955b7 100644 --- a/fs/xfs/libxfs/xfs_refcount_btree.h +++ b/fs/xfs/libxfs/xfs_refcount_btree.h @@ -66,7 +66,8 @@ extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp, unsigned long long len); -extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp); +extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp, + xfs_agblock_t agblocks); extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c index 83e672ff7577..74e5a54bc428 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.c +++ b/fs/xfs/libxfs/xfs_rmap_btree.c @@ -484,6 +484,7 @@ xfs_rmapbt_init_cursor( cur->bc_blocklog = mp->m_sb.sb_blocklog; cur->bc_ops = &xfs_rmapbt_ops; cur->bc_nlevels = be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]); + cur->bc_statoff = XFS_STATS_CALC_INDEX(xs_rmap_2); cur->bc_private.a.agbp = agbp; cur->bc_private.a.agno = agno; @@ -549,13 +550,14 @@ xfs_rmapbt_calc_size( */ xfs_extlen_t xfs_rmapbt_max_size( - struct xfs_mount *mp) + struct xfs_mount *mp, + xfs_agblock_t agblocks) { /* Bail out if we're uninitialized, which can happen in mkfs. */ if (mp->m_rmap_mxr[0] == 0) return 0; - return xfs_rmapbt_calc_size(mp, mp->m_sb.sb_agblocks); + return xfs_rmapbt_calc_size(mp, agblocks); } /* @@ -570,25 +572,24 @@ xfs_rmapbt_calc_reserves( { struct xfs_buf *agbp; struct xfs_agf *agf; - xfs_extlen_t pool_len; + xfs_agblock_t agblocks; xfs_extlen_t tree_len; int error; if (!xfs_sb_version_hasrmapbt(&mp->m_sb)) return 0; - /* Reserve 1% of the AG or enough for 1 block per record. */ - pool_len = max(mp->m_sb.sb_agblocks / 100, xfs_rmapbt_max_size(mp)); - *ask += pool_len; - error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp); if (error) return error; agf = XFS_BUF_TO_AGF(agbp); + agblocks = be32_to_cpu(agf->agf_length); tree_len = be32_to_cpu(agf->agf_rmap_blocks); xfs_buf_relse(agbp); + /* Reserve 1% of the AG or enough for 1 block per record. 
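The rmapbt reservation below keeps the same heuristic but, like the refcount change above, now feeds it the per-AG length read from the AGF. A self-contained sketch of the ask computation with illustrative numbers:

        #include <stdint.h>

        static uint32_t max_u32(uint32_t a, uint32_t b)
        {
                return a > b ? a : b;
        }

        /* 1% of this AG, or the worst-case tree size if that is larger.
         * e.g. agblocks = 250000, max_tree = 1800  ->  ask = 2500. */
        static uint32_t rmapbt_ask(uint32_t agblocks, uint32_t max_tree)
        {
                return max_u32(agblocks / 100, max_tree);
        }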
*/ + *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks)); *used += tree_len; return error; diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h index 2a9ac472fb15..19c08e933049 100644 --- a/fs/xfs/libxfs/xfs_rmap_btree.h +++ b/fs/xfs/libxfs/xfs_rmap_btree.h @@ -60,7 +60,8 @@ extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp); extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp, unsigned long long len); -extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp); +extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp, + xfs_agblock_t agblocks); extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp, xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used); diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index e2e1106c9fad..ea45584a9913 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1016,4 +1016,3 @@ xfs_rtfree_extent( } return 0; } - diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index a70aec910626..2580262e4ea0 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -262,6 +262,12 @@ xfs_mount_validate_sb( return -EFSCORRUPTED; } + if (xfs_sb_version_hascrc(&mp->m_sb) && + sbp->sb_blocksize < XFS_MIN_CRC_BLOCKSIZE) { + xfs_notice(mp, "v5 SB sanity check failed"); + return -EFSCORRUPTED; + } + /* * Until this is fixed only page-sized or smaller data blocks work. */ @@ -338,13 +344,16 @@ xfs_sb_quota_from_disk(struct xfs_sb *sbp) XFS_PQUOTA_CHKD : XFS_GQUOTA_CHKD; sbp->sb_qflags &= ~(XFS_OQUOTA_ENFD | XFS_OQUOTA_CHKD); - if (sbp->sb_qflags & XFS_PQUOTA_ACCT) { + if (sbp->sb_qflags & XFS_PQUOTA_ACCT && + sbp->sb_gquotino != NULLFSINO) { /* * In older version of superblock, on-disk superblock only * has sb_gquotino, and in-core superblock has both sb_gquotino * and sb_pquotino. But, only one of them is supported at any * point of time. So, if PQUOTA is set in disk superblock, - * copy over sb_gquotino to sb_pquotino. + * copy over sb_gquotino to sb_pquotino. The NULLFSINO test + * above is to make sure we don't do this twice and wipe them + * both out! */ sbp->sb_pquotino = sbp->sb_gquotino; sbp->sb_gquotino = NULLFSINO; diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index 8d74870468c2..717909f2f7b7 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -57,7 +57,6 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ #define NULLAGBLOCK ((xfs_agblock_t)-1) #define NULLAGNUMBER ((xfs_agnumber_t)-1) -#define NULLEXTNUM ((xfs_extnum_t)-1) #define NULLCOMMITLSN ((xfs_lsn_t)-1) @@ -75,11 +74,14 @@ typedef __int64_t xfs_sfiloff_t; /* signed block number in a file */ * Minimum and maximum blocksize and sectorsize. * The blocksize upper limit is pretty much arbitrary. * The sectorsize upper limit is due to sizeof(sb_sectsize). + * CRC enable filesystems use 512 byte inodes, meaning 512 byte block sizes + * cannot be used. */ #define XFS_MIN_BLOCKSIZE_LOG 9 /* i.e. 512 bytes */ #define XFS_MAX_BLOCKSIZE_LOG 16 /* i.e. 65536 bytes */ #define XFS_MIN_BLOCKSIZE (1 << XFS_MIN_BLOCKSIZE_LOG) #define XFS_MAX_BLOCKSIZE (1 << XFS_MAX_BLOCKSIZE_LOG) +#define XFS_MIN_CRC_BLOCKSIZE (1 << (XFS_MIN_BLOCKSIZE_LOG + 1)) #define XFS_MIN_SECTORSIZE_LOG 9 /* i.e. 512 bytes */ #define XFS_MAX_SECTORSIZE_LOG 15 /* i.e. 
32768 bytes */ #define XFS_MIN_SECTORSIZE (1 << XFS_MIN_SECTORSIZE_LOG) diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3e57a56cf829..631e7c0e0a29 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -37,11 +37,6 @@ #include <linux/pagevec.h> #include <linux/writeback.h> -/* flags for direct write completions */ -#define XFS_DIO_FLAG_UNWRITTEN (1 << 0) -#define XFS_DIO_FLAG_APPEND (1 << 1) -#define XFS_DIO_FLAG_COW (1 << 2) - /* * structure owned by writepages passed to individual writepage calls */ @@ -495,8 +490,8 @@ xfs_submit_ioend( ioend->io_bio->bi_private = ioend; ioend->io_bio->bi_end_io = xfs_end_bio; - bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, - (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0); + ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); + /* * If we are failing the IO now, just mark the ioend with an * error and finish it. This will run IO completion immediately @@ -567,8 +562,7 @@ xfs_chain_bio( bio_chain(ioend->io_bio, new); bio_get(ioend->io_bio); /* for xfs_destroy_ioend */ - bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE, - (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0); + ioend->io_bio->bi_opf = REQ_OP_WRITE | wbc_to_write_flags(wbc); submit_bio(ioend->io_bio); ioend->io_bio = new; } @@ -777,7 +771,7 @@ xfs_map_cow( { struct xfs_inode *ip = XFS_I(inode); struct xfs_bmbt_irec imap; - bool is_cow = false, need_alloc = false; + bool is_cow = false; int error; /* @@ -795,7 +789,7 @@ xfs_map_cow( * Else we need to check if there is a COW mapping at this offset. */ xfs_ilock(ip, XFS_ILOCK_SHARED); - is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc); + is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap); xfs_iunlock(ip, XFS_ILOCK_SHARED); if (!is_cow) @@ -805,7 +799,7 @@ xfs_map_cow( * And if the COW mapping has a delayed extent here we need to * allocate real space for it now. */ - if (need_alloc) { + if (isnullstartblock(imap.br_startblock)) { error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset, &imap); if (error) @@ -1158,63 +1152,27 @@ xfs_vm_releasepage( * block_invalidatepage() can send pages that are still marked dirty * but otherwise have invalidated buffers. * - * We've historically freed buffers on the latter. Instead, quietly - * filter out all dirty pages to avoid spurious buffer state warnings. - * This can likely be removed once shrink_active_list() is fixed. + * We want to release the latter to avoid unnecessary buildup of the + * LRU, skip the former and warn if we've left any lingering + * delalloc/unwritten buffers on clean pages. Skip pages with delalloc + * or unwritten buffers and warn if the page is not dirty. Otherwise + * try to release the buffers. */ - if (PageDirty(page)) - return 0; - xfs_count_page_state(page, &delalloc, &unwritten); - if (WARN_ON_ONCE(delalloc)) + if (delalloc) { + WARN_ON_ONCE(!PageDirty(page)); return 0; - if (WARN_ON_ONCE(unwritten)) + } + if (unwritten) { + WARN_ON_ONCE(!PageDirty(page)); return 0; + } return try_to_free_buffers(page); } /* - * When we map a DIO buffer, we may need to pass flags to - * xfs_end_io_direct_write to tell it what kind of write IO we are doing. - * - * Note that for DIO, an IO to the highest supported file block offset (i.e. - * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64 - * bit variable. Hence if we see this overflow, we have to assume that the IO is - * extending the file size. 
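Dropping the need_alloc out-parameter from xfs_reflink_find_cow_mapping() in the xfs_map_cow() hunk above works because the mapping already encodes the answer: a delayed-allocation extent carries a null start block, so writeback can test isnullstartblock() on the returned irec and only then allocate real space. Kernel-context fragment mirroring that logic:

        /* Only a still-delalloc COW mapping needs real blocks before
         * the pages can be written back. */
        if (isnullstartblock(imap.br_startblock)) {
                error = xfs_iomap_write_allocate(ip, XFS_COW_FORK,
                                                 offset, &imap);
                if (error)
                        return error;
        }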
We won't know for sure until IO completion is run - * and the actual max write offset is communicated to the IO completion - * routine. - */ -static void -xfs_map_direct( - struct inode *inode, - struct buffer_head *bh_result, - struct xfs_bmbt_irec *imap, - xfs_off_t offset, - bool is_cow) -{ - uintptr_t *flags = (uintptr_t *)&bh_result->b_private; - xfs_off_t size = bh_result->b_size; - - trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size, - ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW : - XFS_IO_OVERWRITE, imap); - - if (ISUNWRITTEN(imap)) { - *flags |= XFS_DIO_FLAG_UNWRITTEN; - set_buffer_defer_completion(bh_result); - } else if (is_cow) { - *flags |= XFS_DIO_FLAG_COW; - set_buffer_defer_completion(bh_result); - } - if (offset + size > i_size_read(inode) || offset + size < 0) { - *flags |= XFS_DIO_FLAG_APPEND; - set_buffer_defer_completion(bh_result); - } -} - -/* * If this is O_DIRECT or the mpage code calling tell them how large the mapping * is, so that we can avoid repeated get_blocks calls. * @@ -1254,52 +1212,12 @@ xfs_map_trim_size( bh_result->b_size = mapping_size; } -/* Bounce unaligned directio writes to the page cache. */ static int -xfs_bounce_unaligned_dio_write( - struct xfs_inode *ip, - xfs_fileoff_t offset_fsb, - struct xfs_bmbt_irec *imap) -{ - struct xfs_bmbt_irec irec; - xfs_fileoff_t delta; - bool shared; - bool x; - int error; - - irec = *imap; - if (offset_fsb > irec.br_startoff) { - delta = offset_fsb - irec.br_startoff; - irec.br_blockcount -= delta; - irec.br_startblock += delta; - irec.br_startoff = offset_fsb; - } - error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x); - if (error) - return error; - - /* - * We're here because we're trying to do a directio write to a - * region that isn't aligned to a filesystem block. If any part - * of the extent is shared, fall back to buffered mode to handle - * the RMW. This is done by returning -EREMCHG ("remote addr - * changed"), which is caught further up the call stack. - */ - if (shared) { - trace_xfs_reflink_bounce_dio_write(ip, imap); - return -EREMCHG; - } - return 0; -} - -STATIC int -__xfs_get_blocks( +xfs_get_blocks( struct inode *inode, sector_t iblock, struct buffer_head *bh_result, - int create, - bool direct, - bool dax_fault) + int create) { struct xfs_inode *ip = XFS_I(inode); struct xfs_mount *mp = ip->i_mount; @@ -1310,11 +1228,8 @@ __xfs_get_blocks( int nimaps = 1; xfs_off_t offset; ssize_t size; - int new = 0; - bool is_cow = false; - bool need_alloc = false; - BUG_ON(create && !direct); + BUG_ON(create); if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; @@ -1323,7 +1238,7 @@ __xfs_get_blocks( ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); size = bh_result->b_size; - if (!create && offset >= i_size_read(inode)) + if (offset >= i_size_read(inode)) return 0; /* @@ -1338,52 +1253,12 @@ __xfs_get_blocks( end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size); offset_fsb = XFS_B_TO_FSBT(mp, offset); - if (create && direct && xfs_is_reflink_inode(ip)) - is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, - &need_alloc); - if (!is_cow) { - error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, - &imap, &nimaps, XFS_BMAPI_ENTIRE); - /* - * Truncate an overwrite extent if there's a pending CoW - * reservation before the end of this extent. This - * forces us to come back to get_blocks to take care of - * the CoW. 
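The removed comment above records a subtlety worth keeping in mind even after this code goes away: a direct IO ending at the maximum file offset (2^63 - 1 bytes) wraps a signed 64-bit offset + count, which is why the old append test also accepted a negative sum. A sketch of that test (the kernel builds with -fno-strict-overflow, so the wrap is well-defined there):

        #include <stdbool.h>
        #include <stdint.h>

        /* A wrapped (negative) end offset can only come from an IO that
         * reaches past any valid i_size, so treat it as size-extending.
         * Relies on wrapping semantics for signed arithmetic. */
        static bool dio_extends_eof(int64_t offset, int64_t size, int64_t isize)
        {
                return offset + size > isize || offset + size < 0;
        }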
- */ - if (create && direct && nimaps && - imap.br_startblock != HOLESTARTBLOCK && - imap.br_startblock != DELAYSTARTBLOCK && - !ISUNWRITTEN(&imap)) - xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, - &imap); - } - ASSERT(!need_alloc); + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, + &imap, &nimaps, XFS_BMAPI_ENTIRE); if (error) goto out_unlock; - /* for DAX, we convert unwritten extents directly */ - if (create && - (!nimaps || - (imap.br_startblock == HOLESTARTBLOCK || - imap.br_startblock == DELAYSTARTBLOCK) || - (IS_DAX(inode) && ISUNWRITTEN(&imap)))) { - /* - * xfs_iomap_write_direct() expects the shared lock. It - * is unlocked on return. - */ - if (lockmode == XFS_ILOCK_EXCL) - xfs_ilock_demote(ip, lockmode); - - error = xfs_iomap_write_direct(ip, offset, size, - &imap, nimaps); - if (error) - return error; - new = 1; - - trace_xfs_get_blocks_alloc(ip, offset, size, - ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN - : XFS_IO_DELALLOC, &imap); - } else if (nimaps) { + if (nimaps) { trace_xfs_get_blocks_found(ip, offset, size, ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, &imap); @@ -1393,12 +1268,6 @@ __xfs_get_blocks( goto out_unlock; } - if (IS_DAX(inode) && create) { - ASSERT(!ISUNWRITTEN(&imap)); - /* zeroing is not needed at a higher layer */ - new = 0; - } - /* trim mapping down to size requested */ xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size); @@ -1408,50 +1277,14 @@ __xfs_get_blocks( */ if (imap.br_startblock != HOLESTARTBLOCK && imap.br_startblock != DELAYSTARTBLOCK && - (create || !ISUNWRITTEN(&imap))) { - if (create && direct && !is_cow) { - error = xfs_bounce_unaligned_dio_write(ip, offset_fsb, - &imap); - if (error) - return error; - } - + !ISUNWRITTEN(&imap)) xfs_map_buffer(inode, bh_result, &imap, offset); - if (ISUNWRITTEN(&imap)) - set_buffer_unwritten(bh_result); - /* direct IO needs special help */ - if (create) { - if (dax_fault) - ASSERT(!ISUNWRITTEN(&imap)); - else - xfs_map_direct(inode, bh_result, &imap, offset, - is_cow); - } - } /* * If this is a realtime file, data may be on a different device. * to that pointed to from the buffer_head b_bdev currently. */ bh_result->b_bdev = xfs_find_bdev_for_inode(inode); - - /* - * If we previously allocated a block out beyond eof and we are now - * coming back to use it then we will need to flag it as new even if it - * has a disk address. - * - * With sub-block writes into unwritten extents we also need to mark - * the buffer as new so that the unwritten parts of the buffer gets - * correctly zeroed. - */ - if (create && - ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || - (offset >= i_size_read(inode)) || - (new || ISUNWRITTEN(&imap)))) - set_buffer_new(bh_result); - - BUG_ON(direct && imap.br_startblock == DELAYSTARTBLOCK); - return 0; out_unlock: @@ -1459,110 +1292,6 @@ out_unlock: return error; } -int -xfs_get_blocks( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, false, false); -} - -int -xfs_get_blocks_direct( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, true, false); -} - -int -xfs_get_blocks_dax_fault( - struct inode *inode, - sector_t iblock, - struct buffer_head *bh_result, - int create) -{ - return __xfs_get_blocks(inode, iblock, bh_result, create, true, true); -} - -/* - * Complete a direct I/O write request. 
- * - * xfs_map_direct passes us some flags in the private data to tell us what to - * do. If no flags are set, then the write IO is an overwrite wholly within - * the existing allocated file size and so there is nothing for us to do. - * - * Note that in this case the completion can be called in interrupt context, - * whereas if we have flags set we will always be called in task context - * (i.e. from a workqueue). - */ -int -xfs_end_io_direct_write( - struct kiocb *iocb, - loff_t offset, - ssize_t size, - void *private) -{ - struct inode *inode = file_inode(iocb->ki_filp); - struct xfs_inode *ip = XFS_I(inode); - uintptr_t flags = (uintptr_t)private; - int error = 0; - - trace_xfs_end_io_direct_write(ip, offset, size); - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) - return -EIO; - - if (size <= 0) - return size; - - /* - * The flags tell us whether we are doing unwritten extent conversions - * or an append transaction that updates the on-disk file size. These - * cases are the only cases where we should *potentially* be needing - * to update the VFS inode size. - */ - if (flags == 0) { - ASSERT(offset + size <= i_size_read(inode)); - return 0; - } - - /* - * We need to update the in-core inode size here so that we don't end up - * with the on-disk inode size being outside the in-core inode size. We - * have no other method of updating EOF for AIO, so always do it here - * if necessary. - * - * We need to lock the test/set EOF update as we can be racing with - * other IO completions here to update the EOF. Failing to serialise - * here can result in EOF moving backwards and Bad Things Happen when - * that occurs. - */ - spin_lock(&ip->i_flags_lock); - if (offset + size > i_size_read(inode)) - i_size_write(inode, offset + size); - spin_unlock(&ip->i_flags_lock); - - if (flags & XFS_DIO_FLAG_COW) - error = xfs_reflink_end_cow(ip, offset, size); - if (flags & XFS_DIO_FLAG_UNWRITTEN) { - trace_xfs_end_io_direct_write_unwritten(ip, offset, size); - - error = xfs_iomap_write_unwritten(ip, offset, size); - } - if (flags & XFS_DIO_FLAG_APPEND) { - trace_xfs_end_io_direct_write_append(ip, offset, size); - - error = xfs_setfilesize(ip, offset, size); - } - - return error; -} - STATIC ssize_t xfs_vm_direct_IO( struct kiocb *iocb, @@ -1583,7 +1312,6 @@ xfs_vm_bmap( struct xfs_inode *ip = XFS_I(inode); trace_xfs_vm_bmap(XFS_I(inode)); - xfs_ilock(ip, XFS_IOLOCK_SHARED); /* * The swap code (ab-)uses ->bmap to get a block mapping and then @@ -1591,12 +1319,10 @@ xfs_vm_bmap( * that on reflinks inodes, so we have to skip out here. And yes, * 0 is the magic code for a bmap error.. 
*/ - if (xfs_is_reflink_inode(ip)) { - xfs_iunlock(ip, XFS_IOLOCK_SHARED); + if (xfs_is_reflink_inode(ip)) return 0; - } + filemap_write_and_wait(mapping); - xfs_iunlock(ip, XFS_IOLOCK_SHARED); return generic_block_bmap(mapping, block, xfs_get_blocks); } diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index b3c6634f9518..cc174ec6c2fd 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -55,15 +55,6 @@ struct xfs_ioend { extern const struct address_space_operations xfs_address_space_operations; -int xfs_get_blocks(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); -int xfs_get_blocks_direct(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); -int xfs_get_blocks_dax_fault(struct inode *inode, sector_t offset, - struct buffer_head *map_bh, int create); - -int xfs_end_io_direct_write(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private); int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); extern void xfs_count_page_state(struct page *, int *, int *); diff --git a/fs/xfs/xfs_attr.h b/fs/xfs/xfs_attr.h index e3da5d448bcf..d14691aa02b4 100644 --- a/fs/xfs/xfs_attr.h +++ b/fs/xfs/xfs_attr.h @@ -112,8 +112,8 @@ typedef struct attrlist_cursor_kern { *========================================================================*/ -/* Return 0 on success, or -errno; other state communicated via *context */ -typedef int (*put_listent_func_t)(struct xfs_attr_list_context *, int, +/* void; state communicated via *context */ +typedef void (*put_listent_func_t)(struct xfs_attr_list_context *, int, unsigned char *, int, int); typedef struct xfs_attr_list_context { diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c index 25e76cd6c053..97c45b6eb91e 100644 --- a/fs/xfs/xfs_attr_list.c +++ b/fs/xfs/xfs_attr_list.c @@ -74,7 +74,6 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) xfs_attr_sf_entry_t *sfe; xfs_inode_t *dp; int sbsize, nsbuf, count, i; - int error; ASSERT(context != NULL); dp = context->dp; @@ -102,13 +101,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) (XFS_ISRESET_CURSOR(cursor) && (dp->i_afp->if_bytes + sf->hdr.count * 16) < context->bufsize)) { for (i = 0, sfe = &sf->list[0]; i < sf->hdr.count; i++) { - error = context->put_listent(context, - sfe->flags, - sfe->nameval, - (int)sfe->namelen, - (int)sfe->valuelen); - if (error) - return error; + context->put_listent(context, + sfe->flags, + sfe->nameval, + (int)sfe->namelen, + (int)sfe->valuelen); /* * Either search callback finished early or * didn't fit it all in the buffer after all. 
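Every error return from the put_listent callbacks is gone; the single remaining signal is context->seen_enough, which the shortform, leaf and node walkers below test after each invocation. A self-contained sketch of the contract (field names abbreviated from xfs_attr_list_context):

        /* Callback contract after this series: never fail, just mark
         * the context when the output buffer is exhausted. */
        struct list_ctx {
                int seen_enough;        /* set by callback, read by walker */
                int firstu;             /* unused bytes left in buffer */
        };

        static void put_listent(struct list_ctx *ctx, int entry_size)
        {
                if (entry_size > ctx->firstu) {
                        ctx->seen_enough = 1;
                        return;
                }
                ctx->firstu -= entry_size;      /* entry copied out here */
        }

        /* walker loop:
         *      put_listent(ctx, size);
         *      if (ctx->seen_enough)
         *              break;
         */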
@@ -193,15 +190,11 @@ xfs_attr_shortform_list(xfs_attr_list_context_t *context) cursor->hashval = sbp->hash; cursor->offset = 0; } - error = context->put_listent(context, - sbp->flags, - sbp->name, - sbp->namelen, - sbp->valuelen); - if (error) { - kmem_free(sbuf); - return error; - } + context->put_listent(context, + sbp->flags, + sbp->name, + sbp->namelen, + sbp->valuelen); if (context->seen_enough) break; cursor->offset++; @@ -335,11 +328,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) */ for (;;) { leaf = bp->b_addr; - error = xfs_attr3_leaf_list_int(bp, context); - if (error) { - xfs_trans_brelse(NULL, bp); - return error; - } + xfs_attr3_leaf_list_int(bp, context); xfs_attr3_leaf_hdr_from_disk(mp->m_attr_geo, &leafhdr, leaf); if (context->seen_enough || leafhdr.forw == 0) break; @@ -356,7 +345,7 @@ xfs_attr_node_list(xfs_attr_list_context_t *context) /* * Copy out attribute list entries for attr_list(), for leaf attribute lists. */ -int +void xfs_attr3_leaf_list_int( struct xfs_buf *bp, struct xfs_attr_list_context *context) @@ -366,7 +355,6 @@ xfs_attr3_leaf_list_int( struct xfs_attr3_icleaf_hdr ichdr; struct xfs_attr_leaf_entry *entries; struct xfs_attr_leaf_entry *entry; - int retval; int i; struct xfs_mount *mp = context->dp->i_mount; @@ -399,7 +387,7 @@ xfs_attr3_leaf_list_int( } if (i == ichdr.count) { trace_xfs_attr_list_notfound(context); - return 0; + return; } } else { entry = &entries[0]; @@ -410,7 +398,6 @@ xfs_attr3_leaf_list_int( /* * We have found our place, start copying out the new attributes. */ - retval = 0; for (; i < ichdr.count; entry++, i++) { char *name; int namelen, valuelen; @@ -439,16 +426,14 @@ xfs_attr3_leaf_list_int( valuelen = be32_to_cpu(name_rmt->valuelen); } - retval = context->put_listent(context, entry->flags, + context->put_listent(context, entry->flags, name, namelen, valuelen); - if (retval) - break; if (context->seen_enough) break; cursor->offset++; } trace_xfs_attr_list_leaf_end(context); - return retval; + return; } /* @@ -467,9 +452,9 @@ xfs_attr_leaf_list(xfs_attr_list_context_t *context) if (error) return error; - error = xfs_attr3_leaf_list_int(bp, context); + xfs_attr3_leaf_list_int(bp, context); xfs_trans_brelse(NULL, bp); - return error; + return 0; } int @@ -513,7 +498,7 @@ xfs_attr_list_int( * Take care to check values and protect against them changing later, * we may be reading them directly out of a user buffer. 
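The warning below is the reason xfs_attr_put_listent() recomputes its limits on every call: attribute data may sit in a user-supplied buffer, so nothing derived from it earlier can be trusted. A sketch of the recheck-before-write pattern (struct and field names simplified, not the kernel's exact layout):

        #include <stddef.h>

        struct ctx {
                int count;              /* entries emitted so far */
                int firstu;             /* first unused byte from the top */
                int seen_enough;
        };

        /* Recompute the offset-array top from current state and refuse
         * to emit an entry that would cross it; flag "more" instead of
         * failing. */
        static int have_room(struct ctx *c, size_t hdr_size, size_t off_size)
        {
                size_t arraytop = hdr_size + (c->count + 1) * off_size;

                if ((size_t)c->firstu < arraytop) {
                        c->seen_enough = 1;
                        return 0;
                }
                return 1;
        }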
*/ -STATIC int +STATIC void xfs_attr_put_listent( xfs_attr_list_context_t *context, int flags, @@ -536,10 +521,10 @@ xfs_attr_put_listent( */ if (((context->flags & ATTR_SECURE) == 0) != ((flags & XFS_ATTR_SECURE) == 0)) - return 0; + return; if (((context->flags & ATTR_ROOT) == 0) != ((flags & XFS_ATTR_ROOT) == 0)) - return 0; + return; arraytop = sizeof(*alist) + context->count * sizeof(alist->al_offset[0]); @@ -548,7 +533,7 @@ xfs_attr_put_listent( trace_xfs_attr_list_full(context); alist->al_more = 1; context->seen_enough = 1; - return 0; + return; } aep = (attrlist_ent_t *)&context->alist[context->firstu]; @@ -558,7 +543,7 @@ xfs_attr_put_listent( alist->al_offset[context->count++] = context->firstu; alist->al_count = context->count; trace_xfs_attr_list_add(context); - return 0; + return; } /* diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 552465e011ec..b9abce524c33 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -359,9 +359,7 @@ xfs_bmap_count_blocks( mp = ip->i_mount; ifp = XFS_IFORK_PTR(ip, whichfork); if ( XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS ) { - xfs_bmap_count_leaves(ifp, 0, - ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t), - count); + xfs_bmap_count_leaves(ifp, 0, xfs_iext_count(ifp), count); return 0; } @@ -426,7 +424,7 @@ xfs_getbmapx_fix_eof_hole( ifp = XFS_IFORK_PTR(ip, whichfork); if (!moretocome && xfs_iext_bno_to_ext(ifp, fileblock, &lastx) && - (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1)) + (lastx == xfs_iext_count(ifp) - 1)) out->bmv_oflags |= BMV_OF_LAST; } @@ -1792,6 +1790,7 @@ xfs_swap_extent_forks( struct xfs_ifork tempifp, *ifp, *tifp; int aforkblks = 0; int taforkblks = 0; + xfs_extnum_t nextents; __uint64_t tmp; int error; @@ -1877,14 +1876,13 @@ xfs_swap_extent_forks( switch (ip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* If the extents fit in the inode, fix the - * pointer. Otherwise it's already NULL or - * pointing to the extent. + /* + * If the extents fit in the inode, fix the pointer. Otherwise + * it's already NULL or pointing to the extent. */ - if (ip->i_d.di_nextents <= XFS_INLINE_EXTS) { - ifp->if_u1.if_extents = - ifp->if_u2.if_inline_ext; - } + nextents = xfs_iext_count(&ip->i_df); + if (nextents <= XFS_INLINE_EXTS) + ifp->if_u1.if_extents = ifp->if_u2.if_inline_ext; (*src_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: @@ -1896,14 +1894,13 @@ xfs_swap_extent_forks( switch (tip->i_d.di_format) { case XFS_DINODE_FMT_EXTENTS: - /* If the extents fit in the inode, fix the - * pointer. Otherwise it's already NULL or - * pointing to the extent. + /* + * If the extents fit in the inode, fix the pointer. Otherwise + * it's already NULL or pointing to the extent. */ - if (tip->i_d.di_nextents <= XFS_INLINE_EXTS) { - tifp->if_u1.if_extents = - tifp->if_u2.if_inline_ext; - } + nextents = xfs_iext_count(&tip->i_df); + if (nextents <= XFS_INLINE_EXTS) + tifp->if_u1.if_extents = tifp->if_u2.if_inline_ext; (*target_log_flags) |= XFS_ILOG_DEXT; break; case XFS_DINODE_FMT_BTREE: @@ -1938,8 +1935,8 @@ xfs_swap_extents( * page cache safely. Once we have done this we can take the ilocks and * do the rest of the checks. 
*/ - lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL); + lock_two_nondirectories(VFS_I(ip), VFS_I(tip)); + lock_flags = XFS_MMAPLOCK_EXCL; xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL); /* Verify that both files have the same format */ @@ -2079,15 +2076,13 @@ xfs_swap_extents( trace_xfs_swap_extent_after(ip, 0); trace_xfs_swap_extent_after(tip, 1); +out_unlock: xfs_iunlock(ip, lock_flags); xfs_iunlock(tip, lock_flags); + unlock_two_nondirectories(VFS_I(ip), VFS_I(tip)); return error; out_trans_cancel: xfs_trans_cancel(tp); - -out_unlock: - xfs_iunlock(ip, lock_flags); - xfs_iunlock(tip, lock_flags); - return error; + goto out_unlock; } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index b5b9bffe3520..7f0a01f7b592 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -219,7 +219,6 @@ _xfs_buf_alloc( init_completion(&bp->b_iowait); INIT_LIST_HEAD(&bp->b_lru); INIT_LIST_HEAD(&bp->b_list); - RB_CLEAR_NODE(&bp->b_rbnode); sema_init(&bp->b_sema, 0); /* held, no waiters */ spin_lock_init(&bp->b_lock); XB_SET_OWNER(bp); @@ -473,6 +472,62 @@ _xfs_buf_map_pages( /* * Finding and Reading Buffers */ +static int +_xfs_buf_obj_cmp( + struct rhashtable_compare_arg *arg, + const void *obj) +{ + const struct xfs_buf_map *map = arg->key; + const struct xfs_buf *bp = obj; + + /* + * The key hashing in the lookup path depends on the key being the + * first element of the compare_arg, make sure to assert this. + */ + BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0); + + if (bp->b_bn != map->bm_bn) + return 1; + + if (unlikely(bp->b_length != map->bm_len)) { + /* + * found a block number match. If the range doesn't + * match, the only way this is allowed is if the buffer + * in the cache is stale and the transaction that made + * it stale has not yet committed. i.e. we are + * reallocating a busy extent. Skip this buffer and + * continue searching for an exact match. + */ + ASSERT(bp->b_flags & XBF_STALE); + return 1; + } + return 0; +} + +static const struct rhashtable_params xfs_buf_hash_params = { + .min_size = 32, /* empty AGs have minimal footprint */ + .nelem_hint = 16, + .key_len = sizeof(xfs_daddr_t), + .key_offset = offsetof(struct xfs_buf, b_bn), + .head_offset = offsetof(struct xfs_buf, b_rhash_head), + .automatic_shrinking = true, + .obj_cmpfn = _xfs_buf_obj_cmp, +}; + +int +xfs_buf_hash_init( + struct xfs_perag *pag) +{ + spin_lock_init(&pag->pag_buf_lock); + return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params); +} + +void +xfs_buf_hash_destroy( + struct xfs_perag *pag) +{ + rhashtable_destroy(&pag->pag_buf_hash); +} /* * Look up, and creates if absent, a lockable buffer for @@ -488,27 +543,24 @@ _xfs_buf_find( xfs_buf_t *new_bp) { struct xfs_perag *pag; - struct rb_node **rbp; - struct rb_node *parent; xfs_buf_t *bp; - xfs_daddr_t blkno = map[0].bm_bn; + struct xfs_buf_map cmap = { .bm_bn = map[0].bm_bn }; xfs_daddr_t eofs; - int numblks = 0; int i; for (i = 0; i < nmaps; i++) - numblks += map[i].bm_len; + cmap.bm_len += map[i].bm_len; /* Check for IOs smaller than the sector size / not sector aligned */ - ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize)); - ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask)); + ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize)); + ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask)); /* * Corrupted block numbers can get through to here, unfortunately, so we * have to check that the buffer falls within the filesystem bounds. 
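The per-AG buffer cache above swaps its rb-tree for a rhashtable. Two details carry the old semantics over: the lookup key struct starts with the block number so the default hashing hashes the right bytes (the BUILD_BUG_ON pins that layout), and the custom obj_cmpfn reports a length mismatch as "not equal", so a stale buffer left by a reallocated busy extent is skipped exactly as the rb-tree walk used to do. The generic registration pattern, reduced to its bones (kernel-context fragment; struct and field names invented):

        #include <linux/rhashtable.h>

        struct obj {
                u64                     key;    /* must match key_offset */
                struct rhash_head       node;
        };

        static const struct rhashtable_params obj_params = {
                .key_len     = sizeof(u64),
                .key_offset  = offsetof(struct obj, key),
                .head_offset = offsetof(struct obj, node),
                .automatic_shrinking = true,
        };

        /*      rhashtable_init(&ht, &obj_params);
         *      rhashtable_insert_fast(&ht, &o->node, obj_params);
         *      o = rhashtable_lookup_fast(&ht, &key, obj_params);
         *      rhashtable_remove_fast(&ht, &o->node, obj_params);
         */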
*/ eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks); - if (blkno < 0 || blkno >= eofs) { + if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) { /* * XXX (dgc): we should really be returning -EFSCORRUPTED here, * but none of the higher level infrastructure supports @@ -516,53 +568,29 @@ _xfs_buf_find( */ xfs_alert(btp->bt_mount, "%s: Block out of range: block 0x%llx, EOFS 0x%llx ", - __func__, blkno, eofs); + __func__, cmap.bm_bn, eofs); WARN_ON(1); return NULL; } - /* get tree root */ pag = xfs_perag_get(btp->bt_mount, - xfs_daddr_to_agno(btp->bt_mount, blkno)); + xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn)); - /* walk tree */ spin_lock(&pag->pag_buf_lock); - rbp = &pag->pag_buf_tree.rb_node; - parent = NULL; - bp = NULL; - while (*rbp) { - parent = *rbp; - bp = rb_entry(parent, struct xfs_buf, b_rbnode); - - if (blkno < bp->b_bn) - rbp = &(*rbp)->rb_left; - else if (blkno > bp->b_bn) - rbp = &(*rbp)->rb_right; - else { - /* - * found a block number match. If the range doesn't - * match, the only way this is allowed is if the buffer - * in the cache is stale and the transaction that made - * it stale has not yet committed. i.e. we are - * reallocating a busy extent. Skip this buffer and - * continue searching to the right for an exact match. - */ - if (bp->b_length != numblks) { - ASSERT(bp->b_flags & XBF_STALE); - rbp = &(*rbp)->rb_right; - continue; - } - atomic_inc(&bp->b_hold); - goto found; - } + bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap, + xfs_buf_hash_params); + if (bp) { + atomic_inc(&bp->b_hold); + goto found; } /* No match found */ if (new_bp) { - rb_link_node(&new_bp->b_rbnode, parent, rbp); - rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree); /* the buffer keeps the perag reference until it is freed */ new_bp->b_pag = pag; + rhashtable_insert_fast(&pag->pag_buf_hash, + &new_bp->b_rhash_head, + xfs_buf_hash_params); spin_unlock(&pag->pag_buf_lock); } else { XFS_STATS_INC(btp->bt_mount, xb_miss_locked); @@ -930,7 +958,6 @@ xfs_buf_rele( if (!pag) { ASSERT(list_empty(&bp->b_lru)); - ASSERT(RB_EMPTY_NODE(&bp->b_rbnode)); if (atomic_dec_and_test(&bp->b_hold)) { xfs_buf_ioacct_dec(bp); xfs_buf_free(bp); @@ -938,8 +965,6 @@ xfs_buf_rele( return; } - ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode)); - ASSERT(atomic_read(&bp->b_hold) > 0); release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock); @@ -983,7 +1008,8 @@ xfs_buf_rele( } ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); - rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); + rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head, + xfs_buf_hash_params); spin_unlock(&pag->pag_buf_lock); xfs_perag_put(pag); freebuf = true; @@ -1304,7 +1330,7 @@ _xfs_buf_ioapply( if (bp->b_flags & XBF_WRITE) { op = REQ_OP_WRITE; if (bp->b_flags & XBF_SYNCIO) - op_flags = WRITE_SYNC; + op_flags = REQ_SYNC; if (bp->b_flags & XBF_FUA) op_flags |= REQ_FUA; if (bp->b_flags & XBF_FLUSH) @@ -1711,8 +1737,7 @@ xfs_free_buftarg( percpu_counter_destroy(&btp->bt_io_count); list_lru_destroy(&btp->bt_lru); - if (mp->m_flags & XFS_MOUNT_BARRIER) - xfs_blkdev_issue_flush(btp); + xfs_blkdev_issue_flush(btp); kmem_free(btp); } diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 1c2e52b2d926..8a9d3a9599f0 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -71,6 +71,7 @@ typedef unsigned int xfs_buf_flags_t; { XBF_READ, "READ" }, \ { XBF_WRITE, "WRITE" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ + { XBF_NO_IOACCT, "NO_IOACCT" }, \ { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ @@ -150,7 +151,7 @@ typedef struct 
xfs_buf { * which is the only bit that is touched if we hit the semaphore * fast-path on locking. */ - struct rb_node b_rbnode; /* rbtree node */ + struct rhash_head b_rhash_head; /* pag buffer hash node */ xfs_daddr_t b_bn; /* block number of buffer */ int b_length; /* size of buffer in BBs */ atomic_t b_hold; /* reference count */ diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c index 29816981b50a..003a99b83bd8 100644 --- a/fs/xfs/xfs_dir2_readdir.c +++ b/fs/xfs/xfs_dir2_readdir.c @@ -677,7 +677,6 @@ xfs_readdir( args.dp = dp; args.geo = dp->i_mount->m_dir_geo; - xfs_ilock(dp, XFS_IOLOCK_SHARED); if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) rval = xfs_dir2_sf_getdents(&args, ctx); else if ((rval = xfs_dir2_isblock(&args, &v))) @@ -686,7 +685,6 @@ xfs_readdir( rval = xfs_dir2_block_getdents(&args, ctx); else rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize); - xfs_iunlock(dp, XFS_IOLOCK_SHARED); return rval; } diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 7a30b8f11db7..9d06cc30e875 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -710,6 +710,10 @@ xfs_dq_get_next_id( /* Simple advance */ next_id = *id + 1; + /* If we'd wrap past the max ID, stop */ + if (next_id < *id) + return -ENOENT; + /* If new ID is within the current chunk, advancing it sufficed */ if (next_id % mp->m_quotainfo->qi_dqperchunk) { *id = next_id; diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 6e4f7f900fea..bbb9eb6811b2 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -48,40 +48,6 @@ static const struct vm_operations_struct xfs_file_vm_ops; /* - * Locking primitives for read and write IO paths to ensure we consistently use - * and order the inode->i_mutex, ip->i_lock and ip->i_iolock. - */ -static inline void -xfs_rw_ilock( - struct xfs_inode *ip, - int type) -{ - if (type & XFS_IOLOCK_EXCL) - inode_lock(VFS_I(ip)); - xfs_ilock(ip, type); -} - -static inline void -xfs_rw_iunlock( - struct xfs_inode *ip, - int type) -{ - xfs_iunlock(ip, type); - if (type & XFS_IOLOCK_EXCL) - inode_unlock(VFS_I(ip)); -} - -static inline void -xfs_rw_ilock_demote( - struct xfs_inode *ip, - int type) -{ - xfs_ilock_demote(ip, type); - if (type & XFS_IOLOCK_EXCL) - inode_unlock(VFS_I(ip)); -} - -/* * Clear the specified ranges to zero through either the pagecache or DAX. * Holes and unwritten extents will be left as-is as they already are zeroed. */ @@ -183,19 +149,16 @@ xfs_file_fsync( xfs_iflags_clear(ip, XFS_ITRUNCATED); - if (mp->m_flags & XFS_MOUNT_BARRIER) { - /* - * If we have an RT and/or log subvolume we need to make sure - * to flush the write cache the device used for file data - * first. This is to ensure newly written file data make - * it to disk before logging the new inode size in case of - * an extending write. - */ - if (XFS_IS_REALTIME_INODE(ip)) - xfs_blkdev_issue_flush(mp->m_rtdev_targp); - else if (mp->m_logdev_targp != mp->m_ddev_targp) - xfs_blkdev_issue_flush(mp->m_ddev_targp); - } + /* + * If we have an RT and/or log subvolume we need to make sure to flush + * the write cache the device used for file data first. This is to + * ensure newly written file data make it to disk before logging the new + * inode size in case of an extending write. 
+ */ + if (XFS_IS_REALTIME_INODE(ip)) + xfs_blkdev_issue_flush(mp->m_rtdev_targp); + else if (mp->m_logdev_targp != mp->m_ddev_targp) + xfs_blkdev_issue_flush(mp->m_ddev_targp); /* * All metadata updates are logged, which means that we just have to @@ -230,10 +193,8 @@ xfs_file_fsync( * an already allocated file and thus do not have any metadata to * commit. */ - if ((mp->m_flags & XFS_MOUNT_BARRIER) && - mp->m_logdev_targp == mp->m_ddev_targp && - !XFS_IS_REALTIME_INODE(ip) && - !log_flushed) + if (!log_flushed && !XFS_IS_REALTIME_INODE(ip) && + mp->m_logdev_targp == mp->m_ddev_targp) xfs_blkdev_issue_flush(mp->m_ddev_targp); return error; @@ -244,62 +205,21 @@ xfs_file_dio_aio_read( struct kiocb *iocb, struct iov_iter *to) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - struct inode *inode = mapping->host; - struct xfs_inode *ip = XFS_I(inode); - loff_t isize = i_size_read(inode); + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); size_t count = iov_iter_count(to); - loff_t end = iocb->ki_pos + count - 1; - struct iov_iter data; - struct xfs_buftarg *target; - ssize_t ret = 0; + ssize_t ret; trace_xfs_file_direct_read(ip, count, iocb->ki_pos); if (!count) return 0; /* skip atime */ - if (XFS_IS_REALTIME_INODE(ip)) - target = ip->i_mount->m_rtdev_targp; - else - target = ip->i_mount->m_ddev_targp; - - /* DIO must be aligned to device logical sector size */ - if ((iocb->ki_pos | count) & target->bt_logical_sectormask) { - if (iocb->ki_pos == isize) - return 0; - return -EINVAL; - } - file_accessed(iocb->ki_filp); - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - if (mapping->nrpages) { - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); - if (ret) - goto out_unlock; + xfs_ilock(ip, XFS_IOLOCK_SHARED); + ret = iomap_dio_rw(iocb, to, &xfs_iomap_ops, NULL); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); - /* - * Invalidate whole pages. This can return an error if we fail - * to invalidate a page, but this should never happen on XFS. - * Warn if it does fail. - */ - ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - ret = 0; - } - - data = *to; - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, NULL, NULL, 0); - if (ret >= 0) { - iocb->ki_pos += ret; - iov_iter_advance(to, ret); - } - -out_unlock: - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -317,9 +237,9 @@ xfs_file_dax_read( if (!count) return 0; /* skip atime */ - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); - ret = iomap_dax_rw(iocb, to, &xfs_iomap_ops); - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_ilock(ip, XFS_IOLOCK_SHARED); + ret = dax_iomap_rw(iocb, to, &xfs_iomap_ops); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); file_accessed(iocb->ki_filp); return ret; @@ -335,9 +255,9 @@ xfs_file_buffered_aio_read( trace_xfs_file_buffered_read(ip, iov_iter_count(to), iocb->ki_pos); - xfs_rw_ilock(ip, XFS_IOLOCK_SHARED); + xfs_ilock(ip, XFS_IOLOCK_SHARED); ret = generic_file_read_iter(iocb, to); - xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); return ret; } @@ -418,15 +338,18 @@ restart: if (error <= 0) return error; - error = xfs_break_layouts(inode, iolock, true); + error = xfs_break_layouts(inode, iolock); if (error) return error; - /* For changing security info in file_remove_privs() we need i_mutex */ + /* + * For changing security info in file_remove_privs() we need i_rwsem + * exclusively. 
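With the private i_iolock removed, XFS_IOLOCK_* now maps onto the VFS i_rwsem, which (like any rwsem) cannot be upgraded in place: a shared holder that discovers it needs exclusive access has to drop, retake, and redo its checks, which is exactly the goto restart pattern in the hunk below. The primitive sequence, as a kernel-context sketch:

        #include <linux/rwsem.h>

        static void upgrade_then_demote(struct rw_semaphore *sem)
        {
                down_read(sem);
                /* ...discover we need exclusive access... */
                up_read(sem);
                down_write(sem);
                /* re-run every check: the world may have changed
                 * while the lock was dropped */
                downgrade_write(sem);   /* back to shared, atomically */
                up_read(sem);
        }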
+ */ if (*iolock == XFS_IOLOCK_SHARED && !IS_NOSEC(inode)) { - xfs_rw_iunlock(ip, *iolock); + xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + xfs_ilock(ip, *iolock); goto restart; } /* @@ -451,9 +374,9 @@ restart: spin_unlock(&ip->i_flags_lock); if (!drained_dio) { if (*iolock == XFS_IOLOCK_SHARED) { - xfs_rw_iunlock(ip, *iolock); + xfs_iunlock(ip, *iolock); *iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, *iolock); + xfs_ilock(ip, *iolock); iov_iter_reexpand(from, count); } /* @@ -496,6 +419,58 @@ restart: return 0; } +static int +xfs_dio_write_end_io( + struct kiocb *iocb, + ssize_t size, + unsigned flags) +{ + struct inode *inode = file_inode(iocb->ki_filp); + struct xfs_inode *ip = XFS_I(inode); + loff_t offset = iocb->ki_pos; + bool update_size = false; + int error = 0; + + trace_xfs_end_io_direct_write(ip, offset, size); + + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + if (size <= 0) + return size; + + /* + * We need to update the in-core inode size here so that we don't end up + * with the on-disk inode size being outside the in-core inode size. We + * have no other method of updating EOF for AIO, so always do it here + * if necessary. + * + * We need to lock the test/set EOF update as we can be racing with + * other IO completions here to update the EOF. Failing to serialise + * here can result in EOF moving backwards and Bad Things Happen when + * that occurs. + */ + spin_lock(&ip->i_flags_lock); + if (offset + size > i_size_read(inode)) { + i_size_write(inode, offset + size); + update_size = true; + } + spin_unlock(&ip->i_flags_lock); + + if (flags & IOMAP_DIO_COW) { + error = xfs_reflink_end_cow(ip, offset, size); + if (error) + return error; + } + + if (flags & IOMAP_DIO_UNWRITTEN) + error = xfs_iomap_write_unwritten(ip, offset, size); + else if (update_size) + error = xfs_setfilesize(ip, offset, size); + + return error; +} + /* * xfs_file_dio_aio_write - handle direct IO writes * @@ -535,9 +510,7 @@ xfs_file_dio_aio_write( int unaligned_io = 0; int iolock; size_t count = iov_iter_count(from); - loff_t end; - struct iov_iter data; - struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? + struct xfs_buftarg *target = XFS_IS_REALTIME_INODE(ip) ? mp->m_rtdev_targp : mp->m_ddev_targp; /* DIO must be aligned to device logical sector size */ @@ -559,29 +532,12 @@ xfs_file_dio_aio_write( iolock = XFS_IOLOCK_SHARED; } - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; count = iov_iter_count(from); - end = iocb->ki_pos + count - 1; - - if (mapping->nrpages) { - ret = filemap_write_and_wait_range(mapping, iocb->ki_pos, end); - if (ret) - goto out; - - /* - * Invalidate whole pages. This can return an error if we fail - * to invalidate a page, but this should never happen on XFS. - * Warn if it does fail. 
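All the hand-rolled cache flushing, invalidation and __blockdev_direct_IO plumbing deleted below is what iomap_dio_rw() now does internally; the filesystem's remaining job is the completion work, handed over as a callback (xfs_dio_write_end_io above). The callback shape as of this series (kernel-context sketch; my_iomap_ops is a placeholder):

        /* iomap_dio_end_io_t: a non-positive size is the IO result or
         * error, flags say whether CoW remap or unwritten conversion
         * is still owed. */
        static int my_dio_end_io(struct kiocb *iocb, ssize_t size,
                                 unsigned flags)
        {
                if (size <= 0)
                        return size;
                if (flags & IOMAP_DIO_UNWRITTEN) {
                        /* convert the just-written range to written */
                }
                return 0;
        }

        /* ret = iomap_dio_rw(iocb, from, &my_iomap_ops, my_dio_end_io); */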
- */ - ret = invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); - ret = 0; - } /* * If we are doing unaligned IO, wait for all other IO to drain, @@ -591,7 +547,7 @@ xfs_file_dio_aio_write( if (unaligned_io) inode_dio_wait(inode); else if (iolock == XFS_IOLOCK_EXCL) { - xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL); + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); iolock = XFS_IOLOCK_SHARED; } @@ -604,24 +560,9 @@ xfs_file_dio_aio_write( goto out; } - data = *from; - ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data, - xfs_get_blocks_direct, xfs_end_io_direct_write, - NULL, DIO_ASYNC_EXTEND); - - /* see generic_file_direct_write() for why this is necessary */ - if (mapping->nrpages) { - invalidate_inode_pages2_range(mapping, - iocb->ki_pos >> PAGE_SHIFT, - end >> PAGE_SHIFT); - } - - if (ret > 0) { - iocb->ki_pos += ret; - iov_iter_advance(from, ret); - } + ret = iomap_dio_rw(iocb, from, &xfs_iomap_ops, xfs_dio_write_end_io); out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); /* * No fallback to buffered IO on errors for XFS, direct IO will either @@ -643,7 +584,7 @@ xfs_file_dax_write( size_t count; loff_t pos; - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) goto out; @@ -652,15 +593,13 @@ xfs_file_dax_write( count = iov_iter_count(from); trace_xfs_file_dax_write(ip, count, pos); - - ret = iomap_dax_rw(iocb, from, &xfs_iomap_ops); + ret = dax_iomap_rw(iocb, from, &xfs_iomap_ops); if (ret > 0 && iocb->ki_pos > i_size_read(inode)) { i_size_write(inode, iocb->ki_pos); error = xfs_setfilesize(ip, pos, ret); } - out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); return error ? error : ret; } @@ -677,7 +616,7 @@ xfs_file_buffered_aio_write( int enospc = 0; int iolock = XFS_IOLOCK_EXCL; - xfs_rw_ilock(ip, iolock); + xfs_ilock(ip, iolock); ret = xfs_file_aio_write_checks(iocb, from, &iolock); if (ret) @@ -721,7 +660,7 @@ write_retry: current->backing_dev_info = NULL; out: - xfs_rw_iunlock(ip, iolock); + xfs_iunlock(ip, iolock); return ret; } @@ -797,7 +736,7 @@ xfs_file_fallocate( return -EOPNOTSUPP; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, false); + error = xfs_break_layouts(inode, &iolock); if (error) goto out_unlock; @@ -909,24 +848,6 @@ out_unlock: return error; } -STATIC ssize_t -xfs_file_copy_range( - struct file *file_in, - loff_t pos_in, - struct file *file_out, - loff_t pos_out, - size_t len, - unsigned int flags) -{ - int error; - - error = xfs_reflink_remap_range(file_in, pos_in, file_out, pos_out, - len, false); - if (error) - return error; - return len; -} - STATIC int xfs_file_clone_range( struct file *file_in, @@ -939,7 +860,6 @@ xfs_file_clone_range( len, false); } -#define XFS_MAX_DEDUPE_LEN (16 * 1024 * 1024) STATIC ssize_t xfs_file_dedupe_range( struct file *src_file, @@ -950,14 +870,6 @@ xfs_file_dedupe_range( { int error; - /* - * Limit the total length we will dedupe for each operation. - * This is intended to bound the total time spent in this - * ioctl to something sane. 
- */ - if (len > XFS_MAX_DEDUPE_LEN) - len = XFS_MAX_DEDUPE_LEN; - error = xfs_reflink_remap_range(src_file, loff, dst_file, dst_loff, len, true); if (error) @@ -1474,7 +1386,7 @@ xfs_filemap_page_mkwrite( xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (IS_DAX(inode)) { - ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); + ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); } else { ret = iomap_page_mkwrite(vma, vmf, &xfs_iomap_ops); ret = block_page_mkwrite_return(ret); @@ -1501,15 +1413,9 @@ xfs_filemap_fault( return xfs_filemap_page_mkwrite(vma, vmf); xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - if (IS_DAX(inode)) { - /* - * we do not want to trigger unwritten extent conversion on read - * faults - that is unnecessary overhead and would also require - * changes to xfs_get_blocks_direct() to map unwritten extent - * ioend for conversion on read-only mappings. - */ - ret = iomap_dax_fault(vma, vmf, &xfs_iomap_ops); - } else + if (IS_DAX(inode)) + ret = dax_iomap_fault(vma, vmf, &xfs_iomap_ops); + else ret = filemap_fault(vma, vmf); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); @@ -1545,7 +1451,7 @@ xfs_filemap_pmd_fault( } xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = dax_pmd_fault(vma, addr, pmd, flags, xfs_get_blocks_dax_fault); + ret = dax_iomap_pmd_fault(vma, addr, pmd, flags, &xfs_iomap_ops); xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); if (flags & FAULT_FLAG_WRITE) @@ -1625,7 +1531,6 @@ const struct file_operations xfs_file_operations = { .fsync = xfs_file_fsync, .get_unmapped_area = thp_get_unmapped_area, .fallocate = xfs_file_fallocate, - .copy_file_range = xfs_file_copy_range, .clone_file_range = xfs_file_clone_range, .dedupe_file_range = xfs_file_dedupe_range, }; diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 93d12fa2670d..242e8091296d 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -631,6 +631,20 @@ xfs_growfs_data_private( xfs_set_low_space_thresholds(mp); mp->m_alloc_set_aside = xfs_alloc_set_aside(mp); + /* + * If we expanded the last AG, free the per-AG reservation + * so we can reinitialize it with the new size. + */ + if (new) { + struct xfs_perag *pag; + + pag = xfs_perag_get(mp, agno); + error = xfs_ag_resv_free(pag); + xfs_perag_put(pag); + if (error) + goto out; + } + /* Reserve AG metadata blocks. 
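 * (Re-establishes the per-AG reservations, including the one freed
 * just above when the last AG was grown.)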
*/ error = xfs_fs_reserve_ag_blocks(mp); if (error && error != -ENOSPC) diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index f295049db681..70ca4f608321 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -70,8 +70,6 @@ xfs_inode_alloc( ASSERT(!xfs_isiflocked(ip)); ASSERT(ip->i_ino == 0); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); - /* initialise the xfs inode */ ip->i_ino = ino; ip->i_mount = mp; @@ -123,7 +121,6 @@ __xfs_inode_free( { /* asserts to verify all state is correct here */ ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!xfs_isiflocked(ip)); XFS_STATS_DEC(ip->i_mount, vn_active); call_rcu(&VFS_I(ip)->i_rcu, xfs_inode_free_callback); @@ -133,6 +130,8 @@ void xfs_inode_free( struct xfs_inode *ip) { + ASSERT(!xfs_isiflocked(ip)); + /* * Because we use RCU freeing we need to ensure the inode always * appears to be reclaimed with an invalid inode number when in the @@ -393,8 +392,8 @@ xfs_iget_cache_hit( xfs_inode_clear_reclaim_tag(pag, ip->i_ino); inode->i_state = I_NEW; - ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock)); - mrlock_init(&ip->i_iolock, MRLOCK_BARRIER, "xfsio", ip->i_ino); + ASSERT(!rwsem_is_locked(&inode->i_rwsem)); + init_rwsem(&inode->i_rwsem); spin_unlock(&ip->i_flags_lock); spin_unlock(&pag->pag_ici_lock); @@ -981,6 +980,7 @@ restart: if (XFS_FORCED_SHUTDOWN(ip->i_mount)) { xfs_iunpin_wait(ip); + /* xfs_iflush_abort() drops the flush lock */ xfs_iflush_abort(ip, false); goto reclaim; } @@ -989,10 +989,10 @@ restart: goto out_ifunlock; xfs_iunpin_wait(ip); } - if (xfs_iflags_test(ip, XFS_ISTALE)) - goto reclaim; - if (xfs_inode_clean(ip)) + if (xfs_iflags_test(ip, XFS_ISTALE) || xfs_inode_clean(ip)) { + xfs_ifunlock(ip); goto reclaim; + } /* * Never flush out dirty data during non-blocking reclaim, as it would @@ -1030,25 +1030,24 @@ restart: xfs_buf_relse(bp); } - xfs_iflock(ip); reclaim: + ASSERT(!xfs_isiflocked(ip)); + /* * Because we use RCU freeing we need to ensure the inode always appears * to be reclaimed with an invalid inode number when in the free state. - * We do this as early as possible under the ILOCK and flush lock so - * that xfs_iflush_cluster() can be guaranteed to detect races with us - * here. By doing this, we guarantee that once xfs_iflush_cluster has - * locked both the XFS_ILOCK and the flush lock that it will see either - * a valid, flushable inode that will serialise correctly against the - * locks below, or it will see a clean (and invalid) inode that it can - * skip. + * We do this as early as possible under the ILOCK so that + * xfs_iflush_cluster() can be guaranteed to detect races with us here. + * By doing this, we guarantee that once xfs_iflush_cluster has locked + * XFS_ILOCK that it will see either a valid, flushable inode that will + * serialise correctly, or it will see a clean (and invalid) inode that + * it can skip. */ spin_lock(&ip->i_flags_lock); ip->i_flags = XFS_IRECLAIM; ip->i_ino = 0; spin_unlock(&ip->i_flags_lock); - xfs_ifunlock(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL); XFS_STATS_INC(ip->i_mount, xs_ig_reclaims); @@ -1580,10 +1579,15 @@ xfs_inode_free_cowblocks( struct xfs_eofblocks *eofb = args; bool need_iolock = true; int match; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0)); - if (!xfs_reflink_has_real_cow_blocks(ip)) { + /* + * Just clear the tag if we have an empty cow fork or none at all. It's + * possible the inode was fully unshared since it was originally tagged. 
+ */ + if (!xfs_is_reflink_inode(ip) || !ifp->if_bytes) { trace_xfs_inode_free_cowblocks_invalid(ip); xfs_inode_clear_cowblocks_tag(ip); return 0; @@ -1593,7 +1597,8 @@ xfs_inode_free_cowblocks( * If the mapping is dirty or under writeback we cannot touch the * CoW fork. Leave it alone if we're in the midst of a directio. */ - if (mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || + if ((VFS_I(ip)->i_state & I_DIRTY_PAGES) || + mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) || mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) || atomic_read(&VFS_I(ip)->i_dio_count)) return 0; diff --git a/fs/xfs/xfs_icreate_item.c b/fs/xfs/xfs_icreate_item.c index d45ca72af6fb..865ad1373e5e 100644 --- a/fs/xfs/xfs_icreate_item.c +++ b/fs/xfs/xfs_icreate_item.c @@ -133,7 +133,7 @@ xfs_icreate_item_committing( /* * This is the ops vector shared by all buf log items. */ -static struct xfs_item_ops xfs_icreate_item_ops = { +static const struct xfs_item_ops xfs_icreate_item_ops = { .iop_size = xfs_icreate_item_size, .iop_format = xfs_icreate_item_format, .iop_pin = xfs_icreate_item_pin, diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 4e560e6a12c1..b9557795eb74 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -142,31 +142,31 @@ xfs_ilock_attr_map_shared( } /* - * The xfs inode contains 3 multi-reader locks: the i_iolock the i_mmap_lock and - * the i_lock. This routine allows various combinations of the locks to be - * obtained. + * In addition to i_rwsem in the VFS inode, the xfs inode contains 2 + * multi-reader locks: i_mmap_lock and the i_lock. This routine allows + * various combinations of the locks to be obtained. * * The 3 locks should always be ordered so that the IO lock is obtained first, * the mmap lock second and the ilock last in order to prevent deadlock. * * Basic locking order: * - * i_iolock -> i_mmap_lock -> page_lock -> i_ilock + * i_rwsem -> i_mmap_lock -> page_lock -> i_ilock * * mmap_sem locking order: * - * i_iolock -> page lock -> mmap_sem + * i_rwsem -> page lock -> mmap_sem * mmap_sem -> i_mmap_lock -> page_lock * * The difference in mmap_sem locking order mean that we cannot hold the * i_mmap_lock over syscall based read(2)/write(2) based IO. These IO paths can * fault in pages during copy in/out (for buffered IO) or require the mmap_sem * in get_user_pages() to map the user pages into the kernel address space for - * direct IO. Similarly the i_iolock cannot be taken inside a page fault because + * direct IO. Similarly the i_rwsem cannot be taken inside a page fault because * page faults already hold the mmap_sem. * * Hence to serialise fully against both syscall and mmap based IO, we need to - * take both the i_iolock and the i_mmap_lock. These locks should *only* be both + * take both the i_rwsem and the i_mmap_lock. These locks should *only* be both * taken in places where we need to invalidate the page cache in a race * free manner (e.g. truncate, hole punch and other extent manipulation * functions). 
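The hunks below finish the conversion: XFS_IOLOCK_EXCL and XFS_IOLOCK_SHARED are now routed onto the VFS i_rwsem via down_write_nested()/down_read_nested(), and the mrlock-based i_iolock is deleted. As a minimal sketch (not taken from this diff) of the ordering the rewritten comment prescribes, consider a hypothetical page-cache-invalidating caller along the lines of truncate or hole punch; start and end are illustrative, and xfs_ilock()/xfs_iunlock() are assumed to behave as converted below:

	/* Sketch only: take the three inode locks in the documented order. */
	xfs_ilock(ip, XFS_IOLOCK_EXCL);		/* VFS i_rwsem: stops syscall IO */
	xfs_ilock(ip, XFS_MMAPLOCK_EXCL);	/* stops page faults */
	truncate_pagecache_range(VFS_I(ip), start, end);
	xfs_ilock(ip, XFS_ILOCK_EXCL);		/* metadata lock, always last */
	/* ... manipulate extents under a transaction ... */
	xfs_iunlock(ip, XFS_ILOCK_EXCL);
	xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
	xfs_iunlock(ip, XFS_IOLOCK_EXCL);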
@@ -191,10 +191,13 @@ xfs_ilock( (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL)); ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); - if (lock_flags & XFS_IOLOCK_EXCL) - mrupdate_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); - else if (lock_flags & XFS_IOLOCK_SHARED) - mraccess_nested(&ip->i_iolock, XFS_IOLOCK_DEP(lock_flags)); + if (lock_flags & XFS_IOLOCK_EXCL) { + down_write_nested(&VFS_I(ip)->i_rwsem, + XFS_IOLOCK_DEP(lock_flags)); + } else if (lock_flags & XFS_IOLOCK_SHARED) { + down_read_nested(&VFS_I(ip)->i_rwsem, + XFS_IOLOCK_DEP(lock_flags)); + } if (lock_flags & XFS_MMAPLOCK_EXCL) mrupdate_nested(&ip->i_mmaplock, XFS_MMAPLOCK_DEP(lock_flags)); @@ -240,10 +243,10 @@ xfs_ilock_nowait( ASSERT((lock_flags & ~(XFS_LOCK_MASK | XFS_LOCK_SUBCLASS_MASK)) == 0); if (lock_flags & XFS_IOLOCK_EXCL) { - if (!mrtryupdate(&ip->i_iolock)) + if (!down_write_trylock(&VFS_I(ip)->i_rwsem)) goto out; } else if (lock_flags & XFS_IOLOCK_SHARED) { - if (!mrtryaccess(&ip->i_iolock)) + if (!down_read_trylock(&VFS_I(ip)->i_rwsem)) goto out; } @@ -271,9 +274,9 @@ out_undo_mmaplock: mrunlock_shared(&ip->i_mmaplock); out_undo_iolock: if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); + up_write(&VFS_I(ip)->i_rwsem); else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); + up_read(&VFS_I(ip)->i_rwsem); out: return 0; } @@ -310,9 +313,9 @@ xfs_iunlock( ASSERT(lock_flags != 0); if (lock_flags & XFS_IOLOCK_EXCL) - mrunlock_excl(&ip->i_iolock); + up_write(&VFS_I(ip)->i_rwsem); else if (lock_flags & XFS_IOLOCK_SHARED) - mrunlock_shared(&ip->i_iolock); + up_read(&VFS_I(ip)->i_rwsem); if (lock_flags & XFS_MMAPLOCK_EXCL) mrunlock_excl(&ip->i_mmaplock); @@ -345,7 +348,7 @@ xfs_ilock_demote( if (lock_flags & XFS_MMAPLOCK_EXCL) mrdemote(&ip->i_mmaplock); if (lock_flags & XFS_IOLOCK_EXCL) - mrdemote(&ip->i_iolock); + downgrade_write(&VFS_I(ip)->i_rwsem); trace_xfs_ilock_demote(ip, lock_flags, _RET_IP_); } @@ -370,8 +373,9 @@ xfs_isilocked( if (lock_flags & (XFS_IOLOCK_EXCL|XFS_IOLOCK_SHARED)) { if (!(lock_flags & XFS_IOLOCK_SHARED)) - return !!ip->i_iolock.mr_writer; - return rwsem_is_locked(&ip->i_iolock.mr_lock); + return !debug_locks || + lockdep_is_held_type(&VFS_I(ip)->i_rwsem, 0); + return rwsem_is_locked(&VFS_I(ip)->i_rwsem); } ASSERT(0); @@ -421,11 +425,7 @@ xfs_lock_inumorder(int lock_mode, int subclass) if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { ASSERT(subclass <= XFS_IOLOCK_MAX_SUBCLASS); - ASSERT(xfs_lockdep_subclass_ok(subclass + - XFS_IOLOCK_PARENT_VAL)); class += subclass << XFS_IOLOCK_SHIFT; - if (lock_mode & XFS_IOLOCK_PARENT) - class += XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT; } if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) { @@ -477,8 +477,6 @@ xfs_lock_inodes( XFS_ILOCK_EXCL)); ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED | XFS_MMAPLOCK_SHARED | XFS_ILOCK_SHARED))); - ASSERT(!(lock_mode & XFS_IOLOCK_EXCL) || - inodes <= XFS_IOLOCK_MAX_SUBCLASS + 1); ASSERT(!(lock_mode & XFS_MMAPLOCK_EXCL) || inodes <= XFS_MMAPLOCK_MAX_SUBCLASS + 1); ASSERT(!(lock_mode & XFS_ILOCK_EXCL) || @@ -581,10 +579,8 @@ xfs_lock_two_inodes( int attempts = 0; xfs_log_item_t *lp; - if (lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL)) { - ASSERT(!(lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL))); - ASSERT(!(lock_mode & (XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); - } else if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) + ASSERT(!(lock_mode & (XFS_IOLOCK_SHARED|XFS_IOLOCK_EXCL))); + if (lock_mode & (XFS_MMAPLOCK_SHARED|XFS_MMAPLOCK_EXCL)) ASSERT(!(lock_mode & 
(XFS_ILOCK_SHARED|XFS_ILOCK_EXCL))); ASSERT(ip0->i_ino != ip1->i_ino); @@ -715,7 +711,6 @@ xfs_lookup( if (XFS_FORCED_SHUTDOWN(dp->i_mount)) return -EIO; - xfs_ilock(dp, XFS_IOLOCK_SHARED); error = xfs_dir_lookup(NULL, dp, name, &inum, ci_name); if (error) goto out_unlock; @@ -724,14 +719,12 @@ xfs_lookup( if (error) goto out_free_name; - xfs_iunlock(dp, XFS_IOLOCK_SHARED); return 0; out_free_name: if (ci_name) kmem_free(ci_name->name); out_unlock: - xfs_iunlock(dp, XFS_IOLOCK_SHARED); *ipp = NULL; return error; } @@ -1215,8 +1208,7 @@ xfs_create( if (error) goto out_release_inode; - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL | - XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT); + xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT); unlock_dp_on_error = true; xfs_defer_init(&dfops, &first_block); @@ -1252,7 +1244,7 @@ xfs_create( * the transaction cancel unlocking dp so don't do it explicitly in the * error path. */ - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); unlock_dp_on_error = false; error = xfs_dir_createname(tp, dp, name, ip->i_ino, @@ -1325,7 +1317,7 @@ xfs_create( xfs_qm_dqrele(pdqp); if (unlock_dp_on_error) - xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_iunlock(dp, XFS_ILOCK_EXCL); return error; } @@ -1466,11 +1458,10 @@ xfs_link( if (error) goto std_return; - xfs_ilock(tdp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(sip, tdp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, sip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, tdp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, tdp, XFS_ILOCK_EXCL); /* * If we are using project inheritance, we only allow hard link @@ -2041,7 +2032,6 @@ xfs_iunlink( agi->agi_unlinked[bucket_index] = cpu_to_be32(agino); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); return 0; @@ -2133,7 +2123,6 @@ xfs_iunlink_remove( agi->agi_unlinked[bucket_index] = cpu_to_be32(next_agino); offset = offsetof(xfs_agi_t, agi_unlinked) + (sizeof(xfs_agino_t) * bucket_index); - xfs_trans_buf_set_type(tp, agibp, XFS_BLFT_AGI_BUF); xfs_trans_log_buf(tp, agibp, offset, (offset + sizeof(xfs_agino_t) - 1)); } else { @@ -2579,10 +2568,9 @@ xfs_remove( goto std_return; } - xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); xfs_lock_two_inodes(dp, ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); /* @@ -2963,12 +2951,6 @@ xfs_rename( * whether the target directory is the same as the source * directory, we can lock from 2 to 4 inodes. */ - if (!new_parent) - xfs_ilock(src_dp, XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); - else - xfs_lock_two_inodes(src_dp, target_dp, - XFS_IOLOCK_EXCL | XFS_IOLOCK_PARENT); - xfs_lock_inodes(inodes, num_inodes, XFS_ILOCK_EXCL); /* @@ -2976,9 +2958,9 @@ xfs_rename( * we can rely on either trans_commit or trans_cancel to unlock * them. 
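 * (The directory IOLOCK is gone from this path: the VFS already holds
 * i_rwsem on the parents during rename, so only the ILOCK is joined
 * below.)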
*/ - xfs_trans_ijoin(tp, src_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, src_dp, XFS_ILOCK_EXCL); if (new_parent) - xfs_trans_ijoin(tp, target_dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, target_dp, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, src_ip, XFS_ILOCK_EXCL); if (target_ip) xfs_trans_ijoin(tp, target_ip, XFS_ILOCK_EXCL); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index f14c1de2549d..10dcf27b4c85 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -56,7 +56,6 @@ typedef struct xfs_inode { /* Transaction and locking information. */ struct xfs_inode_log_item *i_itemp; /* logging information */ mrlock_t i_lock; /* inode lock */ - mrlock_t i_iolock; /* inode IO lock */ mrlock_t i_mmaplock; /* inode mmap IO lock */ atomic_t i_pincount; /* inode pin count */ spinlock_t i_flags_lock; /* inode i_flags lock */ @@ -246,6 +245,11 @@ static inline bool xfs_is_reflink_inode(struct xfs_inode *ip) * Synchronize processes attempting to flush the in-core inode back to disk. */ +static inline int xfs_isiflocked(struct xfs_inode *ip) +{ + return xfs_iflags_test(ip, XFS_IFLOCK); +} + extern void __xfs_iflock(struct xfs_inode *ip); static inline int xfs_iflock_nowait(struct xfs_inode *ip) @@ -261,16 +265,12 @@ static inline void xfs_iflock(struct xfs_inode *ip) static inline void xfs_ifunlock(struct xfs_inode *ip) { + ASSERT(xfs_isiflocked(ip)); xfs_iflags_clear(ip, XFS_IFLOCK); smp_mb(); wake_up_bit(&ip->i_flags, __XFS_IFLOCK_BIT); } -static inline int xfs_isiflocked(struct xfs_inode *ip) -{ - return xfs_iflags_test(ip, XFS_IFLOCK); -} - /* * Flags for inode locking. * Bit ranges: 1<<1 - 1<<16-1 -- iolock/ilock modes (bitfield) @@ -332,7 +332,7 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) * IOLOCK values * * 0-3 subclass value - * 4-7 PARENT subclass values + * 4-7 unused * * MMAPLOCK values * @@ -347,10 +347,8 @@ static inline int xfs_isiflocked(struct xfs_inode *ip) * */ #define XFS_IOLOCK_SHIFT 16 -#define XFS_IOLOCK_PARENT_VAL 4 -#define XFS_IOLOCK_MAX_SUBCLASS (XFS_IOLOCK_PARENT_VAL - 1) +#define XFS_IOLOCK_MAX_SUBCLASS 3 #define XFS_IOLOCK_DEP_MASK 0x000f0000 -#define XFS_IOLOCK_PARENT (XFS_IOLOCK_PARENT_VAL << XFS_IOLOCK_SHIFT) #define XFS_MMAPLOCK_SHIFT 20 #define XFS_MMAPLOCK_NUMORDER 0 diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 9610e9c00952..d90e7811ccdd 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -164,7 +164,7 @@ xfs_inode_item_format_data_fork( struct xfs_bmbt_rec *p; ASSERT(ip->i_df.if_u1.if_extents != NULL); - ASSERT(ip->i_df.if_bytes / sizeof(xfs_bmbt_rec_t) > 0); + ASSERT(xfs_iext_count(&ip->i_df) > 0); p = xlog_prepare_iovec(lv, vecp, XLOG_REG_TYPE_IEXT); data_bytes = xfs_iextents_copy(ip, p, XFS_DATA_FORK); @@ -261,7 +261,7 @@ xfs_inode_item_format_attr_fork( ip->i_afp->if_bytes > 0) { struct xfs_bmbt_rec *p; - ASSERT(ip->i_afp->if_bytes / sizeof(xfs_bmbt_rec_t) == + ASSERT(xfs_iext_count(ip->i_afp) == ip->i_d.di_anextents); ASSERT(ip->i_afp->if_u1.if_extents != NULL); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index c245bed3249b..c67cfb451fd3 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -287,7 +287,7 @@ xfs_readlink_by_handle( return PTR_ERR(dentry); /* Restrict this handle operation to symlinks only. 
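 * (With generic_readlink now the VFS default, ->readlink can be NULL
 * even for a symlink, so the check below switches to d_is_symlink()
 * and the call itself goes through vfs_readlink(); the .readlink
 * removals in the symlink inode_operations hunks further down are the
 * same cleanup.)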
*/ - if (!d_inode(dentry)->i_op->readlink) { + if (!d_is_symlink(dentry)) { error = -EINVAL; goto out_dput; } @@ -297,7 +297,7 @@ xfs_readlink_by_handle( goto out_dput; } - error = d_inode(dentry)->i_op->readlink(dentry, hreq->ohandle, olen); + error = vfs_readlink(dentry, hreq->ohandle, olen); out_dput: dput(dentry); @@ -639,7 +639,7 @@ xfs_ioc_space( return error; xfs_ilock(ip, iolock); - error = xfs_break_layouts(inode, &iolock, false); + error = xfs_break_layouts(inode, &iolock); if (error) goto out_unlock; @@ -910,16 +910,14 @@ xfs_ioc_fsgetxattr( if (attr) { if (ip->i_afp) { if (ip->i_afp->if_flags & XFS_IFEXTENTS) - fa.fsx_nextents = ip->i_afp->if_bytes / - sizeof(xfs_bmbt_rec_t); + fa.fsx_nextents = xfs_iext_count(ip->i_afp); else fa.fsx_nextents = ip->i_d.di_anextents; } else fa.fsx_nextents = 0; } else { if (ip->i_df.if_flags & XFS_IFEXTENTS) - fa.fsx_nextents = ip->i_df.if_bytes / - sizeof(xfs_bmbt_rec_t); + fa.fsx_nextents = xfs_iext_count(&ip->i_df); else fa.fsx_nextents = ip->i_d.di_nextents; } diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index 321f57721b92..7c49938c5aed 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -19,7 +19,7 @@ #include <linux/ioctl.h> #include <linux/mount.h> #include <linux/slab.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include "xfs.h" #include "xfs_fs.h" #include "xfs_format.h" diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index 436e109bb01e..0d147428971e 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -395,11 +395,12 @@ xfs_iomap_prealloc_size( struct xfs_inode *ip, loff_t offset, loff_t count, - xfs_extnum_t idx, - struct xfs_bmbt_irec *prev) + xfs_extnum_t idx) { struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK); xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + struct xfs_bmbt_irec prev; int shift = 0; int64_t freesp; xfs_fsblock_t qblocks; @@ -419,8 +420,8 @@ xfs_iomap_prealloc_size( */ if ((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) || XFS_ISIZE(ip) < XFS_FSB_TO_B(mp, mp->m_dalign) || - idx == 0 || - prev->br_startoff + prev->br_blockcount < offset_fsb) + !xfs_iext_get_extent(ifp, idx - 1, &prev) || + prev.br_startoff + prev.br_blockcount < offset_fsb) return mp->m_writeio_blocks; /* @@ -439,8 +440,8 @@ xfs_iomap_prealloc_size( * always extends to MAXEXTLEN rather than falling short due to things * like stripe unit/width alignment of real extents. */ - if (prev->br_blockcount <= (MAXEXTLEN >> 1)) - alloc_blocks = prev->br_blockcount << 1; + if (prev.br_blockcount <= (MAXEXTLEN >> 1)) + alloc_blocks = prev.br_blockcount << 1; else alloc_blocks = XFS_B_TO_FSB(mp, offset); if (!alloc_blocks) @@ -535,11 +536,11 @@ xfs_file_iomap_begin_delay( xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t maxbytes_fsb = XFS_B_TO_FSB(mp, mp->m_super->s_maxbytes); - xfs_fileoff_t end_fsb, orig_end_fsb; + xfs_fileoff_t end_fsb; int error = 0, eof = 0; struct xfs_bmbt_irec got; - struct xfs_bmbt_irec prev; xfs_extnum_t idx; + xfs_fsblock_t prealloc_blocks = 0; ASSERT(!XFS_IS_REALTIME_INODE(ip)); ASSERT(!xfs_get_extsz_hint(ip)); @@ -563,8 +564,7 @@ xfs_file_iomap_begin_delay( goto out_unlock; } - xfs_bmap_search_extents(ip, offset_fsb, XFS_DATA_FORK, &eof, &idx, - &got, &prev); + eof = !xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got); if (!eof && got.br_startoff <= offset_fsb) { if (xfs_is_reflink_inode(ip)) { bool shared; @@ -595,35 +595,32 @@ xfs_file_iomap_begin_delay( * the lower level functions are updated. 
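 * (1024 * PAGE_SIZE is 4 MiB with 4k pages.)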
*/ count = min_t(loff_t, count, 1024 * PAGE_SIZE); - end_fsb = orig_end_fsb = - min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); + end_fsb = min(XFS_B_TO_FSB(mp, offset + count), maxbytes_fsb); if (eof) { - xfs_fsblock_t prealloc_blocks; - - prealloc_blocks = - xfs_iomap_prealloc_size(ip, offset, count, idx, &prev); + prealloc_blocks = xfs_iomap_prealloc_size(ip, offset, count, idx); if (prealloc_blocks) { xfs_extlen_t align; xfs_off_t end_offset; + xfs_fileoff_t p_end_fsb; end_offset = XFS_WRITEIO_ALIGN(mp, offset + count - 1); - end_fsb = XFS_B_TO_FSBT(mp, end_offset) + - prealloc_blocks; + p_end_fsb = XFS_B_TO_FSBT(mp, end_offset) + + prealloc_blocks; align = xfs_eof_alignment(ip, 0); if (align) - end_fsb = roundup_64(end_fsb, align); + p_end_fsb = roundup_64(p_end_fsb, align); - end_fsb = min(end_fsb, maxbytes_fsb); - ASSERT(end_fsb > offset_fsb); + p_end_fsb = min(p_end_fsb, maxbytes_fsb); + ASSERT(p_end_fsb > offset_fsb); + prealloc_blocks = p_end_fsb - end_fsb; } } retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb, - end_fsb - offset_fsb, &got, - &prev, &idx, eof); + end_fsb - offset_fsb, prealloc_blocks, &got, &idx, eof); switch (error) { case 0: break; @@ -631,8 +628,8 @@ retry: case -EDQUOT: /* retry without any preallocation */ trace_xfs_delalloc_enospc(ip, offset, count); - if (end_fsb != orig_end_fsb) { - end_fsb = orig_end_fsb; + if (prealloc_blocks) { + prealloc_blocks = 0; goto retry; } /*FALLTHRU*/ @@ -640,13 +637,6 @@ retry: goto out_unlock; } - /* - * Tag the inode as speculatively preallocated so we can reclaim this - * space on demand, if necessary. - */ - if (end_fsb != orig_end_fsb) - xfs_inode_set_eofblocks_tag(ip); - trace_xfs_iomap_alloc(ip, offset, count, 0, &got); done: if (isnullstartblock(got.br_startblock)) @@ -960,6 +950,19 @@ static inline bool imap_needs_alloc(struct inode *inode, (IS_DAX(inode) && ISUNWRITTEN(imap)); } +static inline bool need_excl_ilock(struct xfs_inode *ip, unsigned flags) +{ + /* + * COW writes will allocate delalloc space, so we need to make sure + * to take the lock exclusively here. + */ + if (xfs_is_reflink_inode(ip) && (flags & (IOMAP_WRITE | IOMAP_ZERO))) + return true; + if ((flags & IOMAP_DIRECT) && (flags & IOMAP_WRITE)) + return true; + return false; +} + static int xfs_file_iomap_begin( struct inode *inode, @@ -979,18 +982,14 @@ xfs_file_iomap_begin( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - if ((flags & IOMAP_WRITE) && !IS_DAX(inode) && - !xfs_get_extsz_hint(ip)) { + if (((flags & (IOMAP_WRITE | IOMAP_DIRECT)) == IOMAP_WRITE) && + !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) { /* Reserve delalloc blocks for regular writeback. */ return xfs_file_iomap_begin_delay(inode, offset, length, flags, iomap); } - /* - * COW writes will allocate delalloc space, so we need to make sure - * to take the lock exclusively here. 
- */ - if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { + if (need_excl_ilock(ip, flags)) { lockmode = XFS_ILOCK_EXCL; xfs_ilock(ip, XFS_ILOCK_EXCL); } else { @@ -1003,17 +1002,41 @@ xfs_file_iomap_begin( offset_fsb = XFS_B_TO_FSBT(mp, offset); end_fsb = XFS_B_TO_FSB(mp, offset + length); + if (xfs_is_reflink_inode(ip) && + (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT)) { + shared = xfs_reflink_find_cow_mapping(ip, offset, &imap); + if (shared) { + xfs_iunlock(ip, lockmode); + goto alloc_done; + } + ASSERT(!isnullstartblock(imap.br_startblock)); + } + error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap, &nimaps, 0); if (error) goto out_unlock; - if (flags & IOMAP_REPORT) { + if ((flags & IOMAP_REPORT) || + (xfs_is_reflink_inode(ip) && + (flags & IOMAP_WRITE) && (flags & IOMAP_DIRECT))) { /* Trim the mapping to the nearest shared extent boundary. */ error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed); if (error) goto out_unlock; + + /* + * We're here because we're trying to do a directio write to a + * region that isn't aligned to a filesystem block. If the + * extent is shared, fall back to buffered mode to handle the + * RMW. + */ + if (!(flags & IOMAP_REPORT) && shared) { + trace_xfs_reflink_bounce_dio_write(ip, &imap); + error = -EREMCHG; + goto out_unlock; + } } if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) { @@ -1048,6 +1071,7 @@ xfs_file_iomap_begin( if (error) return error; +alloc_done: iomap->flags = IOMAP_F_NEW; trace_xfs_iomap_alloc(ip, offset, length, 0, &imap); } else { diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 405a65cd9d6b..22c16155f1b4 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -98,12 +98,27 @@ xfs_init_security( static void xfs_dentry_to_name( struct xfs_name *namep, + struct dentry *dentry) +{ + namep->name = dentry->d_name.name; + namep->len = dentry->d_name.len; + namep->type = XFS_DIR3_FT_UNKNOWN; +} + +static int +xfs_dentry_mode_to_name( + struct xfs_name *namep, struct dentry *dentry, int mode) { namep->name = dentry->d_name.name; namep->len = dentry->d_name.len; - namep->type = xfs_mode_to_ftype[(mode & S_IFMT) >> S_SHIFT]; + namep->type = xfs_mode_to_ftype(mode); + + if (unlikely(namep->type == XFS_DIR3_FT_UNKNOWN)) + return -EFSCORRUPTED; + + return 0; } STATIC void @@ -119,7 +134,7 @@ xfs_cleanup_inode( * xfs_init_security we must back out. * ENOSPC can hit here, among other things. 
*/ - xfs_dentry_to_name(&teardown, dentry, 0); + xfs_dentry_to_name(&teardown, dentry); xfs_remove(XFS_I(dir), &teardown, XFS_I(inode)); } @@ -154,8 +169,12 @@ xfs_generic_create( if (error) return error; + /* Verify mode is valid also for tmpfile case */ + error = xfs_dentry_mode_to_name(&name, dentry, mode); + if (unlikely(error)) + goto out_free_acl; + if (!tmpfile) { - xfs_dentry_to_name(&name, dentry, mode); error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip); } else { error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip); @@ -248,7 +267,7 @@ xfs_vn_lookup( if (dentry->d_name.len >= MAXNAMELEN) return ERR_PTR(-ENAMETOOLONG); - xfs_dentry_to_name(&name, dentry, 0); + xfs_dentry_to_name(&name, dentry); error = xfs_lookup(XFS_I(dir), &name, &cip, NULL); if (unlikely(error)) { if (unlikely(error != -ENOENT)) @@ -275,7 +294,7 @@ xfs_vn_ci_lookup( if (dentry->d_name.len >= MAXNAMELEN) return ERR_PTR(-ENAMETOOLONG); - xfs_dentry_to_name(&xname, dentry, 0); + xfs_dentry_to_name(&xname, dentry); error = xfs_lookup(XFS_I(dir), &xname, &ip, &ci_name); if (unlikely(error)) { if (unlikely(error != -ENOENT)) @@ -310,7 +329,9 @@ xfs_vn_link( struct xfs_name name; int error; - xfs_dentry_to_name(&name, dentry, inode->i_mode); + error = xfs_dentry_mode_to_name(&name, dentry, inode->i_mode); + if (unlikely(error)) + return error; error = xfs_link(XFS_I(dir), XFS_I(inode), &name); if (unlikely(error)) @@ -329,7 +350,7 @@ xfs_vn_unlink( struct xfs_name name; int error; - xfs_dentry_to_name(&name, dentry, 0); + xfs_dentry_to_name(&name, dentry); error = xfs_remove(XFS_I(dir), &name, XFS_I(d_inode(dentry))); if (error) @@ -359,7 +380,9 @@ xfs_vn_symlink( mode = S_IFLNK | (irix_symlink_mode ? 0777 & ~current_umask() : S_IRWXUGO); - xfs_dentry_to_name(&name, dentry, mode); + error = xfs_dentry_mode_to_name(&name, dentry, mode); + if (unlikely(error)) + goto out; error = xfs_symlink(XFS_I(dir), &name, symname, mode, &cip); if (unlikely(error)) @@ -395,6 +418,7 @@ xfs_vn_rename( { struct inode *new_inode = d_inode(ndentry); int omode = 0; + int error; struct xfs_name oname; struct xfs_name nname; @@ -405,8 +429,14 @@ xfs_vn_rename( if (flags & RENAME_EXCHANGE) omode = d_inode(ndentry)->i_mode; - xfs_dentry_to_name(&oname, odentry, omode); - xfs_dentry_to_name(&nname, ndentry, d_inode(odentry)->i_mode); + error = xfs_dentry_mode_to_name(&oname, odentry, omode); + if (omode && unlikely(error)) + return error; + + error = xfs_dentry_mode_to_name(&nname, ndentry, + d_inode(odentry)->i_mode); + if (unlikely(error)) + return error; return xfs_rename(XFS_I(odir), &oname, XFS_I(d_inode(odentry)), XFS_I(ndir), &nname, @@ -983,15 +1013,13 @@ xfs_vn_setattr( struct xfs_inode *ip = XFS_I(d_inode(dentry)); uint iolock = XFS_IOLOCK_EXCL; - xfs_ilock(ip, iolock); - error = xfs_break_layouts(d_inode(dentry), &iolock, true); - if (!error) { - xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - iolock |= XFS_MMAPLOCK_EXCL; + error = xfs_break_layouts(d_inode(dentry), &iolock); + if (error) + return error; - error = xfs_vn_setattr_size(dentry, iattr); - } - xfs_iunlock(ip, iolock); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + error = xfs_vn_setattr_size(dentry, iattr); + xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); } else { error = xfs_vn_setattr_nonsize(dentry, iattr); } @@ -1122,7 +1150,6 @@ static const struct inode_operations xfs_dir_ci_inode_operations = { }; static const struct inode_operations xfs_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = xfs_vn_get_link, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, @@ 
-1131,7 +1158,6 @@ static const struct inode_operations xfs_symlink_inode_operations = { }; static const struct inode_operations xfs_inline_symlink_inode_operations = { - .readlink = generic_readlink, .get_link = xfs_vn_get_link_inline, .getattr = xfs_vn_getattr, .setattr = xfs_vn_setattr, diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index 68640fb63a54..7a989de224f4 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -78,11 +78,12 @@ typedef __u32 xfs_nlink_t; #include <linux/freezer.h> #include <linux/list_sort.h> #include <linux/ratelimit.h> +#include <linux/rhashtable.h> #include <asm/page.h> #include <asm/div64.h> #include <asm/param.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/byteorder.h> #include <asm/unaligned.h> @@ -330,11 +331,11 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) } #define ASSERT_ALWAYS(expr) \ - (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) #ifdef DEBUG #define ASSERT(expr) \ - (unlikely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) + (likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__)) #ifndef STATIC # define STATIC noinline @@ -345,7 +346,7 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) #ifdef XFS_WARN #define ASSERT(expr) \ - (unlikely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__)) + (likely(expr) ? (void)0 : asswarn(#expr, __FILE__, __LINE__)) #ifndef STATIC # define STATIC static noinline diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3b74fa011bb1..b1469f0a91a6 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1668,7 +1668,7 @@ xlog_cksum( __uint32_t crc; /* first generate the crc for the record header ... */ - crc = xfs_start_cksum((char *)rhead, + crc = xfs_start_cksum_update((char *)rhead, sizeof(struct xlog_rec_header), offsetof(struct xlog_rec_header, h_crc)); @@ -1862,26 +1862,21 @@ xlog_sync( bp->b_io_length = BTOBB(count); bp->b_fspriv = iclog; - bp->b_flags &= ~(XBF_FUA | XBF_FLUSH); - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE); + bp->b_flags &= ~XBF_FLUSH; + bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) { - bp->b_flags |= XBF_FUA; - - /* - * Flush the data device before flushing the log to make - * sure all meta data written back from the AIL actually made - * it to disk before stamping the new log tail LSN into the - * log buffer. For an external log we need to issue the - * flush explicitly, and unfortunately synchronously here; - * for an internal log we can simply use the block layer - * state machine for preflushes. - */ - if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) - xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); - else - bp->b_flags |= XBF_FLUSH; - } + /* + * Flush the data device before flushing the log to make sure all meta + * data written back from the AIL actually made it to disk before + * stamping the new log tail LSN into the log buffer. For an external + * log we need to issue the flush explicitly, and unfortunately + * synchronously here; for an internal log we can simply use the block + * layer state machine for preflushes. 
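+ *
+ * (XBF_FUA is now set unconditionally a few lines up; the block layer
+ * is expected to discard FUA and preflush requests for devices without
+ * a volatile write cache, so this should cost nothing where it is not
+ * needed.)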
+ */ + if (log->l_mp->m_logdev_targp != log->l_mp->m_ddev_targp) + xfs_blkdev_issue_flush(log->l_mp->m_ddev_targp); + else + bp->b_flags |= XBF_FLUSH; ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); @@ -1906,10 +1901,8 @@ xlog_sync( xfs_buf_associate_memory(bp, (char *)&iclog->ic_header + count, split); bp->b_fspriv = iclog; - bp->b_flags &= ~(XBF_FUA | XBF_FLUSH); - bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE); - if (log->l_mp->m_flags & XFS_MOUNT_BARRIER) - bp->b_flags |= XBF_FUA; + bp->b_flags &= ~XBF_FLUSH; + bp->b_flags |= (XBF_ASYNC | XBF_SYNCIO | XBF_WRITE | XBF_FUA); ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); @@ -3324,12 +3317,8 @@ xfs_log_force( xfs_mount_t *mp, uint flags) { - int error; - trace_xfs_log_force(mp, 0, _RET_IP_); - error = _xfs_log_force(mp, flags, NULL); - if (error) - xfs_warn(mp, "%s: error %d returned.", __func__, error); + _xfs_log_force(mp, flags, NULL); } /* @@ -3473,12 +3462,8 @@ xfs_log_force_lsn( xfs_lsn_t lsn, uint flags) { - int error; - trace_xfs_log_force(mp, lsn, _RET_IP_); - error = _xfs_log_force_lsn(mp, lsn, flags, NULL); - if (error) - xfs_warn(mp, "%s: error %d returned.", __func__, error); + _xfs_log_force_lsn(mp, lsn, flags, NULL); } /* diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 9b3d7c76915d..4a98762ec8b4 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2025,7 +2025,7 @@ xlog_peek_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len, - ushort flags) + unsigned short flags) { struct list_head *bucket; struct xfs_buf_cancel *bcp; @@ -2065,7 +2065,7 @@ xlog_check_buffer_cancelled( struct xlog *log, xfs_daddr_t blkno, uint len, - ushort flags) + unsigned short flags) { struct xfs_buf_cancel *bcp; @@ -5113,19 +5113,21 @@ xlog_recover_process( struct list_head *buffer_list) { int error; + __le32 old_crc = rhead->h_crc; __le32 crc; + crc = xlog_cksum(log, rhead, dp, be32_to_cpu(rhead->h_len)); /* * Nothing else to do if this is a CRC verification pass. Just return * if this a record with a non-zero crc. Unfortunately, mkfs always - * sets h_crc to 0 so we must consider this valid even on v5 supers. + * sets old_crc to 0 so we must consider this valid even on v5 supers. * Otherwise, return EFSBADCRC on failure so the callers up the stack * know precisely what failed. */ if (pass == XLOG_RECOVER_CRCPASS) { - if (rhead->h_crc && crc != rhead->h_crc) + if (old_crc && crc != old_crc) return -EFSBADCRC; return 0; } @@ -5136,11 +5138,11 @@ xlog_recover_process( * zero CRC check prevents warnings from being emitted when upgrading * the kernel from one that does not add CRCs by default. 
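 * (rhead->h_crc is stashed in old_crc up front because xlog_cksum(),
 * via xfs_start_cksum_update(), may modify the header in place.)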
*/ - if (crc != rhead->h_crc) { - if (rhead->h_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { + if (crc != old_crc) { + if (old_crc || xfs_sb_version_hascrc(&log->l_mp->m_sb)) { xfs_alert(log->l_mp, "log record CRC mismatch: found 0x%x, expected 0x%x.", - le32_to_cpu(rhead->h_crc), + le32_to_cpu(old_crc), le32_to_cpu(crc)); xfs_hex_dump(dp, 32); } diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index b341f10cf481..9b9540db17a6 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -157,6 +157,7 @@ xfs_free_perag( spin_unlock(&mp->m_perag_lock); ASSERT(pag); ASSERT(atomic_read(&pag->pag_ref) == 0); + xfs_buf_hash_destroy(pag); call_rcu(&pag->rcu_head, __xfs_free_perag); } } @@ -212,8 +213,8 @@ xfs_initialize_perag( spin_lock_init(&pag->pag_ici_lock); mutex_init(&pag->pag_ici_reclaim_lock); INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC); - spin_lock_init(&pag->pag_buf_lock); - pag->pag_buf_tree = RB_ROOT; + if (xfs_buf_hash_init(pag)) + goto out_unwind; if (radix_tree_preload(GFP_NOFS)) goto out_unwind; @@ -239,9 +240,11 @@ xfs_initialize_perag( return 0; out_unwind: + xfs_buf_hash_destroy(pag); kmem_free(pag); for (; index > first_initialised; index--) { pag = radix_tree_delete(&mp->m_perag_tree, index); + xfs_buf_hash_destroy(pag); kmem_free(pag); } return error; diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index 819b80b15bfb..84f785218907 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -393,8 +393,8 @@ typedef struct xfs_perag { unsigned long pag_ici_reclaim_cursor; /* reclaim restart point */ /* buffer cache index */ - spinlock_t pag_buf_lock; /* lock for pag_buf_tree */ - struct rb_root pag_buf_tree; /* ordered tree of active buffers */ + spinlock_t pag_buf_lock; /* lock for pag_buf_hash */ + struct rhashtable pag_buf_hash; /* for rcu-safe freeing */ struct rcu_head rcu_head; @@ -424,6 +424,9 @@ xfs_perag_resv( } } +int xfs_buf_hash_init(xfs_perag_t *pag); +void xfs_buf_hash_destroy(xfs_perag_t *pag); + extern void xfs_uuid_table_free(void); extern int xfs_log_sbcount(xfs_mount_t *); extern __uint64_t xfs_default_resblks(xfs_mount_t *mp); diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 93a7aafa56d6..2f2dc3c09ad0 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -32,8 +32,7 @@ int xfs_break_layouts( struct inode *inode, - uint *iolock, - bool with_imutex) + uint *iolock) { struct xfs_inode *ip = XFS_I(inode); int error; @@ -42,12 +41,8 @@ xfs_break_layouts( while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); - if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) - inode_unlock(inode); error = break_layout(inode, true); *iolock = XFS_IOLOCK_EXCL; - if (with_imutex) - inode_lock(inode); xfs_ilock(ip, *iolock); } diff --git a/fs/xfs/xfs_pnfs.h b/fs/xfs/xfs_pnfs.h index e8339f74966b..b587cb99b2b7 100644 --- a/fs/xfs/xfs_pnfs.h +++ b/fs/xfs/xfs_pnfs.h @@ -8,10 +8,10 @@ int xfs_fs_map_blocks(struct inode *inode, loff_t offset, u64 length, int xfs_fs_commit_blocks(struct inode *inode, struct iomap *maps, int nr_maps, struct iattr *iattr); -int xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex); +int xfs_break_layouts(struct inode *inode, uint *iolock); #else static inline int -xfs_break_layouts(struct inode *inode, uint *iolock, bool with_imutex) +xfs_break_layouts(struct inode *inode, uint *iolock) { return 0; } diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index a60d9e2739d1..45e50ea90769 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1135,7 +1135,7 @@ xfs_qm_get_rtblks( return 
error; } rtblks = 0; - nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t); + nextents = xfs_iext_count(ifp); for (idx = 0; idx < nextents; idx++) rtblks += xfs_bmbt_get_blockcount(xfs_iext_get_ext(ifp, idx)); *O_rtblks = (xfs_qcnt_t)rtblks; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index fe86a668a57e..6e4c7446c3d4 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -526,13 +526,14 @@ xfs_cui_recover( xfs_refcount_finish_one_cleanup(tp, rcur, error); error = xfs_defer_finish(&tp, &dfops, NULL); if (error) - goto abort_error; + goto abort_defer; set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags); error = xfs_trans_commit(tp); return error; abort_error: xfs_refcount_finish_one_cleanup(tp, rcur, error); +abort_defer: xfs_defer_cancel(&dfops); xfs_trans_cancel(tp); return error; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index a279b4e7f5fe..07593a362cd0 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -243,12 +243,11 @@ xfs_reflink_reserve_cow( struct xfs_bmbt_irec *imap, bool *shared) { - struct xfs_bmbt_irec got, prev; - xfs_fileoff_t end_fsb, orig_end_fsb; - int eof = 0, error = 0; - bool trimmed; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got; + int error = 0; + bool eof = false, trimmed; xfs_extnum_t idx; - xfs_extlen_t align; /* * Search the COW fork extent list first. This serves two purposes: @@ -258,8 +257,9 @@ xfs_reflink_reserve_cow( * extent list is generally faster than going out to the shared extent * tree. */ - xfs_bmap_search_extents(ip, imap->br_startoff, XFS_COW_FORK, &eof, &idx, - &got, &prev); + + if (!xfs_iext_lookup_extent(ip, ifp, imap->br_startoff, &idx, &got)) + eof = true; if (!eof && got.br_startoff <= imap->br_startoff) { trace_xfs_reflink_cow_found(ip, imap); xfs_trim_extent(imap, got.br_startoff, got.br_blockcount); @@ -285,33 +285,12 @@ xfs_reflink_reserve_cow( if (error) return error; - end_fsb = orig_end_fsb = imap->br_startoff + imap->br_blockcount; - - align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip)); - if (align) - end_fsb = roundup_64(end_fsb, align); - -retry: error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, imap->br_startoff, - end_fsb - imap->br_startoff, &got, &prev, &idx, eof); - switch (error) { - case 0: - break; - case -ENOSPC: - case -EDQUOT: - /* retry without any preallocation */ + imap->br_blockcount, 0, &got, &idx, eof); + if (error == -ENOSPC || error == -EDQUOT) trace_xfs_reflink_cow_enospc(ip, imap); - if (end_fsb != orig_end_fsb) { - end_fsb = orig_end_fsb; - goto retry; - } - /*FALLTHRU*/ - default: + if (error) return error; - } - - if (end_fsb != orig_end_fsb) - xfs_inode_set_cowblocks_tag(ip); trace_xfs_reflink_cow_alloc(ip, &got); return 0; @@ -418,87 +397,65 @@ xfs_reflink_allocate_cow_range( } /* - * Find the CoW reservation (and whether or not it needs block allocation) - * for a given byte offset of a file. + * Find the CoW reservation for a given byte offset of a file. */ bool xfs_reflink_find_cow_mapping( struct xfs_inode *ip, xfs_off_t offset, - struct xfs_bmbt_irec *imap, - bool *need_alloc) + struct xfs_bmbt_irec *imap) { - struct xfs_bmbt_irec irec; - struct xfs_ifork *ifp; - struct xfs_bmbt_rec_host *gotp; - xfs_fileoff_t bno; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + xfs_fileoff_t offset_fsb; + struct xfs_bmbt_irec got; xfs_extnum_t idx; ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED)); ASSERT(xfs_is_reflink_inode(ip)); - /* Find the extent in the CoW fork. 
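 * (The need_alloc out-parameter is gone: the new direct IO path in
 * xfs_file_iomap_begin() above expects any mapping found here to be
 * allocated already, hence its ASSERT on isnullstartblock().)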
*/ - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - bno = XFS_B_TO_FSBT(ip->i_mount, offset); - gotp = xfs_iext_bno_to_ext(ifp, bno, &idx); - if (!gotp) + offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset); + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) return false; - - xfs_bmbt_get_all(gotp, &irec); - if (bno >= irec.br_startoff + irec.br_blockcount || - bno < irec.br_startoff) + if (got.br_startoff > offset_fsb) return false; trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE, - &irec); - - /* If it's still delalloc, we must allocate later. */ - *imap = irec; - *need_alloc = !!(isnullstartblock(irec.br_startblock)); - + &got); + *imap = got; return true; } /* * Trim an extent to end at the next CoW reservation past offset_fsb. */ -int +void xfs_reflink_trim_irec_to_next_cow( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap) { - struct xfs_bmbt_irec irec; - struct xfs_ifork *ifp; - struct xfs_bmbt_rec_host *gotp; + struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); + struct xfs_bmbt_irec got; xfs_extnum_t idx; if (!xfs_is_reflink_inode(ip)) - return 0; + return; /* Find the extent in the CoW fork. */ - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx); - if (!gotp) - return 0; - xfs_bmbt_get_all(gotp, &irec); + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) + return; /* This is the extent before; try sliding up one. */ - if (irec.br_startoff < offset_fsb) { - idx++; - if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - return 0; - gotp = xfs_iext_get_ext(ifp, idx); - xfs_bmbt_get_all(gotp, &irec); + if (got.br_startoff < offset_fsb) { + if (!xfs_iext_get_extent(ifp, idx + 1, &got)) + return; } - if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount) - return 0; + if (got.br_startoff >= imap->br_startoff + imap->br_blockcount) + return; - imap->br_blockcount = irec.br_startoff - imap->br_startoff; + imap->br_blockcount = got.br_startoff - imap->br_startoff; trace_xfs_reflink_trim_irec(ip, imap); - - return 0; } /* @@ -512,18 +469,15 @@ xfs_reflink_cancel_cow_blocks( xfs_fileoff_t end_fsb) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got, prev, del; + struct xfs_bmbt_irec got, del; xfs_extnum_t idx; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; - int error = 0, eof = 0; + int error = 0; if (!xfs_is_reflink_inode(ip)) return 0; - - xfs_bmap_search_extents(ip, offset_fsb, XFS_COW_FORK, &eof, &idx, - &got, &prev); - if (eof) + if (!xfs_iext_lookup_extent(ip, ifp, offset_fsb, &idx, &got)) return 0; while (got.br_startoff < end_fsb) { @@ -566,9 +520,8 @@ xfs_reflink_cancel_cow_blocks( xfs_bmap_del_extent_cow(ip, &idx, &got, &del); } - if (++idx >= ifp->if_bytes / sizeof(struct xfs_bmbt_rec)) + if (!xfs_iext_get_extent(ifp, ++idx, &got)) break; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); } /* clear tag if cow fork is emptied */ @@ -638,13 +591,13 @@ xfs_reflink_end_cow( xfs_off_t count) { struct xfs_ifork *ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - struct xfs_bmbt_irec got, prev, del; + struct xfs_bmbt_irec got, del; struct xfs_trans *tp; xfs_fileoff_t offset_fsb; xfs_fileoff_t end_fsb; xfs_fsblock_t firstfsb; struct xfs_defer_ops dfops; - int error, eof = 0; + int error; unsigned int resblks; xfs_filblks_t rlen; xfs_extnum_t idx; @@ -668,13 +621,11 @@ xfs_reflink_end_cow( xfs_ilock(ip, XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, ip, 0); - xfs_bmap_search_extents(ip, end_fsb - 1, XFS_COW_FORK, &eof, &idx, - &got, &prev); - /* If 
there is a hole at end_fsb - 1 go to the previous extent */ - if (eof || got.br_startoff > end_fsb) { + if (!xfs_iext_lookup_extent(ip, ifp, end_fsb - 1, &idx, &got) || + got.br_startoff > end_fsb) { ASSERT(idx > 0); - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, --idx), &got); + xfs_iext_get_extent(ifp, --idx, &got); } /* Walk backwards until we're out of the I/O range... */ @@ -722,11 +673,9 @@ xfs_reflink_end_cow( error = xfs_defer_finish(&tp, &dfops, ip); if (error) goto out_defer; - next_extent: - if (idx < 0) + if (!xfs_iext_get_extent(ifp, idx, &got)) break; - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &got); } error = xfs_trans_commit(tp); @@ -1165,111 +1114,6 @@ err: } /* - * Read a page's worth of file data into the page cache. Return the page - * locked. - */ -static struct page * -xfs_get_page( - struct inode *inode, - xfs_off_t offset) -{ - struct address_space *mapping; - struct page *page; - pgoff_t n; - - n = offset >> PAGE_SHIFT; - mapping = inode->i_mapping; - page = read_mapping_page(mapping, n, NULL); - if (IS_ERR(page)) - return page; - if (!PageUptodate(page)) { - put_page(page); - return ERR_PTR(-EIO); - } - lock_page(page); - return page; -} - -/* - * Compare extents of two files to see if they are the same. - */ -static int -xfs_compare_extents( - struct inode *src, - xfs_off_t srcoff, - struct inode *dest, - xfs_off_t destoff, - xfs_off_t len, - bool *is_same) -{ - xfs_off_t src_poff; - xfs_off_t dest_poff; - void *src_addr; - void *dest_addr; - struct page *src_page; - struct page *dest_page; - xfs_off_t cmp_len; - bool same; - int error; - - error = -EINVAL; - same = true; - while (len) { - src_poff = srcoff & (PAGE_SIZE - 1); - dest_poff = destoff & (PAGE_SIZE - 1); - cmp_len = min(PAGE_SIZE - src_poff, - PAGE_SIZE - dest_poff); - cmp_len = min(cmp_len, len); - ASSERT(cmp_len > 0); - - trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len, - XFS_I(dest), destoff); - - src_page = xfs_get_page(src, srcoff); - if (IS_ERR(src_page)) { - error = PTR_ERR(src_page); - goto out_error; - } - dest_page = xfs_get_page(dest, destoff); - if (IS_ERR(dest_page)) { - error = PTR_ERR(dest_page); - unlock_page(src_page); - put_page(src_page); - goto out_error; - } - src_addr = kmap_atomic(src_page); - dest_addr = kmap_atomic(dest_page); - - flush_dcache_page(src_page); - flush_dcache_page(dest_page); - - if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len)) - same = false; - - kunmap_atomic(dest_addr); - kunmap_atomic(src_addr); - unlock_page(dest_page); - unlock_page(src_page); - put_page(dest_page); - put_page(src_page); - - if (!same) - break; - - srcoff += cmp_len; - destoff += cmp_len; - len -= cmp_len; - } - - *is_same = same; - return 0; - -out_error: - trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_); - return error; -} - -/* * Link a range of blocks from one file to another. 
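 * (Most of the generic checks that used to live here, the EOF and
 * block alignment handling, the dedupe extent comparison and the page
 * cache flushing, now sit behind vfs_clone_file_prep_inodes(); see the
 * removals in the hunk below.)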
*/ int @@ -1286,14 +1130,11 @@ xfs_reflink_remap_range( struct inode *inode_out = file_inode(file_out); struct xfs_inode *dest = XFS_I(inode_out); struct xfs_mount *mp = src->i_mount; - loff_t bs = inode_out->i_sb->s_blocksize; bool same_inode = (inode_in == inode_out); xfs_fileoff_t sfsbno, dfsbno; xfs_filblks_t fsblen; xfs_extlen_t cowextsize; - loff_t isize; ssize_t ret; - loff_t blen; if (!xfs_sb_version_hasreflink(&mp->m_sb)) return -EOPNOTSUPP; @@ -1302,34 +1143,14 @@ xfs_reflink_remap_range( return -EIO; /* Lock both files against IO */ - if (same_inode) { - xfs_ilock(src, XFS_IOLOCK_EXCL); + lock_two_nondirectories(inode_in, inode_out); + if (same_inode) xfs_ilock(src, XFS_MMAPLOCK_EXCL); - } else { - xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL); + else xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL); - } - - /* Don't touch certain kinds of inodes */ - ret = -EPERM; - if (IS_IMMUTABLE(inode_out)) - goto out_unlock; - ret = -ETXTBSY; - if (IS_SWAPFILE(inode_in) || IS_SWAPFILE(inode_out)) - goto out_unlock; - - - /* Don't reflink dirs, pipes, sockets... */ - ret = -EISDIR; - if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode)) - goto out_unlock; + /* Check file eligibility and prepare for block sharing. */ ret = -EINVAL; - if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode)) - goto out_unlock; - if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode)) - goto out_unlock; - /* Don't reflink realtime inodes */ if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest)) goto out_unlock; @@ -1338,91 +1159,18 @@ xfs_reflink_remap_range( if (IS_DAX(inode_in) || IS_DAX(inode_out)) goto out_unlock; - /* Are we going all the way to the end? */ - isize = i_size_read(inode_in); - if (isize == 0) { - ret = 0; - goto out_unlock; - } - - if (len == 0) - len = isize - pos_in; - - /* Ensure offsets don't wrap and the input is inside i_size */ - if (pos_in + len < pos_in || pos_out + len < pos_out || - pos_in + len > isize) - goto out_unlock; - - /* Don't allow dedupe past EOF in the dest file */ - if (is_dedupe) { - loff_t disize; - - disize = i_size_read(inode_out); - if (pos_out >= disize || pos_out + len > disize) - goto out_unlock; - } - - /* If we're linking to EOF, continue to the block boundary. */ - if (pos_in + len == isize) - blen = ALIGN(isize, bs) - pos_in; - else - blen = len; - - /* Only reflink if we're aligned to block boundaries */ - if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) || - !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs)) - goto out_unlock; - - /* Don't allow overlapped reflink within the same file */ - if (same_inode) { - if (pos_out + blen > pos_in && pos_out < pos_in + blen) - goto out_unlock; - } - - /* Wait for the completion of any pending IOs on both files */ - inode_dio_wait(inode_in); - if (!same_inode) - inode_dio_wait(inode_out); - - ret = filemap_write_and_wait_range(inode_in->i_mapping, - pos_in, pos_in + len - 1); - if (ret) - goto out_unlock; - - ret = filemap_write_and_wait_range(inode_out->i_mapping, - pos_out, pos_out + len - 1); - if (ret) + ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, + &len, is_dedupe); + if (ret <= 0) goto out_unlock; trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); - /* - * Check that the extents are the same. 
- */ - if (is_dedupe) { - bool is_same = false; - - ret = xfs_compare_extents(inode_in, pos_in, inode_out, pos_out, - len, &is_same); - if (ret) - goto out_unlock; - if (!is_same) { - ret = -EBADE; - goto out_unlock; - } - } - + /* Set flags and remap blocks. */ ret = xfs_reflink_set_inode_flag(src, dest); if (ret) goto out_unlock; - /* - * Invalidate the page cache so that we can clear any CoW mappings - * in the destination file. - */ - truncate_inode_pages_range(&inode_out->i_data, pos_out, - PAGE_ALIGN(pos_out + len) - 1); - dfsbno = XFS_B_TO_FSBT(mp, pos_out); sfsbno = XFS_B_TO_FSBT(mp, pos_in); fsblen = XFS_B_TO_FSB(mp, len); @@ -1431,6 +1179,10 @@ xfs_reflink_remap_range( if (ret) goto out_unlock; + /* Zap any page cache for the destination file's range. */ + truncate_inode_pages_range(&inode_out->i_data, pos_out, + PAGE_ALIGN(pos_out + len) - 1); + /* * Carry the cowextsize hint from src to dest if we're sharing the * entire source file to the entire destination file, the source file @@ -1447,11 +1199,9 @@ xfs_reflink_remap_range( out_unlock: xfs_iunlock(src, XFS_MMAPLOCK_EXCL); - xfs_iunlock(src, XFS_IOLOCK_EXCL); - if (src->i_ino != dest->i_ino) { + if (!same_inode) xfs_iunlock(dest, XFS_MMAPLOCK_EXCL); - xfs_iunlock(dest, XFS_IOLOCK_EXCL); - } + unlock_two_nondirectories(inode_in, inode_out); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); return ret; @@ -1697,37 +1447,3 @@ out: trace_xfs_reflink_unshare_error(ip, error, _RET_IP_); return error; } - -/* - * Does this inode have any real CoW reservations? - */ -bool -xfs_reflink_has_real_cow_blocks( - struct xfs_inode *ip) -{ - struct xfs_bmbt_irec irec; - struct xfs_ifork *ifp; - struct xfs_bmbt_rec_host *gotp; - xfs_extnum_t idx; - - if (!xfs_is_reflink_inode(ip)) - return false; - - /* Go find the old extent in the CoW fork. */ - ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); - gotp = xfs_iext_bno_to_ext(ifp, 0, &idx); - while (gotp) { - xfs_bmbt_get_all(gotp, &irec); - - if (!isnullstartblock(irec.br_startblock)) - return true; - - /* Roll on... 
*/ - idx++; - if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t)) - break; - gotp = xfs_iext_get_ext(ifp, idx); - } - - return false; -} diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index fad11607c9ad..aa6a4d64bd35 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -31,8 +31,8 @@ extern int xfs_reflink_reserve_cow(struct xfs_inode *ip, extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset, - struct xfs_bmbt_irec *imap, bool *need_alloc); -extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, + struct xfs_bmbt_irec *imap); +extern void xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap); extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, @@ -50,6 +50,4 @@ extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip, extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t len); -extern bool xfs_reflink_has_real_cow_blocks(struct xfs_inode *ip); - #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c index 12d48cd8f8a4..f11282c96887 100644 --- a/fs/xfs/xfs_stats.c +++ b/fs/xfs/xfs_stats.c @@ -80,9 +80,9 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf) } /* extra precision counters */ for_each_possible_cpu(i) { - xs_xstrat_bytes += per_cpu_ptr(stats, i)->xs_xstrat_bytes; - xs_write_bytes += per_cpu_ptr(stats, i)->xs_write_bytes; - xs_read_bytes += per_cpu_ptr(stats, i)->xs_read_bytes; + xs_xstrat_bytes += per_cpu_ptr(stats, i)->s.xs_xstrat_bytes; + xs_write_bytes += per_cpu_ptr(stats, i)->s.xs_write_bytes; + xs_read_bytes += per_cpu_ptr(stats, i)->s.xs_read_bytes; } len += snprintf(buf + len, PATH_MAX-len, "xpc %Lu %Lu %Lu\n", @@ -106,9 +106,9 @@ void xfs_stats_clearall(struct xfsstats __percpu *stats) for_each_possible_cpu(c) { preempt_disable(); /* save vn_active, it's a universal truth! */ - vn_active = per_cpu_ptr(stats, c)->vn_active; + vn_active = per_cpu_ptr(stats, c)->s.vn_active; memset(per_cpu_ptr(stats, c), 0, sizeof(*stats)); - per_cpu_ptr(stats, c)->vn_active = vn_active; + per_cpu_ptr(stats, c)->s.vn_active = vn_active; preempt_enable(); } } diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h index 79ad2e69fc33..375840f5a99a 100644 --- a/fs/xfs/xfs_stats.h +++ b/fs/xfs/xfs_stats.h @@ -22,9 +22,37 @@ #include <linux/percpu.h> /* + * The btree stats arrays have fixed offsets for the different stats. We + * store the base index in the btree cursor via XFS_STATS_CALC_INDEX() and + * that allows us to use fixed offsets into the stats array for each btree + * stat. These index offsets are defined in the order they will be emitted + * in the stats files, so it is possible to add new btree stat types by + * appending to the enum list below. 
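+ *
+ * (For example, xs_abtb_2[] below starts at flat index XFSSTAT_END_BUF,
+ * so a lookup in the by-block allocation btree increments
+ * a[XFSSTAT_END_BUF + __XBTS_lookup] through the new
+ * XFS_STATS_INC_OFF() macro at the bottom of this header.)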
+ */
+enum {
+	__XBTS_lookup = 0,
+	__XBTS_compare = 1,
+	__XBTS_insrec = 2,
+	__XBTS_delrec = 3,
+	__XBTS_newroot = 4,
+	__XBTS_killroot = 5,
+	__XBTS_increment = 6,
+	__XBTS_decrement = 7,
+	__XBTS_lshift = 8,
+	__XBTS_rshift = 9,
+	__XBTS_split = 10,
+	__XBTS_join = 11,
+	__XBTS_alloc = 12,
+	__XBTS_free = 13,
+	__XBTS_moves = 14,
+
+	__XBTS_MAX = 15,
+};
+
+/*
  * XFS global statistics
  */
-struct xfsstats {
+struct __xfsstats {
 # define XFSSTAT_END_EXTENT_ALLOC	4
 	__uint32_t		xs_allocx;
 	__uint32_t		xs_allocb;
@@ -117,118 +145,20 @@ struct xfsstats {
 	__uint32_t		xb_page_found;
 	__uint32_t		xb_get_read;
 /* Version 2 btree counters */
-#define XFSSTAT_END_ABTB_V2	(XFSSTAT_END_BUF+15)
-	__uint32_t		xs_abtb_2_lookup;
-	__uint32_t		xs_abtb_2_compare;
-	__uint32_t		xs_abtb_2_insrec;
-	__uint32_t		xs_abtb_2_delrec;
-	__uint32_t		xs_abtb_2_newroot;
-	__uint32_t		xs_abtb_2_killroot;
-	__uint32_t		xs_abtb_2_increment;
-	__uint32_t		xs_abtb_2_decrement;
-	__uint32_t		xs_abtb_2_lshift;
-	__uint32_t		xs_abtb_2_rshift;
-	__uint32_t		xs_abtb_2_split;
-	__uint32_t		xs_abtb_2_join;
-	__uint32_t		xs_abtb_2_alloc;
-	__uint32_t		xs_abtb_2_free;
-	__uint32_t		xs_abtb_2_moves;
-#define XFSSTAT_END_ABTC_V2	(XFSSTAT_END_ABTB_V2+15)
-	__uint32_t		xs_abtc_2_lookup;
-	__uint32_t		xs_abtc_2_compare;
-	__uint32_t		xs_abtc_2_insrec;
-	__uint32_t		xs_abtc_2_delrec;
-	__uint32_t		xs_abtc_2_newroot;
-	__uint32_t		xs_abtc_2_killroot;
-	__uint32_t		xs_abtc_2_increment;
-	__uint32_t		xs_abtc_2_decrement;
-	__uint32_t		xs_abtc_2_lshift;
-	__uint32_t		xs_abtc_2_rshift;
-	__uint32_t		xs_abtc_2_split;
-	__uint32_t		xs_abtc_2_join;
-	__uint32_t		xs_abtc_2_alloc;
-	__uint32_t		xs_abtc_2_free;
-	__uint32_t		xs_abtc_2_moves;
-#define XFSSTAT_END_BMBT_V2	(XFSSTAT_END_ABTC_V2+15)
-	__uint32_t		xs_bmbt_2_lookup;
-	__uint32_t		xs_bmbt_2_compare;
-	__uint32_t		xs_bmbt_2_insrec;
-	__uint32_t		xs_bmbt_2_delrec;
-	__uint32_t		xs_bmbt_2_newroot;
-	__uint32_t		xs_bmbt_2_killroot;
-	__uint32_t		xs_bmbt_2_increment;
-	__uint32_t		xs_bmbt_2_decrement;
-	__uint32_t		xs_bmbt_2_lshift;
-	__uint32_t		xs_bmbt_2_rshift;
-	__uint32_t		xs_bmbt_2_split;
-	__uint32_t		xs_bmbt_2_join;
-	__uint32_t		xs_bmbt_2_alloc;
-	__uint32_t		xs_bmbt_2_free;
-	__uint32_t		xs_bmbt_2_moves;
-#define XFSSTAT_END_IBT_V2	(XFSSTAT_END_BMBT_V2+15)
-	__uint32_t		xs_ibt_2_lookup;
-	__uint32_t		xs_ibt_2_compare;
-	__uint32_t		xs_ibt_2_insrec;
-	__uint32_t		xs_ibt_2_delrec;
-	__uint32_t		xs_ibt_2_newroot;
-	__uint32_t		xs_ibt_2_killroot;
-	__uint32_t		xs_ibt_2_increment;
-	__uint32_t		xs_ibt_2_decrement;
-	__uint32_t		xs_ibt_2_lshift;
-	__uint32_t		xs_ibt_2_rshift;
-	__uint32_t		xs_ibt_2_split;
-	__uint32_t		xs_ibt_2_join;
-	__uint32_t		xs_ibt_2_alloc;
-	__uint32_t		xs_ibt_2_free;
-	__uint32_t		xs_ibt_2_moves;
-#define XFSSTAT_END_FIBT_V2	(XFSSTAT_END_IBT_V2+15)
-	__uint32_t		xs_fibt_2_lookup;
-	__uint32_t		xs_fibt_2_compare;
-	__uint32_t		xs_fibt_2_insrec;
-	__uint32_t		xs_fibt_2_delrec;
-	__uint32_t		xs_fibt_2_newroot;
-	__uint32_t		xs_fibt_2_killroot;
-	__uint32_t		xs_fibt_2_increment;
-	__uint32_t		xs_fibt_2_decrement;
-	__uint32_t		xs_fibt_2_lshift;
-	__uint32_t		xs_fibt_2_rshift;
-	__uint32_t		xs_fibt_2_split;
-	__uint32_t		xs_fibt_2_join;
-	__uint32_t		xs_fibt_2_alloc;
-	__uint32_t		xs_fibt_2_free;
-	__uint32_t		xs_fibt_2_moves;
-#define XFSSTAT_END_RMAP_V2	(XFSSTAT_END_FIBT_V2+15)
-	__uint32_t		xs_rmap_2_lookup;
-	__uint32_t		xs_rmap_2_compare;
-	__uint32_t		xs_rmap_2_insrec;
-	__uint32_t		xs_rmap_2_delrec;
-	__uint32_t		xs_rmap_2_newroot;
-	__uint32_t		xs_rmap_2_killroot;
-	__uint32_t		xs_rmap_2_increment;
-	__uint32_t		xs_rmap_2_decrement;
-	__uint32_t		xs_rmap_2_lshift;
-	__uint32_t		xs_rmap_2_rshift;
-	__uint32_t		xs_rmap_2_split;
-	__uint32_t		xs_rmap_2_join;
-	__uint32_t		xs_rmap_2_alloc;
-	__uint32_t		xs_rmap_2_free;
-	__uint32_t		xs_rmap_2_moves;
-#define XFSSTAT_END_REFCOUNT	(XFSSTAT_END_RMAP_V2 + 15)
-	__uint32_t		xs_refcbt_2_lookup;
-	__uint32_t		xs_refcbt_2_compare;
-	__uint32_t		xs_refcbt_2_insrec;
-	__uint32_t		xs_refcbt_2_delrec;
-	__uint32_t		xs_refcbt_2_newroot;
-	__uint32_t		xs_refcbt_2_killroot;
-	__uint32_t		xs_refcbt_2_increment;
-	__uint32_t		xs_refcbt_2_decrement;
-	__uint32_t		xs_refcbt_2_lshift;
-	__uint32_t		xs_refcbt_2_rshift;
-	__uint32_t		xs_refcbt_2_split;
-	__uint32_t		xs_refcbt_2_join;
-	__uint32_t		xs_refcbt_2_alloc;
-	__uint32_t		xs_refcbt_2_free;
-	__uint32_t		xs_refcbt_2_moves;
+#define XFSSTAT_END_ABTB_V2	(XFSSTAT_END_BUF + __XBTS_MAX)
+	__uint32_t		xs_abtb_2[__XBTS_MAX];
+#define XFSSTAT_END_ABTC_V2	(XFSSTAT_END_ABTB_V2 + __XBTS_MAX)
+	__uint32_t		xs_abtc_2[__XBTS_MAX];
+#define XFSSTAT_END_BMBT_V2	(XFSSTAT_END_ABTC_V2 + __XBTS_MAX)
+	__uint32_t		xs_bmbt_2[__XBTS_MAX];
+#define XFSSTAT_END_IBT_V2	(XFSSTAT_END_BMBT_V2 + __XBTS_MAX)
+	__uint32_t		xs_ibt_2[__XBTS_MAX];
+#define XFSSTAT_END_FIBT_V2	(XFSSTAT_END_IBT_V2 + __XBTS_MAX)
+	__uint32_t		xs_fibt_2[__XBTS_MAX];
+#define XFSSTAT_END_RMAP_V2	(XFSSTAT_END_FIBT_V2 + __XBTS_MAX)
+	__uint32_t		xs_rmap_2[__XBTS_MAX];
+#define XFSSTAT_END_REFCOUNT	(XFSSTAT_END_RMAP_V2 + __XBTS_MAX)
+	__uint32_t		xs_refcbt_2[__XBTS_MAX];
 #define XFSSTAT_END_XQMSTAT	(XFSSTAT_END_REFCOUNT + 6)
 	__uint32_t		xs_qm_dqreclaims;
 	__uint32_t		xs_qm_dqreclaim_misses;
@@ -245,26 +175,58 @@ struct xfsstats {
 	__uint64_t		xs_read_bytes;
 };
 
+struct xfsstats {
+	union {
+		struct __xfsstats	s;
+		uint32_t		a[XFSSTAT_END_XQMSTAT];
+	};
+};
+
+/*
+ * simple wrapper for getting the array index of s struct member offset
+ */
+#define XFS_STATS_CALC_INDEX(member)	\
+	(offsetof(struct __xfsstats, member) / (int)sizeof(__uint32_t))
+
+
 int xfs_stats_format(struct xfsstats __percpu *stats, char *buf);
 void xfs_stats_clearall(struct xfsstats __percpu *stats);
 extern struct xstats xfsstats;
 
 #define XFS_STATS_INC(mp, v)	\
 do {	\
-	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v++;	\
-	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v++;	\
+	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v++;	\
+	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v++;	\
 } while (0)
 
 #define XFS_STATS_DEC(mp, v)	\
 do {	\
-	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v--;	\
-	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v--;	\
+	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v--;	\
+	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v--;	\
 } while (0)
 
 #define XFS_STATS_ADD(mp, v, inc)	\
 do {	\
-	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->v += (inc);	\
-	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->v += (inc);	\
+	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->s.v += (inc);	\
+	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->s.v += (inc);	\
+} while (0)
+
+#define XFS_STATS_INC_OFF(mp, off)	\
+do {	\
+	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]++;	\
+	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]++;	\
+} while (0)
+
+#define XFS_STATS_DEC_OFF(mp, off)	\
+do {	\
+	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off]--;	\
+	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off]--;	\
+} while (0)
+
+#define XFS_STATS_ADD_OFF(mp, off, inc)	\
+do {	\
+	per_cpu_ptr(xfsstats.xs_stats, current_cpu())->a[off] += (inc);	\
+	per_cpu_ptr(mp->m_stats.xs_stats, current_cpu())->a[off] += (inc);	\
 } while (0)
 
 #if defined(CONFIG_PROC_FS)
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index ade4691e3f74..eecbaac08eba 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -104,9 +104,6 @@ static const match_table_t tokens = {
 	{Opt_sysvgroups,"sysvgroups"},	/* group-ID from current process */
 	{Opt_allocsize,	"allocsize=%s"},/* preferred allocation size */
 	{Opt_norecovery,"norecovery"},	/* don't run XFS recovery */
-	{Opt_barrier,	"barrier"},	/* use writer barriers for log write and
-					 * unwritten extent conversion */
-	{Opt_nobarrier,	"nobarrier"},	/* .. disable */
 	{Opt_inode64,	"inode64"},	/* inodes can be allocated anywhere */
 	{Opt_inode32,	"inode32"},	/* inode allocation limited to
 					 * XFS_MAXINUMBER_32 */
@@ -134,6 +131,12 @@ static const match_table_t tokens = {
 	{Opt_nodiscard,	"nodiscard"},	/* Do not discard unused blocks */
 
 	{Opt_dax,	"dax"},		/* Enable direct access to bdev pages */
+
+	/* Deprecated mount options scheduled for removal */
+	{Opt_barrier,	"barrier"},	/* use writer barriers for log write and
+					 * unwritten extent conversion */
+	{Opt_nobarrier,	"nobarrier"},	/* .. disable */
+
 	{Opt_err,	NULL},
 };
 
@@ -301,12 +304,6 @@ xfs_parseargs(
 		case Opt_nouuid:
 			mp->m_flags |= XFS_MOUNT_NOUUID;
 			break;
-		case Opt_barrier:
-			mp->m_flags |= XFS_MOUNT_BARRIER;
-			break;
-		case Opt_nobarrier:
-			mp->m_flags &= ~XFS_MOUNT_BARRIER;
-			break;
 		case Opt_ikeep:
 			mp->m_flags |= XFS_MOUNT_IKEEP;
 			break;
@@ -374,6 +371,14 @@ xfs_parseargs(
 			mp->m_flags |= XFS_MOUNT_DAX;
 			break;
 #endif
+		case Opt_barrier:
+			xfs_warn(mp, "%s option is deprecated, ignoring.", p);
+			mp->m_flags |= XFS_MOUNT_BARRIER;
+			break;
+		case Opt_nobarrier:
+			xfs_warn(mp, "%s option is deprecated, ignoring.", p);
+			mp->m_flags &= ~XFS_MOUNT_BARRIER;
+			break;
 		default:
 			xfs_warn(mp, "unknown mount option [%s].", p);
 			return -EINVAL;
@@ -943,7 +948,7 @@ xfs_fs_destroy_inode(
 
 	trace_xfs_destroy_inode(ip);
 
-	ASSERT(!rwsem_is_locked(&ip->i_iolock.mr_lock));
+	ASSERT(!rwsem_is_locked(&inode->i_rwsem));
 	XFS_STATS_INC(ip->i_mount, vn_rele);
 	XFS_STATS_INC(ip->i_mount, vn_remove);
 
@@ -1238,9 +1243,11 @@ xfs_fs_remount(
 		token = match_token(p, tokens, args);
 		switch (token) {
 		case Opt_barrier:
+			xfs_warn(mp, "%s option is deprecated, ignoring.", p);
 			mp->m_flags |= XFS_MOUNT_BARRIER;
 			break;
 		case Opt_nobarrier:
+			xfs_warn(mp, "%s option is deprecated, ignoring.", p);
 			mp->m_flags &= ~XFS_MOUNT_BARRIER;
 			break;
 		case Opt_inode64:
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 58142aeeeea6..f2cb45ed1d54 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -238,8 +238,7 @@ xfs_symlink(
 	if (error)
 		goto out_release_inode;
 
-	xfs_ilock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL |
-		      XFS_IOLOCK_PARENT | XFS_ILOCK_PARENT);
+	xfs_ilock(dp, XFS_ILOCK_EXCL | XFS_ILOCK_PARENT);
 	unlock_dp_on_error = true;
 
 	/*
@@ -287,7 +286,7 @@ xfs_symlink(
 	 * the transaction cancel unlocking dp so don't do it explicitly in the
 	 * error path.
 	 */
-	xfs_trans_ijoin(tp, dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dp, XFS_ILOCK_EXCL);
 	unlock_dp_on_error = false;
 
 	/*
@@ -412,7 +411,7 @@ out_release_inode:
 	xfs_qm_dqrele(pdqp);
 
 	if (unlock_dp_on_error)
-		xfs_iunlock(dp, XFS_IOLOCK_EXCL | XFS_ILOCK_EXCL);
+		xfs_iunlock(dp, XFS_ILOCK_EXCL);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c
index 276d3023d60f..de6195e38910 100644
--- a/fs/xfs/xfs_sysfs.c
+++ b/fs/xfs/xfs_sysfs.c
@@ -396,7 +396,7 @@ max_retries_show(
 	int		retries;
 	struct xfs_error_cfg *cfg = to_error_cfg(kobject);
 
-	if (cfg->retry_timeout == XFS_ERR_RETRY_FOREVER)
+	if (cfg->max_retries == XFS_ERR_RETRY_FOREVER)
 		retries = -1;
 	else
 		retries = cfg->max_retries;
@@ -422,7 +422,7 @@ max_retries_store(
 		return -EINVAL;
 
 	if (val == -1)
-		cfg->retry_timeout = XFS_ERR_RETRY_FOREVER;
+		cfg->max_retries = XFS_ERR_RETRY_FOREVER;
 	else
 		cfg->max_retries = val;
 
 	return count;
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 0907752be62d..69c5bcd9a51b 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -355,7 +355,6 @@ DEFINE_BUF_EVENT(xfs_buf_rele);
 DEFINE_BUF_EVENT(xfs_buf_iodone);
 DEFINE_BUF_EVENT(xfs_buf_submit);
 DEFINE_BUF_EVENT(xfs_buf_submit_wait);
-DEFINE_BUF_EVENT(xfs_buf_bawrite);
 DEFINE_BUF_EVENT(xfs_buf_lock);
 DEFINE_BUF_EVENT(xfs_buf_lock_done);
 DEFINE_BUF_EVENT(xfs_buf_trylock_fail);
@@ -367,19 +366,15 @@ DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
 DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
 DEFINE_BUF_EVENT(xfs_buf_delwri_split);
 DEFINE_BUF_EVENT(xfs_buf_get_uncached);
-DEFINE_BUF_EVENT(xfs_bdstrat_shut);
 DEFINE_BUF_EVENT(xfs_buf_item_relse);
 DEFINE_BUF_EVENT(xfs_buf_item_iodone_async);
 DEFINE_BUF_EVENT(xfs_buf_error_relse);
 DEFINE_BUF_EVENT(xfs_buf_wait_buftarg);
-DEFINE_BUF_EVENT(xfs_trans_read_buf_io);
 DEFINE_BUF_EVENT(xfs_trans_read_buf_shut);
 
 /* not really buffer traces, but the buf provides useful information */
 DEFINE_BUF_EVENT(xfs_btree_corrupt);
-DEFINE_BUF_EVENT(xfs_da_btree_corrupt);
 DEFINE_BUF_EVENT(xfs_reset_dqcounts);
-DEFINE_BUF_EVENT(xfs_inode_item_push);
 
 /* pass flags explicitly */
 DECLARE_EVENT_CLASS(xfs_buf_flags_class,
@@ -541,7 +536,6 @@ DEFINE_BUF_ITEM_EVENT(xfs_trans_bjoin);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_bhold_release);
 DEFINE_BUF_ITEM_EVENT(xfs_trans_binval);
-DEFINE_BUF_ITEM_EVENT(xfs_trans_buf_ordered);
 
 DECLARE_EVENT_CLASS(xfs_filestream_class,
 	TP_PROTO(struct xfs_inode *ip, xfs_agnumber_t agno),
@@ -680,7 +674,6 @@ DEFINE_INODE_EVENT(xfs_ioctl_setattr);
 DEFINE_INODE_EVENT(xfs_dir_fsync);
 DEFINE_INODE_EVENT(xfs_file_fsync);
 DEFINE_INODE_EVENT(xfs_destroy_inode);
-DEFINE_INODE_EVENT(xfs_evict_inode);
 DEFINE_INODE_EVENT(xfs_update_time);
 
 DEFINE_INODE_EVENT(xfs_dquot_dqalloc);
@@ -798,7 +791,6 @@ TRACE_EVENT(xfs_irec_merge_post,
 DEFINE_EVENT(xfs_iref_class, name, \
 	TP_PROTO(struct xfs_inode *ip, unsigned long caller_ip), \
 	TP_ARGS(ip, caller_ip))
-DEFINE_IREF_EVENT(xfs_ihold);
 DEFINE_IREF_EVENT(xfs_irele);
 DEFINE_IREF_EVENT(xfs_inode_pin);
 DEFINE_IREF_EVENT(xfs_inode_unpin);
@@ -939,7 +931,6 @@ DEFINE_DQUOT_EVENT(xfs_dqget_miss);
 DEFINE_DQUOT_EVENT(xfs_dqget_freeing);
 DEFINE_DQUOT_EVENT(xfs_dqget_dup);
 DEFINE_DQUOT_EVENT(xfs_dqput);
-DEFINE_DQUOT_EVENT(xfs_dqput_wait);
 DEFINE_DQUOT_EVENT(xfs_dqput_free);
 DEFINE_DQUOT_EVENT(xfs_dqrele);
 DEFINE_DQUOT_EVENT(xfs_dqflush);
@@ -1815,7 +1806,6 @@ DEFINE_ATTR_EVENT(xfs_attr_sf_addname);
 DEFINE_ATTR_EVENT(xfs_attr_sf_create);
 DEFINE_ATTR_EVENT(xfs_attr_sf_lookup);
 DEFINE_ATTR_EVENT(xfs_attr_sf_remove);
-DEFINE_ATTR_EVENT(xfs_attr_sf_removename);
 DEFINE_ATTR_EVENT(xfs_attr_sf_to_leaf);
 
 DEFINE_ATTR_EVENT(xfs_attr_leaf_add);
@@ -1844,7 +1834,6 @@ DEFINE_ATTR_EVENT(xfs_attr_leaf_toosmall);
 
 DEFINE_ATTR_EVENT(xfs_attr_node_addname);
 DEFINE_ATTR_EVENT(xfs_attr_node_get);
-DEFINE_ATTR_EVENT(xfs_attr_node_lookup);
 DEFINE_ATTR_EVENT(xfs_attr_node_replace);
 DEFINE_ATTR_EVENT(xfs_attr_node_removename);
@@ -2440,11 +2429,9 @@ DEFINE_DEFER_EVENT(xfs_defer_finish_done);
 
 DEFINE_DEFER_ERROR_EVENT(xfs_defer_trans_roll_error);
 DEFINE_DEFER_ERROR_EVENT(xfs_defer_finish_error);
-DEFINE_DEFER_ERROR_EVENT(xfs_defer_op_finish_error);
 
 DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_work);
 DEFINE_DEFER_PENDING_EVENT(xfs_defer_intake_cancel);
-DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_commit);
 DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_cancel);
 DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_finish);
 DEFINE_DEFER_PENDING_EVENT(xfs_defer_pending_abort);
@@ -3092,87 +3079,6 @@ DEFINE_EVENT(xfs_double_io_class, name,	\
 		 struct xfs_inode *dest, xfs_off_t doffset), \
 	TP_ARGS(src, soffset, len, dest, doffset))
 
-/* two-file vfs io tracepoint class */
-DECLARE_EVENT_CLASS(xfs_double_vfs_io_class,
-	TP_PROTO(struct inode *src, u64 soffset, u64 len,
-		 struct inode *dest, u64 doffset),
-	TP_ARGS(src, soffset, len, dest, doffset),
-	TP_STRUCT__entry(
-		__field(dev_t, dev)
-		__field(unsigned long, src_ino)
-		__field(loff_t, src_isize)
-		__field(loff_t, src_offset)
-		__field(size_t, len)
-		__field(unsigned long, dest_ino)
-		__field(loff_t, dest_isize)
-		__field(loff_t, dest_offset)
-	),
-	TP_fast_assign(
-		__entry->dev = src->i_sb->s_dev;
-		__entry->src_ino = src->i_ino;
-		__entry->src_isize = i_size_read(src);
-		__entry->src_offset = soffset;
-		__entry->len = len;
-		__entry->dest_ino = dest->i_ino;
-		__entry->dest_isize = i_size_read(dest);
-		__entry->dest_offset = doffset;
-	),
-	TP_printk("dev %d:%d count %zd "
-		  "ino 0x%lx isize 0x%llx offset 0x%llx -> "
-		  "ino 0x%lx isize 0x%llx offset 0x%llx",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->len,
-		  __entry->src_ino,
-		  __entry->src_isize,
-		  __entry->src_offset,
-		  __entry->dest_ino,
-		  __entry->dest_isize,
-		  __entry->dest_offset)
-)
-
-#define DEFINE_DOUBLE_VFS_IO_EVENT(name)	\
-DEFINE_EVENT(xfs_double_vfs_io_class, name,	\
-	TP_PROTO(struct inode *src, u64 soffset, u64 len, \
-		 struct inode *dest, u64 doffset), \
-	TP_ARGS(src, soffset, len, dest, doffset))
-
-/* CoW write tracepoint */
-DECLARE_EVENT_CLASS(xfs_copy_on_write_class,
-	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk,
-		 xfs_extlen_t len, xfs_fsblock_t new_pblk),
-	TP_ARGS(ip, lblk, pblk, len, new_pblk),
-	TP_STRUCT__entry(
-		__field(dev_t, dev)
-		__field(xfs_ino_t, ino)
-		__field(xfs_fileoff_t, lblk)
-		__field(xfs_fsblock_t, pblk)
-		__field(xfs_extlen_t, len)
-		__field(xfs_fsblock_t, new_pblk)
-	),
-	TP_fast_assign(
-		__entry->dev = VFS_I(ip)->i_sb->s_dev;
-		__entry->ino = ip->i_ino;
-		__entry->lblk = lblk;
-		__entry->pblk = pblk;
-		__entry->len = len;
-		__entry->new_pblk = new_pblk;
-	),
-	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx pblk 0x%llx "
-		  "len 0x%x new_pblk %llu",
-		  MAJOR(__entry->dev), MINOR(__entry->dev),
-		  __entry->ino,
-		  __entry->lblk,
-		  __entry->pblk,
-		  __entry->len,
-		  __entry->new_pblk)
-)
-
-#define DEFINE_COW_EVENT(name)	\
-DEFINE_EVENT(xfs_copy_on_write_class, name,	\
-	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, \
-		 xfs_extlen_t len, xfs_fsblock_t new_pblk), \
-	TP_ARGS(ip, lblk, pblk, len, new_pblk))
-
 /* inode/irec events */
 DECLARE_EVENT_CLASS(xfs_inode_irec_class,
 	TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec),
@@ -3292,8 +3198,6 @@ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_reflink_main_loop_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_read_iomap_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
 
@@ -3302,9 +3206,6 @@ DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error);
 
 /* ioctl tracepoints */
-DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_reflink);
-DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_clone_range);
-DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_file_extent_same);
 TRACE_EVENT(xfs_ioctl_clone,
 	TP_PROTO(struct inode *src, struct inode *dest),
 	TP_ARGS(src, dest),
@@ -3334,11 +3235,7 @@ TRACE_EVENT(xfs_ioctl_clone,
 
 /* unshare tracepoints */
 DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
-DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cow_eof_block);
-DEFINE_PAGE_EVENT(xfs_reflink_unshare_page);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_cow_eof_block_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_dirty_page_error);
 
 /* copy on write */
 DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
@@ -3361,14 +3258,8 @@ DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
 DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
 
-DEFINE_COW_EVENT(xfs_reflink_fork_buf);
-DEFINE_COW_EVENT(xfs_reflink_finish_fork_buf);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_fork_buf_error);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_finish_fork_buf_error);
-DEFINE_INODE_EVENT(xfs_reflink_cancel_pending_cow);
 DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow);
-DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_pending_cow_error);
 
 /* rmap swapext tracepoints */
 DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);
diff --git a/fs/xfs/xfs_xattr.c b/fs/xfs/xfs_xattr.c
index 62900938f26d..0594db435972 100644
--- a/fs/xfs/xfs_xattr.c
+++ b/fs/xfs/xfs_xattr.c
@@ -130,7 +130,7 @@ const struct xattr_handler *xfs_xattr_handlers[] = {
 	NULL
 };
 
-static int
+static void
 __xfs_xattr_put_listent(
 	struct xfs_attr_list_context *context,
 	char *prefix,
@@ -148,7 +148,7 @@ __xfs_xattr_put_listent(
 	if (arraytop > context->firstu) {
 		context->count = -1;	/* insufficient space */
 		context->seen_enough = 1;
-		return 0;
+		return;
 	}
 	offset = (char *)context->alist + context->count;
 	strncpy(offset, prefix, prefix_len);
@@ -159,10 +159,10 @@ __xfs_xattr_put_listent(
 
 compute_size:
 	context->count += prefix_len + namelen + 1;
-	return 0;
+	return;
 }
 
-static int
+static void
 xfs_xattr_put_listent(
 	struct xfs_attr_list_context *context,
 	int flags,
@@ -180,23 +180,19 @@ xfs_xattr_put_listent(
 
 	if (namelen == SGI_ACL_FILE_SIZE &&
 	    strncmp(name, SGI_ACL_FILE, SGI_ACL_FILE_SIZE) == 0) {
-		int ret = __xfs_xattr_put_listent(
+		__xfs_xattr_put_listent(
 				context, XATTR_SYSTEM_PREFIX,
 				XATTR_SYSTEM_PREFIX_LEN,
 				XATTR_POSIX_ACL_ACCESS,
 				strlen(XATTR_POSIX_ACL_ACCESS));
-		if (ret)
-			return ret;
 	} else if (namelen == SGI_ACL_DEFAULT_SIZE &&
 		   strncmp(name, SGI_ACL_DEFAULT, SGI_ACL_DEFAULT_SIZE) == 0) {
-		int ret = __xfs_xattr_put_listent(
+		__xfs_xattr_put_listent(
 				context, XATTR_SYSTEM_PREFIX,
 				XATTR_SYSTEM_PREFIX_LEN,
 				XATTR_POSIX_ACL_DEFAULT,
 				strlen(XATTR_POSIX_ACL_DEFAULT));
-		if (ret)
-			return ret;
 	}
 #endif
 
@@ -205,7 +201,7 @@ xfs_xattr_put_listent(
 	 * see them.
 	 */
 	if (!capable(CAP_SYS_ADMIN))
-		return 0;
+		return;
 
 	prefix = XATTR_TRUSTED_PREFIX;
 	prefix_len = XATTR_TRUSTED_PREFIX_LEN;
@@ -217,8 +213,9 @@ xfs_xattr_put_listent(
 		prefix_len = XATTR_USER_PREFIX_LEN;
 	}
 
-	return __xfs_xattr_put_listent(context, prefix, prefix_len, name,
-			namelen);
+	__xfs_xattr_put_listent(context, prefix, prefix_len, name,
+			namelen);
+	return;
 }
 
 ssize_t
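
Editor's note on the xfs_stats.h hunks above: the patch replaces seven blocks of individually named btree counters with per-btree arrays indexed by the __XBTS_* offsets, and a union overlays the whole counter struct with a flat __uint32_t array so generic btree code can bump any counter as "base index + stat offset" via XFS_STATS_INC_OFF(). The following standalone userspace sketch illustrates why the offsetof() arithmetic and the union aliasing address the same counter. It is not kernel code; the struct, union, enum and macro names here are an invented miniature of the patch's XFS_STATS_CALC_INDEX()/XFS_STATS_INC_OFF() pattern, and per-cpu handling is deliberately omitted.

/*
 * Demo of the union/offset trick: counters live in a named struct, a
 * union overlays them with a flat array, and offsetof() turns a member
 * name into the array index of that member's first slot.
 */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

enum { DEMO_lookup = 0, DEMO_compare = 1, DEMO_MAX = 2 };

struct demo_counters {
	uint32_t xs_allocx;             /* unrelated leading counter */
	uint32_t xs_abtb_2[DEMO_MAX];   /* one btree's counter block */
};

union demo_stats {
	struct demo_counters s;
	uint32_t a[sizeof(struct demo_counters) / sizeof(uint32_t)];
};

/* Same shape as the patch's XFS_STATS_CALC_INDEX(). */
#define DEMO_CALC_INDEX(member) \
	(offsetof(struct demo_counters, member) / sizeof(uint32_t))

int main(void)
{
	union demo_stats st = { { 0 } };
	size_t base = DEMO_CALC_INDEX(xs_abtb_2);

	st.a[base + DEMO_lookup]++;     /* what XFS_STATS_INC_OFF() does */

	/* The flat-array slot aliases the named struct member. */
	assert(st.a[base + DEMO_lookup] == st.s.xs_abtb_2[DEMO_lookup]);
	printf("btree lookup count = %u\n",
	       (unsigned)st.s.xs_abtb_2[DEMO_lookup]);
	return 0;
}

This aliasing is also why xfs_stats_clearall() in the xfs_stats.c hunk can memset() the whole per-cpu struct in one go, and it matches the patch's own comment that the __XBTS_* offsets are defined in emission order, so new btree stat types can be appended without disturbing existing indices.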