99 files changed, 1132 insertions, 3827 deletions
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index d8223209d4b1..59e3fe3d56c0 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -352,9 +352,6 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd,
 		invalidate_mapping_pages(&inode->i_data, 0, -1);
 	}
 	/* Convert flock to posix lock */
-	fl->fl_owner = (fl_owner_t)filp;
-	fl->fl_start = 0;
-	fl->fl_end = OFFSET_MAX;
 	fl->fl_flags |= FL_POSIX;
 	fl->fl_flags ^= FL_FLOCK;
 
diff --git a/fs/Makefile b/fs/Makefile
index f9cb9876e466..4030cbfbc9af 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -14,14 +14,13 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		stack.o fs_struct.o statfs.o
 
 ifeq ($(CONFIG_BLOCK),y)
-obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
+obj-y +=	buffer.o block_dev.o direct-io.o mpage.o
 else
 obj-y +=	no-block.o
 endif
 
 obj-$(CONFIG_PROC_FS) += proc_namespace.o
 
-obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o
 obj-y				+= notify/
 obj-$(CONFIG_EPOLL)		+= eventpoll.o
 obj-$(CONFIG_ANON_INODES)	+= anon_inodes.o
diff --git a/fs/affs/super.c b/fs/affs/super.c
index 6d589f28bf9b..895ac7dc9dbf 100644
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -340,8 +340,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent)
 				&blocksize,&sbi->s_prefix,
 				sbi->s_volume, &mount_flags)) {
 		printk(KERN_ERR "AFFS: Error parsing options\n");
-		kfree(sbi->s_prefix);
-		kfree(sbi);
 		return -EINVAL;
 	}
 	/* N.B. after this point s_prefix must be released */
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 1c8c6cc6de30..4b0eff6da674 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -130,6 +130,15 @@ static void afs_cm_destructor(struct afs_call *call)
 {
 	_enter("");
 
+	/* Break the callbacks here so that we do it after the final ACK is
+	 * received.  The step number here must match the final number in
+	 * afs_deliver_cb_callback().
+	 */
+	if (call->unmarshall == 6) {
+		ASSERT(call->server && call->count && call->request);
+		afs_break_callbacks(call->server, call->count, call->request);
+	}
+
 	afs_put_server(call->server);
 	call->server = NULL;
 	kfree(call->buffer);
@@ -272,6 +281,16 @@ static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
 		_debug("trailer");
 		if (skb->len != 0)
 			return -EBADMSG;
+
+		/* Record that the message was unmarshalled successfully so
+		 * that the call destructor can know do the callback breaking
+		 * work, even if the final ACK isn't received.
+		 *
+		 * If the step number changes, then afs_cm_destructor() must be
+		 * updated also.
+		 */
+		call->unmarshall++;
+	case 6:
 		break;
 	}
 
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index a8cf2cff836c..4baf1d2b39e4 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -555,10 +555,6 @@ int afs_flock(struct file *file, int cmd, struct file_lock *fl)
 		return -ENOLCK;
 
 	/* we're simulating flock() locks using posix locks on the server */
-	fl->fl_owner = (fl_owner_t) file;
-	fl->fl_start = 0;
-	fl->fl_end = OFFSET_MAX;
-
 	if (fl->fl_type == F_UNLCK)
 		return afs_do_unlk(file, fl);
 	return afs_do_setlk(file, fl);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index be75b500005d..590b55f46d61 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -75,7 +75,7 @@ struct afs_call {
 	const struct afs_call_type *type;	/* type of call */
 	const struct afs_wait_mode *wait_mode;	/* completion wait mode */
 	wait_queue_head_t	waitq;		/* processes awaiting completion */
-	work_func_t		async_workfn;
+	void (*async_workfn)(struct afs_call *call); /* asynchronous work function */
 	struct work_struct	async_work;	/* asynchronous work processor */
 	struct work_struct	work;		/* actual work processor */
 	struct sk_buff_head	rx_queue;	/* received packets */
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index ef943df73b8c..03a3beb17004 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -25,7 +25,7 @@ static void afs_wake_up_call_waiter(struct afs_call *);
 static int afs_wait_for_call_to_complete(struct afs_call *);
 static void afs_wake_up_async_call(struct afs_call *);
 static int afs_dont_wait_for_call_to_complete(struct afs_call *);
-static void afs_process_async_call(struct work_struct *);
+static void afs_process_async_call(struct afs_call *);
 static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *);
 static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool);
 
@@ -58,6 +58,13 @@ static void afs_collect_incoming_call(struct work_struct *);
 static struct sk_buff_head afs_incoming_calls;
 static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call);
 
+static void afs_async_workfn(struct work_struct *work)
+{
+	struct afs_call *call = container_of(work, struct afs_call, async_work);
+
+	call->async_workfn(call);
+}
+
 /*
  * open an RxRPC socket and bind it to be a server for callback notifications
  * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
@@ -184,6 +191,28 @@ static void afs_free_call(struct afs_call *call)
 }
 
 /*
+ * End a call but do not free it
+ */
+static void afs_end_call_nofree(struct afs_call *call)
+{
+	if (call->rxcall) {
+		rxrpc_kernel_end_call(call->rxcall);
+		call->rxcall = NULL;
+	}
+	if (call->type->destructor)
+		call->type->destructor(call);
+}
+
+/*
+ * End a call and free it
+ */
+static void afs_end_call(struct afs_call *call)
+{
+	afs_end_call_nofree(call);
+	afs_free_call(call);
+}
+
+/*
  * allocate a call with flat request and reply buffers
  */
 struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
@@ -326,7 +355,8 @@ int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
 	       atomic_read(&afs_outstanding_calls));
 
 	call->wait_mode = wait_mode;
-	INIT_WORK(&call->async_work, afs_process_async_call);
+	call->async_workfn = afs_process_async_call;
+	INIT_WORK(&call->async_work, afs_async_workfn);
 
 	memset(&srx, 0, sizeof(srx));
 	srx.srx_family = AF_RXRPC;
@@ -383,11 +413,8 @@ error_do_abort:
 	rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT);
 	while ((skb = skb_dequeue(&call->rx_queue)))
 		afs_free_skb(skb);
-	rxrpc_kernel_end_call(rxcall);
-	call->rxcall = NULL;
 error_kill_call:
-	call->type->destructor(call);
-	afs_free_call(call);
+	afs_end_call(call);
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -509,12 +536,8 @@ static void afs_deliver_to_call(struct afs_call *call)
 	if (call->state >= AFS_CALL_COMPLETE) {
 		while ((skb = skb_dequeue(&call->rx_queue)))
 			afs_free_skb(skb);
-		if (call->incoming) {
-			rxrpc_kernel_end_call(call->rxcall);
-			call->rxcall = NULL;
-			call->type->destructor(call);
-			afs_free_call(call);
-		}
+		if (call->incoming)
+			afs_end_call(call);
 	}
 
 	_leave("");
@@ -564,10 +587,7 @@ static int afs_wait_for_call_to_complete(struct afs_call *call)
 	}
 
 	_debug("call complete");
-	rxrpc_kernel_end_call(call->rxcall);
-	call->rxcall = NULL;
-	call->type->destructor(call);
-	afs_free_call(call);
+	afs_end_call(call);
 	_leave(" = %d", ret);
 	return ret;
 }
@@ -603,11 +623,8 @@ static int afs_dont_wait_for_call_to_complete(struct afs_call *call)
 /*
  * delete an asynchronous call
  */
-static void afs_delete_async_call(struct work_struct *work)
+static void afs_delete_async_call(struct afs_call *call)
 {
-	struct afs_call *call =
-		container_of(work, struct afs_call, async_work);
-
 	_enter("");
 
 	afs_free_call(call);
@@ -620,11 +637,8 @@ static void afs_delete_async_call(struct work_struct *work)
  * - on a multiple-thread workqueue this work item may try to run on several
  *   CPUs at the same time
  */
-static void afs_process_async_call(struct work_struct *work)
+static void afs_process_async_call(struct afs_call *call)
 {
-	struct afs_call *call =
-		container_of(work, struct afs_call, async_work);
-
 	_enter("");
 
 	if (!skb_queue_empty(&call->rx_queue))
@@ -637,10 +651,7 @@ static void afs_process_async_call(struct work_struct *work)
 		call->reply = NULL;
 
 		/* kill the call */
-		rxrpc_kernel_end_call(call->rxcall);
-		call->rxcall = NULL;
-		if (call->type->destructor)
-			call->type->destructor(call);
+		afs_end_call_nofree(call);
 
 		/* we can't just delete the call because the work item may be
 		 * queued */
@@ -663,13 +674,6 @@ void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb)
 	call->reply_size += len;
 }
 
-static void afs_async_workfn(struct work_struct *work)
-{
-	struct afs_call *call = container_of(work, struct afs_call, async_work);
-
-	call->async_workfn(work);
-}
-
 /*
  * accept the backlog of incoming calls
  */
@@ -790,10 +794,7 @@ void afs_send_empty_reply(struct afs_call *call)
 		_debug("oom");
 		rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
 	default:
-		rxrpc_kernel_end_call(call->rxcall);
-		call->rxcall = NULL;
-		call->type->destructor(call);
-		afs_free_call(call);
+		afs_end_call(call);
 		_leave(" [error]");
 		return;
 	}
@@ -823,17 +824,16 @@ void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
 	call->state = AFS_CALL_AWAIT_ACK;
 	n = rxrpc_kernel_send_data(call->rxcall, &msg, len);
 	if (n >= 0) {
+		/* Success */
 		_leave(" [replied]");
 		return;
 	}
+
 	if (n == -ENOMEM) {
 		_debug("oom");
 		rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
 	}
-	rxrpc_kernel_end_call(call->rxcall);
-	call->rxcall = NULL;
-	call->type->destructor(call);
-	afs_free_call(call);
+	afs_end_call(call);
 	_leave(" [error]");
 }
 
diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c
index 2caf36ac3e93..cc87c1abac97 100644
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -179,7 +179,7 @@ static struct dentry *autofs4_lookup_active(struct dentry *dentry)
 		spin_lock(&active->d_lock);
 
 		/* Already gone? */
-		if (!d_count(active))
+		if ((int) d_count(active) <= 0)
 			goto next;
 
 		qstr = &active->d_name;
@@ -230,7 +230,7 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry)
 
 		spin_lock(&expiring->d_lock);
 
-		/* Bad luck, we've already been dentry_iput */
+		/* We've already been dentry_iput or unlinked */
 		if (!expiring->d_inode)
 			goto next;
 
diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c
deleted file mode 100644
index 1c2ce0c87711..000000000000
--- a/fs/bio-integrity.c
+++ /dev/null
@@ -1,657 +0,0 @@
-/*
- * bio-integrity.c - bio data integrity extensions
- *
- * Copyright (C) 2007, 2008, 2009 Oracle Corporation
- * Written by: Martin K. Petersen <martin.petersen@oracle.com>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; see the file COPYING.  If not, write to
- * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139,
- * USA.
- *
- */
-
-#include <linux/blkdev.h>
-#include <linux/mempool.h>
-#include <linux/export.h>
-#include <linux/bio.h>
-#include <linux/workqueue.h>
-#include <linux/slab.h>
-
-#define BIP_INLINE_VECS	4
-
-static struct kmem_cache *bip_slab;
-static struct workqueue_struct *kintegrityd_wq;
-
-/**
- * bio_integrity_alloc - Allocate integrity payload and attach it to bio
- * @bio:	bio to attach integrity metadata to
- * @gfp_mask:	Memory allocation mask
- * @nr_vecs:	Number of integrity metadata scatter-gather elements
- *
- * Description: This function prepares a bio for attaching integrity
- * metadata.  nr_vecs specifies the maximum number of pages containing
- * integrity metadata that can be attached.
- */
-struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
-						  gfp_t gfp_mask,
-						  unsigned int nr_vecs)
-{
-	struct bio_integrity_payload *bip;
-	struct bio_set *bs = bio->bi_pool;
-	unsigned long idx = BIO_POOL_NONE;
-	unsigned inline_vecs;
-
-	if (!bs) {
-		bip = kmalloc(sizeof(struct bio_integrity_payload) +
-			      sizeof(struct bio_vec) * nr_vecs, gfp_mask);
-		inline_vecs = nr_vecs;
-	} else {
-		bip = mempool_alloc(bs->bio_integrity_pool, gfp_mask);
-		inline_vecs = BIP_INLINE_VECS;
-	}
-
-	if (unlikely(!bip))
-		return NULL;
-
-	memset(bip, 0, sizeof(*bip));
-
-	if (nr_vecs > inline_vecs) {
-		bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
-					  bs->bvec_integrity_pool);
-		if (!bip->bip_vec)
-			goto err;
-	} else {
-		bip->bip_vec = bip->bip_inline_vecs;
-	}
-
-	bip->bip_slab = idx;
-	bip->bip_bio = bio;
-	bio->bi_integrity = bip;
-
-	return bip;
-err:
-	mempool_free(bip, bs->bio_integrity_pool);
-	return NULL;
-}
-EXPORT_SYMBOL(bio_integrity_alloc);
-
-/**
- * bio_integrity_free - Free bio integrity payload
- * @bio:	bio containing bip to be freed
- *
- * Description: Used to free the integrity portion of a bio. Usually
- * called from bio_free().
- */
-void bio_integrity_free(struct bio *bio)
-{
-	struct bio_integrity_payload *bip = bio->bi_integrity;
-	struct bio_set *bs = bio->bi_pool;
-
-	if (bip->bip_owns_buf)
-		kfree(bip->bip_buf);
-
-	if (bs) {
-		if (bip->bip_slab != BIO_POOL_NONE)
-			bvec_free(bs->bvec_integrity_pool, bip->bip_vec,
-				  bip->bip_slab);
-
-		mempool_free(bip, bs->bio_integrity_pool);
-	} else {
-		kfree(bip);
-	}
-
-	bio->bi_integrity = NULL;
-}
-EXPORT_SYMBOL(bio_integrity_free);
-
-static inline unsigned int bip_integrity_vecs(struct bio_integrity_payload *bip)
-{
-	if (bip->bip_slab == BIO_POOL_NONE)
-		return BIP_INLINE_VECS;
-
-	return bvec_nr_vecs(bip->bip_slab);
-}
-
-/**
- * bio_integrity_add_page - Attach integrity metadata
- * @bio:	bio to update
- * @page:	page containing integrity metadata
- * @len:	number of bytes of integrity metadata in page
- * @offset:	start offset within page
- *
- * Description: Attach a page containing integrity metadata to bio.
- */
-int bio_integrity_add_page(struct bio *bio, struct page *page,
-			   unsigned int len, unsigned int offset)
-{
-	struct bio_integrity_payload *bip = bio->bi_integrity;
-	struct bio_vec *iv;
-
-	if (bip->bip_vcnt >= bip_integrity_vecs(bip)) {
-		printk(KERN_ERR "%s: bip_vec full\n", __func__);
-		return 0;
-	}
-
-	iv = bip->bip_vec + bip->bip_vcnt;
-
-	iv->bv_page = page;
-	iv->bv_len = len;
-	iv->bv_offset = offset;
-	bip->bip_vcnt++;
-
-	return len;
-}
-EXPORT_SYMBOL(bio_integrity_add_page);
-
-static int bdev_integrity_enabled(struct block_device *bdev, int rw)
-{
-	struct blk_integrity *bi = bdev_get_integrity(bdev);
-
-	if (bi == NULL)
-		return 0;
-
-	if (rw == READ && bi->verify_fn != NULL &&
-	    (bi->flags & INTEGRITY_FLAG_READ))
-		return 1;
-
-	if (rw == WRITE && bi->generate_fn != NULL &&
-	    (bi->flags & INTEGRITY_FLAG_WRITE))
-		return 1;
-
-	return 0;
-}
-
-/**
- * bio_integrity_enabled - Check whether integrity can be passed
- * @bio:	bio to check
- *
- * Description: Determines whether bio_integrity_prep() can be called
- * on this bio or not.	bio data direction and target device must be
- * set prior to calling.  The functions honors the write_generate and
- * read_verify flags in sysfs.
- */
-int bio_integrity_enabled(struct bio *bio)
-{
-	if (!bio_is_rw(bio))
-		return 0;
-
-	/* Already protected? */
-	if (bio_integrity(bio))
-		return 0;
-
-	return bdev_integrity_enabled(bio->bi_bdev, bio_data_dir(bio));
-}
-EXPORT_SYMBOL(bio_integrity_enabled);
-
-/**
- * bio_integrity_hw_sectors - Convert 512b sectors to hardware ditto
- * @bi:		blk_integrity profile for device
- * @sectors:	Number of 512 sectors to convert
- *
- * Description: The block layer calculates everything in 512 byte
- * sectors but integrity metadata is done in terms of the hardware
- * sector size of the storage device.  Convert the block layer sectors
- * to physical sectors.
- */
-static inline unsigned int bio_integrity_hw_sectors(struct blk_integrity *bi,
-						    unsigned int sectors)
-{
-	/* At this point there are only 512b or 4096b DIF/EPP devices */
-	if (bi->sector_size == 4096)
-		return sectors >>= 3;
-
-	return sectors;
-}
-
-static inline unsigned int bio_integrity_bytes(struct blk_integrity *bi,
-					       unsigned int sectors)
-{
-	return bio_integrity_hw_sectors(bi, sectors) * bi->tuple_size;
-}
-
-/**
- * bio_integrity_tag_size - Retrieve integrity tag space
- * @bio:	bio to inspect
- *
- * Description: Returns the maximum number of tag bytes that can be
- * attached to this bio. Filesystems can use this to determine how
- * much metadata to attach to an I/O.
- */
-unsigned int bio_integrity_tag_size(struct bio *bio)
-{
-	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-
-	BUG_ON(bio->bi_iter.bi_size == 0);
-
-	return bi->tag_size * (bio->bi_iter.bi_size / bi->sector_size);
-}
-EXPORT_SYMBOL(bio_integrity_tag_size);
-
-static int bio_integrity_tag(struct bio *bio, void *tag_buf, unsigned int len,
-			     int set)
-{
-	struct bio_integrity_payload *bip = bio->bi_integrity;
-	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-	unsigned int nr_sectors;
-
-	BUG_ON(bip->bip_buf == NULL);
-
-	if (bi->tag_size == 0)
-		return -1;
-
-	nr_sectors = bio_integrity_hw_sectors(bi,
-					DIV_ROUND_UP(len, bi->tag_size));
-
-	if (nr_sectors * bi->tuple_size > bip->bip_iter.bi_size) {
-		printk(KERN_ERR "%s: tag too big for bio: %u > %u\n", __func__,
-		       nr_sectors * bi->tuple_size, bip->bip_iter.bi_size);
-		return -1;
-	}
-
-	if (set)
-		bi->set_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
-	else
-		bi->get_tag_fn(bip->bip_buf, tag_buf, nr_sectors);
-
-	return 0;
-}
-
-/**
- * bio_integrity_set_tag - Attach a tag buffer to a bio
- * @bio:	bio to attach buffer to
- * @tag_buf:	Pointer to a buffer containing tag data
- * @len:	Length of the included buffer
- *
- * Description: Use this function to tag a bio by leveraging the extra
- * space provided by devices formatted with integrity protection.  The
- * size of the integrity buffer must be <= to the size reported by
- * bio_integrity_tag_size().
- */
-int bio_integrity_set_tag(struct bio *bio, void *tag_buf, unsigned int len)
-{
-	BUG_ON(bio_data_dir(bio) != WRITE);
-
-	return bio_integrity_tag(bio, tag_buf, len, 1);
-}
-EXPORT_SYMBOL(bio_integrity_set_tag);
-
-/**
- * bio_integrity_get_tag - Retrieve a tag buffer from a bio
- * @bio:	bio to retrieve buffer from
- * @tag_buf:	Pointer to a buffer for the tag data
- * @len:	Length of the target buffer
- *
- * Description: Use this function to retrieve the tag buffer from a
- * completed I/O. The size of the integrity buffer must be <= to the
- * size reported by bio_integrity_tag_size().
- */
-int bio_integrity_get_tag(struct bio *bio, void *tag_buf, unsigned int len)
-{
-	BUG_ON(bio_data_dir(bio) != READ);
-
-	return bio_integrity_tag(bio, tag_buf, len, 0);
-}
-EXPORT_SYMBOL(bio_integrity_get_tag);
-
-/**
- * bio_integrity_generate_verify - Generate/verify integrity metadata for a bio
- * @bio:	bio to generate/verify integrity metadata for
- * @operate:	operate number, 1 for generate, 0 for verify
- */
-static int bio_integrity_generate_verify(struct bio *bio, int operate)
-{
-	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-	struct blk_integrity_exchg bix;
-	struct bio_vec *bv;
-	sector_t sector;
-	unsigned int sectors, ret = 0, i;
-	void *prot_buf = bio->bi_integrity->bip_buf;
-
-	if (operate)
-		sector = bio->bi_iter.bi_sector;
-	else
-		sector = bio->bi_integrity->bip_iter.bi_sector;
-
-	bix.disk_name = bio->bi_bdev->bd_disk->disk_name;
-	bix.sector_size = bi->sector_size;
-
-	bio_for_each_segment_all(bv, bio, i) {
-		void *kaddr = kmap_atomic(bv->bv_page);
-		bix.data_buf = kaddr + bv->bv_offset;
-		bix.data_size = bv->bv_len;
-		bix.prot_buf = prot_buf;
-		bix.sector = sector;
-
-		if (operate)
-			bi->generate_fn(&bix);
-		else {
-			ret = bi->verify_fn(&bix);
-			if (ret) {
-				kunmap_atomic(kaddr);
-				return ret;
-			}
-		}
-
-		sectors = bv->bv_len / bi->sector_size;
-		sector += sectors;
-		prot_buf += sectors * bi->tuple_size;
-
-		kunmap_atomic(kaddr);
-	}
-	return ret;
-}
-
-/**
- * bio_integrity_generate - Generate integrity metadata for a bio
- * @bio:	bio to generate integrity metadata for
- *
- * Description: Generates integrity metadata for a bio by calling the
- * block device's generation callback function.  The bio must have a
- * bip attached with enough room to accommodate the generated
- * integrity metadata.
- */
-static void bio_integrity_generate(struct bio *bio)
-{
-	bio_integrity_generate_verify(bio, 1);
-}
-
-static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi)
-{
-	if (bi)
-		return bi->tuple_size;
-
-	return 0;
-}
-
-/**
- * bio_integrity_prep - Prepare bio for integrity I/O
- * @bio:	bio to prepare
- *
- * Description: Allocates a buffer for integrity metadata, maps the
- * pages and attaches them to a bio.  The bio must have data
- * direction, target device and start sector set priot to calling.  In
- * the WRITE case, integrity metadata will be generated using the
- * block device's integrity function.  In the READ case, the buffer
- * will be prepared for DMA and a suitable end_io handler set up.
- */
-int bio_integrity_prep(struct bio *bio)
-{
-	struct bio_integrity_payload *bip;
-	struct blk_integrity *bi;
-	struct request_queue *q;
-	void *buf;
-	unsigned long start, end;
-	unsigned int len, nr_pages;
-	unsigned int bytes, offset, i;
-	unsigned int sectors;
-
-	bi = bdev_get_integrity(bio->bi_bdev);
-	q = bdev_get_queue(bio->bi_bdev);
-	BUG_ON(bi == NULL);
-	BUG_ON(bio_integrity(bio));
-
-	sectors = bio_integrity_hw_sectors(bi, bio_sectors(bio));
-
-	/* Allocate kernel buffer for protection data */
-	len = sectors * blk_integrity_tuple_size(bi);
-	buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
-	if (unlikely(buf == NULL)) {
-		printk(KERN_ERR "could not allocate integrity buffer\n");
-		return -ENOMEM;
-	}
-
-	end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	start = ((unsigned long) buf) >> PAGE_SHIFT;
-	nr_pages = end - start;
-
-	/* Allocate bio integrity payload and integrity vectors */
-	bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages);
-	if (unlikely(bip == NULL)) {
-		printk(KERN_ERR "could not allocate data integrity bioset\n");
-		kfree(buf);
-		return -EIO;
-	}
-
-	bip->bip_owns_buf = 1;
-	bip->bip_buf = buf;
-	bip->bip_iter.bi_size = len;
-	bip->bip_iter.bi_sector = bio->bi_iter.bi_sector;
-
-	/* Map it */
-	offset = offset_in_page(buf);
-	for (i = 0 ; i < nr_pages ; i++) {
-		int ret;
-		bytes = PAGE_SIZE - offset;
-
-		if (len <= 0)
-			break;
-
-		if (bytes > len)
-			bytes = len;
-
-		ret = bio_integrity_add_page(bio, virt_to_page(buf),
-					     bytes, offset);
-
-		if (ret == 0)
-			return 0;
-
-		if (ret < bytes)
-			break;
-
-		buf += bytes;
-		len -= bytes;
-		offset = 0;
-	}
-
-	/* Install custom I/O completion handler if read verify is enabled */
-	if (bio_data_dir(bio) == READ) {
-		bip->bip_end_io = bio->bi_end_io;
-		bio->bi_end_io = bio_integrity_endio;
-	}
-
-	/* Auto-generate integrity metadata if this is a write */
-	if (bio_data_dir(bio) == WRITE)
-		bio_integrity_generate(bio);
-
-	return 0;
-}
-EXPORT_SYMBOL(bio_integrity_prep);
-
-/**
- * bio_integrity_verify - Verify integrity metadata for a bio
- * @bio:	bio to verify
- *
- * Description: This function is called to verify the integrity of a
- * bio.	 The data in the bio io_vec is compared to the integrity
- * metadata returned by the HBA.
- */
-static int bio_integrity_verify(struct bio *bio)
-{
-	return bio_integrity_generate_verify(bio, 0);
-}
-
-/**
- * bio_integrity_verify_fn - Integrity I/O completion worker
- * @work:	Work struct stored in bio to be verified
- *
- * Description: This workqueue function is called to complete a READ
- * request.  The function verifies the transferred integrity metadata
- * and then calls the original bio end_io function.
- */
-static void bio_integrity_verify_fn(struct work_struct *work)
-{
-	struct bio_integrity_payload *bip =
-		container_of(work, struct bio_integrity_payload, bip_work);
-	struct bio *bio = bip->bip_bio;
-	int error;
-
-	error = bio_integrity_verify(bio);
-
-	/* Restore original bio completion handler */
-	bio->bi_end_io = bip->bip_end_io;
-	bio_endio_nodec(bio, error);
-}
-
-/**
- * bio_integrity_endio - Integrity I/O completion function
- * @bio:	Protected bio
- * @error:	Pointer to errno
- *
- * Description: Completion for integrity I/O
- *
- * Normally I/O completion is done in interrupt context.  However,
- * verifying I/O integrity is a time-consuming task which must be run
- * in process context.	This function postpones completion
- * accordingly.
- */
-void bio_integrity_endio(struct bio *bio, int error)
-{
-	struct bio_integrity_payload *bip = bio->bi_integrity;
-
-	BUG_ON(bip->bip_bio != bio);
-
-	/* In case of an I/O error there is no point in verifying the
-	 * integrity metadata.  Restore original bio end_io handler
-	 * and run it.
-	 */
-	if (error) {
-		bio->bi_end_io = bip->bip_end_io;
-		bio_endio(bio, error);
-
-		return;
-	}
-
-	INIT_WORK(&bip->bip_work, bio_integrity_verify_fn);
-	queue_work(kintegrityd_wq, &bip->bip_work);
-}
-EXPORT_SYMBOL(bio_integrity_endio);
-
-/**
- * bio_integrity_advance - Advance integrity vector
- * @bio:	bio whose integrity vector to update
- * @bytes_done:	number of data bytes that have been completed
- *
- * Description: This function calculates how many integrity bytes the
- * number of completed data bytes correspond to and advances the
- * integrity vector accordingly.
- */
-void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
-{
-	struct bio_integrity_payload *bip = bio->bi_integrity;
-	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-	unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
-
-	bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
-}
-EXPORT_SYMBOL(bio_integrity_advance);
-
-/**
- * bio_integrity_trim - Trim integrity vector
- * @bio:	bio whose integrity vector to update
- * @offset:	offset to first data sector
- * @sectors:	number of data sectors
- *
- * Description: Used to trim the integrity vector in a cloned bio.
- * The ivec will be advanced corresponding to 'offset' data sectors
- * and the length will be truncated corresponding to 'len' data
- * sectors.
- */
-void bio_integrity_trim(struct bio *bio, unsigned int offset,
-			unsigned int sectors)
-{
-	struct bio_integrity_payload *bip = bio->bi_integrity;
-	struct blk_integrity *bi = bdev_get_integrity(bio->bi_bdev);
-
-	bio_integrity_advance(bio, offset << 9);
-	bip->bip_iter.bi_size = bio_integrity_bytes(bi, sectors);
-}
-EXPORT_SYMBOL(bio_integrity_trim);
-
-/**
- * bio_integrity_clone - Callback for cloning bios with integrity metadata
- * @bio:	New bio
- * @bio_src:	Original bio
- * @gfp_mask:	Memory allocation mask
- *
- * Description:	Called to allocate a bip when cloning a bio
- */
-int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
-			gfp_t gfp_mask)
-{
-	struct bio_integrity_payload *bip_src = bio_src->bi_integrity;
-	struct bio_integrity_payload *bip;
-
-	BUG_ON(bip_src == NULL);
-
-	bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt);
-
-	if (bip == NULL)
-		return -EIO;
-
-	memcpy(bip->bip_vec, bip_src->bip_vec,
-	       bip_src->bip_vcnt * sizeof(struct bio_vec));
-
-	bip->bip_vcnt = bip_src->bip_vcnt;
-	bip->bip_iter = bip_src->bip_iter;
-
-	return 0;
-}
-EXPORT_SYMBOL(bio_integrity_clone);
-
-int bioset_integrity_create(struct bio_set *bs, int pool_size)
-{
-	if (bs->bio_integrity_pool)
-		return 0;
-
-	bs->bio_integrity_pool = mempool_create_slab_pool(pool_size, bip_slab);
-	if (!bs->bio_integrity_pool)
-		return -1;
-
-	bs->bvec_integrity_pool = biovec_create_pool(bs, pool_size);
-	if (!bs->bvec_integrity_pool) {
-		mempool_destroy(bs->bio_integrity_pool);
-		return -1;
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(bioset_integrity_create);
-
-void bioset_integrity_free(struct bio_set *bs)
-{
-	if (bs->bio_integrity_pool)
-		mempool_destroy(bs->bio_integrity_pool);
-
-	if (bs->bvec_integrity_pool)
-		mempool_destroy(bs->bvec_integrity_pool);
-}
-EXPORT_SYMBOL(bioset_integrity_free);
-
-void __init bio_integrity_init(void)
-{
-	/*
-	 * kintegrityd won't block much but may burn a lot of CPU cycles.
-	 * Make it highpri CPU intensive wq with max concurrency of 1.
-	 */
-	kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM |
-					 WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1);
-	if (!kintegrityd_wq)
-		panic("Failed to create kintegrityd\n");
-
-	bip_slab = kmem_cache_create("bio_integrity_payload",
-				     sizeof(struct bio_integrity_payload) +
-				     sizeof(struct bio_vec) * BIP_INLINE_VECS,
-				     0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-	if (!bip_slab)
-		panic("Failed to create slab\n");
-}
diff --git a/fs/bio.c b/fs/bio.c
deleted file mode 100644
index 6f0362b77806..000000000000
--- a/fs/bio.c
+++ /dev/null
@@ -1,2037 +0,0 @@
-/*
- * Copyright (C) 2001 Jens Axboe <axboe@kernel.dk>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public Licens
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
- *
- */
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/uio.h>
-#include <linux/iocontext.h>
-#include <linux/slab.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/mempool.h>
-#include <linux/workqueue.h>
-#include <linux/cgroup.h>
-#include <scsi/sg.h>		/* for struct sg_iovec */
-
-#include <trace/events/block.h>
-
-/*
- * Test patch to inline a certain number of bi_io_vec's inside the bio
- * itself, to shrink a bio data allocation from two mempool calls to one
- */
-#define BIO_INLINE_VECS		4
-
-/*
- * if you change this list, also change bvec_alloc or things will
- * break badly! cannot be bigger than what you can fit into an
- * unsigned short
- */
-#define BV(x) { .nr_vecs = x, .name = "biovec-"__stringify(x) }
-static struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly = {
-	BV(1), BV(4), BV(16), BV(64), BV(128), BV(BIO_MAX_PAGES),
-};
-#undef BV
-
-/*
- * fs_bio_set is the bio_set containing bio and iovec memory pools used by
- * IO code that does not need private memory pools.
- */
-struct bio_set *fs_bio_set;
-EXPORT_SYMBOL(fs_bio_set);
-
-/*
- * Our slab pool management
- */
-struct bio_slab {
-	struct kmem_cache *slab;
-	unsigned int slab_ref;
-	unsigned int slab_size;
-	char name[8];
-};
-static DEFINE_MUTEX(bio_slab_lock);
-static struct bio_slab *bio_slabs;
-static unsigned int bio_slab_nr, bio_slab_max;
-
-static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
-{
-	unsigned int sz = sizeof(struct bio) + extra_size;
-	struct kmem_cache *slab = NULL;
-	struct bio_slab *bslab, *new_bio_slabs;
-	unsigned int new_bio_slab_max;
-	unsigned int i, entry = -1;
-
-	mutex_lock(&bio_slab_lock);
-
-	i = 0;
-	while (i < bio_slab_nr) {
-		bslab = &bio_slabs[i];
-
-		if (!bslab->slab && entry == -1)
-			entry = i;
-		else if (bslab->slab_size == sz) {
-			slab = bslab->slab;
-			bslab->slab_ref++;
-			break;
-		}
-		i++;
-	}
-
-	if (slab)
-		goto out_unlock;
-
-	if (bio_slab_nr == bio_slab_max && entry == -1) {
-		new_bio_slab_max = bio_slab_max << 1;
-		new_bio_slabs = krealloc(bio_slabs,
-					 new_bio_slab_max * sizeof(struct bio_slab),
-					 GFP_KERNEL);
-		if (!new_bio_slabs)
-			goto out_unlock;
-		bio_slab_max = new_bio_slab_max;
-		bio_slabs = new_bio_slabs;
-	}
-	if (entry == -1)
-		entry = bio_slab_nr++;
-
-	bslab = &bio_slabs[entry];
-
-	snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
-	slab = kmem_cache_create(bslab->name, sz, 0, SLAB_HWCACHE_ALIGN, NULL);
-	if (!slab)
-		goto out_unlock;
-
-	bslab->slab = slab;
-	bslab->slab_ref = 1;
-	bslab->slab_size = sz;
-out_unlock:
-	mutex_unlock(&bio_slab_lock);
-	return slab;
-}
-
-static void bio_put_slab(struct bio_set *bs)
-{
-	struct bio_slab *bslab = NULL;
-	unsigned int i;
-
-	mutex_lock(&bio_slab_lock);
-
-	for (i = 0; i < bio_slab_nr; i++) {
-		if (bs->bio_slab == bio_slabs[i].slab) {
-			bslab = &bio_slabs[i];
-			break;
-		}
-	}
-
-	if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
-		goto out;
-
-	WARN_ON(!bslab->slab_ref);
-
-	if (--bslab->slab_ref)
-		goto out;
-
-	kmem_cache_destroy(bslab->slab);
-	bslab->slab = NULL;
-
-out:
-	mutex_unlock(&bio_slab_lock);
-}
-
-unsigned int bvec_nr_vecs(unsigned short idx)
-{
-	return bvec_slabs[idx].nr_vecs;
-}
-
-void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
-{
-	BIO_BUG_ON(idx >= BIOVEC_NR_POOLS);
-
-	if (idx == BIOVEC_MAX_IDX)
-		mempool_free(bv, pool);
-	else {
-		struct biovec_slab *bvs = bvec_slabs + idx;
-
-		kmem_cache_free(bvs->slab, bv);
-	}
-}
-
-struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
-			   mempool_t *pool)
-{
-	struct bio_vec *bvl;
-
-	/*
-	 * see comment near bvec_array define!
-	 */
-	switch (nr) {
-	case 1:
-		*idx = 0;
-		break;
-	case 2 ... 4:
-		*idx = 1;
-		break;
-	case 5 ... 16:
-		*idx = 2;
-		break;
-	case 17 ... 64:
-		*idx = 3;
-		break;
-	case 65 ... 128:
-		*idx = 4;
-		break;
-	case 129 ... BIO_MAX_PAGES:
-		*idx = 5;
-		break;
-	default:
-		return NULL;
-	}
-
-	/*
-	 * idx now points to the pool we want to allocate from. only the
-	 * 1-vec entry pool is mempool backed.
-	 */
-	if (*idx == BIOVEC_MAX_IDX) {
-fallback:
-		bvl = mempool_alloc(pool, gfp_mask);
-	} else {
-		struct biovec_slab *bvs = bvec_slabs + *idx;
-		gfp_t __gfp_mask = gfp_mask & ~(__GFP_WAIT | __GFP_IO);
-
-		/*
-		 * Make this allocation restricted and don't dump info on
-		 * allocation failures, since we'll fallback to the mempool
-		 * in case of failure.
-		 */
-		__gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
-
-		/*
-		 * Try a slab allocation. If this fails and __GFP_WAIT
-		 * is set, retry with the 1-entry mempool
-		 */
-		bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
-		if (unlikely(!bvl && (gfp_mask & __GFP_WAIT))) {
-			*idx = BIOVEC_MAX_IDX;
-			goto fallback;
-		}
-	}
-
-	return bvl;
-}
-
-static void __bio_free(struct bio *bio)
-{
-	bio_disassociate_task(bio);
-
-	if (bio_integrity(bio))
-		bio_integrity_free(bio);
-}
-
-static void bio_free(struct bio *bio)
-{
-	struct bio_set *bs = bio->bi_pool;
-	void *p;
-
-	__bio_free(bio);
-
-	if (bs) {
-		if (bio_flagged(bio, BIO_OWNS_VEC))
-			bvec_free(bs->bvec_pool, bio->bi_io_vec, BIO_POOL_IDX(bio));
-
-		/*
-		 * If we have front padding, adjust the bio pointer before freeing
-		 */
-		p = bio;
-		p -= bs->front_pad;
-
-		mempool_free(p, bs->bio_pool);
-	} else {
-		/* Bio was allocated by bio_kmalloc() */
-		kfree(bio);
-	}
-}
-
-void bio_init(struct bio *bio)
-{
-	memset(bio, 0, sizeof(*bio));
-	bio->bi_flags = 1 << BIO_UPTODATE;
-	atomic_set(&bio->bi_remaining, 1);
-	atomic_set(&bio->bi_cnt, 1);
-}
-EXPORT_SYMBOL(bio_init);
-
-/**
- * bio_reset - reinitialize a bio
- * @bio:	bio to reset
- *
- * Description:
- *   After calling bio_reset(), @bio will be in the same state as a freshly
- *   allocated bio returned bio bio_alloc_bioset() - the only fields that are
- *   preserved are the ones that are initialized by bio_alloc_bioset(). See
- *   comment in struct bio.
- */
-void bio_reset(struct bio *bio)
-{
-	unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
-
-	__bio_free(bio);
-
-	memset(bio, 0, BIO_RESET_BYTES);
-	bio->bi_flags = flags|(1 << BIO_UPTODATE);
-	atomic_set(&bio->bi_remaining, 1);
-}
-EXPORT_SYMBOL(bio_reset);
-
-static void bio_chain_endio(struct bio *bio, int error)
-{
-	bio_endio(bio->bi_private, error);
-	bio_put(bio);
-}
-
-/**
- * bio_chain - chain bio completions
- *
- * The caller won't have a bi_end_io called when @bio completes - instead,
- * @parent's bi_end_io won't be called until both @parent and @bio have
- * completed; the chained bio will also be freed when it completes.
- *
- * The caller must not set bi_private or bi_end_io in @bio.
- */
-void bio_chain(struct bio *bio, struct bio *parent)
-{
-	BUG_ON(bio->bi_private || bio->bi_end_io);
-
-	bio->bi_private = parent;
-	bio->bi_end_io	= bio_chain_endio;
-	atomic_inc(&parent->bi_remaining);
-}
-EXPORT_SYMBOL(bio_chain);
-
-static void bio_alloc_rescue(struct work_struct *work)
-{
-	struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
-	struct bio *bio;
-
-	while (1) {
-		spin_lock(&bs->rescue_lock);
-		bio = bio_list_pop(&bs->rescue_list);
-		spin_unlock(&bs->rescue_lock);
-
-		if (!bio)
-			break;
-
-		generic_make_request(bio);
-	}
-}
-
-static void punt_bios_to_rescuer(struct bio_set *bs)
-{
-	struct bio_list punt, nopunt;
-	struct bio *bio;
-
-	/*
-	 * In order to guarantee forward progress we must punt only bios that
-	 * were allocated from this bio_set; otherwise, if there was a bio on
-	 * there for a stacking driver higher up in the stack, processing it
-	 * could require allocating bios from this bio_set, and doing that from
-	 * our own rescuer would be bad.
-	 *
-	 * Since bio lists are singly linked, pop them all instead of trying to
-	 * remove from the middle of the list:
-	 */
-
-	bio_list_init(&punt);
-	bio_list_init(&nopunt);
-
-	while ((bio = bio_list_pop(current->bio_list)))
-		bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio);
-
-	*current->bio_list = nopunt;
-
-	spin_lock(&bs->rescue_lock);
-	bio_list_merge(&bs->rescue_list, &punt);
-	spin_unlock(&bs->rescue_lock);
-
-	queue_work(bs->rescue_workqueue, &bs->rescue_work);
-}
-
-/**
- * bio_alloc_bioset - allocate a bio for I/O
- * @gfp_mask:   the GFP_ mask given to the slab allocator
- * @nr_iovecs:	number of iovecs to pre-allocate
- * @bs:		the bio_set to allocate from.
- *
- * Description:
- *   If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
- *   backed by the @bs's mempool.
- *
- *   When @bs is not NULL, if %__GFP_WAIT is set then bio_alloc will always be
- *   able to allocate a bio. This is due to the mempool guarantees. To make this
- *   work, callers must never allocate more than 1 bio at a time from this pool.
- *   Callers that need to allocate more than 1 bio must always submit the
- *   previously allocated bio for IO before attempting to allocate a new one.
- *   Failure to do so can cause deadlocks under memory pressure.
- *
- *   Note that when running under generic_make_request() (i.e. any block
- *   driver), bios are not submitted until after you return - see the code in
- *   generic_make_request() that converts recursion into iteration, to prevent
- *   stack overflows.
- *
- *   This would normally mean allocating multiple bios under
- *   generic_make_request() would be susceptible to deadlocks, but we have
- *   deadlock avoidance code that resubmits any blocked bios from a rescuer
- *   thread.
- *
- *   However, we do not guarantee forward progress for allocations from other
- *   mempools. Doing multiple allocations from the same mempool under
- *   generic_make_request() should be avoided - instead, use bio_set's front_pad
- *   for per bio allocations.
- *
- *   RETURNS:
- *   Pointer to new bio on success, NULL on failure.
- */
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs)
-{
-	gfp_t saved_gfp = gfp_mask;
-	unsigned front_pad;
-	unsigned inline_vecs;
-	unsigned long idx = BIO_POOL_NONE;
-	struct bio_vec *bvl = NULL;
-	struct bio *bio;
-	void *p;
-
-	if (!bs) {
-		if (nr_iovecs > UIO_MAXIOV)
-			return NULL;
-
-		p = kmalloc(sizeof(struct bio) +
-			    nr_iovecs * sizeof(struct bio_vec),
-			    gfp_mask);
-		front_pad = 0;
-		inline_vecs = nr_iovecs;
-	} else {
-		/*
-		 * generic_make_request() converts recursion to iteration; this
-		 * means if we're running beneath it, any bios we allocate and
-		 * submit will not be submitted (and thus freed) until after we
-		 * return.
-		 *
-		 * This exposes us to a potential deadlock if we allocate
-		 * multiple bios from the same bio_set() while running
-		 * underneath generic_make_request(). If we were to allocate
-		 * multiple bios (say a stacking block driver that was splitting
-		 * bios), we would deadlock if we exhausted the mempool's
-		 * reserve.
-		 *
-		 * We solve this, and guarantee forward progress, with a rescuer
-		 * workqueue per bio_set. If we go to allocate and there are
-		 * bios on current->bio_list, we first try the allocation
-		 * without __GFP_WAIT; if that fails, we punt those bios we
-		 * would be blocking to the rescuer workqueue before we retry
-		 * with the original gfp_flags.
-		 */
-
-		if (current->bio_list && !bio_list_empty(current->bio_list))
-			gfp_mask &= ~__GFP_WAIT;
-
-		p = mempool_alloc(bs->bio_pool, gfp_mask);
-		if (!p && gfp_mask != saved_gfp) {
-			punt_bios_to_rescuer(bs);
-			gfp_mask = saved_gfp;
-			p = mempool_alloc(bs->bio_pool, gfp_mask);
-		}
-
-		front_pad = bs->front_pad;
-		inline_vecs = BIO_INLINE_VECS;
-	}
-
-	if (unlikely(!p))
-		return NULL;
-
-	bio = p + front_pad;
-	bio_init(bio);
-
-	if (nr_iovecs > inline_vecs) {
-		bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
-		if (!bvl && gfp_mask != saved_gfp) {
-			punt_bios_to_rescuer(bs);
-			gfp_mask = saved_gfp;
-			bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, bs->bvec_pool);
-		}
-
-		if (unlikely(!bvl))
-			goto err_free;
-
-		bio->bi_flags |= 1 << BIO_OWNS_VEC;
-	} else if (nr_iovecs) {
-		bvl = bio->bi_inline_vecs;
-	}
-
-	bio->bi_pool = bs;
-	bio->bi_flags |= idx << BIO_POOL_OFFSET;
-	bio->bi_max_vecs = nr_iovecs;
-	bio->bi_io_vec = bvl;
-	return bio;
-
-err_free:
-	mempool_free(p, bs->bio_pool);
-	return NULL;
-}
-EXPORT_SYMBOL(bio_alloc_bioset);
-
-void zero_fill_bio(struct bio *bio)
-{
-	unsigned long flags;
-	struct bio_vec bv;
-	struct bvec_iter iter;
-
-	bio_for_each_segment(bv, bio, iter) {
-		char *data = bvec_kmap_irq(&bv, &flags);
-		memset(data, 0, bv.bv_len);
-		flush_dcache_page(bv.bv_page);
-		bvec_kunmap_irq(data, &flags);
-	}
-}
-EXPORT_SYMBOL(zero_fill_bio);
-
-/**
- * bio_put - release a reference to a bio
- * @bio:   bio to release reference to
- *
- * Description:
- *   Put a reference to a &struct bio, either one you have gotten with
- *   bio_alloc, bio_get or bio_clone. The last put of a bio will free it.
- **/
-void bio_put(struct bio *bio)
-{
-	BIO_BUG_ON(!atomic_read(&bio->bi_cnt));
-
-	/*
-	 * last put frees it
-	 */
-	if (atomic_dec_and_test(&bio->bi_cnt))
-		bio_free(bio);
-}
-EXPORT_SYMBOL(bio_put);
-
-inline int bio_phys_segments(struct request_queue *q, struct bio *bio)
-{
-	if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
-		blk_recount_segments(q, bio);
-
-	return bio->bi_phys_segments;
-}
-EXPORT_SYMBOL(bio_phys_segments);
-
-/**
- * 	__bio_clone_fast - clone a bio that shares the original bio's biovec
- * 	@bio: destination bio
- * 	@bio_src: bio to clone
- *
- *	Clone a &bio. Caller will own the returned bio, but not
- *	the actual data it points to. Reference count of returned
- * 	bio will be one.
- *
- * 	Caller must ensure that @bio_src is not freed before @bio.
- */
-void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
-{
-	BUG_ON(bio->bi_pool && BIO_POOL_IDX(bio) != BIO_POOL_NONE);
-
-	/*
-	 * most users will be overriding ->bi_bdev with a new target,
-	 * so we don't set nor calculate new physical/hw segment counts here
-	 */
-	bio->bi_bdev = bio_src->bi_bdev;
-	bio->bi_flags |= 1 << BIO_CLONED;
-	bio->bi_rw = bio_src->bi_rw;
-	bio->bi_iter = bio_src->bi_iter;
-	bio->bi_io_vec = bio_src->bi_io_vec;
-}
-EXPORT_SYMBOL(__bio_clone_fast);
-
-/**
- *	bio_clone_fast - clone a bio that shares the original bio's biovec
- *	@bio: bio to clone
- *	@gfp_mask: allocation priority
- *	@bs: bio_set to allocate from
- *
- * 	Like __bio_clone_fast, only also allocates the returned bio
- */
-struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
-{
-	struct bio *b;
-
-	b = bio_alloc_bioset(gfp_mask, 0, bs);
-	if (!b)
-		return NULL;
-
-	__bio_clone_fast(b, bio);
-
-	if (bio_integrity(bio)) {
-		int ret;
-
-		ret = bio_integrity_clone(b, bio, gfp_mask);
-
-		if (ret < 0) {
-			bio_put(b);
-			return NULL;
-		}
-	}
-
-	return b;
-}
-EXPORT_SYMBOL(bio_clone_fast);
-
-/**
- * 	bio_clone_bioset - clone a bio
- * 	@bio_src: bio to clone
- *	@gfp_mask: allocation priority
- *	@bs: bio_set to allocate from
- *
- *	Clone bio. Caller will own the returned bio, but not the actual data it
- *	points to. Reference count of returned bio will be one.
- */
-struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask,
-			     struct bio_set *bs)
-{
-	struct bvec_iter iter;
-	struct bio_vec bv;
-	struct bio *bio;
-
-	/*
-	 * Pre immutable biovecs, __bio_clone() used to just do a memcpy from
-	 * bio_src->bi_io_vec to bio->bi_io_vec.
-	 *
-	 * We can't do that anymore, because:
-	 *
-	 *  - The point of cloning the biovec is to produce a bio with a biovec
-	 *    the caller can modify: bi_idx and bi_bvec_done should be 0.
-	 *
-	 *  - The original bio could've had more than BIO_MAX_PAGES biovecs; if
-	 *    we tried to clone the whole thing bio_alloc_bioset() would fail.
-	 *    But the clone should succeed as long as the number of biovecs we
-	 *    actually need to allocate is fewer than BIO_MAX_PAGES.
-	 *
-	 *  - Lastly, bi_vcnt should not be looked at or relied upon by code
-	 *    that does not own the bio - reason being drivers don't use it for
-	 *    iterating over the biovec anymore, so expecting it to be kept up
-	 *    to date (i.e. for clones that share the parent biovec) is just
-	 *    asking for trouble and would force extra work on
-	 *    __bio_clone_fast() anyways.
-	 */
-
-	bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
-	if (!bio)
-		return NULL;
-
-	bio->bi_bdev		= bio_src->bi_bdev;
-	bio->bi_rw		= bio_src->bi_rw;
-	bio->bi_iter.bi_sector	= bio_src->bi_iter.bi_sector;
-	bio->bi_iter.bi_size	= bio_src->bi_iter.bi_size;
-
-	if (bio->bi_rw & REQ_DISCARD)
-		goto integrity_clone;
-
-	if (bio->bi_rw & REQ_WRITE_SAME) {
-		bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
-		goto integrity_clone;
-	}
-
-	bio_for_each_segment(bv, bio_src, iter)
-		bio->bi_io_vec[bio->bi_vcnt++] = bv;
-
-integrity_clone:
-	if (bio_integrity(bio_src)) {
-		int ret;
-
-		ret = bio_integrity_clone(bio, bio_src, gfp_mask);
-		if (ret < 0) {
-			bio_put(bio);
-			return NULL;
-		}
-	}
-
-	return bio;
-}
-EXPORT_SYMBOL(bio_clone_bioset);
-
-/**
- *	bio_get_nr_vecs		- return approx number of vecs
- *	@bdev:  I/O target
- *
- *	Return the approximate number of pages we can send to this target.
- *	There's no guarantee that you will be able to fit this number of pages
- *	into a bio, it does not account for dynamic restrictions that vary
- *	on offset.
- */
-int bio_get_nr_vecs(struct block_device *bdev)
-{
-	struct request_queue *q = bdev_get_queue(bdev);
-	int nr_pages;
-
-	nr_pages = min_t(unsigned,
-		     queue_max_segments(q),
-		     queue_max_sectors(q) / (PAGE_SIZE >> 9) + 1);
-
-	return min_t(unsigned, nr_pages, BIO_MAX_PAGES);
-
-}
-EXPORT_SYMBOL(bio_get_nr_vecs);
-
-static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page
-			  *page, unsigned int len, unsigned int offset,
-			  unsigned int max_sectors)
-{
-	int retried_segments = 0;
-	struct bio_vec *bvec;
-
-	/*
-	 * cloned bio must not modify vec list
-	 */
-	if (unlikely(bio_flagged(bio, BIO_CLONED)))
-		return 0;
-
-	if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
-		return 0;
-
-	/*
-	 * For filesystems with a blocksize smaller than the pagesize
-	 * we will often be called with the same page as last time and
-	 * a consecutive offset.  Optimize this special case.
-	 */
-	if (bio->bi_vcnt > 0) {
-		struct bio_vec *prev = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
-		if (page == prev->bv_page &&
-		    offset == prev->bv_offset + prev->bv_len) {
-			unsigned int prev_bv_len = prev->bv_len;
-			prev->bv_len += len;
-
-			if (q->merge_bvec_fn) {
-				struct bvec_merge_data bvm = {
-					/* prev_bvec is already charged in
-					   bi_size, discharge it in order to
-					   simulate merging updated prev_bvec
-					   as new bvec. */
-					.bi_bdev = bio->bi_bdev,
-					.bi_sector = bio->bi_iter.bi_sector,
-					.bi_size = bio->bi_iter.bi_size -
-						prev_bv_len,
-					.bi_rw = bio->bi_rw,
-				};
-
-				if (q->merge_bvec_fn(q, &bvm, prev) < prev->bv_len) {
-					prev->bv_len -= len;
-					return 0;
-				}
-			}
-
-			goto done;
-		}
-	}
-
-	if (bio->bi_vcnt >= bio->bi_max_vecs)
-		return 0;
-
-	/*
-	 * we might lose a segment or two here, but rather that than
-	 * make this too complex.
-	 */
-
-	while (bio->bi_phys_segments >= queue_max_segments(q)) {
-
-		if (retried_segments)
-			return 0;
-
-		retried_segments = 1;
-		blk_recount_segments(q, bio);
-	}
-
-	/*
-	 * setup the new entry, we might clear it again later if we
-	 * cannot add the page
-	 */
-	bvec = &bio->bi_io_vec[bio->bi_vcnt];
-	bvec->bv_page = page;
-	bvec->bv_len = len;
-	bvec->bv_offset = offset;
-
-	/*
-	 * if queue has other restrictions (eg varying max sector size
-	 * depending on offset), it can specify a merge_bvec_fn in the
-	 * queue to get further control
-	 */
-	if (q->merge_bvec_fn) {
-		struct bvec_merge_data bvm = {
-			.bi_bdev = bio->bi_bdev,
-			.bi_sector = bio->bi_iter.bi_sector,
-			.bi_size = bio->bi_iter.bi_size,
-			.bi_rw = bio->bi_rw,
-		};
-
-		/*
-		 * merge_bvec_fn() returns number of bytes it can accept
-		 * at this offset
-		 */
-		if (q->merge_bvec_fn(q, &bvm, bvec) < bvec->bv_len) {
-			bvec->bv_page = NULL;
-			bvec->bv_len = 0;
-			bvec->bv_offset = 0;
-			return 0;
-		}
-	}
-
-	/* If we may be able to merge these biovecs, force a recount */
-	if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec)))
-		bio->bi_flags &= ~(1 << BIO_SEG_VALID);
-
-	bio->bi_vcnt++;
-	bio->bi_phys_segments++;
- done:
-	bio->bi_iter.bi_size += len;
-	return len;
-}
-
-/**
- *	bio_add_pc_page	-	attempt to add page to bio
- *	@q: the target queue
- *	@bio: destination bio
- *	@page: page to add
- *	@len: vec entry length
- *	@offset: vec entry offset
- *
- *	Attempt to add a page to the bio_vec maplist. This can fail for a
- *	number of reasons, such as the bio being full or target block device
- *	limitations. The target block device must allow bio's up to PAGE_SIZE,
- *	so it is always possible to add a single page to an empty bio.
- *
- *	This should only be used by REQ_PC bios.
- */
-int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page,
-		    unsigned int len, unsigned int offset)
-{
-	return __bio_add_page(q, bio, page, len, offset,
-			      queue_max_hw_sectors(q));
-}
-EXPORT_SYMBOL(bio_add_pc_page);
-
-/**
- *	bio_add_page	-	attempt to add page to bio
- *	@bio: destination bio
- *	@page: page to add
- *	@len: vec entry length
- *	@offset: vec entry offset
- *
- *	Attempt to add a page to the bio_vec maplist. This can fail for a
- *	number of reasons, such as the bio being full or target block device
- *	limitations. The target block device must allow bio's up to PAGE_SIZE,
- *	so it is always possible to add a single page to an empty bio.
- */
-int bio_add_page(struct bio *bio, struct page *page, unsigned int len,
-		 unsigned int offset)
-{
-	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-	return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q));
-}
-EXPORT_SYMBOL(bio_add_page);
-
-struct submit_bio_ret {
-	struct completion event;
-	int error;
-};
-
-static void submit_bio_wait_endio(struct bio *bio, int error)
-{
-	struct submit_bio_ret *ret = bio->bi_private;
-
-	ret->error = error;
-	complete(&ret->event);
-}
-
-/**
- * submit_bio_wait - submit a bio, and wait until it completes
- * @rw: whether to %READ or %WRITE, or maybe to %READA (read ahead)
- * @bio: The &struct bio which describes the I/O
- *
- * Simple wrapper around submit_bio(). Returns 0 on success, or the error from
- * bio_endio() on failure.
- */
-int submit_bio_wait(int rw, struct bio *bio)
-{
-	struct submit_bio_ret ret;
-
-	rw |= REQ_SYNC;
-	init_completion(&ret.event);
-	bio->bi_private = &ret;
-	bio->bi_end_io = submit_bio_wait_endio;
-	submit_bio(rw, bio);
-	wait_for_completion(&ret.event);
-
-	return ret.error;
-}
-EXPORT_SYMBOL(submit_bio_wait);
-
-/**
- * bio_advance - increment/complete a bio by some number of bytes
- * @bio:	bio to advance
- * @bytes:	number of bytes to complete
- *
- * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
- * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
- * be updated on the last bvec as well.
- *
- * @bio will then represent the remaining, uncompleted portion of the io.
- */
-void bio_advance(struct bio *bio, unsigned bytes)
-{
-	if (bio_integrity(bio))
-		bio_integrity_advance(bio, bytes);
-
-	bio_advance_iter(bio, &bio->bi_iter, bytes);
-}
-EXPORT_SYMBOL(bio_advance);
-
-/**
- * bio_alloc_pages - allocates a single page for each bvec in a bio
- * @bio: bio to allocate pages for
- * @gfp_mask: flags for allocation
- *
- * Allocates pages up to @bio->bi_vcnt.
- *
- * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
- * freed.
- */
-int bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
-{
-	int i;
-	struct bio_vec *bv;
-
-	bio_for_each_segment_all(bv, bio, i) {
-		bv->bv_page = alloc_page(gfp_mask);
-		if (!bv->bv_page) {
-			while (--bv >= bio->bi_io_vec)
-				__free_page(bv->bv_page);
-			return -ENOMEM;
-		}
-	}
-
-	return 0;
-}
-EXPORT_SYMBOL(bio_alloc_pages);
-
-/**
- * bio_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @src: source bio list
- * @dst: destination bio list
- *
- * If @src and @dst are single bios, bi_next must be NULL - otherwise, treats
- * @src and @dst as linked lists of bios.
- *
- * Stops when it reaches the end of either @src or @dst - that is, copies
- * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios).
- */
-void bio_copy_data(struct bio *dst, struct bio *src)
-{
-	struct bvec_iter src_iter, dst_iter;
-	struct bio_vec src_bv, dst_bv;
-	void *src_p, *dst_p;
-	unsigned bytes;
-
-	src_iter = src->bi_iter;
-	dst_iter = dst->bi_iter;
-
-	while (1) {
-		if (!src_iter.bi_size) {
-			src = src->bi_next;
-			if (!src)
-				break;
-
-			src_iter = src->bi_iter;
-		}
-
-		if (!dst_iter.bi_size) {
-			dst = dst->bi_next;
-			if (!dst)
-				break;
-
-			dst_iter = dst->bi_iter;
-		}
-
-		src_bv = bio_iter_iovec(src, src_iter);
-		dst_bv = bio_iter_iovec(dst, dst_iter);
-
-		bytes = min(src_bv.bv_len, dst_bv.bv_len);
-
-		src_p = kmap_atomic(src_bv.bv_page);
-		dst_p = kmap_atomic(dst_bv.bv_page);
-
-		memcpy(dst_p + dst_bv.bv_offset,
-		       src_p + src_bv.bv_offset,
-		       bytes);
-
-		kunmap_atomic(dst_p);
-		kunmap_atomic(src_p);
-
-		bio_advance_iter(src, &src_iter, bytes);
-		bio_advance_iter(dst, &dst_iter, bytes);
-	}
-}
-EXPORT_SYMBOL(bio_copy_data);
-
-struct bio_map_data {
-	int nr_sgvecs;
-	int is_our_pages;
-	struct sg_iovec sgvecs[];
-};
-
-static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio,
-			     const struct sg_iovec *iov, int iov_count,
-			     int is_our_pages)
-{
-	memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count);
-	bmd->nr_sgvecs = iov_count;
-	bmd->is_our_pages = is_our_pages;
-	bio->bi_private = bmd;
-}
-
-static struct bio_map_data *bio_alloc_map_data(int nr_segs,
-					       unsigned int iov_count,
-					       gfp_t gfp_mask)
-{
-	if (iov_count > UIO_MAXIOV)
-		return NULL;
-
-	return kmalloc(sizeof(struct bio_map_data) +
-		       sizeof(struct sg_iovec) * iov_count, gfp_mask);
-}
-
-static int __bio_copy_iov(struct bio *bio, const struct sg_iovec *iov, int iov_count,
-			  int to_user, int from_user, int do_free_page)
-{
-	int ret = 0, i;
-	struct bio_vec *bvec;
-	int iov_idx = 0;
-	unsigned int iov_off = 0;
-
-	bio_for_each_segment_all(bvec, bio, i) {
-		char *bv_addr = page_address(bvec->bv_page);
-		unsigned int bv_len = bvec->bv_len;
-
-		while (bv_len && iov_idx < iov_count) {
-			unsigned int bytes;
-			char __user *iov_addr;
-
-			bytes = min_t(unsigned int,
-				      iov[iov_idx].iov_len - iov_off, bv_len);
-			iov_addr = iov[iov_idx].iov_base + iov_off;
-
-			if (!ret) {
-				if (to_user)
-					ret = copy_to_user(iov_addr, bv_addr,
-							   bytes);
-
-				if (from_user)
-					ret = copy_from_user(bv_addr, iov_addr,
-							     bytes);
-
-				if (ret)
-					ret = -EFAULT;
-			}
-
-			bv_len -= bytes;
-			bv_addr += bytes;
-			iov_addr += bytes;
-			iov_off += bytes;
-
-			if (iov[iov_idx].iov_len == iov_off) {
-				iov_idx++;
-				iov_off = 0;
-			}
-		}
-
-		if (do_free_page)
-			__free_page(bvec->bv_page);
-	}
-
-	return ret;
-}
-
-/**
- *	bio_uncopy_user	-	finish previously mapped bio
- *	@bio: bio being terminated
- *
- *	Free pages allocated from bio_copy_user() and write back data
- *	to user space in case of a read.
- */
-int bio_uncopy_user(struct bio *bio)
-{
-	struct bio_map_data *bmd = bio->bi_private;
-	struct bio_vec *bvec;
-	int ret = 0, i;
-
-	if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
-		/*
-		 * if we're in a workqueue, the request is orphaned, so
-		 * don't copy into a random user address space, just free.
-		 */
-		if (current->mm)
-			ret = __bio_copy_iov(bio, bmd->sgvecs, bmd->nr_sgvecs,
-					     bio_data_dir(bio) == READ,
-					     0, bmd->is_our_pages);
-		else if (bmd->is_our_pages)
-			bio_for_each_segment_all(bvec, bio, i)
-				__free_page(bvec->bv_page);
-	}
-	kfree(bmd);
-	bio_put(bio);
-	return ret;
-}
-EXPORT_SYMBOL(bio_uncopy_user);
-
-/**
- *	bio_copy_user_iov	-	copy user data to bio
- *	@q: destination block queue
- *	@map_data: pointer to the rq_map_data holding pages (if necessary)
- *	@iov:	the iovec.
- *	@iov_count: number of elements in the iovec
- *	@write_to_vm: bool indicating writing to pages or not
- *	@gfp_mask: memory allocation flags
- *
- *	Prepares and returns a bio for indirect user io, bouncing data
- *	to/from kernel pages as necessary. Must be paired with
- *	call bio_uncopy_user() on io completion.
- */
-struct bio *bio_copy_user_iov(struct request_queue *q,
-			      struct rq_map_data *map_data,
-			      const struct sg_iovec *iov, int iov_count,
-			      int write_to_vm, gfp_t gfp_mask)
-{
-	struct bio_map_data *bmd;
-	struct bio_vec *bvec;
-	struct page *page;
-	struct bio *bio;
-	int i, ret;
-	int nr_pages = 0;
-	unsigned int len = 0;
-	unsigned int offset = map_data ? map_data->offset & ~PAGE_MASK : 0;
-
-	for (i = 0; i < iov_count; i++) {
-		unsigned long uaddr;
-		unsigned long end;
-		unsigned long start;
-
-		uaddr = (unsigned long)iov[i].iov_base;
-		end = (uaddr + iov[i].iov_len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		start = uaddr >> PAGE_SHIFT;
-
-		/*
-		 * Overflow, abort
-		 */
-		if (end < start)
-			return ERR_PTR(-EINVAL);
-
-		nr_pages += end - start;
-		len += iov[i].iov_len;
-	}
-
-	if (offset)
-		nr_pages++;
-
-	bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask);
-	if (!bmd)
-		return ERR_PTR(-ENOMEM);
-
-	ret = -ENOMEM;
-	bio = bio_kmalloc(gfp_mask, nr_pages);
-	if (!bio)
-		goto out_bmd;
-
-	if (!write_to_vm)
-		bio->bi_rw |= REQ_WRITE;
-
-	ret = 0;
-
-	if (map_data) {
-		nr_pages = 1 << map_data->page_order;
-		i = map_data->offset / PAGE_SIZE;
-	}
-	while (len) {
-		unsigned int bytes = PAGE_SIZE;
-
-		bytes -= offset;
-
-		if (bytes > len)
-			bytes = len;
-
-		if (map_data) {
-			if (i == map_data->nr_entries * nr_pages) {
-				ret = -ENOMEM;
-				break;
-			}
-
-			page = map_data->pages[i / nr_pages];
-			page += (i % nr_pages);
-
-			i++;
-		} else {
-			page = alloc_page(q->bounce_gfp | gfp_mask);
-			if (!page) {
-				ret = -ENOMEM;
-				break;
-			}
-		}
-
-		if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes)
-			break;
-
-		len -= bytes;
-		offset = 0;
-	}
-
-	if (ret)
-		goto cleanup;
-
-	/*
-	 * success
-	 */
-	if ((!write_to_vm && (!map_data || !map_data->null_mapped)) ||
-	    (map_data && map_data->from_user)) {
-		ret = __bio_copy_iov(bio, iov, iov_count, 0, 1, 0);
-		if (ret)
-			goto cleanup;
-	}
-
-	bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 0 : 1);
-	return bio;
-cleanup:
-	if (!map_data)
-		bio_for_each_segment_all(bvec, bio, i)
-			__free_page(bvec->bv_page);
-
-	bio_put(bio);
-out_bmd:
-	kfree(bmd);
-	return ERR_PTR(ret);
-}
-
-/**
- *	bio_copy_user	-	copy user data to bio
- *	@q: destination block queue
- *	@map_data: pointer to the rq_map_data holding pages (if necessary)
- *	@uaddr: start of user address
- *	@len: length in bytes
- *	@write_to_vm: bool indicating writing to pages or not
- *	@gfp_mask: memory allocation flags
- *
- *	Prepares and returns a bio for indirect user io, bouncing data
- *	to/from kernel pages as necessary. Must be paired with
- *	call bio_uncopy_user() on io completion.
- */
-struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data,
-			  unsigned long uaddr, unsigned int len,
-			  int write_to_vm, gfp_t gfp_mask)
-{
-	struct sg_iovec iov;
-
-	iov.iov_base = (void __user *)uaddr;
-	iov.iov_len = len;
-
-	return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask);
-}
-EXPORT_SYMBOL(bio_copy_user);
-
-static struct bio *__bio_map_user_iov(struct request_queue *q,
-				      struct block_device *bdev,
-				      const struct sg_iovec *iov, int iov_count,
-				      int write_to_vm, gfp_t gfp_mask)
-{
-	int i, j;
-	int nr_pages = 0;
-	struct page **pages;
-	struct bio *bio;
-	int cur_page = 0;
-	int ret, offset;
-
-	for (i = 0; i < iov_count; i++) {
-		unsigned long uaddr = (unsigned long)iov[i].iov_base;
-		unsigned long len = iov[i].iov_len;
-		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		unsigned long start = uaddr >> PAGE_SHIFT;
-
-		/*
-		 * Overflow, abort
-		 */
-		if (end < start)
-			return ERR_PTR(-EINVAL);
-
-		nr_pages += end - start;
-		/*
-		 * buffer must be aligned to at least hardsector size for now
-		 */
-		if (uaddr & queue_dma_alignment(q))
-			return ERR_PTR(-EINVAL);
-	}
-
-	if (!nr_pages)
-		return ERR_PTR(-EINVAL);
-
-	bio = bio_kmalloc(gfp_mask, nr_pages);
-	if (!bio)
-		return ERR_PTR(-ENOMEM);
-
-	ret = -ENOMEM;
-	pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
-	if (!pages)
-		goto out;
-
-	for (i = 0; i < iov_count; i++) {
-		unsigned long uaddr = (unsigned long)iov[i].iov_base;
-		unsigned long len = iov[i].iov_len;
-		unsigned long end = (uaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-		unsigned long start = uaddr >> PAGE_SHIFT;
-		const int local_nr_pages = end - start;
-		const int page_limit = cur_page + local_nr_pages;
-
-		ret = get_user_pages_fast(uaddr, local_nr_pages,
-				write_to_vm, &pages[cur_page]);
-		if (ret < local_nr_pages) {
-			ret = -EFAULT;
-			goto out_unmap;
-		}
-
-		offset = uaddr & ~PAGE_MASK;
-		for (j = cur_page; j < page_limit; j++) {
-			unsigned int bytes = PAGE_SIZE - offset;
-
-			if (len <= 0)
-				break;
-			
-			if (bytes > len)
-				bytes = len;
-
-			/*
-			 * sorry...
-			 */
-			if (bio_add_pc_page(q, bio, pages[j], bytes, offset) <
-					    bytes)
-				break;
-
-			len -= bytes;
-			offset = 0;
-		}
-
-		cur_page = j;
-		/*
-		 * release the pages we didn't map into the bio, if any
-		 */
-		while (j < page_limit)
-			page_cache_release(pages[j++]);
-	}
-
-	kfree(pages);
-
-	/*
-	 * set data direction, and check if mapped pages need bouncing
-	 */
-	if (!write_to_vm)
-		bio->bi_rw |= REQ_WRITE;
-
-	bio->bi_bdev = bdev;
-	bio->bi_flags |= (1 << BIO_USER_MAPPED);
-	return bio;
-
- out_unmap:
-	for (i = 0; i < nr_pages; i++) {
-		if(!pages[i])
-			break;
-		page_cache_release(pages[i]);
-	}
- out:
-	kfree(pages);
-	bio_put(bio);
-	return ERR_PTR(ret);
-}
-
-/**
- *	bio_map_user	-	map user address into bio
- *	@q: the struct request_queue for the bio
- *	@bdev: destination block device
- *	@uaddr: start of user address
- *	@len: length in bytes
- *	@write_to_vm: bool indicating writing to pages or not
- *	@gfp_mask: memory allocation flags
- *
- *	Map the user space address into a bio suitable for io to a block
- *	device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev,
-			 unsigned long uaddr, unsigned int len, int write_to_vm,
-			 gfp_t gfp_mask)
-{
-	struct sg_iovec iov;
-
-	iov.iov_base = (void __user *)uaddr;
-	iov.iov_len = len;
-
-	return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask);
-}
-EXPORT_SYMBOL(bio_map_user);
-
-/**
- *	bio_map_user_iov - map user sg_iovec table into bio
- *	@q: the struct request_queue for the bio
- *	@bdev: destination block device
- *	@iov:	the iovec.
- *	@iov_count: number of elements in the iovec
- *	@write_to_vm: bool indicating writing to pages or not
- *	@gfp_mask: memory allocation flags
- *
- *	Map the user space address into a bio suitable for io to a block
- *	device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev,
-			     const struct sg_iovec *iov, int iov_count,
-			     int write_to_vm, gfp_t gfp_mask)
-{
-	struct bio *bio;
-
-	bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm,
-				 gfp_mask);
-	if (IS_ERR(bio))
-		return bio;
-
-	/*
-	 * subtle -- if __bio_map_user() ended up bouncing a bio,
-	 * it would normally disappear when its bi_end_io is run.
-	 * however, we need it for the unmap, so grab an extra
-	 * reference to it
-	 */
-	bio_get(bio);
-
-	return bio;
-}
-
-static void __bio_unmap_user(struct bio *bio)
-{
-	struct bio_vec *bvec;
-	int i;
-
-	/*
-	 * make sure we dirty pages we wrote to
-	 */
-	bio_for_each_segment_all(bvec, bio, i) {
-		if (bio_data_dir(bio) == READ)
-			set_page_dirty_lock(bvec->bv_page);
-
-		page_cache_release(bvec->bv_page);
-	}
-
-	bio_put(bio);
-}
-
-/**
- *	bio_unmap_user	-	unmap a bio
- *	@bio:		the bio being unmapped
- *
- *	Unmap a bio previously mapped by bio_map_user(). Must be called with
- *	a process context.
- *
- *	bio_unmap_user() may sleep.
- */
-void bio_unmap_user(struct bio *bio)
-{
-	__bio_unmap_user(bio);
-	bio_put(bio);
-}
-EXPORT_SYMBOL(bio_unmap_user);
-
-static void bio_map_kern_endio(struct bio *bio, int err)
-{
-	bio_put(bio);
-}
-
-static struct bio *__bio_map_kern(struct request_queue *q, void *data,
-				  unsigned int len, gfp_t gfp_mask)
-{
-	unsigned long kaddr = (unsigned long)data;
-	unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
-	unsigned long start = kaddr >> PAGE_SHIFT;
-	const int nr_pages = end - start;
-	int offset, i;
-	struct bio *bio;
-
-	bio = bio_kmalloc(gfp_mask, nr_pages);
-	if (!bio)
-		return ERR_PTR(-ENOMEM);
-
-	offset = offset_in_page(kaddr);
-	for (i = 0; i < nr_pages; i++) {
-		unsigned int bytes = PAGE_SIZE - offset;
-
-		if (len <= 0)
-			break;
-
-		if (bytes > len)
-			bytes = len;
-
-		if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
-				    offset) < bytes)
-			break;
-
-		data += bytes;
-		len -= bytes;
-		offset = 0;
-	}
-
-	bio->bi_end_io = bio_map_kern_endio;
-	return bio;
-}
-
-/**
- *	bio_map_kern	-	map kernel address into bio
- *	@q: the struct request_queue for the bio
- *	@data: pointer to buffer to map
- *	@len: length in bytes
- *	@gfp_mask: allocation flags for bio allocation
- *
- *	Map the kernel address into a bio suitable for io to a block
- *	device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
-			 gfp_t gfp_mask)
-{
-	struct bio *bio;
-
-	bio = __bio_map_kern(q, data, len, gfp_mask);
-	if (IS_ERR(bio))
-		return bio;
-
-	if (bio->bi_iter.bi_size == len)
-		return bio;
-
-	/*
-	 * Don't support partial mappings.
-	 */
-	bio_put(bio);
-	return ERR_PTR(-EINVAL);
-}
-EXPORT_SYMBOL(bio_map_kern);
-
-static void bio_copy_kern_endio(struct bio *bio, int err)
-{
-	struct bio_vec *bvec;
-	const int read = bio_data_dir(bio) == READ;
-	struct bio_map_data *bmd = bio->bi_private;
-	int i;
-	char *p = bmd->sgvecs[0].iov_base;
-
-	bio_for_each_segment_all(bvec, bio, i) {
-		char *addr = page_address(bvec->bv_page);
-
-		if (read)
-			memcpy(p, addr, bvec->bv_len);
-
-		__free_page(bvec->bv_page);
-		p += bvec->bv_len;
-	}
-
-	kfree(bmd);
-	bio_put(bio);
-}
-
-/**
- *	bio_copy_kern	-	copy kernel address into bio
- *	@q: the struct request_queue for the bio
- *	@data: pointer to buffer to copy
- *	@len: length in bytes
- *	@gfp_mask: allocation flags for bio and page allocation
- *	@reading: data direction is READ
- *
- *	copy the kernel address into a bio suitable for io to a block
- *	device. Returns an error pointer in case of error.
- */
-struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
-			  gfp_t gfp_mask, int reading)
-{
-	struct bio *bio;
-	struct bio_vec *bvec;
-	int i;
-
-	bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask);
-	if (IS_ERR(bio))
-		return bio;
-
-	if (!reading) {
-		void *p = data;
-
-		bio_for_each_segment_all(bvec, bio, i) {
-			char *addr = page_address(bvec->bv_page);
-
-			memcpy(addr, p, bvec->bv_len);
-			p += bvec->bv_len;
-		}
-	}
-
-	bio->bi_end_io = bio_copy_kern_endio;
-
-	return bio;
-}
-EXPORT_SYMBOL(bio_copy_kern);
-
-/*
- * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
- * for performing direct-IO in BIOs.
- *
- * The problem is that we cannot run set_page_dirty() from interrupt context
- * because the required locks are not interrupt-safe.  So what we can do is to
- * mark the pages dirty _before_ performing IO.  And in interrupt context,
- * check that the pages are still dirty.   If so, fine.  If not, redirty them
- * in process context.
- *
- * We special-case compound pages here: normally this means reads into hugetlb
- * pages.  The logic in here doesn't really work right for compound pages
- * because the VM does not uniformly chase down the head page in all cases.
- * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
- * handle them at all.  So we skip compound pages here at an early stage.
- *
- * Note that this code is very hard to test under normal circumstances because
- * direct-io pins the pages with get_user_pages().  This makes
- * is_page_cache_freeable return false, and the VM will not clean the pages.
- * But other code (eg, flusher threads) could clean the pages if they are mapped
- * pagecache.
- *
- * Simply disabling the call to bio_set_pages_dirty() is a good way to test the
- * deferred bio dirtying paths.
- */
-
-/*
- * bio_set_pages_dirty() will mark all the bio's pages as dirty.
- */
-void bio_set_pages_dirty(struct bio *bio)
-{
-	struct bio_vec *bvec;
-	int i;
-
-	bio_for_each_segment_all(bvec, bio, i) {
-		struct page *page = bvec->bv_page;
-
-		if (page && !PageCompound(page))
-			set_page_dirty_lock(page);
-	}
-}
-
-static void bio_release_pages(struct bio *bio)
-{
-	struct bio_vec *bvec;
-	int i;
-
-	bio_for_each_segment_all(bvec, bio, i) {
-		struct page *page = bvec->bv_page;
-
-		if (page)
-			put_page(page);
-	}
-}
-
-/*
- * bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
- * If they are, then fine.  If, however, some pages are clean then they must
- * have been written out during the direct-IO read.  So we take another ref on
- * the BIO and the offending pages and re-dirty the pages in process context.
- *
- * It is expected that bio_check_pages_dirty() will wholly own the BIO from
- * here on.  It will run one page_cache_release() against each page and will
- * run one bio_put() against the BIO.
- */
-
-static void bio_dirty_fn(struct work_struct *work);
-
-static DECLARE_WORK(bio_dirty_work, bio_dirty_fn);
-static DEFINE_SPINLOCK(bio_dirty_lock);
-static struct bio *bio_dirty_list;
-
-/*
- * This runs in process context
- */
-static void bio_dirty_fn(struct work_struct *work)
-{
-	unsigned long flags;
-	struct bio *bio;
-
-	spin_lock_irqsave(&bio_dirty_lock, flags);
-	bio = bio_dirty_list;
-	bio_dirty_list = NULL;
-	spin_unlock_irqrestore(&bio_dirty_lock, flags);
-
-	while (bio) {
-		struct bio *next = bio->bi_private;
-
-		bio_set_pages_dirty(bio);
-		bio_release_pages(bio);
-		bio_put(bio);
-		bio = next;
-	}
-}
-
-void bio_check_pages_dirty(struct bio *bio)
-{
-	struct bio_vec *bvec;
-	int nr_clean_pages = 0;
-	int i;
-
-	bio_for_each_segment_all(bvec, bio, i) {
-		struct page *page = bvec->bv_page;
-
-		if (PageDirty(page) || PageCompound(page)) {
-			page_cache_release(page);
-			bvec->bv_page = NULL;
-		} else {
-			nr_clean_pages++;
-		}
-	}
-
-	if (nr_clean_pages) {
-		unsigned long flags;
-
-		spin_lock_irqsave(&bio_dirty_lock, flags);
-		bio->bi_private = bio_dirty_list;
-		bio_dirty_list = bio;
-		spin_unlock_irqrestore(&bio_dirty_lock, flags);
-		schedule_work(&bio_dirty_work);
-	} else {
-		bio_put(bio);
-	}
-}
-
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-void bio_flush_dcache_pages(struct bio *bi)
-{
-	struct bio_vec bvec;
-	struct bvec_iter iter;
-
-	bio_for_each_segment(bvec, bi, iter)
-		flush_dcache_page(bvec.bv_page);
-}
-EXPORT_SYMBOL(bio_flush_dcache_pages);
-#endif
-
-/**
- * bio_endio - end I/O on a bio
- * @bio:	bio
- * @error:	error, if any
- *
- * Description:
- *   bio_endio() will end I/O on the whole bio. bio_endio() is the
- *   preferred way to end I/O on a bio, it takes care of clearing
- *   BIO_UPTODATE on error. @error is 0 on success, and and one of the
- *   established -Exxxx (-EIO, for instance) error values in case
- *   something went wrong. No one should call bi_end_io() directly on a
- *   bio unless they own it and thus know that it has an end_io
- *   function.
- **/
-void bio_endio(struct bio *bio, int error)
-{
-	while (bio) {
-		BUG_ON(atomic_read(&bio->bi_remaining) <= 0);
-
-		if (error)
-			clear_bit(BIO_UPTODATE, &bio->bi_flags);
-		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
-			error = -EIO;
-
-		if (!atomic_dec_and_test(&bio->bi_remaining))
-			return;
-
-		/*
-		 * Need to have a real endio function for chained bios,
-		 * otherwise various corner cases will break (like stacking
-		 * block devices that save/restore bi_end_io) - however, we want
-		 * to avoid unbounded recursion and blowing the stack. Tail call
-		 * optimization would handle this, but compiling with frame
-		 * pointers also disables gcc's sibling call optimization.
-		 */
-		if (bio->bi_end_io == bio_chain_endio) {
-			struct bio *parent = bio->bi_private;
-			bio_put(bio);
-			bio = parent;
-		} else {
-			if (bio->bi_end_io)
-				bio->bi_end_io(bio, error);
-			bio = NULL;
-		}
-	}
-}
-EXPORT_SYMBOL(bio_endio);
-
-/**
- * bio_endio_nodec - end I/O on a bio, without decrementing bi_remaining
- * @bio:	bio
- * @error:	error, if any
- *
- * For code that has saved and restored bi_end_io; thing hard before using this
- * function, probably you should've cloned the entire bio.
- **/
-void bio_endio_nodec(struct bio *bio, int error)
-{
-	atomic_inc(&bio->bi_remaining);
-	bio_endio(bio, error);
-}
-EXPORT_SYMBOL(bio_endio_nodec);
-
-/**
- * bio_split - split a bio
- * @bio:	bio to split
- * @sectors:	number of sectors to split from the front of @bio
- * @gfp:	gfp mask
- * @bs:		bio set to allocate from
- *
- * Allocates and returns a new bio which represents @sectors from the start of
- * @bio, and updates @bio to represent the remaining sectors.
- *
- * The newly allocated bio will point to @bio's bi_io_vec; it is the caller's
- * responsibility to ensure that @bio is not freed before the split.
- */
-struct bio *bio_split(struct bio *bio, int sectors,
-		      gfp_t gfp, struct bio_set *bs)
-{
-	struct bio *split = NULL;
-
-	BUG_ON(sectors <= 0);
-	BUG_ON(sectors >= bio_sectors(bio));
-
-	split = bio_clone_fast(bio, gfp, bs);
-	if (!split)
-		return NULL;
-
-	split->bi_iter.bi_size = sectors << 9;
-
-	if (bio_integrity(split))
-		bio_integrity_trim(split, 0, sectors);
-
-	bio_advance(bio, split->bi_iter.bi_size);
-
-	return split;
-}
-EXPORT_SYMBOL(bio_split);
-
-/**
- * bio_trim - trim a bio
- * @bio:	bio to trim
- * @offset:	number of sectors to trim from the front of @bio
- * @size:	size we want to trim @bio to, in sectors
- */
-void bio_trim(struct bio *bio, int offset, int size)
-{
-	/* 'bio' is a cloned bio which we need to trim to match
-	 * the given offset and size.
-	 */
-
-	size <<= 9;
-	if (offset == 0 && size == bio->bi_iter.bi_size)
-		return;
-
-	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
-
-	bio_advance(bio, offset << 9);
-
-	bio->bi_iter.bi_size = size;
-}
-EXPORT_SYMBOL_GPL(bio_trim);
-
-/*
- * create memory pools for biovec's in a bio_set.
- * use the global biovec slabs created for general use.
- */
-mempool_t *biovec_create_pool(struct bio_set *bs, int pool_entries)
-{
-	struct biovec_slab *bp = bvec_slabs + BIOVEC_MAX_IDX;
-
-	return mempool_create_slab_pool(pool_entries, bp->slab);
-}
-
-void bioset_free(struct bio_set *bs)
-{
-	if (bs->rescue_workqueue)
-		destroy_workqueue(bs->rescue_workqueue);
-
-	if (bs->bio_pool)
-		mempool_destroy(bs->bio_pool);
-
-	if (bs->bvec_pool)
-		mempool_destroy(bs->bvec_pool);
-
-	bioset_integrity_free(bs);
-	bio_put_slab(bs);
-
-	kfree(bs);
-}
-EXPORT_SYMBOL(bioset_free);
-
-/**
- * bioset_create  - Create a bio_set
- * @pool_size:	Number of bio and bio_vecs to cache in the mempool
- * @front_pad:	Number of bytes to allocate in front of the returned bio
- *
- * Description:
- *    Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller
- *    to ask for a number of bytes to be allocated in front of the bio.
- *    Front pad allocation is useful for embedding the bio inside
- *    another structure, to avoid allocating extra data to go with the bio.
- *    Note that the bio must be embedded at the END of that structure always,
- *    or things will break badly.
- */
-struct bio_set *bioset_create(unsigned int pool_size, unsigned int front_pad)
-{
-	unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
-	struct bio_set *bs;
-
-	bs = kzalloc(sizeof(*bs), GFP_KERNEL);
-	if (!bs)
-		return NULL;
-
-	bs->front_pad = front_pad;
-
-	spin_lock_init(&bs->rescue_lock);
-	bio_list_init(&bs->rescue_list);
-	INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
-
-	bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
-	if (!bs->bio_slab) {
-		kfree(bs);
-		return NULL;
-	}
-
-	bs->bio_pool = mempool_create_slab_pool(pool_size, bs->bio_slab);
-	if (!bs->bio_pool)
-		goto bad;
-
-	bs->bvec_pool = biovec_create_pool(bs, pool_size);
-	if (!bs->bvec_pool)
-		goto bad;
-
-	bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
-	if (!bs->rescue_workqueue)
-		goto bad;
-
-	return bs;
-bad:
-	bioset_free(bs);
-	return NULL;
-}
-EXPORT_SYMBOL(bioset_create);
-
-#ifdef CONFIG_BLK_CGROUP
-/**
- * bio_associate_current - associate a bio with %current
- * @bio: target bio
- *
- * Associate @bio with %current if it hasn't been associated yet.  Block
- * layer will treat @bio as if it were issued by %current no matter which
- * task actually issues it.
- *
- * This function takes an extra reference of @task's io_context and blkcg
- * which will be put when @bio is released.  The caller must own @bio,
- * ensure %current->io_context exists, and is responsible for synchronizing
- * calls to this function.
- */
-int bio_associate_current(struct bio *bio)
-{
-	struct io_context *ioc;
-	struct cgroup_subsys_state *css;
-
-	if (bio->bi_ioc)
-		return -EBUSY;
-
-	ioc = current->io_context;
-	if (!ioc)
-		return -ENOENT;
-
-	/* acquire active ref on @ioc and associate */
-	get_io_context_active(ioc);
-	bio->bi_ioc = ioc;
-
-	/* associate blkcg if exists */
-	rcu_read_lock();
-	css = task_css(current, blkio_cgrp_id);
-	if (css && css_tryget(css))
-		bio->bi_css = css;
-	rcu_read_unlock();
-
-	return 0;
-}
-
-/**
- * bio_disassociate_task - undo bio_associate_current()
- * @bio: target bio
- */
-void bio_disassociate_task(struct bio *bio)
-{
-	if (bio->bi_ioc) {
-		put_io_context(bio->bi_ioc);
-		bio->bi_ioc = NULL;
-	}
-	if (bio->bi_css) {
-		css_put(bio->bi_css);
-		bio->bi_css = NULL;
-	}
-}
-
-#endif /* CONFIG_BLK_CGROUP */
-
-static void __init biovec_init_slabs(void)
-{
-	int i;
-
-	for (i = 0; i < BIOVEC_NR_POOLS; i++) {
-		int size;
-		struct biovec_slab *bvs = bvec_slabs + i;
-
-		if (bvs->nr_vecs <= BIO_INLINE_VECS) {
-			bvs->slab = NULL;
-			continue;
-		}
-
-		size = bvs->nr_vecs * sizeof(struct bio_vec);
-		bvs->slab = kmem_cache_create(bvs->name, size, 0,
-                                SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
-	}
-}
-
-static int __init init_bio(void)
-{
-	bio_slab_max = 2;
-	bio_slab_nr = 0;
-	bio_slabs = kzalloc(bio_slab_max * sizeof(struct bio_slab), GFP_KERNEL);
-	if (!bio_slabs)
-		panic("bio: can't allocate bios\n");
-
-	bio_integrity_init();
-	biovec_init_slabs();
-
-	fs_bio_set = bioset_create(BIO_POOL_SIZE, 0);
-	if (!fs_bio_set)
-		panic("bio: can't allocate bios\n");
-
-	if (bioset_integrity_create(fs_bio_set, BIO_POOL_SIZE))
-		panic("bio: can't create integrity pool\n");
-
-	return 0;
-}
-subsys_initcall(init_bio);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index c9a24444ec9a..2256e9cceec5 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -279,7 +279,7 @@ static inline void btrfs_inode_block_unlocked_dio(struct inode *inode)
 
 static inline void btrfs_inode_resume_unlocked_dio(struct inode *inode)
 {
-	smp_mb__before_clear_bit();
+	smp_mb__before_atomic();
 	clear_bit(BTRFS_INODE_READDIO_NEED_LOCK,
 		  &BTRFS_I(inode)->runtime_flags);
 }
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 3955e475ceec..f29a54e454d4 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3458,7 +3458,7 @@ static int lock_extent_buffer_for_io(struct extent_buffer *eb,
 static void end_extent_buffer_writeback(struct extent_buffer *eb)
 {
 	clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK);
 }
 
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 5f805bc944fa..5a3b8371772e 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7126,7 +7126,7 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
 		 * before atomic variable goto zero, we must make sure
 		 * dip->errors is perceived to be set.
 		 */
-		smp_mb__before_atomic_dec();
+		smp_mb__before_atomic();
 	}
 
 	/* if there are more bios still pending for this dio, just exit */
@@ -7306,7 +7306,7 @@ out_err:
 	 * before atomic variable goto zero, we must
 	 * make sure dip->errors is perceived to be set.
 	 */
-	smp_mb__before_atomic_dec();
+	smp_mb__before_atomic();
 	if (atomic_dec_and_test(&dip->pending_bios))
 		bio_io_error(dip->orig_bio);
 
@@ -7449,7 +7449,7 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb,
 		return 0;
 
 	atomic_inc(&inode->i_dio_count);
-	smp_mb__after_atomic_inc();
+	smp_mb__after_atomic();
 
 	/*
 	 * The generic stuff only does filemap_write_and_wait_range, which
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 2ad7de94efef..3f52bb7a58d2 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -642,7 +642,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 		return -EINVAL;
 
 	atomic_inc(&root->will_be_snapshoted);
-	smp_mb__after_atomic_inc();
+	smp_mb__after_atomic();
 	btrfs_wait_nocow_write(root);
 
 	ret = btrfs_start_delalloc_inodes(root, 0);
@@ -3120,6 +3120,8 @@ process_slot:
 			} else if (type == BTRFS_FILE_EXTENT_INLINE) {
 				u64 skip = 0;
 				u64 trim = 0;
+				u64 aligned_end = 0;
+
 				if (off > key.offset) {
 					skip = off - key.offset;
 					new_key.offset += skip;
@@ -3136,9 +3138,11 @@ process_slot:
 				size -= skip + trim;
 				datal -= skip + trim;
 
+				aligned_end = ALIGN(new_key.offset + datal,
+						    root->sectorsize);
 				ret = btrfs_drop_extents(trans, root, inode,
 							 new_key.offset,
-							 new_key.offset + datal,
+							 aligned_end,
 							 1);
 				if (ret) {
 					if (ret != -EOPNOTSUPP)
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index eb6537a08c1b..fd38b5053479 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -1668,7 +1668,7 @@ static int get_first_ref(struct btrfs_root *root, u64 ino,
 		goto out;
 	}
 
-	if (key.type == BTRFS_INODE_REF_KEY) {
+	if (found_key.type == BTRFS_INODE_REF_KEY) {
 		struct btrfs_inode_ref *iref;
 		iref = btrfs_item_ptr(path->nodes[0], path->slots[0],
 				      struct btrfs_inode_ref);
diff --git a/fs/buffer.c b/fs/buffer.c
index 9ddb9fc7d923..6a8110c03a47 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -77,7 +77,7 @@ EXPORT_SYMBOL(__lock_buffer);
 void unlock_buffer(struct buffer_head *bh)
 {
 	clear_bit_unlock(BH_Lock, &bh->b_state);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(&bh->b_state, BH_Lock);
 }
 EXPORT_SYMBOL(unlock_buffer);
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index 2e5e648eb5c3..c561b628ebce 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -3261,7 +3261,7 @@ int ceph_encode_inode_release(void **p, struct inode *inode,
 			rel->seq = cpu_to_le32(cap->seq);
 			rel->issue_seq = cpu_to_le32(cap->issue_seq),
 			rel->mseq = cpu_to_le32(cap->mseq);
-			rel->caps = cpu_to_le32(cap->issued);
+			rel->caps = cpu_to_le32(cap->implemented);
 			rel->wanted = cpu_to_le32(cap->mds_wanted);
 			rel->dname_len = 0;
 			rel->dname_seq = 0;
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index 766410a12c2c..c29d6ae68874 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -141,7 +141,7 @@ static int __dcache_readdir(struct file *file,  struct dir_context *ctx,
 
 	/* start at beginning? */
 	if (ctx->pos == 2 || last == NULL ||
-	    ctx->pos < ceph_dentry(last)->offset) {
+	    fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) {
 		if (list_empty(&parent->d_subdirs))
 			goto out_unlock;
 		p = parent->d_subdirs.prev;
@@ -182,9 +182,16 @@ more:
 	spin_unlock(&dentry->d_lock);
 	spin_unlock(&parent->d_lock);
 
+	/* make sure a dentry wasn't dropped while we didn't have parent lock */
+	if (!ceph_dir_is_complete(dir)) {
+		dout(" lost dir complete on %p; falling back to mds\n", dir);
+		dput(dentry);
+		err = -EAGAIN;
+		goto out;
+	}
+
 	dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, ctx->pos,
 	     dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
-	ctx->pos = di->offset;
 	if (!dir_emit(ctx, dentry->d_name.name,
 		      dentry->d_name.len,
 		      ceph_translate_ino(dentry->d_sb, dentry->d_inode->i_ino),
@@ -198,19 +205,12 @@ more:
 		return 0;
 	}
 
+	ctx->pos = di->offset + 1;
+
 	if (last)
 		dput(last);
 	last = dentry;
 
-	ctx->pos++;
-
-	/* make sure a dentry wasn't dropped while we didn't have parent lock */
-	if (!ceph_dir_is_complete(dir)) {
-		dout(" lost dir complete on %p; falling back to mds\n", dir);
-		err = -EAGAIN;
-		goto out;
-	}
-
 	spin_lock(&parent->d_lock);
 	p = p->prev;	/* advance to next dentry */
 	goto more;
@@ -296,6 +296,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
 		err = __dcache_readdir(file, ctx, shared_gen);
 		if (err != -EAGAIN)
 			return err;
+		frag = fpos_frag(ctx->pos);
+		off = fpos_off(ctx->pos);
 	} else {
 		spin_unlock(&ci->i_ceph_lock);
 	}
@@ -446,7 +448,6 @@ more:
 	if (atomic_read(&ci->i_release_count) == fi->dir_release_count) {
 		dout(" marking %p complete\n", inode);
 		__ceph_dir_set_complete(ci, fi->dir_release_count);
-		ci->i_max_offset = ctx->pos;
 	}
 	spin_unlock(&ci->i_ceph_lock);
 
@@ -935,14 +936,16 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
 		 * to do it here.
 		 */
 
-		/* d_move screws up d_subdirs order */
-		ceph_dir_clear_complete(new_dir);
-
 		d_move(old_dentry, new_dentry);
 
 		/* ensure target dentry is invalidated, despite
 		   rehashing bug in vfs_rename_dir */
 		ceph_invalidate_dentry_lease(new_dentry);
+
+		/* d_move screws up sibling dentries' offsets */
+		ceph_dir_clear_complete(old_dir);
+		ceph_dir_clear_complete(new_dir);
+
 	}
 	ceph_mdsc_put_request(req);
 	return err;
diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
index 0b0728e5be2d..233c6f96910a 100644
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -744,7 +744,6 @@ static int fill_inode(struct inode *inode,
 	    !__ceph_dir_is_complete(ci)) {
 		dout(" marking %p complete (empty)\n", inode);
 		__ceph_dir_set_complete(ci, atomic_read(&ci->i_release_count));
-		ci->i_max_offset = 2;
 	}
 no_change:
 	/* only update max_size on auth cap */
@@ -890,41 +889,6 @@ out_unlock:
 }
 
 /*
- * Set dentry's directory position based on the current dir's max, and
- * order it in d_subdirs, so that dcache_readdir behaves.
- *
- * Always called under directory's i_mutex.
- */
-static void ceph_set_dentry_offset(struct dentry *dn)
-{
-	struct dentry *dir = dn->d_parent;
-	struct inode *inode = dir->d_inode;
-	struct ceph_inode_info *ci;
-	struct ceph_dentry_info *di;
-
-	BUG_ON(!inode);
-
-	ci = ceph_inode(inode);
-	di = ceph_dentry(dn);
-
-	spin_lock(&ci->i_ceph_lock);
-	if (!__ceph_dir_is_complete(ci)) {
-		spin_unlock(&ci->i_ceph_lock);
-		return;
-	}
-	di->offset = ceph_inode(inode)->i_max_offset++;
-	spin_unlock(&ci->i_ceph_lock);
-
-	spin_lock(&dir->d_lock);
-	spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
-	list_move(&dn->d_u.d_child, &dir->d_subdirs);
-	dout("set_dentry_offset %p %lld (%p %p)\n", dn, di->offset,
-	     dn->d_u.d_child.prev, dn->d_u.d_child.next);
-	spin_unlock(&dn->d_lock);
-	spin_unlock(&dir->d_lock);
-}
-
-/*
  * splice a dentry to an inode.
  * caller must hold directory i_mutex for this to be safe.
  *
@@ -933,7 +897,7 @@ static void ceph_set_dentry_offset(struct dentry *dn)
  * the caller) if we fail.
  */
 static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
-				    bool *prehash, bool set_offset)
+				    bool *prehash)
 {
 	struct dentry *realdn;
 
@@ -965,8 +929,6 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in,
 	}
 	if ((!prehash || *prehash) && d_unhashed(dn))
 		d_rehash(dn);
-	if (set_offset)
-		ceph_set_dentry_offset(dn);
 out:
 	return dn;
 }
@@ -987,7 +949,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
 {
 	struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info;
 	struct inode *in = NULL;
-	struct ceph_mds_reply_inode *ininfo;
 	struct ceph_vino vino;
 	struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
 	int err = 0;
@@ -1161,6 +1122,9 @@ retry_lookup:
 
 		/* rename? */
 		if (req->r_old_dentry && req->r_op == CEPH_MDS_OP_RENAME) {
+			struct inode *olddir = req->r_old_dentry_dir;
+			BUG_ON(!olddir);
+
 			dout(" src %p '%.*s' dst %p '%.*s'\n",
 			     req->r_old_dentry,
 			     req->r_old_dentry->d_name.len,
@@ -1180,13 +1144,10 @@ retry_lookup:
 			   rehashing bug in vfs_rename_dir */
 			ceph_invalidate_dentry_lease(dn);
 
-			/*
-			 * d_move() puts the renamed dentry at the end of
-			 * d_subdirs.  We need to assign it an appropriate
-			 * directory offset so we can behave when dir is
-			 * complete.
-			 */
-			ceph_set_dentry_offset(req->r_old_dentry);
+			/* d_move screws up sibling dentries' offsets */
+			ceph_dir_clear_complete(dir);
+			ceph_dir_clear_complete(olddir);
+
 			dout("dn %p gets new offset %lld\n", req->r_old_dentry,
 			     ceph_dentry(req->r_old_dentry)->offset);
 
@@ -1213,8 +1174,9 @@ retry_lookup:
 
 		/* attach proper inode */
 		if (!dn->d_inode) {
+			ceph_dir_clear_complete(dir);
 			ihold(in);
-			dn = splice_dentry(dn, in, &have_lease, true);
+			dn = splice_dentry(dn, in, &have_lease);
 			if (IS_ERR(dn)) {
 				err = PTR_ERR(dn);
 				goto done;
@@ -1235,17 +1197,16 @@ retry_lookup:
 		   (req->r_op == CEPH_MDS_OP_LOOKUPSNAP ||
 		    req->r_op == CEPH_MDS_OP_MKSNAP)) {
 		struct dentry *dn = req->r_dentry;
+		struct inode *dir = req->r_locked_dir;
 
 		/* fill out a snapdir LOOKUPSNAP dentry */
 		BUG_ON(!dn);
-		BUG_ON(!req->r_locked_dir);
-		BUG_ON(ceph_snap(req->r_locked_dir) != CEPH_SNAPDIR);
-		ininfo = rinfo->targeti.in;
-		vino.ino = le64_to_cpu(ininfo->ino);
-		vino.snap = le64_to_cpu(ininfo->snapid);
+		BUG_ON(!dir);
+		BUG_ON(ceph_snap(dir) != CEPH_SNAPDIR);
 		dout(" linking snapped dir %p to dn %p\n", in, dn);
+		ceph_dir_clear_complete(dir);
 		ihold(in);
-		dn = splice_dentry(dn, in, NULL, true);
+		dn = splice_dentry(dn, in, NULL);
 		if (IS_ERR(dn)) {
 			err = PTR_ERR(dn);
 			goto done;
@@ -1407,7 +1368,7 @@ retry_lookup:
 		}
 
 		if (!dn->d_inode) {
-			dn = splice_dentry(dn, in, NULL, false);
+			dn = splice_dentry(dn, in, NULL);
 			if (IS_ERR(dn)) {
 				err = PTR_ERR(dn);
 				dn = NULL;
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
index fdf941b44ff1..a822a6e58290 100644
--- a/fs/ceph/ioctl.c
+++ b/fs/ceph/ioctl.c
@@ -109,6 +109,8 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg)
 		return PTR_ERR(req);
 	req->r_inode = inode;
 	ihold(inode);
+	req->r_num_caps = 1;
+
 	req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
 
 	req->r_args.setlayout.layout.fl_stripe_unit =
@@ -153,6 +155,7 @@ static long ceph_ioctl_set_layout_policy (struct file *file, void __user *arg)
 		return PTR_ERR(req);
 	req->r_inode = inode;
 	ihold(inode);
+	req->r_num_caps = 1;
 
 	req->r_args.setlayout.layout.fl_stripe_unit =
 			cpu_to_le32(l.stripe_unit);
diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c
index d94ba0df9f4d..fbc39c47bacd 100644
--- a/fs/ceph/locks.c
+++ b/fs/ceph/locks.c
@@ -45,6 +45,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
 		return PTR_ERR(req);
 	req->r_inode = inode;
 	ihold(inode);
+	req->r_num_caps = 1;
 
 	/* mds requires start and length rather than start and end */
 	if (LLONG_MAX == fl->fl_end)
@@ -52,10 +53,7 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct file *file,
 	else
 		length = fl->fl_end - fl->fl_start + 1;
 
-	if (lock_type == CEPH_LOCK_FCNTL)
-		owner = secure_addr(fl->fl_owner);
-	else
-		owner = secure_addr(fl->fl_file);
+	owner = secure_addr(fl->fl_owner);
 
 	dout("ceph_lock_message: rule: %d, op: %d, owner: %llx, pid: %llu, "
 	     "start: %llu, length: %llu, wait: %d, type: %d", (int)lock_type,
@@ -313,10 +311,7 @@ int lock_to_ceph_filelock(struct file_lock *lock,
 	cephlock->length = cpu_to_le64(lock->fl_end - lock->fl_start + 1);
 	cephlock->client = cpu_to_le64(0);
 	cephlock->pid = cpu_to_le64((u64)lock->fl_pid);
-	if (lock->fl_flags & FL_POSIX)
-		cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
-	else
-		cephlock->owner = cpu_to_le64(secure_addr(lock->fl_file));
+	cephlock->owner = cpu_to_le64(secure_addr(lock->fl_owner));
 
 	switch (lock->fl_type) {
 	case F_RDLCK:
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
index 7866cd05a6bb..ead05cc1f447 100644
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -266,7 +266,6 @@ struct ceph_inode_info {
 	struct timespec i_rctime;
 	u64 i_rbytes, i_rfiles, i_rsubdirs;
 	u64 i_files, i_subdirs;
-	u64 i_max_offset;  /* largest readdir offset, set with complete dir */
 
 	struct rb_root i_fragtree;
 	struct mutex i_fragtree_mutex;
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index aadc2b68678b..a22d667f1069 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1737,6 +1737,9 @@ cifs_inode_needs_reval(struct inode *inode)
 	if (cifs_i->time == 0)
 		return true;
 
+	if (!cifs_sb->actimeo)
+		return true;
+
 	if (!time_in_range(jiffies, cifs_i->time,
 				cifs_i->time + cifs_sb->actimeo))
 		return true;
diff --git a/fs/dcache.c b/fs/dcache.c
index 40707d88a945..be2bea834bf4 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -246,16 +246,8 @@ static void __d_free(struct rcu_head *head)
 	kmem_cache_free(dentry_cache, dentry); 
 }
 
-/*
- * no locks, please.
- */
-static void d_free(struct dentry *dentry)
+static void dentry_free(struct dentry *dentry)
 {
-	BUG_ON((int)dentry->d_lockref.count > 0);
-	this_cpu_dec(nr_dentry);
-	if (dentry->d_op && dentry->d_op->d_release)
-		dentry->d_op->d_release(dentry);
-
 	/* if dentry was never visible to RCU, immediate free is OK */
 	if (!(dentry->d_flags & DCACHE_RCUACCESS))
 		__d_free(&dentry->d_u.d_rcu);
@@ -403,56 +395,6 @@ static void dentry_lru_add(struct dentry *dentry)
 		d_lru_add(dentry);
 }
 
-/*
- * Remove a dentry with references from the LRU.
- *
- * If we are on the shrink list, then we can get to try_prune_one_dentry() and
- * lose our last reference through the parent walk. In this case, we need to
- * remove ourselves from the shrink list, not the LRU.
- */
-static void dentry_lru_del(struct dentry *dentry)
-{
-	if (dentry->d_flags & DCACHE_LRU_LIST) {
-		if (dentry->d_flags & DCACHE_SHRINK_LIST)
-			return d_shrink_del(dentry);
-		d_lru_del(dentry);
-	}
-}
-
-/**
- * d_kill - kill dentry and return parent
- * @dentry: dentry to kill
- * @parent: parent dentry
- *
- * The dentry must already be unhashed and removed from the LRU.
- *
- * If this is the root of the dentry tree, return NULL.
- *
- * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by
- * d_kill.
- */
-static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
-	__releases(dentry->d_lock)
-	__releases(parent->d_lock)
-	__releases(dentry->d_inode->i_lock)
-{
-	list_del(&dentry->d_u.d_child);
-	/*
-	 * Inform d_walk() that we are no longer attached to the
-	 * dentry tree
-	 */
-	dentry->d_flags |= DCACHE_DENTRY_KILLED;
-	if (parent)
-		spin_unlock(&parent->d_lock);
-	dentry_iput(dentry);
-	/*
-	 * dentry_iput drops the locks, at which point nobody (except
-	 * transient RCU lookups) can reach this dentry.
-	 */
-	d_free(dentry);
-	return parent;
-}
-
 /**
  * d_drop - drop a dentry
  * @dentry: dentry to drop
@@ -499,37 +441,12 @@ void d_drop(struct dentry *dentry)
 }
 EXPORT_SYMBOL(d_drop);
 
-/*
- * Finish off a dentry we've decided to kill.
- * dentry->d_lock must be held, returns with it unlocked.
- * If ref is non-zero, then decrement the refcount too.
- * Returns dentry requiring refcount drop, or NULL if we're done.
- */
-static struct dentry *
-dentry_kill(struct dentry *dentry, int unlock_on_failure)
-	__releases(dentry->d_lock)
+static void __dentry_kill(struct dentry *dentry)
 {
-	struct inode *inode;
-	struct dentry *parent;
-
-	inode = dentry->d_inode;
-	if (inode && !spin_trylock(&inode->i_lock)) {
-relock:
-		if (unlock_on_failure) {
-			spin_unlock(&dentry->d_lock);
-			cpu_relax();
-		}
-		return dentry; /* try again with same dentry */
-	}
-	if (IS_ROOT(dentry))
-		parent = NULL;
-	else
+	struct dentry *parent = NULL;
+	bool can_free = true;
+	if (!IS_ROOT(dentry))
 		parent = dentry->d_parent;
-	if (parent && !spin_trylock(&parent->d_lock)) {
-		if (inode)
-			spin_unlock(&inode->i_lock);
-		goto relock;
-	}
 
 	/*
 	 * The dentry is now unrecoverably dead to the world.
@@ -543,10 +460,103 @@ relock:
 	if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
 		dentry->d_op->d_prune(dentry);
 
-	dentry_lru_del(dentry);
+	if (dentry->d_flags & DCACHE_LRU_LIST) {
+		if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
+			d_lru_del(dentry);
+	}
 	/* if it was on the hash then remove it */
 	__d_drop(dentry);
-	return d_kill(dentry, parent);
+	list_del(&dentry->d_u.d_child);
+	/*
+	 * Inform d_walk() that we are no longer attached to the
+	 * dentry tree
+	 */
+	dentry->d_flags |= DCACHE_DENTRY_KILLED;
+	if (parent)
+		spin_unlock(&parent->d_lock);
+	dentry_iput(dentry);
+	/*
+	 * dentry_iput drops the locks, at which point nobody (except
+	 * transient RCU lookups) can reach this dentry.
+	 */
+	BUG_ON((int)dentry->d_lockref.count > 0);
+	this_cpu_dec(nr_dentry);
+	if (dentry->d_op && dentry->d_op->d_release)
+		dentry->d_op->d_release(dentry);
+
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+		dentry->d_flags |= DCACHE_MAY_FREE;
+		can_free = false;
+	}
+	spin_unlock(&dentry->d_lock);
+	if (likely(can_free))
+		dentry_free(dentry);
+}
+
+/*
+ * Finish off a dentry we've decided to kill.
+ * dentry->d_lock must be held, returns with it unlocked.
+ * If ref is non-zero, then decrement the refcount too.
+ * Returns dentry requiring refcount drop, or NULL if we're done.
+ */
+static struct dentry *dentry_kill(struct dentry *dentry)
+	__releases(dentry->d_lock)
+{
+	struct inode *inode = dentry->d_inode;
+	struct dentry *parent = NULL;
+
+	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
+		goto failed;
+
+	if (!IS_ROOT(dentry)) {
+		parent = dentry->d_parent;
+		if (unlikely(!spin_trylock(&parent->d_lock))) {
+			if (inode)
+				spin_unlock(&inode->i_lock);
+			goto failed;
+		}
+	}
+
+	__dentry_kill(dentry);
+	return parent;
+
+failed:
+	spin_unlock(&dentry->d_lock);
+	cpu_relax();
+	return dentry; /* try again with same dentry */
+}
+
+static inline struct dentry *lock_parent(struct dentry *dentry)
+{
+	struct dentry *parent = dentry->d_parent;
+	if (IS_ROOT(dentry))
+		return NULL;
+	if (likely(spin_trylock(&parent->d_lock)))
+		return parent;
+	spin_unlock(&dentry->d_lock);
+	rcu_read_lock();
+again:
+	parent = ACCESS_ONCE(dentry->d_parent);
+	spin_lock(&parent->d_lock);
+	/*
+	 * We can't blindly lock dentry until we are sure
+	 * that we won't violate the locking order.
+	 * Any changes of dentry->d_parent must have
+	 * been done with parent->d_lock held, so
+	 * spin_lock() above is enough of a barrier
+	 * for checking if it's still our child.
+	 */
+	if (unlikely(parent != dentry->d_parent)) {
+		spin_unlock(&parent->d_lock);
+		goto again;
+	}
+	rcu_read_unlock();
+	if (parent != dentry)
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+	else
+		parent = NULL;
+	return parent;
 }
 
 /* 
@@ -602,7 +612,7 @@ repeat:
 	return;
 
 kill_it:
-	dentry = dentry_kill(dentry, 1);
+	dentry = dentry_kill(dentry);
 	if (dentry)
 		goto repeat;
 }
@@ -815,64 +825,15 @@ restart:
 }
 EXPORT_SYMBOL(d_prune_aliases);
 
-/*
- * Try to throw away a dentry - free the inode, dput the parent.
- * Requires dentry->d_lock is held, and dentry->d_count == 0.
- * Releases dentry->d_lock.
- *
- * This may fail if locks cannot be acquired no problem, just try again.
- */
-static struct dentry * try_prune_one_dentry(struct dentry *dentry)
-	__releases(dentry->d_lock)
-{
-	struct dentry *parent;
-
-	parent = dentry_kill(dentry, 0);
-	/*
-	 * If dentry_kill returns NULL, we have nothing more to do.
-	 * if it returns the same dentry, trylocks failed. In either
-	 * case, just loop again.
-	 *
-	 * Otherwise, we need to prune ancestors too. This is necessary
-	 * to prevent quadratic behavior of shrink_dcache_parent(), but
-	 * is also expected to be beneficial in reducing dentry cache
-	 * fragmentation.
-	 */
-	if (!parent)
-		return NULL;
-	if (parent == dentry)
-		return dentry;
-
-	/* Prune ancestors. */
-	dentry = parent;
-	while (dentry) {
-		if (lockref_put_or_lock(&dentry->d_lockref))
-			return NULL;
-		dentry = dentry_kill(dentry, 1);
-	}
-	return NULL;
-}
-
 static void shrink_dentry_list(struct list_head *list)
 {
-	struct dentry *dentry;
-
-	rcu_read_lock();
-	for (;;) {
-		dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
-		if (&dentry->d_lru == list)
-			break; /* empty */
+	struct dentry *dentry, *parent;
 
-		/*
-		 * Get the dentry lock, and re-verify that the dentry is
-		 * this on the shrinking list. If it is, we know that
-		 * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set.
-		 */
+	while (!list_empty(list)) {
+		struct inode *inode;
+		dentry = list_entry(list->prev, struct dentry, d_lru);
 		spin_lock(&dentry->d_lock);
-		if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
-			spin_unlock(&dentry->d_lock);
-			continue;
-		}
+		parent = lock_parent(dentry);
 
 		/*
 		 * The dispose list is isolated and dentries are not accounted
@@ -885,30 +846,63 @@ static void shrink_dentry_list(struct list_head *list)
 		 * We found an inuse dentry which was not removed from
 		 * the LRU because of laziness during lookup. Do not free it.
 		 */
-		if (dentry->d_lockref.count) {
+		if ((int)dentry->d_lockref.count > 0) {
 			spin_unlock(&dentry->d_lock);
+			if (parent)
+				spin_unlock(&parent->d_lock);
 			continue;
 		}
-		rcu_read_unlock();
 
-		/*
-		 * If 'try_to_prune()' returns a dentry, it will
-		 * be the same one we passed in, and d_lock will
-		 * have been held the whole time, so it will not
-		 * have been added to any other lists. We failed
-		 * to get the inode lock.
-		 *
-		 * We just add it back to the shrink list.
-		 */
-		dentry = try_prune_one_dentry(dentry);
 
-		rcu_read_lock();
-		if (dentry) {
+		if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) {
+			bool can_free = dentry->d_flags & DCACHE_MAY_FREE;
+			spin_unlock(&dentry->d_lock);
+			if (parent)
+				spin_unlock(&parent->d_lock);
+			if (can_free)
+				dentry_free(dentry);
+			continue;
+		}
+
+		inode = dentry->d_inode;
+		if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
 			d_shrink_add(dentry, list);
 			spin_unlock(&dentry->d_lock);
+			if (parent)
+				spin_unlock(&parent->d_lock);
+			continue;
+		}
+
+		__dentry_kill(dentry);
+
+		/*
+		 * We need to prune ancestors too. This is necessary to prevent
+		 * quadratic behavior of shrink_dcache_parent(), but is also
+		 * expected to be beneficial in reducing dentry cache
+		 * fragmentation.
+		 */
+		dentry = parent;
+		while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) {
+			parent = lock_parent(dentry);
+			if (dentry->d_lockref.count != 1) {
+				dentry->d_lockref.count--;
+				spin_unlock(&dentry->d_lock);
+				if (parent)
+					spin_unlock(&parent->d_lock);
+				break;
+			}
+			inode = dentry->d_inode;	/* can't be NULL */
+			if (unlikely(!spin_trylock(&inode->i_lock))) {
+				spin_unlock(&dentry->d_lock);
+				if (parent)
+					spin_unlock(&parent->d_lock);
+				cpu_relax();
+				continue;
+			}
+			__dentry_kill(dentry);
+			dentry = parent;
 		}
 	}
-	rcu_read_unlock();
 }
 
 static enum lru_status
@@ -1261,34 +1255,23 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
 	if (data->start == dentry)
 		goto out;
 
-	/*
-	 * move only zero ref count dentries to the dispose list.
-	 *
-	 * Those which are presently on the shrink list, being processed
-	 * by shrink_dentry_list(), shouldn't be moved.  Otherwise the
-	 * loop in shrink_dcache_parent() might not make any progress
-	 * and loop forever.
-	 */
-	if (dentry->d_lockref.count) {
-		dentry_lru_del(dentry);
-	} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
-		/*
-		 * We can't use d_lru_shrink_move() because we
-		 * need to get the global LRU lock and do the
-		 * LRU accounting.
-		 */
-		d_lru_del(dentry);
-		d_shrink_add(dentry, &data->dispose);
+	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
 		data->found++;
-		ret = D_WALK_NORETRY;
+	} else {
+		if (dentry->d_flags & DCACHE_LRU_LIST)
+			d_lru_del(dentry);
+		if (!dentry->d_lockref.count) {
+			d_shrink_add(dentry, &data->dispose);
+			data->found++;
+		}
 	}
 	/*
 	 * We can return to the caller if we have found some (this
 	 * ensures forward progress). We'll be coming back to find
 	 * the rest.
 	 */
-	if (data->found && need_resched())
-		ret = D_WALK_QUIT;
+	if (!list_empty(&data->dispose))
+		ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
 out:
 	return ret;
 }
@@ -1318,45 +1301,35 @@ void shrink_dcache_parent(struct dentry *parent)
 }
 EXPORT_SYMBOL(shrink_dcache_parent);
 
-static enum d_walk_ret umount_collect(void *_data, struct dentry *dentry)
+static enum d_walk_ret umount_check(void *_data, struct dentry *dentry)
 {
-	struct select_data *data = _data;
-	enum d_walk_ret ret = D_WALK_CONTINUE;
+	/* it has busy descendents; complain about those instead */
+	if (!list_empty(&dentry->d_subdirs))
+		return D_WALK_CONTINUE;
 
-	if (dentry->d_lockref.count) {
-		dentry_lru_del(dentry);
-		if (likely(!list_empty(&dentry->d_subdirs)))
-			goto out;
-		if (dentry == data->start && dentry->d_lockref.count == 1)
-			goto out;
-		printk(KERN_ERR
-		       "BUG: Dentry %p{i=%lx,n=%s}"
-		       " still in use (%d)"
-		       " [unmount of %s %s]\n",
+	/* root with refcount 1 is fine */
+	if (dentry == _data && dentry->d_lockref.count == 1)
+		return D_WALK_CONTINUE;
+
+	printk(KERN_ERR "BUG: Dentry %p{i=%lx,n=%pd} "
+			" still in use (%d) [unmount of %s %s]\n",
 		       dentry,
 		       dentry->d_inode ?
 		       dentry->d_inode->i_ino : 0UL,
-		       dentry->d_name.name,
+		       dentry,
 		       dentry->d_lockref.count,
 		       dentry->d_sb->s_type->name,
 		       dentry->d_sb->s_id);
-		BUG();
-	} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
-		/*
-		 * We can't use d_lru_shrink_move() because we
-		 * need to get the global LRU lock and do the
-		 * LRU accounting.
-		 */
-		if (dentry->d_flags & DCACHE_LRU_LIST)
-			d_lru_del(dentry);
-		d_shrink_add(dentry, &data->dispose);
-		data->found++;
-		ret = D_WALK_NORETRY;
-	}
-out:
-	if (data->found && need_resched())
-		ret = D_WALK_QUIT;
-	return ret;
+	WARN_ON(1);
+	return D_WALK_CONTINUE;
+}
+
+static void do_one_tree(struct dentry *dentry)
+{
+	shrink_dcache_parent(dentry);
+	d_walk(dentry, dentry, umount_check, NULL);
+	d_drop(dentry);
+	dput(dentry);
 }
 
 /*
@@ -1366,40 +1339,15 @@ void shrink_dcache_for_umount(struct super_block *sb)
 {
 	struct dentry *dentry;
 
-	if (down_read_trylock(&sb->s_umount))
-		BUG();
+	WARN(down_read_trylock(&sb->s_umount), "s_umount should've been locked");
 
 	dentry = sb->s_root;
 	sb->s_root = NULL;
-	for (;;) {
-		struct select_data data;
-
-		INIT_LIST_HEAD(&data.dispose);
-		data.start = dentry;
-		data.found = 0;
-
-		d_walk(dentry, &data, umount_collect, NULL);
-		if (!data.found)
-			break;
-
-		shrink_dentry_list(&data.dispose);
-		cond_resched();
-	}
-	d_drop(dentry);
-	dput(dentry);
+	do_one_tree(dentry);
 
 	while (!hlist_bl_empty(&sb->s_anon)) {
-		struct select_data data;
-		dentry = hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash);
-
-		INIT_LIST_HEAD(&data.dispose);
-		data.start = NULL;
-		data.found = 0;
-
-		d_walk(dentry, &data, umount_collect, NULL);
-		if (data.found)
-			shrink_dentry_list(&data.dispose);
-		cond_resched();
+		dentry = dget(hlist_bl_entry(hlist_bl_first(&sb->s_anon), struct dentry, d_hash));
+		do_one_tree(dentry);
 	}
 }
 
@@ -1647,8 +1595,7 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode)
 	unsigned add_flags = d_flags_for_inode(inode);
 
 	spin_lock(&dentry->d_lock);
-	dentry->d_flags &= ~DCACHE_ENTRY_TYPE;
-	dentry->d_flags |= add_flags;
+	__d_set_type(dentry, add_flags);
 	if (inode)
 		hlist_add_head(&dentry->d_alias, &inode->i_dentry);
 	dentry->d_inode = inode;
diff --git a/fs/exec.c b/fs/exec.c
index 476f3ebf437e..238b7aa26f68 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -657,10 +657,10 @@ int setup_arg_pages(struct linux_binprm *bprm,
 	unsigned long rlim_stack;
 
 #ifdef CONFIG_STACK_GROWSUP
-	/* Limit stack size to 1GB */
+	/* Limit stack size */
 	stack_base = rlimit_max(RLIMIT_STACK);
-	if (stack_base > (1 << 30))
-		stack_base = 1 << 30;
+	if (stack_base > STACK_SIZE_MAX)
+		stack_base = STACK_SIZE_MAX;
 
 	/* Make sure we didn't let the argument array grow too large. */
 	if (vma->vm_end - vma->vm_start > stack_base)
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index f3b84cd9de56..08b3c116915b 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -42,7 +42,7 @@ int ext4_resize_begin(struct super_block *sb)
 void ext4_resize_end(struct super_block *sb)
 {
 	clear_bit_unlock(EXT4_RESIZING, &EXT4_SB(sb)->s_resize_flags);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 }
 
 static ext4_group_t ext4_meta_bg_first_group(struct super_block *sb,
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index a0b0855d00a9..205e0d5d5307 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -348,7 +348,7 @@ int __init fuse_ctl_init(void)
 	return register_filesystem(&fuse_ctl_fs_type);
 }
 
-void fuse_ctl_cleanup(void)
+void __exit fuse_ctl_cleanup(void)
 {
 	unregister_filesystem(&fuse_ctl_fs_type);
 }
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 5b4e035b364c..42198359fa1b 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -679,6 +679,14 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
 	return create_new_entry(fc, req, dir, entry, S_IFLNK);
 }
 
+static inline void fuse_update_ctime(struct inode *inode)
+{
+	if (!IS_NOCMTIME(inode)) {
+		inode->i_ctime = current_fs_time(inode->i_sb);
+		mark_inode_dirty_sync(inode);
+	}
+}
+
 static int fuse_unlink(struct inode *dir, struct dentry *entry)
 {
 	int err;
@@ -713,6 +721,7 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
 		fuse_invalidate_attr(inode);
 		fuse_invalidate_attr(dir);
 		fuse_invalidate_entry_cache(entry);
+		fuse_update_ctime(inode);
 	} else if (err == -EINTR)
 		fuse_invalidate_entry(entry);
 	return err;
@@ -743,23 +752,26 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
 	return err;
 }
 
-static int fuse_rename(struct inode *olddir, struct dentry *oldent,
-		       struct inode *newdir, struct dentry *newent)
+static int fuse_rename_common(struct inode *olddir, struct dentry *oldent,
+			      struct inode *newdir, struct dentry *newent,
+			      unsigned int flags, int opcode, size_t argsize)
 {
 	int err;
-	struct fuse_rename_in inarg;
+	struct fuse_rename2_in inarg;
 	struct fuse_conn *fc = get_fuse_conn(olddir);
-	struct fuse_req *req = fuse_get_req_nopages(fc);
+	struct fuse_req *req;
 
+	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req))
 		return PTR_ERR(req);
 
-	memset(&inarg, 0, sizeof(inarg));
+	memset(&inarg, 0, argsize);
 	inarg.newdir = get_node_id(newdir);
-	req->in.h.opcode = FUSE_RENAME;
+	inarg.flags = flags;
+	req->in.h.opcode = opcode;
 	req->in.h.nodeid = get_node_id(olddir);
 	req->in.numargs = 3;
-	req->in.args[0].size = sizeof(inarg);
+	req->in.args[0].size = argsize;
 	req->in.args[0].value = &inarg;
 	req->in.args[1].size = oldent->d_name.len + 1;
 	req->in.args[1].value = oldent->d_name.name;
@@ -771,15 +783,22 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
 	if (!err) {
 		/* ctime changes */
 		fuse_invalidate_attr(oldent->d_inode);
+		fuse_update_ctime(oldent->d_inode);
+
+		if (flags & RENAME_EXCHANGE) {
+			fuse_invalidate_attr(newent->d_inode);
+			fuse_update_ctime(newent->d_inode);
+		}
 
 		fuse_invalidate_attr(olddir);
 		if (olddir != newdir)
 			fuse_invalidate_attr(newdir);
 
 		/* newent will end up negative */
-		if (newent->d_inode) {
+		if (!(flags & RENAME_EXCHANGE) && newent->d_inode) {
 			fuse_invalidate_attr(newent->d_inode);
 			fuse_invalidate_entry_cache(newent);
+			fuse_update_ctime(newent->d_inode);
 		}
 	} else if (err == -EINTR) {
 		/* If request was interrupted, DEITY only knows if the
@@ -795,6 +814,36 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
 	return err;
 }
 
+static int fuse_rename(struct inode *olddir, struct dentry *oldent,
+		       struct inode *newdir, struct dentry *newent)
+{
+	return fuse_rename_common(olddir, oldent, newdir, newent, 0,
+				  FUSE_RENAME, sizeof(struct fuse_rename_in));
+}
+
+static int fuse_rename2(struct inode *olddir, struct dentry *oldent,
+			struct inode *newdir, struct dentry *newent,
+			unsigned int flags)
+{
+	struct fuse_conn *fc = get_fuse_conn(olddir);
+	int err;
+
+	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
+		return -EINVAL;
+
+	if (fc->no_rename2 || fc->minor < 23)
+		return -EINVAL;
+
+	err = fuse_rename_common(olddir, oldent, newdir, newent, flags,
+				 FUSE_RENAME2, sizeof(struct fuse_rename2_in));
+	if (err == -ENOSYS) {
+		fc->no_rename2 = 1;
+		err = -EINVAL;
+	}
+	return err;
+
+}
+
 static int fuse_link(struct dentry *entry, struct inode *newdir,
 		     struct dentry *newent)
 {
@@ -829,6 +878,7 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
 		inc_nlink(inode);
 		spin_unlock(&fc->lock);
 		fuse_invalidate_attr(inode);
+		fuse_update_ctime(inode);
 	} else if (err == -EINTR) {
 		fuse_invalidate_attr(inode);
 	}
@@ -846,6 +896,8 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
 		attr->size = i_size_read(inode);
 		attr->mtime = inode->i_mtime.tv_sec;
 		attr->mtimensec = inode->i_mtime.tv_nsec;
+		attr->ctime = inode->i_ctime.tv_sec;
+		attr->ctimensec = inode->i_ctime.tv_nsec;
 	}
 
 	stat->dev = inode->i_sb->s_dev;
@@ -1504,7 +1556,7 @@ static bool update_mtime(unsigned ivalid, bool trust_local_mtime)
 }
 
 static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
-			   bool trust_local_mtime)
+			   bool trust_local_cmtime)
 {
 	unsigned ivalid = iattr->ia_valid;
 
@@ -1523,13 +1575,18 @@ static void iattr_to_fattr(struct iattr *iattr, struct fuse_setattr_in *arg,
 		if (!(ivalid & ATTR_ATIME_SET))
 			arg->valid |= FATTR_ATIME_NOW;
 	}
-	if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_mtime)) {
+	if ((ivalid & ATTR_MTIME) && update_mtime(ivalid, trust_local_cmtime)) {
 		arg->valid |= FATTR_MTIME;
 		arg->mtime = iattr->ia_mtime.tv_sec;
 		arg->mtimensec = iattr->ia_mtime.tv_nsec;
-		if (!(ivalid & ATTR_MTIME_SET) && !trust_local_mtime)
+		if (!(ivalid & ATTR_MTIME_SET) && !trust_local_cmtime)
 			arg->valid |= FATTR_MTIME_NOW;
 	}
+	if ((ivalid & ATTR_CTIME) && trust_local_cmtime) {
+		arg->valid |= FATTR_CTIME;
+		arg->ctime = iattr->ia_ctime.tv_sec;
+		arg->ctimensec = iattr->ia_ctime.tv_nsec;
+	}
 }
 
 /*
@@ -1597,39 +1654,38 @@ static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req,
 /*
  * Flush inode->i_mtime to the server
  */
-int fuse_flush_mtime(struct file *file, bool nofail)
+int fuse_flush_times(struct inode *inode, struct fuse_file *ff)
 {
-	struct inode *inode = file->f_mapping->host;
-	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req = NULL;
+	struct fuse_req *req;
 	struct fuse_setattr_in inarg;
 	struct fuse_attr_out outarg;
 	int err;
 
-	if (nofail) {
-		req = fuse_get_req_nofail_nopages(fc, file);
-	} else {
-		req = fuse_get_req_nopages(fc);
-		if (IS_ERR(req))
-			return PTR_ERR(req);
-	}
+	req = fuse_get_req_nopages(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	memset(&outarg, 0, sizeof(outarg));
 
-	inarg.valid |= FATTR_MTIME;
+	inarg.valid = FATTR_MTIME;
 	inarg.mtime = inode->i_mtime.tv_sec;
 	inarg.mtimensec = inode->i_mtime.tv_nsec;
-
+	if (fc->minor >= 23) {
+		inarg.valid |= FATTR_CTIME;
+		inarg.ctime = inode->i_ctime.tv_sec;
+		inarg.ctimensec = inode->i_ctime.tv_nsec;
+	}
+	if (ff) {
+		inarg.valid |= FATTR_FH;
+		inarg.fh = ff->fh;
+	}
 	fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
 	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
 
-	if (!err)
-		clear_bit(FUSE_I_MTIME_DIRTY, &fi->state);
-
 	return err;
 }
 
@@ -1653,7 +1709,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 	bool is_wb = fc->writeback_cache;
 	loff_t oldsize;
 	int err;
-	bool trust_local_mtime = is_wb && S_ISREG(inode->i_mode);
+	bool trust_local_cmtime = is_wb && S_ISREG(inode->i_mode);
 
 	if (!(fc->flags & FUSE_DEFAULT_PERMISSIONS))
 		attr->ia_valid |= ATTR_FORCE;
@@ -1678,11 +1734,13 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 	if (is_truncate) {
 		fuse_set_nowrite(inode);
 		set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
+		if (trust_local_cmtime && attr->ia_size != inode->i_size)
+			attr->ia_valid |= ATTR_MTIME | ATTR_CTIME;
 	}
 
 	memset(&inarg, 0, sizeof(inarg));
 	memset(&outarg, 0, sizeof(outarg));
-	iattr_to_fattr(attr, &inarg, trust_local_mtime);
+	iattr_to_fattr(attr, &inarg, trust_local_cmtime);
 	if (file) {
 		struct fuse_file *ff = file->private_data;
 		inarg.valid |= FATTR_FH;
@@ -1711,9 +1769,12 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 
 	spin_lock(&fc->lock);
 	/* the kernel maintains i_mtime locally */
-	if (trust_local_mtime && (attr->ia_valid & ATTR_MTIME)) {
-		inode->i_mtime = attr->ia_mtime;
-		clear_bit(FUSE_I_MTIME_DIRTY, &fi->state);
+	if (trust_local_cmtime) {
+		if (attr->ia_valid & ATTR_MTIME)
+			inode->i_mtime = attr->ia_mtime;
+		if (attr->ia_valid & ATTR_CTIME)
+			inode->i_ctime = attr->ia_ctime;
+		/* FIXME: clear I_DIRTY_SYNC? */
 	}
 
 	fuse_change_attributes_common(inode, &outarg.attr,
@@ -1810,8 +1871,10 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
 		fc->no_setxattr = 1;
 		err = -EOPNOTSUPP;
 	}
-	if (!err)
+	if (!err) {
 		fuse_invalidate_attr(inode);
+		fuse_update_ctime(inode);
+	}
 	return err;
 }
 
@@ -1941,20 +2004,11 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
 		fc->no_removexattr = 1;
 		err = -EOPNOTSUPP;
 	}
-	if (!err)
+	if (!err) {
 		fuse_invalidate_attr(inode);
-	return err;
-}
-
-static int fuse_update_time(struct inode *inode, struct timespec *now,
-			    int flags)
-{
-	if (flags & S_MTIME) {
-		inode->i_mtime = *now;
-		set_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state);
-		BUG_ON(!S_ISREG(inode->i_mode));
+		fuse_update_ctime(inode);
 	}
-	return 0;
+	return err;
 }
 
 static const struct inode_operations fuse_dir_inode_operations = {
@@ -1964,6 +2018,7 @@ static const struct inode_operations fuse_dir_inode_operations = {
 	.unlink		= fuse_unlink,
 	.rmdir		= fuse_rmdir,
 	.rename		= fuse_rename,
+	.rename2	= fuse_rename2,
 	.link		= fuse_link,
 	.setattr	= fuse_setattr,
 	.create		= fuse_create,
@@ -1996,7 +2051,6 @@ static const struct inode_operations fuse_common_inode_operations = {
 	.getxattr	= fuse_getxattr,
 	.listxattr	= fuse_listxattr,
 	.removexattr	= fuse_removexattr,
-	.update_time	= fuse_update_time,
 };
 
 static const struct inode_operations fuse_symlink_inode_operations = {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 13f8bdec5110..f680d2c44e97 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -223,6 +223,8 @@ void fuse_finish_open(struct inode *inode, struct file *file)
 		i_size_write(inode, 0);
 		spin_unlock(&fc->lock);
 		fuse_invalidate_attr(inode);
+		if (fc->writeback_cache)
+			file_update_time(file);
 	}
 	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
 		fuse_link_write_file(file);
@@ -232,18 +234,26 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	int err;
+	bool lock_inode = (file->f_flags & O_TRUNC) &&
+			  fc->atomic_o_trunc &&
+			  fc->writeback_cache;
 
 	err = generic_file_open(inode, file);
 	if (err)
 		return err;
 
+	if (lock_inode)
+		mutex_lock(&inode->i_mutex);
+
 	err = fuse_do_open(fc, get_node_id(inode), file, isdir);
-	if (err)
-		return err;
 
-	fuse_finish_open(inode, file);
+	if (!err)
+		fuse_finish_open(inode, file);
 
-	return 0;
+	if (lock_inode)
+		mutex_unlock(&inode->i_mutex);
+
+	return err;
 }
 
 static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
@@ -314,10 +324,7 @@ static int fuse_release(struct inode *inode, struct file *file)
 
 	/* see fuse_vma_close() for !writeback_cache case */
 	if (fc->writeback_cache)
-		filemap_write_and_wait(file->f_mapping);
-
-	if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state))
-		fuse_flush_mtime(file, true);
+		write_inode_now(inode, 1);
 
 	fuse_release_common(file, FUSE_RELEASE);
 
@@ -439,7 +446,7 @@ static int fuse_flush(struct file *file, fl_owner_t id)
 	if (fc->no_flush)
 		return 0;
 
-	err = filemap_write_and_wait(file->f_mapping);
+	err = write_inode_now(inode, 1);
 	if (err)
 		return err;
 
@@ -480,13 +487,6 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
 	if (is_bad_inode(inode))
 		return -EIO;
 
-	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
-	if (err)
-		return err;
-
-	if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
-		return 0;
-
 	mutex_lock(&inode->i_mutex);
 
 	/*
@@ -494,17 +494,17 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
 	 * wait for all outstanding writes, before sending the FSYNC
 	 * request.
 	 */
-	err = write_inode_now(inode, 0);
+	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
 	if (err)
 		goto out;
 
 	fuse_sync_writes(inode);
+	err = sync_inode_metadata(inode, 1);
+	if (err)
+		goto out;
 
-	if (test_bit(FUSE_I_MTIME_DIRTY, &get_fuse_inode(inode)->state)) {
-		int err = fuse_flush_mtime(file, false);
-		if (err)
-			goto out;
-	}
+	if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
+		goto out;
 
 	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req)) {
@@ -1659,13 +1659,13 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
 	fuse_writepage_free(fc, req);
 }
 
-static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
-					     struct fuse_inode *fi)
+static struct fuse_file *__fuse_write_file_get(struct fuse_conn *fc,
+					       struct fuse_inode *fi)
 {
 	struct fuse_file *ff = NULL;
 
 	spin_lock(&fc->lock);
-	if (!WARN_ON(list_empty(&fi->write_files))) {
+	if (!list_empty(&fi->write_files)) {
 		ff = list_entry(fi->write_files.next, struct fuse_file,
 				write_entry);
 		fuse_file_get(ff);
@@ -1675,6 +1675,29 @@ static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
 	return ff;
 }
 
+static struct fuse_file *fuse_write_file_get(struct fuse_conn *fc,
+					     struct fuse_inode *fi)
+{
+	struct fuse_file *ff = __fuse_write_file_get(fc, fi);
+	WARN_ON(!ff);
+	return ff;
+}
+
+int fuse_write_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_file *ff;
+	int err;
+
+	ff = __fuse_write_file_get(fc, fi);
+	err = fuse_flush_times(inode, ff);
+	if (ff)
+		fuse_file_put(ff, 0);
+
+	return err;
+}
+
 static int fuse_writepage_locked(struct page *page)
 {
 	struct address_space *mapping = page->mapping;
@@ -2281,7 +2304,6 @@ static int fuse_file_flock(struct file *file, int cmd, struct file_lock *fl)
 		struct fuse_file *ff = file->private_data;
 
 		/* emulate flock with POSIX locks */
-		fl->fl_owner = (fl_owner_t) file;
 		ff->flock = true;
 		err = fuse_setlk(file, fl, 1);
 	}
@@ -2972,6 +2994,9 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
 			   (mode & FALLOC_FL_PUNCH_HOLE);
 
+	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
+		return -EOPNOTSUPP;
+
 	if (fc->no_fallocate)
 		return -EOPNOTSUPP;
 
@@ -3017,12 +3042,8 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 	if (!(mode & FALLOC_FL_KEEP_SIZE)) {
 		bool changed = fuse_write_update_size(inode, offset + length);
 
-		if (changed && fc->writeback_cache) {
-			struct fuse_inode *fi = get_fuse_inode(inode);
-
-			inode->i_mtime = current_fs_time(inode->i_sb);
-			set_bit(FUSE_I_MTIME_DIRTY, &fi->state);
-		}
+		if (changed && fc->writeback_cache)
+			file_update_time(file);
 	}
 
 	if (mode & FALLOC_FL_PUNCH_HOLE)
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index a257ed8ebee6..7aa5c75e0de1 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -119,8 +119,6 @@ enum {
 	FUSE_I_INIT_RDPLUS,
 	/** An operation changing file size is in progress  */
 	FUSE_I_SIZE_UNSTABLE,
-	/** i_mtime has been updated locally; a flush to userspace needed */
-	FUSE_I_MTIME_DIRTY,
 };
 
 struct fuse_conn;
@@ -544,6 +542,9 @@ struct fuse_conn {
 	/** Is fallocate not implemented by fs? */
 	unsigned no_fallocate:1;
 
+	/** Is rename with flags implemented by fs? */
+	unsigned no_rename2:1;
+
 	/** Use enhanced/automatic page cache invalidation. */
 	unsigned auto_inval_data:1;
 
@@ -725,7 +726,7 @@ int fuse_dev_init(void);
 void fuse_dev_cleanup(void);
 
 int fuse_ctl_init(void);
-void fuse_ctl_cleanup(void);
+void __exit fuse_ctl_cleanup(void);
 
 /**
  * Allocate a request
@@ -891,7 +892,8 @@ int fuse_dev_release(struct inode *inode, struct file *file);
 
 bool fuse_write_update_size(struct inode *inode, loff_t pos);
 
-int fuse_flush_mtime(struct file *file, bool nofail);
+int fuse_flush_times(struct inode *inode, struct fuse_file *ff);
+int fuse_write_inode(struct inode *inode, struct writeback_control *wbc);
 
 int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 		    struct file *file);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 8d611696fcad..754dcf23de8a 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -175,9 +175,9 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
 	if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) {
 		inode->i_mtime.tv_sec   = attr->mtime;
 		inode->i_mtime.tv_nsec  = attr->mtimensec;
+		inode->i_ctime.tv_sec   = attr->ctime;
+		inode->i_ctime.tv_nsec  = attr->ctimensec;
 	}
-	inode->i_ctime.tv_sec   = attr->ctime;
-	inode->i_ctime.tv_nsec  = attr->ctimensec;
 
 	if (attr->blksize != 0)
 		inode->i_blkbits = ilog2(attr->blksize);
@@ -256,6 +256,8 @@ static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
 	inode->i_size = attr->size;
 	inode->i_mtime.tv_sec  = attr->mtime;
 	inode->i_mtime.tv_nsec = attr->mtimensec;
+	inode->i_ctime.tv_sec  = attr->ctime;
+	inode->i_ctime.tv_nsec = attr->ctimensec;
 	if (S_ISREG(inode->i_mode)) {
 		fuse_init_common(inode);
 		fuse_init_file_inode(inode);
@@ -303,7 +305,7 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 
 	if ((inode->i_state & I_NEW)) {
 		inode->i_flags |= S_NOATIME;
-		if (!fc->writeback_cache || !S_ISREG(inode->i_mode))
+		if (!fc->writeback_cache || !S_ISREG(attr->mode))
 			inode->i_flags |= S_NOCMTIME;
 		inode->i_generation = generation;
 		inode->i_data.backing_dev_info = &fc->bdi;
@@ -788,6 +790,7 @@ static const struct super_operations fuse_super_operations = {
 	.alloc_inode    = fuse_alloc_inode,
 	.destroy_inode  = fuse_destroy_inode,
 	.evict_inode	= fuse_evict_inode,
+	.write_inode	= fuse_write_inode,
 	.drop_inode	= generic_delete_inode,
 	.remount_fs	= fuse_remount_fs,
 	.put_super	= fuse_put_super,
@@ -890,6 +893,11 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 				fc->async_dio = 1;
 			if (arg->flags & FUSE_WRITEBACK_CACHE)
 				fc->writeback_cache = 1;
+			if (arg->time_gran && arg->time_gran <= 1000000000)
+				fc->sb->s_time_gran = arg->time_gran;
+			else
+				fc->sb->s_time_gran = 1000000000;
+
 		} else {
 			ra_pages = fc->max_read / PAGE_CACHE_SIZE;
 			fc->no_lock = 1;
@@ -996,7 +1004,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (sb->s_flags & MS_MANDLOCK)
 		goto err;
 
-	sb->s_flags &= ~MS_NOSEC;
+	sb->s_flags &= ~(MS_NOSEC | MS_I_VERSION);
 
 	if (!parse_fuse_opt((char *) data, &d, is_bdev))
 		goto err;
diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c
index ce62dcac90b6..5a49b037da81 100644
--- a/fs/gfs2/aops.c
+++ b/fs/gfs2/aops.c
@@ -431,7 +431,7 @@ static int gfs2_jdata_writepages(struct address_space *mapping,
 
 	ret = gfs2_write_cache_jdata(mapping, wbc);
 	if (ret == 0 && wbc->sync_mode == WB_SYNC_ALL) {
-		gfs2_log_flush(sdp, ip->i_gl);
+		gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
 		ret = gfs2_write_cache_jdata(mapping, wbc);
 	}
 	return ret;
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index c62d4b9f51dc..e6ee5b6e8d99 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -707,7 +707,7 @@ int gfs2_extent_map(struct inode *inode, u64 lblock, int *new, u64 *dblock, unsi
  * @top: The first pointer in the buffer
  * @bottom: One more than the last pointer
  * @height: the height this buffer is at
- * @data: a pointer to a struct strip_mine
+ * @sm: a pointer to a struct strip_mine
  *
  * Returns: errno
  */
@@ -992,6 +992,8 @@ unlock:
 	return err;
 }
 
+#define GFS2_JTRUNC_REVOKES 8192
+
 /**
  * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
  * @inode: The inode being truncated
@@ -1003,8 +1005,6 @@ unlock:
  * if the number of pages being truncated gets too large.
  */
 
-#define GFS2_JTRUNC_REVOKES 8192
-
 static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
 {
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -1348,7 +1348,7 @@ void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
  * gfs2_add_jextent - Add or merge a new extent to extent cache
  * @jd: The journal descriptor
  * @lblock: The logical block at start of new extent
- * @pblock: The physical block at start of new extent
+ * @dblock: The physical block at start of new extent
  * @blocks: Size of extent in fs blocks
  *
  * Returns: 0 on success or -ENOMEM
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 80d67253623c..6ab0cfb2e891 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -203,9 +203,9 @@ void gfs2_set_inode_flags(struct inode *inode)
 			     GFS2_DIF_INHERIT_JDATA)
 
 /**
- * gfs2_set_flags - set flags on an inode
- * @inode: The inode
- * @flags: The flags to set
+ * do_gfs2_set_flags - set flags on an inode
+ * @filp: file pointer
+ * @reqflags: The flags to set
  * @mask: Indicates which flags are valid
  *
  */
@@ -256,7 +256,7 @@ static int do_gfs2_set_flags(struct file *filp, u32 reqflags, u32 mask)
 	}
 	if ((flags ^ new_flags) & GFS2_DIF_JDATA) {
 		if (flags & GFS2_DIF_JDATA)
-			gfs2_log_flush(sdp, ip->i_gl);
+			gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
 		error = filemap_fdatawrite(inode->i_mapping);
 		if (error)
 			goto out;
@@ -318,7 +318,7 @@ static long gfs2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 
 /**
  * gfs2_size_hint - Give a hint to the size of a write request
- * @file: The struct file
+ * @filep: The struct file
  * @offset: The file offset of the write
  * @size: The length of the write
  *
@@ -371,7 +371,7 @@ static int gfs2_allocate_page_backing(struct page *page)
 /**
  * gfs2_page_mkwrite - Make a shared, mmap()ed, page writable
  * @vma: The virtual memory area
- * @page: The page which is about to become writable
+ * @vmf: The virtual memory fault containing the page to become writable
  *
  * When the page becomes writable, we need to ensure that we have
  * blocks allocated on disk to back that page.
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index aec7f73832f0..c355f7320e44 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -277,7 +277,7 @@ static inline int may_grant(const struct gfs2_glock *gl, const struct gfs2_holde
 static void gfs2_holder_wake(struct gfs2_holder *gh)
 {
 	clear_bit(HIF_WAIT, &gh->gh_iflags);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(&gh->gh_iflags, HIF_WAIT);
 }
 
@@ -411,7 +411,7 @@ static void gfs2_demote_wake(struct gfs2_glock *gl)
 {
 	gl->gl_demote_state = LM_ST_EXCLUSIVE;
 	clear_bit(GLF_DEMOTE, &gl->gl_flags);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(&gl->gl_flags, GLF_DEMOTE);
 }
 
@@ -620,7 +620,7 @@ out:
 
 out_sched:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	gl->gl_lockref.count++;
 	if (queue_delayed_work(glock_workqueue, &gl->gl_work, 0) == 0)
 		gl->gl_lockref.count--;
@@ -628,7 +628,7 @@ out_sched:
 
 out_unlock:
 	clear_bit(GLF_LOCK, &gl->gl_flags);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	return;
 }
 
diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c
index 54b66809e818..fc1100781bbc 100644
--- a/fs/gfs2/glops.c
+++ b/fs/gfs2/glops.c
@@ -89,18 +89,23 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl)
 	if (!tr.tr_revokes)
 		return;
 
-	/* A shortened, inline version of gfs2_trans_begin() */
+	/* A shortened, inline version of gfs2_trans_begin()
+         * tr->alloced is not set since the transaction structure is
+         * on the stack */
 	tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64));
 	tr.tr_ip = (unsigned long)__builtin_return_address(0);
 	sb_start_intwrite(sdp->sd_vfs);
-	gfs2_log_reserve(sdp, tr.tr_reserved);
+	if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) {
+		sb_end_intwrite(sdp->sd_vfs);
+		return;
+	}
 	WARN_ON_ONCE(current->journal_info);
 	current->journal_info = &tr;
 
 	__gfs2_ail_flush(gl, 0, tr.tr_revokes);
 
 	gfs2_trans_end(sdp);
-	gfs2_log_flush(sdp, NULL);
+	gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
 }
 
 void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
@@ -121,7 +126,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync)
 		return;
 	__gfs2_ail_flush(gl, fsync, max_revokes);
 	gfs2_trans_end(sdp);
-	gfs2_log_flush(sdp, NULL);
+	gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
 }
 
 /**
@@ -144,7 +149,7 @@ static void rgrp_go_sync(struct gfs2_glock *gl)
 		return;
 	GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
 
-	gfs2_log_flush(sdp, gl);
+	gfs2_log_flush(sdp, gl, NORMAL_FLUSH);
 	filemap_fdatawrite_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
 	error = filemap_fdatawait_range(mapping, gl->gl_vm.start, gl->gl_vm.end);
 	mapping_set_error(mapping, error);
@@ -206,7 +211,7 @@ static void inode_go_sync(struct gfs2_glock *gl)
 
 	GLOCK_BUG_ON(gl, gl->gl_state != LM_ST_EXCLUSIVE);
 
-	gfs2_log_flush(gl->gl_sbd, gl);
+	gfs2_log_flush(gl->gl_sbd, gl, NORMAL_FLUSH);
 	filemap_fdatawrite(metamapping);
 	if (ip) {
 		struct address_space *mapping = ip->i_inode.i_mapping;
@@ -221,7 +226,7 @@ static void inode_go_sync(struct gfs2_glock *gl)
 	 * Writeback of the data mapping may cause the dirty flag to be set
 	 * so we have to clear it again here.
 	 */
-	smp_mb__before_clear_bit();
+	smp_mb__before_atomic();
 	clear_bit(GLF_DIRTY, &gl->gl_flags);
 }
 
@@ -253,7 +258,7 @@ static void inode_go_inval(struct gfs2_glock *gl, int flags)
 	}
 
 	if (ip == GFS2_I(gl->gl_sbd->sd_rindex)) {
-		gfs2_log_flush(gl->gl_sbd, NULL);
+		gfs2_log_flush(gl->gl_sbd, NULL, NORMAL_FLUSH);
 		gl->gl_sbd->sd_rindex_uptodate = 0;
 	}
 	if (ip && S_ISREG(ip->i_inode.i_mode))
@@ -455,31 +460,39 @@ static void inode_go_dump(struct seq_file *seq, const struct gfs2_glock *gl)
 }
 
 /**
- * trans_go_sync - promote/demote the transaction glock
+ * freeze_go_sync - promote/demote the freeze glock
  * @gl: the glock
  * @state: the requested state
  * @flags:
  *
  */
 
-static void trans_go_sync(struct gfs2_glock *gl)
+static void freeze_go_sync(struct gfs2_glock *gl)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
+	DEFINE_WAIT(wait);
 
-	if (gl->gl_state != LM_ST_UNLOCKED &&
+	if (gl->gl_state == LM_ST_SHARED &&
 	    test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) {
-		gfs2_meta_syncfs(sdp);
-		gfs2_log_shutdown(sdp);
+		atomic_set(&sdp->sd_log_freeze, 1);
+		wake_up(&sdp->sd_logd_waitq);
+		do {
+			prepare_to_wait(&sdp->sd_log_frozen_wait, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (atomic_read(&sdp->sd_log_freeze))
+				io_schedule();
+		} while(atomic_read(&sdp->sd_log_freeze));
+		finish_wait(&sdp->sd_log_frozen_wait, &wait);
 	}
 }
 
 /**
- * trans_go_xmote_bh - After promoting/demoting the transaction glock
+ * freeze_go_xmote_bh - After promoting/demoting the freeze glock
  * @gl: the glock
  *
  */
 
-static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
+static int freeze_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
 {
 	struct gfs2_sbd *sdp = gl->gl_sbd;
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
@@ -512,7 +525,7 @@ static int trans_go_xmote_bh(struct gfs2_glock *gl, struct gfs2_holder *gh)
  * Always returns 0
  */
 
-static int trans_go_demote_ok(const struct gfs2_glock *gl)
+static int freeze_go_demote_ok(const struct gfs2_glock *gl)
 {
 	return 0;
 }
@@ -563,10 +576,10 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = {
 	.go_flags = GLOF_LVB,
 };
 
-const struct gfs2_glock_operations gfs2_trans_glops = {
-	.go_sync = trans_go_sync,
-	.go_xmote_bh = trans_go_xmote_bh,
-	.go_demote_ok = trans_go_demote_ok,
+const struct gfs2_glock_operations gfs2_freeze_glops = {
+	.go_sync = freeze_go_sync,
+	.go_xmote_bh = freeze_go_xmote_bh,
+	.go_demote_ok = freeze_go_demote_ok,
 	.go_type = LM_TYPE_NONDISK,
 };
 
diff --git a/fs/gfs2/glops.h b/fs/gfs2/glops.h
index bf95a2dc1662..7455d2629bcb 100644
--- a/fs/gfs2/glops.h
+++ b/fs/gfs2/glops.h
@@ -15,7 +15,7 @@
 extern const struct gfs2_glock_operations gfs2_meta_glops;
 extern const struct gfs2_glock_operations gfs2_inode_glops;
 extern const struct gfs2_glock_operations gfs2_rgrp_glops;
-extern const struct gfs2_glock_operations gfs2_trans_glops;
+extern const struct gfs2_glock_operations gfs2_freeze_glops;
 extern const struct gfs2_glock_operations gfs2_iopen_glops;
 extern const struct gfs2_glock_operations gfs2_flock_glops;
 extern const struct gfs2_glock_operations gfs2_nondisk_glops;
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index bdf70c18610c..67d310c9ada3 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -465,9 +465,7 @@ struct gfs2_trans {
 	unsigned int tr_reserved;
 	unsigned int tr_touched:1;
 	unsigned int tr_attached:1;
-
-	struct gfs2_holder tr_t_gh;
-
+	unsigned int tr_alloced:1;
 
 	unsigned int tr_num_buf_new;
 	unsigned int tr_num_databuf_new;
@@ -682,7 +680,7 @@ struct gfs2_sbd {
 	struct lm_lockstruct sd_lockstruct;
 	struct gfs2_holder sd_live_gh;
 	struct gfs2_glock *sd_rename_gl;
-	struct gfs2_glock *sd_trans_gl;
+	struct gfs2_glock *sd_freeze_gl;
 	wait_queue_head_t sd_glock_wait;
 	atomic_t sd_glock_disposal;
 	struct completion sd_locking_init;
@@ -730,6 +728,8 @@ struct gfs2_sbd {
 	struct gfs2_holder sd_sc_gh;
 	struct gfs2_holder sd_qc_gh;
 
+	struct completion sd_journal_ready;
+
 	/* Daemon stuff */
 
 	struct task_struct *sd_logd_process;
@@ -794,6 +794,12 @@ struct gfs2_sbd {
 
 	/* For quiescing the filesystem */
 	struct gfs2_holder sd_freeze_gh;
+	struct gfs2_holder sd_freeze_root_gh;
+	struct gfs2_holder sd_thaw_gh;
+	atomic_t sd_log_freeze;
+	atomic_t sd_frozen_root;
+	wait_queue_head_t sd_frozen_root_wait;
+	wait_queue_head_t sd_log_frozen_wait;
 
 	char sd_fsname[GFS2_FSNAME_LEN];
 	char sd_table_name[GFS2_FSNAME_LEN];
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 28cc7bf6575a..e62e59477884 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -1613,18 +1613,26 @@ int gfs2_permission(struct inode *inode, int mask)
 {
 	struct gfs2_inode *ip;
 	struct gfs2_holder i_gh;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	int error;
 	int unlock = 0;
+	int frozen_root = 0;
 
 
 	ip = GFS2_I(inode);
 	if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
-		if (mask & MAY_NOT_BLOCK)
-			return -ECHILD;
-		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
-		if (error)
-			return error;
-		unlock = 1;
+		if (unlikely(gfs2_glock_is_held_excl(sdp->sd_freeze_gl) &&
+			     inode == sdp->sd_root_dir->d_inode &&
+			     atomic_inc_not_zero(&sdp->sd_frozen_root)))
+			frozen_root = 1;
+		else {
+			if (mask & MAY_NOT_BLOCK)
+				return -ECHILD;
+			error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &i_gh);
+			if (error)
+				return error;
+			unlock = 1;
+		}
 	}
 
 	if ((mask & MAY_WRITE) && IS_IMMUTABLE(inode))
@@ -1633,6 +1641,8 @@ int gfs2_permission(struct inode *inode, int mask)
 		error = generic_permission(inode, mask);
 	if (unlock)
 		gfs2_glock_dq_uninit(&i_gh);
+	else if (frozen_root && atomic_dec_and_test(&sdp->sd_frozen_root))
+		wake_up(&sdp->sd_frozen_root_wait);
 
 	return error;
 }
@@ -1805,19 +1815,29 @@ static int gfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	struct inode *inode = dentry->d_inode;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_holder gh;
+	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	int error;
 	int unlock = 0;
+	int frozen_root = 0;
 
 	if (gfs2_glock_is_locked_by_me(ip->i_gl) == NULL) {
-		error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
-		if (error)
-			return error;
-		unlock = 1;
+		if (unlikely(gfs2_glock_is_held_excl(sdp->sd_freeze_gl) &&
+			     inode == sdp->sd_root_dir->d_inode &&
+			     atomic_inc_not_zero(&sdp->sd_frozen_root)))
+			frozen_root = 1;
+		else {
+			error = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, LM_FLAG_ANY, &gh);
+			if (error)
+				return error;
+			unlock = 1;
+		}
 	}
 
 	generic_fillattr(inode, stat);
 	if (unlock)
 		gfs2_glock_dq_uninit(&gh);
+	else if (frozen_root && atomic_dec_and_test(&sdp->sd_frozen_root))
+		wake_up(&sdp->sd_frozen_root_wait);
 
 	return 0;
 }
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index c1eb555dc588..91f274de1246 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1134,7 +1134,7 @@ static void gdlm_recover_done(void *arg, struct dlm_slot *slots, int num_slots,
 		queue_delayed_work(gfs2_control_wq, &sdp->sd_control_work, 0);
 
 	clear_bit(DFL_DLM_RECOVERY, &ls->ls_recover_flags);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(&ls->ls_recover_flags, DFL_DLM_RECOVERY);
 	spin_unlock(&ls->ls_recover_spin);
 }
@@ -1271,7 +1271,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
 
 	ls->ls_first = !!test_bit(DFL_FIRST_MOUNT, &ls->ls_recover_flags);
 	clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
 	return 0;
 
diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c
index 4a14d504ef83..3966fadbcebd 100644
--- a/fs/gfs2/log.c
+++ b/fs/gfs2/log.c
@@ -301,6 +301,23 @@ static void ail2_empty(struct gfs2_sbd *sdp, unsigned int new_tail)
 }
 
 /**
+ * gfs2_log_release - Release a given number of log blocks
+ * @sdp: The GFS2 superblock
+ * @blks: The number of blocks
+ *
+ */
+
+void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
+{
+
+	atomic_add(blks, &sdp->sd_log_blks_free);
+	trace_gfs2_log_blocks(sdp, blks);
+	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
+				  sdp->sd_jdesc->jd_blocks);
+	up_read(&sdp->sd_log_flush_lock);
+}
+
+/**
  * gfs2_log_reserve - Make a log reservation
  * @sdp: The GFS2 superblock
  * @blks: The number of blocks to reserve
@@ -358,7 +375,10 @@ retry:
 		wake_up(&sdp->sd_log_waitq);
 
 	down_read(&sdp->sd_log_flush_lock);
-
+	if (unlikely(!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags))) {
+		gfs2_log_release(sdp, blks);
+		return -EROFS;
+	}
 	return 0;
 }
 
@@ -671,7 +691,8 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags)
  *
  */
 
-void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
+void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
+		    enum gfs2_flush_type type)
 {
 	struct gfs2_trans *tr;
 
@@ -723,6 +744,42 @@ void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl)
 	}
 	spin_unlock(&sdp->sd_ail_lock);
 	gfs2_log_unlock(sdp);
+
+	if (atomic_read(&sdp->sd_log_freeze))
+		type = FREEZE_FLUSH;
+	if (type != NORMAL_FLUSH) {
+		if (!sdp->sd_log_idle) {
+			for (;;) {
+				gfs2_ail1_start(sdp);
+				gfs2_ail1_wait(sdp);
+				if (gfs2_ail1_empty(sdp))
+					break;
+			}
+			atomic_dec(&sdp->sd_log_blks_free); /* Adjust for unreserved buffer */
+			trace_gfs2_log_blocks(sdp, -1);
+			sdp->sd_log_flush_wrapped = 0;
+			log_write_header(sdp, 0);
+			sdp->sd_log_head = sdp->sd_log_flush_head;
+		}
+		if (type == SHUTDOWN_FLUSH || type == FREEZE_FLUSH)
+			gfs2_log_shutdown(sdp);
+		if (type == FREEZE_FLUSH) {
+			int error;
+
+			atomic_set(&sdp->sd_log_freeze, 0);
+			wake_up(&sdp->sd_log_frozen_wait);
+			error = gfs2_glock_nq_init(sdp->sd_freeze_gl,
+						   LM_ST_SHARED, 0,
+						   &sdp->sd_thaw_gh);
+			if (error) {
+				printk(KERN_INFO "GFS2: couln't get freeze lock : %d\n", error);
+				gfs2_assert_withdraw(sdp, 0);
+			}
+			else
+				gfs2_glock_dq_uninit(&sdp->sd_thaw_gh);
+		}
+	}
+
 	trace_gfs2_log_flush(sdp, 0);
 	up_write(&sdp->sd_log_flush_lock);
 
@@ -761,7 +818,7 @@ static void log_refund(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 	if (sdp->sd_log_tr) {
 		gfs2_merge_trans(sdp->sd_log_tr, tr);
 	} else if (tr->tr_num_buf_new || tr->tr_num_databuf_new) {
-		gfs2_assert_withdraw(sdp, tr->tr_t_gh.gh_gl);
+		gfs2_assert_withdraw(sdp, tr->tr_alloced);
 		sdp->sd_log_tr = tr;
 		tr->tr_attached = 1;
 	}
@@ -813,8 +870,6 @@ void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr)
 
 void gfs2_log_shutdown(struct gfs2_sbd *sdp)
 {
-	down_write(&sdp->sd_log_flush_lock);
-
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_blks_reserved);
 	gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke);
 	gfs2_assert_withdraw(sdp, list_empty(&sdp->sd_ail1_list));
@@ -824,38 +879,16 @@ void gfs2_log_shutdown(struct gfs2_sbd *sdp)
 
 	log_write_header(sdp, GFS2_LOG_HEAD_UNMOUNT);
 
-	gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
 	gfs2_assert_warn(sdp, sdp->sd_log_head == sdp->sd_log_tail);
 	gfs2_assert_warn(sdp, list_empty(&sdp->sd_ail2_list));
 
 	sdp->sd_log_head = sdp->sd_log_flush_head;
 	sdp->sd_log_tail = sdp->sd_log_head;
-
-	up_write(&sdp->sd_log_flush_lock);
-}
-
-
-/**
- * gfs2_meta_syncfs - sync all the buffers in a filesystem
- * @sdp: the filesystem
- *
- */
-
-void gfs2_meta_syncfs(struct gfs2_sbd *sdp)
-{
-	gfs2_log_flush(sdp, NULL);
-	for (;;) {
-		gfs2_ail1_start(sdp);
-		gfs2_ail1_wait(sdp);
-		if (gfs2_ail1_empty(sdp))
-			break;
-	}
-	gfs2_log_flush(sdp, NULL);
 }
 
 static inline int gfs2_jrnl_flush_reqd(struct gfs2_sbd *sdp)
 {
-	return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1));
+	return (atomic_read(&sdp->sd_log_pinned) >= atomic_read(&sdp->sd_log_thresh1) || atomic_read(&sdp->sd_log_freeze));
 }
 
 static inline int gfs2_ail_flush_reqd(struct gfs2_sbd *sdp)
@@ -882,14 +915,14 @@ int gfs2_logd(void *data)
 
 		if (gfs2_jrnl_flush_reqd(sdp) || t == 0) {
 			gfs2_ail1_empty(sdp);
-			gfs2_log_flush(sdp, NULL);
+			gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
 		}
 
 		if (gfs2_ail_flush_reqd(sdp)) {
 			gfs2_ail1_start(sdp);
 			gfs2_ail1_wait(sdp);
 			gfs2_ail1_empty(sdp);
-			gfs2_log_flush(sdp, NULL);
+			gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
 		}
 
 		if (!gfs2_ail_flush_reqd(sdp))
diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h
index 37216634f0aa..9499a6049212 100644
--- a/fs/gfs2/log.h
+++ b/fs/gfs2/log.h
@@ -63,14 +63,21 @@ extern void gfs2_ordered_del_inode(struct gfs2_inode *ip);
 extern unsigned int gfs2_struct2blk(struct gfs2_sbd *sdp, unsigned int nstruct,
 			    unsigned int ssize);
 
+extern void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks);
 extern int gfs2_log_reserve(struct gfs2_sbd *sdp, unsigned int blks);
-extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl);
+enum gfs2_flush_type {
+	NORMAL_FLUSH = 0,
+	SYNC_FLUSH,
+	SHUTDOWN_FLUSH,
+	FREEZE_FLUSH
+};
+extern void gfs2_log_flush(struct gfs2_sbd *sdp, struct gfs2_glock *gl,
+			   enum gfs2_flush_type type);
 extern void gfs2_log_commit(struct gfs2_sbd *sdp, struct gfs2_trans *trans);
 extern void gfs2_remove_from_ail(struct gfs2_bufdata *bd);
 extern void gfs2_ail1_flush(struct gfs2_sbd *sdp, struct writeback_control *wbc);
 
 extern void gfs2_log_shutdown(struct gfs2_sbd *sdp);
-extern void gfs2_meta_syncfs(struct gfs2_sbd *sdp);
 extern int gfs2_logd(void *data);
 extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd);
 extern void gfs2_write_revokes(struct gfs2_sbd *sdp);
diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c
index a294d8d8bcd4..2c1ae861dc94 100644
--- a/fs/gfs2/lops.c
+++ b/fs/gfs2/lops.c
@@ -75,7 +75,7 @@ static void maybe_release_space(struct gfs2_bufdata *bd)
 	unsigned int index = bd->bd_bh->b_blocknr - gl->gl_name.ln_number;
 	struct gfs2_bitmap *bi = rgd->rd_bits + index;
 
-	if (bi->bi_clone == 0)
+	if (bi->bi_clone == NULL)
 		return;
 	if (sdp->sd_args.ar_discard)
 		gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bd->bd_bh, bi, 1, NULL);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index 22f954051bb8..bc564c0d6d16 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -94,6 +94,7 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	INIT_LIST_HEAD(&sdp->sd_jindex_list);
 	spin_lock_init(&sdp->sd_jindex_spin);
 	mutex_init(&sdp->sd_jindex_mutex);
+	init_completion(&sdp->sd_journal_ready);
 
 	INIT_LIST_HEAD(&sdp->sd_quota_list);
 	mutex_init(&sdp->sd_quota_mutex);
@@ -129,6 +130,10 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	init_rwsem(&sdp->sd_log_flush_lock);
 	atomic_set(&sdp->sd_log_in_flight, 0);
 	init_waitqueue_head(&sdp->sd_log_flush_wait);
+	init_waitqueue_head(&sdp->sd_log_frozen_wait);
+	atomic_set(&sdp->sd_log_freeze, 0);
+	atomic_set(&sdp->sd_frozen_root, 0);
+	init_waitqueue_head(&sdp->sd_frozen_root_wait);
 
 	return sdp;
 }
@@ -419,8 +424,8 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
 		goto fail_live;
 	}
 
-	error = gfs2_glock_get(sdp, GFS2_TRANS_LOCK, &gfs2_trans_glops,
-			       CREATE, &sdp->sd_trans_gl);
+	error = gfs2_glock_get(sdp, GFS2_FREEZE_LOCK, &gfs2_freeze_glops,
+			       CREATE, &sdp->sd_freeze_gl);
 	if (error) {
 		fs_err(sdp, "can't create transaction glock: %d\n", error);
 		goto fail_rename;
@@ -429,7 +434,7 @@ static int init_locking(struct gfs2_sbd *sdp, struct gfs2_holder *mount_gh,
 	return 0;
 
 fail_trans:
-	gfs2_glock_put(sdp->sd_trans_gl);
+	gfs2_glock_put(sdp->sd_freeze_gl);
 fail_rename:
 	gfs2_glock_put(sdp->sd_rename_gl);
 fail_live:
@@ -755,7 +760,15 @@ static int init_journal(struct gfs2_sbd *sdp, int undo)
 	set_bit(SDF_JOURNAL_CHECKED, &sdp->sd_flags);
 	gfs2_glock_dq_uninit(&ji_gh);
 	jindex = 0;
-
+	if (!sdp->sd_args.ar_spectator) {
+		error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0,
+					   &sdp->sd_thaw_gh);
+		if (error) {
+			fs_err(sdp, "can't acquire freeze glock: %d\n", error);
+			goto fail_jinode_gh;
+		}
+	}
+	gfs2_glock_dq_uninit(&sdp->sd_thaw_gh);
 	return 0;
 
 fail_jinode_gh:
@@ -784,6 +797,7 @@ static int init_inodes(struct gfs2_sbd *sdp, int undo)
 		goto fail_qinode;
 
 	error = init_journal(sdp, undo);
+	complete_all(&sdp->sd_journal_ready);
 	if (error)
 		goto fail;
 
@@ -1200,6 +1214,7 @@ fail_sb:
 fail_locking:
 	init_locking(sdp, &mount_gh, UNDO);
 fail_lm:
+	complete_all(&sdp->sd_journal_ready);
 	gfs2_gl_hash_clear(sdp);
 	gfs2_lm_unmount(sdp);
 fail_debug:
@@ -1380,7 +1395,7 @@ static void gfs2_kill_sb(struct super_block *sb)
 		return;
 	}
 
-	gfs2_meta_syncfs(sdp);
+	gfs2_log_flush(sdp, NULL, SYNC_FLUSH);
 	dput(sdp->sd_root_dir);
 	dput(sdp->sd_master_dir);
 	sdp->sd_root_dir = NULL;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index c4effff7cf55..64b29f7f6b4c 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -778,6 +778,7 @@ get_a_page:
 		i_size_write(inode, size);
 	inode->i_mtime = inode->i_atime = CURRENT_TIME;
 	mark_inode_dirty(inode);
+	set_bit(QDF_REFRESH, &qd->qd_flags);
 	return 0;
 
 unlock_out:
@@ -879,7 +880,7 @@ out:
 		gfs2_glock_dq_uninit(&ghs[qx]);
 	mutex_unlock(&ip->i_inode.i_mutex);
 	kfree(ghs);
-	gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl);
+	gfs2_log_flush(ip->i_gl->gl_sbd, ip->i_gl, NORMAL_FLUSH);
 	return error;
 }
 
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 7ad4094d68c0..94555d4c5698 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -454,7 +454,7 @@ void gfs2_recover_func(struct work_struct *work)
 	struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
 	struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode);
 	struct gfs2_log_header_host head;
-	struct gfs2_holder j_gh, ji_gh, t_gh;
+	struct gfs2_holder j_gh, ji_gh, thaw_gh;
 	unsigned long t;
 	int ro = 0;
 	unsigned int pass;
@@ -508,11 +508,11 @@ void gfs2_recover_func(struct work_struct *work)
 
 		t = jiffies;
 
-		/* Acquire a shared hold on the transaction lock */
+		/* Acquire a shared hold on the freeze lock */
 
-		error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED,
-					   LM_FLAG_NOEXP | LM_FLAG_PRIORITY |
-					   GL_NOCACHE, &t_gh);
+		error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED,
+					   LM_FLAG_NOEXP | LM_FLAG_PRIORITY,
+					   &thaw_gh);
 		if (error)
 			goto fail_gunlock_ji;
 
@@ -538,7 +538,7 @@ void gfs2_recover_func(struct work_struct *work)
 			fs_warn(sdp, "jid=%u: Can't replay: read-only block "
 				"device\n", jd->jd_jid);
 			error = -EROFS;
-			goto fail_gunlock_tr;
+			goto fail_gunlock_thaw;
 		}
 
 		fs_info(sdp, "jid=%u: Replaying journal...\n", jd->jd_jid);
@@ -549,14 +549,14 @@ void gfs2_recover_func(struct work_struct *work)
 						   head.lh_blkno, pass);
 			lops_after_scan(jd, error, pass);
 			if (error)
-				goto fail_gunlock_tr;
+				goto fail_gunlock_thaw;
 		}
 
 		error = clean_journal(jd, &head);
 		if (error)
-			goto fail_gunlock_tr;
+			goto fail_gunlock_thaw;
 
-		gfs2_glock_dq_uninit(&t_gh);
+		gfs2_glock_dq_uninit(&thaw_gh);
 		t = DIV_ROUND_UP(jiffies - t, HZ);
 		fs_info(sdp, "jid=%u: Journal replayed in %lus\n",
 			jd->jd_jid, t);
@@ -572,8 +572,8 @@ void gfs2_recover_func(struct work_struct *work)
 	fs_info(sdp, "jid=%u: Done\n", jd->jd_jid);
 	goto done;
 
-fail_gunlock_tr:
-	gfs2_glock_dq_uninit(&t_gh);
+fail_gunlock_thaw:
+	gfs2_glock_dq_uninit(&thaw_gh);
 fail_gunlock_ji:
 	if (jlocked) {
 		gfs2_glock_dq_uninit(&ji_gh);
@@ -587,7 +587,7 @@ fail:
 	gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP);
 done:
 	clear_bit(JDF_RECOVERY, &jd->jd_flags);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(&jd->jd_flags, JDF_RECOVERY);
 }
 
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 281a7716e3f3..db629d1bd1bd 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -2001,7 +2001,7 @@ next_rgrp:
 		}
 		/* Flushing the log may release space */
 		if (loops == 2)
-			gfs2_log_flush(sdp, NULL);
+			gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
 	}
 
 	return -ENOSPC;
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index de8afad89e51..1319b5c4ec68 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -399,7 +399,7 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
 {
 	struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode);
 	struct gfs2_glock *j_gl = ip->i_gl;
-	struct gfs2_holder t_gh;
+	struct gfs2_holder thaw_gh;
 	struct gfs2_log_header_host head;
 	int error;
 
@@ -407,7 +407,8 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
 	if (error)
 		return error;
 
-	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &t_gh);
+	error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, 0,
+				   &thaw_gh);
 	if (error)
 		goto fail_threads;
 
@@ -433,13 +434,13 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp)
 
 	set_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
 
-	gfs2_glock_dq_uninit(&t_gh);
+	gfs2_glock_dq_uninit(&thaw_gh);
 
 	return 0;
 
 fail:
-	t_gh.gh_flags |= GL_NOCACHE;
-	gfs2_glock_dq_uninit(&t_gh);
+	thaw_gh.gh_flags |= GL_NOCACHE;
+	gfs2_glock_dq_uninit(&thaw_gh);
 fail_threads:
 	kthread_stop(sdp->sd_quotad_process);
 	kthread_stop(sdp->sd_logd_process);
@@ -635,15 +636,21 @@ struct lfcc {
  */
 
 static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
-				    struct gfs2_holder *t_gh)
+				    struct gfs2_holder *freeze_gh)
 {
 	struct gfs2_inode *ip;
 	struct gfs2_jdesc *jd;
 	struct lfcc *lfcc;
 	LIST_HEAD(list);
 	struct gfs2_log_header_host lh;
+	struct gfs2_inode *dip = GFS2_I(sdp->sd_root_dir->d_inode);
 	int error;
 
+	error = gfs2_glock_nq_init(dip->i_gl, LM_ST_SHARED, 0,
+				   &sdp->sd_freeze_root_gh);
+	if (error)
+		return error;
+	atomic_set(&sdp->sd_frozen_root, 1);
 	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
 		lfcc = kmalloc(sizeof(struct lfcc), GFP_KERNEL);
 		if (!lfcc) {
@@ -659,8 +666,8 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
 		list_add(&lfcc->list, &list);
 	}
 
-	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_DEFERRED,
-				   GL_NOCACHE, t_gh);
+	error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_EXCLUSIVE,
+				   GL_NOCACHE, freeze_gh);
 
 	list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) {
 		error = gfs2_jdesc_check(jd);
@@ -676,7 +683,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp,
 	}
 
 	if (error)
-		gfs2_glock_dq_uninit(t_gh);
+		gfs2_glock_dq_uninit(freeze_gh);
 
 out:
 	while (!list_empty(&list)) {
@@ -685,6 +692,11 @@ out:
 		gfs2_glock_dq_uninit(&lfcc->gh);
 		kfree(lfcc);
 	}
+	if (error) {
+		atomic_dec(&sdp->sd_frozen_root);
+		wait_event(sdp->sd_frozen_root_wait, atomic_read(&sdp->sd_frozen_root) == 0);
+		gfs2_glock_dq_uninit(&sdp->sd_freeze_root_gh);
+	}
 	return error;
 }
 
@@ -742,7 +754,7 @@ static int gfs2_write_inode(struct inode *inode, struct writeback_control *wbc)
 	int ret = 0;
 
 	if (wbc->sync_mode == WB_SYNC_ALL)
-		gfs2_log_flush(GFS2_SB(inode), ip->i_gl);
+		gfs2_log_flush(GFS2_SB(inode), ip->i_gl, NORMAL_FLUSH);
 	if (bdi->dirty_exceeded)
 		gfs2_ail1_flush(sdp, wbc);
 	else
@@ -822,9 +834,18 @@ out:
 
 static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 {
-	struct gfs2_holder t_gh;
+	struct gfs2_holder thaw_gh;
 	int error;
 
+	error = gfs2_glock_nq_init(sdp->sd_freeze_gl, LM_ST_SHARED, GL_NOCACHE,
+				   &thaw_gh);
+	if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
+		return error;
+
+	down_write(&sdp->sd_log_flush_lock);
+	clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+	up_write(&sdp->sd_log_flush_lock);
+
 	kthread_stop(sdp->sd_quotad_process);
 	kthread_stop(sdp->sd_logd_process);
 
@@ -832,18 +853,11 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp)
 	gfs2_quota_sync(sdp->sd_vfs, 0);
 	gfs2_statfs_sync(sdp->sd_vfs, 0);
 
-	error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE,
-				   &t_gh);
-	if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags))
-		return error;
-
-	gfs2_meta_syncfs(sdp);
-	gfs2_log_shutdown(sdp);
-
-	clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags);
+	gfs2_log_flush(sdp, NULL, SHUTDOWN_FLUSH);
+	gfs2_assert_warn(sdp, atomic_read(&sdp->sd_log_blks_free) == sdp->sd_jdesc->jd_blocks);
 
-	if (t_gh.gh_gl)
-		gfs2_glock_dq_uninit(&t_gh);
+	if (thaw_gh.gh_gl)
+		gfs2_glock_dq_uninit(&thaw_gh);
 
 	gfs2_quota_cleanup(sdp);
 
@@ -900,7 +914,7 @@ restart:
 	iput(sdp->sd_quota_inode);
 
 	gfs2_glock_put(sdp->sd_rename_gl);
-	gfs2_glock_put(sdp->sd_trans_gl);
+	gfs2_glock_put(sdp->sd_freeze_gl);
 
 	if (!sdp->sd_args.ar_spectator) {
 		gfs2_glock_dq_uninit(&sdp->sd_journal_gh);
@@ -935,8 +949,8 @@ static int gfs2_sync_fs(struct super_block *sb, int wait)
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 
 	gfs2_quota_sync(sb, -1);
-	if (wait && sdp)
-		gfs2_log_flush(sdp, NULL);
+	if (wait && sdp && !atomic_read(&sdp->sd_log_freeze))
+		gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
 	return 0;
 }
 
@@ -986,6 +1000,9 @@ static int gfs2_unfreeze(struct super_block *sb)
 	struct gfs2_sbd *sdp = sb->s_fs_info;
 
 	gfs2_glock_dq_uninit(&sdp->sd_freeze_gh);
+	atomic_dec(&sdp->sd_frozen_root);
+	wait_event(sdp->sd_frozen_root_wait, atomic_read(&sdp->sd_frozen_root) == 0);
+	gfs2_glock_dq_uninit(&sdp->sd_freeze_root_gh);
 	return 0;
 }
 
@@ -1525,7 +1542,7 @@ static void gfs2_evict_inode(struct inode *inode)
 	goto out_unlock;
 
 out_truncate:
-	gfs2_log_flush(sdp, ip->i_gl);
+	gfs2_log_flush(sdp, ip->i_gl, NORMAL_FLUSH);
 	if (test_bit(GLF_DIRTY, &ip->i_gl->gl_flags)) {
 		struct address_space *metamapping = gfs2_glock2aspace(ip->i_gl);
 		filemap_fdatawrite(metamapping);
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index de25d5577e5d..3ab566ba5696 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -240,8 +240,8 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len
 
 	if (gltype > LM_TYPE_JOURNAL)
 		return -EINVAL;
-	if (gltype == LM_TYPE_NONDISK && glnum == GFS2_TRANS_LOCK)
-		glops = &gfs2_trans_glops;
+	if (gltype == LM_TYPE_NONDISK && glnum == GFS2_FREEZE_LOCK)
+		glops = &gfs2_freeze_glops;
 	else
 		glops = gfs2_glops_list[gltype];
 	if (glops == NULL)
@@ -333,7 +333,7 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 		set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
 	else if (val == 0) {
 		clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
-		smp_mb__after_clear_bit();
+		smp_mb__after_atomic();
 		gfs2_glock_thaw(sdp);
 	} else {
 		ret = -EINVAL;
@@ -407,6 +407,9 @@ int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid)
 	struct gfs2_jdesc *jd;
 	int rv;
 
+	/* Wait for our primary journal to be initialized */
+	wait_for_completion(&sdp->sd_journal_ready);
+
 	spin_lock(&sdp->sd_jindex_spin);
 	rv = -EBUSY;
 	if (sdp->sd_jdesc->jd_jid == jid)
@@ -482,7 +485,7 @@ static ssize_t jid_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
 		rv = jid = -EINVAL;
 	sdp->sd_lockstruct.ls_jid = jid;
 	clear_bit(SDF_NOJOURNALID, &sdp->sd_flags);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(&sdp->sd_flags, SDF_NOJOURNALID);
 out:
 	spin_unlock(&sdp->sd_jindex_spin);
diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c
index bead90d27bad..0546ab4e28e8 100644
--- a/fs/gfs2/trans.c
+++ b/fs/gfs2/trans.c
@@ -48,6 +48,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 	tr->tr_blocks = blocks;
 	tr->tr_revokes = revokes;
 	tr->tr_reserved = 1;
+	tr->tr_alloced = 1;
 	if (blocks)
 		tr->tr_reserved += 6 + blocks;
 	if (revokes)
@@ -57,48 +58,22 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks,
 	INIT_LIST_HEAD(&tr->tr_buf);
 
 	sb_start_intwrite(sdp->sd_vfs);
-	gfs2_holder_init(sdp->sd_trans_gl, LM_ST_SHARED, 0, &tr->tr_t_gh);
-
-	error = gfs2_glock_nq(&tr->tr_t_gh);
-	if (error)
-		goto fail_holder_uninit;
 
 	error = gfs2_log_reserve(sdp, tr->tr_reserved);
 	if (error)
-		goto fail_gunlock;
+		goto fail;
 
 	current->journal_info = tr;
 
 	return 0;
 
-fail_gunlock:
-	gfs2_glock_dq(&tr->tr_t_gh);
-
-fail_holder_uninit:
+fail:
 	sb_end_intwrite(sdp->sd_vfs);
-	gfs2_holder_uninit(&tr->tr_t_gh);
 	kfree(tr);
 
 	return error;
 }
 
-/**
- * gfs2_log_release - Release a given number of log blocks
- * @sdp: The GFS2 superblock
- * @blks: The number of blocks
- *
- */
-
-static void gfs2_log_release(struct gfs2_sbd *sdp, unsigned int blks)
-{
-
-	atomic_add(blks, &sdp->sd_log_blks_free);
-	trace_gfs2_log_blocks(sdp, blks);
-	gfs2_assert_withdraw(sdp, atomic_read(&sdp->sd_log_blks_free) <=
-				  sdp->sd_jdesc->jd_blocks);
-	up_read(&sdp->sd_log_flush_lock);
-}
-
 static void gfs2_print_trans(const struct gfs2_trans *tr)
 {
 	pr_warn("Transaction created at: %pSR\n", (void *)tr->tr_ip);
@@ -119,11 +94,8 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
 
 	if (!tr->tr_touched) {
 		gfs2_log_release(sdp, tr->tr_reserved);
-		if (tr->tr_t_gh.gh_gl) {
-			gfs2_glock_dq(&tr->tr_t_gh);
-			gfs2_holder_uninit(&tr->tr_t_gh);
+		if (tr->tr_alloced)
 			kfree(tr);
-		}
 		sb_end_intwrite(sdp->sd_vfs);
 		return;
 	}
@@ -137,16 +109,12 @@ void gfs2_trans_end(struct gfs2_sbd *sdp)
 		gfs2_print_trans(tr);
 
 	gfs2_log_commit(sdp, tr);
-	if (tr->tr_t_gh.gh_gl) {
-		gfs2_glock_dq(&tr->tr_t_gh);
-		gfs2_holder_uninit(&tr->tr_t_gh);
-		if (!tr->tr_attached)
+	if (tr->tr_alloced && !tr->tr_attached)
 			kfree(tr);
-	}
 	up_read(&sdp->sd_log_flush_lock);
 
 	if (sdp->sd_vfs->s_flags & MS_SYNCHRONOUS)
-		gfs2_log_flush(sdp, NULL);
+		gfs2_log_flush(sdp, NULL, NORMAL_FLUSH);
 	sb_end_intwrite(sdp->sd_vfs);
 }
 
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 204027520937..e19d4c0cacae 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -1030,6 +1030,11 @@ static int __init init_hugetlbfs_fs(void)
 	int error;
 	int i;
 
+	if (!hugepages_supported()) {
+		pr_info("hugetlbfs: disabling because there are no supported hugepage sizes\n");
+		return -ENOTSUPP;
+	}
+
 	error = bdi_init(&hugetlbfs_backing_dev_info);
 	if (error)
 		return error;
diff --git a/fs/ioprio.c b/fs/ioprio.c
deleted file mode 100644
index e50170ca7c33..000000000000
--- a/fs/ioprio.c
+++ /dev/null
@@ -1,241 +0,0 @@
-/*
- * fs/ioprio.c
- *
- * Copyright (C) 2004 Jens Axboe <axboe@kernel.dk>
- *
- * Helper functions for setting/querying io priorities of processes. The
- * system calls closely mimmick getpriority/setpriority, see the man page for
- * those. The prio argument is a composite of prio class and prio data, where
- * the data argument has meaning within that class. The standard scheduling
- * classes have 8 distinct prio levels, with 0 being the highest prio and 7
- * being the lowest.
- *
- * IOW, setting BE scheduling class with prio 2 is done ala:
- *
- * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2;
- *
- * ioprio_set(PRIO_PROCESS, pid, prio);
- *
- * See also Documentation/block/ioprio.txt
- *
- */
-#include <linux/gfp.h>
-#include <linux/kernel.h>
-#include <linux/export.h>
-#include <linux/ioprio.h>
-#include <linux/blkdev.h>
-#include <linux/capability.h>
-#include <linux/syscalls.h>
-#include <linux/security.h>
-#include <linux/pid_namespace.h>
-
-int set_task_ioprio(struct task_struct *task, int ioprio)
-{
-	int err;
-	struct io_context *ioc;
-	const struct cred *cred = current_cred(), *tcred;
-
-	rcu_read_lock();
-	tcred = __task_cred(task);
-	if (!uid_eq(tcred->uid, cred->euid) &&
-	    !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
-		rcu_read_unlock();
-		return -EPERM;
-	}
-	rcu_read_unlock();
-
-	err = security_task_setioprio(task, ioprio);
-	if (err)
-		return err;
-
-	ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
-	if (ioc) {
-		ioc->ioprio = ioprio;
-		put_io_context(ioc);
-	}
-
-	return err;
-}
-EXPORT_SYMBOL_GPL(set_task_ioprio);
-
-SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
-{
-	int class = IOPRIO_PRIO_CLASS(ioprio);
-	int data = IOPRIO_PRIO_DATA(ioprio);
-	struct task_struct *p, *g;
-	struct user_struct *user;
-	struct pid *pgrp;
-	kuid_t uid;
-	int ret;
-
-	switch (class) {
-		case IOPRIO_CLASS_RT:
-			if (!capable(CAP_SYS_ADMIN))
-				return -EPERM;
-			/* fall through, rt has prio field too */
-		case IOPRIO_CLASS_BE:
-			if (data >= IOPRIO_BE_NR || data < 0)
-				return -EINVAL;
-
-			break;
-		case IOPRIO_CLASS_IDLE:
-			break;
-		case IOPRIO_CLASS_NONE:
-			if (data)
-				return -EINVAL;
-			break;
-		default:
-			return -EINVAL;
-	}
-
-	ret = -ESRCH;
-	rcu_read_lock();
-	switch (which) {
-		case IOPRIO_WHO_PROCESS:
-			if (!who)
-				p = current;
-			else
-				p = find_task_by_vpid(who);
-			if (p)
-				ret = set_task_ioprio(p, ioprio);
-			break;
-		case IOPRIO_WHO_PGRP:
-			if (!who)
-				pgrp = task_pgrp(current);
-			else
-				pgrp = find_vpid(who);
-			do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
-				ret = set_task_ioprio(p, ioprio);
-				if (ret)
-					break;
-			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
-			break;
-		case IOPRIO_WHO_USER:
-			uid = make_kuid(current_user_ns(), who);
-			if (!uid_valid(uid))
-				break;
-			if (!who)
-				user = current_user();
-			else
-				user = find_user(uid);
-
-			if (!user)
-				break;
-
-			do_each_thread(g, p) {
-				if (!uid_eq(task_uid(p), uid))
-					continue;
-				ret = set_task_ioprio(p, ioprio);
-				if (ret)
-					goto free_uid;
-			} while_each_thread(g, p);
-free_uid:
-			if (who)
-				free_uid(user);
-			break;
-		default:
-			ret = -EINVAL;
-	}
-
-	rcu_read_unlock();
-	return ret;
-}
-
-static int get_task_ioprio(struct task_struct *p)
-{
-	int ret;
-
-	ret = security_task_getioprio(p);
-	if (ret)
-		goto out;
-	ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
-	if (p->io_context)
-		ret = p->io_context->ioprio;
-out:
-	return ret;
-}
-
-int ioprio_best(unsigned short aprio, unsigned short bprio)
-{
-	unsigned short aclass = IOPRIO_PRIO_CLASS(aprio);
-	unsigned short bclass = IOPRIO_PRIO_CLASS(bprio);
-
-	if (aclass == IOPRIO_CLASS_NONE)
-		aclass = IOPRIO_CLASS_BE;
-	if (bclass == IOPRIO_CLASS_NONE)
-		bclass = IOPRIO_CLASS_BE;
-
-	if (aclass == bclass)
-		return min(aprio, bprio);
-	if (aclass > bclass)
-		return bprio;
-	else
-		return aprio;
-}
-
-SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
-{
-	struct task_struct *g, *p;
-	struct user_struct *user;
-	struct pid *pgrp;
-	kuid_t uid;
-	int ret = -ESRCH;
-	int tmpio;
-
-	rcu_read_lock();
-	switch (which) {
-		case IOPRIO_WHO_PROCESS:
-			if (!who)
-				p = current;
-			else
-				p = find_task_by_vpid(who);
-			if (p)
-				ret = get_task_ioprio(p);
-			break;
-		case IOPRIO_WHO_PGRP:
-			if (!who)
-				pgrp = task_pgrp(current);
-			else
-				pgrp = find_vpid(who);
-			do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
-				tmpio = get_task_ioprio(p);
-				if (tmpio < 0)
-					continue;
-				if (ret == -ESRCH)
-					ret = tmpio;
-				else
-					ret = ioprio_best(ret, tmpio);
-			} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
-			break;
-		case IOPRIO_WHO_USER:
-			uid = make_kuid(current_user_ns(), who);
-			if (!who)
-				user = current_user();
-			else
-				user = find_user(uid);
-
-			if (!user)
-				break;
-
-			do_each_thread(g, p) {
-				if (!uid_eq(task_uid(p), user->uid))
-					continue;
-				tmpio = get_task_ioprio(p);
-				if (tmpio < 0)
-					continue;
-				if (ret == -ESRCH)
-					ret = tmpio;
-				else
-					ret = ioprio_best(ret, tmpio);
-			} while_each_thread(g, p);
-
-			if (who)
-				free_uid(user);
-			break;
-		default:
-			ret = -EINVAL;
-	}
-
-	rcu_read_unlock();
-	return ret;
-}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 5f26139a165a..6fac74349856 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -43,7 +43,7 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
 		clear_buffer_uptodate(bh);
 	if (orig_bh) {
 		clear_bit_unlock(BH_Shadow, &orig_bh->b_state);
-		smp_mb__after_clear_bit();
+		smp_mb__after_atomic();
 		wake_up_bit(&orig_bh->b_state, BH_Shadow);
 	}
 	unlock_buffer(bh);
@@ -239,7 +239,7 @@ static int journal_submit_data_buffers(journal_t *journal,
 		spin_lock(&journal->j_list_lock);
 		J_ASSERT(jinode->i_transaction == commit_transaction);
 		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
-		smp_mb__after_clear_bit();
+		smp_mb__after_atomic();
 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 	}
 	spin_unlock(&journal->j_list_lock);
@@ -277,7 +277,7 @@ static int journal_finish_inode_data_buffers(journal_t *journal,
 		}
 		spin_lock(&journal->j_list_lock);
 		clear_bit(__JI_COMMIT_RUNNING, &jinode->i_flags);
-		smp_mb__after_clear_bit();
+		smp_mb__after_atomic();
 		wake_up_bit(&jinode->i_flags, __JI_COMMIT_RUNNING);
 	}
 
diff --git a/fs/jfs/acl.c b/fs/jfs/acl.c
index 5a8ea16eedbc..0c8ca830b113 100644
--- a/fs/jfs/acl.c
+++ b/fs/jfs/acl.c
@@ -83,13 +83,15 @@ static int __jfs_set_acl(tid_t tid, struct inode *inode, int type,
 	switch (type) {
 	case ACL_TYPE_ACCESS:
 		ea_name = POSIX_ACL_XATTR_ACCESS;
-		rc = posix_acl_equiv_mode(acl, &inode->i_mode);
-		if (rc < 0)
-			return rc;
-		inode->i_ctime = CURRENT_TIME;
-		mark_inode_dirty(inode);
-		if (rc == 0)
-			acl = NULL;
+		if (acl) {
+			rc = posix_acl_equiv_mode(acl, &inode->i_mode);
+			if (rc < 0)
+				return rc;
+			inode->i_ctime = CURRENT_TIME;
+			mark_inode_dirty(inode);
+			if (rc == 0)
+				acl = NULL;
+		}
 		break;
 	case ACL_TYPE_DEFAULT:
 		ea_name = POSIX_ACL_XATTR_DEFAULT;
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 370d7b6c5942..2d514c7affc2 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -1208,7 +1208,7 @@ static int dbAllocNext(struct bmap * bmp, struct dmap * dp, s64 blkno,
 				 * by this leaf.
 				 */
 				l2size =
-				    min((int)leaf[word], NLSTOL2BSZ(nwords));
+				    min_t(int, leaf[word], NLSTOL2BSZ(nwords));
 
 				/* determine how many words were handled.
 				 */
@@ -1902,7 +1902,7 @@ dbAllocCtl(struct bmap * bmp, s64 nblocks, int l2nb, s64 blkno, s64 * results)
 
 		/* determine how many blocks to allocate from this dmap.
 		 */
-		nb = min(n, (s64)BPERDMAP);
+		nb = min_t(s64, n, BPERDMAP);
 
 		/* allocate the blocks from the dmap.
 		 */
@@ -2260,7 +2260,8 @@ static void dbAllocBits(struct bmap * bmp, struct dmap * dp, s64 blkno,
 				 * of bits being allocated and the l2 number
 				 * of bits currently described by this leaf.
 				 */
-				size = min((int)leaf[word], NLSTOL2BSZ(nwords));
+				size = min_t(int, leaf[word],
+					     NLSTOL2BSZ(nwords));
 
 				/* update the leaf to reflect the allocation.
 				 * in addition to setting the leaf value to
@@ -3563,7 +3564,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno,	s64 nblocks)
 					if (mp == NULL)
 						goto errout;
 
-					n = min(nblocks, (s64)BPERDMAP);
+					n = min_t(s64, nblocks, BPERDMAP);
 				}
 
 				dp = (struct dmap *) mp->data;
diff --git a/fs/jfs/jfs_inode.c b/fs/jfs/jfs_inode.c
index 7f464c513ba0..6b0f816201a2 100644
--- a/fs/jfs/jfs_inode.c
+++ b/fs/jfs/jfs_inode.c
@@ -29,20 +29,20 @@
 void jfs_set_inode_flags(struct inode *inode)
 {
 	unsigned int flags = JFS_IP(inode)->mode2;
-
-	inode->i_flags &= ~(S_IMMUTABLE | S_APPEND |
-		S_NOATIME | S_DIRSYNC | S_SYNC);
+	unsigned int new_fl = 0;
 
 	if (flags & JFS_IMMUTABLE_FL)
-		inode->i_flags |= S_IMMUTABLE;
+		new_fl |= S_IMMUTABLE;
 	if (flags & JFS_APPEND_FL)
-		inode->i_flags |= S_APPEND;
+		new_fl |= S_APPEND;
 	if (flags & JFS_NOATIME_FL)
-		inode->i_flags |= S_NOATIME;
+		new_fl |= S_NOATIME;
 	if (flags & JFS_DIRSYNC_FL)
-		inode->i_flags |= S_DIRSYNC;
+		new_fl |= S_DIRSYNC;
 	if (flags & JFS_SYNC_FL)
-		inode->i_flags |= S_SYNC;
+		new_fl |= S_SYNC;
+	inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND | S_NOATIME |
+			S_DIRSYNC | S_SYNC);
 }
 
 void jfs_get_inode_flags(struct jfs_inode_info *jfs_ip)
diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c
index 8d811e02b4b9..0acddf60af55 100644
--- a/fs/jfs/jfs_logmgr.c
+++ b/fs/jfs/jfs_logmgr.c
@@ -167,7 +167,7 @@ do {						\
  * Global list of active external journals
  */
 static LIST_HEAD(jfs_external_logs);
-static struct jfs_log *dummy_log = NULL;
+static struct jfs_log *dummy_log;
 static DEFINE_MUTEX(jfs_log_mutex);
 
 /*
diff --git a/fs/jfs/super.c b/fs/jfs/super.c
index 97f7fda51890..adf8cb045b9e 100644
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -50,14 +50,14 @@ MODULE_DESCRIPTION("The Journaled Filesystem (JFS)");
 MODULE_AUTHOR("Steve Best/Dave Kleikamp/Barry Arndt, IBM");
 MODULE_LICENSE("GPL");
 
-static struct kmem_cache * jfs_inode_cachep;
+static struct kmem_cache *jfs_inode_cachep;
 
 static const struct super_operations jfs_super_operations;
 static const struct export_operations jfs_export_operations;
 static struct file_system_type jfs_fs_type;
 
 #define MAX_COMMIT_THREADS 64
-static int commit_threads = 0;
+static int commit_threads;
 module_param(commit_threads, int, 0);
 MODULE_PARM_DESC(commit_threads, "Number of commit threads");
 
@@ -84,8 +84,7 @@ static void jfs_handle_error(struct super_block *sb)
 		panic("JFS (device %s): panic forced after error\n",
 			sb->s_id);
 	else if (sbi->flag & JFS_ERR_REMOUNT_RO) {
-		jfs_err("ERROR: (device %s): remounting filesystem "
-			"as read-only\n",
+		jfs_err("ERROR: (device %s): remounting filesystem as read-only\n",
 			sb->s_id);
 		sb->s_flags |= MS_RDONLY;
 	}
@@ -273,7 +272,10 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
 		case Opt_resize:
 		{
 			char *resize = args[0].from;
-			*newLVSize = simple_strtoull(resize, &resize, 0);
+			int rc = kstrtoll(resize, 0, newLVSize);
+
+			if (rc)
+				goto cleanup;
 			break;
 		}
 		case Opt_resize_nosize:
@@ -327,7 +329,11 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
 		case Opt_uid:
 		{
 			char *uid = args[0].from;
-			uid_t val = simple_strtoul(uid, &uid, 0);
+			uid_t val;
+			int rc = kstrtouint(uid, 0, &val);
+
+			if (rc)
+				goto cleanup;
 			sbi->uid = make_kuid(current_user_ns(), val);
 			if (!uid_valid(sbi->uid))
 				goto cleanup;
@@ -337,7 +343,11 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
 		case Opt_gid:
 		{
 			char *gid = args[0].from;
-			gid_t val = simple_strtoul(gid, &gid, 0);
+			gid_t val;
+			int rc = kstrtouint(gid, 0, &val);
+
+			if (rc)
+				goto cleanup;
 			sbi->gid = make_kgid(current_user_ns(), val);
 			if (!gid_valid(sbi->gid))
 				goto cleanup;
@@ -347,7 +357,10 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
 		case Opt_umask:
 		{
 			char *umask = args[0].from;
-			sbi->umask = simple_strtoul(umask, &umask, 8);
+			int rc = kstrtouint(umask, 8, &sbi->umask);
+
+			if (rc)
+				goto cleanup;
 			if (sbi->umask & ~0777) {
 				pr_err("JFS: Invalid value of umask\n");
 				goto cleanup;
@@ -363,12 +376,10 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
 			 * -> user has more control over the online trimming
 			 */
 			sbi->minblks_trim = 64;
-			if (blk_queue_discard(q)) {
+			if (blk_queue_discard(q))
 				*flag |= JFS_DISCARD;
-			} else {
-				pr_err("JFS: discard option " \
-					"not supported on device\n");
-			}
+			else
+				pr_err("JFS: discard option not supported on device\n");
 			break;
 		}
 
@@ -380,20 +391,21 @@ static int parse_options(char *options, struct super_block *sb, s64 *newLVSize,
 		{
 			struct request_queue *q = bdev_get_queue(sb->s_bdev);
 			char *minblks_trim = args[0].from;
+			int rc;
 			if (blk_queue_discard(q)) {
 				*flag |= JFS_DISCARD;
-				sbi->minblks_trim = simple_strtoull(
-					minblks_trim, &minblks_trim, 0);
-			} else {
-				pr_err("JFS: discard option " \
-					"not supported on device\n");
-			}
+				rc = kstrtouint(minblks_trim, 0,
+						&sbi->minblks_trim);
+				if (rc)
+					goto cleanup;
+			} else
+				pr_err("JFS: discard option not supported on device\n");
 			break;
 		}
 
 		default:
-			printk("jfs: Unrecognized mount option \"%s\" "
-					" or missing value\n", p);
+			printk("jfs: Unrecognized mount option \"%s\" or missing value\n",
+			       p);
 			goto cleanup;
 		}
 	}
@@ -419,14 +431,12 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 	int ret;
 
 	sync_filesystem(sb);
-	if (!parse_options(data, sb, &newLVSize, &flag)) {
+	if (!parse_options(data, sb, &newLVSize, &flag))
 		return -EINVAL;
-	}
 
 	if (newLVSize) {
 		if (sb->s_flags & MS_RDONLY) {
-			pr_err("JFS: resize requires volume" \
-				" to be mounted read-write\n");
+			pr_err("JFS: resize requires volume to be mounted read-write\n");
 			return -EROFS;
 		}
 		rc = jfs_extendfs(sb, newLVSize, 0);
@@ -452,9 +462,8 @@ static int jfs_remount(struct super_block *sb, int *flags, char *data)
 	}
 	if ((!(sb->s_flags & MS_RDONLY)) && (*flags & MS_RDONLY)) {
 		rc = dquot_suspend(sb, -1);
-		if (rc < 0) {
+		if (rc < 0)
 			return rc;
-		}
 		rc = jfs_umount_rw(sb);
 		JFS_SBI(sb)->flag = flag;
 		return rc;
@@ -487,7 +496,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 	if (!new_valid_dev(sb->s_bdev->bd_dev))
 		return -EOVERFLOW;
 
-	sbi = kzalloc(sizeof (struct jfs_sb_info), GFP_KERNEL);
+	sbi = kzalloc(sizeof(struct jfs_sb_info), GFP_KERNEL);
 	if (!sbi)
 		return -ENOMEM;
 
@@ -548,9 +557,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 
 	rc = jfs_mount(sb);
 	if (rc) {
-		if (!silent) {
+		if (!silent)
 			jfs_err("jfs_mount failed w/return code = %d", rc);
-		}
 		goto out_mount_failed;
 	}
 	if (sb->s_flags & MS_RDONLY)
@@ -587,7 +595,8 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent)
 	 * Page cache is indexed by long.
 	 * I would use MAX_LFS_FILESIZE, but it's only half as big
 	 */
-	sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1, (u64)sb->s_maxbytes);
+	sb->s_maxbytes = min(((u64) PAGE_CACHE_SIZE << 32) - 1,
+			     (u64)sb->s_maxbytes);
 #endif
 	sb->s_time_gran = 1;
 	return 0;
@@ -597,9 +606,8 @@ out_no_root:
 
 out_no_rw:
 	rc = jfs_umount(sb);
-	if (rc) {
+	if (rc)
 		jfs_err("jfs_umount failed with return code %d", rc);
-	}
 out_mount_failed:
 	filemap_write_and_wait(sbi->direct_inode->i_mapping);
 	truncate_inode_pages(sbi->direct_inode->i_mapping, 0);
@@ -924,7 +932,8 @@ static int __init init_jfs_fs(void)
 		commit_threads = MAX_COMMIT_THREADS;
 
 	for (i = 0; i < commit_threads; i++) {
-		jfsCommitThread[i] = kthread_run(jfs_lazycommit, NULL, "jfsCommit");
+		jfsCommitThread[i] = kthread_run(jfs_lazycommit, NULL,
+						 "jfsCommit");
 		if (IS_ERR(jfsCommitThread[i])) {
 			rc = PTR_ERR(jfsCommitThread[i]);
 			jfs_err("init_jfs_fs: fork failed w/rc = %d", rc);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index ac127cd008bf..a693f5b01ae6 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -714,6 +714,7 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
 		return ERR_PTR(-ENOMEM);
 
 	ida_init(&root->ino_ida);
+	INIT_LIST_HEAD(&root->supers);
 
 	kn = __kernfs_new_node(root, "", S_IFDIR | S_IRUGO | S_IXUGO,
 			       KERNFS_DIR);
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index e01ea4a14a01..e3d37f607f97 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -14,6 +14,7 @@
 #include <linux/poll.h>
 #include <linux/pagemap.h>
 #include <linux/sched.h>
+#include <linux/fsnotify.h>
 
 #include "kernfs-internal.h"
 
@@ -610,6 +611,7 @@ static void kernfs_put_open_node(struct kernfs_node *kn,
 static int kernfs_fop_open(struct inode *inode, struct file *file)
 {
 	struct kernfs_node *kn = file->f_path.dentry->d_fsdata;
+	struct kernfs_root *root = kernfs_root(kn);
 	const struct kernfs_ops *ops;
 	struct kernfs_open_file *of;
 	bool has_read, has_write, has_mmap;
@@ -624,14 +626,16 @@ static int kernfs_fop_open(struct inode *inode, struct file *file)
 	has_write = ops->write || ops->mmap;
 	has_mmap = ops->mmap;
 
-	/* check perms and supported operations */
-	if ((file->f_mode & FMODE_WRITE) &&
-	    (!(inode->i_mode & S_IWUGO) || !has_write))
-		goto err_out;
+	/* see the flag definition for details */
+	if (root->flags & KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK) {
+		if ((file->f_mode & FMODE_WRITE) &&
+		    (!(inode->i_mode & S_IWUGO) || !has_write))
+			goto err_out;
 
-	if ((file->f_mode & FMODE_READ) &&
-	    (!(inode->i_mode & S_IRUGO) || !has_read))
-		goto err_out;
+		if ((file->f_mode & FMODE_READ) &&
+		    (!(inode->i_mode & S_IRUGO) || !has_read))
+			goto err_out;
+	}
 
 	/* allocate a kernfs_open_file for the file */
 	error = -ENOMEM;
@@ -787,20 +791,48 @@ static unsigned int kernfs_fop_poll(struct file *filp, poll_table *wait)
  */
 void kernfs_notify(struct kernfs_node *kn)
 {
+	struct kernfs_root *root = kernfs_root(kn);
 	struct kernfs_open_node *on;
+	struct kernfs_super_info *info;
 	unsigned long flags;
 
+	if (WARN_ON(kernfs_type(kn) != KERNFS_FILE))
+		return;
+
+	/* kick poll */
 	spin_lock_irqsave(&kernfs_open_node_lock, flags);
 
-	if (!WARN_ON(kernfs_type(kn) != KERNFS_FILE)) {
-		on = kn->attr.open;
-		if (on) {
-			atomic_inc(&on->event);
-			wake_up_interruptible(&on->poll);
-		}
+	on = kn->attr.open;
+	if (on) {
+		atomic_inc(&on->event);
+		wake_up_interruptible(&on->poll);
 	}
 
 	spin_unlock_irqrestore(&kernfs_open_node_lock, flags);
+
+	/* kick fsnotify */
+	mutex_lock(&kernfs_mutex);
+
+	list_for_each_entry(info, &root->supers, node) {
+		struct inode *inode;
+		struct dentry *dentry;
+
+		inode = ilookup(info->sb, kn->ino);
+		if (!inode)
+			continue;
+
+		dentry = d_find_any_alias(inode);
+		if (dentry) {
+			fsnotify_parent(NULL, dentry, FS_MODIFY);
+			fsnotify(inode, FS_MODIFY, inode, FSNOTIFY_EVENT_INODE,
+				 NULL, 0);
+			dput(dentry);
+		}
+
+		iput(inode);
+	}
+
+	mutex_unlock(&kernfs_mutex);
 }
 EXPORT_SYMBOL_GPL(kernfs_notify);
 
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 8be13b2a079b..dc84a3ef9ca2 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -49,6 +49,8 @@ static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn)
  * mount.c
  */
 struct kernfs_super_info {
+	struct super_block	*sb;
+
 	/*
 	 * The root associated with this super_block.  Each super_block is
 	 * identified by the root and ns it's associated with.
@@ -62,6 +64,9 @@ struct kernfs_super_info {
 	 * an array and compare kernfs_node tag against every entry.
 	 */
 	const void		*ns;
+
+	/* anchored at kernfs_root->supers, protected by kernfs_mutex */
+	struct list_head	node;
 };
 #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info))
 
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index 6a5f04ac8704..d171b98a6cdd 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -62,15 +62,16 @@ struct kernfs_root *kernfs_root_from_sb(struct super_block *sb)
 	return NULL;
 }
 
-static int kernfs_fill_super(struct super_block *sb)
+static int kernfs_fill_super(struct super_block *sb, unsigned long magic)
 {
 	struct kernfs_super_info *info = kernfs_info(sb);
 	struct inode *inode;
 	struct dentry *root;
 
+	info->sb = sb;
 	sb->s_blocksize = PAGE_CACHE_SIZE;
 	sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
-	sb->s_magic = SYSFS_MAGIC;
+	sb->s_magic = magic;
 	sb->s_op = &kernfs_sops;
 	sb->s_time_gran = 1;
 
@@ -131,6 +132,7 @@ const void *kernfs_super_ns(struct super_block *sb)
  * @fs_type: file_system_type of the fs being mounted
  * @flags: mount flags specified for the mount
  * @root: kernfs_root of the hierarchy being mounted
+ * @magic: file system specific magic number
  * @new_sb_created: tell the caller if we allocated a new superblock
  * @ns: optional namespace tag of the mount
  *
@@ -142,8 +144,8 @@ const void *kernfs_super_ns(struct super_block *sb)
  * The return value can be passed to the vfs layer verbatim.
  */
 struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
-			       struct kernfs_root *root, bool *new_sb_created,
-			       const void *ns)
+				struct kernfs_root *root, unsigned long magic,
+				bool *new_sb_created, const void *ns)
 {
 	struct super_block *sb;
 	struct kernfs_super_info *info;
@@ -166,12 +168,18 @@ struct dentry *kernfs_mount_ns(struct file_system_type *fs_type, int flags,
 		*new_sb_created = !sb->s_root;
 
 	if (!sb->s_root) {
-		error = kernfs_fill_super(sb);
+		struct kernfs_super_info *info = kernfs_info(sb);
+
+		error = kernfs_fill_super(sb, magic);
 		if (error) {
 			deactivate_locked_super(sb);
 			return ERR_PTR(error);
 		}
 		sb->s_flags |= MS_ACTIVE;
+
+		mutex_lock(&kernfs_mutex);
+		list_add(&info->node, &root->supers);
+		mutex_unlock(&kernfs_mutex);
 	}
 
 	return dget(sb->s_root);
@@ -190,6 +198,10 @@ void kernfs_kill_sb(struct super_block *sb)
 	struct kernfs_super_info *info = kernfs_info(sb);
 	struct kernfs_node *root_kn = sb->s_root->d_fsdata;
 
+	mutex_lock(&kernfs_mutex);
+	list_del(&info->node);
+	mutex_unlock(&kernfs_mutex);
+
 	/*
 	 * Remove the superblock from fs_supers/s_instances
 	 * so we can't find it, before freeing kernfs_super_info.
diff --git a/fs/locks.c b/fs/locks.c
index e663aeac579e..da57c9b7e844 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -130,6 +130,9 @@
 #include <linux/percpu.h>
 #include <linux/lglock.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/filelock.h>
+
 #include <asm/uaccess.h>
 
 #define IS_POSIX(fl)	(fl->fl_flags & FL_POSIX)
@@ -322,6 +325,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock,
 		return -ENOMEM;
 
 	fl->fl_file = filp;
+	fl->fl_owner = (fl_owner_t)filp;
 	fl->fl_pid = current->tgid;
 	fl->fl_flags = FL_FLOCK;
 	fl->fl_type = type;
@@ -389,18 +393,6 @@ static int flock64_to_posix_lock(struct file *filp, struct file_lock *fl,
 	fl->fl_ops = NULL;
 	fl->fl_lmops = NULL;
 
-	/* Ensure that fl->fl_filp has compatible f_mode */
-	switch (l->l_type) {
-	case F_RDLCK:
-		if (!(filp->f_mode & FMODE_READ))
-			return -EBADF;
-		break;
-	case F_WRLCK:
-		if (!(filp->f_mode & FMODE_WRITE))
-			return -EBADF;
-		break;
-	}
-
 	return assign_type(fl, l->l_type);
 }
 
@@ -439,7 +431,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl)
 	if (assign_type(fl, type) != 0)
 		return -EINVAL;
 
-	fl->fl_owner = current->files;
+	fl->fl_owner = (fl_owner_t)filp;
 	fl->fl_pid = current->tgid;
 
 	fl->fl_file = filp;
@@ -1298,6 +1290,7 @@ static void time_out_leases(struct inode *inode)
 
 	before = &inode->i_flock;
 	while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) {
+		trace_time_out_leases(inode, fl);
 		if (past_time(fl->fl_downgrade_time))
 			lease_modify(before, F_RDLCK);
 		if (past_time(fl->fl_break_time))
@@ -1385,6 +1378,7 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type)
 	}
 
 	if (i_have_this_lease || (mode & O_NONBLOCK)) {
+		trace_break_lease_noblock(inode, new_fl);
 		error = -EWOULDBLOCK;
 		goto out;
 	}
@@ -1396,10 +1390,12 @@ restart:
 	if (break_time == 0)
 		break_time++;
 	locks_insert_block(flock, new_fl);
+	trace_break_lease_block(inode, new_fl);
 	spin_unlock(&inode->i_lock);
 	error = wait_event_interruptible_timeout(new_fl->fl_wait,
 						!new_fl->fl_next, break_time);
 	spin_lock(&inode->i_lock);
+	trace_break_lease_unblock(inode, new_fl);
 	locks_delete_block(new_fl);
 	if (error >= 0) {
 		if (error == 0)
@@ -1521,6 +1517,8 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp
 	int error;
 
 	lease = *flp;
+	trace_generic_add_lease(inode, lease);
+
 	/*
 	 * In the delegation case we need mutual exclusion with
 	 * a number of operations that take the i_mutex.  We trylock
@@ -1610,6 +1608,8 @@ static int generic_delete_lease(struct file *filp, struct file_lock **flp)
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 
+	trace_generic_delete_lease(inode, *flp);
+
 	for (before = &inode->i_flock;
 			((fl = *before) != NULL) && IS_LEASE(fl);
 			before = &fl->fl_next) {
@@ -2034,6 +2034,22 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
 	return error;
 }
 
+/* Ensure that fl->fl_filp has compatible f_mode for F_SETLK calls */
+static int
+check_fmode_for_setlk(struct file_lock *fl)
+{
+	switch (fl->fl_type) {
+	case F_RDLCK:
+		if (!(fl->fl_file->f_mode & FMODE_READ))
+			return -EBADF;
+		break;
+	case F_WRLCK:
+		if (!(fl->fl_file->f_mode & FMODE_WRITE))
+			return -EBADF;
+	}
+	return 0;
+}
+
 /* Apply the lock described by l to an open file descriptor.
  * This implements both the F_SETLK and F_SETLKW commands of fcntl().
  */
@@ -2071,6 +2087,10 @@ again:
 	if (error)
 		goto out;
 
+	error = check_fmode_for_setlk(file_lock);
+	if (error)
+		goto out;
+
 	/*
 	 * If the cmd is requesting file-private locks, then set the
 	 * FL_OFDLCK flag and override the owner.
@@ -2206,6 +2226,10 @@ again:
 	if (error)
 		goto out;
 
+	error = check_fmode_for_setlk(file_lock);
+	if (error)
+		goto out;
+
 	/*
 	 * If the cmd is requesting file-private locks, then set the
 	 * FL_OFDLCK flag and override the owner.
@@ -2304,6 +2328,7 @@ void locks_remove_file(struct file *filp)
 
 	if (filp->f_op->flock) {
 		struct file_lock fl = {
+			.fl_owner = (fl_owner_t)filp,
 			.fl_pid = current->tgid,
 			.fl_file = filp,
 			.fl_flags = FL_FLOCK,
@@ -2411,31 +2436,31 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 	seq_printf(f, "%lld:%s ", id, pfx);
 	if (IS_POSIX(fl)) {
 		if (fl->fl_flags & FL_ACCESS)
-			seq_printf(f, "ACCESS");
+			seq_puts(f, "ACCESS");
 		else if (IS_OFDLCK(fl))
-			seq_printf(f, "OFDLCK");
+			seq_puts(f, "OFDLCK");
 		else
-			seq_printf(f, "POSIX ");
+			seq_puts(f, "POSIX ");
 
 		seq_printf(f, " %s ",
 			     (inode == NULL) ? "*NOINODE*" :
 			     mandatory_lock(inode) ? "MANDATORY" : "ADVISORY ");
 	} else if (IS_FLOCK(fl)) {
 		if (fl->fl_type & LOCK_MAND) {
-			seq_printf(f, "FLOCK  MSNFS     ");
+			seq_puts(f, "FLOCK  MSNFS     ");
 		} else {
-			seq_printf(f, "FLOCK  ADVISORY  ");
+			seq_puts(f, "FLOCK  ADVISORY  ");
 		}
 	} else if (IS_LEASE(fl)) {
-		seq_printf(f, "LEASE  ");
+		seq_puts(f, "LEASE  ");
 		if (lease_breaking(fl))
-			seq_printf(f, "BREAKING  ");
+			seq_puts(f, "BREAKING  ");
 		else if (fl->fl_file)
-			seq_printf(f, "ACTIVE    ");
+			seq_puts(f, "ACTIVE    ");
 		else
-			seq_printf(f, "BREAKER   ");
+			seq_puts(f, "BREAKER   ");
 	} else {
-		seq_printf(f, "UNKNOWN UNKNOWN  ");
+		seq_puts(f, "UNKNOWN UNKNOWN  ");
 	}
 	if (fl->fl_type & LOCK_MAND) {
 		seq_printf(f, "%s ",
@@ -2467,7 +2492,7 @@ static void lock_get_status(struct seq_file *f, struct file_lock *fl,
 		else
 			seq_printf(f, "%Ld %Ld\n", fl->fl_start, fl->fl_end);
 	} else {
-		seq_printf(f, "0 EOF\n");
+		seq_puts(f, "0 EOF\n");
 	}
 }
 
diff --git a/fs/namei.c b/fs/namei.c
index c6157c894fce..80168273396b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1542,7 +1542,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
 		inode = path->dentry->d_inode;
 	}
 	err = -ENOENT;
-	if (!inode)
+	if (!inode || d_is_negative(path->dentry))
 		goto out_path_put;
 
 	if (should_follow_link(path->dentry, follow)) {
@@ -2249,7 +2249,7 @@ mountpoint_last(struct nameidata *nd, struct path *path)
 	mutex_unlock(&dir->d_inode->i_mutex);
 
 done:
-	if (!dentry->d_inode) {
+	if (!dentry->d_inode || d_is_negative(dentry)) {
 		error = -ENOENT;
 		dput(dentry);
 		goto out;
@@ -2994,7 +2994,7 @@ retry_lookup:
 finish_lookup:
 	/* we _can_ be in RCU mode here */
 	error = -ENOENT;
-	if (d_is_negative(path->dentry)) {
+	if (!inode || d_is_negative(path->dentry)) {
 		path_to_nameidata(path, nd);
 		goto out;
 	}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d9f3d067cd15..4a3d4ef76127 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -2032,9 +2032,9 @@ static void nfs_access_free_entry(struct nfs_access_entry *entry)
 {
 	put_rpccred(entry->cred);
 	kfree(entry);
-	smp_mb__before_atomic_dec();
+	smp_mb__before_atomic();
 	atomic_long_dec(&nfs_access_nr_entries);
-	smp_mb__after_atomic_dec();
+	smp_mb__after_atomic();
 }
 
 static void nfs_access_free_list(struct list_head *head)
@@ -2082,9 +2082,9 @@ nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
 		else {
 remove_lru_entry:
 			list_del_init(&nfsi->access_cache_inode_lru);
-			smp_mb__before_clear_bit();
+			smp_mb__before_atomic();
 			clear_bit(NFS_INO_ACL_LRU_SET, &nfsi->flags);
-			smp_mb__after_clear_bit();
+			smp_mb__after_atomic();
 		}
 		spin_unlock(&inode->i_lock);
 	}
@@ -2232,9 +2232,9 @@ void nfs_access_add_cache(struct inode *inode, struct nfs_access_entry *set)
 	nfs_access_add_rbtree(inode, cache);
 
 	/* Update accounting */
-	smp_mb__before_atomic_inc();
+	smp_mb__before_atomic();
 	atomic_long_inc(&nfs_access_nr_entries);
-	smp_mb__after_atomic_inc();
+	smp_mb__after_atomic();
 
 	/* Add inode to global LRU list */
 	if (!test_bit(NFS_INO_ACL_LRU_SET, &NFS_I(inode)->flags)) {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 284ca901fe16..c1edf7336315 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -916,10 +916,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
 		is_local = 1;
 
 	/* We're simulating flock() locks using posix locks on the server */
-	fl->fl_owner = (fl_owner_t)filp;
-	fl->fl_start = 0;
-	fl->fl_end = OFFSET_MAX;
-
 	if (fl->fl_type == F_UNLCK)
 		return do_unlk(filp, cmd, fl, is_local);
 	return do_setlk(filp, cmd, fl, is_local);
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0c438973f3c8..e6f7398d2b3c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1085,7 +1085,7 @@ int nfs_revalidate_mapping(struct inode *inode, struct address_space *mapping)
 	trace_nfs_invalidate_mapping_exit(inode, ret);
 
 	clear_bit_unlock(NFS_INO_INVALIDATING, bitlock);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(bitlock, NFS_INO_INVALIDATING);
 out:
 	return ret;
diff --git a/fs/nfs/nfs4filelayoutdev.c b/fs/nfs/nfs4filelayoutdev.c
index efac602edb37..b9c61efe9660 100644
--- a/fs/nfs/nfs4filelayoutdev.c
+++ b/fs/nfs/nfs4filelayoutdev.c
@@ -789,9 +789,9 @@ static void nfs4_wait_ds_connect(struct nfs4_pnfs_ds *ds)
 
 static void nfs4_clear_ds_conn_bit(struct nfs4_pnfs_ds *ds)
 {
-	smp_mb__before_clear_bit();
+	smp_mb__before_atomic();
 	clear_bit(NFS4DS_CONNECTING, &ds->ds_state);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(&ds->ds_state, NFS4DS_CONNECTING);
 }
 
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2349518eef2c..c0583b9bef71 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1140,9 +1140,9 @@ static int nfs4_run_state_manager(void *);
 
 static void nfs4_clear_state_manager_bit(struct nfs_client *clp)
 {
-	smp_mb__before_clear_bit();
+	smp_mb__before_atomic();
 	clear_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING);
 	rpc_wake_up(&clp->cl_rpcwaitq);
 }
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 2ffebf2081ce..03ed984ab4d8 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -95,7 +95,7 @@ nfs_iocounter_dec(struct nfs_io_counter *c)
 {
 	if (atomic_dec_and_test(&c->io_count)) {
 		clear_bit(NFS_IO_INPROGRESS, &c->flags);
-		smp_mb__after_clear_bit();
+		smp_mb__after_atomic();
 		wake_up_bit(&c->flags, NFS_IO_INPROGRESS);
 	}
 }
@@ -193,9 +193,9 @@ void nfs_unlock_request(struct nfs_page *req)
 		printk(KERN_ERR "NFS: Invalid unlock attempted\n");
 		BUG();
 	}
-	smp_mb__before_clear_bit();
+	smp_mb__before_atomic();
 	clear_bit(PG_BUSY, &req->wb_flags);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(&req->wb_flags, PG_BUSY);
 }
 
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index cb53d450ae32..fd9536e494bc 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -1810,7 +1810,7 @@ static void pnfs_clear_layoutcommitting(struct inode *inode)
 	unsigned long *bitlock = &NFS_I(inode)->flags;
 
 	clear_bit_unlock(NFS_INO_LAYOUTCOMMITTING, bitlock);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(bitlock, NFS_INO_LAYOUTCOMMITTING);
 }
 
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 023793909778..c3058a076596 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -275,7 +275,7 @@ pnfs_get_lseg(struct pnfs_layout_segment *lseg)
 {
 	if (lseg) {
 		atomic_inc(&lseg->pls_refcount);
-		smp_mb__after_atomic_inc();
+		smp_mb__after_atomic();
 	}
 	return lseg;
 }
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9a3b6a4cd6b9..ffb9459f180b 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -405,7 +405,7 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
 	nfs_pageio_complete(&pgio);
 
 	clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(bitlock, NFS_INO_FLUSHING);
 
 	if (err < 0)
@@ -1458,7 +1458,7 @@ static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait)
 static void nfs_commit_clear_lock(struct nfs_inode *nfsi)
 {
 	clear_bit(NFS_INO_COMMIT, &nfsi->flags);
-	smp_mb__after_clear_bit();
+	smp_mb__after_atomic();
 	wake_up_bit(&nfsi->flags, NFS_INO_COMMIT);
 }
 
diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c
index 6f3f392d48af..f66c66b9f182 100644
--- a/fs/nfsd/nfs4acl.c
+++ b/fs/nfsd/nfs4acl.c
@@ -402,8 +402,10 @@ sort_pacl(struct posix_acl *pacl)
 	 * by uid/gid. */
 	int i, j;
 
-	if (pacl->a_count <= 4)
-		return; /* no users or groups */
+	/* no users or groups */
+	if (!pacl || pacl->a_count <= 4)
+		return;
+
 	i = 1;
 	while (pacl->a_entries[i].e_tag == ACL_USER)
 		i++;
@@ -530,13 +532,12 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
 
 	/*
 	 * ACLs with no ACEs are treated differently in the inheritable
-	 * and effective cases: when there are no inheritable ACEs, we
-	 * set a zero-length default posix acl:
+	 * and effective cases: when there are no inheritable ACEs,
+	 * calls ->set_acl with a NULL ACL structure.
 	 */
-	if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT)) {
-		pacl = posix_acl_alloc(0, GFP_KERNEL);
-		return pacl ? pacl : ERR_PTR(-ENOMEM);
-	}
+	if (state->empty && (flags & NFS4_ACL_TYPE_DEFAULT))
+		return NULL;
+
 	/*
 	 * When there are no effective ACEs, the following will end
 	 * up setting a 3-element effective posix ACL with all
@@ -589,7 +590,7 @@ posix_state_to_acl(struct posix_acl_state *state, unsigned int flags)
 		add_to_mask(state, &state->groups->aces[i].perms);
 	}
 
-	if (!state->users->n && !state->groups->n) {
+	if (state->users->n || state->groups->n) {
 		pace++;
 		pace->e_tag = ACL_MASK;
 		low_mode_from_nfs4(state->mask.allow, &pace->e_perm, flags);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3ba65979a3cd..9a77a5a21557 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1078,6 +1078,18 @@ static struct nfs4_client *alloc_client(struct xdr_netobj name)
 		return NULL;
 	}
 	clp->cl_name.len = name.len;
+	INIT_LIST_HEAD(&clp->cl_sessions);
+	idr_init(&clp->cl_stateids);
+	atomic_set(&clp->cl_refcount, 0);
+	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
+	INIT_LIST_HEAD(&clp->cl_idhash);
+	INIT_LIST_HEAD(&clp->cl_openowners);
+	INIT_LIST_HEAD(&clp->cl_delegations);
+	INIT_LIST_HEAD(&clp->cl_lru);
+	INIT_LIST_HEAD(&clp->cl_callbacks);
+	INIT_LIST_HEAD(&clp->cl_revoked);
+	spin_lock_init(&clp->cl_lock);
+	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
 	return clp;
 }
 
@@ -1095,6 +1107,7 @@ free_client(struct nfs4_client *clp)
 		WARN_ON_ONCE(atomic_read(&ses->se_ref));
 		free_session(ses);
 	}
+	rpc_destroy_wait_queue(&clp->cl_cb_waitq);
 	free_svc_cred(&clp->cl_cred);
 	kfree(clp->cl_name.data);
 	idr_destroy(&clp->cl_stateids);
@@ -1347,7 +1360,6 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
 	if (clp == NULL)
 		return NULL;
 
-	INIT_LIST_HEAD(&clp->cl_sessions);
 	ret = copy_cred(&clp->cl_cred, &rqstp->rq_cred);
 	if (ret) {
 		spin_lock(&nn->client_lock);
@@ -1355,20 +1367,9 @@ static struct nfs4_client *create_client(struct xdr_netobj name,
 		spin_unlock(&nn->client_lock);
 		return NULL;
 	}
-	idr_init(&clp->cl_stateids);
-	atomic_set(&clp->cl_refcount, 0);
-	clp->cl_cb_state = NFSD4_CB_UNKNOWN;
-	INIT_LIST_HEAD(&clp->cl_idhash);
-	INIT_LIST_HEAD(&clp->cl_openowners);
-	INIT_LIST_HEAD(&clp->cl_delegations);
-	INIT_LIST_HEAD(&clp->cl_lru);
-	INIT_LIST_HEAD(&clp->cl_callbacks);
-	INIT_LIST_HEAD(&clp->cl_revoked);
-	spin_lock_init(&clp->cl_lock);
 	nfsd4_init_callback(&clp->cl_cb_null);
 	clp->cl_time = get_seconds();
 	clear_bit(0, &clp->cl_cb_slot_busy);
-	rpc_init_wait_queue(&clp->cl_cb_waitq, "Backchannel slot table");
 	copy_verf(clp, verf);
 	rpc_copy_addr((struct sockaddr *) &clp->cl_addr, sa);
 	gen_confirm(clp);
@@ -3716,9 +3717,16 @@ out:
 static __be32
 nfsd4_free_lock_stateid(struct nfs4_ol_stateid *stp)
 {
-	if (check_for_locks(stp->st_file, lockowner(stp->st_stateowner)))
+	struct nfs4_lockowner *lo = lockowner(stp->st_stateowner);
+
+	if (check_for_locks(stp->st_file, lo))
 		return nfserr_locks_held;
-	release_lock_stateid(stp);
+	/*
+	 * Currently there's a 1-1 lock stateid<->lockowner
+	 * correspondance, and we have to delete the lockowner when we
+	 * delete the lock stateid:
+	 */
+	unhash_lockowner(lo);
 	return nfs_ok;
 }
 
@@ -4158,6 +4166,10 @@ static bool same_lockowner_ino(struct nfs4_lockowner *lo, struct inode *inode, c
 
 	if (!same_owner_str(&lo->lo_owner, owner, clid))
 		return false;
+	if (list_empty(&lo->lo_owner.so_stateids)) {
+		WARN_ON_ONCE(1);
+		return false;
+	}
 	lst = list_first_entry(&lo->lo_owner.so_stateids,
 			       struct nfs4_ol_stateid, st_perstateowner);
 	return lst->st_file->fi_inode == inode;
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 4e565c814309..732648b270dc 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -698,6 +698,8 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	}
 	group->overflow_event = &oevent->fse;
 
+	if (force_o_largefile())
+		event_f_flags |= O_LARGEFILE;
 	group->fanotify_data.f_flags = event_f_flags;
 #ifdef CONFIG_FANOTIFY_ACCESS_PERMISSIONS
 	spin_lock_init(&group->fanotify_data.access_lock);
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index bf482dfed14f..73039295d0d1 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1107,7 +1107,7 @@ static int o2hb_thread(void *data)
 
 	mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");
 
-	set_user_nice(current, -20);
+	set_user_nice(current, MIN_NICE);
 
 	/* Pin node */
 	o2nm_depend_this_node();
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index af3f7aa73e13..ee1f88419cb0 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -472,11 +472,15 @@ bail:
 
 void dlm_destroy_master_caches(void)
 {
-	if (dlm_lockname_cache)
+	if (dlm_lockname_cache) {
 		kmem_cache_destroy(dlm_lockname_cache);
+		dlm_lockname_cache = NULL;
+	}
 
-	if (dlm_lockres_cache)
+	if (dlm_lockres_cache) {
 		kmem_cache_destroy(dlm_lockres_cache);
+		dlm_lockres_cache = NULL;
+	}
 }
 
 static void dlm_lockres_release(struct kref *kref)
diff --git a/fs/posix_acl.c b/fs/posix_acl.c
index 9e363e41dacc..0855f772cd41 100644
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -246,6 +246,12 @@ posix_acl_equiv_mode(const struct posix_acl *acl, umode_t *mode_p)
 	umode_t mode = 0;
 	int not_equiv = 0;
 
+	/*
+	 * A null ACL can always be presented as mode bits.
+	 */
+	if (!acl)
+		return 0;
+
 	FOREACH_ACL_ENTRY(pa, acl, pe) {
 		switch (pa->e_tag) {
 			case ACL_USER_OBJ:
diff --git a/fs/splice.c b/fs/splice.c
index 9bc07d2b53cf..e246954ea48c 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -1537,7 +1537,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
 	struct iovec iovstack[UIO_FASTIOV];
 	struct iovec *iov = iovstack;
 	struct iov_iter iter;
-	ssize_t count = 0;
+	ssize_t count;
 
 	pipe = get_pipe_info(file);
 	if (!pipe)
@@ -1546,8 +1546,9 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
 	ret = rw_copy_check_uvector(READ, uiov, nr_segs,
 				    ARRAY_SIZE(iovstack), iovstack, &iov);
 	if (ret <= 0)
-		return ret;
+		goto out;
 
+	count = ret;
 	iov_iter_init(&iter, iov, nr_segs, count, 0);
 
 	sd.len = 0;
@@ -1560,6 +1561,7 @@ static long vmsplice_to_user(struct file *file, const struct iovec __user *uiov,
 	ret = __splice_from_pipe(pipe, &sd, pipe_to_user);
 	pipe_unlock(pipe);
 
+out:
 	if (iov != iovstack)
 		kfree(iov);
 
diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c
index 28cc1acd5439..e9ef59b3abb1 100644
--- a/fs/sysfs/file.c
+++ b/fs/sysfs/file.c
@@ -47,12 +47,13 @@ static int sysfs_kf_seq_show(struct seq_file *sf, void *v)
 	ssize_t count;
 	char *buf;
 
-	/* acquire buffer and ensure that it's >= PAGE_SIZE */
+	/* acquire buffer and ensure that it's >= PAGE_SIZE and clear */
 	count = seq_get_buf(sf, &buf);
 	if (count < PAGE_SIZE) {
 		seq_commit(sf, -1);
 		return 0;
 	}
+	memset(buf, 0, PAGE_SIZE);
 
 	/*
 	 * Invoke show().  Control may reach here via seq file lseek even
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index aa0406895b53..7d2a860ba788 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -18,7 +18,7 @@
 #include "sysfs.h"
 
 
-static void remove_files(struct kernfs_node *parent, struct kobject *kobj,
+static void remove_files(struct kernfs_node *parent,
 			 const struct attribute_group *grp)
 {
 	struct attribute *const *attr;
@@ -29,7 +29,7 @@ static void remove_files(struct kernfs_node *parent, struct kobject *kobj,
 			kernfs_remove_by_name(parent, (*attr)->name);
 	if (grp->bin_attrs)
 		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
-			sysfs_remove_bin_file(kobj, *bin_attr);
+			kernfs_remove_by_name(parent, (*bin_attr)->attr.name);
 }
 
 static int create_files(struct kernfs_node *parent, struct kobject *kobj,
@@ -62,7 +62,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
 				break;
 		}
 		if (error) {
-			remove_files(parent, kobj, grp);
+			remove_files(parent, grp);
 			goto exit;
 		}
 	}
@@ -79,7 +79,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
 				break;
 		}
 		if (error)
-			remove_files(parent, kobj, grp);
+			remove_files(parent, grp);
 	}
 exit:
 	return error;
@@ -224,7 +224,7 @@ void sysfs_remove_group(struct kobject *kobj,
 		kernfs_get(kn);
 	}
 
-	remove_files(kn, kobj, grp);
+	remove_files(kn, grp);
 	if (grp->name)
 		kernfs_remove(kn);
 
diff --git a/fs/sysfs/mount.c b/fs/sysfs/mount.c
index a66ad6196f59..8a49486bf30c 100644
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -13,6 +13,7 @@
 #define DEBUG
 
 #include <linux/fs.h>
+#include <linux/magic.h>
 #include <linux/mount.h>
 #include <linux/init.h>
 #include <linux/user_namespace.h>
@@ -38,7 +39,8 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 	}
 
 	ns = kobj_ns_grab_current(KOBJ_NS_TYPE_NET);
-	root = kernfs_mount_ns(fs_type, flags, sysfs_root, &new_sb, ns);
+	root = kernfs_mount_ns(fs_type, flags, sysfs_root,
+				SYSFS_MAGIC, &new_sb, ns);
 	if (IS_ERR(root) || !new_sb)
 		kobj_ns_drop(KOBJ_NS_TYPE_NET, ns);
 	return root;
@@ -63,7 +65,8 @@ int __init sysfs_init(void)
 {
 	int err;
 
-	sysfs_root = kernfs_create_root(NULL, 0, NULL);
+	sysfs_root = kernfs_create_root(NULL, KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK,
+					NULL);
 	if (IS_ERR(sysfs_root))
 		return PTR_ERR(sysfs_root);
 
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 4b826abb1528..45d4e96a6bac 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -460,9 +460,9 @@ static int write_cnodes(struct ubifs_info *c)
 		 * important.
 		 */
 		clear_bit(DIRTY_CNODE, &cnode->flags);
-		smp_mb__before_clear_bit();
+		smp_mb__before_atomic();
 		clear_bit(COW_CNODE, &cnode->flags);
-		smp_mb__after_clear_bit();
+		smp_mb__after_atomic();
 		offs += len;
 		dbg_chk_lpt_sz(c, 1, len);
 		cnode = cnode->cnext;
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index a1266089eca1..a81c7b556896 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -1556,7 +1556,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
 	if (c->space_fixup) {
 		err = ubifs_fixup_free_space(c);
 		if (err)
-			return err;
+			goto out;
 	}
 
 	err = check_free_space(c);
diff --git a/fs/ubifs/tnc_commit.c b/fs/ubifs/tnc_commit.c
index 52a6559275c4..3600994f8411 100644
--- a/fs/ubifs/tnc_commit.c
+++ b/fs/ubifs/tnc_commit.c
@@ -895,9 +895,9 @@ static int write_index(struct ubifs_info *c)
 		 * the reason for the second barrier.
 		 */
 		clear_bit(DIRTY_ZNODE, &znode->flags);
-		smp_mb__before_clear_bit();
+		smp_mb__before_atomic();
 		clear_bit(COW_ZNODE, &znode->flags);
-		smp_mb__after_clear_bit();
+		smp_mb__after_atomic();
 
 		/*
 		 * We have marked the znode as clean but have not updated the
diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c
index 01b6a0102fbd..abda1124a70f 100644
--- a/fs/xfs/xfs_attr.c
+++ b/fs/xfs/xfs_attr.c
@@ -213,7 +213,7 @@ xfs_attr_calc_size(
 		 * Out of line attribute, cannot double split, but
 		 * make room for the attribute value itself.
 		 */
-		uint	dblocks = XFS_B_TO_FSB(mp, valuelen);
+		uint	dblocks = xfs_attr3_rmt_blocks(mp, valuelen);
 		nblks += dblocks;
 		nblks += XFS_NEXTENTADD_SPACE_RES(mp, dblocks, XFS_ATTR_FORK);
 	}
@@ -698,11 +698,22 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 
 		trace_xfs_attr_leaf_replace(args);
 
+		/* save the attribute state for later removal*/
 		args->op_flags |= XFS_DA_OP_RENAME;	/* an atomic rename */
 		args->blkno2 = args->blkno;		/* set 2nd entry info*/
 		args->index2 = args->index;
 		args->rmtblkno2 = args->rmtblkno;
 		args->rmtblkcnt2 = args->rmtblkcnt;
+		args->rmtvaluelen2 = args->rmtvaluelen;
+
+		/*
+		 * clear the remote attr state now that it is saved so that the
+		 * values reflect the state of the attribute we are about to
+		 * add, not the attribute we just found and will remove later.
+		 */
+		args->rmtblkno = 0;
+		args->rmtblkcnt = 0;
+		args->rmtvaluelen = 0;
 	}
 
 	/*
@@ -794,6 +805,7 @@ xfs_attr_leaf_addname(xfs_da_args_t *args)
 		args->blkno = args->blkno2;
 		args->rmtblkno = args->rmtblkno2;
 		args->rmtblkcnt = args->rmtblkcnt2;
+		args->rmtvaluelen = args->rmtvaluelen2;
 		if (args->rmtblkno) {
 			error = xfs_attr_rmtval_remove(args);
 			if (error)
@@ -999,13 +1011,22 @@ restart:
 
 		trace_xfs_attr_node_replace(args);
 
+		/* save the attribute state for later removal*/
 		args->op_flags |= XFS_DA_OP_RENAME;	/* atomic rename op */
 		args->blkno2 = args->blkno;		/* set 2nd entry info*/
 		args->index2 = args->index;
 		args->rmtblkno2 = args->rmtblkno;
 		args->rmtblkcnt2 = args->rmtblkcnt;
+		args->rmtvaluelen2 = args->rmtvaluelen;
+
+		/*
+		 * clear the remote attr state now that it is saved so that the
+		 * values reflect the state of the attribute we are about to
+		 * add, not the attribute we just found and will remove later.
+		 */
 		args->rmtblkno = 0;
 		args->rmtblkcnt = 0;
+		args->rmtvaluelen = 0;
 	}
 
 	retval = xfs_attr3_leaf_add(blk->bp, state->args);
@@ -1133,6 +1154,7 @@ restart:
 		args->blkno = args->blkno2;
 		args->rmtblkno = args->rmtblkno2;
 		args->rmtblkcnt = args->rmtblkcnt2;
+		args->rmtvaluelen = args->rmtvaluelen2;
 		if (args->rmtblkno) {
 			error = xfs_attr_rmtval_remove(args);
 			if (error)
diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c
index fe9587fab17a..511c283459b1 100644
--- a/fs/xfs/xfs_attr_leaf.c
+++ b/fs/xfs/xfs_attr_leaf.c
@@ -1229,6 +1229,7 @@ xfs_attr3_leaf_add_work(
 		name_rmt->valueblk = 0;
 		args->rmtblkno = 1;
 		args->rmtblkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+		args->rmtvaluelen = args->valuelen;
 	}
 	xfs_trans_log_buf(args->trans, bp,
 	     XFS_DA_LOGRANGE(leaf, xfs_attr3_leaf_name(leaf, args->index),
@@ -2167,11 +2168,11 @@ xfs_attr3_leaf_lookup_int(
 			if (!xfs_attr_namesp_match(args->flags, entry->flags))
 				continue;
 			args->index = probe;
-			args->valuelen = be32_to_cpu(name_rmt->valuelen);
+			args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
 			args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
 			args->rmtblkcnt = xfs_attr3_rmt_blocks(
 							args->dp->i_mount,
-							args->valuelen);
+							args->rmtvaluelen);
 			return XFS_ERROR(EEXIST);
 		}
 	}
@@ -2220,19 +2221,19 @@ xfs_attr3_leaf_getvalue(
 		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
 		ASSERT(name_rmt->namelen == args->namelen);
 		ASSERT(memcmp(args->name, name_rmt->name, args->namelen) == 0);
-		valuelen = be32_to_cpu(name_rmt->valuelen);
+		args->rmtvaluelen = be32_to_cpu(name_rmt->valuelen);
 		args->rmtblkno = be32_to_cpu(name_rmt->valueblk);
 		args->rmtblkcnt = xfs_attr3_rmt_blocks(args->dp->i_mount,
-						       valuelen);
+						       args->rmtvaluelen);
 		if (args->flags & ATTR_KERNOVAL) {
-			args->valuelen = valuelen;
+			args->valuelen = args->rmtvaluelen;
 			return 0;
 		}
-		if (args->valuelen < valuelen) {
-			args->valuelen = valuelen;
+		if (args->valuelen < args->rmtvaluelen) {
+			args->valuelen = args->rmtvaluelen;
 			return XFS_ERROR(ERANGE);
 		}
-		args->valuelen = valuelen;
+		args->valuelen = args->rmtvaluelen;
 	}
 	return 0;
 }
@@ -2519,7 +2520,7 @@ xfs_attr3_leaf_clearflag(
 		ASSERT((entry->flags & XFS_ATTR_LOCAL) == 0);
 		name_rmt = xfs_attr3_leaf_name_remote(leaf, args->index);
 		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
-		name_rmt->valuelen = cpu_to_be32(args->valuelen);
+		name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
 		xfs_trans_log_buf(args->trans, bp,
 			 XFS_DA_LOGRANGE(leaf, name_rmt, sizeof(*name_rmt)));
 	}
@@ -2677,7 +2678,7 @@ xfs_attr3_leaf_flipflags(
 		ASSERT((entry1->flags & XFS_ATTR_LOCAL) == 0);
 		name_rmt = xfs_attr3_leaf_name_remote(leaf1, args->index);
 		name_rmt->valueblk = cpu_to_be32(args->rmtblkno);
-		name_rmt->valuelen = cpu_to_be32(args->valuelen);
+		name_rmt->valuelen = cpu_to_be32(args->rmtvaluelen);
 		xfs_trans_log_buf(args->trans, bp1,
 			 XFS_DA_LOGRANGE(leaf1, name_rmt, sizeof(*name_rmt)));
 	}
diff --git a/fs/xfs/xfs_attr_list.c b/fs/xfs/xfs_attr_list.c
index 01db96f60cf0..833fe5d98d80 100644
--- a/fs/xfs/xfs_attr_list.c
+++ b/fs/xfs/xfs_attr_list.c
@@ -447,6 +447,7 @@ xfs_attr3_leaf_list_int(
 				args.dp = context->dp;
 				args.whichfork = XFS_ATTR_FORK;
 				args.valuelen = valuelen;
+				args.rmtvaluelen = valuelen;
 				args.value = kmem_alloc(valuelen, KM_SLEEP | KM_NOFS);
 				args.rmtblkno = be32_to_cpu(name_rmt->valueblk);
 				args.rmtblkcnt = xfs_attr3_rmt_blocks(
diff --git a/fs/xfs/xfs_attr_remote.c b/fs/xfs/xfs_attr_remote.c
index 6e37823e2932..d2e6e948cec7 100644
--- a/fs/xfs/xfs_attr_remote.c
+++ b/fs/xfs/xfs_attr_remote.c
@@ -337,7 +337,7 @@ xfs_attr_rmtval_get(
 	struct xfs_buf		*bp;
 	xfs_dablk_t		lblkno = args->rmtblkno;
 	__uint8_t		*dst = args->value;
-	int			valuelen = args->valuelen;
+	int			valuelen;
 	int			nmap;
 	int			error;
 	int			blkcnt = args->rmtblkcnt;
@@ -347,7 +347,9 @@ xfs_attr_rmtval_get(
 	trace_xfs_attr_rmtval_get(args);
 
 	ASSERT(!(args->flags & ATTR_KERNOVAL));
+	ASSERT(args->rmtvaluelen == args->valuelen);
 
+	valuelen = args->rmtvaluelen;
 	while (valuelen > 0) {
 		nmap = ATTR_RMTVALUE_MAPSIZE;
 		error = xfs_bmapi_read(args->dp, (xfs_fileoff_t)lblkno,
@@ -415,7 +417,7 @@ xfs_attr_rmtval_set(
 	 * attributes have headers, we can't just do a straight byte to FSB
 	 * conversion and have to take the header space into account.
 	 */
-	blkcnt = xfs_attr3_rmt_blocks(mp, args->valuelen);
+	blkcnt = xfs_attr3_rmt_blocks(mp, args->rmtvaluelen);
 	error = xfs_bmap_first_unused(args->trans, args->dp, blkcnt, &lfileoff,
 						   XFS_ATTR_FORK);
 	if (error)
@@ -480,7 +482,7 @@ xfs_attr_rmtval_set(
 	 */
 	lblkno = args->rmtblkno;
 	blkcnt = args->rmtblkcnt;
-	valuelen = args->valuelen;
+	valuelen = args->rmtvaluelen;
 	while (valuelen > 0) {
 		struct xfs_buf	*bp;
 		xfs_daddr_t	dblkno;
diff --git a/fs/xfs/xfs_da_btree.h b/fs/xfs/xfs_da_btree.h
index 6e95ea79f5d7..201c6091d26a 100644
--- a/fs/xfs/xfs_da_btree.h
+++ b/fs/xfs/xfs_da_btree.h
@@ -60,10 +60,12 @@ typedef struct xfs_da_args {
 	int		index;		/* index of attr of interest in blk */
 	xfs_dablk_t	rmtblkno;	/* remote attr value starting blkno */
 	int		rmtblkcnt;	/* remote attr value block count */
+	int		rmtvaluelen;	/* remote attr value length in bytes */
 	xfs_dablk_t	blkno2;		/* blkno of 2nd attr leaf of interest */
 	int		index2;		/* index of 2nd attr in blk */
 	xfs_dablk_t	rmtblkno2;	/* remote attr value starting blkno */
 	int		rmtblkcnt2;	/* remote attr value block count */
+	int		rmtvaluelen2;	/* remote attr value length in bytes */
 	int		op_flags;	/* operation flags */
 	enum xfs_dacmp	cmpresult;	/* name compare result for lookups */
 } xfs_da_args_t;
diff --git a/fs/xfs/xfs_export.c b/fs/xfs/xfs_export.c
index 1399e187d425..753e467aa1a5 100644
--- a/fs/xfs/xfs_export.c
+++ b/fs/xfs/xfs_export.c
@@ -237,7 +237,7 @@ xfs_fs_nfs_commit_metadata(
 
 	if (!lsn)
 		return 0;
-	return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+	return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
 }
 
 const struct export_operations xfs_export_operations = {
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 951a2321ee01..830c1c937b88 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -155,7 +155,7 @@ xfs_dir_fsync(
 
 	if (!lsn)
 		return 0;
-	return _xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
+	return -_xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, NULL);
 }
 
 STATIC int
@@ -295,7 +295,7 @@ xfs_file_aio_read(
 		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
 		if (inode->i_mapping->nrpages) {
-			ret = -filemap_write_and_wait_range(
+			ret = filemap_write_and_wait_range(
 							VFS_I(ip)->i_mapping,
 							pos, -1);
 			if (ret) {
@@ -837,7 +837,7 @@ xfs_file_fallocate(
 		unsigned blksize_mask = (1 << inode->i_blkbits) - 1;
 
 		if (offset & blksize_mask || len & blksize_mask) {
-			error = -EINVAL;
+			error = EINVAL;
 			goto out_unlock;
 		}
 
@@ -846,7 +846,7 @@ xfs_file_fallocate(
 		 * in which case it is effectively a truncate operation
 		 */
 		if (offset + len >= i_size_read(inode)) {
-			error = -EINVAL;
+			error = EINVAL;
 			goto out_unlock;
 		}
 
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index ef1ca010f417..36d630319a27 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -72,8 +72,8 @@ xfs_initxattrs(
 	int			error = 0;
 
 	for (xattr = xattr_array; xattr->name != NULL; xattr++) {
-		error = xfs_attr_set(ip, xattr->name, xattr->value,
-				     xattr->value_len, ATTR_SECURE);
+		error = -xfs_attr_set(ip, xattr->name, xattr->value,
+				      xattr->value_len, ATTR_SECURE);
 		if (error < 0)
 			break;
 	}
@@ -93,8 +93,8 @@ xfs_init_security(
 	struct inode	*dir,
 	const struct qstr *qstr)
 {
-	return security_inode_init_security(inode, dir, qstr,
-					    &xfs_initxattrs, NULL);
+	return -security_inode_init_security(inode, dir, qstr,
+					     &xfs_initxattrs, NULL);
 }
 
 static void
@@ -124,15 +124,15 @@ xfs_cleanup_inode(
 	xfs_dentry_to_name(&teardown, dentry, 0);
 
 	xfs_remove(XFS_I(dir), &teardown, XFS_I(inode));
-	iput(inode);
 }
 
 STATIC int
-xfs_vn_mknod(
+xfs_generic_create(
 	struct inode	*dir,
 	struct dentry	*dentry,
 	umode_t		mode,
-	dev_t		rdev)
+	dev_t		rdev,
+	bool		tmpfile)	/* unnamed file */
 {
 	struct inode	*inode;
 	struct xfs_inode *ip = NULL;
@@ -156,8 +156,12 @@ xfs_vn_mknod(
 	if (error)
 		return error;
 
-	xfs_dentry_to_name(&name, dentry, mode);
-	error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
+	if (!tmpfile) {
+		xfs_dentry_to_name(&name, dentry, mode);
+		error = xfs_create(XFS_I(dir), &name, mode, rdev, &ip);
+	} else {
+		error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip);
+	}
 	if (unlikely(error))
 		goto out_free_acl;
 
@@ -169,18 +173,22 @@ xfs_vn_mknod(
 
 #ifdef CONFIG_XFS_POSIX_ACL
 	if (default_acl) {
-		error = xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
+		error = -xfs_set_acl(inode, default_acl, ACL_TYPE_DEFAULT);
 		if (error)
 			goto out_cleanup_inode;
 	}
 	if (acl) {
-		error = xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
+		error = -xfs_set_acl(inode, acl, ACL_TYPE_ACCESS);
 		if (error)
 			goto out_cleanup_inode;
 	}
 #endif
 
-	d_instantiate(dentry, inode);
+	if (tmpfile)
+		d_tmpfile(dentry, inode);
+	else
+		d_instantiate(dentry, inode);
+
  out_free_acl:
 	if (default_acl)
 		posix_acl_release(default_acl);
@@ -189,11 +197,23 @@ xfs_vn_mknod(
 	return -error;
 
  out_cleanup_inode:
-	xfs_cleanup_inode(dir, inode, dentry);
+	if (!tmpfile)
+		xfs_cleanup_inode(dir, inode, dentry);
+	iput(inode);
 	goto out_free_acl;
 }
 
 STATIC int
+xfs_vn_mknod(
+	struct inode	*dir,
+	struct dentry	*dentry,
+	umode_t		mode,
+	dev_t		rdev)
+{
+	return xfs_generic_create(dir, dentry, mode, rdev, false);
+}
+
+STATIC int
 xfs_vn_create(
 	struct inode	*dir,
 	struct dentry	*dentry,
@@ -353,6 +373,7 @@ xfs_vn_symlink(
 
  out_cleanup_inode:
 	xfs_cleanup_inode(dir, inode, dentry);
+	iput(inode);
  out:
 	return -error;
 }
@@ -1053,25 +1074,7 @@ xfs_vn_tmpfile(
 	struct dentry	*dentry,
 	umode_t		mode)
 {
-	int			error;
-	struct xfs_inode	*ip;
-	struct inode		*inode;
-
-	error = xfs_create_tmpfile(XFS_I(dir), dentry, mode, &ip);
-	if (unlikely(error))
-		return -error;
-
-	inode = VFS_I(ip);
-
-	error = xfs_init_security(inode, dir, &dentry->d_name);
-	if (unlikely(error)) {
-		iput(inode);
-		return -error;
-	}
-
-	d_tmpfile(dentry, inode);
-
-	return 0;
+	return xfs_generic_create(dir, dentry, mode, 0, true);
 }
 
 static const struct inode_operations xfs_inode_operations = {
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 08624dc67317..a5f8bd9899d3 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -616,11 +616,13 @@ xfs_log_mount(
 	int		error = 0;
 	int		min_logfsbs;
 
-	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY))
-		xfs_notice(mp, "Mounting Filesystem");
-	else {
+	if (!(mp->m_flags & XFS_MOUNT_NORECOVERY)) {
+		xfs_notice(mp, "Mounting V%d Filesystem",
+			   XFS_SB_VERSION_NUM(&mp->m_sb));
+	} else {
 		xfs_notice(mp,
-"Mounting filesystem in no-recovery mode.  Filesystem will be inconsistent.");
+"Mounting V%d filesystem in no-recovery mode. Filesystem will be inconsistent.",
+			   XFS_SB_VERSION_NUM(&mp->m_sb));
 		ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
 	}
 
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 993cb19e7d39..944f3d9456a8 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -743,8 +743,6 @@ xfs_mountfs(
 		new_size *= mp->m_sb.sb_inodesize / XFS_DINODE_MIN_SIZE;
 		if (mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(mp, new_size))
 			mp->m_inode_cluster_size = new_size;
-		xfs_info(mp, "Using inode cluster size of %d bytes",
-			 mp->m_inode_cluster_size);
 	}
 
 	/*
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 348e4d2ed6e6..dc977b6e6a36 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -843,22 +843,17 @@ xfs_qm_init_quotainfo(
 
 	qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
 
-	if ((error = list_lru_init(&qinf->qi_lru))) {
-		kmem_free(qinf);
-		mp->m_quotainfo = NULL;
-		return error;
-	}
+	error = -list_lru_init(&qinf->qi_lru);
+	if (error)
+		goto out_free_qinf;
 
 	/*
 	 * See if quotainodes are setup, and if not, allocate them,
 	 * and change the superblock accordingly.
 	 */
-	if ((error = xfs_qm_init_quotainos(mp))) {
-		list_lru_destroy(&qinf->qi_lru);
-		kmem_free(qinf);
-		mp->m_quotainfo = NULL;
-		return error;
-	}
+	error = xfs_qm_init_quotainos(mp);
+	if (error)
+		goto out_free_lru;
 
 	INIT_RADIX_TREE(&qinf->qi_uquota_tree, GFP_NOFS);
 	INIT_RADIX_TREE(&qinf->qi_gquota_tree, GFP_NOFS);
@@ -918,7 +913,7 @@ xfs_qm_init_quotainfo(
 		qinf->qi_isoftlimit = be64_to_cpu(ddqp->d_ino_softlimit);
 		qinf->qi_rtbhardlimit = be64_to_cpu(ddqp->d_rtb_hardlimit);
 		qinf->qi_rtbsoftlimit = be64_to_cpu(ddqp->d_rtb_softlimit);
- 
+
 		xfs_qm_dqdestroy(dqp);
 	} else {
 		qinf->qi_btimelimit = XFS_QM_BTIMELIMIT;
@@ -935,6 +930,13 @@ xfs_qm_init_quotainfo(
 	qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
 	register_shrinker(&qinf->qi_shrinker);
 	return 0;
+
+out_free_lru:
+	list_lru_destroy(&qinf->qi_lru);
+out_free_qinf:
+	kmem_free(qinf);
+	mp->m_quotainfo = NULL;
+	return error;
 }
 
 
diff --git a/fs/xfs/xfs_sb.c b/fs/xfs/xfs_sb.c
index 0c0e41bbe4e3..8baf61afae1d 100644
--- a/fs/xfs/xfs_sb.c
+++ b/fs/xfs/xfs_sb.c
@@ -201,10 +201,6 @@ xfs_mount_validate_sb(
 	 * write validation, we don't need to check feature masks.
 	 */
 	if (check_version && XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5) {
-		xfs_alert(mp,
-"Version 5 superblock detected. This kernel has EXPERIMENTAL support enabled!\n"
-"Use of these features in this kernel is at your own risk!");
-
 		if (xfs_sb_has_compat_feature(sbp,
 					XFS_SB_FEAT_COMPAT_UNKNOWN)) {
 			xfs_warn(mp,
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 205376776377..3494eff8e4eb 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1433,11 +1433,11 @@ xfs_fs_fill_super(
 	if (error)
 		goto out_free_fsname;
 
-	error = xfs_init_mount_workqueues(mp);
+	error = -xfs_init_mount_workqueues(mp);
 	if (error)
 		goto out_close_devices;
 
-	error = xfs_icsb_init_counters(mp);
+	error = -xfs_icsb_init_counters(mp);
 	if (error)
 		goto out_destroy_workqueues;