164 files changed, 7641 insertions, 6025 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 58e6cbce4156..08f2e1e9a7e6 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -603,10 +603,11 @@ static int v9fs_cache_register(void)
 	if (ret < 0)
 		return ret;
 #ifdef CONFIG_9P_FSCACHE
-	return fscache_register_netfs(&v9fs_cache_netfs);
-#else
-	return ret;
+	ret = fscache_register_netfs(&v9fs_cache_netfs);
+	if (ret < 0)
+		v9fs_destroy_inode_cache();
 #endif
+	return ret;
 }
 
 static void v9fs_cache_unregister(void)
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 53687bbf2296..a7c481402c46 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -267,14 +267,8 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry,
 	}
 
 	/* Only creates */
-	if (!(flags & O_CREAT))
+	if (!(flags & O_CREAT) || dentry->d_inode)
 		return	finish_no_open(file, res);
-	else if (dentry->d_inode) {
-		if ((flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
-			return -EEXIST;
-		else
-			return finish_no_open(file, res);
-	}
 
 	v9ses = v9fs_inode2v9ses(dir);
 
diff --git a/fs/adfs/inode.c b/fs/adfs/inode.c
index 5f95d1ed9c6d..b9acadafa4a1 100644
--- a/fs/adfs/inode.c
+++ b/fs/adfs/inode.c
@@ -50,7 +50,7 @@ static void adfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size)
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 }
 
 static int adfs_write_begin(struct file *file, struct address_space *mapping,
diff --git a/fs/affs/file.c b/fs/affs/file.c
index 776e3935a758..8669b6ecddee 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -406,7 +406,7 @@ static void affs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		affs_truncate(inode);
 	}
 }
diff --git a/fs/aio.c b/fs/aio.c
index 9b5ca1137419..6b868f0e0c4c 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -26,6 +26,7 @@
 #include <linux/mm.h>
 #include <linux/mman.h>
 #include <linux/mmu_context.h>
+#include <linux/percpu.h>
 #include <linux/slab.h>
 #include <linux/timer.h>
 #include <linux/aio.h>
@@ -35,6 +36,10 @@
 #include <linux/eventfd.h>
 #include <linux/blkdev.h>
 #include <linux/compat.h>
+#include <linux/anon_inodes.h>
+#include <linux/migrate.h>
+#include <linux/ramfs.h>
+#include <linux/percpu-refcount.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -61,14 +66,29 @@ struct aio_ring {
 
 #define AIO_RING_PAGES	8
 
+struct kioctx_table {
+	struct rcu_head	rcu;
+	unsigned	nr;
+	struct kioctx	*table[];
+};
+
+struct kioctx_cpu {
+	unsigned		reqs_available;
+};
+
 struct kioctx {
-	atomic_t		users;
+	struct percpu_ref	users;
 	atomic_t		dead;
 
-	/* This needs improving */
 	unsigned long		user_id;
-	struct hlist_node	list;
 
+	struct __percpu kioctx_cpu *cpu;
+
+	/*
+	 * For percpu reqs_available, number of slots we move to/from global
+	 * counter at a time:
+	 */
+	unsigned		req_batch;
 	/*
 	 * This is what userspace passed to io_setup(), it's not used for
 	 * anything but counting against the global max_reqs quota.
@@ -88,10 +108,18 @@ struct kioctx {
 	long			nr_pages;
 
 	struct rcu_head		rcu_head;
-	struct work_struct	rcu_work;
+	struct work_struct	free_work;
 
 	struct {
-		atomic_t	reqs_active;
+		/*
+		 * This counts the number of available slots in the ringbuffer,
+		 * so we avoid overflowing it: it's decremented (if positive)
+		 * when allocating a kiocb and incremented when the resulting
+		 * io_event is pulled off the ringbuffer.
+		 *
+		 * We batch accesses to it with a percpu version.
+		 */
+		atomic_t	reqs_available;
 	} ____cacheline_aligned_in_smp;
 
 	struct {
@@ -110,6 +138,9 @@ struct kioctx {
 	} ____cacheline_aligned_in_smp;
 
 	struct page		*internal_pages[AIO_RING_PAGES];
+	struct file		*aio_ring_file;
+
+	unsigned		id;
 };
 
 /*------ sysctl variables----*/
@@ -138,15 +169,77 @@ __initcall(aio_setup);
 
 static void aio_free_ring(struct kioctx *ctx)
 {
-	long i;
+	int i;
+	struct file *aio_ring_file = ctx->aio_ring_file;
 
-	for (i = 0; i < ctx->nr_pages; i++)
+	for (i = 0; i < ctx->nr_pages; i++) {
+		pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
+				page_count(ctx->ring_pages[i]));
 		put_page(ctx->ring_pages[i]);
+	}
 
 	if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
 		kfree(ctx->ring_pages);
+
+	if (aio_ring_file) {
+		truncate_setsize(aio_ring_file->f_inode, 0);
+		fput(aio_ring_file);
+		ctx->aio_ring_file = NULL;
+	}
+}
+
+static int aio_ring_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	vma->vm_ops = &generic_file_vm_ops;
+	return 0;
 }
 
+static const struct file_operations aio_ring_fops = {
+	.mmap = aio_ring_mmap,
+};
+
+static int aio_set_page_dirty(struct page *page)
+{
+	return 0;
+}
+
+#if IS_ENABLED(CONFIG_MIGRATION)
+static int aio_migratepage(struct address_space *mapping, struct page *new,
+			struct page *old, enum migrate_mode mode)
+{
+	struct kioctx *ctx = mapping->private_data;
+	unsigned long flags;
+	unsigned idx = old->index;
+	int rc;
+
+	/* Writeback must be complete */
+	BUG_ON(PageWriteback(old));
+	put_page(old);
+
+	rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
+	if (rc != MIGRATEPAGE_SUCCESS) {
+		get_page(old);
+		return rc;
+	}
+
+	get_page(new);
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	migrate_page_copy(new, old);
+	ctx->ring_pages[idx] = new;
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	return rc;
+}
+#endif
+
+static const struct address_space_operations aio_ctx_aops = {
+	.set_page_dirty = aio_set_page_dirty,
+#if IS_ENABLED(CONFIG_MIGRATION)
+	.migratepage	= aio_migratepage,
+#endif
+};
+
 static int aio_setup_ring(struct kioctx *ctx)
 {
 	struct aio_ring *ring;
@@ -154,20 +247,45 @@ static int aio_setup_ring(struct kioctx *ctx)
 	struct mm_struct *mm = current->mm;
 	unsigned long size, populate;
 	int nr_pages;
+	int i;
+	struct file *file;
 
 	/* Compensate for the ring buffer's head/tail overlap entry */
 	nr_events += 2;	/* 1 is required, 2 for good luck */
 
 	size = sizeof(struct aio_ring);
 	size += sizeof(struct io_event) * nr_events;
-	nr_pages = (size + PAGE_SIZE-1) >> PAGE_SHIFT;
 
+	nr_pages = PFN_UP(size);
 	if (nr_pages < 0)
 		return -EINVAL;
 
-	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
+	file = anon_inode_getfile_private("[aio]", &aio_ring_fops, ctx, O_RDWR);
+	if (IS_ERR(file)) {
+		ctx->aio_ring_file = NULL;
+		return -EAGAIN;
+	}
+
+	file->f_inode->i_mapping->a_ops = &aio_ctx_aops;
+	file->f_inode->i_mapping->private_data = ctx;
+	file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages;
+
+	for (i = 0; i < nr_pages; i++) {
+		struct page *page;
+		page = find_or_create_page(file->f_inode->i_mapping,
+					   i, GFP_HIGHUSER | __GFP_ZERO);
+		if (!page)
+			break;
+		pr_debug("pid(%d) page[%d]->count=%d\n",
+			 current->pid, i, page_count(page));
+		SetPageUptodate(page);
+		SetPageDirty(page);
+		unlock_page(page);
+	}
+	ctx->aio_ring_file = file;
+	nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring))
+			/ sizeof(struct io_event);
 
-	ctx->nr_events = 0;
 	ctx->ring_pages = ctx->internal_pages;
 	if (nr_pages > AIO_RING_PAGES) {
 		ctx->ring_pages = kcalloc(nr_pages, sizeof(struct page *),
@@ -178,10 +296,11 @@ static int aio_setup_ring(struct kioctx *ctx)
 
 	ctx->mmap_size = nr_pages * PAGE_SIZE;
 	pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
+
 	down_write(&mm->mmap_sem);
-	ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
-				       PROT_READ|PROT_WRITE,
-				       MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
+	ctx->mmap_base = do_mmap_pgoff(ctx->aio_ring_file, 0, ctx->mmap_size,
+				       PROT_READ | PROT_WRITE,
+				       MAP_SHARED | MAP_POPULATE, 0, &populate);
 	if (IS_ERR((void *)ctx->mmap_base)) {
 		up_write(&mm->mmap_sem);
 		ctx->mmap_size = 0;
@@ -190,23 +309,34 @@ static int aio_setup_ring(struct kioctx *ctx)
 	}
 
 	pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
+
+	/* We must do this while still holding mmap_sem for write, as we
+	 * need to be protected against userspace attempting to mremap()
+	 * or munmap() the ring buffer.
+	 */
 	ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
 				       1, 0, ctx->ring_pages, NULL);
+
+	/* Dropping the reference here is safe as the page cache will hold
+	 * onto the pages for us.  It is also required so that page migration
+	 * can unmap the pages and get the right reference count.
+	 */
+	for (i = 0; i < ctx->nr_pages; i++)
+		put_page(ctx->ring_pages[i]);
+
 	up_write(&mm->mmap_sem);
 
 	if (unlikely(ctx->nr_pages != nr_pages)) {
 		aio_free_ring(ctx);
 		return -EAGAIN;
 	}
-	if (populate)
-		mm_populate(ctx->mmap_base, populate);
 
 	ctx->user_id = ctx->mmap_base;
 	ctx->nr_events = nr_events; /* trusted copy */
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	ring->nr = nr_events;	/* user copy */
-	ring->id = ctx->user_id;
+	ring->id = ~0U;
 	ring->head = ring->tail = 0;
 	ring->magic = AIO_RING_MAGIC;
 	ring->compat_features = AIO_RING_COMPAT_FEATURES;
@@ -238,11 +368,9 @@ void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel)
 }
 EXPORT_SYMBOL(kiocb_set_cancel_fn);
 
-static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
-			struct io_event *res)
+static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb)
 {
 	kiocb_cancel_fn *old, *cancel;
-	int ret = -EINVAL;
 
 	/*
 	 * Don't want to set kiocb->ki_cancel = KIOCB_CANCELLED unless it
@@ -252,28 +380,20 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
 	cancel = ACCESS_ONCE(kiocb->ki_cancel);
 	do {
 		if (!cancel || cancel == KIOCB_CANCELLED)
-			return ret;
+			return -EINVAL;
 
 		old = cancel;
 		cancel = cmpxchg(&kiocb->ki_cancel, old, KIOCB_CANCELLED);
 	} while (cancel != old);
 
-	atomic_inc(&kiocb->ki_users);
-	spin_unlock_irq(&ctx->ctx_lock);
-
-	memset(res, 0, sizeof(*res));
-	res->obj = (u64)(unsigned long)kiocb->ki_obj.user;
-	res->data = kiocb->ki_user_data;
-	ret = cancel(kiocb, res);
-
-	spin_lock_irq(&ctx->ctx_lock);
-
-	return ret;
+	return cancel(kiocb);
 }
 
 static void free_ioctx_rcu(struct rcu_head *head)
 {
 	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+
+	free_percpu(ctx->cpu);
 	kmem_cache_free(kioctx_cachep, ctx);
 }
 
@@ -282,12 +402,13 @@ static void free_ioctx_rcu(struct rcu_head *head)
  * and ctx->users has dropped to 0, so we know no more kiocbs can be submitted -
  * now it's safe to cancel any that need to be.
  */
-static void free_ioctx(struct kioctx *ctx)
+static void free_ioctx(struct work_struct *work)
 {
+	struct kioctx *ctx = container_of(work, struct kioctx, free_work);
 	struct aio_ring *ring;
-	struct io_event res;
 	struct kiocb *req;
-	unsigned head, avail;
+	unsigned cpu, avail;
+	DEFINE_WAIT(wait);
 
 	spin_lock_irq(&ctx->ctx_lock);
 
@@ -296,28 +417,38 @@ static void free_ioctx(struct kioctx *ctx)
 				       struct kiocb, ki_list);
 
 		list_del_init(&req->ki_list);
-		kiocb_cancel(ctx, req, &res);
+		kiocb_cancel(ctx, req);
 	}
 
 	spin_unlock_irq(&ctx->ctx_lock);
 
-	ring = kmap_atomic(ctx->ring_pages[0]);
-	head = ring->head;
-	kunmap_atomic(ring);
+	for_each_possible_cpu(cpu) {
+		struct kioctx_cpu *kcpu = per_cpu_ptr(ctx->cpu, cpu);
 
-	while (atomic_read(&ctx->reqs_active) > 0) {
-		wait_event(ctx->wait,
-				head != ctx->tail ||
-				atomic_read(&ctx->reqs_active) <= 0);
+		atomic_add(kcpu->reqs_available, &ctx->reqs_available);
+		kcpu->reqs_available = 0;
+	}
 
-		avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
+	while (1) {
+		prepare_to_wait(&ctx->wait, &wait, TASK_UNINTERRUPTIBLE);
 
-		atomic_sub(avail, &ctx->reqs_active);
-		head += avail;
-		head %= ctx->nr_events;
+		ring = kmap_atomic(ctx->ring_pages[0]);
+		avail = (ring->head <= ring->tail)
+			 ? ring->tail - ring->head
+			 : ctx->nr_events - ring->head + ring->tail;
+
+		atomic_add(avail, &ctx->reqs_available);
+		ring->head = ring->tail;
+		kunmap_atomic(ring);
+
+		if (atomic_read(&ctx->reqs_available) >= ctx->nr_events - 1)
+			break;
+
+		schedule();
 	}
+	finish_wait(&ctx->wait, &wait);
 
-	WARN_ON(atomic_read(&ctx->reqs_active) < 0);
+	WARN_ON(atomic_read(&ctx->reqs_available) > ctx->nr_events - 1);
 
 	aio_free_ring(ctx);
 
@@ -333,10 +464,68 @@ static void free_ioctx(struct kioctx *ctx)
 	call_rcu(&ctx->rcu_head, free_ioctx_rcu);
 }
 
-static void put_ioctx(struct kioctx *ctx)
+static void free_ioctx_ref(struct percpu_ref *ref)
 {
-	if (unlikely(atomic_dec_and_test(&ctx->users)))
-		free_ioctx(ctx);
+	struct kioctx *ctx = container_of(ref, struct kioctx, users);
+
+	INIT_WORK(&ctx->free_work, free_ioctx);
+	schedule_work(&ctx->free_work);
+}
+
+static int ioctx_add_table(struct kioctx *ctx, struct mm_struct *mm)
+{
+	unsigned i, new_nr;
+	struct kioctx_table *table, *old;
+	struct aio_ring *ring;
+
+	spin_lock(&mm->ioctx_lock);
+	rcu_read_lock();
+	table = rcu_dereference(mm->ioctx_table);
+
+	while (1) {
+		if (table)
+			for (i = 0; i < table->nr; i++)
+				if (!table->table[i]) {
+					ctx->id = i;
+					table->table[i] = ctx;
+					rcu_read_unlock();
+					spin_unlock(&mm->ioctx_lock);
+
+					ring = kmap_atomic(ctx->ring_pages[0]);
+					ring->id = ctx->id;
+					kunmap_atomic(ring);
+					return 0;
+				}
+
+		new_nr = (table ? table->nr : 1) * 4;
+
+		rcu_read_unlock();
+		spin_unlock(&mm->ioctx_lock);
+
+		table = kzalloc(sizeof(*table) + sizeof(struct kioctx *) *
+				new_nr, GFP_KERNEL);
+		if (!table)
+			return -ENOMEM;
+
+		table->nr = new_nr;
+
+		spin_lock(&mm->ioctx_lock);
+		rcu_read_lock();
+		old = rcu_dereference(mm->ioctx_table);
+
+		if (!old) {
+			rcu_assign_pointer(mm->ioctx_table, table);
+		} else if (table->nr > old->nr) {
+			memcpy(table->table, old->table,
+			       old->nr * sizeof(struct kioctx *));
+
+			rcu_assign_pointer(mm->ioctx_table, table);
+			kfree_rcu(old, rcu);
+		} else {
+			kfree(table);
+			table = old;
+		}
+	}
 }
 
 /* ioctx_alloc
@@ -348,6 +537,18 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	struct kioctx *ctx;
 	int err = -ENOMEM;
 
+	/*
+	 * We keep track of the number of available ringbuffer slots, to prevent
+	 * overflow (reqs_available), and we also use percpu counters for this.
+	 *
+	 * So since up to half the slots might be on other cpu's percpu counters
+	 * and unavailable, double nr_events so userspace sees what they
+	 * expected: additionally, we move req_batch slots to/from percpu
+	 * counters at a time, so make sure that isn't 0:
+	 */
+	nr_events = max(nr_events, num_possible_cpus() * 4);
+	nr_events *= 2;
+
 	/* Prevent overflows */
 	if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
 	    (nr_events > (0x10000000U / sizeof(struct kiocb)))) {
@@ -355,7 +556,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (!nr_events || (unsigned long)nr_events > aio_max_nr)
+	if (!nr_events || (unsigned long)nr_events > (aio_max_nr * 2UL))
 		return ERR_PTR(-EAGAIN);
 
 	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
@@ -364,8 +565,9 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	ctx->max_reqs = nr_events;
 
-	atomic_set(&ctx->users, 2);
-	atomic_set(&ctx->dead, 0);
+	if (percpu_ref_init(&ctx->users, free_ioctx_ref))
+		goto out_freectx;
+
 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
 	mutex_init(&ctx->ring_lock);
@@ -373,12 +575,21 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 
 	INIT_LIST_HEAD(&ctx->active_reqs);
 
+	ctx->cpu = alloc_percpu(struct kioctx_cpu);
+	if (!ctx->cpu)
+		goto out_freeref;
+
 	if (aio_setup_ring(ctx) < 0)
-		goto out_freectx;
+		goto out_freepcpu;
+
+	atomic_set(&ctx->reqs_available, ctx->nr_events - 1);
+	ctx->req_batch = (ctx->nr_events - 1) / (num_possible_cpus() * 4);
+	if (ctx->req_batch < 1)
+		ctx->req_batch = 1;
 
 	/* limit the number of system wide aios */
 	spin_lock(&aio_nr_lock);
-	if (aio_nr + nr_events > aio_max_nr ||
+	if (aio_nr + nr_events > (aio_max_nr * 2UL) ||
 	    aio_nr + nr_events < aio_nr) {
 		spin_unlock(&aio_nr_lock);
 		goto out_cleanup;
@@ -386,49 +597,54 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 	aio_nr += ctx->max_reqs;
 	spin_unlock(&aio_nr_lock);
 
-	/* now link into global list. */
-	spin_lock(&mm->ioctx_lock);
-	hlist_add_head_rcu(&ctx->list, &mm->ioctx_list);
-	spin_unlock(&mm->ioctx_lock);
+	percpu_ref_get(&ctx->users); /* io_setup() will drop this ref */
+
+	err = ioctx_add_table(ctx, mm);
+	if (err)
+		goto out_cleanup_put;
 
 	pr_debug("allocated ioctx %p[%ld]: mm=%p mask=0x%x\n",
 		 ctx, ctx->user_id, mm, ctx->nr_events);
 	return ctx;
 
+out_cleanup_put:
+	percpu_ref_put(&ctx->users);
 out_cleanup:
 	err = -EAGAIN;
 	aio_free_ring(ctx);
+out_freepcpu:
+	free_percpu(ctx->cpu);
+out_freeref:
+	free_percpu(ctx->users.pcpu_count);
 out_freectx:
+	if (ctx->aio_ring_file)
+		fput(ctx->aio_ring_file);
 	kmem_cache_free(kioctx_cachep, ctx);
 	pr_debug("error allocating ioctx %d\n", err);
 	return ERR_PTR(err);
 }
 
-static void kill_ioctx_work(struct work_struct *work)
-{
-	struct kioctx *ctx = container_of(work, struct kioctx, rcu_work);
-
-	wake_up_all(&ctx->wait);
-	put_ioctx(ctx);
-}
-
-static void kill_ioctx_rcu(struct rcu_head *head)
-{
-	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
-
-	INIT_WORK(&ctx->rcu_work, kill_ioctx_work);
-	schedule_work(&ctx->rcu_work);
-}
-
 /* kill_ioctx
  *	Cancels all outstanding aio requests on an aio context.  Used
  *	when the processes owning a context have all exited to encourage
  *	the rapid destruction of the kioctx.
  */
-static void kill_ioctx(struct kioctx *ctx)
+static void kill_ioctx(struct mm_struct *mm, struct kioctx *ctx)
 {
 	if (!atomic_xchg(&ctx->dead, 1)) {
-		hlist_del_rcu(&ctx->list);
+		struct kioctx_table *table;
+
+		spin_lock(&mm->ioctx_lock);
+		rcu_read_lock();
+		table = rcu_dereference(mm->ioctx_table);
+
+		WARN_ON(ctx != table->table[ctx->id]);
+		table->table[ctx->id] = NULL;
+		rcu_read_unlock();
+		spin_unlock(&mm->ioctx_lock);
+
+		/* percpu_ref_kill() will do the necessary call_rcu() */
+		wake_up_all(&ctx->wait);
 
 		/*
 		 * It'd be more correct to do this in free_ioctx(), after all
@@ -445,24 +661,23 @@ static void kill_ioctx(struct kioctx *ctx)
 		if (ctx->mmap_size)
 			vm_munmap(ctx->mmap_base, ctx->mmap_size);
 
-		/* Between hlist_del_rcu() and dropping the initial ref */
-		call_rcu(&ctx->rcu_head, kill_ioctx_rcu);
+		percpu_ref_kill(&ctx->users);
 	}
 }
 
 /* wait_on_sync_kiocb:
  *	Waits on the given sync kiocb to complete.
  */
-ssize_t wait_on_sync_kiocb(struct kiocb *iocb)
+ssize_t wait_on_sync_kiocb(struct kiocb *req)
 {
-	while (atomic_read(&iocb->ki_users)) {
+	while (!req->ki_ctx) {
 		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (!atomic_read(&iocb->ki_users))
+		if (req->ki_ctx)
 			break;
 		io_schedule();
 	}
 	__set_current_state(TASK_RUNNING);
-	return iocb->ki_user_data;
+	return req->ki_user_data;
 }
 EXPORT_SYMBOL(wait_on_sync_kiocb);
 
@@ -476,16 +691,28 @@ EXPORT_SYMBOL(wait_on_sync_kiocb);
  */
 void exit_aio(struct mm_struct *mm)
 {
+	struct kioctx_table *table;
 	struct kioctx *ctx;
-	struct hlist_node *n;
-
-	hlist_for_each_entry_safe(ctx, n, &mm->ioctx_list, list) {
-		if (1 != atomic_read(&ctx->users))
-			printk(KERN_DEBUG
-				"exit_aio:ioctx still alive: %d %d %d\n",
-				atomic_read(&ctx->users),
-				atomic_read(&ctx->dead),
-				atomic_read(&ctx->reqs_active));
+	unsigned i = 0;
+
+	while (1) {
+		rcu_read_lock();
+		table = rcu_dereference(mm->ioctx_table);
+
+		do {
+			if (!table || i >= table->nr) {
+				rcu_read_unlock();
+				rcu_assign_pointer(mm->ioctx_table, NULL);
+				if (table)
+					kfree(table);
+				return;
+			}
+
+			ctx = table->table[i++];
+		} while (!ctx);
+
+		rcu_read_unlock();
+
 		/*
 		 * We don't need to bother with munmap() here -
 		 * exit_mmap(mm) is coming and it'll unmap everything.
@@ -496,40 +723,75 @@ void exit_aio(struct mm_struct *mm)
 		 */
 		ctx->mmap_size = 0;
 
-		kill_ioctx(ctx);
+		kill_ioctx(mm, ctx);
+	}
+}
+
+static void put_reqs_available(struct kioctx *ctx, unsigned nr)
+{
+	struct kioctx_cpu *kcpu;
+
+	preempt_disable();
+	kcpu = this_cpu_ptr(ctx->cpu);
+
+	kcpu->reqs_available += nr;
+	while (kcpu->reqs_available >= ctx->req_batch * 2) {
+		kcpu->reqs_available -= ctx->req_batch;
+		atomic_add(ctx->req_batch, &ctx->reqs_available);
+	}
+
+	preempt_enable();
+}
+
+static bool get_reqs_available(struct kioctx *ctx)
+{
+	struct kioctx_cpu *kcpu;
+	bool ret = false;
+
+	preempt_disable();
+	kcpu = this_cpu_ptr(ctx->cpu);
+
+	if (!kcpu->reqs_available) {
+		int old, avail = atomic_read(&ctx->reqs_available);
+
+		do {
+			if (avail < ctx->req_batch)
+				goto out;
+
+			old = avail;
+			avail = atomic_cmpxchg(&ctx->reqs_available,
+					       avail, avail - ctx->req_batch);
+		} while (avail != old);
+
+		kcpu->reqs_available += ctx->req_batch;
 	}
+
+	ret = true;
+	kcpu->reqs_available--;
+out:
+	preempt_enable();
+	return ret;
 }
 
 /* aio_get_req
- *	Allocate a slot for an aio request.  Increments the ki_users count
- * of the kioctx so that the kioctx stays around until all requests are
- * complete.  Returns NULL if no requests are free.
- *
- * Returns with kiocb->ki_users set to 2.  The io submit code path holds
- * an extra reference while submitting the i/o.
- * This prevents races between the aio code path referencing the
- * req (after submitting it) and aio_complete() freeing the req.
+ *	Allocate a slot for an aio request.
+ * Returns NULL if no requests are free.
  */
 static inline struct kiocb *aio_get_req(struct kioctx *ctx)
 {
 	struct kiocb *req;
 
-	if (atomic_read(&ctx->reqs_active) >= ctx->nr_events)
+	if (!get_reqs_available(ctx))
 		return NULL;
 
-	if (atomic_inc_return(&ctx->reqs_active) > ctx->nr_events - 1)
-		goto out_put;
-
 	req = kmem_cache_alloc(kiocb_cachep, GFP_KERNEL|__GFP_ZERO);
 	if (unlikely(!req))
 		goto out_put;
 
-	atomic_set(&req->ki_users, 2);
 	req->ki_ctx = ctx;
-
 	return req;
 out_put:
-	atomic_dec(&ctx->reqs_active);
+	put_reqs_available(ctx, 1);
 	return NULL;
 }
 
@@ -539,35 +801,32 @@ static void kiocb_free(struct kiocb *req)
 		fput(req->ki_filp);
 	if (req->ki_eventfd != NULL)
 		eventfd_ctx_put(req->ki_eventfd);
-	if (req->ki_dtor)
-		req->ki_dtor(req);
-	if (req->ki_iovec != &req->ki_inline_vec)
-		kfree(req->ki_iovec);
 	kmem_cache_free(kiocb_cachep, req);
 }
 
-void aio_put_req(struct kiocb *req)
-{
-	if (atomic_dec_and_test(&req->ki_users))
-		kiocb_free(req);
-}
-EXPORT_SYMBOL(aio_put_req);
-
 static struct kioctx *lookup_ioctx(unsigned long ctx_id)
 {
+	struct aio_ring __user *ring  = (void __user *)ctx_id;
 	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx, *ret = NULL;
+	struct kioctx_table *table;
+	unsigned id;
+
+	if (get_user(id, &ring->id))
+		return NULL;
 
 	rcu_read_lock();
+	table = rcu_dereference(mm->ioctx_table);
 
-	hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
-		if (ctx->user_id == ctx_id) {
-			atomic_inc(&ctx->users);
-			ret = ctx;
-			break;
-		}
-	}
+	if (!table || id >= table->nr)
+		goto out;
 
+	ctx = table->table[id];
+	if (ctx && ctx->user_id == ctx_id) {
+		percpu_ref_get(&ctx->users);
+		ret = ctx;
+	}
+out:
 	rcu_read_unlock();
 	return ret;
 }
@@ -591,16 +850,16 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	 *  - the sync task helpfully left a reference to itself in the iocb
 	 */
 	if (is_sync_kiocb(iocb)) {
-		BUG_ON(atomic_read(&iocb->ki_users) != 1);
 		iocb->ki_user_data = res;
-		atomic_set(&iocb->ki_users, 0);
+		smp_wmb();
+		iocb->ki_ctx = ERR_PTR(-EXDEV);
 		wake_up_process(iocb->ki_obj.tsk);
 		return;
 	}
 
 	/*
 	 * Take rcu_read_lock() in case the kioctx is being destroyed, as we
-	 * need to issue a wakeup after decrementing reqs_active.
+	 * need to issue a wakeup after incrementing reqs_available.
 	 */
 	rcu_read_lock();
 
@@ -613,17 +872,6 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	}
 
 	/*
-	 * cancelled requests don't get events, userland was given one
-	 * when the event got cancelled.
-	 */
-	if (unlikely(xchg(&iocb->ki_cancel,
-			  KIOCB_CANCELLED) == KIOCB_CANCELLED)) {
-		atomic_dec(&ctx->reqs_active);
-		/* Still need the wake_up in case free_ioctx is waiting */
-		goto put_rq;
-	}
-
-	/*
 	 * Add a completion event to the ring buffer. Must be done holding
 	 * ctx->completion_lock to prevent other code from messing with the tail
 	 * pointer since we might be called from irq context.
@@ -675,9 +923,8 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 	if (iocb->ki_eventfd != NULL)
 		eventfd_signal(iocb->ki_eventfd, 1);
 
-put_rq:
 	/* everything turned out well, dispose of the aiocb. */
-	aio_put_req(iocb);
+	kiocb_free(iocb);
 
 	/*
 	 * We have to order our ring_info tail store above and test
@@ -702,7 +949,7 @@ static long aio_read_events_ring(struct kioctx *ctx,
 				 struct io_event __user *event, long nr)
 {
 	struct aio_ring *ring;
-	unsigned head, pos;
+	unsigned head, tail, pos;
 	long ret = 0;
 	int copy_ret;
 
@@ -710,11 +957,12 @@ static long aio_read_events_ring(struct kioctx *ctx,
 
 	ring = kmap_atomic(ctx->ring_pages[0]);
 	head = ring->head;
+	tail = ring->tail;
 	kunmap_atomic(ring);
 
-	pr_debug("h%u t%u m%u\n", head, ctx->tail, ctx->nr_events);
+	pr_debug("h%u t%u m%u\n", head, tail, ctx->nr_events);
 
-	if (head == ctx->tail)
+	if (head == tail)
 		goto out;
 
 	while (ret < nr) {
@@ -722,8 +970,8 @@ static long aio_read_events_ring(struct kioctx *ctx,
 		struct io_event *ev;
 		struct page *page;
 
-		avail = (head <= ctx->tail ? ctx->tail : ctx->nr_events) - head;
-		if (head == ctx->tail)
+		avail = (head <= tail ?  tail : ctx->nr_events) - head;
+		if (head == tail)
 			break;
 
 		avail = min(avail, nr - ret);
@@ -754,9 +1002,9 @@ static long aio_read_events_ring(struct kioctx *ctx,
 	kunmap_atomic(ring);
 	flush_dcache_page(ctx->ring_pages[0]);
 
-	pr_debug("%li  h%u t%u\n", ret, head, ctx->tail);
+	pr_debug("%li  h%u t%u\n", ret, head, tail);
 
-	atomic_sub(ret, &ctx->reqs_active);
+	put_reqs_available(ctx, ret);
 out:
 	mutex_unlock(&ctx->ring_lock);
 
@@ -854,8 +1102,8 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
 	if (!IS_ERR(ioctx)) {
 		ret = put_user(ioctx->user_id, ctxp);
 		if (ret)
-			kill_ioctx(ioctx);
-		put_ioctx(ioctx);
+			kill_ioctx(current->mm, ioctx);
+		percpu_ref_put(&ioctx->users);
 	}
 
 out:
@@ -872,101 +1120,37 @@ SYSCALL_DEFINE1(io_destroy, aio_context_t, ctx)
 {
 	struct kioctx *ioctx = lookup_ioctx(ctx);
 	if (likely(NULL != ioctx)) {
-		kill_ioctx(ioctx);
-		put_ioctx(ioctx);
+		kill_ioctx(current->mm, ioctx);
+		percpu_ref_put(&ioctx->users);
 		return 0;
 	}
 	pr_debug("EINVAL: io_destroy: invalid context id\n");
 	return -EINVAL;
 }
 
-static void aio_advance_iovec(struct kiocb *iocb, ssize_t ret)
-{
-	struct iovec *iov = &iocb->ki_iovec[iocb->ki_cur_seg];
-
-	BUG_ON(ret <= 0);
-
-	while (iocb->ki_cur_seg < iocb->ki_nr_segs && ret > 0) {
-		ssize_t this = min((ssize_t)iov->iov_len, ret);
-		iov->iov_base += this;
-		iov->iov_len -= this;
-		iocb->ki_left -= this;
-		ret -= this;
-		if (iov->iov_len == 0) {
-			iocb->ki_cur_seg++;
-			iov++;
-		}
-	}
-
-	/* the caller should not have done more io than what fit in
-	 * the remaining iovecs */
-	BUG_ON(ret > 0 && iocb->ki_left == 0);
-}
-
 typedef ssize_t (aio_rw_op)(struct kiocb *, const struct iovec *,
 			    unsigned long, loff_t);
 
-static ssize_t aio_rw_vect_retry(struct kiocb *iocb, int rw, aio_rw_op *rw_op)
-{
-	struct file *file = iocb->ki_filp;
-	struct address_space *mapping = file->f_mapping;
-	struct inode *inode = mapping->host;
-	ssize_t ret = 0;
-
-	/* This matches the pread()/pwrite() logic */
-	if (iocb->ki_pos < 0)
-		return -EINVAL;
-
-	if (rw == WRITE)
-		file_start_write(file);
-	do {
-		ret = rw_op(iocb, &iocb->ki_iovec[iocb->ki_cur_seg],
-			    iocb->ki_nr_segs - iocb->ki_cur_seg,
-			    iocb->ki_pos);
-		if (ret > 0)
-			aio_advance_iovec(iocb, ret);
-
-	/* retry all partial writes.  retry partial reads as long as its a
-	 * regular file. */
-	} while (ret > 0 && iocb->ki_left > 0 &&
-		 (rw == WRITE ||
-		  (!S_ISFIFO(inode->i_mode) && !S_ISSOCK(inode->i_mode))));
-	if (rw == WRITE)
-		file_end_write(file);
-
-	/* This means we must have transferred all that we could */
-	/* No need to retry anymore */
-	if ((ret == 0) || (iocb->ki_left == 0))
-		ret = iocb->ki_nbytes - iocb->ki_left;
-
-	/* If we managed to write some out we return that, rather than
-	 * the eventual error. */
-	if (rw == WRITE
-	    && ret < 0 && ret != -EIOCBQUEUED
-	    && iocb->ki_nbytes - iocb->ki_left)
-		ret = iocb->ki_nbytes - iocb->ki_left;
-
-	return ret;
-}
-
-static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat)
+static ssize_t aio_setup_vectored_rw(struct kiocb *kiocb,
+				     int rw, char __user *buf,
+				     unsigned long *nr_segs,
+				     struct iovec **iovec,
+				     bool compat)
 {
 	ssize_t ret;
 
-	kiocb->ki_nr_segs = kiocb->ki_nbytes;
+	*nr_segs = kiocb->ki_nbytes;
 
 #ifdef CONFIG_COMPAT
 	if (compat)
 		ret = compat_rw_copy_check_uvector(rw,
-				(struct compat_iovec __user *)kiocb->ki_buf,
-				kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
-				&kiocb->ki_iovec);
+				(struct compat_iovec __user *)buf,
+				*nr_segs, 1, *iovec, iovec);
 	else
 #endif
 		ret = rw_copy_check_uvector(rw,
-				(struct iovec __user *)kiocb->ki_buf,
-				kiocb->ki_nr_segs, 1, &kiocb->ki_inline_vec,
-				&kiocb->ki_iovec);
+				(struct iovec __user *)buf,
+				*nr_segs, 1, *iovec, iovec);
 	if (ret < 0)
 		return ret;
 
@@ -975,15 +1159,17 @@ static ssize_t aio_setup_vectored_rw(int rw, struct kiocb *kiocb, bool compat)
 	return 0;
 }
 
-static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb)
+static ssize_t aio_setup_single_vector(struct kiocb *kiocb,
+				       int rw, char __user *buf,
+				       unsigned long *nr_segs,
+				       struct iovec *iovec)
 {
-	if (unlikely(!access_ok(!rw, kiocb->ki_buf, kiocb->ki_nbytes)))
+	if (unlikely(!access_ok(!rw, buf, kiocb->ki_nbytes)))
 		return -EFAULT;
 
-	kiocb->ki_iovec = &kiocb->ki_inline_vec;
-	kiocb->ki_iovec->iov_base = kiocb->ki_buf;
-	kiocb->ki_iovec->iov_len = kiocb->ki_nbytes;
-	kiocb->ki_nr_segs = 1;
+	iovec->iov_base = buf;
+	iovec->iov_len = kiocb->ki_nbytes;
+	*nr_segs = 1;
 	return 0;
 }
 
@@ -992,15 +1178,18 @@ static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb)
  *	Performs the initial checks and aio retry method
  *	setup for the kiocb at the time of io submission.
  */
-static ssize_t aio_run_iocb(struct kiocb *req, bool compat)
+static ssize_t aio_run_iocb(struct kiocb *req, unsigned opcode,
+			    char __user *buf, bool compat)
 {
 	struct file *file = req->ki_filp;
 	ssize_t ret;
+	unsigned long nr_segs;
 	int rw;
 	fmode_t mode;
 	aio_rw_op *rw_op;
+	struct iovec inline_vec, *iovec = &inline_vec;
 
-	switch (req->ki_opcode) {
+	switch (opcode) {
 	case IOCB_CMD_PREAD:
 	case IOCB_CMD_PREADV:
 		mode	= FMODE_READ;
@@ -1021,21 +1210,38 @@ rw_common:
 		if (!rw_op)
 			return -EINVAL;
 
-		ret = (req->ki_opcode == IOCB_CMD_PREADV ||
-		       req->ki_opcode == IOCB_CMD_PWRITEV)
-			? aio_setup_vectored_rw(rw, req, compat)
-			: aio_setup_single_vector(rw, req);
+		ret = (opcode == IOCB_CMD_PREADV ||
+		       opcode == IOCB_CMD_PWRITEV)
+			? aio_setup_vectored_rw(req, rw, buf, &nr_segs,
+						&iovec, compat)
+			: aio_setup_single_vector(req, rw, buf, &nr_segs,
+						  iovec);
 		if (ret)
 			return ret;
 
 		ret = rw_verify_area(rw, file, &req->ki_pos, req->ki_nbytes);
-		if (ret < 0)
+		if (ret < 0) {
+			if (iovec != &inline_vec)
+				kfree(iovec);
 			return ret;
+		}
 
 		req->ki_nbytes = ret;
-		req->ki_left = ret;
 
-		ret = aio_rw_vect_retry(req, rw, rw_op);
+		/* XXX: move/kill - rw_verify_area()? */
+		/* This matches the pread()/pwrite() logic */
+		if (req->ki_pos < 0) {
+			ret = -EINVAL;
+			break;
+		}
+
+		if (rw == WRITE)
+			file_start_write(file);
+
+		ret = rw_op(req, iovec, nr_segs, req->ki_pos);
+
+		if (rw == WRITE)
+			file_end_write(file);
 		break;
 
 	case IOCB_CMD_FDSYNC:
@@ -1057,6 +1263,9 @@ rw_common:
 		return -EINVAL;
 	}
 
+	if (iovec != &inline_vec)
+		kfree(iovec);
+
 	if (ret != -EIOCBQUEUED) {
 		/*
 		 * There's no easy way to restart the syscall since other AIO's
@@ -1128,21 +1337,18 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 	req->ki_obj.user = user_iocb;
 	req->ki_user_data = iocb->aio_data;
 	req->ki_pos = iocb->aio_offset;
+	req->ki_nbytes = iocb->aio_nbytes;
 
-	req->ki_buf = (char __user *)(unsigned long)iocb->aio_buf;
-	req->ki_left = req->ki_nbytes = iocb->aio_nbytes;
-	req->ki_opcode = iocb->aio_lio_opcode;
-
-	ret = aio_run_iocb(req, compat);
+	ret = aio_run_iocb(req, iocb->aio_lio_opcode,
+			   (char __user *)(unsigned long)iocb->aio_buf,
+			   compat);
 	if (ret)
 		goto out_put_req;
 
-	aio_put_req(req);	/* drop extra ref to req */
 	return 0;
 out_put_req:
-	atomic_dec(&ctx->reqs_active);
-	aio_put_req(req);	/* drop extra ref to req */
-	aio_put_req(req);	/* drop i/o ref to req */
+	put_reqs_available(ctx, 1);
+	kiocb_free(req);
 	return ret;
 }
 
@@ -1195,7 +1401,7 @@ long do_io_submit(aio_context_t ctx_id, long nr,
 	}
 	blk_finish_plug(&plug);
 
-	put_ioctx(ctx);
+	percpu_ref_put(&ctx->users);
 	return i ? i : ret;
 }
 
@@ -1252,7 +1458,6 @@ static struct kiocb *lookup_kiocb(struct kioctx *ctx, struct iocb __user *iocb,
 SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 		struct io_event __user *, result)
 {
-	struct io_event res;
 	struct kioctx *ctx;
 	struct kiocb *kiocb;
 	u32 key;
@@ -1270,21 +1475,22 @@ SYSCALL_DEFINE3(io_cancel, aio_context_t, ctx_id, struct iocb __user *, iocb,
 
 	kiocb = lookup_kiocb(ctx, iocb, key);
 	if (kiocb)
-		ret = kiocb_cancel(ctx, kiocb, &res);
+		ret = kiocb_cancel(ctx, kiocb);
 	else
 		ret = -EINVAL;
 
 	spin_unlock_irq(&ctx->ctx_lock);
 
 	if (!ret) {
-		/* Cancellation succeeded -- copy the result
-		 * into the user's buffer.
+		/*
+		 * The result argument is no longer used - the io_event is
+		 * always delivered via the ring buffer. -EINPROGRESS indicates
+		 * cancellation is progress:
 		 */
-		if (copy_to_user(result, &res, sizeof(res)))
-			ret = -EFAULT;
+		ret = -EINPROGRESS;
 	}
 
-	put_ioctx(ctx);
+	percpu_ref_put(&ctx->users);
 
 	return ret;
 }
@@ -1313,7 +1519,7 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 	if (likely(ioctx)) {
 		if (likely(min_nr <= nr && min_nr >= 0))
 			ret = read_events(ioctx, min_nr, nr, events, timeout);
-		put_ioctx(ioctx);
+		percpu_ref_put(&ioctx->users);
 	}
 	return ret;
 }
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 47a65df8c871..85c961849953 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -109,6 +109,72 @@ static struct file_system_type anon_inode_fs_type = {
 };
 
 /**
+ * anon_inode_getfile_private - creates a new file instance by hooking it up to an
+ *                      anonymous inode, and a dentry that describe the "class"
+ *                      of the file
+ *
+ * @name:    [in]    name of the "class" of the new file
+ * @fops:    [in]    file operations for the new file
+ * @priv:    [in]    private data for the new file (will be file's private_data)
+ * @flags:   [in]    flags
+ *
+ *
+ * Similar to anon_inode_getfile, but each file holds a single inode.
+ *
+ */
+struct file *anon_inode_getfile_private(const char *name,
+					const struct file_operations *fops,
+					void *priv, int flags)
+{
+	struct qstr this;
+	struct path path;
+	struct file *file;
+	struct inode *inode;
+
+	if (fops->owner && !try_module_get(fops->owner))
+		return ERR_PTR(-ENOENT);
+
+	inode = anon_inode_mkinode(anon_inode_mnt->mnt_sb);
+	if (IS_ERR(inode)) {
+		file = ERR_PTR(-ENOMEM);
+		goto err_module;
+	}
+
+	/*
+	 * Link the inode to a directory entry by creating a unique name
+	 * using the inode sequence number.
+	 */
+	file = ERR_PTR(-ENOMEM);
+	this.name = name;
+	this.len = strlen(name);
+	this.hash = 0;
+	path.dentry = d_alloc_pseudo(anon_inode_mnt->mnt_sb, &this);
+	if (!path.dentry)
+		goto err_module;
+
+	path.mnt = mntget(anon_inode_mnt);
+
+	d_instantiate(path.dentry, inode);
+
+	file = alloc_file(&path, OPEN_FMODE(flags), fops);
+	if (IS_ERR(file))
+		goto err_dput;
+
+	file->f_mapping = inode->i_mapping;
+	file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
+	file->private_data = priv;
+
+	return file;
+
+err_dput:
+	path_put(&path);
+err_module:
+	module_put(fops->owner);
+	return file;
+}
+EXPORT_SYMBOL_GPL(anon_inode_getfile_private);
+
+/**
  * anon_inode_getfile - creates a new file instance by hooking it up to an
  *                      anonymous inode, and a dentry that describe the "class"
  *                      of the file
diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c
index 3db70dae40d3..689e40d983ad 100644
--- a/fs/autofs4/waitq.c
+++ b/fs/autofs4/waitq.c
@@ -109,13 +109,7 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi,
 
 	pkt.hdr.proto_version = sbi->version;
 	pkt.hdr.type = type;
-	mutex_lock(&sbi->wq_mutex);
 
-	/* Check if we have become catatonic */
-	if (sbi->catatonic) {
-		mutex_unlock(&sbi->wq_mutex);
-		return;
-	}
 	switch (type) {
 	/* Kernel protocol v4 missing and expire packets */
 	case autofs_ptype_missing:
@@ -427,7 +421,6 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 		wq->tgid = current->tgid;
 		wq->status = -EINTR; /* Status return if interrupted */
 		wq->wait_ctr = 2;
-		mutex_unlock(&sbi->wq_mutex);
 
 		if (sbi->version < 5) {
 			if (notify == NFY_MOUNT)
@@ -449,15 +442,15 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *dentry,
 			(unsigned long) wq->wait_queue_token, wq->name.len,
 			wq->name.name, notify);
 
-		/* autofs4_notify_daemon() may block */
+		/* autofs4_notify_daemon() may block; it will unlock ->wq_mutex */
 		autofs4_notify_daemon(sbi, wq, type);
 	} else {
 		wq->wait_ctr++;
-		mutex_unlock(&sbi->wq_mutex);
-		kfree(qstr.name);
 		DPRINTK("existing wait id = 0x%08lx, name = %.*s, nfy=%d",
 			(unsigned long) wq->wait_queue_token, wq->name.len,
 			wq->name.name, notify);
+		mutex_unlock(&sbi->wq_mutex);
+		kfree(qstr.name);
 	}
 
 	/*
diff --git a/fs/bfs/file.c b/fs/bfs/file.c
index ad3ea1497cc3..ae2892218335 100644
--- a/fs/bfs/file.c
+++ b/fs/bfs/file.c
@@ -166,7 +166,7 @@ static void bfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size)
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 }
 
 static int bfs_write_begin(struct file *file, struct address_space *mapping,
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 1173a4ee0830..1e86823a9cbd 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -592,7 +592,7 @@ static struct block_device *bd_acquire(struct inode *inode)
 	return bdev;
 }
 
-static inline int sb_is_blkdev_sb(struct super_block *sb)
+int sb_is_blkdev_sb(struct super_block *sb)
 {
 	return sb == blockdev_superblock;
 }
@@ -1542,7 +1542,7 @@ static ssize_t blkdev_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		return 0;
 
 	size -= pos;
-	if (size < iocb->ki_left)
+	if (size < iocb->ki_nbytes)
 		nr_segs = iov_shorten((struct iovec *)iov, nr_segs, size);
 	return generic_file_aio_read(iocb, iov, nr_segs, pos);
 }
diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig
index 2b3b83296977..398cbd517be2 100644
--- a/fs/btrfs/Kconfig
+++ b/fs/btrfs/Kconfig
@@ -72,3 +72,12 @@ config BTRFS_DEBUG
 	  performance, or export extra information via sysfs.
 
 	  If unsure, say N.
+
+config BTRFS_ASSERT
+	bool "Btrfs assert support"
+	depends on BTRFS_FS
+	help
+	  Enable run-time assertion checking.  This will result in panics if
+	  any of the assertions trip.  This is meant for btrfs developers only.
+
+	  If unsure, say N.
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 3932224f99e9..a91a6a355cc5 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,10 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
 	   export.o tree-log.o free-space-cache.o zlib.o lzo.o \
 	   compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
-	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o
+	   reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
+	   uuid-tree.o
 
 btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
 btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
+
+btrfs-$(CONFIG_BTRFS_FS_RUN_SANITY_TESTS) += tests/free-space-tests.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index 8bc5e8ccb091..0552a599b28f 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -119,6 +119,26 @@ struct __prelim_ref {
 	u64 wanted_disk_byte;
 };
 
+static struct kmem_cache *btrfs_prelim_ref_cache;
+
+int __init btrfs_prelim_ref_init(void)
+{
+	btrfs_prelim_ref_cache = kmem_cache_create("btrfs_prelim_ref",
+					sizeof(struct __prelim_ref),
+					0,
+					SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+					NULL);
+	if (!btrfs_prelim_ref_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void btrfs_prelim_ref_exit(void)
+{
+	if (btrfs_prelim_ref_cache)
+		kmem_cache_destroy(btrfs_prelim_ref_cache);
+}
+
 /*
  * the rules for all callers of this function are:
  * - obtaining the parent is the goal
@@ -160,12 +180,12 @@ struct __prelim_ref {
 
 static int __add_prelim_ref(struct list_head *head, u64 root_id,
 			    struct btrfs_key *key, int level,
-			    u64 parent, u64 wanted_disk_byte, int count)
+			    u64 parent, u64 wanted_disk_byte, int count,
+			    gfp_t gfp_mask)
 {
 	struct __prelim_ref *ref;
 
-	/* in case we're adding delayed refs, we're holding the refs spinlock */
-	ref = kmalloc(sizeof(*ref), GFP_ATOMIC);
+	ref = kmem_cache_alloc(btrfs_prelim_ref_cache, gfp_mask);
 	if (!ref)
 		return -ENOMEM;
 
@@ -295,10 +315,9 @@ static int __resolve_indirect_ref(struct btrfs_fs_info *fs_info,
 	ret = btrfs_search_old_slot(root, &ref->key_for_search, path, time_seq);
 	pr_debug("search slot in root %llu (level %d, ref count %d) returned "
 		 "%d for key (%llu %u %llu)\n",
-		 (unsigned long long)ref->root_id, level, ref->count, ret,
-		 (unsigned long long)ref->key_for_search.objectid,
-		 ref->key_for_search.type,
-		 (unsigned long long)ref->key_for_search.offset);
+		 ref->root_id, level, ref->count, ret,
+		 ref->key_for_search.objectid, ref->key_for_search.type,
+		 ref->key_for_search.offset);
 	if (ret < 0)
 		goto out;
 
@@ -365,11 +384,12 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info,
 		node = ulist_next(parents, &uiter);
 		ref->parent = node ? node->val : 0;
 		ref->inode_list = node ?
-			(struct extent_inode_elem *)(uintptr_t)node->aux : 0;
+			(struct extent_inode_elem *)(uintptr_t)node->aux : NULL;
 
 		/* additional parents require new refs being added here */
 		while ((node = ulist_next(parents, &uiter))) {
-			new_ref = kmalloc(sizeof(*new_ref), GFP_NOFS);
+			new_ref = kmem_cache_alloc(btrfs_prelim_ref_cache,
+						   GFP_NOFS);
 			if (!new_ref) {
 				ret = -ENOMEM;
 				goto out;
@@ -493,7 +513,7 @@ static void __merge_refs(struct list_head *head, int mode)
 			ref1->count += ref2->count;
 
 			list_del(&ref2->list);
-			kfree(ref2);
+			kmem_cache_free(btrfs_prelim_ref_cache, ref2);
 		}
 
 	}
@@ -548,7 +568,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			ref = btrfs_delayed_node_to_tree_ref(node);
 			ret = __add_prelim_ref(prefs, ref->root, &op_key,
 					       ref->level + 1, 0, node->bytenr,
-					       node->ref_mod * sgn);
+					       node->ref_mod * sgn, GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_BLOCK_REF_KEY: {
@@ -558,7 +578,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			ret = __add_prelim_ref(prefs, ref->root, NULL,
 					       ref->level + 1, ref->parent,
 					       node->bytenr,
-					       node->ref_mod * sgn);
+					       node->ref_mod * sgn, GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_EXTENT_DATA_REF_KEY: {
@@ -570,7 +590,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			key.offset = ref->offset;
 			ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0,
 					       node->bytenr,
-					       node->ref_mod * sgn);
+					       node->ref_mod * sgn, GFP_ATOMIC);
 			break;
 		}
 		case BTRFS_SHARED_DATA_REF_KEY: {
@@ -583,7 +603,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq,
 			key.offset = ref->offset;
 			ret = __add_prelim_ref(prefs, ref->root, &key, 0,
 					       ref->parent, node->bytenr,
-					       node->ref_mod * sgn);
+					       node->ref_mod * sgn, GFP_ATOMIC);
 			break;
 		}
 		default:
@@ -657,7 +677,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 		case BTRFS_SHARED_BLOCK_REF_KEY:
 			ret = __add_prelim_ref(prefs, 0, NULL,
 						*info_level + 1, offset,
-						bytenr, 1);
+						bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			struct btrfs_shared_data_ref *sdref;
@@ -666,13 +686,13 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			sdref = (struct btrfs_shared_data_ref *)(iref + 1);
 			count = btrfs_shared_data_ref_count(leaf, sdref);
 			ret = __add_prelim_ref(prefs, 0, NULL, 0, offset,
-					       bytenr, count);
+					       bytenr, count, GFP_NOFS);
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
 			ret = __add_prelim_ref(prefs, offset, NULL,
 					       *info_level + 1, 0,
-					       bytenr, 1);
+					       bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			struct btrfs_extent_data_ref *dref;
@@ -687,7 +707,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info,
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
-					       bytenr, count);
+					       bytenr, count, GFP_NOFS);
 			break;
 		}
 		default:
@@ -738,7 +758,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 		case BTRFS_SHARED_BLOCK_REF_KEY:
 			ret = __add_prelim_ref(prefs, 0, NULL,
 						info_level + 1, key.offset,
-						bytenr, 1);
+						bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_SHARED_DATA_REF_KEY: {
 			struct btrfs_shared_data_ref *sdref;
@@ -748,13 +768,13 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 					      struct btrfs_shared_data_ref);
 			count = btrfs_shared_data_ref_count(leaf, sdref);
 			ret = __add_prelim_ref(prefs, 0, NULL, 0, key.offset,
-						bytenr, count);
+						bytenr, count, GFP_NOFS);
 			break;
 		}
 		case BTRFS_TREE_BLOCK_REF_KEY:
 			ret = __add_prelim_ref(prefs, key.offset, NULL,
 					       info_level + 1, 0,
-					       bytenr, 1);
+					       bytenr, 1, GFP_NOFS);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY: {
 			struct btrfs_extent_data_ref *dref;
@@ -770,7 +790,7 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info,
 			key.offset = btrfs_extent_data_ref_offset(leaf, dref);
 			root = btrfs_extent_data_ref_root(leaf, dref);
 			ret = __add_prelim_ref(prefs, root, &key, 0, 0,
-					       bytenr, count);
+					       bytenr, count, GFP_NOFS);
 			break;
 		}
 		default:
@@ -911,7 +931,6 @@ again:
 
 	while (!list_empty(&prefs)) {
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
-		list_del(&ref->list);
 		WARN_ON(ref->count < 0);
 		if (ref->count && ref->root_id && ref->parent == 0) {
 			/* no parent == root of tree */
@@ -935,8 +954,10 @@ again:
 				}
 				ret = find_extent_in_eb(eb, bytenr,
 							*extent_item_pos, &eie);
-				ref->inode_list = eie;
 				free_extent_buffer(eb);
+				if (ret < 0)
+					goto out;
+				ref->inode_list = eie;
 			}
 			ret = ulist_add_merge(refs, ref->parent,
 					      (uintptr_t)ref->inode_list,
@@ -954,7 +975,8 @@ again:
 				eie->next = ref->inode_list;
 			}
 		}
-		kfree(ref);
+		list_del(&ref->list);
+		kmem_cache_free(btrfs_prelim_ref_cache, ref);
 	}
 
 out:
@@ -962,13 +984,13 @@ out:
 	while (!list_empty(&prefs)) {
 		ref = list_first_entry(&prefs, struct __prelim_ref, list);
 		list_del(&ref->list);
-		kfree(ref);
+		kmem_cache_free(btrfs_prelim_ref_cache, ref);
 	}
 	while (!list_empty(&prefs_delayed)) {
 		ref = list_first_entry(&prefs_delayed, struct __prelim_ref,
 				       list);
 		list_del(&ref->list);
-		kfree(ref);
+		kmem_cache_free(btrfs_prelim_ref_cache, ref);
 	}
 
 	return ret;
@@ -1326,8 +1348,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 	     found_key->type != BTRFS_METADATA_ITEM_KEY) ||
 	    found_key->objectid > logical ||
 	    found_key->objectid + size <= logical) {
-		pr_debug("logical %llu is not within any extent\n",
-			 (unsigned long long)logical);
+		pr_debug("logical %llu is not within any extent\n", logical);
 		return -ENOENT;
 	}
 
@@ -1340,11 +1361,8 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
 
 	pr_debug("logical %llu is at position %llu within the extent (%llu "
 		 "EXTENT_ITEM %llu) flags %#llx size %u\n",
-		 (unsigned long long)logical,
-		 (unsigned long long)(logical - found_key->objectid),
-		 (unsigned long long)found_key->objectid,
-		 (unsigned long long)found_key->offset,
-		 (unsigned long long)flags, item_size);
+		 logical, logical - found_key->objectid, found_key->objectid,
+		 found_key->offset, flags, item_size);
 
 	WARN_ON(!flags_ret);
 	if (flags_ret) {
@@ -1516,7 +1534,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
 		while (!ret && (root_node = ulist_next(roots, &root_uiter))) {
 			pr_debug("root %llu references leaf %llu, data list "
 				 "%#llx\n", root_node->val, ref_node->val,
-				 (long long)ref_node->aux);
+				 ref_node->aux);
 			ret = iterate_leaf_refs((struct extent_inode_elem *)
 						(uintptr_t)ref_node->aux,
 						root_node->val,
@@ -1608,9 +1626,8 @@ static int iterate_inode_refs(u64 inum, struct btrfs_root *fs_root,
 			name_len = btrfs_inode_ref_name_len(eb, iref);
 			/* path must be released before calling iterate()! */
 			pr_debug("following ref at offset %u for inode %llu in "
-				 "tree %llu\n", cur,
-				 (unsigned long long)found_key.objectid,
-				 (unsigned long long)fs_root->objectid);
+				 "tree %llu\n", cur, found_key.objectid,
+				 fs_root->objectid);
 			ret = iterate(parent, name_len,
 				      (unsigned long)(iref + 1), eb, ctx);
 			if (ret)
diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h
index 8f2e76702932..a910b27a8ad9 100644
--- a/fs/btrfs/backref.h
+++ b/fs/btrfs/backref.h
@@ -72,4 +72,6 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
 			  struct btrfs_inode_extref **ret_extref,
 			  u64 *found_off);
 
+int __init btrfs_prelim_ref_init(void);
+void btrfs_prelim_ref_exit(void);
 #endif
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 08b286b2a2c5..d0ae226926ee 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -218,6 +218,27 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	return 0;
 }
 
+struct btrfs_dio_private {
+	struct inode *inode;
+	u64 logical_offset;
+	u64 disk_bytenr;
+	u64 bytes;
+	void *private;
+
+	/* number of bios pending for this dio */
+	atomic_t pending_bios;
+
+	/* IO errors */
+	int errors;
+
+	/* orig_bio is our btrfs_io_bio */
+	struct bio *orig_bio;
+
+	/* dio_bio came from fs/direct-io.c */
+	struct bio *dio_bio;
+	u8 csum[0];
+};
+
 /*
  * Disable DIO read nolock optimization, so new dio readers will be forced
  * to grab i_mutex. It is used to avoid the endless truncate due to
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 1431a6965017..1c47be187240 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -701,15 +701,13 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 			next_bytenr = btrfs_super_root(selected_super);
 			if (state->print_mask &
 			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
-				printk(KERN_INFO "root@%llu\n",
-				       (unsigned long long)next_bytenr);
+				printk(KERN_INFO "root@%llu\n", next_bytenr);
 			break;
 		case 1:
 			next_bytenr = btrfs_super_chunk_root(selected_super);
 			if (state->print_mask &
 			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
-				printk(KERN_INFO "chunk@%llu\n",
-				       (unsigned long long)next_bytenr);
+				printk(KERN_INFO "chunk@%llu\n", next_bytenr);
 			break;
 		case 2:
 			next_bytenr = btrfs_super_log_root(selected_super);
@@ -717,8 +715,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 				continue;
 			if (state->print_mask &
 			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
-				printk(KERN_INFO "log@%llu\n",
-				       (unsigned long long)next_bytenr);
+				printk(KERN_INFO "log@%llu\n", next_bytenr);
 			break;
 		}
 
@@ -727,7 +724,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 				     next_bytenr, state->metablock_size);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
-			       (unsigned long long)next_bytenr, num_copies);
+			       next_bytenr, num_copies);
 
 		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
 			struct btrfsic_block *next_block;
@@ -742,8 +739,7 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 				printk(KERN_INFO "btrfsic:"
 				       " btrfsic_map_block(root @%llu,"
 				       " mirror %d) failed!\n",
-				       (unsigned long long)next_bytenr,
-				       mirror_num);
+				       next_bytenr, mirror_num);
 				kfree(selected_super);
 				return -1;
 			}
@@ -767,7 +763,6 @@ static int btrfsic_process_superblock(struct btrfsic_state *state,
 			if (ret < (int)PAGE_CACHE_SIZE) {
 				printk(KERN_INFO
 				       "btrfsic: read @logical %llu failed!\n",
-				       (unsigned long long)
 				       tmp_next_block_ctx.start);
 				btrfsic_release_block_ctx(&tmp_next_block_ctx);
 				kfree(selected_super);
@@ -813,7 +808,7 @@ static int btrfsic_process_superblock_dev_mirror(
 	    (bh->b_data + (dev_bytenr & 4095));
 
 	if (btrfs_super_bytenr(super_tmp) != dev_bytenr ||
-	    super_tmp->magic != cpu_to_le64(BTRFS_MAGIC) ||
+	    btrfs_super_magic(super_tmp) != BTRFS_MAGIC ||
 	    memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) ||
 	    btrfs_super_nodesize(super_tmp) != state->metablock_size ||
 	    btrfs_super_leafsize(super_tmp) != state->metablock_size ||
@@ -847,10 +842,8 @@ static int btrfsic_process_superblock_dev_mirror(
 			printk_in_rcu(KERN_INFO "New initial S-block (bdev %p, %s)"
 				     " @%llu (%s/%llu/%d)\n",
 				     superblock_bdev,
-				     rcu_str_deref(device->name),
-				     (unsigned long long)dev_bytenr,
-				     dev_state->name,
-				     (unsigned long long)dev_bytenr,
+				     rcu_str_deref(device->name), dev_bytenr,
+				     dev_state->name, dev_bytenr,
 				     superblock_mirror_num);
 		list_add(&superblock_tmp->all_blocks_node,
 			 &state->all_blocks_list);
@@ -880,20 +873,20 @@ static int btrfsic_process_superblock_dev_mirror(
 		tmp_disk_key.offset = 0;
 		switch (pass) {
 		case 0:
-			tmp_disk_key.objectid =
-			    cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
+			btrfs_set_disk_key_objectid(&tmp_disk_key,
+						    BTRFS_ROOT_TREE_OBJECTID);
 			additional_string = "initial root ";
 			next_bytenr = btrfs_super_root(super_tmp);
 			break;
 		case 1:
-			tmp_disk_key.objectid =
-			    cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
+			btrfs_set_disk_key_objectid(&tmp_disk_key,
+						    BTRFS_CHUNK_TREE_OBJECTID);
 			additional_string = "initial chunk ";
 			next_bytenr = btrfs_super_chunk_root(super_tmp);
 			break;
 		case 2:
-			tmp_disk_key.objectid =
-			    cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
+			btrfs_set_disk_key_objectid(&tmp_disk_key,
+						    BTRFS_TREE_LOG_OBJECTID);
 			additional_string = "initial log ";
 			next_bytenr = btrfs_super_log_root(super_tmp);
 			if (0 == next_bytenr)
@@ -906,7 +899,7 @@ static int btrfsic_process_superblock_dev_mirror(
 				     next_bytenr, state->metablock_size);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
-			       (unsigned long long)next_bytenr, num_copies);
+			       next_bytenr, num_copies);
 		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
 			struct btrfsic_block *next_block;
 			struct btrfsic_block_data_ctx tmp_next_block_ctx;
@@ -918,8 +911,7 @@ static int btrfsic_process_superblock_dev_mirror(
 					      mirror_num)) {
 				printk(KERN_INFO "btrfsic: btrfsic_map_block("
 				       "bytenr @%llu, mirror %d) failed!\n",
-				       (unsigned long long)next_bytenr,
-				       mirror_num);
+				       next_bytenr, mirror_num);
 				brelse(bh);
 				return -1;
 			}
@@ -1003,19 +995,17 @@ continue_with_new_stack_frame:
 		    (struct btrfs_leaf *)sf->hdr;
 
 		if (-1 == sf->i) {
-			sf->nr = le32_to_cpu(leafhdr->header.nritems);
+			sf->nr = btrfs_stack_header_nritems(&leafhdr->header);
 
 			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
 				printk(KERN_INFO
 				       "leaf %llu items %d generation %llu"
 				       " owner %llu\n",
-				       (unsigned long long)
-				       sf->block_ctx->start,
-				       sf->nr,
-				       (unsigned long long)
-				       le64_to_cpu(leafhdr->header.generation),
-				       (unsigned long long)
-				       le64_to_cpu(leafhdr->header.owner));
+				       sf->block_ctx->start, sf->nr,
+				       btrfs_stack_header_generation(
+					       &leafhdr->header),
+				       btrfs_stack_header_owner(
+					       &leafhdr->header));
 		}
 
 continue_with_current_leaf_stack_frame:
@@ -1047,10 +1037,10 @@ leaf_item_out_of_bounce_error:
 						     &disk_item,
 						     disk_item_offset,
 						     sizeof(struct btrfs_item));
-			item_offset = le32_to_cpu(disk_item.offset);
-			item_size = le32_to_cpu(disk_item.size);
+			item_offset = btrfs_stack_item_offset(&disk_item);
+			item_size = btrfs_stack_item_offset(&disk_item);
 			disk_key = &disk_item.key;
-			type = disk_key->type;
+			type = btrfs_disk_key_type(disk_key);
 
 			if (BTRFS_ROOT_ITEM_KEY == type) {
 				struct btrfs_root_item root_item;
@@ -1066,7 +1056,7 @@ leaf_item_out_of_bounce_error:
 					sf->block_ctx, &root_item,
 					root_item_offset,
 					item_size);
-				next_bytenr = le64_to_cpu(root_item.bytenr);
+				next_bytenr = btrfs_root_bytenr(&root_item);
 
 				sf->error =
 				    btrfsic_create_link_to_next_block(
@@ -1081,8 +1071,8 @@ leaf_item_out_of_bounce_error:
 						&sf->num_copies,
 						&sf->mirror_num,
 						disk_key,
-						le64_to_cpu(root_item.
-						generation));
+						btrfs_root_generation(
+						&root_item));
 				if (sf->error)
 					goto one_stack_frame_backwards;
 
@@ -1130,18 +1120,17 @@ leaf_item_out_of_bounce_error:
 		struct btrfs_node *const nodehdr = (struct btrfs_node *)sf->hdr;
 
 		if (-1 == sf->i) {
-			sf->nr = le32_to_cpu(nodehdr->header.nritems);
+			sf->nr = btrfs_stack_header_nritems(&nodehdr->header);
 
 			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
 				printk(KERN_INFO "node %llu level %d items %d"
 				       " generation %llu owner %llu\n",
-				       (unsigned long long)
 				       sf->block_ctx->start,
 				       nodehdr->header.level, sf->nr,
-				       (unsigned long long)
-				       le64_to_cpu(nodehdr->header.generation),
-				       (unsigned long long)
-				       le64_to_cpu(nodehdr->header.owner));
+				       btrfs_stack_header_generation(
+				       &nodehdr->header),
+				       btrfs_stack_header_owner(
+				       &nodehdr->header));
 		}
 
 continue_with_current_node_stack_frame:
@@ -1168,7 +1157,7 @@ continue_with_current_node_stack_frame:
 			btrfsic_read_from_block_data(
 				sf->block_ctx, &key_ptr, key_ptr_offset,
 				sizeof(struct btrfs_key_ptr));
-			next_bytenr = le64_to_cpu(key_ptr.blockptr);
+			next_bytenr = btrfs_stack_key_blockptr(&key_ptr);
 
 			sf->error = btrfsic_create_link_to_next_block(
 					state,
@@ -1182,7 +1171,7 @@ continue_with_current_node_stack_frame:
 					&sf->num_copies,
 					&sf->mirror_num,
 					&key_ptr.key,
-					le64_to_cpu(key_ptr.generation));
+					btrfs_stack_key_generation(&key_ptr));
 			if (sf->error)
 				goto one_stack_frame_backwards;
 
@@ -1247,8 +1236,7 @@ static void btrfsic_read_from_block_data(
 	unsigned long i = (start_offset + offset) >> PAGE_CACHE_SHIFT;
 
 	WARN_ON(offset + len > block_ctx->len);
-	offset_in_page = (start_offset + offset) &
-			 ((unsigned long)PAGE_CACHE_SIZE - 1);
+	offset_in_page = (start_offset + offset) & (PAGE_CACHE_SIZE - 1);
 
 	while (len > 0) {
 		cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page));
@@ -1290,7 +1278,7 @@ static int btrfsic_create_link_to_next_block(
 				     next_bytenr, state->metablock_size);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
-			       (unsigned long long)next_bytenr, *num_copiesp);
+			       next_bytenr, *num_copiesp);
 		*mirror_nump = 1;
 	}
 
@@ -1307,7 +1295,7 @@ static int btrfsic_create_link_to_next_block(
 	if (ret) {
 		printk(KERN_INFO
 		       "btrfsic: btrfsic_map_block(@%llu, mirror=%d) failed!\n",
-		       (unsigned long long)next_bytenr, *mirror_nump);
+		       next_bytenr, *mirror_nump);
 		btrfsic_release_block_ctx(next_block_ctx);
 		*next_blockp = NULL;
 		return -1;
@@ -1335,20 +1323,16 @@ static int btrfsic_create_link_to_next_block(
 			       "Referenced block @%llu (%s/%llu/%d)"
 			       " found in hash table, %c,"
 			       " bytenr mismatch (!= stored %llu).\n",
-			       (unsigned long long)next_bytenr,
-			       next_block_ctx->dev->name,
-			       (unsigned long long)next_block_ctx->dev_bytenr,
-			       *mirror_nump,
+			       next_bytenr, next_block_ctx->dev->name,
+			       next_block_ctx->dev_bytenr, *mirror_nump,
 			       btrfsic_get_block_type(state, next_block),
-			       (unsigned long long)next_block->logical_bytenr);
+			       next_block->logical_bytenr);
 		} else if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
 			printk(KERN_INFO
 			       "Referenced block @%llu (%s/%llu/%d)"
 			       " found in hash table, %c.\n",
-			       (unsigned long long)next_bytenr,
-			       next_block_ctx->dev->name,
-			       (unsigned long long)next_block_ctx->dev_bytenr,
-			       *mirror_nump,
+			       next_bytenr, next_block_ctx->dev->name,
+			       next_block_ctx->dev_bytenr, *mirror_nump,
 			       btrfsic_get_block_type(state, next_block));
 		next_block->logical_bytenr = next_bytenr;
 
@@ -1400,7 +1384,7 @@ static int btrfsic_create_link_to_next_block(
 		if (ret < (int)next_block_ctx->len) {
 			printk(KERN_INFO
 			       "btrfsic: read block @logical %llu failed!\n",
-			       (unsigned long long)next_bytenr);
+			       next_bytenr);
 			btrfsic_release_block_ctx(next_block_ctx);
 			*next_blockp = NULL;
 			return -1;
@@ -1444,12 +1428,12 @@ static int btrfsic_handle_extent_data(
 		file_extent_item_offset,
 		offsetof(struct btrfs_file_extent_item, disk_num_bytes));
 	if (BTRFS_FILE_EXTENT_REG != file_extent_item.type ||
-	    ((u64)0) == le64_to_cpu(file_extent_item.disk_bytenr)) {
+	    btrfs_stack_file_extent_disk_bytenr(&file_extent_item) == 0) {
 		if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
 			printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu\n",
 			       file_extent_item.type,
-			       (unsigned long long)
-			       le64_to_cpu(file_extent_item.disk_bytenr));
+			       btrfs_stack_file_extent_disk_bytenr(
+			       &file_extent_item));
 		return 0;
 	}
 
@@ -1463,20 +1447,19 @@ static int btrfsic_handle_extent_data(
 	btrfsic_read_from_block_data(block_ctx, &file_extent_item,
 				     file_extent_item_offset,
 				     sizeof(struct btrfs_file_extent_item));
-	next_bytenr = le64_to_cpu(file_extent_item.disk_bytenr) +
-		      le64_to_cpu(file_extent_item.offset);
-	generation = le64_to_cpu(file_extent_item.generation);
-	num_bytes = le64_to_cpu(file_extent_item.num_bytes);
-	generation = le64_to_cpu(file_extent_item.generation);
+	next_bytenr = btrfs_stack_file_extent_disk_bytenr(&file_extent_item) +
+		      btrfs_stack_file_extent_offset(&file_extent_item);
+	generation = btrfs_stack_file_extent_generation(&file_extent_item);
+	num_bytes = btrfs_stack_file_extent_num_bytes(&file_extent_item);
+	generation = btrfs_stack_file_extent_generation(&file_extent_item);
 
 	if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
 		printk(KERN_INFO "extent_data: type %u, disk_bytenr = %llu,"
 		       " offset = %llu, num_bytes = %llu\n",
 		       file_extent_item.type,
-		       (unsigned long long)
-		       le64_to_cpu(file_extent_item.disk_bytenr),
-		       (unsigned long long)le64_to_cpu(file_extent_item.offset),
-		       (unsigned long long)num_bytes);
+		       btrfs_stack_file_extent_disk_bytenr(&file_extent_item),
+		       btrfs_stack_file_extent_offset(&file_extent_item),
+		       num_bytes);
 	while (num_bytes > 0) {
 		u32 chunk_len;
 		int num_copies;
@@ -1492,7 +1475,7 @@ static int btrfsic_handle_extent_data(
 				     next_bytenr, state->datablock_size);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
-			       (unsigned long long)next_bytenr, num_copies);
+			       next_bytenr, num_copies);
 		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
 			struct btrfsic_block_data_ctx next_block_ctx;
 			struct btrfsic_block *next_block;
@@ -1504,8 +1487,7 @@ static int btrfsic_handle_extent_data(
 			if (state->print_mask & BTRFSIC_PRINT_MASK_VERY_VERBOSE)
 				printk(KERN_INFO
 				       "\tdisk_bytenr = %llu, num_bytes %u\n",
-				       (unsigned long long)next_bytenr,
-				       chunk_len);
+				       next_bytenr, chunk_len);
 			ret = btrfsic_map_block(state, next_bytenr,
 						chunk_len, &next_block_ctx,
 						mirror_num);
@@ -1513,8 +1495,7 @@ static int btrfsic_handle_extent_data(
 				printk(KERN_INFO
 				       "btrfsic: btrfsic_map_block(@%llu,"
 				       " mirror=%d) failed!\n",
-				       (unsigned long long)next_bytenr,
-				       mirror_num);
+				       next_bytenr, mirror_num);
 				return -1;
 			}
 
@@ -1543,12 +1524,10 @@ static int btrfsic_handle_extent_data(
 					       " found in hash table, D,"
 					       " bytenr mismatch"
 					       " (!= stored %llu).\n",
-					       (unsigned long long)next_bytenr,
+					       next_bytenr,
 					       next_block_ctx.dev->name,
-					       (unsigned long long)
 					       next_block_ctx.dev_bytenr,
 					       mirror_num,
-					       (unsigned long long)
 					       next_block->logical_bytenr);
 				}
 				next_block->logical_bytenr = next_bytenr;
@@ -1675,7 +1654,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
 	if (block_ctx->dev_bytenr & ((u64)PAGE_CACHE_SIZE - 1)) {
 		printk(KERN_INFO
 		       "btrfsic: read_block() with unaligned bytenr %llu\n",
-		       (unsigned long long)block_ctx->dev_bytenr);
+		       block_ctx->dev_bytenr);
 		return -1;
 	}
 
@@ -1772,10 +1751,8 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
 
 		printk(KERN_INFO "%c-block @%llu (%s/%llu/%d)\n",
 		       btrfsic_get_block_type(state, b_all),
-		       (unsigned long long)b_all->logical_bytenr,
-		       b_all->dev_state->name,
-		       (unsigned long long)b_all->dev_bytenr,
-		       b_all->mirror_num);
+		       b_all->logical_bytenr, b_all->dev_state->name,
+		       b_all->dev_bytenr, b_all->mirror_num);
 
 		list_for_each(elem_ref_to, &b_all->ref_to_list) {
 			const struct btrfsic_block_link *const l =
@@ -1787,16 +1764,13 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
 			       " refers %u* to"
 			       " %c @%llu (%s/%llu/%d)\n",
 			       btrfsic_get_block_type(state, b_all),
-			       (unsigned long long)b_all->logical_bytenr,
-			       b_all->dev_state->name,
-			       (unsigned long long)b_all->dev_bytenr,
-			       b_all->mirror_num,
+			       b_all->logical_bytenr, b_all->dev_state->name,
+			       b_all->dev_bytenr, b_all->mirror_num,
 			       l->ref_cnt,
 			       btrfsic_get_block_type(state, l->block_ref_to),
-			       (unsigned long long)
 			       l->block_ref_to->logical_bytenr,
 			       l->block_ref_to->dev_state->name,
-			       (unsigned long long)l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->dev_bytenr,
 			       l->block_ref_to->mirror_num);
 		}
 
@@ -1810,16 +1784,12 @@ static void btrfsic_dump_database(struct btrfsic_state *state)
 			       " is ref %u* from"
 			       " %c @%llu (%s/%llu/%d)\n",
 			       btrfsic_get_block_type(state, b_all),
-			       (unsigned long long)b_all->logical_bytenr,
-			       b_all->dev_state->name,
-			       (unsigned long long)b_all->dev_bytenr,
-			       b_all->mirror_num,
+			       b_all->logical_bytenr, b_all->dev_state->name,
+			       b_all->dev_bytenr, b_all->mirror_num,
 			       l->ref_cnt,
 			       btrfsic_get_block_type(state, l->block_ref_from),
-			       (unsigned long long)
 			       l->block_ref_from->logical_bytenr,
 			       l->block_ref_from->dev_state->name,
-			       (unsigned long long)
 			       l->block_ref_from->dev_bytenr,
 			       l->block_ref_from->mirror_num);
 		}
@@ -1896,8 +1866,8 @@ again:
 		struct list_head *tmp_ref_to;
 
 		if (block->is_superblock) {
-			bytenr = le64_to_cpu(((struct btrfs_super_block *)
-					      mapped_datav[0])->bytenr);
+			bytenr = btrfs_super_bytenr((struct btrfs_super_block *)
+						    mapped_datav[0]);
 			if (num_pages * PAGE_CACHE_SIZE <
 			    BTRFS_SUPER_INFO_SIZE) {
 				printk(KERN_INFO
@@ -1923,8 +1893,9 @@ again:
 					return;
 				}
 				processed_len = state->metablock_size;
-				bytenr = le64_to_cpu(((struct btrfs_header *)
-						      mapped_datav[0])->bytenr);
+				bytenr = btrfs_stack_header_bytenr(
+						(struct btrfs_header *)
+						mapped_datav[0]);
 				btrfsic_cmp_log_and_dev_bytenr(state, bytenr,
 							       dev_state,
 							       dev_bytenr);
@@ -1935,12 +1906,9 @@ again:
 				       " found in hash table, %c,"
 				       " bytenr mismatch"
 				       " (!= stored %llu).\n",
-				       (unsigned long long)bytenr,
-				       dev_state->name,
-				       (unsigned long long)dev_bytenr,
+				       bytenr, dev_state->name, dev_bytenr,
 				       block->mirror_num,
 				       btrfsic_get_block_type(state, block),
-				       (unsigned long long)
 				       block->logical_bytenr);
 				block->logical_bytenr = bytenr;
 			} else if (state->print_mask &
@@ -1948,9 +1916,7 @@ again:
 				printk(KERN_INFO
 				       "Written block @%llu (%s/%llu/%d)"
 				       " found in hash table, %c.\n",
-				       (unsigned long long)bytenr,
-				       dev_state->name,
-				       (unsigned long long)dev_bytenr,
+				       bytenr, dev_state->name, dev_bytenr,
 				       block->mirror_num,
 				       btrfsic_get_block_type(state, block));
 		} else {
@@ -1966,9 +1932,7 @@ again:
 				printk(KERN_INFO
 				       "Written block @%llu (%s/%llu/%d)"
 				       " found in hash table, %c.\n",
-				       (unsigned long long)bytenr,
-				       dev_state->name,
-				       (unsigned long long)dev_bytenr,
+				       bytenr, dev_state->name, dev_bytenr,
 				       block->mirror_num,
 				       btrfsic_get_block_type(state, block));
 		}
@@ -1985,21 +1949,14 @@ again:
 			       " new(gen=%llu),"
 			       " which is referenced by most recent superblock"
 			       " (superblockgen=%llu)!\n",
-			       btrfsic_get_block_type(state, block),
-			       (unsigned long long)bytenr,
-			       dev_state->name,
-			       (unsigned long long)dev_bytenr,
-			       block->mirror_num,
-			       (unsigned long long)block->generation,
-			       (unsigned long long)
-			       le64_to_cpu(block->disk_key.objectid),
+			       btrfsic_get_block_type(state, block), bytenr,
+			       dev_state->name, dev_bytenr, block->mirror_num,
+			       block->generation,
+			       btrfs_disk_key_objectid(&block->disk_key),
 			       block->disk_key.type,
-			       (unsigned long long)
-			       le64_to_cpu(block->disk_key.offset),
-			       (unsigned long long)
-			       le64_to_cpu(((struct btrfs_header *)
-					    mapped_datav[0])->generation),
-			       (unsigned long long)
+			       btrfs_disk_key_offset(&block->disk_key),
+			       btrfs_stack_header_generation(
+				       (struct btrfs_header *) mapped_datav[0]),
 			       state->max_superblock_generation);
 			btrfsic_dump_tree(state);
 		}
@@ -2008,15 +1965,12 @@ again:
 			printk(KERN_INFO "btrfs: attempt to overwrite %c-block"
 			       " @%llu (%s/%llu/%d), oldgen=%llu, newgen=%llu,"
 			       " which is not yet iodone!\n",
-			       btrfsic_get_block_type(state, block),
-			       (unsigned long long)bytenr,
-			       dev_state->name,
-			       (unsigned long long)dev_bytenr,
-			       block->mirror_num,
-			       (unsigned long long)block->generation,
-			       (unsigned long long)
-			       le64_to_cpu(((struct btrfs_header *)
-					    mapped_datav[0])->generation));
+			       btrfsic_get_block_type(state, block), bytenr,
+			       dev_state->name, dev_bytenr, block->mirror_num,
+			       block->generation,
+			       btrfs_stack_header_generation(
+				       (struct btrfs_header *)
+				       mapped_datav[0]));
 			/* it would not be safe to go on */
 			btrfsic_dump_tree(state);
 			goto continue_loop;
@@ -2056,7 +2010,7 @@ again:
 		if (ret) {
 			printk(KERN_INFO
 			       "btrfsic: btrfsic_map_block(root @%llu)"
-			       " failed!\n", (unsigned long long)bytenr);
+			       " failed!\n", bytenr);
 			goto continue_loop;
 		}
 		block_ctx.datav = mapped_datav;
@@ -2140,7 +2094,7 @@ again:
 				printk(KERN_INFO
 				       "btrfsic: btrfsic_process_metablock"
 				       "(root @%llu) failed!\n",
-				       (unsigned long long)dev_bytenr);
+				       dev_bytenr);
 		} else {
 			block->is_metadata = 0;
 			block->mirror_num = 0;	/* unknown */
@@ -2168,8 +2122,7 @@ again:
 			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
 				printk(KERN_INFO "Written block (%s/%llu/?)"
 				       " !found in hash table, D.\n",
-				       dev_state->name,
-				       (unsigned long long)dev_bytenr);
+				       dev_state->name, dev_bytenr);
 			if (!state->include_extent_data) {
 				/* ignore that written D block */
 				goto continue_loop;
@@ -2184,17 +2137,16 @@ again:
 			block_ctx.pagev = NULL;
 		} else {
 			processed_len = state->metablock_size;
-			bytenr = le64_to_cpu(((struct btrfs_header *)
-					      mapped_datav[0])->bytenr);
+			bytenr = btrfs_stack_header_bytenr(
+					(struct btrfs_header *)
+					mapped_datav[0]);
 			btrfsic_cmp_log_and_dev_bytenr(state, bytenr, dev_state,
 						       dev_bytenr);
 			if (state->print_mask & BTRFSIC_PRINT_MASK_VERBOSE)
 				printk(KERN_INFO
 				       "Written block @%llu (%s/%llu/?)"
 				       " !found in hash table, M.\n",
-				       (unsigned long long)bytenr,
-				       dev_state->name,
-				       (unsigned long long)dev_bytenr);
+				       bytenr, dev_state->name, dev_bytenr);
 
 			ret = btrfsic_map_block(state, bytenr, processed_len,
 						&block_ctx, 0);
@@ -2202,7 +2154,7 @@ again:
 				printk(KERN_INFO
 				       "btrfsic: btrfsic_map_block(root @%llu)"
 				       " failed!\n",
-				       (unsigned long long)dev_bytenr);
+				       dev_bytenr);
 				goto continue_loop;
 			}
 		}
@@ -2267,10 +2219,8 @@ again:
 			printk(KERN_INFO
 			       "New written %c-block @%llu (%s/%llu/%d)\n",
 			       is_metadata ? 'M' : 'D',
-			       (unsigned long long)block->logical_bytenr,
-			       block->dev_state->name,
-			       (unsigned long long)block->dev_bytenr,
-			       block->mirror_num);
+			       block->logical_bytenr, block->dev_state->name,
+			       block->dev_bytenr, block->mirror_num);
 		list_add(&block->all_blocks_node, &state->all_blocks_list);
 		btrfsic_block_hashtable_add(block, &state->block_hashtable);
 
@@ -2281,7 +2231,7 @@ again:
 				printk(KERN_INFO
 				       "btrfsic: process_metablock(root @%llu)"
 				       " failed!\n",
-				       (unsigned long long)dev_bytenr);
+				       dev_bytenr);
 		}
 		btrfsic_release_block_ctx(&block_ctx);
 	}
@@ -2319,10 +2269,8 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
 			       "bio_end_io(err=%d) for %c @%llu (%s/%llu/%d)\n",
 			       bio_error_status,
 			       btrfsic_get_block_type(dev_state->state, block),
-			       (unsigned long long)block->logical_bytenr,
-			       dev_state->name,
-			       (unsigned long long)block->dev_bytenr,
-			       block->mirror_num);
+			       block->logical_bytenr, dev_state->name,
+			       block->dev_bytenr, block->mirror_num);
 		next_block = block->next_in_same_bio;
 		block->iodone_w_error = iodone_w_error;
 		if (block->submit_bio_bh_rw & REQ_FLUSH) {
@@ -2332,7 +2280,6 @@ static void btrfsic_bio_end_io(struct bio *bp, int bio_error_status)
 				printk(KERN_INFO
 				       "bio_end_io() new %s flush_gen=%llu\n",
 				       dev_state->name,
-				       (unsigned long long)
 				       dev_state->last_flush_gen);
 		}
 		if (block->submit_bio_bh_rw & REQ_FUA)
@@ -2358,10 +2305,8 @@ static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
 		       "bh_end_io(error=%d) for %c @%llu (%s/%llu/%d)\n",
 		       iodone_w_error,
 		       btrfsic_get_block_type(dev_state->state, block),
-		       (unsigned long long)block->logical_bytenr,
-		       block->dev_state->name,
-		       (unsigned long long)block->dev_bytenr,
-		       block->mirror_num);
+		       block->logical_bytenr, block->dev_state->name,
+		       block->dev_bytenr, block->mirror_num);
 
 	block->iodone_w_error = iodone_w_error;
 	if (block->submit_bio_bh_rw & REQ_FLUSH) {
@@ -2370,8 +2315,7 @@ static void btrfsic_bh_end_io(struct buffer_head *bh, int uptodate)
 		     BTRFSIC_PRINT_MASK_END_IO_BIO_BH))
 			printk(KERN_INFO
 			       "bh_end_io() new %s flush_gen=%llu\n",
-			       dev_state->name,
-			       (unsigned long long)dev_state->last_flush_gen);
+			       dev_state->name, dev_state->last_flush_gen);
 	}
 	if (block->submit_bio_bh_rw & REQ_FUA)
 		block->flush_gen = 0; /* FUA completed means block is on disk */
@@ -2396,26 +2340,20 @@ static int btrfsic_process_written_superblock(
 			printk(KERN_INFO
 			       "btrfsic: superblock @%llu (%s/%llu/%d)"
 			       " with old gen %llu <= %llu\n",
-			       (unsigned long long)superblock->logical_bytenr,
+			       superblock->logical_bytenr,
 			       superblock->dev_state->name,
-			       (unsigned long long)superblock->dev_bytenr,
-			       superblock->mirror_num,
-			       (unsigned long long)
+			       superblock->dev_bytenr, superblock->mirror_num,
 			       btrfs_super_generation(super_hdr),
-			       (unsigned long long)
 			       state->max_superblock_generation);
 	} else {
 		if (state->print_mask & BTRFSIC_PRINT_MASK_SUPERBLOCK_WRITE)
 			printk(KERN_INFO
 			       "btrfsic: got new superblock @%llu (%s/%llu/%d)"
 			       " with new gen %llu > %llu\n",
-			       (unsigned long long)superblock->logical_bytenr,
+			       superblock->logical_bytenr,
 			       superblock->dev_state->name,
-			       (unsigned long long)superblock->dev_bytenr,
-			       superblock->mirror_num,
-			       (unsigned long long)
+			       superblock->dev_bytenr, superblock->mirror_num,
 			       btrfs_super_generation(super_hdr),
-			       (unsigned long long)
 			       state->max_superblock_generation);
 
 		state->max_superblock_generation =
@@ -2432,43 +2370,41 @@ static int btrfsic_process_written_superblock(
 		int num_copies;
 		int mirror_num;
 		const char *additional_string = NULL;
-		struct btrfs_disk_key tmp_disk_key;
+		struct btrfs_disk_key tmp_disk_key = {0};
 
-		tmp_disk_key.type = BTRFS_ROOT_ITEM_KEY;
-		tmp_disk_key.offset = 0;
+		btrfs_set_disk_key_objectid(&tmp_disk_key,
+					    BTRFS_ROOT_ITEM_KEY);
+		btrfs_set_disk_key_objectid(&tmp_disk_key, 0);
 
 		switch (pass) {
 		case 0:
-			tmp_disk_key.objectid =
-			    cpu_to_le64(BTRFS_ROOT_TREE_OBJECTID);
+			btrfs_set_disk_key_objectid(&tmp_disk_key,
+						    BTRFS_ROOT_TREE_OBJECTID);
 			additional_string = "root ";
 			next_bytenr = btrfs_super_root(super_hdr);
 			if (state->print_mask &
 			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
-				printk(KERN_INFO "root@%llu\n",
-				       (unsigned long long)next_bytenr);
+				printk(KERN_INFO "root@%llu\n", next_bytenr);
 			break;
 		case 1:
-			tmp_disk_key.objectid =
-			    cpu_to_le64(BTRFS_CHUNK_TREE_OBJECTID);
+			btrfs_set_disk_key_objectid(&tmp_disk_key,
+						    BTRFS_CHUNK_TREE_OBJECTID);
 			additional_string = "chunk ";
 			next_bytenr = btrfs_super_chunk_root(super_hdr);
 			if (state->print_mask &
 			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
-				printk(KERN_INFO "chunk@%llu\n",
-				       (unsigned long long)next_bytenr);
+				printk(KERN_INFO "chunk@%llu\n", next_bytenr);
 			break;
 		case 2:
-			tmp_disk_key.objectid =
-			    cpu_to_le64(BTRFS_TREE_LOG_OBJECTID);
+			btrfs_set_disk_key_objectid(&tmp_disk_key,
+						    BTRFS_TREE_LOG_OBJECTID);
 			additional_string = "log ";
 			next_bytenr = btrfs_super_log_root(super_hdr);
 			if (0 == next_bytenr)
 				continue;
 			if (state->print_mask &
 			    BTRFSIC_PRINT_MASK_ROOT_CHUNK_LOG_TREE_LOCATION)
-				printk(KERN_INFO "log@%llu\n",
-				       (unsigned long long)next_bytenr);
+				printk(KERN_INFO "log@%llu\n", next_bytenr);
 			break;
 		}
 
@@ -2477,7 +2413,7 @@ static int btrfsic_process_written_superblock(
 				     next_bytenr, BTRFS_SUPER_INFO_SIZE);
 		if (state->print_mask & BTRFSIC_PRINT_MASK_NUM_COPIES)
 			printk(KERN_INFO "num_copies(log_bytenr=%llu) = %d\n",
-			       (unsigned long long)next_bytenr, num_copies);
+			       next_bytenr, num_copies);
 		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
 			int was_created;
 
@@ -2493,8 +2429,7 @@ static int btrfsic_process_written_superblock(
 				printk(KERN_INFO
 				       "btrfsic: btrfsic_map_block(@%llu,"
 				       " mirror=%d) failed!\n",
-				       (unsigned long long)next_bytenr,
-				       mirror_num);
+				       next_bytenr, mirror_num);
 				return -1;
 			}
 
@@ -2579,26 +2514,22 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
 			       " %u* refers to %c @%llu (%s/%llu/%d)\n",
 			       recursion_level,
 			       btrfsic_get_block_type(state, block),
-			       (unsigned long long)block->logical_bytenr,
-			       block->dev_state->name,
-			       (unsigned long long)block->dev_bytenr,
-			       block->mirror_num,
+			       block->logical_bytenr, block->dev_state->name,
+			       block->dev_bytenr, block->mirror_num,
 			       l->ref_cnt,
 			       btrfsic_get_block_type(state, l->block_ref_to),
-			       (unsigned long long)
 			       l->block_ref_to->logical_bytenr,
 			       l->block_ref_to->dev_state->name,
-			       (unsigned long long)l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->dev_bytenr,
 			       l->block_ref_to->mirror_num);
 		if (l->block_ref_to->never_written) {
 			printk(KERN_INFO "btrfs: attempt to write superblock"
 			       " which references block %c @%llu (%s/%llu/%d)"
 			       " which is never written!\n",
 			       btrfsic_get_block_type(state, l->block_ref_to),
-			       (unsigned long long)
 			       l->block_ref_to->logical_bytenr,
 			       l->block_ref_to->dev_state->name,
-			       (unsigned long long)l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->dev_bytenr,
 			       l->block_ref_to->mirror_num);
 			ret = -1;
 		} else if (!l->block_ref_to->is_iodone) {
@@ -2606,10 +2537,9 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
 			       " which references block %c @%llu (%s/%llu/%d)"
 			       " which is not yet iodone!\n",
 			       btrfsic_get_block_type(state, l->block_ref_to),
-			       (unsigned long long)
 			       l->block_ref_to->logical_bytenr,
 			       l->block_ref_to->dev_state->name,
-			       (unsigned long long)l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->dev_bytenr,
 			       l->block_ref_to->mirror_num);
 			ret = -1;
 		} else if (l->block_ref_to->iodone_w_error) {
@@ -2617,10 +2547,9 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
 			       " which references block %c @%llu (%s/%llu/%d)"
 			       " which has write error!\n",
 			       btrfsic_get_block_type(state, l->block_ref_to),
-			       (unsigned long long)
 			       l->block_ref_to->logical_bytenr,
 			       l->block_ref_to->dev_state->name,
-			       (unsigned long long)l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->dev_bytenr,
 			       l->block_ref_to->mirror_num);
 			ret = -1;
 		} else if (l->parent_generation !=
@@ -2634,13 +2563,12 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
 			       " with generation %llu !="
 			       " parent generation %llu!\n",
 			       btrfsic_get_block_type(state, l->block_ref_to),
-			       (unsigned long long)
 			       l->block_ref_to->logical_bytenr,
 			       l->block_ref_to->dev_state->name,
-			       (unsigned long long)l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->dev_bytenr,
 			       l->block_ref_to->mirror_num,
-			       (unsigned long long)l->block_ref_to->generation,
-			       (unsigned long long)l->parent_generation);
+			       l->block_ref_to->generation,
+			       l->parent_generation);
 			ret = -1;
 		} else if (l->block_ref_to->flush_gen >
 			   l->block_ref_to->dev_state->last_flush_gen) {
@@ -2650,13 +2578,10 @@ static int btrfsic_check_all_ref_blocks(struct btrfsic_state *state,
 			       " (block flush_gen=%llu,"
 			       " dev->flush_gen=%llu)!\n",
 			       btrfsic_get_block_type(state, l->block_ref_to),
-			       (unsigned long long)
 			       l->block_ref_to->logical_bytenr,
 			       l->block_ref_to->dev_state->name,
-			       (unsigned long long)l->block_ref_to->dev_bytenr,
-			       l->block_ref_to->mirror_num,
-			       (unsigned long long)block->flush_gen,
-			       (unsigned long long)
+			       l->block_ref_to->dev_bytenr,
+			       l->block_ref_to->mirror_num, block->flush_gen,
 			       l->block_ref_to->dev_state->last_flush_gen);
 			ret = -1;
 		} else if (-1 == btrfsic_check_all_ref_blocks(state,
@@ -2701,16 +2626,12 @@ static int btrfsic_is_block_ref_by_superblock(
 			       " is ref %u* from %c @%llu (%s/%llu/%d)\n",
 			       recursion_level,
 			       btrfsic_get_block_type(state, block),
-			       (unsigned long long)block->logical_bytenr,
-			       block->dev_state->name,
-			       (unsigned long long)block->dev_bytenr,
-			       block->mirror_num,
+			       block->logical_bytenr, block->dev_state->name,
+			       block->dev_bytenr, block->mirror_num,
 			       l->ref_cnt,
 			       btrfsic_get_block_type(state, l->block_ref_from),
-			       (unsigned long long)
 			       l->block_ref_from->logical_bytenr,
 			       l->block_ref_from->dev_state->name,
-			       (unsigned long long)
 			       l->block_ref_from->dev_bytenr,
 			       l->block_ref_from->mirror_num);
 		if (l->block_ref_from->is_superblock &&
@@ -2737,14 +2658,12 @@ static void btrfsic_print_add_link(const struct btrfsic_state *state,
 	       " to %c @%llu (%s/%llu/%d).\n",
 	       l->ref_cnt,
 	       btrfsic_get_block_type(state, l->block_ref_from),
-	       (unsigned long long)l->block_ref_from->logical_bytenr,
+	       l->block_ref_from->logical_bytenr,
 	       l->block_ref_from->dev_state->name,
-	       (unsigned long long)l->block_ref_from->dev_bytenr,
-	       l->block_ref_from->mirror_num,
+	       l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
 	       btrfsic_get_block_type(state, l->block_ref_to),
-	       (unsigned long long)l->block_ref_to->logical_bytenr,
-	       l->block_ref_to->dev_state->name,
-	       (unsigned long long)l->block_ref_to->dev_bytenr,
+	       l->block_ref_to->logical_bytenr,
+	       l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
 	       l->block_ref_to->mirror_num);
 }
 
@@ -2756,14 +2675,12 @@ static void btrfsic_print_rem_link(const struct btrfsic_state *state,
 	       " to %c @%llu (%s/%llu/%d).\n",
 	       l->ref_cnt,
 	       btrfsic_get_block_type(state, l->block_ref_from),
-	       (unsigned long long)l->block_ref_from->logical_bytenr,
+	       l->block_ref_from->logical_bytenr,
 	       l->block_ref_from->dev_state->name,
-	       (unsigned long long)l->block_ref_from->dev_bytenr,
-	       l->block_ref_from->mirror_num,
+	       l->block_ref_from->dev_bytenr, l->block_ref_from->mirror_num,
 	       btrfsic_get_block_type(state, l->block_ref_to),
-	       (unsigned long long)l->block_ref_to->logical_bytenr,
-	       l->block_ref_to->dev_state->name,
-	       (unsigned long long)l->block_ref_to->dev_bytenr,
+	       l->block_ref_to->logical_bytenr,
+	       l->block_ref_to->dev_state->name, l->block_ref_to->dev_bytenr,
 	       l->block_ref_to->mirror_num);
 }
 
@@ -2807,10 +2724,8 @@ static void btrfsic_dump_tree_sub(const struct btrfsic_state *state,
 	 */
 	indent_add = sprintf(buf, "%c-%llu(%s/%llu/%d)",
 			     btrfsic_get_block_type(state, block),
-			     (unsigned long long)block->logical_bytenr,
-			     block->dev_state->name,
-			     (unsigned long long)block->dev_bytenr,
-			     block->mirror_num);
+			     block->logical_bytenr, block->dev_state->name,
+			     block->dev_bytenr, block->mirror_num);
 	if (indent_level + indent_add > BTRFSIC_TREE_DUMP_MAX_INDENT_LEVEL) {
 		printk("[...]\n");
 		return;
@@ -2943,10 +2858,8 @@ static struct btrfsic_block *btrfsic_block_lookup_or_add(
 			       "New %s%c-block @%llu (%s/%llu/%d)\n",
 			       additional_string,
 			       btrfsic_get_block_type(state, block),
-			       (unsigned long long)block->logical_bytenr,
-			       dev_state->name,
-			       (unsigned long long)block->dev_bytenr,
-			       mirror_num);
+			       block->logical_bytenr, dev_state->name,
+			       block->dev_bytenr, mirror_num);
 		list_add(&block->all_blocks_node, &state->all_blocks_list);
 		btrfsic_block_hashtable_add(block, &state->block_hashtable);
 		if (NULL != was_created)
@@ -2980,7 +2893,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
 			printk(KERN_INFO "btrfsic:"
 			       " btrfsic_map_block(logical @%llu,"
 			       " mirror %d) failed!\n",
-			       (unsigned long long)bytenr, mirror_num);
+			       bytenr, mirror_num);
 			continue;
 		}
 
@@ -2997,8 +2910,7 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
 		printk(KERN_INFO "btrfs: attempt to write M-block which contains logical bytenr that doesn't map to dev+physical bytenr of submit_bio,"
 		       " buffer->log_bytenr=%llu, submit_bio(bdev=%s,"
 		       " phys_bytenr=%llu)!\n",
-		       (unsigned long long)bytenr, dev_state->name,
-		       (unsigned long long)dev_bytenr);
+		       bytenr, dev_state->name, dev_bytenr);
 		for (mirror_num = 1; mirror_num <= num_copies; mirror_num++) {
 			ret = btrfsic_map_block(state, bytenr,
 						state->metablock_size,
@@ -3008,10 +2920,8 @@ static void btrfsic_cmp_log_and_dev_bytenr(struct btrfsic_state *state,
 
 			printk(KERN_INFO "Read logical bytenr @%llu maps to"
 			       " (%s/%llu/%d)\n",
-			       (unsigned long long)bytenr,
-			       block_ctx.dev->name,
-			       (unsigned long long)block_ctx.dev_bytenr,
-			       mirror_num);
+			       bytenr, block_ctx.dev->name,
+			       block_ctx.dev_bytenr, mirror_num);
 		}
 		WARN_ON(1);
 	}
@@ -3048,12 +2958,10 @@ int btrfsic_submit_bh(int rw, struct buffer_head *bh)
 		if (dev_state->state->print_mask &
 		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
 			printk(KERN_INFO
-			       "submit_bh(rw=0x%x, blocknr=%lu (bytenr %llu),"
-			       " size=%lu, data=%p, bdev=%p)\n",
-			       rw, (unsigned long)bh->b_blocknr,
-			       (unsigned long long)dev_bytenr,
-			       (unsigned long)bh->b_size, bh->b_data,
-			       bh->b_bdev);
+			       "submit_bh(rw=0x%x, blocknr=%llu (bytenr %llu),"
+			       " size=%zu, data=%p, bdev=%p)\n",
+			       rw, (unsigned long long)bh->b_blocknr,
+			       dev_bytenr, bh->b_size, bh->b_data, bh->b_bdev);
 		btrfsic_process_written_block(dev_state, dev_bytenr,
 					      &bh->b_data, 1, NULL,
 					      NULL, bh, rw);
@@ -3118,9 +3026,9 @@ void btrfsic_submit_bio(int rw, struct bio *bio)
 		    BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
 			printk(KERN_INFO
 			       "submit_bio(rw=0x%x, bi_vcnt=%u,"
-			       " bi_sector=%lu (bytenr %llu), bi_bdev=%p)\n",
-			       rw, bio->bi_vcnt, (unsigned long)bio->bi_sector,
-			       (unsigned long long)dev_bytenr,
+			       " bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
+			       rw, bio->bi_vcnt,
+			       (unsigned long long)bio->bi_sector, dev_bytenr,
 			       bio->bi_bdev);
 
 		mapped_datav = kmalloc(sizeof(*mapped_datav) * bio->bi_vcnt,
@@ -3213,19 +3121,19 @@ int btrfsic_mount(struct btrfs_root *root,
 	if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) {
 		printk(KERN_INFO
 		       "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
-		       root->nodesize, (unsigned long)PAGE_CACHE_SIZE);
+		       root->nodesize, PAGE_CACHE_SIZE);
 		return -1;
 	}
 	if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) {
 		printk(KERN_INFO
 		       "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
-		       root->leafsize, (unsigned long)PAGE_CACHE_SIZE);
+		       root->leafsize, PAGE_CACHE_SIZE);
 		return -1;
 	}
 	if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) {
 		printk(KERN_INFO
 		       "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n",
-		       root->sectorsize, (unsigned long)PAGE_CACHE_SIZE);
+		       root->sectorsize, PAGE_CACHE_SIZE);
 		return -1;
 	}
 	state = kzalloc(sizeof(*state), GFP_NOFS);
@@ -3369,10 +3277,8 @@ void btrfsic_unmount(struct btrfs_root *root,
 			       " @%llu (%s/%llu/%d) on umount which is"
 			       " not yet iodone!\n",
 			       btrfsic_get_block_type(state, b_all),
-			       (unsigned long long)b_all->logical_bytenr,
-			       b_all->dev_state->name,
-			       (unsigned long long)b_all->dev_bytenr,
-			       b_all->mirror_num);
+			       b_all->logical_bytenr, b_all->dev_state->name,
+			       b_all->dev_bytenr, b_all->mirror_num);
 	}
 
 	mutex_unlock(&btrfsic_mutex);
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index b189bd1e7a3e..6aad98cb343f 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -132,9 +132,8 @@ static int check_compressed_csum(struct inode *inode,
 			printk(KERN_INFO "btrfs csum failed ino %llu "
 			       "extent %llu csum %u "
 			       "wanted %u mirror %d\n",
-			       (unsigned long long)btrfs_ino(inode),
-			       (unsigned long long)disk_start,
-			       csum, *cb_sum, cb->mirror_num);
+			       btrfs_ino(inode), disk_start, csum, *cb_sum,
+			       cb->mirror_num);
 			ret = -EIO;
 			goto fail;
 		}
@@ -639,7 +638,11 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
 	faili = nr_pages - 1;
 	cb->nr_pages = nr_pages;
 
-	add_ra_bio_pages(inode, em_start + em_len, cb);
+	/* In the parent-locked case, we only locked the range we are
+	 * interested in.  In all other cases, we can opportunistically
+	 * cache decompressed data that goes beyond the requested range. */
+	if (!(bio_flags & EXTENT_BIO_PARENT_LOCKED))
+		add_ra_bio_pages(inode, em_start + em_len, cb);
 
 	/* include any pages we added in add_ra-bio_pages */
 	uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ed504607d8ec..64346721173f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -274,8 +274,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 	else
 		btrfs_set_header_owner(cow, new_root_objectid);
 
-	write_extent_buffer(cow, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(cow),
+	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow),
 			    BTRFS_FSID_SIZE);
 
 	WARN_ON(btrfs_header_generation(buf) > trans->transid);
@@ -484,8 +483,27 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
 	struct rb_node **new;
 	struct rb_node *parent = NULL;
 	struct tree_mod_elem *cur;
+	int ret = 0;
+
+	BUG_ON(!tm);
+
+	tree_mod_log_write_lock(fs_info);
+	if (list_empty(&fs_info->tree_mod_seq_list)) {
+		tree_mod_log_write_unlock(fs_info);
+		/*
+		 * Ok we no longer care about logging modifications, free up tm
+		 * and return 0.  Any callers shouldn't be using tm after
+		 * calling tree_mod_log_insert, but if they do we can just
+		 * change this to return a special error code to let the callers
+		 * do their own thing.
+		 */
+		kfree(tm);
+		return 0;
+	}
 
-	BUG_ON(!tm || !tm->seq);
+	spin_lock(&fs_info->tree_mod_seq_lock);
+	tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
+	spin_unlock(&fs_info->tree_mod_seq_lock);
 
 	tm_root = &fs_info->tree_mod_log;
 	new = &tm_root->rb_node;
@@ -501,14 +519,17 @@ __tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
 		else if (cur->seq > tm->seq)
 			new = &((*new)->rb_right);
 		else {
+			ret = -EEXIST;
 			kfree(tm);
-			return -EEXIST;
+			goto out;
 		}
 	}
 
 	rb_link_node(&tm->node, parent, new);
 	rb_insert_color(&tm->node, tm_root);
-	return 0;
+out:
+	tree_mod_log_write_unlock(fs_info);
+	return ret;
 }
 
 /*
@@ -524,57 +545,19 @@ static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
 		return 1;
 	if (eb && btrfs_header_level(eb) == 0)
 		return 1;
-
-	tree_mod_log_write_lock(fs_info);
-	if (list_empty(&fs_info->tree_mod_seq_list)) {
-		/*
-		 * someone emptied the list while we were waiting for the lock.
-		 * we must not add to the list when no blocker exists.
-		 */
-		tree_mod_log_write_unlock(fs_info);
-		return 1;
-	}
-
 	return 0;
 }
 
-/*
- * This allocates memory and gets a tree modification sequence number.
- *
- * Returns <0 on error.
- * Returns >0 (the added sequence number) on success.
- */
-static inline int tree_mod_alloc(struct btrfs_fs_info *fs_info, gfp_t flags,
-				 struct tree_mod_elem **tm_ret)
-{
-	struct tree_mod_elem *tm;
-
-	/*
-	 * once we switch from spin locks to something different, we should
-	 * honor the flags parameter here.
-	 */
-	tm = *tm_ret = kzalloc(sizeof(*tm), GFP_ATOMIC);
-	if (!tm)
-		return -ENOMEM;
-
-	spin_lock(&fs_info->tree_mod_seq_lock);
-	tm->seq = btrfs_inc_tree_mod_seq_minor(fs_info);
-	spin_unlock(&fs_info->tree_mod_seq_lock);
-
-	return tm->seq;
-}
-
 static inline int
 __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
 			  struct extent_buffer *eb, int slot,
 			  enum mod_log_op op, gfp_t flags)
 {
-	int ret;
 	struct tree_mod_elem *tm;
 
-	ret = tree_mod_alloc(fs_info, flags, &tm);
-	if (ret < 0)
-		return ret;
+	tm = kzalloc(sizeof(*tm), flags);
+	if (!tm)
+		return -ENOMEM;
 
 	tm->index = eb->start >> PAGE_CACHE_SHIFT;
 	if (op != MOD_LOG_KEY_ADD) {
@@ -589,34 +572,14 @@ __tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
 }
 
 static noinline int
-tree_mod_log_insert_key_mask(struct btrfs_fs_info *fs_info,
-			     struct extent_buffer *eb, int slot,
-			     enum mod_log_op op, gfp_t flags)
+tree_mod_log_insert_key(struct btrfs_fs_info *fs_info,
+			struct extent_buffer *eb, int slot,
+			enum mod_log_op op, gfp_t flags)
 {
-	int ret;
-
 	if (tree_mod_dont_log(fs_info, eb))
 		return 0;
 
-	ret = __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
-
-	tree_mod_log_write_unlock(fs_info);
-	return ret;
-}
-
-static noinline int
-tree_mod_log_insert_key(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
-			int slot, enum mod_log_op op)
-{
-	return tree_mod_log_insert_key_mask(fs_info, eb, slot, op, GFP_NOFS);
-}
-
-static noinline int
-tree_mod_log_insert_key_locked(struct btrfs_fs_info *fs_info,
-			     struct extent_buffer *eb, int slot,
-			     enum mod_log_op op)
-{
-	return __tree_mod_log_insert_key(fs_info, eb, slot, op, GFP_NOFS);
+	return __tree_mod_log_insert_key(fs_info, eb, slot, op, flags);
 }
 
 static noinline int
@@ -637,14 +600,14 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 	 * buffer, i.e. dst_slot < src_slot.
 	 */
 	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
-		ret = tree_mod_log_insert_key_locked(fs_info, eb, i + dst_slot,
-					      MOD_LOG_KEY_REMOVE_WHILE_MOVING);
+		ret = __tree_mod_log_insert_key(fs_info, eb, i + dst_slot,
+				MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
 		BUG_ON(ret < 0);
 	}
 
-	ret = tree_mod_alloc(fs_info, flags, &tm);
-	if (ret < 0)
-		goto out;
+	tm = kzalloc(sizeof(*tm), flags);
+	if (!tm)
+		return -ENOMEM;
 
 	tm->index = eb->start >> PAGE_CACHE_SHIFT;
 	tm->slot = src_slot;
@@ -652,10 +615,7 @@ tree_mod_log_insert_move(struct btrfs_fs_info *fs_info,
 	tm->move.nr_items = nr_items;
 	tm->op = MOD_LOG_MOVE_KEYS;
 
-	ret = __tree_mod_log_insert(fs_info, tm);
-out:
-	tree_mod_log_write_unlock(fs_info);
-	return ret;
+	return __tree_mod_log_insert(fs_info, tm);
 }
 
 static inline void
@@ -670,8 +630,8 @@ __tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 
 	nritems = btrfs_header_nritems(eb);
 	for (i = nritems - 1; i >= 0; i--) {
-		ret = tree_mod_log_insert_key_locked(fs_info, eb, i,
-					      MOD_LOG_KEY_REMOVE_WHILE_FREEING);
+		ret = __tree_mod_log_insert_key(fs_info, eb, i,
+				MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
 		BUG_ON(ret < 0);
 	}
 }
@@ -683,7 +643,6 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 			 int log_removal)
 {
 	struct tree_mod_elem *tm;
-	int ret;
 
 	if (tree_mod_dont_log(fs_info, NULL))
 		return 0;
@@ -691,9 +650,9 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 	if (log_removal)
 		__tree_mod_log_free_eb(fs_info, old_root);
 
-	ret = tree_mod_alloc(fs_info, flags, &tm);
-	if (ret < 0)
-		goto out;
+	tm = kzalloc(sizeof(*tm), flags);
+	if (!tm)
+		return -ENOMEM;
 
 	tm->index = new_root->start >> PAGE_CACHE_SHIFT;
 	tm->old_root.logical = old_root->start;
@@ -701,10 +660,7 @@ tree_mod_log_insert_root(struct btrfs_fs_info *fs_info,
 	tm->generation = btrfs_header_generation(old_root);
 	tm->op = MOD_LOG_ROOT_REPLACE;
 
-	ret = __tree_mod_log_insert(fs_info, tm);
-out:
-	tree_mod_log_write_unlock(fs_info);
-	return ret;
+	return __tree_mod_log_insert(fs_info, tm);
 }
 
 static struct tree_mod_elem *
@@ -784,23 +740,20 @@ tree_mod_log_eb_copy(struct btrfs_fs_info *fs_info, struct extent_buffer *dst,
 	if (tree_mod_dont_log(fs_info, NULL))
 		return;
 
-	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) {
-		tree_mod_log_write_unlock(fs_info);
+	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
 		return;
-	}
 
 	for (i = 0; i < nr_items; i++) {
-		ret = tree_mod_log_insert_key_locked(fs_info, src,
+		ret = __tree_mod_log_insert_key(fs_info, src,
 						i + src_offset,
-						MOD_LOG_KEY_REMOVE);
+						MOD_LOG_KEY_REMOVE, GFP_NOFS);
 		BUG_ON(ret < 0);
-		ret = tree_mod_log_insert_key_locked(fs_info, dst,
+		ret = __tree_mod_log_insert_key(fs_info, dst,
 						     i + dst_offset,
-						     MOD_LOG_KEY_ADD);
+						     MOD_LOG_KEY_ADD,
+						     GFP_NOFS);
 		BUG_ON(ret < 0);
 	}
-
-	tree_mod_log_write_unlock(fs_info);
 }
 
 static inline void
@@ -819,9 +772,9 @@ tree_mod_log_set_node_key(struct btrfs_fs_info *fs_info,
 {
 	int ret;
 
-	ret = tree_mod_log_insert_key_mask(fs_info, eb, slot,
-					   MOD_LOG_KEY_REPLACE,
-					   atomic ? GFP_ATOMIC : GFP_NOFS);
+	ret = __tree_mod_log_insert_key(fs_info, eb, slot,
+					MOD_LOG_KEY_REPLACE,
+					atomic ? GFP_ATOMIC : GFP_NOFS);
 	BUG_ON(ret < 0);
 }
 
@@ -830,10 +783,7 @@ tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
 {
 	if (tree_mod_dont_log(fs_info, eb))
 		return;
-
 	__tree_mod_log_free_eb(fs_info, eb);
-
-	tree_mod_log_write_unlock(fs_info);
 }
 
 static noinline void
@@ -1046,8 +996,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 	else
 		btrfs_set_header_owner(cow, root->root_key.objectid);
 
-	write_extent_buffer(cow, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(cow),
+	write_extent_buffer(cow, root->fs_info->fsid, btrfs_header_fsid(cow),
 			    BTRFS_FSID_SIZE);
 
 	ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
@@ -1083,7 +1032,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
 
 		WARN_ON(trans->transid != btrfs_header_generation(parent));
 		tree_mod_log_insert_key(root->fs_info, parent, parent_slot,
-					MOD_LOG_KEY_REPLACE);
+					MOD_LOG_KEY_REPLACE, GFP_NOFS);
 		btrfs_set_node_blockptr(parent, parent_slot,
 					cow->start);
 		btrfs_set_node_ptr_generation(parent, parent_slot,
@@ -1116,7 +1065,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
 	int looped = 0;
 
 	if (!time_seq)
-		return 0;
+		return NULL;
 
 	/*
 	 * the very last operation that's logged for a root is the replacement
@@ -1127,7 +1076,7 @@ __tree_mod_log_oldest_root(struct btrfs_fs_info *fs_info,
 		tm = tree_mod_log_search_oldest(fs_info, root_logical,
 						time_seq);
 		if (!looped && !tm)
-			return 0;
+			return NULL;
 		/*
 		 * if there are no tree operation for the oldest root, we simply
 		 * return it. this should only happen if that (old) root is at
@@ -1240,8 +1189,8 @@ __tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
  * is freed (its refcount is decremented).
  */
 static struct extent_buffer *
-tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
-		    u64 time_seq)
+tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
+		    struct extent_buffer *eb, u64 time_seq)
 {
 	struct extent_buffer *eb_rewin;
 	struct tree_mod_elem *tm;
@@ -1256,11 +1205,18 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 	if (!tm)
 		return eb;
 
+	btrfs_set_path_blocking(path);
+	btrfs_set_lock_blocking_rw(eb, BTRFS_READ_LOCK);
+
 	if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
 		BUG_ON(tm->slot != 0);
 		eb_rewin = alloc_dummy_extent_buffer(eb->start,
 						fs_info->tree_root->nodesize);
-		BUG_ON(!eb_rewin);
+		if (!eb_rewin) {
+			btrfs_tree_read_unlock_blocking(eb);
+			free_extent_buffer(eb);
+			return NULL;
+		}
 		btrfs_set_header_bytenr(eb_rewin, eb->start);
 		btrfs_set_header_backref_rev(eb_rewin,
 					     btrfs_header_backref_rev(eb));
@@ -1268,10 +1224,15 @@ tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
 		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
 	} else {
 		eb_rewin = btrfs_clone_extent_buffer(eb);
-		BUG_ON(!eb_rewin);
+		if (!eb_rewin) {
+			btrfs_tree_read_unlock_blocking(eb);
+			free_extent_buffer(eb);
+			return NULL;
+		}
 	}
 
-	btrfs_tree_read_unlock(eb);
+	btrfs_clear_path_blocking(path, NULL, BTRFS_READ_LOCK);
+	btrfs_tree_read_unlock_blocking(eb);
 	free_extent_buffer(eb);
 
 	extent_buffer_get(eb_rewin);
@@ -1335,8 +1296,9 @@ get_old_root(struct btrfs_root *root, u64 time_seq)
 		free_extent_buffer(eb_root);
 		eb = alloc_dummy_extent_buffer(logical, root->nodesize);
 	} else {
+		btrfs_set_lock_blocking_rw(eb_root, BTRFS_READ_LOCK);
 		eb = btrfs_clone_extent_buffer(eb_root);
-		btrfs_tree_read_unlock(eb_root);
+		btrfs_tree_read_unlock_blocking(eb_root);
 		free_extent_buffer(eb_root);
 	}
 
@@ -1419,14 +1381,12 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
 
 	if (trans->transaction != root->fs_info->running_transaction)
 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
-		       (unsigned long long)trans->transid,
-		       (unsigned long long)
+		       trans->transid,
 		       root->fs_info->running_transaction->transid);
 
 	if (trans->transid != root->fs_info->generation)
 		WARN(1, KERN_CRIT "trans %llu running %llu\n",
-		       (unsigned long long)trans->transid,
-		       (unsigned long long)root->fs_info->generation);
+		       trans->transid, root->fs_info->generation);
 
 	if (!should_cow_block(trans, root, buf)) {
 		*cow_ret = buf;
@@ -2466,6 +2426,40 @@ done:
 	return ret;
 }
 
+static void key_search_validate(struct extent_buffer *b,
+				struct btrfs_key *key,
+				int level)
+{
+#ifdef CONFIG_BTRFS_ASSERT
+	struct btrfs_disk_key disk_key;
+
+	btrfs_cpu_key_to_disk(&disk_key, key);
+
+	if (level == 0)
+		ASSERT(!memcmp_extent_buffer(b, &disk_key,
+		    offsetof(struct btrfs_leaf, items[0].key),
+		    sizeof(disk_key)));
+	else
+		ASSERT(!memcmp_extent_buffer(b, &disk_key,
+		    offsetof(struct btrfs_node, ptrs[0].key),
+		    sizeof(disk_key)));
+#endif
+}
+
+static int key_search(struct extent_buffer *b, struct btrfs_key *key,
+		      int level, int *prev_cmp, int *slot)
+{
+	if (*prev_cmp != 0) {
+		*prev_cmp = bin_search(b, key, level, slot);
+		return *prev_cmp;
+	}
+
+	key_search_validate(b, key, level);
+	*slot = 0;
+
+	return 0;
+}
+
 /*
  * look for key in the tree.  path is filled in with nodes along the way
  * if key is found, we return zero and you can find the item in the leaf
@@ -2494,6 +2488,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	int write_lock_level = 0;
 	u8 lowest_level = 0;
 	int min_write_lock_level;
+	int prev_cmp;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(lowest_level && ins_len > 0);
@@ -2524,6 +2519,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
 	min_write_lock_level = write_lock_level;
 
 again:
+	prev_cmp = -1;
 	/*
 	 * we try very hard to do read locks on the root
 	 */
@@ -2624,7 +2620,7 @@ cow_done:
 		if (!cow)
 			btrfs_unlock_up_safe(p, level + 1);
 
-		ret = bin_search(b, key, level, &slot);
+		ret = key_search(b, key, level, &prev_cmp, &slot);
 
 		if (level != 0) {
 			int dec = 0;
@@ -2759,6 +2755,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
 	int level;
 	int lowest_unlock = 1;
 	u8 lowest_level = 0;
+	int prev_cmp;
 
 	lowest_level = p->lowest_level;
 	WARN_ON(p->nodes[0] != NULL);
@@ -2769,6 +2766,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, struct btrfs_key *key,
 	}
 
 again:
+	prev_cmp = -1;
 	b = get_old_root(root, time_seq);
 	level = btrfs_header_level(b);
 	p->locks[level] = BTRFS_READ_LOCK;
@@ -2786,7 +2784,7 @@ again:
 		 */
 		btrfs_unlock_up_safe(p, level + 1);
 
-		ret = bin_search(b, key, level, &slot);
+		ret = key_search(b, key, level, &prev_cmp, &slot);
 
 		if (level != 0) {
 			int dec = 0;
@@ -2820,7 +2818,11 @@ again:
 				btrfs_clear_path_blocking(p, b,
 							  BTRFS_READ_LOCK);
 			}
-			b = tree_mod_log_rewind(root->fs_info, b, time_seq);
+			b = tree_mod_log_rewind(root->fs_info, p, b, time_seq);
+			if (!b) {
+				ret = -ENOMEM;
+				goto done;
+			}
 			p->locks[level] = BTRFS_READ_LOCK;
 			p->nodes[level] = b;
 		} else {
@@ -3143,13 +3145,11 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
 	btrfs_set_header_backref_rev(c, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(c, root->root_key.objectid);
 
-	write_extent_buffer(c, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(c),
+	write_extent_buffer(c, root->fs_info->fsid, btrfs_header_fsid(c),
 			    BTRFS_FSID_SIZE);
 
 	write_extent_buffer(c, root->fs_info->chunk_tree_uuid,
-			    (unsigned long)btrfs_header_chunk_tree_uuid(c),
-			    BTRFS_UUID_SIZE);
+			    btrfs_header_chunk_tree_uuid(c), BTRFS_UUID_SIZE);
 
 	btrfs_set_node_key(c, &lower_key, 0);
 	btrfs_set_node_blockptr(c, 0, lower->start);
@@ -3208,7 +3208,7 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
 	}
 	if (level) {
 		ret = tree_mod_log_insert_key(root->fs_info, lower, slot,
-					      MOD_LOG_KEY_ADD);
+					      MOD_LOG_KEY_ADD, GFP_NOFS);
 		BUG_ON(ret < 0);
 	}
 	btrfs_set_node_key(lower, key, slot);
@@ -3284,10 +3284,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
 	btrfs_set_header_backref_rev(split, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(split, root->root_key.objectid);
 	write_extent_buffer(split, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(split),
-			    BTRFS_FSID_SIZE);
+			    btrfs_header_fsid(split), BTRFS_FSID_SIZE);
 	write_extent_buffer(split, root->fs_info->chunk_tree_uuid,
-			    (unsigned long)btrfs_header_chunk_tree_uuid(split),
+			    btrfs_header_chunk_tree_uuid(split),
 			    BTRFS_UUID_SIZE);
 
 	tree_mod_log_eb_copy(root->fs_info, split, c, 0, mid, c_nritems - mid);
@@ -4040,11 +4039,10 @@ again:
 	btrfs_set_header_owner(right, root->root_key.objectid);
 	btrfs_set_header_level(right, 0);
 	write_extent_buffer(right, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(right),
-			    BTRFS_FSID_SIZE);
+			    btrfs_header_fsid(right), BTRFS_FSID_SIZE);
 
 	write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
-			    (unsigned long)btrfs_header_chunk_tree_uuid(right),
+			    btrfs_header_chunk_tree_uuid(right),
 			    BTRFS_UUID_SIZE);
 
 	if (split == 0) {
@@ -4642,7 +4640,7 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
 			      (nritems - slot - 1));
 	} else if (level) {
 		ret = tree_mod_log_insert_key(root->fs_info, parent, slot,
-					      MOD_LOG_KEY_REMOVE);
+					      MOD_LOG_KEY_REMOVE, GFP_NOFS);
 		BUG_ON(ret < 0);
 	}
 
@@ -4814,7 +4812,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
  * This may release the path, and so you may lose any locks held at the
  * time you call it.
  */
-int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
+static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
 {
 	struct btrfs_key key;
 	struct btrfs_disk_key found_key;
@@ -5329,19 +5327,20 @@ int btrfs_compare_trees(struct btrfs_root *left_root,
 					goto out;
 				advance_right = ADVANCE;
 			} else {
+				enum btrfs_compare_tree_result cmp;
+
 				WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
 				ret = tree_compare_item(left_root, left_path,
 						right_path, tmp_buf);
-				if (ret) {
-					WARN_ON(!extent_buffer_uptodate(left_path->nodes[0]));
-					ret = changed_cb(left_root, right_root,
-						left_path, right_path,
-						&left_key,
-						BTRFS_COMPARE_TREE_CHANGED,
-						ctx);
-					if (ret < 0)
-						goto out;
-				}
+				if (ret)
+					cmp = BTRFS_COMPARE_TREE_CHANGED;
+				else
+					cmp = BTRFS_COMPARE_TREE_SAME;
+				ret = changed_cb(left_root, right_root,
+						 left_path, right_path,
+						 &left_key, cmp, ctx);
+				if (ret < 0)
+					goto out;
 				advance_left = ADVANCE;
 				advance_right = ADVANCE;
 			}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index e795bf135e80..3c1da6f98a4d 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -23,6 +23,7 @@
 #include <linux/highmem.h>
 #include <linux/fs.h>
 #include <linux/rwsem.h>
+#include <linux/semaphore.h>
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
 #include <linux/wait.h>
@@ -91,6 +92,9 @@ struct btrfs_ordered_sum;
 /* holds quota configuration and tracking */
 #define BTRFS_QUOTA_TREE_OBJECTID 8ULL
 
+/* for storing items that use the BTRFS_UUID_KEY* types */
+#define BTRFS_UUID_TREE_OBJECTID 9ULL
+
 /* for storing balance parameters in the root tree */
 #define BTRFS_BALANCE_OBJECTID -4ULL
 
@@ -142,7 +146,7 @@ struct btrfs_ordered_sum;
 
 #define BTRFS_EMPTY_SUBVOL_DIR_OBJECTID 2
 
-#define BTRFS_DEV_REPLACE_DEVID 0
+#define BTRFS_DEV_REPLACE_DEVID 0ULL
 
 /*
  * the max metadata block size.  This limit is somewhat artificial,
@@ -478,9 +482,10 @@ struct btrfs_super_block {
 	char label[BTRFS_LABEL_SIZE];
 
 	__le64 cache_generation;
+	__le64 uuid_tree_generation;
 
 	/* future expansion */
-	__le64 reserved[31];
+	__le64 reserved[30];
 	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 	struct btrfs_root_backup super_roots[BTRFS_NUM_BACKUP_ROOTS];
 } __attribute__ ((__packed__));
@@ -1188,6 +1193,7 @@ enum btrfs_caching_type {
 	BTRFS_CACHE_STARTED	= 1,
 	BTRFS_CACHE_FAST	= 2,
 	BTRFS_CACHE_FINISHED	= 3,
+	BTRFS_CACHE_ERROR	= 4,
 };
 
 enum btrfs_disk_cache_state {
@@ -1302,6 +1308,7 @@ struct btrfs_fs_info {
 	struct btrfs_root *fs_root;
 	struct btrfs_root *csum_root;
 	struct btrfs_root *quota_root;
+	struct btrfs_root *uuid_root;
 
 	/* the log root tree is a directory of all the other log roots */
 	struct btrfs_root *log_root_tree;
@@ -1350,6 +1357,7 @@ struct btrfs_fs_info {
 	u64 last_trans_log_full_commit;
 	unsigned long mount_opt;
 	unsigned long compress_type:4;
+	int commit_interval;
 	/*
 	 * It is a suggestive number, the read side is safe even it gets a
 	 * wrong number because we will write out the data into a regular
@@ -1411,6 +1419,13 @@ struct btrfs_fs_info {
 	 * before jumping into the main commit.
 	 */
 	struct mutex ordered_operations_mutex;
+
+	/*
+	 * Same as ordered_operations_mutex except this is for ordered extents
+	 * and not the operations.
+	 */
+	struct mutex ordered_extent_flush_mutex;
+
 	struct rw_semaphore extent_commit_sem;
 
 	struct rw_semaphore cleanup_work_sem;
@@ -1641,6 +1656,9 @@ struct btrfs_fs_info {
 	struct btrfs_dev_replace dev_replace;
 
 	atomic_t mutually_exclusive_operation_running;
+
+	struct semaphore uuid_tree_rescan_sem;
+	unsigned int update_uuid_tree_gen:1;
 };
 
 /*
@@ -1934,6 +1952,19 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_DEV_REPLACE_KEY	250
 
 /*
+ * Stores items that allow to quickly map UUIDs to something else.
+ * These items are part of the filesystem UUID tree.
+ * The key is built like this:
+ * (UUID_upper_64_bits, BTRFS_UUID_KEY*, UUID_lower_64_bits).
+ */
+#if BTRFS_UUID_SIZE != 16
+#error "UUID items require BTRFS_UUID_SIZE == 16!"
+#endif
+#define BTRFS_UUID_KEY_SUBVOL	251	/* for UUIDs assigned to subvols */
+#define BTRFS_UUID_KEY_RECEIVED_SUBVOL	252	/* for UUIDs assigned to
+						 * received subvols */
+
+/*
  * string items are for debugging.  They just store a short string of
  * data in the FS
  */
@@ -1967,6 +1998,9 @@ struct btrfs_ioctl_defrag_range_args {
 #define BTRFS_MOUNT_CHECK_INTEGRITY	(1 << 20)
 #define BTRFS_MOUNT_CHECK_INTEGRITY_INCLUDING_EXTENT_DATA (1 << 21)
 #define BTRFS_MOUNT_PANIC_ON_FATAL_ERROR	(1 << 22)
+#define BTRFS_MOUNT_RESCAN_UUID_TREE	(1 << 23)
+
+#define BTRFS_DEFAULT_COMMIT_INTERVAL	(30)
 
 #define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
 #define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
@@ -2130,14 +2164,14 @@ BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item,
 BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item,
 			 generation, 64);
 
-static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
+static inline unsigned long btrfs_device_uuid(struct btrfs_dev_item *d)
 {
-	return (char *)d + offsetof(struct btrfs_dev_item, uuid);
+	return (unsigned long)d + offsetof(struct btrfs_dev_item, uuid);
 }
 
-static inline char *btrfs_device_fsid(struct btrfs_dev_item *d)
+static inline unsigned long btrfs_device_fsid(struct btrfs_dev_item *d)
 {
-	return (char *)d + offsetof(struct btrfs_dev_item, fsid);
+	return (unsigned long)d + offsetof(struct btrfs_dev_item, fsid);
 }
 
 BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64);
@@ -2240,6 +2274,23 @@ BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
 BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
 BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
 BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
+			 generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
+			 sequence, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item,
+			 transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item,
+			 nbytes, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item,
+			 block_group, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
 
 static inline struct btrfs_timespec *
 btrfs_inode_atime(struct btrfs_inode_item *inode_item)
@@ -2267,6 +2318,8 @@ btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
 
 BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
 BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
 
 /* struct btrfs_dev_extent */
 BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct btrfs_dev_extent,
@@ -2277,10 +2330,10 @@ BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent,
 		   chunk_offset, 64);
 BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
 
-static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
+static inline unsigned long btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev)
 {
 	unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid);
-	return (u8 *)((unsigned long)dev + ptr);
+	return (unsigned long)dev + ptr;
 }
 
 BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 64);
@@ -2348,6 +2401,10 @@ BTRFS_SETGET_FUNCS(ref_count_v0, struct btrfs_extent_ref_v0, count, 32);
 /* struct btrfs_node */
 BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64);
 BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_key_blockptr, struct btrfs_key_ptr,
+			 blockptr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_key_generation, struct btrfs_key_ptr,
+			 generation, 64);
 
 static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr)
 {
@@ -2404,6 +2461,8 @@ static inline void btrfs_set_node_key(struct extent_buffer *eb,
 /* struct btrfs_item */
 BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32);
 BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_item_offset, struct btrfs_item, offset, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_item_size, struct btrfs_item, size, 32);
 
 static inline unsigned long btrfs_item_nr_offset(int nr)
 {
@@ -2466,6 +2525,13 @@ BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16);
 BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8);
 BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16);
 BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_type, struct btrfs_dir_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_data_len, struct btrfs_dir_item,
+			 data_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_name_len, struct btrfs_dir_item,
+			 name_len, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_dir_transid, struct btrfs_dir_item,
+			 transid, 64);
 
 static inline void btrfs_dir_item_key(struct extent_buffer *eb,
 				      struct btrfs_dir_item *item,
@@ -2568,6 +2634,12 @@ BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64);
 BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32);
 BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64);
 BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_header_generation, struct btrfs_header,
+			 generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_header_owner, struct btrfs_header, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_header_nritems, struct btrfs_header,
+			 nritems, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_header_bytenr, struct btrfs_header, bytenr, 64);
 
 static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag)
 {
@@ -2603,16 +2675,14 @@ static inline void btrfs_set_header_backref_rev(struct extent_buffer *eb,
 	btrfs_set_header_flags(eb, flags);
 }
 
-static inline u8 *btrfs_header_fsid(struct extent_buffer *eb)
+static inline unsigned long btrfs_header_fsid(struct extent_buffer *eb)
 {
-	unsigned long ptr = offsetof(struct btrfs_header, fsid);
-	return (u8 *)ptr;
+	return offsetof(struct btrfs_header, fsid);
 }
 
-static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
+static inline unsigned long btrfs_header_chunk_tree_uuid(struct extent_buffer *eb)
 {
-	unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid);
-	return (u8 *)ptr;
+	return offsetof(struct btrfs_header, chunk_tree_uuid);
 }
 
 static inline int btrfs_is_leaf(struct extent_buffer *eb)
@@ -2830,6 +2900,9 @@ BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block,
 			 csum_type, 16);
 BTRFS_SETGET_STACK_FUNCS(super_cache_generation, struct btrfs_super_block,
 			 cache_generation, 64);
+BTRFS_SETGET_STACK_FUNCS(super_magic, struct btrfs_super_block, magic, 64);
+BTRFS_SETGET_STACK_FUNCS(super_uuid_tree_generation, struct btrfs_super_block,
+			 uuid_tree_generation, 64);
 
 static inline int btrfs_super_csum_size(struct btrfs_super_block *s)
 {
@@ -2847,6 +2920,14 @@ static inline unsigned long btrfs_leaf_data(struct extent_buffer *l)
 
 /* struct btrfs_file_extent_item */
 BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_disk_bytenr,
+			 struct btrfs_file_extent_item, disk_bytenr, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_offset,
+			 struct btrfs_file_extent_item, offset, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_generation,
+			 struct btrfs_file_extent_item, generation, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_file_extent_num_bytes,
+			 struct btrfs_file_extent_item, num_bytes, 64);
 
 static inline unsigned long
 btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e)
@@ -3107,11 +3188,9 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   u64 root_objectid, u64 owner, u64 offset,
 				   struct btrfs_key *ins);
-int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
-				  struct btrfs_root *root,
-				  u64 num_bytes, u64 min_alloc_size,
-				  u64 empty_size, u64 hint_byte,
-				  struct btrfs_key *ins, int is_data);
+int btrfs_reserve_extent(struct btrfs_root *root, u64 num_bytes,
+			 u64 min_alloc_size, u64 empty_size, u64 hint_byte,
+			 struct btrfs_key *ins, int is_data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf, int full_backref, int for_cow);
 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
@@ -3175,7 +3254,7 @@ void btrfs_orphan_release_metadata(struct inode *inode);
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 				     struct btrfs_block_rsv *rsv,
 				     int nitems,
-				     u64 *qgroup_reserved);
+				     u64 *qgroup_reserved, bool use_global_rsv);
 void btrfs_subvolume_release_metadata(struct btrfs_root *root,
 				      struct btrfs_block_rsv *rsv,
 				      u64 qgroup_reserved);
@@ -3245,6 +3324,7 @@ enum btrfs_compare_tree_result {
 	BTRFS_COMPARE_TREE_NEW,
 	BTRFS_COMPARE_TREE_DELETED,
 	BTRFS_COMPARE_TREE_CHANGED,
+	BTRFS_COMPARE_TREE_SAME,
 };
 typedef int (*btrfs_changed_cb_t)(struct btrfs_root *left_root,
 				  struct btrfs_root *right_root,
@@ -3380,6 +3460,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info)
 	kfree(fs_info->dev_root);
 	kfree(fs_info->csum_root);
 	kfree(fs_info->quota_root);
+	kfree(fs_info->uuid_root);
 	kfree(fs_info->super_copy);
 	kfree(fs_info->super_for_commit);
 	kfree(fs_info);
@@ -3414,8 +3495,6 @@ int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root,
 				   struct btrfs_key *key,
 				   struct btrfs_root_item *item);
-void btrfs_read_root_item(struct extent_buffer *eb, int slot,
-			  struct btrfs_root_item *item);
 int btrfs_find_root(struct btrfs_root *root, struct btrfs_key *search_key,
 		    struct btrfs_path *path, struct btrfs_root_item *root_item,
 		    struct btrfs_key *root_key);
@@ -3426,6 +3505,17 @@ void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
 void btrfs_update_root_times(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
 
+/* uuid-tree.c */
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
+			struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+			u64 subid);
+int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
+			struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+			u64 subid);
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
+			    int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
+					      u64));
+
 /* dir-item.c */
 int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
 			  const char *name, int name_len);
@@ -3509,12 +3599,14 @@ int btrfs_find_name_in_ext_backref(struct btrfs_path *path,
 				   struct btrfs_inode_extref **extref_ret);
 
 /* file-item.c */
+struct btrfs_dio_private;
 int btrfs_del_csums(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, u64 bytenr, u64 len);
 int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 			  struct bio *bio, u32 *dst);
 int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
-			      struct bio *bio, u64 logical_offset);
+			      struct btrfs_dio_private *dip, struct bio *bio,
+			      u64 logical_offset);
 int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     u64 objectid, u64 pos,
@@ -3552,8 +3644,7 @@ void btrfs_wait_and_free_delalloc_work(struct btrfs_delalloc_work *work);
 struct extent_map *btrfs_get_extent_fiemap(struct inode *inode, struct page *page,
 					   size_t pg_offset, u64 start, u64 len,
 					   int create);
-noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
-			      struct inode *inode, u64 offset, u64 *len,
+noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 			      u64 *orig_start, u64 *orig_block_len,
 			      u64 *ram_bytes);
 
@@ -3643,11 +3734,15 @@ extern const struct dentry_operations btrfs_dentry_operations;
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 void btrfs_update_iflags(struct inode *inode);
 void btrfs_inherit_iflags(struct inode *inode, struct inode *dir);
+int btrfs_is_empty_uuid(u8 *uuid);
 int btrfs_defrag_file(struct inode *inode, struct file *file,
 		      struct btrfs_ioctl_defrag_range_args *range,
 		      u64 newer_than, unsigned long max_pages);
 void btrfs_get_block_group_info(struct list_head *groups_list,
 				struct btrfs_ioctl_space_info *space);
+void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
+			       struct btrfs_ioctl_balance_args *bargs);
+
 
 /* file.c */
 int btrfs_auto_defrag_init(void);
@@ -3720,6 +3815,22 @@ void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
 #define btrfs_debug(fs_info, fmt, args...) \
 	btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
 
+#ifdef CONFIG_BTRFS_ASSERT
+
+static inline void assfail(char *expr, char *file, int line)
+{
+	printk(KERN_ERR "BTRFS assertion failed: %s, file: %s, line: %d",
+	       expr, file, line);
+	BUG();
+}
+
+#define ASSERT(expr)	\
+	(likely(expr) ? (void)0 : assfail(#expr, __FILE__, __LINE__))
+#else
+#define ASSERT(expr)	((void)0)
+#endif
+
+#define btrfs_assert()
 __printf(5, 6)
 void __btrfs_std_error(struct btrfs_fs_info *fs_info, const char *function,
 		     unsigned int line, int errno, const char *fmt, ...);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 375510913fe7..cbd9523ad09c 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -21,6 +21,7 @@
 #include "delayed-inode.h"
 #include "disk-io.h"
 #include "transaction.h"
+#include "ctree.h"
 
 #define BTRFS_DELAYED_WRITEBACK		512
 #define BTRFS_DELAYED_BACKGROUND	128
@@ -1453,10 +1454,10 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
 
 	dir_item = (struct btrfs_dir_item *)delayed_item->data;
 	dir_item->location = *disk_key;
-	dir_item->transid = cpu_to_le64(trans->transid);
-	dir_item->data_len = 0;
-	dir_item->name_len = cpu_to_le16(name_len);
-	dir_item->type = type;
+	btrfs_set_stack_dir_transid(dir_item, trans->transid);
+	btrfs_set_stack_dir_data_len(dir_item, 0);
+	btrfs_set_stack_dir_name_len(dir_item, name_len);
+	btrfs_set_stack_dir_type(dir_item, type);
 	memcpy((char *)(dir_item + 1), name, name_len);
 
 	ret = btrfs_delayed_item_reserve_metadata(trans, root, delayed_item);
@@ -1470,13 +1471,11 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
 	mutex_lock(&delayed_node->mutex);
 	ret = __btrfs_add_delayed_insertion_item(delayed_node, delayed_item);
 	if (unlikely(ret)) {
-		printk(KERN_ERR "err add delayed dir index item(name: %s) into "
-				"the insertion tree of the delayed node"
+		printk(KERN_ERR "err add delayed dir index item(name: %.*s) "
+				"into the insertion tree of the delayed node"
 				"(root id: %llu, inode id: %llu, errno: %d)\n",
-				name,
-				(unsigned long long)delayed_node->root->objectid,
-				(unsigned long long)delayed_node->inode_id,
-				ret);
+				name_len, name, delayed_node->root->objectid,
+				delayed_node->inode_id, ret);
 		BUG();
 	}
 	mutex_unlock(&delayed_node->mutex);
@@ -1547,9 +1546,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,
 		printk(KERN_ERR "err add delayed dir index item(index: %llu) "
 				"into the deletion tree of the delayed node"
 				"(root id: %llu, inode id: %llu, errno: %d)\n",
-				(unsigned long long)index,
-				(unsigned long long)node->root->objectid,
-				(unsigned long long)node->inode_id,
+				index, node->root->objectid, node->inode_id,
 				ret);
 		BUG();
 	}
@@ -1699,7 +1696,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
 
 		di = (struct btrfs_dir_item *)curr->data;
 		name = (char *)(di + 1);
-		name_len = le16_to_cpu(di->name_len);
+		name_len = btrfs_stack_dir_name_len(di);
 
 		d_type = btrfs_filetype_table[di->type];
 		btrfs_disk_key_to_cpu(&location, &di->location);
@@ -1716,27 +1713,6 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
 	return 0;
 }
 
-BTRFS_SETGET_STACK_FUNCS(stack_inode_generation, struct btrfs_inode_item,
-			 generation, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_sequence, struct btrfs_inode_item,
-			 sequence, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_transid, struct btrfs_inode_item,
-			 transid, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_size, struct btrfs_inode_item, size, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_nbytes, struct btrfs_inode_item,
-			 nbytes, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_block_group, struct btrfs_inode_item,
-			 block_group, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_nlink, struct btrfs_inode_item, nlink, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_uid, struct btrfs_inode_item, uid, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_gid, struct btrfs_inode_item, gid, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_mode, struct btrfs_inode_item, mode, 32);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_rdev, struct btrfs_inode_item, rdev, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_inode_flags, struct btrfs_inode_item, flags, 64);
-
-BTRFS_SETGET_STACK_FUNCS(stack_timespec_sec, struct btrfs_timespec, sec, 64);
-BTRFS_SETGET_STACK_FUNCS(stack_timespec_nsec, struct btrfs_timespec, nsec, 32);
-
 static void fill_stack_inode_item(struct btrfs_trans_handle *trans,
 				  struct btrfs_inode_item *inode_item,
 				  struct inode *inode)
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index c219463fb1fd..e4d467be2dd4 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -241,7 +241,7 @@ int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-static void inline drop_delayed_ref(struct btrfs_trans_handle *trans,
+static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
 				    struct btrfs_delayed_ref_root *delayed_refs,
 				    struct btrfs_delayed_ref_node *ref)
 {
@@ -600,7 +600,7 @@ static noinline void add_delayed_ref_head(struct btrfs_fs_info *fs_info,
 	INIT_LIST_HEAD(&head_ref->cluster);
 	mutex_init(&head_ref->mutex);
 
-	trace_btrfs_delayed_ref_head(ref, head_ref, action);
+	trace_add_delayed_ref_head(ref, head_ref, action);
 
 	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
 
@@ -661,7 +661,7 @@ static noinline void add_delayed_tree_ref(struct btrfs_fs_info *fs_info,
 		ref->type = BTRFS_TREE_BLOCK_REF_KEY;
 	full_ref->level = level;
 
-	trace_btrfs_delayed_tree_ref(ref, full_ref, action);
+	trace_add_delayed_tree_ref(ref, full_ref, action);
 
 	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
 
@@ -722,7 +722,7 @@ static noinline void add_delayed_data_ref(struct btrfs_fs_info *fs_info,
 	full_ref->objectid = owner;
 	full_ref->offset = offset;
 
-	trace_btrfs_delayed_data_ref(ref, full_ref, action);
+	trace_add_delayed_data_ref(ref, full_ref, action);
 
 	existing = tree_insert(&delayed_refs->root, &ref->rb_node);
 
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 5f8f3341c099..a64435359385 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -148,13 +148,13 @@ no_valid_dev_replace_entry_found:
 		    !btrfs_test_opt(dev_root, DEGRADED)) {
 			ret = -EIO;
 			pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "srcdev (devid %llu) is missing, need to run 'btrfs dev scan'?\n",
-				(unsigned long long)src_devid);
+				src_devid);
 		}
 		if (!dev_replace->tgtdev &&
 		    !btrfs_test_opt(dev_root, DEGRADED)) {
 			ret = -EIO;
 			pr_warn("btrfs: cannot mount because device replace operation is ongoing and\n" "tgtdev (devid %llu) is missing, need to run btrfs dev scan?\n",
-				(unsigned long long)BTRFS_DEV_REPLACE_DEVID);
+				BTRFS_DEV_REPLACE_DEVID);
 		}
 		if (dev_replace->tgtdev) {
 			if (dev_replace->srcdev) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6b092a1c4e37..4cbb00af92ff 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -31,6 +31,7 @@
 #include <linux/migrate.h>
 #include <linux/ratelimit.h>
 #include <linux/uuid.h>
+#include <linux/semaphore.h>
 #include <asm/unaligned.h>
 #include "compat.h"
 #include "ctree.h"
@@ -302,9 +303,8 @@ static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf,
 			printk_ratelimited(KERN_INFO "btrfs: %s checksum verify "
 				       "failed on %llu wanted %X found %X "
 				       "level %d\n",
-				       root->fs_info->sb->s_id,
-				       (unsigned long long)buf->start, val, found,
-				       btrfs_header_level(buf));
+				       root->fs_info->sb->s_id, buf->start,
+				       val, found, btrfs_header_level(buf));
 			if (result != (char *)&inline_result)
 				kfree(result);
 			return 1;
@@ -345,9 +345,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree,
 	}
 	printk_ratelimited("parent transid verify failed on %llu wanted %llu "
 		       "found %llu\n",
-		       (unsigned long long)eb->start,
-		       (unsigned long long)parent_transid,
-		       (unsigned long long)btrfs_header_generation(eb));
+		       eb->start, parent_transid, btrfs_header_generation(eb));
 	ret = 1;
 	clear_extent_buffer_uptodate(eb);
 out:
@@ -497,8 +495,7 @@ static int check_tree_block_fsid(struct btrfs_root *root,
 	u8 fsid[BTRFS_UUID_SIZE];
 	int ret = 1;
 
-	read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb),
-			   BTRFS_FSID_SIZE);
+	read_extent_buffer(eb, fsid, btrfs_header_fsid(eb), BTRFS_FSID_SIZE);
 	while (fs_devices) {
 		if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) {
 			ret = 0;
@@ -512,8 +509,7 @@ static int check_tree_block_fsid(struct btrfs_root *root,
 #define CORRUPT(reason, eb, root, slot)				\
 	printk(KERN_CRIT "btrfs: corrupt leaf, %s: block=%llu,"	\
 	       "root=%llu, slot=%d\n", reason,			\
-	       (unsigned long long)btrfs_header_bytenr(eb),	\
-	       (unsigned long long)root->objectid, slot)
+	       btrfs_header_bytenr(eb),	root->objectid, slot)
 
 static noinline int check_leaf(struct btrfs_root *root,
 			       struct extent_buffer *leaf)
@@ -576,8 +572,9 @@ static noinline int check_leaf(struct btrfs_root *root,
 	return 0;
 }
 
-static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
-			       struct extent_state *state, int mirror)
+static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
+				      u64 phy_offset, struct page *page,
+				      u64 start, u64 end, int mirror)
 {
 	struct extent_io_tree *tree;
 	u64 found_start;
@@ -612,14 +609,13 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 	if (found_start != eb->start) {
 		printk_ratelimited(KERN_INFO "btrfs bad tree block start "
 			       "%llu %llu\n",
-			       (unsigned long long)found_start,
-			       (unsigned long long)eb->start);
+			       found_start, eb->start);
 		ret = -EIO;
 		goto err;
 	}
 	if (check_tree_block_fsid(root, eb)) {
 		printk_ratelimited(KERN_INFO "btrfs bad fsid on block %llu\n",
-			       (unsigned long long)eb->start);
+			       eb->start);
 		ret = -EIO;
 		goto err;
 	}
@@ -1148,6 +1144,10 @@ struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 		return NULL;
 
 	ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid);
+	if (ret) {
+		free_extent_buffer(buf);
+		return NULL;
+	}
 	return buf;
 
 }
@@ -1291,11 +1291,10 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
 	btrfs_set_header_owner(leaf, objectid);
 	root->node = leaf;
 
-	write_extent_buffer(leaf, fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(leaf),
+	write_extent_buffer(leaf, fs_info->fsid, btrfs_header_fsid(leaf),
 			    BTRFS_FSID_SIZE);
 	write_extent_buffer(leaf, fs_info->chunk_tree_uuid,
-			    (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
+			    btrfs_header_chunk_tree_uuid(leaf),
 			    BTRFS_UUID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
 
@@ -1379,8 +1378,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans,
 	root->node = leaf;
 
 	write_extent_buffer(root->node, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(root->node),
-			    BTRFS_FSID_SIZE);
+			    btrfs_header_fsid(root->node), BTRFS_FSID_SIZE);
 	btrfs_mark_buffer_dirty(root->node);
 	btrfs_tree_unlock(root->node);
 	return root;
@@ -1413,11 +1411,11 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	log_root->root_key.offset = root->root_key.objectid;
 
 	inode_item = &log_root->root_item.inode;
-	inode_item->generation = cpu_to_le64(1);
-	inode_item->size = cpu_to_le64(3);
-	inode_item->nlink = cpu_to_le32(1);
-	inode_item->nbytes = cpu_to_le64(root->leafsize);
-	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+	btrfs_set_stack_inode_generation(inode_item, 1);
+	btrfs_set_stack_inode_size(inode_item, 3);
+	btrfs_set_stack_inode_nlink(inode_item, 1);
+	btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
 
 	btrfs_set_root_node(&log_root->root_item, log_root->node);
 
@@ -1428,8 +1426,8 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
 	return 0;
 }
 
-struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
-					struct btrfs_key *key)
+static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root,
+					       struct btrfs_key *key)
 {
 	struct btrfs_root *root;
 	struct btrfs_fs_info *fs_info = tree_root->fs_info;
@@ -1529,8 +1527,8 @@ fail:
 	return ret;
 }
 
-struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
-					u64 root_id)
+static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+					       u64 root_id)
 {
 	struct btrfs_root *root;
 
@@ -1581,10 +1579,16 @@ struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 	if (location->objectid == BTRFS_QUOTA_TREE_OBJECTID)
 		return fs_info->quota_root ? fs_info->quota_root :
 					     ERR_PTR(-ENOENT);
+	if (location->objectid == BTRFS_UUID_TREE_OBJECTID)
+		return fs_info->uuid_root ? fs_info->uuid_root :
+					    ERR_PTR(-ENOENT);
 again:
 	root = btrfs_lookup_fs_root(fs_info, location->objectid);
-	if (root)
+	if (root) {
+		if (btrfs_root_refs(&root->root_item) == 0)
+			return ERR_PTR(-ENOENT);
 		return root;
+	}
 
 	root = btrfs_read_fs_root(fs_info->tree_root, location);
 	if (IS_ERR(root))
@@ -1737,7 +1741,7 @@ static int transaction_kthread(void *arg)
 
 	do {
 		cannot_commit = false;
-		delay = HZ * 30;
+		delay = HZ * root->fs_info->commit_interval;
 		mutex_lock(&root->fs_info->transaction_kthread_mutex);
 
 		spin_lock(&root->fs_info->trans_lock);
@@ -1749,7 +1753,8 @@ static int transaction_kthread(void *arg)
 
 		now = get_seconds();
 		if (cur->state < TRANS_STATE_BLOCKED &&
-		    (now < cur->start_time || now - cur->start_time < 30)) {
+		    (now < cur->start_time ||
+		     now - cur->start_time < root->fs_info->commit_interval)) {
 			spin_unlock(&root->fs_info->trans_lock);
 			delay = HZ * 5;
 			goto sleep;
@@ -2038,6 +2043,12 @@ static void free_root_pointers(struct btrfs_fs_info *info, int chunk_root)
 		info->quota_root->node = NULL;
 		info->quota_root->commit_root = NULL;
 	}
+	if (info->uuid_root) {
+		free_extent_buffer(info->uuid_root->node);
+		free_extent_buffer(info->uuid_root->commit_root);
+		info->uuid_root->node = NULL;
+		info->uuid_root->commit_root = NULL;
+	}
 	if (chunk_root) {
 		free_extent_buffer(info->chunk_root->node);
 		free_extent_buffer(info->chunk_root->commit_root);
@@ -2098,11 +2109,14 @@ int open_ctree(struct super_block *sb,
 	struct btrfs_root *chunk_root;
 	struct btrfs_root *dev_root;
 	struct btrfs_root *quota_root;
+	struct btrfs_root *uuid_root;
 	struct btrfs_root *log_tree_root;
 	int ret;
 	int err = -EINVAL;
 	int num_backups_tried = 0;
 	int backup_index = 0;
+	bool create_uuid_tree;
+	bool check_uuid_tree;
 
 	tree_root = fs_info->tree_root = btrfs_alloc_root(fs_info);
 	chunk_root = fs_info->chunk_root = btrfs_alloc_root(fs_info);
@@ -2189,6 +2203,7 @@ int open_ctree(struct super_block *sb,
 	fs_info->defrag_inodes = RB_ROOT;
 	fs_info->free_chunk_space = 0;
 	fs_info->tree_mod_log = RB_ROOT;
+	fs_info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
 
 	/* readahead state */
 	INIT_RADIX_TREE(&fs_info->reada_tree, GFP_NOFS & ~__GFP_WAIT);
@@ -2270,6 +2285,7 @@ int open_ctree(struct super_block *sb,
 
 
 	mutex_init(&fs_info->ordered_operations_mutex);
+	mutex_init(&fs_info->ordered_extent_flush_mutex);
 	mutex_init(&fs_info->tree_log_mutex);
 	mutex_init(&fs_info->chunk_mutex);
 	mutex_init(&fs_info->transaction_kthread_mutex);
@@ -2278,6 +2294,7 @@ int open_ctree(struct super_block *sb,
 	init_rwsem(&fs_info->extent_commit_sem);
 	init_rwsem(&fs_info->cleanup_work_sem);
 	init_rwsem(&fs_info->subvol_sem);
+	sema_init(&fs_info->uuid_tree_rescan_sem, 1);
 	fs_info->dev_replace.lock_owner = 0;
 	atomic_set(&fs_info->dev_replace.nesting_level, 0);
 	mutex_init(&fs_info->dev_replace.lock_finishing_cancel_unmount);
@@ -2383,7 +2400,7 @@ int open_ctree(struct super_block *sb,
 	if (features) {
 		printk(KERN_ERR "BTRFS: couldn't mount because of "
 		       "unsupported optional features (%Lx).\n",
-		       (unsigned long long)features);
+		       features);
 		err = -EINVAL;
 		goto fail_alloc;
 	}
@@ -2453,7 +2470,7 @@ int open_ctree(struct super_block *sb,
 	if (!(sb->s_flags & MS_RDONLY) && features) {
 		printk(KERN_ERR "BTRFS: couldn't mount RDWR because of "
 		       "unsupported option features (%Lx).\n",
-		       (unsigned long long)features);
+		       features);
 		err = -EINVAL;
 		goto fail_alloc;
 	}
@@ -2466,20 +2483,17 @@ int open_ctree(struct super_block *sb,
 			   &fs_info->generic_worker);
 
 	btrfs_init_workers(&fs_info->delalloc_workers, "delalloc",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
+			   fs_info->thread_pool_size, NULL);
 
 	btrfs_init_workers(&fs_info->flush_workers, "flush_delalloc",
-			   fs_info->thread_pool_size,
-			   &fs_info->generic_worker);
+			   fs_info->thread_pool_size, NULL);
 
 	btrfs_init_workers(&fs_info->submit_workers, "submit",
 			   min_t(u64, fs_devices->num_devices,
-			   fs_info->thread_pool_size),
-			   &fs_info->generic_worker);
+			   fs_info->thread_pool_size), NULL);
 
 	btrfs_init_workers(&fs_info->caching_workers, "cache",
-			   2, &fs_info->generic_worker);
+			   fs_info->thread_pool_size, NULL);
 
 	/* a higher idle thresh on the submit workers makes it much more
 	 * likely that bios will be send down in a sane order to the
@@ -2575,7 +2589,7 @@ int open_ctree(struct super_block *sb,
 	sb->s_blocksize = sectorsize;
 	sb->s_blocksize_bits = blksize_bits(sectorsize);
 
-	if (disk_super->magic != cpu_to_le64(BTRFS_MAGIC)) {
+	if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
 		printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id);
 		goto fail_sb_buffer;
 	}
@@ -2615,8 +2629,7 @@ int open_ctree(struct super_block *sb,
 	chunk_root->commit_root = btrfs_root_node(chunk_root);
 
 	read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid,
-	   (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node),
-	   BTRFS_UUID_SIZE);
+	   btrfs_header_chunk_tree_uuid(chunk_root->node), BTRFS_UUID_SIZE);
 
 	ret = btrfs_read_chunk_tree(chunk_root);
 	if (ret) {
@@ -2696,6 +2709,22 @@ retry_root_backup:
 		fs_info->quota_root = quota_root;
 	}
 
+	location.objectid = BTRFS_UUID_TREE_OBJECTID;
+	uuid_root = btrfs_read_tree_root(tree_root, &location);
+	if (IS_ERR(uuid_root)) {
+		ret = PTR_ERR(uuid_root);
+		if (ret != -ENOENT)
+			goto recovery_tree_root;
+		create_uuid_tree = true;
+		check_uuid_tree = false;
+	} else {
+		uuid_root->track_dirty = 1;
+		fs_info->uuid_root = uuid_root;
+		create_uuid_tree = false;
+		check_uuid_tree =
+		    generation != btrfs_super_uuid_tree_generation(disk_super);
+	}
+
 	fs_info->generation = generation;
 	fs_info->last_trans_committed = generation;
 
@@ -2882,6 +2911,29 @@ retry_root_backup:
 
 	btrfs_qgroup_rescan_resume(fs_info);
 
+	if (create_uuid_tree) {
+		pr_info("btrfs: creating UUID tree\n");
+		ret = btrfs_create_uuid_tree(fs_info);
+		if (ret) {
+			pr_warn("btrfs: failed to create the UUID tree %d\n",
+				ret);
+			close_ctree(tree_root);
+			return ret;
+		}
+	} else if (check_uuid_tree ||
+		   btrfs_test_opt(tree_root, RESCAN_UUID_TREE)) {
+		pr_info("btrfs: checking UUID tree\n");
+		ret = btrfs_check_uuid_tree(fs_info);
+		if (ret) {
+			pr_warn("btrfs: failed to check the UUID tree %d\n",
+				ret);
+			close_ctree(tree_root);
+			return ret;
+		}
+	} else {
+		fs_info->update_uuid_tree_gen = 1;
+	}
+
 	return 0;
 
 fail_qgroup:
@@ -2983,15 +3035,17 @@ struct buffer_head *btrfs_read_dev_super(struct block_device *bdev)
 	 */
 	for (i = 0; i < 1; i++) {
 		bytenr = btrfs_sb_offset(i);
-		if (bytenr + 4096 >= i_size_read(bdev->bd_inode))
+		if (bytenr + BTRFS_SUPER_INFO_SIZE >=
+					i_size_read(bdev->bd_inode))
 			break;
-		bh = __bread(bdev, bytenr / 4096, 4096);
+		bh = __bread(bdev, bytenr / 4096,
+					BTRFS_SUPER_INFO_SIZE);
 		if (!bh)
 			continue;
 
 		super = (struct btrfs_super_block *)bh->b_data;
 		if (btrfs_super_bytenr(super) != bytenr ||
-		    super->magic != cpu_to_le64(BTRFS_MAGIC)) {
+		    btrfs_super_magic(super) != BTRFS_MAGIC) {
 			brelse(bh);
 			continue;
 		}
@@ -3311,7 +3365,6 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 	int total_errors = 0;
 	u64 flags;
 
-	max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
 	do_barriers = !btrfs_test_opt(root, NOBARRIER);
 	backup_super_roots(root->fs_info);
 
@@ -3320,6 +3373,7 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	head = &root->fs_info->fs_devices->devices;
+	max_errors = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
 
 	if (do_barriers) {
 		ret = barrier_all_devices(root->fs_info);
@@ -3362,8 +3416,10 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors)
 		printk(KERN_ERR "btrfs: %d errors while writing supers\n",
 		       total_errors);
 
-		/* This shouldn't happen. FUA is masked off if unsupported */
-		BUG();
+		/* FUA is masked off if unsupported and can't be the reason */
+		btrfs_error(root->fs_info, -EIO,
+			    "%d errors while writing supers", total_errors);
+		return -EIO;
 	}
 
 	total_errors = 0;
@@ -3421,6 +3477,8 @@ static void free_fs_root(struct btrfs_root *root)
 {
 	iput(root->cache_inode);
 	WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree));
+	btrfs_free_block_rsv(root, root->orphan_block_rsv);
+	root->orphan_block_rsv = NULL;
 	if (root->anon_dev)
 		free_anon_bdev(root->anon_dev);
 	free_extent_buffer(root->node);
@@ -3510,6 +3568,11 @@ int close_ctree(struct btrfs_root *root)
 	fs_info->closing = 1;
 	smp_mb();
 
+	/* wait for the uuid_scan task to finish */
+	down(&fs_info->uuid_tree_rescan_sem);
+	/* avoid complains from lockdep et al., set sem back to initial state */
+	up(&fs_info->uuid_tree_rescan_sem);
+
 	/* pause restriper - we want to resume on mount */
 	btrfs_pause_balance(fs_info);
 
@@ -3573,6 +3636,9 @@ int close_ctree(struct btrfs_root *root)
 
 	btrfs_free_stripe_hash_table(fs_info);
 
+	btrfs_free_block_rsv(root, root->orphan_block_rsv);
+	root->orphan_block_rsv = NULL;
+
 	return 0;
 }
 
@@ -3608,9 +3674,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 	if (transid != root->fs_info->generation)
 		WARN(1, KERN_CRIT "btrfs transid mismatch buffer %llu, "
 		       "found %llu running %llu\n",
-			(unsigned long long)buf->start,
-			(unsigned long long)transid,
-			(unsigned long long)root->fs_info->generation);
+			buf->start, transid, root->fs_info->generation);
 	was_dirty = set_extent_buffer_dirty(buf);
 	if (!was_dirty)
 		__percpu_counter_add(&root->fs_info->dirty_metadata_bytes,
@@ -3744,8 +3808,8 @@ static void btrfs_destroy_all_ordered_extents(struct btrfs_fs_info *fs_info)
 	spin_unlock(&fs_info->ordered_root_lock);
 }
 
-int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
-			       struct btrfs_root *root)
+static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans,
+				      struct btrfs_root *root)
 {
 	struct rb_node *node;
 	struct btrfs_delayed_ref_root *delayed_refs;
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 1204c8ef6f32..cfb3cf711b34 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -113,7 +113,8 @@ static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
 {
 	smp_mb();
-	return cache->cached == BTRFS_CACHE_FINISHED;
+	return cache->cached == BTRFS_CACHE_FINISHED ||
+		cache->cached == BTRFS_CACHE_ERROR;
 }
 
 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
@@ -389,7 +390,7 @@ static noinline void caching_thread(struct btrfs_work *work)
 	u64 total_found = 0;
 	u64 last = 0;
 	u32 nritems;
-	int ret = 0;
+	int ret = -ENOMEM;
 
 	caching_ctl = container_of(work, struct btrfs_caching_control, work);
 	block_group = caching_ctl->block_group;
@@ -420,6 +421,7 @@ again:
 	/* need to make sure the commit_root doesn't disappear */
 	down_read(&fs_info->extent_commit_sem);
 
+next:
 	ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto err;
@@ -459,6 +461,16 @@ again:
 			continue;
 		}
 
+		if (key.objectid < last) {
+			key.objectid = last;
+			key.offset = 0;
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+
+			caching_ctl->progress = last;
+			btrfs_release_path(path);
+			goto next;
+		}
+
 		if (key.objectid < block_group->key.objectid) {
 			path->slots[0]++;
 			continue;
@@ -506,6 +518,12 @@ err:
 
 	mutex_unlock(&caching_ctl->mutex);
 out:
+	if (ret) {
+		spin_lock(&block_group->lock);
+		block_group->caching_ctl = NULL;
+		block_group->cached = BTRFS_CACHE_ERROR;
+		spin_unlock(&block_group->lock);
+	}
 	wake_up(&caching_ctl->wait);
 
 	put_caching_control(caching_ctl);
@@ -771,10 +789,23 @@ again:
 		goto out_free;
 
 	if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
-		key.type = BTRFS_EXTENT_ITEM_KEY;
-		key.offset = root->leafsize;
-		btrfs_release_path(path);
-		goto again;
+		metadata = 0;
+		if (path->slots[0]) {
+			path->slots[0]--;
+			btrfs_item_key_to_cpu(path->nodes[0], &key,
+					      path->slots[0]);
+			if (key.objectid == bytenr &&
+			    key.type == BTRFS_EXTENT_ITEM_KEY &&
+			    key.offset == root->leafsize)
+				ret = 0;
+		}
+		if (ret) {
+			key.objectid = bytenr;
+			key.type = BTRFS_EXTENT_ITEM_KEY;
+			key.offset = root->leafsize;
+			btrfs_release_path(path);
+			goto again;
+		}
 	}
 
 	if (ret == 0) {
@@ -2011,6 +2042,8 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
 	ins.type = BTRFS_EXTENT_ITEM_KEY;
 
 	ref = btrfs_delayed_node_to_data_ref(node);
+	trace_run_delayed_data_ref(node, ref, node->action);
+
 	if (node->type == BTRFS_SHARED_DATA_REF_KEY)
 		parent = ref->parent;
 	else
@@ -2154,6 +2187,8 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
 						 SKINNY_METADATA);
 
 	ref = btrfs_delayed_node_to_tree_ref(node);
+	trace_run_delayed_tree_ref(node, ref, node->action);
+
 	if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
 		parent = ref->parent;
 	else
@@ -2212,6 +2247,8 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
 		 */
 		BUG_ON(extent_op);
 		head = btrfs_delayed_node_to_head(node);
+		trace_run_delayed_ref_head(node, head, node->action);
+
 		if (insert_reserved) {
 			btrfs_pin_extent(root, node->bytenr,
 					 node->num_bytes, 1);
@@ -2403,6 +2440,8 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 			default:
 				WARN_ON(1);
 			}
+		} else {
+			list_del_init(&locked_ref->cluster);
 		}
 		spin_unlock(&delayed_refs->lock);
 
@@ -2425,7 +2464,6 @@ static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
 		 * list before we release it.
 		 */
 		if (btrfs_delayed_ref_is_head(ref)) {
-			list_del_init(&locked_ref->cluster);
 			btrfs_delayed_ref_unlock(locked_ref);
 			locked_ref = NULL;
 		}
@@ -3799,8 +3837,12 @@ again:
 	if (force < space_info->force_alloc)
 		force = space_info->force_alloc;
 	if (space_info->full) {
+		if (should_alloc_chunk(extent_root, space_info, force))
+			ret = -ENOSPC;
+		else
+			ret = 0;
 		spin_unlock(&space_info->lock);
-		return 0;
+		return ret;
 	}
 
 	if (!should_alloc_chunk(extent_root, space_info, force)) {
@@ -4320,6 +4362,9 @@ static struct btrfs_block_rsv *get_block_rsv(
 	if (root == root->fs_info->csum_root && trans->adding_csums)
 		block_rsv = trans->block_rsv;
 
+	if (root == root->fs_info->uuid_root)
+		block_rsv = trans->block_rsv;
+
 	if (!block_rsv)
 		block_rsv = root->block_rsv;
 
@@ -4729,10 +4774,12 @@ void btrfs_orphan_release_metadata(struct inode *inode)
 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 				     struct btrfs_block_rsv *rsv,
 				     int items,
-				     u64 *qgroup_reserved)
+				     u64 *qgroup_reserved,
+				     bool use_global_rsv)
 {
 	u64 num_bytes;
 	int ret;
+	struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
 
 	if (root->fs_info->quota_enabled) {
 		/* One for parent inode, two for dir entries */
@@ -4751,6 +4798,10 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
 					    BTRFS_BLOCK_GROUP_METADATA);
 	ret = btrfs_block_rsv_add(root, rsv, num_bytes,
 				  BTRFS_RESERVE_FLUSH_ALL);
+
+	if (ret == -ENOSPC && use_global_rsv)
+		ret = btrfs_block_rsv_migrate(global_rsv, rsv, num_bytes);
+
 	if (ret) {
 		if (*qgroup_reserved)
 			btrfs_qgroup_free(root, *qgroup_reserved);
@@ -5668,7 +5719,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 
 			if (ret) {
 				btrfs_err(info, "umm, got %d back from search, was looking for %llu",
-					ret, (unsigned long long)bytenr);
+					ret, bytenr);
 				if (ret > 0)
 					btrfs_print_leaf(extent_root,
 							 path->nodes[0]);
@@ -5684,11 +5735,8 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 		WARN_ON(1);
 		btrfs_err(info,
 			"unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
-			(unsigned long long)bytenr,
-			(unsigned long long)parent,
-			(unsigned long long)root_objectid,
-			(unsigned long long)owner_objectid,
-			(unsigned long long)owner_offset);
+			bytenr, parent, root_objectid, owner_objectid,
+			owner_offset);
 	} else {
 		btrfs_abort_transaction(trans, extent_root, ret);
 		goto out;
@@ -5717,7 +5765,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
 					-1, 1);
 		if (ret) {
 			btrfs_err(info, "umm, got %d back from search, was looking for %llu",
-				ret, (unsigned long long)bytenr);
+				ret, bytenr);
 			btrfs_print_leaf(extent_root, path->nodes[0]);
 		}
 		if (ret < 0) {
@@ -5999,8 +6047,11 @@ static u64 stripe_align(struct btrfs_root *root,
  * for our min num_bytes.  Another option is to have it go ahead
  * and look in the rbtree for a free extent of a given size, but this
  * is a good start.
+ *
+ * Callers of this must check if cache->cached == BTRFS_CACHE_ERROR before using
+ * any of the information in this block group.
  */
-static noinline int
+static noinline void
 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
 				u64 num_bytes)
 {
@@ -6008,28 +6059,29 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
 
 	caching_ctl = get_caching_control(cache);
 	if (!caching_ctl)
-		return 0;
+		return;
 
 	wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
 		   (cache->free_space_ctl->free_space >= num_bytes));
 
 	put_caching_control(caching_ctl);
-	return 0;
 }
 
 static noinline int
 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 {
 	struct btrfs_caching_control *caching_ctl;
+	int ret = 0;
 
 	caching_ctl = get_caching_control(cache);
 	if (!caching_ctl)
-		return 0;
+		return (cache->cached == BTRFS_CACHE_ERROR) ? -EIO : 0;
 
 	wait_event(caching_ctl->wait, block_group_cache_done(cache));
-
+	if (cache->cached == BTRFS_CACHE_ERROR)
+		ret = -EIO;
 	put_caching_control(caching_ctl);
-	return 0;
+	return ret;
 }
 
 int __get_raid_index(u64 flags)
@@ -6070,8 +6122,7 @@ enum btrfs_loop_type {
  * ins->offset == number of blocks
  * Any available blocks before search_start are skipped.
  */
-static noinline int find_free_extent(struct btrfs_trans_handle *trans,
-				     struct btrfs_root *orig_root,
+static noinline int find_free_extent(struct btrfs_root *orig_root,
 				     u64 num_bytes, u64 empty_size,
 				     u64 hint_byte, struct btrfs_key *ins,
 				     u64 flags)
@@ -6212,6 +6263,8 @@ have_block_group:
 			ret = 0;
 		}
 
+		if (unlikely(block_group->cached == BTRFS_CACHE_ERROR))
+			goto loop;
 		if (unlikely(block_group->ro))
 			goto loop;
 
@@ -6292,10 +6345,10 @@ refill_cluster:
 					      block_group->full_stripe_len);
 
 			/* allocate a cluster in this block group */
-			ret = btrfs_find_space_cluster(trans, root,
-					       block_group, last_ptr,
-					       search_start, num_bytes,
-					       aligned_cluster);
+			ret = btrfs_find_space_cluster(root, block_group,
+						       last_ptr, search_start,
+						       num_bytes,
+						       aligned_cluster);
 			if (ret == 0) {
 				/*
 				 * now pull our allocation out of this
@@ -6426,17 +6479,28 @@ loop:
 		index = 0;
 		loop++;
 		if (loop == LOOP_ALLOC_CHUNK) {
+			struct btrfs_trans_handle *trans;
+
+			trans = btrfs_join_transaction(root);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				goto out;
+			}
+
 			ret = do_chunk_alloc(trans, root, flags,
 					     CHUNK_ALLOC_FORCE);
 			/*
 			 * Do not bail out on ENOSPC since we
 			 * can do more things.
 			 */
-			if (ret < 0 && ret != -ENOSPC) {
+			if (ret < 0 && ret != -ENOSPC)
 				btrfs_abort_transaction(trans,
 							root, ret);
+			else
+				ret = 0;
+			btrfs_end_transaction(trans, root);
+			if (ret)
 				goto out;
-			}
 		}
 
 		if (loop == LOOP_NO_EMPTY_SIZE) {
@@ -6463,19 +6527,15 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
 
 	spin_lock(&info->lock);
 	printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
-	       (unsigned long long)info->flags,
-	       (unsigned long long)(info->total_bytes - info->bytes_used -
-				    info->bytes_pinned - info->bytes_reserved -
-				    info->bytes_readonly),
+	       info->flags,
+	       info->total_bytes - info->bytes_used - info->bytes_pinned -
+	       info->bytes_reserved - info->bytes_readonly,
 	       (info->full) ? "" : "not ");
 	printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
 	       "reserved=%llu, may_use=%llu, readonly=%llu\n",
-	       (unsigned long long)info->total_bytes,
-	       (unsigned long long)info->bytes_used,
-	       (unsigned long long)info->bytes_pinned,
-	       (unsigned long long)info->bytes_reserved,
-	       (unsigned long long)info->bytes_may_use,
-	       (unsigned long long)info->bytes_readonly);
+	       info->total_bytes, info->bytes_used, info->bytes_pinned,
+	       info->bytes_reserved, info->bytes_may_use,
+	       info->bytes_readonly);
 	spin_unlock(&info->lock);
 
 	if (!dump_block_groups)
@@ -6486,12 +6546,9 @@ again:
 	list_for_each_entry(cache, &info->block_groups[index], list) {
 		spin_lock(&cache->lock);
 		printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
-		       (unsigned long long)cache->key.objectid,
-		       (unsigned long long)cache->key.offset,
-		       (unsigned long long)btrfs_block_group_used(&cache->item),
-		       (unsigned long long)cache->pinned,
-		       (unsigned long long)cache->reserved,
-		       cache->ro ? "[readonly]" : "");
+		       cache->key.objectid, cache->key.offset,
+		       btrfs_block_group_used(&cache->item), cache->pinned,
+		       cache->reserved, cache->ro ? "[readonly]" : "");
 		btrfs_dump_free_space(cache, bytes);
 		spin_unlock(&cache->lock);
 	}
@@ -6500,8 +6557,7 @@ again:
 	up_read(&info->groups_sem);
 }
 
-int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
-			 struct btrfs_root *root,
+int btrfs_reserve_extent(struct btrfs_root *root,
 			 u64 num_bytes, u64 min_alloc_size,
 			 u64 empty_size, u64 hint_byte,
 			 struct btrfs_key *ins, int is_data)
@@ -6513,8 +6569,8 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
 	flags = btrfs_get_alloc_profile(root, is_data);
 again:
 	WARN_ON(num_bytes < root->sectorsize);
-	ret = find_free_extent(trans, root, num_bytes, empty_size,
-			       hint_byte, ins, flags);
+	ret = find_free_extent(root, num_bytes, empty_size, hint_byte, ins,
+			       flags);
 
 	if (ret == -ENOSPC) {
 		if (!final_tried) {
@@ -6529,8 +6585,7 @@ again:
 
 			sinfo = __find_space_info(root->fs_info, flags);
 			btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
-				(unsigned long long)flags,
-				(unsigned long long)num_bytes);
+				flags, num_bytes);
 			if (sinfo)
 				dump_space_info(sinfo, num_bytes, 1);
 		}
@@ -6550,7 +6605,7 @@ static int __btrfs_free_reserved_extent(struct btrfs_root *root,
 	cache = btrfs_lookup_block_group(root->fs_info, start);
 	if (!cache) {
 		btrfs_err(root->fs_info, "Unable to find block group for %llu",
-			(unsigned long long)start);
+			start);
 		return -ENOSPC;
 	}
 
@@ -6646,8 +6701,7 @@ static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
 	ret = update_block_group(root, ins->objectid, ins->offset, 1);
 	if (ret) { /* -ENOENT, logic error */
 		btrfs_err(fs_info, "update block group failed for %llu %llu",
-			(unsigned long long)ins->objectid,
-			(unsigned long long)ins->offset);
+			ins->objectid, ins->offset);
 		BUG();
 	}
 	return ret;
@@ -6719,8 +6773,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
 	ret = update_block_group(root, ins->objectid, root->leafsize, 1);
 	if (ret) { /* -ENOENT, logic error */
 		btrfs_err(fs_info, "update block group failed for %llu %llu",
-			(unsigned long long)ins->objectid,
-			(unsigned long long)ins->offset);
+			ins->objectid, ins->offset);
 		BUG();
 	}
 	return ret;
@@ -6902,7 +6955,7 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
 	if (IS_ERR(block_rsv))
 		return ERR_CAST(block_rsv);
 
-	ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
+	ret = btrfs_reserve_extent(root, blocksize, blocksize,
 				   empty_size, hint, &ins, 0);
 	if (ret) {
 		unuse_block_rsv(root->fs_info, block_rsv, blocksize);
@@ -7173,6 +7226,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
 		next = btrfs_find_create_tree_block(root, bytenr, blocksize);
 		if (!next)
 			return -ENOMEM;
+		btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
+					       level - 1);
 		reada = 1;
 	}
 	btrfs_tree_lock(next);
@@ -7658,7 +7713,7 @@ out:
 	 * don't have it in the radix (like when we recover after a power fail
 	 * or unmount) so we don't leak memory.
 	 */
-	if (root_dropped == false)
+	if (!for_reloc && root_dropped == false)
 		btrfs_add_dead_root(root);
 	if (err)
 		btrfs_std_error(root->fs_info, err);
@@ -8192,7 +8247,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
 		 * We haven't cached this block group, which means we could
 		 * possibly have excluded extents on this block group.
 		 */
-		if (block_group->cached == BTRFS_CACHE_NO)
+		if (block_group->cached == BTRFS_CACHE_NO ||
+		    block_group->cached == BTRFS_CACHE_ERROR)
 			free_excluded_extents(info->extent_root, block_group);
 
 		btrfs_remove_free_space_cache(block_group);
@@ -8409,9 +8465,13 @@ int btrfs_read_block_groups(struct btrfs_root *root)
 		 * avoid allocating from un-mirrored block group if there are
 		 * mirrored block groups.
 		 */
-		list_for_each_entry(cache, &space_info->block_groups[3], list)
+		list_for_each_entry(cache,
+				&space_info->block_groups[BTRFS_RAID_RAID0],
+				list)
 			set_block_group_ro(cache, 1);
-		list_for_each_entry(cache, &space_info->block_groups[4], list)
+		list_for_each_entry(cache,
+				&space_info->block_groups[BTRFS_RAID_SINGLE],
+				list)
 			set_block_group_ro(cache, 1);
 	}
 
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fe443fece851..09582b81640c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -61,9 +61,8 @@ void btrfs_leak_debug_check(void)
 		state = list_entry(states.next, struct extent_state, leak_list);
 		printk(KERN_ERR "btrfs state leak: start %llu end %llu "
 		       "state %lu in tree %p refs %d\n",
-		       (unsigned long long)state->start,
-		       (unsigned long long)state->end,
-		       state->state, state->tree, atomic_read(&state->refs));
+		       state->start, state->end, state->state, state->tree,
+		       atomic_read(&state->refs));
 		list_del(&state->leak_list);
 		kmem_cache_free(extent_state_cache, state);
 	}
@@ -71,8 +70,8 @@ void btrfs_leak_debug_check(void)
 	while (!list_empty(&buffers)) {
 		eb = list_entry(buffers.next, struct extent_buffer, leak_list);
 		printk(KERN_ERR "btrfs buffer leak start %llu len %lu "
-		       "refs %d\n", (unsigned long long)eb->start,
-		       eb->len, atomic_read(&eb->refs));
+		       "refs %d\n",
+		       eb->start, eb->len, atomic_read(&eb->refs));
 		list_del(&eb->leak_list);
 		kmem_cache_free(extent_buffer_cache, eb);
 	}
@@ -88,11 +87,7 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
 	if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
 		printk_ratelimited(KERN_DEBUG
 		    "btrfs: %s: ino %llu isize %llu odd range [%llu,%llu]\n",
-				caller,
-				(unsigned long long)btrfs_ino(inode),
-				(unsigned long long)isize,
-				(unsigned long long)start,
-				(unsigned long long)end);
+				caller, btrfs_ino(inode), isize, start, end);
 	}
 }
 #else
@@ -388,8 +383,7 @@ static int insert_state(struct extent_io_tree *tree,
 
 	if (end < start)
 		WARN(1, KERN_ERR "btrfs end < start %llu %llu\n",
-		       (unsigned long long)end,
-		       (unsigned long long)start);
+		       end, start);
 	state->start = start;
 	state->end = end;
 
@@ -400,9 +394,8 @@ static int insert_state(struct extent_io_tree *tree,
 		struct extent_state *found;
 		found = rb_entry(node, struct extent_state, rb_node);
 		printk(KERN_ERR "btrfs found node %llu %llu on insert of "
-		       "%llu %llu\n", (unsigned long long)found->start,
-		       (unsigned long long)found->end,
-		       (unsigned long long)start, (unsigned long long)end);
+		       "%llu %llu\n",
+		       found->start, found->end, start, end);
 		return -EEXIST;
 	}
 	state->tree = tree;
@@ -762,15 +755,6 @@ static void cache_state(struct extent_state *state,
 	}
 }
 
-static void uncache_state(struct extent_state **cached_ptr)
-{
-	if (cached_ptr && (*cached_ptr)) {
-		struct extent_state *state = *cached_ptr;
-		*cached_ptr = NULL;
-		free_extent_state(state);
-	}
-}
-
 /*
  * set some bits on a range in the tree.  This may require allocations or
  * sleeping, so the gfp mask is used to indicate what is allowed.
@@ -1687,31 +1671,21 @@ out_failed:
 	return found;
 }
 
-int extent_clear_unlock_delalloc(struct inode *inode,
-				struct extent_io_tree *tree,
-				u64 start, u64 end, struct page *locked_page,
-				unsigned long op)
+int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+				 struct page *locked_page,
+				 unsigned long clear_bits,
+				 unsigned long page_ops)
 {
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 	int ret;
 	struct page *pages[16];
 	unsigned long index = start >> PAGE_CACHE_SHIFT;
 	unsigned long end_index = end >> PAGE_CACHE_SHIFT;
 	unsigned long nr_pages = end_index - index + 1;
 	int i;
-	unsigned long clear_bits = 0;
-
-	if (op & EXTENT_CLEAR_UNLOCK)
-		clear_bits |= EXTENT_LOCKED;
-	if (op & EXTENT_CLEAR_DIRTY)
-		clear_bits |= EXTENT_DIRTY;
-
-	if (op & EXTENT_CLEAR_DELALLOC)
-		clear_bits |= EXTENT_DELALLOC;
 
 	clear_extent_bit(tree, start, end, clear_bits, 1, 0, NULL, GFP_NOFS);
-	if (!(op & (EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
-		    EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK |
-		    EXTENT_SET_PRIVATE2)))
+	if (page_ops == 0)
 		return 0;
 
 	while (nr_pages > 0) {
@@ -1720,20 +1694,20 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 				     nr_pages, ARRAY_SIZE(pages)), pages);
 		for (i = 0; i < ret; i++) {
 
-			if (op & EXTENT_SET_PRIVATE2)
+			if (page_ops & PAGE_SET_PRIVATE2)
 				SetPagePrivate2(pages[i]);
 
 			if (pages[i] == locked_page) {
 				page_cache_release(pages[i]);
 				continue;
 			}
-			if (op & EXTENT_CLEAR_DIRTY)
+			if (page_ops & PAGE_CLEAR_DIRTY)
 				clear_page_dirty_for_io(pages[i]);
-			if (op & EXTENT_SET_WRITEBACK)
+			if (page_ops & PAGE_SET_WRITEBACK)
 				set_page_writeback(pages[i]);
-			if (op & EXTENT_END_WRITEBACK)
+			if (page_ops & PAGE_END_WRITEBACK)
 				end_page_writeback(pages[i]);
-			if (op & EXTENT_CLEAR_UNLOCK_PAGE)
+			if (page_ops & PAGE_UNLOCK)
 				unlock_page(pages[i]);
 			page_cache_release(pages[i]);
 		}
@@ -1810,7 +1784,7 @@ out:
  * set the private field for a given byte offset in the tree.  If there isn't
  * an extent_state there already, this does nothing.
  */
-int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
+static int set_state_private(struct extent_io_tree *tree, u64 start, u64 private)
 {
 	struct rb_node *node;
 	struct extent_state *state;
@@ -1837,64 +1811,6 @@ out:
 	return ret;
 }
 
-void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[],
-			    int count)
-{
-	struct rb_node *node;
-	struct extent_state *state;
-
-	spin_lock(&tree->lock);
-	/*
-	 * this search will find all the extents that end after
-	 * our range starts.
-	 */
-	node = tree_search(tree, start);
-	BUG_ON(!node);
-
-	state = rb_entry(node, struct extent_state, rb_node);
-	BUG_ON(state->start != start);
-
-	while (count) {
-		state->private = *csums++;
-		count--;
-		state = next_state(state);
-	}
-	spin_unlock(&tree->lock);
-}
-
-static inline u64 __btrfs_get_bio_offset(struct bio *bio, int bio_index)
-{
-	struct bio_vec *bvec = bio->bi_io_vec + bio_index;
-
-	return page_offset(bvec->bv_page) + bvec->bv_offset;
-}
-
-void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio, int bio_index,
-			u32 csums[], int count)
-{
-	struct rb_node *node;
-	struct extent_state *state = NULL;
-	u64 start;
-
-	spin_lock(&tree->lock);
-	do {
-		start = __btrfs_get_bio_offset(bio, bio_index);
-		if (state == NULL || state->start != start) {
-			node = tree_search(tree, start);
-			BUG_ON(!node);
-
-			state = rb_entry(node, struct extent_state, rb_node);
-			BUG_ON(state->start != start);
-		}
-		state->private = *csums++;
-		count--;
-		bio_index++;
-
-		state = next_state(state);
-	} while (count);
-	spin_unlock(&tree->lock);
-}
-
 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private)
 {
 	struct rb_node *node;
@@ -2173,7 +2089,8 @@ static int clean_io_failure(u64 start, struct page *page)
 					    EXTENT_LOCKED);
 	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
 
-	if (state && state->start == failrec->start) {
+	if (state && state->start <= failrec->start &&
+	    state->end >= failrec->start + failrec->len - 1) {
 		fs_info = BTRFS_I(inode)->root->fs_info;
 		num_copies = btrfs_num_copies(fs_info, failrec->logical,
 					      failrec->len);
@@ -2201,9 +2118,9 @@ out:
  * needed
  */
 
-static int bio_readpage_error(struct bio *failed_bio, struct page *page,
-				u64 start, u64 end, int failed_mirror,
-				struct extent_state *state)
+static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset,
+			      struct page *page, u64 start, u64 end,
+			      int failed_mirror)
 {
 	struct io_failure_record *failrec = NULL;
 	u64 private;
@@ -2213,6 +2130,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
 	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 	struct bio *bio;
+	struct btrfs_io_bio *btrfs_failed_bio;
+	struct btrfs_io_bio *btrfs_bio;
 	int num_copies;
 	int ret;
 	int read_mode;
@@ -2296,23 +2215,12 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
 		 * all the retry and error correction code that follows. no
 		 * matter what the error is, it is very likely to persist.
 		 */
-		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
-			 "state=%p, num_copies=%d, next_mirror %d, "
-			 "failed_mirror %d\n", state, num_copies,
-			 failrec->this_mirror, failed_mirror);
+		pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n",
+			 num_copies, failrec->this_mirror, failed_mirror);
 		free_io_failure(inode, failrec, 0);
 		return -EIO;
 	}
 
-	if (!state) {
-		spin_lock(&tree->lock);
-		state = find_first_extent_bit_state(tree, failrec->start,
-						    EXTENT_LOCKED);
-		if (state && state->start != failrec->start)
-			state = NULL;
-		spin_unlock(&tree->lock);
-	}
-
 	/*
 	 * there are two premises:
 	 *	a) deliver good data to the caller
@@ -2349,9 +2257,8 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
 		read_mode = READ_SYNC;
 	}
 
-	if (!state || failrec->this_mirror > num_copies) {
-		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
-			 "next_mirror %d, failed_mirror %d\n", state,
+	if (failrec->this_mirror > num_copies) {
+		pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n",
 			 num_copies, failrec->this_mirror, failed_mirror);
 		free_io_failure(inode, failrec, 0);
 		return -EIO;
@@ -2362,12 +2269,24 @@ static int bio_readpage_error(struct bio *failed_bio, struct page *page,
 		free_io_failure(inode, failrec, 0);
 		return -EIO;
 	}
-	bio->bi_private = state;
 	bio->bi_end_io = failed_bio->bi_end_io;
 	bio->bi_sector = failrec->logical >> 9;
 	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
 	bio->bi_size = 0;
 
+	btrfs_failed_bio = btrfs_io_bio(failed_bio);
+	if (btrfs_failed_bio->csum) {
+		struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
+		u16 csum_size = btrfs_super_csum_size(fs_info->super_copy);
+
+		btrfs_bio = btrfs_io_bio(bio);
+		btrfs_bio->csum = btrfs_bio->csum_inline;
+		phy_offset >>= inode->i_sb->s_blocksize_bits;
+		phy_offset *= csum_size;
+		memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset,
+		       csum_size);
+	}
+
 	bio_add_page(bio, page, failrec->len, start - page_offset(page));
 
 	pr_debug("bio_readpage_error: submitting new read[%#x] to "
@@ -2450,6 +2369,18 @@ static void end_bio_extent_writepage(struct bio *bio, int err)
 	bio_put(bio);
 }
 
+static void
+endio_readpage_release_extent(struct extent_io_tree *tree, u64 start, u64 len,
+			      int uptodate)
+{
+	struct extent_state *cached = NULL;
+	u64 end = start + len - 1;
+
+	if (uptodate && tree->track_uptodate)
+		set_extent_uptodate(tree, start, end, &cached, GFP_ATOMIC);
+	unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
+}
+
 /*
  * after a readpage IO is done, we need to:
  * clear the uptodate bits on error
@@ -2466,9 +2397,14 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
 	struct bio_vec *bvec_end = bio->bi_io_vec + bio->bi_vcnt - 1;
 	struct bio_vec *bvec = bio->bi_io_vec;
+	struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
 	struct extent_io_tree *tree;
+	u64 offset = 0;
 	u64 start;
 	u64 end;
+	u64 len;
+	u64 extent_start = 0;
+	u64 extent_len = 0;
 	int mirror;
 	int ret;
 
@@ -2477,9 +2413,6 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 
 	do {
 		struct page *page = bvec->bv_page;
-		struct extent_state *cached = NULL;
-		struct extent_state *state;
-		struct btrfs_io_bio *io_bio = btrfs_io_bio(bio);
 		struct inode *inode = page->mapping->host;
 
 		pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, "
@@ -2500,37 +2433,32 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 
 		start = page_offset(page);
 		end = start + bvec->bv_offset + bvec->bv_len - 1;
+		len = bvec->bv_len;
 
 		if (++bvec <= bvec_end)
 			prefetchw(&bvec->bv_page->flags);
 
-		spin_lock(&tree->lock);
-		state = find_first_extent_bit_state(tree, start, EXTENT_LOCKED);
-		if (state && state->start == start) {
-			/*
-			 * take a reference on the state, unlock will drop
-			 * the ref
-			 */
-			cache_state(state, &cached);
-		}
-		spin_unlock(&tree->lock);
-
 		mirror = io_bio->mirror_num;
-		if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) {
-			ret = tree->ops->readpage_end_io_hook(page, start, end,
-							      state, mirror);
+		if (likely(uptodate && tree->ops &&
+			   tree->ops->readpage_end_io_hook)) {
+			ret = tree->ops->readpage_end_io_hook(io_bio, offset,
+							      page, start, end,
+							      mirror);
 			if (ret)
 				uptodate = 0;
 			else
 				clean_io_failure(start, page);
 		}
 
-		if (!uptodate && tree->ops && tree->ops->readpage_io_failed_hook) {
+		if (likely(uptodate))
+			goto readpage_ok;
+
+		if (tree->ops && tree->ops->readpage_io_failed_hook) {
 			ret = tree->ops->readpage_io_failed_hook(page, mirror);
 			if (!ret && !err &&
 			    test_bit(BIO_UPTODATE, &bio->bi_flags))
 				uptodate = 1;
-		} else if (!uptodate) {
+		} else {
 			/*
 			 * The generic bio_readpage_error handles errors the
 			 * following way: If possible, new read requests are
@@ -2541,24 +2469,18 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			 * can't handle the error it will return -EIO and we
 			 * remain responsible for that page.
 			 */
-			ret = bio_readpage_error(bio, page, start, end, mirror, NULL);
+			ret = bio_readpage_error(bio, offset, page, start, end,
+						 mirror);
 			if (ret == 0) {
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
 				if (err)
 					uptodate = 0;
-				uncache_state(&cached);
 				continue;
 			}
 		}
-
-		if (uptodate && tree->track_uptodate) {
-			set_extent_uptodate(tree, start, end, &cached,
-					    GFP_ATOMIC);
-		}
-		unlock_extent_cached(tree, start, end, &cached, GFP_ATOMIC);
-
-		if (uptodate) {
+readpage_ok:
+		if (likely(uptodate)) {
 			loff_t i_size = i_size_read(inode);
 			pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
 			unsigned offset;
@@ -2573,8 +2495,36 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 			SetPageError(page);
 		}
 		unlock_page(page);
+		offset += len;
+
+		if (unlikely(!uptodate)) {
+			if (extent_len) {
+				endio_readpage_release_extent(tree,
+							      extent_start,
+							      extent_len, 1);
+				extent_start = 0;
+				extent_len = 0;
+			}
+			endio_readpage_release_extent(tree, start,
+						      end - start + 1, 0);
+		} else if (!extent_len) {
+			extent_start = start;
+			extent_len = end + 1 - start;
+		} else if (extent_start + extent_len == start) {
+			extent_len += end + 1 - start;
+		} else {
+			endio_readpage_release_extent(tree, extent_start,
+						      extent_len, uptodate);
+			extent_start = start;
+			extent_len = end + 1 - start;
+		}
 	} while (bvec <= bvec_end);
 
+	if (extent_len)
+		endio_readpage_release_extent(tree, extent_start, extent_len,
+					      uptodate);
+	if (io_bio->end_io)
+		io_bio->end_io(io_bio, err);
 	bio_put(bio);
 }
 
@@ -2586,6 +2536,7 @@ struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 		gfp_t gfp_flags)
 {
+	struct btrfs_io_bio *btrfs_bio;
 	struct bio *bio;
 
 	bio = bio_alloc_bioset(gfp_flags, nr_vecs, btrfs_bioset);
@@ -2601,6 +2552,10 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 		bio->bi_size = 0;
 		bio->bi_bdev = bdev;
 		bio->bi_sector = first_sector;
+		btrfs_bio = btrfs_io_bio(bio);
+		btrfs_bio->csum = NULL;
+		btrfs_bio->csum_allocated = NULL;
+		btrfs_bio->end_io = NULL;
 	}
 	return bio;
 }
@@ -2614,7 +2569,17 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask)
 /* this also allocates from the btrfs_bioset */
 struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs)
 {
-	return bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
+	struct btrfs_io_bio *btrfs_bio;
+	struct bio *bio;
+
+	bio = bio_alloc_bioset(gfp_mask, nr_iovecs, btrfs_bioset);
+	if (bio) {
+		btrfs_bio = btrfs_io_bio(bio);
+		btrfs_bio->csum = NULL;
+		btrfs_bio->csum_allocated = NULL;
+		btrfs_bio->end_io = NULL;
+	}
+	return bio;
 }
 
 
@@ -2738,17 +2703,45 @@ void set_page_extent_mapped(struct page *page)
 	}
 }
 
+static struct extent_map *
+__get_extent_map(struct inode *inode, struct page *page, size_t pg_offset,
+		 u64 start, u64 len, get_extent_t *get_extent,
+		 struct extent_map **em_cached)
+{
+	struct extent_map *em;
+
+	if (em_cached && *em_cached) {
+		em = *em_cached;
+		if (em->in_tree && start >= em->start &&
+		    start < extent_map_end(em)) {
+			atomic_inc(&em->refs);
+			return em;
+		}
+
+		free_extent_map(em);
+		*em_cached = NULL;
+	}
+
+	em = get_extent(inode, page, pg_offset, start, len, 0);
+	if (em_cached && !IS_ERR_OR_NULL(em)) {
+		BUG_ON(*em_cached);
+		atomic_inc(&em->refs);
+		*em_cached = em;
+	}
+	return em;
+}
 /*
  * basic readpage implementation.  Locked extent state structs are inserted
  * into the tree that are removed when the IO is done (by the end_io
  * handlers)
  * XXX JDM: This needs looking at to ensure proper page locking
  */
-static int __extent_read_full_page(struct extent_io_tree *tree,
-				   struct page *page,
-				   get_extent_t *get_extent,
-				   struct bio **bio, int mirror_num,
-				   unsigned long *bio_flags, int rw)
+static int __do_readpage(struct extent_io_tree *tree,
+			 struct page *page,
+			 get_extent_t *get_extent,
+			 struct extent_map **em_cached,
+			 struct bio **bio, int mirror_num,
+			 unsigned long *bio_flags, int rw)
 {
 	struct inode *inode = page->mapping->host;
 	u64 start = page_offset(page);
@@ -2762,35 +2755,26 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 	sector_t sector;
 	struct extent_map *em;
 	struct block_device *bdev;
-	struct btrfs_ordered_extent *ordered;
 	int ret;
 	int nr = 0;
+	int parent_locked = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
 	size_t pg_offset = 0;
 	size_t iosize;
 	size_t disk_io_size;
 	size_t blocksize = inode->i_sb->s_blocksize;
-	unsigned long this_bio_flag = 0;
+	unsigned long this_bio_flag = *bio_flags & EXTENT_BIO_PARENT_LOCKED;
 
 	set_page_extent_mapped(page);
 
+	end = page_end;
 	if (!PageUptodate(page)) {
 		if (cleancache_get_page(page) == 0) {
 			BUG_ON(blocksize != PAGE_SIZE);
+			unlock_extent(tree, start, end);
 			goto out;
 		}
 	}
 
-	end = page_end;
-	while (1) {
-		lock_extent(tree, start, end);
-		ordered = btrfs_lookup_ordered_extent(inode, start);
-		if (!ordered)
-			break;
-		unlock_extent(tree, start, end);
-		btrfs_start_ordered_extent(inode, ordered, 1);
-		btrfs_put_ordered_extent(ordered);
-	}
-
 	if (page->index == last_byte >> PAGE_CACHE_SHIFT) {
 		char *userpage;
 		size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1);
@@ -2817,15 +2801,18 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 			kunmap_atomic(userpage);
 			set_extent_uptodate(tree, cur, cur + iosize - 1,
 					    &cached, GFP_NOFS);
-			unlock_extent_cached(tree, cur, cur + iosize - 1,
-					     &cached, GFP_NOFS);
+			if (!parent_locked)
+				unlock_extent_cached(tree, cur,
+						     cur + iosize - 1,
+						     &cached, GFP_NOFS);
 			break;
 		}
-		em = get_extent(inode, page, pg_offset, cur,
-				end - cur + 1, 0);
+		em = __get_extent_map(inode, page, pg_offset, cur,
+				      end - cur + 1, get_extent, em_cached);
 		if (IS_ERR_OR_NULL(em)) {
 			SetPageError(page);
-			unlock_extent(tree, cur, end);
+			if (!parent_locked)
+				unlock_extent(tree, cur, end);
 			break;
 		}
 		extent_offset = cur - em->start;
@@ -2833,7 +2820,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 		BUG_ON(end < cur);
 
 		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
-			this_bio_flag = EXTENT_BIO_COMPRESSED;
+			this_bio_flag |= EXTENT_BIO_COMPRESSED;
 			extent_set_compress_type(&this_bio_flag,
 						 em->compress_type);
 		}
@@ -2877,7 +2864,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 		if (test_range_bit(tree, cur, cur_end,
 				   EXTENT_UPTODATE, 1, NULL)) {
 			check_page_uptodate(tree, page);
-			unlock_extent(tree, cur, cur + iosize - 1);
+			if (!parent_locked)
+				unlock_extent(tree, cur, cur + iosize - 1);
 			cur = cur + iosize;
 			pg_offset += iosize;
 			continue;
@@ -2887,7 +2875,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 		 */
 		if (block_start == EXTENT_MAP_INLINE) {
 			SetPageError(page);
-			unlock_extent(tree, cur, cur + iosize - 1);
+			if (!parent_locked)
+				unlock_extent(tree, cur, cur + iosize - 1);
 			cur = cur + iosize;
 			pg_offset += iosize;
 			continue;
@@ -2905,7 +2894,8 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
 			*bio_flags = this_bio_flag;
 		} else {
 			SetPageError(page);
-			unlock_extent(tree, cur, cur + iosize - 1);
+			if (!parent_locked)
+				unlock_extent(tree, cur, cur + iosize - 1);
 		}
 		cur = cur + iosize;
 		pg_offset += iosize;
@@ -2919,6 +2909,104 @@ out:
 	return 0;
 }
 
+static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
+					     struct page *pages[], int nr_pages,
+					     u64 start, u64 end,
+					     get_extent_t *get_extent,
+					     struct extent_map **em_cached,
+					     struct bio **bio, int mirror_num,
+					     unsigned long *bio_flags, int rw)
+{
+	struct inode *inode;
+	struct btrfs_ordered_extent *ordered;
+	int index;
+
+	inode = pages[0]->mapping->host;
+	while (1) {
+		lock_extent(tree, start, end);
+		ordered = btrfs_lookup_ordered_range(inode, start,
+						     end - start + 1);
+		if (!ordered)
+			break;
+		unlock_extent(tree, start, end);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+	}
+
+	for (index = 0; index < nr_pages; index++) {
+		__do_readpage(tree, pages[index], get_extent, em_cached, bio,
+			      mirror_num, bio_flags, rw);
+		page_cache_release(pages[index]);
+	}
+}
+
+static void __extent_readpages(struct extent_io_tree *tree,
+			       struct page *pages[],
+			       int nr_pages, get_extent_t *get_extent,
+			       struct extent_map **em_cached,
+			       struct bio **bio, int mirror_num,
+			       unsigned long *bio_flags, int rw)
+{
+	u64 start = 0;
+	u64 end = 0;
+	u64 page_start;
+	int index;
+	int first_index = 0;
+
+	for (index = 0; index < nr_pages; index++) {
+		page_start = page_offset(pages[index]);
+		if (!end) {
+			start = page_start;
+			end = start + PAGE_CACHE_SIZE - 1;
+			first_index = index;
+		} else if (end + 1 == page_start) {
+			end += PAGE_CACHE_SIZE;
+		} else {
+			__do_contiguous_readpages(tree, &pages[first_index],
+						  index - first_index, start,
+						  end, get_extent, em_cached,
+						  bio, mirror_num, bio_flags,
+						  rw);
+			start = page_start;
+			end = start + PAGE_CACHE_SIZE - 1;
+			first_index = index;
+		}
+	}
+
+	if (end)
+		__do_contiguous_readpages(tree, &pages[first_index],
+					  index - first_index, start,
+					  end, get_extent, em_cached, bio,
+					  mirror_num, bio_flags, rw);
+}
+
+static int __extent_read_full_page(struct extent_io_tree *tree,
+				   struct page *page,
+				   get_extent_t *get_extent,
+				   struct bio **bio, int mirror_num,
+				   unsigned long *bio_flags, int rw)
+{
+	struct inode *inode = page->mapping->host;
+	struct btrfs_ordered_extent *ordered;
+	u64 start = page_offset(page);
+	u64 end = start + PAGE_CACHE_SIZE - 1;
+	int ret;
+
+	while (1) {
+		lock_extent(tree, start, end);
+		ordered = btrfs_lookup_ordered_extent(inode, start);
+		if (!ordered)
+			break;
+		unlock_extent(tree, start, end);
+		btrfs_start_ordered_extent(inode, ordered, 1);
+		btrfs_put_ordered_extent(ordered);
+	}
+
+	ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
+			    bio_flags, rw);
+	return ret;
+}
+
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 			    get_extent_t *get_extent, int mirror_num)
 {
@@ -2933,6 +3021,20 @@ int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 	return ret;
 }
 
+int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
+				 get_extent_t *get_extent, int mirror_num)
+{
+	struct bio *bio = NULL;
+	unsigned long bio_flags = EXTENT_BIO_PARENT_LOCKED;
+	int ret;
+
+	ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
+				      &bio_flags, READ);
+	if (bio)
+		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
+	return ret;
+}
+
 static noinline void update_nr_written(struct page *page,
 				      struct writeback_control *wbc,
 				      unsigned long nr_written)
@@ -3189,8 +3291,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 			if (!PageWriteback(page)) {
 				printk(KERN_ERR "btrfs warning page %lu not "
 				       "writeback, cur %llu end %llu\n",
-				       page->index, (unsigned long long)cur,
-				       (unsigned long long)end);
+				       page->index, cur, end);
 			}
 
 			ret = submit_extent_page(write_flags, tree, page,
@@ -3769,7 +3870,7 @@ int extent_readpages(struct extent_io_tree *tree,
 	unsigned long bio_flags = 0;
 	struct page *pagepool[16];
 	struct page *page;
-	int i = 0;
+	struct extent_map *em_cached = NULL;
 	int nr = 0;
 
 	for (page_idx = 0; page_idx < nr_pages; page_idx++) {
@@ -3786,18 +3887,16 @@ int extent_readpages(struct extent_io_tree *tree,
 		pagepool[nr++] = page;
 		if (nr < ARRAY_SIZE(pagepool))
 			continue;
-		for (i = 0; i < nr; i++) {
-			__extent_read_full_page(tree, pagepool[i], get_extent,
-					&bio, 0, &bio_flags, READ);
-			page_cache_release(pagepool[i]);
-		}
+		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
+				   &bio, 0, &bio_flags, READ);
 		nr = 0;
 	}
-	for (i = 0; i < nr; i++) {
-		__extent_read_full_page(tree, pagepool[i], get_extent,
-					&bio, 0, &bio_flags, READ);
-		page_cache_release(pagepool[i]);
-	}
+	if (nr)
+		__extent_readpages(tree, pagepool, nr, get_extent, &em_cached,
+				   &bio, 0, &bio_flags, READ);
+
+	if (em_cached)
+		free_extent_map(em_cached);
 
 	BUG_ON(!list_empty(pages));
 	if (bio)
@@ -4136,6 +4235,76 @@ static void __free_extent_buffer(struct extent_buffer *eb)
 	kmem_cache_free(extent_buffer_cache, eb);
 }
 
+static int extent_buffer_under_io(struct extent_buffer *eb)
+{
+	return (atomic_read(&eb->io_pages) ||
+		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
+		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+}
+
+/*
+ * Helper for releasing extent buffer page.
+ */
+static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
+						unsigned long start_idx)
+{
+	unsigned long index;
+	unsigned long num_pages;
+	struct page *page;
+	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
+
+	BUG_ON(extent_buffer_under_io(eb));
+
+	num_pages = num_extent_pages(eb->start, eb->len);
+	index = start_idx + num_pages;
+	if (start_idx >= index)
+		return;
+
+	do {
+		index--;
+		page = extent_buffer_page(eb, index);
+		if (page && mapped) {
+			spin_lock(&page->mapping->private_lock);
+			/*
+			 * We do this since we'll remove the pages after we've
+			 * removed the eb from the radix tree, so we could race
+			 * and have this page now attached to the new eb.  So
+			 * only clear page_private if it's still connected to
+			 * this eb.
+			 */
+			if (PagePrivate(page) &&
+			    page->private == (unsigned long)eb) {
+				BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+				BUG_ON(PageDirty(page));
+				BUG_ON(PageWriteback(page));
+				/*
+				 * We need to make sure we haven't be attached
+				 * to a new eb.
+				 */
+				ClearPagePrivate(page);
+				set_page_private(page, 0);
+				/* One for the page private */
+				page_cache_release(page);
+			}
+			spin_unlock(&page->mapping->private_lock);
+
+		}
+		if (page) {
+			/* One for when we alloced the page */
+			page_cache_release(page);
+		}
+	} while (index != start_idx);
+}
+
+/*
+ * Helper for releasing the extent buffer.
+ */
+static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
+{
+	btrfs_release_extent_buffer_page(eb, 0);
+	__free_extent_buffer(eb);
+}
+
 static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree,
 						   u64 start,
 						   unsigned long len,
@@ -4184,13 +4353,16 @@ struct extent_buffer *btrfs_clone_extent_buffer(struct extent_buffer *src)
 	struct extent_buffer *new;
 	unsigned long num_pages = num_extent_pages(src->start, src->len);
 
-	new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_ATOMIC);
+	new = __alloc_extent_buffer(NULL, src->start, src->len, GFP_NOFS);
 	if (new == NULL)
 		return NULL;
 
 	for (i = 0; i < num_pages; i++) {
-		p = alloc_page(GFP_ATOMIC);
-		BUG_ON(!p);
+		p = alloc_page(GFP_NOFS);
+		if (!p) {
+			btrfs_release_extent_buffer(new);
+			return NULL;
+		}
 		attach_extent_buffer_page(new, p);
 		WARN_ON(PageDirty(p));
 		SetPageUptodate(p);
@@ -4210,12 +4382,12 @@ struct extent_buffer *alloc_dummy_extent_buffer(u64 start, unsigned long len)
 	unsigned long num_pages = num_extent_pages(0, len);
 	unsigned long i;
 
-	eb = __alloc_extent_buffer(NULL, start, len, GFP_ATOMIC);
+	eb = __alloc_extent_buffer(NULL, start, len, GFP_NOFS);
 	if (!eb)
 		return NULL;
 
 	for (i = 0; i < num_pages; i++) {
-		eb->pages[i] = alloc_page(GFP_ATOMIC);
+		eb->pages[i] = alloc_page(GFP_NOFS);
 		if (!eb->pages[i])
 			goto err;
 	}
@@ -4231,76 +4403,6 @@ err:
 	return NULL;
 }
 
-static int extent_buffer_under_io(struct extent_buffer *eb)
-{
-	return (atomic_read(&eb->io_pages) ||
-		test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags) ||
-		test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
-}
-
-/*
- * Helper for releasing extent buffer page.
- */
-static void btrfs_release_extent_buffer_page(struct extent_buffer *eb,
-						unsigned long start_idx)
-{
-	unsigned long index;
-	unsigned long num_pages;
-	struct page *page;
-	int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags);
-
-	BUG_ON(extent_buffer_under_io(eb));
-
-	num_pages = num_extent_pages(eb->start, eb->len);
-	index = start_idx + num_pages;
-	if (start_idx >= index)
-		return;
-
-	do {
-		index--;
-		page = extent_buffer_page(eb, index);
-		if (page && mapped) {
-			spin_lock(&page->mapping->private_lock);
-			/*
-			 * We do this since we'll remove the pages after we've
-			 * removed the eb from the radix tree, so we could race
-			 * and have this page now attached to the new eb.  So
-			 * only clear page_private if it's still connected to
-			 * this eb.
-			 */
-			if (PagePrivate(page) &&
-			    page->private == (unsigned long)eb) {
-				BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
-				BUG_ON(PageDirty(page));
-				BUG_ON(PageWriteback(page));
-				/*
-				 * We need to make sure we haven't be attached
-				 * to a new eb.
-				 */
-				ClearPagePrivate(page);
-				set_page_private(page, 0);
-				/* One for the page private */
-				page_cache_release(page);
-			}
-			spin_unlock(&page->mapping->private_lock);
-
-		}
-		if (page) {
-			/* One for when we alloced the page */
-			page_cache_release(page);
-		}
-	} while (index != start_idx);
-}
-
-/*
- * Helper for releasing the extent buffer.
- */
-static inline void btrfs_release_extent_buffer(struct extent_buffer *eb)
-{
-	btrfs_release_extent_buffer_page(eb, 0);
-	__free_extent_buffer(eb);
-}
-
 static void check_buffer_tree_ref(struct extent_buffer *eb)
 {
 	int refs;
@@ -4771,7 +4873,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv,
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
 
 	while (len > 0) {
 		page = extent_buffer_page(eb, i);
@@ -4813,8 +4915,8 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start,
 
 	if (start + min_len > eb->len) {
 		WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, "
-		       "wanted %lu %lu\n", (unsigned long long)eb->start,
-		       eb->len, start, min_len);
+		       "wanted %lu %lu\n",
+		       eb->start, eb->len, start, min_len);
 		return -EINVAL;
 	}
 
@@ -4841,7 +4943,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv,
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
 
 	while (len > 0) {
 		page = extent_buffer_page(eb, i);
@@ -4875,7 +4977,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv,
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
 
 	while (len > 0) {
 		page = extent_buffer_page(eb, i);
@@ -4905,7 +5007,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c,
 	WARN_ON(start > eb->len);
 	WARN_ON(start + len > eb->start + eb->len);
 
-	offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1);
+	offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1);
 
 	while (len > 0) {
 		page = extent_buffer_page(eb, i);
@@ -4936,7 +5038,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src,
 	WARN_ON(src->len != dst_len);
 
 	offset = (start_offset + dst_offset) &
-		((unsigned long)PAGE_CACHE_SIZE - 1);
+		(PAGE_CACHE_SIZE - 1);
 
 	while (len > 0) {
 		page = extent_buffer_page(dst, i);
@@ -5022,9 +5124,9 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 
 	while (len > 0) {
 		dst_off_in_page = (start_offset + dst_offset) &
-			((unsigned long)PAGE_CACHE_SIZE - 1);
+			(PAGE_CACHE_SIZE - 1);
 		src_off_in_page = (start_offset + src_offset) &
-			((unsigned long)PAGE_CACHE_SIZE - 1);
+			(PAGE_CACHE_SIZE - 1);
 
 		dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT;
 		src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT;
@@ -5075,9 +5177,9 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset,
 		src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT;
 
 		dst_off_in_page = (start_offset + dst_end) &
-			((unsigned long)PAGE_CACHE_SIZE - 1);
+			(PAGE_CACHE_SIZE - 1);
 		src_off_in_page = (start_offset + src_end) &
-			((unsigned long)PAGE_CACHE_SIZE - 1);
+			(PAGE_CACHE_SIZE - 1);
 
 		cur = min_t(unsigned long, len, src_off_in_page + 1);
 		cur = min(cur, dst_off_in_page + 1);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 3b8c4e26e1da..6dbc645f1f3d 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -29,6 +29,7 @@
  */
 #define EXTENT_BIO_COMPRESSED 1
 #define EXTENT_BIO_TREE_LOG 2
+#define EXTENT_BIO_PARENT_LOCKED 4
 #define EXTENT_BIO_FLAG_SHIFT 16
 
 /* these are bit numbers for test/set bit */
@@ -44,14 +45,11 @@
 #define EXTENT_BUFFER_DUMMY 9
 
 /* these are flags for extent_clear_unlock_delalloc */
-#define EXTENT_CLEAR_UNLOCK_PAGE 0x1
-#define EXTENT_CLEAR_UNLOCK	 0x2
-#define EXTENT_CLEAR_DELALLOC	 0x4
-#define EXTENT_CLEAR_DIRTY	 0x8
-#define EXTENT_SET_WRITEBACK	 0x10
-#define EXTENT_END_WRITEBACK	 0x20
-#define EXTENT_SET_PRIVATE2	 0x40
-#define EXTENT_CLEAR_ACCOUNTING  0x80
+#define PAGE_UNLOCK		(1 << 0)
+#define PAGE_CLEAR_DIRTY	(1 << 1)
+#define PAGE_SET_WRITEBACK	(1 << 2)
+#define PAGE_END_WRITEBACK	(1 << 3)
+#define PAGE_SET_PRIVATE2	(1 << 4)
 
 /*
  * page->private values.  Every page that is controlled by the extent
@@ -62,6 +60,7 @@
 
 struct extent_state;
 struct btrfs_root;
+struct btrfs_io_bio;
 
 typedef	int (extent_submit_bio_hook_t)(struct inode *inode, int rw,
 				       struct bio *bio, int mirror_num,
@@ -77,8 +76,9 @@ struct extent_io_ops {
 			      size_t size, struct bio *bio,
 			      unsigned long bio_flags);
 	int (*readpage_io_failed_hook)(struct page *page, int failed_mirror);
-	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
-				    struct extent_state *state, int mirror);
+	int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset,
+				    struct page *page, u64 start, u64 end,
+				    int mirror);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
 				      struct extent_state *state, int uptodate);
 	void (*set_bit_hook)(struct inode *inode, struct extent_state *state,
@@ -200,6 +200,8 @@ int unlock_extent_cached(struct extent_io_tree *tree, u64 start, u64 end,
 int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
 			  get_extent_t *get_extent, int mirror_num);
+int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
+				 get_extent_t *get_extent, int mirror_num);
 int __init extent_io_init(void);
 void extent_io_exit(void);
 
@@ -261,11 +263,6 @@ int extent_readpages(struct extent_io_tree *tree,
 		     get_extent_t get_extent);
 int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		__u64 start, __u64 len, get_extent_t *get_extent);
-int set_state_private(struct extent_io_tree *tree, u64 start, u64 private);
-void extent_cache_csums_dio(struct extent_io_tree *tree, u64 start, u32 csums[],
-			    int count);
-void extent_cache_csums(struct extent_io_tree *tree, struct bio *bio,
-			int bvec_index, u32 csums[], int count);
 int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private);
 void set_page_extent_mapped(struct page *page);
 
@@ -330,10 +327,10 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset,
 		      unsigned long *map_len);
 int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end);
 int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
-int extent_clear_unlock_delalloc(struct inode *inode,
-				struct extent_io_tree *tree,
-				u64 start, u64 end, struct page *locked_page,
-				unsigned long op);
+int extent_clear_unlock_delalloc(struct inode *inode, u64 start, u64 end,
+				 struct page *locked_page,
+				 unsigned long bits_to_clear,
+				 unsigned long page_ops);
 struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 		gfp_t gfp_flags);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index a7bfc9541803..4f53159bdb9d 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -23,6 +23,7 @@
 #include "ctree.h"
 #include "disk-io.h"
 #include "transaction.h"
+#include "volumes.h"
 #include "print-tree.h"
 
 #define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
@@ -152,28 +153,54 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
 	return ret;
 }
 
+static void btrfs_io_bio_endio_readpage(struct btrfs_io_bio *bio, int err)
+{
+	kfree(bio->csum_allocated);
+}
+
 static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 				   struct inode *inode, struct bio *bio,
 				   u64 logical_offset, u32 *dst, int dio)
 {
-	u32 sum[16];
-	int len;
 	struct bio_vec *bvec = bio->bi_io_vec;
-	int bio_index = 0;
+	struct btrfs_io_bio *btrfs_bio = btrfs_io_bio(bio);
+	struct btrfs_csum_item *item = NULL;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_path *path;
+	u8 *csum;
 	u64 offset = 0;
 	u64 item_start_offset = 0;
 	u64 item_last_offset = 0;
 	u64 disk_bytenr;
 	u32 diff;
-	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+	int nblocks;
+	int bio_index = 0;
 	int count;
-	struct btrfs_path *path;
-	struct btrfs_csum_item *item = NULL;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
+
+	nblocks = bio->bi_size >> inode->i_sb->s_blocksize_bits;
+	if (!dst) {
+		if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) {
+			btrfs_bio->csum_allocated = kmalloc(nblocks * csum_size,
+							    GFP_NOFS);
+			if (!btrfs_bio->csum_allocated) {
+				btrfs_free_path(path);
+				return -ENOMEM;
+			}
+			btrfs_bio->csum = btrfs_bio->csum_allocated;
+			btrfs_bio->end_io = btrfs_io_bio_endio_readpage;
+		} else {
+			btrfs_bio->csum = btrfs_bio->csum_inline;
+		}
+		csum = btrfs_bio->csum;
+	} else {
+		csum = (u8 *)dst;
+	}
+
 	if (bio->bi_size > PAGE_CACHE_SIZE * 8)
 		path->reada = 2;
 
@@ -194,11 +221,10 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 	if (dio)
 		offset = logical_offset;
 	while (bio_index < bio->bi_vcnt) {
-		len = min_t(int, ARRAY_SIZE(sum), bio->bi_vcnt - bio_index);
 		if (!dio)
 			offset = page_offset(bvec->bv_page) + bvec->bv_offset;
-		count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, sum,
-					       len);
+		count = btrfs_find_ordered_sum(inode, offset, disk_bytenr,
+					       (u32 *)csum, nblocks);
 		if (count)
 			goto found;
 
@@ -213,7 +239,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 						 path, disk_bytenr, 0);
 			if (IS_ERR(item)) {
 				count = 1;
-				sum[0] = 0;
+				memset(csum, 0, csum_size);
 				if (BTRFS_I(inode)->root->root_key.objectid ==
 				    BTRFS_DATA_RELOC_TREE_OBJECTID) {
 					set_extent_bits(io_tree, offset,
@@ -222,9 +248,7 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 				} else {
 					printk(KERN_INFO "btrfs no csum found "
 					       "for inode %llu start %llu\n",
-					       (unsigned long long)
-					       btrfs_ino(inode),
-					       (unsigned long long)offset);
+					       btrfs_ino(inode), offset);
 				}
 				item = NULL;
 				btrfs_release_path(path);
@@ -249,23 +273,14 @@ static int __btrfs_lookup_bio_sums(struct btrfs_root *root,
 		diff = disk_bytenr - item_start_offset;
 		diff = diff / root->sectorsize;
 		diff = diff * csum_size;
-		count = min_t(int, len, (item_last_offset - disk_bytenr) >>
-					inode->i_sb->s_blocksize_bits);
-		read_extent_buffer(path->nodes[0], sum,
+		count = min_t(int, nblocks, (item_last_offset - disk_bytenr) >>
+					    inode->i_sb->s_blocksize_bits);
+		read_extent_buffer(path->nodes[0], csum,
 				   ((unsigned long)item) + diff,
 				   csum_size * count);
 found:
-		if (dst) {
-			memcpy(dst, sum, count * csum_size);
-			dst += count;
-		} else {
-			if (dio)
-				extent_cache_csums_dio(io_tree, offset, sum,
-						       count);
-			else
-				extent_cache_csums(io_tree, bio, bio_index, sum,
-					    count);
-		}
+		csum += count * csum_size;
+		nblocks -= count;
 		while (count--) {
 			disk_bytenr += bvec->bv_len;
 			offset += bvec->bv_len;
@@ -284,9 +299,19 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode,
 }
 
 int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode,
-			      struct bio *bio, u64 offset)
+			      struct btrfs_dio_private *dip, struct bio *bio,
+			      u64 offset)
 {
-	return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1);
+	int len = (bio->bi_sector << 9) - dip->disk_bytenr;
+	u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+	int ret;
+
+	len >>= inode->i_sb->s_blocksize_bits;
+	len *= csum_size;
+
+	ret = __btrfs_lookup_bio_sums(root, inode, bio, offset,
+				      (u32 *)(dip->csum + len), 1);
+	return ret;
 }
 
 int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 4d2eb6417145..bc5072b2db53 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1334,7 +1334,6 @@ fail:
 static noinline int check_can_nocow(struct inode *inode, loff_t pos,
 				    size_t *write_bytes)
 {
-	struct btrfs_trans_handle *trans;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_ordered_extent *ordered;
 	u64 lockstart, lockend;
@@ -1356,16 +1355,8 @@ static noinline int check_can_nocow(struct inode *inode, loff_t pos,
 		btrfs_put_ordered_extent(ordered);
 	}
 
-	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans)) {
-		unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend);
-		return PTR_ERR(trans);
-	}
-
 	num_bytes = lockend - lockstart + 1;
-	ret = can_nocow_extent(trans, inode, lockstart, &num_bytes, NULL, NULL,
-			       NULL);
-	btrfs_end_transaction(trans, root);
+	ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, NULL, NULL);
 	if (ret <= 0) {
 		ret = 0;
 	} else {
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index b21a3cd667d8..3f0ddfce96e6 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -221,12 +221,10 @@ int btrfs_truncate_free_space_cache(struct btrfs_root *root,
 				    struct btrfs_path *path,
 				    struct inode *inode)
 {
-	loff_t oldsize;
 	int ret = 0;
 
-	oldsize = i_size_read(inode);
 	btrfs_i_size_write(inode, 0);
-	truncate_pagecache(inode, oldsize, 0);
+	truncate_pagecache(inode, 0);
 
 	/*
 	 * We don't need an orphan item because truncating the free space cache
@@ -308,7 +306,7 @@ static void io_ctl_unmap_page(struct io_ctl *io_ctl)
 
 static void io_ctl_map_page(struct io_ctl *io_ctl, int clear)
 {
-	BUG_ON(io_ctl->index >= io_ctl->num_pages);
+	ASSERT(io_ctl->index < io_ctl->num_pages);
 	io_ctl->page = io_ctl->pages[io_ctl->index++];
 	io_ctl->cur = kmap(io_ctl->page);
 	io_ctl->orig = io_ctl->cur;
@@ -673,8 +671,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 		btrfs_err(root->fs_info,
 			"free space inode generation (%llu) "
 			"did not match free space cache generation (%llu)",
-			(unsigned long long)BTRFS_I(inode)->generation,
-			(unsigned long long)generation);
+			BTRFS_I(inode)->generation, generation);
 		return 0;
 	}
 
@@ -729,7 +726,7 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode,
 				goto free_cache;
 			}
 		} else {
-			BUG_ON(!num_bitmaps);
+			ASSERT(num_bitmaps);
 			num_bitmaps--;
 			e->bitmap = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
 			if (!e->bitmap) {
@@ -1029,7 +1026,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode,
 	leaf = path->nodes[0];
 	if (ret > 0) {
 		struct btrfs_key found_key;
-		BUG_ON(!path->slots[0]);
+		ASSERT(path->slots[0]);
 		path->slots[0]--;
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID ||
@@ -1117,7 +1114,7 @@ int btrfs_write_out_cache(struct btrfs_root *root,
 static inline unsigned long offset_to_bit(u64 bitmap_start, u32 unit,
 					  u64 offset)
 {
-	BUG_ON(offset < bitmap_start);
+	ASSERT(offset >= bitmap_start);
 	offset -= bitmap_start;
 	return (unsigned long)(div_u64(offset, unit));
 }
@@ -1272,7 +1269,7 @@ tree_search_offset(struct btrfs_free_space_ctl *ctl,
 		if (n) {
 			entry = rb_entry(n, struct btrfs_free_space,
 					offset_index);
-			BUG_ON(entry->offset > offset);
+			ASSERT(entry->offset <= offset);
 		} else {
 			if (fuzzy)
 				return entry;
@@ -1336,7 +1333,7 @@ static int link_free_space(struct btrfs_free_space_ctl *ctl,
 {
 	int ret = 0;
 
-	BUG_ON(!info->bitmap && !info->bytes);
+	ASSERT(info->bytes || info->bitmap);
 	ret = tree_insert_offset(&ctl->free_space_offset, info->offset,
 				 &info->offset_index, (info->bitmap != NULL));
 	if (ret)
@@ -1359,7 +1356,7 @@ static void recalculate_thresholds(struct btrfs_free_space_ctl *ctl)
 
 	max_bitmaps = max(max_bitmaps, 1);
 
-	BUG_ON(ctl->total_bitmaps > max_bitmaps);
+	ASSERT(ctl->total_bitmaps <= max_bitmaps);
 
 	/*
 	 * The goal is to keep the total amount of memory used per 1gb of space
@@ -1403,7 +1400,7 @@ static inline void __bitmap_clear_bits(struct btrfs_free_space_ctl *ctl,
 
 	start = offset_to_bit(info->offset, ctl->unit, offset);
 	count = bytes_to_bits(bytes, ctl->unit);
-	BUG_ON(start + count > BITS_PER_BITMAP);
+	ASSERT(start + count <= BITS_PER_BITMAP);
 
 	bitmap_clear(info->bitmap, start, count);
 
@@ -1426,7 +1423,7 @@ static void bitmap_set_bits(struct btrfs_free_space_ctl *ctl,
 
 	start = offset_to_bit(info->offset, ctl->unit, offset);
 	count = bytes_to_bits(bytes, ctl->unit);
-	BUG_ON(start + count > BITS_PER_BITMAP);
+	ASSERT(start + count <= BITS_PER_BITMAP);
 
 	bitmap_set(info->bitmap, start, count);
 
@@ -1742,7 +1739,7 @@ no_cluster_bitmap:
 	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
 					 1, 0);
 	if (!bitmap_info) {
-		BUG_ON(added);
+		ASSERT(added == 0);
 		goto new_bitmap;
 	}
 
@@ -1882,7 +1879,7 @@ out:
 
 	if (ret) {
 		printk(KERN_CRIT "btrfs: unable to add free space :%d\n", ret);
-		BUG_ON(ret == -EEXIST);
+		ASSERT(ret != -EEXIST);
 	}
 
 	return ret;
@@ -1991,8 +1988,7 @@ void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 		if (info->bytes >= bytes && !block_group->ro)
 			count++;
 		printk(KERN_CRIT "entry offset %llu, bytes %llu, bitmap %s\n",
-		       (unsigned long long)info->offset,
-		       (unsigned long long)info->bytes,
+		       info->offset, info->bytes,
 		       (info->bitmap) ? "yes" : "no");
 	}
 	printk(KERN_INFO "block group has cluster?: %s\n",
@@ -2371,7 +2367,7 @@ again:
 	rb_erase(&entry->offset_index, &ctl->free_space_offset);
 	ret = tree_insert_offset(&cluster->root, entry->offset,
 				 &entry->offset_index, 1);
-	BUG_ON(ret); /* -EEXIST; Logic error */
+	ASSERT(!ret); /* -EEXIST; Logic error */
 
 	trace_btrfs_setup_cluster(block_group, cluster,
 				  total_found * ctl->unit, 1);
@@ -2464,7 +2460,7 @@ setup_cluster_no_bitmap(struct btrfs_block_group_cache *block_group,
 		ret = tree_insert_offset(&cluster->root, entry->offset,
 					 &entry->offset_index, 0);
 		total_size += entry->bytes;
-		BUG_ON(ret); /* -EEXIST; Logic error */
+		ASSERT(!ret); /* -EEXIST; Logic error */
 	} while (node && entry != last);
 
 	cluster->max_size = max_extent;
@@ -2525,8 +2521,7 @@ setup_cluster_bitmap(struct btrfs_block_group_cache *block_group,
  * returns zero and sets up cluster if things worked out, otherwise
  * it returns -enospc
  */
-int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root,
+int btrfs_find_space_cluster(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group,
 			     struct btrfs_free_cluster *cluster,
 			     u64 offset, u64 bytes, u64 empty_size)
@@ -2856,7 +2851,7 @@ u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root)
 
 		ret = search_bitmap(ctl, entry, &offset, &count);
 		/* Logic error; Should be empty if it can't find anything */
-		BUG_ON(ret);
+		ASSERT(!ret);
 
 		ino = offset;
 		bitmap_clear_bits(ctl, entry, offset, 1);
@@ -2973,33 +2968,68 @@ int btrfs_write_out_ino_cache(struct btrfs_root *root,
 }
 
 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
-static struct btrfs_block_group_cache *init_test_block_group(void)
+/*
+ * Use this if you need to make a bitmap or extent entry specifically, it
+ * doesn't do any of the merging that add_free_space does, this acts a lot like
+ * how the free space cache loading stuff works, so you can get really weird
+ * configurations.
+ */
+int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
+			      u64 offset, u64 bytes, bool bitmap)
 {
-	struct btrfs_block_group_cache *cache;
+	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
+	struct btrfs_free_space *info = NULL, *bitmap_info;
+	void *map = NULL;
+	u64 bytes_added;
+	int ret;
 
-	cache = kzalloc(sizeof(*cache), GFP_NOFS);
-	if (!cache)
-		return NULL;
-	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
-					GFP_NOFS);
-	if (!cache->free_space_ctl) {
-		kfree(cache);
-		return NULL;
+again:
+	if (!info) {
+		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
+		if (!info)
+			return -ENOMEM;
 	}
 
-	cache->key.objectid = 0;
-	cache->key.offset = 1024 * 1024 * 1024;
-	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
-	cache->sectorsize = 4096;
+	if (!bitmap) {
+		spin_lock(&ctl->tree_lock);
+		info->offset = offset;
+		info->bytes = bytes;
+		ret = link_free_space(ctl, info);
+		spin_unlock(&ctl->tree_lock);
+		if (ret)
+			kmem_cache_free(btrfs_free_space_cachep, info);
+		return ret;
+	}
+
+	if (!map) {
+		map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
+		if (!map) {
+			kmem_cache_free(btrfs_free_space_cachep, info);
+			return -ENOMEM;
+		}
+	}
+
+	spin_lock(&ctl->tree_lock);
+	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
+					 1, 0);
+	if (!bitmap_info) {
+		info->bitmap = map;
+		map = NULL;
+		add_new_bitmap(ctl, info, offset);
+		bitmap_info = info;
+	}
 
-	spin_lock_init(&cache->lock);
-	INIT_LIST_HEAD(&cache->list);
-	INIT_LIST_HEAD(&cache->cluster_list);
-	INIT_LIST_HEAD(&cache->new_bg_list);
+	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
+	bytes -= bytes_added;
+	offset += bytes_added;
+	spin_unlock(&ctl->tree_lock);
 
-	btrfs_init_free_space_ctl(cache);
+	if (bytes)
+		goto again;
 
-	return cache;
+	if (map)
+		kfree(map);
+	return 0;
 }
 
 /*
@@ -3007,8 +3037,8 @@ static struct btrfs_block_group_cache *init_test_block_group(void)
  * just used to check the absence of space, so if there is free space in the
  * range at all we will return 1.
  */
-static int check_exists(struct btrfs_block_group_cache *cache, u64 offset,
-			u64 bytes)
+int test_check_exists(struct btrfs_block_group_cache *cache,
+		      u64 offset, u64 bytes)
 {
 	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
 	struct btrfs_free_space *info;
@@ -3085,411 +3115,4 @@ out:
 	spin_unlock(&ctl->tree_lock);
 	return ret;
 }
-
-/*
- * Use this if you need to make a bitmap or extent entry specifically, it
- * doesn't do any of the merging that add_free_space does, this acts a lot like
- * how the free space cache loading stuff works, so you can get really weird
- * configurations.
- */
-static int add_free_space_entry(struct btrfs_block_group_cache *cache,
-				u64 offset, u64 bytes, bool bitmap)
-{
-	struct btrfs_free_space_ctl *ctl = cache->free_space_ctl;
-	struct btrfs_free_space *info = NULL, *bitmap_info;
-	void *map = NULL;
-	u64 bytes_added;
-	int ret;
-
-again:
-	if (!info) {
-		info = kmem_cache_zalloc(btrfs_free_space_cachep, GFP_NOFS);
-		if (!info)
-			return -ENOMEM;
-	}
-
-	if (!bitmap) {
-		spin_lock(&ctl->tree_lock);
-		info->offset = offset;
-		info->bytes = bytes;
-		ret = link_free_space(ctl, info);
-		spin_unlock(&ctl->tree_lock);
-		if (ret)
-			kmem_cache_free(btrfs_free_space_cachep, info);
-		return ret;
-	}
-
-	if (!map) {
-		map = kzalloc(PAGE_CACHE_SIZE, GFP_NOFS);
-		if (!map) {
-			kmem_cache_free(btrfs_free_space_cachep, info);
-			return -ENOMEM;
-		}
-	}
-
-	spin_lock(&ctl->tree_lock);
-	bitmap_info = tree_search_offset(ctl, offset_to_bitmap(ctl, offset),
-					 1, 0);
-	if (!bitmap_info) {
-		info->bitmap = map;
-		map = NULL;
-		add_new_bitmap(ctl, info, offset);
-		bitmap_info = info;
-	}
-
-	bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes);
-	bytes -= bytes_added;
-	offset += bytes_added;
-	spin_unlock(&ctl->tree_lock);
-
-	if (bytes)
-		goto again;
-
-	if (map)
-		kfree(map);
-	return 0;
-}
-
-#define test_msg(fmt, ...) printk(KERN_INFO "btrfs: selftest: " fmt, ##__VA_ARGS__)
-
-/*
- * This test just does basic sanity checking, making sure we can add an exten
- * entry and remove space from either end and the middle, and make sure we can
- * remove space that covers adjacent extent entries.
- */
-static int test_extents(struct btrfs_block_group_cache *cache)
-{
-	int ret = 0;
-
-	test_msg("Running extent only tests\n");
-
-	/* First just make sure we can remove an entire entry */
-	ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
-	if (ret) {
-		test_msg("Error adding initial extents %d\n", ret);
-		return ret;
-	}
-
-	ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
-	if (ret) {
-		test_msg("Error removing extent %d\n", ret);
-		return ret;
-	}
-
-	if (check_exists(cache, 0, 4 * 1024 * 1024)) {
-		test_msg("Full remove left some lingering space\n");
-		return -1;
-	}
-
-	/* Ok edge and middle cases now */
-	ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
-	if (ret) {
-		test_msg("Error adding half extent %d\n", ret);
-		return ret;
-	}
-
-	ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
-	if (ret) {
-		test_msg("Error removing tail end %d\n", ret);
-		return ret;
-	}
-
-	ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
-	if (ret) {
-		test_msg("Error removing front end %d\n", ret);
-		return ret;
-	}
-
-	ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
-	if (ret) {
-		test_msg("Error removing middle piece %d\n", ret);
-		return ret;
-	}
-
-	if (check_exists(cache, 0, 1 * 1024 * 1024)) {
-		test_msg("Still have space at the front\n");
-		return -1;
-	}
-
-	if (check_exists(cache, 2 * 1024 * 1024, 4096)) {
-		test_msg("Still have space in the middle\n");
-		return -1;
-	}
-
-	if (check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
-		test_msg("Still have space at the end\n");
-		return -1;
-	}
-
-	/* Cleanup */
-	__btrfs_remove_free_space_cache(cache->free_space_ctl);
-
-	return 0;
-}
-
-static int test_bitmaps(struct btrfs_block_group_cache *cache)
-{
-	u64 next_bitmap_offset;
-	int ret;
-
-	test_msg("Running bitmap only tests\n");
-
-	ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
-	if (ret) {
-		test_msg("Couldn't create a bitmap entry %d\n", ret);
-		return ret;
-	}
-
-	ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
-	if (ret) {
-		test_msg("Error removing bitmap full range %d\n", ret);
-		return ret;
-	}
-
-	if (check_exists(cache, 0, 4 * 1024 * 1024)) {
-		test_msg("Left some space in bitmap\n");
-		return -1;
-	}
-
-	ret = add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
-	if (ret) {
-		test_msg("Couldn't add to our bitmap entry %d\n", ret);
-		return ret;
-	}
-
-	ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
-	if (ret) {
-		test_msg("Couldn't remove middle chunk %d\n", ret);
-		return ret;
-	}
-
-	/*
-	 * The first bitmap we have starts at offset 0 so the next one is just
-	 * at the end of the first bitmap.
-	 */
-	next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
-
-	/* Test a bit straddling two bitmaps */
-	ret = add_free_space_entry(cache, next_bitmap_offset -
-				   (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
-	if (ret) {
-		test_msg("Couldn't add space that straddles two bitmaps %d\n",
-				ret);
-		return ret;
-	}
-
-	ret = btrfs_remove_free_space(cache, next_bitmap_offset -
-				      (1 * 1024 * 1024), 2 * 1024 * 1024);
-	if (ret) {
-		test_msg("Couldn't remove overlapping space %d\n", ret);
-		return ret;
-	}
-
-	if (check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
-			 2 * 1024 * 1024)) {
-		test_msg("Left some space when removing overlapping\n");
-		return -1;
-	}
-
-	__btrfs_remove_free_space_cache(cache->free_space_ctl);
-
-	return 0;
-}
-
-/* This is the high grade jackassery */
-static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
-{
-	u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
-	int ret;
-
-	test_msg("Running bitmap and extent tests\n");
-
-	/*
-	 * First let's do something simple, an extent at the same offset as the
-	 * bitmap, but the free space completely in the extent and then
-	 * completely in the bitmap.
-	 */
-	ret = add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
-	if (ret) {
-		test_msg("Couldn't create bitmap entry %d\n", ret);
-		return ret;
-	}
-
-	ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
-	if (ret) {
-		test_msg("Couldn't add extent entry %d\n", ret);
-		return ret;
-	}
-
-	ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
-	if (ret) {
-		test_msg("Couldn't remove extent entry %d\n", ret);
-		return ret;
-	}
-
-	if (check_exists(cache, 0, 1 * 1024 * 1024)) {
-		test_msg("Left remnants after our remove\n");
-		return -1;
-	}
-
-	/* Now to add back the extent entry and remove from the bitmap */
-	ret = add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
-	if (ret) {
-		test_msg("Couldn't re-add extent entry %d\n", ret);
-		return ret;
-	}
-
-	ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
-	if (ret) {
-		test_msg("Couldn't remove from bitmap %d\n", ret);
-		return ret;
-	}
-
-	if (check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
-		test_msg("Left remnants in the bitmap\n");
-		return -1;
-	}
-
-	/*
-	 * Ok so a little more evil, extent entry and bitmap at the same offset,
-	 * removing an overlapping chunk.
-	 */
-	ret = add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
-	if (ret) {
-		test_msg("Couldn't add to a bitmap %d\n", ret);
-		return ret;
-	}
-
-	ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
-	if (ret) {
-		test_msg("Couldn't remove overlapping space %d\n", ret);
-		return ret;
-	}
-
-	if (check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
-		test_msg("Left over peices after removing overlapping\n");
-		return -1;
-	}
-
-	__btrfs_remove_free_space_cache(cache->free_space_ctl);
-
-	/* Now with the extent entry offset into the bitmap */
-	ret = add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
-	if (ret) {
-		test_msg("Couldn't add space to the bitmap %d\n", ret);
-		return ret;
-	}
-
-	ret = add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
-	if (ret) {
-		test_msg("Couldn't add extent to the cache %d\n", ret);
-		return ret;
-	}
-
-	ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
-	if (ret) {
-		test_msg("Problem removing overlapping space %d\n", ret);
-		return ret;
-	}
-
-	if (check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
-		test_msg("Left something behind when removing space");
-		return -1;
-	}
-
-	/*
-	 * This has blown up in the past, the extent entry starts before the
-	 * bitmap entry, but we're trying to remove an offset that falls
-	 * completely within the bitmap range and is in both the extent entry
-	 * and the bitmap entry, looks like this
-	 *
-	 *   [ extent ]
-	 *      [ bitmap ]
-	 *        [ del ]
-	 */
-	__btrfs_remove_free_space_cache(cache->free_space_ctl);
-	ret = add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
-				   4 * 1024 * 1024, 1);
-	if (ret) {
-		test_msg("Couldn't add bitmap %d\n", ret);
-		return ret;
-	}
-
-	ret = add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
-				   5 * 1024 * 1024, 0);
-	if (ret) {
-		test_msg("Couldn't add extent entry %d\n", ret);
-		return ret;
-	}
-
-	ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
-				      5 * 1024 * 1024);
-	if (ret) {
-		test_msg("Failed to free our space %d\n", ret);
-		return ret;
-	}
-
-	if (check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
-			 5 * 1024 * 1024)) {
-		test_msg("Left stuff over\n");
-		return -1;
-	}
-
-	__btrfs_remove_free_space_cache(cache->free_space_ctl);
-
-	/*
-	 * This blew up before, we have part of the free space in a bitmap and
-	 * then the entirety of the rest of the space in an extent.  This used
-	 * to return -EAGAIN back from btrfs_remove_extent, make sure this
-	 * doesn't happen.
-	 */
-	ret = add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
-	if (ret) {
-		test_msg("Couldn't add bitmap entry %d\n", ret);
-		return ret;
-	}
-
-	ret = add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
-	if (ret) {
-		test_msg("Couldn't add extent entry %d\n", ret);
-		return ret;
-	}
-
-	ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
-	if (ret) {
-		test_msg("Error removing bitmap and extent overlapping %d\n", ret);
-		return ret;
-	}
-
-	__btrfs_remove_free_space_cache(cache->free_space_ctl);
-	return 0;
-}
-
-void btrfs_test_free_space_cache(void)
-{
-	struct btrfs_block_group_cache *cache;
-
-	test_msg("Running btrfs free space cache tests\n");
-
-	cache = init_test_block_group();
-	if (!cache) {
-		test_msg("Couldn't run the tests\n");
-		return;
-	}
-
-	if (test_extents(cache))
-		goto out;
-	if (test_bitmaps(cache))
-		goto out;
-	if (test_bitmaps_and_extents(cache))
-		goto out;
-out:
-	__btrfs_remove_free_space_cache(cache->free_space_ctl);
-	kfree(cache->free_space_ctl);
-	kfree(cache);
-	test_msg("Free space cache tests finished\n");
-}
-#undef test_msg
-#else /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
-void btrfs_test_free_space_cache(void) {}
-#endif /* !CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
+#endif /* CONFIG_BTRFS_FS_RUN_SANITY_TESTS */
diff --git a/fs/btrfs/free-space-cache.h b/fs/btrfs/free-space-cache.h
index 894116b71304..c74904167476 100644
--- a/fs/btrfs/free-space-cache.h
+++ b/fs/btrfs/free-space-cache.h
@@ -98,8 +98,7 @@ u64 btrfs_find_space_for_alloc(struct btrfs_block_group_cache *block_group,
 u64 btrfs_find_ino_for_alloc(struct btrfs_root *fs_root);
 void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group,
 			   u64 bytes);
-int btrfs_find_space_cluster(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root,
+int btrfs_find_space_cluster(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group,
 			     struct btrfs_free_cluster *cluster,
 			     u64 offset, u64 bytes, u64 empty_size);
@@ -113,6 +112,12 @@ int btrfs_return_cluster_to_free_space(
 int btrfs_trim_block_group(struct btrfs_block_group_cache *block_group,
 			   u64 *trimmed, u64 start, u64 end, u64 minlen);
 
-void btrfs_test_free_space_cache(void);
+/* Support functions for runnint our sanity tests */
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+int test_add_free_space_entry(struct btrfs_block_group_cache *cache,
+			      u64 offset, u64 bytes, bool bitmap);
+int test_check_exists(struct btrfs_block_group_cache *cache,
+		      u64 offset, u64 bytes);
+#endif
 
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7bdc83d04d54..f338c5672d58 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -230,12 +230,13 @@ fail:
  * does the checks required to make sure the data is small enough
  * to fit as an inline extent.
  */
-static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
-				 struct btrfs_root *root,
-				 struct inode *inode, u64 start, u64 end,
-				 size_t compressed_size, int compress_type,
-				 struct page **compressed_pages)
+static noinline int cow_file_range_inline(struct btrfs_root *root,
+					  struct inode *inode, u64 start,
+					  u64 end, size_t compressed_size,
+					  int compress_type,
+					  struct page **compressed_pages)
 {
+	struct btrfs_trans_handle *trans;
 	u64 isize = i_size_read(inode);
 	u64 actual_end = min(end + 1, isize);
 	u64 inline_len = actual_end - start;
@@ -256,9 +257,16 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 		return 1;
 	}
 
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
+
 	ret = btrfs_drop_extents(trans, root, inode, start, aligned_end, 1);
-	if (ret)
-		return ret;
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
+	}
 
 	if (isize > actual_end)
 		inline_len = min_t(u64, isize, actual_end);
@@ -267,15 +275,18 @@ static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
 				   compress_type, compressed_pages);
 	if (ret && ret != -ENOSPC) {
 		btrfs_abort_transaction(trans, root, ret);
-		return ret;
+		goto out;
 	} else if (ret == -ENOSPC) {
-		return 1;
+		ret = 1;
+		goto out;
 	}
 
 	set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags);
 	btrfs_delalloc_release_metadata(inode, end + 1 - start);
 	btrfs_drop_extent_cache(inode, start, aligned_end - 1, 0);
-	return 0;
+out:
+	btrfs_end_transaction(trans, root);
+	return ret;
 }
 
 struct async_extent {
@@ -343,7 +354,6 @@ static noinline int compress_file_range(struct inode *inode,
 					int *num_added)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
 	u64 num_bytes;
 	u64 blocksize = root->sectorsize;
 	u64 actual_end;
@@ -461,45 +471,36 @@ again:
 	}
 cont:
 	if (start == 0) {
-		trans = btrfs_join_transaction(root);
-		if (IS_ERR(trans)) {
-			ret = PTR_ERR(trans);
-			trans = NULL;
-			goto cleanup_and_out;
-		}
-		trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-
 		/* lets try to make an inline extent */
 		if (ret || total_in < (actual_end - start)) {
 			/* we didn't compress the entire range, try
 			 * to make an uncompressed inline extent.
 			 */
-			ret = cow_file_range_inline(trans, root, inode,
-						    start, end, 0, 0, NULL);
+			ret = cow_file_range_inline(root, inode, start, end,
+						    0, 0, NULL);
 		} else {
 			/* try making a compressed inline extent */
-			ret = cow_file_range_inline(trans, root, inode,
-						    start, end,
+			ret = cow_file_range_inline(root, inode, start, end,
 						    total_compressed,
 						    compress_type, pages);
 		}
 		if (ret <= 0) {
+			unsigned long clear_flags = EXTENT_DELALLOC |
+				EXTENT_DEFRAG;
+			clear_flags |= (ret < 0) ? EXTENT_DO_ACCOUNTING : 0;
+
 			/*
 			 * inline extent creation worked or returned error,
 			 * we don't need to create any more async work items.
 			 * Unlock and free up our temp pages.
 			 */
-			extent_clear_unlock_delalloc(inode,
-			     &BTRFS_I(inode)->io_tree,
-			     start, end, NULL,
-			     EXTENT_CLEAR_UNLOCK_PAGE | EXTENT_CLEAR_DIRTY |
-			     EXTENT_CLEAR_DELALLOC |
-			     EXTENT_SET_WRITEBACK | EXTENT_END_WRITEBACK);
-
-			btrfs_end_transaction(trans, root);
+			extent_clear_unlock_delalloc(inode, start, end, NULL,
+						     clear_flags, PAGE_UNLOCK |
+						     PAGE_CLEAR_DIRTY |
+						     PAGE_SET_WRITEBACK |
+						     PAGE_END_WRITEBACK);
 			goto free_pages_out;
 		}
-		btrfs_end_transaction(trans, root);
 	}
 
 	if (will_compress) {
@@ -590,20 +591,6 @@ free_pages_out:
 	kfree(pages);
 
 	goto out;
-
-cleanup_and_out:
-	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
-				     start, end, NULL,
-				     EXTENT_CLEAR_UNLOCK_PAGE |
-				     EXTENT_CLEAR_DIRTY |
-				     EXTENT_CLEAR_DELALLOC |
-				     EXTENT_SET_WRITEBACK |
-				     EXTENT_END_WRITEBACK);
-	if (!trans || IS_ERR(trans))
-		btrfs_error(root->fs_info, ret, "Failed to join transaction");
-	else
-		btrfs_abort_transaction(trans, root, ret);
-	goto free_pages_out;
 }
 
 /*
@@ -617,7 +604,6 @@ static noinline int submit_compressed_extents(struct inode *inode,
 {
 	struct async_extent *async_extent;
 	u64 alloc_hint = 0;
-	struct btrfs_trans_handle *trans;
 	struct btrfs_key ins;
 	struct extent_map *em;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
@@ -678,20 +664,10 @@ retry:
 		lock_extent(io_tree, async_extent->start,
 			    async_extent->start + async_extent->ram_size - 1);
 
-		trans = btrfs_join_transaction(root);
-		if (IS_ERR(trans)) {
-			ret = PTR_ERR(trans);
-		} else {
-			trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-			ret = btrfs_reserve_extent(trans, root,
+		ret = btrfs_reserve_extent(root,
 					   async_extent->compressed_size,
 					   async_extent->compressed_size,
 					   0, alloc_hint, &ins, 1);
-			if (ret && ret != -ENOSPC)
-				btrfs_abort_transaction(trans, root, ret);
-			btrfs_end_transaction(trans, root);
-		}
-
 		if (ret) {
 			int i;
 
@@ -770,16 +746,12 @@ retry:
 		/*
 		 * clear dirty, set writeback and unlock the pages.
 		 */
-		extent_clear_unlock_delalloc(inode,
-				&BTRFS_I(inode)->io_tree,
-				async_extent->start,
+		extent_clear_unlock_delalloc(inode, async_extent->start,
 				async_extent->start +
 				async_extent->ram_size - 1,
-				NULL, EXTENT_CLEAR_UNLOCK_PAGE |
-				EXTENT_CLEAR_UNLOCK |
-				EXTENT_CLEAR_DELALLOC |
-				EXTENT_CLEAR_DIRTY | EXTENT_SET_WRITEBACK);
-
+				NULL, EXTENT_LOCKED | EXTENT_DELALLOC,
+				PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+				PAGE_SET_WRITEBACK);
 		ret = btrfs_submit_compressed_write(inode,
 				    async_extent->start,
 				    async_extent->ram_size,
@@ -798,16 +770,13 @@ out:
 out_free_reserve:
 	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
 out_free:
-	extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
-				     async_extent->start,
+	extent_clear_unlock_delalloc(inode, async_extent->start,
 				     async_extent->start +
 				     async_extent->ram_size - 1,
-				     NULL, EXTENT_CLEAR_UNLOCK_PAGE |
-				     EXTENT_CLEAR_UNLOCK |
-				     EXTENT_CLEAR_DELALLOC |
-				     EXTENT_CLEAR_DIRTY |
-				     EXTENT_SET_WRITEBACK |
-				     EXTENT_END_WRITEBACK);
+				     NULL, EXTENT_LOCKED | EXTENT_DELALLOC |
+				     EXTENT_DEFRAG | EXTENT_DO_ACCOUNTING,
+				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
 	kfree(async_extent);
 	goto again;
 }
@@ -857,14 +826,13 @@ static u64 get_extent_allocation_hint(struct inode *inode, u64 start,
  * required to start IO on it.  It may be clean and already done with
  * IO when we return.
  */
-static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
-				     struct inode *inode,
-				     struct btrfs_root *root,
-				     struct page *locked_page,
-				     u64 start, u64 end, int *page_started,
-				     unsigned long *nr_written,
-				     int unlock)
+static noinline int cow_file_range(struct inode *inode,
+				   struct page *locked_page,
+				   u64 start, u64 end, int *page_started,
+				   unsigned long *nr_written,
+				   int unlock)
 {
+	struct btrfs_root *root = BTRFS_I(inode)->root;
 	u64 alloc_hint = 0;
 	u64 num_bytes;
 	unsigned long ram_size;
@@ -885,29 +853,24 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
 	/* if this is a small write inside eof, kick off defrag */
 	if (num_bytes < 64 * 1024 &&
 	    (start > 0 || end + 1 < BTRFS_I(inode)->disk_i_size))
-		btrfs_add_inode_defrag(trans, inode);
+		btrfs_add_inode_defrag(NULL, inode);
 
 	if (start == 0) {
 		/* lets try to make an inline extent */
-		ret = cow_file_range_inline(trans, root, inode,
-					    start, end, 0, 0, NULL);
+		ret = cow_file_range_inline(root, inode, start, end, 0, 0,
+					    NULL);
 		if (ret == 0) {
-			extent_clear_unlock_delalloc(inode,
-				     &BTRFS_I(inode)->io_tree,
-				     start, end, NULL,
-				     EXTENT_CLEAR_UNLOCK_PAGE |
-				     EXTENT_CLEAR_UNLOCK |
-				     EXTENT_CLEAR_DELALLOC |
-				     EXTENT_CLEAR_DIRTY |
-				     EXTENT_SET_WRITEBACK |
-				     EXTENT_END_WRITEBACK);
+			extent_clear_unlock_delalloc(inode, start, end, NULL,
+				     EXTENT_LOCKED | EXTENT_DELALLOC |
+				     EXTENT_DEFRAG, PAGE_UNLOCK |
+				     PAGE_CLEAR_DIRTY | PAGE_SET_WRITEBACK |
+				     PAGE_END_WRITEBACK);
 
 			*nr_written = *nr_written +
 			     (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE;
 			*page_started = 1;
 			goto out;
 		} else if (ret < 0) {
-			btrfs_abort_transaction(trans, root, ret);
 			goto out_unlock;
 		}
 	}
@@ -922,13 +885,11 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
 		unsigned long op;
 
 		cur_alloc_size = disk_num_bytes;
-		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
+		ret = btrfs_reserve_extent(root, cur_alloc_size,
 					   root->sectorsize, 0, alloc_hint,
 					   &ins, 1);
-		if (ret < 0) {
-			btrfs_abort_transaction(trans, root, ret);
+		if (ret < 0)
 			goto out_unlock;
-		}
 
 		em = alloc_extent_map();
 		if (!em) {
@@ -974,10 +935,8 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
 			ret = btrfs_reloc_clone_csums(inode, start,
 						      cur_alloc_size);
-			if (ret) {
-				btrfs_abort_transaction(trans, root, ret);
+			if (ret)
 				goto out_reserve;
-			}
 		}
 
 		if (disk_num_bytes < cur_alloc_size)
@@ -990,13 +949,13 @@ static noinline int __cow_file_range(struct btrfs_trans_handle *trans,
 		 * Do set the Private2 bit so we know this page was properly
 		 * setup for writepage
 		 */
-		op = unlock ? EXTENT_CLEAR_UNLOCK_PAGE : 0;
-		op |= EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
-			EXTENT_SET_PRIVATE2;
+		op = unlock ? PAGE_UNLOCK : 0;
+		op |= PAGE_SET_PRIVATE2;
 
-		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
-					     start, start + ram_size - 1,
-					     locked_page, op);
+		extent_clear_unlock_delalloc(inode, start,
+					     start + ram_size - 1, locked_page,
+					     EXTENT_LOCKED | EXTENT_DELALLOC,
+					     op);
 		disk_num_bytes -= cur_alloc_size;
 		num_bytes -= cur_alloc_size;
 		alloc_hint = ins.objectid + ins.offset;
@@ -1008,52 +967,14 @@ out:
 out_reserve:
 	btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
 out_unlock:
-	extent_clear_unlock_delalloc(inode,
-		     &BTRFS_I(inode)->io_tree,
-		     start, end, locked_page,
-		     EXTENT_CLEAR_UNLOCK_PAGE |
-		     EXTENT_CLEAR_UNLOCK |
-		     EXTENT_CLEAR_DELALLOC |
-		     EXTENT_CLEAR_DIRTY |
-		     EXTENT_SET_WRITEBACK |
-		     EXTENT_END_WRITEBACK);
-
+	extent_clear_unlock_delalloc(inode, start, end, locked_page,
+				     EXTENT_LOCKED | EXTENT_DO_ACCOUNTING |
+				     EXTENT_DELALLOC | EXTENT_DEFRAG,
+				     PAGE_UNLOCK | PAGE_CLEAR_DIRTY |
+				     PAGE_SET_WRITEBACK | PAGE_END_WRITEBACK);
 	goto out;
 }
 
-static noinline int cow_file_range(struct inode *inode,
-				   struct page *locked_page,
-				   u64 start, u64 end, int *page_started,
-				   unsigned long *nr_written,
-				   int unlock)
-{
-	struct btrfs_trans_handle *trans;
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	int ret;
-
-	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans)) {
-		extent_clear_unlock_delalloc(inode,
-			     &BTRFS_I(inode)->io_tree,
-			     start, end, locked_page,
-			     EXTENT_CLEAR_UNLOCK_PAGE |
-			     EXTENT_CLEAR_UNLOCK |
-			     EXTENT_CLEAR_DELALLOC |
-			     EXTENT_CLEAR_DIRTY |
-			     EXTENT_SET_WRITEBACK |
-			     EXTENT_END_WRITEBACK);
-		return PTR_ERR(trans);
-	}
-	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-
-	ret = __cow_file_range(trans, inode, root, locked_page, start, end,
-			       page_started, nr_written, unlock);
-
-	btrfs_end_transaction(trans, root);
-
-	return ret;
-}
-
 /*
  * work queue call back to started compression on a file and pages
  */
@@ -1221,15 +1142,13 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 
 	path = btrfs_alloc_path();
 	if (!path) {
-		extent_clear_unlock_delalloc(inode,
-			     &BTRFS_I(inode)->io_tree,
-			     start, end, locked_page,
-			     EXTENT_CLEAR_UNLOCK_PAGE |
-			     EXTENT_CLEAR_UNLOCK |
-			     EXTENT_CLEAR_DELALLOC |
-			     EXTENT_CLEAR_DIRTY |
-			     EXTENT_SET_WRITEBACK |
-			     EXTENT_END_WRITEBACK);
+		extent_clear_unlock_delalloc(inode, start, end, locked_page,
+					     EXTENT_LOCKED | EXTENT_DELALLOC |
+					     EXTENT_DO_ACCOUNTING |
+					     EXTENT_DEFRAG, PAGE_UNLOCK |
+					     PAGE_CLEAR_DIRTY |
+					     PAGE_SET_WRITEBACK |
+					     PAGE_END_WRITEBACK);
 		return -ENOMEM;
 	}
 
@@ -1241,15 +1160,13 @@ static noinline int run_delalloc_nocow(struct inode *inode,
 		trans = btrfs_join_transaction(root);
 
 	if (IS_ERR(trans)) {
-		extent_clear_unlock_delalloc(inode,
-			     &BTRFS_I(inode)->io_tree,
-			     start, end, locked_page,
-			     EXTENT_CLEAR_UNLOCK_PAGE |
-			     EXTENT_CLEAR_UNLOCK |
-			     EXTENT_CLEAR_DELALLOC |
-			     EXTENT_CLEAR_DIRTY |
-			     EXTENT_SET_WRITEBACK |
-			     EXTENT_END_WRITEBACK);
+		extent_clear_unlock_delalloc(inode, start, end, locked_page,
+					     EXTENT_LOCKED | EXTENT_DELALLOC |
+					     EXTENT_DO_ACCOUNTING |
+					     EXTENT_DEFRAG, PAGE_UNLOCK |
+					     PAGE_CLEAR_DIRTY |
+					     PAGE_SET_WRITEBACK |
+					     PAGE_END_WRITEBACK);
 		btrfs_free_path(path);
 		return PTR_ERR(trans);
 	}
@@ -1369,9 +1286,9 @@ out_check:
 
 		btrfs_release_path(path);
 		if (cow_start != (u64)-1) {
-			ret = __cow_file_range(trans, inode, root, locked_page,
-					       cow_start, found_key.offset - 1,
-					       page_started, nr_written, 1);
+			ret = cow_file_range(inode, locked_page,
+					     cow_start, found_key.offset - 1,
+					     page_started, nr_written, 1);
 			if (ret) {
 				btrfs_abort_transaction(trans, root, ret);
 				goto error;
@@ -1428,11 +1345,11 @@ out_check:
 			}
 		}
 
-		extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree,
-				cur_offset, cur_offset + num_bytes - 1,
-				locked_page, EXTENT_CLEAR_UNLOCK_PAGE |
-				EXTENT_CLEAR_UNLOCK | EXTENT_CLEAR_DELALLOC |
-				EXTENT_SET_PRIVATE2);
+		extent_clear_unlock_delalloc(inode, cur_offset,
+					     cur_offset + num_bytes - 1,
+					     locked_page, EXTENT_LOCKED |
+					     EXTENT_DELALLOC, PAGE_UNLOCK |
+					     PAGE_SET_PRIVATE2);
 		cur_offset = extent_end;
 		if (cur_offset > end)
 			break;
@@ -1445,9 +1362,8 @@ out_check:
 	}
 
 	if (cow_start != (u64)-1) {
-		ret = __cow_file_range(trans, inode, root, locked_page,
-				       cow_start, end,
-				       page_started, nr_written, 1);
+		ret = cow_file_range(inode, locked_page, cow_start, end,
+				     page_started, nr_written, 1);
 		if (ret) {
 			btrfs_abort_transaction(trans, root, ret);
 			goto error;
@@ -1460,16 +1376,13 @@ error:
 		ret = err;
 
 	if (ret && cur_offset < end)
-		extent_clear_unlock_delalloc(inode,
-			     &BTRFS_I(inode)->io_tree,
-			     cur_offset, end, locked_page,
-			     EXTENT_CLEAR_UNLOCK_PAGE |
-			     EXTENT_CLEAR_UNLOCK |
-			     EXTENT_CLEAR_DELALLOC |
-			     EXTENT_CLEAR_DIRTY |
-			     EXTENT_SET_WRITEBACK |
-			     EXTENT_END_WRITEBACK);
-
+		extent_clear_unlock_delalloc(inode, cur_offset, end,
+					     locked_page, EXTENT_LOCKED |
+					     EXTENT_DELALLOC | EXTENT_DEFRAG |
+					     EXTENT_DO_ACCOUNTING, PAGE_UNLOCK |
+					     PAGE_CLEAR_DIRTY |
+					     PAGE_SET_WRITEBACK |
+					     PAGE_END_WRITEBACK);
 	btrfs_free_path(path);
 	return ret;
 }
@@ -2132,6 +2045,7 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
 		WARN_ON(1);
 		return ret;
 	}
+	ret = 0;
 
 	while (1) {
 		cond_resched();
@@ -2181,8 +2095,6 @@ static noinline int record_one_backref(u64 inum, u64 offset, u64 root_id,
 		    old->len || extent_offset + num_bytes <=
 		    old->extent_offset + old->offset)
 			continue;
-
-		ret = 0;
 		break;
 	}
 
@@ -2238,16 +2150,18 @@ static noinline bool record_extent_backrefs(struct btrfs_path *path,
 
 static int relink_is_mergable(struct extent_buffer *leaf,
 			      struct btrfs_file_extent_item *fi,
-			      u64 disk_bytenr)
+			      struct new_sa_defrag_extent *new)
 {
-	if (btrfs_file_extent_disk_bytenr(leaf, fi) != disk_bytenr)
+	if (btrfs_file_extent_disk_bytenr(leaf, fi) != new->bytenr)
 		return 0;
 
 	if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
 		return 0;
 
-	if (btrfs_file_extent_compression(leaf, fi) ||
-	    btrfs_file_extent_encryption(leaf, fi) ||
+	if (btrfs_file_extent_compression(leaf, fi) != new->compress_type)
+		return 0;
+
+	if (btrfs_file_extent_encryption(leaf, fi) ||
 	    btrfs_file_extent_other_encoding(leaf, fi))
 		return 0;
 
@@ -2391,8 +2305,8 @@ again:
 				    struct btrfs_file_extent_item);
 		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
 
-		if (relink_is_mergable(leaf, fi, new->bytenr) &&
-		    extent_len + found_key.offset == start) {
+		if (extent_len + found_key.offset == start &&
+		    relink_is_mergable(leaf, fi, new)) {
 			btrfs_set_file_extent_num_bytes(leaf, fi,
 							extent_len + len);
 			btrfs_mark_buffer_dirty(leaf);
@@ -2648,8 +2562,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 	struct extent_state *cached_state = NULL;
 	struct new_sa_defrag_extent *new = NULL;
 	int compress_type = 0;
-	int ret;
+	int ret = 0;
+	u64 logical_len = ordered_extent->len;
 	bool nolock;
+	bool truncated = false;
 
 	nolock = btrfs_is_free_space_inode(inode);
 
@@ -2658,6 +2574,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 		goto out;
 	}
 
+	if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) {
+		truncated = true;
+		logical_len = ordered_extent->truncated_len;
+		/* Truncated the entire extent, don't bother adding */
+		if (!logical_len)
+			goto out;
+	}
+
 	if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) {
 		BUG_ON(!list_empty(&ordered_extent->list)); /* Logic error */
 		btrfs_ordered_update_i_size(inode, 0, ordered_extent);
@@ -2713,15 +2637,14 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 		ret = btrfs_mark_extent_written(trans, inode,
 						ordered_extent->file_offset,
 						ordered_extent->file_offset +
-						ordered_extent->len);
+						logical_len);
 	} else {
 		BUG_ON(root == root->fs_info->tree_root);
 		ret = insert_reserved_file_extent(trans, inode,
 						ordered_extent->file_offset,
 						ordered_extent->start,
 						ordered_extent->disk_len,
-						ordered_extent->len,
-						ordered_extent->len,
+						logical_len, logical_len,
 						compress_type, 0, 0,
 						BTRFS_FILE_EXTENT_REG);
 	}
@@ -2753,17 +2676,27 @@ out:
 	if (trans)
 		btrfs_end_transaction(trans, root);
 
-	if (ret) {
-		clear_extent_uptodate(io_tree, ordered_extent->file_offset,
-				      ordered_extent->file_offset +
-				      ordered_extent->len - 1, NULL, GFP_NOFS);
+	if (ret || truncated) {
+		u64 start, end;
+
+		if (truncated)
+			start = ordered_extent->file_offset + logical_len;
+		else
+			start = ordered_extent->file_offset;
+		end = ordered_extent->file_offset + ordered_extent->len - 1;
+		clear_extent_uptodate(io_tree, start, end, NULL, GFP_NOFS);
+
+		/* Drop the cache for the part of the extent we didn't write. */
+		btrfs_drop_extent_cache(inode, start, end, 0);
 
 		/*
 		 * If the ordered extent had an IOERR or something else went
 		 * wrong we need to return the space for this ordered extent
-		 * back to the allocator.
+		 * back to the allocator.  We only free the extent in the
+		 * truncated case if we didn't write out the extent at all.
 		 */
-		if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
+		if ((ret || !logical_len) &&
+		    !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) &&
 		    !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags))
 			btrfs_free_reserved_extent(root, ordered_extent->start,
 						   ordered_extent->disk_len);
@@ -2827,16 +2760,16 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
  * if there's a match, we allow the bio to finish.  If not, the code in
  * extent_io.c will try to find good copies for us.
  */
-static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
-			       struct extent_state *state, int mirror)
+static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio,
+				      u64 phy_offset, struct page *page,
+				      u64 start, u64 end, int mirror)
 {
 	size_t offset = start - page_offset(page);
 	struct inode *inode = page->mapping->host;
 	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
 	char *kaddr;
-	u64 private = ~(u32)0;
-	int ret;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u32 csum_expected;
 	u32 csum = ~(u32)0;
 	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
 	                              DEFAULT_RATELIMIT_BURST);
@@ -2856,19 +2789,13 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 		return 0;
 	}
 
-	if (state && state->start == start) {
-		private = state->private;
-		ret = 0;
-	} else {
-		ret = get_state_private(io_tree, start, &private);
-	}
-	kaddr = kmap_atomic(page);
-	if (ret)
-		goto zeroit;
+	phy_offset >>= inode->i_sb->s_blocksize_bits;
+	csum_expected = *(((u32 *)io_bio->csum) + phy_offset);
 
+	kaddr = kmap_atomic(page);
 	csum = btrfs_csum_data(kaddr + offset, csum,  end - start + 1);
 	btrfs_csum_final(csum, (char *)&csum);
-	if (csum != private)
+	if (csum != csum_expected)
 		goto zeroit;
 
 	kunmap_atomic(kaddr);
@@ -2877,14 +2804,12 @@ good:
 
 zeroit:
 	if (__ratelimit(&_rs))
-		btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u private %llu",
-			(unsigned long long)btrfs_ino(page->mapping->host),
-			(unsigned long long)start, csum,
-			(unsigned long long)private);
+		btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
+			btrfs_ino(page->mapping->host), start, csum, csum_expected);
 	memset(kaddr + offset, 1, end - start + 1);
 	flush_dcache_page(page);
 	kunmap_atomic(kaddr);
-	if (private == 0)
+	if (csum_expected == 0)
 		return 0;
 	return -EIO;
 }
@@ -2971,8 +2896,10 @@ void btrfs_orphan_commit_root(struct btrfs_trans_handle *trans,
 	    btrfs_root_refs(&root->root_item) > 0) {
 		ret = btrfs_del_orphan_item(trans, root->fs_info->tree_root,
 					    root->root_key.objectid);
-		BUG_ON(ret);
-		root->orphan_item_inserted = 0;
+		if (ret)
+			btrfs_abort_transaction(trans, root, ret);
+		else
+			root->orphan_item_inserted = 0;
 	}
 
 	if (block_rsv) {
@@ -3041,11 +2968,18 @@ int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode)
 	/* insert an orphan item to track this unlinked/truncated file */
 	if (insert >= 1) {
 		ret = btrfs_insert_orphan_item(trans, root, btrfs_ino(inode));
-		if (ret && ret != -EEXIST) {
-			clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
-				  &BTRFS_I(inode)->runtime_flags);
-			btrfs_abort_transaction(trans, root, ret);
-			return ret;
+		if (ret) {
+			if (reserve) {
+				clear_bit(BTRFS_INODE_ORPHAN_META_RESERVED,
+					  &BTRFS_I(inode)->runtime_flags);
+				btrfs_orphan_release_metadata(inode);
+			}
+			if (ret != -EEXIST) {
+				clear_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
+					  &BTRFS_I(inode)->runtime_flags);
+				btrfs_abort_transaction(trans, root, ret);
+				return ret;
+			}
 		}
 		ret = 0;
 	}
@@ -3084,17 +3018,15 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans,
 		release_rsv = 1;
 	spin_unlock(&root->orphan_lock);
 
-	if (trans && delete_item) {
+	if (trans && delete_item)
 		ret = btrfs_del_orphan_item(trans, root, btrfs_ino(inode));
-		BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
-	}
 
 	if (release_rsv) {
 		btrfs_orphan_release_metadata(inode);
 		atomic_dec(&root->orphan_inodes);
 	}
 
-	return 0;
+	return ret;
 }
 
 /*
@@ -3224,8 +3156,9 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
 				found_key.objectid);
 			ret = btrfs_del_orphan_item(trans, root,
 						    found_key.objectid);
-			BUG_ON(ret); /* -ENOMEM or corruption (JDM: Recheck) */
 			btrfs_end_transaction(trans, root);
+			if (ret)
+				goto out;
 			continue;
 		}
 
@@ -3657,8 +3590,7 @@ static int __btrfs_unlink_inode(struct btrfs_trans_handle *trans,
 	if (ret) {
 		btrfs_info(root->fs_info,
 			"failed to delete reference to %.*s, inode %llu parent %llu",
-			name_len, name,
-			(unsigned long long)ino, (unsigned long long)dir_ino);
+			name_len, name, ino, dir_ino);
 		btrfs_abort_transaction(trans, root, ret);
 		goto err;
 	}
@@ -3929,6 +3861,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
 	u64 extent_num_bytes = 0;
 	u64 extent_offset = 0;
 	u64 item_end = 0;
+	u64 last_size = (u64)-1;
 	u32 found_type = (u8)-1;
 	int found_extent;
 	int del_item;
@@ -4026,6 +3959,11 @@ search_again:
 		if (found_type != BTRFS_EXTENT_DATA_KEY)
 			goto delete;
 
+		if (del_item)
+			last_size = found_key.offset;
+		else
+			last_size = new_size;
+
 		if (extent_type != BTRFS_FILE_EXTENT_INLINE) {
 			u64 num_dec;
 			extent_start = btrfs_file_extent_disk_bytenr(leaf, fi);
@@ -4137,6 +4075,8 @@ out:
 			btrfs_abort_transaction(trans, root, ret);
 	}
 error:
+	if (last_size != (u64)-1)
+		btrfs_ordered_update_i_size(inode, last_size, NULL);
 	btrfs_free_path(path);
 	return err;
 }
@@ -4409,7 +4349,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 		inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb);
 
 	if (newsize > oldsize) {
-		truncate_pagecache(inode, oldsize, newsize);
+		truncate_pagecache(inode, newsize);
 		ret = btrfs_cont_expand(inode, oldsize, newsize);
 		if (ret)
 			return ret;
@@ -4465,8 +4405,26 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
 		btrfs_inode_resume_unlocked_dio(inode);
 
 		ret = btrfs_truncate(inode);
-		if (ret && inode->i_nlink)
-			btrfs_orphan_del(NULL, inode);
+		if (ret && inode->i_nlink) {
+			int err;
+
+			/*
+			 * failed to truncate, disk_i_size is only adjusted down
+			 * as we remove extents, so it should represent the true
+			 * size of the inode, so reset the in memory size and
+			 * delete our orphan entry.
+			 */
+			trans = btrfs_join_transaction(root);
+			if (IS_ERR(trans)) {
+				btrfs_orphan_del(NULL, inode);
+				return ret;
+			}
+			i_size_write(inode, BTRFS_I(inode)->disk_i_size);
+			err = btrfs_orphan_del(trans, inode);
+			if (err)
+				btrfs_abort_transaction(trans, root, err);
+			btrfs_end_transaction(trans, root);
+		}
 	}
 
 	return ret;
@@ -4601,10 +4559,15 @@ void btrfs_evict_inode(struct inode *inode)
 
 	btrfs_free_block_rsv(root, rsv);
 
+	/*
+	 * Errors here aren't a big deal, it just means we leave orphan items
+	 * in the tree.  They will be cleaned up on the next mount.
+	 */
 	if (ret == 0) {
 		trans->block_rsv = root->orphan_block_rsv;
-		ret = btrfs_orphan_del(trans, inode);
-		BUG_ON(ret);
+		btrfs_orphan_del(trans, inode);
+	} else {
+		btrfs_orphan_del(NULL, inode);
 	}
 
 	trans->block_rsv = &root->fs_info->trans_block_rsv;
@@ -6161,10 +6124,7 @@ insert:
 	btrfs_release_path(path);
 	if (em->start > start || extent_map_end(em) <= start) {
 		btrfs_err(root->fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]",
-			(unsigned long long)em->start,
-			(unsigned long long)em->len,
-			(unsigned long long)start,
-			(unsigned long long)len);
+			em->start, em->len, start, len);
 		err = -EIO;
 		goto out;
 	}
@@ -6362,39 +6322,32 @@ static struct extent_map *btrfs_new_extent_direct(struct inode *inode,
 						  u64 start, u64 len)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
 	struct extent_map *em;
 	struct btrfs_key ins;
 	u64 alloc_hint;
 	int ret;
 
-	trans = btrfs_join_transaction(root);
-	if (IS_ERR(trans))
-		return ERR_CAST(trans);
-
-	trans->block_rsv = &root->fs_info->delalloc_block_rsv;
-
 	alloc_hint = get_extent_allocation_hint(inode, start, len);
-	ret = btrfs_reserve_extent(trans, root, len, root->sectorsize, 0,
+	ret = btrfs_reserve_extent(root, len, root->sectorsize, 0,
 				   alloc_hint, &ins, 1);
-	if (ret) {
-		em = ERR_PTR(ret);
-		goto out;
-	}
+	if (ret)
+		return ERR_PTR(ret);
 
 	em = create_pinned_em(inode, start, ins.offset, start, ins.objectid,
 			      ins.offset, ins.offset, ins.offset, 0);
-	if (IS_ERR(em))
-		goto out;
+	if (IS_ERR(em)) {
+		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
+		return em;
+	}
 
 	ret = btrfs_add_ordered_extent_dio(inode, start, ins.objectid,
 					   ins.offset, ins.offset, 0);
 	if (ret) {
 		btrfs_free_reserved_extent(root, ins.objectid, ins.offset);
-		em = ERR_PTR(ret);
+		free_extent_map(em);
+		return ERR_PTR(ret);
 	}
-out:
-	btrfs_end_transaction(trans, root);
+
 	return em;
 }
 
@@ -6402,11 +6355,11 @@ out:
  * returns 1 when the nocow is safe, < 1 on error, 0 if the
  * block must be cow'd
  */
-noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
-			      struct inode *inode, u64 offset, u64 *len,
+noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
 			      u64 *orig_start, u64 *orig_block_len,
 			      u64 *ram_bytes)
 {
+	struct btrfs_trans_handle *trans;
 	struct btrfs_path *path;
 	int ret;
 	struct extent_buffer *leaf;
@@ -6424,7 +6377,7 @@ noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
 	if (!path)
 		return -ENOMEM;
 
-	ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode),
+	ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode),
 				       offset, 0);
 	if (ret < 0)
 		goto out;
@@ -6489,9 +6442,19 @@ noinline int can_nocow_extent(struct btrfs_trans_handle *trans,
 	 * look for other files referencing this extent, if we
 	 * find any we must cow
 	 */
-	if (btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
-				  key.offset - backref_offset, disk_bytenr))
+	trans = btrfs_join_transaction(root);
+	if (IS_ERR(trans)) {
+		ret = 0;
 		goto out;
+	}
+
+	ret = btrfs_cross_ref_exist(trans, root, btrfs_ino(inode),
+				    key.offset - backref_offset, disk_bytenr);
+	btrfs_end_transaction(trans, root);
+	if (ret) {
+		ret = 0;
+		goto out;
+	}
 
 	/*
 	 * adjust disk_bytenr and num_bytes to cover just the bytes
@@ -6633,7 +6596,6 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 	u64 start = iblock << inode->i_blkbits;
 	u64 lockstart, lockend;
 	u64 len = bh_result->b_size;
-	struct btrfs_trans_handle *trans;
 	int unlock_bits = EXTENT_LOCKED;
 	int ret = 0;
 
@@ -6715,16 +6677,7 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 		len = min(len, em->len - (start - em->start));
 		block_start = em->block_start + (start - em->start);
 
-		/*
-		 * we're not going to log anything, but we do need
-		 * to make sure the current transaction stays open
-		 * while we look for nocow cross refs
-		 */
-		trans = btrfs_join_transaction(root);
-		if (IS_ERR(trans))
-			goto must_cow;
-
-		if (can_nocow_extent(trans, inode, start, &len, &orig_start,
+		if (can_nocow_extent(inode, start, &len, &orig_start,
 				     &orig_block_len, &ram_bytes) == 1) {
 			if (type == BTRFS_ORDERED_PREALLOC) {
 				free_extent_map(em);
@@ -6733,24 +6686,20 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock,
 						       block_start, len,
 						       orig_block_len,
 						       ram_bytes, type);
-				if (IS_ERR(em)) {
-					btrfs_end_transaction(trans, root);
+				if (IS_ERR(em))
 					goto unlock_err;
-				}
 			}
 
 			ret = btrfs_add_ordered_extent_dio(inode, start,
 					   block_start, len, len, type);
-			btrfs_end_transaction(trans, root);
 			if (ret) {
 				free_extent_map(em);
 				goto unlock_err;
 			}
 			goto unlock;
 		}
-		btrfs_end_transaction(trans, root);
 	}
-must_cow:
+
 	/*
 	 * this will cow the extent, reset the len in case we changed
 	 * it above
@@ -6813,26 +6762,6 @@ unlock_err:
 	return ret;
 }
 
-struct btrfs_dio_private {
-	struct inode *inode;
-	u64 logical_offset;
-	u64 disk_bytenr;
-	u64 bytes;
-	void *private;
-
-	/* number of bios pending for this dio */
-	atomic_t pending_bios;
-
-	/* IO errors */
-	int errors;
-
-	/* orig_bio is our btrfs_io_bio */
-	struct bio *orig_bio;
-
-	/* dio_bio came from fs/direct-io.c */
-	struct bio *dio_bio;
-};
-
 static void btrfs_endio_direct_read(struct bio *bio, int err)
 {
 	struct btrfs_dio_private *dip = bio->bi_private;
@@ -6841,6 +6770,8 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
 	struct inode *inode = dip->inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct bio *dio_bio;
+	u32 *csums = (u32 *)dip->csum;
+	int index = 0;
 	u64 start;
 
 	start = dip->logical_offset;
@@ -6849,12 +6780,8 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
 			struct page *page = bvec->bv_page;
 			char *kaddr;
 			u32 csum = ~(u32)0;
-			u64 private = ~(u32)0;
 			unsigned long flags;
 
-			if (get_state_private(&BTRFS_I(inode)->io_tree,
-					      start, &private))
-				goto failed;
 			local_irq_save(flags);
 			kaddr = kmap_atomic(page);
 			csum = btrfs_csum_data(kaddr + bvec->bv_offset,
@@ -6864,18 +6791,17 @@ static void btrfs_endio_direct_read(struct bio *bio, int err)
 			local_irq_restore(flags);
 
 			flush_dcache_page(bvec->bv_page);
-			if (csum != private) {
-failed:
-				btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u private %u",
-					(unsigned long long)btrfs_ino(inode),
-					(unsigned long long)start,
-					csum, (unsigned)private);
+			if (csum != csums[index]) {
+				btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u",
+					  btrfs_ino(inode), start, csum,
+					  csums[index]);
 				err = -EIO;
 			}
 		}
 
 		start += bvec->bv_len;
 		bvec++;
+		index++;
 	} while (bvec <= bvec_end);
 
 	unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset,
@@ -6956,7 +6882,7 @@ static void btrfs_end_dio_bio(struct bio *bio, int err)
 	if (err) {
 		printk(KERN_ERR "btrfs direct IO failed ino %llu rw %lu "
 		      "sector %#Lx len %u err no %d\n",
-		      (unsigned long long)btrfs_ino(dip->inode), bio->bi_rw,
+		      btrfs_ino(dip->inode), bio->bi_rw,
 		      (unsigned long long)bio->bi_sector, bio->bi_size, err);
 		dip->errors = 1;
 
@@ -6992,6 +6918,7 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
 					 int rw, u64 file_offset, int skip_sum,
 					 int async_submit)
 {
+	struct btrfs_dio_private *dip = bio->bi_private;
 	int write = rw & REQ_WRITE;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret;
@@ -7026,7 +6953,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
 		if (ret)
 			goto err;
 	} else if (!skip_sum) {
-		ret = btrfs_lookup_bio_sums_dio(root, inode, bio, file_offset);
+		ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio,
+						file_offset);
 		if (ret)
 			goto err;
 	}
@@ -7061,6 +6989,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip,
 		bio_put(orig_bio);
 		return -EIO;
 	}
+
 	if (map_length >= orig_bio->bi_size) {
 		bio = orig_bio;
 		goto submit;
@@ -7156,19 +7085,28 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio,
 	struct btrfs_dio_private *dip;
 	struct bio *io_bio;
 	int skip_sum;
+	int sum_len;
 	int write = rw & REQ_WRITE;
 	int ret = 0;
+	u16 csum_size;
 
 	skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM;
 
 	io_bio = btrfs_bio_clone(dio_bio, GFP_NOFS);
-
 	if (!io_bio) {
 		ret = -ENOMEM;
 		goto free_ordered;
 	}
 
-	dip = kmalloc(sizeof(*dip), GFP_NOFS);
+	if (!skip_sum && !write) {
+		csum_size = btrfs_super_csum_size(root->fs_info->super_copy);
+		sum_len = dio_bio->bi_size >> inode->i_sb->s_blocksize_bits;
+		sum_len *= csum_size;
+	} else {
+		sum_len = 0;
+	}
+
+	dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS);
 	if (!dip) {
 		ret = -ENOMEM;
 		goto free_io_bio;
@@ -7443,10 +7381,23 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset,
 		 * whoever cleared the private bit is responsible
 		 * for the finish_ordered_io
 		 */
-		if (TestClearPagePrivate2(page) &&
-		    btrfs_dec_test_ordered_pending(inode, &ordered, page_start,
-						   PAGE_CACHE_SIZE, 1)) {
-			btrfs_finish_ordered_io(ordered);
+		if (TestClearPagePrivate2(page)) {
+			struct btrfs_ordered_inode_tree *tree;
+			u64 new_len;
+
+			tree = &BTRFS_I(inode)->ordered_tree;
+
+			spin_lock_irq(&tree->lock);
+			set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
+			new_len = page_start - ordered->file_offset;
+			if (new_len < ordered->truncated_len)
+				ordered->truncated_len = new_len;
+			spin_unlock_irq(&tree->lock);
+
+			if (btrfs_dec_test_ordered_pending(inode, &ordered,
+							   page_start,
+							   PAGE_CACHE_SIZE, 1))
+				btrfs_finish_ordered_io(ordered);
 		}
 		btrfs_put_ordered_extent(ordered);
 		cached_state = NULL;
@@ -7612,7 +7563,6 @@ static int btrfs_truncate(struct inode *inode)
 	u64 min_size = btrfs_calc_trunc_metadata_size(root, 1);
 
 	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
-	btrfs_ordered_update_i_size(inode, inode->i_size, NULL);
 
 	/*
 	 * Yes ladies and gentelment, this is indeed ugly.  The fact is we have
@@ -7876,7 +7826,7 @@ void btrfs_destroy_inode(struct inode *inode)
 	if (test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM,
 		     &BTRFS_I(inode)->runtime_flags)) {
 		btrfs_info(root->fs_info, "inode %llu still on the orphan list",
-			(unsigned long long)btrfs_ino(inode));
+			btrfs_ino(inode));
 		atomic_dec(&root->orphan_inodes);
 	}
 
@@ -7886,8 +7836,7 @@ void btrfs_destroy_inode(struct inode *inode)
 			break;
 		else {
 			btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup",
-				(unsigned long long)ordered->file_offset,
-				(unsigned long long)ordered->len);
+				ordered->file_offset, ordered->len);
 			btrfs_remove_ordered_extent(inode, ordered);
 			btrfs_put_ordered_extent(ordered);
 			btrfs_put_ordered_extent(ordered);
@@ -8161,10 +8110,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
 						 new_dentry->d_name.name,
 						 new_dentry->d_name.len);
 		}
-		if (!ret && new_inode->i_nlink == 0) {
+		if (!ret && new_inode->i_nlink == 0)
 			ret = btrfs_orphan_add(trans, new_dentry->d_inode);
-			BUG_ON(ret);
-		}
 		if (ret) {
 			btrfs_abort_transaction(trans, root, ret);
 			goto out_fail;
@@ -8525,8 +8472,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
 
 		cur_bytes = min(num_bytes, 256ULL * 1024 * 1024);
 		cur_bytes = max(cur_bytes, min_size);
-		ret = btrfs_reserve_extent(trans, root, cur_bytes,
-					   min_size, 0, *alloc_hint, &ins, 1);
+		ret = btrfs_reserve_extent(root, cur_bytes, min_size, 0,
+					   *alloc_hint, &ins, 1);
 		if (ret) {
 			if (own_trans)
 				btrfs_end_transaction(trans, root);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 238a05545ee2..1a5b9462dd9a 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -43,6 +43,7 @@
 #include <linux/blkdev.h>
 #include <linux/uuid.h>
 #include <linux/btrfs.h>
+#include <linux/uaccess.h>
 #include "compat.h"
 #include "ctree.h"
 #include "disk-io.h"
@@ -57,6 +58,9 @@
 #include "send.h"
 #include "dev-replace.h"
 
+static int btrfs_clone(struct inode *src, struct inode *inode,
+		       u64 off, u64 olen, u64 olen_aligned, u64 destoff);
+
 /* Mask out flags that are inappropriate for the given type of inode. */
 static inline __u32 btrfs_mask_flags(umode_t mode, __u32 flags)
 {
@@ -363,6 +367,13 @@ static noinline int btrfs_ioctl_fitrim(struct file *file, void __user *arg)
 	return 0;
 }
 
+int btrfs_is_empty_uuid(u8 *uuid)
+{
+	static char empty_uuid[BTRFS_UUID_SIZE] = {0};
+
+	return !memcmp(uuid, empty_uuid, BTRFS_UUID_SIZE);
+}
+
 static noinline int create_subvol(struct inode *dir,
 				  struct dentry *dentry,
 				  char *name, int namelen,
@@ -396,7 +407,7 @@ static noinline int create_subvol(struct inode *dir,
 	 * of create_snapshot().
 	 */
 	ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
-					       7, &qgroup_reserved);
+					       8, &qgroup_reserved, false);
 	if (ret)
 		return ret;
 
@@ -425,26 +436,25 @@ static noinline int create_subvol(struct inode *dir,
 	btrfs_set_header_backref_rev(leaf, BTRFS_MIXED_BACKREF_REV);
 	btrfs_set_header_owner(leaf, objectid);
 
-	write_extent_buffer(leaf, root->fs_info->fsid,
-			    (unsigned long)btrfs_header_fsid(leaf),
+	write_extent_buffer(leaf, root->fs_info->fsid, btrfs_header_fsid(leaf),
 			    BTRFS_FSID_SIZE);
 	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
-			    (unsigned long)btrfs_header_chunk_tree_uuid(leaf),
+			    btrfs_header_chunk_tree_uuid(leaf),
 			    BTRFS_UUID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
 
 	memset(&root_item, 0, sizeof(root_item));
 
 	inode_item = &root_item.inode;
-	inode_item->generation = cpu_to_le64(1);
-	inode_item->size = cpu_to_le64(3);
-	inode_item->nlink = cpu_to_le32(1);
-	inode_item->nbytes = cpu_to_le64(root->leafsize);
-	inode_item->mode = cpu_to_le32(S_IFDIR | 0755);
+	btrfs_set_stack_inode_generation(inode_item, 1);
+	btrfs_set_stack_inode_size(inode_item, 3);
+	btrfs_set_stack_inode_nlink(inode_item, 1);
+	btrfs_set_stack_inode_nbytes(inode_item, root->leafsize);
+	btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
 
-	root_item.flags = 0;
-	root_item.byte_limit = 0;
-	inode_item->flags = cpu_to_le64(BTRFS_INODE_ROOT_ITEM_INIT);
+	btrfs_set_root_flags(&root_item, 0);
+	btrfs_set_root_limit(&root_item, 0);
+	btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
 
 	btrfs_set_root_bytenr(&root_item, leaf->start);
 	btrfs_set_root_generation(&root_item, trans->transid);
@@ -457,8 +467,8 @@ static noinline int create_subvol(struct inode *dir,
 			btrfs_root_generation(&root_item));
 	uuid_le_gen(&new_uuid);
 	memcpy(root_item.uuid, new_uuid.b, BTRFS_UUID_SIZE);
-	root_item.otime.sec = cpu_to_le64(cur_time.tv_sec);
-	root_item.otime.nsec = cpu_to_le32(cur_time.tv_nsec);
+	btrfs_set_stack_timespec_sec(&root_item.otime, cur_time.tv_sec);
+	btrfs_set_stack_timespec_nsec(&root_item.otime, cur_time.tv_nsec);
 	root_item.ctime = root_item.otime;
 	btrfs_set_root_ctransid(&root_item, trans->transid);
 	btrfs_set_root_otransid(&root_item, trans->transid);
@@ -518,9 +528,14 @@ static noinline int create_subvol(struct inode *dir,
 	ret = btrfs_add_root_ref(trans, root->fs_info->tree_root,
 				 objectid, root->root_key.objectid,
 				 btrfs_ino(dir), index, name, namelen);
-
 	BUG_ON(ret);
 
+	ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
+				  root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+				  objectid);
+	if (ret)
+		btrfs_abort_transaction(trans, root, ret);
+
 fail:
 	trans->block_rsv = NULL;
 	trans->bytes_reserved = 0;
@@ -573,10 +588,12 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 	 * 1 - root item
 	 * 2 - root ref/backref
 	 * 1 - root of snapshot
+	 * 1 - UUID item
 	 */
 	ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
-					&pending_snapshot->block_rsv, 7,
-					&pending_snapshot->qgroup_reserved);
+					&pending_snapshot->block_rsv, 8,
+					&pending_snapshot->qgroup_reserved,
+					false);
 	if (ret)
 		goto out;
 
@@ -1267,9 +1284,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			cluster = max_cluster;
 		}
 
-		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
-			BTRFS_I(inode)->force_compress = compress_type;
-
 		if (i + cluster > ra_index) {
 			ra_index = max(i, ra_index);
 			btrfs_force_ra(inode->i_mapping, ra, file, ra_index,
@@ -1278,6 +1292,8 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 		}
 
 		mutex_lock(&inode->i_mutex);
+		if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS)
+			BTRFS_I(inode)->force_compress = compress_type;
 		ret = cluster_pages_for_defrag(inode, pages, i, cluster);
 		if (ret < 0) {
 			mutex_unlock(&inode->i_mutex);
@@ -1334,10 +1350,6 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 			    atomic_read(&root->fs_info->async_delalloc_pages) == 0));
 		}
 		atomic_dec(&root->fs_info->async_submit_draining);
-
-		mutex_lock(&inode->i_mutex);
-		BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
-		mutex_unlock(&inode->i_mutex);
 	}
 
 	if (range->compress_type == BTRFS_COMPRESS_LZO) {
@@ -1347,6 +1359,11 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
 	ret = defrag_count;
 
 out_ra:
+	if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) {
+		mutex_lock(&inode->i_mutex);
+		BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE;
+		mutex_unlock(&inode->i_mutex);
+	}
 	if (!file)
 		kfree(ra);
 	kfree(pages);
@@ -1377,9 +1394,8 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 
 	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
 			1)) {
-		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
 		mnt_drop_write_file(file);
-		return -EINVAL;
+		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
 	}
 
 	mutex_lock(&root->fs_info->volume_mutex);
@@ -1403,14 +1419,13 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 			ret = -EINVAL;
 			goto out_free;
 		}
-		printk(KERN_INFO "btrfs: resizing devid %llu\n",
-		       (unsigned long long)devid);
+		printk(KERN_INFO "btrfs: resizing devid %llu\n", devid);
 	}
 
 	device = btrfs_find_device(root->fs_info, devid, NULL, NULL);
 	if (!device) {
 		printk(KERN_INFO "btrfs: resizer unable to find device %llu\n",
-		       (unsigned long long)devid);
+		       devid);
 		ret = -ENODEV;
 		goto out_free;
 	}
@@ -1418,7 +1433,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 	if (!device->writeable) {
 		printk(KERN_INFO "btrfs: resizer unable to apply on "
 		       "readonly device %llu\n",
-		       (unsigned long long)devid);
+		       devid);
 		ret = -EPERM;
 		goto out_free;
 	}
@@ -1470,8 +1485,7 @@ static noinline int btrfs_ioctl_resize(struct file *file,
 	new_size *= root->sectorsize;
 
 	printk_in_rcu(KERN_INFO "btrfs: new size for %s is %llu\n",
-		      rcu_str_deref(device->name),
-		      (unsigned long long)new_size);
+		      rcu_str_deref(device->name), new_size);
 
 	if (new_size > old_size) {
 		trans = btrfs_start_transaction(root, 0);
@@ -1721,13 +1735,28 @@ out:
 static noinline int may_destroy_subvol(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
+	struct btrfs_dir_item *di;
 	struct btrfs_key key;
+	u64 dir_id;
 	int ret;
 
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
+	/* Make sure this root isn't set as the default subvol */
+	dir_id = btrfs_super_root_dir(root->fs_info->super_copy);
+	di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, path,
+				   dir_id, "default", 7, 0);
+	if (di && !IS_ERR(di)) {
+		btrfs_dir_item_key_to_cpu(path->nodes[0], di, &key);
+		if (key.objectid == root->root_key.objectid) {
+			ret = -ENOTEMPTY;
+			goto out;
+		}
+		btrfs_release_path(path);
+	}
+
 	key.objectid = root->root_key.objectid;
 	key.type = BTRFS_ROOT_REF_KEY;
 	key.offset = (u64)-1;
@@ -1993,25 +2022,29 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 		if (ret < 0)
 			goto out;
+		else if (ret > 0) {
+			ret = btrfs_previous_item(root, path, dirid,
+						  BTRFS_INODE_REF_KEY);
+			if (ret < 0)
+				goto out;
+			else if (ret > 0) {
+				ret = -ENOENT;
+				goto out;
+			}
+		}
 
 		l = path->nodes[0];
 		slot = path->slots[0];
-		if (ret > 0 && slot > 0)
-			slot--;
 		btrfs_item_key_to_cpu(l, &key, slot);
 
-		if (ret > 0 && (key.objectid != dirid ||
-				key.type != BTRFS_INODE_REF_KEY)) {
-			ret = -ENOENT;
-			goto out;
-		}
-
 		iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
 		len = btrfs_inode_ref_name_len(l, iref);
 		ptr -= len + 1;
 		total_len += len + 1;
-		if (ptr < name)
+		if (ptr < name) {
+			ret = -ENAMETOOLONG;
 			goto out;
+		}
 
 		*(ptr + len) = '/';
 		read_extent_buffer(l, ptr,(unsigned long)(iref + 1), len);
@@ -2024,8 +2057,6 @@ static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
 		key.offset = (u64)-1;
 		dirid = key.objectid;
 	}
-	if (ptr < name)
-		goto out;
 	memmove(name, ptr, total_len);
 	name[total_len]='\0';
 	ret = 0;
@@ -2174,7 +2205,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 	 * ref/backref.
 	 */
 	err = btrfs_subvolume_reserve_metadata(root, &block_rsv,
-					       5, &qgroup_reserved);
+					       5, &qgroup_reserved, true);
 	if (err)
 		goto out_up_write;
 
@@ -2213,6 +2244,27 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
 			goto out_end_trans;
 		}
 	}
+
+	ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
+				  dest->root_item.uuid, BTRFS_UUID_KEY_SUBVOL,
+				  dest->root_key.objectid);
+	if (ret && ret != -ENOENT) {
+		btrfs_abort_transaction(trans, root, ret);
+		err = ret;
+		goto out_end_trans;
+	}
+	if (!btrfs_is_empty_uuid(dest->root_item.received_uuid)) {
+		ret = btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
+					  dest->root_item.received_uuid,
+					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+					  dest->root_key.objectid);
+		if (ret && ret != -ENOENT) {
+			btrfs_abort_transaction(trans, root, ret);
+			err = ret;
+			goto out_end_trans;
+		}
+	}
+
 out_end_trans:
 	trans->block_rsv = NULL;
 	trans->bytes_reserved = 0;
@@ -2326,8 +2378,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg)
 
 	if (atomic_xchg(&root->fs_info->mutually_exclusive_operation_running,
 			1)) {
-		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
-		return -EINVAL;
+		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
 	}
 
 	mutex_lock(&root->fs_info->volume_mutex);
@@ -2400,10 +2451,10 @@ static long btrfs_ioctl_fs_info(struct btrfs_root *root, void __user *arg)
 	if (!fi_args)
 		return -ENOMEM;
 
+	mutex_lock(&fs_devices->device_list_mutex);
 	fi_args->num_devices = fs_devices->num_devices;
 	memcpy(&fi_args->fsid, root->fs_info->fsid, sizeof(fi_args->fsid));
 
-	mutex_lock(&fs_devices->device_list_mutex);
 	list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
 		if (device->devid > fi_args->max_id)
 			fi_args->max_id = device->devid;
@@ -2424,7 +2475,6 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 	int ret = 0;
 	char *s_uuid = NULL;
-	char empty_uuid[BTRFS_UUID_SIZE] = {0};
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
@@ -2433,7 +2483,7 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg)
 	if (IS_ERR(di_args))
 		return PTR_ERR(di_args);
 
-	if (memcmp(empty_uuid, di_args->uuid, BTRFS_UUID_SIZE) != 0)
+	if (!btrfs_is_empty_uuid(di_args->uuid))
 		s_uuid = di_args->uuid;
 
 	mutex_lock(&fs_devices->device_list_mutex);
@@ -2469,150 +2519,336 @@ out:
 	return ret;
 }
 
-static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
-				       u64 off, u64 olen, u64 destoff)
+static struct page *extent_same_get_page(struct inode *inode, u64 off)
+{
+	struct page *page;
+	pgoff_t index;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+
+	index = off >> PAGE_CACHE_SHIFT;
+
+	page = grab_cache_page(inode->i_mapping, index);
+	if (!page)
+		return NULL;
+
+	if (!PageUptodate(page)) {
+		if (extent_read_full_page_nolock(tree, page, btrfs_get_extent,
+						 0))
+			return NULL;
+		lock_page(page);
+		if (!PageUptodate(page)) {
+			unlock_page(page);
+			page_cache_release(page);
+			return NULL;
+		}
+	}
+	unlock_page(page);
+
+	return page;
+}
+
+static inline void lock_extent_range(struct inode *inode, u64 off, u64 len)
+{
+	/* do any pending delalloc/csum calc on src, one way or
+	   another, and lock file content */
+	while (1) {
+		struct btrfs_ordered_extent *ordered;
+		lock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
+		ordered = btrfs_lookup_first_ordered_extent(inode,
+							    off + len - 1);
+		if (!ordered &&
+		    !test_range_bit(&BTRFS_I(inode)->io_tree, off,
+				    off + len - 1, EXTENT_DELALLOC, 0, NULL))
+			break;
+		unlock_extent(&BTRFS_I(inode)->io_tree, off, off + len - 1);
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+		btrfs_wait_ordered_range(inode, off, len);
+	}
+}
+
+static void btrfs_double_unlock(struct inode *inode1, u64 loff1,
+				struct inode *inode2, u64 loff2, u64 len)
+{
+	unlock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
+	unlock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
+
+	mutex_unlock(&inode1->i_mutex);
+	mutex_unlock(&inode2->i_mutex);
+}
+
+static void btrfs_double_lock(struct inode *inode1, u64 loff1,
+			      struct inode *inode2, u64 loff2, u64 len)
+{
+	if (inode1 < inode2) {
+		swap(inode1, inode2);
+		swap(loff1, loff2);
+	}
+
+	mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT);
+	lock_extent_range(inode1, loff1, len);
+	if (inode1 != inode2) {
+		mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD);
+		lock_extent_range(inode2, loff2, len);
+	}
+}
+
+static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst,
+			  u64 dst_loff, u64 len)
+{
+	int ret = 0;
+	struct page *src_page, *dst_page;
+	unsigned int cmp_len = PAGE_CACHE_SIZE;
+	void *addr, *dst_addr;
+
+	while (len) {
+		if (len < PAGE_CACHE_SIZE)
+			cmp_len = len;
+
+		src_page = extent_same_get_page(src, loff);
+		if (!src_page)
+			return -EINVAL;
+		dst_page = extent_same_get_page(dst, dst_loff);
+		if (!dst_page) {
+			page_cache_release(src_page);
+			return -EINVAL;
+		}
+		addr = kmap_atomic(src_page);
+		dst_addr = kmap_atomic(dst_page);
+
+		flush_dcache_page(src_page);
+		flush_dcache_page(dst_page);
+
+		if (memcmp(addr, dst_addr, cmp_len))
+			ret = BTRFS_SAME_DATA_DIFFERS;
+
+		kunmap_atomic(addr);
+		kunmap_atomic(dst_addr);
+		page_cache_release(src_page);
+		page_cache_release(dst_page);
+
+		if (ret)
+			break;
+
+		loff += cmp_len;
+		dst_loff += cmp_len;
+		len -= cmp_len;
+	}
+
+	return ret;
+}
+
+static int extent_same_check_offsets(struct inode *inode, u64 off, u64 len)
+{
+	u64 bs = BTRFS_I(inode)->root->fs_info->sb->s_blocksize;
+
+	if (off + len > inode->i_size || off + len < off)
+		return -EINVAL;
+	/* Check that we are block aligned - btrfs_clone() requires this */
+	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int btrfs_extent_same(struct inode *src, u64 loff, u64 len,
+			     struct inode *dst, u64 dst_loff)
 {
-	struct inode *inode = file_inode(file);
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct fd src_file;
-	struct inode *src;
-	struct btrfs_trans_handle *trans;
-	struct btrfs_path *path;
-	struct extent_buffer *leaf;
-	char *buf;
-	struct btrfs_key key;
-	u32 nritems;
-	int slot;
 	int ret;
-	u64 len = olen;
-	u64 bs = root->fs_info->sb->s_blocksize;
-	int same_inode = 0;
 
 	/*
-	 * TODO:
-	 * - split compressed inline extents.  annoying: we need to
-	 *   decompress into destination's address_space (the file offset
-	 *   may change, so source mapping won't do), then recompress (or
-	 *   otherwise reinsert) a subrange.
-	 * - allow ranges within the same file to be cloned (provided
-	 *   they don't overlap)?
+	 * btrfs_clone() can't handle extents in the same file
+	 * yet. Once that works, we can drop this check and replace it
+	 * with a check for the same inode, but overlapping extents.
 	 */
-
-	/* the destination must be opened for writing */
-	if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
+	if (src == dst)
 		return -EINVAL;
 
-	if (btrfs_root_readonly(root))
-		return -EROFS;
+	btrfs_double_lock(src, loff, dst, dst_loff, len);
+
+	ret = extent_same_check_offsets(src, loff, len);
+	if (ret)
+		goto out_unlock;
+
+	ret = extent_same_check_offsets(dst, dst_loff, len);
+	if (ret)
+		goto out_unlock;
+
+	/* don't make the dst file partly checksummed */
+	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+	    (BTRFS_I(dst)->flags & BTRFS_INODE_NODATASUM)) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	ret = btrfs_cmp_data(src, loff, dst, dst_loff, len);
+	if (ret == 0)
+		ret = btrfs_clone(src, dst, loff, len, len, dst_loff);
+
+out_unlock:
+	btrfs_double_unlock(src, loff, dst, dst_loff, len);
+
+	return ret;
+}
+
+#define BTRFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
+
+static long btrfs_ioctl_file_extent_same(struct file *file,
+					 void __user *argp)
+{
+	struct btrfs_ioctl_same_args *args = argp;
+	struct btrfs_ioctl_same_args same;
+	struct btrfs_ioctl_same_extent_info info;
+	struct inode *src = file->f_dentry->d_inode;
+	struct file *dst_file = NULL;
+	struct inode *dst;
+	u64 off;
+	u64 len;
+	int i;
+	int ret;
+	u64 bs = BTRFS_I(src)->root->fs_info->sb->s_blocksize;
+	bool is_admin = capable(CAP_SYS_ADMIN);
+
+	if (!(file->f_mode & FMODE_READ))
+		return -EINVAL;
 
 	ret = mnt_want_write_file(file);
 	if (ret)
 		return ret;
 
-	src_file = fdget(srcfd);
-	if (!src_file.file) {
-		ret = -EBADF;
-		goto out_drop_write;
+	if (copy_from_user(&same,
+			   (struct btrfs_ioctl_same_args __user *)argp,
+			   sizeof(same))) {
+		ret = -EFAULT;
+		goto out;
 	}
 
-	ret = -EXDEV;
-	if (src_file.file->f_path.mnt != file->f_path.mnt)
-		goto out_fput;
+	off = same.logical_offset;
+	len = same.length;
 
-	src = file_inode(src_file.file);
+	/*
+	 * Limit the total length we will dedupe for each operation.
+	 * This is intended to bound the total time spent in this
+	 * ioctl to something sane.
+	 */
+	if (len > BTRFS_MAX_DEDUPE_LEN)
+		len = BTRFS_MAX_DEDUPE_LEN;
 
-	ret = -EINVAL;
-	if (src == inode)
-		same_inode = 1;
+	if (WARN_ON_ONCE(bs < PAGE_CACHE_SIZE)) {
+		/*
+		 * Btrfs does not support blocksize < page_size. As a
+		 * result, btrfs_cmp_data() won't correctly handle
+		 * this situation without an update.
+		 */
+		ret = -EINVAL;
+		goto out;
+	}
 
-	/* the src must be open for reading */
-	if (!(src_file.file->f_mode & FMODE_READ))
-		goto out_fput;
+	ret = -EISDIR;
+	if (S_ISDIR(src->i_mode))
+		goto out;
 
-	/* don't make the dst file partly checksummed */
-	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
-	    (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
-		goto out_fput;
+	ret = -EACCES;
+	if (!S_ISREG(src->i_mode))
+		goto out;
 
-	ret = -EISDIR;
-	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
-		goto out_fput;
+	ret = 0;
+	for (i = 0; i < same.dest_count; i++) {
+		if (copy_from_user(&info, &args->info[i], sizeof(info))) {
+			ret = -EFAULT;
+			goto out;
+		}
 
-	ret = -EXDEV;
-	if (src->i_sb != inode->i_sb)
-		goto out_fput;
+		info.bytes_deduped = 0;
 
-	ret = -ENOMEM;
-	buf = vmalloc(btrfs_level_size(root, 0));
-	if (!buf)
-		goto out_fput;
+		dst_file = fget(info.fd);
+		if (!dst_file) {
+			info.status = -EBADF;
+			goto next;
+		}
 
-	path = btrfs_alloc_path();
-	if (!path) {
-		vfree(buf);
-		goto out_fput;
-	}
-	path->reada = 2;
+		if (!(is_admin || (dst_file->f_mode & FMODE_WRITE))) {
+			info.status = -EINVAL;
+			goto next;
+		}
 
-	if (!same_inode) {
-		if (inode < src) {
-			mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
-			mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
-		} else {
-			mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
-			mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		info.status = -EXDEV;
+		if (file->f_path.mnt != dst_file->f_path.mnt)
+			goto next;
+
+		dst = dst_file->f_dentry->d_inode;
+		if (src->i_sb != dst->i_sb)
+			goto next;
+
+		if (S_ISDIR(dst->i_mode)) {
+			info.status = -EISDIR;
+			goto next;
 		}
-	} else {
-		mutex_lock(&src->i_mutex);
-	}
 
-	/* determine range to clone */
-	ret = -EINVAL;
-	if (off + len > src->i_size || off + len < off)
-		goto out_unlock;
-	if (len == 0)
-		olen = len = src->i_size - off;
-	/* if we extend to eof, continue to block boundary */
-	if (off + len == src->i_size)
-		len = ALIGN(src->i_size, bs) - off;
+		if (!S_ISREG(dst->i_mode)) {
+			info.status = -EACCES;
+			goto next;
+		}
 
-	/* verify the end result is block aligned */
-	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
-	    !IS_ALIGNED(destoff, bs))
-		goto out_unlock;
+		info.status = btrfs_extent_same(src, off, len, dst,
+						info.logical_offset);
+		if (info.status == 0)
+			info.bytes_deduped += len;
 
-	/* verify if ranges are overlapped within the same file */
-	if (same_inode) {
-		if (destoff + len > off && destoff < off + len)
-			goto out_unlock;
-	}
+next:
+		if (dst_file)
+			fput(dst_file);
 
-	if (destoff > inode->i_size) {
-		ret = btrfs_cont_expand(inode, inode->i_size, destoff);
-		if (ret)
-			goto out_unlock;
+		if (__put_user_unaligned(info.status, &args->info[i].status) ||
+		    __put_user_unaligned(info.bytes_deduped,
+					 &args->info[i].bytes_deduped)) {
+			ret = -EFAULT;
+			goto out;
+		}                                                               
 	}
 
-	/* truncate page cache pages from target inode range */
-	truncate_inode_pages_range(&inode->i_data, destoff,
-				   PAGE_CACHE_ALIGN(destoff + len) - 1);
+out:
+	mnt_drop_write_file(file);
+	return ret;
+}
 
-	/* do any pending delalloc/csum calc on src, one way or
-	   another, and lock file content */
-	while (1) {
-		struct btrfs_ordered_extent *ordered;
-		lock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
-		ordered = btrfs_lookup_first_ordered_extent(src, off + len - 1);
-		if (!ordered &&
-		    !test_range_bit(&BTRFS_I(src)->io_tree, off, off + len - 1,
-				    EXTENT_DELALLOC, 0, NULL))
-			break;
-		unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
-		if (ordered)
-			btrfs_put_ordered_extent(ordered);
-		btrfs_wait_ordered_range(src, off, len);
+/**
+ * btrfs_clone() - clone a range from inode file to another
+ *
+ * @src: Inode to clone from
+ * @inode: Inode to clone to
+ * @off: Offset within source to start clone from
+ * @olen: Original length, passed by user, of range to clone
+ * @olen_aligned: Block-aligned value of olen, extent_same uses
+ *               identical values here
+ * @destoff: Offset within @inode to start clone
+ */
+static int btrfs_clone(struct inode *src, struct inode *inode,
+		       u64 off, u64 olen, u64 olen_aligned, u64 destoff)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *leaf;
+	struct btrfs_trans_handle *trans;
+	char *buf = NULL;
+	struct btrfs_key key;
+	u32 nritems;
+	int slot;
+	int ret;
+	u64 len = olen_aligned;
+
+	ret = -ENOMEM;
+	buf = vmalloc(btrfs_level_size(root, 0));
+	if (!buf)
+		return ret;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		vfree(buf);
+		return ret;
 	}
 
+	path->reada = 2;
 	/* clone data */
 	key.objectid = btrfs_ino(src);
 	key.type = BTRFS_EXTENT_DATA_KEY;
@@ -2858,15 +3094,132 @@ next:
 		key.offset++;
 	}
 	ret = 0;
+
 out:
 	btrfs_release_path(path);
+	btrfs_free_path(path);
+	vfree(buf);
+	return ret;
+}
+
+static noinline long btrfs_ioctl_clone(struct file *file, unsigned long srcfd,
+				       u64 off, u64 olen, u64 destoff)
+{
+	struct inode *inode = fdentry(file)->d_inode;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct fd src_file;
+	struct inode *src;
+	int ret;
+	u64 len = olen;
+	u64 bs = root->fs_info->sb->s_blocksize;
+	int same_inode = 0;
+
+	/*
+	 * TODO:
+	 * - split compressed inline extents.  annoying: we need to
+	 *   decompress into destination's address_space (the file offset
+	 *   may change, so source mapping won't do), then recompress (or
+	 *   otherwise reinsert) a subrange.
+	 * - allow ranges within the same file to be cloned (provided
+	 *   they don't overlap)?
+	 */
+
+	/* the destination must be opened for writing */
+	if (!(file->f_mode & FMODE_WRITE) || (file->f_flags & O_APPEND))
+		return -EINVAL;
+
+	if (btrfs_root_readonly(root))
+		return -EROFS;
+
+	ret = mnt_want_write_file(file);
+	if (ret)
+		return ret;
+
+	src_file = fdget(srcfd);
+	if (!src_file.file) {
+		ret = -EBADF;
+		goto out_drop_write;
+	}
+
+	ret = -EXDEV;
+	if (src_file.file->f_path.mnt != file->f_path.mnt)
+		goto out_fput;
+
+	src = file_inode(src_file.file);
+
+	ret = -EINVAL;
+	if (src == inode)
+		same_inode = 1;
+
+	/* the src must be open for reading */
+	if (!(src_file.file->f_mode & FMODE_READ))
+		goto out_fput;
+
+	/* don't make the dst file partly checksummed */
+	if ((BTRFS_I(src)->flags & BTRFS_INODE_NODATASUM) !=
+	    (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM))
+		goto out_fput;
+
+	ret = -EISDIR;
+	if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode))
+		goto out_fput;
+
+	ret = -EXDEV;
+	if (src->i_sb != inode->i_sb)
+		goto out_fput;
+
+	if (!same_inode) {
+		if (inode < src) {
+			mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
+			mutex_lock_nested(&src->i_mutex, I_MUTEX_CHILD);
+		} else {
+			mutex_lock_nested(&src->i_mutex, I_MUTEX_PARENT);
+			mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD);
+		}
+	} else {
+		mutex_lock(&src->i_mutex);
+	}
+
+	/* determine range to clone */
+	ret = -EINVAL;
+	if (off + len > src->i_size || off + len < off)
+		goto out_unlock;
+	if (len == 0)
+		olen = len = src->i_size - off;
+	/* if we extend to eof, continue to block boundary */
+	if (off + len == src->i_size)
+		len = ALIGN(src->i_size, bs) - off;
+
+	/* verify the end result is block aligned */
+	if (!IS_ALIGNED(off, bs) || !IS_ALIGNED(off + len, bs) ||
+	    !IS_ALIGNED(destoff, bs))
+		goto out_unlock;
+
+	/* verify if ranges are overlapped within the same file */
+	if (same_inode) {
+		if (destoff + len > off && destoff < off + len)
+			goto out_unlock;
+	}
+
+	if (destoff > inode->i_size) {
+		ret = btrfs_cont_expand(inode, inode->i_size, destoff);
+		if (ret)
+			goto out_unlock;
+	}
+
+	/* truncate page cache pages from target inode range */
+	truncate_inode_pages_range(&inode->i_data, destoff,
+				   PAGE_CACHE_ALIGN(destoff + len) - 1);
+
+	lock_extent_range(src, off, len);
+
+	ret = btrfs_clone(src, inode, off, olen, len, destoff);
+
 	unlock_extent(&BTRFS_I(src)->io_tree, off, off + len - 1);
 out_unlock:
 	mutex_unlock(&src->i_mutex);
 	if (!same_inode)
 		mutex_unlock(&inode->i_mutex);
-	vfree(buf);
-	btrfs_free_path(path);
 out_fput:
 	fdput(src_file);
 out_drop_write:
@@ -3312,11 +3665,13 @@ static long btrfs_ioctl_dev_replace(struct btrfs_root *root, void __user *arg)
 
 	switch (p->cmd) {
 	case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
+		if (root->fs_info->sb->s_flags & MS_RDONLY)
+			return -EROFS;
+
 		if (atomic_xchg(
 			&root->fs_info->mutually_exclusive_operation_running,
 			1)) {
-			pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
-			ret = -EINPROGRESS;
+			ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
 		} else {
 			ret = btrfs_dev_replace_start(root, p);
 			atomic_set(
@@ -3560,8 +3915,7 @@ again:
 	} else {
 		/* this is (1) */
 		mutex_unlock(&fs_info->balance_mutex);
-		pr_info("btrfs: dev add/delete/balance/replace/resize operation in progress\n");
-		ret = -EINVAL;
+		ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
 		goto out;
 	}
 
@@ -3967,6 +4321,7 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
 	struct btrfs_trans_handle *trans;
 	struct timespec ct = CURRENT_TIME;
 	int ret = 0;
+	int received_uuid_changed;
 
 	ret = mnt_want_write_file(file);
 	if (ret < 0)
@@ -3996,7 +4351,11 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
 		goto out;
 	}
 
-	trans = btrfs_start_transaction(root, 1);
+	/*
+	 * 1 - root item
+	 * 2 - uuid items (received uuid + subvol uuid)
+	 */
+	trans = btrfs_start_transaction(root, 3);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
 		trans = NULL;
@@ -4007,24 +4366,42 @@ static long btrfs_ioctl_set_received_subvol(struct file *file,
 	sa->rtime.sec = ct.tv_sec;
 	sa->rtime.nsec = ct.tv_nsec;
 
+	received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
+				       BTRFS_UUID_SIZE);
+	if (received_uuid_changed &&
+	    !btrfs_is_empty_uuid(root_item->received_uuid))
+		btrfs_uuid_tree_rem(trans, root->fs_info->uuid_root,
+				    root_item->received_uuid,
+				    BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+				    root->root_key.objectid);
 	memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
 	btrfs_set_root_stransid(root_item, sa->stransid);
 	btrfs_set_root_rtransid(root_item, sa->rtransid);
-	root_item->stime.sec = cpu_to_le64(sa->stime.sec);
-	root_item->stime.nsec = cpu_to_le32(sa->stime.nsec);
-	root_item->rtime.sec = cpu_to_le64(sa->rtime.sec);
-	root_item->rtime.nsec = cpu_to_le32(sa->rtime.nsec);
+	btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec);
+	btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec);
+	btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
+	btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);
 
 	ret = btrfs_update_root(trans, root->fs_info->tree_root,
 				&root->root_key, &root->root_item);
 	if (ret < 0) {
 		btrfs_end_transaction(trans, root);
-		trans = NULL;
 		goto out;
-	} else {
-		ret = btrfs_commit_transaction(trans, root);
-		if (ret < 0)
+	}
+	if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
+		ret = btrfs_uuid_tree_add(trans, root->fs_info->uuid_root,
+					  sa->uuid,
+					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+					  root->root_key.objectid);
+		if (ret < 0 && ret != -EEXIST) {
+			btrfs_abort_transaction(trans, root, ret);
 			goto out;
+		}
+	}
+	ret = btrfs_commit_transaction(trans, root);
+	if (ret < 0) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto out;
 	}
 
 	ret = copy_to_user(arg, sa, sizeof(*sa));
@@ -4041,18 +4418,22 @@ out:
 static int btrfs_ioctl_get_fslabel(struct file *file, void __user *arg)
 {
 	struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
-	const char *label = root->fs_info->super_copy->label;
-	size_t len = strnlen(label, BTRFS_LABEL_SIZE);
+	size_t len;
 	int ret;
+	char label[BTRFS_LABEL_SIZE];
+
+	spin_lock(&root->fs_info->super_lock);
+	memcpy(label, root->fs_info->super_copy->label, BTRFS_LABEL_SIZE);
+	spin_unlock(&root->fs_info->super_lock);
+
+	len = strnlen(label, BTRFS_LABEL_SIZE);
 
 	if (len == BTRFS_LABEL_SIZE) {
 		pr_warn("btrfs: label is too long, return the first %zu bytes\n",
 			--len);
 	}
 
-	mutex_lock(&root->fs_info->volume_mutex);
 	ret = copy_to_user(arg, label, len);
-	mutex_unlock(&root->fs_info->volume_mutex);
 
 	return ret ? -EFAULT : 0;
 }
@@ -4081,18 +4462,18 @@ static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
 	if (ret)
 		return ret;
 
-	mutex_lock(&root->fs_info->volume_mutex);
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		ret = PTR_ERR(trans);
 		goto out_unlock;
 	}
 
+	spin_lock(&root->fs_info->super_lock);
 	strcpy(super_block->label, label);
+	spin_unlock(&root->fs_info->super_lock);
 	ret = btrfs_end_transaction(trans, root);
 
 out_unlock:
-	mutex_unlock(&root->fs_info->volume_mutex);
 	mnt_drop_write_file(file);
 	return ret;
 }
@@ -4207,6 +4588,8 @@ long btrfs_ioctl(struct file *file, unsigned int
 		return btrfs_ioctl_get_fslabel(file, argp);
 	case BTRFS_IOC_SET_FSLABEL:
 		return btrfs_ioctl_set_fslabel(file, argp);
+	case BTRFS_IOC_FILE_EXTENT_SAME:
+		return btrfs_ioctl_file_extent_same(file, argp);
 	}
 
 	return -ENOTTY;
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index f93151a98886..b6a6f07c5ce2 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -207,8 +207,10 @@ static int lzo_compress_pages(struct list_head *ws,
 		}
 
 		/* we're making it bigger, give up */
-		if (tot_in > 8192 && tot_in < tot_out)
+		if (tot_in > 8192 && tot_in < tot_out) {
+			ret = -1;
 			goto out;
+		}
 
 		/* we're all done */
 		if (tot_in >= len)
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 81369827e514..966b413a33b8 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -67,7 +67,7 @@ static void ordered_data_tree_panic(struct inode *inode, int errno,
 {
 	struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 	btrfs_panic(fs_info, errno, "Inconsistency in ordered tree at offset "
-		    "%llu\n", (unsigned long long)offset);
+		    "%llu\n", offset);
 }
 
 /*
@@ -205,6 +205,7 @@ static int __btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
 	entry->bytes_left = len;
 	entry->inode = igrab(inode);
 	entry->compress_type = compress_type;
+	entry->truncated_len = (u64)-1;
 	if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE)
 		set_bit(type, &entry->flags);
 
@@ -336,14 +337,12 @@ int btrfs_dec_test_first_ordered_pending(struct inode *inode,
 	*file_offset = dec_end;
 	if (dec_start > dec_end) {
 		printk(KERN_CRIT "bad ordering dec_start %llu end %llu\n",
-		       (unsigned long long)dec_start,
-		       (unsigned long long)dec_end);
+		       dec_start, dec_end);
 	}
 	to_dec = dec_end - dec_start;
 	if (to_dec > entry->bytes_left) {
 		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
-		       (unsigned long long)entry->bytes_left,
-		       (unsigned long long)to_dec);
+		       entry->bytes_left, to_dec);
 	}
 	entry->bytes_left -= to_dec;
 	if (!uptodate)
@@ -403,8 +402,7 @@ have_entry:
 
 	if (io_size > entry->bytes_left) {
 		printk(KERN_CRIT "bad ordered accounting left %llu size %llu\n",
-		       (unsigned long long)entry->bytes_left,
-		       (unsigned long long)io_size);
+		       entry->bytes_left, io_size);
 	}
 	entry->bytes_left -= io_size;
 	if (!uptodate)
@@ -671,7 +669,7 @@ int btrfs_run_ordered_operations(struct btrfs_trans_handle *trans,
 	INIT_LIST_HEAD(&splice);
 	INIT_LIST_HEAD(&works);
 
-	mutex_lock(&root->fs_info->ordered_operations_mutex);
+	mutex_lock(&root->fs_info->ordered_extent_flush_mutex);
 	spin_lock(&root->fs_info->ordered_root_lock);
 	list_splice_init(&cur_trans->ordered_operations, &splice);
 	while (!list_empty(&splice)) {
@@ -718,7 +716,7 @@ out:
 		list_del_init(&work->list);
 		btrfs_wait_and_free_delalloc_work(work);
 	}
-	mutex_unlock(&root->fs_info->ordered_operations_mutex);
+	mutex_unlock(&root->fs_info->ordered_extent_flush_mutex);
 	return ret;
 }
 
@@ -923,12 +921,16 @@ int btrfs_ordered_update_i_size(struct inode *inode, u64 offset,
 	struct btrfs_ordered_extent *test;
 	int ret = 1;
 
-	if (ordered)
+	spin_lock_irq(&tree->lock);
+	if (ordered) {
 		offset = entry_end(ordered);
-	else
+		if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags))
+			offset = min(offset,
+				     ordered->file_offset +
+				     ordered->truncated_len);
+	} else {
 		offset = ALIGN(offset, BTRFS_I(inode)->root->sectorsize);
-
-	spin_lock_irq(&tree->lock);
+	}
 	disk_i_size = BTRFS_I(inode)->disk_i_size;
 
 	/* truncate file */
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 68844d59ee6f..d9a5aa097b4f 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -69,6 +69,7 @@ struct btrfs_ordered_sum {
 				       * the isize. */
 #define BTRFS_ORDERED_LOGGED_CSUM 8 /* We've logged the csums on this ordered
 				       ordered extent */
+#define BTRFS_ORDERED_TRUNCATED 9 /* Set when we have to truncate an extent */
 
 struct btrfs_ordered_extent {
 	/* logical offset in the file */
@@ -96,6 +97,12 @@ struct btrfs_ordered_extent {
 	 */
 	u64 outstanding_isize;
 
+	/*
+	 * If we get truncated we need to adjust the file extent we enter for
+	 * this ordered extent so that we do not expose stale data.
+	 */
+	u64 truncated_len;
+
 	/* flags (described above) */
 	unsigned long flags;
 
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index dc0024f17c1f..0088bedc8631 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -26,14 +26,12 @@ static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
 	int i;
 	printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu "
 	       "num_stripes %d\n",
-	       (unsigned long long)btrfs_chunk_length(eb, chunk),
-	       (unsigned long long)btrfs_chunk_owner(eb, chunk),
-	       (unsigned long long)btrfs_chunk_type(eb, chunk),
-	       num_stripes);
+	       btrfs_chunk_length(eb, chunk), btrfs_chunk_owner(eb, chunk),
+	       btrfs_chunk_type(eb, chunk), num_stripes);
 	for (i = 0 ; i < num_stripes ; i++) {
 		printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i,
-		      (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
-		      (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
+		      btrfs_stripe_devid_nr(eb, chunk, i),
+		      btrfs_stripe_offset_nr(eb, chunk, i));
 	}
 }
 static void print_dev_item(struct extent_buffer *eb,
@@ -41,18 +39,18 @@ static void print_dev_item(struct extent_buffer *eb,
 {
 	printk(KERN_INFO "\t\tdev item devid %llu "
 	       "total_bytes %llu bytes used %llu\n",
-	       (unsigned long long)btrfs_device_id(eb, dev_item),
-	       (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
-	       (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
+	       btrfs_device_id(eb, dev_item),
+	       btrfs_device_total_bytes(eb, dev_item),
+	       btrfs_device_bytes_used(eb, dev_item));
 }
 static void print_extent_data_ref(struct extent_buffer *eb,
 				  struct btrfs_extent_data_ref *ref)
 {
 	printk(KERN_INFO "\t\textent data backref root %llu "
 	       "objectid %llu offset %llu count %u\n",
-	       (unsigned long long)btrfs_extent_data_ref_root(eb, ref),
-	       (unsigned long long)btrfs_extent_data_ref_objectid(eb, ref),
-	       (unsigned long long)btrfs_extent_data_ref_offset(eb, ref),
+	       btrfs_extent_data_ref_root(eb, ref),
+	       btrfs_extent_data_ref_objectid(eb, ref),
+	       btrfs_extent_data_ref_offset(eb, ref),
 	       btrfs_extent_data_ref_count(eb, ref));
 }
 
@@ -87,19 +85,17 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
 	flags = btrfs_extent_flags(eb, ei);
 
 	printk(KERN_INFO "\t\textent refs %llu gen %llu flags %llu\n",
-	       (unsigned long long)btrfs_extent_refs(eb, ei),
-	       (unsigned long long)btrfs_extent_generation(eb, ei),
-	       (unsigned long long)flags);
+	       btrfs_extent_refs(eb, ei), btrfs_extent_generation(eb, ei),
+	       flags);
 
 	if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
 		struct btrfs_tree_block_info *info;
 		info = (struct btrfs_tree_block_info *)(ei + 1);
 		btrfs_tree_block_key(eb, info, &key);
-		printk(KERN_INFO "\t\ttree block key (%llu %x %llu) "
+		printk(KERN_INFO "\t\ttree block key (%llu %u %llu) "
 		       "level %d\n",
-		       (unsigned long long)btrfs_disk_key_objectid(&key),
-		       key.type,
-		       (unsigned long long)btrfs_disk_key_offset(&key),
+		       btrfs_disk_key_objectid(&key), key.type,
+		       btrfs_disk_key_offset(&key),
 		       btrfs_tree_block_level(eb, info));
 		iref = (struct btrfs_extent_inline_ref *)(info + 1);
 	} else {
@@ -115,11 +111,11 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
 		switch (type) {
 		case BTRFS_TREE_BLOCK_REF_KEY:
 			printk(KERN_INFO "\t\ttree block backref "
-				"root %llu\n", (unsigned long long)offset);
+				"root %llu\n", offset);
 			break;
 		case BTRFS_SHARED_BLOCK_REF_KEY:
 			printk(KERN_INFO "\t\tshared block backref "
-				"parent %llu\n", (unsigned long long)offset);
+				"parent %llu\n", offset);
 			break;
 		case BTRFS_EXTENT_DATA_REF_KEY:
 			dref = (struct btrfs_extent_data_ref *)(&iref->offset);
@@ -129,8 +125,7 @@ static void print_extent_item(struct extent_buffer *eb, int slot)
 			sref = (struct btrfs_shared_data_ref *)(iref + 1);
 			printk(KERN_INFO "\t\tshared data backref "
 			       "parent %llu count %u\n",
-			       (unsigned long long)offset,
-			       btrfs_shared_data_ref_count(eb, sref));
+			       offset, btrfs_shared_data_ref_count(eb, sref));
 			break;
 		default:
 			BUG();
@@ -148,13 +143,32 @@ static void print_extent_ref_v0(struct extent_buffer *eb, int slot)
 	ref0 = btrfs_item_ptr(eb, slot, struct btrfs_extent_ref_v0);
 	printk("\t\textent back ref root %llu gen %llu "
 		"owner %llu num_refs %lu\n",
-		(unsigned long long)btrfs_ref_root_v0(eb, ref0),
-		(unsigned long long)btrfs_ref_generation_v0(eb, ref0),
-		(unsigned long long)btrfs_ref_objectid_v0(eb, ref0),
+		btrfs_ref_root_v0(eb, ref0),
+		btrfs_ref_generation_v0(eb, ref0),
+		btrfs_ref_objectid_v0(eb, ref0),
 		(unsigned long)btrfs_ref_count_v0(eb, ref0));
 }
 #endif
 
+static void print_uuid_item(struct extent_buffer *l, unsigned long offset,
+			    u32 item_size)
+{
+	if (!IS_ALIGNED(item_size, sizeof(u64))) {
+		pr_warn("btrfs: uuid item with illegal size %lu!\n",
+			(unsigned long)item_size);
+		return;
+	}
+	while (item_size) {
+		__le64 subvol_id;
+
+		read_extent_buffer(l, &subvol_id, offset, sizeof(subvol_id));
+		printk(KERN_INFO "\t\tsubvol_id %llu\n",
+		       (unsigned long long)le64_to_cpu(subvol_id));
+		item_size -= sizeof(u64);
+		offset += sizeof(u64);
+	}
+}
+
 void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 {
 	int i;
@@ -177,39 +191,34 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 	nr = btrfs_header_nritems(l);
 
 	btrfs_info(root->fs_info, "leaf %llu total ptrs %d free space %d",
-		(unsigned long long)btrfs_header_bytenr(l), nr,
-		btrfs_leaf_free_space(root, l));
+		   btrfs_header_bytenr(l), nr, btrfs_leaf_free_space(root, l));
 	for (i = 0 ; i < nr ; i++) {
 		item = btrfs_item_nr(l, i);
 		btrfs_item_key_to_cpu(l, &key, i);
 		type = btrfs_key_type(&key);
-		printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d "
+		printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d "
 		       "itemsize %d\n",
-			i,
-			(unsigned long long)key.objectid, type,
-			(unsigned long long)key.offset,
+			i, key.objectid, type, key.offset,
 			btrfs_item_offset(l, item), btrfs_item_size(l, item));
 		switch (type) {
 		case BTRFS_INODE_ITEM_KEY:
 			ii = btrfs_item_ptr(l, i, struct btrfs_inode_item);
 			printk(KERN_INFO "\t\tinode generation %llu size %llu "
 			       "mode %o\n",
-			       (unsigned long long)
 			       btrfs_inode_generation(l, ii),
-			      (unsigned long long)btrfs_inode_size(l, ii),
+			       btrfs_inode_size(l, ii),
 			       btrfs_inode_mode(l, ii));
 			break;
 		case BTRFS_DIR_ITEM_KEY:
 			di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
 			btrfs_dir_item_key_to_cpu(l, di, &found_key);
 			printk(KERN_INFO "\t\tdir oid %llu type %u\n",
-				(unsigned long long)found_key.objectid,
+				found_key.objectid,
 				btrfs_dir_type(l, di));
 			break;
 		case BTRFS_ROOT_ITEM_KEY:
 			ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
 			printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n",
-				(unsigned long long)
 				btrfs_disk_root_bytenr(l, ri),
 				btrfs_disk_root_refs(l, ri));
 			break;
@@ -245,17 +254,12 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 			}
 			printk(KERN_INFO "\t\textent data disk bytenr %llu "
 			       "nr %llu\n",
-			       (unsigned long long)
 			       btrfs_file_extent_disk_bytenr(l, fi),
-			       (unsigned long long)
 			       btrfs_file_extent_disk_num_bytes(l, fi));
 			printk(KERN_INFO "\t\textent data offset %llu "
 			       "nr %llu ram %llu\n",
-			       (unsigned long long)
 			       btrfs_file_extent_offset(l, fi),
-			       (unsigned long long)
 			       btrfs_file_extent_num_bytes(l, fi),
-			       (unsigned long long)
 			       btrfs_file_extent_ram_bytes(l, fi));
 			break;
 		case BTRFS_EXTENT_REF_V0_KEY:
@@ -269,7 +273,6 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 			bi = btrfs_item_ptr(l, i,
 					    struct btrfs_block_group_item);
 			printk(KERN_INFO "\t\tblock group used %llu\n",
-			       (unsigned long long)
 			       btrfs_disk_block_group_used(l, bi));
 			break;
 		case BTRFS_CHUNK_ITEM_KEY:
@@ -286,13 +289,9 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 			printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n"
 			       "\t\tchunk objectid %llu chunk offset %llu "
 			       "length %llu\n",
-			       (unsigned long long)
 			       btrfs_dev_extent_chunk_tree(l, dev_extent),
-			       (unsigned long long)
 			       btrfs_dev_extent_chunk_objectid(l, dev_extent),
-			       (unsigned long long)
 			       btrfs_dev_extent_chunk_offset(l, dev_extent),
-			       (unsigned long long)
 			       btrfs_dev_extent_length(l, dev_extent));
 			break;
 		case BTRFS_DEV_STATS_KEY:
@@ -301,6 +300,11 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 		case BTRFS_DEV_REPLACE_KEY:
 			printk(KERN_INFO "\t\tdev replace\n");
 			break;
+		case BTRFS_UUID_KEY_SUBVOL:
+		case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
+			print_uuid_item(l, btrfs_item_ptr_offset(l, i),
+					btrfs_item_size_nr(l, i));
+			break;
 		};
 	}
 }
@@ -320,16 +324,13 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c)
 		return;
 	}
 	btrfs_info(root->fs_info, "node %llu level %d total ptrs %d free spc %u",
-		(unsigned long long)btrfs_header_bytenr(c),
-		level, nr, (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
+		btrfs_header_bytenr(c), level, nr,
+		(u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr);
 	for (i = 0; i < nr; i++) {
 		btrfs_node_key_to_cpu(c, &key, i);
 		printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n",
-		       i,
-		       (unsigned long long)key.objectid,
-		       key.type,
-		       (unsigned long long)key.offset,
-		       (unsigned long long)btrfs_node_blockptr(c, i));
+		       i, key.objectid, key.type, key.offset,
+		       btrfs_node_blockptr(c, i));
 	}
 	for (i = 0; i < nr; i++) {
 		struct extent_buffer *next = read_tree_block(root,
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 1280eff8af56..4e6ef490619e 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -157,18 +157,11 @@ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info,
 	return qgroup;
 }
 
-/* must be called with qgroup_lock held */
-static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
+static void __del_qgroup_rb(struct btrfs_qgroup *qgroup)
 {
-	struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
 	struct btrfs_qgroup_list *list;
 
-	if (!qgroup)
-		return -ENOENT;
-
-	rb_erase(&qgroup->node, &fs_info->qgroup_tree);
 	list_del(&qgroup->dirty);
-
 	while (!list_empty(&qgroup->groups)) {
 		list = list_first_entry(&qgroup->groups,
 					struct btrfs_qgroup_list, next_group);
@@ -185,7 +178,18 @@ static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
 		kfree(list);
 	}
 	kfree(qgroup);
+}
 
+/* must be called with qgroup_lock held */
+static int del_qgroup_rb(struct btrfs_fs_info *fs_info, u64 qgroupid)
+{
+	struct btrfs_qgroup *qgroup = find_qgroup_rb(fs_info, qgroupid);
+
+	if (!qgroup)
+		return -ENOENT;
+
+	rb_erase(&qgroup->node, &fs_info->qgroup_tree);
+	__del_qgroup_rb(qgroup);
 	return 0;
 }
 
@@ -394,8 +398,7 @@ next1:
 		if (ret == -ENOENT) {
 			printk(KERN_WARNING
 				"btrfs: orphan qgroup relation 0x%llx->0x%llx\n",
-				(unsigned long long)found_key.objectid,
-				(unsigned long long)found_key.offset);
+				found_key.objectid, found_key.offset);
 			ret = 0;	/* ignore the error */
 		}
 		if (ret)
@@ -428,39 +431,28 @@ out:
 }
 
 /*
- * This is only called from close_ctree() or open_ctree(), both in single-
- * treaded paths. Clean up the in-memory structures. No locking needed.
+ * This is called from close_ctree() or open_ctree() or btrfs_quota_disable(),
+ * first two are in single-threaded paths.And for the third one, we have set
+ * quota_root to be null with qgroup_lock held before, so it is safe to clean
+ * up the in-memory structures without qgroup_lock held.
  */
 void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
 {
 	struct rb_node *n;
 	struct btrfs_qgroup *qgroup;
-	struct btrfs_qgroup_list *list;
 
 	while ((n = rb_first(&fs_info->qgroup_tree))) {
 		qgroup = rb_entry(n, struct btrfs_qgroup, node);
 		rb_erase(n, &fs_info->qgroup_tree);
-
-		while (!list_empty(&qgroup->groups)) {
-			list = list_first_entry(&qgroup->groups,
-						struct btrfs_qgroup_list,
-						next_group);
-			list_del(&list->next_group);
-			list_del(&list->next_member);
-			kfree(list);
-		}
-
-		while (!list_empty(&qgroup->members)) {
-			list = list_first_entry(&qgroup->members,
-						struct btrfs_qgroup_list,
-						next_member);
-			list_del(&list->next_group);
-			list_del(&list->next_member);
-			kfree(list);
-		}
-		kfree(qgroup);
+		__del_qgroup_rb(qgroup);
 	}
+	/*
+	 * we call btrfs_free_qgroup_config() when umounting
+	 * filesystem and disabling quota, so we set qgroup_ulit
+	 * to be null here to avoid double free.
+	 */
 	ulist_free(fs_info->qgroup_ulist);
+	fs_info->qgroup_ulist = NULL;
 }
 
 static int add_qgroup_relation_item(struct btrfs_trans_handle *trans,
@@ -946,13 +938,9 @@ int btrfs_quota_disable(struct btrfs_trans_handle *trans,
 	fs_info->pending_quota_state = 0;
 	quota_root = fs_info->quota_root;
 	fs_info->quota_root = NULL;
-	btrfs_free_qgroup_config(fs_info);
 	spin_unlock(&fs_info->qgroup_lock);
 
-	if (!quota_root) {
-		ret = -EINVAL;
-		goto out;
-	}
+	btrfs_free_qgroup_config(fs_info);
 
 	ret = btrfs_clean_quota_tree(trans, quota_root);
 	if (ret)
@@ -1174,7 +1162,7 @@ int btrfs_limit_qgroup(struct btrfs_trans_handle *trans,
 	if (ret) {
 		fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
 		printk(KERN_INFO "unable to update quota limit for %llu\n",
-		       (unsigned long long)qgroupid);
+		       qgroupid);
 	}
 
 	spin_lock(&fs_info->qgroup_lock);
@@ -1884,10 +1872,9 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
 					 path, 1, 0);
 
 	pr_debug("current progress key (%llu %u %llu), search_slot ret %d\n",
-		 (unsigned long long)fs_info->qgroup_rescan_progress.objectid,
+		 fs_info->qgroup_rescan_progress.objectid,
 		 fs_info->qgroup_rescan_progress.type,
-		 (unsigned long long)fs_info->qgroup_rescan_progress.offset,
-		 ret);
+		 fs_info->qgroup_rescan_progress.offset, ret);
 
 	if (ret) {
 		/*
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 0525e1389f5b..d0ecfbd9cc9f 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -1540,8 +1540,10 @@ static int full_stripe_write(struct btrfs_raid_bio *rbio)
 	int ret;
 
 	ret = alloc_rbio_parity_pages(rbio);
-	if (ret)
+	if (ret) {
+		__free_raid_bio(rbio);
 		return ret;
+	}
 
 	ret = lock_stripe_add(rbio);
 	if (ret == 0)
@@ -1687,11 +1689,8 @@ int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
 	struct blk_plug_cb *cb;
 
 	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
-	if (IS_ERR(rbio)) {
-		kfree(raid_map);
-		kfree(bbio);
+	if (IS_ERR(rbio))
 		return PTR_ERR(rbio);
-	}
 	bio_list_add(&rbio->bio_list, bio);
 	rbio->bio_list_bytes = bio->bi_size;
 
@@ -2041,9 +2040,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
 	int ret;
 
 	rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
-	if (IS_ERR(rbio)) {
+	if (IS_ERR(rbio))
 		return PTR_ERR(rbio);
-	}
 
 	rbio->read_rebuild = 1;
 	bio_list_add(&rbio->bio_list, bio);
@@ -2052,6 +2050,8 @@ int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
 	rbio->faila = find_logical_bio_stripe(rbio, bio);
 	if (rbio->faila == -1) {
 		BUG();
+		kfree(raid_map);
+		kfree(bbio);
 		kfree(rbio);
 		return -EIO;
 	}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 12096496cc99..aacc2121e87c 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -335,7 +335,7 @@ static void backref_tree_panic(struct rb_node *rb_node, int errno, u64 bytenr)
 	if (bnode->root)
 		fs_info = bnode->root->fs_info;
 	btrfs_panic(fs_info, errno, "Inconsistency in backref cache "
-		    "found at offset %llu\n", (unsigned long long)bytenr);
+		    "found at offset %llu\n", bytenr);
 }
 
 /*
@@ -641,6 +641,11 @@ int find_inline_backref(struct extent_buffer *leaf, int slot,
 		WARN_ON(item_size < sizeof(*ei) + sizeof(*bi));
 		return 1;
 	}
+	if (key.type == BTRFS_METADATA_ITEM_KEY &&
+	    item_size <= sizeof(*ei)) {
+		WARN_ON(item_size < sizeof(*ei));
+		return 1;
+	}
 
 	if (key.type == BTRFS_EXTENT_ITEM_KEY) {
 		bi = (struct btrfs_tree_block_info *)(ei + 1);
@@ -691,6 +696,7 @@ struct backref_node *build_backref_tree(struct reloc_control *rc,
 	int cowonly;
 	int ret;
 	int err = 0;
+	bool need_check = true;
 
 	path1 = btrfs_alloc_path();
 	path2 = btrfs_alloc_path();
@@ -914,6 +920,7 @@ again:
 			cur->bytenr);
 
 		lower = cur;
+		need_check = true;
 		for (; level < BTRFS_MAX_LEVEL; level++) {
 			if (!path2->nodes[level]) {
 				BUG_ON(btrfs_root_bytenr(&root->root_item) !=
@@ -957,14 +964,12 @@ again:
 
 				/*
 				 * add the block to pending list if we
-				 * need check its backrefs. only block
-				 * at 'cur->level + 1' is added to the
-				 * tail of pending list. this guarantees
-				 * we check backrefs from lower level
-				 * blocks to upper level blocks.
+				 * need check its backrefs, we only do this once
+				 * while walking up a tree as we will catch
+				 * anything else later on.
 				 */
-				if (!upper->checked &&
-				    level == cur->level + 1) {
+				if (!upper->checked && need_check) {
+					need_check = false;
 					list_add_tail(&edge->list[UPPER],
 						      &list);
 				} else
@@ -2314,8 +2319,13 @@ again:
 			BUG_ON(root->reloc_root != reloc_root);
 
 			ret = merge_reloc_root(rc, root);
-			if (ret)
+			if (ret) {
+				__update_reloc_root(reloc_root, 1);
+				free_extent_buffer(reloc_root->node);
+				free_extent_buffer(reloc_root->commit_root);
+				kfree(reloc_root);
 				goto out;
+			}
 		} else {
 			list_del_init(&reloc_root->root_list);
 		}
@@ -2344,9 +2354,6 @@ again:
 			if (IS_ERR(root))
 				continue;
 
-			if (btrfs_root_refs(&root->root_item) == 0)
-				continue;
-
 			trans = btrfs_join_transaction(root);
 			BUG_ON(IS_ERR(trans));
 
@@ -3628,7 +3635,7 @@ int add_data_references(struct reloc_control *rc,
 	unsigned long ptr;
 	unsigned long end;
 	u32 blocksize = btrfs_level_size(rc->extent_root, 0);
-	int ret;
+	int ret = 0;
 	int err = 0;
 
 	eb = path->nodes[0];
@@ -3655,6 +3662,10 @@ int add_data_references(struct reloc_control *rc,
 		} else {
 			BUG();
 		}
+		if (ret) {
+			err = ret;
+			goto out;
+		}
 		ptr += btrfs_extent_inline_ref_size(key.type);
 	}
 	WARN_ON(ptr > end);
@@ -3700,6 +3711,7 @@ int add_data_references(struct reloc_control *rc,
 		}
 		path->slots[0]++;
 	}
+out:
 	btrfs_release_path(path);
 	if (err)
 		free_block_list(blocks);
@@ -4219,8 +4231,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 	}
 
 	printk(KERN_INFO "btrfs: relocating block group %llu flags %llu\n",
-	       (unsigned long long)rc->block_group->key.objectid,
-	       (unsigned long long)rc->block_group->flags);
+	       rc->block_group->key.objectid, rc->block_group->flags);
 
 	ret = btrfs_start_all_delalloc_inodes(fs_info, 0);
 	if (ret < 0) {
@@ -4242,7 +4253,7 @@ int btrfs_relocate_block_group(struct btrfs_root *extent_root, u64 group_start)
 			break;
 
 		printk(KERN_INFO "btrfs: found %llu extents\n",
-			(unsigned long long)rc->extents_found);
+			rc->extents_found);
 
 		if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
 			btrfs_wait_ordered_range(rc->data_inode, 0, (u64)-1);
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index ffb1036ef10d..0b1f4ef8db98 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -29,8 +29,8 @@
  * generation numbers as then we know the root was once mounted with an older
  * kernel that was not aware of the root item structure change.
  */
-void btrfs_read_root_item(struct extent_buffer *eb, int slot,
-			  struct btrfs_root_item *item)
+static void btrfs_read_root_item(struct extent_buffer *eb, int slot,
+				struct btrfs_root_item *item)
 {
 	uuid_le uuid;
 	int len;
@@ -155,8 +155,7 @@ int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root
 	if (ret != 0) {
 		btrfs_print_leaf(root, path->nodes[0]);
 		printk(KERN_CRIT "unable to update root key %llu %u %llu\n",
-		       (unsigned long long)key->objectid, key->type,
-		       (unsigned long long)key->offset);
+		       key->objectid, key->type, key->offset);
 		BUG_ON(1);
 	}
 
@@ -490,13 +489,13 @@ again:
  */
 void btrfs_check_and_init_root_item(struct btrfs_root_item *root_item)
 {
-	u64 inode_flags = le64_to_cpu(root_item->inode.flags);
+	u64 inode_flags = btrfs_stack_inode_flags(&root_item->inode);
 
 	if (!(inode_flags & BTRFS_INODE_ROOT_ITEM_INIT)) {
 		inode_flags |= BTRFS_INODE_ROOT_ITEM_INIT;
-		root_item->inode.flags = cpu_to_le64(inode_flags);
-		root_item->flags = 0;
-		root_item->byte_limit = 0;
+		btrfs_set_stack_inode_flags(&root_item->inode, inode_flags);
+		btrfs_set_root_flags(root_item, 0);
+		btrfs_set_root_limit(root_item, 0);
 	}
 }
 
@@ -507,8 +506,8 @@ void btrfs_update_root_times(struct btrfs_trans_handle *trans,
 	struct timespec ct = CURRENT_TIME;
 
 	spin_lock(&root->root_item_lock);
-	item->ctransid = cpu_to_le64(trans->transid);
-	item->ctime.sec = cpu_to_le64(ct.tv_sec);
-	item->ctime.nsec = cpu_to_le32(ct.tv_nsec);
+	btrfs_set_root_ctransid(item, trans->transid);
+	btrfs_set_stack_timespec_sec(&item->ctime, ct.tv_sec);
+	btrfs_set_stack_timespec_nsec(&item->ctime, ct.tv_nsec);
 	spin_unlock(&root->root_item_lock);
 }
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 64a157becbe5..0afcd452fcb3 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -754,8 +754,7 @@ out:
 			num_uncorrectable_read_errors);
 		printk_ratelimited_in_rcu(KERN_ERR
 			"btrfs: unable to fixup (nodatasum) error at logical %llu on dev %s\n",
-			(unsigned long long)fixup->logical,
-			rcu_str_deref(fixup->dev->name));
+			fixup->logical, rcu_str_deref(fixup->dev->name));
 	}
 
 	btrfs_free_path(path);
@@ -1154,8 +1153,7 @@ corrected_error:
 			spin_unlock(&sctx->stat_lock);
 			printk_ratelimited_in_rcu(KERN_ERR
 				"btrfs: fixed up error at logical %llu on dev %s\n",
-				(unsigned long long)logical,
-				rcu_str_deref(dev->name));
+				logical, rcu_str_deref(dev->name));
 		}
 	} else {
 did_not_correct_error:
@@ -1164,8 +1162,7 @@ did_not_correct_error:
 		spin_unlock(&sctx->stat_lock);
 		printk_ratelimited_in_rcu(KERN_ERR
 			"btrfs: unable to fixup (regular) error at logical %llu on dev %s\n",
-			(unsigned long long)logical,
-			rcu_str_deref(dev->name));
+			logical, rcu_str_deref(dev->name));
 	}
 
 out:
@@ -1345,12 +1342,12 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info,
 		mapped_buffer = kmap_atomic(sblock->pagev[0]->page);
 		h = (struct btrfs_header *)mapped_buffer;
 
-		if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr) ||
+		if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) ||
 		    memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) ||
 		    memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
 			   BTRFS_UUID_SIZE)) {
 			sblock->header_error = 1;
-		} else if (generation != le64_to_cpu(h->generation)) {
+		} else if (generation != btrfs_stack_header_generation(h)) {
 			sblock->header_error = 1;
 			sblock->generation_error = 1;
 		}
@@ -1720,10 +1717,10 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock)
 	 * b) the page is already kmapped
 	 */
 
-	if (sblock->pagev[0]->logical != le64_to_cpu(h->bytenr))
+	if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h))
 		++fail;
 
-	if (sblock->pagev[0]->generation != le64_to_cpu(h->generation))
+	if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h))
 		++fail;
 
 	if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -1786,10 +1783,10 @@ static int scrub_checksum_super(struct scrub_block *sblock)
 	s = (struct btrfs_super_block *)mapped_buffer;
 	memcpy(on_disk_csum, s->csum, sctx->csum_size);
 
-	if (sblock->pagev[0]->logical != le64_to_cpu(s->bytenr))
+	if (sblock->pagev[0]->logical != btrfs_super_bytenr(s))
 		++fail_cor;
 
-	if (sblock->pagev[0]->generation != le64_to_cpu(s->generation))
+	if (sblock->pagev[0]->generation != btrfs_super_generation(s))
 		++fail_gen;
 
 	if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE))
@@ -2455,8 +2452,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
 				printk(KERN_ERR
 				       "btrfs scrub: tree block %llu spanning "
 				       "stripes, ignored. logical=%llu\n",
-				       (unsigned long long)key.objectid,
-				       (unsigned long long)logical);
+				       key.objectid, logical);
 				goto next;
 			}
 
@@ -2863,9 +2859,8 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
 	if (fs_info->chunk_root->sectorsize != PAGE_SIZE) {
 		/* not supported for data w/o checksums */
 		printk(KERN_ERR
-		       "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lld) fails\n",
-		       fs_info->chunk_root->sectorsize,
-		       (unsigned long long)PAGE_SIZE);
+		       "btrfs_scrub: size assumption sectorsize != PAGE_SIZE (%d != %lu) fails\n",
+		       fs_info->chunk_root->sectorsize, PAGE_SIZE);
 		return -EINVAL;
 	}
 
@@ -3175,11 +3170,9 @@ static void copy_nocow_pages_worker(struct btrfs_work *work)
 					  copy_nocow_pages_for_inode,
 					  nocow_ctx);
 	if (ret != 0 && ret != -ENOENT) {
-		pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %llu, ret %d\n",
-			(unsigned long long)logical,
-			(unsigned long long)physical_for_dev_replace,
-			(unsigned long long)len,
-			(unsigned long long)mirror_num, ret);
+		pr_warn("iterate_inodes_from_logical() failed: log %llu, phys %llu, len %llu, mir %u, ret %d\n",
+			logical, physical_for_dev_replace, len, mirror_num,
+			ret);
 		not_written = 1;
 		goto out;
 	}
@@ -3224,11 +3217,6 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, void *ctx)
 		return PTR_ERR(local_root);
 	}
 
-	if (btrfs_root_refs(&local_root->root_item) == 0) {
-		srcu_read_unlock(&fs_info->subvol_srcu, srcu_index);
-		return -ENOENT;
-	}
-
 	key.type = BTRFS_INODE_ITEM_KEY;
 	key.objectid = inum;
 	key.offset = 0;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 2e14fd89a8b4..e46e0ed74925 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -26,6 +26,7 @@
 #include <linux/radix-tree.h>
 #include <linux/crc32c.h>
 #include <linux/vmalloc.h>
+#include <linux/string.h>
 
 #include "send.h"
 #include "backref.h"
@@ -54,8 +55,8 @@ struct fs_path {
 
 			char *buf;
 			int buf_len;
-			int reversed:1;
-			int virtual_mem:1;
+			unsigned int reversed:1;
+			unsigned int virtual_mem:1;
 			char inline_buf[];
 		};
 		char pad[PAGE_SIZE];
@@ -1668,6 +1669,7 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 			      u64 *who_ino, u64 *who_gen)
 {
 	int ret = 0;
+	u64 gen;
 	u64 other_inode = 0;
 	u8 other_type = 0;
 
@@ -1678,6 +1680,24 @@ static int will_overwrite_ref(struct send_ctx *sctx, u64 dir, u64 dir_gen,
 	if (ret <= 0)
 		goto out;
 
+	/*
+	 * If we have a parent root we need to verify that the parent dir was
+	 * not delted and then re-created, if it was then we have no overwrite
+	 * and we can just unlink this entry.
+	 */
+	if (sctx->parent_root) {
+		ret = get_inode_info(sctx->parent_root, dir, NULL, &gen, NULL,
+				     NULL, NULL, NULL);
+		if (ret < 0 && ret != -ENOENT)
+			goto out;
+		if (ret) {
+			ret = 0;
+			goto out;
+		}
+		if (gen != dir_gen)
+			goto out;
+	}
+
 	ret = lookup_dir_item_inode(sctx->parent_root, dir, name, name_len,
 			&other_inode, &other_type);
 	if (ret < 0 && ret != -ENOENT)
@@ -2519,7 +2539,8 @@ static int did_create_dir(struct send_ctx *sctx, u64 dir)
 		di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
 		btrfs_dir_item_key_to_cpu(eb, di, &di_key);
 
-		if (di_key.objectid < sctx->send_progress) {
+		if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
+		    di_key.objectid < sctx->send_progress) {
 			ret = 1;
 			goto out;
 		}
@@ -2581,7 +2602,6 @@ static int record_ref(struct list_head *head, u64 dir,
 		      u64 dir_gen, struct fs_path *path)
 {
 	struct recorded_ref *ref;
-	char *tmp;
 
 	ref = kmalloc(sizeof(*ref), GFP_NOFS);
 	if (!ref)
@@ -2591,25 +2611,35 @@ static int record_ref(struct list_head *head, u64 dir,
 	ref->dir_gen = dir_gen;
 	ref->full_path = path;
 
-	tmp = strrchr(ref->full_path->start, '/');
-	if (!tmp) {
-		ref->name_len = ref->full_path->end - ref->full_path->start;
-		ref->name = ref->full_path->start;
+	ref->name = (char *)kbasename(ref->full_path->start);
+	ref->name_len = ref->full_path->end - ref->name;
+	ref->dir_path = ref->full_path->start;
+	if (ref->name == ref->full_path->start)
 		ref->dir_path_len = 0;
-		ref->dir_path = ref->full_path->start;
-	} else {
-		tmp++;
-		ref->name_len = ref->full_path->end - tmp;
-		ref->name = tmp;
-		ref->dir_path = ref->full_path->start;
+	else
 		ref->dir_path_len = ref->full_path->end -
 				ref->full_path->start - 1 - ref->name_len;
-	}
 
 	list_add_tail(&ref->list, head);
 	return 0;
 }
 
+static int dup_ref(struct recorded_ref *ref, struct list_head *list)
+{
+	struct recorded_ref *new;
+
+	new = kmalloc(sizeof(*ref), GFP_NOFS);
+	if (!new)
+		return -ENOMEM;
+
+	new->dir = ref->dir;
+	new->dir_gen = ref->dir_gen;
+	new->full_path = NULL;
+	INIT_LIST_HEAD(&new->list);
+	list_add_tail(&new->list, list);
+	return 0;
+}
+
 static void __free_recorded_refs(struct list_head *head)
 {
 	struct recorded_ref *cur;
@@ -2724,9 +2754,7 @@ static int process_recorded_refs(struct send_ctx *sctx)
 	int ret = 0;
 	struct recorded_ref *cur;
 	struct recorded_ref *cur2;
-	struct ulist *check_dirs = NULL;
-	struct ulist_iterator uit;
-	struct ulist_node *un;
+	struct list_head check_dirs;
 	struct fs_path *valid_path = NULL;
 	u64 ow_inode = 0;
 	u64 ow_gen;
@@ -2740,6 +2768,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 	 * which is always '..'
 	 */
 	BUG_ON(sctx->cur_ino <= BTRFS_FIRST_FREE_OBJECTID);
+	INIT_LIST_HEAD(&check_dirs);
 
 	valid_path = fs_path_alloc();
 	if (!valid_path) {
@@ -2747,12 +2776,6 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 		goto out;
 	}
 
-	check_dirs = ulist_alloc(GFP_NOFS);
-	if (!check_dirs) {
-		ret = -ENOMEM;
-		goto out;
-	}
-
 	/*
 	 * First, check if the first ref of the current inode was overwritten
 	 * before. If yes, we know that the current inode was already orphanized
@@ -2889,8 +2912,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 					goto out;
 			}
 		}
-		ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
-				GFP_NOFS);
+		ret = dup_ref(cur, &check_dirs);
 		if (ret < 0)
 			goto out;
 	}
@@ -2918,8 +2940,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 		}
 
 		list_for_each_entry(cur, &sctx->deleted_refs, list) {
-			ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
-					GFP_NOFS);
+			ret = dup_ref(cur, &check_dirs);
 			if (ret < 0)
 				goto out;
 		}
@@ -2930,8 +2951,7 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 		 */
 		cur = list_entry(sctx->deleted_refs.next, struct recorded_ref,
 				list);
-		ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
-				GFP_NOFS);
+		ret = dup_ref(cur, &check_dirs);
 		if (ret < 0)
 			goto out;
 	} else if (!S_ISDIR(sctx->cur_inode_mode)) {
@@ -2951,12 +2971,10 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 				if (ret < 0)
 					goto out;
 			}
-			ret = ulist_add(check_dirs, cur->dir, cur->dir_gen,
-					GFP_NOFS);
+			ret = dup_ref(cur, &check_dirs);
 			if (ret < 0)
 				goto out;
 		}
-
 		/*
 		 * If the inode is still orphan, unlink the orphan. This may
 		 * happen when a previous inode did overwrite the first ref
@@ -2978,33 +2996,32 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 	 * deletion and if it's finally possible to perform the rmdir now.
 	 * We also update the inode stats of the parent dirs here.
 	 */
-	ULIST_ITER_INIT(&uit);
-	while ((un = ulist_next(check_dirs, &uit))) {
+	list_for_each_entry(cur, &check_dirs, list) {
 		/*
 		 * In case we had refs into dirs that were not processed yet,
 		 * we don't need to do the utime and rmdir logic for these dirs.
 		 * The dir will be processed later.
 		 */
-		if (un->val > sctx->cur_ino)
+		if (cur->dir > sctx->cur_ino)
 			continue;
 
-		ret = get_cur_inode_state(sctx, un->val, un->aux);
+		ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen);
 		if (ret < 0)
 			goto out;
 
 		if (ret == inode_state_did_create ||
 		    ret == inode_state_no_change) {
 			/* TODO delayed utimes */
-			ret = send_utimes(sctx, un->val, un->aux);
+			ret = send_utimes(sctx, cur->dir, cur->dir_gen);
 			if (ret < 0)
 				goto out;
 		} else if (ret == inode_state_did_delete) {
-			ret = can_rmdir(sctx, un->val, sctx->cur_ino);
+			ret = can_rmdir(sctx, cur->dir, sctx->cur_ino);
 			if (ret < 0)
 				goto out;
 			if (ret) {
-				ret = get_cur_path(sctx, un->val, un->aux,
-						valid_path);
+				ret = get_cur_path(sctx, cur->dir,
+						   cur->dir_gen, valid_path);
 				if (ret < 0)
 					goto out;
 				ret = send_rmdir(sctx, valid_path);
@@ -3017,8 +3034,8 @@ verbose_printk("btrfs: process_recorded_refs %llu\n", sctx->cur_ino);
 	ret = 0;
 
 out:
+	__free_recorded_refs(&check_dirs);
 	free_recorded_refs(sctx);
-	ulist_free(check_dirs);
 	fs_path_free(valid_path);
 	return ret;
 }
@@ -3119,6 +3136,8 @@ out:
 
 struct find_ref_ctx {
 	u64 dir;
+	u64 dir_gen;
+	struct btrfs_root *root;
 	struct fs_path *name;
 	int found_idx;
 };
@@ -3128,9 +3147,21 @@ static int __find_iref(int num, u64 dir, int index,
 		       void *ctx_)
 {
 	struct find_ref_ctx *ctx = ctx_;
+	u64 dir_gen;
+	int ret;
 
 	if (dir == ctx->dir && fs_path_len(name) == fs_path_len(ctx->name) &&
 	    strncmp(name->start, ctx->name->start, fs_path_len(name)) == 0) {
+		/*
+		 * To avoid doing extra lookups we'll only do this if everything
+		 * else matches.
+		 */
+		ret = get_inode_info(ctx->root, dir, NULL, &dir_gen, NULL,
+				     NULL, NULL, NULL);
+		if (ret)
+			return ret;
+		if (dir_gen != ctx->dir_gen)
+			return 0;
 		ctx->found_idx = num;
 		return 1;
 	}
@@ -3140,14 +3171,16 @@ static int __find_iref(int num, u64 dir, int index,
 static int find_iref(struct btrfs_root *root,
 		     struct btrfs_path *path,
 		     struct btrfs_key *key,
-		     u64 dir, struct fs_path *name)
+		     u64 dir, u64 dir_gen, struct fs_path *name)
 {
 	int ret;
 	struct find_ref_ctx ctx;
 
 	ctx.dir = dir;
 	ctx.name = name;
+	ctx.dir_gen = dir_gen;
 	ctx.found_idx = -1;
+	ctx.root = root;
 
 	ret = iterate_inode_ref(root, path, key, 0, __find_iref, &ctx);
 	if (ret < 0)
@@ -3163,11 +3196,17 @@ static int __record_changed_new_ref(int num, u64 dir, int index,
 				    struct fs_path *name,
 				    void *ctx)
 {
+	u64 dir_gen;
 	int ret;
 	struct send_ctx *sctx = ctx;
 
+	ret = get_inode_info(sctx->send_root, dir, NULL, &dir_gen, NULL,
+			     NULL, NULL, NULL);
+	if (ret)
+		return ret;
+
 	ret = find_iref(sctx->parent_root, sctx->right_path,
-			sctx->cmp_key, dir, name);
+			sctx->cmp_key, dir, dir_gen, name);
 	if (ret == -ENOENT)
 		ret = __record_new_ref(num, dir, index, name, sctx);
 	else if (ret > 0)
@@ -3180,11 +3219,17 @@ static int __record_changed_deleted_ref(int num, u64 dir, int index,
 					struct fs_path *name,
 					void *ctx)
 {
+	u64 dir_gen;
 	int ret;
 	struct send_ctx *sctx = ctx;
 
+	ret = get_inode_info(sctx->parent_root, dir, NULL, &dir_gen, NULL,
+			     NULL, NULL, NULL);
+	if (ret)
+		return ret;
+
 	ret = find_iref(sctx->send_root, sctx->left_path, sctx->cmp_key,
-			dir, name);
+			dir, dir_gen, name);
 	if (ret == -ENOENT)
 		ret = __record_deleted_ref(num, dir, index, name, sctx);
 	else if (ret > 0)
@@ -3869,7 +3914,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 	btrfs_item_key_to_cpu(eb, &found_key, slot);
 	if (found_key.objectid != key.objectid ||
 	    found_key.type != key.type) {
-		ret = 0;
+		/* If we're a hole then just pretend nothing changed */
+		ret = (left_disknr) ? 0 : 1;
 		goto out;
 	}
 
@@ -3895,7 +3941,8 @@ static int is_extent_unchanged(struct send_ctx *sctx,
 		 * This may only happen on the first iteration.
 		 */
 		if (found_key.offset + right_len <= ekey->offset) {
-			ret = 0;
+			/* If we're a hole just pretend nothing changed */
+			ret = (left_disknr) ? 0 : 1;
 			goto out;
 		}
 
@@ -3960,8 +4007,8 @@ static int process_extent(struct send_ctx *sctx,
 			  struct btrfs_path *path,
 			  struct btrfs_key *key)
 {
-	int ret = 0;
 	struct clone_root *found_clone = NULL;
+	int ret = 0;
 
 	if (S_ISLNK(sctx->cur_inode_mode))
 		return 0;
@@ -3974,6 +4021,32 @@ static int process_extent(struct send_ctx *sctx,
 			ret = 0;
 			goto out;
 		}
+	} else {
+		struct btrfs_file_extent_item *ei;
+		u8 type;
+
+		ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
+				    struct btrfs_file_extent_item);
+		type = btrfs_file_extent_type(path->nodes[0], ei);
+		if (type == BTRFS_FILE_EXTENT_PREALLOC ||
+		    type == BTRFS_FILE_EXTENT_REG) {
+			/*
+			 * The send spec does not have a prealloc command yet,
+			 * so just leave a hole for prealloc'ed extents until
+			 * we have enough commands queued up to justify rev'ing
+			 * the send spec.
+			 */
+			if (type == BTRFS_FILE_EXTENT_PREALLOC) {
+				ret = 0;
+				goto out;
+			}
+
+			/* Have a hole, just skip it. */
+			if (btrfs_file_extent_disk_bytenr(path->nodes[0], ei) == 0) {
+				ret = 0;
+				goto out;
+			}
+		}
 	}
 
 	ret = find_extent_clone(sctx, path, key->objectid, key->offset,
@@ -4361,6 +4434,64 @@ static int changed_extent(struct send_ctx *sctx,
 	return ret;
 }
 
+static int dir_changed(struct send_ctx *sctx, u64 dir)
+{
+	u64 orig_gen, new_gen;
+	int ret;
+
+	ret = get_inode_info(sctx->send_root, dir, NULL, &new_gen, NULL, NULL,
+			     NULL, NULL);
+	if (ret)
+		return ret;
+
+	ret = get_inode_info(sctx->parent_root, dir, NULL, &orig_gen, NULL,
+			     NULL, NULL, NULL);
+	if (ret)
+		return ret;
+
+	return (orig_gen != new_gen) ? 1 : 0;
+}
+
+static int compare_refs(struct send_ctx *sctx, struct btrfs_path *path,
+			struct btrfs_key *key)
+{
+	struct btrfs_inode_extref *extref;
+	struct extent_buffer *leaf;
+	u64 dirid = 0, last_dirid = 0;
+	unsigned long ptr;
+	u32 item_size;
+	u32 cur_offset = 0;
+	int ref_name_len;
+	int ret = 0;
+
+	/* Easy case, just check this one dirid */
+	if (key->type == BTRFS_INODE_REF_KEY) {
+		dirid = key->offset;
+
+		ret = dir_changed(sctx, dirid);
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
+	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
+	while (cur_offset < item_size) {
+		extref = (struct btrfs_inode_extref *)(ptr +
+						       cur_offset);
+		dirid = btrfs_inode_extref_parent(leaf, extref);
+		ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
+		cur_offset += ref_name_len + sizeof(*extref);
+		if (dirid == last_dirid)
+			continue;
+		ret = dir_changed(sctx, dirid);
+		if (ret)
+			break;
+		last_dirid = dirid;
+	}
+out:
+	return ret;
+}
+
 /*
  * Updates compare related fields in sctx and simply forwards to the actual
  * changed_xxx functions.
@@ -4376,6 +4507,19 @@ static int changed_cb(struct btrfs_root *left_root,
 	int ret = 0;
 	struct send_ctx *sctx = ctx;
 
+	if (result == BTRFS_COMPARE_TREE_SAME) {
+		if (key->type != BTRFS_INODE_REF_KEY &&
+		    key->type != BTRFS_INODE_EXTREF_KEY)
+			return 0;
+		ret = compare_refs(sctx, left_path, key);
+		if (!ret)
+			return 0;
+		if (ret < 0)
+			return ret;
+		result = BTRFS_COMPARE_TREE_CHANGED;
+		ret = 0;
+	}
+
 	sctx->left_path = left_path;
 	sctx->right_path = right_path;
 	sctx->cmp_key = key;
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 8eb6191d86da..3aab10ce63e8 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -56,6 +56,8 @@
 #include "rcu-string.h"
 #include "dev-replace.h"
 #include "free-space-cache.h"
+#include "backref.h"
+#include "tests/btrfs-tests.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/btrfs.h>
@@ -320,14 +322,15 @@ enum {
 	Opt_enospc_debug, Opt_subvolrootid, Opt_defrag, Opt_inode_cache,
 	Opt_no_space_cache, Opt_recovery, Opt_skip_balance,
 	Opt_check_integrity, Opt_check_integrity_including_extent_data,
-	Opt_check_integrity_print_mask, Opt_fatal_errors,
+	Opt_check_integrity_print_mask, Opt_fatal_errors, Opt_rescan_uuid_tree,
+	Opt_commit_interval,
 	Opt_err,
 };
 
 static match_table_t tokens = {
 	{Opt_degraded, "degraded"},
 	{Opt_subvol, "subvol=%s"},
-	{Opt_subvolid, "subvolid=%d"},
+	{Opt_subvolid, "subvolid=%s"},
 	{Opt_device, "device=%s"},
 	{Opt_nodatasum, "nodatasum"},
 	{Opt_nodatacow, "nodatacow"},
@@ -360,7 +363,9 @@ static match_table_t tokens = {
 	{Opt_check_integrity, "check_int"},
 	{Opt_check_integrity_including_extent_data, "check_int_data"},
 	{Opt_check_integrity_print_mask, "check_int_print_mask=%d"},
+	{Opt_rescan_uuid_tree, "rescan_uuid_tree"},
 	{Opt_fatal_errors, "fatal_errors=%s"},
+	{Opt_commit_interval, "commit=%d"},
 	{Opt_err, NULL},
 };
 
@@ -496,10 +501,15 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			btrfs_set_opt(info->mount_opt, NOBARRIER);
 			break;
 		case Opt_thread_pool:
-			intarg = 0;
-			match_int(&args[0], &intarg);
-			if (intarg)
+			ret = match_int(&args[0], &intarg);
+			if (ret) {
+				goto out;
+			} else if (intarg > 0) {
 				info->thread_pool_size = intarg;
+			} else {
+				ret = -EINVAL;
+				goto out;
+			}
 			break;
 		case Opt_max_inline:
 			num = match_strdup(&args[0]);
@@ -513,7 +523,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 						root->sectorsize);
 				}
 				printk(KERN_INFO "btrfs: max_inline at %llu\n",
-					(unsigned long long)info->max_inline);
+					info->max_inline);
+			} else {
+				ret = -ENOMEM;
+				goto out;
 			}
 			break;
 		case Opt_alloc_start:
@@ -525,7 +538,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				kfree(num);
 				printk(KERN_INFO
 					"btrfs: allocations start at %llu\n",
-					(unsigned long long)info->alloc_start);
+					info->alloc_start);
+			} else {
+				ret = -ENOMEM;
+				goto out;
 			}
 			break;
 		case Opt_noacl:
@@ -540,12 +556,16 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			btrfs_set_opt(info->mount_opt, FLUSHONCOMMIT);
 			break;
 		case Opt_ratio:
-			intarg = 0;
-			match_int(&args[0], &intarg);
-			if (intarg) {
+			ret = match_int(&args[0], &intarg);
+			if (ret) {
+				goto out;
+			} else if (intarg >= 0) {
 				info->metadata_ratio = intarg;
 				printk(KERN_INFO "btrfs: metadata ratio %d\n",
 				       info->metadata_ratio);
+			} else {
+				ret = -EINVAL;
+				goto out;
 			}
 			break;
 		case Opt_discard:
@@ -554,6 +574,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 		case Opt_space_cache:
 			btrfs_set_opt(info->mount_opt, SPACE_CACHE);
 			break;
+		case Opt_rescan_uuid_tree:
+			btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE);
+			break;
 		case Opt_no_space_cache:
 			printk(KERN_INFO "btrfs: disabling disk space caching\n");
 			btrfs_clear_opt(info->mount_opt, SPACE_CACHE);
@@ -596,13 +619,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 			btrfs_set_opt(info->mount_opt, CHECK_INTEGRITY);
 			break;
 		case Opt_check_integrity_print_mask:
-			intarg = 0;
-			match_int(&args[0], &intarg);
-			if (intarg) {
+			ret = match_int(&args[0], &intarg);
+			if (ret) {
+				goto out;
+			} else if (intarg >= 0) {
 				info->check_integrity_print_mask = intarg;
 				printk(KERN_INFO "btrfs:"
 				       " check_integrity_print_mask 0x%x\n",
 				       info->check_integrity_print_mask);
+			} else {
+				ret = -EINVAL;
+				goto out;
 			}
 			break;
 #else
@@ -626,6 +653,29 @@ int btrfs_parse_options(struct btrfs_root *root, char *options)
 				goto out;
 			}
 			break;
+		case Opt_commit_interval:
+			intarg = 0;
+			ret = match_int(&args[0], &intarg);
+			if (ret < 0) {
+				printk(KERN_ERR
+					"btrfs: invalid commit interval\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			if (intarg > 0) {
+				if (intarg > 300) {
+					printk(KERN_WARNING
+					    "btrfs: excessive commit interval %d\n",
+							intarg);
+				}
+				info->commit_interval = intarg;
+			} else {
+				printk(KERN_INFO
+				    "btrfs: using default commit interval %ds\n",
+				    BTRFS_DEFAULT_COMMIT_INTERVAL);
+				info->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL;
+			}
+			break;
 		case Opt_err:
 			printk(KERN_INFO "btrfs: unrecognized mount option "
 			       "'%s'\n", p);
@@ -654,8 +704,8 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 {
 	substring_t args[MAX_OPT_ARGS];
 	char *device_name, *opts, *orig, *p;
+	char *num = NULL;
 	int error = 0;
-	int intarg;
 
 	if (!options)
 		return 0;
@@ -679,17 +729,23 @@ static int btrfs_parse_early_options(const char *options, fmode_t flags,
 		case Opt_subvol:
 			kfree(*subvol_name);
 			*subvol_name = match_strdup(&args[0]);
+			if (!*subvol_name) {
+				error = -ENOMEM;
+				goto out;
+			}
 			break;
 		case Opt_subvolid:
-			intarg = 0;
-			error = match_int(&args[0], &intarg);
-			if (!error) {
+			num = match_strdup(&args[0]);
+			if (num) {
+				*subvol_objectid = memparse(num, NULL);
+				kfree(num);
 				/* we want the original fs_tree */
-				if (!intarg)
+				if (!*subvol_objectid)
 					*subvol_objectid =
 						BTRFS_FS_TREE_OBJECTID;
-				else
-					*subvol_objectid = intarg;
+			} else {
+				error = -EINVAL;
+				goto out;
 			}
 			break;
 		case Opt_subvolrootid:
@@ -892,11 +948,9 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 	if (btrfs_test_opt(root, NOBARRIER))
 		seq_puts(seq, ",nobarrier");
 	if (info->max_inline != 8192 * 1024)
-		seq_printf(seq, ",max_inline=%llu",
-			   (unsigned long long)info->max_inline);
+		seq_printf(seq, ",max_inline=%llu", info->max_inline);
 	if (info->alloc_start != 0)
-		seq_printf(seq, ",alloc_start=%llu",
-			   (unsigned long long)info->alloc_start);
+		seq_printf(seq, ",alloc_start=%llu", info->alloc_start);
 	if (info->thread_pool_size !=  min_t(unsigned long,
 					     num_online_cpus() + 2, 8))
 		seq_printf(seq, ",thread_pool=%d", info->thread_pool_size);
@@ -928,6 +982,8 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",space_cache");
 	else
 		seq_puts(seq, ",nospace_cache");
+	if (btrfs_test_opt(root, RESCAN_UUID_TREE))
+		seq_puts(seq, ",rescan_uuid_tree");
 	if (btrfs_test_opt(root, CLEAR_CACHE))
 		seq_puts(seq, ",clear_cache");
 	if (btrfs_test_opt(root, USER_SUBVOL_RM_ALLOWED))
@@ -940,8 +996,24 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry)
 		seq_puts(seq, ",inode_cache");
 	if (btrfs_test_opt(root, SKIP_BALANCE))
 		seq_puts(seq, ",skip_balance");
+	if (btrfs_test_opt(root, RECOVERY))
+		seq_puts(seq, ",recovery");
+#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
+	if (btrfs_test_opt(root, CHECK_INTEGRITY_INCLUDING_EXTENT_DATA))
+		seq_puts(seq, ",check_int_data");
+	else if (btrfs_test_opt(root, CHECK_INTEGRITY))
+		seq_puts(seq, ",check_int");
+	if (info->check_integrity_print_mask)
+		seq_printf(seq, ",check_int_print_mask=%d",
+				info->check_integrity_print_mask);
+#endif
+	if (info->metadata_ratio)
+		seq_printf(seq, ",metadata_ratio=%d",
+				info->metadata_ratio);
 	if (btrfs_test_opt(root, PANIC_ON_FATAL_ERROR))
 		seq_puts(seq, ",fatal_errors=panic");
+	if (info->commit_interval != BTRFS_DEFAULT_COMMIT_INTERVAL)
+		seq_printf(seq, ",commit=%d", info->commit_interval);
 	return 0;
 }
 
@@ -1696,6 +1768,11 @@ static void btrfs_print_info(void)
 			"\n");
 }
 
+static int btrfs_run_sanity_tests(void)
+{
+	return btrfs_test_free_space_cache();
+}
+
 static int __init init_btrfs_fs(void)
 {
 	int err;
@@ -1734,23 +1811,32 @@ static int __init init_btrfs_fs(void)
 	if (err)
 		goto free_auto_defrag;
 
-	err = btrfs_interface_init();
+	err = btrfs_prelim_ref_init();
 	if (err)
-		goto free_delayed_ref;
+		goto free_prelim_ref;
 
-	err = register_filesystem(&btrfs_fs_type);
+	err = btrfs_interface_init();
 	if (err)
-		goto unregister_ioctl;
+		goto free_delayed_ref;
 
 	btrfs_init_lockdep();
 
 	btrfs_print_info();
-	btrfs_test_free_space_cache();
+
+	err = btrfs_run_sanity_tests();
+	if (err)
+		goto unregister_ioctl;
+
+	err = register_filesystem(&btrfs_fs_type);
+	if (err)
+		goto unregister_ioctl;
 
 	return 0;
 
 unregister_ioctl:
 	btrfs_interface_exit();
+free_prelim_ref:
+	btrfs_prelim_ref_exit();
 free_delayed_ref:
 	btrfs_delayed_ref_exit();
 free_auto_defrag:
@@ -1777,6 +1863,7 @@ static void __exit exit_btrfs_fs(void)
 	btrfs_delayed_ref_exit();
 	btrfs_auto_defrag_exit();
 	btrfs_delayed_inode_exit();
+	btrfs_prelim_ref_exit();
 	ordered_data_exit();
 	extent_map_exit();
 	extent_io_exit();
diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h
new file mode 100644
index 000000000000..580877625776
--- /dev/null
+++ b/fs/btrfs/tests/btrfs-tests.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_TESTS
+#define __BTRFS_TESTS
+
+#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
+
+#define test_msg(fmt, ...) pr_info("btrfs: selftest: " fmt, ##__VA_ARGS__)
+
+int btrfs_test_free_space_cache(void);
+#else
+static inline int btrfs_test_free_space_cache(void)
+{
+	return 0;
+}
+#endif
+
+#endif
diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c
new file mode 100644
index 000000000000..6fc82010dc15
--- /dev/null
+++ b/fs/btrfs/tests/free-space-tests.c
@@ -0,0 +1,395 @@
+/*
+ * Copyright (C) 2013 Fusion IO.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/slab.h>
+#include "btrfs-tests.h"
+#include "../ctree.h"
+#include "../free-space-cache.h"
+
+#define BITS_PER_BITMAP		(PAGE_CACHE_SIZE * 8)
+static struct btrfs_block_group_cache *init_test_block_group(void)
+{
+	struct btrfs_block_group_cache *cache;
+
+	cache = kzalloc(sizeof(*cache), GFP_NOFS);
+	if (!cache)
+		return NULL;
+	cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
+					GFP_NOFS);
+	if (!cache->free_space_ctl) {
+		kfree(cache);
+		return NULL;
+	}
+
+	cache->key.objectid = 0;
+	cache->key.offset = 1024 * 1024 * 1024;
+	cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
+	cache->sectorsize = 4096;
+
+	spin_lock_init(&cache->lock);
+	INIT_LIST_HEAD(&cache->list);
+	INIT_LIST_HEAD(&cache->cluster_list);
+	INIT_LIST_HEAD(&cache->new_bg_list);
+
+	btrfs_init_free_space_ctl(cache);
+
+	return cache;
+}
+
+/*
+ * This test just does basic sanity checking, making sure we can add an exten
+ * entry and remove space from either end and the middle, and make sure we can
+ * remove space that covers adjacent extent entries.
+ */
+static int test_extents(struct btrfs_block_group_cache *cache)
+{
+	int ret = 0;
+
+	test_msg("Running extent only tests\n");
+
+	/* First just make sure we can remove an entire entry */
+	ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error adding initial extents %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error removing extent %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+		test_msg("Full remove left some lingering space\n");
+		return -1;
+	}
+
+	/* Ok edge and middle cases now */
+	ret = btrfs_add_free_space(cache, 0, 4 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error adding half extent %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 1 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error removing tail end %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error removing front end %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 2 * 1024 * 1024, 4096);
+	if (ret) {
+		test_msg("Error removing middle peice %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+		test_msg("Still have space at the front\n");
+		return -1;
+	}
+
+	if (test_check_exists(cache, 2 * 1024 * 1024, 4096)) {
+		test_msg("Still have space in the middle\n");
+		return -1;
+	}
+
+	if (test_check_exists(cache, 3 * 1024 * 1024, 1 * 1024 * 1024)) {
+		test_msg("Still have space at the end\n");
+		return -1;
+	}
+
+	/* Cleanup */
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+	return 0;
+}
+
+static int test_bitmaps(struct btrfs_block_group_cache *cache)
+{
+	u64 next_bitmap_offset;
+	int ret;
+
+	test_msg("Running bitmap only tests\n");
+
+	ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't create a bitmap entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 0, 4 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error removing bitmap full range %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, 0, 4 * 1024 * 1024)) {
+		test_msg("Left some space in bitmap\n");
+		return -1;
+	}
+
+	ret = test_add_free_space_entry(cache, 0, 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add to our bitmap entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 2 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove middle chunk %d\n", ret);
+		return ret;
+	}
+
+	/*
+	 * The first bitmap we have starts at offset 0 so the next one is just
+	 * at the end of the first bitmap.
+	 */
+	next_bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
+
+	/* Test a bit straddling two bitmaps */
+	ret = test_add_free_space_entry(cache, next_bitmap_offset -
+				   (2 * 1024 * 1024), 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add space that straddles two bitmaps %d\n",
+				ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, next_bitmap_offset -
+				      (1 * 1024 * 1024), 2 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove overlapping space %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, next_bitmap_offset - (1 * 1024 * 1024),
+			 2 * 1024 * 1024)) {
+		test_msg("Left some space when removing overlapping\n");
+		return -1;
+	}
+
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+	return 0;
+}
+
+/* This is the high grade jackassery */
+static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache)
+{
+	u64 bitmap_offset = (u64)(BITS_PER_BITMAP * 4096);
+	int ret;
+
+	test_msg("Running bitmap and extent tests\n");
+
+	/*
+	 * First let's do something simple, an extent at the same offset as the
+	 * bitmap, but the free space completely in the extent and then
+	 * completely in the bitmap.
+	 */
+	ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 1 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't create bitmap entry %d\n", ret);
+		return ret;
+	}
+
+	ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 0, 1 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove extent entry %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, 0, 1 * 1024 * 1024)) {
+		test_msg("Left remnants after our remove\n");
+		return -1;
+	}
+
+	/* Now to add back the extent entry and remove from the bitmap */
+	ret = test_add_free_space_entry(cache, 0, 1 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't re-add extent entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 4 * 1024 * 1024, 1 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove from bitmap %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, 4 * 1024 * 1024, 1 * 1024 * 1024)) {
+		test_msg("Left remnants in the bitmap\n");
+		return -1;
+	}
+
+	/*
+	 * Ok so a little more evil, extent entry and bitmap at the same offset,
+	 * removing an overlapping chunk.
+	 */
+	ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add to a bitmap %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 512 * 1024, 3 * 1024 * 1024);
+	if (ret) {
+		test_msg("Couldn't remove overlapping space %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, 512 * 1024, 3 * 1024 * 1024)) {
+		test_msg("Left over peices after removing overlapping\n");
+		return -1;
+	}
+
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+	/* Now with the extent entry offset into the bitmap */
+	ret = test_add_free_space_entry(cache, 4 * 1024 * 1024, 4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add space to the bitmap %d\n", ret);
+		return ret;
+	}
+
+	ret = test_add_free_space_entry(cache, 2 * 1024 * 1024, 2 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent to the cache %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 3 * 1024 * 1024, 4 * 1024 * 1024);
+	if (ret) {
+		test_msg("Problem removing overlapping space %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, 3 * 1024 * 1024, 4 * 1024 * 1024)) {
+		test_msg("Left something behind when removing space");
+		return -1;
+	}
+
+	/*
+	 * This has blown up in the past, the extent entry starts before the
+	 * bitmap entry, but we're trying to remove an offset that falls
+	 * completely within the bitmap range and is in both the extent entry
+	 * and the bitmap entry, looks like this
+	 *
+	 *   [ extent ]
+	 *      [ bitmap ]
+	 *        [ del ]
+	 */
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+	ret = test_add_free_space_entry(cache, bitmap_offset + 4 * 1024 * 1024,
+				   4 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add bitmap %d\n", ret);
+		return ret;
+	}
+
+	ret = test_add_free_space_entry(cache, bitmap_offset - 1 * 1024 * 1024,
+				   5 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, bitmap_offset + 1 * 1024 * 1024,
+				      5 * 1024 * 1024);
+	if (ret) {
+		test_msg("Failed to free our space %d\n", ret);
+		return ret;
+	}
+
+	if (test_check_exists(cache, bitmap_offset + 1 * 1024 * 1024,
+			 5 * 1024 * 1024)) {
+		test_msg("Left stuff over\n");
+		return -1;
+	}
+
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+
+	/*
+	 * This blew up before, we have part of the free space in a bitmap and
+	 * then the entirety of the rest of the space in an extent.  This used
+	 * to return -EAGAIN back from btrfs_remove_extent, make sure this
+	 * doesn't happen.
+	 */
+	ret = test_add_free_space_entry(cache, 1 * 1024 * 1024, 2 * 1024 * 1024, 1);
+	if (ret) {
+		test_msg("Couldn't add bitmap entry %d\n", ret);
+		return ret;
+	}
+
+	ret = test_add_free_space_entry(cache, 3 * 1024 * 1024, 1 * 1024 * 1024, 0);
+	if (ret) {
+		test_msg("Couldn't add extent entry %d\n", ret);
+		return ret;
+	}
+
+	ret = btrfs_remove_free_space(cache, 1 * 1024 * 1024, 3 * 1024 * 1024);
+	if (ret) {
+		test_msg("Error removing bitmap and extent overlapping %d\n", ret);
+		return ret;
+	}
+
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+	return 0;
+}
+
+int btrfs_test_free_space_cache(void)
+{
+	struct btrfs_block_group_cache *cache;
+	int ret;
+
+	test_msg("Running btrfs free space cache tests\n");
+
+	cache = init_test_block_group();
+	if (!cache) {
+		test_msg("Couldn't run the tests\n");
+		return 0;
+	}
+
+	ret = test_extents(cache);
+	if (ret)
+		goto out;
+	ret = test_bitmaps(cache);
+	if (ret)
+		goto out;
+	ret = test_bitmaps_and_extents(cache);
+	if (ret)
+		goto out;
+out:
+	__btrfs_remove_free_space_cache(cache->free_space_ctl);
+	kfree(cache->free_space_ctl);
+	kfree(cache);
+	test_msg("Free space cache tests finished\n");
+	return ret;
+}
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index af1931a5960d..cac4a3f76323 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -837,7 +837,7 @@ int btrfs_wait_marked_extents(struct btrfs_root *root,
  * them in one of two extent_io trees.  This is used to make sure all of
  * those extents are on disk for transaction or log commit
  */
-int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
+static int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
 				struct extent_io_tree *dirty_pages, int mark)
 {
 	int ret;
@@ -1225,8 +1225,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 		btrfs_set_root_stransid(new_root_item, 0);
 		btrfs_set_root_rtransid(new_root_item, 0);
 	}
-	new_root_item->otime.sec = cpu_to_le64(cur_time.tv_sec);
-	new_root_item->otime.nsec = cpu_to_le32(cur_time.tv_nsec);
+	btrfs_set_stack_timespec_sec(&new_root_item->otime, cur_time.tv_sec);
+	btrfs_set_stack_timespec_nsec(&new_root_item->otime, cur_time.tv_nsec);
 	btrfs_set_root_otransid(new_root_item, trans->transid);
 
 	old = btrfs_lock_root_node(root);
@@ -1311,8 +1311,26 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 					 dentry->d_name.len * 2);
 	parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME;
 	ret = btrfs_update_inode_fallback(trans, parent_root, parent_inode);
-	if (ret)
+	if (ret) {
+		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+	ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root, new_uuid.b,
+				  BTRFS_UUID_KEY_SUBVOL, objectid);
+	if (ret) {
 		btrfs_abort_transaction(trans, root, ret);
+		goto fail;
+	}
+	if (!btrfs_is_empty_uuid(new_root_item->received_uuid)) {
+		ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+					  new_root_item->received_uuid,
+					  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+					  objectid);
+		if (ret && ret != -EEXIST) {
+			btrfs_abort_transaction(trans, root, ret);
+			goto fail;
+		}
+	}
 fail:
 	pending->error = ret;
 dir_item_existed:
@@ -1362,6 +1380,8 @@ static void update_super_roots(struct btrfs_root *root)
 	super->root_level = root_item->level;
 	if (btrfs_test_opt(root, SPACE_CACHE))
 		super->cache_generation = root_item->generation;
+	if (root->fs_info->update_uuid_tree_gen)
+		super->uuid_tree_generation = root_item->generation;
 }
 
 int btrfs_transaction_in_commit(struct btrfs_fs_info *info)
@@ -1928,8 +1948,7 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
 	list_del_init(&root->root_list);
 	spin_unlock(&fs_info->trans_lock);
 
-	pr_debug("btrfs: cleaner removing %llu\n",
-			(unsigned long long)root->objectid);
+	pr_debug("btrfs: cleaner removing %llu\n", root->objectid);
 
 	btrfs_kill_all_delayed_nodes(root);
 
@@ -1942,6 +1961,5 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root)
 	 * If we encounter a transaction abort during snapshot cleaning, we
 	 * don't want to crash here
 	 */
-	BUG_ON(ret < 0 && ret != -EAGAIN && ret != -EROFS);
-	return 1;
+	return (ret < 0) ? 0 : 1;
 }
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index defbc4269897..5c2af8491621 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -160,8 +160,6 @@ int btrfs_should_end_transaction(struct btrfs_trans_handle *trans,
 void btrfs_throttle(struct btrfs_root *root);
 int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
 				struct btrfs_root *root);
-int btrfs_write_and_wait_marked_extents(struct btrfs_root *root,
-				struct extent_io_tree *dirty_pages, int mark);
 int btrfs_write_marked_extents(struct btrfs_root *root,
 				struct extent_io_tree *dirty_pages, int mark);
 int btrfs_wait_marked_extents(struct btrfs_root *root,
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index ff60d8978ae2..0d9613c3f5e5 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -747,7 +747,8 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
 	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
 	if (ret)
 		goto out;
-	btrfs_run_delayed_items(trans, root);
+	else
+		ret = btrfs_run_delayed_items(trans, root);
 out:
 	kfree(name);
 	iput(inode);
@@ -923,7 +924,9 @@ again:
 				kfree(victim_name);
 				if (ret)
 					return ret;
-				btrfs_run_delayed_items(trans, root);
+				ret = btrfs_run_delayed_items(trans, root);
+				if (ret)
+					return ret;
 				*search_done = 1;
 				goto again;
 			}
@@ -990,7 +993,9 @@ again:
 								 inode,
 								 victim_name,
 								 victim_name_len);
-					btrfs_run_delayed_items(trans, root);
+					if (!ret)
+						ret = btrfs_run_delayed_items(
+								  trans, root);
 				}
 				iput(victim_parent);
 				kfree(victim_name);
@@ -1536,8 +1541,10 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans,
 
 	name_len = btrfs_dir_name_len(eb, di);
 	name = kmalloc(name_len, GFP_NOFS);
-	if (!name)
-		return -ENOMEM;
+	if (!name) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	log_type = btrfs_dir_type(eb, di);
 	read_extent_buffer(eb, name, (unsigned long)(di + 1),
@@ -1810,7 +1817,7 @@ again:
 			ret = btrfs_unlink_inode(trans, root, dir, inode,
 						 name, name_len);
 			if (!ret)
-				btrfs_run_delayed_items(trans, root);
+				ret = btrfs_run_delayed_items(trans, root);
 			kfree(name);
 			iput(inode);
 			if (ret)
diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c
new file mode 100644
index 000000000000..dd0dea3766f7
--- /dev/null
+++ b/fs/btrfs/uuid-tree.c
@@ -0,0 +1,358 @@
+/*
+ * Copyright (C) STRATO AG 2013.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/uuid.h>
+#include <asm/unaligned.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "disk-io.h"
+#include "print-tree.h"
+
+
+static void btrfs_uuid_to_key(u8 *uuid, u8 type, struct btrfs_key *key)
+{
+	key->type = type;
+	key->objectid = get_unaligned_le64(uuid);
+	key->offset = get_unaligned_le64(uuid + sizeof(u64));
+}
+
+/* return -ENOENT for !found, < 0 for errors, or 0 if an item was found */
+static int btrfs_uuid_tree_lookup(struct btrfs_root *uuid_root, u8 *uuid,
+				  u8 type, u64 subid)
+{
+	int ret;
+	struct btrfs_path *path = NULL;
+	struct extent_buffer *eb;
+	int slot;
+	u32 item_size;
+	unsigned long offset;
+	struct btrfs_key key;
+
+	if (WARN_ON_ONCE(!uuid_root)) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	btrfs_uuid_to_key(uuid, type, &key);
+	ret = btrfs_search_slot(NULL, uuid_root, &key, path, 0, 0);
+	if (ret < 0) {
+		goto out;
+	} else if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	eb = path->nodes[0];
+	slot = path->slots[0];
+	item_size = btrfs_item_size_nr(eb, slot);
+	offset = btrfs_item_ptr_offset(eb, slot);
+	ret = -ENOENT;
+
+	if (!IS_ALIGNED(item_size, sizeof(u64))) {
+		pr_warn("btrfs: uuid item with illegal size %lu!\n",
+			(unsigned long)item_size);
+		goto out;
+	}
+	while (item_size) {
+		__le64 data;
+
+		read_extent_buffer(eb, &data, offset, sizeof(data));
+		if (le64_to_cpu(data) == subid) {
+			ret = 0;
+			break;
+		}
+		offset += sizeof(data);
+		item_size -= sizeof(data);
+	}
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_uuid_tree_add(struct btrfs_trans_handle *trans,
+			struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+			u64 subid_cpu)
+{
+	int ret;
+	struct btrfs_path *path = NULL;
+	struct btrfs_key key;
+	struct extent_buffer *eb;
+	int slot;
+	unsigned long offset;
+	__le64 subid_le;
+
+	ret = btrfs_uuid_tree_lookup(uuid_root, uuid, type, subid_cpu);
+	if (ret != -ENOENT)
+		return ret;
+
+	if (WARN_ON_ONCE(!uuid_root)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	btrfs_uuid_to_key(uuid, type, &key);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = btrfs_insert_empty_item(trans, uuid_root, path, &key,
+				      sizeof(subid_le));
+	if (ret >= 0) {
+		/* Add an item for the type for the first time */
+		eb = path->nodes[0];
+		slot = path->slots[0];
+		offset = btrfs_item_ptr_offset(eb, slot);
+	} else if (ret == -EEXIST) {
+		/*
+		 * An item with that type already exists.
+		 * Extend the item and store the new subid at the end.
+		 */
+		btrfs_extend_item(uuid_root, path, sizeof(subid_le));
+		eb = path->nodes[0];
+		slot = path->slots[0];
+		offset = btrfs_item_ptr_offset(eb, slot);
+		offset += btrfs_item_size_nr(eb, slot) - sizeof(subid_le);
+	} else if (ret < 0) {
+		pr_warn("btrfs: insert uuid item failed %d (0x%016llx, 0x%016llx) type %u!\n",
+			ret, (unsigned long long)key.objectid,
+			(unsigned long long)key.offset, type);
+		goto out;
+	}
+
+	ret = 0;
+	subid_le = cpu_to_le64(subid_cpu);
+	write_extent_buffer(eb, &subid_le, offset, sizeof(subid_le));
+	btrfs_mark_buffer_dirty(eb);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+int btrfs_uuid_tree_rem(struct btrfs_trans_handle *trans,
+			struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+			u64 subid)
+{
+	int ret;
+	struct btrfs_path *path = NULL;
+	struct btrfs_key key;
+	struct extent_buffer *eb;
+	int slot;
+	unsigned long offset;
+	u32 item_size;
+	unsigned long move_dst;
+	unsigned long move_src;
+	unsigned long move_len;
+
+	if (WARN_ON_ONCE(!uuid_root)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	btrfs_uuid_to_key(uuid, type, &key);
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	ret = btrfs_search_slot(trans, uuid_root, &key, path, -1, 1);
+	if (ret < 0) {
+		pr_warn("btrfs: error %d while searching for uuid item!\n",
+			ret);
+		goto out;
+	}
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	eb = path->nodes[0];
+	slot = path->slots[0];
+	offset = btrfs_item_ptr_offset(eb, slot);
+	item_size = btrfs_item_size_nr(eb, slot);
+	if (!IS_ALIGNED(item_size, sizeof(u64))) {
+		pr_warn("btrfs: uuid item with illegal size %lu!\n",
+			(unsigned long)item_size);
+		ret = -ENOENT;
+		goto out;
+	}
+	while (item_size) {
+		__le64 read_subid;
+
+		read_extent_buffer(eb, &read_subid, offset, sizeof(read_subid));
+		if (le64_to_cpu(read_subid) == subid)
+			break;
+		offset += sizeof(read_subid);
+		item_size -= sizeof(read_subid);
+	}
+
+	if (!item_size) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	item_size = btrfs_item_size_nr(eb, slot);
+	if (item_size == sizeof(subid)) {
+		ret = btrfs_del_item(trans, uuid_root, path);
+		goto out;
+	}
+
+	move_dst = offset;
+	move_src = offset + sizeof(subid);
+	move_len = item_size - (move_src - btrfs_item_ptr_offset(eb, slot));
+	memmove_extent_buffer(eb, move_dst, move_src, move_len);
+	btrfs_truncate_item(uuid_root, path, item_size - sizeof(subid), 1);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
+static int btrfs_uuid_iter_rem(struct btrfs_root *uuid_root, u8 *uuid, u8 type,
+			       u64 subid)
+{
+	struct btrfs_trans_handle *trans;
+	int ret;
+
+	/* 1 - for the uuid item */
+	trans = btrfs_start_transaction(uuid_root, 1);
+	if (IS_ERR(trans)) {
+		ret = PTR_ERR(trans);
+		goto out;
+	}
+
+	ret = btrfs_uuid_tree_rem(trans, uuid_root, uuid, type, subid);
+	btrfs_end_transaction(trans, uuid_root);
+
+out:
+	return ret;
+}
+
+int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info,
+			    int (*check_func)(struct btrfs_fs_info *, u8 *, u8,
+					      u64))
+{
+	struct btrfs_root *root = fs_info->uuid_root;
+	struct btrfs_key key;
+	struct btrfs_key max_key;
+	struct btrfs_path *path;
+	int ret = 0;
+	struct extent_buffer *leaf;
+	int slot;
+	u32 item_size;
+	unsigned long offset;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = 0;
+	key.type = 0;
+	key.offset = 0;
+	max_key.objectid = (u64)-1;
+	max_key.type = (u8)-1;
+	max_key.offset = (u64)-1;
+
+again_search_slot:
+	path->keep_locks = 1;
+	ret = btrfs_search_forward(root, &key, &max_key, path, 0);
+	if (ret) {
+		if (ret > 0)
+			ret = 0;
+		goto out;
+	}
+
+	while (1) {
+		cond_resched();
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		btrfs_item_key_to_cpu(leaf, &key, slot);
+
+		if (key.type != BTRFS_UUID_KEY_SUBVOL &&
+		    key.type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
+			goto skip;
+
+		offset = btrfs_item_ptr_offset(leaf, slot);
+		item_size = btrfs_item_size_nr(leaf, slot);
+		if (!IS_ALIGNED(item_size, sizeof(u64))) {
+			pr_warn("btrfs: uuid item with illegal size %lu!\n",
+				(unsigned long)item_size);
+			goto skip;
+		}
+		while (item_size) {
+			u8 uuid[BTRFS_UUID_SIZE];
+			__le64 subid_le;
+			u64 subid_cpu;
+
+			put_unaligned_le64(key.objectid, uuid);
+			put_unaligned_le64(key.offset, uuid + sizeof(u64));
+			read_extent_buffer(leaf, &subid_le, offset,
+					   sizeof(subid_le));
+			subid_cpu = le64_to_cpu(subid_le);
+			ret = check_func(fs_info, uuid, key.type, subid_cpu);
+			if (ret < 0)
+				goto out;
+			if (ret > 0) {
+				btrfs_release_path(path);
+				ret = btrfs_uuid_iter_rem(root, uuid, key.type,
+							  subid_cpu);
+				if (ret == 0) {
+					/*
+					 * this might look inefficient, but the
+					 * justification is that it is an
+					 * exception that check_func returns 1,
+					 * and that in the regular case only one
+					 * entry per UUID exists.
+					 */
+					goto again_search_slot;
+				}
+				if (ret < 0 && ret != -ENOENT)
+					goto out;
+			}
+			item_size -= sizeof(subid_le);
+			offset += sizeof(subid_le);
+		}
+
+skip:
+		ret = btrfs_next_item(root, path);
+		if (ret == 0)
+			continue;
+		else if (ret > 0)
+			ret = 0;
+		break;
+	}
+
+out:
+	btrfs_free_path(path);
+	if (ret)
+		pr_warn("btrfs: btrfs_uuid_tree_iterate failed %d\n", ret);
+	return 0;
+}
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 67a085381845..0052ca8264d9 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -26,6 +26,7 @@
 #include <linux/ratelimit.h>
 #include <linux/kthread.h>
 #include <linux/raid/pq.h>
+#include <linux/semaphore.h>
 #include <asm/div64.h>
 #include "compat.h"
 #include "ctree.h"
@@ -62,6 +63,48 @@ static void unlock_chunks(struct btrfs_root *root)
 	mutex_unlock(&root->fs_info->chunk_mutex);
 }
 
+static struct btrfs_fs_devices *__alloc_fs_devices(void)
+{
+	struct btrfs_fs_devices *fs_devs;
+
+	fs_devs = kzalloc(sizeof(*fs_devs), GFP_NOFS);
+	if (!fs_devs)
+		return ERR_PTR(-ENOMEM);
+
+	mutex_init(&fs_devs->device_list_mutex);
+
+	INIT_LIST_HEAD(&fs_devs->devices);
+	INIT_LIST_HEAD(&fs_devs->alloc_list);
+	INIT_LIST_HEAD(&fs_devs->list);
+
+	return fs_devs;
+}
+
+/**
+ * alloc_fs_devices - allocate struct btrfs_fs_devices
+ * @fsid:	a pointer to UUID for this FS.  If NULL a new UUID is
+ *		generated.
+ *
+ * Return: a pointer to a new &struct btrfs_fs_devices on success;
+ * ERR_PTR() on error.  Returned struct is not linked onto any lists and
+ * can be destroyed with kfree() right away.
+ */
+static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid)
+{
+	struct btrfs_fs_devices *fs_devs;
+
+	fs_devs = __alloc_fs_devices();
+	if (IS_ERR(fs_devs))
+		return fs_devs;
+
+	if (fsid)
+		memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
+	else
+		generate_random_uuid(fs_devs->fsid);
+
+	return fs_devs;
+}
+
 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 {
 	struct btrfs_device *device;
@@ -101,6 +144,27 @@ void btrfs_cleanup_fs_uuids(void)
 	}
 }
 
+static struct btrfs_device *__alloc_device(void)
+{
+	struct btrfs_device *dev;
+
+	dev = kzalloc(sizeof(*dev), GFP_NOFS);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	INIT_LIST_HEAD(&dev->dev_list);
+	INIT_LIST_HEAD(&dev->dev_alloc_list);
+
+	spin_lock_init(&dev->io_lock);
+
+	spin_lock_init(&dev->reada_lock);
+	atomic_set(&dev->reada_in_flight, 0);
+	INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT);
+	INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT);
+
+	return dev;
+}
+
 static noinline struct btrfs_device *__find_device(struct list_head *head,
 						   u64 devid, u8 *uuid)
 {
@@ -395,16 +459,14 @@ static noinline int device_list_add(const char *path,
 
 	fs_devices = find_fsid(disk_super->fsid);
 	if (!fs_devices) {
-		fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
-		if (!fs_devices)
-			return -ENOMEM;
-		INIT_LIST_HEAD(&fs_devices->devices);
-		INIT_LIST_HEAD(&fs_devices->alloc_list);
+		fs_devices = alloc_fs_devices(disk_super->fsid);
+		if (IS_ERR(fs_devices))
+			return PTR_ERR(fs_devices);
+
 		list_add(&fs_devices->list, &fs_uuids);
-		memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE);
 		fs_devices->latest_devid = devid;
 		fs_devices->latest_trans = found_transid;
-		mutex_init(&fs_devices->device_list_mutex);
+
 		device = NULL;
 	} else {
 		device = __find_device(&fs_devices->devices, devid,
@@ -414,17 +476,12 @@ static noinline int device_list_add(const char *path,
 		if (fs_devices->opened)
 			return -EBUSY;
 
-		device = kzalloc(sizeof(*device), GFP_NOFS);
-		if (!device) {
+		device = btrfs_alloc_device(NULL, &devid,
+					    disk_super->dev_item.uuid);
+		if (IS_ERR(device)) {
 			/* we can safely leave the fs_devices entry around */
-			return -ENOMEM;
+			return PTR_ERR(device);
 		}
-		device->devid = devid;
-		device->dev_stats_valid = 0;
-		device->work.func = pending_bios_fn;
-		memcpy(device->uuid, disk_super->dev_item.uuid,
-		       BTRFS_UUID_SIZE);
-		spin_lock_init(&device->io_lock);
 
 		name = rcu_string_strdup(path, GFP_NOFS);
 		if (!name) {
@@ -432,22 +489,13 @@ static noinline int device_list_add(const char *path,
 			return -ENOMEM;
 		}
 		rcu_assign_pointer(device->name, name);
-		INIT_LIST_HEAD(&device->dev_alloc_list);
-
-		/* init readahead state */
-		spin_lock_init(&device->reada_lock);
-		device->reada_curr_zone = NULL;
-		atomic_set(&device->reada_in_flight, 0);
-		device->reada_next = 0;
-		INIT_RADIX_TREE(&device->reada_zones, GFP_NOFS & ~__GFP_WAIT);
-		INIT_RADIX_TREE(&device->reada_extents, GFP_NOFS & ~__GFP_WAIT);
 
 		mutex_lock(&fs_devices->device_list_mutex);
 		list_add_rcu(&device->dev_list, &fs_devices->devices);
+		fs_devices->num_devices++;
 		mutex_unlock(&fs_devices->device_list_mutex);
 
 		device->fs_devices = fs_devices;
-		fs_devices->num_devices++;
 	} else if (!device->name || strcmp(device->name->str, path)) {
 		name = rcu_string_strdup(path, GFP_NOFS);
 		if (!name)
@@ -474,25 +522,21 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 	struct btrfs_device *device;
 	struct btrfs_device *orig_dev;
 
-	fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
-	if (!fs_devices)
-		return ERR_PTR(-ENOMEM);
+	fs_devices = alloc_fs_devices(orig->fsid);
+	if (IS_ERR(fs_devices))
+		return fs_devices;
 
-	INIT_LIST_HEAD(&fs_devices->devices);
-	INIT_LIST_HEAD(&fs_devices->alloc_list);
-	INIT_LIST_HEAD(&fs_devices->list);
-	mutex_init(&fs_devices->device_list_mutex);
 	fs_devices->latest_devid = orig->latest_devid;
 	fs_devices->latest_trans = orig->latest_trans;
 	fs_devices->total_devices = orig->total_devices;
-	memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid));
 
 	/* We have held the volume lock, it is safe to get the devices. */
 	list_for_each_entry(orig_dev, &orig->devices, dev_list) {
 		struct rcu_string *name;
 
-		device = kzalloc(sizeof(*device), GFP_NOFS);
-		if (!device)
+		device = btrfs_alloc_device(NULL, &orig_dev->devid,
+					    orig_dev->uuid);
+		if (IS_ERR(device))
 			goto error;
 
 		/*
@@ -506,13 +550,6 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
 		}
 		rcu_assign_pointer(device->name, name);
 
-		device->devid = orig_dev->devid;
-		device->work.func = pending_bios_fn;
-		memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid));
-		spin_lock_init(&device->io_lock);
-		INIT_LIST_HEAD(&device->dev_list);
-		INIT_LIST_HEAD(&device->dev_alloc_list);
-
 		list_add(&device->dev_list, &fs_devices->devices);
 		device->fs_devices = fs_devices;
 		fs_devices->num_devices++;
@@ -636,23 +673,22 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
 
 		if (device->can_discard)
 			fs_devices->num_can_discard--;
+		if (device->missing)
+			fs_devices->missing_devices--;
 
-		new_device = kmalloc(sizeof(*new_device), GFP_NOFS);
-		BUG_ON(!new_device); /* -ENOMEM */
-		memcpy(new_device, device, sizeof(*new_device));
+		new_device = btrfs_alloc_device(NULL, &device->devid,
+						device->uuid);
+		BUG_ON(IS_ERR(new_device)); /* -ENOMEM */
 
 		/* Safe because we are under uuid_mutex */
 		if (device->name) {
 			name = rcu_string_strdup(device->name->str, GFP_NOFS);
-			BUG_ON(device->name && !name); /* -ENOMEM */
+			BUG_ON(!name); /* -ENOMEM */
 			rcu_assign_pointer(new_device->name, name);
 		}
-		new_device->bdev = NULL;
-		new_device->writeable = 0;
-		new_device->in_fs_metadata = 0;
-		new_device->can_discard = 0;
-		spin_lock_init(&new_device->io_lock);
+
 		list_replace_rcu(&device->dev_list, &new_device->dev_list);
+		new_device->fs_devices = device->fs_devices;
 
 		call_rcu(&device->rcu, free_device);
 	}
@@ -865,7 +901,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 	disk_super = p + (bytenr & ~PAGE_CACHE_MASK);
 
 	if (btrfs_super_bytenr(disk_super) != bytenr ||
-	    disk_super->magic != cpu_to_le64(BTRFS_MAGIC))
+	    btrfs_super_magic(disk_super) != BTRFS_MAGIC)
 		goto error_unmap;
 
 	devid = btrfs_stack_device_id(&disk_super->dev_item);
@@ -880,8 +916,7 @@ int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder,
 		printk(KERN_INFO "device fsid %pU ", disk_super->fsid);
 	}
 
-	printk(KERN_CONT "devid %llu transid %llu %s\n",
-	       (unsigned long long)devid, (unsigned long long)transid, path);
+	printk(KERN_CONT "devid %llu transid %llu %s\n", devid, transid, path);
 
 	ret = device_list_add(path, disk_super, devid, fs_devices_ret);
 	if (!ret && fs_devices_ret)
@@ -1278,8 +1313,7 @@ static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
 	btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
 
 	write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid,
-		    (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent),
-		    BTRFS_UUID_SIZE);
+		    btrfs_dev_extent_chunk_tree_uuid(extent), BTRFS_UUID_SIZE);
 
 	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
 	btrfs_mark_buffer_dirty(leaf);
@@ -1307,15 +1341,14 @@ static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
 	return ret;
 }
 
-static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
+static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
+				    u64 *devid_ret)
 {
 	int ret;
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct btrfs_path *path;
 
-	root = root->fs_info->chunk_root;
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
@@ -1324,20 +1357,21 @@ static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid)
 	key.type = BTRFS_DEV_ITEM_KEY;
 	key.offset = (u64)-1;
 
-	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
 	if (ret < 0)
 		goto error;
 
 	BUG_ON(ret == 0); /* Corruption */
 
-	ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
+	ret = btrfs_previous_item(fs_info->chunk_root, path,
+				  BTRFS_DEV_ITEMS_OBJECTID,
 				  BTRFS_DEV_ITEM_KEY);
 	if (ret) {
-		*objectid = 1;
+		*devid_ret = 1;
 	} else {
 		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
 				      path->slots[0]);
-		*objectid = found_key.offset + 1;
+		*devid_ret = found_key.offset + 1;
 	}
 	ret = 0;
 error:
@@ -1391,9 +1425,9 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans,
 	btrfs_set_device_bandwidth(leaf, dev_item, 0);
 	btrfs_set_device_start_offset(leaf, dev_item, 0);
 
-	ptr = (unsigned long)btrfs_device_uuid(dev_item);
+	ptr = btrfs_device_uuid(dev_item);
 	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
-	ptr = (unsigned long)btrfs_device_fsid(dev_item);
+	ptr = btrfs_device_fsid(dev_item);
 	write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE);
 	btrfs_mark_buffer_dirty(leaf);
 
@@ -1562,7 +1596,9 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		clear_super = true;
 	}
 
+	mutex_unlock(&uuid_mutex);
 	ret = btrfs_shrink_device(device, 0);
+	mutex_lock(&uuid_mutex);
 	if (ret)
 		goto error_undo;
 
@@ -1586,7 +1622,11 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 	/*
 	 * the device list mutex makes sure that we don't change
 	 * the device list while someone else is writing out all
-	 * the device supers.
+	 * the device supers. Whoever is writing all supers, should
+	 * lock the device list mutex before getting the number of
+	 * devices in the super block (super_copy). Conversely,
+	 * whoever updates the number of devices in the super block
+	 * (super_copy) should hold the device list mutex.
 	 */
 
 	cur_devices = device->fs_devices;
@@ -1610,10 +1650,10 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path)
 		device->fs_devices->open_devices--;
 
 	call_rcu(&device->rcu, free_device);
-	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
 	num_devices = btrfs_super_num_devices(root->fs_info->super_copy) - 1;
 	btrfs_set_super_num_devices(root->fs_info->super_copy, num_devices);
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
 	if (cur_devices->open_devices == 0) {
 		struct btrfs_fs_devices *fs_devices;
@@ -1793,9 +1833,9 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
 	if (!fs_devices->seeding)
 		return -EINVAL;
 
-	seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS);
-	if (!seed_devices)
-		return -ENOMEM;
+	seed_devices = __alloc_fs_devices();
+	if (IS_ERR(seed_devices))
+		return PTR_ERR(seed_devices);
 
 	old_devices = clone_fs_devices(fs_devices);
 	if (IS_ERR(old_devices)) {
@@ -1814,7 +1854,6 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
 			      synchronize_rcu);
-	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
 	list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list);
 	list_for_each_entry(device, &seed_devices->devices, dev_list) {
@@ -1830,6 +1869,8 @@ static int btrfs_prepare_sprout(struct btrfs_root *root)
 	generate_random_uuid(fs_devices->fsid);
 	memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
 	memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
+	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
+
 	super_flags = btrfs_super_flags(disk_super) &
 		      ~BTRFS_SUPER_FLAG_SEEDING;
 	btrfs_set_super_flags(disk_super, super_flags);
@@ -1889,11 +1930,9 @@ next_slot:
 		dev_item = btrfs_item_ptr(leaf, path->slots[0],
 					  struct btrfs_dev_item);
 		devid = btrfs_device_id(leaf, dev_item);
-		read_extent_buffer(leaf, dev_uuid,
-				   (unsigned long)btrfs_device_uuid(dev_item),
+		read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
 				   BTRFS_UUID_SIZE);
-		read_extent_buffer(leaf, fs_uuid,
-				   (unsigned long)btrfs_device_fsid(dev_item),
+		read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
 				   BTRFS_UUID_SIZE);
 		device = btrfs_find_device(root->fs_info, devid, dev_uuid,
 					   fs_uuid);
@@ -1956,10 +1995,10 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	}
 	mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
 
-	device = kzalloc(sizeof(*device), GFP_NOFS);
-	if (!device) {
+	device = btrfs_alloc_device(root->fs_info, NULL, NULL);
+	if (IS_ERR(device)) {
 		/* we can safely leave the fs_devices entry around */
-		ret = -ENOMEM;
+		ret = PTR_ERR(device);
 		goto error;
 	}
 
@@ -1971,13 +2010,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	}
 	rcu_assign_pointer(device->name, name);
 
-	ret = find_next_devid(root, &device->devid);
-	if (ret) {
-		rcu_string_free(device->name);
-		kfree(device);
-		goto error;
-	}
-
 	trans = btrfs_start_transaction(root, 0);
 	if (IS_ERR(trans)) {
 		rcu_string_free(device->name);
@@ -1992,9 +2024,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path)
 	if (blk_queue_discard(q))
 		device->can_discard = 1;
 	device->writeable = 1;
-	device->work.func = pending_bios_fn;
-	generate_random_uuid(device->uuid);
-	spin_lock_init(&device->io_lock);
 	device->generation = trans->transid;
 	device->io_width = root->sectorsize;
 	device->io_align = root->sectorsize;
@@ -2121,6 +2150,7 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
 	struct btrfs_fs_info *fs_info = root->fs_info;
 	struct list_head *devices;
 	struct rcu_string *name;
+	u64 devid = BTRFS_DEV_REPLACE_DEVID;
 	int ret = 0;
 
 	*device_out = NULL;
@@ -2142,9 +2172,9 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
 		}
 	}
 
-	device = kzalloc(sizeof(*device), GFP_NOFS);
-	if (!device) {
-		ret = -ENOMEM;
+	device = btrfs_alloc_device(NULL, &devid, NULL);
+	if (IS_ERR(device)) {
+		ret = PTR_ERR(device);
 		goto error;
 	}
 
@@ -2161,10 +2191,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path,
 		device->can_discard = 1;
 	mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
 	device->writeable = 1;
-	device->work.func = pending_bios_fn;
-	generate_random_uuid(device->uuid);
-	device->devid = BTRFS_DEV_REPLACE_DEVID;
-	spin_lock_init(&device->io_lock);
 	device->generation = 0;
 	device->io_width = root->sectorsize;
 	device->io_align = root->sectorsize;
@@ -2971,10 +2997,6 @@ again:
 		if (found_key.objectid != key.objectid)
 			break;
 
-		/* chunk zero is special */
-		if (found_key.offset == 0)
-			break;
-
 		chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
 
 		if (!counting) {
@@ -3010,6 +3032,8 @@ again:
 			spin_unlock(&fs_info->balance_lock);
 		}
 loop:
+		if (found_key.offset == 0)
+			break;
 		key.offset = found_key.offset - 1;
 	}
 
@@ -3074,9 +3098,6 @@ static void __cancel_balance(struct btrfs_fs_info *fs_info)
 	atomic_set(&fs_info->mutually_exclusive_operation_running, 0);
 }
 
-void update_ioctl_balance_args(struct btrfs_fs_info *fs_info, int lock,
-			       struct btrfs_ioctl_balance_args *bargs);
-
 /*
  * Should be called with both balance and volume mutexes held
  */
@@ -3139,7 +3160,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	     (bctl->data.target & ~allowed))) {
 		printk(KERN_ERR "btrfs: unable to start balance with target "
 		       "data profile %llu\n",
-		       (unsigned long long)bctl->data.target);
+		       bctl->data.target);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -3148,7 +3169,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	     (bctl->meta.target & ~allowed))) {
 		printk(KERN_ERR "btrfs: unable to start balance with target "
 		       "metadata profile %llu\n",
-		       (unsigned long long)bctl->meta.target);
+		       bctl->meta.target);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -3157,7 +3178,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl,
 	     (bctl->sys.target & ~allowed))) {
 		printk(KERN_ERR "btrfs: unable to start balance with target "
 		       "system profile %llu\n",
-		       (unsigned long long)bctl->sys.target);
+		       bctl->sys.target);
 		ret = -EINVAL;
 		goto out;
 	}
@@ -3430,6 +3451,264 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
 	return 0;
 }
 
+static int btrfs_uuid_scan_kthread(void *data)
+{
+	struct btrfs_fs_info *fs_info = data;
+	struct btrfs_root *root = fs_info->tree_root;
+	struct btrfs_key key;
+	struct btrfs_key max_key;
+	struct btrfs_path *path = NULL;
+	int ret = 0;
+	struct extent_buffer *eb;
+	int slot;
+	struct btrfs_root_item root_item;
+	u32 item_size;
+	struct btrfs_trans_handle *trans = NULL;
+
+	path = btrfs_alloc_path();
+	if (!path) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	key.objectid = 0;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = 0;
+
+	max_key.objectid = (u64)-1;
+	max_key.type = BTRFS_ROOT_ITEM_KEY;
+	max_key.offset = (u64)-1;
+
+	path->keep_locks = 1;
+
+	while (1) {
+		ret = btrfs_search_forward(root, &key, &max_key, path, 0);
+		if (ret) {
+			if (ret > 0)
+				ret = 0;
+			break;
+		}
+
+		if (key.type != BTRFS_ROOT_ITEM_KEY ||
+		    (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
+		     key.objectid != BTRFS_FS_TREE_OBJECTID) ||
+		    key.objectid > BTRFS_LAST_FREE_OBJECTID)
+			goto skip;
+
+		eb = path->nodes[0];
+		slot = path->slots[0];
+		item_size = btrfs_item_size_nr(eb, slot);
+		if (item_size < sizeof(root_item))
+			goto skip;
+
+		read_extent_buffer(eb, &root_item,
+				   btrfs_item_ptr_offset(eb, slot),
+				   (int)sizeof(root_item));
+		if (btrfs_root_refs(&root_item) == 0)
+			goto skip;
+
+		if (!btrfs_is_empty_uuid(root_item.uuid) ||
+		    !btrfs_is_empty_uuid(root_item.received_uuid)) {
+			if (trans)
+				goto update_tree;
+
+			btrfs_release_path(path);
+			/*
+			 * 1 - subvol uuid item
+			 * 1 - received_subvol uuid item
+			 */
+			trans = btrfs_start_transaction(fs_info->uuid_root, 2);
+			if (IS_ERR(trans)) {
+				ret = PTR_ERR(trans);
+				break;
+			}
+			continue;
+		} else {
+			goto skip;
+		}
+update_tree:
+		if (!btrfs_is_empty_uuid(root_item.uuid)) {
+			ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+						  root_item.uuid,
+						  BTRFS_UUID_KEY_SUBVOL,
+						  key.objectid);
+			if (ret < 0) {
+				pr_warn("btrfs: uuid_tree_add failed %d\n",
+					ret);
+				break;
+			}
+		}
+
+		if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
+			ret = btrfs_uuid_tree_add(trans, fs_info->uuid_root,
+						  root_item.received_uuid,
+						 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
+						  key.objectid);
+			if (ret < 0) {
+				pr_warn("btrfs: uuid_tree_add failed %d\n",
+					ret);
+				break;
+			}
+		}
+
+skip:
+		if (trans) {
+			ret = btrfs_end_transaction(trans, fs_info->uuid_root);
+			trans = NULL;
+			if (ret)
+				break;
+		}
+
+		btrfs_release_path(path);
+		if (key.offset < (u64)-1) {
+			key.offset++;
+		} else if (key.type < BTRFS_ROOT_ITEM_KEY) {
+			key.offset = 0;
+			key.type = BTRFS_ROOT_ITEM_KEY;
+		} else if (key.objectid < (u64)-1) {
+			key.offset = 0;
+			key.type = BTRFS_ROOT_ITEM_KEY;
+			key.objectid++;
+		} else {
+			break;
+		}
+		cond_resched();
+	}
+
+out:
+	btrfs_free_path(path);
+	if (trans && !IS_ERR(trans))
+		btrfs_end_transaction(trans, fs_info->uuid_root);
+	if (ret)
+		pr_warn("btrfs: btrfs_uuid_scan_kthread failed %d\n", ret);
+	else
+		fs_info->update_uuid_tree_gen = 1;
+	up(&fs_info->uuid_tree_rescan_sem);
+	return 0;
+}
+
+/*
+ * Callback for btrfs_uuid_tree_iterate().
+ * returns:
+ * 0	check succeeded, the entry is not outdated.
+ * < 0	if an error occured.
+ * > 0	if the check failed, which means the caller shall remove the entry.
+ */
+static int btrfs_check_uuid_tree_entry(struct btrfs_fs_info *fs_info,
+				       u8 *uuid, u8 type, u64 subid)
+{
+	struct btrfs_key key;
+	int ret = 0;
+	struct btrfs_root *subvol_root;
+
+	if (type != BTRFS_UUID_KEY_SUBVOL &&
+	    type != BTRFS_UUID_KEY_RECEIVED_SUBVOL)
+		goto out;
+
+	key.objectid = subid;
+	key.type = BTRFS_ROOT_ITEM_KEY;
+	key.offset = (u64)-1;
+	subvol_root = btrfs_read_fs_root_no_name(fs_info, &key);
+	if (IS_ERR(subvol_root)) {
+		ret = PTR_ERR(subvol_root);
+		if (ret == -ENOENT)
+			ret = 1;
+		goto out;
+	}
+
+	switch (type) {
+	case BTRFS_UUID_KEY_SUBVOL:
+		if (memcmp(uuid, subvol_root->root_item.uuid, BTRFS_UUID_SIZE))
+			ret = 1;
+		break;
+	case BTRFS_UUID_KEY_RECEIVED_SUBVOL:
+		if (memcmp(uuid, subvol_root->root_item.received_uuid,
+			   BTRFS_UUID_SIZE))
+			ret = 1;
+		break;
+	}
+
+out:
+	return ret;
+}
+
+static int btrfs_uuid_rescan_kthread(void *data)
+{
+	struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
+	int ret;
+
+	/*
+	 * 1st step is to iterate through the existing UUID tree and
+	 * to delete all entries that contain outdated data.
+	 * 2nd step is to add all missing entries to the UUID tree.
+	 */
+	ret = btrfs_uuid_tree_iterate(fs_info, btrfs_check_uuid_tree_entry);
+	if (ret < 0) {
+		pr_warn("btrfs: iterating uuid_tree failed %d\n", ret);
+		up(&fs_info->uuid_tree_rescan_sem);
+		return ret;
+	}
+	return btrfs_uuid_scan_kthread(data);
+}
+
+int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_trans_handle *trans;
+	struct btrfs_root *tree_root = fs_info->tree_root;
+	struct btrfs_root *uuid_root;
+	struct task_struct *task;
+	int ret;
+
+	/*
+	 * 1 - root node
+	 * 1 - root item
+	 */
+	trans = btrfs_start_transaction(tree_root, 2);
+	if (IS_ERR(trans))
+		return PTR_ERR(trans);
+
+	uuid_root = btrfs_create_tree(trans, fs_info,
+				      BTRFS_UUID_TREE_OBJECTID);
+	if (IS_ERR(uuid_root)) {
+		btrfs_abort_transaction(trans, tree_root,
+					PTR_ERR(uuid_root));
+		return PTR_ERR(uuid_root);
+	}
+
+	fs_info->uuid_root = uuid_root;
+
+	ret = btrfs_commit_transaction(trans, tree_root);
+	if (ret)
+		return ret;
+
+	down(&fs_info->uuid_tree_rescan_sem);
+	task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
+	if (IS_ERR(task)) {
+		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
+		pr_warn("btrfs: failed to start uuid_scan task\n");
+		up(&fs_info->uuid_tree_rescan_sem);
+		return PTR_ERR(task);
+	}
+
+	return 0;
+}
+
+int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info)
+{
+	struct task_struct *task;
+
+	down(&fs_info->uuid_tree_rescan_sem);
+	task = kthread_run(btrfs_uuid_rescan_kthread, fs_info, "btrfs-uuid");
+	if (IS_ERR(task)) {
+		/* fs_info->update_uuid_tree_gen remains 0 in all error case */
+		pr_warn("btrfs: failed to start uuid_rescan task\n");
+		up(&fs_info->uuid_tree_rescan_sem);
+		return PTR_ERR(task);
+	}
+
+	return 0;
+}
+
 /*
  * shrinking a device means finding all of the device extents past
  * the new size, and then following the back refs to the chunks.
@@ -4194,13 +4473,13 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
 	 * and exit, so return 1 so the callers don't try to use other copies.
 	 */
 	if (!em) {
-		btrfs_emerg(fs_info, "No mapping for %Lu-%Lu\n", logical,
+		btrfs_crit(fs_info, "No mapping for %Lu-%Lu\n", logical,
 			    logical+len);
 		return 1;
 	}
 
 	if (em->start > logical || em->start + em->len < logical) {
-		btrfs_emerg(fs_info, "Invalid mapping for %Lu-%Lu, got "
+		btrfs_crit(fs_info, "Invalid mapping for %Lu-%Lu, got "
 			    "%Lu-%Lu\n", logical, logical+len, em->start,
 			    em->start + em->len);
 		return 1;
@@ -4375,8 +4654,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 
 	if (!em) {
 		btrfs_crit(fs_info, "unable to find logical %llu len %llu",
-			(unsigned long long)logical,
-			(unsigned long long)*length);
+			logical, *length);
 		return -EINVAL;
 	}
 
@@ -4671,6 +4949,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw,
 	}
 	bbio = kzalloc(btrfs_bio_size(num_alloc_stripes), GFP_NOFS);
 	if (!bbio) {
+		kfree(raid_map);
 		ret = -ENOMEM;
 		goto out;
 	}
@@ -5246,9 +5525,7 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio,
 
 	if (map_length < length) {
 		btrfs_crit(root->fs_info, "mapping failed logical %llu bio len %llu len %llu",
-			(unsigned long long)logical,
-			(unsigned long long)length,
-			(unsigned long long)map_length);
+			logical, length, map_length);
 		BUG();
 	}
 
@@ -5314,23 +5591,72 @@ static struct btrfs_device *add_missing_dev(struct btrfs_root *root,
 	struct btrfs_device *device;
 	struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
 
-	device = kzalloc(sizeof(*device), GFP_NOFS);
-	if (!device)
+	device = btrfs_alloc_device(NULL, &devid, dev_uuid);
+	if (IS_ERR(device))
 		return NULL;
-	list_add(&device->dev_list,
-		 &fs_devices->devices);
-	device->devid = devid;
-	device->work.func = pending_bios_fn;
+
+	list_add(&device->dev_list, &fs_devices->devices);
 	device->fs_devices = fs_devices;
-	device->missing = 1;
 	fs_devices->num_devices++;
+
+	device->missing = 1;
 	fs_devices->missing_devices++;
-	spin_lock_init(&device->io_lock);
-	INIT_LIST_HEAD(&device->dev_alloc_list);
-	memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE);
+
 	return device;
 }
 
+/**
+ * btrfs_alloc_device - allocate struct btrfs_device
+ * @fs_info:	used only for generating a new devid, can be NULL if
+ *		devid is provided (i.e. @devid != NULL).
+ * @devid:	a pointer to devid for this device.  If NULL a new devid
+ *		is generated.
+ * @uuid:	a pointer to UUID for this device.  If NULL a new UUID
+ *		is generated.
+ *
+ * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
+ * on error.  Returned struct is not linked onto any lists and can be
+ * destroyed with kfree() right away.
+ */
+struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
+					const u64 *devid,
+					const u8 *uuid)
+{
+	struct btrfs_device *dev;
+	u64 tmp;
+
+	if (!devid && !fs_info) {
+		WARN_ON(1);
+		return ERR_PTR(-EINVAL);
+	}
+
+	dev = __alloc_device();
+	if (IS_ERR(dev))
+		return dev;
+
+	if (devid)
+		tmp = *devid;
+	else {
+		int ret;
+
+		ret = find_next_devid(fs_info, &tmp);
+		if (ret) {
+			kfree(dev);
+			return ERR_PTR(ret);
+		}
+	}
+	dev->devid = tmp;
+
+	if (uuid)
+		memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
+	else
+		generate_random_uuid(dev->uuid);
+
+	dev->work.func = pending_bios_fn;
+
+	return dev;
+}
+
 static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
 			  struct extent_buffer *leaf,
 			  struct btrfs_chunk *chunk)
@@ -5437,7 +5763,7 @@ static void fill_device_from_item(struct extent_buffer *leaf,
 	WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
 	device->is_tgtdev_for_dev_replace = 0;
 
-	ptr = (unsigned long)btrfs_device_uuid(dev_item);
+	ptr = btrfs_device_uuid(dev_item);
 	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
 }
 
@@ -5500,11 +5826,9 @@ static int read_one_dev(struct btrfs_root *root,
 	u8 dev_uuid[BTRFS_UUID_SIZE];
 
 	devid = btrfs_device_id(leaf, dev_item);
-	read_extent_buffer(leaf, dev_uuid,
-			   (unsigned long)btrfs_device_uuid(dev_item),
+	read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
 			   BTRFS_UUID_SIZE);
-	read_extent_buffer(leaf, fs_uuid,
-			   (unsigned long)btrfs_device_fsid(dev_item),
+	read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
 			   BTRFS_UUID_SIZE);
 
 	if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) {
@@ -5519,8 +5843,7 @@ static int read_one_dev(struct btrfs_root *root,
 			return -EIO;
 
 		if (!device) {
-			btrfs_warn(root->fs_info, "devid %llu missing",
-				(unsigned long long)devid);
+			btrfs_warn(root->fs_info, "devid %llu missing", devid);
 			device = add_missing_dev(root, devid, dev_uuid);
 			if (!device)
 				return -ENOMEM;
@@ -5644,14 +5967,15 @@ int btrfs_read_chunk_tree(struct btrfs_root *root)
 	mutex_lock(&uuid_mutex);
 	lock_chunks(root);
 
-	/* first we search for all of the device items, and then we
-	 * read in all of the chunk items.  This way we can create chunk
-	 * mappings that reference all of the devices that are afound
+	/*
+	 * Read all device items, and then all the chunk items. All
+	 * device items are found before any chunk item (their object id
+	 * is smaller than the lowest possible object id for a chunk
+	 * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
 	 */
 	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
 	key.offset = 0;
 	key.type = 0;
-again:
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		goto error;
@@ -5667,17 +5991,13 @@ again:
 			break;
 		}
 		btrfs_item_key_to_cpu(leaf, &found_key, slot);
-		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
-			if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
-				break;
-			if (found_key.type == BTRFS_DEV_ITEM_KEY) {
-				struct btrfs_dev_item *dev_item;
-				dev_item = btrfs_item_ptr(leaf, slot,
+		if (found_key.type == BTRFS_DEV_ITEM_KEY) {
+			struct btrfs_dev_item *dev_item;
+			dev_item = btrfs_item_ptr(leaf, slot,
 						  struct btrfs_dev_item);
-				ret = read_one_dev(root, leaf, dev_item);
-				if (ret)
-					goto error;
-			}
+			ret = read_one_dev(root, leaf, dev_item);
+			if (ret)
+				goto error;
 		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
 			struct btrfs_chunk *chunk;
 			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
@@ -5687,11 +6007,6 @@ again:
 		}
 		path->slots[0]++;
 	}
-	if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
-		key.objectid = 0;
-		btrfs_release_path(path);
-		goto again;
-	}
 	ret = 0;
 error:
 	unlock_chunks(root);
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index 86705583480d..b72f540c8b29 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -152,6 +152,8 @@ struct btrfs_fs_devices {
 	int rotating;
 };
 
+#define BTRFS_BIO_INLINE_CSUM_SIZE	64
+
 /*
  * we need the mirror number and stripe index to be passed around
  * the call chain while we are processing end_io (especially errors).
@@ -161,9 +163,14 @@ struct btrfs_fs_devices {
  * we allocate are actually btrfs_io_bios.  We'll cram as much of
  * struct btrfs_bio as we can into this over time.
  */
+typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err);
 struct btrfs_io_bio {
 	unsigned long mirror_num;
 	unsigned long stripe_index;
+	u8 *csum;
+	u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
+	u8 *csum_allocated;
+	btrfs_io_bio_end_io_t *end_io;
 	struct bio bio;
 };
 
@@ -298,6 +305,9 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info,
 int btrfs_find_device_missing_or_by_path(struct btrfs_root *root,
 					 char *device_path,
 					 struct btrfs_device **device);
+struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
+					const u64 *devid,
+					const u8 *uuid);
 int btrfs_rm_device(struct btrfs_root *root, char *device_path);
 void btrfs_cleanup_fs_uuids(void);
 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len);
@@ -315,6 +325,8 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
 int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
 int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
+int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
+int btrfs_check_uuid_tree(struct btrfs_fs_info *fs_info);
 int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset);
 int find_free_dev_extent(struct btrfs_trans_handle *trans,
 			 struct btrfs_device *device, u64 num_bytes,
diff --git a/fs/cifs/AUTHORS b/fs/cifs/AUTHORS
deleted file mode 100644
index ea940b1db77b..000000000000
--- a/fs/cifs/AUTHORS
+++ /dev/null
@@ -1,55 +0,0 @@
-Original Author
-===============
-Steve French (sfrench@samba.org)
-
-The author wishes to express his appreciation and thanks to:
-Andrew Tridgell (Samba team) for his early suggestions about smb/cifs VFS
-improvements. Thanks to IBM for allowing me time and test resources to pursue
-this project, to Jim McDonough from IBM (and the Samba Team) for his help, to
-the IBM Linux JFS team for explaining many esoteric Linux filesystem features.
-Jeremy Allison of the Samba team has done invaluable work in adding the server
-side of the original CIFS Unix extensions and reviewing and implementing
-portions of the newer CIFS POSIX extensions into the Samba 3 file server. Thank
-Dave Boutcher of IBM Rochester (author of the OS/400 smb/cifs filesystem client)
-for proving years ago that very good smb/cifs clients could be done on Unix-like
-operating systems.  Volker Lendecke, Andrew Tridgell, Urban Widmark, John 
-Newbigin and others for their work on the Linux smbfs module.  Thanks to
-the other members of the Storage Network Industry Association CIFS Technical
-Workgroup for their work specifying this highly complex protocol and finally
-thanks to the Samba team for their technical advice and encouragement.
-
-Patch Contributors
-------------------
-Zwane Mwaikambo
-Andi Kleen
-Amrut Joshi
-Shobhit Dayal
-Sergey Vlasov
-Richard Hughes
-Yury Umanets
-Mark Hamzy (for some of the early cifs IPv6 work)
-Domen Puncer
-Jesper Juhl (in particular for lots of whitespace/formatting cleanup)
-Vince Negri and Dave Stahl (for finding an important caching bug)
-Adrian Bunk (kcalloc cleanups)
-Miklos Szeredi 
-Kazeon team for various fixes especially for 2.4 version.
-Asser Ferno (Change Notify support)
-Shaggy (Dave Kleikamp) for innumerable small fs suggestions and some good cleanup
-Gunter Kukkukk (testing and suggestions for support of old servers)
-Igor Mammedov (DFS support)
-Jeff Layton (many, many fixes, as well as great work on the cifs Kerberos code)
-
-Test case and Bug Report contributors
--------------------------------------
-Thanks to those in the community who have submitted detailed bug reports
-and debug of problems they have found:  Jochen Dolze, David Blaine,
-Rene Scharfe, Martin Josefsson, Alexander Wild, Anthony Liguori,
-Lars Muller, Urban Widmark, Massimiliano Ferrero, Howard Owen,
-Olaf Kirch, Kieron Briggs, Nick Millington and others. Also special
-mention to the Stanford Checker (SWAT) which pointed out many minor
-bugs in error paths.  Valuable suggestions also have come from Al Viro
-and Dave Miller.
-
-And thanks to the IBM LTC and Power test teams and SuSE testers for
-finding multiple bugs during excellent stress test runs.
diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES
deleted file mode 100644
index bc0025cdd1c9..000000000000
--- a/fs/cifs/CHANGES
+++ /dev/null
@@ -1,1065 +0,0 @@
-Version 1.62
-------------
-Add sockopt=TCP_NODELAY mount option. EA (xattr) routines hardened
-to more strictly handle corrupt frames.
-
-Version 1.61
-------------
-Fix append problem to Samba servers (files opened with O_APPEND could
-have duplicated data). Fix oops in cifs_lookup. Workaround problem
-mounting to OS/400 Netserve. Fix oops in cifs_get_tcp_session.
-Disable use of server inode numbers when server only
-partially supports them (e.g. for one server querying inode numbers on
-FindFirst fails but QPathInfo queries works). Fix oops with dfs in 
-cifs_put_smb_ses. Fix mmap to work on directio mounts (needed
-for OpenOffice when on forcedirectio mount e.g.)
-
-Version 1.60
--------------
-Fix memory leak in reconnect.  Fix oops in DFS mount error path.
-Set s_maxbytes to smaller (the max that vfs can handle) so that
-sendfile will now work over cifs mounts again.  Add noforcegid
-and noforceuid mount parameters. Fix small mem leak when using
-ntlmv2. Fix 2nd mount to same server but with different port to
-be allowed (rather than reusing the 1st port) - only when the
-user explicitly overrides the port on the 2nd mount.
-
-Version 1.59
-------------
-Client uses server inode numbers (which are persistent) rather than
-client generated ones by default (mount option "serverino" turned
-on by default if server supports it).  Add forceuid and forcegid
-mount options (so that when negotiating unix extensions specifying
-which uid mounted does not immediately force the server's reported
-uids to be overridden).  Add support for scope mount parm. Improve
-hard link detection to use same inode for both.  Do not set
-read-only dos attribute on directories (for chmod) since Windows
-explorer special cases this attribute bit for directories for
-a different purpose.
-
-Version 1.58
-------------
-Guard against buffer overruns in various UCS-2 to UTF-8 string conversions
-when the UTF-8 string is composed of unusually long (more than 4 byte) converted
-characters. Add support for mounting root of a share which redirects immediately
-to DFS target. Convert string conversion functions from Unicode to more
-accurately mark string length before allocating memory (which may help the
-rare cases where a UTF-8 string is much larger than the UCS2 string that
-we converted from).  Fix endianness of the vcnum field used during
-session setup to distinguish multiple mounts to same server from different
-userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental
-flag to be set to 2, and mount must enable krb5 to turn on extended security).
-Performance of file create to Samba improved (posix create on lookup
-removes 1 of 2 network requests sent on file create)
- 
-Version 1.57
-------------
-Improve support for multiple security contexts to the same server. We
-used to use the same "vcnumber" for all connections which could cause
-the server to treat subsequent connections, especially those that
-are authenticated as guest, as reconnections, invalidating the earlier
-user's smb session.  This fix allows cifs to mount multiple times to the
-same server with different userids without risking invalidating earlier
-established security contexts.  fsync now sends SMB Flush operation
-to better ensure that we wait for server to write all of the data to
-server disk (not just write it over the network).  Add new mount
-parameter to allow user to disable sending the (slow) SMB flush on
-fsync if desired (fsync still flushes all cached write data to the server).
-Posix file open support added (turned off after one attempt if server
-fails to support it properly, as with Samba server versions prior to 3.3.2)
-Fix "redzone overwritten" bug in cifs_put_tcon (CIFSTcon may allocate too
-little memory for the "nativeFileSystem" field returned by the server
-during mount).  Endian convert inode numbers if necessary (makes it easier
-to compare inode numbers on network files from big endian systems). 
-
-Version 1.56
-------------
-Add "forcemandatorylock" mount option to allow user to use mandatory
-rather than posix (advisory) byte range locks, even though server would
-support posix byte range locks.  Fix query of root inode when prefixpath
-specified and user does not have access to query information about the
-top of the share.  Fix problem in 2.6.28 resolving DFS paths to
-Samba servers (worked to Windows).  Fix rmdir so that pending search
-(readdir) requests do not get invalid results which include the now
-removed directory.  Fix oops in cifs_dfs_ref.c when prefixpath is not reachable
-when using DFS.  Add better file create support to servers which support
-the CIFS POSIX protocol extensions (this adds support for new flags
-on create, and improves semantics for write of locked ranges).
-
-Version 1.55
-------------
-Various fixes to make delete of open files behavior more predictable
-(when delete of an open file fails we mark the file as "delete-on-close"
-in a way that more servers accept, but only if we can first rename the
-file to a temporary name).  Add experimental support for more safely
-handling fcntl(F_SETLEASE).  Convert cifs to using blocking tcp
-sends, and also let tcp autotune the socket send and receive buffers.
-This reduces the number of EAGAIN errors returned by TCP/IP in
-high stress workloads (and the number of retries on socket writes
-when sending large SMBWriteX requests).  Fix case in which a portion of
-data can in some cases not get written to the file on the server before the
-file is closed.  Fix DFS parsing to properly handle path consumed field,
-and to handle certain codepage conversions better.  Fix mount and
-umount race that can cause oops in mount or umount or reconnect.
-
-Version 1.54
-------------
-Fix premature write failure on congested networks (we would give up
-on EAGAIN from the socket too quickly on large writes).
-Cifs_mkdir and cifs_create now respect the setgid bit on parent dir.
-Fix endian problems in acl (mode from/to cifs acl) on bigendian
-architectures.  Fix problems with preserving timestamps on copying open
-files (e.g. "cp -a") to Windows servers.  For mkdir and create honor setgid bit
-on parent directory when server supports Unix Extensions but not POSIX
-create. Update cifs.upcall version to handle new Kerberos sec flags
-(this requires update of cifs.upcall program from Samba).  Fix memory leak
-on dns_upcall (resolving DFS referralls).  Fix plain text password
-authentication (requires setting SecurityFlags to 0x30030 to enable
-lanman and plain text though).  Fix writes to be at correct offset when
-file is open with O_APPEND and file is on a directio (forcediretio) mount.
-Fix bug in rewinding readdir directory searches.  Add nodfs mount option.
-
-Version 1.53
-------------
-DFS support added (Microsoft Distributed File System client support needed
-for referrals which enable a hierarchical name space among servers).
-Disable temporary caching of mode bits to servers which do not support
-storing of mode (e.g. Windows servers, when client mounts without cifsacl
-mount option) and add new "dynperm" mount option to enable temporary caching
-of mode (enable old behavior).  Fix hang on mount caused when server crashes
-tcp session during negotiate protocol.
-
-Version 1.52
-------------
-Fix oops on second mount to server when null auth is used.
-Enable experimental Kerberos support.  Return writebehind errors on flush
-and sync so that events like out of disk space get reported properly on
-cached files. Fix setxattr failure to certain Samba versions. Fix mount
-of second share to disconnected server session (autoreconnect on this).
-Add ability to modify cifs acls for handling chmod (when mounted with
-cifsacl flag). Fix prefixpath path separator so we can handle mounts
-with prefixpaths longer than one directory (one path component) when
-mounted to Windows servers.  Fix slow file open when cifsacl
-enabled. Fix memory leak in FindNext when the SMB call returns -EBADF.
-
-
-Version 1.51
-------------
-Fix memory leak in statfs when mounted to very old servers (e.g.
-Windows 9x).  Add new feature "POSIX open" which allows servers
-which support the current POSIX Extensions to provide better semantics
-(e.g. delete for open files opened with posix open).  Take into
-account umask on posix mkdir not just older style mkdir.  Add
-ability to mount to IPC$ share (which allows CIFS named pipes to be
-opened, read and written as if they were files).  When 1st tree
-connect fails (e.g. due to signing negotiation failure) fix
-leak that causes cifsd not to stop and rmmod to fail to cleanup
-cifs_request_buffers pool. Fix problem with POSIX Open/Mkdir on
-bigendian architectures. Fix possible memory corruption when
-EAGAIN returned on kern_recvmsg. Return better error if server
-requires packet signing but client has disabled it. When mounted
-with cifsacl mount option - mode bits are approximated based
-on the contents of the ACL of the file or directory. When cifs
-mount helper is missing convert make sure that UNC name 
-has backslash (not forward slash) between ip address of server
-and the share name.
-
-Version 1.50
-------------
-Fix NTLMv2 signing. NFS server mounted over cifs works (if cifs mount is
-done with "serverino" mount option).  Add support for POSIX Unlink
-(helps with certain sharing violation cases when server such as
-Samba supports newer POSIX CIFS Protocol Extensions). Add "nounix"
-mount option to allow disabling the CIFS Unix Extensions for just
-that mount. Fix hang on spinlock in find_writable_file (race when
-reopening file after session crash).  Byte range unlock request to
-windows server could unlock more bytes (on server copy of file)
-than intended if start of unlock request is well before start of
-a previous byte range lock that we issued.
-
-Version 1.49
-------------
-IPv6 support.  Enable ipv6 addresses to be passed on mount (put the ipv6
-address after the "ip=" mount option, at least until mount.cifs is fixed to
-handle DNS host to ipv6 name translation).  Accept override of uid or gid
-on mount even when Unix Extensions are negotiated (it used to be ignored
-when Unix Extensions were ignored).  This allows users to override the
-default uid and gid for files when they are certain that the uids or
-gids on the server do not match those of the client.  Make "sec=none"
-mount override username (so that null user connection is attempted)
-to match what documentation said. Support for very large reads, over 127K,
-available to some newer servers (such as Samba 3.0.26 and later but
-note that it also requires setting CIFSMaxBufSize at module install
-time to a larger value which may hurt performance in some cases).
-Make sign option force signing (or fail if server does not support it).
-
-Version 1.48
-------------
-Fix mtime bouncing around from local idea of last write times to remote time.
-Fix hang (in i_size_read) when simultaneous size update of same remote file
-on smp system corrupts sequence number. Do not reread unnecessarily partial page
-(which we are about to overwrite anyway) when writing out file opened rw.
-When DOS attribute of file on non-Unix server's file changes on the server side
-from read-only back to read-write, reflect this change in default file mode
-(we had been leaving a file's mode read-only until the inode were reloaded).
-Allow setting of attribute back to ATTR_NORMAL (removing readonly dos attribute
-when archive dos attribute not set and we are changing mode back to writeable
-on server which does not support the Unix Extensions).  Remove read only dos
-attribute on chmod when adding any write permission (ie on any of
-user/group/other (not all of user/group/other ie  0222) when
-mounted to windows.  Add support for POSIX MkDir (slight performance
-enhancement and eliminates the network race between the mkdir and set 
-path info of the mode).
-
-
-Version 1.47
-------------
-Fix oops in list_del during mount caused by unaligned string.
-Fix file corruption which could occur on some large file
-copies caused by writepages page i/o completion bug.
-Seek to SEEK_END forces check for update of file size for non-cached
-files. Allow file size to be updated on remote extend of locally open,
-non-cached file.  Fix reconnect to newer Samba servers (or other servers
-which support the CIFS Unix/POSIX extensions) so that we again tell the
-server the Unix/POSIX cifs capabilities which we support (SetFSInfo).
-Add experimental support for new POSIX Open/Mkdir (which returns
-stat information on the open, and allows setting the mode).
-
-Version 1.46
-------------
-Support deep tree mounts.  Better support OS/2, Win9x (DOS) time stamps.
-Allow null user to be specified on mount ("username="). Do not return
-EINVAL on readdir when filldir fails due to overwritten blocksize
-(fixes FC problem).  Return error in rename 2nd attempt retry (ie report
-if rename by handle also fails, after rename by path fails, we were
-not reporting whether the retry worked or not). Fix NTLMv2 to
-work to Windows servers (mount with option "sec=ntlmv2").
-
-Version 1.45
-------------
-Do not time out lockw calls when using posix extensions. Do not
-time out requests if server still responding reasonably fast
-on requests on other threads.  Improve POSIX locking emulation,
-(lock cancel now works, and unlock of merged range works even
-to Windows servers now).  Fix oops on mount to lanman servers
-(win9x, os/2 etc.) when null password.  Do not send listxattr
-(SMB to query all EAs) if nouser_xattr specified.  Fix SE Linux
-problem (instantiate inodes/dentries in right order for readdir).
-
-Version 1.44
-------------
-Rewritten sessionsetup support, including support for legacy SMB
-session setup needed for OS/2 and older servers such as Windows 95 and 98.
-Fix oops on ls to OS/2 servers.  Add support for level 1 FindFirst
-so we can do search (ls etc.) to OS/2.  Do not send NTCreateX
-or recent levels of FindFirst unless server says it supports NT SMBs
-(instead use legacy equivalents from LANMAN dialect). Fix to allow
-NTLMv2 authentication support (now can use stronger password hashing
-on mount if corresponding /proc/fs/cifs/SecurityFlags is set (0x4004).
-Allow override of global cifs security flags on mount via "sec=" option(s).
-
-Version 1.43
-------------
-POSIX locking to servers which support CIFS POSIX Extensions
-(disabled by default controlled by proc/fs/cifs/Experimental).
-Handle conversion of long share names (especially Asian languages)
-to Unicode during mount. Fix memory leak in sess struct on reconnect.
-Fix rare oops after acpi suspend.  Fix O_TRUNC opens to overwrite on
-cifs open which helps rare case when setpathinfo fails or server does
-not support it. 
-
-Version 1.42
-------------
-Fix slow oplock break when mounted to different servers at the same time and
-the tids match and we try to find matching fid on wrong server. Fix read
-looping when signing required by server (2.6.16 kernel only). Fix readdir
-vs. rename race which could cause each to hang. Return . and .. even
-if server does not.  Allow searches to skip first three entries and
-begin at any location. Fix oops in find_writeable_file.
-
-Version 1.41
-------------
-Fix NTLMv2 security (can be enabled in /proc/fs/cifs) so customers can
-configure stronger authentication.  Fix sfu symlinks so they can
-be followed (not just recognized).  Fix wraparound of bcc on
-read responses when buffer size over 64K and also fix wrap of
-max smb buffer size when CIFSMaxBufSize over 64K.  Fix oops in
-cifs_user_read and cifs_readpages (when EAGAIN on send of smb
-on socket is returned over and over).  Add POSIX (advisory) byte range
-locking support (requires server with newest CIFS UNIX Extensions
-to the protocol implemented). Slow down negprot slightly in port 139
-RFC1001 case to give session_init time on buggy servers.
-
-Version 1.40
-------------
-Use fsuid (fsgid) more consistently instead of uid (gid). Improve performance
-of readpages by eliminating one extra memcpy. Allow update of file size
-from remote server even if file is open for write as long as mount is
-directio.  Recognize share mode security and send NTLM encrypted password
-on tree connect if share mode negotiated.
-
-Version 1.39
-------------
-Defer close of a file handle slightly if pending writes depend on that handle
-(this reduces the EBADF bad file handle errors that can be logged under heavy
-stress on writes). Modify cifs Kconfig options to expose CONFIG_CIFS_STATS2 
-Fix SFU style symlinks and mknod needed for servers which do not support the
-CIFS Unix Extensions.  Fix setfacl/getfacl on bigendian. Timeout negative
-dentries so files that the client sees as deleted but that later get created
-on the server will be recognized.  Add client side permission check on setattr.
-Timeout stuck requests better (where server has never responded or sent corrupt
-responses)
-
-Version 1.38
-------------
-Fix tcp socket retransmission timeouts (e.g. on ENOSPACE from the socket)
-to be smaller at first (but increasing) so large write performance performance
-over GigE is better.  Do not hang thread on illegal byte range lock response
-from Windows (Windows can send an RFC1001 size which does not match smb size) by
-allowing an SMBs TCP length to be up to a few bytes longer than it should be.
-wsize and rsize can now be larger than negotiated buffer size if server
-supports large readx/writex, even when directio mount flag not specified.
-Write size will in many cases now be 16K instead of 4K which greatly helps
-file copy performance on lightly loaded networks.  Fix oops in dnotify
-when experimental config flag enabled. Make cifsFYI more granular.
-
-Version 1.37
-------------
-Fix readdir caching when unlink removes file in current search buffer,
-and this is followed by a rewind search to just before the deleted entry.
-Do not attempt to set ctime unless atime and/or mtime change requested
-(most servers throw it away anyway). Fix length check of received smbs
-to be more accurate. Fix big endian problem with mapchars mount option,
-and with a field returned by statfs.
-
-Version 1.36
-------------
-Add support for mounting to older pre-CIFS servers such as Windows9x and ME.
-For these older servers, add option for passing netbios name of server in
-on mount (servernetbiosname).  Add suspend support for power management, to
-avoid cifsd thread preventing software suspend from working.
-Add mount option for disabling the default behavior of sending byte range lock
-requests to the server (necessary for certain applications which break with
-mandatory lock behavior such as Evolution), and also mount option for
-requesting case insensitive matching for path based requests (requesting
-case sensitive is the default).
-
-Version 1.35
-------------
-Add writepage performance improvements.  Fix path name conversions
-for long filenames on mounts which were done with "mapchars" mount option
-specified.  Ensure multiplex ids do not collide.  Fix case in which 
-rmmod can oops if done soon after last unmount.  Fix truncated
-search (readdir) output when resume filename was a long filename.
-Fix filename conversion when mapchars mount option was specified and
-filename was a long filename.
-
-Version 1.34
-------------
-Fix error mapping of the TOO_MANY_LINKS (hardlinks) case.
-Do not oops if root user kills cifs oplock kernel thread or
-kills the cifsd thread (NB: killing the cifs kernel threads is not
-recommended, unmount and rmmod cifs will kill them when they are
-no longer needed).  Fix readdir to ASCII servers (ie older servers
-which do not support Unicode) and also require asterisk.
-Fix out of memory case in which data could be written one page
-off in the page cache.
-
-Version 1.33
-------------
-Fix caching problem, in which readdir of directory containing a file
-which was cached could cause the file's time stamp to be updated
-without invalidating the readahead data (so we could get stale
-file data on the client for that file even as the server copy changed).
-Cleanup response processing so cifsd can not loop when abnormally
-terminated.
-
-
-Version 1.32
-------------
-Fix oops in ls when Transact2 FindFirst (or FindNext) returns more than one
-transact response for an SMB request and search entry split across two frames.
-Add support for lsattr (getting ext2/ext3/reiserfs attr flags from the server)
-as new protocol extensions. Do not send Get/Set calls for POSIX ACLs
-unless server explicitly claims to support them in CIFS Unix extensions
-POSIX ACL capability bit. Fix packet signing when multiuser mounting with
-different users from the same client to the same server. Fix oops in
-cifs_close. Add mount option for remapping reserved characters in
-filenames (also allow recognizing files with created by SFU which have any
-of these seven reserved characters, except backslash, to be recognized).
-Fix invalid transact2 message (we were sometimes trying to interpret
-oplock breaks as SMB responses). Add ioctl for checking that the
-current uid matches the uid of the mounter (needed by umount.cifs).
-Reduce the number of large buffer allocations in cifs response processing
-(significantly reduces memory pressure under heavy stress with multiple
-processes accessing the same server at the same time).
-
-Version 1.31
-------------
-Fix updates of DOS attributes and time fields so that files on NT4 servers
-do not get marked delete on close. Display sizes of cifs buffer pools in
-cifs stats. Fix oops in unmount when cifsd thread being killed by 
-shutdown. Add generic readv/writev and aio support. Report inode numbers 
-consistently in readdir and lookup (when serverino mount option is
-specified use the inode number that the server reports - for both lookup
-and readdir, otherwise by default the locally generated inode number is used
-for inodes created in either path since servers are not always able to 
-provide unique inode numbers when exporting multiple volumes from under one
-sharename).
-
-Version 1.30
-------------
-Allow new nouser_xattr mount parm to disable xattr support for user namespace.
-Do not flag user_xattr mount parm in dmesg.  Retry failures setting file time  
-(mostly affects NT4 servers) by retry with handle based network operation. 
-Add new POSIX Query FS Info for returning statfs info more accurately.
-Handle passwords with multiple commas in them.
-
-Version 1.29
-------------
-Fix default mode in sysfs of cifs module parms.  Remove old readdir routine.
-Fix capabilities flags for large readx so as to allow reads larger than 64K.
-
-Version 1.28
-------------
-Add module init parm for large SMB buffer size (to allow it to be changed
-from its default of 16K) which is especially useful for large file copy
-when mounting with the directio mount option. Fix oops after 
-returning from mount when experimental ExtendedSecurity enabled and
-SpnegoNegotiated returning invalid error. Fix case to retry better when 
-peek returns from 1 to 3 bytes on socket which should have more data.
-Fixed path based calls (such as cifs lookup) to handle path names
-longer than 530 (now can handle PATH_MAX). Fix pass through authentication
-from Samba server to DC (Samba required dummy LM password).
-
-Version 1.27
-------------
-Turn off DNOTIFY (directory change notification support) by default
-(unless built with the experimental flag) to fix hang with KDE
-file browser. Fix DNOTIFY flag mappings.  Fix hang (in wait_event
-waiting on an SMB response) in SendReceive when session dies but
-reconnects quickly from another task.  Add module init  parms for
-minimum number of large and small network buffers in the buffer pools,
-and for the maximum number of simultaneous requests.
-
-Version 1.26
-------------
-Add setfacl support to allow setting of ACLs remotely to Samba 3.10 and later
-and other POSIX CIFS compliant servers.  Fix error mapping for getfacl 
-to EOPNOTSUPP when server does not support posix acls on the wire. Fix 
-improperly zeroed buffer in CIFS Unix extensions set times call. 
-
-Version 1.25
-------------
-Fix internationalization problem in cifs readdir with filenames that map to 
-longer UTF-8 strings than the string on the wire was in Unicode.  Add workaround
-for readdir to netapp servers. Fix search rewind (seek into readdir to return 
-non-consecutive entries).  Do not do readdir when server negotiates 
-buffer size to small to fit filename. Add support for reading POSIX ACLs from
-the server (add also acl and noacl mount options).
-
-Version 1.24
-------------
-Optionally allow using server side inode numbers, rather than client generated
-ones by specifying mount option "serverino" - this is required for some apps
-to work which double check hardlinked files and have persistent inode numbers.
-
-Version 1.23
-------------
-Multiple bigendian fixes. On little endian systems (for reconnect after
-network failure) fix tcp session reconnect code so we do not try first
-to reconnect on reverse of port 445. Treat reparse points (NTFS junctions)
-as directories rather than symlinks because we can do follow link on them.
-
-Version 1.22
-------------
-Add config option to enable XATTR (extended attribute) support, mapping
-xattr names in the "user." namespace space to SMB/CIFS EAs. Lots of
-minor fixes pointed out by the Stanford SWAT checker (mostly missing
-or out of order NULL pointer checks in little used error paths).
-
-Version 1.21
-------------
-Add new mount parm to control whether mode check (generic_permission) is done
-on the client.  If Unix extensions are enabled and the uids on the client
-and server do not match, client permission checks are meaningless on
-server uids that do not exist on the client (this does not affect the
-normal ACL check which occurs on the server).  Fix default uid
-on mknod to match create and mkdir. Add optional mount parm to allow
-override of the default uid behavior (in which the server sets the uid
-and gid of newly created files). Normally for network filesystem mounts
-user want the server to set the uid/gid on newly created files (rather than 
-using uid of the client processes you would in a local filesystem).
-
-Version 1.20
-------------
-Make transaction counts more consistent. Merge /proc/fs/cifs/SimultaneousOps
-info into /proc/fs/cifs/DebugData.  Fix oops in rare oops in readdir 
-(in build_wildcard_path_from_dentry).  Fix mknod to pass type field
-(block/char/fifo) properly.  Remove spurious mount warning log entry when
-credentials passed as mount argument. Set major/minor device number in
-inode for block and char devices when unix extensions enabled.
-
-Version 1.19
-------------
-Fix /proc/fs/cifs/Stats and DebugData display to handle larger
-amounts of return data. Properly limit requests to MAX_REQ (50
-is the usual maximum active multiplex SMB/CIFS requests per server).
-Do not kill cifsd (and thus hurt the other SMB session) when more than one
-session to the same server (but with different userids) exists and one
-of the two user's smb sessions is being removed while leaving the other.
-Do not loop reconnecting in cifsd demultiplex thread when admin
-kills the thread without going through unmount.
-
-Version 1.18
-------------
-Do not rename hardlinked files (since that should be a noop). Flush
-cached write behind data when reopening a file after session abend,
-except when already in write. Grab per socket sem during reconnect 
-to avoid oops in sendmsg if overlapping with reconnect. Do not
-reset cached inode file size on readdir for files open for write on 
-client.
-
-
-Version 1.17
-------------
-Update number of blocks in file so du command is happier (in Linux a fake
-blocksize of 512 is required for calculating number of blocks in inode).
-Fix prepare write of partial pages to read in data from server if possible.
-Fix race on tcpStatus field between unmount and reconnection code, causing
-cifsd process sometimes to hang around forever. Improve out of memory
-checks in cifs_filldir
-
-Version 1.16
-------------
-Fix incorrect file size in file handle based setattr on big endian hardware.
-Fix oops in build_path_from_dentry when out of memory.  Add checks for invalid
-and closing file structs in writepage/partialpagewrite.  Add statistics
-for each mounted share (new menuconfig option). Fix endianness problem in
-volume information displayed in /proc/fs/cifs/DebugData (only affects
-affects big endian architectures). Prevent renames while constructing
-path names for open, mkdir and rmdir.
-
-Version 1.15
-------------
-Change to mempools for alloc smb request buffers and multiplex structs
-to better handle low memory problems (and potential deadlocks).
-
-Version 1.14
-------------
-Fix incomplete listings of large directories on Samba servers when Unix
-extensions enabled.  Fix oops when smb_buffer can not be allocated. Fix
-rename deadlock when writing out dirty pages at same time.
-
-Version 1.13
-------------
-Fix open of files in which O_CREATE can cause the mode to change in
-some cases. Fix case in which retry of write overlaps file close.
-Fix PPC64 build error.  Reduce excessive stack usage in smb password
-hashing. Fix overwrite of Linux user's view of file mode to Windows servers.
-
-Version 1.12
-------------
-Fixes for large file copy, signal handling, socket retry, buffer
-allocation and low memory situations.
-
-Version 1.11
-------------
-Better port 139 support to Windows servers (RFC1001/RFC1002 Session_Initialize)
-also now allowing support for specifying client netbiosname.  NT4 support added.
-
-Version 1.10
-------------
-Fix reconnection (and certain failed mounts) to properly wake up the
-blocked users thread so it does not seem hung (in some cases was blocked
-until the cifs receive timeout expired). Fix spurious error logging
-to kernel log when application with open network files killed. 
-
-Version 1.09
-------------
-Fix /proc/fs module unload warning message (that could be logged
-to the kernel log). Fix intermittent failure in connectathon
-test7 (hardlink count not immediately refreshed in case in which
-inode metadata can be incorrectly kept cached when time near zero)
-
-Version 1.08
-------------
-Allow file_mode and dir_mode (specified at mount time) to be enforced
-locally (the server already enforced its own ACLs too) for servers
-that do not report the correct mode (do not support the 
-CIFS Unix Extensions).
-
-Version 1.07
-------------
-Fix some small memory leaks in some unmount error paths. Fix major leak
-of cache pages in readpages causing multiple read oriented stress
-testcases (including fsx, and even large file copy) to fail over time. 
-
-Version 1.06
-------------
-Send NTCreateX with ATTR_POSIX if Linux/Unix extensions negotiated with server.
-This allows files that differ only in case and improves performance of file
-creation and file open to such servers.  Fix semaphore conflict which causes 
-slow delete of open file to Samba (which unfortunately can cause an oplock
-break to self while vfs_unlink held i_sem) which can hang for 20 seconds.
-
-Version 1.05
-------------
-fixes to cifs_readpages for fsx test case
-
-Version 1.04
-------------
-Fix caching data integrity bug when extending file size especially when no
-oplock on file.  Fix spurious logging of valid already parsed mount options
-that are parsed outside of the cifs vfs such as nosuid.
-
-
-Version 1.03
-------------
-Connect to server when port number override not specified, and tcp port
-unitialized.  Reset search to restart at correct file when kernel routine
-filldir returns error during large directory searches (readdir). 
-
-Version 1.02
-------------
-Fix caching problem when files opened by multiple clients in which 
-page cache could contain stale data, and write through did
-not occur often enough while file was still open when read ahead
-(read oplock) not allowed.  Treat "sep=" when first mount option
-as an override of comma as the default separator between mount
-options. 
-
-Version 1.01
-------------
-Allow passwords longer than 16 bytes. Allow null password string.
-
-Version 1.00
-------------
-Gracefully clean up failed mounts when attempting to mount to servers such as
-Windows 98 that terminate tcp sessions during protocol negotiation.  Handle
-embedded commas in mount parsing of passwords.
-
-Version 0.99
-------------
-Invalidate local inode cached pages on oplock break and when last file
-instance is closed so that the client does not continue using stale local
-copy rather than later modified server copy of file.  Do not reconnect
-when server drops the tcp session prematurely before negotiate
-protocol response.  Fix oops in reopen_file when dentry freed.  Allow
-the support for CIFS Unix Extensions to be disabled via proc interface.
-
-Version 0.98
-------------
-Fix hang in commit_write during reconnection of open files under heavy load.
-Fix unload_nls oops in a mount failure path. Serialize writes to same socket
-which also fixes any possible races when cifs signatures are enabled in SMBs
-being sent out of signature sequence number order.    
-
-Version 0.97
-------------
-Fix byte range locking bug (endian problem) causing bad offset and
-length.
-
-Version 0.96
-------------
-Fix oops (in send_sig) caused by CIFS unmount code trying to
-wake up the demultiplex thread after it had exited. Do not log
-error on harmless oplock release of closed handle.
-
-Version 0.95
-------------
-Fix unsafe global variable usage and password hash failure on gcc 3.3.1
-Fix problem reconnecting secondary mounts to same server after session 
-failure.  Fix invalid dentry - race in mkdir when directory gets created
-by another client between the lookup and mkdir.
- 
-Version 0.94
-------------
-Fix to list processing in reopen_files. Fix reconnection when server hung
-but tcpip session still alive.  Set proper timeout on socket read.
-
-Version 0.93
-------------
-Add missing mount options including iocharset.  SMP fixes in write and open. 
-Fix errors in reconnecting after TCP session failure.  Fix module unloading
-of default nls codepage
-
-Version 0.92
-------------
-Active smb transactions should never go negative (fix double FreeXid). Fix
-list processing in file routines. Check return code on kmalloc in open.
-Fix spinlock usage for SMP.
-
-Version 0.91
-------------
-Fix oops in reopen_files when invalid dentry. drop dentry on server rename 
-and on revalidate errors. Fix cases where pid is now tgid.  Fix return code
-on create hard link when server does not support them. 
-
-Version 0.90
-------------
-Fix scheduling while atomic error in getting inode info on newly created file. 
-Fix truncate of existing files opened with O_CREAT but not O_TRUNC set.
-
-Version 0.89
-------------
-Fix oops on write to dead tcp session. Remove error log write for case when file open
-O_CREAT but not O_EXCL
-
-Version 0.88
-------------
-Fix non-POSIX behavior on rename of open file and delete of open file by taking 
-advantage of trans2 SetFileInfo rename facility if available on target server.
-Retry on ENOSPC and EAGAIN socket errors.
-
-Version 0.87
-------------
-Fix oops on big endian readdir.  Set blksize to be even power of two (2**blkbits) to fix
-allocation size miscalculation. After oplock token lost do not read through
-cache. 
-
-Version 0.86
-------------
-Fix oops on empty file readahead.  Fix for file size handling for locally cached files.
-
-Version 0.85
-------------
-Fix oops in mkdir when server fails to return inode info. Fix oops in reopen_files
-during auto reconnection to server after server recovered from failure.
-
-Version 0.84
-------------
-Finish support for Linux 2.5 open/create changes, which removes the
-redundant NTCreate/QPathInfo/close that was sent during file create.
-Enable oplock by default. Enable packet signing by default (needed to 
-access many recent Windows servers)
-
-Version 0.83
-------------
-Fix oops when mounting to long server names caused by inverted parms to kmalloc.
-Fix MultiuserMount (/proc/fs/cifs configuration setting) so that when enabled
-we will choose a cifs user session (smb uid) that better matches the local
-uid if a) the mount uid does not match the current uid and b) we have another
-session to the same server (ip address) for a different mount which
-matches the current local uid.
-
-Version 0.82
-------------
-Add support for mknod of block or character devices.  Fix oplock
-code (distributed caching) to properly send response to oplock
-break from server.
-
-Version 0.81
-------------
-Finish up CIFS packet digital signing for the default
-NTLM security case. This should help Windows 2003
-network interoperability since it is common for
-packet signing to be required now. Fix statfs (stat -f)
-which recently started returning errors due to 
-invalid value (-1 instead of 0) being set in the
-struct kstatfs f_ffiles field.
-
-Version 0.80
------------
-Fix oops on stopping oplock thread when removing cifs when
-built as module.
-
-Version 0.79
-------------
-Fix mount options for ro (readonly), uid, gid and file and directory mode. 
-
-Version 0.78
-------------
-Fix errors displayed on failed mounts to be more understandable.
-Fixed various incorrect or misleading smb to posix error code mappings.
-
-Version 0.77
-------------
-Fix display of NTFS DFS junctions to display as symlinks.
-They are the network equivalent.  Fix oops in 
-cifs_partialpagewrite caused by missing spinlock protection
-of openfile linked list.  Allow writebehind caching errors to 
-be returned to the application at file close.
-
-Version 0.76
-------------
-Clean up options displayed in /proc/mounts by show_options to
-be more consistent with other filesystems.
-
-Version 0.75
-------------
-Fix delete of readonly file to Windows servers.  Reflect
-presence or absence of read only dos attribute in mode
-bits for servers that do not support CIFS Unix extensions.
-Fix shortened results on readdir of large directories to
-servers supporting CIFS Unix extensions (caused by
-incorrect resume key).
-
-Version 0.74
-------------
-Fix truncate bug (set file size) that could cause hangs e.g. running fsx
-
-Version 0.73
-------------
-unload nls if mount fails.
-
-Version 0.72
-------------
-Add resume key support to search (readdir) code to workaround
-Windows bug.  Add /proc/fs/cifs/LookupCacheEnable which
-allows disabling caching of attribute information for
-lookups.
-
-Version 0.71
-------------
-Add more oplock handling (distributed caching code).  Remove
-dead code.  Remove excessive stack space utilization from
-symlink routines.
-
-Version 0.70
-------------
-Fix oops in get dfs referral (triggered when null path sent in to
-mount).  Add support for overriding rsize at mount time.
-
-Version 0.69
-------------
-Fix buffer overrun in readdir which caused intermittent kernel oopses.
-Fix writepage code to release kmap on write data.  Allow "-ip=" new 
-mount option to be passed in on parameter distinct from the first part
-(server name portion of) the UNC name.  Allow override of the
-tcp port of the target server via new mount option "-port="  
-
-Version 0.68
-------------
-Fix search handle leak on rewind.  Fix setuid and gid so that they are 
-reflected in the local inode immediately.  Cleanup of whitespace
-to make 2.4 and 2.5 versions more consistent.
-
-
-Version 0.67
-------------
-Fix signal sending so that captive thread (cifsd) exits on umount 
-(which was causing the warning in kmem_cache_free of the request buffers
-at rmmod time).  This had broken as a sideeffect of the recent global
-kernel change to daemonize.  Fix memory leak in readdir code which
-showed up in "ls -R" (and applications that did search rewinding).
-
-Version 0.66
-------------
-Reconnect tids and fids after session reconnection (still do not
-reconnect byte range locks though).  Fix problem caching
-lookup information for directory inodes, improving performance,
-especially in deep directory trees.  Fix various build warnings.
-
-Version 0.65
-------------
-Finish fixes to commit write for caching/readahead consistency.  fsx 
-now works to Samba servers.  Fix oops caused when readahead
-was interrupted by a signal.
-
-Version 0.64
-------------
-Fix data corruption (in partial page after truncate) that caused fsx to
-fail to Windows servers.  Cleaned up some extraneous error logging in
-common error paths.  Add generic sendfile support.
-
-Version 0.63
-------------
-Fix memory leak in AllocMidQEntry.
-Finish reconnection logic, so connection with server can be dropped
-(or server rebooted) and the cifs client will reconnect.  
-
-Version 0.62
-------------
-Fix temporary socket leak when bad userid or password specified 
-(or other SMBSessSetup failure).  Increase maximum buffer size to slightly
-over 16K to allow negotiation of up to Samba and Windows server default read 
-sizes.  Add support for readpages
-
-Version 0.61
-------------
-Fix oops when username not passed in on mount.  Extensive fixes and improvements
-to error logging (strip redundant newlines, change debug macros to ensure newline
-passed in and to be more consistent).  Fix writepage wrong file handle problem,
-a readonly file handle could be incorrectly used to attempt to write out
-file updates through the page cache to multiply open files.  This could cause
-the iozone benchmark to fail on the fwrite test. Fix bug mounting two different
-shares to the same Windows server when using different usernames
-(doing this to Samba servers worked but Windows was rejecting it) - now it is
-possible to use different userids when connecting to the same server from a
-Linux client. Fix oops when treeDisconnect called during unmount on
-previously freed socket.
-
-Version 0.60
-------------
-Fix oops in readpages caused by not setting address space operations in inode in 
-rare code path. 
-
-Version 0.59
-------------
-Includes support for deleting of open files and renaming over existing files (per POSIX
-requirement).  Add readlink support for Windows junction points (directory symlinks).
-
-Version 0.58
-------------
-Changed read and write to go through pagecache. Added additional address space operations.
-Memory mapped operations now working.
-
-Version 0.57
-------------
-Added writepage code for additional memory mapping support.  Fixed leak in xids causing
-the simultaneous operations counter (/proc/fs/cifs/SimultaneousOps) to increase on 
-every stat call.  Additional formatting cleanup. 
-
-Version 0.56
-------------
-Fix bigendian bug in order of time conversion. Merge 2.5 to 2.4 version.  Formatting cleanup.   
-
-Version 0.55
-------------
-Fixes from Zwane Mwaikambo for adding missing return code checking in a few places.
-Also included a modified version of his fix to protect global list manipulation of
-the smb session and tree connection and mid related global variables.
-
-Version 0.54
-------------
-Fix problem with captive thread hanging around at unmount time.  Adjust to 2.5.42-pre
-changes to superblock layout.   Remove wasteful allocation of smb buffers (now the send 
-buffer is reused for responses).  Add more oplock handling. Additional minor cleanup.
-
-Version 0.53
-------------
-More stylistic updates to better match kernel style.  Add additional statistics
-for filesystem which can be viewed via /proc/fs/cifs.  Add more pieces of NTLMv2
-and CIFS Packet Signing enablement.
-
-Version 0.52
-------------
-Replace call to sleep_on with safer wait_on_event.
-Make stylistic changes to better match kernel style recommendations.
-Remove most typedef usage (except for the PDUs themselves).
-
-Version 0.51
-------------
-Update mount so the -unc mount option is no longer required (the ip address can be specified
-in a UNC style device name.   Implementation of readpage/writepage started.
-
-Version 0.50
-------------
-Fix intermittent problem with incorrect smb header checking on badly 
-fragmented tcp responses
-
-Version 0.49
-------------
-Fixes to setting of allocation size and file size.
-
-Version 0.48
-------------
-Various 2.5.38 fixes.  Now works on 2.5.38
-
-Version 0.47
-------------
-Prepare for 2.5 kernel merge.  Remove ifdefs.
-
-Version 0.46
-------------
-Socket buffer management fixes.  Fix dual free.
-
-Version 0.45
-------------
-Various big endian fixes for hardlinks and symlinks and also for dfs.
-
-Version 0.44
-------------
-Various big endian fixes for servers with Unix extensions such as Samba
-
-Version 0.43
-------------
-Various FindNext fixes for incorrect filenames on large directory searches on big endian
-clients.  basic posix file i/o tests now work on big endian machines, not just le
-
-Version 0.42
-------------
-SessionSetup and NegotiateProtocol now work from Big Endian machines.
-Various Big Endian fixes found during testing on the Linux on 390.  Various fixes for compatibility with older
-versions of 2.4 kernel (now builds and works again on kernels at least as early as 2.4.7).
-
-Version 0.41
-------------
-Various minor fixes for Connectathon Posix "basic" file i/o test suite.  Directory caching fixed so hardlinked
-files now return the correct number of links on fstat as they are repeatedly linked and unlinked.
-
-Version 0.40
-------------
-Implemented "Raw" (i.e. not encapsulated in SPNEGO) NTLMSSP (i.e. the Security Provider Interface used to negotiate
-session advanced session authentication).  Raw NTLMSSP is preferred by Windows 2000 Professional and Windows XP.
-Began implementing support for SPNEGO encapsulation of NTLMSSP based session authentication blobs
-(which is the mechanism preferred by Windows 2000 server in the absence of Kerberos).
-
-Version 0.38
-------------
-Introduced optional mount helper utility mount.cifs and made coreq changes to cifs vfs to enable
-it. Fixed a few bugs in the DFS code (e.g. bcc two bytes too short and incorrect uid in PDU).
-
-Version 0.37
-------------
-Rewrote much of connection and mount/unmount logic to handle bugs with
-multiple uses to same share, multiple users to same server etc.
-
-Version 0.36
-------------
-Fixed major problem with dentry corruption (missing call to dput)
-
-Version 0.35
-------------
-Rewrite of readdir code to fix bug. Various fixes for bigendian machines.
-Begin adding oplock support.  Multiusermount and oplockEnabled flags added to /proc/fs/cifs
-although corresponding function not fully implemented in the vfs yet
-
-Version 0.34
-------------
-Fixed dentry caching bug, misc. cleanup 
-
-Version 0.33
-------------
-Fixed 2.5 support to handle build and configure changes as well as misc. 2.5 changes.  Now can build
-on current 2.5 beta version (2.5.24) of the Linux kernel as well as on 2.4 Linux kernels.
-Support for STATUS codes (newer 32 bit NT error codes) added.  DFS support begun to be added.
-
-Version 0.32
-------------
-Unix extensions (symlink, readlink, hardlink, chmod and some chgrp and chown) implemented
-and tested against Samba 2.2.5
-
-
-Version 0.31
-------------
-1) Fixed lockrange to be correct (it was one byte too short)
-
-2) Fixed GETLK (i.e. the fcntl call to test a range of bytes in a file to see if locked) to correctly 
-show range as locked when there is a conflict with an existing lock.
-
-3) default file perms are now 2767 (indicating support for mandatory locks) instead of 777 for directories
-in most cases.  Eventually will offer optional ability to query server for the correct perms.
-
-3) Fixed eventual trap when mounting twice to different shares on the same server when the first succeeded 
-but the second one was invalid and failed (the second one was incorrectly disconnecting the tcp and smb
-session) 
-
-4) Fixed error logging of valid mount options
-
-5) Removed logging of password field.
-
-6) Moved negotiate, treeDisconnect and uloggoffX (only tConx and SessSetup remain in connect.c) to cifssmb.c
-and cleaned them up and made them more consistent with other cifs functions. 
-
-7) Server support for Unix extensions is now fully detected and FindFirst is implemented both ways 
-(with or without Unix extensions) but FindNext and QueryPathInfo with the Unix extensions are not completed,
-nor is the symlink support using the Unix extensions
-
-8) Started adding the readlink and follow_link code 
-
-Version 0.3 
------------
-Initial drop
-
diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile
index aa0d68b086eb..1964d212ab08 100644
--- a/fs/cifs/Makefile
+++ b/fs/cifs/Makefile
@@ -6,7 +6,7 @@ obj-$(CONFIG_CIFS) += cifs.o
 cifs-y := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o \
 	  link.o misc.o netmisc.o smbencrypt.o transport.o asn1.o \
 	  cifs_unicode.o nterr.o xattr.o cifsencrypt.o \
-	  readdir.o ioctl.o sess.o export.o smb1ops.o
+	  readdir.o ioctl.o sess.o export.o smb1ops.o winucase.o
 
 cifs-$(CONFIG_CIFS_ACL) += cifsacl.o
 
diff --git a/fs/cifs/README b/fs/cifs/README
deleted file mode 100644
index 2d5622f60e11..000000000000
--- a/fs/cifs/README
+++ /dev/null
@@ -1,753 +0,0 @@
-The CIFS VFS support for Linux supports many advanced network filesystem 
-features such as hierarchical dfs like namespace, hardlinks, locking and more.  
-It was designed to comply with the SNIA CIFS Technical Reference (which 
-supersedes the 1992 X/Open SMB Standard) as well as to perform best practice 
-practical interoperability with Windows 2000, Windows XP, Samba and equivalent 
-servers.  This code was developed in participation with the Protocol Freedom
-Information Foundation.
-
-Please see
-  http://protocolfreedom.org/ and
-  http://samba.org/samba/PFIF/
-for more details.
-
-
-For questions or bug reports please contact:
-    sfrench@samba.org (sfrench@us.ibm.com) 
-
-Build instructions:
-==================
-For Linux 2.4:
-1) Get the kernel source (e.g.from http://www.kernel.org)
-and download the cifs vfs source (see the project page
-at http://us1.samba.org/samba/Linux_CIFS_client.html)
-and change directory into the top of the kernel directory
-then patch the kernel (e.g. "patch -p1 < cifs_24.patch") 
-to add the cifs vfs to your kernel configure options if
-it has not already been added (e.g. current SuSE and UL
-users do not need to apply the cifs_24.patch since the cifs vfs is
-already in the kernel configure menu) and then
-mkdir linux/fs/cifs and then copy the current cifs vfs files from
-the cifs download to your kernel build directory e.g.
-
-	cp <cifs_download_dir>/fs/cifs/* to <kernel_download_dir>/fs/cifs
-	
-2) make menuconfig (or make xconfig)
-3) select cifs from within the network filesystem choices
-4) save and exit
-5) make dep
-6) make modules (or "make" if CIFS VFS not to be built as a module)
-
-For Linux 2.6:
-1) Download the kernel (e.g. from http://www.kernel.org)
-and change directory into the top of the kernel directory tree
-(e.g. /usr/src/linux-2.5.73)
-2) make menuconfig (or make xconfig)
-3) select cifs from within the network filesystem choices
-4) save and exit
-5) make
-
-
-Installation instructions:
-=========================
-If you have built the CIFS vfs as module (successfully) simply
-type "make modules_install" (or if you prefer, manually copy the file to
-the modules directory e.g. /lib/modules/2.4.10-4GB/kernel/fs/cifs/cifs.o).
-
-If you have built the CIFS vfs into the kernel itself, follow the instructions
-for your distribution on how to install a new kernel (usually you
-would simply type "make install").
-
-If you do not have the utility mount.cifs (in the Samba 3.0 source tree and on 
-the CIFS VFS web site) copy it to the same directory in which mount.smbfs and 
-similar files reside (usually /sbin).  Although the helper software is not  
-required, mount.cifs is recommended.  Eventually the Samba 3.0 utility program 
-"net" may also be helpful since it may someday provide easier mount syntax for
-users who are used to Windows e.g.
-	net use <mount point> <UNC name or cifs URL>
-Note that running the Winbind pam/nss module (logon service) on all of your
-Linux clients is useful in mapping Uids and Gids consistently across the
-domain to the proper network user.  The mount.cifs mount helper can be
-trivially built from Samba 3.0 or later source e.g. by executing:
-
-	gcc samba/source/client/mount.cifs.c -o mount.cifs
-
-If cifs is built as a module, then the size and number of network buffers
-and maximum number of simultaneous requests to one server can be configured.
-Changing these from their defaults is not recommended. By executing modinfo
-	modinfo kernel/fs/cifs/cifs.ko
-on kernel/fs/cifs/cifs.ko the list of configuration changes that can be made
-at module initialization time (by running insmod cifs.ko) can be seen.
-
-Allowing User Mounts
-====================
-To permit users to mount and unmount over directories they own is possible
-with the cifs vfs.  A way to enable such mounting is to mark the mount.cifs
-utility as suid (e.g. "chmod +s /sbin/mount.cifs). To enable users to 
-umount shares they mount requires
-1) mount.cifs version 1.4 or later
-2) an entry for the share in /etc/fstab indicating that a user may
-unmount it e.g.
-//server/usersharename  /mnt/username cifs user 0 0
-
-Note that when the mount.cifs utility is run suid (allowing user mounts), 
-in order to reduce risks, the "nosuid" mount flag is passed in on mount to
-disallow execution of an suid program mounted on the remote target.
-When mount is executed as root, nosuid is not passed in by default,
-and execution of suid programs on the remote target would be enabled
-by default. This can be changed, as with nfs and other filesystems, 
-by simply specifying "nosuid" among the mount options. For user mounts 
-though to be able to pass the suid flag to mount requires rebuilding 
-mount.cifs with the following flag: 
- 
-        gcc samba/source/client/mount.cifs.c -DCIFS_ALLOW_USR_SUID -o mount.cifs
-
-There is a corresponding manual page for cifs mounting in the Samba 3.0 and
-later source tree in docs/manpages/mount.cifs.8 
-
-Allowing User Unmounts
-======================
-To permit users to ummount directories that they have user mounted (see above),
-the utility umount.cifs may be used.  It may be invoked directly, or if 
-umount.cifs is placed in /sbin, umount can invoke the cifs umount helper
-(at least for most versions of the umount utility) for umount of cifs
-mounts, unless umount is invoked with -i (which will avoid invoking a umount
-helper). As with mount.cifs, to enable user unmounts umount.cifs must be marked
-as suid (e.g. "chmod +s /sbin/umount.cifs") or equivalent (some distributions
-allow adding entries to a file to the /etc/permissions file to achieve the
-equivalent suid effect).  For this utility to succeed the target path
-must be a cifs mount, and the uid of the current user must match the uid
-of the user who mounted the resource.
-
-Also note that the customary way of allowing user mounts and unmounts is 
-(instead of using mount.cifs and unmount.cifs as suid) to add a line
-to the file /etc/fstab for each //server/share you wish to mount, but
-this can become unwieldy when potential mount targets include many
-or  unpredictable UNC names.
-
-Samba Considerations 
-==================== 
-To get the maximum benefit from the CIFS VFS, we recommend using a server that 
-supports the SNIA CIFS Unix Extensions standard (e.g.  Samba 2.2.5 or later or 
-Samba 3.0) but the CIFS vfs works fine with a wide variety of CIFS servers.  
-Note that uid, gid and file permissions will display default values if you do 
-not have a server that supports the Unix extensions for CIFS (such as Samba 
-2.2.5 or later).  To enable the Unix CIFS Extensions in the Samba server, add 
-the line: 
-
-	unix extensions = yes
-	
-to your smb.conf file on the server.  Note that the following smb.conf settings 
-are also useful (on the Samba server) when the majority of clients are Unix or 
-Linux: 
-
-	case sensitive = yes
-	delete readonly = yes 
-	ea support = yes
-
-Note that server ea support is required for supporting xattrs from the Linux
-cifs client, and that EA support is present in later versions of Samba (e.g. 
-3.0.6 and later (also EA support works in all versions of Windows, at least to
-shares on NTFS filesystems).  Extended Attribute (xattr) support is an optional
-feature of most Linux filesystems which may require enabling via
-make menuconfig. Client support for extended attributes (user xattr) can be
-disabled on a per-mount basis by specifying "nouser_xattr" on mount.
-
-The CIFS client can get and set POSIX ACLs (getfacl, setfacl) to Samba servers
-version 3.10 and later.  Setting POSIX ACLs requires enabling both XATTR and 
-then POSIX support in the CIFS configuration options when building the cifs
-module.  POSIX ACL support can be disabled on a per mount basic by specifying
-"noacl" on mount.
- 
-Some administrators may want to change Samba's smb.conf "map archive" and 
-"create mask" parameters from the default.  Unless the create mask is changed
-newly created files can end up with an unnecessarily restrictive default mode,
-which may not be what you want, although if the CIFS Unix extensions are
-enabled on the server and client, subsequent setattr calls (e.g. chmod) can
-fix the mode.  Note that creating special devices (mknod) remotely 
-may require specifying a mkdev function to Samba if you are not using 
-Samba 3.0.6 or later.  For more information on these see the manual pages
-("man smb.conf") on the Samba server system.  Note that the cifs vfs,
-unlike the smbfs vfs, does not read the smb.conf on the client system 
-(the few optional settings are passed in on mount via -o parameters instead).  
-Note that Samba 2.2.7 or later includes a fix that allows the CIFS VFS to delete
-open files (required for strict POSIX compliance).  Windows Servers already 
-supported this feature. Samba server does not allow symlinks that refer to files
-outside of the share, so in Samba versions prior to 3.0.6, most symlinks to
-files with absolute paths (ie beginning with slash) such as:
-	 ln -s /mnt/foo bar
-would be forbidden. Samba 3.0.6 server or later includes the ability to create 
-such symlinks safely by converting unsafe symlinks (ie symlinks to server 
-files that are outside of the share) to a samba specific format on the server
-that is ignored by local server applications and non-cifs clients and that will
-not be traversed by the Samba server).  This is opaque to the Linux client
-application using the cifs vfs. Absolute symlinks will work to Samba 3.0.5 or
-later, but only for remote clients using the CIFS Unix extensions, and will
-be invisbile to Windows clients and typically will not affect local
-applications running on the same server as Samba.  
-
-Use instructions:
-================
-Once the CIFS VFS support is built into the kernel or installed as a module 
-(cifs.o), you can use mount syntax like the following to access Samba or Windows 
-servers: 
-
-  mount -t cifs //9.53.216.11/e$ /mnt -o user=myname,pass=mypassword
-
-Before -o the option -v may be specified to make the mount.cifs
-mount helper display the mount steps more verbosely.  
-After -o the following commonly used cifs vfs specific options
-are supported:
-
-  user=<username>
-  pass=<password>
-  domain=<domain name>
-  
-Other cifs mount options are described below.  Use of TCP names (in addition to
-ip addresses) is available if the mount helper (mount.cifs) is installed. If
-you do not trust the server to which are mounted, or if you do not have
-cifs signing enabled (and the physical network is insecure), consider use
-of the standard mount options "noexec" and "nosuid" to reduce the risk of 
-running an altered binary on your local system (downloaded from a hostile server
-or altered by a hostile router).
-
-Although mounting using format corresponding to the CIFS URL specification is
-not possible in mount.cifs yet, it is possible to use an alternate format
-for the server and sharename (which is somewhat similar to NFS style mount
-syntax) instead of the more widely used UNC format (i.e. \\server\share):
-  mount -t cifs tcp_name_of_server:share_name /mnt -o user=myname,pass=mypasswd
-
-When using the mount helper mount.cifs, passwords may be specified via alternate
-mechanisms, instead of specifying it after -o using the normal "pass=" syntax
-on the command line:
-1) By including it in a credential file. Specify credentials=filename as one
-of the mount options. Credential files contain two lines
-        username=someuser
-        password=your_password
-2) By specifying the password in the PASSWD environment variable (similarly
-the user name can be taken from the USER environment variable).
-3) By specifying the password in a file by name via PASSWD_FILE
-4) By specifying the password in a file by file descriptor via PASSWD_FD
-
-If no password is provided, mount.cifs will prompt for password entry
-
-Restrictions
-============
-Servers must support either "pure-TCP" (port 445 TCP/IP CIFS connections) or RFC 
-1001/1002 support for "Netbios-Over-TCP/IP." This is not likely to be a 
-problem as most servers support this.
-
-Valid filenames differ between Windows and Linux.  Windows typically restricts
-filenames which contain certain reserved characters (e.g.the character : 
-which is used to delimit the beginning of a stream name by Windows), while
-Linux allows a slightly wider set of valid characters in filenames. Windows
-servers can remap such characters when an explicit mapping is specified in
-the Server's registry.  Samba starting with version 3.10 will allow such 
-filenames (ie those which contain valid Linux characters, which normally
-would be forbidden for Windows/CIFS semantics) as long as the server is
-configured for Unix Extensions (and the client has not disabled
-/proc/fs/cifs/LinuxExtensionsEnabled).
-  
-
-CIFS VFS Mount Options
-======================
-A partial list of the supported mount options follows:
-  user		The user name to use when trying to establish
-		the CIFS session.
-  password	The user password.  If the mount helper is
-		installed, the user will be prompted for password
-		if not supplied.
-  ip		The ip address of the target server
-  unc		The target server Universal Network Name (export) to 
-		mount.	
-  domain	Set the SMB/CIFS workgroup name prepended to the
-		username during CIFS session establishment
-  forceuid	Set the default uid for inodes to the uid
-		passed in on mount. For mounts to servers
-		which do support the CIFS Unix extensions, such as a
-		properly configured Samba server, the server provides
-		the uid, gid and mode so this parameter should not be
-		specified unless the server and clients uid and gid
-		numbering differ.  If the server and client are in the
-		same domain (e.g. running winbind or nss_ldap) and
-		the server supports the Unix Extensions then the uid
-		and gid can be retrieved from the server (and uid
-		and gid would not have to be specifed on the mount. 
-		For servers which do not support the CIFS Unix
-		extensions, the default uid (and gid) returned on lookup
-		of existing files will be the uid (gid) of the person
-		who executed the mount (root, except when mount.cifs
-		is configured setuid for user mounts) unless the "uid=" 
-		(gid) mount option is specified. Also note that permission
-		checks (authorization checks) on accesses to a file occur
-		at the server, but there are cases in which an administrator
-		may want to restrict at the client as well.  For those
-		servers which do not report a uid/gid owner
-		(such as Windows), permissions can also be checked at the
-		client, and a crude form of client side permission checking 
-		can be enabled by specifying file_mode and dir_mode on 
-		the client.  (default)
-  forcegid	(similar to above but for the groupid instead of uid) (default)
-  noforceuid	Fill in file owner information (uid) by requesting it from
-		the server if possible. With this option, the value given in
-		the uid= option (on mount) will only be used if the server
-		can not support returning uids on inodes.
-  noforcegid	(similar to above but for the group owner, gid, instead of uid)
-  uid		Set the default uid for inodes, and indicate to the
-		cifs kernel driver which local user mounted. If the server
-		supports the unix extensions the default uid is
-		not used to fill in the owner fields of inodes (files)
-		unless the "forceuid" parameter is specified.
-  gid		Set the default gid for inodes (similar to above).
-  file_mode     If CIFS Unix extensions are not supported by the server
-		this overrides the default mode for file inodes.
-  fsc		Enable local disk caching using FS-Cache (off by default). This
-  		option could be useful to improve performance on a slow link,
-		heavily loaded server and/or network where reading from the
-		disk is faster than reading from the server (over the network).
-		This could also impact scalability positively as the
-		number of calls to the server are reduced. However, local
-		caching is not suitable for all workloads for e.g. read-once
-		type workloads. So, you need to consider carefully your
-		workload/scenario before using this option. Currently, local
-		disk caching is functional for CIFS files opened as read-only.
-  dir_mode      If CIFS Unix extensions are not supported by the server 
-		this overrides the default mode for directory inodes.
-  port		attempt to contact the server on this tcp port, before
-		trying the usual ports (port 445, then 139).
-  iocharset     Codepage used to convert local path names to and from
-		Unicode. Unicode is used by default for network path
-		names if the server supports it.  If iocharset is
-		not specified then the nls_default specified
-		during the local client kernel build will be used.
-		If server does not support Unicode, this parameter is
-		unused.
-  rsize		default read size (usually 16K). The client currently
-		can not use rsize larger than CIFSMaxBufSize. CIFSMaxBufSize
-		defaults to 16K and may be changed (from 8K to the maximum
-		kmalloc size allowed by your kernel) at module install time
-		for cifs.ko. Setting CIFSMaxBufSize to a very large value
-		will cause cifs to use more memory and may reduce performance
-		in some cases.  To use rsize greater than 127K (the original
-		cifs protocol maximum) also requires that the server support
-		a new Unix Capability flag (for very large read) which some
-		newer servers (e.g. Samba 3.0.26 or later) do. rsize can be
-		set from a minimum of 2048 to a maximum of 130048 (127K or
-		CIFSMaxBufSize, whichever is smaller)
-  wsize		default write size (default 57344)
-		maximum wsize currently allowed by CIFS is 57344 (fourteen
-		4096 byte pages)
-  actimeo=n	attribute cache timeout in seconds (default 1 second).
-		After this timeout, the cifs client requests fresh attribute
-		information from the server. This option allows to tune the
-		attribute cache timeout to suit the workload needs. Shorter
-		timeouts mean better the cache coherency, but increased number
-		of calls to the server. Longer timeouts mean reduced number
-		of calls to the server at the expense of less stricter cache
-		coherency checks (i.e. incorrect attribute cache for a short
-		period of time).
-  rw		mount the network share read-write (note that the
-		server may still consider the share read-only)
-  ro		mount network share read-only
-  version	used to distinguish different versions of the
-		mount helper utility (not typically needed)
-  sep		if first mount option (after the -o), overrides
-		the comma as the separator between the mount
-		parms. e.g.
-			-o user=myname,password=mypassword,domain=mydom
-		could be passed instead with period as the separator by
-			-o sep=.user=myname.password=mypassword.domain=mydom
-		this might be useful when comma is contained within username
-		or password or domain. This option is less important
-		when the cifs mount helper cifs.mount (version 1.1 or later)
-		is used.
-  nosuid        Do not allow remote executables with the suid bit 
-		program to be executed.  This is only meaningful for mounts
-		to servers such as Samba which support the CIFS Unix Extensions.
-		If you do not trust the servers in your network (your mount
-		targets) it is recommended that you specify this option for
-		greater security.
-  exec		Permit execution of binaries on the mount.
-  noexec	Do not permit execution of binaries on the mount.
-  dev		Recognize block devices on the remote mount.
-  nodev		Do not recognize devices on the remote mount.
-  suid          Allow remote files on this mountpoint with suid enabled to 
-		be executed (default for mounts when executed as root,
-		nosuid is default for user mounts).
-  credentials   Although ignored by the cifs kernel component, it is used by 
-		the mount helper, mount.cifs. When mount.cifs is installed it
-		opens and reads the credential file specified in order  
-		to obtain the userid and password arguments which are passed to
-		the cifs vfs.
-  guest         Although ignored by the kernel component, the mount.cifs
-		mount helper will not prompt the user for a password
-		if guest is specified on the mount options.  If no
-		password is specified a null password will be used.
-  perm          Client does permission checks (vfs_permission check of uid
-		and gid of the file against the mode and desired operation),
-		Note that this is in addition to the normal ACL check on the
-		target machine done by the server software. 
-		Client permission checking is enabled by default.
-  noperm        Client does not do permission checks.  This can expose
-		files on this mount to access by other users on the local
-		client system. It is typically only needed when the server
-		supports the CIFS Unix Extensions but the UIDs/GIDs on the
-		client and server system do not match closely enough to allow
-		access by the user doing the mount, but it may be useful with
-		non CIFS Unix Extension mounts for cases in which the default
-		mode is specified on the mount but is not to be enforced on the
-		client (e.g. perhaps when MultiUserMount is enabled)
-		Note that this does not affect the normal ACL check on the
-		target machine done by the server software (of the server
-		ACL against the user name provided at mount time).
-  serverino	Use server's inode numbers instead of generating automatically
-		incrementing inode numbers on the client.  Although this will
-		make it easier to spot hardlinked files (as they will have
-		the same inode numbers) and inode numbers may be persistent,
-		note that the server does not guarantee that the inode numbers
-		are unique if multiple server side mounts are exported under a
-		single share (since inode numbers on the servers might not
-		be unique if multiple filesystems are mounted under the same
-		shared higher level directory).  Note that some older
-		(e.g. pre-Windows 2000) do not support returning UniqueIDs
-		or the CIFS Unix Extensions equivalent and for those
-		this mount option will have no effect.  Exporting cifs mounts
-		under nfsd requires this mount option on the cifs mount.
-		This is now the default if server supports the 
-		required network operation.
-  noserverino   Client generates inode numbers (rather than using the actual one
-		from the server). These inode numbers will vary after
-		unmount or reboot which can confuse some applications,
-		but not all server filesystems support unique inode
-		numbers.
-  setuids       If the CIFS Unix extensions are negotiated with the server
-		the client will attempt to set the effective uid and gid of
-		the local process on newly created files, directories, and
-		devices (create, mkdir, mknod).  If the CIFS Unix Extensions
-		are not negotiated, for newly created files and directories
-		instead of using the default uid and gid specified on
-		the mount, cache the new file's uid and gid locally which means
-		that the uid for the file can change when the inode is
-	        reloaded (or the user remounts the share).
-  nosetuids     The client will not attempt to set the uid and gid on
-		on newly created files, directories, and devices (create, 
-		mkdir, mknod) which will result in the server setting the
-		uid and gid to the default (usually the server uid of the
-		user who mounted the share).  Letting the server (rather than
-		the client) set the uid and gid is the default. If the CIFS
-		Unix Extensions are not negotiated then the uid and gid for
-		new files will appear to be the uid (gid) of the mounter or the
-		uid (gid) parameter specified on the mount.
-  netbiosname   When mounting to servers via port 139, specifies the RFC1001
-		source name to use to represent the client netbios machine 
-		name when doing the RFC1001 netbios session initialize.
-  direct        Do not do inode data caching on files opened on this mount.
-		This precludes mmapping files on this mount. In some cases
-		with fast networks and little or no caching benefits on the
-		client (e.g. when the application is doing large sequential
-		reads bigger than page size without rereading the same data) 
-		this can provide better performance than the default
-		behavior which caches reads (readahead) and writes 
-		(writebehind) through the local Linux client pagecache 
-		if oplock (caching token) is granted and held. Note that
-		direct allows write operations larger than page size
-		to be sent to the server.
-  strictcache   Use for switching on strict cache mode. In this mode the
-		client read from the cache all the time it has Oplock Level II,
-		otherwise - read from the server. All written data are stored
-		in the cache, but if the client doesn't have Exclusive Oplock,
-		it writes the data to the server.
-  rwpidforward  Forward pid of a process who opened a file to any read or write
-		operation on that file. This prevent applications like WINE
-		from failing on read and write if we use mandatory brlock style.
-  acl   	Allow setfacl and getfacl to manage posix ACLs if server
-		supports them.  (default)
-  noacl 	Do not allow setfacl and getfacl calls on this mount
-  user_xattr    Allow getting and setting user xattrs (those attributes whose
-		name begins with "user." or "os2.") as OS/2 EAs (extended
-		attributes) to the server.  This allows support of the
-		setfattr and getfattr utilities. (default)
-  nouser_xattr  Do not allow getfattr/setfattr to get/set/list xattrs 
-  mapchars      Translate six of the seven reserved characters (not backslash)
-			*?<>|:
-		to the remap range (above 0xF000), which also
-		allows the CIFS client to recognize files created with
-		such characters by Windows's POSIX emulation. This can
-		also be useful when mounting to most versions of Samba
-		(which also forbids creating and opening files
-		whose names contain any of these seven characters).
-		This has no effect if the server does not support
-		Unicode on the wire.
- nomapchars     Do not translate any of these seven characters (default).
- nocase         Request case insensitive path name matching (case
-		sensitive is the default if the server supports it).
-		(mount option "ignorecase" is identical to "nocase")
- posixpaths     If CIFS Unix extensions are supported, attempt to
-		negotiate posix path name support which allows certain
-		characters forbidden in typical CIFS filenames, without
-		requiring remapping. (default)
- noposixpaths   If CIFS Unix extensions are supported, do not request
-		posix path name support (this may cause servers to
-		reject creatingfile with certain reserved characters).
- nounix         Disable the CIFS Unix Extensions for this mount (tree
-		connection). This is rarely needed, but it may be useful
-		in order to turn off multiple settings all at once (ie
-		posix acls, posix locks, posix paths, symlink support
-		and retrieving uids/gids/mode from the server) or to
-		work around a bug in server which implement the Unix
-		Extensions.
- nobrl          Do not send byte range lock requests to the server.
-		This is necessary for certain applications that break
-		with cifs style mandatory byte range locks (and most
-		cifs servers do not yet support requesting advisory
-		byte range locks).
- forcemandatorylock Even if the server supports posix (advisory) byte range
-		locking, send only mandatory lock requests.  For some
-		(presumably rare) applications, originally coded for
-		DOS/Windows, which require Windows style mandatory byte range
-		locking, they may be able to take advantage of this option,
-		forcing the cifs client to only send mandatory locks
-		even if the cifs server would support posix advisory locks.
-		"forcemand" is accepted as a shorter form of this mount
-		option.
- nostrictsync   If this mount option is set, when an application does an
-		fsync call then the cifs client does not send an SMB Flush
-		to the server (to force the server to write all dirty data
-		for this file immediately to disk), although cifs still sends
-		all dirty (cached) file data to the server and waits for the
-		server to respond to the write.  Since SMB Flush can be
-		very slow, and some servers may be reliable enough (to risk
-		delaying slightly flushing the data to disk on the server),
-		turning on this option may be useful to improve performance for
-		applications that fsync too much, at a small risk of server
-		crash.  If this mount option is not set, by default cifs will
-		send an SMB flush request (and wait for a response) on every
-		fsync call.
- nodfs          Disable DFS (global name space support) even if the
-		server claims to support it.  This can help work around
-		a problem with parsing of DFS paths with Samba server
-		versions 3.0.24 and 3.0.25.
- remount        remount the share (often used to change from ro to rw mounts
-	        or vice versa)
- cifsacl        Report mode bits (e.g. on stat) based on the Windows ACL for
-	        the file. (EXPERIMENTAL)
- servern        Specify the server 's netbios name (RFC1001 name) to use
-		when attempting to setup a session to the server. 
-		This is needed for mounting to some older servers (such
-		as OS/2 or Windows 98 and Windows ME) since they do not
-		support a default server name.  A server name can be up
-		to 15 characters long and is usually uppercased.
- sfu            When the CIFS Unix Extensions are not negotiated, attempt to
-		create device files and fifos in a format compatible with
-		Services for Unix (SFU).  In addition retrieve bits 10-12
-		of the mode via the SETFILEBITS extended attribute (as
-		SFU does).  In the future the bottom 9 bits of the
-		mode also will be emulated using queries of the security
-		descriptor (ACL).
- mfsymlinks     Enable support for Minshall+French symlinks
-		(see http://wiki.samba.org/index.php/UNIX_Extensions#Minshall.2BFrench_symlinks)
-		This option is ignored when specified together with the
-		'sfu' option. Minshall+French symlinks are used even if
-		the server supports the CIFS Unix Extensions.
- sign           Must use packet signing (helps avoid unwanted data modification
-		by intermediate systems in the route).  Note that signing
-		does not work with lanman or plaintext authentication.
- seal           Must seal (encrypt) all data on this mounted share before
-		sending on the network.  Requires support for Unix Extensions.
-		Note that this differs from the sign mount option in that it
-		causes encryption of data sent over this mounted share but other
-		shares mounted to the same server are unaffected.
- locallease     This option is rarely needed. Fcntl F_SETLEASE is
-		used by some applications such as Samba and NFSv4 server to
-		check to see whether a file is cacheable.  CIFS has no way
-		to explicitly request a lease, but can check whether a file
-		is cacheable (oplocked).  Unfortunately, even if a file
-		is not oplocked, it could still be cacheable (ie cifs client
-		could grant fcntl leases if no other local processes are using
-		the file) for cases for example such as when the server does not
-		support oplocks and the user is sure that the only updates to
-		the file will be from this client. Specifying this mount option
-		will allow the cifs client to check for leases (only) locally
-		for files which are not oplocked instead of denying leases
-		in that case. (EXPERIMENTAL)
- sec            Security mode.  Allowed values are:
-			none	attempt to connection as a null user (no name)
-			krb5    Use Kerberos version 5 authentication
-			krb5i   Use Kerberos authentication and packet signing
-			ntlm    Use NTLM password hashing (default)
-			ntlmi   Use NTLM password hashing with signing (if
-				/proc/fs/cifs/PacketSigningEnabled on or if
-				server requires signing also can be the default) 
-			ntlmv2  Use NTLMv2 password hashing      
-			ntlmv2i Use NTLMv2 password hashing with packet signing
-			lanman  (if configured in kernel config) use older
-				lanman hash
-hard		Retry file operations if server is not responding
-soft		Limit retries to unresponsive servers (usually only
-		one retry) before returning an error.  (default)
-
-The mount.cifs mount helper also accepts a few mount options before -o
-including:
-
-	-S      take password from stdin (equivalent to setting the environment
-		variable "PASSWD_FD=0"
-	-V      print mount.cifs version
-	-?      display simple usage information
-
-With most 2.6 kernel versions of modutils, the version of the cifs kernel
-module can be displayed via modinfo.
-
-Misc /proc/fs/cifs Flags and Debug Info
-=======================================
-Informational pseudo-files:
-DebugData		Displays information about active CIFS sessions and
-			shares, features enabled as well as the cifs.ko
-			version.
-Stats			Lists summary resource usage information as well as per
-			share statistics, if CONFIG_CIFS_STATS in enabled
-			in the kernel configuration.
-
-Configuration pseudo-files:
-PacketSigningEnabled	If set to one, cifs packet signing is enabled
-			and will be used if the server requires 
-			it.  If set to two, cifs packet signing is
-			required even if the server considers packet
-			signing optional. (default 1)
-SecurityFlags		Flags which control security negotiation and
-			also packet signing. Authentication (may/must)
-			flags (e.g. for NTLM and/or NTLMv2) may be combined with
-			the signing flags.  Specifying two different password
-			hashing mechanisms (as "must use") on the other hand 
-			does not make much sense. Default flags are 
-				0x07007 
-			(NTLM, NTLMv2 and packet signing allowed).  The maximum 
-			allowable flags if you want to allow mounts to servers
-			using weaker password hashes is 0x37037 (lanman,
-			plaintext, ntlm, ntlmv2, signing allowed).  Some
-			SecurityFlags require the corresponding menuconfig
-			options to be enabled (lanman and plaintext require
-			CONFIG_CIFS_WEAK_PW_HASH for example).  Enabling
-			plaintext authentication currently requires also
-			enabling lanman authentication in the security flags
-			because the cifs module only supports sending
-			laintext passwords using the older lanman dialect
-			form of the session setup SMB.  (e.g. for authentication
-			using plain text passwords, set the SecurityFlags
-			to 0x30030):
- 
-			may use packet signing 				0x00001
-			must use packet signing				0x01001
-			may use NTLM (most common password hash)	0x00002
-			must use NTLM					0x02002
-			may use NTLMv2					0x00004
-			must use NTLMv2					0x04004
-			may use Kerberos security			0x00008
-			must use Kerberos				0x08008
-			may use lanman (weak) password hash  		0x00010
-			must use lanman password hash			0x10010
-			may use plaintext passwords    			0x00020
-			must use plaintext passwords			0x20020
-			(reserved for future packet encryption)		0x00040
-
-cifsFYI			If set to non-zero value, additional debug information
-			will be logged to the system error log.  This field
-			contains three flags controlling different classes of
-			debugging entries.  The maximum value it can be set
-			to is 7 which enables all debugging points (default 0).
-			Some debugging statements are not compiled into the
-			cifs kernel unless CONFIG_CIFS_DEBUG2 is enabled in the
-			kernel configuration. cifsFYI may be set to one or
-			nore of the following flags (7 sets them all):
-
-			log cifs informational messages			0x01
-			log return codes from cifs entry points		0x02
-			log slow responses (ie which take longer than 1 second)
-			  CONFIG_CIFS_STATS2 must be enabled in .config	0x04
-				
-				
-traceSMB		If set to one, debug information is logged to the
-			system error log with the start of smb requests
-			and responses (default 0)
-LookupCacheEnable	If set to one, inode information is kept cached
-			for one second improving performance of lookups
-			(default 1)
-OplockEnabled		If set to one, safe distributed caching enabled.
-			(default 1)
-LinuxExtensionsEnabled	If set to one then the client will attempt to
-			use the CIFS "UNIX" extensions which are optional
-			protocol enhancements that allow CIFS servers
-			to return accurate UID/GID information as well
-			as support symbolic links. If you use servers
-			such as Samba that support the CIFS Unix
-			extensions but do not want to use symbolic link
-			support and want to map the uid and gid fields 
-			to values supplied at mount (rather than the 
-			actual values, then set this to zero. (default 1)
-
-These experimental features and tracing can be enabled by changing flags in 
-/proc/fs/cifs (after the cifs module has been installed or built into the 
-kernel, e.g.  insmod cifs).  To enable a feature set it to 1 e.g.  to enable 
-tracing to the kernel message log type: 
-
-	echo 7 > /proc/fs/cifs/cifsFYI
-	
-cifsFYI functions as a bit mask. Setting it to 1 enables additional kernel
-logging of various informational messages.  2 enables logging of non-zero
-SMB return codes while 4 enables logging of requests that take longer
-than one second to complete (except for byte range lock requests). 
-Setting it to 4 requires defining CONFIG_CIFS_STATS2 manually in the
-source code (typically by setting it in the beginning of cifsglob.h),
-and setting it to seven enables all three.  Finally, tracing
-the start of smb requests and responses can be enabled via:
-
-	echo 1 > /proc/fs/cifs/traceSMB
-
-Per share (per client mount) statistics are available in /proc/fs/cifs/Stats
-if the kernel was configured with cifs statistics enabled.  The statistics
-represent the number of successful (ie non-zero return code from the server) 
-SMB responses to some of the more common commands (open, delete, mkdir etc.).
-Also recorded is the total bytes read and bytes written to the server for
-that share.  Note that due to client caching effects this can be less than the
-number of bytes read and written by the application running on the client.
-The statistics for the number of total SMBs and oplock breaks are different in
-that they represent all for that share, not just those for which the server
-returned success.
-	
-Also note that "cat /proc/fs/cifs/DebugData" will display information about
-the active sessions and the shares that are mounted.
-
-Enabling Kerberos (extended security) works but requires version 1.2 or later
-of the helper program cifs.upcall to be present and to be configured in the
-/etc/request-key.conf file.  The cifs.upcall helper program is from the Samba
-project(http://www.samba.org). NTLM and NTLMv2 and LANMAN support do not
-require this helper. Note that NTLMv2 security (which does not require the
-cifs.upcall helper program), instead of using Kerberos, is sufficient for
-some use cases.
-
-DFS support allows transparent redirection to shares in an MS-DFS name space.
-In addition, DFS support for target shares which are specified as UNC
-names which begin with host names (rather than IP addresses) requires
-a user space helper (such as cifs.upcall) to be present in order to
-translate host names to ip address, and the user space helper must also
-be configured in the file /etc/request-key.conf.  Samba, Windows servers and
-many NAS appliances support DFS as a way of constructing a global name
-space to ease network configuration and improve reliability.
-
-To use cifs Kerberos and DFS support, the Linux keyutils package should be
-installed and something like the following lines should be added to the
-/etc/request-key.conf file:
-
-create cifs.spnego * * /usr/local/sbin/cifs.upcall %k
-create dns_resolver * * /usr/local/sbin/cifs.upcall %k
-
-CIFS kernel module parameters
-=============================
-These module parameters can be specified or modified either during the time of
-module loading or during the runtime by using the interface
-	/proc/module/cifs/parameters/<param>
-
-i.e. echo "value" > /sys/module/cifs/parameters/<param>
-
-1. enable_oplocks - Enable or disable oplocks. Oplocks are enabled by default.
-		    [Y/y/1]. To disable use any of [N/n/0].
-
diff --git a/fs/cifs/TODO b/fs/cifs/TODO
deleted file mode 100644
index 355abcdcda98..000000000000
--- a/fs/cifs/TODO
+++ /dev/null
@@ -1,129 +0,0 @@
-Version 1.53 May 20, 2008
-
-A Partial List of Missing Features
-==================================
-
-Contributions are welcome.  There are plenty of opportunities
-for visible, important contributions to this module.  Here
-is a partial list of the known problems and missing features:
-
-a) Support for SecurityDescriptors(Windows/CIFS ACLs) for chmod/chgrp/chown
-so that these operations can be supported to Windows servers
-
-b) Mapping POSIX ACLs (and eventually NFSv4 ACLs) to CIFS
-SecurityDescriptors
-
-c) Better pam/winbind integration (e.g. to handle uid mapping
-better)
-
-d) Cleanup now unneeded SessSetup code in
-fs/cifs/connect.c and add back in NTLMSSP code if any servers
-need it
-
-e) fix NTLMv2 signing when two mounts with different users to same
-server.
-
-f) Directory entry caching relies on a 1 second timer, rather than 
-using FindNotify or equivalent.  - (started)
-
-g) quota support (needs minor kernel change since quota calls
-to make it to network filesystems or deviceless filesystems)
-
-h) investigate sync behavior (including syncpage) and check  
-for proper behavior of intr/nointr
-
-i) improve support for very old servers (OS/2 and Win9x for example)
-Including support for changing the time remotely (utimes command).
-
-j) hook lower into the sockets api (as NFS/SunRPC does) to avoid the
-extra copy in/out of the socket buffers in some cases.
-
-k) Better optimize open (and pathbased setfilesize) to reduce the
-oplock breaks coming from windows srv.  Piggyback identical file
-opens on top of each other by incrementing reference count rather
-than resending (helps reduce server resource utilization and avoid
-spurious oplock breaks).
-
-l) Improve performance of readpages by sending more than one read
-at a time when 8 pages or more are requested. In conjuntion
-add support for async_cifs_readpages.
-
-m) Add support for storing symlink info to Windows servers 
-in the Extended Attribute format their SFU clients would recognize.
-
-n) Finish fcntl D_NOTIFY support so kde and gnome file list windows
-will autorefresh (partially complete by Asser). Needs minor kernel
-vfs change to support removing D_NOTIFY on a file.   
-
-o) Add GUI tool to configure /proc/fs/cifs settings and for display of
-the CIFS statistics (started)
-
-p) implement support for security and trusted categories of xattrs
-(requires minor protocol extension) to enable better support for SELINUX
-
-q) Implement O_DIRECT flag on open (already supported on mount)
-
-r) Create UID mapping facility so server UIDs can be mapped on a per
-mount or a per server basis to client UIDs or nobody if no mapping
-exists.  This is helpful when Unix extensions are negotiated to
-allow better permission checking when UIDs differ on the server
-and client.  Add new protocol request to the CIFS protocol 
-standard for asking the server for the corresponding name of a
-particular uid.
-
-s) Add support for CIFS Unix and also the newer POSIX extensions to the
-server side for Samba 4.
-
-t) In support for OS/2 (LANMAN 1.2 and LANMAN2.1 based SMB servers) 
-need to add ability to set time to server (utimes command)
-
-u) DOS attrs - returned as pseudo-xattr in Samba format (check VFAT and NTFS for this too)
-
-v) mount check for unmatched uids
-
-w) Add support for new vfs entry point for fallocate
-
-x) Fix Samba 3 server to handle Linux kernel aio so dbench with lots of 
-processes can proceed better in parallel (on the server)
-
-y) Fix Samba 3 to handle reads/writes over 127K (and remove the cifs mount
-restriction of wsize max being 127K) 
-
-KNOWN BUGS (updated April 24, 2007)
-====================================
-See http://bugzilla.samba.org - search on product "CifsVFS" for
-current bug list.
-
-1) existing symbolic links (Windows reparse points) are recognized but
-can not be created remotely. They are implemented for Samba and those that
-support the CIFS Unix extensions, although earlier versions of Samba
-overly restrict the pathnames.
-2) follow_link and readdir code does not follow dfs junctions
-but recognizes them
-3) create of new files to FAT partitions on Windows servers can
-succeed but still return access denied (appears to be Windows 
-server not cifs client problem) and has not been reproduced recently.
-NTFS partitions do not have this problem.
-4) Unix/POSIX capabilities are reset after reconnection, and affect
-a few fields in the tree connection but we do do not know which
-superblocks to apply these changes to.  We should probably walk
-the list of superblocks to set these.  Also need to check the
-flags on the second mount to the same share, and see if we
-can do the same trick that NFS does to remount duplicate shares.
-
-Misc testing to do
-==================
-1) check out max path names and max path name components against various server
-types. Try nested symlinks (8 deep). Return max path name in stat -f information
-
-2) Modify file portion of ltp so it can run against a mounted network
-share and run it against cifs vfs in automated fashion.
-
-3) Additional performance testing and optimization using iozone and similar - 
-there are some easy changes that can be done to parallelize sequential writes,
-and when signing is disabled to request larger read sizes (larger than 
-negotiated size) and send larger write sizes to modern servers.
-
-4) More exhaustively test against less common servers.  More testing
-against Windows 9x, Windows ME servers.
-
diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h
index fe8d6276410a..d8eac3b6cefb 100644
--- a/fs/cifs/cifs_unicode.h
+++ b/fs/cifs/cifs_unicode.h
@@ -91,6 +91,8 @@ extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen,
 #endif /* CONFIG_CIFS_SMB2 */
 #endif
 
+wchar_t cifs_toupper(wchar_t in);
+
 /*
  * UniStrcat:  Concatenate the second string to the first
  *
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 85ea98d139fc..a16b4e58bcc6 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -255,6 +255,7 @@ cifs_alloc_inode(struct super_block *sb)
 	cifs_inode->server_eof = 0;
 	cifs_inode->uniqueid = 0;
 	cifs_inode->createtime = 0;
+	cifs_inode->epoch = 0;
 #ifdef CONFIG_CIFS_SMB2
 	get_random_bytes(cifs_inode->lease_key, SMB2_LEASE_KEY_SIZE);
 #endif
@@ -357,6 +358,18 @@ cifs_show_cache_flavor(struct seq_file *s, struct cifs_sb_info *cifs_sb)
 		seq_printf(s, "loose");
 }
 
+static void
+cifs_show_nls(struct seq_file *s, struct nls_table *cur)
+{
+	struct nls_table *def;
+
+	/* Display iocharset= option if it's not default charset */
+	def = load_nls_default();
+	if (def != cur)
+		seq_printf(s, ",iocharset=%s", cur->charset);
+	unload_nls(def);
+}
+
 /*
  * cifs_show_options() is for displaying mount options in /proc/mounts.
  * Not all settable options are displayed but most of the important
@@ -418,6 +431,9 @@ cifs_show_options(struct seq_file *s, struct dentry *root)
 		seq_printf(s, ",file_mode=0%ho,dir_mode=0%ho",
 					   cifs_sb->mnt_file_mode,
 					   cifs_sb->mnt_dir_mode);
+
+	cifs_show_nls(s, cifs_sb->local_nls);
+
 	if (tcon->seal)
 		seq_printf(s, ",seal");
 	if (tcon->nocase)
@@ -718,7 +734,7 @@ static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
 	written = generic_file_aio_write(iocb, iov, nr_segs, pos);
 
-	if (CIFS_I(inode)->clientCanCacheAll)
+	if (CIFS_CACHE_WRITE(CIFS_I(inode)))
 		return written;
 
 	rc = filemap_fdatawrite(inode->i_mapping);
@@ -743,7 +759,7 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
 		 * We need to be sure that all dirty pages are written and the
 		 * server has the newest file length.
 		 */
-		if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping &&
+		if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping &&
 		    inode->i_mapping->nrpages != 0) {
 			rc = filemap_fdatawait(inode->i_mapping);
 			if (rc) {
@@ -767,8 +783,10 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence)
 
 static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 {
-	/* note that this is called by vfs setlease with i_lock held
-	   to protect *lease from going away */
+	/*
+	 * Note that this is called by vfs setlease with i_lock held to
+	 * protect *lease from going away.
+	 */
 	struct inode *inode = file_inode(file);
 	struct cifsFileInfo *cfile = file->private_data;
 
@@ -776,20 +794,19 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease)
 		return -EINVAL;
 
 	/* check if file is oplocked */
-	if (((arg == F_RDLCK) &&
-		(CIFS_I(inode)->clientCanCacheRead)) ||
-	    ((arg == F_WRLCK) &&
-		(CIFS_I(inode)->clientCanCacheAll)))
+	if (((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) ||
+	    ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode))))
 		return generic_setlease(file, arg, lease);
 	else if (tlink_tcon(cfile->tlink)->local_lease &&
-		 !CIFS_I(inode)->clientCanCacheRead)
-		/* If the server claims to support oplock on this
-		   file, then we still need to check oplock even
-		   if the local_lease mount option is set, but there
-		   are servers which do not support oplock for which
-		   this mount option may be useful if the user
-		   knows that the file won't be changed on the server
-		   by anyone else */
+		 !CIFS_CACHE_READ(CIFS_I(inode)))
+		/*
+		 * If the server claims to support oplock on this file, then we
+		 * still need to check oplock even if the local_lease mount
+		 * option is set, but there are servers which do not support
+		 * oplock for which this mount option may be useful if the user
+		 * knows that the file won't be changed on the server by anyone
+		 * else.
+		 */
 		return generic_setlease(file, arg, lease);
 	else
 		return -EAGAIN;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 52ca861ed35e..cfa14c80ef3b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -28,6 +28,7 @@
 #include "cifsacl.h"
 #include <crypto/internal/hash.h>
 #include <linux/scatterlist.h>
+#include <uapi/linux/cifs/cifs_mount.h>
 #ifdef CONFIG_CIFS_SMB2
 #include "smb2pdu.h"
 #endif
@@ -41,12 +42,7 @@
 #define MAX_SES_INFO 2
 #define MAX_TCON_INFO 4
 
-#define MAX_TREE_SIZE (2 + MAX_SERVER_SIZE + 1 + MAX_SHARE_SIZE + 1)
-#define MAX_SERVER_SIZE 15
-#define MAX_SHARE_SIZE 80
-#define CIFS_MAX_DOMAINNAME_LEN 256 /* max domain name length */
-#define MAX_USERNAME_SIZE 256	/* reasonable maximum for current servers */
-#define MAX_PASSWORD_SIZE 512	/* max for windows seems to be 256 wide chars */
+#define MAX_TREE_SIZE (2 + CIFS_NI_MAXHOST + 1 + CIFS_MAX_SHARE_LEN + 1)
 
 #define CIFS_MIN_RCV_POOL 4
 
@@ -135,6 +131,7 @@ struct cifs_secmech {
 
 /* per smb session structure/fields */
 struct ntlmssp_auth {
+	bool sesskey_per_smbsess; /* whether session key is per smb session */
 	__u32 client_flags; /* sent by client in type 1 ntlmsssp exchange */
 	__u32 server_flags; /* sent by server in type 2 ntlmssp exchange */
 	unsigned char ciphertext[CIFS_CPHTXT_SIZE]; /* sent to server */
@@ -308,6 +305,9 @@ struct smb_version_operations {
 	int (*create_hardlink)(const unsigned int, struct cifs_tcon *,
 			       const char *, const char *,
 			       struct cifs_sb_info *);
+	/* query symlink target */
+	int (*query_symlink)(const unsigned int, struct cifs_tcon *,
+			     const char *, char **, struct cifs_sb_info *);
 	/* open a file for non-posix mounts */
 	int (*open)(const unsigned int, struct cifs_open_parms *,
 		    __u32 *, FILE_ALL_INFO *);
@@ -361,18 +361,24 @@ struct smb_version_operations {
 	/* push brlocks from the cache to the server */
 	int (*push_mand_locks)(struct cifsFileInfo *);
 	/* get lease key of the inode */
-	void (*get_lease_key)(struct inode *, struct cifs_fid *fid);
+	void (*get_lease_key)(struct inode *, struct cifs_fid *);
 	/* set lease key of the inode */
-	void (*set_lease_key)(struct inode *, struct cifs_fid *fid);
+	void (*set_lease_key)(struct inode *, struct cifs_fid *);
 	/* generate new lease key */
-	void (*new_lease_key)(struct cifs_fid *fid);
-	/* The next two functions will need to be changed to per smb session */
-	void (*generate_signingkey)(struct TCP_Server_Info *server);
-	int (*calc_signature)(struct smb_rqst *rqst,
-				   struct TCP_Server_Info *server);
-	int (*query_mf_symlink)(const unsigned char *path, char *pbuf,
-			unsigned int *pbytes_read, struct cifs_sb_info *cifs_sb,
-			unsigned int xid);
+	void (*new_lease_key)(struct cifs_fid *);
+	int (*generate_signingkey)(struct cifs_ses *);
+	int (*calc_signature)(struct smb_rqst *, struct TCP_Server_Info *);
+	int (*query_mf_symlink)(const unsigned char *, char *, unsigned int *,
+				struct cifs_sb_info *, unsigned int);
+	/* if we can do cache read operations */
+	bool (*is_read_op)(__u32);
+	/* set oplock level for the inode */
+	void (*set_oplock_level)(struct cifsInodeInfo *, __u32, unsigned int,
+				 bool *);
+	/* create lease context buffer for CREATE request */
+	char * (*create_lease_buf)(u8 *, u8);
+	/* parse lease context buffer and return oplock/epoch info */
+	__u8 (*parse_lease_buf)(void *, unsigned int *);
 };
 
 struct smb_version_values {
@@ -390,9 +396,9 @@ struct smb_version_values {
 	unsigned int	cap_unix;
 	unsigned int	cap_nt_find;
 	unsigned int	cap_large_files;
-	unsigned int	oplock_read;
 	__u16		signing_enabled;
 	__u16		signing_required;
+	size_t		create_lease_size;
 };
 
 #define HEADER_SIZE(server) (server->vals->header_size)
@@ -548,7 +554,6 @@ struct TCP_Server_Info {
 	int timeAdj;  /* Adjust for difference in server time zone in sec */
 	__u64 CurrentMid;         /* multiplex id - rotating counter */
 	char cryptkey[CIFS_CRYPTO_KEY_SIZE]; /* used by ntlm, ntlmv2 etc */
-	char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */
 	/* 16th byte of RFC1001 workstation name is always null */
 	char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL];
 	__u32 sequence_number; /* for signing, protected by srv_mutex */
@@ -731,6 +736,7 @@ struct cifs_ses {
 	bool need_reconnect:1; /* connection reset, uid now invalid */
 #ifdef CONFIG_CIFS_SMB2
 	__u16 session_flags;
+	char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */
 #endif /* CONFIG_CIFS_SMB2 */
 };
 
@@ -935,6 +941,8 @@ struct cifs_fid {
 	__u8 lease_key[SMB2_LEASE_KEY_SIZE];	/* lease key for smb2 */
 #endif
 	struct cifs_pending_open *pending_open;
+	unsigned int epoch;
+	bool purge_cache;
 };
 
 struct cifs_fid_locks {
@@ -1032,6 +1040,17 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file)
 struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file);
 void cifsFileInfo_put(struct cifsFileInfo *cifs_file);
 
+#define CIFS_CACHE_READ_FLG	1
+#define CIFS_CACHE_HANDLE_FLG	2
+#define CIFS_CACHE_RH_FLG	(CIFS_CACHE_READ_FLG | CIFS_CACHE_HANDLE_FLG)
+#define CIFS_CACHE_WRITE_FLG	4
+#define CIFS_CACHE_RW_FLG	(CIFS_CACHE_READ_FLG | CIFS_CACHE_WRITE_FLG)
+#define CIFS_CACHE_RHW_FLG	(CIFS_CACHE_RW_FLG | CIFS_CACHE_HANDLE_FLG)
+
+#define CIFS_CACHE_READ(cinode) (cinode->oplock & CIFS_CACHE_READ_FLG)
+#define CIFS_CACHE_HANDLE(cinode) (cinode->oplock & CIFS_CACHE_HANDLE_FLG)
+#define CIFS_CACHE_WRITE(cinode) (cinode->oplock & CIFS_CACHE_WRITE_FLG)
+
 /*
  * One of these for each file inode
  */
@@ -1043,8 +1062,8 @@ struct cifsInodeInfo {
 	/* BB add in lists for dirty pages i.e. write caching info for oplock */
 	struct list_head openFileList;
 	__u32 cifsAttrs; /* e.g. DOS archive bit, sparse, compressed, system */
-	bool clientCanCacheRead;	/* read oplock */
-	bool clientCanCacheAll;		/* read and writebehind oplock */
+	unsigned int oplock;		/* oplock/lease level we have */
+	unsigned int epoch;		/* used to track lease state changes */
 	bool delete_pending;		/* DELETE_ON_CLOSE is set */
 	bool invalid_mapping;		/* pagecache is invalid */
 	unsigned long time;		/* jiffies of last update of inode */
@@ -1502,7 +1521,7 @@ extern mempool_t *cifs_mid_poolp;
 extern struct smb_version_operations smb1_operations;
 extern struct smb_version_values smb1_values;
 #define SMB20_VERSION_STRING	"2.0"
-/*extern struct smb_version_operations smb20_operations; */ /* not needed yet */
+extern struct smb_version_operations smb20_operations;
 extern struct smb_version_values smb20_values;
 #define SMB21_VERSION_STRING	"2.1"
 extern struct smb_version_operations smb21_operations;
diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h
index 11ca24a8e054..948676db8e2e 100644
--- a/fs/cifs/cifspdu.h
+++ b/fs/cifs/cifspdu.h
@@ -1495,11 +1495,12 @@ struct reparse_data {
 	__u32	ReparseTag;
 	__u16	ReparseDataLength;
 	__u16	Reserved;
-	__u16	AltNameOffset;
-	__u16	AltNameLen;
-	__u16	TargetNameOffset;
-	__u16	TargetNameLen;
-	char	LinkNamesBuf[1];
+	__u16	SubstituteNameOffset;
+	__u16	SubstituteNameLength;
+	__u16	PrintNameOffset;
+	__u16	PrintNameLength;
+	__u32	Flags;
+	char	PathBuffer[0];
 } __attribute__((packed));
 
 struct cifs_quota_data {
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index b29a012bed33..b5ec2a268f56 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -357,13 +357,9 @@ extern int CIFSSMBUnixQuerySymLink(const unsigned int xid,
 			struct cifs_tcon *tcon,
 			const unsigned char *searchName, char **syminfo,
 			const struct nls_table *nls_codepage);
-#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL
-extern int CIFSSMBQueryReparseLinkInfo(const unsigned int xid,
-			struct cifs_tcon *tcon,
-			const unsigned char *searchName,
-			char *symlinkinfo, const int buflen, __u16 fid,
-			const struct nls_table *nls_codepage);
-#endif /* temporarily unused until cifs_symlink fixed */
+extern int CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
+			       __u16 fid, char **symlinkinfo,
+			       const struct nls_table *nls_codepage);
 extern int CIFSSMBOpen(const unsigned int xid, struct cifs_tcon *tcon,
 			const char *fileName, const int disposition,
 			const int access_flags, const int omode,
@@ -435,7 +431,7 @@ extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *);
 extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *);
 extern void cifs_crypto_shash_release(struct TCP_Server_Info *);
 extern int calc_seckey(struct cifs_ses *);
-extern void generate_smb3signingkey(struct TCP_Server_Info *);
+extern int generate_smb3signingkey(struct cifs_ses *);
 
 #ifdef CONFIG_CIFS_WEAK_PW_HASH
 extern int calc_lanman_hash(const char *password, const char *cryptkey,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index a89c4cb4e6cf..a3d74fea1623 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -3067,7 +3067,6 @@ querySymLinkRetry:
 	return rc;
 }
 
-#ifdef CONFIG_CIFS_SYMLINK_EXPERIMENTAL
 /*
  *	Recent Windows versions now create symlinks more frequently
  *	and they use the "reparse point" mechanism below.  We can of course
@@ -3079,18 +3078,22 @@ querySymLinkRetry:
  *	it is not compiled in by default until callers fixed up and more tested.
  */
 int
-CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon,
-			const unsigned char *searchName,
-			char *symlinkinfo, const int buflen, __u16 fid,
-			const struct nls_table *nls_codepage)
+CIFSSMBQuerySymLink(const unsigned int xid, struct cifs_tcon *tcon,
+		    __u16 fid, char **symlinkinfo,
+		    const struct nls_table *nls_codepage)
 {
 	int rc = 0;
 	int bytes_returned;
 	struct smb_com_transaction_ioctl_req *pSMB;
 	struct smb_com_transaction_ioctl_rsp *pSMBr;
+	bool is_unicode;
+	unsigned int sub_len;
+	char *sub_start;
+	struct reparse_data *reparse_buf;
+	__u32 data_offset, data_count;
+	char *end_of_smb;
 
-	cifs_dbg(FYI, "In Windows reparse style QueryLink for path %s\n",
-		 searchName);
+	cifs_dbg(FYI, "In Windows reparse style QueryLink for fid %u\n", fid);
 	rc = smb_init(SMB_COM_NT_TRANSACT, 23, tcon, (void **) &pSMB,
 		      (void **) &pSMBr);
 	if (rc)
@@ -3119,66 +3122,55 @@ CIFSSMBQueryReparseLinkInfo(const unsigned int xid, struct cifs_tcon *tcon,
 			 (struct smb_hdr *) pSMBr, &bytes_returned, 0);
 	if (rc) {
 		cifs_dbg(FYI, "Send error in QueryReparseLinkInfo = %d\n", rc);
-	} else {		/* decode response */
-		__u32 data_offset = le32_to_cpu(pSMBr->DataOffset);
-		__u32 data_count = le32_to_cpu(pSMBr->DataCount);
-		if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) {
-			/* BB also check enough total bytes returned */
-			rc = -EIO;	/* bad smb */
-			goto qreparse_out;
-		}
-		if (data_count && (data_count < 2048)) {
-			char *end_of_smb = 2 /* sizeof byte count */ +
-			       get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
-
-			struct reparse_data *reparse_buf =
-						(struct reparse_data *)
-						((char *)&pSMBr->hdr.Protocol
-								 + data_offset);
-			if ((char *)reparse_buf >= end_of_smb) {
-				rc = -EIO;
-				goto qreparse_out;
-			}
-			if ((reparse_buf->LinkNamesBuf +
-				reparse_buf->TargetNameOffset +
-				reparse_buf->TargetNameLen) > end_of_smb) {
-				cifs_dbg(FYI, "reparse buf beyond SMB\n");
-				rc = -EIO;
-				goto qreparse_out;
-			}
+		goto qreparse_out;
+	}
 
-			if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE) {
-				cifs_from_ucs2(symlinkinfo, (__le16 *)
-						(reparse_buf->LinkNamesBuf +
-						reparse_buf->TargetNameOffset),
-						buflen,
-						reparse_buf->TargetNameLen,
-						nls_codepage, 0);
-			} else { /* ASCII names */
-				strncpy(symlinkinfo,
-					reparse_buf->LinkNamesBuf +
-					reparse_buf->TargetNameOffset,
-					min_t(const int, buflen,
-					   reparse_buf->TargetNameLen));
-			}
-		} else {
-			rc = -EIO;
-			cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n");
-		}
-		symlinkinfo[buflen] = 0; /* just in case so the caller
-					does not go off the end of the buffer */
-		cifs_dbg(FYI, "readlink result - %s\n", symlinkinfo);
+	data_offset = le32_to_cpu(pSMBr->DataOffset);
+	data_count = le32_to_cpu(pSMBr->DataCount);
+	if (get_bcc(&pSMBr->hdr) < 2 || data_offset > 512) {
+		/* BB also check enough total bytes returned */
+		rc = -EIO;	/* bad smb */
+		goto qreparse_out;
+	}
+	if (!data_count || (data_count > 2048)) {
+		rc = -EIO;
+		cifs_dbg(FYI, "Invalid return data count on get reparse info ioctl\n");
+		goto qreparse_out;
+	}
+	end_of_smb = 2 + get_bcc(&pSMBr->hdr) + (char *)&pSMBr->ByteCount;
+	reparse_buf = (struct reparse_data *)
+				((char *)&pSMBr->hdr.Protocol + data_offset);
+	if ((char *)reparse_buf >= end_of_smb) {
+		rc = -EIO;
+		goto qreparse_out;
 	}
+	if ((reparse_buf->PathBuffer + reparse_buf->PrintNameOffset +
+				reparse_buf->PrintNameLength) > end_of_smb) {
+		cifs_dbg(FYI, "reparse buf beyond SMB\n");
+		rc = -EIO;
+		goto qreparse_out;
+	}
+	sub_start = reparse_buf->SubstituteNameOffset + reparse_buf->PathBuffer;
+	sub_len = reparse_buf->SubstituteNameLength;
+	if (pSMBr->hdr.Flags2 & SMBFLG2_UNICODE)
+		is_unicode = true;
+	else
+		is_unicode = false;
 
+	/* BB FIXME investigate remapping reserved chars here */
+	*symlinkinfo = cifs_strndup_from_utf16(sub_start, sub_len, is_unicode,
+					       nls_codepage);
+	if (!*symlinkinfo)
+		rc = -ENOMEM;
 qreparse_out:
 	cifs_buf_release(pSMB);
 
-	/* Note: On -EAGAIN error only caller can retry on handle based calls
-		since file handle passed in no longer valid */
-
+	/*
+	 * Note: On -EAGAIN error only caller can retry on handle based calls
+	 * since file handle passed in no longer valid.
+	 */
 	return rc;
 }
-#endif /* CIFS_SYMLINK_EXPERIMENTAL */ /* BB temporarily unused */
 
 #ifdef CONFIG_CIFS_POSIX
 
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index d67c550c4980..a279ffc0bc29 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -379,6 +379,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 		try_to_freeze();
 
 		/* we should try only the port we connected to before */
+		mutex_lock(&server->srv_mutex);
 		rc = generic_ip_connect(server);
 		if (rc) {
 			cifs_dbg(FYI, "reconnect error %d\n", rc);
@@ -390,6 +391,7 @@ cifs_reconnect(struct TCP_Server_Info *server)
 				server->tcpStatus = CifsNeedNegotiate;
 			spin_unlock(&GlobalMid_Lock);
 		}
+		mutex_unlock(&server->srv_mutex);
 	} while (server->tcpStatus == CifsNeedReconnect);
 
 	return rc;
@@ -1114,7 +1116,7 @@ cifs_parse_smb_version(char *value, struct smb_vol *vol)
 		break;
 #ifdef CONFIG_CIFS_SMB2
 	case Smb_20:
-		vol->ops = &smb21_operations; /* currently identical with 2.1 */
+		vol->ops = &smb20_operations;
 		vol->vals = &smb20_values;
 		break;
 	case Smb_21:
@@ -1575,8 +1577,8 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			if (string == NULL)
 				goto out_nomem;
 
-			if (strnlen(string, MAX_USERNAME_SIZE) >
-							MAX_USERNAME_SIZE) {
+			if (strnlen(string, CIFS_MAX_USERNAME_LEN) >
+							CIFS_MAX_USERNAME_LEN) {
 				printk(KERN_WARNING "CIFS: username too long\n");
 				goto cifs_parse_mount_err;
 			}
@@ -2221,13 +2223,13 @@ static int match_session(struct cifs_ses *ses, struct smb_vol *vol)
 		/* anything else takes username/password */
 		if (strncmp(ses->user_name,
 			    vol->username ? vol->username : "",
-			    MAX_USERNAME_SIZE))
+			    CIFS_MAX_USERNAME_LEN))
 			return 0;
 		if (strlen(vol->username) != 0 &&
 		    ses->password != NULL &&
 		    strncmp(ses->password,
 			    vol->password ? vol->password : "",
-			    MAX_PASSWORD_SIZE))
+			    CIFS_MAX_PASSWORD_LEN))
 			return 0;
 	}
 	return 1;
@@ -2352,7 +2354,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 	}
 
 	len = delim - payload;
-	if (len > MAX_USERNAME_SIZE || len <= 0) {
+	if (len > CIFS_MAX_USERNAME_LEN || len <= 0) {
 		cifs_dbg(FYI, "Bad value from username search (len=%zd)\n",
 			 len);
 		rc = -EINVAL;
@@ -2369,7 +2371,7 @@ cifs_set_cifscreds(struct smb_vol *vol, struct cifs_ses *ses)
 	cifs_dbg(FYI, "%s: username=%s\n", __func__, vol->username);
 
 	len = key->datalen - (len + 1);
-	if (len > MAX_PASSWORD_SIZE || len <= 0) {
+	if (len > CIFS_MAX_PASSWORD_LEN || len <= 0) {
 		cifs_dbg(FYI, "Bad len for password search (len=%zd)\n", len);
 		rc = -EINVAL;
 		kfree(vol->username);
@@ -3826,33 +3828,8 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses,
 	if (server->ops->sess_setup)
 		rc = server->ops->sess_setup(xid, ses, nls_info);
 
-	if (rc) {
+	if (rc)
 		cifs_dbg(VFS, "Send error in SessSetup = %d\n", rc);
-	} else {
-		mutex_lock(&server->srv_mutex);
-		if (!server->session_estab) {
-			server->session_key.response = ses->auth_key.response;
-			server->session_key.len = ses->auth_key.len;
-			server->sequence_number = 0x2;
-			server->session_estab = true;
-			ses->auth_key.response = NULL;
-			if (server->ops->generate_signingkey)
-				server->ops->generate_signingkey(server);
-		}
-		mutex_unlock(&server->srv_mutex);
-
-		cifs_dbg(FYI, "CIFS Session Established successfully\n");
-		spin_lock(&GlobalMid_Lock);
-		ses->status = CifsGood;
-		ses->need_reconnect = false;
-		spin_unlock(&GlobalMid_Lock);
-	}
-
-	kfree(ses->auth_key.response);
-	ses->auth_key.response = NULL;
-	ses->auth_key.len = 0;
-	kfree(ses->ntlmssp);
-	ses->ntlmssp = NULL;
 
 	return rc;
 }
diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index d62ce0d48141..5384c2a640ca 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -32,6 +32,7 @@
 #include "cifsproto.h"
 #include "cifs_debug.h"
 #include "cifs_fs_sb.h"
+#include "cifs_unicode.h"
 
 static void
 renew_parental_timestamps(struct dentry *direntry)
@@ -499,6 +500,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
 		if (server->ops->close)
 			server->ops->close(xid, tcon, &fid);
 		cifs_del_pending_open(&open);
+		fput(file);
 		rc = -ENOMEM;
 	}
 
@@ -834,12 +836,17 @@ static int cifs_ci_hash(const struct dentry *dentry, struct qstr *q)
 {
 	struct nls_table *codepage = CIFS_SB(dentry->d_sb)->local_nls;
 	unsigned long hash;
-	int i;
+	wchar_t c;
+	int i, charlen;
 
 	hash = init_name_hash();
-	for (i = 0; i < q->len; i++)
-		hash = partial_name_hash(nls_tolower(codepage, q->name[i]),
-					 hash);
+	for (i = 0; i < q->len; i += charlen) {
+		charlen = codepage->char2uni(&q->name[i], q->len - i, &c);
+		/* error out if we can't convert the character */
+		if (unlikely(charlen < 0))
+			return charlen;
+		hash = partial_name_hash(cifs_toupper(c), hash);
+	}
 	q->hash = end_name_hash(hash);
 
 	return 0;
@@ -849,11 +856,47 @@ static int cifs_ci_compare(const struct dentry *parent, const struct dentry *den
 		unsigned int len, const char *str, const struct qstr *name)
 {
 	struct nls_table *codepage = CIFS_SB(parent->d_sb)->local_nls;
+	wchar_t c1, c2;
+	int i, l1, l2;
 
-	if ((name->len == len) &&
-	    (nls_strnicmp(codepage, name->name, str, len) == 0))
-		return 0;
-	return 1;
+	/*
+	 * We make the assumption here that uppercase characters in the local
+	 * codepage are always the same length as their lowercase counterparts.
+	 *
+	 * If that's ever not the case, then this will fail to match it.
+	 */
+	if (name->len != len)
+		return 1;
+
+	for (i = 0; i < len; i += l1) {
+		/* Convert characters in both strings to UTF-16. */
+		l1 = codepage->char2uni(&str[i], len - i, &c1);
+		l2 = codepage->char2uni(&name->name[i], name->len - i, &c2);
+
+		/*
+		 * If we can't convert either character, just declare it to
+		 * be 1 byte long and compare the original byte.
+		 */
+		if (unlikely(l1 < 0 && l2 < 0)) {
+			if (str[i] != name->name[i])
+				return 1;
+			l1 = 1;
+			continue;
+		}
+
+		/*
+		 * Here, we again ass|u|me that upper/lowercase versions of
+		 * a character are the same length in the local NLS.
+		 */
+		if (l1 != l2)
+			return 1;
+
+		/* Now compare uppercase versions of these characters */
+		if (cifs_toupper(c1) != cifs_toupper(c2))
+			return 1;
+	}
+
+	return 0;
 }
 
 const struct dentry_operations cifs_ci_dentry_ops = {
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 9d0dd952ad79..eb955b525e55 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -313,8 +313,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
 	 * If the server returned a read oplock and we have mandatory brlocks,
 	 * set oplock level to None.
 	 */
-	if (oplock == server->vals->oplock_read &&
-						cifs_has_mand_locks(cinode)) {
+	if (server->ops->is_read_op(oplock) && cifs_has_mand_locks(cinode)) {
 		cifs_dbg(FYI, "Reset oplock val from read to None due to mand locks\n");
 		oplock = 0;
 	}
@@ -324,6 +323,7 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
 		oplock = fid->pending_open->oplock;
 	list_del(&fid->pending_open->olist);
 
+	fid->purge_cache = false;
 	server->ops->set_fid(cfile, fid, oplock);
 
 	list_add(&cfile->tlist, &tcon->openFileList);
@@ -334,6 +334,9 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file,
 		list_add_tail(&cfile->flist, &cinode->openFileList);
 	spin_unlock(&cifs_file_list_lock);
 
+	if (fid->purge_cache)
+		cifs_invalidate_mapping(inode);
+
 	file->private_data = cfile;
 	return cfile;
 }
@@ -1524,12 +1527,12 @@ cifs_setlk(struct file *file, struct file_lock *flock, __u32 type,
 		 * read won't conflict with non-overlapted locks due to
 		 * pagereading.
 		 */
-		if (!CIFS_I(inode)->clientCanCacheAll &&
-					CIFS_I(inode)->clientCanCacheRead) {
+		if (!CIFS_CACHE_WRITE(CIFS_I(inode)) &&
+					CIFS_CACHE_READ(CIFS_I(inode))) {
 			cifs_invalidate_mapping(inode);
 			cifs_dbg(FYI, "Set no oplock for inode=%p due to mand locks\n",
 				 inode);
-			CIFS_I(inode)->clientCanCacheRead = false;
+			CIFS_I(inode)->oplock = 0;
 		}
 
 		rc = server->ops->mand_lock(xid, cfile, flock->fl_start, length,
@@ -2213,7 +2216,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end,
 	cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n",
 		 file->f_path.dentry->d_name.name, datasync);
 
-	if (!CIFS_I(inode)->clientCanCacheRead) {
+	if (!CIFS_CACHE_READ(CIFS_I(inode))) {
 		rc = cifs_invalidate_mapping(inode);
 		if (rc) {
 			cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc);
@@ -2577,7 +2580,7 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
 	ssize_t written;
 
-	if (cinode->clientCanCacheAll) {
+	if (CIFS_CACHE_WRITE(cinode)) {
 		if (cap_unix(tcon->ses) &&
 		(CIFS_UNIX_FCNTL_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))
 		    && ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOPOSIXBRL) == 0))
@@ -2591,7 +2594,7 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
 	 * these pages but not on the region from pos to ppos+len-1.
 	 */
 	written = cifs_user_writev(iocb, iov, nr_segs, pos);
-	if (written > 0 && cinode->clientCanCacheRead) {
+	if (written > 0 && CIFS_CACHE_READ(cinode)) {
 		/*
 		 * Windows 7 server can delay breaking level2 oplock if a write
 		 * request comes - break it on the client to prevent reading
@@ -2600,7 +2603,7 @@ cifs_strict_writev(struct kiocb *iocb, const struct iovec *iov,
 		cifs_invalidate_mapping(inode);
 		cifs_dbg(FYI, "Set no oplock for inode=%p after a write operation\n",
 			 inode);
-		cinode->clientCanCacheRead = false;
+		cinode->oplock = 0;
 	}
 	return written;
 }
@@ -2957,7 +2960,7 @@ cifs_strict_readv(struct kiocb *iocb, const struct iovec *iov,
 	 * on pages affected by this read but not on the region from pos to
 	 * pos+len-1.
 	 */
-	if (!cinode->clientCanCacheRead)
+	if (!CIFS_CACHE_READ(cinode))
 		return cifs_user_readv(iocb, iov, nr_segs, pos);
 
 	if (cap_unix(tcon->ses) &&
@@ -3093,7 +3096,7 @@ int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
 
 	xid = get_xid();
 
-	if (!CIFS_I(inode)->clientCanCacheRead) {
+	if (!CIFS_CACHE_READ(CIFS_I(inode))) {
 		rc = cifs_invalidate_mapping(inode);
 		if (rc)
 			return rc;
@@ -3376,6 +3379,9 @@ static int cifs_readpages(struct file *file, struct address_space *mapping,
 	return rc;
 }
 
+/*
+ * cifs_readpage_worker must be called with the page pinned
+ */
 static int cifs_readpage_worker(struct file *file, struct page *page,
 	loff_t *poffset)
 {
@@ -3387,7 +3393,6 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 	if (rc == 0)
 		goto read_complete;
 
-	page_cache_get(page);
 	read_data = kmap(page);
 	/* for reads over a certain size could initiate async read ahead */
 
@@ -3414,7 +3419,7 @@ static int cifs_readpage_worker(struct file *file, struct page *page,
 
 io_error:
 	kunmap(page);
-	page_cache_release(page);
+	unlock_page(page);
 
 read_complete:
 	return rc;
@@ -3439,8 +3444,6 @@ static int cifs_readpage(struct file *file, struct page *page)
 
 	rc = cifs_readpage_worker(file, page, &offset);
 
-	unlock_page(page);
-
 	free_xid(xid);
 	return rc;
 }
@@ -3494,6 +3497,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 			loff_t pos, unsigned len, unsigned flags,
 			struct page **pagep, void **fsdata)
 {
+	int oncethru = 0;
 	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
 	loff_t offset = pos & (PAGE_CACHE_SIZE - 1);
 	loff_t page_start = pos & PAGE_MASK;
@@ -3503,6 +3507,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 
 	cifs_dbg(FYI, "write_begin from %lld len %d\n", (long long)pos, len);
 
+start:
 	page = grab_cache_page_write_begin(mapping, index, flags);
 	if (!page) {
 		rc = -ENOMEM;
@@ -3526,7 +3531,7 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 	 * is, when the page lies beyond the EOF, or straddles the EOF
 	 * and the write will cover all of the existing data.
 	 */
-	if (CIFS_I(mapping->host)->clientCanCacheRead) {
+	if (CIFS_CACHE_READ(CIFS_I(mapping->host))) {
 		i_size = i_size_read(mapping->host);
 		if (page_start >= i_size ||
 		    (offset == 0 && (pos + len) >= i_size)) {
@@ -3544,13 +3549,16 @@ static int cifs_write_begin(struct file *file, struct address_space *mapping,
 		}
 	}
 
-	if ((file->f_flags & O_ACCMODE) != O_WRONLY) {
+	if ((file->f_flags & O_ACCMODE) != O_WRONLY && !oncethru) {
 		/*
 		 * might as well read a page, it is fast enough. If we get
 		 * an error, we don't need to return it. cifs_write_end will
 		 * do a sync write instead since PG_uptodate isn't set.
 		 */
 		cifs_readpage_worker(file, page, &page_start);
+		page_cache_release(page);
+		oncethru = 1;
+		goto start;
 	} else {
 		/* we could try using another file handle if there is one -
 		   but how would we lock it to prevent close of that handle
@@ -3609,20 +3617,20 @@ void cifs_oplock_break(struct work_struct *work)
 	struct cifs_tcon *tcon = tlink_tcon(cfile->tlink);
 	int rc = 0;
 
-	if (!cinode->clientCanCacheAll && cinode->clientCanCacheRead &&
+	if (!CIFS_CACHE_WRITE(cinode) && CIFS_CACHE_READ(cinode) &&
 						cifs_has_mand_locks(cinode)) {
 		cifs_dbg(FYI, "Reset oplock to None for inode=%p due to mand locks\n",
 			 inode);
-		cinode->clientCanCacheRead = false;
+		cinode->oplock = 0;
 	}
 
 	if (inode && S_ISREG(inode->i_mode)) {
-		if (cinode->clientCanCacheRead)
+		if (CIFS_CACHE_READ(cinode))
 			break_lease(inode, O_RDONLY);
 		else
 			break_lease(inode, O_WRONLY);
 		rc = filemap_fdatawrite(inode->i_mapping);
-		if (cinode->clientCanCacheRead == 0) {
+		if (!CIFS_CACHE_READ(cinode)) {
 			rc = filemap_fdatawait(inode->i_mapping);
 			mapping_set_error(inode->i_mapping, rc);
 			cifs_invalidate_mapping(inode);
diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 449b6cf09b09..f9ff9c173f78 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -101,7 +101,7 @@ cifs_revalidate_cache(struct inode *inode, struct cifs_fattr *fattr)
 	}
 
 	/* don't bother with revalidation if we have an oplock */
-	if (cifs_i->clientCanCacheRead) {
+	if (CIFS_CACHE_READ(cifs_i)) {
 		cifs_dbg(FYI, "%s: inode %llu is oplocked\n",
 			 __func__, cifs_i->uniqueid);
 		return;
@@ -549,6 +549,10 @@ cifs_all_info_to_fattr(struct cifs_fattr *fattr, FILE_ALL_INFO *info,
 		 * when Unix extensions are disabled - fake it.
 		 */
 		fattr->cf_nlink = 2;
+	} else if (fattr->cf_cifsattrs & ATTR_REPARSE) {
+		fattr->cf_mode = S_IFLNK;
+		fattr->cf_dtype = DT_LNK;
+		fattr->cf_nlink = le32_to_cpu(info->NumberOfLinks);
 	} else {
 		fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
 		fattr->cf_dtype = DT_REG;
@@ -646,7 +650,7 @@ cifs_get_inode_info(struct inode **inode, const char *full_path,
 	cifs_dbg(FYI, "Getting info on %s\n", full_path);
 
 	if ((data == NULL) && (*inode != NULL)) {
-		if (CIFS_I(*inode)->clientCanCacheRead) {
+		if (CIFS_CACHE_READ(CIFS_I(*inode))) {
 			cifs_dbg(FYI, "No need to revalidate cached inode sizes\n");
 			goto cgii_exit;
 		}
@@ -1657,7 +1661,7 @@ cifs_inode_needs_reval(struct inode *inode)
 	struct cifsInodeInfo *cifs_i = CIFS_I(inode);
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 
-	if (cifs_i->clientCanCacheRead)
+	if (CIFS_CACHE_READ(cifs_i))
 		return false;
 
 	if (!lookupCacheEnabled)
@@ -1800,7 +1804,7 @@ int cifs_getattr(struct vfsmount *mnt, struct dentry *dentry,
 	 * We need to be sure that all dirty pages are written and the server
 	 * has actual ctime, mtime and file length.
 	 */
-	if (!CIFS_I(inode)->clientCanCacheRead && inode->i_mapping &&
+	if (!CIFS_CACHE_READ(CIFS_I(inode)) && inode->i_mapping &&
 	    inode->i_mapping->nrpages != 0) {
 		rc = filemap_fdatawait(inode->i_mapping);
 		if (rc) {
@@ -1852,14 +1856,11 @@ static int cifs_truncate_page(struct address_space *mapping, loff_t from)
 
 static void cifs_setsize(struct inode *inode, loff_t offset)
 {
-	loff_t oldsize;
-
 	spin_lock(&inode->i_lock);
-	oldsize = inode->i_size;
 	i_size_write(inode, offset);
 	spin_unlock(&inode->i_lock);
 
-	truncate_pagecache(inode, oldsize, offset);
+	truncate_pagecache(inode, offset);
 }
 
 static int
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 562044f700e5..7e36ceba0c7a 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -509,6 +509,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 	struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
 	struct tcon_link *tlink = NULL;
 	struct cifs_tcon *tcon;
+	struct TCP_Server_Info *server;
 
 	xid = get_xid();
 
@@ -519,25 +520,7 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 		goto out;
 	}
 	tcon = tlink_tcon(tlink);
-
-	/*
-	 * For now, we just handle symlinks with unix extensions enabled.
-	 * Eventually we should handle NTFS reparse points, and MacOS
-	 * symlink support. For instance...
-	 *
-	 * rc = CIFSSMBQueryReparseLinkInfo(...)
-	 *
-	 * For now, just return -EACCES when the server doesn't support posix
-	 * extensions. Note that we still allow querying symlinks when posix
-	 * extensions are manually disabled. We could disable these as well
-	 * but there doesn't seem to be any harm in allowing the client to
-	 * read them.
-	 */
-	if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MF_SYMLINKS) &&
-	    !cap_unix(tcon->ses)) {
-		rc = -EACCES;
-		goto out;
-	}
+	server = tcon->ses->server;
 
 	full_path = build_path_from_dentry(direntry);
 	if (!full_path)
@@ -559,6 +542,9 @@ cifs_follow_link(struct dentry *direntry, struct nameidata *nd)
 	if ((rc != 0) && cap_unix(tcon->ses))
 		rc = CIFSSMBUnixQuerySymLink(xid, tcon, full_path, &target_path,
 					     cifs_sb->local_nls);
+	else if (rc != 0 && server->ops->query_symlink)
+		rc = server->ops->query_symlink(xid, tcon, full_path,
+						&target_path, cifs_sb);
 
 	kfree(full_path);
 out:
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index f7d4b2285efe..138a011633fe 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -105,6 +105,7 @@ sesInfoFree(struct cifs_ses *buf_to_free)
 	}
 	kfree(buf_to_free->user_name);
 	kfree(buf_to_free->domainName);
+	kfree(buf_to_free->auth_key.response);
 	kfree(buf_to_free);
 }
 
@@ -545,19 +546,15 @@ void cifs_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
 	oplock &= 0xF;
 
 	if (oplock == OPLOCK_EXCLUSIVE) {
-		cinode->clientCanCacheAll = true;
-		cinode->clientCanCacheRead = true;
+		cinode->oplock = CIFS_CACHE_WRITE_FLG | CIFS_CACHE_READ_FLG;
 		cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
 			 &cinode->vfs_inode);
 	} else if (oplock == OPLOCK_READ) {
-		cinode->clientCanCacheAll = false;
-		cinode->clientCanCacheRead = true;
+		cinode->oplock = CIFS_CACHE_READ_FLG;
 		cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
 			 &cinode->vfs_inode);
-	} else {
-		cinode->clientCanCacheAll = false;
-		cinode->clientCanCacheRead = false;
-	}
+	} else
+		cinode->oplock = 0;
 }
 
 bool
diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 69d2c826a23b..42ef03be089f 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -172,6 +172,9 @@ cifs_fill_common_info(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb)
 		if (cifs_dfs_is_possible(cifs_sb) &&
 		    (fattr->cf_cifsattrs & ATTR_REPARSE))
 			fattr->cf_flags |= CIFS_FATTR_NEED_REVAL;
+	} else if (fattr->cf_cifsattrs & ATTR_REPARSE) {
+		fattr->cf_mode = S_IFLNK;
+		fattr->cf_dtype = DT_LNK;
 	} else {
 		fattr->cf_mode = S_IFREG | cifs_sb->mnt_file_mode;
 		fattr->cf_dtype = DT_REG;
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 08dd37bb23aa..5f99b7f19e78 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -226,7 +226,7 @@ static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
 		*(bcc_ptr+1) = 0;
 	} else {
 		bytes_ret = cifs_strtoUTF16((__le16 *) bcc_ptr, ses->user_name,
-					    MAX_USERNAME_SIZE, nls_cp);
+					    CIFS_MAX_USERNAME_LEN, nls_cp);
 	}
 	bcc_ptr += 2 * bytes_ret;
 	bcc_ptr += 2; /* account for null termination */
@@ -246,8 +246,8 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses,
 	/* BB what about null user mounts - check that we do this BB */
 	/* copy user */
 	if (ses->user_name != NULL) {
-		strncpy(bcc_ptr, ses->user_name, MAX_USERNAME_SIZE);
-		bcc_ptr += strnlen(ses->user_name, MAX_USERNAME_SIZE);
+		strncpy(bcc_ptr, ses->user_name, CIFS_MAX_USERNAME_LEN);
+		bcc_ptr += strnlen(ses->user_name, CIFS_MAX_USERNAME_LEN);
 	}
 	/* else null user mount */
 	*bcc_ptr = 0;
@@ -428,7 +428,8 @@ void build_ntlmssp_negotiate_blob(unsigned char *pbuffer,
 		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
 	if (ses->server->sign) {
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
-		if (!ses->server->session_estab)
+		if (!ses->server->session_estab ||
+				ses->ntlmssp->sesskey_per_smbsess)
 			flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
 	}
 
@@ -466,7 +467,8 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 		NTLMSSP_NEGOTIATE_NTLM | NTLMSSP_NEGOTIATE_EXTENDED_SEC;
 	if (ses->server->sign) {
 		flags |= NTLMSSP_NEGOTIATE_SIGN;
-		if (!ses->server->session_estab)
+		if (!ses->server->session_estab ||
+				ses->ntlmssp->sesskey_per_smbsess)
 			flags |= NTLMSSP_NEGOTIATE_KEY_XCH;
 	}
 
@@ -501,7 +503,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	} else {
 		int len;
 		len = cifs_strtoUTF16((__le16 *)tmp, ses->domainName,
-				      MAX_USERNAME_SIZE, nls_cp);
+				      CIFS_MAX_USERNAME_LEN, nls_cp);
 		len *= 2; /* unicode is 2 bytes each */
 		sec_blob->DomainName.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->DomainName.Length = cpu_to_le16(len);
@@ -517,7 +519,7 @@ int build_ntlmssp_auth_blob(unsigned char *pbuffer,
 	} else {
 		int len;
 		len = cifs_strtoUTF16((__le16 *)tmp, ses->user_name,
-				      MAX_USERNAME_SIZE, nls_cp);
+				      CIFS_MAX_USERNAME_LEN, nls_cp);
 		len *= 2; /* unicode is 2 bytes each */
 		sec_blob->UserName.BufferOffset = cpu_to_le32(tmp - pbuffer);
 		sec_blob->UserName.Length = cpu_to_le16(len);
@@ -629,7 +631,8 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
 	type = select_sectype(ses->server, ses->sectype);
 	cifs_dbg(FYI, "sess setup type %d\n", type);
 	if (type == Unspecified) {
-		cifs_dbg(VFS, "Unable to select appropriate authentication method!");
+		cifs_dbg(VFS,
+			"Unable to select appropriate authentication method!");
 		return -EINVAL;
 	}
 
@@ -640,6 +643,8 @@ CIFS_SessSetup(const unsigned int xid, struct cifs_ses *ses,
 		ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
 		if (!ses->ntlmssp)
 			return -ENOMEM;
+		ses->ntlmssp->sesskey_per_smbsess = false;
+
 	}
 
 ssetup_ntlmssp_authenticate:
@@ -815,8 +820,9 @@ ssetup_ntlmssp_authenticate:
 		ses->auth_key.response = kmemdup(msg->data, msg->sesskey_len,
 						 GFP_KERNEL);
 		if (!ses->auth_key.response) {
-			cifs_dbg(VFS, "Kerberos can't allocate (%u bytes) memory",
-					msg->sesskey_len);
+			cifs_dbg(VFS,
+				"Kerberos can't allocate (%u bytes) memory",
+				msg->sesskey_len);
 			rc = -ENOMEM;
 			goto ssetup_exit;
 		}
@@ -1005,5 +1011,37 @@ ssetup_exit:
 	if ((phase == NtLmChallenge) && (rc == 0))
 		goto ssetup_ntlmssp_authenticate;
 
+	if (!rc) {
+		mutex_lock(&ses->server->srv_mutex);
+		if (!ses->server->session_estab) {
+			if (ses->server->sign) {
+				ses->server->session_key.response =
+					kmemdup(ses->auth_key.response,
+					ses->auth_key.len, GFP_KERNEL);
+				if (!ses->server->session_key.response) {
+					rc = -ENOMEM;
+					mutex_unlock(&ses->server->srv_mutex);
+					goto keycp_exit;
+				}
+				ses->server->session_key.len =
+							ses->auth_key.len;
+			}
+			ses->server->sequence_number = 0x2;
+			ses->server->session_estab = true;
+		}
+		mutex_unlock(&ses->server->srv_mutex);
+
+		cifs_dbg(FYI, "CIFS session established successfully\n");
+		spin_lock(&GlobalMid_Lock);
+		ses->status = CifsGood;
+		ses->need_reconnect = false;
+		spin_unlock(&GlobalMid_Lock);
+	}
+
+keycp_exit:
+	kfree(ses->auth_key.response);
+	ses->auth_key.response = NULL;
+	kfree(ses->ntlmssp);
+
 	return rc;
 }
diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c
index 60943978aec3..8233b174de3d 100644
--- a/fs/cifs/smb1ops.c
+++ b/fs/cifs/smb1ops.c
@@ -700,7 +700,7 @@ cifs_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
 	struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
 	cfile->fid.netfid = fid->netfid;
 	cifs_set_oplock_level(cinode, oplock);
-	cinode->can_cache_brlcks = cinode->clientCanCacheAll;
+	cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode);
 }
 
 static void
@@ -837,7 +837,7 @@ cifs_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
 {
 	return CIFSSMBLock(0, tcon, fid->netfid, current->tgid, 0, 0, 0, 0,
 			   LOCKING_ANDX_OPLOCK_RELEASE, false,
-			   cinode->clientCanCacheRead ? 1 : 0);
+			   CIFS_CACHE_READ(cinode) ? 1 : 0);
 }
 
 static int
@@ -881,6 +881,43 @@ cifs_mand_lock(const unsigned int xid, struct cifsFileInfo *cfile, __u64 offset,
 			   (__u8)type, wait, 0);
 }
 
+static int
+cifs_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
+		   const char *full_path, char **target_path,
+		   struct cifs_sb_info *cifs_sb)
+{
+	int rc;
+	int oplock = 0;
+	__u16 netfid;
+
+	cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
+
+	rc = CIFSSMBOpen(xid, tcon, full_path, FILE_OPEN,
+			 FILE_READ_ATTRIBUTES, OPEN_REPARSE_POINT, &netfid,
+			 &oplock, NULL, cifs_sb->local_nls,
+			 cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+	if (rc)
+		return rc;
+
+	rc = CIFSSMBQuerySymLink(xid, tcon, netfid, target_path,
+				 cifs_sb->local_nls);
+	if (rc) {
+		CIFSSMBClose(xid, tcon, netfid);
+		return rc;
+	}
+
+	convert_delimiter(*target_path, '/');
+	CIFSSMBClose(xid, tcon, netfid);
+	cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+	return rc;
+}
+
+static bool
+cifs_is_read_op(__u32 oplock)
+{
+	return oplock == OPLOCK_READ;
+}
+
 struct smb_version_operations smb1_operations = {
 	.send_cancel = send_nt_cancel,
 	.compare_fids = cifs_compare_fids,
@@ -927,6 +964,7 @@ struct smb_version_operations smb1_operations = {
 	.rename_pending_delete = cifs_rename_pending_delete,
 	.rename = CIFSSMBRename,
 	.create_hardlink = CIFSCreateHardLink,
+	.query_symlink = cifs_query_symlink,
 	.open = cifs_open_file,
 	.set_fid = cifs_set_fid,
 	.close = cifs_close_file,
@@ -945,6 +983,7 @@ struct smb_version_operations smb1_operations = {
 	.mand_unlock_range = cifs_unlock_range,
 	.push_mand_locks = cifs_push_mandatory_locks,
 	.query_mf_symlink = open_query_close_cifs_symlink,
+	.is_read_op = cifs_is_read_op,
 };
 
 struct smb_version_values smb1_values = {
@@ -960,7 +999,6 @@ struct smb_version_values smb1_values = {
 	.cap_unix = CAP_UNIX,
 	.cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND,
 	.cap_large_files = CAP_LARGE_FILES,
-	.oplock_read = OPLOCK_READ,
 	.signing_enabled = SECMODE_SIGN_ENABLED,
 	.signing_required = SECMODE_SIGN_REQUIRED,
 };
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 04a81a4142c3..3f17b4550831 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -34,29 +34,6 @@
 #include "fscache.h"
 #include "smb2proto.h"
 
-void
-smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock)
-{
-	oplock &= 0xFF;
-	if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
-		return;
-	if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE ||
-	    oplock == SMB2_OPLOCK_LEVEL_BATCH) {
-		cinode->clientCanCacheAll = true;
-		cinode->clientCanCacheRead = true;
-		cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
-			 &cinode->vfs_inode);
-	} else if (oplock == SMB2_OPLOCK_LEVEL_II) {
-		cinode->clientCanCacheAll = false;
-		cinode->clientCanCacheRead = true;
-		cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
-			 &cinode->vfs_inode);
-	} else {
-		cinode->clientCanCacheAll = false;
-		cinode->clientCanCacheRead = false;
-	}
-}
-
 int
 smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
 	       __u32 *oplock, FILE_ALL_INFO *buf)
@@ -86,7 +63,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
 	if (oparms->tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING)
 		memcpy(smb2_oplock + 1, fid->lease_key, SMB2_LEASE_KEY_SIZE);
 
-	rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data);
+	rc = SMB2_open(xid, oparms, smb2_path, smb2_oplock, smb2_data, NULL);
 	if (rc)
 		goto out;
 
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index c6ec1633309a..78ff88c467b9 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -60,7 +60,7 @@ smb2_open_op_close(const unsigned int xid, struct cifs_tcon *tcon,
 	oparms.fid = &fid;
 	oparms.reconnect = false;
 
-	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
+	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
 	if (rc) {
 		kfree(utf16_path);
 		return rc;
@@ -136,7 +136,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
 		return -ENOMEM;
 
 	rc = smb2_open_op_close(xid, tcon, cifs_sb, full_path,
-				FILE_READ_ATTRIBUTES, FILE_OPEN, 0, smb2_data,
+				FILE_READ_ATTRIBUTES, FILE_OPEN,
+				OPEN_REPARSE_POINT, smb2_data,
 				SMB2_OP_QUERY_INFO);
 	if (rc)
 		goto out;
@@ -191,8 +192,8 @@ smb2_unlink(const unsigned int xid, struct cifs_tcon *tcon, const char *name,
 	    struct cifs_sb_info *cifs_sb)
 {
 	return smb2_open_op_close(xid, tcon, cifs_sb, name, DELETE, FILE_OPEN,
-				  CREATE_DELETE_ON_CLOSE, NULL,
-				  SMB2_OP_DELETE);
+				  CREATE_DELETE_ON_CLOSE | OPEN_REPARSE_POINT,
+				  NULL, SMB2_OP_DELETE);
 }
 
 static int
diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c
index b0c43345cd98..fb3966265b6e 100644
--- a/fs/cifs/smb2misc.c
+++ b/fs/cifs/smb2misc.c
@@ -171,6 +171,10 @@ smb2_check_message(char *buf, unsigned int length)
 	if (4 + len != clc_len) {
 		cifs_dbg(FYI, "Calculated size %u length %u mismatch mid %llu\n",
 			 clc_len, 4 + len, mid);
+		/* create failed on symlink */
+		if (command == SMB2_CREATE_HE &&
+		    hdr->Status == STATUS_STOPPED_ON_SYMLINK)
+			return 0;
 		/* Windows 7 server returns 24 bytes more */
 		if (clc_len + 20 == len && command == SMB2_OPLOCK_BREAK_HE)
 			return 0;
@@ -376,23 +380,15 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb)
 __le32
 smb2_get_lease_state(struct cifsInodeInfo *cinode)
 {
-	if (cinode->clientCanCacheAll)
-		return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING;
-	else if (cinode->clientCanCacheRead)
-		return SMB2_LEASE_READ_CACHING;
-	return 0;
-}
-
-__u8 smb2_map_lease_to_oplock(__le32 lease_state)
-{
-	if (lease_state & SMB2_LEASE_WRITE_CACHING) {
-		if (lease_state & SMB2_LEASE_HANDLE_CACHING)
-			return SMB2_OPLOCK_LEVEL_BATCH;
-		else
-			return SMB2_OPLOCK_LEVEL_EXCLUSIVE;
-	} else if (lease_state & SMB2_LEASE_READ_CACHING)
-		return SMB2_OPLOCK_LEVEL_II;
-	return 0;
+	__le32 lease = 0;
+
+	if (CIFS_CACHE_WRITE(cinode))
+		lease |= SMB2_LEASE_WRITE_CACHING;
+	if (CIFS_CACHE_HANDLE(cinode))
+		lease |= SMB2_LEASE_HANDLE_CACHING;
+	if (CIFS_CACHE_READ(cinode))
+		lease |= SMB2_LEASE_READ_CACHING;
+	return lease;
 }
 
 struct smb2_lease_break_work {
@@ -417,96 +413,109 @@ cifs_ses_oplock_break(struct work_struct *work)
 }
 
 static bool
-smb2_is_valid_lease_break(char *buffer, struct TCP_Server_Info *server)
+smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp,
+		    struct smb2_lease_break_work *lw)
 {
-	struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer;
-	struct list_head *tmp, *tmp1, *tmp2;
-	struct cifs_ses *ses;
-	struct cifs_tcon *tcon;
-	struct cifsInodeInfo *cinode;
+	bool found;
+	__u8 lease_state;
+	struct list_head *tmp;
 	struct cifsFileInfo *cfile;
+	struct TCP_Server_Info *server = tcon->ses->server;
 	struct cifs_pending_open *open;
-	struct smb2_lease_break_work *lw;
-	bool found;
+	struct cifsInodeInfo *cinode;
 	int ack_req = le32_to_cpu(rsp->Flags &
 				  SMB2_NOTIFY_BREAK_LEASE_FLAG_ACK_REQUIRED);
 
-	lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL);
-	if (!lw)
-		return false;
+	lease_state = le32_to_cpu(rsp->NewLeaseState);
 
-	INIT_WORK(&lw->lease_break, cifs_ses_oplock_break);
-	lw->lease_state = rsp->NewLeaseState;
+	list_for_each(tmp, &tcon->openFileList) {
+		cfile = list_entry(tmp, struct cifsFileInfo, tlist);
+		cinode = CIFS_I(cfile->dentry->d_inode);
 
-	cifs_dbg(FYI, "Checking for lease break\n");
+		if (memcmp(cinode->lease_key, rsp->LeaseKey,
+							SMB2_LEASE_KEY_SIZE))
+			continue;
 
-	/* look up tcon based on tid & uid */
-	spin_lock(&cifs_tcp_ses_lock);
-	list_for_each(tmp, &server->smb_ses_list) {
-		ses = list_entry(tmp, struct cifs_ses, smb_ses_list);
+		cifs_dbg(FYI, "found in the open list\n");
+		cifs_dbg(FYI, "lease key match, lease break 0x%d\n",
+			 le32_to_cpu(rsp->NewLeaseState));
 
-		spin_lock(&cifs_file_list_lock);
-		list_for_each(tmp1, &ses->tcon_list) {
-			tcon = list_entry(tmp1, struct cifs_tcon, tcon_list);
+		server->ops->set_oplock_level(cinode, lease_state, 0, NULL);
 
-			cifs_stats_inc(&tcon->stats.cifs_stats.num_oplock_brks);
-			list_for_each(tmp2, &tcon->openFileList) {
-				cfile = list_entry(tmp2, struct cifsFileInfo,
-						   tlist);
-				cinode = CIFS_I(cfile->dentry->d_inode);
+		if (ack_req)
+			cfile->oplock_break_cancelled = false;
+		else
+			cfile->oplock_break_cancelled = true;
 
-				if (memcmp(cinode->lease_key, rsp->LeaseKey,
-					   SMB2_LEASE_KEY_SIZE))
-					continue;
+		queue_work(cifsiod_wq, &cfile->oplock_break);
+		kfree(lw);
+		return true;
+	}
 
-				cifs_dbg(FYI, "found in the open list\n");
-				cifs_dbg(FYI, "lease key match, lease break 0x%d\n",
-					 le32_to_cpu(rsp->NewLeaseState));
+	found = false;
+	list_for_each_entry(open, &tcon->pending_opens, olist) {
+		if (memcmp(open->lease_key, rsp->LeaseKey,
+			   SMB2_LEASE_KEY_SIZE))
+			continue;
+
+		if (!found && ack_req) {
+			found = true;
+			memcpy(lw->lease_key, open->lease_key,
+			       SMB2_LEASE_KEY_SIZE);
+			lw->tlink = cifs_get_tlink(open->tlink);
+			queue_work(cifsiod_wq, &lw->lease_break);
+		}
 
-				smb2_set_oplock_level(cinode,
-				  smb2_map_lease_to_oplock(rsp->NewLeaseState));
+		cifs_dbg(FYI, "found in the pending open list\n");
+		cifs_dbg(FYI, "lease key match, lease break 0x%d\n",
+			 le32_to_cpu(rsp->NewLeaseState));
 
-				if (ack_req)
-					cfile->oplock_break_cancelled = false;
-				else
-					cfile->oplock_break_cancelled = true;
+		open->oplock = lease_state;
+	}
+	return found;
+}
 
-				queue_work(cifsiod_wq, &cfile->oplock_break);
+static bool
+smb2_is_valid_lease_break(char *buffer)
+{
+	struct smb2_lease_break *rsp = (struct smb2_lease_break *)buffer;
+	struct list_head *tmp, *tmp1, *tmp2;
+	struct TCP_Server_Info *server;
+	struct cifs_ses *ses;
+	struct cifs_tcon *tcon;
+	struct smb2_lease_break_work *lw;
 
-				spin_unlock(&cifs_file_list_lock);
-				spin_unlock(&cifs_tcp_ses_lock);
-				return true;
-			}
+	lw = kmalloc(sizeof(struct smb2_lease_break_work), GFP_KERNEL);
+	if (!lw)
+		return false;
 
-			found = false;
-			list_for_each_entry(open, &tcon->pending_opens, olist) {
-				if (memcmp(open->lease_key, rsp->LeaseKey,
-					   SMB2_LEASE_KEY_SIZE))
-					continue;
+	INIT_WORK(&lw->lease_break, cifs_ses_oplock_break);
+	lw->lease_state = rsp->NewLeaseState;
 
-				if (!found && ack_req) {
-					found = true;
-					memcpy(lw->lease_key, open->lease_key,
-					       SMB2_LEASE_KEY_SIZE);
-					lw->tlink = cifs_get_tlink(open->tlink);
-					queue_work(cifsiod_wq,
-						   &lw->lease_break);
-				}
+	cifs_dbg(FYI, "Checking for lease break\n");
 
-				cifs_dbg(FYI, "found in the pending open list\n");
-				cifs_dbg(FYI, "lease key match, lease break 0x%d\n",
-					 le32_to_cpu(rsp->NewLeaseState));
+	/* look up tcon based on tid & uid */
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each(tmp, &cifs_tcp_ses_list) {
+		server = list_entry(tmp, struct TCP_Server_Info, tcp_ses_list);
 
-				open->oplock =
-				  smb2_map_lease_to_oplock(rsp->NewLeaseState);
-			}
-			if (found) {
-				spin_unlock(&cifs_file_list_lock);
-				spin_unlock(&cifs_tcp_ses_lock);
-				return true;
+		list_for_each(tmp1, &server->smb_ses_list) {
+			ses = list_entry(tmp1, struct cifs_ses, smb_ses_list);
+
+			spin_lock(&cifs_file_list_lock);
+			list_for_each(tmp2, &ses->tcon_list) {
+				tcon = list_entry(tmp2, struct cifs_tcon,
+						  tcon_list);
+				cifs_stats_inc(
+				    &tcon->stats.cifs_stats.num_oplock_brks);
+				if (smb2_tcon_has_lease(tcon, rsp, lw)) {
+					spin_unlock(&cifs_file_list_lock);
+					spin_unlock(&cifs_tcp_ses_lock);
+					return true;
+				}
 			}
+			spin_unlock(&cifs_file_list_lock);
 		}
-		spin_unlock(&cifs_file_list_lock);
 	}
 	spin_unlock(&cifs_tcp_ses_lock);
 	kfree(lw);
@@ -532,7 +541,7 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 	if (rsp->StructureSize !=
 				smb2_rsp_struct_sizes[SMB2_OPLOCK_BREAK_HE]) {
 		if (le16_to_cpu(rsp->StructureSize) == 44)
-			return smb2_is_valid_lease_break(buffer, server);
+			return smb2_is_valid_lease_break(buffer);
 		else
 			return false;
 	}
@@ -560,14 +569,15 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server)
 				cifs_dbg(FYI, "file id match, oplock break\n");
 				cinode = CIFS_I(cfile->dentry->d_inode);
 
-				if (!cinode->clientCanCacheAll &&
+				if (!CIFS_CACHE_WRITE(cinode) &&
 				    rsp->OplockLevel == SMB2_OPLOCK_LEVEL_NONE)
 					cfile->oplock_break_cancelled = true;
 				else
 					cfile->oplock_break_cancelled = false;
 
-				smb2_set_oplock_level(cinode,
-				  rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0);
+				server->ops->set_oplock_level(cinode,
+				  rsp->OplockLevel ? SMB2_OPLOCK_LEVEL_II : 0,
+				  0, NULL);
 
 				queue_work(cifsiod_wq, &cfile->oplock_break);
 
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index f259e6cc8357..861b33214144 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -24,6 +24,7 @@
 #include "smb2proto.h"
 #include "cifsproto.h"
 #include "cifs_debug.h"
+#include "cifs_unicode.h"
 #include "smb2status.h"
 #include "smb2glob.h"
 
@@ -229,7 +230,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon,
 	oparms.fid = &fid;
 	oparms.reconnect = false;
 
-	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
+	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
 	if (rc) {
 		kfree(utf16_path);
 		return rc;
@@ -376,10 +377,13 @@ static void
 smb2_set_fid(struct cifsFileInfo *cfile, struct cifs_fid *fid, __u32 oplock)
 {
 	struct cifsInodeInfo *cinode = CIFS_I(cfile->dentry->d_inode);
+	struct TCP_Server_Info *server = tlink_tcon(cfile->tlink)->ses->server;
+
 	cfile->fid.persistent_fid = fid->persistent_fid;
 	cfile->fid.volatile_fid = fid->volatile_fid;
-	smb2_set_oplock_level(cinode, oplock);
-	cinode->can_cache_brlcks = cinode->clientCanCacheAll;
+	server->ops->set_oplock_level(cinode, oplock, fid->epoch,
+				      &fid->purge_cache);
+	cinode->can_cache_brlcks = CIFS_CACHE_WRITE(cinode);
 }
 
 static void
@@ -463,7 +467,7 @@ smb2_query_dir_first(const unsigned int xid, struct cifs_tcon *tcon,
 	oparms.fid = fid;
 	oparms.reconnect = false;
 
-	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL);
+	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL);
 	kfree(utf16_path);
 	if (rc) {
 		cifs_dbg(VFS, "open dir failed\n");
@@ -530,7 +534,7 @@ smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid,
 
 	return SMB2_oplock_break(0, tcon, fid->persistent_fid,
 				 fid->volatile_fid,
-				 cinode->clientCanCacheRead ? 1 : 0);
+				 CIFS_CACHE_READ(cinode) ? 1 : 0);
 }
 
 static int
@@ -550,7 +554,7 @@ smb2_queryfs(const unsigned int xid, struct cifs_tcon *tcon,
 	oparms.fid = &fid;
 	oparms.reconnect = false;
 
-	rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL);
+	rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL);
 	if (rc)
 		return rc;
 	buf->f_type = SMB2_MAGIC_NUMBER;
@@ -596,7 +600,245 @@ smb2_new_lease_key(struct cifs_fid *fid)
 	get_random_bytes(fid->lease_key, SMB2_LEASE_KEY_SIZE);
 }
 
-struct smb_version_operations smb21_operations = {
+static int
+smb2_query_symlink(const unsigned int xid, struct cifs_tcon *tcon,
+		   const char *full_path, char **target_path,
+		   struct cifs_sb_info *cifs_sb)
+{
+	int rc;
+	__le16 *utf16_path;
+	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
+	struct cifs_open_parms oparms;
+	struct cifs_fid fid;
+	struct smb2_err_rsp *err_buf = NULL;
+	struct smb2_symlink_err_rsp *symlink;
+	unsigned int sub_len, sub_offset;
+
+	cifs_dbg(FYI, "%s: path: %s\n", __func__, full_path);
+
+	utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb);
+	if (!utf16_path)
+		return -ENOMEM;
+
+	oparms.tcon = tcon;
+	oparms.desired_access = FILE_READ_ATTRIBUTES;
+	oparms.disposition = FILE_OPEN;
+	oparms.create_options = 0;
+	oparms.fid = &fid;
+	oparms.reconnect = false;
+
+	rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, &err_buf);
+
+	if (!rc || !err_buf) {
+		kfree(utf16_path);
+		return -ENOENT;
+	}
+	/* open must fail on symlink - reset rc */
+	rc = 0;
+	symlink = (struct smb2_symlink_err_rsp *)err_buf->ErrorData;
+	sub_len = le16_to_cpu(symlink->SubstituteNameLength);
+	sub_offset = le16_to_cpu(symlink->SubstituteNameOffset);
+	*target_path = cifs_strndup_from_utf16(
+				(char *)symlink->PathBuffer + sub_offset,
+				sub_len, true, cifs_sb->local_nls);
+	if (!(*target_path)) {
+		kfree(utf16_path);
+		return -ENOMEM;
+	}
+	convert_delimiter(*target_path, '/');
+	cifs_dbg(FYI, "%s: target path: %s\n", __func__, *target_path);
+	kfree(utf16_path);
+	return rc;
+}
+
+static void
+smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
+		      unsigned int epoch, bool *purge_cache)
+{
+	oplock &= 0xFF;
+	if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
+		return;
+	if (oplock == SMB2_OPLOCK_LEVEL_BATCH) {
+		cinode->oplock = CIFS_CACHE_RHW_FLG;
+		cifs_dbg(FYI, "Batch Oplock granted on inode %p\n",
+			 &cinode->vfs_inode);
+	} else if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE) {
+		cinode->oplock = CIFS_CACHE_RW_FLG;
+		cifs_dbg(FYI, "Exclusive Oplock granted on inode %p\n",
+			 &cinode->vfs_inode);
+	} else if (oplock == SMB2_OPLOCK_LEVEL_II) {
+		cinode->oplock = CIFS_CACHE_READ_FLG;
+		cifs_dbg(FYI, "Level II Oplock granted on inode %p\n",
+			 &cinode->vfs_inode);
+	} else
+		cinode->oplock = 0;
+}
+
+static void
+smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
+		       unsigned int epoch, bool *purge_cache)
+{
+	char message[5] = {0};
+
+	oplock &= 0xFF;
+	if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE)
+		return;
+
+	cinode->oplock = 0;
+	if (oplock & SMB2_LEASE_READ_CACHING_HE) {
+		cinode->oplock |= CIFS_CACHE_READ_FLG;
+		strcat(message, "R");
+	}
+	if (oplock & SMB2_LEASE_HANDLE_CACHING_HE) {
+		cinode->oplock |= CIFS_CACHE_HANDLE_FLG;
+		strcat(message, "H");
+	}
+	if (oplock & SMB2_LEASE_WRITE_CACHING_HE) {
+		cinode->oplock |= CIFS_CACHE_WRITE_FLG;
+		strcat(message, "W");
+	}
+	if (!cinode->oplock)
+		strcat(message, "None");
+	cifs_dbg(FYI, "%s Lease granted on inode %p\n", message,
+		 &cinode->vfs_inode);
+}
+
+static void
+smb3_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock,
+		      unsigned int epoch, bool *purge_cache)
+{
+	unsigned int old_oplock = cinode->oplock;
+
+	smb21_set_oplock_level(cinode, oplock, epoch, purge_cache);
+
+	if (purge_cache) {
+		*purge_cache = false;
+		if (old_oplock == CIFS_CACHE_READ_FLG) {
+			if (cinode->oplock == CIFS_CACHE_READ_FLG &&
+			    (epoch - cinode->epoch > 0))
+				*purge_cache = true;
+			else if (cinode->oplock == CIFS_CACHE_RH_FLG &&
+				 (epoch - cinode->epoch > 1))
+				*purge_cache = true;
+			else if (cinode->oplock == CIFS_CACHE_RHW_FLG &&
+				 (epoch - cinode->epoch > 1))
+				*purge_cache = true;
+			else if (cinode->oplock == 0 &&
+				 (epoch - cinode->epoch > 0))
+				*purge_cache = true;
+		} else if (old_oplock == CIFS_CACHE_RH_FLG) {
+			if (cinode->oplock == CIFS_CACHE_RH_FLG &&
+			    (epoch - cinode->epoch > 0))
+				*purge_cache = true;
+			else if (cinode->oplock == CIFS_CACHE_RHW_FLG &&
+				 (epoch - cinode->epoch > 1))
+				*purge_cache = true;
+		}
+		cinode->epoch = epoch;
+	}
+}
+
+static bool
+smb2_is_read_op(__u32 oplock)
+{
+	return oplock == SMB2_OPLOCK_LEVEL_II;
+}
+
+static bool
+smb21_is_read_op(__u32 oplock)
+{
+	return (oplock & SMB2_LEASE_READ_CACHING_HE) &&
+	       !(oplock & SMB2_LEASE_WRITE_CACHING_HE);
+}
+
+static __le32
+map_oplock_to_lease(u8 oplock)
+{
+	if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE)
+		return SMB2_LEASE_WRITE_CACHING | SMB2_LEASE_READ_CACHING;
+	else if (oplock == SMB2_OPLOCK_LEVEL_II)
+		return SMB2_LEASE_READ_CACHING;
+	else if (oplock == SMB2_OPLOCK_LEVEL_BATCH)
+		return SMB2_LEASE_HANDLE_CACHING | SMB2_LEASE_READ_CACHING |
+		       SMB2_LEASE_WRITE_CACHING;
+	return 0;
+}
+
+static char *
+smb2_create_lease_buf(u8 *lease_key, u8 oplock)
+{
+	struct create_lease *buf;
+
+	buf = kzalloc(sizeof(struct create_lease), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key));
+	buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8)));
+	buf->lcontext.LeaseState = map_oplock_to_lease(oplock);
+
+	buf->ccontext.DataOffset = cpu_to_le16(offsetof
+					(struct create_lease, lcontext));
+	buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context));
+	buf->ccontext.NameOffset = cpu_to_le16(offsetof
+				(struct create_lease, Name));
+	buf->ccontext.NameLength = cpu_to_le16(4);
+	buf->Name[0] = 'R';
+	buf->Name[1] = 'q';
+	buf->Name[2] = 'L';
+	buf->Name[3] = 's';
+	return (char *)buf;
+}
+
+static char *
+smb3_create_lease_buf(u8 *lease_key, u8 oplock)
+{
+	struct create_lease_v2 *buf;
+
+	buf = kzalloc(sizeof(struct create_lease_v2), GFP_KERNEL);
+	if (!buf)
+		return NULL;
+
+	buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key));
+	buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8)));
+	buf->lcontext.LeaseState = map_oplock_to_lease(oplock);
+
+	buf->ccontext.DataOffset = cpu_to_le16(offsetof
+					(struct create_lease_v2, lcontext));
+	buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context_v2));
+	buf->ccontext.NameOffset = cpu_to_le16(offsetof
+				(struct create_lease_v2, Name));
+	buf->ccontext.NameLength = cpu_to_le16(4);
+	buf->Name[0] = 'R';
+	buf->Name[1] = 'q';
+	buf->Name[2] = 'L';
+	buf->Name[3] = 's';
+	return (char *)buf;
+}
+
+static __u8
+smb2_parse_lease_buf(void *buf, unsigned int *epoch)
+{
+	struct create_lease *lc = (struct create_lease *)buf;
+
+	*epoch = 0; /* not used */
+	if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
+		return SMB2_OPLOCK_LEVEL_NOCHANGE;
+	return le32_to_cpu(lc->lcontext.LeaseState);
+}
+
+static __u8
+smb3_parse_lease_buf(void *buf, unsigned int *epoch)
+{
+	struct create_lease_v2 *lc = (struct create_lease_v2 *)buf;
+
+	*epoch = le16_to_cpu(lc->lcontext.Epoch);
+	if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
+		return SMB2_OPLOCK_LEVEL_NOCHANGE;
+	return le32_to_cpu(lc->lcontext.LeaseState);
+}
+
+struct smb_version_operations smb20_operations = {
 	.compare_fids = smb2_compare_fids,
 	.setup_request = smb2_setup_request,
 	.setup_async_request = smb2_setup_async_request,
@@ -638,6 +880,7 @@ struct smb_version_operations smb21_operations = {
 	.unlink = smb2_unlink,
 	.rename = smb2_rename_path,
 	.create_hardlink = smb2_create_hardlink,
+	.query_symlink = smb2_query_symlink,
 	.open = smb2_open_file,
 	.set_fid = smb2_set_fid,
 	.close = smb2_close_file,
@@ -660,8 +903,82 @@ struct smb_version_operations smb21_operations = {
 	.set_lease_key = smb2_set_lease_key,
 	.new_lease_key = smb2_new_lease_key,
 	.calc_signature = smb2_calc_signature,
+	.is_read_op = smb2_is_read_op,
+	.set_oplock_level = smb2_set_oplock_level,
+	.create_lease_buf = smb2_create_lease_buf,
+	.parse_lease_buf = smb2_parse_lease_buf,
 };
 
+struct smb_version_operations smb21_operations = {
+	.compare_fids = smb2_compare_fids,
+	.setup_request = smb2_setup_request,
+	.setup_async_request = smb2_setup_async_request,
+	.check_receive = smb2_check_receive,
+	.add_credits = smb2_add_credits,
+	.set_credits = smb2_set_credits,
+	.get_credits_field = smb2_get_credits_field,
+	.get_credits = smb2_get_credits,
+	.get_next_mid = smb2_get_next_mid,
+	.read_data_offset = smb2_read_data_offset,
+	.read_data_length = smb2_read_data_length,
+	.map_error = map_smb2_to_linux_error,
+	.find_mid = smb2_find_mid,
+	.check_message = smb2_check_message,
+	.dump_detail = smb2_dump_detail,
+	.clear_stats = smb2_clear_stats,
+	.print_stats = smb2_print_stats,
+	.is_oplock_break = smb2_is_valid_oplock_break,
+	.need_neg = smb2_need_neg,
+	.negotiate = smb2_negotiate,
+	.negotiate_wsize = smb2_negotiate_wsize,
+	.negotiate_rsize = smb2_negotiate_rsize,
+	.sess_setup = SMB2_sess_setup,
+	.logoff = SMB2_logoff,
+	.tree_connect = SMB2_tcon,
+	.tree_disconnect = SMB2_tdis,
+	.is_path_accessible = smb2_is_path_accessible,
+	.can_echo = smb2_can_echo,
+	.echo = SMB2_echo,
+	.query_path_info = smb2_query_path_info,
+	.get_srv_inum = smb2_get_srv_inum,
+	.query_file_info = smb2_query_file_info,
+	.set_path_size = smb2_set_path_size,
+	.set_file_size = smb2_set_file_size,
+	.set_file_info = smb2_set_file_info,
+	.mkdir = smb2_mkdir,
+	.mkdir_setinfo = smb2_mkdir_setinfo,
+	.rmdir = smb2_rmdir,
+	.unlink = smb2_unlink,
+	.rename = smb2_rename_path,
+	.create_hardlink = smb2_create_hardlink,
+	.query_symlink = smb2_query_symlink,
+	.open = smb2_open_file,
+	.set_fid = smb2_set_fid,
+	.close = smb2_close_file,
+	.flush = smb2_flush_file,
+	.async_readv = smb2_async_readv,
+	.async_writev = smb2_async_writev,
+	.sync_read = smb2_sync_read,
+	.sync_write = smb2_sync_write,
+	.query_dir_first = smb2_query_dir_first,
+	.query_dir_next = smb2_query_dir_next,
+	.close_dir = smb2_close_dir,
+	.calc_smb_size = smb2_calc_size,
+	.is_status_pending = smb2_is_status_pending,
+	.oplock_response = smb2_oplock_response,
+	.queryfs = smb2_queryfs,
+	.mand_lock = smb2_mand_lock,
+	.mand_unlock_range = smb2_unlock_range,
+	.push_mand_locks = smb2_push_mandatory_locks,
+	.get_lease_key = smb2_get_lease_key,
+	.set_lease_key = smb2_set_lease_key,
+	.new_lease_key = smb2_new_lease_key,
+	.calc_signature = smb2_calc_signature,
+	.is_read_op = smb21_is_read_op,
+	.set_oplock_level = smb21_set_oplock_level,
+	.create_lease_buf = smb2_create_lease_buf,
+	.parse_lease_buf = smb2_parse_lease_buf,
+};
 
 struct smb_version_operations smb30_operations = {
 	.compare_fids = smb2_compare_fids,
@@ -706,6 +1023,7 @@ struct smb_version_operations smb30_operations = {
 	.unlink = smb2_unlink,
 	.rename = smb2_rename_path,
 	.create_hardlink = smb2_create_hardlink,
+	.query_symlink = smb2_query_symlink,
 	.open = smb2_open_file,
 	.set_fid = smb2_set_fid,
 	.close = smb2_close_file,
@@ -729,6 +1047,10 @@ struct smb_version_operations smb30_operations = {
 	.new_lease_key = smb2_new_lease_key,
 	.generate_signingkey = generate_smb3signingkey,
 	.calc_signature = smb3_calc_signature,
+	.is_read_op = smb21_is_read_op,
+	.set_oplock_level = smb3_set_oplock_level,
+	.create_lease_buf = smb3_create_lease_buf,
+	.parse_lease_buf = smb3_parse_lease_buf,
 };
 
 struct smb_version_values smb20_values = {
@@ -746,9 +1068,9 @@ struct smb_version_values smb20_values = {
 	.cap_unix = 0,
 	.cap_nt_find = SMB2_NT_FIND,
 	.cap_large_files = SMB2_LARGE_FILES,
-	.oplock_read = SMB2_OPLOCK_LEVEL_II,
 	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
 	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.create_lease_size = sizeof(struct create_lease),
 };
 
 struct smb_version_values smb21_values = {
@@ -766,9 +1088,9 @@ struct smb_version_values smb21_values = {
 	.cap_unix = 0,
 	.cap_nt_find = SMB2_NT_FIND,
 	.cap_large_files = SMB2_LARGE_FILES,
-	.oplock_read = SMB2_OPLOCK_LEVEL_II,
 	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
 	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.create_lease_size = sizeof(struct create_lease),
 };
 
 struct smb_version_values smb30_values = {
@@ -786,9 +1108,9 @@ struct smb_version_values smb30_values = {
 	.cap_unix = 0,
 	.cap_nt_find = SMB2_NT_FIND,
 	.cap_large_files = SMB2_LARGE_FILES,
-	.oplock_read = SMB2_OPLOCK_LEVEL_II,
 	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
 	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.create_lease_size = sizeof(struct create_lease_v2),
 };
 
 struct smb_version_values smb302_values = {
@@ -806,7 +1128,7 @@ struct smb_version_values smb302_values = {
 	.cap_unix = 0,
 	.cap_nt_find = SMB2_NT_FIND,
 	.cap_large_files = SMB2_LARGE_FILES,
-	.oplock_read = SMB2_OPLOCK_LEVEL_II,
 	.signing_enabled = SMB2_NEGOTIATE_SIGNING_ENABLED | SMB2_NEGOTIATE_SIGNING_REQUIRED,
 	.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
+	.create_lease_size = sizeof(struct create_lease_v2),
 };
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index abc9c2809b51..eba0efde66d7 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -478,12 +478,20 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
 	}
 
 	/*
+	 * If we are here due to reconnect, free per-smb session key
+	 * in case signing was required.
+	 */
+	kfree(ses->auth_key.response);
+	ses->auth_key.response = NULL;
+
+	/*
 	 * If memory allocation is successful, caller of this function
 	 * frees it.
 	 */
 	ses->ntlmssp = kmalloc(sizeof(struct ntlmssp_auth), GFP_KERNEL);
 	if (!ses->ntlmssp)
 		return -ENOMEM;
+	ses->ntlmssp->sesskey_per_smbsess = true;
 
 	/* FIXME: allow for other auth types besides NTLMSSP (e.g. krb5) */
 	ses->sectype = RawNTLMSSP;
@@ -628,6 +636,40 @@ ssetup_exit:
 	/* if ntlmssp, and negotiate succeeded, proceed to authenticate phase */
 	if ((phase == NtLmChallenge) && (rc == 0))
 		goto ssetup_ntlmssp_authenticate;
+
+	if (!rc) {
+		mutex_lock(&server->srv_mutex);
+		if (server->sign && server->ops->generate_signingkey) {
+			rc = server->ops->generate_signingkey(ses);
+			kfree(ses->auth_key.response);
+			ses->auth_key.response = NULL;
+			if (rc) {
+				cifs_dbg(FYI,
+					"SMB3 session key generation failed\n");
+				mutex_unlock(&server->srv_mutex);
+				goto keygen_exit;
+			}
+		}
+		if (!server->session_estab) {
+			server->sequence_number = 0x2;
+			server->session_estab = true;
+		}
+		mutex_unlock(&server->srv_mutex);
+
+		cifs_dbg(FYI, "SMB2/3 session established successfully\n");
+		spin_lock(&GlobalMid_Lock);
+		ses->status = CifsGood;
+		ses->need_reconnect = false;
+		spin_unlock(&GlobalMid_Lock);
+	}
+
+keygen_exit:
+	if (!server->sign) {
+		kfree(ses->auth_key.response);
+		ses->auth_key.response = NULL;
+	}
+	kfree(ses->ntlmssp);
+
 	return rc;
 }
 
@@ -813,39 +855,6 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon)
 	return rc;
 }
 
-static struct create_lease *
-create_lease_buf(u8 *lease_key, u8 oplock)
-{
-	struct create_lease *buf;
-
-	buf = kzalloc(sizeof(struct create_lease), GFP_KERNEL);
-	if (!buf)
-		return NULL;
-
-	buf->lcontext.LeaseKeyLow = cpu_to_le64(*((u64 *)lease_key));
-	buf->lcontext.LeaseKeyHigh = cpu_to_le64(*((u64 *)(lease_key + 8)));
-	if (oplock == SMB2_OPLOCK_LEVEL_EXCLUSIVE)
-		buf->lcontext.LeaseState = SMB2_LEASE_WRITE_CACHING |
-					   SMB2_LEASE_READ_CACHING;
-	else if (oplock == SMB2_OPLOCK_LEVEL_II)
-		buf->lcontext.LeaseState = SMB2_LEASE_READ_CACHING;
-	else if (oplock == SMB2_OPLOCK_LEVEL_BATCH)
-		buf->lcontext.LeaseState = SMB2_LEASE_HANDLE_CACHING |
-					   SMB2_LEASE_READ_CACHING |
-					   SMB2_LEASE_WRITE_CACHING;
-
-	buf->ccontext.DataOffset = cpu_to_le16(offsetof
-					(struct create_lease, lcontext));
-	buf->ccontext.DataLength = cpu_to_le32(sizeof(struct lease_context));
-	buf->ccontext.NameOffset = cpu_to_le16(offsetof
-				(struct create_lease, Name));
-	buf->ccontext.NameLength = cpu_to_le16(4);
-	buf->Name[0] = 'R';
-	buf->Name[1] = 'q';
-	buf->Name[2] = 'L';
-	buf->Name[3] = 's';
-	return buf;
-}
 
 static struct create_durable *
 create_durable_buf(void)
@@ -894,55 +903,49 @@ create_reconnect_durable_buf(struct cifs_fid *fid)
 }
 
 static __u8
-parse_lease_state(struct smb2_create_rsp *rsp)
+parse_lease_state(struct TCP_Server_Info *server, struct smb2_create_rsp *rsp,
+		  unsigned int *epoch)
 {
 	char *data_offset;
-	struct create_lease *lc;
-	bool found = false;
+	struct create_context *cc;
 	unsigned int next = 0;
 	char *name;
 
 	data_offset = (char *)rsp + 4 + le32_to_cpu(rsp->CreateContextsOffset);
-	lc = (struct create_lease *)data_offset;
+	cc = (struct create_context *)data_offset;
 	do {
-		lc = (struct create_lease *)((char *)lc + next);
-		name = le16_to_cpu(lc->ccontext.NameOffset) + (char *)lc;
-		if (le16_to_cpu(lc->ccontext.NameLength) != 4 ||
+		cc = (struct create_context *)((char *)cc + next);
+		name = le16_to_cpu(cc->NameOffset) + (char *)cc;
+		if (le16_to_cpu(cc->NameLength) != 4 ||
 		    strncmp(name, "RqLs", 4)) {
-			next = le32_to_cpu(lc->ccontext.Next);
+			next = le32_to_cpu(cc->Next);
 			continue;
 		}
-		if (lc->lcontext.LeaseFlags & SMB2_LEASE_FLAG_BREAK_IN_PROGRESS)
-			return SMB2_OPLOCK_LEVEL_NOCHANGE;
-		found = true;
-		break;
+		return server->ops->parse_lease_buf(cc, epoch);
 	} while (next != 0);
 
-	if (!found)
-		return 0;
-
-	return smb2_map_lease_to_oplock(lc->lcontext.LeaseState);
+	return 0;
 }
 
 static int
-add_lease_context(struct kvec *iov, unsigned int *num_iovec, __u8 *oplock)
+add_lease_context(struct TCP_Server_Info *server, struct kvec *iov,
+		  unsigned int *num_iovec, __u8 *oplock)
 {
 	struct smb2_create_req *req = iov[0].iov_base;
 	unsigned int num = *num_iovec;
 
-	iov[num].iov_base = create_lease_buf(oplock+1, *oplock);
+	iov[num].iov_base = server->ops->create_lease_buf(oplock+1, *oplock);
 	if (iov[num].iov_base == NULL)
 		return -ENOMEM;
-	iov[num].iov_len = sizeof(struct create_lease);
+	iov[num].iov_len = server->vals->create_lease_size;
 	req->RequestedOplockLevel = SMB2_OPLOCK_LEVEL_LEASE;
 	if (!req->CreateContextsOffset)
 		req->CreateContextsOffset = cpu_to_le32(
 				sizeof(struct smb2_create_req) - 4 +
 				iov[num - 1].iov_len);
-	req->CreateContextsLength = cpu_to_le32(
-				le32_to_cpu(req->CreateContextsLength) +
-				sizeof(struct create_lease));
-	inc_rfc1001_len(&req->hdr, sizeof(struct create_lease));
+	le32_add_cpu(&req->CreateContextsLength,
+		     server->vals->create_lease_size);
+	inc_rfc1001_len(&req->hdr, server->vals->create_lease_size);
 	*num_iovec = num + 1;
 	return 0;
 }
@@ -967,9 +970,7 @@ add_durable_context(struct kvec *iov, unsigned int *num_iovec,
 		req->CreateContextsOffset =
 			cpu_to_le32(sizeof(struct smb2_create_req) - 4 +
 								iov[1].iov_len);
-	req->CreateContextsLength =
-			cpu_to_le32(le32_to_cpu(req->CreateContextsLength) +
-						sizeof(struct create_durable));
+	le32_add_cpu(&req->CreateContextsLength, sizeof(struct create_durable));
 	inc_rfc1001_len(&req->hdr, sizeof(struct create_durable));
 	*num_iovec = num + 1;
 	return 0;
@@ -977,7 +978,8 @@ add_durable_context(struct kvec *iov, unsigned int *num_iovec,
 
 int
 SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
-	  __u8 *oplock, struct smb2_file_all_info *buf)
+	  __u8 *oplock, struct smb2_file_all_info *buf,
+	  struct smb2_err_rsp **err_buf)
 {
 	struct smb2_create_req *req;
 	struct smb2_create_rsp *rsp;
@@ -1048,11 +1050,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 	if (!server->oplocks)
 		*oplock = SMB2_OPLOCK_LEVEL_NONE;
 
-	if (!(tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) ||
+	if (!(server->capabilities & SMB2_GLOBAL_CAP_LEASING) ||
 	    *oplock == SMB2_OPLOCK_LEVEL_NONE)
 		req->RequestedOplockLevel = *oplock;
 	else {
-		rc = add_lease_context(iov, &num_iovecs, oplock);
+		rc = add_lease_context(server, iov, &num_iovecs, oplock);
 		if (rc) {
 			cifs_small_buf_release(req);
 			kfree(copy_path);
@@ -1062,11 +1064,11 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 
 	if (*oplock == SMB2_OPLOCK_LEVEL_BATCH) {
 		/* need to set Next field of lease context if we request it */
-		if (tcon->ses->server->capabilities & SMB2_GLOBAL_CAP_LEASING) {
+		if (server->capabilities & SMB2_GLOBAL_CAP_LEASING) {
 			struct create_context *ccontext =
 			    (struct create_context *)iov[num_iovecs-1].iov_base;
 			ccontext->Next =
-				cpu_to_le32(sizeof(struct create_lease));
+				cpu_to_le32(server->vals->create_lease_size);
 		}
 		rc = add_durable_context(iov, &num_iovecs, oparms);
 		if (rc) {
@@ -1082,6 +1084,9 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 
 	if (rc != 0) {
 		cifs_stats_fail_inc(tcon, SMB2_CREATE_HE);
+		if (err_buf)
+			*err_buf = kmemdup(rsp, get_rfc1002_length(rsp) + 4,
+					   GFP_KERNEL);
 		goto creat_exit;
 	}
 
@@ -1098,7 +1103,7 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path,
 	}
 
 	if (rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE)
-		*oplock = parse_lease_state(rsp);
+		*oplock = parse_lease_state(server, rsp, &oparms->fid->epoch);
 	else
 		*oplock = rsp->OplockLevel;
 creat_exit:
diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h
index 36b0d37ea69b..b83d0118a757 100644
--- a/fs/cifs/smb2pdu.h
+++ b/fs/cifs/smb2pdu.h
@@ -150,6 +150,20 @@ struct smb2_err_rsp {
 	__u8   ErrorData[1];  /* variable length */
 } __packed;
 
+struct smb2_symlink_err_rsp {
+	__le32 SymLinkLength;
+	__le32 SymLinkErrorTag;
+	__le32 ReparseTag;
+	__le16 ReparseDataLength;
+	__le16 UnparsedPathLength;
+	__le16 SubstituteNameOffset;
+	__le16 SubstituteNameLength;
+	__le16 PrintNameOffset;
+	__le16 PrintNameLength;
+	__le32 Flags;
+	__u8  PathBuffer[0];
+} __packed;
+
 #define SMB2_CLIENT_GUID_SIZE 16
 
 extern __u8 cifs_client_guid[SMB2_CLIENT_GUID_SIZE];
@@ -462,6 +476,10 @@ struct create_context {
 	__u8 Buffer[0];
 } __packed;
 
+#define SMB2_LEASE_READ_CACHING_HE	0x01
+#define SMB2_LEASE_HANDLE_CACHING_HE	0x02
+#define SMB2_LEASE_WRITE_CACHING_HE	0x04
+
 #define SMB2_LEASE_NONE			__constant_cpu_to_le32(0x00)
 #define SMB2_LEASE_READ_CACHING		__constant_cpu_to_le32(0x01)
 #define SMB2_LEASE_HANDLE_CACHING	__constant_cpu_to_le32(0x02)
@@ -479,12 +497,31 @@ struct lease_context {
 	__le64 LeaseDuration;
 } __packed;
 
+struct lease_context_v2 {
+	__le64 LeaseKeyLow;
+	__le64 LeaseKeyHigh;
+	__le32 LeaseState;
+	__le32 LeaseFlags;
+	__le64 LeaseDuration;
+	__le64 ParentLeaseKeyLow;
+	__le64 ParentLeaseKeyHigh;
+	__le16 Epoch;
+	__le16 Reserved;
+} __packed;
+
 struct create_lease {
 	struct create_context ccontext;
 	__u8   Name[8];
 	struct lease_context lcontext;
 } __packed;
 
+struct create_lease_v2 {
+	struct create_context ccontext;
+	__u8   Name[8];
+	struct lease_context_v2 lcontext;
+	__u8   Pad[4];
+} __packed;
+
 struct create_durable {
 	struct create_context ccontext;
 	__u8   Name[8];
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 1a5ecbed40ed..e3fb4801ee96 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -53,7 +53,6 @@ extern int smb3_calc_signature(struct smb_rqst *rqst,
 				struct TCP_Server_Info *server);
 extern void smb2_echo_request(struct work_struct *work);
 extern __le32 smb2_get_lease_state(struct cifsInodeInfo *cinode);
-extern __u8 smb2_map_lease_to_oplock(__le32 lease_state);
 extern bool smb2_is_valid_oplock_break(char *buffer,
 				       struct TCP_Server_Info *srv);
 
@@ -87,7 +86,6 @@ extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon,
 extern int smb2_open_file(const unsigned int xid,
 			  struct cifs_open_parms *oparms,
 			  __u32 *oplock, FILE_ALL_INFO *buf);
-extern void smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock);
 extern int smb2_unlock_range(struct cifsFileInfo *cfile,
 			     struct file_lock *flock, const unsigned int xid);
 extern int smb2_push_mandatory_locks(struct cifsFileInfo *cfile);
@@ -106,7 +104,8 @@ extern int SMB2_tcon(const unsigned int xid, struct cifs_ses *ses,
 extern int SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon);
 extern int SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms,
 		     __le16 *path, __u8 *oplock,
-		     struct smb2_file_all_info *buf);
+		     struct smb2_file_all_info *buf,
+		     struct smb2_err_rsp **err_buf);
 extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
 		     u64 persistent_fid, u64 volatile_fid, u32 opcode,
 		     bool is_fsctl, char *in_data, u32 indatalen,
diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c
index 4f2300d020c7..340abca3aa52 100644
--- a/fs/cifs/smb2transport.c
+++ b/fs/cifs/smb2transport.c
@@ -114,6 +114,23 @@ smb3_crypto_shash_allocate(struct TCP_Server_Info *server)
 	return 0;
 }
 
+static struct cifs_ses *
+smb2_find_smb_ses(struct smb2_hdr *smb2hdr, struct TCP_Server_Info *server)
+{
+	struct cifs_ses *ses;
+
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) {
+		if (ses->Suid != smb2hdr->SessionId)
+			continue;
+		spin_unlock(&cifs_tcp_ses_lock);
+		return ses;
+	}
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	return NULL;
+}
+
 
 int
 smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
@@ -124,6 +141,13 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 	struct kvec *iov = rqst->rq_iov;
 	int n_vec = rqst->rq_nvec;
 	struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
+	struct cifs_ses *ses;
+
+	ses = smb2_find_smb_ses(smb2_pdu, server);
+	if (!ses) {
+		cifs_dbg(VFS, "%s: Could not find session\n", __func__);
+		return 0;
+	}
 
 	memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE);
 	memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
@@ -135,7 +159,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 	}
 
 	rc = crypto_shash_setkey(server->secmech.hmacsha256,
-		server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
+		ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not update with response\n", __func__);
 		return rc;
@@ -198,8 +222,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 	return rc;
 }
 
-void
-generate_smb3signingkey(struct TCP_Server_Info *server)
+int
+generate_smb3signingkey(struct cifs_ses *ses)
 {
 	unsigned char zero = 0x0;
 	__u8 i[4] = {0, 0, 0, 1};
@@ -209,90 +233,99 @@ generate_smb3signingkey(struct TCP_Server_Info *server)
 	unsigned char *hashptr = prfhash;
 
 	memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE);
-	memset(server->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE);
+	memset(ses->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE);
 
-	rc = smb3_crypto_shash_allocate(server);
+	rc = smb3_crypto_shash_allocate(ses->server);
 	if (rc) {
 		cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_setkey(server->secmech.hmacsha256,
-		server->session_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
+	rc = crypto_shash_setkey(ses->server->secmech.hmacsha256,
+		ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not set with session key\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_init(&server->secmech.sdeschmacsha256->shash);
+	rc = crypto_shash_init(&ses->server->secmech.sdeschmacsha256->shash);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not init sign hmac\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
 				i, 4);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not update with n\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
 				"SMB2AESCMAC", 12);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not update with label\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
 				&zero, 1);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not update with zero\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
 				"SmbSign", 8);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not update with context\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_update(&server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash,
 				L, 4);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not update with L\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	rc = crypto_shash_final(&server->secmech.sdeschmacsha256->shash,
+	rc = crypto_shash_final(&ses->server->secmech.sdeschmacsha256->shash,
 				hashptr);
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__);
 		goto smb3signkey_ret;
 	}
 
-	memcpy(server->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE);
+	memcpy(ses->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE);
 
 smb3signkey_ret:
-	return;
+	return rc;
 }
 
 int
 smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 {
-	int i, rc;
+	int i;
+	int rc = 0;
 	unsigned char smb3_signature[SMB2_CMACAES_SIZE];
 	unsigned char *sigptr = smb3_signature;
 	struct kvec *iov = rqst->rq_iov;
 	int n_vec = rqst->rq_nvec;
 	struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)iov[0].iov_base;
+	struct cifs_ses *ses;
+
+	ses = smb2_find_smb_ses(smb2_pdu, server);
+	if (!ses) {
+		cifs_dbg(VFS, "%s: Could not find session\n", __func__);
+		return 0;
+	}
 
 	memset(smb3_signature, 0x0, SMB2_CMACAES_SIZE);
 	memset(smb2_pdu->Signature, 0x0, SMB2_SIGNATURE_SIZE);
 
 	rc = crypto_shash_setkey(server->secmech.cmacaes,
-		server->smb3signingkey, SMB2_CMACAES_SIZE);
+		ses->smb3signingkey, SMB2_CMACAES_SIZE);
+
 	if (rc) {
 		cifs_dbg(VFS, "%s: Could not set key for cmac aes\n", __func__);
 		return rc;
@@ -389,6 +422,7 @@ smb2_verify_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server)
 	struct smb2_hdr *smb2_pdu = (struct smb2_hdr *)rqst->rq_iov[0].iov_base;
 
 	if ((smb2_pdu->Command == SMB2_NEGOTIATE) ||
+	    (smb2_pdu->Command == SMB2_SESSION_SETUP) ||
 	    (smb2_pdu->Command == SMB2_OPLOCK_BREAK) ||
 	    (!server->session_estab))
 		return 0;
diff --git a/fs/cifs/winucase.c b/fs/cifs/winucase.c
new file mode 100644
index 000000000000..1506d4fddb2c
--- /dev/null
+++ b/fs/cifs/winucase.c
@@ -0,0 +1,663 @@
+/*
+ * fs/cifs/winucase.c
+ *
+ * Copyright (c) Jeffrey Layton <jlayton@redhat.com>, 2013
+ *
+ * This program is free software;  you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY;  without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See
+ * the GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program;  if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * The const tables in this file were converted from the following info
+ * provided by Microsoft:
+ *
+ * 3.1.5.3 Mapping UTF-16 Strings to Upper Case:
+ *
+ * http://msdn.microsoft.com/en-us/library/hh877830.aspx
+ * http://www.microsoft.com/en-us/download/details.aspx?displaylang=en&id=10921
+ *
+ * In particular, the table in "Windows 8 Upper Case Mapping Table.txt" was
+ * post-processed using the winucase_convert.pl script.
+ */
+
+#include <linux/nls.h>
+
+wchar_t cifs_toupper(wchar_t in);  /* quiet sparse */
+
+static const wchar_t t2_00[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0041, 0x0042, 0x0043, 0x0044, 0x0045, 0x0046, 0x0047,
+	0x0048, 0x0049, 0x004a, 0x004b, 0x004c, 0x004d, 0x004e, 0x004f,
+	0x0050, 0x0051, 0x0052, 0x0053, 0x0054, 0x0055, 0x0056, 0x0057,
+	0x0058, 0x0059, 0x005a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x00c4, 0x00c5, 0x00c6, 0x00c7,
+	0x00c8, 0x00c9, 0x00ca, 0x00cb, 0x00cc, 0x00cd, 0x00ce, 0x00cf,
+	0x00d0, 0x00d1, 0x00d2, 0x00d3, 0x00d4, 0x00d5, 0x00d6, 0x0000,
+	0x00d8, 0x00d9, 0x00da, 0x00db, 0x00dc, 0x00dd, 0x00de, 0x0178,
+};
+
+static const wchar_t t2_01[256] = {
+	0x0000, 0x0100, 0x0000, 0x0102, 0x0000, 0x0104, 0x0000, 0x0106,
+	0x0000, 0x0108, 0x0000, 0x010a, 0x0000, 0x010c, 0x0000, 0x010e,
+	0x0000, 0x0110, 0x0000, 0x0112, 0x0000, 0x0114, 0x0000, 0x0116,
+	0x0000, 0x0118, 0x0000, 0x011a, 0x0000, 0x011c, 0x0000, 0x011e,
+	0x0000, 0x0120, 0x0000, 0x0122, 0x0000, 0x0124, 0x0000, 0x0126,
+	0x0000, 0x0128, 0x0000, 0x012a, 0x0000, 0x012c, 0x0000, 0x012e,
+	0x0000, 0x0000, 0x0000, 0x0132, 0x0000, 0x0134, 0x0000, 0x0136,
+	0x0000, 0x0000, 0x0139, 0x0000, 0x013b, 0x0000, 0x013d, 0x0000,
+	0x013f, 0x0000, 0x0141, 0x0000, 0x0143, 0x0000, 0x0145, 0x0000,
+	0x0147, 0x0000, 0x0000, 0x014a, 0x0000, 0x014c, 0x0000, 0x014e,
+	0x0000, 0x0150, 0x0000, 0x0152, 0x0000, 0x0154, 0x0000, 0x0156,
+	0x0000, 0x0158, 0x0000, 0x015a, 0x0000, 0x015c, 0x0000, 0x015e,
+	0x0000, 0x0160, 0x0000, 0x0162, 0x0000, 0x0164, 0x0000, 0x0166,
+	0x0000, 0x0168, 0x0000, 0x016a, 0x0000, 0x016c, 0x0000, 0x016e,
+	0x0000, 0x0170, 0x0000, 0x0172, 0x0000, 0x0174, 0x0000, 0x0176,
+	0x0000, 0x0000, 0x0179, 0x0000, 0x017b, 0x0000, 0x017d, 0x0000,
+	0x0243, 0x0000, 0x0000, 0x0182, 0x0000, 0x0184, 0x0000, 0x0000,
+	0x0187, 0x0000, 0x0000, 0x0000, 0x018b, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0191, 0x0000, 0x0000, 0x01f6, 0x0000, 0x0000,
+	0x0000, 0x0198, 0x023d, 0x0000, 0x0000, 0x0000, 0x0220, 0x0000,
+	0x0000, 0x01a0, 0x0000, 0x01a2, 0x0000, 0x01a4, 0x0000, 0x0000,
+	0x01a7, 0x0000, 0x0000, 0x0000, 0x0000, 0x01ac, 0x0000, 0x0000,
+	0x01af, 0x0000, 0x0000, 0x0000, 0x01b3, 0x0000, 0x01b5, 0x0000,
+	0x0000, 0x01b8, 0x0000, 0x0000, 0x0000, 0x01bc, 0x0000, 0x01f7,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x01c4, 0x0000,
+	0x0000, 0x01c7, 0x0000, 0x0000, 0x01ca, 0x0000, 0x01cd, 0x0000,
+	0x01cf, 0x0000, 0x01d1, 0x0000, 0x01d3, 0x0000, 0x01d5, 0x0000,
+	0x01d7, 0x0000, 0x01d9, 0x0000, 0x01db, 0x018e, 0x0000, 0x01de,
+	0x0000, 0x01e0, 0x0000, 0x01e2, 0x0000, 0x01e4, 0x0000, 0x01e6,
+	0x0000, 0x01e8, 0x0000, 0x01ea, 0x0000, 0x01ec, 0x0000, 0x01ee,
+	0x0000, 0x0000, 0x0000, 0x01f1, 0x0000, 0x01f4, 0x0000, 0x0000,
+	0x0000, 0x01f8, 0x0000, 0x01fa, 0x0000, 0x01fc, 0x0000, 0x01fe,
+};
+
+static const wchar_t t2_02[256] = {
+	0x0000, 0x0200, 0x0000, 0x0202, 0x0000, 0x0204, 0x0000, 0x0206,
+	0x0000, 0x0208, 0x0000, 0x020a, 0x0000, 0x020c, 0x0000, 0x020e,
+	0x0000, 0x0210, 0x0000, 0x0212, 0x0000, 0x0214, 0x0000, 0x0216,
+	0x0000, 0x0218, 0x0000, 0x021a, 0x0000, 0x021c, 0x0000, 0x021e,
+	0x0000, 0x0000, 0x0000, 0x0222, 0x0000, 0x0224, 0x0000, 0x0226,
+	0x0000, 0x0228, 0x0000, 0x022a, 0x0000, 0x022c, 0x0000, 0x022e,
+	0x0000, 0x0230, 0x0000, 0x0232, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x023b, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0241, 0x0000, 0x0000, 0x0000, 0x0000, 0x0246,
+	0x0000, 0x0248, 0x0000, 0x024a, 0x0000, 0x024c, 0x0000, 0x024e,
+	0x2c6f, 0x2c6d, 0x0000, 0x0181, 0x0186, 0x0000, 0x0189, 0x018a,
+	0x0000, 0x018f, 0x0000, 0x0190, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0193, 0x0000, 0x0000, 0x0194, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0197, 0x0196, 0x0000, 0x2c62, 0x0000, 0x0000, 0x0000, 0x019c,
+	0x0000, 0x2c6e, 0x019d, 0x0000, 0x0000, 0x019f, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2c64, 0x0000, 0x0000,
+	0x01a6, 0x0000, 0x0000, 0x01a9, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x01ae, 0x0244, 0x01b1, 0x01b2, 0x0245, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x01b7, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_03[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0370, 0x0000, 0x0372, 0x0000, 0x0000, 0x0000, 0x0376,
+	0x0000, 0x0000, 0x0000, 0x03fd, 0x03fe, 0x03ff, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0386, 0x0388, 0x0389, 0x038a,
+	0x0000, 0x0391, 0x0392, 0x0393, 0x0394, 0x0395, 0x0396, 0x0397,
+	0x0398, 0x0399, 0x039a, 0x039b, 0x039c, 0x039d, 0x039e, 0x039f,
+	0x03a0, 0x03a1, 0x0000, 0x03a3, 0x03a4, 0x03a5, 0x03a6, 0x03a7,
+	0x03a8, 0x03a9, 0x03aa, 0x03ab, 0x038c, 0x038e, 0x038f, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x03cf,
+	0x0000, 0x03d8, 0x0000, 0x03da, 0x0000, 0x03dc, 0x0000, 0x03de,
+	0x0000, 0x03e0, 0x0000, 0x03e2, 0x0000, 0x03e4, 0x0000, 0x03e6,
+	0x0000, 0x03e8, 0x0000, 0x03ea, 0x0000, 0x03ec, 0x0000, 0x03ee,
+	0x0000, 0x0000, 0x03f9, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x03f7, 0x0000, 0x0000, 0x03fa, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_04[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0410, 0x0411, 0x0412, 0x0413, 0x0414, 0x0415, 0x0416, 0x0417,
+	0x0418, 0x0419, 0x041a, 0x041b, 0x041c, 0x041d, 0x041e, 0x041f,
+	0x0420, 0x0421, 0x0422, 0x0423, 0x0424, 0x0425, 0x0426, 0x0427,
+	0x0428, 0x0429, 0x042a, 0x042b, 0x042c, 0x042d, 0x042e, 0x042f,
+	0x0400, 0x0401, 0x0402, 0x0403, 0x0404, 0x0405, 0x0406, 0x0407,
+	0x0408, 0x0409, 0x040a, 0x040b, 0x040c, 0x040d, 0x040e, 0x040f,
+	0x0000, 0x0460, 0x0000, 0x0462, 0x0000, 0x0464, 0x0000, 0x0466,
+	0x0000, 0x0468, 0x0000, 0x046a, 0x0000, 0x046c, 0x0000, 0x046e,
+	0x0000, 0x0470, 0x0000, 0x0472, 0x0000, 0x0474, 0x0000, 0x0476,
+	0x0000, 0x0478, 0x0000, 0x047a, 0x0000, 0x047c, 0x0000, 0x047e,
+	0x0000, 0x0480, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x048a, 0x0000, 0x048c, 0x0000, 0x048e,
+	0x0000, 0x0490, 0x0000, 0x0492, 0x0000, 0x0494, 0x0000, 0x0496,
+	0x0000, 0x0498, 0x0000, 0x049a, 0x0000, 0x049c, 0x0000, 0x049e,
+	0x0000, 0x04a0, 0x0000, 0x04a2, 0x0000, 0x04a4, 0x0000, 0x04a6,
+	0x0000, 0x04a8, 0x0000, 0x04aa, 0x0000, 0x04ac, 0x0000, 0x04ae,
+	0x0000, 0x04b0, 0x0000, 0x04b2, 0x0000, 0x04b4, 0x0000, 0x04b6,
+	0x0000, 0x04b8, 0x0000, 0x04ba, 0x0000, 0x04bc, 0x0000, 0x04be,
+	0x0000, 0x0000, 0x04c1, 0x0000, 0x04c3, 0x0000, 0x04c5, 0x0000,
+	0x04c7, 0x0000, 0x04c9, 0x0000, 0x04cb, 0x0000, 0x04cd, 0x04c0,
+	0x0000, 0x04d0, 0x0000, 0x04d2, 0x0000, 0x04d4, 0x0000, 0x04d6,
+	0x0000, 0x04d8, 0x0000, 0x04da, 0x0000, 0x04dc, 0x0000, 0x04de,
+	0x0000, 0x04e0, 0x0000, 0x04e2, 0x0000, 0x04e4, 0x0000, 0x04e6,
+	0x0000, 0x04e8, 0x0000, 0x04ea, 0x0000, 0x04ec, 0x0000, 0x04ee,
+	0x0000, 0x04f0, 0x0000, 0x04f2, 0x0000, 0x04f4, 0x0000, 0x04f6,
+	0x0000, 0x04f8, 0x0000, 0x04fa, 0x0000, 0x04fc, 0x0000, 0x04fe,
+};
+
+static const wchar_t t2_05[256] = {
+	0x0000, 0x0500, 0x0000, 0x0502, 0x0000, 0x0504, 0x0000, 0x0506,
+	0x0000, 0x0508, 0x0000, 0x050a, 0x0000, 0x050c, 0x0000, 0x050e,
+	0x0000, 0x0510, 0x0000, 0x0512, 0x0000, 0x0514, 0x0000, 0x0516,
+	0x0000, 0x0518, 0x0000, 0x051a, 0x0000, 0x051c, 0x0000, 0x051e,
+	0x0000, 0x0520, 0x0000, 0x0522, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0531, 0x0532, 0x0533, 0x0534, 0x0535, 0x0536, 0x0537,
+	0x0538, 0x0539, 0x053a, 0x053b, 0x053c, 0x053d, 0x053e, 0x053f,
+	0x0540, 0x0541, 0x0542, 0x0543, 0x0544, 0x0545, 0x0546, 0x0547,
+	0x0548, 0x0549, 0x054a, 0x054b, 0x054c, 0x054d, 0x054e, 0x054f,
+	0x0550, 0x0551, 0x0552, 0x0553, 0x0554, 0x0555, 0x0556, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_1d[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0xa77d, 0x0000, 0x0000, 0x0000, 0x2c63, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_1e[256] = {
+	0x0000, 0x1e00, 0x0000, 0x1e02, 0x0000, 0x1e04, 0x0000, 0x1e06,
+	0x0000, 0x1e08, 0x0000, 0x1e0a, 0x0000, 0x1e0c, 0x0000, 0x1e0e,
+	0x0000, 0x1e10, 0x0000, 0x1e12, 0x0000, 0x1e14, 0x0000, 0x1e16,
+	0x0000, 0x1e18, 0x0000, 0x1e1a, 0x0000, 0x1e1c, 0x0000, 0x1e1e,
+	0x0000, 0x1e20, 0x0000, 0x1e22, 0x0000, 0x1e24, 0x0000, 0x1e26,
+	0x0000, 0x1e28, 0x0000, 0x1e2a, 0x0000, 0x1e2c, 0x0000, 0x1e2e,
+	0x0000, 0x1e30, 0x0000, 0x1e32, 0x0000, 0x1e34, 0x0000, 0x1e36,
+	0x0000, 0x1e38, 0x0000, 0x1e3a, 0x0000, 0x1e3c, 0x0000, 0x1e3e,
+	0x0000, 0x1e40, 0x0000, 0x1e42, 0x0000, 0x1e44, 0x0000, 0x1e46,
+	0x0000, 0x1e48, 0x0000, 0x1e4a, 0x0000, 0x1e4c, 0x0000, 0x1e4e,
+	0x0000, 0x1e50, 0x0000, 0x1e52, 0x0000, 0x1e54, 0x0000, 0x1e56,
+	0x0000, 0x1e58, 0x0000, 0x1e5a, 0x0000, 0x1e5c, 0x0000, 0x1e5e,
+	0x0000, 0x1e60, 0x0000, 0x1e62, 0x0000, 0x1e64, 0x0000, 0x1e66,
+	0x0000, 0x1e68, 0x0000, 0x1e6a, 0x0000, 0x1e6c, 0x0000, 0x1e6e,
+	0x0000, 0x1e70, 0x0000, 0x1e72, 0x0000, 0x1e74, 0x0000, 0x1e76,
+	0x0000, 0x1e78, 0x0000, 0x1e7a, 0x0000, 0x1e7c, 0x0000, 0x1e7e,
+	0x0000, 0x1e80, 0x0000, 0x1e82, 0x0000, 0x1e84, 0x0000, 0x1e86,
+	0x0000, 0x1e88, 0x0000, 0x1e8a, 0x0000, 0x1e8c, 0x0000, 0x1e8e,
+	0x0000, 0x1e90, 0x0000, 0x1e92, 0x0000, 0x1e94, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x1ea0, 0x0000, 0x1ea2, 0x0000, 0x1ea4, 0x0000, 0x1ea6,
+	0x0000, 0x1ea8, 0x0000, 0x1eaa, 0x0000, 0x1eac, 0x0000, 0x1eae,
+	0x0000, 0x1eb0, 0x0000, 0x1eb2, 0x0000, 0x1eb4, 0x0000, 0x1eb6,
+	0x0000, 0x1eb8, 0x0000, 0x1eba, 0x0000, 0x1ebc, 0x0000, 0x1ebe,
+	0x0000, 0x1ec0, 0x0000, 0x1ec2, 0x0000, 0x1ec4, 0x0000, 0x1ec6,
+	0x0000, 0x1ec8, 0x0000, 0x1eca, 0x0000, 0x1ecc, 0x0000, 0x1ece,
+	0x0000, 0x1ed0, 0x0000, 0x1ed2, 0x0000, 0x1ed4, 0x0000, 0x1ed6,
+	0x0000, 0x1ed8, 0x0000, 0x1eda, 0x0000, 0x1edc, 0x0000, 0x1ede,
+	0x0000, 0x1ee0, 0x0000, 0x1ee2, 0x0000, 0x1ee4, 0x0000, 0x1ee6,
+	0x0000, 0x1ee8, 0x0000, 0x1eea, 0x0000, 0x1eec, 0x0000, 0x1eee,
+	0x0000, 0x1ef0, 0x0000, 0x1ef2, 0x0000, 0x1ef4, 0x0000, 0x1ef6,
+	0x0000, 0x1ef8, 0x0000, 0x1efa, 0x0000, 0x1efc, 0x0000, 0x1efe,
+};
+
+static const wchar_t t2_1f[256] = {
+	0x1f08, 0x1f09, 0x1f0a, 0x1f0b, 0x1f0c, 0x1f0d, 0x1f0e, 0x1f0f,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1f18, 0x1f19, 0x1f1a, 0x1f1b, 0x1f1c, 0x1f1d, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1f28, 0x1f29, 0x1f2a, 0x1f2b, 0x1f2c, 0x1f2d, 0x1f2e, 0x1f2f,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1f38, 0x1f39, 0x1f3a, 0x1f3b, 0x1f3c, 0x1f3d, 0x1f3e, 0x1f3f,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1f48, 0x1f49, 0x1f4a, 0x1f4b, 0x1f4c, 0x1f4d, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x1f59, 0x0000, 0x1f5b, 0x0000, 0x1f5d, 0x0000, 0x1f5f,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1f68, 0x1f69, 0x1f6a, 0x1f6b, 0x1f6c, 0x1f6d, 0x1f6e, 0x1f6f,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1fba, 0x1fbb, 0x1fc8, 0x1fc9, 0x1fca, 0x1fcb, 0x1fda, 0x1fdb,
+	0x1ff8, 0x1ff9, 0x1fea, 0x1feb, 0x1ffa, 0x1ffb, 0x0000, 0x0000,
+	0x1f88, 0x1f89, 0x1f8a, 0x1f8b, 0x1f8c, 0x1f8d, 0x1f8e, 0x1f8f,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1f98, 0x1f99, 0x1f9a, 0x1f9b, 0x1f9c, 0x1f9d, 0x1f9e, 0x1f9f,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1fa8, 0x1fa9, 0x1faa, 0x1fab, 0x1fac, 0x1fad, 0x1fae, 0x1faf,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1fb8, 0x1fb9, 0x0000, 0x1fbc, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x1fcc, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1fd8, 0x1fd9, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x1fe8, 0x1fe9, 0x0000, 0x0000, 0x0000, 0x1fec, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x1ffc, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_21[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x2132, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x2160, 0x2161, 0x2162, 0x2163, 0x2164, 0x2165, 0x2166, 0x2167,
+	0x2168, 0x2169, 0x216a, 0x216b, 0x216c, 0x216d, 0x216e, 0x216f,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x2183, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_24[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x24b6, 0x24b7, 0x24b8, 0x24b9, 0x24ba, 0x24bb, 0x24bc, 0x24bd,
+	0x24be, 0x24bf, 0x24c0, 0x24c1, 0x24c2, 0x24c3, 0x24c4, 0x24c5,
+	0x24c6, 0x24c7, 0x24c8, 0x24c9, 0x24ca, 0x24cb, 0x24cc, 0x24cd,
+	0x24ce, 0x24cf, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_2c[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x2c00, 0x2c01, 0x2c02, 0x2c03, 0x2c04, 0x2c05, 0x2c06, 0x2c07,
+	0x2c08, 0x2c09, 0x2c0a, 0x2c0b, 0x2c0c, 0x2c0d, 0x2c0e, 0x2c0f,
+	0x2c10, 0x2c11, 0x2c12, 0x2c13, 0x2c14, 0x2c15, 0x2c16, 0x2c17,
+	0x2c18, 0x2c19, 0x2c1a, 0x2c1b, 0x2c1c, 0x2c1d, 0x2c1e, 0x2c1f,
+	0x2c20, 0x2c21, 0x2c22, 0x2c23, 0x2c24, 0x2c25, 0x2c26, 0x2c27,
+	0x2c28, 0x2c29, 0x2c2a, 0x2c2b, 0x2c2c, 0x2c2d, 0x2c2e, 0x0000,
+	0x0000, 0x2c60, 0x0000, 0x0000, 0x0000, 0x023a, 0x023e, 0x0000,
+	0x2c67, 0x0000, 0x2c69, 0x0000, 0x2c6b, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x2c72, 0x0000, 0x0000, 0x2c75, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x2c80, 0x0000, 0x2c82, 0x0000, 0x2c84, 0x0000, 0x2c86,
+	0x0000, 0x2c88, 0x0000, 0x2c8a, 0x0000, 0x2c8c, 0x0000, 0x2c8e,
+	0x0000, 0x2c90, 0x0000, 0x2c92, 0x0000, 0x2c94, 0x0000, 0x2c96,
+	0x0000, 0x2c98, 0x0000, 0x2c9a, 0x0000, 0x2c9c, 0x0000, 0x2c9e,
+	0x0000, 0x2ca0, 0x0000, 0x2ca2, 0x0000, 0x2ca4, 0x0000, 0x2ca6,
+	0x0000, 0x2ca8, 0x0000, 0x2caa, 0x0000, 0x2cac, 0x0000, 0x2cae,
+	0x0000, 0x2cb0, 0x0000, 0x2cb2, 0x0000, 0x2cb4, 0x0000, 0x2cb6,
+	0x0000, 0x2cb8, 0x0000, 0x2cba, 0x0000, 0x2cbc, 0x0000, 0x2cbe,
+	0x0000, 0x2cc0, 0x0000, 0x2cc2, 0x0000, 0x2cc4, 0x0000, 0x2cc6,
+	0x0000, 0x2cc8, 0x0000, 0x2cca, 0x0000, 0x2ccc, 0x0000, 0x2cce,
+	0x0000, 0x2cd0, 0x0000, 0x2cd2, 0x0000, 0x2cd4, 0x0000, 0x2cd6,
+	0x0000, 0x2cd8, 0x0000, 0x2cda, 0x0000, 0x2cdc, 0x0000, 0x2cde,
+	0x0000, 0x2ce0, 0x0000, 0x2ce2, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_2d[256] = {
+	0x10a0, 0x10a1, 0x10a2, 0x10a3, 0x10a4, 0x10a5, 0x10a6, 0x10a7,
+	0x10a8, 0x10a9, 0x10aa, 0x10ab, 0x10ac, 0x10ad, 0x10ae, 0x10af,
+	0x10b0, 0x10b1, 0x10b2, 0x10b3, 0x10b4, 0x10b5, 0x10b6, 0x10b7,
+	0x10b8, 0x10b9, 0x10ba, 0x10bb, 0x10bc, 0x10bd, 0x10be, 0x10bf,
+	0x10c0, 0x10c1, 0x10c2, 0x10c3, 0x10c4, 0x10c5, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_a6[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0xa640, 0x0000, 0xa642, 0x0000, 0xa644, 0x0000, 0xa646,
+	0x0000, 0xa648, 0x0000, 0xa64a, 0x0000, 0xa64c, 0x0000, 0xa64e,
+	0x0000, 0xa650, 0x0000, 0xa652, 0x0000, 0xa654, 0x0000, 0xa656,
+	0x0000, 0xa658, 0x0000, 0xa65a, 0x0000, 0xa65c, 0x0000, 0xa65e,
+	0x0000, 0x0000, 0x0000, 0xa662, 0x0000, 0xa664, 0x0000, 0xa666,
+	0x0000, 0xa668, 0x0000, 0xa66a, 0x0000, 0xa66c, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0xa680, 0x0000, 0xa682, 0x0000, 0xa684, 0x0000, 0xa686,
+	0x0000, 0xa688, 0x0000, 0xa68a, 0x0000, 0xa68c, 0x0000, 0xa68e,
+	0x0000, 0xa690, 0x0000, 0xa692, 0x0000, 0xa694, 0x0000, 0xa696,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_a7[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0xa722, 0x0000, 0xa724, 0x0000, 0xa726,
+	0x0000, 0xa728, 0x0000, 0xa72a, 0x0000, 0xa72c, 0x0000, 0xa72e,
+	0x0000, 0x0000, 0x0000, 0xa732, 0x0000, 0xa734, 0x0000, 0xa736,
+	0x0000, 0xa738, 0x0000, 0xa73a, 0x0000, 0xa73c, 0x0000, 0xa73e,
+	0x0000, 0xa740, 0x0000, 0xa742, 0x0000, 0xa744, 0x0000, 0xa746,
+	0x0000, 0xa748, 0x0000, 0xa74a, 0x0000, 0xa74c, 0x0000, 0xa74e,
+	0x0000, 0xa750, 0x0000, 0xa752, 0x0000, 0xa754, 0x0000, 0xa756,
+	0x0000, 0xa758, 0x0000, 0xa75a, 0x0000, 0xa75c, 0x0000, 0xa75e,
+	0x0000, 0xa760, 0x0000, 0xa762, 0x0000, 0xa764, 0x0000, 0xa766,
+	0x0000, 0xa768, 0x0000, 0xa76a, 0x0000, 0xa76c, 0x0000, 0xa76e,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0xa779, 0x0000, 0xa77b, 0x0000, 0x0000, 0xa77e,
+	0x0000, 0xa780, 0x0000, 0xa782, 0x0000, 0xa784, 0x0000, 0xa786,
+	0x0000, 0x0000, 0x0000, 0x0000, 0xa78b, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t t2_ff[256] = {
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0xff21, 0xff22, 0xff23, 0xff24, 0xff25, 0xff26, 0xff27,
+	0xff28, 0xff29, 0xff2a, 0xff2b, 0xff2c, 0xff2d, 0xff2e, 0xff2f,
+	0xff30, 0xff31, 0xff32, 0xff33, 0xff34, 0xff35, 0xff36, 0xff37,
+	0xff38, 0xff39, 0xff3a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
+};
+
+static const wchar_t *const toplevel[256] = {
+	t2_00, t2_01, t2_02, t2_03, t2_04, t2_05,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL, t2_1d, t2_1e, t2_1f,
+	NULL, t2_21,  NULL,  NULL, t2_24,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL, t2_2c, t2_2d,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL, t2_a6, t2_a7,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL,
+	NULL,  NULL,  NULL,  NULL,  NULL,  NULL,  NULL, t2_ff,
+};
+
+/**
+ * cifs_toupper - convert a wchar_t from lower to uppercase
+ * @in: character to convert from lower to uppercase
+ *
+ * This function consults the static tables above to convert a wchar_t from
+ * lower to uppercase. In the event that there is no mapping, the original
+ * "in" character is returned.
+ */
+wchar_t
+cifs_toupper(wchar_t in)
+{
+	unsigned char idx;
+	const wchar_t *tbl;
+	wchar_t out;
+
+	/* grab upper byte */
+	idx = (in & 0xff00) >> 8;
+
+	/* find pointer to 2nd layer table */
+	tbl = toplevel[idx];
+	if (!tbl)
+		return in;
+
+	/* grab lower byte */
+	idx = in & 0xff;
+
+	/* look up character in table */
+	out = tbl[idx];
+	if (out)
+		return out;
+
+	return in;
+}
diff --git a/fs/dcache.c b/fs/dcache.c
index 4d9df3c940e6..41000305d716 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -37,6 +37,7 @@
 #include <linux/rculist_bl.h>
 #include <linux/prefetch.h>
 #include <linux/ratelimit.h>
+#include <linux/list_lru.h>
 #include "internal.h"
 #include "mount.h"
 
@@ -48,7 +49,7 @@
  *   - the dcache hash table
  * s_anon bl list spinlock protects:
  *   - the s_anon list (see __d_drop)
- * dcache_lru_lock protects:
+ * dentry->d_sb->s_dentry_lru_lock protects:
  *   - the dcache lru lists and counters
  * d_lock protects:
  *   - d_flags
@@ -63,7 +64,7 @@
  * Ordering:
  * dentry->d_inode->i_lock
  *   dentry->d_lock
- *     dcache_lru_lock
+ *     dentry->d_sb->s_dentry_lru_lock
  *     dcache_hash_bucket lock
  *     s_anon lock
  *
@@ -81,7 +82,6 @@
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
 EXPORT_SYMBOL(rename_lock);
@@ -90,8 +90,8 @@ static struct kmem_cache *dentry_cache __read_mostly;
 
 /**
  * read_seqbegin_or_lock - begin a sequence number check or locking block
- * lock: sequence lock
- * seq : sequence number to be checked
+ * @lock: sequence lock
+ * @seq : sequence number to be checked
  *
  * First try it once optimistically without taking the lock. If that fails,
  * take the lock. The sequence number is also used as a marker for deciding
@@ -103,7 +103,7 @@ static inline void read_seqbegin_or_lock(seqlock_t *lock, int *seq)
 	if (!(*seq & 1))	/* Even */
 		*seq = read_seqbegin(lock);
 	else			/* Odd */
-		write_seqlock(lock);
+		read_seqlock_excl(lock);
 }
 
 static inline int need_seqretry(seqlock_t *lock, int seq)
@@ -114,7 +114,7 @@ static inline int need_seqretry(seqlock_t *lock, int seq)
 static inline void done_seqretry(seqlock_t *lock, int seq)
 {
 	if (seq & 1)
-		write_sequnlock(lock);
+		read_sequnlock_excl(lock);
 }
 
 /*
@@ -146,23 +146,47 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
-static DEFINE_PER_CPU(unsigned int, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry_unused);
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
-static int get_nr_dentry(void)
+
+/*
+ * Here we resort to our own counters instead of using generic per-cpu counters
+ * for consistency with what the vfs inode code does. We are expected to harvest
+ * better code and performance by having our own specialized counters.
+ *
+ * Please note that the loop is done over all possible CPUs, not over all online
+ * CPUs. The reason for this is that we don't want to play games with CPUs going
+ * on and off. If one of them goes off, we will just keep their counters.
+ *
+ * glommer: See cffbc8a for details, and if you ever intend to change this,
+ * please update all vfs counters to match.
+ */
+static long get_nr_dentry(void)
 {
 	int i;
-	int sum = 0;
+	long sum = 0;
 	for_each_possible_cpu(i)
 		sum += per_cpu(nr_dentry, i);
 	return sum < 0 ? 0 : sum;
 }
 
+static long get_nr_dentry_unused(void)
+{
+	int i;
+	long sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry_unused, i);
+	return sum < 0 ? 0 : sum;
+}
+
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
 	dentry_stat.nr_dentry = get_nr_dentry();
-	return proc_dointvec(table, write, buffer, lenp, ppos);
+	dentry_stat.nr_unused = get_nr_dentry_unused();
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #endif
 
@@ -333,52 +357,96 @@ static void dentry_unlink_inode(struct dentry * dentry)
 }
 
 /*
- * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held.
+ * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
+ * is in use - which includes both the "real" per-superblock
+ * LRU list _and_ the DCACHE_SHRINK_LIST use.
+ *
+ * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
+ * on the shrink list (ie not on the superblock LRU list).
+ *
+ * The per-cpu "nr_dentry_unused" counters are updated with
+ * the DCACHE_LRU_LIST bit.
+ *
+ * These helper functions make sure we always follow the
+ * rules. d_lock must be held by the caller.
  */
-static void dentry_lru_add(struct dentry *dentry)
+#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
+static void d_lru_add(struct dentry *dentry)
 {
-	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) {
-		spin_lock(&dcache_lru_lock);
-		dentry->d_flags |= DCACHE_LRU_LIST;
-		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-		dentry->d_sb->s_nr_dentry_unused++;
-		dentry_stat.nr_unused++;
-		spin_unlock(&dcache_lru_lock);
-	}
+	D_FLAG_VERIFY(dentry, 0);
+	dentry->d_flags |= DCACHE_LRU_LIST;
+	this_cpu_inc(nr_dentry_unused);
+	WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
 }
 
-static void __dentry_lru_del(struct dentry *dentry)
+static void d_lru_del(struct dentry *dentry)
 {
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags &= ~DCACHE_LRU_LIST;
+	this_cpu_dec(nr_dentry_unused);
+	WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+}
+
+static void d_shrink_del(struct dentry *dentry)
+{
+	D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
 	list_del_init(&dentry->d_lru);
 	dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
-	dentry->d_sb->s_nr_dentry_unused--;
-	dentry_stat.nr_unused--;
+	this_cpu_dec(nr_dentry_unused);
+}
+
+static void d_shrink_add(struct dentry *dentry, struct list_head *list)
+{
+	D_FLAG_VERIFY(dentry, 0);
+	list_add(&dentry->d_lru, list);
+	dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
+	this_cpu_inc(nr_dentry_unused);
 }
 
 /*
- * Remove a dentry with references from the LRU.
+ * These can only be called under the global LRU lock, ie during the
+ * callback for freeing the LRU list. "isolate" removes it from the
+ * LRU lists entirely, while shrink_move moves it to the indicated
+ * private list.
  */
-static void dentry_lru_del(struct dentry *dentry)
+static void d_lru_isolate(struct dentry *dentry)
 {
-	if (!list_empty(&dentry->d_lru)) {
-		spin_lock(&dcache_lru_lock);
-		__dentry_lru_del(dentry);
-		spin_unlock(&dcache_lru_lock);
-	}
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags &= ~DCACHE_LRU_LIST;
+	this_cpu_dec(nr_dentry_unused);
+	list_del_init(&dentry->d_lru);
 }
 
-static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
+static void d_lru_shrink_move(struct dentry *dentry, struct list_head *list)
 {
-	spin_lock(&dcache_lru_lock);
-	if (list_empty(&dentry->d_lru)) {
-		dentry->d_flags |= DCACHE_LRU_LIST;
-		list_add_tail(&dentry->d_lru, list);
-		dentry->d_sb->s_nr_dentry_unused++;
-		dentry_stat.nr_unused++;
-	} else {
-		list_move_tail(&dentry->d_lru, list);
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags |= DCACHE_SHRINK_LIST;
+	list_move_tail(&dentry->d_lru, list);
+}
+
+/*
+ * dentry_lru_(add|del)_list) must be called with d_lock held.
+ */
+static void dentry_lru_add(struct dentry *dentry)
+{
+	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
+		d_lru_add(dentry);
+}
+
+/*
+ * Remove a dentry with references from the LRU.
+ *
+ * If we are on the shrink list, then we can get to try_prune_one_dentry() and
+ * lose our last reference through the parent walk. In this case, we need to
+ * remove ourselves from the shrink list, not the LRU.
+ */
+static void dentry_lru_del(struct dentry *dentry)
+{
+	if (dentry->d_flags & DCACHE_LRU_LIST) {
+		if (dentry->d_flags & DCACHE_SHRINK_LIST)
+			return d_shrink_del(dentry);
+		d_lru_del(dentry);
 	}
-	spin_unlock(&dcache_lru_lock);
 }
 
 /**
@@ -474,7 +542,8 @@ EXPORT_SYMBOL(d_drop);
  * If ref is non-zero, then decrement the refcount too.
  * Returns dentry requiring refcount drop, or NULL if we're done.
  */
-static inline struct dentry *dentry_kill(struct dentry *dentry)
+static inline struct dentry *
+dentry_kill(struct dentry *dentry, int unlock_on_failure)
 	__releases(dentry->d_lock)
 {
 	struct inode *inode;
@@ -483,8 +552,10 @@ static inline struct dentry *dentry_kill(struct dentry *dentry)
 	inode = dentry->d_inode;
 	if (inode && !spin_trylock(&inode->i_lock)) {
 relock:
-		spin_unlock(&dentry->d_lock);
-		cpu_relax();
+		if (unlock_on_failure) {
+			spin_unlock(&dentry->d_lock);
+			cpu_relax();
+		}
 		return dentry; /* try again with same dentry */
 	}
 	if (IS_ROOT(dentry))
@@ -567,7 +638,7 @@ repeat:
 	return;
 
 kill_it:
-	dentry = dentry_kill(dentry);
+	dentry = dentry_kill(dentry, 1);
 	if (dentry)
 		goto repeat;
 }
@@ -787,12 +858,12 @@ EXPORT_SYMBOL(d_prune_aliases);
  *
  * This may fail if locks cannot be acquired no problem, just try again.
  */
-static void try_prune_one_dentry(struct dentry *dentry)
+static struct dentry * try_prune_one_dentry(struct dentry *dentry)
 	__releases(dentry->d_lock)
 {
 	struct dentry *parent;
 
-	parent = dentry_kill(dentry);
+	parent = dentry_kill(dentry, 0);
 	/*
 	 * If dentry_kill returns NULL, we have nothing more to do.
 	 * if it returns the same dentry, trylocks failed. In either
@@ -804,17 +875,18 @@ static void try_prune_one_dentry(struct dentry *dentry)
 	 * fragmentation.
 	 */
 	if (!parent)
-		return;
+		return NULL;
 	if (parent == dentry)
-		return;
+		return dentry;
 
 	/* Prune ancestors. */
 	dentry = parent;
 	while (dentry) {
 		if (lockref_put_or_lock(&dentry->d_lockref))
-			return;
-		dentry = dentry_kill(dentry);
+			return NULL;
+		dentry = dentry_kill(dentry, 1);
 	}
+	return NULL;
 }
 
 static void shrink_dentry_list(struct list_head *list)
@@ -826,6 +898,12 @@ static void shrink_dentry_list(struct list_head *list)
 		dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
 		if (&dentry->d_lru == list)
 			break; /* empty */
+
+		/*
+		 * Get the dentry lock, and re-verify that the dentry is
+		 * this on the shrinking list. If it is, we know that
+		 * DCACHE_SHRINK_LIST and DCACHE_LRU_LIST are set.
+		 */
 		spin_lock(&dentry->d_lock);
 		if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
 			spin_unlock(&dentry->d_lock);
@@ -833,76 +911,146 @@ static void shrink_dentry_list(struct list_head *list)
 		}
 
 		/*
+		 * The dispose list is isolated and dentries are not accounted
+		 * to the LRU here, so we can simply remove it from the list
+		 * here regardless of whether it is referenced or not.
+		 */
+		d_shrink_del(dentry);
+
+		/*
 		 * We found an inuse dentry which was not removed from
-		 * the LRU because of laziness during lookup.  Do not free
-		 * it - just keep it off the LRU list.
+		 * the LRU because of laziness during lookup. Do not free it.
 		 */
 		if (dentry->d_lockref.count) {
-			dentry_lru_del(dentry);
 			spin_unlock(&dentry->d_lock);
 			continue;
 		}
-
 		rcu_read_unlock();
 
-		try_prune_one_dentry(dentry);
+		/*
+		 * If 'try_to_prune()' returns a dentry, it will
+		 * be the same one we passed in, and d_lock will
+		 * have been held the whole time, so it will not
+		 * have been added to any other lists. We failed
+		 * to get the inode lock.
+		 *
+		 * We just add it back to the shrink list.
+		 */
+		dentry = try_prune_one_dentry(dentry);
 
 		rcu_read_lock();
+		if (dentry) {
+			d_shrink_add(dentry, list);
+			spin_unlock(&dentry->d_lock);
+		}
 	}
 	rcu_read_unlock();
 }
 
+static enum lru_status
+dentry_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *freeable = arg;
+	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
+
+
+	/*
+	 * we are inverting the lru lock/dentry->d_lock here,
+	 * so use a trylock. If we fail to get the lock, just skip
+	 * it
+	 */
+	if (!spin_trylock(&dentry->d_lock))
+		return LRU_SKIP;
+
+	/*
+	 * Referenced dentries are still in use. If they have active
+	 * counts, just remove them from the LRU. Otherwise give them
+	 * another pass through the LRU.
+	 */
+	if (dentry->d_lockref.count) {
+		d_lru_isolate(dentry);
+		spin_unlock(&dentry->d_lock);
+		return LRU_REMOVED;
+	}
+
+	if (dentry->d_flags & DCACHE_REFERENCED) {
+		dentry->d_flags &= ~DCACHE_REFERENCED;
+		spin_unlock(&dentry->d_lock);
+
+		/*
+		 * The list move itself will be made by the common LRU code. At
+		 * this point, we've dropped the dentry->d_lock but keep the
+		 * lru lock. This is safe to do, since every list movement is
+		 * protected by the lru lock even if both locks are held.
+		 *
+		 * This is guaranteed by the fact that all LRU management
+		 * functions are intermediated by the LRU API calls like
+		 * list_lru_add and list_lru_del. List movement in this file
+		 * only ever occur through this functions or through callbacks
+		 * like this one, that are called from the LRU API.
+		 *
+		 * The only exceptions to this are functions like
+		 * shrink_dentry_list, and code that first checks for the
+		 * DCACHE_SHRINK_LIST flag.  Those are guaranteed to be
+		 * operating only with stack provided lists after they are
+		 * properly isolated from the main list.  It is thus, always a
+		 * local access.
+		 */
+		return LRU_ROTATE;
+	}
+
+	d_lru_shrink_move(dentry, freeable);
+	spin_unlock(&dentry->d_lock);
+
+	return LRU_REMOVED;
+}
+
 /**
  * prune_dcache_sb - shrink the dcache
  * @sb: superblock
- * @count: number of entries to try to free
+ * @nr_to_scan : number of entries to try to free
+ * @nid: which node to scan for freeable entities
  *
- * Attempt to shrink the superblock dcache LRU by @count entries. This is
+ * Attempt to shrink the superblock dcache LRU by @nr_to_scan entries. This is
  * done when we need more memory an called from the superblock shrinker
  * function.
  *
  * This function may fail to free any resources if all the dentries are in
  * use.
  */
-void prune_dcache_sb(struct super_block *sb, int count)
+long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
+		     int nid)
 {
-	struct dentry *dentry;
-	LIST_HEAD(referenced);
-	LIST_HEAD(tmp);
+	LIST_HEAD(dispose);
+	long freed;
 
-relock:
-	spin_lock(&dcache_lru_lock);
-	while (!list_empty(&sb->s_dentry_lru)) {
-		dentry = list_entry(sb->s_dentry_lru.prev,
-				struct dentry, d_lru);
-		BUG_ON(dentry->d_sb != sb);
-
-		if (!spin_trylock(&dentry->d_lock)) {
-			spin_unlock(&dcache_lru_lock);
-			cpu_relax();
-			goto relock;
-		}
+	freed = list_lru_walk_node(&sb->s_dentry_lru, nid, dentry_lru_isolate,
+				       &dispose, &nr_to_scan);
+	shrink_dentry_list(&dispose);
+	return freed;
+}
 
-		if (dentry->d_flags & DCACHE_REFERENCED) {
-			dentry->d_flags &= ~DCACHE_REFERENCED;
-			list_move(&dentry->d_lru, &referenced);
-			spin_unlock(&dentry->d_lock);
-		} else {
-			list_move_tail(&dentry->d_lru, &tmp);
-			dentry->d_flags |= DCACHE_SHRINK_LIST;
-			spin_unlock(&dentry->d_lock);
-			if (!--count)
-				break;
-		}
-		cond_resched_lock(&dcache_lru_lock);
-	}
-	if (!list_empty(&referenced))
-		list_splice(&referenced, &sb->s_dentry_lru);
-	spin_unlock(&dcache_lru_lock);
+static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
+						spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *freeable = arg;
+	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
+
+	/*
+	 * we are inverting the lru lock/dentry->d_lock here,
+	 * so use a trylock. If we fail to get the lock, just skip
+	 * it
+	 */
+	if (!spin_trylock(&dentry->d_lock))
+		return LRU_SKIP;
+
+	d_lru_shrink_move(dentry, freeable);
+	spin_unlock(&dentry->d_lock);
 
-	shrink_dentry_list(&tmp);
+	return LRU_REMOVED;
 }
 
+
 /**
  * shrink_dcache_sb - shrink dcache for a superblock
  * @sb: superblock
@@ -912,16 +1060,17 @@ relock:
  */
 void shrink_dcache_sb(struct super_block *sb)
 {
-	LIST_HEAD(tmp);
+	long freed;
 
-	spin_lock(&dcache_lru_lock);
-	while (!list_empty(&sb->s_dentry_lru)) {
-		list_splice_init(&sb->s_dentry_lru, &tmp);
-		spin_unlock(&dcache_lru_lock);
-		shrink_dentry_list(&tmp);
-		spin_lock(&dcache_lru_lock);
-	}
-	spin_unlock(&dcache_lru_lock);
+	do {
+		LIST_HEAD(dispose);
+
+		freed = list_lru_walk(&sb->s_dentry_lru,
+			dentry_lru_isolate_shrink, &dispose, UINT_MAX);
+
+		this_cpu_sub(nr_dentry_unused, freed);
+		shrink_dentry_list(&dispose);
+	} while (freed > 0);
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
 
@@ -1283,8 +1432,13 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
 	if (dentry->d_lockref.count) {
 		dentry_lru_del(dentry);
 	} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
-		dentry_lru_move_list(dentry, &data->dispose);
-		dentry->d_flags |= DCACHE_SHRINK_LIST;
+		/*
+		 * We can't use d_lru_shrink_move() because we
+		 * need to get the global LRU lock and do the
+		 * LRU accounting.
+		 */
+		d_lru_del(dentry);
+		d_shrink_add(dentry, &data->dispose);
 		data->found++;
 		ret = D_WALK_NORETRY;
 	}
@@ -2673,9 +2827,9 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen)
 
 /**
  * prepend_name - prepend a pathname in front of current buffer pointer
- * buffer: buffer pointer
- * buflen: allocated length of the buffer
- * name:   name string and length qstr structure
+ * @buffer: buffer pointer
+ * @buflen: allocated length of the buffer
+ * @name:   name string and length qstr structure
  *
  * With RCU path tracing, it may race with d_move(). Use ACCESS_ONCE() to
  * make sure that either the old or the new name pointer and length are
@@ -2713,14 +2867,15 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name)
  * @buffer: pointer to the end of the buffer
  * @buflen: pointer to buffer length
  *
- * The function tries to write out the pathname without taking any lock other
- * than the RCU read lock to make sure that dentries won't go away. It only
- * checks the sequence number of the global rename_lock as any change in the
- * dentry's d_seq will be preceded by changes in the rename_lock sequence
- * number. If the sequence number had been change, it will restart the whole
- * pathname back-tracing sequence again. It performs a total of 3 trials of
- * lockless back-tracing sequences before falling back to take the
- * rename_lock.
+ * The function will first try to write out the pathname without taking any
+ * lock other than the RCU read lock to make sure that dentries won't go away.
+ * It only checks the sequence number of the global rename_lock as any change
+ * in the dentry's d_seq will be preceded by changes in the rename_lock
+ * sequence number. If the sequence number had been changed, it will restart
+ * the whole pathname back-tracing sequence again by taking the rename_lock.
+ * In this case, there is no need to take the RCU read lock as the recursive
+ * parent pointer references will keep the dentry chain alive as long as no
+ * rename operation is performed.
  */
 static int prepend_path(const struct path *path,
 			const struct path *root,
@@ -2868,6 +3023,16 @@ static int prepend_unreachable(char **buffer, int *buflen)
 	return prepend(buffer, buflen, "(unreachable)", 13);
 }
 
+static void get_fs_root_rcu(struct fs_struct *fs, struct path *root)
+{
+	unsigned seq;
+
+	do {
+		seq = read_seqcount_begin(&fs->seq);
+		*root = fs->root;
+	} while (read_seqcount_retry(&fs->seq, seq));
+}
+
 /**
  * d_path - return the path of a dentry
  * @path: path to report
@@ -2900,13 +3065,15 @@ char *d_path(const struct path *path, char *buf, int buflen)
 	if (path->dentry->d_op && path->dentry->d_op->d_dname)
 		return path->dentry->d_op->d_dname(path->dentry, buf, buflen);
 
-	get_fs_root(current->fs, &root);
+	rcu_read_lock();
+	get_fs_root_rcu(current->fs, &root);
 	br_read_lock(&vfsmount_lock);
 	error = path_with_deleted(path, &root, &res, &buflen);
 	br_read_unlock(&vfsmount_lock);
+	rcu_read_unlock();
+
 	if (error < 0)
 		res = ERR_PTR(error);
-	path_put(&root);
 	return res;
 }
 EXPORT_SYMBOL(d_path);
@@ -3014,6 +3181,18 @@ Elong:
 	return ERR_PTR(-ENAMETOOLONG);
 }
 
+static void get_fs_root_and_pwd_rcu(struct fs_struct *fs, struct path *root,
+				    struct path *pwd)
+{
+	unsigned seq;
+
+	do {
+		seq = read_seqcount_begin(&fs->seq);
+		*root = fs->root;
+		*pwd = fs->pwd;
+	} while (read_seqcount_retry(&fs->seq, seq));
+}
+
 /*
  * NOTE! The user-level library version returns a
  * character pointer. The kernel system call just
@@ -3036,23 +3215,25 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 {
 	int error;
 	struct path pwd, root;
-	char *page = (char *) __get_free_page(GFP_USER);
+	char *page = __getname();
 
 	if (!page)
 		return -ENOMEM;
 
-	get_fs_root_and_pwd(current->fs, &root, &pwd);
+	rcu_read_lock();
+	get_fs_root_and_pwd_rcu(current->fs, &root, &pwd);
 
 	error = -ENOENT;
 	br_read_lock(&vfsmount_lock);
 	if (!d_unlinked(pwd.dentry)) {
 		unsigned long len;
-		char *cwd = page + PAGE_SIZE;
-		int buflen = PAGE_SIZE;
+		char *cwd = page + PATH_MAX;
+		int buflen = PATH_MAX;
 
 		prepend(&cwd, &buflen, "\0", 1);
 		error = prepend_path(&pwd, &root, &cwd, &buflen);
 		br_read_unlock(&vfsmount_lock);
+		rcu_read_unlock();
 
 		if (error < 0)
 			goto out;
@@ -3065,7 +3246,7 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 		}
 
 		error = -ERANGE;
-		len = PAGE_SIZE + page - cwd;
+		len = PATH_MAX + page - cwd;
 		if (len <= size) {
 			error = len;
 			if (copy_to_user(buf, cwd, len))
@@ -3073,12 +3254,11 @@ SYSCALL_DEFINE2(getcwd, char __user *, buf, unsigned long, size)
 		}
 	} else {
 		br_read_unlock(&vfsmount_lock);
+		rcu_read_unlock();
 	}
 
 out:
-	path_put(&pwd);
-	path_put(&root);
-	free_page((unsigned long) page);
+	__putname(page);
 	return error;
 }
 
diff --git a/fs/drop_caches.c b/fs/drop_caches.c
index c00e055b6282..9fd702f5bfb2 100644
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -44,6 +44,7 @@ static void drop_slab(void)
 		.gfp_mask = GFP_KERNEL,
 	};
 
+	nodes_setall(shrink.nodes_to_scan);
 	do {
 		nr_objects = shrink_slab(&shrink, 1000, 1000);
 	} while (nr_objects > 10);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 2ec8eb1ab269..a52a5d23c30b 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -861,7 +861,7 @@ static int exofs_writepage(struct page *page, struct writeback_control *wbc)
 static void _write_failed(struct inode *inode, loff_t to)
 {
 	if (to > inode->i_size)
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 }
 
 int exofs_write_begin(struct file *file, struct address_space *mapping,
diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c
index 0a87bb10998d..c260de6d7b6d 100644
--- a/fs/ext2/inode.c
+++ b/fs/ext2/inode.c
@@ -58,7 +58,7 @@ static void ext2_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		ext2_truncate_blocks(inode, inode->i_size);
 	}
 }
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 2d1bdbe78c04..3981ff783950 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -931,13 +931,15 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
 	struct ext4_inode_info *ei;
 	struct list_head *cur, *tmp;
 	LIST_HEAD(skipped);
-	int ret, nr_shrunk = 0;
+	int nr_shrunk = 0;
 	int retried = 0, skip_precached = 1, nr_skipped = 0;
 
 	spin_lock(&sbi->s_es_lru_lock);
 
 retry:
 	list_for_each_safe(cur, tmp, &sbi->s_es_lru) {
+		int shrunk;
+
 		/*
 		 * If we have already reclaimed all extents from extent
 		 * status tree, just stop the loop immediately.
@@ -964,13 +966,13 @@ retry:
 			continue;
 
 		write_lock(&ei->i_es_lock);
-		ret = __es_try_to_reclaim_extents(ei, nr_to_scan);
+		shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan);
 		if (ei->i_es_lru_nr == 0)
 			list_del_init(&ei->i_es_lru);
 		write_unlock(&ei->i_es_lock);
 
-		nr_shrunk += ret;
-		nr_to_scan -= ret;
+		nr_shrunk += shrunk;
+		nr_to_scan -= shrunk;
 		if (nr_to_scan == 0)
 			break;
 	}
@@ -1007,7 +1009,20 @@ retry:
 	return nr_shrunk;
 }
 
-static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long ext4_es_count(struct shrinker *shrink,
+				   struct shrink_control *sc)
+{
+	unsigned long nr;
+	struct ext4_sb_info *sbi;
+
+	sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
+	nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+	trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr);
+	return nr;
+}
+
+static unsigned long ext4_es_scan(struct shrinker *shrink,
+				  struct shrink_control *sc)
 {
 	struct ext4_sb_info *sbi = container_of(shrink,
 					struct ext4_sb_info, s_es_shrinker);
@@ -1022,9 +1037,8 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 
 	nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
 
-	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
 	trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
-	return ret;
+	return nr_shrunk;
 }
 
 void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
@@ -1032,7 +1046,8 @@ void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
 	INIT_LIST_HEAD(&sbi->s_es_lru);
 	spin_lock_init(&sbi->s_es_lru_lock);
 	sbi->s_es_last_sorted = 0;
-	sbi->s_es_shrinker.shrink = ext4_es_shrink;
+	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
+	sbi->s_es_shrinker.count_objects = ext4_es_count;
 	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
 	register_shrinker(&sbi->s_es_shrinker);
 }
@@ -1076,7 +1091,7 @@ static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei,
 	struct ext4_es_tree *tree = &ei->i_es_tree;
 	struct rb_node *node;
 	struct extent_status *es;
-	int nr_shrunk = 0;
+	unsigned long nr_shrunk = 0;
 	static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL,
 				      DEFAULT_RATELIMIT_BURST);
 
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index c79fd7dabe79..0d424d7ac02b 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -4587,7 +4587,6 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 
 	if (attr->ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) {
 		handle_t *handle;
-		loff_t oldsize = inode->i_size;
 
 		if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
 			struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
@@ -4650,7 +4649,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 		 * Truncate pagecache after we've waited for commit
 		 * in data=journal mode to make pages freeable.
 		 */
-		truncate_pagecache(inode, oldsize, inode->i_size);
+			truncate_pagecache(inode, inode->i_size);
 	}
 	/*
 	 * We want to call ext4_truncate() even if attr->ia_size ==
diff --git a/fs/fat/inode.c b/fs/fat/inode.c
index 11b51bb55b42..0062da21dd8b 100644
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -147,7 +147,7 @@ static void fat_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		fat_truncate_blocks(inode, inode->i_size);
 	}
 }
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 30f6f27d5a59..9f4935b8f208 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -69,7 +69,7 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
 
-	if (strcmp(sb->s_type->name, "bdev") == 0)
+	if (sb_is_blkdev_sb(sb))
 		return inode->i_mapping->backing_dev_info;
 
 	return sb->s_bdi;
@@ -251,11 +251,13 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 		if (work->older_than_this &&
 		    inode_dirtied_after(inode, *work->older_than_this))
 			break;
+		list_move(&inode->i_wb_list, &tmp);
+		moved++;
+		if (sb_is_blkdev_sb(inode->i_sb))
+			continue;
 		if (sb && sb != inode->i_sb)
 			do_sb_sort = 1;
 		sb = inode->i_sb;
-		list_move(&inode->i_wb_list, &tmp);
-		moved++;
 	}
 
 	/* just one sb in list, splice to dispatch_queue and we're done */
diff --git a/fs/fscache/cookie.c b/fs/fscache/cookie.c
index 318e8433527c..b2a86e324aac 100644
--- a/fs/fscache/cookie.c
+++ b/fs/fscache/cookie.c
@@ -586,7 +586,8 @@ int __fscache_check_consistency(struct fscache_cookie *cookie)
 
 	fscache_operation_init(op, NULL, NULL);
 	op->flags = FSCACHE_OP_MYTHREAD |
-		(1 << FSCACHE_OP_WAITING);
+		(1 << FSCACHE_OP_WAITING) |
+		(1 << FSCACHE_OP_UNUSE_COOKIE);
 
 	spin_lock(&cookie->lock);
 
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 3ac91086f41f..62b43b577bfc 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -1678,7 +1678,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 	 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
 	 */
 	if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
-		truncate_pagecache(inode, oldsize, outarg.attr.size);
+		truncate_pagecache(inode, outarg.attr.size);
 		invalidate_inode_pages2(inode->i_mapping);
 	}
 
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 84434594e80e..a8ce6dab60a0 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -218,7 +218,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 		bool inval = false;
 
 		if (oldsize != attr->size) {
-			truncate_pagecache(inode, oldsize, attr->size);
+			truncate_pagecache(inode, attr->size);
 			inval = true;
 		} else if (fc->auto_inval_data) {
 			struct timespec new_mtime = {
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index 5e2f56fccf6b..62a65fc448dc 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1016,7 +1016,7 @@ static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize
 		chunk = oldsize - newsize;
 		if (chunk > max_chunk)
 			chunk = max_chunk;
-		truncate_pagecache(inode, oldsize, oldsize - chunk);
+		truncate_pagecache(inode, oldsize - chunk);
 		oldsize -= chunk;
 		gfs2_trans_end(sdp);
 		error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
@@ -1067,7 +1067,7 @@ static int trunc_start(struct inode *inode, u64 oldsize, u64 newsize)
 	if (journaled)
 		error = gfs2_journaled_truncate(inode, oldsize, newsize);
 	else
-		truncate_pagecache(inode, oldsize, newsize);
+		truncate_pagecache(inode, newsize);
 
 	if (error) {
 		brelse(dibh);
diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 722329cac98f..c2f41b4d00b9 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1427,21 +1427,22 @@ __acquires(&lru_lock)
  * gfs2_dispose_glock_lru() above.
  */
 
-static void gfs2_scan_glock_lru(int nr)
+static long gfs2_scan_glock_lru(int nr)
 {
 	struct gfs2_glock *gl;
 	LIST_HEAD(skipped);
 	LIST_HEAD(dispose);
+	long freed = 0;
 
 	spin_lock(&lru_lock);
-	while(nr && !list_empty(&lru_list)) {
+	while ((nr-- >= 0) && !list_empty(&lru_list)) {
 		gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
 
 		/* Test for being demotable */
 		if (!test_and_set_bit(GLF_LOCK, &gl->gl_flags)) {
 			list_move(&gl->gl_lru, &dispose);
 			atomic_dec(&lru_count);
-			nr--;
+			freed++;
 			continue;
 		}
 
@@ -1451,23 +1452,28 @@ static void gfs2_scan_glock_lru(int nr)
 	if (!list_empty(&dispose))
 		gfs2_dispose_glock_lru(&dispose);
 	spin_unlock(&lru_lock);
+
+	return freed;
 }
 
-static int gfs2_shrink_glock_memory(struct shrinker *shrink,
-				    struct shrink_control *sc)
+static unsigned long gfs2_glock_shrink_scan(struct shrinker *shrink,
+					    struct shrink_control *sc)
 {
-	if (sc->nr_to_scan) {
-		if (!(sc->gfp_mask & __GFP_FS))
-			return -1;
-		gfs2_scan_glock_lru(sc->nr_to_scan);
-	}
+	if (!(sc->gfp_mask & __GFP_FS))
+		return SHRINK_STOP;
+	return gfs2_scan_glock_lru(sc->nr_to_scan);
+}
 
-	return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;
+static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink,
+					     struct shrink_control *sc)
+{
+	return vfs_pressure_ratio(atomic_read(&lru_count));
 }
 
 static struct shrinker glock_shrinker = {
-	.shrink = gfs2_shrink_glock_memory,
 	.seeks = DEFAULT_SEEKS,
+	.count_objects = gfs2_glock_shrink_count,
+	.scan_objects = gfs2_glock_shrink_scan,
 };
 
 /**
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 64915eeae5a7..ced3257f06e8 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -694,8 +694,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 
 	mark_inode_dirty(inode);
 	d_instantiate(dentry, inode);
-	if (file)
+	if (file) {
+		*opened |= FILE_CREATED;
 		error = finish_open(file, dentry, gfs2_open_common, opened);
+	}
 	gfs2_glock_dq_uninit(ghs);
 	gfs2_glock_dq_uninit(ghs + 1);
 	return error;
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index 7b0f5043cf24..351586e24e30 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -32,7 +32,8 @@
 struct workqueue_struct *gfs2_control_wq;
 
 static struct shrinker qd_shrinker = {
-	.shrink = gfs2_shrink_qd_memory,
+	.count_objects = gfs2_qd_shrink_count,
+	.scan_objects = gfs2_qd_shrink_scan,
 	.seeks = DEFAULT_SEEKS,
 };
 
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 3768c2f40e43..db441359ee8c 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -75,17 +75,16 @@ static LIST_HEAD(qd_lru_list);
 static atomic_t qd_lru_count = ATOMIC_INIT(0);
 static DEFINE_SPINLOCK(qd_lru_lock);
 
-int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc)
+unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
+				  struct shrink_control *sc)
 {
 	struct gfs2_quota_data *qd;
 	struct gfs2_sbd *sdp;
 	int nr_to_scan = sc->nr_to_scan;
-
-	if (nr_to_scan == 0)
-		goto out;
+	long freed = 0;
 
 	if (!(sc->gfp_mask & __GFP_FS))
-		return -1;
+		return SHRINK_STOP;
 
 	spin_lock(&qd_lru_lock);
 	while (nr_to_scan && !list_empty(&qd_lru_list)) {
@@ -110,11 +109,16 @@ int gfs2_shrink_qd_memory(struct shrinker *shrink, struct shrink_control *sc)
 		kmem_cache_free(gfs2_quotad_cachep, qd);
 		spin_lock(&qd_lru_lock);
 		nr_to_scan--;
+		freed++;
 	}
 	spin_unlock(&qd_lru_lock);
+	return freed;
+}
 
-out:
-	return (atomic_read(&qd_lru_count) * sysctl_vfs_cache_pressure) / 100;
+unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
+				   struct shrink_control *sc)
+{
+	return vfs_pressure_ratio(atomic_read(&qd_lru_count));
 }
 
 static u64 qd2index(struct gfs2_quota_data *qd)
diff --git a/fs/gfs2/quota.h b/fs/gfs2/quota.h
index 4f5e6e44ed83..0f64d9deb1b0 100644
--- a/fs/gfs2/quota.h
+++ b/fs/gfs2/quota.h
@@ -53,8 +53,10 @@ static inline int gfs2_quota_lock_check(struct gfs2_inode *ip)
 	return ret;
 }
 
-extern int gfs2_shrink_qd_memory(struct shrinker *shrink,
-				 struct shrink_control *sc);
+extern unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
+					  struct shrink_control *sc);
+extern unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
+					 struct shrink_control *sc);
 extern const struct quotactl_ops gfs2_quotactl_ops;
 
 #endif /* __QUOTA_DOT_H__ */
diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c
index f9299d8a64e3..380ab31b5e0f 100644
--- a/fs/hfs/inode.c
+++ b/fs/hfs/inode.c
@@ -41,7 +41,7 @@ static void hfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		hfs_file_truncate(inode);
 	}
 }
diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c
index 4d2edaea891c..37213d075f3c 100644
--- a/fs/hfsplus/inode.c
+++ b/fs/hfsplus/inode.c
@@ -36,7 +36,7 @@ static void hfsplus_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		hfsplus_file_truncate(inode);
 	}
 }
diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c
index 4e9dabcf1f4c..67c1a61e0955 100644
--- a/fs/hpfs/file.c
+++ b/fs/hpfs/file.c
@@ -138,7 +138,7 @@ static void hpfs_write_failed(struct address_space *mapping, loff_t to)
 	hpfs_lock(inode->i_sb);
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		hpfs_truncate(inode);
 	}
 
diff --git a/fs/inode.c b/fs/inode.c
index 93a0625b46e4..b33ba8e021cc 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -17,6 +17,7 @@
 #include <linux/prefetch.h>
 #include <linux/buffer_head.h> /* for inode_has_buffers */
 #include <linux/ratelimit.h>
+#include <linux/list_lru.h>
 #include "internal.h"
 
 /*
@@ -24,7 +25,7 @@
  *
  * inode->i_lock protects:
  *   inode->i_state, inode->i_hash, __iget()
- * inode->i_sb->s_inode_lru_lock protects:
+ * Inode LRU list locks protect:
  *   inode->i_sb->s_inode_lru, inode->i_lru
  * inode_sb_list_lock protects:
  *   sb->s_inodes, inode->i_sb_list
@@ -37,7 +38,7 @@
  *
  * inode_sb_list_lock
  *   inode->i_lock
- *     inode->i_sb->s_inode_lru_lock
+ *     Inode LRU list locks
  *
  * bdi->wb.list_lock
  *   inode->i_lock
@@ -70,33 +71,33 @@ EXPORT_SYMBOL(empty_aops);
  */
 struct inodes_stat_t inodes_stat;
 
-static DEFINE_PER_CPU(unsigned int, nr_inodes);
-static DEFINE_PER_CPU(unsigned int, nr_unused);
+static DEFINE_PER_CPU(unsigned long, nr_inodes);
+static DEFINE_PER_CPU(unsigned long, nr_unused);
 
 static struct kmem_cache *inode_cachep __read_mostly;
 
-static int get_nr_inodes(void)
+static long get_nr_inodes(void)
 {
 	int i;
-	int sum = 0;
+	long sum = 0;
 	for_each_possible_cpu(i)
 		sum += per_cpu(nr_inodes, i);
 	return sum < 0 ? 0 : sum;
 }
 
-static inline int get_nr_inodes_unused(void)
+static inline long get_nr_inodes_unused(void)
 {
 	int i;
-	int sum = 0;
+	long sum = 0;
 	for_each_possible_cpu(i)
 		sum += per_cpu(nr_unused, i);
 	return sum < 0 ? 0 : sum;
 }
 
-int get_nr_dirty_inodes(void)
+long get_nr_dirty_inodes(void)
 {
 	/* not actually dirty inodes, but a wild approximation */
-	int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
+	long nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
 	return nr_dirty > 0 ? nr_dirty : 0;
 }
 
@@ -109,7 +110,7 @@ int proc_nr_inodes(ctl_table *table, int write,
 {
 	inodes_stat.nr_inodes = get_nr_inodes();
 	inodes_stat.nr_unused = get_nr_inodes_unused();
-	return proc_dointvec(table, write, buffer, lenp, ppos);
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #endif
 
@@ -401,13 +402,8 @@ EXPORT_SYMBOL(ihold);
 
 static void inode_lru_list_add(struct inode *inode)
 {
-	spin_lock(&inode->i_sb->s_inode_lru_lock);
-	if (list_empty(&inode->i_lru)) {
-		list_add(&inode->i_lru, &inode->i_sb->s_inode_lru);
-		inode->i_sb->s_nr_inodes_unused++;
+	if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
 		this_cpu_inc(nr_unused);
-	}
-	spin_unlock(&inode->i_sb->s_inode_lru_lock);
 }
 
 /*
@@ -425,13 +421,9 @@ void inode_add_lru(struct inode *inode)
 
 static void inode_lru_list_del(struct inode *inode)
 {
-	spin_lock(&inode->i_sb->s_inode_lru_lock);
-	if (!list_empty(&inode->i_lru)) {
-		list_del_init(&inode->i_lru);
-		inode->i_sb->s_nr_inodes_unused--;
+
+	if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
 		this_cpu_dec(nr_unused);
-	}
-	spin_unlock(&inode->i_sb->s_inode_lru_lock);
 }
 
 /**
@@ -675,24 +667,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 	return busy;
 }
 
-static int can_unuse(struct inode *inode)
-{
-	if (inode->i_state & ~I_REFERENCED)
-		return 0;
-	if (inode_has_buffers(inode))
-		return 0;
-	if (atomic_read(&inode->i_count))
-		return 0;
-	if (inode->i_data.nrpages)
-		return 0;
-	return 1;
-}
-
 /*
- * Walk the superblock inode LRU for freeable inodes and attempt to free them.
- * This is called from the superblock shrinker function with a number of inodes
- * to trim from the LRU. Inodes to be freed are moved to a temporary list and
- * then are freed outside inode_lock by dispose_list().
+ * Isolate the inode from the LRU in preparation for freeing it.
  *
  * Any inodes which are pinned purely because of attached pagecache have their
  * pagecache removed.  If the inode has metadata buffers attached to
@@ -706,89 +682,82 @@ static int can_unuse(struct inode *inode)
  * LRU does not have strict ordering. Hence we don't want to reclaim inodes
  * with this flag set because they are the inodes that are out of order.
  */
-void prune_icache_sb(struct super_block *sb, int nr_to_scan)
+static enum lru_status
+inode_lru_isolate(struct list_head *item, spinlock_t *lru_lock, void *arg)
 {
-	LIST_HEAD(freeable);
-	int nr_scanned;
-	unsigned long reap = 0;
+	struct list_head *freeable = arg;
+	struct inode	*inode = container_of(item, struct inode, i_lru);
 
-	spin_lock(&sb->s_inode_lru_lock);
-	for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
-		struct inode *inode;
+	/*
+	 * we are inverting the lru lock/inode->i_lock here, so use a trylock.
+	 * If we fail to get the lock, just skip it.
+	 */
+	if (!spin_trylock(&inode->i_lock))
+		return LRU_SKIP;
 
-		if (list_empty(&sb->s_inode_lru))
-			break;
+	/*
+	 * Referenced or dirty inodes are still in use. Give them another pass
+	 * through the LRU as we canot reclaim them now.
+	 */
+	if (atomic_read(&inode->i_count) ||
+	    (inode->i_state & ~I_REFERENCED)) {
+		list_del_init(&inode->i_lru);
+		spin_unlock(&inode->i_lock);
+		this_cpu_dec(nr_unused);
+		return LRU_REMOVED;
+	}
 
-		inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru);
+	/* recently referenced inodes get one more pass */
+	if (inode->i_state & I_REFERENCED) {
+		inode->i_state &= ~I_REFERENCED;
+		spin_unlock(&inode->i_lock);
+		return LRU_ROTATE;
+	}
 
-		/*
-		 * we are inverting the sb->s_inode_lru_lock/inode->i_lock here,
-		 * so use a trylock. If we fail to get the lock, just move the
-		 * inode to the back of the list so we don't spin on it.
-		 */
-		if (!spin_trylock(&inode->i_lock)) {
-			list_move(&inode->i_lru, &sb->s_inode_lru);
-			continue;
+	if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(lru_lock);
+		if (remove_inode_buffers(inode)) {
+			unsigned long reap;
+			reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
+			if (current_is_kswapd())
+				__count_vm_events(KSWAPD_INODESTEAL, reap);
+			else
+				__count_vm_events(PGINODESTEAL, reap);
+			if (current->reclaim_state)
+				current->reclaim_state->reclaimed_slab += reap;
 		}
+		iput(inode);
+		spin_lock(lru_lock);
+		return LRU_RETRY;
+	}
 
-		/*
-		 * Referenced or dirty inodes are still in use. Give them
-		 * another pass through the LRU as we canot reclaim them now.
-		 */
-		if (atomic_read(&inode->i_count) ||
-		    (inode->i_state & ~I_REFERENCED)) {
-			list_del_init(&inode->i_lru);
-			spin_unlock(&inode->i_lock);
-			sb->s_nr_inodes_unused--;
-			this_cpu_dec(nr_unused);
-			continue;
-		}
+	WARN_ON(inode->i_state & I_NEW);
+	inode->i_state |= I_FREEING;
+	list_move(&inode->i_lru, freeable);
+	spin_unlock(&inode->i_lock);
 
-		/* recently referenced inodes get one more pass */
-		if (inode->i_state & I_REFERENCED) {
-			inode->i_state &= ~I_REFERENCED;
-			list_move(&inode->i_lru, &sb->s_inode_lru);
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
-			__iget(inode);
-			spin_unlock(&inode->i_lock);
-			spin_unlock(&sb->s_inode_lru_lock);
-			if (remove_inode_buffers(inode))
-				reap += invalidate_mapping_pages(&inode->i_data,
-								0, -1);
-			iput(inode);
-			spin_lock(&sb->s_inode_lru_lock);
-
-			if (inode != list_entry(sb->s_inode_lru.next,
-						struct inode, i_lru))
-				continue;	/* wrong inode or list_empty */
-			/* avoid lock inversions with trylock */
-			if (!spin_trylock(&inode->i_lock))
-				continue;
-			if (!can_unuse(inode)) {
-				spin_unlock(&inode->i_lock);
-				continue;
-			}
-		}
-		WARN_ON(inode->i_state & I_NEW);
-		inode->i_state |= I_FREEING;
-		spin_unlock(&inode->i_lock);
+	this_cpu_dec(nr_unused);
+	return LRU_REMOVED;
+}
 
-		list_move(&inode->i_lru, &freeable);
-		sb->s_nr_inodes_unused--;
-		this_cpu_dec(nr_unused);
-	}
-	if (current_is_kswapd())
-		__count_vm_events(KSWAPD_INODESTEAL, reap);
-	else
-		__count_vm_events(PGINODESTEAL, reap);
-	spin_unlock(&sb->s_inode_lru_lock);
-	if (current->reclaim_state)
-		current->reclaim_state->reclaimed_slab += reap;
+/*
+ * Walk the superblock inode LRU for freeable inodes and attempt to free them.
+ * This is called from the superblock shrinker function with a number of inodes
+ * to trim from the LRU. Inodes to be freed are moved to a temporary list and
+ * then are freed outside inode_lock by dispose_list().
+ */
+long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
+		     int nid)
+{
+	LIST_HEAD(freeable);
+	long freed;
 
+	freed = list_lru_walk_node(&sb->s_inode_lru, nid, inode_lru_isolate,
+				       &freeable, &nr_to_scan);
 	dispose_list(&freeable);
+	return freed;
 }
 
 static void __wait_on_freeing_inode(struct inode *inode);
diff --git a/fs/internal.h b/fs/internal.h
index 2be46ea5dd0b..513e0d859a6c 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -114,6 +114,8 @@ extern int open_check_o_direct(struct file *f);
  * inode.c
  */
 extern spinlock_t inode_sb_list_lock;
+extern long prune_icache_sb(struct super_block *sb, unsigned long nr_to_scan,
+			    int nid);
 extern void inode_add_lru(struct inode *inode);
 
 /*
@@ -121,7 +123,7 @@ extern void inode_add_lru(struct inode *inode);
  */
 extern void inode_wb_list_del(struct inode *inode);
 
-extern int get_nr_dirty_inodes(void);
+extern long get_nr_dirty_inodes(void);
 extern void evict_inodes(struct super_block *);
 extern int invalidate_inodes(struct super_block *, bool);
 
@@ -130,6 +132,8 @@ extern int invalidate_inodes(struct super_block *, bool);
  */
 extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
 extern int d_set_mounted(struct dentry *dentry);
+extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan,
+			    int nid);
 
 /*
  * read_write.c
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index 730f24e282a6..f4aab719add5 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -306,7 +306,7 @@ static void jfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		jfs_truncate(inode);
 	}
 }
diff --git a/fs/mbcache.c b/fs/mbcache.c
index 8c32ef3ba88e..e519e45bf673 100644
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -86,18 +86,6 @@ static LIST_HEAD(mb_cache_list);
 static LIST_HEAD(mb_cache_lru_list);
 static DEFINE_SPINLOCK(mb_cache_spinlock);
 
-/*
- * What the mbcache registers as to get shrunk dynamically.
- */
-
-static int mb_cache_shrink_fn(struct shrinker *shrink,
-			      struct shrink_control *sc);
-
-static struct shrinker mb_cache_shrinker = {
-	.shrink = mb_cache_shrink_fn,
-	.seeks = DEFAULT_SEEKS,
-};
-
 static inline int
 __mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
 {
@@ -151,7 +139,7 @@ forget:
 
 
 /*
- * mb_cache_shrink_fn()  memory pressure callback
+ * mb_cache_shrink_scan()  memory pressure callback
  *
  * This function is called by the kernel memory management when memory
  * gets low.
@@ -159,17 +147,16 @@ forget:
  * @shrink: (ignored)
  * @sc: shrink_control passed from reclaim
  *
- * Returns the number of objects which are present in the cache.
+ * Returns the number of objects freed.
  */
-static int
-mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
 	LIST_HEAD(free_list);
-	struct mb_cache *cache;
 	struct mb_cache_entry *entry, *tmp;
-	int count = 0;
 	int nr_to_scan = sc->nr_to_scan;
 	gfp_t gfp_mask = sc->gfp_mask;
+	unsigned long freed = 0;
 
 	mb_debug("trying to free %d entries", nr_to_scan);
 	spin_lock(&mb_cache_spinlock);
@@ -179,19 +166,37 @@ mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc)
 				   struct mb_cache_entry, e_lru_list);
 		list_move_tail(&ce->e_lru_list, &free_list);
 		__mb_cache_entry_unhash(ce);
+		freed++;
+	}
+	spin_unlock(&mb_cache_spinlock);
+	list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
+		__mb_cache_entry_forget(entry, gfp_mask);
 	}
+	return freed;
+}
+
+static unsigned long
+mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct mb_cache *cache;
+	unsigned long count = 0;
+
+	spin_lock(&mb_cache_spinlock);
 	list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
 		mb_debug("cache %s (%d)", cache->c_name,
 			  atomic_read(&cache->c_entry_count));
 		count += atomic_read(&cache->c_entry_count);
 	}
 	spin_unlock(&mb_cache_spinlock);
-	list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
-		__mb_cache_entry_forget(entry, gfp_mask);
-	}
-	return (count / 100) * sysctl_vfs_cache_pressure;
+
+	return vfs_pressure_ratio(count);
 }
 
+static struct shrinker mb_cache_shrinker = {
+	.count_objects = mb_cache_shrink_count,
+	.scan_objects = mb_cache_shrink_scan,
+	.seeks = DEFAULT_SEEKS,
+};
 
 /*
  * mb_cache_create()  create a new cache
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index df122496f328..0332109162a5 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -400,7 +400,7 @@ static void minix_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		minix_truncate(inode);
 	}
 }
diff --git a/fs/namei.c b/fs/namei.c
index 409a441ba2ae..645268f23eb6 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -660,29 +660,6 @@ static __always_inline void set_root_rcu(struct nameidata *nd)
 	}
 }
 
-static __always_inline int __vfs_follow_link(struct nameidata *nd, const char *link)
-{
-	int ret;
-
-	if (IS_ERR(link))
-		goto fail;
-
-	if (*link == '/') {
-		set_root(nd);
-		path_put(&nd->path);
-		nd->path = nd->root;
-		path_get(&nd->root);
-		nd->flags |= LOOKUP_JUMPED;
-	}
-	nd->inode = nd->path.dentry->d_inode;
-
-	ret = link_path_walk(link, nd);
-	return ret;
-fail:
-	path_put(&nd->path);
-	return PTR_ERR(link);
-}
-
 static void path_put_conditional(struct path *path, struct nameidata *nd)
 {
 	dput(path->dentry);
@@ -874,7 +851,20 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
 	error = 0;
 	s = nd_get_link(nd);
 	if (s) {
-		error = __vfs_follow_link(nd, s);
+		if (unlikely(IS_ERR(s))) {
+			path_put(&nd->path);
+			put_link(nd, link, *p);
+			return PTR_ERR(s);
+		}
+		if (*s == '/') {
+			set_root(nd);
+			path_put(&nd->path);
+			nd->path = nd->root;
+			path_get(&nd->root);
+			nd->flags |= LOOKUP_JUMPED;
+		}
+		nd->inode = nd->path.dentry->d_inode;
+		error = link_path_walk(s, nd);
 		if (unlikely(error))
 			put_link(nd, link, *p);
 	}
@@ -2271,12 +2261,15 @@ mountpoint_last(struct nameidata *nd, struct path *path)
 		dentry = d_alloc(dir, &nd->last);
 		if (!dentry) {
 			error = -ENOMEM;
+			mutex_unlock(&dir->d_inode->i_mutex);
 			goto out;
 		}
 		dentry = lookup_real(dir->d_inode, dentry, nd->flags);
 		error = PTR_ERR(dentry);
-		if (IS_ERR(dentry))
+		if (IS_ERR(dentry)) {
+			mutex_unlock(&dir->d_inode->i_mutex);
 			goto out;
+		}
 	}
 	mutex_unlock(&dir->d_inode->i_mutex);
 
@@ -2663,6 +2656,7 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 	int acc_mode;
 	int create_error = 0;
 	struct dentry *const DENTRY_NOT_SET = (void *) -1UL;
+	bool excl;
 
 	BUG_ON(dentry->d_inode);
 
@@ -2676,10 +2670,9 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 	if ((open_flag & O_CREAT) && !IS_POSIXACL(dir))
 		mode &= ~current_umask();
 
-	if ((open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT)) {
+	excl = (open_flag & (O_EXCL | O_CREAT)) == (O_EXCL | O_CREAT);
+	if (excl)
 		open_flag &= ~O_TRUNC;
-		*opened |= FILE_CREATED;
-	}
 
 	/*
 	 * Checking write permission is tricky, bacuse we don't know if we are
@@ -2732,12 +2725,6 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 		goto out;
 	}
 
-	acc_mode = op->acc_mode;
-	if (*opened & FILE_CREATED) {
-		fsnotify_create(dir, dentry);
-		acc_mode = MAY_OPEN;
-	}
-
 	if (error) {	/* returned 1, that is */
 		if (WARN_ON(file->f_path.dentry == DENTRY_NOT_SET)) {
 			error = -EIO;
@@ -2747,9 +2734,19 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 			dput(dentry);
 			dentry = file->f_path.dentry;
 		}
-		if (create_error && dentry->d_inode == NULL) {
-			error = create_error;
-			goto out;
+		if (*opened & FILE_CREATED)
+			fsnotify_create(dir, dentry);
+		if (!dentry->d_inode) {
+			WARN_ON(*opened & FILE_CREATED);
+			if (create_error) {
+				error = create_error;
+				goto out;
+			}
+		} else {
+			if (excl && !(*opened & FILE_CREATED)) {
+				error = -EEXIST;
+				goto out;
+			}
 		}
 		goto looked_up;
 	}
@@ -2758,6 +2755,12 @@ static int atomic_open(struct nameidata *nd, struct dentry *dentry,
 	 * We didn't have the inode before the open, so check open permission
 	 * here.
 	 */
+	acc_mode = op->acc_mode;
+	if (*opened & FILE_CREATED) {
+		WARN_ON(!(open_flag & O_CREAT));
+		fsnotify_create(dir, dentry);
+		acc_mode = MAY_OPEN;
+	}
 	error = may_open(&file->f_path, acc_mode, open_flag);
 	if (error)
 		fput(file);
@@ -4236,11 +4239,6 @@ int generic_readlink(struct dentry *dentry, char __user *buffer, int buflen)
 	return res;
 }
 
-int vfs_follow_link(struct nameidata *nd, const char *link)
-{
-	return __vfs_follow_link(nd, link);
-}
-
 /* get the link contents into pagecache */
 static char *page_getlink(struct dentry * dentry, struct page **ppage)
 {
@@ -4352,7 +4350,6 @@ EXPORT_SYMBOL(vfs_path_lookup);
 EXPORT_SYMBOL(inode_permission);
 EXPORT_SYMBOL(unlock_rename);
 EXPORT_SYMBOL(vfs_create);
-EXPORT_SYMBOL(vfs_follow_link);
 EXPORT_SYMBOL(vfs_link);
 EXPORT_SYMBOL(vfs_mkdir);
 EXPORT_SYMBOL(vfs_mknod);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index e79bc6ce828e..854a8f05a610 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1392,6 +1392,9 @@ static int nfs_finish_open(struct nfs_open_context *ctx,
 {
 	int err;
 
+	if ((open_flags & (O_CREAT | O_EXCL)) == (O_CREAT | O_EXCL))
+		*opened |= FILE_CREATED;
+
 	err = finish_open(file, dentry, do_open, opened);
 	if (err)
 		goto out;
@@ -2006,17 +2009,18 @@ static void nfs_access_free_list(struct list_head *head)
 	}
 }
 
-int nfs_access_cache_shrinker(struct shrinker *shrink,
-			      struct shrink_control *sc)
+unsigned long
+nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
 	LIST_HEAD(head);
 	struct nfs_inode *nfsi, *next;
 	struct nfs_access_entry *cache;
 	int nr_to_scan = sc->nr_to_scan;
 	gfp_t gfp_mask = sc->gfp_mask;
+	long freed = 0;
 
 	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
-		return (nr_to_scan == 0) ? 0 : -1;
+		return SHRINK_STOP;
 
 	spin_lock(&nfs_access_lru_lock);
 	list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
@@ -2032,6 +2036,7 @@ int nfs_access_cache_shrinker(struct shrinker *shrink,
 				struct nfs_access_entry, lru);
 		list_move(&cache->lru, &head);
 		rb_erase(&cache->rb_node, &nfsi->access_cache);
+		freed++;
 		if (!list_empty(&nfsi->access_cache_entry_lru))
 			list_move_tail(&nfsi->access_cache_inode_lru,
 					&nfs_access_lru_list);
@@ -2046,7 +2051,13 @@ remove_lru_entry:
 	}
 	spin_unlock(&nfs_access_lru_lock);
 	nfs_access_free_list(&head);
-	return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure;
+	return freed;
+}
+
+unsigned long
+nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries));
 }
 
 static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 0bd7a55a5f07..91ff089d3412 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -130,7 +130,6 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_
 
 	return -EINVAL;
 #else
-	VM_BUG_ON(iocb->ki_left != PAGE_SIZE);
 	VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE);
 
 	if (rw == READ || rw == KERNEL_READ)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 87e797640828..eda8879171c4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -541,7 +541,6 @@ EXPORT_SYMBOL_GPL(nfs_setattr);
  */
 static int nfs_vmtruncate(struct inode * inode, loff_t offset)
 {
-	loff_t oldsize;
 	int err;
 
 	err = inode_newsize_ok(inode, offset);
@@ -549,11 +548,10 @@ static int nfs_vmtruncate(struct inode * inode, loff_t offset)
 		goto out;
 
 	spin_lock(&inode->i_lock);
-	oldsize = inode->i_size;
 	i_size_write(inode, offset);
 	spin_unlock(&inode->i_lock);
 
-	truncate_pagecache(inode, oldsize, offset);
+	truncate_pagecache(inode, offset);
 out:
 	return err;
 }
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index d388302c005f..38da8c2b81ac 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -273,8 +273,10 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
 			   const char *ip_addr);
 
 /* dir.c */
-extern int nfs_access_cache_shrinker(struct shrinker *shrink,
-					struct shrink_control *sc);
+extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
+					    struct shrink_control *sc);
+extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
+					   struct shrink_control *sc);
 struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int);
 int nfs_create(struct inode *, struct dentry *, umode_t, bool);
 int nfs_mkdir(struct inode *, struct dentry *, umode_t);
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index f520a1113b38..28842abafab4 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -279,15 +279,15 @@ _nfs4_state_protect(struct nfs_client *clp, unsigned long sp4_mode,
 	if (test_bit(sp4_mode, &clp->cl_sp4_flags)) {
 		spin_lock(&clp->cl_lock);
 		if (clp->cl_machine_cred != NULL)
-			newcred = get_rpccred(clp->cl_machine_cred);
+			/* don't call get_rpccred on the machine cred -
+			 * a reference will be held for life of clp */
+			newcred = clp->cl_machine_cred;
 		spin_unlock(&clp->cl_lock);
-		if (msg->rpc_cred)
-			put_rpccred(msg->rpc_cred);
 		msg->rpc_cred = newcred;
 
 		flavor = clp->cl_rpcclient->cl_auth->au_flavor;
-		WARN_ON(flavor != RPC_AUTH_GSS_KRB5I &&
-			flavor != RPC_AUTH_GSS_KRB5P);
+		WARN_ON_ONCE(flavor != RPC_AUTH_GSS_KRB5I &&
+			     flavor != RPC_AUTH_GSS_KRB5P);
 		*clntp = clp->cl_rpcclient;
 
 		return true;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 39b6cf2d1683..989bb9d3074d 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -6001,10 +6001,12 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct
 		.rpc_resp = &res,
 	};
 	struct rpc_clnt *clnt = NFS_SERVER(dir)->client;
+	struct rpc_cred *cred = NULL;
 
 	if (use_integrity) {
 		clnt = NFS_SERVER(dir)->nfs_client->cl_rpcclient;
-		msg.rpc_cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client);
+		cred = nfs4_get_clid_cred(NFS_SERVER(dir)->nfs_client);
+		msg.rpc_cred = cred;
 	}
 
 	dprintk("NFS call  secinfo %s\n", name->name);
@@ -6016,8 +6018,8 @@ static int _nfs4_proc_secinfo(struct inode *dir, const struct qstr *name, struct
 				&res.seq_res, 0);
 	dprintk("NFS reply  secinfo: %d\n", status);
 
-	if (msg.rpc_cred)
-		put_rpccred(msg.rpc_cred);
+	if (cred)
+		put_rpccred(cred);
 
 	return status;
 }
@@ -6151,11 +6153,13 @@ static const struct nfs41_state_protection nfs4_sp4_mach_cred_request = {
 	},
 	.allow.u.words = {
 		[0] = 1 << (OP_CLOSE) |
-		      1 << (OP_LOCKU),
+		      1 << (OP_LOCKU) |
+		      1 << (OP_COMMIT),
 		[1] = 1 << (OP_SECINFO - 32) |
 		      1 << (OP_SECINFO_NO_NAME - 32) |
 		      1 << (OP_TEST_STATEID - 32) |
-		      1 << (OP_FREE_STATEID - 32)
+		      1 << (OP_FREE_STATEID - 32) |
+		      1 << (OP_WRITE - 32)
 	}
 };
 
@@ -7496,11 +7500,13 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
 		.rpc_resp = &res,
 	};
 	struct rpc_clnt *clnt = server->client;
+	struct rpc_cred *cred = NULL;
 	int status;
 
 	if (use_integrity) {
 		clnt = server->nfs_client->cl_rpcclient;
-		msg.rpc_cred = nfs4_get_clid_cred(server->nfs_client);
+		cred = nfs4_get_clid_cred(server->nfs_client);
+		msg.rpc_cred = cred;
 	}
 
 	dprintk("--> %s\n", __func__);
@@ -7508,8 +7514,8 @@ _nfs41_proc_secinfo_no_name(struct nfs_server *server, struct nfs_fh *fhandle,
 				&res.seq_res, 0);
 	dprintk("<-- %s status=%d\n", __func__, status);
 
-	if (msg.rpc_cred)
-		put_rpccred(msg.rpc_cred);
+	if (cred)
+		put_rpccred(cred);
 
 	return status;
 }
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index fbdad9e1719f..79210d23f607 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -414,7 +414,7 @@ static int nfs4_stat_to_errno(int);
 #define decode_test_stateid_maxsz	(op_decode_hdr_maxsz + 2 + 1)
 #define encode_free_stateid_maxsz	(op_encode_hdr_maxsz + 1 + \
 					 XDR_QUADLEN(NFS4_STATEID_SIZE))
-#define decode_free_stateid_maxsz	(op_decode_hdr_maxsz + 1)
+#define decode_free_stateid_maxsz	(op_decode_hdr_maxsz)
 #else /* CONFIG_NFS_V4_1 */
 #define encode_sequence_maxsz	0
 #define decode_sequence_maxsz	0
@@ -5966,21 +5966,8 @@ out:
 static int decode_free_stateid(struct xdr_stream *xdr,
 			       struct nfs41_free_stateid_res *res)
 {
-	__be32 *p;
-	int status;
-
-	status = decode_op_hdr(xdr, OP_FREE_STATEID);
-	if (status)
-		return status;
-
-	p = xdr_inline_decode(xdr, 4);
-	if (unlikely(!p))
-		goto out_overflow;
-	res->status = be32_to_cpup(p++);
+	res->status = decode_op_hdr(xdr, OP_FREE_STATEID);
 	return res->status;
-out_overflow:
-	print_overflow_msg(__func__, xdr);
-	return -EIO;
 }
 #endif /* CONFIG_NFS_V4_1 */
 
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 5793f24613c8..a03b9c6f9489 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -360,7 +360,8 @@ static void unregister_nfs4_fs(void)
 #endif
 
 static struct shrinker acl_shrinker = {
-	.shrink		= nfs_access_cache_shrinker,
+	.count_objects	= nfs_access_cache_count,
+	.scan_objects	= nfs_access_cache_scan,
 	.seeks		= DEFAULT_SEEKS,
 };
 
diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c
index e76244edd748..9186c7ce0b14 100644
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -59,11 +59,14 @@ static unsigned int		longest_chain_cachesize;
 
 static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
 static void	cache_cleaner_func(struct work_struct *unused);
-static int 	nfsd_reply_cache_shrink(struct shrinker *shrink,
-					struct shrink_control *sc);
+static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
+					    struct shrink_control *sc);
+static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink,
+					   struct shrink_control *sc);
 
 static struct shrinker nfsd_reply_cache_shrinker = {
-	.shrink	= nfsd_reply_cache_shrink,
+	.scan_objects = nfsd_reply_cache_scan,
+	.count_objects = nfsd_reply_cache_count,
 	.seeks	= 1,
 };
 
@@ -232,16 +235,18 @@ nfsd_cache_entry_expired(struct svc_cacherep *rp)
  * Walk the LRU list and prune off entries that are older than RC_EXPIRE.
  * Also prune the oldest ones when the total exceeds the max number of entries.
  */
-static void
+static long
 prune_cache_entries(void)
 {
 	struct svc_cacherep *rp, *tmp;
+	long freed = 0;
 
 	list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) {
 		if (!nfsd_cache_entry_expired(rp) &&
 		    num_drc_entries <= max_drc_entries)
 			break;
 		nfsd_reply_cache_free_locked(rp);
+		freed++;
 	}
 
 	/*
@@ -254,6 +259,7 @@ prune_cache_entries(void)
 		cancel_delayed_work(&cache_cleaner);
 	else
 		mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE);
+	return freed;
 }
 
 static void
@@ -264,20 +270,28 @@ cache_cleaner_func(struct work_struct *unused)
 	spin_unlock(&cache_lock);
 }
 
-static int
-nfsd_reply_cache_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
 {
-	unsigned int num;
+	unsigned long num;
 
 	spin_lock(&cache_lock);
-	if (sc->nr_to_scan)
-		prune_cache_entries();
 	num = num_drc_entries;
 	spin_unlock(&cache_lock);
 
 	return num;
 }
 
+static unsigned long
+nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	unsigned long freed;
+
+	spin_lock(&cache_lock);
+	freed = prune_cache_entries();
+	spin_unlock(&cache_lock);
+	return freed;
+}
 /*
  * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
  */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index b1a5277cfd18..7e350c562e0e 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -254,7 +254,7 @@ void nilfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		nilfs_truncate(inode);
 	}
 }
diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c
index c5670b8d198c..ea4ba9daeb47 100644
--- a/fs/ntfs/file.c
+++ b/fs/ntfs/file.c
@@ -1768,7 +1768,7 @@ static void ntfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		ntfs_truncate_vfs(inode);
 	}
 }
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 4f8197caa487..d71903c6068b 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2242,7 +2242,7 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 		file->f_path.dentry->d_name.name,
 		(unsigned int)nr_segs);
 
-	if (iocb->ki_left == 0)
+	if (iocb->ki_nbytes == 0)
 		return 0;
 
 	appending = file->f_flags & O_APPEND ? 1 : 0;
@@ -2293,7 +2293,7 @@ relock:
 
 	can_do_direct = direct_io;
 	ret = ocfs2_prepare_inode_for_write(file, ppos,
-					    iocb->ki_left, appending,
+					    iocb->ki_nbytes, appending,
 					    &can_do_direct, &has_refcount);
 	if (ret < 0) {
 		mlog_errno(ret);
@@ -2301,7 +2301,7 @@ relock:
 	}
 
 	if (direct_io && !is_sync_kiocb(iocb))
-		unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
+		unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_nbytes,
 						      *ppos);
 
 	/*
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index e0d9b3e722bd..54d57d6ba68d 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -311,7 +311,7 @@ static void omfs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		omfs_truncate(inode);
 	}
 }
diff --git a/fs/open.c b/fs/open.c
index 2a731b0d08bc..d420331ca32a 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -744,14 +744,24 @@ cleanup_file:
 
 /**
  * finish_open - finish opening a file
- * @od: opaque open data
+ * @file: file pointer
  * @dentry: pointer to dentry
  * @open: open callback
+ * @opened: state of open
  *
  * This can be used to finish opening a file passed to i_op->atomic_open().
  *
  * If the open callback is set to NULL, then the standard f_op->open()
  * filesystem callback is substituted.
+ *
+ * NB: the dentry reference is _not_ consumed.  If, for example, the dentry is
+ * the return value of d_splice_alias(), then the caller needs to perform dput()
+ * on it after finish_open().
+ *
+ * On successful return @file is a fully instantiated open file.  After this, if
+ * an error occurs in ->atomic_open(), it needs to clean up with fput().
+ *
+ * Returns zero on success or -errno if the open failed.
  */
 int finish_open(struct file *file, struct dentry *dentry,
 		int (*open)(struct inode *, struct file *),
@@ -772,11 +782,16 @@ EXPORT_SYMBOL(finish_open);
 /**
  * finish_no_open - finish ->atomic_open() without opening the file
  *
- * @od: opaque open data
+ * @file: file pointer
  * @dentry: dentry or NULL (as returned from ->lookup())
  *
  * This can be used to set the result of a successful lookup in ->atomic_open().
- * The filesystem's atomic_open() method shall return NULL after calling this.
+ *
+ * NB: unlike finish_open() this function does consume the dentry reference and
+ * the caller need not dput() it.
+ *
+ * Returns "1" which must be the return value of ->atomic_open() after having
+ * called this function.
  */
 int finish_no_open(struct file *file, struct dentry *dentry)
 {
diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 5aa847a603c0..59d85d608898 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -132,13 +132,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(i.freeswap),
 		K(global_page_state(NR_FILE_DIRTY)),
 		K(global_page_state(NR_WRITEBACK)),
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-		K(global_page_state(NR_ANON_PAGES)
-		  + global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
-		  HPAGE_PMD_NR),
-#else
 		K(global_page_state(NR_ANON_PAGES)),
-#endif
 		K(global_page_state(NR_FILE_MAPPED)),
 		K(global_page_state(NR_SHMEM)),
 		K(global_page_state(NR_SLAB_RECLAIMABLE) +
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index 4ffb7ab5e397..b8e93a40a5d3 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -168,7 +168,7 @@ static int pstore_decompress(void *in, void *out, size_t inlen, size_t outlen)
 	int err, ret;
 
 	ret = -EIO;
-	err = zlib_inflateInit(&stream);
+	err = zlib_inflateInit2(&stream, WINDOW_BITS);
 	if (err != Z_OK)
 		goto error;
 
@@ -195,8 +195,29 @@ error:
 static void allocate_buf_for_compression(void)
 {
 	size_t size;
+	size_t cmpr;
+
+	switch (psinfo->bufsize) {
+	/* buffer range for efivars */
+	case 1000 ... 2000:
+		cmpr = 56;
+		break;
+	case 2001 ... 3000:
+		cmpr = 54;
+		break;
+	case 3001 ... 3999:
+		cmpr = 52;
+		break;
+	/* buffer range for nvram, erst */
+	case 4000 ... 10000:
+		cmpr = 45;
+		break;
+	default:
+		cmpr = 60;
+		break;
+	}
 
-	big_oops_buf_sz = (psinfo->bufsize * 100) / 45;
+	big_oops_buf_sz = (psinfo->bufsize * 100) / cmpr;
 	big_oops_buf = kmalloc(big_oops_buf_sz, GFP_KERNEL);
 	if (big_oops_buf) {
 		size = max(zlib_deflate_workspacesize(WINDOW_BITS, MEM_LEVEL),
@@ -295,10 +316,6 @@ static void pstore_dump(struct kmsg_dumper *dumper,
 				compressed = true;
 				total_len = zipped_len;
 			} else {
-				pr_err("pstore: compression failed for Part %d"
-					" returned %d\n", part, zipped_len);
-				pr_err("pstore: Capture uncompressed"
-					" oops/panic report of Part %d\n", part);
 				compressed = false;
 				total_len = copy_kmsg_to_buffer(hsize, len);
 			}
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 9a702e193538..831d49a4111f 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -687,45 +687,37 @@ int dquot_quota_sync(struct super_block *sb, int type)
 }
 EXPORT_SYMBOL(dquot_quota_sync);
 
-/* Free unused dquots from cache */
-static void prune_dqcache(int count)
+static unsigned long
+dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct list_head *head;
 	struct dquot *dquot;
+	unsigned long freed = 0;
 
 	head = free_dquots.prev;
-	while (head != &free_dquots && count) {
+	while (head != &free_dquots && sc->nr_to_scan) {
 		dquot = list_entry(head, struct dquot, dq_free);
 		remove_dquot_hash(dquot);
 		remove_free_dquot(dquot);
 		remove_inuse(dquot);
 		do_destroy_dquot(dquot);
-		count--;
+		sc->nr_to_scan--;
+		freed++;
 		head = free_dquots.prev;
 	}
+	return freed;
 }
 
-/*
- * This is called from kswapd when we think we need some
- * more memory
- */
-static int shrink_dqcache_memory(struct shrinker *shrink,
-				 struct shrink_control *sc)
+static unsigned long
+dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 {
-	int nr = sc->nr_to_scan;
-
-	if (nr) {
-		spin_lock(&dq_list_lock);
-		prune_dqcache(nr);
-		spin_unlock(&dq_list_lock);
-	}
-	return ((unsigned)
-		percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])
-		/100) * sysctl_vfs_cache_pressure;
+	return vfs_pressure_ratio(
+	percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]));
 }
 
 static struct shrinker dqcache_shrinker = {
-	.shrink = shrink_dqcache_memory,
+	.count_objects = dqcache_shrink_count,
+	.scan_objects = dqcache_shrink_scan,
 	.seeks = DEFAULT_SEEKS,
 };
 
diff --git a/fs/read_write.c b/fs/read_write.c
index 122a3846d9e1..e3cd280b158c 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -367,7 +367,6 @@ ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *pp
 
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
-	kiocb.ki_left = len;
 	kiocb.ki_nbytes = len;
 
 	ret = filp->f_op->aio_read(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -417,7 +416,6 @@ ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, lof
 
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
-	kiocb.ki_left = len;
 	kiocb.ki_nbytes = len;
 
 	ret = filp->f_op->aio_write(&kiocb, &iov, 1, kiocb.ki_pos);
@@ -599,7 +597,6 @@ static ssize_t do_sync_readv_writev(struct file *filp, const struct iovec *iov,
 
 	init_sync_kiocb(&kiocb, filp);
 	kiocb.ki_pos = *ppos;
-	kiocb.ki_left = len;
 	kiocb.ki_nbytes = len;
 
 	ret = fn(&kiocb, iov, nr_segs, kiocb.ki_pos);
diff --git a/fs/super.c b/fs/super.c
index f6961ea84c56..3a96c9783a8b 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -53,11 +53,15 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
  * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
  * take a passive reference to the superblock to avoid this from occurring.
  */
-static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long super_cache_scan(struct shrinker *shrink,
+				      struct shrink_control *sc)
 {
 	struct super_block *sb;
-	int	fs_objects = 0;
-	int	total_objects;
+	long	fs_objects = 0;
+	long	total_objects;
+	long	freed = 0;
+	long	dentries;
+	long	inodes;
 
 	sb = container_of(shrink, struct super_block, s_shrink);
 
@@ -65,46 +69,62 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
 	 * Deadlock avoidance.  We may hold various FS locks, and we don't want
 	 * to recurse into the FS that called us in clear_inode() and friends..
 	 */
-	if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
-		return -1;
+	if (!(sc->gfp_mask & __GFP_FS))
+		return SHRINK_STOP;
 
 	if (!grab_super_passive(sb))
-		return -1;
+		return SHRINK_STOP;
 
 	if (sb->s_op->nr_cached_objects)
-		fs_objects = sb->s_op->nr_cached_objects(sb);
-
-	total_objects = sb->s_nr_dentry_unused +
-			sb->s_nr_inodes_unused + fs_objects + 1;
-
-	if (sc->nr_to_scan) {
-		int	dentries;
-		int	inodes;
-
-		/* proportion the scan between the caches */
-		dentries = (sc->nr_to_scan * sb->s_nr_dentry_unused) /
-							total_objects;
-		inodes = (sc->nr_to_scan * sb->s_nr_inodes_unused) /
-							total_objects;
-		if (fs_objects)
-			fs_objects = (sc->nr_to_scan * fs_objects) /
-							total_objects;
-		/*
-		 * prune the dcache first as the icache is pinned by it, then
-		 * prune the icache, followed by the filesystem specific caches
-		 */
-		prune_dcache_sb(sb, dentries);
-		prune_icache_sb(sb, inodes);
+		fs_objects = sb->s_op->nr_cached_objects(sb, sc->nid);
 
-		if (fs_objects && sb->s_op->free_cached_objects) {
-			sb->s_op->free_cached_objects(sb, fs_objects);
-			fs_objects = sb->s_op->nr_cached_objects(sb);
-		}
-		total_objects = sb->s_nr_dentry_unused +
-				sb->s_nr_inodes_unused + fs_objects;
+	inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid);
+	dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid);
+	total_objects = dentries + inodes + fs_objects + 1;
+
+	/* proportion the scan between the caches */
+	dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
+	inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
+
+	/*
+	 * prune the dcache first as the icache is pinned by it, then
+	 * prune the icache, followed by the filesystem specific caches
+	 */
+	freed = prune_dcache_sb(sb, dentries, sc->nid);
+	freed += prune_icache_sb(sb, inodes, sc->nid);
+
+	if (fs_objects) {
+		fs_objects = mult_frac(sc->nr_to_scan, fs_objects,
+								total_objects);
+		freed += sb->s_op->free_cached_objects(sb, fs_objects,
+						       sc->nid);
 	}
 
-	total_objects = (total_objects / 100) * sysctl_vfs_cache_pressure;
+	drop_super(sb);
+	return freed;
+}
+
+static unsigned long super_cache_count(struct shrinker *shrink,
+				       struct shrink_control *sc)
+{
+	struct super_block *sb;
+	long	total_objects = 0;
+
+	sb = container_of(shrink, struct super_block, s_shrink);
+
+	if (!grab_super_passive(sb))
+		return 0;
+
+	if (sb->s_op && sb->s_op->nr_cached_objects)
+		total_objects = sb->s_op->nr_cached_objects(sb,
+						 sc->nid);
+
+	total_objects += list_lru_count_node(&sb->s_dentry_lru,
+						 sc->nid);
+	total_objects += list_lru_count_node(&sb->s_inode_lru,
+						 sc->nid);
+
+	total_objects = vfs_pressure_ratio(total_objects);
 	drop_super(sb);
 	return total_objects;
 }
@@ -175,9 +195,12 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 		INIT_HLIST_NODE(&s->s_instances);
 		INIT_HLIST_BL_HEAD(&s->s_anon);
 		INIT_LIST_HEAD(&s->s_inodes);
-		INIT_LIST_HEAD(&s->s_dentry_lru);
-		INIT_LIST_HEAD(&s->s_inode_lru);
-		spin_lock_init(&s->s_inode_lru_lock);
+
+		if (list_lru_init(&s->s_dentry_lru))
+			goto err_out;
+		if (list_lru_init(&s->s_inode_lru))
+			goto err_out_dentry_lru;
+
 		INIT_LIST_HEAD(&s->s_mounts);
 		init_rwsem(&s->s_umount);
 		lockdep_set_class(&s->s_umount, &type->s_umount_key);
@@ -210,11 +233,16 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 		s->cleancache_poolid = -1;
 
 		s->s_shrink.seeks = DEFAULT_SEEKS;
-		s->s_shrink.shrink = prune_super;
+		s->s_shrink.scan_objects = super_cache_scan;
+		s->s_shrink.count_objects = super_cache_count;
 		s->s_shrink.batch = 1024;
+		s->s_shrink.flags = SHRINKER_NUMA_AWARE;
 	}
 out:
 	return s;
+
+err_out_dentry_lru:
+	list_lru_destroy(&s->s_dentry_lru);
 err_out:
 	security_sb_free(s);
 #ifdef CONFIG_SMP
@@ -295,6 +323,9 @@ void deactivate_locked_super(struct super_block *s)
 
 		/* caches are now gone, we can safely kill the shrinker now */
 		unregister_shrinker(&s->s_shrink);
+		list_lru_destroy(&s->s_dentry_lru);
+		list_lru_destroy(&s->s_inode_lru);
+
 		put_filesystem(fs);
 		put_super(s);
 	} else {
diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c
index c1a591a4725b..66bc316927e8 100644
--- a/fs/sysv/itree.c
+++ b/fs/sysv/itree.c
@@ -469,7 +469,7 @@ static void sysv_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size) {
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 		sysv_truncate(inode);
 	}
 }
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index 7f60e900edff..6e025e02ffde 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -2587,10 +2587,11 @@ int dbg_leb_write(struct ubifs_info *c, int lnum, const void *buf,
 		return -EROFS;
 
 	failing = power_cut_emulated(c, lnum, 1);
-	if (failing)
+	if (failing) {
 		len = corrupt_data(c, buf, len);
-	ubifs_warn("actually write %d bytes to LEB %d:%d (the buffer was corrupted)",
-		   len, lnum, offs);
+		ubifs_warn("actually write %d bytes to LEB %d:%d (the buffer was corrupted)",
+			   len, lnum, offs);
+	}
 	err = ubi_leb_write(c->ubi, lnum, buf, offs, len);
 	if (err)
 		return err;
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index 9e1d05666fed..f35135e28e96 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,18 +277,25 @@ static int kick_a_thread(void)
 	return 0;
 }
 
-int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
+unsigned long ubifs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc)
 {
-	int nr = sc->nr_to_scan;
-	int freed, contention = 0;
 	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
 
-	if (nr == 0)
-		/*
-		 * Due to the way UBIFS updates the clean znode counter it may
-		 * temporarily be negative.
-		 */
-		return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
+	/*
+	 * Due to the way UBIFS updates the clean znode counter it may
+	 * temporarily be negative.
+	 */
+	return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
+}
+
+unsigned long ubifs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc)
+{
+	unsigned long nr = sc->nr_to_scan;
+	int contention = 0;
+	unsigned long freed;
+	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
 
 	if (!clean_zn_cnt) {
 		/*
@@ -316,10 +323,10 @@ int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
 
 	if (!freed && contention) {
 		dbg_tnc("freed nothing, but contention");
-		return -1;
+		return SHRINK_STOP;
 	}
 
 out:
-	dbg_tnc("%d znodes were freed, requested %d", freed, nr);
+	dbg_tnc("%lu znodes were freed, requested %lu", freed, nr);
 	return freed;
 }
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 879b9976c12b..3e4aa7281e04 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -49,7 +49,8 @@ struct kmem_cache *ubifs_inode_slab;
 
 /* UBIFS TNC shrinker description */
 static struct shrinker ubifs_shrinker_info = {
-	.shrink = ubifs_shrinker,
+	.scan_objects = ubifs_shrink_scan,
+	.count_objects = ubifs_shrink_count,
 	.seeks = DEFAULT_SEEKS,
 };
 
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index b2babce4d70f..e8c8cfe1435c 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1624,7 +1624,10 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int ubifs_tnc_end_commit(struct ubifs_info *c);
 
 /* shrinker.c */
-int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc);
+unsigned long ubifs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc);
+unsigned long ubifs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc);
 
 /* commit.c */
 int ubifs_bg_thread(void *info);
diff --git a/fs/udf/file.c b/fs/udf/file.c
index 29569dd08168..c02a27a19c6d 100644
--- a/fs/udf/file.c
+++ b/fs/udf/file.c
@@ -141,7 +141,7 @@ static ssize_t udf_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	struct file *file = iocb->ki_filp;
 	struct inode *inode = file_inode(file);
 	int err, pos;
-	size_t count = iocb->ki_left;
+	size_t count = iocb->ki_nbytes;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 
 	down_write(&iinfo->i_data_sem);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index b6d15d349810..062b7925bca0 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -172,7 +172,7 @@ static void udf_write_failed(struct address_space *mapping, loff_t to)
 	loff_t isize = inode->i_size;
 
 	if (to > isize) {
-		truncate_pagecache(inode, to, isize);
+		truncate_pagecache(inode, isize);
 		if (iinfo->i_alloc_type != ICBTAG_FLAG_AD_IN_ICB) {
 			down_write(&iinfo->i_data_sem);
 			udf_clear_extent_cache(inode);
diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c
index ff24e4449ece..c8ca96086784 100644
--- a/fs/ufs/inode.c
+++ b/fs/ufs/inode.c
@@ -531,7 +531,7 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to)
 	struct inode *inode = mapping->host;
 
 	if (to > inode->i_size)
-		truncate_pagecache(inode, to, inode->i_size);
+		truncate_pagecache(inode, inode->i_size);
 }
 
 static int ufs_write_begin(struct file *file, struct address_space *mapping,
diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c
index 4a7286c1dc80..a02cfb9e3bce 100644
--- a/fs/xfs/kmem.c
+++ b/fs/xfs/kmem.c
@@ -27,8 +27,6 @@
 
 /*
  * Greedy allocation.  May fail and may return vmalloced memory.
- *
- * Must be freed using kmem_free_large.
  */
 void *
 kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
@@ -36,7 +34,7 @@ kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize)
 	void		*ptr;
 	size_t		kmsize = maxsize;
 
-	while (!(ptr = kmem_zalloc_large(kmsize))) {
+	while (!(ptr = vzalloc(kmsize))) {
 		if ((kmsize >>= 1) <= minsize)
 			kmsize = minsize;
 	}
@@ -75,6 +73,17 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
 	return ptr;
 }
 
+void *
+kmem_zalloc_large(size_t size, xfs_km_flags_t flags)
+{
+	void	*ptr;
+
+	ptr = kmem_zalloc(size, flags | KM_MAYFAIL);
+	if (ptr)
+		return ptr;
+	return vzalloc(size);
+}
+
 void
 kmem_free(const void *ptr)
 {
diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h
index b2f2620f9a87..3a7371cab508 100644
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -57,17 +57,10 @@ kmem_flags_convert(xfs_km_flags_t flags)
 
 extern void *kmem_alloc(size_t, xfs_km_flags_t);
 extern void *kmem_zalloc(size_t, xfs_km_flags_t);
+extern void *kmem_zalloc_large(size_t size, xfs_km_flags_t);
 extern void *kmem_realloc(const void *, size_t, size_t, xfs_km_flags_t);
 extern void  kmem_free(const void *);
 
-static inline void *kmem_zalloc_large(size_t size)
-{
-	return vzalloc(size);
-}
-static inline void kmem_free_large(void *ptr)
-{
-	vfree(ptr);
-}
 
 extern void *kmem_zalloc_greedy(size_t *, size_t, size_t);
 
diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c
index 69518960b2ba..0e2f37efedd0 100644
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -152,7 +152,7 @@ xfs_get_acl(struct inode *inode, int type)
 	 * go out to the disk.
 	 */
 	len = XFS_ACL_MAX_SIZE(ip->i_mount);
-	xfs_acl = kzalloc(len, GFP_KERNEL);
+	xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
 	if (!xfs_acl)
 		return ERR_PTR(-ENOMEM);
 
@@ -175,10 +175,10 @@ xfs_get_acl(struct inode *inode, int type)
 	if (IS_ERR(acl))
 		goto out;
 
- out_update_cache:
+out_update_cache:
 	set_cached_acl(inode, type, acl);
- out:
-	kfree(xfs_acl);
+out:
+	kmem_free(xfs_acl);
 	return acl;
 }
 
@@ -209,7 +209,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		struct xfs_acl *xfs_acl;
 		int len = XFS_ACL_MAX_SIZE(ip->i_mount);
 
-		xfs_acl = kzalloc(len, GFP_KERNEL);
+		xfs_acl = kmem_zalloc_large(len, KM_SLEEP);
 		if (!xfs_acl)
 			return -ENOMEM;
 
@@ -222,7 +222,7 @@ xfs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		error = -xfs_attr_set(ip, ea_name, (unsigned char *)xfs_acl,
 				len, ATTR_ROOT);
 
-		kfree(xfs_acl);
+		kmem_free(xfs_acl);
 	} else {
 		/*
 		 * A NULL ACL argument means we want to remove the ACL.
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 977da0ec6604..e51e581454e9 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -1582,7 +1582,7 @@ xfs_vm_write_begin(
 		unlock_page(page);
 
 		if (pos + len > i_size_read(inode))
-			truncate_pagecache(inode, pos + len, i_size_read(inode));
+			truncate_pagecache(inode, i_size_read(inode));
 
 		page_cache_release(page);
 		page = NULL;
@@ -1618,7 +1618,7 @@ xfs_vm_write_end(
 		loff_t		to = pos + len;
 
 		if (to > isize) {
-			truncate_pagecache(inode, to, isize);
+			truncate_pagecache(inode, isize);
 			xfs_vm_kill_delalloc_range(inode, isize, to);
 		}
 	}
diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c
index 92b830901d60..f47e65c30be6 100644
--- a/fs/xfs/xfs_bmap.c
+++ b/fs/xfs/xfs_bmap.c
@@ -4450,7 +4450,7 @@ xfs_bmapi_write(
 {
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_ifork	*ifp;
-	struct xfs_bmalloca	bma = { 0 };	/* args for xfs_bmap_alloc */
+	struct xfs_bmalloca	bma = { NULL };	/* args for xfs_bmap_alloc */
 	xfs_fileoff_t		end;		/* end of mapped file region */
 	int			eof;		/* after the end of extents */
 	int			error;		/* error return */
diff --git a/fs/xfs/xfs_bmap_btree.c b/fs/xfs/xfs_bmap_btree.c
index cf3bc76710c3..bb8de8e399c4 100644
--- a/fs/xfs/xfs_bmap_btree.c
+++ b/fs/xfs/xfs_bmap_btree.c
@@ -925,3 +925,47 @@ xfs_bmdr_maxrecs(
 		return blocklen / sizeof(xfs_bmdr_rec_t);
 	return blocklen / (sizeof(xfs_bmdr_key_t) + sizeof(xfs_bmdr_ptr_t));
 }
+
+/*
+ * Change the owner of a btree format fork fo the inode passed in. Change it to
+ * the owner of that is passed in so that we can change owners before or after
+ * we switch forks between inodes. The operation that the caller is doing will
+ * determine whether is needs to change owner before or after the switch.
+ *
+ * For demand paged transactional modification, the fork switch should be done
+ * after reading in all the blocks, modifying them and pinning them in the
+ * transaction. For modification when the buffers are already pinned in memory,
+ * the fork switch can be done before changing the owner as we won't need to
+ * validate the owner until the btree buffers are unpinned and writes can occur
+ * again.
+ *
+ * For recovery based ownership change, there is no transactional context and
+ * so a buffer list must be supplied so that we can record the buffers that we
+ * modified for the caller to issue IO on.
+ */
+int
+xfs_bmbt_change_owner(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	int			whichfork,
+	xfs_ino_t		new_owner,
+	struct list_head	*buffer_list)
+{
+	struct xfs_btree_cur	*cur;
+	int			error;
+
+	ASSERT(tp || buffer_list);
+	ASSERT(!(tp && buffer_list));
+	if (whichfork == XFS_DATA_FORK)
+		ASSERT(ip->i_d.di_format == XFS_DINODE_FMT_BTREE);
+	else
+		ASSERT(ip->i_d.di_aformat == XFS_DINODE_FMT_BTREE);
+
+	cur = xfs_bmbt_init_cursor(ip->i_mount, tp, ip, whichfork);
+	if (!cur)
+		return ENOMEM;
+
+	error = xfs_btree_change_owner(cur, new_owner, buffer_list);
+	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	return error;
+}
diff --git a/fs/xfs/xfs_bmap_btree.h b/fs/xfs/xfs_bmap_btree.h
index 1b726d626941..e367461a638e 100644
--- a/fs/xfs/xfs_bmap_btree.h
+++ b/fs/xfs/xfs_bmap_btree.h
@@ -236,6 +236,10 @@ extern int xfs_bmbt_get_maxrecs(struct xfs_btree_cur *, int level);
 extern int xfs_bmdr_maxrecs(struct xfs_mount *, int blocklen, int leaf);
 extern int xfs_bmbt_maxrecs(struct xfs_mount *, int blocklen, int leaf);
 
+extern int xfs_bmbt_change_owner(struct xfs_trans *tp, struct xfs_inode *ip,
+				 int whichfork, xfs_ino_t new_owner,
+				 struct list_head *buffer_list);
+
 extern struct xfs_btree_cur *xfs_bmbt_init_cursor(struct xfs_mount *,
 		struct xfs_trans *, struct xfs_inode *, int);
 
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index 541d59f5e658..97f952caea74 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -612,13 +612,9 @@ xfs_getbmap(
 
 	if (bmv->bmv_count > ULONG_MAX / sizeof(struct getbmapx))
 		return XFS_ERROR(ENOMEM);
-	out = kmem_zalloc(bmv->bmv_count * sizeof(struct getbmapx), KM_MAYFAIL);
-	if (!out) {
-		out = kmem_zalloc_large(bmv->bmv_count *
-					sizeof(struct getbmapx));
-		if (!out)
-			return XFS_ERROR(ENOMEM);
-	}
+	out = kmem_zalloc_large(bmv->bmv_count * sizeof(struct getbmapx), 0);
+	if (!out)
+		return XFS_ERROR(ENOMEM);
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
 	if (whichfork == XFS_DATA_FORK && !(iflags & BMV_IF_DELALLOC)) {
@@ -754,10 +750,7 @@ xfs_getbmap(
 			break;
 	}
 
-	if (is_vmalloc_addr(out))
-		kmem_free_large(out);
-	else
-		kmem_free(out);
+	kmem_free(out);
 	return error;
 }
 
@@ -1789,14 +1782,6 @@ xfs_swap_extents(
 	int		taforkblks = 0;
 	__uint64_t	tmp;
 
-	/*
-	 * We have no way of updating owner information in the BMBT blocks for
-	 * each inode on CRC enabled filesystems, so to avoid corrupting the
-	 * this metadata we simply don't allow extent swaps to occur.
-	 */
-	if (xfs_sb_version_hascrc(&mp->m_sb))
-		return XFS_ERROR(EINVAL);
-
 	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
 	if (!tempifp) {
 		error = XFS_ERROR(ENOMEM);
@@ -1920,6 +1905,42 @@ xfs_swap_extents(
 			goto out_trans_cancel;
 	}
 
+	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+	xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
+
+	/*
+	 * Before we've swapped the forks, lets set the owners of the forks
+	 * appropriately. We have to do this as we are demand paging the btree
+	 * buffers, and so the validation done on read will expect the owner
+	 * field to be correctly set. Once we change the owners, we can swap the
+	 * inode forks.
+	 *
+	 * Note the trickiness in setting the log flags - we set the owner log
+	 * flag on the opposite inode (i.e. the inode we are setting the new
+	 * owner to be) because once we swap the forks and log that, log
+	 * recovery is going to see the fork as owned by the swapped inode,
+	 * not the pre-swapped inodes.
+	 */
+	src_log_flags = XFS_ILOG_CORE;
+	target_log_flags = XFS_ILOG_CORE;
+	if (ip->i_d.di_version == 3 &&
+	    ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+		target_log_flags |= XFS_ILOG_DOWNER;
+		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
+					      tip->i_ino, NULL);
+		if (error)
+			goto out_trans_cancel;
+	}
+
+	if (tip->i_d.di_version == 3 &&
+	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
+		src_log_flags |= XFS_ILOG_DOWNER;
+		error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
+					      ip->i_ino, NULL);
+		if (error)
+			goto out_trans_cancel;
+	}
+
 	/*
 	 * Swap the data forks of the inodes
 	 */
@@ -1957,7 +1978,6 @@ xfs_swap_extents(
 	tip->i_delayed_blks = ip->i_delayed_blks;
 	ip->i_delayed_blks = 0;
 
-	src_log_flags = XFS_ILOG_CORE;
 	switch (ip->i_d.di_format) {
 	case XFS_DINODE_FMT_EXTENTS:
 		/* If the extents fit in the inode, fix the
@@ -1971,11 +1991,12 @@ xfs_swap_extents(
 		src_log_flags |= XFS_ILOG_DEXT;
 		break;
 	case XFS_DINODE_FMT_BTREE:
+		ASSERT(ip->i_d.di_version < 3 ||
+		       (src_log_flags & XFS_ILOG_DOWNER));
 		src_log_flags |= XFS_ILOG_DBROOT;
 		break;
 	}
 
-	target_log_flags = XFS_ILOG_CORE;
 	switch (tip->i_d.di_format) {
 	case XFS_DINODE_FMT_EXTENTS:
 		/* If the extents fit in the inode, fix the
@@ -1990,13 +2011,11 @@ xfs_swap_extents(
 		break;
 	case XFS_DINODE_FMT_BTREE:
 		target_log_flags |= XFS_ILOG_DBROOT;
+		ASSERT(tip->i_d.di_version < 3 ||
+		       (target_log_flags & XFS_ILOG_DOWNER));
 		break;
 	}
 
-
-	xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-	xfs_trans_ijoin(tp, tip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL);
-
 	xfs_trans_log_inode(tp, ip,  src_log_flags);
 	xfs_trans_log_inode(tp, tip, target_log_flags);
 
diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c
index 7a2b4da3c0db..5690e102243d 100644
--- a/fs/xfs/xfs_btree.c
+++ b/fs/xfs/xfs_btree.c
@@ -855,6 +855,41 @@ xfs_btree_readahead(
 	return xfs_btree_readahead_sblock(cur, lr, block);
 }
 
+STATIC xfs_daddr_t
+xfs_btree_ptr_to_daddr(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
+		ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
+
+		return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
+	} else {
+		ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
+		ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
+
+		return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
+					be32_to_cpu(ptr->s));
+	}
+}
+
+/*
+ * Readahead @count btree blocks at the given @ptr location.
+ *
+ * We don't need to care about long or short form btrees here as we have a
+ * method of converting the ptr directly to a daddr available to us.
+ */
+STATIC void
+xfs_btree_readahead_ptr(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	xfs_extlen_t		count)
+{
+	xfs_buf_readahead(cur->bc_mp->m_ddev_targp,
+			  xfs_btree_ptr_to_daddr(cur, ptr),
+			  cur->bc_mp->m_bsize * count, cur->bc_ops->buf_ops);
+}
+
 /*
  * Set the buffer for level "lev" in the cursor to bp, releasing
  * any previous buffer.
@@ -1073,24 +1108,6 @@ xfs_btree_buf_to_ptr(
 	}
 }
 
-STATIC xfs_daddr_t
-xfs_btree_ptr_to_daddr(
-	struct xfs_btree_cur	*cur,
-	union xfs_btree_ptr	*ptr)
-{
-	if (cur->bc_flags & XFS_BTREE_LONG_PTRS) {
-		ASSERT(ptr->l != cpu_to_be64(NULLDFSBNO));
-
-		return XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
-	} else {
-		ASSERT(cur->bc_private.a.agno != NULLAGNUMBER);
-		ASSERT(ptr->s != cpu_to_be32(NULLAGBLOCK));
-
-		return XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_private.a.agno,
-					be32_to_cpu(ptr->s));
-	}
-}
-
 STATIC void
 xfs_btree_set_refs(
 	struct xfs_btree_cur	*cur,
@@ -3869,3 +3886,120 @@ xfs_btree_get_rec(
 	*stat = 1;
 	return 0;
 }
+
+/*
+ * Change the owner of a btree.
+ *
+ * The mechanism we use here is ordered buffer logging. Because we don't know
+ * how many buffers were are going to need to modify, we don't really want to
+ * have to make transaction reservations for the worst case of every buffer in a
+ * full size btree as that may be more space that we can fit in the log....
+ *
+ * We do the btree walk in the most optimal manner possible - we have sibling
+ * pointers so we can just walk all the blocks on each level from left to right
+ * in a single pass, and then move to the next level and do the same. We can
+ * also do readahead on the sibling pointers to get IO moving more quickly,
+ * though for slow disks this is unlikely to make much difference to performance
+ * as the amount of CPU work we have to do before moving to the next block is
+ * relatively small.
+ *
+ * For each btree block that we load, modify the owner appropriately, set the
+ * buffer as an ordered buffer and log it appropriately. We need to ensure that
+ * we mark the region we change dirty so that if the buffer is relogged in
+ * a subsequent transaction the changes we make here as an ordered buffer are
+ * correctly relogged in that transaction.  If we are in recovery context, then
+ * just queue the modified buffer as delayed write buffer so the transaction
+ * recovery completion writes the changes to disk.
+ */
+static int
+xfs_btree_block_change_owner(
+	struct xfs_btree_cur	*cur,
+	int			level,
+	__uint64_t		new_owner,
+	struct list_head	*buffer_list)
+{
+	struct xfs_btree_block	*block;
+	struct xfs_buf		*bp;
+	union xfs_btree_ptr     rptr;
+
+	/* do right sibling readahead */
+	xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
+
+	/* modify the owner */
+	block = xfs_btree_get_block(cur, level, &bp);
+	if (cur->bc_flags & XFS_BTREE_LONG_PTRS)
+		block->bb_u.l.bb_owner = cpu_to_be64(new_owner);
+	else
+		block->bb_u.s.bb_owner = cpu_to_be32(new_owner);
+
+	/*
+	 * If the block is a root block hosted in an inode, we might not have a
+	 * buffer pointer here and we shouldn't attempt to log the change as the
+	 * information is already held in the inode and discarded when the root
+	 * block is formatted into the on-disk inode fork. We still change it,
+	 * though, so everything is consistent in memory.
+	 */
+	if (bp) {
+		if (cur->bc_tp) {
+			xfs_trans_ordered_buf(cur->bc_tp, bp);
+			xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
+		} else {
+			xfs_buf_delwri_queue(bp, buffer_list);
+		}
+	} else {
+		ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
+		ASSERT(level == cur->bc_nlevels - 1);
+	}
+
+	/* now read rh sibling block for next iteration */
+	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
+	if (xfs_btree_ptr_is_null(cur, &rptr))
+		return ENOENT;
+
+	return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
+}
+
+int
+xfs_btree_change_owner(
+	struct xfs_btree_cur	*cur,
+	__uint64_t		new_owner,
+	struct list_head	*buffer_list)
+{
+	union xfs_btree_ptr     lptr;
+	int			level;
+	struct xfs_btree_block	*block = NULL;
+	int			error = 0;
+
+	cur->bc_ops->init_ptr_from_cur(cur, &lptr);
+
+	/* for each level */
+	for (level = cur->bc_nlevels - 1; level >= 0; level--) {
+		/* grab the left hand block */
+		error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
+		if (error)
+			return error;
+
+		/* readahead the left most block for the next level down */
+		if (level > 0) {
+			union xfs_btree_ptr     *ptr;
+
+			ptr = xfs_btree_ptr_addr(cur, 1, block);
+			xfs_btree_readahead_ptr(cur, ptr, 1);
+
+			/* save for the next iteration of the loop */
+			lptr = *ptr;
+		}
+
+		/* for each buffer in the level */
+		do {
+			error = xfs_btree_block_change_owner(cur, level,
+							     new_owner,
+							     buffer_list);
+		} while (!error);
+
+		if (error != ENOENT)
+			return error;
+	}
+
+	return 0;
+}
diff --git a/fs/xfs/xfs_btree.h b/fs/xfs/xfs_btree.h
index c8473c7ef45e..06729b67ad58 100644
--- a/fs/xfs/xfs_btree.h
+++ b/fs/xfs/xfs_btree.h
@@ -121,15 +121,18 @@ union xfs_btree_rec {
 /*
  * For logging record fields.
  */
-#define	XFS_BB_MAGIC		0x01
-#define	XFS_BB_LEVEL		0x02
-#define	XFS_BB_NUMRECS		0x04
-#define	XFS_BB_LEFTSIB		0x08
-#define	XFS_BB_RIGHTSIB		0x10
-#define	XFS_BB_BLKNO		0x20
+#define	XFS_BB_MAGIC		(1 << 0)
+#define	XFS_BB_LEVEL		(1 << 1)
+#define	XFS_BB_NUMRECS		(1 << 2)
+#define	XFS_BB_LEFTSIB		(1 << 3)
+#define	XFS_BB_RIGHTSIB		(1 << 4)
+#define	XFS_BB_BLKNO		(1 << 5)
+#define	XFS_BB_LSN		(1 << 6)
+#define	XFS_BB_UUID		(1 << 7)
+#define	XFS_BB_OWNER		(1 << 8)
 #define	XFS_BB_NUM_BITS		5
 #define	XFS_BB_ALL_BITS		((1 << XFS_BB_NUM_BITS) - 1)
-#define	XFS_BB_NUM_BITS_CRC	8
+#define	XFS_BB_NUM_BITS_CRC	9
 #define	XFS_BB_ALL_BITS_CRC	((1 << XFS_BB_NUM_BITS_CRC) - 1)
 
 /*
@@ -442,6 +445,8 @@ int xfs_btree_new_iroot(struct xfs_btree_cur *, int *, int *);
 int xfs_btree_insert(struct xfs_btree_cur *, int *);
 int xfs_btree_delete(struct xfs_btree_cur *, int *);
 int xfs_btree_get_rec(struct xfs_btree_cur *, union xfs_btree_rec **, int *);
+int xfs_btree_change_owner(struct xfs_btree_cur *cur, __uint64_t new_owner,
+			   struct list_head *buffer_list);
 
 /*
  * btree block CRC helpers
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index c06823fe10d3..263470075ea2 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -81,54 +81,6 @@ xfs_buf_vmap_len(
 }
 
 /*
- * xfs_buf_lru_add - add a buffer to the LRU.
- *
- * The LRU takes a new reference to the buffer so that it will only be freed
- * once the shrinker takes the buffer off the LRU.
- */
-STATIC void
-xfs_buf_lru_add(
-	struct xfs_buf	*bp)
-{
-	struct xfs_buftarg *btp = bp->b_target;
-
-	spin_lock(&btp->bt_lru_lock);
-	if (list_empty(&bp->b_lru)) {
-		atomic_inc(&bp->b_hold);
-		list_add_tail(&bp->b_lru, &btp->bt_lru);
-		btp->bt_lru_nr++;
-		bp->b_lru_flags &= ~_XBF_LRU_DISPOSE;
-	}
-	spin_unlock(&btp->bt_lru_lock);
-}
-
-/*
- * xfs_buf_lru_del - remove a buffer from the LRU
- *
- * The unlocked check is safe here because it only occurs when there are not
- * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
- * to optimise the shrinker removing the buffer from the LRU and calling
- * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
- * bt_lru_lock.
- */
-STATIC void
-xfs_buf_lru_del(
-	struct xfs_buf	*bp)
-{
-	struct xfs_buftarg *btp = bp->b_target;
-
-	if (list_empty(&bp->b_lru))
-		return;
-
-	spin_lock(&btp->bt_lru_lock);
-	if (!list_empty(&bp->b_lru)) {
-		list_del_init(&bp->b_lru);
-		btp->bt_lru_nr--;
-	}
-	spin_unlock(&btp->bt_lru_lock);
-}
-
-/*
  * When we mark a buffer stale, we remove the buffer from the LRU and clear the
  * b_lru_ref count so that the buffer is freed immediately when the buffer
  * reference count falls to zero. If the buffer is already on the LRU, we need
@@ -151,20 +103,14 @@ xfs_buf_stale(
 	 */
 	bp->b_flags &= ~_XBF_DELWRI_Q;
 
-	atomic_set(&(bp)->b_lru_ref, 0);
-	if (!list_empty(&bp->b_lru)) {
-		struct xfs_buftarg *btp = bp->b_target;
+	spin_lock(&bp->b_lock);
+	atomic_set(&bp->b_lru_ref, 0);
+	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
+	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
+		atomic_dec(&bp->b_hold);
 
-		spin_lock(&btp->bt_lru_lock);
-		if (!list_empty(&bp->b_lru) &&
-		    !(bp->b_lru_flags & _XBF_LRU_DISPOSE)) {
-			list_del_init(&bp->b_lru);
-			btp->bt_lru_nr--;
-			atomic_dec(&bp->b_hold);
-		}
-		spin_unlock(&btp->bt_lru_lock);
-	}
 	ASSERT(atomic_read(&bp->b_hold) >= 1);
+	spin_unlock(&bp->b_lock);
 }
 
 static int
@@ -228,6 +174,7 @@ _xfs_buf_alloc(
 	INIT_LIST_HEAD(&bp->b_list);
 	RB_CLEAR_NODE(&bp->b_rbnode);
 	sema_init(&bp->b_sema, 0); /* held, no waiters */
+	spin_lock_init(&bp->b_lock);
 	XB_SET_OWNER(bp);
 	bp->b_target = target;
 	bp->b_flags = flags;
@@ -917,12 +864,33 @@ xfs_buf_rele(
 
 	ASSERT(atomic_read(&bp->b_hold) > 0);
 	if (atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock)) {
-		if (!(bp->b_flags & XBF_STALE) &&
-			   atomic_read(&bp->b_lru_ref)) {
-			xfs_buf_lru_add(bp);
+		spin_lock(&bp->b_lock);
+		if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) {
+			/*
+			 * If the buffer is added to the LRU take a new
+			 * reference to the buffer for the LRU and clear the
+			 * (now stale) dispose list state flag
+			 */
+			if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+				bp->b_state &= ~XFS_BSTATE_DISPOSE;
+				atomic_inc(&bp->b_hold);
+			}
+			spin_unlock(&bp->b_lock);
 			spin_unlock(&pag->pag_buf_lock);
 		} else {
-			xfs_buf_lru_del(bp);
+			/*
+			 * most of the time buffers will already be removed from
+			 * the LRU, so optimise that case by checking for the
+			 * XFS_BSTATE_DISPOSE flag indicating the last list the
+			 * buffer was on was the disposal list
+			 */
+			if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
+				list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
+			} else {
+				ASSERT(list_empty(&bp->b_lru));
+			}
+			spin_unlock(&bp->b_lock);
+
 			ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
 			rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
 			spin_unlock(&pag->pag_buf_lock);
@@ -1502,83 +1470,121 @@ xfs_buf_iomove(
  * returned. These buffers will have an elevated hold count, so wait on those
  * while freeing all the buffers only held by the LRU.
  */
+static enum lru_status
+xfs_buftarg_wait_rele(
+	struct list_head	*item,
+	spinlock_t		*lru_lock,
+	void			*arg)
+
+{
+	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
+	struct list_head	*dispose = arg;
+
+	if (atomic_read(&bp->b_hold) > 1) {
+		/* need to wait, so skip it this pass */
+		trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
+		return LRU_SKIP;
+	}
+	if (!spin_trylock(&bp->b_lock))
+		return LRU_SKIP;
+
+	/*
+	 * clear the LRU reference count so the buffer doesn't get
+	 * ignored in xfs_buf_rele().
+	 */
+	atomic_set(&bp->b_lru_ref, 0);
+	bp->b_state |= XFS_BSTATE_DISPOSE;
+	list_move(item, dispose);
+	spin_unlock(&bp->b_lock);
+	return LRU_REMOVED;
+}
+
 void
 xfs_wait_buftarg(
 	struct xfs_buftarg	*btp)
 {
-	struct xfs_buf		*bp;
+	LIST_HEAD(dispose);
+	int loop = 0;
 
-restart:
-	spin_lock(&btp->bt_lru_lock);
-	while (!list_empty(&btp->bt_lru)) {
-		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
-		if (atomic_read(&bp->b_hold) > 1) {
-			trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
-			list_move_tail(&bp->b_lru, &btp->bt_lru);
-			spin_unlock(&btp->bt_lru_lock);
-			delay(100);
-			goto restart;
+	/* loop until there is nothing left on the lru list. */
+	while (list_lru_count(&btp->bt_lru)) {
+		list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
+			      &dispose, LONG_MAX);
+
+		while (!list_empty(&dispose)) {
+			struct xfs_buf *bp;
+			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+			list_del_init(&bp->b_lru);
+			xfs_buf_rele(bp);
 		}
-		/*
-		 * clear the LRU reference count so the buffer doesn't get
-		 * ignored in xfs_buf_rele().
-		 */
-		atomic_set(&bp->b_lru_ref, 0);
-		spin_unlock(&btp->bt_lru_lock);
-		xfs_buf_rele(bp);
-		spin_lock(&btp->bt_lru_lock);
+		if (loop++ != 0)
+			delay(100);
 	}
-	spin_unlock(&btp->bt_lru_lock);
 }
 
-int
-xfs_buftarg_shrink(
+static enum lru_status
+xfs_buftarg_isolate(
+	struct list_head	*item,
+	spinlock_t		*lru_lock,
+	void			*arg)
+{
+	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
+	struct list_head	*dispose = arg;
+
+	/*
+	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
+	 * If we fail to get the lock, just skip it.
+	 */
+	if (!spin_trylock(&bp->b_lock))
+		return LRU_SKIP;
+	/*
+	 * Decrement the b_lru_ref count unless the value is already
+	 * zero. If the value is already zero, we need to reclaim the
+	 * buffer, otherwise it gets another trip through the LRU.
+	 */
+	if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+		spin_unlock(&bp->b_lock);
+		return LRU_ROTATE;
+	}
+
+	bp->b_state |= XFS_BSTATE_DISPOSE;
+	list_move(item, dispose);
+	spin_unlock(&bp->b_lock);
+	return LRU_REMOVED;
+}
+
+static unsigned long
+xfs_buftarg_shrink_scan(
 	struct shrinker		*shrink,
 	struct shrink_control	*sc)
 {
 	struct xfs_buftarg	*btp = container_of(shrink,
 					struct xfs_buftarg, bt_shrinker);
-	struct xfs_buf		*bp;
-	int nr_to_scan = sc->nr_to_scan;
 	LIST_HEAD(dispose);
+	unsigned long		freed;
+	unsigned long		nr_to_scan = sc->nr_to_scan;
 
-	if (!nr_to_scan)
-		return btp->bt_lru_nr;
-
-	spin_lock(&btp->bt_lru_lock);
-	while (!list_empty(&btp->bt_lru)) {
-		if (nr_to_scan-- <= 0)
-			break;
-
-		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
-
-		/*
-		 * Decrement the b_lru_ref count unless the value is already
-		 * zero. If the value is already zero, we need to reclaim the
-		 * buffer, otherwise it gets another trip through the LRU.
-		 */
-		if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
-			list_move_tail(&bp->b_lru, &btp->bt_lru);
-			continue;
-		}
-
-		/*
-		 * remove the buffer from the LRU now to avoid needing another
-		 * lock round trip inside xfs_buf_rele().
-		 */
-		list_move(&bp->b_lru, &dispose);
-		btp->bt_lru_nr--;
-		bp->b_lru_flags |= _XBF_LRU_DISPOSE;
-	}
-	spin_unlock(&btp->bt_lru_lock);
+	freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
+				       &dispose, &nr_to_scan);
 
 	while (!list_empty(&dispose)) {
+		struct xfs_buf *bp;
 		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
 		list_del_init(&bp->b_lru);
 		xfs_buf_rele(bp);
 	}
 
-	return btp->bt_lru_nr;
+	return freed;
+}
+
+static unsigned long
+xfs_buftarg_shrink_count(
+	struct shrinker		*shrink,
+	struct shrink_control	*sc)
+{
+	struct xfs_buftarg	*btp = container_of(shrink,
+					struct xfs_buftarg, bt_shrinker);
+	return list_lru_count_node(&btp->bt_lru, sc->nid);
 }
 
 void
@@ -1587,6 +1593,7 @@ xfs_free_buftarg(
 	struct xfs_buftarg	*btp)
 {
 	unregister_shrinker(&btp->bt_shrinker);
+	list_lru_destroy(&btp->bt_lru);
 
 	if (mp->m_flags & XFS_MOUNT_BARRIER)
 		xfs_blkdev_issue_flush(btp);
@@ -1660,12 +1667,16 @@ xfs_alloc_buftarg(
 	if (!btp->bt_bdi)
 		goto error;
 
-	INIT_LIST_HEAD(&btp->bt_lru);
-	spin_lock_init(&btp->bt_lru_lock);
 	if (xfs_setsize_buftarg_early(btp, bdev))
 		goto error;
-	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
+
+	if (list_lru_init(&btp->bt_lru))
+		goto error;
+
+	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
+	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
 	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
+	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
 	register_shrinker(&btp->bt_shrinker);
 	return btp;
 
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 433a12ed7b17..e65683361017 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -25,6 +25,7 @@
 #include <linux/fs.h>
 #include <linux/buffer_head.h>
 #include <linux/uio.h>
+#include <linux/list_lru.h>
 
 /*
  *	Base types
@@ -59,7 +60,6 @@ typedef enum {
 #define _XBF_KMEM	 (1 << 21)/* backed by heap memory */
 #define _XBF_DELWRI_Q	 (1 << 22)/* buffer on a delwri queue */
 #define _XBF_COMPOUND	 (1 << 23)/* compound buffer */
-#define _XBF_LRU_DISPOSE (1 << 24)/* buffer being discarded */
 
 typedef unsigned int xfs_buf_flags_t;
 
@@ -78,8 +78,12 @@ typedef unsigned int xfs_buf_flags_t;
 	{ _XBF_PAGES,		"PAGES" }, \
 	{ _XBF_KMEM,		"KMEM" }, \
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }, \
-	{ _XBF_COMPOUND,	"COMPOUND" }, \
-	{ _XBF_LRU_DISPOSE,	"LRU_DISPOSE" }
+	{ _XBF_COMPOUND,	"COMPOUND" }
+
+/*
+ * Internal state flags.
+ */
+#define XFS_BSTATE_DISPOSE	 (1 << 0)	/* buffer being discarded */
 
 typedef struct xfs_buftarg {
 	dev_t			bt_dev;
@@ -92,9 +96,7 @@ typedef struct xfs_buftarg {
 
 	/* LRU control structures */
 	struct shrinker		bt_shrinker;
-	struct list_head	bt_lru;
-	spinlock_t		bt_lru_lock;
-	unsigned int		bt_lru_nr;
+	struct list_lru		bt_lru;
 } xfs_buftarg_t;
 
 struct xfs_buf;
@@ -137,7 +139,8 @@ typedef struct xfs_buf {
 	 * bt_lru_lock and not by b_sema
 	 */
 	struct list_head	b_lru;		/* lru list */
-	xfs_buf_flags_t		b_lru_flags;	/* internal lru status flags */
+	spinlock_t		b_lock;		/* internal state lock */
+	unsigned int		b_state;	/* internal state flags */
 	wait_queue_head_t	b_waiters;	/* unpin waiters */
 	struct list_head	b_list;
 	struct xfs_perag	*b_pag;		/* contains rbtree root */
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 3a944b198e35..88c5ea75ebf6 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -613,13 +613,27 @@ xfs_buf_item_unlock(
 			}
 		}
 	}
-	if (clean || aborted) {
-		if (atomic_dec_and_test(&bip->bli_refcount)) {
-			ASSERT(!aborted || XFS_FORCED_SHUTDOWN(lip->li_mountp));
+
+	/*
+	 * Clean buffers, by definition, cannot be in the AIL. However, aborted
+	 * buffers may be dirty and hence in the AIL. Therefore if we are
+	 * aborting a buffer and we've just taken the last refernce away, we
+	 * have to check if it is in the AIL before freeing it. We need to free
+	 * it in this case, because an aborted transaction has already shut the
+	 * filesystem down and this is the last chance we will have to do so.
+	 */
+	if (atomic_dec_and_test(&bip->bli_refcount)) {
+		if (clean)
+			xfs_buf_item_relse(bp);
+		else if (aborted) {
+			ASSERT(XFS_FORCED_SHUTDOWN(lip->li_mountp));
+			if (lip->li_flags & XFS_LI_IN_AIL) {
+				xfs_trans_ail_delete(lip->li_ailp, lip,
+						     SHUTDOWN_LOG_IO_ERROR);
+			}
 			xfs_buf_item_relse(bp);
 		}
-	} else
-		atomic_dec(&bip->bli_refcount);
+	}
 
 	if (!(flags & XFS_BLI_HOLD))
 		xfs_buf_relse(bp);
diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c
index d4e59a4ff59f..069537c845e5 100644
--- a/fs/xfs/xfs_da_btree.c
+++ b/fs/xfs/xfs_da_btree.c
@@ -635,6 +635,7 @@ xfs_da3_root_split(
 	xfs_trans_log_buf(tp, bp, 0, size - 1);
 
 	bp->b_ops = blk1->bp->b_ops;
+	xfs_trans_buf_copy_type(bp, blk1->bp);
 	blk1->bp = bp;
 	blk1->blkno = blkno;
 
diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c
index 08984eeee159..1021c8356d08 100644
--- a/fs/xfs/xfs_dir2_leaf.c
+++ b/fs/xfs/xfs_dir2_leaf.c
@@ -180,6 +180,11 @@ xfs_dir3_leaf_check_int(
 	return true;
 }
 
+/*
+ * We verify the magic numbers before decoding the leaf header so that on debug
+ * kernels we don't get assertion failures in xfs_dir3_leaf_hdr_from_disk() due
+ * to incorrect magic numbers.
+ */
 static bool
 xfs_dir3_leaf_verify(
 	struct xfs_buf		*bp,
@@ -191,24 +196,25 @@ xfs_dir3_leaf_verify(
 
 	ASSERT(magic == XFS_DIR2_LEAF1_MAGIC || magic == XFS_DIR2_LEAFN_MAGIC);
 
-	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
 	if (xfs_sb_version_hascrc(&mp->m_sb)) {
 		struct xfs_dir3_leaf_hdr *leaf3 = bp->b_addr;
+		__uint16_t		magic3;
 
-		if ((magic == XFS_DIR2_LEAF1_MAGIC &&
-		     leafhdr.magic != XFS_DIR3_LEAF1_MAGIC) ||
-		    (magic == XFS_DIR2_LEAFN_MAGIC &&
-		     leafhdr.magic != XFS_DIR3_LEAFN_MAGIC))
-			return false;
+		magic3 = (magic == XFS_DIR2_LEAF1_MAGIC) ? XFS_DIR3_LEAF1_MAGIC
+							 : XFS_DIR3_LEAFN_MAGIC;
 
+		if (leaf3->info.hdr.magic != cpu_to_be16(magic3))
+			return false;
 		if (!uuid_equal(&leaf3->info.uuid, &mp->m_sb.sb_uuid))
 			return false;
 		if (be64_to_cpu(leaf3->info.blkno) != bp->b_bn)
 			return false;
 	} else {
-		if (leafhdr.magic != magic)
+		if (leaf->hdr.info.magic != cpu_to_be16(magic))
 			return false;
 	}
+
+	xfs_dir3_leaf_hdr_from_disk(&leafhdr, leaf);
 	return xfs_dir3_leaf_check_int(mp, &leafhdr, leaf);
 }
 
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 251c66632e5e..71520e6e5d65 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -940,13 +940,8 @@ xfs_qm_dqput_final(
 
 	trace_xfs_dqput_free(dqp);
 
-	mutex_lock(&qi->qi_lru_lock);
-	if (list_empty(&dqp->q_lru)) {
-		list_add_tail(&dqp->q_lru, &qi->qi_lru_list);
-		qi->qi_lru_count++;
+	if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
 		XFS_STATS_INC(xs_qm_dquot_unused);
-	}
-	mutex_unlock(&qi->qi_lru_lock);
 
 	/*
 	 * If we just added a udquot to the freelist, then we want to release
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 60c6e1f12695..e838d84b4e85 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -142,7 +142,8 @@ xfs_qm_dqunpin_wait(
 STATIC uint
 xfs_qm_dquot_logitem_push(
 	struct xfs_log_item	*lip,
-	struct list_head	*buffer_list)
+	struct list_head	*buffer_list) __releases(&lip->li_ailp->xa_lock)
+					      __acquires(&lip->li_ailp->xa_lock)
 {
 	struct xfs_dquot	*dqp = DQUOT_ITEM(lip)->qli_dquot;
 	struct xfs_buf		*bp = NULL;
diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c
index 86f559f6e5d3..e43708e2f080 100644
--- a/fs/xfs/xfs_extent_busy.c
+++ b/fs/xfs/xfs_extent_busy.c
@@ -160,7 +160,8 @@ xfs_extent_busy_update_extent(
 	struct xfs_extent_busy	*busyp,
 	xfs_agblock_t		fbno,
 	xfs_extlen_t		flen,
-	bool			userdata)
+	bool			userdata) __releases(&pag->pagb_lock)
+					  __acquires(&pag->pagb_lock)
 {
 	xfs_agblock_t		fend = fbno + flen;
 	xfs_agblock_t		bbno = busyp->bno;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 16219b9c6790..193206ba4358 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -48,7 +48,7 @@ STATIC void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp,
 /*
  * Allocate and initialise an xfs_inode.
  */
-STATIC struct xfs_inode *
+struct xfs_inode *
 xfs_inode_alloc(
 	struct xfs_mount	*mp,
 	xfs_ino_t		ino)
@@ -98,7 +98,7 @@ xfs_inode_free_callback(
 	kmem_zone_free(xfs_inode_zone, ip);
 }
 
-STATIC void
+void
 xfs_inode_free(
 	struct xfs_inode	*ip)
 {
@@ -1167,7 +1167,7 @@ xfs_reclaim_inodes(
  * them to be cleaned, which we hope will not be very long due to the
  * background walker having already kicked the IO off on those dirty inodes.
  */
-void
+long
 xfs_reclaim_inodes_nr(
 	struct xfs_mount	*mp,
 	int			nr_to_scan)
@@ -1176,7 +1176,7 @@ xfs_reclaim_inodes_nr(
 	xfs_reclaim_work_queue(mp);
 	xfs_ail_push_all(mp->m_ail);
 
-	xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+	return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
 }
 
 /*
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 8a89f7d791bd..9ed68bb750f5 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -42,11 +42,15 @@ struct xfs_eofblocks {
 int xfs_iget(struct xfs_mount *mp, struct xfs_trans *tp, xfs_ino_t ino,
 	     uint flags, uint lock_flags, xfs_inode_t **ipp);
 
+/* recovery needs direct inode allocation capability */
+struct xfs_inode * xfs_inode_alloc(struct xfs_mount *mp, xfs_ino_t ino);
+void xfs_inode_free(struct xfs_inode *ip);
+
 void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
-void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
+long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
 
 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
 
diff --git a/fs/xfs/xfs_inode_buf.c b/fs/xfs/xfs_inode_buf.c
index e011d597f12f..63382d37f565 100644
--- a/fs/xfs/xfs_inode_buf.c
+++ b/fs/xfs/xfs_inode_buf.c
@@ -53,9 +53,8 @@ xfs_inobp_check(
 					i * mp->m_sb.sb_inodesize);
 		if (!dip->di_next_unlinked)  {
 			xfs_alert(mp,
-	"Detected bogus zero next_unlinked field in incore inode buffer 0x%p.",
-				bp);
-			ASSERT(dip->di_next_unlinked);
+	"Detected bogus zero next_unlinked field in inode %d buffer 0x%llx.",
+				i, (long long)bp->b_bn);
 		}
 	}
 }
@@ -106,11 +105,10 @@ xfs_inode_buf_verify(
 			XFS_CORRUPTION_ERROR(__func__, XFS_ERRLEVEL_HIGH,
 					     mp, dip);
 #ifdef DEBUG
-			xfs_emerg(mp,
+			xfs_alert(mp,
 				"bad inode magic/vsn daddr %lld #%d (magic=%x)",
 				(unsigned long long)bp->b_bn, i,
 				be16_to_cpu(dip->di_magic));
-			ASSERT(0);
 #endif
 		}
 	}
@@ -196,7 +194,7 @@ xfs_imap_to_bp(
 	return 0;
 }
 
-STATIC void
+void
 xfs_dinode_from_disk(
 	xfs_icdinode_t		*to,
 	xfs_dinode_t		*from)
diff --git a/fs/xfs/xfs_inode_buf.h b/fs/xfs/xfs_inode_buf.h
index 599e6c0ca2a9..abba0ae8cf2d 100644
--- a/fs/xfs/xfs_inode_buf.h
+++ b/fs/xfs/xfs_inode_buf.h
@@ -32,17 +32,17 @@ struct xfs_imap {
 	ushort		im_boffset;	/* inode offset in block in bytes */
 };
 
-int		xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
-			       struct xfs_imap *, struct xfs_dinode **,
-			       struct xfs_buf **, uint, uint);
-int		xfs_iread(struct xfs_mount *, struct xfs_trans *,
-			  struct xfs_inode *, uint);
-void		xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
-void		xfs_dinode_to_disk(struct xfs_dinode *,
-				   struct xfs_icdinode *);
+int	xfs_imap_to_bp(struct xfs_mount *, struct xfs_trans *,
+		       struct xfs_imap *, struct xfs_dinode **,
+		       struct xfs_buf **, uint, uint);
+int	xfs_iread(struct xfs_mount *, struct xfs_trans *,
+		  struct xfs_inode *, uint);
+void	xfs_dinode_calc_crc(struct xfs_mount *, struct xfs_dinode *);
+void	xfs_dinode_to_disk(struct xfs_dinode *to, struct xfs_icdinode *from);
+void	xfs_dinode_from_disk(struct xfs_icdinode *to, struct xfs_dinode *from);
 
 #if defined(DEBUG)
-void		xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
+void	xfs_inobp_check(struct xfs_mount *, struct xfs_buf *);
 #else
 #define	xfs_inobp_check(mp, bp)
 #endif /* DEBUG */
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index bdebc21078d7..668e8f4ccf5e 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -71,7 +71,7 @@ xfs_find_handle(
 	int			hsize;
 	xfs_handle_t		handle;
 	struct inode		*inode;
-	struct fd		f = {0};
+	struct fd		f = {NULL};
 	struct path		path;
 	int			error;
 	struct xfs_inode	*ip;
@@ -456,12 +456,9 @@ xfs_attrlist_by_handle(
 	if (IS_ERR(dentry))
 		return PTR_ERR(dentry);
 
-	kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL);
-	if (!kbuf) {
-		kbuf = kmem_zalloc_large(al_hreq.buflen);
-		if (!kbuf)
-			goto out_dput;
-	}
+	kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
+	if (!kbuf)
+		goto out_dput;
 
 	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
 	error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
@@ -472,12 +469,9 @@ xfs_attrlist_by_handle(
 	if (copy_to_user(al_hreq.buffer, kbuf, al_hreq.buflen))
 		error = -EFAULT;
 
- out_kfree:
-	if (is_vmalloc_addr(kbuf))
-		kmem_free_large(kbuf);
-	else
-		kmem_free(kbuf);
- out_dput:
+out_kfree:
+	kmem_free(kbuf);
+out_dput:
 	dput(dentry);
 	return error;
 }
@@ -495,12 +489,9 @@ xfs_attrmulti_attr_get(
 
 	if (*len > XATTR_SIZE_MAX)
 		return EINVAL;
-	kbuf = kmem_zalloc(*len, KM_SLEEP | KM_MAYFAIL);
-	if (!kbuf) {
-		kbuf = kmem_zalloc_large(*len);
-		if (!kbuf)
-			return ENOMEM;
-	}
+	kbuf = kmem_zalloc_large(*len, KM_SLEEP);
+	if (!kbuf)
+		return ENOMEM;
 
 	error = xfs_attr_get(XFS_I(inode), name, kbuf, (int *)len, flags);
 	if (error)
@@ -509,11 +500,8 @@ xfs_attrmulti_attr_get(
 	if (copy_to_user(ubuf, kbuf, *len))
 		error = EFAULT;
 
- out_kfree:
-	if (is_vmalloc_addr(kbuf))
-		kmem_free_large(kbuf);
-	else
-		kmem_free(kbuf);
+out_kfree:
+	kmem_free(kbuf);
 	return error;
 }
 
diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c
index d3ab9534307f..f671f7e472ac 100644
--- a/fs/xfs/xfs_ioctl32.c
+++ b/fs/xfs/xfs_ioctl32.c
@@ -371,12 +371,9 @@ xfs_compat_attrlist_by_handle(
 		return PTR_ERR(dentry);
 
 	error = -ENOMEM;
-	kbuf = kmem_zalloc(al_hreq.buflen, KM_SLEEP | KM_MAYFAIL);
-	if (!kbuf) {
-		kbuf = kmem_zalloc_large(al_hreq.buflen);
-		if (!kbuf)
-			goto out_dput;
-	}
+	kbuf = kmem_zalloc_large(al_hreq.buflen, KM_SLEEP);
+	if (!kbuf)
+		goto out_dput;
 
 	cursor = (attrlist_cursor_kern_t *)&al_hreq.pos;
 	error = -xfs_attr_list(XFS_I(dentry->d_inode), kbuf, al_hreq.buflen,
@@ -387,12 +384,9 @@ xfs_compat_attrlist_by_handle(
 	if (copy_to_user(compat_ptr(al_hreq.buffer), kbuf, al_hreq.buflen))
 		error = -EFAULT;
 
- out_kfree:
-	if (is_vmalloc_addr(kbuf))
-		kmem_free_large(kbuf);
-	else
-		kmem_free(kbuf);
- out_dput:
+out_kfree:
+	kmem_free(kbuf);
+out_dput:
 	dput(dentry);
 	return error;
 }
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index b93e14b86754..084b3e1741fd 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -495,7 +495,7 @@ xfs_bulkstat(
 	/*
 	 * Done, we're either out of filesystem or space to put the data.
 	 */
-	kmem_free_large(irbuf);
+	kmem_free(irbuf);
 	*ubcountp = ubelem;
 	/*
 	 * Found some inodes, return them now and return the error next time.
@@ -541,8 +541,9 @@ xfs_bulkstat_single(
 	 * at the expense of the error case.
 	 */
 
-	ino = (xfs_ino_t)*lastinop;
-	error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t), 0, &res);
+	ino = *lastinop;
+	error = xfs_bulkstat_one(mp, ino, buffer, sizeof(xfs_bstat_t),
+				 NULL, &res);
 	if (error) {
 		/*
 		 * Special case way failed, do it the "long" way
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index 5372d58ef93a..a2dea108071a 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -257,7 +257,8 @@ xlog_grant_head_wait(
 	struct xlog		*log,
 	struct xlog_grant_head	*head,
 	struct xlog_ticket	*tic,
-	int			need_bytes)
+	int			need_bytes) __releases(&head->lock)
+					    __acquires(&head->lock)
 {
 	list_add_tail(&tic->t_queue, &head->waiters);
 
diff --git a/fs/xfs/xfs_log_format.h b/fs/xfs/xfs_log_format.h
index 31e3a06c4644..ca7e28a8ed31 100644
--- a/fs/xfs/xfs_log_format.h
+++ b/fs/xfs/xfs_log_format.h
@@ -474,6 +474,8 @@ typedef struct xfs_inode_log_format_64 {
 #define	XFS_ILOG_ADATA	0x040	/* log i_af.if_data */
 #define	XFS_ILOG_AEXT	0x080	/* log i_af.if_extents */
 #define	XFS_ILOG_ABROOT	0x100	/* log i_af.i_broot */
+#define XFS_ILOG_DOWNER	0x200	/* change the data fork owner on replay */
+#define XFS_ILOG_AOWNER	0x400	/* change the attr fork owner on replay */
 
 
 /*
@@ -487,7 +489,8 @@ typedef struct xfs_inode_log_format_64 {
 #define	XFS_ILOG_NONCORE	(XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
 				 XFS_ILOG_DBROOT | XFS_ILOG_DEV | \
 				 XFS_ILOG_UUID | XFS_ILOG_ADATA | \
-				 XFS_ILOG_AEXT | XFS_ILOG_ABROOT)
+				 XFS_ILOG_AEXT | XFS_ILOG_ABROOT | \
+				 XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
 
 #define	XFS_ILOG_DFORK		(XFS_ILOG_DDATA | XFS_ILOG_DEXT | \
 				 XFS_ILOG_DBROOT)
@@ -499,7 +502,8 @@ typedef struct xfs_inode_log_format_64 {
 				 XFS_ILOG_DEXT | XFS_ILOG_DBROOT | \
 				 XFS_ILOG_DEV | XFS_ILOG_UUID | \
 				 XFS_ILOG_ADATA | XFS_ILOG_AEXT | \
-				 XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP)
+				 XFS_ILOG_ABROOT | XFS_ILOG_TIMESTAMP | \
+				 XFS_ILOG_DOWNER | XFS_ILOG_AOWNER)
 
 static inline int xfs_ilog_fbroot(int w)
 {
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 7c0c1fdc728b..dabda9521b4b 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2014,7 +2014,7 @@ xlog_recover_get_buf_lsn(
 	case XFS_ATTR3_RMT_MAGIC:
 		return be64_to_cpu(((struct xfs_attr3_rmt_hdr *)blk)->rm_lsn);
 	case XFS_SB_MAGIC:
-		return be64_to_cpu(((struct xfs_sb *)blk)->sb_lsn);
+		return be64_to_cpu(((struct xfs_dsb *)blk)->sb_lsn);
 	default:
 		break;
 	}
@@ -2629,6 +2629,82 @@ out_release:
 	return error;
 }
 
+/*
+ * Inode fork owner changes
+ *
+ * If we have been told that we have to reparent the inode fork, it's because an
+ * extent swap operation on a CRC enabled filesystem has been done and we are
+ * replaying it. We need to walk the BMBT of the appropriate fork and change the
+ * owners of it.
+ *
+ * The complexity here is that we don't have an inode context to work with, so
+ * after we've replayed the inode we need to instantiate one.  This is where the
+ * fun begins.
+ *
+ * We are in the middle of log recovery, so we can't run transactions. That
+ * means we cannot use cache coherent inode instantiation via xfs_iget(), as
+ * that will result in the corresponding iput() running the inode through
+ * xfs_inactive(). If we've just replayed an inode core that changes the link
+ * count to zero (i.e. it's been unlinked), then xfs_inactive() will run
+ * transactions (bad!).
+ *
+ * So, to avoid this, we instantiate an inode directly from the inode core we've
+ * just recovered. We have the buffer still locked, and all we really need to
+ * instantiate is the inode core and the forks being modified. We can do this
+ * manually, then run the inode btree owner change, and then tear down the
+ * xfs_inode without having to run any transactions at all.
+ *
+ * Also, because we don't have a transaction context available here but need to
+ * gather all the buffers we modify for writeback so we pass the buffer_list
+ * instead for the operation to use.
+ */
+
+STATIC int
+xfs_recover_inode_owner_change(
+	struct xfs_mount	*mp,
+	struct xfs_dinode	*dip,
+	struct xfs_inode_log_format *in_f,
+	struct list_head	*buffer_list)
+{
+	struct xfs_inode	*ip;
+	int			error;
+
+	ASSERT(in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER));
+
+	ip = xfs_inode_alloc(mp, in_f->ilf_ino);
+	if (!ip)
+		return ENOMEM;
+
+	/* instantiate the inode */
+	xfs_dinode_from_disk(&ip->i_d, dip);
+	ASSERT(ip->i_d.di_version >= 3);
+
+	error = xfs_iformat_fork(ip, dip);
+	if (error)
+		goto out_free_ip;
+
+
+	if (in_f->ilf_fields & XFS_ILOG_DOWNER) {
+		ASSERT(in_f->ilf_fields & XFS_ILOG_DBROOT);
+		error = xfs_bmbt_change_owner(NULL, ip, XFS_DATA_FORK,
+					      ip->i_ino, buffer_list);
+		if (error)
+			goto out_free_ip;
+	}
+
+	if (in_f->ilf_fields & XFS_ILOG_AOWNER) {
+		ASSERT(in_f->ilf_fields & XFS_ILOG_ABROOT);
+		error = xfs_bmbt_change_owner(NULL, ip, XFS_ATTR_FORK,
+					      ip->i_ino, buffer_list);
+		if (error)
+			goto out_free_ip;
+	}
+
+out_free_ip:
+	xfs_inode_free(ip);
+	return error;
+}
+
 STATIC int
 xlog_recover_inode_pass2(
 	struct xlog			*log,
@@ -2681,8 +2757,7 @@ xlog_recover_inode_pass2(
 	error = bp->b_error;
 	if (error) {
 		xfs_buf_ioerror_alert(bp, "xlog_recover_do..(read#2)");
-		xfs_buf_relse(bp);
-		goto error;
+		goto out_release;
 	}
 	ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
 	dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
@@ -2692,30 +2767,31 @@ xlog_recover_inode_pass2(
 	 * like an inode!
 	 */
 	if (unlikely(dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))) {
-		xfs_buf_relse(bp);
 		xfs_alert(mp,
 	"%s: Bad inode magic number, dip = 0x%p, dino bp = 0x%p, ino = %Ld",
 			__func__, dip, bp, in_f->ilf_ino);
 		XFS_ERROR_REPORT("xlog_recover_inode_pass2(1)",
 				 XFS_ERRLEVEL_LOW, mp);
 		error = EFSCORRUPTED;
-		goto error;
+		goto out_release;
 	}
 	dicp = item->ri_buf[1].i_addr;
 	if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
-		xfs_buf_relse(bp);
 		xfs_alert(mp,
 			"%s: Bad inode log record, rec ptr 0x%p, ino %Ld",
 			__func__, item, in_f->ilf_ino);
 		XFS_ERROR_REPORT("xlog_recover_inode_pass2(2)",
 				 XFS_ERRLEVEL_LOW, mp);
 		error = EFSCORRUPTED;
-		goto error;
+		goto out_release;
 	}
 
 	/*
 	 * If the inode has an LSN in it, recover the inode only if it's less
-	 * than the lsn of the transaction we are replaying.
+	 * than the lsn of the transaction we are replaying. Note: we still
+	 * need to replay an owner change even though the inode is more recent
+	 * than the transaction as there is no guarantee that all the btree
+	 * blocks are more recent than this transaction, too.
 	 */
 	if (dip->di_version >= 3) {
 		xfs_lsn_t	lsn = be64_to_cpu(dip->di_lsn);
@@ -2723,7 +2799,7 @@ xlog_recover_inode_pass2(
 		if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
 			trace_xfs_log_recover_inode_skip(log, in_f);
 			error = 0;
-			goto out_release;
+			goto out_owner_change;
 		}
 	}
 
@@ -2745,10 +2821,9 @@ xlog_recover_inode_pass2(
 		    dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
 			/* do nothing */
 		} else {
-			xfs_buf_relse(bp);
 			trace_xfs_log_recover_inode_skip(log, in_f);
 			error = 0;
-			goto error;
+			goto out_release;
 		}
 	}
 
@@ -2760,13 +2835,12 @@ xlog_recover_inode_pass2(
 		    (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
 			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(3)",
 					 XFS_ERRLEVEL_LOW, mp, dicp);
-			xfs_buf_relse(bp);
 			xfs_alert(mp,
 		"%s: Bad regular inode log record, rec ptr 0x%p, "
 		"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
 				__func__, item, dip, bp, in_f->ilf_ino);
 			error = EFSCORRUPTED;
-			goto error;
+			goto out_release;
 		}
 	} else if (unlikely(S_ISDIR(dicp->di_mode))) {
 		if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
@@ -2774,19 +2848,17 @@ xlog_recover_inode_pass2(
 		    (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
 			XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(4)",
 					     XFS_ERRLEVEL_LOW, mp, dicp);
-			xfs_buf_relse(bp);
 			xfs_alert(mp,
 		"%s: Bad dir inode log record, rec ptr 0x%p, "
 		"ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
 				__func__, item, dip, bp, in_f->ilf_ino);
 			error = EFSCORRUPTED;
-			goto error;
+			goto out_release;
 		}
 	}
 	if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(5)",
 				     XFS_ERRLEVEL_LOW, mp, dicp);
-		xfs_buf_relse(bp);
 		xfs_alert(mp,
 	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
 	"dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
@@ -2794,29 +2866,27 @@ xlog_recover_inode_pass2(
 			dicp->di_nextents + dicp->di_anextents,
 			dicp->di_nblocks);
 		error = EFSCORRUPTED;
-		goto error;
+		goto out_release;
 	}
 	if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(6)",
 				     XFS_ERRLEVEL_LOW, mp, dicp);
-		xfs_buf_relse(bp);
 		xfs_alert(mp,
 	"%s: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, "
 	"dino bp 0x%p, ino %Ld, forkoff 0x%x", __func__,
 			item, dip, bp, in_f->ilf_ino, dicp->di_forkoff);
 		error = EFSCORRUPTED;
-		goto error;
+		goto out_release;
 	}
 	isize = xfs_icdinode_size(dicp->di_version);
 	if (unlikely(item->ri_buf[1].i_len > isize)) {
 		XFS_CORRUPTION_ERROR("xlog_recover_inode_pass2(7)",
 				     XFS_ERRLEVEL_LOW, mp, dicp);
-		xfs_buf_relse(bp);
 		xfs_alert(mp,
 			"%s: Bad inode log record length %d, rec ptr 0x%p",
 			__func__, item->ri_buf[1].i_len, item);
 		error = EFSCORRUPTED;
-		goto error;
+		goto out_release;
 	}
 
 	/* The core is in in-core format */
@@ -2842,7 +2912,7 @@ xlog_recover_inode_pass2(
 	}
 
 	if (in_f->ilf_size == 2)
-		goto write_inode_buffer;
+		goto out_owner_change;
 	len = item->ri_buf[2].i_len;
 	src = item->ri_buf[2].i_addr;
 	ASSERT(in_f->ilf_size <= 4);
@@ -2903,13 +2973,15 @@ xlog_recover_inode_pass2(
 		default:
 			xfs_warn(log->l_mp, "%s: Invalid flag", __func__);
 			ASSERT(0);
-			xfs_buf_relse(bp);
 			error = EIO;
-			goto error;
+			goto out_release;
 		}
 	}
 
-write_inode_buffer:
+out_owner_change:
+	if (in_f->ilf_fields & (XFS_ILOG_DOWNER|XFS_ILOG_AOWNER))
+		error = xfs_recover_inode_owner_change(mp, dip, in_f,
+						       buffer_list);
 	/* re-generate the checksum. */
 	xfs_dinode_calc_crc(log->l_mp, dip);
 
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 6218a0aeeeea..3e6c2e6c9cd2 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -51,8 +51,9 @@
  */
 STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int	xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int	xfs_qm_shake(struct shrinker *, struct shrink_control *);
 
+
+STATIC void	xfs_qm_dqfree_one(struct xfs_dquot *dqp);
 /*
  * We use the batch lookup interface to iterate over the dquots as it
  * currently is the only interface into the radix tree code that allows
@@ -203,12 +204,9 @@ xfs_qm_dqpurge(
 	 * We move dquots to the freelist as soon as their reference count
 	 * hits zero, so it really should be on the freelist here.
 	 */
-	mutex_lock(&qi->qi_lru_lock);
 	ASSERT(!list_empty(&dqp->q_lru));
-	list_del_init(&dqp->q_lru);
-	qi->qi_lru_count--;
+	list_lru_del(&qi->qi_lru, &dqp->q_lru);
 	XFS_STATS_DEC(xs_qm_dquot_unused);
-	mutex_unlock(&qi->qi_lru_lock);
 
 	xfs_qm_dqdestroy(dqp);
 
@@ -680,6 +678,143 @@ xfs_qm_calc_dquots_per_chunk(
 	return ndquots;
 }
 
+struct xfs_qm_isolate {
+	struct list_head	buffers;
+	struct list_head	dispose;
+};
+
+static enum lru_status
+xfs_qm_dquot_isolate(
+	struct list_head	*item,
+	spinlock_t		*lru_lock,
+	void			*arg)
+{
+	struct xfs_dquot	*dqp = container_of(item,
+						struct xfs_dquot, q_lru);
+	struct xfs_qm_isolate	*isol = arg;
+
+	if (!xfs_dqlock_nowait(dqp))
+		goto out_miss_busy;
+
+	/*
+	 * This dquot has acquired a reference in the meantime remove it from
+	 * the freelist and try again.
+	 */
+	if (dqp->q_nrefs) {
+		xfs_dqunlock(dqp);
+		XFS_STATS_INC(xs_qm_dqwants);
+
+		trace_xfs_dqreclaim_want(dqp);
+		list_del_init(&dqp->q_lru);
+		XFS_STATS_DEC(xs_qm_dquot_unused);
+		return LRU_REMOVED;
+	}
+
+	/*
+	 * If the dquot is dirty, flush it. If it's already being flushed, just
+	 * skip it so there is time for the IO to complete before we try to
+	 * reclaim it again on the next LRU pass.
+	 */
+	if (!xfs_dqflock_nowait(dqp)) {
+		xfs_dqunlock(dqp);
+		goto out_miss_busy;
+	}
+
+	if (XFS_DQ_IS_DIRTY(dqp)) {
+		struct xfs_buf	*bp = NULL;
+		int		error;
+
+		trace_xfs_dqreclaim_dirty(dqp);
+
+		/* we have to drop the LRU lock to flush the dquot */
+		spin_unlock(lru_lock);
+
+		error = xfs_qm_dqflush(dqp, &bp);
+		if (error) {
+			xfs_warn(dqp->q_mount, "%s: dquot %p flush failed",
+				 __func__, dqp);
+			goto out_unlock_dirty;
+		}
+
+		xfs_buf_delwri_queue(bp, &isol->buffers);
+		xfs_buf_relse(bp);
+		goto out_unlock_dirty;
+	}
+	xfs_dqfunlock(dqp);
+
+	/*
+	 * Prevent lookups now that we are past the point of no return.
+	 */
+	dqp->dq_flags |= XFS_DQ_FREEING;
+	xfs_dqunlock(dqp);
+
+	ASSERT(dqp->q_nrefs == 0);
+	list_move_tail(&dqp->q_lru, &isol->dispose);
+	XFS_STATS_DEC(xs_qm_dquot_unused);
+	trace_xfs_dqreclaim_done(dqp);
+	XFS_STATS_INC(xs_qm_dqreclaims);
+	return LRU_REMOVED;
+
+out_miss_busy:
+	trace_xfs_dqreclaim_busy(dqp);
+	XFS_STATS_INC(xs_qm_dqreclaim_misses);
+	return LRU_SKIP;
+
+out_unlock_dirty:
+	trace_xfs_dqreclaim_busy(dqp);
+	XFS_STATS_INC(xs_qm_dqreclaim_misses);
+	xfs_dqunlock(dqp);
+	spin_lock(lru_lock);
+	return LRU_RETRY;
+}
+
+static unsigned long
+xfs_qm_shrink_scan(
+	struct shrinker		*shrink,
+	struct shrink_control	*sc)
+{
+	struct xfs_quotainfo	*qi = container_of(shrink,
+					struct xfs_quotainfo, qi_shrinker);
+	struct xfs_qm_isolate	isol;
+	unsigned long		freed;
+	int			error;
+	unsigned long		nr_to_scan = sc->nr_to_scan;
+
+	if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+		return 0;
+
+	INIT_LIST_HEAD(&isol.buffers);
+	INIT_LIST_HEAD(&isol.dispose);
+
+	freed = list_lru_walk_node(&qi->qi_lru, sc->nid, xfs_qm_dquot_isolate, &isol,
+					&nr_to_scan);
+
+	error = xfs_buf_delwri_submit(&isol.buffers);
+	if (error)
+		xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
+
+	while (!list_empty(&isol.dispose)) {
+		struct xfs_dquot	*dqp;
+
+		dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru);
+		list_del_init(&dqp->q_lru);
+		xfs_qm_dqfree_one(dqp);
+	}
+
+	return freed;
+}
+
+static unsigned long
+xfs_qm_shrink_count(
+	struct shrinker		*shrink,
+	struct shrink_control	*sc)
+{
+	struct xfs_quotainfo	*qi = container_of(shrink,
+					struct xfs_quotainfo, qi_shrinker);
+
+	return list_lru_count_node(&qi->qi_lru, sc->nid);
+}
+
 /*
  * This initializes all the quota information that's kept in the
  * mount structure
@@ -696,11 +831,18 @@ xfs_qm_init_quotainfo(
 
 	qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
 
+	if ((error = list_lru_init(&qinf->qi_lru))) {
+		kmem_free(qinf);
+		mp->m_quotainfo = NULL;
+		return error;
+	}
+
 	/*
 	 * See if quotainodes are setup, and if not, allocate them,
 	 * and change the superblock accordingly.
 	 */
 	if ((error = xfs_qm_init_quotainos(mp))) {
+		list_lru_destroy(&qinf->qi_lru);
 		kmem_free(qinf);
 		mp->m_quotainfo = NULL;
 		return error;
@@ -711,10 +853,6 @@ xfs_qm_init_quotainfo(
 	INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
 	mutex_init(&qinf->qi_tree_lock);
 
-	INIT_LIST_HEAD(&qinf->qi_lru_list);
-	qinf->qi_lru_count = 0;
-	mutex_init(&qinf->qi_lru_lock);
-
 	/* mutex used to serialize quotaoffs */
 	mutex_init(&qinf->qi_quotaofflock);
 
@@ -779,8 +917,10 @@ xfs_qm_init_quotainfo(
 		qinf->qi_rtbwarnlimit = XFS_QM_RTBWARNLIMIT;
 	}
 
-	qinf->qi_shrinker.shrink = xfs_qm_shake;
+	qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
+	qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
 	qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
+	qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
 	register_shrinker(&qinf->qi_shrinker);
 	return 0;
 }
@@ -801,6 +941,7 @@ xfs_qm_destroy_quotainfo(
 	ASSERT(qi != NULL);
 
 	unregister_shrinker(&qi->qi_shrinker);
+	list_lru_destroy(&qi->qi_lru);
 
 	if (qi->qi_uquotaip) {
 		IRELE(qi->qi_uquotaip);
@@ -1599,132 +1740,6 @@ xfs_qm_dqfree_one(
 	xfs_qm_dqdestroy(dqp);
 }
 
-STATIC void
-xfs_qm_dqreclaim_one(
-	struct xfs_dquot	*dqp,
-	struct list_head	*buffer_list,
-	struct list_head	*dispose_list)
-{
-	struct xfs_mount	*mp = dqp->q_mount;
-	struct xfs_quotainfo	*qi = mp->m_quotainfo;
-	int			error;
-
-	if (!xfs_dqlock_nowait(dqp))
-		goto out_move_tail;
-
-	/*
-	 * This dquot has acquired a reference in the meantime remove it from
-	 * the freelist and try again.
-	 */
-	if (dqp->q_nrefs) {
-		xfs_dqunlock(dqp);
-
-		trace_xfs_dqreclaim_want(dqp);
-		XFS_STATS_INC(xs_qm_dqwants);
-
-		list_del_init(&dqp->q_lru);
-		qi->qi_lru_count--;
-		XFS_STATS_DEC(xs_qm_dquot_unused);
-		return;
-	}
-
-	/*
-	 * Try to grab the flush lock. If this dquot is in the process of
-	 * getting flushed to disk, we don't want to reclaim it.
-	 */
-	if (!xfs_dqflock_nowait(dqp))
-		goto out_unlock_move_tail;
-
-	if (XFS_DQ_IS_DIRTY(dqp)) {
-		struct xfs_buf	*bp = NULL;
-
-		trace_xfs_dqreclaim_dirty(dqp);
-
-		error = xfs_qm_dqflush(dqp, &bp);
-		if (error) {
-			xfs_warn(mp, "%s: dquot %p flush failed",
-				 __func__, dqp);
-			goto out_unlock_move_tail;
-		}
-
-		xfs_buf_delwri_queue(bp, buffer_list);
-		xfs_buf_relse(bp);
-		/*
-		 * Give the dquot another try on the freelist, as the
-		 * flushing will take some time.
-		 */
-		goto out_unlock_move_tail;
-	}
-	xfs_dqfunlock(dqp);
-
-	/*
-	 * Prevent lookups now that we are past the point of no return.
-	 */
-	dqp->dq_flags |= XFS_DQ_FREEING;
-	xfs_dqunlock(dqp);
-
-	ASSERT(dqp->q_nrefs == 0);
-	list_move_tail(&dqp->q_lru, dispose_list);
-	qi->qi_lru_count--;
-	XFS_STATS_DEC(xs_qm_dquot_unused);
-
-	trace_xfs_dqreclaim_done(dqp);
-	XFS_STATS_INC(xs_qm_dqreclaims);
-	return;
-
-	/*
-	 * Move the dquot to the tail of the list so that we don't spin on it.
-	 */
-out_unlock_move_tail:
-	xfs_dqunlock(dqp);
-out_move_tail:
-	list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
-	trace_xfs_dqreclaim_busy(dqp);
-	XFS_STATS_INC(xs_qm_dqreclaim_misses);
-}
-
-STATIC int
-xfs_qm_shake(
-	struct shrinker		*shrink,
-	struct shrink_control	*sc)
-{
-	struct xfs_quotainfo	*qi =
-		container_of(shrink, struct xfs_quotainfo, qi_shrinker);
-	int			nr_to_scan = sc->nr_to_scan;
-	LIST_HEAD		(buffer_list);
-	LIST_HEAD		(dispose_list);
-	struct xfs_dquot	*dqp;
-	int			error;
-
-	if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
-		return 0;
-	if (!nr_to_scan)
-		goto out;
-
-	mutex_lock(&qi->qi_lru_lock);
-	while (!list_empty(&qi->qi_lru_list)) {
-		if (nr_to_scan-- <= 0)
-			break;
-		dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
-				       q_lru);
-		xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list);
-	}
-	mutex_unlock(&qi->qi_lru_lock);
-
-	error = xfs_buf_delwri_submit(&buffer_list);
-	if (error)
-		xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
-
-	while (!list_empty(&dispose_list)) {
-		dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
-		list_del_init(&dqp->q_lru);
-		xfs_qm_dqfree_one(dqp);
-	}
-
-out:
-	return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
-}
-
 /*
  * Start a transaction and write the incore superblock changes to
  * disk. flags parameter indicates which fields have changed.
diff --git a/fs/xfs/xfs_qm.h b/fs/xfs/xfs_qm.h
index 670cd4464070..2b602df9c242 100644
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -49,9 +49,7 @@ typedef struct xfs_quotainfo {
 	struct xfs_inode	*qi_uquotaip;	/* user quota inode */
 	struct xfs_inode	*qi_gquotaip;	/* group quota inode */
 	struct xfs_inode	*qi_pquotaip;	/* project quota inode */
-	struct list_head qi_lru_list;
-	struct mutex	 qi_lru_lock;
-	int		 qi_lru_count;
+	struct list_lru	 qi_lru;
 	int		 qi_dquots;
 	time_t		 qi_btimelimit;	 /* limit for blks timer */
 	time_t		 qi_itimelimit;	 /* limit for inodes timer */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 979a77d4b87d..15188cc99449 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1535,19 +1535,21 @@ xfs_fs_mount(
 	return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
 }
 
-static int
+static long
 xfs_fs_nr_cached_objects(
-	struct super_block	*sb)
+	struct super_block	*sb,
+	int			nid)
 {
 	return xfs_reclaim_inodes_count(XFS_M(sb));
 }
 
-static void
+static long
 xfs_fs_free_cached_objects(
 	struct super_block	*sb,
-	int			nr_to_scan)
+	long			nr_to_scan,
+	int			nid)
 {
-	xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
+	return xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
 }
 
 static const struct super_operations xfs_super_operations = {
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 2f2a7c005be2..f622a97a7e33 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -41,6 +41,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_trace.h"
 #include "xfs_symlink.h"
+#include "xfs_buf_item.h"
 
 /* ----- Kernel only functions below ----- */
 STATIC int
@@ -363,6 +364,7 @@ xfs_symlink(
 			pathlen -= byte_cnt;
 			offset += byte_cnt;
 
+			xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SYMLINK_BUF);
 			xfs_trans_log_buf(tp, bp, 0, (buf + byte_cnt - 1) -
 							(char *)bp->b_addr);
 		}