76 files changed, 12148 insertions, 8321 deletions
diff --git a/fs/Kconfig b/fs/Kconfig
index 3c4886b849f5..e33c08924572 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -2019,7 +2019,7 @@ config CODA_FS_OLD_API
 config AFS_FS
 	tristate "Andrew File System support (AFS) (EXPERIMENTAL)"
 	depends on INET && EXPERIMENTAL
-	select RXRPC
+	select AF_RXRPC
 	help
 	  If you say Y here, you will get an experimental Andrew File System
 	  driver. It currently only supports unsecured read-only AFS access.
@@ -2028,8 +2028,15 @@ config AFS_FS
 
 	  If unsure, say N.
 
-config RXRPC
-	tristate
+config AFS_DEBUG
+	bool "AFS dynamic debugging"
+	depends on AFS_FS
+	help
+	  Say Y here to make runtime controllable debugging messages appear.
+
+	  See <file:Documentation/filesystems/afs.txt> for more information.
+
+	  If unsure, say N.
 
 config 9P_FS
 	tristate "Plan 9 Resource Sharing Support (9P2000) (Experimental)"
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 4029c9da4b86..01545eb1d872 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -2,8 +2,6 @@
 # Makefile for Red Hat Linux AFS client.
 #
 
-#CFLAGS += -finstrument-functions
-
 kafs-objs := \
 	callback.o \
 	cell.o \
@@ -12,14 +10,15 @@ kafs-objs := \
 	file.o \
 	fsclient.o \
 	inode.o \
-	kafsasyncd.o \
-	kafstimod.o \
 	main.o \
 	misc.o \
 	mntpt.o \
 	proc.o \
+	rxrpc.o \
+	security.o \
 	server.o \
 	super.o \
+	use-rtnetlink.o \
 	vlclient.o \
 	vlocation.o \
 	vnode.o \
diff --git a/fs/afs/afs.h b/fs/afs/afs.h
new file mode 100644
index 000000000000..52d0752265b8
--- /dev/null
+++ b/fs/afs/afs.h
@@ -0,0 +1,146 @@
+/* AFS common types
+ *
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef AFS_H
+#define AFS_H
+
+#include <linux/in.h>
+
+#define AFS_MAXCELLNAME	64		/* maximum length of a cell name */
+#define AFS_MAXVOLNAME	64		/* maximum length of a volume name */
+
+typedef unsigned			afs_volid_t;
+typedef unsigned			afs_vnodeid_t;
+typedef unsigned long long		afs_dataversion_t;
+
+typedef enum {
+	AFSVL_RWVOL,			/* read/write volume */
+	AFSVL_ROVOL,			/* read-only volume */
+	AFSVL_BACKVOL,			/* backup volume */
+} __attribute__((packed)) afs_voltype_t;
+
+typedef enum {
+	AFS_FTYPE_INVALID	= 0,
+	AFS_FTYPE_FILE		= 1,
+	AFS_FTYPE_DIR		= 2,
+	AFS_FTYPE_SYMLINK	= 3,
+} afs_file_type_t;
+
+/*
+ * AFS file identifier
+ */
+struct afs_fid {
+	afs_volid_t	vid;		/* volume ID */
+	afs_vnodeid_t	vnode;		/* file index within volume */
+	unsigned	unique;		/* unique ID number (file index version) */
+};
+
+/*
+ * AFS callback notification
+ */
+typedef enum {
+	AFSCM_CB_UNTYPED	= 0,	/* no type set on CB break */
+	AFSCM_CB_EXCLUSIVE	= 1,	/* CB exclusive to CM [not implemented] */
+	AFSCM_CB_SHARED		= 2,	/* CB shared by other CM's */
+	AFSCM_CB_DROPPED	= 3,	/* CB promise cancelled by file server */
+} afs_callback_type_t;
+
+struct afs_callback {
+	struct afs_fid		fid;		/* file identifier */
+	unsigned		version;	/* callback version */
+	unsigned		expiry;		/* time at which expires */
+	afs_callback_type_t	type;		/* type of callback */
+};
+
+#define AFSCBMAX 50	/* maximum callbacks transferred per bulk op */
+
+/*
+ * AFS volume information
+ */
+struct afs_volume_info {
+	afs_volid_t		vid;		/* volume ID */
+	afs_voltype_t		type;		/* type of this volume */
+	afs_volid_t		type_vids[5];	/* volume ID's for possible types for this vol */
+
+	/* list of fileservers serving this volume */
+	size_t			nservers;	/* number of entries used in servers[] */
+	struct {
+		struct in_addr	addr;		/* fileserver address */
+	} servers[8];
+};
+
+/*
+ * AFS security ACE access mask
+ */
+typedef u32 afs_access_t;
+#define AFS_ACE_READ		0x00000001U	/* - permission to read a file/dir */
+#define AFS_ACE_WRITE		0x00000002U	/* - permission to write/chmod a file */
+#define AFS_ACE_INSERT		0x00000004U	/* - permission to create dirent in a dir */
+#define AFS_ACE_LOOKUP		0x00000008U	/* - permission to lookup a file/dir in a dir */
+#define AFS_ACE_DELETE		0x00000010U	/* - permission to delete a dirent from a dir */
+#define AFS_ACE_LOCK		0x00000020U	/* - permission to lock a file */
+#define AFS_ACE_ADMINISTER	0x00000040U	/* - permission to change ACL */
+#define AFS_ACE_USER_A		0x01000000U	/* - 'A' user-defined permission */
+#define AFS_ACE_USER_B		0x02000000U	/* - 'B' user-defined permission */
+#define AFS_ACE_USER_C		0x04000000U	/* - 'C' user-defined permission */
+#define AFS_ACE_USER_D		0x08000000U	/* - 'D' user-defined permission */
+#define AFS_ACE_USER_E		0x10000000U	/* - 'E' user-defined permission */
+#define AFS_ACE_USER_F		0x20000000U	/* - 'F' user-defined permission */
+#define AFS_ACE_USER_G		0x40000000U	/* - 'G' user-defined permission */
+#define AFS_ACE_USER_H		0x80000000U	/* - 'H' user-defined permission */
+
+/*
+ * AFS file status information
+ */
+struct afs_file_status {
+	unsigned		if_version;	/* interface version */
+#define AFS_FSTATUS_VERSION	1
+
+	afs_file_type_t		type;		/* file type */
+	unsigned		nlink;		/* link count */
+	u64			size;		/* file size */
+	afs_dataversion_t	data_version;	/* current data version */
+	u32			author;		/* author ID */
+	u32			owner;		/* owner ID */
+	u32			group;		/* group ID */
+	afs_access_t		caller_access;	/* access rights for authenticated caller */
+	afs_access_t		anon_access;	/* access rights for unauthenticated caller */
+	umode_t			mode;		/* UNIX mode */
+	struct afs_fid		parent;		/* parent dir ID for non-dirs only */
+	time_t			mtime_client;	/* last time client changed data */
+	time_t			mtime_server;	/* last time server changed data */
+};
+
+/*
+ * AFS file status change request
+ */
+struct afs_store_status {
+	u32			mask;		/* which bits of the struct are set */
+	u32			mtime_client;	/* last time client changed data */
+	u32			owner;		/* owner ID */
+	u32			group;		/* group ID */
+	umode_t			mode;		/* UNIX mode */
+};
+
+#define AFS_SET_MTIME		0x01		/* set the mtime */
+#define AFS_SET_OWNER		0x02		/* set the owner ID */
+#define AFS_SET_GROUP		0x04		/* set the group ID (unsupported?) */
+#define AFS_SET_MODE		0x08		/* set the UNIX mode */
+#define AFS_SET_SEG_SIZE	0x10		/* set the segment size (unsupported) */
+
+/*
+ * AFS volume synchronisation information
+ */
+struct afs_volsync {
+	time_t			creation;	/* volume creation time */
+};
+
+#endif /* AFS_H */
diff --git a/fs/afs/afs_cm.h b/fs/afs/afs_cm.h
new file mode 100644
index 000000000000..7b4d4fab4c80
--- /dev/null
+++ b/fs/afs/afs_cm.h
@@ -0,0 +1,32 @@
+/* AFS Cache Manager definitions
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef AFS_CM_H
+#define AFS_CM_H
+
+#define AFS_CM_PORT		7001	/* AFS file server port */
+#define CM_SERVICE		1	/* AFS File Service ID */
+
+enum AFS_CM_Operations {
+	CBCallBack		= 204,	/* break callback promises */
+	CBInitCallBackState	= 205,	/* initialise callback state */
+	CBProbe			= 206,	/* probe client */
+	CBGetLock		= 207,	/* get contents of CM lock table */
+	CBGetCE			= 208,	/* get cache file description */
+	CBGetXStatsVersion	= 209,	/* get version of extended statistics */
+	CBGetXStats		= 210,	/* get contents of extended statistics data */
+	CBInitCallBackState3	= 213,	/* initialise callback state, version 3 */
+	CBGetCapabilities	= 65538, /* get client capabilities */
+};
+
+#define AFS_CAP_ERROR_TRANSLATION	0x1
+
+#endif /* AFS_FS_H */
diff --git a/fs/afs/afs_fs.h b/fs/afs/afs_fs.h
new file mode 100644
index 000000000000..89e0d1650a72
--- /dev/null
+++ b/fs/afs/afs_fs.h
@@ -0,0 +1,48 @@
+/* AFS File Service definitions
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifndef AFS_FS_H
+#define AFS_FS_H
+
+#define AFS_FS_PORT		7000	/* AFS file server port */
+#define FS_SERVICE		1	/* AFS File Service ID */
+
+enum AFS_FS_Operations {
+	FSFETCHDATA		= 130,	/* AFS Fetch file data */
+	FSFETCHSTATUS		= 132,	/* AFS Fetch file status */
+	FSREMOVEFILE		= 136,	/* AFS Remove a file */
+	FSCREATEFILE		= 137,	/* AFS Create a file */
+	FSRENAME		= 138,	/* AFS Rename or move a file or directory */
+	FSSYMLINK		= 139,	/* AFS Create a symbolic link */
+	FSLINK			= 140,	/* AFS Create a hard link */
+	FSMAKEDIR		= 141,	/* AFS Create a directory */
+	FSREMOVEDIR		= 142,	/* AFS Remove a directory */
+	FSGIVEUPCALLBACKS	= 147,	/* AFS Discard callback promises */
+	FSGETVOLUMEINFO		= 148,	/* AFS Get root volume information */
+	FSGETROOTVOLUME		= 151,	/* AFS Get root volume name */
+	FSLOOKUP		= 161,	/* AFS lookup file in directory */
+};
+
+enum AFS_FS_Errors {
+	VSALVAGE	= 101,	/* volume needs salvaging */
+	VNOVNODE	= 102,	/* no such file/dir (vnode) */
+	VNOVOL		= 103,	/* no such volume or volume unavailable */
+	VVOLEXISTS	= 104,	/* volume name already exists */
+	VNOSERVICE	= 105,	/* volume not currently in service */
+	VOFFLINE	= 106,	/* volume is currently offline (more info available [VVL-spec]) */
+	VONLINE		= 107,	/* volume is already online */
+	VDISKFULL	= 108,	/* disk partition is full */
+	VOVERQUOTA	= 109,	/* volume's maximum quota exceeded */
+	VBUSY		= 110,	/* volume is temporarily unavailable */
+	VMOVED		= 111,	/* volume moved to new server - ask this FS where */
+};
+
+#endif /* AFS_FS_H */
diff --git a/fs/afs/vlclient.h b/fs/afs/afs_vl.h
index e3d601179c46..8bbefe009ed4 100644
--- a/fs/afs/vlclient.h
+++ b/fs/afs/afs_vl.h
@@ -1,6 +1,6 @@
-/* vlclient.h: Volume Location Service client interface
+/* AFS Volume Location Service client interface
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,10 +9,19 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef _LINUX_AFS_VLCLIENT_H
-#define _LINUX_AFS_VLCLIENT_H
+#ifndef AFS_VL_H
+#define AFS_VL_H
 
-#include "types.h"
+#include "afs.h"
+
+#define AFS_VL_PORT		7003	/* volume location service port */
+#define VL_SERVICE		52	/* RxRPC service ID for the Volume Location service */
+
+enum AFSVL_Operations {
+	VLGETENTRYBYID		= 503,	/* AFS Get Cache Entry By ID operation ID */
+	VLGETENTRYBYNAME	= 504,	/* AFS Get Cache Entry By Name operation ID */
+	VLPROBE			= 514,	/* AFS Probe Volume Location Service operation ID */
+};
 
 enum AFSVL_Errors {
 	AFSVL_IDEXIST 		= 363520,	/* Volume Id entry exists in vl database */
@@ -40,14 +49,16 @@ enum AFSVL_Errors {
 	AFSVL_BADVOLOPER 	= 363542,	/* Bad volume operation code */
 	AFSVL_BADRELLOCKTYPE 	= 363543,	/* Bad release lock type */
 	AFSVL_RERELEASE 	= 363544,	/* Status report: last release was aborted */
-	AFSVL_BADSERVERFLAG 	= 363545,	/* Invalid replication site server �ag */
+	AFSVL_BADSERVERFLAG 	= 363545,	/* Invalid replication site server °ag */
 	AFSVL_PERM 		= 363546,	/* No permission access */
 	AFSVL_NOMEM 		= 363547,	/* malloc/realloc failed to alloc enough memory */
 };
 
-/* maps to "struct vldbentry" in vvl-spec.pdf */
+/*
+ * maps to "struct vldbentry" in vvl-spec.pdf
+ */
 struct afs_vldbentry {
-	char		name[65];		/* name of volume (including NUL char) */
+	char		name[65];		/* name of volume (with NUL char) */
 	afs_voltype_t	type;			/* volume type */
 	unsigned	num_servers;		/* num servers that hold instances of this vol */
 	unsigned	clone_id;		/* cloning ID */
@@ -68,26 +79,6 @@ struct afs_vldbentry {
 #define AFS_VLSF_RWVOL		0x0004	/* this server holds a R/W instance of the volume */
 #define AFS_VLSF_BACKVOL	0x0008	/* this server holds a backup instance of the volume */
 	} servers[8];
-
 };
 
-/* look up a volume location database entry by name */
-extern int afs_rxvl_get_entry_by_name(struct afs_server *server,
-				      const char *volname,
-				      unsigned volnamesz,
-				      struct afs_cache_vlocation *entry);
-
-/* look up a volume location database entry by ID */
-extern int afs_rxvl_get_entry_by_id(struct afs_server *server,
-				    afs_volid_t	volid,
-				    afs_voltype_t voltype,
-				    struct afs_cache_vlocation *entry);
-
-extern int afs_rxvl_get_entry_by_id_async(struct afs_async_op *op,
-					  afs_volid_t volid,
-					  afs_voltype_t voltype);
-
-extern int afs_rxvl_get_entry_by_id_async2(struct afs_async_op *op,
-					   struct afs_cache_vlocation *entry);
-
-#endif /* _LINUX_AFS_VLCLIENT_H */
+#endif /* AFS_VL_H */
diff --git a/fs/afs/cache.c b/fs/afs/cache.c
new file mode 100644
index 000000000000..de0d7de69edc
--- /dev/null
+++ b/fs/afs/cache.c
@@ -0,0 +1,256 @@
+/* AFS caching stuff
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_cell_cache_match(void *target,
+						const void *entry);
+static void afs_cell_cache_update(void *source, void *entry);
+
+struct cachefs_index_def afs_cache_cell_index_def = {
+	.name			= "cell_ix",
+	.data_size		= sizeof(struct afs_cache_cell),
+	.keys[0]		= { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
+	.match			= afs_cell_cache_match,
+	.update			= afs_cell_cache_update,
+};
+#endif
+
+/*
+ * match a cell record obtained from the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_cell_cache_match(void *target,
+						const void *entry)
+{
+	const struct afs_cache_cell *ccell = entry;
+	struct afs_cell *cell = target;
+
+	_enter("{%s},{%s}", ccell->name, cell->name);
+
+	if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
+		_leave(" = SUCCESS");
+		return CACHEFS_MATCH_SUCCESS;
+	}
+
+	_leave(" = FAILED");
+	return CACHEFS_MATCH_FAILED;
+}
+#endif
+
+/*
+ * update a cell record in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_cell_cache_update(void *source, void *entry)
+{
+	struct afs_cache_cell *ccell = entry;
+	struct afs_cell *cell = source;
+
+	_enter("%p,%p", source, entry);
+
+	strncpy(ccell->name, cell->name, sizeof(ccell->name));
+
+	memcpy(ccell->vl_servers,
+	       cell->vl_addrs,
+	       min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
+
+}
+#endif
+
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vlocation_cache_match(void *target,
+						     const void *entry);
+static void afs_vlocation_cache_update(void *source, void *entry);
+
+struct cachefs_index_def afs_vlocation_cache_index_def = {
+	.name		= "vldb",
+	.data_size	= sizeof(struct afs_cache_vlocation),
+	.keys[0]	= { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
+	.match		= afs_vlocation_cache_match,
+	.update		= afs_vlocation_cache_update,
+};
+#endif
+
+/*
+ * match a VLDB record stored in the cache
+ * - may also load target from entry
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vlocation_cache_match(void *target,
+						     const void *entry)
+{
+	const struct afs_cache_vlocation *vldb = entry;
+	struct afs_vlocation *vlocation = target;
+
+	_enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
+
+	if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
+	    ) {
+		if (!vlocation->valid ||
+		    vlocation->vldb.rtime == vldb->rtime
+		    ) {
+			vlocation->vldb = *vldb;
+			vlocation->valid = 1;
+			_leave(" = SUCCESS [c->m]");
+			return CACHEFS_MATCH_SUCCESS;
+		} else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
+			/* delete if VIDs for this name differ */
+			if (memcmp(&vlocation->vldb.vid,
+				   &vldb->vid,
+				   sizeof(vldb->vid)) != 0) {
+				_leave(" = DELETE");
+				return CACHEFS_MATCH_SUCCESS_DELETE;
+			}
+
+			_leave(" = UPDATE");
+			return CACHEFS_MATCH_SUCCESS_UPDATE;
+		} else {
+			_leave(" = SUCCESS");
+			return CACHEFS_MATCH_SUCCESS;
+		}
+	}
+
+	_leave(" = FAILED");
+	return CACHEFS_MATCH_FAILED;
+}
+#endif
+
+/*
+ * update a VLDB record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_vlocation_cache_update(void *source, void *entry)
+{
+	struct afs_cache_vlocation *vldb = entry;
+	struct afs_vlocation *vlocation = source;
+
+	_enter("");
+
+	*vldb = vlocation->vldb;
+}
+#endif
+
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_volume_cache_match(void *target,
+						  const void *entry);
+static void afs_volume_cache_update(void *source, void *entry);
+
+struct cachefs_index_def afs_volume_cache_index_def = {
+	.name		= "volume",
+	.data_size	= sizeof(struct afs_cache_vhash),
+	.keys[0]	= { CACHEFS_INDEX_KEYS_BIN, 1 },
+	.keys[1]	= { CACHEFS_INDEX_KEYS_BIN, 1 },
+	.match		= afs_volume_cache_match,
+	.update		= afs_volume_cache_update,
+};
+#endif
+
+/*
+ * match a volume hash record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_volume_cache_match(void *target,
+						  const void *entry)
+{
+	const struct afs_cache_vhash *vhash = entry;
+	struct afs_volume *volume = target;
+
+	_enter("{%u},{%u}", volume->type, vhash->vtype);
+
+	if (volume->type == vhash->vtype) {
+		_leave(" = SUCCESS");
+		return CACHEFS_MATCH_SUCCESS;
+	}
+
+	_leave(" = FAILED");
+	return CACHEFS_MATCH_FAILED;
+}
+#endif
+
+/*
+ * update a volume hash record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_volume_cache_update(void *source, void *entry)
+{
+	struct afs_cache_vhash *vhash = entry;
+	struct afs_volume *volume = source;
+
+	_enter("");
+
+	vhash->vtype = volume->type;
+}
+#endif
+
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vnode_cache_match(void *target,
+						 const void *entry);
+static void afs_vnode_cache_update(void *source, void *entry);
+
+struct cachefs_index_def afs_vnode_cache_index_def = {
+	.name		= "vnode",
+	.data_size	= sizeof(struct afs_cache_vnode),
+	.keys[0]	= { CACHEFS_INDEX_KEYS_BIN, 4 },
+	.match		= afs_vnode_cache_match,
+	.update		= afs_vnode_cache_update,
+};
+#endif
+
+/*
+ * match a vnode record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static cachefs_match_val_t afs_vnode_cache_match(void *target,
+						 const void *entry)
+{
+	const struct afs_cache_vnode *cvnode = entry;
+	struct afs_vnode *vnode = target;
+
+	_enter("{%x,%x,%Lx},{%x,%x,%Lx}",
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       vnode->status.version,
+	       cvnode->vnode_id,
+	       cvnode->vnode_unique,
+	       cvnode->data_version);
+
+	if (vnode->fid.vnode != cvnode->vnode_id) {
+		_leave(" = FAILED");
+		return CACHEFS_MATCH_FAILED;
+	}
+
+	if (vnode->fid.unique != cvnode->vnode_unique ||
+	    vnode->status.version != cvnode->data_version) {
+		_leave(" = DELETE");
+		return CACHEFS_MATCH_SUCCESS_DELETE;
+	}
+
+	_leave(" = SUCCESS");
+	return CACHEFS_MATCH_SUCCESS;
+}
+#endif
+
+/*
+ * update a vnode record stored in the cache
+ */
+#ifdef AFS_CACHING_SUPPORT
+static void afs_vnode_cache_update(void *source, void *entry)
+{
+	struct afs_cache_vnode *cvnode = entry;
+	struct afs_vnode *vnode = source;
+
+	_enter("");
+
+	cvnode->vnode_id	= vnode->fid.vnode;
+	cvnode->vnode_unique	= vnode->fid.unique;
+	cvnode->data_version	= vnode->status.version;
+}
+#endif
diff --git a/fs/afs/cache.h b/fs/afs/cache.h
index 9eb7722b34d5..36a3642cf90e 100644
--- a/fs/afs/cache.h
+++ b/fs/afs/cache.h
@@ -1,4 +1,4 @@
-/* cache.h: AFS local cache management interface
+/* AFS local cache management interface
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -9,8 +9,8 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef _LINUX_AFS_CACHE_H
-#define _LINUX_AFS_CACHE_H
+#ifndef AFS_CACHE_H
+#define AFS_CACHE_H
 
 #undef AFS_CACHING_SUPPORT
 
@@ -20,8 +20,4 @@
 #endif
 #include "types.h"
 
-#ifdef __KERNEL__
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_AFS_CACHE_H */
+#endif /* AFS_CACHE_H */
diff --git a/fs/afs/callback.c b/fs/afs/callback.c
index 9cb206e9d4be..639399f0ab6f 100644
--- a/fs/afs/callback.c
+++ b/fs/afs/callback.c
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2002 Red Hat, Inc. All rights reserved.
+ * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
  *
  * This software may be freely redistributed under the terms of the
  * GNU General Public License.
@@ -16,85 +16,187 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include "server.h"
-#include "vnode.h"
+#include <linux/circ_buf.h>
 #include "internal.h"
-#include "cmservice.h"
 
-/*****************************************************************************/
+unsigned afs_vnode_update_timeout = 10;
+
+#define afs_breakring_space(server) \
+	CIRC_SPACE((server)->cb_break_head, (server)->cb_break_tail,	\
+		   ARRAY_SIZE((server)->cb_break))
+
+//static void afs_callback_updater(struct work_struct *);
+
+static struct workqueue_struct *afs_callback_update_worker;
+
 /*
  * allow the fileserver to request callback state (re-)initialisation
  */
-int SRXAFSCM_InitCallBackState(struct afs_server *server)
+void afs_init_callback_state(struct afs_server *server)
 {
-	struct list_head callbacks;
+	struct afs_vnode *vnode;
 
-	_enter("%p", server);
+	_enter("{%p}", server);
 
-	INIT_LIST_HEAD(&callbacks);
-
-	/* transfer the callback list from the server to a temp holding area */
 	spin_lock(&server->cb_lock);
 
-	list_add(&callbacks, &server->cb_promises);
-	list_del_init(&server->cb_promises);
+	/* kill all the promises on record from this server */
+	while (!RB_EMPTY_ROOT(&server->cb_promises)) {
+		vnode = rb_entry(server->cb_promises.rb_node,
+				 struct afs_vnode, cb_promise);
+		_debug("UNPROMISE { vid=%x vn=%u uq=%u}",
+		       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+		rb_erase(&vnode->cb_promise, &server->cb_promises);
+		vnode->cb_promised = false;
+	}
 
-	/* munch our way through the list, grabbing the inode, dropping all the
-	 * locks and regetting them in the right order
-	 */
-	while (!list_empty(&callbacks)) {
-		struct afs_vnode *vnode;
-		struct inode *inode;
+	spin_unlock(&server->cb_lock);
+	_leave("");
+}
 
-		vnode = list_entry(callbacks.next, struct afs_vnode, cb_link);
-		list_del_init(&vnode->cb_link);
+/*
+ * handle the data invalidation side of a callback being broken
+ */
+void afs_broken_callback_work(struct work_struct *work)
+{
+	struct afs_vnode *vnode =
+		container_of(work, struct afs_vnode, cb_broken_work);
 
-		/* try and grab the inode - may fail */
-		inode = igrab(AFS_VNODE_TO_I(vnode));
-		if (inode) {
-			int release = 0;
+	_enter("");
 
-			spin_unlock(&server->cb_lock);
-			spin_lock(&vnode->lock);
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+		return;
 
-			if (vnode->cb_server == server) {
-				vnode->cb_server = NULL;
-				afs_kafstimod_del_timer(&vnode->cb_timeout);
-				spin_lock(&afs_cb_hash_lock);
-				list_del_init(&vnode->cb_hash_link);
-				spin_unlock(&afs_cb_hash_lock);
-				release = 1;
-			}
+	/* we're only interested in dealing with a broken callback on *this*
+	 * vnode and only if no-one else has dealt with it yet */
+	if (!mutex_trylock(&vnode->validate_lock))
+		return; /* someone else is dealing with it */
 
-			spin_unlock(&vnode->lock);
+	if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+		if (S_ISDIR(vnode->vfs_inode.i_mode))
+			afs_clear_permits(vnode);
 
-			iput(inode);
-			afs_put_server(server);
+		if (afs_vnode_fetch_status(vnode, NULL, NULL) < 0)
+			goto out;
 
-			spin_lock(&server->cb_lock);
+		if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+			goto out;
+
+		/* if the vnode's data version number changed then its contents
+		 * are different */
+		if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+			_debug("zap data {%x:%u}",
+			       vnode->fid.vid, vnode->fid.vnode);
+			invalidate_remote_inode(&vnode->vfs_inode);
 		}
 	}
 
-	spin_unlock(&server->cb_lock);
+out:
+	mutex_unlock(&vnode->validate_lock);
 
-	_leave(" = 0");
-	return 0;
-} /* end SRXAFSCM_InitCallBackState() */
+	/* avoid the potential race whereby the mutex_trylock() in this
+	 * function happens again between the clear_bit() and the
+	 * mutex_unlock() */
+	if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+		_debug("requeue");
+		queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
+	}
+	_leave("");
+}
+
+/*
+ * actually break a callback
+ */
+static void afs_break_callback(struct afs_server *server,
+			       struct afs_vnode *vnode)
+{
+	_enter("");
+
+	set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+
+	if (vnode->cb_promised) {
+		spin_lock(&vnode->lock);
+
+		_debug("break callback");
+
+		spin_lock(&server->cb_lock);
+		if (vnode->cb_promised) {
+			rb_erase(&vnode->cb_promise, &server->cb_promises);
+			vnode->cb_promised = false;
+		}
+		spin_unlock(&server->cb_lock);
+
+		queue_work(afs_callback_update_worker, &vnode->cb_broken_work);
+		spin_unlock(&vnode->lock);
+	}
+}
+
+/*
+ * allow the fileserver to explicitly break one callback
+ * - happens when
+ *   - the backing file is changed
+ *   - a lock is released
+ */
+static void afs_break_one_callback(struct afs_server *server,
+				   struct afs_fid *fid)
+{
+	struct afs_vnode *vnode;
+	struct rb_node *p;
+
+	_debug("find");
+	spin_lock(&server->fs_lock);
+	p = server->fs_vnodes.rb_node;
+	while (p) {
+		vnode = rb_entry(p, struct afs_vnode, server_rb);
+		if (fid->vid < vnode->fid.vid)
+			p = p->rb_left;
+		else if (fid->vid > vnode->fid.vid)
+			p = p->rb_right;
+		else if (fid->vnode < vnode->fid.vnode)
+			p = p->rb_left;
+		else if (fid->vnode > vnode->fid.vnode)
+			p = p->rb_right;
+		else if (fid->unique < vnode->fid.unique)
+			p = p->rb_left;
+		else if (fid->unique > vnode->fid.unique)
+			p = p->rb_right;
+		else
+			goto found;
+	}
+
+	/* not found so we just ignore it (it may have moved to another
+	 * server) */
+not_available:
+	_debug("not avail");
+	spin_unlock(&server->fs_lock);
+	_leave("");
+	return;
+
+found:
+	_debug("found");
+	ASSERTCMP(server, ==, vnode->server);
+
+	if (!igrab(AFS_VNODE_TO_I(vnode)))
+		goto not_available;
+	spin_unlock(&server->fs_lock);
+
+	afs_break_callback(server, vnode);
+	iput(&vnode->vfs_inode);
+	_leave("");
+}
 
-/*****************************************************************************/
 /*
  * allow the fileserver to break callback promises
  */
-int SRXAFSCM_CallBack(struct afs_server *server, size_t count,
-		      struct afs_callback callbacks[])
+void afs_break_callbacks(struct afs_server *server, size_t count,
+			 struct afs_callback callbacks[])
 {
-	_enter("%p,%u,", server, count);
+	_enter("%p,%zu,", server, count);
 
-	for (; count > 0; callbacks++, count--) {
-		struct afs_vnode *vnode = NULL;
-		struct inode *inode = NULL;
-		int valid = 0;
+	ASSERT(server != NULL);
+	ASSERTCMP(count, <=, AFSCBMAX);
 
+	for (; count > 0; callbacks++, count--) {
 		_debug("- Fid { vl=%08x n=%u u=%u }  CB { v=%u x=%u t=%u }",
 		       callbacks->fid.vid,
 		       callbacks->fid.vnode,
@@ -103,67 +205,270 @@ int SRXAFSCM_CallBack(struct afs_server *server, size_t count,
 		       callbacks->expiry,
 		       callbacks->type
 		       );
+		afs_break_one_callback(server, &callbacks->fid);
+	}
 
-		/* find the inode for this fid */
-		spin_lock(&afs_cb_hash_lock);
+	_leave("");
+	return;
+}
 
-		list_for_each_entry(vnode,
-				    &afs_cb_hash(server, &callbacks->fid),
-				    cb_hash_link) {
-			if (memcmp(&vnode->fid, &callbacks->fid,
-				   sizeof(struct afs_fid)) != 0)
-				continue;
+/*
+ * record the callback for breaking
+ * - the caller must hold server->cb_lock
+ */
+static void afs_do_give_up_callback(struct afs_server *server,
+				    struct afs_vnode *vnode)
+{
+	struct afs_callback *cb;
 
-			/* right vnode, but is it same server? */
-			if (vnode->cb_server != server)
-				break; /* no */
+	_enter("%p,%p", server, vnode);
 
-			/* try and nail the inode down */
-			inode = igrab(AFS_VNODE_TO_I(vnode));
-			break;
+	cb = &server->cb_break[server->cb_break_head];
+	cb->fid		= vnode->fid;
+	cb->version	= vnode->cb_version;
+	cb->expiry	= vnode->cb_expiry;
+	cb->type	= vnode->cb_type;
+	smp_wmb();
+	server->cb_break_head =
+		(server->cb_break_head + 1) &
+		(ARRAY_SIZE(server->cb_break) - 1);
+
+	/* defer the breaking of callbacks to try and collect as many as
+	 * possible to ship in one operation */
+	switch (atomic_inc_return(&server->cb_break_n)) {
+	case 1 ... AFSCBMAX - 1:
+		queue_delayed_work(afs_callback_update_worker,
+				   &server->cb_break_work, HZ * 2);
+		break;
+	case AFSCBMAX:
+		afs_flush_callback_breaks(server);
+		break;
+	default:
+		break;
+	}
+
+	ASSERT(server->cb_promises.rb_node != NULL);
+	rb_erase(&vnode->cb_promise, &server->cb_promises);
+	vnode->cb_promised = false;
+	_leave("");
+}
+
+/*
+ * discard the callback on a deleted item
+ */
+void afs_discard_callback_on_delete(struct afs_vnode *vnode)
+{
+	struct afs_server *server = vnode->server;
+
+	_enter("%d", vnode->cb_promised);
+
+	if (!vnode->cb_promised) {
+		_leave(" [not promised]");
+		return;
+	}
+
+	ASSERT(server != NULL);
+
+	spin_lock(&server->cb_lock);
+	if (vnode->cb_promised) {
+		ASSERT(server->cb_promises.rb_node != NULL);
+		rb_erase(&vnode->cb_promise, &server->cb_promises);
+		vnode->cb_promised = false;
+	}
+	spin_unlock(&server->cb_lock);
+	_leave("");
+}
+
+/*
+ * give up the callback registered for a vnode on the file server when the
+ * inode is being cleared
+ */
+void afs_give_up_callback(struct afs_vnode *vnode)
+{
+	struct afs_server *server = vnode->server;
+
+	DECLARE_WAITQUEUE(myself, current);
+
+	_enter("%d", vnode->cb_promised);
+
+	_debug("GIVE UP INODE %p", &vnode->vfs_inode);
+
+	if (!vnode->cb_promised) {
+		_leave(" [not promised]");
+		return;
+	}
+
+	ASSERT(server != NULL);
+
+	spin_lock(&server->cb_lock);
+	if (vnode->cb_promised && afs_breakring_space(server) == 0) {
+		add_wait_queue(&server->cb_break_waitq, &myself);
+		for (;;) {
+			set_current_state(TASK_UNINTERRUPTIBLE);
+			if (!vnode->cb_promised ||
+			    afs_breakring_space(server) != 0)
+				break;
+			spin_unlock(&server->cb_lock);
+			schedule();
+			spin_lock(&server->cb_lock);
 		}
+		remove_wait_queue(&server->cb_break_waitq, &myself);
+		__set_current_state(TASK_RUNNING);
+	}
+
+	/* of course, it's always possible for the server to break this vnode's
+	 * callback first... */
+	if (vnode->cb_promised)
+		afs_do_give_up_callback(server, vnode);
+
+	spin_unlock(&server->cb_lock);
+	_leave("");
+}
+
+/*
+ * dispatch a deferred give up callbacks operation
+ */
+void afs_dispatch_give_up_callbacks(struct work_struct *work)
+{
+	struct afs_server *server =
+		container_of(work, struct afs_server, cb_break_work.work);
+
+	_enter("");
+
+	/* tell the fileserver to discard the callback promises it has
+	 * - in the event of ENOMEM or some other error, we just forget that we
+	 *   had callbacks entirely, and the server will call us later to break
+	 *   them
+	 */
+	afs_fs_give_up_callbacks(server, &afs_async_call);
+}
+
+/*
+ * flush the outstanding callback breaks on a server
+ */
+void afs_flush_callback_breaks(struct afs_server *server)
+{
+	cancel_delayed_work(&server->cb_break_work);
+	queue_delayed_work(afs_callback_update_worker,
+			   &server->cb_break_work, 0);
+}
 
-		spin_unlock(&afs_cb_hash_lock);
-
-		if (inode) {
-			/* we've found the record for this vnode */
-			spin_lock(&vnode->lock);
-			if (vnode->cb_server == server) {
-				/* the callback _is_ on the calling server */
-				vnode->cb_server = NULL;
-				valid = 1;
-
-				afs_kafstimod_del_timer(&vnode->cb_timeout);
-				vnode->flags |= AFS_VNODE_CHANGED;
-
-				spin_lock(&server->cb_lock);
-				list_del_init(&vnode->cb_link);
-				spin_unlock(&server->cb_lock);
-
-				spin_lock(&afs_cb_hash_lock);
-				list_del_init(&vnode->cb_hash_link);
-				spin_unlock(&afs_cb_hash_lock);
-			}
-			spin_unlock(&vnode->lock);
-
-			if (valid) {
-				invalidate_remote_inode(inode);
-				afs_put_server(server);
-			}
-			iput(inode);
+#if 0
+/*
+ * update a bunch of callbacks
+ */
+static void afs_callback_updater(struct work_struct *work)
+{
+	struct afs_server *server;
+	struct afs_vnode *vnode, *xvnode;
+	time_t now;
+	long timeout;
+	int ret;
+
+	server = container_of(work, struct afs_server, updater);
+
+	_enter("");
+
+	now = get_seconds();
+
+	/* find the first vnode to update */
+	spin_lock(&server->cb_lock);
+	for (;;) {
+		if (RB_EMPTY_ROOT(&server->cb_promises)) {
+			spin_unlock(&server->cb_lock);
+			_leave(" [nothing]");
+			return;
 		}
+
+		vnode = rb_entry(rb_first(&server->cb_promises),
+				 struct afs_vnode, cb_promise);
+		if (atomic_read(&vnode->usage) > 0)
+			break;
+		rb_erase(&vnode->cb_promise, &server->cb_promises);
+		vnode->cb_promised = false;
 	}
 
-	_leave(" = 0");
-	return 0;
-} /* end SRXAFSCM_CallBack() */
+	timeout = vnode->update_at - now;
+	if (timeout > 0) {
+		queue_delayed_work(afs_vnode_update_worker,
+				   &afs_vnode_update, timeout * HZ);
+		spin_unlock(&server->cb_lock);
+		_leave(" [nothing]");
+		return;
+	}
+
+	list_del_init(&vnode->update);
+	atomic_inc(&vnode->usage);
+	spin_unlock(&server->cb_lock);
+
+	/* we can now perform the update */
+	_debug("update %s", vnode->vldb.name);
+	vnode->state = AFS_VL_UPDATING;
+	vnode->upd_rej_cnt = 0;
+	vnode->upd_busy_cnt = 0;
+
+	ret = afs_vnode_update_record(vl, &vldb);
+	switch (ret) {
+	case 0:
+		afs_vnode_apply_update(vl, &vldb);
+		vnode->state = AFS_VL_UPDATING;
+		break;
+	case -ENOMEDIUM:
+		vnode->state = AFS_VL_VOLUME_DELETED;
+		break;
+	default:
+		vnode->state = AFS_VL_UNCERTAIN;
+		break;
+	}
+
+	/* and then reschedule */
+	_debug("reschedule");
+	vnode->update_at = get_seconds() + afs_vnode_update_timeout;
+
+	spin_lock(&server->cb_lock);
+
+	if (!list_empty(&server->cb_promises)) {
+		/* next update in 10 minutes, but wait at least 1 second more
+		 * than the newest record already queued so that we don't spam
+		 * the VL server suddenly with lots of requests
+		 */
+		xvnode = list_entry(server->cb_promises.prev,
+				    struct afs_vnode, update);
+		if (vnode->update_at <= xvnode->update_at)
+			vnode->update_at = xvnode->update_at + 1;
+		xvnode = list_entry(server->cb_promises.next,
+				    struct afs_vnode, update);
+		timeout = xvnode->update_at - now;
+		if (timeout < 0)
+			timeout = 0;
+	} else {
+		timeout = afs_vnode_update_timeout;
+	}
+
+	list_add_tail(&vnode->update, &server->cb_promises);
+
+	_debug("timeout %ld", timeout);
+	queue_delayed_work(afs_vnode_update_worker,
+			   &afs_vnode_update, timeout * HZ);
+	spin_unlock(&server->cb_lock);
+	afs_put_vnode(vl);
+}
+#endif
+
+/*
+ * initialise the callback update process
+ */
+int __init afs_callback_update_init(void)
+{
+	afs_callback_update_worker =
+		create_singlethread_workqueue("kafs_callbackd");
+	return afs_callback_update_worker ? 0 : -ENOMEM;
+}
 
-/*****************************************************************************/
 /*
- * allow the fileserver to see if the cache manager is still alive
+ * shut down the callback update process
  */
-int SRXAFSCM_Probe(struct afs_server *server)
+void __exit afs_callback_update_kill(void)
 {
-	_debug("SRXAFSCM_Probe(%p)\n", server);
-	return 0;
-} /* end SRXAFSCM_Probe() */
+	destroy_workqueue(afs_callback_update_worker);
+}
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index 1fc578372759..9b1311a1df51 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -1,4 +1,4 @@
-/* cell.c: AFS cell and server record management
+/* AFS cell and server record management
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -11,15 +11,9 @@
 
 #include <linux/module.h>
 #include <linux/slab.h>
-#include <rxrpc/peer.h>
-#include <rxrpc/connection.h>
-#include "volume.h"
-#include "cell.h"
-#include "server.h"
-#include "transport.h"
-#include "vlclient.h"
-#include "kafstimod.h"
-#include "super.h"
+#include <linux/key.h>
+#include <linux/ctype.h>
+#include <keys/rxrpc-type.h>
 #include "internal.h"
 
 DECLARE_RWSEM(afs_proc_cells_sem);
@@ -28,66 +22,47 @@ LIST_HEAD(afs_proc_cells);
 static struct list_head afs_cells = LIST_HEAD_INIT(afs_cells);
 static DEFINE_RWLOCK(afs_cells_lock);
 static DECLARE_RWSEM(afs_cells_sem); /* add/remove serialisation */
+static DECLARE_WAIT_QUEUE_HEAD(afs_cells_freeable_wq);
 static struct afs_cell *afs_cell_root;
 
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry);
-static void afs_cell_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_cache_cell_index_def = {
-	.name			= "cell_ix",
-	.data_size		= sizeof(struct afs_cache_cell),
-	.keys[0]		= { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match			= afs_cell_cache_match,
-	.update			= afs_cell_cache_update,
-};
-#endif
-
-/*****************************************************************************/
 /*
- * create a cell record
- * - "name" is the name of the cell
- * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format
+ * allocate a cell record and fill in its name, VL server address list and
+ * allocate an anonymous key
  */
-int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell)
+static struct afs_cell *afs_cell_alloc(const char *name, char *vllist)
 {
 	struct afs_cell *cell;
-	char *next;
+	size_t namelen;
+	char keyname[4 + AFS_MAXCELLNAME + 1], *cp, *dp, *next;
 	int ret;
 
-	_enter("%s", name);
+	_enter("%s,%s", name, vllist);
 
 	BUG_ON(!name); /* TODO: want to look up "this cell" in the cache */
 
+	namelen = strlen(name);
+	if (namelen > AFS_MAXCELLNAME)
+		return ERR_PTR(-ENAMETOOLONG);
+
 	/* allocate and initialise a cell record */
-	cell = kmalloc(sizeof(struct afs_cell) + strlen(name) + 1, GFP_KERNEL);
+	cell = kzalloc(sizeof(struct afs_cell) + namelen + 1, GFP_KERNEL);
 	if (!cell) {
 		_leave(" = -ENOMEM");
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
 
-	down_write(&afs_cells_sem);
-
-	memset(cell, 0, sizeof(struct afs_cell));
-	atomic_set(&cell->usage, 0);
+	memcpy(cell->name, name, namelen);
+	cell->name[namelen] = 0;
 
+	atomic_set(&cell->usage, 1);
 	INIT_LIST_HEAD(&cell->link);
-
-	rwlock_init(&cell->sv_lock);
-	INIT_LIST_HEAD(&cell->sv_list);
-	INIT_LIST_HEAD(&cell->sv_graveyard);
-	spin_lock_init(&cell->sv_gylock);
-
+	rwlock_init(&cell->servers_lock);
+	INIT_LIST_HEAD(&cell->servers);
 	init_rwsem(&cell->vl_sem);
 	INIT_LIST_HEAD(&cell->vl_list);
-	INIT_LIST_HEAD(&cell->vl_graveyard);
-	spin_lock_init(&cell->vl_gylock);
-
-	strcpy(cell->name,name);
+	spin_lock_init(&cell->vl_lock);
 
 	/* fill in the VL server list from the rest of the string */
-	ret = -EINVAL;
 	do {
 		unsigned a, b, c, d;
 
@@ -96,20 +71,75 @@ int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell)
 			*next++ = 0;
 
 		if (sscanf(vllist, "%u.%u.%u.%u", &a, &b, &c, &d) != 4)
-			goto badaddr;
+			goto bad_address;
 
 		if (a > 255 || b > 255 || c > 255 || d > 255)
-			goto badaddr;
+			goto bad_address;
 
 		cell->vl_addrs[cell->vl_naddrs++].s_addr =
 			htonl((a << 24) | (b << 16) | (c << 8) | d);
 
-		if (cell->vl_naddrs >= AFS_CELL_MAX_ADDRS)
-			break;
+	} while (cell->vl_naddrs < AFS_CELL_MAX_ADDRS && (vllist = next));
+
+	/* create a key to represent an anonymous user */
+	memcpy(keyname, "afs@", 4);
+	dp = keyname + 4;
+	cp = cell->name;
+	do {
+		*dp++ = toupper(*cp);
+	} while (*cp++);
+	cell->anonymous_key = key_alloc(&key_type_rxrpc, keyname, 0, 0, current,
+					KEY_POS_SEARCH, KEY_ALLOC_NOT_IN_QUOTA);
+	if (IS_ERR(cell->anonymous_key)) {
+		_debug("no key");
+		ret = PTR_ERR(cell->anonymous_key);
+		goto error;
+	}
+
+	ret = key_instantiate_and_link(cell->anonymous_key, NULL, 0,
+				       NULL, NULL);
+	if (ret < 0) {
+		_debug("instantiate failed");
+		goto error;
+	}
+
+	_debug("anon key %p{%x}",
+	       cell->anonymous_key, key_serial(cell->anonymous_key));
+
+	_leave(" = %p", cell);
+	return cell;
+
+bad_address:
+	printk(KERN_ERR "kAFS: bad VL server IP address\n");
+	ret = -EINVAL;
+error:
+	key_put(cell->anonymous_key);
+	kfree(cell);
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * create a cell record
+ * - "name" is the name of the cell
+ * - "vllist" is a colon separated list of IP addresses in "a.b.c.d" format
+ */
+struct afs_cell *afs_cell_create(const char *name, char *vllist)
+{
+	struct afs_cell *cell;
+	int ret;
+
+	_enter("%s,%s", name, vllist);
 
-	} while(vllist = next, vllist);
+	cell = afs_cell_alloc(name, vllist);
+	if (IS_ERR(cell)) {
+		_leave(" = %ld", PTR_ERR(cell));
+		return cell;
+	}
+
+	down_write(&afs_cells_sem);
 
-	/* add a proc dir for this cell */
+	/* add a proc directory for this cell */
 	ret = afs_proc_cell_setup(cell);
 	if (ret < 0)
 		goto error;
@@ -130,31 +160,28 @@ int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell)
 	down_write(&afs_proc_cells_sem);
 	list_add_tail(&cell->proc_link, &afs_proc_cells);
 	up_write(&afs_proc_cells_sem);
-
-	*_cell = cell;
 	up_write(&afs_cells_sem);
 
-	_leave(" = 0 (%p)", cell);
-	return 0;
+	_leave(" = %p", cell);
+	return cell;
 
- badaddr:
-	printk(KERN_ERR "kAFS: bad VL server IP address: '%s'\n", vllist);
- error:
+error:
 	up_write(&afs_cells_sem);
+	key_put(cell->anonymous_key);
 	kfree(cell);
 	_leave(" = %d", ret);
-	return ret;
-} /* end afs_cell_create() */
+	return ERR_PTR(ret);
+}
 
-/*****************************************************************************/
 /*
- * initialise the cell database from module parameters
+ * set the root cell information
+ * - can be called with a module parameter string
+ * - can be called from a write to /proc/fs/afs/rootcell
  */
 int afs_cell_init(char *rootcell)
 {
 	struct afs_cell *old_root, *new_root;
 	char *cp;
-	int ret;
 
 	_enter("");
 
@@ -162,82 +189,60 @@ int afs_cell_init(char *rootcell)
 		/* module is loaded with no parameters, or built statically.
 		 * - in the future we might initialize cell DB here.
 		 */
-		_leave(" = 0 (but no root)");
+		_leave(" = 0 [no root]");
 		return 0;
 	}
 
 	cp = strchr(rootcell, ':');
 	if (!cp) {
 		printk(KERN_ERR "kAFS: no VL server IP addresses specified\n");
-		_leave(" = %d (no colon)", -EINVAL);
+		_leave(" = -EINVAL");
 		return -EINVAL;
 	}
 
 	/* allocate a cell record for the root cell */
 	*cp++ = 0;
-	ret = afs_cell_create(rootcell, cp, &new_root);
-	if (ret < 0) {
-		_leave(" = %d", ret);
-		return ret;
+	new_root = afs_cell_create(rootcell, cp);
+	if (IS_ERR(new_root)) {
+		_leave(" = %ld", PTR_ERR(new_root));
+		return PTR_ERR(new_root);
 	}
 
-	/* as afs_put_cell() takes locks by itself, we have to do
-	 * a little gymnastics to be race-free.
-	 */
-	afs_get_cell(new_root);
-
+	/* install the new cell */
 	write_lock(&afs_cells_lock);
-	while (afs_cell_root) {
-		old_root = afs_cell_root;
-		afs_cell_root = NULL;
-		write_unlock(&afs_cells_lock);
-		afs_put_cell(old_root);
-		write_lock(&afs_cells_lock);
-	}
+	old_root = afs_cell_root;
 	afs_cell_root = new_root;
 	write_unlock(&afs_cells_lock);
+	afs_put_cell(old_root);
 
-	_leave(" = %d", ret);
-	return ret;
-
-} /* end afs_cell_init() */
+	_leave(" = 0");
+	return 0;
+}
 
-/*****************************************************************************/
 /*
  * lookup a cell record
  */
-int afs_cell_lookup(const char *name, unsigned namesz, struct afs_cell **_cell)
+struct afs_cell *afs_cell_lookup(const char *name, unsigned namesz)
 {
 	struct afs_cell *cell;
-	int ret;
 
 	_enter("\"%*.*s\",", namesz, namesz, name ? name : "");
 
-	*_cell = NULL;
+	down_read(&afs_cells_sem);
+	read_lock(&afs_cells_lock);
 
 	if (name) {
 		/* if the cell was named, look for it in the cell record list */
-		ret = -ENOENT;
-		cell = NULL;
-		read_lock(&afs_cells_lock);
-
 		list_for_each_entry(cell, &afs_cells, link) {
 			if (strncmp(cell->name, name, namesz) == 0) {
 				afs_get_cell(cell);
 				goto found;
 			}
 		}
-		cell = NULL;
+		cell = ERR_PTR(-ENOENT);
 	found:
-
-		read_unlock(&afs_cells_lock);
-
-		if (cell)
-			ret = 0;
-	}
-	else {
-		read_lock(&afs_cells_lock);
-
+		;
+	} else {
 		cell = afs_cell_root;
 		if (!cell) {
 			/* this should not happen unless user tries to mount
@@ -246,44 +251,35 @@ int afs_cell_lookup(const char *name, unsigned namesz, struct afs_cell **_cell)
 			 * ENOENT might be "more appropriate" but they happen
 			 * for other reasons.
 			 */
-			ret = -EDESTADDRREQ;
-		}
-		else {
+			cell = ERR_PTR(-EDESTADDRREQ);
+		} else {
 			afs_get_cell(cell);
-			ret = 0;
 		}
 
-		read_unlock(&afs_cells_lock);
 	}
 
-	*_cell = cell;
-	_leave(" = %d (%p)", ret, cell);
-	return ret;
-
-} /* end afs_cell_lookup() */
+	read_unlock(&afs_cells_lock);
+	up_read(&afs_cells_sem);
+	_leave(" = %p", cell);
+	return cell;
+}
 
-/*****************************************************************************/
 /*
  * try and get a cell record
  */
-struct afs_cell *afs_get_cell_maybe(struct afs_cell **_cell)
+struct afs_cell *afs_get_cell_maybe(struct afs_cell *cell)
 {
-	struct afs_cell *cell;
-
 	write_lock(&afs_cells_lock);
 
-	cell = *_cell;
 	if (cell && !list_empty(&cell->link))
 		afs_get_cell(cell);
 	else
 		cell = NULL;
 
 	write_unlock(&afs_cells_lock);
-
 	return cell;
-} /* end afs_get_cell_maybe() */
+}
 
-/*****************************************************************************/
 /*
  * destroy a cell record
  */
@@ -294,8 +290,7 @@ void afs_put_cell(struct afs_cell *cell)
 
 	_enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name);
 
-	/* sanity check */
-	BUG_ON(atomic_read(&cell->usage) <= 0);
+	ASSERTCMP(atomic_read(&cell->usage), >, 0);
 
 	/* to prevent a race, the decrement and the dequeue must be effectively
 	 * atomic */
@@ -307,36 +302,49 @@ void afs_put_cell(struct afs_cell *cell)
 		return;
 	}
 
+	ASSERT(list_empty(&cell->servers));
+	ASSERT(list_empty(&cell->vl_list));
+
 	write_unlock(&afs_cells_lock);
 
-	BUG_ON(!list_empty(&cell->sv_list));
-	BUG_ON(!list_empty(&cell->sv_graveyard));
-	BUG_ON(!list_empty(&cell->vl_list));
-	BUG_ON(!list_empty(&cell->vl_graveyard));
+	wake_up(&afs_cells_freeable_wq);
 
 	_leave(" [unused]");
-} /* end afs_put_cell() */
+}
 
-/*****************************************************************************/
 /*
  * destroy a cell record
+ * - must be called with the afs_cells_sem write-locked
+ * - cell->link should have been broken by the caller
  */
 static void afs_cell_destroy(struct afs_cell *cell)
 {
 	_enter("%p{%d,%s}", cell, atomic_read(&cell->usage), cell->name);
 
-	/* to prevent a race, the decrement and the dequeue must be effectively
-	 * atomic */
-	write_lock(&afs_cells_lock);
+	ASSERTCMP(atomic_read(&cell->usage), >=, 0);
+	ASSERT(list_empty(&cell->link));
 
-	/* sanity check */
-	BUG_ON(atomic_read(&cell->usage) != 0);
+	/* wait for everyone to stop using the cell */
+	if (atomic_read(&cell->usage) > 0) {
+		DECLARE_WAITQUEUE(myself, current);
 
-	list_del_init(&cell->link);
+		_debug("wait for cell %s", cell->name);
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		add_wait_queue(&afs_cells_freeable_wq, &myself);
 
-	write_unlock(&afs_cells_lock);
+		while (atomic_read(&cell->usage) > 0) {
+			schedule();
+			set_current_state(TASK_UNINTERRUPTIBLE);
+		}
 
-	down_write(&afs_cells_sem);
+		remove_wait_queue(&afs_cells_freeable_wq, &myself);
+		set_current_state(TASK_RUNNING);
+	}
+
+	_debug("cell dead");
+	ASSERTCMP(atomic_read(&cell->usage), ==, 0);
+	ASSERT(list_empty(&cell->servers));
+	ASSERT(list_empty(&cell->vl_list));
 
 	afs_proc_cell_remove(cell);
 
@@ -348,104 +356,26 @@ static void afs_cell_destroy(struct afs_cell *cell)
 	cachefs_relinquish_cookie(cell->cache, 0);
 #endif
 
-	up_write(&afs_cells_sem);
-
-	BUG_ON(!list_empty(&cell->sv_list));
-	BUG_ON(!list_empty(&cell->sv_graveyard));
-	BUG_ON(!list_empty(&cell->vl_list));
-	BUG_ON(!list_empty(&cell->vl_graveyard));
-
-	/* finish cleaning up the cell */
+	key_put(cell->anonymous_key);
 	kfree(cell);
 
 	_leave(" [destroyed]");
-} /* end afs_cell_destroy() */
-
-/*****************************************************************************/
-/*
- * lookup the server record corresponding to an Rx RPC peer
- */
-int afs_server_find_by_peer(const struct rxrpc_peer *peer,
-			    struct afs_server **_server)
-{
-	struct afs_server *server;
-	struct afs_cell *cell;
-
-	_enter("%p{a=%08x},", peer, ntohl(peer->addr.s_addr));
-
-	/* search the cell list */
-	read_lock(&afs_cells_lock);
-
-	list_for_each_entry(cell, &afs_cells, link) {
-
-		_debug("? cell %s",cell->name);
-
-		write_lock(&cell->sv_lock);
-
-		/* check the active list */
-		list_for_each_entry(server, &cell->sv_list, link) {
-			_debug("?? server %08x", ntohl(server->addr.s_addr));
-
-			if (memcmp(&server->addr, &peer->addr,
-				   sizeof(struct in_addr)) == 0)
-				goto found_server;
-		}
+}
 
-		/* check the inactive list */
-		spin_lock(&cell->sv_gylock);
-		list_for_each_entry(server, &cell->sv_graveyard, link) {
-			_debug("?? dead server %08x",
-			       ntohl(server->addr.s_addr));
-
-			if (memcmp(&server->addr, &peer->addr,
-				   sizeof(struct in_addr)) == 0)
-				goto found_dead_server;
-		}
-		spin_unlock(&cell->sv_gylock);
-
-		write_unlock(&cell->sv_lock);
-	}
-	read_unlock(&afs_cells_lock);
-
-	_leave(" = -ENOENT");
-	return -ENOENT;
-
-	/* we found it in the graveyard - resurrect it */
- found_dead_server:
-	list_move_tail(&server->link, &cell->sv_list);
-	afs_get_server(server);
-	afs_kafstimod_del_timer(&server->timeout);
-	spin_unlock(&cell->sv_gylock);
-	goto success;
-
-	/* we found it - increment its ref count and return it */
- found_server:
-	afs_get_server(server);
-
- success:
-	write_unlock(&cell->sv_lock);
-	read_unlock(&afs_cells_lock);
-
-	*_server = server;
-	_leave(" = 0 (s=%p c=%p)", server, cell);
-	return 0;
-
-} /* end afs_server_find_by_peer() */
-
-/*****************************************************************************/
 /*
  * purge in-memory cell database on module unload or afs_init() failure
  * - the timeout daemon is stopped before calling this
  */
 void afs_cell_purge(void)
 {
-	struct afs_vlocation *vlocation;
 	struct afs_cell *cell;
 
 	_enter("");
 
 	afs_put_cell(afs_cell_root);
 
+	down_write(&afs_cells_sem);
+
 	while (!list_empty(&afs_cells)) {
 		cell = NULL;
 
@@ -464,104 +394,11 @@ void afs_cell_purge(void)
 			_debug("PURGING CELL %s (%d)",
 			       cell->name, atomic_read(&cell->usage));
 
-			BUG_ON(!list_empty(&cell->sv_list));
-			BUG_ON(!list_empty(&cell->vl_list));
-
-			/* purge the cell's VL graveyard list */
-			_debug(" - clearing VL graveyard");
-
-			spin_lock(&cell->vl_gylock);
-
-			while (!list_empty(&cell->vl_graveyard)) {
-				vlocation = list_entry(cell->vl_graveyard.next,
-						       struct afs_vlocation,
-						       link);
-				list_del_init(&vlocation->link);
-
-				afs_kafstimod_del_timer(&vlocation->timeout);
-
-				spin_unlock(&cell->vl_gylock);
-
-				afs_vlocation_do_timeout(vlocation);
-				/* TODO: race if move to use krxtimod instead
-				 * of kafstimod */
-
-				spin_lock(&cell->vl_gylock);
-			}
-
-			spin_unlock(&cell->vl_gylock);
-
-			/* purge the cell's server graveyard list */
-			_debug(" - clearing server graveyard");
-
-			spin_lock(&cell->sv_gylock);
-
-			while (!list_empty(&cell->sv_graveyard)) {
-				struct afs_server *server;
-
-				server = list_entry(cell->sv_graveyard.next,
-						    struct afs_server, link);
-				list_del_init(&server->link);
-
-				afs_kafstimod_del_timer(&server->timeout);
-
-				spin_unlock(&cell->sv_gylock);
-
-				afs_server_do_timeout(server);
-
-				spin_lock(&cell->sv_gylock);
-			}
-
-			spin_unlock(&cell->sv_gylock);
-
 			/* now the cell should be left with no references */
 			afs_cell_destroy(cell);
 		}
 	}
 
+	up_write(&afs_cells_sem);
 	_leave("");
-} /* end afs_cell_purge() */
-
-/*****************************************************************************/
-/*
- * match a cell record obtained from the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_cell_cache_match(void *target,
-						const void *entry)
-{
-	const struct afs_cache_cell *ccell = entry;
-	struct afs_cell *cell = target;
-
-	_enter("{%s},{%s}", ccell->name, cell->name);
-
-	if (strncmp(ccell->name, cell->name, sizeof(ccell->name)) == 0) {
-		_leave(" = SUCCESS");
-		return CACHEFS_MATCH_SUCCESS;
-	}
-
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
-} /* end afs_cell_cache_match() */
-#endif
-
-/*****************************************************************************/
-/*
- * update a cell record in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_cell_cache_update(void *source, void *entry)
-{
-	struct afs_cache_cell *ccell = entry;
-	struct afs_cell *cell = source;
-
-	_enter("%p,%p", source, entry);
-
-	strncpy(ccell->name, cell->name, sizeof(ccell->name));
-
-	memcpy(ccell->vl_servers,
-	       cell->vl_addrs,
-	       min(sizeof(ccell->vl_servers), sizeof(cell->vl_addrs)));
-
-} /* end afs_cell_cache_update() */
-#endif
+}
diff --git a/fs/afs/cell.h b/fs/afs/cell.h
deleted file mode 100644
index 48349108fb00..000000000000
--- a/fs/afs/cell.h
+++ /dev/null
@@ -1,78 +0,0 @@
-/* cell.h: AFS cell record
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_CELL_H
-#define _LINUX_AFS_CELL_H
-
-#include "types.h"
-#include "cache.h"
-
-#define AFS_CELL_MAX_ADDRS 15
-
-extern volatile int afs_cells_being_purged; /* T when cells are being purged by rmmod */
-
-/*****************************************************************************/
-/*
- * entry in the cached cell catalogue
- */
-struct afs_cache_cell
-{
-	char			name[64];	/* cell name (padded with NULs) */
-	struct in_addr		vl_servers[15];	/* cached cell VL servers */
-};
-
-/*****************************************************************************/
-/*
- * AFS cell record
- */
-struct afs_cell
-{
-	atomic_t		usage;
-	struct list_head	link;		/* main cell list link */
-	struct list_head	proc_link;	/* /proc cell list link */
-	struct proc_dir_entry	*proc_dir;	/* /proc dir for this cell */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
-#endif
-
-	/* server record management */
-	rwlock_t		sv_lock;	/* active server list lock */
-	struct list_head	sv_list;	/* active server list */
-	struct list_head	sv_graveyard;	/* inactive server list */
-	spinlock_t		sv_gylock;	/* inactive server list lock */
-
-	/* volume location record management */
-	struct rw_semaphore	vl_sem;		/* volume management serialisation semaphore */
-	struct list_head	vl_list;	/* cell's active VL record list */
-	struct list_head	vl_graveyard;	/* cell's inactive VL record list */
-	spinlock_t		vl_gylock;	/* graveyard lock */
-	unsigned short		vl_naddrs;	/* number of VL servers in addr list */
-	unsigned short		vl_curr_svix;	/* current server index */
-	struct in_addr		vl_addrs[AFS_CELL_MAX_ADDRS];	/* cell VL server addresses */
-
-	char			name[0];	/* cell name - must go last */
-};
-
-extern int afs_cell_init(char *rootcell);
-
-extern int afs_cell_create(const char *name, char *vllist, struct afs_cell **_cell);
-
-extern int afs_cell_lookup(const char *name, unsigned nmsize, struct afs_cell **_cell);
-
-#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
-
-extern struct afs_cell *afs_get_cell_maybe(struct afs_cell **_cell);
-
-extern void afs_put_cell(struct afs_cell *cell);
-
-extern void afs_cell_purge(void);
-
-#endif /* _LINUX_AFS_CELL_H */
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 3d097fddcb7a..6685f4cbccb3 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -1,4 +1,4 @@
-/* cmservice.c: AFS Cache Manager Service
+/* AFS Cache Manager Service
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -12,641 +12,463 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/sched.h>
-#include <linux/completion.h>
-#include "server.h"
-#include "cell.h"
-#include "transport.h"
-#include <rxrpc/rxrpc.h>
-#include <rxrpc/transport.h>
-#include <rxrpc/connection.h>
-#include <rxrpc/call.h>
-#include "cmservice.h"
+#include <linux/ip.h>
 #include "internal.h"
+#include "afs_cm.h"
 
-static unsigned afscm_usage;		/* AFS cache manager usage count */
-static struct rw_semaphore afscm_sem;	/* AFS cache manager start/stop semaphore */
-
-static int afscm_new_call(struct rxrpc_call *call);
-static void afscm_attention(struct rxrpc_call *call);
-static void afscm_error(struct rxrpc_call *call);
-static void afscm_aemap(struct rxrpc_call *call);
-
-static void _SRXAFSCM_CallBack(struct rxrpc_call *call);
-static void _SRXAFSCM_InitCallBackState(struct rxrpc_call *call);
-static void _SRXAFSCM_Probe(struct rxrpc_call *call);
-
-typedef void (*_SRXAFSCM_xxxx_t)(struct rxrpc_call *call);
-
-static const struct rxrpc_operation AFSCM_ops[] = {
-	{
-		.id	= 204,
-		.asize	= RXRPC_APP_MARK_EOF,
-		.name	= "CallBack",
-		.user	= _SRXAFSCM_CallBack,
-	},
-	{
-		.id	= 205,
-		.asize	= RXRPC_APP_MARK_EOF,
-		.name	= "InitCallBackState",
-		.user	= _SRXAFSCM_InitCallBackState,
-	},
-	{
-		.id	= 206,
-		.asize	= RXRPC_APP_MARK_EOF,
-		.name	= "Probe",
-		.user	= _SRXAFSCM_Probe,
-	},
-#if 0
-	{
-		.id	= 207,
-		.asize	= RXRPC_APP_MARK_EOF,
-		.name	= "GetLock",
-		.user	= _SRXAFSCM_GetLock,
-	},
-	{
-		.id	= 208,
-		.asize	= RXRPC_APP_MARK_EOF,
-		.name	= "GetCE",
-		.user	= _SRXAFSCM_GetCE,
-	},
-	{
-		.id	= 209,
-		.asize	= RXRPC_APP_MARK_EOF,
-		.name	= "GetXStatsVersion",
-		.user	= _SRXAFSCM_GetXStatsVersion,
-	},
-	{
-		.id	= 210,
-		.asize	= RXRPC_APP_MARK_EOF,
-		.name	= "GetXStats",
-		.user	= _SRXAFSCM_GetXStats,
-	}
-#endif
-};
+struct workqueue_struct *afs_cm_workqueue;
 
-static struct rxrpc_service AFSCM_service = {
-	.name		= "AFS/CM",
-	.owner		= THIS_MODULE,
-	.link		= LIST_HEAD_INIT(AFSCM_service.link),
-	.new_call	= afscm_new_call,
-	.service_id	= 1,
-	.attn_func	= afscm_attention,
-	.error_func	= afscm_error,
-	.aemap_func	= afscm_aemap,
-	.ops_begin	= &AFSCM_ops[0],
-	.ops_end	= &AFSCM_ops[ARRAY_SIZE(AFSCM_ops)],
-};
+static int afs_deliver_cb_init_call_back_state(struct afs_call *,
+					       struct sk_buff *, bool);
+static int afs_deliver_cb_init_call_back_state3(struct afs_call *,
+						struct sk_buff *, bool);
+static int afs_deliver_cb_probe(struct afs_call *, struct sk_buff *, bool);
+static int afs_deliver_cb_callback(struct afs_call *, struct sk_buff *, bool);
+static int afs_deliver_cb_get_capabilities(struct afs_call *, struct sk_buff *,
+					   bool);
+static void afs_cm_destructor(struct afs_call *);
 
-static DECLARE_COMPLETION(kafscmd_alive);
-static DECLARE_COMPLETION(kafscmd_dead);
-static DECLARE_WAIT_QUEUE_HEAD(kafscmd_sleepq);
-static LIST_HEAD(kafscmd_attention_list);
-static LIST_HEAD(afscm_calls);
-static DEFINE_SPINLOCK(afscm_calls_lock);
-static DEFINE_SPINLOCK(kafscmd_attention_lock);
-static int kafscmd_die;
-
-/*****************************************************************************/
 /*
- * AFS Cache Manager kernel thread
+ * CB.CallBack operation type
  */
-static int kafscmd(void *arg)
-{
-	DECLARE_WAITQUEUE(myself, current);
-
-	struct rxrpc_call *call;
-	_SRXAFSCM_xxxx_t func;
-	int die;
-
-	printk(KERN_INFO "kAFS: Started kafscmd %d\n", current->pid);
-
-	daemonize("kafscmd");
-
-	complete(&kafscmd_alive);
-
-	/* loop around looking for things to attend to */
-	do {
-		if (list_empty(&kafscmd_attention_list)) {
-			set_current_state(TASK_INTERRUPTIBLE);
-			add_wait_queue(&kafscmd_sleepq, &myself);
-
-			for (;;) {
-				set_current_state(TASK_INTERRUPTIBLE);
-				if (!list_empty(&kafscmd_attention_list) ||
-				    signal_pending(current) ||
-				    kafscmd_die)
-					break;
-
-				schedule();
-			}
-
-			remove_wait_queue(&kafscmd_sleepq, &myself);
-			set_current_state(TASK_RUNNING);
-		}
-
-		die = kafscmd_die;
-
-		/* dequeue the next call requiring attention */
-		call = NULL;
-		spin_lock(&kafscmd_attention_lock);
-
-		if (!list_empty(&kafscmd_attention_list)) {
-			call = list_entry(kafscmd_attention_list.next,
-					  struct rxrpc_call,
-					  app_attn_link);
-			list_del_init(&call->app_attn_link);
-			die = 0;
-		}
-
-		spin_unlock(&kafscmd_attention_lock);
-
-		if (call) {
-			/* act upon it */
-			_debug("@@@ Begin Attend Call %p", call);
-
-			func = call->app_user;
-			if (func)
-				func(call);
-
-			rxrpc_put_call(call);
-
-			_debug("@@@ End Attend Call %p", call);
-		}
-
-	} while(!die);
-
-	/* and that's all */
-	complete_and_exit(&kafscmd_dead, 0);
-
-} /* end kafscmd() */
+static const struct afs_call_type afs_SRXCBCallBack = {
+	.name		= "CB.CallBack",
+	.deliver	= afs_deliver_cb_callback,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_cm_destructor,
+};
 
-/*****************************************************************************/
 /*
- * handle a call coming in to the cache manager
- * - if I want to keep the call, I must increment its usage count
- * - the return value will be negated and passed back in an abort packet if
- *   non-zero
- * - serialised by virtue of there only being one krxiod
+ * CB.InitCallBackState operation type
  */
-static int afscm_new_call(struct rxrpc_call *call)
-{
-	_enter("%p{cid=%u u=%d}",
-	       call, ntohl(call->call_id), atomic_read(&call->usage));
-
-	rxrpc_get_call(call);
-
-	/* add to my current call list */
-	spin_lock(&afscm_calls_lock);
-	list_add(&call->app_link,&afscm_calls);
-	spin_unlock(&afscm_calls_lock);
-
-	_leave(" = 0");
-	return 0;
-
-} /* end afscm_new_call() */
+static const struct afs_call_type afs_SRXCBInitCallBackState = {
+	.name		= "CB.InitCallBackState",
+	.deliver	= afs_deliver_cb_init_call_back_state,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_cm_destructor,
+};
 
-/*****************************************************************************/
 /*
- * queue on the kafscmd queue for attention
+ * CB.InitCallBackState3 operation type
  */
-static void afscm_attention(struct rxrpc_call *call)
-{
-	_enter("%p{cid=%u u=%d}",
-	       call, ntohl(call->call_id), atomic_read(&call->usage));
-
-	spin_lock(&kafscmd_attention_lock);
-
-	if (list_empty(&call->app_attn_link)) {
-		list_add_tail(&call->app_attn_link, &kafscmd_attention_list);
-		rxrpc_get_call(call);
-	}
-
-	spin_unlock(&kafscmd_attention_lock);
-
-	wake_up(&kafscmd_sleepq);
-
-	_leave(" {u=%d}", atomic_read(&call->usage));
-} /* end afscm_attention() */
+static const struct afs_call_type afs_SRXCBInitCallBackState3 = {
+	.name		= "CB.InitCallBackState3",
+	.deliver	= afs_deliver_cb_init_call_back_state3,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_cm_destructor,
+};
 
-/*****************************************************************************/
 /*
- * handle my call being aborted
- * - clean up, dequeue and put my ref to the call
+ * CB.Probe operation type
  */
-static void afscm_error(struct rxrpc_call *call)
-{
-	int removed;
-
-	_enter("%p{est=%s ac=%u er=%d}",
-	       call,
-	       rxrpc_call_error_states[call->app_err_state],
-	       call->app_abort_code,
-	       call->app_errno);
-
-	spin_lock(&kafscmd_attention_lock);
-
-	if (list_empty(&call->app_attn_link)) {
-		list_add_tail(&call->app_attn_link, &kafscmd_attention_list);
-		rxrpc_get_call(call);
-	}
-
-	spin_unlock(&kafscmd_attention_lock);
-
-	removed = 0;
-	spin_lock(&afscm_calls_lock);
-	if (!list_empty(&call->app_link)) {
-		list_del_init(&call->app_link);
-		removed = 1;
-	}
-	spin_unlock(&afscm_calls_lock);
-
-	if (removed)
-		rxrpc_put_call(call);
-
-	wake_up(&kafscmd_sleepq);
+static const struct afs_call_type afs_SRXCBProbe = {
+	.name		= "CB.Probe",
+	.deliver	= afs_deliver_cb_probe,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_cm_destructor,
+};
 
-	_leave("");
-} /* end afscm_error() */
+/*
+ * CB.GetCapabilities operation type
+ */
+static const struct afs_call_type afs_SRXCBGetCapabilites = {
+	.name		= "CB.GetCapabilities",
+	.deliver	= afs_deliver_cb_get_capabilities,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_cm_destructor,
+};
 
-/*****************************************************************************/
 /*
- * map afs abort codes to/from Linux error codes
- * - called with call->lock held
+ * route an incoming cache manager call
+ * - return T if supported, F if not
  */
-static void afscm_aemap(struct rxrpc_call *call)
+bool afs_cm_incoming_call(struct afs_call *call)
 {
-	switch (call->app_err_state) {
-	case RXRPC_ESTATE_LOCAL_ABORT:
-		call->app_abort_code = -call->app_errno;
-		break;
-	case RXRPC_ESTATE_PEER_ABORT:
-		call->app_errno = -ECONNABORTED;
-		break;
+	u32 operation_id = ntohl(call->operation_ID);
+
+	_enter("{CB.OP %u}", operation_id);
+
+	switch (operation_id) {
+	case CBCallBack:
+		call->type = &afs_SRXCBCallBack;
+		return true;
+	case CBInitCallBackState:
+		call->type = &afs_SRXCBInitCallBackState;
+		return true;
+	case CBInitCallBackState3:
+		call->type = &afs_SRXCBInitCallBackState3;
+		return true;
+	case CBProbe:
+		call->type = &afs_SRXCBProbe;
+		return true;
+	case CBGetCapabilities:
+		call->type = &afs_SRXCBGetCapabilites;
+		return true;
 	default:
-		break;
+		return false;
 	}
-} /* end afscm_aemap() */
+}
 
-/*****************************************************************************/
 /*
- * start the cache manager service if not already started
+ * clean up a cache manager call
  */
-int afscm_start(void)
+static void afs_cm_destructor(struct afs_call *call)
 {
-	int ret;
-
-	down_write(&afscm_sem);
-	if (!afscm_usage) {
-		ret = kernel_thread(kafscmd, NULL, 0);
-		if (ret < 0)
-			goto out;
-
-		wait_for_completion(&kafscmd_alive);
-
-		ret = rxrpc_add_service(afs_transport, &AFSCM_service);
-		if (ret < 0)
-			goto kill;
-
-		afs_kafstimod_add_timer(&afs_mntpt_expiry_timer,
-					afs_mntpt_expiry_timeout * HZ);
-	}
-
-	afscm_usage++;
-	up_write(&afscm_sem);
-
-	return 0;
-
- kill:
-	kafscmd_die = 1;
-	wake_up(&kafscmd_sleepq);
-	wait_for_completion(&kafscmd_dead);
-
- out:
-	up_write(&afscm_sem);
-	return ret;
+	_enter("");
 
-} /* end afscm_start() */
+	afs_put_server(call->server);
+	call->server = NULL;
+	kfree(call->buffer);
+	call->buffer = NULL;
+}
 
-/*****************************************************************************/
 /*
- * stop the cache manager service
+ * allow the fileserver to see if the cache manager is still alive
  */
-void afscm_stop(void)
+static void SRXAFSCB_CallBack(struct work_struct *work)
 {
-	struct rxrpc_call *call;
+	struct afs_call *call = container_of(work, struct afs_call, work);
 
-	down_write(&afscm_sem);
+	_enter("");
 
-	BUG_ON(afscm_usage == 0);
-	afscm_usage--;
+	/* be sure to send the reply *before* attempting to spam the AFS server
+	 * with FSFetchStatus requests on the vnodes with broken callbacks lest
+	 * the AFS server get into a vicious cycle of trying to break further
+	 * callbacks because it hadn't received completion of the CBCallBack op
+	 * yet */
+	afs_send_empty_reply(call);
 
-	if (afscm_usage == 0) {
-		/* don't want more incoming calls */
-		rxrpc_del_service(afs_transport, &AFSCM_service);
-
-		/* abort any calls I've still got open (the afscm_error() will
-		 * dequeue them) */
-		spin_lock(&afscm_calls_lock);
-		while (!list_empty(&afscm_calls)) {
-			call = list_entry(afscm_calls.next,
-					  struct rxrpc_call,
-					  app_link);
+	afs_break_callbacks(call->server, call->count, call->request);
+	_leave("");
+}
 
-			list_del_init(&call->app_link);
-			rxrpc_get_call(call);
-			spin_unlock(&afscm_calls_lock);
+/*
+ * deliver request data to a CB.CallBack call
+ */
+static int afs_deliver_cb_callback(struct afs_call *call, struct sk_buff *skb,
+				   bool last)
+{
+	struct afs_callback *cb;
+	struct afs_server *server;
+	struct in_addr addr;
+	__be32 *bp;
+	u32 tmp;
+	int ret, loop;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	switch (call->unmarshall) {
+	case 0:
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the FID array and its count in two steps */
+	case 1:
+		_debug("extract FID count");
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
 
-			rxrpc_call_abort(call, -ESRCH); /* abort, dequeue and
-							 * put */
+		call->count = ntohl(call->tmp);
+		_debug("FID count: %u", call->count);
+		if (call->count > AFSCBMAX)
+			return -EBADMSG;
+
+		call->buffer = kmalloc(call->count * 3 * 4, GFP_KERNEL);
+		if (!call->buffer)
+			return -ENOMEM;
+		call->offset = 0;
+		call->unmarshall++;
+
+	case 2:
+		_debug("extract FID array");
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       call->count * 3 * 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
 
-			_debug("nuking active call %08x.%d",
-			       ntohl(call->conn->conn_id),
-			       ntohl(call->call_id));
-			rxrpc_put_call(call);
-			rxrpc_put_call(call);
+		_debug("unmarshall FID array");
+		call->request = kcalloc(call->count,
+					sizeof(struct afs_callback),
+					GFP_KERNEL);
+		if (!call->request)
+			return -ENOMEM;
+
+		cb = call->request;
+		bp = call->buffer;
+		for (loop = call->count; loop > 0; loop--, cb++) {
+			cb->fid.vid	= ntohl(*bp++);
+			cb->fid.vnode	= ntohl(*bp++);
+			cb->fid.unique	= ntohl(*bp++);
+			cb->type	= AFSCM_CB_UNTYPED;
+		}
 
-			spin_lock(&afscm_calls_lock);
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the callback array and its count in two steps */
+	case 3:
+		_debug("extract CB count");
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
 		}
-		spin_unlock(&afscm_calls_lock);
 
-		/* get rid of my daemon */
-		kafscmd_die = 1;
-		wake_up(&kafscmd_sleepq);
-		wait_for_completion(&kafscmd_dead);
+		tmp = ntohl(call->tmp);
+		_debug("CB count: %u", tmp);
+		if (tmp != call->count && tmp != 0)
+			return -EBADMSG;
+		call->offset = 0;
+		call->unmarshall++;
+		if (tmp == 0)
+			goto empty_cb_array;
+
+	case 4:
+		_debug("extract CB array");
+		ret = afs_extract_data(call, skb, last, call->request,
+				       call->count * 3 * 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
 
-		/* dispose of any calls waiting for attention */
-		spin_lock(&kafscmd_attention_lock);
-		while (!list_empty(&kafscmd_attention_list)) {
-			call = list_entry(kafscmd_attention_list.next,
-					  struct rxrpc_call,
-					  app_attn_link);
+		_debug("unmarshall CB array");
+		cb = call->request;
+		bp = call->buffer;
+		for (loop = call->count; loop > 0; loop--, cb++) {
+			cb->version	= ntohl(*bp++);
+			cb->expiry	= ntohl(*bp++);
+			cb->type	= ntohl(*bp++);
+		}
 
-			list_del_init(&call->app_attn_link);
-			spin_unlock(&kafscmd_attention_lock);
+	empty_cb_array:
+		call->offset = 0;
+		call->unmarshall++;
 
-			rxrpc_put_call(call);
+	case 5:
+		_debug("trailer");
+		if (skb->len != 0)
+			return -EBADMSG;
+		break;
+	}
 
-			spin_lock(&kafscmd_attention_lock);
-		}
-		spin_unlock(&kafscmd_attention_lock);
+	if (!last)
+		return 0;
 
-		afs_kafstimod_del_timer(&afs_mntpt_expiry_timer);
-	}
+	call->state = AFS_CALL_REPLYING;
 
-	up_write(&afscm_sem);
+	/* we'll need the file server record as that tells us which set of
+	 * vnodes to operate upon */
+	memcpy(&addr, &ip_hdr(skb)->saddr, 4);
+	server = afs_find_server(&addr);
+	if (!server)
+		return -ENOTCONN;
+	call->server = server;
 
-} /* end afscm_stop() */
+	INIT_WORK(&call->work, SRXAFSCB_CallBack);
+	schedule_work(&call->work);
+	return 0;
+}
 
-/*****************************************************************************/
 /*
- * handle the fileserver breaking a set of callbacks
+ * allow the fileserver to request callback state (re-)initialisation
  */
-static void _SRXAFSCM_CallBack(struct rxrpc_call *call)
+static void SRXAFSCB_InitCallBackState(struct work_struct *work)
 {
-	struct afs_server *server;
-	size_t count, qty, tmp;
-	int ret = 0, removed;
-
-	_enter("%p{acs=%s}", call, rxrpc_call_states[call->app_call_state]);
-
-	server = afs_server_get_from_peer(call->conn->peer);
-
-	switch (call->app_call_state) {
-		/* we've received the last packet
-		 * - drain all the data from the call and send the reply
-		 */
-	case RXRPC_CSTATE_SRVR_GOT_ARGS:
-		ret = -EBADMSG;
-		qty = call->app_ready_qty;
-		if (qty < 8 || qty > 50 * (6 * 4) + 8)
-			break;
-
-		{
-			struct afs_callback *cb, *pcb;
-			int loop;
-			__be32 *fp, *bp;
-
-			fp = rxrpc_call_alloc_scratch(call, qty);
-
-			/* drag the entire argument block out to the scratch
-			 * space */
-			ret = rxrpc_call_read_data(call, fp, qty, 0);
-			if (ret < 0)
-				break;
-
-			/* and unmarshall the parameter block */
-			ret = -EBADMSG;
-			count = ntohl(*fp++);
-			if (count>AFSCBMAX ||
-			    (count * (3 * 4) + 8 != qty &&
-			     count * (6 * 4) + 8 != qty))
-				break;
-
-			bp = fp + count*3;
-			tmp = ntohl(*bp++);
-			if (tmp > 0 && tmp != count)
-				break;
-			if (tmp == 0)
-				bp = NULL;
-
-			pcb = cb = rxrpc_call_alloc_scratch_s(
-				call, struct afs_callback);
-
-			for (loop = count - 1; loop >= 0; loop--) {
-				pcb->fid.vid	= ntohl(*fp++);
-				pcb->fid.vnode	= ntohl(*fp++);
-				pcb->fid.unique	= ntohl(*fp++);
-				if (bp) {
-					pcb->version	= ntohl(*bp++);
-					pcb->expiry	= ntohl(*bp++);
-					pcb->type	= ntohl(*bp++);
-				}
-				else {
-					pcb->version	= 0;
-					pcb->expiry	= 0;
-					pcb->type	= AFSCM_CB_UNTYPED;
-				}
-				pcb++;
-			}
-
-			/* invoke the actual service routine */
-			ret = SRXAFSCM_CallBack(server, count, cb);
-			if (ret < 0)
-				break;
-		}
+	struct afs_call *call = container_of(work, struct afs_call, work);
 
-		/* send the reply */
-		ret = rxrpc_call_write_data(call, 0, NULL, RXRPC_LAST_PACKET,
-					    GFP_KERNEL, 0, &count);
-		if (ret < 0)
-			break;
-		break;
-
-		/* operation complete */
-	case RXRPC_CSTATE_COMPLETE:
-		call->app_user = NULL;
-		removed = 0;
-		spin_lock(&afscm_calls_lock);
-		if (!list_empty(&call->app_link)) {
-			list_del_init(&call->app_link);
-			removed = 1;
-		}
-		spin_unlock(&afscm_calls_lock);
+	_enter("{%p}", call->server);
 
-		if (removed)
-			rxrpc_put_call(call);
-		break;
+	afs_init_callback_state(call->server);
+	afs_send_empty_reply(call);
+	_leave("");
+}
 
-		/* operation terminated on error */
-	case RXRPC_CSTATE_ERROR:
-		call->app_user = NULL;
-		break;
+/*
+ * deliver request data to a CB.InitCallBackState call
+ */
+static int afs_deliver_cb_init_call_back_state(struct afs_call *call,
+					       struct sk_buff *skb,
+					       bool last)
+{
+	struct afs_server *server;
+	struct in_addr addr;
 
-	default:
-		break;
-	}
+	_enter(",{%u},%d", skb->len, last);
 
-	if (ret < 0)
-		rxrpc_call_abort(call, ret);
+	if (skb->len > 0)
+		return -EBADMSG;
+	if (!last)
+		return 0;
 
-	afs_put_server(server);
+	/* no unmarshalling required */
+	call->state = AFS_CALL_REPLYING;
 
-	_leave(" = %d", ret);
+	/* we'll need the file server record as that tells us which set of
+	 * vnodes to operate upon */
+	memcpy(&addr, &ip_hdr(skb)->saddr, 4);
+	server = afs_find_server(&addr);
+	if (!server)
+		return -ENOTCONN;
+	call->server = server;
 
-} /* end _SRXAFSCM_CallBack() */
+	INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
+	schedule_work(&call->work);
+	return 0;
+}
 
-/*****************************************************************************/
 /*
- * handle the fileserver asking us to initialise our callback state
+ * deliver request data to a CB.InitCallBackState3 call
  */
-static void _SRXAFSCM_InitCallBackState(struct rxrpc_call *call)
+static int afs_deliver_cb_init_call_back_state3(struct afs_call *call,
+						struct sk_buff *skb,
+						bool last)
 {
 	struct afs_server *server;
-	size_t count;
-	int ret = 0, removed;
+	struct in_addr addr;
 
-	_enter("%p{acs=%s}", call, rxrpc_call_states[call->app_call_state]);
+	_enter(",{%u},%d", skb->len, last);
 
-	server = afs_server_get_from_peer(call->conn->peer);
+	if (!last)
+		return 0;
 
-	switch (call->app_call_state) {
-		/* we've received the last packet - drain all the data from the
-		 * call */
-	case RXRPC_CSTATE_SRVR_GOT_ARGS:
-		/* shouldn't be any args */
-		ret = -EBADMSG;
-		break;
-
-		/* send the reply when asked for it */
-	case RXRPC_CSTATE_SRVR_SND_REPLY:
-		/* invoke the actual service routine */
-		ret = SRXAFSCM_InitCallBackState(server);
-		if (ret < 0)
-			break;
-
-		ret = rxrpc_call_write_data(call, 0, NULL, RXRPC_LAST_PACKET,
-					    GFP_KERNEL, 0, &count);
-		if (ret < 0)
-			break;
-		break;
+	/* no unmarshalling required */
+	call->state = AFS_CALL_REPLYING;
 
-		/* operation complete */
-	case RXRPC_CSTATE_COMPLETE:
-		call->app_user = NULL;
-		removed = 0;
-		spin_lock(&afscm_calls_lock);
-		if (!list_empty(&call->app_link)) {
-			list_del_init(&call->app_link);
-			removed = 1;
-		}
-		spin_unlock(&afscm_calls_lock);
+	/* we'll need the file server record as that tells us which set of
+	 * vnodes to operate upon */
+	memcpy(&addr, &ip_hdr(skb)->saddr, 4);
+	server = afs_find_server(&addr);
+	if (!server)
+		return -ENOTCONN;
+	call->server = server;
 
-		if (removed)
-			rxrpc_put_call(call);
-		break;
-
-		/* operation terminated on error */
-	case RXRPC_CSTATE_ERROR:
-		call->app_user = NULL;
-		break;
-
-	default:
-		break;
-	}
-
-	if (ret < 0)
-		rxrpc_call_abort(call, ret);
-
-	afs_put_server(server);
+	INIT_WORK(&call->work, SRXAFSCB_InitCallBackState);
+	schedule_work(&call->work);
+	return 0;
+}
 
-	_leave(" = %d", ret);
+/*
+ * allow the fileserver to see if the cache manager is still alive
+ */
+static void SRXAFSCB_Probe(struct work_struct *work)
+{
+	struct afs_call *call = container_of(work, struct afs_call, work);
 
-} /* end _SRXAFSCM_InitCallBackState() */
+	_enter("");
+	afs_send_empty_reply(call);
+	_leave("");
+}
 
-/*****************************************************************************/
 /*
- * handle a probe from a fileserver
+ * deliver request data to a CB.Probe call
  */
-static void _SRXAFSCM_Probe(struct rxrpc_call *call)
+static int afs_deliver_cb_probe(struct afs_call *call, struct sk_buff *skb,
+				bool last)
 {
-	struct afs_server *server;
-	size_t count;
-	int ret = 0, removed;
-
-	_enter("%p{acs=%s}", call, rxrpc_call_states[call->app_call_state]);
+	_enter(",{%u},%d", skb->len, last);
 
-	server = afs_server_get_from_peer(call->conn->peer);
+	if (skb->len > 0)
+		return -EBADMSG;
+	if (!last)
+		return 0;
 
-	switch (call->app_call_state) {
-		/* we've received the last packet - drain all the data from the
-		 * call */
-	case RXRPC_CSTATE_SRVR_GOT_ARGS:
-		/* shouldn't be any args */
-		ret = -EBADMSG;
-		break;
+	/* no unmarshalling required */
+	call->state = AFS_CALL_REPLYING;
 
-		/* send the reply when asked for it */
-	case RXRPC_CSTATE_SRVR_SND_REPLY:
-		/* invoke the actual service routine */
-		ret = SRXAFSCM_Probe(server);
-		if (ret < 0)
-			break;
-
-		ret = rxrpc_call_write_data(call, 0, NULL, RXRPC_LAST_PACKET,
-					    GFP_KERNEL, 0, &count);
-		if (ret < 0)
-			break;
-		break;
+	INIT_WORK(&call->work, SRXAFSCB_Probe);
+	schedule_work(&call->work);
+	return 0;
+}
 
-		/* operation complete */
-	case RXRPC_CSTATE_COMPLETE:
-		call->app_user = NULL;
-		removed = 0;
-		spin_lock(&afscm_calls_lock);
-		if (!list_empty(&call->app_link)) {
-			list_del_init(&call->app_link);
-			removed = 1;
+/*
+ * allow the fileserver to ask about the cache manager's capabilities
+ */
+static void SRXAFSCB_GetCapabilities(struct work_struct *work)
+{
+	struct afs_interface *ifs;
+	struct afs_call *call = container_of(work, struct afs_call, work);
+	int loop, nifs;
+
+	struct {
+		struct /* InterfaceAddr */ {
+			__be32 nifs;
+			__be32 uuid[11];
+			__be32 ifaddr[32];
+			__be32 netmask[32];
+			__be32 mtu[32];
+		} ia;
+		struct /* Capabilities */ {
+			__be32 capcount;
+			__be32 caps[1];
+		} cap;
+	} reply;
+
+	_enter("");
+
+	nifs = 0;
+	ifs = kcalloc(32, sizeof(*ifs), GFP_KERNEL);
+	if (ifs) {
+		nifs = afs_get_ipv4_interfaces(ifs, 32, false);
+		if (nifs < 0) {
+			kfree(ifs);
+			ifs = NULL;
+			nifs = 0;
 		}
-		spin_unlock(&afscm_calls_lock);
+	}
 
-		if (removed)
-			rxrpc_put_call(call);
-		break;
+	memset(&reply, 0, sizeof(reply));
+	reply.ia.nifs = htonl(nifs);
+
+	reply.ia.uuid[0] = htonl(afs_uuid.time_low);
+	reply.ia.uuid[1] = htonl(afs_uuid.time_mid);
+	reply.ia.uuid[2] = htonl(afs_uuid.time_hi_and_version);
+	reply.ia.uuid[3] = htonl((s8) afs_uuid.clock_seq_hi_and_reserved);
+	reply.ia.uuid[4] = htonl((s8) afs_uuid.clock_seq_low);
+	for (loop = 0; loop < 6; loop++)
+		reply.ia.uuid[loop + 5] = htonl((s8) afs_uuid.node[loop]);
+
+	if (ifs) {
+		for (loop = 0; loop < nifs; loop++) {
+			reply.ia.ifaddr[loop] = ifs[loop].address.s_addr;
+			reply.ia.netmask[loop] = ifs[loop].netmask.s_addr;
+			reply.ia.mtu[loop] = htonl(ifs[loop].mtu);
+		}
+	}
 
-		/* operation terminated on error */
-	case RXRPC_CSTATE_ERROR:
-		call->app_user = NULL;
-		break;
+	reply.cap.capcount = htonl(1);
+	reply.cap.caps[0] = htonl(AFS_CAP_ERROR_TRANSLATION);
+	afs_send_simple_reply(call, &reply, sizeof(reply));
 
-	default:
-		break;
-	}
+	_leave("");
+}
 
-	if (ret < 0)
-		rxrpc_call_abort(call, ret);
+/*
+ * deliver request data to a CB.GetCapabilities call
+ */
+static int afs_deliver_cb_get_capabilities(struct afs_call *call,
+					   struct sk_buff *skb, bool last)
+{
+	_enter(",{%u},%d", skb->len, last);
 
-	afs_put_server(server);
+	if (skb->len > 0)
+		return -EBADMSG;
+	if (!last)
+		return 0;
 
-	_leave(" = %d", ret);
+	/* no unmarshalling required */
+	call->state = AFS_CALL_REPLYING;
 
-} /* end _SRXAFSCM_Probe() */
+	INIT_WORK(&call->work, SRXAFSCB_GetCapabilities);
+	schedule_work(&call->work);
+	return 0;
+}
diff --git a/fs/afs/cmservice.h b/fs/afs/cmservice.h
deleted file mode 100644
index af8d4d689cb2..000000000000
--- a/fs/afs/cmservice.h
+++ /dev/null
@@ -1,29 +0,0 @@
-/* cmservice.h: AFS Cache Manager Service declarations
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_CMSERVICE_H
-#define _LINUX_AFS_CMSERVICE_H
-
-#include <rxrpc/transport.h>
-#include "types.h"
-
-/* cache manager start/stop */
-extern int afscm_start(void);
-extern void afscm_stop(void);
-
-/* cache manager server functions */
-extern int SRXAFSCM_InitCallBackState(struct afs_server *server);
-extern int SRXAFSCM_CallBack(struct afs_server *server,
-			     size_t count,
-			     struct afs_callback callbacks[]);
-extern int SRXAFSCM_Probe(struct afs_server *server);
-
-#endif /* _LINUX_AFS_CMSERVICE_H */
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index b6dc2ebe47a8..dac5b990c0cd 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -15,45 +15,53 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include <linux/smp_lock.h>
-#include "vnode.h"
-#include "volume.h"
-#include <rxrpc/call.h>
-#include "super.h"
+#include <linux/ctype.h>
 #include "internal.h"
 
-static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry,
-				     struct nameidata *nd);
+static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
+				 struct nameidata *nd);
 static int afs_dir_open(struct inode *inode, struct file *file);
-static int afs_dir_readdir(struct file *file, void *dirent, filldir_t filldir);
+static int afs_readdir(struct file *file, void *dirent, filldir_t filldir);
 static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd);
 static int afs_d_delete(struct dentry *dentry);
-static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen,
+static void afs_d_release(struct dentry *dentry);
+static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
 				  loff_t fpos, u64 ino, unsigned dtype);
+static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
+		      struct nameidata *nd);
+static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode);
+static int afs_rmdir(struct inode *dir, struct dentry *dentry);
+static int afs_unlink(struct inode *dir, struct dentry *dentry);
+static int afs_link(struct dentry *from, struct inode *dir,
+		    struct dentry *dentry);
+static int afs_symlink(struct inode *dir, struct dentry *dentry,
+		       const char *content);
+static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		      struct inode *new_dir, struct dentry *new_dentry);
 
 const struct file_operations afs_dir_file_operations = {
 	.open		= afs_dir_open,
-	.readdir	= afs_dir_readdir,
+	.release	= afs_release,
+	.readdir	= afs_readdir,
 };
 
 const struct inode_operations afs_dir_inode_operations = {
-	.lookup		= afs_dir_lookup,
+	.create		= afs_create,
+	.lookup		= afs_lookup,
+	.link		= afs_link,
+	.unlink		= afs_unlink,
+	.symlink	= afs_symlink,
+	.mkdir		= afs_mkdir,
+	.rmdir		= afs_rmdir,
+	.rename		= afs_rename,
+	.permission	= afs_permission,
 	.getattr	= afs_inode_getattr,
-#if 0 /* TODO */
-	.create		= afs_dir_create,
-	.link		= afs_dir_link,
-	.unlink		= afs_dir_unlink,
-	.symlink	= afs_dir_symlink,
-	.mkdir		= afs_dir_mkdir,
-	.rmdir		= afs_dir_rmdir,
-	.mknod		= afs_dir_mknod,
-	.rename		= afs_dir_rename,
-#endif
 };
 
 static struct dentry_operations afs_fs_dentry_operations = {
 	.d_revalidate	= afs_d_revalidate,
 	.d_delete	= afs_d_delete,
+	.d_release	= afs_d_release,
 };
 
 #define AFS_DIR_HASHTBL_SIZE	128
@@ -105,14 +113,13 @@ struct afs_dir_page {
 	union afs_dir_block blocks[PAGE_SIZE / sizeof(union afs_dir_block)];
 };
 
-struct afs_dir_lookup_cookie {
+struct afs_lookup_cookie {
 	struct afs_fid	fid;
 	const char	*name;
 	size_t		nlen;
 	int		found;
 };
 
-/*****************************************************************************/
 /*
  * check that a directory page is valid
  */
@@ -128,9 +135,10 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page)
 	if (qty == 0)
 		goto error;
 
-	if (page->index==0 && qty!=ntohs(dbuf->blocks[0].pagehdr.npages)) {
+	if (page->index == 0 && qty != ntohs(dbuf->blocks[0].pagehdr.npages)) {
 		printk("kAFS: %s(%lu): wrong number of dir blocks %d!=%hu\n",
-		       __FUNCTION__,dir->i_ino,qty,ntohs(dbuf->blocks[0].pagehdr.npages));
+		       __FUNCTION__, dir->i_ino, qty,
+		       ntohs(dbuf->blocks[0].pagehdr.npages));
 		goto error;
 	}
 #endif
@@ -157,13 +165,11 @@ static inline void afs_dir_check_page(struct inode *dir, struct page *page)
 	SetPageChecked(page);
 	return;
 
- error:
+error:
 	SetPageChecked(page);
 	SetPageError(page);
+}
 
-} /* end afs_dir_check_page() */
-
-/*****************************************************************************/
 /*
  * discard a page cached in the pagecache
  */
@@ -171,20 +177,22 @@ static inline void afs_dir_put_page(struct page *page)
 {
 	kunmap(page);
 	page_cache_release(page);
+}
 
-} /* end afs_dir_put_page() */
-
-/*****************************************************************************/
 /*
  * get a page into the pagecache
  */
-static struct page *afs_dir_get_page(struct inode *dir, unsigned long index)
+static struct page *afs_dir_get_page(struct inode *dir, unsigned long index,
+				     struct key *key)
 {
 	struct page *page;
+	struct file file = {
+		.private_data = key,
+	};
 
 	_enter("{%lu},%lu", dir->i_ino, index);
 
-	page = read_mapping_page(dir->i_mapping, index, NULL);
+	page = read_mapping_page(dir->i_mapping, index, &file);
 	if (!IS_ERR(page)) {
 		wait_on_page_locked(page);
 		kmap(page);
@@ -197,12 +205,12 @@ static struct page *afs_dir_get_page(struct inode *dir, unsigned long index)
 	}
 	return page;
 
- fail:
+fail:
 	afs_dir_put_page(page);
+	_leave(" = -EIO");
 	return ERR_PTR(-EIO);
-} /* end afs_dir_get_page() */
+}
 
-/*****************************************************************************/
 /*
  * open an AFS directory file
  */
@@ -213,15 +221,12 @@ static int afs_dir_open(struct inode *inode, struct file *file)
 	BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
 	BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
 
-	if (AFS_FS_I(inode)->flags & AFS_VNODE_DELETED)
+	if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(inode)->flags))
 		return -ENOENT;
 
-	_leave(" = 0");
-	return 0;
-
-} /* end afs_dir_open() */
+	return afs_open(inode, file);
+}
 
-/*****************************************************************************/
 /*
  * deal with one block in an AFS directory
  */
@@ -250,7 +255,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
 		/* skip entries marked unused in the bitmap */
 		if (!(block->pagehdr.bitmap[offset / 8] &
 		      (1 << (offset % 8)))) {
-			_debug("ENT[%Zu.%u]: unused\n",
+			_debug("ENT[%Zu.%u]: unused",
 			       blkoff / sizeof(union afs_dir_block), offset);
 			if (offset >= curr)
 				*fpos = blkoff +
@@ -264,7 +269,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
 			       sizeof(*block) -
 			       offset * sizeof(union afs_dirent));
 
-		_debug("ENT[%Zu.%u]: %s %Zu \"%s\"\n",
+		_debug("ENT[%Zu.%u]: %s %Zu \"%s\"",
 		       blkoff / sizeof(union afs_dir_block), offset,
 		       (offset < curr ? "skip" : "fill"),
 		       nlen, dire->u.name);
@@ -274,7 +279,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
 			if (next >= AFS_DIRENT_PER_BLOCK) {
 				_debug("ENT[%Zu.%u]:"
 				       " %u travelled beyond end dir block"
-				       " (len %u/%Zu)\n",
+				       " (len %u/%Zu)",
 				       blkoff / sizeof(union afs_dir_block),
 				       offset, next, tmp, nlen);
 				return -EIO;
@@ -282,13 +287,13 @@ static int afs_dir_iterate_block(unsigned *fpos,
 			if (!(block->pagehdr.bitmap[next / 8] &
 			      (1 << (next % 8)))) {
 				_debug("ENT[%Zu.%u]:"
-				       " %u unmarked extension (len %u/%Zu)\n",
+				       " %u unmarked extension (len %u/%Zu)",
 				       blkoff / sizeof(union afs_dir_block),
 				       offset, next, tmp, nlen);
 				return -EIO;
 			}
 
-			_debug("ENT[%Zu.%u]: ext %u/%Zu\n",
+			_debug("ENT[%Zu.%u]: ext %u/%Zu",
 			       blkoff / sizeof(union afs_dir_block),
 			       next, tmp, nlen);
 			next++;
@@ -304,7 +309,7 @@ static int afs_dir_iterate_block(unsigned *fpos,
 			      nlen,
 			      blkoff + offset * sizeof(union afs_dirent),
 			      ntohl(dire->u.vnode),
-			      filldir == afs_dir_lookup_filldir ?
+			      filldir == afs_lookup_filldir ?
 			      ntohl(dire->u.unique) : DT_UNKNOWN);
 		if (ret < 0) {
 			_leave(" = 0 [full]");
@@ -316,16 +321,15 @@ static int afs_dir_iterate_block(unsigned *fpos,
 
 	_leave(" = 1 [more]");
 	return 1;
-} /* end afs_dir_iterate_block() */
+}
 
-/*****************************************************************************/
 /*
- * read an AFS directory
+ * iterate through the data blob that lists the contents of an AFS directory
  */
 static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
-			   filldir_t filldir)
+			   filldir_t filldir, struct key *key)
 {
-	union afs_dir_block	*dblock;
+	union afs_dir_block *dblock;
 	struct afs_dir_page *dbuf;
 	struct page *page;
 	unsigned blkoff, limit;
@@ -333,7 +337,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 
 	_enter("{%lu},%u,,", dir->i_ino, *fpos);
 
-	if (AFS_FS_I(dir)->flags & AFS_VNODE_DELETED) {
+	if (test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dir)->flags)) {
 		_leave(" = -ESTALE");
 		return -ESTALE;
 	}
@@ -348,7 +352,7 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 		blkoff = *fpos & ~(sizeof(union afs_dir_block) - 1);
 
 		/* fetch the appropriate page from the directory */
-		page = afs_dir_get_page(dir, blkoff / PAGE_SIZE);
+		page = afs_dir_get_page(dir, blkoff / PAGE_SIZE, key);
 		if (IS_ERR(page)) {
 			ret = PTR_ERR(page);
 			break;
@@ -377,43 +381,50 @@ static int afs_dir_iterate(struct inode *dir, unsigned *fpos, void *cookie,
 		ret = 0;
 	}
 
- out:
+out:
 	_leave(" = %d", ret);
 	return ret;
-} /* end afs_dir_iterate() */
+}
 
-/*****************************************************************************/
 /*
  * read an AFS directory
  */
-static int afs_dir_readdir(struct file *file, void *cookie, filldir_t filldir)
+static int afs_readdir(struct file *file, void *cookie, filldir_t filldir)
 {
 	unsigned fpos;
 	int ret;
 
-	_enter("{%Ld,{%lu}}", file->f_pos, file->f_path.dentry->d_inode->i_ino);
+	_enter("{%Ld,{%lu}}",
+	       file->f_pos, file->f_path.dentry->d_inode->i_ino);
+
+	ASSERT(file->private_data != NULL);
 
 	fpos = file->f_pos;
-	ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos, cookie, filldir);
+	ret = afs_dir_iterate(file->f_path.dentry->d_inode, &fpos,
+			      cookie, filldir, file->private_data);
 	file->f_pos = fpos;
 
 	_leave(" = %d", ret);
 	return ret;
-} /* end afs_dir_readdir() */
+}
 
-/*****************************************************************************/
 /*
  * search the directory for a name
  * - if afs_dir_iterate_block() spots this function, it'll pass the FID
  *   uniquifier through dtype
  */
-static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen,
-				  loff_t fpos, u64 ino, unsigned dtype)
+static int afs_lookup_filldir(void *_cookie, const char *name, int nlen,
+			      loff_t fpos, u64 ino, unsigned dtype)
 {
-	struct afs_dir_lookup_cookie *cookie = _cookie;
+	struct afs_lookup_cookie *cookie = _cookie;
 
-	_enter("{%s,%Zu},%s,%u,,%lu,%u",
-	       cookie->name, cookie->nlen, name, nlen, ino, dtype);
+	_enter("{%s,%Zu},%s,%u,,%llu,%u",
+	       cookie->name, cookie->nlen, name, nlen,
+	       (unsigned long long) ino, dtype);
+
+	/* insanity checks first */
+	BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
+	BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
 
 	if (cookie->nlen != nlen || memcmp(cookie->name, name, nlen) != 0) {
 		_leave(" = 0 [no]");
@@ -426,216 +437,254 @@ static int afs_dir_lookup_filldir(void *_cookie, const char *name, int nlen,
 
 	_leave(" = -1 [found]");
 	return -1;
-} /* end afs_dir_lookup_filldir() */
+}
 
-/*****************************************************************************/
 /*
- * look up an entry in a directory
+ * do a lookup in a directory
+ * - just returns the FID the dentry name maps to if found
  */
-static struct dentry *afs_dir_lookup(struct inode *dir, struct dentry *dentry,
-				     struct nameidata *nd)
+static int afs_do_lookup(struct inode *dir, struct dentry *dentry,
+			 struct afs_fid *fid, struct key *key)
 {
-	struct afs_dir_lookup_cookie cookie;
+	struct afs_lookup_cookie cookie;
 	struct afs_super_info *as;
+	unsigned fpos;
+	int ret;
+
+	_enter("{%lu},%p{%s},", dir->i_ino, dentry, dentry->d_name.name);
+
+	as = dir->i_sb->s_fs_info;
+
+	/* search the directory */
+	cookie.name	= dentry->d_name.name;
+	cookie.nlen	= dentry->d_name.len;
+	cookie.fid.vid	= as->volume->vid;
+	cookie.found	= 0;
+
+	fpos = 0;
+	ret = afs_dir_iterate(dir, &fpos, &cookie, afs_lookup_filldir,
+			      key);
+	if (ret < 0) {
+		_leave(" = %d [iter]", ret);
+		return ret;
+	}
+
+	ret = -ENOENT;
+	if (!cookie.found) {
+		_leave(" = -ENOENT [not found]");
+		return -ENOENT;
+	}
+
+	*fid = cookie.fid;
+	_leave(" = 0 { vn=%u u=%u }", fid->vnode, fid->unique);
+	return 0;
+}
+
+/*
+ * look up an entry in a directory
+ */
+static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
+				 struct nameidata *nd)
+{
 	struct afs_vnode *vnode;
+	struct afs_fid fid;
 	struct inode *inode;
-	unsigned fpos;
+	struct key *key;
 	int ret;
 
-	_enter("{%lu},%p{%s}", dir->i_ino, dentry, dentry->d_name.name);
+	vnode = AFS_FS_I(dir);
 
-	/* insanity checks first */
-	BUILD_BUG_ON(sizeof(union afs_dir_block) != 2048);
-	BUILD_BUG_ON(sizeof(union afs_dirent) != 32);
+	_enter("{%x:%d},%p{%s},",
+	       vnode->fid.vid, vnode->fid.vnode, dentry, dentry->d_name.name);
+
+	ASSERTCMP(dentry->d_inode, ==, NULL);
 
 	if (dentry->d_name.len > 255) {
 		_leave(" = -ENAMETOOLONG");
 		return ERR_PTR(-ENAMETOOLONG);
 	}
 
-	vnode = AFS_FS_I(dir);
-	if (vnode->flags & AFS_VNODE_DELETED) {
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
 		_leave(" = -ESTALE");
 		return ERR_PTR(-ESTALE);
 	}
 
-	as = dir->i_sb->s_fs_info;
-
-	/* search the directory */
-	cookie.name	= dentry->d_name.name;
-	cookie.nlen	= dentry->d_name.len;
-	cookie.fid.vid	= as->volume->vid;
-	cookie.found	= 0;
+	key = afs_request_key(vnode->volume->cell);
+	if (IS_ERR(key)) {
+		_leave(" = %ld [key]", PTR_ERR(key));
+		return ERR_PTR(PTR_ERR(key));
+	}
 
-	fpos = 0;
-	ret = afs_dir_iterate(dir, &fpos, &cookie, afs_dir_lookup_filldir);
+	ret = afs_validate(vnode, key);
 	if (ret < 0) {
-		_leave(" = %d", ret);
+		key_put(key);
+		_leave(" = %d [val]", ret);
 		return ERR_PTR(ret);
 	}
 
-	ret = -ENOENT;
-	if (!cookie.found) {
-		_leave(" = %d", ret);
+	ret = afs_do_lookup(dir, dentry, &fid, key);
+	if (ret < 0) {
+		key_put(key);
+		if (ret == -ENOENT) {
+			d_add(dentry, NULL);
+			_leave(" = NULL [negative]");
+			return NULL;
+		}
+		_leave(" = %d [do]", ret);
 		return ERR_PTR(ret);
 	}
+	dentry->d_fsdata = (void *)(unsigned long) vnode->status.data_version;
 
 	/* instantiate the dentry */
-	ret = afs_iget(dir->i_sb, &cookie.fid, &inode);
-	if (ret < 0) {
-		_leave(" = %d", ret);
-		return ERR_PTR(ret);
+	inode = afs_iget(dir->i_sb, key, &fid, NULL, NULL);
+	key_put(key);
+	if (IS_ERR(inode)) {
+		_leave(" = %ld", PTR_ERR(inode));
+		return ERR_PTR(PTR_ERR(inode));
 	}
 
 	dentry->d_op = &afs_fs_dentry_operations;
-	dentry->d_fsdata = (void *) (unsigned long) vnode->status.version;
 
 	d_add(dentry, inode);
 	_leave(" = 0 { vn=%u u=%u } -> { ino=%lu v=%lu }",
-	       cookie.fid.vnode,
-	       cookie.fid.unique,
+	       fid.vnode,
+	       fid.unique,
 	       dentry->d_inode->i_ino,
 	       dentry->d_inode->i_version);
 
 	return NULL;
-} /* end afs_dir_lookup() */
+}
 
-/*****************************************************************************/
 /*
  * check that a dentry lookup hit has found a valid entry
  * - NOTE! the hit can be a negative hit too, so we can't assume we have an
  *   inode
- * (derived from nfs_lookup_revalidate)
  */
 static int afs_d_revalidate(struct dentry *dentry, struct nameidata *nd)
 {
-	struct afs_dir_lookup_cookie cookie;
+	struct afs_vnode *vnode, *dir;
+	struct afs_fid fid;
 	struct dentry *parent;
-	struct inode *inode, *dir;
-	unsigned fpos;
+	struct key *key;
+	void *dir_version;
 	int ret;
 
-	_enter("{sb=%p n=%s},", dentry->d_sb, dentry->d_name.name);
+	vnode = AFS_FS_I(dentry->d_inode);
 
-	/* lock down the parent dentry so we can peer at it */
-	parent = dget_parent(dentry->d_parent);
+	if (dentry->d_inode)
+		_enter("{v={%x:%u} n=%s fl=%lx},",
+		       vnode->fid.vid, vnode->fid.vnode, dentry->d_name.name,
+		       vnode->flags);
+	else
+		_enter("{neg n=%s}", dentry->d_name.name);
 
-	dir = parent->d_inode;
-	inode = dentry->d_inode;
+	key = afs_request_key(AFS_FS_S(dentry->d_sb)->volume->cell);
+	if (IS_ERR(key))
+		key = NULL;
 
-	/* handle a negative dentry */
-	if (!inode)
+	/* lock down the parent dentry so we can peer at it */
+	parent = dget_parent(dentry);
+	if (!parent->d_inode)
 		goto out_bad;
 
-	/* handle a bad inode */
-	if (is_bad_inode(inode)) {
-		printk("kAFS: afs_d_revalidate: %s/%s has bad inode\n",
-		       dentry->d_parent->d_name.name, dentry->d_name.name);
-		goto out_bad;
-	}
+	dir = AFS_FS_I(parent->d_inode);
 
-	/* force a full look up if the parent directory changed since last the
-	 * server was consulted
-	 * - otherwise this inode must still exist, even if the inode details
-	 *   themselves have changed
-	 */
-	if (AFS_FS_I(dir)->flags & AFS_VNODE_CHANGED)
-		afs_vnode_fetch_status(AFS_FS_I(dir));
+	/* validate the parent directory */
+	if (test_bit(AFS_VNODE_MODIFIED, &dir->flags))
+		afs_validate(dir, key);
 
-	if (AFS_FS_I(dir)->flags & AFS_VNODE_DELETED) {
+	if (test_bit(AFS_VNODE_DELETED, &dir->flags)) {
 		_debug("%s: parent dir deleted", dentry->d_name.name);
 		goto out_bad;
 	}
 
-	if (AFS_FS_I(inode)->flags & AFS_VNODE_DELETED) {
-		_debug("%s: file already deleted", dentry->d_name.name);
-		goto out_bad;
-	}
-
-	if ((unsigned long) dentry->d_fsdata !=
-	    (unsigned long) AFS_FS_I(dir)->status.version) {
-		_debug("%s: parent changed %lu -> %u",
-		       dentry->d_name.name,
-		       (unsigned long) dentry->d_fsdata,
-		       (unsigned) AFS_FS_I(dir)->status.version);
+	dir_version = (void *) (unsigned long) dir->status.data_version;
+	if (dentry->d_fsdata == dir_version)
+		goto out_valid; /* the dir contents are unchanged */
 
-		/* search the directory for this vnode */
-		cookie.name	= dentry->d_name.name;
-		cookie.nlen	= dentry->d_name.len;
-		cookie.fid.vid	= AFS_FS_I(inode)->volume->vid;
-		cookie.found	= 0;
+	_debug("dir modified");
 
-		fpos = 0;
-		ret = afs_dir_iterate(dir, &fpos, &cookie,
-				      afs_dir_lookup_filldir);
-		if (ret < 0) {
-			_debug("failed to iterate dir %s: %d",
-			       parent->d_name.name, ret);
+	/* search the directory for this vnode */
+	ret = afs_do_lookup(&dir->vfs_inode, dentry, &fid, key);
+	switch (ret) {
+	case 0:
+		/* the filename maps to something */
+		if (!dentry->d_inode)
+			goto out_bad;
+		if (is_bad_inode(dentry->d_inode)) {
+			printk("kAFS: afs_d_revalidate: %s/%s has bad inode\n",
+			       parent->d_name.name, dentry->d_name.name);
 			goto out_bad;
-		}
-
-		if (!cookie.found) {
-			_debug("%s: dirent not found", dentry->d_name.name);
-			goto not_found;
 		}
 
 		/* if the vnode ID has changed, then the dirent points to a
 		 * different file */
-		if (cookie.fid.vnode != AFS_FS_I(inode)->fid.vnode) {
-			_debug("%s: dirent changed", dentry->d_name.name);
+		if (fid.vnode != vnode->fid.vnode) {
+			_debug("%s: dirent changed [%u != %u]",
+			       dentry->d_name.name, fid.vnode,
+			       vnode->fid.vnode);
 			goto not_found;
 		}
 
 		/* if the vnode ID uniqifier has changed, then the file has
-		 * been deleted */
-		if (cookie.fid.unique != AFS_FS_I(inode)->fid.unique) {
+		 * been deleted and replaced, and the original vnode ID has
+		 * been reused */
+		if (fid.unique != vnode->fid.unique) {
 			_debug("%s: file deleted (uq %u -> %u I:%lu)",
-			       dentry->d_name.name,
-			       cookie.fid.unique,
-			       AFS_FS_I(inode)->fid.unique,
-			       inode->i_version);
-			spin_lock(&AFS_FS_I(inode)->lock);
-			AFS_FS_I(inode)->flags |= AFS_VNODE_DELETED;
-			spin_unlock(&AFS_FS_I(inode)->lock);
-			invalidate_remote_inode(inode);
-			goto out_bad;
+			       dentry->d_name.name, fid.unique,
+			       vnode->fid.unique, dentry->d_inode->i_version);
+			spin_lock(&vnode->lock);
+			set_bit(AFS_VNODE_DELETED, &vnode->flags);
+			spin_unlock(&vnode->lock);
+			goto not_found;
 		}
+		goto out_valid;
+
+	case -ENOENT:
+		/* the filename is unknown */
+		_debug("%s: dirent not found", dentry->d_name.name);
+		if (dentry->d_inode)
+			goto not_found;
+		goto out_valid;
 
-		dentry->d_fsdata =
-			(void *) (unsigned long) AFS_FS_I(dir)->status.version;
+	default:
+		_debug("failed to iterate dir %s: %d",
+		       parent->d_name.name, ret);
+		goto out_bad;
 	}
 
- out_valid:
+out_valid:
+	dentry->d_fsdata = dir_version;
+out_skip:
 	dput(parent);
+	key_put(key);
 	_leave(" = 1 [valid]");
 	return 1;
 
 	/* the dirent, if it exists, now points to a different vnode */
- not_found:
+not_found:
 	spin_lock(&dentry->d_lock);
 	dentry->d_flags |= DCACHE_NFSFS_RENAMED;
 	spin_unlock(&dentry->d_lock);
 
- out_bad:
-	if (inode) {
+out_bad:
+	if (dentry->d_inode) {
 		/* don't unhash if we have submounts */
 		if (have_submounts(dentry))
-			goto out_valid;
+			goto out_skip;
 	}
 
-	shrink_dcache_parent(dentry);
-
 	_debug("dropping dentry %s/%s",
-	       dentry->d_parent->d_name.name, dentry->d_name.name);
+	       parent->d_name.name, dentry->d_name.name);
+	shrink_dcache_parent(dentry);
 	d_drop(dentry);
-
 	dput(parent);
+	key_put(key);
 
 	_leave(" = 0 [bad]");
 	return 0;
-} /* end afs_d_revalidate() */
+}
 
-/*****************************************************************************/
 /*
  * allow the VFS to enquire as to whether a dentry should be unhashed (mustn't
  * sleep)
@@ -649,15 +698,444 @@ static int afs_d_delete(struct dentry *dentry)
 	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
 		goto zap;
 
-	if (dentry->d_inode) {
-		if (AFS_FS_I(dentry->d_inode)->flags & AFS_VNODE_DELETED)
+	if (dentry->d_inode &&
+	    test_bit(AFS_VNODE_DELETED, &AFS_FS_I(dentry->d_inode)->flags))
 			goto zap;
-	}
 
 	_leave(" = 0 [keep]");
 	return 0;
 
- zap:
+zap:
 	_leave(" = 1 [zap]");
 	return 1;
-} /* end afs_d_delete() */
+}
+
+/*
+ * handle dentry release
+ */
+static void afs_d_release(struct dentry *dentry)
+{
+	_enter("%s", dentry->d_name.name);
+}
+
+/*
+ * create a directory on an AFS filesystem
+ */
+static int afs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct afs_file_status status;
+	struct afs_callback cb;
+	struct afs_server *server;
+	struct afs_vnode *dvnode, *vnode;
+	struct afs_fid fid;
+	struct inode *inode;
+	struct key *key;
+	int ret;
+
+	dvnode = AFS_FS_I(dir);
+
+	_enter("{%x:%d},{%s},%o",
+	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
+
+	ret = -ENAMETOOLONG;
+	if (dentry->d_name.len > 255)
+		goto error;
+
+	key = afs_request_key(dvnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error;
+	}
+
+	mode |= S_IFDIR;
+	ret = afs_vnode_create(dvnode, key, dentry->d_name.name,
+			       mode, &fid, &status, &cb, &server);
+	if (ret < 0)
+		goto mkdir_error;
+
+	inode = afs_iget(dir->i_sb, key, &fid, &status, &cb);
+	if (IS_ERR(inode)) {
+		/* ENOMEM at a really inconvenient time - just abandon the new
+		 * directory on the server */
+		ret = PTR_ERR(inode);
+		goto iget_error;
+	}
+
+	/* apply the status report we've got for the new vnode */
+	vnode = AFS_FS_I(inode);
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+	afs_vnode_finalise_status_update(vnode, server);
+	afs_put_server(server);
+
+	d_instantiate(dentry, inode);
+	if (d_unhashed(dentry)) {
+		_debug("not hashed");
+		d_rehash(dentry);
+	}
+	key_put(key);
+	_leave(" = 0");
+	return 0;
+
+iget_error:
+	afs_put_server(server);
+mkdir_error:
+	key_put(key);
+error:
+	d_drop(dentry);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * remove a directory from an AFS filesystem
+ */
+static int afs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct afs_vnode *dvnode, *vnode;
+	struct key *key;
+	int ret;
+
+	dvnode = AFS_FS_I(dir);
+
+	_enter("{%x:%d},{%s}",
+	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
+
+	ret = -ENAMETOOLONG;
+	if (dentry->d_name.len > 255)
+		goto error;
+
+	key = afs_request_key(dvnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error;
+	}
+
+	ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, true);
+	if (ret < 0)
+		goto rmdir_error;
+
+	if (dentry->d_inode) {
+		vnode = AFS_FS_I(dentry->d_inode);
+		clear_nlink(&vnode->vfs_inode);
+		set_bit(AFS_VNODE_DELETED, &vnode->flags);
+		afs_discard_callback_on_delete(vnode);
+	}
+
+	key_put(key);
+	_leave(" = 0");
+	return 0;
+
+rmdir_error:
+	key_put(key);
+error:
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * remove a file from an AFS filesystem
+ */
+static int afs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct afs_vnode *dvnode, *vnode;
+	struct key *key;
+	int ret;
+
+	dvnode = AFS_FS_I(dir);
+
+	_enter("{%x:%d},{%s}",
+	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name);
+
+	ret = -ENAMETOOLONG;
+	if (dentry->d_name.len > 255)
+		goto error;
+
+	key = afs_request_key(dvnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error;
+	}
+
+	if (dentry->d_inode) {
+		vnode = AFS_FS_I(dentry->d_inode);
+
+		/* make sure we have a callback promise on the victim */
+		ret = afs_validate(vnode, key);
+		if (ret < 0)
+			goto error;
+	}
+
+	ret = afs_vnode_remove(dvnode, key, dentry->d_name.name, false);
+	if (ret < 0)
+		goto remove_error;
+
+	if (dentry->d_inode) {
+		/* if the file wasn't deleted due to excess hard links, the
+		 * fileserver will break the callback promise on the file - if
+		 * it had one - before it returns to us, and if it was deleted,
+		 * it won't
+		 *
+		 * however, if we didn't have a callback promise outstanding,
+		 * or it was outstanding on a different server, then it won't
+		 * break it either...
+		 */
+		vnode = AFS_FS_I(dentry->d_inode);
+		if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+			_debug("AFS_VNODE_DELETED");
+		if (test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
+			_debug("AFS_VNODE_CB_BROKEN");
+		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+		ret = afs_validate(vnode, key);
+		_debug("nlink %d [val %d]", vnode->vfs_inode.i_nlink, ret);
+	}
+
+	key_put(key);
+	_leave(" = 0");
+	return 0;
+
+remove_error:
+	key_put(key);
+error:
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * create a regular file on an AFS filesystem
+ */
+static int afs_create(struct inode *dir, struct dentry *dentry, int mode,
+		      struct nameidata *nd)
+{
+	struct afs_file_status status;
+	struct afs_callback cb;
+	struct afs_server *server;
+	struct afs_vnode *dvnode, *vnode;
+	struct afs_fid fid;
+	struct inode *inode;
+	struct key *key;
+	int ret;
+
+	dvnode = AFS_FS_I(dir);
+
+	_enter("{%x:%d},{%s},%o,",
+	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name, mode);
+
+	ret = -ENAMETOOLONG;
+	if (dentry->d_name.len > 255)
+		goto error;
+
+	key = afs_request_key(dvnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error;
+	}
+
+	mode |= S_IFREG;
+	ret = afs_vnode_create(dvnode, key, dentry->d_name.name,
+			       mode, &fid, &status, &cb, &server);
+	if (ret < 0)
+		goto create_error;
+
+	inode = afs_iget(dir->i_sb, key, &fid, &status, &cb);
+	if (IS_ERR(inode)) {
+		/* ENOMEM at a really inconvenient time - just abandon the new
+		 * directory on the server */
+		ret = PTR_ERR(inode);
+		goto iget_error;
+	}
+
+	/* apply the status report we've got for the new vnode */
+	vnode = AFS_FS_I(inode);
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+	afs_vnode_finalise_status_update(vnode, server);
+	afs_put_server(server);
+
+	d_instantiate(dentry, inode);
+	if (d_unhashed(dentry)) {
+		_debug("not hashed");
+		d_rehash(dentry);
+	}
+	key_put(key);
+	_leave(" = 0");
+	return 0;
+
+iget_error:
+	afs_put_server(server);
+create_error:
+	key_put(key);
+error:
+	d_drop(dentry);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * create a hard link between files in an AFS filesystem
+ */
+static int afs_link(struct dentry *from, struct inode *dir,
+		    struct dentry *dentry)
+{
+	struct afs_vnode *dvnode, *vnode;
+	struct key *key;
+	int ret;
+
+	vnode = AFS_FS_I(from->d_inode);
+	dvnode = AFS_FS_I(dir);
+
+	_enter("{%x:%d},{%x:%d},{%s}",
+	       vnode->fid.vid, vnode->fid.vnode,
+	       dvnode->fid.vid, dvnode->fid.vnode,
+	       dentry->d_name.name);
+
+	ret = -ENAMETOOLONG;
+	if (dentry->d_name.len > 255)
+		goto error;
+
+	key = afs_request_key(dvnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error;
+	}
+
+	ret = afs_vnode_link(dvnode, vnode, key, dentry->d_name.name);
+	if (ret < 0)
+		goto link_error;
+
+	atomic_inc(&vnode->vfs_inode.i_count);
+	d_instantiate(dentry, &vnode->vfs_inode);
+	key_put(key);
+	_leave(" = 0");
+	return 0;
+
+link_error:
+	key_put(key);
+error:
+	d_drop(dentry);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * create a symlink in an AFS filesystem
+ */
+static int afs_symlink(struct inode *dir, struct dentry *dentry,
+		       const char *content)
+{
+	struct afs_file_status status;
+	struct afs_server *server;
+	struct afs_vnode *dvnode, *vnode;
+	struct afs_fid fid;
+	struct inode *inode;
+	struct key *key;
+	int ret;
+
+	dvnode = AFS_FS_I(dir);
+
+	_enter("{%x:%d},{%s},%s",
+	       dvnode->fid.vid, dvnode->fid.vnode, dentry->d_name.name,
+	       content);
+
+	ret = -ENAMETOOLONG;
+	if (dentry->d_name.len > 255)
+		goto error;
+
+	ret = -EINVAL;
+	if (strlen(content) > 1023)
+		goto error;
+
+	key = afs_request_key(dvnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error;
+	}
+
+	ret = afs_vnode_symlink(dvnode, key, dentry->d_name.name, content,
+				&fid, &status, &server);
+	if (ret < 0)
+		goto create_error;
+
+	inode = afs_iget(dir->i_sb, key, &fid, &status, NULL);
+	if (IS_ERR(inode)) {
+		/* ENOMEM at a really inconvenient time - just abandon the new
+		 * directory on the server */
+		ret = PTR_ERR(inode);
+		goto iget_error;
+	}
+
+	/* apply the status report we've got for the new vnode */
+	vnode = AFS_FS_I(inode);
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+	afs_vnode_finalise_status_update(vnode, server);
+	afs_put_server(server);
+
+	d_instantiate(dentry, inode);
+	if (d_unhashed(dentry)) {
+		_debug("not hashed");
+		d_rehash(dentry);
+	}
+	key_put(key);
+	_leave(" = 0");
+	return 0;
+
+iget_error:
+	afs_put_server(server);
+create_error:
+	key_put(key);
+error:
+	d_drop(dentry);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * rename a file in an AFS filesystem and/or move it between directories
+ */
+static int afs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		      struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct afs_vnode *orig_dvnode, *new_dvnode, *vnode;
+	struct key *key;
+	int ret;
+
+	vnode = AFS_FS_I(old_dentry->d_inode);
+	orig_dvnode = AFS_FS_I(old_dir);
+	new_dvnode = AFS_FS_I(new_dir);
+
+	_enter("{%x:%d},{%x:%d},{%x:%d},{%s}",
+	       orig_dvnode->fid.vid, orig_dvnode->fid.vnode,
+	       vnode->fid.vid, vnode->fid.vnode,
+	       new_dvnode->fid.vid, new_dvnode->fid.vnode,
+	       new_dentry->d_name.name);
+
+	ret = -ENAMETOOLONG;
+	if (new_dentry->d_name.len > 255)
+		goto error;
+
+	key = afs_request_key(orig_dvnode->volume->cell);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto error;
+	}
+
+	ret = afs_vnode_rename(orig_dvnode, new_dvnode, key,
+			       old_dentry->d_name.name,
+			       new_dentry->d_name.name);
+	if (ret < 0)
+		goto rename_error;
+	key_put(key);
+	_leave(" = 0");
+	return 0;
+
+rename_error:
+	key_put(key);
+error:
+	d_drop(new_dentry);
+	_leave(" = %d", ret);
+	return ret;
+}
diff --git a/fs/afs/errors.h b/fs/afs/errors.h
deleted file mode 100644
index 574d94ac8d05..000000000000
--- a/fs/afs/errors.h
+++ /dev/null
@@ -1,34 +0,0 @@
-/* errors.h: AFS abort/error codes
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_ERRORS_H
-#define _LINUX_AFS_ERRORS_H
-
-#include "types.h"
-
-/* file server abort codes */
-typedef enum {
-	VSALVAGE	= 101,	/* volume needs salvaging */
-	VNOVNODE	= 102,	/* no such file/dir (vnode) */
-	VNOVOL		= 103,	/* no such volume or volume unavailable */
-	VVOLEXISTS	= 104,	/* volume name already exists */
-	VNOSERVICE	= 105,	/* volume not currently in service */
-	VOFFLINE	= 106,	/* volume is currently offline (more info available [VVL-spec]) */
-	VONLINE		= 107,	/* volume is already online */
-	VDISKFULL	= 108,	/* disk partition is full */
-	VOVERQUOTA	= 109,	/* volume's maximum quota exceeded */
-	VBUSY		= 110,	/* volume is temporarily unavailable */
-	VMOVED		= 111,	/* volume moved to new server - ask this FS where */
-} afs_rxfs_abort_t;
-
-extern int afs_abort_to_error(int abortcode);
-
-#endif /* _LINUX_AFS_ERRORS_H */
diff --git a/fs/afs/file.c b/fs/afs/file.c
index b17634541f67..ae256498f4f7 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -1,6 +1,6 @@
-/* file.c: AFS filesystem file handling
+/* AFS filesystem file handling
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -15,22 +15,25 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include "volume.h"
-#include "vnode.h"
-#include <rxrpc/call.h>
 #include "internal.h"
 
-#if 0
-static int afs_file_open(struct inode *inode, struct file *file);
-static int afs_file_release(struct inode *inode, struct file *file);
-#endif
-
 static int afs_file_readpage(struct file *file, struct page *page);
 static void afs_file_invalidatepage(struct page *page, unsigned long offset);
 static int afs_file_releasepage(struct page *page, gfp_t gfp_flags);
 
+const struct file_operations afs_file_operations = {
+	.open		= afs_open,
+	.release	= afs_release,
+	.llseek		= generic_file_llseek,
+	.read		= do_sync_read,
+	.aio_read	= generic_file_aio_read,
+	.mmap		= generic_file_readonly_mmap,
+	.sendfile	= generic_file_sendfile,
+};
+
 const struct inode_operations afs_file_inode_operations = {
 	.getattr	= afs_inode_getattr,
+	.permission	= afs_permission,
 };
 
 const struct address_space_operations afs_fs_aops = {
@@ -40,7 +43,48 @@ const struct address_space_operations afs_fs_aops = {
 	.invalidatepage	= afs_file_invalidatepage,
 };
 
-/*****************************************************************************/
+/*
+ * open an AFS file or directory and attach a key to it
+ */
+int afs_open(struct inode *inode, struct file *file)
+{
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	struct key *key;
+	int ret;
+
+	_enter("{%x:%x},", vnode->fid.vid, vnode->fid.vnode);
+
+	key = afs_request_key(vnode->volume->cell);
+	if (IS_ERR(key)) {
+		_leave(" = %ld [key]", PTR_ERR(key));
+		return PTR_ERR(key);
+	}
+
+	ret = afs_validate(vnode, key);
+	if (ret < 0) {
+		_leave(" = %d [val]", ret);
+		return ret;
+	}
+
+	file->private_data = key;
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * release an AFS file or directory and discard its key
+ */
+int afs_release(struct inode *inode, struct file *file)
+{
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+
+	_enter("{%x:%x},", vnode->fid.vid, vnode->fid.vnode);
+
+	key_put(file->private_data);
+	_leave(" = 0");
+	return 0;
+}
+
 /*
  * deal with notification that a page was read from the cache
  */
@@ -58,10 +102,9 @@ static void afs_file_readpage_read_complete(void *cookie_data,
 		SetPageUptodate(page);
 	unlock_page(page);
 
-} /* end afs_file_readpage_read_complete() */
+}
 #endif
 
-/*****************************************************************************/
 /*
  * deal with notification that a page was written to the cache
  */
@@ -74,41 +117,38 @@ static void afs_file_readpage_write_complete(void *cookie_data,
 	_enter("%p,%p,%p,%d", cookie_data, page, data, error);
 
 	unlock_page(page);
-
-} /* end afs_file_readpage_write_complete() */
+}
 #endif
 
-/*****************************************************************************/
 /*
  * AFS read page from file (or symlink)
  */
 static int afs_file_readpage(struct file *file, struct page *page)
 {
-	struct afs_rxfs_fetch_descriptor desc;
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_page *pageio;
-#endif
 	struct afs_vnode *vnode;
 	struct inode *inode;
+	struct key *key;
+	size_t len;
+	off_t offset;
 	int ret;
 
 	inode = page->mapping->host;
 
-	_enter("{%lu},{%lu}", inode->i_ino, page->index);
+	ASSERT(file != NULL);
+	key = file->private_data;
+	ASSERT(key != NULL);
+
+	_enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
 
 	vnode = AFS_FS_I(inode);
 
 	BUG_ON(!PageLocked(page));
 
 	ret = -ESTALE;
-	if (vnode->flags & AFS_VNODE_DELETED)
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 		goto error;
 
 #ifdef AFS_CACHING_SUPPORT
-	ret = cachefs_page_get_private(page, &pageio, GFP_NOIO);
-	if (ret < 0)
-		goto error;
-
 	/* is it cached? */
 	ret = cachefs_read_or_alloc_page(vnode->cache,
 					 page,
@@ -132,26 +172,19 @@ static int afs_file_readpage(struct file *file, struct page *page)
 	case -ENOBUFS:
 	case -ENODATA:
 	default:
-		desc.fid	= vnode->fid;
-		desc.offset	= page->index << PAGE_CACHE_SHIFT;
-		desc.size	= min((size_t) (inode->i_size - desc.offset),
-				      (size_t) PAGE_SIZE);
-		desc.buffer	= kmap(page);
-
-		clear_page(desc.buffer);
+		offset = page->index << PAGE_CACHE_SHIFT;
+		len = min_t(size_t, i_size_read(inode) - offset, PAGE_SIZE);
 
 		/* read the contents of the file from the server into the
 		 * page */
-		ret = afs_vnode_fetch_data(vnode, &desc);
-		kunmap(page);
+		ret = afs_vnode_fetch_data(vnode, key, offset, len, page);
 		if (ret < 0) {
-			if (ret==-ENOENT) {
+			if (ret == -ENOENT) {
 				_debug("got NOENT from server"
 				       " - marking file deleted and stale");
-				vnode->flags |= AFS_VNODE_DELETED;
+				set_bit(AFS_VNODE_DELETED, &vnode->flags);
 				ret = -ESTALE;
 			}
-
 #ifdef AFS_CACHING_SUPPORT
 			cachefs_uncache_page(vnode->cache, page);
 #endif
@@ -178,16 +211,13 @@ static int afs_file_readpage(struct file *file, struct page *page)
 	_leave(" = 0");
 	return 0;
 
- error:
+error:
 	SetPageError(page);
 	unlock_page(page);
-
 	_leave(" = %d", ret);
 	return ret;
+}
 
-} /* end afs_file_readpage() */
-
-/*****************************************************************************/
 /*
  * get a page cookie for the specified page
  */
@@ -202,10 +232,9 @@ int afs_cache_get_page_cookie(struct page *page,
 
 	_leave(" = %d", ret);
 	return ret;
-} /* end afs_cache_get_page_cookie() */
+}
 #endif
 
-/*****************************************************************************/
 /*
  * invalidate part or all of a page
  */
@@ -240,9 +269,8 @@ static void afs_file_invalidatepage(struct page *page, unsigned long offset)
 	}
 
 	_leave(" = %d", ret);
-} /* end afs_file_invalidatepage() */
+}
 
-/*****************************************************************************/
 /*
  * release a page and cleanup its private data
  */
@@ -267,4 +295,4 @@ static int afs_file_releasepage(struct page *page, gfp_t gfp_flags)
 
 	_leave(" = 0");
 	return 0;
-} /* end afs_file_releasepage() */
+}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 61bc371532ab..2393d2a08d79 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -1,6 +1,6 @@
-/* fsclient.c: AFS File Server client stubs
+/* AFS File Server client stubs
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -11,827 +11,927 @@
 
 #include <linux/init.h>
 #include <linux/sched.h>
-#include <rxrpc/rxrpc.h>
-#include <rxrpc/transport.h>
-#include <rxrpc/connection.h>
-#include <rxrpc/call.h>
-#include "fsclient.h"
-#include "cmservice.h"
-#include "vnode.h"
-#include "server.h"
-#include "errors.h"
+#include <linux/circ_buf.h>
 #include "internal.h"
+#include "afs_fs.h"
 
-#define FSFETCHSTATUS		132	/* AFS Fetch file status */
-#define FSFETCHDATA		130	/* AFS Fetch file data */
-#define FSGIVEUPCALLBACKS	147	/* AFS Discard callback promises */
-#define FSGETVOLUMEINFO		148	/* AFS Get root volume information */
-#define FSGETROOTVOLUME		151	/* AFS Get root volume name */
-#define FSLOOKUP		161	/* AFS lookup file in directory */
-
-/*****************************************************************************/
 /*
- * map afs abort codes to/from Linux error codes
- * - called with call->lock held
+ * decode an AFSFid block
  */
-static void afs_rxfs_aemap(struct rxrpc_call *call)
+static void xdr_decode_AFSFid(const __be32 **_bp, struct afs_fid *fid)
 {
-	switch (call->app_err_state) {
-	case RXRPC_ESTATE_LOCAL_ABORT:
-		call->app_abort_code = -call->app_errno;
-		break;
-	case RXRPC_ESTATE_PEER_ABORT:
-		call->app_errno = afs_abort_to_error(call->app_abort_code);
-		break;
-	default:
-		break;
-	}
-} /* end afs_rxfs_aemap() */
+	const __be32 *bp = *_bp;
+
+	fid->vid		= ntohl(*bp++);
+	fid->vnode		= ntohl(*bp++);
+	fid->unique		= ntohl(*bp++);
+	*_bp = bp;
+}
 
-/*****************************************************************************/
 /*
- * get the root volume name from a fileserver
- * - this operation doesn't seem to work correctly in OpenAFS server 1.2.2
+ * decode an AFSFetchStatus block
  */
-#if 0
-int afs_rxfs_get_root_volume(struct afs_server *server,
-			     char *buf, size_t *buflen)
+static void xdr_decode_AFSFetchStatus(const __be32 **_bp,
+				      struct afs_file_status *status,
+				      struct afs_vnode *vnode)
 {
-	struct rxrpc_connection *conn;
-	struct rxrpc_call *call;
-	struct kvec piov[2];
-	size_t sent;
-	int ret;
-	u32 param[1];
+	const __be32 *bp = *_bp;
+	umode_t mode;
+	u64 data_version, size;
+	u32 changed = 0; /* becomes non-zero if ctime-type changes seen */
+
+#define EXTRACT(DST)				\
+	do {					\
+		u32 x = ntohl(*bp++);		\
+		changed |= DST - x;		\
+		DST = x;			\
+	} while (0)
+
+	status->if_version = ntohl(*bp++);
+	EXTRACT(status->type);
+	EXTRACT(status->nlink);
+	size = ntohl(*bp++);
+	data_version = ntohl(*bp++);
+	EXTRACT(status->author);
+	EXTRACT(status->owner);
+	EXTRACT(status->caller_access); /* call ticket dependent */
+	EXTRACT(status->anon_access);
+	EXTRACT(status->mode);
+	EXTRACT(status->parent.vnode);
+	EXTRACT(status->parent.unique);
+	bp++; /* seg size */
+	status->mtime_client = ntohl(*bp++);
+	status->mtime_server = ntohl(*bp++);
+	EXTRACT(status->group);
+	bp++; /* sync counter */
+	data_version |= (u64) ntohl(*bp++) << 32;
+	bp++; /* lock count */
+	size |= (u64) ntohl(*bp++) << 32;
+	bp++; /* spare 4 */
+	*_bp = bp;
+
+	if (size != status->size) {
+		status->size = size;
+		changed |= true;
+	}
+	status->mode &= S_IALLUGO;
+
+	_debug("vnode time %lx, %lx",
+	       status->mtime_client, status->mtime_server);
+
+	if (vnode) {
+		status->parent.vid = vnode->fid.vid;
+		if (changed && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
+			_debug("vnode changed");
+			i_size_write(&vnode->vfs_inode, size);
+			vnode->vfs_inode.i_uid = status->owner;
+			vnode->vfs_inode.i_gid = status->group;
+			vnode->vfs_inode.i_version = vnode->fid.unique;
+			vnode->vfs_inode.i_nlink = status->nlink;
+
+			mode = vnode->vfs_inode.i_mode;
+			mode &= ~S_IALLUGO;
+			mode |= status->mode;
+			barrier();
+			vnode->vfs_inode.i_mode = mode;
+		}
 
-	DECLARE_WAITQUEUE(myself, current);
+		vnode->vfs_inode.i_ctime.tv_sec	= status->mtime_server;
+		vnode->vfs_inode.i_mtime	= vnode->vfs_inode.i_ctime;
+		vnode->vfs_inode.i_atime	= vnode->vfs_inode.i_ctime;
+	}
 
-	kenter("%p,%p,%u",server, buf, *buflen);
+	if (status->data_version != data_version) {
+		status->data_version = data_version;
+		if (vnode && !test_bit(AFS_VNODE_UNSET, &vnode->flags)) {
+			_debug("vnode modified %llx on {%x:%u}",
+			       (unsigned long long) data_version,
+			       vnode->fid.vid, vnode->fid.vnode);
+			set_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+			set_bit(AFS_VNODE_ZAP_DATA, &vnode->flags);
+		}
+	}
+}
 
-	/* get hold of the fileserver connection */
-	ret = afs_server_get_fsconn(server, &conn);
-	if (ret < 0)
-		goto out;
+/*
+ * decode an AFSCallBack block
+ */
+static void xdr_decode_AFSCallBack(const __be32 **_bp, struct afs_vnode *vnode)
+{
+	const __be32 *bp = *_bp;
 
-	/* create a call through that connection */
-	ret = rxrpc_create_call(conn, NULL, NULL, afs_rxfs_aemap, &call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = FSGETROOTVOLUME;
+	vnode->cb_version	= ntohl(*bp++);
+	vnode->cb_expiry	= ntohl(*bp++);
+	vnode->cb_type		= ntohl(*bp++);
+	vnode->cb_expires	= vnode->cb_expiry + get_seconds();
+	*_bp = bp;
+}
 
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq, &myself);
+static void xdr_decode_AFSCallBack_raw(const __be32 **_bp,
+				       struct afs_callback *cb)
+{
+	const __be32 *bp = *_bp;
 
-	/* marshall the parameters */
-	param[0] = htonl(FSGETROOTVOLUME);
-
-	piov[0].iov_len = sizeof(param);
-	piov[0].iov_base = param;
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the reply to completely arrive */
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (call->app_call_state != RXRPC_CSTATE_CLNT_RCV_REPLY ||
-		    signal_pending(current))
-			break;
-		schedule();
-	}
-	set_current_state(TASK_RUNNING);
+	cb->version	= ntohl(*bp++);
+	cb->expiry	= ntohl(*bp++);
+	cb->type	= ntohl(*bp++);
+	*_bp = bp;
+}
 
-	ret = -EINTR;
-	if (signal_pending(current))
-		goto abort;
+/*
+ * decode an AFSVolSync block
+ */
+static void xdr_decode_AFSVolSync(const __be32 **_bp,
+				  struct afs_volsync *volsync)
+{
+	const __be32 *bp = *_bp;
 
-	switch (call->app_call_state) {
-	case RXRPC_CSTATE_ERROR:
-		ret = call->app_errno;
-		kdebug("Got Error: %d", ret);
-		goto out_unwait;
+	volsync->creation = ntohl(*bp++);
+	bp++; /* spare2 */
+	bp++; /* spare3 */
+	bp++; /* spare4 */
+	bp++; /* spare5 */
+	bp++; /* spare6 */
+	*_bp = bp;
+}
 
-	case RXRPC_CSTATE_CLNT_GOT_REPLY:
-		/* read the reply */
-		kdebug("Got Reply: qty=%d", call->app_ready_qty);
+/*
+ * deliver reply data to an FS.FetchStatus
+ */
+static int afs_deliver_fs_fetch_status(struct afs_call *call,
+				       struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
 
-		ret = -EBADMSG;
-		if (call->app_ready_qty <= 4)
-			goto abort;
+	_enter(",,%u", last);
 
-		ret = rxrpc_call_read_data(call, NULL, call->app_ready_qty, 0);
-		if (ret < 0)
-			goto abort;
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
 
-#if 0
-		/* unmarshall the reply */
-		bp = buffer;
-		for (loop = 0; loop < 65; loop++)
-			entry->name[loop] = ntohl(*bp++);
-		entry->name[64] = 0;
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
 
-		entry->type = ntohl(*bp++);
-		entry->num_servers = ntohl(*bp++);
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	xdr_decode_AFSCallBack(&bp, vnode);
+	if (call->reply2)
+		xdr_decode_AFSVolSync(&bp, call->reply2);
 
-		for (loop = 0; loop < 8; loop++)
-			entry->servers[loop].addr.s_addr = *bp++;
+	_leave(" = 0 [done]");
+	return 0;
+}
 
-		for (loop = 0; loop < 8; loop++)
-			entry->servers[loop].partition = ntohl(*bp++);
+/*
+ * FS.FetchStatus operation type
+ */
+static const struct afs_call_type afs_RXFSFetchStatus = {
+	.name		= "FS.FetchStatus",
+	.deliver	= afs_deliver_fs_fetch_status,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
 
-		for (loop = 0; loop < 8; loop++)
-			entry->servers[loop].flags = ntohl(*bp++);
+/*
+ * fetch the status information for a file
+ */
+int afs_fs_fetch_file_status(struct afs_server *server,
+			     struct key *key,
+			     struct afs_vnode *vnode,
+			     struct afs_volsync *volsync,
+			     const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	__be32 *bp;
 
-		for (loop = 0; loop < 3; loop++)
-			entry->volume_ids[loop] = ntohl(*bp++);
+	_enter(",%x,{%x:%d},,",
+	       key_serial(key), vnode->fid.vid, vnode->fid.vnode);
 
-		entry->clone_id = ntohl(*bp++);
-		entry->flags = ntohl(*bp);
-#endif
+	call = afs_alloc_flat_call(&afs_RXFSFetchStatus, 16, (21 + 3 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
 
-		/* success */
-		ret = 0;
-		goto out_unwait;
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = volsync;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
 
-	default:
-		BUG();
-	}
+	/* marshall the parameters */
+	bp = call->request;
+	bp[0] = htonl(FSFETCHSTATUS);
+	bp[1] = htonl(vnode->fid.vid);
+	bp[2] = htonl(vnode->fid.vnode);
+	bp[3] = htonl(vnode->fid.unique);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
 
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq, &myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	afs_server_release_fsconn(server, conn);
- out:
-	kleave("");
-	return ret;
-} /* end afs_rxfs_get_root_volume() */
-#endif
-
-/*****************************************************************************/
 /*
- * get information about a volume
+ * deliver reply data to an FS.FetchData
  */
-#if 0
-int afs_rxfs_get_volume_info(struct afs_server *server,
-			     const char *name,
-			     struct afs_volume_info *vinfo)
+static int afs_deliver_fs_fetch_data(struct afs_call *call,
+				     struct sk_buff *skb, bool last)
 {
-	struct rxrpc_connection *conn;
-	struct rxrpc_call *call;
-	struct kvec piov[3];
-	size_t sent;
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+	struct page *page;
+	void *buffer;
 	int ret;
-	u32 param[2], *bp, zero;
 
-	DECLARE_WAITQUEUE(myself, current);
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	switch (call->unmarshall) {
+	case 0:
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the returned data length */
+	case 1:
+		_debug("extract data length");
+		ret = afs_extract_data(call, skb, last, &call->tmp, 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
 
-	_enter("%p,%s,%p", server, name, vinfo);
+		call->count = ntohl(call->tmp);
+		_debug("DATA length: %u", call->count);
+		if (call->count > PAGE_SIZE)
+			return -EBADMSG;
+		call->offset = 0;
+		call->unmarshall++;
+
+		if (call->count < PAGE_SIZE) {
+			buffer = kmap_atomic(call->reply3, KM_USER0);
+			memset(buffer + PAGE_SIZE - call->count, 0,
+			       call->count);
+			kunmap_atomic(buffer, KM_USER0);
+		}
 
-	/* get hold of the fileserver connection */
-	ret = afs_server_get_fsconn(server, &conn);
-	if (ret < 0)
-		goto out;
+		/* extract the returned data */
+	case 2:
+		_debug("extract data");
+		page = call->reply3;
+		buffer = kmap_atomic(page, KM_USER0);
+		ret = afs_extract_data(call, skb, last, buffer, call->count);
+		kunmap_atomic(buffer, KM_USER0);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
 
-	/* create a call through that connection */
-	ret = rxrpc_create_call(conn, NULL, NULL, afs_rxfs_aemap, &call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = FSGETVOLUMEINFO;
+		call->offset = 0;
+		call->unmarshall++;
+
+		/* extract the metadata */
+	case 3:
+		ret = afs_extract_data(call, skb, last, call->buffer,
+				       (21 + 3 + 6) * 4);
+		switch (ret) {
+		case 0:		break;
+		case -EAGAIN:	return 0;
+		default:	return ret;
+		}
 
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq, &myself);
+		bp = call->buffer;
+		xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+		xdr_decode_AFSCallBack(&bp, vnode);
+		if (call->reply2)
+			xdr_decode_AFSVolSync(&bp, call->reply2);
 
-	/* marshall the parameters */
-	piov[1].iov_len = strlen(name);
-	piov[1].iov_base = (char *) name;
-
-	zero = 0;
-	piov[2].iov_len = (4 - (piov[1].iov_len & 3)) & 3;
-	piov[2].iov_base = &zero;
-
-	param[0] = htonl(FSGETVOLUMEINFO);
-	param[1] = htonl(piov[1].iov_len);
-
-	piov[0].iov_len = sizeof(param);
-	piov[0].iov_base = param;
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 3, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the reply to completely arrive */
-	bp = rxrpc_call_alloc_scratch(call, 64);
-
-	ret = rxrpc_call_read_data(call, bp, 64,
-				   RXRPC_CALL_READ_BLOCK |
-				   RXRPC_CALL_READ_ALL);
-	if (ret < 0) {
-		if (ret == -ECONNABORTED) {
-			ret = call->app_errno;
-			goto out_unwait;
-		}
-		goto abort;
+		call->offset = 0;
+		call->unmarshall++;
+
+	case 4:
+		_debug("trailer");
+		if (skb->len != 0)
+			return -EBADMSG;
+		break;
 	}
 
-	/* unmarshall the reply */
-	vinfo->vid = ntohl(*bp++);
-	vinfo->type = ntohl(*bp++);
-
-	vinfo->type_vids[0] = ntohl(*bp++);
-	vinfo->type_vids[1] = ntohl(*bp++);
-	vinfo->type_vids[2] = ntohl(*bp++);
-	vinfo->type_vids[3] = ntohl(*bp++);
-	vinfo->type_vids[4] = ntohl(*bp++);
-
-	vinfo->nservers = ntohl(*bp++);
-	vinfo->servers[0].addr.s_addr = *bp++;
-	vinfo->servers[1].addr.s_addr = *bp++;
-	vinfo->servers[2].addr.s_addr = *bp++;
-	vinfo->servers[3].addr.s_addr = *bp++;
-	vinfo->servers[4].addr.s_addr = *bp++;
-	vinfo->servers[5].addr.s_addr = *bp++;
-	vinfo->servers[6].addr.s_addr = *bp++;
-	vinfo->servers[7].addr.s_addr = *bp++;
-
-	ret = -EBADMSG;
-	if (vinfo->nservers > 8)
-		goto abort;
-
-	/* success */
-	ret = 0;
-
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq, &myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	afs_server_release_fsconn(server, conn);
- out:
-	_leave("");
-	return ret;
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
-
-} /* end afs_rxfs_get_volume_info() */
-#endif
-
-/*****************************************************************************/
+	if (!last)
+		return 0;
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
 /*
- * fetch the status information for a file
+ * FS.FetchData operation type
+ */
+static const struct afs_call_type afs_RXFSFetchData = {
+	.name		= "FS.FetchData",
+	.deliver	= afs_deliver_fs_fetch_data,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * fetch data from a file
  */
-int afs_rxfs_fetch_file_status(struct afs_server *server,
-			       struct afs_vnode *vnode,
-			       struct afs_volsync *volsync)
+int afs_fs_fetch_data(struct afs_server *server,
+		      struct key *key,
+		      struct afs_vnode *vnode,
+		      off_t offset, size_t length,
+		      struct page *buffer,
+		      const struct afs_wait_mode *wait_mode)
 {
-	struct afs_server_callslot callslot;
-	struct rxrpc_call *call;
-	struct kvec piov[1];
-	size_t sent;
-	int ret;
+	struct afs_call *call;
 	__be32 *bp;
 
-	DECLARE_WAITQUEUE(myself, current);
+	_enter("");
 
-	_enter("%p,{%u,%u,%u}",
-	       server, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+	call = afs_alloc_flat_call(&afs_RXFSFetchData, 24, (21 + 3 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
 
-	/* get hold of the fileserver connection */
-	ret = afs_server_request_callslot(server, &callslot);
-	if (ret < 0)
-		goto out;
-
-	/* create a call through that connection */
-	ret = rxrpc_create_call(callslot.conn, NULL, NULL, afs_rxfs_aemap,
-				&call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = FSFETCHSTATUS;
-
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq, &myself);
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = NULL; /* volsync */
+	call->reply3 = buffer;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
 
 	/* marshall the parameters */
-	bp = rxrpc_call_alloc_scratch(call, 16);
-	bp[0] = htonl(FSFETCHSTATUS);
+	bp = call->request;
+	bp[0] = htonl(FSFETCHDATA);
 	bp[1] = htonl(vnode->fid.vid);
 	bp[2] = htonl(vnode->fid.vnode);
 	bp[3] = htonl(vnode->fid.unique);
+	bp[4] = htonl(offset);
+	bp[5] = htonl(length);
 
-	piov[0].iov_len = 16;
-	piov[0].iov_base = bp;
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the reply to completely arrive */
-	bp = rxrpc_call_alloc_scratch(call, 120);
-
-	ret = rxrpc_call_read_data(call, bp, 120,
-				   RXRPC_CALL_READ_BLOCK |
-				   RXRPC_CALL_READ_ALL);
-	if (ret < 0) {
-		if (ret == -ECONNABORTED) {
-			ret = call->app_errno;
-			goto out_unwait;
-		}
-		goto abort;
-	}
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
 
-	/* unmarshall the reply */
-	vnode->status.if_version	= ntohl(*bp++);
-	vnode->status.type		= ntohl(*bp++);
-	vnode->status.nlink		= ntohl(*bp++);
-	vnode->status.size		= ntohl(*bp++);
-	vnode->status.version		= ntohl(*bp++);
-	vnode->status.author		= ntohl(*bp++);
-	vnode->status.owner		= ntohl(*bp++);
-	vnode->status.caller_access	= ntohl(*bp++);
-	vnode->status.anon_access	= ntohl(*bp++);
-	vnode->status.mode		= ntohl(*bp++);
-	vnode->status.parent.vid	= vnode->fid.vid;
-	vnode->status.parent.vnode	= ntohl(*bp++);
-	vnode->status.parent.unique	= ntohl(*bp++);
-	bp++; /* seg size */
-	vnode->status.mtime_client	= ntohl(*bp++);
-	vnode->status.mtime_server	= ntohl(*bp++);
-	bp++; /* group */
-	bp++; /* sync counter */
-	vnode->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
-	bp++; /* spare2 */
-	bp++; /* spare3 */
-	bp++; /* spare4 */
+/*
+ * deliver reply data to an FS.GiveUpCallBacks
+ */
+static int afs_deliver_fs_give_up_callbacks(struct afs_call *call,
+					    struct sk_buff *skb, bool last)
+{
+	_enter(",{%u},%d", skb->len, last);
 
-	vnode->cb_version		= ntohl(*bp++);
-	vnode->cb_expiry		= ntohl(*bp++);
-	vnode->cb_type			= ntohl(*bp++);
-
-	if (volsync) {
-		volsync->creation	= ntohl(*bp++);
-		bp++; /* spare2 */
-		bp++; /* spare3 */
-		bp++; /* spare4 */
-		bp++; /* spare5 */
-		bp++; /* spare6 */
-	}
+	if (skb->len > 0)
+		return -EBADMSG; /* shouldn't be any reply data */
+	return 0;
+}
 
-	/* success */
-	ret = 0;
-
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq, &myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	afs_server_release_callslot(server, &callslot);
- out:
-	_leave("");
-	return ret;
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
-} /* end afs_rxfs_fetch_file_status() */
-
-/*****************************************************************************/
 /*
- * fetch the contents of a file or directory
+ * FS.GiveUpCallBacks operation type
  */
-int afs_rxfs_fetch_file_data(struct afs_server *server,
-			     struct afs_vnode *vnode,
-			     struct afs_rxfs_fetch_descriptor *desc,
-			     struct afs_volsync *volsync)
+static const struct afs_call_type afs_RXFSGiveUpCallBacks = {
+	.name		= "FS.GiveUpCallBacks",
+	.deliver	= afs_deliver_fs_give_up_callbacks,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * give up a set of callbacks
+ * - the callbacks are held in the server->cb_break ring
+ */
+int afs_fs_give_up_callbacks(struct afs_server *server,
+			     const struct afs_wait_mode *wait_mode)
 {
-	struct afs_server_callslot callslot;
-	struct rxrpc_call *call;
-	struct kvec piov[1];
-	size_t sent;
-	int ret;
-	__be32 *bp;
+	struct afs_call *call;
+	size_t ncallbacks;
+	__be32 *bp, *tp;
+	int loop;
 
-	DECLARE_WAITQUEUE(myself, current);
-
-	_enter("%p,{fid={%u,%u,%u},sz=%Zu,of=%lu}",
-	       server,
-	       desc->fid.vid,
-	       desc->fid.vnode,
-	       desc->fid.unique,
-	       desc->size,
-	       desc->offset);
-
-	/* get hold of the fileserver connection */
-	ret = afs_server_request_callslot(server, &callslot);
-	if (ret < 0)
-		goto out;
-
-	/* create a call through that connection */
-	ret = rxrpc_create_call(callslot.conn, NULL, NULL, afs_rxfs_aemap, &call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = FSFETCHDATA;
+	ncallbacks = CIRC_CNT(server->cb_break_head, server->cb_break_tail,
+			      ARRAY_SIZE(server->cb_break));
+
+	_enter("{%zu},", ncallbacks);
+
+	if (ncallbacks == 0)
+		return 0;
+	if (ncallbacks > AFSCBMAX)
+		ncallbacks = AFSCBMAX;
+
+	_debug("break %zu callbacks", ncallbacks);
 
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq, &myself);
+	call = afs_alloc_flat_call(&afs_RXFSGiveUpCallBacks,
+				   12 + ncallbacks * 6 * 4, 0);
+	if (!call)
+		return -ENOMEM;
+
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
 
 	/* marshall the parameters */
-	bp = rxrpc_call_alloc_scratch(call, 24);
-	bp[0] = htonl(FSFETCHDATA);
-	bp[1] = htonl(desc->fid.vid);
-	bp[2] = htonl(desc->fid.vnode);
-	bp[3] = htonl(desc->fid.unique);
-	bp[4] = htonl(desc->offset);
-	bp[5] = htonl(desc->size);
-
-	piov[0].iov_len = 24;
-	piov[0].iov_base = bp;
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the data count to arrive */
-	ret = rxrpc_call_read_data(call, bp, 4, RXRPC_CALL_READ_BLOCK);
-	if (ret < 0)
-		goto read_failed;
-
-	desc->actual = ntohl(bp[0]);
-	if (desc->actual != desc->size) {
-		ret = -EBADMSG;
-		goto abort;
+	bp = call->request;
+	tp = bp + 2 + ncallbacks * 3;
+	*bp++ = htonl(FSGIVEUPCALLBACKS);
+	*bp++ = htonl(ncallbacks);
+	*tp++ = htonl(ncallbacks);
+
+	atomic_sub(ncallbacks, &server->cb_break_n);
+	for (loop = ncallbacks; loop > 0; loop--) {
+		struct afs_callback *cb =
+			&server->cb_break[server->cb_break_tail];
+
+		*bp++ = htonl(cb->fid.vid);
+		*bp++ = htonl(cb->fid.vnode);
+		*bp++ = htonl(cb->fid.unique);
+		*tp++ = htonl(cb->version);
+		*tp++ = htonl(cb->expiry);
+		*tp++ = htonl(cb->type);
+		smp_mb();
+		server->cb_break_tail =
+			(server->cb_break_tail + 1) &
+			(ARRAY_SIZE(server->cb_break) - 1);
 	}
 
-	/* call the app to read the actual data */
-	rxrpc_call_reset_scratch(call);
-
-	ret = rxrpc_call_read_data(call, desc->buffer, desc->actual,
-				   RXRPC_CALL_READ_BLOCK);
-	if (ret < 0)
-		goto read_failed;
-
-	/* wait for the rest of the reply to completely arrive */
-	rxrpc_call_reset_scratch(call);
-	bp = rxrpc_call_alloc_scratch(call, 120);
-
-	ret = rxrpc_call_read_data(call, bp, 120,
-				   RXRPC_CALL_READ_BLOCK |
-				   RXRPC_CALL_READ_ALL);
-	if (ret < 0)
-		goto read_failed;
-
-	/* unmarshall the reply */
-	vnode->status.if_version	= ntohl(*bp++);
-	vnode->status.type		= ntohl(*bp++);
-	vnode->status.nlink		= ntohl(*bp++);
-	vnode->status.size		= ntohl(*bp++);
-	vnode->status.version		= ntohl(*bp++);
-	vnode->status.author		= ntohl(*bp++);
-	vnode->status.owner		= ntohl(*bp++);
-	vnode->status.caller_access	= ntohl(*bp++);
-	vnode->status.anon_access	= ntohl(*bp++);
-	vnode->status.mode		= ntohl(*bp++);
-	vnode->status.parent.vid	= desc->fid.vid;
-	vnode->status.parent.vnode	= ntohl(*bp++);
-	vnode->status.parent.unique	= ntohl(*bp++);
-	bp++; /* seg size */
-	vnode->status.mtime_client	= ntohl(*bp++);
-	vnode->status.mtime_server	= ntohl(*bp++);
-	bp++; /* group */
-	bp++; /* sync counter */
-	vnode->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
-	bp++; /* spare2 */
-	bp++; /* spare3 */
-	bp++; /* spare4 */
+	ASSERT(ncallbacks > 0);
+	wake_up_nr(&server->cb_break_waitq, ncallbacks);
 
-	vnode->cb_version		= ntohl(*bp++);
-	vnode->cb_expiry		= ntohl(*bp++);
-	vnode->cb_type			= ntohl(*bp++);
-
-	if (volsync) {
-		volsync->creation	= ntohl(*bp++);
-		bp++; /* spare2 */
-		bp++; /* spare3 */
-		bp++; /* spare4 */
-		bp++; /* spare5 */
-		bp++; /* spare6 */
-	}
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
 
-	/* success */
-	ret = 0;
-
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq,&myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	afs_server_release_callslot(server, &callslot);
- out:
-	_leave(" = %d", ret);
-	return ret;
-
- read_failed:
-	if (ret == -ECONNABORTED) {
-		ret = call->app_errno;
-		goto out_unwait;
-	}
+/*
+ * deliver reply data to an FS.CreateFile or an FS.MakeDir
+ */
+static int afs_deliver_fs_create_vnode(struct afs_call *call,
+				       struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
+
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
 
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
 
-} /* end afs_rxfs_fetch_file_data() */
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFid(&bp, call->reply2);
+	xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL);
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	xdr_decode_AFSCallBack_raw(&bp, call->reply4);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.CreateFile and FS.MakeDir operation type
+ */
+static const struct afs_call_type afs_RXFSCreateXXXX = {
+	.name		= "FS.CreateXXXX",
+	.deliver	= afs_deliver_fs_create_vnode,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
 
-/*****************************************************************************/
 /*
- * ask the AFS fileserver to discard a callback request on a file
+ * create a file or make a directory
  */
-int afs_rxfs_give_up_callback(struct afs_server *server,
-			      struct afs_vnode *vnode)
+int afs_fs_create(struct afs_server *server,
+		  struct key *key,
+		  struct afs_vnode *vnode,
+		  const char *name,
+		  umode_t mode,
+		  struct afs_fid *newfid,
+		  struct afs_file_status *newstatus,
+		  struct afs_callback *newcb,
+		  const struct afs_wait_mode *wait_mode)
 {
-	struct afs_server_callslot callslot;
-	struct rxrpc_call *call;
-	struct kvec piov[1];
-	size_t sent;
-	int ret;
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz;
 	__be32 *bp;
 
-	DECLARE_WAITQUEUE(myself, current);
+	_enter("");
 
-	_enter("%p,{%u,%u,%u}",
-	       server, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+	reqsz = (5 * 4) + namesz + padsz + (6 * 4);
 
-	/* get hold of the fileserver connection */
-	ret = afs_server_request_callslot(server, &callslot);
-	if (ret < 0)
-		goto out;
+	call = afs_alloc_flat_call(&afs_RXFSCreateXXXX, reqsz,
+				   (3 + 21 + 21 + 3 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
 
-	/* create a call through that connection */
-	ret = rxrpc_create_call(callslot.conn, NULL, NULL, afs_rxfs_aemap, &call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = newfid;
+	call->reply3 = newstatus;
+	call->reply4 = newcb;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(S_ISDIR(mode) ? FSMAKEDIR : FSCREATEFILE);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
 	}
-	call->app_opcode = FSGIVEUPCALLBACKS;
+	*bp++ = htonl(AFS_SET_MODE);
+	*bp++ = 0; /* mtime */
+	*bp++ = 0; /* owner */
+	*bp++ = 0; /* group */
+	*bp++ = htonl(mode & S_IALLUGO); /* unix mode */
+	*bp++ = 0; /* segment size */
 
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq, &myself);
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
 
-	/* marshall the parameters */
-	bp = rxrpc_call_alloc_scratch(call, (1 + 4 + 4) * 4);
+/*
+ * deliver reply data to an FS.RemoveFile or FS.RemoveDir
+ */
+static int afs_deliver_fs_remove(struct afs_call *call,
+				 struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
 
-	piov[0].iov_len = (1 + 4 + 4) * 4;
-	piov[0].iov_base = bp;
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
 
-	*bp++ = htonl(FSGIVEUPCALLBACKS);
-	*bp++ = htonl(1);
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.RemoveDir/FS.RemoveFile operation type
+ */
+static const struct afs_call_type afs_RXFSRemoveXXXX = {
+	.name		= "FS.RemoveXXXX",
+	.deliver	= afs_deliver_fs_remove,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * remove a file or directory
+ */
+int afs_fs_remove(struct afs_server *server,
+		  struct key *key,
+		  struct afs_vnode *vnode,
+		  const char *name,
+		  bool isdir,
+		  const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+	reqsz = (5 * 4) + namesz + padsz;
+
+	call = afs_alloc_flat_call(&afs_RXFSRemoveXXXX, reqsz, (21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(isdir ? FSREMOVEDIR : FSREMOVEFILE);
 	*bp++ = htonl(vnode->fid.vid);
 	*bp++ = htonl(vnode->fid.vnode);
 	*bp++ = htonl(vnode->fid.unique);
-	*bp++ = htonl(1);
-	*bp++ = htonl(vnode->cb_version);
-	*bp++ = htonl(vnode->cb_expiry);
-	*bp++ = htonl(vnode->cb_type);
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the reply to completely arrive */
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (call->app_call_state != RXRPC_CSTATE_CLNT_RCV_REPLY ||
-		    signal_pending(current))
-			break;
-		schedule();
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
 	}
-	set_current_state(TASK_RUNNING);
 
-	ret = -EINTR;
-	if (signal_pending(current))
-		goto abort;
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
 
-	switch (call->app_call_state) {
-	case RXRPC_CSTATE_ERROR:
-		ret = call->app_errno;
-		goto out_unwait;
+/*
+ * deliver reply data to an FS.Link
+ */
+static int afs_deliver_fs_link(struct afs_call *call,
+			       struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *dvnode = call->reply, *vnode = call->reply2;
+	const __be32 *bp;
 
-	case RXRPC_CSTATE_CLNT_GOT_REPLY:
-		ret = 0;
-		goto out_unwait;
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
 
-	default:
-		BUG();
-	}
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	xdr_decode_AFSFetchStatus(&bp, &dvnode->status, dvnode);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.Link operation type
+ */
+static const struct afs_call_type afs_RXFSLink = {
+	.name		= "FS.Link",
+	.deliver	= afs_deliver_fs_link,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
 
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq, &myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	afs_server_release_callslot(server, &callslot);
- out:
-	_leave("");
-	return ret;
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
-} /* end afs_rxfs_give_up_callback() */
-
-/*****************************************************************************/
 /*
- * look a filename up in a directory
- * - this operation doesn't seem to work correctly in OpenAFS server 1.2.2
+ * make a hard link
  */
-#if 0
-int afs_rxfs_lookup(struct afs_server *server,
-		    struct afs_vnode *dir,
-		    const char *filename,
-		    struct afs_vnode *vnode,
-		    struct afs_volsync *volsync)
+int afs_fs_link(struct afs_server *server,
+		struct key *key,
+		struct afs_vnode *dvnode,
+		struct afs_vnode *vnode,
+		const char *name,
+		const struct afs_wait_mode *wait_mode)
 {
-	struct rxrpc_connection *conn;
-	struct rxrpc_call *call;
-	struct kvec piov[3];
-	size_t sent;
-	int ret;
-	u32 *bp, zero;
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz;
+	__be32 *bp;
 
-	DECLARE_WAITQUEUE(myself, current);
+	_enter("");
 
-	kenter("%p,{%u,%u,%u},%s",
-	       server, fid->vid, fid->vnode, fid->unique, filename);
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+	reqsz = (5 * 4) + namesz + padsz + (3 * 4);
 
-	/* get hold of the fileserver connection */
-	ret = afs_server_get_fsconn(server, &conn);
-	if (ret < 0)
-		goto out;
+	call = afs_alloc_flat_call(&afs_RXFSLink, reqsz, (21 + 21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
 
-	/* create a call through that connection */
-	ret = rxrpc_create_call(conn, NULL, NULL, afs_rxfs_aemap, &call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
+	call->key = key;
+	call->reply = dvnode;
+	call->reply2 = vnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSLINK);
+	*bp++ = htonl(dvnode->fid.vid);
+	*bp++ = htonl(dvnode->fid.vnode);
+	*bp++ = htonl(dvnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
 	}
-	call->app_opcode = FSLOOKUP;
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
+
+/*
+ * deliver reply data to an FS.Symlink
+ */
+static int afs_deliver_fs_symlink(struct afs_call *call,
+				  struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *vnode = call->reply;
+	const __be32 *bp;
 
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq,&myself);
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFid(&bp, call->reply2);
+	xdr_decode_AFSFetchStatus(&bp, call->reply3, NULL);
+	xdr_decode_AFSFetchStatus(&bp, &vnode->status, vnode);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.Symlink operation type
+ */
+static const struct afs_call_type afs_RXFSSymlink = {
+	.name		= "FS.Symlink",
+	.deliver	= afs_deliver_fs_symlink,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * create a symbolic link
+ */
+int afs_fs_symlink(struct afs_server *server,
+		   struct key *key,
+		   struct afs_vnode *vnode,
+		   const char *name,
+		   const char *contents,
+		   struct afs_fid *newfid,
+		   struct afs_file_status *newstatus,
+		   const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t namesz, reqsz, padsz, c_namesz, c_padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	namesz = strlen(name);
+	padsz = (4 - (namesz & 3)) & 3;
+
+	c_namesz = strlen(contents);
+	c_padsz = (4 - (c_namesz & 3)) & 3;
+
+	reqsz = (6 * 4) + namesz + padsz + c_namesz + c_padsz + (6 * 4);
+
+	call = afs_alloc_flat_call(&afs_RXFSSymlink, reqsz,
+				   (3 + 21 + 21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = vnode;
+	call->reply2 = newfid;
+	call->reply3 = newstatus;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
 
 	/* marshall the parameters */
-	bp = rxrpc_call_alloc_scratch(call, 20);
-
-	zero = 0;
-
-	piov[0].iov_len = 20;
-	piov[0].iov_base = bp;
-	piov[1].iov_len = strlen(filename);
-	piov[1].iov_base = (char *) filename;
-	piov[2].iov_len = (4 - (piov[1].iov_len & 3)) & 3;
-	piov[2].iov_base = &zero;
-
-	*bp++ = htonl(FSLOOKUP);
-	*bp++ = htonl(dirfid->vid);
-	*bp++ = htonl(dirfid->vnode);
-	*bp++ = htonl(dirfid->unique);
-	*bp++ = htonl(piov[1].iov_len);
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 3, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the reply to completely arrive */
-	bp = rxrpc_call_alloc_scratch(call, 220);
-
-	ret = rxrpc_call_read_data(call, bp, 220,
-				   RXRPC_CALL_READ_BLOCK |
-				   RXRPC_CALL_READ_ALL);
-	if (ret < 0) {
-		if (ret == -ECONNABORTED) {
-			ret = call->app_errno;
-			goto out_unwait;
-		}
-		goto abort;
+	bp = call->request;
+	*bp++ = htonl(FSSYMLINK);
+	*bp++ = htonl(vnode->fid.vid);
+	*bp++ = htonl(vnode->fid.vnode);
+	*bp++ = htonl(vnode->fid.unique);
+	*bp++ = htonl(namesz);
+	memcpy(bp, name, namesz);
+	bp = (void *) bp + namesz;
+	if (padsz > 0) {
+		memset(bp, 0, padsz);
+		bp = (void *) bp + padsz;
 	}
+	*bp++ = htonl(c_namesz);
+	memcpy(bp, contents, c_namesz);
+	bp = (void *) bp + c_namesz;
+	if (c_padsz > 0) {
+		memset(bp, 0, c_padsz);
+		bp = (void *) bp + c_padsz;
+	}
+	*bp++ = htonl(AFS_SET_MODE);
+	*bp++ = 0; /* mtime */
+	*bp++ = 0; /* owner */
+	*bp++ = 0; /* group */
+	*bp++ = htonl(S_IRWXUGO); /* unix mode */
+	*bp++ = 0; /* segment size */
 
-	/* unmarshall the reply */
-	fid->vid		= ntohl(*bp++);
-	fid->vnode		= ntohl(*bp++);
-	fid->unique		= ntohl(*bp++);
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
 
-	vnode->status.if_version	= ntohl(*bp++);
-	vnode->status.type		= ntohl(*bp++);
-	vnode->status.nlink		= ntohl(*bp++);
-	vnode->status.size		= ntohl(*bp++);
-	vnode->status.version		= ntohl(*bp++);
-	vnode->status.author		= ntohl(*bp++);
-	vnode->status.owner		= ntohl(*bp++);
-	vnode->status.caller_access	= ntohl(*bp++);
-	vnode->status.anon_access	= ntohl(*bp++);
-	vnode->status.mode		= ntohl(*bp++);
-	vnode->status.parent.vid	= dirfid->vid;
-	vnode->status.parent.vnode	= ntohl(*bp++);
-	vnode->status.parent.unique	= ntohl(*bp++);
-	bp++; /* seg size */
-	vnode->status.mtime_client	= ntohl(*bp++);
-	vnode->status.mtime_server	= ntohl(*bp++);
-	bp++; /* group */
-	bp++; /* sync counter */
-	vnode->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
-	bp++; /* spare2 */
-	bp++; /* spare3 */
-	bp++; /* spare4 */
+/*
+ * deliver reply data to an FS.Rename
+ */
+static int afs_deliver_fs_rename(struct afs_call *call,
+				  struct sk_buff *skb, bool last)
+{
+	struct afs_vnode *orig_dvnode = call->reply, *new_dvnode = call->reply2;
+	const __be32 *bp;
 
-	dir->status.if_version		= ntohl(*bp++);
-	dir->status.type		= ntohl(*bp++);
-	dir->status.nlink		= ntohl(*bp++);
-	dir->status.size		= ntohl(*bp++);
-	dir->status.version		= ntohl(*bp++);
-	dir->status.author		= ntohl(*bp++);
-	dir->status.owner		= ntohl(*bp++);
-	dir->status.caller_access	= ntohl(*bp++);
-	dir->status.anon_access		= ntohl(*bp++);
-	dir->status.mode		= ntohl(*bp++);
-	dir->status.parent.vid		= dirfid->vid;
-	dir->status.parent.vnode	= ntohl(*bp++);
-	dir->status.parent.unique	= ntohl(*bp++);
-	bp++; /* seg size */
-	dir->status.mtime_client	= ntohl(*bp++);
-	dir->status.mtime_server	= ntohl(*bp++);
-	bp++; /* group */
-	bp++; /* sync counter */
-	dir->status.version |= ((unsigned long long) ntohl(*bp++)) << 32;
-	bp++; /* spare2 */
-	bp++; /* spare3 */
-	bp++; /* spare4 */
+	_enter("{%u},{%u},%d", call->unmarshall, skb->len, last);
+
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
+
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
+
+	/* unmarshall the reply once we've received all of it */
+	bp = call->buffer;
+	xdr_decode_AFSFetchStatus(&bp, &orig_dvnode->status, orig_dvnode);
+	if (new_dvnode != orig_dvnode)
+		xdr_decode_AFSFetchStatus(&bp, &new_dvnode->status, new_dvnode);
+	/* xdr_decode_AFSVolSync(&bp, call->replyX); */
+
+	_leave(" = 0 [done]");
+	return 0;
+}
+
+/*
+ * FS.Rename operation type
+ */
+static const struct afs_call_type afs_RXFSRename = {
+	.name		= "FS.Rename",
+	.deliver	= afs_deliver_fs_rename,
+	.abort_to_error	= afs_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
+
+/*
+ * create a symbolic link
+ */
+int afs_fs_rename(struct afs_server *server,
+		  struct key *key,
+		  struct afs_vnode *orig_dvnode,
+		  const char *orig_name,
+		  struct afs_vnode *new_dvnode,
+		  const char *new_name,
+		  const struct afs_wait_mode *wait_mode)
+{
+	struct afs_call *call;
+	size_t reqsz, o_namesz, o_padsz, n_namesz, n_padsz;
+	__be32 *bp;
+
+	_enter("");
+
+	o_namesz = strlen(orig_name);
+	o_padsz = (4 - (o_namesz & 3)) & 3;
+
+	n_namesz = strlen(new_name);
+	n_padsz = (4 - (n_namesz & 3)) & 3;
+
+	reqsz = (4 * 4) +
+		4 + o_namesz + o_padsz +
+		(3 * 4) +
+		4 + n_namesz + n_padsz;
+
+	call = afs_alloc_flat_call(&afs_RXFSRename, reqsz, (21 + 21 + 6) * 4);
+	if (!call)
+		return -ENOMEM;
+
+	call->key = key;
+	call->reply = orig_dvnode;
+	call->reply2 = new_dvnode;
+	call->service_id = FS_SERVICE;
+	call->port = htons(AFS_FS_PORT);
+
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(FSRENAME);
+	*bp++ = htonl(orig_dvnode->fid.vid);
+	*bp++ = htonl(orig_dvnode->fid.vnode);
+	*bp++ = htonl(orig_dvnode->fid.unique);
+	*bp++ = htonl(o_namesz);
+	memcpy(bp, orig_name, o_namesz);
+	bp = (void *) bp + o_namesz;
+	if (o_padsz > 0) {
+		memset(bp, 0, o_padsz);
+		bp = (void *) bp + o_padsz;
+	}
 
-	callback->fid		= *fid;
-	callback->version	= ntohl(*bp++);
-	callback->expiry	= ntohl(*bp++);
-	callback->type		= ntohl(*bp++);
-
-	if (volsync) {
-		volsync->creation	= ntohl(*bp++);
-		bp++; /* spare2 */
-		bp++; /* spare3 */
-		bp++; /* spare4 */
-		bp++; /* spare5 */
-		bp++; /* spare6 */
+	*bp++ = htonl(new_dvnode->fid.vid);
+	*bp++ = htonl(new_dvnode->fid.vnode);
+	*bp++ = htonl(new_dvnode->fid.unique);
+	*bp++ = htonl(n_namesz);
+	memcpy(bp, new_name, n_namesz);
+	bp = (void *) bp + n_namesz;
+	if (n_padsz > 0) {
+		memset(bp, 0, n_padsz);
+		bp = (void *) bp + n_padsz;
 	}
 
-	/* success */
-	ret = 0;
-
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq, &myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	afs_server_release_fsconn(server, conn);
- out:
-	kleave("");
-	return ret;
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
-} /* end afs_rxfs_lookup() */
-#endif
+	return afs_make_call(&server->addr, call, GFP_NOFS, wait_mode);
+}
diff --git a/fs/afs/fsclient.h b/fs/afs/fsclient.h
deleted file mode 100644
index 8ba3e749ee3c..000000000000
--- a/fs/afs/fsclient.h
+++ /dev/null
@@ -1,54 +0,0 @@
-/* fsclient.h: AFS File Server client stub declarations
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_FSCLIENT_H
-#define _LINUX_AFS_FSCLIENT_H
-
-#include "server.h"
-
-extern int afs_rxfs_get_volume_info(struct afs_server *server,
-				    const char *name,
-				    struct afs_volume_info *vinfo);
-
-extern int afs_rxfs_fetch_file_status(struct afs_server *server,
-				      struct afs_vnode *vnode,
-				      struct afs_volsync *volsync);
-
-struct afs_rxfs_fetch_descriptor {
-	struct afs_fid	fid;		/* file ID to fetch */
-	size_t		size;		/* total number of bytes to fetch */
-	off_t		offset;		/* offset in file to start from */
-	void		*buffer;	/* read buffer */
-	size_t		actual;		/* actual size sent back by server */
-};
-
-extern int afs_rxfs_fetch_file_data(struct afs_server *server,
-				    struct afs_vnode *vnode,
-				    struct afs_rxfs_fetch_descriptor *desc,
-				    struct afs_volsync *volsync);
-
-extern int afs_rxfs_give_up_callback(struct afs_server *server,
-				     struct afs_vnode *vnode);
-
-/* this doesn't appear to work in OpenAFS server */
-extern int afs_rxfs_lookup(struct afs_server *server,
-			   struct afs_vnode *dir,
-			   const char *filename,
-			   struct afs_vnode *vnode,
-			   struct afs_volsync *volsync);
-
-/* this is apparently mis-implemented in OpenAFS server */
-extern int afs_rxfs_get_root_volume(struct afs_server *server,
-				    char *buf,
-				    size_t *buflen);
-
-
-#endif /* _LINUX_AFS_FSCLIENT_H */
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 9d9bca6c28b5..c184a4ee5995 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -19,9 +19,6 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include "volume.h"
-#include "vnode.h"
-#include "super.h"
 #include "internal.h"
 
 struct afs_iget_data {
@@ -29,26 +26,25 @@ struct afs_iget_data {
 	struct afs_volume	*volume;	/* volume on which resides */
 };
 
-/*****************************************************************************/
 /*
  * map the AFS file status to the inode member variables
  */
-static int afs_inode_map_status(struct afs_vnode *vnode)
+static int afs_inode_map_status(struct afs_vnode *vnode, struct key *key)
 {
 	struct inode *inode = AFS_VNODE_TO_I(vnode);
 
-	_debug("FS: ft=%d lk=%d sz=%Zu ver=%Lu mod=%hu",
+	_debug("FS: ft=%d lk=%d sz=%llu ver=%Lu mod=%hu",
 	       vnode->status.type,
 	       vnode->status.nlink,
-	       vnode->status.size,
-	       vnode->status.version,
+	       (unsigned long long) vnode->status.size,
+	       vnode->status.data_version,
 	       vnode->status.mode);
 
 	switch (vnode->status.type) {
 	case AFS_FTYPE_FILE:
 		inode->i_mode	= S_IFREG | vnode->status.mode;
 		inode->i_op	= &afs_file_inode_operations;
-		inode->i_fop	= &generic_ro_fops;
+		inode->i_fop	= &afs_file_operations;
 		break;
 	case AFS_FTYPE_DIR:
 		inode->i_mode	= S_IFDIR | vnode->status.mode;
@@ -77,9 +73,9 @@ static int afs_inode_map_status(struct afs_vnode *vnode)
 
 	/* check to see whether a symbolic link is really a mountpoint */
 	if (vnode->status.type == AFS_FTYPE_SYMLINK) {
-		afs_mntpt_check_symlink(vnode);
+		afs_mntpt_check_symlink(vnode, key);
 
-		if (vnode->flags & AFS_VNODE_MOUNTPOINT) {
+		if (test_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags)) {
 			inode->i_mode	= S_IFDIR | vnode->status.mode;
 			inode->i_op	= &afs_mntpt_inode_operations;
 			inode->i_fop	= &afs_mntpt_file_operations;
@@ -87,30 +83,8 @@ static int afs_inode_map_status(struct afs_vnode *vnode)
 	}
 
 	return 0;
-} /* end afs_inode_map_status() */
+}
 
-/*****************************************************************************/
-/*
- * attempt to fetch the status of an inode, coelescing multiple simultaneous
- * fetches
- */
-static int afs_inode_fetch_status(struct inode *inode)
-{
-	struct afs_vnode *vnode;
-	int ret;
-
-	vnode = AFS_FS_I(inode);
-
-	ret = afs_vnode_fetch_status(vnode);
-
-	if (ret == 0)
-		ret = afs_inode_map_status(vnode);
-
-	return ret;
-
-} /* end afs_inode_fetch_status() */
-
-/*****************************************************************************/
 /*
  * iget5() comparator
  */
@@ -120,9 +94,8 @@ static int afs_iget5_test(struct inode *inode, void *opaque)
 
 	return inode->i_ino == data->fid.vnode &&
 		inode->i_version == data->fid.unique;
-} /* end afs_iget5_test() */
+}
 
-/*****************************************************************************/
 /*
  * iget5() inode initialiser
  */
@@ -137,14 +110,14 @@ static int afs_iget5_set(struct inode *inode, void *opaque)
 	vnode->volume = data->volume;
 
 	return 0;
-} /* end afs_iget5_set() */
+}
 
-/*****************************************************************************/
 /*
  * inode retrieval
  */
-inline int afs_iget(struct super_block *sb, struct afs_fid *fid,
-		    struct inode **_inode)
+struct inode *afs_iget(struct super_block *sb, struct key *key,
+		       struct afs_fid *fid, struct afs_file_status *status,
+		       struct afs_callback *cb)
 {
 	struct afs_iget_data data = { .fid = *fid };
 	struct afs_super_info *as;
@@ -161,20 +134,18 @@ inline int afs_iget(struct super_block *sb, struct afs_fid *fid,
 			     &data);
 	if (!inode) {
 		_leave(" = -ENOMEM");
-		return -ENOMEM;
+		return ERR_PTR(-ENOMEM);
 	}
 
+	_debug("GOT INODE %p { vl=%x vn=%x, u=%x }",
+	       inode, fid->vid, fid->vnode, fid->unique);
+
 	vnode = AFS_FS_I(inode);
 
 	/* deal with an existing inode */
 	if (!(inode->i_state & I_NEW)) {
-		ret = afs_vnode_fetch_status(vnode);
-		if (ret==0)
-			*_inode = inode;
-		else
-			iput(inode);
-		_leave(" = %d", ret);
-		return ret;
+		_leave(" = %p", inode);
+		return inode;
 	}
 
 #ifdef AFS_CACHING_SUPPORT
@@ -186,100 +157,185 @@ inline int afs_iget(struct super_block *sb, struct afs_fid *fid,
 			       &vnode->cache);
 #endif
 
-	/* okay... it's a new inode */
-	inode->i_flags |= S_NOATIME;
-	vnode->flags |= AFS_VNODE_CHANGED;
-	ret = afs_inode_fetch_status(inode);
-	if (ret<0)
+	if (!status) {
+		/* it's a remotely extant inode */
+		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		if (ret < 0)
+			goto bad_inode;
+	} else {
+		/* it's an inode we just created */
+		memcpy(&vnode->status, status, sizeof(vnode->status));
+
+		if (!cb) {
+			/* it's a symlink we just created (the fileserver
+			 * didn't give us a callback) */
+			vnode->cb_version = 0;
+			vnode->cb_expiry = 0;
+			vnode->cb_type = 0;
+			vnode->cb_expires = get_seconds();
+		} else {
+			vnode->cb_version = cb->version;
+			vnode->cb_expiry = cb->expiry;
+			vnode->cb_type = cb->type;
+			vnode->cb_expires = vnode->cb_expiry + get_seconds();
+		}
+	}
+
+	ret = afs_inode_map_status(vnode, key);
+	if (ret < 0)
 		goto bad_inode;
 
 	/* success */
+	clear_bit(AFS_VNODE_UNSET, &vnode->flags);
+	inode->i_flags |= S_NOATIME;
 	unlock_new_inode(inode);
-
-	*_inode = inode;
-	_leave(" = 0 [CB { v=%u x=%lu t=%u }]",
-	       vnode->cb_version,
-	       vnode->cb_timeout.timo_jif,
-	       vnode->cb_type);
-	return 0;
+	_leave(" = %p [CB { v=%u t=%u }]", inode, vnode->cb_version, vnode->cb_type);
+	return inode;
 
 	/* failure */
- bad_inode:
+bad_inode:
 	make_bad_inode(inode);
 	unlock_new_inode(inode);
 	iput(inode);
 
 	_leave(" = %d [bad]", ret);
+	return ERR_PTR(ret);
+}
+
+/*
+ * validate a vnode/inode
+ * - there are several things we need to check
+ *   - parent dir data changes (rm, rmdir, rename, mkdir, create, link,
+ *     symlink)
+ *   - parent dir metadata changed (security changes)
+ *   - dentry data changed (write, truncate)
+ *   - dentry metadata changed (security changes)
+ */
+int afs_validate(struct afs_vnode *vnode, struct key *key)
+{
+	int ret;
+
+	_enter("{v={%x:%u} fl=%lx},%x",
+	       vnode->fid.vid, vnode->fid.vnode, vnode->flags,
+	       key_serial(key));
+
+	if (vnode->cb_promised &&
+	    !test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+	    !test_bit(AFS_VNODE_MODIFIED, &vnode->flags) &&
+	    !test_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+		if (vnode->cb_expires < get_seconds() + 10) {
+			_debug("callback expired");
+			set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+		} else {
+			goto valid;
+		}
+	}
+
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
+		goto valid;
+
+	mutex_lock(&vnode->validate_lock);
+
+	/* if the promise has expired, we need to check the server again to get
+	 * a new promise - note that if the (parent) directory's metadata was
+	 * changed then the security may be different and we may no longer have
+	 * access */
+	if (!vnode->cb_promised ||
+	    test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags)) {
+		_debug("not promised");
+		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		if (ret < 0)
+			goto error_unlock;
+		_debug("new promise [fl=%lx]", vnode->flags);
+	}
+
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
+		_debug("file already deleted");
+		ret = -ESTALE;
+		goto error_unlock;
+	}
+
+	/* if the vnode's data version number changed then its contents are
+	 * different */
+	if (test_and_clear_bit(AFS_VNODE_ZAP_DATA, &vnode->flags)) {
+		_debug("zap data {%x:%d}", vnode->fid.vid, vnode->fid.vnode);
+		invalidate_remote_inode(&vnode->vfs_inode);
+	}
+
+	clear_bit(AFS_VNODE_MODIFIED, &vnode->flags);
+	mutex_unlock(&vnode->validate_lock);
+valid:
+	_leave(" = 0");
+	return 0;
+
+error_unlock:
+	mutex_unlock(&vnode->validate_lock);
+	_leave(" = %d", ret);
 	return ret;
-} /* end afs_iget() */
+}
 
-/*****************************************************************************/
 /*
  * read the attributes of an inode
  */
 int afs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		      struct kstat *stat)
 {
-	struct afs_vnode *vnode;
 	struct inode *inode;
-	int ret;
 
 	inode = dentry->d_inode;
 
 	_enter("{ ino=%lu v=%lu }", inode->i_ino, inode->i_version);
 
-	vnode = AFS_FS_I(inode);
-
-	ret = afs_inode_fetch_status(inode);
-	if (ret == -ENOENT) {
-		_leave(" = %d [%d %p]",
-		       ret, atomic_read(&dentry->d_count), dentry->d_inode);
-		return ret;
-	}
-	else if (ret < 0) {
-		make_bad_inode(inode);
-		_leave(" = %d", ret);
-		return ret;
-	}
-
-	/* transfer attributes from the inode structure to the stat
-	 * structure */
 	generic_fillattr(inode, stat);
-
-	_leave(" = 0 CB { v=%u x=%u t=%u }",
-	       vnode->cb_version,
-	       vnode->cb_expiry,
-	       vnode->cb_type);
-
 	return 0;
-} /* end afs_inode_getattr() */
+}
 
-/*****************************************************************************/
 /*
  * clear an AFS inode
  */
 void afs_clear_inode(struct inode *inode)
 {
+	struct afs_permits *permits;
 	struct afs_vnode *vnode;
 
 	vnode = AFS_FS_I(inode);
 
-	_enter("ino=%lu { vn=%08x v=%u x=%u t=%u }",
-	       inode->i_ino,
+	_enter("{%x:%d.%d} v=%u x=%u t=%u }",
+	       vnode->fid.vid,
 	       vnode->fid.vnode,
+	       vnode->fid.unique,
 	       vnode->cb_version,
 	       vnode->cb_expiry,
-	       vnode->cb_type
-	       );
+	       vnode->cb_type);
 
-	BUG_ON(inode->i_ino != vnode->fid.vnode);
+	_debug("CLEAR INODE %p", inode);
 
-	afs_vnode_give_up_callback(vnode);
+	ASSERTCMP(inode->i_ino, ==, vnode->fid.vnode);
+
+	afs_give_up_callback(vnode);
+
+	if (vnode->server) {
+		spin_lock(&vnode->server->fs_lock);
+		rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes);
+		spin_unlock(&vnode->server->fs_lock);
+		afs_put_server(vnode->server);
+		vnode->server = NULL;
+	}
+
+	ASSERT(!vnode->cb_promised);
 
 #ifdef AFS_CACHING_SUPPORT
 	cachefs_relinquish_cookie(vnode->cache, 0);
 	vnode->cache = NULL;
 #endif
 
+	mutex_lock(&vnode->permits_lock);
+	permits = vnode->permits;
+	rcu_assign_pointer(vnode->permits, NULL);
+	mutex_unlock(&vnode->permits_lock);
+	if (permits)
+		call_rcu(&permits->rcu, afs_zap_permits);
+
 	_leave("");
-} /* end afs_clear_inode() */
+}
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 5151d5da2c2f..6dd3197d1d8d 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -1,6 +1,6 @@
-/* internal.h: internal AFS stuff
+/* internal AFS stuff
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -9,48 +9,391 @@
  * 2 of the License, or (at your option) any later version.
  */
 
-#ifndef AFS_INTERNAL_H
-#define AFS_INTERNAL_H
-
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
+#include <linux/skbuff.h>
+#include <linux/rxrpc.h>
+#include <linux/key.h>
+#include "afs.h"
+#include "afs_vl.h"
+
+#define AFS_CELL_MAX_ADDRS 15
+
+struct afs_call;
+
+typedef enum {
+	AFS_VL_NEW,			/* new, uninitialised record */
+	AFS_VL_CREATING,		/* creating record */
+	AFS_VL_VALID,			/* record is pending */
+	AFS_VL_NO_VOLUME,		/* no such volume available */
+	AFS_VL_UPDATING,		/* update in progress */
+	AFS_VL_VOLUME_DELETED,		/* volume was deleted */
+	AFS_VL_UNCERTAIN,		/* uncertain state (update failed) */
+} __attribute__((packed)) afs_vlocation_state_t;
+
+struct afs_mount_params {
+	bool			rwpath;		/* T if the parent should be considered R/W */
+	bool			force;		/* T to force cell type */
+	afs_voltype_t		type;		/* type of volume requested */
+	int			volnamesz;	/* size of volume name */
+	const char		*volname;	/* name of volume to mount */
+	struct afs_cell		*cell;		/* cell in which to find volume */
+	struct afs_volume	*volume;	/* volume record */
+	struct key		*key;		/* key to use for secure mounting */
+};
 
 /*
- * debug tracing
+ * definition of how to wait for the completion of an operation
  */
-#define kenter(FMT, a...)	printk("==> %s("FMT")\n",__FUNCTION__ , ## a)
-#define kleave(FMT, a...)	printk("<== %s()"FMT"\n",__FUNCTION__ , ## a)
-#define kdebug(FMT, a...)	printk(FMT"\n" , ## a)
-#define kproto(FMT, a...)	printk("### "FMT"\n" , ## a)
-#define knet(FMT, a...)		printk(FMT"\n" , ## a)
-
-#ifdef __KDEBUG
-#define _enter(FMT, a...)	kenter(FMT , ## a)
-#define _leave(FMT, a...)	kleave(FMT , ## a)
-#define _debug(FMT, a...)	kdebug(FMT , ## a)
-#define _proto(FMT, a...)	kproto(FMT , ## a)
-#define _net(FMT, a...)		knet(FMT , ## a)
-#else
-#define _enter(FMT, a...)	do { } while(0)
-#define _leave(FMT, a...)	do { } while(0)
-#define _debug(FMT, a...)	do { } while(0)
-#define _proto(FMT, a...)	do { } while(0)
-#define _net(FMT, a...)		do { } while(0)
-#endif
+struct afs_wait_mode {
+	/* RxRPC received message notification */
+	void (*rx_wakeup)(struct afs_call *call);
 
-static inline void afs_discard_my_signals(void)
-{
-	while (signal_pending(current)) {
-		siginfo_t sinfo;
+	/* synchronous call waiter and call dispatched notification */
+	int (*wait)(struct afs_call *call);
+
+	/* asynchronous call completion */
+	void (*async_complete)(void *reply, int error);
+};
+
+extern const struct afs_wait_mode afs_sync_call;
+extern const struct afs_wait_mode afs_async_call;
 
-		spin_lock_irq(&current->sighand->siglock);
-		dequeue_signal(current,&current->blocked, &sinfo);
-		spin_unlock_irq(&current->sighand->siglock);
-	}
+/*
+ * a record of an in-progress RxRPC call
+ */
+struct afs_call {
+	const struct afs_call_type *type;	/* type of call */
+	const struct afs_wait_mode *wait_mode;	/* completion wait mode */
+	wait_queue_head_t	waitq;		/* processes awaiting completion */
+	struct work_struct	async_work;	/* asynchronous work processor */
+	struct work_struct	work;		/* actual work processor */
+	struct sk_buff_head	rx_queue;	/* received packets */
+	struct rxrpc_call	*rxcall;	/* RxRPC call handle */
+	struct key		*key;		/* security for this call */
+	struct afs_server	*server;	/* server affected by incoming CM call */
+	void			*request;	/* request data (first part) */
+	void			*request2;	/* request data (second part) */
+	void			*buffer;	/* reply receive buffer */
+	void			*reply;		/* reply buffer (first part) */
+	void			*reply2;	/* reply buffer (second part) */
+	void			*reply3;	/* reply buffer (third part) */
+	void			*reply4;	/* reply buffer (fourth part) */
+	enum {					/* call state */
+		AFS_CALL_REQUESTING,	/* request is being sent for outgoing call */
+		AFS_CALL_AWAIT_REPLY,	/* awaiting reply to outgoing call */
+		AFS_CALL_AWAIT_OP_ID,	/* awaiting op ID on incoming call */
+		AFS_CALL_AWAIT_REQUEST,	/* awaiting request data on incoming call */
+		AFS_CALL_REPLYING,	/* replying to incoming call */
+		AFS_CALL_AWAIT_ACK,	/* awaiting final ACK of incoming call */
+		AFS_CALL_COMPLETE,	/* successfully completed */
+		AFS_CALL_BUSY,		/* server was busy */
+		AFS_CALL_ABORTED,	/* call was aborted */
+		AFS_CALL_ERROR,		/* call failed due to error */
+	}			state;
+	int			error;		/* error code */
+	unsigned		request_size;	/* size of request data */
+	unsigned		reply_max;	/* maximum size of reply */
+	unsigned		reply_size;	/* current size of reply */
+	unsigned short		offset;		/* offset into received data store */
+	unsigned char		unmarshall;	/* unmarshalling phase */
+	bool			incoming;	/* T if incoming call */
+	u16			service_id;	/* RxRPC service ID to call */
+	__be16			port;		/* target UDP port */
+	__be32			operation_ID;	/* operation ID for an incoming call */
+	u32			count;		/* count for use in unmarshalling */
+	__be32			tmp;		/* place to extract temporary data */
+};
+
+struct afs_call_type {
+	const char *name;
+
+	/* deliver request or reply data to an call
+	 * - returning an error will cause the call to be aborted
+	 */
+	int (*deliver)(struct afs_call *call, struct sk_buff *skb,
+		       bool last);
+
+	/* map an abort code to an error number */
+	int (*abort_to_error)(u32 abort_code);
+
+	/* clean up a call */
+	void (*destructor)(struct afs_call *call);
+};
+
+/*
+ * AFS superblock private data
+ * - there's one superblock per volume
+ */
+struct afs_super_info {
+	struct afs_volume	*volume;	/* volume record */
+	char			rwparent;	/* T if parent is R/W AFS volume */
+};
+
+static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
+{
+	return sb->s_fs_info;
 }
 
+extern struct file_system_type afs_fs_type;
+
+/*
+ * entry in the cached cell catalogue
+ */
+struct afs_cache_cell {
+	char		name[AFS_MAXCELLNAME];	/* cell name (padded with NULs) */
+	struct in_addr	vl_servers[15];		/* cached cell VL servers */
+};
+
+/*
+ * AFS cell record
+ */
+struct afs_cell {
+	atomic_t		usage;
+	struct list_head	link;		/* main cell list link */
+	struct key		*anonymous_key;	/* anonymous user key for this cell */
+	struct list_head	proc_link;	/* /proc cell list link */
+	struct proc_dir_entry	*proc_dir;	/* /proc dir for this cell */
+#ifdef AFS_CACHING_SUPPORT
+	struct cachefs_cookie	*cache;		/* caching cookie */
+#endif
+
+	/* server record management */
+	rwlock_t		servers_lock;	/* active server list lock */
+	struct list_head	servers;	/* active server list */
+
+	/* volume location record management */
+	struct rw_semaphore	vl_sem;		/* volume management serialisation semaphore */
+	struct list_head	vl_list;	/* cell's active VL record list */
+	spinlock_t		vl_lock;	/* vl_list lock */
+	unsigned short		vl_naddrs;	/* number of VL servers in addr list */
+	unsigned short		vl_curr_svix;	/* current server index */
+	struct in_addr		vl_addrs[AFS_CELL_MAX_ADDRS];	/* cell VL server addresses */
+
+	char			name[0];	/* cell name - must go last */
+};
+
+/*
+ * entry in the cached volume location catalogue
+ */
+struct afs_cache_vlocation {
+	/* volume name (lowercase, padded with NULs) */
+	uint8_t			name[AFS_MAXVOLNAME + 1];
+
+	uint8_t			nservers;	/* number of entries used in servers[] */
+	uint8_t			vidmask;	/* voltype mask for vid[] */
+	uint8_t			srvtmask[8];	/* voltype masks for servers[] */
+#define AFS_VOL_VTM_RW	0x01 /* R/W version of the volume is available (on this server) */
+#define AFS_VOL_VTM_RO	0x02 /* R/O version of the volume is available (on this server) */
+#define AFS_VOL_VTM_BAK	0x04 /* backup version of the volume is available (on this server) */
+
+	afs_volid_t		vid[3];		/* volume IDs for R/W, R/O and Bak volumes */
+	struct in_addr		servers[8];	/* fileserver addresses */
+	time_t			rtime;		/* last retrieval time */
+};
+
+/*
+ * volume -> vnode hash table entry
+ */
+struct afs_cache_vhash {
+	afs_voltype_t		vtype;		/* which volume variation */
+	uint8_t			hash_bucket;	/* which hash bucket this represents */
+} __attribute__((packed));
+
+/*
+ * AFS volume location record
+ */
+struct afs_vlocation {
+	atomic_t		usage;
+	time_t			time_of_death;	/* time at which put reduced usage to 0 */
+	struct list_head	link;		/* link in cell volume location list */
+	struct list_head	grave;		/* link in master graveyard list */
+	struct list_head	update;		/* link in master update list */
+	struct afs_cell		*cell;		/* cell to which volume belongs */
+#ifdef AFS_CACHING_SUPPORT
+	struct cachefs_cookie	*cache;		/* caching cookie */
+#endif
+	struct afs_cache_vlocation vldb;	/* volume information DB record */
+	struct afs_volume	*vols[3];	/* volume access record pointer (index by type) */
+	wait_queue_head_t	waitq;		/* status change waitqueue */
+	time_t			update_at;	/* time at which record should be updated */
+	spinlock_t		lock;		/* access lock */
+	afs_vlocation_state_t	state;		/* volume location state */
+	unsigned short		upd_rej_cnt;	/* ENOMEDIUM count during update */
+	unsigned short		upd_busy_cnt;	/* EBUSY count during update */
+	bool			valid;		/* T if valid */
+};
+
+/*
+ * AFS fileserver record
+ */
+struct afs_server {
+	atomic_t		usage;
+	time_t			time_of_death;	/* time at which put reduced usage to 0 */
+	struct in_addr		addr;		/* server address */
+	struct afs_cell		*cell;		/* cell in which server resides */
+	struct list_head	link;		/* link in cell's server list */
+	struct list_head	grave;		/* link in master graveyard list */
+	struct rb_node		master_rb;	/* link in master by-addr tree */
+	struct rw_semaphore	sem;		/* access lock */
+
+	/* file service access */
+	struct rb_root		fs_vnodes;	/* vnodes backed by this server (ordered by FID) */
+	unsigned long		fs_act_jif;	/* time at which last activity occurred */
+	unsigned long		fs_dead_jif;	/* time at which no longer to be considered dead */
+	spinlock_t		fs_lock;	/* access lock */
+	int			fs_state;      	/* 0 or reason FS currently marked dead (-errno) */
+
+	/* callback promise management */
+	struct rb_root		cb_promises;	/* vnode expiration list (ordered earliest first) */
+	struct delayed_work	cb_updater;	/* callback updater */
+	struct delayed_work	cb_break_work;	/* collected break dispatcher */
+	wait_queue_head_t	cb_break_waitq;	/* space available in cb_break waitqueue */
+	spinlock_t		cb_lock;	/* access lock */
+	struct afs_callback	cb_break[64];	/* ring of callbacks awaiting breaking */
+	atomic_t		cb_break_n;	/* number of pending breaks */
+	u8			cb_break_head;	/* head of callback breaking ring */
+	u8			cb_break_tail;	/* tail of callback breaking ring */
+};
+
+/*
+ * AFS volume access record
+ */
+struct afs_volume {
+	atomic_t		usage;
+	struct afs_cell		*cell;		/* cell to which belongs (unrefd ptr) */
+	struct afs_vlocation	*vlocation;	/* volume location */
+#ifdef AFS_CACHING_SUPPORT
+	struct cachefs_cookie	*cache;		/* caching cookie */
+#endif
+	afs_volid_t		vid;		/* volume ID */
+	afs_voltype_t		type;		/* type of volume */
+	char			type_force;	/* force volume type (suppress R/O -> R/W) */
+	unsigned short		nservers;	/* number of server slots filled */
+	unsigned short		rjservers;	/* number of servers discarded due to -ENOMEDIUM */
+	struct afs_server	*servers[8];	/* servers on which volume resides (ordered) */
+	struct rw_semaphore	server_sem;	/* lock for accessing current server */
+};
+
+/*
+ * vnode catalogue entry
+ */
+struct afs_cache_vnode {
+	afs_vnodeid_t		vnode_id;	/* vnode ID */
+	unsigned		vnode_unique;	/* vnode ID uniquifier */
+	afs_dataversion_t	data_version;	/* data version */
+};
+
+/*
+ * AFS inode private data
+ */
+struct afs_vnode {
+	struct inode		vfs_inode;	/* the VFS's inode record */
+
+	struct afs_volume	*volume;	/* volume on which vnode resides */
+	struct afs_server	*server;	/* server currently supplying this file */
+	struct afs_fid		fid;		/* the file identifier for this inode */
+	struct afs_file_status	status;		/* AFS status info for this file */
+#ifdef AFS_CACHING_SUPPORT
+	struct cachefs_cookie	*cache;		/* caching cookie */
+#endif
+	struct afs_permits	*permits;	/* cache of permits so far obtained */
+	struct mutex		permits_lock;	/* lock for altering permits list */
+	struct mutex		validate_lock;	/* lock for validating this vnode */
+	wait_queue_head_t	update_waitq;	/* status fetch waitqueue */
+	int			update_cnt;	/* number of outstanding ops that will update the
+						 * status */
+	spinlock_t		lock;		/* waitqueue/flags lock */
+	unsigned long		flags;
+#define AFS_VNODE_CB_BROKEN	0		/* set if vnode's callback was broken */
+#define AFS_VNODE_UNSET		1		/* set if vnode attributes not yet set */
+#define AFS_VNODE_MODIFIED	2		/* set if vnode's data modified */
+#define AFS_VNODE_ZAP_DATA	3		/* set if vnode's data should be invalidated */
+#define AFS_VNODE_DELETED	4		/* set if vnode deleted on server */
+#define AFS_VNODE_MOUNTPOINT	5		/* set if vnode is a mountpoint symlink */
+
+	long			acl_order;	/* ACL check count (callback break count) */
+
+	/* outstanding callback notification on this file */
+	struct rb_node		server_rb;	/* link in server->fs_vnodes */
+	struct rb_node		cb_promise;	/* link in server->cb_promises */
+	struct work_struct	cb_broken_work;	/* work to be done on callback break */
+	time_t			cb_expires;	/* time at which callback expires */
+	time_t			cb_expires_at;	/* time used to order cb_promise */
+	unsigned		cb_version;	/* callback version */
+	unsigned		cb_expiry;	/* callback expiry time */
+	afs_callback_type_t	cb_type;	/* type of callback */
+	bool			cb_promised;	/* true if promise still holds */
+};
+
+/*
+ * cached security record for one user's attempt to access a vnode
+ */
+struct afs_permit {
+	struct key		*key;		/* RxRPC ticket holding a security context */
+	afs_access_t		access_mask;	/* access mask for this key */
+};
+
+/*
+ * cache of security records from attempts to access a vnode
+ */
+struct afs_permits {
+	struct rcu_head		rcu;		/* disposal procedure */
+	int			count;		/* number of records */
+	struct afs_permit	permits[0];	/* the permits so far examined */
+};
+
+/*
+ * record of one of a system's set of network interfaces
+ */
+struct afs_interface {
+	unsigned	index;		/* interface index */
+	struct in_addr	address;	/* IPv4 address bound to interface */
+	struct in_addr	netmask;	/* netmask applied to address */
+	unsigned	mtu;		/* MTU of interface */
+};
+
+/*
+ * UUID definition [internet draft]
+ * - the timestamp is a 60-bit value, split 32/16/12, and goes in 100ns
+ *   increments since midnight 15th October 1582
+ *   - add AFS_UUID_TO_UNIX_TIME to convert unix time in 100ns units to UUID
+ *     time
+ * - the clock sequence is a 14-bit counter to avoid duplicate times
+ */
+struct afs_uuid {
+	u32		time_low;			/* low part of timestamp */
+	u16		time_mid;			/* mid part of timestamp */
+	u16		time_hi_and_version;		/* high part of timestamp and version  */
+#define AFS_UUID_TO_UNIX_TIME	0x01b21dd213814000
+#define AFS_UUID_TIMEHI_MASK	0x0fff
+#define AFS_UUID_VERSION_TIME	0x1000	/* time-based UUID */
+#define AFS_UUID_VERSION_NAME	0x3000	/* name-based UUID */
+#define AFS_UUID_VERSION_RANDOM	0x4000	/* (pseudo-)random generated UUID */
+	u8		clock_seq_hi_and_reserved;	/* clock seq hi and variant */
+#define AFS_UUID_CLOCKHI_MASK	0x3f
+#define AFS_UUID_VARIANT_STD	0x80
+	u8		clock_seq_low;			/* clock seq low */
+	u8		node[6];			/* spatially unique node ID (MAC addr) */
+};
+
+/*****************************************************************************/
+/*
+ * callback.c
+ */
+extern void afs_init_callback_state(struct afs_server *);
+extern void afs_broken_callback_work(struct work_struct *);
+extern void afs_break_callbacks(struct afs_server *, size_t,
+				struct afs_callback[]);
+extern void afs_discard_callback_on_delete(struct afs_vnode *);
+extern void afs_give_up_callback(struct afs_vnode *);
+extern void afs_dispatch_give_up_callbacks(struct work_struct *);
+extern void afs_flush_callback_breaks(struct afs_server *);
+extern int __init afs_callback_update_init(void);
+extern void __exit afs_callback_update_kill(void);
+
 /*
  * cell.c
  */
@@ -60,57 +403,156 @@ extern struct list_head afs_proc_cells;
 extern struct cachefs_index_def afs_cache_cell_index_def;
 #endif
 
+#define afs_get_cell(C) do { atomic_inc(&(C)->usage); } while(0)
+extern int afs_cell_init(char *);
+extern struct afs_cell *afs_cell_create(const char *, char *);
+extern struct afs_cell *afs_cell_lookup(const char *, unsigned);
+extern struct afs_cell *afs_grab_cell(struct afs_cell *);
+extern void afs_put_cell(struct afs_cell *);
+extern void afs_cell_purge(void);
+
+/*
+ * cmservice.c
+ */
+extern bool afs_cm_incoming_call(struct afs_call *);
+
 /*
  * dir.c
  */
 extern const struct inode_operations afs_dir_inode_operations;
 extern const struct file_operations afs_dir_file_operations;
 
+extern int afs_permission(struct inode *, int, struct nameidata *);
+
 /*
  * file.c
  */
 extern const struct address_space_operations afs_fs_aops;
 extern const struct inode_operations afs_file_inode_operations;
+extern const struct file_operations afs_file_operations;
+
+extern int afs_open(struct inode *, struct file *);
+extern int afs_release(struct inode *, struct file *);
 
 #ifdef AFS_CACHING_SUPPORT
-extern int afs_cache_get_page_cookie(struct page *page,
-				     struct cachefs_page **_page_cookie);
+extern int afs_cache_get_page_cookie(struct page *, struct cachefs_page **);
 #endif
 
 /*
- * inode.c
+ * fsclient.c
  */
-extern int afs_iget(struct super_block *sb, struct afs_fid *fid,
-		    struct inode **_inode);
-extern int afs_inode_getattr(struct vfsmount *mnt, struct dentry *dentry,
-			     struct kstat *stat);
-extern void afs_clear_inode(struct inode *inode);
+extern int afs_fs_fetch_file_status(struct afs_server *, struct key *,
+				    struct afs_vnode *, struct afs_volsync *,
+				    const struct afs_wait_mode *);
+extern int afs_fs_give_up_callbacks(struct afs_server *,
+				    const struct afs_wait_mode *);
+extern int afs_fs_fetch_data(struct afs_server *, struct key *,
+			     struct afs_vnode *, off_t, size_t, struct page *,
+			     const struct afs_wait_mode *);
+extern int afs_fs_create(struct afs_server *, struct key *,
+			 struct afs_vnode *, const char *, umode_t,
+			 struct afs_fid *, struct afs_file_status *,
+			 struct afs_callback *,
+			 const struct afs_wait_mode *);
+extern int afs_fs_remove(struct afs_server *, struct key *,
+			 struct afs_vnode *, const char *, bool,
+			 const struct afs_wait_mode *);
+extern int afs_fs_link(struct afs_server *, struct key *, struct afs_vnode *,
+		       struct afs_vnode *, const char *,
+		       const struct afs_wait_mode *);
+extern int afs_fs_symlink(struct afs_server *, struct key *,
+			  struct afs_vnode *, const char *, const char *,
+			  struct afs_fid *, struct afs_file_status *,
+			  const struct afs_wait_mode *);
+extern int afs_fs_rename(struct afs_server *, struct key *,
+			 struct afs_vnode *, const char *,
+			 struct afs_vnode *, const char *,
+			 const struct afs_wait_mode *);
 
 /*
- * key_afs.c
+ * inode.c
  */
-#ifdef CONFIG_KEYS
-extern int afs_key_register(void);
-extern void afs_key_unregister(void);
-#endif
+extern struct inode *afs_iget(struct super_block *, struct key *,
+			      struct afs_fid *, struct afs_file_status *,
+			      struct afs_callback *);
+extern int afs_validate(struct afs_vnode *, struct key *);
+extern int afs_inode_getattr(struct vfsmount *, struct dentry *,
+			     struct kstat *);
+extern void afs_zap_permits(struct rcu_head *);
+extern void afs_clear_inode(struct inode *);
 
 /*
  * main.c
  */
+extern struct afs_uuid afs_uuid;
 #ifdef AFS_CACHING_SUPPORT
 extern struct cachefs_netfs afs_cache_netfs;
 #endif
 
 /*
+ * misc.c
+ */
+extern int afs_abort_to_error(u32);
+
+/*
  * mntpt.c
  */
 extern const struct inode_operations afs_mntpt_inode_operations;
 extern const struct file_operations afs_mntpt_file_operations;
-extern struct afs_timer afs_mntpt_expiry_timer;
-extern struct afs_timer_ops afs_mntpt_expiry_timer_ops;
 extern unsigned long afs_mntpt_expiry_timeout;
 
-extern int afs_mntpt_check_symlink(struct afs_vnode *vnode);
+extern int afs_mntpt_check_symlink(struct afs_vnode *, struct key *);
+extern void afs_mntpt_kill_timer(void);
+extern void afs_umount_begin(struct vfsmount *, int);
+
+/*
+ * proc.c
+ */
+extern int afs_proc_init(void);
+extern void afs_proc_cleanup(void);
+extern int afs_proc_cell_setup(struct afs_cell *);
+extern void afs_proc_cell_remove(struct afs_cell *);
+
+/*
+ * rxrpc.c
+ */
+extern int afs_open_socket(void);
+extern void afs_close_socket(void);
+extern int afs_make_call(struct in_addr *, struct afs_call *, gfp_t,
+			 const struct afs_wait_mode *);
+extern struct afs_call *afs_alloc_flat_call(const struct afs_call_type *,
+					    size_t, size_t);
+extern void afs_flat_call_destructor(struct afs_call *);
+extern void afs_transfer_reply(struct afs_call *, struct sk_buff *);
+extern void afs_send_empty_reply(struct afs_call *);
+extern void afs_send_simple_reply(struct afs_call *, const void *, size_t);
+extern int afs_extract_data(struct afs_call *, struct sk_buff *, bool, void *,
+			    size_t);
+
+/*
+ * security.c
+ */
+extern void afs_clear_permits(struct afs_vnode *);
+extern void afs_cache_permit(struct afs_vnode *, struct key *, long);
+extern struct key *afs_request_key(struct afs_cell *);
+extern int afs_permission(struct inode *, int, struct nameidata *);
+
+/*
+ * server.c
+ */
+extern spinlock_t afs_server_peer_lock;
+
+#define afs_get_server(S)					\
+do {								\
+	_debug("GET SERVER %d", atomic_read(&(S)->usage));	\
+	atomic_inc(&(S)->usage);				\
+} while(0)
+
+extern struct afs_server *afs_lookup_server(struct afs_cell *,
+					    const struct in_addr *);
+extern struct afs_server *afs_find_server(const struct in_addr *);
+extern void afs_put_server(struct afs_server *);
+extern void __exit afs_purge_servers(void);
 
 /*
  * super.c
@@ -118,22 +560,211 @@ extern int afs_mntpt_check_symlink(struct afs_vnode *vnode);
 extern int afs_fs_init(void);
 extern void afs_fs_exit(void);
 
-#define AFS_CB_HASH_COUNT (PAGE_SIZE / sizeof(struct list_head))
+/*
+ * use-rtnetlink.c
+ */
+extern int afs_get_ipv4_interfaces(struct afs_interface *, size_t, bool);
+extern int afs_get_MAC_address(u8 [6]);
 
-extern struct list_head afs_cb_hash_tbl[];
-extern spinlock_t afs_cb_hash_lock;
+/*
+ * vlclient.c
+ */
+#ifdef AFS_CACHING_SUPPORT
+extern struct cachefs_index_def afs_vlocation_cache_index_def;
+#endif
 
-#define afs_cb_hash(SRV,FID) \
-	afs_cb_hash_tbl[((unsigned long)(SRV) + \
-			(FID)->vid + (FID)->vnode + (FID)->unique) % \
-			AFS_CB_HASH_COUNT]
+extern int afs_vl_get_entry_by_name(struct in_addr *, struct key *,
+				    const char *, struct afs_cache_vlocation *,
+				    const struct afs_wait_mode *);
+extern int afs_vl_get_entry_by_id(struct in_addr *, struct key *,
+				  afs_volid_t, afs_voltype_t,
+				  struct afs_cache_vlocation *,
+				  const struct afs_wait_mode *);
 
 /*
- * proc.c
+ * vlocation.c
  */
-extern int afs_proc_init(void);
-extern void afs_proc_cleanup(void);
-extern int afs_proc_cell_setup(struct afs_cell *cell);
-extern void afs_proc_cell_remove(struct afs_cell *cell);
+#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0)
+
+extern int __init afs_vlocation_update_init(void);
+extern struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *,
+						  struct key *,
+						  const char *, size_t);
+extern void afs_put_vlocation(struct afs_vlocation *);
+extern void __exit afs_vlocation_purge(void);
+
+/*
+ * vnode.c
+ */
+#ifdef AFS_CACHING_SUPPORT
+extern struct cachefs_index_def afs_vnode_cache_index_def;
+#endif
+
+extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
+
+static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
+{
+	return container_of(inode, struct afs_vnode, vfs_inode);
+}
+
+static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
+{
+	return &vnode->vfs_inode;
+}
+
+extern void afs_vnode_finalise_status_update(struct afs_vnode *,
+					     struct afs_server *);
+extern int afs_vnode_fetch_status(struct afs_vnode *, struct afs_vnode *,
+				  struct key *);
+extern int afs_vnode_fetch_data(struct afs_vnode *, struct key *,
+				off_t, size_t, struct page *);
+extern int afs_vnode_create(struct afs_vnode *, struct key *, const char *,
+			    umode_t, struct afs_fid *, struct afs_file_status *,
+			    struct afs_callback *, struct afs_server **);
+extern int afs_vnode_remove(struct afs_vnode *, struct key *, const char *,
+			    bool);
+extern int afs_vnode_link(struct afs_vnode *, struct afs_vnode *, struct key *,
+			  const char *);
+extern int afs_vnode_symlink(struct afs_vnode *, struct key *, const char *,
+			     const char *, struct afs_fid *,
+			     struct afs_file_status *, struct afs_server **);
+extern int afs_vnode_rename(struct afs_vnode *, struct afs_vnode *,
+			    struct key *, const char *, const char *);
+
+/*
+ * volume.c
+ */
+#ifdef AFS_CACHING_SUPPORT
+extern struct cachefs_index_def afs_volume_cache_index_def;
+#endif
+
+#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
+
+extern void afs_put_volume(struct afs_volume *);
+extern struct afs_volume *afs_volume_lookup(struct afs_mount_params *);
+extern struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *);
+extern int afs_volume_release_fileserver(struct afs_vnode *,
+					 struct afs_server *, int);
+
+/*****************************************************************************/
+/*
+ * debug tracing
+ */
+extern unsigned afs_debug;
+
+#define dbgprintk(FMT,...) \
+	printk("[%x%-6.6s] "FMT"\n", smp_processor_id(), current->comm ,##__VA_ARGS__)
+
+/* make sure we maintain the format strings, even when debugging is disabled */
+static inline __attribute__((format(printf,1,2)))
+void _dbprintk(const char *fmt, ...)
+{
+}
+
+#define kenter(FMT,...)	dbgprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__)
+#define kleave(FMT,...)	dbgprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__)
+#define kdebug(FMT,...)	dbgprintk("    "FMT ,##__VA_ARGS__)
+
+
+#if defined(__KDEBUG)
+#define _enter(FMT,...)	kenter(FMT,##__VA_ARGS__)
+#define _leave(FMT,...)	kleave(FMT,##__VA_ARGS__)
+#define _debug(FMT,...)	kdebug(FMT,##__VA_ARGS__)
+
+#elif defined(CONFIG_AFS_DEBUG)
+#define AFS_DEBUG_KENTER	0x01
+#define AFS_DEBUG_KLEAVE	0x02
+#define AFS_DEBUG_KDEBUG	0x04
+
+#define _enter(FMT,...)					\
+do {							\
+	if (unlikely(afs_debug & AFS_DEBUG_KENTER))	\
+		kenter(FMT,##__VA_ARGS__);		\
+} while (0)
+
+#define _leave(FMT,...)					\
+do {							\
+	if (unlikely(afs_debug & AFS_DEBUG_KLEAVE))	\
+		kleave(FMT,##__VA_ARGS__);		\
+} while (0)
+
+#define _debug(FMT,...)					\
+do {							\
+	if (unlikely(afs_debug & AFS_DEBUG_KDEBUG))	\
+		kdebug(FMT,##__VA_ARGS__);		\
+} while (0)
+
+#else
+#define _enter(FMT,...)	_dbprintk("==> %s("FMT")",__FUNCTION__ ,##__VA_ARGS__)
+#define _leave(FMT,...)	_dbprintk("<== %s()"FMT"",__FUNCTION__ ,##__VA_ARGS__)
+#define _debug(FMT,...)	_dbprintk("    "FMT ,##__VA_ARGS__)
+#endif
+
+/*
+ * debug assertion checking
+ */
+#if 1 // defined(__KDEBUGALL)
+
+#define ASSERT(X)						\
+do {								\
+	if (unlikely(!(X))) {					\
+		printk(KERN_ERR "\n");				\
+		printk(KERN_ERR "AFS: Assertion failed\n");	\
+		BUG();						\
+	}							\
+} while(0)
+
+#define ASSERTCMP(X, OP, Y)						\
+do {									\
+	if (unlikely(!((X) OP (Y)))) {					\
+		printk(KERN_ERR "\n");					\
+		printk(KERN_ERR "AFS: Assertion failed\n");		\
+		printk(KERN_ERR "%lu " #OP " %lu is false\n",		\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n",	\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		BUG();							\
+	}								\
+} while(0)
+
+#define ASSERTIF(C, X)						\
+do {								\
+	if (unlikely((C) && !(X))) {				\
+		printk(KERN_ERR "\n");				\
+		printk(KERN_ERR "AFS: Assertion failed\n");	\
+		BUG();						\
+	}							\
+} while(0)
+
+#define ASSERTIFCMP(C, X, OP, Y)					\
+do {									\
+	if (unlikely((C) && !((X) OP (Y)))) {				\
+		printk(KERN_ERR "\n");					\
+		printk(KERN_ERR "AFS: Assertion failed\n");		\
+		printk(KERN_ERR "%lu " #OP " %lu is false\n",		\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		printk(KERN_ERR "0x%lx " #OP " 0x%lx is false\n",	\
+		       (unsigned long)(X), (unsigned long)(Y));		\
+		BUG();							\
+	}								\
+} while(0)
+
+#else
+
+#define ASSERT(X)				\
+do {						\
+} while(0)
+
+#define ASSERTCMP(X, OP, Y)			\
+do {						\
+} while(0)
+
+#define ASSERTIF(C, X)				\
+do {						\
+} while(0)
+
+#define ASSERTIFCMP(C, X, OP, Y)		\
+do {						\
+} while(0)
 
-#endif /* AFS_INTERNAL_H */
+#endif /* __KDEBUGALL */
diff --git a/fs/afs/kafsasyncd.c b/fs/afs/kafsasyncd.c
deleted file mode 100644
index 615df2407cb2..000000000000
--- a/fs/afs/kafsasyncd.c
+++ /dev/null
@@ -1,255 +0,0 @@
-/* kafsasyncd.c: AFS asynchronous operation daemon
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- *
- *
- * The AFS async daemon is used to the following:
- * - probe "dead" servers to see whether they've come back to life yet.
- * - probe "live" servers that we haven't talked to for a while to see if they are better
- *   candidates for serving than what we're currently using
- * - poll volume location servers to keep up to date volume location lists
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/completion.h>
-#include <linux/freezer.h>
-#include "cell.h"
-#include "server.h"
-#include "volume.h"
-#include "kafsasyncd.h"
-#include "kafstimod.h"
-#include <rxrpc/call.h>
-#include <asm/errno.h>
-#include "internal.h"
-
-static DECLARE_COMPLETION(kafsasyncd_alive);
-static DECLARE_COMPLETION(kafsasyncd_dead);
-static DECLARE_WAIT_QUEUE_HEAD(kafsasyncd_sleepq);
-static struct task_struct *kafsasyncd_task;
-static int kafsasyncd_die;
-
-static int kafsasyncd(void *arg);
-
-static LIST_HEAD(kafsasyncd_async_attnq);
-static LIST_HEAD(kafsasyncd_async_busyq);
-static DEFINE_SPINLOCK(kafsasyncd_async_lock);
-
-static void kafsasyncd_null_call_attn_func(struct rxrpc_call *call)
-{
-}
-
-static void kafsasyncd_null_call_error_func(struct rxrpc_call *call)
-{
-}
-
-/*****************************************************************************/
-/*
- * start the async daemon
- */
-int afs_kafsasyncd_start(void)
-{
-	int ret;
-
-	ret = kernel_thread(kafsasyncd, NULL, 0);
-	if (ret < 0)
-		return ret;
-
-	wait_for_completion(&kafsasyncd_alive);
-
-	return ret;
-} /* end afs_kafsasyncd_start() */
-
-/*****************************************************************************/
-/*
- * stop the async daemon
- */
-void afs_kafsasyncd_stop(void)
-{
-	/* get rid of my daemon */
-	kafsasyncd_die = 1;
-	wake_up(&kafsasyncd_sleepq);
-	wait_for_completion(&kafsasyncd_dead);
-
-} /* end afs_kafsasyncd_stop() */
-
-/*****************************************************************************/
-/*
- * probing daemon
- */
-static int kafsasyncd(void *arg)
-{
-	struct afs_async_op *op;
-	int die;
-
-	DECLARE_WAITQUEUE(myself, current);
-
-	kafsasyncd_task = current;
-
-	printk("kAFS: Started kafsasyncd %d\n", current->pid);
-
-	daemonize("kafsasyncd");
-
-	complete(&kafsasyncd_alive);
-
-	/* loop around looking for things to attend to */
-	do {
-		set_current_state(TASK_INTERRUPTIBLE);
-		add_wait_queue(&kafsasyncd_sleepq, &myself);
-
-		for (;;) {
-			if (!list_empty(&kafsasyncd_async_attnq) ||
-			    signal_pending(current) ||
-			    kafsasyncd_die)
-				break;
-
-			schedule();
-			set_current_state(TASK_INTERRUPTIBLE);
-		}
-
-		remove_wait_queue(&kafsasyncd_sleepq, &myself);
-		set_current_state(TASK_RUNNING);
-
-		try_to_freeze();
-
-		/* discard pending signals */
-		afs_discard_my_signals();
-
-		die = kafsasyncd_die;
-
-		/* deal with the next asynchronous operation requiring
-		 * attention */
-		if (!list_empty(&kafsasyncd_async_attnq)) {
-			struct afs_async_op *op;
-
-			_debug("@@@ Begin Asynchronous Operation");
-
-			op = NULL;
-			spin_lock(&kafsasyncd_async_lock);
-
-			if (!list_empty(&kafsasyncd_async_attnq)) {
-				op = list_entry(kafsasyncd_async_attnq.next,
-						struct afs_async_op, link);
-				list_move_tail(&op->link,
-					      &kafsasyncd_async_busyq);
-			}
-
-			spin_unlock(&kafsasyncd_async_lock);
-
-			_debug("@@@ Operation %p {%p}\n",
-			       op, op ? op->ops : NULL);
-
-			if (op)
-				op->ops->attend(op);
-
-			_debug("@@@ End Asynchronous Operation");
-		}
-
-	} while(!die);
-
-	/* need to kill all outstanding asynchronous operations before
-	 * exiting */
-	kafsasyncd_task = NULL;
-	spin_lock(&kafsasyncd_async_lock);
-
-	/* fold the busy and attention queues together */
-	list_splice_init(&kafsasyncd_async_busyq,
-			 &kafsasyncd_async_attnq);
-
-	/* dequeue kafsasyncd from all their wait queues */
-	list_for_each_entry(op, &kafsasyncd_async_attnq, link) {
-		op->call->app_attn_func = kafsasyncd_null_call_attn_func;
-		op->call->app_error_func = kafsasyncd_null_call_error_func;
-		remove_wait_queue(&op->call->waitq, &op->waiter);
-	}
-
-	spin_unlock(&kafsasyncd_async_lock);
-
-	/* abort all the operations */
-	while (!list_empty(&kafsasyncd_async_attnq)) {
-		op = list_entry(kafsasyncd_async_attnq.next, struct afs_async_op, link);
-		list_del_init(&op->link);
-
-		rxrpc_call_abort(op->call, -EIO);
-		rxrpc_put_call(op->call);
-		op->call = NULL;
-
-		op->ops->discard(op);
-	}
-
-	/* and that's all */
-	_leave("");
-	complete_and_exit(&kafsasyncd_dead, 0);
-
-} /* end kafsasyncd() */
-
-/*****************************************************************************/
-/*
- * begin an operation
- * - place operation on busy queue
- */
-void afs_kafsasyncd_begin_op(struct afs_async_op *op)
-{
-	_enter("");
-
-	spin_lock(&kafsasyncd_async_lock);
-
-	init_waitqueue_entry(&op->waiter, kafsasyncd_task);
-	add_wait_queue(&op->call->waitq, &op->waiter);
-
-	list_move_tail(&op->link, &kafsasyncd_async_busyq);
-
-	spin_unlock(&kafsasyncd_async_lock);
-
-	_leave("");
-} /* end afs_kafsasyncd_begin_op() */
-
-/*****************************************************************************/
-/*
- * request attention for an operation
- * - move to attention queue
- */
-void afs_kafsasyncd_attend_op(struct afs_async_op *op)
-{
-	_enter("");
-
-	spin_lock(&kafsasyncd_async_lock);
-
-	list_move_tail(&op->link, &kafsasyncd_async_attnq);
-
-	spin_unlock(&kafsasyncd_async_lock);
-
-	wake_up(&kafsasyncd_sleepq);
-
-	_leave("");
-} /* end afs_kafsasyncd_attend_op() */
-
-/*****************************************************************************/
-/*
- * terminate an operation
- * - remove from either queue
- */
-void afs_kafsasyncd_terminate_op(struct afs_async_op *op)
-{
-	_enter("");
-
-	spin_lock(&kafsasyncd_async_lock);
-
-	if (!list_empty(&op->link)) {
-		list_del_init(&op->link);
-		remove_wait_queue(&op->call->waitq, &op->waiter);
-	}
-
-	spin_unlock(&kafsasyncd_async_lock);
-
-	wake_up(&kafsasyncd_sleepq);
-
-	_leave("");
-} /* end afs_kafsasyncd_terminate_op() */
diff --git a/fs/afs/kafsasyncd.h b/fs/afs/kafsasyncd.h
deleted file mode 100644
index 791803f9a6fb..000000000000
--- a/fs/afs/kafsasyncd.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/* kafsasyncd.h: AFS asynchronous operation daemon
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_KAFSASYNCD_H
-#define _LINUX_AFS_KAFSASYNCD_H
-
-#include "types.h"
-
-struct afs_async_op;
-
-struct afs_async_op_ops {
-	void (*attend)(struct afs_async_op *op);
-	void (*discard)(struct afs_async_op *op);
-};
-
-/*****************************************************************************/
-/*
- * asynchronous operation record
- */
-struct afs_async_op
-{
-	struct list_head		link;
-	struct afs_server		*server;	/* server being contacted */
-	struct rxrpc_call		*call;		/* RxRPC call performing op */
-	wait_queue_t			waiter;		/* wait queue for kafsasyncd */
-	const struct afs_async_op_ops	*ops;		/* operations */
-};
-
-static inline void afs_async_op_init(struct afs_async_op *op,
-				     const struct afs_async_op_ops *ops)
-{
-	INIT_LIST_HEAD(&op->link);
-	op->call = NULL;
-	op->ops = ops;
-}
-
-extern int afs_kafsasyncd_start(void);
-extern void afs_kafsasyncd_stop(void);
-
-extern void afs_kafsasyncd_begin_op(struct afs_async_op *op);
-extern void afs_kafsasyncd_attend_op(struct afs_async_op *op);
-extern void afs_kafsasyncd_terminate_op(struct afs_async_op *op);
-
-#endif /* _LINUX_AFS_KAFSASYNCD_H */
diff --git a/fs/afs/kafstimod.c b/fs/afs/kafstimod.c
deleted file mode 100644
index 694344e4d3c7..000000000000
--- a/fs/afs/kafstimod.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/* kafstimod.c: AFS timeout daemon
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/sched.h>
-#include <linux/completion.h>
-#include <linux/freezer.h>
-#include "cell.h"
-#include "volume.h"
-#include "kafstimod.h"
-#include <asm/errno.h>
-#include "internal.h"
-
-static DECLARE_COMPLETION(kafstimod_alive);
-static DECLARE_COMPLETION(kafstimod_dead);
-static DECLARE_WAIT_QUEUE_HEAD(kafstimod_sleepq);
-static int kafstimod_die;
-
-static LIST_HEAD(kafstimod_list);
-static DEFINE_SPINLOCK(kafstimod_lock);
-
-static int kafstimod(void *arg);
-
-/*****************************************************************************/
-/*
- * start the timeout daemon
- */
-int afs_kafstimod_start(void)
-{
-	int ret;
-
-	ret = kernel_thread(kafstimod, NULL, 0);
-	if (ret < 0)
-		return ret;
-
-	wait_for_completion(&kafstimod_alive);
-
-	return ret;
-} /* end afs_kafstimod_start() */
-
-/*****************************************************************************/
-/*
- * stop the timeout daemon
- */
-void afs_kafstimod_stop(void)
-{
-	/* get rid of my daemon */
-	kafstimod_die = 1;
-	wake_up(&kafstimod_sleepq);
-	wait_for_completion(&kafstimod_dead);
-
-} /* end afs_kafstimod_stop() */
-
-/*****************************************************************************/
-/*
- * timeout processing daemon
- */
-static int kafstimod(void *arg)
-{
-	struct afs_timer *timer;
-
-	DECLARE_WAITQUEUE(myself, current);
-
-	printk("kAFS: Started kafstimod %d\n", current->pid);
-
-	daemonize("kafstimod");
-
-	complete(&kafstimod_alive);
-
-	/* loop around looking for things to attend to */
- loop:
-	set_current_state(TASK_INTERRUPTIBLE);
-	add_wait_queue(&kafstimod_sleepq, &myself);
-
-	for (;;) {
-		unsigned long jif;
-		signed long timeout;
-
-		/* deal with the server being asked to die */
-		if (kafstimod_die) {
-			remove_wait_queue(&kafstimod_sleepq, &myself);
-			_leave("");
-			complete_and_exit(&kafstimod_dead, 0);
-		}
-
-		try_to_freeze();
-
-		/* discard pending signals */
-		afs_discard_my_signals();
-
-		/* work out the time to elapse before the next event */
-		spin_lock(&kafstimod_lock);
-		if (list_empty(&kafstimod_list)) {
-			timeout = MAX_SCHEDULE_TIMEOUT;
-		}
-		else {
-			timer = list_entry(kafstimod_list.next,
-					   struct afs_timer, link);
-			timeout = timer->timo_jif;
-			jif = jiffies;
-
-			if (time_before_eq((unsigned long) timeout, jif))
-				goto immediate;
-
-			else {
-				timeout = (long) timeout - (long) jiffies;
-			}
-		}
-		spin_unlock(&kafstimod_lock);
-
-		schedule_timeout(timeout);
-
-		set_current_state(TASK_INTERRUPTIBLE);
-	}
-
-	/* the thing on the front of the queue needs processing
-	 * - we come here with the lock held and timer pointing to the expired
-	 *   entry
-	 */
- immediate:
-	remove_wait_queue(&kafstimod_sleepq, &myself);
-	set_current_state(TASK_RUNNING);
-
-	_debug("@@@ Begin Timeout of %p", timer);
-
-	/* dequeue the timer */
-	list_del_init(&timer->link);
-	spin_unlock(&kafstimod_lock);
-
-	/* call the timeout function */
-	timer->ops->timed_out(timer);
-
-	_debug("@@@ End Timeout");
-	goto loop;
-
-} /* end kafstimod() */
-
-/*****************************************************************************/
-/*
- * (re-)queue a timer
- */
-void afs_kafstimod_add_timer(struct afs_timer *timer, unsigned long timeout)
-{
-	struct afs_timer *ptimer;
-	struct list_head *_p;
-
-	_enter("%p,%lu", timer, timeout);
-
-	spin_lock(&kafstimod_lock);
-
-	list_del(&timer->link);
-
-	/* the timer was deferred or reset - put it back in the queue at the
-	 * right place */
-	timer->timo_jif = jiffies + timeout;
-
-	list_for_each(_p, &kafstimod_list) {
-		ptimer = list_entry(_p, struct afs_timer, link);
-		if (time_before(timer->timo_jif, ptimer->timo_jif))
-			break;
-	}
-
-	list_add_tail(&timer->link, _p); /* insert before stopping point */
-
-	spin_unlock(&kafstimod_lock);
-
-	wake_up(&kafstimod_sleepq);
-
-	_leave("");
-} /* end afs_kafstimod_add_timer() */
-
-/*****************************************************************************/
-/*
- * dequeue a timer
- * - returns 0 if the timer was deleted or -ENOENT if it wasn't queued
- */
-int afs_kafstimod_del_timer(struct afs_timer *timer)
-{
-	int ret = 0;
-
-	_enter("%p", timer);
-
-	spin_lock(&kafstimod_lock);
-
-	if (list_empty(&timer->link))
-		ret = -ENOENT;
-	else
-		list_del_init(&timer->link);
-
-	spin_unlock(&kafstimod_lock);
-
-	wake_up(&kafstimod_sleepq);
-
-	_leave(" = %d", ret);
-	return ret;
-} /* end afs_kafstimod_del_timer() */
diff --git a/fs/afs/kafstimod.h b/fs/afs/kafstimod.h
deleted file mode 100644
index e312f1a61a7f..000000000000
--- a/fs/afs/kafstimod.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/* kafstimod.h: AFS timeout daemon
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_KAFSTIMOD_H
-#define _LINUX_AFS_KAFSTIMOD_H
-
-#include "types.h"
-
-struct afs_timer;
-
-struct afs_timer_ops {
-	/* called when the front of the timer queue has timed out */
-	void (*timed_out)(struct afs_timer *timer);
-};
-
-/*****************************************************************************/
-/*
- * AFS timer/timeout record
- */
-struct afs_timer
-{
-	struct list_head		link;		/* link in timer queue */
-	unsigned long			timo_jif;	/* timeout time */
-	const struct afs_timer_ops	*ops;		/* timeout expiry function */
-};
-
-static inline void afs_timer_init(struct afs_timer *timer,
-				  const struct afs_timer_ops *ops)
-{
-	INIT_LIST_HEAD(&timer->link);
-	timer->ops = ops;
-}
-
-extern int afs_kafstimod_start(void);
-extern void afs_kafstimod_stop(void);
-
-extern void afs_kafstimod_add_timer(struct afs_timer *timer,
-				    unsigned long timeout);
-extern int afs_kafstimod_del_timer(struct afs_timer *timer);
-
-#endif /* _LINUX_AFS_KAFSTIMOD_H */
diff --git a/fs/afs/main.c b/fs/afs/main.c
index f2704ba53857..40c2704e7557 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -1,4 +1,4 @@
-/* main.c: AFS client file system
+/* AFS client file system
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -13,43 +13,21 @@
 #include <linux/moduleparam.h>
 #include <linux/init.h>
 #include <linux/completion.h>
-#include <rxrpc/rxrpc.h>
-#include <rxrpc/transport.h>
-#include <rxrpc/call.h>
-#include <rxrpc/peer.h>
-#include "cache.h"
-#include "cell.h"
-#include "server.h"
-#include "fsclient.h"
-#include "cmservice.h"
-#include "kafstimod.h"
-#include "kafsasyncd.h"
 #include "internal.h"
 
-struct rxrpc_transport *afs_transport;
-
-static int afs_adding_peer(struct rxrpc_peer *peer);
-static void afs_discarding_peer(struct rxrpc_peer *peer);
-
-
 MODULE_DESCRIPTION("AFS Client File System");
 MODULE_AUTHOR("Red Hat, Inc.");
 MODULE_LICENSE("GPL");
 
+unsigned afs_debug;
+module_param_named(debug, afs_debug, uint, S_IWUSR | S_IRUGO);
+MODULE_PARM_DESC(afs_debug, "AFS debugging mask");
+
 static char *rootcell;
 
 module_param(rootcell, charp, 0);
 MODULE_PARM_DESC(rootcell, "root AFS cell name and VL server IP addr list");
 
-
-static struct rxrpc_peer_ops afs_peer_ops = {
-	.adding		= afs_adding_peer,
-	.discarding	= afs_discarding_peer,
-};
-
-struct list_head afs_cb_hash_tbl[AFS_CB_HASH_COUNT];
-DEFINE_SPINLOCK(afs_cb_hash_lock);
-
 #ifdef AFS_CACHING_SUPPORT
 static struct cachefs_netfs_operations afs_cache_ops = {
 	.get_page_cookie	= afs_cache_get_page_cookie,
@@ -62,20 +40,63 @@ struct cachefs_netfs afs_cache_netfs = {
 };
 #endif
 
-/*****************************************************************************/
+struct afs_uuid afs_uuid;
+
+/*
+ * get a client UUID
+ */
+static int __init afs_get_client_UUID(void)
+{
+	struct timespec ts;
+	u64 uuidtime;
+	u16 clockseq;
+	int ret;
+
+	/* read the MAC address of one of the external interfaces and construct
+	 * a UUID from it */
+	ret = afs_get_MAC_address(afs_uuid.node);
+	if (ret < 0)
+		return ret;
+
+	getnstimeofday(&ts);
+	uuidtime = (u64) ts.tv_sec * 1000 * 1000 * 10;
+	uuidtime += ts.tv_nsec / 100;
+	uuidtime += AFS_UUID_TO_UNIX_TIME;
+	afs_uuid.time_low = uuidtime;
+	afs_uuid.time_mid = uuidtime >> 32;
+	afs_uuid.time_hi_and_version = (uuidtime >> 48) & AFS_UUID_TIMEHI_MASK;
+	afs_uuid.time_hi_and_version = AFS_UUID_VERSION_TIME;
+
+	get_random_bytes(&clockseq, 2);
+	afs_uuid.clock_seq_low = clockseq;
+	afs_uuid.clock_seq_hi_and_reserved =
+		(clockseq >> 8) & AFS_UUID_CLOCKHI_MASK;
+	afs_uuid.clock_seq_hi_and_reserved = AFS_UUID_VARIANT_STD;
+
+	_debug("AFS UUID: %08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+	       afs_uuid.time_low,
+	       afs_uuid.time_mid,
+	       afs_uuid.time_hi_and_version,
+	       afs_uuid.clock_seq_hi_and_reserved,
+	       afs_uuid.clock_seq_low,
+	       afs_uuid.node[0], afs_uuid.node[1], afs_uuid.node[2],
+	       afs_uuid.node[3], afs_uuid.node[4], afs_uuid.node[5]);
+
+	return 0;
+}
+
 /*
  * initialise the AFS client FS module
  */
 static int __init afs_init(void)
 {
-	int loop, ret;
+	int ret;
 
 	printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 registering.\n");
 
-	/* initialise the callback hash table */
-	spin_lock_init(&afs_cb_hash_lock);
-	for (loop = AFS_CB_HASH_COUNT - 1; loop >= 0; loop--)
-		INIT_LIST_HEAD(&afs_cb_hash_tbl[loop]);
+	ret = afs_get_client_UUID();
+	if (ret < 0)
+		return ret;
 
 	/* register the /proc stuff */
 	ret = afs_proc_init();
@@ -87,70 +108,56 @@ static int __init afs_init(void)
 	ret = cachefs_register_netfs(&afs_cache_netfs,
 				     &afs_cache_cell_index_def);
 	if (ret < 0)
-		goto error;
-#endif
-
-#ifdef CONFIG_KEYS_TURNED_OFF
-	ret = afs_key_register();
-	if (ret < 0)
 		goto error_cache;
 #endif
 
 	/* initialise the cell DB */
 	ret = afs_cell_init(rootcell);
 	if (ret < 0)
-		goto error_keys;
+		goto error_cell_init;
 
-	/* start the timeout daemon */
-	ret = afs_kafstimod_start();
+	/* initialise the VL update process */
+	ret = afs_vlocation_update_init();
 	if (ret < 0)
-		goto error_keys;
+		goto error_vl_update_init;
 
-	/* start the async operation daemon */
-	ret = afs_kafsasyncd_start();
-	if (ret < 0)
-		goto error_kafstimod;
+	/* initialise the callback update process */
+	ret = afs_callback_update_init();
 
 	/* create the RxRPC transport */
-	ret = rxrpc_create_transport(7001, &afs_transport);
+	ret = afs_open_socket();
 	if (ret < 0)
-		goto error_kafsasyncd;
-
-	afs_transport->peer_ops = &afs_peer_ops;
+		goto error_open_socket;
 
 	/* register the filesystems */
 	ret = afs_fs_init();
 	if (ret < 0)
-		goto error_transport;
+		goto error_fs;
 
 	return ret;
 
- error_transport:
-	rxrpc_put_transport(afs_transport);
- error_kafsasyncd:
-	afs_kafsasyncd_stop();
- error_kafstimod:
-	afs_kafstimod_stop();
- error_keys:
-#ifdef CONFIG_KEYS_TURNED_OFF
-	afs_key_unregister();
- error_cache:
-#endif
+error_fs:
+	afs_close_socket();
+error_open_socket:
+error_vl_update_init:
+error_cell_init:
 #ifdef AFS_CACHING_SUPPORT
 	cachefs_unregister_netfs(&afs_cache_netfs);
- error:
+error_cache:
 #endif
+	afs_callback_update_kill();
+	afs_vlocation_purge();
 	afs_cell_purge();
 	afs_proc_cleanup();
 	printk(KERN_ERR "kAFS: failed to register: %d\n", ret);
 	return ret;
-} /* end afs_init() */
+}
 
 /* XXX late_initcall is kludgy, but the only alternative seems to create
  * a transport upon the first mount, which is worse. Or is it?
  */
 late_initcall(afs_init);	/* must be called after net/ to create socket */
-/*****************************************************************************/
+
 /*
  * clean up on module removal
  */
@@ -159,127 +166,16 @@ static void __exit afs_exit(void)
 	printk(KERN_INFO "kAFS: Red Hat AFS client v0.1 unregistering.\n");
 
 	afs_fs_exit();
-	rxrpc_put_transport(afs_transport);
-	afs_kafstimod_stop();
-	afs_kafsasyncd_stop();
+	afs_close_socket();
+	afs_purge_servers();
+	afs_callback_update_kill();
+	afs_vlocation_purge();
+	flush_scheduled_work();
 	afs_cell_purge();
-#ifdef CONFIG_KEYS_TURNED_OFF
-	afs_key_unregister();
-#endif
 #ifdef AFS_CACHING_SUPPORT
 	cachefs_unregister_netfs(&afs_cache_netfs);
 #endif
 	afs_proc_cleanup();
-
-} /* end afs_exit() */
-
-module_exit(afs_exit);
-
-/*****************************************************************************/
-/*
- * notification that new peer record is being added
- * - called from krxsecd
- * - return an error to induce an abort
- * - mustn't sleep (caller holds an rwlock)
- */
-static int afs_adding_peer(struct rxrpc_peer *peer)
-{
-	struct afs_server *server;
-	int ret;
-
-	_debug("kAFS: Adding new peer %08x\n", ntohl(peer->addr.s_addr));
-
-	/* determine which server the peer resides in (if any) */
-	ret = afs_server_find_by_peer(peer, &server);
-	if (ret < 0)
-		return ret; /* none that we recognise, so abort */
-
-	_debug("Server %p{u=%d}\n", server, atomic_read(&server->usage));
-
-	_debug("Cell %p{u=%d}\n",
-	       server->cell, atomic_read(&server->cell->usage));
-
-	/* cross-point the structs under a global lock */
-	spin_lock(&afs_server_peer_lock);
-	peer->user = server;
-	server->peer = peer;
-	spin_unlock(&afs_server_peer_lock);
-
-	afs_put_server(server);
-
-	return 0;
-} /* end afs_adding_peer() */
-
-/*****************************************************************************/
-/*
- * notification that a peer record is being discarded
- * - called from krxiod or krxsecd
- */
-static void afs_discarding_peer(struct rxrpc_peer *peer)
-{
-	struct afs_server *server;
-
-	_enter("%p",peer);
-
-	_debug("Discarding peer %08x (rtt=%lu.%lumS)\n",
-	       ntohl(peer->addr.s_addr),
-	       (long) (peer->rtt / 1000),
-	       (long) (peer->rtt % 1000));
-
-	/* uncross-point the structs under a global lock */
-	spin_lock(&afs_server_peer_lock);
-	server = peer->user;
-	if (server) {
-		peer->user = NULL;
-		server->peer = NULL;
-	}
-	spin_unlock(&afs_server_peer_lock);
-
-	_leave("");
-
-} /* end afs_discarding_peer() */
-
-/*****************************************************************************/
-/*
- * clear the dead space between task_struct and kernel stack
- * - called by supplying -finstrument-functions to gcc
- */
-#if 0
-void __cyg_profile_func_enter (void *this_fn, void *call_site)
-__attribute__((no_instrument_function));
-
-void __cyg_profile_func_enter (void *this_fn, void *call_site)
-{
-       asm volatile("  movl    %%esp,%%edi     \n"
-                    "  andl    %0,%%edi        \n"
-                    "  addl    %1,%%edi        \n"
-                    "  movl    %%esp,%%ecx     \n"
-                    "  subl    %%edi,%%ecx     \n"
-                    "  shrl    $2,%%ecx        \n"
-                    "  movl    $0xedededed,%%eax     \n"
-                    "  rep stosl               \n"
-                    :
-                    : "i"(~(THREAD_SIZE - 1)), "i"(sizeof(struct thread_info))
-                    : "eax", "ecx", "edi", "memory", "cc"
-                    );
 }
 
-void __cyg_profile_func_exit(void *this_fn, void *call_site)
-__attribute__((no_instrument_function));
-
-void __cyg_profile_func_exit(void *this_fn, void *call_site)
-{
-       asm volatile("  movl    %%esp,%%edi     \n"
-                    "  andl    %0,%%edi        \n"
-                    "  addl    %1,%%edi        \n"
-                    "  movl    %%esp,%%ecx     \n"
-                    "  subl    %%edi,%%ecx     \n"
-                    "  shrl    $2,%%ecx        \n"
-                    "  movl    $0xdadadada,%%eax     \n"
-                    "  rep stosl               \n"
-                    :
-                    : "i"(~(THREAD_SIZE - 1)), "i"(sizeof(struct thread_info))
-                    : "eax", "ecx", "edi", "memory", "cc"
-                    );
-}
-#endif
+module_exit(afs_exit);
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index e4fce66d76e0..cdb9792d8161 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -1,6 +1,6 @@
-/* misc.c: miscellaneous bits
+/* miscellaneous bits
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -12,19 +12,20 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/errno.h>
-#include "errors.h"
 #include "internal.h"
+#include "afs_fs.h"
 
-/*****************************************************************************/
 /*
  * convert an AFS abort code to a Linux error number
  */
-int afs_abort_to_error(int abortcode)
+int afs_abort_to_error(u32 abort_code)
 {
-	switch (abortcode) {
+	switch (abort_code) {
+	case 13:		return -EACCES;
+	case 30:		return -EROFS;
 	case VSALVAGE:		return -EIO;
 	case VNOVNODE:		return -ENOENT;
-	case VNOVOL:		return -ENXIO;
+	case VNOVOL:		return -ENOMEDIUM;
 	case VVOLEXISTS:	return -EEXIST;
 	case VNOSERVICE:	return -EIO;
 	case VOFFLINE:		return -ENOENT;
@@ -33,7 +34,24 @@ int afs_abort_to_error(int abortcode)
 	case VOVERQUOTA:	return -EDQUOT;
 	case VBUSY:		return -EBUSY;
 	case VMOVED:		return -ENXIO;
-	default:		return -EIO;
+	case 0x2f6df0c:		return -EACCES;
+	case 0x2f6df0f:		return -EBUSY;
+	case 0x2f6df10:		return -EEXIST;
+	case 0x2f6df11:		return -EXDEV;
+	case 0x2f6df13:		return -ENOTDIR;
+	case 0x2f6df14:		return -EISDIR;
+	case 0x2f6df15:		return -EINVAL;
+	case 0x2f6df1a:		return -EFBIG;
+	case 0x2f6df1b:		return -ENOSPC;
+	case 0x2f6df1d:		return -EROFS;
+	case 0x2f6df1e:		return -EMLINK;
+	case 0x2f6df20:		return -EDOM;
+	case 0x2f6df21:		return -ERANGE;
+	case 0x2f6df22:		return -EDEADLK;
+	case 0x2f6df23:		return -ENAMETOOLONG;
+	case 0x2f6df24:		return -ENOLCK;
+	case 0x2f6df26:		return -ENOTEMPTY;
+	case 0x2f6df78:		return -EDQUOT;
+	default:		return -EREMOTEIO;
 	}
-
-} /* end afs_abort_to_error() */
+}
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 68495f0de7b3..b905ae37f912 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -1,4 +1,4 @@
-/* mntpt.c: mountpoint management
+/* mountpoint management
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -18,10 +18,6 @@
 #include <linux/mount.h>
 #include <linux/namei.h>
 #include <linux/mnt_namespace.h>
-#include "super.h"
-#include "cell.h"
-#include "volume.h"
-#include "vnode.h"
 #include "internal.h"
 
 
@@ -30,6 +26,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
 				       struct nameidata *nd);
 static int afs_mntpt_open(struct inode *inode, struct file *file);
 static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd);
+static void afs_mntpt_expiry_timed_out(struct work_struct *work);
 
 const struct file_operations afs_mntpt_file_operations = {
 	.open		= afs_mntpt_open,
@@ -43,24 +40,19 @@ const struct inode_operations afs_mntpt_inode_operations = {
 };
 
 static LIST_HEAD(afs_vfsmounts);
+static DECLARE_DELAYED_WORK(afs_mntpt_expiry_timer, afs_mntpt_expiry_timed_out);
 
-static void afs_mntpt_expiry_timed_out(struct afs_timer *timer);
+unsigned long afs_mntpt_expiry_timeout = 10 * 60;
 
-struct afs_timer_ops afs_mntpt_expiry_timer_ops = {
-	.timed_out	= afs_mntpt_expiry_timed_out,
-};
-
-struct afs_timer afs_mntpt_expiry_timer;
-
-unsigned long afs_mntpt_expiry_timeout = 20;
-
-/*****************************************************************************/
 /*
  * check a symbolic link to see whether it actually encodes a mountpoint
  * - sets the AFS_VNODE_MOUNTPOINT flag on the vnode appropriately
  */
-int afs_mntpt_check_symlink(struct afs_vnode *vnode)
+int afs_mntpt_check_symlink(struct afs_vnode *vnode, struct key *key)
 {
+	struct file file = {
+		.private_data = key,
+	};
 	struct page *page;
 	size_t size;
 	char *buf;
@@ -69,7 +61,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
 	_enter("{%u,%u}", vnode->fid.vnode, vnode->fid.unique);
 
 	/* read the contents of the symlink into the pagecache */
-	page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, NULL);
+	page = read_mapping_page(AFS_VNODE_TO_I(vnode)->i_mapping, 0, &file);
 	if (IS_ERR(page)) {
 		ret = PTR_ERR(page);
 		goto out;
@@ -85,7 +77,7 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
 
 	/* examine the symlink's contents */
 	size = vnode->status.size;
-	_debug("symlink to %*.*s", size, (int) size, buf);
+	_debug("symlink to %*.*s", (int) size, (int) size, buf);
 
 	if (size > 2 &&
 	    (buf[0] == '%' || buf[0] == '#') &&
@@ -93,22 +85,20 @@ int afs_mntpt_check_symlink(struct afs_vnode *vnode)
 	    ) {
 		_debug("symlink is a mountpoint");
 		spin_lock(&vnode->lock);
-		vnode->flags |= AFS_VNODE_MOUNTPOINT;
+		set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
 		spin_unlock(&vnode->lock);
 	}
 
 	ret = 0;
 
- out_free:
+out_free:
 	kunmap(page);
 	page_cache_release(page);
- out:
+out:
 	_leave(" = %d", ret);
 	return ret;
+}
 
-} /* end afs_mntpt_check_symlink() */
-
-/*****************************************************************************/
 /*
  * no valid lookup procedure on this sort of dir
  */
@@ -116,7 +106,7 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
 				       struct dentry *dentry,
 				       struct nameidata *nd)
 {
-	kenter("%p,%p{%p{%s},%s}",
+	_enter("%p,%p{%p{%s},%s}",
 	       dir,
 	       dentry,
 	       dentry->d_parent,
@@ -125,15 +115,14 @@ static struct dentry *afs_mntpt_lookup(struct inode *dir,
 	       dentry->d_name.name);
 
 	return ERR_PTR(-EREMOTE);
-} /* end afs_mntpt_lookup() */
+}
 
-/*****************************************************************************/
 /*
  * no valid open procedure on this sort of dir
  */
 static int afs_mntpt_open(struct inode *inode, struct file *file)
 {
-	kenter("%p,%p{%p{%s},%s}",
+	_enter("%p,%p{%p{%s},%s}",
 	       inode, file,
 	       file->f_path.dentry->d_parent,
 	       file->f_path.dentry->d_parent ?
@@ -142,9 +131,8 @@ static int afs_mntpt_open(struct inode *inode, struct file *file)
 	       file->f_path.dentry->d_name.name);
 
 	return -EREMOTE;
-} /* end afs_mntpt_open() */
+}
 
-/*****************************************************************************/
 /*
  * create a vfsmount to be automounted
  */
@@ -157,7 +145,7 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 	char *buf, *devname = NULL, *options = NULL;
 	int ret;
 
-	kenter("{%s}", mntpt->d_name.name);
+	_enter("{%s}", mntpt->d_name.name);
 
 	BUG_ON(!mntpt->d_inode);
 
@@ -201,79 +189,108 @@ static struct vfsmount *afs_mntpt_do_automount(struct dentry *mntpt)
 		strcat(options, ",rwpath");
 
 	/* try and do the mount */
-	kdebug("--- attempting mount %s -o %s ---", devname, options);
+	_debug("--- attempting mount %s -o %s ---", devname, options);
 	mnt = vfs_kern_mount(&afs_fs_type, 0, devname, options);
-	kdebug("--- mount result %p ---", mnt);
+	_debug("--- mount result %p ---", mnt);
 
 	free_page((unsigned long) devname);
 	free_page((unsigned long) options);
-	kleave(" = %p", mnt);
+	_leave(" = %p", mnt);
 	return mnt;
 
- error:
+error:
 	if (page)
 		page_cache_release(page);
 	if (devname)
 		free_page((unsigned long) devname);
 	if (options)
 		free_page((unsigned long) options);
-	kleave(" = %d", ret);
+	_leave(" = %d", ret);
 	return ERR_PTR(ret);
-} /* end afs_mntpt_do_automount() */
+}
 
-/*****************************************************************************/
 /*
  * follow a link from a mountpoint directory, thus causing it to be mounted
  */
 static void *afs_mntpt_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct vfsmount *newmnt;
-	struct dentry *old_dentry;
 	int err;
 
-	kenter("%p{%s},{%s:%p{%s}}",
+	_enter("%p{%s},{%s:%p{%s},}",
 	       dentry,
 	       dentry->d_name.name,
 	       nd->mnt->mnt_devname,
 	       dentry,
 	       nd->dentry->d_name.name);
 
-	newmnt = afs_mntpt_do_automount(dentry);
+	dput(nd->dentry);
+	nd->dentry = dget(dentry);
+
+	newmnt = afs_mntpt_do_automount(nd->dentry);
 	if (IS_ERR(newmnt)) {
 		path_release(nd);
 		return (void *)newmnt;
 	}
 
-	old_dentry = nd->dentry;
-	nd->dentry = dentry;
-	err = do_add_mount(newmnt, nd, 0, &afs_vfsmounts);
-	nd->dentry = old_dentry;
-
-	path_release(nd);
-
-	if (!err) {
-		mntget(newmnt);
+	mntget(newmnt);
+	err = do_add_mount(newmnt, nd, MNT_SHRINKABLE, &afs_vfsmounts);
+	switch (err) {
+	case 0:
+		mntput(nd->mnt);
+		dput(nd->dentry);
 		nd->mnt = newmnt;
-		dget(newmnt->mnt_root);
-		nd->dentry = newmnt->mnt_root;
+		nd->dentry = dget(newmnt->mnt_root);
+		schedule_delayed_work(&afs_mntpt_expiry_timer,
+				      afs_mntpt_expiry_timeout * HZ);
+		break;
+	case -EBUSY:
+		/* someone else made a mount here whilst we were busy */
+		while (d_mountpoint(nd->dentry) &&
+		       follow_down(&nd->mnt, &nd->dentry))
+			;
+		err = 0;
+	default:
+		mntput(newmnt);
+		break;
 	}
 
-	kleave(" = %d", err);
+	_leave(" = %d", err);
 	return ERR_PTR(err);
-} /* end afs_mntpt_follow_link() */
+}
 
-/*****************************************************************************/
 /*
  * handle mountpoint expiry timer going off
  */
-static void afs_mntpt_expiry_timed_out(struct afs_timer *timer)
+static void afs_mntpt_expiry_timed_out(struct work_struct *work)
 {
-	kenter("");
+	_enter("");
 
-	mark_mounts_for_expiry(&afs_vfsmounts);
+	if (!list_empty(&afs_vfsmounts)) {
+		mark_mounts_for_expiry(&afs_vfsmounts);
+		schedule_delayed_work(&afs_mntpt_expiry_timer,
+				      afs_mntpt_expiry_timeout * HZ);
+	}
+
+	_leave("");
+}
 
-	afs_kafstimod_add_timer(&afs_mntpt_expiry_timer,
-				afs_mntpt_expiry_timeout * HZ);
+/*
+ * kill the AFS mountpoint timer if it's still running
+ */
+void afs_mntpt_kill_timer(void)
+{
+	_enter("");
 
-	kleave("");
-} /* end afs_mntpt_expiry_timed_out() */
+	ASSERT(list_empty(&afs_vfsmounts));
+	cancel_delayed_work(&afs_mntpt_expiry_timer);
+	flush_scheduled_work();
+}
+
+/*
+ * begin unmount by attempting to remove all automounted mountpoints we added
+ */
+void afs_umount_begin(struct vfsmount *vfsmnt, int flags)
+{
+	shrink_submounts(vfsmnt, &afs_vfsmounts);
+}
diff --git a/fs/afs/mount.h b/fs/afs/mount.h
deleted file mode 100644
index 9d2f46ec549f..000000000000
--- a/fs/afs/mount.h
+++ /dev/null
@@ -1,23 +0,0 @@
-/* mount.h: mount parameters
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_MOUNT_H
-#define _LINUX_AFS_MOUNT_H
-
-struct afs_mountdata {
-	const char		*volume;	/* name of volume */
-	const char		*cell;		/* name of cell containing volume */
-	const char		*cache;		/* name of cache block device */
-	size_t			nservers;	/* number of server addresses listed */
-	uint32_t		servers[10];	/* IP addresses of servers in this cell */
-};
-
-#endif /* _LINUX_AFS_MOUNT_H */
diff --git a/fs/afs/proc.c b/fs/afs/proc.c
index ae6b85b1e484..d5601f617cdb 100644
--- a/fs/afs/proc.c
+++ b/fs/afs/proc.c
@@ -1,4 +1,4 @@
-/* proc.c: /proc interface for AFS
+/* /proc interface for AFS
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -13,8 +13,6 @@
 #include <linux/module.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include "cell.h"
-#include "volume.h"
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -130,7 +128,6 @@ static const struct file_operations afs_proc_cell_servers_fops = {
 	.release	= afs_proc_cell_servers_release,
 };
 
-/*****************************************************************************/
 /*
  * initialise the /proc/fs/afs/ directory
  */
@@ -142,47 +139,43 @@ int afs_proc_init(void)
 
 	proc_afs = proc_mkdir("fs/afs", NULL);
 	if (!proc_afs)
-		goto error;
+		goto error_dir;
 	proc_afs->owner = THIS_MODULE;
 
 	p = create_proc_entry("cells", 0, proc_afs);
 	if (!p)
-		goto error_proc;
+		goto error_cells;
 	p->proc_fops = &afs_proc_cells_fops;
 	p->owner = THIS_MODULE;
 
 	p = create_proc_entry("rootcell", 0, proc_afs);
 	if (!p)
-		goto error_cells;
+		goto error_rootcell;
 	p->proc_fops = &afs_proc_rootcell_fops;
 	p->owner = THIS_MODULE;
 
 	_leave(" = 0");
 	return 0;
 
- error_cells:
+error_rootcell:
  	remove_proc_entry("cells", proc_afs);
- error_proc:
+error_cells:
 	remove_proc_entry("fs/afs", NULL);
- error:
+error_dir:
 	_leave(" = -ENOMEM");
 	return -ENOMEM;
+}
 
-} /* end afs_proc_init() */
-
-/*****************************************************************************/
 /*
  * clean up the /proc/fs/afs/ directory
  */
 void afs_proc_cleanup(void)
 {
+	remove_proc_entry("rootcell", proc_afs);
 	remove_proc_entry("cells", proc_afs);
-
 	remove_proc_entry("fs/afs", NULL);
+}
 
-} /* end afs_proc_cleanup() */
-
-/*****************************************************************************/
 /*
  * open "/proc/fs/afs/cells" which provides a summary of extant cells
  */
@@ -199,9 +192,8 @@ static int afs_proc_cells_open(struct inode *inode, struct file *file)
 	m->private = PDE(inode)->data;
 
 	return 0;
-} /* end afs_proc_cells_open() */
+}
 
-/*****************************************************************************/
 /*
  * set up the iterator to start reading from the cells list and return the
  * first item
@@ -225,9 +217,8 @@ static void *afs_proc_cells_start(struct seq_file *m, loff_t *_pos)
 			break;
 
 	return _p != &afs_proc_cells ? _p : NULL;
-} /* end afs_proc_cells_start() */
+}
 
-/*****************************************************************************/
 /*
  * move to next cell in cells list
  */
@@ -241,19 +232,16 @@ static void *afs_proc_cells_next(struct seq_file *p, void *v, loff_t *pos)
 	_p = v == (void *) 1 ? afs_proc_cells.next : _p->next;
 
 	return _p != &afs_proc_cells ? _p : NULL;
-} /* end afs_proc_cells_next() */
+}
 
-/*****************************************************************************/
 /*
  * clean up after reading from the cells list
  */
 static void afs_proc_cells_stop(struct seq_file *p, void *v)
 {
 	up_read(&afs_proc_cells_sem);
+}
 
-} /* end afs_proc_cells_stop() */
-
-/*****************************************************************************/
 /*
  * display a header line followed by a load of cell lines
  */
@@ -261,19 +249,18 @@ static int afs_proc_cells_show(struct seq_file *m, void *v)
 {
 	struct afs_cell *cell = list_entry(v, struct afs_cell, proc_link);
 
-	/* display header on line 1 */
 	if (v == (void *) 1) {
+		/* display header on line 1 */
 		seq_puts(m, "USE NAME\n");
 		return 0;
 	}
 
 	/* display one cell per line on subsequent lines */
-	seq_printf(m, "%3d %s\n", atomic_read(&cell->usage), cell->name);
-
+	seq_printf(m, "%3d %s\n",
+		   atomic_read(&cell->usage), cell->name);
 	return 0;
-} /* end afs_proc_cells_show() */
+}
 
-/*****************************************************************************/
 /*
  * handle writes to /proc/fs/afs/cells
  * - to add cells: echo "add <cellname> <IP>[:<IP>][:<IP>]"
@@ -326,30 +313,32 @@ static ssize_t afs_proc_cells_write(struct file *file, const char __user *buf,
 
 	if (strcmp(kbuf, "add") == 0) {
 		struct afs_cell *cell;
-		ret = afs_cell_create(name, args, &cell);
-		if (ret < 0)
+
+		cell = afs_cell_create(name, args);
+		if (IS_ERR(cell)) {
+			ret = PTR_ERR(cell);
 			goto done;
+		}
 
+		afs_put_cell(cell);
 		printk("kAFS: Added new cell '%s'\n", name);
-	}
-	else {
+	} else {
 		goto inval;
 	}
 
 	ret = size;
 
- done:
+done:
 	kfree(kbuf);
 	_leave(" = %d", ret);
 	return ret;
 
- inval:
+inval:
 	ret = -EINVAL;
 	printk("kAFS: Invalid Command on /proc/fs/afs/cells file\n");
 	goto done;
-} /* end afs_proc_cells_write() */
+}
 
-/*****************************************************************************/
 /*
  * Stubs for /proc/fs/afs/rootcell
  */
@@ -369,7 +358,6 @@ static ssize_t afs_proc_rootcell_read(struct file *file, char __user *buf,
 	return 0;
 }
 
-/*****************************************************************************/
 /*
  * handle writes to /proc/fs/afs/rootcell
  * - to initialize rootcell: echo "cell.name:192.168.231.14"
@@ -407,14 +395,13 @@ static ssize_t afs_proc_rootcell_write(struct file *file,
 	if (ret >= 0)
 		ret = size;	/* consume everything, always */
 
- infault:
+infault:
 	kfree(kbuf);
- nomem:
+nomem:
 	_leave(" = %d", ret);
 	return ret;
-} /* end afs_proc_rootcell_write() */
+}
 
-/*****************************************************************************/
 /*
  * initialise /proc/fs/afs/<cell>/
  */
@@ -426,25 +413,25 @@ int afs_proc_cell_setup(struct afs_cell *cell)
 
 	cell->proc_dir = proc_mkdir(cell->name, proc_afs);
 	if (!cell->proc_dir)
-		return -ENOMEM;
+		goto error_dir;
 
 	p = create_proc_entry("servers", 0, cell->proc_dir);
 	if (!p)
-		goto error_proc;
+		goto error_servers;
 	p->proc_fops = &afs_proc_cell_servers_fops;
 	p->owner = THIS_MODULE;
 	p->data = cell;
 
 	p = create_proc_entry("vlservers", 0, cell->proc_dir);
 	if (!p)
-		goto error_servers;
+		goto error_vlservers;
 	p->proc_fops = &afs_proc_cell_vlservers_fops;
 	p->owner = THIS_MODULE;
 	p->data = cell;
 
 	p = create_proc_entry("volumes", 0, cell->proc_dir);
 	if (!p)
-		goto error_vlservers;
+		goto error_volumes;
 	p->proc_fops = &afs_proc_cell_volumes_fops;
 	p->owner = THIS_MODULE;
 	p->data = cell;
@@ -452,17 +439,17 @@ int afs_proc_cell_setup(struct afs_cell *cell)
 	_leave(" = 0");
 	return 0;
 
- error_vlservers:
+error_volumes:
 	remove_proc_entry("vlservers", cell->proc_dir);
- error_servers:
+error_vlservers:
 	remove_proc_entry("servers", cell->proc_dir);
- error_proc:
+error_servers:
 	remove_proc_entry(cell->name, proc_afs);
+error_dir:
 	_leave(" = -ENOMEM");
 	return -ENOMEM;
-} /* end afs_proc_cell_setup() */
+}
 
-/*****************************************************************************/
 /*
  * remove /proc/fs/afs/<cell>/
  */
@@ -476,9 +463,8 @@ void afs_proc_cell_remove(struct afs_cell *cell)
 	remove_proc_entry(cell->name, proc_afs);
 
 	_leave("");
-} /* end afs_proc_cell_remove() */
+}
 
-/*****************************************************************************/
 /*
  * open "/proc/fs/afs/<cell>/volumes" which provides a summary of extant cells
  */
@@ -488,7 +474,7 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
 	struct seq_file *m;
 	int ret;
 
-	cell = afs_get_cell_maybe((struct afs_cell **) &PDE(inode)->data);
+	cell = PDE(inode)->data;
 	if (!cell)
 		return -ENOENT;
 
@@ -500,25 +486,16 @@ static int afs_proc_cell_volumes_open(struct inode *inode, struct file *file)
 	m->private = cell;
 
 	return 0;
-} /* end afs_proc_cell_volumes_open() */
+}
 
-/*****************************************************************************/
 /*
  * close the file and release the ref to the cell
  */
 static int afs_proc_cell_volumes_release(struct inode *inode, struct file *file)
 {
-	struct afs_cell *cell = PDE(inode)->data;
-	int ret;
-
-	ret = seq_release(inode,file);
-
-	afs_put_cell(cell);
-
-	return ret;
-} /* end afs_proc_cell_volumes_release() */
+	return seq_release(inode, file);
+}
 
-/*****************************************************************************/
 /*
  * set up the iterator to start reading from the cells list and return the
  * first item
@@ -545,9 +522,8 @@ static void *afs_proc_cell_volumes_start(struct seq_file *m, loff_t *_pos)
 			break;
 
 	return _p != &cell->vl_list ? _p : NULL;
-} /* end afs_proc_cell_volumes_start() */
+}
 
-/*****************************************************************************/
 /*
  * move to next cell in cells list
  */
@@ -562,12 +538,11 @@ static void *afs_proc_cell_volumes_next(struct seq_file *p, void *v,
 	(*_pos)++;
 
 	_p = v;
-	_p = v == (void *) 1 ? cell->vl_list.next : _p->next;
+	_p = (v == (void *) 1) ? cell->vl_list.next : _p->next;
 
-	return _p != &cell->vl_list ? _p : NULL;
-} /* end afs_proc_cell_volumes_next() */
+	return (_p != &cell->vl_list) ? _p : NULL;
+}
 
-/*****************************************************************************/
 /*
  * clean up after reading from the cells list
  */
@@ -576,10 +551,18 @@ static void afs_proc_cell_volumes_stop(struct seq_file *p, void *v)
 	struct afs_cell *cell = p->private;
 
 	up_read(&cell->vl_sem);
+}
 
-} /* end afs_proc_cell_volumes_stop() */
+const char afs_vlocation_states[][4] = {
+	[AFS_VL_NEW]			= "New",
+	[AFS_VL_CREATING]		= "Crt",
+	[AFS_VL_VALID]			= "Val",
+	[AFS_VL_NO_VOLUME]		= "NoV",
+	[AFS_VL_UPDATING]		= "Upd",
+	[AFS_VL_VOLUME_DELETED]		= "Del",
+	[AFS_VL_UNCERTAIN]		= "Unc",
+};
 
-/*****************************************************************************/
 /*
  * display a header line followed by a load of volume lines
  */
@@ -590,23 +573,22 @@ static int afs_proc_cell_volumes_show(struct seq_file *m, void *v)
 
 	/* display header on line 1 */
 	if (v == (void *) 1) {
-		seq_puts(m, "USE VLID[0]  VLID[1]  VLID[2]  NAME\n");
+		seq_puts(m, "USE STT VLID[0]  VLID[1]  VLID[2]  NAME\n");
 		return 0;
 	}
 
 	/* display one cell per line on subsequent lines */
-	seq_printf(m, "%3d %08x %08x %08x %s\n",
+	seq_printf(m, "%3d %s %08x %08x %08x %s\n",
 		   atomic_read(&vlocation->usage),
+		   afs_vlocation_states[vlocation->state],
 		   vlocation->vldb.vid[0],
 		   vlocation->vldb.vid[1],
 		   vlocation->vldb.vid[2],
-		   vlocation->vldb.name
-		   );
+		   vlocation->vldb.name);
 
 	return 0;
-} /* end afs_proc_cell_volumes_show() */
+}
 
-/*****************************************************************************/
 /*
  * open "/proc/fs/afs/<cell>/vlservers" which provides a list of volume
  * location server
@@ -617,11 +599,11 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
 	struct seq_file *m;
 	int ret;
 
-	cell = afs_get_cell_maybe((struct afs_cell**)&PDE(inode)->data);
+	cell = PDE(inode)->data;
 	if (!cell)
 		return -ENOENT;
 
-	ret = seq_open(file,&afs_proc_cell_vlservers_ops);
+	ret = seq_open(file, &afs_proc_cell_vlservers_ops);
 	if (ret<0)
 		return ret;
 
@@ -629,26 +611,17 @@ static int afs_proc_cell_vlservers_open(struct inode *inode, struct file *file)
 	m->private = cell;
 
 	return 0;
-} /* end afs_proc_cell_vlservers_open() */
+}
 
-/*****************************************************************************/
 /*
  * close the file and release the ref to the cell
  */
 static int afs_proc_cell_vlservers_release(struct inode *inode,
 					   struct file *file)
 {
-	struct afs_cell *cell = PDE(inode)->data;
-	int ret;
-
-	ret = seq_release(inode,file);
-
-	afs_put_cell(cell);
-
-	return ret;
-} /* end afs_proc_cell_vlservers_release() */
+	return seq_release(inode, file);
+}
 
-/*****************************************************************************/
 /*
  * set up the iterator to start reading from the cells list and return the
  * first item
@@ -672,9 +645,8 @@ static void *afs_proc_cell_vlservers_start(struct seq_file *m, loff_t *_pos)
 		return NULL;
 
 	return &cell->vl_addrs[pos];
-} /* end afs_proc_cell_vlservers_start() */
+}
 
-/*****************************************************************************/
 /*
  * move to next cell in cells list
  */
@@ -692,9 +664,8 @@ static void *afs_proc_cell_vlservers_next(struct seq_file *p, void *v,
 		return NULL;
 
 	return &cell->vl_addrs[pos];
-} /* end afs_proc_cell_vlservers_next() */
+}
 
-/*****************************************************************************/
 /*
  * clean up after reading from the cells list
  */
@@ -703,10 +674,8 @@ static void afs_proc_cell_vlservers_stop(struct seq_file *p, void *v)
 	struct afs_cell *cell = p->private;
 
 	up_read(&cell->vl_sem);
+}
 
-} /* end afs_proc_cell_vlservers_stop() */
-
-/*****************************************************************************/
 /*
  * display a header line followed by a load of volume lines
  */
@@ -722,11 +691,9 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
 
 	/* display one cell per line on subsequent lines */
 	seq_printf(m, "%u.%u.%u.%u\n", NIPQUAD(addr->s_addr));
-
 	return 0;
-} /* end afs_proc_cell_vlservers_show() */
+}
 
-/*****************************************************************************/
 /*
  * open "/proc/fs/afs/<cell>/servers" which provides a summary of active
  * servers
@@ -737,7 +704,7 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
 	struct seq_file *m;
 	int ret;
 
-	cell = afs_get_cell_maybe((struct afs_cell **) &PDE(inode)->data);
+	cell = PDE(inode)->data;
 	if (!cell)
 		return -ENOENT;
 
@@ -747,34 +714,24 @@ static int afs_proc_cell_servers_open(struct inode *inode, struct file *file)
 
 	m = file->private_data;
 	m->private = cell;
-
 	return 0;
-} /* end afs_proc_cell_servers_open() */
+}
 
-/*****************************************************************************/
 /*
  * close the file and release the ref to the cell
  */
 static int afs_proc_cell_servers_release(struct inode *inode,
 					 struct file *file)
 {
-	struct afs_cell *cell = PDE(inode)->data;
-	int ret;
-
-	ret = seq_release(inode, file);
-
-	afs_put_cell(cell);
-
-	return ret;
-} /* end afs_proc_cell_servers_release() */
+	return seq_release(inode, file);
+}
 
-/*****************************************************************************/
 /*
  * set up the iterator to start reading from the cells list and return the
  * first item
  */
 static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
-	__acquires(m->private->sv_lock)
+	__acquires(m->private->servers_lock)
 {
 	struct list_head *_p;
 	struct afs_cell *cell = m->private;
@@ -783,7 +740,7 @@ static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
 	_enter("cell=%p pos=%Ld", cell, *_pos);
 
 	/* lock the list against modification */
-	read_lock(&cell->sv_lock);
+	read_lock(&cell->servers_lock);
 
 	/* allow for the header line */
 	if (!pos)
@@ -791,14 +748,13 @@ static void *afs_proc_cell_servers_start(struct seq_file *m, loff_t *_pos)
 	pos--;
 
 	/* find the n'th element in the list */
-	list_for_each(_p, &cell->sv_list)
+	list_for_each(_p, &cell->servers)
 		if (!pos--)
 			break;
 
-	return _p != &cell->sv_list ? _p : NULL;
-} /* end afs_proc_cell_servers_start() */
+	return _p != &cell->servers ? _p : NULL;
+}
 
-/*****************************************************************************/
 /*
  * move to next cell in cells list
  */
@@ -813,25 +769,22 @@ static void *afs_proc_cell_servers_next(struct seq_file *p, void *v,
 	(*_pos)++;
 
 	_p = v;
-	_p = v == (void *) 1 ? cell->sv_list.next : _p->next;
+	_p = v == (void *) 1 ? cell->servers.next : _p->next;
 
-	return _p != &cell->sv_list ? _p : NULL;
-} /* end afs_proc_cell_servers_next() */
+	return _p != &cell->servers ? _p : NULL;
+}
 
-/*****************************************************************************/
 /*
  * clean up after reading from the cells list
  */
 static void afs_proc_cell_servers_stop(struct seq_file *p, void *v)
-	__releases(p->private->sv_lock)
+	__releases(p->private->servers_lock)
 {
 	struct afs_cell *cell = p->private;
 
-	read_unlock(&cell->sv_lock);
-
-} /* end afs_proc_cell_servers_stop() */
+	read_unlock(&cell->servers_lock);
+}
 
-/*****************************************************************************/
 /*
  * display a header line followed by a load of volume lines
  */
@@ -849,10 +802,7 @@ static int afs_proc_cell_servers_show(struct seq_file *m, void *v)
 	/* display one cell per line on subsequent lines */
 	sprintf(ipaddr, "%u.%u.%u.%u", NIPQUAD(server->addr));
 	seq_printf(m, "%3d %-15.15s %5d\n",
-		   atomic_read(&server->usage),
-		   ipaddr,
-		   server->fs_state
-		   );
+		   atomic_read(&server->usage), ipaddr, server->fs_state);
 
 	return 0;
-} /* end afs_proc_cell_servers_show() */
+}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
new file mode 100644
index 000000000000..e7b047328a39
--- /dev/null
+++ b/fs/afs/rxrpc.c
@@ -0,0 +1,782 @@
+/* Maintain an RxRPC server socket to do AFS communications through
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <net/sock.h>
+#include <net/af_rxrpc.h>
+#include <rxrpc/packet.h>
+#include "internal.h"
+#include "afs_cm.h"
+
+static struct socket *afs_socket; /* my RxRPC socket */
+static struct workqueue_struct *afs_async_calls;
+static atomic_t afs_outstanding_calls;
+static atomic_t afs_outstanding_skbs;
+
+static void afs_wake_up_call_waiter(struct afs_call *);
+static int afs_wait_for_call_to_complete(struct afs_call *);
+static void afs_wake_up_async_call(struct afs_call *);
+static int afs_dont_wait_for_call_to_complete(struct afs_call *);
+static void afs_process_async_call(struct work_struct *);
+static void afs_rx_interceptor(struct sock *, unsigned long, struct sk_buff *);
+static int afs_deliver_cm_op_id(struct afs_call *, struct sk_buff *, bool);
+
+/* synchronous call management */
+const struct afs_wait_mode afs_sync_call = {
+	.rx_wakeup	= afs_wake_up_call_waiter,
+	.wait		= afs_wait_for_call_to_complete,
+};
+
+/* asynchronous call management */
+const struct afs_wait_mode afs_async_call = {
+	.rx_wakeup	= afs_wake_up_async_call,
+	.wait		= afs_dont_wait_for_call_to_complete,
+};
+
+/* asynchronous incoming call management */
+static const struct afs_wait_mode afs_async_incoming_call = {
+	.rx_wakeup	= afs_wake_up_async_call,
+};
+
+/* asynchronous incoming call initial processing */
+static const struct afs_call_type afs_RXCMxxxx = {
+	.name		= "CB.xxxx",
+	.deliver	= afs_deliver_cm_op_id,
+	.abort_to_error	= afs_abort_to_error,
+};
+
+static void afs_collect_incoming_call(struct work_struct *);
+
+static struct sk_buff_head afs_incoming_calls;
+static DECLARE_WORK(afs_collect_incoming_call_work, afs_collect_incoming_call);
+
+/*
+ * open an RxRPC socket and bind it to be a server for callback notifications
+ * - the socket is left in blocking mode and non-blocking ops use MSG_DONTWAIT
+ */
+int afs_open_socket(void)
+{
+	struct sockaddr_rxrpc srx;
+	struct socket *socket;
+	int ret;
+
+	_enter("");
+
+	skb_queue_head_init(&afs_incoming_calls);
+
+	afs_async_calls = create_singlethread_workqueue("kafsd");
+	if (!afs_async_calls) {
+		_leave(" = -ENOMEM [wq]");
+		return -ENOMEM;
+	}
+
+	ret = sock_create_kern(AF_RXRPC, SOCK_DGRAM, PF_INET, &socket);
+	if (ret < 0) {
+		destroy_workqueue(afs_async_calls);
+		_leave(" = %d [socket]", ret);
+		return ret;
+	}
+
+	socket->sk->sk_allocation = GFP_NOFS;
+
+	/* bind the callback manager's address to make this a server socket */
+	srx.srx_family			= AF_RXRPC;
+	srx.srx_service			= CM_SERVICE;
+	srx.transport_type		= SOCK_DGRAM;
+	srx.transport_len		= sizeof(srx.transport.sin);
+	srx.transport.sin.sin_family	= AF_INET;
+	srx.transport.sin.sin_port	= htons(AFS_CM_PORT);
+	memset(&srx.transport.sin.sin_addr, 0,
+	       sizeof(srx.transport.sin.sin_addr));
+
+	ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
+	if (ret < 0) {
+		sock_release(socket);
+		_leave(" = %d [bind]", ret);
+		return ret;
+	}
+
+	rxrpc_kernel_intercept_rx_messages(socket, afs_rx_interceptor);
+
+	afs_socket = socket;
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * close the RxRPC socket AFS was using
+ */
+void afs_close_socket(void)
+{
+	_enter("");
+
+	sock_release(afs_socket);
+
+	_debug("dework");
+	destroy_workqueue(afs_async_calls);
+
+	ASSERTCMP(atomic_read(&afs_outstanding_skbs), ==, 0);
+	ASSERTCMP(atomic_read(&afs_outstanding_calls), ==, 0);
+	_leave("");
+}
+
+/*
+ * note that the data in a socket buffer is now delivered and that the buffer
+ * should be freed
+ */
+static void afs_data_delivered(struct sk_buff *skb)
+{
+	if (!skb) {
+		_debug("DLVR NULL [%d]", atomic_read(&afs_outstanding_skbs));
+		dump_stack();
+	} else {
+		_debug("DLVR %p{%u} [%d]",
+		       skb, skb->mark, atomic_read(&afs_outstanding_skbs));
+		if (atomic_dec_return(&afs_outstanding_skbs) == -1)
+			BUG();
+		rxrpc_kernel_data_delivered(skb);
+	}
+}
+
+/*
+ * free a socket buffer
+ */
+static void afs_free_skb(struct sk_buff *skb)
+{
+	if (!skb) {
+		_debug("FREE NULL [%d]", atomic_read(&afs_outstanding_skbs));
+		dump_stack();
+	} else {
+		_debug("FREE %p{%u} [%d]",
+		       skb, skb->mark, atomic_read(&afs_outstanding_skbs));
+		if (atomic_dec_return(&afs_outstanding_skbs) == -1)
+			BUG();
+		rxrpc_kernel_free_skb(skb);
+	}
+}
+
+/*
+ * free a call
+ */
+static void afs_free_call(struct afs_call *call)
+{
+	_debug("DONE %p{%s} [%d]",
+	       call, call->type->name, atomic_read(&afs_outstanding_calls));
+	if (atomic_dec_return(&afs_outstanding_calls) == -1)
+		BUG();
+
+	ASSERTCMP(call->rxcall, ==, NULL);
+	ASSERT(!work_pending(&call->async_work));
+	ASSERT(skb_queue_empty(&call->rx_queue));
+	ASSERT(call->type->name != NULL);
+
+	kfree(call->request);
+	kfree(call);
+}
+
+/*
+ * allocate a call with flat request and reply buffers
+ */
+struct afs_call *afs_alloc_flat_call(const struct afs_call_type *type,
+				     size_t request_size, size_t reply_size)
+{
+	struct afs_call *call;
+
+	call = kzalloc(sizeof(*call), GFP_NOFS);
+	if (!call)
+		goto nomem_call;
+
+	_debug("CALL %p{%s} [%d]",
+	       call, type->name, atomic_read(&afs_outstanding_calls));
+	atomic_inc(&afs_outstanding_calls);
+
+	call->type = type;
+	call->request_size = request_size;
+	call->reply_max = reply_size;
+
+	if (request_size) {
+		call->request = kmalloc(request_size, GFP_NOFS);
+		if (!call->request)
+			goto nomem_free;
+	}
+
+	if (reply_size) {
+		call->buffer = kmalloc(reply_size, GFP_NOFS);
+		if (!call->buffer)
+			goto nomem_free;
+	}
+
+	init_waitqueue_head(&call->waitq);
+	skb_queue_head_init(&call->rx_queue);
+	return call;
+
+nomem_free:
+	afs_free_call(call);
+nomem_call:
+	return NULL;
+}
+
+/*
+ * clean up a call with flat buffer
+ */
+void afs_flat_call_destructor(struct afs_call *call)
+{
+	_enter("");
+
+	kfree(call->request);
+	call->request = NULL;
+	kfree(call->buffer);
+	call->buffer = NULL;
+}
+
+/*
+ * initiate a call
+ */
+int afs_make_call(struct in_addr *addr, struct afs_call *call, gfp_t gfp,
+		  const struct afs_wait_mode *wait_mode)
+{
+	struct sockaddr_rxrpc srx;
+	struct rxrpc_call *rxcall;
+	struct msghdr msg;
+	struct kvec iov[1];
+	int ret;
+
+	_enter("%x,{%d},", addr->s_addr, ntohs(call->port));
+
+	ASSERT(call->type != NULL);
+	ASSERT(call->type->name != NULL);
+
+	_debug("MAKE %p{%s} [%d]",
+	       call, call->type->name, atomic_read(&afs_outstanding_calls));
+
+	call->wait_mode = wait_mode;
+	INIT_WORK(&call->async_work, afs_process_async_call);
+
+	memset(&srx, 0, sizeof(srx));
+	srx.srx_family = AF_RXRPC;
+	srx.srx_service = call->service_id;
+	srx.transport_type = SOCK_DGRAM;
+	srx.transport_len = sizeof(srx.transport.sin);
+	srx.transport.sin.sin_family = AF_INET;
+	srx.transport.sin.sin_port = call->port;
+	memcpy(&srx.transport.sin.sin_addr, addr, 4);
+
+	/* create a call */
+	rxcall = rxrpc_kernel_begin_call(afs_socket, &srx, call->key,
+					 (unsigned long) call, gfp);
+	call->key = NULL;
+	if (IS_ERR(rxcall)) {
+		ret = PTR_ERR(rxcall);
+		goto error_kill_call;
+	}
+
+	call->rxcall = rxcall;
+
+	/* send the request */
+	iov[0].iov_base	= call->request;
+	iov[0].iov_len	= call->request_size;
+
+	msg.msg_name		= NULL;
+	msg.msg_namelen		= 0;
+	msg.msg_iov		= (struct iovec *) iov;
+	msg.msg_iovlen		= 1;
+	msg.msg_control		= NULL;
+	msg.msg_controllen	= 0;
+	msg.msg_flags		= 0;
+
+	/* have to change the state *before* sending the last packet as RxRPC
+	 * might give us the reply before it returns from sending the
+	 * request */
+	call->state = AFS_CALL_AWAIT_REPLY;
+	ret = rxrpc_kernel_send_data(rxcall, &msg, call->request_size);
+	if (ret < 0)
+		goto error_do_abort;
+
+	/* at this point, an async call may no longer exist as it may have
+	 * already completed */
+	return wait_mode->wait(call);
+
+error_do_abort:
+	rxrpc_kernel_abort_call(rxcall, RX_USER_ABORT);
+	rxrpc_kernel_end_call(rxcall);
+	call->rxcall = NULL;
+error_kill_call:
+	call->type->destructor(call);
+	afs_free_call(call);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * handles intercepted messages that were arriving in the socket's Rx queue
+ * - called with the socket receive queue lock held to ensure message ordering
+ * - called with softirqs disabled
+ */
+static void afs_rx_interceptor(struct sock *sk, unsigned long user_call_ID,
+			       struct sk_buff *skb)
+{
+	struct afs_call *call = (struct afs_call *) user_call_ID;
+
+	_enter("%p,,%u", call, skb->mark);
+
+	_debug("ICPT %p{%u} [%d]",
+	       skb, skb->mark, atomic_read(&afs_outstanding_skbs));
+
+	ASSERTCMP(sk, ==, afs_socket->sk);
+	atomic_inc(&afs_outstanding_skbs);
+
+	if (!call) {
+		/* its an incoming call for our callback service */
+		skb_queue_tail(&afs_incoming_calls, skb);
+		schedule_work(&afs_collect_incoming_call_work);
+	} else {
+		/* route the messages directly to the appropriate call */
+		skb_queue_tail(&call->rx_queue, skb);
+		call->wait_mode->rx_wakeup(call);
+	}
+
+	_leave("");
+}
+
+/*
+ * deliver messages to a call
+ */
+static void afs_deliver_to_call(struct afs_call *call)
+{
+	struct sk_buff *skb;
+	bool last;
+	u32 abort_code;
+	int ret;
+
+	_enter("");
+
+	while ((call->state == AFS_CALL_AWAIT_REPLY ||
+		call->state == AFS_CALL_AWAIT_OP_ID ||
+		call->state == AFS_CALL_AWAIT_REQUEST ||
+		call->state == AFS_CALL_AWAIT_ACK) &&
+	       (skb = skb_dequeue(&call->rx_queue))) {
+		switch (skb->mark) {
+		case RXRPC_SKB_MARK_DATA:
+			_debug("Rcv DATA");
+			last = rxrpc_kernel_is_data_last(skb);
+			ret = call->type->deliver(call, skb, last);
+			switch (ret) {
+			case 0:
+				if (last &&
+				    call->state == AFS_CALL_AWAIT_REPLY)
+					call->state = AFS_CALL_COMPLETE;
+				break;
+			case -ENOTCONN:
+				abort_code = RX_CALL_DEAD;
+				goto do_abort;
+			case -ENOTSUPP:
+				abort_code = RX_INVALID_OPERATION;
+				goto do_abort;
+			default:
+				abort_code = RXGEN_CC_UNMARSHAL;
+				if (call->state != AFS_CALL_AWAIT_REPLY)
+					abort_code = RXGEN_SS_UNMARSHAL;
+			do_abort:
+				rxrpc_kernel_abort_call(call->rxcall,
+							abort_code);
+				call->error = ret;
+				call->state = AFS_CALL_ERROR;
+				break;
+			}
+			afs_data_delivered(skb);
+			skb = NULL;
+			continue;
+		case RXRPC_SKB_MARK_FINAL_ACK:
+			_debug("Rcv ACK");
+			call->state = AFS_CALL_COMPLETE;
+			break;
+		case RXRPC_SKB_MARK_BUSY:
+			_debug("Rcv BUSY");
+			call->error = -EBUSY;
+			call->state = AFS_CALL_BUSY;
+			break;
+		case RXRPC_SKB_MARK_REMOTE_ABORT:
+			abort_code = rxrpc_kernel_get_abort_code(skb);
+			call->error = call->type->abort_to_error(abort_code);
+			call->state = AFS_CALL_ABORTED;
+			_debug("Rcv ABORT %u -> %d", abort_code, call->error);
+			break;
+		case RXRPC_SKB_MARK_NET_ERROR:
+			call->error = -rxrpc_kernel_get_error_number(skb);
+			call->state = AFS_CALL_ERROR;
+			_debug("Rcv NET ERROR %d", call->error);
+			break;
+		case RXRPC_SKB_MARK_LOCAL_ERROR:
+			call->error = -rxrpc_kernel_get_error_number(skb);
+			call->state = AFS_CALL_ERROR;
+			_debug("Rcv LOCAL ERROR %d", call->error);
+			break;
+		default:
+			BUG();
+			break;
+		}
+
+		afs_free_skb(skb);
+	}
+
+	/* make sure the queue is empty if the call is done with (we might have
+	 * aborted the call early because of an unmarshalling error) */
+	if (call->state >= AFS_CALL_COMPLETE) {
+		while ((skb = skb_dequeue(&call->rx_queue)))
+			afs_free_skb(skb);
+		if (call->incoming) {
+			rxrpc_kernel_end_call(call->rxcall);
+			call->rxcall = NULL;
+			call->type->destructor(call);
+			afs_free_call(call);
+		}
+	}
+
+	_leave("");
+}
+
+/*
+ * wait synchronously for a call to complete
+ */
+static int afs_wait_for_call_to_complete(struct afs_call *call)
+{
+	struct sk_buff *skb;
+	int ret;
+
+	DECLARE_WAITQUEUE(myself, current);
+
+	_enter("");
+
+	add_wait_queue(&call->waitq, &myself);
+	for (;;) {
+		set_current_state(TASK_INTERRUPTIBLE);
+
+		/* deliver any messages that are in the queue */
+		if (!skb_queue_empty(&call->rx_queue)) {
+			__set_current_state(TASK_RUNNING);
+			afs_deliver_to_call(call);
+			continue;
+		}
+
+		ret = call->error;
+		if (call->state >= AFS_CALL_COMPLETE)
+			break;
+		ret = -EINTR;
+		if (signal_pending(current))
+			break;
+		schedule();
+	}
+
+	remove_wait_queue(&call->waitq, &myself);
+	__set_current_state(TASK_RUNNING);
+
+	/* kill the call */
+	if (call->state < AFS_CALL_COMPLETE) {
+		_debug("call incomplete");
+		rxrpc_kernel_abort_call(call->rxcall, RX_CALL_DEAD);
+		while ((skb = skb_dequeue(&call->rx_queue)))
+			afs_free_skb(skb);
+	}
+
+	_debug("call complete");
+	rxrpc_kernel_end_call(call->rxcall);
+	call->rxcall = NULL;
+	call->type->destructor(call);
+	afs_free_call(call);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * wake up a waiting call
+ */
+static void afs_wake_up_call_waiter(struct afs_call *call)
+{
+	wake_up(&call->waitq);
+}
+
+/*
+ * wake up an asynchronous call
+ */
+static void afs_wake_up_async_call(struct afs_call *call)
+{
+	_enter("");
+	queue_work(afs_async_calls, &call->async_work);
+}
+
+/*
+ * put a call into asynchronous mode
+ * - mustn't touch the call descriptor as the call my have completed by the
+ *   time we get here
+ */
+static int afs_dont_wait_for_call_to_complete(struct afs_call *call)
+{
+	_enter("");
+	return -EINPROGRESS;
+}
+
+/*
+ * delete an asynchronous call
+ */
+static void afs_delete_async_call(struct work_struct *work)
+{
+	struct afs_call *call =
+		container_of(work, struct afs_call, async_work);
+
+	_enter("");
+
+	afs_free_call(call);
+
+	_leave("");
+}
+
+/*
+ * perform processing on an asynchronous call
+ * - on a multiple-thread workqueue this work item may try to run on several
+ *   CPUs at the same time
+ */
+static void afs_process_async_call(struct work_struct *work)
+{
+	struct afs_call *call =
+		container_of(work, struct afs_call, async_work);
+
+	_enter("");
+
+	if (!skb_queue_empty(&call->rx_queue))
+		afs_deliver_to_call(call);
+
+	if (call->state >= AFS_CALL_COMPLETE && call->wait_mode) {
+		if (call->wait_mode->async_complete)
+			call->wait_mode->async_complete(call->reply,
+							call->error);
+		call->reply = NULL;
+
+		/* kill the call */
+		rxrpc_kernel_end_call(call->rxcall);
+		call->rxcall = NULL;
+		if (call->type->destructor)
+			call->type->destructor(call);
+
+		/* we can't just delete the call because the work item may be
+		 * queued */
+		PREPARE_WORK(&call->async_work, afs_delete_async_call);
+		queue_work(afs_async_calls, &call->async_work);
+	}
+
+	_leave("");
+}
+
+/*
+ * empty a socket buffer into a flat reply buffer
+ */
+void afs_transfer_reply(struct afs_call *call, struct sk_buff *skb)
+{
+	size_t len = skb->len;
+
+	if (skb_copy_bits(skb, 0, call->buffer + call->reply_size, len) < 0)
+		BUG();
+	call->reply_size += len;
+}
+
+/*
+ * accept the backlog of incoming calls
+ */
+static void afs_collect_incoming_call(struct work_struct *work)
+{
+	struct rxrpc_call *rxcall;
+	struct afs_call *call = NULL;
+	struct sk_buff *skb;
+
+	while ((skb = skb_dequeue(&afs_incoming_calls))) {
+		_debug("new call");
+
+		/* don't need the notification */
+		afs_free_skb(skb);
+
+		if (!call) {
+			call = kzalloc(sizeof(struct afs_call), GFP_KERNEL);
+			if (!call) {
+				rxrpc_kernel_reject_call(afs_socket);
+				return;
+			}
+
+			INIT_WORK(&call->async_work, afs_process_async_call);
+			call->wait_mode = &afs_async_incoming_call;
+			call->type = &afs_RXCMxxxx;
+			init_waitqueue_head(&call->waitq);
+			skb_queue_head_init(&call->rx_queue);
+			call->state = AFS_CALL_AWAIT_OP_ID;
+
+			_debug("CALL %p{%s} [%d]",
+			       call, call->type->name,
+			       atomic_read(&afs_outstanding_calls));
+			atomic_inc(&afs_outstanding_calls);
+		}
+
+		rxcall = rxrpc_kernel_accept_call(afs_socket,
+						  (unsigned long) call);
+		if (!IS_ERR(rxcall)) {
+			call->rxcall = rxcall;
+			call = NULL;
+		}
+	}
+
+	if (call)
+		afs_free_call(call);
+}
+
+/*
+ * grab the operation ID from an incoming cache manager call
+ */
+static int afs_deliver_cm_op_id(struct afs_call *call, struct sk_buff *skb,
+				bool last)
+{
+	size_t len = skb->len;
+	void *oibuf = (void *) &call->operation_ID;
+
+	_enter("{%u},{%zu},%d", call->offset, len, last);
+
+	ASSERTCMP(call->offset, <, 4);
+
+	/* the operation ID forms the first four bytes of the request data */
+	len = min_t(size_t, len, 4 - call->offset);
+	if (skb_copy_bits(skb, 0, oibuf + call->offset, len) < 0)
+		BUG();
+	if (!pskb_pull(skb, len))
+		BUG();
+	call->offset += len;
+
+	if (call->offset < 4) {
+		if (last) {
+			_leave(" = -EBADMSG [op ID short]");
+			return -EBADMSG;
+		}
+		_leave(" = 0 [incomplete]");
+		return 0;
+	}
+
+	call->state = AFS_CALL_AWAIT_REQUEST;
+
+	/* ask the cache manager to route the call (it'll change the call type
+	 * if successful) */
+	if (!afs_cm_incoming_call(call))
+		return -ENOTSUPP;
+
+	/* pass responsibility for the remainer of this message off to the
+	 * cache manager op */
+	return call->type->deliver(call, skb, last);
+}
+
+/*
+ * send an empty reply
+ */
+void afs_send_empty_reply(struct afs_call *call)
+{
+	struct msghdr msg;
+	struct iovec iov[1];
+
+	_enter("");
+
+	iov[0].iov_base		= NULL;
+	iov[0].iov_len		= 0;
+	msg.msg_name		= NULL;
+	msg.msg_namelen		= 0;
+	msg.msg_iov		= iov;
+	msg.msg_iovlen		= 0;
+	msg.msg_control		= NULL;
+	msg.msg_controllen	= 0;
+	msg.msg_flags		= 0;
+
+	call->state = AFS_CALL_AWAIT_ACK;
+	switch (rxrpc_kernel_send_data(call->rxcall, &msg, 0)) {
+	case 0:
+		_leave(" [replied]");
+		return;
+
+	case -ENOMEM:
+		_debug("oom");
+		rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
+	default:
+		rxrpc_kernel_end_call(call->rxcall);
+		call->rxcall = NULL;
+		call->type->destructor(call);
+		afs_free_call(call);
+		_leave(" [error]");
+		return;
+	}
+}
+
+/*
+ * send a simple reply
+ */
+void afs_send_simple_reply(struct afs_call *call, const void *buf, size_t len)
+{
+	struct msghdr msg;
+	struct iovec iov[1];
+
+	_enter("");
+
+	iov[0].iov_base		= (void *) buf;
+	iov[0].iov_len		= len;
+	msg.msg_name		= NULL;
+	msg.msg_namelen		= 0;
+	msg.msg_iov		= iov;
+	msg.msg_iovlen		= 1;
+	msg.msg_control		= NULL;
+	msg.msg_controllen	= 0;
+	msg.msg_flags		= 0;
+
+	call->state = AFS_CALL_AWAIT_ACK;
+	switch (rxrpc_kernel_send_data(call->rxcall, &msg, len)) {
+	case 0:
+		_leave(" [replied]");
+		return;
+
+	case -ENOMEM:
+		_debug("oom");
+		rxrpc_kernel_abort_call(call->rxcall, RX_USER_ABORT);
+	default:
+		rxrpc_kernel_end_call(call->rxcall);
+		call->rxcall = NULL;
+		call->type->destructor(call);
+		afs_free_call(call);
+		_leave(" [error]");
+		return;
+	}
+}
+
+/*
+ * extract a piece of data from the received data socket buffers
+ */
+int afs_extract_data(struct afs_call *call, struct sk_buff *skb,
+		     bool last, void *buf, size_t count)
+{
+	size_t len = skb->len;
+
+	_enter("{%u},{%zu},%d,,%zu", call->offset, len, last, count);
+
+	ASSERTCMP(call->offset, <, count);
+
+	len = min_t(size_t, len, count - call->offset);
+	if (skb_copy_bits(skb, 0, buf + call->offset, len) < 0 ||
+	    !pskb_pull(skb, len))
+		BUG();
+	call->offset += len;
+
+	if (call->offset < count) {
+		if (last) {
+			_leave(" = -EBADMSG [%d < %lu]", call->offset, count);
+			return -EBADMSG;
+		}
+		_leave(" = -EAGAIN");
+		return -EAGAIN;
+	}
+	return 0;
+}
diff --git a/fs/afs/security.c b/fs/afs/security.c
new file mode 100644
index 000000000000..f9f424d80458
--- /dev/null
+++ b/fs/afs/security.c
@@ -0,0 +1,356 @@
+/* AFS security handling
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/init.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/ctype.h>
+#include <keys/rxrpc-type.h>
+#include "internal.h"
+
+/*
+ * get a key
+ */
+struct key *afs_request_key(struct afs_cell *cell)
+{
+	struct key *key;
+
+	_enter("{%x}", key_serial(cell->anonymous_key));
+
+	_debug("key %s", cell->anonymous_key->description);
+	key = request_key(&key_type_rxrpc, cell->anonymous_key->description,
+			  NULL);
+	if (IS_ERR(key)) {
+		if (PTR_ERR(key) != -ENOKEY) {
+			_leave(" = %ld", PTR_ERR(key));
+			return key;
+		}
+
+		/* act as anonymous user */
+		_leave(" = {%x} [anon]", key_serial(cell->anonymous_key));
+		return key_get(cell->anonymous_key);
+	} else {
+		/* act as authorised user */
+		_leave(" = {%x} [auth]", key_serial(key));
+		return key;
+	}
+}
+
+/*
+ * dispose of a permits list
+ */
+void afs_zap_permits(struct rcu_head *rcu)
+{
+	struct afs_permits *permits =
+		container_of(rcu, struct afs_permits, rcu);
+	int loop;
+
+	_enter("{%d}", permits->count);
+
+	for (loop = permits->count - 1; loop >= 0; loop--)
+		key_put(permits->permits[loop].key);
+	kfree(permits);
+}
+
+/*
+ * dispose of a permits list in which all the key pointers have been copied
+ */
+static void afs_dispose_of_permits(struct rcu_head *rcu)
+{
+	struct afs_permits *permits =
+		container_of(rcu, struct afs_permits, rcu);
+
+	_enter("{%d}", permits->count);
+
+	kfree(permits);
+}
+
+/*
+ * get the authorising vnode - this is the specified inode itself if it's a
+ * directory or it's the parent directory if the specified inode is a file or
+ * symlink
+ * - the caller must release the ref on the inode
+ */
+static struct afs_vnode *afs_get_auth_inode(struct afs_vnode *vnode,
+					    struct key *key)
+{
+	struct afs_vnode *auth_vnode;
+	struct inode *auth_inode;
+
+	_enter("");
+
+	if (S_ISDIR(vnode->vfs_inode.i_mode)) {
+		auth_inode = igrab(&vnode->vfs_inode);
+		ASSERT(auth_inode != NULL);
+	} else {
+		auth_inode = afs_iget(vnode->vfs_inode.i_sb, key,
+				      &vnode->status.parent, NULL, NULL);
+		if (IS_ERR(auth_inode))
+			return ERR_PTR(PTR_ERR(auth_inode));
+	}
+
+	auth_vnode = AFS_FS_I(auth_inode);
+	_leave(" = {%x}", auth_vnode->fid.vnode);
+	return auth_vnode;
+}
+
+/*
+ * clear the permit cache on a directory vnode
+ */
+void afs_clear_permits(struct afs_vnode *vnode)
+{
+	struct afs_permits *permits;
+
+	_enter("{%x}", vnode->fid.vnode);
+
+	mutex_lock(&vnode->permits_lock);
+	permits = vnode->permits;
+	rcu_assign_pointer(vnode->permits, NULL);
+	mutex_unlock(&vnode->permits_lock);
+
+	if (permits)
+		call_rcu(&permits->rcu, afs_zap_permits);
+	_leave("");
+}
+
+/*
+ * add the result obtained for a vnode to its or its parent directory's cache
+ * for the key used to access it
+ */
+void afs_cache_permit(struct afs_vnode *vnode, struct key *key, long acl_order)
+{
+	struct afs_permits *permits, *xpermits;
+	struct afs_permit *permit;
+	struct afs_vnode *auth_vnode;
+	int count, loop;
+
+	_enter("{%x},%x,%lx", vnode->fid.vnode, key_serial(key), acl_order);
+
+	auth_vnode = afs_get_auth_inode(vnode, key);
+	if (IS_ERR(auth_vnode)) {
+		_leave(" [get error %ld]", PTR_ERR(auth_vnode));
+		return;
+	}
+
+	mutex_lock(&auth_vnode->permits_lock);
+
+	/* guard against a rename being detected whilst we waited for the
+	 * lock */
+	if (memcmp(&auth_vnode->fid, &vnode->status.parent,
+		   sizeof(struct afs_fid)) != 0) {
+		_debug("renamed");
+		goto out_unlock;
+	}
+
+	/* have to be careful as the directory's callback may be broken between
+	 * us receiving the status we're trying to cache and us getting the
+	 * lock to update the cache for the status */
+	if (auth_vnode->acl_order - acl_order > 0) {
+		_debug("ACL changed?");
+		goto out_unlock;
+	}
+
+	/* always update the anonymous mask */
+	_debug("anon access %x", vnode->status.anon_access);
+	auth_vnode->status.anon_access = vnode->status.anon_access;
+	if (key == vnode->volume->cell->anonymous_key)
+		goto out_unlock;
+
+	xpermits = auth_vnode->permits;
+	count = 0;
+	if (xpermits) {
+		/* see if the permit is already in the list
+		 * - if it is then we just amend the list
+		 */
+		count = xpermits->count;
+		permit = xpermits->permits;
+		for (loop = count; loop > 0; loop--) {
+			if (permit->key == key) {
+				permit->access_mask =
+					vnode->status.caller_access;
+				goto out_unlock;
+			}
+			permit++;
+		}
+	}
+
+	permits = kmalloc(sizeof(*permits) + sizeof(*permit) * (count + 1),
+			  GFP_NOFS);
+	if (!permits)
+		goto out_unlock;
+
+	memcpy(permits->permits, xpermits->permits,
+	       count * sizeof(struct afs_permit));
+
+	_debug("key %x access %x",
+	       key_serial(key), vnode->status.caller_access);
+	permits->permits[count].access_mask = vnode->status.caller_access;
+	permits->permits[count].key = key_get(key);
+	permits->count = count + 1;
+
+	rcu_assign_pointer(auth_vnode->permits, permits);
+	if (xpermits)
+		call_rcu(&xpermits->rcu, afs_dispose_of_permits);
+
+out_unlock:
+	mutex_unlock(&auth_vnode->permits_lock);
+	iput(&auth_vnode->vfs_inode);
+	_leave("");
+}
+
+/*
+ * check with the fileserver to see if the directory or parent directory is
+ * permitted to be accessed with this authorisation, and if so, what access it
+ * is granted
+ */
+static int afs_check_permit(struct afs_vnode *vnode, struct key *key,
+			    afs_access_t *_access)
+{
+	struct afs_permits *permits;
+	struct afs_permit *permit;
+	struct afs_vnode *auth_vnode;
+	bool valid;
+	int loop, ret;
+
+	_enter("");
+
+	auth_vnode = afs_get_auth_inode(vnode, key);
+	if (IS_ERR(auth_vnode)) {
+		*_access = 0;
+		_leave(" = %ld", PTR_ERR(auth_vnode));
+		return PTR_ERR(auth_vnode);
+	}
+
+	ASSERT(S_ISDIR(auth_vnode->vfs_inode.i_mode));
+
+	/* check the permits to see if we've got one yet */
+	if (key == auth_vnode->volume->cell->anonymous_key) {
+		_debug("anon");
+		*_access = auth_vnode->status.anon_access;
+		valid = true;
+	} else {
+		valid = false;
+		rcu_read_lock();
+		permits = rcu_dereference(auth_vnode->permits);
+		if (permits) {
+			permit = permits->permits;
+			for (loop = permits->count; loop > 0; loop--) {
+				if (permit->key == key) {
+					_debug("found in cache");
+					*_access = permit->access_mask;
+					valid = true;
+					break;
+				}
+				permit++;
+			}
+		}
+		rcu_read_unlock();
+	}
+
+	if (!valid) {
+		/* check the status on the file we're actually interested in
+		 * (the post-processing will cache the result on auth_vnode) */
+		_debug("no valid permit");
+
+		set_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+		ret = afs_vnode_fetch_status(vnode, auth_vnode, key);
+		if (ret < 0) {
+			iput(&auth_vnode->vfs_inode);
+			*_access = 0;
+			_leave(" = %d", ret);
+			return ret;
+		}
+	}
+
+	*_access = vnode->status.caller_access;
+	iput(&auth_vnode->vfs_inode);
+	_leave(" = 0 [access %x]", *_access);
+	return 0;
+}
+
+/*
+ * check the permissions on an AFS file
+ * - AFS ACLs are attached to directories only, and a file is controlled by its
+ *   parent directory's ACL
+ */
+int afs_permission(struct inode *inode, int mask, struct nameidata *nd)
+{
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+	afs_access_t access;
+	struct key *key;
+	int ret;
+
+	_enter("{{%x:%x},%lx},%x,",
+	       vnode->fid.vid, vnode->fid.vnode, vnode->flags, mask);
+
+	key = afs_request_key(vnode->volume->cell);
+	if (IS_ERR(key)) {
+		_leave(" = %ld [key]", PTR_ERR(key));
+		return PTR_ERR(key);
+	}
+
+	/* if the promise has expired, we need to check the server again */
+	if (!vnode->cb_promised) {
+		_debug("not promised");
+		ret = afs_vnode_fetch_status(vnode, NULL, key);
+		if (ret < 0)
+			goto error;
+		_debug("new promise [fl=%lx]", vnode->flags);
+	}
+
+	/* check the permits to see if we've got one yet */
+	ret = afs_check_permit(vnode, key, &access);
+	if (ret < 0)
+		goto error;
+
+	/* interpret the access mask */
+	_debug("REQ %x ACC %x on %s",
+	       mask, access, S_ISDIR(inode->i_mode) ? "dir" : "file");
+
+	if (S_ISDIR(inode->i_mode)) {
+		if (mask & MAY_EXEC) {
+			if (!(access & AFS_ACE_LOOKUP))
+				goto permission_denied;
+		} else if (mask & MAY_READ) {
+			if (!(access & AFS_ACE_READ))
+				goto permission_denied;
+		} else if (mask & MAY_WRITE) {
+			if (!(access & (AFS_ACE_DELETE | /* rmdir, unlink, rename from */
+					AFS_ACE_INSERT | /* create, mkdir, symlink, rename to */
+					AFS_ACE_WRITE))) /* chmod */
+				goto permission_denied;
+		} else {
+			BUG();
+		}
+	} else {
+		if (!(access & AFS_ACE_LOOKUP))
+			goto permission_denied;
+		if (mask & (MAY_EXEC | MAY_READ)) {
+			if (!(access & AFS_ACE_READ))
+				goto permission_denied;
+		} else if (mask & MAY_WRITE) {
+			if (!(access & AFS_ACE_WRITE))
+				goto permission_denied;
+		}
+	}
+
+	key_put(key);
+	ret = generic_permission(inode, mask, NULL);
+	_leave(" = %d", ret);
+	return ret;
+
+permission_denied:
+	ret = -EACCES;
+error:
+	key_put(key);
+	_leave(" = %d", ret);
+	return ret;
+}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 44aff81dc6a7..96bb23b476a2 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -1,6 +1,6 @@
-/* server.c: AFS server record management
+/* AFS server record management
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -11,489 +11,314 @@
 
 #include <linux/sched.h>
 #include <linux/slab.h>
-#include <rxrpc/peer.h>
-#include <rxrpc/connection.h>
-#include "volume.h"
-#include "cell.h"
-#include "server.h"
-#include "transport.h"
-#include "vlclient.h"
-#include "kafstimod.h"
 #include "internal.h"
 
-DEFINE_SPINLOCK(afs_server_peer_lock);
+unsigned afs_server_timeout = 10;	/* server timeout in seconds */
 
-#define FS_SERVICE_ID		1	/* AFS Volume Location Service ID */
-#define VL_SERVICE_ID		52	/* AFS Volume Location Service ID */
+static void afs_reap_server(struct work_struct *);
 
-static void __afs_server_timeout(struct afs_timer *timer)
+/* tree of all the servers, indexed by IP address */
+static struct rb_root afs_servers = RB_ROOT;
+static DEFINE_RWLOCK(afs_servers_lock);
+
+/* LRU list of all the servers not currently in use */
+static LIST_HEAD(afs_server_graveyard);
+static DEFINE_SPINLOCK(afs_server_graveyard_lock);
+static DECLARE_DELAYED_WORK(afs_server_reaper, afs_reap_server);
+
+/*
+ * install a server record in the master tree
+ */
+static int afs_install_server(struct afs_server *server)
 {
-	struct afs_server *server =
-		list_entry(timer, struct afs_server, timeout);
+	struct afs_server *xserver;
+	struct rb_node **pp, *p;
+	int ret;
 
-	_debug("SERVER TIMEOUT [%p{u=%d}]",
-	       server, atomic_read(&server->usage));
+	_enter("%p", server);
 
-	afs_server_do_timeout(server);
-}
+	write_lock(&afs_servers_lock);
+
+	ret = -EEXIST;
+	pp = &afs_servers.rb_node;
+	p = NULL;
+	while (*pp) {
+		p = *pp;
+		_debug("- consider %p", p);
+		xserver = rb_entry(p, struct afs_server, master_rb);
+		if (server->addr.s_addr < xserver->addr.s_addr)
+			pp = &(*pp)->rb_left;
+		else if (server->addr.s_addr > xserver->addr.s_addr)
+			pp = &(*pp)->rb_right;
+		else
+			goto error;
+	}
 
-static const struct afs_timer_ops afs_server_timer_ops = {
-	.timed_out	= __afs_server_timeout,
-};
+	rb_link_node(&server->master_rb, p, pp);
+	rb_insert_color(&server->master_rb, &afs_servers);
+	ret = 0;
+
+error:
+	write_unlock(&afs_servers_lock);
+	return ret;
+}
 
-/*****************************************************************************/
 /*
- * lookup a server record in a cell
- * - TODO: search the cell's server list
+ * allocate a new server record
  */
-int afs_server_lookup(struct afs_cell *cell, const struct in_addr *addr,
-		      struct afs_server **_server)
+static struct afs_server *afs_alloc_server(struct afs_cell *cell,
+					   const struct in_addr *addr)
 {
-	struct afs_server *server, *active, *zombie;
-	int loop;
+	struct afs_server *server;
 
-	_enter("%p,%08x,", cell, ntohl(addr->s_addr));
+	_enter("");
 
-	/* allocate and initialise a server record */
 	server = kzalloc(sizeof(struct afs_server), GFP_KERNEL);
-	if (!server) {
-		_leave(" = -ENOMEM");
-		return -ENOMEM;
+	if (server) {
+		atomic_set(&server->usage, 1);
+		server->cell = cell;
+
+		INIT_LIST_HEAD(&server->link);
+		INIT_LIST_HEAD(&server->grave);
+		init_rwsem(&server->sem);
+		spin_lock_init(&server->fs_lock);
+		server->fs_vnodes = RB_ROOT;
+		server->cb_promises = RB_ROOT;
+		spin_lock_init(&server->cb_lock);
+		init_waitqueue_head(&server->cb_break_waitq);
+		INIT_DELAYED_WORK(&server->cb_break_work,
+				  afs_dispatch_give_up_callbacks);
+
+		memcpy(&server->addr, addr, sizeof(struct in_addr));
+		server->addr.s_addr = addr->s_addr;
 	}
 
-	atomic_set(&server->usage, 1);
-
-	INIT_LIST_HEAD(&server->link);
-	init_rwsem(&server->sem);
-	INIT_LIST_HEAD(&server->fs_callq);
-	spin_lock_init(&server->fs_lock);
-	INIT_LIST_HEAD(&server->cb_promises);
-	spin_lock_init(&server->cb_lock);
-
-	for (loop = 0; loop < AFS_SERVER_CONN_LIST_SIZE; loop++)
-		server->fs_conn_cnt[loop] = 4;
+	_leave(" = %p{%d}", server, atomic_read(&server->usage));
+	return server;
+}
 
-	memcpy(&server->addr, addr, sizeof(struct in_addr));
-	server->addr.s_addr = addr->s_addr;
+/*
+ * get an FS-server record for a cell
+ */
+struct afs_server *afs_lookup_server(struct afs_cell *cell,
+				     const struct in_addr *addr)
+{
+	struct afs_server *server, *candidate;
 
-	afs_timer_init(&server->timeout, &afs_server_timer_ops);
+	_enter("%p,"NIPQUAD_FMT, cell, NIPQUAD(addr->s_addr));
 
-	/* add to the cell */
-	write_lock(&cell->sv_lock);
+	/* quick scan of the list to see if we already have the server */
+	read_lock(&cell->servers_lock);
 
-	/* check the active list */
-	list_for_each_entry(active, &cell->sv_list, link) {
-		if (active->addr.s_addr == addr->s_addr)
-			goto use_active_server;
+	list_for_each_entry(server, &cell->servers, link) {
+		if (server->addr.s_addr == addr->s_addr)
+			goto found_server_quickly;
 	}
+	read_unlock(&cell->servers_lock);
 
-	/* check the inactive list */
-	spin_lock(&cell->sv_gylock);
-	list_for_each_entry(zombie, &cell->sv_graveyard, link) {
-		if (zombie->addr.s_addr == addr->s_addr)
-			goto resurrect_server;
+	candidate = afs_alloc_server(cell, addr);
+	if (!candidate) {
+		_leave(" = -ENOMEM");
+		return ERR_PTR(-ENOMEM);
 	}
-	spin_unlock(&cell->sv_gylock);
 
-	afs_get_cell(cell);
-	server->cell = cell;
-	list_add_tail(&server->link, &cell->sv_list);
+	write_lock(&cell->servers_lock);
 
-	write_unlock(&cell->sv_lock);
+	/* check the cell's server list again */
+	list_for_each_entry(server, &cell->servers, link) {
+		if (server->addr.s_addr == addr->s_addr)
+			goto found_server;
+	}
 
-	*_server = server;
-	_leave(" = 0 (%p)", server);
-	return 0;
+	_debug("new");
+	server = candidate;
+	if (afs_install_server(server) < 0)
+		goto server_in_two_cells;
 
-	/* found a matching active server */
- use_active_server:
-	_debug("active server");
-	afs_get_server(active);
-	write_unlock(&cell->sv_lock);
+	afs_get_cell(cell);
+	list_add_tail(&server->link, &cell->servers);
+
+	write_unlock(&cell->servers_lock);
+	_leave(" = %p{%d}", server, atomic_read(&server->usage));
+	return server;
+
+	/* found a matching server quickly */
+found_server_quickly:
+	_debug("found quickly");
+	afs_get_server(server);
+	read_unlock(&cell->servers_lock);
+no_longer_unused:
+	if (!list_empty(&server->grave)) {
+		spin_lock(&afs_server_graveyard_lock);
+		list_del_init(&server->grave);
+		spin_unlock(&afs_server_graveyard_lock);
+	}
+	_leave(" = %p{%d}", server, atomic_read(&server->usage));
+	return server;
+
+	/* found a matching server on the second pass */
+found_server:
+	_debug("found");
+	afs_get_server(server);
+	write_unlock(&cell->servers_lock);
+	kfree(candidate);
+	goto no_longer_unused;
+
+	/* found a server that seems to be in two cells */
+server_in_two_cells:
+	write_unlock(&cell->servers_lock);
+	kfree(candidate);
+	printk(KERN_NOTICE "kAFS:"
+	       " Server "NIPQUAD_FMT" appears to be in two cells\n",
+	       NIPQUAD(*addr));
+	_leave(" = -EEXIST");
+	return ERR_PTR(-EEXIST);
+}
 
-	kfree(server);
+/*
+ * look up a server by its IP address
+ */
+struct afs_server *afs_find_server(const struct in_addr *_addr)
+{
+	struct afs_server *server = NULL;
+	struct rb_node *p;
+	struct in_addr addr = *_addr;
 
-	*_server = active;
-	_leave(" = 0 (%p)", active);
-	return 0;
+	_enter(NIPQUAD_FMT, NIPQUAD(addr.s_addr));
 
-	/* found a matching server in the graveyard, so resurrect it and
-	 * dispose of the new record */
- resurrect_server:
-	_debug("resurrecting server");
+	read_lock(&afs_servers_lock);
 
-	list_move_tail(&zombie->link, &cell->sv_list);
-	afs_get_server(zombie);
-	afs_kafstimod_del_timer(&zombie->timeout);
-	spin_unlock(&cell->sv_gylock);
-	write_unlock(&cell->sv_lock);
+	p = afs_servers.rb_node;
+	while (p) {
+		server = rb_entry(p, struct afs_server, master_rb);
 
-	kfree(server);
+		_debug("- consider %p", p);
 
-	*_server = zombie;
-	_leave(" = 0 (%p)", zombie);
-	return 0;
+		if (addr.s_addr < server->addr.s_addr) {
+			p = p->rb_left;
+		} else if (addr.s_addr > server->addr.s_addr) {
+			p = p->rb_right;
+		} else {
+			afs_get_server(server);
+			goto found;
+		}
+	}
 
-} /* end afs_server_lookup() */
+	server = NULL;
+found:
+	read_unlock(&afs_servers_lock);
+	ASSERTIFCMP(server, server->addr.s_addr, ==, addr.s_addr);
+	_leave(" = %p", server);
+	return server;
+}
 
-/*****************************************************************************/
 /*
  * destroy a server record
  * - removes from the cell list
  */
 void afs_put_server(struct afs_server *server)
 {
-	struct afs_cell *cell;
-
 	if (!server)
 		return;
 
-	_enter("%p", server);
-
-	cell = server->cell;
+	_enter("%p{%d}", server, atomic_read(&server->usage));
 
-	/* sanity check */
-	BUG_ON(atomic_read(&server->usage) <= 0);
+	_debug("PUT SERVER %d", atomic_read(&server->usage));
 
-	/* to prevent a race, the decrement and the dequeue must be effectively
-	 * atomic */
-	write_lock(&cell->sv_lock);
+	ASSERTCMP(atomic_read(&server->usage), >, 0);
 
 	if (likely(!atomic_dec_and_test(&server->usage))) {
-		write_unlock(&cell->sv_lock);
 		_leave("");
 		return;
 	}
 
-	spin_lock(&cell->sv_gylock);
-	list_move_tail(&server->link, &cell->sv_graveyard);
+	afs_flush_callback_breaks(server);
 
-	/* time out in 10 secs */
-	afs_kafstimod_add_timer(&server->timeout, 10 * HZ);
-
-	spin_unlock(&cell->sv_gylock);
-	write_unlock(&cell->sv_lock);
-
-	_leave(" [killed]");
-} /* end afs_put_server() */
+	spin_lock(&afs_server_graveyard_lock);
+	if (atomic_read(&server->usage) == 0) {
+		list_move_tail(&server->grave, &afs_server_graveyard);
+		server->time_of_death = get_seconds();
+		schedule_delayed_work(&afs_server_reaper,
+				      afs_server_timeout * HZ);
+	}
+	spin_unlock(&afs_server_graveyard_lock);
+	_leave(" [dead]");
+}
 
-/*****************************************************************************/
 /*
- * timeout server record
- * - removes from the cell's graveyard if the usage count is zero
+ * destroy a dead server
  */
-void afs_server_do_timeout(struct afs_server *server)
+static void afs_destroy_server(struct afs_server *server)
 {
-	struct rxrpc_peer *peer;
-	struct afs_cell *cell;
-	int loop;
-
 	_enter("%p", server);
 
-	cell = server->cell;
-
-	BUG_ON(atomic_read(&server->usage) < 0);
-
-	/* remove from graveyard if still dead */
-	spin_lock(&cell->vl_gylock);
-	if (atomic_read(&server->usage) == 0)
-		list_del_init(&server->link);
-	else
-		server = NULL;
-	spin_unlock(&cell->vl_gylock);
-
-	if (!server) {
-		_leave("");
-		return; /* resurrected */
-	}
-
-	/* we can now destroy it properly */
-	afs_put_cell(cell);
-
-	/* uncross-point the structs under a global lock */
-	spin_lock(&afs_server_peer_lock);
-	peer = server->peer;
-	if (peer) {
-		server->peer = NULL;
-		peer->user = NULL;
-	}
-	spin_unlock(&afs_server_peer_lock);
-
-	/* finish cleaning up the server */
-	for (loop = AFS_SERVER_CONN_LIST_SIZE - 1; loop >= 0; loop--)
-		if (server->fs_conn[loop])
-			rxrpc_put_connection(server->fs_conn[loop]);
-
-	if (server->vlserver)
-		rxrpc_put_connection(server->vlserver);
+	ASSERTCMP(server->fs_vnodes.rb_node, ==, NULL);
+	ASSERTCMP(server->cb_promises.rb_node, ==, NULL);
+	ASSERTCMP(server->cb_break_head, ==, server->cb_break_tail);
+	ASSERTCMP(atomic_read(&server->cb_break_n), ==, 0);
 
+	afs_put_cell(server->cell);
 	kfree(server);
+}
 
-	_leave(" [destroyed]");
-} /* end afs_server_do_timeout() */
-
-/*****************************************************************************/
 /*
- * get a callslot on a connection to the fileserver on the specified server
+ * reap dead server records
  */
-int afs_server_request_callslot(struct afs_server *server,
-				struct afs_server_callslot *callslot)
+static void afs_reap_server(struct work_struct *work)
 {
-	struct afs_server_callslot *pcallslot;
-	struct rxrpc_connection *conn;
-	int nconn, ret;
-
-	_enter("%p,",server);
-
-	INIT_LIST_HEAD(&callslot->link);
-	callslot->task = current;
-	callslot->conn = NULL;
-	callslot->nconn = -1;
-	callslot->ready = 0;
-
-	ret = 0;
-	conn = NULL;
-
-	/* get hold of a callslot first */
-	spin_lock(&server->fs_lock);
-
-	/* resurrect the server if it's death timeout has expired */
-	if (server->fs_state) {
-		if (time_before(jiffies, server->fs_dead_jif)) {
-			ret = server->fs_state;
-			spin_unlock(&server->fs_lock);
-			_leave(" = %d [still dead]", ret);
-			return ret;
+	LIST_HEAD(corpses);
+	struct afs_server *server;
+	unsigned long delay, expiry;
+	time_t now;
+
+	now = get_seconds();
+	spin_lock(&afs_server_graveyard_lock);
+
+	while (!list_empty(&afs_server_graveyard)) {
+		server = list_entry(afs_server_graveyard.next,
+				    struct afs_server, grave);
+
+		/* the queue is ordered most dead first */
+		expiry = server->time_of_death + afs_server_timeout;
+		if (expiry > now) {
+			delay = (expiry - now) * HZ;
+			if (!schedule_delayed_work(&afs_server_reaper, delay)) {
+				cancel_delayed_work(&afs_server_reaper);
+				schedule_delayed_work(&afs_server_reaper,
+						      delay);
+			}
+			break;
 		}
 
-		server->fs_state = 0;
-	}
-
-	/* try and find a connection that has spare callslots */
-	for (nconn = 0; nconn < AFS_SERVER_CONN_LIST_SIZE; nconn++) {
-		if (server->fs_conn_cnt[nconn] > 0) {
-			server->fs_conn_cnt[nconn]--;
-			spin_unlock(&server->fs_lock);
-			callslot->nconn = nconn;
-			goto obtained_slot;
+		write_lock(&server->cell->servers_lock);
+		write_lock(&afs_servers_lock);
+		if (atomic_read(&server->usage) > 0) {
+			list_del_init(&server->grave);
+		} else {
+			list_move_tail(&server->grave, &corpses);
+			list_del_init(&server->link);
+			rb_erase(&server->master_rb, &afs_servers);
 		}
+		write_unlock(&afs_servers_lock);
+		write_unlock(&server->cell->servers_lock);
 	}
 
-	/* none were available - wait interruptibly for one to become
-	 * available */
-	set_current_state(TASK_INTERRUPTIBLE);
-	list_add_tail(&callslot->link, &server->fs_callq);
-	spin_unlock(&server->fs_lock);
-
-	while (!callslot->ready && !signal_pending(current)) {
-		schedule();
-		set_current_state(TASK_INTERRUPTIBLE);
-	}
-
-	set_current_state(TASK_RUNNING);
-
-	/* even if we were interrupted we may still be queued */
-	if (!callslot->ready) {
-		spin_lock(&server->fs_lock);
-		list_del_init(&callslot->link);
-		spin_unlock(&server->fs_lock);
-	}
-
-	nconn = callslot->nconn;
+	spin_unlock(&afs_server_graveyard_lock);
 
-	/* if interrupted, we must release any slot we also got before
-	 * returning an error */
-	if (signal_pending(current)) {
-		ret = -EINTR;
-		goto error_release;
+	/* now reap the corpses we've extracted */
+	while (!list_empty(&corpses)) {
+		server = list_entry(corpses.next, struct afs_server, grave);
+		list_del(&server->grave);
+		afs_destroy_server(server);
 	}
+}
 
-	/* if we were woken up with an error, then pass that error back to the
-	 * called */
-	if (nconn < 0) {
-		_leave(" = %d", callslot->errno);
-		return callslot->errno;
-	}
-
-	/* were we given a connection directly? */
-	if (callslot->conn) {
-		/* yes - use it */
-		_leave(" = 0 (nc=%d)", nconn);
-		return 0;
-	}
-
-	/* got a callslot, but no connection */
- obtained_slot:
-
-	/* need to get hold of the RxRPC connection */
-	down_write(&server->sem);
-
-	/* quick check to see if there's an outstanding error */
-	ret = server->fs_state;
-	if (ret)
-		goto error_release_upw;
-
-	if (server->fs_conn[nconn]) {
-		/* reuse an existing connection */
-		rxrpc_get_connection(server->fs_conn[nconn]);
-		callslot->conn = server->fs_conn[nconn];
-	}
-	else {
-		/* create a new connection */
-		ret = rxrpc_create_connection(afs_transport,
-					      htons(7000),
-					      server->addr.s_addr,
-					      FS_SERVICE_ID,
-					      NULL,
-					      &server->fs_conn[nconn]);
-
-		if (ret < 0)
-			goto error_release_upw;
-
-		callslot->conn = server->fs_conn[0];
-		rxrpc_get_connection(callslot->conn);
-	}
-
-	up_write(&server->sem);
-
- 	_leave(" = 0");
-	return 0;
-
-	/* handle an error occurring */
- error_release_upw:
-	up_write(&server->sem);
-
- error_release:
-	/* either release the callslot or pass it along to another deserving
-	 * task */
-	spin_lock(&server->fs_lock);
-
-	if (nconn < 0) {
-		/* no callslot allocated */
-	}
-	else if (list_empty(&server->fs_callq)) {
-		/* no one waiting */
-		server->fs_conn_cnt[nconn]++;
-		spin_unlock(&server->fs_lock);
-	}
-	else {
-		/* someone's waiting - dequeue them and wake them up */
-		pcallslot = list_entry(server->fs_callq.next,
-				       struct afs_server_callslot, link);
-		list_del_init(&pcallslot->link);
-
-		pcallslot->errno = server->fs_state;
-		if (!pcallslot->errno) {
-			/* pass them out callslot details */
-			callslot->conn = xchg(&pcallslot->conn,
-					      callslot->conn);
-			pcallslot->nconn = nconn;
-			callslot->nconn = nconn = -1;
-		}
-		pcallslot->ready = 1;
-		wake_up_process(pcallslot->task);
-		spin_unlock(&server->fs_lock);
-	}
-
-	rxrpc_put_connection(callslot->conn);
-	callslot->conn = NULL;
-
-	_leave(" = %d", ret);
-	return ret;
-
-} /* end afs_server_request_callslot() */
-
-/*****************************************************************************/
-/*
- * release a callslot back to the server
- * - transfers the RxRPC connection to the next pending callslot if possible
- */
-void afs_server_release_callslot(struct afs_server *server,
-				 struct afs_server_callslot *callslot)
-{
-	struct afs_server_callslot *pcallslot;
-
-	_enter("{ad=%08x,cnt=%u},{%d}",
-	       ntohl(server->addr.s_addr),
-	       server->fs_conn_cnt[callslot->nconn],
-	       callslot->nconn);
-
-	BUG_ON(callslot->nconn < 0);
-
-	spin_lock(&server->fs_lock);
-
-	if (list_empty(&server->fs_callq)) {
-		/* no one waiting */
-		server->fs_conn_cnt[callslot->nconn]++;
-		spin_unlock(&server->fs_lock);
-	}
-	else {
-		/* someone's waiting - dequeue them and wake them up */
-		pcallslot = list_entry(server->fs_callq.next,
-				       struct afs_server_callslot, link);
-		list_del_init(&pcallslot->link);
-
-		pcallslot->errno = server->fs_state;
-		if (!pcallslot->errno) {
-			/* pass them out callslot details */
-			callslot->conn = xchg(&pcallslot->conn, callslot->conn);
-			pcallslot->nconn = callslot->nconn;
-			callslot->nconn = -1;
-		}
-
-		pcallslot->ready = 1;
-		wake_up_process(pcallslot->task);
-		spin_unlock(&server->fs_lock);
-	}
-
-	rxrpc_put_connection(callslot->conn);
-
-	_leave("");
-} /* end afs_server_release_callslot() */
-
-/*****************************************************************************/
 /*
- * get a handle to a connection to the vlserver (volume location) on the
- * specified server
+ * discard all the server records for rmmod
  */
-int afs_server_get_vlconn(struct afs_server *server,
-			  struct rxrpc_connection **_conn)
+void __exit afs_purge_servers(void)
 {
-	struct rxrpc_connection *conn;
-	int ret;
-
-	_enter("%p,", server);
-
-	ret = 0;
-	conn = NULL;
-	down_read(&server->sem);
-
-	if (server->vlserver) {
-		/* reuse an existing connection */
-		rxrpc_get_connection(server->vlserver);
-		conn = server->vlserver;
-		up_read(&server->sem);
-	}
-	else {
-		/* create a new connection */
-		up_read(&server->sem);
-		down_write(&server->sem);
-		if (!server->vlserver) {
-			ret = rxrpc_create_connection(afs_transport,
-						      htons(7003),
-						      server->addr.s_addr,
-						      VL_SERVICE_ID,
-						      NULL,
-						      &server->vlserver);
-		}
-		if (ret == 0) {
-			rxrpc_get_connection(server->vlserver);
-			conn = server->vlserver;
-		}
-		up_write(&server->sem);
-	}
-
-	*_conn = conn;
-	_leave(" = %d", ret);
-	return ret;
-} /* end afs_server_get_vlconn() */
+	afs_server_timeout = 0;
+	cancel_delayed_work(&afs_server_reaper);
+	schedule_delayed_work(&afs_server_reaper, 0);
+}
diff --git a/fs/afs/server.h b/fs/afs/server.h
deleted file mode 100644
index c3d24115578f..000000000000
--- a/fs/afs/server.h
+++ /dev/null
@@ -1,102 +0,0 @@
-/* server.h: AFS server record
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_SERVER_H
-#define _LINUX_AFS_SERVER_H
-
-#include "types.h"
-#include "kafstimod.h"
-#include <rxrpc/peer.h>
-#include <linux/rwsem.h>
-
-extern spinlock_t afs_server_peer_lock;
-
-/*****************************************************************************/
-/*
- * AFS server record
- */
-struct afs_server
-{
-	atomic_t		usage;
-	struct afs_cell		*cell;		/* cell in which server resides */
-	struct list_head	link;		/* link in cell's server list */
-	struct rw_semaphore	sem;		/* access lock */
-	struct afs_timer	timeout;	/* graveyard timeout */
-	struct in_addr		addr;		/* server address */
-	struct rxrpc_peer	*peer;		/* peer record for this server */
-	struct rxrpc_connection	*vlserver;	/* connection to the volume location service */
-
-	/* file service access */
-#define AFS_SERVER_CONN_LIST_SIZE 2
-	struct rxrpc_connection	*fs_conn[AFS_SERVER_CONN_LIST_SIZE]; /* FS connections */
-	unsigned		fs_conn_cnt[AFS_SERVER_CONN_LIST_SIZE];	/* per conn call count */
-	struct list_head	fs_callq;	/* queue of processes waiting to make a call */
-	spinlock_t		fs_lock;	/* access lock */
-	int			fs_state;      	/* 0 or reason FS currently marked dead (-errno) */
-	unsigned		fs_rtt;		/* FS round trip time */
-	unsigned long		fs_act_jif;	/* time at which last activity occurred */
-	unsigned long		fs_dead_jif;	/* time at which no longer to be considered dead */
-
-	/* callback promise management */
-	struct list_head	cb_promises;	/* as yet unbroken promises from this server */
-	spinlock_t		cb_lock;	/* access lock */
-};
-
-extern int afs_server_lookup(struct afs_cell *cell,
-			     const struct in_addr *addr,
-			     struct afs_server **_server);
-
-#define afs_get_server(S) do { atomic_inc(&(S)->usage); } while(0)
-
-extern void afs_put_server(struct afs_server *server);
-extern void afs_server_do_timeout(struct afs_server *server);
-
-extern int afs_server_find_by_peer(const struct rxrpc_peer *peer,
-				   struct afs_server **_server);
-
-extern int afs_server_get_vlconn(struct afs_server *server,
-				 struct rxrpc_connection **_conn);
-
-static inline
-struct afs_server *afs_server_get_from_peer(struct rxrpc_peer *peer)
-{
-	struct afs_server *server;
-
-	spin_lock(&afs_server_peer_lock);
-	server = peer->user;
-	if (server)
-		afs_get_server(server);
-	spin_unlock(&afs_server_peer_lock);
-
-	return server;
-}
-
-/*****************************************************************************/
-/*
- * AFS server callslot grant record
- */
-struct afs_server_callslot
-{
-	struct list_head	link;		/* link in server's list */
-	struct task_struct	*task;		/* process waiting to make call */
-	struct rxrpc_connection	*conn;		/* connection to use (or NULL on error) */
-	short			nconn;		/* connection slot number (-1 on error) */
-	char			ready;		/* T when ready */
-	int			errno;		/* error number if nconn==-1 */
-};
-
-extern int afs_server_request_callslot(struct afs_server *server,
-				       struct afs_server_callslot *callslot);
-
-extern void afs_server_release_callslot(struct afs_server *server,
-					struct afs_server_callslot *callslot);
-
-#endif /* _LINUX_AFS_SERVER_H */
diff --git a/fs/afs/super.c b/fs/afs/super.c
index eb7e32349da3..cebd03c91f57 100644
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -1,5 +1,6 @@
-/*
- * Copyright (c) 2002 Red Hat, Inc. All rights reserved.
+/* AFS superblock handling
+ *
+ * Copyright (c) 2002, 2007 Red Hat, Inc. All rights reserved.
  *
  * This software may be freely redistributed under the terms of the
  * GNU General Public License.
@@ -9,7 +10,7 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
  * Authors: David Howells <dhowells@redhat.com>
- *          David Woodhouse <dwmw2@cambridge.redhat.com>
+ *          David Woodhouse <dwmw2@redhat.com>
  *
  */
 
@@ -19,22 +20,10 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include "vnode.h"
-#include "volume.h"
-#include "cell.h"
-#include "cmservice.h"
-#include "fsclient.h"
-#include "super.h"
 #include "internal.h"
 
 #define AFS_FS_MAGIC 0x6B414653 /* 'kAFS' */
 
-struct afs_mount_params {
-	int			rwpath;
-	struct afs_cell		*default_cell;
-	struct afs_volume	*volume;
-};
-
 static void afs_i_init_once(void *foo, struct kmem_cache *cachep,
 			    unsigned long flags);
 
@@ -62,13 +51,13 @@ static const struct super_operations afs_super_ops = {
 	.drop_inode	= generic_delete_inode,
 	.destroy_inode	= afs_destroy_inode,
 	.clear_inode	= afs_clear_inode,
+	.umount_begin	= afs_umount_begin,
 	.put_super	= afs_put_super,
 };
 
 static struct kmem_cache *afs_inode_cachep;
 static atomic_t afs_count_active_inodes;
 
-/*****************************************************************************/
 /*
  * initialise the filesystem
  */
@@ -78,8 +67,6 @@ int __init afs_fs_init(void)
 
 	_enter("");
 
-	afs_timer_init(&afs_mntpt_expiry_timer, &afs_mntpt_expiry_timer_ops);
-
 	/* create ourselves an inode cache */
 	atomic_set(&afs_count_active_inodes, 0);
 
@@ -99,20 +86,22 @@ int __init afs_fs_init(void)
 	ret = register_filesystem(&afs_fs_type);
 	if (ret < 0) {
 		kmem_cache_destroy(afs_inode_cachep);
-		kleave(" = %d", ret);
+		_leave(" = %d", ret);
 		return ret;
 	}
 
-	kleave(" = 0");
+	_leave(" = 0");
 	return 0;
-} /* end afs_fs_init() */
+}
 
-/*****************************************************************************/
 /*
  * clean up the filesystem
  */
 void __exit afs_fs_exit(void)
 {
+	_enter("");
+
+	afs_mntpt_kill_timer();
 	unregister_filesystem(&afs_fs_type);
 
 	if (atomic_read(&afs_count_active_inodes) != 0) {
@@ -122,10 +111,9 @@ void __exit afs_fs_exit(void)
 	}
 
 	kmem_cache_destroy(afs_inode_cachep);
+	_leave("");
+}
 
-} /* end afs_fs_exit() */
-
-/*****************************************************************************/
 /*
  * check that an argument has a value
  */
@@ -136,9 +124,8 @@ static int want_arg(char **_value, const char *option)
 		return 0;
 	}
 	return 1;
-} /* end want_arg() */
+}
 
-/*****************************************************************************/
 /*
  * check that there's no subsequent value
  */
@@ -150,18 +137,17 @@ static int want_no_value(char *const *_value, const char *option)
 		return 0;
 	}
 	return 1;
-} /* end want_no_value() */
+}
 
-/*****************************************************************************/
 /*
  * parse the mount options
  * - this function has been shamelessly adapted from the ext3 fs which
  *   shamelessly adapted it from the msdos fs
  */
-static int afs_super_parse_options(struct afs_mount_params *params,
-				   char *options,
-				   const char **devname)
+static int afs_parse_options(struct afs_mount_params *params,
+			     char *options, const char **devname)
 {
+	struct afs_cell *cell;
 	char *key, *value;
 	int ret;
 
@@ -170,51 +156,135 @@ static int afs_super_parse_options(struct afs_mount_params *params,
 	options[PAGE_SIZE - 1] = 0;
 
 	ret = 0;
-	while ((key = strsep(&options, ",")) != 0)
-	{
+	while ((key = strsep(&options, ","))) {
 		value = strchr(key, '=');
 		if (value)
 			*value++ = 0;
 
-		printk("kAFS: KEY: %s, VAL:%s\n", key, value ?: "-");
+		_debug("kAFS: KEY: %s, VAL:%s", key, value ?: "-");
 
 		if (strcmp(key, "rwpath") == 0) {
 			if (!want_no_value(&value, "rwpath"))
 				return -EINVAL;
 			params->rwpath = 1;
-			continue;
-		}
-		else if (strcmp(key, "vol") == 0) {
+		} else if (strcmp(key, "vol") == 0) {
 			if (!want_arg(&value, "vol"))
 				return -EINVAL;
 			*devname = value;
-			continue;
-		}
-		else if (strcmp(key, "cell") == 0) {
+		} else if (strcmp(key, "cell") == 0) {
 			if (!want_arg(&value, "cell"))
 				return -EINVAL;
-			afs_put_cell(params->default_cell);
-			ret = afs_cell_lookup(value,
-					      strlen(value),
-					      &params->default_cell);
-			if (ret < 0)
-				return -EINVAL;
-			continue;
+			cell = afs_cell_lookup(value, strlen(value));
+			if (IS_ERR(cell))
+				return PTR_ERR(cell);
+			afs_put_cell(params->cell);
+			params->cell = cell;
+		} else {
+			printk("kAFS: Unknown mount option: '%s'\n",  key);
+			ret = -EINVAL;
+			goto error;
 		}
-
-		printk("kAFS: Unknown mount option: '%s'\n",  key);
-		ret = -EINVAL;
-		goto error;
 	}
 
 	ret = 0;
-
- error:
+error:
 	_leave(" = %d", ret);
 	return ret;
-} /* end afs_super_parse_options() */
+}
+
+/*
+ * parse a device name to get cell name, volume name, volume type and R/W
+ * selector
+ * - this can be one of the following:
+ *	"%[cell:]volume[.]"		R/W volume
+ *	"#[cell:]volume[.]"		R/O or R/W volume (rwpath=0),
+ *					 or R/W (rwpath=1) volume
+ *	"%[cell:]volume.readonly"	R/O volume
+ *	"#[cell:]volume.readonly"	R/O volume
+ *	"%[cell:]volume.backup"		Backup volume
+ *	"#[cell:]volume.backup"		Backup volume
+ */
+static int afs_parse_device_name(struct afs_mount_params *params,
+				 const char *name)
+{
+	struct afs_cell *cell;
+	const char *cellname, *suffix;
+	int cellnamesz;
+
+	_enter(",%s", name);
+
+	if (!name) {
+		printk(KERN_ERR "kAFS: no volume name specified\n");
+		return -EINVAL;
+	}
+
+	if ((name[0] != '%' && name[0] != '#') || !name[1]) {
+		printk(KERN_ERR "kAFS: unparsable volume name\n");
+		return -EINVAL;
+	}
+
+	/* determine the type of volume we're looking for */
+	params->type = AFSVL_ROVOL;
+	params->force = false;
+	if (params->rwpath || name[0] == '%') {
+		params->type = AFSVL_RWVOL;
+		params->force = true;
+	}
+	name++;
+
+	/* split the cell name out if there is one */
+	params->volname = strchr(name, ':');
+	if (params->volname) {
+		cellname = name;
+		cellnamesz = params->volname - name;
+		params->volname++;
+	} else {
+		params->volname = name;
+		cellname = NULL;
+		cellnamesz = 0;
+	}
+
+	/* the volume type is further affected by a possible suffix */
+	suffix = strrchr(params->volname, '.');
+	if (suffix) {
+		if (strcmp(suffix, ".readonly") == 0) {
+			params->type = AFSVL_ROVOL;
+			params->force = true;
+		} else if (strcmp(suffix, ".backup") == 0) {
+			params->type = AFSVL_BACKVOL;
+			params->force = true;
+		} else if (suffix[1] == 0) {
+		} else {
+			suffix = NULL;
+		}
+	}
+
+	params->volnamesz = suffix ?
+		suffix - params->volname : strlen(params->volname);
+
+	_debug("cell %*.*s [%p]",
+	       cellnamesz, cellnamesz, cellname ?: "", params->cell);
+
+	/* lookup the cell record */
+	if (cellname || !params->cell) {
+		cell = afs_cell_lookup(cellname, cellnamesz);
+		if (IS_ERR(cell)) {
+			printk(KERN_ERR "kAFS: unable to lookup cell '%s'\n",
+			       cellname ?: "");
+			return PTR_ERR(cell);
+		}
+		afs_put_cell(params->cell);
+		params->cell = cell;
+	}
+
+	_debug("CELL:%s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
+	       params->cell->name, params->cell,
+	       params->volnamesz, params->volnamesz, params->volname,
+	       suffix ?: "-", params->type, params->force ? " FORCE" : "");
+
+	return 0;
+}
 
-/*****************************************************************************/
 /*
  * check a superblock to see if it's the one we're looking for
  */
@@ -224,13 +294,12 @@ static int afs_test_super(struct super_block *sb, void *data)
 	struct afs_super_info *as = sb->s_fs_info;
 
 	return as->volume == params->volume;
-} /* end afs_test_super() */
+}
 
-/*****************************************************************************/
 /*
  * fill in the superblock
  */
-static int afs_fill_super(struct super_block *sb, void *data, int silent)
+static int afs_fill_super(struct super_block *sb, void *data)
 {
 	struct afs_mount_params *params = data;
 	struct afs_super_info *as = NULL;
@@ -239,7 +308,7 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
 	struct inode *inode = NULL;
 	int ret;
 
-	kenter("");
+	_enter("");
 
 	/* allocate a superblock info record */
 	as = kzalloc(sizeof(struct afs_super_info), GFP_KERNEL);
@@ -262,9 +331,9 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
 	fid.vid		= as->volume->vid;
 	fid.vnode	= 1;
 	fid.unique	= 1;
-	ret = afs_iget(sb, &fid, &inode);
-	if (ret < 0)
-		goto error;
+	inode = afs_iget(sb, params->key, &fid, NULL, NULL);
+	if (IS_ERR(inode))
+		goto error_inode;
 
 	ret = -ENOMEM;
 	root = d_alloc_root(inode);
@@ -273,21 +342,23 @@ static int afs_fill_super(struct super_block *sb, void *data, int silent)
 
 	sb->s_root = root;
 
-	kleave(" = 0");
+	_leave(" = 0");
 	return 0;
 
- error:
+error_inode:
+	ret = PTR_ERR(inode);
+	inode = NULL;
+error:
 	iput(inode);
 	afs_put_volume(as->volume);
 	kfree(as);
 
 	sb->s_fs_info = NULL;
 
-	kleave(" = %d", ret);
+	_leave(" = %d", ret);
 	return ret;
-} /* end afs_fill_super() */
+}
 
-/*****************************************************************************/
 /*
  * get an AFS superblock
  * - TODO: don't use get_sb_nodev(), but rather call sget() directly
@@ -300,69 +371,80 @@ static int afs_get_sb(struct file_system_type *fs_type,
 {
 	struct afs_mount_params params;
 	struct super_block *sb;
+	struct afs_volume *vol;
+	struct key *key;
 	int ret;
 
 	_enter(",,%s,%p", dev_name, options);
 
 	memset(&params, 0, sizeof(params));
 
-	/* start the cache manager */
-	ret = afscm_start();
-	if (ret < 0) {
-		_leave(" = %d", ret);
-		return ret;
-	}
-
-	/* parse the options */
+	/* parse the options and device name */
 	if (options) {
-		ret = afs_super_parse_options(&params, options, &dev_name);
+		ret = afs_parse_options(&params, options, &dev_name);
 		if (ret < 0)
 			goto error;
-		if (!dev_name) {
-			printk("kAFS: no volume name specified\n");
-			ret = -EINVAL;
-			goto error;
-		}
 	}
 
-	/* parse the device name */
-	ret = afs_volume_lookup(dev_name,
-				params.default_cell,
-				params.rwpath,
-				&params.volume);
+
+	ret = afs_parse_device_name(&params, dev_name);
 	if (ret < 0)
 		goto error;
 
-	/* allocate a deviceless superblock */
-	sb = sget(fs_type, afs_test_super, set_anon_super, &params);
-	if (IS_ERR(sb))
+	/* try and do the mount securely */
+	key = afs_request_key(params.cell);
+	if (IS_ERR(key)) {
+		_leave(" = %ld [key]", PTR_ERR(key));
+		ret = PTR_ERR(key);
 		goto error;
+	}
+	params.key = key;
 
-	sb->s_flags = flags;
+	/* parse the device name */
+	vol = afs_volume_lookup(&params);
+	if (IS_ERR(vol)) {
+		ret = PTR_ERR(vol);
+		goto error;
+	}
+	params.volume = vol;
 
-	ret = afs_fill_super(sb, &params, flags & MS_SILENT ? 1 : 0);
-	if (ret < 0) {
-		up_write(&sb->s_umount);
-		deactivate_super(sb);
+	/* allocate a deviceless superblock */
+	sb = sget(fs_type, afs_test_super, set_anon_super, &params);
+	if (IS_ERR(sb)) {
+		ret = PTR_ERR(sb);
 		goto error;
 	}
-	sb->s_flags |= MS_ACTIVE;
-	simple_set_mnt(mnt, sb);
 
+	if (!sb->s_root) {
+		/* initial superblock/root creation */
+		_debug("create");
+		sb->s_flags = flags;
+		ret = afs_fill_super(sb, &params);
+		if (ret < 0) {
+			up_write(&sb->s_umount);
+			deactivate_super(sb);
+			goto error;
+		}
+		sb->s_flags |= MS_ACTIVE;
+	} else {
+		_debug("reuse");
+		ASSERTCMP(sb->s_flags, &, MS_ACTIVE);
+	}
+
+	simple_set_mnt(mnt, sb);
 	afs_put_volume(params.volume);
-	afs_put_cell(params.default_cell);
-	_leave(" = 0 [%p]", 0, sb);
+	afs_put_cell(params.cell);
+	_leave(" = 0 [%p]", sb);
 	return 0;
 
- error:
+error:
 	afs_put_volume(params.volume);
-	afs_put_cell(params.default_cell);
-	afscm_stop();
+	afs_put_cell(params.cell);
+	key_put(params.key);
 	_leave(" = %d", ret);
 	return ret;
-} /* end afs_get_sb() */
+}
 
-/*****************************************************************************/
 /*
  * finish the unmounting process on the superblock
  */
@@ -373,35 +455,30 @@ static void afs_put_super(struct super_block *sb)
 	_enter("");
 
 	afs_put_volume(as->volume);
-	afscm_stop();
 
 	_leave("");
-} /* end afs_put_super() */
+}
 
-/*****************************************************************************/
 /*
  * initialise an inode cache slab element prior to any use
  */
 static void afs_i_init_once(void *_vnode, struct kmem_cache *cachep,
 			    unsigned long flags)
 {
-	struct afs_vnode *vnode = (struct afs_vnode *) _vnode;
+	struct afs_vnode *vnode = _vnode;
 
 	if ((flags & (SLAB_CTOR_VERIFY|SLAB_CTOR_CONSTRUCTOR)) ==
 	    SLAB_CTOR_CONSTRUCTOR) {
 		memset(vnode, 0, sizeof(*vnode));
 		inode_init_once(&vnode->vfs_inode);
 		init_waitqueue_head(&vnode->update_waitq);
+		mutex_init(&vnode->permits_lock);
+		mutex_init(&vnode->validate_lock);
 		spin_lock_init(&vnode->lock);
-		INIT_LIST_HEAD(&vnode->cb_link);
-		INIT_LIST_HEAD(&vnode->cb_hash_link);
-		afs_timer_init(&vnode->cb_timeout,
-			       &afs_vnode_cb_timed_out_ops);
+		INIT_WORK(&vnode->cb_broken_work, afs_broken_callback_work);
 	}
+}
 
-} /* end afs_i_init_once() */
-
-/*****************************************************************************/
 /*
  * allocate an AFS inode struct from our slab cache
  */
@@ -409,8 +486,7 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
 {
 	struct afs_vnode *vnode;
 
-	vnode = (struct afs_vnode *)
-		kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
+	vnode = kmem_cache_alloc(afs_inode_cachep, GFP_KERNEL);
 	if (!vnode)
 		return NULL;
 
@@ -421,21 +497,25 @@ static struct inode *afs_alloc_inode(struct super_block *sb)
 
 	vnode->volume		= NULL;
 	vnode->update_cnt	= 0;
-	vnode->flags		= 0;
+	vnode->flags		= 1 << AFS_VNODE_UNSET;
+	vnode->cb_promised	= false;
 
 	return &vnode->vfs_inode;
-} /* end afs_alloc_inode() */
+}
 
-/*****************************************************************************/
 /*
  * destroy an AFS inode struct
  */
 static void afs_destroy_inode(struct inode *inode)
 {
+	struct afs_vnode *vnode = AFS_FS_I(inode);
+
 	_enter("{%lu}", inode->i_ino);
 
-	kmem_cache_free(afs_inode_cachep, AFS_FS_I(inode));
+	_debug("DESTROY INODE %p", inode);
 
-	atomic_dec(&afs_count_active_inodes);
+	ASSERTCMP(vnode->server, ==, NULL);
 
-} /* end afs_destroy_inode() */
+	kmem_cache_free(afs_inode_cachep, vnode);
+	atomic_dec(&afs_count_active_inodes);
+}
diff --git a/fs/afs/super.h b/fs/afs/super.h
deleted file mode 100644
index 32de8cc6fae8..000000000000
--- a/fs/afs/super.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/* super.h: AFS filesystem internal private data
- *
- * Copyright (c) 2002 Red Hat, Inc. All rights reserved.
- *
- * This software may be freely redistributed under the terms of the
- * GNU General Public License.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Authors: David Woodhouse <dwmw2@cambridge.redhat.com>
- *          David Howells <dhowells@redhat.com>
- *
- */
-
-#ifndef _LINUX_AFS_SUPER_H
-#define _LINUX_AFS_SUPER_H
-
-#include <linux/fs.h>
-#include "server.h"
-
-#ifdef __KERNEL__
-
-/*****************************************************************************/
-/*
- * AFS superblock private data
- * - there's one superblock per volume
- */
-struct afs_super_info
-{
-	struct afs_volume	*volume;	/* volume record */
-	char			rwparent;	/* T if parent is R/W AFS volume */
-};
-
-static inline struct afs_super_info *AFS_FS_S(struct super_block *sb)
-{
-	return sb->s_fs_info;
-}
-
-extern struct file_system_type afs_fs_type;
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_AFS_SUPER_H */
diff --git a/fs/afs/transport.h b/fs/afs/transport.h
deleted file mode 100644
index 7013ae6ccc8c..000000000000
--- a/fs/afs/transport.h
+++ /dev/null
@@ -1,21 +0,0 @@
-/* transport.h: AFS transport management
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_TRANSPORT_H
-#define _LINUX_AFS_TRANSPORT_H
-
-#include "types.h"
-#include <rxrpc/transport.h>
-
-/* the cache manager transport endpoint */
-extern struct rxrpc_transport *afs_transport;
-
-#endif /* _LINUX_AFS_TRANSPORT_H */
diff --git a/fs/afs/types.h b/fs/afs/types.h
deleted file mode 100644
index b1a2367c7587..000000000000
--- a/fs/afs/types.h
+++ /dev/null
@@ -1,125 +0,0 @@
-/* types.h: AFS types
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_TYPES_H
-#define _LINUX_AFS_TYPES_H
-
-#ifdef __KERNEL__
-#include <rxrpc/types.h>
-#endif /* __KERNEL__ */
-
-typedef unsigned			afs_volid_t;
-typedef unsigned			afs_vnodeid_t;
-typedef unsigned long long		afs_dataversion_t;
-
-typedef enum {
-	AFSVL_RWVOL,			/* read/write volume */
-	AFSVL_ROVOL,			/* read-only volume */
-	AFSVL_BACKVOL,			/* backup volume */
-} __attribute__((packed)) afs_voltype_t;
-
-typedef enum {
-	AFS_FTYPE_INVALID	= 0,
-	AFS_FTYPE_FILE		= 1,
-	AFS_FTYPE_DIR		= 2,
-	AFS_FTYPE_SYMLINK	= 3,
-} afs_file_type_t;
-
-#ifdef __KERNEL__
-
-struct afs_cell;
-struct afs_vnode;
-
-/*****************************************************************************/
-/*
- * AFS file identifier
- */
-struct afs_fid
-{
-	afs_volid_t	vid;		/* volume ID */
-	afs_vnodeid_t	vnode;		/* file index within volume */
-	unsigned	unique;		/* unique ID number (file index version) */
-};
-
-/*****************************************************************************/
-/*
- * AFS callback notification
- */
-typedef enum {
-	AFSCM_CB_UNTYPED	= 0,	/* no type set on CB break */
-	AFSCM_CB_EXCLUSIVE	= 1,	/* CB exclusive to CM [not implemented] */
-	AFSCM_CB_SHARED		= 2,	/* CB shared by other CM's */
-	AFSCM_CB_DROPPED	= 3,	/* CB promise cancelled by file server */
-} afs_callback_type_t;
-
-struct afs_callback
-{
-	struct afs_server	*server;	/* server that made the promise */
-	struct afs_fid		fid;		/* file identifier */
-	unsigned		version;	/* callback version */
-	unsigned		expiry;		/* time at which expires */
-	afs_callback_type_t	type;		/* type of callback */
-};
-
-#define AFSCBMAX 50
-
-/*****************************************************************************/
-/*
- * AFS volume information
- */
-struct afs_volume_info
-{
-	afs_volid_t		vid;		/* volume ID */
-	afs_voltype_t		type;		/* type of this volume */
-	afs_volid_t		type_vids[5];	/* volume ID's for possible types for this vol */
-	
-	/* list of fileservers serving this volume */
-	size_t			nservers;	/* number of entries used in servers[] */
-	struct {
-		struct in_addr	addr;		/* fileserver address */
-	} servers[8];
-};
-
-/*****************************************************************************/
-/*
- * AFS file status information
- */
-struct afs_file_status
-{
-	unsigned		if_version;	/* interface version */
-#define AFS_FSTATUS_VERSION	1
-
-	afs_file_type_t		type;		/* file type */
-	unsigned		nlink;		/* link count */
-	size_t			size;		/* file size */
-	afs_dataversion_t	version;	/* current data version */
-	unsigned		author;		/* author ID */
-	unsigned		owner;		/* owner ID */
-	unsigned		caller_access;	/* access rights for authenticated caller */
-	unsigned		anon_access;	/* access rights for unauthenticated caller */
-	umode_t			mode;		/* UNIX mode */
-	struct afs_fid		parent;		/* parent file ID */
-	time_t			mtime_client;	/* last time client changed data */
-	time_t			mtime_server;	/* last time server changed data */
-};
-
-/*****************************************************************************/
-/*
- * AFS volume synchronisation information
- */
-struct afs_volsync
-{
-	time_t			creation;	/* volume creation time */
-};
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_AFS_TYPES_H */
diff --git a/fs/afs/use-rtnetlink.c b/fs/afs/use-rtnetlink.c
new file mode 100644
index 000000000000..82f0daa28970
--- /dev/null
+++ b/fs/afs/use-rtnetlink.c
@@ -0,0 +1,473 @@
+/* RTNETLINK client
+ *
+ * Copyright (C) 2007 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_addr.h>
+#include <linux/if_arp.h>
+#include <linux/inetdevice.h>
+#include <net/netlink.h>
+#include "internal.h"
+
+struct afs_rtm_desc {
+	struct socket		*nlsock;
+	struct afs_interface	*bufs;
+	u8			*mac;
+	size_t			nbufs;
+	size_t			maxbufs;
+	void			*data;
+	ssize_t			datalen;
+	size_t			datamax;
+	int			msg_seq;
+	unsigned		mac_index;
+	bool			wantloopback;
+	int (*parse)(struct afs_rtm_desc *, struct nlmsghdr *);
+};
+
+/*
+ * parse an RTM_GETADDR response
+ */
+static int afs_rtm_getaddr_parse(struct afs_rtm_desc *desc,
+				 struct nlmsghdr *nlhdr)
+{
+	struct afs_interface *this;
+	struct ifaddrmsg *ifa;
+	struct rtattr *rtattr;
+	const char *name;
+	size_t len;
+
+	ifa = (struct ifaddrmsg *) NLMSG_DATA(nlhdr);
+
+	_enter("{ix=%d,af=%d}", ifa->ifa_index, ifa->ifa_family);
+
+	if (ifa->ifa_family != AF_INET) {
+		_leave(" = 0 [family %d]", ifa->ifa_family);
+		return 0;
+	}
+	if (desc->nbufs >= desc->maxbufs) {
+		_leave(" = 0 [max %zu/%zu]", desc->nbufs, desc->maxbufs);
+		return 0;
+	}
+
+	this = &desc->bufs[desc->nbufs];
+
+	this->index = ifa->ifa_index;
+	this->netmask.s_addr = inet_make_mask(ifa->ifa_prefixlen);
+	this->mtu = 0;
+
+	rtattr = NLMSG_DATA(nlhdr) + NLMSG_ALIGN(sizeof(struct ifaddrmsg));
+	len = NLMSG_PAYLOAD(nlhdr, sizeof(struct ifaddrmsg));
+
+	name = "unknown";
+	for (; RTA_OK(rtattr, len); rtattr = RTA_NEXT(rtattr, len)) {
+		switch (rtattr->rta_type) {
+		case IFA_ADDRESS:
+			memcpy(&this->address, RTA_DATA(rtattr), 4);
+			break;
+		case IFA_LABEL:
+			name = RTA_DATA(rtattr);
+			break;
+		}
+	}
+
+	_debug("%s: "NIPQUAD_FMT"/"NIPQUAD_FMT,
+	       name, NIPQUAD(this->address), NIPQUAD(this->netmask));
+
+	desc->nbufs++;
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * parse an RTM_GETLINK response for MTUs
+ */
+static int afs_rtm_getlink_if_parse(struct afs_rtm_desc *desc,
+				    struct nlmsghdr *nlhdr)
+{
+	struct afs_interface *this;
+	struct ifinfomsg *ifi;
+	struct rtattr *rtattr;
+	const char *name;
+	size_t len, loop;
+
+	ifi = (struct ifinfomsg *) NLMSG_DATA(nlhdr);
+
+	_enter("{ix=%d}", ifi->ifi_index);
+
+	for (loop = 0; loop < desc->nbufs; loop++) {
+		this = &desc->bufs[loop];
+		if (this->index == ifi->ifi_index)
+			goto found;
+	}
+
+	_leave(" = 0 [no match]");
+	return 0;
+
+found:
+	if (ifi->ifi_type == ARPHRD_LOOPBACK && !desc->wantloopback) {
+		_leave(" = 0 [loopback]");
+		return 0;
+	}
+
+	rtattr = NLMSG_DATA(nlhdr) + NLMSG_ALIGN(sizeof(struct ifinfomsg));
+	len = NLMSG_PAYLOAD(nlhdr, sizeof(struct ifinfomsg));
+
+	name = "unknown";
+	for (; RTA_OK(rtattr, len); rtattr = RTA_NEXT(rtattr, len)) {
+		switch (rtattr->rta_type) {
+		case IFLA_MTU:
+			memcpy(&this->mtu, RTA_DATA(rtattr), 4);
+			break;
+		case IFLA_IFNAME:
+			name = RTA_DATA(rtattr);
+			break;
+		}
+	}
+
+	_debug("%s: "NIPQUAD_FMT"/"NIPQUAD_FMT" mtu %u",
+	       name, NIPQUAD(this->address), NIPQUAD(this->netmask),
+	       this->mtu);
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * parse an RTM_GETLINK response for the MAC address belonging to the lowest
+ * non-internal interface
+ */
+static int afs_rtm_getlink_mac_parse(struct afs_rtm_desc *desc,
+				     struct nlmsghdr *nlhdr)
+{
+	struct ifinfomsg *ifi;
+	struct rtattr *rtattr;
+	const char *name;
+	size_t remain, len;
+	bool set;
+
+	ifi = (struct ifinfomsg *) NLMSG_DATA(nlhdr);
+
+	_enter("{ix=%d}", ifi->ifi_index);
+
+	if (ifi->ifi_index >= desc->mac_index) {
+		_leave(" = 0 [high]");
+		return 0;
+	}
+	if (ifi->ifi_type == ARPHRD_LOOPBACK) {
+		_leave(" = 0 [loopback]");
+		return 0;
+	}
+
+	rtattr = NLMSG_DATA(nlhdr) + NLMSG_ALIGN(sizeof(struct ifinfomsg));
+	remain = NLMSG_PAYLOAD(nlhdr, sizeof(struct ifinfomsg));
+
+	name = "unknown";
+	set = false;
+	for (; RTA_OK(rtattr, remain); rtattr = RTA_NEXT(rtattr, remain)) {
+		switch (rtattr->rta_type) {
+		case IFLA_ADDRESS:
+			len = RTA_PAYLOAD(rtattr);
+			memcpy(desc->mac, RTA_DATA(rtattr),
+			       min_t(size_t, len, 6));
+			desc->mac_index = ifi->ifi_index;
+			set = true;
+			break;
+		case IFLA_IFNAME:
+			name = RTA_DATA(rtattr);
+			break;
+		}
+	}
+
+	if (set)
+		_debug("%s: %02x:%02x:%02x:%02x:%02x:%02x",
+		       name,
+		       desc->mac[0], desc->mac[1], desc->mac[2],
+		       desc->mac[3], desc->mac[4], desc->mac[5]);
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * read the rtnetlink response and pass to parsing routine
+ */
+static int afs_read_rtm(struct afs_rtm_desc *desc)
+{
+	struct nlmsghdr *nlhdr, tmphdr;
+	struct msghdr msg;
+	struct kvec iov[1];
+	void *data;
+	bool last = false;
+	int len, ret, remain;
+
+	_enter("");
+
+	do {
+		/* first of all peek to see how big the packet is */
+		memset(&msg, 0, sizeof(msg));
+		iov[0].iov_base = &tmphdr;
+		iov[0].iov_len = sizeof(tmphdr);
+		len = kernel_recvmsg(desc->nlsock, &msg, iov, 1,
+				     sizeof(tmphdr), MSG_PEEK | MSG_TRUNC);
+		if (len < 0) {
+			_leave(" = %d [peek]", len);
+			return len;
+		}
+		if (len == 0)
+			continue;
+		if (len < sizeof(tmphdr) || len < NLMSG_PAYLOAD(&tmphdr, 0)) {
+			_leave(" = -EMSGSIZE");
+			return -EMSGSIZE;
+		}
+
+		if (desc->datamax < len) {
+			kfree(desc->data);
+			desc->data = NULL;
+			data = kmalloc(len, GFP_KERNEL);
+			if (!data)
+				return -ENOMEM;
+			desc->data = data;
+		}
+		desc->datamax = len;
+
+		/* read all the data from this packet */
+		iov[0].iov_base = desc->data;
+		iov[0].iov_len = desc->datamax;
+		desc->datalen = kernel_recvmsg(desc->nlsock, &msg, iov, 1,
+					       desc->datamax, 0);
+		if (desc->datalen < 0) {
+			_leave(" = %ld [recv]", desc->datalen);
+			return desc->datalen;
+		}
+
+		nlhdr = desc->data;
+
+		/* check if the header is valid */
+		if (!NLMSG_OK(nlhdr, desc->datalen) ||
+		    nlhdr->nlmsg_type == NLMSG_ERROR) {
+			_leave(" = -EIO");
+			return -EIO;
+		}
+
+		/* see if this is the last message */
+		if (nlhdr->nlmsg_type == NLMSG_DONE ||
+		    !(nlhdr->nlmsg_flags & NLM_F_MULTI))
+			last = true;
+
+		/* parse the bits we got this time */
+		nlmsg_for_each_msg(nlhdr, desc->data, desc->datalen, remain) {
+			ret = desc->parse(desc, nlhdr);
+			if (ret < 0) {
+				_leave(" = %d [parse]", ret);
+				return ret;
+			}
+		}
+
+	} while (!last);
+
+	_leave(" = 0");
+	return 0;
+}
+
+/*
+ * list the interface bound addresses to get the address and netmask
+ */
+static int afs_rtm_getaddr(struct afs_rtm_desc *desc)
+{
+	struct msghdr msg;
+	struct kvec iov[1];
+	int ret;
+
+	struct {
+		struct nlmsghdr nl_msg __attribute__((aligned(NLMSG_ALIGNTO)));
+		struct ifaddrmsg addr_msg __attribute__((aligned(NLMSG_ALIGNTO)));
+	} request;
+
+	_enter("");
+
+	memset(&request, 0, sizeof(request));
+
+	request.nl_msg.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg));
+	request.nl_msg.nlmsg_type = RTM_GETADDR;
+	request.nl_msg.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP;
+	request.nl_msg.nlmsg_seq = desc->msg_seq++;
+	request.nl_msg.nlmsg_pid = 0;
+
+	memset(&msg, 0, sizeof(msg));
+	iov[0].iov_base = &request;
+	iov[0].iov_len = sizeof(request);
+
+	ret = kernel_sendmsg(desc->nlsock, &msg, iov, 1, iov[0].iov_len);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * list the interface link statuses to get the MTUs
+ */
+static int afs_rtm_getlink(struct afs_rtm_desc *desc)
+{
+	struct msghdr msg;
+	struct kvec iov[1];
+	int ret;
+
+	struct {
+		struct nlmsghdr nl_msg __attribute__((aligned(NLMSG_ALIGNTO)));
+		struct ifinfomsg link_msg __attribute__((aligned(NLMSG_ALIGNTO)));
+	} request;
+
+	_enter("");
+
+	memset(&request, 0, sizeof(request));
+
+	request.nl_msg.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg));
+	request.nl_msg.nlmsg_type = RTM_GETLINK;
+	request.nl_msg.nlmsg_flags = NLM_F_REQUEST | NLM_F_ROOT;
+	request.nl_msg.nlmsg_seq = desc->msg_seq++;
+	request.nl_msg.nlmsg_pid = 0;
+
+	memset(&msg, 0, sizeof(msg));
+	iov[0].iov_base = &request;
+	iov[0].iov_len = sizeof(request);
+
+	ret = kernel_sendmsg(desc->nlsock, &msg, iov, 1, iov[0].iov_len);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * cull any interface records for which there isn't an MTU value
+ */
+static void afs_cull_interfaces(struct afs_rtm_desc *desc)
+{
+	struct afs_interface *bufs = desc->bufs;
+	size_t nbufs = desc->nbufs;
+	int loop, point = 0;
+
+	_enter("{%zu}", nbufs);
+
+	for (loop = 0; loop < nbufs; loop++) {
+		if (desc->bufs[loop].mtu != 0) {
+			if (loop != point) {
+				ASSERTCMP(loop, >, point);
+				bufs[point] = bufs[loop];
+			}
+			point++;
+		}
+	}
+
+	desc->nbufs = point;
+	_leave(" [%zu/%zu]", desc->nbufs, nbufs);
+}
+
+/*
+ * get a list of this system's interface IPv4 addresses, netmasks and MTUs
+ * - returns the number of interface records in the buffer
+ */
+int afs_get_ipv4_interfaces(struct afs_interface *bufs, size_t maxbufs,
+			    bool wantloopback)
+{
+	struct afs_rtm_desc desc;
+	int ret, loop;
+
+	_enter("");
+
+	memset(&desc, 0, sizeof(desc));
+	desc.bufs = bufs;
+	desc.maxbufs = maxbufs;
+	desc.wantloopback = wantloopback;
+
+	ret = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE,
+			       &desc.nlsock);
+	if (ret < 0) {
+		_leave(" = %d [sock]", ret);
+		return ret;
+	}
+
+	/* issue RTM_GETADDR */
+	desc.parse = afs_rtm_getaddr_parse;
+	ret = afs_rtm_getaddr(&desc);
+	if (ret < 0)
+		goto error;
+	ret = afs_read_rtm(&desc);
+	if (ret < 0)
+		goto error;
+
+	/* issue RTM_GETLINK */
+	desc.parse = afs_rtm_getlink_if_parse;
+	ret = afs_rtm_getlink(&desc);
+	if (ret < 0)
+		goto error;
+	ret = afs_read_rtm(&desc);
+	if (ret < 0)
+		goto error;
+
+	afs_cull_interfaces(&desc);
+	ret = desc.nbufs;
+
+	for (loop = 0; loop < ret; loop++)
+		_debug("[%d] "NIPQUAD_FMT"/"NIPQUAD_FMT" mtu %u",
+		       bufs[loop].index,
+		       NIPQUAD(bufs[loop].address),
+		       NIPQUAD(bufs[loop].netmask),
+		       bufs[loop].mtu);
+
+error:
+	kfree(desc.data);
+	sock_release(desc.nlsock);
+	_leave(" = %d", ret);
+	return ret;
+}
+
+/*
+ * get a MAC address from a random ethernet interface that has a real one
+ * - the buffer should be 6 bytes in size
+ */
+int afs_get_MAC_address(u8 mac[6])
+{
+	struct afs_rtm_desc desc;
+	int ret;
+
+	_enter("");
+
+	memset(&desc, 0, sizeof(desc));
+	desc.mac = mac;
+	desc.mac_index = UINT_MAX;
+
+	ret = sock_create_kern(AF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE,
+			       &desc.nlsock);
+	if (ret < 0) {
+		_leave(" = %d [sock]", ret);
+		return ret;
+	}
+
+	/* issue RTM_GETLINK */
+	desc.parse = afs_rtm_getlink_mac_parse;
+	ret = afs_rtm_getlink(&desc);
+	if (ret < 0)
+		goto error;
+	ret = afs_read_rtm(&desc);
+	if (ret < 0)
+		goto error;
+
+	if (desc.mac_index < UINT_MAX) {
+		/* got a MAC address */
+		_debug("[%d] %02x:%02x:%02x:%02x:%02x:%02x",
+		       desc.mac_index,
+		       mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
+	} else {
+		ret = -ENONET;
+	}
+
+error:
+	sock_release(desc.nlsock);
+	_leave(" = %d", ret);
+	return ret;
+}
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index 7b0e3192ee39..36c1306e09e0 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -1,4 +1,4 @@
-/* vlclient.c: AFS Volume Location Service client
+/* AFS Volume Location Service client
  *
  * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
@@ -11,247 +11,76 @@
 
 #include <linux/init.h>
 #include <linux/sched.h>
-#include <rxrpc/rxrpc.h>
-#include <rxrpc/transport.h>
-#include <rxrpc/connection.h>
-#include <rxrpc/call.h>
-#include "server.h"
-#include "volume.h"
-#include "vlclient.h"
-#include "kafsasyncd.h"
-#include "kafstimod.h"
-#include "errors.h"
 #include "internal.h"
 
-#define VLGETENTRYBYID		503	/* AFS Get Cache Entry By ID operation ID */
-#define VLGETENTRYBYNAME	504	/* AFS Get Cache Entry By Name operation ID */
-#define VLPROBE			514	/* AFS Probe Volume Location Service operation ID */
-
-static void afs_rxvl_get_entry_by_id_attn(struct rxrpc_call *call);
-static void afs_rxvl_get_entry_by_id_error(struct rxrpc_call *call);
-
-/*****************************************************************************/
 /*
- * map afs VL abort codes to/from Linux error codes
- * - called with call->lock held
+ * map volume locator abort codes to error codes
  */
-static void afs_rxvl_aemap(struct rxrpc_call *call)
+static int afs_vl_abort_to_error(u32 abort_code)
 {
-	int err;
-
-	_enter("{%u,%u,%d}",
-	       call->app_err_state, call->app_abort_code, call->app_errno);
-
-	switch (call->app_err_state) {
-	case RXRPC_ESTATE_LOCAL_ABORT:
-		call->app_abort_code = -call->app_errno;
-		return;
-
-	case RXRPC_ESTATE_PEER_ABORT:
-		switch (call->app_abort_code) {
-		case AFSVL_IDEXIST:		err = -EEXIST;		break;
-		case AFSVL_IO:			err = -EREMOTEIO;	break;
-		case AFSVL_NAMEEXIST:		err = -EEXIST;		break;
-		case AFSVL_CREATEFAIL:		err = -EREMOTEIO;	break;
-		case AFSVL_NOENT:		err = -ENOMEDIUM;	break;
-		case AFSVL_EMPTY:		err = -ENOMEDIUM;	break;
-		case AFSVL_ENTDELETED:		err = -ENOMEDIUM;	break;
-		case AFSVL_BADNAME:		err = -EINVAL;		break;
-		case AFSVL_BADINDEX:		err = -EINVAL;		break;
-		case AFSVL_BADVOLTYPE:		err = -EINVAL;		break;
-		case AFSVL_BADSERVER:		err = -EINVAL;		break;
-		case AFSVL_BADPARTITION:	err = -EINVAL;		break;
-		case AFSVL_REPSFULL:		err = -EFBIG;		break;
-		case AFSVL_NOREPSERVER:		err = -ENOENT;		break;
-		case AFSVL_DUPREPSERVER:	err = -EEXIST;		break;
-		case AFSVL_RWNOTFOUND:		err = -ENOENT;		break;
-		case AFSVL_BADREFCOUNT:		err = -EINVAL;		break;
-		case AFSVL_SIZEEXCEEDED:	err = -EINVAL;		break;
-		case AFSVL_BADENTRY:		err = -EINVAL;		break;
-		case AFSVL_BADVOLIDBUMP:	err = -EINVAL;		break;
-		case AFSVL_IDALREADYHASHED:	err = -EINVAL;		break;
-		case AFSVL_ENTRYLOCKED:		err = -EBUSY;		break;
-		case AFSVL_BADVOLOPER:		err = -EBADRQC;		break;
-		case AFSVL_BADRELLOCKTYPE:	err = -EINVAL;		break;
-		case AFSVL_RERELEASE:		err = -EREMOTEIO;	break;
-		case AFSVL_BADSERVERFLAG:	err = -EINVAL;		break;
-		case AFSVL_PERM:		err = -EACCES;		break;
-		case AFSVL_NOMEM:		err = -EREMOTEIO;	break;
-		default:
-			err = afs_abort_to_error(call->app_abort_code);
-			break;
-		}
-		call->app_errno = err;
-		return;
-
+	_enter("%u", abort_code);
+
+	switch (abort_code) {
+	case AFSVL_IDEXIST:		return -EEXIST;
+	case AFSVL_IO:			return -EREMOTEIO;
+	case AFSVL_NAMEEXIST:		return -EEXIST;
+	case AFSVL_CREATEFAIL:		return -EREMOTEIO;
+	case AFSVL_NOENT:		return -ENOMEDIUM;
+	case AFSVL_EMPTY:		return -ENOMEDIUM;
+	case AFSVL_ENTDELETED:		return -ENOMEDIUM;
+	case AFSVL_BADNAME:		return -EINVAL;
+	case AFSVL_BADINDEX:		return -EINVAL;
+	case AFSVL_BADVOLTYPE:		return -EINVAL;
+	case AFSVL_BADSERVER:		return -EINVAL;
+	case AFSVL_BADPARTITION:	return -EINVAL;
+	case AFSVL_REPSFULL:		return -EFBIG;
+	case AFSVL_NOREPSERVER:		return -ENOENT;
+	case AFSVL_DUPREPSERVER:	return -EEXIST;
+	case AFSVL_RWNOTFOUND:		return -ENOENT;
+	case AFSVL_BADREFCOUNT:		return -EINVAL;
+	case AFSVL_SIZEEXCEEDED:	return -EINVAL;
+	case AFSVL_BADENTRY:		return -EINVAL;
+	case AFSVL_BADVOLIDBUMP:	return -EINVAL;
+	case AFSVL_IDALREADYHASHED:	return -EINVAL;
+	case AFSVL_ENTRYLOCKED:		return -EBUSY;
+	case AFSVL_BADVOLOPER:		return -EBADRQC;
+	case AFSVL_BADRELLOCKTYPE:	return -EINVAL;
+	case AFSVL_RERELEASE:		return -EREMOTEIO;
+	case AFSVL_BADSERVERFLAG:	return -EINVAL;
+	case AFSVL_PERM:		return -EACCES;
+	case AFSVL_NOMEM:		return -EREMOTEIO;
 	default:
-		return;
+		return afs_abort_to_error(abort_code);
 	}
-} /* end afs_rxvl_aemap() */
+}
 
-#if 0
-/*****************************************************************************/
 /*
- * probe a volume location server to see if it is still alive -- unused
+ * deliver reply data to a VL.GetEntryByXXX call
  */
-static int afs_rxvl_probe(struct afs_server *server, int alloc_flags)
+static int afs_deliver_vl_get_entry_by_xxx(struct afs_call *call,
+					   struct sk_buff *skb, bool last)
 {
-	struct rxrpc_connection *conn;
-	struct rxrpc_call *call;
-	struct kvec piov[1];
-	size_t sent;
-	int ret;
-	__be32 param[1];
-
-	DECLARE_WAITQUEUE(myself, current);
-
-	/* get hold of the vlserver connection */
-	ret = afs_server_get_vlconn(server, &conn);
-	if (ret < 0)
-		goto out;
-
-	/* create a call through that connection */
-	ret = rxrpc_create_call(conn, NULL, NULL, afs_rxvl_aemap, &call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = VLPROBE;
-
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq, &myself);
-
-	/* marshall the parameters */
-	param[0] = htonl(VLPROBE);
-	piov[0].iov_len = sizeof(param);
-	piov[0].iov_base = param;
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET,
-				    alloc_flags, 0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the reply to completely arrive */
-	for (;;) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		if (call->app_call_state != RXRPC_CSTATE_CLNT_RCV_REPLY ||
-		    signal_pending(current))
-			break;
-		schedule();
-	}
-	set_current_state(TASK_RUNNING);
-
-	ret = -EINTR;
-	if (signal_pending(current))
-		goto abort;
-
-	switch (call->app_call_state) {
-	case RXRPC_CSTATE_ERROR:
-		ret = call->app_errno;
-		goto out_unwait;
-
-	case RXRPC_CSTATE_CLNT_GOT_REPLY:
-		ret = 0;
-		goto out_unwait;
-
-	default:
-		BUG();
-	}
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq, &myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	rxrpc_put_connection(conn);
- out:
-	return ret;
+	struct afs_cache_vlocation *entry;
+	__be32 *bp;
+	u32 tmp;
+	int loop;
 
-} /* end afs_rxvl_probe() */
-#endif
+	_enter(",,%u", last);
 
-/*****************************************************************************/
-/*
- * look up a volume location database entry by name
- */
-int afs_rxvl_get_entry_by_name(struct afs_server *server,
-			       const char *volname,
-			       unsigned volnamesz,
-			       struct afs_cache_vlocation *entry)
-{
-	DECLARE_WAITQUEUE(myself, current);
-
-	struct rxrpc_connection *conn;
-	struct rxrpc_call *call;
-	struct kvec piov[3];
-	unsigned tmp;
-	size_t sent;
-	int ret, loop;
-	__be32 *bp, param[2], zero;
-
-	_enter(",%*.*s,%u,", volnamesz, volnamesz, volname, volnamesz);
-
-	memset(entry, 0, sizeof(*entry));
-
-	/* get hold of the vlserver connection */
-	ret = afs_server_get_vlconn(server, &conn);
-	if (ret < 0)
-		goto out;
-
-	/* create a call through that connection */
-	ret = rxrpc_create_call(conn, NULL, NULL, afs_rxvl_aemap, &call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = VLGETENTRYBYNAME;
+	afs_transfer_reply(call, skb);
+	if (!last)
+		return 0;
 
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq, &myself);
+	if (call->reply_size != call->reply_max)
+		return -EBADMSG;
 
-	/* marshall the parameters */
-	piov[1].iov_len = volnamesz;
-	piov[1].iov_base = (char *) volname;
-
-	zero = 0;
-	piov[2].iov_len = (4 - (piov[1].iov_len & 3)) & 3;
-	piov[2].iov_base = &zero;
-
-	param[0] = htonl(VLGETENTRYBYNAME);
-	param[1] = htonl(piov[1].iov_len);
-
-	piov[0].iov_len = sizeof(param);
-	piov[0].iov_base = param;
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 3, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the reply to completely arrive */
-	bp = rxrpc_call_alloc_scratch(call, 384);
-
-	ret = rxrpc_call_read_data(call, bp, 384,
-				   RXRPC_CALL_READ_BLOCK |
-				   RXRPC_CALL_READ_ALL);
-	if (ret < 0) {
-		if (ret == -ECONNABORTED) {
-			ret = call->app_errno;
-			goto out_unwait;
-		}
-		goto abort;
-	}
+	/* unmarshall the reply once we've received all of it */
+	entry = call->reply;
+	bp = call->buffer;
 
-	/* unmarshall the reply */
 	for (loop = 0; loop < 64; loop++)
 		entry->name[loop] = ntohl(*bp++);
+	entry->name[loop] = 0;
 	bp++; /* final NUL */
 
 	bp++; /* type */
@@ -264,6 +93,7 @@ int afs_rxvl_get_entry_by_name(struct afs_server *server,
 
 	for (loop = 0; loop < 8; loop++) {
 		tmp = ntohl(*bp++);
+		entry->srvtmask[loop] = 0;
 		if (tmp & AFS_VLSF_RWVOL)
 			entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
 		if (tmp & AFS_VLSF_ROVOL)
@@ -279,417 +109,110 @@ int afs_rxvl_get_entry_by_name(struct afs_server *server,
 	bp++; /* clone ID */
 
 	tmp = ntohl(*bp++); /* flags */
+	entry->vidmask = 0;
 	if (tmp & AFS_VLF_RWEXISTS)
 		entry->vidmask |= AFS_VOL_VTM_RW;
 	if (tmp & AFS_VLF_ROEXISTS)
 		entry->vidmask |= AFS_VOL_VTM_RO;
 	if (tmp & AFS_VLF_BACKEXISTS)
 		entry->vidmask |= AFS_VOL_VTM_BAK;
-
-	ret = -ENOMEDIUM;
 	if (!entry->vidmask)
-		goto abort;
-
-	/* success */
-	entry->rtime = get_seconds();
-	ret = 0;
-
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq, &myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	rxrpc_put_connection(conn);
- out:
-	_leave(" = %d", ret);
-	return ret;
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
-} /* end afs_rxvl_get_entry_by_name() */
-
-/*****************************************************************************/
-/*
- * look up a volume location database entry by ID
- */
-int afs_rxvl_get_entry_by_id(struct afs_server *server,
-			     afs_volid_t volid,
-			     afs_voltype_t voltype,
-			     struct afs_cache_vlocation *entry)
-{
-	DECLARE_WAITQUEUE(myself, current);
-
-	struct rxrpc_connection *conn;
-	struct rxrpc_call *call;
-	struct kvec piov[1];
-	unsigned tmp;
-	size_t sent;
-	int ret, loop;
-	__be32 *bp, param[3];
-
-	_enter(",%x,%d,", volid, voltype);
-
-	memset(entry, 0, sizeof(*entry));
-
-	/* get hold of the vlserver connection */
-	ret = afs_server_get_vlconn(server, &conn);
-	if (ret < 0)
-		goto out;
-
-	/* create a call through that connection */
-	ret = rxrpc_create_call(conn, NULL, NULL, afs_rxvl_aemap, &call);
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		goto out_put_conn;
-	}
-	call->app_opcode = VLGETENTRYBYID;
-
-	/* we want to get event notifications from the call */
-	add_wait_queue(&call->waitq, &myself);
-
-	/* marshall the parameters */
-	param[0] = htonl(VLGETENTRYBYID);
-	param[1] = htonl(volid);
-	param[2] = htonl(voltype);
-
-	piov[0].iov_len = sizeof(param);
-	piov[0].iov_base = param;
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0)
-		goto abort;
-
-	/* wait for the reply to completely arrive */
-	bp = rxrpc_call_alloc_scratch(call, 384);
-
-	ret = rxrpc_call_read_data(call, bp, 384,
-				   RXRPC_CALL_READ_BLOCK |
-				   RXRPC_CALL_READ_ALL);
-	if (ret < 0) {
-		if (ret == -ECONNABORTED) {
-			ret = call->app_errno;
-			goto out_unwait;
-		}
-		goto abort;
-	}
-
-	/* unmarshall the reply */
-	for (loop = 0; loop < 64; loop++)
-		entry->name[loop] = ntohl(*bp++);
-	bp++; /* final NUL */
+		return -EBADMSG;
 
-	bp++; /* type */
-	entry->nservers = ntohl(*bp++);
-
-	for (loop = 0; loop < 8; loop++)
-		entry->servers[loop].s_addr = *bp++;
-
-	bp += 8; /* partition IDs */
+	_leave(" = 0 [done]");
+	return 0;
+}
 
-	for (loop = 0; loop < 8; loop++) {
-		tmp = ntohl(*bp++);
-		if (tmp & AFS_VLSF_RWVOL)
-			entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
-		if (tmp & AFS_VLSF_ROVOL)
-			entry->srvtmask[loop] |= AFS_VOL_VTM_RO;
-		if (tmp & AFS_VLSF_BACKVOL)
-			entry->srvtmask[loop] |= AFS_VOL_VTM_BAK;
-	}
-
-	entry->vid[0] = ntohl(*bp++);
-	entry->vid[1] = ntohl(*bp++);
-	entry->vid[2] = ntohl(*bp++);
-
-	bp++; /* clone ID */
-
-	tmp = ntohl(*bp++); /* flags */
-	if (tmp & AFS_VLF_RWEXISTS)
-		entry->vidmask |= AFS_VOL_VTM_RW;
-	if (tmp & AFS_VLF_ROEXISTS)
-		entry->vidmask |= AFS_VOL_VTM_RO;
-	if (tmp & AFS_VLF_BACKEXISTS)
-		entry->vidmask |= AFS_VOL_VTM_BAK;
-
-	ret = -ENOMEDIUM;
-	if (!entry->vidmask)
-		goto abort;
-
-#if 0 /* TODO: remove */
-	entry->nservers = 3;
-	entry->servers[0].s_addr = htonl(0xac101249);
-	entry->servers[1].s_addr = htonl(0xac101243);
-	entry->servers[2].s_addr = htonl(0xac10125b /*0xac10125b*/);
-
-	entry->srvtmask[0] = AFS_VOL_VTM_RO;
-	entry->srvtmask[1] = AFS_VOL_VTM_RO;
-	entry->srvtmask[2] = AFS_VOL_VTM_RO | AFS_VOL_VTM_RW;
-#endif
-
-	/* success */
-	entry->rtime = get_seconds();
-	ret = 0;
-
- out_unwait:
-	set_current_state(TASK_RUNNING);
-	remove_wait_queue(&call->waitq, &myself);
-	rxrpc_put_call(call);
- out_put_conn:
-	rxrpc_put_connection(conn);
- out:
-	_leave(" = %d", ret);
-	return ret;
-
- abort:
-	set_current_state(TASK_UNINTERRUPTIBLE);
-	rxrpc_call_abort(call, ret);
-	schedule();
-	goto out_unwait;
-} /* end afs_rxvl_get_entry_by_id() */
-
-/*****************************************************************************/
 /*
- * look up a volume location database entry by ID asynchronously
+ * VL.GetEntryByName operation type
  */
-int afs_rxvl_get_entry_by_id_async(struct afs_async_op *op,
-				   afs_volid_t volid,
-				   afs_voltype_t voltype)
-{
-	struct rxrpc_connection *conn;
-	struct rxrpc_call *call;
-	struct kvec piov[1];
-	size_t sent;
-	int ret;
-	__be32 param[3];
-
-	_enter(",%x,%d,", volid, voltype);
-
-	/* get hold of the vlserver connection */
-	ret = afs_server_get_vlconn(op->server, &conn);
-	if (ret < 0) {
-		_leave(" = %d", ret);
-		return ret;
-	}
-
-	/* create a call through that connection */
-	ret = rxrpc_create_call(conn,
-				afs_rxvl_get_entry_by_id_attn,
-				afs_rxvl_get_entry_by_id_error,
-				afs_rxvl_aemap,
-				&op->call);
-	rxrpc_put_connection(conn);
-
-	if (ret < 0) {
-		printk("kAFS: Unable to create call: %d\n", ret);
-		_leave(" = %d", ret);
-		return ret;
-	}
+static const struct afs_call_type afs_RXVLGetEntryByName = {
+	.name		= "VL.GetEntryByName",
+	.deliver	= afs_deliver_vl_get_entry_by_xxx,
+	.abort_to_error	= afs_vl_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
 
-	op->call->app_opcode = VLGETENTRYBYID;
-	op->call->app_user = op;
-
-	call = op->call;
-	rxrpc_get_call(call);
-
-	/* send event notifications from the call to kafsasyncd */
-	afs_kafsasyncd_begin_op(op);
-
-	/* marshall the parameters */
-	param[0] = htonl(VLGETENTRYBYID);
-	param[1] = htonl(volid);
-	param[2] = htonl(voltype);
-
-	piov[0].iov_len = sizeof(param);
-	piov[0].iov_base = param;
-
-	/* allocate result read buffer in scratch space */
-	call->app_scr_ptr = rxrpc_call_alloc_scratch(op->call, 384);
-
-	/* send the parameters to the server */
-	ret = rxrpc_call_write_data(call, 1, piov, RXRPC_LAST_PACKET, GFP_NOFS,
-				    0, &sent);
-	if (ret < 0) {
-		rxrpc_call_abort(call, ret); /* handle from kafsasyncd */
-		ret = 0;
-		goto out;
-	}
-
-	/* wait for the reply to completely arrive */
-	ret = rxrpc_call_read_data(call, call->app_scr_ptr, 384, 0);
-	switch (ret) {
-	case 0:
-	case -EAGAIN:
-	case -ECONNABORTED:
-		ret = 0;
-		break;	/* all handled by kafsasyncd */
-
-	default:
-		rxrpc_call_abort(call, ret); /* make kafsasyncd handle it */
-		ret = 0;
-		break;
-	}
-
- out:
-	rxrpc_put_call(call);
-	_leave(" = %d", ret);
-	return ret;
-
-} /* end afs_rxvl_get_entry_by_id_async() */
+/*
+ * VL.GetEntryById operation type
+ */
+static const struct afs_call_type afs_RXVLGetEntryById = {
+	.name		= "VL.GetEntryById",
+	.deliver	= afs_deliver_vl_get_entry_by_xxx,
+	.abort_to_error	= afs_vl_abort_to_error,
+	.destructor	= afs_flat_call_destructor,
+};
 
-/*****************************************************************************/
 /*
- * attend to the asynchronous get VLDB entry by ID
+ * dispatch a get volume entry by name operation
  */
-int afs_rxvl_get_entry_by_id_async2(struct afs_async_op *op,
-				    struct afs_cache_vlocation *entry)
+int afs_vl_get_entry_by_name(struct in_addr *addr,
+			     struct key *key,
+			     const char *volname,
+			     struct afs_cache_vlocation *entry,
+			     const struct afs_wait_mode *wait_mode)
 {
+	struct afs_call *call;
+	size_t volnamesz, reqsz, padsz;
 	__be32 *bp;
-	__u32 tmp;
-	int loop, ret;
-
-	_enter("{op=%p cst=%u}", op, op->call->app_call_state);
-
-	memset(entry, 0, sizeof(*entry));
-
-	if (op->call->app_call_state == RXRPC_CSTATE_COMPLETE) {
-		/* operation finished */
-		afs_kafsasyncd_terminate_op(op);
-
-		bp = op->call->app_scr_ptr;
-
-		/* unmarshall the reply */
-		for (loop = 0; loop < 64; loop++)
-			entry->name[loop] = ntohl(*bp++);
-		bp++; /* final NUL */
-
-		bp++; /* type */
-		entry->nservers = ntohl(*bp++);
-
-		for (loop = 0; loop < 8; loop++)
-			entry->servers[loop].s_addr = *bp++;
-
-		bp += 8; /* partition IDs */
-
-		for (loop = 0; loop < 8; loop++) {
-			tmp = ntohl(*bp++);
-			if (tmp & AFS_VLSF_RWVOL)
-				entry->srvtmask[loop] |= AFS_VOL_VTM_RW;
-			if (tmp & AFS_VLSF_ROVOL)
-				entry->srvtmask[loop] |= AFS_VOL_VTM_RO;
-			if (tmp & AFS_VLSF_BACKVOL)
-				entry->srvtmask[loop] |= AFS_VOL_VTM_BAK;
-		}
-
-		entry->vid[0] = ntohl(*bp++);
-		entry->vid[1] = ntohl(*bp++);
-		entry->vid[2] = ntohl(*bp++);
-
-		bp++; /* clone ID */
-
-		tmp = ntohl(*bp++); /* flags */
-		if (tmp & AFS_VLF_RWEXISTS)
-			entry->vidmask |= AFS_VOL_VTM_RW;
-		if (tmp & AFS_VLF_ROEXISTS)
-			entry->vidmask |= AFS_VOL_VTM_RO;
-		if (tmp & AFS_VLF_BACKEXISTS)
-			entry->vidmask |= AFS_VOL_VTM_BAK;
-
-		ret = -ENOMEDIUM;
-		if (!entry->vidmask) {
-			rxrpc_call_abort(op->call, ret);
-			goto done;
-		}
-
-#if 0 /* TODO: remove */
-		entry->nservers = 3;
-		entry->servers[0].s_addr = htonl(0xac101249);
-		entry->servers[1].s_addr = htonl(0xac101243);
-		entry->servers[2].s_addr = htonl(0xac10125b /*0xac10125b*/);
-
-		entry->srvtmask[0] = AFS_VOL_VTM_RO;
-		entry->srvtmask[1] = AFS_VOL_VTM_RO;
-		entry->srvtmask[2] = AFS_VOL_VTM_RO | AFS_VOL_VTM_RW;
-#endif
-
-		/* success */
-		entry->rtime = get_seconds();
-		ret = 0;
-		goto done;
-	}
 
-	if (op->call->app_call_state == RXRPC_CSTATE_ERROR) {
-		/* operation error */
-		ret = op->call->app_errno;
-		goto done;
-	}
+	_enter("");
 
-	_leave(" = -EAGAIN");
-	return -EAGAIN;
+	volnamesz = strlen(volname);
+	padsz = (4 - (volnamesz & 3)) & 3;
+	reqsz = 8 + volnamesz + padsz;
 
- done:
-	rxrpc_put_call(op->call);
-	op->call = NULL;
-	_leave(" = %d", ret);
-	return ret;
-} /* end afs_rxvl_get_entry_by_id_async2() */
+	call = afs_alloc_flat_call(&afs_RXVLGetEntryByName, reqsz, 384);
+	if (!call)
+		return -ENOMEM;
 
-/*****************************************************************************/
-/*
- * handle attention events on an async get-entry-by-ID op
- * - called from krxiod
- */
-static void afs_rxvl_get_entry_by_id_attn(struct rxrpc_call *call)
-{
-	struct afs_async_op *op = call->app_user;
-
-	_enter("{op=%p cst=%u}", op, call->app_call_state);
-
-	switch (call->app_call_state) {
-	case RXRPC_CSTATE_COMPLETE:
-		afs_kafsasyncd_attend_op(op);
-		break;
-	case RXRPC_CSTATE_CLNT_RCV_REPLY:
-		if (call->app_async_read)
-			break;
-	case RXRPC_CSTATE_CLNT_GOT_REPLY:
-		if (call->app_read_count == 0)
-			break;
-		printk("kAFS: Reply bigger than expected"
-		       " {cst=%u asyn=%d mark=%Zu rdy=%Zu pr=%u%s}",
-		       call->app_call_state,
-		       call->app_async_read,
-		       call->app_mark,
-		       call->app_ready_qty,
-		       call->pkt_rcv_count,
-		       call->app_last_rcv ? " last" : "");
-
-		rxrpc_call_abort(call, -EBADMSG);
-		break;
-	default:
-		BUG();
-	}
+	call->key = key;
+	call->reply = entry;
+	call->service_id = VL_SERVICE;
+	call->port = htons(AFS_VL_PORT);
 
-	_leave("");
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(VLGETENTRYBYNAME);
+	*bp++ = htonl(volnamesz);
+	memcpy(bp, volname, volnamesz);
+	if (padsz > 0)
+		memset((void *) bp + volnamesz, 0, padsz);
 
-} /* end afs_rxvl_get_entry_by_id_attn() */
+	/* initiate the call */
+	return afs_make_call(addr, call, GFP_KERNEL, wait_mode);
+}
 
-/*****************************************************************************/
 /*
- * handle error events on an async get-entry-by-ID op
- * - called from krxiod
+ * dispatch a get volume entry by ID operation
  */
-static void afs_rxvl_get_entry_by_id_error(struct rxrpc_call *call)
+int afs_vl_get_entry_by_id(struct in_addr *addr,
+			   struct key *key,
+			   afs_volid_t volid,
+			   afs_voltype_t voltype,
+			   struct afs_cache_vlocation *entry,
+			   const struct afs_wait_mode *wait_mode)
 {
-	struct afs_async_op *op = call->app_user;
+	struct afs_call *call;
+	__be32 *bp;
 
-	_enter("{op=%p cst=%u}", op, call->app_call_state);
+	_enter("");
 
-	afs_kafsasyncd_attend_op(op);
+	call = afs_alloc_flat_call(&afs_RXVLGetEntryById, 12, 384);
+	if (!call)
+		return -ENOMEM;
 
-	_leave("");
+	call->key = key;
+	call->reply = entry;
+	call->service_id = VL_SERVICE;
+	call->port = htons(AFS_VL_PORT);
 
-} /* end afs_rxvl_get_entry_by_id_error() */
+	/* marshall the parameters */
+	bp = call->request;
+	*bp++ = htonl(VLGETENTRYBYID);
+	*bp++ = htonl(volid);
+	*bp   = htonl(voltype);
+
+	/* initiate the call */
+	return afs_make_call(addr, call, GFP_KERNEL, wait_mode);
+}
diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c
index 782ee7c600ca..74cce174882a 100644
--- a/fs/afs/vlocation.c
+++ b/fs/afs/vlocation.c
@@ -1,6 +1,6 @@
-/* vlocation.c: volume location management
+/* AFS volume location management
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -12,131 +12,61 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/fs.h>
-#include <linux/pagemap.h>
-#include "volume.h"
-#include "cell.h"
-#include "cmservice.h"
-#include "fsclient.h"
-#include "vlclient.h"
-#include "kafstimod.h"
-#include <rxrpc/connection.h>
 #include "internal.h"
 
-#define AFS_VLDB_TIMEOUT HZ*1000
+unsigned afs_vlocation_timeout = 10;	/* volume location timeout in seconds */
+unsigned afs_vlocation_update_timeout = 10 * 60;
 
-static void afs_vlocation_update_timer(struct afs_timer *timer);
-static void afs_vlocation_update_attend(struct afs_async_op *op);
-static void afs_vlocation_update_discard(struct afs_async_op *op);
-static void __afs_put_vlocation(struct afs_vlocation *vlocation);
+static void afs_vlocation_reaper(struct work_struct *);
+static void afs_vlocation_updater(struct work_struct *);
 
-static void __afs_vlocation_timeout(struct afs_timer *timer)
-{
-	struct afs_vlocation *vlocation =
-		list_entry(timer, struct afs_vlocation, timeout);
-
-	_debug("VL TIMEOUT [%s{u=%d}]",
-	       vlocation->vldb.name, atomic_read(&vlocation->usage));
-
-	afs_vlocation_do_timeout(vlocation);
-}
-
-static const struct afs_timer_ops afs_vlocation_timer_ops = {
-	.timed_out	= __afs_vlocation_timeout,
-};
+static LIST_HEAD(afs_vlocation_updates);
+static LIST_HEAD(afs_vlocation_graveyard);
+static DEFINE_SPINLOCK(afs_vlocation_updates_lock);
+static DEFINE_SPINLOCK(afs_vlocation_graveyard_lock);
+static DECLARE_DELAYED_WORK(afs_vlocation_reap, afs_vlocation_reaper);
+static DECLARE_DELAYED_WORK(afs_vlocation_update, afs_vlocation_updater);
+static struct workqueue_struct *afs_vlocation_update_worker;
 
-static const struct afs_timer_ops afs_vlocation_update_timer_ops = {
-	.timed_out	= afs_vlocation_update_timer,
-};
-
-static const struct afs_async_op_ops afs_vlocation_update_op_ops = {
-	.attend		= afs_vlocation_update_attend,
-	.discard	= afs_vlocation_update_discard,
-};
-
-static LIST_HEAD(afs_vlocation_update_pendq);	/* queue of VLs awaiting update */
-static struct afs_vlocation *afs_vlocation_update;	/* VL currently being updated */
-static DEFINE_SPINLOCK(afs_vlocation_update_lock); /* lock guarding update queue */
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry);
-static void afs_vlocation_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_vlocation_cache_index_def = {
-	.name		= "vldb",
-	.data_size	= sizeof(struct afs_cache_vlocation),
-	.keys[0]	= { CACHEFS_INDEX_KEYS_ASCIIZ, 64 },
-	.match		= afs_vlocation_cache_match,
-	.update		= afs_vlocation_cache_update,
-};
-#endif
-
-/*****************************************************************************/
 /*
  * iterate through the VL servers in a cell until one of them admits knowing
  * about the volume in question
- * - caller must have cell->vl_sem write-locked
  */
-static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vlocation,
-					   const char *name,
-					   unsigned namesz,
+static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vl,
+					   struct key *key,
 					   struct afs_cache_vlocation *vldb)
 {
-	struct afs_server *server = NULL;
-	struct afs_cell *cell = vlocation->cell;
+	struct afs_cell *cell = vl->cell;
+	struct in_addr addr;
 	int count, ret;
 
-	_enter("%s,%*.*s,%u", cell->name, namesz, namesz, name, namesz);
+	_enter("%s,%s", cell->name, vl->vldb.name);
 
+	down_write(&vl->cell->vl_sem);
 	ret = -ENOMEDIUM;
 	for (count = cell->vl_naddrs; count > 0; count--) {
-		_debug("CellServ[%hu]: %08x",
-		       cell->vl_curr_svix,
-		       cell->vl_addrs[cell->vl_curr_svix].s_addr);
-
-		/* try and create a server */
-		ret = afs_server_lookup(cell,
-					&cell->vl_addrs[cell->vl_curr_svix],
-					&server);
-		switch (ret) {
-		case 0:
-			break;
-		case -ENOMEM:
-		case -ENONET:
-			goto out;
-		default:
-			goto rotate;
-		}
+		addr = cell->vl_addrs[cell->vl_curr_svix];
+
+		_debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
 
 		/* attempt to access the VL server */
-		ret = afs_rxvl_get_entry_by_name(server, name, namesz, vldb);
+		ret = afs_vl_get_entry_by_name(&addr, key, vl->vldb.name, vldb,
+					       &afs_sync_call);
 		switch (ret) {
 		case 0:
-			afs_put_server(server);
 			goto out;
 		case -ENOMEM:
 		case -ENONET:
 		case -ENETUNREACH:
 		case -EHOSTUNREACH:
 		case -ECONNREFUSED:
-			down_write(&server->sem);
-			if (server->vlserver) {
-				rxrpc_put_connection(server->vlserver);
-				server->vlserver = NULL;
-			}
-			up_write(&server->sem);
-			afs_put_server(server);
 			if (ret == -ENOMEM || ret == -ENONET)
 				goto out;
 			goto rotate;
 		case -ENOMEDIUM:
-			afs_put_server(server);
 			goto out;
 		default:
-			afs_put_server(server);
-			ret = -ENOMEDIUM;
+			ret = -EIO;
 			goto rotate;
 		}
 
@@ -146,76 +76,66 @@ static int afs_vlocation_access_vl_by_name(struct afs_vlocation *vlocation,
 		cell->vl_curr_svix %= cell->vl_naddrs;
 	}
 
- out:
+out:
+	up_write(&vl->cell->vl_sem);
 	_leave(" = %d", ret);
 	return ret;
+}
 
-} /* end afs_vlocation_access_vl_by_name() */
-
-/*****************************************************************************/
 /*
  * iterate through the VL servers in a cell until one of them admits knowing
  * about the volume in question
- * - caller must have cell->vl_sem write-locked
  */
-static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vlocation,
+static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl,
+					 struct key *key,
 					 afs_volid_t volid,
 					 afs_voltype_t voltype,
 					 struct afs_cache_vlocation *vldb)
 {
-	struct afs_server *server = NULL;
-	struct afs_cell *cell = vlocation->cell;
+	struct afs_cell *cell = vl->cell;
+	struct in_addr addr;
 	int count, ret;
 
 	_enter("%s,%x,%d,", cell->name, volid, voltype);
 
+	down_write(&vl->cell->vl_sem);
 	ret = -ENOMEDIUM;
 	for (count = cell->vl_naddrs; count > 0; count--) {
-		_debug("CellServ[%hu]: %08x",
-		       cell->vl_curr_svix,
-		       cell->vl_addrs[cell->vl_curr_svix].s_addr);
-
-		/* try and create a server */
-		ret = afs_server_lookup(cell,
-					&cell->vl_addrs[cell->vl_curr_svix],
-					&server);
-		switch (ret) {
-		case 0:
-			break;
-		case -ENOMEM:
-		case -ENONET:
-			goto out;
-		default:
-			goto rotate;
-		}
+		addr = cell->vl_addrs[cell->vl_curr_svix];
+
+		_debug("CellServ[%hu]: %08x", cell->vl_curr_svix, addr.s_addr);
 
 		/* attempt to access the VL server */
-		ret = afs_rxvl_get_entry_by_id(server, volid, voltype, vldb);
+		ret = afs_vl_get_entry_by_id(&addr, key, volid, voltype, vldb,
+					     &afs_sync_call);
 		switch (ret) {
 		case 0:
-			afs_put_server(server);
 			goto out;
 		case -ENOMEM:
 		case -ENONET:
 		case -ENETUNREACH:
 		case -EHOSTUNREACH:
 		case -ECONNREFUSED:
-			down_write(&server->sem);
-			if (server->vlserver) {
-				rxrpc_put_connection(server->vlserver);
-				server->vlserver = NULL;
-			}
-			up_write(&server->sem);
-			afs_put_server(server);
 			if (ret == -ENOMEM || ret == -ENONET)
 				goto out;
 			goto rotate;
+		case -EBUSY:
+			vl->upd_busy_cnt++;
+			if (vl->upd_busy_cnt <= 3) {
+				if (vl->upd_busy_cnt > 1) {
+					/* second+ BUSY - sleep a little bit */
+					set_current_state(TASK_UNINTERRUPTIBLE);
+					schedule_timeout(1);
+					__set_current_state(TASK_RUNNING);
+				}
+				continue;
+			}
+			break;
 		case -ENOMEDIUM:
-			afs_put_server(server);
-			goto out;
+			vl->upd_rej_cnt++;
+			goto rotate;
 		default:
-			afs_put_server(server);
-			ret = -ENOMEDIUM;
+			ret = -EIO;
 			goto rotate;
 		}
 
@@ -223,729 +143,580 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vlocation,
 	rotate:
 		cell->vl_curr_svix++;
 		cell->vl_curr_svix %= cell->vl_naddrs;
+		vl->upd_busy_cnt = 0;
 	}
 
- out:
+out:
+	if (ret < 0 && vl->upd_rej_cnt > 0) {
+		printk(KERN_NOTICE "kAFS:"
+		       " Active volume no longer valid '%s'\n",
+		       vl->vldb.name);
+		vl->valid = 0;
+		ret = -ENOMEDIUM;
+	}
+
+	up_write(&vl->cell->vl_sem);
 	_leave(" = %d", ret);
 	return ret;
+}
 
-} /* end afs_vlocation_access_vl_by_id() */
-
-/*****************************************************************************/
 /*
- * lookup volume location
- * - caller must have cell->vol_sem write-locked
- * - iterate through the VL servers in a cell until one of them admits knowing
- *   about the volume in question
- * - lookup in the local cache if not able to find on the VL server
- * - insert/update in the local cache if did get a VL response
+ * allocate a volume location record
  */
-int afs_vlocation_lookup(struct afs_cell *cell,
-			 const char *name,
-			 unsigned namesz,
-			 struct afs_vlocation **_vlocation)
+static struct afs_vlocation *afs_vlocation_alloc(struct afs_cell *cell,
+						 const char *name,
+						 size_t namesz)
 {
-	struct afs_cache_vlocation vldb;
-	struct afs_vlocation *vlocation;
-	afs_voltype_t voltype;
-	afs_volid_t vid;
-	int active = 0, ret;
-
-	_enter("{%s},%*.*s,%u,", cell->name, namesz, namesz, name, namesz);
-
-	if (namesz > sizeof(vlocation->vldb.name)) {
-		_leave(" = -ENAMETOOLONG");
-		return -ENAMETOOLONG;
-	}
-
-	/* search the cell's active list first */
-	list_for_each_entry(vlocation, &cell->vl_list, link) {
-		if (namesz < sizeof(vlocation->vldb.name) &&
-		    vlocation->vldb.name[namesz] != '\0')
-			continue;
-
-		if (memcmp(vlocation->vldb.name, name, namesz) == 0)
-			goto found_in_memory;
-	}
-
-	/* search the cell's graveyard list second */
-	spin_lock(&cell->vl_gylock);
-	list_for_each_entry(vlocation, &cell->vl_graveyard, link) {
-		if (namesz < sizeof(vlocation->vldb.name) &&
-		    vlocation->vldb.name[namesz] != '\0')
-			continue;
-
-		if (memcmp(vlocation->vldb.name, name, namesz) == 0)
-			goto found_in_graveyard;
-	}
-	spin_unlock(&cell->vl_gylock);
-
-	/* not in the cell's in-memory lists - create a new record */
-	vlocation = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL);
-	if (!vlocation)
-		return -ENOMEM;
-
-	atomic_set(&vlocation->usage, 1);
-	INIT_LIST_HEAD(&vlocation->link);
-	rwlock_init(&vlocation->lock);
-	memcpy(vlocation->vldb.name, name, namesz);
-
-	afs_timer_init(&vlocation->timeout, &afs_vlocation_timer_ops);
-	afs_timer_init(&vlocation->upd_timer, &afs_vlocation_update_timer_ops);
-	afs_async_op_init(&vlocation->upd_op, &afs_vlocation_update_op_ops);
-
-	afs_get_cell(cell);
-	vlocation->cell = cell;
-
-	list_add_tail(&vlocation->link, &cell->vl_list);
-
-#ifdef AFS_CACHING_SUPPORT
-	/* we want to store it in the cache, plus it might already be
-	 * encached */
-	cachefs_acquire_cookie(cell->cache,
-			       &afs_volume_cache_index_def,
-			       vlocation,
-			       &vlocation->cache);
-
-	if (vlocation->valid)
-		goto found_in_cache;
-#endif
-
-	/* try to look up an unknown volume in the cell VL databases by name */
-	ret = afs_vlocation_access_vl_by_name(vlocation, name, namesz, &vldb);
-	if (ret < 0) {
-		printk("kAFS: failed to locate '%*.*s' in cell '%s'\n",
-		       namesz, namesz, name, cell->name);
-		goto error;
+	struct afs_vlocation *vl;
+
+	vl = kzalloc(sizeof(struct afs_vlocation), GFP_KERNEL);
+	if (vl) {
+		vl->cell = cell;
+		vl->state = AFS_VL_NEW;
+		atomic_set(&vl->usage, 1);
+		INIT_LIST_HEAD(&vl->link);
+		INIT_LIST_HEAD(&vl->grave);
+		INIT_LIST_HEAD(&vl->update);
+		init_waitqueue_head(&vl->waitq);
+		spin_lock_init(&vl->lock);
+		memcpy(vl->vldb.name, name, namesz);
 	}
 
-	goto found_on_vlserver;
-
- found_in_graveyard:
-	/* found in the graveyard - resurrect */
-	_debug("found in graveyard");
-	atomic_inc(&vlocation->usage);
-	list_move_tail(&vlocation->link, &cell->vl_list);
-	spin_unlock(&cell->vl_gylock);
-
-	afs_kafstimod_del_timer(&vlocation->timeout);
-	goto active;
-
- found_in_memory:
-	/* found in memory - check to see if it's active */
-	_debug("found in memory");
-	atomic_inc(&vlocation->usage);
+	_leave(" = %p", vl);
+	return vl;
+}
 
- active:
-	active = 1;
+/*
+ * update record if we found it in the cache
+ */
+static int afs_vlocation_update_record(struct afs_vlocation *vl,
+				       struct key *key,
+				       struct afs_cache_vlocation *vldb)
+{
+	afs_voltype_t voltype;
+	afs_volid_t vid;
+	int ret;
 
-#ifdef AFS_CACHING_SUPPORT
- found_in_cache:
-#endif
 	/* try to look up a cached volume in the cell VL databases by ID */
-	_debug("found in cache");
-
 	_debug("Locally Cached: %s %02x { %08x(%x) %08x(%x) %08x(%x) }",
-	       vlocation->vldb.name,
-	       vlocation->vldb.vidmask,
-	       ntohl(vlocation->vldb.servers[0].s_addr),
-	       vlocation->vldb.srvtmask[0],
-	       ntohl(vlocation->vldb.servers[1].s_addr),
-	       vlocation->vldb.srvtmask[1],
-	       ntohl(vlocation->vldb.servers[2].s_addr),
-	       vlocation->vldb.srvtmask[2]
-	       );
+	       vl->vldb.name,
+	       vl->vldb.vidmask,
+	       ntohl(vl->vldb.servers[0].s_addr),
+	       vl->vldb.srvtmask[0],
+	       ntohl(vl->vldb.servers[1].s_addr),
+	       vl->vldb.srvtmask[1],
+	       ntohl(vl->vldb.servers[2].s_addr),
+	       vl->vldb.srvtmask[2]);
 
 	_debug("Vids: %08x %08x %08x",
-	       vlocation->vldb.vid[0],
-	       vlocation->vldb.vid[1],
-	       vlocation->vldb.vid[2]);
+	       vl->vldb.vid[0],
+	       vl->vldb.vid[1],
+	       vl->vldb.vid[2]);
 
-	if (vlocation->vldb.vidmask & AFS_VOL_VTM_RW) {
-		vid = vlocation->vldb.vid[0];
+	if (vl->vldb.vidmask & AFS_VOL_VTM_RW) {
+		vid = vl->vldb.vid[0];
 		voltype = AFSVL_RWVOL;
-	}
-	else if (vlocation->vldb.vidmask & AFS_VOL_VTM_RO) {
-		vid = vlocation->vldb.vid[1];
+	} else if (vl->vldb.vidmask & AFS_VOL_VTM_RO) {
+		vid = vl->vldb.vid[1];
 		voltype = AFSVL_ROVOL;
-	}
-	else if (vlocation->vldb.vidmask & AFS_VOL_VTM_BAK) {
-		vid = vlocation->vldb.vid[2];
+	} else if (vl->vldb.vidmask & AFS_VOL_VTM_BAK) {
+		vid = vl->vldb.vid[2];
 		voltype = AFSVL_BACKVOL;
-	}
-	else {
+	} else {
 		BUG();
 		vid = 0;
 		voltype = 0;
 	}
 
-	ret = afs_vlocation_access_vl_by_id(vlocation, vid, voltype, &vldb);
+	/* contact the server to make sure the volume is still available
+	 * - TODO: need to handle disconnected operation here
+	 */
+	ret = afs_vlocation_access_vl_by_id(vl, key, vid, voltype, vldb);
 	switch (ret) {
 		/* net error */
 	default:
-		printk("kAFS: failed to volume '%*.*s' (%x) up in '%s': %d\n",
-		       namesz, namesz, name, vid, cell->name, ret);
-		goto error;
+		printk(KERN_WARNING "kAFS:"
+		       " failed to update volume '%s' (%x) up in '%s': %d\n",
+		       vl->vldb.name, vid, vl->cell->name, ret);
+		_leave(" = %d", ret);
+		return ret;
 
 		/* pulled from local cache into memory */
 	case 0:
-		goto found_on_vlserver;
+		_leave(" = 0");
+		return 0;
 
 		/* uh oh... looks like the volume got deleted */
 	case -ENOMEDIUM:
-		printk("kAFS: volume '%*.*s' (%x) does not exist '%s'\n",
-		       namesz, namesz, name, vid, cell->name);
+		printk(KERN_ERR "kAFS:"
+		       " volume '%s' (%x) does not exist '%s'\n",
+		       vl->vldb.name, vid, vl->cell->name);
 
 		/* TODO: make existing record unavailable */
-		goto error;
+		_leave(" = %d", ret);
+		return ret;
 	}
+}
 
- found_on_vlserver:
-	_debug("Done VL Lookup: %*.*s %02x { %08x(%x) %08x(%x) %08x(%x) }",
-	       namesz, namesz, name,
-	       vldb.vidmask,
-	       ntohl(vldb.servers[0].s_addr), vldb.srvtmask[0],
-	       ntohl(vldb.servers[1].s_addr), vldb.srvtmask[1],
-	       ntohl(vldb.servers[2].s_addr), vldb.srvtmask[2]
-	       );
-
-	_debug("Vids: %08x %08x %08x", vldb.vid[0], vldb.vid[1], vldb.vid[2]);
+/*
+ * apply the update to a VL record
+ */
+static void afs_vlocation_apply_update(struct afs_vlocation *vl,
+				       struct afs_cache_vlocation *vldb)
+{
+	_debug("Done VL Lookup: %s %02x { %08x(%x) %08x(%x) %08x(%x) }",
+	       vldb->name, vldb->vidmask,
+	       ntohl(vldb->servers[0].s_addr), vldb->srvtmask[0],
+	       ntohl(vldb->servers[1].s_addr), vldb->srvtmask[1],
+	       ntohl(vldb->servers[2].s_addr), vldb->srvtmask[2]);
 
-	if ((namesz < sizeof(vlocation->vldb.name) &&
-	     vlocation->vldb.name[namesz] != '\0') ||
-	    memcmp(vldb.name, name, namesz) != 0)
-		printk("kAFS: name of volume '%*.*s' changed to '%s' on server\n",
-		       namesz, namesz, name, vldb.name);
+	_debug("Vids: %08x %08x %08x",
+	       vldb->vid[0], vldb->vid[1], vldb->vid[2]);
 
-	memcpy(&vlocation->vldb, &vldb, sizeof(vlocation->vldb));
+	if (strcmp(vldb->name, vl->vldb.name) != 0)
+		printk(KERN_NOTICE "kAFS:"
+		       " name of volume '%s' changed to '%s' on server\n",
+		       vl->vldb.name, vldb->name);
 
-	afs_kafstimod_add_timer(&vlocation->upd_timer, 10 * HZ);
+	vl->vldb = *vldb;
 
 #ifdef AFS_CACHING_SUPPORT
 	/* update volume entry in local cache */
-	cachefs_update_cookie(vlocation->cache);
-#endif
-
-	*_vlocation = vlocation;
-	_leave(" = 0 (%p)",vlocation);
-	return 0;
-
- error:
-	if (vlocation) {
-		if (active) {
-			__afs_put_vlocation(vlocation);
-		}
-		else {
-			list_del(&vlocation->link);
-#ifdef AFS_CACHING_SUPPORT
-			cachefs_relinquish_cookie(vlocation->cache, 0);
+	cachefs_update_cookie(vl->cache);
 #endif
-			afs_put_cell(vlocation->cell);
-			kfree(vlocation);
-		}
-	}
-
-	_leave(" = %d", ret);
-	return ret;
-} /* end afs_vlocation_lookup() */
+}
 
-/*****************************************************************************/
 /*
- * finish using a volume location record
- * - caller must have cell->vol_sem write-locked
+ * fill in a volume location record, consulting the cache and the VL server
+ * both
  */
-static void __afs_put_vlocation(struct afs_vlocation *vlocation)
+static int afs_vlocation_fill_in_record(struct afs_vlocation *vl,
+					struct key *key)
 {
-	struct afs_cell *cell;
+	struct afs_cache_vlocation vldb;
+	int ret;
 
-	if (!vlocation)
-		return;
+	_enter("");
 
-	_enter("%s", vlocation->vldb.name);
+	ASSERTCMP(vl->valid, ==, 0);
 
-	cell = vlocation->cell;
+	memset(&vldb, 0, sizeof(vldb));
 
-	/* sanity check */
-	BUG_ON(atomic_read(&vlocation->usage) <= 0);
+	/* see if we have an in-cache copy (will set vl->valid if there is) */
+#ifdef AFS_CACHING_SUPPORT
+	cachefs_acquire_cookie(cell->cache,
+			       &afs_volume_cache_index_def,
+			       vlocation,
+			       &vl->cache);
+#endif
 
-	spin_lock(&cell->vl_gylock);
-	if (likely(!atomic_dec_and_test(&vlocation->usage))) {
-		spin_unlock(&cell->vl_gylock);
-		_leave("");
-		return;
+	if (vl->valid) {
+		/* try to update a known volume in the cell VL databases by
+		 * ID as the name may have changed */
+		_debug("found in cache");
+		ret = afs_vlocation_update_record(vl, key, &vldb);
+	} else {
+		/* try to look up an unknown volume in the cell VL databases by
+		 * name */
+		ret = afs_vlocation_access_vl_by_name(vl, key, &vldb);
+		if (ret < 0) {
+			printk("kAFS: failed to locate '%s' in cell '%s'\n",
+			       vl->vldb.name, vl->cell->name);
+			return ret;
+		}
 	}
 
-	/* move to graveyard queue */
-	list_move_tail(&vlocation->link,&cell->vl_graveyard);
-
-	/* remove from pending timeout queue (refcounted if actually being
-	 * updated) */
-	list_del_init(&vlocation->upd_op.link);
-
-	/* time out in 10 secs */
-	afs_kafstimod_del_timer(&vlocation->upd_timer);
-	afs_kafstimod_add_timer(&vlocation->timeout, 10 * HZ);
-
-	spin_unlock(&cell->vl_gylock);
-
-	_leave(" [killed]");
-} /* end __afs_put_vlocation() */
-
-/*****************************************************************************/
-/*
- * finish using a volume location record
- */
-void afs_put_vlocation(struct afs_vlocation *vlocation)
-{
-	if (vlocation) {
-		struct afs_cell *cell = vlocation->cell;
-
-		down_write(&cell->vl_sem);
-		__afs_put_vlocation(vlocation);
-		up_write(&cell->vl_sem);
-	}
-} /* end afs_put_vlocation() */
+	afs_vlocation_apply_update(vl, &vldb);
+	_leave(" = 0");
+	return 0;
+}
 
-/*****************************************************************************/
 /*
- * timeout vlocation record
- * - removes from the cell's graveyard if the usage count is zero
+ * queue a vlocation record for updates
  */
-void afs_vlocation_do_timeout(struct afs_vlocation *vlocation)
+void afs_vlocation_queue_for_updates(struct afs_vlocation *vl)
 {
-	struct afs_cell *cell;
+	struct afs_vlocation *xvl;
 
-	_enter("%s", vlocation->vldb.name);
+	/* wait at least 10 minutes before updating... */
+	vl->update_at = get_seconds() + afs_vlocation_update_timeout;
 
-	cell = vlocation->cell;
+	spin_lock(&afs_vlocation_updates_lock);
 
-	BUG_ON(atomic_read(&vlocation->usage) < 0);
-
-	/* remove from graveyard if still dead */
-	spin_lock(&cell->vl_gylock);
-	if (atomic_read(&vlocation->usage) == 0)
-		list_del_init(&vlocation->link);
-	else
-		vlocation = NULL;
-	spin_unlock(&cell->vl_gylock);
-
-	if (!vlocation) {
-		_leave("");
-		return; /* resurrected */
+	if (!list_empty(&afs_vlocation_updates)) {
+		/* ... but wait at least 1 second more than the newest record
+		 * already queued so that we don't spam the VL server suddenly
+		 * with lots of requests
+		 */
+		xvl = list_entry(afs_vlocation_updates.prev,
+				 struct afs_vlocation, update);
+		if (vl->update_at <= xvl->update_at)
+			vl->update_at = xvl->update_at + 1;
+	} else {
+		queue_delayed_work(afs_vlocation_update_worker,
+				   &afs_vlocation_update,
+				   afs_vlocation_update_timeout * HZ);
 	}
 
-	/* we can now destroy it properly */
-#ifdef AFS_CACHING_SUPPORT
-	cachefs_relinquish_cookie(vlocation->cache, 0);
-#endif
-	afs_put_cell(cell);
-
-	kfree(vlocation);
-
-	_leave(" [destroyed]");
-} /* end afs_vlocation_do_timeout() */
+	list_add_tail(&vl->update, &afs_vlocation_updates);
+	spin_unlock(&afs_vlocation_updates_lock);
+}
 
-/*****************************************************************************/
 /*
- * send an update operation to the currently selected server
+ * lookup volume location
+ * - iterate through the VL servers in a cell until one of them admits knowing
+ *   about the volume in question
+ * - lookup in the local cache if not able to find on the VL server
+ * - insert/update in the local cache if did get a VL response
  */
-static int afs_vlocation_update_begin(struct afs_vlocation *vlocation)
+struct afs_vlocation *afs_vlocation_lookup(struct afs_cell *cell,
+					   struct key *key,
+					   const char *name,
+					   size_t namesz)
 {
-	afs_voltype_t voltype;
-	afs_volid_t vid;
+	struct afs_vlocation *vl;
 	int ret;
 
-	_enter("%s{ufs=%u ucs=%u}",
-	       vlocation->vldb.name,
-	       vlocation->upd_first_svix,
-	       vlocation->upd_curr_svix);
+	_enter("{%s},{%x},%*.*s,%zu",
+	       cell->name, key_serial(key),
+	       (int) namesz, (int) namesz, name, namesz);
 
-	/* try to look up a cached volume in the cell VL databases by ID */
-	if (vlocation->vldb.vidmask & AFS_VOL_VTM_RW) {
-		vid = vlocation->vldb.vid[0];
-		voltype = AFSVL_RWVOL;
-	}
-	else if (vlocation->vldb.vidmask & AFS_VOL_VTM_RO) {
-		vid = vlocation->vldb.vid[1];
-		voltype = AFSVL_ROVOL;
+	if (namesz > sizeof(vl->vldb.name)) {
+		_leave(" = -ENAMETOOLONG");
+		return ERR_PTR(-ENAMETOOLONG);
 	}
-	else if (vlocation->vldb.vidmask & AFS_VOL_VTM_BAK) {
-		vid = vlocation->vldb.vid[2];
-		voltype = AFSVL_BACKVOL;
+
+	/* see if we have an in-memory copy first */
+	down_write(&cell->vl_sem);
+	spin_lock(&cell->vl_lock);
+	list_for_each_entry(vl, &cell->vl_list, link) {
+		if (vl->vldb.name[namesz] != '\0')
+			continue;
+		if (memcmp(vl->vldb.name, name, namesz) == 0)
+			goto found_in_memory;
 	}
-	else {
-		BUG();
-		vid = 0;
-		voltype = 0;
+	spin_unlock(&cell->vl_lock);
+
+	/* not in the cell's in-memory lists - create a new record */
+	vl = afs_vlocation_alloc(cell, name, namesz);
+	if (!vl) {
+		up_write(&cell->vl_sem);
+		return ERR_PTR(-ENOMEM);
 	}
 
-	/* contact the chosen server */
-	ret = afs_server_lookup(
-		vlocation->cell,
-		&vlocation->cell->vl_addrs[vlocation->upd_curr_svix],
-		&vlocation->upd_op.server);
+	afs_get_cell(cell);
 
-	switch (ret) {
-	case 0:
-		break;
-	case -ENOMEM:
-	case -ENONET:
-	default:
-		_leave(" = %d", ret);
-		return ret;
-	}
+	list_add_tail(&vl->link, &cell->vl_list);
+	vl->state = AFS_VL_CREATING;
+	up_write(&cell->vl_sem);
 
-	/* initiate the update operation */
-	ret = afs_rxvl_get_entry_by_id_async(&vlocation->upd_op, vid, voltype);
-	if (ret < 0) {
-		_leave(" = %d", ret);
-		return ret;
+fill_in_record:
+	ret = afs_vlocation_fill_in_record(vl, key);
+	if (ret < 0)
+		goto error_abandon;
+	spin_lock(&vl->lock);
+	vl->state = AFS_VL_VALID;
+	wake_up(&vl->waitq);
+	spin_unlock(&vl->lock);
+
+	/* schedule for regular updates */
+	afs_vlocation_queue_for_updates(vl);
+	goto success;
+
+found_in_memory:
+	/* found in memory */
+	_debug("found in memory");
+	atomic_inc(&vl->usage);
+	spin_unlock(&cell->vl_lock);
+	if (!list_empty(&vl->grave)) {
+		spin_lock(&afs_vlocation_graveyard_lock);
+		list_del_init(&vl->grave);
+		spin_unlock(&afs_vlocation_graveyard_lock);
 	}
+	up_write(&cell->vl_sem);
+
+	/* see if it was an abandoned record that we might try filling in */
+	spin_lock(&vl->lock);
+	while (vl->state != AFS_VL_VALID) {
+		afs_vlocation_state_t state = vl->state;
+
+		_debug("invalid [state %d]", state);
+
+		if ((state == AFS_VL_NEW || state == AFS_VL_NO_VOLUME)) {
+			vl->state = AFS_VL_CREATING;
+			spin_unlock(&vl->lock);
+			goto fill_in_record;
+		}
+
+		/* must now wait for creation or update by someone else to
+		 * complete */
+		_debug("wait");
 
+		spin_unlock(&vl->lock);
+		ret = wait_event_interruptible(
+			vl->waitq,
+			vl->state == AFS_VL_NEW ||
+			vl->state == AFS_VL_VALID ||
+			vl->state == AFS_VL_NO_VOLUME);
+		if (ret < 0)
+			goto error;
+		spin_lock(&vl->lock);
+	}
+	spin_unlock(&vl->lock);
+
+success:
+	_leave(" = %p",vl);
+	return vl;
+
+error_abandon:
+	spin_lock(&vl->lock);
+	vl->state = AFS_VL_NEW;
+	wake_up(&vl->waitq);
+	spin_unlock(&vl->lock);
+error:
+	ASSERT(vl != NULL);
+	afs_put_vlocation(vl);
 	_leave(" = %d", ret);
-	return ret;
-} /* end afs_vlocation_update_begin() */
+	return ERR_PTR(ret);
+}
 
-/*****************************************************************************/
 /*
- * abandon updating a VL record
- * - does not restart the update timer
+ * finish using a volume location record
  */
-static void afs_vlocation_update_abandon(struct afs_vlocation *vlocation,
-					 afs_vlocation_upd_t state,
-					 int ret)
+void afs_put_vlocation(struct afs_vlocation *vl)
 {
-	_enter("%s,%u", vlocation->vldb.name, state);
-
-	if (ret < 0)
-		printk("kAFS: Abandoning VL update '%s': %d\n",
-		       vlocation->vldb.name, ret);
-
-	/* discard the server record */
-	afs_put_server(vlocation->upd_op.server);
-	vlocation->upd_op.server = NULL;
+	if (!vl)
+		return;
 
-	spin_lock(&afs_vlocation_update_lock);
-	afs_vlocation_update = NULL;
-	vlocation->upd_state = state;
+	_enter("%s", vl->vldb.name);
 
-	/* TODO: start updating next VL record on pending list */
+	ASSERTCMP(atomic_read(&vl->usage), >, 0);
 
-	spin_unlock(&afs_vlocation_update_lock);
+	if (likely(!atomic_dec_and_test(&vl->usage))) {
+		_leave("");
+		return;
+	}
 
-	_leave("");
-} /* end afs_vlocation_update_abandon() */
+	spin_lock(&afs_vlocation_graveyard_lock);
+	if (atomic_read(&vl->usage) == 0) {
+		_debug("buried");
+		list_move_tail(&vl->grave, &afs_vlocation_graveyard);
+		vl->time_of_death = get_seconds();
+		schedule_delayed_work(&afs_vlocation_reap,
+				      afs_vlocation_timeout * HZ);
+
+		/* suspend updates on this record */
+		if (!list_empty(&vl->update)) {
+			spin_lock(&afs_vlocation_updates_lock);
+			list_del_init(&vl->update);
+			spin_unlock(&afs_vlocation_updates_lock);
+		}
+	}
+	spin_unlock(&afs_vlocation_graveyard_lock);
+	_leave(" [killed?]");
+}
 
-/*****************************************************************************/
 /*
- * handle periodic update timeouts and busy retry timeouts
- * - called from kafstimod
+ * destroy a dead volume location record
  */
-static void afs_vlocation_update_timer(struct afs_timer *timer)
+static void afs_vlocation_destroy(struct afs_vlocation *vl)
 {
-	struct afs_vlocation *vlocation =
-		list_entry(timer, struct afs_vlocation, upd_timer);
-	int ret;
+	_enter("%p", vl);
 
-	_enter("%s", vlocation->vldb.name);
+#ifdef AFS_CACHING_SUPPORT
+	cachefs_relinquish_cookie(vl->cache, 0);
+#endif
 
-	/* only update if not in the graveyard (defend against putting too) */
-	spin_lock(&vlocation->cell->vl_gylock);
+	afs_put_cell(vl->cell);
+	kfree(vl);
+}
 
-	if (!atomic_read(&vlocation->usage))
-		goto out_unlock1;
+/*
+ * reap dead volume location records
+ */
+static void afs_vlocation_reaper(struct work_struct *work)
+{
+	LIST_HEAD(corpses);
+	struct afs_vlocation *vl;
+	unsigned long delay, expiry;
+	time_t now;
 
-	spin_lock(&afs_vlocation_update_lock);
+	_enter("");
 
-	/* if we were woken up due to EBUSY sleep then restart immediately if
-	 * possible or else jump to front of pending queue */
-	if (vlocation->upd_state == AFS_VLUPD_BUSYSLEEP) {
-		if (afs_vlocation_update) {
-			list_add(&vlocation->upd_op.link,
-				 &afs_vlocation_update_pendq);
+	now = get_seconds();
+	spin_lock(&afs_vlocation_graveyard_lock);
+
+	while (!list_empty(&afs_vlocation_graveyard)) {
+		vl = list_entry(afs_vlocation_graveyard.next,
+				struct afs_vlocation, grave);
+
+		_debug("check %p", vl);
+
+		/* the queue is ordered most dead first */
+		expiry = vl->time_of_death + afs_vlocation_timeout;
+		if (expiry > now) {
+			delay = (expiry - now) * HZ;
+			_debug("delay %lu", delay);
+			if (!schedule_delayed_work(&afs_vlocation_reap,
+						   delay)) {
+				cancel_delayed_work(&afs_vlocation_reap);
+				schedule_delayed_work(&afs_vlocation_reap,
+						      delay);
+			}
+			break;
 		}
-		else {
-			afs_get_vlocation(vlocation);
-			afs_vlocation_update = vlocation;
-			vlocation->upd_state = AFS_VLUPD_INPROGRESS;
+
+		spin_lock(&vl->cell->vl_lock);
+		if (atomic_read(&vl->usage) > 0) {
+			_debug("no reap");
+			list_del_init(&vl->grave);
+		} else {
+			_debug("reap");
+			list_move_tail(&vl->grave, &corpses);
+			list_del_init(&vl->link);
 		}
-		goto out_unlock2;
+		spin_unlock(&vl->cell->vl_lock);
 	}
 
-	/* put on pending queue if there's already another update in progress */
-	if (afs_vlocation_update) {
-		vlocation->upd_state = AFS_VLUPD_PENDING;
-		list_add_tail(&vlocation->upd_op.link,
-			      &afs_vlocation_update_pendq);
-		goto out_unlock2;
-	}
+	spin_unlock(&afs_vlocation_graveyard_lock);
 
-	/* hold a ref on it while actually updating */
-	afs_get_vlocation(vlocation);
-	afs_vlocation_update = vlocation;
-	vlocation->upd_state = AFS_VLUPD_INPROGRESS;
-
-	spin_unlock(&afs_vlocation_update_lock);
-	spin_unlock(&vlocation->cell->vl_gylock);
-
-	/* okay... we can start the update */
-	_debug("BEGIN VL UPDATE [%s]", vlocation->vldb.name);
-	vlocation->upd_first_svix = vlocation->cell->vl_curr_svix;
-	vlocation->upd_curr_svix = vlocation->upd_first_svix;
-	vlocation->upd_rej_cnt = 0;
-	vlocation->upd_busy_cnt = 0;
-
-	ret = afs_vlocation_update_begin(vlocation);
-	if (ret < 0) {
-		afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, ret);
-		afs_kafstimod_add_timer(&vlocation->upd_timer,
-					AFS_VLDB_TIMEOUT);
-		afs_put_vlocation(vlocation);
+	/* now reap the corpses we've extracted */
+	while (!list_empty(&corpses)) {
+		vl = list_entry(corpses.next, struct afs_vlocation, grave);
+		list_del(&vl->grave);
+		afs_vlocation_destroy(vl);
 	}
 
 	_leave("");
-	return;
+}
 
- out_unlock2:
-	spin_unlock(&afs_vlocation_update_lock);
- out_unlock1:
-	spin_unlock(&vlocation->cell->vl_gylock);
-	_leave("");
-	return;
+/*
+ * initialise the VL update process
+ */
+int __init afs_vlocation_update_init(void)
+{
+	afs_vlocation_update_worker =
+		create_singlethread_workqueue("kafs_vlupdated");
+	return afs_vlocation_update_worker ? 0 : -ENOMEM;
+}
 
-} /* end afs_vlocation_update_timer() */
+/*
+ * discard all the volume location records for rmmod
+ */
+void __exit afs_vlocation_purge(void)
+{
+	afs_vlocation_timeout = 0;
+
+	spin_lock(&afs_vlocation_updates_lock);
+	list_del_init(&afs_vlocation_updates);
+	spin_unlock(&afs_vlocation_updates_lock);
+	cancel_delayed_work(&afs_vlocation_update);
+	queue_delayed_work(afs_vlocation_update_worker,
+			   &afs_vlocation_update, 0);
+	destroy_workqueue(afs_vlocation_update_worker);
+
+	cancel_delayed_work(&afs_vlocation_reap);
+	schedule_delayed_work(&afs_vlocation_reap, 0);
+}
 
-/*****************************************************************************/
 /*
- * attend to an update operation upon which an event happened
- * - called in kafsasyncd context
+ * update a volume location
  */
-static void afs_vlocation_update_attend(struct afs_async_op *op)
+static void afs_vlocation_updater(struct work_struct *work)
 {
 	struct afs_cache_vlocation vldb;
-	struct afs_vlocation *vlocation =
-		list_entry(op, struct afs_vlocation, upd_op);
-	unsigned tmp;
+	struct afs_vlocation *vl, *xvl;
+	time_t now;
+	long timeout;
 	int ret;
 
-	_enter("%s", vlocation->vldb.name);
-
-	ret = afs_rxvl_get_entry_by_id_async2(op, &vldb);
-	switch (ret) {
-	case -EAGAIN:
-		_leave(" [unfinished]");
-		return;
-
-	case 0:
-		_debug("END VL UPDATE: %d\n", ret);
-		vlocation->valid = 1;
-
-		_debug("Done VL Lookup: %02x { %08x(%x) %08x(%x) %08x(%x) }",
-		       vldb.vidmask,
-		       ntohl(vldb.servers[0].s_addr), vldb.srvtmask[0],
-		       ntohl(vldb.servers[1].s_addr), vldb.srvtmask[1],
-		       ntohl(vldb.servers[2].s_addr), vldb.srvtmask[2]
-		       );
-
-		_debug("Vids: %08x %08x %08x",
-		       vldb.vid[0], vldb.vid[1], vldb.vid[2]);
-
-		afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, 0);
-
-		down_write(&vlocation->cell->vl_sem);
-
-		/* actually update the cache */
-		if (strncmp(vldb.name, vlocation->vldb.name,
-			    sizeof(vlocation->vldb.name)) != 0)
-			printk("kAFS: name of volume '%s'"
-			       " changed to '%s' on server\n",
-			       vlocation->vldb.name, vldb.name);
-
-		memcpy(&vlocation->vldb, &vldb, sizeof(vlocation->vldb));
-
-#if 0
-		/* TODO update volume entry in local cache */
-#endif
-
-		up_write(&vlocation->cell->vl_sem);
-
-		if (ret < 0)
-			printk("kAFS: failed to update local cache: %d\n", ret);
-
-		afs_kafstimod_add_timer(&vlocation->upd_timer,
-					AFS_VLDB_TIMEOUT);
-		afs_put_vlocation(vlocation);
-		_leave(" [found]");
-		return;
-
-	case -ENOMEDIUM:
-		vlocation->upd_rej_cnt++;
-		goto try_next;
-
-		/* the server is locked - retry in a very short while */
-	case -EBUSY:
-		vlocation->upd_busy_cnt++;
-		if (vlocation->upd_busy_cnt > 3)
-			goto try_next; /* too many retries */
-
-		afs_vlocation_update_abandon(vlocation,
-					     AFS_VLUPD_BUSYSLEEP, 0);
-		afs_kafstimod_add_timer(&vlocation->upd_timer, HZ / 2);
-		afs_put_vlocation(vlocation);
-		_leave(" [busy]");
-		return;
-
-	case -ENETUNREACH:
-	case -EHOSTUNREACH:
-	case -ECONNREFUSED:
-	case -EREMOTEIO:
-		/* record bad vlserver info in the cell too
-		 * - TODO: use down_write_trylock() if available
-		 */
-		if (vlocation->upd_curr_svix == vlocation->cell->vl_curr_svix)
-			vlocation->cell->vl_curr_svix =
-				vlocation->cell->vl_curr_svix %
-				vlocation->cell->vl_naddrs;
-
-	case -EBADRQC:
-	case -EINVAL:
-	case -EACCES:
-	case -EBADMSG:
-		goto try_next;
-
-	default:
-		goto abandon;
-	}
-
-	/* try contacting the next server */
- try_next:
-	vlocation->upd_busy_cnt = 0;
-
-	/* discard the server record */
-	afs_put_server(vlocation->upd_op.server);
-	vlocation->upd_op.server = NULL;
+	_enter("");
 
-	tmp = vlocation->cell->vl_naddrs;
-	if (tmp == 0)
-		goto abandon;
+	now = get_seconds();
 
-	vlocation->upd_curr_svix++;
-	if (vlocation->upd_curr_svix >= tmp)
-		vlocation->upd_curr_svix = 0;
-	if (vlocation->upd_first_svix >= tmp)
-		vlocation->upd_first_svix = tmp - 1;
+	/* find a record to update */
+	spin_lock(&afs_vlocation_updates_lock);
+	for (;;) {
+		if (list_empty(&afs_vlocation_updates)) {
+			spin_unlock(&afs_vlocation_updates_lock);
+			_leave(" [nothing]");
+			return;
+		}
 
-	/* move to the next server */
-	if (vlocation->upd_curr_svix != vlocation->upd_first_svix) {
-		afs_vlocation_update_begin(vlocation);
-		_leave(" [next]");
-		return;
+		vl = list_entry(afs_vlocation_updates.next,
+				struct afs_vlocation, update);
+		if (atomic_read(&vl->usage) > 0)
+			break;
+		list_del_init(&vl->update);
 	}
 
-	/* run out of servers to try - was the volume rejected? */
-	if (vlocation->upd_rej_cnt > 0) {
-		printk("kAFS: Active volume no longer valid '%s'\n",
-		       vlocation->vldb.name);
-		vlocation->valid = 0;
-		afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, 0);
-		afs_kafstimod_add_timer(&vlocation->upd_timer,
-					AFS_VLDB_TIMEOUT);
-		afs_put_vlocation(vlocation);
-		_leave(" [invalidated]");
+	timeout = vl->update_at - now;
+	if (timeout > 0) {
+		queue_delayed_work(afs_vlocation_update_worker,
+				   &afs_vlocation_update, timeout * HZ);
+		spin_unlock(&afs_vlocation_updates_lock);
+		_leave(" [nothing]");
 		return;
 	}
 
-	/* abandon the update */
- abandon:
-	afs_vlocation_update_abandon(vlocation, AFS_VLUPD_SLEEP, ret);
-	afs_kafstimod_add_timer(&vlocation->upd_timer, HZ * 10);
-	afs_put_vlocation(vlocation);
-	_leave(" [abandoned]");
-
-} /* end afs_vlocation_update_attend() */
-
-/*****************************************************************************/
-/*
- * deal with an update operation being discarded
- * - called in kafsasyncd context when it's dying due to rmmod
- * - the call has already been aborted and put()'d
- */
-static void afs_vlocation_update_discard(struct afs_async_op *op)
-{
-	struct afs_vlocation *vlocation =
-		list_entry(op, struct afs_vlocation, upd_op);
+	list_del_init(&vl->update);
+	atomic_inc(&vl->usage);
+	spin_unlock(&afs_vlocation_updates_lock);
 
-	_enter("%s", vlocation->vldb.name);
+	/* we can now perform the update */
+	_debug("update %s", vl->vldb.name);
+	vl->state = AFS_VL_UPDATING;
+	vl->upd_rej_cnt = 0;
+	vl->upd_busy_cnt = 0;
 
-	afs_put_server(op->server);
-	op->server = NULL;
+	ret = afs_vlocation_update_record(vl, NULL, &vldb);
+	spin_lock(&vl->lock);
+	switch (ret) {
+	case 0:
+		afs_vlocation_apply_update(vl, &vldb);
+		vl->state = AFS_VL_VALID;
+		wake_up(&vl->waitq);
+		break;
+	case -ENOMEDIUM:
+		vl->state = AFS_VL_VOLUME_DELETED;
+		break;
+	default:
+		vl->state = AFS_VL_UNCERTAIN;
+		break;
+	}
+	spin_unlock(&vl->lock);
 
-	afs_put_vlocation(vlocation);
+	/* and then reschedule */
+	_debug("reschedule");
+	vl->update_at = get_seconds() + afs_vlocation_update_timeout;
 
-	_leave("");
-} /* end afs_vlocation_update_discard() */
+	spin_lock(&afs_vlocation_updates_lock);
 
-/*****************************************************************************/
-/*
- * match a VLDB record stored in the cache
- * - may also load target from entry
- */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vlocation_cache_match(void *target,
-						     const void *entry)
-{
-	const struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = target;
-
-	_enter("{%s},{%s}", vlocation->vldb.name, vldb->name);
-
-	if (strncmp(vlocation->vldb.name, vldb->name, sizeof(vldb->name)) == 0
-	    ) {
-		if (!vlocation->valid ||
-		    vlocation->vldb.rtime == vldb->rtime
-		    ) {
-			vlocation->vldb = *vldb;
-			vlocation->valid = 1;
-			_leave(" = SUCCESS [c->m]");
-			return CACHEFS_MATCH_SUCCESS;
-		}
-		/* need to update cache if cached info differs */
-		else if (memcmp(&vlocation->vldb, vldb, sizeof(*vldb)) != 0) {
-			/* delete if VIDs for this name differ */
-			if (memcmp(&vlocation->vldb.vid,
-				   &vldb->vid,
-				   sizeof(vldb->vid)) != 0) {
-				_leave(" = DELETE");
-				return CACHEFS_MATCH_SUCCESS_DELETE;
-			}
-
-			_leave(" = UPDATE");
-			return CACHEFS_MATCH_SUCCESS_UPDATE;
-		}
-		else {
-			_leave(" = SUCCESS");
-			return CACHEFS_MATCH_SUCCESS;
-		}
+	if (!list_empty(&afs_vlocation_updates)) {
+		/* next update in 10 minutes, but wait at least 1 second more
+		 * than the newest record already queued so that we don't spam
+		 * the VL server suddenly with lots of requests
+		 */
+		xvl = list_entry(afs_vlocation_updates.prev,
+				 struct afs_vlocation, update);
+		if (vl->update_at <= xvl->update_at)
+			vl->update_at = xvl->update_at + 1;
+		xvl = list_entry(afs_vlocation_updates.next,
+				 struct afs_vlocation, update);
+		timeout = xvl->update_at - now;
+		if (timeout < 0)
+			timeout = 0;
+	} else {
+		timeout = afs_vlocation_update_timeout;
 	}
 
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
-} /* end afs_vlocation_cache_match() */
-#endif
-
-/*****************************************************************************/
-/*
- * update a VLDB record stored in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vlocation_cache_update(void *source, void *entry)
-{
-	struct afs_cache_vlocation *vldb = entry;
-	struct afs_vlocation *vlocation = source;
+	ASSERT(list_empty(&vl->update));
 
-	_enter("");
-
-	*vldb = vlocation->vldb;
+	list_add_tail(&vl->update, &afs_vlocation_updates);
 
-} /* end afs_vlocation_cache_update() */
-#endif
+	_debug("timeout %ld", timeout);
+	queue_delayed_work(afs_vlocation_update_worker,
+			   &afs_vlocation_update, timeout * HZ);
+	spin_unlock(&afs_vlocation_updates_lock);
+	afs_put_vlocation(vl);
+}
diff --git a/fs/afs/vnode.c b/fs/afs/vnode.c
index cf62da5d7825..a1904ab8426a 100644
--- a/fs/afs/vnode.c
+++ b/fs/afs/vnode.c
@@ -1,6 +1,6 @@
-/* vnode.c: AFS vnode management
+/* AFS vnode management
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -14,142 +14,237 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
-#include <linux/pagemap.h>
-#include "volume.h"
-#include "cell.h"
-#include "cmservice.h"
-#include "fsclient.h"
-#include "vlclient.h"
-#include "vnode.h"
 #include "internal.h"
 
-static void afs_vnode_cb_timed_out(struct afs_timer *timer);
+#if 0
+static noinline bool dump_tree_aux(struct rb_node *node, struct rb_node *parent,
+				   int depth, char lr)
+{
+	struct afs_vnode *vnode;
+	bool bad = false;
+
+	if (!node)
+		return false;
+
+	if (node->rb_left)
+		bad = dump_tree_aux(node->rb_left, node, depth + 2, '/');
+
+	vnode = rb_entry(node, struct afs_vnode, cb_promise);
+	_debug("%c %*.*s%c%p {%d}",
+	       rb_is_red(node) ? 'R' : 'B',
+	       depth, depth, "", lr,
+	       vnode, vnode->cb_expires_at);
+	if (rb_parent(node) != parent) {
+		printk("BAD: %p != %p\n", rb_parent(node), parent);
+		bad = true;
+	}
 
-struct afs_timer_ops afs_vnode_cb_timed_out_ops = {
-	.timed_out	= afs_vnode_cb_timed_out,
-};
+	if (node->rb_right)
+		bad |= dump_tree_aux(node->rb_right, node, depth + 2, '\\');
 
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry);
-static void afs_vnode_cache_update(void *source, void *entry);
+	return bad;
+}
 
-struct cachefs_index_def afs_vnode_cache_index_def = {
-	.name		= "vnode",
-	.data_size	= sizeof(struct afs_cache_vnode),
-	.keys[0]	= { CACHEFS_INDEX_KEYS_BIN, 4 },
-	.match		= afs_vnode_cache_match,
-	.update		= afs_vnode_cache_update,
-};
+static noinline void dump_tree(const char *name, struct afs_server *server)
+{
+	_enter("%s", name);
+	if (dump_tree_aux(server->cb_promises.rb_node, NULL, 0, '-'))
+		BUG();
+}
 #endif
 
-/*****************************************************************************/
 /*
- * handle a callback timing out
- * TODO: retain a ref to vnode struct for an outstanding callback timeout
+ * insert a vnode into the backing server's vnode tree
  */
-static void afs_vnode_cb_timed_out(struct afs_timer *timer)
+static void afs_install_vnode(struct afs_vnode *vnode,
+			      struct afs_server *server)
 {
-	struct afs_server *oldserver;
-	struct afs_vnode *vnode;
+	struct afs_server *old_server = vnode->server;
+	struct afs_vnode *xvnode;
+	struct rb_node *parent, **p;
 
-	vnode = list_entry(timer, struct afs_vnode, cb_timeout);
+	_enter("%p,%p", vnode, server);
 
-	_enter("%p", vnode);
+	if (old_server) {
+		spin_lock(&old_server->fs_lock);
+		rb_erase(&vnode->server_rb, &old_server->fs_vnodes);
+		spin_unlock(&old_server->fs_lock);
+	}
 
-	/* set the changed flag in the vnode and release the server */
-	spin_lock(&vnode->lock);
+	afs_get_server(server);
+	vnode->server = server;
+	afs_put_server(old_server);
+
+	/* insert into the server's vnode tree in FID order */
+	spin_lock(&server->fs_lock);
+
+	parent = NULL;
+	p = &server->fs_vnodes.rb_node;
+	while (*p) {
+		parent = *p;
+		xvnode = rb_entry(parent, struct afs_vnode, server_rb);
+		if (vnode->fid.vid < xvnode->fid.vid)
+			p = &(*p)->rb_left;
+		else if (vnode->fid.vid > xvnode->fid.vid)
+			p = &(*p)->rb_right;
+		else if (vnode->fid.vnode < xvnode->fid.vnode)
+			p = &(*p)->rb_left;
+		else if (vnode->fid.vnode > xvnode->fid.vnode)
+			p = &(*p)->rb_right;
+		else if (vnode->fid.unique < xvnode->fid.unique)
+			p = &(*p)->rb_left;
+		else if (vnode->fid.unique > xvnode->fid.unique)
+			p = &(*p)->rb_right;
+		else
+			BUG(); /* can't happen unless afs_iget() malfunctions */
+	}
+
+	rb_link_node(&vnode->server_rb, parent, p);
+	rb_insert_color(&vnode->server_rb, &server->fs_vnodes);
 
-	oldserver = xchg(&vnode->cb_server, NULL);
-	if (oldserver) {
-		vnode->flags |= AFS_VNODE_CHANGED;
+	spin_unlock(&server->fs_lock);
+	_leave("");
+}
 
-		spin_lock(&afs_cb_hash_lock);
-		list_del_init(&vnode->cb_hash_link);
-		spin_unlock(&afs_cb_hash_lock);
+/*
+ * insert a vnode into the promising server's update/expiration tree
+ * - caller must hold vnode->lock
+ */
+static void afs_vnode_note_promise(struct afs_vnode *vnode,
+				   struct afs_server *server)
+{
+	struct afs_server *old_server;
+	struct afs_vnode *xvnode;
+	struct rb_node *parent, **p;
 
-		spin_lock(&oldserver->cb_lock);
-		list_del_init(&vnode->cb_link);
-		spin_unlock(&oldserver->cb_lock);
+	_enter("%p,%p", vnode, server);
+
+	ASSERT(server != NULL);
+
+	old_server = vnode->server;
+	if (vnode->cb_promised) {
+		if (server == old_server &&
+		    vnode->cb_expires == vnode->cb_expires_at) {
+			_leave(" [no change]");
+			return;
+		}
+
+		spin_lock(&old_server->cb_lock);
+		if (vnode->cb_promised) {
+			_debug("delete");
+			rb_erase(&vnode->cb_promise, &old_server->cb_promises);
+			vnode->cb_promised = false;
+		}
+		spin_unlock(&old_server->cb_lock);
 	}
 
-	spin_unlock(&vnode->lock);
+	if (vnode->server != server)
+		afs_install_vnode(vnode, server);
+
+	vnode->cb_expires_at = vnode->cb_expires;
+	_debug("PROMISE on %p {%lu}",
+	       vnode, (unsigned long) vnode->cb_expires_at);
+
+	/* abuse an RB-tree to hold the expiration order (we may have multiple
+	 * items with the same expiration time) */
+	spin_lock(&server->cb_lock);
+
+	parent = NULL;
+	p = &server->cb_promises.rb_node;
+	while (*p) {
+		parent = *p;
+		xvnode = rb_entry(parent, struct afs_vnode, cb_promise);
+		if (vnode->cb_expires_at < xvnode->cb_expires_at)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
 
-	afs_put_server(oldserver);
+	rb_link_node(&vnode->cb_promise, parent, p);
+	rb_insert_color(&vnode->cb_promise, &server->cb_promises);
+	vnode->cb_promised = true;
 
+	spin_unlock(&server->cb_lock);
 	_leave("");
-} /* end afs_vnode_cb_timed_out() */
+}
 
-/*****************************************************************************/
 /*
- * finish off updating the recorded status of a file
+ * handle remote file deletion by discarding the callback promise
+ */
+static void afs_vnode_deleted_remotely(struct afs_vnode *vnode)
+{
+	struct afs_server *server;
+
+	set_bit(AFS_VNODE_DELETED, &vnode->flags);
+
+	server = vnode->server;
+	if (vnode->cb_promised) {
+		spin_lock(&server->cb_lock);
+		if (vnode->cb_promised) {
+			rb_erase(&vnode->cb_promise, &server->cb_promises);
+			vnode->cb_promised = false;
+		}
+		spin_unlock(&server->cb_lock);
+	}
+
+	spin_lock(&vnode->server->fs_lock);
+	rb_erase(&vnode->server_rb, &vnode->server->fs_vnodes);
+	spin_unlock(&vnode->server->fs_lock);
+
+	vnode->server = NULL;
+	afs_put_server(server);
+}
+
+/*
+ * finish off updating the recorded status of a file after a successful
+ * operation completion
  * - starts callback expiry timer
  * - adds to server's callback list
  */
-static void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
-					     struct afs_server *server,
-					     int ret)
+void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
+				      struct afs_server *server)
 {
 	struct afs_server *oldserver = NULL;
 
-	_enter("%p,%p,%d", vnode, server, ret);
+	_enter("%p,%p", vnode, server);
 
 	spin_lock(&vnode->lock);
+	clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
+	afs_vnode_note_promise(vnode, server);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+
+	wake_up_all(&vnode->update_waitq);
+	afs_put_server(oldserver);
+	_leave("");
+}
 
-	vnode->flags &= ~AFS_VNODE_CHANGED;
+/*
+ * finish off updating the recorded status of a file after an operation failed
+ */
+static void afs_vnode_status_update_failed(struct afs_vnode *vnode, int ret)
+{
+	_enter("%p,%d", vnode, ret);
 
-	if (ret == 0) {
-		/* adjust the callback timeout appropriately */
-		afs_kafstimod_add_timer(&vnode->cb_timeout,
-					vnode->cb_expiry * HZ);
-
-		spin_lock(&afs_cb_hash_lock);
-		list_move_tail(&vnode->cb_hash_link,
-			      &afs_cb_hash(server, &vnode->fid));
-		spin_unlock(&afs_cb_hash_lock);
-
-		/* swap ref to old callback server with that for new callback
-		 * server */
-		oldserver = xchg(&vnode->cb_server, server);
-		if (oldserver != server) {
-			if (oldserver) {
-				spin_lock(&oldserver->cb_lock);
-				list_del_init(&vnode->cb_link);
-				spin_unlock(&oldserver->cb_lock);
-			}
+	spin_lock(&vnode->lock);
 
-			afs_get_server(server);
-			spin_lock(&server->cb_lock);
-			list_add_tail(&vnode->cb_link, &server->cb_promises);
-			spin_unlock(&server->cb_lock);
-		}
-		else {
-			/* same server */
-			oldserver = NULL;
-		}
-	}
-	else if (ret == -ENOENT) {
-		/* the file was deleted - clear the callback timeout */
-		oldserver = xchg(&vnode->cb_server, NULL);
-		afs_kafstimod_del_timer(&vnode->cb_timeout);
+	clear_bit(AFS_VNODE_CB_BROKEN, &vnode->flags);
 
+	if (ret == -ENOENT) {
+		/* the file was deleted on the server */
 		_debug("got NOENT from server - marking file deleted");
-		vnode->flags |= AFS_VNODE_DELETED;
+		afs_vnode_deleted_remotely(vnode);
 	}
 
 	vnode->update_cnt--;
-
+	ASSERTCMP(vnode->update_cnt, >=, 0);
 	spin_unlock(&vnode->lock);
 
 	wake_up_all(&vnode->update_waitq);
-
-	afs_put_server(oldserver);
-
 	_leave("");
+}
 
-} /* end afs_vnode_finalise_status_update() */
-
-/*****************************************************************************/
 /*
  * fetch file status from the volume
  * - don't issue a fetch if:
@@ -157,9 +252,11 @@ static void afs_vnode_finalise_status_update(struct afs_vnode *vnode,
  *   - there are any outstanding ops that will fetch the status
  * - TODO implement local caching
  */
-int afs_vnode_fetch_status(struct afs_vnode *vnode)
+int afs_vnode_fetch_status(struct afs_vnode *vnode,
+			   struct afs_vnode *auth_vnode, struct key *key)
 {
 	struct afs_server *server;
+	unsigned long acl_order;
 	int ret;
 
 	DECLARE_WAITQUEUE(myself, current);
@@ -168,38 +265,49 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode)
 	       vnode->volume->vlocation->vldb.name,
 	       vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique);
 
-	if (!(vnode->flags & AFS_VNODE_CHANGED) && vnode->cb_server) {
+	if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+	    vnode->cb_promised) {
 		_leave(" [unchanged]");
 		return 0;
 	}
 
-	if (vnode->flags & AFS_VNODE_DELETED) {
+	if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
 		_leave(" [deleted]");
 		return -ENOENT;
 	}
 
+	acl_order = 0;
+	if (auth_vnode)
+		acl_order = auth_vnode->acl_order;
+
 	spin_lock(&vnode->lock);
 
-	if (!(vnode->flags & AFS_VNODE_CHANGED)) {
+	if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags) &&
+	    vnode->cb_promised) {
 		spin_unlock(&vnode->lock);
 		_leave(" [unchanged]");
 		return 0;
 	}
 
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+
 	if (vnode->update_cnt > 0) {
 		/* someone else started a fetch */
+		_debug("wait on fetch %d", vnode->update_cnt);
+
 		set_current_state(TASK_UNINTERRUPTIBLE);
+		ASSERT(myself.func != NULL);
 		add_wait_queue(&vnode->update_waitq, &myself);
 
 		/* wait for the status to be updated */
 		for (;;) {
-			if (!(vnode->flags & AFS_VNODE_CHANGED))
+			if (!test_bit(AFS_VNODE_CB_BROKEN, &vnode->flags))
 				break;
-			if (vnode->flags & AFS_VNODE_DELETED)
+			if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
 				break;
 
-			/* it got updated and invalidated all before we saw
-			 * it */
+			/* check to see if it got updated and invalidated all
+			 * before we saw it */
 			if (vnode->update_cnt == 0) {
 				remove_wait_queue(&vnode->update_waitq,
 						  &myself);
@@ -219,10 +327,11 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode)
 		spin_unlock(&vnode->lock);
 		set_current_state(TASK_RUNNING);
 
-		return vnode->flags & AFS_VNODE_DELETED ? -ENOENT : 0;
+		return test_bit(AFS_VNODE_DELETED, &vnode->flags) ?
+			-ENOENT : 0;
 	}
 
- get_anyway:
+get_anyway:
 	/* okay... we're going to have to initiate the op */
 	vnode->update_cnt++;
 
@@ -232,39 +341,60 @@ int afs_vnode_fetch_status(struct afs_vnode *vnode)
 	 * vnode */
 	do {
 		/* pick a server to query */
-		ret = afs_volume_pick_fileserver(vnode->volume, &server);
-		if (ret<0)
-			return ret;
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
 
-		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+		_debug("USING SERVER: %p{%08x}",
+		       server, ntohl(server->addr.s_addr));
 
-		ret = afs_rxfs_fetch_file_status(server, vnode, NULL);
+		ret = afs_fs_fetch_file_status(server, key, vnode, NULL,
+					       &afs_sync_call);
 
-	} while (!afs_volume_release_fileserver(vnode->volume, server, ret));
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
 	/* adjust the flags */
-	afs_vnode_finalise_status_update(vnode, server, ret);
+	if (ret == 0) {
+		_debug("adjust");
+		if (auth_vnode)
+			afs_cache_permit(vnode, key, acl_order);
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		_debug("failed [%d]", ret);
+		afs_vnode_status_update_failed(vnode, ret);
+	}
 
-	_leave(" = %d", ret);
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
 	return ret;
-} /* end afs_vnode_fetch_status() */
 
-/*****************************************************************************/
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
 /*
  * fetch file data from the volume
- * - TODO implement caching and server failover
+ * - TODO implement caching
  */
-int afs_vnode_fetch_data(struct afs_vnode *vnode,
-			 struct afs_rxfs_fetch_descriptor *desc)
+int afs_vnode_fetch_data(struct afs_vnode *vnode, struct key *key,
+			 off_t offset, size_t length, struct page *page)
 {
 	struct afs_server *server;
 	int ret;
 
-	_enter("%s,{%u,%u,%u}",
+	_enter("%s{%u,%u,%u},%x,,,",
 	       vnode->volume->vlocation->vldb.name,
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
-	       vnode->fid.unique);
+	       vnode->fid.unique,
+	       key_serial(key));
 
 	/* this op will fetch the status */
 	spin_lock(&vnode->lock);
@@ -275,120 +405,351 @@ int afs_vnode_fetch_data(struct afs_vnode *vnode,
 	 * vnode */
 	do {
 		/* pick a server to query */
-		ret = afs_volume_pick_fileserver(vnode->volume, &server);
-		if (ret < 0)
-			return ret;
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
 
 		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
-		ret = afs_rxfs_fetch_file_data(server, vnode, desc, NULL);
+		ret = afs_fs_fetch_data(server, key, vnode, offset, length,
+					page, &afs_sync_call);
 
-	} while (!afs_volume_release_fileserver(vnode->volume, server, ret));
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
 
 	/* adjust the flags */
-	afs_vnode_finalise_status_update(vnode, server, ret);
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+	}
 
 	_leave(" = %d", ret);
 	return ret;
 
-} /* end afs_vnode_fetch_data() */
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	return PTR_ERR(server);
+}
 
-/*****************************************************************************/
 /*
- * break any outstanding callback on a vnode
- * - only relevent to server that issued it
+ * make a file or a directory
  */
-int afs_vnode_give_up_callback(struct afs_vnode *vnode)
+int afs_vnode_create(struct afs_vnode *vnode, struct key *key,
+		     const char *name, umode_t mode, struct afs_fid *newfid,
+		     struct afs_file_status *newstatus,
+		     struct afs_callback *newcb, struct afs_server **_server)
 {
 	struct afs_server *server;
 	int ret;
 
-	_enter("%s,{%u,%u,%u}",
+	_enter("%s{%u,%u,%u},%x,%s,,",
 	       vnode->volume->vlocation->vldb.name,
 	       vnode->fid.vid,
 	       vnode->fid.vnode,
-	       vnode->fid.unique);
-
-	spin_lock(&afs_cb_hash_lock);
-	list_del_init(&vnode->cb_hash_link);
-	spin_unlock(&afs_cb_hash_lock);
+	       vnode->fid.unique,
+	       key_serial(key),
+	       name);
 
-	/* set the changed flag in the vnode and release the server */
+	/* this op will fetch the status on the directory we're creating in */
 	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
 
-	afs_kafstimod_del_timer(&vnode->cb_timeout);
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
 
-	server = xchg(&vnode->cb_server, NULL);
-	if (server) {
-		vnode->flags |= AFS_VNODE_CHANGED;
+		ret = afs_fs_create(server, key, vnode, name, mode, newfid,
+				    newstatus, newcb, &afs_sync_call);
 
-		spin_lock(&server->cb_lock);
-		list_del_init(&vnode->cb_link);
-		spin_unlock(&server->cb_lock);
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		*_server = server;
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+		*_server = NULL;
 	}
 
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
 	spin_unlock(&vnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
 
-	ret = 0;
-	if (server) {
-		ret = afs_rxfs_give_up_callback(server, vnode);
+/*
+ * remove a file or directory
+ */
+int afs_vnode_remove(struct afs_vnode *vnode, struct key *key, const char *name,
+		     bool isdir)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%u,%u,%u},%x,%s",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key),
+	       name);
+
+	/* this op will fetch the status on the directory we're removing from */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_remove(server, key, vnode, name, isdir,
+				    &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
 		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
 	}
 
-	_leave(" = %d", ret);
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
 	return ret;
-} /* end afs_vnode_give_up_callback() */
 
-/*****************************************************************************/
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
 /*
- * match a vnode record stored in the cache
+ * create a hard link
  */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_vnode_cache_match(void *target,
-						 const void *entry)
+extern int afs_vnode_link(struct afs_vnode *dvnode, struct afs_vnode *vnode,
+			  struct key *key, const char *name)
 {
-	const struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = target;
+	struct afs_server *server;
+	int ret;
 
-	_enter("{%x,%x,%Lx},{%x,%x,%Lx}",
+	_enter("%s{%u,%u,%u},%s{%u,%u,%u},%x,%s",
+	       dvnode->volume->vlocation->vldb.name,
+	       dvnode->fid.vid,
+	       dvnode->fid.vnode,
+	       dvnode->fid.unique,
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
 	       vnode->fid.vnode,
 	       vnode->fid.unique,
-	       vnode->status.version,
-	       cvnode->vnode_id,
-	       cvnode->vnode_unique,
-	       cvnode->data_version);
-
-	if (vnode->fid.vnode != cvnode->vnode_id) {
-		_leave(" = FAILED");
-		return CACHEFS_MATCH_FAILED;
+	       key_serial(key),
+	       name);
+
+	/* this op will fetch the status on the directory we're removing from */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+	spin_lock(&dvnode->lock);
+	dvnode->update_cnt++;
+	spin_unlock(&dvnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(dvnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_link(server, key, dvnode, vnode, name,
+				  &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(dvnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		afs_vnode_finalise_status_update(dvnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+		afs_vnode_status_update_failed(dvnode, ret);
 	}
 
-	if (vnode->fid.unique != cvnode->vnode_unique ||
-	    vnode->status.version != cvnode->data_version) {
-		_leave(" = DELETE");
-		return CACHEFS_MATCH_SUCCESS_DELETE;
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	spin_lock(&dvnode->lock);
+	dvnode->update_cnt--;
+	ASSERTCMP(dvnode->update_cnt, >=, 0);
+	spin_unlock(&dvnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
+
+/*
+ * create a symbolic link
+ */
+int afs_vnode_symlink(struct afs_vnode *vnode, struct key *key,
+		      const char *name, const char *content,
+		      struct afs_fid *newfid,
+		      struct afs_file_status *newstatus,
+		      struct afs_server **_server)
+{
+	struct afs_server *server;
+	int ret;
+
+	_enter("%s{%u,%u,%u},%x,%s,%s,,,",
+	       vnode->volume->vlocation->vldb.name,
+	       vnode->fid.vid,
+	       vnode->fid.vnode,
+	       vnode->fid.unique,
+	       key_serial(key),
+	       name, content);
+
+	/* this op will fetch the status on the directory we're creating in */
+	spin_lock(&vnode->lock);
+	vnode->update_cnt++;
+	spin_unlock(&vnode->lock);
+
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(vnode);
+		if (IS_ERR(server))
+			goto no_server;
+
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_symlink(server, key, vnode, name, content,
+				     newfid, newstatus, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(vnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(vnode, server);
+		*_server = server;
+	} else {
+		afs_vnode_status_update_failed(vnode, ret);
+		*_server = NULL;
 	}
 
-	_leave(" = SUCCESS");
-	return CACHEFS_MATCH_SUCCESS;
-} /* end afs_vnode_cache_match() */
-#endif
+	_leave(" = %d [cnt %d]", ret, vnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&vnode->lock);
+	vnode->update_cnt--;
+	ASSERTCMP(vnode->update_cnt, >=, 0);
+	spin_unlock(&vnode->lock);
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), vnode->update_cnt);
+	return PTR_ERR(server);
+}
 
-/*****************************************************************************/
 /*
- * update a vnode record stored in the cache
+ * rename a file
  */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_vnode_cache_update(void *source, void *entry)
+int afs_vnode_rename(struct afs_vnode *orig_dvnode,
+		     struct afs_vnode *new_dvnode,
+		     struct key *key,
+		     const char *orig_name,
+		     const char *new_name)
 {
-	struct afs_cache_vnode *cvnode = entry;
-	struct afs_vnode *vnode = source;
+	struct afs_server *server;
+	int ret;
 
-	_enter("");
+	_enter("%s{%u,%u,%u},%s{%u,%u,%u},%x,%s,%s",
+	       orig_dvnode->volume->vlocation->vldb.name,
+	       orig_dvnode->fid.vid,
+	       orig_dvnode->fid.vnode,
+	       orig_dvnode->fid.unique,
+	       new_dvnode->volume->vlocation->vldb.name,
+	       new_dvnode->fid.vid,
+	       new_dvnode->fid.vnode,
+	       new_dvnode->fid.unique,
+	       key_serial(key),
+	       orig_name,
+	       new_name);
+
+	/* this op will fetch the status on both the directories we're dealing
+	 * with */
+	spin_lock(&orig_dvnode->lock);
+	orig_dvnode->update_cnt++;
+	spin_unlock(&orig_dvnode->lock);
+	if (new_dvnode != orig_dvnode) {
+		spin_lock(&new_dvnode->lock);
+		new_dvnode->update_cnt++;
+		spin_unlock(&new_dvnode->lock);
+	}
 
-	cvnode->vnode_id	= vnode->fid.vnode;
-	cvnode->vnode_unique	= vnode->fid.unique;
-	cvnode->data_version	= vnode->status.version;
+	do {
+		/* pick a server to query */
+		server = afs_volume_pick_fileserver(orig_dvnode);
+		if (IS_ERR(server))
+			goto no_server;
 
-} /* end afs_vnode_cache_update() */
-#endif
+		_debug("USING SERVER: %08x\n", ntohl(server->addr.s_addr));
+
+		ret = afs_fs_rename(server, key, orig_dvnode, orig_name,
+				    new_dvnode, new_name, &afs_sync_call);
+
+	} while (!afs_volume_release_fileserver(orig_dvnode, server, ret));
+
+	/* adjust the flags */
+	if (ret == 0) {
+		afs_vnode_finalise_status_update(orig_dvnode, server);
+		if (new_dvnode != orig_dvnode)
+			afs_vnode_finalise_status_update(new_dvnode, server);
+		afs_put_server(server);
+	} else {
+		afs_vnode_status_update_failed(orig_dvnode, ret);
+		if (new_dvnode != orig_dvnode)
+			afs_vnode_status_update_failed(new_dvnode, ret);
+	}
+
+	_leave(" = %d [cnt %d]", ret, orig_dvnode->update_cnt);
+	return ret;
+
+no_server:
+	spin_lock(&orig_dvnode->lock);
+	orig_dvnode->update_cnt--;
+	ASSERTCMP(orig_dvnode->update_cnt, >=, 0);
+	spin_unlock(&orig_dvnode->lock);
+	if (new_dvnode != orig_dvnode) {
+		spin_lock(&new_dvnode->lock);
+		new_dvnode->update_cnt--;
+		ASSERTCMP(new_dvnode->update_cnt, >=, 0);
+		spin_unlock(&new_dvnode->lock);
+	}
+	_leave(" = %ld [cnt %d]", PTR_ERR(server), orig_dvnode->update_cnt);
+	return PTR_ERR(server);
+}
diff --git a/fs/afs/vnode.h b/fs/afs/vnode.h
deleted file mode 100644
index b86a97102e8b..000000000000
--- a/fs/afs/vnode.h
+++ /dev/null
@@ -1,94 +0,0 @@
-/* vnode.h: AFS vnode record
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_VNODE_H
-#define _LINUX_AFS_VNODE_H
-
-#include <linux/fs.h>
-#include "server.h"
-#include "kafstimod.h"
-#include "cache.h"
-
-#ifdef __KERNEL__
-
-struct afs_rxfs_fetch_descriptor;
-
-/*****************************************************************************/
-/*
- * vnode catalogue entry
- */
-struct afs_cache_vnode
-{
-	afs_vnodeid_t		vnode_id;	/* vnode ID */
-	unsigned		vnode_unique;	/* vnode ID uniquifier */
-	afs_dataversion_t	data_version;	/* data version */
-};
-
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vnode_cache_index_def;
-#endif
-
-/*****************************************************************************/
-/*
- * AFS inode private data
- */
-struct afs_vnode
-{
-	struct inode		vfs_inode;	/* the VFS's inode record */
-
-	struct afs_volume	*volume;	/* volume on which vnode resides */
-	struct afs_fid		fid;		/* the file identifier for this inode */
-	struct afs_file_status	status;		/* AFS status info for this file */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
-#endif
-
-	wait_queue_head_t	update_waitq;	/* status fetch waitqueue */
-	unsigned		update_cnt;	/* number of outstanding ops that will update the
-						 * status */
-	spinlock_t		lock;		/* waitqueue/flags lock */
-	unsigned		flags;
-#define AFS_VNODE_CHANGED	0x00000001	/* set if vnode reported changed by callback */
-#define AFS_VNODE_DELETED	0x00000002	/* set if vnode deleted on server */
-#define AFS_VNODE_MOUNTPOINT	0x00000004	/* set if vnode is a mountpoint symlink */
-
-	/* outstanding callback notification on this file */
-	struct afs_server	*cb_server;	/* server that made the current promise */
-	struct list_head	cb_link;	/* link in server's promises list */
-	struct list_head	cb_hash_link;	/* link in master callback hash */
-	struct afs_timer	cb_timeout;	/* timeout on promise */
-	unsigned		cb_version;	/* callback version */
-	unsigned		cb_expiry;	/* callback expiry time */
-	afs_callback_type_t	cb_type;	/* type of callback */
-};
-
-static inline struct afs_vnode *AFS_FS_I(struct inode *inode)
-{
-	return container_of(inode,struct afs_vnode,vfs_inode);
-}
-
-static inline struct inode *AFS_VNODE_TO_I(struct afs_vnode *vnode)
-{
-	return &vnode->vfs_inode;
-}
-
-extern int afs_vnode_fetch_status(struct afs_vnode *vnode);
-
-extern int afs_vnode_fetch_data(struct afs_vnode *vnode,
-				struct afs_rxfs_fetch_descriptor *desc);
-
-extern int afs_vnode_give_up_callback(struct afs_vnode *vnode);
-
-extern struct afs_timer_ops afs_vnode_cb_timed_out_ops;
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_AFS_VNODE_H */
diff --git a/fs/afs/volume.c b/fs/afs/volume.c
index 768c6dbd323a..dd160cada45d 100644
--- a/fs/afs/volume.c
+++ b/fs/afs/volume.c
@@ -1,6 +1,6 @@
-/* volume.c: AFS volume management
+/* AFS volume management
  *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
+ * Copyright (C) 2002, 2007 Red Hat, Inc. All Rights Reserved.
  * Written by David Howells (dhowells@redhat.com)
  *
  * This program is free software; you can redistribute it and/or
@@ -15,35 +15,10 @@
 #include <linux/slab.h>
 #include <linux/fs.h>
 #include <linux/pagemap.h>
-#include "volume.h"
-#include "vnode.h"
-#include "cell.h"
-#include "cache.h"
-#include "cmservice.h"
-#include "fsclient.h"
-#include "vlclient.h"
 #include "internal.h"
 
-#ifdef __KDEBUG
 static const char *afs_voltypes[] = { "R/W", "R/O", "BAK" };
-#endif
-
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry);
-static void afs_volume_cache_update(void *source, void *entry);
-
-struct cachefs_index_def afs_volume_cache_index_def = {
-	.name		= "volume",
-	.data_size	= sizeof(struct afs_cache_vhash),
-	.keys[0]	= { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.keys[1]	= { CACHEFS_INDEX_KEYS_BIN, 1 },
-	.match		= afs_volume_cache_match,
-	.update		= afs_volume_cache_update,
-};
-#endif
 
-/*****************************************************************************/
 /*
  * lookup a volume by name
  * - this can be one of the following:
@@ -66,118 +41,52 @@ struct cachefs_index_def afs_volume_cache_index_def = {
  * - Rule 3: If parent volume is R/W, then only mount R/W volume unless
  *           explicitly told otherwise
  */
-int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
-		      struct afs_volume **_volume)
+struct afs_volume *afs_volume_lookup(struct afs_mount_params *params)
 {
 	struct afs_vlocation *vlocation = NULL;
 	struct afs_volume *volume = NULL;
-	afs_voltype_t type;
-	const char *cellname, *volname, *suffix;
+	struct afs_server *server = NULL;
 	char srvtmask;
-	int force, ret, loop, cellnamesz, volnamesz;
-
-	_enter("%s,,%d,", name, rwpath);
-
-	if (!name || (name[0] != '%' && name[0] != '#') || !name[1]) {
-		printk("kAFS: unparsable volume name\n");
-		return -EINVAL;
-	}
-
-	/* determine the type of volume we're looking for */
-	force = 0;
-	type = AFSVL_ROVOL;
-
-	if (rwpath || name[0] == '%') {
-		type = AFSVL_RWVOL;
-		force = 1;
-	}
-
-	suffix = strrchr(name, '.');
-	if (suffix) {
-		if (strcmp(suffix, ".readonly") == 0) {
-			type = AFSVL_ROVOL;
-			force = 1;
-		}
-		else if (strcmp(suffix, ".backup") == 0) {
-			type = AFSVL_BACKVOL;
-			force = 1;
-		}
-		else if (suffix[1] == 0) {
-		}
-		else {
-			suffix = NULL;
-		}
-	}
+	int ret, loop;
 
-	/* split the cell and volume names */
-	name++;
-	volname = strchr(name, ':');
-	if (volname) {
-		cellname = name;
-		cellnamesz = volname - name;
-		volname++;
-	}
-	else {
-		volname = name;
-		cellname = NULL;
-		cellnamesz = 0;
-	}
-
-	volnamesz = suffix ? suffix - volname : strlen(volname);
-
-	_debug("CELL:%*.*s [%p] VOLUME:%*.*s SUFFIX:%s TYPE:%d%s",
-	       cellnamesz, cellnamesz, cellname ?: "", cell,
-	       volnamesz, volnamesz, volname, suffix ?: "-",
-	       type,
-	       force ? " FORCE" : "");
-
-	/* lookup the cell record */
-	if (cellname || !cell) {
-		ret = afs_cell_lookup(cellname, cellnamesz, &cell);
-		if (ret<0) {
-			printk("kAFS: unable to lookup cell '%s'\n",
-			       cellname ?: "");
-			goto error;
-		}
-	}
-	else {
-		afs_get_cell(cell);
-	}
+	_enter("{%*.*s,%d}",
+	       params->volnamesz, params->volnamesz, params->volname, params->rwpath);
 
 	/* lookup the volume location record */
-	ret = afs_vlocation_lookup(cell, volname, volnamesz, &vlocation);
-	if (ret < 0)
+	vlocation = afs_vlocation_lookup(params->cell, params->key,
+					 params->volname, params->volnamesz);
+	if (IS_ERR(vlocation)) {
+		ret = PTR_ERR(vlocation);
+		vlocation = NULL;
 		goto error;
+	}
 
 	/* make the final decision on the type we want */
 	ret = -ENOMEDIUM;
-	if (force && !(vlocation->vldb.vidmask & (1 << type)))
+	if (params->force && !(vlocation->vldb.vidmask & (1 << params->type)))
 		goto error;
 
 	srvtmask = 0;
 	for (loop = 0; loop < vlocation->vldb.nservers; loop++)
 		srvtmask |= vlocation->vldb.srvtmask[loop];
 
-	if (force) {
-		if (!(srvtmask & (1 << type)))
+	if (params->force) {
+		if (!(srvtmask & (1 << params->type)))
 			goto error;
-	}
-	else if (srvtmask & AFS_VOL_VTM_RO) {
-		type = AFSVL_ROVOL;
-	}
-	else if (srvtmask & AFS_VOL_VTM_RW) {
-		type = AFSVL_RWVOL;
-	}
-	else {
+	} else if (srvtmask & AFS_VOL_VTM_RO) {
+		params->type = AFSVL_ROVOL;
+	} else if (srvtmask & AFS_VOL_VTM_RW) {
+		params->type = AFSVL_RWVOL;
+	} else {
 		goto error;
 	}
 
-	down_write(&cell->vl_sem);
+	down_write(&params->cell->vl_sem);
 
 	/* is the volume already active? */
-	if (vlocation->vols[type]) {
+	if (vlocation->vols[params->type]) {
 		/* yes - re-use it */
-		volume = vlocation->vols[type];
+		volume = vlocation->vols[params->type];
 		afs_get_volume(volume);
 		goto success;
 	}
@@ -191,23 +100,24 @@ int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
 		goto error_up;
 
 	atomic_set(&volume->usage, 1);
-	volume->type		= type;
-	volume->type_force	= force;
-	volume->cell		= cell;
-	volume->vid		= vlocation->vldb.vid[type];
+	volume->type		= params->type;
+	volume->type_force	= params->force;
+	volume->cell		= params->cell;
+	volume->vid		= vlocation->vldb.vid[params->type];
 
 	init_rwsem(&volume->server_sem);
 
 	/* look up all the applicable server records */
 	for (loop = 0; loop < 8; loop++) {
 		if (vlocation->vldb.srvtmask[loop] & (1 << volume->type)) {
-			ret = afs_server_lookup(
-				volume->cell,
-				&vlocation->vldb.servers[loop],
-				&volume->servers[volume->nservers]);
-			if (ret < 0)
+			server = afs_lookup_server(
+			       volume->cell, &vlocation->vldb.servers[loop]);
+			if (IS_ERR(server)) {
+				ret = PTR_ERR(server);
 				goto error_discard;
+			}
 
+			volume->servers[volume->nservers] = server;
 			volume->nservers++;
 		}
 	}
@@ -223,35 +133,34 @@ int afs_volume_lookup(const char *name, struct afs_cell *cell, int rwpath,
 	afs_get_vlocation(vlocation);
 	volume->vlocation = vlocation;
 
-	vlocation->vols[type] = volume;
+	vlocation->vols[volume->type] = volume;
 
- success:
+success:
 	_debug("kAFS selected %s volume %08x",
 	       afs_voltypes[volume->type], volume->vid);
-	*_volume = volume;
-	ret = 0;
+	up_write(&params->cell->vl_sem);
+	afs_put_vlocation(vlocation);
+	_leave(" = %p", volume);
+	return volume;
 
 	/* clean up */
- error_up:
-	up_write(&cell->vl_sem);
- error:
+error_up:
+	up_write(&params->cell->vl_sem);
+error:
 	afs_put_vlocation(vlocation);
-	afs_put_cell(cell);
-
-	_leave(" = %d (%p)", ret, volume);
-	return ret;
+	_leave(" = %d", ret);
+	return ERR_PTR(ret);
 
- error_discard:
-	up_write(&cell->vl_sem);
+error_discard:
+	up_write(&params->cell->vl_sem);
 
 	for (loop = volume->nservers - 1; loop >= 0; loop--)
 		afs_put_server(volume->servers[loop]);
 
 	kfree(volume);
 	goto error;
-} /* end afs_volume_lookup() */
+}
 
-/*****************************************************************************/
 /*
  * destroy a volume record
  */
@@ -265,10 +174,9 @@ void afs_put_volume(struct afs_volume *volume)
 
 	_enter("%p", volume);
 
-	vlocation = volume->vlocation;
+	ASSERTCMP(atomic_read(&volume->usage), >, 0);
 
-	/* sanity check */
-	BUG_ON(atomic_read(&volume->usage) <= 0);
+	vlocation = volume->vlocation;
 
 	/* to prevent a race, the decrement and the dequeue must be effectively
 	 * atomic */
@@ -296,21 +204,27 @@ void afs_put_volume(struct afs_volume *volume)
 	kfree(volume);
 
 	_leave(" [destroyed]");
-} /* end afs_put_volume() */
+}
 
-/*****************************************************************************/
 /*
  * pick a server to use to try accessing this volume
  * - returns with an elevated usage count on the server chosen
  */
-int afs_volume_pick_fileserver(struct afs_volume *volume,
-			       struct afs_server **_server)
+struct afs_server *afs_volume_pick_fileserver(struct afs_vnode *vnode)
 {
+	struct afs_volume *volume = vnode->volume;
 	struct afs_server *server;
 	int ret, state, loop;
 
 	_enter("%s", volume->vlocation->vldb.name);
 
+	/* stick with the server we're already using if we can */
+	if (vnode->server && vnode->server->fs_state == 0) {
+		afs_get_server(vnode->server);
+		_leave(" = %p [current]", vnode->server);
+		return vnode->server;
+	}
+
 	down_read(&volume->server_sem);
 
 	/* handle the no-server case */
@@ -318,7 +232,7 @@ int afs_volume_pick_fileserver(struct afs_volume *volume,
 		ret = volume->rjservers ? -ENOMEDIUM : -ESTALE;
 		up_read(&volume->server_sem);
 		_leave(" = %d [no servers]", ret);
-		return ret;
+		return ERR_PTR(ret);
 	}
 
 	/* basically, just search the list for the first live server and use
@@ -328,15 +242,16 @@ int afs_volume_pick_fileserver(struct afs_volume *volume,
 		server = volume->servers[loop];
 		state = server->fs_state;
 
+		_debug("consider %d [%d]", loop, state);
+
 		switch (state) {
 			/* found an apparently healthy server */
 		case 0:
 			afs_get_server(server);
 			up_read(&volume->server_sem);
-			*_server = server;
-			_leave(" = 0 (picked %08x)",
-			       ntohl(server->addr.s_addr));
-			return 0;
+			_leave(" = %p (picked %08x)",
+			       server, ntohl(server->addr.s_addr));
+			return server;
 
 		case -ENETUNREACH:
 			if (ret == 0)
@@ -372,20 +287,21 @@ int afs_volume_pick_fileserver(struct afs_volume *volume,
 	 */
 	up_read(&volume->server_sem);
 	_leave(" = %d", ret);
-	return ret;
-} /* end afs_volume_pick_fileserver() */
+	return ERR_PTR(ret);
+}
 
-/*****************************************************************************/
 /*
  * release a server after use
  * - releases the ref on the server struct that was acquired by picking
  * - records result of using a particular server to access a volume
  * - return 0 to try again, 1 if okay or to issue error
+ * - the caller must release the server struct if result was 0
  */
-int afs_volume_release_fileserver(struct afs_volume *volume,
+int afs_volume_release_fileserver(struct afs_vnode *vnode,
 				  struct afs_server *server,
 				  int result)
 {
+	struct afs_volume *volume = vnode->volume;
 	unsigned loop;
 
 	_enter("%s,%08x,%d",
@@ -396,14 +312,16 @@ int afs_volume_release_fileserver(struct afs_volume *volume,
 		/* success */
 	case 0:
 		server->fs_act_jif = jiffies;
-		break;
+		server->fs_state = 0;
+		_leave("");
+		return 1;
 
 		/* the fileserver denied all knowledge of the volume */
 	case -ENOMEDIUM:
 		server->fs_act_jif = jiffies;
 		down_write(&volume->server_sem);
 
-		/* first, find where the server is in the active list (if it
+		/* firstly, find where the server is in the active list (if it
 		 * is) */
 		for (loop = 0; loop < volume->nservers; loop++)
 			if (volume->servers[loop] == server)
@@ -441,6 +359,7 @@ int afs_volume_release_fileserver(struct afs_volume *volume,
 	case -ENETUNREACH:
 	case -EHOSTUNREACH:
 	case -ECONNREFUSED:
+	case -ETIME:
 	case -ETIMEDOUT:
 	case -EREMOTEIO:
 		/* mark the server as dead
@@ -460,60 +379,17 @@ int afs_volume_release_fileserver(struct afs_volume *volume,
 		server->fs_act_jif = jiffies;
 	case -ENOMEM:
 	case -ENONET:
-		break;
+		/* tell the caller to accept the result */
+		afs_put_server(server);
+		_leave(" [local failure]");
+		return 1;
 	}
 
-	/* tell the caller to accept the result */
-	afs_put_server(server);
-	_leave("");
-	return 1;
-
 	/* tell the caller to loop around and try the next server */
- try_next_server_upw:
+try_next_server_upw:
 	up_write(&volume->server_sem);
- try_next_server:
+try_next_server:
 	afs_put_server(server);
 	_leave(" [try next server]");
 	return 0;
-
-} /* end afs_volume_release_fileserver() */
-
-/*****************************************************************************/
-/*
- * match a volume hash record stored in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static cachefs_match_val_t afs_volume_cache_match(void *target,
-						  const void *entry)
-{
-	const struct afs_cache_vhash *vhash = entry;
-	struct afs_volume *volume = target;
-
-	_enter("{%u},{%u}", volume->type, vhash->vtype);
-
-	if (volume->type == vhash->vtype) {
-		_leave(" = SUCCESS");
-		return CACHEFS_MATCH_SUCCESS;
-	}
-
-	_leave(" = FAILED");
-	return CACHEFS_MATCH_FAILED;
-} /* end afs_volume_cache_match() */
-#endif
-
-/*****************************************************************************/
-/*
- * update a volume hash record stored in the cache
- */
-#ifdef AFS_CACHING_SUPPORT
-static void afs_volume_cache_update(void *source, void *entry)
-{
-	struct afs_cache_vhash *vhash = entry;
-	struct afs_volume *volume = source;
-
-	_enter("");
-
-	vhash->vtype = volume->type;
-
-} /* end afs_volume_cache_update() */
-#endif
+}
diff --git a/fs/afs/volume.h b/fs/afs/volume.h
deleted file mode 100644
index bfdcf19ba3f3..000000000000
--- a/fs/afs/volume.h
+++ /dev/null
@@ -1,140 +0,0 @@
-/* volume.h: AFS volume management
- *
- * Copyright (C) 2002 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * as published by the Free Software Foundation; either version
- * 2 of the License, or (at your option) any later version.
- */
-
-#ifndef _LINUX_AFS_VOLUME_H
-#define _LINUX_AFS_VOLUME_H
-
-#include "types.h"
-#include "fsclient.h"
-#include "kafstimod.h"
-#include "kafsasyncd.h"
-#include "cache.h"
-
-typedef enum {
-	AFS_VLUPD_SLEEP,		/* sleeping waiting for update timer to fire */
-	AFS_VLUPD_PENDING,		/* on pending queue */
-	AFS_VLUPD_INPROGRESS,		/* op in progress */
-	AFS_VLUPD_BUSYSLEEP,		/* sleeping because server returned EBUSY */
-	
-} __attribute__((packed)) afs_vlocation_upd_t;
-
-/*****************************************************************************/
-/*
- * entry in the cached volume location catalogue
- */
-struct afs_cache_vlocation
-{
-	uint8_t			name[64];	/* volume name (lowercase, padded with NULs) */
-	uint8_t			nservers;	/* number of entries used in servers[] */
-	uint8_t			vidmask;	/* voltype mask for vid[] */
-	uint8_t			srvtmask[8];	/* voltype masks for servers[] */
-#define AFS_VOL_VTM_RW	0x01 /* R/W version of the volume is available (on this server) */
-#define AFS_VOL_VTM_RO	0x02 /* R/O version of the volume is available (on this server) */
-#define AFS_VOL_VTM_BAK	0x04 /* backup version of the volume is available (on this server) */
-
-	afs_volid_t		vid[3];		/* volume IDs for R/W, R/O and Bak volumes */
-	struct in_addr		servers[8];	/* fileserver addresses */
-	time_t			rtime;		/* last retrieval time */
-};
-
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_vlocation_cache_index_def;
-#endif
-
-/*****************************************************************************/
-/*
- * volume -> vnode hash table entry
- */
-struct afs_cache_vhash
-{
-	afs_voltype_t		vtype;		/* which volume variation */
-	uint8_t			hash_bucket;	/* which hash bucket this represents */
-} __attribute__((packed));
-
-#ifdef AFS_CACHING_SUPPORT
-extern struct cachefs_index_def afs_volume_cache_index_def;
-#endif
-
-/*****************************************************************************/
-/*
- * AFS volume location record
- */
-struct afs_vlocation
-{
-	atomic_t		usage;
-	struct list_head	link;		/* link in cell volume location list */
-	struct afs_timer	timeout;	/* decaching timer */
-	struct afs_cell		*cell;		/* cell to which volume belongs */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
-#endif
-	struct afs_cache_vlocation vldb;	/* volume information DB record */
-	struct afs_volume	*vols[3];	/* volume access record pointer (index by type) */
-	rwlock_t		lock;		/* access lock */
-	unsigned long		read_jif;	/* time at which last read from vlserver */
-	struct afs_timer	upd_timer;	/* update timer */
-	struct afs_async_op	upd_op;		/* update operation */
-	afs_vlocation_upd_t	upd_state;	/* update state */
-	unsigned short		upd_first_svix;	/* first server index during update */
-	unsigned short		upd_curr_svix;	/* current server index during update */
-	unsigned short		upd_rej_cnt;	/* ENOMEDIUM count during update */
-	unsigned short		upd_busy_cnt;	/* EBUSY count during update */
-	unsigned short		valid;		/* T if valid */
-};
-
-extern int afs_vlocation_lookup(struct afs_cell *cell,
-				const char *name,
-				unsigned namesz,
-				struct afs_vlocation **_vlocation);
-
-#define afs_get_vlocation(V) do { atomic_inc(&(V)->usage); } while(0)
-
-extern void afs_put_vlocation(struct afs_vlocation *vlocation);
-extern void afs_vlocation_do_timeout(struct afs_vlocation *vlocation);
-
-/*****************************************************************************/
-/*
- * AFS volume access record
- */
-struct afs_volume
-{
-	atomic_t		usage;
-	struct afs_cell		*cell;		/* cell to which belongs (unrefd ptr) */
-	struct afs_vlocation	*vlocation;	/* volume location */
-#ifdef AFS_CACHING_SUPPORT
-	struct cachefs_cookie	*cache;		/* caching cookie */
-#endif
-	afs_volid_t		vid;		/* volume ID */
-	afs_voltype_t		type;		/* type of volume */
-	char			type_force;	/* force volume type (suppress R/O -> R/W) */
-	unsigned short		nservers;	/* number of server slots filled */
-	unsigned short		rjservers;	/* number of servers discarded due to -ENOMEDIUM */
-	struct afs_server	*servers[8];	/* servers on which volume resides (ordered) */
-	struct rw_semaphore	server_sem;	/* lock for accessing current server */
-};
-
-extern int afs_volume_lookup(const char *name,
-			     struct afs_cell *cell,
-			     int rwpath,
-			     struct afs_volume **_volume);
-
-#define afs_get_volume(V) do { atomic_inc(&(V)->usage); } while(0)
-
-extern void afs_put_volume(struct afs_volume *volume);
-
-extern int afs_volume_pick_fileserver(struct afs_volume *volume,
-				      struct afs_server **_server);
-
-extern int afs_volume_release_fileserver(struct afs_volume *volume,
-					 struct afs_server *server,
-					 int result);
-
-#endif /* _LINUX_AFS_VOLUME_H */
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index 8b1c5d8bf4ef..c68b055fa26e 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -266,6 +266,23 @@ static int do_siocgstamp(unsigned int fd, unsigned int cmd, unsigned long arg)
 	return err;
 }
 
+static int do_siocgstampns(unsigned int fd, unsigned int cmd, unsigned long arg)
+{
+	struct compat_timespec __user *up = compat_ptr(arg);
+	struct timespec kts;
+	mm_segment_t old_fs = get_fs();
+	int err;
+
+	set_fs(KERNEL_DS);
+	err = sys_ioctl(fd, cmd, (unsigned long)&kts);
+	set_fs(old_fs);
+	if (!err) {
+		err = put_user(kts.tv_sec, &up->tv_sec);
+		err |= __put_user(kts.tv_nsec, &up->tv_nsec);
+	}
+	return err;
+}
+
 struct ifmap32 {
 	compat_ulong_t mem_start;
 	compat_ulong_t mem_end;
@@ -2437,6 +2454,7 @@ HANDLE_IOCTL(SIOCBRDELIF, dev_ifsioc)
 /* Note SIOCRTMSG is no longer, so this is safe and * the user would have seen just an -EINVAL anyways. */
 HANDLE_IOCTL(SIOCRTMSG, ret_einval)
 HANDLE_IOCTL(SIOCGSTAMP, do_siocgstamp)
+HANDLE_IOCTL(SIOCGSTAMPNS, do_siocgstampns)
 #endif
 #ifdef CONFIG_BLOCK
 HANDLE_IOCTL(HDIO_GETGEO, hdio_getgeo)
diff --git a/fs/ecryptfs/netlink.c b/fs/ecryptfs/netlink.c
index e3aa2253c850..fe9186312d7c 100644
--- a/fs/ecryptfs/netlink.c
+++ b/fs/ecryptfs/netlink.c
@@ -97,7 +97,7 @@ out:
  */
 static int ecryptfs_process_nl_response(struct sk_buff *skb)
 {
-	struct nlmsghdr *nlh = (struct nlmsghdr*)skb->data;
+	struct nlmsghdr *nlh = nlmsg_hdr(skb);
 	struct ecryptfs_message *msg = NLMSG_DATA(nlh);
 	int rc;
 
@@ -181,7 +181,7 @@ receive:
 				"rc = [%d]\n", rc);
 		return;
 	}
-	nlh = (struct nlmsghdr *)skb->data;
+	nlh = nlmsg_hdr(skb);
 	if (!NLMSG_OK(nlh, skb->len)) {
 		ecryptfs_printk(KERN_ERR, "Received corrupt netlink "
 				"message\n");
@@ -229,7 +229,7 @@ int ecryptfs_init_netlink(void)
 
 	ecryptfs_nl_sock = netlink_kernel_create(NETLINK_ECRYPTFS, 0,
 						 ecryptfs_receive_nl_message,
-						 THIS_MODULE);
+						 NULL, THIS_MODULE);
 	if (!ecryptfs_nl_sock) {
 		rc = -EIO;
 		ecryptfs_printk(KERN_ERR, "Failed to create netlink socket\n");
diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c
index f27e5378caf2..a0c8667caa72 100644
--- a/fs/ocfs2/alloc.c
+++ b/fs/ocfs2/alloc.c
@@ -27,6 +27,7 @@
 #include <linux/types.h>
 #include <linux/slab.h>
 #include <linux/highmem.h>
+#include <linux/swap.h>
 
 #define MLOG_MASK_PREFIX ML_DISK_ALLOC
 #include <cluster/masklog.h>
@@ -34,6 +35,7 @@
 #include "ocfs2.h"
 
 #include "alloc.h"
+#include "aops.h"
 #include "dlmglue.h"
 #include "extent_map.h"
 #include "inode.h"
@@ -47,63 +49,243 @@
 
 #include "buffer_head_io.h"
 
-static int ocfs2_extent_contig(struct inode *inode,
-			       struct ocfs2_extent_rec *ext,
-			       u64 blkno);
+static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
 
-static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb,
-				     handle_t *handle,
-				     struct inode *inode,
-				     int wanted,
-				     struct ocfs2_alloc_context *meta_ac,
-				     struct buffer_head *bhs[]);
+/*
+ * Structures which describe a path through a btree, and functions to
+ * manipulate them.
+ *
+ * The idea here is to be as generic as possible with the tree
+ * manipulation code.
+ */
+struct ocfs2_path_item {
+	struct buffer_head		*bh;
+	struct ocfs2_extent_list	*el;
+};
 
-static int ocfs2_add_branch(struct ocfs2_super *osb,
-			    handle_t *handle,
-			    struct inode *inode,
-			    struct buffer_head *fe_bh,
-			    struct buffer_head *eb_bh,
-			    struct buffer_head *last_eb_bh,
-			    struct ocfs2_alloc_context *meta_ac);
+#define OCFS2_MAX_PATH_DEPTH	5
 
-static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
-				  handle_t *handle,
-				  struct inode *inode,
-				  struct buffer_head *fe_bh,
-				  struct ocfs2_alloc_context *meta_ac,
-				  struct buffer_head **ret_new_eb_bh);
+struct ocfs2_path {
+	int			p_tree_depth;
+	struct ocfs2_path_item	p_node[OCFS2_MAX_PATH_DEPTH];
+};
 
-static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
-				  handle_t *handle,
-				  struct inode *inode,
-				  struct buffer_head *fe_bh,
-				  u64 blkno,
-				  u32 new_clusters);
+#define path_root_bh(_path) ((_path)->p_node[0].bh)
+#define path_root_el(_path) ((_path)->p_node[0].el)
+#define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh)
+#define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el)
+#define path_num_items(_path) ((_path)->p_tree_depth + 1)
 
-static int ocfs2_find_branch_target(struct ocfs2_super *osb,
-				    struct inode *inode,
-				    struct buffer_head *fe_bh,
-				    struct buffer_head **target_bh);
+/*
+ * Reset the actual path elements so that we can re-use the structure
+ * to build another path. Generally, this involves freeing the buffer
+ * heads.
+ */
+static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root)
+{
+	int i, start = 0, depth = 0;
+	struct ocfs2_path_item *node;
 
-static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
-				       struct inode *inode,
-				       struct ocfs2_dinode *fe,
-				       unsigned int new_i_clusters,
-				       struct buffer_head *old_last_eb,
-				       struct buffer_head **new_last_eb);
+	if (keep_root)
+		start = 1;
+
+	for(i = start; i < path_num_items(path); i++) {
+		node = &path->p_node[i];
+
+		brelse(node->bh);
+		node->bh = NULL;
+		node->el = NULL;
+	}
+
+	/*
+	 * Tree depth may change during truncate, or insert. If we're
+	 * keeping the root extent list, then make sure that our path
+	 * structure reflects the proper depth.
+	 */
+	if (keep_root)
+		depth = le16_to_cpu(path_root_el(path)->l_tree_depth);
+
+	path->p_tree_depth = depth;
+}
+
+static void ocfs2_free_path(struct ocfs2_path *path)
+{
+	if (path) {
+		ocfs2_reinit_path(path, 0);
+		kfree(path);
+	}
+}
+
+/*
+ * Make the *dest path the same as src and re-initialize src path to
+ * have a root only.
+ */
+static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src)
+{
+	int i;
+
+	BUG_ON(path_root_bh(dest) != path_root_bh(src));
+
+	for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) {
+		brelse(dest->p_node[i].bh);
+
+		dest->p_node[i].bh = src->p_node[i].bh;
+		dest->p_node[i].el = src->p_node[i].el;
+
+		src->p_node[i].bh = NULL;
+		src->p_node[i].el = NULL;
+	}
+}
+
+/*
+ * Insert an extent block at given index.
+ *
+ * This will not take an additional reference on eb_bh.
+ */
+static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index,
+					struct buffer_head *eb_bh)
+{
+	struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data;
+
+	/*
+	 * Right now, no root bh is an extent block, so this helps
+	 * catch code errors with dinode trees. The assertion can be
+	 * safely removed if we ever need to insert extent block
+	 * structures at the root.
+	 */
+	BUG_ON(index == 0);
+
+	path->p_node[index].bh = eb_bh;
+	path->p_node[index].el = &eb->h_list;
+}
+
+static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh,
+					 struct ocfs2_extent_list *root_el)
+{
+	struct ocfs2_path *path;
+
+	BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH);
+
+	path = kzalloc(sizeof(*path), GFP_NOFS);
+	if (path) {
+		path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth);
+		get_bh(root_bh);
+		path_root_bh(path) = root_bh;
+		path_root_el(path) = root_el;
+	}
+
+	return path;
+}
+
+/*
+ * Allocate and initialize a new path based on a disk inode tree.
+ */
+static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh)
+{
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_extent_list *el = &di->id2.i_list;
+
+	return ocfs2_new_path(di_bh, el);
+}
+
+/*
+ * Convenience function to journal all components in a path.
+ */
+static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle,
+				     struct ocfs2_path *path)
+{
+	int i, ret = 0;
+
+	if (!path)
+		goto out;
+
+	for(i = 0; i < path_num_items(path); i++) {
+		ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+out:
+	return ret;
+}
+
+enum ocfs2_contig_type {
+	CONTIG_NONE = 0,
+	CONTIG_LEFT,
+	CONTIG_RIGHT
+};
 
-static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc);
 
-static int ocfs2_extent_contig(struct inode *inode,
-			       struct ocfs2_extent_rec *ext,
-			       u64 blkno)
+/*
+ * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and
+ * ocfs2_extent_contig only work properly against leaf nodes!
+ */
+static int ocfs2_block_extent_contig(struct super_block *sb,
+				     struct ocfs2_extent_rec *ext,
+				     u64 blkno)
+{
+	u64 blk_end = le64_to_cpu(ext->e_blkno);
+
+	blk_end += ocfs2_clusters_to_blocks(sb,
+				    le16_to_cpu(ext->e_leaf_clusters));
+
+	return blkno == blk_end;
+}
+
+static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left,
+				  struct ocfs2_extent_rec *right)
+{
+	u32 left_range;
+
+	left_range = le32_to_cpu(left->e_cpos) +
+		le16_to_cpu(left->e_leaf_clusters);
+
+	return (left_range == le32_to_cpu(right->e_cpos));
+}
+
+static enum ocfs2_contig_type
+	ocfs2_extent_contig(struct inode *inode,
+			    struct ocfs2_extent_rec *ext,
+			    struct ocfs2_extent_rec *insert_rec)
 {
-	return blkno == (le64_to_cpu(ext->e_blkno) +
-			 ocfs2_clusters_to_blocks(inode->i_sb,
-						  le32_to_cpu(ext->e_clusters)));
+	u64 blkno = le64_to_cpu(insert_rec->e_blkno);
+
+	if (ocfs2_extents_adjacent(ext, insert_rec) &&
+	    ocfs2_block_extent_contig(inode->i_sb, ext, blkno))
+			return CONTIG_RIGHT;
+
+	blkno = le64_to_cpu(ext->e_blkno);
+	if (ocfs2_extents_adjacent(insert_rec, ext) &&
+	    ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno))
+		return CONTIG_LEFT;
+
+	return CONTIG_NONE;
 }
 
 /*
+ * NOTE: We can have pretty much any combination of contiguousness and
+ * appending.
+ *
+ * The usefulness of APPEND_TAIL is more in that it lets us know that
+ * we'll have to update the path to that leaf.
+ */
+enum ocfs2_append_type {
+	APPEND_NONE = 0,
+	APPEND_TAIL,
+};
+
+struct ocfs2_insert_type {
+	enum ocfs2_append_type	ins_appending;
+	enum ocfs2_contig_type	ins_contig;
+	int			ins_contig_index;
+	int			ins_free_records;
+	int			ins_tree_depth;
+};
+
+/*
  * How many free extents have we got before we need more meta data?
  */
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
@@ -242,6 +424,28 @@ bail:
 }
 
 /*
+ * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth().
+ *
+ * Returns the sum of the rightmost extent rec logical offset and
+ * cluster count.
+ *
+ * ocfs2_add_branch() uses this to determine what logical cluster
+ * value should be populated into the leftmost new branch records.
+ *
+ * ocfs2_shift_tree_depth() uses this to determine the # clusters
+ * value for the new topmost tree record.
+ */
+static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list  *el)
+{
+	int i;
+
+	i = le16_to_cpu(el->l_next_free_rec) - 1;
+
+	return le32_to_cpu(el->l_recs[i].e_cpos) +
+		ocfs2_rec_clusters(el, &el->l_recs[i]);
+}
+
+/*
  * Add an entire tree branch to our inode. eb_bh is the extent block
  * to start at, if we don't want to start the branch at the dinode
  * structure.
@@ -250,7 +454,7 @@ bail:
  * for the new last extent block.
  *
  * the new branch will be 'empty' in the sense that every block will
- * contain a single record with e_clusters == 0.
+ * contain a single record with cluster count == 0.
  */
 static int ocfs2_add_branch(struct ocfs2_super *osb,
 			    handle_t *handle,
@@ -268,6 +472,7 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list  *eb_el;
 	struct ocfs2_extent_list  *el;
+	u32 new_cpos;
 
 	mlog_entry_void();
 
@@ -302,6 +507,9 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+	eb = (struct ocfs2_extent_block *)last_eb_bh->b_data;
+	new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list);
+
 	/* Note: new_eb_bhs[new_blocks - 1] is the guy which will be
 	 * linked with the rest of the tree.
 	 * conversly, new_eb_bhs[0] is the new bottommost leaf.
@@ -330,9 +538,18 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 		eb->h_next_leaf_blk = 0;
 		eb_el->l_tree_depth = cpu_to_le16(i);
 		eb_el->l_next_free_rec = cpu_to_le16(1);
-		eb_el->l_recs[0].e_cpos = fe->i_clusters;
+		/*
+		 * This actually counts as an empty extent as
+		 * c_clusters == 0
+		 */
+		eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos);
 		eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno);
-		eb_el->l_recs[0].e_clusters = cpu_to_le32(0);
+		/*
+		 * eb_el isn't always an interior node, but even leaf
+		 * nodes want a zero'd flags and reserved field so
+		 * this gets the whole 32 bits regardless of use.
+		 */
+		eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0);
 		if (!eb_el->l_tree_depth)
 			new_last_eb_blk = le64_to_cpu(eb->h_blkno);
 
@@ -376,8 +593,8 @@ static int ocfs2_add_branch(struct ocfs2_super *osb,
 	 * either be on the fe, or the extent block passed in. */
 	i = le16_to_cpu(el->l_next_free_rec);
 	el->l_recs[i].e_blkno = cpu_to_le64(next_blkno);
-	el->l_recs[i].e_cpos = fe->i_clusters;
-	el->l_recs[i].e_clusters = 0;
+	el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
+	el->l_recs[i].e_int_clusters = 0;
 	le16_add_cpu(&el->l_next_free_rec, 1);
 
 	/* fe needs a new last extent block pointer, as does the
@@ -425,6 +642,7 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 				  struct buffer_head **ret_new_eb_bh)
 {
 	int status, i;
+	u32 new_clusters;
 	struct buffer_head *new_eb_bh = NULL;
 	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
@@ -461,11 +679,8 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 	/* copy the fe data into the new extent block */
 	eb_el->l_tree_depth = fe_el->l_tree_depth;
 	eb_el->l_next_free_rec = fe_el->l_next_free_rec;
-	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
-		eb_el->l_recs[i].e_cpos = fe_el->l_recs[i].e_cpos;
-		eb_el->l_recs[i].e_clusters = fe_el->l_recs[i].e_clusters;
-		eb_el->l_recs[i].e_blkno = fe_el->l_recs[i].e_blkno;
-	}
+	for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
+		eb_el->l_recs[i] = fe_el->l_recs[i];
 
 	status = ocfs2_journal_dirty(handle, new_eb_bh);
 	if (status < 0) {
@@ -480,16 +695,15 @@ static int ocfs2_shift_tree_depth(struct ocfs2_super *osb,
 		goto bail;
 	}
 
+	new_clusters = ocfs2_sum_rightmost_rec(eb_el);
+
 	/* update fe now */
 	le16_add_cpu(&fe_el->l_tree_depth, 1);
 	fe_el->l_recs[0].e_cpos = 0;
 	fe_el->l_recs[0].e_blkno = eb->h_blkno;
-	fe_el->l_recs[0].e_clusters = fe->i_clusters;
-	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) {
-		fe_el->l_recs[i].e_cpos = 0;
-		fe_el->l_recs[i].e_clusters = 0;
-		fe_el->l_recs[i].e_blkno = 0;
-	}
+	fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters);
+	for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++)
+		memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec));
 	fe_el->l_next_free_rec = cpu_to_le16(1);
 
 	/* If this is our 1st tree depth shift, then last_eb_blk
@@ -515,199 +729,6 @@ bail:
 }
 
 /*
- * Expects the tree to already have room in the rightmost leaf for the
- * extent.  Updates all the extent blocks (and the dinode) on the way
- * down.
- */
-static int ocfs2_do_insert_extent(struct ocfs2_super *osb,
-				  handle_t *handle,
-				  struct inode *inode,
-				  struct buffer_head *fe_bh,
-				  u64 start_blk,
-				  u32 new_clusters)
-{
-	int status, i, num_bhs = 0;
-	u64 next_blkno;
-	u16 next_free;
-	struct buffer_head **eb_bhs = NULL;
-	struct ocfs2_dinode *fe;
-	struct ocfs2_extent_block *eb;
-	struct ocfs2_extent_list  *el;
-
-	mlog_entry_void();
-
-	status = ocfs2_journal_access(handle, inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
-
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	el = &fe->id2.i_list;
-	if (el->l_tree_depth) {
-		/* This is another operation where we want to be
-		 * careful about our tree updates. An error here means
-		 * none of the previous changes we made should roll
-		 * forward. As a result, we have to record the buffers
-		 * for this part of the tree in an array and reserve a
-		 * journal write to them before making any changes. */
-		num_bhs = le16_to_cpu(fe->id2.i_list.l_tree_depth);
-		eb_bhs = kcalloc(num_bhs, sizeof(struct buffer_head *),
-				 GFP_KERNEL);
-		if (!eb_bhs) {
-			status = -ENOMEM;
-			mlog_errno(status);
-			goto bail;
-		}
-
-		i = 0;
-		while(el->l_tree_depth) {
-			next_free = le16_to_cpu(el->l_next_free_rec);
-			if (next_free == 0) {
-				ocfs2_error(inode->i_sb,
-					    "Dinode %llu has a bad extent list",
-					    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-				status = -EIO;
-				goto bail;
-			}
-			next_blkno = le64_to_cpu(el->l_recs[next_free - 1].e_blkno);
-
-			BUG_ON(i >= num_bhs);
-			status = ocfs2_read_block(osb, next_blkno, &eb_bhs[i],
-						  OCFS2_BH_CACHED, inode);
-			if (status < 0) {
-				mlog_errno(status);
-				goto bail;
-			}
-			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
-			if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-				OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb,
-								 eb);
-				status = -EIO;
-				goto bail;
-			}
-
-			status = ocfs2_journal_access(handle, inode, eb_bhs[i],
-						      OCFS2_JOURNAL_ACCESS_WRITE);
-			if (status < 0) {
-				mlog_errno(status);
-				goto bail;
-			}
-
-			el = &eb->h_list;
-			i++;
-			/* When we leave this loop, eb_bhs[num_bhs - 1] will
-			 * hold the bottom-most leaf extent block. */
-		}
-		BUG_ON(el->l_tree_depth);
-
-		el = &fe->id2.i_list;
-		/* If we have tree depth, then the fe update is
-		 * trivial, and we want to switch el out for the
-		 * bottom-most leaf in order to update it with the
-		 * actual extent data below. */
-		next_free = le16_to_cpu(el->l_next_free_rec);
-		if (next_free == 0) {
-			ocfs2_error(inode->i_sb,
-				    "Dinode %llu has a bad extent list",
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-			status = -EIO;
-			goto bail;
-		}
-		le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
-			     new_clusters);
-		/* (num_bhs - 1) to avoid the leaf */
-		for(i = 0; i < (num_bhs - 1); i++) {
-			eb = (struct ocfs2_extent_block *) eb_bhs[i]->b_data;
-			el = &eb->h_list;
-
-			/* finally, make our actual change to the
-			 * intermediate extent blocks. */
-			next_free = le16_to_cpu(el->l_next_free_rec);
-			le32_add_cpu(&el->l_recs[next_free - 1].e_clusters,
-				     new_clusters);
-
-			status = ocfs2_journal_dirty(handle, eb_bhs[i]);
-			if (status < 0)
-				mlog_errno(status);
-		}
-		BUG_ON(i != (num_bhs - 1));
-		/* note that the leaf block wasn't touched in
-		 * the loop above */
-		eb = (struct ocfs2_extent_block *) eb_bhs[num_bhs - 1]->b_data;
-		el = &eb->h_list;
-		BUG_ON(el->l_tree_depth);
-	}
-
-	/* yay, we can finally add the actual extent now! */
-	i = le16_to_cpu(el->l_next_free_rec) - 1;
-	if (le16_to_cpu(el->l_next_free_rec) &&
-	    ocfs2_extent_contig(inode, &el->l_recs[i], start_blk)) {
-		le32_add_cpu(&el->l_recs[i].e_clusters, new_clusters);
-	} else if (le16_to_cpu(el->l_next_free_rec) &&
-		   (le32_to_cpu(el->l_recs[i].e_clusters) == 0)) {
-		/* having an empty extent at eof is legal. */
-		if (el->l_recs[i].e_cpos != fe->i_clusters) {
-			ocfs2_error(inode->i_sb,
-				    "Dinode %llu trailing extent is bad: "
-				    "cpos (%u) != number of clusters (%u)",
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-				    le32_to_cpu(el->l_recs[i].e_cpos),
-				    le32_to_cpu(fe->i_clusters));
-			status = -EIO;
-			goto bail;
-		}
-		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
-		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
-	} else {
-		/* No contiguous record, or no empty record at eof, so
-		 * we add a new one. */
-
-		BUG_ON(le16_to_cpu(el->l_next_free_rec) >=
-		       le16_to_cpu(el->l_count));
-		i = le16_to_cpu(el->l_next_free_rec);
-
-		el->l_recs[i].e_blkno = cpu_to_le64(start_blk);
-		el->l_recs[i].e_clusters = cpu_to_le32(new_clusters);
-		el->l_recs[i].e_cpos = fe->i_clusters;
-		le16_add_cpu(&el->l_next_free_rec, 1);
-	}
-
-	/*
-	 * extent_map errors are not fatal, so they are ignored outside
-	 * of flushing the thing.
-	 */
-	status = ocfs2_extent_map_append(inode, &el->l_recs[i],
-					 new_clusters);
-	if (status) {
-		mlog_errno(status);
-		ocfs2_extent_map_drop(inode, le32_to_cpu(fe->i_clusters));
-	}
-
-	status = ocfs2_journal_dirty(handle, fe_bh);
-	if (status < 0)
-		mlog_errno(status);
-	if (fe->id2.i_list.l_tree_depth) {
-		status = ocfs2_journal_dirty(handle, eb_bhs[num_bhs - 1]);
-		if (status < 0)
-			mlog_errno(status);
-	}
-
-	status = 0;
-bail:
-	if (eb_bhs) {
-		for (i = 0; i < num_bhs; i++)
-			if (eb_bhs[i])
-				brelse(eb_bhs[i]);
-		kfree(eb_bhs);
-	}
-
-	mlog_exit(status);
-	return status;
-}
-
-/*
  * Should only be called when there is no space left in any of the
  * leaf nodes. What we want to do is find the lowest tree depth
  * non-leaf extent block with room for new records. There are three
@@ -807,53 +828,1548 @@ bail:
 	return status;
 }
 
-/* the caller needs to update fe->i_clusters */
-int ocfs2_insert_extent(struct ocfs2_super *osb,
-			handle_t *handle,
-			struct inode *inode,
-			struct buffer_head *fe_bh,
-			u64 start_blk,
-			u32 new_clusters,
-			struct ocfs2_alloc_context *meta_ac)
+/*
+ * This is only valid for leaf nodes, which are the only ones that can
+ * have empty extents anyway.
+ */
+static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec)
 {
-	int status, i, shift;
-	struct buffer_head *last_eb_bh = NULL;
+	return !rec->e_leaf_clusters;
+}
+
+/*
+ * This function will discard the rightmost extent record.
+ */
+static void ocfs2_shift_records_right(struct ocfs2_extent_list *el)
+{
+	int next_free = le16_to_cpu(el->l_next_free_rec);
+	int count = le16_to_cpu(el->l_count);
+	unsigned int num_bytes;
+
+	BUG_ON(!next_free);
+	/* This will cause us to go off the end of our extent list. */
+	BUG_ON(next_free >= count);
+
+	num_bytes = sizeof(struct ocfs2_extent_rec) * next_free;
+
+	memmove(&el->l_recs[1], &el->l_recs[0], num_bytes);
+}
+
+static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el,
+			      struct ocfs2_extent_rec *insert_rec)
+{
+	int i, insert_index, next_free, has_empty, num_bytes;
+	u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos);
+	struct ocfs2_extent_rec *rec;
+
+	next_free = le16_to_cpu(el->l_next_free_rec);
+	has_empty = ocfs2_is_empty_extent(&el->l_recs[0]);
+
+	BUG_ON(!next_free);
+
+	/* The tree code before us didn't allow enough room in the leaf. */
+	if (el->l_next_free_rec == el->l_count && !has_empty)
+		BUG();
+
+	/*
+	 * The easiest way to approach this is to just remove the
+	 * empty extent and temporarily decrement next_free.
+	 */
+	if (has_empty) {
+		/*
+		 * If next_free was 1 (only an empty extent), this
+		 * loop won't execute, which is fine. We still want
+		 * the decrement above to happen.
+		 */
+		for(i = 0; i < (next_free - 1); i++)
+			el->l_recs[i] = el->l_recs[i+1];
+
+		next_free--;
+	}
+
+	/*
+	 * Figure out what the new record index should be.
+	 */
+	for(i = 0; i < next_free; i++) {
+		rec = &el->l_recs[i];
+
+		if (insert_cpos < le32_to_cpu(rec->e_cpos))
+			break;
+	}
+	insert_index = i;
+
+	mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n",
+	     insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count));
+
+	BUG_ON(insert_index < 0);
+	BUG_ON(insert_index >= le16_to_cpu(el->l_count));
+	BUG_ON(insert_index > next_free);
+
+	/*
+	 * No need to memmove if we're just adding to the tail.
+	 */
+	if (insert_index != next_free) {
+		BUG_ON(next_free >= le16_to_cpu(el->l_count));
+
+		num_bytes = next_free - insert_index;
+		num_bytes *= sizeof(struct ocfs2_extent_rec);
+		memmove(&el->l_recs[insert_index + 1],
+			&el->l_recs[insert_index],
+			num_bytes);
+	}
+
+	/*
+	 * Either we had an empty extent, and need to re-increment or
+	 * there was no empty extent on a non full rightmost leaf node,
+	 * in which case we still need to increment.
+	 */
+	next_free++;
+	el->l_next_free_rec = cpu_to_le16(next_free);
+	/*
+	 * Make sure none of the math above just messed up our tree.
+	 */
+	BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count));
+
+	el->l_recs[insert_index] = *insert_rec;
+
+}
+
+/*
+ * Create an empty extent record .
+ *
+ * l_next_free_rec may be updated.
+ *
+ * If an empty extent already exists do nothing.
+ */
+static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el)
+{
+	int next_free = le16_to_cpu(el->l_next_free_rec);
+
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+	if (next_free == 0)
+		goto set_and_inc;
+
+	if (ocfs2_is_empty_extent(&el->l_recs[0]))
+		return;
+
+	mlog_bug_on_msg(el->l_count == el->l_next_free_rec,
+			"Asked to create an empty extent in a full list:\n"
+			"count = %u, tree depth = %u",
+			le16_to_cpu(el->l_count),
+			le16_to_cpu(el->l_tree_depth));
+
+	ocfs2_shift_records_right(el);
+
+set_and_inc:
+	le16_add_cpu(&el->l_next_free_rec, 1);
+	memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+}
+
+/*
+ * For a rotation which involves two leaf nodes, the "root node" is
+ * the lowest level tree node which contains a path to both leafs. This
+ * resulting set of information can be used to form a complete "subtree"
+ *
+ * This function is passed two full paths from the dinode down to a
+ * pair of adjacent leaves. It's task is to figure out which path
+ * index contains the subtree root - this can be the root index itself
+ * in a worst-case rotation.
+ *
+ * The array index of the subtree root is passed back.
+ */
+static int ocfs2_find_subtree_root(struct inode *inode,
+				   struct ocfs2_path *left,
+				   struct ocfs2_path *right)
+{
+	int i = 0;
+
+	/*
+	 * Check that the caller passed in two paths from the same tree.
+	 */
+	BUG_ON(path_root_bh(left) != path_root_bh(right));
+
+	do {
+		i++;
+
+		/*
+		 * The caller didn't pass two adjacent paths.
+		 */
+		mlog_bug_on_msg(i > left->p_tree_depth,
+				"Inode %lu, left depth %u, right depth %u\n"
+				"left leaf blk %llu, right leaf blk %llu\n",
+				inode->i_ino, left->p_tree_depth,
+				right->p_tree_depth,
+				(unsigned long long)path_leaf_bh(left)->b_blocknr,
+				(unsigned long long)path_leaf_bh(right)->b_blocknr);
+	} while (left->p_node[i].bh->b_blocknr ==
+		 right->p_node[i].bh->b_blocknr);
+
+	return i - 1;
+}
+
+typedef void (path_insert_t)(void *, struct buffer_head *);
+
+/*
+ * Traverse a btree path in search of cpos, starting at root_el.
+ *
+ * This code can be called with a cpos larger than the tree, in which
+ * case it will return the rightmost path.
+ */
+static int __ocfs2_find_path(struct inode *inode,
+			     struct ocfs2_extent_list *root_el, u32 cpos,
+			     path_insert_t *func, void *data)
+{
+	int i, ret = 0;
+	u32 range;
+	u64 blkno;
 	struct buffer_head *bh = NULL;
-	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
-	struct ocfs2_extent_list  *el;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	mlog_entry_void();
+	el = root_el;
+	while (el->l_tree_depth) {
+		if (le16_to_cpu(el->l_next_free_rec) == 0) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %llu has empty extent list at "
+				    "depth %u\n",
+				    (unsigned long long)oi->ip_blkno,
+				    le16_to_cpu(el->l_tree_depth));
+			ret = -EROFS;
+			goto out;
 
-	mlog(0, "add %u clusters starting at block %llu to inode %llu\n",
-	     new_clusters, (unsigned long long)start_blk,
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+		}
 
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
-	el = &fe->id2.i_list;
+		for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) {
+			rec = &el->l_recs[i];
+
+			/*
+			 * In the case that cpos is off the allocation
+			 * tree, this should just wind up returning the
+			 * rightmost record.
+			 */
+			range = le32_to_cpu(rec->e_cpos) +
+				ocfs2_rec_clusters(el, rec);
+			if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range)
+			    break;
+		}
 
-	if (el->l_tree_depth) {
-		/* jump to end of tree */
-		status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk),
-					  &last_eb_bh, OCFS2_BH_CACHED, inode);
-		if (status < 0) {
-			mlog_exit(status);
-			goto bail;
+		blkno = le64_to_cpu(el->l_recs[i].e_blkno);
+		if (blkno == 0) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %llu has bad blkno in extent list "
+				    "at depth %u (index %d)\n",
+				    (unsigned long long)oi->ip_blkno,
+				    le16_to_cpu(el->l_tree_depth), i);
+			ret = -EROFS;
+			goto out;
 		}
-		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+
+		brelse(bh);
+		bh = NULL;
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno,
+				       &bh, OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		eb = (struct ocfs2_extent_block *) bh->b_data;
 		el = &eb->h_list;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+			ret = -EIO;
+			goto out;
+		}
+
+		if (le16_to_cpu(el->l_next_free_rec) >
+		    le16_to_cpu(el->l_count)) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %llu has bad count in extent list "
+				    "at block %llu (next free=%u, count=%u)\n",
+				    (unsigned long long)oi->ip_blkno,
+				    (unsigned long long)bh->b_blocknr,
+				    le16_to_cpu(el->l_next_free_rec),
+				    le16_to_cpu(el->l_count));
+			ret = -EROFS;
+			goto out;
+		}
+
+		if (func)
+			func(data, bh);
+	}
+
+out:
+	/*
+	 * Catch any trailing bh that the loop didn't handle.
+	 */
+	brelse(bh);
+
+	return ret;
+}
+
+/*
+ * Given an initialized path (that is, it has a valid root extent
+ * list), this function will traverse the btree in search of the path
+ * which would contain cpos.
+ *
+ * The path traveled is recorded in the path structure.
+ *
+ * Note that this will not do any comparisons on leaf node extent
+ * records, so it will work fine in the case that we just added a tree
+ * branch.
+ */
+struct find_path_data {
+	int index;
+	struct ocfs2_path *path;
+};
+static void find_path_ins(void *data, struct buffer_head *bh)
+{
+	struct find_path_data *fp = data;
+
+	get_bh(bh);
+	ocfs2_path_insert_eb(fp->path, fp->index, bh);
+	fp->index++;
+}
+static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path,
+			   u32 cpos)
+{
+	struct find_path_data data;
+
+	data.index = 1;
+	data.path = path;
+	return __ocfs2_find_path(inode, path_root_el(path), cpos,
+				 find_path_ins, &data);
+}
+
+static void find_leaf_ins(void *data, struct buffer_head *bh)
+{
+	struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data;
+	struct ocfs2_extent_list *el = &eb->h_list;
+	struct buffer_head **ret = data;
+
+	/* We want to retain only the leaf block. */
+	if (le16_to_cpu(el->l_tree_depth) == 0) {
+		get_bh(bh);
+		*ret = bh;
+	}
+}
+/*
+ * Find the leaf block in the tree which would contain cpos. No
+ * checking of the actual leaf is done.
+ *
+ * Some paths want to call this instead of allocating a path structure
+ * and calling ocfs2_find_path().
+ *
+ * This function doesn't handle non btree extent lists.
+ */
+int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
+		    u32 cpos, struct buffer_head **leaf_bh)
+{
+	int ret;
+	struct buffer_head *bh = NULL;
+
+	ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	*leaf_bh = bh;
+out:
+	return ret;
+}
+
+/*
+ * Adjust the adjacent records (left_rec, right_rec) involved in a rotation.
+ *
+ * Basically, we've moved stuff around at the bottom of the tree and
+ * we need to fix up the extent records above the changes to reflect
+ * the new changes.
+ *
+ * left_rec: the record on the left.
+ * left_child_el: is the child list pointed to by left_rec
+ * right_rec: the record to the right of left_rec
+ * right_child_el: is the child list pointed to by right_rec
+ *
+ * By definition, this only works on interior nodes.
+ */
+static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec,
+				  struct ocfs2_extent_list *left_child_el,
+				  struct ocfs2_extent_rec *right_rec,
+				  struct ocfs2_extent_list *right_child_el)
+{
+	u32 left_clusters, right_end;
+
+	/*
+	 * Interior nodes never have holes. Their cpos is the cpos of
+	 * the leftmost record in their child list. Their cluster
+	 * count covers the full theoretical range of their child list
+	 * - the range between their cpos and the cpos of the record
+	 * immediately to their right.
+	 */
+	left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos);
+	left_clusters -= le32_to_cpu(left_rec->e_cpos);
+	left_rec->e_int_clusters = cpu_to_le32(left_clusters);
+
+	/*
+	 * Calculate the rightmost cluster count boundary before
+	 * moving cpos - we will need to adjust clusters after
+	 * updating e_cpos to keep the same highest cluster count.
+	 */
+	right_end = le32_to_cpu(right_rec->e_cpos);
+	right_end += le32_to_cpu(right_rec->e_int_clusters);
+
+	right_rec->e_cpos = left_rec->e_cpos;
+	le32_add_cpu(&right_rec->e_cpos, left_clusters);
+
+	right_end -= le32_to_cpu(right_rec->e_cpos);
+	right_rec->e_int_clusters = cpu_to_le32(right_end);
+}
+
+/*
+ * Adjust the adjacent root node records involved in a
+ * rotation. left_el_blkno is passed in as a key so that we can easily
+ * find it's index in the root list.
+ */
+static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el,
+				      struct ocfs2_extent_list *left_el,
+				      struct ocfs2_extent_list *right_el,
+				      u64 left_el_blkno)
+{
+	int i;
+
+	BUG_ON(le16_to_cpu(root_el->l_tree_depth) <=
+	       le16_to_cpu(left_el->l_tree_depth));
+
+	for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) {
+		if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno)
+			break;
+	}
+
+	/*
+	 * The path walking code should have never returned a root and
+	 * two paths which are not adjacent.
+	 */
+	BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1));
+
+	ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el,
+				      &root_el->l_recs[i + 1], right_el);
+}
+
+/*
+ * We've changed a leaf block (in right_path) and need to reflect that
+ * change back up the subtree.
+ *
+ * This happens in multiple places:
+ *   - When we've moved an extent record from the left path leaf to the right
+ *     path leaf to make room for an empty extent in the left path leaf.
+ *   - When our insert into the right path leaf is at the leftmost edge
+ *     and requires an update of the path immediately to it's left. This
+ *     can occur at the end of some types of rotation and appending inserts.
+ */
+static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle,
+				       struct ocfs2_path *left_path,
+				       struct ocfs2_path *right_path,
+				       int subtree_index)
+{
+	int ret, i, idx;
+	struct ocfs2_extent_list *el, *left_el, *right_el;
+	struct ocfs2_extent_rec *left_rec, *right_rec;
+	struct buffer_head *root_bh = left_path->p_node[subtree_index].bh;
+
+	/*
+	 * Update the counts and position values within all the
+	 * interior nodes to reflect the leaf rotation we just did.
+	 *
+	 * The root node is handled below the loop.
+	 *
+	 * We begin the loop with right_el and left_el pointing to the
+	 * leaf lists and work our way up.
+	 *
+	 * NOTE: within this loop, left_el and right_el always refer
+	 * to the *child* lists.
+	 */
+	left_el = path_leaf_el(left_path);
+	right_el = path_leaf_el(right_path);
+	for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) {
+		mlog(0, "Adjust records at index %u\n", i);
+
+		/*
+		 * One nice property of knowing that all of these
+		 * nodes are below the root is that we only deal with
+		 * the leftmost right node record and the rightmost
+		 * left node record.
+		 */
+		el = left_path->p_node[i].el;
+		idx = le16_to_cpu(left_el->l_next_free_rec) - 1;
+		left_rec = &el->l_recs[idx];
+
+		el = right_path->p_node[i].el;
+		right_rec = &el->l_recs[0];
+
+		ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec,
+					      right_el);
+
+		ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh);
+		if (ret)
+			mlog_errno(ret);
+
+		ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh);
+		if (ret)
+			mlog_errno(ret);
+
+		/*
+		 * Setup our list pointers now so that the current
+		 * parents become children in the next iteration.
+		 */
+		left_el = left_path->p_node[i].el;
+		right_el = right_path->p_node[i].el;
+	}
+
+	/*
+	 * At the root node, adjust the two adjacent records which
+	 * begin our path to the leaves.
+	 */
+
+	el = left_path->p_node[subtree_index].el;
+	left_el = left_path->p_node[subtree_index + 1].el;
+	right_el = right_path->p_node[subtree_index + 1].el;
+
+	ocfs2_adjust_root_records(el, left_el, right_el,
+				  left_path->p_node[subtree_index + 1].bh->b_blocknr);
+
+	root_bh = left_path->p_node[subtree_index].bh;
+
+	ret = ocfs2_journal_dirty(handle, root_bh);
+	if (ret)
+		mlog_errno(ret);
+}
+
+static int ocfs2_rotate_subtree_right(struct inode *inode,
+				      handle_t *handle,
+				      struct ocfs2_path *left_path,
+				      struct ocfs2_path *right_path,
+				      int subtree_index)
+{
+	int ret, i;
+	struct buffer_head *right_leaf_bh;
+	struct buffer_head *left_leaf_bh = NULL;
+	struct buffer_head *root_bh;
+	struct ocfs2_extent_list *right_el, *left_el;
+	struct ocfs2_extent_rec move_rec;
+
+	left_leaf_bh = path_leaf_bh(left_path);
+	left_el = path_leaf_el(left_path);
+
+	if (left_el->l_next_free_rec != left_el->l_count) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %llu has non-full interior leaf node %llu"
+			    "(next free = %u)",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    (unsigned long long)left_leaf_bh->b_blocknr,
+			    le16_to_cpu(left_el->l_next_free_rec));
+		return -EROFS;
+	}
+
+	/*
+	 * This extent block may already have an empty record, so we
+	 * return early if so.
+	 */
+	if (ocfs2_is_empty_extent(&left_el->l_recs[0]))
+		return 0;
+
+	root_bh = left_path->p_node[subtree_index].bh;
+	BUG_ON(root_bh != right_path->p_node[subtree_index].bh);
+
+	ret = ocfs2_journal_access(handle, inode, root_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	for(i = subtree_index + 1; i < path_num_items(right_path); i++) {
+		ret = ocfs2_journal_access(handle, inode,
+					   right_path->p_node[i].bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_journal_access(handle, inode,
+					   left_path->p_node[i].bh,
+					   OCFS2_JOURNAL_ACCESS_WRITE);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	right_leaf_bh = path_leaf_bh(right_path);
+	right_el = path_leaf_el(right_path);
+
+	/* This is a code error, not a disk corruption. */
+	mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails "
+			"because rightmost leaf block %llu is empty\n",
+			(unsigned long long)OCFS2_I(inode)->ip_blkno,
+			(unsigned long long)right_leaf_bh->b_blocknr);
+
+	ocfs2_create_empty_extent(right_el);
+
+	ret = ocfs2_journal_dirty(handle, right_leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/* Do the copy now. */
+	i = le16_to_cpu(left_el->l_next_free_rec) - 1;
+	move_rec = left_el->l_recs[i];
+	right_el->l_recs[0] = move_rec;
+
+	/*
+	 * Clear out the record we just copied and shift everything
+	 * over, leaving an empty extent in the left leaf.
+	 *
+	 * We temporarily subtract from next_free_rec so that the
+	 * shift will lose the tail record (which is now defunct).
+	 */
+	le16_add_cpu(&left_el->l_next_free_rec, -1);
+	ocfs2_shift_records_right(left_el);
+	memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec));
+	le16_add_cpu(&left_el->l_next_free_rec, 1);
+
+	ret = ocfs2_journal_dirty(handle, left_leaf_bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ocfs2_complete_edge_insert(inode, handle, left_path, right_path,
+				subtree_index);
+
+out:
+	return ret;
+}
+
+/*
+ * Given a full path, determine what cpos value would return us a path
+ * containing the leaf immediately to the left of the current one.
+ *
+ * Will return zero if the path passed in is already the leftmost path.
+ */
+static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb,
+					 struct ocfs2_path *path, u32 *cpos)
+{
+	int i, j, ret = 0;
+	u64 blkno;
+	struct ocfs2_extent_list *el;
+
+	BUG_ON(path->p_tree_depth == 0);
+
+	*cpos = 0;
+
+	blkno = path_leaf_bh(path)->b_blocknr;
+
+	/* Start at the tree node just above the leaf and work our way up. */
+	i = path->p_tree_depth - 1;
+	while (i >= 0) {
+		el = path->p_node[i].el;
+
+		/*
+		 * Find the extent record just before the one in our
+		 * path.
+		 */
+		for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) {
+			if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) {
+				if (j == 0) {
+					if (i == 0) {
+						/*
+						 * We've determined that the
+						 * path specified is already
+						 * the leftmost one - return a
+						 * cpos of zero.
+						 */
+						goto out;
+					}
+					/*
+					 * The leftmost record points to our
+					 * leaf - we need to travel up the
+					 * tree one level.
+					 */
+					goto next_node;
+				}
+
+				*cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos);
+				*cpos = *cpos + ocfs2_rec_clusters(el,
+							   &el->l_recs[j - 1]);
+				*cpos = *cpos - 1;
+				goto out;
+			}
+		}
+
+		/*
+		 * If we got here, we never found a valid node where
+		 * the tree indicated one should be.
+		 */
+		ocfs2_error(sb,
+			    "Invalid extent tree at extent block %llu\n",
+			    (unsigned long long)blkno);
+		ret = -EROFS;
+		goto out;
+
+next_node:
+		blkno = path->p_node[i].bh->b_blocknr;
+		i--;
+	}
+
+out:
+	return ret;
+}
+
+static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth,
+					   struct ocfs2_path *path)
+{
+	int credits = (path->p_tree_depth - subtree_depth) * 2 + 1;
+
+	if (handle->h_buffer_credits < credits)
+		return ocfs2_extend_trans(handle, credits);
+
+	return 0;
+}
+
+/*
+ * Trap the case where we're inserting into the theoretical range past
+ * the _actual_ left leaf range. Otherwise, we'll rotate a record
+ * whose cpos is less than ours into the right leaf.
+ *
+ * It's only necessary to look at the rightmost record of the left
+ * leaf because the logic that calls us should ensure that the
+ * theoretical ranges in the path components above the leaves are
+ * correct.
+ */
+static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path,
+						 u32 insert_cpos)
+{
+	struct ocfs2_extent_list *left_el;
+	struct ocfs2_extent_rec *rec;
+	int next_free;
+
+	left_el = path_leaf_el(left_path);
+	next_free = le16_to_cpu(left_el->l_next_free_rec);
+	rec = &left_el->l_recs[next_free - 1];
+
+	if (insert_cpos > le32_to_cpu(rec->e_cpos))
+		return 1;
+	return 0;
+}
+
+/*
+ * Rotate all the records in a btree right one record, starting at insert_cpos.
+ *
+ * The path to the rightmost leaf should be passed in.
+ *
+ * The array is assumed to be large enough to hold an entire path (tree depth).
+ *
+ * Upon succesful return from this function:
+ *
+ * - The 'right_path' array will contain a path to the leaf block
+ *   whose range contains e_cpos.
+ * - That leaf block will have a single empty extent in list index 0.
+ * - In the case that the rotation requires a post-insert update,
+ *   *ret_left_path will contain a valid path which can be passed to
+ *   ocfs2_insert_path().
+ */
+static int ocfs2_rotate_tree_right(struct inode *inode,
+				   handle_t *handle,
+				   u32 insert_cpos,
+				   struct ocfs2_path *right_path,
+				   struct ocfs2_path **ret_left_path)
+{
+	int ret, start;
+	u32 cpos;
+	struct ocfs2_path *left_path = NULL;
+
+	*ret_left_path = NULL;
+
+	left_path = ocfs2_new_path(path_root_bh(right_path),
+				   path_root_el(right_path));
+	if (!left_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos);
+
+	/*
+	 * What we want to do here is:
+	 *
+	 * 1) Start with the rightmost path.
+	 *
+	 * 2) Determine a path to the leaf block directly to the left
+	 *    of that leaf.
+	 *
+	 * 3) Determine the 'subtree root' - the lowest level tree node
+	 *    which contains a path to both leaves.
+	 *
+	 * 4) Rotate the subtree.
+	 *
+	 * 5) Find the next subtree by considering the left path to be
+	 *    the new right path.
+	 *
+	 * The check at the top of this while loop also accepts
+	 * insert_cpos == cpos because cpos is only a _theoretical_
+	 * value to get us the left path - insert_cpos might very well
+	 * be filling that hole.
+	 *
+	 * Stop at a cpos of '0' because we either started at the
+	 * leftmost branch (i.e., a tree with one branch and a
+	 * rotation inside of it), or we've gone as far as we can in
+	 * rotating subtrees.
+	 */
+	while (cpos && insert_cpos <= cpos) {
+		mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n",
+		     insert_cpos, cpos);
+
+		ret = ocfs2_find_path(inode, left_path, cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		mlog_bug_on_msg(path_leaf_bh(left_path) ==
+				path_leaf_bh(right_path),
+				"Inode %lu: error during insert of %u "
+				"(left path cpos %u) results in two identical "
+				"paths ending at %llu\n",
+				inode->i_ino, insert_cpos, cpos,
+				(unsigned long long)
+				path_leaf_bh(left_path)->b_blocknr);
+
+		if (ocfs2_rotate_requires_path_adjustment(left_path,
+							  insert_cpos)) {
+			mlog(0, "Path adjustment required\n");
+
+			/*
+			 * We've rotated the tree as much as we
+			 * should. The rest is up to
+			 * ocfs2_insert_path() to complete, after the
+			 * record insertion. We indicate this
+			 * situation by returning the left path.
+			 *
+			 * The reason we don't adjust the records here
+			 * before the record insert is that an error
+			 * later might break the rule where a parent
+			 * record e_cpos will reflect the actual
+			 * e_cpos of the 1st nonempty record of the
+			 * child list.
+			 */
+			*ret_left_path = left_path;
+			goto out_ret_path;
+		}
+
+		start = ocfs2_find_subtree_root(inode, left_path, right_path);
+
+		mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n",
+		     start,
+		     (unsigned long long) right_path->p_node[start].bh->b_blocknr,
+		     right_path->p_tree_depth);
+
+		ret = ocfs2_extend_rotate_transaction(handle, start,
+						      right_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_rotate_subtree_right(inode, handle, left_path,
+						 right_path, start);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		/*
+		 * There is no need to re-read the next right path
+		 * as we know that it'll be our current left
+		 * path. Optimize by copying values instead.
+		 */
+		ocfs2_mv_path(right_path, left_path);
+
+		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
+						    &cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+out:
+	ocfs2_free_path(left_path);
+
+out_ret_path:
+	return ret;
+}
+
+/*
+ * Do the final bits of extent record insertion at the target leaf
+ * list. If this leaf is part of an allocation tree, it is assumed
+ * that the tree above has been prepared.
+ */
+static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec,
+				 struct ocfs2_extent_list *el,
+				 struct ocfs2_insert_type *insert,
+				 struct inode *inode)
+{
+	int i = insert->ins_contig_index;
+	unsigned int range;
+	struct ocfs2_extent_rec *rec;
+
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+	/*
+	 * Contiguous insert - either left or right.
+	 */
+	if (insert->ins_contig != CONTIG_NONE) {
+		rec = &el->l_recs[i];
+		if (insert->ins_contig == CONTIG_LEFT) {
+			rec->e_blkno = insert_rec->e_blkno;
+			rec->e_cpos = insert_rec->e_cpos;
+		}
+		le16_add_cpu(&rec->e_leaf_clusters,
+			     le16_to_cpu(insert_rec->e_leaf_clusters));
+		return;
+	}
+
+	/*
+	 * Handle insert into an empty leaf.
+	 */
+	if (le16_to_cpu(el->l_next_free_rec) == 0 ||
+	    ((le16_to_cpu(el->l_next_free_rec) == 1) &&
+	     ocfs2_is_empty_extent(&el->l_recs[0]))) {
+		el->l_recs[0] = *insert_rec;
+		el->l_next_free_rec = cpu_to_le16(1);
+		return;
+	}
+
+	/*
+	 * Appending insert.
+	 */
+	if (insert->ins_appending == APPEND_TAIL) {
+		i = le16_to_cpu(el->l_next_free_rec) - 1;
+		rec = &el->l_recs[i];
+		range = le32_to_cpu(rec->e_cpos)
+			+ le16_to_cpu(rec->e_leaf_clusters);
+		BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range);
+
+		mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >=
+				le16_to_cpu(el->l_count),
+				"inode %lu, depth %u, count %u, next free %u, "
+				"rec.cpos %u, rec.clusters %u, "
+				"insert.cpos %u, insert.clusters %u\n",
+				inode->i_ino,
+				le16_to_cpu(el->l_tree_depth),
+				le16_to_cpu(el->l_count),
+				le16_to_cpu(el->l_next_free_rec),
+				le32_to_cpu(el->l_recs[i].e_cpos),
+				le16_to_cpu(el->l_recs[i].e_leaf_clusters),
+				le32_to_cpu(insert_rec->e_cpos),
+				le16_to_cpu(insert_rec->e_leaf_clusters));
+		i++;
+		el->l_recs[i] = *insert_rec;
+		le16_add_cpu(&el->l_next_free_rec, 1);
+		return;
+	}
+
+	/*
+	 * Ok, we have to rotate.
+	 *
+	 * At this point, it is safe to assume that inserting into an
+	 * empty leaf and appending to a leaf have both been handled
+	 * above.
+	 *
+	 * This leaf needs to have space, either by the empty 1st
+	 * extent record, or by virtue of an l_next_rec < l_count.
+	 */
+	ocfs2_rotate_leaf(el, insert_rec);
+}
+
+static inline void ocfs2_update_dinode_clusters(struct inode *inode,
+						struct ocfs2_dinode *di,
+						u32 clusters)
+{
+	le32_add_cpu(&di->i_clusters, clusters);
+	spin_lock(&OCFS2_I(inode)->ip_lock);
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters);
+	spin_unlock(&OCFS2_I(inode)->ip_lock);
+}
+
+static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle,
+				    struct ocfs2_extent_rec *insert_rec,
+				    struct ocfs2_path *right_path,
+				    struct ocfs2_path **ret_left_path)
+{
+	int ret, i, next_free;
+	struct buffer_head *bh;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_path *left_path = NULL;
+
+	*ret_left_path = NULL;
+
+	/*
+	 * This shouldn't happen for non-trees. The extent rec cluster
+	 * count manipulation below only works for interior nodes.
+	 */
+	BUG_ON(right_path->p_tree_depth == 0);
+
+	/*
+	 * If our appending insert is at the leftmost edge of a leaf,
+	 * then we might need to update the rightmost records of the
+	 * neighboring path.
+	 */
+	el = path_leaf_el(right_path);
+	next_free = le16_to_cpu(el->l_next_free_rec);
+	if (next_free == 0 ||
+	    (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) {
+		u32 left_cpos;
+
+		ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path,
+						    &left_cpos);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		mlog(0, "Append may need a left path update. cpos: %u, "
+		     "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos),
+		     left_cpos);
+
+		/*
+		 * No need to worry if the append is already in the
+		 * leftmost leaf.
+		 */
+		if (left_cpos) {
+			left_path = ocfs2_new_path(path_root_bh(right_path),
+						   path_root_el(right_path));
+			if (!left_path) {
+				ret = -ENOMEM;
+				mlog_errno(ret);
+				goto out;
+			}
+
+			ret = ocfs2_find_path(inode, left_path, left_cpos);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+
+			/*
+			 * ocfs2_insert_path() will pass the left_path to the
+			 * journal for us.
+			 */
+		}
+	}
+
+	ret = ocfs2_journal_access_path(inode, handle, right_path);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el = path_root_el(right_path);
+	bh = path_root_bh(right_path);
+	i = 0;
+	while (1) {
+		struct ocfs2_extent_rec *rec;
+
+		next_free = le16_to_cpu(el->l_next_free_rec);
+		if (next_free == 0) {
+			ocfs2_error(inode->i_sb,
+				    "Dinode %llu has a bad extent list",
+				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
+			ret = -EIO;
+			goto out;
+		}
+
+		rec = &el->l_recs[next_free - 1];
+
+		rec->e_int_clusters = insert_rec->e_cpos;
+		le32_add_cpu(&rec->e_int_clusters,
+			     le16_to_cpu(insert_rec->e_leaf_clusters));
+		le32_add_cpu(&rec->e_int_clusters,
+			     -le32_to_cpu(rec->e_cpos));
+
+		ret = ocfs2_journal_dirty(handle, bh);
+		if (ret)
+			mlog_errno(ret);
+
+		/* Don't touch the leaf node */
+		if (++i >= right_path->p_tree_depth)
+			break;
+
+		bh = right_path->p_node[i].bh;
+		el = right_path->p_node[i].el;
+	}
+
+	*ret_left_path = left_path;
+	ret = 0;
+out:
+	if (ret != 0)
+		ocfs2_free_path(left_path);
+
+	return ret;
+}
+
+/*
+ * This function only does inserts on an allocation b-tree. For dinode
+ * lists, ocfs2_insert_at_leaf() is called directly.
+ *
+ * right_path is the path we want to do the actual insert
+ * in. left_path should only be passed in if we need to update that
+ * portion of the tree after an edge insert.
+ */
+static int ocfs2_insert_path(struct inode *inode,
+			     handle_t *handle,
+			     struct ocfs2_path *left_path,
+			     struct ocfs2_path *right_path,
+			     struct ocfs2_extent_rec *insert_rec,
+			     struct ocfs2_insert_type *insert)
+{
+	int ret, subtree_index;
+	struct buffer_head *leaf_bh = path_leaf_bh(right_path);
+	struct ocfs2_extent_list *el;
+
+	/*
+	 * Pass both paths to the journal. The majority of inserts
+	 * will be touching all components anyway.
+	 */
+	ret = ocfs2_journal_access_path(inode, handle, right_path);
+	if (ret < 0) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (left_path) {
+		int credits = handle->h_buffer_credits;
+
+		/*
+		 * There's a chance that left_path got passed back to
+		 * us without being accounted for in the
+		 * journal. Extend our transaction here to be sure we
+		 * can change those blocks.
+		 */
+		credits += left_path->p_tree_depth;
+
+		ret = ocfs2_extend_trans(handle, credits);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		ret = ocfs2_journal_access_path(inode, handle, left_path);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	el = path_leaf_el(right_path);
+
+	ocfs2_insert_at_leaf(insert_rec, el, insert, inode);
+	ret = ocfs2_journal_dirty(handle, leaf_bh);
+	if (ret)
+		mlog_errno(ret);
+
+	if (left_path) {
+		/*
+		 * The rotate code has indicated that we need to fix
+		 * up portions of the tree after the insert.
+		 *
+		 * XXX: Should we extend the transaction here?
+		 */
+		subtree_index = ocfs2_find_subtree_root(inode, left_path,
+							right_path);
+		ocfs2_complete_edge_insert(inode, handle, left_path,
+					   right_path, subtree_index);
+	}
+
+	ret = 0;
+out:
+	return ret;
+}
+
+static int ocfs2_do_insert_extent(struct inode *inode,
+				  handle_t *handle,
+				  struct buffer_head *di_bh,
+				  struct ocfs2_extent_rec *insert_rec,
+				  struct ocfs2_insert_type *type)
+{
+	int ret, rotate = 0;
+	u32 cpos;
+	struct ocfs2_path *right_path = NULL;
+	struct ocfs2_path *left_path = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_extent_list *el;
+
+	di = (struct ocfs2_dinode *) di_bh->b_data;
+	el = &di->id2.i_list;
+
+	ret = ocfs2_journal_access(handle, inode, di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (le16_to_cpu(el->l_tree_depth) == 0) {
+		ocfs2_insert_at_leaf(insert_rec, el, type, inode);
+		goto out_update_clusters;
+	}
+
+	right_path = ocfs2_new_inode_path(di_bh);
+	if (!right_path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Determine the path to start with. Rotations need the
+	 * rightmost path, everything else can go directly to the
+	 * target leaf.
+	 */
+	cpos = le32_to_cpu(insert_rec->e_cpos);
+	if (type->ins_appending == APPEND_NONE &&
+	    type->ins_contig == CONTIG_NONE) {
+		rotate = 1;
+		cpos = UINT_MAX;
+	}
+
+	ret = ocfs2_find_path(inode, right_path, cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Rotations and appends need special treatment - they modify
+	 * parts of the tree's above them.
+	 *
+	 * Both might pass back a path immediate to the left of the
+	 * one being inserted to. This will be cause
+	 * ocfs2_insert_path() to modify the rightmost records of
+	 * left_path to account for an edge insert.
+	 *
+	 * XXX: When modifying this code, keep in mind that an insert
+	 * can wind up skipping both of these two special cases...
+	 */
+	if (rotate) {
+		ret = ocfs2_rotate_tree_right(inode, handle,
+					      le32_to_cpu(insert_rec->e_cpos),
+					      right_path, &left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	} else if (type->ins_appending == APPEND_TAIL
+		   && type->ins_contig != CONTIG_LEFT) {
+		ret = ocfs2_append_rec_to_path(inode, handle, insert_rec,
+					       right_path, &left_path);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_insert_path(inode, handle, left_path, right_path,
+				insert_rec, type);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+out_update_clusters:
+	ocfs2_update_dinode_clusters(inode, di,
+				     le16_to_cpu(insert_rec->e_leaf_clusters));
+
+	ret = ocfs2_journal_dirty(handle, di_bh);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	ocfs2_free_path(left_path);
+	ocfs2_free_path(right_path);
+
+	return ret;
+}
+
+static void ocfs2_figure_contig_type(struct inode *inode,
+				     struct ocfs2_insert_type *insert,
+				     struct ocfs2_extent_list *el,
+				     struct ocfs2_extent_rec *insert_rec)
+{
+	int i;
+	enum ocfs2_contig_type contig_type = CONTIG_NONE;
+
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		contig_type = ocfs2_extent_contig(inode, &el->l_recs[i],
+						  insert_rec);
+		if (contig_type != CONTIG_NONE) {
+			insert->ins_contig_index = i;
+			break;
+		}
+	}
+	insert->ins_contig = contig_type;
+}
+
+/*
+ * This should only be called against the righmost leaf extent list.
+ *
+ * ocfs2_figure_appending_type() will figure out whether we'll have to
+ * insert at the tail of the rightmost leaf.
+ *
+ * This should also work against the dinode list for tree's with 0
+ * depth. If we consider the dinode list to be the rightmost leaf node
+ * then the logic here makes sense.
+ */
+static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert,
+					struct ocfs2_extent_list *el,
+					struct ocfs2_extent_rec *insert_rec)
+{
+	int i;
+	u32 cpos = le32_to_cpu(insert_rec->e_cpos);
+	struct ocfs2_extent_rec *rec;
+
+	insert->ins_appending = APPEND_NONE;
+
+	BUG_ON(le16_to_cpu(el->l_tree_depth) != 0);
+
+	if (!el->l_next_free_rec)
+		goto set_tail_append;
+
+	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+		/* Were all records empty? */
+		if (le16_to_cpu(el->l_next_free_rec) == 1)
+			goto set_tail_append;
 	}
 
-	/* Can we allocate without adding/shifting tree bits? */
 	i = le16_to_cpu(el->l_next_free_rec) - 1;
-	if (le16_to_cpu(el->l_next_free_rec) == 0
-	    || (le16_to_cpu(el->l_next_free_rec) < le16_to_cpu(el->l_count))
-	    || le32_to_cpu(el->l_recs[i].e_clusters) == 0
-	    || ocfs2_extent_contig(inode, &el->l_recs[i], start_blk))
-		goto out_add;
+	rec = &el->l_recs[i];
+
+	if (cpos >=
+	    (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)))
+		goto set_tail_append;
+
+	return;
+
+set_tail_append:
+	insert->ins_appending = APPEND_TAIL;
+}
+
+/*
+ * Helper function called at the begining of an insert.
+ *
+ * This computes a few things that are commonly used in the process of
+ * inserting into the btree:
+ *   - Whether the new extent is contiguous with an existing one.
+ *   - The current tree depth.
+ *   - Whether the insert is an appending one.
+ *   - The total # of free records in the tree.
+ *
+ * All of the information is stored on the ocfs2_insert_type
+ * structure.
+ */
+static int ocfs2_figure_insert_type(struct inode *inode,
+				    struct buffer_head *di_bh,
+				    struct buffer_head **last_eb_bh,
+				    struct ocfs2_extent_rec *insert_rec,
+				    struct ocfs2_insert_type *insert)
+{
+	int ret;
+	struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_path *path = NULL;
+	struct buffer_head *bh = NULL;
+
+	el = &di->id2.i_list;
+	insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth);
+
+	if (el->l_tree_depth) {
+		/*
+		 * If we have tree depth, we read in the
+		 * rightmost extent block ahead of time as
+		 * ocfs2_figure_insert_type() and ocfs2_add_branch()
+		 * may want it later.
+		 */
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       le64_to_cpu(di->i_last_eb_blk), &bh,
+				       OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_exit(ret);
+			goto out;
+		}
+		eb = (struct ocfs2_extent_block *) bh->b_data;
+		el = &eb->h_list;
+	}
+
+	/*
+	 * Unless we have a contiguous insert, we'll need to know if
+	 * there is room left in our allocation tree for another
+	 * extent record.
+	 *
+	 * XXX: This test is simplistic, we can search for empty
+	 * extent records too.
+	 */
+	insert->ins_free_records = le16_to_cpu(el->l_count) -
+		le16_to_cpu(el->l_next_free_rec);
+
+	if (!insert->ins_tree_depth) {
+		ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+		ocfs2_figure_appending_type(insert, el, insert_rec);
+		return 0;
+	}
+
+	path = ocfs2_new_inode_path(di_bh);
+	if (!path) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * In the case that we're inserting past what the tree
+	 * currently accounts for, ocfs2_find_path() will return for
+	 * us the rightmost tree path. This is accounted for below in
+	 * the appending code.
+	 */
+	ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos));
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	el = path_leaf_el(path);
+
+	/*
+	 * Now that we have the path, there's two things we want to determine:
+	 * 1) Contiguousness (also set contig_index if this is so)
+	 *
+	 * 2) Are we doing an append? We can trivially break this up
+         *     into two types of appends: simple record append, or a
+         *     rotate inside the tail leaf.
+	 */
+	ocfs2_figure_contig_type(inode, insert, el, insert_rec);
+
+	/*
+	 * The insert code isn't quite ready to deal with all cases of
+	 * left contiguousness. Specifically, if it's an insert into
+	 * the 1st record in a leaf, it will require the adjustment of
+	 * cluster count on the last record of the path directly to it's
+	 * left. For now, just catch that case and fool the layers
+	 * above us. This works just fine for tree_depth == 0, which
+	 * is why we allow that above.
+	 */
+	if (insert->ins_contig == CONTIG_LEFT &&
+	    insert->ins_contig_index == 0)
+		insert->ins_contig = CONTIG_NONE;
+
+	/*
+	 * Ok, so we can simply compare against last_eb to figure out
+	 * whether the path doesn't exist. This will only happen in
+	 * the case that we're doing a tail append, so maybe we can
+	 * take advantage of that information somehow.
+	 */
+	if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) {
+		/*
+		 * Ok, ocfs2_find_path() returned us the rightmost
+		 * tree path. This might be an appending insert. There are
+		 * two cases:
+		 *    1) We're doing a true append at the tail:
+		 *	-This might even be off the end of the leaf
+		 *    2) We're "appending" by rotating in the tail
+		 */
+		ocfs2_figure_appending_type(insert, el, insert_rec);
+	}
+
+out:
+	ocfs2_free_path(path);
+
+	if (ret == 0)
+		*last_eb_bh = bh;
+	else
+		brelse(bh);
+	return ret;
+}
+
+/*
+ * Insert an extent into an inode btree.
+ *
+ * The caller needs to update fe->i_clusters
+ */
+int ocfs2_insert_extent(struct ocfs2_super *osb,
+			handle_t *handle,
+			struct inode *inode,
+			struct buffer_head *fe_bh,
+			u32 cpos,
+			u64 start_blk,
+			u32 new_clusters,
+			struct ocfs2_alloc_context *meta_ac)
+{
+	int status, shift;
+	struct buffer_head *last_eb_bh = NULL;
+	struct buffer_head *bh = NULL;
+	struct ocfs2_insert_type insert = {0, };
+	struct ocfs2_extent_rec rec;
+
+	mlog(0, "add %u clusters at position %u to inode %llu\n",
+	     new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) &&
+			(OCFS2_I(inode)->ip_clusters != cpos),
+			"Device %s, asking for sparse allocation: inode %llu, "
+			"cpos %u, clusters %u\n",
+			osb->dev_str,
+			(unsigned long long)OCFS2_I(inode)->ip_blkno, cpos,
+			OCFS2_I(inode)->ip_clusters);
+
+	memset(&rec, 0, sizeof(rec));
+	rec.e_cpos = cpu_to_le32(cpos);
+	rec.e_blkno = cpu_to_le64(start_blk);
+	rec.e_leaf_clusters = cpu_to_le16(new_clusters);
+
+	status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec,
+					  &insert);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
 
-	mlog(0, "ocfs2_allocate_extent: couldn't do a simple add, traversing "
-	     "tree now.\n");
+	mlog(0, "Insert.appending: %u, Insert.Contig: %u, "
+	     "Insert.contig_index: %d, Insert.free_records: %d, "
+	     "Insert.tree_depth: %d\n",
+	     insert.ins_appending, insert.ins_contig, insert.ins_contig_index,
+	     insert.ins_free_records, insert.ins_tree_depth);
+
+	/*
+	 * Avoid growing the tree unless we're out of records and the
+	 * insert type requres one.
+	 */
+	if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records)
+		goto out_add;
 
 	shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh);
 	if (shift < 0) {
@@ -866,13 +2382,9 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	 * and didn't find room for any more extents - we need to add
 	 * another tree level */
 	if (shift) {
-		/* if we hit a leaf, we'd better be empty :) */
-		BUG_ON(le16_to_cpu(el->l_next_free_rec) !=
-		       le16_to_cpu(el->l_count));
 		BUG_ON(bh);
-		mlog(0, "ocfs2_allocate_extent: need to shift tree depth "
-		     "(current = %u)\n",
-		     le16_to_cpu(fe->id2.i_list.l_tree_depth));
+		mlog(0, "need to shift tree depth "
+		     "(current = %d)\n", insert.ins_tree_depth);
 
 		/* ocfs2_shift_tree_depth will return us a buffer with
 		 * the new extent block (so we can pass that to
@@ -883,15 +2395,16 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 			mlog_errno(status);
 			goto bail;
 		}
+		insert.ins_tree_depth++;
 		/* Special case: we have room now if we shifted from
 		 * tree_depth 0 */
-		if (fe->id2.i_list.l_tree_depth == cpu_to_le16(1))
+		if (insert.ins_tree_depth == 1)
 			goto out_add;
 	}
 
 	/* call ocfs2_add_branch to add the final part of the tree with
 	 * the new data. */
-	mlog(0, "ocfs2_allocate_extent: add branch. bh = %p\n", bh);
+	mlog(0, "add branch. bh = %p\n", bh);
 	status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh,
 				  meta_ac);
 	if (status < 0) {
@@ -900,11 +2413,12 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 	}
 
 out_add:
-	/* Finally, we can add clusters. */
-	status = ocfs2_do_insert_extent(osb, handle, inode, fe_bh,
-					start_blk, new_clusters);
+	/* Finally, we can add clusters. This might rotate the tree for us. */
+	status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert);
 	if (status < 0)
 		mlog_errno(status);
+	else
+		ocfs2_extent_map_insert_rec(inode, &rec);
 
 bail:
 	if (bh)
@@ -1447,168 +2961,389 @@ int ocfs2_truncate_log_init(struct ocfs2_super *osb)
  * block will be deleted, and if it will, what the new last extent
  * block will be so we can update his h_next_leaf_blk field, as well
  * as the dinodes i_last_eb_blk */
-static int ocfs2_find_new_last_ext_blk(struct ocfs2_super *osb,
-				       struct inode *inode,
-				       struct ocfs2_dinode *fe,
-				       u32 new_i_clusters,
-				       struct buffer_head *old_last_eb,
+static int ocfs2_find_new_last_ext_blk(struct inode *inode,
+				       unsigned int clusters_to_del,
+				       struct ocfs2_path *path,
 				       struct buffer_head **new_last_eb)
 {
-	int i, status = 0;
-	u64 block = 0;
+	int next_free, ret = 0;
+	u32 cpos;
+	struct ocfs2_extent_rec *rec;
 	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_list *el;
 	struct buffer_head *bh = NULL;
 
 	*new_last_eb = NULL;
 
-	if (!OCFS2_IS_VALID_DINODE(fe)) {
-		OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe);
-		status = -EIO;
-		goto bail;
-	}
-
 	/* we have no tree, so of course, no last_eb. */
-	if (!fe->id2.i_list.l_tree_depth)
-		goto bail;
+	if (!path->p_tree_depth)
+		goto out;
 
 	/* trunc to zero special case - this makes tree_depth = 0
 	 * regardless of what it is.  */
-	if (!new_i_clusters)
-		goto bail;
+	if (OCFS2_I(inode)->ip_clusters == clusters_to_del)
+		goto out;
 
-	eb = (struct ocfs2_extent_block *) old_last_eb->b_data;
-	el = &(eb->h_list);
+	el = path_leaf_el(path);
 	BUG_ON(!el->l_next_free_rec);
 
-	/* Make sure that this guy will actually be empty after we
-	 * clear away the data. */
-	if (le32_to_cpu(el->l_recs[0].e_cpos) < new_i_clusters)
-		goto bail;
+	/*
+	 * Make sure that this extent list will actually be empty
+	 * after we clear away the data. We can shortcut out if
+	 * there's more than one non-empty extent in the
+	 * list. Otherwise, a check of the remaining extent is
+	 * necessary.
+	 */
+	next_free = le16_to_cpu(el->l_next_free_rec);
+	rec = NULL;
+	if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+		if (next_free > 2)
+			goto out;
 
-	/* Ok, at this point, we know that last_eb will definitely
-	 * change, so lets traverse the tree and find the second to
-	 * last extent block. */
-	el = &(fe->id2.i_list);
-	/* go down the tree, */
-	do {
-		for(i = (le16_to_cpu(el->l_next_free_rec) - 1); i >= 0; i--) {
-			if (le32_to_cpu(el->l_recs[i].e_cpos) <
-			    new_i_clusters) {
-				block = le64_to_cpu(el->l_recs[i].e_blkno);
-				break;
-			}
+		/* We may have a valid extent in index 1, check it. */
+		if (next_free == 2)
+			rec = &el->l_recs[1];
+
+		/*
+		 * Fall through - no more nonempty extents, so we want
+		 * to delete this leaf.
+		 */
+	} else {
+		if (next_free > 1)
+			goto out;
+
+		rec = &el->l_recs[0];
+	}
+
+	if (rec) {
+		/*
+		 * Check it we'll only be trimming off the end of this
+		 * cluster.
+		 */
+		if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del)
+			goto out;
+	}
+
+	ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	eb = (struct ocfs2_extent_block *) bh->b_data;
+	el = &eb->h_list;
+	if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
+		OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
+		ret = -EROFS;
+		goto out;
+	}
+
+	*new_last_eb = bh;
+	get_bh(*new_last_eb);
+	mlog(0, "returning block %llu, (cpos: %u)\n",
+	     (unsigned long long)le64_to_cpu(eb->h_blkno), cpos);
+out:
+	brelse(bh);
+
+	return ret;
+}
+
+/*
+ * Trim some clusters off the rightmost edge of a tree. Only called
+ * during truncate.
+ *
+ * The caller needs to:
+ *   - start journaling of each path component.
+ *   - compute and fully set up any new last ext block
+ */
+static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path,
+			   handle_t *handle, struct ocfs2_truncate_context *tc,
+			   u32 clusters_to_del, u64 *delete_start)
+{
+	int ret, i, index = path->p_tree_depth;
+	u32 new_edge = 0;
+	u64 deleted_eb = 0;
+	struct buffer_head *bh;
+	struct ocfs2_extent_list *el;
+	struct ocfs2_extent_rec *rec;
+
+	*delete_start = 0;
+
+	while (index >= 0) {
+		bh = path->p_node[index].bh;
+		el = path->p_node[index].el;
+
+		mlog(0, "traveling tree (index = %d, block = %llu)\n",
+		     index,  (unsigned long long)bh->b_blocknr);
+
+		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
+
+		if (index !=
+		    (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has invalid ext. block %llu",
+				    inode->i_ino,
+				    (unsigned long long)bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
 		}
-		BUG_ON(i < 0);
 
-		if (bh) {
-			brelse(bh);
-			bh = NULL;
+find_tail_record:
+		i = le16_to_cpu(el->l_next_free_rec) - 1;
+		rec = &el->l_recs[i];
+
+		mlog(0, "Extent list before: record %d: (%u, %u, %llu), "
+		     "next = %u\n", i, le32_to_cpu(rec->e_cpos),
+		     ocfs2_rec_clusters(el, rec),
+		     (unsigned long long)le64_to_cpu(rec->e_blkno),
+		     le16_to_cpu(el->l_next_free_rec));
+
+		BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del);
+
+		if (le16_to_cpu(el->l_tree_depth) == 0) {
+			/*
+			 * If the leaf block contains a single empty
+			 * extent and no records, we can just remove
+			 * the block.
+			 */
+			if (i == 0 && ocfs2_is_empty_extent(rec)) {
+				memset(rec, 0,
+				       sizeof(struct ocfs2_extent_rec));
+				el->l_next_free_rec = cpu_to_le16(0);
+
+				goto delete;
+			}
+
+			/*
+			 * Remove any empty extents by shifting things
+			 * left. That should make life much easier on
+			 * the code below. This condition is rare
+			 * enough that we shouldn't see a performance
+			 * hit.
+			 */
+			if (ocfs2_is_empty_extent(&el->l_recs[0])) {
+				le16_add_cpu(&el->l_next_free_rec, -1);
+
+				for(i = 0;
+				    i < le16_to_cpu(el->l_next_free_rec); i++)
+					el->l_recs[i] = el->l_recs[i + 1];
+
+				memset(&el->l_recs[i], 0,
+				       sizeof(struct ocfs2_extent_rec));
+
+				/*
+				 * We've modified our extent list. The
+				 * simplest way to handle this change
+				 * is to being the search from the
+				 * start again.
+				 */
+				goto find_tail_record;
+			}
+
+			le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del);
+
+			/*
+			 * We'll use "new_edge" on our way back up the
+			 * tree to know what our rightmost cpos is.
+			 */
+			new_edge = le16_to_cpu(rec->e_leaf_clusters);
+			new_edge += le32_to_cpu(rec->e_cpos);
+
+			/*
+			 * The caller will use this to delete data blocks.
+			 */
+			*delete_start = le64_to_cpu(rec->e_blkno)
+				+ ocfs2_clusters_to_blocks(inode->i_sb,
+					le16_to_cpu(rec->e_leaf_clusters));
+
+			/*
+			 * If it's now empty, remove this record.
+			 */
+			if (le16_to_cpu(rec->e_leaf_clusters) == 0) {
+				memset(rec, 0,
+				       sizeof(struct ocfs2_extent_rec));
+				le16_add_cpu(&el->l_next_free_rec, -1);
+			}
+		} else {
+			if (le64_to_cpu(rec->e_blkno) == deleted_eb) {
+				memset(rec, 0,
+				       sizeof(struct ocfs2_extent_rec));
+				le16_add_cpu(&el->l_next_free_rec, -1);
+
+				goto delete;
+			}
+
+			/* Can this actually happen? */
+			if (le16_to_cpu(el->l_next_free_rec) == 0)
+				goto delete;
+
+			/*
+			 * We never actually deleted any clusters
+			 * because our leaf was empty. There's no
+			 * reason to adjust the rightmost edge then.
+			 */
+			if (new_edge == 0)
+				goto delete;
+
+			rec->e_int_clusters = cpu_to_le32(new_edge);
+			le32_add_cpu(&rec->e_int_clusters,
+				     -le32_to_cpu(rec->e_cpos));
+
+			 /*
+			  * A deleted child record should have been
+			  * caught above.
+			  */
+			 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0);
 		}
 
-		status = ocfs2_read_block(osb, block, &bh, OCFS2_BH_CACHED,
-					 inode);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
+delete:
+		ret = ocfs2_journal_dirty(handle, bh);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
 		}
-		eb = (struct ocfs2_extent_block *) bh->b_data;
-		el = &eb->h_list;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			status = -EIO;
-			goto bail;
+
+		mlog(0, "extent list container %llu, after: record %d: "
+		     "(%u, %u, %llu), next = %u.\n",
+		     (unsigned long long)bh->b_blocknr, i,
+		     le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec),
+		     (unsigned long long)le64_to_cpu(rec->e_blkno),
+		     le16_to_cpu(el->l_next_free_rec));
+
+		/*
+		 * We must be careful to only attempt delete of an
+		 * extent block (and not the root inode block).
+		 */
+		if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) {
+			struct ocfs2_extent_block *eb =
+				(struct ocfs2_extent_block *)bh->b_data;
+
+			/*
+			 * Save this for use when processing the
+			 * parent block.
+			 */
+			deleted_eb = le64_to_cpu(eb->h_blkno);
+
+			mlog(0, "deleting this extent block.\n");
+
+			ocfs2_remove_from_cache(inode, bh);
+
+			BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0]));
+			BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos));
+			BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno));
+
+			if (le16_to_cpu(eb->h_suballoc_slot) == 0) {
+				/*
+				 * This code only understands how to
+				 * lock the suballocator in slot 0,
+				 * which is fine because allocation is
+				 * only ever done out of that
+				 * suballocator too. A future version
+				 * might change that however, so avoid
+				 * a free if we don't know how to
+				 * handle it. This way an fs incompat
+				 * bit will not be necessary.
+				 */
+				ret = ocfs2_free_extent_block(handle,
+							      tc->tc_ext_alloc_inode,
+							      tc->tc_ext_alloc_bh,
+							      eb);
+
+				/* An error here is not fatal. */
+				if (ret < 0)
+					mlog_errno(ret);
+			}
+		} else {
+			deleted_eb = 0;
 		}
-	} while (el->l_tree_depth);
 
-	*new_last_eb = bh;
-	get_bh(*new_last_eb);
-	mlog(0, "returning block %llu\n",
-	     (unsigned long long)le64_to_cpu(eb->h_blkno));
-bail:
-	if (bh)
-		brelse(bh);
+		index--;
+	}
 
-	return status;
+	ret = 0;
+out:
+	return ret;
 }
 
 static int ocfs2_do_truncate(struct ocfs2_super *osb,
 			     unsigned int clusters_to_del,
 			     struct inode *inode,
 			     struct buffer_head *fe_bh,
-			     struct buffer_head *old_last_eb_bh,
 			     handle_t *handle,
-			     struct ocfs2_truncate_context *tc)
+			     struct ocfs2_truncate_context *tc,
+			     struct ocfs2_path *path)
 {
-	int status, i, depth;
+	int status;
 	struct ocfs2_dinode *fe;
-	struct ocfs2_extent_block *eb;
 	struct ocfs2_extent_block *last_eb = NULL;
 	struct ocfs2_extent_list *el;
-	struct buffer_head *eb_bh = NULL;
 	struct buffer_head *last_eb_bh = NULL;
-	u64 next_eb = 0;
 	u64 delete_blk = 0;
 
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
-	status = ocfs2_find_new_last_ext_blk(osb,
-					     inode,
-					     fe,
-					     le32_to_cpu(fe->i_clusters) -
-					     		clusters_to_del,
-					     old_last_eb_bh,
-					     &last_eb_bh);
+	status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del,
+					     path, &last_eb_bh);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
-	if (last_eb_bh)
-		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
 
-	status = ocfs2_journal_access(handle, inode, fe_bh,
-				      OCFS2_JOURNAL_ACCESS_WRITE);
+	/*
+	 * Each component will be touched, so we might as well journal
+	 * here to avoid having to handle errors later.
+	 */
+	status = ocfs2_journal_access_path(inode, handle, path);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
 	}
+
+	if (last_eb_bh) {
+		status = ocfs2_journal_access(handle, inode, last_eb_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto bail;
+		}
+
+		last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
+	}
+
 	el = &(fe->id2.i_list);
 
+	/*
+	 * Lower levels depend on this never happening, but it's best
+	 * to check it up here before changing the tree.
+	 */
+	if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %lu has an empty extent record, depth %u\n",
+			    inode->i_ino, le16_to_cpu(el->l_tree_depth));
+		status = -EROFS;
+		goto bail;
+	}
+
 	spin_lock(&OCFS2_I(inode)->ip_lock);
 	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) -
 				      clusters_to_del;
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 	le32_add_cpu(&fe->i_clusters, -clusters_to_del);
-	fe->i_mtime = cpu_to_le64(CURRENT_TIME.tv_sec);
-	fe->i_mtime_nsec = cpu_to_le32(CURRENT_TIME.tv_nsec);
-
-	i = le16_to_cpu(el->l_next_free_rec) - 1;
-
-	BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
-	le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
-	/* tree depth zero, we can just delete the clusters, otherwise
-	 * we need to record the offset of the next level extent block
-	 * as we may overwrite it. */
-	if (!el->l_tree_depth)
-		delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
-			+ ocfs2_clusters_to_blocks(osb->sb,
-					le32_to_cpu(el->l_recs[i].e_clusters));
-	else
-		next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
 
-	if (!el->l_recs[i].e_clusters) {
-		/* if we deleted the whole extent record, then clear
-		 * out the other fields and update the extent
-		 * list. For depth > 0 trees, we've already recorded
-		 * the extent block in 'next_eb' */
-		el->l_recs[i].e_cpos = 0;
-		el->l_recs[i].e_blkno = 0;
-		BUG_ON(!el->l_next_free_rec);
-		le16_add_cpu(&el->l_next_free_rec, -1);
+	status = ocfs2_trim_tree(inode, path, handle, tc,
+				 clusters_to_del, &delete_blk);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
 	}
 
-	depth = le16_to_cpu(el->l_tree_depth);
-	if (!fe->i_clusters) {
+	if (le32_to_cpu(fe->i_clusters) == 0) {
 		/* trunc to zero is a special case. */
 		el->l_tree_depth = 0;
 		fe->i_last_eb_blk = 0;
@@ -1625,12 +3360,6 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 		/* If there will be a new last extent block, then by
 		 * definition, there cannot be any leaves to the right of
 		 * him. */
-		status = ocfs2_journal_access(handle, inode, last_eb_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
 		last_eb->h_next_leaf_blk = 0;
 		status = ocfs2_journal_dirty(handle, last_eb_bh);
 		if (status < 0) {
@@ -1639,123 +3368,247 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb,
 		}
 	}
 
-	/* if our tree depth > 0, update all the tree blocks below us. */
-	while (depth) {
-		mlog(0, "traveling tree (depth = %d, next_eb = %llu)\n",
-		     depth,  (unsigned long long)next_eb);
-		status = ocfs2_read_block(osb, next_eb, &eb_bh,
-					  OCFS2_BH_CACHED, inode);
+	if (delete_blk) {
+		status = ocfs2_truncate_log_append(osb, handle, delete_blk,
+						   clusters_to_del);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
 		}
-		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			status = -EIO;
-			goto bail;
+	}
+	status = 0;
+bail:
+
+	mlog_exit(status);
+	return status;
+}
+
+static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+	set_buffer_uptodate(bh);
+	mark_buffer_dirty(bh);
+	return 0;
+}
+
+static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh)
+{
+	set_buffer_uptodate(bh);
+	mark_buffer_dirty(bh);
+	return ocfs2_journal_dirty_data(handle, bh);
+}
+
+static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize,
+				     struct page **pages, int numpages,
+				     u64 phys, handle_t *handle)
+{
+	int i, ret, partial = 0;
+	void *kaddr;
+	struct page *page;
+	unsigned int from, to = PAGE_CACHE_SIZE;
+	struct super_block *sb = inode->i_sb;
+
+	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+	if (numpages == 0)
+		goto out;
+
+	from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */
+	if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) {
+		/*
+		 * Since 'from' has been capped to a value below page
+		 * size, this calculation won't be able to overflow
+		 * 'to'
+		 */
+		to = ocfs2_align_bytes_to_clusters(sb, from);
+
+		/*
+		 * The truncate tail in this case should never contain
+		 * more than one page at maximum. The loop below also
+		 * assumes this.
+		 */
+		BUG_ON(numpages != 1);
+	}
+
+	for(i = 0; i < numpages; i++) {
+		page = pages[i];
+
+		BUG_ON(from > PAGE_CACHE_SIZE);
+		BUG_ON(to > PAGE_CACHE_SIZE);
+
+		ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0);
+		if (ret)
+			mlog_errno(ret);
+
+		kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr + from, 0, to - from);
+		kunmap_atomic(kaddr, KM_USER0);
+
+		/*
+		 * Need to set the buffers we zero'd into uptodate
+		 * here if they aren't - ocfs2_map_page_blocks()
+		 * might've skipped some
+		 */
+		if (ocfs2_should_order_data(inode)) {
+			ret = walk_page_buffers(handle,
+						page_buffers(page),
+						from, to, &partial,
+						ocfs2_ordered_zero_func);
+			if (ret < 0)
+				mlog_errno(ret);
+		} else {
+			ret = walk_page_buffers(handle, page_buffers(page),
+						from, to, &partial,
+						ocfs2_writeback_zero_func);
+			if (ret < 0)
+				mlog_errno(ret);
 		}
-		el = &(eb->h_list);
 
-		status = ocfs2_journal_access(handle, inode, eb_bh,
-					      OCFS2_JOURNAL_ACCESS_WRITE);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
+		if (!partial)
+			SetPageUptodate(page);
+
+		flush_dcache_page(page);
+
+		/*
+		 * Every page after the 1st one should be completely zero'd.
+		 */
+		from = 0;
+	}
+out:
+	if (pages) {
+		for (i = 0; i < numpages; i++) {
+			page = pages[i];
+			unlock_page(page);
+			mark_page_accessed(page);
+			page_cache_release(page);
 		}
+	}
+}
 
-		BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0);
-		BUG_ON(depth != (le16_to_cpu(el->l_tree_depth) + 1));
+static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages,
+				int *num, u64 *phys)
+{
+	int i, numpages = 0, ret = 0;
+	unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize;
+	unsigned int ext_flags;
+	struct super_block *sb = inode->i_sb;
+	struct address_space *mapping = inode->i_mapping;
+	unsigned long index;
+	u64 next_cluster_bytes;
+
+	BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb)));
+
+	/* Cluster boundary, so we don't need to grab any pages. */
+	if ((isize & (csize - 1)) == 0)
+		goto out;
 
-		i = le16_to_cpu(el->l_next_free_rec) - 1;
+	ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits,
+					  phys, NULL, &ext_flags);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
 
-		mlog(0, "extent block %llu, before: record %d: "
-		     "(%u, %u, %llu), next = %u\n",
-		     (unsigned long long)le64_to_cpu(eb->h_blkno), i,
-		     le32_to_cpu(el->l_recs[i].e_cpos),
-		     le32_to_cpu(el->l_recs[i].e_clusters),
-		     (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
-		     le16_to_cpu(el->l_next_free_rec));
+	/* Tail is a hole. */
+	if (*phys == 0)
+		goto out;
 
-		BUG_ON(le32_to_cpu(el->l_recs[i].e_clusters) < clusters_to_del);
-		le32_add_cpu(&el->l_recs[i].e_clusters, -clusters_to_del);
-
-		next_eb = le64_to_cpu(el->l_recs[i].e_blkno);
-		/* bottom-most block requires us to delete data.*/
-		if (!el->l_tree_depth)
-			delete_blk = le64_to_cpu(el->l_recs[i].e_blkno)
-				+ ocfs2_clusters_to_blocks(osb->sb,
-					le32_to_cpu(el->l_recs[i].e_clusters));
-		if (!el->l_recs[i].e_clusters) {
-			el->l_recs[i].e_cpos = 0;
-			el->l_recs[i].e_blkno = 0;
-			BUG_ON(!el->l_next_free_rec);
-			le16_add_cpu(&el->l_next_free_rec, -1);
-		}
-		mlog(0, "extent block %llu, after: record %d: "
-		     "(%u, %u, %llu), next = %u\n",
-		     (unsigned long long)le64_to_cpu(eb->h_blkno), i,
-		     le32_to_cpu(el->l_recs[i].e_cpos),
-		     le32_to_cpu(el->l_recs[i].e_clusters),
-		     (unsigned long long)le64_to_cpu(el->l_recs[i].e_blkno),
-		     le16_to_cpu(el->l_next_free_rec));
+	/* Tail is marked as unwritten, we can count on write to zero
+	 * in that case. */
+	if (ext_flags & OCFS2_EXT_UNWRITTEN)
+		goto out;
 
-		status = ocfs2_journal_dirty(handle, eb_bh);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
+	next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize);
+	index = isize >> PAGE_CACHE_SHIFT;
+	do {
+		pages[numpages] = grab_cache_page(mapping, index);
+		if (!pages[numpages]) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out;
 		}
 
-		if (!el->l_next_free_rec) {
-			mlog(0, "deleting this extent block.\n");
-
-			ocfs2_remove_from_cache(inode, eb_bh);
+		numpages++;
+		index++;
+	} while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT));
 
-			BUG_ON(el->l_recs[0].e_clusters);
-			BUG_ON(el->l_recs[0].e_cpos);
-			BUG_ON(el->l_recs[0].e_blkno);
-			if (eb->h_suballoc_slot == 0) {
-				/*
-				 * This code only understands how to
-				 * lock the suballocator in slot 0,
-				 * which is fine because allocation is
-				 * only ever done out of that
-				 * suballocator too. A future version
-				 * might change that however, so avoid
-				 * a free if we don't know how to
-				 * handle it. This way an fs incompat
-				 * bit will not be necessary.
-				 */
-				status = ocfs2_free_extent_block(handle,
-								 tc->tc_ext_alloc_inode,
-								 tc->tc_ext_alloc_bh,
-								 eb);
-				if (status < 0) {
-					mlog_errno(status);
-					goto bail;
+out:
+	if (ret != 0) {
+		if (pages) {
+			for (i = 0; i < numpages; i++) {
+				if (pages[i]) {
+					unlock_page(pages[i]);
+					page_cache_release(pages[i]);
 				}
 			}
 		}
-		brelse(eb_bh);
-		eb_bh = NULL;
-		depth--;
+		numpages = 0;
 	}
 
-	BUG_ON(!delete_blk);
-	status = ocfs2_truncate_log_append(osb, handle, delete_blk,
-					   clusters_to_del);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
+	*num = numpages;
+
+	return ret;
+}
+
+/*
+ * Zero the area past i_size but still within an allocated
+ * cluster. This avoids exposing nonzero data on subsequent file
+ * extends.
+ *
+ * We need to call this before i_size is updated on the inode because
+ * otherwise block_write_full_page() will skip writeout of pages past
+ * i_size. The new_i_size parameter is passed for this reason.
+ */
+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+				 u64 new_i_size)
+{
+	int ret, numpages;
+	loff_t endbyte;
+	struct page **pages = NULL;
+	u64 phys;
+
+	/*
+	 * File systems which don't support sparse files zero on every
+	 * extend.
+	 */
+	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+		return 0;
+
+	pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb),
+			sizeof(struct page *), GFP_NOFS);
+	if (pages == NULL) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		goto out;
 	}
-	status = 0;
-bail:
-	if (!status)
-		ocfs2_extent_map_trunc(inode, le32_to_cpu(fe->i_clusters));
-	else
-		ocfs2_extent_map_drop(inode, 0);
-	mlog_exit(status);
-	return status;
+
+	ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	if (numpages == 0)
+		goto out;
+
+	ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys,
+				 handle);
+
+	/*
+	 * Initiate writeout of the pages we zero'd here. We don't
+	 * wait on them - the truncate_inode_pages() call later will
+	 * do that for us.
+	 */
+	endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size);
+	ret = do_sync_mapping_range(inode->i_mapping, new_i_size,
+				    endbyte - 1, SYNC_FILE_RANGE_WRITE);
+	if (ret)
+		mlog_errno(ret);
+
+out:
+	if (pages)
+		kfree(pages);
+
+	return ret;
 }
 
 /*
@@ -1770,82 +3623,90 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 			  struct ocfs2_truncate_context *tc)
 {
 	int status, i, credits, tl_sem = 0;
-	u32 clusters_to_del, target_i_clusters;
-	u64 last_eb = 0;
-	struct ocfs2_dinode *fe;
-	struct ocfs2_extent_block *eb;
+	u32 clusters_to_del, new_highest_cpos, range;
 	struct ocfs2_extent_list *el;
-	struct buffer_head *last_eb_bh;
 	handle_t *handle = NULL;
 	struct inode *tl_inode = osb->osb_tl_inode;
+	struct ocfs2_path *path = NULL;
 
 	mlog_entry_void();
 
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 
-	target_i_clusters = ocfs2_clusters_for_bytes(osb->sb,
+	new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb,
 						     i_size_read(inode));
 
-	last_eb_bh = tc->tc_last_eb_bh;
-	tc->tc_last_eb_bh = NULL;
+	path = ocfs2_new_inode_path(fe_bh);
+	if (!path) {
+		status = -ENOMEM;
+		mlog_errno(status);
+		goto bail;
+	}
 
-	fe = (struct ocfs2_dinode *) fe_bh->b_data;
+	ocfs2_extent_map_trunc(inode, new_highest_cpos);
 
-	if (fe->id2.i_list.l_tree_depth) {
-		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-		el = &eb->h_list;
-	} else
-		el = &fe->id2.i_list;
-	last_eb = le64_to_cpu(fe->i_last_eb_blk);
 start:
-	mlog(0, "ocfs2_commit_truncate: fe->i_clusters = %u, "
-	     "last_eb = %llu, fe->i_last_eb_blk = %llu, "
-	     "fe->id2.i_list.l_tree_depth = %u last_eb_bh = %p\n",
-	     le32_to_cpu(fe->i_clusters), (unsigned long long)last_eb,
-	     (unsigned long long)le64_to_cpu(fe->i_last_eb_blk),
-	     le16_to_cpu(fe->id2.i_list.l_tree_depth), last_eb_bh);
-
-	if (last_eb != le64_to_cpu(fe->i_last_eb_blk)) {
-		mlog(0, "last_eb changed!\n");
-		BUG_ON(!fe->id2.i_list.l_tree_depth);
-		last_eb = le64_to_cpu(fe->i_last_eb_blk);
-		/* i_last_eb_blk may have changed, read it if
-		 * necessary. We don't have to worry about the
-		 * truncate to zero case here (where there becomes no
-		 * last_eb) because we never loop back after our work
-		 * is done. */
-		if (last_eb_bh) {
-			brelse(last_eb_bh);
-			last_eb_bh = NULL;
-		}
+	/*
+	 * Check that we still have allocation to delete.
+	 */
+	if (OCFS2_I(inode)->ip_clusters == 0) {
+		status = 0;
+		goto bail;
+	}
 
-		status = ocfs2_read_block(osb, last_eb,
-					  &last_eb_bh, OCFS2_BH_CACHED,
-					  inode);
-		if (status < 0) {
-			mlog_errno(status);
-			goto bail;
-		}
-		eb = (struct ocfs2_extent_block *) last_eb_bh->b_data;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			status = -EIO;
-			goto bail;
-		}
-		el = &(eb->h_list);
+	/*
+	 * Truncate always works against the rightmost tree branch.
+	 */
+	status = ocfs2_find_path(inode, path, UINT_MAX);
+	if (status) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n",
+	     OCFS2_I(inode)->ip_clusters, path->p_tree_depth);
+
+	/*
+	 * By now, el will point to the extent list on the bottom most
+	 * portion of this tree. Only the tail record is considered in
+	 * each pass.
+	 *
+	 * We handle the following cases, in order:
+	 * - empty extent: delete the remaining branch
+	 * - remove the entire record
+	 * - remove a partial record
+	 * - no record needs to be removed (truncate has completed)
+	 */
+	el = path_leaf_el(path);
+	if (le16_to_cpu(el->l_next_free_rec) == 0) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %llu has empty extent block at %llu\n",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    (unsigned long long)path_leaf_bh(path)->b_blocknr);
+		status = -EROFS;
+		goto bail;
 	}
 
-	/* by now, el will point to the extent list on the bottom most
-	 * portion of this tree. */
 	i = le16_to_cpu(el->l_next_free_rec) - 1;
-	if (le32_to_cpu(el->l_recs[i].e_cpos) >= target_i_clusters)
-		clusters_to_del = le32_to_cpu(el->l_recs[i].e_clusters);
-	else
-		clusters_to_del = (le32_to_cpu(el->l_recs[i].e_clusters) +
+	range = le32_to_cpu(el->l_recs[i].e_cpos) +
+		ocfs2_rec_clusters(el, &el->l_recs[i]);
+	if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) {
+		clusters_to_del = 0;
+	} else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) {
+		clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]);
+	} else if (range > new_highest_cpos) {
+		clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) +
 				   le32_to_cpu(el->l_recs[i].e_cpos)) -
-				  target_i_clusters;
+				  new_highest_cpos;
+	} else {
+		status = 0;
+		goto bail;
+	}
 
-	mlog(0, "clusters_to_del = %u in this pass\n", clusters_to_del);
+	mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n",
+	     clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr);
+
+	BUG_ON(clusters_to_del == 0);
 
 	mutex_lock(&tl_inode->i_mutex);
 	tl_sem = 1;
@@ -1861,7 +3722,8 @@ start:
 	}
 
 	credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del,
-						fe, el);
+						(struct ocfs2_dinode *)fe_bh->b_data,
+						el);
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
 		status = PTR_ERR(handle);
@@ -1870,13 +3732,8 @@ start:
 		goto bail;
 	}
 
-	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
-	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
-	if (status < 0)
-		mlog_errno(status);
-
-	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh,
-				   last_eb_bh, handle, tc);
+	status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle,
+				   tc, path);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1888,9 +3745,14 @@ start:
 	ocfs2_commit_trans(osb, handle);
 	handle = NULL;
 
-	BUG_ON(le32_to_cpu(fe->i_clusters) < target_i_clusters);
-	if (le32_to_cpu(fe->i_clusters) > target_i_clusters)
-		goto start;
+	ocfs2_reinit_path(path, 1);
+
+	/*
+	 * The check above will catch the case where we've truncated
+	 * away all allocation.
+	 */
+	goto start;
+
 bail:
 	up_write(&OCFS2_I(inode)->ip_alloc_sem);
 
@@ -1902,8 +3764,7 @@ bail:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
 
-	if (last_eb_bh)
-		brelse(last_eb_bh);
+	ocfs2_free_path(path);
 
 	/* This will drop the ext_alloc cluster lock for us */
 	ocfs2_free_truncate_context(tc);
@@ -1912,7 +3773,6 @@ bail:
 	return status;
 }
 
-
 /*
  * Expects the inode to already be locked. This will figure out which
  * inodes need to be locked and will put them on the returned truncate
@@ -1923,7 +3783,7 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 			   struct buffer_head *fe_bh,
 			   struct ocfs2_truncate_context **tc)
 {
-	int status, metadata_delete;
+	int status, metadata_delete, i;
 	unsigned int new_i_clusters;
 	struct ocfs2_dinode *fe;
 	struct ocfs2_extent_block *eb;
@@ -1944,21 +3804,6 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 	     "%llu\n", fe->i_clusters, new_i_clusters,
 	     (unsigned long long)fe->i_size);
 
-	if (le32_to_cpu(fe->i_clusters) <= new_i_clusters) {
-		ocfs2_error(inode->i_sb, "Dinode %llu has cluster count "
-			    "%u and size %llu whereas struct inode has "
-			    "cluster count %u and size %llu which caused an "
-			    "invalid truncate to %u clusters.",
-			    (unsigned long long)le64_to_cpu(fe->i_blkno),
-			    le32_to_cpu(fe->i_clusters),
-			    (unsigned long long)le64_to_cpu(fe->i_size),
-			    OCFS2_I(inode)->ip_clusters, i_size_read(inode),
-			    new_i_clusters);
-		mlog_meta_lvb(ML_ERROR, &OCFS2_I(inode)->ip_meta_lockres);
-		status = -EIO;
-		goto bail;
-	}
-
 	*tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL);
 	if (!(*tc)) {
 		status = -ENOMEM;
@@ -1986,7 +3831,15 @@ int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 			goto bail;
 		}
 		el = &(eb->h_list);
-		if (le32_to_cpu(el->l_recs[0].e_cpos) >= new_i_clusters)
+
+		i = 0;
+		if (ocfs2_is_empty_extent(&el->l_recs[0]))
+			i = 1;
+		/*
+		 * XXX: Should we check that next_free_rec contains
+		 * the extent?
+		 */
+		if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters)
 			metadata_delete = 1;
 	}
 
diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h
index 0b82e8044325..fbcb5934a081 100644
--- a/fs/ocfs2/alloc.h
+++ b/fs/ocfs2/alloc.h
@@ -31,7 +31,8 @@ int ocfs2_insert_extent(struct ocfs2_super *osb,
 			handle_t *handle,
 			struct inode *inode,
 			struct buffer_head *fe_bh,
-			u64 blkno,
+			u32 cpos,
+			u64 start_blk,
 			u32 new_clusters,
 			struct ocfs2_alloc_context *meta_ac);
 int ocfs2_num_free_extents(struct ocfs2_super *osb,
@@ -70,6 +71,8 @@ struct ocfs2_truncate_context {
 	struct buffer_head *tc_last_eb_bh;
 };
 
+int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle,
+				 u64 new_i_size);
 int ocfs2_prepare_truncate(struct ocfs2_super *osb,
 			   struct inode *inode,
 			   struct buffer_head *fe_bh,
@@ -79,4 +82,26 @@ int ocfs2_commit_truncate(struct ocfs2_super *osb,
 			  struct buffer_head *fe_bh,
 			  struct ocfs2_truncate_context *tc);
 
+int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el,
+		    u32 cpos, struct buffer_head **leaf_bh);
+
+/*
+ * Helper function to look at the # of clusters in an extent record.
+ */
+static inline unsigned int ocfs2_rec_clusters(struct ocfs2_extent_list *el,
+					      struct ocfs2_extent_rec *rec)
+{
+	/*
+	 * Cluster count in extent records is slightly different
+	 * between interior nodes and leaf nodes. This is to support
+	 * unwritten extents which need a flags field in leaf node
+	 * records, thus shrinking the available space for a clusters
+	 * field.
+	 */
+	if (el->l_tree_depth)
+		return le32_to_cpu(rec->e_int_clusters);
+	else
+		return le16_to_cpu(rec->e_leaf_clusters);
+}
+
 #endif /* OCFS2_ALLOC_H */
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index 875c11443817..56963e6c46c0 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -24,6 +24,8 @@
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
 #include <asm/byteorder.h>
+#include <linux/swap.h>
+#include <linux/pipe_fs_i.h>
 
 #define MLOG_MASK_PREFIX ML_FILE_IO
 #include <cluster/masklog.h>
@@ -37,6 +39,7 @@
 #include "file.h"
 #include "inode.h"
 #include "journal.h"
+#include "suballoc.h"
 #include "super.h"
 #include "symlink.h"
 
@@ -134,7 +137,9 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 			   struct buffer_head *bh_result, int create)
 {
 	int err = 0;
+	unsigned int ext_flags;
 	u64 p_blkno, past_eof;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	mlog_entry("(0x%p, %llu, 0x%p, %d)\n", inode,
 		   (unsigned long long)iblock, bh_result, create);
@@ -149,17 +154,8 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	/* this can happen if another node truncs after our extend! */
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	if (iblock >= ocfs2_clusters_to_blocks(inode->i_sb,
-					       OCFS2_I(inode)->ip_clusters))
-		err = -EIO;
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-	if (err)
-		goto bail;
-
-	err = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
-					  NULL);
+	err = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno, NULL,
+					  &ext_flags);
 	if (err) {
 		mlog(ML_ERROR, "Error %d from get_blocks(0x%p, %llu, 1, "
 		     "%llu, NULL)\n", err, inode, (unsigned long long)iblock,
@@ -167,22 +163,39 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	map_bh(bh_result, inode->i_sb, p_blkno);
-
-	if (bh_result->b_blocknr == 0) {
-		err = -EIO;
-		mlog(ML_ERROR, "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
-		     (unsigned long long)iblock,
-		     (unsigned long long)p_blkno,
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
-	}
+	/*
+	 * ocfs2 never allocates in this function - the only time we
+	 * need to use BH_New is when we're extending i_size on a file
+	 * system which doesn't support holes, in which case BH_New
+	 * allows block_prepare_write() to zero.
+	 */
+	mlog_bug_on_msg(create && p_blkno == 0 && ocfs2_sparse_alloc(osb),
+			"ino %lu, iblock %llu\n", inode->i_ino,
+			(unsigned long long)iblock);
+
+	/* Treat the unwritten extent as a hole for zeroing purposes. */
+	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
+		map_bh(bh_result, inode->i_sb, p_blkno);
+
+	if (!ocfs2_sparse_alloc(osb)) {
+		if (p_blkno == 0) {
+			err = -EIO;
+			mlog(ML_ERROR,
+			     "iblock = %llu p_blkno = %llu blkno=(%llu)\n",
+			     (unsigned long long)iblock,
+			     (unsigned long long)p_blkno,
+			     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+			mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters);
+			dump_stack();
+		}
 
-	past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
-	mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
-	     (unsigned long long)past_eof);
+		past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
+		mlog(0, "Inode %lu, past_eof = %llu\n", inode->i_ino,
+		     (unsigned long long)past_eof);
 
-	if (create && (iblock >= past_eof))
-		set_buffer_new(bh_result);
+		if (create && (iblock >= past_eof))
+			set_buffer_new(bh_result);
+	}
 
 bail:
 	if (err < 0)
@@ -276,8 +289,11 @@ static int ocfs2_writepage(struct page *page, struct writeback_control *wbc)
 	return ret;
 }
 
-/* This can also be called from ocfs2_write_zero_page() which has done
- * it's own cluster locking. */
+/*
+ * This is called from ocfs2_write_zero_page() which has handled it's
+ * own cluster locking and has ensured allocation exists for those
+ * blocks to be written.
+ */
 int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
 			       unsigned from, unsigned to)
 {
@@ -292,44 +308,17 @@ int ocfs2_prepare_write_nolock(struct inode *inode, struct page *page,
 	return ret;
 }
 
-/*
- * ocfs2_prepare_write() can be an outer-most ocfs2 call when it is called
- * from loopback.  It must be able to perform its own locking around
- * ocfs2_get_block().
- */
-static int ocfs2_prepare_write(struct file *file, struct page *page,
-			       unsigned from, unsigned to)
-{
-	struct inode *inode = page->mapping->host;
-	int ret;
-
-	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
-
-	ret = ocfs2_meta_lock_with_page(inode, NULL, 0, page);
-	if (ret != 0) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_prepare_write_nolock(inode, page, from, to);
-
-	ocfs2_meta_unlock(inode, 0);
-out:
-	mlog_exit(ret);
-	return ret;
-}
-
 /* Taken from ext3. We don't necessarily need the full blown
  * functionality yet, but IMHO it's better to cut and paste the whole
  * thing so we can avoid introducing our own bugs (and easily pick up
  * their fixes when they happen) --Mark */
-static int walk_page_buffers(	handle_t *handle,
-				struct buffer_head *head,
-				unsigned from,
-				unsigned to,
-				int *partial,
-				int (*fn)(	handle_t *handle,
-						struct buffer_head *bh))
+int walk_page_buffers(	handle_t *handle,
+			struct buffer_head *head,
+			unsigned from,
+			unsigned to,
+			int *partial,
+			int (*fn)(	handle_t *handle,
+					struct buffer_head *bh))
 {
 	struct buffer_head *bh;
 	unsigned block_start, block_end;
@@ -388,95 +377,6 @@ out:
 	return handle;
 }
 
-static int ocfs2_commit_write(struct file *file, struct page *page,
-			      unsigned from, unsigned to)
-{
-	int ret;
-	struct buffer_head *di_bh = NULL;
-	struct inode *inode = page->mapping->host;
-	handle_t *handle = NULL;
-	struct ocfs2_dinode *di;
-
-	mlog_entry("(0x%p, 0x%p, %u, %u)\n", file, page, from, to);
-
-	/* NOTE: ocfs2_file_aio_write has ensured that it's safe for
-	 * us to continue here without rechecking the I/O against
-	 * changed inode values.
-	 *
-	 * 1) We're currently holding the inode alloc lock, so no
-	 *    nodes can change it underneath us.
-	 *
-	 * 2) We've had to take the metadata lock at least once
-	 *    already to check for extending writes, suid removal, etc.
-	 *    The meta data update code then ensures that we don't get a
-	 *    stale inode allocation image (i_size, i_clusters, etc).
-	 */
-
-	ret = ocfs2_meta_lock_with_page(inode, &di_bh, 1, page);
-	if (ret != 0) {
-		mlog_errno(ret);
-		goto out;
-	}
-
-	ret = ocfs2_data_lock_with_page(inode, 1, page);
-	if (ret != 0) {
-		mlog_errno(ret);
-		goto out_unlock_meta;
-	}
-
-	handle = ocfs2_start_walk_page_trans(inode, page, from, to);
-	if (IS_ERR(handle)) {
-		ret = PTR_ERR(handle);
-		goto out_unlock_data;
-	}
-
-	/* Mark our buffer early. We'd rather catch this error up here
-	 * as opposed to after a successful commit_write which would
-	 * require us to set back inode->i_size. */
-	ret = ocfs2_journal_access(handle, inode, di_bh,
-				   OCFS2_JOURNAL_ACCESS_WRITE);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	/* might update i_size */
-	ret = generic_commit_write(file, page, from, to);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-	di = (struct ocfs2_dinode *)di_bh->b_data;
-
-	/* ocfs2_mark_inode_dirty() is too heavy to use here. */
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
-	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
-	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
-
-	inode->i_blocks = ocfs2_align_bytes_to_sectors((u64)(i_size_read(inode)));
-	di->i_size = cpu_to_le64((u64)i_size_read(inode));
-
-	ret = ocfs2_journal_dirty(handle, di_bh);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out_commit;
-	}
-
-out_commit:
-	ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
-out_unlock_data:
-	ocfs2_data_unlock(inode, 1);
-out_unlock_meta:
-	ocfs2_meta_unlock(inode, 1);
-out:
-	if (di_bh)
-		brelse(di_bh);
-
-	mlog_exit(ret);
-	return ret;
-}
-
 static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 {
 	sector_t status;
@@ -499,8 +399,7 @@ static sector_t ocfs2_bmap(struct address_space *mapping, sector_t block)
 		down_read(&OCFS2_I(inode)->ip_alloc_sem);
 	}
 
-	err = ocfs2_extent_map_get_blocks(inode, block, 1, &p_blkno,
-					  NULL);
+	err = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL, NULL);
 
 	if (!INODE_JOURNAL(inode)) {
 		up_read(&OCFS2_I(inode)->ip_alloc_sem);
@@ -540,8 +439,8 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 				     struct buffer_head *bh_result, int create)
 {
 	int ret;
-	u64 p_blkno, inode_blocks;
-	int contig_blocks;
+	u64 p_blkno, inode_blocks, contig_blocks;
+	unsigned int ext_flags;
 	unsigned char blocksize_bits = inode->i_sb->s_blocksize_bits;
 	unsigned long max_blocks = bh_result->b_size >> inode->i_blkbits;
 
@@ -549,33 +448,20 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 	 * nicely aligned and of the right size, so there's no need
 	 * for us to check any of that. */
 
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	inode_blocks = ocfs2_clusters_to_blocks(inode->i_sb,
-						OCFS2_I(inode)->ip_clusters);
-
-	/*
-	 * For a read which begins past the end of file, we return a hole.
-	 */
-	if (!create && (iblock >= inode_blocks)) {
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-		ret = 0;
-		goto bail;
-	}
+	inode_blocks = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode));
 
 	/*
 	 * Any write past EOF is not allowed because we'd be extending.
 	 */
 	if (create && (iblock + max_blocks) > inode_blocks) {
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
 		ret = -EIO;
 		goto bail;
 	}
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
 
 	/* This figures out the size of the next contiguous block, and
 	 * our logical offset */
-	ret = ocfs2_extent_map_get_blocks(inode, iblock, 1, &p_blkno,
-					  &contig_blocks);
+	ret = ocfs2_extent_map_get_blocks(inode, iblock, &p_blkno,
+					  &contig_blocks, &ext_flags);
 	if (ret) {
 		mlog(ML_ERROR, "get_blocks() failed iblock=%llu\n",
 		     (unsigned long long)iblock);
@@ -583,7 +469,37 @@ static int ocfs2_direct_IO_get_blocks(struct inode *inode, sector_t iblock,
 		goto bail;
 	}
 
-	map_bh(bh_result, inode->i_sb, p_blkno);
+	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)) && !p_blkno) {
+		ocfs2_error(inode->i_sb,
+			    "Inode %llu has a hole at block %llu\n",
+			    (unsigned long long)OCFS2_I(inode)->ip_blkno,
+			    (unsigned long long)iblock);
+		ret = -EROFS;
+		goto bail;
+	}
+
+	/*
+	 * get_more_blocks() expects us to describe a hole by clearing
+	 * the mapped bit on bh_result().
+	 *
+	 * Consider an unwritten extent as a hole.
+	 */
+	if (p_blkno && !(ext_flags & OCFS2_EXT_UNWRITTEN))
+		map_bh(bh_result, inode->i_sb, p_blkno);
+	else {
+		/*
+		 * ocfs2_prepare_inode_for_write() should have caught
+		 * the case where we'd be filling a hole and triggered
+		 * a buffered write instead.
+		 */
+		if (create) {
+			ret = -EIO;
+			mlog_errno(ret);
+			goto bail;
+		}
+
+		clear_buffer_mapped(bh_result);
+	}
 
 	/* make sure we don't map more than max_blocks blocks here as
 	   that's all the kernel will handle at this point. */
@@ -606,12 +522,17 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
 			     void *private)
 {
 	struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
+	int level;
 
 	/* this io's submitter should not have unlocked this before we could */
 	BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
+
 	ocfs2_iocb_clear_rw_locked(iocb);
-	up_read(&inode->i_alloc_sem);
-	ocfs2_rw_unlock(inode, 0);
+
+	level = ocfs2_iocb_rw_locked_level(iocb);
+	if (!level)
+		up_read(&inode->i_alloc_sem);
+	ocfs2_rw_unlock(inode, level);
 }
 
 /*
@@ -647,23 +568,27 @@ static ssize_t ocfs2_direct_IO(int rw,
 
 	mlog_entry_void();
 
-	/*
-	 * We get PR data locks even for O_DIRECT.  This allows
-	 * concurrent O_DIRECT I/O but doesn't let O_DIRECT with
-	 * extending and buffered zeroing writes race.  If they did
-	 * race then the buffered zeroing could be written back after
-	 * the O_DIRECT I/O.  It's one thing to tell people not to mix
-	 * buffered and O_DIRECT writes, but expecting them to
-	 * understand that file extension is also an implicit buffered
-	 * write is too much.  By getting the PR we force writeback of
-	 * the buffered zeroing before proceeding.
-	 */
-	ret = ocfs2_data_lock(inode, 0);
-	if (ret < 0) {
-		mlog_errno(ret);
-		goto out;
+	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+		/*
+		 * We get PR data locks even for O_DIRECT.  This
+		 * allows concurrent O_DIRECT I/O but doesn't let
+		 * O_DIRECT with extending and buffered zeroing writes
+		 * race.  If they did race then the buffered zeroing
+		 * could be written back after the O_DIRECT I/O.  It's
+		 * one thing to tell people not to mix buffered and
+		 * O_DIRECT writes, but expecting them to understand
+		 * that file extension is also an implicit buffered
+		 * write is too much.  By getting the PR we force
+		 * writeback of the buffered zeroing before
+		 * proceeding.
+		 */
+		ret = ocfs2_data_lock(inode, 0);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+		ocfs2_data_unlock(inode, 0);
 	}
-	ocfs2_data_unlock(inode, 0);
 
 	ret = blockdev_direct_IO_no_locking(rw, iocb, inode,
 					    inode->i_sb->s_bdev, iov, offset,
@@ -675,11 +600,715 @@ out:
 	return ret;
 }
 
+static void ocfs2_figure_cluster_boundaries(struct ocfs2_super *osb,
+					    u32 cpos,
+					    unsigned int *start,
+					    unsigned int *end)
+{
+	unsigned int cluster_start = 0, cluster_end = PAGE_CACHE_SIZE;
+
+	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits)) {
+		unsigned int cpp;
+
+		cpp = 1 << (PAGE_CACHE_SHIFT - osb->s_clustersize_bits);
+
+		cluster_start = cpos % cpp;
+		cluster_start = cluster_start << osb->s_clustersize_bits;
+
+		cluster_end = cluster_start + osb->s_clustersize;
+	}
+
+	BUG_ON(cluster_start > PAGE_SIZE);
+	BUG_ON(cluster_end > PAGE_SIZE);
+
+	if (start)
+		*start = cluster_start;
+	if (end)
+		*end = cluster_end;
+}
+
+/*
+ * 'from' and 'to' are the region in the page to avoid zeroing.
+ *
+ * If pagesize > clustersize, this function will avoid zeroing outside
+ * of the cluster boundary.
+ *
+ * from == to == 0 is code for "zero the entire cluster region"
+ */
+static void ocfs2_clear_page_regions(struct page *page,
+				     struct ocfs2_super *osb, u32 cpos,
+				     unsigned from, unsigned to)
+{
+	void *kaddr;
+	unsigned int cluster_start, cluster_end;
+
+	ocfs2_figure_cluster_boundaries(osb, cpos, &cluster_start, &cluster_end);
+
+	kaddr = kmap_atomic(page, KM_USER0);
+
+	if (from || to) {
+		if (from > cluster_start)
+			memset(kaddr + cluster_start, 0, from - cluster_start);
+		if (to < cluster_end)
+			memset(kaddr + to, 0, cluster_end - to);
+	} else {
+		memset(kaddr + cluster_start, 0, cluster_end - cluster_start);
+	}
+
+	kunmap_atomic(kaddr, KM_USER0);
+}
+
+/*
+ * Some of this taken from block_prepare_write(). We already have our
+ * mapping by now though, and the entire write will be allocating or
+ * it won't, so not much need to use BH_New.
+ *
+ * This will also skip zeroing, which is handled externally.
+ */
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+			  struct inode *inode, unsigned int from,
+			  unsigned int to, int new)
+{
+	int ret = 0;
+	struct buffer_head *head, *bh, *wait[2], **wait_bh = wait;
+	unsigned int block_end, block_start;
+	unsigned int bsize = 1 << inode->i_blkbits;
+
+	if (!page_has_buffers(page))
+		create_empty_buffers(page, bsize, 0);
+
+	head = page_buffers(page);
+	for (bh = head, block_start = 0; bh != head || !block_start;
+	     bh = bh->b_this_page, block_start += bsize) {
+		block_end = block_start + bsize;
+
+		/*
+		 * Ignore blocks outside of our i/o range -
+		 * they may belong to unallocated clusters.
+		 */
+		if (block_start >= to || block_end <= from) {
+			if (PageUptodate(page))
+				set_buffer_uptodate(bh);
+			continue;
+		}
+
+		/*
+		 * For an allocating write with cluster size >= page
+		 * size, we always write the entire page.
+		 */
+
+		if (buffer_new(bh))
+			clear_buffer_new(bh);
+
+		if (!buffer_mapped(bh)) {
+			map_bh(bh, inode->i_sb, *p_blkno);
+			unmap_underlying_metadata(bh->b_bdev, bh->b_blocknr);
+		}
+
+		if (PageUptodate(page)) {
+			if (!buffer_uptodate(bh))
+				set_buffer_uptodate(bh);
+		} else if (!buffer_uptodate(bh) && !buffer_delay(bh) &&
+		     (block_start < from || block_end > to)) {
+			ll_rw_block(READ, 1, &bh);
+			*wait_bh++=bh;
+		}
+
+		*p_blkno = *p_blkno + 1;
+	}
+
+	/*
+	 * If we issued read requests - let them complete.
+	 */
+	while(wait_bh > wait) {
+		wait_on_buffer(*--wait_bh);
+		if (!buffer_uptodate(*wait_bh))
+			ret = -EIO;
+	}
+
+	if (ret == 0 || !new)
+		return ret;
+
+	/*
+	 * If we get -EIO above, zero out any newly allocated blocks
+	 * to avoid exposing stale data.
+	 */
+	bh = head;
+	block_start = 0;
+	do {
+		void *kaddr;
+
+		block_end = block_start + bsize;
+		if (block_end <= from)
+			goto next_bh;
+		if (block_start >= to)
+			break;
+
+		kaddr = kmap_atomic(page, KM_USER0);
+		memset(kaddr+block_start, 0, bh->b_size);
+		flush_dcache_page(page);
+		kunmap_atomic(kaddr, KM_USER0);
+		set_buffer_uptodate(bh);
+		mark_buffer_dirty(bh);
+
+next_bh:
+		block_start = block_end;
+		bh = bh->b_this_page;
+	} while (bh != head);
+
+	return ret;
+}
+
+/*
+ * This will copy user data from the buffer page in the splice
+ * context.
+ *
+ * For now, we ignore SPLICE_F_MOVE as that would require some extra
+ * communication out all the way to ocfs2_write().
+ */
+int ocfs2_map_and_write_splice_data(struct inode *inode,
+				  struct ocfs2_write_ctxt *wc, u64 *p_blkno,
+				  unsigned int *ret_from, unsigned int *ret_to)
+{
+	int ret;
+	unsigned int to, from, cluster_start, cluster_end;
+	char *src, *dst;
+	struct ocfs2_splice_write_priv *sp = wc->w_private;
+	struct pipe_buffer *buf = sp->s_buf;
+	unsigned long bytes, src_from;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
+					&cluster_end);
+
+	from = sp->s_offset;
+	src_from = sp->s_buf_offset;
+	bytes = wc->w_count;
+
+	if (wc->w_large_pages) {
+		/*
+		 * For cluster size < page size, we have to
+		 * calculate pos within the cluster and obey
+		 * the rightmost boundary.
+		 */
+		bytes = min(bytes, (unsigned long)(osb->s_clustersize
+				   - (wc->w_pos & (osb->s_clustersize - 1))));
+	}
+	to = from + bytes;
+
+	if (wc->w_this_page_new)
+		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+					    cluster_start, cluster_end, 1);
+	else
+		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+					    from, to, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(from > PAGE_CACHE_SIZE);
+	BUG_ON(to > PAGE_CACHE_SIZE);
+	BUG_ON(from > osb->s_clustersize);
+	BUG_ON(to > osb->s_clustersize);
+
+	src = buf->ops->map(sp->s_pipe, buf, 1);
+	dst = kmap_atomic(wc->w_this_page, KM_USER1);
+	memcpy(dst + from, src + src_from, bytes);
+	kunmap_atomic(wc->w_this_page, KM_USER1);
+	buf->ops->unmap(sp->s_pipe, buf, src);
+
+	wc->w_finished_copy = 1;
+
+	*ret_from = from;
+	*ret_to = to;
+out:
+
+	return bytes ? (unsigned int)bytes : ret;
+}
+
+/*
+ * This will copy user data from the iovec in the buffered write
+ * context.
+ */
+int ocfs2_map_and_write_user_data(struct inode *inode,
+				  struct ocfs2_write_ctxt *wc, u64 *p_blkno,
+				  unsigned int *ret_from, unsigned int *ret_to)
+{
+	int ret;
+	unsigned int to, from, cluster_start, cluster_end;
+	unsigned long bytes, src_from;
+	char *dst;
+	struct ocfs2_buffered_write_priv *bp = wc->w_private;
+	const struct iovec *cur_iov = bp->b_cur_iov;
+	char __user *buf;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	ocfs2_figure_cluster_boundaries(osb, wc->w_cpos, &cluster_start,
+					&cluster_end);
+
+	buf = cur_iov->iov_base + bp->b_cur_off;
+	src_from = (unsigned long)buf & ~PAGE_CACHE_MASK;
+
+	from = wc->w_pos & (PAGE_CACHE_SIZE - 1);
+
+	/*
+	 * This is a lot of comparisons, but it reads quite
+	 * easily, which is important here.
+	 */
+	/* Stay within the src page */
+	bytes = PAGE_SIZE - src_from;
+	/* Stay within the vector */
+	bytes = min(bytes,
+		    (unsigned long)(cur_iov->iov_len - bp->b_cur_off));
+	/* Stay within count */
+	bytes = min(bytes, (unsigned long)wc->w_count);
+	/*
+	 * For clustersize > page size, just stay within
+	 * target page, otherwise we have to calculate pos
+	 * within the cluster and obey the rightmost
+	 * boundary.
+	 */
+	if (wc->w_large_pages) {
+		/*
+		 * For cluster size < page size, we have to
+		 * calculate pos within the cluster and obey
+		 * the rightmost boundary.
+		 */
+		bytes = min(bytes, (unsigned long)(osb->s_clustersize
+				   - (wc->w_pos & (osb->s_clustersize - 1))));
+	} else {
+		/*
+		 * cluster size > page size is the most common
+		 * case - we just stay within the target page
+		 * boundary.
+		 */
+		bytes = min(bytes, PAGE_CACHE_SIZE - from);
+	}
+
+	to = from + bytes;
+
+	if (wc->w_this_page_new)
+		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+					    cluster_start, cluster_end, 1);
+	else
+		ret = ocfs2_map_page_blocks(wc->w_this_page, p_blkno, inode,
+					    from, to, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(from > PAGE_CACHE_SIZE);
+	BUG_ON(to > PAGE_CACHE_SIZE);
+	BUG_ON(from > osb->s_clustersize);
+	BUG_ON(to > osb->s_clustersize);
+
+	dst = kmap(wc->w_this_page);
+	memcpy(dst + from, bp->b_src_buf + src_from, bytes);
+	kunmap(wc->w_this_page);
+
+	/*
+	 * XXX: This is slow, but simple. The caller of
+	 * ocfs2_buffered_write_cluster() is responsible for
+	 * passing through the iovecs, so it's difficult to
+	 * predict what our next step is in here after our
+	 * initial write. A future version should be pushing
+	 * that iovec manipulation further down.
+	 *
+	 * By setting this, we indicate that a copy from user
+	 * data was done, and subsequent calls for this
+	 * cluster will skip copying more data.
+	 */
+	wc->w_finished_copy = 1;
+
+	*ret_from = from;
+	*ret_to = to;
+out:
+
+	return bytes ? (unsigned int)bytes : ret;
+}
+
+/*
+ * Map, fill and write a page to disk.
+ *
+ * The work of copying data is done via callback.  Newly allocated
+ * pages which don't take user data will be zero'd (set 'new' to
+ * indicate an allocating write)
+ *
+ * Returns a negative error code or the number of bytes copied into
+ * the page.
+ */
+int ocfs2_write_data_page(struct inode *inode, handle_t *handle,
+			  u64 *p_blkno, struct page *page,
+			  struct ocfs2_write_ctxt *wc, int new)
+{
+	int ret, copied = 0;
+	unsigned int from = 0, to = 0;
+	unsigned int cluster_start, cluster_end;
+	unsigned int zero_from = 0, zero_to = 0;
+
+	ocfs2_figure_cluster_boundaries(OCFS2_SB(inode->i_sb), wc->w_cpos,
+					&cluster_start, &cluster_end);
+
+	if ((wc->w_pos >> PAGE_CACHE_SHIFT) == page->index
+	    && !wc->w_finished_copy) {
+
+		wc->w_this_page = page;
+		wc->w_this_page_new = new;
+		ret = wc->w_write_data_page(inode, wc, p_blkno, &from, &to);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		copied = ret;
+
+		zero_from = from;
+		zero_to = to;
+		if (new) {
+			from = cluster_start;
+			to = cluster_end;
+		}
+	} else {
+		/*
+		 * If we haven't allocated the new page yet, we
+		 * shouldn't be writing it out without copying user
+		 * data. This is likely a math error from the caller.
+		 */
+		BUG_ON(!new);
+
+		from = cluster_start;
+		to = cluster_end;
+
+		ret = ocfs2_map_page_blocks(page, p_blkno, inode,
+					    cluster_start, cluster_end, 1);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	/*
+	 * Parts of newly allocated pages need to be zero'd.
+	 *
+	 * Above, we have also rewritten 'to' and 'from' - as far as
+	 * the rest of the function is concerned, the entire cluster
+	 * range inside of a page needs to be written.
+	 *
+	 * We can skip this if the page is up to date - it's already
+	 * been zero'd from being read in as a hole.
+	 */
+	if (new && !PageUptodate(page))
+		ocfs2_clear_page_regions(page, OCFS2_SB(inode->i_sb),
+					 wc->w_cpos, zero_from, zero_to);
+
+	flush_dcache_page(page);
+
+	if (ocfs2_should_order_data(inode)) {
+		ret = walk_page_buffers(handle,
+					page_buffers(page),
+					from, to, NULL,
+					ocfs2_journal_dirty_data);
+		if (ret < 0)
+			mlog_errno(ret);
+	}
+
+	/*
+	 * We don't use generic_commit_write() because we need to
+	 * handle our own i_size update.
+	 */
+	ret = block_commit_write(page, from, to);
+	if (ret)
+		mlog_errno(ret);
+out:
+
+	return copied ? copied : ret;
+}
+
+/*
+ * Do the actual write of some data into an inode. Optionally allocate
+ * in order to fulfill the write.
+ *
+ * cpos is the logical cluster offset within the file to write at
+ *
+ * 'phys' is the physical mapping of that offset. a 'phys' value of
+ * zero indicates that allocation is required. In this case, data_ac
+ * and meta_ac should be valid (meta_ac can be null if metadata
+ * allocation isn't required).
+ */
+static ssize_t ocfs2_write(struct file *file, u32 phys, handle_t *handle,
+			   struct buffer_head *di_bh,
+			   struct ocfs2_alloc_context *data_ac,
+			   struct ocfs2_alloc_context *meta_ac,
+			   struct ocfs2_write_ctxt *wc)
+{
+	int ret, i, numpages = 1, new;
+	unsigned int copied = 0;
+	u32 tmp_pos;
+	u64 v_blkno, p_blkno;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	unsigned long index, start;
+	struct page **cpages;
+
+	new = phys == 0 ? 1 : 0;
+
+	/*
+	 * Figure out how many pages we'll be manipulating here. For
+	 * non allocating write, we just change the one
+	 * page. Otherwise, we'll need a whole clusters worth.
+	 */
+	if (new)
+		numpages = ocfs2_pages_per_cluster(inode->i_sb);
+
+	cpages = kzalloc(sizeof(*cpages) * numpages, GFP_NOFS);
+	if (!cpages) {
+		ret = -ENOMEM;
+		mlog_errno(ret);
+		return ret;
+	}
+
+	/*
+	 * Fill our page array first. That way we've grabbed enough so
+	 * that we can zero and flush if we error after adding the
+	 * extent.
+	 */
+	if (new) {
+		start = ocfs2_align_clusters_to_page_index(inode->i_sb,
+							   wc->w_cpos);
+		v_blkno = ocfs2_clusters_to_blocks(inode->i_sb, wc->w_cpos);
+	} else {
+		start = wc->w_pos >> PAGE_CACHE_SHIFT;
+		v_blkno = wc->w_pos >> inode->i_sb->s_blocksize_bits;
+	}
+
+	for(i = 0; i < numpages; i++) {
+		index = start + i;
+
+		cpages[i] = grab_cache_page(mapping, index);
+		if (!cpages[i]) {
+			ret = -ENOMEM;
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	if (new) {
+		/*
+		 * This is safe to call with the page locks - it won't take
+		 * any additional semaphores or cluster locks.
+		 */
+		tmp_pos = wc->w_cpos;
+		ret = ocfs2_do_extend_allocation(OCFS2_SB(inode->i_sb), inode,
+						 &tmp_pos, 1, di_bh, handle,
+						 data_ac, meta_ac, NULL);
+		/*
+		 * This shouldn't happen because we must have already
+		 * calculated the correct meta data allocation required. The
+		 * internal tree allocation code should know how to increase
+		 * transaction credits itself.
+		 *
+		 * If need be, we could handle -EAGAIN for a
+		 * RESTART_TRANS here.
+		 */
+		mlog_bug_on_msg(ret == -EAGAIN,
+				"Inode %llu: EAGAIN return during allocation.\n",
+				(unsigned long long)OCFS2_I(inode)->ip_blkno);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_extent_map_get_blocks(inode, v_blkno, &p_blkno, NULL,
+					  NULL);
+	if (ret < 0) {
+
+		/*
+		 * XXX: Should we go readonly here?
+		 */
+
+		mlog_errno(ret);
+		goto out;
+	}
+
+	BUG_ON(p_blkno == 0);
+
+	for(i = 0; i < numpages; i++) {
+		ret = ocfs2_write_data_page(inode, handle, &p_blkno, cpages[i],
+					    wc, new);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		copied += ret;
+	}
+
+out:
+	for(i = 0; i < numpages; i++) {
+		unlock_page(cpages[i]);
+		mark_page_accessed(cpages[i]);
+		page_cache_release(cpages[i]);
+	}
+	kfree(cpages);
+
+	return copied ? copied : ret;
+}
+
+static void ocfs2_write_ctxt_init(struct ocfs2_write_ctxt *wc,
+				  struct ocfs2_super *osb, loff_t pos,
+				  size_t count, ocfs2_page_writer *cb,
+				  void *cb_priv)
+{
+	wc->w_count = count;
+	wc->w_pos = pos;
+	wc->w_cpos = wc->w_pos >> osb->s_clustersize_bits;
+	wc->w_finished_copy = 0;
+
+	if (unlikely(PAGE_CACHE_SHIFT > osb->s_clustersize_bits))
+		wc->w_large_pages = 1;
+	else
+		wc->w_large_pages = 0;
+
+	wc->w_write_data_page = cb;
+	wc->w_private = cb_priv;
+}
+
+/*
+ * Write a cluster to an inode. The cluster may not be allocated yet,
+ * in which case it will be. This only exists for buffered writes -
+ * O_DIRECT takes a more "traditional" path through the kernel.
+ *
+ * The caller is responsible for incrementing pos, written counts, etc
+ *
+ * For file systems that don't support sparse files, pre-allocation
+ * and page zeroing up until cpos should be done prior to this
+ * function call.
+ *
+ * Callers should be holding i_sem, and the rw cluster lock.
+ *
+ * Returns the number of user bytes written, or less than zero for
+ * error.
+ */
+ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
+				     size_t count, ocfs2_page_writer *actor,
+				     void *priv)
+{
+	int ret, credits = OCFS2_INODE_UPDATE_CREDITS;
+	ssize_t written = 0;
+	u32 phys;
+	struct inode *inode = file->f_mapping->host;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct buffer_head *di_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_alloc_context *data_ac = NULL;
+	struct ocfs2_alloc_context *meta_ac = NULL;
+	handle_t *handle;
+	struct ocfs2_write_ctxt wc;
+
+	ocfs2_write_ctxt_init(&wc, osb, pos, count, actor, priv);
+
+	ret = ocfs2_meta_lock(inode, &di_bh, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
+	}
+	di = (struct ocfs2_dinode *)di_bh->b_data;
+
+	/*
+	 * Take alloc sem here to prevent concurrent lookups. That way
+	 * the mapping, zeroing and tree manipulation within
+	 * ocfs2_write() will be safe against ->readpage(). This
+	 * should also serve to lock out allocation from a shared
+	 * writeable region.
+	 */
+	down_write(&OCFS2_I(inode)->ip_alloc_sem);
+
+	ret = ocfs2_get_clusters(inode, wc.w_cpos, &phys, NULL, NULL);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_meta;
+	}
+
+	/* phys == 0 means that allocation is required. */
+	if (phys == 0) {
+		ret = ocfs2_lock_allocators(inode, di, 1, &data_ac, &meta_ac);
+		if (ret) {
+			mlog_errno(ret);
+			goto out_meta;
+		}
+
+		credits = ocfs2_calc_extend_credits(inode->i_sb, di, 1);
+	}
+
+	ret = ocfs2_data_lock(inode, 1);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_meta;
+	}
+
+	handle = ocfs2_start_trans(osb, credits);
+	if (IS_ERR(handle)) {
+		ret = PTR_ERR(handle);
+		mlog_errno(ret);
+		goto out_data;
+	}
+
+	written = ocfs2_write(file, phys, handle, di_bh, data_ac,
+			      meta_ac, &wc);
+	if (written < 0) {
+		ret = written;
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	ret = ocfs2_journal_access(handle, inode, di_bh,
+				   OCFS2_JOURNAL_ACCESS_WRITE);
+	if (ret) {
+		mlog_errno(ret);
+		goto out_commit;
+	}
+
+	pos += written;
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
+	di->i_size = cpu_to_le64((u64)i_size_read(inode));
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+	di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec);
+	di->i_mtime_nsec = di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
+
+	ret = ocfs2_journal_dirty(handle, di_bh);
+	if (ret)
+		mlog_errno(ret);
+
+out_commit:
+	ocfs2_commit_trans(osb, handle);
+
+out_data:
+	ocfs2_data_unlock(inode, 1);
+
+out_meta:
+	up_write(&OCFS2_I(inode)->ip_alloc_sem);
+	ocfs2_meta_unlock(inode, 1);
+
+out:
+	brelse(di_bh);
+	if (data_ac)
+		ocfs2_free_alloc_context(data_ac);
+	if (meta_ac)
+		ocfs2_free_alloc_context(meta_ac);
+
+	return written ? written : ret;
+}
+
 const struct address_space_operations ocfs2_aops = {
 	.readpage	= ocfs2_readpage,
 	.writepage	= ocfs2_writepage,
-	.prepare_write	= ocfs2_prepare_write,
-	.commit_write	= ocfs2_commit_write,
 	.bmap		= ocfs2_bmap,
 	.sync_page	= block_sync_page,
 	.direct_IO	= ocfs2_direct_IO,
diff --git a/fs/ocfs2/aops.h b/fs/ocfs2/aops.h
index f446a15eab88..45821d479b5a 100644
--- a/fs/ocfs2/aops.h
+++ b/fs/ocfs2/aops.h
@@ -30,12 +30,83 @@ handle_t *ocfs2_start_walk_page_trans(struct inode *inode,
 							 unsigned from,
 							 unsigned to);
 
+int ocfs2_map_page_blocks(struct page *page, u64 *p_blkno,
+			  struct inode *inode, unsigned int from,
+			  unsigned int to, int new);
+
+int walk_page_buffers(	handle_t *handle,
+			struct buffer_head *head,
+			unsigned from,
+			unsigned to,
+			int *partial,
+			int (*fn)(	handle_t *handle,
+					struct buffer_head *bh));
+
+struct ocfs2_write_ctxt;
+typedef int (ocfs2_page_writer)(struct inode *, struct ocfs2_write_ctxt *,
+				u64 *, unsigned int *, unsigned int *);
+
+ssize_t ocfs2_buffered_write_cluster(struct file *file, loff_t pos,
+				     size_t count, ocfs2_page_writer *actor,
+				     void *priv);
+
+struct ocfs2_write_ctxt {
+	size_t				w_count;
+	loff_t				w_pos;
+	u32				w_cpos;
+	unsigned int			w_finished_copy;
+
+	/* This is true if page_size > cluster_size */
+	unsigned int			w_large_pages;
+
+	/* Filler callback and private data */
+	ocfs2_page_writer		*w_write_data_page;
+	void				*w_private;
+
+	/* Only valid for the filler callback */
+	struct page			*w_this_page;
+	unsigned int			w_this_page_new;
+};
+
+struct ocfs2_buffered_write_priv {
+	char				*b_src_buf;
+	const struct iovec		*b_cur_iov; /* Current iovec */
+	size_t				b_cur_off; /* Offset in the
+						    * current iovec */
+};
+int ocfs2_map_and_write_user_data(struct inode *inode,
+				  struct ocfs2_write_ctxt *wc,
+				  u64 *p_blkno,
+				  unsigned int *ret_from,
+				  unsigned int *ret_to);
+
+struct ocfs2_splice_write_priv {
+	struct splice_desc		*s_sd;
+	struct pipe_buffer		*s_buf;
+	struct pipe_inode_info		*s_pipe;
+	/* Neither offset value is ever larger than one page */
+	unsigned int			s_offset;
+	unsigned int			s_buf_offset;
+};
+int ocfs2_map_and_write_splice_data(struct inode *inode,
+				    struct ocfs2_write_ctxt *wc,
+				    u64 *p_blkno,
+				    unsigned int *ret_from,
+				    unsigned int *ret_to);
+
 /* all ocfs2_dio_end_io()'s fault */
 #define ocfs2_iocb_is_rw_locked(iocb) \
 	test_bit(0, (unsigned long *)&iocb->private)
-#define ocfs2_iocb_set_rw_locked(iocb) \
-	set_bit(0, (unsigned long *)&iocb->private)
+static inline void ocfs2_iocb_set_rw_locked(struct kiocb *iocb, int level)
+{
+	set_bit(0, (unsigned long *)&iocb->private);
+	if (level)
+		set_bit(1, (unsigned long *)&iocb->private);
+	else
+		clear_bit(1, (unsigned long *)&iocb->private);
+}
 #define ocfs2_iocb_clear_rw_locked(iocb) \
 	clear_bit(0, (unsigned long *)&iocb->private)
-
+#define ocfs2_iocb_rw_locked_level(iocb) \
+	test_bit(1, (unsigned long *)&iocb->private)
 #endif /* OCFS2_FILE_H */
diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c
index 4705d659fe57..bbacf7da48a4 100644
--- a/fs/ocfs2/cluster/quorum.c
+++ b/fs/ocfs2/cluster/quorum.c
@@ -46,6 +46,7 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/workqueue.h>
+#include <linux/reboot.h>
 
 #include "heartbeat.h"
 #include "nodemanager.h"
@@ -72,7 +73,9 @@ static void o2quo_fence_self(void)
 	/* panic spins with interrupts enabled.  with preempt
 	 * threads can still schedule, etc, etc */
 	o2hb_stop_all_regions();
-	panic("ocfs2 is very sorry to be fencing this system by panicing\n");
+
+	printk("ocfs2 is very sorry to be fencing this system by restarting\n");
+	emergency_restart();
 }
 
 /* Indicate that a timeout occured on a hearbeat region write. The
diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h
index 4dae5df5e467..9606111fe89d 100644
--- a/fs/ocfs2/cluster/tcp_internal.h
+++ b/fs/ocfs2/cluster/tcp_internal.h
@@ -38,6 +38,9 @@
  * locking semantics of the file system using the protocol.  It should 
  * be somewhere else, I'm sure, but right now it isn't.
  *
+ * New in version 8:
+ * 	- Replace delete inode votes with a cluster lock
+ *
  * New in version 7:
  * 	- DLM join domain includes the live nodemap
  *
@@ -57,7 +60,7 @@
  * 	- full 64 bit i_size in the metadata lock lvbs
  * 	- introduction of "rw" lock and pushing meta/data locking down
  */
-#define O2NET_PROTOCOL_VERSION 7ULL
+#define O2NET_PROTOCOL_VERSION 8ULL
 struct o2net_handshake {
 	__be64	protocol_version;
 	__be64	connector_id;
diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c
index 66821e178167..67e6866a2a4f 100644
--- a/fs/ocfs2/dir.c
+++ b/fs/ocfs2/dir.c
@@ -358,15 +358,17 @@ int ocfs2_do_extend_dir(struct super_block *sb,
 {
 	int status;
 	int extend;
-	u64 p_blkno;
+	u64 p_blkno, v_blkno;
 
 	spin_lock(&OCFS2_I(dir)->ip_lock);
 	extend = (i_size_read(dir) == ocfs2_clusters_to_bytes(sb, OCFS2_I(dir)->ip_clusters));
 	spin_unlock(&OCFS2_I(dir)->ip_lock);
 
 	if (extend) {
-		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, 1,
-						    parent_fe_bh, handle,
+		u32 offset = OCFS2_I(dir)->ip_clusters;
+
+		status = ocfs2_do_extend_allocation(OCFS2_SB(sb), dir, &offset,
+						    1, parent_fe_bh, handle,
 						    data_ac, meta_ac, NULL);
 		BUG_ON(status == -EAGAIN);
 		if (status < 0) {
@@ -375,9 +377,8 @@ int ocfs2_do_extend_dir(struct super_block *sb,
 		}
 	}
 
-	status = ocfs2_extent_map_get_blocks(dir, (dir->i_blocks >>
-						   (sb->s_blocksize_bits - 9)),
-					     1, &p_blkno, NULL);
+	v_blkno = ocfs2_blocks_for_bytes(sb, i_size_read(dir));
+	status = ocfs2_extent_map_get_blocks(dir, v_blkno, &p_blkno, NULL, NULL);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -486,7 +487,7 @@ static int ocfs2_extend_dir(struct ocfs2_super *osb,
 
 	dir_i_size += dir->i_sb->s_blocksize;
 	i_size_write(dir, dir_i_size);
-	dir->i_blocks = ocfs2_align_bytes_to_sectors(dir_i_size);
+	dir->i_blocks = ocfs2_inode_sector_count(dir);
 	status = ocfs2_mark_inode_dirty(handle, dir, parent_fe_bh);
 	if (status < 0) {
 		mlog_errno(status);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index c558442a0b44..d836b98dd99a 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -430,11 +430,10 @@ redo_bucket:
 
 			dlm_lockres_put(res);
 
-			cond_resched_lock(&dlm->spinlock);
-
 			if (dropped)
 				goto redo_bucket;
 		}
+		cond_resched_lock(&dlm->spinlock);
 		num += n;
 		mlog(0, "%s: touched %d lockreses in bucket %d "
 		     "(tot=%d)\n", dlm->name, n, i, num);
@@ -1035,7 +1034,7 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
 {
 	int status = 0, tmpstat, node;
 	struct domain_join_ctxt *ctxt;
-	enum dlm_query_join_response response;
+	enum dlm_query_join_response response = JOIN_DISALLOW;
 
 	mlog_entry("%p", dlm);
 
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 6d4a83d50152..c1807a42c49f 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -611,6 +611,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 			}
 		} while (status != 0);
 
+		spin_lock(&dlm_reco_state_lock);
 		switch (ndata->state) {
 			case DLM_RECO_NODE_DATA_INIT:
 			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
@@ -641,6 +642,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 				     ndata->node_num, dead_node);
 				break;
 		}
+		spin_unlock(&dlm_reco_state_lock);
 	}
 
 	mlog(0, "done requesting all lock info\n");
diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c
index e335541727f9..27e43b0c0eae 100644
--- a/fs/ocfs2/dlmglue.c
+++ b/fs/ocfs2/dlmglue.c
@@ -225,11 +225,17 @@ static struct ocfs2_lock_res_ops ocfs2_dentry_lops = {
 	.flags		= 0,
 };
 
+static struct ocfs2_lock_res_ops ocfs2_inode_open_lops = {
+	.get_osb	= ocfs2_get_inode_osb,
+	.flags		= 0,
+};
+
 static inline int ocfs2_is_inode_lock(struct ocfs2_lock_res *lockres)
 {
 	return lockres->l_type == OCFS2_LOCK_TYPE_META ||
 		lockres->l_type == OCFS2_LOCK_TYPE_DATA ||
-		lockres->l_type == OCFS2_LOCK_TYPE_RW;
+		lockres->l_type == OCFS2_LOCK_TYPE_RW ||
+		lockres->l_type == OCFS2_LOCK_TYPE_OPEN;
 }
 
 static inline struct inode *ocfs2_lock_res_inode(struct ocfs2_lock_res *lockres)
@@ -373,6 +379,9 @@ void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
 		case OCFS2_LOCK_TYPE_DATA:
 			ops = &ocfs2_inode_data_lops;
 			break;
+		case OCFS2_LOCK_TYPE_OPEN:
+			ops = &ocfs2_inode_open_lops;
+			break;
 		default:
 			mlog_bug_on_msg(1, "type: %d\n", type);
 			ops = NULL; /* thanks, gcc */
@@ -1129,6 +1138,12 @@ int ocfs2_create_new_inode_locks(struct inode *inode)
 		goto bail;
 	}
 
+	ret = ocfs2_create_new_lock(osb, &OCFS2_I(inode)->ip_open_lockres, 0, 0);
+	if (ret) {
+		mlog_errno(ret);
+		goto bail;
+	}
+
 bail:
 	mlog_exit(ret);
 	return ret;
@@ -1182,6 +1197,99 @@ void ocfs2_rw_unlock(struct inode *inode, int write)
 	mlog_exit_void();
 }
 
+/*
+ * ocfs2_open_lock always get PR mode lock.
+ */
+int ocfs2_open_lock(struct inode *inode)
+{
+	int status = 0;
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	BUG_ON(!inode);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %llu take PRMODE open lock\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	if (ocfs2_mount_local(osb))
+		goto out;
+
+	lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+				    LKM_PRMODE, 0, 0);
+	if (status < 0)
+		mlog_errno(status);
+
+out:
+	mlog_exit(status);
+	return status;
+}
+
+int ocfs2_try_open_lock(struct inode *inode, int write)
+{
+	int status = 0, level;
+	struct ocfs2_lock_res *lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	BUG_ON(!inode);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %llu try to take %s open lock\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno,
+	     write ? "EXMODE" : "PRMODE");
+
+	if (ocfs2_mount_local(osb))
+		goto out;
+
+	lockres = &OCFS2_I(inode)->ip_open_lockres;
+
+	level = write ? LKM_EXMODE : LKM_PRMODE;
+
+	/*
+	 * The file system may already holding a PRMODE/EXMODE open lock.
+	 * Since we pass LKM_NOQUEUE, the request won't block waiting on
+	 * other nodes and the -EAGAIN will indicate to the caller that
+	 * this inode is still in use.
+	 */
+	status = ocfs2_cluster_lock(OCFS2_SB(inode->i_sb), lockres,
+				    level, LKM_NOQUEUE, 0);
+
+out:
+	mlog_exit(status);
+	return status;
+}
+
+/*
+ * ocfs2_open_unlock unlock PR and EX mode open locks.
+ */
+void ocfs2_open_unlock(struct inode *inode)
+{
+	struct ocfs2_lock_res *lockres = &OCFS2_I(inode)->ip_open_lockres;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	mlog_entry_void();
+
+	mlog(0, "inode %llu drop open lock\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno);
+
+	if (ocfs2_mount_local(osb))
+		goto out;
+
+	if(lockres->l_ro_holders)
+		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+				     LKM_PRMODE);
+	if(lockres->l_ex_holders)
+		ocfs2_cluster_unlock(OCFS2_SB(inode->i_sb), lockres,
+				     LKM_EXMODE);
+
+out:
+	mlog_exit_void();
+}
+
 int ocfs2_data_lock_full(struct inode *inode,
 			 int write,
 			 int arg_flags)
@@ -1387,8 +1495,7 @@ static void ocfs2_refresh_inode_from_lvb(struct inode *inode)
 	if (S_ISLNK(inode->i_mode) && !oi->ip_clusters)
 		inode->i_blocks = 0;
 	else
-		inode->i_blocks =
-			ocfs2_align_bytes_to_sectors(i_size_read(inode));
+		inode->i_blocks = ocfs2_inode_sector_count(inode);
 
 	inode->i_uid     = be32_to_cpu(lvb->lvb_iuid);
 	inode->i_gid     = be32_to_cpu(lvb->lvb_igid);
@@ -1479,12 +1586,15 @@ static int ocfs2_meta_lock_update(struct inode *inode,
 {
 	int status = 0;
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-	struct ocfs2_lock_res *lockres = NULL;
+	struct ocfs2_lock_res *lockres = &oi->ip_meta_lockres;
 	struct ocfs2_dinode *fe;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 
 	mlog_entry_void();
 
+	if (ocfs2_mount_local(osb))
+		goto bail;
+
 	spin_lock(&oi->ip_lock);
 	if (oi->ip_flags & OCFS2_INODE_DELETED) {
 		mlog(0, "Orphaned inode %llu was deleted while we "
@@ -1496,22 +1606,16 @@ static int ocfs2_meta_lock_update(struct inode *inode,
 	}
 	spin_unlock(&oi->ip_lock);
 
-	if (!ocfs2_mount_local(osb)) {
-		lockres = &oi->ip_meta_lockres;
-
-		if (!ocfs2_should_refresh_lock_res(lockres))
-			goto bail;
-	}
+	if (!ocfs2_should_refresh_lock_res(lockres))
+		goto bail;
 
 	/* This will discard any caching information we might have had
 	 * for the inode metadata. */
 	ocfs2_metadata_cache_purge(inode);
 
-	/* will do nothing for inode types that don't use the extent
-	 * map (directories, bitmap files, etc) */
 	ocfs2_extent_map_trunc(inode, 0);
 
-	if (lockres && ocfs2_meta_lvb_is_trustable(inode, lockres)) {
+	if (ocfs2_meta_lvb_is_trustable(inode, lockres)) {
 		mlog(0, "Trusting LVB on inode %llu\n",
 		     (unsigned long long)oi->ip_blkno);
 		ocfs2_refresh_inode_from_lvb(inode);
@@ -1558,8 +1662,7 @@ static int ocfs2_meta_lock_update(struct inode *inode,
 
 	status = 0;
 bail_refresh:
-	if (lockres)
-		ocfs2_complete_lock_res_refresh(lockres, status);
+	ocfs2_complete_lock_res_refresh(lockres, status);
 bail:
 	mlog_exit(status);
 	return status;
@@ -1630,7 +1733,6 @@ int ocfs2_meta_lock_full(struct inode *inode,
 		wait_event(osb->recovery_event,
 			   ocfs2_node_map_is_empty(osb, &osb->recovery_map));
 
-	acquired = 0;
 	lockres = &OCFS2_I(inode)->ip_meta_lockres;
 	level = ex ? LKM_EXMODE : LKM_PRMODE;
 	dlm_flags = 0;
@@ -2458,13 +2560,20 @@ int ocfs2_drop_inode_locks(struct inode *inode)
 	 * ocfs2_clear_inode has done it for us. */
 
 	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
-			      &OCFS2_I(inode)->ip_data_lockres);
+			      &OCFS2_I(inode)->ip_open_lockres);
 	if (err < 0)
 		mlog_errno(err);
 
 	status = err;
 
 	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
+			      &OCFS2_I(inode)->ip_data_lockres);
+	if (err < 0)
+		mlog_errno(err);
+	if (err < 0 && !status)
+		status = err;
+
+	err = ocfs2_drop_lock(OCFS2_SB(inode->i_sb),
 			      &OCFS2_I(inode)->ip_meta_lockres);
 	if (err < 0)
 		mlog_errno(err);
diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h
index c343fca68cf1..59cb566e7983 100644
--- a/fs/ocfs2/dlmglue.h
+++ b/fs/ocfs2/dlmglue.h
@@ -80,6 +80,9 @@ void ocfs2_data_unlock(struct inode *inode,
 		       int write);
 int ocfs2_rw_lock(struct inode *inode, int write);
 void ocfs2_rw_unlock(struct inode *inode, int write);
+int ocfs2_open_lock(struct inode *inode);
+int ocfs2_try_open_lock(struct inode *inode, int write);
+void ocfs2_open_unlock(struct inode *inode);
 int ocfs2_meta_lock_atime(struct inode *inode,
 			  struct vfsmount *vfsmnt,
 			  int *level);
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 80ac69f11d9f..ba2b2ab1c6e4 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -3,8 +3,7 @@
  *
  * extent_map.c
  *
- * In-memory extent map for OCFS2.  Man, this code was prettier in
- * the library.
+ * Block/Cluster mapping functions
  *
  * Copyright (C) 2004 Oracle.  All rights reserved.
  *
@@ -26,1016 +25,528 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/types.h>
-#include <linux/slab.h>
-#include <linux/rbtree.h>
 
 #define MLOG_MASK_PREFIX ML_EXTENT_MAP
 #include <cluster/masklog.h>
 
 #include "ocfs2.h"
 
+#include "alloc.h"
 #include "extent_map.h"
 #include "inode.h"
 #include "super.h"
 
 #include "buffer_head_io.h"
 
-
 /*
- * SUCK SUCK SUCK
- * Our headers are so bad that struct ocfs2_extent_map is in ocfs.h
- */
-
-struct ocfs2_extent_map_entry {
-	struct rb_node e_node;
-	int e_tree_depth;
-	struct ocfs2_extent_rec e_rec;
-};
-
-struct ocfs2_em_insert_context {
-	int need_left;
-	int need_right;
-	struct ocfs2_extent_map_entry *new_ent;
-	struct ocfs2_extent_map_entry *old_ent;
-	struct ocfs2_extent_map_entry *left_ent;
-	struct ocfs2_extent_map_entry *right_ent;
-};
-
-static struct kmem_cache *ocfs2_em_ent_cachep = NULL;
-
-
-static struct ocfs2_extent_map_entry *
-ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
-			u32 cpos, u32 clusters,
-			struct rb_node ***ret_p,
-			struct rb_node **ret_parent);
-static int ocfs2_extent_map_insert(struct inode *inode,
-				   struct ocfs2_extent_rec *rec,
-				   int tree_depth);
-static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
-					 struct ocfs2_extent_map_entry *ent);
-static int ocfs2_extent_map_find_leaf(struct inode *inode,
-				      u32 cpos, u32 clusters,
-				      struct ocfs2_extent_list *el);
-static int ocfs2_extent_map_lookup_read(struct inode *inode,
-					u32 cpos, u32 clusters,
-					struct ocfs2_extent_map_entry **ret_ent);
-static int ocfs2_extent_map_try_insert(struct inode *inode,
-				       struct ocfs2_extent_rec *rec,
-				       int tree_depth,
-				       struct ocfs2_em_insert_context *ctxt);
-
-/* returns 1 only if the rec contains all the given clusters -- that is that
- * rec's cpos is <= the cluster cpos and that the rec endpoint (cpos +
- * clusters) is >= the argument's endpoint */
-static int ocfs2_extent_rec_contains_clusters(struct ocfs2_extent_rec *rec,
-					      u32 cpos, u32 clusters)
-{
-	if (le32_to_cpu(rec->e_cpos) > cpos)
-		return 0;
-	if (cpos + clusters > le32_to_cpu(rec->e_cpos) + 
-			      le32_to_cpu(rec->e_clusters))
-		return 0;
-	return 1;
-}
-
-
-/*
- * Find an entry in the tree that intersects the region passed in.
- * Note that this will find straddled intervals, it is up to the
- * callers to enforce any boundary conditions.
- *
- * Callers must hold ip_lock.  This lookup is not guaranteed to return
- * a tree_depth 0 match, and as such can race inserts if the lock
- * were not held.
+ * The extent caching implementation is intentionally trivial.
  *
- * The rb_node garbage lets insertion share the search.  Trivial
- * callers pass NULL.
+ * We only cache a small number of extents stored directly on the
+ * inode, so linear order operations are acceptable. If we ever want
+ * to increase the size of the extent map, then these algorithms must
+ * get smarter.
  */
-static struct ocfs2_extent_map_entry *
-ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
-			u32 cpos, u32 clusters,
-			struct rb_node ***ret_p,
-			struct rb_node **ret_parent)
+
+void ocfs2_extent_map_init(struct inode *inode)
 {
-	struct rb_node **p = &em->em_extents.rb_node;
-	struct rb_node *parent = NULL;
-	struct ocfs2_extent_map_entry *ent = NULL;
-
-	while (*p)
-	{
-		parent = *p;
-		ent = rb_entry(parent, struct ocfs2_extent_map_entry,
-			       e_node);
-		if ((cpos + clusters) <= le32_to_cpu(ent->e_rec.e_cpos)) {
-			p = &(*p)->rb_left;
-			ent = NULL;
-		} else if (cpos >= (le32_to_cpu(ent->e_rec.e_cpos) +
-				    le32_to_cpu(ent->e_rec.e_clusters))) {
-			p = &(*p)->rb_right;
-			ent = NULL;
-		} else
-			break;
-	}
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
 
-	if (ret_p != NULL)
-		*ret_p = p;
-	if (ret_parent != NULL)
-		*ret_parent = parent;
-	return ent;
+	oi->ip_extent_map.em_num_items = 0;
+	INIT_LIST_HEAD(&oi->ip_extent_map.em_list);
 }
 
-/*
- * Find the leaf containing the interval we want.  While we're on our
- * way down the tree, fill in every record we see at any depth, because
- * we might want it later.
- *
- * Note that this code is run without ip_lock.  That's because it
- * sleeps while reading.  If someone is also filling the extent list at
- * the same time we are, we might have to restart.
- */
-static int ocfs2_extent_map_find_leaf(struct inode *inode,
-				      u32 cpos, u32 clusters,
-				      struct ocfs2_extent_list *el)
+static void __ocfs2_extent_map_lookup(struct ocfs2_extent_map *em,
+				      unsigned int cpos,
+				      struct ocfs2_extent_map_item **ret_emi)
 {
-	int i, ret;
-	struct buffer_head *eb_bh = NULL;
-	u64 blkno;
-	u32 rec_end;
-	struct ocfs2_extent_block *eb;
-	struct ocfs2_extent_rec *rec;
-
-	/*
-	 * The bh data containing the el cannot change here, because
-	 * we hold alloc_sem.  So we can do this without other
-	 * locks.
-	 */
-	while (el->l_tree_depth)
-	{
-		blkno = 0;
-		for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
-			rec = &el->l_recs[i];
-			rec_end = (le32_to_cpu(rec->e_cpos) +
-				   le32_to_cpu(rec->e_clusters));
-
-			ret = -EBADR;
-			if (rec_end > OCFS2_I(inode)->ip_clusters) {
-				mlog_errno(ret);
-				ocfs2_error(inode->i_sb,
-					    "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
-					    i,
-					    (unsigned long long)le64_to_cpu(rec->e_blkno),
-					    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-					    OCFS2_I(inode)->ip_clusters);
-				goto out_free;
-			}
-
-			if (rec_end <= cpos) {
-				ret = ocfs2_extent_map_insert(inode, rec,
-						le16_to_cpu(el->l_tree_depth));
-				if (ret && (ret != -EEXIST)) {
-					mlog_errno(ret);
-					goto out_free;
-				}
-				continue;
-			}
-			if ((cpos + clusters) <= le32_to_cpu(rec->e_cpos)) {
-				ret = ocfs2_extent_map_insert(inode, rec,
-						le16_to_cpu(el->l_tree_depth));
-				if (ret && (ret != -EEXIST)) {
-					mlog_errno(ret);
-					goto out_free;
-				}
-				continue;
-			}
+	unsigned int range;
+	struct ocfs2_extent_map_item *emi;
 
-			/*
-			 * We've found a record that matches our
-			 * interval.  We don't insert it because we're
-			 * about to traverse it.
-			 */
-
-			/* Check to see if we're stradling */
-			ret = -ESRCH;
-			if (!ocfs2_extent_rec_contains_clusters(rec,
-							        cpos,
-								clusters)) {
-				mlog_errno(ret);
-				goto out_free;
-			}
+	*ret_emi = NULL;
 
-			/*
-			 * If we've already found a record, the el has
-			 * two records covering the same interval.
-			 * EEEK!
-			 */
-			ret = -EBADR;
-			if (blkno) {
-				mlog_errno(ret);
-				ocfs2_error(inode->i_sb,
-					    "Multiple extents for (cpos = %u, clusters = %u) on inode %llu; e_blkno %llu and rec %d at e_blkno %llu\n",
-					    cpos, clusters,
-					    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-					    (unsigned long long)blkno, i,
-					    (unsigned long long)le64_to_cpu(rec->e_blkno));
-				goto out_free;
-			}
+	list_for_each_entry(emi, &em->em_list, ei_list) {
+		range = emi->ei_cpos + emi->ei_clusters;
 
-			blkno = le64_to_cpu(rec->e_blkno);
-		}
+		if (cpos >= emi->ei_cpos && cpos < range) {
+			list_move(&emi->ei_list, &em->em_list);
 
-		/*
-		 * We don't support holes, and we're still up
-		 * in the branches, so we'd better have found someone
-		 */
-		ret = -EBADR;
-		if (!blkno) {
-			ocfs2_error(inode->i_sb,
-				    "No record found for (cpos = %u, clusters = %u) on inode %llu\n",
-				    cpos, clusters,
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-			mlog_errno(ret);
-			goto out_free;
-		}
-
-		if (eb_bh) {
-			brelse(eb_bh);
-			eb_bh = NULL;
-		}
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       blkno, &eb_bh, OCFS2_BH_CACHED,
-				       inode);
-		if (ret) {
-			mlog_errno(ret);
-			goto out_free;
-		}
-		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			ret = -EIO;
-			goto out_free;
+			*ret_emi = emi;
+			break;
 		}
-		el = &eb->h_list;
 	}
+}
 
-	BUG_ON(el->l_tree_depth);
-
-	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
-		rec = &el->l_recs[i];
-
-		if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
-		    OCFS2_I(inode)->ip_clusters) {
-			ret = -EBADR;
-			mlog_errno(ret);
-			ocfs2_error(inode->i_sb,
-				    "Extent %d at e_blkno %llu of inode %llu goes past ip_clusters of %u\n",
-				    i,
-				    (unsigned long long)le64_to_cpu(rec->e_blkno),
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno,
-				    OCFS2_I(inode)->ip_clusters);
-			return ret;
-		}
-
-		ret = ocfs2_extent_map_insert(inode, rec,
-					      le16_to_cpu(el->l_tree_depth));
-		if (ret && (ret != -EEXIST)) {
-			mlog_errno(ret);
-			goto out_free;
-		}
+static int ocfs2_extent_map_lookup(struct inode *inode, unsigned int cpos,
+				   unsigned int *phys, unsigned int *len,
+				   unsigned int *flags)
+{
+	unsigned int coff;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_extent_map_item *emi;
+
+	spin_lock(&oi->ip_lock);
+
+	__ocfs2_extent_map_lookup(&oi->ip_extent_map, cpos, &emi);
+	if (emi) {
+		coff = cpos - emi->ei_cpos;
+		*phys = emi->ei_phys + coff;
+		if (len)
+			*len = emi->ei_clusters - coff;
+		if (flags)
+			*flags = emi->ei_flags;
 	}
 
-	ret = 0;
+	spin_unlock(&oi->ip_lock);
 
-out_free:
-	if (eb_bh)
-		brelse(eb_bh);
+	if (emi == NULL)
+		return -ENOENT;
 
-	return ret;
+	return 0;
 }
 
 /*
- * This lookup actually will read from disk.  It has one invariant:
- * It will never re-traverse blocks.  This means that all inserts should
- * be new regions or more granular regions (both allowed by insert).
+ * Forget about all clusters equal to or greater than cpos.
  */
-static int ocfs2_extent_map_lookup_read(struct inode *inode,
-					u32 cpos,
-					u32 clusters,
-					struct ocfs2_extent_map_entry **ret_ent)
+void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cpos)
 {
-	int ret;
-	u64 blkno;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent;
-	struct buffer_head *bh = NULL;
-	struct ocfs2_extent_block *eb;
-	struct ocfs2_dinode *di;
-	struct ocfs2_extent_list *el;
-
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
-	if (ent) {
-		if (!ent->e_tree_depth) {
-			spin_unlock(&OCFS2_I(inode)->ip_lock);
-			*ret_ent = ent;
-			return 0;
-		}
-		blkno = le64_to_cpu(ent->e_rec.e_blkno);
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, &bh,
-				       OCFS2_BH_CACHED, inode);
-		if (ret) {
-			mlog_errno(ret);
-			if (bh)
-				brelse(bh);
-			return ret;
+	struct list_head *p, *n;
+	struct ocfs2_extent_map_item *emi;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_extent_map *em = &oi->ip_extent_map;
+	LIST_HEAD(tmp_list);
+	unsigned int range;
+
+	spin_lock(&oi->ip_lock);
+	list_for_each_safe(p, n, &em->em_list) {
+		emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
+
+		if (emi->ei_cpos >= cpos) {
+			/* Full truncate of this record. */
+			list_move(&emi->ei_list, &tmp_list);
+			BUG_ON(em->em_num_items == 0);
+			em->em_num_items--;
+			continue;
 		}
-		eb = (struct ocfs2_extent_block *)bh->b_data;
-		if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) {
-			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb);
-			brelse(bh);
-			return -EIO;
-		}
-		el = &eb->h_list;
-	} else {
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
 
-		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
-				       OCFS2_I(inode)->ip_blkno, &bh,
-				       OCFS2_BH_CACHED, inode);
-		if (ret) {
-			mlog_errno(ret);
-			if (bh)
-				brelse(bh);
-			return ret;
+		range = emi->ei_cpos + emi->ei_clusters;
+		if (range > cpos) {
+			/* Partial truncate */
+			emi->ei_clusters = cpos - emi->ei_cpos;
 		}
-		di = (struct ocfs2_dinode *)bh->b_data;
-		if (!OCFS2_IS_VALID_DINODE(di)) {
-			brelse(bh);
-			OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, di);
-			return -EIO;
-		}
-		el = &di->id2.i_list;
-	}
-
-	ret = ocfs2_extent_map_find_leaf(inode, cpos, clusters, el);
-	brelse(bh);
-	if (ret) {
-		mlog_errno(ret);
-		return ret;
 	}
+	spin_unlock(&oi->ip_lock);
 
-	ent = ocfs2_extent_map_lookup(em, cpos, clusters, NULL, NULL);
-	if (!ent) {
-		ret = -ESRCH;
-		mlog_errno(ret);
-		return ret;
+	list_for_each_safe(p, n, &tmp_list) {
+		emi = list_entry(p, struct ocfs2_extent_map_item, ei_list);
+		list_del(&emi->ei_list);
+		kfree(emi);
 	}
-
-	/* FIXME: Make sure this isn't a corruption */
-	BUG_ON(ent->e_tree_depth);
-
-	*ret_ent = ent;
-
-	return 0;
 }
 
 /*
- * Callers must hold ip_lock.  This can insert pieces of the tree,
- * thus racing lookup if the lock weren't held.
+ * Is any part of emi2 contained within emi1
  */
-static int ocfs2_extent_map_insert_entry(struct ocfs2_extent_map *em,
-					 struct ocfs2_extent_map_entry *ent)
+static int ocfs2_ei_is_contained(struct ocfs2_extent_map_item *emi1,
+				 struct ocfs2_extent_map_item *emi2)
 {
-	struct rb_node **p, *parent;
-	struct ocfs2_extent_map_entry *old_ent;
+	unsigned int range1, range2;
 
-	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(ent->e_rec.e_cpos),
-					  le32_to_cpu(ent->e_rec.e_clusters),
-					  &p, &parent);
-	if (old_ent)
-		return -EEXIST;
+	/*
+	 * Check if logical start of emi2 is inside emi1
+	 */
+	range1 = emi1->ei_cpos + emi1->ei_clusters;
+	if (emi2->ei_cpos >= emi1->ei_cpos && emi2->ei_cpos < range1)
+		return 1;
 
-	rb_link_node(&ent->e_node, parent, p);
-	rb_insert_color(&ent->e_node, &em->em_extents);
+	/*
+	 * Check if logical end of emi2 is inside emi1
+	 */
+	range2 = emi2->ei_cpos + emi2->ei_clusters;
+	if (range2 > emi1->ei_cpos && range2 <= range1)
+		return 1;
 
 	return 0;
 }
 
+static void ocfs2_copy_emi_fields(struct ocfs2_extent_map_item *dest,
+				  struct ocfs2_extent_map_item *src)
+{
+	dest->ei_cpos = src->ei_cpos;
+	dest->ei_phys = src->ei_phys;
+	dest->ei_clusters = src->ei_clusters;
+	dest->ei_flags = src->ei_flags;
+}
 
 /*
- * Simple rule: on any return code other than -EAGAIN, anything left
- * in the insert_context will be freed.
- *
- * Simple rule #2: A return code of -EEXIST from this function or
- * its calls to ocfs2_extent_map_insert_entry() signifies that another
- * thread beat us to the insert.  It is not an actual error, but it
- * tells the caller we have no more work to do.
+ * Try to merge emi with ins. Returns 1 if merge succeeds, zero
+ * otherwise.
  */
-static int ocfs2_extent_map_try_insert(struct inode *inode,
-				       struct ocfs2_extent_rec *rec,
-				       int tree_depth,
-				       struct ocfs2_em_insert_context *ctxt)
+static int ocfs2_try_to_merge_extent_map(struct ocfs2_extent_map_item *emi,
+					 struct ocfs2_extent_map_item *ins)
 {
-	int ret;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *old_ent;
-
-	ctxt->need_left = 0;
-	ctxt->need_right = 0;
-	ctxt->old_ent = NULL;
-
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
-	if (!ret) {
-		ctxt->new_ent = NULL;
-		goto out_unlock;
-	}
-
-	/* Since insert_entry failed, the map MUST have old_ent */
-	old_ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos),
-					  le32_to_cpu(rec->e_clusters),
-					  NULL, NULL);
-
-	BUG_ON(!old_ent);
-
-	if (old_ent->e_tree_depth < tree_depth) {
-		/* Another thread beat us to the lower tree_depth */
-		ret = -EEXIST;
-		goto out_unlock;
-	}
-
-	if (old_ent->e_tree_depth == tree_depth) {
-		/*
-		 * Another thread beat us to this tree_depth.
-		 * Let's make sure we agree with that thread (the
-		 * extent_rec should be identical).
-		 */
-		if (!memcmp(rec, &old_ent->e_rec,
-			    sizeof(struct ocfs2_extent_rec)))
-			ret = 0;
-		else
-			/* FIXME: Should this be ESRCH/EBADR??? */
-			ret = -EEXIST;
-
-		goto out_unlock;
-	}
-
 	/*
-	 * We do it in this order specifically so that no actual tree
-	 * changes occur until we have all the pieces we need.  We
-	 * don't want malloc failures to leave an inconsistent tree.
-	 * Whenever we drop the lock, another process could be
-	 * inserting.  Also note that, if another process just beat us
-	 * to an insert, we might not need the same pieces we needed
-	 * the first go round.  In the end, the pieces we need will
-	 * be used, and the pieces we don't will be freed.
+	 * Handle contiguousness
 	 */
-	ctxt->need_left = !!(le32_to_cpu(rec->e_cpos) >
-			     le32_to_cpu(old_ent->e_rec.e_cpos));
-	ctxt->need_right = !!((le32_to_cpu(old_ent->e_rec.e_cpos) +
-			       le32_to_cpu(old_ent->e_rec.e_clusters)) >
-			      (le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)));
-	ret = -EAGAIN;
-	if (ctxt->need_left) {
-		if (!ctxt->left_ent)
-			goto out_unlock;
-		*(ctxt->left_ent) = *old_ent;
-		ctxt->left_ent->e_rec.e_clusters =
-			cpu_to_le32(le32_to_cpu(rec->e_cpos) -
-				    le32_to_cpu(ctxt->left_ent->e_rec.e_cpos));
-	}
-	if (ctxt->need_right) {
-		if (!ctxt->right_ent)
-			goto out_unlock;
-		*(ctxt->right_ent) = *old_ent;
-		ctxt->right_ent->e_rec.e_cpos =
-			cpu_to_le32(le32_to_cpu(rec->e_cpos) +
-				    le32_to_cpu(rec->e_clusters));
-		ctxt->right_ent->e_rec.e_clusters =
-			cpu_to_le32((le32_to_cpu(old_ent->e_rec.e_cpos) +
-				     le32_to_cpu(old_ent->e_rec.e_clusters)) -
-				    le32_to_cpu(ctxt->right_ent->e_rec.e_cpos));
-	}
-
-	rb_erase(&old_ent->e_node, &em->em_extents);
-	/* Now that he's erased, set him up for deletion */
-	ctxt->old_ent = old_ent;
-
-	if (ctxt->need_left) {
-		ret = ocfs2_extent_map_insert_entry(em,
-						    ctxt->left_ent);
-		if (ret)
-			goto out_unlock;
-		ctxt->left_ent = NULL;
+	if (ins->ei_phys == (emi->ei_phys + emi->ei_clusters) &&
+	    ins->ei_cpos == (emi->ei_cpos + emi->ei_clusters) &&
+	    ins->ei_flags == emi->ei_flags) {
+		emi->ei_clusters += ins->ei_clusters;
+		return 1;
+	} else if ((ins->ei_phys + ins->ei_clusters) == emi->ei_phys &&
+		   (ins->ei_cpos + ins->ei_clusters) == emi->ei_phys &&
+		   ins->ei_flags == emi->ei_flags) {
+		emi->ei_phys = ins->ei_phys;
+		emi->ei_cpos = ins->ei_cpos;
+		emi->ei_clusters += ins->ei_clusters;
+		return 1;
 	}
 
-	if (ctxt->need_right) {
-		ret = ocfs2_extent_map_insert_entry(em,
-						    ctxt->right_ent);
-		if (ret)
-			goto out_unlock;
-		ctxt->right_ent = NULL;
+	/*
+	 * Overlapping extents - this shouldn't happen unless we've
+	 * split an extent to change it's flags. That is exceedingly
+	 * rare, so there's no sense in trying to optimize it yet.
+	 */
+	if (ocfs2_ei_is_contained(emi, ins) ||
+	    ocfs2_ei_is_contained(ins, emi)) {
+		ocfs2_copy_emi_fields(emi, ins);
+		return 1;
 	}
 
-	ret = ocfs2_extent_map_insert_entry(em, ctxt->new_ent);
-
-	if (!ret)
-		ctxt->new_ent = NULL;
-
-out_unlock:
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-	return ret;
+	/* No merge was possible. */
+	return 0;
 }
 
-
-static int ocfs2_extent_map_insert(struct inode *inode,
-				   struct ocfs2_extent_rec *rec,
-				   int tree_depth)
+/*
+ * In order to reduce complexity on the caller, this insert function
+ * is intentionally liberal in what it will accept.
+ *
+ * The only rule is that the truncate call *must* be used whenever
+ * records have been deleted. This avoids inserting overlapping
+ * records with different physical mappings.
+ */
+void ocfs2_extent_map_insert_rec(struct inode *inode,
+				 struct ocfs2_extent_rec *rec)
 {
-	int ret;
-	struct ocfs2_em_insert_context ctxt = {0, };
-
-	if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) >
-	    OCFS2_I(inode)->ip_map.em_clusters) {
-		ret = -EBADR;
-		mlog_errno(ret);
-		return ret;
+	struct ocfs2_inode_info *oi = OCFS2_I(inode);
+	struct ocfs2_extent_map *em = &oi->ip_extent_map;
+	struct ocfs2_extent_map_item *emi, *new_emi = NULL;
+	struct ocfs2_extent_map_item ins;
+
+	ins.ei_cpos = le32_to_cpu(rec->e_cpos);
+	ins.ei_phys = ocfs2_blocks_to_clusters(inode->i_sb,
+					       le64_to_cpu(rec->e_blkno));
+	ins.ei_clusters = le16_to_cpu(rec->e_leaf_clusters);
+	ins.ei_flags = rec->e_flags;
+
+search:
+	spin_lock(&oi->ip_lock);
+
+	list_for_each_entry(emi, &em->em_list, ei_list) {
+		if (ocfs2_try_to_merge_extent_map(emi, &ins)) {
+			list_move(&emi->ei_list, &em->em_list);
+			spin_unlock(&oi->ip_lock);
+			goto out;
+		}
 	}
 
-	/* Zero e_clusters means a truncated tail record.  It better be EOF */
-	if (!rec->e_clusters) {
-		if ((le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters)) !=
-		    OCFS2_I(inode)->ip_map.em_clusters) {
-			ret = -EBADR;
-			mlog_errno(ret);
-			ocfs2_error(inode->i_sb,
-				    "Zero e_clusters on non-tail extent record at e_blkno %llu on inode %llu\n",
-				    (unsigned long long)le64_to_cpu(rec->e_blkno),
-				    (unsigned long long)OCFS2_I(inode)->ip_blkno);
-			return ret;
-		}
+	/*
+	 * No item could be merged.
+	 *
+	 * Either allocate and add a new item, or overwrite the last recently
+	 * inserted.
+	 */
 
-		/* Ignore the truncated tail */
-		return 0;
-	}
+	if (em->em_num_items < OCFS2_MAX_EXTENT_MAP_ITEMS) {
+		if (new_emi == NULL) {
+			spin_unlock(&oi->ip_lock);
 
-	ret = -ENOMEM;
-	ctxt.new_ent = kmem_cache_alloc(ocfs2_em_ent_cachep,
-					GFP_NOFS);
-	if (!ctxt.new_ent) {
-		mlog_errno(ret);
-		return ret;
-	}
+			new_emi = kmalloc(sizeof(*new_emi), GFP_NOFS);
+			if (new_emi == NULL)
+				goto out;
 
-	ctxt.new_ent->e_rec = *rec;
-	ctxt.new_ent->e_tree_depth = tree_depth;
-
-	do {
-		ret = -ENOMEM;
-		if (ctxt.need_left && !ctxt.left_ent) {
-			ctxt.left_ent =
-				kmem_cache_alloc(ocfs2_em_ent_cachep,
-						 GFP_NOFS);
-			if (!ctxt.left_ent)
-				break;
-		}
-		if (ctxt.need_right && !ctxt.right_ent) {
-			ctxt.right_ent =
-				kmem_cache_alloc(ocfs2_em_ent_cachep,
-						 GFP_NOFS);
-			if (!ctxt.right_ent)
-				break;
+			goto search;
 		}
 
-		ret = ocfs2_extent_map_try_insert(inode, rec,
-						  tree_depth, &ctxt);
-	} while (ret == -EAGAIN);
-
-	if ((ret < 0) && (ret != -EEXIST))
-		mlog_errno(ret);
+		ocfs2_copy_emi_fields(new_emi, &ins);
+		list_add(&new_emi->ei_list, &em->em_list);
+		em->em_num_items++;
+		new_emi = NULL;
+	} else {
+		BUG_ON(list_empty(&em->em_list) || em->em_num_items == 0);
+		emi = list_entry(em->em_list.prev,
+				 struct ocfs2_extent_map_item, ei_list);
+		list_move(&emi->ei_list, &em->em_list);
+		ocfs2_copy_emi_fields(emi, &ins);
+	}
 
-	if (ctxt.left_ent)
-		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.left_ent);
-	if (ctxt.right_ent)
-		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.right_ent);
-	if (ctxt.old_ent)
-		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.old_ent);
-	if (ctxt.new_ent)
-		kmem_cache_free(ocfs2_em_ent_cachep, ctxt.new_ent);
+	spin_unlock(&oi->ip_lock);
 
-	return ret;
+out:
+	if (new_emi)
+		kfree(new_emi);
 }
 
 /*
- * Append this record to the tail of the extent map.  It must be
- * tree_depth 0.  The record might be an extension of an existing
- * record, and as such that needs to be handled.  eg:
- *
- * Existing record in the extent map:
- *
- *	cpos = 10, len = 10
- *	|---------|
- *
- * New Record:
- *
- *	cpos = 10, len = 20
- *	|------------------|
- *
- * The passed record is the new on-disk record.  The new_clusters value
- * is how many clusters were added to the file.  If the append is a
- * contiguous append, the new_clusters has been added to
- * rec->e_clusters.  If the append is an entirely new extent, then
- * rec->e_clusters is == new_clusters.
+ * Return the 1st index within el which contains an extent start
+ * larger than v_cluster.
  */
-int ocfs2_extent_map_append(struct inode *inode,
-			    struct ocfs2_extent_rec *rec,
-			    u32 new_clusters)
+static int ocfs2_search_for_hole_index(struct ocfs2_extent_list *el,
+				       u32 v_cluster)
 {
-	int ret;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent;
-	struct ocfs2_extent_rec *old;
-
-	BUG_ON(!new_clusters);
-	BUG_ON(le32_to_cpu(rec->e_clusters) < new_clusters);
+	int i;
+	struct ocfs2_extent_rec *rec;
 
-	if (em->em_clusters < OCFS2_I(inode)->ip_clusters) {
-		/*
-		 * Size changed underneath us on disk.  Drop any
-		 * straddling records and update our idea of
-		 * i_clusters
-		 */
-		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
-		em->em_clusters = OCFS2_I(inode)->ip_clusters;
-	}
+	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
 
-	mlog_bug_on_msg((le32_to_cpu(rec->e_cpos) +
-			 le32_to_cpu(rec->e_clusters)) !=
-			(em->em_clusters + new_clusters),
-			"Inode %llu:\n"
-			"rec->e_cpos = %u + rec->e_clusters = %u = %u\n"
-			"em->em_clusters = %u + new_clusters = %u = %u\n",
-			(unsigned long long)OCFS2_I(inode)->ip_blkno,
-			le32_to_cpu(rec->e_cpos), le32_to_cpu(rec->e_clusters),
-			le32_to_cpu(rec->e_cpos) + le32_to_cpu(rec->e_clusters),
-			em->em_clusters, new_clusters,
-			em->em_clusters + new_clusters);
-
-	em->em_clusters += new_clusters;
-
-	ret = -ENOENT;
-	if (le32_to_cpu(rec->e_clusters) > new_clusters) {
-		/* This is a contiguous append */
-		ent = ocfs2_extent_map_lookup(em, le32_to_cpu(rec->e_cpos), 1,
-					      NULL, NULL);
-		if (ent) {
-			old = &ent->e_rec;
-			BUG_ON((le32_to_cpu(rec->e_cpos) +
-				le32_to_cpu(rec->e_clusters)) !=
-				 (le32_to_cpu(old->e_cpos) +
-				  le32_to_cpu(old->e_clusters) +
-				  new_clusters));
-			if (ent->e_tree_depth == 0) {
-				BUG_ON(le32_to_cpu(old->e_cpos) !=
-				       le32_to_cpu(rec->e_cpos));
-				BUG_ON(le64_to_cpu(old->e_blkno) !=
-				       le64_to_cpu(rec->e_blkno));
-				ret = 0;
-			}
-			/*
-			 * Let non-leafs fall through as -ENOENT to
-			 * force insertion of the new leaf.
-			 */
-			le32_add_cpu(&old->e_clusters, new_clusters);
-		}
+		if (v_cluster < le32_to_cpu(rec->e_cpos))
+			break;
 	}
 
-	if (ret == -ENOENT)
-		ret = ocfs2_extent_map_insert(inode, rec, 0);
-	if (ret < 0)
-		mlog_errno(ret);
-	return ret;
+	return i;
 }
 
-#if 0
-/* Code here is included but defined out as it completes the extent
- * map api and may be used in the future. */
-
 /*
- * Look up the record containing this cluster offset.  This record is
- * part of the extent map.  Do not free it.  Any changes you make to
- * it will reflect in the extent map.  So, if your last extent
- * is (cpos = 10, clusters = 10) and you truncate the file by 5
- * clusters, you can do:
+ * Figure out the size of a hole which starts at v_cluster within the given
+ * extent list.
  *
- * ret = ocfs2_extent_map_get_rec(em, orig_size - 5, &rec);
- * rec->e_clusters -= 5;
+ * If there is no more allocation past v_cluster, we return the maximum
+ * cluster size minus v_cluster.
  *
- * The lookup does not read from disk.  If the map isn't filled in for
- * an entry, you won't find it.
- *
- * Also note that the returned record is valid until alloc_sem is
- * dropped.  After that, truncate and extend can happen.  Caveat Emptor.
+ * If we have in-inode extents, then el points to the dinode list and
+ * eb_bh is NULL. Otherwise, eb_bh should point to the extent block
+ * containing el.
  */
-int ocfs2_extent_map_get_rec(struct inode *inode, u32 cpos,
-			     struct ocfs2_extent_rec **rec,
-			     int *tree_depth)
+static int ocfs2_figure_hole_clusters(struct inode *inode,
+				      struct ocfs2_extent_list *el,
+				      struct buffer_head *eb_bh,
+				      u32 v_cluster,
+				      u32 *num_clusters)
 {
-	int ret = -ENOENT;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent;
+	int ret, i;
+	struct buffer_head *next_eb_bh = NULL;
+	struct ocfs2_extent_block *eb, *next_eb;
 
-	*rec = NULL;
+	i = ocfs2_search_for_hole_index(el, v_cluster);
 
-	if (cpos >= OCFS2_I(inode)->ip_clusters)
-		return -EINVAL;
+	if (i == le16_to_cpu(el->l_next_free_rec) && eb_bh) {
+		eb = (struct ocfs2_extent_block *)eb_bh->b_data;
 
-	if (cpos >= em->em_clusters) {
 		/*
-		 * Size changed underneath us on disk.  Drop any
-		 * straddling records and update our idea of
-		 * i_clusters
+		 * Check the next leaf for any extents.
 		 */
-		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
-		em->em_clusters = OCFS2_I(inode)->ip_clusters ;
-	}
-
-	ent = ocfs2_extent_map_lookup(&OCFS2_I(inode)->ip_map, cpos, 1,
-				      NULL, NULL);
 
-	if (ent) {
-		*rec = &ent->e_rec;
-		if (tree_depth)
-			*tree_depth = ent->e_tree_depth;
-		ret = 0;
-	}
+		if (le64_to_cpu(eb->h_next_leaf_blk) == 0ULL)
+			goto no_more_extents;
 
-	return ret;
-}
+		ret = ocfs2_read_block(OCFS2_SB(inode->i_sb),
+				       le64_to_cpu(eb->h_next_leaf_blk),
+				       &next_eb_bh, OCFS2_BH_CACHED, inode);
+		if (ret) {
+			mlog_errno(ret);
+			goto out;
+		}
+		next_eb = (struct ocfs2_extent_block *)next_eb_bh->b_data;
 
-int ocfs2_extent_map_get_clusters(struct inode *inode,
-				  u32 v_cpos, int count,
-				  u32 *p_cpos, int *ret_count)
-{
-	int ret;
-	u32 coff, ccount;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent = NULL;
+		if (!OCFS2_IS_VALID_EXTENT_BLOCK(next_eb)) {
+			ret = -EROFS;
+			OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, next_eb);
+			goto out;
+		}
 
-	*p_cpos = ccount = 0;
+		el = &next_eb->h_list;
 
-	if ((v_cpos + count) > OCFS2_I(inode)->ip_clusters)
-		return -EINVAL;
+		i = ocfs2_search_for_hole_index(el, v_cluster);
+	}
 
-	if ((v_cpos + count) > em->em_clusters) {
+no_more_extents:
+	if (i == le16_to_cpu(el->l_next_free_rec)) {
 		/*
-		 * Size changed underneath us on disk.  Drop any
-		 * straddling records and update our idea of
-		 * i_clusters
+		 * We're at the end of our existing allocation. Just
+		 * return the maximum number of clusters we could
+		 * possibly allocate.
 		 */
-		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
-		em->em_clusters = OCFS2_I(inode)->ip_clusters;
+		*num_clusters = UINT_MAX - v_cluster;
+	} else {
+		*num_clusters = le32_to_cpu(el->l_recs[i].e_cpos) - v_cluster;
 	}
 
+	ret = 0;
+out:
+	brelse(next_eb_bh);
+	return ret;
+}
 
-	ret = ocfs2_extent_map_lookup_read(inode, v_cpos, count, &ent);
-	if (ret)
-		return ret;
+/*
+ * Return the index of the extent record which contains cluster #v_cluster.
+ * -1 is returned if it was not found.
+ *
+ * Should work fine on interior and exterior nodes.
+ */
+static int ocfs2_search_extent_list(struct ocfs2_extent_list *el,
+				    u32 v_cluster)
+{
+	int ret = -1;
+	int i;
+	struct ocfs2_extent_rec *rec;
+	u32 rec_end, rec_start, clusters;
 
-	if (ent) {
-		/* We should never find ourselves straddling an interval */
-		if (!ocfs2_extent_rec_contains_clusters(&ent->e_rec,
-							v_cpos,
-							count))
-			return -ESRCH;
+	for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
+		rec = &el->l_recs[i];
 
-		coff = v_cpos - le32_to_cpu(ent->e_rec.e_cpos);
-		*p_cpos = ocfs2_blocks_to_clusters(inode->i_sb,
-				le64_to_cpu(ent->e_rec.e_blkno)) +
-			  coff;
+		rec_start = le32_to_cpu(rec->e_cpos);
+		clusters = ocfs2_rec_clusters(el, rec);
 
-		if (ret_count)
-			*ret_count = le32_to_cpu(ent->e_rec.e_clusters) - coff;
+		rec_end = rec_start + clusters;
 
-		return 0;
+		if (v_cluster >= rec_start && v_cluster < rec_end) {
+			ret = i;
+			break;
+		}
 	}
 
-
-	return -ENOENT;
+	return ret;
 }
 
-#endif  /*  0  */
-
-int ocfs2_extent_map_get_blocks(struct inode *inode,
-				u64 v_blkno, int count,
-				u64 *p_blkno, int *ret_count)
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster,
+		       u32 *p_cluster, u32 *num_clusters,
+		       unsigned int *extent_flags)
 {
-	int ret;
-	u64 boff;
-	u32 cpos, clusters;
-	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
-	struct ocfs2_extent_map_entry *ent = NULL;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
+	int ret, i;
+	unsigned int flags = 0;
+	struct buffer_head *di_bh = NULL;
+	struct buffer_head *eb_bh = NULL;
+	struct ocfs2_dinode *di;
+	struct ocfs2_extent_block *eb;
+	struct ocfs2_extent_list *el;
 	struct ocfs2_extent_rec *rec;
+	u32 coff;
 
-	*p_blkno = 0;
-
-	cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
-	clusters = ocfs2_blocks_to_clusters(inode->i_sb,
-					    (u64)count + bpc - 1);
-	if ((cpos + clusters) > OCFS2_I(inode)->ip_clusters) {
-		ret = -EINVAL;
-		mlog_errno(ret);
-		return ret;
-	}
-
-	if ((cpos + clusters) > em->em_clusters) {
-		/*
-		 * Size changed underneath us on disk.  Drop any
-		 * straddling records and update our idea of
-		 * i_clusters
-		 */
-		ocfs2_extent_map_drop(inode, em->em_clusters - 1);
-		em->em_clusters = OCFS2_I(inode)->ip_clusters;
-	}
+	ret = ocfs2_extent_map_lookup(inode, v_cluster, p_cluster,
+				      num_clusters, extent_flags);
+	if (ret == 0)
+		goto out;
 
-	ret = ocfs2_extent_map_lookup_read(inode, cpos, clusters, &ent);
+	ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), OCFS2_I(inode)->ip_blkno,
+			       &di_bh, OCFS2_BH_CACHED, inode);
 	if (ret) {
 		mlog_errno(ret);
-		return ret;
+		goto out;
 	}
 
-	if (ent)
-	{
-		rec = &ent->e_rec;
+	di = (struct ocfs2_dinode *) di_bh->b_data;
+	el = &di->id2.i_list;
 
-		/* We should never find ourselves straddling an interval */
-		if (!ocfs2_extent_rec_contains_clusters(rec, cpos, clusters)) {
-			ret = -ESRCH;
+	if (el->l_tree_depth) {
+		ret = ocfs2_find_leaf(inode, el, v_cluster, &eb_bh);
+		if (ret) {
 			mlog_errno(ret);
-			return ret;
+			goto out;
 		}
 
-		boff = ocfs2_clusters_to_blocks(inode->i_sb, cpos -
-						le32_to_cpu(rec->e_cpos));
-		boff += (v_blkno & (u64)(bpc - 1));
-		*p_blkno = le64_to_cpu(rec->e_blkno) + boff;
+		eb = (struct ocfs2_extent_block *) eb_bh->b_data;
+		el = &eb->h_list;
 
-		if (ret_count) {
-			*ret_count = ocfs2_clusters_to_blocks(inode->i_sb,
-					le32_to_cpu(rec->e_clusters)) - boff;
+		if (el->l_tree_depth) {
+			ocfs2_error(inode->i_sb,
+				    "Inode %lu has non zero tree depth in "
+				    "leaf block %llu\n", inode->i_ino,
+				    (unsigned long long)eb_bh->b_blocknr);
+			ret = -EROFS;
+			goto out;
 		}
-
-		return 0;
 	}
 
-	return -ENOENT;
-}
-
-int ocfs2_extent_map_init(struct inode *inode)
-{
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-
-	em->em_extents = RB_ROOT;
-	em->em_clusters = 0;
-
-	return 0;
-}
-
-/* Needs the lock */
-static void __ocfs2_extent_map_drop(struct inode *inode,
-				    u32 new_clusters,
-				    struct rb_node **free_head,
-				    struct ocfs2_extent_map_entry **tail_ent)
-{
-	struct rb_node *node, *next;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent;
+	i = ocfs2_search_extent_list(el, v_cluster);
+	if (i == -1) {
+		/*
+		 * A hole was found. Return some canned values that
+		 * callers can key on. If asked for, num_clusters will
+		 * be populated with the size of the hole.
+		 */
+		*p_cluster = 0;
+		if (num_clusters) {
+			ret = ocfs2_figure_hole_clusters(inode, el, eb_bh,
+							 v_cluster,
+							 num_clusters);
+			if (ret) {
+				mlog_errno(ret);
+				goto out;
+			}
+		}
+	} else {
+		rec = &el->l_recs[i];
 
-	*free_head = NULL;
+		BUG_ON(v_cluster < le32_to_cpu(rec->e_cpos));
 
-	ent = NULL;
-	node = rb_last(&em->em_extents);
-	while (node)
-	{
-		next = rb_prev(node);
+		if (!rec->e_blkno) {
+			ocfs2_error(inode->i_sb, "Inode %lu has bad extent "
+				    "record (%u, %u, 0)", inode->i_ino,
+				    le32_to_cpu(rec->e_cpos),
+				    ocfs2_rec_clusters(el, rec));
+			ret = -EROFS;
+			goto out;
+		}
 
-		ent = rb_entry(node, struct ocfs2_extent_map_entry,
-			       e_node);
-		if (le32_to_cpu(ent->e_rec.e_cpos) < new_clusters)
-			break;
+		coff = v_cluster - le32_to_cpu(rec->e_cpos);
 
-		rb_erase(&ent->e_node, &em->em_extents);
+		*p_cluster = ocfs2_blocks_to_clusters(inode->i_sb,
+						    le64_to_cpu(rec->e_blkno));
+		*p_cluster = *p_cluster + coff;
 
-		node->rb_right = *free_head;
-		*free_head = node;
+		if (num_clusters)
+			*num_clusters = ocfs2_rec_clusters(el, rec) - coff;
 
-		ent = NULL;
-		node = next;
-	}
+		flags = rec->e_flags;
 
-	/* Do we have an entry straddling new_clusters? */
-	if (tail_ent) {
-		if (ent &&
-		    ((le32_to_cpu(ent->e_rec.e_cpos) +
-		      le32_to_cpu(ent->e_rec.e_clusters)) > new_clusters))
-			*tail_ent = ent;
-		else
-			*tail_ent = NULL;
+		ocfs2_extent_map_insert_rec(inode, rec);
 	}
-}
-
-static void __ocfs2_extent_map_drop_cleanup(struct rb_node *free_head)
-{
-	struct rb_node *node;
-	struct ocfs2_extent_map_entry *ent;
 
-	while (free_head) {
-		node = free_head;
-		free_head = node->rb_right;
+	if (extent_flags)
+		*extent_flags = flags;
 
-		ent = rb_entry(node, struct ocfs2_extent_map_entry,
-			       e_node);
-		kmem_cache_free(ocfs2_em_ent_cachep, ent);
-	}
+out:
+	brelse(di_bh);
+	brelse(eb_bh);
+	return ret;
 }
 
 /*
- * Remove all entries past new_clusters, inclusive of an entry that
- * contains new_clusters.  This is effectively a cache forget.
- *
- * If you want to also clip the last extent by some number of clusters,
- * you need to call ocfs2_extent_map_trunc().
- * This code does not check or modify ip_clusters.
+ * This expects alloc_sem to be held. The allocation cannot change at
+ * all while the map is in the process of being updated.
  */
-int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters)
+int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
+				u64 *ret_count, unsigned int *extent_flags)
 {
-	struct rb_node *free_head = NULL;
-	struct ocfs2_extent_map *em = &OCFS2_I(inode)->ip_map;
-	struct ocfs2_extent_map_entry *ent;
-
-	spin_lock(&OCFS2_I(inode)->ip_lock);
+	int ret;
+	int bpc = ocfs2_clusters_to_blocks(inode->i_sb, 1);
+	u32 cpos, num_clusters, p_cluster;
+	u64 boff = 0;
 
-	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
+	cpos = ocfs2_blocks_to_clusters(inode->i_sb, v_blkno);
 
-	if (ent) {
-		rb_erase(&ent->e_node, &em->em_extents);
-		ent->e_node.rb_right = free_head;
-		free_head = &ent->e_node;
+	ret = ocfs2_get_clusters(inode, cpos, &p_cluster, &num_clusters,
+				 extent_flags);
+	if (ret) {
+		mlog_errno(ret);
+		goto out;
 	}
 
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-	if (free_head)
-		__ocfs2_extent_map_drop_cleanup(free_head);
-
-	return 0;
-}
-
-/*
- * Remove all entries past new_clusters and also clip any extent
- * straddling new_clusters, if there is one.  This does not check
- * or modify ip_clusters
- */
-int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters)
-{
-	struct rb_node *free_head = NULL;
-	struct ocfs2_extent_map_entry *ent = NULL;
-
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-
-	__ocfs2_extent_map_drop(inode, new_clusters, &free_head, &ent);
-
-	if (ent)
-		ent->e_rec.e_clusters = cpu_to_le32(new_clusters -
-					       le32_to_cpu(ent->e_rec.e_cpos));
-
-	OCFS2_I(inode)->ip_map.em_clusters = new_clusters;
-
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-	if (free_head)
-		__ocfs2_extent_map_drop_cleanup(free_head);
-
-	return 0;
-}
+	/*
+	 * p_cluster == 0 indicates a hole.
+	 */
+	if (p_cluster) {
+		boff = ocfs2_clusters_to_blocks(inode->i_sb, p_cluster);
+		boff += (v_blkno & (u64)(bpc - 1));
+	}
 
-int __init init_ocfs2_extent_maps(void)
-{
-	ocfs2_em_ent_cachep =
-		kmem_cache_create("ocfs2_em_ent",
-				  sizeof(struct ocfs2_extent_map_entry),
-				  0, SLAB_HWCACHE_ALIGN, NULL, NULL);
-	if (!ocfs2_em_ent_cachep)
-		return -ENOMEM;
+	*p_blkno = boff;
 
-	return 0;
-}
+	if (ret_count) {
+		*ret_count = ocfs2_clusters_to_blocks(inode->i_sb, num_clusters);
+		*ret_count -= v_blkno & (u64)(bpc - 1);
+	}
 
-void exit_ocfs2_extent_maps(void)
-{
-	kmem_cache_destroy(ocfs2_em_ent_cachep);
+out:
+	return ret;
 }
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index fa3745efa886..de91e3e41a22 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -25,22 +25,29 @@
 #ifndef _EXTENT_MAP_H
 #define _EXTENT_MAP_H
 
-int init_ocfs2_extent_maps(void);
-void exit_ocfs2_extent_maps(void);
+struct ocfs2_extent_map_item {
+	unsigned int			ei_cpos;
+	unsigned int			ei_phys;
+	unsigned int			ei_clusters;
+	unsigned int			ei_flags;
 
-/*
- * EVERY CALL here except _init, _trunc, and _drop expects alloc_sem
- * to be held.  The allocation cannot change at all while the map is
- * in the process of being updated.
- */
-int ocfs2_extent_map_init(struct inode *inode);
-int ocfs2_extent_map_append(struct inode *inode,
-			    struct ocfs2_extent_rec *rec,
-			    u32 new_clusters);
-int ocfs2_extent_map_get_blocks(struct inode *inode,
-				u64 v_blkno, int count,
-				u64 *p_blkno, int *ret_count);
-int ocfs2_extent_map_drop(struct inode *inode, u32 new_clusters);
-int ocfs2_extent_map_trunc(struct inode *inode, u32 new_clusters);
+	struct list_head		ei_list;
+};
+
+#define OCFS2_MAX_EXTENT_MAP_ITEMS			3
+struct ocfs2_extent_map {
+	unsigned int			em_num_items;
+	struct list_head		em_list;
+};
+
+void ocfs2_extent_map_init(struct inode *inode);
+void ocfs2_extent_map_trunc(struct inode *inode, unsigned int cluster);
+void ocfs2_extent_map_insert_rec(struct inode *inode,
+				 struct ocfs2_extent_rec *rec);
+
+int ocfs2_get_clusters(struct inode *inode, u32 v_cluster, u32 *p_cluster,
+		       u32 *num_clusters, unsigned int *extent_flags);
+int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
+				u64 *ret_count, unsigned int *extent_flags);
 
 #endif  /* _EXTENT_MAP_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index f2cd3bf9efb2..520a2a6d7670 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -33,6 +33,7 @@
 #include <linux/sched.h>
 #include <linux/pipe_fs_i.h>
 #include <linux/mount.h>
+#include <linux/writeback.h>
 
 #define MLOG_MASK_PREFIX ML_INODE
 #include <cluster/masklog.h>
@@ -215,7 +216,7 @@ int ocfs2_set_inode_size(handle_t *handle,
 
 	mlog_entry_void();
 	i_size_write(inode, new_i_size);
-	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
 	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
 
 	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
@@ -261,6 +262,7 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 {
 	int status;
 	handle_t *handle;
+	struct ocfs2_dinode *di;
 
 	mlog_entry_void();
 
@@ -274,12 +276,39 @@ static int ocfs2_orphan_for_truncate(struct ocfs2_super *osb,
 		goto out;
 	}
 
-	status = ocfs2_set_inode_size(handle, inode, fe_bh, new_i_size);
+	status = ocfs2_journal_access(handle, inode, fe_bh,
+				      OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	/*
+	 * Do this before setting i_size.
+	 */
+	status = ocfs2_zero_tail_for_truncate(inode, handle, new_i_size);
+	if (status) {
+		mlog_errno(status);
+		goto out_commit;
+	}
+
+	i_size_write(inode, new_i_size);
+	inode->i_blocks = ocfs2_align_bytes_to_sectors(new_i_size);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+
+	di = (struct ocfs2_dinode *) fe_bh->b_data;
+	di->i_size = cpu_to_le64(new_i_size);
+	di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec);
+	di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
+
+	status = ocfs2_journal_dirty(handle, fe_bh);
 	if (status < 0)
 		mlog_errno(status);
 
+out_commit:
 	ocfs2_commit_trans(osb, handle);
 out:
+
 	mlog_exit(status);
 	return status;
 }
@@ -342,19 +371,6 @@ static int ocfs2_truncate_file(struct inode *inode,
 		mlog_errno(status);
 		goto bail;
 	}
-	ocfs2_data_unlock(inode, 1);
-
-	if (le32_to_cpu(fe->i_clusters) ==
-	    ocfs2_clusters_for_bytes(osb->sb, new_i_size)) {
-		mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n",
-		     fe->i_clusters);
-		/* No allocation change is required, so lets fast path
-		 * this truncate. */
-		status = ocfs2_simple_size_update(inode, di_bh, new_i_size);
-		if (status < 0)
-			mlog_errno(status);
-		goto bail;
-	}
 
 	/* alright, we're going to need to do a full blown alloc size
 	 * change. Orphan the inode so that recovery can complete the
@@ -363,22 +379,25 @@ static int ocfs2_truncate_file(struct inode *inode,
 	status = ocfs2_orphan_for_truncate(osb, inode, di_bh, new_i_size);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail;
+		goto bail_unlock_data;
 	}
 
 	status = ocfs2_prepare_truncate(osb, inode, di_bh, &tc);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail;
+		goto bail_unlock_data;
 	}
 
 	status = ocfs2_commit_truncate(osb, inode, di_bh, tc);
 	if (status < 0) {
 		mlog_errno(status);
-		goto bail;
+		goto bail_unlock_data;
 	}
 
 	/* TODO: orphan dir cleanup here. */
+bail_unlock_data:
+	ocfs2_data_unlock(inode, 1);
+
 bail:
 
 	mlog_exit(status);
@@ -397,6 +416,7 @@ bail:
  */
 int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       struct inode *inode,
+			       u32 *logical_offset,
 			       u32 clusters_to_add,
 			       struct buffer_head *fe_bh,
 			       handle_t *handle,
@@ -460,18 +480,14 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
 	mlog(0, "Allocating %u clusters at block %u for inode %llu\n",
 	     num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno);
-	status = ocfs2_insert_extent(osb, handle, inode, fe_bh, block,
-				     num_bits, meta_ac);
+	status = ocfs2_insert_extent(osb, handle, inode, fe_bh,
+				     *logical_offset, block, num_bits,
+				     meta_ac);
 	if (status < 0) {
 		mlog_errno(status);
 		goto leave;
 	}
 
-	le32_add_cpu(&fe->i_clusters, num_bits);
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
 	status = ocfs2_journal_dirty(handle, fe_bh);
 	if (status < 0) {
 		mlog_errno(status);
@@ -479,6 +495,7 @@ int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 	}
 
 	clusters_to_add -= num_bits;
+	*logical_offset += num_bits;
 
 	if (clusters_to_add) {
 		mlog(0, "need to alloc once more, clusters = %u, wanted = "
@@ -494,14 +511,87 @@ leave:
 	return status;
 }
 
+/*
+ * For a given allocation, determine which allocators will need to be
+ * accessed, and lock them, reserving the appropriate number of bits.
+ *
+ * Called from ocfs2_extend_allocation() for file systems which don't
+ * support holes, and from ocfs2_write() for file systems which
+ * understand sparse inodes.
+ */
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
+			  u32 clusters_to_add,
+			  struct ocfs2_alloc_context **data_ac,
+			  struct ocfs2_alloc_context **meta_ac)
+{
+	int ret, num_free_extents;
+	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+
+	*meta_ac = NULL;
+	*data_ac = NULL;
+
+	mlog(0, "extend inode %llu, i_size = %lld, di->i_clusters = %u, "
+	     "clusters_to_add = %u\n",
+	     (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
+	     le32_to_cpu(di->i_clusters), clusters_to_add);
+
+	num_free_extents = ocfs2_num_free_extents(osb, inode, di);
+	if (num_free_extents < 0) {
+		ret = num_free_extents;
+		mlog_errno(ret);
+		goto out;
+	}
+
+	/*
+	 * Sparse allocation file systems need to be more conservative
+	 * with reserving room for expansion - the actual allocation
+	 * happens while we've got a journal handle open so re-taking
+	 * a cluster lock (because we ran out of room for another
+	 * extent) will violate ordering rules.
+	 *
+	 * Most of the time we'll only be seeing this 1 cluster at a time
+	 * anyway.
+	 */
+	if (!num_free_extents ||
+	    (ocfs2_sparse_alloc(osb) && num_free_extents < clusters_to_add)) {
+		ret = ocfs2_reserve_new_metadata(osb, di, meta_ac);
+		if (ret < 0) {
+			if (ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+	}
+
+	ret = ocfs2_reserve_clusters(osb, clusters_to_add, data_ac);
+	if (ret < 0) {
+		if (ret != -ENOSPC)
+			mlog_errno(ret);
+		goto out;
+	}
+
+out:
+	if (ret) {
+		if (*meta_ac) {
+			ocfs2_free_alloc_context(*meta_ac);
+			*meta_ac = NULL;
+		}
+
+		/*
+		 * We cannot have an error and a non null *data_ac.
+		 */
+	}
+
+	return ret;
+}
+
 static int ocfs2_extend_allocation(struct inode *inode,
 				   u32 clusters_to_add)
 {
 	int status = 0;
 	int restart_func = 0;
 	int drop_alloc_sem = 0;
-	int credits, num_free_extents;
-	u32 prev_clusters;
+	int credits;
+	u32 prev_clusters, logical_start;
 	struct buffer_head *bh = NULL;
 	struct ocfs2_dinode *fe = NULL;
 	handle_t *handle = NULL;
@@ -512,6 +602,12 @@ static int ocfs2_extend_allocation(struct inode *inode,
 
 	mlog_entry("(clusters_to_add = %u)\n", clusters_to_add);
 
+	/*
+	 * This function only exists for file systems which don't
+	 * support holes.
+	 */
+	BUG_ON(ocfs2_sparse_alloc(osb));
+
 	status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh,
 				  OCFS2_BH_CACHED, inode);
 	if (status < 0) {
@@ -526,39 +622,11 @@ static int ocfs2_extend_allocation(struct inode *inode,
 		goto leave;
 	}
 
+	logical_start = OCFS2_I(inode)->ip_clusters;
+
 restart_all:
 	BUG_ON(le32_to_cpu(fe->i_clusters) != OCFS2_I(inode)->ip_clusters);
 
-	mlog(0, "extend inode %llu, i_size = %lld, fe->i_clusters = %u, "
-	     "clusters_to_add = %u\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno, i_size_read(inode),
-	     fe->i_clusters, clusters_to_add);
-
-	num_free_extents = ocfs2_num_free_extents(osb,
-						  inode,
-						  fe);
-	if (num_free_extents < 0) {
-		status = num_free_extents;
-		mlog_errno(status);
-		goto leave;
-	}
-
-	if (!num_free_extents) {
-		status = ocfs2_reserve_new_metadata(osb, fe, &meta_ac);
-		if (status < 0) {
-			if (status != -ENOSPC)
-				mlog_errno(status);
-			goto leave;
-		}
-	}
-
-	status = ocfs2_reserve_clusters(osb, clusters_to_add, &data_ac);
-	if (status < 0) {
-		if (status != -ENOSPC)
-			mlog_errno(status);
-		goto leave;
-	}
-
 	/* blocks peope in read/write from reading our allocation
 	 * until we're done changing it. We depend on i_mutex to block
 	 * other extend/truncate calls while we're here. Ordering wrt
@@ -566,6 +634,13 @@ restart_all:
 	down_write(&OCFS2_I(inode)->ip_alloc_sem);
 	drop_alloc_sem = 1;
 
+	status = ocfs2_lock_allocators(inode, fe, clusters_to_add, &data_ac,
+				       &meta_ac);
+	if (status) {
+		mlog_errno(status);
+		goto leave;
+	}
+
 	credits = ocfs2_calc_extend_credits(osb->sb, fe, clusters_to_add);
 	handle = ocfs2_start_trans(osb, credits);
 	if (IS_ERR(handle)) {
@@ -590,6 +665,7 @@ restarted_transaction:
 
 	status = ocfs2_do_extend_allocation(osb,
 					    inode,
+					    &logical_start,
 					    clusters_to_add,
 					    bh,
 					    handle,
@@ -778,7 +854,7 @@ static int ocfs2_extend_file(struct inode *inode,
 			     size_t tail_to_skip)
 {
 	int ret = 0;
-	u32 clusters_to_add;
+	u32 clusters_to_add = 0;
 
 	BUG_ON(!tail_to_skip && !di_bh);
 
@@ -790,6 +866,11 @@ static int ocfs2_extend_file(struct inode *inode,
   		goto out;
 	BUG_ON(new_i_size < i_size_read(inode));
 
+	if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+		BUG_ON(tail_to_skip != 0);
+		goto out_update_size;
+	}
+
 	clusters_to_add = ocfs2_clusters_for_bytes(inode->i_sb, new_i_size) - 
 		OCFS2_I(inode)->ip_clusters;
 
@@ -825,6 +906,7 @@ static int ocfs2_extend_file(struct inode *inode,
 		goto out_unlock;
 	}
 
+out_update_size:
 	if (!tail_to_skip) {
 		/* We're being called from ocfs2_setattr() which wants
 		 * us to update i_size */
@@ -834,7 +916,8 @@ static int ocfs2_extend_file(struct inode *inode,
 	}
 
 out_unlock:
-	ocfs2_data_unlock(inode, 1);
+	if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb)))
+		ocfs2_data_unlock(inode, 1);
 
 out:
 	return ret;
@@ -972,7 +1055,8 @@ int ocfs2_permission(struct inode *inode, int mask, struct nameidata *nd)
 
 	ret = ocfs2_meta_lock(inode, NULL, 0);
 	if (ret) {
-		mlog_errno(ret);
+		if (ret != -ENOENT)
+			mlog_errno(ret);
 		goto out;
 	}
 
@@ -1035,10 +1119,49 @@ out:
 	return ret;
 }
 
+/*
+ * Will look for holes and unwritten extents in the range starting at
+ * pos for count bytes (inclusive).
+ */
+static int ocfs2_check_range_for_holes(struct inode *inode, loff_t pos,
+				       size_t count)
+{
+	int ret = 0;
+	unsigned int extent_flags;
+	u32 cpos, clusters, extent_len, phys_cpos;
+	struct super_block *sb = inode->i_sb;
+
+	cpos = pos >> OCFS2_SB(sb)->s_clustersize_bits;
+	clusters = ocfs2_clusters_for_bytes(sb, pos + count) - cpos;
+
+	while (clusters) {
+		ret = ocfs2_get_clusters(inode, cpos, &phys_cpos, &extent_len,
+					 &extent_flags);
+		if (ret < 0) {
+			mlog_errno(ret);
+			goto out;
+		}
+
+		if (phys_cpos == 0 || (extent_flags & OCFS2_EXT_UNWRITTEN)) {
+			ret = 1;
+			break;
+		}
+
+		if (extent_len > clusters)
+			extent_len = clusters;
+
+		clusters -= extent_len;
+		cpos += extent_len;
+	}
+out:
+	return ret;
+}
+
 static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 					 loff_t *ppos,
 					 size_t count,
-					 int appending)
+					 int appending,
+					 int *direct_io)
 {
 	int ret = 0, meta_level = appending;
 	struct inode *inode = dentry->d_inode;
@@ -1089,6 +1212,49 @@ static int ocfs2_prepare_inode_for_write(struct dentry *dentry,
 		} else {
 			saved_pos = *ppos;
 		}
+
+		if (ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) {
+			loff_t end = saved_pos + count;
+
+			/*
+			 * Skip the O_DIRECT checks if we don't need
+			 * them.
+			 */
+			if (!direct_io || !(*direct_io))
+				break;
+
+			/*
+			 * Allowing concurrent direct writes means
+			 * i_size changes wouldn't be synchronized, so
+			 * one node could wind up truncating another
+			 * nodes writes.
+			 */
+			if (end > i_size_read(inode)) {
+				*direct_io = 0;
+				break;
+			}
+
+			/*
+			 * We don't fill holes during direct io, so
+			 * check for them here. If any are found, the
+			 * caller will have to retake some cluster
+			 * locks and initiate the io as buffered.
+			 */
+			ret = ocfs2_check_range_for_holes(inode, saved_pos,
+							  count);
+			if (ret == 1) {
+				*direct_io = 0;
+				ret = 0;
+			} else if (ret < 0)
+				mlog_errno(ret);
+			break;
+		}
+
+		/*
+		 * The rest of this loop is concerned with legacy file
+		 * systems which don't support sparse files.
+		 */
+
 		newsize = count + saved_pos;
 
 		mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
@@ -1141,55 +1307,264 @@ out:
 	return ret;
 }
 
+static inline void
+ocfs2_set_next_iovec(const struct iovec **iovp, size_t *basep, size_t bytes)
+{
+	const struct iovec *iov = *iovp;
+	size_t base = *basep;
+
+	do {
+		int copy = min(bytes, iov->iov_len - base);
+
+		bytes -= copy;
+		base += copy;
+		if (iov->iov_len == base) {
+			iov++;
+			base = 0;
+		}
+	} while (bytes);
+	*iovp = iov;
+	*basep = base;
+}
+
+static struct page * ocfs2_get_write_source(struct ocfs2_buffered_write_priv *bp,
+					    const struct iovec *cur_iov,
+					    size_t iov_offset)
+{
+	int ret;
+	char *buf;
+	struct page *src_page = NULL;
+
+	buf = cur_iov->iov_base + iov_offset;
+
+	if (!segment_eq(get_fs(), KERNEL_DS)) {
+		/*
+		 * Pull in the user page. We want to do this outside
+		 * of the meta data locks in order to preserve locking
+		 * order in case of page fault.
+		 */
+		ret = get_user_pages(current, current->mm,
+				     (unsigned long)buf & PAGE_CACHE_MASK, 1,
+				     0, 0, &src_page, NULL);
+		if (ret == 1)
+			bp->b_src_buf = kmap(src_page);
+		else
+			src_page = ERR_PTR(-EFAULT);
+	} else {
+		bp->b_src_buf = buf;
+	}
+
+	return src_page;
+}
+
+static void ocfs2_put_write_source(struct ocfs2_buffered_write_priv *bp,
+				   struct page *page)
+{
+	if (page) {
+		kunmap(page);
+		page_cache_release(page);
+	}
+}
+
+static ssize_t ocfs2_file_buffered_write(struct file *file, loff_t *ppos,
+					 const struct iovec *iov,
+					 unsigned long nr_segs,
+					 size_t count,
+					 ssize_t o_direct_written)
+{
+	int ret = 0;
+	ssize_t copied, total = 0;
+	size_t iov_offset = 0;
+	const struct iovec *cur_iov = iov;
+	struct ocfs2_buffered_write_priv bp;
+	struct page *page;
+
+	/*
+	 * handle partial DIO write.  Adjust cur_iov if needed.
+	 */
+	ocfs2_set_next_iovec(&cur_iov, &iov_offset, o_direct_written);
+
+	do {
+		bp.b_cur_off = iov_offset;
+		bp.b_cur_iov = cur_iov;
+
+		page = ocfs2_get_write_source(&bp, cur_iov, iov_offset);
+		if (IS_ERR(page)) {
+			ret = PTR_ERR(page);
+			goto out;
+		}
+
+		copied = ocfs2_buffered_write_cluster(file, *ppos, count,
+						      ocfs2_map_and_write_user_data,
+						      &bp);
+
+		ocfs2_put_write_source(&bp, page);
+
+		if (copied < 0) {
+			mlog_errno(copied);
+			ret = copied;
+			goto out;
+		}
+
+		total += copied;
+		*ppos = *ppos + copied;
+		count -= copied;
+
+		ocfs2_set_next_iovec(&cur_iov, &iov_offset, copied);
+	} while(count);
+
+out:
+	return total ? total : ret;
+}
+
+static int ocfs2_check_iovec(const struct iovec *iov, size_t *counted,
+			     unsigned long *nr_segs)
+{
+	size_t ocount;		/* original count */
+	unsigned long seg;
+
+	ocount = 0;
+	for (seg = 0; seg < *nr_segs; seg++) {
+		const struct iovec *iv = &iov[seg];
+
+		/*
+		 * If any segment has a negative length, or the cumulative
+		 * length ever wraps negative then return -EINVAL.
+		 */
+		ocount += iv->iov_len;
+		if (unlikely((ssize_t)(ocount|iv->iov_len) < 0))
+			return -EINVAL;
+		if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len))
+			continue;
+		if (seg == 0)
+			return -EFAULT;
+		*nr_segs = seg;
+		ocount -= iv->iov_len;	/* This segment is no good */
+		break;
+	}
+
+	*counted = ocount;
+	return 0;
+}
+
 static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 				    const struct iovec *iov,
 				    unsigned long nr_segs,
 				    loff_t pos)
 {
-	int ret, rw_level, have_alloc_sem = 0;
-	struct file *filp = iocb->ki_filp;
-	struct inode *inode = filp->f_path.dentry->d_inode;
-	int appending = filp->f_flags & O_APPEND ? 1 : 0;
-
-	mlog_entry("(0x%p, %u, '%.*s')\n", filp,
+	int ret, direct_io, appending, rw_level, have_alloc_sem  = 0;
+	int can_do_direct, sync = 0;
+	ssize_t written = 0;
+	size_t ocount;		/* original count */
+	size_t count;		/* after file limit checks */
+	loff_t *ppos = &iocb->ki_pos;
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_path.dentry->d_inode;
+
+	mlog_entry("(0x%p, %u, '%.*s')\n", file,
 		   (unsigned int)nr_segs,
-		   filp->f_path.dentry->d_name.len,
-		   filp->f_path.dentry->d_name.name);
+		   file->f_path.dentry->d_name.len,
+		   file->f_path.dentry->d_name.name);
 
-	/* happy write of zero bytes */
 	if (iocb->ki_left == 0)
 		return 0;
 
+	ret = ocfs2_check_iovec(iov, &ocount, &nr_segs);
+	if (ret)
+		return ret;
+
+	count = ocount;
+
+	vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE);
+
+	appending = file->f_flags & O_APPEND ? 1 : 0;
+	direct_io = file->f_flags & O_DIRECT ? 1 : 0;
+
 	mutex_lock(&inode->i_mutex);
+
+relock:
 	/* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */
-	if (filp->f_flags & O_DIRECT) {
-		have_alloc_sem = 1;
+	if (direct_io) {
 		down_read(&inode->i_alloc_sem);
+		have_alloc_sem = 1;
 	}
 
 	/* concurrent O_DIRECT writes are allowed */
-	rw_level = (filp->f_flags & O_DIRECT) ? 0 : 1;
+	rw_level = !direct_io;
 	ret = ocfs2_rw_lock(inode, rw_level);
 	if (ret < 0) {
-		rw_level = -1;
 		mlog_errno(ret);
-		goto out;
+		goto out_sems;
 	}
 
-	ret = ocfs2_prepare_inode_for_write(filp->f_path.dentry, &iocb->ki_pos,
-					    iocb->ki_left, appending);
+	can_do_direct = direct_io;
+	ret = ocfs2_prepare_inode_for_write(file->f_path.dentry, ppos,
+					    iocb->ki_left, appending,
+					    &can_do_direct);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out;
 	}
 
-	/* communicate with ocfs2_dio_end_io */
-	ocfs2_iocb_set_rw_locked(iocb);
+	/*
+	 * We can't complete the direct I/O as requested, fall back to
+	 * buffered I/O.
+	 */
+	if (direct_io && !can_do_direct) {
+		ocfs2_rw_unlock(inode, rw_level);
+		up_read(&inode->i_alloc_sem);
+
+		have_alloc_sem = 0;
+		rw_level = -1;
 
-	ret = generic_file_aio_write_nolock(iocb, iov, nr_segs, iocb->ki_pos);
+		direct_io = 0;
+		sync = 1;
+		goto relock;
+	}
+
+	if (!sync && ((file->f_flags & O_SYNC) || IS_SYNC(inode)))
+		sync = 1;
+
+	/*
+	 * XXX: Is it ok to execute these checks a second time?
+	 */
+	ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode));
+	if (ret)
+		goto out;
+
+	/*
+	 * Set pos so that sync_page_range_nolock() below understands
+	 * where to start from. We might've moved it around via the
+	 * calls above. The range we want to actually sync starts from
+	 * *ppos here.
+	 *
+	 */
+	pos = *ppos;
+
+	/* communicate with ocfs2_dio_end_io */
+	ocfs2_iocb_set_rw_locked(iocb, rw_level);
+
+	if (direct_io) {
+		written = generic_file_direct_write(iocb, iov, &nr_segs, *ppos,
+						    ppos, count, ocount);
+		if (written < 0) {
+			ret = written;
+			goto out_dio;
+		}
+	} else {
+		written = ocfs2_file_buffered_write(file, ppos, iov, nr_segs,
+						    count, written);
+		if (written < 0) {
+			ret = written;
+			if (ret != -EFAULT || ret != -ENOSPC)
+				mlog_errno(ret);
+			goto out;
+		}
+	}
 
+out_dio:
 	/* buffered aio wouldn't have proper lock coverage today */
-	BUG_ON(ret == -EIOCBQUEUED && !(filp->f_flags & O_DIRECT));
+	BUG_ON(ret == -EIOCBQUEUED && !(file->f_flags & O_DIRECT));
 
 	/* 
 	 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
@@ -1207,13 +1582,102 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	}
 
 out:
+	if (rw_level != -1)
+		ocfs2_rw_unlock(inode, rw_level);
+
+out_sems:
 	if (have_alloc_sem)
 		up_read(&inode->i_alloc_sem);
-	if (rw_level != -1) 
-		ocfs2_rw_unlock(inode, rw_level);
+
+	if (written > 0 && sync) {
+		ssize_t err;
+
+		err = sync_page_range_nolock(inode, file->f_mapping, pos, count);
+		if (err < 0)
+			written = err;
+	}
+
 	mutex_unlock(&inode->i_mutex);
 
 	mlog_exit(ret);
+	return written ? written : ret;
+}
+
+static int ocfs2_splice_write_actor(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf,
+				    struct splice_desc *sd)
+{
+	int ret, count, total = 0;
+	ssize_t copied = 0;
+	struct ocfs2_splice_write_priv sp;
+
+	ret = buf->ops->pin(pipe, buf);
+	if (ret)
+		goto out;
+
+	sp.s_sd = sd;
+	sp.s_buf = buf;
+	sp.s_pipe = pipe;
+	sp.s_offset = sd->pos & ~PAGE_CACHE_MASK;
+	sp.s_buf_offset = buf->offset;
+
+	count = sd->len;
+	if (count + sp.s_offset > PAGE_CACHE_SIZE)
+		count = PAGE_CACHE_SIZE - sp.s_offset;
+
+	do {
+		/*
+		 * splice wants us to copy up to one page at a
+		 * time. For pagesize > cluster size, this means we
+		 * might enter ocfs2_buffered_write_cluster() more
+		 * than once, so keep track of our progress here.
+		 */
+		copied = ocfs2_buffered_write_cluster(sd->file,
+						      (loff_t)sd->pos + total,
+						      count,
+						      ocfs2_map_and_write_splice_data,
+						      &sp);
+		if (copied < 0) {
+			mlog_errno(copied);
+			ret = copied;
+			goto out;
+		}
+
+		count -= copied;
+		sp.s_offset += copied;
+		sp.s_buf_offset += copied;
+		total += copied;
+	} while (count);
+
+	ret = 0;
+out:
+
+	return total ? total : ret;
+}
+
+static ssize_t __ocfs2_file_splice_write(struct pipe_inode_info *pipe,
+					 struct file *out,
+					 loff_t *ppos,
+					 size_t len,
+					 unsigned int flags)
+{
+	int ret, err;
+	struct address_space *mapping = out->f_mapping;
+	struct inode *inode = mapping->host;
+
+	ret = __splice_from_pipe(pipe, out, ppos, len, flags,
+				 ocfs2_splice_write_actor);
+	if (ret > 0) {
+		*ppos += ret;
+
+		if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) {
+			err = generic_osync_inode(inode, mapping,
+						  OSYNC_METADATA|OSYNC_DATA);
+			if (err)
+				ret = err;
+		}
+	}
+
 	return ret;
 }
 
@@ -1239,14 +1703,15 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
 		goto out;
 	}
 
-	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0);
+	ret = ocfs2_prepare_inode_for_write(out->f_path.dentry, ppos, len, 0,
+					    NULL);
 	if (ret < 0) {
 		mlog_errno(ret);
 		goto out_unlock;
 	}
 
 	/* ok, we're done with i_size and alloc work */
-	ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
+	ret = __ocfs2_file_splice_write(pipe, out, ppos, len, flags);
 
 out_unlock:
 	ocfs2_rw_unlock(inode, 1);
@@ -1323,7 +1788,7 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
 		}
 		rw_level = 0;
 		/* communicate with ocfs2_dio_end_io */
-		ocfs2_iocb_set_rw_locked(iocb);
+		ocfs2_iocb_set_rw_locked(iocb, rw_level);
 	}
 
 	/*
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index cc973f01f6ce..2c4460fced52 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -39,12 +39,17 @@ enum ocfs2_alloc_restarted {
 };
 int ocfs2_do_extend_allocation(struct ocfs2_super *osb,
 			       struct inode *inode,
+			       u32 *cluster_start,
 			       u32 clusters_to_add,
 			       struct buffer_head *fe_bh,
 			       handle_t *handle,
 			       struct ocfs2_alloc_context *data_ac,
 			       struct ocfs2_alloc_context *meta_ac,
 			       enum ocfs2_alloc_restarted *reason);
+int ocfs2_lock_allocators(struct inode *inode, struct ocfs2_dinode *di,
+			  u32 clusters_to_add,
+			  struct ocfs2_alloc_context **data_ac,
+			  struct ocfs2_alloc_context **meta_ac);
 int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
 int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
 		  struct kstat *stat);
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index 28ab56f2b98c..21a605079c62 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -89,24 +89,6 @@ void ocfs2_set_inode_flags(struct inode *inode)
 		inode->i_flags |= S_DIRSYNC;
 }
 
-struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
-				     u64 blkno,
-				     int delete_vote)
-{
-	struct ocfs2_find_inode_args args;
-
-	/* ocfs2_ilookup_for_vote should *only* be called from the
-	 * vote thread */
-	BUG_ON(current != osb->vote_task);
-
-	args.fi_blkno = blkno;
-	args.fi_flags = OCFS2_FI_FLAG_NOWAIT;
-	if (delete_vote)
-		args.fi_flags |= OCFS2_FI_FLAG_DELETE;
-	args.fi_ino = ino_from_blkno(osb->sb, blkno);
-	return ilookup5(osb->sb, args.fi_ino, ocfs2_find_actor, &args);
-}
-
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 blkno, int flags)
 {
 	struct inode *inode = NULL;
@@ -182,28 +164,6 @@ static int ocfs2_find_actor(struct inode *inode, void *opaque)
 	if (oi->ip_blkno != args->fi_blkno)
 		goto bail;
 
-	/* OCFS2_FI_FLAG_NOWAIT is *only* set from
-	 * ocfs2_ilookup_for_vote which won't create an inode for one
-	 * that isn't found. The vote thread which doesn't want to get
-	 * an inode which is in the process of going away - otherwise
-	 * the call to __wait_on_freeing_inode in find_inode_fast will
-	 * cause it to deadlock on an inode which may be waiting on a
-	 * vote (or lock release) in delete_inode */
-	if ((args->fi_flags & OCFS2_FI_FLAG_NOWAIT) &&
-	    (inode->i_state & (I_FREEING|I_CLEAR))) {
-		/* As stated above, we're not going to return an
-		 * inode.  In the case of a delete vote, the voting
-		 * code is going to signal the other node to go
-		 * ahead. Mark that state here, so this freeing inode
-		 * has the state when it gets to delete_inode. */
-		if (args->fi_flags & OCFS2_FI_FLAG_DELETE) {
-			spin_lock(&oi->ip_lock);
-			ocfs2_mark_inode_remotely_deleted(inode);
-			spin_unlock(&oi->ip_lock);
-		}
-		goto bail;
-	}
-
 	ret = 1;
 bail:
 	mlog_exit(ret);
@@ -261,6 +221,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		goto bail;
 	}
 
+	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
+	OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
+
 	inode->i_version = 1;
 	inode->i_generation = le32_to_cpu(fe->i_generation);
 	inode->i_rdev = huge_decode_dev(le64_to_cpu(fe->id1.dev1.i_rdev));
@@ -272,8 +235,7 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 	if (S_ISLNK(inode->i_mode) && !fe->i_clusters)
 		inode->i_blocks = 0;
 	else
-		inode->i_blocks =
-			ocfs2_align_bytes_to_sectors(le64_to_cpu(fe->i_size));
+		inode->i_blocks = ocfs2_inode_sector_count(inode);
 	inode->i_mapping->a_ops = &ocfs2_aops;
 	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
 	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
@@ -288,10 +250,6 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
 		     (unsigned long long)fe->i_blkno);
 
-	OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
-	OCFS2_I(inode)->ip_orphaned_slot = OCFS2_INVALID_SLOT;
-	OCFS2_I(inode)->ip_attr = le32_to_cpu(fe->i_attr);
-
 	inode->i_nlink = le16_to_cpu(fe->i_links_count);
 
 	if (fe->i_flags & cpu_to_le32(OCFS2_SYSTEM_FL))
@@ -347,6 +305,9 @@ int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
 
 		ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_meta_lockres,
 					  OCFS2_LOCK_TYPE_META, 0, inode);
+
+		ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+					  OCFS2_LOCK_TYPE_OPEN, 0, inode);
 	}
 
 	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_rw_lockres,
@@ -421,7 +382,7 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 	 * cluster lock before trusting anything anyway.
 	 */
 	can_lock = !(args->fi_flags & OCFS2_FI_FLAG_SYSFILE)
-		&& !(args->fi_flags & OCFS2_FI_FLAG_NOLOCK)
+		&& !(args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY)
 		&& !ocfs2_mount_local(osb);
 
 	/*
@@ -438,7 +399,17 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 				  OCFS2_LOCK_TYPE_META,
 				  generation, inode);
 
+	ocfs2_inode_lock_res_init(&OCFS2_I(inode)->ip_open_lockres,
+				  OCFS2_LOCK_TYPE_OPEN,
+				  0, inode);
+
 	if (can_lock) {
+		status = ocfs2_open_lock(inode);
+		if (status) {
+			make_bad_inode(inode);
+			mlog_errno(status);
+			return status;
+		}
 		status = ocfs2_meta_lock(inode, NULL, 0);
 		if (status) {
 			make_bad_inode(inode);
@@ -447,6 +418,14 @@ static int ocfs2_read_locked_inode(struct inode *inode,
 		}
 	}
 
+	if (args->fi_flags & OCFS2_FI_FLAG_ORPHAN_RECOVERY) {
+		status = ocfs2_try_open_lock(inode, 0);
+		if (status) {
+			make_bad_inode(inode);	
+			return status;
+		}
+	}
+
 	status = ocfs2_read_block(osb, args->fi_blkno, &bh, 0,
 				  can_lock ? inode : NULL);
 	if (status < 0) {
@@ -507,50 +486,56 @@ static int ocfs2_truncate_for_delete(struct ocfs2_super *osb,
 				     struct buffer_head *fe_bh)
 {
 	int status = 0;
-	handle_t *handle = NULL;
 	struct ocfs2_truncate_context *tc = NULL;
 	struct ocfs2_dinode *fe;
+	handle_t *handle = NULL;
 
 	mlog_entry_void();
 
 	fe = (struct ocfs2_dinode *) fe_bh->b_data;
 
-	/* zero allocation, zero truncate :) */
-	if (!fe->i_clusters)
-		goto bail;
+	if (fe->i_clusters) {
+		handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
+		if (IS_ERR(handle)) {
+			status = PTR_ERR(handle);
+			mlog_errno(status);
+			goto out;
+		}
 
-	handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
-	if (IS_ERR(handle)) {
-		status = PTR_ERR(handle);
-		handle = NULL;
-		mlog_errno(status);
-		goto bail;
-	}
+		status = ocfs2_journal_access(handle, inode, fe_bh,
+					      OCFS2_JOURNAL_ACCESS_WRITE);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out;
+		}
 
-	status = ocfs2_set_inode_size(handle, inode, fe_bh, 0ULL);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+		i_size_write(inode, 0);
 
-	ocfs2_commit_trans(osb, handle);
-	handle = NULL;
+		status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out;
+		}
 
-	status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
-	}
+		ocfs2_commit_trans(osb, handle);
+		handle = NULL;
 
-	status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
-	if (status < 0) {
-		mlog_errno(status);
-		goto bail;
+		status = ocfs2_prepare_truncate(osb, inode, fe_bh, &tc);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out;
+		}
+
+		status = ocfs2_commit_truncate(osb, inode, fe_bh, tc);
+		if (status < 0) {
+			mlog_errno(status);
+			goto out;
+		}
 	}
-bail:
+
+out:
 	if (handle)
 		ocfs2_commit_trans(osb, handle);
-
 	mlog_exit(status);
 	return status;
 }
@@ -678,10 +663,10 @@ static int ocfs2_wipe_inode(struct inode *inode,
 	struct inode *orphan_dir_inode = NULL;
 	struct buffer_head *orphan_dir_bh = NULL;
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
+	struct ocfs2_dinode *di;
 
-	/* We've already voted on this so it should be readonly - no
-	 * spinlock needed. */
-	orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
+	di = (struct ocfs2_dinode *) di_bh->b_data;
+	orphaned_slot = le16_to_cpu(di->i_orphaned_slot);
 
 	status = ocfs2_check_orphan_recovery_state(osb, orphaned_slot);
 	if (status)
@@ -839,11 +824,20 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 		goto bail;
 	}
 
-	status = ocfs2_request_delete_vote(inode);
-	/* -EBUSY means that other nodes are still using the
-	 * inode. We're done here though, so avoid doing anything on
-	 * disk and let them worry about deleting it. */
-	if (status == -EBUSY) {
+	/*
+	 * This is how ocfs2 determines whether an inode is still live
+	 * within the cluster. Every node takes a shared read lock on
+	 * the inode open lock in ocfs2_read_locked_inode(). When we
+	 * get to ->delete_inode(), each node tries to convert it's
+	 * lock to an exclusive. Trylocks are serialized by the inode
+	 * meta data lock. If the upconvert suceeds, we know the inode
+	 * is no longer live and can be deleted.
+	 *
+	 * Though we call this with the meta data lock held, the
+	 * trylock keeps us from ABBA deadlock.
+	 */
+	status = ocfs2_try_open_lock(inode, 1);
+	if (status == -EAGAIN) {
 		status = 0;
 		mlog(0, "Skipping delete of %llu because it is in use on"
 		     "other nodes\n", (unsigned long long)oi->ip_blkno);
@@ -854,21 +848,10 @@ static int ocfs2_query_inode_wipe(struct inode *inode,
 		goto bail;
 	}
 
-	spin_lock(&oi->ip_lock);
-	if (oi->ip_orphaned_slot == OCFS2_INVALID_SLOT) {
-		/* Nobody knew which slot this inode was orphaned
-		 * into. This may happen during node death and
-		 * recovery knows how to clean it up so we can safely
-		 * ignore this inode for now on. */
-		mlog(0, "Nobody knew where inode %llu was orphaned!\n",
-		     (unsigned long long)oi->ip_blkno);
-	} else {
-		*wipe = 1;
-
-		mlog(0, "Inode %llu is ok to wipe from orphan dir %d\n",
-		     (unsigned long long)oi->ip_blkno, oi->ip_orphaned_slot);
-	}
-	spin_unlock(&oi->ip_lock);
+	*wipe = 1;
+	mlog(0, "Inode %llu is ok to wipe from orphan dir %u\n",
+	     (unsigned long long)oi->ip_blkno,
+	     le16_to_cpu(di->i_orphaned_slot));
 
 bail:
 	return status;
@@ -1001,11 +984,16 @@ void ocfs2_clear_inode(struct inode *inode)
 	mlog_bug_on_msg(OCFS2_SB(inode->i_sb) == NULL,
 			"Inode=%lu\n", inode->i_ino);
 
+	/* For remove delete_inode vote, we hold open lock before,
+	 * now it is time to unlock PR and EX open locks. */
+	ocfs2_open_unlock(inode);
+
 	/* Do these before all the other work so that we don't bounce
 	 * the vote thread while waiting to destroy the locks. */
 	ocfs2_mark_lockres_freeing(&oi->ip_rw_lockres);
 	ocfs2_mark_lockres_freeing(&oi->ip_meta_lockres);
 	ocfs2_mark_lockres_freeing(&oi->ip_data_lockres);
+	ocfs2_mark_lockres_freeing(&oi->ip_open_lockres);
 
 	/* We very well may get a clear_inode before all an inodes
 	 * metadata has hit disk. Of course, we can't drop any cluster
@@ -1020,8 +1008,7 @@ void ocfs2_clear_inode(struct inode *inode)
 			"Clear inode of %llu, inode has io markers\n",
 			(unsigned long long)oi->ip_blkno);
 
-	ocfs2_extent_map_drop(inode, 0);
-	ocfs2_extent_map_init(inode);
+	ocfs2_extent_map_trunc(inode, 0);
 
 	status = ocfs2_drop_inode_locks(inode);
 	if (status < 0)
@@ -1030,6 +1017,7 @@ void ocfs2_clear_inode(struct inode *inode)
 	ocfs2_lock_res_free(&oi->ip_rw_lockres);
 	ocfs2_lock_res_free(&oi->ip_meta_lockres);
 	ocfs2_lock_res_free(&oi->ip_data_lockres);
+	ocfs2_lock_res_free(&oi->ip_open_lockres);
 
 	ocfs2_metadata_cache_purge(inode);
 
@@ -1086,9 +1074,6 @@ void ocfs2_drop_inode(struct inode *inode)
 	mlog(0, "Drop inode %llu, nlink = %u, ip_flags = 0x%x\n",
 	     (unsigned long long)oi->ip_blkno, inode->i_nlink, oi->ip_flags);
 
-	/* Testing ip_orphaned_slot here wouldn't work because we may
-	 * not have gotten a delete_inode vote from any other nodes
-	 * yet. */
 	if (oi->ip_flags & OCFS2_INODE_MAYBE_ORPHANED)
 		generic_delete_inode(inode);
 	else
@@ -1121,8 +1106,8 @@ struct buffer_head *ocfs2_bread(struct inode *inode,
 		return NULL;
 	}
 
-	tmperr = ocfs2_extent_map_get_blocks(inode, block, 1,
-					     &p_blkno, NULL);
+	tmperr = ocfs2_extent_map_get_blocks(inode, block, &p_blkno, NULL,
+					     NULL);
 	if (tmperr < 0) {
 		mlog_errno(tmperr);
 		goto fail;
@@ -1259,7 +1244,7 @@ void ocfs2_refresh_inode(struct inode *inode,
 	if (S_ISLNK(inode->i_mode) && le32_to_cpu(fe->i_clusters) == 0)
 		inode->i_blocks = 0;
 	else
-		inode->i_blocks = ocfs2_align_bytes_to_sectors(i_size_read(inode));
+		inode->i_blocks = ocfs2_inode_sector_count(inode);
 	inode->i_atime.tv_sec = le64_to_cpu(fe->i_atime);
 	inode->i_atime.tv_nsec = le32_to_cpu(fe->i_atime_nsec);
 	inode->i_mtime.tv_sec = le64_to_cpu(fe->i_mtime);
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 1a7dd2945b34..03ae075869ee 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -26,6 +26,8 @@
 #ifndef OCFS2_INODE_H
 #define OCFS2_INODE_H
 
+#include "extent_map.h"
+
 /* OCFS2 Inode Private Data */
 struct ocfs2_inode_info
 {
@@ -34,6 +36,7 @@ struct ocfs2_inode_info
 	struct ocfs2_lock_res		ip_rw_lockres;
 	struct ocfs2_lock_res		ip_meta_lockres;
 	struct ocfs2_lock_res		ip_data_lockres;
+	struct ocfs2_lock_res		ip_open_lockres;
 
 	/* protects allocation changes on this inode. */
 	struct rw_semaphore		ip_alloc_sem;
@@ -42,9 +45,7 @@ struct ocfs2_inode_info
 	spinlock_t			ip_lock;
 	u32				ip_open_count;
 	u32				ip_clusters;
-	struct ocfs2_extent_map		ip_map;
 	struct list_head		ip_io_markers;
-	int				ip_orphaned_slot;
 
 	struct mutex			ip_io_mutex;
 
@@ -64,6 +65,8 @@ struct ocfs2_inode_info
 
 	struct ocfs2_caching_info	ip_metadata_cache;
 
+	struct ocfs2_extent_map		ip_extent_map;
+
 	struct inode			vfs_inode;
 };
 
@@ -117,14 +120,9 @@ void ocfs2_delete_inode(struct inode *inode);
 void ocfs2_drop_inode(struct inode *inode);
 
 /* Flags for ocfs2_iget() */
-#define OCFS2_FI_FLAG_NOWAIT	0x1
-#define OCFS2_FI_FLAG_DELETE	0x2
-#define OCFS2_FI_FLAG_SYSFILE	0x4
-#define OCFS2_FI_FLAG_NOLOCK	0x8
+#define OCFS2_FI_FLAG_SYSFILE		0x4
+#define OCFS2_FI_FLAG_ORPHAN_RECOVERY	0x8
 struct inode *ocfs2_iget(struct ocfs2_super *osb, u64 feoff, int flags);
-struct inode *ocfs2_ilookup_for_vote(struct ocfs2_super *osb,
-				     u64 blkno,
-				     int delete_vote);
 int ocfs2_inode_init_private(struct inode *inode);
 int ocfs2_inode_revalidate(struct dentry *dentry);
 int ocfs2_populate_inode(struct inode *inode, struct ocfs2_dinode *fe,
@@ -144,4 +142,11 @@ int ocfs2_aio_write(struct file *file, struct kiocb *req, struct iocb *iocb);
 
 void ocfs2_set_inode_flags(struct inode *inode);
 
+static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode)
+{
+	int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9;
+
+	return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits);
+}
+
 #endif /* OCFS2_INODE_H */
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 825cb0ae1b4c..5a8a90d1c787 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -649,29 +649,20 @@ bail:
 static int ocfs2_force_read_journal(struct inode *inode)
 {
 	int status = 0;
-	int i, p_blocks;
-	u64 v_blkno, p_blkno;
-#define CONCURRENT_JOURNAL_FILL 32
+	int i;
+	u64 v_blkno, p_blkno, p_blocks, num_blocks;
+#define CONCURRENT_JOURNAL_FILL 32ULL
 	struct buffer_head *bhs[CONCURRENT_JOURNAL_FILL];
 
 	mlog_entry_void();
 
-	BUG_ON(inode->i_blocks !=
-		     ocfs2_align_bytes_to_sectors(i_size_read(inode)));
-
 	memset(bhs, 0, sizeof(struct buffer_head *) * CONCURRENT_JOURNAL_FILL);
 
-	mlog(0, "Force reading %llu blocks\n",
-		(unsigned long long)(inode->i_blocks >>
-			(inode->i_sb->s_blocksize_bits - 9)));
-
+	num_blocks = ocfs2_blocks_for_bytes(inode->i_sb, inode->i_size);
 	v_blkno = 0;
-	while (v_blkno <
-	       (inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9))) {
-
+	while (v_blkno < num_blocks) {
 		status = ocfs2_extent_map_get_blocks(inode, v_blkno,
-						     1, &p_blkno,
-						     &p_blocks);
+						     &p_blkno, &p_blocks, NULL);
 		if (status < 0) {
 			mlog_errno(status);
 			goto bail;
@@ -1306,7 +1297,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb,
 				continue;
 
 			iter = ocfs2_iget(osb, le64_to_cpu(de->inode),
-					  OCFS2_FI_FLAG_NOLOCK);
+					  OCFS2_FI_FLAG_ORPHAN_RECOVERY);
 			if (IS_ERR(iter))
 				continue;
 
@@ -1418,7 +1409,6 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb,
 		/* Set the proper information to get us going into
 		 * ocfs2_delete_inode. */
 		oi->ip_flags |= OCFS2_INODE_MAYBE_ORPHANED;
-		oi->ip_orphaned_slot = slot;
 		spin_unlock(&oi->ip_lock);
 
 		iput(inode);
diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h
index d026b4f27757..3db5de4506da 100644
--- a/fs/ocfs2/journal.h
+++ b/fs/ocfs2/journal.h
@@ -390,7 +390,7 @@ static inline int ocfs2_calc_tree_trunc_credits(struct super_block *sb,
 	/* We may be deleting metadata blocks, so metadata alloc dinode +
 	   one desc. block for each possible delete. */
 	if (tree_depth && next_free == 1 &&
-	    le32_to_cpu(last_el->l_recs[i].e_clusters) == clusters_to_del)
+	    ocfs2_rec_clusters(last_el, &last_el->l_recs[i]) == clusters_to_del)
 		credits += 1 + tree_depth;
 
 	/* update to the truncate log. */
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 51b020447683..af01158b39f5 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -85,8 +85,11 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
 	int ret = 0, lock_level = 0;
 	struct ocfs2_super *osb = OCFS2_SB(file->f_dentry->d_inode->i_sb);
 
-	/* We don't want to support shared writable mappings yet. */
-	if (!ocfs2_mount_local(osb) &&
+	/*
+	 * Only support shared writeable mmap for local mounts which
+	 * don't know about holes.
+	 */
+	if ((!ocfs2_mount_local(osb) || ocfs2_sparse_alloc(osb)) &&
 	    ((vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_MAYSHARE)) &&
 	    ((vma->vm_flags & VM_WRITE) || (vma->vm_flags & VM_MAYWRITE))) {
 		mlog(0, "disallow shared writable mmaps %lx\n", vma->vm_flags);
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 28dd757ff67d..2bcf353fd7c5 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -175,8 +175,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
 
 	inode = ocfs2_iget(OCFS2_SB(dir->i_sb), blkno, 0);
 	if (IS_ERR(inode)) {
-		mlog(ML_ERROR, "Unable to create inode %llu\n",
-		     (unsigned long long)blkno);
 		ret = ERR_PTR(-EACCES);
 		goto bail_unlock;
 	}
@@ -189,7 +187,6 @@ static struct dentry *ocfs2_lookup(struct inode *dir, struct dentry *dentry,
 	 * unlink. */
 	spin_lock(&oi->ip_lock);
 	oi->ip_flags &= ~OCFS2_INODE_MAYBE_ORPHANED;
-	oi->ip_orphaned_slot = OCFS2_INVALID_SLOT;
 	spin_unlock(&oi->ip_lock);
 
 bail_add:
@@ -288,7 +285,7 @@ static int ocfs2_fill_new_dir(struct ocfs2_super *osb,
 
 	i_size_write(inode, inode->i_sb->s_blocksize);
 	inode->i_nlink = 2;
-	inode->i_blocks = ocfs2_align_bytes_to_sectors(inode->i_sb->s_blocksize);
+	inode->i_blocks = ocfs2_inode_sector_count(inode);
 	status = ocfs2_mark_inode_dirty(handle, inode, fe_bh);
 	if (status < 0) {
 		mlog_errno(status);
@@ -1486,8 +1483,7 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
 	struct buffer_head **bhs = NULL;
 	const char *c;
 	struct super_block *sb = osb->sb;
-	u64 p_blkno;
-	int p_blocks;
+	u64 p_blkno, p_blocks;
 	int virtual, blocks, status, i, bytes_left;
 
 	bytes_left = i_size_read(inode) + 1;
@@ -1514,8 +1510,8 @@ static int ocfs2_create_symlink_data(struct ocfs2_super *osb,
 		goto bail;
 	}
 
-	status = ocfs2_extent_map_get_blocks(inode, 0, 1, &p_blkno,
-					     &p_blocks);
+	status = ocfs2_extent_map_get_blocks(inode, 0, &p_blkno, &p_blocks,
+					     NULL);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
@@ -1674,8 +1670,11 @@ static int ocfs2_symlink(struct inode *dir,
 	inode->i_rdev = 0;
 	newsize = l - 1;
 	if (l > ocfs2_fast_symlink_chars(sb)) {
+		u32 offset = 0;
+
 		inode->i_op = &ocfs2_symlink_inode_operations;
-		status = ocfs2_do_extend_allocation(osb, inode, 1, new_fe_bh,
+		status = ocfs2_do_extend_allocation(osb, inode, &offset, 1,
+						    new_fe_bh,
 						    handle, data_ac, NULL,
 						    NULL);
 		if (status < 0) {
@@ -1689,7 +1688,7 @@ static int ocfs2_symlink(struct inode *dir,
 			goto bail;
 		}
 		i_size_write(inode, newsize);
-		inode->i_blocks = ocfs2_align_bytes_to_sectors(newsize);
+		inode->i_blocks = ocfs2_inode_sector_count(inode);
 	} else {
 		inode->i_op = &ocfs2_fast_symlink_inode_operations;
 		memcpy((char *) fe->id2.i_symlink, symname, l);
@@ -2222,9 +2221,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb,
 	/* Record which orphan dir our inode now resides
 	 * in. delete_inode will use this to determine which orphan
 	 * dir to lock. */
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	OCFS2_I(inode)->ip_orphaned_slot = osb->slot_num;
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
+	fe->i_orphaned_slot = cpu_to_le16(osb->slot_num);
 
 	mlog(0, "Inode %llu orphaned in slot %d\n",
 	     (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num);
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index db8e77cd35d3..82cc92dcf8a6 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -46,11 +46,6 @@
 #include "endian.h"
 #include "ocfs2_lockid.h"
 
-struct ocfs2_extent_map {
-	u32		em_clusters;
-	struct rb_root	em_extents;
-};
-
 /* Most user visible OCFS2 inodes will have very few pieces of
  * metadata, but larger files (including bitmaps, etc) must be taken
  * into account when designing an access scheme. We allow a small
@@ -303,6 +298,13 @@ static inline int ocfs2_should_order_data(struct inode *inode)
 	return 1;
 }
 
+static inline int ocfs2_sparse_alloc(struct ocfs2_super *osb)
+{
+	if (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
+		return 1;
+	return 0;
+}
+
 /* set / clear functions because cluster events can make these happen
  * in parallel so we want the transitions to be atomic. this also
  * means that any future flags osb_flags must be protected by spinlock
@@ -461,6 +463,49 @@ static inline unsigned long ocfs2_align_bytes_to_sectors(u64 bytes)
 	return (unsigned long)((bytes + 511) >> 9);
 }
 
+static inline unsigned int ocfs2_page_index_to_clusters(struct super_block *sb,
+							unsigned long pg_index)
+{
+	u32 clusters = pg_index;
+	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+
+	if (unlikely(PAGE_CACHE_SHIFT > cbits))
+		clusters = pg_index << (PAGE_CACHE_SHIFT - cbits);
+	else if (PAGE_CACHE_SHIFT < cbits)
+		clusters = pg_index >> (cbits - PAGE_CACHE_SHIFT);
+
+	return clusters;
+}
+
+/*
+ * Find the 1st page index which covers the given clusters.
+ */
+static inline unsigned long ocfs2_align_clusters_to_page_index(struct super_block *sb,
+							u32 clusters)
+{
+	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+	unsigned long index = clusters;
+
+	if (PAGE_CACHE_SHIFT > cbits) {
+		index = clusters >> (PAGE_CACHE_SHIFT - cbits);
+	} else if (PAGE_CACHE_SHIFT < cbits) {
+		index = clusters << (cbits - PAGE_CACHE_SHIFT);
+	}
+
+	return index;
+}
+
+static inline unsigned int ocfs2_pages_per_cluster(struct super_block *sb)
+{
+	unsigned int cbits = OCFS2_SB(sb)->s_clustersize_bits;
+	unsigned int pages_per_cluster = 1;
+
+	if (PAGE_CACHE_SHIFT < cbits)
+		pages_per_cluster = 1 << (cbits - PAGE_CACHE_SHIFT);
+
+	return pages_per_cluster;
+}
+
 #define ocfs2_set_bit ext2_set_bit
 #define ocfs2_clear_bit ext2_clear_bit
 #define ocfs2_test_bit ext2_test_bit
diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h
index e61e218f5e0b..71306479c68f 100644
--- a/fs/ocfs2/ocfs2_fs.h
+++ b/fs/ocfs2/ocfs2_fs.h
@@ -86,7 +86,8 @@
 	OCFS2_SB(sb)->s_feature_incompat &= ~(mask)
 
 #define OCFS2_FEATURE_COMPAT_SUPP	OCFS2_FEATURE_COMPAT_BACKUP_SB
-#define OCFS2_FEATURE_INCOMPAT_SUPP	OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT
+#define OCFS2_FEATURE_INCOMPAT_SUPP	(OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT \
+					 | OCFS2_FEATURE_INCOMPAT_SPARSE_ALLOC)
 #define OCFS2_FEATURE_RO_COMPAT_SUPP	0
 
 /*
@@ -155,6 +156,12 @@
 #define OCFS2_FL_MODIFIABLE	(0x000100FF)	/* User modifiable flags */
 
 /*
+ * Extent record flags (e_node.leaf.flags)
+ */
+#define OCFS2_EXT_UNWRITTEN	(0x01)	/* Extent is allocated but
+					 * unwritten */
+
+/*
  * ioctl commands
  */
 #define OCFS2_IOC_GETFLAGS	_IOR('f', 1, long)
@@ -282,10 +289,21 @@ static unsigned char ocfs2_type_by_mode[S_IFMT >> S_SHIFT] = {
 /*
  * On disk extent record for OCFS2
  * It describes a range of clusters on disk.
+ *
+ * Length fields are divided into interior and leaf node versions.
+ * This leaves room for a flags field (OCFS2_EXT_*) in the leaf nodes.
  */
 struct ocfs2_extent_rec {
 /*00*/	__le32 e_cpos;		/* Offset into the file, in clusters */
-	__le32 e_clusters;	/* Clusters covered by this extent */
+	union {
+		__le32 e_int_clusters; /* Clusters covered by all children */
+		struct {
+			__le16 e_leaf_clusters; /* Clusters covered by this
+						   extent */
+			__u8 e_reserved1;
+			__u8 e_flags; /* Extent flags */
+		};
+	};
 	__le64 e_blkno;		/* Physical disk offset, in blocks */
 /*10*/
 };
@@ -311,7 +329,10 @@ struct ocfs2_extent_list {
 /*00*/	__le16 l_tree_depth;		/* Extent tree depth from this
 					   point.  0 means data extents
 					   hang directly off this
-					   header (a leaf) */
+					   header (a leaf)
+					   NOTE: The high 8 bits cannot be
+					   used - tree_depth is never that big.
+					*/
 	__le16 l_count;			/* Number of extent records */
 	__le16 l_next_free_rec;		/* Next unused extent slot */
 	__le16 l_reserved1;
@@ -446,7 +467,9 @@ struct ocfs2_dinode {
 	__le32 i_ctime_nsec;
 	__le32 i_mtime_nsec;
 	__le32 i_attr;
-	__le32 i_reserved1;
+	__le16 i_orphaned_slot;		/* Only valid when OCFS2_ORPHANED_FL
+					   was set in i_flags */
+	__le16 i_reserved1;
 /*70*/	__le64 i_reserved2[8];
 /*B8*/	union {
 		__le64 i_pad1;		/* Generic way to refer to this
diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h
index 4d5d5655c185..4ca02b1c38ac 100644
--- a/fs/ocfs2/ocfs2_lockid.h
+++ b/fs/ocfs2/ocfs2_lockid.h
@@ -44,6 +44,7 @@ enum ocfs2_lock_type {
 	OCFS2_LOCK_TYPE_RENAME,
 	OCFS2_LOCK_TYPE_RW,
 	OCFS2_LOCK_TYPE_DENTRY,
+	OCFS2_LOCK_TYPE_OPEN,
 	OCFS2_NUM_LOCK_TYPES
 };
 
@@ -69,6 +70,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type)
 		case OCFS2_LOCK_TYPE_DENTRY:
 			c = 'N';
 			break;
+		case OCFS2_LOCK_TYPE_OPEN:
+			c = 'O';
+			break;
 		default:
 			c = '\0';
 	}
@@ -85,6 +89,7 @@ static char *ocfs2_lock_type_strings[] = {
 	 * important job it does, anyway. */
 	[OCFS2_LOCK_TYPE_RW] = "Write/Read",
 	[OCFS2_LOCK_TYPE_DENTRY] = "Dentry",
+	[OCFS2_LOCK_TYPE_OPEN] = "Open",
 };
 
 static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type)
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 2d3ac32cb74e..d921a28329dc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -197,7 +197,7 @@ int ocfs2_init_slot_info(struct ocfs2_super *osb)
 		goto bail;
 	}
 
-	status = ocfs2_extent_map_get_blocks(inode, 0ULL, 1, &blkno, NULL);
+	status = ocfs2_extent_map_get_blocks(inode, 0ULL, &blkno, NULL, NULL);
 	if (status < 0) {
 		mlog_errno(status);
 		goto bail;
diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c
index 6dbb11762759..0da655ae5d6f 100644
--- a/fs/ocfs2/suballoc.c
+++ b/fs/ocfs2/suballoc.c
@@ -381,8 +381,7 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb,
 					     le32_to_cpu(fe->i_clusters)));
 	spin_unlock(&OCFS2_I(alloc_inode)->ip_lock);
 	i_size_write(alloc_inode, le64_to_cpu(fe->i_size));
-	alloc_inode->i_blocks =
-		ocfs2_align_bytes_to_sectors(i_size_read(alloc_inode));
+	alloc_inode->i_blocks = ocfs2_inode_sector_count(alloc_inode);
 
 	status = 0;
 bail:
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 6534f92424dd..5c9e8243691f 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -806,9 +806,6 @@ static int __init ocfs2_init(void)
 
 	ocfs2_print_version();
 
-	if (init_ocfs2_extent_maps())
-		return -ENOMEM;
-
 	status = init_ocfs2_uptodate_cache();
 	if (status < 0) {
 		mlog_errno(status);
@@ -837,7 +834,6 @@ leave:
 	if (status < 0) {
 		ocfs2_free_mem_caches();
 		exit_ocfs2_uptodate_cache();
-		exit_ocfs2_extent_maps();
 	}
 
 	mlog_exit(status);
@@ -863,8 +859,6 @@ static void __exit ocfs2_exit(void)
 
 	unregister_filesystem(&ocfs2_fs_type);
 
-	exit_ocfs2_extent_maps();
-
 	exit_ocfs2_uptodate_cache();
 
 	mlog_exit_void();
@@ -963,6 +957,7 @@ static void ocfs2_inode_init_once(void *data,
 		ocfs2_lock_res_init_once(&oi->ip_rw_lockres);
 		ocfs2_lock_res_init_once(&oi->ip_meta_lockres);
 		ocfs2_lock_res_init_once(&oi->ip_data_lockres);
+		ocfs2_lock_res_init_once(&oi->ip_open_lockres);
 
 		ocfs2_metadata_cache_init(&oi->vfs_inode);
 
diff --git a/fs/ocfs2/vote.c b/fs/ocfs2/vote.c
index f30e63b9910c..4f82a2f0efef 100644
--- a/fs/ocfs2/vote.c
+++ b/fs/ocfs2/vote.c
@@ -63,17 +63,10 @@ struct ocfs2_msg_hdr
 	__be32 h_node_num;    /* node sending this particular message. */
 };
 
-/* OCFS2_MAX_FILENAME_LEN is 255 characters, but we want to align this
- * for the network. */
-#define OCFS2_VOTE_FILENAME_LEN 256
 struct ocfs2_vote_msg
 {
 	struct ocfs2_msg_hdr v_hdr;
-	union {
-		__be32 v_generic1;
-		__be32 v_orphaned_slot;	/* Used during delete votes */
-		__be32 v_nlink;		/* Used during unlink votes */
-	} md1;				/* Message type dependant 1 */
+	__be32 v_reserved1;
 };
 
 /* Responses are given these values to maintain backwards
@@ -86,7 +79,6 @@ struct ocfs2_response_msg
 {
 	struct ocfs2_msg_hdr r_hdr;
 	__be32 r_response;
-	__be32 r_orphaned_slot;
 };
 
 struct ocfs2_vote_work {
@@ -96,7 +88,6 @@ struct ocfs2_vote_work {
 
 enum ocfs2_vote_request {
 	OCFS2_VOTE_REQ_INVALID = 0,
-	OCFS2_VOTE_REQ_DELETE,
 	OCFS2_VOTE_REQ_MOUNT,
 	OCFS2_VOTE_REQ_UMOUNT,
 	OCFS2_VOTE_REQ_LAST
@@ -151,135 +142,23 @@ static void ocfs2_process_umount_request(struct ocfs2_super *osb,
 	ocfs2_node_map_set_bit(osb, &osb->umount_map, node_num);
 }
 
-void ocfs2_mark_inode_remotely_deleted(struct inode *inode)
-{
-	struct ocfs2_inode_info *oi = OCFS2_I(inode);
-
-	assert_spin_locked(&oi->ip_lock);
-	/* We set the SKIP_DELETE flag on the inode so we don't try to
-	 * delete it in delete_inode ourselves, thus avoiding
-	 * unecessary lock pinging. If the other node failed to wipe
-	 * the inode as a result of a crash, then recovery will pick
-	 * up the slack. */
-	oi->ip_flags |= OCFS2_INODE_DELETED|OCFS2_INODE_SKIP_DELETE;
-}
-
-static int ocfs2_process_delete_request(struct inode *inode,
-					int *orphaned_slot)
-{
-	int response = OCFS2_RESPONSE_BUSY;
-
-	mlog(0, "DELETE vote on inode %lu, read lnk_cnt = %u, slot = %d\n",
-	     inode->i_ino, inode->i_nlink, *orphaned_slot);
-
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-
-	/* Whatever our vote response is, we want to make sure that
-	 * the orphaned slot is recorded properly on this node *and*
-	 * on the requesting node. Technically, if the requesting node
-	 * did not know which slot the inode is orphaned in but we
-	 * respond with BUSY he doesn't actually need the orphaned
-	 * slot, but it doesn't hurt to do it here anyway. */
-	if ((*orphaned_slot) != OCFS2_INVALID_SLOT) {
-		mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot !=
-				OCFS2_INVALID_SLOT &&
-				OCFS2_I(inode)->ip_orphaned_slot !=
-				(*orphaned_slot),
-				"Inode %llu: This node thinks it's "
-				"orphaned in slot %d, messaged it's in %d\n",
-				(unsigned long long)OCFS2_I(inode)->ip_blkno,
-				OCFS2_I(inode)->ip_orphaned_slot,
-				*orphaned_slot);
-
-		mlog(0, "Setting orphaned slot for inode %llu to %d\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-		     *orphaned_slot);
-
-		OCFS2_I(inode)->ip_orphaned_slot = *orphaned_slot;
-	} else {
-		mlog(0, "Sending back orphaned slot %d for inode %llu\n",
-		     OCFS2_I(inode)->ip_orphaned_slot,
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
-
-		*orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
-	}
-
-	/* vote no if the file is still open. */
-	if (OCFS2_I(inode)->ip_open_count) {
-		mlog(0, "open count = %u\n",
-		     OCFS2_I(inode)->ip_open_count);
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-		goto done;
-	}
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-	/* directories are a bit ugly... What if someone is sitting in
-	 * it? We want to make sure the inode is removed completely as
-	 * a result of the iput in process_vote. */
-	if (S_ISDIR(inode->i_mode) && (atomic_read(&inode->i_count) != 1)) {
-		mlog(0, "i_count = %u\n", atomic_read(&inode->i_count));
-		goto done;
-	}
-
-	if (filemap_fdatawrite(inode->i_mapping)) {
-		mlog(ML_ERROR, "Could not sync inode %llu for delete!\n",
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno);
-		goto done;
-	}
-	sync_mapping_buffers(inode->i_mapping);
-	truncate_inode_pages(inode->i_mapping, 0);
-	ocfs2_extent_map_trunc(inode, 0);
-
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	/* double check open count - someone might have raced this
-	 * thread into ocfs2_file_open while we were writing out
-	 * data. If we're to allow a wipe of this inode now, we *must*
-	 * hold the spinlock until we've marked it. */
-	if (OCFS2_I(inode)->ip_open_count) {
-		mlog(0, "Raced to wipe! open count = %u\n",
-		     OCFS2_I(inode)->ip_open_count);
-		spin_unlock(&OCFS2_I(inode)->ip_lock);
-		goto done;
-	}
-
-	/* Mark the inode as being wiped from disk. */
-	ocfs2_mark_inode_remotely_deleted(inode);
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-	/* Not sure this is necessary anymore. */
-	d_prune_aliases(inode);
-
-	/* If we get here, then we're voting 'yes', so commit the
-	 * delete on our side. */
-	response = OCFS2_RESPONSE_OK;
-done:
-	return response;
-}
-
 static void ocfs2_process_vote(struct ocfs2_super *osb,
 			       struct ocfs2_vote_msg *msg)
 {
 	int net_status, vote_response;
-	int orphaned_slot = 0;
-	unsigned int node_num, generation;
+	unsigned int node_num;
 	u64 blkno;
 	enum ocfs2_vote_request request;
-	struct inode *inode = NULL;
 	struct ocfs2_msg_hdr *hdr = &msg->v_hdr;
 	struct ocfs2_response_msg response;
 
 	/* decode the network mumbo jumbo into local variables. */
 	request = be32_to_cpu(hdr->h_request);
 	blkno = be64_to_cpu(hdr->h_blkno);
-	generation = be32_to_cpu(hdr->h_generation);
 	node_num = be32_to_cpu(hdr->h_node_num);
-	if (request == OCFS2_VOTE_REQ_DELETE)
-		orphaned_slot = be32_to_cpu(msg->md1.v_orphaned_slot);
 
-	mlog(0, "processing vote: request = %u, blkno = %llu, "
-	     "generation = %u, node_num = %u, priv1 = %u\n", request,
-	     (unsigned long long)blkno, generation, node_num,
-	     be32_to_cpu(msg->md1.v_generic1));
+	mlog(0, "processing vote: request = %u, blkno = %llu, node_num = %u\n",
+	     request, (unsigned long long)blkno, node_num);
 
 	if (!ocfs2_is_valid_vote_request(request)) {
 		mlog(ML_ERROR, "Invalid vote request %d from node %u\n",
@@ -302,52 +181,6 @@ static void ocfs2_process_vote(struct ocfs2_super *osb,
 		break;
 	}
 
-	/* We cannot process the remaining message types before we're
-	 * fully mounted. It's perfectly safe however to send a 'yes'
-	 * response as we can't possibly have any of the state they're
-	 * asking us to modify yet. */
-	if (atomic_read(&osb->vol_state) == VOLUME_INIT)
-		goto respond;
-
-	/* If we get here, then the request is against an inode. */
-	inode = ocfs2_ilookup_for_vote(osb, blkno,
-				       request == OCFS2_VOTE_REQ_DELETE);
-
-	/* Not finding the inode is perfectly valid - it means we're
-	 * not interested in what the other node is about to do to it
-	 * so in those cases we automatically respond with an
-	 * affirmative. Cluster locking ensures that we won't race
-	 * interest in the inode with this vote request. */
-	if (!inode)
-		goto respond;
-
-	/* Check generation values. It's possible for us to get a
-	 * request against a stale inode. If so then we proceed as if
-	 * we had not found an inode in the first place. */
-	if (inode->i_generation != generation) {
-		mlog(0, "generation passed %u != inode generation = %u, "
-		     "ip_flags = %x, ip_blkno = %llu, msg %llu, i_count = %u, "
-		     "message type = %u\n", generation, inode->i_generation,
-		     OCFS2_I(inode)->ip_flags,
-		     (unsigned long long)OCFS2_I(inode)->ip_blkno,
-		     (unsigned long long)blkno, atomic_read(&inode->i_count),
-		     request);
-		iput(inode);
-		inode = NULL;
-		goto respond;
-	}
-
-	switch (request) {
-	case OCFS2_VOTE_REQ_DELETE:
-		vote_response = ocfs2_process_delete_request(inode,
-							     &orphaned_slot);
-		break;
-	default:
-		mlog(ML_ERROR, "node %u, invalid request: %u\n",
-		     node_num, request);
-		vote_response = OCFS2_RESPONSE_BAD_MSG;
-	}
-
 respond:
 	/* Response struture is small so we just put it on the stack
 	 * and stuff it inline. */
@@ -357,7 +190,6 @@ respond:
 	response.r_hdr.h_generation = hdr->h_generation;
 	response.r_hdr.h_node_num = cpu_to_be32(osb->node_num);
 	response.r_response = cpu_to_be32(vote_response);
-	response.r_orphaned_slot = cpu_to_be32(orphaned_slot);
 
 	net_status = o2net_send_message(OCFS2_MESSAGE_TYPE_RESPONSE,
 					osb->net_key,
@@ -373,9 +205,6 @@ respond:
 	    && net_status != -ENOTCONN)
 		mlog(ML_ERROR, "message to node %u fails with error %d!\n",
 		     node_num, net_status);
-
-	if (inode)
-		iput(inode);
 }
 
 static void ocfs2_vote_thread_do_work(struct ocfs2_super *osb)
@@ -634,8 +463,7 @@ bail:
 static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
 						      u64 blkno,
 						      unsigned int generation,
-						      enum ocfs2_vote_request type,
-						      u32 priv)
+						      enum ocfs2_vote_request type)
 {
 	struct ocfs2_vote_msg *request;
 	struct ocfs2_msg_hdr *hdr;
@@ -651,8 +479,6 @@ static struct ocfs2_vote_msg * ocfs2_new_vote_request(struct ocfs2_super *osb,
 		hdr->h_request = cpu_to_be32(type);
 		hdr->h_blkno = cpu_to_be64(blkno);
 		hdr->h_generation = cpu_to_be32(generation);
-
-		request->md1.v_generic1 = cpu_to_be32(priv);
 	}
 
 	return request;
@@ -664,7 +490,7 @@ static int ocfs2_do_request_vote(struct ocfs2_super *osb,
 				 struct ocfs2_vote_msg *request,
 				 struct ocfs2_net_response_cb *callback)
 {
-	int status, response;
+	int status, response = -EBUSY;
 	unsigned int response_id;
 	struct ocfs2_msg_hdr *hdr;
 
@@ -686,109 +512,12 @@ bail:
 	return status;
 }
 
-static int ocfs2_request_vote(struct inode *inode,
-			      struct ocfs2_vote_msg *request,
-			      struct ocfs2_net_response_cb *callback)
-{
-	int status;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-
-	if (ocfs2_inode_is_new(inode))
-		return 0;
-
-	status = -EAGAIN;
-	while (status == -EAGAIN) {
-		if (!(osb->s_mount_opt & OCFS2_MOUNT_NOINTR) &&
-		    signal_pending(current))
-			return -ERESTARTSYS;
-
-		status = ocfs2_super_lock(osb, 0);
-		if (status < 0) {
-			mlog_errno(status);
-			break;
-		}
-
-		status = 0;
-		if (!ocfs2_node_map_is_only(osb, &osb->mounted_map,
-					   osb->node_num))
-			status = ocfs2_do_request_vote(osb, request, callback);
-
-		ocfs2_super_unlock(osb, 0);
-	}
-	return status;
-}
-
-static void ocfs2_delete_response_cb(void *priv,
-				     struct ocfs2_response_msg *resp)
-{
-	int orphaned_slot, node;
-	struct inode *inode = priv;
-
-	orphaned_slot = be32_to_cpu(resp->r_orphaned_slot);
-	node = be32_to_cpu(resp->r_hdr.h_node_num);
-	mlog(0, "node %d tells us that inode %llu is orphaned in slot %d\n",
-	     node, (unsigned long long)OCFS2_I(inode)->ip_blkno,
-	     orphaned_slot);
-
-	/* The other node may not actually know which slot the inode
-	 * is orphaned in. */
-	if (orphaned_slot == OCFS2_INVALID_SLOT)
-		return;
-
-	/* Ok, the responding node knows which slot this inode is
-	 * orphaned in. We verify that the information is correct and
-	 * then record this in the inode. ocfs2_delete_inode will use
-	 * this information to determine which lock to take. */
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	mlog_bug_on_msg(OCFS2_I(inode)->ip_orphaned_slot != orphaned_slot &&
-			OCFS2_I(inode)->ip_orphaned_slot
-			!= OCFS2_INVALID_SLOT, "Inode %llu: Node %d says it's "
-			"orphaned in slot %d, we think it's in %d\n",
-			(unsigned long long)OCFS2_I(inode)->ip_blkno,
-			be32_to_cpu(resp->r_hdr.h_node_num),
-			orphaned_slot, OCFS2_I(inode)->ip_orphaned_slot);
-
-	OCFS2_I(inode)->ip_orphaned_slot = orphaned_slot;
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-}
-
-int ocfs2_request_delete_vote(struct inode *inode)
-{
-	int orphaned_slot, status;
-	struct ocfs2_net_response_cb delete_cb;
-	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
-	struct ocfs2_vote_msg *request;
-
-	spin_lock(&OCFS2_I(inode)->ip_lock);
-	orphaned_slot = OCFS2_I(inode)->ip_orphaned_slot;
-	spin_unlock(&OCFS2_I(inode)->ip_lock);
-
-	delete_cb.rc_cb = ocfs2_delete_response_cb;
-	delete_cb.rc_priv = inode;
-
-	mlog(0, "Inode %llu, we start thinking orphaned slot is %d\n",
-	     (unsigned long long)OCFS2_I(inode)->ip_blkno, orphaned_slot);
-
-	status = -ENOMEM;
-	request = ocfs2_new_vote_request(osb, OCFS2_I(inode)->ip_blkno,
-					 inode->i_generation,
-					 OCFS2_VOTE_REQ_DELETE, orphaned_slot);
-	if (request) {
-		status = ocfs2_request_vote(inode, request, &delete_cb);
-
-		kfree(request);
-	}
-
-	return status;
-}
-
 int ocfs2_request_mount_vote(struct ocfs2_super *osb)
 {
 	int status;
 	struct ocfs2_vote_msg *request = NULL;
 
-	request = ocfs2_new_vote_request(osb, 0ULL, 0,
-					 OCFS2_VOTE_REQ_MOUNT, 0);
+	request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_MOUNT);
 	if (!request) {
 		status = -ENOMEM;
 		goto bail;
@@ -821,8 +550,7 @@ int ocfs2_request_umount_vote(struct ocfs2_super *osb)
 	int status;
 	struct ocfs2_vote_msg *request = NULL;
 
-	request = ocfs2_new_vote_request(osb, 0ULL, 0,
-					 OCFS2_VOTE_REQ_UMOUNT, 0);
+	request = ocfs2_new_vote_request(osb, 0ULL, 0, OCFS2_VOTE_REQ_UMOUNT);
 	if (!request) {
 		status = -ENOMEM;
 		goto bail;
@@ -969,7 +697,6 @@ static int ocfs2_handle_vote_message(struct o2net_msg *msg,
 	     be32_to_cpu(work->w_msg.v_hdr.h_generation));
 	mlog(0, "h_node_num = %u\n",
 	     be32_to_cpu(work->w_msg.v_hdr.h_node_num));
-	mlog(0, "v_generic1 = %u\n", be32_to_cpu(work->w_msg.md1.v_generic1));
 
 	spin_lock(&osb->vote_task_lock);
 	list_add_tail(&work->w_list, &osb->vote_list);
diff --git a/fs/ocfs2/vote.h b/fs/ocfs2/vote.h
index 53ebc1c69e56..9ea46f62de31 100644
--- a/fs/ocfs2/vote.h
+++ b/fs/ocfs2/vote.h
@@ -38,14 +38,11 @@ static inline void ocfs2_kick_vote_thread(struct ocfs2_super *osb)
 	wake_up(&osb->vote_event);
 }
 
-int ocfs2_request_delete_vote(struct inode *inode);
 int ocfs2_request_mount_vote(struct ocfs2_super *osb);
 int ocfs2_request_umount_vote(struct ocfs2_super *osb);
 int ocfs2_register_net_handlers(struct ocfs2_super *osb);
 void ocfs2_unregister_net_handlers(struct ocfs2_super *osb);
 
-void ocfs2_mark_inode_remotely_deleted(struct inode *inode);
-
 void ocfs2_remove_node_from_vote_queues(struct ocfs2_super *osb,
 					int node_num);
 #endif
diff --git a/fs/sync.c b/fs/sync.c
index d0feff61e6aa..5cb9e7e43383 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -239,13 +239,11 @@ out:
 /*
  * `endbyte' is inclusive
  */
-int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
-			unsigned int flags)
+int do_sync_mapping_range(struct address_space *mapping, loff_t offset,
+			  loff_t endbyte, unsigned int flags)
 {
 	int ret;
-	struct address_space *mapping;
 
-	mapping = file->f_mapping;
 	if (!mapping) {
 		ret = -EINVAL;
 		goto out;
@@ -275,4 +273,4 @@ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte,
 out:
 	return ret;
 }
-EXPORT_SYMBOL_GPL(do_sync_file_range);
+EXPORT_SYMBOL_GPL(do_sync_mapping_range);