| author | Steven Whitehouse <swhiteho@redhat.com> | 2006-04-21 18:52:36 +0200 |
|---|---|---|
| committer | Steven Whitehouse <swhiteho@redhat.com> | 2006-04-21 18:52:36 +0200 |
| commit | a748422ee45725e04e1d3792fa19dfa90ddfd116 (patch) | |
| tree | 978e12895468baaa9f7ab2747b9f7d50beaf1717 /fs | |
| parent | [GFS2] journal recovery patch (diff) | |
| parent | Merge branch 'upstream-linus' of master.kernel.org:/pub/scm/linux/kernel/git/... (diff) | |
| download | linux-a748422ee45725e04e1d3792fa19dfa90ddfd116.tar.xz linux-a748422ee45725e04e1d3792fa19dfa90ddfd116.zip | |
Merge branch 'master'
Diffstat (limited to 'fs')
58 files changed, 1758 insertions, 1106 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index b0a0ae509c00..61c599b4a1e3 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -127,12 +127,13 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 	if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) {
 		dprintk(DEBUG_ERROR, "problem initiating session\n");
-		kfree(v9ses);
-		return ERR_PTR(newfid);
+		sb = ERR_PTR(newfid);
+		goto out_free_session;
 	}
 
 	sb = sget(fs_type, NULL, v9fs_set_super, v9ses);
-
+	if (IS_ERR(sb))
+		goto out_close_session;
 	v9fs_fill_super(sb, v9ses, flags);
 
 	inode = v9fs_get_inode(sb, S_IFDIR | mode);
@@ -185,6 +186,12 @@ static struct super_block *v9fs_get_sb(struct file_system_type
 
 	return sb;
 
+out_close_session:
+	v9fs_session_close(v9ses);
+out_free_session:
+	kfree(v9ses);
+	return sb;
+
 put_back_sb:
 	/* deactivate_super calls v9fs_kill_super which will frees the rest */
 	up_write(&sb->s_umount);
diff --git a/fs/Kconfig b/fs/Kconfig
index 62ee097776f0..563a59e5e694 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -800,6 +800,7 @@ config PROC_KCORE
 config PROC_VMCORE
 	bool "/proc/vmcore support (EXPERIMENTAL)"
 	depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP
+	default y
 	help
 	Exports the dump image of crashed kernel in ELF format.
@@ -842,6 +843,12 @@ config TMPFS
 config HUGETLBFS
 	bool "HugeTLB file system support"
 	depends X86 || IA64 || PPC64 || SPARC64 || SUPERH || BROKEN
+	help
+	  hugetlbfs is a filesystem backing for HugeTLB pages, based on
+	  ramfs. For architectures that support it, say Y here and read
+	  <file:Documentation/vm/hugetlbpage.txt> for details.
+
+	  If unsure, say N.
 
 config HUGETLB_PAGE
 	def_bool HUGETLBFS
@@ -862,7 +869,7 @@ config RAMFS
 config CONFIGFS_FS
 	tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on SYSFS && EXPERIMENTAL
 	help
 	  configfs is a ram-based filesystem that provides the converse
 	  of sysfs's functionality. Where sysfs is a filesystem-based
diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c
index 8ed9b06a9828..5638c8f9362f 100644
--- a/fs/configfs/dir.c
+++ b/fs/configfs/dir.c
@@ -504,7 +504,7 @@ static int populate_groups(struct config_group *group)
 	int ret = 0;
 	int i;
 
-	if (group && group->default_groups) {
+	if (group->default_groups) {
 		/* FYI, we're faking mkdir here
 		 * I'm not sure we need this semaphore, as we're called
 		 * from our parent's mkdir.  That holds our parent's
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 242fe1a66ce5..1b4491cdd115 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -599,7 +599,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
 	switch (op) {
 	case EPOLL_CTL_ADD:
 		if (!epi) {
-			epds.events |= POLLERR | POLLHUP | POLLRDHUP;
+			epds.events |= POLLERR | POLLHUP;
 			error = ep_insert(ep, &epds, tfile, fd);
 		} else
@@ -613,7 +613,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event)
 		break;
 	case EPOLL_CTL_MOD:
 		if (epi) {
-			epds.events |= POLLERR | POLLHUP | POLLRDHUP;
+			epds.events |= POLLERR | POLLHUP;
 			error = ep_modify(ep, epi, &epds);
 		} else
 			error = -ENOENT;
diff --git a/fs/exec.c b/fs/exec.c
index 0291a68a3626..3a79d97ac234 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -665,9 +665,7 @@ static int de_thread(struct task_struct *tsk)
 	 * and to assume its PID:
 	 */
 	if (!thread_group_leader(current)) {
-		struct task_struct *parent;
 		struct dentry *proc_dentry1, *proc_dentry2;
-		unsigned long ptrace;
 
 		/*
 		 * Wait for the thread group leader to be a zombie.
@@ -678,6 +676,18 @@ static int de_thread(struct task_struct *tsk)
 		while (leader->exit_state != EXIT_ZOMBIE)
 			yield();
 
+		/*
+		 * The only record we have of the real-time age of a
+		 * process, regardless of execs it's done, is start_time.
+		 * All the past CPU time is accumulated in signal_struct
+		 * from sister threads now dead.  But in this non-leader
+		 * exec, nothing survives from the original leader thread,
+		 * whose birth marks the true age of this process now.
+		 * When we take on its identity by switching to its PID, we
+		 * also take its birthdate (always earlier than our own).
+		 */
+		current->start_time = leader->start_time;
+
 		spin_lock(&leader->proc_lock);
 		spin_lock(&current->proc_lock);
 		proc_dentry1 = proc_pid_unhash(current);
@@ -692,22 +702,6 @@ static int de_thread(struct task_struct *tsk)
 		 * two threads with a switched PID, and release
 		 * the former thread group leader:
 		 */
-		ptrace = leader->ptrace;
-		parent = leader->parent;
-		if (unlikely(ptrace) && unlikely(parent == current)) {
-			/*
-			 * Joker was ptracing his own group leader,
-			 * and now he wants to be his own parent!
-			 * We can't have that.
-			 */
-			ptrace = 0;
-		}
-
-		ptrace_unlink(current);
-		ptrace_unlink(leader);
-		remove_parent(current);
-		remove_parent(leader);
-
 		/* Become a process group leader with the old leader's pid.
 		 * Note: The old leader also uses this pid until release_task
@@ -718,19 +712,15 @@ static int de_thread(struct task_struct *tsk)
 		attach_pid(current, PIDTYPE_PID,  current->pid);
 		attach_pid(current, PIDTYPE_PGID, current->signal->pgrp);
 		attach_pid(current, PIDTYPE_SID,  current->signal->session);
-		list_add_tail(&current->tasks, &init_task.tasks);
+		list_add_tail_rcu(&current->tasks, &init_task.tasks);
 
-		current->parent = current->real_parent = leader->real_parent;
-		leader->parent = leader->real_parent = child_reaper;
 		current->group_leader = current;
-		leader->group_leader = leader;
+		leader->group_leader = current;
 
-		add_parent(current);
-		add_parent(leader);
-		if (ptrace) {
-			current->ptrace = ptrace;
-			__ptrace_link(current, parent);
-		}
+		/* Reduce leader to a thread */
+		detach_pid(leader, PIDTYPE_PGID);
+		detach_pid(leader, PIDTYPE_SID);
+		list_del_init(&leader->tasks);
 
 		current->exit_signal = SIGCHLD;
diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c
index 1041dab6de2f..c5ffa8523968 100644
--- a/fs/ext3/resize.c
+++ b/fs/ext3/resize.c
@@ -767,6 +767,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input)
 	if (input->group != sbi->s_groups_count) {
 		ext3_warning(sb, __FUNCTION__,
 			     "multiple resizers run on filesystem!");
+		unlock_super(sb);
 		err = -EBUSY;
 		goto exit_journal;
 	}
@@ -974,6 +975,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es,
 	if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) {
 		ext3_warning(sb, __FUNCTION__,
 			     "multiple resizers run on filesystem!");
+		unlock_super(sb);
 		err = -EBUSY;
 		goto exit_put;
 	}
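The v9fs and ext3 hunks above share one theme: every exit path must release exactly what was taken so far (the v9fs session, the ext3 superblock lock). A minimal userspace sketch of that labelled-unwind style, with `malloc()` standing in for the hypothetical resources, not the kernel APIs themselves:

```c
#include <stdlib.h>

/* Take resources in order; on failure, unwind in reverse order
 * through labels, mirroring out_close_session/out_free_session
 * in v9fs_get_sb() above. */
static int example(void)
{
	void *session, *super;

	session = malloc(32);		/* like v9fs_session_init() */
	if (!session)
		return -1;

	super = malloc(64);		/* like sget() */
	if (!super)
		goto out_free_session;	/* undo only what was taken */

	/* ... normal operation ... */
	free(super);
	free(session);
	return 0;

out_free_session:
	free(session);
	return -1;
}

int main(void)
{
	return example() ? EXIT_FAILURE : EXIT_SUCCESS;
}
```

The ext3 fix is the same rule seen from the other side: the `-EBUSY` path returned while `sb` was still locked, so the error path now does the `unlock_super()` the success path already did.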
diff --git a/fs/fifo.c b/fs/fifo.c
index 889f722ee36d..49035b174b48 100644
--- a/fs/fifo.c
+++ b/fs/fifo.c
@@ -15,30 +15,35 @@
 #include <linux/fs.h>
 #include <linux/pipe_fs_i.h>
 
-static void wait_for_partner(struct inode* inode, unsigned int* cnt)
+static void wait_for_partner(struct inode* inode, unsigned int *cnt)
 {
 	int cur = *cnt;
-	while(cur == *cnt) {
-		pipe_wait(inode);
-		if(signal_pending(current))
+
+	while (cur == *cnt) {
+		pipe_wait(inode->i_pipe);
+		if (signal_pending(current))
 			break;
 	}
 }
 
 static void wake_up_partner(struct inode* inode)
 {
-	wake_up_interruptible(PIPE_WAIT(*inode));
+	wake_up_interruptible(&inode->i_pipe->wait);
 }
 
 static int fifo_open(struct inode *inode, struct file *filp)
 {
+	struct pipe_inode_info *pipe;
 	int ret;
 
-	mutex_lock(PIPE_MUTEX(*inode));
-	if (!inode->i_pipe) {
+	mutex_lock(&inode->i_mutex);
+	pipe = inode->i_pipe;
+	if (!pipe) {
 		ret = -ENOMEM;
-		if(!pipe_new(inode))
+		pipe = alloc_pipe_info(inode);
+		if (!pipe)
 			goto err_nocleanup;
+		inode->i_pipe = pipe;
 	}
 	filp->f_version = 0;
@@ -53,18 +58,18 @@ static int fifo_open(struct inode *inode, struct file *filp)
 	 * opened, even when there is no process writing the FIFO.
 	 */
 		filp->f_op = &read_fifo_fops;
-		PIPE_RCOUNTER(*inode)++;
-		if (PIPE_READERS(*inode)++ == 0)
+		pipe->r_counter++;
+		if (pipe->readers++ == 0)
 			wake_up_partner(inode);
 
-		if (!PIPE_WRITERS(*inode)) {
+		if (!pipe->writers) {
 			if ((filp->f_flags & O_NONBLOCK)) {
 				/* suppress POLLHUP until we have
 				 * seen a writer */
-				filp->f_version = PIPE_WCOUNTER(*inode);
+				filp->f_version = pipe->w_counter;
 			} else {
-				wait_for_partner(inode, &PIPE_WCOUNTER(*inode));
+				wait_for_partner(inode, &pipe->w_counter);
 				if(signal_pending(current))
 					goto err_rd;
 			}
@@ -78,16 +83,16 @@ static int fifo_open(struct inode *inode, struct file *filp)
 	 *  errno=ENXIO when there is no process reading the FIFO.
 	 */
 		ret = -ENXIO;
-		if ((filp->f_flags & O_NONBLOCK) && !PIPE_READERS(*inode))
+		if ((filp->f_flags & O_NONBLOCK) && !pipe->readers)
 			goto err;
 
 		filp->f_op = &write_fifo_fops;
-		PIPE_WCOUNTER(*inode)++;
-		if (!PIPE_WRITERS(*inode)++)
+		pipe->w_counter++;
+		if (!pipe->writers++)
 			wake_up_partner(inode);
 
-		if (!PIPE_READERS(*inode)) {
-			wait_for_partner(inode, &PIPE_RCOUNTER(*inode));
+		if (!pipe->readers) {
+			wait_for_partner(inode, &pipe->r_counter);
 			if (signal_pending(current))
 				goto err_wr;
 		}
@@ -102,11 +107,11 @@ static int fifo_open(struct inode *inode, struct file *filp)
 	 */
 		filp->f_op = &rdwr_fifo_fops;
 
-		PIPE_READERS(*inode)++;
-		PIPE_WRITERS(*inode)++;
-		PIPE_RCOUNTER(*inode)++;
-		PIPE_WCOUNTER(*inode)++;
-		if (PIPE_READERS(*inode) == 1 || PIPE_WRITERS(*inode) == 1)
+		pipe->readers++;
+		pipe->writers++;
+		pipe->r_counter++;
+		pipe->w_counter++;
+		if (pipe->readers == 1 || pipe->writers == 1)
 			wake_up_partner(inode);
 		break;
@@ -116,27 +121,27 @@ static int fifo_open(struct inode *inode, struct file *filp)
 	}
 
 	/* Ok! */
-	mutex_unlock(PIPE_MUTEX(*inode));
+	mutex_unlock(&inode->i_mutex);
 	return 0;
 
 err_rd:
-	if (!--PIPE_READERS(*inode))
-		wake_up_interruptible(PIPE_WAIT(*inode));
+	if (!--pipe->readers)
+		wake_up_interruptible(&pipe->wait);
 	ret = -ERESTARTSYS;
 	goto err;
 
 err_wr:
-	if (!--PIPE_WRITERS(*inode))
-		wake_up_interruptible(PIPE_WAIT(*inode));
+	if (!--pipe->writers)
+		wake_up_interruptible(&pipe->wait);
 	ret = -ERESTARTSYS;
 	goto err;
 
 err:
-	if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode))
+	if (!pipe->readers && !pipe->writers)
 		free_pipe_info(inode);
 
 err_nocleanup:
-	mutex_unlock(PIPE_MUTEX(*inode));
+	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
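`wait_for_partner()`/`wake_up_partner()` above are a counter-based rendezvous: the sleeper snapshots the partner's counter and sleeps only while it is unchanged, so a wakeup that fires before the sleeper blocks is never lost. A hedged pthread analogue of the same shape (all names here are invented, not from the kernel):

```c
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wait_q = PTHREAD_COND_INITIALIZER;
static unsigned int w_counter;		/* like pipe->w_counter */

static void *writer(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	w_counter++;				/* pipe->w_counter++ */
	pthread_cond_broadcast(&wait_q);	/* wake_up_partner() */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t t;
	unsigned int cur;

	pthread_mutex_lock(&lock);
	cur = w_counter;	/* snapshot, as wait_for_partner() does */
	pthread_mutex_unlock(&lock);

	pthread_create(&t, NULL, writer, NULL);

	/* Sleep only while the counter still equals the snapshot, so a
	 * wakeup that happens before we block cannot be missed. */
	pthread_mutex_lock(&lock);
	while (cur == w_counter)
		pthread_cond_wait(&wait_q, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	puts("partner arrived");
	return 0;
}
```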
diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
index 23d1f52eb1b8..cc750c68fe70 100644
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -23,13 +23,11 @@ static kmem_cache_t *fuse_req_cachep;
 
 static struct fuse_conn *fuse_get_conn(struct file *file)
 {
-	struct fuse_conn *fc;
-	spin_lock(&fuse_lock);
-	fc = file->private_data;
-	if (fc && !fc->connected)
-		fc = NULL;
-	spin_unlock(&fuse_lock);
-	return fc;
+	/*
+	 * Lockless access is OK, because file->private data is set
+	 * once during mount and is valid until the file is released.
+	 */
+	return file->private_data;
 }
 
 static void fuse_request_init(struct fuse_req *req)
@@ -74,10 +72,8 @@ static void restore_sigs(sigset_t *oldset)
 */
 void fuse_reset_request(struct fuse_req *req)
 {
-	int preallocated = req->preallocated;
 	BUG_ON(atomic_read(&req->count) != 1);
 	fuse_request_init(req);
-	req->preallocated = preallocated;
 }
 
 static void __fuse_get_request(struct fuse_req *req)
@@ -92,80 +88,54 @@ static void __fuse_put_request(struct fuse_req *req)
 	atomic_dec(&req->count);
 }
 
-static struct fuse_req *do_get_request(struct fuse_conn *fc)
+struct fuse_req *fuse_get_req(struct fuse_conn *fc)
 {
 	struct fuse_req *req;
-
-	spin_lock(&fuse_lock);
-	BUG_ON(list_empty(&fc->unused_list));
-	req = list_entry(fc->unused_list.next, struct fuse_req, list);
-	list_del_init(&req->list);
-	spin_unlock(&fuse_lock);
-	fuse_request_init(req);
-	req->preallocated = 1;
-	req->in.h.uid = current->fsuid;
-	req->in.h.gid = current->fsgid;
-	req->in.h.pid = current->pid;
-	return req;
-}
-
-/* This can return NULL, but only in case it's interrupted by a SIGKILL */
-struct fuse_req *fuse_get_request(struct fuse_conn *fc)
-{
-	int intr;
 	sigset_t oldset;
+	int intr;
+	int err;
 
 	atomic_inc(&fc->num_waiting);
 	block_sigs(&oldset);
-	intr = down_interruptible(&fc->outstanding_sem);
+	intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked);
 	restore_sigs(&oldset);
-	if (intr) {
-		atomic_dec(&fc->num_waiting);
-		return NULL;
-	}
-	return do_get_request(fc);
-}
+	err = -EINTR;
+	if (intr)
+		goto out;
 
-/* Must be called with fuse_lock held */
-static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
-{
-	if (req->preallocated) {
-		atomic_dec(&fc->num_waiting);
-		list_add(&req->list, &fc->unused_list);
-	} else
-		fuse_request_free(req);
+	req = fuse_request_alloc();
+	err = -ENOMEM;
+	if (!req)
+		goto out;
 
-	/* If we are in debt decrease that first */
-	if (fc->outstanding_debt)
-		fc->outstanding_debt--;
-	else
-		up(&fc->outstanding_sem);
+	req->in.h.uid = current->fsuid;
+	req->in.h.gid = current->fsgid;
+	req->in.h.pid = current->pid;
+	req->waiting = 1;
+	return req;
+
+ out:
+	atomic_dec(&fc->num_waiting);
+	return ERR_PTR(err);
 }
 
 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	if (atomic_dec_and_test(&req->count)) {
-		spin_lock(&fuse_lock);
-		fuse_putback_request(fc, req);
-		spin_unlock(&fuse_lock);
+		if (req->waiting)
+			atomic_dec(&fc->num_waiting);
+		fuse_request_free(req);
 	}
 }
 
-static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req)
-{
-	if (atomic_dec_and_test(&req->count))
-		fuse_putback_request(fc, req);
-}
-
-void fuse_release_background(struct fuse_req *req)
+void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req)
 {
-	iput(req->inode);
-	iput(req->inode2);
-	if (req->file)
-		fput(req->file);
-	spin_lock(&fuse_lock);
-	list_del(&req->bg_entry);
-	spin_unlock(&fuse_lock);
+	list_del_init(&req->bg_entry);
+	if (fc->num_background == FUSE_MAX_BACKGROUND) {
+		fc->blocked = 0;
+		wake_up_all(&fc->blocked_waitq);
+	}
+	fc->num_background--;
 }
 
 /*
@@ -184,28 +154,38 @@ void fuse_release_background(struct fuse_req *req)
  * interrupted and put in the background, it will return with an error
  * and hence never be reset and reused.
 *
- * Called with fuse_lock, unlocks it
+ * Called with fc->lock, unlocks it
 */
 static void request_end(struct fuse_conn *fc, struct fuse_req *req)
 {
 	list_del(&req->list);
 	req->state = FUSE_REQ_FINISHED;
 	if (!req->background) {
+		spin_unlock(&fc->lock);
 		wake_up(&req->waitq);
-		fuse_put_request_locked(fc, req);
-		spin_unlock(&fuse_lock);
+		fuse_put_request(fc, req);
 	} else {
+		struct inode *inode = req->inode;
+		struct inode *inode2 = req->inode2;
+		struct file *file = req->file;
 		void (*end) (struct fuse_conn *, struct fuse_req *) = req->end;
 		req->end = NULL;
-		spin_unlock(&fuse_lock);
-		down_read(&fc->sbput_sem);
-		if (fc->mounted)
-			fuse_release_background(req);
-		up_read(&fc->sbput_sem);
+		req->inode = NULL;
+		req->inode2 = NULL;
+		req->file = NULL;
+		if (!list_empty(&req->bg_entry))
+			fuse_remove_background(fc, req);
+		spin_unlock(&fc->lock);
+
 		if (end)
 			end(fc, req);
 		else
 			fuse_put_request(fc, req);
+
+		if (file)
+			fput(file);
+		iput(inode);
+		iput(inode2);
 	}
 }
@@ -242,6 +222,9 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->background = 1;
 	list_add(&req->bg_entry, &fc->background);
+	fc->num_background++;
+	if (fc->num_background == FUSE_MAX_BACKGROUND)
+		fc->blocked = 1;
 	if (req->inode)
 		req->inode = igrab(req->inode);
 	if (req->inode2)
@@ -250,16 +233,16 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req)
 		get_file(req->file);
 }
 
-/* Called with fuse_lock held.  Releases, and then reacquires it. */
+/* Called with fc->lock held.  Releases, and then reacquires it. */
 static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
 {
 	sigset_t oldset;
 
-	spin_unlock(&fuse_lock);
+	spin_unlock(&fc->lock);
 	block_sigs(&oldset);
 	wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED);
 	restore_sigs(&oldset);
-	spin_lock(&fuse_lock);
+	spin_lock(&fc->lock);
 	if (req->state == FUSE_REQ_FINISHED && !req->interrupted)
 		return;
@@ -273,9 +256,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
 		   locked state, there mustn't be any filesystem
 		   operation (e.g. page fault), since that could lead
 		   to deadlock */
-		spin_unlock(&fuse_lock);
+		spin_unlock(&fc->lock);
 		wait_event(req->waitq, !req->locked);
-		spin_lock(&fuse_lock);
+		spin_lock(&fc->lock);
 	}
 	if (req->state == FUSE_REQ_PENDING) {
 		list_del(&req->list);
@@ -304,19 +287,14 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 	req->in.h.unique = fc->reqctr;
 	req->in.h.len = sizeof(struct fuse_in_header) +
 		len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
-	if (!req->preallocated) {
-		/* If request is not preallocated (either FORGET or
-		   RELEASE), then still decrease outstanding_sem, so
-		   user can't open infinite number of files while not
-		   processing the RELEASE requests.  However for
-		   efficiency do it without blocking, so if down()
-		   would block, just increase the debt instead */
-		if (down_trylock(&fc->outstanding_sem))
-			fc->outstanding_debt++;
-	}
 	list_add_tail(&req->list, &fc->pending);
 	req->state = FUSE_REQ_PENDING;
+	if (!req->waiting) {
+		req->waiting = 1;
+		atomic_inc(&fc->num_waiting);
+	}
 	wake_up(&fc->waitq);
+	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 }
 
 /*
@@ -325,7 +303,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
 void request_send(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->isreply = 1;
-	spin_lock(&fuse_lock);
+	spin_lock(&fc->lock);
 	if (!fc->connected)
 		req->out.h.error = -ENOTCONN;
 	else if (fc->conn_error)
@@ -338,15 +316,16 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
 
 		request_wait_answer(fc, req);
 	}
-	spin_unlock(&fuse_lock);
+	spin_unlock(&fc->lock);
 }
 
 static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
 {
-	spin_lock(&fuse_lock);
+	spin_lock(&fc->lock);
+	background_request(fc, req);
 	if (fc->connected) {
 		queue_request(fc, req);
-		spin_unlock(&fuse_lock);
+		spin_unlock(&fc->lock);
 	} else {
 		req->out.h.error = -ENOTCONN;
 		request_end(fc, req);
@@ -362,9 +341,6 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req)
 void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 {
 	req->isreply = 1;
-	spin_lock(&fuse_lock);
-	background_request(fc, req);
-	spin_unlock(&fuse_lock);
 	request_send_nowait(fc, req);
 }
 
@@ -373,16 +349,16 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req)
 * anything that could cause a page-fault.  If the request was already
 * interrupted bail out.
 */
-static int lock_request(struct fuse_req *req)
+static int lock_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	int err = 0;
 	if (req) {
-		spin_lock(&fuse_lock);
+		spin_lock(&fc->lock);
 		if (req->interrupted)
 			err = -ENOENT;
 		else
 			req->locked = 1;
-		spin_unlock(&fuse_lock);
+		spin_unlock(&fc->lock);
 	}
 	return err;
 }
@@ -392,18 +368,19 @@ static int lock_request(struct fuse_req *req)
 * requester thread is currently waiting for it to be unlocked, so
 * wake it up.
 */
-static void unlock_request(struct fuse_req *req)
+static void unlock_request(struct fuse_conn *fc, struct fuse_req *req)
 {
 	if (req) {
-		spin_lock(&fuse_lock);
+		spin_lock(&fc->lock);
 		req->locked = 0;
 		if (req->interrupted)
 			wake_up(&req->waitq);
-		spin_unlock(&fuse_lock);
+		spin_unlock(&fc->lock);
 	}
 }
 
 struct fuse_copy_state {
+	struct fuse_conn *fc;
 	int write;
 	struct fuse_req *req;
 	const struct iovec *iov;
@@ -416,11 +393,12 @@ struct fuse_copy_state {
 	unsigned len;
 };
 
-static void fuse_copy_init(struct fuse_copy_state *cs, int write,
-			   struct fuse_req *req, const struct iovec *iov,
-			   unsigned long nr_segs)
+static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc,
+			   int write, struct fuse_req *req,
+			   const struct iovec *iov, unsigned long nr_segs)
 {
 	memset(cs, 0, sizeof(*cs));
+	cs->fc = fc;
 	cs->write = write;
 	cs->req = req;
 	cs->iov = iov;
@@ -450,7 +428,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 	unsigned long offset;
 	int err;
 
-	unlock_request(cs->req);
+	unlock_request(cs->fc, cs->req);
 	fuse_copy_finish(cs);
 	if (!cs->seglen) {
 		BUG_ON(!cs->nr_segs);
@@ -473,7 +451,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
 	cs->seglen -= cs->len;
 	cs->addr += cs->len;
 
-	return lock_request(cs->req);
+	return lock_request(cs->fc, cs->req);
 }
 
 /* Do as much copy to/from userspace buffer as we can */
@@ -585,9 +563,9 @@ static void request_wait(struct fuse_conn *fc)
 		if (signal_pending(current))
 			break;
 
-		spin_unlock(&fuse_lock);
+		spin_unlock(&fc->lock);
 		schedule();
-		spin_lock(&fuse_lock);
+		spin_lock(&fc->lock);
 	}
 	set_current_state(TASK_RUNNING);
 	remove_wait_queue(&fc->waitq, &wait);
@@ -606,18 +584,21 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 			      unsigned long nr_segs, loff_t *off)
 {
 	int err;
-	struct fuse_conn *fc;
 	struct fuse_req *req;
 	struct fuse_in *in;
 	struct fuse_copy_state cs;
 	unsigned reqsize;
+	struct fuse_conn *fc = fuse_get_conn(file);
+	if (!fc)
+		return -EPERM;
 
 restart:
-	spin_lock(&fuse_lock);
-	fc = file->private_data;
-	err = -EPERM;
-	if (!fc)
+	spin_lock(&fc->lock);
+	err = -EAGAIN;
+	if ((file->f_flags & O_NONBLOCK) && fc->connected &&
+	    list_empty(&fc->pending))
 		goto err_unlock;
+
 	request_wait(fc);
 	err = -ENODEV;
 	if (!fc->connected)
@@ -641,14 +622,14 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 		request_end(fc, req);
 		goto restart;
 	}
-	spin_unlock(&fuse_lock);
-	fuse_copy_init(&cs, 1, req, iov, nr_segs);
+	spin_unlock(&fc->lock);
+	fuse_copy_init(&cs, fc, 1, req, iov, nr_segs);
 	err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
 	if (!err)
 		err = fuse_copy_args(&cs, in->numargs, in->argpages,
 				     (struct fuse_arg *) in->args, 0);
 	fuse_copy_finish(&cs);
-	spin_lock(&fuse_lock);
+	spin_lock(&fc->lock);
 	req->locked = 0;
 	if (!err && req->interrupted)
 		err = -ENOENT;
@@ -663,12 +644,12 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
 	else {
 		req->state = FUSE_REQ_SENT;
 		list_move_tail(&req->list, &fc->processing);
-		spin_unlock(&fuse_lock);
+		spin_unlock(&fc->lock);
 	}
 	return reqsize;
 
 err_unlock:
-	spin_unlock(&fuse_lock);
+	spin_unlock(&fc->lock);
 	return err;
 }
 
@@ -735,9 +716,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 	struct fuse_copy_state cs;
 	struct fuse_conn *fc = fuse_get_conn(file);
 	if (!fc)
-		return -ENODEV;
+		return -EPERM;
 
-	fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
+	fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs);
 	if (nbytes < sizeof(struct fuse_out_header))
 		return -EINVAL;
@@ -749,7 +730,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 	    oh.len != nbytes)
 		goto err_finish;
 
-	spin_lock(&fuse_lock);
+	spin_lock(&fc->lock);
 	err = -ENOENT;
 	if (!fc->connected)
 		goto err_unlock;
@@ -760,9 +741,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 		goto err_unlock;
 
 	if (req->interrupted) {
-		spin_unlock(&fuse_lock);
+		spin_unlock(&fc->lock);
 		fuse_copy_finish(&cs);
-		spin_lock(&fuse_lock);
+		spin_lock(&fc->lock);
 		request_end(fc, req);
 		return -ENOENT;
 	}
@@ -770,12 +751,12 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 	req->out.h = oh;
 	req->locked = 1;
 	cs.req = req;
-	spin_unlock(&fuse_lock);
+	spin_unlock(&fc->lock);
 
 	err = copy_out_args(&cs, &req->out, nbytes);
 	fuse_copy_finish(&cs);
 
-	spin_lock(&fuse_lock);
+	spin_lock(&fc->lock);
 	req->locked = 0;
 	if (!err) {
 		if (req->interrupted)
@@ -787,7 +768,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
 	return err ? err : nbytes;
 
 err_unlock:
-	spin_unlock(&fuse_lock);
+	spin_unlock(&fc->lock);
 err_finish:
 	fuse_copy_finish(&cs);
 	return err;
@@ -804,18 +785,19 @@ static ssize_t fuse_dev_write(struct file *file, const char __user *buf,
 
 static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 {
-	struct fuse_conn *fc = fuse_get_conn(file);
 	unsigned mask = POLLOUT | POLLWRNORM;
-
+	struct fuse_conn *fc = fuse_get_conn(file);
 	if (!fc)
-		return -ENODEV;
+		return POLLERR;
 
 	poll_wait(file, &fc->waitq, wait);
 
-	spin_lock(&fuse_lock);
-	if (!list_empty(&fc->pending))
-		mask |= POLLIN | POLLRDNORM;
-	spin_unlock(&fuse_lock);
+	spin_lock(&fc->lock);
+	if (!fc->connected)
+		mask = POLLERR;
+	else if (!list_empty(&fc->pending))
+		mask |= POLLIN | POLLRDNORM;
+	spin_unlock(&fc->lock);
 
 	return mask;
 }
@@ -823,7 +805,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
 /*
 * Abort all requests on the given list (pending or processing)
 *
- * This function releases and reacquires fuse_lock
+ * This function releases and reacquires fc->lock
 */
 static void end_requests(struct fuse_conn *fc, struct list_head *head)
 {
@@ -832,7 +814,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head)
 		req = list_entry(head->next, struct fuse_req, list);
 		req->out.h.error = -ECONNABORTED;
 		request_end(fc, req);
-		spin_lock(&fuse_lock);
+		spin_lock(&fc->lock);
 	}
 }
 
@@ -863,10 +845,10 @@ static void end_io_requests(struct fuse_conn *fc)
 			req->end = NULL;
 			/* The end function will consume this reference */
 			__fuse_get_request(req);
-			spin_unlock(&fuse_lock);
+			spin_unlock(&fc->lock);
 			wait_event(req->waitq, !req->locked);
 			end(fc, req);
-			spin_lock(&fuse_lock);
+			spin_lock(&fc->lock);
 		}
 	}
 }
@@ -893,35 +875,44 @@ static void end_io_requests(struct fuse_conn *fc)
 */
 void fuse_abort_conn(struct fuse_conn *fc)
 {
-	spin_lock(&fuse_lock);
+	spin_lock(&fc->lock);
 	if (fc->connected) {
 		fc->connected = 0;
 		end_io_requests(fc);
 		end_requests(fc, &fc->pending);
 		end_requests(fc, &fc->processing);
 		wake_up_all(&fc->waitq);
+		kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 	}
-	spin_unlock(&fuse_lock);
+	spin_unlock(&fc->lock);
 }
 
 static int fuse_dev_release(struct inode *inode, struct file *file)
 {
-	struct fuse_conn *fc;
-
-	spin_lock(&fuse_lock);
-	fc = file->private_data;
+	struct fuse_conn *fc = fuse_get_conn(file);
 	if (fc) {
+		spin_lock(&fc->lock);
 		fc->connected = 0;
 		end_requests(fc, &fc->pending);
 		end_requests(fc, &fc->processing);
-	}
-	spin_unlock(&fuse_lock);
-	if (fc)
+		spin_unlock(&fc->lock);
+		fasync_helper(-1, file, 0, &fc->fasync);
 		kobject_put(&fc->kobj);
+	}
 
 	return 0;
 }
 
+static int fuse_dev_fasync(int fd, struct file *file, int on)
+{
+	struct fuse_conn *fc = fuse_get_conn(file);
+	if (!fc)
+		return -EPERM;
+
+	/* No locking - fasync_helper does its own locking */
+	return fasync_helper(fd, file, on, &fc->fasync);
+}
+
 const struct file_operations fuse_dev_operations = {
 	.owner		= THIS_MODULE,
 	.llseek		= no_llseek,
@@ -931,6 +922,7 @@ const struct file_operations fuse_dev_operations = {
 	.writev		= fuse_dev_writev,
 	.poll		= fuse_dev_poll,
 	.release	= fuse_dev_release,
+	.fasync		= fuse_dev_fasync,
 };
 
 static struct miscdevice fuse_miscdevice = {
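The new `fuse_dev_fasync()` above wires /dev/fuse into the stock fasync machinery: `fasync_helper()` records the file for SIGIO delivery, and `kill_fasync()` fires it from `queue_request()`, `fuse_abort_conn()` and unmount. The same facility is visible from userspace on any fd that supports O_ASYNC; a hedged sketch on a plain pipe:

```c
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>

static volatile sig_atomic_t got_sigio;

static void on_sigio(int sig)
{
	(void)sig;
	got_sigio = 1;
}

int main(void)
{
	int fds[2];
	sigset_t block, old;

	if (pipe(fds))
		return 1;

	signal(SIGIO, on_sigio);
	/* Block SIGIO so we can wait for it race-free with sigsuspend() */
	sigemptyset(&block);
	sigaddset(&block, SIGIO);
	sigprocmask(SIG_BLOCK, &block, &old);

	/* What fasync_helper() records for a file: an owner, plus O_ASYNC */
	fcntl(fds[0], F_SETOWN, getpid());
	fcntl(fds[0], F_SETFL, fcntl(fds[0], F_GETFL) | O_ASYNC);

	/* Readable data makes the kernel raise SIGIO at the owner, the
	 * userspace-visible effect of a kill_fasync() in the driver. */
	write(fds[1], "x", 1);

	while (!got_sigio)
		sigsuspend(&old);	/* atomically unblock and wait */
	puts("SIGIO received");
	return 0;
}
```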
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 256355b80256..8d7546e832e8 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -117,8 +117,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd)
 			return 0;
 
 		fc = get_fuse_conn(inode);
-		req = fuse_get_request(fc);
-		if (!req)
+		req = fuse_get_req(fc);
+		if (IS_ERR(req))
 			return 0;
 
 		fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg);
@@ -188,9 +188,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
 	if (entry->d_name.len > FUSE_NAME_MAX)
 		return ERR_PTR(-ENAMETOOLONG);
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return ERR_PTR(-EINTR);
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return ERR_PTR(PTR_ERR(req));
 
 	fuse_lookup_init(req, dir, entry, &outarg);
 	request_send(fc, req);
@@ -244,15 +244,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	struct file *file;
 	int flags = nd->intent.open.flags - 1;
 
-	err = -ENOSYS;
 	if (fc->no_create)
-		goto out;
+		return -ENOSYS;
 
-	err = -EINTR;
-	req = fuse_get_request(fc);
-	if (!req)
-		goto out;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
+	err = -ENOMEM;
 	ff = fuse_file_alloc();
 	if (!ff)
 		goto out_put_request;
@@ -314,7 +313,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode,
 	fuse_file_free(ff);
 out_put_request:
 	fuse_put_request(fc, req);
- out:
 	return err;
 }
 
@@ -375,9 +373,9 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode,
 {
 	struct fuse_mknod_in inarg;
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_req *req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	struct fuse_req *req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mode = mode;
@@ -407,9 +405,9 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode)
 {
 	struct fuse_mkdir_in inarg;
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_req *req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	struct fuse_req *req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mode = mode;
@@ -427,9 +425,9 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry,
 {
 	struct fuse_conn *fc = get_fuse_conn(dir);
 	unsigned len = strlen(link) + 1;
-	struct fuse_req *req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	struct fuse_req *req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	req->in.h.opcode = FUSE_SYMLINK;
 	req->in.numargs = 2;
@@ -444,9 +442,9 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry)
 {
 	int err;
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_req *req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	struct fuse_req *req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	req->in.h.opcode = FUSE_UNLINK;
 	req->in.h.nodeid = get_node_id(dir);
@@ -476,9 +474,9 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry)
 {
 	int err;
 	struct fuse_conn *fc = get_fuse_conn(dir);
-	struct fuse_req *req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	struct fuse_req *req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	req->in.h.opcode = FUSE_RMDIR;
 	req->in.h.nodeid = get_node_id(dir);
@@ -504,9 +502,9 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent,
 	int err;
 	struct fuse_rename_in inarg;
 	struct fuse_conn *fc = get_fuse_conn(olddir);
-	struct fuse_req *req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	struct fuse_req *req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.newdir = get_node_id(newdir);
@@ -553,9 +551,9 @@ static int fuse_link(struct dentry *entry, struct inode *newdir,
 	struct fuse_link_in inarg;
 	struct inode *inode = entry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	struct fuse_req *req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.oldnodeid = get_node_id(inode);
@@ -583,9 +581,9 @@ int fuse_do_getattr(struct inode *inode)
 	int err;
 	struct fuse_attr_out arg;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	struct fuse_req *req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	req->in.h.opcode = FUSE_GETATTR;
 	req->in.h.nodeid = get_node_id(inode);
@@ -673,9 +671,9 @@ static int fuse_access(struct inode *inode, int mask)
 	if (fc->no_access)
 		return 0;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.mask = mask;
@@ -780,9 +778,9 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir)
 	if (is_bad_inode(inode))
 		return -EIO;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	page = alloc_page(GFP_KERNEL);
 	if (!page) {
@@ -809,11 +807,11 @@ static char *read_link(struct dentry *dentry)
 {
 	struct inode *inode = dentry->d_inode;
 	struct fuse_conn *fc = get_fuse_conn(inode);
-	struct fuse_req *req = fuse_get_request(fc);
+	struct fuse_req *req = fuse_get_req(fc);
 	char *link;
 
-	if (!req)
-		return ERR_PTR(-EINTR);
+	if (IS_ERR(req))
+		return ERR_PTR(PTR_ERR(req));
 
 	link = (char *) __get_free_page(GFP_KERNEL);
 	if (!link) {
@@ -933,9 +931,9 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr)
 		}
 	}
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	iattr_to_fattr(attr, &inarg);
@@ -995,9 +993,9 @@ static int fuse_setxattr(struct dentry *entry, const char *name,
 	if (fc->no_setxattr)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.size = size;
@@ -1035,9 +1033,9 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name,
 	if (fc->no_getxattr)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.size = size;
@@ -1085,9 +1083,9 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size)
 	if (fc->no_listxattr)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.size = size;
@@ -1131,9 +1129,9 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
 	if (fc->no_removexattr)
 		return -EOPNOTSUPP;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	req->in.h.opcode = FUSE_REMOVEXATTR;
 	req->in.h.nodeid = get_node_id(inode);
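The `fuse_get_req()` conversion running through fs/fuse/dir.c above replaces the old NULL-or-pointer return with the kernel's ERR_PTR convention, which encodes a small negative errno in the pointer value itself so one return slot carries both outcomes. A hedged userspace model of the three helpers (the kernel's real definitions live in `<linux/err.h>`; `get_req()` is a made-up stand-in):

```c
#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095	/* same bound the kernel uses */

static inline void *ERR_PTR(long error)     { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	/* errno values occupy the topmost page of the address space */
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Hypothetical allocator mimicking fuse_get_req()'s failure modes */
static void *get_req(int interrupted)
{
	static char req[64];
	if (interrupted)
		return ERR_PTR(-EINTR);
	return req;
}

int main(void)
{
	void *req = get_req(1);
	if (IS_ERR(req))	/* the caller pattern from the hunks above */
		printf("failed: %ld\n", PTR_ERR(req));
	return 0;
}
```

The payoff in the hunks above is that callers stop hard-coding `-EINTR` and simply propagate `PTR_ERR(req)`, so the allocator can also fail with `-ENOMEM`.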
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index 975f2697e866..fc342cf7c2cc 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -22,9 +22,9 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir,
 	struct fuse_req *req;
 	int err;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
@@ -184,9 +184,9 @@ static int fuse_flush(struct file *file)
 	if (fc->no_flush)
 		return 0;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
@@ -223,9 +223,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync,
 	if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir))
 		return 0;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
@@ -297,9 +297,9 @@ static int fuse_readpage(struct file *file, struct page *page)
 	if (is_bad_inode(inode))
 		goto out;
 
-	err = -EINTR;
-	req = fuse_get_request(fc);
-	if (!req)
+	req = fuse_get_req(fc);
+	err = PTR_ERR(req);
+	if (IS_ERR(req))
 		goto out;
 
 	req->out.page_zeroing = 1;
@@ -368,10 +368,10 @@ static int fuse_readpages_fill(void *_data, struct page *page)
 	    (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read ||
 	    req->pages[req->num_pages - 1]->index + 1 != page->index)) {
 		fuse_send_readpages(req, data->file, inode);
-		data->req = req = fuse_get_request(fc);
-		if (!req) {
+		data->req = req = fuse_get_req(fc);
+		if (IS_ERR(req)) {
 			unlock_page(page);
-			return -EINTR;
+			return PTR_ERR(req);
 		}
 	}
 	req->pages[req->num_pages] = page;
@@ -392,13 +392,17 @@ static int fuse_readpages(struct file *file, struct address_space *mapping,
 	data.file = file;
 	data.inode = inode;
-	data.req = fuse_get_request(fc);
-	if (!data.req)
-		return -EINTR;
+	data.req = fuse_get_req(fc);
+	if (IS_ERR(data.req))
+		return PTR_ERR(data.req);
 
 	err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data);
-	if (!err)
-		fuse_send_readpages(data.req, file, inode);
+	if (!err) {
+		if (data.req->num_pages)
+			fuse_send_readpages(data.req, file, inode);
+		else
+			fuse_put_request(fc, data.req);
+	}
 	return err;
 }
 
@@ -451,9 +455,9 @@ static int fuse_commit_write(struct file *file, struct page *page,
 	if (is_bad_inode(inode))
 		return -EIO;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	req->num_pages = 1;
 	req->pages[0] = page;
@@ -528,9 +532,9 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 	if (is_bad_inode(inode))
 		return -EIO;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	while (count) {
 		size_t nres;
@@ -561,8 +565,12 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf,
 		buf += nres;
 		if (nres != nbytes)
 			break;
-		if (count)
-			fuse_reset_request(req);
+		if (count) {
+			fuse_put_request(fc, req);
+			req = fuse_get_req(fc);
+			if (IS_ERR(req))
+				break;
+		}
 	}
 	fuse_put_request(fc, req);
 	if (res > 0) {
diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
index a16a04fcf41e..59661c481d9d 100644
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -18,8 +18,8 @@
 /** Max number of pages that can be used in a single read request */
 #define FUSE_MAX_PAGES_PER_REQ 32
 
-/** If more requests are outstanding, then the operation will block */
-#define FUSE_MAX_OUTSTANDING 10
+/** Maximum number of outstanding background requests */
+#define FUSE_MAX_BACKGROUND 10
 
 /** It could be as large as PATH_MAX, but would that have any uses? */
 #define FUSE_NAME_MAX 1024
@@ -131,8 +131,8 @@ struct fuse_conn;
 * A request to the client
 */
 struct fuse_req {
-	/** This can be on either unused_list, pending processing or
-	    io lists in fuse_conn */
+	/** This can be on either pending processing or io lists in
+	    fuse_conn */
 	struct list_head list;
 
 	/** Entry on the background list */
@@ -144,15 +144,12 @@ struct fuse_req {
 	/*
 	 * The following bitfields are either set once before the
 	 * request is queued or setting/clearing them is protected by
-	 * fuse_lock
+	 * fuse_conn->lock
 	 */
 
 	/** True if the request has reply */
 	unsigned isreply:1;
 
-	/** The request is preallocated */
-	unsigned preallocated:1;
-
 	/** The request was interrupted */
 	unsigned interrupted:1;
 
@@ -162,6 +159,9 @@ struct fuse_req {
 	/** Data is being copied to/from the request */
 	unsigned locked:1;
 
+	/** Request is counted as "waiting" */
+	unsigned waiting:1;
+
 	/** State of the request */
 	enum fuse_req_state state;
 
@@ -213,6 +213,9 @@ struct fuse_req {
 * unmounted.
 */
 struct fuse_conn {
+	/** Lock protecting accessess to members of this structure */
+	spinlock_t lock;
+
 	/** The user id for this mount */
 	uid_t user_id;
 
@@ -244,25 +247,20 @@ struct fuse_conn {
 	    interrupted request) */
 	struct list_head background;
 
-	/** Controls the maximum number of outstanding requests */
-	struct semaphore outstanding_sem;
+	/** Number of requests currently in the background */
+	unsigned num_background;
 
-	/** This counts the number of outstanding requests if
-	    outstanding_sem would go negative */
-	unsigned outstanding_debt;
+	/** Flag indicating if connection is blocked.  This will be
+	    the case before the INIT reply is received, and if there
+	    are too many outstading backgrounds requests */
+	int blocked;
 
-	/** RW semaphore for exclusion with fuse_put_super() */
-	struct rw_semaphore sbput_sem;
-
-	/** The list of unused requests */
-	struct list_head unused_list;
+	/** waitq for blocked connection */
+	wait_queue_head_t blocked_waitq;
 
 	/** The next unique request id */
 	u64 reqctr;
 
-	/** Mount is active */
-	unsigned mounted;
-
 	/** Connection established, cleared on umount, connection
 	    abort and device release */
 	unsigned connected;
@@ -318,6 +316,9 @@ struct fuse_conn {
 
 	/** kobject */
 	struct kobject kobj;
+
+	/** O_ASYNC requests */
+	struct fasync_struct *fasync;
 };
 
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -349,21 +350,6 @@ static inline u64 get_node_id(struct inode *inode)
 extern const struct file_operations fuse_dev_operations;
 
 /**
- * This is the single global spinlock which protects FUSE's structures
- *
- * The following data is protected by this lock:
- *
- *  - the private_data field of the device file
- *  - the s_fs_info field of the super block
- *  - unused_list, pending, processing lists in fuse_conn
- *  - background list in fuse_conn
- *  - the unique request ID counter reqctr in fuse_conn
- *  - the sb (super_block) field in fuse_conn
- *  - the file (device file) field in fuse_conn
- */
-extern spinlock_t fuse_lock;
-
-/**
 * Get a filled in inode
 */
 struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid,
@@ -461,11 +447,11 @@ void fuse_reset_request(struct fuse_req *req);
 
 /**
 * Reserve a preallocated request
 */
-struct fuse_req *fuse_get_request(struct fuse_conn *fc);
+struct fuse_req *fuse_get_req(struct fuse_conn *fc);
 
 /**
- * Decrement reference count of a request.  If count goes to zero put
- * on unused list (preallocated) or free request (not preallocated).
+ * Decrement reference count of a request. If count goes to zero free
+ * the request.
 */
 void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
 
@@ -485,11 +471,11 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req);
 void request_send_background(struct fuse_conn *fc, struct fuse_req *req);
 
 /**
- * Release inodes and file associated with background request
+ * Remove request from the the background list
 */
-void fuse_release_background(struct fuse_req *req);
+void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req);
 
-/* Abort all requests */
+/** Abort all requests */
 void fuse_abort_conn(struct fuse_conn *fc);
 
 /**
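The fuse_i.h changes above swap the preallocated-request semaphore (`outstanding_sem` plus the `outstanding_debt` bookkeeping) for a simple throttle: a `blocked` flag, a `blocked_waitq`, and a `num_background` count capped at `FUSE_MAX_BACKGROUND`. A hedged pthread analogue of that scheme, with the kernel call each function models noted in comments:

```c
#include <pthread.h>

#define MAX_BACKGROUND 10	/* FUSE_MAX_BACKGROUND */

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t blocked_waitq = PTHREAD_COND_INITIALIZER;
static int blocked = 1;		/* blocked until "INIT" completes */
static unsigned num_background;

static void init_done(void)	/* like process_init_reply() */
{
	pthread_mutex_lock(&lock);
	blocked = 0;
	pthread_cond_broadcast(&blocked_waitq);	/* wake_up_all() */
	pthread_mutex_unlock(&lock);
}

static void get_req_gate(void)	/* entry gate in fuse_get_req() */
{
	pthread_mutex_lock(&lock);
	while (blocked)		/* wait_event(..., !fc->blocked) */
		pthread_cond_wait(&blocked_waitq, &lock);
	pthread_mutex_unlock(&lock);
}

static void background_start(void)	/* like background_request() */
{
	pthread_mutex_lock(&lock);
	if (++num_background == MAX_BACKGROUND)
		blocked = 1;	/* too many in flight: stall submitters */
	pthread_mutex_unlock(&lock);
}

static void background_end(void)	/* like fuse_remove_background() */
{
	pthread_mutex_lock(&lock);
	if (num_background-- == MAX_BACKGROUND) {
		blocked = 0;
		pthread_cond_broadcast(&blocked_waitq);
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	init_done();
	background_start();
	get_req_gate();		/* passes: only one request in background */
	background_end();
	return 0;
}
```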
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 879e6fba9480..43a6fc0db8a7 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1,6 +1,6 @@
 /*
   FUSE: Filesystem in Userspace
-  Copyright (C) 2001-2005  Miklos Szeredi <miklos@szeredi.hu>
+  Copyright (C) 2001-2006  Miklos Szeredi <miklos@szeredi.hu>
 
   This program can be distributed under the terms of the GNU GPL.
   See the file COPYING.
@@ -22,7 +22,6 @@ MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
 MODULE_LICENSE("GPL");
 
-spinlock_t fuse_lock;
 static kmem_cache_t *fuse_inode_cachep;
 static struct subsystem connections_subsys;
 
@@ -205,17 +204,28 @@ static void fuse_put_super(struct super_block *sb)
 {
 	struct fuse_conn *fc = get_fuse_conn_super(sb);
 
-	down_write(&fc->sbput_sem);
-	while (!list_empty(&fc->background))
-		fuse_release_background(list_entry(fc->background.next,
-						   struct fuse_req, bg_entry));
-
-	spin_lock(&fuse_lock);
-	fc->mounted = 0;
+	spin_lock(&fc->lock);
 	fc->connected = 0;
-	spin_unlock(&fuse_lock);
-	up_write(&fc->sbput_sem);
+	while (!list_empty(&fc->background)) {
+		struct fuse_req *req = list_entry(fc->background.next,
+						  struct fuse_req, bg_entry);
+		struct inode *inode = req->inode;
+		struct inode *inode2 = req->inode2;
+
+		/* File would hold a reference to vfsmount */
+		BUG_ON(req->file);
+		req->inode = NULL;
+		req->inode2 = NULL;
+		fuse_remove_background(fc, req);
+
+		spin_unlock(&fc->lock);
+		iput(inode);
+		iput(inode2);
+		spin_lock(&fc->lock);
+	}
+	spin_unlock(&fc->lock);
 	/* Flush all readers on this fs */
+	kill_fasync(&fc->fasync, SIGIO, POLL_IN);
 	wake_up_all(&fc->waitq);
 	kobject_del(&fc->kobj);
 	kobject_put(&fc->kobj);
@@ -242,9 +252,9 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf)
 	struct fuse_statfs_out outarg;
 	int err;
 
-	req = fuse_get_request(fc);
-	if (!req)
-		return -EINTR;
+	req = fuse_get_req(fc);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 
 	memset(&outarg, 0, sizeof(outarg));
 	req->in.numargs = 0;
@@ -369,15 +379,7 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt)
 
 static void fuse_conn_release(struct kobject *kobj)
 {
-	struct fuse_conn *fc = get_fuse_conn_kobj(kobj);
-
-	while (!list_empty(&fc->unused_list)) {
-		struct fuse_req *req;
-		req = list_entry(fc->unused_list.next, struct fuse_req, list);
-		list_del(&req->list);
-		fuse_request_free(req);
-	}
-	kfree(fc);
+	kfree(get_fuse_conn_kobj(kobj));
 }
 
 static struct fuse_conn *new_conn(void)
@@ -386,64 +388,24 @@ static struct fuse_conn *new_conn(void)
 
 	fc = kzalloc(sizeof(*fc), GFP_KERNEL);
 	if (fc) {
-		int i;
+		spin_lock_init(&fc->lock);
 		init_waitqueue_head(&fc->waitq);
+		init_waitqueue_head(&fc->blocked_waitq);
 		INIT_LIST_HEAD(&fc->pending);
 		INIT_LIST_HEAD(&fc->processing);
 		INIT_LIST_HEAD(&fc->io);
-		INIT_LIST_HEAD(&fc->unused_list);
 		INIT_LIST_HEAD(&fc->background);
-		sema_init(&fc->outstanding_sem, 1); /* One for INIT */
-		init_rwsem(&fc->sbput_sem);
 		kobj_set_kset_s(fc, connections_subsys);
 		kobject_init(&fc->kobj);
 		atomic_set(&fc->num_waiting, 0);
-		for (i = 0; i < FUSE_MAX_OUTSTANDING; i++) {
-			struct fuse_req *req = fuse_request_alloc();
-			if (!req) {
-				kobject_put(&fc->kobj);
-				return NULL;
-			}
-			list_add(&req->list, &fc->unused_list);
-		}
 		fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE;
 		fc->bdi.unplug_io_fn = default_unplug_io_fn;
 		fc->reqctr = 0;
+		fc->blocked = 1;
 	}
 	return fc;
 }
 
-static struct fuse_conn *get_conn(struct file *file, struct super_block *sb)
-{
-	struct fuse_conn *fc;
-	int err;
-
-	err = -EINVAL;
-	if (file->f_op != &fuse_dev_operations)
-		goto out_err;
-
-	err = -ENOMEM;
-	fc = new_conn();
-	if (!fc)
-		goto out_err;
-
-	spin_lock(&fuse_lock);
-	err = -EINVAL;
-	if (file->private_data)
-		goto out_unlock;
-
-	kobject_get(&fc->kobj);
-	file->private_data = fc;
-	spin_unlock(&fuse_lock);
-	return fc;
-
- out_unlock:
-	spin_unlock(&fuse_lock);
-	kobject_put(&fc->kobj);
- out_err:
-	return ERR_PTR(err);
-}
-
 static struct inode *get_root_inode(struct super_block *sb, unsigned mode)
 {
 	struct fuse_attr attr;
@@ -467,7 +429,6 @@ static struct super_operations fuse_super_operations = {
 
 static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 {
-	int i;
 	struct fuse_init_out *arg = &req->misc.init_out;
 
 	if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION)
@@ -486,22 +447,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req)
 		fc->minor = arg->minor;
 		fc->max_write = arg->minor < 5 ? 4096 : arg->max_write;
 	}
-
-	/* After INIT reply is received other requests can go
-	   out.  So do (FUSE_MAX_OUTSTANDING - 1) number of
-	   up()s on outstanding_sem.  The last up() is done in
-	   fuse_putback_request() */
-	for (i = 1; i < FUSE_MAX_OUTSTANDING; i++)
-		up(&fc->outstanding_sem);
-
 	fuse_put_request(fc, req);
+	fc->blocked = 0;
+	wake_up_all(&fc->blocked_waitq);
 }
 
-static void fuse_send_init(struct fuse_conn *fc)
+static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req)
 {
-	/* This is called from fuse_read_super() so there's guaranteed
-	   to be exactly one request available */
-	struct fuse_req *req = fuse_get_request(fc);
 	struct fuse_init_in *arg = &req->misc.init_in;
 
 	arg->major = FUSE_KERNEL_VERSION;
@@ -525,12 +477,9 @@ static void fuse_send_init(struct fuse_conn *fc)
 
 static unsigned long long conn_id(void)
 {
+	/* BKL is held for ->get_sb() */
 	static unsigned long long ctr = 1;
-	unsigned long long val;
-	spin_lock(&fuse_lock);
-	val = ctr++;
-	spin_unlock(&fuse_lock);
-	return val;
+	return ctr++;
 }
 
 static int fuse_fill_super(struct super_block *sb, void *data, int silent)
@@ -540,6 +489,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	struct fuse_mount_data d;
 	struct file *file;
 	struct dentry *root_dentry;
+	struct fuse_req *init_req;
 	int err;
 
 	if (!parse_fuse_opt((char *) data, &d))
@@ -555,10 +505,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	if (!file)
 		return -EINVAL;
 
-	fc = get_conn(file, sb);
-	fput(file);
-	if (IS_ERR(fc))
-		return PTR_ERR(fc);
+	if (file->f_op != &fuse_dev_operations)
+		return -EINVAL;
+
+	/* Setting file->private_data can't race with other mount()
+	   instances, since BKL is held for ->get_sb() */
+	if (file->private_data)
+		return -EINVAL;
+
+	fc = new_conn();
+	if (!fc)
+		return -ENOMEM;
 
 	fc->flags = d.flags;
 	fc->user_id = d.user_id;
@@ -579,27 +536,39 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 		goto err;
 	}
 
+	init_req = fuse_request_alloc();
+	if (!init_req)
+		goto err_put_root;
+
 	err = kobject_set_name(&fc->kobj, "%llu", conn_id());
 	if (err)
-		goto err_put_root;
+		goto err_free_req;
 
 	err = kobject_add(&fc->kobj);
 	if (err)
-		goto err_put_root;
+		goto err_free_req;
 
 	sb->s_root = root_dentry;
-	spin_lock(&fuse_lock);
-	fc->mounted = 1;
 	fc->connected = 1;
-	spin_unlock(&fuse_lock);
+	kobject_get(&fc->kobj);
+	file->private_data = fc;
+	/*
+	 * atomic_dec_and_test() in fput() provides the necessary
+	 * memory barrier for file->private_data to be visible on all
+	 * CPUs after this
+	 */
+	fput(file);
 
-	fuse_send_init(fc);
+	fuse_send_init(fc, init_req);
 
 	return 0;
 
+ err_free_req:
+	fuse_request_free(init_req);
 err_put_root:
 	dput(root_dentry);
 err:
+	fput(file);
 	kobject_put(&fc->kobj);
 	return err;
 }
@@ -753,7 +722,6 @@ static int __init fuse_init(void)
 	printk("fuse init (API version %i.%i)\n",
 	       FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION);
 
-	spin_lock_init(&fuse_lock);
 	res = fuse_fs_init();
 	if (res)
 		goto err;
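`fuse_put_super()` above drains the background list with a classic idiom: detach one entry while holding the spinlock, drop the lock around the calls that may sleep (`iput()` cannot run under a spinlock), then retake it before scanning again. A hedged single-threaded sketch of the same shape, with a plain linked list standing in for `fc->background`:

```c
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>

struct req {
	struct req *next;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static struct req *background;	/* stand-in for fc->background */

static void blocking_release(struct req *req)
{
	usleep(1000);	/* stands in for iput(), which may sleep */
	free(req);
}

static void drain(void)
{
	pthread_mutex_lock(&lock);
	while (background) {
		struct req *req = background;

		background = req->next;		/* detach under the lock */

		pthread_mutex_unlock(&lock);	/* sleepable work unlocked */
		blocking_release(req);
		pthread_mutex_lock(&lock);	/* retake before next scan */
	}
	pthread_mutex_unlock(&lock);
}

int main(void)
{
	for (int i = 0; i < 3; i++) {
		struct req *r = malloc(sizeof(*r));
		r->next = background;
		background = r;
	}
	drain();
	return 0;
}
```

Re-checking the list head on every iteration (rather than caching a next pointer) is what makes dropping the lock safe: anything can change while it is released.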
diff --git a/fs/inotify.c b/fs/inotify.c
index 367c487c014b..1f50302849c5 100644
--- a/fs/inotify.c
+++ b/fs/inotify.c
@@ -538,7 +538,7 @@ void inotify_d_instantiate(struct dentry *entry, struct inode *inode)
 	WARN_ON(entry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED);
 	spin_lock(&entry->d_lock);
 	parent = entry->d_parent;
-	if (inotify_inode_watched(parent->d_inode))
+	if (parent->d_inode && inotify_inode_watched(parent->d_inode))
 		entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED;
 	spin_unlock(&entry->d_lock);
 }
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index d2b66bad7d50..3ef739120dff 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -650,7 +650,7 @@ static void nlmsvc_grant_callback(struct rpc_task *task, void *data)
 	svc_wake_up(block->b_daemon);
 }
 
-void nlmsvc_grant_release(void *data)
+static void nlmsvc_grant_release(void *data)
 {
 	struct nlm_rqst *call = data;
diff --git a/fs/locks.c b/fs/locks.c
index dda83d6cd48b..efad798824dc 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2230,7 +2230,12 @@ void steal_locks(fl_owner_t from)
 
 	lock_kernel();
 	j = 0;
-	rcu_read_lock();
+
+	/*
+	 * We are not taking a ref to the file structures, so
+	 * we need to acquire ->file_lock.
+	 */
+	spin_lock(&files->file_lock);
 	fdt = files_fdtable(files);
 	for (;;) {
 		unsigned long set;
@@ -2248,7 +2253,7 @@ void steal_locks(fl_owner_t from)
 			set >>= 1;
 		}
 	}
-	rcu_read_unlock();
+	spin_unlock(&files->file_lock);
 	unlock_kernel();
 }
 EXPORT_SYMBOL(steal_locks);
diff --git a/fs/namespace.c b/fs/namespace.c
index bf478addb852..2c5f1f80bdc2 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -899,11 +899,13 @@ static int do_change_type(struct nameidata *nd, int flag)
 /*
 * do loopback mount.
 */
-static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
+static int do_loopback(struct nameidata *nd, char *old_name, unsigned long flags, int mnt_flags)
 {
 	struct nameidata old_nd;
 	struct vfsmount *mnt = NULL;
+	int recurse = flags & MS_REC;
 	int err = mount_is_safe(nd);
+
 	if (err)
 		return err;
 	if (!old_name || !*old_name)
@@ -937,6 +939,7 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse)
 		spin_unlock(&vfsmount_lock);
 		release_mounts(&umount_list);
 	}
+	mnt->mnt_flags = mnt_flags;
 
 out:
 	up_write(&namespace_sem);
@@ -1350,7 +1353,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page,
 		retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags,
 				    data_page);
 	else if (flags & MS_BIND)
-		retval = do_loopback(&nd, dev_name, flags & MS_REC);
+		retval = do_loopback(&nd, dev_name, flags, mnt_flags);
 	else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE))
 		retval = do_change_type(&nd, flags);
 	else if (flags & MS_MOVE)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index a23f34894167..cae74dd4c7f5 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -128,15 +128,14 @@ struct inode_operations nfs4_dir_inode_operations = {
 
 static int nfs_opendir(struct inode *inode, struct file *filp)
 {
-	int res = 0;
+	int res;
 
 	dfprintk(VFS, "NFS: opendir(%s/%ld)\n",
 			inode->i_sb->s_id, inode->i_ino);
 
 	lock_kernel();
 	/* Call generic open code in order to cache credentials */
-	if (!res)
-		res = nfs_open(inode, filp);
+	res = nfs_open(inode, filp);
 	unlock_kernel();
 	return res;
 }
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 0f583cb16ddb..3c72b0c07283 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -112,10 +112,9 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode
 */
 ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs)
 {
-	struct dentry *dentry = iocb->ki_filp->f_dentry;
-
 	dprintk("NFS: nfs_direct_IO (%s) off/no(%Ld/%lu) EINVAL\n",
-			dentry->d_name.name, (long long) pos, nr_segs);
+			iocb->ki_filp->f_dentry->d_name.name,
+			(long long) pos, nr_segs);
 
 	return -EINVAL;
 }
@@ -468,7 +467,6 @@ static const struct rpc_call_ops nfs_commit_direct_ops = {
 static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 {
 	struct nfs_write_data *data = dreq->commit_data;
-	struct rpc_task *task = &data->task;
 
 	data->inode = dreq->inode;
 	data->cred = dreq->ctx->cred;
@@ -489,7 +487,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
 	/* Note: task.tk_ops->rpc_release will free dreq->commit_data */
 	dreq->commit_data = NULL;
 
-	dprintk("NFS: %5u initiated commit call\n", task->tk_pid);
+	dprintk("NFS: %5u initiated commit call\n", data->task.tk_pid);
 
 	lock_kernel();
 	rpc_execute(&data->task);
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index f1df2c8d9259..fade02c15e6e 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -534,10 +534,9 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
 */
 static int nfs_flock(struct file *filp, int cmd, struct file_lock *fl)
 {
-	struct inode * inode = filp->f_mapping->host;
-
 	dprintk("NFS: nfs_flock(f=%s/%ld, t=%x, fl=%x)\n",
-			inode->i_sb->s_id, inode->i_ino,
+			filp->f_dentry->d_inode->i_sb->s_id,
+			filp->f_dentry->d_inode->i_ino,
 			fl->fl_type, fl->fl_flags);
 
 	/*
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 2f7656b911b6..d0b991a92327 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -700,12 +700,9 @@ static int nfs_show_stats(struct seq_file *m, struct vfsmount *mnt)
 	/*
 	 * Display superblock I/O counters
 	 */
-	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+	for_each_possible_cpu(cpu) {
 		struct nfs_iostats *stats;
 
-		if (!cpu_possible(cpu))
-			continue;
-
 		preempt_disable();
 		stats = per_cpu_ptr(nfss->io_stats, cpu);
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 47ece1dd3c67..d86c0db7b1e8 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -1218,7 +1218,7 @@ out:
 	return status;
 }
 
-static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state)
+static int nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, struct nfs4_state *state)
 {
 	struct file *filp;
 
@@ -1227,8 +1227,10 @@ static void nfs4_intent_set_file(struct nameidata *nd, struct dentry *dentry, st
 		struct nfs_open_context *ctx;
 		ctx = (struct nfs_open_context *)filp->private_data;
 		ctx->state = state;
-	} else
-		nfs4_close_state(state, nd->intent.open.flags);
+		return 0;
+	}
+	nfs4_close_state(state, nd->intent.open.flags);
+	return PTR_ERR(filp);
 }
 
 struct dentry *
@@ -1835,7 +1837,7 @@ nfs4_proc_create(struct inode *dir, struct dentry *dentry, struct iattr *sattr,
 		nfs_setattr_update_inode(state->inode, sattr);
 	}
 	if (status == 0 && nd != NULL && (nd->flags & LOOKUP_OPEN))
-		nfs4_intent_set_file(nd, dentry, state);
+		status = nfs4_intent_set_file(nd, dentry, state);
 	else
 		nfs4_close_state(state, flags);
 out:
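One cleanup in this batch, in fs/nfs/inode.c above, replaces an open-coded 0..NR_CPUS scan with a `cpu_possible()` skip by `for_each_possible_cpu()`. The macro below is a simplified stand-in, not the kernel's real definition, just to show the shape of such an iterator:

```c
#include <stdio.h>

#define NR_CPUS 8
/* Stand-in possible map; the kernel derives this from topology */
static const int cpu_possible_map[NR_CPUS] = { 1, 1, 1, 1, 0, 0, 0, 0 };

#define for_each_possible_cpu(cpu) \
	for ((cpu) = 0; (cpu) < NR_CPUS; (cpu)++) \
		if (!cpu_possible_map[(cpu)]) continue; else

int main(void)
{
	int cpu;

	for_each_possible_cpu(cpu)
		printf("stats for cpu %d\n", cpu);	/* cpus 0-3 only */
	return 0;
}
```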
= exp->ex_anon_gid; + cred.cr_group_info = groups_alloc(0); } else if (exp->ex_flags & NFSEXP_ROOTSQUASH) { struct group_info *gi; - if (!cred->cr_uid) - cred->cr_uid = exp->ex_anon_uid; - if (!cred->cr_gid) - cred->cr_gid = exp->ex_anon_gid; - gi = groups_alloc(cred->cr_group_info->ngroups); + if (!cred.cr_uid) + cred.cr_uid = exp->ex_anon_uid; + if (!cred.cr_gid) + cred.cr_gid = exp->ex_anon_gid; + gi = groups_alloc(cred.cr_group_info->ngroups); if (gi) - for (i = 0; i < cred->cr_group_info->ngroups; i++) { - if (!GROUP_AT(cred->cr_group_info, i)) + for (i = 0; i < cred.cr_group_info->ngroups; i++) { + if (!GROUP_AT(cred.cr_group_info, i)) GROUP_AT(gi, i) = exp->ex_anon_gid; else - GROUP_AT(gi, i) = GROUP_AT(cred->cr_group_info, i); + GROUP_AT(gi, i) = GROUP_AT(cred.cr_group_info, i); } - put_group_info(cred->cr_group_info); - cred->cr_group_info = gi; - } + cred.cr_group_info = gi; + } else + get_group_info(cred.cr_group_info); - if (cred->cr_uid != (uid_t) -1) - current->fsuid = cred->cr_uid; + if (cred.cr_uid != (uid_t) -1) + current->fsuid = cred.cr_uid; else current->fsuid = exp->ex_anon_uid; - if (cred->cr_gid != (gid_t) -1) - current->fsgid = cred->cr_gid; + if (cred.cr_gid != (gid_t) -1) + current->fsgid = cred.cr_gid; else current->fsgid = exp->ex_anon_gid; - if (!cred->cr_group_info) + if (!cred.cr_group_info) return -ENOMEM; - ret = set_current_groups(cred->cr_group_info); - if ((cred->cr_uid)) { + ret = set_current_groups(cred.cr_group_info); + put_group_info(cred.cr_group_info); + if ((cred.cr_uid)) { cap_t(current->cap_effective) &= ~CAP_NFSD_MASK; } else { cap_t(current->cap_effective) |= (CAP_NFSD_MASK & diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c340be0a3f59..4e0578121d9a 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -422,7 +422,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) goto out; err = path_lookup(buf, 0, &nd); - if (err) goto out; + if (err) goto out_no_path; exp.h.flags = 0; exp.ex_client = dom; @@ -475,6 +475,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) out: if (nd.dentry) path_release(&nd); + out_no_path: if (dom) auth_domain_put(dom); kfree(buf); diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 6d2dfed1de08..f61142afea44 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -682,7 +682,7 @@ static struct svc_procedure nfsd_procedures3[22] = { PROC(lookup, dirop, dirop, fhandle2, RC_NOCACHE, ST+FH+pAT+pAT), PROC(access, access, access, fhandle, RC_NOCACHE, ST+pAT+1), PROC(readlink, readlink, readlink, fhandle, RC_NOCACHE, ST+pAT+1+NFS3_MAXPATHLEN/4), - PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE), + PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE/4), PROC(write, write, write, fhandle, RC_REPLBUFF, ST+WC+4), PROC(create, create, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), PROC(mkdir, mkdir, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 7391f4aabedb..edb107e61b91 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -710,9 +710,9 @@ calculate_posix_ace_count(struct nfs4_acl *n4acl) /* Also, the remaining entries are for named users and * groups, and come in threes (mask, allow, deny): */ if (n4acl->naces < 7) - return -1; + return -EINVAL; if ((n4acl->naces - 7) % 3) - return -1; + return -EINVAL; return 4 + (n4acl->naces - 7)/3; } } @@ -790,7 +790,7 @@ nfs4_acl_split(struct nfs4_acl *acl, 
struct nfs4_acl *dacl) continue; error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, - ace->access_mask, ace->whotype, ace->who) == -1; + ace->access_mask, ace->whotype, ace->who); if (error < 0) goto out; @@ -866,7 +866,7 @@ nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask, struct nfs4_ace *ace; if ((ace = kmalloc(sizeof(*ace), GFP_KERNEL)) == NULL) - return -1; + return -ENOMEM; ace->type = type; ace->flag = flag; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c872bd07fc10..dbaf3f93f328 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -441,8 +441,9 @@ nfsd4_probe_callback(struct nfs4_client *clp) goto out_clnt; } - /* the task holds a reference to the nfs4_client struct */ cb->cb_client = clnt; + + /* the task holds a reference to the nfs4_client struct */ atomic_inc(&clp->cl_count); msg.rpc_cred = nfsd4_lookupcred(clp,0); @@ -460,13 +461,12 @@ nfsd4_probe_callback(struct nfs4_client *clp) out_rpciod: atomic_dec(&clp->cl_count); rpciod_down(); + cb->cb_client = NULL; out_clnt: rpc_shutdown_client(clnt); - goto out_err; out_err: dprintk("NFSD: warning: no callback path to client %.*s\n", (int)clp->cl_name.len, clp->cl_name.data); - cb->cb_client = NULL; } static void diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6d63f1d9e5f5..b0e095ea0c03 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -288,8 +288,6 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh) fh_put(current_fh); status = exp_pseudoroot(rqstp->rq_client, current_fh, &rqstp->rq_chandle); - if (!status) - status = nfserrno(nfsd_setuser(rqstp, current_fh->fh_export)); return status; } @@ -975,7 +973,7 @@ struct nfsd4_voidargs { int dummy; }; */ static struct svc_procedure nfsd_procedures4[2] = { PROC(null, void, void, void, RC_NOCACHE, 1), - PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE) + PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE/4) }; struct svc_version nfsd_version4 = { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 47ec112b266c..96c7578cbe1e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -147,6 +147,42 @@ get_nfs4_file(struct nfs4_file *fi) kref_get(&fi->fi_ref); } +static int num_delegations; + +/* + * Open owner state (share locks) + */ + +/* hash tables for nfs4_stateowner */ +#define OWNER_HASH_BITS 8 +#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) +#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) + +#define ownerid_hashval(id) \ + ((id) & OWNER_HASH_MASK) +#define ownerstr_hashval(clientid, ownername) \ + (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) + +static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; +static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; + +/* hash table for nfs4_file */ +#define FILE_HASH_BITS 8 +#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) +#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) +/* hash table for (open)nfs4_stateid */ +#define STATEID_HASH_BITS 10 +#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) +#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) + +#define file_hashval(x) \ + hash_ptr(x, FILE_HASH_BITS) +#define stateid_hashval(owner_id, file_id) \ + (((owner_id) + (file_id)) & STATEID_HASH_MASK) + +static struct list_head file_hashtbl[FILE_HASH_SIZE]; +static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) { @@ -155,9 
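[Annotation] The nfs4acl hunks above are one idiom applied three times: helpers that used to signal failure with a bare -1 now return a proper negative errno (-EINVAL, -ENOMEM), and callers propagate it. The nfs4_acl_split() call site also loses a real bug, shown here in miniature with a hypothetical helper that returns 0 or a negative errno:

	/* before: "error" holds the result of the comparison, i.e.
	 * 0 or 1, so the sign test below can never fire */
	error = add_entry(acl) == -1;
	if (error < 0)
		goto out;

	/* after: keep the errno itself and let the caller see it */
	error = add_entry(acl);
	if (error < 0)
		goto out;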
+191,12 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback; dprintk("NFSD alloc_init_deleg\n"); + if (num_delegations > STATEID_HASH_SIZE * 4) + return NULL; dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); if (dp == NULL) return dp; + num_delegations++; INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); @@ -192,6 +231,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp) dprintk("NFSD: freeing dp %p\n",dp); put_nfs4_file(dp->dl_file); kmem_cache_free(deleg_slab, dp); + num_delegations--; } } @@ -330,22 +370,29 @@ put_nfs4_client(struct nfs4_client *clp) } static void +shutdown_callback_client(struct nfs4_client *clp) +{ + struct rpc_clnt *clnt = clp->cl_callback.cb_client; + + /* shutdown rpc client, ending any outstanding recall rpcs */ + if (clnt) { + clp->cl_callback.cb_client = NULL; + rpc_shutdown_client(clnt); + rpciod_down(); + } +} + +static void expire_client(struct nfs4_client *clp) { struct nfs4_stateowner *sop; struct nfs4_delegation *dp; - struct nfs4_callback *cb = &clp->cl_callback; - struct rpc_clnt *clnt = clp->cl_callback.cb_client; struct list_head reaplist; dprintk("NFSD: expire_client cl_count %d\n", atomic_read(&clp->cl_count)); - /* shutdown rpc client, ending any outstanding recall rpcs */ - if (atomic_read(&cb->cb_set) == 1 && clnt) { - rpc_shutdown_client(clnt); - clnt = clp->cl_callback.cb_client = NULL; - } + shutdown_callback_client(clp); INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); @@ -936,40 +983,6 @@ out: return status; } -/* - * Open owner state (share locks) - */ - -/* hash tables for nfs4_stateowner */ -#define OWNER_HASH_BITS 8 -#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) -#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) - -#define ownerid_hashval(id) \ - ((id) & OWNER_HASH_MASK) -#define ownerstr_hashval(clientid, ownername) \ - (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) - -static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; -static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; - -/* hash table for nfs4_file */ -#define FILE_HASH_BITS 8 -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) -/* hash table for (open)nfs4_stateid */ -#define STATEID_HASH_BITS 10 -#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) -#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) - -#define file_hashval(x) \ - hash_ptr(x, FILE_HASH_BITS) -#define stateid_hashval(owner_id, file_id) \ - (((owner_id) + (file_id)) & STATEID_HASH_MASK) - -static struct list_head file_hashtbl[FILE_HASH_SIZE]; -static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; - /* OPEN Share state helper functions */ static inline struct nfs4_file * alloc_init_file(struct inode *ino) @@ -1186,8 +1199,7 @@ move_to_close_lru(struct nfs4_stateowner *sop) { dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); - unhash_stateowner(sop); - list_add_tail(&sop->so_close_lru, &close_lru); + list_move_tail(&sop->so_close_lru, &close_lru); sop->so_time = get_seconds(); } @@ -1916,8 +1928,7 @@ nfs4_laundromat(void) } dprintk("NFSD: purging unused open stateowner (so_id %d)\n", sop->so_id); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); + release_stateowner(sop); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; @@ -2495,36 +2506,27 @@ nfs4_transform_lock_offset(struct file_lock *lock) lock->fl_end = 
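[Annotation] Two things worth noting in the nfs4state.c hunks above. The new num_delegations counter puts a rough ceiling (STATEID_HASH_SIZE * 4) on outstanding delegations so their allocations stay bounded. And move_to_close_lru() now uses list_move_tail(), the canonical replacement for a detach-then-append pair; for a hypothetical node on some list:

	list_del(&node->lru);
	list_add_tail(&node->lru, &dst_list);

	/* is equivalent to the single call */
	list_move_tail(&node->lru, &dst_list);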
OFFSET_MAX; } -static int -nfs4_verify_lock_stateowner(struct nfs4_stateowner *sop, unsigned int hashval) -{ - struct nfs4_stateowner *local = NULL; - int status = 0; - - if (hashval >= LOCK_HASH_SIZE) - goto out; - list_for_each_entry(local, &lock_ownerid_hashtbl[hashval], so_idhash) { - if (local == sop) { - status = 1; - goto out; - } - } -out: - return status; -} - +/* Hack!: For now, we're defining this just so we can use a pointer to it + * as a unique cookie to identify our (NFSv4's) posix locks. */ +static struct lock_manager_operations nfsd_posix_mng_ops = { +}; static inline void nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) { - struct nfs4_stateowner *sop = (struct nfs4_stateowner *) fl->fl_owner; - unsigned int hval = lockownerid_hashval(sop->so_id); + struct nfs4_stateowner *sop; + unsigned int hval; - deny->ld_sop = NULL; - if (nfs4_verify_lock_stateowner(sop, hval)) { + if (fl->fl_lmops == &nfsd_posix_mng_ops) { + sop = (struct nfs4_stateowner *) fl->fl_owner; + hval = lockownerid_hashval(sop->so_id); kref_get(&sop->so_ref); deny->ld_sop = sop; deny->ld_clientid = sop->so_client->cl_clientid; + } else { + deny->ld_sop = NULL; + deny->ld_clientid.cl_boot = 0; + deny->ld_clientid.cl_id = 0; } deny->ld_start = fl->fl_start; deny->ld_length = ~(u64)0; @@ -2736,6 +2738,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lock->lk_offset; if ((lock->lk_length == ~(u64)0) || @@ -2841,6 +2844,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; file_lock.fl_pid = current->tgid; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lockt->lt_offset; if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length)) @@ -2900,6 +2904,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = locku->lu_offset; if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length)) @@ -3211,15 +3216,8 @@ __nfs4_state_shutdown(void) int i; struct nfs4_client *clp = NULL; struct nfs4_delegation *dp = NULL; - struct nfs4_stateowner *sop = NULL; struct list_head *pos, *next, reaplist; - list_for_each_safe(pos, next, &close_lru) { - sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); - } - for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&conf_id_hashtbl[i])) { clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); @@ -3244,8 +3242,6 @@ __nfs4_state_shutdown(void) } cancel_delayed_work(&laundromat_work); - flush_workqueue(laundry_wq); - destroy_workqueue(laundry_wq); nfsd4_shutdown_recdir(); nfs4_init = 0; } @@ -3253,6 +3249,8 @@ __nfs4_state_shutdown(void) void nfs4_state_shutdown(void) { + cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work); + destroy_workqueue(laundry_wq); nfs4_lock_state(); nfs4_release_reclaim(); __nfs4_state_shutdown(); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 03857fd81126..de3998f15f10 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -299,11 +299,10 @@ nfsd4_decode_fattr(struct 
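[Annotation] The empty nfsd_posix_mng_ops above is, as its own comment admits, a hack, but an instructive one: the address of a file-scope static structure is a kernel-wide unique value, so it can serve as an identity cookie. nfs4_set_lock_denied() may only cast fl_owner to an nfs4_stateowner for locks that NFSv4 itself created, and the lmops pointer is how it tells. Reduced to its essentials, with names as in the patch:

static struct lock_manager_operations nfsd_posix_mng_ops = {
	/* intentionally empty; only the address matters */
};

	/* when creating one of our locks: */
	file_lock.fl_lmops = &nfsd_posix_mng_ops;

	/* when inspecting an arbitrary lock: */
	if (fl->fl_lmops == &nfsd_posix_mng_ops) {
		struct nfs4_stateowner *sop =
			(struct nfs4_stateowner *)fl->fl_owner;
		/* safe: fl_owner is known to be ours */
	}

This also replaces nfs4_verify_lock_stateowner(), which walked a hash chain to answer the same question far more expensively.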
nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia buf, dummy32, &ace.who); if (status) goto out_nfserr; - if (nfs4_acl_add_ace(*acl, ace.type, ace.flag, - ace.access_mask, ace.whotype, ace.who) != 0) { - status = -ENOMEM; + status = nfs4_acl_add_ace(*acl, ace.type, ace.flag, + ace.access_mask, ace.whotype, ace.who); + if (status) goto out_nfserr; - } } } else *acl = NULL; @@ -2085,27 +2084,20 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read WRITE32(eof); WRITE32(maxcount); ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; - + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; resp->xbuf->page_len = maxcount; - /* read zero bytes -> don't set up tail */ - if(!maxcount) - return 0; - - /* set up page for remaining responses */ - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = p; resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; - if (maxcount&3) { - *(resp->p)++ = 0; + RESERVE_SPACE(4); + WRITE32(0); resp->xbuf->tail[0].iov_base += maxcount&3; resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); } return 0; } @@ -2142,21 +2134,20 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r WRITE32(maxcount); ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->page_len = maxcount; - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = p; resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; - - resp->xbuf->page_len = maxcount; if (maxcount&3) { - *(resp->p)++ = 0; + RESERVE_SPACE(4); + WRITE32(0); resp->xbuf->tail[0].iov_base += maxcount&3; resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); } return 0; } @@ -2166,7 +2157,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re { int maxcount; loff_t offset; - u32 *page, *savep; + u32 *page, *savep, *tailbase; ENCODE_HEAD; if (nfserr) @@ -2182,6 +2173,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re WRITE32(0); ADJUST_ARGS(); resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; + tailbase = p; maxcount = PAGE_SIZE; if (maxcount > readdir->rd_maxcount) @@ -2226,14 +2218,12 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re *p++ = htonl(readdir->common.err == nfserr_eof); resp->xbuf->page_len = ((char*)p) - (char*)page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - /* allocate a page for the tail */ - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + 
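[Annotation] Both encode paths above (read and readlink) end with the same padding step, and readdir follows. XDR requires opaque data to be rounded up to a 4-byte boundary, so after the payload the encoder reserves one zero word in the head buffer and points the tail iovec at just the pad bytes. The arithmetic, pulled out into a hypothetical helper:

static inline unsigned int xdr_pad_bytes(unsigned int len)
{
	/* 0 for word-aligned lengths, otherwise 1..3 zero bytes */
	return (4 - (len & 3)) & 3;
}

For maxcount payload bytes the hunks above set tail[0].iov_len to exactly this value, advancing iov_base by maxcount & 3 so the tail supplies only the zero bytes that complete the final word.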
resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = tailbase; resp->xbuf->tail[0].iov_len = 0; resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; + resp->end = resp->p + (PAGE_SIZE - resp->xbuf->head[0].iov_len)/4; return 0; err_no_verf: diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 3e6b75cd90fd..06cd0db0f32b 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -553,7 +553,7 @@ static struct svc_procedure nfsd_procedures2[18] = { PROC(none, void, void, none, RC_NOCACHE, ST), PROC(lookup, diropargs, diropres, fhandle, RC_NOCACHE, ST+FH+AT), PROC(readlink, readlinkargs, readlinkres, none, RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4), - PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE), + PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4), PROC(none, void, void, none, RC_NOCACHE, ST), PROC(write, writeargs, attrstat, fhandle, RC_REPLBUFF, ST+AT), PROC(create, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT), diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 31018333dc38..6aa92d0e6876 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -371,7 +371,6 @@ out_nfserr: static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) { ssize_t buflen; - int error; buflen = vfs_getxattr(dentry, key, NULL, 0); if (buflen <= 0) @@ -381,10 +380,7 @@ static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) if (!*buf) return -ENOMEM; - error = vfs_getxattr(dentry, key, *buf, buflen); - if (error < 0) - return error; - return buflen; + return vfs_getxattr(dentry, key, *buf, buflen); } #endif diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index bff0f0d06867..21f38accd039 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -153,6 +153,7 @@ struct o2hb_region { struct o2hb_bio_wait_ctxt { atomic_t wc_num_reqs; struct completion wc_io_complete; + int wc_error; }; static void o2hb_write_timeout(void *arg) @@ -186,6 +187,7 @@ static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, { atomic_set(&wc->wc_num_reqs, num_ios); init_completion(&wc->wc_io_complete); + wc->wc_error = 0; } /* Used in error paths too */ @@ -218,8 +220,10 @@ static int o2hb_bio_end_io(struct bio *bio, { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (error) + if (error) { mlog(ML_ERROR, "IO Error %d\n", error); + wc->wc_error = error; + } if (bio->bi_size) return 1; @@ -390,6 +394,8 @@ static int o2hb_read_slots(struct o2hb_region *reg, bail_and_wait: o2hb_wait_on_io(reg, &wc); + if (wc.wc_error && !status) + status = wc.wc_error; if (bios) { for(i = 0; i < num_bios; i++) @@ -790,20 +796,24 @@ static int o2hb_highest_node(unsigned long *nodes, return highest; } -static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) +static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { int i, ret, highest_node, change = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct bio *write_bio; struct o2hb_bio_wait_ctxt write_wc; - if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes))) - return; + ret = o2nm_configured_node_map(configured_nodes, + sizeof(configured_nodes)); + if (ret) { + mlog_errno(ret); + return ret; + } highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); if (highest_node >= O2NM_MAX_NODES) { mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); - return; + return -EINVAL; } /* No sense in reading the slots of nodes that don't exist @@ -813,7 +823,7 @@ static void 
o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_read_slots(reg, highest_node + 1); if (ret < 0) { mlog_errno(ret); - return; + return ret; } /* With an up to date view of the slots, we can check that no @@ -831,7 +841,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); if (ret < 0) { mlog_errno(ret); - return; + return ret; } i = -1; @@ -847,6 +857,15 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) */ o2hb_wait_on_io(reg, &write_wc); bio_put(write_bio); + if (write_wc.wc_error) { + /* Do not re-arm the write timeout on I/O error - we + * can't be sure that the new block ever made it to + * disk */ + mlog(ML_ERROR, "Write error %d on device \"%s\"\n", + write_wc.wc_error, reg->hr_dev_name); + return write_wc.wc_error; + } + o2hb_arm_write_timeout(reg); /* let the person who launched us know when things are steady */ @@ -854,6 +873,8 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) if (atomic_dec_and_test(®->hr_steady_iterations)) wake_up(&o2hb_steady_queue); } + + return 0; } /* Subtract b from a, storing the result in a. a *must* have a larger @@ -913,7 +934,10 @@ static int o2hb_thread(void *data) * likely to time itself out. */ do_gettimeofday(&before_hb); - o2hb_do_disk_heartbeat(reg); + i = 0; + do { + ret = o2hb_do_disk_heartbeat(reg); + } while (ret && ++i < 2); do_gettimeofday(&after_hb); elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index c3764f4744ee..74ca4e5f9765 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -139,6 +139,10 @@ static void user_ast(void *opaque) return; } + mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, + "Lockres %s, requested ivmode. flags 0x%x\n", + lockres->l_name, lockres->l_flags); + /* we're downconverting. */ if (lockres->l_requested < lockres->l_level) { if (lockres->l_requested <= @@ -229,23 +233,42 @@ static void user_unlock_ast(void *opaque, enum dlm_status status) mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name); - if (status != DLM_NORMAL) + if (status != DLM_NORMAL && status != DLM_CANCELGRANT) mlog(ML_ERROR, "Dlm returns status %d\n", status); spin_lock(&lockres->l_lock); - if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) + /* The teardown flag gets set early during the unlock process, + * so test the cancel flag to make sure that this ast isn't + * for a concurrent cancel. */ + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN + && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { lockres->l_level = LKM_IVMODE; - else { + } else if (status == DLM_CANCELGRANT) { + mlog(0, "Lock %s, cancel fails, flags 0x%x\n", + lockres->l_name, lockres->l_flags); + /* We tried to cancel a convert request, but it was + * already granted. Don't clear the busy flag - the + * ast should've done this already. */ + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + goto out_noclear; + } else { + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + /* Cancel succeeded, we want to re-queue */ + mlog(0, "Lock %s, cancel succeeds, flags 0x%x\n", + lockres->l_name, lockres->l_flags); lockres->l_requested = LKM_IVMODE; /* cancel an * upconvert * request. */ lockres->l_flags &= ~USER_LOCK_IN_CANCEL; /* we want the unblock thread to look at it again * now. 
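[Annotation] The heartbeat rework above threads I/O status back to the submitter instead of swallowing it: the wait context gains a wc_error field that the bio end_io handler fills in, o2hb_do_disk_heartbeat() now returns a status, and o2hb_thread() retries a failed beat once (the do { } while (ret && ++i < 2) loop) before moving on. A condensed sketch of the completion side, simplified from the patch with hypothetical names:

struct hb_wait_ctxt {
	atomic_t		num_reqs;
	struct completion	io_complete;
	int			error;		/* first error seen, if any */
};

static int hb_bio_end_io(struct bio *bio, unsigned int bytes_done, int error)
{
	struct hb_wait_ctxt *wc = bio->bi_private;

	if (error)
		wc->error = error;	/* remembered for the waiter */
	if (bio->bi_size)
		return 1;		/* partial completion, not done yet */
	if (atomic_dec_and_test(&wc->num_reqs))
		complete(&wc->io_complete);
	return 0;
}

After waiting, the submitter folds wc.error into its own return value, which is what finally lets the write timeout not be re-armed after a failed heartbeat write.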
*/ - __user_dlm_queue_lockres(lockres); + if (lockres->l_flags & USER_LOCK_BLOCKED) + __user_dlm_queue_lockres(lockres); } lockres->l_flags &= ~USER_LOCK_BUSY; +out_noclear: spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); @@ -268,13 +291,26 @@ static void user_dlm_unblock_lock(void *opaque) spin_lock(&lockres->l_lock); - BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); - BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED)); + mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), + "Lockres %s, flags 0x%x\n", + lockres->l_name, lockres->l_flags); - /* notice that we don't clear USER_LOCK_BLOCKED here. That's - * for user_ast to do. */ + /* notice that we don't clear USER_LOCK_BLOCKED here. If it's + * set, we want user_ast clear it. */ lockres->l_flags &= ~USER_LOCK_QUEUED; + /* It's valid to get here and no longer be blocked - if we get + * several basts in a row, we might be queued by the first + * one, the unblock thread might run and clear the queued + * flag, and finally we might get another bast which re-queues + * us before our ast for the downconvert is called. */ + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { + mlog(0, "Lockres %s, flags 0x%x: queued but not blocking\n", + lockres->l_name, lockres->l_flags); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { mlog(0, "lock is in teardown so we do nothing\n"); spin_unlock(&lockres->l_lock); @@ -282,7 +318,9 @@ static void user_dlm_unblock_lock(void *opaque) } if (lockres->l_flags & USER_LOCK_BUSY) { - mlog(0, "BUSY flag detected...\n"); + mlog(0, "Cancel lock %s, flags 0x%x\n", + lockres->l_name, lockres->l_flags); + if (lockres->l_flags & USER_LOCK_IN_CANCEL) { spin_unlock(&lockres->l_lock); goto drop_ref; @@ -296,14 +334,7 @@ static void user_dlm_unblock_lock(void *opaque) LKM_CANCEL, user_unlock_ast, lockres); - if (status == DLM_CANCELGRANT) { - /* If we got this, then the ast was fired - * before we could cancel. We cleanup our - * state, and restart the function. */ - spin_lock(&lockres->l_lock); - lockres->l_flags &= ~USER_LOCK_IN_CANCEL; - spin_unlock(&lockres->l_lock); - } else if (status != DLM_NORMAL) + if (status != DLM_NORMAL) user_log_dlm_error("dlmunlock", status, lockres); goto drop_ref; } @@ -581,6 +612,14 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) mlog(0, "asked to destroy %s\n", lockres->l_name); spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + mlog(0, "Lock is already torn down\n"); + spin_unlock(&lockres->l_lock); + return 0; + } + + lockres->l_flags |= USER_LOCK_IN_TEARDOWN; + while (lockres->l_flags & USER_LOCK_BUSY) { spin_unlock(&lockres->l_lock); @@ -606,7 +645,6 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) lockres->l_flags &= ~USER_LOCK_ATTACHED; lockres->l_flags |= USER_LOCK_BUSY; - lockres->l_flags |= USER_LOCK_IN_TEARDOWN; spin_unlock(&lockres->l_lock); mlog(0, "unlocking lockres %s\n", lockres->l_name); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 34e903a6a46b..581eb451a41a 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -260,6 +260,17 @@ static int ocfs2_truncate_file(struct inode *inode, if (new_i_size == le64_to_cpu(fe->i_size)) goto bail; + /* This forces other nodes to sync and drop their pages. Do + * this even if we have a truncate without allocation change - + * ocfs2 cluster sizes can be much greater than page size, so + * we have to truncate them anyway. 
*/ + status = ocfs2_data_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + ocfs2_data_unlock(inode, 1); + if (le32_to_cpu(fe->i_clusters) == ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", @@ -272,14 +283,6 @@ static int ocfs2_truncate_file(struct inode *inode, goto bail; } - /* This forces other nodes to sync and drop their pages */ - status = ocfs2_data_lock(inode, 1); - if (status < 0) { - mlog_errno(status); - goto bail; - } - ocfs2_data_unlock(inode, 1); - /* alright, we're going to need to do a full blown alloc size * change. Orphan the inode so that recovery can complete the * truncate if necessary. This does the task of marking diff --git a/fs/open.c b/fs/open.c index c32c89d6d8db..53ec28c36777 100644 --- a/fs/open.c +++ b/fs/open.c @@ -331,7 +331,10 @@ out: asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) { - return do_sys_ftruncate(fd, length, 1); + long ret = do_sys_ftruncate(fd, length, 1); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } /* LFS versions of truncate are only needed on 32 bit machines */ @@ -343,7 +346,10 @@ asmlinkage long sys_truncate64(const char __user * path, loff_t length) asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) { - return do_sys_ftruncate(fd, length, 0); + long ret = do_sys_ftruncate(fd, length, 0); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } #endif @@ -1093,20 +1099,30 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode) asmlinkage long sys_open(const char __user *filename, int flags, int mode) { + long ret; + if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(AT_FDCWD, filename, flags, mode); + ret = do_sys_open(AT_FDCWD, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } EXPORT_SYMBOL_GPL(sys_open); asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, int mode) { + long ret; + if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(dfd, filename, flags, mode); + ret = do_sys_open(dfd, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } EXPORT_SYMBOL_GPL(sys_openat); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index af0cb4b9e784..45ae7dd3c650 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -331,7 +331,9 @@ void delete_partition(struct gendisk *disk, int part) devfs_remove("%s/part%d", disk->devfs_name, part); if (p->holder_dir) kobject_unregister(p->holder_dir); - kobject_unregister(&p->kobj); + kobject_uevent(&p->kobj, KOBJ_REMOVE); + kobject_del(&p->kobj); + kobject_put(&p->kobj); } void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) @@ -357,7 +359,10 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) snprintf(p->kobj.name,KOBJ_NAME_LEN,"%s%d",disk->kobj.name,part); p->kobj.parent = &disk->kobj; p->kobj.ktype = &ktype_part; - kobject_register(&p->kobj); + kobject_init(&p->kobj); + kobject_add(&p->kobj); + if (!disk->part_uevent_suppress) + kobject_uevent(&p->kobj, KOBJ_ADD); partition_sysfs_add_subdir(p); disk->part[part-1] = p; } @@ -367,6 +372,7 @@ static char *make_block_name(struct gendisk *disk) char *name; static char *block_str = "block:"; int size; + char *s; size = strlen(block_str) + strlen(disk->disk_name) + 1; name = kmalloc(size, GFP_KERNEL); @@ -374,6 +380,10 @@ 
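[Annotation] All four syscall stubs in the fs/open.c hunks above grow identical boilerplate. With register-argument calling conventions on x86 (REGPARM), gcc is free to compile "return do_sys_open(...);" as a tail jump that reuses the incoming argument stack, but an asmlinkage function's argument stack belongs to the syscall entry code and must not be scribbled on. Storing the result and calling prevent_tail_call() forces an ordinary call. The shape of the fix, with a hypothetical worker function:

asmlinkage long sys_example(unsigned int fd, unsigned long arg)
{
	long ret = do_example(fd, arg);

	/* avoid REGPARM breakage on x86: */
	prevent_tail_call(ret);
	return ret;
}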
static char *make_block_name(struct gendisk *disk) return NULL; strcpy(name, block_str); strcat(name, disk->disk_name); + /* ewww... some of these buggers have / in name... */ + s = strchr(name, '/'); + if (s) + *s = '!'; return name; } @@ -395,6 +405,8 @@ void register_disk(struct gendisk *disk) { struct block_device *bdev; char *s; + int i; + struct hd_struct *p; int err; strlcpy(disk->kobj.name,disk->disk_name,KOBJ_NAME_LEN); @@ -406,13 +418,12 @@ void register_disk(struct gendisk *disk) return; disk_sysfs_symlinks(disk); disk_sysfs_add_subdirs(disk); - kobject_uevent(&disk->kobj, KOBJ_ADD); /* No minors to use for partitions */ if (disk->minors == 1) { if (disk->devfs_name[0] != '\0') devfs_add_disk(disk); - return; + goto exit; } /* always add handle for the whole disk */ @@ -420,16 +431,32 @@ void register_disk(struct gendisk *disk) /* No such device (e.g., media were just removed) */ if (!get_capacity(disk)) - return; + goto exit; bdev = bdget_disk(disk, 0); if (!bdev) - return; + goto exit; + /* scan partition table, but suppress uevents */ bdev->bd_invalidated = 1; - if (blkdev_get(bdev, FMODE_READ, 0) < 0) - return; + disk->part_uevent_suppress = 1; + err = blkdev_get(bdev, FMODE_READ, 0); + disk->part_uevent_suppress = 0; + if (err < 0) + goto exit; blkdev_put(bdev); + +exit: + /* announce disk after possible partitions are already created */ + kobject_uevent(&disk->kobj, KOBJ_ADD); + + /* announce possible partitions */ + for (i = 1; i < disk->minors; i++) { + p = disk->part[i-1]; + if (!p || !p->nr_sects) + continue; + kobject_uevent(&p->kobj, KOBJ_ADD); + } } int rescan_partitions(struct gendisk *disk, struct block_device *bdev) diff --git a/fs/pipe.c b/fs/pipe.c index 795df987cd38..7fefb10db8d9 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -36,7 +36,7 @@ */ /* Drop the inode semaphore and wait for a pipe event, atomically */ -void pipe_wait(struct inode * inode) +void pipe_wait(struct pipe_inode_info *pipe) { DEFINE_WAIT(wait); @@ -44,11 +44,14 @@ void pipe_wait(struct inode * inode) * Pipes are system-local resources, so sleeping on them * is considered a noninteractive wait: */ - prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); - mutex_unlock(PIPE_MUTEX(*inode)); + prepare_to_wait(&pipe->wait, &wait, + TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); schedule(); - finish_wait(PIPE_WAIT(*inode), &wait); - mutex_lock(PIPE_MUTEX(*inode)); + finish_wait(&pipe->wait, &wait); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); } static int @@ -91,7 +94,8 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) return 0; } -static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buffer *buf) +static void anon_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; @@ -100,42 +104,46 @@ static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buff /* * If nobody else uses this page, and we don't already have a * temporary page, let's keep track of it as a one-deep - * allocation cache + * allocation cache. 
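[Annotation] Two separate fixes ride along in the partitions/check.c hunks above. First, register_disk() now suppresses partition uevents while the initial partition scan runs (part_uevent_suppress), then announces the whole disk and each non-empty partition in order, so userspace never sees a partition event before its disk. Second, make_block_name() sanitizes device names that legitimately contain a slash, such as "cciss/c0d0":

	/* a path component must not contain '/'; follow the
	 * convention of turning it into '!' */
	s = strchr(name, '/');
	if (s)
		*s = '!';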
(Otherwise just release our reference to it) */ - if (page_count(page) == 1 && !info->tmp_page) { - info->tmp_page = page; - return; - } - - /* - * Otherwise just release our reference to it - */ - page_cache_release(page); + if (page_count(page) == 1 && !pipe->tmp_page) + pipe->tmp_page = page; + else + page_cache_release(page); } -static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf) +static void * anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { return kmap(buf->page); } -static void anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) +static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { kunmap(buf->page); } -static int anon_pipe_buf_steal(struct pipe_inode_info *info, +static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { buf->flags |= PIPE_BUF_FLAG_STOLEN; return 0; } +static void anon_pipe_buf_get(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} + static struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, .map = anon_pipe_buf_map, .unmap = anon_pipe_buf_unmap, .release = anon_pipe_buf_release, .steal = anon_pipe_buf_steal, + .get = anon_pipe_buf_get, }; static ssize_t @@ -143,7 +151,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, unsigned long nr_segs, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; int do_wakeup; ssize_t ret; struct iovec *iov = (struct iovec *)_iov; @@ -156,13 +164,13 @@ pipe_readv(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(PIPE_MUTEX(*inode)); - info = inode->i_pipe; + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; for (;;) { - int bufs = info->nrbufs; + int bufs = pipe->nrbufs; if (bufs) { - int curbuf = info->curbuf; - struct pipe_buffer *buf = info->bufs + curbuf; + int curbuf = pipe->curbuf; + struct pipe_buffer *buf = pipe->bufs + curbuf; struct pipe_buf_operations *ops = buf->ops; void *addr; size_t chars = buf->len; @@ -171,16 +179,17 @@ pipe_readv(struct file *filp, const struct iovec *_iov, if (chars > total_len) chars = total_len; - addr = ops->map(filp, info, buf); + addr = ops->map(filp, pipe, buf); if (IS_ERR(addr)) { if (!ret) ret = PTR_ERR(addr); break; } error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); - ops->unmap(info, buf); + ops->unmap(pipe, buf); if (unlikely(error)) { - if (!ret) ret = -EFAULT; + if (!ret) + ret = -EFAULT; break; } ret += chars; @@ -188,10 +197,10 @@ pipe_readv(struct file *filp, const struct iovec *_iov, buf->len -= chars; if (!buf->len) { buf->ops = NULL; - ops->release(info, buf); + ops->release(pipe, buf); curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); - info->curbuf = curbuf; - info->nrbufs = --bufs; + pipe->curbuf = curbuf; + pipe->nrbufs = --bufs; do_wakeup = 1; } total_len -= chars; @@ -200,9 +209,9 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } if (bufs) /* More to do? */ continue; - if (!PIPE_WRITERS(*inode)) + if (!pipe->writers) break; - if (!PIPE_WAITING_WRITERS(*inode)) { + if (!pipe->waiting_writers) { /* syscall merging: Usually we must not sleep * if O_NONBLOCK is set, or if we got some data. 
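[Annotation] A detail that matters for the splice code at the end of this diff: pipe_wait() now takes the pipe_inode_info itself, and the pipe locking sites test pipe->inode for NULL before touching i_mutex. That makes it legal for a pipe buffer ring to exist with no pipefs inode behind it. The convention, in outline:

	if (pipe->inode)
		mutex_lock(&pipe->inode->i_mutex);

	/* ... manipulate pipe->bufs / pipe->nrbufs / pipe->curbuf ... */

	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);

move_to_pipe() in fs/splice.c, further down, follows exactly this pattern.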
* But if a writer sleeps in kernel space, then @@ -216,20 +225,22 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } } if (signal_pending(current)) { - if (!ret) ret = -ERESTARTSYS; + if (!ret) + ret = -ERESTARTSYS; break; } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - pipe_wait(inode); + pipe_wait(pipe); } - mutex_unlock(PIPE_MUTEX(*inode)); - /* Signal writers asynchronously that there is more room. */ + mutex_unlock(&inode->i_mutex); + + /* Signal writers asynchronously that there is more room. */ if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) file_accessed(filp); @@ -240,6 +251,7 @@ static ssize_t pipe_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = count }; + return pipe_readv(filp, &iov, 1, ppos); } @@ -248,7 +260,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, unsigned long nr_segs, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; ssize_t ret; int do_wakeup; struct iovec *iov = (struct iovec *)_iov; @@ -262,10 +274,10 @@ pipe_writev(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(PIPE_MUTEX(*inode)); - info = inode->i_pipe; + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; - if (!PIPE_READERS(*inode)) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; @@ -273,23 +285,25 @@ pipe_writev(struct file *filp, const struct iovec *_iov, /* We try to merge small writes */ chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ - if (info->nrbufs && chars != 0) { - int lastbuf = (info->curbuf + info->nrbufs - 1) & (PIPE_BUFFERS-1); - struct pipe_buffer *buf = info->bufs + lastbuf; + if (pipe->nrbufs && chars != 0) { + int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & + (PIPE_BUFFERS-1); + struct pipe_buffer *buf = pipe->bufs + lastbuf; struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; + if (ops->can_merge && offset + chars <= PAGE_SIZE) { void *addr; int error; - addr = ops->map(filp, info, buf); + addr = ops->map(filp, pipe, buf); if (IS_ERR(addr)) { error = PTR_ERR(addr); goto out; } error = pipe_iov_copy_from_user(offset + addr, iov, chars); - ops->unmap(info, buf); + ops->unmap(pipe, buf); ret = error; do_wakeup = 1; if (error) @@ -304,16 +318,18 @@ pipe_writev(struct file *filp, const struct iovec *_iov, for (;;) { int bufs; - if (!PIPE_READERS(*inode)) { + + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); - if (!ret) ret = -EPIPE; + if (!ret) + ret = -EPIPE; break; } - bufs = info->nrbufs; + bufs = pipe->nrbufs; if (bufs < PIPE_BUFFERS) { - int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS-1); - struct pipe_buffer *buf = info->bufs + newbuf; - struct page *page = info->tmp_page; + int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + struct page *page = pipe->tmp_page; int error; if (!page) { @@ -322,9 +338,9 @@ pipe_writev(struct file *filp, const struct iovec *_iov, ret = ret ? 
: -ENOMEM; break; } - info->tmp_page = page; + pipe->tmp_page = page; } - /* Always wakeup, even if the copy fails. Otherwise + /* Always wake up, even if the copy fails. Otherwise * we lock up (O_NONBLOCK-)readers that sleep due to * syscall merging. * FIXME! Is this really true? @@ -337,7 +353,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov, error = pipe_iov_copy_from_user(kmap(page), iov, chars); kunmap(page); if (unlikely(error)) { - if (!ret) ret = -EFAULT; + if (!ret) + ret = -EFAULT; break; } ret += chars; @@ -347,8 +364,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov, buf->ops = &anon_pipe_buf_ops; buf->offset = 0; buf->len = chars; - info->nrbufs = ++bufs; - info->tmp_page = NULL; + pipe->nrbufs = ++bufs; + pipe->tmp_page = NULL; total_len -= chars; if (!total_len) @@ -357,27 +374,29 @@ pipe_writev(struct file *filp, const struct iovec *_iov, if (bufs < PIPE_BUFFERS) continue; if (filp->f_flags & O_NONBLOCK) { - if (!ret) ret = -EAGAIN; + if (!ret) + ret = -EAGAIN; break; } if (signal_pending(current)) { - if (!ret) ret = -ERESTARTSYS; + if (!ret) + ret = -ERESTARTSYS; break; } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); - PIPE_WAITING_WRITERS(*inode)--; + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; } out: - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0) file_update_time(filp); @@ -389,6 +408,7 @@ pipe_write(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + return pipe_writev(filp, &iov, 1, ppos); } @@ -399,7 +419,8 @@ bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) } static ssize_t -bad_pipe_w(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) +bad_pipe_w(struct file *filp, const char __user *buf, size_t count, + loff_t *ppos) { return -EBADF; } @@ -409,21 +430,22 @@ pipe_ioctl(struct inode *pino, struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; int count, buf, nrbufs; switch (cmd) { case FIONREAD: - mutex_lock(PIPE_MUTEX(*inode)); - info = inode->i_pipe; + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; count = 0; - buf = info->curbuf; - nrbufs = info->nrbufs; + buf = pipe->curbuf; + nrbufs = pipe->nrbufs; while (--nrbufs >= 0) { - count += info->bufs[buf].len; + count += pipe->bufs[buf].len; buf = (buf+1) & (PIPE_BUFFERS-1); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); + return put_user(count, (int __user *)arg); default: return -EINVAL; @@ -436,17 +458,17 @@ pipe_poll(struct file *filp, poll_table *wait) { unsigned int mask; struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info = inode->i_pipe; + struct pipe_inode_info *pipe = inode->i_pipe; int nrbufs; - poll_wait(filp, PIPE_WAIT(*inode), wait); + poll_wait(filp, &pipe->wait, wait); /* Reading only -- no need for acquiring the semaphore. 
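[Annotation] The FIONREAD path above computes the number of readable bytes by walking the buffer ring and summing each buffer's len. Note the index step: PIPE_BUFFERS is a power of two (16 in this kernel), so advancing around the ring is a mask, not a modulo, a pattern used throughout the pipe code. As a hypothetical helper:

static inline int pipe_ring_next(int idx)
{
	/* valid only because PIPE_BUFFERS (16) is a power of two */
	return (idx + 1) & (PIPE_BUFFERS - 1);
}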
*/ - nrbufs = info->nrbufs; + nrbufs = pipe->nrbufs; mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; - if (!PIPE_WRITERS(*inode) && filp->f_version != PIPE_WCOUNTER(*inode)) + if (!pipe->writers && filp->f_version != pipe->w_counter) mask |= POLLHUP; } @@ -456,7 +478,7 @@ pipe_poll(struct file *filp, poll_table *wait) * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). */ - if (!PIPE_READERS(*inode)) + if (!pipe->readers) mask |= POLLERR; } @@ -466,17 +488,21 @@ pipe_poll(struct file *filp, poll_table *wait) static int pipe_release(struct inode *inode, int decr, int decw) { - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_READERS(*inode) -= decr; - PIPE_WRITERS(*inode) -= decw; - if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) { + struct pipe_inode_info *pipe; + + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + pipe->readers -= decr; + pipe->writers -= decw; + + if (!pipe->readers && !pipe->writers) { free_pipe_info(inode); } else { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return 0; } @@ -487,9 +513,9 @@ pipe_read_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - mutex_lock(PIPE_MUTEX(*inode)); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -504,9 +530,9 @@ pipe_write_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - mutex_lock(PIPE_MUTEX(*inode)); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -519,16 +545,17 @@ static int pipe_rdwr_fasync(int fd, struct file *filp, int on) { struct inode *inode = filp->f_dentry->d_inode; + struct pipe_inode_info *pipe = inode->i_pipe; int retval; - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); + retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if (retval >= 0) - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); + retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -567,9 +594,9 @@ pipe_read_open(struct inode *inode, struct file *filp) { /* We could have perhaps used atomic_t, but this and friends below are the only places. So it doesn't seem worthwhile. 
*/ - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_READERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + inode->i_pipe->readers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -577,9 +604,9 @@ pipe_read_open(struct inode *inode, struct file *filp) static int pipe_write_open(struct inode *inode, struct file *filp) { - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_WRITERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + inode->i_pipe->writers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -587,12 +614,12 @@ pipe_write_open(struct inode *inode, struct file *filp) static int pipe_rdwr_open(struct inode *inode, struct file *filp) { - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); if (filp->f_mode & FMODE_READ) - PIPE_READERS(*inode)++; + inode->i_pipe->readers++; if (filp->f_mode & FMODE_WRITE) - PIPE_WRITERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + inode->i_pipe->writers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -675,37 +702,38 @@ static struct file_operations rdwr_pipe_fops = { .fasync = pipe_rdwr_fasync, }; -void free_pipe_info(struct inode *inode) +struct pipe_inode_info * alloc_pipe_info(struct inode *inode) +{ + struct pipe_inode_info *pipe; + + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + if (pipe) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->inode = inode; + } + + return pipe; +} + +void __free_pipe_info(struct pipe_inode_info *pipe) { int i; - struct pipe_inode_info *info = inode->i_pipe; - inode->i_pipe = NULL; for (i = 0; i < PIPE_BUFFERS; i++) { - struct pipe_buffer *buf = info->bufs + i; + struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) - buf->ops->release(info, buf); + buf->ops->release(pipe, buf); } - if (info->tmp_page) - __free_page(info->tmp_page); - kfree(info); + if (pipe->tmp_page) + __free_page(pipe->tmp_page); + kfree(pipe); } -struct inode* pipe_new(struct inode* inode) +void free_pipe_info(struct inode *inode) { - struct pipe_inode_info *info; - - info = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); - if (!info) - goto fail_page; - inode->i_pipe = info; - - init_waitqueue_head(PIPE_WAIT(*inode)); - PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1; - - return inode; -fail_page: - return NULL; + __free_pipe_info(inode->i_pipe); + inode->i_pipe = NULL; } static struct vfsmount *pipe_mnt __read_mostly; @@ -713,6 +741,7 @@ static int pipefs_delete_dentry(struct dentry *dentry) { return 1; } + static struct dentry_operations pipefs_dentry_operations = { .d_delete = pipefs_delete_dentry, }; @@ -720,13 +749,17 @@ static struct dentry_operations pipefs_dentry_operations = { static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode(pipe_mnt->mnt_sb); + struct pipe_inode_info *pipe; if (!inode) goto fail_inode; - if(!pipe_new(inode)) + pipe = alloc_pipe_info(inode); + if (!pipe) goto fail_iput; - PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1; + inode->i_pipe = pipe; + + pipe->readers = pipe->writers = 1; inode->i_fop = &rdwr_pipe_fops; /* @@ -741,10 +774,12 @@ static struct inode * get_pipe_inode(void) inode->i_gid = current->fsgid; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_blksize = PAGE_SIZE; + return inode; fail_iput: iput(inode); + fail_inode: return NULL; } @@ -757,7 +792,7 @@ int do_pipe(int *fd) struct inode * inode; struct file *f1, *f2; int error; - int i,j; + int i, j; error = -ENFILE; f1 = get_empty_filp(); @@ -790,6 +825,7 @@ int do_pipe(int *fd) dentry 
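[Annotation] pipe_new() is gone from the hunks above; its replacement splits pipe allocation away from the inode. alloc_pipe_info() builds a pipe_inode_info that merely remembers its inode, which may be NULL, and __free_pipe_info() tears one down without touching any inode; free_pipe_info() survives as the inode-attached wrapper. A sketch of the inode-less usage this is preparing for (the splice machinery is the obvious customer):

	struct pipe_inode_info *pipe;

	pipe = alloc_pipe_info(NULL);	/* no pipefs inode behind it */
	if (!pipe)
		return -ENOMEM;

	/* ... use pipe->bufs as a kernel-internal buffer ring ... */

	__free_pipe_info(pipe);		/* drops buffers and tmp_page */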
= d_alloc(pipe_mnt->mnt_sb->s_root, &this); if (!dentry) goto close_f12_inode_i_j; + dentry->d_op = &pipefs_dentry_operations; d_add(dentry, inode); f1->f_vfsmnt = f2->f_vfsmnt = mntget(mntget(pipe_mnt)); @@ -813,6 +849,7 @@ int do_pipe(int *fd) fd_install(j, f2); fd[0] = i; fd[1] = j; + return 0; close_f12_inode_i_j: @@ -837,8 +874,9 @@ no_files: * d_name - pipe: will go nicely and kill the special-casing in procfs. */ -static struct super_block *pipefs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static struct super_block * +pipefs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC); } @@ -852,6 +890,7 @@ static struct file_system_type pipe_fs_type = { static int __init init_pipe_fs(void) { int err = register_filesystem(&pipe_fs_type); + if (!err) { pipe_mnt = kern_mount(&pipe_fs_type); if (IS_ERR(pipe_mnt)) { diff --git a/fs/proc/base.c b/fs/proc/base.c index a3a3eecef689..6cc77dc3f3ff 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -297,16 +297,20 @@ static int proc_fd_link(struct inode *inode, struct dentry **dentry, struct vfsm files = get_files_struct(task); if (files) { - rcu_read_lock(); + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (file) { *mnt = mntget(file->f_vfsmnt); *dentry = dget(file->f_dentry); - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); return 0; } - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); } return -ENOENT; @@ -1523,7 +1527,12 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, if (!files) goto out_unlock; inode->i_mode = S_IFLNK; - rcu_read_lock(); + + /* + * We are not taking a ref to the file structure, so we must + * hold ->file_lock. + */ + spin_lock(&files->file_lock); file = fcheck_files(files, fd); if (!file) goto out_unlock2; @@ -1531,7 +1540,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, inode->i_mode |= S_IRUSR | S_IXUSR; if (file->f_mode & 2) inode->i_mode |= S_IWUSR | S_IXUSR; - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); inode->i_op = &proc_pid_link_inode_operations; inode->i_size = 64; @@ -1541,7 +1550,7 @@ static struct dentry *proc_lookupfd(struct inode * dir, struct dentry * dentry, return NULL; out_unlock2: - rcu_read_unlock(); + spin_unlock(&files->file_lock); put_files_struct(files); out_unlock: iput(inode); diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7efa73d44c9a..20d4b2237fce 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -103,8 +103,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { ssize_t acc = 0, tmp; - size_t tsz, nr_bytes; - u64 start; + size_t tsz; + u64 start, nr_bytes; struct vmcore *curr_m = NULL; if (buflen == 0 || *fpos >= vmcore_size) diff --git a/fs/read_write.c b/fs/read_write.c index 6256ca81a718..5bc0e9234f9d 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -202,7 +202,7 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count goto Einval; inode = file->f_dentry->d_inode; - if (inode->i_flock && MANDATORY_LOCK(inode)) { + if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) { int retval = locks_mandatory_area( read_write == READ ? 
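[Annotation] The /proc/<pid>/fd hunks above replace rcu_read_lock() with the file table's spinlock, and the added comments state the rule: a caller of fcheck_files() that does not take its own reference on the struct file must hold ->file_lock for as long as it uses the result. The safe shape:

	spin_lock(&files->file_lock);
	file = fcheck_files(files, fd);
	if (file) {
		/* the file cannot be released while file_lock is held */
		mnt = mntget(file->f_vfsmnt);
		dentry = dget(file->f_dentry);
	}
	spin_unlock(&files->file_lock);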
FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); diff --git a/fs/select.c b/fs/select.c index 071660fa7b01..a8109baa5e46 100644 --- a/fs/select.c +++ b/fs/select.c @@ -310,8 +310,9 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s64 *timeout) { fd_set_bits fds; - char *bits; - int ret, size, max_fdset; + void *bits; + int ret, max_fdset; + unsigned int size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; @@ -333,20 +334,21 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, * since we used fdset we need to allocate memory in units of * long-words. */ - ret = -ENOMEM; size = FDS_BYTES(n); - if (6*size < SELECT_STACK_ALLOC) - bits = stack_fds; - else + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + /* Not enough space in on-stack array; must use kmalloc */ + ret = -ENOMEM; bits = kmalloc(6 * size, GFP_KERNEL); - if (!bits) - goto out_nofds; - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); + if (!bits) + goto out_nofds; + } + fds.in = bits; + fds.out = bits + size; + fds.ex = bits + 2*size; + fds.res_in = bits + 3*size; + fds.res_out = bits + 4*size; + fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || diff --git a/fs/splice.c b/fs/splice.c index bfa42a277bb8..0559e7577a04 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -9,11 +9,12 @@ * that transfers data buffers to or from a pipe buffer. * * Named by Larry McVoy, original implementation from Linus, extended by - * Jens to support splicing to files and fixing the initial implementation - * bugs. + * Jens to support splicing to files, network, direct splicing, etc and + * fixing lots of bugs. * - * Copyright (C) 2005 Jens Axboe <axboe@suse.de> - * Copyright (C) 2005 Linus Torvalds <torvalds@osdl.org> + * Copyright (C) 2005-2006 Jens Axboe <axboe@suse.de> + * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> + * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> * */ #include <linux/fs.h> @@ -49,7 +50,8 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, struct page *page = buf->page; struct address_space *mapping = page_mapping(page); - WARN_ON(!PageLocked(page)); + lock_page(page); + WARN_ON(!PageUptodate(page)); /* @@ -64,8 +66,10 @@ static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, if (PagePrivate(page)) try_to_release_page(page, mapping_gfp_mask(mapping)); - if (!remove_mapping(mapping, page)) + if (!remove_mapping(mapping, page)) { + unlock_page(page); return 1; + } buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU; return 0; @@ -84,69 +88,89 @@ static void *page_cache_pipe_buf_map(struct file *file, struct pipe_buffer *buf) { struct page *page = buf->page; - - lock_page(page); + int err; if (!PageUptodate(page)) { - unlock_page(page); - return ERR_PTR(-EIO); - } + lock_page(page); - if (!page->mapping) { + /* + * Page got truncated/unhashed. This will cause a 0-byte + * splice, if this is the first page. + */ + if (!page->mapping) { + err = -ENODATA; + goto error; + } + + /* + * Uh oh, read-error from disk. 
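[Annotation] core_sys_select() above gains a small-allocation fast path: a fixed on-stack array handles the common case, and kmalloc() is only paid for large fd sets. The buffer is carved into six equal bitmaps (in, out, ex, and their result counterparts), hence the sizeof(stack_fds) / 6 threshold. The general stack-or-heap pattern, as a sketch with illustrative names:

	long stackbuf[STACK_WORDS];
	void *buf = stackbuf;
	int ret = 0;

	if (needed > sizeof(stackbuf)) {
		buf = kmalloc(needed, GFP_KERNEL);
		if (!buf)
			return -ENOMEM;
	}

	/* ... use buf ... */

	if (buf != stackbuf)
		kfree(buf);
	return ret;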
+ */ + if (!PageUptodate(page)) { + err = -EIO; + goto error; + } + + /* + * Page is ok after all, fall through to mapping. + */ unlock_page(page); } - return kmap(buf->page); + return kmap(page); +error: + unlock_page(page); + return ERR_PTR(err); } static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) { - unlock_page(buf->page); kunmap(buf->page); } +static void page_cache_pipe_buf_get(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} + static struct pipe_buf_operations page_cache_pipe_buf_ops = { .can_merge = 0, .map = page_cache_pipe_buf_map, .unmap = page_cache_pipe_buf_unmap, .release = page_cache_pipe_buf_release, .steal = page_cache_pipe_buf_steal, + .get = page_cache_pipe_buf_get, }; /* * Pipe output worker. This sets up our pipe format with the page cache * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). */ -static ssize_t move_to_pipe(struct inode *inode, struct page **pages, - int nr_pages, unsigned long offset, - unsigned long len, unsigned int flags) +static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, + int nr_pages, unsigned long len, + unsigned int offset, unsigned int flags) { - struct pipe_inode_info *info; int ret, do_wakeup, i; ret = 0; do_wakeup = 0; i = 0; - mutex_lock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); - info = inode->i_pipe; for (;;) { - int bufs; - - if (!PIPE_READERS(*inode)) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; } - bufs = info->nrbufs; - if (bufs < PIPE_BUFFERS) { - int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS - 1); - struct pipe_buffer *buf = info->bufs + newbuf; + if (pipe->nrbufs < PIPE_BUFFERS) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; struct page *page = pages[i++]; unsigned long this_len; @@ -158,8 +182,9 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, buf->offset = offset; buf->len = this_len; buf->ops = &page_cache_pipe_buf_ops; - info->nrbufs = ++bufs; - do_wakeup = 1; + pipe->nrbufs++; + if (pipe->inode) + do_wakeup = 1; ret += this_len; len -= this_len; @@ -168,7 +193,7 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, break; if (!len) break; - if (bufs < PIPE_BUFFERS) + if (pipe->nrbufs < PIPE_BUFFERS) continue; break; @@ -187,22 +212,26 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, - POLL_IN); + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); - PIPE_WAITING_WRITERS(*inode)--; + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; } - mutex_unlock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } while (i < nr_pages) @@ -211,96 +240,155 @@ static ssize_t move_to_pipe(struct inode *inode, struct page **pages, return ret; } -static int
__generic_file_splice_read(struct file *in, struct inode *pipe, - size_t len, unsigned int flags) +static int +__generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { struct address_space *mapping = in->f_mapping; - unsigned int offset, nr_pages; - struct page *pages[PIPE_BUFFERS], *shadow[PIPE_BUFFERS]; + unsigned int loff, offset, nr_pages; + struct page *pages[PIPE_BUFFERS]; struct page *page; - pgoff_t index, pidx; - int i, j; + pgoff_t index, end_index; + loff_t isize; + size_t bytes; + int i, error; - index = in->f_pos >> PAGE_CACHE_SHIFT; - offset = in->f_pos & ~PAGE_CACHE_MASK; + index = *ppos >> PAGE_CACHE_SHIFT; + loff = offset = *ppos & ~PAGE_CACHE_MASK; nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; if (nr_pages > PIPE_BUFFERS) nr_pages = PIPE_BUFFERS; /* - * initiate read-ahead on this page range + * Initiate read-ahead on this page range. However, don't call into + * read-ahead if this is a non-zero offset (we are likely doing small + * chunk splice and the page is already there) for a single page. */ - do_page_cache_readahead(mapping, in, index, nr_pages); + if (!offset || nr_pages > 1) + do_page_cache_readahead(mapping, in, index, nr_pages); /* - * Get as many pages from the page cache as possible.. - * Start IO on the page cache entries we create (we - * can assume that any pre-existing ones we find have - * already had IO started on them). + * Now fill in the holes: */ - i = find_get_pages(mapping, index, nr_pages, pages); + error = 0; + bytes = 0; + for (i = 0; i < nr_pages; i++, index++) { + unsigned int this_len; - /* - * common case - we found all pages and they are contiguous, - * kick them off - */ - if (i && (pages[i - 1]->index == index + i - 1)) - goto splice_them; + if (!len) + break; - /* - * fill shadow[] with pages at the right locations, so we only - * have to fill holes - */ - memset(shadow, 0, nr_pages * sizeof(struct page *)); - for (j = 0; j < i; j++) - shadow[pages[j]->index - index] = pages[j]; + /* + * this_len is the max we'll use from this page + */ + this_len = min(len, PAGE_CACHE_SIZE - loff); +find_page: + /* + * lookup the page for this index + */ + page = find_get_page(mapping, index); + if (!page) { + /* + * page didn't exist, allocate one + */ + page = page_cache_alloc_cold(mapping); + if (!page) + break; - /* - * now fill in the holes - */ - for (i = 0, pidx = index; i < nr_pages; pidx++, i++) { - int error; + error = add_to_page_cache_lru(page, mapping, index, + mapping_gfp_mask(mapping)); + if (unlikely(error)) { + page_cache_release(page); + break; + } - if (shadow[i]) - continue; + goto readpage; + } /* - * no page there, look one up / create it + * If the page isn't uptodate, we may need to start io on it */ - page = find_or_create_page(mapping, pidx, - mapping_gfp_mask(mapping)); - if (!page) - break; + if (!PageUptodate(page)) { + /* + * If in nonblock mode then don't block on waiting + * for an in-flight io page + */ + if (flags & SPLICE_F_NONBLOCK) + break; + + lock_page(page); + + /* + * page was truncated, stop here.
if this isn't the + * first page, we'll just complete what we already + * added + */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + break; + } + /* + * page was already under io and is now done, great + */ + if (PageUptodate(page)) { + unlock_page(page); + goto fill_it; + } - if (PageUptodate(page)) - unlock_page(page); - else { +readpage: + /* + * need to read in the page + */ error = mapping->a_ops->readpage(in, page); if (unlikely(error)) { page_cache_release(page); + if (error == AOP_TRUNCATED_PAGE) + goto find_page; break; } - } - shadow[i] = page; - } - if (!i) { - for (i = 0; i < nr_pages; i++) { - if (shadow[i]) - page_cache_release(shadow[i]); + /* + * i_size must be checked after ->readpage(). + */ + isize = i_size_read(mapping->host); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) { + page_cache_release(page); + break; + } + + /* + * if this is the last page, see if we need to shrink + * the length and stop + */ + if (end_index == index) { + loff = PAGE_CACHE_SIZE - (isize & ~PAGE_CACHE_MASK); + if (bytes + loff > isize) { + page_cache_release(page); + break; + } + /* + * force quit after adding this page + */ + nr_pages = i; + this_len = min(this_len, loff); + } } - return 0; +fill_it: + pages[i] = page; + bytes += this_len; + len -= this_len; + loff = 0; } - memcpy(pages, shadow, i * sizeof(struct page *)); + if (i) + return move_to_pipe(pipe, pages, i, bytes, offset, flags); - /* - * Now we splice them into the pipe.. - */ -splice_them: - return move_to_pipe(pipe, pages, i, offset, len, flags); + return error; } /** @@ -311,30 +399,34 @@ splice_them: * @flags: splice modifier flags * * Will read pages from given file and fill them into a pipe. - * */ -ssize_t generic_file_splice_read(struct file *in, struct inode *pipe, - size_t len, unsigned int flags) +ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) { ssize_t spliced; int ret; ret = 0; spliced = 0; + while (len) { - ret = __generic_file_splice_read(in, pipe, len, flags); + ret = __generic_file_splice_read(in, ppos, pipe, len, flags); - if (ret <= 0) + if (ret < 0) break; + else if (!ret) { + if (spliced) + break; + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + } - in->f_pos += ret; + *ppos += ret; len -= ret; spliced += ret; - - if (!(flags & SPLICE_F_NONBLOCK)) - continue; - ret = -EAGAIN; - break; } if (spliced) @@ -360,10 +452,10 @@ static int pipe_to_sendpage(struct pipe_inode_info *info, int more; /* - * sub-optimal, but we are limited by the pipe ->map. we don't + * Sub-optimal, but we are limited by the pipe ->map. We don't * need a kmap'ed buffer here, we just want to make sure we * have the page pinned if the pipe page originates from the - * page cache + * page cache. */ ptr = buf->ops->map(file, info, buf); if (IS_ERR(ptr)) @@ -414,7 +506,7 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, int ret; /* - * after this, page will be locked and unmapped + * make sure the data in this buffer is uptodate */ src = buf->ops->map(file, info, buf); if (IS_ERR(src)) @@ -424,12 +516,13 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, offset = sd->pos & ~PAGE_CACHE_MASK; /* - * reuse buf page, if SPLICE_F_MOVE is set + * Reuse buf page, if SPLICE_F_MOVE is set. 
*/ if (sd->flags & SPLICE_F_MOVE) { /* * If steal succeeds, buf->page is now pruned from the vm - * side (LRU and page cache) and we can reuse it. + * side (LRU and page cache) and we can reuse it. The page + * will also be locked on successful return. */ if (buf->ops->steal(info, buf)) goto find_page; @@ -442,15 +535,27 @@ static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, lru_cache_add(page); } else { find_page: - ret = -ENOMEM; - page = find_or_create_page(mapping, index, gfp_mask); - if (!page) - goto out; + page = find_lock_page(mapping, index); + if (!page) { + ret = -ENOMEM; + page = page_cache_alloc_cold(mapping); + if (unlikely(!page)) + goto out_nomem; + + /* + * This will also lock the page + */ + ret = add_to_page_cache_lru(page, mapping, index, + gfp_mask); + if (unlikely(ret)) + goto out; + } /* - * If the page is uptodate, it is also locked. If it isn't - * uptodate, we can mark it uptodate if we are filling the - * full page. Otherwise we need to read it in first... + * We get here with the page locked. If the page is also + * uptodate, we don't need to do more. If it isn't, we + * may need to bring it in if we are not going to overwrite + * the full page. */ if (!PageUptodate(page)) { if (sd->len < PAGE_CACHE_SIZE) { @@ -462,7 +567,7 @@ find_page: if (!PageUptodate(page)) { /* - * page got invalidated, repeat + * Page got invalidated, repeat. */ if (!page->mapping) { unlock_page(page); @@ -472,10 +577,8 @@ find_page: ret = -EIO; goto out; } - } else { - WARN_ON(!PageLocked(page)); + } else SetPageUptodate(page); - } } } @@ -501,12 +604,14 @@ find_page: } else if (ret) goto out; + mark_page_accessed(page); balance_dirty_pages_ratelimited(mapping); out: - if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { + if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) page_cache_release(page); - unlock_page(page); - } + + unlock_page(page); +out_nomem: buf->ops->unmap(info, buf); return ret; } @@ -519,11 +624,10 @@ typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, * key here is the 'actor' worker passed in that actually moves the data * to the wanted destination. See pipe_to_file/pipe_to_sendpage above.
*/ -static ssize_t move_from_pipe(struct inode *inode, struct file *out, - size_t len, unsigned int flags, +static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags, splice_actor *actor) { - struct pipe_inode_info *info; int ret, do_wakeup, err; struct splice_desc sd; @@ -533,24 +637,21 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, sd.total_len = len; sd.flags = flags; sd.file = out; - sd.pos = out->f_pos; + sd.pos = *ppos; - mutex_lock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); - info = inode->i_pipe; for (;;) { - int bufs = info->nrbufs; - - if (bufs) { - int curbuf = info->curbuf; - struct pipe_buffer *buf = info->bufs + curbuf; + if (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; struct pipe_buf_operations *ops = buf->ops; sd.len = buf->len; if (sd.len > sd.total_len) sd.len = sd.total_len; - err = actor(info, buf, &sd); + err = actor(pipe, buf, &sd); if (err) { if (!ret && err != -ENODATA) ret = err; @@ -561,13 +662,14 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, ret += sd.len; buf->offset += sd.len; buf->len -= sd.len; + if (!buf->len) { buf->ops = NULL; - ops->release(info, buf); - curbuf = (curbuf + 1) & (PIPE_BUFFERS - 1); - info->curbuf = curbuf; - info->nrbufs = --bufs; - do_wakeup = 1; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->nrbufs--; + if (pipe->inode) + do_wakeup = 1; } sd.pos += sd.len; @@ -576,11 +678,11 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, break; } - if (bufs) + if (pipe->nrbufs) continue; - if (!PIPE_WRITERS(*inode)) + if (!pipe->writers) break; - if (!PIPE_WAITING_WRITERS(*inode)) { + if (!pipe->waiting_writers) { if (ret) break; } @@ -598,31 +700,32 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode),SIGIO,POLL_OUT); + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); do_wakeup = 0; } - pipe_wait(inode); + pipe_wait(pipe); } - mutex_unlock(PIPE_MUTEX(*inode)); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_lock(&out->f_mapping->host->i_mutex); - out->f_pos = sd.pos; - mutex_unlock(&out->f_mapping->host->i_mutex); return ret; - } /** * generic_file_splice_write - splice data from a pipe to a file - * @inode: pipe inode + * @pipe: pipe info * @out: file to write to * @len: number of bytes to splice * @flags: splice modifier flags @@ -631,27 +734,34 @@ static ssize_t move_from_pipe(struct inode *inode, struct file *out, * the given pipe inode to the given file. 
* */ -ssize_t generic_file_splice_write(struct inode *inode, struct file *out, - size_t len, unsigned int flags) +ssize_t +generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) { struct address_space *mapping = out->f_mapping; - ssize_t ret = move_from_pipe(inode, out, len, flags, pipe_to_file); + ssize_t ret; - /* - * if file or inode is SYNC and we actually wrote some data, sync it - */ - if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host)) - && ret > 0) { + ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); + if (ret > 0) { struct inode *inode = mapping->host; - int err; - mutex_lock(&inode->i_mutex); - err = generic_osync_inode(mapping->host, mapping, - OSYNC_METADATA|OSYNC_DATA); - mutex_unlock(&inode->i_mutex); + *ppos += ret; + + /* + * If file or inode is SYNC and we actually wrote some data, + * sync it. + */ + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(inode))) { + int err; + + mutex_lock(&inode->i_mutex); + err = generic_osync_inode(inode, mapping, + OSYNC_METADATA|OSYNC_DATA); + mutex_unlock(&inode->i_mutex); - if (err) - ret = err; + if (err) + ret = err; + } } return ret; @@ -670,10 +780,10 @@ EXPORT_SYMBOL(generic_file_splice_write); * is involved. * */ -ssize_t generic_splice_sendpage(struct inode *inode, struct file *out, - size_t len, unsigned int flags) +ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) { - return move_from_pipe(inode, out, len, flags, pipe_to_sendpage); + return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); } EXPORT_SYMBOL(generic_splice_sendpage); @@ -681,77 +791,228 @@ EXPORT_SYMBOL(generic_splice_sendpage); /* * Attempt to initiate a splice from pipe to file. */ -static long do_splice_from(struct inode *pipe, struct file *out, size_t len, - unsigned int flags) +static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) { - loff_t pos; int ret; - if (!out->f_op || !out->f_op->splice_write) + if (unlikely(!out->f_op || !out->f_op->splice_write)) return -EINVAL; - if (!(out->f_mode & FMODE_WRITE)) + if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; - pos = out->f_pos; - ret = rw_verify_area(WRITE, out, &pos, len); + ret = rw_verify_area(WRITE, out, ppos, len); if (unlikely(ret < 0)) return ret; - return out->f_op->splice_write(pipe, out, len, flags); + return out->f_op->splice_write(pipe, out, ppos, len, flags); } /* * Attempt to initiate a splice from a file to a pipe. 
*/ -static long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags) { - loff_t isize, left; int ret; - if (!in->f_op || !in->f_op->splice_read) + if (unlikely(!in->f_op || !in->f_op->splice_read)) return -EINVAL; - if (!(in->f_mode & FMODE_READ)) + if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; - pos = in->f_pos; - ret = rw_verify_area(READ, in, &pos, len); + ret = rw_verify_area(READ, in, ppos, len); if (unlikely(ret < 0)) return ret; isize = i_size_read(in->f_mapping->host); - if (unlikely(in->f_pos >= isize)) + if (unlikely(*ppos >= isize)) return 0; - left = isize - in->f_pos; - if (left < len) + left = isize - *ppos; + if (unlikely(left < len)) len = left; - return in->f_op->splice_read(in, pipe, len, flags); + return in->f_op->splice_read(in, ppos, pipe, len, flags); +} + +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + size_t len, unsigned int flags) +{ + struct pipe_inode_info *pipe; + long ret, bytes; + loff_t out_off; + umode_t i_mode; + int i; + + /* + * We require the input to be a regular file, as we don't want to + * randomly drop data for e.g. socket -> socket splicing. Use the + * piped splicing for that! + */ + i_mode = in->f_dentry->d_inode->i_mode; + if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) + return -EINVAL; + + /* + * neither in nor out is a pipe, set up an internal pipe attached to + * 'out' and transfer the wanted data from 'in' to 'out' through that + */ + pipe = current->splice_pipe; + if (unlikely(!pipe)) { + pipe = alloc_pipe_info(NULL); + if (!pipe) + return -ENOMEM; + + /* + * We don't have an immediate reader, but we'll read the stuff + * out of the pipe right after the move_to_pipe(). So set + * PIPE_READERS appropriately. + */ + pipe->readers = 1; + + current->splice_pipe = pipe; + } + + /* + * Do the splice. + */ + ret = 0; + bytes = 0; + out_off = 0; + + while (len) { + size_t read_len, max_read_len; + + /* + * Do at most PIPE_BUFFERS pages worth of transfer: + */ + max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); + + ret = do_splice_to(in, ppos, pipe, max_read_len, flags); + if (unlikely(ret < 0)) + goto out_release; + + read_len = ret; + + /* + * NOTE: nonblocking mode only applies to the input. We + * must not do the output in nonblocking mode as then we + * could get stuck data in the internal pipe: + */ + ret = do_splice_from(pipe, out, &out_off, read_len, + flags & ~SPLICE_F_NONBLOCK); + if (unlikely(ret < 0)) + goto out_release; + + bytes += ret; + len -= ret; + + /* + * In nonblocking mode, if we got back a short read then + * that was due to either an IO error or due to the + * pagecache entry not being there.
In the IO error case + * the _next_ splice attempt will produce a clean IO error + * return value (not a short read), so in both cases it's + * correct to break out of the loop here: + */ + if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len)) + break; + } + + pipe->nrbufs = pipe->curbuf = 0; + + return bytes; + +out_release: + /* + * If we did an incomplete transfer we must release + * the pipe buffers in question: + */ + for (i = 0; i < PIPE_BUFFERS; i++) { + struct pipe_buffer *buf = pipe->bufs + i; + + if (buf->ops) { + buf->ops->release(pipe, buf); + buf->ops = NULL; + } + } + pipe->nrbufs = pipe->curbuf = 0; + + /* + * If we transferred some data, return the number of bytes: + */ + if (bytes > 0) + return bytes; + + return ret; } +EXPORT_SYMBOL(do_splice_direct); + /* * Determine where to splice to/from. */ -static long do_splice(struct file *in, struct file *out, size_t len, - unsigned int flags) +static long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) { - struct inode *pipe; + struct pipe_inode_info *pipe; + loff_t offset, *off; + long ret; + + pipe = in->f_dentry->d_inode->i_pipe; + if (pipe) { + if (off_in) + return -ESPIPE; + if (off_out) { + if (out->f_op->llseek == no_llseek) + return -EINVAL; + if (copy_from_user(&offset, off_out, sizeof(loff_t))) + return -EFAULT; + off = &offset; + } else + off = &out->f_pos; + + ret = do_splice_from(pipe, out, off, len, flags); + + if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) + ret = -EFAULT; - pipe = in->f_dentry->d_inode; - if (pipe->i_pipe) - return do_splice_from(pipe, out, len, flags); + return ret; + } + + pipe = out->f_dentry->d_inode->i_pipe; + if (pipe) { + if (off_out) + return -ESPIPE; + if (off_in) { + if (in->f_op->llseek == no_llseek) + return -EINVAL; + if (copy_from_user(&offset, off_in, sizeof(loff_t))) + return -EFAULT; + off = &offset; + } else + off = &in->f_pos; - pipe = out->f_dentry->d_inode; - if (pipe->i_pipe) - return do_splice_to(in, pipe, len, flags); + ret = do_splice_to(in, off, pipe, len, flags); + + if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) + ret = -EFAULT; + + return ret; + } return -EINVAL; } -asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) +asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, + int fd_out, loff_t __user *off_out, + size_t len, unsigned int flags) { long error; struct file *in, *out; @@ -761,13 +1022,15 @@ asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) return 0; error = -EBADF; - in = fget_light(fdin, &fput_in); + in = fget_light(fd_in, &fput_in); if (in) { if (in->f_mode & FMODE_READ) { - out = fget_light(fdout, &fput_out); + out = fget_light(fd_out, &fput_out); if (out) { if (out->f_mode & FMODE_WRITE) - error = do_splice(in, out, len, flags); + error = do_splice(in, off_in, + out, off_out, + len, flags); fput_light(out, fput_out); } } @@ -777,3 +1040,192 @@ asmlinkage long sys_splice(int fdin, int fdout, size_t len, unsigned int flags) return error; } + +/* + * Link contents of ipipe to opipe. + */ +static int link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret, do_wakeup, i, ipipe_first; + + ret = do_wakeup = ipipe_first = 0; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by inode address. 
Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + if (ipipe->inode < opipe->inode) { + ipipe_first = 1; + mutex_lock(&ipipe->inode->i_mutex); + mutex_lock(&opipe->inode->i_mutex); + } else { + mutex_lock(&opipe->inode->i_mutex); + mutex_lock(&ipipe->inode->i_mutex); + } + + for (i = 0;; i++) { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + if (ipipe->nrbufs - i) { + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); + + /* + * If we have room, fill this buffer + */ + if (opipe->nrbufs < PIPE_BUFFERS) { + int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); + + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + + obuf = opipe->bufs + nbuf; + *obuf = *ibuf; + + if (obuf->len > len) + obuf->len = len; + + opipe->nrbufs++; + do_wakeup = 1; + ret += obuf->len; + len -= obuf->len; + + if (!len) + break; + if (opipe->nrbufs < PIPE_BUFFERS) + continue; + } + + /* + * We have input available, but no output room. + * If we already copied data, return that. If we + * need to drop the opipe lock, it must be ordered + * last to avoid deadlocks. + */ + if ((flags & SPLICE_F_NONBLOCK) || !ipipe_first) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } + + opipe->waiting_writers++; + pipe_wait(opipe); + opipe->waiting_writers--; + continue; + } + + /* + * No input buffers, do the usual checks for available + * writers and blocking and wait if necessary + */ + if (!ipipe->writers) + break; + if (!ipipe->waiting_writers) { + if (ret) + break; + } + /* + * pipe_wait() drops the ipipe mutex. To avoid deadlocks + * with another process, we can only safely do that if + * the ipipe lock is ordered last. + */ + if ((flags & SPLICE_F_NONBLOCK) || ipipe_first) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + + if (waitqueue_active(&ipipe->wait)) + wake_up_interruptible_sync(&ipipe->wait); + kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT); + + pipe_wait(ipipe); + } + + mutex_unlock(&ipipe->inode->i_mutex); + mutex_unlock(&opipe->inode->i_mutex); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + } + + return ret; +} + +/* + * This is a tee(1) implementation that works on pipes. It doesn't copy + * any data, it simply references the 'in' pages on the 'out' pipe. + * The 'flags' used are the SPLICE_F_* variants, currently the only + * applicable one is SPLICE_F_NONBLOCK. + */ +static long do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags) +{ + struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; + struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; + + /* + * Link ipipe to the two output pipes, consuming as we go along. 
+ */ + if (ipipe && opipe) + return link_pipe(ipipe, opipe, len, flags); + + return -EINVAL; +} + +asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) +{ + struct file *in; + int error, fput_in; + + if (unlikely(!len)) + return 0; + + error = -EBADF; + in = fget_light(fdin, &fput_in); + if (in) { + if (in->f_mode & FMODE_READ) { + int fput_out; + struct file *out = fget_light(fdout, &fput_out); + + if (out) { + if (out->f_mode & FMODE_WRITE) + error = do_tee(in, out, len, flags); + fput_light(out, fput_out); + } + } + fput_light(in, fput_in); + } + + return error; +} diff --git a/fs/sync.c b/fs/sync.c index 8616006d2094..aab5ffe77e9f 100644 --- a/fs/sync.c +++ b/fs/sync.c @@ -61,7 +61,7 @@ * will be available after a crash. */ asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, - int flags) + unsigned int flags) { int ret; struct file *file; @@ -126,7 +126,7 @@ out: * `endbyte' is inclusive */ int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, - int flags) + unsigned int flags) { int ret; struct address_space *mapping; diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 6cfdc9a87772..610b5bdbe75b 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -43,6 +43,7 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd, memset(sd, 0, sizeof(*sd)); atomic_set(&sd->s_count, 1); + atomic_set(&sd->s_event, 0); INIT_LIST_HEAD(&sd->s_children); list_add(&sd->s_sibling, &parent_sd->s_children); sd->s_element = element; diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index f1cb1ddde511..cf3786625bfa 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -6,6 +6,7 @@ #include <linux/fsnotify.h> #include <linux/kobject.h> #include <linux/namei.h> +#include <linux/poll.h> #include <asm/uaccess.h> #include <asm/semaphore.h> @@ -57,6 +58,7 @@ struct sysfs_buffer { struct sysfs_ops * ops; struct semaphore sem; int needs_read_fill; + int event; }; @@ -72,6 +74,7 @@ struct sysfs_buffer { */ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) { + struct sysfs_dirent * sd = dentry->d_fsdata; struct attribute * attr = to_attr(dentry); struct kobject * kobj = to_kobj(dentry->d_parent); struct sysfs_ops * ops = buffer->ops; @@ -83,6 +86,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer if (!buffer->page) return -ENOMEM; + buffer->event = atomic_read(&sd->s_event); count = ops->show(kobj,attr,buffer->page); buffer->needs_read_fill = 0; BUG_ON(count > (ssize_t)PAGE_SIZE); @@ -348,12 +352,84 @@ static int sysfs_release(struct inode * inode, struct file * filp) return 0; } +/* Sysfs attribute files are pollable. The idea is that you read + * the content and then you use 'poll' or 'select' to wait for + * the content to change. When the content changes (assuming the + * manager for the kobject supports notification), poll will + * return POLLERR|POLLPRI, and select will return the fd whether + * it is waiting for read, write, or exceptions. + * Once poll/select indicates that the value has changed, you + * need to close and re-open the file, as simply seeking and reading + * again will not get new data, or reset the state of 'poll'. + * Reminder: this only works for attributes which actively support + * it, and it is not possible to test an attribute from userspace + * to see if it supports poll (Neither 'poll' nor 'select' return + * an appropriate error code). When in doubt, set a suitable timeout value.
+ */ +static unsigned int sysfs_poll(struct file *filp, poll_table *wait) +{ + struct sysfs_buffer * buffer = filp->private_data; + struct kobject * kobj = to_kobj(filp->f_dentry->d_parent); + struct sysfs_dirent * sd = filp->f_dentry->d_fsdata; + int res = 0; + + poll_wait(filp, &kobj->poll, wait); + + if (buffer->event != atomic_read(&sd->s_event)) { + res = POLLERR|POLLPRI; + buffer->needs_read_fill = 1; + } + + return res; +} + + +static struct dentry *step_down(struct dentry *dir, const char * name) +{ + struct dentry * de; + + if (dir == NULL || dir->d_inode == NULL) + return NULL; + + mutex_lock(&dir->d_inode->i_mutex); + de = lookup_one_len(name, dir, strlen(name)); + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); + if (IS_ERR(de)) + return NULL; + if (de->d_inode == NULL) { + dput(de); + return NULL; + } + return de; +} + +void sysfs_notify(struct kobject * k, char *dir, char *attr) +{ + struct dentry *de = k->dentry; + if (de) + dget(de); + if (de && dir) + de = step_down(de, dir); + if (de && attr) + de = step_down(de, attr); + if (de) { + struct sysfs_dirent * sd = de->d_fsdata; + if (sd) + atomic_inc(&sd->s_event); + wake_up_interruptible(&k->poll); + dput(de); + } +} +EXPORT_SYMBOL_GPL(sysfs_notify); + const struct file_operations sysfs_file_operations = { .read = sysfs_read_file, .write = sysfs_write_file, .llseek = generic_file_llseek, .open = sysfs_open_file, .release = sysfs_release, + .poll = sysfs_poll, }; diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 32958a7c50e9..3651ffb5ec09 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -11,6 +11,7 @@ extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *, extern int sysfs_add_file(struct dentry *, const struct attribute *, int); extern void sysfs_hash_and_remove(struct dentry * dir, const char * name); +extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name); extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **); extern void sysfs_remove_subdir(struct dentry *); diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 6cbbd165c60d..4d191ef39b67 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -870,12 +870,14 @@ xfs_page_state_convert( pgoff_t end_index, last_index, tlast; ssize_t size, len; int flags, err, iomap_valid = 0, uptodate = 1; - int page_dirty, count = 0, trylock_flag = 0; + int page_dirty, count = 0; + int trylock = 0; int all_bh = unmapped; - /* wait for other IO threads? */ - if (startio && (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)) - trylock_flag |= BMAPI_TRYLOCK; + if (startio) { + if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) + trylock |= BMAPI_TRYLOCK; + } /* Is this page beyond the end of the file? 
*/ offset = i_size_read(inode); @@ -956,15 +958,13 @@ xfs_page_state_convert( if (buffer_unwritten(bh)) { type = IOMAP_UNWRITTEN; - flags = BMAPI_WRITE|BMAPI_IGNSTATE; + flags = BMAPI_WRITE | BMAPI_IGNSTATE; } else if (buffer_delay(bh)) { type = IOMAP_DELAY; - flags = BMAPI_ALLOCATE; - if (!startio) - flags |= trylock_flag; + flags = BMAPI_ALLOCATE | trylock; } else { type = IOMAP_NEW; - flags = BMAPI_WRITE|BMAPI_MMAP; + flags = BMAPI_WRITE | BMAPI_MMAP; } if (!iomap_valid) { diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 9fb0312665ca..26fed0756f01 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -182,7 +182,7 @@ free_address( { a_list_t *aentry; - aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH); + aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); if (likely(aentry)) { spin_lock(&as_lock); aentry->next = as_free_head; diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index ae4c4754ed31..c847416f6d10 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -252,56 +252,60 @@ xfs_file_sendfile_invis( STATIC ssize_t xfs_file_splice_read( struct file *infilp, - struct inode *pipe, + loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags) { vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_READ(vp, infilp, pipe, len, flags, 0, NULL, rval); + VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, 0, NULL, rval); return rval; } STATIC ssize_t xfs_file_splice_read_invis( struct file *infilp, - struct inode *pipe, + loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags) { vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_READ(vp, infilp, pipe, len, flags, IO_INVIS, NULL, rval); + VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, IO_INVIS, NULL, rval); return rval; } STATIC ssize_t xfs_file_splice_write( - struct inode *pipe, + struct pipe_inode_info *pipe, struct file *outfilp, + loff_t *ppos, size_t len, unsigned int flags) { vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_WRITE(vp, pipe, outfilp, len, flags, 0, NULL, rval); + VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, 0, NULL, rval); return rval; } STATIC ssize_t xfs_file_splice_write_invis( - struct inode *pipe, + struct pipe_inode_info *pipe, struct file *outfilp, + loff_t *ppos, size_t len, unsigned int flags) { vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode); ssize_t rval; - VOP_SPLICE_WRITE(vp, pipe, outfilp, len, flags, IO_INVIS, NULL, rval); + VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, IO_INVIS, NULL, rval); return rval; } diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index 149237304fb6..2e2e275c786f 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -673,8 +673,7 @@ xfs_vn_setattr( if (ia_valid & ATTR_ATIME) { vattr.va_mask |= XFS_AT_ATIME; vattr.va_atime = attr->ia_atime; - if (ia_valid & ATTR_ATIME_SET) - inode->i_atime = attr->ia_atime; + inode->i_atime = attr->ia_atime; } if (ia_valid & ATTR_MTIME) { vattr.va_mask |= XFS_AT_MTIME; diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 90cd314acbaa..67efe3308980 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -338,7 +338,8 @@ ssize_t xfs_splice_read( bhv_desc_t *bdp, struct file *infilp, - struct inode *pipe, + loff_t *ppos, + struct pipe_inode_info *pipe, size_t count, int flags, int ioflags, @@ -360,7 
+361,7 @@ xfs_splice_read( int error; error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), - infilp->f_pos, count, + *ppos, count, FILP_DELAY_FLAG(infilp), &locktype); if (error) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -368,8 +369,8 @@ xfs_splice_read( } } xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, &ip->i_iocore, - pipe, count, infilp->f_pos, ioflags); - ret = generic_file_splice_read(infilp, pipe, count, flags); + pipe, count, *ppos, ioflags); + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); @@ -380,8 +381,9 @@ xfs_splice_read( ssize_t xfs_splice_write( bhv_desc_t *bdp, - struct inode *pipe, + struct pipe_inode_info *pipe, struct file *outfilp, + loff_t *ppos, size_t count, int flags, int ioflags, @@ -403,7 +405,7 @@ xfs_splice_write( int error; error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, BHV_TO_VNODE(bdp), - outfilp->f_pos, count, + *ppos, count, FILP_DELAY_FLAG(outfilp), &locktype); if (error) { xfs_iunlock(ip, XFS_IOLOCK_EXCL); @@ -411,8 +413,8 @@ xfs_splice_write( } } xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore, - pipe, count, outfilp->f_pos, ioflags); - ret = generic_file_splice_write(pipe, outfilp, count, flags); + pipe, count, *ppos, ioflags); + ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); if (ret > 0) XFS_STATS_ADD(xs_write_bytes, ret); diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index eaa5659713fb..8f4539952350 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -93,11 +93,11 @@ extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *, extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); -extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, - struct inode *, size_t, int, int, +extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, loff_t *, + struct pipe_inode_info *, size_t, int, int, struct cred *); -extern ssize_t xfs_splice_write(struct bhv_desc *, struct inode *, - struct file *, size_t, int, int, +extern ssize_t xfs_splice_write(struct bhv_desc *, struct pipe_inode_info *, + struct file *, loff_t *, size_t, int, int, struct cred *); #endif /* __XFS_LRW_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 6f1c79a28f8b..2a8e16c22353 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -173,11 +173,11 @@ typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *, typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); -typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, - struct inode *, size_t, int, int, +typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, loff_t *, + struct pipe_inode_info *, size_t, int, int, struct cred *); -typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct inode *, - struct file *, size_t, int, int, +typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *, + struct file *, loff_t *, size_t, int, int, struct cred *); typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, int, unsigned int, void __user *); @@ -284,10 +284,10 @@ typedef struct vnodeops { rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr) #define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv) \ rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr) -#define VOP_SPLICE_READ(vp,f,pipe,cnt,fl,iofl,cr,rv) \ - 
rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,pipe,cnt,fl,iofl,cr) -#define VOP_SPLICE_WRITE(vp,f,pipe,cnt,fl,iofl,cr,rv) \ - rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,pipe,cnt,fl,iofl,cr) +#define VOP_SPLICE_READ(vp,f,o,pipe,cnt,fl,iofl,cr,rv) \ + rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr) +#define VOP_SPLICE_WRITE(vp,f,o,pipe,cnt,fl,iofl,cr,rv) \ + rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr) #define VOP_BMAP(vp,of,sz,rw,b,n,rv) \ rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n) #define VOP_OPEN(vp, cr, rv) \ diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 4eeb856183b1..deddbd03c166 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -158,9 +158,10 @@ xfs_ialloc_ag_alloc( */ agi = XFS_BUF_TO_AGI(agbp); newino = be32_to_cpu(agi->agi_newino); - if(likely(newino != NULLAGINO)) { - args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + - XFS_IALLOC_BLOCKS(args.mp); + args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + + XFS_IALLOC_BLOCKS(args.mp); + if (likely(newino != NULLAGINO && + (args.agbno < be32_to_cpu(agi->agi_length)))) { args.fsbno = XFS_AGB_TO_FSB(args.mp, be32_to_cpu(agi->agi_seqno), args.agbno); args.type = XFS_ALLOCTYPE_THIS_BNO; @@ -182,8 +183,8 @@ xfs_ialloc_ag_alloc( * Set the alignment for the allocation. * If stripe alignment is turned on then align at stripe unit * boundary. - * If the cluster size is smaller than a filesystem block - * then we're doing I/O for inodes in filesystem block size + * If the cluster size is smaller than a filesystem block + * then we're doing I/O for inodes in filesystem block size * pieces, so don't need alignment anyway. */ isaligned = 0; @@ -192,7 +193,7 @@ xfs_ialloc_ag_alloc( args.alignment = args.mp->m_dalign; isaligned = 1; } else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) && - args.mp->m_sb.sb_inoalignmt >= + args.mp->m_sb.sb_inoalignmt >= XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp))) args.alignment = args.mp->m_sb.sb_inoalignmt; @@ -220,7 +221,7 @@ xfs_ialloc_ag_alloc( if ((error = xfs_alloc_vextent(&args))) return error; } - + /* * If stripe alignment is turned on, then try again with cluster * alignment. 
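The fs/xfs/xfs_ialloc.c hunk above changes the inode-chunk allocator to compute the candidate block from agi_newino up front, and to request an exact-block (XFS_ALLOCTYPE_THIS_BNO) allocation only when that block still lies inside the allocation group; a hint pointing past the end of the AG now falls through to the normal search. A minimal standalone sketch of that guard follows; the types and names here are illustrative stand-ins, not the real xfs_alloc_arg_t interface:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define NULLAGINO ((uint32_t)-1)

struct agi_info {
	uint32_t agi_newino;	/* hint: most recently allocated inode chunk */
	uint32_t agi_length;	/* size of the allocation group, in blocks */
};

/*
 * Only use the block derived from agi_newino as an exact allocation
 * target when it still lies inside the AG; otherwise the caller must
 * fall back to a near/any-block search, as the patched code does.
 */
static bool pick_exact_target(const struct agi_info *agi,
			      uint32_t chunk_blocks, uint32_t *agbno)
{
	if (agi->agi_newino == NULLAGINO)
		return false;
	*agbno = agi->agi_newino + chunk_blocks;	/* simplified agino->agbno */
	return *agbno < agi->agi_length;		/* the new bounds check */
}

int main(void)
{
	struct agi_info agi = { .agi_newino = 100, .agi_length = 128 };
	uint32_t agbno;

	printf("target inside AG ok? %d\n", pick_exact_target(&agi, 16, &agbno));
	agi.agi_newino = 120;	/* hint so close to the AG end that the target overflows it */
	printf("target past AG ok? %d\n", pick_exact_target(&agi, 16, &agbno));
	return 0;
}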
diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index bb33113eef9f..b53854325266 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -421,7 +421,10 @@ finish_inode: ip->i_chash = chlnew; chlnew->chl_ip = ip; chlnew->chl_blkno = ip->i_blkno; + if (ch->ch_list) + ch->ch_list->chl_prev = chlnew; chlnew->chl_next = ch->ch_list; + chlnew->chl_prev = NULL; ch->ch_list = chlnew; chlnew = NULL; } @@ -723,23 +726,15 @@ xfs_iextract( ASSERT(ip->i_cnext == ip && ip->i_cprev == ip); ASSERT(ip->i_chash != NULL); chm=NULL; - for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) { - if (chl->chl_blkno == ip->i_blkno) { - if (chm == NULL) { - /* first item on the list */ - ch->ch_list = chl->chl_next; - } else { - chm->chl_next = chl->chl_next; - } - kmem_zone_free(xfs_chashlist_zone, chl); - break; - } else { - ASSERT(chl->chl_ip != ip); - chm = chl; - } - } - ASSERT_ALWAYS(chl != NULL); - } else { + chl = ip->i_chash; + if (chl->chl_prev) + chl->chl_prev->chl_next = chl->chl_next; + else + ch->ch_list = chl->chl_next; + if (chl->chl_next) + chl->chl_next->chl_prev = chl->chl_prev; + kmem_zone_free(xfs_chashlist_zone, chl); + } else { /* delete one inode from a non-empty list */ iq = ip->i_cnext; iq->i_cprev = ip->i_cprev; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 48146bdc6bdd..94b60dd03801 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -2732,16 +2732,29 @@ xfs_iunpin( ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) { - vnode_t *vp = XFS_ITOV_NULL(ip); + /* + * If the inode is currently being reclaimed, the + * linux inode _and_ the xfs vnode may have been + * freed so we cannot reference either of them safely. + * Hence we should not try to do anything to them + * if the xfs inode is currently in the reclaim + * path. + * + * However, we still need to issue the unpin wakeup + * call as the inode reclaim may be blocked waiting for + * the inode to become unpinned. + */ + if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) { + vnode_t *vp = XFS_ITOV_NULL(ip); - /* make sync come back and flush this inode */ - if (vp) { - struct inode *inode = vn_to_inode(vp); + /* make sync come back and flush this inode */ + if (vp) { + struct inode *inode = vn_to_inode(vp); - if (!(inode->i_state & I_NEW)) - mark_inode_dirty_sync(inode); + if (!(inode->i_state & I_NEW)) + mark_inode_dirty_sync(inode); + } } - wake_up(&ip->i_ipin_wait); } } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 39ef9c36ea55..3b544db1790b 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -189,6 +189,7 @@ typedef struct xfs_ihash { */ typedef struct xfs_chashlist { struct xfs_chashlist *chl_next; + struct xfs_chashlist *chl_prev; struct xfs_inode *chl_ip; xfs_daddr_t chl_blkno; /* starting block number of * the cluster */ diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 049fabb7f7e0..c0b1c2906880 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -270,7 +270,7 @@ xfs_mount_validate_sb( (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || - (sbp->sb_imax_pct > 100 || sbp->sb_imax_pct < 1))) { + (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); return XFS_ERROR(EFSCORRUPTED); }
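The fs/xfs/xfs_iget.c and fs/xfs/xfs_inode.h hunks above turn the per-cluster hash list into a doubly-linked list: the new chl_prev back pointer lets xfs_iextract() unlink an element in constant time instead of walking ch_list to find its predecessor. A minimal sketch of that unlink, using simplified stand-in types rather than the kernel structures:

#include <stdio.h>

struct chashlist {
	struct chashlist *chl_next;
	struct chashlist *chl_prev;	/* the new back pointer */
	long chl_blkno;
};

/*
 * O(1) unlink: with a back pointer there is no need to walk the list
 * from the head to find the predecessor, which is what the removed
 * for-loop in xfs_iextract() used to do.
 */
static void chash_unlink(struct chashlist **head, struct chashlist *chl)
{
	if (chl->chl_prev)
		chl->chl_prev->chl_next = chl->chl_next;
	else
		*head = chl->chl_next;	/* entry was first on the list */
	if (chl->chl_next)
		chl->chl_next->chl_prev = chl->chl_prev;
}

int main(void)
{
	struct chashlist a = { 0 }, b = { 0 };
	struct chashlist *head = &a;

	a.chl_blkno = 1; b.chl_blkno = 2;
	a.chl_next = &b; b.chl_prev = &a;

	chash_unlink(&head, &a);
	printf("head blkno after unlink: %ld\n", head->chl_blkno);	/* prints 2 */
	return 0;
}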