From 0e2bedaa394f74fa9f75ee937488c33d90039b5a Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 30 Jan 2009 21:24:41 +0000 Subject: [CIFS] ipv6_addr_equal for address comparison Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2209be943051..005df85219a8 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -35,6 +34,7 @@ #include #include #include +#include #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -1379,8 +1379,8 @@ cifs_find_tcp_session(struct sockaddr_storage *addr) server->addr.sockAddr.sin_addr.s_addr)) continue; else if (addr->ss_family == AF_INET6 && - memcmp(&server->addr.sockAddr6.sin6_addr, - &addr6->sin6_addr, sizeof(addr6->sin6_addr))) + !ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr, + &addr6->sin6_addr)) continue; ++server->srv_count; -- cgit v1.2.3 From 0bf2f3aec5474da80a60e1baca629af87ecb67b6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 Feb 2009 11:45:46 +0000 Subject: CRED: Fix SUID exec regression The patch: commit a6f76f23d297f70e2a6b3ec607f7aeeea9e37e8d CRED: Make execve() take advantage of copy-on-write credentials moved the place in which the 'safeness' of a SUID/SGID exec was performed to before de_thread() was called. This means that LSM_UNSAFE_SHARE is now calculated incorrectly. This flag is set if any of the usage counts for fs_struct, files_struct and sighand_struct are greater than 1 at the time the determination is made. All of which are true for threads created by the pthread library. However, since we wish to make the security calculation before irrevocably damaging the process so that we can return it an error code in the case where we decide we want to reject the exec request on this basis, we have to make the determination before calling de_thread(). So, instead, we count up the number of threads (CLONE_THREAD) that are sharing our fs_struct (CLONE_FS), files_struct (CLONE_FILES) and sighand_structs (CLONE_SIGHAND/CLONE_THREAD) with us. These will be killed by de_thread() and so can be discounted by check_unsafe_exec(). We do have to be careful because CLONE_THREAD does not imply FS or FILES. We _assume_ that there will be no extra references to these structs held by the threads we're going to kill. This can be tested with the attached pair of programs. Build the two programs using the Makefile supplied, and run ./test1 as a non-root user. If successful, you should see something like: [dhowells@andromeda tmp]$ ./test1 --TEST1-- uid=4043, euid=4043 suid=4043 exec ./test2 --TEST2-- uid=4043, euid=0 suid=0 SUCCESS - Correct effective user ID and if unsuccessful, something like: [dhowells@andromeda tmp]$ ./test1 --TEST1-- uid=4043, euid=4043 suid=4043 exec ./test2 --TEST2-- uid=4043, euid=4043 suid=4043 ERROR - Incorrect effective user ID! The non-root user ID you see will depend on the user you run as. 
[test1.c]

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>

static void *thread_func(void *arg)
{
	while (1) {}
}

int main(int argc, char **argv)
{
	pthread_t tid;
	uid_t uid, euid, suid;

	printf("--TEST1--\n");
	getresuid(&uid, &euid, &suid);
	printf("uid=%d, euid=%d suid=%d\n", uid, euid, suid);

	if (pthread_create(&tid, NULL, thread_func, NULL) < 0) {
		perror("pthread_create");
		exit(1);
	}

	printf("exec ./test2\n");
	execlp("./test2", "test2", NULL);
	perror("./test2");
	_exit(1);
}

[test2.c]

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	uid_t uid, euid, suid;

	getresuid(&uid, &euid, &suid);
	printf("--TEST2--\n");
	printf("uid=%d, euid=%d suid=%d\n", uid, euid, suid);

	if (euid != 0) {
		fprintf(stderr, "ERROR - Incorrect effective user ID!\n");
		exit(1);
	}
	printf("SUCCESS - Correct effective user ID\n");
	exit(0);
}

[Makefile]

CFLAGS = -D_GNU_SOURCE -Wall -Werror -Wunused

all: test1 test2

test1: test1.c
	gcc $(CFLAGS) -o test1 test1.c -lpthread

test2: test2.c
	gcc $(CFLAGS) -o test2 test2.c
	sudo chown root.root test2
	sudo chmod +s test2

Reported-by: David Smith
Signed-off-by: David Howells
Acked-by: David Smith
Signed-off-by: James Morris
---
 fs/compat.c   |  2 +-
 fs/exec.c     | 28 ++++++++++++++++++++++------
 fs/internal.h |  2 +-
 3 files changed, 24 insertions(+), 8 deletions(-)

(limited to 'fs')

diff --git a/fs/compat.c b/fs/compat.c
index 65a070e705ab..d0145ca27572 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -1407,7 +1407,7 @@ int compat_do_execve(char * filename,
 	bprm->cred = prepare_exec_creds();
 	if (!bprm->cred)
 		goto out_unlock;
-	check_unsafe_exec(bprm);
+	check_unsafe_exec(bprm, current->files);

 	file = open_exec(filename);
 	retval = PTR_ERR(file);
diff --git a/fs/exec.c b/fs/exec.c
index 0dd60a01f1b4..929b58004b7e 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1049,16 +1049,32 @@ EXPORT_SYMBOL(install_exec_creds);
  * - the caller must hold current->cred_exec_mutex to protect against
  *   PTRACE_ATTACH
  */
-void check_unsafe_exec(struct linux_binprm *bprm)
+void check_unsafe_exec(struct linux_binprm *bprm, struct files_struct *files)
 {
-	struct task_struct *p = current;
+	struct task_struct *p = current, *t;
+	unsigned long flags;
+	unsigned n_fs, n_files, n_sighand;

 	bprm->unsafe = tracehook_unsafe_exec(p);

-	if (atomic_read(&p->fs->count) > 1 ||
-	    atomic_read(&p->files->count) > 1 ||
-	    atomic_read(&p->sighand->count) > 1)
+	n_fs = 1;
+	n_files = 1;
+	n_sighand = 1;
+	lock_task_sighand(p, &flags);
+	for (t = next_thread(p); t != p; t = next_thread(t)) {
+		if (t->fs == p->fs)
+			n_fs++;
+		if (t->files == files)
+			n_files++;
+		n_sighand++;
+	}
+
+	if (atomic_read(&p->fs->count) > n_fs ||
+	    atomic_read(&p->files->count) > n_files ||
+	    atomic_read(&p->sighand->count) > n_sighand)
 		bprm->unsafe |= LSM_UNSAFE_SHARE;
+
+	unlock_task_sighand(p, &flags);
 }

 /*
@@ -1273,7 +1289,7 @@ int do_execve(char * filename,
 	bprm->cred = prepare_exec_creds();
 	if (!bprm->cred)
 		goto out_unlock;
-	check_unsafe_exec(bprm);
+	check_unsafe_exec(bprm, displaced);

 	file = open_exec(filename);
 	retval = PTR_ERR(file);
diff --git a/fs/internal.h b/fs/internal.h
index 53af885f1732..0d8ac497b3d5 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -43,7 +43,7 @@ extern void __init chrdev_init(void);
 /*
  * exec.c
  */
-extern void check_unsafe_exec(struct linux_binprm *);
+extern void check_unsafe_exec(struct linux_binprm *, struct files_struct *);

 /*
  * namespace.c
-- cgit v1.2.3

From 766ccb9ed406c230d13c145def08ebea1b932982 Mon Sep 17 00:00:00 2001
From: Cornelia Huck
Date: Tue, 20 Jan 2009 15:31:31 +0100
Subject: async: Rename _special -> _domain for
clarity. Rename the async_*_special() functions to async_*_domain(), which describes the purpose of these functions much better. [Broke up long lines to silence checkpatch] Signed-off-by: Cornelia Huck Signed-off-by: Arjan van de Ven --- fs/super.c | 4 ++-- include/linux/async.h | 8 +++++--- kernel/async.c | 41 ++++++++++++++++++++++------------------- 3 files changed, 29 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 645e5403f2a0..61dce001dd57 100644 --- a/fs/super.c +++ b/fs/super.c @@ -301,7 +301,7 @@ void generic_shutdown_super(struct super_block *sb) /* * wait for asynchronous fs operations to finish before going further */ - async_synchronize_full_special(&sb->s_async_list); + async_synchronize_full_domain(&sb->s_async_list); /* bad name - it should be evict_inodes() */ invalidate_inodes(sb); @@ -470,7 +470,7 @@ restart: sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - async_synchronize_full_special(&sb->s_async_list); + async_synchronize_full_domain(&sb->s_async_list); if (sb->s_root && (wait || sb->s_dirt)) sb->s_op->sync_fs(sb, wait); up_read(&sb->s_umount); diff --git a/include/linux/async.h b/include/linux/async.h index c4ecacd0b327..68a9530196f2 100644 --- a/include/linux/async.h +++ b/include/linux/async.h @@ -17,9 +17,11 @@ typedef u64 async_cookie_t; typedef void (async_func_ptr) (void *data, async_cookie_t cookie); extern async_cookie_t async_schedule(async_func_ptr *ptr, void *data); -extern async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *list); +extern async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data, + struct list_head *list); extern void async_synchronize_full(void); -extern void async_synchronize_full_special(struct list_head *list); +extern void async_synchronize_full_domain(struct list_head *list); extern void async_synchronize_cookie(async_cookie_t cookie); -extern void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *list); +extern void async_synchronize_cookie_domain(async_cookie_t cookie, + struct list_head *list); diff --git a/kernel/async.c b/kernel/async.c index b5f0d4b94937..e23399d88bac 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -224,22 +224,23 @@ async_cookie_t async_schedule(async_func_ptr *ptr, void *data) EXPORT_SYMBOL_GPL(async_schedule); /** - * async_schedule_special - schedule a function for asynchronous execution with a special running queue + * async_schedule_domain - schedule a function for asynchronous execution within a certain domain * @ptr: function to execute asynchronously * @data: data pointer to pass to the function - * @running: list head to add to while running + * @running: running list for the domain * * Returns an async_cookie_t that may be used for checkpointing later. - * @running may be used in the async_synchronize_*_special() functions - * to wait on a special running queue rather than on the global running - * queue. + * @running may be used in the async_synchronize_*_domain() functions + * to wait within a certain synchronization domain rather than globally. + * A synchronization domain is specified via the running queue @running to use. * Note: This function may be called from atomic or non-atomic contexts. 
 */
-async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running)
+async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
+				     struct list_head *running)
 {
 	return __async_schedule(ptr, data, running);
 }
-EXPORT_SYMBOL_GPL(async_schedule_special);
+EXPORT_SYMBOL_GPL(async_schedule_domain);

 /**
  * async_synchronize_full - synchronize all asynchronous function calls
@@ -255,27 +256,29 @@ void async_synchronize_full(void)
 EXPORT_SYMBOL_GPL(async_synchronize_full);

 /**
- * async_synchronize_full_special - synchronize all asynchronous function calls for a running list
+ * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
  * @list: running list to synchronize on
  *
- * This function waits until all asynchronous function calls for the running
- * list @list have been done.
+ * This function waits until all asynchronous function calls for the
+ * synchronization domain specified by the running list @list have been done.
  */
-void async_synchronize_full_special(struct list_head *list)
+void async_synchronize_full_domain(struct list_head *list)
 {
-	async_synchronize_cookie_special(next_cookie, list);
+	async_synchronize_cookie_domain(next_cookie, list);
 }
-EXPORT_SYMBOL_GPL(async_synchronize_full_special);
+EXPORT_SYMBOL_GPL(async_synchronize_full_domain);

 /**
- * async_synchronize_cookie_special - synchronize asynchronous function calls on a running list with cookie checkpointing
+ * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
  * @cookie: async_cookie_t to use as checkpoint
  * @running: running list to synchronize on
  *
- * This function waits until all asynchronous function calls for the running
- * list @list submitted prior to @cookie have been done.
+ * This function waits until all asynchronous function calls for the
+ * synchronization domain specified by the running list @list submitted
+ * prior to @cookie have been done.
  */
-void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie,
+				     struct list_head *running)
 {
 	ktime_t starttime, delta, endtime;
@@ -295,7 +298,7 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r
 			(long long)ktime_to_ns(delta) >> 10);
 	}
 }
-EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
+EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);

 /**
  * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing
@@ -306,7 +309,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
  */
 void async_synchronize_cookie(async_cookie_t cookie)
 {
-	async_synchronize_cookie_special(cookie, &async_running);
+	async_synchronize_cookie_domain(cookie, &async_running);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_cookie);
-- cgit v1.2.3

From 9d9b87c1218be78ddecbc85ec3bb91c79c1d56ab Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields"
Date: Wed, 4 Feb 2009 17:35:38 -0500
Subject: lockd: fix regression in lockd's handling of blocked locks

If a client requests a blocking lock, is denied, then requests it again, then here in nlmsvc_lock() we will call vfs_lock_file() without FL_SLEEP set, because we've already queued a block and don't need the locks code to do it again. But that means vfs_lock_file() will return -EAGAIN instead of FILE_LOCK_DEFERRED. So we still need to translate that -EAGAIN return into a nlm_lck_blocked error in this case, and put ourselves back on lockd's block list.
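In outline, the behaviour the fix establishes is the following (a simplified sketch of the nlmsvc_lock() flow, not the literal code; the exact changes are in the fs/lockd/svclock.c hunks below, and the surrounding cases are abbreviated):

	error = vfs_lock_file(file->f_file, F_SETLK, &lock->fl, NULL);
	switch (error) {
	case 0:				/* granted */
		ret = nlm_granted;
		goto out;
	case -EAGAIN:			/* contended: don't fail yet */
		break;
	...
	}

	ret = nlm_lck_denied;
	if (!wait)			/* non-blocking request: deny outright */
		goto out;
	ret = nlm_lck_blocked;		/* blocking request: stay on the block list */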
The bug was introduced by bde74e4bc64415b1 "locks: add special return value for asynchronous locks". Thanks to Frank van Maarseveen for the report; his original test case was essentially for i in `seq 30`; do flock /nfsmount/foo sleep 10 & done Tested-by: Frank van Maarseveen Reported-by: Frank van Maarseveen Cc: Miklos Szeredi Signed-off-by: J. Bruce Fields --- fs/lockd/svclock.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 6063a8e4b9f3..763b78a6e9de 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -427,7 +427,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; case -EAGAIN: ret = nlm_lck_denied; - goto out; + break; case FILE_LOCK_DEFERRED: if (wait) break; @@ -443,6 +443,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } + ret = nlm_lck_denied; + if (!wait) + goto out; + ret = nlm_lck_blocked; /* Append to list of blocked */ -- cgit v1.2.3 From 284b066af41579f62649048fdec5c5e7091703e6 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 9 Feb 2009 16:22:03 -0500 Subject: Btrfs: don't use spin_is_contended Btrfs was using spin_is_contended to see if it should drop locks before doing extent allocations during btrfs_search_slot. The idea was to avoid expensive searches in the tree unless the lock was actually contended. But, spin_is_contended is specific to the ticket spinlocks on x86, so this is causing compile errors everywhere else. In practice, the contention could easily appear some time after we started doing the extent allocation, and it makes more sense to always drop the lock instead. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 3 +-- fs/btrfs/locking.c | 22 ---------------------- fs/btrfs/locking.h | 2 -- 3 files changed, 1 insertion(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 551177c0011a..35443cc4b9a9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1530,8 +1530,7 @@ again: * for higher level blocks, try not to allocate blocks * with the block and the parent locks held. */ - if (level > 0 && !prealloc_block.objectid && - btrfs_path_lock_waiting(p, level)) { + if (level > 0 && !prealloc_block.objectid) { u32 size = b->len; u64 hint = b->start; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 68fd9ccf1805..9ebe9385129b 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -236,25 +236,3 @@ int btrfs_tree_locked(struct extent_buffer *eb) return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) || spin_is_locked(&eb->lock); } - -/* - * btrfs_search_slot uses this to decide if it should drop its locks - * before doing something expensive like allocating free blocks for cow. 
- */ -int btrfs_path_lock_waiting(struct btrfs_path *path, int level) -{ - int i; - struct extent_buffer *eb; - - for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { - eb = path->nodes[i]; - if (!eb) - break; - smp_mb(); - if (spin_is_contended(&eb->lock) || - waitqueue_active(&eb->lock_wq)) - return 1; - } - return 0; -} - diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index d92e707f5870..6bb0afbff928 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -26,8 +26,6 @@ int btrfs_tree_locked(struct extent_buffer *eb); int btrfs_try_tree_lock(struct extent_buffer *eb); int btrfs_try_spin_lock(struct extent_buffer *eb); -int btrfs_path_lock_waiting(struct btrfs_path *path, int level); - void btrfs_set_lock_blocking(struct extent_buffer *eb); void btrfs_clear_lock_blocking(struct extent_buffer *eb); #endif -- cgit v1.2.3 From c88ccea3143975294f5a52097546bcbb75975f52 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 10 Feb 2009 11:27:46 -0500 Subject: jbd2: Fix return value of jbd2_journal_start_commit() The function jbd2_journal_start_commit() returns 1 if either a transaction is committing or the function has queued a transaction commit. But it returns 0 if we raced with somebody queueing the transaction commit as well. This resulted in ext4_sync_fs() not functioning correctly (description from Arthur Jones): In the case of a data=ordered umount with pending long symlinks which are delayed due to a long list of other I/O on the backing block device, this causes the buffer associated with the long symlinks to not be moved to the inode dirty list in the second phase of fsync_super. Then, before they can be dirtied again, kjournald exits, seeing the UMOUNT flag and the dirty pages are never written to the backing block device, causing long symlink corruption and exposing new or previously freed block data to userspace. This can be reproduced with a script created by Eric Sandeen : #!/bin/bash umount /mnt/test2 mount /dev/sdb4 /mnt/test2 rm -f /mnt/test2/* dd if=/dev/zero of=/mnt/test2/bigfile bs=1M count=512 touch /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename ln -s /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename /mnt/test2/link umount /mnt/test2 mount /dev/sdb4 /mnt/test2 ls /mnt/test2/ This patch fixes jbd2_journal_start_commit() to always return 1 when there's a transaction committing or queued for commit. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" CC: Eric Sandeen CC: linux-ext4@vger.kernel.org --- fs/jbd2/journal.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index eb343008eded..58144102bf25 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -450,7 +450,7 @@ int __jbd2_log_space_left(journal_t *journal) } /* - * Called under j_state_lock. Returns true if a transaction was started. + * Called under j_state_lock. Returns true if a transaction commit was started. */ int __jbd2_log_start_commit(journal_t *journal, tid_t target) { @@ -518,7 +518,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal) /* * Start a commit of the current running transaction (if any). 
Returns true
- * if a transaction was started, and fills its tid in at *ptid
+ * if a transaction is going to be committed (or is currently already
+ * committing), and fills its tid in at *ptid
  */
 int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)
 {
@@ -528,15 +529,19 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid)

 	if (journal->j_running_transaction) {
 		tid_t tid = journal->j_running_transaction->t_tid;

-		ret = __jbd2_log_start_commit(journal, tid);
-		if (ret && ptid)
+		__jbd2_log_start_commit(journal, tid);
+		/* There's a running transaction and we've just made sure
+		 * it's commit has been scheduled. */
+		if (ptid)
 			*ptid = tid;
-	} else if (journal->j_committing_transaction && ptid) {
+		ret = 1;
+	} else if (journal->j_committing_transaction) {
 		/*
 		 * If ext3_write_super() recently started a commit, then we
 		 * have to wait for completion of that transaction
 		 */
-		*ptid = journal->j_committing_transaction->t_tid;
+		if (ptid)
+			*ptid = journal->j_committing_transaction->t_tid;
 		ret = 1;
 	}
 	spin_unlock(&journal->j_state_lock);
-- cgit v1.2.3

From 9eddacf9e9c03578ef2c07c9534423e823d677f8 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Tue, 10 Feb 2009 06:46:05 -0500
Subject: Revert "ext4: wait on all pending commits in ext4_sync_fs()"

This undoes commit 14ce0cb411c88681ab8f3a4c9caa7f42e97a3184.

Since jbd2_journal_start_commit() is now fixed to return 1 when we have started a transaction commit, some transaction is waiting to be committed, or a transaction is already committing, we don't need to call ext4_force_commit() in ext4_sync_fs(). Furthermore, ext4_force_commit() can unnecessarily create a sync transaction, which is expensive, so it's worthwhile to remove it when we can.

http://bugzilla.kernel.org/show_bug.cgi?id=12224

Signed-off-by: Jan Kara
Signed-off-by: "Theodore Ts'o"
Cc: Eric Sandeen
Cc: linux-ext4@vger.kernel.org
---
 fs/ext4/super.c | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e5f06a5f045e..a5732c58f676 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3046,14 +3046,17 @@ static void ext4_write_super(struct super_block *sb)
 static int ext4_sync_fs(struct super_block *sb, int wait)
 {
 	int ret = 0;
+	tid_t target;

 	trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
 	sb->s_dirt = 0;
 	if (EXT4_SB(sb)->s_journal) {
-		if (wait)
-			ret = ext4_force_commit(sb);
-		else
-			jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL);
+		if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
+					      &target)) {
+			if (wait)
+				jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
+						     target);
+		}
 	} else {
 		ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
 	}
-- cgit v1.2.3

From 7f5aa215088b817add9c71914b83650bdd49f8a9 Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Tue, 10 Feb 2009 11:15:34 -0500
Subject: jbd2: Avoid possible NULL dereference in jbd2_journal_begin_ordered_truncate()

If we race with the commit code setting i_transaction to NULL, we could possibly dereference it. Proper locking requires the journal pointer (to access journal->j_list_lock), which we don't have. So we have to change the prototype of the function so that the filesystem passes us the journal pointer. Also add a more detailed comment about why the function jbd2_journal_begin_ordered_truncate() does what it does and how it should be used.

Thanks to Dan Carpenter for pointing out the suspicious code.
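Seen from the calling filesystem, the prototype change boils down to this (a sketch mirroring the ext4 and ocfs2 hunks that follow):

	/* Before: the journal was derived from jinode->i_transaction, which
	 * the commit code can concurrently set to NULL. */
	jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, new_size);

	/* After: the caller passes the journal pointer it already knows. */
	jbd2_journal_begin_ordered_truncate(EXT4_SB(inode->i_sb)->s_journal,
					    &EXT4_I(inode)->jinode, new_size);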
Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Acked-by: Joel Becker CC: linux-ext4@vger.kernel.org CC: ocfs2-devel@oss.oracle.com CC: mfasheh@suse.de CC: Dan Carpenter --- fs/ext4/inode.c | 6 ++++-- fs/jbd2/transaction.c | 42 +++++++++++++++++++++++++++++++----------- fs/ocfs2/journal.h | 6 ++++-- include/linux/jbd2.h | 3 ++- 4 files changed, 41 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 03ba20be1329..658c4a7f2578 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -47,8 +47,10 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { - return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, - new_size); + return jbd2_journal_begin_ordered_truncate( + EXT4_SB(inode->i_sb)->s_journal, + &EXT4_I(inode)->jinode, + new_size); } static void ext4_invalidatepage(struct page *page, unsigned long offset); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 46b4e347ed7d..28ce21d8598e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2129,26 +2129,46 @@ done: } /* - * This function must be called when inode is journaled in ordered mode - * before truncation happens. It starts writeout of truncated part in - * case it is in the committing transaction so that we stand to ordered - * mode consistency guarantees. + * File truncate and transaction commit interact with each other in a + * non-trivial way. If a transaction writing data block A is + * committing, we cannot discard the data by truncate until we have + * written them. Otherwise if we crashed after the transaction with + * write has committed but before the transaction with truncate has + * committed, we could see stale data in block A. This function is a + * helper to solve this problem. It starts writeout of the truncated + * part in case it is in the committing transaction. + * + * Filesystem code must call this function when inode is journaled in + * ordered mode before truncation happens and after the inode has been + * placed on orphan list with the new inode size. The second condition + * avoids the race that someone writes new data and we start + * committing the transaction after this function has been called but + * before a transaction for truncate is started (and furthermore it + * allows us to optimize the case where the addition to orphan list + * happens in the same transaction as write --- we don't have to write + * any data in such case). 
*/ -int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, +int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *jinode, loff_t new_size) { - journal_t *journal; - transaction_t *commit_trans; + transaction_t *inode_trans, *commit_trans; int ret = 0; - if (!inode->i_transaction && !inode->i_next_transaction) + /* This is a quick check to avoid locking if not necessary */ + if (!jinode->i_transaction) goto out; - journal = inode->i_transaction->t_journal; + /* Locks are here just to force reading of recent values, it is + * enough that the transaction was not committing before we started + * a transaction adding the inode to orphan list */ spin_lock(&journal->j_state_lock); commit_trans = journal->j_committing_transaction; spin_unlock(&journal->j_state_lock); - if (inode->i_transaction == commit_trans) { - ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, + spin_lock(&journal->j_list_lock); + inode_trans = jinode->i_transaction; + spin_unlock(&journal->j_list_lock); + if (inode_trans == commit_trans) { + ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, new_size, LLONG_MAX); if (ret) jbd2_journal_abort(journal, ret); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 3c3532e1307c..172850a9a12a 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -513,8 +513,10 @@ static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) static inline int ocfs2_begin_ordered_truncate(struct inode *inode, loff_t new_size) { - return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode, - new_size); + return jbd2_journal_begin_ordered_truncate( + OCFS2_SB(inode->i_sb)->journal->j_journal, + &OCFS2_I(inode)->ip_jinode, + new_size); } #endif /* OCFS2_JOURNAL_H */ diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index b28b37eb11c6..4d248b3f1323 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1150,7 +1150,8 @@ extern int jbd2_journal_clear_err (journal_t *); extern int jbd2_journal_bmap(journal_t *, unsigned long, unsigned long long *); extern int jbd2_journal_force_commit(journal_t *); extern int jbd2_journal_file_inode(handle_t *handle, struct jbd2_inode *inode); -extern int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, loff_t new_size); +extern int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *inode, loff_t new_size); extern void jbd2_journal_init_jbd_inode(struct jbd2_inode *jinode, struct inode *inode); extern void jbd2_journal_release_jbd_inode(journal_t *journal, struct jbd2_inode *jinode); -- cgit v1.2.3 From 7be2baaa0322c59ba888aa5260a8c130666acd41 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 10 Feb 2009 09:53:42 -0500 Subject: ext4: Fix to read empty directory blocks correctly in 64k The rec_len field in the directory entry is 16 bits, so there was a problem representing rec_len for filesystems with a 64k block size in the case where the directory entry takes the entire 64k block. Unfortunately, there were two schemes that were proposed; one where all zeros meant 65536 and one where all ones (65535) meant 65536. E2fsprogs used 0, whereas the kernel used 65535. Oops. Fortunately this case happens extremely rarely, with the most common case being the lost+found directory, created by mke2fs. So we will be liberal in what we accept, and accept both encodings, but we will continue to encode 65536 as 65535. 
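Expressed as a standalone helper, the decode rule being adopted is roughly this (a sketch of the one-line change in the fs/ext4/ext4.h hunk below; byte-order conversion is omitted, and EXT4_MAX_REC_LEN is 65535):

	static inline unsigned rec_len_from_disk(unsigned len)
	{
		/*
		 * Accept both historical encodings of "entry fills a
		 * 64k block": e2fsprogs wrote 0, the kernel wrote
		 * EXT4_MAX_REC_LEN (65535).
		 */
		if (len == 65535 || len == 0)
			return 1 << 16;
		return len;
	}

The encoding direction is unchanged: a 65536-byte entry continues to be written to disk as 65535.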
This will require a change in e2fsprogs, but fortunately ext4 filesystems normally have the dir_index feature enabled, which precludes having a completely empty directory block.

Signed-off-by: Wei Yongjun
Signed-off-by: "Theodore Ts'o"
---
 fs/ext4/ext4.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index aafc9eba1c25..b0c87dce66a3 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -868,7 +868,7 @@ static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
 {
 	unsigned len = le16_to_cpu(dlen);

-	if (len == EXT4_MAX_REC_LEN)
+	if (len == EXT4_MAX_REC_LEN || len == 0)
 		return 1 << 16;
 	return len;
 }
-- cgit v1.2.3

From ba4439165f0f0d25b2fe065cf0c1ff8130b802eb Mon Sep 17 00:00:00 2001
From: "Aneesh Kumar K.V"
Date: Tue, 10 Feb 2009 11:14:34 -0500
Subject: ext4: Fix lockdep warning

We should not call ext4_mb_add_n_trim while holding alloc_semp.

=============================================
[ INFO: possible recursive locking detected ]
2.6.29-rc4-git1-dirty #124
---------------------------------------------
ffsb/3116 is trying to acquire lock:
 (&meta_group_info[i]->alloc_sem){----}, at: [] ext4_mb_load_buddy+0xd2/0x343

but task is already holding lock:
 (&meta_group_info[i]->alloc_sem){----}, at: [] ext4_mb_load_buddy+0xd2/0x343

http://bugzilla.kernel.org/show_bug.cgi?id=12672

Signed-off-by: Aneesh Kumar K.V
Signed-off-by: "Theodore Ts'o"
---
 fs/ext4/mballoc.c | 29 ++++++++++++++++------------
 1 file changed, 16 insertions(+), 13 deletions(-)

(limited to 'fs')

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index deba54f6cbed..c962d0690505 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4476,23 +4476,26 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 			pa->pa_free -= ac->ac_b_ex.fe_len;
 			pa->pa_len -= ac->ac_b_ex.fe_len;
 			spin_unlock(&pa->pa_lock);
-			/*
-			 * We want to add the pa to the right bucket.
-			 * Remove it from the list and while adding
-			 * make sure the list to which we are adding
-			 * doesn't grow big.
-			 */
-			if (likely(pa->pa_free)) {
-				spin_lock(pa->pa_obj_lock);
-				list_del_rcu(&pa->pa_inode_list);
-				spin_unlock(pa->pa_obj_lock);
-				ext4_mb_add_n_trim(ac);
-			}
 		}
-		ext4_mb_put_pa(ac, ac->ac_sb, pa);
 	}
 	if (ac->alloc_semp)
 		up_read(ac->alloc_semp);
+	if (pa) {
+		/*
+		 * We want to add the pa to the right bucket.
+		 * Remove it from the list and while adding
+		 * make sure the list to which we are adding
+		 * doesn't grow big. We need to release
+		 * alloc_semp before calling ext4_mb_add_n_trim()
+		 */
+		if (pa->pa_linear && likely(pa->pa_free)) {
+			spin_lock(pa->pa_obj_lock);
+			list_del_rcu(&pa->pa_inode_list);
+			spin_unlock(pa->pa_obj_lock);
+			ext4_mb_add_n_trim(ac);
+		}
+		ext4_mb_put_pa(ac, ac->ac_sb, pa);
+	}
 	if (ac->ac_bitmap_page)
 		page_cache_release(ac->ac_bitmap_page);
 	if (ac->ac_buddy_page)
-- cgit v1.2.3

From 5a6fe125950676015f5108fb71b2a67441755003 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Tue, 10 Feb 2009 14:02:27 +0000
Subject: Do not account for the address space used by hugetlbfs using VM_ACCOUNT

When overcommit is disabled, the core VM accounts for pages used by anonymous shared, private mappings and special mappings. It keeps track of VMAs that should be accounted for with VM_ACCOUNT and VMAs that never had a reserve with VM_NORESERVE.

Overcommit for hugetlbfs is much riskier than overcommit for base pages due to contiguity requirements. It avoids overcommitting on both shared and private mappings using reservation counters that are checked and updated during mmap().
This ensures (within limits) that hugepages exist in the future when faults occur; otherwise it is too easy for applications to be SIGKILLed.

As hugetlbfs makes its own reservations of a different unit to the base page size, VM_ACCOUNT should never be set. Even if the units were correct, we would double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may be set because an application can request no reserves be made for hugetlbfs at the risk of getting killed later.

With commit fc8744adc870a8d4366908221508bb113d8b72ee, VM_NORESERVE and VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This breaks the accounting for both the core VM and hugetlbfs: it can trigger an OOM storm when hugepage pools are too small, and causes lockups and corrupted counters otherwise.

This patch brings hugetlbfs more in line with how the core VM treats VM_NORESERVE but prevents VM_ACCOUNT from being set.

Signed-off-by: Mel Gorman
Signed-off-by: Linus Torvalds
---
 fs/hugetlbfs/inode.c    |  8 +++++---
 include/linux/hugetlb.h |  5 +++--
 include/linux/mm.h      |  3 +--
 ipc/shm.c               |  8 +++++---
 mm/fremap.c             |  2 +-
 mm/hugetlb.c            | 39 +++++++++++++++++++++++++--------------
 mm/mmap.c               | 38 ++++++++++++++++++++++----------------
 mm/mprotect.c           |  5 +++--
 8 files changed, 65 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 6903d37af037..9b800d97a687 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)

 	if (hugetlb_reserve_pages(inode,
 				vma->vm_pgoff >> huge_page_order(h),
-				len >> huge_page_shift(h), vma))
+				len >> huge_page_shift(h), vma,
+				vma->vm_flags))
 		goto out;

 	ret = 0;
@@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void)
 			can_do_mlock());
 }

-struct file *hugetlb_file_setup(const char *name, size_t size)
+struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
 {
 	int error = -ENOMEM;
 	struct file *file;
@@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
 	error = -ENOMEM;
 	if (hugetlb_reserve_pages(inode, 0,
-			size >> huge_page_shift(hstate_inode(inode)), NULL))
+			size >> huge_page_shift(hstate_inode(inode)), NULL,
+			acctflag))
 		goto out_inode;

 	d_instantiate(dentry, inode);
diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h
index f1d2fba19ea0..af09660001c7 100644
--- a/include/linux/hugetlb.h
+++ b/include/linux/hugetlb.h
@@ -33,7 +33,8 @@ unsigned long hugetlb_total_pages(void);
 int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			unsigned long address, int write_access);
 int hugetlb_reserve_pages(struct inode *inode, long from, long to,
-						struct vm_area_struct *vma);
+						struct vm_area_struct *vma,
+						int acctflags);
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
 extern unsigned long hugepages_treat_as_movable;
@@ -138,7 +139,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
 extern const struct file_operations hugetlbfs_file_operations;
 extern struct vm_operations_struct hugetlb_vm_ops;
-struct file *hugetlb_file_setup(const char *name, size_t);
+struct file *hugetlb_file_setup(const char *name, size_t, int);
 int hugetlb_get_quota(struct address_space *mapping, long delta);
 void hugetlb_put_quota(struct address_space *mapping, long delta);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index e8ddc98b8405..323561582c10 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1129,8 +1129,7 @@ extern
unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, unsigned long flag, unsigned long pgoff); extern unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - unsigned int vm_flags, unsigned long pgoff, - int accountable); + unsigned int vm_flags, unsigned long pgoff); static inline unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, diff --git a/ipc/shm.c b/ipc/shm.c index f8f69fad3a27..05d51d2a792c 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -340,6 +340,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) struct file * file; char name[13]; int id; + int acctflag = 0; if (size < SHMMIN || size > ns->shm_ctlmax) return -EINVAL; @@ -364,11 +365,12 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) sprintf (name, "SYSV%08x", key); if (shmflg & SHM_HUGETLB) { - /* hugetlb_file_setup takes care of mlock user accounting */ - file = hugetlb_file_setup(name, size); + /* hugetlb_file_setup applies strict accounting */ + if (shmflg & SHM_NORESERVE) + acctflag = VM_NORESERVE; + file = hugetlb_file_setup(name, size, acctflag); shp->mlock_user = current_user(); } else { - int acctflag = 0; /* * Do not allow no accounting for OVERCOMMIT_NEVER, even * if it's asked for. diff --git a/mm/fremap.c b/mm/fremap.c index 736ba7f3306a..b6ec85abbb39 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -198,7 +198,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, flags &= MAP_NONBLOCK; get_file(file); addr = mmap_region(file, start, size, - flags, vma->vm_flags, pgoff, 1); + flags, vma->vm_flags, pgoff); fput(file); if (IS_ERR_VALUE(addr)) { err = addr; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 618e98304080..207464209546 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2269,14 +2269,12 @@ void hugetlb_change_protection(struct vm_area_struct *vma, int hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int acctflag) { - long ret, chg; + long ret = 0, chg; struct hstate *h = hstate_inode(inode); - if (vma && vma->vm_flags & VM_NORESERVE) - return 0; - /* * Shared mappings base their reservation on the number of pages that * are already allocated on behalf of the file. Private mappings need @@ -2285,22 +2283,25 @@ int hugetlb_reserve_pages(struct inode *inode, */ if (!vma || vma->vm_flags & VM_SHARED) chg = region_chg(&inode->i_mapping->private_list, from, to); - else { - struct resv_map *resv_map = resv_map_alloc(); - if (!resv_map) - return -ENOMEM; - + else chg = to - from; - set_vma_resv_map(vma, resv_map); - set_vma_resv_flags(vma, HPAGE_RESV_OWNER); - } - if (chg < 0) return chg; if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; + + /* + * Only apply hugepage reservation if asked. 
We still have to + * take the filesystem quota because it is an upper limit + * defined for the mount and not necessarily memory as a whole + */ + if (acctflag & VM_NORESERVE) { + reset_vma_resv_huge_pages(vma); + return 0; + } + ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); @@ -2308,6 +2309,16 @@ int hugetlb_reserve_pages(struct inode *inode, } if (!vma || vma->vm_flags & VM_SHARED) region_add(&inode->i_mapping->private_list, from, to); + else { + struct resv_map *resv_map = resv_map_alloc(); + + if (!resv_map) + return -ENOMEM; + + set_vma_resv_map(vma, resv_map); + set_vma_resv_flags(vma, HPAGE_RESV_OWNER); + } + return 0; } diff --git a/mm/mmap.c b/mm/mmap.c index 214b6a258eeb..eb1270bebe67 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -918,7 +918,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, struct inode *inode; unsigned int vm_flags; int error; - int accountable = 1; unsigned long reqprot = prot; /* @@ -1019,8 +1018,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, return -EPERM; vm_flags &= ~VM_MAYEXEC; } - if (is_file_hugepages(file)) - accountable = 0; if (!file->f_op || !file->f_op->mmap) return -ENODEV; @@ -1053,8 +1050,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, if (error) return error; - return mmap_region(file, addr, len, flags, vm_flags, pgoff, - accountable); + return mmap_region(file, addr, len, flags, vm_flags, pgoff); } EXPORT_SYMBOL(do_mmap_pgoff); @@ -1092,17 +1088,23 @@ int vma_wants_writenotify(struct vm_area_struct *vma) /* * We account for memory if it's a private writeable mapping, - * and VM_NORESERVE wasn't set. + * not hugepages and VM_NORESERVE wasn't set. */ -static inline int accountable_mapping(unsigned int vm_flags) +static inline int accountable_mapping(struct file *file, unsigned int vm_flags) { + /* + * hugetlb has its own accounting separate from the core VM + * VM_HUGETLB may not be set yet so we cannot check for that flag. + */ + if (file && is_file_hugepages(file)) + return 0; + return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long len, unsigned long flags, - unsigned int vm_flags, unsigned long pgoff, - int accountable) + unsigned int vm_flags, unsigned long pgoff) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; @@ -1128,18 +1130,22 @@ munmap_back: /* * Set 'VM_NORESERVE' if we should not account for the - * memory use of this mapping. We only honor MAP_NORESERVE - * if we're allowed to overcommit memory. + * memory use of this mapping. 
*/ - if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) - vm_flags |= VM_NORESERVE; - if (!accountable) - vm_flags |= VM_NORESERVE; + if ((flags & MAP_NORESERVE)) { + /* We honor MAP_NORESERVE if allowed to overcommit */ + if (sysctl_overcommit_memory != OVERCOMMIT_NEVER) + vm_flags |= VM_NORESERVE; + + /* hugetlb applies strict overcommit unless MAP_NORESERVE */ + if (file && is_file_hugepages(file)) + vm_flags |= VM_NORESERVE; + } /* * Private writable mapping: check memory availability */ - if (accountable_mapping(vm_flags)) { + if (accountable_mapping(file, vm_flags)) { charged = len >> PAGE_SHIFT; if (security_vm_enough_memory(charged)) return -ENOMEM; diff --git a/mm/mprotect.c b/mm/mprotect.c index abe2694e13f4..258197b76fb4 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -151,10 +151,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, /* * If we make a private mapping writable we increase our commit; * but (without finer accounting) cannot reduce our commit if we - * make it unwritable again. + * make it unwritable again. hugetlb mapping were accounted for + * even if read-only so there is no need to account for them here */ if (newflags & VM_WRITE) { - if (!(oldflags & (VM_ACCOUNT|VM_WRITE| + if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB| VM_SHARED|VM_NORESERVE))) { charged = nrpages; if (security_vm_enough_memory(charged)) -- cgit v1.2.3 From 8fe4cd0dc5ea43760c59eb256404188272cc95dd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 Feb 2009 13:04:25 -0800 Subject: jbd: fix return value of journal_start_commit() journal_start_commit() returns 1 if either a transaction is committing or the function has queued a transaction commit. But it returns 0 if we raced with somebody queueing the transaction commit as well. This resulted in ext3_sync_fs() not functioning correctly (description from Arthur Jones): In the case of a data=ordered umount with pending long symlinks which are delayed due to a long list of other I/O on the backing block device, this causes the buffer associated with the long symlinks to not be moved to the inode dirty list in the second phase of fsync_super. Then, before they can be dirtied again, kjournald exits, seeing the UMOUNT flag and the dirty pages are never written to the backing block device, causing long symlink corruption and exposing new or previously freed block data to userspace. This can be reproduced with a script created by Eric Sandeen : #!/bin/bash umount /mnt/test2 mount /dev/sdb4 /mnt/test2 rm -f /mnt/test2/* dd if=/dev/zero of=/mnt/test2/bigfile bs=1M count=512 touch /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename ln -s /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename /mnt/test2/link umount /mnt/test2 mount /dev/sdb4 /mnt/test2 ls /mnt/test2/ This patch fixes journal_start_commit() to always return 1 when there's a transaction committing or queued for commit. Cc: Eric Sandeen Cc: Mike Snitzer Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/journal.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 9e4fa52d7dc8..e79c07812afa 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -427,7 +427,7 @@ int __log_space_left(journal_t *journal) } /* - * Called under j_state_lock. Returns true if a transaction was started. + * Called under j_state_lock. 
Returns true if a transaction commit was started. */ int __log_start_commit(journal_t *journal, tid_t target) { @@ -495,7 +495,8 @@ int journal_force_commit_nested(journal_t *journal) /* * Start a commit of the current running transaction (if any). Returns true - * if a transaction was started, and fills its tid in at *ptid + * if a transaction is going to be committed (or is currently already + * committing), and fills its tid in at *ptid */ int journal_start_commit(journal_t *journal, tid_t *ptid) { @@ -505,15 +506,19 @@ int journal_start_commit(journal_t *journal, tid_t *ptid) if (journal->j_running_transaction) { tid_t tid = journal->j_running_transaction->t_tid; - ret = __log_start_commit(journal, tid); - if (ret && ptid) + __log_start_commit(journal, tid); + /* There's a running transaction and we've just made sure + * it's commit has been scheduled. */ + if (ptid) *ptid = tid; - } else if (journal->j_committing_transaction && ptid) { + ret = 1; + } else if (journal->j_committing_transaction) { /* * If ext3_write_super() recently started a commit, then we * have to wait for completion of that transaction */ - *ptid = journal->j_committing_transaction->t_tid; + if (ptid) + *ptid = journal->j_committing_transaction->t_tid; ret = 1; } spin_unlock(&journal->j_state_lock); -- cgit v1.2.3 From 02ac597c9b86af49b2016aa98aee20ab59dbf0d2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 Feb 2009 13:04:26 -0800 Subject: ext3: revert "ext3: wait on all pending commits in ext3_sync_fs" This reverts commit c87591b719737b4e91eb1a9fa8fd55a4ff1886d6. Since journal_start_commit() is now fixed to return 1 when we started a transaction commit, there's some transaction waiting to be committed or there's a transaction already committing, we don't need to call ext3_force_commit() in ext3_sync_fs(). Furthermore ext3_force_commit() can unnecessarily create sync transaction which is expensive so it's worthwhile to remove it when we can. Cc: Eric Sandeen Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/super.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index b70d90e08a3c..4a970411a458 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2428,12 +2428,13 @@ static void ext3_write_super (struct super_block * sb) static int ext3_sync_fs(struct super_block *sb, int wait) { - sb->s_dirt = 0; - if (wait) - ext3_force_commit(sb); - else - journal_start_commit(EXT3_SB(sb)->s_journal, NULL); + tid_t target; + sb->s_dirt = 0; + if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { + if (wait) + log_wait_commit(EXT3_SB(sb)->s_journal, target); + } return 0; } -- cgit v1.2.3 From 0e4a9b59282914fe057ab17027f55123964bc2e2 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 11 Feb 2009 13:04:37 -0800 Subject: ext2/xip: refuse to change xip flag during remount with busy inodes For a reason that I was unable to understand in three months of debugging, mount ext2 -o remount stopped working properly when remounting from regular operation to xip, or the other way around. According to a git bisect search, the problem was introduced with the VM_MIXEDMAP/PTE_SPECIAL rework in the vm: commit 70688e4dd1647f0ceb502bbd5964fa344c5eb411 Author: Nick Piggin Date: Mon Apr 28 02:13:02 2008 -0700 xip: support non-struct page backed memory In the failing scenario, the filesystem is mounted read only via root= kernel parameter on s390x. 
During remount (in rc.sysinit), the inodes of the bash binary and its libraries are busy and cannot be invalidated (the bash which is running rc.sysinit resides on the subject filesystem). Afterwards, another bash process (running ifup-eth) recurses into a subshell and runs dup_mm (via fork). Some of the mappings in this bash process were created from inodes that could not be invalidated during remount.

Both parent and child process crash some time later due to inconsistencies in their address spaces. The issue seems to be timing-sensitive; various attempts to recreate it have failed.

This patch refuses to change the xip flag during remount in case some inodes cannot be invalidated, which keeps users from running into that issue.

[akpm@linux-foundation.org: cleanup]
Signed-off-by: Carsten Otte
Cc: Nick Piggin
Cc: Jared Hulbert
Cc: Martin Schwidefsky
Cc: Heiko Carstens
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/ext2/super.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index da8bdeaa2e6d..7c6e3606f0ec 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -1185,9 +1185,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data)
 	es = sbi->s_es;
 	if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) !=
 	    (old_mount_opt & EXT2_MOUNT_XIP)) &&
-	    invalidate_inodes(sb))
-		ext2_warning(sb, __func__, "busy inodes while remounting "\
-			     "xip remain in cache (no functional problem)");
+	    invalidate_inodes(sb)) {
+		ext2_warning(sb, __func__, "refusing change of xip flag "
+			     "with busy inodes while remounting");
+		sbi->s_mount_opt &= ~EXT2_MOUNT_XIP;
+		sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP;
+	}
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
 	if (*flags & MS_RDONLY) {
-- cgit v1.2.3

From eb099670895f22970cd143875467c2768d6d87e5 Mon Sep 17 00:00:00 2001
From: Josef Bacik
Date: Thu, 12 Feb 2009 09:27:38 -0500
Subject: Btrfs: make sure all pending extent operations are complete

There's a slight problem with finish_current_insert: if we set all to 1 and then go through and don't actually skip any of the extents on the pending list, we could exit right after we've added new extents. This is a problem because by inserting the new extents we could have caused new COWs to happen and such, so we may have some pending updates to do or even more inserts to do after that.

So this patch will only exit if we have never skipped any of the extents in the pending list and we have no extents to insert; this will make sure that all of the pending work is truly done before we return. I've been running with this patch for a few days with all of my other testing and have not seen issues.
Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 71 +++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7527523c2d2d..376656f65b33 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1323,8 +1323,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, int btrfs_extent_post_op(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - finish_current_insert(trans, root->fs_info->extent_root, 1); - del_pending_extents(trans, root->fs_info->extent_root, 1); + u64 start; + u64 end; + int ret; + + while(1) { + finish_current_insert(trans, root->fs_info->extent_root, 1); + del_pending_extents(trans, root->fs_info->extent_root, 1); + + /* is there more work to do? */ + ret = find_first_extent_bit(&root->fs_info->pending_del, + 0, &start, &end, EXTENT_WRITEBACK); + if (!ret) + continue; + ret = find_first_extent_bit(&root->fs_info->extent_ins, + 0, &start, &end, EXTENT_WRITEBACK); + if (!ret) + continue; + break; + } return 0; } @@ -2211,13 +2228,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, u64 end; u64 priv; u64 search = 0; - u64 skipped = 0; struct btrfs_fs_info *info = extent_root->fs_info; struct btrfs_path *path; struct pending_extent_op *extent_op, *tmp; struct list_head insert_list, update_list; int ret; - int num_inserts = 0, max_inserts; + int num_inserts = 0, max_inserts, restart = 0; path = btrfs_alloc_path(); INIT_LIST_HEAD(&insert_list); @@ -2233,19 +2249,19 @@ again: ret = find_first_extent_bit(&info->extent_ins, search, &start, &end, EXTENT_WRITEBACK); if (ret) { - if (skipped && all && !num_inserts && + if (restart && !num_inserts && list_empty(&update_list)) { - skipped = 0; + restart = 0; search = 0; continue; } - mutex_unlock(&info->extent_ins_mutex); break; } ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); if (!ret) { - skipped = 1; + if (all) + restart = 1; search = end + 1; if (need_resched()) { mutex_unlock(&info->extent_ins_mutex); @@ -2264,7 +2280,7 @@ again: list_add_tail(&extent_op->list, &insert_list); search = end + 1; if (num_inserts == max_inserts) { - mutex_unlock(&info->extent_ins_mutex); + restart = 1; break; } } else if (extent_op->type == PENDING_BACKREF_UPDATE) { @@ -2280,7 +2296,6 @@ again: * somebody marked this thing for deletion then just unlock it and be * done, the free_extents will handle it */ - mutex_lock(&info->extent_ins_mutex); list_for_each_entry_safe(extent_op, tmp, &update_list, list) { clear_extent_bits(&info->extent_ins, extent_op->bytenr, extent_op->bytenr + extent_op->num_bytes - 1, @@ -2302,6 +2317,10 @@ again: if (!list_empty(&update_list)) { ret = update_backrefs(trans, extent_root, path, &update_list); BUG_ON(ret); + + /* we may have COW'ed new blocks, so lets start over */ + if (all) + restart = 1; } /* @@ -2309,9 +2328,9 @@ again: * need to make sure everything is cleaned then reset everything and * go back to the beginning */ - if (!num_inserts && all && skipped) { + if (!num_inserts && restart) { search = 0; - skipped = 0; + restart = 0; INIT_LIST_HEAD(&update_list); INIT_LIST_HEAD(&insert_list); goto again; @@ -2368,27 +2387,19 @@ again: BUG_ON(ret); /* - * if we broke out of the loop in order to insert stuff because we hit - * the maximum number of inserts at a time we can handle, then loop - * back and pick up where we left off + * if restart is set for whatever reason we need to go back and start + * searching 
through the pending list again.
+	 *
+	 * We just inserted some extents, which could have resulted in new
+	 * blocks being allocated, which would result in new blocks needing
+	 * updates, so if all is set we _must_ restart to get the updated
+	 * blocks.
 	 */
-	if (num_inserts == max_inserts) {
-		INIT_LIST_HEAD(&insert_list);
-		INIT_LIST_HEAD(&update_list);
-		num_inserts = 0;
-		goto again;
-	}
-
-	/*
-	 * again, if we need to make absolutely sure there are no more pending
-	 * extent operations left and we know that we skipped some, go back to
-	 * the beginning and do it all again
-	 */
-	if (all && skipped) {
+	if (restart || all) {
 		INIT_LIST_HEAD(&insert_list);
 		INIT_LIST_HEAD(&update_list);
 		search = 0;
-		skipped = 0;
+		restart = 0;
 		num_inserts = 0;
 		goto again;
 	}
@@ -2709,6 +2720,8 @@ again:
 		goto again;
 	}

+	if (!err)
+		finish_current_insert(trans, extent_root, 0);
 	return err;
 }
-- cgit v1.2.3

From b288052e1779261ae80138074989ef50358c4e58 Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Thu, 12 Feb 2009 09:37:35 -0500
Subject: Btrfs: process mount options on mount -o remount,

Btrfs wasn't parsing any new mount options during remount, making it difficult to set mount options on a root drive.

Signed-off-by: Chris Mason
---
 fs/btrfs/super.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f3fd7e2cbc38..66b8341e2dba 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -511,6 +511,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data)
 	struct btrfs_root *root = btrfs_sb(sb);
 	int ret;

+	ret = btrfs_parse_options(root, data);
+	if (ret)
+		return -EINVAL;
+
 	if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
 		return 0;
-- cgit v1.2.3

From 536ac8ae86e68bb5574d7cc81c7d229a86b82601 Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Thu, 12 Feb 2009 09:41:38 -0500
Subject: Btrfs: use larger metadata clusters in ssd mode

Larger metadata clusters can significantly improve writeback performance on ssd drives with large erasure blocks. The larger clusters make it more likely a given IO will completely overwrite the ssd block, so it doesn't have to do an internal rmw cycle.

On spinning media, larger metadata clusters end up spreading out the metadata more over time, which makes fsck slower, so we don't want this to be the default.

Signed-off-by: Chris Mason
---
 fs/btrfs/extent-tree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 376656f65b33..c59e12036e20 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2872,7 +2872,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans,

 	if (data & BTRFS_BLOCK_GROUP_METADATA) {
 		last_ptr = &root->fs_info->last_alloc;
-		empty_cluster = 64 * 1024;
+		if (!btrfs_test_opt(root, SSD))
+			empty_cluster = 64 * 1024;
 	}

 	if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD))
-- cgit v1.2.3

From e1df36d2f18254d0690a0fbe036cece74ec311b8 Mon Sep 17 00:00:00 2001
From: Chris Mason
Date: Thu, 12 Feb 2009 09:45:08 -0500
Subject: Btrfs: don't clean old snapshots on sync(1)

Cleaning old snapshots can make sync(1) somewhat slow, and some users and applications still use it in a global fsync kind of workload. This patch changes btrfs not to clean old snapshots during sync, which is safe from a FS consistency point of view. The major downside is that it makes it difficult to tell when old snapshots have been reaped and the space they were using has been reclaimed.
A new ioctl will be added for this purpose instead. Signed-off-by: Chris Mason --- fs/btrfs/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 66b8341e2dba..19a4daf03ccb 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -379,7 +379,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_start_delalloc_inodes(root); btrfs_wait_ordered_extents(root, 0); - btrfs_clean_old_snapshots(root); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); sb->s_dirt = 0; -- cgit v1.2.3 From b335b0034e252e79ec2e9c6697f5d663c4627bec Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Thu, 12 Feb 2009 10:06:04 -0500 Subject: Btrfs: Avoid using __GFP_HIGHMEM with slab allocator btrfs_releasepage may call kmem_cache_alloc indirectly, passing the same GFP flags it receives down to kmem_cache_alloc. This made it possible for __GFP_HIGHMEM to be passed to the slab allocator, which does not support highmem allocations. Signed-off-by: Yan Zheng --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8f0706210a47..638bcb5e49f6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4263,7 +4263,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) { if (PageWriteback(page) || PageDirty(page)) return 0; - return __btrfs_releasepage(page, gfp_flags); + return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); } static void btrfs_invalidatepage(struct page *page, unsigned long offset) -- cgit v1.2.3 From 7951f3cefbd711f4429a0cd014aa83a844c399a0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 12 Feb 2009 10:06:15 -0500 Subject: Btrfs: balance_level checks !child after access The BUG_ON() is in the wrong spot: child was dereferenced by btrfs_tree_lock() before the NULL check ran. Signed-off-by: Jeff Mahoney Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 35443cc4b9a9..6674692f7023 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -917,9 +917,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); + BUG_ON(!child); btrfs_tree_lock(child); btrfs_set_lock_blocking(child); - BUG_ON(!child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); BUG_ON(ret); -- cgit v1.2.3 From e00f7308658622fbd483cb0d9fe41165bf9050d0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 12 Feb 2009 14:11:25 -0500 Subject: Btrfs: remove btrfs_init_path btrfs_init_path was initially used when the path objects were on the stack. Now all the work is done by btrfs_alloc_path and btrfs_init_path isn't required. This patch removes it, and just uses kmem_cache_zalloc to zero out the object.
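The change collapses the old allocate-then-zero sequence into a single call; roughly (a sketch of the pattern, not the full btrfs_alloc_path()):

	/* before: allocate from the cache, then zero the object by hand */
	path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS);
	if (path)
		memset(path, 0, sizeof(*path));	/* what btrfs_init_path() did */

	/* after: the allocator returns already-zeroed memory */
	path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);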
Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 11 ++--------- fs/btrfs/ctree.h | 1 - fs/btrfs/inode-map.c | 1 - fs/btrfs/inode.c | 2 -- 4 files changed, 2 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6674692f7023..c8f4c540cc2c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -38,19 +38,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans, static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -inline void btrfs_init_path(struct btrfs_path *p) -{ - memset(p, 0, sizeof(*p)); -} - struct btrfs_path *btrfs_alloc_path(void) { struct btrfs_path *path; - path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); - if (path) { - btrfs_init_path(path); + path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); + if (path) path->reada = 1; - } return path; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 531db112c8bd..3f7a8058df2b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1834,7 +1834,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); -void btrfs_init_path(struct btrfs_path *p); void btrfs_set_path_blocking(struct btrfs_path *p); void btrfs_clear_path_blocking(struct btrfs_path *p); void btrfs_unlock_up_safe(struct btrfs_path *p, int level); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 2aa79873eb46..cc7334d833c9 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, search_key.type = 0; search_key.offset = 0; - btrfs_init_path(path); start_found = 0; ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); if (ret < 0) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 638bcb5e49f6..3cee77ae03c8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2531,8 +2531,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, key.offset = (u64)-1; key.type = (u8)-1; - btrfs_init_path(path); - search_again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) -- cgit v1.2.3 From a48ddf08ba9bab91efd95e458737afa9d7699623 Mon Sep 17 00:00:00 2001 From: Qinghuang Feng Date: Thu, 12 Feb 2009 14:25:23 -0500 Subject: Btrfs: remove unused code in split_state() These two lines are not used, remove them. Signed-off-by: Qinghuang Feng Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 37d43b516b79..ebe6b29e6069 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -415,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); free_extent_state(prealloc); return -EEXIST; } -- cgit v1.2.3 From 3f3420df505e47751ef76a652b5cb660e5360d6f Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 12 Feb 2009 10:16:03 -0500 Subject: Btrfs: fs/btrfs/volumes.c: remove useless kzalloc The call to kzalloc is followed by a kmalloc whose result is stored in the same variable. 
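Reduced to its core, the flagged pattern is a leak (hypothetical shape with illustrative names, not the exact read_one_chunk() code):

	map = kzalloc(sizeof(*map), GFP_NOFS);	/* first allocation... */
	if (!map)
		return -ENOMEM;
	/* ... no use or kfree() of map in between ... */
	map = kmalloc(len, GFP_NOFS);		/* ...overwritten here, so the
						 * zeroed buffer leaks */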
The semantic match that finds the problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; statement S; expression E; identifier f,l; position p1,p2; expression *ptr != NULL; @@ ( if ((x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...)) == NULL) S | x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...); ... if (x == NULL) S ) <... when != x when != if (...) { <+...x...+> } x->f = E ...> ( return \(0\|<+...x...+>\|ptr\); | return@p2 ...; ) @script:python@ p1 << r.p1; p2 << r.p2; @@ print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line) // Signed-off-by: Julia Lawall Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bcd14ebccae1..c793b6f50d8d 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2894,10 +2894,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, free_extent_map(em); } - map = kzalloc(sizeof(*map), GFP_NOFS); - if (!map) - return -ENOMEM; - em = alloc_extent_map(GFP_NOFS); if (!em) return -ENOMEM; -- cgit v1.2.3 From 4008c04a07c73ec3cb1be4c1391d2159a8f75d6d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Feb 2009 14:09:45 -0500 Subject: Btrfs: make a lockdep class for the extent buffer locks Btrfs is currently using spin_lock_nested with a nested value based on the tree depth of the block. But, this doesn't quite work because the max tree depth is bigger than what spin_lock_nested can deal with, and because locks are sometimes taken before the level field is filled in. The solution here is to use lockdep_set_class_and_name instead, and to set the class before unlocking the pages when the block is read from the disk and just after init of a freshly allocated tree block. btrfs_clear_path_blocking is also changed to take the locks in the proper order, and it also makes sure all the locks currently held are properly set to blocking before it tries to retake the spinlocks. Otherwise, lockdep gets upset about bad lock ordering. The lockdep magic came from Peter Zijlstra Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 45 ++++++++++++++++++++++++++++++----------- fs/btrfs/ctree.h | 10 +++------- fs/btrfs/disk-io.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/disk-io.h | 10 ++++++++++ fs/btrfs/extent-tree.c | 7 +++++-- fs/btrfs/locking.c | 11 ----------- fs/btrfs/volumes.c | 2 ++ 7 files changed, 99 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c8f4c540cc2c..42491d728e99 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -62,14 +62,38 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) /* * reset all the locked nodes in the path to spinning locks. + * + * held is used to keep lockdep happy, when lockdep is enabled + * we set held to a blocking lock before we go around and + * retake all the spinlocks in the path. You can safely use NULL + * for held */ -noinline void btrfs_clear_path_blocking(struct btrfs_path *p) +noinline void btrfs_clear_path_blocking(struct btrfs_path *p, + struct extent_buffer *held) { int i; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* lockdep really cares that we take all of these spinlocks + * in the right order. If any of the locks in the path are not + * currently blocking, it is going to complain. So, make really + * really sure by forcing the path to blocking before we clear + * the path blocking.
+ */ + if (held) + btrfs_set_lock_blocking(held); + btrfs_set_path_blocking(p); +#endif + + for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { if (p->nodes[i] && p->locks[i]) btrfs_clear_lock_blocking(p->nodes[i]); } + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (held) + btrfs_clear_lock_blocking(held); +#endif } /* this also releases the path */ @@ -279,7 +303,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid, level, &ins); BUG_ON(ret); cow = btrfs_init_new_buffer(trans, root, prealloc_dest, - buf->len); + buf->len, level); } else { cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, @@ -1559,7 +1583,7 @@ cow_done: if (!p->skip_locking) p->locks[level] = 1; - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); /* * we have a lock on b and as long as we aren't changing @@ -1598,7 +1622,7 @@ cow_done: btrfs_set_path_blocking(p); sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); BUG_ON(sret > 0); if (sret) { @@ -1618,7 +1642,7 @@ cow_done: btrfs_set_path_blocking(p); sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); if (sret) { ret = sret; @@ -1681,13 +1705,13 @@ cow_done: if (!p->skip_locking) { int lret; - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); lret = btrfs_try_spin_lock(b); if (!lret) { btrfs_set_path_blocking(p); btrfs_tree_lock(b); - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, b); } } } else { @@ -1699,7 +1723,7 @@ cow_done: btrfs_set_path_blocking(p); sret = split_leaf(trans, root, key, p, ins_len, ret == 0); - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); BUG_ON(sret > 0); if (sret) { @@ -3919,7 +3943,6 @@ find_next_key: btrfs_release_path(root, path); goto again; } else { - btrfs_clear_path_blocking(path); goto out; } } @@ -3939,7 +3962,7 @@ find_next_key: path->locks[level - 1] = 1; path->nodes[level - 1] = cur; unlock_up(path, level, 1); - btrfs_clear_path_blocking(path); + btrfs_clear_path_blocking(path, NULL); } out: if (ret == 0) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3f7a8058df2b..766b31ae3186 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -43,11 +43,7 @@ struct btrfs_ordered_sum; #define BTRFS_ACL_NOT_CACHED ((void *)-1) -#ifdef CONFIG_LOCKDEP -# define BTRFS_MAX_LEVEL 7 -#else -# define BTRFS_MAX_LEVEL 8 -#endif +#define BTRFS_MAX_LEVEL 8 /* holds pointers to all of the tree roots */ #define BTRFS_ROOT_TREE_OBJECTID 1ULL @@ -1715,7 +1711,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, u64 empty_size); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u32 blocksize); + u64 bytenr, u32 blocksize, + int level); int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 parent, u64 min_bytes, @@ -1835,7 +1832,6 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); void btrfs_set_path_blocking(struct btrfs_path *p); -void btrfs_clear_path_blocking(struct btrfs_path *p); void btrfs_unlock_up_safe(struct btrfs_path *p, int level); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5aebddd71193..adda739a0215 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -75,6 
+75,40 @@ struct async_submit_bio { struct btrfs_work work; }; +/* These are used to set the lockdep class on the extent buffer locks. + * The class is set by the readpage_end_io_hook after the buffer has + * passed csum validation but before the pages are unlocked. + * + * The lockdep class is also set by btrfs_init_new_buffer on freshly + * allocated blocks. + * + * The class is based on the level in the tree block, which allows lockdep + * to know that lower nodes nest inside the locks of higher nodes. + * + * We also add a check to make sure the highest level of the tree is + * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this + * code needs update as well. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# if BTRFS_MAX_LEVEL != 8 +# error +# endif +static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1]; +static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { + /* leaf */ + "btrfs-extent-00", + "btrfs-extent-01", + "btrfs-extent-02", + "btrfs-extent-03", + "btrfs-extent-04", + "btrfs-extent-05", + "btrfs-extent-06", + "btrfs-extent-07", + /* highest possible level */ + "btrfs-extent-08", +}; +#endif + /* * extents on the btree inode are pretty simple, there's one extent * that covers the entire device @@ -347,6 +381,15 @@ static int check_tree_block_fsid(struct btrfs_root *root, return ret; } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) +{ + lockdep_set_class_and_name(&eb->lock, + &btrfs_eb_class[level], + btrfs_eb_name[level]); +} +#endif + static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) { @@ -392,6 +435,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, } found_level = btrfs_header_level(eb); + btrfs_set_buffer_lockdep_class(eb, found_level); + ret = csum_tree_block(root, eb, 1); if (ret) ret = -EIO; @@ -1777,7 +1822,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); dev_root->track_dirty = 1; - if (ret) goto fail_extent_root; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 494a56eb2986..95029db227be 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -101,4 +101,14 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btree_lock_page_hook(struct page *page); + + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level); +#else +static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, + int level) +{ +} +#endif #endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c59e12036e20..cd86bffbdc9f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3416,7 +3416,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u32 blocksize) + u64 bytenr, u32 blocksize, + int level) { struct extent_buffer *buf; @@ -3424,6 +3425,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); + btrfs_set_buffer_lockdep_class(buf, level); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); @@ -3467,7 +3469,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return 
ERR_PTR(ret); } - buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); + buf = btrfs_init_new_buffer(trans, root, ins.objectid, + blocksize, level); return buf; } diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 9ebe9385129b..85506c4a3af7 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -25,21 +25,10 @@ #include "extent_io.h" #include "locking.h" -/* - * btrfs_header_level() isn't free, so don't call it when lockdep isn't - * on - */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static inline void spin_nested(struct extent_buffer *eb) -{ - spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); -} -#else static inline void spin_nested(struct extent_buffer *eb) { spin_lock(&eb->lock); } -#endif /* * Setting a lock to blocking will drop the spinlock and set the diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c793b6f50d8d..1316139bf9e8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3102,6 +3102,8 @@ int btrfs_read_sys_array(struct btrfs_root *root) if (!sb) return -ENOMEM; btrfs_set_buffer_uptodate(sb); + btrfs_set_buffer_lockdep_class(sb, 0); + write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); -- cgit v1.2.3 From 2456242530a21cfee82646ebeeda65d3f74faa4c Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Thu, 12 Feb 2009 14:14:53 -0500 Subject: Btrfs: hold trans_mutex when using btrfs_record_root_in_trans btrfs_record_root_in_trans needs the trans_mutex held to make sure two callers don't race to setup the root in a given transaction. This adds it to all the places that were missing it. Signed-off-by: Yan Zheng --- fs/btrfs/extent-tree.c | 2 ++ fs/btrfs/transaction.c | 2 ++ fs/btrfs/tree-log.c | 2 ++ 3 files changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cd86bffbdc9f..0a5d796c9f7e 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5658,7 +5658,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root, prev_block = block_start; } + mutex_lock(&extent_root->fs_info->trans_mutex); btrfs_record_root_in_trans(found_root); + mutex_unlock(&extent_root->fs_info->trans_mutex); if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { /* * try to update data extent references while diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 919172de5c9a..4112d53d4f4d 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -688,7 +688,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root, num_bytes -= btrfs_root_used(&dirty->root->root_item); bytes_used = btrfs_root_used(&root->root_item); if (num_bytes) { + mutex_lock(&root->fs_info->trans_mutex); btrfs_record_root_in_trans(root); + mutex_unlock(&root->fs_info->trans_mutex); btrfs_set_root_used(&root->root_item, bytes_used - num_bytes); } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 20794290256b..9c462fbd60fa 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2832,7 +2832,9 @@ again: BUG_ON(!wc.replay_dest); wc.replay_dest->log_root = log; + mutex_lock(&fs_info->trans_mutex); btrfs_record_root_in_trans(wc.replay_dest); + mutex_unlock(&fs_info->trans_mutex); ret = walk_log_tree(trans, log, &wc); BUG_ON(ret); -- cgit v1.2.3 From efab0b5d3eed6aa71f8e3233e4e11774eedc04dc Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 11 Feb 2009 13:27:02 -0800 Subject: [JFFS2] force the jffs2 GC daemon to behave a bit better I've noticed some pretty poor behavior on OLPC 
machines after bootup, when gdm/X are starting. The GCD monopolizes the scheduler (which in turn means it gets to do more NAND I/O), which results in processes taking much, much longer than they should to start. As an example, on an OLPC machine going from OFW to a usable X (via auto-login gdm) takes 2m 30s. The majority of this time is consumed by the switch into graphical mode. With this patch, we cut a full 60s off of bootup time. After bootup, things are much snappier as well. Note that we have seen a CRC node error with this patch that causes the machine to fail to boot, but we've also seen that problem without this patch. Signed-off-by: Andres Salomon Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- fs/jffs2/background.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 3cceef4ad2b7..e9580104b6ba 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -95,13 +95,17 @@ static int jffs2_garbage_collect_thread(void *_c) spin_unlock(&c->erase_completion_lock); - /* This thread is purely an optimisation. But if it runs when - other things could be running, it actually makes things a - lot worse. Use yield() and put it at the back of the runqueue - every time. Especially during boot, pulling an inode in - with read_inode() is much preferable to having the GC thread - get there first. */ - yield(); + /* Problem - immediately after bootup, the GCD spends a lot + * of time in places like jffs2_kill_fragtree(); so much so + * that userspace processes (like gdm and X) are starved + * despite plenty of cond_resched()s and renicing. Yield() + * doesn't help, either (presumably because userspace and GCD + * are generally competing for a higher latency resource - + * disk). + * This forces the GCD to slow the hell down. Pulling an + * inode in with read_inode() is much preferable to having + * the GC thread get there first. */ + schedule_timeout_interruptible(msecs_to_jiffies(50)); /* Put_super will send a SIGKILL and then wait on the sem. */ -- cgit v1.2.3 From d794bf8e0936dce45104565cd48c571061f4c1e3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 14 Feb 2009 10:31:16 -0500 Subject: ext4: Initialize preallocation list_head's properly When creating a new ext4_prealloc_space structure, we have to initialize its list_head pointers before we add them to any prealloc lists. Otherwise, with list debug enabled, we will get list corruption warnings.
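The rule the warnings enforce, as a minimal sketch (the cache name is illustrative, not the ext4 one):

	pa = kmem_cache_alloc(pa_cachep, GFP_NOFS);
	/*
	 * Straight from the allocator, pa->pa_inode_list.next/prev hold
	 * stale garbage. CONFIG_DEBUG_LIST's list_del() check follows an
	 * entry's own next/prev pointers, so the first unlink of an
	 * uninitialised entry fires a corruption warning.
	 */
	INIT_LIST_HEAD(&pa->pa_inode_list);	/* next = prev = &pa->pa_inode_list */
	INIT_LIST_HEAD(&pa->pa_group_list);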
Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c962d0690505..4415beeb0b62 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3693,6 +3693,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_free = pa->pa_len; atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_linear = 0; @@ -3755,6 +3757,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_linear = 1; -- cgit v1.2.3 From 2acf2c261b823d9d9ed954f348b97620297a36b5 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 14 Feb 2009 10:42:58 -0500 Subject: ext4: Implement range_cyclic in ext4_da_writepages instead of write_cache_pages With delayed allocation we lock the page in write_cache_pages() and try to build an in memory extent of contiguous blocks. This is needed so that we can get large contiguous block requests. If range_cyclic mode is enabled, write_cache_pages() will loop back to the 0 index if no I/O has been done yet, and try to start writing from the beginning of the range. That causes an attempt to take the page lock of a lower-index page while holding the page lock of a higher-index page, which can cause a deadlock with another writeback thread. The solution is to implement the range_cyclic behavior in ext4_da_writepages() instead. http://bugzilla.kernel.org/show_bug.cgi?id=12579 Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 658c4a7f2578..cbd2ca99d113 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2439,6 +2439,7 @@ static int ext4_da_writepages(struct address_space *mapping, int no_nrwrite_index_update; int pages_written = 0; long pages_skipped; + int range_cyclic, cycled = 1, io_done = 0; int needed_blocks, ret = 0, nr_to_writebump = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); @@ -2490,9 +2491,15 @@ static int ext4_da_writepages(struct address_space *mapping, if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - if (wbc->range_cyclic) + range_cyclic = wbc->range_cyclic; + if (wbc->range_cyclic) { index = mapping->writeback_index; - else + if (index) + cycled = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = LLONG_MAX; + wbc->range_cyclic = 0; + } else index = wbc->range_start >> PAGE_CACHE_SHIFT; mpd.wbc = wbc; @@ -2506,6 +2513,7 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->no_nrwrite_index_update = 1; pages_skipped = wbc->pages_skipped; +retry: while (!ret && wbc->nr_to_write > 0) { /* @@ -2548,6 +2556,7 @@ static int ext4_da_writepages(struct address_space *mapping, pages_written += mpd.pages_written; wbc->pages_skipped = pages_skipped; ret = 0; + io_done = 1; } else if (wbc->nr_to_write) /* * There is no more writeout needed @@ -2556,6 +2565,13 @@ */ break; } + if (!io_done && !cycled) { + cycled = 1; + index = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = mapping->writeback_index - 1; + goto retry; + } if
(pages_skipped != wbc->pages_skipped) printk(KERN_EMERG "This should not happen leaving %s " "with nr_to_write = %ld ret = %d\n", @@ -2563,6 +2579,7 @@ static int ext4_da_writepages(struct address_space *mapping, /* Update index */ index += pages_written; + wbc->range_cyclic = range_cyclic; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * set the writeback_index so that range_cyclic -- cgit v1.2.3 From 090542641de833c6f756895fc2f139f046e298f9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sun, 15 Feb 2009 20:02:19 -0500 Subject: ext4: Fix NULL dereference in ext4_ext_migrate()'s error handling This was found through a code checker (http://repo.or.cz/w/smatch.git/). It looks like you might be able to trigger the error by trying to migrate a readonly file system. Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" --- fs/ext4/migrate.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 734abca25e35..fe64d9f79852 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -481,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode) + 1); if (IS_ERR(handle)) { retval = PTR_ERR(handle); - goto err_out; + return retval; } tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, @@ -489,8 +489,7 @@ int ext4_ext_migrate(struct inode *inode) if (IS_ERR(tmp_inode)) { retval = -ENOMEM; ext4_journal_stop(handle); - tmp_inode = NULL; - goto err_out; + return retval; } i_size_write(tmp_inode, i_size_read(inode)); /* @@ -618,8 +617,7 @@ err_out: ext4_journal_stop(handle); - if (tmp_inode) - iput(tmp_inode); + iput(tmp_inode); return retval; } -- cgit v1.2.3 From 1a88b5364b535edaa321d70a566e358390ff0872 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 16 Feb 2009 02:38:12 +0000 Subject: Fix incomplete __mntput locking Getting this wrong caused WARNING: at fs/namespace.c:636 mntput_no_expire+0xac/0xf2() due to optimistically checking cpu_writer->mnt outside the spinlock. Here's what we really want: * we know that nobody will set cpu_writer->mnt to mnt from now on * all changes to that sucker are done under cpu_writer->lock * we want the laziest equivalent of spin_lock(&cpu_writer->lock); if (likely(cpu_writer->mnt != mnt)) { spin_unlock(&cpu_writer->lock); continue; } /* do stuff */ that would make sure we won't miss earlier setting of ->mnt done by another CPU. Anyway, for now we just move the spin_lock() earlier and move the test into the properly locked region. 
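Condensed from the patch below, the shape of the fix is simply to move the test under the lock:

	/* racy: cpu_writer->mnt can change between this test and the lock */
	if (cpu_writer->mnt != mnt)
		continue;
	spin_lock(&cpu_writer->lock);

	/* fixed: take the lock first, then test while holding it */
	spin_lock(&cpu_writer->lock);
	if (cpu_writer->mnt != mnt) {
		spin_unlock(&cpu_writer->lock);
		continue;
	}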
Signed-off-by: Al Viro Reported-and-tested-by: Li Zefan Signed-off-by: Linus Torvalds --- fs/namespace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 228d8c4bfd18..06f8e63f6cb1 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -614,9 +614,11 @@ static inline void __mntput(struct vfsmount *mnt) */ for_each_possible_cpu(cpu) { struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu); - if (cpu_writer->mnt != mnt) - continue; spin_lock(&cpu_writer->lock); + if (cpu_writer->mnt != mnt) { + spin_unlock(&cpu_writer->lock); + continue; + } atomic_add(cpu_writer->count, &mnt->__mnt_writers); cpu_writer->count = 0; /* -- cgit v1.2.3 From a60e78e57a17d55bbd5a96da16fe9649d364b987 Mon Sep 17 00:00:00 2001 From: Subhash Peddamallu Date: Mon, 16 Feb 2009 10:27:07 +0100 Subject: fs/bio: bio_alloc_bioset: pass right object ptr to mempool_free When freeing from the bio pool, use the right pointer to account for bs->front_pad, instead of the bio pointer. Signed-off-by: Subhash Peddamallu Signed-off-by: Jens Axboe --- fs/bio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 062299acbccd..72ab251cdb9c 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -302,9 +302,10 @@ void bio_init(struct bio *bio) struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { struct bio *bio = NULL; + void *p; if (bs) { - void *p = mempool_alloc(bs->bio_pool, gfp_mask); + p = mempool_alloc(bs->bio_pool, gfp_mask); if (p) bio = p + bs->front_pad; @@ -329,7 +330,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) } if (unlikely(!bvl)) { if (bs) - mempool_free(bio, bs->bio_pool); + mempool_free(p, bs->bio_pool); else kfree(bio); bio = NULL; -- cgit v1.2.3 From 78f707bfc723552e8309b7c38a8d0cc51012e813 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Feb 2009 13:59:08 +0100 Subject: block: revert part of 18ce3751ccd488c78d3827e9f6bf54e6322676fb The above commit added WRITE_SYNC and switched various places to using that for committing writes that will be waited upon immediately after submission. However, this causes a performance regression with AS and CFQ for ext3 at least, since sync_dirty_buffer() will submit some writes with WRITE_SYNC while ext3 has submitted other dependent writes without the sync flag set. This causes excessive anticipation/idling in the IO scheduler because sync and async writes get interleaved, causing a big performance regression for the below test case (which is meant to simulate sqlite-like behaviour).
---- test case ---- int main(int argc, char **argv) { int fdes, i; FILE *fp; struct timeval start; struct timeval end; struct timeval res; gettimeofday(&start, NULL); for (i=0; i --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 665d446b25bc..62b57e330b69 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3108,7 +3108,7 @@ int sync_dirty_buffer(struct buffer_head *bh) if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(WRITE_SYNC, bh); + ret = submit_bh(WRITE, bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); -- cgit v1.2.3 From 8f19d472935c83d823fa4cf02bcc0a7b9952db30 Mon Sep 17 00:00:00 2001 From: Eric Biederman Date: Wed, 18 Feb 2009 14:48:16 -0800 Subject: seq_file: properly cope with pread Currently seq_read assumes that the offset passed to it is always the offset it passed to user space. In the case pread this assumption is broken and we do the wrong thing when presented with pread. To solve this I introduce an offset cache inside of struct seq_file so we know where our logical file position is. Then in seq_read if we try to read from another offset we reset our data structures and attempt to go to the offset user space wanted. [akpm@linux-foundation.org: restore FMODE_PWRITE] [pjt@google.com: seq_open needs its fmode opened up to take advantage of this] Signed-off-by: Eric Biederman Cc: Alexey Dobriyan Cc: Al Viro Cc: Paul Turner Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 36 ++++++++++++++++++++++++++++++++---- include/linux/seq_file.h | 1 + 2 files changed, 33 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index 5267098532bf..a1a4cfe19210 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -48,8 +48,16 @@ int seq_open(struct file *file, const struct seq_operations *op) */ file->f_version = 0; - /* SEQ files support lseek, but not pread/pwrite */ - file->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE); + /* + * seq_files support lseek() and pread(). They do not implement + * write() at all, but we clear FMODE_PWRITE here for historical + * reasons. + * + * If a client of seq_files a) implements file.write() and b) wishes to + * support pwrite() then that client will need to implement its own + * file.open() which calls seq_open() and then sets FMODE_PWRITE. + */ + file->f_mode &= ~FMODE_PWRITE; return 0; } EXPORT_SYMBOL(seq_open); @@ -131,6 +139,22 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) int err = 0; mutex_lock(&m->lock); + + /* Don't assume *ppos is where we left it */ + if (unlikely(*ppos != m->read_pos)) { + m->read_pos = *ppos; + while ((err = traverse(m, *ppos)) == -EAGAIN) + ; + if (err) { + /* With prejudice... 
*/ + m->read_pos = 0; + m->version = 0; + m->index = 0; + m->count = 0; + goto Done; + } + } + /* * seq_file->op->..m_start/m_stop/m_next may do special actions * or optimisations based on the file->f_version, so we want to @@ -230,8 +254,10 @@ Fill: Done: if (!copied) copied = err; - else + else { *ppos += copied; + m->read_pos += copied; + } file->f_version = m->version; mutex_unlock(&m->lock); return copied; @@ -266,16 +292,18 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin) if (offset < 0) break; retval = offset; - if (offset != file->f_pos) { + if (offset != m->read_pos) { while ((retval=traverse(m, offset)) == -EAGAIN) ; if (retval) { /* with extreme prejudice... */ file->f_pos = 0; + m->read_pos = 0; m->version = 0; m->index = 0; m->count = 0; } else { + m->read_pos = offset; retval = file->f_pos = offset; } } diff --git a/include/linux/seq_file.h b/include/linux/seq_file.h index 40ea5058c2ec..f616f31576d7 100644 --- a/include/linux/seq_file.h +++ b/include/linux/seq_file.h @@ -19,6 +19,7 @@ struct seq_file { size_t from; size_t count; loff_t index; + loff_t read_pos; u64 version; struct mutex lock; const struct seq_operations *op; -- cgit v1.2.3 From 610d18f4128ebbd88845d0fc60cce67b49af881e Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Wed, 18 Feb 2009 14:48:18 -0800 Subject: timerfd: add flags check As requested by Michael, add a missing check for valid flags in timerfd_settime(), and make it return EINVAL in case some extra bits are set. Michael said: If this is to be any use to userland apps that want to check flag support (perhaps it is too late already), then the sooner we get it into the kernel the better: 2.6.29 would be good; earlier stables as well would be even better. [akpm@linux-foundation.org: remove unused TFD_FLAGS_SET] Acked-by: Michael Kerrisk Signed-off-by: Davide Libenzi Cc: [2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/timerfd.c | 12 ++++++------ include/linux/timerfd.h | 16 ++++++++++++---- 2 files changed, 18 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/timerfd.c b/fs/timerfd.c index 6a123b8ff3f5..b042bd7034b1 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -186,10 +186,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK); - if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) - return -EINVAL; - if (clockid != CLOCK_MONOTONIC && - clockid != CLOCK_REALTIME) + if ((flags & ~TFD_CREATE_FLAGS) || + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME)) return -EINVAL; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); @@ -201,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, - flags & (O_CLOEXEC | O_NONBLOCK)); + flags & TFD_SHARED_FCNTL_FLAGS); if (ufd < 0) kfree(ctx); @@ -219,7 +218,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) return -EFAULT; - if (!timespec_valid(&ktmr.it_value) || + if ((flags & ~TFD_SETTIME_FLAGS) || + !timespec_valid(&ktmr.it_value) || !timespec_valid(&ktmr.it_interval)) return -EINVAL; diff --git a/include/linux/timerfd.h b/include/linux/timerfd.h index 86cb0501d3e2..2d0792983f8c 100644 --- a/include/linux/timerfd.h +++ b/include/linux/timerfd.h @@ -11,13 +11,21 @@ /* For O_CLOEXEC and O_NONBLOCK */ #include -/* Flags for timerfd_settime. 
*/ +/* + * CAREFUL: Check include/asm-generic/fcntl.h when defining + * new flags, since they might collide with O_* ones. We want + * to re-use O_* flags that couldn't possibly have a meaning + * from eventfd, in order to leave a free define-space for + * shared O_* flags. + */ #define TFD_TIMER_ABSTIME (1 << 0) - -/* Flags for timerfd_create. */ #define TFD_CLOEXEC O_CLOEXEC #define TFD_NONBLOCK O_NONBLOCK +#define TFD_SHARED_FCNTL_FLAGS (TFD_CLOEXEC | TFD_NONBLOCK) +/* Flags for timerfd_create. */ +#define TFD_CREATE_FLAGS TFD_SHARED_FCNTL_FLAGS +/* Flags for timerfd_settime. */ +#define TFD_SETTIME_FLAGS TFD_TIMER_ABSTIME #endif /* _LINUX_TIMERFD_H */ - -- cgit v1.2.3 From 1cf6e7d83bf334cc5916137862c920a97aabc018 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 18 Feb 2009 14:48:18 -0800 Subject: mm: task dirty accounting fix YAMAMOTO-san noticed that task_dirty_inc doesn't seem to be called properly for cases where set_page_dirty is not used to dirty a page (eg. mark_buffer_dirty). Additionally, there is some inconsistency about when task_dirty_inc is called. It is used for dirty balancing, however it even gets called for __set_page_dirty_no_writeback. So rather than increment it in a set_page_dirty wrapper, move it down to exactly where the dirty page accounting stats are incremented. Cc: YAMAMOTO Takashi Signed-off-by: Nick Piggin Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 1 + include/linux/mm.h | 1 + mm/page-writeback.c | 13 +++---------- 3 files changed, 5 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 665d446b25bc..ff4d1cdd779b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -777,6 +777,7 @@ static int __set_page_dirty(struct page *page, __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } radix_tree_tag_set(&mapping->page_tree, diff --git a/include/linux/mm.h b/include/linux/mm.h index 7dc04ff5ab89..10074212a35b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1159,6 +1159,7 @@ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); /* mm/page-writeback.c */ int write_one_page(struct page *page, int wait); +void task_dirty_inc(struct task_struct *tsk); /* readahead.c */ #define VM_MAX_READAHEAD 128 /* kbytes */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3c84128596ba..74dc57c74349 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -240,7 +240,7 @@ void bdi_writeout_inc(struct backing_dev_info *bdi) } EXPORT_SYMBOL_GPL(bdi_writeout_inc); -static inline void task_dirty_inc(struct task_struct *tsk) +void task_dirty_inc(struct task_struct *tsk) { prop_inc_single(&vm_dirties, &tsk->dirties); } @@ -1230,6 +1230,7 @@ int __set_page_dirty_nobuffers(struct page *page) __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } radix_tree_tag_set(&mapping->page_tree, @@ -1262,7 +1263,7 @@ EXPORT_SYMBOL(redirty_page_for_writepage); * If the mapping doesn't provide a set_page_dirty a_op, then * just fall through and assume that it wants buffer_heads. 
*/ -static int __set_page_dirty(struct page *page) +int set_page_dirty(struct page *page) { struct address_space *mapping = page_mapping(page); @@ -1280,14 +1281,6 @@ static int __set_page_dirty(struct page *page) } return 0; } - -int set_page_dirty(struct page *page) -{ - int ret = __set_page_dirty(page); - if (ret) - task_dirty_inc(current); - return ret; -} EXPORT_SYMBOL(set_page_dirty); /* -- cgit v1.2.3 From ada723dcd681e2dffd7d73345cc8fda0eb0df9bd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 18 Feb 2009 14:48:30 -0800 Subject: fs/super.c: add lockdep annotation to s_umount Li Zefan said: Thread 1: for ((; ;)) { mount -t cpuset xxx /mnt > /dev/null 2>&1 cat /mnt/cpus > /dev/null 2>&1 umount /mnt > /dev/null 2>&1 } Thread 2: for ((; ;)) { mount -t cpuset xxx /mnt > /dev/null 2>&1 umount /mnt > /dev/null 2>&1 } (Note: It is irrelevant which cgroup subsys is used.) After a while a lockdep warning showed up: ============================================= [ INFO: possible recursive locking detected ] 2.6.28 #479 --------------------------------------------- mount/13554 is trying to acquire lock: (&type->s_umount_key#19){--..}, at: [] sget+0x5e/0x321 but task is already holding lock: (&type->s_umount_key#19){--..}, at: [] sget+0x1e2/0x321 other info that might help us debug this: 1 lock held by mount/13554: #0: (&type->s_umount_key#19){--..}, at: [] sget+0x1e2/0x321 stack backtrace: Pid: 13554, comm: mount Not tainted 2.6.28-mc #479 Call Trace: [] validate_chain+0x4c6/0xbbd [] __lock_acquire+0x676/0x700 [] lock_acquire+0x5d/0x7a [] ? sget+0x5e/0x321 [] down_write+0x34/0x50 [] ? sget+0x5e/0x321 [] sget+0x5e/0x321 [] ? cgroup_set_super+0x0/0x3e [] ? cgroup_test_super+0x0/0x2f [] cgroup_get_sb+0x98/0x2e7 [] cpuset_get_sb+0x4a/0x5f [] vfs_kern_mount+0x40/0x7b [] do_kern_mount+0x37/0xbf [] do_mount+0x5c3/0x61a [] ? copy_mount_options+0x2c/0x111 [] sys_mount+0x69/0xa0 [] sysenter_do_call+0x12/0x31 The cause is that after alloc_super() and the retry, an old entry in the fs_supers list is found, so grab_super(old) is called, but both functions take the s_umount lock: struct super_block *sget(...) { ... retry: spin_lock(&sb_lock); if (test) { list_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (!grab_super(old)) <--- 2nd: down_write(&old->s_umount); goto retry; if (s) destroy_super(s); return old; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(type); <--- 1st: down_write(&s->s_umount) if (!s) return ERR_PTR(-ENOMEM); goto retry; } ... } It seems like a false positive, and it seems that the VFS, not cgroup, needs to be fixed. Peter said: We can simply put the new s_umount instance in a separate subclass, but lockdep doesn't particularly care about subclass order. If there's any issue with the callers of sget() assuming the s_umount lock being of subclass 0, then there is another annotation we can use to fix that, but let's not bother with that if this is sufficient.
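In skeleton form, the annotation gives lockdep two distinguishable acquisitions of the same class (a sketch of the two down_write sites, not the full sget()/alloc_super() path):

	/* alloc_super(): the sb is brand new and invisible to everyone
	 * else, so its s_umount can safely be taken as a nested instance */
	down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING);

	/* grab_super(old): the already-published sb keeps subclass 0 */
	down_write(&old->s_umount);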
Addresses http://bugzilla.kernel.org/show_bug.cgi?id=12673 Signed-off-by: Peter Zijlstra Tested-by: Li Zefan Reported-by: Li Zefan Cc: Al Viro Cc: Paul Menage Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 61dce001dd57..8349ed6b1412 100644 --- a/fs/super.c +++ b/fs/super.c @@ -82,7 +82,22 @@ static struct super_block *alloc_super(struct file_system_type *type) * lock ordering than usbfs: */ lockdep_set_class(&s->s_lock, &type->s_lock_key); - down_write(&s->s_umount); + /* + * sget() can have s_umount recursion. + * + * When it cannot find a suitable sb, it allocates a new + * one (this one), and tries again to find a suitable old + * one. + * + * In case that succeeds, it will acquire the s_umount + * lock of the old one. Since these are clearly distinct + * locks, and this object isn't exposed yet, there's no + * risk of deadlocks. + * + * Annotate this by putting this lock in a different + * subclass. + */ + down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); s->s_count = S_BIAS; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); -- cgit v1.2.3 From 2db69a9340da12a4db44edb7506dd68799aeff55 Mon Sep 17 00:00:00 2001 From: Bill Nottingham Date: Wed, 18 Feb 2009 14:48:39 -0800 Subject: vt: Declare PIO_CMAP/GIO_CMAP as compatible ioctls. Otherwise, these don't work when called from 32-bit userspace on 64-bit kernels. Cc: Jiri Kosina Cc: Alan Cox Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 9c6d815dd191..39bd4d38e889 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1938,6 +1938,8 @@ ULONG_IOCTL(SET_BITMAP_FILE) /* Big K */ COMPATIBLE_IOCTL(PIO_FONT) COMPATIBLE_IOCTL(GIO_FONT) +COMPATIBLE_IOCTL(PIO_CMAP) +COMPATIBLE_IOCTL(GIO_CMAP) ULONG_IOCTL(KDSIGACCEPT) COMPATIBLE_IOCTL(KDGETKEYCODE) COMPATIBLE_IOCTL(KDSETKEYCODE) -- cgit v1.2.3 From f04b30de3c82528f1ab4c58b3dd4c975f5341901 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Feb 2009 14:48:43 -0800 Subject: inotify: fix GFP_KERNEL related deadlock Enhanced lockdep coverage of __GFP_NOFS turned up this new lockdep assert: [ 1093.677775] [ 1093.677781] ================================= [ 1093.680031] [ INFO: inconsistent lock state ] [ 1093.680031] 2.6.29-rc5-tip-01504-gb49eca1-dirty #1 [ 1093.680031] --------------------------------- [ 1093.680031] inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage.
[ 1093.680031] kswapd0/308 [HC0[0]:SC0[0]:HE1:SE1] takes: [ 1093.680031] (&inode->inotify_mutex){+.+.?.}, at: [] inotify_inode_is_dead+0x20/0x80 [ 1093.680031] {RECLAIM_FS-ON-W} state was registered at: [ 1093.680031] [] mark_held_locks+0x43/0x5b [ 1093.680031] [] lockdep_trace_alloc+0x6c/0x6e [ 1093.680031] [] kmem_cache_alloc+0x20/0x150 [ 1093.680031] [] idr_pre_get+0x27/0x6c [ 1093.680031] [] inotify_handle_get_wd+0x25/0xad [ 1093.680031] [] inotify_add_watch+0x7a/0x129 [ 1093.680031] [] sys_inotify_add_watch+0x20f/0x250 [ 1093.680031] [] sysenter_do_call+0x12/0x35 [ 1093.680031] [] 0xffffffff [ 1093.680031] irq event stamp: 60417 [ 1093.680031] hardirqs last enabled at (60417): [] call_rcu+0x53/0x59 [ 1093.680031] hardirqs last disabled at (60416): [] call_rcu+0x17/0x59 [ 1093.680031] softirqs last enabled at (59656): [] __do_softirq+0x157/0x16b [ 1093.680031] softirqs last disabled at (59651): [] do_softirq+0x74/0x15d [ 1093.680031] [ 1093.680031] other info that might help us debug this: [ 1093.680031] 2 locks held by kswapd0/308: [ 1093.680031] #0: (shrinker_rwsem){++++..}, at: [] shrink_slab+0x36/0x189 [ 1093.680031] #1: (&type->s_umount_key#4){+++++.}, at: [] shrink_dcache_memory+0x110/0x1fb [ 1093.680031] [ 1093.680031] stack backtrace: [ 1093.680031] Pid: 308, comm: kswapd0 Not tainted 2.6.29-rc5-tip-01504-gb49eca1-dirty #1 [ 1093.680031] Call Trace: [ 1093.680031] [] valid_state+0x12a/0x13d [ 1093.680031] [] mark_lock+0xc1/0x1e9 [ 1093.680031] [] ? check_usage_forwards+0x0/0x3f [ 1093.680031] [] __lock_acquire+0x2c6/0xac8 [ 1093.680031] [] ? register_lock_class+0x17/0x228 [ 1093.680031] [] lock_acquire+0x5d/0x7a [ 1093.680031] [] ? inotify_inode_is_dead+0x20/0x80 [ 1093.680031] [] __mutex_lock_common+0x3a/0x4cb [ 1093.680031] [] ? inotify_inode_is_dead+0x20/0x80 [ 1093.680031] [] mutex_lock_nested+0x2e/0x36 [ 1093.680031] [] ? inotify_inode_is_dead+0x20/0x80 [ 1093.680031] [] inotify_inode_is_dead+0x20/0x80 [ 1093.680031] [] dentry_iput+0x90/0xc2 [ 1093.680031] [] d_kill+0x21/0x45 [ 1093.680031] [] __shrink_dcache_sb+0x27f/0x355 [ 1093.680031] [] shrink_dcache_memory+0x15e/0x1fb [ 1093.680031] [] shrink_slab+0x121/0x189 [ 1093.680031] [] kswapd+0x39f/0x561 [ 1093.680031] [] ? isolate_pages_global+0x0/0x233 [ 1093.680031] [] ? autoremove_wake_function+0x0/0x43 [ 1093.680031] [] ? kswapd+0x0/0x561 [ 1093.680031] [] kthread+0x41/0x82 [ 1093.680031] [] ? kthread+0x0/0x82 [ 1093.680031] [] kernel_thread_helper+0x7/0x10 inotify_handle_get_wd() does idr_pre_get() which does a kmem_cache_alloc() without __GFP_FS - and is hence deadlockable under extreme MM pressure. 
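The invariant being restored, as a minimal sketch (the lock is the one in the trace above; the cache name is illustrative):

	mutex_lock(&inode->inotify_mutex);
	/*
	 * A GFP_KERNEL allocation here may enter direct reclaim, and
	 * reclaim can prune dentries and re-enter inotify code that
	 * takes inotify_mutex: self-deadlock. GFP_NOFS masks off
	 * __GFP_FS so reclaim stays out of filesystem callbacks while
	 * the mutex is held.
	 */
	watch = kmem_cache_alloc(watch_cachep, GFP_NOFS);
	mutex_unlock(&inode->inotify_mutex);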
Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: MinChan Kim Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/inotify/inotify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c index dae3f28f30d4..331f2e88e284 100644 --- a/fs/notify/inotify/inotify.c +++ b/fs/notify/inotify/inotify.c @@ -156,7 +156,7 @@ static int inotify_handle_get_wd(struct inotify_handle *ih, int ret; do { - if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL))) + if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS))) return -ENOSPC; ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd); } while (ret == -EAGAIN); -- cgit v1.2.3 From 7fdf582447aa01658b624adc0a51a31e4278b68c Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Wed, 18 Feb 2009 15:41:28 -0600 Subject: Revert "[XFS] use scalable vmap API" This reverts commit 95f8e302c04c0b0c6de35ab399a5551605eeb006. This commit caused regression. We'll try to fix use of new vmap API for next release. Signed-off-by: Christoph Hellwig Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_buf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index d71dc44e21ed..0b2177a9fbdc 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -264,7 +264,7 @@ xfs_buf_free( uint i; if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) - vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count); + vunmap(bp->b_addr - bp->b_offset); for (i = 0; i < bp->b_page_count; i++) { struct page *page = bp->b_pages[i]; @@ -386,8 +386,8 @@ _xfs_buf_map_pages( bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; bp->b_flags |= XBF_MAPPED; } else if (flags & XBF_MAPPED) { - bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1, PAGE_KERNEL); + bp->b_addr = vmap(bp->b_pages, bp->b_page_count, + VM_MAP, PAGE_KERNEL); if (unlikely(bp->b_addr == NULL)) return -ENOMEM; bp->b_addr += bp->b_offset; -- cgit v1.2.3 From 27e88bf6af7d42adf790f7b2ed7d65475f191cf2 Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Wed, 18 Feb 2009 15:56:51 -0600 Subject: Revert "[XFS] remove old vmap cache" This reverts commit d2859751cd0bf586941ffa7308635a293f943c17. This commit caused regression. We'll try to fix use of new vmap API for next release. Signed-off-by: Christoph Hellwig Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_buf.c | 75 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 0b2177a9fbdc..cb329edc925b 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -165,6 +165,75 @@ test_page_region( return (mask && (page_private(page) & mask) == mask); } +/* + * Mapping of multi-page buffers into contiguous virtual space + */ + +typedef struct a_list { + void *vm_addr; + struct a_list *next; +} a_list_t; + +static a_list_t *as_free_head; +static int as_list_len; +static DEFINE_SPINLOCK(as_lock); + +/* + * Try to batch vunmaps because they are costly. + */ +STATIC void +free_address( + void *addr) +{ + a_list_t *aentry; + +#ifdef CONFIG_XEN + /* + * Xen needs to be able to make sure it can get an exclusive + * RO mapping of pages it wants to turn into a pagetable. If + * a newly allocated page is also still being vmap()ed by xfs, + * it will cause pagetable construction to fail. 
This is a + * quick workaround to always eagerly unmap pages so that Xen + * is happy. + */ + vunmap(addr); + return; +#endif + + aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); + if (likely(aentry)) { + spin_lock(&as_lock); + aentry->next = as_free_head; + aentry->vm_addr = addr; + as_free_head = aentry; + as_list_len++; + spin_unlock(&as_lock); + } else { + vunmap(addr); + } +} + +STATIC void +purge_addresses(void) +{ + a_list_t *aentry, *old; + + if (as_free_head == NULL) + return; + + spin_lock(&as_lock); + aentry = as_free_head; + as_free_head = NULL; + as_list_len = 0; + spin_unlock(&as_lock); + + while ((old = aentry) != NULL) { + vunmap(aentry->vm_addr); + aentry = aentry->next; + kfree(old); + } +} + /* * Internal xfs_buf_t object manipulation */ @@ -264,7 +333,7 @@ xfs_buf_free( uint i; if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) - vunmap(bp->b_addr - bp->b_offset); + free_address(bp->b_addr - bp->b_offset); for (i = 0; i < bp->b_page_count; i++) { struct page *page = bp->b_pages[i]; @@ -386,6 +455,8 @@ _xfs_buf_map_pages( bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; bp->b_flags |= XBF_MAPPED; } else if (flags & XBF_MAPPED) { + if (as_list_len > 64) + purge_addresses(); bp->b_addr = vmap(bp->b_pages, bp->b_page_count, VM_MAP, PAGE_KERNEL); if (unlikely(bp->b_addr == NULL)) @@ -1672,6 +1743,8 @@ xfsbufd( count++; } + if (as_list_len > 0) + purge_addresses(); if (count) blk_run_address_space(target->bt_mapping); -- cgit v1.2.3 From 2cfbd50b536c878e58ab3681c4e944fa3d99b415 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 20 Feb 2009 10:55:10 -0500 Subject: Btrfs: check file pointer in btrfs_sync_file fsync can be called by NFS with a null file pointer, and btrfs was oopsing in this case. Signed-off-by: Chris Mason --- fs/btrfs/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3e8023efaff7..872f104576e5 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1222,7 +1222,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) /* * ok we haven't committed the transaction yet, lets do a commit */ - if (file->private_data) + if (file && file->private_data) btrfs_ioctl_trans_end(file); trans = btrfs_start_transaction(root, 1); @@ -1231,7 +1231,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) goto out; } - ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); + ret = btrfs_log_dentry_safe(trans, root, dentry); if (ret < 0) goto out; @@ -1245,7 +1245,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - mutex_unlock(&file->f_dentry->d_inode->i_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); if (ret > 0) { ret = btrfs_commit_transaction(trans, root); @@ -1253,7 +1253,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) btrfs_sync_log(trans, root); ret = btrfs_end_transaction(trans, root); } - mutex_lock(&file->f_dentry->d_inode->i_mutex); + mutex_lock(&dentry->d_inode->i_mutex); out: return ret > 0 ? EIO : ret; } -- cgit v1.2.3 From 6a63209fc02d5483371f07e4913ee8abad608051 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 20 Feb 2009 11:00:09 -0500 Subject: Btrfs: add better -ENOSPC handling This is a step in the direction of better -ENOSPC handling. 
Instead of checking the global bytes counter we check the space_info bytes counters to make sure we have enough space. If we don't, we go ahead and try to allocate a new chunk, and then if that fails we return -ENOSPC. This patch adds two counters to btrfs_space_info, bytes_delalloc and bytes_may_use. bytes_delalloc accounts for extents we've actually set up for delalloc and that will be allocated at some point down the line. bytes_may_use keeps track of how many bytes we may use for delalloc at some point. When we actually set the extent_bit for the delalloc bytes we subtract the reserved bytes from the bytes_may_use counter. This keeps us from ending up unable to allocate space for delalloc bytes we have already promised. Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 8 ++ fs/btrfs/ctree.h | 40 ++++++--- fs/btrfs/extent-tree.c | 215 +++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/file.c | 16 +++- fs/btrfs/inode.c | 62 ++++---------- fs/btrfs/ioctl.c | 6 +- 6 files changed, 271 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a8c9693b75ac..72677ce2b74f 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -66,6 +66,9 @@ struct btrfs_inode { */ struct list_head delalloc_inodes; + /* the space_info for where this inode's data allocations are done */ + struct btrfs_space_info *space_info; + /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. */ @@ -94,6 +97,11 @@ struct btrfs_inode { */ u64 delalloc_bytes; + /* total number of bytes that may be used for this inode for + * delalloc + */ + u64 reserved_bytes; + /* * the size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 766b31ae3186..82491ba8fa40 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -596,13 +596,27 @@ struct btrfs_block_group_item { struct btrfs_space_info { u64 flags; - u64 total_bytes; - u64 bytes_used; - u64 bytes_pinned; - u64 bytes_reserved; - u64 bytes_readonly; - int full; - int force_alloc; + + u64 total_bytes; /* total bytes in the space */ + u64 bytes_used; /* total bytes used on disk */ + u64 bytes_pinned; /* total bytes pinned, will be freed when the + transaction finishes */ + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + + /* delalloc accounting */ + u64 bytes_delalloc; /* number of bytes reserved for allocation, + this space is not necessarily reserved yet + by the allocator */ + u64 bytes_may_use; /* number of bytes that may be used for + delalloc */ + + int full; /* indicates that we cannot allocate any more + chunks for this space */ + int force_alloc; /* set if we need to force a chunk alloc for + this space */ + struct list_head list; /* for block groups in our same type */ @@ -1782,6 +1796,16 @@ int btrfs_add_dead_reloc_root(struct btrfs_root *root); int btrfs_cleanup_reloc_trees(struct btrfs_root *root); int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode); +int btrfs_check_metadata_free_space(struct btrfs_root *root); +int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); +void btrfs_free_reserved_data_space(struct btrfs_root *root, + struct
inode *inode, u64 bytes); +void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); +void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); /* ctree.c */ int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, @@ -2027,8 +2051,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, unsigned long btrfs_force_ra(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t offset, pgoff_t last_index); -int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, - int for_del); int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0a5d796c9f7e..e11875e97c2f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -60,6 +60,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int alloc, int mark_free); +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force); + static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) { return (cache->flags & bits) == bits; @@ -1909,6 +1913,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_pinned = 0; found->bytes_reserved = 0; found->bytes_readonly = 0; + found->bytes_delalloc = 0; found->full = 0; found->force_alloc = 0; *space_info = found; @@ -1972,6 +1977,196 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) return flags; } +static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) +{ + struct btrfs_fs_info *info = root->fs_info; + u64 alloc_profile; + + if (data) { + alloc_profile = info->avail_data_alloc_bits & + info->data_alloc_profile; + data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; + } else if (root == root->fs_info->chunk_root) { + alloc_profile = info->avail_system_alloc_bits & + info->system_alloc_profile; + data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; + } else { + alloc_profile = info->avail_metadata_alloc_bits & + info->metadata_alloc_profile; + data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; + } + + return btrfs_reduce_alloc_profile(root, data); +} + +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) +{ + u64 alloc_target; + + alloc_target = btrfs_get_alloc_profile(root, 1); + BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, + alloc_target); +} + +/* + * for now this just makes sure we have at least 5% of our metadata space free + * for use. 
+ */ +int btrfs_check_metadata_free_space(struct btrfs_root *root) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 alloc_target, thresh; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + /* + * if the metadata area isn't maxed out then there is no sense in + * checking how much is used, since we can always allocate a new chunk + */ + if (!meta_sinfo->full) + return 0; + + spin_lock(&meta_sinfo->lock); + thresh = meta_sinfo->total_bytes * 95; + + do_div(thresh, 100); + + if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { + spin_unlock(&meta_sinfo->lock); + return -ENOSPC; + } + spin_unlock(&meta_sinfo->lock); + + return 0; +} + +/* + * This will check the space that the inode allocates from to make sure we have + * enough space for bytes. + */ +int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + int ret = 0; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; +again: + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + if (data_sinfo->total_bytes - data_sinfo->bytes_used - + data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - + data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - + data_sinfo->bytes_may_use < bytes) { + /* + * if we don't have enough free bytes in this space then we need + * to alloc a new chunk. + */ + if (!data_sinfo->full) { + u64 alloc_target; + struct btrfs_trans_handle *trans; + + data_sinfo->force_alloc = 1; + spin_unlock(&data_sinfo->lock); + + alloc_target = btrfs_get_alloc_profile(root, 1); + trans = btrfs_start_transaction(root, 1); + if (!trans) + return -ENOMEM; + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + bytes + 2 * 1024 * 1024, + alloc_target, 0); + btrfs_end_transaction(trans, root); + if (ret) + return ret; + goto again; + } + spin_unlock(&data_sinfo->lock); + printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" + ", %llu bytes_used, %llu bytes_reserved, " + "%llu bytes_pinned, %llu bytes_readonly, %llu may use" + "%llu total\n", bytes, data_sinfo->bytes_delalloc, + data_sinfo->bytes_used, data_sinfo->bytes_reserved, + data_sinfo->bytes_pinned, data_sinfo->bytes_readonly, + data_sinfo->bytes_may_use, data_sinfo->total_bytes); + return -ENOSPC; + } + data_sinfo->bytes_may_use += bytes; + BTRFS_I(inode)->reserved_bytes += bytes; + spin_unlock(&data_sinfo->lock); + + return btrfs_check_metadata_free_space(root); +} + +/* + * if there was an error for whatever reason after calling + * btrfs_check_data_free_space, call this so we can cleanup the counters. 
+ */ +void btrfs_free_reserved_data_space(struct btrfs_root *root, + struct inode *inode, u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; + spin_lock(&data_sinfo->lock); + data_sinfo->bytes_may_use -= bytes; + BTRFS_I(inode)->reserved_bytes -= bytes; + spin_unlock(&data_sinfo->lock); +} + +/* called when we are adding a delalloc extent to the inode's io_tree */ +void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, + u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + + /* get the space info for where this inode will be storing its data */ + data_sinfo = BTRFS_I(inode)->space_info; + + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + data_sinfo->bytes_delalloc += bytes; + + /* + * we are adding a delalloc extent without calling + * btrfs_check_data_free_space first. This happens on a weird + * writepage condition, but shouldn't hurt our accounting + */ + if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) { + data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes; + BTRFS_I(inode)->reserved_bytes = 0; + } else { + data_sinfo->bytes_may_use -= bytes; + BTRFS_I(inode)->reserved_bytes -= bytes; + } + + spin_unlock(&data_sinfo->lock); +} + +/* called when we are clearing an delalloc extent from the inode's io_tree */ +void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes) +{ + struct btrfs_space_info *info; + + info = BTRFS_I(inode)->space_info; + + spin_lock(&info->lock); + info->bytes_delalloc -= bytes; + spin_unlock(&info->lock); +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) @@ -3105,6 +3300,10 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes) (unsigned long long)(info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved), (info->full) ? 
"" : "not "); + printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," + " may_use=%llu, used=%llu\n", info->total_bytes, + info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use, + info->bytes_used); down_read(&info->groups_sem); list_for_each_entry(cache, &info->block_groups, list) { @@ -3131,24 +3330,10 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, { int ret; u64 search_start = 0; - u64 alloc_profile; struct btrfs_fs_info *info = root->fs_info; - if (data) { - alloc_profile = info->avail_data_alloc_bits & - info->data_alloc_profile; - data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; - } else if (root == root->fs_info->chunk_root) { - alloc_profile = info->avail_system_alloc_bits & - info->system_alloc_profile; - data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; - } else { - alloc_profile = info->avail_metadata_alloc_bits & - info->metadata_alloc_profile; - data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; - } + data = btrfs_get_alloc_profile(root, data); again: - data = btrfs_reduce_alloc_profile(root, data); /* * the only place that sets empty_size is btrfs_realloc_node, which * is not called recursively on allocations diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 872f104576e5..dc78954861b3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1091,19 +1091,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, WARN_ON(num_pages > nrptrs); memset(pages, 0, sizeof(struct page *) * nrptrs); - ret = btrfs_check_free_space(root, write_bytes, 0); + ret = btrfs_check_data_free_space(root, inode, write_bytes); if (ret) goto out; ret = prepare_pages(root, file, pages, num_pages, pos, first_index, last_index, write_bytes); - if (ret) + if (ret) { + btrfs_free_reserved_data_space(root, inode, + write_bytes); goto out; + } ret = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, buf); if (ret) { + btrfs_free_reserved_data_space(root, inode, + write_bytes); btrfs_drop_pages(pages, num_pages); goto out; } @@ -1111,8 +1116,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, ret = dirty_and_release_pages(NULL, root, file, pages, num_pages, pos, write_bytes); btrfs_drop_pages(pages, num_pages); - if (ret) + if (ret) { + btrfs_free_reserved_data_space(root, inode, + write_bytes); goto out; + } if (will_write) { btrfs_fdatawrite_range(inode->i_mapping, pos, @@ -1136,6 +1144,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, } out: mutex_unlock(&inode->i_mutex); + if (ret) + err = ret; out_nolock: kfree(pages); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3cee77ae03c8..7d4f948bc22a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -101,34 +101,6 @@ static int btrfs_init_inode_security(struct inode *inode, struct inode *dir) return err; } -/* - * a very lame attempt at stopping writes when the FS is 85% full. There - * are countless ways this is incorrect, but it is better than nothing. 
- */ -int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, - int for_del) -{ - u64 total; - u64 used; - u64 thresh; - int ret = 0; - - spin_lock(&root->fs_info->delalloc_lock); - total = btrfs_super_total_bytes(&root->fs_info->super_copy); - used = btrfs_super_bytes_used(&root->fs_info->super_copy); - if (for_del) - thresh = total * 90; - else - thresh = total * 85; - - do_div(thresh, 100); - - if (used + root->fs_info->delalloc_bytes + num_required > thresh) - ret = -ENOSPC; - spin_unlock(&root->fs_info->delalloc_lock); - return ret; -} - /* * this does all the hard work for inserting an inline extent into * the btree. The caller should have done a btrfs_drop_extents so that @@ -1190,6 +1162,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, */ if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + btrfs_delalloc_reserve_space(root, inode, end - start + 1); spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += end - start + 1; root->fs_info->delalloc_bytes += end - start + 1; @@ -1223,9 +1196,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, (unsigned long long)end - start + 1, (unsigned long long) root->fs_info->delalloc_bytes); + btrfs_delalloc_free_space(root, inode, (u64)-1); root->fs_info->delalloc_bytes = 0; BTRFS_I(inode)->delalloc_bytes = 0; } else { + btrfs_delalloc_free_space(root, inode, + end - start + 1); root->fs_info->delalloc_bytes -= end - start + 1; BTRFS_I(inode)->delalloc_bytes -= end - start + 1; } @@ -2245,10 +2221,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) root = BTRFS_I(dir)->root; - ret = btrfs_check_free_space(root, 1, 1); - if (ret) - goto fail; - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); @@ -2261,7 +2233,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); -fail: btrfs_btree_balance_dirty(root, nr); return ret; } @@ -2284,10 +2255,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) return -ENOTEMPTY; } - ret = btrfs_check_free_space(root, 1, 1); - if (ret) - goto fail; - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); @@ -2304,7 +2271,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) fail_trans: nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); -fail: btrfs_btree_balance_dirty(root, nr); if (ret && !err) @@ -2818,7 +2784,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) if (size <= hole_start) return 0; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) return err; @@ -3014,6 +2980,7 @@ static noinline void init_btrfs_i(struct inode *inode) bi->last_trans = 0; bi->logged_trans = 0; bi->delalloc_bytes = 0; + bi->reserved_bytes = 0; bi->disk_i_size = 0; bi->flags = 0; bi->index_cnt = (u64)-1; @@ -3035,6 +3002,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) inode->i_ino = args->ino; init_btrfs_i(inode); BTRFS_I(inode)->root = args->root; + btrfs_set_inode_space_info(args->root, inode); return 0; } @@ -3455,6 +3423,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; + btrfs_set_inode_space_info(root, inode); if (mode & S_IFDIR) owner = 0; @@ -3602,7 +3571,7 @@ 
static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto fail; @@ -3665,7 +3634,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, u64 objectid; u64 index = 0; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto fail; trans = btrfs_start_transaction(root, 1); @@ -3733,7 +3702,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, return -ENOENT; btrfs_inc_nlink(inode); - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto fail; err = btrfs_set_inode_index(dir, &index); @@ -3779,7 +3748,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) u64 index = 0; unsigned long nr = 1; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto out_unlock; @@ -4336,7 +4305,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) u64 page_start; u64 page_end; - ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0); + ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); if (ret) goto out; @@ -4349,6 +4318,7 @@ again: if ((page->mapping != inode->i_mapping) || (page_start >= size)) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); /* page got truncated out from underneath us */ goto out_unlock; } @@ -4631,7 -4601,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) return -EXDEV; - ret = btrfs_check_free_space(root, 1, 0); + ret = btrfs_check_metadata_free_space(root); if (ret) goto out_unlock; @@ -4749,7 +4719,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) return -ENAMETOOLONG; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto out_fail; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 988fdc8b49eb..bca729fc80c8 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -70,7 +70,7 @@ static noinline int create_subvol(struct btrfs_root *root, u64 index = 0; unsigned long nr = 1; - ret = btrfs_check_free_space(root, 1, 0); + ret = btrfs_check_metadata_free_space(root); if (ret) goto fail_commit; @@ -203,7 +203,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, if (!root->ref_cows) return -EINVAL; - ret = btrfs_check_free_space(root, 1, 0); + ret = btrfs_check_metadata_free_space(root); if (ret) goto fail_unlock; @@ -374,7 +374,7 @@ static int btrfs_defrag_file(struct file *file) unsigned long i; int ret; - ret = btrfs_check_free_space(root, inode->i_size, 0); + ret = btrfs_check_data_free_space(root, inode, inode->i_size); if (ret) return -ENOSPC; -- cgit v1.2.3 From 4e06bdd6cbd5105376e7caf4e683ed131e777389 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 20 Feb 2009 10:59:53 -0500 Subject: Btrfs: try committing transaction before returning ENOSPC This fixes a problem where we could return -ENOSPC when we may actually have plenty of space; the space is just pinned. Instead of returning -ENOSPC immediately, commit the transaction first and then try the allocation again. This patch also does chunk allocation for metadata if we pass the 80% threshold for metadata space.
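(For illustration only: the 80%/95% threshold test described here reduces to integer arithmetic of the following shape. This is a standalone sketch, not the kernel code; the function name and the sample values are hypothetical.)

[thresh-sketch.c]
#include <stdio.h>
#include <stdint.h>

/* Nonzero if used bytes exceed the threshold: 80% of total while more
 * chunks can still be allocated, 95% once the space is full. The kernel
 * does the 64-bit division with do_div(); plain division is fine here. */
static int over_threshold(uint64_t total, uint64_t used, int full)
{
	uint64_t thresh = total * (full ? 95 : 80);

	thresh /= 100;
	return used > thresh;
}

int main(void)
{
	/* 850 of 1000 bytes used: over the 80% mark, under the 95% mark */
	printf("not full: %d\n", over_threshold(1000, 850, 0)); /* prints 1 */
	printf("full: %d\n", over_threshold(1000, 850, 1));     /* prints 0 */
	return 0;
}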
This will help with stack usage since the chunk allocation will happen early on, instead of when the allocation is happening. Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 57 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e11875e97c2f..6b5966aacf44 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2017,26 +2017,49 @@ int btrfs_check_metadata_free_space(struct btrfs_root *root) struct btrfs_fs_info *info = root->fs_info; struct btrfs_space_info *meta_sinfo; u64 alloc_target, thresh; + int committed = 0, ret; /* get the space info for where the metadata will live */ alloc_target = btrfs_get_alloc_profile(root, 0); meta_sinfo = __find_space_info(info, alloc_target); - /* - * if the metadata area isn't maxed out then there is no sense in - * checking how much is used, since we can always allocate a new chunk - */ - if (!meta_sinfo->full) - return 0; - +again: spin_lock(&meta_sinfo->lock); - thresh = meta_sinfo->total_bytes * 95; + if (!meta_sinfo->full) + thresh = meta_sinfo->total_bytes * 80; + else + thresh = meta_sinfo->total_bytes * 95; do_div(thresh, 100); if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { + struct btrfs_trans_handle *trans; + if (!meta_sinfo->full) { + meta_sinfo->force_alloc = 1; + spin_unlock(&meta_sinfo->lock); + + trans = btrfs_start_transaction(root, 1); + if (!trans) + return -ENOMEM; + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, alloc_target, 0); + btrfs_end_transaction(trans, root); + goto again; + } spin_unlock(&meta_sinfo->lock); + + if (!committed) { + committed = 1; + trans = btrfs_join_transaction(root, 1); + if (!trans) + return -ENOMEM; + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + goto again; + } return -ENOSPC; } spin_unlock(&meta_sinfo->lock); @@ -2052,7 +2075,7 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; - int ret = 0; + int ret = 0, committed = 0; /* make sure bytes are sectorsize aligned */ bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); @@ -2065,13 +2088,14 @@ again: data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - data_sinfo->bytes_may_use < bytes) { + struct btrfs_trans_handle *trans; + /* * if we don't have enough free bytes in this space then we need * to alloc a new chunk. */ if (!data_sinfo->full) { u64 alloc_target; - struct btrfs_trans_handle *trans; data_sinfo->force_alloc = 1; spin_unlock(&data_sinfo->lock); @@ -2090,6 +2114,19 @@ again: goto again; } spin_unlock(&data_sinfo->lock); + + /* commit the current transaction and try again */ + if (!committed) { + committed = 1; + trans = btrfs_join_transaction(root, 1); + if (!trans) + return -ENOMEM; + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + goto again; + } + printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" ", %llu bytes_used, %llu bytes_reserved, " "%llu bytes_pinned, %llu bytes_readonly, %llu may use" -- cgit v1.2.3 From e4cce94c9c8797b08faf6a79396df4d175e377fa Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Tue, 10 Feb 2009 14:10:26 +0300 Subject: [CIFS] Prevent OOPs when mounting with remote prefixpath. Fixes OOPs with message 'kernel BUG at fs/cifs/cifs_dfs_ref.c:274!'. 
Checks whether the prefixpath is accessible while we are still in cifs_mount, and fails with an error if we can't access it. Should fix Samba bugs 6086 and 5861 and kernel bug 12192. Signed-off-by: Igor Mammedov Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 1 + fs/cifs/connect.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/inode.c | 4 ++-- 3 files changed, 48 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 382ba6298809..ec9f9c1c7d88 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -42,6 +42,7 @@ extern void _FreeXid(unsigned int); #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid())); #define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));} extern char *build_path_from_dentry(struct dentry *); +extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb); extern char *build_wildcard_path_from_dentry(struct dentry *direntry); /* extern void renew_parental_timestamps(struct dentry *direntry);*/ extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 005df85219a8..da0f4ffa0613 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2180,6 +2180,33 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info, "mount option supported")); } +static int +is_path_accessible(int xid, struct cifsTconInfo *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path) +{ + int rc; + __u64 inode_num; + FILE_ALL_INFO *pfile_info; + + rc = CIFSGetSrvInodeNumber(xid, tcon, full_path, &inode_num, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc != -EOPNOTSUPP) + return rc; + + pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (pfile_info == NULL) + return -ENOMEM; + + rc = CIFSSMBQPathInfo(xid, tcon, full_path, pfile_info, + 0 /* not legacy */, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + kfree(pfile_info); + return rc; +} + int cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, char *mount_data, const char *devname) @@ -2190,6 +2217,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, struct cifsSesInfo *pSesInfo = NULL; struct cifsTconInfo *tcon = NULL; struct TCP_Server_Info *srvTcp = NULL; + char *full_path; xid = GetXid(); @@ -2426,6 +2454,23 @@ mount_fail_check: cifs_sb->rsize = min(cifs_sb->rsize, (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); + if (!rc && cifs_sb->prepathlen) { + /* build_path_to_root works only when we have a valid tcon */ + full_path = cifs_build_path_to_root(cifs_sb); + if (full_path == NULL) { + rc = -ENOMEM; + goto mount_fail_check; + } + rc = is_path_accessible(xid, tcon, cifs_sb, full_path); + if (rc) { + cERROR(1, ("Path %s in not accessible: %d", + full_path, rc)); + kfree(full_path); + goto mount_fail_check; + } + kfree(full_path); + } + /* volume_info->password is freed above when existing session found (in which case it is not needed anymore) but when new sesion is created the password ptr is put in the new session structure (in which case the diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index bcf7b5184664..7342bfb02ae0 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -621,7 +621,7 @@ static const struct inode_operations cifs_ipc_inode_ops = { .lookup = cifs_lookup, }; -static char
*build_path_to_root(struct cifs_sb_info *cifs_sb) +char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb) { int pplen = cifs_sb->prepathlen; int dfsplen; @@ -678,7 +678,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino) return inode; cifs_sb = CIFS_SB(inode->i_sb); - full_path = build_path_to_root(cifs_sb); + full_path = cifs_build_path_to_root(cifs_sb); if (full_path == NULL) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 132ac7b77cc95a22d6118d327c96586759fbf006 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 10 Feb 2009 07:33:57 -0500 Subject: cifs: refactor new_inode() calls and inode initialization Move new inode creation into a separate routine and refactor the callers to take advantage of it. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 2 ++ fs/cifs/inode.c | 96 +++++++++++++++++++++++++++++++++-------------------- fs/cifs/readdir.c | 54 ++++++++++++++---------------- 3 files changed, 86 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index ec9f9c1c7d88..62fd5bd499f6 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -92,6 +92,8 @@ extern u64 cifs_UnixTimeToNT(struct timespec); extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time); extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time); +extern struct inode *cifs_new_inode(struct super_block *sb, + unsigned long *inum); extern int cifs_get_inode_info(struct inode **pinode, const unsigned char *search_path, FILE_ALL_INFO *pfile_info, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 7342bfb02ae0..c7674f595adb 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -199,6 +199,49 @@ static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat, pfnd_dat->Gid = cpu_to_le64(pinode->i_gid); } +/** + * cifs_new inode - create new inode, initialize, and hash it + * @sb - pointer to superblock + * @inum - if valid pointer and serverino is enabled, replace i_ino with val + * + * Create a new inode, initialize it for CIFS and hash it. Returns the new + * inode or NULL if one couldn't be allocated. + * + * If the share isn't mounted with "serverino" or inum is a NULL pointer then + * we'll just use the inode number assigned by new_inode(). Note that this can + * mean i_ino collisions since the i_ino assigned by new_inode is not + * guaranteed to be unique. + */ +struct inode * +cifs_new_inode(struct super_block *sb, unsigned long *inum) +{ + struct inode *inode; + + inode = new_inode(sb); + if (inode == NULL) + return NULL; + + /* + * BB: Is i_ino == 0 legal? Here, we assume that it is. If it isn't we + * stop passing inum as ptr. Are there sanity checks we can use to + * ensure that the server is really filling in that field? Also, + * if serverino is disabled, perhaps we should be using iunique()? 
+ */ + if (inum && (CIFS_SB(sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) + inode->i_ino = *inum; + + /* + * must set this here instead of cifs_alloc_inode since VFS will + * clobber i_flags + */ + if (sb->s_flags & MS_NOATIME) + inode->i_flags |= S_NOATIME | S_NOCMTIME; + + insert_inode_hash(inode); + + return inode; +} + int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *full_path, struct super_block *sb, int xid) { @@ -233,22 +276,12 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* get new inode */ if (*pinode == NULL) { - *pinode = new_inode(sb); + *pinode = cifs_new_inode(sb, (unsigned long *) + &find_data.UniqueId); if (*pinode == NULL) { rc = -ENOMEM; goto cgiiu_exit; } - /* Is an i_ino of zero legal? */ - /* note ino incremented to unique num in new_inode */ - /* Are there sanity checks we can use to ensure that - the server is really filling in that field? */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) - (*pinode)->i_ino = (unsigned long)find_data.UniqueId; - - if (sb->s_flags & MS_NOATIME) - (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME; - - insert_inode_hash(*pinode); } inode = *pinode; @@ -465,11 +498,8 @@ int cifs_get_inode_info(struct inode **pinode, /* get new inode */ if (*pinode == NULL) { - *pinode = new_inode(sb); - if (*pinode == NULL) { - rc = -ENOMEM; - goto cgii_exit; - } + __u64 inode_num; + /* Is an i_ino of zero legal? Can we use that to check if the server supports returning inode numbers? Are there other sanity checks we can use to ensure that @@ -486,7 +516,6 @@ int cifs_get_inode_info(struct inode **pinode, if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { int rc1 = 0; - __u64 inode_num; rc1 = CIFSGetSrvInodeNumber(xid, pTcon, full_path, &inode_num, @@ -496,12 +525,17 @@ int cifs_get_inode_info(struct inode **pinode, if (rc1) { cFYI(1, ("GetSrvInodeNum rc %d", rc1)); /* BB EOPNOSUPP disable SERVER_INUM? */ - } else /* do we need cast or hash to ino? */ - (*pinode)->i_ino = inode_num; - } /* else ino incremented to unique num in new_inode*/ - if (sb->s_flags & MS_NOATIME) - (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME; - insert_inode_hash(*pinode); + } + *pinode = cifs_new_inode(sb, (unsigned long *) + &inode_num); + } else { + *pinode = cifs_new_inode(sb, NULL); + } + + if (*pinode == NULL) { + rc = -ENOMEM; + goto cgii_exit; + } } inode = *pinode; cifsInfo = CIFS_I(inode); @@ -1114,24 +1148,14 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) else direntry->d_op = &cifs_dentry_ops; - newinode = new_inode(inode->i_sb); + newinode = cifs_new_inode(inode->i_sb, (unsigned long *) + &pInfo->UniqueId); if (newinode == NULL) { kfree(pInfo); goto mkdir_get_info; } - /* Is an i_ino of zero legal? */ - /* Are there sanity checks we can use to ensure that - the server is really filling in that field? 
*/ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { - newinode->i_ino = - (unsigned long)pInfo->UniqueId; - } /* note ino incremented to unique num in new_inode */ - if (inode->i_sb->s_flags & MS_NOATIME) - newinode->i_flags |= S_NOATIME | S_NOCMTIME; newinode->i_nlink = 2; - - insert_inode_hash(newinode); d_instantiate(direntry, newinode); /* we already checked in POSIXCreate whether diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 9f51f9bf0292..02a20221e841 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -56,35 +56,34 @@ static inline void dump_cifs_file_struct(struct file *file, char *label) } #endif /* DEBUG2 */ -/* Returns one if new inode created (which therefore needs to be hashed) */ +/* Returns 1 if new inode created, 2 if both dentry and inode were */ /* Might check in the future if inode number changed so we can rehash inode */ -static int construct_dentry(struct qstr *qstring, struct file *file, - struct inode **ptmp_inode, struct dentry **pnew_dentry) +static int +construct_dentry(struct qstr *qstring, struct file *file, + struct inode **ptmp_inode, struct dentry **pnew_dentry, + unsigned long *inum) { - struct dentry *tmp_dentry; - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *pTcon; + struct dentry *tmp_dentry = NULL; + struct super_block *sb = file->f_path.dentry->d_sb; int rc = 0; cFYI(1, ("For %s", qstring->name)); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - pTcon = cifs_sb->tcon; qstring->hash = full_name_hash(qstring->name, qstring->len); tmp_dentry = d_lookup(file->f_path.dentry, qstring); if (tmp_dentry) { + /* BB: overwrite old name? i.e. tmp_dentry->d_name and + * tmp_dentry->d_name.len?? + */ cFYI(0, ("existing dentry with inode 0x%p", tmp_dentry->d_inode)); *ptmp_inode = tmp_dentry->d_inode; -/* BB overwrite old name? i.e. 
tmp_dentry->d_name and tmp_dentry->d_name.len??*/ if (*ptmp_inode == NULL) { - *ptmp_inode = new_inode(file->f_path.dentry->d_sb); + *ptmp_inode = cifs_new_inode(sb, inum); if (*ptmp_inode == NULL) return rc; rc = 1; } - if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME) - (*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME; } else { tmp_dentry = d_alloc(file->f_path.dentry, qstring); if (tmp_dentry == NULL) { @@ -93,15 +92,14 @@ static int construct_dentry(struct qstr *qstring, struct file *file, return rc; } - *ptmp_inode = new_inode(file->f_path.dentry->d_sb); - if (pTcon->nocase) + if (CIFS_SB(sb)->tcon->nocase) tmp_dentry->d_op = &cifs_ci_dentry_ops; else tmp_dentry->d_op = &cifs_dentry_ops; + + *ptmp_inode = cifs_new_inode(sb, inum); if (*ptmp_inode == NULL) return rc; - if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME) - (*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME; rc = 2; } @@ -842,9 +840,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst, len = strnlen(filename, PATH_MAX); } - /* BB fixme - hash low and high 32 bits if not 64 bit arch BB */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) - *pinum = pFindData->UniqueId; + *pinum = pFindData->UniqueId; } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { FILE_DIRECTORY_INFO *pFindData = (FILE_DIRECTORY_INFO *)current_entry; @@ -940,20 +936,18 @@ static int cifs_filldir(char *pfindEntry, struct file *file, if (rc) return rc; - rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry); + /* only these two infolevels return valid inode numbers */ + if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX || + pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO) + rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry, + &inum); + else + rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry, + NULL); + if ((tmp_inode == NULL) || (tmp_dentry == NULL)) return -ENOMEM; - if (rc) { - /* inode created, we need to hash it with right inode number */ - if (inum != 0) { - /* BB fixme - hash the 2 32 quantities bits together if - * necessary BB */ - tmp_inode->i_ino = inum; - } - insert_inode_hash(tmp_inode); - } - /* we pass in rc below, indicating whether it is a new inode, so we can figure out whether to invalidate the inode cached data if the file has changed */ -- cgit v1.2.3 From 950ec52880fab89b957c7dc45e8b8476dd63741f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 11 Feb 2009 08:08:26 -0500 Subject: cifs: properly handle case where CIFSGetSrvInodeNumber fails ...if it does, then we pass a pointer to an uninitialized variable for the inode number to cifs_new_inode. Have it pass a NULL pointer instead. Also tweak the function prototypes to reduce the amount of casting.
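(The pattern behind this fix, shown outside the kernel: when the lookup fails, hand the consumer a NULL pointer rather than a pointer to an uninitialized variable. This is a minimal standalone sketch with hypothetical names, not the cifs code.)

[null-on-failure.c]
#include <stdio.h>
#include <stdint.h>

/* Pretend server lookup; on failure *out is left untouched. */
static int get_srv_inode_number(int fail, uint64_t *out)
{
	if (fail)
		return -1;
	*out = 42; /* made-up inode number */
	return 0;
}

/* Consumer treats NULL as "no server-assigned number available". */
static void new_inode_sketch(const uint64_t *inum)
{
	if (inum)
		printf("using server inode number %llu\n",
		       (unsigned long long)*inum);
	else
		printf("no valid number; falling back to a local one\n");
}

int main(void)
{
	uint64_t inode_num;
	uint64_t *pinum = &inode_num;

	if (get_srv_inode_number(1, pinum))
		pinum = NULL; /* never pass &inode_num on while uninitialized */

	new_inode_sketch(pinum);
	return 0;
}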
Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 3 +-- fs/cifs/inode.c | 20 ++++++++++---------- fs/cifs/readdir.c | 6 +++--- 3 files changed, 14 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 62fd5bd499f6..446e62cbece9 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -92,8 +92,7 @@ extern u64 cifs_UnixTimeToNT(struct timespec); extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time); extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time); -extern struct inode *cifs_new_inode(struct super_block *sb, - unsigned long *inum); +extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum); extern int cifs_get_inode_info(struct inode **pinode, const unsigned char *search_path, FILE_ALL_INFO *pfile_info, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index c7674f595adb..475115c7cc79 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -213,7 +213,7 @@ static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat, * guaranteed to be unique. */ struct inode * -cifs_new_inode(struct super_block *sb, unsigned long *inum) +cifs_new_inode(struct super_block *sb, __u64 *inum) { struct inode *inode; @@ -228,7 +228,7 @@ cifs_new_inode(struct super_block *sb, unsigned long *inum) * if serverino is disabled, perhaps we should be using iunique()? */ if (inum && (CIFS_SB(sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) - inode->i_ino = *inum; + inode->i_ino = (unsigned long) *inum; /* * must set this here instead of cifs_alloc_inode since VFS will @@ -276,8 +276,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* get new inode */ if (*pinode == NULL) { - *pinode = cifs_new_inode(sb, (unsigned long *) - &find_data.UniqueId); + *pinode = cifs_new_inode(sb, &find_data.UniqueId); if (*pinode == NULL) { rc = -ENOMEM; goto cgiiu_exit; @@ -499,6 +498,7 @@ int cifs_get_inode_info(struct inode **pinode, /* get new inode */ if (*pinode == NULL) { __u64 inode_num; + __u64 *pinum = &inode_num; /* Is an i_ino of zero legal? Can we use that to check if the server supports returning inode numbers? Are @@ -518,20 +518,20 @@ int cifs_get_inode_info(struct inode **pinode, int rc1 = 0; rc1 = CIFSGetSrvInodeNumber(xid, pTcon, - full_path, &inode_num, + full_path, pinum, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc1) { cFYI(1, ("GetSrvInodeNum rc %d", rc1)); + pinum = NULL; /* BB EOPNOSUPP disable SERVER_INUM? 
*/ } - *pinode = cifs_new_inode(sb, (unsigned long *) - &inode_num); } else { - *pinode = cifs_new_inode(sb, NULL); + pinum = NULL; } + *pinode = cifs_new_inode(sb, pinum); if (*pinode == NULL) { rc = -ENOMEM; goto cgii_exit; @@ -1148,8 +1148,8 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) else direntry->d_op = &cifs_dentry_ops; - newinode = cifs_new_inode(inode->i_sb, (unsigned long *) - &pInfo->UniqueId); + newinode = cifs_new_inode(inode->i_sb, + &pInfo->UniqueId); if (newinode == NULL) { kfree(pInfo); goto mkdir_get_info; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 02a20221e841..c2c01ff4c32c 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -61,7 +61,7 @@ static inline void dump_cifs_file_struct(struct file *file, char *label) static int construct_dentry(struct qstr *qstring, struct file *file, struct inode **ptmp_inode, struct dentry **pnew_dentry, - unsigned long *inum) + __u64 *inum) { struct dentry *tmp_dentry = NULL; struct super_block *sb = file->f_path.dentry->d_sb; @@ -820,7 +820,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, /* inode num, inode type and filename returned */ static int cifs_get_name_from_search_buf(struct qstr *pqst, char *current_entry, __u16 level, unsigned int unicode, - struct cifs_sb_info *cifs_sb, int max_len, ino_t *pinum) + struct cifs_sb_info *cifs_sb, int max_len, __u64 *pinum) { int rc = 0; unsigned int len = 0; @@ -903,7 +903,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file, struct qstr qstring; struct cifsFileInfo *pCifsF; unsigned int obj_type; - ino_t inum; + __u64 inum; struct cifs_sb_info *cifs_sb; struct inode *tmp_inode; struct dentry *tmp_dentry; -- cgit v1.2.3 From 44f68fadd865bb288ebdcea2b602f0b1cab27a0c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 11 Feb 2009 08:08:28 -0500 Subject: cifs: posix fill in inode needed by posix open function needed to prepare for posix open Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 2 ++ fs/cifs/inode.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 446e62cbece9..083dfc57c7a3 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -92,6 +92,8 @@ extern u64 cifs_UnixTimeToNT(struct timespec); extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time); extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time); +extern void posix_fill_in_inode(struct inode *tmp_inode, + FILE_UNIX_BASIC_INFO *pData, int isNewInode); extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum); extern int cifs_get_inode_info(struct inode **pinode, const unsigned char *search_path, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 475115c7cc79..4690a360c855 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1051,7 +1051,7 @@ out_reval: return rc; } -static void posix_fill_in_inode(struct inode *tmp_inode, +void posix_fill_in_inode(struct inode *tmp_inode, FILE_UNIX_BASIC_INFO *pData, int isNewInode) { struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode); -- cgit v1.2.3 From 69765529d701c838df19ea1f5ad2f33a528261ae Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 17 Feb 2009 01:29:40 +0000 Subject: [CIFS] Fix oops in cifs_strfromUCS_le mounting to servers which do not specify their OS Fixes kernel bug #10451 http://bugzilla.kernel.org/show_bug.cgi?id=10451 Certain NAS appliances do not set the operating system or network operating system fields in the session setup response on the 
wire. cifs was oopsing on the unexpected zero length response fields (when trying to null terminate a zero length field). This fixes the oops. Acked-by: Jeff Layton CC: stable Signed-off-by: Steve French --- fs/cifs/CHANGES | 3 ++- fs/cifs/sess.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 73ac7ebd1dfc..1cfa72ef1f37 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -7,7 +7,8 @@ specified and user does not have access to query information about the top of the share. Fix problem in 2.6.28 resolving DFS paths to Samba servers (worked to Windows). Fix rmdir so that pending search (readdir) requests do not get invalid results which include the now -removed directory. +removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable +when using DFS. Version 1.55 ------------ diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 5f22de7b79a9..b234407a3007 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -228,7 +228,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft, kfree(ses->serverOS); /* UTF-8 string will not grow more than four times as big as UCS-16 */ - ses->serverOS = kzalloc(4 * len, GFP_KERNEL); + ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL); if (ses->serverOS != NULL) cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp); data += 2 * (len + 1); @@ -241,7 +241,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft, return rc; kfree(ses->serverNOS); - ses->serverNOS = kzalloc(4 * len, GFP_KERNEL); /* BB this is wrong length FIXME BB */ + ses->serverNOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL); if (ses->serverNOS != NULL) { cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len, nls_cp); -- cgit v1.2.3 From c3b2a0c640bff7df85d79fb4f89674949a267ec2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 20 Feb 2009 04:32:45 +0000 Subject: [CIFS] improve posix semantics of file create Samba server added support for a new posix open/create/mkdir operation a year or so ago, and we added support to cifs for mkdir to use it, but had not added the corresponding code to file create. The following patch helps improve the performance of the cifs create path (to Samba and servers which support the cifs posix protocol extensions). Using Connectathon basic test1, with 2000 files, the performance improved about 15%, and also helped reduce network traffic (17% fewer SMBs sent over the wire) due to saving a network round trip for the SetPathInfo on every file create. It should also help the semantics (and probably the performance) of write (e.g. when posix byte range locks are on the file) on file handles opened with posix create, and adds support for a few flags which would have to be ignored otherwise. Signed-off-by: Steve French --- fs/cifs/CHANGES | 4 +- fs/cifs/dir.c | 307 +++++++++++++++++++++++++++++++++++++------------------- 2 files changed, 208 insertions(+), 103 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 1cfa72ef1f37..72063f5e56b1 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -8,7 +8,9 @@ top of the share. Fix problem in 2.6.28 resolving DFS paths to Samba servers (worked to Windows). Fix rmdir so that pending search (readdir) requests do not get invalid results which include the now removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable -when using DFS. +when using DFS. 
Add better file create support to servers which support +the CIFS POSIX protocol extensions (this adds support for new flags +on create, and improves semantics for write of locked ranges). Version 1.55 ------------ diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 964aad03c5ad..89fb72832652 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -3,7 +3,7 @@ * * vfs operations that deal with dentries * - * Copyright (C) International Business Machines Corp., 2002,2008 + * Copyright (C) International Business Machines Corp., 2002,2009 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -129,6 +129,78 @@ cifs_bp_rename_retry: return full_path; } +static int cifs_posix_open(char *full_path, struct inode **pinode, + struct super_block *sb, int mode, int oflags, + int *poplock, __u16 *pnetfid, int xid) +{ + int rc; + __u32 oplock; + FILE_UNIX_BASIC_INFO *presp_data; + __u32 posix_flags = 0; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + + cFYI(1, ("posix open %s", full_path)); + + presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); + if (presp_data == NULL) + return -ENOMEM; + +/* So far cifs posix extensions can only map the following flags. + There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but + so far we do not seem to need them, and we can treat them as local only */ + if ((oflags & (FMODE_READ | FMODE_WRITE)) == + (FMODE_READ | FMODE_WRITE)) + posix_flags = SMB_O_RDWR; + else if (oflags & FMODE_READ) + posix_flags = SMB_O_RDONLY; + else if (oflags & FMODE_WRITE) + posix_flags = SMB_O_WRONLY; + if (oflags & O_CREAT) + posix_flags |= SMB_O_CREAT; + if (oflags & O_EXCL) + posix_flags |= SMB_O_EXCL; + if (oflags & O_TRUNC) + posix_flags |= SMB_O_TRUNC; + if (oflags & O_APPEND) + posix_flags |= SMB_O_APPEND; + if (oflags & O_SYNC) + posix_flags |= SMB_O_SYNC; + if (oflags & O_DIRECTORY) + posix_flags |= SMB_O_DIRECTORY; + if (oflags & O_NOFOLLOW) + posix_flags |= SMB_O_NOFOLLOW; + if (oflags & O_DIRECT) + posix_flags |= SMB_O_DIRECT; + + + rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode, + pnetfid, presp_data, &oplock, full_path, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) + goto posix_open_ret; + + if (presp_data->Type == cpu_to_le32(-1)) + goto posix_open_ret; /* open ok, caller does qpathinfo */ + + /* get new inode and set it up */ + if (!pinode) + goto posix_open_ret; /* caller does not need info */ + + *pinode = cifs_new_inode(sb, &presp_data->UniqueId); + + /* We do not need to close the file if new_inode fails since + the caller will retry qpathinfo as long as inode is null */ + if (*pinode == NULL) + goto posix_open_ret; + + posix_fill_in_inode(*pinode, presp_data, 1); + +posix_open_ret: + kfree(presp_data); + return rc; +} + static void setup_cifs_dentry(struct cifsTconInfo *tcon, struct dentry *direntry, struct inode *newinode) @@ -150,7 +222,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, int xid; int create_options = CREATE_NOT_DIR; int oplock = 0; - /* BB below access is too much for the mknod to request */ + int oflags; + /* + * BB below access is probably too much for mknod to request + * but we have to do query and setpathinfo so requesting + * less could fail (unless we want to request getatr and setatr + * permissions (only). At least for POSIX we do not have to + * request so much. 
+ */ int desiredAccess = GENERIC_READ | GENERIC_WRITE; __u16 fileHandle; struct cifs_sb_info *cifs_sb; @@ -174,13 +253,43 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, } mode &= ~current->fs->umask; + if (oplockEnabled) + oplock = REQ_OPLOCK; - if (nd && (nd->flags & LOOKUP_OPEN)) { - int oflags = nd->intent.open.flags; + if (nd && (nd->flags & LOOKUP_OPEN)) + oflags = nd->intent.open.flags; + else + oflags = FMODE_READ; + + if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + rc = cifs_posix_open(full_path, &newinode, inode->i_sb, + mode, oflags, &oplock, &fileHandle, xid); + /* EIO could indicate that (posix open) operation is not + supported, despite what server claimed in capability + negotation. EREMOTE indicates DFS junction, which is not + handled in posix open */ + + if ((rc == 0) && (newinode == NULL)) + goto cifs_create_get_file_info; /* query inode info */ + else if (rc == 0) /* success, no need to query */ + goto cifs_create_set_dentry; + else if ((rc != -EIO) && (rc != -EREMOTE) && + (rc != -EOPNOTSUPP)) /* path not found or net err */ + goto cifs_create_out; + /* else fallthrough to retry, using older open call, this is + case where server does not support this SMB level, and + falsely claims capability (also get here for DFS case + which should be rare for path not covered on files) */ + } + if (nd && (nd->flags & LOOKUP_OPEN)) { + /* if the file is going to stay open, then we + need to set the desired access properly */ desiredAccess = 0; if (oflags & FMODE_READ) - desiredAccess |= GENERIC_READ; + desiredAccess |= GENERIC_READ; /* is this too little? */ if (oflags & FMODE_WRITE) { desiredAccess |= GENERIC_WRITE; if (!(oflags & FMODE_READ)) @@ -199,8 +308,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, /* BB add processing to set equivalent of mode - e.g. via CreateX with ACLs */ - if (oplockEnabled) - oplock = REQ_OPLOCK; buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); if (buf == NULL) { @@ -233,116 +340,112 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, } if (rc) { cFYI(1, ("cifs_create returned 0x%x", rc)); - } else { - /* If Open reported that we actually created a file - then we now have to set the mode if possible */ - if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { - struct cifs_unix_set_info_args args = { + goto cifs_create_out; + } + + /* If Open reported that we actually created a file + then we now have to set the mode if possible */ + if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { + struct cifs_unix_set_info_args args = { .mode = mode, .ctime = NO_CHANGE_64, .atime = NO_CHANGE_64, .mtime = NO_CHANGE_64, .device = 0, - }; + }; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - args.uid = (__u64) current_fsuid(); - if (inode->i_mode & S_ISGID) - args.gid = (__u64) inode->i_gid; - else - args.gid = (__u64) current_fsgid(); - } else { - args.uid = NO_CHANGE_64; - args.gid = NO_CHANGE_64; - } - CIFSSMBUnixSetInfo(xid, tcon, full_path, &args, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = (__u64) current_fsuid(); + if (inode->i_mode & S_ISGID) + args.gid = (__u64) inode->i_gid; + else + args.gid = (__u64) current_fsgid(); } else { - /* BB implement mode setting via Windows security - descriptors e.g. 
*/ - /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/ - - /* Could set r/o dos attribute if mode & 0222 == 0 */ + args.uid = NO_CHANGE_64; + args.gid = NO_CHANGE_64; } + CIFSSMBUnixSetInfo(xid, tcon, full_path, &args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + } else { + /* BB implement mode setting via Windows security + descriptors e.g. */ + /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/ - /* server might mask mode so we have to query for it */ - if (tcon->unix_ext) - rc = cifs_get_inode_info_unix(&newinode, full_path, - inode->i_sb, xid); - else { - rc = cifs_get_inode_info(&newinode, full_path, - buf, inode->i_sb, xid, - &fileHandle); - if (newinode) { - if (cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_DYNPERM) - newinode->i_mode = mode; - if ((oplock & CIFS_CREATE_ACTION) && - (cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_SET_UID)) { - newinode->i_uid = current_fsuid(); - if (inode->i_mode & S_ISGID) - newinode->i_gid = - inode->i_gid; - else - newinode->i_gid = - current_fsgid(); - } + /* Could set r/o dos attribute if mode & 0222 == 0 */ + } + +cifs_create_get_file_info: + /* server might mask mode so we have to query for it */ + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + else { + rc = cifs_get_inode_info(&newinode, full_path, buf, + inode->i_sb, xid, &fileHandle); + if (newinode) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + newinode->i_mode = mode; + if ((oplock & CIFS_CREATE_ACTION) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { + newinode->i_uid = current_fsuid(); + if (inode->i_mode & S_ISGID) + newinode->i_gid = inode->i_gid; + else + newinode->i_gid = current_fsgid(); } } + } - if (rc != 0) { - cFYI(1, ("Create worked, get_inode_info failed rc = %d", - rc)); - } else - setup_cifs_dentry(tcon, direntry, newinode); - - if ((nd == NULL /* nfsd case - nfs srv does not set nd */) || - (!(nd->flags & LOOKUP_OPEN))) { - /* mknod case - do not leave file open */ - CIFSSMBClose(xid, tcon, fileHandle); - } else if (newinode) { - struct cifsFileInfo *pCifsFile = - kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); - - if (pCifsFile == NULL) - goto cifs_create_out; - pCifsFile->netfid = fileHandle; - pCifsFile->pid = current->tgid; - pCifsFile->pInode = newinode; - pCifsFile->invalidHandle = false; - pCifsFile->closePend = false; - init_MUTEX(&pCifsFile->fh_sem); - mutex_init(&pCifsFile->lock_mutex); - INIT_LIST_HEAD(&pCifsFile->llist); - atomic_set(&pCifsFile->wrtPending, 0); - - /* set the following in open now +cifs_create_set_dentry: + if (rc == 0) + setup_cifs_dentry(tcon, direntry, newinode); + else + cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc)); + + /* nfsd case - nfs srv does not set nd */ + if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) { + /* mknod case - do not leave file open */ + CIFSSMBClose(xid, tcon, fileHandle); + } else if (newinode) { + struct cifsFileInfo *pCifsFile = + kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); + + if (pCifsFile == NULL) + goto cifs_create_out; + pCifsFile->netfid = fileHandle; + pCifsFile->pid = current->tgid; + pCifsFile->pInode = newinode; + pCifsFile->invalidHandle = false; + pCifsFile->closePend = false; + init_MUTEX(&pCifsFile->fh_sem); + mutex_init(&pCifsFile->lock_mutex); + INIT_LIST_HEAD(&pCifsFile->llist); + atomic_set(&pCifsFile->wrtPending, 0); + + /* set the following in open now pCifsFile->pfile = file; */ - write_lock(&GlobalSMBSeslock); - list_add(&pCifsFile->tlist, &tcon->openFileList); - 
pCifsInode = CIFS_I(newinode); - if (pCifsInode) { - /* if readable file instance put first in list*/ - if (write_only) { - list_add_tail(&pCifsFile->flist, - &pCifsInode->openFileList); - } else { - list_add(&pCifsFile->flist, - &pCifsInode->openFileList); - } - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, ("Exclusive Oplock inode %p", - newinode)); - } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = true; + write_lock(&GlobalSMBSeslock); + list_add(&pCifsFile->tlist, &tcon->openFileList); + pCifsInode = CIFS_I(newinode); + if (pCifsInode) { + /* if readable file instance put first in list*/ + if (write_only) { + list_add_tail(&pCifsFile->flist, + &pCifsInode->openFileList); + } else { + list_add(&pCifsFile->flist, + &pCifsInode->openFileList); } - write_unlock(&GlobalSMBSeslock); + if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { + pCifsInode->clientCanCacheAll = true; + pCifsInode->clientCanCacheRead = true; + cFYI(1, ("Exclusive Oplock inode %p", + newinode)); + } else if ((oplock & 0xF) == OPLOCK_READ) + pCifsInode->clientCanCacheRead = true; } + write_unlock(&GlobalSMBSeslock); } cifs_create_out: kfree(buf); -- cgit v1.2.3 From eca6acf91552a9b2e997cc76339115c95eac0217 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 20 Feb 2009 05:43:09 +0000 Subject: [CIFS] Fix multiuser mounts so server does not invalidate earlier security contexts When two different users mount the same Windows 2003 Server share using CIFS, the first session mounted can be invalidated. Some servers invalidate the first smb session when a second similar user (e.g. two users who get mapped by server to "guest") authenticates an smb session from the same client. By making sure that we set the 2nd and subsequent vc numbers to nonzero values, this ensures that we will not have this problem. Fixes Samba bug 6004, problem description follows: How to reproduce: - configure an "open share" (full permissions to Guest user) on Windows 2003 Server (I couldn't reproduce the problem with Samba server or Windows older than 2003) - mount the share twice with different users who will be authenticated as guest. noacl,noperm,user=john,dir_mode=0700,domain=DOMAIN,rw noacl,noperm,user=jeff,dir_mode=0700,domain=DOMAIN,rw Result: - just the mount point mounted last is accessible: Signed-off-by: Steve French --- fs/cifs/CHANGES | 10 +++++++ fs/cifs/cifsfs.h | 2 +- fs/cifs/cifsglob.h | 6 +++- fs/cifs/cifssmb.c | 7 +++-- fs/cifs/sess.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 105 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 72063f5e56b1..851388fafc73 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,13 @@ +Version 1.57 +------------ +Improve support for multiple security contexts to the same server. We +used to use the same "vcnumber" for all connections which could cause +the server to treat subsequent connections, especially those that +are authenticated as guest, as reconnections, invalidating the earlier +user's smb session. This fix allows cifs to mount multiple times to the +same server with different userids without risking invalidating earlier +established security contexts. 
+ Version 1.56 ------------ Add "forcemandatorylock" mount option to allow user to use mandatory diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 7ac481841f87..2b1d28a9ee28 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.56" +#define CIFS_VERSION "1.57" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 94c1ca0ec953..e004f6db5fc8 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -164,9 +164,12 @@ struct TCP_Server_Info { /* multiplexed reads or writes */ unsigned int maxBuf; /* maxBuf specifies the maximum */ /* message size the server can send or receive for non-raw SMBs */ - unsigned int maxRw; /* maxRw specifies the maximum */ + unsigned int max_rw; /* maxRw specifies the maximum */ /* message size the server can send or receive for */ /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */ + unsigned int max_vcs; /* maximum number of smb sessions, at least + those that can be specified uniquely with + vcnumbers */ char sessid[4]; /* unique token id for this session */ /* (returned on Negotiate */ int capabilities; /* allow selective disabling of caps by smb sess */ @@ -210,6 +213,7 @@ struct cifsSesInfo { unsigned overrideSecFlg; /* if non-zero override global sec flags */ __u16 ipc_tid; /* special tid for connection to IPC share */ __u16 flags; + __u16 vcnum; char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ char *serverDomain; /* security realm of server */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 552642a507c4..939e2f76b959 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -528,14 +528,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) server->maxReq = le16_to_cpu(rsp->MaxMpxCount); server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey); /* even though we do not use raw we might as well set this accurately, in case we ever find a need for it */ if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { - server->maxRw = 0xFF00; + server->max_rw = 0xFF00; server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE; } else { - server->maxRw = 0;/* we do not need to use raw anyway */ + server->max_rw = 0;/* do not need to use raw anyway */ server->capabilities = CAP_MPX_MODE; } tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone); @@ -638,7 +639,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) /* probably no need to store and check maxvcs */ server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); - server->maxRw = le32_to_cpu(pSMBr->MaxRawSize); + server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf)); GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey); server->capabilities = le32_to_cpu(pSMBr->Capabilities); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index b234407a3007..5c68b4282be9 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -34,15 +34,99 @@ extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24); +/* Checks if this is the first smb session to be reconnected after + the socket has been reestablished (so we know whether to use vc 0). 
+ Called while holding the cifs_tcp_ses_lock, so do not block */ +static bool is_first_ses_reconnect(struct cifsSesInfo *ses) +{ + struct list_head *tmp; + struct cifsSesInfo *tmp_ses; + + list_for_each(tmp, &ses->server->smb_ses_list) { + tmp_ses = list_entry(tmp, struct cifsSesInfo, + smb_ses_list); + if (tmp_ses->need_reconnect == false) + return false; + } + /* could not find a session that was already connected, + this must be the first one we are reconnecting */ + return true; +} + +/* + * vc number 0 is treated specially by some servers, and should be the + * first one we request. After that we can use vcnumbers up to maxvcs, + * one for each smb session (some Windows versions set maxvcs incorrectly + * so maxvc=1 can be ignored). If we have too many vcs, we can reuse + * any vc but zero (some servers reset the connection on vcnum zero) + * + */ +static __le16 get_next_vcnum(struct cifsSesInfo *ses) +{ + __u16 vcnum = 0; + struct list_head *tmp; + struct cifsSesInfo *tmp_ses; + __u16 max_vcs = ses->server->max_vcs; + __u16 i; + int free_vc_found = 0; + + /* Quoting the MS-SMB specification: "Windows-based SMB servers set this + field to one but do not enforce this limit, which allows an SMB client + to establish more virtual circuits than allowed by this value ... but + other server implementations can enforce this limit." */ + if (max_vcs < 2) + max_vcs = 0xFFFF; + + write_lock(&cifs_tcp_ses_lock); + if ((ses->need_reconnect) && is_first_ses_reconnect(ses)) + goto get_vc_num_exit; /* vcnum will be zero */ + for (i = ses->server->srv_count - 1; i < max_vcs; i++) { + if (i == 0) /* this is the only connection, use vc 0 */ + break; + + free_vc_found = 1; + + list_for_each(tmp, &ses->server->smb_ses_list) { + tmp_ses = list_entry(tmp, struct cifsSesInfo, + smb_ses_list); + if (tmp_ses->vcnum == i) { + free_vc_found = 0; + break; /* found duplicate, try next vcnum */ + } + } + if (free_vc_found) + break; /* we found a vcnumber that will work - use it */ + } + + if (i == 0) + vcnum = 0; /* for most common case, ie if one smb session, use + vc zero. Also for case when no free vcnum, zero + is safest to send (some clients only send zero) */ + else if (free_vc_found == 0) + vcnum = 1; /* we can not reuse vc=0 safely, since some servers + reset all uids on that, but 1 is ok. 
*/ + else + vcnum = i; + ses->vcnum = vcnum; +get_vc_num_exit: + write_unlock(&cifs_tcp_ses_lock); + + return le16_to_cpu(vcnum); +} + static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB) { __u32 capabilities = 0; /* init fields common to all four types of SessSetup */ - /* note that header is initialized to zero in header_assemble */ + /* Note that offsets for first seven fields in req struct are same */ + /* in CIFS Specs so does not matter which of 3 forms of struct */ + /* that we use in next few lines */ + /* Note that header is initialized to zero in header_assemble */ pSMB->req.AndXCommand = 0xFF; pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); + pSMB->req.VcNumber = get_next_vcnum(ses); /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ @@ -71,7 +155,6 @@ static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB) if (ses->capabilities & CAP_UNIX) capabilities |= CAP_UNIX; - /* BB check whether to init vcnum BB */ return capabilities; } -- cgit v1.2.3 From 4c41bd0ec953954158f92bed5d3062645062b98e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Feb 2009 21:29:31 +0100 Subject: [JFFS2] fix mount crash caused by removed nodes At scan time we observed the following scenario: node A inserted node B inserted node C inserted -> sets overlapped flag on node B node A is removed due to CRC failure -> overlapped flag on node B remains while (tn->overlapped) tn = tn_prev(tn); ==> crash, when tn_prev(B) is referenced. When the ultimate node is removed at scan time and the overlapped flag is set on the penultimate node, nothing updates the overlapped flag of that node. The overlapped iterators blindly expect that the ultimate node does not have the overlapped flag set, which causes the scan code to crash. It would be a huge overhead to go through the node chain on node removal and fix up the overlapped flags, so detecting such a case on the fly in the overlapped iterators is a simpler and more reliable solution. Cc: stable@kernel.org Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse --- fs/jffs2/readinode.c | 42 +++++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 6ca08ad887c0..1fc1e92356ee 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -220,7 +220,7 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn) { uint32_t fn_end = tn->fn->ofs + tn->fn->size; - struct jffs2_tmp_dnode_info *this; + struct jffs2_tmp_dnode_info *this, *ptn; dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw)); @@ -251,11 +251,18 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, if (this) { /* If the node is coincident with another at a lower address, back up until the other node is found. It may be relevant */ - while (this->overlapped) - this = tn_prev(this); - - /* First node should never be marked overlapped */ - BUG_ON(!this); + while (this->overlapped) { + ptn = tn_prev(this); + if (!ptn) { + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. + */ + this->overlapped = 0; + break; + } + this = ptn; + } dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ?
"data" : "hole"); } @@ -360,7 +367,17 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, } if (!this->overlapped) break; - this = tn_prev(this); + + ptn = tn_prev(this); + if (!ptn) { + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. + */ + this->overlapped = 0; + break; + } + this = ptn; } } @@ -456,8 +473,15 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c, eat_last(&rii->tn_root, &last->rb); ver_insert(&ver_root, last); - if (unlikely(last->overlapped)) - continue; + if (unlikely(last->overlapped)) { + if (pen) + continue; + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. + */ + last->overlapped = 0; + } /* Now we have a bunch of nodes in reverse version order, in the tree at ver_root. Most of the time, -- cgit v1.2.3 From 05bf9e839d9de4e8a094274a0a2fd07beb47eaf1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 21 Feb 2009 12:13:24 -0500 Subject: ext4: Add fallback for find_group_flex This is a workaround for find_group_flex() which badly needs to be replaced. One of its problems (besides ignoring the Orlov algorithm) is that it is a bit hyperactive about returning failure under suspicious circumstances. This can lead to spurious ENOSPC failures even when there are inodes still available. Work around this for now by retrying the search using find_group_other() if find_group_flex() returns -1. If find_group_other() succeeds when find_group_flex() has failed, log a warning message. A better block/inode allocator that will fix this problem for real has been queued up for the next merge window. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4fb86a0061d0..f18a919be70b 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -715,6 +715,13 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) if (sbi->s_log_groups_per_flex) { ret2 = find_group_flex(sb, dir, &group); + if (ret2 == -1) { + ret2 = find_group_other(sb, dir, &group); + if (ret2 == 0 && printk_ratelimit()) + printk(KERN_NOTICE "ext4: find_group_flex " + "failed, fallback succeeded dir %lu\n", + dir->i_ino); + } goto got_group; } -- cgit v1.2.3 From ebd3610b110bbb18ea6f9f2aeed1e1068c537227 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 22 Feb 2009 21:09:59 -0500 Subject: ext4: Fix deadlock in ext4_write_begin() and ext4_da_write_begin() Functions ext4_write_begin() and ext4_da_write_begin() call grab_cache_page_write_begin() without AOP_FLAG_NOFS. Thus it can happen that page reclaim is triggered in that function and it recurses back into the filesystem (or some other filesystem). But this can lead to various problems as a transaction is already started at that point. Add the necessary flag. 
http://bugzilla.kernel.org/show_bug.cgi?id=11688 Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cbd2ca99d113..51cdd13e1c31 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1368,6 +1368,10 @@ retry: goto out; } + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { ext4_journal_stop(handle); @@ -1377,7 +1381,7 @@ retry: *pagep = page; ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext4_get_block); + ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = walk_page_buffers(handle, page_buffers(page), @@ -2667,6 +2671,9 @@ retry: ret = PTR_ERR(handle); goto out; } + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { -- cgit v1.2.3 From cac711211a039ae2e2dc6322ffb3c2279d093bf1 Mon Sep 17 00:00:00 2001 From: Krzysztof Sachanowicz Date: Mon, 23 Feb 2009 22:21:55 +0100 Subject: proc: proc_get_inode should de_put when inode already initialized de_get is called before every proc_get_inode, but the corresponding de_put is called only when dropping the last reference to an inode. This might cause something like remove_proc_entry: /proc/stats busy, count=14496 to be printed to the syslog. The fix is to call de_put in case of an already initialized inode in proc_get_inode. Signed-off-by: Krzysztof Sachanowicz Tested-by: Marcin Pilipczuk Acked-by: Al Viro Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 3e76bb9b3ad6..d8bb5c671f42 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -485,8 +485,10 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, } } unlock_new_inode(inode); - } else + } else { module_put(de->owner); + de_put(de); + } return inode; out_ino: -- cgit v1.2.3 From e07a4b9217d1e97d2f3a62b6b070efdc61212110 Mon Sep 17 00:00:00 2001 From: Helge Bahmann Date: Fri, 20 Feb 2009 16:24:12 +0300 Subject: proc: fix PG_locked reporting in /proc/kpageflags The expression always evaluated to zero because the arguments to kpf_copy_bit() were passed in the wrong order. Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Alexey Dobriyan --- fs/proc/page.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index 767d95a6d1b1..2d1345112a42 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -107,7 +107,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, else kflags = ppage->flags; - uflags = kpf_copy_bit(KPF_LOCKED, PG_locked, kflags) | + uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) | kpf_copy_bit(kflags, KPF_ERROR, PG_error) | kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) | kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) | -- cgit v1.2.3 From 8b1a8ff8b321a9384304aeea4dbdb9747daf7ee8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 28 Feb 2009 00:08:53 -0500 Subject: ext4: Remove duplicate call to ext4_commit_super() in ext4_freeze() Commit c4be0c1d added error checking to ext4_freeze() when calling ext4_commit_super().
Unfortunately the patch failed to remove the original call to ext4_commit_super(), with the net result that when freezing the filesystem, the superblock gets written twice, the first time without error checking. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a5732c58f676..39d1993cfa13 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3091,7 +3091,6 @@ static int ext4_freeze(struct super_block *sb) /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); if (error) goto out; -- cgit v1.2.3 From d8ae4601a4b7ea1fa17fa395c3468c0e144d1275 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 28 Feb 2009 09:50:01 -0500 Subject: ext4: Reorder fs/Makefile so that ext2 root fs's are mounted using ext2 In fs/Makefile, ext3 was placed before ext2 so that a root filesystem that possessed a journal would be mounted as ext3 instead of ext2. This was necessary because a cleanly unmounted ext3 filesystem was fully backwards compatible with ext2, and could be mounted by ext2 --- but it was desirable that it be mounted with ext3 so that the journaling would be enabled. The ext4 filesystem supports new incompatible features, so there is no danger of an ext4 filesystem being mistaken for an ext2 filesystem. Thus the relative ordering of ext4 with respect to ext2 didn't matter, until ext4 gained the ability to mount filesystems without a journal starting in 2.6.29-rc1. Now that this is the case, and given that ext4 is listed before ext2, root filesystems that were using the plain-jane ext2 format are getting mounted using the ext4 filesystem driver, which is a change in behavior that could be surprising to users. It's doubtful that there are that many ext2-only root filesystem users that would also have ext4 compiled into the kernel, but to adhere to the principle of least surprise, the correct ordering in fs/Makefile is ext3, followed by ext2, and finally ext4. Signed-off-by: "Theodore Ts'o" --- fs/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/Makefile b/fs/Makefile index 38bc735c67ad..dc20db348679 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -69,10 +69,12 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 -obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4 +obj-$(CONFIG_EXT2_FS) += ext2/ +# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2 +# unless explicitly requested by rootfstype +obj-$(CONFIG_EXT4_FS) += ext4/ obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ -obj-$(CONFIG_EXT2_FS) += ext2/ obj-$(CONFIG_CRAMFS) += cramfs/ obj-$(CONFIG_SQUASHFS) += squashfs/ obj-y += ramfs/ -- cgit v1.2.3 From 8f64b32eb73fbfe9f38c4123121b63ee409278a7 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 26 Feb 2009 00:57:35 -0500 Subject: ext4: don't call jbd2_journal_force_commit_nested without journal Running without a journal, I oopsed when I ran out of space, because we called jbd2_journal_force_commit_nested() from ext4_should_retry_alloc() without a journal. This should take care of it, I think.
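For reference, a sketch of the guarded retry helper after this change, reconstructed from the hunk below (the final force-commit line is not shown in the hunk and is assumed from context):

int ext4_should_retry_alloc(struct super_block *sb, int *retries)
{
	/* In no-journal mode s_journal is NULL, so there is no
	 * transaction to force-commit and retrying cannot help. */
	if (!ext4_has_free_blocks(EXT4_SB(sb), 1) ||
	    (*retries)++ > 3 ||
	    !EXT4_SB(sb)->s_journal)
		return 0;

	jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id);

	return jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal);
}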
Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 4 +++- fs/ext4/inode.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 9a50b8052dcf..de9459b4cb94 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -609,7 +609,9 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi, */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3) + if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || + (*retries)++ > 3 || + !EXT4_SB(sb)->s_journal) return 0; jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 51cdd13e1c31..c7fed5b18745 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2544,7 +2544,7 @@ retry: ext4_journal_stop(handle); - if (mpd.retval == -ENOSPC) { + if ((mpd.retval == -ENOSPC) && sbi->s_journal) { /* commit the transaction which would * free blocks released in the transaction * and try again -- cgit v1.2.3 From b2bf96833c5782befc3e7700f791fde754a47b01 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Feb 2009 08:50:26 +0100 Subject: block: fix bogus gcc warning for uninitialized var usage Newer gcc throws this warning: fs/bio.c: In function 'bio_alloc_bioset': fs/bio.c:305: warning: 'p' may be used uninitialized in this function since it cannot figure out that 'p' is only ever used if 'bs' is non-NULL. Signed-off-by: Jens Axboe --- fs/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 72ab251cdb9c..124b95c4d582 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -302,7 +302,7 @@ void bio_init(struct bio *bio) struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { struct bio *bio = NULL; - void *p; + void *uninitialized_var(p); if (bs) { p = mempool_alloc(bs->bio_pool, gfp_mask); -- cgit v1.2.3 From 47be12e4eec84c1846f29af64fe25a396b57a026 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 9 Jan 2009 07:32:48 +0800 Subject: ocfs2: Access and dirty the buffer_head in mark_written. In __ocfs2_mark_extent_written, when we hit the c_split_covers_rec case, the old code just replaced the extent record and forgot to journal-access and dirty the buffer_head. This will cause a problem when the unwritten extent is in an extent block. So access and dirty it. Signed-off-by: Tao Ma Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 60fe74035db5..3a9e5deed74d 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4796,6 +4796,29 @@ out: return ret; } +static int ocfs2_replace_extent_rec(struct inode *inode, + handle_t *handle, + struct ocfs2_path *path, + struct ocfs2_extent_list *el, + int split_index, + struct ocfs2_extent_rec *split_rec) +{ + int ret; + + ret = ocfs2_path_bh_journal_access(handle, inode, path, + path_num_items(path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + el->l_recs[split_index] = *split_rec; + + ocfs2_journal_dirty(handle, path_leaf_bh(path)); +out: + return ret; +} + /* * Mark part or all of the extent record at split_index in the leaf * pointed to by path as written.
This removes the unwritten @@ -4885,7 +4908,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode, if (ctxt.c_contig_type == CONTIG_NONE) { if (ctxt.c_split_covers_rec) - el->l_recs[split_index] = *split_rec; + ret = ocfs2_replace_extent_rec(inode, handle, + path, el, + split_index, split_rec); else ret = ocfs2_split_and_insert(inode, handle, path, et, &last_eb_bh, split_index, -- cgit v1.2.3 From 7dc102b737e9f49dac426161294cb2d326a97d8e Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Tue, 3 Feb 2009 12:37:13 -0800 Subject: ocfs2/dlm: Retract fix for race between purge and migrate Mainline commit d4f7e650e55af6b235871126f747da88600e8040 attempts to keep the dlm_thread from sending the drop ref message while the lockres is being migrated. The problem is that we make the dlm_thread wait for the migration to complete. This causes a deadlock as dlm_thread also participates in the lockres migration process. A better fix for the original oss bugzilla#1012 is in testing. Signed-off-by: Sunil Mushran Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmthread.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index d1295203029f..4060bb328bc8 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -181,8 +181,7 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); /* This ensures that clear refmap is sent after the set */ - __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG | - DLM_LOCK_RES_MIGRATING)); + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); spin_unlock(&res->spinlock); /* clear our bit from the master's refmap, ignore errors */ -- cgit v1.2.3 From c74ff8bb2235d848beb67fcfddae71ecbe3f92b1 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Tue, 3 Feb 2009 12:37:14 -0800 Subject: ocfs2: Cleanup the lockname print in dlmglue.c The dentry lock has a different format than other locks. This patch fixes the ocfs2_log_dlm_error() macro so that it prints the dentry lock correctly.
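To see why the two cases differ: a dentry lock name is not a plain NUL-terminated string; its tail carries the inode number in a non-printable form, so only the leading OCFS2_DENTRY_LOCK_INO_START - 1 bytes can be printed with %s-style formatting. A minimal sketch of the two print forms, mirroring the macro in the diff below:

	if (lockres->l_type != OCFS2_LOCK_TYPE_DENTRY)
		/* ordinary lock names are printable strings */
		mlog(ML_ERROR, "DLM error %d on resource %s\n",
		     err, lockres->l_name);
	else
		/* dentry locks: print the string prefix, then decode
		 * the embedded inode number separately */
		mlog(ML_ERROR, "DLM error %d on resource %.*s%08x\n",
		     err, OCFS2_DENTRY_LOCK_INO_START - 1,
		     lockres->l_name,
		     (unsigned int)ocfs2_get_dentry_lock_ino(lockres));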
Signed-off-by: Sunil Mushran Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/dlmglue.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 206a2370876a..7219a86d34cc 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -320,9 +320,14 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, int convert); -#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \ - mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \ - _err, _func, _lockres->l_name); \ +#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \ + if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY) \ + mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \ + _err, _func, _lockres->l_name); \ + else \ + mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n", \ + _err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name, \ + (unsigned int)ocfs2_get_dentry_lock_ino(_lockres)); \ } while (0) static int ocfs2_downconvert_thread(void *arg); static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, -- cgit v1.2.3 From dabc47de7a23f57522dc762d9d2ad875700d3497 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Tue, 3 Feb 2009 12:37:15 -0800 Subject: ocfs2/dlm: Use ast_lock to protect ast_list The code was using dlm->spinlock instead of dlm->ast_lock to protect the ast_list. This patch fixes the issue. Signed-off-by: Sunil Mushran Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmunlock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 86ca085ef324..fcf879ed6930 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -117,11 +117,11 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, else BUG_ON(res->owner == dlm->node_num); - spin_lock(&dlm->spinlock); + spin_lock(&dlm->ast_lock); /* We want to be sure that we're not freeing a lock * that still has AST's pending... */ in_use = !list_empty(&lock->ast_list); - spin_unlock(&dlm->spinlock); + spin_unlock(&dlm->ast_lock); if (in_use) { mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " "while waiting for an ast!", res->lockname.len, -- cgit v1.2.3 From 53ecd25e148615e0ed2a72635cc76f4773f97f90 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Tue, 3 Feb 2009 12:37:16 -0800 Subject: ocfs2/dlm: Make dlm_assert_master_handler() kill itself instead of the asserter In dlm_assert_master_handler(), if we get an incorrect assert master from a node, we reply with EINVAL asking the asserter to die. The problem is that an assert is sent only after jumping through so many hoops that, invariably, it is the node that thinks the asserter is wrong which is itself actually wrong. So instead of killing the asserter, this patch kills the assertee. This patch papers over a race that is still being addressed.
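In outline, the new error path (shown in full in the hunk below) dumps the lock resource and BUGs out locally instead of replying with an error that would kill the asserter:

	if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN &&
	    res->owner != assert->node_idx) {
		mlog(ML_ERROR, "DIE! Mastery assert from %u, "
		     "but current owner is %u! (%.*s)\n",
		     assert->node_idx, res->owner, namelen, name);
		__dlm_print_one_lock_resource(res);
		BUG();	/* kill this node (the assertee), not the asserter */
	}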
Signed-off-by: Sunil Mushran Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmmaster.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 54e182a27caf..0a2813947853 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1849,12 +1849,12 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, if (!mle) { if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && res->owner != assert->node_idx) { - mlog(ML_ERROR, "assert_master from " - "%u, but current owner is " - "%u! (%.*s)\n", - assert->node_idx, res->owner, - namelen, name); - goto kill; + mlog(ML_ERROR, "DIE! Mastery assert from %u, " + "but current owner is %u! (%.*s)\n", + assert->node_idx, res->owner, namelen, + name); + __dlm_print_one_lock_resource(res); + BUG(); } } else if (mle->type != DLM_MLE_MIGRATION) { if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { -- cgit v1.2.3 From 89a907afe073b8971a83d0ad54f391542b64d327 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 17 Feb 2009 04:39:28 +0800 Subject: ocfs2: Use the right access_* method in ctime update of xattr. The ctime update in the xattr code used the wrong type of journal access for the inode, so use ocfs2_journal_access_di instead. Reported-and-Tested-by: Tristan Ye Signed-off-by: Tao Ma Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 915039fffe6e..e3933158e1d7 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2592,8 +2592,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, if (!ret) { /* Update inode ctime. */ - ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(ctxt->handle, inode, + xis->inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; -- cgit v1.2.3 From c8b9cf9a7cd25ba65166116d0a958f0bc709f0a7 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 24 Feb 2009 17:40:26 -0800 Subject: ocfs2: lock the metaecc process for xattr bucket For other metadata in ocfs2, metaecc is checked in ocfs2_read_blocks with io_mutex held. For an xattr bucket, however, it is calculated over the whole bucket. So we have to add a spin_lock to prevent multiple processes from calculating metaecc concurrently. Signed-off-by: Tao Ma Tested-by: Tristan Ye Signed-off-by: Mark Fasheh --- fs/ocfs2/ocfs2.h | 3 +++ fs/ocfs2/super.c | 1 + fs/ocfs2/xattr.c | 4 ++++ 3 files changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 077384135f4e..946d3c34b90b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -341,6 +341,9 @@ struct ocfs2_super struct ocfs2_node_map osb_recovering_orphan_dirs; unsigned int *osb_orphan_wipes; wait_queue_head_t osb_wipe_event; + + /* used to protect metaecc calculation check of xattr.
*/ + spinlock_t osb_xattr_lock; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index b1cb38fbe807..1c3acc4654d8 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1747,6 +1747,7 @@ static int ocfs2_initialize_super(struct super_block *sb, INIT_LIST_HEAD(&osb->blocked_lock_list); osb->blocked_lock_count = 0; spin_lock_init(&osb->osb_lock); + spin_lock_init(&osb->osb_xattr_lock); ocfs2_init_inode_steal_slot(osb); atomic_set(&osb->alloc_stats.moves, 0); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index e3933158e1d7..a7c167905c56 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -274,10 +274,12 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket, bucket->bu_blocks, bucket->bu_bhs, 0, NULL); if (!rc) { + spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb, bucket->bu_bhs, bucket->bu_blocks, &bucket_xh(bucket)->xh_check); + spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); if (rc) mlog_errno(rc); } @@ -310,9 +312,11 @@ static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle, { int i; + spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb, bucket->bu_bhs, bucket->bu_blocks, &bucket_xh(bucket)->xh_check); + spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); for (i = 0; i < bucket->bu_blocks; i++) ocfs2_journal_dirty(handle, bucket->bu_bhs[i]); -- cgit v1.2.3 From 4442f518269c6b3686fcbcadad22dc4475309b16 Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Fri, 20 Feb 2009 11:11:50 +0800 Subject: ocfs2: set gap to separate entry and value when xattr in bucket This patch sets a gap (4 bytes) between the xattr entry and the name/value when the xattr is in a bucket. This gap is used to separate the entry from the name/value when a bucket is full. Such a gap had already been set when the xattr is in an inode/block.
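A simplified sketch of the resulting free-space check for a bucket (names follow the patch below; the real code additionally discounts name/value data that would cross a block boundary, and tries to extend or split the bucket rather than simply failing):

	unsigned int header_size, free;

	header_size = sizeof(struct ocfs2_xattr_header) +
		      count * sizeof(struct ocfs2_xattr_entry);

	/* Entries grow down from the header while name/value data grows
	 * up from the end of the bucket; the 4-byte gap keeps the two
	 * regions from meeting exactly when the bucket fills up. */
	free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP;
	if (free < need)
		return -ENOSPC;	/* bucket must be extended or split */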
Signed-off-by: Tiger Yang Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index a7c167905c56..4ddd788add67 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -82,13 +82,14 @@ struct ocfs2_xattr_set_ctxt { #define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) #define OCFS2_XATTR_INLINE_SIZE 80 +#define OCFS2_XATTR_HEADER_GAP 4 #define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \ - sizeof(struct ocfs2_xattr_header) \ - - sizeof(__u32)) + - OCFS2_XATTR_HEADER_GAP) #define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \ - sizeof(struct ocfs2_xattr_block) \ - sizeof(struct ocfs2_xattr_header) \ - - sizeof(__u32)) + - OCFS2_XATTR_HEADER_GAP) static struct ocfs2_xattr_def_value_root def_xv = { .xv.xr_list.l_count = cpu_to_le16(1), @@ -1511,7 +1512,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, last += 1; } - free = min_offs - ((void *)last - xs->base) - sizeof(__u32); + free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; if (free < 0) return -EIO; @@ -2194,7 +2195,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode, last += 1; } - free = min_offs - ((void *)last - xs->base) - sizeof(__u32); + free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; if (free < 0) return 0; @@ -5065,8 +5066,8 @@ try_again: xh_free_start = le16_to_cpu(xh->xh_free_start); header_size = sizeof(struct ocfs2_xattr_header) + count * sizeof(struct ocfs2_xattr_entry); - max_free = OCFS2_XATTR_BUCKET_SIZE - - le16_to_cpu(xh->xh_name_value_len) - header_size; + max_free = OCFS2_XATTR_BUCKET_SIZE - header_size - + le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP; mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " "of %u which exceed block size\n", @@ -5099,7 +5100,7 @@ try_again: need = 0; } - free = xh_free_start - header_size; + free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP; /* * We need to make sure the new name/value pair * can exist in the same block. @@ -5132,7 +5133,8 @@ try_again: } xh_free_start = le16_to_cpu(xh->xh_free_start); - free = xh_free_start - header_size; + free = xh_free_start - header_size + - OCFS2_XATTR_HEADER_GAP; if (xh_free_start % blocksize < need) free -= xh_free_start % blocksize; -- cgit v1.2.3 From 28d57d437786eb3e44f1ca3f0f41e7cfe29c6dd4 Mon Sep 17 00:00:00 2001 From: wengang wang Date: Fri, 13 Feb 2009 10:11:47 +0800 Subject: ocfs2: add IO error check in ocfs2_get_sector() Check for IO error in ocfs2_get_sector(). 
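In outline, the added check (matching the hunk below) verifies that the read actually completed before handing the buffer_head back to the caller:

	ll_rw_block(READ, 1, bh);
	wait_on_buffer(*bh);
	if (!buffer_uptodate(*bh)) {
		/* the read failed; do not return a stale buffer */
		mlog_errno(-EIO);
		brelse(*bh);
		*bh = NULL;
		return -EIO;
	}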
Signed-off-by: Wengang Wang Signed-off-by: Mark Fasheh --- fs/ocfs2/super.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 1c3acc4654d8..7ac83a81ee55 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1537,6 +1537,13 @@ static int ocfs2_get_sector(struct super_block *sb, unlock_buffer(*bh); ll_rw_block(READ, 1, bh); wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + mlog_errno(-EIO); + brelse(*bh); + *bh = NULL; + return -EIO; + } + return 0; } -- cgit v1.2.3 From adc487204a9373d2b5a535412466326036147a72 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 27 Feb 2009 14:02:59 -0800 Subject: EXPORT_SYMBOL(d_obtain_alias) rather than EXPORT_SYMBOL_GPL Commit 4ea3ada2955e4519befa98ff55dd62d6dfbd1705 declares d_obtain_alias() as EXPORT_SYMBOL_GPL where it's supposed to replace d_alloc_anon which was previously declared as EXPORT_SYMBOL and thus available to any loadable module. This patch reverts that. Signed-off-by: Benny Halevy Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: "J. Bruce Fields" Cc: Trond Myklebust Acked-by: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 937df0fb0da5..07e2d4a44bda 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1180,7 +1180,7 @@ struct dentry *d_obtain_alias(struct inode *inode) iput(inode); return res; } -EXPORT_SYMBOL_GPL(d_obtain_alias); +EXPORT_SYMBOL(d_obtain_alias); /** * d_splice_alias - splice a disconnected dentry into the tree if one exists -- cgit v1.2.3 From 5cf8cf4146de03de67d1a8aefbece66b65f255cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Feb 2009 21:32:51 +0100 Subject: Fix FREEZE/THAW compat_ioctl regression Commit 8e961870bb9804110d5c8211d5d9d500451c4518 removed the FREEZE/THAW handling in xfs_compat_ioctl but never added any compat handler back, so now any freeze/thaw request from a 32-bit binary on a 64-bit kernel will fail. As these ioctls are 32/64-bit compatible, two simple COMPATIBLE_IOCTL entries in fs/compat_ioctl.c will do the job. Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 39bd4d38e889..45e59d3c7f1f 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1913,6 +1913,9 @@ COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ /* 0x00 */ COMPATIBLE_IOCTL(FIBMAP) COMPATIBLE_IOCTL(FIGETBSZ) +/* 'X' - originally XFS but some now in the VFS */ +COMPATIBLE_IOCTL(FIFREEZE) +COMPATIBLE_IOCTL(FITHAW) /* RAID */ COMPATIBLE_IOCTL(RAID_VERSION) COMPATIBLE_IOCTL(GET_ARRAY_INFO) -- cgit v1.2.3 From 7ce9d5d1f3c8736511daa413c64985a05b2feee3 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 4 Mar 2009 18:38:18 -0500 Subject: ext4: fix ext4_free_inode() vs. ext4_claim_inode() race I was seeing fsck errors on inode bitmaps after a 4 thread dbench run on a 4 cpu machine: Inode bitmap differences: -50736 -(50752--50753) etc... I believe that this is because ext4_free_inode() uses atomic bitops, and although ext4_new_inode() *used* to also use atomic bitops for synchronization, commit 393418676a7602e1d7d3f6e560159c65c8cbd50e changed this to use the sb_bgl_lock, so that we could also synchronize against read_inode_bitmap and initialization of uninit inode tables.
However, that change left ext4_free_inode using atomic bitops, which I think leaves no synchronization between setting & unsetting bits in the inode table. The below patch fixes it for me, although I wonder if we're getting at all heavy-handed with this spinlock... Signed-off-by: Eric Sandeen Reviewed-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f18a919be70b..627f8c3337a3 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -188,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_group_desc *gdp; struct ext4_super_block *es; struct ext4_sb_info *sbi; - int fatal = 0, err, count; + int fatal = 0, err, count, cleared; ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { @@ -248,8 +248,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) goto error_return; /* Ok, now we can actually update the inode bitmaps.. */ - if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), - bit, bitmap_bh->b_data)) + spin_lock(sb_bgl_lock(sbi, block_group)); + cleared = ext4_clear_bit(bit, bitmap_bh->b_data); + spin_unlock(sb_bgl_lock(sbi, block_group)); + if (!cleared) ext4_error(sb, "ext4_free_inode", "bit already cleared for inode %lu", ino); else { -- cgit v1.2.3 From 118e1ef6fabfc023126e6075f6ac0fc729cb5285 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 5 Mar 2009 00:31:12 +0000 Subject: Squashfs: Fix oops when reading fsfuzzer corrupted filesystems This fixes a code regression caused by the recent mainlining changes. The recent code changes call zlib_inflate repeatedly, decompressing into separate 4K buffers, but this code didn't check for the possibility that zlib_inflate might ask for too many buffers when decompressing corrupted data. Signed-off-by: Phillip Lougher --- fs/squashfs/block.c | 13 +++++++++++-- fs/squashfs/cache.c | 4 ++-- fs/squashfs/squashfs.h | 2 +- fs/squashfs/super.c | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index c837dfc2b3c6..321728f48f2d 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -80,7 +80,7 @@ static struct buffer_head *get_block_length(struct super_block *sb, * generated a larger block - this does occasionally happen with zlib). */ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, - int length, u64 *next_index, int srclength) + int length, u64 *next_index, int srclength, int pages) { struct squashfs_sb_info *msblk = sb->s_fs_info; struct buffer_head **bh; @@ -185,6 +185,14 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, } if (msblk->stream.avail_out == 0) { + if (page == pages) { + ERROR("zlib_inflate tried to " + "decompress too much data, " + "expected %d bytes.
Zlib " + "data probably corrupt\n", + srclength); + goto release_mutex; + } msblk->stream.next_out = buffer[page++]; msblk->stream.avail_out = PAGE_CACHE_SIZE; } @@ -268,7 +276,8 @@ block_release: put_bh(bh[k]); read_failure: - ERROR("sb_bread failed reading block 0x%llx\n", cur_index); + ERROR("squashfs_read_data failed to read block 0x%llx\n", + (unsigned long long) index); kfree(bh); return -EIO; } diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index f29eda16d25e..1c4739e33af6 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -119,7 +119,7 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, entry->length = squashfs_read_data(sb, entry->data, block, length, &entry->next_index, - cache->block_size); + cache->block_size, cache->pages); spin_lock(&cache->lock); @@ -406,7 +406,7 @@ int squashfs_read_table(struct super_block *sb, void *buffer, u64 block, for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) data[i] = buffer; res = squashfs_read_data(sb, data, block, length | - SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length); + SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages); kfree(data); return res; } diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 6b2515d027d5..0e9feb6adf7e 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -34,7 +34,7 @@ static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) /* block.c */ extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, - int); + int, int); /* cache.c */ extern struct squashfs_cache *squashfs_cache_init(char *, int, int); diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 071df5b5b491..681ec0d83799 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -389,7 +389,7 @@ static int __init init_squashfs_fs(void) return err; } - printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) " + printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) " "Phillip Lougher\n"); return 0; -- cgit v1.2.3 From f4f8056a862a9950320429dfda708c88b4ce6025 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 5 Mar 2009 00:55:31 +0000 Subject: Squashfs: frag_size should be signed, as it can hold an error result Signed-off-by: Roel Kluin Signed-off-by: Phillip Lougher --- fs/squashfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 7a63398bb855..9101dbde39ec 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -133,7 +133,8 @@ int squashfs_read_inode(struct inode *inode, long long ino) type = le16_to_cpu(sqshb_ino->inode_type); switch (type) { case SQUASHFS_REG_TYPE: { - unsigned int frag_offset, frag_size, frag; + unsigned int frag_offset, frag; + int frag_size; u64 frag_blk; struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; @@ -175,7 +176,8 @@ int squashfs_read_inode(struct inode *inode, long long ino) break; } case SQUASHFS_LREG_TYPE: { - unsigned int frag_offset, frag_size, frag; + unsigned int frag_offset, frag; + int frag_size; u64 frag_blk; struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; -- cgit v1.2.3 From b9447ef80bd301b932ac4d85c9622e929de5fd62 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 9 Mar 2009 11:45:38 -0400 Subject: Btrfs: fix spinlock assertions on UP systems btrfs_tree_locked was being used to make sure a given extent_buffer was properly locked in a few places. But, it wasn't correct for UP compiled kernels. 
This switches it to using assert_spin_locked instead, and renames it to btrfs_assert_tree_locked to better reflect how it was really being used. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 10 +++++----- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/locking.c | 6 +++--- fs/btrfs/locking.h | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 42491d728e99..37f31b5529aa 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -277,7 +277,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (*cow_ret == buf) unlock_orig = 1; - WARN_ON(!btrfs_tree_locked(buf)); + btrfs_assert_tree_locked(buf); if (parent) parent_start = parent->start; @@ -2365,7 +2365,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (slot >= btrfs_header_nritems(upper) - 1) return 1; - WARN_ON(!btrfs_tree_locked(path->nodes[1])); + btrfs_assert_tree_locked(path->nodes[1]); right = read_node_slot(root, upper, slot + 1); btrfs_tree_lock(right); @@ -2562,7 +2562,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (right_nritems == 0) return 1; - WARN_ON(!btrfs_tree_locked(path->nodes[1])); + btrfs_assert_tree_locked(path->nodes[1]); left = read_node_slot(root, path->nodes[1], slot - 1); btrfs_tree_lock(left); @@ -4101,7 +4101,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) next = read_node_slot(root, c, slot); if (!path->skip_locking) { - WARN_ON(!btrfs_tree_locked(c)); + btrfs_assert_tree_locked(c); btrfs_tree_lock(next); btrfs_set_lock_blocking(next); } @@ -4126,7 +4126,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) reada_for_search(root, path, level, slot, 0); next = read_node_slot(root, next, 0); if (!path->skip_locking) { - WARN_ON(!btrfs_tree_locked(path->nodes[level])); + btrfs_assert_tree_locked(path->nodes[level]); btrfs_tree_lock(next); btrfs_set_lock_blocking(next); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index adda739a0215..3e18175248e0 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -857,7 +857,7 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *btree_inode = root->fs_info->btree_inode; if (btrfs_header_generation(buf) == root->fs_info->running_transaction->transid) { - WARN_ON(!btrfs_tree_locked(buf)); + btrfs_assert_tree_locked(buf); /* ugh, clear_extent_buffer_dirty can be expensive */ btrfs_set_lock_blocking(buf); @@ -2361,7 +2361,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) btrfs_set_lock_blocking(buf); - WARN_ON(!btrfs_tree_locked(buf)); + btrfs_assert_tree_locked(buf); if (transid != root->fs_info->generation) { printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " "found %llu running %llu\n", diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6b5966aacf44..9abf81f71c46 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4418,13 +4418,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); - BUG_ON(!btrfs_tree_locked(parent)); + btrfs_assert_tree_locked(parent); parent_level = btrfs_header_level(parent); extent_buffer_get(parent); path->nodes[parent_level] = parent; path->slots[parent_level] = btrfs_header_nritems(parent); - BUG_ON(!btrfs_tree_locked(node)); + btrfs_assert_tree_locked(node); level = btrfs_header_level(node); extent_buffer_get(node); path->nodes[level] = node; diff --git 
a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 85506c4a3af7..47b0a88c12a2 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -220,8 +220,8 @@ int btrfs_tree_unlock(struct extent_buffer *eb) return 0; } -int btrfs_tree_locked(struct extent_buffer *eb) +void btrfs_assert_tree_locked(struct extent_buffer *eb) { - return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) || - spin_is_locked(&eb->lock); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + assert_spin_locked(&eb->lock); } diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 6bb0afbff928..6c4ce457168c 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -21,11 +21,11 @@ int btrfs_tree_lock(struct extent_buffer *eb); int btrfs_tree_unlock(struct extent_buffer *eb); -int btrfs_tree_locked(struct extent_buffer *eb); int btrfs_try_tree_lock(struct extent_buffer *eb); int btrfs_try_spin_lock(struct extent_buffer *eb); void btrfs_set_lock_blocking(struct extent_buffer *eb); void btrfs_clear_lock_blocking(struct extent_buffer *eb); +void btrfs_assert_tree_locked(struct extent_buffer *eb); #endif -- cgit v1.2.3 From 260219cc48cfb22486e5d0d706c978228a080d63 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Mar 2009 12:55:51 -0700 Subject: devpts: remove graffiti These printks are very annoying when working with containers. Signed-off-by: Alexey Dobriyan Cc: Alan Cox Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/devpts/inode.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 5f3231b9633f..bff4052b05e7 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -198,9 +198,6 @@ static int mknod_ptmx(struct super_block *sb) fsi->ptmx_dentry = dentry; rc = 0; - - printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n", - inode->i_ino); out: mutex_unlock(&root->d_inode->i_mutex); return rc; @@ -369,8 +366,6 @@ static int new_pts_mount(struct file_system_type *fs_type, int flags, struct pts_fs_info *fsi; struct pts_mount_opts *opts; - printk(KERN_NOTICE "devpts: newinstance mount\n"); - err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt); if (err) return err; -- cgit v1.2.3