From 2bb93d244157b6dfa4964d4088be4680b3169701 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Wed, 20 Aug 2014 18:56:29 -0500
Subject: Trivial whitespace fix

Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/smb2ops.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 5a48aa290dfe..4e4eecdec4f9 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -1035,7 +1035,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon,
 		if (keep_size == false)
 			return -EOPNOTSUPP;
 
-	/* 
+	/*
 	 * Must check if file sparse since fallocate -z (zero range) assumes
 	 * non-sparse allocation
 	 */
-- 
cgit v1.2.3


From eaebdedc61ac7bc3dd6ef6eb508097c4aaabef0c Mon Sep 17 00:00:00 2001
From: Fabian Frederick <fabf@skynet.be>
Date: Wed, 2 Jul 2014 22:08:46 +0200
Subject: GFS2: fs/gfs2/super.c: replace seq_printf by seq_puts

fix checkpatch warnings:
"WARNING: Prefer seq_puts to seq_printf"

Cc: cluster-devel@redhat.com
Signed-off-by: Fabian Frederick <fabf@skynet.be>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/super.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index 2607ff13d486..a346f56c4c6d 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1294,7 +1294,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
 	int val;
 
 	if (is_ancestor(root, sdp->sd_master_dir))
-		seq_printf(s, ",meta");
+		seq_puts(s, ",meta");
 	if (args->ar_lockproto[0])
 		seq_printf(s, ",lockproto=%s", args->ar_lockproto);
 	if (args->ar_locktable[0])
@@ -1302,13 +1302,13 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
 	if (args->ar_hostdata[0])
 		seq_printf(s, ",hostdata=%s", args->ar_hostdata);
 	if (args->ar_spectator)
-		seq_printf(s, ",spectator");
+		seq_puts(s, ",spectator");
 	if (args->ar_localflocks)
-		seq_printf(s, ",localflocks");
+		seq_puts(s, ",localflocks");
 	if (args->ar_debug)
-		seq_printf(s, ",debug");
+		seq_puts(s, ",debug");
 	if (args->ar_posix_acl)
-		seq_printf(s, ",acl");
+		seq_puts(s, ",acl");
 	if (args->ar_quota != GFS2_QUOTA_DEFAULT) {
 		char *state;
 		switch (args->ar_quota) {
@@ -1328,7 +1328,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
 		seq_printf(s, ",quota=%s", state);
 	}
 	if (args->ar_suiddir)
-		seq_printf(s, ",suiddir");
+		seq_puts(s, ",suiddir");
 	if (args->ar_data != GFS2_DATA_DEFAULT) {
 		char *state;
 		switch (args->ar_data) {
@@ -1345,7 +1345,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
 		seq_printf(s, ",data=%s", state);
 	}
 	if (args->ar_discard)
-		seq_printf(s, ",discard");
+		seq_puts(s, ",discard");
 	val = sdp->sd_tune.gt_logd_secs;
 	if (val != 30)
 		seq_printf(s, ",commit=%d", val);
@@ -1376,11 +1376,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root)
 		seq_printf(s, ",errors=%s", state);
 	}
 	if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags))
-		seq_printf(s, ",nobarrier");
+		seq_puts(s, ",nobarrier");
 	if (test_bit(SDF_DEMOTE, &sdp->sd_flags))
-		seq_printf(s, ",demote_interface_used");
+		seq_puts(s, ",demote_interface_used");
 	if (args->ar_rgrplvb)
-		seq_printf(s, ",rgrplvb");
+		seq_puts(s, ",rgrplvb");
 	return 0;
 }
 
-- 
cgit v1.2.3


From b650738cd093a9f9e9551db9ce5cd68acd842dc0 Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 6 Aug 2014 09:08:36 -0400
Subject: GFS2: Change maxlen variables to size_t

This patch changes some variables (especially maxlen in function
gfs2_block_map) from unsigned int to size_t. We need 64-bit arithmetic
for very large files (e.g. 1PB) where the variables otherwise get
shifted to all 0's.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/bmap.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index e6ee5b6e8d99..f0b945ab853e 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -359,7 +359,7 @@ static inline void release_metapath(struct metapath *mp)
  * Returns: The length of the extent (minimum of one block)
  */
 
-static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob)
+static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob)
 {
 	const __be64 *end = (start + len);
 	const __be64 *first = ptr;
@@ -449,7 +449,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
 			   struct buffer_head *bh_map, struct metapath *mp,
 			   const unsigned int sheight,
 			   const unsigned int height,
-			   const unsigned int maxlen)
+			   const size_t maxlen)
 {
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
@@ -483,7 +483,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock,
 	} else {
 		/* Need to allocate indirect blocks */
 		ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs;
-		dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]);
+		dblks = min(maxlen, (size_t)(ptrs_per_blk -
+					     mp->mp_list[end_of_metadata]));
 		if (height == ip->i_height) {
 			/* Writing into existing tree, extend tree down */
 			iblks = height - sheight;
@@ -605,7 +606,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock,
 	struct gfs2_inode *ip = GFS2_I(inode);
 	struct gfs2_sbd *sdp = GFS2_SB(inode);
 	unsigned int bsize = sdp->sd_sb.sb_bsize;
-	const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits;
+	const size_t maxlen = bh_map->b_size >> inode->i_blkbits;
 	const u64 *arr = sdp->sd_heightsize;
 	__be64 *ptr;
 	u64 size;
-- 
cgit v1.2.3


From 2ddfbdd6848d496d8088b28992d257fd02e58c9d Mon Sep 17 00:00:00 2001
From: Bob Peterson <rpeterso@redhat.com>
Date: Wed, 20 Aug 2014 12:44:45 -0400
Subject: GFS2: Request demote when a "try" flock fails

This patch changes the flock code so that it uses the TRY_1CB flag
instead of the TRY flag on the first attempt. That forces any holding
nodes to issue a dlm callback, which requests a demote of the glock.
Then, if the "try" failed, it sleeps a small amount of time for the
demote to occur. Then it tries again, for an increasing amount of time.
Subsequent attempts to gain the "try" lock don't use "_1CB" so that
only one callback is issued.

Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/file.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index 26b3f952e6b1..7f4ed3daa38c 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -26,6 +26,7 @@
 #include <linux/dlm.h>
 #include <linux/dlm_plock.h>
 #include <linux/aio.h>
+#include <linux/delay.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -979,9 +980,10 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
 	unsigned int state;
 	int flags;
 	int error = 0;
+	int sleeptime;
 
 	state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED;
-	flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT;
+	flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY_1CB) | GL_EXACT;
 
 	mutex_lock(&fp->f_fl_mutex);
 
@@ -1001,7 +1003,14 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl)
 		gfs2_holder_init(gl, state, flags, fl_gh);
 		gfs2_glock_put(gl);
 	}
-	error = gfs2_glock_nq(fl_gh);
+	for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) {
+		error = gfs2_glock_nq(fl_gh);
+		if (error != GLR_TRYFAILED)
+			break;
+		fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT;
+		fl_gh->gh_error = 0;
+		msleep(sleeptime);
+	}
 	if (error) {
 		gfs2_holder_uninit(fl_gh);
 		if (error == GLR_TRYFAILED)
@@ -1024,7 +1033,7 @@ static void do_unflock(struct file *file, struct file_lock *fl)
 	mutex_lock(&fp->f_fl_mutex);
 	flock_lock_file_wait(file, fl);
 	if (fl_gh->gh_gl) {
-		gfs2_glock_dq_wait(fl_gh);
+		gfs2_glock_dq(fl_gh);
 		gfs2_holder_uninit(fl_gh);
 	}
 	mutex_unlock(&fp->f_fl_mutex);
-- 
cgit v1.2.3


From 27b7edcf1ce03a3eddda24d4d271a9b29572a78b Mon Sep 17 00:00:00 2001
From: Namjae Jeon <namjae.jeon@samsung.com>
Date: Wed, 20 Aug 2014 19:39:28 +0900
Subject: cifs: fix a possible null pointer deref in decode_ascii_ssetup

When kzalloc fails, we will end up doing NULL pointer derefrence

Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/sess.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 39ee32688eac..3a5e83317683 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -243,10 +243,11 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft,
 	kfree(ses->serverOS);
 
 	ses->serverOS = kzalloc(len + 1, GFP_KERNEL);
-	if (ses->serverOS)
+	if (ses->serverOS) {
 		strncpy(ses->serverOS, bcc_ptr, len);
-	if (strncmp(ses->serverOS, "OS/2", 4) == 0)
-		cifs_dbg(FYI, "OS/2 server\n");
+		if (strncmp(ses->serverOS, "OS/2", 4) == 0)
+			cifs_dbg(FYI, "OS/2 server\n");
+	}
 
 	bcc_ptr += len + 1;
 	bleft -= len + 1;
-- 
cgit v1.2.3


From d6ccf4997e62fb6629f9f003980dca5292138b7b Mon Sep 17 00:00:00 2001
From: Namjae Jeon <namjae.jeon@samsung.com>
Date: Thu, 21 Aug 2014 19:11:20 +0900
Subject: cifs: fix memory leak when password is supplied multiple times

Unlikely but possible. When password is supplied multiple times, we have
to free the previous allocation.

Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/connect.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 03ed8a09581c..36ca2045009b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -1600,6 +1600,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 			tmp_end++;
 			if (!(tmp_end < end && tmp_end[1] == delim)) {
 				/* No it is not. Set the password to NULL */
+				kfree(vol->password);
 				vol->password = NULL;
 				break;
 			}
@@ -1637,6 +1638,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname,
 					options = end;
 			}
 
+			kfree(vol->password);
 			/* Now build new password string */
 			temp_len = strlen(value);
 			vol->password = kzalloc(temp_len+1, GFP_KERNEL);
-- 
cgit v1.2.3


From 7de975e349b295f387f34eed38f115223f17d5ee Mon Sep 17 00:00:00 2001
From: Namjae Jeon <namjae.jeon@samsung.com>
Date: Wed, 20 Aug 2014 19:39:41 +0900
Subject: cifs: fix a possible use of uninit variable in SMB2_sess_setup

In case of error, goto ssetup_exit can be hit and we could end up using
uninitialized value of resp_buftype

Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/smb2pdu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index fa0dd044213b..9df5d8effe47 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -530,7 +530,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses,
 	struct smb2_sess_setup_rsp *rsp = NULL;
 	struct kvec iov[2];
 	int rc = 0;
-	int resp_buftype;
+	int resp_buftype = CIFS_NO_BUFFER;
 	__le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */
 	struct TCP_Server_Info *server = ses->server;
 	u16 blob_length = 0;
-- 
cgit v1.2.3


From d4a029d21556437b09ffb3207cf2871651bec38f Mon Sep 17 00:00:00 2001
From: Namjae Jeon <namjae.jeon@samsung.com>
Date: Wed, 20 Aug 2014 19:39:59 +0900
Subject: cifs: remove unneeded check of null checking in if condition

Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/smb2pdu.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 9df5d8effe47..cb39c51cd3e0 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1403,8 +1403,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon,
 	rsp = (struct smb2_close_rsp *)iov[0].iov_base;
 
 	if (rc != 0) {
-		if (tcon)
-			cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
+		cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE);
 		goto close_exit;
 	}
 
-- 
cgit v1.2.3


From 787aded65044e4cabefcf7eb7576c2dd6b151468 Mon Sep 17 00:00:00 2001
From: Namjae Jeon <namjae.jeon@samsung.com>
Date: Fri, 22 Aug 2014 14:22:51 +0900
Subject: cifs: Allow directIO read/write during cache=strict

Currently cifs have all or nothing approach for directIO operations.
cache=strict mode does not allow directIO while cache=none mode performs
all the operations as directIO even when user does not specify O_DIRECT
flag. This patch enables strict cache mode to honour directIO semantics.

Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/dir.c  | 8 ++++++++
 fs/cifs/file.c | 8 ++++++++
 2 files changed, 16 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c
index 3db0c5fd9a11..6cbd9c688cfe 100644
--- a/fs/cifs/dir.c
+++ b/fs/cifs/dir.c
@@ -497,6 +497,14 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry,
 		goto out;
 	}
 
+	if (file->f_flags & O_DIRECT &&
+	    CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
+		if (CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+			file->f_op = &cifs_file_direct_nobrl_ops;
+		else
+			file->f_op = &cifs_file_direct_ops;
+		}
+
 	file_info = cifs_new_fileinfo(&fid, file, tlink, oplock);
 	if (file_info == NULL) {
 		if (server->ops->close)
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index d5fec92e0360..7c018a1c52f7 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -467,6 +467,14 @@ int cifs_open(struct inode *inode, struct file *file)
 	cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n",
 		 inode, file->f_flags, full_path);
 
+	if (file->f_flags & O_DIRECT &&
+	    cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) {
+		if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL)
+			file->f_op = &cifs_file_direct_nobrl_ops;
+		else
+			file->f_op = &cifs_file_direct_ops;
+	}
+
 	if (server->oplocks)
 		oplock = REQ_OPLOCK;
 	else
-- 
cgit v1.2.3


From 52a36244443eabb594bdb63622ff2dd7a083f0e2 Mon Sep 17 00:00:00 2001
From: Namjae Jeon <namjae.jeon@samsung.com>
Date: Wed, 20 Aug 2014 19:39:17 +0900
Subject: cifs: No need to send SIGKILL to demux_thread during umount

There is no need to explicitly send SIGKILL to cifs_demultiplex_thread
as it is calling module_put_and_exit to exit cleanly.

socket sk_rcvtimeo is set to 7 HZ so the thread will wake up in 7 seconds and
clean itself.

Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
Signed-off-by: Ashish Sangwan <a.sangwan@samsung.com>
Acked-by: Jeff Layton <jlayton@samba.org>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/connect.c | 19 -------------------
 1 file changed, 19 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 36ca2045009b..8a9fded7c135 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -837,7 +837,6 @@ cifs_demultiplex_thread(void *p)
 	struct TCP_Server_Info *server = p;
 	unsigned int pdu_length;
 	char *buf = NULL;
-	struct task_struct *task_to_wake = NULL;
 	struct mid_q_entry *mid_entry;
 
 	current->flags |= PF_MEMALLOC;
@@ -928,19 +927,7 @@ cifs_demultiplex_thread(void *p)
 	if (server->smallbuf) /* no sense logging a debug message if NULL */
 		cifs_small_buf_release(server->smallbuf);
 
-	task_to_wake = xchg(&server->tsk, NULL);
 	clean_demultiplex_info(server);
-
-	/* if server->tsk was NULL then wait for a signal before exiting */
-	if (!task_to_wake) {
-		set_current_state(TASK_INTERRUPTIBLE);
-		while (!signal_pending(current)) {
-			schedule();
-			set_current_state(TASK_INTERRUPTIBLE);
-		}
-		set_current_state(TASK_RUNNING);
-	}
-
 	module_put_and_exit(0);
 }
 
@@ -2063,8 +2050,6 @@ cifs_find_tcp_session(struct smb_vol *vol)
 static void
 cifs_put_tcp_session(struct TCP_Server_Info *server)
 {
-	struct task_struct *task;
-
 	spin_lock(&cifs_tcp_ses_lock);
 	if (--server->srv_count > 0) {
 		spin_unlock(&cifs_tcp_ses_lock);
@@ -2088,10 +2073,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
 	kfree(server->session_key.response);
 	server->session_key.response = NULL;
 	server->session_key.len = 0;
-
-	task = xchg(&server->tsk, NULL);
-	if (task)
-		force_sig(SIGKILL, task);
 }
 
 static struct TCP_Server_Info *
-- 
cgit v1.2.3


From a07d322059db66b84c9eb4f98959df468e88b34b Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <pshilovsky@samba.org>
Date: Fri, 22 Aug 2014 13:32:09 +0400
Subject: CIFS: Fix directory rename error

CIFS servers process nlink counts differently for files and directories.
In cifs_rename() if we the request fails on the existing target, we
try to remove it through cifs_unlink() but this is not what we want
to do for directories. As the result the following sequence of commands

mkdir {1,2}; mv -T 1 2; rmdir {1,2}; mkdir {1,2}; echo foo > 2/bar

and XFS test generic/023 fail with -ENOENT error. That's why the second
mkdir reuses the existing inode (target inode of the mv -T command) with
S_DEAD flag.

Fix this by checking whether the target is directory or not and
calling cifs_rmdir() rather than cifs_unlink() for directories.

Cc: <stable@vger.kernel.org>
Signed-off-by: Pavel Shilovsky <pshilovsky@samba.org>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/inode.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c
index 949ec909ec9a..7899a40465b3 100644
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -1720,7 +1720,10 @@ cifs_rename2(struct inode *source_dir, struct dentry *source_dentry,
 unlink_target:
 	/* Try unlinking the target dentry if it's not negative */
 	if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) {
-		tmprc = cifs_unlink(target_dir, target_dentry);
+		if (d_is_dir(target_dentry))
+			tmprc = cifs_rmdir(target_dir, target_dentry);
+		else
+			tmprc = cifs_unlink(target_dir, target_dentry);
 		if (tmprc)
 			goto cifs_rename_exit;
 		rc = cifs_do_rename(xid, source_dentry, from_name,
-- 
cgit v1.2.3


From f736906a7669a77cf8cabdcbcf1dc8cb694e12ef Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <pshilovsky@samba.org>
Date: Tue, 26 Aug 2014 19:04:44 +0400
Subject: CIFS: Fix wrong restart readdir for SMB1

The existing code calls server->ops->close() that is not
right. This causes XFS test generic/310 to fail. Fix this
by using server->ops->closedir() function.

Cc: <stable@vger.kernel.org> # v3.7+
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Pavel Shilovsky <pshilovsky@samba.org>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/readdir.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c
index 798c80a41c88..b334a89d6a66 100644
--- a/fs/cifs/readdir.c
+++ b/fs/cifs/readdir.c
@@ -596,8 +596,8 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos,
 		if (server->ops->dir_needs_close(cfile)) {
 			cfile->invalidHandle = true;
 			spin_unlock(&cifs_file_list_lock);
-			if (server->ops->close)
-				server->ops->close(xid, tcon, &cfile->fid);
+			if (server->ops->close_dir)
+				server->ops->close_dir(xid, tcon, &cfile->fid);
 		} else
 			spin_unlock(&cifs_file_list_lock);
 		if (cfile->srch_inf.ntwrk_buf_start) {
-- 
cgit v1.2.3


From 1bbe4997b13de903c421c1cc78440e544b5f9064 Mon Sep 17 00:00:00 2001
From: Pavel Shilovsky <pshilovsky@samba.org>
Date: Fri, 22 Aug 2014 13:32:11 +0400
Subject: CIFS: Fix wrong filename length for SMB2

The existing code uses the old MAX_NAME constant. This causes
XFS test generic/013 to fail. Fix it by replacing MAX_NAME with
PATH_MAX that SMB1 uses. Also remove an unused MAX_NAME constant
definition.

Cc: <stable@vger.kernel.org> # v3.7+
Signed-off-by: Pavel Shilovsky <pshilovsky@samba.org>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/cifsglob.h  | 5 -----
 fs/cifs/smb2file.c  | 2 +-
 fs/cifs/smb2inode.c | 2 +-
 fs/cifs/smb2ops.c   | 2 +-
 fs/cifs/smb2pdu.c   | 2 +-
 5 files changed, 4 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index dfc731b02aa9..25b8392bfdd2 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -70,11 +70,6 @@
 #define SERVER_NAME_LENGTH 40
 #define SERVER_NAME_LEN_WITH_NULL     (SERVER_NAME_LENGTH + 1)
 
-/* used to define string lengths for reversing unicode strings */
-/*         (256+1)*2 = 514                                     */
-/*           (max path length + 1 for null) * 2 for unicode    */
-#define MAX_NAME 514
-
 /* SMB echo "timeout" -- FIXME: tunable? */
 #define SMB_ECHO_INTERVAL (60 * HZ)
 
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 3f17b4550831..45992944e238 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -50,7 +50,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms,
 		goto out;
 	}
 
-	smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
+	smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
 			    GFP_KERNEL);
 	if (smb2_data == NULL) {
 		rc = -ENOMEM;
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index 0150182a4494..899bbc86f73e 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -131,7 +131,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon,
 	*adjust_tz = false;
 	*symlink = false;
 
-	smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
+	smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
 			    GFP_KERNEL);
 	if (smb2_data == NULL)
 		return -ENOMEM;
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index 4e4eecdec4f9..f522193b7184 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -389,7 +389,7 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon,
 	int rc;
 	struct smb2_file_all_info *smb2_data;
 
-	smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
+	smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
 			    GFP_KERNEL);
 	if (smb2_data == NULL)
 		return -ENOMEM;
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index cb39c51cd3e0..74b3a6684383 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -1532,7 +1532,7 @@ SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon,
 {
 	return query_info(xid, tcon, persistent_fid, volatile_fid,
 			  FILE_ALL_INFORMATION,
-			  sizeof(struct smb2_file_all_info) + MAX_NAME * 2,
+			  sizeof(struct smb2_file_all_info) + PATH_MAX * 2,
 			  sizeof(struct smb2_file_all_info), data);
 }
 
-- 
cgit v1.2.3


From ca5d13fc33cc56f7405004574d18172ddbdea2ef Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Fri, 22 Aug 2014 04:40:46 -0500
Subject: Clarify Kconfig help text for CIFS and SMB2/SMB3

Clarify descriptions of SMB2 and SMB3 support in Kconfig

Signed-off-by: Steve French <smfrench@gmail.com>
Reviewed-by: Pavel Shilovsky <pshilovsky@samba.org>
---
 fs/cifs/Kconfig | 35 +++++++++++++++++++++++------------
 1 file changed, 23 insertions(+), 12 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig
index 603f18a65c12..a2172f3f69e3 100644
--- a/fs/cifs/Kconfig
+++ b/fs/cifs/Kconfig
@@ -22,6 +22,11 @@ config CIFS
 	  support for OS/2 and Windows ME and similar servers is provided as
 	  well.
 
+	  The module also provides optional support for the followon
+	  protocols for CIFS including SMB3, which enables
+	  useful performance and security features (see the description
+	  of CONFIG_CIFS_SMB2).
+
 	  The cifs module provides an advanced network file system
 	  client for mounting to CIFS compliant servers.  It includes
 	  support for DFS (hierarchical name space), secure per-user
@@ -121,7 +126,8 @@ config CIFS_ACL
 	  depends on CIFS_XATTR && KEYS
 	  help
 	    Allows fetching CIFS/NTFS ACL from the server.  The DACL blob
-	    is handed over to the application/caller.
+	    is handed over to the application/caller.  See the man
+	    page for getcifsacl for more information.
 
 config CIFS_DEBUG
 	bool "Enable CIFS debugging routines"
@@ -162,7 +168,7 @@ config CIFS_NFSD_EXPORT
 	   Allows NFS server to export a CIFS mounted share (nfsd over cifs)
 
 config CIFS_SMB2
-	bool "SMB2 network file system support"
+	bool "SMB2 and SMB3 network file system support"
 	depends on CIFS && INET
 	select NLS
 	select KEYS
@@ -170,16 +176,21 @@ config CIFS_SMB2
 	select DNS_RESOLVER
 
 	help
-	  This enables experimental support for the SMB2 (Server Message Block
-	  version 2) protocol. The SMB2 protocol is the successor to the
-	  popular CIFS and SMB network file sharing protocols. SMB2 is the
-	  native file sharing mechanism for recent versions of Windows
-	  operating systems (since Vista).  SMB2 enablement will eventually
-	  allow users better performance, security and features, than would be
-	  possible with cifs. Note that smb2 mount options also are simpler
-	  (compared to cifs) due to protocol improvements.
-
-	  Unless you are a developer or tester, say N.
+	  This enables support for the Server Message Block version 2
+	  family of protocols, including SMB3.  SMB3 support is
+	  enabled on mount by specifying "vers=3.0" in the mount
+	  options. These protocols are the successors to the popular
+	  CIFS and SMB network file sharing protocols. SMB3 is the
+	  native file sharing mechanism for the more recent
+	  versions of Windows (Windows 8 and Windows 2012 and
+	  later) and Samba server and many others support SMB3 well.
+	  In general SMB3 enables better performance, security
+	  and features, than would be possible with CIFS (Note that
+	  when mounting to Samba, due to the CIFS POSIX extensions,
+	  CIFS mounts can provide slightly better POSIX compatibility
+	  than SMB3 mounts do though). Note that SMB2/SMB3 mount
+	  options are also slightly simpler (compared to CIFS) due
+	  to protocol improvements.
 
 config CIFS_FSCACHE
 	  bool "Provide CIFS client caching support"
-- 
cgit v1.2.3


From 9776de96e51565286da25c74d6f631abc50c63ef Mon Sep 17 00:00:00 2001
From: Milosz Tanski <milosz@adfin.com>
Date: Wed, 13 Aug 2014 12:58:16 -0400
Subject: FS-Cache: Timeout for releasepage()

This is meant to avoid a recusive hang caused by underlying filesystem trying
to grab a free page and causing a write-out.

INFO: task kworker/u30:7:28375 blocked for more than 120 seconds.
      Not tainted 3.15.0-virtual #74
"echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
kworker/u30:7   D 0000000000000000     0 28375      2 0x00000000
Workqueue: fscache_operation fscache_op_work_func [fscache]
 ffff88000b147148 0000000000000046 0000000000000000 ffff88000b1471c8
 ffff8807aa031820 0000000000014040 ffff88000b147fd8 0000000000014040
 ffff880f0c50c860 ffff8807aa031820 ffff88000b147158 ffff88007be59cd0
Call Trace:
 [<ffffffff815930e9>] schedule+0x29/0x70
 [<ffffffffa018bed5>] __fscache_wait_on_page_write+0x55/0x90 [fscache]
 [<ffffffff810a4350>] ? __wake_up_sync+0x20/0x20
 [<ffffffffa018c135>] __fscache_maybe_release_page+0x65/0x1e0 [fscache]
 [<ffffffffa02ad813>] ceph_releasepage+0x83/0x100 [ceph]
 [<ffffffff811635b0>] ? anon_vma_fork+0x130/0x130
 [<ffffffff8112cdd2>] try_to_release_page+0x32/0x50
 [<ffffffff81140096>] shrink_page_list+0x7e6/0x9d0
 [<ffffffff8113f278>] ? isolate_lru_pages.isra.73+0x78/0x1e0
 [<ffffffff81140932>] shrink_inactive_list+0x252/0x4c0
 [<ffffffff811412b1>] shrink_lruvec+0x3e1/0x670
 [<ffffffff8114157f>] shrink_zone+0x3f/0x110
 [<ffffffff81141b06>] do_try_to_free_pages+0x1d6/0x450
 [<ffffffff8114a939>] ? zone_statistics+0x99/0xc0
 [<ffffffff81141e44>] try_to_free_pages+0xc4/0x180
 [<ffffffff81136982>] __alloc_pages_nodemask+0x6b2/0xa60
 [<ffffffff811c1d4e>] ? __find_get_block+0xbe/0x250
 [<ffffffff810a405e>] ? wake_up_bit+0x2e/0x40
 [<ffffffff811740c3>] alloc_pages_current+0xb3/0x180
 [<ffffffff8112cf07>] __page_cache_alloc+0xb7/0xd0
 [<ffffffff8112da6c>] grab_cache_page_write_begin+0x7c/0xe0
 [<ffffffff81214072>] ? ext4_mark_inode_dirty+0x82/0x220
 [<ffffffff81214a89>] ext4_da_write_begin+0x89/0x2d0
 [<ffffffff8112c6ee>] generic_perform_write+0xbe/0x1d0
 [<ffffffff811a96b1>] ? update_time+0x81/0xc0
 [<ffffffff811ad4c2>] ? mnt_clone_write+0x12/0x30
 [<ffffffff8112e80e>] __generic_file_aio_write+0x1ce/0x3f0
 [<ffffffff8112ea8e>] generic_file_aio_write+0x5e/0xe0
 [<ffffffff8120b94f>] ext4_file_write+0x9f/0x410
 [<ffffffff8120af56>] ? ext4_file_open+0x66/0x180
 [<ffffffff8118f0da>] do_sync_write+0x5a/0x90
 [<ffffffffa025c6c9>] cachefiles_write_page+0x149/0x430 [cachefiles]
 [<ffffffff812cf439>] ? radix_tree_gang_lookup_tag+0x89/0xd0
 [<ffffffffa018c512>] fscache_write_op+0x222/0x3b0 [fscache]
 [<ffffffffa018b35a>] fscache_op_work_func+0x3a/0x100 [fscache]
 [<ffffffff8107bfe9>] process_one_work+0x179/0x4a0
 [<ffffffff8107d47b>] worker_thread+0x11b/0x370
 [<ffffffff8107d360>] ? manage_workers.isra.21+0x2e0/0x2e0
 [<ffffffff81083d69>] kthread+0xc9/0xe0
 [<ffffffff81010000>] ? ftrace_raw_event_xen_mmu_release_ptpage+0x70/0x90
 [<ffffffff81083ca0>] ? flush_kthread_worker+0xb0/0xb0
 [<ffffffff8159eefc>] ret_from_fork+0x7c/0xb0
 [<ffffffff81083ca0>] ? flush_kthread_worker+0xb0/0xb0

Signed-off-by: Milosz Tanski <milosz@adfin.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/fscache/page.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 85332b9d19d1..781ac7b0b53e 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -43,6 +43,19 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa
 }
 EXPORT_SYMBOL(__fscache_wait_on_page_write);
 
+/*
+ * wait for a page to finish being written to the cache. Put a timeout here
+ * since we might be called recursively via parent fs.
+ */
+static
+bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page)
+{
+	wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0);
+
+	return wait_event_timeout(*wq, !__fscache_check_page_write(cookie, page),
+				  HZ);
+}
+
 /*
  * decide whether a page can be released, possibly by cancelling a store to it
  * - we're allowed to sleep if __GFP_WAIT is flagged
@@ -115,7 +128,10 @@ page_busy:
 	}
 
 	fscache_stat(&fscache_n_store_vmscan_wait);
-	__fscache_wait_on_page_write(cookie, page);
+	if (!release_page_wait_timeout(cookie, page))
+		_debug("fscache writeout timeout page: %p{%lx}",
+			page, page->index);
+
 	gfp &= ~__GFP_WAIT;
 	goto try_again;
 }
-- 
cgit v1.2.3


From 920bce20d74817bdd8bfcbc28ecb1179c9e01081 Mon Sep 17 00:00:00 2001
From: Milosz Tanski <milosz@adfin.com>
Date: Wed, 13 Aug 2014 12:58:21 -0400
Subject: FS-Cache: Reduce cookie ref count if submit fails.

I've been seeing issues with disposing cookies under vma pressure. The symptom
is that the refcount gets out of sync. In this case we fail to decrement the
refcount if submit fails. I found this while auditing the error in and around
cookie operations.

Signed-off-by: Milosz Tanski <milosz@adfin.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/fscache/object.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'fs')

diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index d3b4539f1651..da032daf0e0d 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -982,6 +982,7 @@ nomem:
 submit_op_failed:
 	clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags);
 	spin_unlock(&cookie->lock);
+	fscache_unuse_cookie(object);
 	kfree(op);
 	_leave(" [EIO]");
 	return transit_to(KILL_OBJECT);
-- 
cgit v1.2.3


From e9512d72e8e61c750c90efacd720abe3c4569822 Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Tue, 26 Aug 2014 13:55:54 -0700
Subject: Btrfs: fix autodefrag with compression

The autodefrag code skips defrag when two extents are adjacent.  But one
big advantage for autodefrag is cutting down on the number of small
extents, even when they are adjacent.  This commit changes it to defrag
all small extents.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index fce6fd0e3f50..a018ea484d39 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1019,8 +1019,10 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em)
 		return false;
 
 	next = defrag_lookup_extent(inode, em->start + em->len);
-	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE ||
-	    (em->block_start + em->block_len == next->block_start))
+	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
+		ret = false;
+	else if ((em->block_start + em->block_len == next->block_start) &&
+		 (em->block_len > 128 * 1024 && next->block_len > 128 * 1024))
 		ret = false;
 
 	free_extent_map(next);
@@ -1055,7 +1057,6 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh,
 	}
 
 	next_mergeable = defrag_check_next_extent(inode, em);
-
 	/*
 	 * we hit a real extent, if it is big or the next extent is not a
 	 * real extent, don't bother defragging it
-- 
cgit v1.2.3


From d9f85963e3f7f5582552fdae54a2b89d6c62daf5 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Mon, 25 Aug 2014 10:43:00 +0100
Subject: Btrfs: fix corruption after write/fsync failure + fsync + log
 recovery

While writing to a file, in inode.c:cow_file_range() (and same applies to
submit_compressed_extents()), after reserving an extent for the file data,
we create a new extent map for the written range and insert it into the
extent map cache. After that, we create an ordered operation, but if it
fails (due to a transient/temporary-ENOMEM), we return without dropping
that extent map, which points to a reserved extent that is freed when we
return. A subsequent incremental fsync (when the btrfs inode doesn't have
the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and
logs a file extent item based on that extent map, which points to a disk
extent that doesn't contain valid data - it was freed by us earlier, at this
point it might contain any random/garbage data.

Therefore, if we reach an error condition when cowing a file range after
we added the new extent map to the cache, drop it from the cache before
returning.

Some sequence of steps that lead to this:

    $ mkfs.btrfs -f /dev/sdd
    $ mount -o commit=9999 /dev/sdd /mnt
    $ cd /mnt

    $ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo
    $ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096"
    $ sync

    $ od -t x1 foo
    0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
    *
    0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02
    *
    0020000

    $ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo

    # Now this write + fsync fail with -ENOMEM, which was returned by
    # btrfs_add_ordered_extent() in inode.c:cow_file_range().
    $ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo
    $ xfs_io -c "fsync" foo
    fsync: Cannot allocate memory

    # Now do a new write + fsync, which will succeed. Our previous
    # -ENOMEM was a transient/temporary error.
    $ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo
    $ xfs_io -c "fsync" foo

    # Our file content (in page cache) is now:
    $ od -t x1 foo
    0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1
    *
    0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
    *
    0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
    *
    0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
    *
    0050000

    # Now reboot the machine, and mount the fs, so that fsync log replay
    # takes place.

    # The file content is now weird, in particular the first 8Kb, which
    # do not match our data before nor after the sync command above.
    $ od -t x1 foo
    0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
    *
    0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01
    *
    0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
    *
    0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee
    *
    0050000

    # In fact these first 4Kb are a duplicate of the last 4kb block.
    # The last write got an extent map/file extent item that points to
    # the same disk extent that we got in the write+fsync that failed
    # with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to
    # verify that:

    $ btrfs-debug-tree /dev/sdd
    (...)
	item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53
		extent data disk byte 12582912 nr 8192
		extent data offset 0 nr 8192 ram 8192
	item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53
		extent data disk byte 0 nr 0
		extent data offset 0 nr 8192 ram 8192
	item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53
		extent data disk byte 12582912 nr 4096
		extent data offset 0 nr 4096 ram 4096

    $ umount /dev/sdd
    $ btrfsck /dev/sdd
    Checking filesystem on /dev/sdd
    UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f
    checking extents
    extent item 12582912 has multiple extent items
    ref mismatch on [12582912 4096] extent item 1, found 2
    Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192
    backpointer mismatch on [12582912 4096]
    Errors found in extent allocation tree or chunk allocation
    checking free space cache
    checking fs roots
    root 5 inode 257 errors 1000, some csum missing
    found 131074 bytes used err is 1
    total csum bytes: 4
    total tree bytes: 131072
    total fs tree bytes: 32768
    total extent tree bytes: 16384
    btree space waste bytes: 123404
    file data blocks allocated: 274432
     referenced 274432
    Btrfs v3.14.1-96-gcc7fd5a-dirty

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 3d020d6d9ace..7313571e1860 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -778,8 +778,12 @@ retry:
 						ins.offset,
 						BTRFS_ORDERED_COMPRESSED,
 						async_extent->compress_type);
-		if (ret)
+		if (ret) {
+			btrfs_drop_extent_cache(inode, async_extent->start,
+						async_extent->start +
+						async_extent->ram_size - 1, 0);
 			goto out_free_reserve;
+		}
 
 		/*
 		 * clear dirty, set writeback and unlock the pages.
@@ -971,14 +975,14 @@ static noinline int cow_file_range(struct inode *inode,
 		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
 					       ram_size, cur_alloc_size, 0);
 		if (ret)
-			goto out_reserve;
+			goto out_drop_extent_cache;
 
 		if (root->root_key.objectid ==
 		    BTRFS_DATA_RELOC_TREE_OBJECTID) {
 			ret = btrfs_reloc_clone_csums(inode, start,
 						      cur_alloc_size);
 			if (ret)
-				goto out_reserve;
+				goto out_drop_extent_cache;
 		}
 
 		if (disk_num_bytes < cur_alloc_size)
@@ -1006,6 +1010,8 @@ static noinline int cow_file_range(struct inode *inode,
 out:
 	return ret;
 
+out_drop_extent_cache:
+	btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0);
 out_reserve:
 	btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1);
 out_unlock:
-- 
cgit v1.2.3


From dac5705cad20070a70bb028ca52e1f0bc157b42d Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Fri, 29 Aug 2014 20:54:26 +0100
Subject: Btrfs: fix crash while doing a ranged fsync

While doing a ranged fsync, that is, one whose range doesn't cover the
whole possible file range (0 to LLONG_MAX), we can crash under certain
circumstances with a trace like the following:

[41074.641913] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC
(...)
[41074.642692] CPU: 0 PID: 24580 Comm: fsx Not tainted 3.16.0-fdm-btrfs-next-45+ #1
(...)
[41074.643886] RIP: 0010:[<ffffffffa01ecc99>]  [<ffffffffa01ecc99>] btrfs_ordered_update_i_size+0x279/0x2b0 [btrfs]
(...)
[41074.644919] Stack:
(...)
[41074.644919] Call Trace:
[41074.644919]  [<ffffffffa01db531>] btrfs_truncate_inode_items+0x3f1/0xa10 [btrfs]
[41074.644919]  [<ffffffffa01eb54f>] ? btrfs_get_logged_extents+0x4f/0x80 [btrfs]
[41074.644919]  [<ffffffffa02137a9>] btrfs_log_inode+0x2f9/0x970 [btrfs]
[41074.644919]  [<ffffffff81090875>] ? sched_clock_local+0x25/0xa0
[41074.644919]  [<ffffffff8164a55e>] ? mutex_unlock+0xe/0x10
[41074.644919]  [<ffffffff810af51d>] ? trace_hardirqs_on+0xd/0x10
[41074.644919]  [<ffffffffa0214b4f>] btrfs_log_inode_parent+0x1ef/0x560 [btrfs]
[41074.644919]  [<ffffffff811d0c55>] ? dget_parent+0x5/0x180
[41074.644919]  [<ffffffffa0215d11>] btrfs_log_dentry_safe+0x51/0x80 [btrfs]
[41074.644919]  [<ffffffffa01e2d1a>] btrfs_sync_file+0x1ba/0x3e0 [btrfs]
[41074.644919]  [<ffffffff811eda6b>] vfs_fsync_range+0x1b/0x30
(...)

The necessary conditions that lead to such crash are:

* an incremental fsync (when the inode doesn't have the
  BTRFS_INODE_NEEDS_FULL_SYNC flag set) happened for our file and it logged
  a file extent item ending at offset X;

* the file got the flag BTRFS_INODE_NEEDS_FULL_SYNC set in its inode, due
  to a file truncate operation that reduces the file to a size smaller
  than X;

* a ranged fsync call happens (via an msync for example), with a range that
  doesn't cover the whole file and the end of this range, lets call it Y, is
  smaller than X;

* btrfs_log_inode, sees the flag BTRFS_INODE_NEEDS_FULL_SYNC set and
  calls btrfs_truncate_inode_items() to remove all items from the log
  tree that are associated with our file;

* btrfs_truncate_inode_items() removes all of the inode's items, and the lowest
  file extent item it removed is the one ending at offset X, where X > 0 and
  X > Y - before returning, it calls btrfs_ordered_update_i_size() with an offset
  parameter set to X;

* btrfs_ordered_update_i_size() sees that X is greater then the current ordered
  size (btrfs_inode's disk_i_size) and then it assumes there can't be any ongoing
  ordered operation with a range covering the offset X, calling a BUG_ON() if
  such ordered operation exists. This assumption is made because the disk_i_size
  is only increased after the corresponding file extent item is added to the
  btree (btrfs_finish_ordered_io);

* But because our fsync covers only a limited range, such an ordered extent might
  exist, and our fsync callback (btrfs_sync_file) doesn't wait for such ordered
  extent to finish when calling btrfs_wait_ordered_range();

And then by the time btrfs_ordered_update_i_size() is called, via:

   btrfs_sync_file() ->
       btrfs_log_dentry_safe() ->
           btrfs_log_inode_parent() ->
               btrfs_log_inode() ->
                   btrfs_truncate_inode_items() ->
                       btrfs_ordered_update_i_size()

We hit the BUG_ON(), which could never happen if the fsync range covered the whole
possible file range (0 to LLONG_MAX), as we would wait for all ordered extents to
finish before calling btrfs_truncate_inode_items().

So just don't call btrfs_ordered_update_i_size() if we're removing the inode's items
from a log tree, which isn't supposed to change the in memory inode's disk_i_size.

Issue found while running xfstests/generic/127 (happens very rarely for me), more
specifically via the fsx calls that use memory mapped IO (and issue msync calls).

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/inode.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7313571e1860..88823f4ca451 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -4248,7 +4248,8 @@ out:
 			btrfs_abort_transaction(trans, root, ret);
 	}
 error:
-	if (last_size != (u64)-1)
+	if (last_size != (u64)-1 &&
+	    root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
 		btrfs_ordered_update_i_size(inode, last_size, NULL);
 	btrfs_free_path(path);
 	return err;
-- 
cgit v1.2.3


From a9cfcd63e8d206ce4235c355d857c4fbdf0f4587 Mon Sep 17 00:00:00 2001
From: Theodore Ts'o <tytso@mit.edu>
Date: Wed, 3 Sep 2014 09:33:00 -0400
Subject: ext4: avoid trying to kfree an ERR_PTR pointer

Thanks to Dan Carpenter for extending smatch to find bugs like this.
(This was found using a development version of smatch.)

Fixes: 36de928641ee48b2078d3fe9514242aaa2f92013
Reported-by: Dan Carpenter <dan.carpenter@oracle.com
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Cc: stable@vger.kernel.org
---
 fs/ext4/namei.c  | 2 ++
 fs/ext4/resize.c | 2 ++
 2 files changed, 4 insertions(+)

(limited to 'fs')

diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 90a3cdca3f88..603e4ebbd0ac 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -3240,6 +3240,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 				 &new.de, &new.inlined);
 	if (IS_ERR(new.bh)) {
 		retval = PTR_ERR(new.bh);
+		new.bh = NULL;
 		goto end_rename;
 	}
 	if (new.bh) {
@@ -3386,6 +3387,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
 				 &new.de, &new.inlined);
 	if (IS_ERR(new.bh)) {
 		retval = PTR_ERR(new.bh);
+		new.bh = NULL;
 		goto end_rename;
 	}
 
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index bb0e80f03e2e..1e43b905ff98 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -575,6 +575,7 @@ handle_bb:
 		bh = bclean(handle, sb, block);
 		if (IS_ERR(bh)) {
 			err = PTR_ERR(bh);
+			bh = NULL;
 			goto out;
 		}
 		overhead = ext4_group_overhead_blocks(sb, group);
@@ -603,6 +604,7 @@ handle_ib:
 		bh = bclean(handle, sb, block);
 		if (IS_ERR(bh)) {
 			err = PTR_ERR(bh);
+			bh = NULL;
 			goto out;
 		}
 
-- 
cgit v1.2.3


From 8a70ee3307908c46f952df91be72a18d5f5ad0a3 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 4 Sep 2014 11:47:51 +0200
Subject: udf: Avoid dir link count to go negative

If we are writing back inode of unlinked directory, its link count ends
up being (u16)-1. Although the inode is deleted, udf_iget() can load the
inode when NFS uses stale file handle and get confused.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/inode.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 236cd48184c2..e86f9b67aa16 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1664,7 +1664,7 @@ static int udf_update_inode(struct inode *inode, int do_sync)
 		     FE_PERM_U_DELETE | FE_PERM_U_CHATTR));
 	fe->permissions = cpu_to_le32(udfperms);
 
-	if (S_ISDIR(inode->i_mode))
+	if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0)
 		fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1);
 	else
 		fe->fileLinkCount = cpu_to_le16(inode->i_nlink);
-- 
cgit v1.2.3


From bb7720a0b4a8ca3269fd86fbb45a78d2e0d3deaf Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 4 Sep 2014 13:32:50 +0200
Subject: udf: Fold udf_fill_inode() into __udf_read_inode()

There's no good reason to separate these since udf_fill_inode() is
called only from __udf_read_inode() and both do part of the same thing.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/inode.c | 22 +++++-----------------
 1 file changed, 5 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index e86f9b67aa16..68cc7b144c26 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -51,7 +51,6 @@ MODULE_LICENSE("GPL");
 
 static umode_t udf_convert_permissions(struct fileEntry *);
 static int udf_update_inode(struct inode *, int);
-static void udf_fill_inode(struct inode *, struct buffer_head *);
 static int udf_sync_inode(struct inode *inode);
 static int udf_alloc_i_data(struct inode *inode, size_t size);
 static sector_t inode_getblk(struct inode *, sector_t, int *, int *);
@@ -1275,8 +1274,11 @@ static void __udf_read_inode(struct inode *inode)
 {
 	struct buffer_head *bh = NULL;
 	struct fileEntry *fe;
+	struct extendedFileEntry *efe;
 	uint16_t ident;
 	struct udf_inode_info *iinfo = UDF_I(inode);
+	struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
+	unsigned int link_count;
 
 	/*
 	 * Set defaults, but the inode is still incomplete!
@@ -1307,6 +1309,7 @@ static void __udf_read_inode(struct inode *inode)
 	}
 
 	fe = (struct fileEntry *)bh->b_data;
+	efe = (struct extendedFileEntry *)bh->b_data;
 
 	if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
 		struct buffer_head *ibh;
@@ -1346,22 +1349,6 @@ static void __udf_read_inode(struct inode *inode)
 		make_bad_inode(inode);
 		return;
 	}
-	udf_fill_inode(inode, bh);
-
-	brelse(bh);
-}
-
-static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
-{
-	struct fileEntry *fe;
-	struct extendedFileEntry *efe;
-	struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
-	struct udf_inode_info *iinfo = UDF_I(inode);
-	unsigned int link_count;
-
-	fe = (struct fileEntry *)bh->b_data;
-	efe = (struct extendedFileEntry *)bh->b_data;
-
 	if (fe->icbTag.strategyType == cpu_to_le16(4))
 		iinfo->i_strat4096 = 0;
 	else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */
@@ -1551,6 +1538,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh)
 		} else
 			make_bad_inode(inode);
 	}
+	brelse(bh);
 }
 
 static int udf_alloc_i_data(struct inode *inode, size_t size)
-- 
cgit v1.2.3


From c03aa9f6e1f938618e6db2e23afef0574efeeb65 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 4 Sep 2014 14:06:55 +0200
Subject: udf: Avoid infinite loop when processing indirect ICBs

We did not implement any bound on number of indirect ICBs we follow when
loading inode. Thus corrupted medium could cause kernel to go into an
infinite loop, possibly causing a stack overflow.

Fix the possible stack overflow by removing recursion from
__udf_read_inode() and limit number of indirect ICBs we follow to avoid
infinite loops.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/inode.c | 35 +++++++++++++++++++++--------------
 1 file changed, 21 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 68cc7b144c26..a6a40536ebf1 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1270,6 +1270,13 @@ update_time:
 	return 0;
 }
 
+/*
+ * Maximum length of linked list formed by ICB hierarchy. The chosen number is
+ * arbitrary - just that we hopefully don't limit any real use of rewritten
+ * inode on write-once media but avoid looping for too long on corrupted media.
+ */
+#define UDF_MAX_ICB_NESTING 1024
+
 static void __udf_read_inode(struct inode *inode)
 {
 	struct buffer_head *bh = NULL;
@@ -1279,7 +1286,9 @@ static void __udf_read_inode(struct inode *inode)
 	struct udf_inode_info *iinfo = UDF_I(inode);
 	struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
 	unsigned int link_count;
+	unsigned int indirections = 0;
 
+reread:
 	/*
 	 * Set defaults, but the inode is still incomplete!
 	 * Note: get_new_inode() sets the following on a new inode:
@@ -1317,28 +1326,26 @@ static void __udf_read_inode(struct inode *inode)
 		ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
 					&ident);
 		if (ident == TAG_IDENT_IE && ibh) {
-			struct buffer_head *nbh = NULL;
 			struct kernel_lb_addr loc;
 			struct indirectEntry *ie;
 
 			ie = (struct indirectEntry *)ibh->b_data;
 			loc = lelb_to_cpu(ie->indirectICB.extLocation);
 
-			if (ie->indirectICB.extLength &&
-				(nbh = udf_read_ptagged(inode->i_sb, &loc, 0,
-							&ident))) {
-				if (ident == TAG_IDENT_FE ||
-					ident == TAG_IDENT_EFE) {
-					memcpy(&iinfo->i_location,
-						&loc,
-						sizeof(struct kernel_lb_addr));
-					brelse(bh);
-					brelse(ibh);
-					brelse(nbh);
-					__udf_read_inode(inode);
+			if (ie->indirectICB.extLength) {
+				brelse(bh);
+				brelse(ibh);
+				memcpy(&iinfo->i_location, &loc,
+				       sizeof(struct kernel_lb_addr));
+				if (++indirections > UDF_MAX_ICB_NESTING) {
+					udf_err(inode->i_sb,
+						"too many ICBs in ICB hierarchy"
+						" (max %d supported)\n",
+						UDF_MAX_ICB_NESTING);
+					make_bad_inode(inode);
 					return;
 				}
-				brelse(nbh);
+				goto reread;
 			}
 		}
 		brelse(ibh);
-- 
cgit v1.2.3


From 6d3d5e860a114ae606b1af2ba7f64cb19fbeb414 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 4 Sep 2014 16:15:51 +0200
Subject: udf: Make udf_read_inode() and udf_iget() return error

Currently __udf_read_inode() wasn't returning anything and we found out
whether we succeeded reading inode by checking whether inode is bad or
not. udf_iget() returned NULL on failure and inode pointer otherwise.
Make these two functions properly propagate errors up the call stack and
use the return value in callers.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/inode.c   | 99 ++++++++++++++++++++++++++------------------------------
 fs/udf/namei.c   | 22 ++++++-------
 fs/udf/super.c   | 69 +++++++++++++++++++++++----------------
 fs/udf/udfdecl.h |  1 -
 4 files changed, 96 insertions(+), 95 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index a6a40536ebf1..788fc58ea78e 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1277,7 +1277,7 @@ update_time:
  */
 #define UDF_MAX_ICB_NESTING 1024
 
-static void __udf_read_inode(struct inode *inode)
+static int udf_read_inode(struct inode *inode)
 {
 	struct buffer_head *bh = NULL;
 	struct fileEntry *fe;
@@ -1285,10 +1285,19 @@ static void __udf_read_inode(struct inode *inode)
 	uint16_t ident;
 	struct udf_inode_info *iinfo = UDF_I(inode);
 	struct udf_sb_info *sbi = UDF_SB(inode->i_sb);
+	struct kernel_lb_addr *iloc = &iinfo->i_location;
 	unsigned int link_count;
 	unsigned int indirections = 0;
+	int ret = -EIO;
 
 reread:
+	if (iloc->logicalBlockNum >=
+	    sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) {
+		udf_debug("block=%d, partition=%d out of range\n",
+			  iloc->logicalBlockNum, iloc->partitionReferenceNum);
+		return -EIO;
+	}
+
 	/*
 	 * Set defaults, but the inode is still incomplete!
 	 * Note: get_new_inode() sets the following on a new inode:
@@ -1301,20 +1310,17 @@ reread:
 	 *      i_nlink = 1
 	 *      i_op = NULL;
 	 */
-	bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident);
+	bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident);
 	if (!bh) {
 		udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino);
-		make_bad_inode(inode);
-		return;
+		return -EIO;
 	}
 
 	if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE &&
 	    ident != TAG_IDENT_USE) {
 		udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n",
 			inode->i_ino, ident);
-		brelse(bh);
-		make_bad_inode(inode);
-		return;
+		goto out;
 	}
 
 	fe = (struct fileEntry *)bh->b_data;
@@ -1323,8 +1329,7 @@ reread:
 	if (fe->icbTag.strategyType == cpu_to_le16(4096)) {
 		struct buffer_head *ibh;
 
-		ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1,
-					&ident);
+		ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident);
 		if (ident == TAG_IDENT_IE && ibh) {
 			struct kernel_lb_addr loc;
 			struct indirectEntry *ie;
@@ -1333,7 +1338,6 @@ reread:
 			loc = lelb_to_cpu(ie->indirectICB.extLocation);
 
 			if (ie->indirectICB.extLength) {
-				brelse(bh);
 				brelse(ibh);
 				memcpy(&iinfo->i_location, &loc,
 				       sizeof(struct kernel_lb_addr));
@@ -1342,9 +1346,9 @@ reread:
 						"too many ICBs in ICB hierarchy"
 						" (max %d supported)\n",
 						UDF_MAX_ICB_NESTING);
-					make_bad_inode(inode);
-					return;
+					goto out;
 				}
+				brelse(bh);
 				goto reread;
 			}
 		}
@@ -1352,9 +1356,7 @@ reread:
 	} else if (fe->icbTag.strategyType != cpu_to_le16(4)) {
 		udf_err(inode->i_sb, "unsupported strategy type: %d\n",
 			le16_to_cpu(fe->icbTag.strategyType));
-		brelse(bh);
-		make_bad_inode(inode);
-		return;
+		goto out;
 	}
 	if (fe->icbTag.strategyType == cpu_to_le16(4))
 		iinfo->i_strat4096 = 0;
@@ -1372,11 +1374,10 @@ reread:
 	if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) {
 		iinfo->i_efe = 1;
 		iinfo->i_use = 0;
-		if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
-					sizeof(struct extendedFileEntry))) {
-			make_bad_inode(inode);
-			return;
-		}
+		ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+					sizeof(struct extendedFileEntry));
+		if (ret)
+			goto out;
 		memcpy(iinfo->i_ext.i_data,
 		       bh->b_data + sizeof(struct extendedFileEntry),
 		       inode->i_sb->s_blocksize -
@@ -1384,11 +1385,10 @@ reread:
 	} else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) {
 		iinfo->i_efe = 0;
 		iinfo->i_use = 0;
-		if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
-						sizeof(struct fileEntry))) {
-			make_bad_inode(inode);
-			return;
-		}
+		ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+						sizeof(struct fileEntry));
+		if (ret)
+			goto out;
 		memcpy(iinfo->i_ext.i_data,
 		       bh->b_data + sizeof(struct fileEntry),
 		       inode->i_sb->s_blocksize - sizeof(struct fileEntry));
@@ -1398,18 +1398,18 @@ reread:
 		iinfo->i_lenAlloc = le32_to_cpu(
 				((struct unallocSpaceEntry *)bh->b_data)->
 				 lengthAllocDescs);
-		if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
-					sizeof(struct unallocSpaceEntry))) {
-			make_bad_inode(inode);
-			return;
-		}
+		ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize -
+					sizeof(struct unallocSpaceEntry));
+		if (ret)
+			goto out;
 		memcpy(iinfo->i_ext.i_data,
 		       bh->b_data + sizeof(struct unallocSpaceEntry),
 		       inode->i_sb->s_blocksize -
 					sizeof(struct unallocSpaceEntry));
-		return;
+		return 0;
 	}
 
+	ret = -EIO;
 	read_lock(&sbi->s_cred_lock);
 	i_uid_write(inode, le32_to_cpu(fe->uid));
 	if (!uid_valid(inode->i_uid) ||
@@ -1531,8 +1531,7 @@ reread:
 	default:
 		udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n",
 			inode->i_ino, fe->icbTag.fileType);
-		make_bad_inode(inode);
-		return;
+		goto out;
 	}
 	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
 		struct deviceSpec *dsea =
@@ -1543,9 +1542,12 @@ reread:
 				      le32_to_cpu(dsea->minorDeviceIdent)));
 			/* Developer ID ??? */
 		} else
-			make_bad_inode(inode);
+			goto out;
 	}
+	ret = 0;
+out:
 	brelse(bh);
+	return ret;
 }
 
 static int udf_alloc_i_data(struct inode *inode, size_t size)
@@ -1825,32 +1827,23 @@ struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino)
 {
 	unsigned long block = udf_get_lb_pblock(sb, ino, 0);
 	struct inode *inode = iget_locked(sb, block);
+	int err;
 
 	if (!inode)
-		return NULL;
-
-	if (inode->i_state & I_NEW) {
-		memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
-		__udf_read_inode(inode);
-		unlock_new_inode(inode);
-	}
+		return ERR_PTR(-ENOMEM);
 
-	if (is_bad_inode(inode))
-		goto out_iput;
+	if (!(inode->i_state & I_NEW))
+		return inode;
 
-	if (ino->logicalBlockNum >= UDF_SB(sb)->
-			s_partmaps[ino->partitionReferenceNum].s_partition_len) {
-		udf_debug("block=%d, partition=%d out of range\n",
-			  ino->logicalBlockNum, ino->partitionReferenceNum);
-		make_bad_inode(inode);
-		goto out_iput;
+	memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr));
+	err = udf_read_inode(inode);
+	if (err < 0) {
+		iget_failed(inode);
+		return ERR_PTR(err);
 	}
+	unlock_new_inode(inode);
 
 	return inode;
-
- out_iput:
-	iput(inode);
-	return NULL;
 }
 
 int udf_add_aext(struct inode *inode, struct extent_position *epos,
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 83a06001742b..e041fd9c6816 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -270,9 +270,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 						NULL, 0),
 		};
 		inode = udf_iget(dir->i_sb, lb);
-		if (!inode) {
-			return ERR_PTR(-EACCES);
-		}
+		if (IS_ERR(inode))
+			return inode;
 	} else
 #endif /* UDF_RECOVERY */
 
@@ -285,9 +284,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry,
 
 		loc = lelb_to_cpu(cfi.icb.extLocation);
 		inode = udf_iget(dir->i_sb, &loc);
-		if (!inode) {
-			return ERR_PTR(-EACCES);
-		}
+		if (IS_ERR(inode))
+			return ERR_CAST(inode);
 	}
 
 	return d_splice_alias(inode, dentry);
@@ -1222,7 +1220,7 @@ static struct dentry *udf_get_parent(struct dentry *child)
 	struct udf_fileident_bh fibh;
 
 	if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi))
-		goto out_unlock;
+		return ERR_PTR(-EACCES);
 
 	if (fibh.sbh != fibh.ebh)
 		brelse(fibh.ebh);
@@ -1230,12 +1228,10 @@ static struct dentry *udf_get_parent(struct dentry *child)
 
 	tloc = lelb_to_cpu(cfi.icb.extLocation);
 	inode = udf_iget(child->d_inode->i_sb, &tloc);
-	if (!inode)
-		goto out_unlock;
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
 
 	return d_obtain_alias(inode);
-out_unlock:
-	return ERR_PTR(-EACCES);
 }
 
 
@@ -1252,8 +1248,8 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block,
 	loc.partitionReferenceNum = partref;
 	inode = udf_iget(sb, &loc);
 
-	if (inode == NULL)
-		return ERR_PTR(-ENOMEM);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
 
 	if (generation && inode->i_generation != generation) {
 		iput(inode);
diff --git a/fs/udf/super.c b/fs/udf/super.c
index 813da94d447b..5401fc33f5cc 100644
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -961,12 +961,14 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb,
 
 	metadata_fe = udf_iget(sb, &addr);
 
-	if (metadata_fe == NULL)
+	if (IS_ERR(metadata_fe)) {
 		udf_warn(sb, "metadata inode efe not found\n");
-	else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) {
+		return metadata_fe;
+	}
+	if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) {
 		udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n");
 		iput(metadata_fe);
-		metadata_fe = NULL;
+		return ERR_PTR(-EIO);
 	}
 
 	return metadata_fe;
@@ -978,6 +980,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
 	struct udf_part_map *map;
 	struct udf_meta_data *mdata;
 	struct kernel_lb_addr addr;
+	struct inode *fe;
 
 	map = &sbi->s_partmaps[partition];
 	mdata = &map->s_type_specific.s_metadata;
@@ -986,22 +989,24 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
 	udf_debug("Metadata file location: block = %d part = %d\n",
 		  mdata->s_meta_file_loc, map->s_partition_num);
 
-	mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb,
-		mdata->s_meta_file_loc, map->s_partition_num);
-
-	if (mdata->s_metadata_fe == NULL) {
+	fe = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc,
+					 map->s_partition_num);
+	if (IS_ERR(fe)) {
 		/* mirror file entry */
 		udf_debug("Mirror metadata file location: block = %d part = %d\n",
 			  mdata->s_mirror_file_loc, map->s_partition_num);
 
-		mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb,
-			mdata->s_mirror_file_loc, map->s_partition_num);
+		fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc,
+						 map->s_partition_num);
 
-		if (mdata->s_mirror_fe == NULL) {
+		if (IS_ERR(fe)) {
 			udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n");
-			return -EIO;
+			return PTR_ERR(fe);
 		}
-	}
+		mdata->s_mirror_fe = fe;
+	} else
+		mdata->s_metadata_fe = fe;
+
 
 	/*
 	 * bitmap file entry
@@ -1015,15 +1020,16 @@ static int udf_load_metadata_files(struct super_block *sb, int partition)
 		udf_debug("Bitmap file location: block = %d part = %d\n",
 			  addr.logicalBlockNum, addr.partitionReferenceNum);
 
-		mdata->s_bitmap_fe = udf_iget(sb, &addr);
-		if (mdata->s_bitmap_fe == NULL) {
+		fe = udf_iget(sb, &addr);
+		if (IS_ERR(fe)) {
 			if (sb->s_flags & MS_RDONLY)
 				udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n");
 			else {
 				udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n");
-				return -EIO;
+				return PTR_ERR(fe);
 			}
-		}
+		} else
+			mdata->s_bitmap_fe = fe;
 	}
 
 	udf_debug("udf_load_metadata_files Ok\n");
@@ -1111,13 +1117,15 @@ static int udf_fill_partdesc_info(struct super_block *sb,
 				phd->unallocSpaceTable.extPosition),
 			.partitionReferenceNum = p_index,
 		};
+		struct inode *inode;
 
-		map->s_uspace.s_table = udf_iget(sb, &loc);
-		if (!map->s_uspace.s_table) {
+		inode = udf_iget(sb, &loc);
+		if (IS_ERR(inode)) {
 			udf_debug("cannot load unallocSpaceTable (part %d)\n",
 				  p_index);
-			return -EIO;
+			return PTR_ERR(inode);
 		}
+		map->s_uspace.s_table = inode;
 		map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE;
 		udf_debug("unallocSpaceTable (part %d) @ %ld\n",
 			  p_index, map->s_uspace.s_table->i_ino);
@@ -1144,14 +1152,15 @@ static int udf_fill_partdesc_info(struct super_block *sb,
 				phd->freedSpaceTable.extPosition),
 			.partitionReferenceNum = p_index,
 		};
+		struct inode *inode;
 
-		map->s_fspace.s_table = udf_iget(sb, &loc);
-		if (!map->s_fspace.s_table) {
+		inode = udf_iget(sb, &loc);
+		if (IS_ERR(inode)) {
 			udf_debug("cannot load freedSpaceTable (part %d)\n",
 				  p_index);
-			return -EIO;
+			return PTR_ERR(inode);
 		}
-
+		map->s_fspace.s_table = inode;
 		map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE;
 		udf_debug("freedSpaceTable (part %d) @ %ld\n",
 			  p_index, map->s_fspace.s_table->i_ino);
@@ -1178,6 +1187,7 @@ static void udf_find_vat_block(struct super_block *sb, int p_index,
 	struct udf_part_map *map = &sbi->s_partmaps[p_index];
 	sector_t vat_block;
 	struct kernel_lb_addr ino;
+	struct inode *inode;
 
 	/*
 	 * VAT file entry is in the last recorded block. Some broken disks have
@@ -1186,10 +1196,13 @@ static void udf_find_vat_block(struct super_block *sb, int p_index,
 	ino.partitionReferenceNum = type1_index;
 	for (vat_block = start_block;
 	     vat_block >= map->s_partition_root &&
-	     vat_block >= start_block - 3 &&
-	     !sbi->s_vat_inode; vat_block--) {
+	     vat_block >= start_block - 3; vat_block--) {
 		ino.logicalBlockNum = vat_block - map->s_partition_root;
-		sbi->s_vat_inode = udf_iget(sb, &ino);
+		inode = udf_iget(sb, &ino);
+		if (!IS_ERR(inode)) {
+			sbi->s_vat_inode = inode;
+			break;
+		}
 	}
 }
 
@@ -2205,10 +2218,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent)
 	/* assign inodes by physical block number */
 	/* perhaps it's not extensible enough, but for now ... */
 	inode = udf_iget(sb, &rootdir);
-	if (!inode) {
+	if (IS_ERR(inode)) {
 		udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n",
 		       rootdir.logicalBlockNum, rootdir.partitionReferenceNum);
-		ret = -EIO;
+		ret = PTR_ERR(inode);
 		goto error_out;
 	}
 
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index be7dabbbcb49..41a8115c9345 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -143,7 +143,6 @@ extern int udf_expand_file_adinicb(struct inode *);
 extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *);
 extern struct buffer_head *udf_bread(struct inode *, int, int, int *);
 extern int udf_setsize(struct inode *, loff_t);
-extern void udf_read_inode(struct inode *);
 extern void udf_evict_inode(struct inode *);
 extern int udf_write_inode(struct inode *, struct writeback_control *wbc);
 extern long udf_block_map(struct inode *, sector_t);
-- 
cgit v1.2.3


From 4071b913622316970d0e1919f7d82b4403fec5f2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 4 Sep 2014 16:19:47 +0200
Subject: udf: Properly detect stale inodes

NFS can easily ask for inodes that are already deleted. Currently UDF
happily returns such inodes which is a bug. Return -ESTALE if
udf_read_inode() is asked to read deleted inode.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/inode.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 788fc58ea78e..3a44d9187aad 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1435,8 +1435,10 @@ reread:
 	read_unlock(&sbi->s_cred_lock);
 
 	link_count = le16_to_cpu(fe->fileLinkCount);
-	if (!link_count)
-		link_count = 1;
+	if (!link_count) {
+		ret = -ESTALE;
+		goto out;
+	}
 	set_nlink(inode, link_count);
 
 	inode->i_size = le64_to_cpu(fe->informationLength);
-- 
cgit v1.2.3


From 470cca56c366428d4d5785a0a5a291619c332c7f Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Thu, 4 Sep 2014 16:26:19 +0200
Subject: udf: Set i_generation field

Currently UDF doesn't initialize i_generation in any way and thus NFS
can easily get reallocated inodes from stale file handles. Luckily UDF
already has a unique object identifier associated with each inode -
i_unique. Use that for initialization of i_generation.

Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/ialloc.c | 1 +
 fs/udf/inode.c  | 1 +
 2 files changed, 2 insertions(+)

(limited to 'fs')

diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 6eaf5edf1ea1..647370d70175 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -95,6 +95,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
 	lvidiu = udf_sb_lvidiu(sb);
 	if (lvidiu) {
 		iinfo->i_unique = lvid_get_unique_id(sb);
+		inode->i_generation = iinfo->i_unique;
 		mutex_lock(&sbi->s_alloc_mutex);
 		if (S_ISDIR(mode))
 			le32_add_cpu(&lvidiu->numDirs, 1);
diff --git a/fs/udf/inode.c b/fs/udf/inode.c
index 3a44d9187aad..08598843288f 100644
--- a/fs/udf/inode.c
+++ b/fs/udf/inode.c
@@ -1484,6 +1484,7 @@ reread:
 		iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs);
 		iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint);
 	}
+	inode->i_generation = iinfo->i_unique;
 
 	switch (fe->icbTag.fileType) {
 	case ICBTAG_FILE_TYPE_DIRECTORY:
-- 
cgit v1.2.3


From d2be51cb34dc501791f3b8c01a99a3f2064bd8d1 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 4 Sep 2014 09:34:14 -0400
Subject: udf: merge the pieces inserting a new non-directory object into
 directory

boilerplate code in udf_{create,mknod,symlink} taken to new helper

symlink case converted to unique id calculated by udf_new_inode() - no
point finding a new one.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/namei.c | 98 +++++++++++++++++-----------------------------------------
 1 file changed, 29 insertions(+), 69 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index e041fd9c6816..abec86466735 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -548,31 +548,16 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi,
 	return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL);
 }
 
-static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-		      bool excl)
+static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
 {
+	struct udf_inode_info *iinfo = UDF_I(inode);
+	struct inode *dir = dentry->d_parent->d_inode;
 	struct udf_fileident_bh fibh;
-	struct inode *inode;
 	struct fileIdentDesc cfi, *fi;
 	int err;
-	struct udf_inode_info *iinfo;
-
-	inode = udf_new_inode(dir, mode, &err);
-	if (!inode) {
-		return err;
-	}
-
-	iinfo = UDF_I(inode);
-	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
-		inode->i_data.a_ops = &udf_adinicb_aops;
-	else
-		inode->i_data.a_ops = &udf_aops;
-	inode->i_op = &udf_file_inode_operations;
-	inode->i_fop = &udf_file_operations;
-	mark_inode_dirty(inode);
 
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
-	if (!fi) {
+	if (unlikely(!fi)) {
 		inode_dec_link_count(inode);
 		iput(inode);
 		return err;
@@ -592,6 +577,28 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 	return 0;
 }
 
+static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+		      bool excl)
+{
+	struct inode *inode;
+	int err;
+
+	inode = udf_new_inode(dir, mode, &err);
+	if (!inode) {
+		return err;
+	}
+
+	if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+		inode->i_data.a_ops = &udf_adinicb_aops;
+	else
+		inode->i_data.a_ops = &udf_aops;
+	inode->i_op = &udf_file_inode_operations;
+	inode->i_fop = &udf_file_operations;
+	mark_inode_dirty(inode);
+
+	return udf_add_nondir(dentry, inode);
+}
+
 static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
 	struct inode *inode;
@@ -619,10 +626,7 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 		     dev_t rdev)
 {
 	struct inode *inode;
-	struct udf_fileident_bh fibh;
-	struct fileIdentDesc cfi, *fi;
 	int err;
-	struct udf_inode_info *iinfo;
 
 	if (!old_valid_dev(rdev))
 		return -EINVAL;
@@ -630,33 +634,10 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 	err = -EIO;
 	inode = udf_new_inode(dir, mode, &err);
 	if (!inode)
-		goto out;
-
-	iinfo = UDF_I(inode);
-	init_special_inode(inode, mode, rdev);
-	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
-	if (!fi) {
-		inode_dec_link_count(inode);
-		iput(inode);
 		return err;
-	}
-	cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize);
-	cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
-	*(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
-		cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL);
-	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
-	if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
-		mark_inode_dirty(dir);
-	mark_inode_dirty(inode);
-
-	if (fibh.sbh != fibh.ebh)
-		brelse(fibh.ebh);
-	brelse(fibh.sbh);
-	d_instantiate(dentry, inode);
-	err = 0;
 
-out:
-	return err;
+	init_special_inode(inode, mode, rdev);
+	return udf_add_nondir(dentry, inode);
 }
 
 static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
@@ -877,11 +858,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	struct inode *inode;
 	struct pathComponent *pc;
 	const char *compstart;
-	struct udf_fileident_bh fibh;
 	struct extent_position epos = {};
 	int eoffset, elen = 0;
-	struct fileIdentDesc *fi;
-	struct fileIdentDesc cfi;
 	uint8_t *ea;
 	int err;
 	int block;
@@ -1010,31 +988,13 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	mark_inode_dirty(inode);
 	up_write(&iinfo->i_data_sem);
 
-	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
-	if (!fi)
-		goto out_fail;
-	cfi.icb.extLength = cpu_to_le32(sb->s_blocksize);
-	cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location);
-	if (UDF_SB(inode->i_sb)->s_lvid_bh) {
-		*(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse =
-			cpu_to_le32(lvid_get_unique_id(sb));
-	}
-	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
-	if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
-		mark_inode_dirty(dir);
-	if (fibh.sbh != fibh.ebh)
-		brelse(fibh.ebh);
-	brelse(fibh.sbh);
-	d_instantiate(dentry, inode);
-	err = 0;
-
+	err = udf_add_nondir(dentry, inode);
 out:
 	kfree(name);
 	return err;
 
 out_no_entry:
 	up_write(&iinfo->i_data_sem);
-out_fail:
 	inode_dec_link_count(inode);
 	iput(inode);
 	goto out;
-- 
cgit v1.2.3


From b231509616feb911c2a7a8814d58c0014ef5b17f Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 4 Sep 2014 09:38:11 -0400
Subject: udf: fix the udf_iget() vs. udf_new_inode() races

Currently udf_iget() (triggered by NFS) can race with udf_new_inode()
leading to two inode structures with the same inode number:

nfsd: iget_locked() creates inode
nfsd: try to read from disk, block on that.
udf_new_inode(): allocate inode with that inumber
udf_new_inode(): insert it into icache, set it up and dirty
udf_write_inode(): write inode into buffer cache
nfsd: get CPU again, look into buffer cache, see nice and sane on-disk
  inode, set the in-core inode from it

Fix the problem by putting inode into icache in locked state (I_NEW set)
and unlocking it only after it's fully set up.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/ialloc.c | 7 ++++++-
 fs/udf/namei.c  | 7 +++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 647370d70175..598f33bdcd26 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -124,7 +124,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
 		iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG;
 	inode->i_mtime = inode->i_atime = inode->i_ctime =
 		iinfo->i_crtime = current_fs_time(inode->i_sb);
-	insert_inode_hash(inode);
+	if (unlikely(insert_inode_locked(inode) < 0)) {
+		make_bad_inode(inode);
+		iput(inode);
+		*err = -EIO;
+		return NULL;
+	}
 	mark_inode_dirty(inode);
 
 	*err = 0;
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index abec86466735..d106fdd1bef7 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -559,6 +559,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
 	fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err);
 	if (unlikely(!fi)) {
 		inode_dec_link_count(inode);
+		unlock_new_inode(inode);
 		iput(inode);
 		return err;
 	}
@@ -572,6 +573,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
 	if (fibh.sbh != fibh.ebh)
 		brelse(fibh.ebh);
 	brelse(fibh.sbh);
+	unlock_new_inode(inode);
 	d_instantiate(dentry, inode);
 
 	return 0;
@@ -619,6 +621,7 @@ static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	mark_inode_dirty(inode);
 
 	d_tmpfile(dentry, inode);
+	unlock_new_inode(inode);
 	return 0;
 }
 
@@ -660,6 +663,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err);
 	if (!fi) {
 		inode_dec_link_count(inode);
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out;
 	}
@@ -678,6 +682,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	if (!fi) {
 		clear_nlink(inode);
 		mark_inode_dirty(inode);
+		unlock_new_inode(inode);
 		iput(inode);
 		goto out;
 	}
@@ -689,6 +694,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL);
 	inc_nlink(dir);
 	mark_inode_dirty(dir);
+	unlock_new_inode(inode);
 	d_instantiate(dentry, inode);
 	if (fibh.sbh != fibh.ebh)
 		brelse(fibh.ebh);
@@ -996,6 +1002,7 @@ out:
 out_no_entry:
 	up_write(&iinfo->i_data_sem);
 	inode_dec_link_count(inode);
+	unlock_new_inode(inode);
 	iput(inode);
 	goto out;
 }
-- 
cgit v1.2.3


From 0b93a92be4cb48f22b78c95dea84089a1e72f860 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 4 Sep 2014 09:47:41 -0400
Subject: udf: saner calling conventions for udf_new_inode()

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Jan Kara <jack@suse.cz>
---
 fs/udf/ialloc.c  | 24 ++++++++++--------------
 fs/udf/namei.c   | 44 ++++++++++++++++----------------------------
 fs/udf/udfdecl.h |  2 +-
 3 files changed, 27 insertions(+), 43 deletions(-)

(limited to 'fs')

diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c
index 598f33bdcd26..e77db621ec89 100644
--- a/fs/udf/ialloc.c
+++ b/fs/udf/ialloc.c
@@ -45,7 +45,7 @@ void udf_free_inode(struct inode *inode)
 	udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1);
 }
 
-struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
+struct inode *udf_new_inode(struct inode *dir, umode_t mode)
 {
 	struct super_block *sb = dir->i_sb;
 	struct udf_sb_info *sbi = UDF_SB(sb);
@@ -55,14 +55,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
 	struct udf_inode_info *iinfo;
 	struct udf_inode_info *dinfo = UDF_I(dir);
 	struct logicalVolIntegrityDescImpUse *lvidiu;
+	int err;
 
 	inode = new_inode(sb);
 
-	if (!inode) {
-		*err = -ENOMEM;
-		return NULL;
-	}
-	*err = -ENOSPC;
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
 
 	iinfo = UDF_I(inode);
 	if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) {
@@ -80,16 +78,16 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
 	}
 	if (!iinfo->i_ext.i_data) {
 		iput(inode);
-		*err = -ENOMEM;
-		return NULL;
+		return ERR_PTR(-ENOMEM);
 	}
 
+	err = -ENOSPC;
 	block = udf_new_block(dir->i_sb, NULL,
 			      dinfo->i_location.partitionReferenceNum,
-			      start, err);
-	if (*err) {
+			      start, &err);
+	if (err) {
 		iput(inode);
-		return NULL;
+		return ERR_PTR(err);
 	}
 
 	lvidiu = udf_sb_lvidiu(sb);
@@ -127,11 +125,9 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err)
 	if (unlikely(insert_inode_locked(inode) < 0)) {
 		make_bad_inode(inode);
 		iput(inode);
-		*err = -EIO;
-		return NULL;
+		return ERR_PTR(-EIO);
 	}
 	mark_inode_dirty(inode);
 
-	*err = 0;
 	return inode;
 }
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index d106fdd1bef7..c12e260fd6c4 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -582,13 +582,10 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode)
 static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 		      bool excl)
 {
-	struct inode *inode;
-	int err;
+	struct inode *inode = udf_new_inode(dir, mode);
 
-	inode = udf_new_inode(dir, mode, &err);
-	if (!inode) {
-		return err;
-	}
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
 	if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
 		inode->i_data.a_ops = &udf_adinicb_aops;
@@ -603,23 +600,18 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode,
 
 static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
-	struct inode *inode;
-	struct udf_inode_info *iinfo;
-	int err;
+	struct inode *inode = udf_new_inode(dir, mode);
 
-	inode = udf_new_inode(dir, mode, &err);
-	if (!inode)
-		return err;
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
-	iinfo = UDF_I(inode);
-	if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
+	if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB)
 		inode->i_data.a_ops = &udf_adinicb_aops;
 	else
 		inode->i_data.a_ops = &udf_aops;
 	inode->i_op = &udf_file_inode_operations;
 	inode->i_fop = &udf_file_operations;
 	mark_inode_dirty(inode);
-
 	d_tmpfile(dentry, inode);
 	unlock_new_inode(inode);
 	return 0;
@@ -629,15 +621,13 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
 		     dev_t rdev)
 {
 	struct inode *inode;
-	int err;
 
 	if (!old_valid_dev(rdev))
 		return -EINVAL;
 
-	err = -EIO;
-	inode = udf_new_inode(dir, mode, &err);
-	if (!inode)
-		return err;
+	inode = udf_new_inode(dir, mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
 	init_special_inode(inode, mode, rdev);
 	return udf_add_nondir(dentry, inode);
@@ -652,10 +642,9 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	struct udf_inode_info *dinfo = UDF_I(dir);
 	struct udf_inode_info *iinfo;
 
-	err = -EIO;
-	inode = udf_new_inode(dir, S_IFDIR | mode, &err);
-	if (!inode)
-		goto out;
+	inode = udf_new_inode(dir, S_IFDIR | mode);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
 	iinfo = UDF_I(inode);
 	inode->i_op = &udf_dir_inode_operations;
@@ -861,7 +850,7 @@ out:
 static int udf_symlink(struct inode *dir, struct dentry *dentry,
 		       const char *symname)
 {
-	struct inode *inode;
+	struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO);
 	struct pathComponent *pc;
 	const char *compstart;
 	struct extent_position epos = {};
@@ -874,9 +863,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry,
 	struct udf_inode_info *iinfo;
 	struct super_block *sb = dir->i_sb;
 
-	inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err);
-	if (!inode)
-		goto out;
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
 
 	iinfo = UDF_I(inode);
 	down_write(&iinfo->i_data_sem);
diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h
index 41a8115c9345..742557be9936 100644
--- a/fs/udf/udfdecl.h
+++ b/fs/udf/udfdecl.h
@@ -208,7 +208,7 @@ extern int udf_CS0toUTF8(struct ustr *, const struct ustr *);
 
 /* ialloc.c */
 extern void udf_free_inode(struct inode *);
-extern struct inode *udf_new_inode(struct inode *, umode_t, int *);
+extern struct inode *udf_new_inode(struct inode *, umode_t);
 
 /* truncate.c */
 extern void udf_truncate_tail_extent(struct inode *);
-- 
cgit v1.2.3


From aee3776441461c14ba6d8ed9e2149933e65abb6e Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Wed, 20 Aug 2014 14:49:50 -0400
Subject: nfsd4: fix rd_dircount enforcement

Commit 3b299709091b "nfsd4: enforce rd_dircount" totally misunderstood
rd_dircount; it refers to total non-attribute bytes returned, not number
of directory entries returned.

Bring the code into agreement with RFC 3530 section 14.2.24.

Cc: stable@vger.kernel.org
Fixes: 3b299709091b "nfsd4: enforce rd_dircount"
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/nfsd/nfs4xdr.c | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index f9821ce6658a..e94457c33ad6 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2657,6 +2657,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
 	struct xdr_stream *xdr = cd->xdr;
 	int start_offset = xdr->buf->len;
 	int cookie_offset;
+	u32 name_and_cookie;
 	int entry_bytes;
 	__be32 nfserr = nfserr_toosmall;
 	__be64 wire_offset;
@@ -2718,7 +2719,14 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen,
 	cd->rd_maxcount -= entry_bytes;
 	if (!cd->rd_dircount)
 		goto fail;
-	cd->rd_dircount--;
+	/*
+	 * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so
+	 * let's always let through the first entry, at least:
+	 */
+	name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8;
+	if (name_and_cookie > cd->rd_dircount && cd->cookie_offset)
+		goto fail;
+	cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie);
 	cd->cookie_offset = cookie_offset;
 skip_entry:
 	cd->common.err = nfs_ok;
@@ -3321,6 +3329,10 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4
 	}
 	maxcount = min_t(int, maxcount-16, bytes_left);
 
+	/* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */
+	if (!readdir->rd_dircount)
+		readdir->rd_dircount = INT_MAX;
+
 	readdir->xdr = xdr;
 	readdir->rd_maxcount = maxcount;
 	readdir->common.err = 0;
-- 
cgit v1.2.3


From 7c17705e77b12b20fb8afb7c1b15dcdb126c0c12 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@redhat.com>
Date: Fri, 29 Aug 2014 16:25:50 -0400
Subject: lockd: fix rpcbind crash on lockd startup failure

Nikita Yuschenko reported that booting a kernel with init=/bin/sh and
then nfs mounting without portmap or rpcbind running using a busybox
mount resulted in:

  # mount -t nfs 10.30.130.21:/opt /mnt
  svc: failed to register lockdv1 RPC service (errno 111).
  lockd_up: makesock failed, error=-111
  Unable to handle kernel paging request for data at address 0x00000030
  Faulting instruction address: 0xc055e65c
  Oops: Kernel access of bad area, sig: 11 [#1]
  MPC85xx CDS
  Modules linked in:
  CPU: 0 PID: 1338 Comm: mount Not tainted 3.10.44.cge #117
  task: cf29cea0 ti: cf35c000 task.ti: cf35c000
  NIP: c055e65c LR: c0566490 CTR: c055e648
  REGS: cf35dad0 TRAP: 0300   Not tainted  (3.10.44.cge)
  MSR: 00029000 <CE,EE,ME>  CR: 22442488  XER: 20000000
  DEAR: 00000030, ESR: 00000000

  GPR00: c05606f4 cf35db80 cf29cea0 cf0ded80 cf0dedb8 00000001 1dec3086
  00000000
  GPR08: 00000000 c07b1640 00000007 1dec3086 22442482 100b9758 00000000
  10090ae8
  GPR16: 00000000 000186a5 00000000 00000000 100c3018 bfa46edc 100b0000
  bfa46ef0
  GPR24: cf386ae0 c07834f0 00000000 c0565f88 00000001 cf0dedb8 00000000
  cf0ded80
  NIP [c055e65c] call_start+0x14/0x34
  LR [c0566490] __rpc_execute+0x70/0x250
  Call Trace:
  [cf35db80] [00000080] 0x80 (unreliable)
  [cf35dbb0] [c05606f4] rpc_run_task+0x9c/0xc4
  [cf35dbc0] [c0560840] rpc_call_sync+0x50/0xb8
  [cf35dbf0] [c056ee90] rpcb_register_call+0x54/0x84
  [cf35dc10] [c056f24c] rpcb_register+0xf8/0x10c
  [cf35dc70] [c0569e18] svc_unregister.isra.23+0x100/0x108
  [cf35dc90] [c0569e38] svc_rpcb_cleanup+0x18/0x30
  [cf35dca0] [c0198c5c] lockd_up+0x1dc/0x2e0
  [cf35dcd0] [c0195348] nlmclnt_init+0x2c/0xc8
  [cf35dcf0] [c015bb5c] nfs_start_lockd+0x98/0xec
  [cf35dd20] [c015ce6c] nfs_create_server+0x1e8/0x3f4
  [cf35dd90] [c0171590] nfs3_create_server+0x10/0x44
  [cf35dda0] [c016528c] nfs_try_mount+0x158/0x1e4
  [cf35de20] [c01670d0] nfs_fs_mount+0x434/0x8c8
  [cf35de70] [c00cd3bc] mount_fs+0x20/0xbc
  [cf35de90] [c00e4f88] vfs_kern_mount+0x50/0x104
  [cf35dec0] [c00e6e0c] do_mount+0x1d0/0x8e0
  [cf35df10] [c00e75ac] SyS_mount+0x90/0xd0
  [cf35df40] [c000ccf4] ret_from_syscall+0x0/0x3c

The addition of svc_shutdown_net() resulted in two calls to
svc_rpcb_cleanup(); the second is no longer necessary and crashes when
it calls rpcb_register_call with clnt=NULL.

Reported-by: Nikita Yushchenko <nyushchenko@dev.rtsoft.ru>
Fixes: 679b033df484 "lockd: ensure we tear down any live sockets when socket creation fails during lockd_up"
Cc: stable@vger.kernel.org
Acked-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: J. Bruce Fields <bfields@redhat.com>
---
 fs/lockd/svc.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 8f27c93f8d2e..ec9e082f9ecd 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -253,13 +253,11 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net)
 
 	error = make_socks(serv, net);
 	if (error < 0)
-		goto err_socks;
+		goto err_bind;
 	set_grace_period(net);
 	dprintk("lockd_up_net: per-net data created; net=%p\n", net);
 	return 0;
 
-err_socks:
-	svc_rpcb_cleanup(serv, net);
 err_bind:
 	ln->nlmsvc_users--;
 	return error;
-- 
cgit v1.2.3


From c47ca32d3aadb234f73389a34c97574085bc9eda Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Thu, 4 Sep 2014 14:09:15 +0300
Subject: Btrfs: kfree()ing ERR_PTRs

The "inherit" in btrfs_ioctl_snap_create_v2() and "vol_args" in
btrfs_ioctl_rm_dev() are ERR_PTRs so we can't call kfree() on them.

These kind of bugs are "One Err Bugs" where there is just one error
label that does everything.  I could set the "inherit = NULL" and keep
the single out label but it ends up being more complicated that way.  It
makes the code simpler to re-order the unwind so it's in the mirror
order of the allocation and introduce some new error labels.

Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/ioctl.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index a018ea484d39..8a8e29878c34 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -1703,7 +1703,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
 	    ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY |
 	      BTRFS_SUBVOL_QGROUP_INHERIT)) {
 		ret = -EOPNOTSUPP;
-		goto out;
+		goto free_args;
 	}
 
 	if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC)
@@ -1713,27 +1713,31 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
 	if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
 		if (vol_args->size > PAGE_CACHE_SIZE) {
 			ret = -EINVAL;
-			goto out;
+			goto free_args;
 		}
 		inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
 		if (IS_ERR(inherit)) {
 			ret = PTR_ERR(inherit);
-			goto out;
+			goto free_args;
 		}
 	}
 
 	ret = btrfs_ioctl_snap_create_transid(file, vol_args->name,
 					      vol_args->fd, subvol, ptr,
 					      readonly, inherit);
+	if (ret)
+		goto free_inherit;
 
-	if (ret == 0 && ptr &&
-	    copy_to_user(arg +
-			 offsetof(struct btrfs_ioctl_vol_args_v2,
-				  transid), ptr, sizeof(*ptr)))
+	if (ptr && copy_to_user(arg +
+				offsetof(struct btrfs_ioctl_vol_args_v2,
+					transid),
+				ptr, sizeof(*ptr)))
 		ret = -EFAULT;
-out:
-	kfree(vol_args);
+
+free_inherit:
 	kfree(inherit);
+free_args:
+	kfree(vol_args);
 	return ret;
 }
 
@@ -2653,7 +2657,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 	vol_args = memdup_user(arg, sizeof(*vol_args));
 	if (IS_ERR(vol_args)) {
 		ret = PTR_ERR(vol_args);
-		goto out;
+		goto err_drop;
 	}
 
 	vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
@@ -2671,6 +2675,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
 
 out:
 	kfree(vol_args);
+err_drop:
 	mnt_drop_write_file(file);
 	return ret;
 }
-- 
cgit v1.2.3


From 49dae1bc1c665817e434d01eefaa11967f618243 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Sat, 6 Sep 2014 22:34:39 +0100
Subject: Btrfs: fix fsync data loss after a ranged fsync

While we're doing a full fsync (when the inode has the flag
BTRFS_INODE_NEEDS_FULL_SYNC set) that is ranged too (covers only a
portion of the file), we might have ordered operations that are started
before or while we're logging the inode and that fall outside the fsync
range.

Therefore when a full ranged fsync finishes don't remove every extent
map from the list of modified extent maps - as for some of them, that
fall outside our fsync range, their respective ordered operation hasn't
finished yet, meaning the corresponding file extent item wasn't inserted
into the fs/subvol tree yet and therefore we didn't log it, and we must
let the next fast fsync (one that checks only the modified list) see this
extent map and log a matching file extent item to the log btree and wait
for its ordered operation to finish (if it's still ongoing).

A test case for xfstests follows.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/file.c     |  2 +-
 fs/btrfs/tree-log.c | 77 ++++++++++++++++++++++++++++++++++++++++++-----------
 fs/btrfs/tree-log.h |  2 ++
 3 files changed, 64 insertions(+), 17 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 36861b7a6757..ff1cc0399b9a 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1966,7 +1966,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 
 	btrfs_init_log_ctx(&ctx);
 
-	ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx);
+	ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx);
 	if (ret < 0) {
 		/* Fallthrough and commit/free transaction. */
 		ret = 1;
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 7e0e6e3029dd..d296efe2d3e7 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -94,8 +94,10 @@
 #define LOG_WALK_REPLAY_ALL 3
 
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, struct inode *inode,
-			     int inode_only);
+			   struct btrfs_root *root, struct inode *inode,
+			   int inode_only,
+			   const loff_t start,
+			   const loff_t end);
 static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root,
 			     struct btrfs_path *path, u64 objectid);
@@ -3858,8 +3860,10 @@ process:
  * This handles both files and directories.
  */
 static int btrfs_log_inode(struct btrfs_trans_handle *trans,
-			     struct btrfs_root *root, struct inode *inode,
-			     int inode_only)
+			   struct btrfs_root *root, struct inode *inode,
+			   int inode_only,
+			   const loff_t start,
+			   const loff_t end)
 {
 	struct btrfs_path *path;
 	struct btrfs_path *dst_path;
@@ -3876,6 +3880,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans,
 	int ins_nr;
 	bool fast_search = false;
 	u64 ino = btrfs_ino(inode);
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
 
 	path = btrfs_alloc_path();
 	if (!path)
@@ -4049,13 +4054,35 @@ log_extents:
 			goto out_unlock;
 		}
 	} else if (inode_only == LOG_INODE_ALL) {
-		struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree;
 		struct extent_map *em, *n;
 
-		write_lock(&tree->lock);
-		list_for_each_entry_safe(em, n, &tree->modified_extents, list)
-			list_del_init(&em->list);
-		write_unlock(&tree->lock);
+		write_lock(&em_tree->lock);
+		/*
+		 * We can't just remove every em if we're called for a ranged
+		 * fsync - that is, one that doesn't cover the whole possible
+		 * file range (0 to LLONG_MAX). This is because we can have
+		 * em's that fall outside the range we're logging and therefore
+		 * their ordered operations haven't completed yet
+		 * (btrfs_finish_ordered_io() not invoked yet). This means we
+		 * didn't get their respective file extent item in the fs/subvol
+		 * tree yet, and need to let the next fast fsync (one which
+		 * consults the list of modified extent maps) find the em so
+		 * that it logs a matching file extent item and waits for the
+		 * respective ordered operation to complete (if it's still
+		 * running).
+		 *
+		 * Removing every em outside the range we're logging would make
+		 * the next fast fsync not log their matching file extent items,
+		 * therefore making us lose data after a log replay.
+		 */
+		list_for_each_entry_safe(em, n, &em_tree->modified_extents,
+					 list) {
+			const u64 mod_end = em->mod_start + em->mod_len - 1;
+
+			if (em->mod_start >= start && mod_end <= end)
+				list_del_init(&em->list);
+		}
+		write_unlock(&em_tree->lock);
 	}
 
 	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
@@ -4065,8 +4092,19 @@ log_extents:
 			goto out_unlock;
 		}
 	}
-	BTRFS_I(inode)->logged_trans = trans->transid;
-	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
+
+	write_lock(&em_tree->lock);
+	/*
+	 * If we're doing a ranged fsync and there are still modified extents
+	 * in the list, we must run on the next fsync call as it might cover
+	 * those extents (a full fsync or an fsync for other range).
+	 */
+	if (list_empty(&em_tree->modified_extents)) {
+		BTRFS_I(inode)->logged_trans = trans->transid;
+		BTRFS_I(inode)->last_log_commit =
+			BTRFS_I(inode)->last_sub_trans;
+	}
+	write_unlock(&em_tree->lock);
 out_unlock:
 	if (unlikely(err))
 		btrfs_put_logged_extents(&logged_list);
@@ -4161,7 +4199,10 @@ out:
  */
 static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 			    	  struct btrfs_root *root, struct inode *inode,
-			    	  struct dentry *parent, int exists_only,
+				  struct dentry *parent,
+				  const loff_t start,
+				  const loff_t end,
+				  int exists_only,
 				  struct btrfs_log_ctx *ctx)
 {
 	int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
@@ -4207,7 +4248,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 	if (ret)
 		goto end_no_trans;
 
-	ret = btrfs_log_inode(trans, root, inode, inode_only);
+	ret = btrfs_log_inode(trans, root, inode, inode_only, start, end);
 	if (ret)
 		goto end_trans;
 
@@ -4235,7 +4276,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
 
 		if (BTRFS_I(inode)->generation >
 		    root->fs_info->last_trans_committed) {
-			ret = btrfs_log_inode(trans, root, inode, inode_only);
+			ret = btrfs_log_inode(trans, root, inode, inode_only,
+					      0, LLONG_MAX);
 			if (ret)
 				goto end_trans;
 		}
@@ -4269,13 +4311,15 @@ end_no_trans:
  */
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct dentry *dentry,
+			  const loff_t start,
+			  const loff_t end,
 			  struct btrfs_log_ctx *ctx)
 {
 	struct dentry *parent = dget_parent(dentry);
 	int ret;
 
 	ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent,
-				     0, ctx);
+				     start, end, 0, ctx);
 	dput(parent);
 
 	return ret;
@@ -4512,6 +4556,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans,
 		    root->fs_info->last_trans_committed))
 		return 0;
 
-	return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL);
+	return btrfs_log_inode_parent(trans, root, inode, parent, 0,
+				      LLONG_MAX, 1, NULL);
 }
 
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index 7f5b41bd5373..e2e798ae7cd7 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -59,6 +59,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans,
 int btrfs_recover_log_trees(struct btrfs_root *tree_root);
 int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
 			  struct btrfs_root *root, struct dentry *dentry,
+			  const loff_t start,
+			  const loff_t end,
 			  struct btrfs_log_ctx *ctx);
 int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
 				 struct btrfs_root *root,
-- 
cgit v1.2.3


From b0d5d10f41a0f1cd839408dd94427f2db3553bca Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Mon, 8 Sep 2014 13:08:51 -0700
Subject: Btrfs: use insert_inode_locked4 for inode creation

Btrfs was inserting inodes into the hash table before we had fully
set the inode up on disk.  This leaves us open to rare races that allow
two different inodes in memory for the same [root, inode] pair.

This patch fixes things by using insert_inode_locked4 to insert an I_NEW
inode and unlock_new_inode when we're ready for the rest of the kernel
to use the inode.

It also makes sure to init the operations pointers on the inode before
going into the error handling paths.

Signed-off-by: Chris Mason <clm@fb.com>
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/btrfs/inode.c | 176 ++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 109 insertions(+), 67 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 88823f4ca451..214b936bdd3d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -5634,6 +5634,17 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index)
 	return ret;
 }
 
+static int btrfs_insert_inode_locked(struct inode *inode)
+{
+	struct btrfs_iget_args args;
+	args.location = &BTRFS_I(inode)->location;
+	args.root = BTRFS_I(inode)->root;
+
+	return insert_inode_locked4(inode,
+		   btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root),
+		   btrfs_find_actor, &args);
+}
+
 static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				     struct btrfs_root *root,
 				     struct inode *dir,
@@ -5726,10 +5737,19 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 		sizes[1] = name_len + sizeof(*ref);
 	}
 
+	location = &BTRFS_I(inode)->location;
+	location->objectid = objectid;
+	location->offset = 0;
+	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
+
+	ret = btrfs_insert_inode_locked(inode);
+	if (ret < 0)
+		goto fail;
+
 	path->leave_spinning = 1;
 	ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems);
 	if (ret != 0)
-		goto fail;
+		goto fail_unlock;
 
 	inode_init_owner(inode, dir, mode);
 	inode_set_bytes(inode, 0);
@@ -5752,11 +5772,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 	btrfs_mark_buffer_dirty(path->nodes[0]);
 	btrfs_free_path(path);
 
-	location = &BTRFS_I(inode)->location;
-	location->objectid = objectid;
-	location->offset = 0;
-	btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY);
-
 	btrfs_inherit_iflags(inode, dir);
 
 	if (S_ISREG(mode)) {
@@ -5767,7 +5782,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 				BTRFS_INODE_NODATASUM;
 	}
 
-	btrfs_insert_inode_hash(inode);
 	inode_tree_add(inode);
 
 	trace_btrfs_inode_new(inode);
@@ -5782,6 +5796,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
 			  btrfs_ino(inode), root->root_key.objectid, ret);
 
 	return inode;
+
+fail_unlock:
+	unlock_new_inode(inode);
 fail:
 	if (dir && name)
 		BTRFS_I(dir)->index_cnt--;
@@ -5916,28 +5933,28 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-	if (err) {
-		drop_inode = 1;
-		goto out_unlock;
-	}
-
 	/*
 	* If the active LSM wants to access the inode during
 	* d_instantiate it needs these. Smack checks to see
 	* if the filesystem supports xattrs by looking at the
 	* ops vector.
 	*/
-
 	inode->i_op = &btrfs_special_inode_operations;
-	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+	init_special_inode(inode, inode->i_mode, rdev);
+
+	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
 	if (err)
-		drop_inode = 1;
-	else {
-		init_special_inode(inode, inode->i_mode, rdev);
+		goto out_unlock_inode;
+
+	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
+	if (err) {
+		goto out_unlock_inode;
+	} else {
 		btrfs_update_inode(trans, root, inode);
+		unlock_new_inode(inode);
 		d_instantiate(dentry, inode);
 	}
+
 out_unlock:
 	btrfs_end_transaction(trans, root);
 	btrfs_balance_delayed_items(root);
@@ -5947,6 +5964,12 @@ out_unlock:
 		iput(inode);
 	}
 	return err;
+
+out_unlock_inode:
+	drop_inode = 1;
+	unlock_new_inode(inode);
+	goto out_unlock;
+
 }
 
 static int btrfs_create(struct inode *dir, struct dentry *dentry,
@@ -5981,15 +6004,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 	drop_inode_on_err = 1;
-
-	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-	if (err)
-		goto out_unlock;
-
-	err = btrfs_update_inode(trans, root, inode);
-	if (err)
-		goto out_unlock;
-
 	/*
 	* If the active LSM wants to access the inode during
 	* d_instantiate it needs these. Smack checks to see
@@ -5998,14 +6012,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry,
 	*/
 	inode->i_fop = &btrfs_file_operations;
 	inode->i_op = &btrfs_file_inode_operations;
+	inode->i_mapping->a_ops = &btrfs_aops;
+	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+
+	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+	if (err)
+		goto out_unlock_inode;
+
+	err = btrfs_update_inode(trans, root, inode);
+	if (err)
+		goto out_unlock_inode;
 
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err)
-		goto out_unlock;
+		goto out_unlock_inode;
 
-	inode->i_mapping->a_ops = &btrfs_aops;
-	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+	unlock_new_inode(inode);
 	d_instantiate(dentry, inode);
 
 out_unlock:
@@ -6017,6 +6040,11 @@ out_unlock:
 	btrfs_balance_delayed_items(root);
 	btrfs_btree_balance_dirty(root);
 	return err;
+
+out_unlock_inode:
+	unlock_new_inode(inode);
+	goto out_unlock;
+
 }
 
 static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
@@ -6124,25 +6152,30 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	}
 
 	drop_on_err = 1;
+	/* these must be set before we unlock the inode */
+	inode->i_op = &btrfs_dir_inode_operations;
+	inode->i_fop = &btrfs_dir_file_operations;
 
 	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
 	if (err)
-		goto out_fail;
-
-	inode->i_op = &btrfs_dir_inode_operations;
-	inode->i_fop = &btrfs_dir_file_operations;
+		goto out_fail_inode;
 
 	btrfs_i_size_write(inode, 0);
 	err = btrfs_update_inode(trans, root, inode);
 	if (err)
-		goto out_fail;
+		goto out_fail_inode;
 
 	err = btrfs_add_link(trans, dir, inode, dentry->d_name.name,
 			     dentry->d_name.len, 0, index);
 	if (err)
-		goto out_fail;
+		goto out_fail_inode;
 
 	d_instantiate(dentry, inode);
+	/*
+	 * mkdir is special.  We're unlocking after we call d_instantiate
+	 * to avoid a race with nfsd calling d_instantiate.
+	 */
+	unlock_new_inode(inode);
 	drop_on_err = 0;
 
 out_fail:
@@ -6152,6 +6185,10 @@ out_fail:
 	btrfs_balance_delayed_items(root);
 	btrfs_btree_balance_dirty(root);
 	return err;
+
+out_fail_inode:
+	unlock_new_inode(inode);
+	goto out_fail;
 }
 
 /* helper for btfs_get_extent.  Given an existing extent in the tree,
@@ -8107,6 +8144,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
 
 	set_nlink(inode, 1);
 	btrfs_i_size_write(inode, 0);
+	unlock_new_inode(inode);
 
 	err = btrfs_subvol_inherit_props(trans, new_root, parent_root);
 	if (err)
@@ -8757,12 +8795,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 		goto out_unlock;
 	}
 
-	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
-	if (err) {
-		drop_inode = 1;
-		goto out_unlock;
-	}
-
 	/*
 	* If the active LSM wants to access the inode during
 	* d_instantiate it needs these. Smack checks to see
@@ -8771,23 +8803,22 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	*/
 	inode->i_fop = &btrfs_file_operations;
 	inode->i_op = &btrfs_file_inode_operations;
+	inode->i_mapping->a_ops = &btrfs_aops;
+	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
+	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
+
+	err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name);
+	if (err)
+		goto out_unlock_inode;
 
 	err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index);
 	if (err)
-		drop_inode = 1;
-	else {
-		inode->i_mapping->a_ops = &btrfs_aops;
-		inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
-		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
-	}
-	if (drop_inode)
-		goto out_unlock;
+		goto out_unlock_inode;
 
 	path = btrfs_alloc_path();
 	if (!path) {
 		err = -ENOMEM;
-		drop_inode = 1;
-		goto out_unlock;
+		goto out_unlock_inode;
 	}
 	key.objectid = btrfs_ino(inode);
 	key.offset = 0;
@@ -8796,9 +8827,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	err = btrfs_insert_empty_item(trans, root, path, &key,
 				      datasize);
 	if (err) {
-		drop_inode = 1;
 		btrfs_free_path(path);
-		goto out_unlock;
+		goto out_unlock_inode;
 	}
 	leaf = path->nodes[0];
 	ei = btrfs_item_ptr(leaf, path->slots[0],
@@ -8822,12 +8852,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry,
 	inode_set_bytes(inode, name_len);
 	btrfs_i_size_write(inode, name_len);
 	err = btrfs_update_inode(trans, root, inode);
-	if (err)
+	if (err) {
 		drop_inode = 1;
+		goto out_unlock_inode;
+	}
+
+	unlock_new_inode(inode);
+	d_instantiate(dentry, inode);
 
 out_unlock:
-	if (!err)
-		d_instantiate(dentry, inode);
 	btrfs_end_transaction(trans, root);
 	if (drop_inode) {
 		inode_dec_link_count(inode);
@@ -8835,6 +8868,11 @@ out_unlock:
 	}
 	btrfs_btree_balance_dirty(root);
 	return err;
+
+out_unlock_inode:
+	drop_inode = 1;
+	unlock_new_inode(inode);
+	goto out_unlock;
 }
 
 static int __btrfs_prealloc_file_range(struct inode *inode, int mode,
@@ -9018,14 +9056,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 		goto out;
 	}
 
-	ret = btrfs_init_inode_security(trans, inode, dir, NULL);
-	if (ret)
-		goto out;
-
-	ret = btrfs_update_inode(trans, root, inode);
-	if (ret)
-		goto out;
-
 	inode->i_fop = &btrfs_file_operations;
 	inode->i_op = &btrfs_file_inode_operations;
 
@@ -9033,9 +9063,16 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	inode->i_mapping->backing_dev_info = &root->fs_info->bdi;
 	BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 
+	ret = btrfs_init_inode_security(trans, inode, dir, NULL);
+	if (ret)
+		goto out_inode;
+
+	ret = btrfs_update_inode(trans, root, inode);
+	if (ret)
+		goto out_inode;
 	ret = btrfs_orphan_add(trans, inode);
 	if (ret)
-		goto out;
+		goto out_inode;
 
 	/*
 	 * We set number of links to 0 in btrfs_new_inode(), and here we set
@@ -9045,6 +9082,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 	 *    d_tmpfile() -> inode_dec_link_count() -> drop_nlink()
 	 */
 	set_nlink(inode, 1);
+	unlock_new_inode(inode);
 	d_tmpfile(dentry, inode);
 	mark_inode_dirty(inode);
 
@@ -9054,8 +9092,12 @@ out:
 		iput(inode);
 	btrfs_balance_delayed_items(root);
 	btrfs_btree_balance_dirty(root);
-
 	return ret;
+
+out_inode:
+	unlock_new_inode(inode);
+	goto out;
+
 }
 
 static const struct inode_operations btrfs_dir_inode_operations = {
-- 
cgit v1.2.3


From 21e81002f9788a3af591416b6dec60d7b67f2fb2 Mon Sep 17 00:00:00 2001
From: Cong Wang <xiyou.wangcong@gmail.com>
Date: Mon, 8 Sep 2014 16:17:55 -0700
Subject: nfs: fix kernel warning when removing proc entry

I saw the following kernel warning:

[ 1852.321222] ------------[ cut here ]------------
[ 1852.326527] WARNING: CPU: 0 PID: 118 at fs/proc/generic.c:521 remove_proc_entry+0x154/0x16b()
[ 1852.335630] remove_proc_entry: removing non-empty directory 'fs/nfsfs', leaking at least 'volumes'
[ 1852.344084] CPU: 0 PID: 118 Comm: kworker/u8:2 Not tainted 3.16.0+ #540
[ 1852.350036] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011
[ 1852.354992] Workqueue: netns cleanup_net
[ 1852.358701]  0000000000000000 ffff880116f2fbd0 ffffffff819c03e9 ffff880116f2fc18
[ 1852.366474]  ffff880116f2fc08 ffffffff810744ee ffffffff811e0e6e ffff8800d4e96238
[ 1852.373507]  ffffffff81dbe665 ffff8800d46a5948 0000000000000005 ffff880116f2fc68
[ 1852.380224] Call Trace:
[ 1852.381976]  [<ffffffff819c03e9>] dump_stack+0x4d/0x66
[ 1852.385495]  [<ffffffff810744ee>] warn_slowpath_common+0x7a/0x93
[ 1852.389869]  [<ffffffff811e0e6e>] ? remove_proc_entry+0x154/0x16b
[ 1852.393987]  [<ffffffff8107457b>] warn_slowpath_fmt+0x4c/0x4e
[ 1852.397999]  [<ffffffff811e0e6e>] remove_proc_entry+0x154/0x16b
[ 1852.402034]  [<ffffffff8129c73d>] nfs_fs_proc_net_exit+0x53/0x56
[ 1852.406136]  [<ffffffff812a103b>] nfs_net_exit+0x12/0x1d
[ 1852.409774]  [<ffffffff81785bc9>] ops_exit_list+0x44/0x55
[ 1852.413529]  [<ffffffff81786389>] cleanup_net+0xee/0x182
[ 1852.417198]  [<ffffffff81088c9e>] process_one_work+0x209/0x40d
[ 1852.502320]  [<ffffffff81088bf7>] ? process_one_work+0x162/0x40d
[ 1852.587629]  [<ffffffff810890c1>] worker_thread+0x1f0/0x2c7
[ 1852.673291]  [<ffffffff81088ed1>] ? process_scheduled_works+0x2f/0x2f
[ 1852.759470]  [<ffffffff8108e079>] kthread+0xc9/0xd1
[ 1852.843099]  [<ffffffff8109427f>] ? finish_task_switch+0x3a/0xce
[ 1852.926518]  [<ffffffff8108dfb0>] ? __kthread_parkme+0x61/0x61
[ 1853.008565]  [<ffffffff819cbeac>] ret_from_fork+0x7c/0xb0
[ 1853.076477]  [<ffffffff8108dfb0>] ? __kthread_parkme+0x61/0x61
[ 1853.140653] ---[ end trace 69c4c6617f78e32d ]---

It looks wrong that we add "/proc/net/nfsfs" in nfs_fs_proc_net_init()
while remove "/proc/fs/nfsfs" in nfs_fs_proc_net_exit().

Fixes: commit 65b38851a17 (NFS: Fix /proc/fs/nfsfs/servers and /proc/fs/nfsfs/volumes)
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: Dan Aloni <dan@kernelim.com>
Signed-off-by: Cong Wang <xiyou.wangcong@gmail.com>
[Trond: replace uses of remove_proc_entry() with remove_proc_subtree()
as suggested by Al Viro]
Cc: stable@vger.kernel.org # 3.4.x : 65b38851a17: NFS: Fix /proc/fs/nfsfs/servers
Cc: stable@vger.kernel.org # 3.4.x
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/client.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 1c5ff6d58385..6a4f3666e273 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -1412,24 +1412,18 @@ int nfs_fs_proc_net_init(struct net *net)
 	p = proc_create("volumes", S_IFREG|S_IRUGO,
 			nn->proc_nfsfs, &nfs_volume_list_fops);
 	if (!p)
-		goto error_2;
+		goto error_1;
 	return 0;
 
-error_2:
-	remove_proc_entry("servers", nn->proc_nfsfs);
 error_1:
-	remove_proc_entry("fs/nfsfs", NULL);
+	remove_proc_subtree("nfsfs", net->proc_net);
 error_0:
 	return -ENOMEM;
 }
 
 void nfs_fs_proc_net_exit(struct net *net)
 {
-	struct nfs_net *nn = net_generic(net, nfs_net_id);
-
-	remove_proc_entry("volumes", nn->proc_nfsfs);
-	remove_proc_entry("servers", nn->proc_nfsfs);
-	remove_proc_entry("fs/nfsfs", NULL);
+	remove_proc_subtree("nfsfs", net->proc_net);
 }
 
 /*
-- 
cgit v1.2.3


From 0c0e0d3c091ed5b823fb89e1d46dbb9abd5830a7 Mon Sep 17 00:00:00 2001
From: Jeff Layton <jlayton@primarydata.com>
Date: Mon, 8 Sep 2014 08:26:01 -0400
Subject: nfs: revert "nfs4: queue free_lock_state job submission to nfsiod"

This reverts commit 49a4bda22e186c4d0eb07f4a36b5b1a378f9398d.

Christoph reported an oops due to the above commit:

generic/089 242s ...[ 2187.041239] general protection fault: 0000 [#1]
SMP
[ 2187.042899] Modules linked in:
[ 2187.044000] CPU: 0 PID: 11913 Comm: kworker/0:1 Not tainted 3.16.0-rc6+ #1151
[ 2187.044287] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007
[ 2187.044287] Workqueue: nfsiod free_lock_state_work
[ 2187.044287] task: ffff880072b50cd0 ti: ffff88007a4ec000 task.ti: ffff88007a4ec000
[ 2187.044287] RIP: 0010:[<ffffffff81361ca6>]  [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
[ 2187.044287] RSP: 0018:ffff88007a4efd58  EFLAGS: 00010296
[ 2187.044287] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007a947ac0 RCX: 8000000000000000
[ 2187.044287] RDX: ffffffff826af9e0 RSI: ffff88007b093c00 RDI: ffff88007b093db8
[ 2187.044287] RBP: ffff88007a4efd58 R08: ffffffff832d3e10 R09: 000001c40efc0000
[ 2187.044287] R10: 0000000000000000 R11: 0000000000059e30 R12: ffff88007fc13240
[ 2187.044287] R13: ffff88007fc18b00 R14: ffff88007b093db8 R15: 0000000000000000
[ 2187.044287] FS:  0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000
[ 2187.044287] CS:  0010 DS: 0000 ES: 0000 CR0: 000000008005003b
[ 2187.044287] CR2: 00007f93ec33fb80 CR3: 0000000079dc2000 CR4: 00000000000006f0
[ 2187.044287] Stack:
[ 2187.044287]  ffff88007a4efdd8 ffffffff810cc877 ffffffff810cc80d ffff88007fc13258
[ 2187.044287]  000000007a947af0 0000000000000000 ffffffff8353ccc8 ffffffff82b6f3d0
[ 2187.044287]  0000000000000000 ffffffff82267679 ffff88007a4efdd8 ffff88007fc13240
[ 2187.044287] Call Trace:
[ 2187.044287]  [<ffffffff810cc877>] process_one_work+0x1c7/0x490
[ 2187.044287]  [<ffffffff810cc80d>] ? process_one_work+0x15d/0x490
[ 2187.044287]  [<ffffffff810cd569>] worker_thread+0x119/0x4f0
[ 2187.044287]  [<ffffffff810fbbad>] ? trace_hardirqs_on+0xd/0x10
[ 2187.044287]  [<ffffffff810cd450>] ? init_pwq+0x190/0x190
[ 2187.044287]  [<ffffffff810d3c6f>] kthread+0xdf/0x100
[ 2187.044287]  [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
[ 2187.044287]  [<ffffffff81d9873c>] ret_from_fork+0x7c/0xb0
[ 2187.044287]  [<ffffffff810d3b90>] ? __init_kthread_worker+0x70/0x70
[ 2187.044287] Code: 0f 1f 44 00 00 31 c0 5d c3 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8d b7 48 fe ff ff 48 8b 87 58 fe ff ff 48 89 e5 48 8b 40 30 <48> 8b 00 48 8b 10 48 89 c7 48 8b 92 90 03 00 00 ff 52 28 5d c3
[ 2187.044287] RIP  [<ffffffff81361ca6>] free_lock_state_work+0x16/0x30
[ 2187.044287]  RSP <ffff88007a4efd58>
[ 2187.103626] ---[ end trace 0f11326d28e5d8fa ]---

The original reason for this patch was because the fl_release_private
operation couldn't sleep. With commit ed9814d85810 (locks: defer freeing
locks in locks_delete_lock until after i_lock has been dropped), this is
no longer a problem so we can revert this patch.

Reported-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Jeff Layton <jlayton@primarydata.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Tested-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4_fs.h   | 13 ++++++-------
 fs/nfs/nfs4state.c | 24 ++++++------------------
 2 files changed, 12 insertions(+), 25 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 92193eddb41d..a8b855ab4e22 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -130,16 +130,15 @@ enum {
  */
 
 struct nfs4_lock_state {
-	struct list_head		ls_locks;   /* Other lock stateids */
-	struct nfs4_state *		ls_state;   /* Pointer to open state */
+	struct list_head	ls_locks;	/* Other lock stateids */
+	struct nfs4_state *	ls_state;	/* Pointer to open state */
 #define NFS_LOCK_INITIALIZED 0
 #define NFS_LOCK_LOST        1
-	unsigned long			ls_flags;
+	unsigned long		ls_flags;
 	struct nfs_seqid_counter	ls_seqid;
-	nfs4_stateid			ls_stateid;
-	atomic_t			ls_count;
-	fl_owner_t			ls_owner;
-	struct work_struct		ls_release;
+	nfs4_stateid		ls_stateid;
+	atomic_t		ls_count;
+	fl_owner_t		ls_owner;
 };
 
 /* bits for nfs4_state->flags */
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index a043f618cd5a..22fe35104c0c 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -799,18 +799,6 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner)
 	return NULL;
 }
 
-static void
-free_lock_state_work(struct work_struct *work)
-{
-	struct nfs4_lock_state *lsp = container_of(work,
-					struct nfs4_lock_state, ls_release);
-	struct nfs4_state *state = lsp->ls_state;
-	struct nfs_server *server = state->owner->so_server;
-	struct nfs_client *clp = server->nfs_client;
-
-	clp->cl_mvops->free_lock_state(server, lsp);
-}
-
 /*
  * Return a compatible lock_state. If no initialized lock_state structure
  * exists, return an uninitialized one.
@@ -832,7 +820,6 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f
 	if (lsp->ls_seqid.owner_id < 0)
 		goto out_free;
 	INIT_LIST_HEAD(&lsp->ls_locks);
-	INIT_WORK(&lsp->ls_release, free_lock_state_work);
 	return lsp;
 out_free:
 	kfree(lsp);
@@ -896,12 +883,13 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp)
 	if (list_empty(&state->lock_states))
 		clear_bit(LK_STATE_IN_USE, &state->flags);
 	spin_unlock(&state->state_lock);
-	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags))
-		queue_work(nfsiod_workqueue, &lsp->ls_release);
-	else {
-		server = state->owner->so_server;
+	server = state->owner->so_server;
+	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
+		struct nfs_client *clp = server->nfs_client;
+
+		clp->cl_mvops->free_lock_state(server, lsp);
+	} else
 		nfs4_free_lock_state(server, lsp);
-	}
 }
 
 static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src)
-- 
cgit v1.2.3


From 224ecbf5a674ec7da3a3b3ea21ca62e2853653fa Mon Sep 17 00:00:00 2001
From: Weston Andros Adamson <dros@primarydata.com>
Date: Tue, 9 Sep 2014 17:51:47 -0400
Subject: pnfs: fix filelayout_retry_commit when idx > 0

filelayout_retry_commit was recently split out from alloc_ds_commits,
but was done in such a way that the bucket pointer always starts at
index 0 no matter what the @idx argument is set to.

The intention of the @idx argument is to retry commits starting at
bucket @idx. This is called when alloc_ds_commits fails for a bucket.

Signed-off-by: Weston Andros Adamson <dros@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/filelayout/filelayout.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index 1359c4a27393..90978075f730 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -1269,11 +1269,12 @@ filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
 static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx)
 {
 	struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
-	struct pnfs_commit_bucket *bucket = fl_cinfo->buckets;
+	struct pnfs_commit_bucket *bucket;
 	struct pnfs_layout_segment *freeme;
 	int i;
 
-	for (i = idx; i < fl_cinfo->nbuckets; i++, bucket++) {
+	for (i = idx; i < fl_cinfo->nbuckets; i++) {
+		bucket = &fl_cinfo->buckets[i];
 		if (list_empty(&bucket->committing))
 			continue;
 		nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo);
-- 
cgit v1.2.3


From c680e41b3a2e944185c74bf60531e3d316d3ecc4 Mon Sep 17 00:00:00 2001
From: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Date: Tue, 9 Sep 2014 14:50:51 -0700
Subject: eventpoll: fix uninitialized variable in epoll_ctl
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When calling epoll_ctl with operation EPOLL_CTL_DEL, structure epds is
not initialized but ep_take_care_of_epollwakeup reads its event field.
When this unintialized field has EPOLLWAKEUP bit set, a capability check
is done for CAP_BLOCK_SUSPEND in ep_take_care_of_epollwakeup.  This
produces unexpected messages in the audit log, such as (on a system
running SELinux):

    type=AVC msg=audit(1408212798.866:410): avc:  denied
    { block_suspend } for  pid=7754 comm="dbus-daemon" capability=36
    scontext=unconfined_u:unconfined_r:unconfined_t
    tcontext=unconfined_u:unconfined_r:unconfined_t
    tclass=capability2 permissive=1

    type=SYSCALL msg=audit(1408212798.866:410): arch=c000003e syscall=233
    success=yes exit=0 a0=3 a1=2 a2=9 a3=7fffd4d66ec0 items=0 ppid=1
    pid=7754 auid=1000 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0
    fsgid=0 tty=(none) ses=3 comm="dbus-daemon"
    exe="/usr/bin/dbus-daemon"
    subj=unconfined_u:unconfined_r:unconfined_t key=(null)

("arch=c000003e syscall=233 a1=2" means "epoll_ctl(op=EPOLL_CTL_DEL)")

Remove use of epds in epoll_ctl when op == EPOLL_CTL_DEL.

Fixes: 4d7e30d98939 ("epoll: Add a flag, EPOLLWAKEUP, to prevent suspend while epoll events are ready")
Signed-off-by: Nicolas Iooss <nicolas.iooss_linux@m4x.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Arve Hjønnevåg <arve@android.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/eventpoll.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b10b48c2a7af..7bcfff900f05 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1852,7 +1852,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd,
 		goto error_tgt_fput;
 
 	/* Check if EPOLLWAKEUP is allowed */
-	ep_take_care_of_epollwakeup(&epds);
+	if (ep_op_has_event(op))
+		ep_take_care_of_epollwakeup(&epds);
 
 	/*
 	 * We have to check that the file structure underneath the file descriptor
-- 
cgit v1.2.3


From 1fc98d11cac6dd66342e5580cb2687e5b1e9a613 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Tue, 9 Sep 2014 14:51:04 -0700
Subject: fsnotify/fdinfo: use named constants instead of hardcoded values

MAX_HANDLE_SZ is equal to 128, but currently the size of pad is only 64
bytes, so exportfs_encode_inode_fh can return an error.

Signed-off-by: Andrey Vagin <avagin@openvz.org>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/fdinfo.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 238a5930cb3c..660d33bc1bef 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -42,7 +42,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
 {
 	struct {
 		struct file_handle handle;
-		u8 pad[64];
+		u8 pad[MAX_HANDLE_SZ];
 	} f;
 	int size, ret, i;
 
@@ -50,7 +50,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
 	size = f.handle.handle_bytes >> 2;
 
 	ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
-	if ((ret == 255) || (ret == -ENOSPC)) {
+	if ((ret == FILEID_INVALID) || (ret == -ENOSPC)) {
 		WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
 		return 0;
 	}
-- 
cgit v1.2.3


From 7e8824816bda16bb11ff5ff1e1212d642e57b0b3 Mon Sep 17 00:00:00 2001
From: Andrey Vagin <avagin@openvz.org>
Date: Tue, 9 Sep 2014 14:51:06 -0700
Subject: fs/notify: don't show f_handle if exportfs_encode_inode_fh failed

Currently we handle only ENOSPC.  In case of other errors the file_handle
variable isn't filled properly and we will show a part of stack.

Signed-off-by: Andrey Vagin <avagin@openvz.org>
Acked-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/notify/fdinfo.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index 660d33bc1bef..9d7e2b9659cb 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -50,7 +50,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
 	size = f.handle.handle_bytes >> 2;
 
 	ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
-	if ((ret == FILEID_INVALID) || (ret == -ENOSPC)) {
+	if ((ret == FILEID_INVALID) || (ret < 0)) {
 		WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
 		return 0;
 	}
-- 
cgit v1.2.3


From 7b7a91152d3c2ddca95172cac1675625cc8dffaf Mon Sep 17 00:00:00 2001
From: Benjamin Coddington <bcodding@redhat.com>
Date: Wed, 10 Sep 2014 14:09:20 -0400
Subject: GFS2: Hash the negative dentry during inode lookup

Fix a regression introduced by:
6d4ade986f9c8df31e68 GFS2: Add atomic_open support
where an early return misses d_splice_alias() which had been
adding the negative dentry.

Signed-off-by: Benjamin Coddington <bcodding@redhat.com>
Signed-off-by: Bob Peterson <rpeterso@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index e62e59477884..9317ddc1b3c3 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -840,8 +840,10 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
 	int error;
 
 	inode = gfs2_lookupi(dir, &dentry->d_name, 0);
-	if (!inode)
+	if (inode == NULL) {
+		d_add(dentry, NULL);
 		return NULL;
+	}
 	if (IS_ERR(inode))
 		return ERR_CAST(inode);
 
-- 
cgit v1.2.3


From a937cca270c544c4319572b57b196b2251b07dd2 Mon Sep 17 00:00:00 2001
From: Jan Kara <jack@suse.cz>
Date: Wed, 10 Sep 2014 21:22:51 +0200
Subject: GFS2: Don't use MAXQUOTAS value

MAXQUOTAS value defines maximum number of quota types VFS supports.
This isn't necessarily the number of types gfs2 supports and with
addition of project quotas these two numbers stop matching. So make gfs2
use its private definition.

CC: cluster-devel@redhat.com
Signed-off-by: Jan Kara <jack@suse.cz>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/incore.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index 67d310c9ada3..39e7e9959b74 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -262,6 +262,9 @@ struct gfs2_holder {
 	unsigned long gh_ip;
 };
 
+/* Number of quota types we support */
+#define GFS2_MAXQUOTAS 2
+
 /* Resource group multi-block reservation, in order of appearance:
 
    Step 1. Function prepares to write, allocates a mb, sets the size hint.
@@ -282,8 +285,8 @@ struct gfs2_blkreserv {
 	u64 rs_inum;                  /* Inode number for reservation */
 
 	/* ancillary quota stuff */
-	struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS];
-	struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS];
+	struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS];
+	struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS];
 	unsigned int rs_qa_qd_num;
 };
 
-- 
cgit v1.2.3


From f39c01047994e66e7f3d89ddb4c6141f23349d8d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Thu, 11 Sep 2014 16:19:37 +1000
Subject: NFS: remove BUG possibility in nfs4_open_and_get_state

commit 4fa2c54b5198d09607a534e2fd436581064587ed
    NFS: nfs4_do_open should add negative results to the dcache.

used "d_drop(); d_add();" to ensure that a dentry was hashed
as a negative cached entry.
This is not safe if the dentry has an non-NULL ->d_inode.
It will trigger a BUG_ON in d_instantiate().
In that case, d_delete() is needed.

Also, only d_add if the dentry is currently unhashed, it seems
pointless removed and re-adding it unchanged.

Reported-by: Christoph Hellwig <hch@infradead.org>
Fixes: 4fa2c54b5198d09607a534e2fd436581064587ed
Cc: Jeff Layton <jeff.layton@primarydata.com>
Link: http://lkml.kernel.org/r/20140908144525.GB19811@infradead.org
Signed-off-by: NeilBrown <neilb@suse.de>
Acked-by: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 7dd8aca31c29..ac2dd953fc18 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2226,9 +2226,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 	ret = _nfs4_proc_open(opendata);
 	if (ret != 0) {
 		if (ret == -ENOENT) {
-			d_drop(opendata->dentry);
-			d_add(opendata->dentry, NULL);
-			nfs_set_verifier(opendata->dentry,
+			dentry = opendata->dentry;
+			if (dentry->d_inode)
+				d_delete(dentry);
+			else if (d_unhashed(dentry))
+				d_add(dentry, NULL);
+
+			nfs_set_verifier(dentry,
 					 nfs_save_change_attribute(opendata->dir->d_inode));
 		}
 		goto out;
-- 
cgit v1.2.3


From cfb2f9d5c921e38b0f12bb26fed10b877664444d Mon Sep 17 00:00:00 2001
From: Al Viro <viro@ZenIV.linux.org.uk>
Date: Fri, 12 Sep 2014 20:56:04 +0100
Subject: GFS2: fix d_splice_alias() misuses

Callers of d_splice_alias(dentry, inode) don't need iput(), neither
on success nor on failure.  Either the reference to inode is stored
in a previously negative dentry, or it's dropped.  In either case
inode reference the caller used to hold is consumed.

__gfs2_lookup() does iput() in case when d_splice_alias() has failed.
Double iput() if we ever hit that.  And gfs2_create_inode() ends up
not only with double iput(), but with link count dropped to zero - on
an inode it has just found in directory.

Cc: stable@vger.kernel.org # v3.14+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
---
 fs/gfs2/inode.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 9317ddc1b3c3..fc8ac2ee0667 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -626,8 +626,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry,
 	if (!IS_ERR(inode)) {
 		d = d_splice_alias(inode, dentry);
 		error = PTR_ERR(d);
-		if (IS_ERR(d))
+		if (IS_ERR(d)) {
+			inode = ERR_CAST(d);
 			goto fail_gunlock;
+		}
 		error = 0;
 		if (file) {
 			if (S_ISREG(inode->i_mode)) {
@@ -856,7 +858,6 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry,
 
 	d = d_splice_alias(inode, dentry);
 	if (IS_ERR(d)) {
-		iput(inode);
 		gfs2_glock_dq_uninit(&gh);
 		return d;
 	}
-- 
cgit v1.2.3


From 99d263d4c5b2f541dfacb5391e22e8c91ea982a6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sat, 13 Sep 2014 11:30:10 -0700
Subject: vfs: fix bad hashing of dentries

Josef Bacik found a performance regression between 3.2 and 3.10 and
narrowed it down to commit bfcfaa77bdf0 ("vfs: use 'unsigned long'
accesses for dcache name comparison and hashing"). He reports:

 "The test case is essentially

      for (i = 0; i < 1000000; i++)
              mkdir("a$i");

  On xfs on a fio card this goes at about 20k dir/sec with 3.2, and 12k
  dir/sec with 3.10.  This is because we spend waaaaay more time in
  __d_lookup on 3.10 than in 3.2.

  The new hashing function for strings is suboptimal for <
  sizeof(unsigned long) string names (and hell even > sizeof(unsigned
  long) string names that I've tested).  I broke out the old hashing
  function and the new one into a userspace helper to get real numbers
  and this is what I'm getting:

      Old hash table had 1000000 entries, 0 dupes, 0 max dupes
      New hash table had 12628 entries, 987372 dupes, 900 max dupes
      We had 11400 buckets with a p50 of 30 dupes, p90 of 240 dupes, p99 of 567 dupes for the new hash

  My test does the hash, and then does the d_hash into a integer pointer
  array the same size as the dentry hash table on my system, and then
  just increments the value at the address we got to see how many
  entries we overlap with.

  As you can see the old hash function ended up with all 1 million
  entries in their own bucket, whereas the new one they are only
  distributed among ~12.5k buckets, which is why we're using so much
  more CPU in __d_lookup".

The reason for this hash regression is two-fold:

 - On 64-bit architectures the down-mixing of the original 64-bit
   word-at-a-time hash into the final 32-bit hash value is very
   simplistic and suboptimal, and just adds the two 32-bit parts
   together.

   In particular, because there is no bit shuffling and the mixing
   boundary is also a byte boundary, similar character patterns in the
   low and high word easily end up just canceling each other out.

 - the old byte-at-a-time hash mixed each byte into the final hash as it
   hashed the path component name, resulting in the low bits of the hash
   generally being a good source of hash data.  That is not true for the
   word-at-a-time case, and the hash data is distributed among all the
   bits.

The fix is the same in both cases: do a better job of mixing the bits up
and using as much of the hash data as possible.  We already have the
"hash_32|64()" functions to do that.

Reported-by: Josef Bacik <jbacik@fb.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Chris Mason <clm@fb.com>
Cc: linux-fsdevel@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/dcache.c | 3 +--
 fs/namei.c  | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index d30ce699ae4b..4023e77b800e 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -106,8 +106,7 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent,
 					unsigned int hash)
 {
 	hash += (unsigned long) parent / L1_CACHE_BYTES;
-	hash = hash + (hash >> d_hash_shift);
-	return dentry_hashtable + (hash & d_hash_mask);
+	return dentry_hashtable + hash_32(hash, d_hash_shift);
 }
 
 /* Statistics gathering. */
diff --git a/fs/namei.c b/fs/namei.c
index a996bb48dfab..229235862e50 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -34,6 +34,7 @@
 #include <linux/device_cgroup.h>
 #include <linux/fs_struct.h>
 #include <linux/posix_acl.h>
+#include <linux/hash.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -1634,8 +1635,7 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
 
 static inline unsigned int fold_hash(unsigned long hash)
 {
-	hash += hash >> (8*sizeof(int));
-	return hash;
+	return hash_64(hash, 32);
 }
 
 #else	/* 32-bit case */
-- 
cgit v1.2.3


From 6f18493e541c690169c3b1479d47d95f624161cf Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 11 Sep 2014 18:55:50 -0400
Subject: move the call of __d_drop(anon) into __d_materialise_unique(dentry,
 anon)

and lock the right list there

Cc: stable@vger.kernel.org
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/dcache.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/dcache.c b/fs/dcache.c
index d30ce699ae4b..5c6e71dc23f5 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2656,6 +2656,12 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon)
 	dentry->d_parent = dentry;
 	list_del_init(&dentry->d_u.d_child);
 	anon->d_parent = dparent;
+	if (likely(!d_unhashed(anon))) {
+		hlist_bl_lock(&anon->d_sb->s_anon);
+		__hlist_bl_del(&anon->d_hash);
+		anon->d_hash.pprev = NULL;
+		hlist_bl_unlock(&anon->d_sb->s_anon);
+	}
 	list_move(&anon->d_u.d_child, &dparent->d_subdirs);
 
 	write_seqcount_end(&dentry->d_seq);
@@ -2714,7 +2720,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry)
 			write_seqlock(&rename_lock);
 			__d_materialise_dentry(dentry, new);
 			write_sequnlock(&rename_lock);
-			__d_drop(new);
 			_d_rehash(new);
 			spin_unlock(&new->d_lock);
 			spin_unlock(&inode->i_lock);
@@ -2778,7 +2783,6 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode)
 				 * could splice into our tree? */
 				__d_materialise_dentry(dentry, alias);
 				write_sequnlock(&rename_lock);
-				__d_drop(alias);
 				goto found;
 			} else {
 				/* Nope, but we must(!) avoid directory
-- 
cgit v1.2.3


From f5be3e29127aec8c87f883aadadff337f8c2cfd7 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 13 Sep 2014 21:50:45 -0400
Subject: fix bogus read_seqretry() checks introduced in b37199e

read_seqretry() returns true on mismatch, not on match...

Cc: stable@vger.kernel.org # 3.15+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index a996bb48dfab..3d1dc745f9d8 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1137,7 +1137,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 		 */
 		*inode = path->dentry->d_inode;
 	}
-	return read_seqretry(&mount_lock, nd->m_seq) &&
+	return !read_seqretry(&mount_lock, nd->m_seq) &&
 		!(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT);
 }
 
@@ -1174,7 +1174,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 		nd->path.mnt = &mounted->mnt;
 		nd->path.dentry = mounted->mnt.mnt_root;
 		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
-		if (!read_seqretry(&mount_lock, nd->m_seq))
+		if (read_seqretry(&mount_lock, nd->m_seq))
 			goto failed;
 	}
 	nd->inode = nd->path.dentry->d_inode;
-- 
cgit v1.2.3


From 7bd88377d482e1eae3c5329b12e33cfd664fa6a9 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 13 Sep 2014 21:55:46 -0400
Subject: don't bugger nd->seq on set_root_rcu() from follow_dotdot_rcu()

return the value instead, and have path_init() do the assignment.  Broken by
"vfs: Fix absolute RCU path walk failures due to uninitialized seq number",
which was Cc-stable with 2.6.38+ as destination.  This one should go where
it went.

To avoid dummy value returned in case when root is already set (it would do
no harm, actually, since the only caller that doesn't ignore the return value
is guaranteed to have nd->root *not* set, but it's more obvious that way),
lift the check into callers.  And do the same to set_root(), to keep them
in sync.

Cc: stable@vger.kernel.org # 2.6.38+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 3d1dc745f9d8..fe47e6d8e85f 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -643,24 +643,22 @@ static int complete_walk(struct nameidata *nd)
 
 static __always_inline void set_root(struct nameidata *nd)
 {
-	if (!nd->root.mnt)
-		get_fs_root(current->fs, &nd->root);
+	get_fs_root(current->fs, &nd->root);
 }
 
 static int link_path_walk(const char *, struct nameidata *);
 
-static __always_inline void set_root_rcu(struct nameidata *nd)
+static __always_inline unsigned set_root_rcu(struct nameidata *nd)
 {
-	if (!nd->root.mnt) {
-		struct fs_struct *fs = current->fs;
-		unsigned seq;
+	struct fs_struct *fs = current->fs;
+	unsigned seq, res;
 
-		do {
-			seq = read_seqcount_begin(&fs->seq);
-			nd->root = fs->root;
-			nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq);
-		} while (read_seqcount_retry(&fs->seq, seq));
-	}
+	do {
+		seq = read_seqcount_begin(&fs->seq);
+		nd->root = fs->root;
+		res = __read_seqcount_begin(&nd->root.dentry->d_seq);
+	} while (read_seqcount_retry(&fs->seq, seq));
+	return res;
 }
 
 static void path_put_conditional(struct path *path, struct nameidata *nd)
@@ -860,7 +858,8 @@ follow_link(struct path *link, struct nameidata *nd, void **p)
 			return PTR_ERR(s);
 		}
 		if (*s == '/') {
-			set_root(nd);
+			if (!nd->root.mnt)
+				set_root(nd);
 			path_put(&nd->path);
 			nd->path = nd->root;
 			path_get(&nd->root);
@@ -1143,7 +1142,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 
 static int follow_dotdot_rcu(struct nameidata *nd)
 {
-	set_root_rcu(nd);
+	if (!nd->root.mnt)
+		set_root_rcu(nd);
 
 	while (1) {
 		if (nd->path.dentry == nd->root.dentry &&
@@ -1256,7 +1256,8 @@ static void follow_mount(struct path *path)
 
 static void follow_dotdot(struct nameidata *nd)
 {
-	set_root(nd);
+	if (!nd->root.mnt)
+		set_root(nd);
 
 	while(1) {
 		struct dentry *old = nd->path.dentry;
@@ -1852,7 +1853,7 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 	if (*name=='/') {
 		if (flags & LOOKUP_RCU) {
 			rcu_read_lock();
-			set_root_rcu(nd);
+			nd->seq = set_root_rcu(nd);
 		} else {
 			set_root(nd);
 			path_get(&nd->root);
-- 
cgit v1.2.3


From 4023bfc9f351a7994fb6a7d515476c320f94a574 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sat, 13 Sep 2014 21:59:43 -0400
Subject: be careful with nd->inode in path_init() and follow_dotdot_rcu()

in the former we simply check if dentry is still valid after picking
its ->d_inode; in the latter we fetch ->d_inode in the same places
where we fetch dentry and its ->d_seq, under the same checks.

Cc: stable@vger.kernel.org # 2.6.38+
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namei.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index fe47e6d8e85f..d07bc1b206c3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1142,6 +1142,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path,
 
 static int follow_dotdot_rcu(struct nameidata *nd)
 {
+	struct inode *inode = nd->inode;
 	if (!nd->root.mnt)
 		set_root_rcu(nd);
 
@@ -1155,6 +1156,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 			struct dentry *parent = old->d_parent;
 			unsigned seq;
 
+			inode = parent->d_inode;
 			seq = read_seqcount_begin(&parent->d_seq);
 			if (read_seqcount_retry(&old->d_seq, nd->seq))
 				goto failed;
@@ -1164,6 +1166,7 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 		}
 		if (!follow_up_rcu(&nd->path))
 			break;
+		inode = nd->path.dentry->d_inode;
 		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
 	}
 	while (d_mountpoint(nd->path.dentry)) {
@@ -1173,11 +1176,12 @@ static int follow_dotdot_rcu(struct nameidata *nd)
 			break;
 		nd->path.mnt = &mounted->mnt;
 		nd->path.dentry = mounted->mnt.mnt_root;
+		inode = nd->path.dentry->d_inode;
 		nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq);
 		if (read_seqretry(&mount_lock, nd->m_seq))
 			goto failed;
 	}
-	nd->inode = nd->path.dentry->d_inode;
+	nd->inode = inode;
 	return 0;
 
 failed:
@@ -1904,7 +1908,14 @@ static int path_init(int dfd, const char *name, unsigned int flags,
 	}
 
 	nd->inode = nd->path.dentry->d_inode;
-	return 0;
+	if (!(flags & LOOKUP_RCU))
+		return 0;
+	if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq)))
+		return 0;
+	if (!(nd->flags & LOOKUP_ROOT))
+		nd->root.mnt = NULL;
+	rcu_read_unlock();
+	return -ECHILD;
 }
 
 static inline int lookup_last(struct nameidata *nd, struct path *path)
-- 
cgit v1.2.3


From 2ae83bf93882d1ec0cd775c489bd1bee611f792e Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Sun, 14 Sep 2014 17:06:36 -0500
Subject: [CIFS] Fix setting time before epoch (negative time values)

xfstest generic/258 sets the time on a file to a negative value
(before 1970) which fails since do_div can not handle negative
numbers.  In addition 'normal' division of 64 bit values does
not build on 32 bit arch so have to workaround this by special
casing negative values in cifs_NTtimeToUnix

Samba server also has a bug with this (see samba bugzilla 7771)
but it works to Windows server.

Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/netmisc.c | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c
index 6834b9c3bec1..b333ff60781d 100644
--- a/fs/cifs/netmisc.c
+++ b/fs/cifs/netmisc.c
@@ -925,11 +925,23 @@ cifs_NTtimeToUnix(__le64 ntutc)
 	/* BB what about the timezone? BB */
 
 	/* Subtract the NTFS time offset, then convert to 1s intervals. */
-	u64 t;
+	s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
+
+	/*
+	 * Unfortunately can not use normal 64 bit division on 32 bit arch, but
+	 * the alternative, do_div, does not work with negative numbers so have
+	 * to special case them
+	 */
+	if (t < 0) {
+		t = -t;
+		ts.tv_nsec = (long)(do_div(t, 10000000) * 100);
+		ts.tv_nsec = -ts.tv_nsec;
+		ts.tv_sec = -t;
+	} else {
+		ts.tv_nsec = (long)do_div(t, 10000000) * 100;
+		ts.tv_sec = t;
+	}
 
-	t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
-	ts.tv_nsec = do_div(t, 10000000) * 100;
-	ts.tv_sec = t;
 	return ts;
 }
 
-- 
cgit v1.2.3


From 9226b5b440f2b4fbb3b797f3cb74a9a627220660 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Sun, 14 Sep 2014 17:28:32 -0700
Subject: vfs: avoid non-forwarding large load after small store in path lookup

The performance regression that Josef Bacik reported in the pathname
lookup (see commit 99d263d4c5b2 "vfs: fix bad hashing of dentries") made
me look at performance stability of the dcache code, just to verify that
the problem was actually fixed.  That turned up a few other problems in
this area.

There are a few cases where we exit RCU lookup mode and go to the slow
serializing case when we shouldn't, Al has fixed those and they'll come
in with the next VFS pull.

But my performance verification also shows that link_path_walk() turns
out to have a very unfortunate 32-bit store of the length and hash of
the name we look up, followed by a 64-bit read of the combined hash_len
field.  That screws up the processor store to load forwarding, causing
an unnecessary hickup in this critical routine.

It's caused by the ugly calling convention for the "hash_name()"
function, and easily fixed by just making hash_name() fill in the whole
'struct qstr' rather than passing it a pointer to just the hash value.

With that, the profile for this function looks much smoother.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c             | 19 ++++++++++---------
 include/linux/dcache.h |  1 +
 2 files changed, 11 insertions(+), 9 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 229235862e50..2be5120b81b3 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1669,13 +1669,14 @@ EXPORT_SYMBOL(full_name_hash);
 
 /*
  * Calculate the length and hash of the path component, and
- * return the length of the component;
+ * fill in the qstr. return the "len" as the result.
  */
-static inline unsigned long hash_name(const char *name, unsigned int *hashp)
+static inline unsigned long hash_name(const char *name, struct qstr *res)
 {
 	unsigned long a, b, adata, bdata, mask, hash, len;
 	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
 
+	res->name = name;
 	hash = a = 0;
 	len = -sizeof(unsigned long);
 	do {
@@ -1691,9 +1692,10 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp)
 	mask = create_zero_mask(adata | bdata);
 
 	hash += a & zero_bytemask(mask);
-	*hashp = fold_hash(hash);
+	len += find_zero(mask);
+	res->hash_len = hashlen_create(fold_hash(hash), len);
 
-	return len + find_zero(mask);
+	return len;
 }
 
 #else
@@ -1711,18 +1713,19 @@ EXPORT_SYMBOL(full_name_hash);
  * We know there's a real path component here of at least
  * one character.
  */
-static inline unsigned long hash_name(const char *name, unsigned int *hashp)
+static inline long hash_name(const char *name, struct qstr *res)
 {
 	unsigned long hash = init_name_hash();
 	unsigned long len = 0, c;
 
+	res->name = name;
 	c = (unsigned char)*name;
 	do {
 		len++;
 		hash = partial_name_hash(c, hash);
 		c = (unsigned char)name[len];
 	} while (c && c != '/');
-	*hashp = end_name_hash(hash);
+	res->hash_len = hashlen_create(end_name_hash(hash), len);
 	return len;
 }
 
@@ -1756,9 +1759,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
  		if (err)
 			break;
 
-		len = hash_name(name, &this.hash);
-		this.name = name;
-		this.len = len;
+		len = hash_name(name, &this);
 
 		type = LAST_NORM;
 		if (name[0] == '.') switch (len) {
diff --git a/include/linux/dcache.h b/include/linux/dcache.h
index e4ae2ad48d07..75a227cc7ce2 100644
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -55,6 +55,7 @@ struct qstr {
 #define QSTR_INIT(n,l) { { { .len = l } }, .name = n }
 #define hashlen_hash(hashlen) ((u32) (hashlen))
 #define hashlen_len(hashlen)  ((u32)((hashlen) >> 32))
+#define hashlen_create(hash,len) (((u64)(len)<<32)|(u32)(hash))
 
 struct dentry_stat_t {
 	long nr_dentry;
-- 
cgit v1.2.3


From da80659d4aa758dc6935b10ec64513f0b67bc969 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Sun, 14 Sep 2014 23:27:09 -0500
Subject: [SMB3] Fix oops when creating symlinks on smb3

We were not checking for symlink support properly for SMB2/SMB3
mounts so could oops when mounted with mfsymlinks when try
to create symlink when mfsymlinks on smb2/smb3 mounts

Signed-off-by: Steve French <smfrench@gmail.com>
Cc: <stable@vger.kernel.org> # 3.14+
CC: Sachin Prabhu <sprabhu@redhat.com>
---
 fs/cifs/link.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 68559fd557fb..a5c2812ead68 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -213,8 +213,12 @@ create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		goto out;
 
-	rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb,
-					fromName, buf, &bytes_written);
+	if (tcon->ses->server->ops->create_mf_symlink)
+		rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon,
+					cifs_sb, fromName, buf, &bytes_written);
+	else
+		rc = -EOPNOTSUPP;
+
 	if (rc)
 		goto out;
 
-- 
cgit v1.2.3


From d6bb3e9075bbcf758d6084bb581f797bb6ea24c6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 15 Sep 2014 10:51:07 -0700
Subject: vfs: simplify and shrink stack frame of link_path_walk()

Commit 9226b5b440f2 ("vfs: avoid non-forwarding large load after small
store in path lookup") made link_path_walk() always access the
"hash_len" field as a single 64-bit entity, in order to avoid mixed size
accesses to the members.

However, what I didn't notice was that that effectively means that the
whole "struct qstr this" is now basically redundant.  We already
explicitly track the "const char *name", and if we just use "u64
hash_len" instead of "long len", there is nothing else left of the
"struct qstr".

We do end up wanting the "struct qstr" if we have a filesystem with a
"d_hash()" function, but that's a rare case, and we might as well then
just squirrell away the name and hash_len at that point.

End result: fewer live variables in the loop, a smaller stack frame, and
better code generation.  And we don't need to pass in pointers variables
to helper functions any more, because the return value contains all the
relevant information.  So this removes more lines than it adds, and the
source code is clearer too.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 39 ++++++++++++++++++---------------------
 1 file changed, 18 insertions(+), 21 deletions(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 215e44254c53..01d03892316c 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1674,14 +1674,13 @@ EXPORT_SYMBOL(full_name_hash);
 
 /*
  * Calculate the length and hash of the path component, and
- * fill in the qstr. return the "len" as the result.
+ * return the "hash_len" as the result.
  */
-static inline unsigned long hash_name(const char *name, struct qstr *res)
+static inline u64 hash_name(const char *name)
 {
 	unsigned long a, b, adata, bdata, mask, hash, len;
 	const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
 
-	res->name = name;
 	hash = a = 0;
 	len = -sizeof(unsigned long);
 	do {
@@ -1698,9 +1697,7 @@ static inline unsigned long hash_name(const char *name, struct qstr *res)
 
 	hash += a & zero_bytemask(mask);
 	len += find_zero(mask);
-	res->hash_len = hashlen_create(fold_hash(hash), len);
-
-	return len;
+	return hashlen_create(fold_hash(hash), len);
 }
 
 #else
@@ -1718,20 +1715,18 @@ EXPORT_SYMBOL(full_name_hash);
  * We know there's a real path component here of at least
  * one character.
  */
-static inline long hash_name(const char *name, struct qstr *res)
+static inline u64 hash_name(const char *name)
 {
 	unsigned long hash = init_name_hash();
 	unsigned long len = 0, c;
 
-	res->name = name;
 	c = (unsigned char)*name;
 	do {
 		len++;
 		hash = partial_name_hash(c, hash);
 		c = (unsigned char)name[len];
 	} while (c && c != '/');
-	res->hash_len = hashlen_create(end_name_hash(hash), len);
-	return len;
+	return hashlen_create(end_name_hash(hash), len);
 }
 
 #endif
@@ -1756,18 +1751,17 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 
 	/* At this point we know we have a real path component. */
 	for(;;) {
-		struct qstr this;
-		long len;
+		u64 hash_len;
 		int type;
 
 		err = may_lookup(nd);
  		if (err)
 			break;
 
-		len = hash_name(name, &this);
+		hash_len = hash_name(name);
 
 		type = LAST_NORM;
-		if (name[0] == '.') switch (len) {
+		if (name[0] == '.') switch (hashlen_len(hash_len)) {
 			case 2:
 				if (name[1] == '.') {
 					type = LAST_DOTDOT;
@@ -1781,29 +1775,32 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			struct dentry *parent = nd->path.dentry;
 			nd->flags &= ~LOOKUP_JUMPED;
 			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
+				struct qstr this = { .hash_len = hash_len, .name = name };
 				err = parent->d_op->d_hash(parent, &this);
 				if (err < 0)
 					break;
+				hash_len = this.hash_len;
+				name = this.name;
 			}
 		}
 
-		nd->last = this;
+		nd->last.hash_len = hash_len;
+		nd->last.name = name;
 		nd->last_type = type;
 
-		if (!name[len])
+		name += hashlen_len(hash_len);
+		if (!*name)
 			return 0;
 		/*
 		 * If it wasn't NUL, we know it was '/'. Skip that
 		 * slash, and continue until no more slashes.
 		 */
 		do {
-			len++;
-		} while (unlikely(name[len] == '/'));
-		if (!name[len])
+			name++;
+		} while (unlikely(*name == '/'));
+		if (!*name)
 			return 0;
 
-		name += len;
-
 		err = walk_component(nd, &next, LOOKUP_FOLLOW);
 		if (err < 0)
 			return err;
-- 
cgit v1.2.3


From a5c3e1c725af9e84deceb3c33939ca4ffe3fefc8 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Tue, 16 Sep 2014 04:16:19 -0500
Subject: Revert "cifs: No need to send SIGKILL to demux_thread during umount"

This reverts commit 52a36244443eabb594bdb63622ff2dd7a083f0e2.

Causes rmmod to fail for at least 7 seconds after unmount which
makes automated testing a little harder when reloading cifs.ko
between test runs.

Signed-off-by: Namjae Jeon <namjae.jeon@samsung.com>
CC: Jeff Layton <jlayton@primarydata.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/connect.c | 19 +++++++++++++++++++
 1 file changed, 19 insertions(+)

(limited to 'fs')

diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 8a9fded7c135..36ca2045009b 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -837,6 +837,7 @@ cifs_demultiplex_thread(void *p)
 	struct TCP_Server_Info *server = p;
 	unsigned int pdu_length;
 	char *buf = NULL;
+	struct task_struct *task_to_wake = NULL;
 	struct mid_q_entry *mid_entry;
 
 	current->flags |= PF_MEMALLOC;
@@ -927,7 +928,19 @@ cifs_demultiplex_thread(void *p)
 	if (server->smallbuf) /* no sense logging a debug message if NULL */
 		cifs_small_buf_release(server->smallbuf);
 
+	task_to_wake = xchg(&server->tsk, NULL);
 	clean_demultiplex_info(server);
+
+	/* if server->tsk was NULL then wait for a signal before exiting */
+	if (!task_to_wake) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		while (!signal_pending(current)) {
+			schedule();
+			set_current_state(TASK_INTERRUPTIBLE);
+		}
+		set_current_state(TASK_RUNNING);
+	}
+
 	module_put_and_exit(0);
 }
 
@@ -2050,6 +2063,8 @@ cifs_find_tcp_session(struct smb_vol *vol)
 static void
 cifs_put_tcp_session(struct TCP_Server_Info *server)
 {
+	struct task_struct *task;
+
 	spin_lock(&cifs_tcp_ses_lock);
 	if (--server->srv_count > 0) {
 		spin_unlock(&cifs_tcp_ses_lock);
@@ -2073,6 +2088,10 @@ cifs_put_tcp_session(struct TCP_Server_Info *server)
 	kfree(server->session_key.response);
 	server->session_key.response = NULL;
 	server->session_key.len = 0;
+
+	task = xchg(&server->tsk, NULL);
+	if (task)
+		force_sig(SIGKILL, task);
 }
 
 static struct TCP_Server_Info *
-- 
cgit v1.2.3


From 116ae5e2b09f7022281c253a6037a74d0446bfaf Mon Sep 17 00:00:00 2001
From: Arnd Bergmann <arnd@arndb.de>
Date: Mon, 8 Sep 2014 17:20:56 +0200
Subject: cifs: remove dead code

cifs provides two dummy functions 'sess_auth_lanman' and
'sess_auth_kerberos' for the case in which the respective
features are not defined. However, the caller is also under
an #ifdef, so we just get warnings about unused code:

fs/cifs/sess.c:1109:1: warning: 'sess_auth_kerberos' defined but not used [-Wunused-function]
 sess_auth_kerberos(struct sess_data *sess_data)

Removing the dead functions gets rid of the warnings without
any downsides that I can see.

(Yalin Wang reported the identical problem and fix so added him)

Signed-off-by: Arnd Bergmann <arnd@arndb.de>
Signed-off-by: Yalin Wang <yalin.wang@sonymobile.com>
Signed-off-by: Steve French <smfrench@gmail.com>
---
 fs/cifs/sess.c | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'fs')

diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index 3a5e83317683..57db63ff88da 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -745,14 +745,6 @@ out:
 	sess_free_buffer(sess_data);
 }
 
-#else
-
-static void
-sess_auth_lanman(struct sess_data *sess_data)
-{
-	sess_data->result = -EOPNOTSUPP;
-	sess_data->func = NULL;
-}
 #endif
 
 static void
@@ -1103,15 +1095,6 @@ out:
 	ses->auth_key.response = NULL;
 }
 
-#else
-
-static void
-sess_auth_kerberos(struct sess_data *sess_data)
-{
-	cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n");
-	sess_data->result = -ENOSYS;
-	sess_data->func = NULL;
-}
 #endif /* ! CONFIG_CIFS_UPCALL */
 
 /*
-- 
cgit v1.2.3


From 69af38dbc5b44e90dde35af4a1df3f5510809a1a Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Tue, 16 Sep 2014 04:13:31 -0500
Subject: Update version number displayed by modinfo for cifs.ko

Update cifs.ko version to 2.05

Signed-off-by: Steve French <smfrench@gmail.com>w
---
 fs/cifs/cifsfs.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index b0fafa499505..002e0c173939 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* CONFIG_CIFS_NFSD_EXPORT */
 
-#define CIFS_VERSION   "2.04"
+#define CIFS_VERSION   "2.05"
 #endif				/* _CIFSFS_H */
-- 
cgit v1.2.3


From 364d42930d96a872b2076deeb9c24f9ff132de34 Mon Sep 17 00:00:00 2001
From: Steve French <smfrench@gmail.com>
Date: Tue, 16 Sep 2014 06:40:25 -0500
Subject: Fix mfsymlinks file size check

If the mfsymlinks file size has changed (e.g. the file no longer
represents an emulated symlink) we were not returning an error properly.

Signed-off-by: Steve French <smfrench@gmail.com>
Reviewed-by: Stefan Metzmacher <metze@samba.org>
---
 fs/cifs/link.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index a5c2812ead68..5657416d3483 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -343,9 +343,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon,
 	if (rc)
 		return rc;
 
-	if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE))
+	if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) {
+		rc = -ENOENT;
 		/* it's not a symlink */
 		goto out;
+	}
 
 	io_parms.netfid = fid.netfid;
 	io_parms.pid = current->tgid;
-- 
cgit v1.2.3


From a060dc5010ffa32f3a83e5336f6eeb6551fa137a Mon Sep 17 00:00:00 2001
From: James Hogan <james.hogan@imgtec.com>
Date: Tue, 16 Sep 2014 13:07:35 +0100
Subject: vfs: workaround gcc <4.6 build error in link_path_walk()

Commit d6bb3e9075bb ("vfs: simplify and shrink stack frame of
link_path_walk()") introduced build problems with GCC versions older
than 4.6 due to the initialisation of a member of an anonymous union in
struct qstr without enclosing braces.

This hits GCC bug 10676 [1] (which was fixed in GCC 4.6 by [2]), and
causes the following build error:

  fs/namei.c: In function 'link_path_walk':
  fs/namei.c:1778: error: unknown field 'hash_len' specified in initializer

This is worked around by adding explicit braces.

[1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=10676
[2] https://gcc.gnu.org/viewcvs/gcc?view=revision&revision=159206

Fixes: d6bb3e9075bb (vfs: simplify and shrink stack frame of link_path_walk())
Signed-off-by: James Hogan <james.hogan@imgtec.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: linux-fsdevel@vger.kernel.org
Cc: linux-metag@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/namei.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/namei.c b/fs/namei.c
index 01d03892316c..a7b05bf82d31 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1775,7 +1775,7 @@ static int link_path_walk(const char *name, struct nameidata *nd)
 			struct dentry *parent = nd->path.dentry;
 			nd->flags &= ~LOOKUP_JUMPED;
 			if (unlikely(parent->d_flags & DCACHE_OP_HASH)) {
-				struct qstr this = { .hash_len = hash_len, .name = name };
+				struct qstr this = { { .hash_len = hash_len }, .name = name };
 				err = parent->d_op->d_hash(parent, &this);
 				if (err < 0)
 					break;
-- 
cgit v1.2.3


From 125c4cf9f37c98fed2c08229b31358cfec63dcf6 Mon Sep 17 00:00:00 2001
From: Filipe Manana <fdmanana@suse.com>
Date: Thu, 11 Sep 2014 21:22:14 +0100
Subject: Btrfs: set inode's logged_trans/last_log_commit after ranged fsync

When a ranged fsync finishes if there are still extent maps in the modified
list, still set the inode's logged_trans and last_log_commit. This is important
in case an inode is fsync'ed and unlinked in the same transaction, to ensure its
inode ref gets deleted from the log and the respective dentries in its parent
are deleted too from the log (if the parent directory was fsync'ed in the same
transaction).

Instead make btrfs_inode_in_log() return false if the list of modified extent
maps isn't empty.

This is an incremental on top of the v4 version of the patch:

    "Btrfs: fix fsync data loss after a ranged fsync"

which was added to its v5, but didn't make it on time.

Signed-off-by: Filipe Manana <fdmanana@suse.com>
Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/btrfs_inode.h | 13 +++++++++++--
 fs/btrfs/tree-log.c    | 14 ++------------
 2 files changed, 13 insertions(+), 14 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 43527fd78825..56b8522d5767 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -234,8 +234,17 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation)
 	    BTRFS_I(inode)->last_sub_trans <=
 	    BTRFS_I(inode)->last_log_commit &&
 	    BTRFS_I(inode)->last_sub_trans <=
-	    BTRFS_I(inode)->root->last_log_commit)
-		return 1;
+	    BTRFS_I(inode)->root->last_log_commit) {
+		/*
+		 * After a ranged fsync we might have left some extent maps
+		 * (that fall outside the fsync's range). So return false
+		 * here if the list isn't empty, to make sure btrfs_log_inode()
+		 * will be called and process those extent maps.
+		 */
+		smp_mb();
+		if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents))
+			return 1;
+	}
 	return 0;
 }
 
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index d296efe2d3e7..1d1ba083ca6e 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -4093,18 +4093,8 @@ log_extents:
 		}
 	}
 
-	write_lock(&em_tree->lock);
-	/*
-	 * If we're doing a ranged fsync and there are still modified extents
-	 * in the list, we must run on the next fsync call as it might cover
-	 * those extents (a full fsync or an fsync for other range).
-	 */
-	if (list_empty(&em_tree->modified_extents)) {
-		BTRFS_I(inode)->logged_trans = trans->transid;
-		BTRFS_I(inode)->last_log_commit =
-			BTRFS_I(inode)->last_sub_trans;
-	}
-	write_unlock(&em_tree->lock);
+	BTRFS_I(inode)->logged_trans = trans->transid;
+	BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans;
 out_unlock:
 	if (unlikely(err))
 		btrfs_put_logged_extents(&logged_list);
-- 
cgit v1.2.3


From 3e1199dcad004a40b55297a4736ccd1b9b81a952 Mon Sep 17 00:00:00 2001
From: Milosz Tanski <milosz@adfin.com>
Date: Wed, 13 Aug 2014 12:58:26 -0400
Subject: FS-Cache: refcount becomes corrupt under vma pressure.

In rare cases under heavy VMA pressure the ref count for a fscache cookie
becomes corrupt. In this case we decrement ref count even if we fail before
incrementing the refcount.

FS-Cache: Assertion failed bnode-eca5f9c6/syslog
0 > 0 is false
------------[ cut here ]------------
kernel BUG at fs/fscache/cookie.c:519!
invalid opcode: 0000 [#1] SMP
Call Trace:
[<ffffffffa01ba060>] __fscache_relinquish_cookie+0x50/0x220 [fscache]
[<ffffffffa02d64ce>] ceph_fscache_unregister_inode_cookie+0x3e/0x50 [ceph]
[<ffffffffa02ae1d3>] ceph_destroy_inode+0x33/0x200 [ceph]
[<ffffffff811cf67e>] ? __fsnotify_inode_delete+0xe/0x10
[<ffffffff811a9e0c>] destroy_inode+0x3c/0x70
[<ffffffff811a9f51>] evict+0x111/0x180
[<ffffffff811aa763>] iput+0x103/0x190
[<ffffffff811a5de8>] __dentry_kill+0x1c8/0x220
[<ffffffff811a5f31>] shrink_dentry_list+0xf1/0x250
[<ffffffff811a762c>] prune_dcache_sb+0x4c/0x60
[<ffffffff811930af>] super_cache_scan+0xff/0x170
[<ffffffff8113d7a0>] shrink_slab_node+0x140/0x2c0
[<ffffffff8113f2da>] shrink_slab+0x8a/0x130
[<ffffffff81142572>] balance_pgdat+0x3e2/0x5d0
[<ffffffff811428ca>] kswapd+0x16a/0x4a0
[<ffffffff810a43f0>] ? __wake_up_sync+0x20/0x20
[<ffffffff81142760>] ? balance_pgdat+0x5d0/0x5d0
[<ffffffff81083e09>] kthread+0xc9/0xe0
[<ffffffff81010000>] ? ftrace_raw_event_xen_mmu_release_ptpage+0x70/0x90
[<ffffffff81083d40>] ? flush_kthread_worker+0xb0/0xb0
[<ffffffff8159f63c>] ret_from_fork+0x7c/0xb0
[<ffffffff81083d40>] ? flush_kthread_worker+0xb0/0xb0
RIP [<ffffffffa01b984b>] __fscache_disable_cookie+0x1db/0x210 [fscache]
RSP <ffff8803bc85f9b8>
---[ end trace 254d0d7c74a01f25 ]---

Signed-off-by: Milosz Tanski <milosz@adfin.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/fscache/page.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'fs')

diff --git a/fs/fscache/page.c b/fs/fscache/page.c
index 781ac7b0b53e..de33b3fccca6 100644
--- a/fs/fscache/page.c
+++ b/fs/fscache/page.c
@@ -198,7 +198,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 {
 	struct fscache_operation *op;
 	struct fscache_object *object;
-	bool wake_cookie;
+	bool wake_cookie = false;
 
 	_enter("%p", cookie);
 
@@ -228,15 +228,16 @@ int __fscache_attr_changed(struct fscache_cookie *cookie)
 
 	__fscache_use_cookie(cookie);
 	if (fscache_submit_exclusive_op(object, op) < 0)
-		goto nobufs;
+		goto nobufs_dec;
 	spin_unlock(&cookie->lock);
 	fscache_stat(&fscache_n_attr_changed_ok);
 	fscache_put_operation(op);
 	_leave(" = 0");
 	return 0;
 
-nobufs:
+nobufs_dec:
 	wake_cookie = __fscache_unuse_cookie(cookie);
+nobufs:
 	spin_unlock(&cookie->lock);
 	kfree(op);
 	if (wake_cookie)
-- 
cgit v1.2.3


From 696382f938d22597f4945865ed8e3f25e240cd41 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.de>
Date: Wed, 27 Aug 2014 15:06:20 +0100
Subject: cachefiles: remove two unused pagevecs.

These two have been unused since

commit c4d6d8dbf335c7fa47341654a37c53a512b519bb
    CacheFiles: Fix the marking of cached pages

in 3.8.

Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/cachefiles/rdwr.c | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'fs')

diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c
index 4b1fb5ca65b8..25e745b8eb1b 100644
--- a/fs/cachefiles/rdwr.c
+++ b/fs/cachefiles/rdwr.c
@@ -151,7 +151,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
 	struct cachefiles_one_read *monitor;
 	struct cachefiles_object *object;
 	struct fscache_retrieval *op;
-	struct pagevec pagevec;
 	int error, max;
 
 	op = container_of(_op, struct fscache_retrieval, op);
@@ -160,8 +159,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op)
 
 	_enter("{ino=%lu}", object->backer->d_inode->i_ino);
 
-	pagevec_init(&pagevec, 0);
-
 	max = 8;
 	spin_lock_irq(&object->work_lock);
 
@@ -396,7 +393,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 {
 	struct cachefiles_object *object;
 	struct cachefiles_cache *cache;
-	struct pagevec pagevec;
 	struct inode *inode;
 	sector_t block0, block;
 	unsigned shift;
@@ -427,8 +423,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op,
 	op->op.flags |= FSCACHE_OP_ASYNC;
 	op->op.processor = cachefiles_read_copier;
 
-	pagevec_init(&pagevec, 0);
-
 	/* we assume the absence or presence of the first block is a good
 	 * enough indication for the page as a whole
 	 * - TODO: don't use bmap() for this as it is _not_ actually good
-- 
cgit v1.2.3


From e2cf1f1cc7636bd860e47cd0ad6194da8975f8b5 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Wed, 17 Sep 2014 23:28:38 +0100
Subject: CacheFiles: Handle rename2

Not all filesystems now provide the rename i_op - ext4 for one - but rather
provide the rename2 i_op.  CacheFiles checks that the filesystem has rename
and so will reject ext4 now with EPERM:

	CacheFiles: Failed to register: -1

Fix this by checking for rename2 as an alternative.  The call to vfs_rename()
actually handles selection of the appropriate function, so we needn't worry
about that.

Turning on debugging shows:

	[cachef] ==> cachefiles_get_directory(,,cache)
	[cachef] subdir -> ffff88000b22b778 positive
	[cachef] <== cachefiles_get_directory() = -1 [check]

where -1 is EPERM.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 fs/cachefiles/namei.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'fs')

diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c
index 5bf2b41e66d3..83e9c94ca2cf 100644
--- a/fs/cachefiles/namei.c
+++ b/fs/cachefiles/namei.c
@@ -779,7 +779,8 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache,
 	    !subdir->d_inode->i_op->lookup ||
 	    !subdir->d_inode->i_op->mkdir ||
 	    !subdir->d_inode->i_op->create ||
-	    !subdir->d_inode->i_op->rename ||
+	    (!subdir->d_inode->i_op->rename &&
+	     !subdir->d_inode->i_op->rename2) ||
 	    !subdir->d_inode->i_op->rmdir ||
 	    !subdir->d_inode->i_op->unlink)
 		goto check_error;
-- 
cgit v1.2.3


From 0f23ae74f589304bf33233f85737f4fd368549eb Mon Sep 17 00:00:00 2001
From: Chris Mason <clm@fb.com>
Date: Thu, 18 Sep 2014 07:49:05 -0700
Subject: Revert "Btrfs: device_list_add() should not update list when mounted"

This reverts commit b96de000bc8bc9688b3a2abea4332bd57648a49f.

This commit is triggering failures to mount by subvolume id in some
configurations.  The main problem is how many different ways this
scanning function is used, both for scanning while mounted and
unmounted.  A proper cleanup is too big for late rcs.

For now, just revert the commit and we'll put a better fix into a later
merge window.

Signed-off-by: Chris Mason <clm@fb.com>
---
 fs/btrfs/volumes.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'fs')

diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index 340a92d08e84..2c2d6d1d8eee 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -529,12 +529,12 @@ static noinline int device_list_add(const char *path,
 		 */
 
 		/*
-		 * As of now don't allow update to btrfs_fs_device through
-		 * the btrfs dev scan cli, after FS has been mounted.
+		 * For now, we do allow update to btrfs_fs_device through the
+		 * btrfs dev scan cli after FS has been mounted.  We're still
+		 * tracking a problem where systems fail mount by subvolume id
+		 * when we reject replacement on a mounted FS.
 		 */
-		if (fs_devices->opened) {
-			return -EBUSY;
-		} else {
+		if (!fs_devices->opened && found_transid < device->generation) {
 			/*
 			 * That is if the FS is _not_ mounted and if you
 			 * are here, that means there is more than one
@@ -542,8 +542,7 @@ static noinline int device_list_add(const char *path,
 			 * with larger generation number or the last-in if
 			 * generation are equal.
 			 */
-			if (found_transid < device->generation)
-				return -EEXIST;
+			return -EEXIST;
 		}
 
 		name = rcu_string_strdup(path, GFP_NOFS);
-- 
cgit v1.2.3


From 080af20cc945d110f9912d01cf6b66f94a375b8d Mon Sep 17 00:00:00 2001
From: Steve Dickson <steved@redhat.com>
Date: Thu, 18 Sep 2014 09:13:17 -0400
Subject: NFSv4: nfs4_state_manager() vs. nfs_server_remove_lists()

There is a race between nfs4_state_manager() and
nfs_server_remove_lists() that happens during a nfsv3 mount.

The v3 mount notices there is already a supper block so
nfs_server_remove_lists() called which uses the nfs_client_lock
spin lock to synchronize access to the client list.

At the same time nfs4_state_manager() is running through
the client list looking for work to do, using the same
lock. When nfs4_state_manager() wins the race to the
list, a v3 client pointer is found and not ignored
properly which causes the panic.

Moving some protocol checks before the state checking
avoids the panic.

CC: Stable Tree <stable@vger.kernel.org>
Signed-off-by: Steve Dickson <steved@redhat.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4client.c | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 53e435a95260..ffdb28d86cf8 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -482,6 +482,16 @@ int nfs40_walk_client_list(struct nfs_client *new,
 
 	spin_lock(&nn->nfs_client_lock);
 	list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+		if (pos->rpc_ops != new->rpc_ops)
+			continue;
+
+		if (pos->cl_proto != new->cl_proto)
+			continue;
+
+		if (pos->cl_minorversion != new->cl_minorversion)
+			continue;
+
 		/* If "pos" isn't marked ready, we can't trust the
 		 * remaining fields in "pos" */
 		if (pos->cl_cons_state > NFS_CS_READY) {
@@ -501,15 +511,6 @@ int nfs40_walk_client_list(struct nfs_client *new,
 		if (pos->cl_cons_state != NFS_CS_READY)
 			continue;
 
-		if (pos->rpc_ops != new->rpc_ops)
-			continue;
-
-		if (pos->cl_proto != new->cl_proto)
-			continue;
-
-		if (pos->cl_minorversion != new->cl_minorversion)
-			continue;
-
 		if (pos->cl_clientid != new->cl_clientid)
 			continue;
 
@@ -622,6 +623,16 @@ int nfs41_walk_client_list(struct nfs_client *new,
 
 	spin_lock(&nn->nfs_client_lock);
 	list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) {
+
+		if (pos->rpc_ops != new->rpc_ops)
+			continue;
+
+		if (pos->cl_proto != new->cl_proto)
+			continue;
+
+		if (pos->cl_minorversion != new->cl_minorversion)
+			continue;
+
 		/* If "pos" isn't marked ready, we can't trust the
 		 * remaining fields in "pos", especially the client
 		 * ID and serverowner fields.  Wait for CREATE_SESSION
@@ -647,15 +658,6 @@ int nfs41_walk_client_list(struct nfs_client *new,
 		if (pos->cl_cons_state != NFS_CS_READY)
 			continue;
 
-		if (pos->rpc_ops != new->rpc_ops)
-			continue;
-
-		if (pos->cl_proto != new->cl_proto)
-			continue;
-
-		if (pos->cl_minorversion != new->cl_minorversion)
-			continue;
-
 		if (!nfs4_match_clientids(pos, new))
 			continue;
 
-- 
cgit v1.2.3


From cd9288ffaea4359d5cfe2b8d264911506aed26a4 Mon Sep 17 00:00:00 2001
From: Trond Myklebust <trond.myklebust@primarydata.com>
Date: Thu, 18 Sep 2014 11:51:32 -0400
Subject: NFSv4: Fix another bug in the close/open_downgrade code

James Drew reports another bug whereby the NFS client is now sending
an OPEN_DOWNGRADE in a situation where it should really have sent a
CLOSE: the client is opening the file for O_RDWR, but then trying to
do a downgrade to O_RDONLY, which is not allowed by the NFSv4 spec.

Reported-by: James Drews <drews@engr.wisc.edu>
Link: http://lkml.kernel.org/r/541AD7E5.8020409@engr.wisc.edu
Fixes: aee7af356e15 (NFSv4: Fix problems with close in the presence...)
Cc: stable@vger.kernel.org # 2.6.33+
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
---
 fs/nfs/nfs4proc.c | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

(limited to 'fs')

diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index ac2dd953fc18..6ca0c8e7a945 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2618,23 +2618,23 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data)
 	is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags);
 	is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags);
 	is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags);
-	/* Calculate the current open share mode */
-	calldata->arg.fmode = 0;
-	if (is_rdonly || is_rdwr)
-		calldata->arg.fmode |= FMODE_READ;
-	if (is_wronly || is_rdwr)
-		calldata->arg.fmode |= FMODE_WRITE;
 	/* Calculate the change in open mode */
+	calldata->arg.fmode = 0;
 	if (state->n_rdwr == 0) {
-		if (state->n_rdonly == 0) {
-			call_close |= is_rdonly || is_rdwr;
-			calldata->arg.fmode &= ~FMODE_READ;
-		}
-		if (state->n_wronly == 0) {
-			call_close |= is_wronly || is_rdwr;
-			calldata->arg.fmode &= ~FMODE_WRITE;
-		}
-	}
+		if (state->n_rdonly == 0)
+			call_close |= is_rdonly;
+		else if (is_rdonly)
+			calldata->arg.fmode |= FMODE_READ;
+		if (state->n_wronly == 0)
+			call_close |= is_wronly;
+		else if (is_wronly)
+			calldata->arg.fmode |= FMODE_WRITE;
+	} else if (is_rdwr)
+		calldata->arg.fmode |= FMODE_READ|FMODE_WRITE;
+
+	if (calldata->arg.fmode == 0)
+		call_close |= is_rdwr;
+
 	if (!nfs4_valid_open_stateid(state))
 		call_close = 0;
 	spin_unlock(&state->owner->so_lock);
-- 
cgit v1.2.3


From f2d5a94436cc7cc0221b9a81bba2276a25187dd3 Mon Sep 17 00:00:00 2001
From: Anton Altaparmakov <aia21@cam.ac.uk>
Date: Mon, 22 Sep 2014 01:53:03 +0100
Subject: Fix nasty 32-bit overflow bug in buffer i/o code.

On 32-bit architectures, the legacy buffer_head functions are not always
handling the sector number with the proper 64-bit types, and will thus
fail on 4TB+ disks.

Any code that uses __getblk() (and thus bread(), breadahead(),
sb_bread(), sb_breadahead(), sb_getblk()), and calls it using a 64-bit
block on a 32-bit arch (where "long" is 32-bit) causes an inifinite loop
in __getblk_slow() with an infinite stream of errors logged to dmesg
like this:

  __find_get_block_slow() failed. block=6740375944, b_blocknr=2445408648
  b_state=0x00000020, b_size=512
  device sda1 blocksize: 512

Note how in hex block is 0x191C1F988 and b_blocknr is 0x91C1F988 i.e. the
top 32-bits are missing (in this case the 0x1 at the top).

This is because grow_dev_page() is broken and has a 32-bit overflow due
to shifting the page index value (a pgoff_t - which is just 32 bits on
32-bit architectures) left-shifted as the block number.  But the top
bits to get lost as the pgoff_t is not type cast to sector_t / 64-bit
before the shift.

This patch fixes this issue by type casting "index" to sector_t before
doing the left shift.

Note this is not a theoretical bug but has been seen in the field on a
4TiB hard drive with logical sector size 512 bytes.

This patch has been verified to fix the infinite loop problem on 3.17-rc5
kernel using a 4TB disk image mounted using "-o loop".  Without this patch
doing a "find /nt" where /nt is an NTFS volume causes the inifinite loop
100% reproducibly whilst with the patch it works fine as expected.

Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
Cc: stable@vger.kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/buffer.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'fs')

diff --git a/fs/buffer.c b/fs/buffer.c
index 8f05111bbb8b..3588a80854b2 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1022,7 +1022,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 		bh = page_buffers(page);
 		if (bh->b_size == size) {
 			end_block = init_page_buffers(page, bdev,
-						index << sizebits, size);
+						(sector_t)index << sizebits,
+						size);
 			goto done;
 		}
 		if (!try_to_free_buffers(page))
@@ -1043,7 +1044,8 @@ grow_dev_page(struct block_device *bdev, sector_t block,
 	 */
 	spin_lock(&inode->i_mapping->private_lock);
 	link_dev_buffers(page, bh);
-	end_block = init_page_buffers(page, bdev, index << sizebits, size);
+	end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits,
+			size);
 	spin_unlock(&inode->i_mapping->private_lock);
 done:
 	ret = (block < end_block) ? 1 : -ENXIO;
-- 
cgit v1.2.3