From 2bb93d244157b6dfa4964d4088be4680b3169701 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 20 Aug 2014 18:56:29 -0500 Subject: Trivial whitespace fix Signed-off-by: Steve French --- fs/cifs/smb2ops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 5a48aa290dfe..4e4eecdec4f9 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1035,7 +1035,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, if (keep_size == false) return -EOPNOTSUPP; - /* + /* * Must check if file sparse since fallocate -z (zero range) assumes * non-sparse allocation */ -- cgit v1.2.3 From eaebdedc61ac7bc3dd6ef6eb508097c4aaabef0c Mon Sep 17 00:00:00 2001 From: Fabian Frederick Date: Wed, 2 Jul 2014 22:08:46 +0200 Subject: GFS2: fs/gfs2/super.c: replace seq_printf by seq_puts fix checkpatch warnings: "WARNING: Prefer seq_puts to seq_printf" Cc: cluster-devel@redhat.com Signed-off-by: Fabian Frederick Signed-off-by: Steven Whitehouse --- fs/gfs2/super.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 2607ff13d486..a346f56c4c6d 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1294,7 +1294,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) int val; if (is_ancestor(root, sdp->sd_master_dir)) - seq_printf(s, ",meta"); + seq_puts(s, ",meta"); if (args->ar_lockproto[0]) seq_printf(s, ",lockproto=%s", args->ar_lockproto); if (args->ar_locktable[0]) @@ -1302,13 +1302,13 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) if (args->ar_hostdata[0]) seq_printf(s, ",hostdata=%s", args->ar_hostdata); if (args->ar_spectator) - seq_printf(s, ",spectator"); + seq_puts(s, ",spectator"); if (args->ar_localflocks) - seq_printf(s, ",localflocks"); + seq_puts(s, ",localflocks"); if (args->ar_debug) - seq_printf(s, ",debug"); + seq_puts(s, ",debug"); if (args->ar_posix_acl) - seq_printf(s, ",acl"); + seq_puts(s, ",acl"); if (args->ar_quota != GFS2_QUOTA_DEFAULT) { char *state; switch (args->ar_quota) { @@ -1328,7 +1328,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",quota=%s", state); } if (args->ar_suiddir) - seq_printf(s, ",suiddir"); + seq_puts(s, ",suiddir"); if (args->ar_data != GFS2_DATA_DEFAULT) { char *state; switch (args->ar_data) { @@ -1345,7 +1345,7 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",data=%s", state); } if (args->ar_discard) - seq_printf(s, ",discard"); + seq_puts(s, ",discard"); val = sdp->sd_tune.gt_logd_secs; if (val != 30) seq_printf(s, ",commit=%d", val); @@ -1376,11 +1376,11 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",errors=%s", state); } if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) - seq_printf(s, ",nobarrier"); + seq_puts(s, ",nobarrier"); if (test_bit(SDF_DEMOTE, &sdp->sd_flags)) - seq_printf(s, ",demote_interface_used"); + seq_puts(s, ",demote_interface_used"); if (args->ar_rgrplvb) - seq_printf(s, ",rgrplvb"); + seq_puts(s, ",rgrplvb"); return 0; } -- cgit v1.2.3 From b650738cd093a9f9e9551db9ce5cd68acd842dc0 Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 6 Aug 2014 09:08:36 -0400 Subject: GFS2: Change maxlen variables to size_t This patch changes some variables (especially maxlen in function gfs2_block_map) from unsigned int to size_t. We need 64-bit arithmetic for very large files (e.g. 1PB) where the variables otherwise get shifted to all 0's. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index e6ee5b6e8d99..f0b945ab853e 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -359,7 +359,7 @@ static inline void release_metapath(struct metapath *mp) * Returns: The length of the extent (minimum of one block) */ -static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, unsigned limit, int *eob) +static inline unsigned int gfs2_extent_length(void *start, unsigned int len, __be64 *ptr, size_t limit, int *eob) { const __be64 *end = (start + len); const __be64 *first = ptr; @@ -449,7 +449,7 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, struct buffer_head *bh_map, struct metapath *mp, const unsigned int sheight, const unsigned int height, - const unsigned int maxlen) + const size_t maxlen) { struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); @@ -483,7 +483,8 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, } else { /* Need to allocate indirect blocks */ ptrs_per_blk = height > 1 ? sdp->sd_inptrs : sdp->sd_diptrs; - dblks = min(maxlen, ptrs_per_blk - mp->mp_list[end_of_metadata]); + dblks = min(maxlen, (size_t)(ptrs_per_blk - + mp->mp_list[end_of_metadata])); if (height == ip->i_height) { /* Writing into existing tree, extend tree down */ iblks = height - sheight; @@ -605,7 +606,7 @@ int gfs2_block_map(struct inode *inode, sector_t lblock, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_sbd *sdp = GFS2_SB(inode); unsigned int bsize = sdp->sd_sb.sb_bsize; - const unsigned int maxlen = bh_map->b_size >> inode->i_blkbits; + const size_t maxlen = bh_map->b_size >> inode->i_blkbits; const u64 *arr = sdp->sd_heightsize; __be64 *ptr; u64 size; -- cgit v1.2.3 From 2ddfbdd6848d496d8088b28992d257fd02e58c9d Mon Sep 17 00:00:00 2001 From: Bob Peterson Date: Wed, 20 Aug 2014 12:44:45 -0400 Subject: GFS2: Request demote when a "try" flock fails This patch changes the flock code so that it uses the TRY_1CB flag instead of the TRY flag on the first attempt. That forces any holding nodes to issue a dlm callback, which requests a demote of the glock. Then, if the "try" failed, it sleeps a small amount of time for the demote to occur. Then it tries again, for an increasing amount of time. Subsequent attempts to gain the "try" lock don't use "_1CB" so that only one callback is issued. Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/file.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 26b3f952e6b1..7f4ed3daa38c 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -26,6 +26,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -979,9 +980,10 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) unsigned int state; int flags; int error = 0; + int sleeptime; state = (fl->fl_type == F_WRLCK) ? LM_ST_EXCLUSIVE : LM_ST_SHARED; - flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY) | GL_EXACT; + flags = (IS_SETLKW(cmd) ? 0 : LM_FLAG_TRY_1CB) | GL_EXACT; mutex_lock(&fp->f_fl_mutex); @@ -1001,7 +1003,14 @@ static int do_flock(struct file *file, int cmd, struct file_lock *fl) gfs2_holder_init(gl, state, flags, fl_gh); gfs2_glock_put(gl); } - error = gfs2_glock_nq(fl_gh); + for (sleeptime = 1; sleeptime <= 4; sleeptime <<= 1) { + error = gfs2_glock_nq(fl_gh); + if (error != GLR_TRYFAILED) + break; + fl_gh->gh_flags = LM_FLAG_TRY | GL_EXACT; + fl_gh->gh_error = 0; + msleep(sleeptime); + } if (error) { gfs2_holder_uninit(fl_gh); if (error == GLR_TRYFAILED) @@ -1024,7 +1033,7 @@ static void do_unflock(struct file *file, struct file_lock *fl) mutex_lock(&fp->f_fl_mutex); flock_lock_file_wait(file, fl); if (fl_gh->gh_gl) { - gfs2_glock_dq_wait(fl_gh); + gfs2_glock_dq(fl_gh); gfs2_holder_uninit(fl_gh); } mutex_unlock(&fp->f_fl_mutex); -- cgit v1.2.3 From 27b7edcf1ce03a3eddda24d4d271a9b29572a78b Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 20 Aug 2014 19:39:28 +0900 Subject: cifs: fix a possible null pointer deref in decode_ascii_ssetup When kzalloc fails, we will end up doing NULL pointer derefrence Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: Steve French --- fs/cifs/sess.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 39ee32688eac..3a5e83317683 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -243,10 +243,11 @@ static void decode_ascii_ssetup(char **pbcc_area, __u16 bleft, kfree(ses->serverOS); ses->serverOS = kzalloc(len + 1, GFP_KERNEL); - if (ses->serverOS) + if (ses->serverOS) { strncpy(ses->serverOS, bcc_ptr, len); - if (strncmp(ses->serverOS, "OS/2", 4) == 0) - cifs_dbg(FYI, "OS/2 server\n"); + if (strncmp(ses->serverOS, "OS/2", 4) == 0) + cifs_dbg(FYI, "OS/2 server\n"); + } bcc_ptr += len + 1; bleft -= len + 1; -- cgit v1.2.3 From d6ccf4997e62fb6629f9f003980dca5292138b7b Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 21 Aug 2014 19:11:20 +0900 Subject: cifs: fix memory leak when password is supplied multiple times Unlikely but possible. When password is supplied multiple times, we have to free the previous allocation. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: Steve French --- fs/cifs/connect.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 03ed8a09581c..36ca2045009b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1600,6 +1600,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, tmp_end++; if (!(tmp_end < end && tmp_end[1] == delim)) { /* No it is not. Set the password to NULL */ + kfree(vol->password); vol->password = NULL; break; } @@ -1637,6 +1638,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, options = end; } + kfree(vol->password); /* Now build new password string */ temp_len = strlen(value); vol->password = kzalloc(temp_len+1, GFP_KERNEL); -- cgit v1.2.3 From 7de975e349b295f387f34eed38f115223f17d5ee Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 20 Aug 2014 19:39:41 +0900 Subject: cifs: fix a possible use of uninit variable in SMB2_sess_setup In case of error, goto ssetup_exit can be hit and we could end up using uninitialized value of resp_buftype Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index fa0dd044213b..9df5d8effe47 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -530,7 +530,7 @@ SMB2_sess_setup(const unsigned int xid, struct cifs_ses *ses, struct smb2_sess_setup_rsp *rsp = NULL; struct kvec iov[2]; int rc = 0; - int resp_buftype; + int resp_buftype = CIFS_NO_BUFFER; __le32 phase = NtLmNegotiate; /* NTLMSSP, if needed, is multistage */ struct TCP_Server_Info *server = ses->server; u16 blob_length = 0; -- cgit v1.2.3 From d4a029d21556437b09ffb3207cf2871651bec38f Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 20 Aug 2014 19:39:59 +0900 Subject: cifs: remove unneeded check of null checking in if condition Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: Steve French --- fs/cifs/smb2pdu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 9df5d8effe47..cb39c51cd3e0 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1403,8 +1403,7 @@ SMB2_close(const unsigned int xid, struct cifs_tcon *tcon, rsp = (struct smb2_close_rsp *)iov[0].iov_base; if (rc != 0) { - if (tcon) - cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE); + cifs_stats_fail_inc(tcon, SMB2_CLOSE_HE); goto close_exit; } -- cgit v1.2.3 From 787aded65044e4cabefcf7eb7576c2dd6b151468 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Fri, 22 Aug 2014 14:22:51 +0900 Subject: cifs: Allow directIO read/write during cache=strict Currently cifs have all or nothing approach for directIO operations. cache=strict mode does not allow directIO while cache=none mode performs all the operations as directIO even when user does not specify O_DIRECT flag. This patch enables strict cache mode to honour directIO semantics. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Signed-off-by: Steve French --- fs/cifs/dir.c | 8 ++++++++ fs/cifs/file.c | 8 ++++++++ 2 files changed, 16 insertions(+) (limited to 'fs') diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 3db0c5fd9a11..6cbd9c688cfe 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -497,6 +497,14 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, goto out; } + if (file->f_flags & O_DIRECT && + CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { + if (CIFS_SB(inode->i_sb)->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + file->f_op = &cifs_file_direct_nobrl_ops; + else + file->f_op = &cifs_file_direct_ops; + } + file_info = cifs_new_fileinfo(&fid, file, tlink, oplock); if (file_info == NULL) { if (server->ops->close) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index d5fec92e0360..7c018a1c52f7 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -467,6 +467,14 @@ int cifs_open(struct inode *inode, struct file *file) cifs_dbg(FYI, "inode = 0x%p file flags are 0x%x for %s\n", inode, file->f_flags, full_path); + if (file->f_flags & O_DIRECT && + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_STRICT_IO) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_BRL) + file->f_op = &cifs_file_direct_nobrl_ops; + else + file->f_op = &cifs_file_direct_ops; + } + if (server->oplocks) oplock = REQ_OPLOCK; else -- cgit v1.2.3 From 52a36244443eabb594bdb63622ff2dd7a083f0e2 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Wed, 20 Aug 2014 19:39:17 +0900 Subject: cifs: No need to send SIGKILL to demux_thread during umount There is no need to explicitly send SIGKILL to cifs_demultiplex_thread as it is calling module_put_and_exit to exit cleanly. socket sk_rcvtimeo is set to 7 HZ so the thread will wake up in 7 seconds and clean itself. Signed-off-by: Namjae Jeon Signed-off-by: Ashish Sangwan Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 19 ------------------- 1 file changed, 19 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 36ca2045009b..8a9fded7c135 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -837,7 +837,6 @@ cifs_demultiplex_thread(void *p) struct TCP_Server_Info *server = p; unsigned int pdu_length; char *buf = NULL; - struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; current->flags |= PF_MEMALLOC; @@ -928,19 +927,7 @@ cifs_demultiplex_thread(void *p) if (server->smallbuf) /* no sense logging a debug message if NULL */ cifs_small_buf_release(server->smallbuf); - task_to_wake = xchg(&server->tsk, NULL); clean_demultiplex_info(server); - - /* if server->tsk was NULL then wait for a signal before exiting */ - if (!task_to_wake) { - set_current_state(TASK_INTERRUPTIBLE); - while (!signal_pending(current)) { - schedule(); - set_current_state(TASK_INTERRUPTIBLE); - } - set_current_state(TASK_RUNNING); - } - module_put_and_exit(0); } @@ -2063,8 +2050,6 @@ cifs_find_tcp_session(struct smb_vol *vol) static void cifs_put_tcp_session(struct TCP_Server_Info *server) { - struct task_struct *task; - spin_lock(&cifs_tcp_ses_lock); if (--server->srv_count > 0) { spin_unlock(&cifs_tcp_ses_lock); @@ -2088,10 +2073,6 @@ cifs_put_tcp_session(struct TCP_Server_Info *server) kfree(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; - - task = xchg(&server->tsk, NULL); - if (task) - force_sig(SIGKILL, task); } static struct TCP_Server_Info * -- cgit v1.2.3 From a07d322059db66b84c9eb4f98959df468e88b34b Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 22 Aug 2014 13:32:09 +0400 Subject: CIFS: Fix directory rename error CIFS servers process nlink counts differently for files and directories. In cifs_rename() if we the request fails on the existing target, we try to remove it through cifs_unlink() but this is not what we want to do for directories. As the result the following sequence of commands mkdir {1,2}; mv -T 1 2; rmdir {1,2}; mkdir {1,2}; echo foo > 2/bar and XFS test generic/023 fail with -ENOENT error. That's why the second mkdir reuses the existing inode (target inode of the mv -T command) with S_DEAD flag. Fix this by checking whether the target is directory or not and calling cifs_rmdir() rather than cifs_unlink() for directories. Cc: Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 949ec909ec9a..7899a40465b3 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1720,7 +1720,10 @@ cifs_rename2(struct inode *source_dir, struct dentry *source_dentry, unlink_target: /* Try unlinking the target dentry if it's not negative */ if (target_dentry->d_inode && (rc == -EACCES || rc == -EEXIST)) { - tmprc = cifs_unlink(target_dir, target_dentry); + if (d_is_dir(target_dentry)) + tmprc = cifs_rmdir(target_dir, target_dentry); + else + tmprc = cifs_unlink(target_dir, target_dentry); if (tmprc) goto cifs_rename_exit; rc = cifs_do_rename(xid, source_dentry, from_name, -- cgit v1.2.3 From f736906a7669a77cf8cabdcbcf1dc8cb694e12ef Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Tue, 26 Aug 2014 19:04:44 +0400 Subject: CIFS: Fix wrong restart readdir for SMB1 The existing code calls server->ops->close() that is not right. This causes XFS test generic/310 to fail. Fix this by using server->ops->closedir() function. Cc: # v3.7+ Signed-off-by: Dan Carpenter Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/readdir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 798c80a41c88..b334a89d6a66 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -596,8 +596,8 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, if (server->ops->dir_needs_close(cfile)) { cfile->invalidHandle = true; spin_unlock(&cifs_file_list_lock); - if (server->ops->close) - server->ops->close(xid, tcon, &cfile->fid); + if (server->ops->close_dir) + server->ops->close_dir(xid, tcon, &cfile->fid); } else spin_unlock(&cifs_file_list_lock); if (cfile->srch_inf.ntwrk_buf_start) { -- cgit v1.2.3 From 1bbe4997b13de903c421c1cc78440e544b5f9064 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Fri, 22 Aug 2014 13:32:11 +0400 Subject: CIFS: Fix wrong filename length for SMB2 The existing code uses the old MAX_NAME constant. This causes XFS test generic/013 to fail. Fix it by replacing MAX_NAME with PATH_MAX that SMB1 uses. Also remove an unused MAX_NAME constant definition. Cc: # v3.7+ Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French --- fs/cifs/cifsglob.h | 5 ----- fs/cifs/smb2file.c | 2 +- fs/cifs/smb2inode.c | 2 +- fs/cifs/smb2ops.c | 2 +- fs/cifs/smb2pdu.c | 2 +- 5 files changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index dfc731b02aa9..25b8392bfdd2 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -70,11 +70,6 @@ #define SERVER_NAME_LENGTH 40 #define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) -/* used to define string lengths for reversing unicode strings */ -/* (256+1)*2 = 514 */ -/* (max path length + 1 for null) * 2 for unicode */ -#define MAX_NAME 514 - /* SMB echo "timeout" -- FIXME: tunable? */ #define SMB_ECHO_INTERVAL (60 * HZ) diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c index 3f17b4550831..45992944e238 100644 --- a/fs/cifs/smb2file.c +++ b/fs/cifs/smb2file.c @@ -50,7 +50,7 @@ smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, goto out; } - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, + smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, GFP_KERNEL); if (smb2_data == NULL) { rc = -ENOMEM; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index 0150182a4494..899bbc86f73e 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -131,7 +131,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, *adjust_tz = false; *symlink = false; - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, + smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, GFP_KERNEL); if (smb2_data == NULL) return -ENOMEM; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 4e4eecdec4f9..f522193b7184 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -389,7 +389,7 @@ smb2_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_file_all_info *smb2_data; - smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + MAX_NAME * 2, + smb2_data = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, GFP_KERNEL); if (smb2_data == NULL) return -ENOMEM; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index cb39c51cd3e0..74b3a6684383 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1532,7 +1532,7 @@ SMB2_query_info(const unsigned int xid, struct cifs_tcon *tcon, { return query_info(xid, tcon, persistent_fid, volatile_fid, FILE_ALL_INFORMATION, - sizeof(struct smb2_file_all_info) + MAX_NAME * 2, + sizeof(struct smb2_file_all_info) + PATH_MAX * 2, sizeof(struct smb2_file_all_info), data); } -- cgit v1.2.3 From ca5d13fc33cc56f7405004574d18172ddbdea2ef Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 22 Aug 2014 04:40:46 -0500 Subject: Clarify Kconfig help text for CIFS and SMB2/SMB3 Clarify descriptions of SMB2 and SMB3 support in Kconfig Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky --- fs/cifs/Kconfig | 35 +++++++++++++++++++++++------------ 1 file changed, 23 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/cifs/Kconfig b/fs/cifs/Kconfig index 603f18a65c12..a2172f3f69e3 100644 --- a/fs/cifs/Kconfig +++ b/fs/cifs/Kconfig @@ -22,6 +22,11 @@ config CIFS support for OS/2 and Windows ME and similar servers is provided as well. + The module also provides optional support for the followon + protocols for CIFS including SMB3, which enables + useful performance and security features (see the description + of CONFIG_CIFS_SMB2). + The cifs module provides an advanced network file system client for mounting to CIFS compliant servers. It includes support for DFS (hierarchical name space), secure per-user @@ -121,7 +126,8 @@ config CIFS_ACL depends on CIFS_XATTR && KEYS help Allows fetching CIFS/NTFS ACL from the server. The DACL blob - is handed over to the application/caller. + is handed over to the application/caller. See the man + page for getcifsacl for more information. config CIFS_DEBUG bool "Enable CIFS debugging routines" @@ -162,7 +168,7 @@ config CIFS_NFSD_EXPORT Allows NFS server to export a CIFS mounted share (nfsd over cifs) config CIFS_SMB2 - bool "SMB2 network file system support" + bool "SMB2 and SMB3 network file system support" depends on CIFS && INET select NLS select KEYS @@ -170,16 +176,21 @@ config CIFS_SMB2 select DNS_RESOLVER help - This enables experimental support for the SMB2 (Server Message Block - version 2) protocol. The SMB2 protocol is the successor to the - popular CIFS and SMB network file sharing protocols. SMB2 is the - native file sharing mechanism for recent versions of Windows - operating systems (since Vista). SMB2 enablement will eventually - allow users better performance, security and features, than would be - possible with cifs. Note that smb2 mount options also are simpler - (compared to cifs) due to protocol improvements. - - Unless you are a developer or tester, say N. + This enables support for the Server Message Block version 2 + family of protocols, including SMB3. SMB3 support is + enabled on mount by specifying "vers=3.0" in the mount + options. These protocols are the successors to the popular + CIFS and SMB network file sharing protocols. SMB3 is the + native file sharing mechanism for the more recent + versions of Windows (Windows 8 and Windows 2012 and + later) and Samba server and many others support SMB3 well. + In general SMB3 enables better performance, security + and features, than would be possible with CIFS (Note that + when mounting to Samba, due to the CIFS POSIX extensions, + CIFS mounts can provide slightly better POSIX compatibility + than SMB3 mounts do though). Note that SMB2/SMB3 mount + options are also slightly simpler (compared to CIFS) due + to protocol improvements. config CIFS_FSCACHE bool "Provide CIFS client caching support" -- cgit v1.2.3 From 9776de96e51565286da25c74d6f631abc50c63ef Mon Sep 17 00:00:00 2001 From: Milosz Tanski Date: Wed, 13 Aug 2014 12:58:16 -0400 Subject: FS-Cache: Timeout for releasepage() This is meant to avoid a recusive hang caused by underlying filesystem trying to grab a free page and causing a write-out. INFO: task kworker/u30:7:28375 blocked for more than 120 seconds. Not tainted 3.15.0-virtual #74 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kworker/u30:7 D 0000000000000000 0 28375 2 0x00000000 Workqueue: fscache_operation fscache_op_work_func [fscache] ffff88000b147148 0000000000000046 0000000000000000 ffff88000b1471c8 ffff8807aa031820 0000000000014040 ffff88000b147fd8 0000000000014040 ffff880f0c50c860 ffff8807aa031820 ffff88000b147158 ffff88007be59cd0 Call Trace: [] schedule+0x29/0x70 [] __fscache_wait_on_page_write+0x55/0x90 [fscache] [] ? __wake_up_sync+0x20/0x20 [] __fscache_maybe_release_page+0x65/0x1e0 [fscache] [] ceph_releasepage+0x83/0x100 [ceph] [] ? anon_vma_fork+0x130/0x130 [] try_to_release_page+0x32/0x50 [] shrink_page_list+0x7e6/0x9d0 [] ? isolate_lru_pages.isra.73+0x78/0x1e0 [] shrink_inactive_list+0x252/0x4c0 [] shrink_lruvec+0x3e1/0x670 [] shrink_zone+0x3f/0x110 [] do_try_to_free_pages+0x1d6/0x450 [] ? zone_statistics+0x99/0xc0 [] try_to_free_pages+0xc4/0x180 [] __alloc_pages_nodemask+0x6b2/0xa60 [] ? __find_get_block+0xbe/0x250 [] ? wake_up_bit+0x2e/0x40 [] alloc_pages_current+0xb3/0x180 [] __page_cache_alloc+0xb7/0xd0 [] grab_cache_page_write_begin+0x7c/0xe0 [] ? ext4_mark_inode_dirty+0x82/0x220 [] ext4_da_write_begin+0x89/0x2d0 [] generic_perform_write+0xbe/0x1d0 [] ? update_time+0x81/0xc0 [] ? mnt_clone_write+0x12/0x30 [] __generic_file_aio_write+0x1ce/0x3f0 [] generic_file_aio_write+0x5e/0xe0 [] ext4_file_write+0x9f/0x410 [] ? ext4_file_open+0x66/0x180 [] do_sync_write+0x5a/0x90 [] cachefiles_write_page+0x149/0x430 [cachefiles] [] ? radix_tree_gang_lookup_tag+0x89/0xd0 [] fscache_write_op+0x222/0x3b0 [fscache] [] fscache_op_work_func+0x3a/0x100 [fscache] [] process_one_work+0x179/0x4a0 [] worker_thread+0x11b/0x370 [] ? manage_workers.isra.21+0x2e0/0x2e0 [] kthread+0xc9/0xe0 [] ? ftrace_raw_event_xen_mmu_release_ptpage+0x70/0x90 [] ? flush_kthread_worker+0xb0/0xb0 [] ret_from_fork+0x7c/0xb0 [] ? flush_kthread_worker+0xb0/0xb0 Signed-off-by: Milosz Tanski Signed-off-by: David Howells --- fs/fscache/page.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 85332b9d19d1..781ac7b0b53e 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -43,6 +43,19 @@ void __fscache_wait_on_page_write(struct fscache_cookie *cookie, struct page *pa } EXPORT_SYMBOL(__fscache_wait_on_page_write); +/* + * wait for a page to finish being written to the cache. Put a timeout here + * since we might be called recursively via parent fs. + */ +static +bool release_page_wait_timeout(struct fscache_cookie *cookie, struct page *page) +{ + wait_queue_head_t *wq = bit_waitqueue(&cookie->flags, 0); + + return wait_event_timeout(*wq, !__fscache_check_page_write(cookie, page), + HZ); +} + /* * decide whether a page can be released, possibly by cancelling a store to it * - we're allowed to sleep if __GFP_WAIT is flagged @@ -115,7 +128,10 @@ page_busy: } fscache_stat(&fscache_n_store_vmscan_wait); - __fscache_wait_on_page_write(cookie, page); + if (!release_page_wait_timeout(cookie, page)) + _debug("fscache writeout timeout page: %p{%lx}", + page, page->index); + gfp &= ~__GFP_WAIT; goto try_again; } -- cgit v1.2.3 From 920bce20d74817bdd8bfcbc28ecb1179c9e01081 Mon Sep 17 00:00:00 2001 From: Milosz Tanski Date: Wed, 13 Aug 2014 12:58:21 -0400 Subject: FS-Cache: Reduce cookie ref count if submit fails. I've been seeing issues with disposing cookies under vma pressure. The symptom is that the refcount gets out of sync. In this case we fail to decrement the refcount if submit fails. I found this while auditing the error in and around cookie operations. Signed-off-by: Milosz Tanski Signed-off-by: David Howells --- fs/fscache/object.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/fscache/object.c b/fs/fscache/object.c index d3b4539f1651..da032daf0e0d 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -982,6 +982,7 @@ nomem: submit_op_failed: clear_bit(FSCACHE_OBJECT_IS_LIVE, &object->flags); spin_unlock(&cookie->lock); + fscache_unuse_cookie(object); kfree(op); _leave(" [EIO]"); return transit_to(KILL_OBJECT); -- cgit v1.2.3 From e9512d72e8e61c750c90efacd720abe3c4569822 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 26 Aug 2014 13:55:54 -0700 Subject: Btrfs: fix autodefrag with compression The autodefrag code skips defrag when two extents are adjacent. But one big advantage for autodefrag is cutting down on the number of small extents, even when they are adjacent. This commit changes it to defrag all small extents. Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index fce6fd0e3f50..a018ea484d39 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1019,8 +1019,10 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) return false; next = defrag_lookup_extent(inode, em->start + em->len); - if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE || - (em->block_start + em->block_len == next->block_start)) + if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE) + ret = false; + else if ((em->block_start + em->block_len == next->block_start) && + (em->block_len > 128 * 1024 && next->block_len > 128 * 1024)) ret = false; free_extent_map(next); @@ -1055,7 +1057,6 @@ static int should_defrag_range(struct inode *inode, u64 start, int thresh, } next_mergeable = defrag_check_next_extent(inode, em); - /* * we hit a real extent, if it is big or the next extent is not a * real extent, don't bother defragging it -- cgit v1.2.3 From d9f85963e3f7f5582552fdae54a2b89d6c62daf5 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 25 Aug 2014 10:43:00 +0100 Subject: Btrfs: fix corruption after write/fsync failure + fsync + log recovery While writing to a file, in inode.c:cow_file_range() (and same applies to submit_compressed_extents()), after reserving an extent for the file data, we create a new extent map for the written range and insert it into the extent map cache. After that, we create an ordered operation, but if it fails (due to a transient/temporary-ENOMEM), we return without dropping that extent map, which points to a reserved extent that is freed when we return. A subsequent incremental fsync (when the btrfs inode doesn't have the flag BTRFS_INODE_NEEDS_FULL_SYNC) considers this extent map valid and logs a file extent item based on that extent map, which points to a disk extent that doesn't contain valid data - it was freed by us earlier, at this point it might contain any random/garbage data. Therefore, if we reach an error condition when cowing a file range after we added the new extent map to the cache, drop it from the cache before returning. Some sequence of steps that lead to this: $ mkfs.btrfs -f /dev/sdd $ mount -o commit=9999 /dev/sdd /mnt $ cd /mnt $ xfs_io -f -c "pwrite -S 0x01 -b 4096 0 4096" -c "fsync" foo $ xfs_io -c "pwrite -S 0x02 -b 4096 4096 4096" $ sync $ od -t x1 foo 0000000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 * 0010000 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 02 * 0020000 $ xfs_io -c "pwrite -S 0xa1 -b 4096 0 4096" foo # Now this write + fsync fail with -ENOMEM, which was returned by # btrfs_add_ordered_extent() in inode.c:cow_file_range(). $ xfs_io -c "pwrite -S 0xff -b 4096 4096 4096" foo $ xfs_io -c "fsync" foo fsync: Cannot allocate memory # Now do a new write + fsync, which will succeed. Our previous # -ENOMEM was a transient/temporary error. $ xfs_io -c "pwrite -S 0xee -b 4096 16384 4096" foo $ xfs_io -c "fsync" foo # Our file content (in page cache) is now: $ od -t x1 foo 0000000 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 a1 * 0010000 ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff * 0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 0050000 # Now reboot the machine, and mount the fs, so that fsync log replay # takes place. # The file content is now weird, in particular the first 8Kb, which # do not match our data before nor after the sync command above. $ od -t x1 foo 0000000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 0010000 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 * 0020000 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 * 0040000 ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee ee * 0050000 # In fact these first 4Kb are a duplicate of the last 4kb block. # The last write got an extent map/file extent item that points to # the same disk extent that we got in the write+fsync that failed # with the -ENOMEM error. btrfs-debug-tree and btrfsck allow us to # verify that: $ btrfs-debug-tree /dev/sdd (...) item 6 key (257 EXTENT_DATA 0) itemoff 15819 itemsize 53 extent data disk byte 12582912 nr 8192 extent data offset 0 nr 8192 ram 8192 item 7 key (257 EXTENT_DATA 8192) itemoff 15766 itemsize 53 extent data disk byte 0 nr 0 extent data offset 0 nr 8192 ram 8192 item 8 key (257 EXTENT_DATA 16384) itemoff 15713 itemsize 53 extent data disk byte 12582912 nr 4096 extent data offset 0 nr 4096 ram 4096 $ umount /dev/sdd $ btrfsck /dev/sdd Checking filesystem on /dev/sdd UUID: db5e60e1-050d-41e6-8c7f-3d742dea5d8f checking extents extent item 12582912 has multiple extent items ref mismatch on [12582912 4096] extent item 1, found 2 Backref bytes do not match extent backref, bytenr=12582912, ref bytes=4096, backref bytes=8192 backpointer mismatch on [12582912 4096] Errors found in extent allocation tree or chunk allocation checking free space cache checking fs roots root 5 inode 257 errors 1000, some csum missing found 131074 bytes used err is 1 total csum bytes: 4 total tree bytes: 131072 total fs tree bytes: 32768 total extent tree bytes: 16384 btree space waste bytes: 123404 file data blocks allocated: 274432 referenced 274432 Btrfs v3.14.1-96-gcc7fd5a-dirty Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3d020d6d9ace..7313571e1860 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -778,8 +778,12 @@ retry: ins.offset, BTRFS_ORDERED_COMPRESSED, async_extent->compress_type); - if (ret) + if (ret) { + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); goto out_free_reserve; + } /* * clear dirty, set writeback and unlock the pages. @@ -971,14 +975,14 @@ static noinline int cow_file_range(struct inode *inode, ret = btrfs_add_ordered_extent(inode, start, ins.objectid, ram_size, cur_alloc_size, 0); if (ret) - goto out_reserve; + goto out_drop_extent_cache; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) { ret = btrfs_reloc_clone_csums(inode, start, cur_alloc_size); if (ret) - goto out_reserve; + goto out_drop_extent_cache; } if (disk_num_bytes < cur_alloc_size) @@ -1006,6 +1010,8 @@ static noinline int cow_file_range(struct inode *inode, out: return ret; +out_drop_extent_cache: + btrfs_drop_extent_cache(inode, start, start + ram_size - 1, 0); out_reserve: btrfs_free_reserved_extent(root, ins.objectid, ins.offset, 1); out_unlock: -- cgit v1.2.3 From dac5705cad20070a70bb028ca52e1f0bc157b42d Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 29 Aug 2014 20:54:26 +0100 Subject: Btrfs: fix crash while doing a ranged fsync While doing a ranged fsync, that is, one whose range doesn't cover the whole possible file range (0 to LLONG_MAX), we can crash under certain circumstances with a trace like the following: [41074.641913] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC (...) [41074.642692] CPU: 0 PID: 24580 Comm: fsx Not tainted 3.16.0-fdm-btrfs-next-45+ #1 (...) [41074.643886] RIP: 0010:[] [] btrfs_ordered_update_i_size+0x279/0x2b0 [btrfs] (...) [41074.644919] Stack: (...) [41074.644919] Call Trace: [41074.644919] [] btrfs_truncate_inode_items+0x3f1/0xa10 [btrfs] [41074.644919] [] ? btrfs_get_logged_extents+0x4f/0x80 [btrfs] [41074.644919] [] btrfs_log_inode+0x2f9/0x970 [btrfs] [41074.644919] [] ? sched_clock_local+0x25/0xa0 [41074.644919] [] ? mutex_unlock+0xe/0x10 [41074.644919] [] ? trace_hardirqs_on+0xd/0x10 [41074.644919] [] btrfs_log_inode_parent+0x1ef/0x560 [btrfs] [41074.644919] [] ? dget_parent+0x5/0x180 [41074.644919] [] btrfs_log_dentry_safe+0x51/0x80 [btrfs] [41074.644919] [] btrfs_sync_file+0x1ba/0x3e0 [btrfs] [41074.644919] [] vfs_fsync_range+0x1b/0x30 (...) The necessary conditions that lead to such crash are: * an incremental fsync (when the inode doesn't have the BTRFS_INODE_NEEDS_FULL_SYNC flag set) happened for our file and it logged a file extent item ending at offset X; * the file got the flag BTRFS_INODE_NEEDS_FULL_SYNC set in its inode, due to a file truncate operation that reduces the file to a size smaller than X; * a ranged fsync call happens (via an msync for example), with a range that doesn't cover the whole file and the end of this range, lets call it Y, is smaller than X; * btrfs_log_inode, sees the flag BTRFS_INODE_NEEDS_FULL_SYNC set and calls btrfs_truncate_inode_items() to remove all items from the log tree that are associated with our file; * btrfs_truncate_inode_items() removes all of the inode's items, and the lowest file extent item it removed is the one ending at offset X, where X > 0 and X > Y - before returning, it calls btrfs_ordered_update_i_size() with an offset parameter set to X; * btrfs_ordered_update_i_size() sees that X is greater then the current ordered size (btrfs_inode's disk_i_size) and then it assumes there can't be any ongoing ordered operation with a range covering the offset X, calling a BUG_ON() if such ordered operation exists. This assumption is made because the disk_i_size is only increased after the corresponding file extent item is added to the btree (btrfs_finish_ordered_io); * But because our fsync covers only a limited range, such an ordered extent might exist, and our fsync callback (btrfs_sync_file) doesn't wait for such ordered extent to finish when calling btrfs_wait_ordered_range(); And then by the time btrfs_ordered_update_i_size() is called, via: btrfs_sync_file() -> btrfs_log_dentry_safe() -> btrfs_log_inode_parent() -> btrfs_log_inode() -> btrfs_truncate_inode_items() -> btrfs_ordered_update_i_size() We hit the BUG_ON(), which could never happen if the fsync range covered the whole possible file range (0 to LLONG_MAX), as we would wait for all ordered extents to finish before calling btrfs_truncate_inode_items(). So just don't call btrfs_ordered_update_i_size() if we're removing the inode's items from a log tree, which isn't supposed to change the in memory inode's disk_i_size. Issue found while running xfstests/generic/127 (happens very rarely for me), more specifically via the fsx calls that use memory mapped IO (and issue msync calls). Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7313571e1860..88823f4ca451 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4248,7 +4248,8 @@ out: btrfs_abort_transaction(trans, root, ret); } error: - if (last_size != (u64)-1) + if (last_size != (u64)-1 && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) btrfs_ordered_update_i_size(inode, last_size, NULL); btrfs_free_path(path); return err; -- cgit v1.2.3 From a9cfcd63e8d206ce4235c355d857c4fbdf0f4587 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 3 Sep 2014 09:33:00 -0400 Subject: ext4: avoid trying to kfree an ERR_PTR pointer Thanks to Dan Carpenter for extending smatch to find bugs like this. (This was found using a development version of smatch.) Fixes: 36de928641ee48b2078d3fe9514242aaa2f92013 Reported-by: Dan Carpenter Cc: stable@vger.kernel.org --- fs/ext4/namei.c | 2 ++ fs/ext4/resize.c | 2 ++ 2 files changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 90a3cdca3f88..603e4ebbd0ac 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3240,6 +3240,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, &new.de, &new.inlined); if (IS_ERR(new.bh)) { retval = PTR_ERR(new.bh); + new.bh = NULL; goto end_rename; } if (new.bh) { @@ -3386,6 +3387,7 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, &new.de, &new.inlined); if (IS_ERR(new.bh)) { retval = PTR_ERR(new.bh); + new.bh = NULL; goto end_rename; } diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index bb0e80f03e2e..1e43b905ff98 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -575,6 +575,7 @@ handle_bb: bh = bclean(handle, sb, block); if (IS_ERR(bh)) { err = PTR_ERR(bh); + bh = NULL; goto out; } overhead = ext4_group_overhead_blocks(sb, group); @@ -603,6 +604,7 @@ handle_ib: bh = bclean(handle, sb, block); if (IS_ERR(bh)) { err = PTR_ERR(bh); + bh = NULL; goto out; } -- cgit v1.2.3 From 8a70ee3307908c46f952df91be72a18d5f5ad0a3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 Sep 2014 11:47:51 +0200 Subject: udf: Avoid dir link count to go negative If we are writing back inode of unlinked directory, its link count ends up being (u16)-1. Although the inode is deleted, udf_iget() can load the inode when NFS uses stale file handle and get confused. Signed-off-by: Jan Kara --- fs/udf/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 236cd48184c2..e86f9b67aa16 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1664,7 +1664,7 @@ static int udf_update_inode(struct inode *inode, int do_sync) FE_PERM_U_DELETE | FE_PERM_U_CHATTR)); fe->permissions = cpu_to_le32(udfperms); - if (S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode) && inode->i_nlink > 0) fe->fileLinkCount = cpu_to_le16(inode->i_nlink - 1); else fe->fileLinkCount = cpu_to_le16(inode->i_nlink); -- cgit v1.2.3 From bb7720a0b4a8ca3269fd86fbb45a78d2e0d3deaf Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 Sep 2014 13:32:50 +0200 Subject: udf: Fold udf_fill_inode() into __udf_read_inode() There's no good reason to separate these since udf_fill_inode() is called only from __udf_read_inode() and both do part of the same thing. Signed-off-by: Jan Kara --- fs/udf/inode.c | 22 +++++----------------- 1 file changed, 5 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index e86f9b67aa16..68cc7b144c26 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -51,7 +51,6 @@ MODULE_LICENSE("GPL"); static umode_t udf_convert_permissions(struct fileEntry *); static int udf_update_inode(struct inode *, int); -static void udf_fill_inode(struct inode *, struct buffer_head *); static int udf_sync_inode(struct inode *inode); static int udf_alloc_i_data(struct inode *inode, size_t size); static sector_t inode_getblk(struct inode *, sector_t, int *, int *); @@ -1275,8 +1274,11 @@ static void __udf_read_inode(struct inode *inode) { struct buffer_head *bh = NULL; struct fileEntry *fe; + struct extendedFileEntry *efe; uint16_t ident; struct udf_inode_info *iinfo = UDF_I(inode); + struct udf_sb_info *sbi = UDF_SB(inode->i_sb); + unsigned int link_count; /* * Set defaults, but the inode is still incomplete! @@ -1307,6 +1309,7 @@ static void __udf_read_inode(struct inode *inode) } fe = (struct fileEntry *)bh->b_data; + efe = (struct extendedFileEntry *)bh->b_data; if (fe->icbTag.strategyType == cpu_to_le16(4096)) { struct buffer_head *ibh; @@ -1346,22 +1349,6 @@ static void __udf_read_inode(struct inode *inode) make_bad_inode(inode); return; } - udf_fill_inode(inode, bh); - - brelse(bh); -} - -static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) -{ - struct fileEntry *fe; - struct extendedFileEntry *efe; - struct udf_sb_info *sbi = UDF_SB(inode->i_sb); - struct udf_inode_info *iinfo = UDF_I(inode); - unsigned int link_count; - - fe = (struct fileEntry *)bh->b_data; - efe = (struct extendedFileEntry *)bh->b_data; - if (fe->icbTag.strategyType == cpu_to_le16(4)) iinfo->i_strat4096 = 0; else /* if (fe->icbTag.strategyType == cpu_to_le16(4096)) */ @@ -1551,6 +1538,7 @@ static void udf_fill_inode(struct inode *inode, struct buffer_head *bh) } else make_bad_inode(inode); } + brelse(bh); } static int udf_alloc_i_data(struct inode *inode, size_t size) -- cgit v1.2.3 From c03aa9f6e1f938618e6db2e23afef0574efeeb65 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 Sep 2014 14:06:55 +0200 Subject: udf: Avoid infinite loop when processing indirect ICBs We did not implement any bound on number of indirect ICBs we follow when loading inode. Thus corrupted medium could cause kernel to go into an infinite loop, possibly causing a stack overflow. Fix the possible stack overflow by removing recursion from __udf_read_inode() and limit number of indirect ICBs we follow to avoid infinite loops. Signed-off-by: Jan Kara --- fs/udf/inode.c | 35 +++++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 68cc7b144c26..a6a40536ebf1 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1270,6 +1270,13 @@ update_time: return 0; } +/* + * Maximum length of linked list formed by ICB hierarchy. The chosen number is + * arbitrary - just that we hopefully don't limit any real use of rewritten + * inode on write-once media but avoid looping for too long on corrupted media. + */ +#define UDF_MAX_ICB_NESTING 1024 + static void __udf_read_inode(struct inode *inode) { struct buffer_head *bh = NULL; @@ -1279,7 +1286,9 @@ static void __udf_read_inode(struct inode *inode) struct udf_inode_info *iinfo = UDF_I(inode); struct udf_sb_info *sbi = UDF_SB(inode->i_sb); unsigned int link_count; + unsigned int indirections = 0; +reread: /* * Set defaults, but the inode is still incomplete! * Note: get_new_inode() sets the following on a new inode: @@ -1317,28 +1326,26 @@ static void __udf_read_inode(struct inode *inode) ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1, &ident); if (ident == TAG_IDENT_IE && ibh) { - struct buffer_head *nbh = NULL; struct kernel_lb_addr loc; struct indirectEntry *ie; ie = (struct indirectEntry *)ibh->b_data; loc = lelb_to_cpu(ie->indirectICB.extLocation); - if (ie->indirectICB.extLength && - (nbh = udf_read_ptagged(inode->i_sb, &loc, 0, - &ident))) { - if (ident == TAG_IDENT_FE || - ident == TAG_IDENT_EFE) { - memcpy(&iinfo->i_location, - &loc, - sizeof(struct kernel_lb_addr)); - brelse(bh); - brelse(ibh); - brelse(nbh); - __udf_read_inode(inode); + if (ie->indirectICB.extLength) { + brelse(bh); + brelse(ibh); + memcpy(&iinfo->i_location, &loc, + sizeof(struct kernel_lb_addr)); + if (++indirections > UDF_MAX_ICB_NESTING) { + udf_err(inode->i_sb, + "too many ICBs in ICB hierarchy" + " (max %d supported)\n", + UDF_MAX_ICB_NESTING); + make_bad_inode(inode); return; } - brelse(nbh); + goto reread; } } brelse(ibh); -- cgit v1.2.3 From 6d3d5e860a114ae606b1af2ba7f64cb19fbeb414 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 Sep 2014 16:15:51 +0200 Subject: udf: Make udf_read_inode() and udf_iget() return error Currently __udf_read_inode() wasn't returning anything and we found out whether we succeeded reading inode by checking whether inode is bad or not. udf_iget() returned NULL on failure and inode pointer otherwise. Make these two functions properly propagate errors up the call stack and use the return value in callers. Signed-off-by: Jan Kara --- fs/udf/inode.c | 99 ++++++++++++++++++++++++++------------------------------ fs/udf/namei.c | 22 ++++++------- fs/udf/super.c | 69 +++++++++++++++++++++++---------------- fs/udf/udfdecl.h | 1 - 4 files changed, 96 insertions(+), 95 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index a6a40536ebf1..788fc58ea78e 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1277,7 +1277,7 @@ update_time: */ #define UDF_MAX_ICB_NESTING 1024 -static void __udf_read_inode(struct inode *inode) +static int udf_read_inode(struct inode *inode) { struct buffer_head *bh = NULL; struct fileEntry *fe; @@ -1285,10 +1285,19 @@ static void __udf_read_inode(struct inode *inode) uint16_t ident; struct udf_inode_info *iinfo = UDF_I(inode); struct udf_sb_info *sbi = UDF_SB(inode->i_sb); + struct kernel_lb_addr *iloc = &iinfo->i_location; unsigned int link_count; unsigned int indirections = 0; + int ret = -EIO; reread: + if (iloc->logicalBlockNum >= + sbi->s_partmaps[iloc->partitionReferenceNum].s_partition_len) { + udf_debug("block=%d, partition=%d out of range\n", + iloc->logicalBlockNum, iloc->partitionReferenceNum); + return -EIO; + } + /* * Set defaults, but the inode is still incomplete! * Note: get_new_inode() sets the following on a new inode: @@ -1301,20 +1310,17 @@ reread: * i_nlink = 1 * i_op = NULL; */ - bh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 0, &ident); + bh = udf_read_ptagged(inode->i_sb, iloc, 0, &ident); if (!bh) { udf_err(inode->i_sb, "(ino %ld) failed !bh\n", inode->i_ino); - make_bad_inode(inode); - return; + return -EIO; } if (ident != TAG_IDENT_FE && ident != TAG_IDENT_EFE && ident != TAG_IDENT_USE) { udf_err(inode->i_sb, "(ino %ld) failed ident=%d\n", inode->i_ino, ident); - brelse(bh); - make_bad_inode(inode); - return; + goto out; } fe = (struct fileEntry *)bh->b_data; @@ -1323,8 +1329,7 @@ reread: if (fe->icbTag.strategyType == cpu_to_le16(4096)) { struct buffer_head *ibh; - ibh = udf_read_ptagged(inode->i_sb, &iinfo->i_location, 1, - &ident); + ibh = udf_read_ptagged(inode->i_sb, iloc, 1, &ident); if (ident == TAG_IDENT_IE && ibh) { struct kernel_lb_addr loc; struct indirectEntry *ie; @@ -1333,7 +1338,6 @@ reread: loc = lelb_to_cpu(ie->indirectICB.extLocation); if (ie->indirectICB.extLength) { - brelse(bh); brelse(ibh); memcpy(&iinfo->i_location, &loc, sizeof(struct kernel_lb_addr)); @@ -1342,9 +1346,9 @@ reread: "too many ICBs in ICB hierarchy" " (max %d supported)\n", UDF_MAX_ICB_NESTING); - make_bad_inode(inode); - return; + goto out; } + brelse(bh); goto reread; } } @@ -1352,9 +1356,7 @@ reread: } else if (fe->icbTag.strategyType != cpu_to_le16(4)) { udf_err(inode->i_sb, "unsupported strategy type: %d\n", le16_to_cpu(fe->icbTag.strategyType)); - brelse(bh); - make_bad_inode(inode); - return; + goto out; } if (fe->icbTag.strategyType == cpu_to_le16(4)) iinfo->i_strat4096 = 0; @@ -1372,11 +1374,10 @@ reread: if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_EFE)) { iinfo->i_efe = 1; iinfo->i_use = 0; - if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - - sizeof(struct extendedFileEntry))) { - make_bad_inode(inode); - return; - } + ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + sizeof(struct extendedFileEntry)); + if (ret) + goto out; memcpy(iinfo->i_ext.i_data, bh->b_data + sizeof(struct extendedFileEntry), inode->i_sb->s_blocksize - @@ -1384,11 +1385,10 @@ reread: } else if (fe->descTag.tagIdent == cpu_to_le16(TAG_IDENT_FE)) { iinfo->i_efe = 0; iinfo->i_use = 0; - if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - - sizeof(struct fileEntry))) { - make_bad_inode(inode); - return; - } + ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + sizeof(struct fileEntry)); + if (ret) + goto out; memcpy(iinfo->i_ext.i_data, bh->b_data + sizeof(struct fileEntry), inode->i_sb->s_blocksize - sizeof(struct fileEntry)); @@ -1398,18 +1398,18 @@ reread: iinfo->i_lenAlloc = le32_to_cpu( ((struct unallocSpaceEntry *)bh->b_data)-> lengthAllocDescs); - if (udf_alloc_i_data(inode, inode->i_sb->s_blocksize - - sizeof(struct unallocSpaceEntry))) { - make_bad_inode(inode); - return; - } + ret = udf_alloc_i_data(inode, inode->i_sb->s_blocksize - + sizeof(struct unallocSpaceEntry)); + if (ret) + goto out; memcpy(iinfo->i_ext.i_data, bh->b_data + sizeof(struct unallocSpaceEntry), inode->i_sb->s_blocksize - sizeof(struct unallocSpaceEntry)); - return; + return 0; } + ret = -EIO; read_lock(&sbi->s_cred_lock); i_uid_write(inode, le32_to_cpu(fe->uid)); if (!uid_valid(inode->i_uid) || @@ -1531,8 +1531,7 @@ reread: default: udf_err(inode->i_sb, "(ino %ld) failed unknown file type=%d\n", inode->i_ino, fe->icbTag.fileType); - make_bad_inode(inode); - return; + goto out; } if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { struct deviceSpec *dsea = @@ -1543,9 +1542,12 @@ reread: le32_to_cpu(dsea->minorDeviceIdent))); /* Developer ID ??? */ } else - make_bad_inode(inode); + goto out; } + ret = 0; +out: brelse(bh); + return ret; } static int udf_alloc_i_data(struct inode *inode, size_t size) @@ -1825,32 +1827,23 @@ struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino) { unsigned long block = udf_get_lb_pblock(sb, ino, 0); struct inode *inode = iget_locked(sb, block); + int err; if (!inode) - return NULL; - - if (inode->i_state & I_NEW) { - memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); - __udf_read_inode(inode); - unlock_new_inode(inode); - } + return ERR_PTR(-ENOMEM); - if (is_bad_inode(inode)) - goto out_iput; + if (!(inode->i_state & I_NEW)) + return inode; - if (ino->logicalBlockNum >= UDF_SB(sb)-> - s_partmaps[ino->partitionReferenceNum].s_partition_len) { - udf_debug("block=%d, partition=%d out of range\n", - ino->logicalBlockNum, ino->partitionReferenceNum); - make_bad_inode(inode); - goto out_iput; + memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); + err = udf_read_inode(inode); + if (err < 0) { + iget_failed(inode); + return ERR_PTR(err); } + unlock_new_inode(inode); return inode; - - out_iput: - iput(inode); - return NULL; } int udf_add_aext(struct inode *inode, struct extent_position *epos, diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 83a06001742b..e041fd9c6816 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -270,9 +270,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, NULL, 0), }; inode = udf_iget(dir->i_sb, lb); - if (!inode) { - return ERR_PTR(-EACCES); - } + if (IS_ERR(inode)) + return inode; } else #endif /* UDF_RECOVERY */ @@ -285,9 +284,8 @@ static struct dentry *udf_lookup(struct inode *dir, struct dentry *dentry, loc = lelb_to_cpu(cfi.icb.extLocation); inode = udf_iget(dir->i_sb, &loc); - if (!inode) { - return ERR_PTR(-EACCES); - } + if (IS_ERR(inode)) + return ERR_CAST(inode); } return d_splice_alias(inode, dentry); @@ -1222,7 +1220,7 @@ static struct dentry *udf_get_parent(struct dentry *child) struct udf_fileident_bh fibh; if (!udf_find_entry(child->d_inode, &dotdot, &fibh, &cfi)) - goto out_unlock; + return ERR_PTR(-EACCES); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); @@ -1230,12 +1228,10 @@ static struct dentry *udf_get_parent(struct dentry *child) tloc = lelb_to_cpu(cfi.icb.extLocation); inode = udf_iget(child->d_inode->i_sb, &tloc); - if (!inode) - goto out_unlock; + if (IS_ERR(inode)) + return ERR_CAST(inode); return d_obtain_alias(inode); -out_unlock: - return ERR_PTR(-EACCES); } @@ -1252,8 +1248,8 @@ static struct dentry *udf_nfs_get_inode(struct super_block *sb, u32 block, loc.partitionReferenceNum = partref; inode = udf_iget(sb, &loc); - if (inode == NULL) - return ERR_PTR(-ENOMEM); + if (IS_ERR(inode)) + return ERR_CAST(inode); if (generation && inode->i_generation != generation) { iput(inode); diff --git a/fs/udf/super.c b/fs/udf/super.c index 813da94d447b..5401fc33f5cc 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -961,12 +961,14 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb, metadata_fe = udf_iget(sb, &addr); - if (metadata_fe == NULL) + if (IS_ERR(metadata_fe)) { udf_warn(sb, "metadata inode efe not found\n"); - else if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { + return metadata_fe; + } + if (UDF_I(metadata_fe)->i_alloc_type != ICBTAG_FLAG_AD_SHORT) { udf_warn(sb, "metadata inode efe does not have short allocation descriptors!\n"); iput(metadata_fe); - metadata_fe = NULL; + return ERR_PTR(-EIO); } return metadata_fe; @@ -978,6 +980,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) struct udf_part_map *map; struct udf_meta_data *mdata; struct kernel_lb_addr addr; + struct inode *fe; map = &sbi->s_partmaps[partition]; mdata = &map->s_type_specific.s_metadata; @@ -986,22 +989,24 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) udf_debug("Metadata file location: block = %d part = %d\n", mdata->s_meta_file_loc, map->s_partition_num); - mdata->s_metadata_fe = udf_find_metadata_inode_efe(sb, - mdata->s_meta_file_loc, map->s_partition_num); - - if (mdata->s_metadata_fe == NULL) { + fe = udf_find_metadata_inode_efe(sb, mdata->s_meta_file_loc, + map->s_partition_num); + if (IS_ERR(fe)) { /* mirror file entry */ udf_debug("Mirror metadata file location: block = %d part = %d\n", mdata->s_mirror_file_loc, map->s_partition_num); - mdata->s_mirror_fe = udf_find_metadata_inode_efe(sb, - mdata->s_mirror_file_loc, map->s_partition_num); + fe = udf_find_metadata_inode_efe(sb, mdata->s_mirror_file_loc, + map->s_partition_num); - if (mdata->s_mirror_fe == NULL) { + if (IS_ERR(fe)) { udf_err(sb, "Both metadata and mirror metadata inode efe can not found\n"); - return -EIO; + return PTR_ERR(fe); } - } + mdata->s_mirror_fe = fe; + } else + mdata->s_metadata_fe = fe; + /* * bitmap file entry @@ -1015,15 +1020,16 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) udf_debug("Bitmap file location: block = %d part = %d\n", addr.logicalBlockNum, addr.partitionReferenceNum); - mdata->s_bitmap_fe = udf_iget(sb, &addr); - if (mdata->s_bitmap_fe == NULL) { + fe = udf_iget(sb, &addr); + if (IS_ERR(fe)) { if (sb->s_flags & MS_RDONLY) udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); else { udf_err(sb, "bitmap inode efe not found and attempted read-write mount\n"); - return -EIO; + return PTR_ERR(fe); } - } + } else + mdata->s_bitmap_fe = fe; } udf_debug("udf_load_metadata_files Ok\n"); @@ -1111,13 +1117,15 @@ static int udf_fill_partdesc_info(struct super_block *sb, phd->unallocSpaceTable.extPosition), .partitionReferenceNum = p_index, }; + struct inode *inode; - map->s_uspace.s_table = udf_iget(sb, &loc); - if (!map->s_uspace.s_table) { + inode = udf_iget(sb, &loc); + if (IS_ERR(inode)) { udf_debug("cannot load unallocSpaceTable (part %d)\n", p_index); - return -EIO; + return PTR_ERR(inode); } + map->s_uspace.s_table = inode; map->s_partition_flags |= UDF_PART_FLAG_UNALLOC_TABLE; udf_debug("unallocSpaceTable (part %d) @ %ld\n", p_index, map->s_uspace.s_table->i_ino); @@ -1144,14 +1152,15 @@ static int udf_fill_partdesc_info(struct super_block *sb, phd->freedSpaceTable.extPosition), .partitionReferenceNum = p_index, }; + struct inode *inode; - map->s_fspace.s_table = udf_iget(sb, &loc); - if (!map->s_fspace.s_table) { + inode = udf_iget(sb, &loc); + if (IS_ERR(inode)) { udf_debug("cannot load freedSpaceTable (part %d)\n", p_index); - return -EIO; + return PTR_ERR(inode); } - + map->s_fspace.s_table = inode; map->s_partition_flags |= UDF_PART_FLAG_FREED_TABLE; udf_debug("freedSpaceTable (part %d) @ %ld\n", p_index, map->s_fspace.s_table->i_ino); @@ -1178,6 +1187,7 @@ static void udf_find_vat_block(struct super_block *sb, int p_index, struct udf_part_map *map = &sbi->s_partmaps[p_index]; sector_t vat_block; struct kernel_lb_addr ino; + struct inode *inode; /* * VAT file entry is in the last recorded block. Some broken disks have @@ -1186,10 +1196,13 @@ static void udf_find_vat_block(struct super_block *sb, int p_index, ino.partitionReferenceNum = type1_index; for (vat_block = start_block; vat_block >= map->s_partition_root && - vat_block >= start_block - 3 && - !sbi->s_vat_inode; vat_block--) { + vat_block >= start_block - 3; vat_block--) { ino.logicalBlockNum = vat_block - map->s_partition_root; - sbi->s_vat_inode = udf_iget(sb, &ino); + inode = udf_iget(sb, &ino); + if (!IS_ERR(inode)) { + sbi->s_vat_inode = inode; + break; + } } } @@ -2205,10 +2218,10 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) /* assign inodes by physical block number */ /* perhaps it's not extensible enough, but for now ... */ inode = udf_iget(sb, &rootdir); - if (!inode) { + if (IS_ERR(inode)) { udf_err(sb, "Error in udf_iget, block=%d, partition=%d\n", rootdir.logicalBlockNum, rootdir.partitionReferenceNum); - ret = -EIO; + ret = PTR_ERR(inode); goto error_out; } diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index be7dabbbcb49..41a8115c9345 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -143,7 +143,6 @@ extern int udf_expand_file_adinicb(struct inode *); extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); extern struct buffer_head *udf_bread(struct inode *, int, int, int *); extern int udf_setsize(struct inode *, loff_t); -extern void udf_read_inode(struct inode *); extern void udf_evict_inode(struct inode *); extern int udf_write_inode(struct inode *, struct writeback_control *wbc); extern long udf_block_map(struct inode *, sector_t); -- cgit v1.2.3 From 4071b913622316970d0e1919f7d82b4403fec5f2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 Sep 2014 16:19:47 +0200 Subject: udf: Properly detect stale inodes NFS can easily ask for inodes that are already deleted. Currently UDF happily returns such inodes which is a bug. Return -ESTALE if udf_read_inode() is asked to read deleted inode. Signed-off-by: Jan Kara --- fs/udf/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 788fc58ea78e..3a44d9187aad 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1435,8 +1435,10 @@ reread: read_unlock(&sbi->s_cred_lock); link_count = le16_to_cpu(fe->fileLinkCount); - if (!link_count) - link_count = 1; + if (!link_count) { + ret = -ESTALE; + goto out; + } set_nlink(inode, link_count); inode->i_size = le64_to_cpu(fe->informationLength); -- cgit v1.2.3 From 470cca56c366428d4d5785a0a5a291619c332c7f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 4 Sep 2014 16:26:19 +0200 Subject: udf: Set i_generation field Currently UDF doesn't initialize i_generation in any way and thus NFS can easily get reallocated inodes from stale file handles. Luckily UDF already has a unique object identifier associated with each inode - i_unique. Use that for initialization of i_generation. Signed-off-by: Jan Kara --- fs/udf/ialloc.c | 1 + fs/udf/inode.c | 1 + 2 files changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 6eaf5edf1ea1..647370d70175 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -95,6 +95,7 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) lvidiu = udf_sb_lvidiu(sb); if (lvidiu) { iinfo->i_unique = lvid_get_unique_id(sb); + inode->i_generation = iinfo->i_unique; mutex_lock(&sbi->s_alloc_mutex); if (S_ISDIR(mode)) le32_add_cpu(&lvidiu->numDirs, 1); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 3a44d9187aad..08598843288f 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1484,6 +1484,7 @@ reread: iinfo->i_lenAlloc = le32_to_cpu(efe->lengthAllocDescs); iinfo->i_checkpoint = le32_to_cpu(efe->checkpoint); } + inode->i_generation = iinfo->i_unique; switch (fe->icbTag.fileType) { case ICBTAG_FILE_TYPE_DIRECTORY: -- cgit v1.2.3 From d2be51cb34dc501791f3b8c01a99a3f2064bd8d1 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Sep 2014 09:34:14 -0400 Subject: udf: merge the pieces inserting a new non-directory object into directory boilerplate code in udf_{create,mknod,symlink} taken to new helper symlink case converted to unique id calculated by udf_new_inode() - no point finding a new one. Signed-off-by: Al Viro Signed-off-by: Jan Kara --- fs/udf/namei.c | 98 +++++++++++++++++----------------------------------------- 1 file changed, 29 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/udf/namei.c b/fs/udf/namei.c index e041fd9c6816..abec86466735 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -548,31 +548,16 @@ static int udf_delete_entry(struct inode *inode, struct fileIdentDesc *fi, return udf_write_fi(inode, cfi, fi, fibh, NULL, NULL); } -static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, - bool excl) +static int udf_add_nondir(struct dentry *dentry, struct inode *inode) { + struct udf_inode_info *iinfo = UDF_I(inode); + struct inode *dir = dentry->d_parent->d_inode; struct udf_fileident_bh fibh; - struct inode *inode; struct fileIdentDesc cfi, *fi; int err; - struct udf_inode_info *iinfo; - - inode = udf_new_inode(dir, mode, &err); - if (!inode) { - return err; - } - - iinfo = UDF_I(inode); - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - inode->i_data.a_ops = &udf_adinicb_aops; - else - inode->i_data.a_ops = &udf_aops; - inode->i_op = &udf_file_inode_operations; - inode->i_fop = &udf_file_operations; - mark_inode_dirty(inode); fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (!fi) { + if (unlikely(!fi)) { inode_dec_link_count(inode); iput(inode); return err; @@ -592,6 +577,28 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, return 0; } +static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + struct inode *inode; + int err; + + inode = udf_new_inode(dir, mode, &err); + if (!inode) { + return err; + } + + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + inode->i_data.a_ops = &udf_adinicb_aops; + else + inode->i_data.a_ops = &udf_aops; + inode->i_op = &udf_file_inode_operations; + inode->i_fop = &udf_file_operations; + mark_inode_dirty(inode); + + return udf_add_nondir(dentry, inode); +} + static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { struct inode *inode; @@ -619,10 +626,7 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; - struct udf_fileident_bh fibh; - struct fileIdentDesc cfi, *fi; int err; - struct udf_inode_info *iinfo; if (!old_valid_dev(rdev)) return -EINVAL; @@ -630,33 +634,10 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, err = -EIO; inode = udf_new_inode(dir, mode, &err); if (!inode) - goto out; - - iinfo = UDF_I(inode); - init_special_inode(inode, mode, rdev); - fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (!fi) { - inode_dec_link_count(inode); - iput(inode); return err; - } - cfi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); - *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = - cpu_to_le32(iinfo->i_unique & 0x00000000FFFFFFFFUL); - udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); - if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - mark_inode_dirty(dir); - mark_inode_dirty(inode); - - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - d_instantiate(dentry, inode); - err = 0; -out: - return err; + init_special_inode(inode, mode, rdev); + return udf_add_nondir(dentry, inode); } static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) @@ -877,11 +858,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, struct inode *inode; struct pathComponent *pc; const char *compstart; - struct udf_fileident_bh fibh; struct extent_position epos = {}; int eoffset, elen = 0; - struct fileIdentDesc *fi; - struct fileIdentDesc cfi; uint8_t *ea; int err; int block; @@ -1010,31 +988,13 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, mark_inode_dirty(inode); up_write(&iinfo->i_data_sem); - fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); - if (!fi) - goto out_fail; - cfi.icb.extLength = cpu_to_le32(sb->s_blocksize); - cfi.icb.extLocation = cpu_to_lelb(iinfo->i_location); - if (UDF_SB(inode->i_sb)->s_lvid_bh) { - *(__le32 *)((struct allocDescImpUse *)cfi.icb.impUse)->impUse = - cpu_to_le32(lvid_get_unique_id(sb)); - } - udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); - if (UDF_I(dir)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) - mark_inode_dirty(dir); - if (fibh.sbh != fibh.ebh) - brelse(fibh.ebh); - brelse(fibh.sbh); - d_instantiate(dentry, inode); - err = 0; - + err = udf_add_nondir(dentry, inode); out: kfree(name); return err; out_no_entry: up_write(&iinfo->i_data_sem); -out_fail: inode_dec_link_count(inode); iput(inode); goto out; -- cgit v1.2.3 From b231509616feb911c2a7a8814d58c0014ef5b17f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Sep 2014 09:38:11 -0400 Subject: udf: fix the udf_iget() vs. udf_new_inode() races Currently udf_iget() (triggered by NFS) can race with udf_new_inode() leading to two inode structures with the same inode number: nfsd: iget_locked() creates inode nfsd: try to read from disk, block on that. udf_new_inode(): allocate inode with that inumber udf_new_inode(): insert it into icache, set it up and dirty udf_write_inode(): write inode into buffer cache nfsd: get CPU again, look into buffer cache, see nice and sane on-disk inode, set the in-core inode from it Fix the problem by putting inode into icache in locked state (I_NEW set) and unlocking it only after it's fully set up. Signed-off-by: Al Viro Signed-off-by: Jan Kara --- fs/udf/ialloc.c | 7 ++++++- fs/udf/namei.c | 7 +++++++ 2 files changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 647370d70175..598f33bdcd26 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -124,7 +124,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) iinfo->i_alloc_type = ICBTAG_FLAG_AD_LONG; inode->i_mtime = inode->i_atime = inode->i_ctime = iinfo->i_crtime = current_fs_time(inode->i_sb); - insert_inode_hash(inode); + if (unlikely(insert_inode_locked(inode) < 0)) { + make_bad_inode(inode); + iput(inode); + *err = -EIO; + return NULL; + } mark_inode_dirty(inode); *err = 0; diff --git a/fs/udf/namei.c b/fs/udf/namei.c index abec86466735..d106fdd1bef7 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -559,6 +559,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) fi = udf_add_entry(dir, dentry, &fibh, &cfi, &err); if (unlikely(!fi)) { inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); return err; } @@ -572,6 +573,7 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); brelse(fibh.sbh); + unlock_new_inode(inode); d_instantiate(dentry, inode); return 0; @@ -619,6 +621,7 @@ static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) mark_inode_dirty(inode); d_tmpfile(dentry, inode); + unlock_new_inode(inode); return 0; } @@ -660,6 +663,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) fi = udf_add_entry(inode, NULL, &fibh, &cfi, &err); if (!fi) { inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); goto out; } @@ -678,6 +682,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (!fi) { clear_nlink(inode); mark_inode_dirty(inode); + unlock_new_inode(inode); iput(inode); goto out; } @@ -689,6 +694,7 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) udf_write_fi(dir, &cfi, fi, &fibh, NULL, NULL); inc_nlink(dir); mark_inode_dirty(dir); + unlock_new_inode(inode); d_instantiate(dentry, inode); if (fibh.sbh != fibh.ebh) brelse(fibh.ebh); @@ -996,6 +1002,7 @@ out: out_no_entry: up_write(&iinfo->i_data_sem); inode_dec_link_count(inode); + unlock_new_inode(inode); iput(inode); goto out; } -- cgit v1.2.3 From 0b93a92be4cb48f22b78c95dea84089a1e72f860 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Sep 2014 09:47:41 -0400 Subject: udf: saner calling conventions for udf_new_inode() Signed-off-by: Al Viro Signed-off-by: Jan Kara --- fs/udf/ialloc.c | 24 ++++++++++-------------- fs/udf/namei.c | 44 ++++++++++++++++---------------------------- fs/udf/udfdecl.h | 2 +- 3 files changed, 27 insertions(+), 43 deletions(-) (limited to 'fs') diff --git a/fs/udf/ialloc.c b/fs/udf/ialloc.c index 598f33bdcd26..e77db621ec89 100644 --- a/fs/udf/ialloc.c +++ b/fs/udf/ialloc.c @@ -45,7 +45,7 @@ void udf_free_inode(struct inode *inode) udf_free_blocks(sb, NULL, &UDF_I(inode)->i_location, 0, 1); } -struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) +struct inode *udf_new_inode(struct inode *dir, umode_t mode) { struct super_block *sb = dir->i_sb; struct udf_sb_info *sbi = UDF_SB(sb); @@ -55,14 +55,12 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) struct udf_inode_info *iinfo; struct udf_inode_info *dinfo = UDF_I(dir); struct logicalVolIntegrityDescImpUse *lvidiu; + int err; inode = new_inode(sb); - if (!inode) { - *err = -ENOMEM; - return NULL; - } - *err = -ENOSPC; + if (!inode) + return ERR_PTR(-ENOMEM); iinfo = UDF_I(inode); if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_EXTENDED_FE)) { @@ -80,16 +78,16 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) } if (!iinfo->i_ext.i_data) { iput(inode); - *err = -ENOMEM; - return NULL; + return ERR_PTR(-ENOMEM); } + err = -ENOSPC; block = udf_new_block(dir->i_sb, NULL, dinfo->i_location.partitionReferenceNum, - start, err); - if (*err) { + start, &err); + if (err) { iput(inode); - return NULL; + return ERR_PTR(err); } lvidiu = udf_sb_lvidiu(sb); @@ -127,11 +125,9 @@ struct inode *udf_new_inode(struct inode *dir, umode_t mode, int *err) if (unlikely(insert_inode_locked(inode) < 0)) { make_bad_inode(inode); iput(inode); - *err = -EIO; - return NULL; + return ERR_PTR(-EIO); } mark_inode_dirty(inode); - *err = 0; return inode; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index d106fdd1bef7..c12e260fd6c4 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -582,13 +582,10 @@ static int udf_add_nondir(struct dentry *dentry, struct inode *inode) static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct inode *inode; - int err; + struct inode *inode = udf_new_inode(dir, mode); - inode = udf_new_inode(dir, mode, &err); - if (!inode) { - return err; - } + if (IS_ERR(inode)) + return PTR_ERR(inode); if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) inode->i_data.a_ops = &udf_adinicb_aops; @@ -603,23 +600,18 @@ static int udf_create(struct inode *dir, struct dentry *dentry, umode_t mode, static int udf_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { - struct inode *inode; - struct udf_inode_info *iinfo; - int err; + struct inode *inode = udf_new_inode(dir, mode); - inode = udf_new_inode(dir, mode, &err); - if (!inode) - return err; + if (IS_ERR(inode)) + return PTR_ERR(inode); - iinfo = UDF_I(inode); - if (iinfo->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) + if (UDF_I(inode)->i_alloc_type == ICBTAG_FLAG_AD_IN_ICB) inode->i_data.a_ops = &udf_adinicb_aops; else inode->i_data.a_ops = &udf_aops; inode->i_op = &udf_file_inode_operations; inode->i_fop = &udf_file_operations; mark_inode_dirty(inode); - d_tmpfile(dentry, inode); unlock_new_inode(inode); return 0; @@ -629,15 +621,13 @@ static int udf_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { struct inode *inode; - int err; if (!old_valid_dev(rdev)) return -EINVAL; - err = -EIO; - inode = udf_new_inode(dir, mode, &err); - if (!inode) - return err; + inode = udf_new_inode(dir, mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); init_special_inode(inode, mode, rdev); return udf_add_nondir(dentry, inode); @@ -652,10 +642,9 @@ static int udf_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct udf_inode_info *dinfo = UDF_I(dir); struct udf_inode_info *iinfo; - err = -EIO; - inode = udf_new_inode(dir, S_IFDIR | mode, &err); - if (!inode) - goto out; + inode = udf_new_inode(dir, S_IFDIR | mode); + if (IS_ERR(inode)) + return PTR_ERR(inode); iinfo = UDF_I(inode); inode->i_op = &udf_dir_inode_operations; @@ -861,7 +850,7 @@ out: static int udf_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct inode *inode; + struct inode *inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO); struct pathComponent *pc; const char *compstart; struct extent_position epos = {}; @@ -874,9 +863,8 @@ static int udf_symlink(struct inode *dir, struct dentry *dentry, struct udf_inode_info *iinfo; struct super_block *sb = dir->i_sb; - inode = udf_new_inode(dir, S_IFLNK | S_IRWXUGO, &err); - if (!inode) - goto out; + if (IS_ERR(inode)) + return PTR_ERR(inode); iinfo = UDF_I(inode); down_write(&iinfo->i_data_sem); diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 41a8115c9345..742557be9936 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -208,7 +208,7 @@ extern int udf_CS0toUTF8(struct ustr *, const struct ustr *); /* ialloc.c */ extern void udf_free_inode(struct inode *); -extern struct inode *udf_new_inode(struct inode *, umode_t, int *); +extern struct inode *udf_new_inode(struct inode *, umode_t); /* truncate.c */ extern void udf_truncate_tail_extent(struct inode *); -- cgit v1.2.3 From aee3776441461c14ba6d8ed9e2149933e65abb6e Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 20 Aug 2014 14:49:50 -0400 Subject: nfsd4: fix rd_dircount enforcement Commit 3b299709091b "nfsd4: enforce rd_dircount" totally misunderstood rd_dircount; it refers to total non-attribute bytes returned, not number of directory entries returned. Bring the code into agreement with RFC 3530 section 14.2.24. Cc: stable@vger.kernel.org Fixes: 3b299709091b "nfsd4: enforce rd_dircount" Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f9821ce6658a..e94457c33ad6 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2657,6 +2657,7 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, struct xdr_stream *xdr = cd->xdr; int start_offset = xdr->buf->len; int cookie_offset; + u32 name_and_cookie; int entry_bytes; __be32 nfserr = nfserr_toosmall; __be64 wire_offset; @@ -2718,7 +2719,14 @@ nfsd4_encode_dirent(void *ccdv, const char *name, int namlen, cd->rd_maxcount -= entry_bytes; if (!cd->rd_dircount) goto fail; - cd->rd_dircount--; + /* + * RFC 3530 14.2.24 describes rd_dircount as only a "hint", so + * let's always let through the first entry, at least: + */ + name_and_cookie = 4 * XDR_QUADLEN(namlen) + 8; + if (name_and_cookie > cd->rd_dircount && cd->cookie_offset) + goto fail; + cd->rd_dircount -= min(cd->rd_dircount, name_and_cookie); cd->cookie_offset = cookie_offset; skip_entry: cd->common.err = nfs_ok; @@ -3321,6 +3329,10 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4 } maxcount = min_t(int, maxcount-16, bytes_left); + /* RFC 3530 14.2.24 allows us to ignore dircount when it's 0: */ + if (!readdir->rd_dircount) + readdir->rd_dircount = INT_MAX; + readdir->xdr = xdr; readdir->rd_maxcount = maxcount; readdir->common.err = 0; -- cgit v1.2.3 From 7c17705e77b12b20fb8afb7c1b15dcdb126c0c12 Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Fri, 29 Aug 2014 16:25:50 -0400 Subject: lockd: fix rpcbind crash on lockd startup failure Nikita Yuschenko reported that booting a kernel with init=/bin/sh and then nfs mounting without portmap or rpcbind running using a busybox mount resulted in: # mount -t nfs 10.30.130.21:/opt /mnt svc: failed to register lockdv1 RPC service (errno 111). lockd_up: makesock failed, error=-111 Unable to handle kernel paging request for data at address 0x00000030 Faulting instruction address: 0xc055e65c Oops: Kernel access of bad area, sig: 11 [#1] MPC85xx CDS Modules linked in: CPU: 0 PID: 1338 Comm: mount Not tainted 3.10.44.cge #117 task: cf29cea0 ti: cf35c000 task.ti: cf35c000 NIP: c055e65c LR: c0566490 CTR: c055e648 REGS: cf35dad0 TRAP: 0300 Not tainted (3.10.44.cge) MSR: 00029000 CR: 22442488 XER: 20000000 DEAR: 00000030, ESR: 00000000 GPR00: c05606f4 cf35db80 cf29cea0 cf0ded80 cf0dedb8 00000001 1dec3086 00000000 GPR08: 00000000 c07b1640 00000007 1dec3086 22442482 100b9758 00000000 10090ae8 GPR16: 00000000 000186a5 00000000 00000000 100c3018 bfa46edc 100b0000 bfa46ef0 GPR24: cf386ae0 c07834f0 00000000 c0565f88 00000001 cf0dedb8 00000000 cf0ded80 NIP [c055e65c] call_start+0x14/0x34 LR [c0566490] __rpc_execute+0x70/0x250 Call Trace: [cf35db80] [00000080] 0x80 (unreliable) [cf35dbb0] [c05606f4] rpc_run_task+0x9c/0xc4 [cf35dbc0] [c0560840] rpc_call_sync+0x50/0xb8 [cf35dbf0] [c056ee90] rpcb_register_call+0x54/0x84 [cf35dc10] [c056f24c] rpcb_register+0xf8/0x10c [cf35dc70] [c0569e18] svc_unregister.isra.23+0x100/0x108 [cf35dc90] [c0569e38] svc_rpcb_cleanup+0x18/0x30 [cf35dca0] [c0198c5c] lockd_up+0x1dc/0x2e0 [cf35dcd0] [c0195348] nlmclnt_init+0x2c/0xc8 [cf35dcf0] [c015bb5c] nfs_start_lockd+0x98/0xec [cf35dd20] [c015ce6c] nfs_create_server+0x1e8/0x3f4 [cf35dd90] [c0171590] nfs3_create_server+0x10/0x44 [cf35dda0] [c016528c] nfs_try_mount+0x158/0x1e4 [cf35de20] [c01670d0] nfs_fs_mount+0x434/0x8c8 [cf35de70] [c00cd3bc] mount_fs+0x20/0xbc [cf35de90] [c00e4f88] vfs_kern_mount+0x50/0x104 [cf35dec0] [c00e6e0c] do_mount+0x1d0/0x8e0 [cf35df10] [c00e75ac] SyS_mount+0x90/0xd0 [cf35df40] [c000ccf4] ret_from_syscall+0x0/0x3c The addition of svc_shutdown_net() resulted in two calls to svc_rpcb_cleanup(); the second is no longer necessary and crashes when it calls rpcb_register_call with clnt=NULL. Reported-by: Nikita Yushchenko Fixes: 679b033df484 "lockd: ensure we tear down any live sockets when socket creation fails during lockd_up" Cc: stable@vger.kernel.org Acked-by: Jeff Layton Signed-off-by: J. Bruce Fields --- fs/lockd/svc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 8f27c93f8d2e..ec9e082f9ecd 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -253,13 +253,11 @@ static int lockd_up_net(struct svc_serv *serv, struct net *net) error = make_socks(serv, net); if (error < 0) - goto err_socks; + goto err_bind; set_grace_period(net); dprintk("lockd_up_net: per-net data created; net=%p\n", net); return 0; -err_socks: - svc_rpcb_cleanup(serv, net); err_bind: ln->nlmsvc_users--; return error; -- cgit v1.2.3 From c47ca32d3aadb234f73389a34c97574085bc9eda Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 4 Sep 2014 14:09:15 +0300 Subject: Btrfs: kfree()ing ERR_PTRs The "inherit" in btrfs_ioctl_snap_create_v2() and "vol_args" in btrfs_ioctl_rm_dev() are ERR_PTRs so we can't call kfree() on them. These kind of bugs are "One Err Bugs" where there is just one error label that does everything. I could set the "inherit = NULL" and keep the single out label but it ends up being more complicated that way. It makes the code simpler to re-order the unwind so it's in the mirror order of the allocation and introduce some new error labels. Signed-off-by: Dan Carpenter Signed-off-by: Chris Mason --- fs/btrfs/ioctl.c | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a018ea484d39..8a8e29878c34 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1703,7 +1703,7 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, ~(BTRFS_SUBVOL_CREATE_ASYNC | BTRFS_SUBVOL_RDONLY | BTRFS_SUBVOL_QGROUP_INHERIT)) { ret = -EOPNOTSUPP; - goto out; + goto free_args; } if (vol_args->flags & BTRFS_SUBVOL_CREATE_ASYNC) @@ -1713,27 +1713,31 @@ static noinline int btrfs_ioctl_snap_create_v2(struct file *file, if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) { if (vol_args->size > PAGE_CACHE_SIZE) { ret = -EINVAL; - goto out; + goto free_args; } inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size); if (IS_ERR(inherit)) { ret = PTR_ERR(inherit); - goto out; + goto free_args; } } ret = btrfs_ioctl_snap_create_transid(file, vol_args->name, vol_args->fd, subvol, ptr, readonly, inherit); + if (ret) + goto free_inherit; - if (ret == 0 && ptr && - copy_to_user(arg + - offsetof(struct btrfs_ioctl_vol_args_v2, - transid), ptr, sizeof(*ptr))) + if (ptr && copy_to_user(arg + + offsetof(struct btrfs_ioctl_vol_args_v2, + transid), + ptr, sizeof(*ptr))) ret = -EFAULT; -out: - kfree(vol_args); + +free_inherit: kfree(inherit); +free_args: + kfree(vol_args); return ret; } @@ -2653,7 +2657,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) vol_args = memdup_user(arg, sizeof(*vol_args)); if (IS_ERR(vol_args)) { ret = PTR_ERR(vol_args); - goto out; + goto err_drop; } vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; @@ -2671,6 +2675,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) out: kfree(vol_args); +err_drop: mnt_drop_write_file(file); return ret; } -- cgit v1.2.3 From 49dae1bc1c665817e434d01eefaa11967f618243 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Sat, 6 Sep 2014 22:34:39 +0100 Subject: Btrfs: fix fsync data loss after a ranged fsync While we're doing a full fsync (when the inode has the flag BTRFS_INODE_NEEDS_FULL_SYNC set) that is ranged too (covers only a portion of the file), we might have ordered operations that are started before or while we're logging the inode and that fall outside the fsync range. Therefore when a full ranged fsync finishes don't remove every extent map from the list of modified extent maps - as for some of them, that fall outside our fsync range, their respective ordered operation hasn't finished yet, meaning the corresponding file extent item wasn't inserted into the fs/subvol tree yet and therefore we didn't log it, and we must let the next fast fsync (one that checks only the modified list) see this extent map and log a matching file extent item to the log btree and wait for its ordered operation to finish (if it's still ongoing). A test case for xfstests follows. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/file.c | 2 +- fs/btrfs/tree-log.c | 77 ++++++++++++++++++++++++++++++++++++++++++----------- fs/btrfs/tree-log.h | 2 ++ 3 files changed, 64 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 36861b7a6757..ff1cc0399b9a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1966,7 +1966,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) btrfs_init_log_ctx(&ctx); - ret = btrfs_log_dentry_safe(trans, root, dentry, &ctx); + ret = btrfs_log_dentry_safe(trans, root, dentry, start, end, &ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7e0e6e3029dd..d296efe2d3e7 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -94,8 +94,10 @@ #define LOG_WALK_REPLAY_ALL 3 static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only); + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -3858,8 +3860,10 @@ process: * This handles both files and directories. */ static int btrfs_log_inode(struct btrfs_trans_handle *trans, - struct btrfs_root *root, struct inode *inode, - int inode_only) + struct btrfs_root *root, struct inode *inode, + int inode_only, + const loff_t start, + const loff_t end) { struct btrfs_path *path; struct btrfs_path *dst_path; @@ -3876,6 +3880,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, int ins_nr; bool fast_search = false; u64 ino = btrfs_ino(inode); + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; path = btrfs_alloc_path(); if (!path) @@ -4049,13 +4054,35 @@ log_extents: goto out_unlock; } } else if (inode_only == LOG_INODE_ALL) { - struct extent_map_tree *tree = &BTRFS_I(inode)->extent_tree; struct extent_map *em, *n; - write_lock(&tree->lock); - list_for_each_entry_safe(em, n, &tree->modified_extents, list) - list_del_init(&em->list); - write_unlock(&tree->lock); + write_lock(&em_tree->lock); + /* + * We can't just remove every em if we're called for a ranged + * fsync - that is, one that doesn't cover the whole possible + * file range (0 to LLONG_MAX). This is because we can have + * em's that fall outside the range we're logging and therefore + * their ordered operations haven't completed yet + * (btrfs_finish_ordered_io() not invoked yet). This means we + * didn't get their respective file extent item in the fs/subvol + * tree yet, and need to let the next fast fsync (one which + * consults the list of modified extent maps) find the em so + * that it logs a matching file extent item and waits for the + * respective ordered operation to complete (if it's still + * running). + * + * Removing every em outside the range we're logging would make + * the next fast fsync not log their matching file extent items, + * therefore making us lose data after a log replay. + */ + list_for_each_entry_safe(em, n, &em_tree->modified_extents, + list) { + const u64 mod_end = em->mod_start + em->mod_len - 1; + + if (em->mod_start >= start && mod_end <= end) + list_del_init(&em->list); + } + write_unlock(&em_tree->lock); } if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { @@ -4065,8 +4092,19 @@ log_extents: goto out_unlock; } } - BTRFS_I(inode)->logged_trans = trans->transid; - BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; + + write_lock(&em_tree->lock); + /* + * If we're doing a ranged fsync and there are still modified extents + * in the list, we must run on the next fsync call as it might cover + * those extents (a full fsync or an fsync for other range). + */ + if (list_empty(&em_tree->modified_extents)) { + BTRFS_I(inode)->logged_trans = trans->transid; + BTRFS_I(inode)->last_log_commit = + BTRFS_I(inode)->last_sub_trans; + } + write_unlock(&em_tree->lock); out_unlock: if (unlikely(err)) btrfs_put_logged_extents(&logged_list); @@ -4161,7 +4199,10 @@ out: */ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, - struct dentry *parent, int exists_only, + struct dentry *parent, + const loff_t start, + const loff_t end, + int exists_only, struct btrfs_log_ctx *ctx) { int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL; @@ -4207,7 +4248,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only); + ret = btrfs_log_inode(trans, root, inode, inode_only, start, end); if (ret) goto end_trans; @@ -4235,7 +4276,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { - ret = btrfs_log_inode(trans, root, inode, inode_only); + ret = btrfs_log_inode(trans, root, inode, inode_only, + 0, LLONG_MAX); if (ret) goto end_trans; } @@ -4269,13 +4311,15 @@ end_no_trans: */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry, + const loff_t start, + const loff_t end, struct btrfs_log_ctx *ctx) { struct dentry *parent = dget_parent(dentry); int ret; ret = btrfs_log_inode_parent(trans, root, dentry->d_inode, parent, - 0, ctx); + start, end, 0, ctx); dput(parent); return ret; @@ -4512,6 +4556,7 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, root->fs_info->last_trans_committed)) return 0; - return btrfs_log_inode_parent(trans, root, inode, parent, 1, NULL); + return btrfs_log_inode_parent(trans, root, inode, parent, 0, + LLONG_MAX, 1, NULL); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 7f5b41bd5373..e2e798ae7cd7 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -59,6 +59,8 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct dentry *dentry, + const loff_t start, + const loff_t end, struct btrfs_log_ctx *ctx); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, -- cgit v1.2.3 From b0d5d10f41a0f1cd839408dd94427f2db3553bca Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 8 Sep 2014 13:08:51 -0700 Subject: Btrfs: use insert_inode_locked4 for inode creation Btrfs was inserting inodes into the hash table before we had fully set the inode up on disk. This leaves us open to rare races that allow two different inodes in memory for the same [root, inode] pair. This patch fixes things by using insert_inode_locked4 to insert an I_NEW inode and unlock_new_inode when we're ready for the rest of the kernel to use the inode. It also makes sure to init the operations pointers on the inode before going into the error handling paths. Signed-off-by: Chris Mason Reported-by: Al Viro --- fs/btrfs/inode.c | 176 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 109 insertions(+), 67 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 88823f4ca451..214b936bdd3d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5634,6 +5634,17 @@ int btrfs_set_inode_index(struct inode *dir, u64 *index) return ret; } +static int btrfs_insert_inode_locked(struct inode *inode) +{ + struct btrfs_iget_args args; + args.location = &BTRFS_I(inode)->location; + args.root = BTRFS_I(inode)->root; + + return insert_inode_locked4(inode, + btrfs_inode_hash(inode->i_ino, BTRFS_I(inode)->root), + btrfs_find_actor, &args); +} + static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *dir, @@ -5726,10 +5737,19 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, sizes[1] = name_len + sizeof(*ref); } + location = &BTRFS_I(inode)->location; + location->objectid = objectid; + location->offset = 0; + btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + + ret = btrfs_insert_inode_locked(inode); + if (ret < 0) + goto fail; + path->leave_spinning = 1; ret = btrfs_insert_empty_items(trans, root, path, key, sizes, nitems); if (ret != 0) - goto fail; + goto fail_unlock; inode_init_owner(inode, dir, mode); inode_set_bytes(inode, 0); @@ -5752,11 +5772,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(path->nodes[0]); btrfs_free_path(path); - location = &BTRFS_I(inode)->location; - location->objectid = objectid; - location->offset = 0; - btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); - btrfs_inherit_iflags(inode, dir); if (S_ISREG(mode)) { @@ -5767,7 +5782,6 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_INODE_NODATASUM; } - btrfs_insert_inode_hash(inode); inode_tree_add(inode); trace_btrfs_inode_new(inode); @@ -5782,6 +5796,9 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, btrfs_ino(inode), root->root_key.objectid, ret); return inode; + +fail_unlock: + unlock_new_inode(inode); fail: if (dir && name) BTRFS_I(dir)->index_cnt--; @@ -5916,28 +5933,28 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, goto out_unlock; } - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) { - drop_inode = 1; - goto out_unlock; - } - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see * if the filesystem supports xattrs by looking at the * ops vector. */ - inode->i_op = &btrfs_special_inode_operations; - err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + init_special_inode(inode, inode->i_mode, rdev); + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) - drop_inode = 1; - else { - init_special_inode(inode, inode->i_mode, rdev); + goto out_unlock_inode; + + err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); + if (err) { + goto out_unlock_inode; + } else { btrfs_update_inode(trans, root, inode); + unlock_new_inode(inode); d_instantiate(dentry, inode); } + out_unlock: btrfs_end_transaction(trans, root); btrfs_balance_delayed_items(root); @@ -5947,6 +5964,12 @@ out_unlock: iput(inode); } return err; + +out_unlock_inode: + drop_inode = 1; + unlock_new_inode(inode); + goto out_unlock; + } static int btrfs_create(struct inode *dir, struct dentry *dentry, @@ -5981,15 +6004,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, goto out_unlock; } drop_inode_on_err = 1; - - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) - goto out_unlock; - - err = btrfs_update_inode(trans, root, inode); - if (err) - goto out_unlock; - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see @@ -5998,14 +6012,23 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, */ inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) + goto out_unlock_inode; + + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_unlock_inode; err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) - goto out_unlock; + goto out_unlock_inode; - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + unlock_new_inode(inode); d_instantiate(dentry, inode); out_unlock: @@ -6017,6 +6040,11 @@ out_unlock: btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; + +out_unlock_inode: + unlock_new_inode(inode); + goto out_unlock; + } static int btrfs_link(struct dentry *old_dentry, struct inode *dir, @@ -6124,25 +6152,30 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } drop_on_err = 1; + /* these must be set before we unlock the inode */ + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) - goto out_fail; - - inode->i_op = &btrfs_dir_inode_operations; - inode->i_fop = &btrfs_dir_file_operations; + goto out_fail_inode; btrfs_i_size_write(inode, 0); err = btrfs_update_inode(trans, root, inode); if (err) - goto out_fail; + goto out_fail_inode; err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, dentry->d_name.len, 0, index); if (err) - goto out_fail; + goto out_fail_inode; d_instantiate(dentry, inode); + /* + * mkdir is special. We're unlocking after we call d_instantiate + * to avoid a race with nfsd calling d_instantiate. + */ + unlock_new_inode(inode); drop_on_err = 0; out_fail: @@ -6152,6 +6185,10 @@ out_fail: btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); return err; + +out_fail_inode: + unlock_new_inode(inode); + goto out_fail; } /* helper for btfs_get_extent. Given an existing extent in the tree, @@ -8107,6 +8144,7 @@ int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, set_nlink(inode, 1); btrfs_i_size_write(inode, 0); + unlock_new_inode(inode); err = btrfs_subvol_inherit_props(trans, new_root, parent_root); if (err) @@ -8757,12 +8795,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, goto out_unlock; } - err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); - if (err) { - drop_inode = 1; - goto out_unlock; - } - /* * If the active LSM wants to access the inode during * d_instantiate it needs these. Smack checks to see @@ -8771,23 +8803,22 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, */ inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + + err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); + if (err) + goto out_unlock_inode; err = btrfs_add_nondir(trans, dir, dentry, inode, 0, index); if (err) - drop_inode = 1; - else { - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_mapping->backing_dev_info = &root->fs_info->bdi; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; - } - if (drop_inode) - goto out_unlock; + goto out_unlock_inode; path = btrfs_alloc_path(); if (!path) { err = -ENOMEM; - drop_inode = 1; - goto out_unlock; + goto out_unlock_inode; } key.objectid = btrfs_ino(inode); key.offset = 0; @@ -8796,9 +8827,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, err = btrfs_insert_empty_item(trans, root, path, &key, datasize); if (err) { - drop_inode = 1; btrfs_free_path(path); - goto out_unlock; + goto out_unlock_inode; } leaf = path->nodes[0]; ei = btrfs_item_ptr(leaf, path->slots[0], @@ -8822,12 +8852,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode_set_bytes(inode, name_len); btrfs_i_size_write(inode, name_len); err = btrfs_update_inode(trans, root, inode); - if (err) + if (err) { drop_inode = 1; + goto out_unlock_inode; + } + + unlock_new_inode(inode); + d_instantiate(dentry, inode); out_unlock: - if (!err) - d_instantiate(dentry, inode); btrfs_end_transaction(trans, root); if (drop_inode) { inode_dec_link_count(inode); @@ -8835,6 +8868,11 @@ out_unlock: } btrfs_btree_balance_dirty(root); return err; + +out_unlock_inode: + drop_inode = 1; + unlock_new_inode(inode); + goto out_unlock; } static int __btrfs_prealloc_file_range(struct inode *inode, int mode, @@ -9018,14 +9056,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) goto out; } - ret = btrfs_init_inode_security(trans, inode, dir, NULL); - if (ret) - goto out; - - ret = btrfs_update_inode(trans, root, inode); - if (ret) - goto out; - inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; @@ -9033,9 +9063,16 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + ret = btrfs_init_inode_security(trans, inode, dir, NULL); + if (ret) + goto out_inode; + + ret = btrfs_update_inode(trans, root, inode); + if (ret) + goto out_inode; ret = btrfs_orphan_add(trans, inode); if (ret) - goto out; + goto out_inode; /* * We set number of links to 0 in btrfs_new_inode(), and here we set @@ -9045,6 +9082,7 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) * d_tmpfile() -> inode_dec_link_count() -> drop_nlink() */ set_nlink(inode, 1); + unlock_new_inode(inode); d_tmpfile(dentry, inode); mark_inode_dirty(inode); @@ -9054,8 +9092,12 @@ out: iput(inode); btrfs_balance_delayed_items(root); btrfs_btree_balance_dirty(root); - return ret; + +out_inode: + unlock_new_inode(inode); + goto out; + } static const struct inode_operations btrfs_dir_inode_operations = { -- cgit v1.2.3 From 21e81002f9788a3af591416b6dec60d7b67f2fb2 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 8 Sep 2014 16:17:55 -0700 Subject: nfs: fix kernel warning when removing proc entry I saw the following kernel warning: [ 1852.321222] ------------[ cut here ]------------ [ 1852.326527] WARNING: CPU: 0 PID: 118 at fs/proc/generic.c:521 remove_proc_entry+0x154/0x16b() [ 1852.335630] remove_proc_entry: removing non-empty directory 'fs/nfsfs', leaking at least 'volumes' [ 1852.344084] CPU: 0 PID: 118 Comm: kworker/u8:2 Not tainted 3.16.0+ #540 [ 1852.350036] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [ 1852.354992] Workqueue: netns cleanup_net [ 1852.358701] 0000000000000000 ffff880116f2fbd0 ffffffff819c03e9 ffff880116f2fc18 [ 1852.366474] ffff880116f2fc08 ffffffff810744ee ffffffff811e0e6e ffff8800d4e96238 [ 1852.373507] ffffffff81dbe665 ffff8800d46a5948 0000000000000005 ffff880116f2fc68 [ 1852.380224] Call Trace: [ 1852.381976] [] dump_stack+0x4d/0x66 [ 1852.385495] [] warn_slowpath_common+0x7a/0x93 [ 1852.389869] [] ? remove_proc_entry+0x154/0x16b [ 1852.393987] [] warn_slowpath_fmt+0x4c/0x4e [ 1852.397999] [] remove_proc_entry+0x154/0x16b [ 1852.402034] [] nfs_fs_proc_net_exit+0x53/0x56 [ 1852.406136] [] nfs_net_exit+0x12/0x1d [ 1852.409774] [] ops_exit_list+0x44/0x55 [ 1852.413529] [] cleanup_net+0xee/0x182 [ 1852.417198] [] process_one_work+0x209/0x40d [ 1852.502320] [] ? process_one_work+0x162/0x40d [ 1852.587629] [] worker_thread+0x1f0/0x2c7 [ 1852.673291] [] ? process_scheduled_works+0x2f/0x2f [ 1852.759470] [] kthread+0xc9/0xd1 [ 1852.843099] [] ? finish_task_switch+0x3a/0xce [ 1852.926518] [] ? __kthread_parkme+0x61/0x61 [ 1853.008565] [] ret_from_fork+0x7c/0xb0 [ 1853.076477] [] ? __kthread_parkme+0x61/0x61 [ 1853.140653] ---[ end trace 69c4c6617f78e32d ]--- It looks wrong that we add "/proc/net/nfsfs" in nfs_fs_proc_net_init() while remove "/proc/fs/nfsfs" in nfs_fs_proc_net_exit(). Fixes: commit 65b38851a17 (NFS: Fix /proc/fs/nfsfs/servers and /proc/fs/nfsfs/volumes) Cc: Eric W. Biederman Cc: Trond Myklebust Cc: Dan Aloni Signed-off-by: Cong Wang [Trond: replace uses of remove_proc_entry() with remove_proc_subtree() as suggested by Al Viro] Cc: stable@vger.kernel.org # 3.4.x : 65b38851a17: NFS: Fix /proc/fs/nfsfs/servers Cc: stable@vger.kernel.org # 3.4.x Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 1c5ff6d58385..6a4f3666e273 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1412,24 +1412,18 @@ int nfs_fs_proc_net_init(struct net *net) p = proc_create("volumes", S_IFREG|S_IRUGO, nn->proc_nfsfs, &nfs_volume_list_fops); if (!p) - goto error_2; + goto error_1; return 0; -error_2: - remove_proc_entry("servers", nn->proc_nfsfs); error_1: - remove_proc_entry("fs/nfsfs", NULL); + remove_proc_subtree("nfsfs", net->proc_net); error_0: return -ENOMEM; } void nfs_fs_proc_net_exit(struct net *net) { - struct nfs_net *nn = net_generic(net, nfs_net_id); - - remove_proc_entry("volumes", nn->proc_nfsfs); - remove_proc_entry("servers", nn->proc_nfsfs); - remove_proc_entry("fs/nfsfs", NULL); + remove_proc_subtree("nfsfs", net->proc_net); } /* -- cgit v1.2.3 From 0c0e0d3c091ed5b823fb89e1d46dbb9abd5830a7 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 8 Sep 2014 08:26:01 -0400 Subject: nfs: revert "nfs4: queue free_lock_state job submission to nfsiod" This reverts commit 49a4bda22e186c4d0eb07f4a36b5b1a378f9398d. Christoph reported an oops due to the above commit: generic/089 242s ...[ 2187.041239] general protection fault: 0000 [#1] SMP [ 2187.042899] Modules linked in: [ 2187.044000] CPU: 0 PID: 11913 Comm: kworker/0:1 Not tainted 3.16.0-rc6+ #1151 [ 2187.044287] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2007 [ 2187.044287] Workqueue: nfsiod free_lock_state_work [ 2187.044287] task: ffff880072b50cd0 ti: ffff88007a4ec000 task.ti: ffff88007a4ec000 [ 2187.044287] RIP: 0010:[] [] free_lock_state_work+0x16/0x30 [ 2187.044287] RSP: 0018:ffff88007a4efd58 EFLAGS: 00010296 [ 2187.044287] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007a947ac0 RCX: 8000000000000000 [ 2187.044287] RDX: ffffffff826af9e0 RSI: ffff88007b093c00 RDI: ffff88007b093db8 [ 2187.044287] RBP: ffff88007a4efd58 R08: ffffffff832d3e10 R09: 000001c40efc0000 [ 2187.044287] R10: 0000000000000000 R11: 0000000000059e30 R12: ffff88007fc13240 [ 2187.044287] R13: ffff88007fc18b00 R14: ffff88007b093db8 R15: 0000000000000000 [ 2187.044287] FS: 0000000000000000(0000) GS:ffff88007fc00000(0000) knlGS:0000000000000000 [ 2187.044287] CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b [ 2187.044287] CR2: 00007f93ec33fb80 CR3: 0000000079dc2000 CR4: 00000000000006f0 [ 2187.044287] Stack: [ 2187.044287] ffff88007a4efdd8 ffffffff810cc877 ffffffff810cc80d ffff88007fc13258 [ 2187.044287] 000000007a947af0 0000000000000000 ffffffff8353ccc8 ffffffff82b6f3d0 [ 2187.044287] 0000000000000000 ffffffff82267679 ffff88007a4efdd8 ffff88007fc13240 [ 2187.044287] Call Trace: [ 2187.044287] [] process_one_work+0x1c7/0x490 [ 2187.044287] [] ? process_one_work+0x15d/0x490 [ 2187.044287] [] worker_thread+0x119/0x4f0 [ 2187.044287] [] ? trace_hardirqs_on+0xd/0x10 [ 2187.044287] [] ? init_pwq+0x190/0x190 [ 2187.044287] [] kthread+0xdf/0x100 [ 2187.044287] [] ? __init_kthread_worker+0x70/0x70 [ 2187.044287] [] ret_from_fork+0x7c/0xb0 [ 2187.044287] [] ? __init_kthread_worker+0x70/0x70 [ 2187.044287] Code: 0f 1f 44 00 00 31 c0 5d c3 66 66 66 2e 0f 1f 84 00 00 00 00 00 55 48 8d b7 48 fe ff ff 48 8b 87 58 fe ff ff 48 89 e5 48 8b 40 30 <48> 8b 00 48 8b 10 48 89 c7 48 8b 92 90 03 00 00 ff 52 28 5d c3 [ 2187.044287] RIP [] free_lock_state_work+0x16/0x30 [ 2187.044287] RSP [ 2187.103626] ---[ end trace 0f11326d28e5d8fa ]--- The original reason for this patch was because the fl_release_private operation couldn't sleep. With commit ed9814d85810 (locks: defer freeing locks in locks_delete_lock until after i_lock has been dropped), this is no longer a problem so we can revert this patch. Reported-by: Christoph Hellwig Signed-off-by: Jeff Layton Reviewed-by: Christoph Hellwig Tested-by: Christoph Hellwig Signed-off-by: Trond Myklebust --- fs/nfs/nfs4_fs.h | 13 ++++++------- fs/nfs/nfs4state.c | 24 ++++++------------------ 2 files changed, 12 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 92193eddb41d..a8b855ab4e22 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -130,16 +130,15 @@ enum { */ struct nfs4_lock_state { - struct list_head ls_locks; /* Other lock stateids */ - struct nfs4_state * ls_state; /* Pointer to open state */ + struct list_head ls_locks; /* Other lock stateids */ + struct nfs4_state * ls_state; /* Pointer to open state */ #define NFS_LOCK_INITIALIZED 0 #define NFS_LOCK_LOST 1 - unsigned long ls_flags; + unsigned long ls_flags; struct nfs_seqid_counter ls_seqid; - nfs4_stateid ls_stateid; - atomic_t ls_count; - fl_owner_t ls_owner; - struct work_struct ls_release; + nfs4_stateid ls_stateid; + atomic_t ls_count; + fl_owner_t ls_owner; }; /* bits for nfs4_state->flags */ diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index a043f618cd5a..22fe35104c0c 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -799,18 +799,6 @@ __nfs4_find_lock_state(struct nfs4_state *state, fl_owner_t fl_owner) return NULL; } -static void -free_lock_state_work(struct work_struct *work) -{ - struct nfs4_lock_state *lsp = container_of(work, - struct nfs4_lock_state, ls_release); - struct nfs4_state *state = lsp->ls_state; - struct nfs_server *server = state->owner->so_server; - struct nfs_client *clp = server->nfs_client; - - clp->cl_mvops->free_lock_state(server, lsp); -} - /* * Return a compatible lock_state. If no initialized lock_state structure * exists, return an uninitialized one. @@ -832,7 +820,6 @@ static struct nfs4_lock_state *nfs4_alloc_lock_state(struct nfs4_state *state, f if (lsp->ls_seqid.owner_id < 0) goto out_free; INIT_LIST_HEAD(&lsp->ls_locks); - INIT_WORK(&lsp->ls_release, free_lock_state_work); return lsp; out_free: kfree(lsp); @@ -896,12 +883,13 @@ void nfs4_put_lock_state(struct nfs4_lock_state *lsp) if (list_empty(&state->lock_states)) clear_bit(LK_STATE_IN_USE, &state->flags); spin_unlock(&state->state_lock); - if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) - queue_work(nfsiod_workqueue, &lsp->ls_release); - else { - server = state->owner->so_server; + server = state->owner->so_server; + if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) { + struct nfs_client *clp = server->nfs_client; + + clp->cl_mvops->free_lock_state(server, lsp); + } else nfs4_free_lock_state(server, lsp); - } } static void nfs4_fl_copy_lock(struct file_lock *dst, struct file_lock *src) -- cgit v1.2.3 From 224ecbf5a674ec7da3a3b3ea21ca62e2853653fa Mon Sep 17 00:00:00 2001 From: Weston Andros Adamson Date: Tue, 9 Sep 2014 17:51:47 -0400 Subject: pnfs: fix filelayout_retry_commit when idx > 0 filelayout_retry_commit was recently split out from alloc_ds_commits, but was done in such a way that the bucket pointer always starts at index 0 no matter what the @idx argument is set to. The intention of the @idx argument is to retry commits starting at bucket @idx. This is called when alloc_ds_commits fails for a bucket. Signed-off-by: Weston Andros Adamson Signed-off-by: Trond Myklebust --- fs/nfs/filelayout/filelayout.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 1359c4a27393..90978075f730 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -1269,11 +1269,12 @@ filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page) static void filelayout_retry_commit(struct nfs_commit_info *cinfo, int idx) { struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds; - struct pnfs_commit_bucket *bucket = fl_cinfo->buckets; + struct pnfs_commit_bucket *bucket; struct pnfs_layout_segment *freeme; int i; - for (i = idx; i < fl_cinfo->nbuckets; i++, bucket++) { + for (i = idx; i < fl_cinfo->nbuckets; i++) { + bucket = &fl_cinfo->buckets[i]; if (list_empty(&bucket->committing)) continue; nfs_retry_commit(&bucket->committing, bucket->clseg, cinfo); -- cgit v1.2.3 From c680e41b3a2e944185c74bf60531e3d316d3ecc4 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Tue, 9 Sep 2014 14:50:51 -0700 Subject: eventpoll: fix uninitialized variable in epoll_ctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When calling epoll_ctl with operation EPOLL_CTL_DEL, structure epds is not initialized but ep_take_care_of_epollwakeup reads its event field. When this unintialized field has EPOLLWAKEUP bit set, a capability check is done for CAP_BLOCK_SUSPEND in ep_take_care_of_epollwakeup. This produces unexpected messages in the audit log, such as (on a system running SELinux): type=AVC msg=audit(1408212798.866:410): avc: denied { block_suspend } for pid=7754 comm="dbus-daemon" capability=36 scontext=unconfined_u:unconfined_r:unconfined_t tcontext=unconfined_u:unconfined_r:unconfined_t tclass=capability2 permissive=1 type=SYSCALL msg=audit(1408212798.866:410): arch=c000003e syscall=233 success=yes exit=0 a0=3 a1=2 a2=9 a3=7fffd4d66ec0 items=0 ppid=1 pid=7754 auid=1000 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=(none) ses=3 comm="dbus-daemon" exe="/usr/bin/dbus-daemon" subj=unconfined_u:unconfined_r:unconfined_t key=(null) ("arch=c000003e syscall=233 a1=2" means "epoll_ctl(op=EPOLL_CTL_DEL)") Remove use of epds in epoll_ctl when op == EPOLL_CTL_DEL. Fixes: 4d7e30d98939 ("epoll: Add a flag, EPOLLWAKEUP, to prevent suspend while epoll events are ready") Signed-off-by: Nicolas Iooss Cc: Alexander Viro Cc: Arve Hjønnevåg Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/eventpoll.c b/fs/eventpoll.c index b10b48c2a7af..7bcfff900f05 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1852,7 +1852,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ - ep_take_care_of_epollwakeup(&epds); + if (ep_op_has_event(op)) + ep_take_care_of_epollwakeup(&epds); /* * We have to check that the file structure underneath the file descriptor -- cgit v1.2.3 From 1fc98d11cac6dd66342e5580cb2687e5b1e9a613 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 9 Sep 2014 14:51:04 -0700 Subject: fsnotify/fdinfo: use named constants instead of hardcoded values MAX_HANDLE_SZ is equal to 128, but currently the size of pad is only 64 bytes, so exportfs_encode_inode_fh can return an error. Signed-off-by: Andrey Vagin Acked-by: Cyrill Gorcunov Cc: Alexander Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fdinfo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 238a5930cb3c..660d33bc1bef 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -42,7 +42,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode) { struct { struct file_handle handle; - u8 pad[64]; + u8 pad[MAX_HANDLE_SZ]; } f; int size, ret, i; @@ -50,7 +50,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode) size = f.handle.handle_bytes >> 2; ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); - if ((ret == 255) || (ret == -ENOSPC)) { + if ((ret == FILEID_INVALID) || (ret == -ENOSPC)) { WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); return 0; } -- cgit v1.2.3 From 7e8824816bda16bb11ff5ff1e1212d642e57b0b3 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 9 Sep 2014 14:51:06 -0700 Subject: fs/notify: don't show f_handle if exportfs_encode_inode_fh failed Currently we handle only ENOSPC. In case of other errors the file_handle variable isn't filled properly and we will show a part of stack. Signed-off-by: Andrey Vagin Acked-by: Cyrill Gorcunov Cc: Alexander Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fdinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 660d33bc1bef..9d7e2b9659cb 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -50,7 +50,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode) size = f.handle.handle_bytes >> 2; ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); - if ((ret == FILEID_INVALID) || (ret == -ENOSPC)) { + if ((ret == FILEID_INVALID) || (ret < 0)) { WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); return 0; } -- cgit v1.2.3 From 7b7a91152d3c2ddca95172cac1675625cc8dffaf Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Wed, 10 Sep 2014 14:09:20 -0400 Subject: GFS2: Hash the negative dentry during inode lookup Fix a regression introduced by: 6d4ade986f9c8df31e68 GFS2: Add atomic_open support where an early return misses d_splice_alias() which had been adding the negative dentry. Signed-off-by: Benjamin Coddington Signed-off-by: Bob Peterson Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index e62e59477884..9317ddc1b3c3 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -840,8 +840,10 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, int error; inode = gfs2_lookupi(dir, &dentry->d_name, 0); - if (!inode) + if (inode == NULL) { + d_add(dentry, NULL); return NULL; + } if (IS_ERR(inode)) return ERR_CAST(inode); -- cgit v1.2.3 From a937cca270c544c4319572b57b196b2251b07dd2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 10 Sep 2014 21:22:51 +0200 Subject: GFS2: Don't use MAXQUOTAS value MAXQUOTAS value defines maximum number of quota types VFS supports. This isn't necessarily the number of types gfs2 supports and with addition of project quotas these two numbers stop matching. So make gfs2 use its private definition. CC: cluster-devel@redhat.com Signed-off-by: Jan Kara Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 67d310c9ada3..39e7e9959b74 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -262,6 +262,9 @@ struct gfs2_holder { unsigned long gh_ip; }; +/* Number of quota types we support */ +#define GFS2_MAXQUOTAS 2 + /* Resource group multi-block reservation, in order of appearance: Step 1. Function prepares to write, allocates a mb, sets the size hint. @@ -282,8 +285,8 @@ struct gfs2_blkreserv { u64 rs_inum; /* Inode number for reservation */ /* ancillary quota stuff */ - struct gfs2_quota_data *rs_qa_qd[2 * MAXQUOTAS]; - struct gfs2_holder rs_qa_qd_ghs[2 * MAXQUOTAS]; + struct gfs2_quota_data *rs_qa_qd[2 * GFS2_MAXQUOTAS]; + struct gfs2_holder rs_qa_qd_ghs[2 * GFS2_MAXQUOTAS]; unsigned int rs_qa_qd_num; }; -- cgit v1.2.3 From f39c01047994e66e7f3d89ddb4c6141f23349d8d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Sep 2014 16:19:37 +1000 Subject: NFS: remove BUG possibility in nfs4_open_and_get_state commit 4fa2c54b5198d09607a534e2fd436581064587ed NFS: nfs4_do_open should add negative results to the dcache. used "d_drop(); d_add();" to ensure that a dentry was hashed as a negative cached entry. This is not safe if the dentry has an non-NULL ->d_inode. It will trigger a BUG_ON in d_instantiate(). In that case, d_delete() is needed. Also, only d_add if the dentry is currently unhashed, it seems pointless removed and re-adding it unchanged. Reported-by: Christoph Hellwig Fixes: 4fa2c54b5198d09607a534e2fd436581064587ed Cc: Jeff Layton Link: http://lkml.kernel.org/r/20140908144525.GB19811@infradead.org Signed-off-by: NeilBrown Acked-by: Jeff Layton Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 7dd8aca31c29..ac2dd953fc18 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2226,9 +2226,13 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata, ret = _nfs4_proc_open(opendata); if (ret != 0) { if (ret == -ENOENT) { - d_drop(opendata->dentry); - d_add(opendata->dentry, NULL); - nfs_set_verifier(opendata->dentry, + dentry = opendata->dentry; + if (dentry->d_inode) + d_delete(dentry); + else if (d_unhashed(dentry)) + d_add(dentry, NULL); + + nfs_set_verifier(dentry, nfs_save_change_attribute(opendata->dir->d_inode)); } goto out; -- cgit v1.2.3 From cfb2f9d5c921e38b0f12bb26fed10b877664444d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 12 Sep 2014 20:56:04 +0100 Subject: GFS2: fix d_splice_alias() misuses Callers of d_splice_alias(dentry, inode) don't need iput(), neither on success nor on failure. Either the reference to inode is stored in a previously negative dentry, or it's dropped. In either case inode reference the caller used to hold is consumed. __gfs2_lookup() does iput() in case when d_splice_alias() has failed. Double iput() if we ever hit that. And gfs2_create_inode() ends up not only with double iput(), but with link count dropped to zero - on an inode it has just found in directory. Cc: stable@vger.kernel.org # v3.14+ Signed-off-by: Al Viro Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 9317ddc1b3c3..fc8ac2ee0667 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -626,8 +626,10 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (!IS_ERR(inode)) { d = d_splice_alias(inode, dentry); error = PTR_ERR(d); - if (IS_ERR(d)) + if (IS_ERR(d)) { + inode = ERR_CAST(d); goto fail_gunlock; + } error = 0; if (file) { if (S_ISREG(inode->i_mode)) { @@ -856,7 +858,6 @@ static struct dentry *__gfs2_lookup(struct inode *dir, struct dentry *dentry, d = d_splice_alias(inode, dentry); if (IS_ERR(d)) { - iput(inode); gfs2_glock_dq_uninit(&gh); return d; } -- cgit v1.2.3 From 99d263d4c5b2f541dfacb5391e22e8c91ea982a6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 13 Sep 2014 11:30:10 -0700 Subject: vfs: fix bad hashing of dentries Josef Bacik found a performance regression between 3.2 and 3.10 and narrowed it down to commit bfcfaa77bdf0 ("vfs: use 'unsigned long' accesses for dcache name comparison and hashing"). He reports: "The test case is essentially for (i = 0; i < 1000000; i++) mkdir("a$i"); On xfs on a fio card this goes at about 20k dir/sec with 3.2, and 12k dir/sec with 3.10. This is because we spend waaaaay more time in __d_lookup on 3.10 than in 3.2. The new hashing function for strings is suboptimal for < sizeof(unsigned long) string names (and hell even > sizeof(unsigned long) string names that I've tested). I broke out the old hashing function and the new one into a userspace helper to get real numbers and this is what I'm getting: Old hash table had 1000000 entries, 0 dupes, 0 max dupes New hash table had 12628 entries, 987372 dupes, 900 max dupes We had 11400 buckets with a p50 of 30 dupes, p90 of 240 dupes, p99 of 567 dupes for the new hash My test does the hash, and then does the d_hash into a integer pointer array the same size as the dentry hash table on my system, and then just increments the value at the address we got to see how many entries we overlap with. As you can see the old hash function ended up with all 1 million entries in their own bucket, whereas the new one they are only distributed among ~12.5k buckets, which is why we're using so much more CPU in __d_lookup". The reason for this hash regression is two-fold: - On 64-bit architectures the down-mixing of the original 64-bit word-at-a-time hash into the final 32-bit hash value is very simplistic and suboptimal, and just adds the two 32-bit parts together. In particular, because there is no bit shuffling and the mixing boundary is also a byte boundary, similar character patterns in the low and high word easily end up just canceling each other out. - the old byte-at-a-time hash mixed each byte into the final hash as it hashed the path component name, resulting in the low bits of the hash generally being a good source of hash data. That is not true for the word-at-a-time case, and the hash data is distributed among all the bits. The fix is the same in both cases: do a better job of mixing the bits up and using as much of the hash data as possible. We already have the "hash_32|64()" functions to do that. Reported-by: Josef Bacik Cc: Al Viro Cc: Christoph Hellwig Cc: Chris Mason Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/dcache.c | 3 +-- fs/namei.c | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index d30ce699ae4b..4023e77b800e 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -106,8 +106,7 @@ static inline struct hlist_bl_head *d_hash(const struct dentry *parent, unsigned int hash) { hash += (unsigned long) parent / L1_CACHE_BYTES; - hash = hash + (hash >> d_hash_shift); - return dentry_hashtable + (hash & d_hash_mask); + return dentry_hashtable + hash_32(hash, d_hash_shift); } /* Statistics gathering. */ diff --git a/fs/namei.c b/fs/namei.c index a996bb48dfab..229235862e50 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include "internal.h" @@ -1634,8 +1635,7 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) static inline unsigned int fold_hash(unsigned long hash) { - hash += hash >> (8*sizeof(int)); - return hash; + return hash_64(hash, 32); } #else /* 32-bit case */ -- cgit v1.2.3 From 6f18493e541c690169c3b1479d47d95f624161cf Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 11 Sep 2014 18:55:50 -0400 Subject: move the call of __d_drop(anon) into __d_materialise_unique(dentry, anon) and lock the right list there Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- fs/dcache.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index d30ce699ae4b..5c6e71dc23f5 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2656,6 +2656,12 @@ static void __d_materialise_dentry(struct dentry *dentry, struct dentry *anon) dentry->d_parent = dentry; list_del_init(&dentry->d_u.d_child); anon->d_parent = dparent; + if (likely(!d_unhashed(anon))) { + hlist_bl_lock(&anon->d_sb->s_anon); + __hlist_bl_del(&anon->d_hash); + anon->d_hash.pprev = NULL; + hlist_bl_unlock(&anon->d_sb->s_anon); + } list_move(&anon->d_u.d_child, &dparent->d_subdirs); write_seqcount_end(&dentry->d_seq); @@ -2714,7 +2720,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) write_seqlock(&rename_lock); __d_materialise_dentry(dentry, new); write_sequnlock(&rename_lock); - __d_drop(new); _d_rehash(new); spin_unlock(&new->d_lock); spin_unlock(&inode->i_lock); @@ -2778,7 +2783,6 @@ struct dentry *d_materialise_unique(struct dentry *dentry, struct inode *inode) * could splice into our tree? */ __d_materialise_dentry(dentry, alias); write_sequnlock(&rename_lock); - __d_drop(alias); goto found; } else { /* Nope, but we must(!) avoid directory -- cgit v1.2.3 From f5be3e29127aec8c87f883aadadff337f8c2cfd7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 Sep 2014 21:50:45 -0400 Subject: fix bogus read_seqretry() checks introduced in b37199e read_seqretry() returns true on mismatch, not on match... Cc: stable@vger.kernel.org # 3.15+ Signed-off-by: Al Viro --- fs/namei.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index a996bb48dfab..3d1dc745f9d8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1137,7 +1137,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, */ *inode = path->dentry->d_inode; } - return read_seqretry(&mount_lock, nd->m_seq) && + return !read_seqretry(&mount_lock, nd->m_seq) && !(path->dentry->d_flags & DCACHE_NEED_AUTOMOUNT); } @@ -1174,7 +1174,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) nd->path.mnt = &mounted->mnt; nd->path.dentry = mounted->mnt.mnt_root; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); - if (!read_seqretry(&mount_lock, nd->m_seq)) + if (read_seqretry(&mount_lock, nd->m_seq)) goto failed; } nd->inode = nd->path.dentry->d_inode; -- cgit v1.2.3 From 7bd88377d482e1eae3c5329b12e33cfd664fa6a9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 Sep 2014 21:55:46 -0400 Subject: don't bugger nd->seq on set_root_rcu() from follow_dotdot_rcu() return the value instead, and have path_init() do the assignment. Broken by "vfs: Fix absolute RCU path walk failures due to uninitialized seq number", which was Cc-stable with 2.6.38+ as destination. This one should go where it went. To avoid dummy value returned in case when root is already set (it would do no harm, actually, since the only caller that doesn't ignore the return value is guaranteed to have nd->root *not* set, but it's more obvious that way), lift the check into callers. And do the same to set_root(), to keep them in sync. Cc: stable@vger.kernel.org # 2.6.38+ Signed-off-by: Al Viro --- fs/namei.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 3d1dc745f9d8..fe47e6d8e85f 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -643,24 +643,22 @@ static int complete_walk(struct nameidata *nd) static __always_inline void set_root(struct nameidata *nd) { - if (!nd->root.mnt) - get_fs_root(current->fs, &nd->root); + get_fs_root(current->fs, &nd->root); } static int link_path_walk(const char *, struct nameidata *); -static __always_inline void set_root_rcu(struct nameidata *nd) +static __always_inline unsigned set_root_rcu(struct nameidata *nd) { - if (!nd->root.mnt) { - struct fs_struct *fs = current->fs; - unsigned seq; + struct fs_struct *fs = current->fs; + unsigned seq, res; - do { - seq = read_seqcount_begin(&fs->seq); - nd->root = fs->root; - nd->seq = __read_seqcount_begin(&nd->root.dentry->d_seq); - } while (read_seqcount_retry(&fs->seq, seq)); - } + do { + seq = read_seqcount_begin(&fs->seq); + nd->root = fs->root; + res = __read_seqcount_begin(&nd->root.dentry->d_seq); + } while (read_seqcount_retry(&fs->seq, seq)); + return res; } static void path_put_conditional(struct path *path, struct nameidata *nd) @@ -860,7 +858,8 @@ follow_link(struct path *link, struct nameidata *nd, void **p) return PTR_ERR(s); } if (*s == '/') { - set_root(nd); + if (!nd->root.mnt) + set_root(nd); path_put(&nd->path); nd->path = nd->root; path_get(&nd->root); @@ -1143,7 +1142,8 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, static int follow_dotdot_rcu(struct nameidata *nd) { - set_root_rcu(nd); + if (!nd->root.mnt) + set_root_rcu(nd); while (1) { if (nd->path.dentry == nd->root.dentry && @@ -1256,7 +1256,8 @@ static void follow_mount(struct path *path) static void follow_dotdot(struct nameidata *nd) { - set_root(nd); + if (!nd->root.mnt) + set_root(nd); while(1) { struct dentry *old = nd->path.dentry; @@ -1852,7 +1853,7 @@ static int path_init(int dfd, const char *name, unsigned int flags, if (*name=='/') { if (flags & LOOKUP_RCU) { rcu_read_lock(); - set_root_rcu(nd); + nd->seq = set_root_rcu(nd); } else { set_root(nd); path_get(&nd->root); -- cgit v1.2.3 From 4023bfc9f351a7994fb6a7d515476c320f94a574 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 13 Sep 2014 21:59:43 -0400 Subject: be careful with nd->inode in path_init() and follow_dotdot_rcu() in the former we simply check if dentry is still valid after picking its ->d_inode; in the latter we fetch ->d_inode in the same places where we fetch dentry and its ->d_seq, under the same checks. Cc: stable@vger.kernel.org # 2.6.38+ Signed-off-by: Al Viro --- fs/namei.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index fe47e6d8e85f..d07bc1b206c3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1142,6 +1142,7 @@ static bool __follow_mount_rcu(struct nameidata *nd, struct path *path, static int follow_dotdot_rcu(struct nameidata *nd) { + struct inode *inode = nd->inode; if (!nd->root.mnt) set_root_rcu(nd); @@ -1155,6 +1156,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) struct dentry *parent = old->d_parent; unsigned seq; + inode = parent->d_inode; seq = read_seqcount_begin(&parent->d_seq); if (read_seqcount_retry(&old->d_seq, nd->seq)) goto failed; @@ -1164,6 +1166,7 @@ static int follow_dotdot_rcu(struct nameidata *nd) } if (!follow_up_rcu(&nd->path)) break; + inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); } while (d_mountpoint(nd->path.dentry)) { @@ -1173,11 +1176,12 @@ static int follow_dotdot_rcu(struct nameidata *nd) break; nd->path.mnt = &mounted->mnt; nd->path.dentry = mounted->mnt.mnt_root; + inode = nd->path.dentry->d_inode; nd->seq = read_seqcount_begin(&nd->path.dentry->d_seq); if (read_seqretry(&mount_lock, nd->m_seq)) goto failed; } - nd->inode = nd->path.dentry->d_inode; + nd->inode = inode; return 0; failed: @@ -1904,7 +1908,14 @@ static int path_init(int dfd, const char *name, unsigned int flags, } nd->inode = nd->path.dentry->d_inode; - return 0; + if (!(flags & LOOKUP_RCU)) + return 0; + if (likely(!read_seqcount_retry(&nd->path.dentry->d_seq, nd->seq))) + return 0; + if (!(nd->flags & LOOKUP_ROOT)) + nd->root.mnt = NULL; + rcu_read_unlock(); + return -ECHILD; } static inline int lookup_last(struct nameidata *nd, struct path *path) -- cgit v1.2.3 From 2ae83bf93882d1ec0cd775c489bd1bee611f792e Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 14 Sep 2014 17:06:36 -0500 Subject: [CIFS] Fix setting time before epoch (negative time values) xfstest generic/258 sets the time on a file to a negative value (before 1970) which fails since do_div can not handle negative numbers. In addition 'normal' division of 64 bit values does not build on 32 bit arch so have to workaround this by special casing negative values in cifs_NTtimeToUnix Samba server also has a bug with this (see samba bugzilla 7771) but it works to Windows server. Signed-off-by: Steve French --- fs/cifs/netmisc.c | 20 ++++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/cifs/netmisc.c b/fs/cifs/netmisc.c index 6834b9c3bec1..b333ff60781d 100644 --- a/fs/cifs/netmisc.c +++ b/fs/cifs/netmisc.c @@ -925,11 +925,23 @@ cifs_NTtimeToUnix(__le64 ntutc) /* BB what about the timezone? BB */ /* Subtract the NTFS time offset, then convert to 1s intervals. */ - u64 t; + s64 t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; + + /* + * Unfortunately can not use normal 64 bit division on 32 bit arch, but + * the alternative, do_div, does not work with negative numbers so have + * to special case them + */ + if (t < 0) { + t = -t; + ts.tv_nsec = (long)(do_div(t, 10000000) * 100); + ts.tv_nsec = -ts.tv_nsec; + ts.tv_sec = -t; + } else { + ts.tv_nsec = (long)do_div(t, 10000000) * 100; + ts.tv_sec = t; + } - t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET; - ts.tv_nsec = do_div(t, 10000000) * 100; - ts.tv_sec = t; return ts; } -- cgit v1.2.3 From 9226b5b440f2b4fbb3b797f3cb74a9a627220660 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 14 Sep 2014 17:28:32 -0700 Subject: vfs: avoid non-forwarding large load after small store in path lookup The performance regression that Josef Bacik reported in the pathname lookup (see commit 99d263d4c5b2 "vfs: fix bad hashing of dentries") made me look at performance stability of the dcache code, just to verify that the problem was actually fixed. That turned up a few other problems in this area. There are a few cases where we exit RCU lookup mode and go to the slow serializing case when we shouldn't, Al has fixed those and they'll come in with the next VFS pull. But my performance verification also shows that link_path_walk() turns out to have a very unfortunate 32-bit store of the length and hash of the name we look up, followed by a 64-bit read of the combined hash_len field. That screws up the processor store to load forwarding, causing an unnecessary hickup in this critical routine. It's caused by the ugly calling convention for the "hash_name()" function, and easily fixed by just making hash_name() fill in the whole 'struct qstr' rather than passing it a pointer to just the hash value. With that, the profile for this function looks much smoother. Signed-off-by: Linus Torvalds --- fs/namei.c | 19 ++++++++++--------- include/linux/dcache.h | 1 + 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 229235862e50..2be5120b81b3 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1669,13 +1669,14 @@ EXPORT_SYMBOL(full_name_hash); /* * Calculate the length and hash of the path component, and - * return the length of the component; + * fill in the qstr. return the "len" as the result. */ -static inline unsigned long hash_name(const char *name, unsigned int *hashp) +static inline unsigned long hash_name(const char *name, struct qstr *res) { unsigned long a, b, adata, bdata, mask, hash, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; + res->name = name; hash = a = 0; len = -sizeof(unsigned long); do { @@ -1691,9 +1692,10 @@ static inline unsigned long hash_name(const char *name, unsigned int *hashp) mask = create_zero_mask(adata | bdata); hash += a & zero_bytemask(mask); - *hashp = fold_hash(hash); + len += find_zero(mask); + res->hash_len = hashlen_create(fold_hash(hash), len); - return len + find_zero(mask); + return len; } #else @@ -1711,18 +1713,19 @@ EXPORT_SYMBOL(full_name_hash); * We know there's a real path component here of at least * one character. */ -static inline unsigned long hash_name(const char *name, unsigned int *hashp) +static inline long hash_name(const char *name, struct qstr *res) { unsigned long hash = init_name_hash(); unsigned long len = 0, c; + res->name = name; c = (unsigned char)*name; do { len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); - *hashp = end_name_hash(hash); + res->hash_len = hashlen_create(end_name_hash(hash), len); return len; } @@ -1756,9 +1759,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) if (err) break; - len = hash_name(name, &this.hash); - this.name = name; - this.len = len; + len = hash_name(name, &this); type = LAST_NORM; if (name[0] == '.') switch (len) { diff --git a/include/linux/dcache.h b/include/linux/dcache.h index e4ae2ad48d07..75a227cc7ce2 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -55,6 +55,7 @@ struct qstr { #define QSTR_INIT(n,l) { { { .len = l } }, .name = n } #define hashlen_hash(hashlen) ((u32) (hashlen)) #define hashlen_len(hashlen) ((u32)((hashlen) >> 32)) +#define hashlen_create(hash,len) (((u64)(len)<<32)|(u32)(hash)) struct dentry_stat_t { long nr_dentry; -- cgit v1.2.3 From da80659d4aa758dc6935b10ec64513f0b67bc969 Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 14 Sep 2014 23:27:09 -0500 Subject: [SMB3] Fix oops when creating symlinks on smb3 We were not checking for symlink support properly for SMB2/SMB3 mounts so could oops when mounted with mfsymlinks when try to create symlink when mfsymlinks on smb2/smb3 mounts Signed-off-by: Steve French Cc: # 3.14+ CC: Sachin Prabhu --- fs/cifs/link.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 68559fd557fb..a5c2812ead68 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -213,8 +213,12 @@ create_mf_symlink(const unsigned int xid, struct cifs_tcon *tcon, if (rc) goto out; - rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, cifs_sb, - fromName, buf, &bytes_written); + if (tcon->ses->server->ops->create_mf_symlink) + rc = tcon->ses->server->ops->create_mf_symlink(xid, tcon, + cifs_sb, fromName, buf, &bytes_written); + else + rc = -EOPNOTSUPP; + if (rc) goto out; -- cgit v1.2.3 From d6bb3e9075bbcf758d6084bb581f797bb6ea24c6 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 15 Sep 2014 10:51:07 -0700 Subject: vfs: simplify and shrink stack frame of link_path_walk() Commit 9226b5b440f2 ("vfs: avoid non-forwarding large load after small store in path lookup") made link_path_walk() always access the "hash_len" field as a single 64-bit entity, in order to avoid mixed size accesses to the members. However, what I didn't notice was that that effectively means that the whole "struct qstr this" is now basically redundant. We already explicitly track the "const char *name", and if we just use "u64 hash_len" instead of "long len", there is nothing else left of the "struct qstr". We do end up wanting the "struct qstr" if we have a filesystem with a "d_hash()" function, but that's a rare case, and we might as well then just squirrell away the name and hash_len at that point. End result: fewer live variables in the loop, a smaller stack frame, and better code generation. And we don't need to pass in pointers variables to helper functions any more, because the return value contains all the relevant information. So this removes more lines than it adds, and the source code is clearer too. Signed-off-by: Linus Torvalds --- fs/namei.c | 39 ++++++++++++++++++--------------------- 1 file changed, 18 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 215e44254c53..01d03892316c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1674,14 +1674,13 @@ EXPORT_SYMBOL(full_name_hash); /* * Calculate the length and hash of the path component, and - * fill in the qstr. return the "len" as the result. + * return the "hash_len" as the result. */ -static inline unsigned long hash_name(const char *name, struct qstr *res) +static inline u64 hash_name(const char *name) { unsigned long a, b, adata, bdata, mask, hash, len; const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; - res->name = name; hash = a = 0; len = -sizeof(unsigned long); do { @@ -1698,9 +1697,7 @@ static inline unsigned long hash_name(const char *name, struct qstr *res) hash += a & zero_bytemask(mask); len += find_zero(mask); - res->hash_len = hashlen_create(fold_hash(hash), len); - - return len; + return hashlen_create(fold_hash(hash), len); } #else @@ -1718,20 +1715,18 @@ EXPORT_SYMBOL(full_name_hash); * We know there's a real path component here of at least * one character. */ -static inline long hash_name(const char *name, struct qstr *res) +static inline u64 hash_name(const char *name) { unsigned long hash = init_name_hash(); unsigned long len = 0, c; - res->name = name; c = (unsigned char)*name; do { len++; hash = partial_name_hash(c, hash); c = (unsigned char)name[len]; } while (c && c != '/'); - res->hash_len = hashlen_create(end_name_hash(hash), len); - return len; + return hashlen_create(end_name_hash(hash), len); } #endif @@ -1756,18 +1751,17 @@ static int link_path_walk(const char *name, struct nameidata *nd) /* At this point we know we have a real path component. */ for(;;) { - struct qstr this; - long len; + u64 hash_len; int type; err = may_lookup(nd); if (err) break; - len = hash_name(name, &this); + hash_len = hash_name(name); type = LAST_NORM; - if (name[0] == '.') switch (len) { + if (name[0] == '.') switch (hashlen_len(hash_len)) { case 2: if (name[1] == '.') { type = LAST_DOTDOT; @@ -1781,29 +1775,32 @@ static int link_path_walk(const char *name, struct nameidata *nd) struct dentry *parent = nd->path.dentry; nd->flags &= ~LOOKUP_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { + struct qstr this = { .hash_len = hash_len, .name = name }; err = parent->d_op->d_hash(parent, &this); if (err < 0) break; + hash_len = this.hash_len; + name = this.name; } } - nd->last = this; + nd->last.hash_len = hash_len; + nd->last.name = name; nd->last_type = type; - if (!name[len]) + name += hashlen_len(hash_len); + if (!*name) return 0; /* * If it wasn't NUL, we know it was '/'. Skip that * slash, and continue until no more slashes. */ do { - len++; - } while (unlikely(name[len] == '/')); - if (!name[len]) + name++; + } while (unlikely(*name == '/')); + if (!*name) return 0; - name += len; - err = walk_component(nd, &next, LOOKUP_FOLLOW); if (err < 0) return err; -- cgit v1.2.3 From a5c3e1c725af9e84deceb3c33939ca4ffe3fefc8 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 16 Sep 2014 04:16:19 -0500 Subject: Revert "cifs: No need to send SIGKILL to demux_thread during umount" This reverts commit 52a36244443eabb594bdb63622ff2dd7a083f0e2. Causes rmmod to fail for at least 7 seconds after unmount which makes automated testing a little harder when reloading cifs.ko between test runs. Signed-off-by: Namjae Jeon CC: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 8a9fded7c135..36ca2045009b 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -837,6 +837,7 @@ cifs_demultiplex_thread(void *p) struct TCP_Server_Info *server = p; unsigned int pdu_length; char *buf = NULL; + struct task_struct *task_to_wake = NULL; struct mid_q_entry *mid_entry; current->flags |= PF_MEMALLOC; @@ -927,7 +928,19 @@ cifs_demultiplex_thread(void *p) if (server->smallbuf) /* no sense logging a debug message if NULL */ cifs_small_buf_release(server->smallbuf); + task_to_wake = xchg(&server->tsk, NULL); clean_demultiplex_info(server); + + /* if server->tsk was NULL then wait for a signal before exiting */ + if (!task_to_wake) { + set_current_state(TASK_INTERRUPTIBLE); + while (!signal_pending(current)) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + set_current_state(TASK_RUNNING); + } + module_put_and_exit(0); } @@ -2050,6 +2063,8 @@ cifs_find_tcp_session(struct smb_vol *vol) static void cifs_put_tcp_session(struct TCP_Server_Info *server) { + struct task_struct *task; + spin_lock(&cifs_tcp_ses_lock); if (--server->srv_count > 0) { spin_unlock(&cifs_tcp_ses_lock); @@ -2073,6 +2088,10 @@ cifs_put_tcp_session(struct TCP_Server_Info *server) kfree(server->session_key.response); server->session_key.response = NULL; server->session_key.len = 0; + + task = xchg(&server->tsk, NULL); + if (task) + force_sig(SIGKILL, task); } static struct TCP_Server_Info * -- cgit v1.2.3 From 116ae5e2b09f7022281c253a6037a74d0446bfaf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 8 Sep 2014 17:20:56 +0200 Subject: cifs: remove dead code cifs provides two dummy functions 'sess_auth_lanman' and 'sess_auth_kerberos' for the case in which the respective features are not defined. However, the caller is also under an #ifdef, so we just get warnings about unused code: fs/cifs/sess.c:1109:1: warning: 'sess_auth_kerberos' defined but not used [-Wunused-function] sess_auth_kerberos(struct sess_data *sess_data) Removing the dead functions gets rid of the warnings without any downsides that I can see. (Yalin Wang reported the identical problem and fix so added him) Signed-off-by: Arnd Bergmann Signed-off-by: Yalin Wang Signed-off-by: Steve French --- fs/cifs/sess.c | 17 ----------------- 1 file changed, 17 deletions(-) (limited to 'fs') diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 3a5e83317683..57db63ff88da 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -745,14 +745,6 @@ out: sess_free_buffer(sess_data); } -#else - -static void -sess_auth_lanman(struct sess_data *sess_data) -{ - sess_data->result = -EOPNOTSUPP; - sess_data->func = NULL; -} #endif static void @@ -1103,15 +1095,6 @@ out: ses->auth_key.response = NULL; } -#else - -static void -sess_auth_kerberos(struct sess_data *sess_data) -{ - cifs_dbg(VFS, "Kerberos negotiated but upcall support disabled!\n"); - sess_data->result = -ENOSYS; - sess_data->func = NULL; -} #endif /* ! CONFIG_CIFS_UPCALL */ /* -- cgit v1.2.3 From 69af38dbc5b44e90dde35af4a1df3f5510809a1a Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 16 Sep 2014 04:13:31 -0500 Subject: Update version number displayed by modinfo for cifs.ko Update cifs.ko version to 2.05 Signed-off-by: Steve French w --- fs/cifs/cifsfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index b0fafa499505..002e0c173939 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -136,5 +136,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.04" +#define CIFS_VERSION "2.05" #endif /* _CIFSFS_H */ -- cgit v1.2.3 From 364d42930d96a872b2076deeb9c24f9ff132de34 Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 16 Sep 2014 06:40:25 -0500 Subject: Fix mfsymlinks file size check If the mfsymlinks file size has changed (e.g. the file no longer represents an emulated symlink) we were not returning an error properly. Signed-off-by: Steve French Reviewed-by: Stefan Metzmacher --- fs/cifs/link.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/link.c b/fs/cifs/link.c index a5c2812ead68..5657416d3483 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -343,9 +343,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, if (rc) return rc; - if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) + if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { + rc = -ENOENT; /* it's not a symlink */ goto out; + } io_parms.netfid = fid.netfid; io_parms.pid = current->tgid; -- cgit v1.2.3 From a060dc5010ffa32f3a83e5336f6eeb6551fa137a Mon Sep 17 00:00:00 2001 From: James Hogan Date: Tue, 16 Sep 2014 13:07:35 +0100 Subject: vfs: workaround gcc <4.6 build error in link_path_walk() Commit d6bb3e9075bb ("vfs: simplify and shrink stack frame of link_path_walk()") introduced build problems with GCC versions older than 4.6 due to the initialisation of a member of an anonymous union in struct qstr without enclosing braces. This hits GCC bug 10676 [1] (which was fixed in GCC 4.6 by [2]), and causes the following build error: fs/namei.c: In function 'link_path_walk': fs/namei.c:1778: error: unknown field 'hash_len' specified in initializer This is worked around by adding explicit braces. [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=10676 [2] https://gcc.gnu.org/viewcvs/gcc?view=revision&revision=159206 Fixes: d6bb3e9075bb (vfs: simplify and shrink stack frame of link_path_walk()) Signed-off-by: James Hogan Cc: Linus Torvalds Cc: Alexander Viro Cc: Geert Uytterhoeven Cc: linux-fsdevel@vger.kernel.org Cc: linux-metag@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/namei.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namei.c b/fs/namei.c index 01d03892316c..a7b05bf82d31 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1775,7 +1775,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) struct dentry *parent = nd->path.dentry; nd->flags &= ~LOOKUP_JUMPED; if (unlikely(parent->d_flags & DCACHE_OP_HASH)) { - struct qstr this = { .hash_len = hash_len, .name = name }; + struct qstr this = { { .hash_len = hash_len }, .name = name }; err = parent->d_op->d_hash(parent, &this); if (err < 0) break; -- cgit v1.2.3 From 125c4cf9f37c98fed2c08229b31358cfec63dcf6 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 11 Sep 2014 21:22:14 +0100 Subject: Btrfs: set inode's logged_trans/last_log_commit after ranged fsync When a ranged fsync finishes if there are still extent maps in the modified list, still set the inode's logged_trans and last_log_commit. This is important in case an inode is fsync'ed and unlinked in the same transaction, to ensure its inode ref gets deleted from the log and the respective dentries in its parent are deleted too from the log (if the parent directory was fsync'ed in the same transaction). Instead make btrfs_inode_in_log() return false if the list of modified extent maps isn't empty. This is an incremental on top of the v4 version of the patch: "Btrfs: fix fsync data loss after a ranged fsync" which was added to its v5, but didn't make it on time. Signed-off-by: Filipe Manana Signed-off-by: Chris Mason --- fs/btrfs/btrfs_inode.h | 13 +++++++++++-- fs/btrfs/tree-log.c | 14 ++------------ 2 files changed, 13 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 43527fd78825..56b8522d5767 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -234,8 +234,17 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) BTRFS_I(inode)->last_sub_trans <= BTRFS_I(inode)->last_log_commit && BTRFS_I(inode)->last_sub_trans <= - BTRFS_I(inode)->root->last_log_commit) - return 1; + BTRFS_I(inode)->root->last_log_commit) { + /* + * After a ranged fsync we might have left some extent maps + * (that fall outside the fsync's range). So return false + * here if the list isn't empty, to make sure btrfs_log_inode() + * will be called and process those extent maps. + */ + smp_mb(); + if (list_empty(&BTRFS_I(inode)->extent_tree.modified_extents)) + return 1; + } return 0; } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index d296efe2d3e7..1d1ba083ca6e 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4093,18 +4093,8 @@ log_extents: } } - write_lock(&em_tree->lock); - /* - * If we're doing a ranged fsync and there are still modified extents - * in the list, we must run on the next fsync call as it might cover - * those extents (a full fsync or an fsync for other range). - */ - if (list_empty(&em_tree->modified_extents)) { - BTRFS_I(inode)->logged_trans = trans->transid; - BTRFS_I(inode)->last_log_commit = - BTRFS_I(inode)->last_sub_trans; - } - write_unlock(&em_tree->lock); + BTRFS_I(inode)->logged_trans = trans->transid; + BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->last_sub_trans; out_unlock: if (unlikely(err)) btrfs_put_logged_extents(&logged_list); -- cgit v1.2.3 From 3e1199dcad004a40b55297a4736ccd1b9b81a952 Mon Sep 17 00:00:00 2001 From: Milosz Tanski Date: Wed, 13 Aug 2014 12:58:26 -0400 Subject: FS-Cache: refcount becomes corrupt under vma pressure. In rare cases under heavy VMA pressure the ref count for a fscache cookie becomes corrupt. In this case we decrement ref count even if we fail before incrementing the refcount. FS-Cache: Assertion failed bnode-eca5f9c6/syslog 0 > 0 is false ------------[ cut here ]------------ kernel BUG at fs/fscache/cookie.c:519! invalid opcode: 0000 [#1] SMP Call Trace: [] __fscache_relinquish_cookie+0x50/0x220 [fscache] [] ceph_fscache_unregister_inode_cookie+0x3e/0x50 [ceph] [] ceph_destroy_inode+0x33/0x200 [ceph] [] ? __fsnotify_inode_delete+0xe/0x10 [] destroy_inode+0x3c/0x70 [] evict+0x111/0x180 [] iput+0x103/0x190 [] __dentry_kill+0x1c8/0x220 [] shrink_dentry_list+0xf1/0x250 [] prune_dcache_sb+0x4c/0x60 [] super_cache_scan+0xff/0x170 [] shrink_slab_node+0x140/0x2c0 [] shrink_slab+0x8a/0x130 [] balance_pgdat+0x3e2/0x5d0 [] kswapd+0x16a/0x4a0 [] ? __wake_up_sync+0x20/0x20 [] ? balance_pgdat+0x5d0/0x5d0 [] kthread+0xc9/0xe0 [] ? ftrace_raw_event_xen_mmu_release_ptpage+0x70/0x90 [] ? flush_kthread_worker+0xb0/0xb0 [] ret_from_fork+0x7c/0xb0 [] ? flush_kthread_worker+0xb0/0xb0 RIP [] __fscache_disable_cookie+0x1db/0x210 [fscache] RSP ---[ end trace 254d0d7c74a01f25 ]--- Signed-off-by: Milosz Tanski Signed-off-by: David Howells --- fs/fscache/page.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/fscache/page.c b/fs/fscache/page.c index 781ac7b0b53e..de33b3fccca6 100644 --- a/fs/fscache/page.c +++ b/fs/fscache/page.c @@ -198,7 +198,7 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) { struct fscache_operation *op; struct fscache_object *object; - bool wake_cookie; + bool wake_cookie = false; _enter("%p", cookie); @@ -228,15 +228,16 @@ int __fscache_attr_changed(struct fscache_cookie *cookie) __fscache_use_cookie(cookie); if (fscache_submit_exclusive_op(object, op) < 0) - goto nobufs; + goto nobufs_dec; spin_unlock(&cookie->lock); fscache_stat(&fscache_n_attr_changed_ok); fscache_put_operation(op); _leave(" = 0"); return 0; -nobufs: +nobufs_dec: wake_cookie = __fscache_unuse_cookie(cookie); +nobufs: spin_unlock(&cookie->lock); kfree(op); if (wake_cookie) -- cgit v1.2.3 From 696382f938d22597f4945865ed8e3f25e240cd41 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Wed, 27 Aug 2014 15:06:20 +0100 Subject: cachefiles: remove two unused pagevecs. These two have been unused since commit c4d6d8dbf335c7fa47341654a37c53a512b519bb CacheFiles: Fix the marking of cached pages in 3.8. Signed-off-by: NeilBrown Signed-off-by: David Howells --- fs/cachefiles/rdwr.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'fs') diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 4b1fb5ca65b8..25e745b8eb1b 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -151,7 +151,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op) struct cachefiles_one_read *monitor; struct cachefiles_object *object; struct fscache_retrieval *op; - struct pagevec pagevec; int error, max; op = container_of(_op, struct fscache_retrieval, op); @@ -160,8 +159,6 @@ static void cachefiles_read_copier(struct fscache_operation *_op) _enter("{ino=%lu}", object->backer->d_inode->i_ino); - pagevec_init(&pagevec, 0); - max = 8; spin_lock_irq(&object->work_lock); @@ -396,7 +393,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, { struct cachefiles_object *object; struct cachefiles_cache *cache; - struct pagevec pagevec; struct inode *inode; sector_t block0, block; unsigned shift; @@ -427,8 +423,6 @@ int cachefiles_read_or_alloc_page(struct fscache_retrieval *op, op->op.flags |= FSCACHE_OP_ASYNC; op->op.processor = cachefiles_read_copier; - pagevec_init(&pagevec, 0); - /* we assume the absence or presence of the first block is a good * enough indication for the page as a whole * - TODO: don't use bmap() for this as it is _not_ actually good -- cgit v1.2.3 From e2cf1f1cc7636bd860e47cd0ad6194da8975f8b5 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 17 Sep 2014 23:28:38 +0100 Subject: CacheFiles: Handle rename2 Not all filesystems now provide the rename i_op - ext4 for one - but rather provide the rename2 i_op. CacheFiles checks that the filesystem has rename and so will reject ext4 now with EPERM: CacheFiles: Failed to register: -1 Fix this by checking for rename2 as an alternative. The call to vfs_rename() actually handles selection of the appropriate function, so we needn't worry about that. Turning on debugging shows: [cachef] ==> cachefiles_get_directory(,,cache) [cachef] subdir -> ffff88000b22b778 positive [cachef] <== cachefiles_get_directory() = -1 [check] where -1 is EPERM. Signed-off-by: David Howells --- fs/cachefiles/namei.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 5bf2b41e66d3..83e9c94ca2cf 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -779,7 +779,8 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, !subdir->d_inode->i_op->lookup || !subdir->d_inode->i_op->mkdir || !subdir->d_inode->i_op->create || - !subdir->d_inode->i_op->rename || + (!subdir->d_inode->i_op->rename && + !subdir->d_inode->i_op->rename2) || !subdir->d_inode->i_op->rmdir || !subdir->d_inode->i_op->unlink) goto check_error; -- cgit v1.2.3 From 0f23ae74f589304bf33233f85737f4fd368549eb Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 18 Sep 2014 07:49:05 -0700 Subject: Revert "Btrfs: device_list_add() should not update list when mounted" This reverts commit b96de000bc8bc9688b3a2abea4332bd57648a49f. This commit is triggering failures to mount by subvolume id in some configurations. The main problem is how many different ways this scanning function is used, both for scanning while mounted and unmounted. A proper cleanup is too big for late rcs. For now, just revert the commit and we'll put a better fix into a later merge window. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 340a92d08e84..2c2d6d1d8eee 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -529,12 +529,12 @@ static noinline int device_list_add(const char *path, */ /* - * As of now don't allow update to btrfs_fs_device through - * the btrfs dev scan cli, after FS has been mounted. + * For now, we do allow update to btrfs_fs_device through the + * btrfs dev scan cli after FS has been mounted. We're still + * tracking a problem where systems fail mount by subvolume id + * when we reject replacement on a mounted FS. */ - if (fs_devices->opened) { - return -EBUSY; - } else { + if (!fs_devices->opened && found_transid < device->generation) { /* * That is if the FS is _not_ mounted and if you * are here, that means there is more than one @@ -542,8 +542,7 @@ static noinline int device_list_add(const char *path, * with larger generation number or the last-in if * generation are equal. */ - if (found_transid < device->generation) - return -EEXIST; + return -EEXIST; } name = rcu_string_strdup(path, GFP_NOFS); -- cgit v1.2.3 From 080af20cc945d110f9912d01cf6b66f94a375b8d Mon Sep 17 00:00:00 2001 From: Steve Dickson Date: Thu, 18 Sep 2014 09:13:17 -0400 Subject: NFSv4: nfs4_state_manager() vs. nfs_server_remove_lists() There is a race between nfs4_state_manager() and nfs_server_remove_lists() that happens during a nfsv3 mount. The v3 mount notices there is already a supper block so nfs_server_remove_lists() called which uses the nfs_client_lock spin lock to synchronize access to the client list. At the same time nfs4_state_manager() is running through the client list looking for work to do, using the same lock. When nfs4_state_manager() wins the race to the list, a v3 client pointer is found and not ignored properly which causes the panic. Moving some protocol checks before the state checking avoids the panic. CC: Stable Tree Signed-off-by: Steve Dickson Signed-off-by: Trond Myklebust --- fs/nfs/nfs4client.c | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 53e435a95260..ffdb28d86cf8 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -482,6 +482,16 @@ int nfs40_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + + if (pos->rpc_ops != new->rpc_ops) + continue; + + if (pos->cl_proto != new->cl_proto) + continue; + + if (pos->cl_minorversion != new->cl_minorversion) + continue; + /* If "pos" isn't marked ready, we can't trust the * remaining fields in "pos" */ if (pos->cl_cons_state > NFS_CS_READY) { @@ -501,15 +511,6 @@ int nfs40_walk_client_list(struct nfs_client *new, if (pos->cl_cons_state != NFS_CS_READY) continue; - if (pos->rpc_ops != new->rpc_ops) - continue; - - if (pos->cl_proto != new->cl_proto) - continue; - - if (pos->cl_minorversion != new->cl_minorversion) - continue; - if (pos->cl_clientid != new->cl_clientid) continue; @@ -622,6 +623,16 @@ int nfs41_walk_client_list(struct nfs_client *new, spin_lock(&nn->nfs_client_lock); list_for_each_entry(pos, &nn->nfs_client_list, cl_share_link) { + + if (pos->rpc_ops != new->rpc_ops) + continue; + + if (pos->cl_proto != new->cl_proto) + continue; + + if (pos->cl_minorversion != new->cl_minorversion) + continue; + /* If "pos" isn't marked ready, we can't trust the * remaining fields in "pos", especially the client * ID and serverowner fields. Wait for CREATE_SESSION @@ -647,15 +658,6 @@ int nfs41_walk_client_list(struct nfs_client *new, if (pos->cl_cons_state != NFS_CS_READY) continue; - if (pos->rpc_ops != new->rpc_ops) - continue; - - if (pos->cl_proto != new->cl_proto) - continue; - - if (pos->cl_minorversion != new->cl_minorversion) - continue; - if (!nfs4_match_clientids(pos, new)) continue; -- cgit v1.2.3 From cd9288ffaea4359d5cfe2b8d264911506aed26a4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 18 Sep 2014 11:51:32 -0400 Subject: NFSv4: Fix another bug in the close/open_downgrade code James Drew reports another bug whereby the NFS client is now sending an OPEN_DOWNGRADE in a situation where it should really have sent a CLOSE: the client is opening the file for O_RDWR, but then trying to do a downgrade to O_RDONLY, which is not allowed by the NFSv4 spec. Reported-by: James Drews Link: http://lkml.kernel.org/r/541AD7E5.8020409@engr.wisc.edu Fixes: aee7af356e15 (NFSv4: Fix problems with close in the presence...) Cc: stable@vger.kernel.org # 2.6.33+ Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ac2dd953fc18..6ca0c8e7a945 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2618,23 +2618,23 @@ static void nfs4_close_prepare(struct rpc_task *task, void *data) is_rdwr = test_bit(NFS_O_RDWR_STATE, &state->flags); is_rdonly = test_bit(NFS_O_RDONLY_STATE, &state->flags); is_wronly = test_bit(NFS_O_WRONLY_STATE, &state->flags); - /* Calculate the current open share mode */ - calldata->arg.fmode = 0; - if (is_rdonly || is_rdwr) - calldata->arg.fmode |= FMODE_READ; - if (is_wronly || is_rdwr) - calldata->arg.fmode |= FMODE_WRITE; /* Calculate the change in open mode */ + calldata->arg.fmode = 0; if (state->n_rdwr == 0) { - if (state->n_rdonly == 0) { - call_close |= is_rdonly || is_rdwr; - calldata->arg.fmode &= ~FMODE_READ; - } - if (state->n_wronly == 0) { - call_close |= is_wronly || is_rdwr; - calldata->arg.fmode &= ~FMODE_WRITE; - } - } + if (state->n_rdonly == 0) + call_close |= is_rdonly; + else if (is_rdonly) + calldata->arg.fmode |= FMODE_READ; + if (state->n_wronly == 0) + call_close |= is_wronly; + else if (is_wronly) + calldata->arg.fmode |= FMODE_WRITE; + } else if (is_rdwr) + calldata->arg.fmode |= FMODE_READ|FMODE_WRITE; + + if (calldata->arg.fmode == 0) + call_close |= is_rdwr; + if (!nfs4_valid_open_stateid(state)) call_close = 0; spin_unlock(&state->owner->so_lock); -- cgit v1.2.3 From f2d5a94436cc7cc0221b9a81bba2276a25187dd3 Mon Sep 17 00:00:00 2001 From: Anton Altaparmakov Date: Mon, 22 Sep 2014 01:53:03 +0100 Subject: Fix nasty 32-bit overflow bug in buffer i/o code. On 32-bit architectures, the legacy buffer_head functions are not always handling the sector number with the proper 64-bit types, and will thus fail on 4TB+ disks. Any code that uses __getblk() (and thus bread(), breadahead(), sb_bread(), sb_breadahead(), sb_getblk()), and calls it using a 64-bit block on a 32-bit arch (where "long" is 32-bit) causes an inifinite loop in __getblk_slow() with an infinite stream of errors logged to dmesg like this: __find_get_block_slow() failed. block=6740375944, b_blocknr=2445408648 b_state=0x00000020, b_size=512 device sda1 blocksize: 512 Note how in hex block is 0x191C1F988 and b_blocknr is 0x91C1F988 i.e. the top 32-bits are missing (in this case the 0x1 at the top). This is because grow_dev_page() is broken and has a 32-bit overflow due to shifting the page index value (a pgoff_t - which is just 32 bits on 32-bit architectures) left-shifted as the block number. But the top bits to get lost as the pgoff_t is not type cast to sector_t / 64-bit before the shift. This patch fixes this issue by type casting "index" to sector_t before doing the left shift. Note this is not a theoretical bug but has been seen in the field on a 4TiB hard drive with logical sector size 512 bytes. This patch has been verified to fix the infinite loop problem on 3.17-rc5 kernel using a 4TB disk image mounted using "-o loop". Without this patch doing a "find /nt" where /nt is an NTFS volume causes the inifinite loop 100% reproducibly whilst with the patch it works fine as expected. Signed-off-by: Anton Altaparmakov Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- fs/buffer.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 8f05111bbb8b..3588a80854b2 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1022,7 +1022,8 @@ grow_dev_page(struct block_device *bdev, sector_t block, bh = page_buffers(page); if (bh->b_size == size) { end_block = init_page_buffers(page, bdev, - index << sizebits, size); + (sector_t)index << sizebits, + size); goto done; } if (!try_to_free_buffers(page)) @@ -1043,7 +1044,8 @@ grow_dev_page(struct block_device *bdev, sector_t block, */ spin_lock(&inode->i_mapping->private_lock); link_dev_buffers(page, bh); - end_block = init_page_buffers(page, bdev, index << sizebits, size); + end_block = init_page_buffers(page, bdev, (sector_t)index << sizebits, + size); spin_unlock(&inode->i_mapping->private_lock); done: ret = (block < end_block) ? 1 : -ENXIO; -- cgit v1.2.3