From 4f38e4aadcca319650c0de31edf7193d7c384de2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 1 Dec 2011 16:31:34 -0500 Subject: NFSv4: Don't error if we handled it in nfs4_recovery_handle_error If we handled an error condition, then nfs4_recovery_handle_error should return '0' so that the state recovery thread can continue. Also ensure that nfs4_check_lease() continues to abort if we haven't got any credentials by having it return ENOKEY (which is not handled). Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 39914be40b03..1da95e3947bb 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1350,12 +1350,14 @@ static void nfs4_warn_keyexpired(const char *s) static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) { switch (error) { + case 0: + break; case -NFS4ERR_CB_PATH_DOWN: nfs_handle_cb_pathdown(clp); - return 0; + break; case -NFS4ERR_NO_GRACE: nfs4_state_end_reclaim_reboot(clp); - return 0; + break; case -NFS4ERR_STALE_CLIENTID: case -NFS4ERR_LEASE_MOVED: set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state); @@ -1375,13 +1377,15 @@ static int nfs4_recovery_handle_error(struct nfs_client *clp, int error) case -NFS4ERR_SEQ_MISORDERED: set_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state); /* Zero session reset errors */ - return 0; + break; case -EKEYEXPIRED: /* Nothing we can do */ nfs4_warn_keyexpired(clp->cl_hostname); - return 0; + break; + default: + return error; } - return error; + return 0; } static int nfs4_do_reclaim(struct nfs_client *clp, const struct nfs4_state_recovery_ops *ops) @@ -1428,7 +1432,7 @@ static int nfs4_check_lease(struct nfs_client *clp) struct rpc_cred *cred; const struct nfs4_state_maintenance_ops *ops = clp->cl_mvops->state_renewal_ops; - int status = -NFS4ERR_EXPIRED; + int status; /* Is the client already known to have an expired lease? */ if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) @@ -1438,6 +1442,7 @@ static int nfs4_check_lease(struct nfs_client *clp) spin_unlock(&clp->cl_lock); if (cred == NULL) { cred = nfs4_get_setclientid_cred(clp); + status = -ENOKEY; if (cred == NULL) goto out; } @@ -1662,10 +1667,10 @@ static void nfs4_state_manager(struct nfs_client *clp) if (test_and_clear_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state)) { status = nfs4_check_lease(clp); + if (status < 0) + goto out_error; if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state)) continue; - if (status < 0 && status != -NFS4ERR_CB_PATH_DOWN) - goto out_error; } /* Initialize or reset the session */ -- cgit v1.2.3 From 111d489f0fb431f4ae85d96851fbf8d3248c09d8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 1 Dec 2011 16:37:42 -0500 Subject: NFSv4.1: Ensure that we handle _all_ SEQUENCE status bits. Currently, the code assumes that the SEQUENCE status bits are mutually exclusive. They are not... Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org [>= 2.6.34] --- fs/nfs/nfs4state.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 1da95e3947bb..da8d73ed9e0f 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1530,16 +1530,16 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags) { if (!flags) return; - else if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) + if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED) nfs41_handle_server_reboot(clp); - else if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | + if (flags & (SEQ4_STATUS_EXPIRED_ALL_STATE_REVOKED | SEQ4_STATUS_EXPIRED_SOME_STATE_REVOKED | SEQ4_STATUS_ADMIN_STATE_REVOKED | SEQ4_STATUS_LEASE_MOVED)) nfs41_handle_state_revoked(clp); - else if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) + if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED) nfs41_handle_recallable_state_revoked(clp); - else if (flags & (SEQ4_STATUS_CB_PATH_DOWN | + if (flags & (SEQ4_STATUS_CB_PATH_DOWN | SEQ4_STATUS_BACKCHANNEL_FAULT | SEQ4_STATUS_CB_PATH_DOWN_SESSION)) nfs41_handle_cb_path_down(clp); -- cgit v1.2.3 From 4b44b40e04a758e2242ff4a3f7c15982801ec8bc Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 9 Dec 2011 16:31:52 -0500 Subject: NFSv4: Ensure correct locking when accessing the 'lock_states' list There are currently 2 places in the state recovery code, where we do not take sufficient precautions before accessing the state->lock_states. In both cases, we should be holding the state->state_lock. Reported-by: Pascal Bouchareine Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index da8d73ed9e0f..6a7107ae6b72 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1156,11 +1156,13 @@ restart: if (status >= 0) { status = nfs4_reclaim_locks(state, ops); if (status >= 0) { + spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { if (!(lock->ls_flags & NFS_LOCK_INITIALIZED)) printk("%s: Lock reclaim failed!\n", __func__); } + spin_unlock(&state->state_lock); nfs4_put_open_state(state); goto restart; } @@ -1224,10 +1226,12 @@ static void nfs4_clear_open_state(struct nfs4_state *state) clear_bit(NFS_O_RDONLY_STATE, &state->flags); clear_bit(NFS_O_WRONLY_STATE, &state->flags); clear_bit(NFS_O_RDWR_STATE, &state->flags); + spin_lock(&state->state_lock); list_for_each_entry(lock, &state->lock_states, ls_locks) { lock->ls_seqid.flags = 0; lock->ls_flags &= ~NFS_LOCK_INITIALIZED; } + spin_unlock(&state->state_lock); } static void nfs4_reset_seqids(struct nfs_server *server, -- cgit v1.2.3 From 652f89f64fabcdae9143ee2b4253cfa838fb0279 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 9 Dec 2011 19:05:58 -0500 Subject: NFSv4: Do not accept delegated opens when a delegation recall is in effect ...and report the servers that try to return a delegation when the client is using the CLAIM_DELEG_CUR open mode. That behaviour is explicitly forbidden in RFC3530. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index be2bbac13817..d9f4d78c3413 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -39,6 +39,8 @@ #include #include #include +#include +#include #include #include #include @@ -894,6 +896,8 @@ out: static int can_open_delegated(struct nfs_delegation *delegation, fmode_t fmode) { + if (delegation == NULL) + return 0; if ((delegation->type & fmode) != fmode) return 0; if (test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags)) @@ -1036,8 +1040,7 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) } rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); - if (delegation == NULL || - !can_open_delegated(delegation, fmode)) { + if (!can_open_delegated(delegation, fmode)) { rcu_read_unlock(); break; } @@ -1091,7 +1094,12 @@ static struct nfs4_state *nfs4_opendata_to_nfs4_state(struct nfs4_opendata *data if (delegation) delegation_flags = delegation->flags; rcu_read_unlock(); - if ((delegation_flags & 1UL<o_arg.claim == NFS4_OPEN_CLAIM_DELEGATE_CUR) { + pr_err_ratelimited("NFS: Broken NFSv4 server %s is " + "returning a delegation for " + "OPEN(CLAIM_DELEGATE_CUR)\n", + NFS_CLIENT(inode)->cl_server); + } else if ((delegation_flags & 1UL<inode, data->owner->so_cred, &data->o_res); @@ -1423,11 +1431,9 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) goto out_no_action; rcu_read_lock(); delegation = rcu_dereference(NFS_I(data->state->inode)->delegation); - if (delegation != NULL && - test_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags) == 0) { - rcu_read_unlock(); - goto out_no_action; - } + if (data->o_arg.claim != NFS4_OPEN_CLAIM_DELEGATE_CUR && + can_open_delegated(delegation, data->o_arg.fmode)) + goto unlock_no_action; rcu_read_unlock(); } /* Update sequence id. */ @@ -1444,6 +1450,8 @@ static void nfs4_open_prepare(struct rpc_task *task, void *calldata) return; rpc_call_start(task); return; +unlock_no_action: + rcu_read_unlock(); out_no_action: task->tk_action = NULL; -- cgit v1.2.3 From 6c52961743f38747401b47127b82159ab6d8a7a4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 15 Dec 2011 18:38:10 -0500 Subject: NFS: Fix a regression in nfs_file_llseek() After commit 06222e491e663dac939f04b125c9dc52126a75c4 (fs: handle SEEK_HOLE/SEEK_DATA properly in all fs's that define their own llseek) the behaviour of llseek() was changed so that it always revalidates the file size. The bug appears to be due to a logic error in the afore-mentioned commit, which always evaluates to 'true'. Reported-by: Roel Kluin Signed-off-by: Trond Myklebust Cc: stable@vger.kernel.org [>=3.1] --- fs/nfs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/file.c b/fs/nfs/file.c index eca56d4b39c0..606ef0f20aed 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -147,7 +147,7 @@ static loff_t nfs_file_llseek(struct file *filp, loff_t offset, int origin) * origin == SEEK_END || SEEK_DATA || SEEK_HOLE => we must revalidate * the cached file length */ - if (origin != SEEK_SET || origin != SEEK_CUR) { + if (origin != SEEK_SET && origin != SEEK_CUR) { struct inode *inode = filp->f_mapping->host; int retval = nfs_revalidate_file_size(inode, filp); -- cgit v1.2.3 From b3bba872ddb0320a7ecb54decae53c13ceb2ed4c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 8 Dec 2011 16:53:54 -0600 Subject: writeback: show writeback reason with __print_symbolic This makes the binary trace understandable by trace-cmd. CC: Dave Chinner CC: Curt Wohlgemuth CC: Steven Rostedt Signed-off-by: Wu Fengguang --- fs/fs-writeback.c | 11 ----------- include/trace/events/writeback.h | 15 +++++++++++++-- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index ac86f8b3e3cb..517f211a3bd4 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -47,17 +47,6 @@ struct wb_writeback_work { struct completion *done; /* set if the caller waits */ }; -const char *wb_reason_name[] = { - [WB_REASON_BACKGROUND] = "background", - [WB_REASON_TRY_TO_FREE_PAGES] = "try_to_free_pages", - [WB_REASON_SYNC] = "sync", - [WB_REASON_PERIODIC] = "periodic", - [WB_REASON_LAPTOP_TIMER] = "laptop_timer", - [WB_REASON_FREE_MORE_MEM] = "free_more_memory", - [WB_REASON_FS_FREE_SPACE] = "fs_free_space", - [WB_REASON_FORKER_THREAD] = "forker_thread" -}; - /* * Include the creation of the trace points after defining the * wb_writeback_work structure so that the definition remains local to this diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index b99caa8b780c..99d1d0decf88 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -21,6 +21,16 @@ {I_REFERENCED, "I_REFERENCED"} \ ) +#define WB_WORK_REASON \ + {WB_REASON_BACKGROUND, "background"}, \ + {WB_REASON_TRY_TO_FREE_PAGES, "try_to_free_pages"}, \ + {WB_REASON_SYNC, "sync"}, \ + {WB_REASON_PERIODIC, "periodic"}, \ + {WB_REASON_LAPTOP_TIMER, "laptop_timer"}, \ + {WB_REASON_FREE_MORE_MEM, "free_more_memory"}, \ + {WB_REASON_FS_FREE_SPACE, "fs_free_space"}, \ + {WB_REASON_FORKER_THREAD, "forker_thread"} + struct wb_writeback_work; DECLARE_EVENT_CLASS(writeback_work_class, @@ -55,7 +65,7 @@ DECLARE_EVENT_CLASS(writeback_work_class, __entry->for_kupdate, __entry->range_cyclic, __entry->for_background, - wb_reason_name[__entry->reason] + __print_symbolic(__entry->reason, WB_WORK_REASON) ) ); #define DEFINE_WRITEBACK_WORK_EVENT(name) \ @@ -184,7 +194,8 @@ TRACE_EVENT(writeback_queue_io, __entry->older, /* older_than_this in jiffies */ __entry->age, /* older_than_this in relative milliseconds */ __entry->moved, - wb_reason_name[__entry->reason]) + __print_symbolic(__entry->reason, WB_WORK_REASON) + ) ); TRACE_EVENT(global_dirty_state, -- cgit v1.2.3 From 695c60f21c69e525a89279a5f35bae4ff237afbc Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Mon, 19 Dec 2011 17:11:55 -0800 Subject: nilfs2: unbreak compat ioctl commit 828b1c50ae ("nilfs2: add compat ioctl") incidentally broke all other NILFS compat ioctls. Make them work again. Signed-off-by: Thomas Meyer Signed-off-by: Ryusuke Konishi Tested-by: Ryusuke Konishi Cc: [3.0+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/ioctl.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'fs') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 41d6743d303c..3e654273cfc2 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -842,6 +842,19 @@ long nilfs_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; + case NILFS_IOCTL_CHANGE_CPMODE: + case NILFS_IOCTL_DELETE_CHECKPOINT: + case NILFS_IOCTL_GET_CPINFO: + case NILFS_IOCTL_GET_CPSTAT: + case NILFS_IOCTL_GET_SUINFO: + case NILFS_IOCTL_GET_SUSTAT: + case NILFS_IOCTL_GET_VINFO: + case NILFS_IOCTL_GET_BDESCS: + case NILFS_IOCTL_CLEAN_SEGMENTS: + case NILFS_IOCTL_SYNC: + case NILFS_IOCTL_RESIZE: + case NILFS_IOCTL_SET_ALLOC_RANGE: + break; default: return -ENOIOCTLCMD; } -- cgit v1.2.3 From 481fe17e973fb97aa3edf17c69557afe88d8334f Mon Sep 17 00:00:00 2001 From: Haogang Chen Date: Mon, 19 Dec 2011 17:11:56 -0800 Subject: nilfs2: potential integer overflow in nilfs_ioctl_clean_segments() There is a potential integer overflow in nilfs_ioctl_clean_segments(). When a large argv[n].v_nmembs is passed from the userspace, the subsequent call to vmalloc() will allocate a buffer smaller than expected, which leads to out-of-bound access in nilfs_ioctl_move_blocks() and lfs_clean_segments(). The following check does not prevent the overflow because nsegs is also controlled by the userspace and could be very large. if (argv[n].v_nmembs > nsegs * nilfs->ns_blocks_per_segment) goto out_free; This patch clamps argv[n].v_nmembs to UINT_MAX / argv[n].v_size, and returns -EINVAL when overflow. Signed-off-by: Haogang Chen Signed-off-by: Ryusuke Konishi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/nilfs2/ioctl.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 3e654273cfc2..ac258beeda3c 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -625,6 +625,9 @@ static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp, if (argv[n].v_nmembs > nsegs * nilfs->ns_blocks_per_segment) goto out_free; + if (argv[n].v_nmembs >= UINT_MAX / argv[n].v_size) + goto out_free; + len = argv[n].v_size * argv[n].v_nmembs; base = (void __user *)(unsigned long)argv[n].v_base; if (len == 0) { -- cgit v1.2.3 From 8d532b2afb2eacc84588db709ec280a3d1219be3 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 23 Dec 2011 07:53:00 -0500 Subject: Btrfs: fix worker lock misuse in find_worker Dan Carpenter noticed that we were doing a double unlock on the worker lock, and sometimes picking a worker thread without the lock held. This fixes both errors. Signed-off-by: Chris Mason Reported-by: Dan Carpenter --- fs/btrfs/async-thread.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index cb97174e2366..0b394580d860 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -563,8 +563,8 @@ static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) struct list_head *fallback; int ret; -again: spin_lock_irqsave(&workers->lock, flags); +again: worker = next_worker(workers); if (!worker) { @@ -579,6 +579,7 @@ again: spin_unlock_irqrestore(&workers->lock, flags); /* we're below the limit, start another worker */ ret = __btrfs_start_workers(workers); + spin_lock_irqsave(&workers->lock, flags); if (ret) goto fallback; goto again; -- cgit v1.2.3 From 08c422c27f855d27b0b3d9fa30ebd938d4ae6f1f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 23 Dec 2011 07:58:13 -0500 Subject: Btrfs: call d_instantiate after all ops are setup This closes races where btrfs is calling d_instantiate too soon during inode creation. All of the callers of btrfs_add_nondir are updated to instantiate after the inode is fully setup in memory. Signed-off-by: Al Viro Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 740e67bbe249..13b0542015ff 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4590,10 +4590,6 @@ static int btrfs_add_nondir(struct btrfs_trans_handle *trans, int err = btrfs_add_link(trans, dir, inode, dentry->d_name.name, dentry->d_name.len, backref, index); - if (!err) { - d_instantiate(dentry, inode); - return 0; - } if (err > 0) err = -EEXIST; return err; @@ -4655,6 +4651,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, else { init_special_inode(inode, inode->i_mode, rdev); btrfs_update_inode(trans, root, inode); + d_instantiate(dentry, inode); } out_unlock: nr = trans->blocks_used; @@ -4722,6 +4719,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &btrfs_aops; inode->i_mapping->backing_dev_info = &root->fs_info->bdi; BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + d_instantiate(dentry, inode); } out_unlock: nr = trans->blocks_used; @@ -4779,6 +4777,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *parent = dentry->d_parent; err = btrfs_update_inode(trans, root, inode); BUG_ON(err); + d_instantiate(dentry, inode); btrfs_log_new_name(trans, inode, NULL, parent); } @@ -7245,6 +7244,8 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, drop_inode = 1; out_unlock: + if (!err) + d_instantiate(dentry, inode); nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); if (drop_inode) { -- cgit v1.2.3 From 0b8fd3033c308e4088760aa1d38ce77197b4e074 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 18 Dec 2011 15:49:55 +0000 Subject: xfs: log the inode in ->write_inode calls for kupdate If the writeback code writes back an inode because it has expired we currently use the non-blockin ->write_inode path. This means any inode that is pinned is skipped. With delayed logging and a workload that has very little log traffic otherwise it is very likely that an inode that gets constantly written to is always pinned, and thus we keep refusing to write it. The VM writeback code at that point redirties it and doesn't try to write it again for another 30 seconds. This means under certain scenarious time based metadata writeback never happens. Fix this by calling into xfs_log_inode for kupdate in addition to data integrity syncs, and thus transfer the inode to the log ASAP. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Tested-by: Mark Tinguely Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 3eca58f51ae9..1add17ca3350 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -905,7 +905,7 @@ xfs_fs_write_inode( if (!ip->i_update_core) return 0; - if (wbc->sync_mode == WB_SYNC_ALL) { + if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { /* * Make sure the inode has made it it into the log. Instead * of forcing it all the way to stable storage using a -- cgit v1.2.3 From be4f1ac828776bbc7868a68b465cd8eedb733cfd Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 20 Dec 2011 20:08:41 +0000 Subject: xfs: log all dirty inodes in xfs_fs_sync_fs Since Linux 2.6.36 the writeback code has introduces various measures for live lock prevention during sync(). Unfortunately some of these are actively harmful for the XFS model, where the inode gets marked dirty for metadata from the data I/O handler. The older_than_this checks that are now more strictly enforced since writeback: avoid livelocking WB_SYNC_ALL writeback by only calling into __writeback_inodes_sb and thus only sampling the current cut off time once. But on a slow enough devices the previous asynchronous sync pass might not have fully completed yet, and thus XFS might mark metadata dirty only after that sampling of the cut off time for the blocking pass already happened. I have not myself reproduced this myself on a real system, but by introducing artificial delay into the XFS I/O completion workqueues it can be reproduced easily. Fix this by iterating over all XFS inodes in ->sync_fs and log all that are dirty. This might log inode that only got redirtied after the previous pass, but given how cheap delayed logging of inodes is it isn't a major concern for performance. Signed-off-by: Christoph Hellwig Reviewed-by: Dave Chinner Tested-by: Mark Tinguely Reviewed-by: Mark Tinguely Signed-off-by: Ben Myers --- fs/xfs/xfs_super.c | 28 ++++------------------------ fs/xfs/xfs_sync.c | 36 ++++++++++++++++++++++++++++++++++++ fs/xfs/xfs_sync.h | 2 ++ 3 files changed, 42 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 1add17ca3350..8a899496fd5f 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -868,27 +868,6 @@ xfs_fs_dirty_inode( XFS_I(inode)->i_update_core = 1; } -STATIC int -xfs_log_inode( - struct xfs_inode *ip) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; - int error; - - tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); - error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); - if (error) { - xfs_trans_cancel(tp, 0); - return error; - } - - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); - xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); - return xfs_trans_commit(tp, 0); -} - STATIC int xfs_fs_write_inode( struct inode *inode, @@ -902,8 +881,6 @@ xfs_fs_write_inode( if (XFS_FORCED_SHUTDOWN(mp)) return -XFS_ERROR(EIO); - if (!ip->i_update_core) - return 0; if (wbc->sync_mode == WB_SYNC_ALL || wbc->for_kupdate) { /* @@ -913,11 +890,14 @@ xfs_fs_write_inode( * ->sync_fs call do that for thus, which reduces the number * of synchronous log forces dramatically. */ - error = xfs_log_inode(ip); + error = xfs_log_dirty_inode(ip, NULL, 0); if (error) goto out; return 0; } else { + if (!ip->i_update_core) + return 0; + /* * We make this non-blocking if the inode is contended, return * EAGAIN to indicate to the caller that they did not succeed. diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c index be5c51d8f757..f0994aedcd15 100644 --- a/fs/xfs/xfs_sync.c +++ b/fs/xfs/xfs_sync.c @@ -336,6 +336,32 @@ xfs_sync_fsdata( return error; } +int +xfs_log_dirty_inode( + struct xfs_inode *ip, + struct xfs_perag *pag, + int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + int error; + + if (!ip->i_update_core) + return 0; + + tp = xfs_trans_alloc(mp, XFS_TRANS_FSYNC_TS); + error = xfs_trans_reserve(tp, 0, XFS_FSYNC_TS_LOG_RES(mp), 0, 0, 0); + if (error) { + xfs_trans_cancel(tp, 0); + return error; + } + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL); + xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + return xfs_trans_commit(tp, 0); +} + /* * When remounting a filesystem read-only or freezing the filesystem, we have * two phases to execute. This first phase is syncing the data before we @@ -359,6 +385,16 @@ xfs_quiesce_data( { int error, error2 = 0; + /* + * Log all pending size and timestamp updates. The vfs writeback + * code is supposed to do this, but due to its overagressive + * livelock detection it will skip inodes where appending writes + * were written out in the first non-blocking sync phase if their + * completion took long enough that it happened after taking the + * timestamp for the cut-off in the blocking phase. + */ + xfs_inode_ag_iterator(mp, xfs_log_dirty_inode, 0); + xfs_qm_sync(mp, SYNC_TRYLOCK); xfs_qm_sync(mp, SYNC_WAIT); diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h index 941202e7ac6e..fa965479d788 100644 --- a/fs/xfs/xfs_sync.h +++ b/fs/xfs/xfs_sync.h @@ -34,6 +34,8 @@ void xfs_quiesce_attr(struct xfs_mount *mp); void xfs_flush_inodes(struct xfs_inode *ip); +int xfs_log_dirty_inode(struct xfs_inode *ip, struct xfs_perag *pag, int flags); + int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); int xfs_reclaim_inodes_count(struct xfs_mount *mp); void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan); -- cgit v1.2.3 From 6d4b9e38d3980826abccfbd90e95bf4bd41b8dd2 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 26 Dec 2011 10:25:26 -0800 Subject: vfs: fix handling of lock allocation failure in lease-break case Bruce Fields notes that commit 778fc546f749 ("locks: fix tracking of inprogress lease breaks") introduced a possible error pointer dereference on failure to allocate memory. locks_conflict() will dereference the passed-in new lease lock structure that may be an error pointer. This means an open (without O_NONBLOCK set) on a file with a lease applied (generally only done when Samba or nfsd (with v4) is running) could crash if a kmalloc() fails. So instead of playing games with IS_ERROR() all over the place, just check the allocation failure early. That makes the code more straightforward, and avoids this possible bad pointer dereference. Based-on-patch-by: J. Bruce Fields Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/locks.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/locks.c b/fs/locks.c index 3b0d05dcd7c1..637694bf3a03 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1205,6 +1205,8 @@ int __break_lease(struct inode *inode, unsigned int mode) int want_write = (mode & O_ACCMODE) != O_RDONLY; new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); + if (IS_ERR(new_fl)) + return PTR_ERR(new_fl); lock_flocks(); @@ -1221,12 +1223,6 @@ int __break_lease(struct inode *inode, unsigned int mode) if (fl->fl_owner == current->files) i_have_this_lease = 1; - if (IS_ERR(new_fl) && !i_have_this_lease - && ((mode & O_NONBLOCK) == 0)) { - error = PTR_ERR(new_fl); - goto out; - } - break_time = 0; if (lease_break_time > 0) { break_time = jiffies + lease_break_time * HZ; @@ -1284,8 +1280,7 @@ restart: out: unlock_flocks(); - if (!IS_ERR(new_fl)) - locks_free_lock(new_fl); + locks_free_lock(new_fl); return error; } -- cgit v1.2.3 From a4d46363ce96c8fd7534c6f79051c78b52464132 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Thu, 29 Dec 2011 08:05:14 -0800 Subject: ceph: disable use of dcache for readdir etc. Ceph attempts to use the dcache to satisfy negative lookups and readdir when the entire directory contents are in cache. Disable this behavior until lingering bugs in this code are shaken out; we'll re-enable these hooks once things are fully stable. Signed-off-by: Sage Weil --- fs/ceph/dir.c | 29 +++-------------------------- 1 file changed, 3 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 3eeb97661262..98954003a8d3 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1094,42 +1094,19 @@ static int ceph_snapdir_d_revalidate(struct dentry *dentry, /* * Set/clear/test dir complete flag on the dir's dentry. */ -static struct dentry * __d_find_any_alias(struct inode *inode) -{ - struct dentry *alias; - - if (list_empty(&inode->i_dentry)) - return NULL; - alias = list_first_entry(&inode->i_dentry, struct dentry, d_alias); - return alias; -} - void ceph_dir_set_complete(struct inode *inode) { - struct dentry *dentry = __d_find_any_alias(inode); - - if (dentry && ceph_dentry(dentry)) { - dout(" marking %p (%p) complete\n", inode, dentry); - set_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); - } + /* not yet implemented */ } void ceph_dir_clear_complete(struct inode *inode) { - struct dentry *dentry = __d_find_any_alias(inode); - - if (dentry && ceph_dentry(dentry)) { - dout(" marking %p (%p) NOT complete\n", inode, dentry); - clear_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); - } + /* not yet implemented */ } bool ceph_dir_test_complete(struct inode *inode) { - struct dentry *dentry = __d_find_any_alias(inode); - - if (dentry && ceph_dentry(dentry)) - return test_bit(CEPH_D_COMPLETE, &ceph_dentry(dentry)->flags); + /* not yet implemented */ return false; } -- cgit v1.2.3 From 34845636a184f3be91a531098192592cbe6db587 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Wed, 28 Dec 2011 15:57:15 -0800 Subject: procfs: do not confuse jiffies with cputime64_t Commit 2a95ea6c0d129b4 ("procfs: do not overflow get_{idle,iowait}_time for nohz") did not take into account that one some architectures jiffies and cputime use different units. This causes get_idle_time() to return numbers in the wrong units, making the idle time fields in /proc/stat wrong. Instead of converting the usec value returned by get_cpu_{idle,iowait}_time_us to units of jiffies, use the new function usecs_to_cputime64 to convert it to the correct unit of cputime64_t. Signed-off-by: Andreas Schwab Acked-by: Michal Hocko Cc: Arnd Bergmann Cc: "Artem S. Tashkinov" Cc: Dave Jones Cc: Alexey Dobriyan Cc: Thomas Gleixner Cc: "Luck, Tony" Cc: Benjamin Herrenschmidt Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/cputime.h | 1 + arch/powerpc/include/asm/cputime.h | 2 ++ arch/s390/include/asm/cputime.h | 2 ++ fs/proc/stat.c | 4 ++-- include/asm-generic/cputime.h | 1 + 5 files changed, 8 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/arch/ia64/include/asm/cputime.h b/arch/ia64/include/asm/cputime.h index 6073b187528a..5a274af31b2b 100644 --- a/arch/ia64/include/asm/cputime.h +++ b/arch/ia64/include/asm/cputime.h @@ -60,6 +60,7 @@ typedef u64 cputime64_t; */ #define cputime_to_usecs(__ct) ((__ct) / NSEC_PER_USEC) #define usecs_to_cputime(__usecs) ((__usecs) * NSEC_PER_USEC) +#define usecs_to_cputime64(__usecs) usecs_to_cputime(__usecs) /* * Convert cputime <-> seconds diff --git a/arch/powerpc/include/asm/cputime.h b/arch/powerpc/include/asm/cputime.h index 1cf20bdfbeca..98b7c4b49c9d 100644 --- a/arch/powerpc/include/asm/cputime.h +++ b/arch/powerpc/include/asm/cputime.h @@ -150,6 +150,8 @@ static inline cputime_t usecs_to_cputime(const unsigned long us) return ct; } +#define usecs_to_cputime64(us) usecs_to_cputime(us) + /* * Convert cputime <-> seconds */ diff --git a/arch/s390/include/asm/cputime.h b/arch/s390/include/asm/cputime.h index 081434878296..b9acaaa175d8 100644 --- a/arch/s390/include/asm/cputime.h +++ b/arch/s390/include/asm/cputime.h @@ -87,6 +87,8 @@ usecs_to_cputime(const unsigned int m) return (cputime_t) m * 4096; } +#define usecs_to_cputime64(m) usecs_to_cputime(m) + /* * Convert cputime to milliseconds and back. */ diff --git a/fs/proc/stat.c b/fs/proc/stat.c index 2a30d67dd6b8..0855e6f20391 100644 --- a/fs/proc/stat.c +++ b/fs/proc/stat.c @@ -32,7 +32,7 @@ static cputime64_t get_idle_time(int cpu) idle = kstat_cpu(cpu).cpustat.idle; idle = cputime64_add(idle, arch_idle_time(cpu)); } else - idle = nsecs_to_jiffies64(1000 * idle_time); + idle = usecs_to_cputime64(idle_time); return idle; } @@ -46,7 +46,7 @@ static cputime64_t get_iowait_time(int cpu) /* !NO_HZ so we can rely on cpustat.iowait */ iowait = kstat_cpu(cpu).cpustat.iowait; else - iowait = nsecs_to_jiffies64(1000 * iowait_time); + iowait = usecs_to_cputime64(iowait_time); return iowait; } diff --git a/include/asm-generic/cputime.h b/include/asm-generic/cputime.h index 62ce6823c0f2..12a1764f612b 100644 --- a/include/asm-generic/cputime.h +++ b/include/asm-generic/cputime.h @@ -40,6 +40,7 @@ typedef u64 cputime64_t; */ #define cputime_to_usecs(__ct) jiffies_to_usecs(__ct) #define usecs_to_cputime(__msecs) usecs_to_jiffies(__msecs) +#define usecs_to_cputime64(__msecs) nsecs_to_jiffies64((__msecs) * 1000) /* * Convert cputime to seconds and back. -- cgit v1.2.3