From 8b712cd58adfe6aeeb0be4ecc011dc35620719e7 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Tue, 7 Jul 2009 17:22:12 -0400 Subject: ocfs2: Fixup orphan scan cleanup after failed mount If the mount fails for any reason, ocfs2_dismount_volume calls ocfs2_orphan_scan_stop. It requires that ocfs2_orphan_scan_init be called to setup the mutex and work queues, but that doesn't happen if the mount has failed and we oops accessing an uninitialized work queue. This patch splits the init and startup of the orphan scan, eliminating the oops. Signed-off-by: Jeff Mahoney Signed-off-by: Joel Becker --- fs/ocfs2/journal.c | 8 +++++++- fs/ocfs2/journal.h | 1 + fs/ocfs2/super.c | 4 +++- 3 files changed, 11 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f033760ecbea..c48b93ac6b65 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -1954,10 +1954,16 @@ void ocfs2_orphan_scan_init(struct ocfs2_super *osb) os->os_osb = osb; os->os_count = 0; os->os_seqno = 0; - os->os_scantime = CURRENT_TIME; mutex_init(&os->os_lock); INIT_DELAYED_WORK(&os->os_orphan_scan_work, ocfs2_orphan_scan_work); +} +void ocfs2_orphan_scan_start(struct ocfs2_super *osb) +{ + struct ocfs2_orphan_scan *os; + + os = &osb->osb_orphan_scan; + os->os_scantime = CURRENT_TIME; if (ocfs2_is_hard_readonly(osb) || ocfs2_mount_local(osb)) atomic_set(&os->os_state, ORPHAN_SCAN_INACTIVE); else { diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 5432c7f79cc6..81e8abcd246e 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -145,6 +145,7 @@ static inline void ocfs2_inode_set_new(struct ocfs2_super *osb, /* Exported only for the journal struct init code in super.c. Do not call. */ void ocfs2_orphan_scan_init(struct ocfs2_super *osb); +void ocfs2_orphan_scan_start(struct ocfs2_super *osb); void ocfs2_orphan_scan_stop(struct ocfs2_super *osb); void ocfs2_orphan_scan_exit(struct ocfs2_super *osb); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 7efb349fb9bd..63af2e36d834 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1182,7 +1182,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) wake_up(&osb->osb_mount_event); /* Start this when the mount is almost sure of being successful */ - ocfs2_orphan_scan_init(osb); + ocfs2_orphan_scan_start(osb); mlog_exit(status); return status; @@ -1981,6 +1981,8 @@ static int ocfs2_initialize_super(struct super_block *sb, snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u", MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); + ocfs2_orphan_scan_init(osb); + status = ocfs2_recovery_init(osb); if (status) { mlog(ML_ERROR, "Unable to initialize recovery state\n"); -- cgit v1.2.3 From 17ae26b669886efe237b77439e43eb390fceb119 Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Tue, 7 Jul 2009 15:51:40 +0800 Subject: ocfs2: trivial fix for s/migrate/migration/ in dlmrecovery.c logging in dlmrecovery.c:1121, replace 'migrate' to 'migration' to keep the consistency by comparing to other lines with the similar log info in the same file. Signed-off-by: Jeff Liu Signed-off-by: Joel Becker --- fs/ocfs2/dlm/dlmrecovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index bcb9260c3735..43e6e3280569 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1118,7 +1118,7 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm, mlog(0, "%s:%.*s: sending mig lockres (%s) to %u\n", dlm->name, res->lockname.len, res->lockname.name, - orig_flags & DLM_MRES_MIGRATION ? "migrate" : "recovery", + orig_flags & DLM_MRES_MIGRATION ? "migration" : "recovery", send_to); /* send it */ -- cgit v1.2.3 From 812e7a6a43fc34bc8f70c2b80db4ea5997d66ea8 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Fri, 10 Jul 2009 13:26:04 +0800 Subject: ocfs2: log the actual return value of ocfs2_file_aio_write() in ocfs2_file_aio_write(), log_exit() could don't log the value which is really returned. this patch fixes it. Signed-off-by: Wengang Wang Signed-off-by: Joel Becker --- fs/ocfs2/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 62442e413a00..a49fa44aea1f 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1918,8 +1918,10 @@ out_sems: mutex_unlock(&inode->i_mutex); + if (written) + ret = written; mlog_exit(ret); - return written ? written : ret; + return ret; } static int ocfs2_splice_to_file(struct pipe_inode_info *pipe, -- cgit v1.2.3 From cefcb800fa9536bb6f29485c53e6c82a65b0c022 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Sat, 11 Jul 2009 10:57:27 -0500 Subject: ocfs2: Initialize count in aio_write before generic_write_checks generic_write_checks() expects count to be initialized to the size of the write. Writes to files open with O_DIRECT|O_LARGEFILE write 0 bytes because count is uninitialized. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Joel Becker --- fs/ocfs2/file.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a49fa44aea1f..aa501d3f93f1 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1851,6 +1851,7 @@ relock: if (ret) goto out_dio; + count = ocount; ret = generic_write_checks(file, ppos, &count, S_ISBLK(inode->i_mode)); if (ret) -- cgit v1.2.3 From 44d8e4e1e9b941065eb7578669fe51eb65c73e63 Mon Sep 17 00:00:00 2001 From: Subrata Modak Date: Tue, 14 Jul 2009 01:19:31 +0530 Subject: ocfs2: Fix compilation warning for fs/ocfs2/xattr.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit gcc 4.4.1 generates the following build warning on i386: CC [M] fs/ocfs2/xattr.o fs/ocfs2/xattr.c: In function ‘ocfs2_xattr_block_get’: fs/ocfs2/xattr.c:1055: warning: ‘block_off’ may be used uninitialized in this function The following fix is based on a similar approach by David Howells few days back: http://lkml.org/lkml/2009/7/9/109, Signed-off-by: Subrata Modak, Signed-off-by: Joel Becker --- fs/ocfs2/xattr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index ba320e250747..d1a27cda984f 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1052,7 +1052,8 @@ static int ocfs2_xattr_block_get(struct inode *inode, struct ocfs2_xattr_block *xb; struct ocfs2_xattr_value_root *xv; size_t size; - int ret = -ENODATA, name_offset, name_len, block_off, i; + int ret = -ENODATA, name_offset, name_len, i; + int uninitialized_var(block_off); xs->bucket = ocfs2_xattr_bucket_new(inode); if (!xs->bucket) { -- cgit v1.2.3 From cbfa9639aea5007313b75ec43b689d5f9e0c037d Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Mon, 13 Jul 2009 11:38:23 +0800 Subject: ocfs2: Fix error return in ocfs2_write_cluster() A typo caused ocfs2_write_cluster() to return 0 in some error cases. Fix it. Signed-off-by: Wengang Wang Signed-off-by: Joel Becker --- fs/ocfs2/aops.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index b2c52b3a1484..25aced3b0c7d 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1301,7 +1301,7 @@ static int ocfs2_write_cluster(struct address_space *mapping, if (tmpret) { mlog_errno(tmpret); if (ret == 0) - tmpret = ret; + ret = tmpret; } } -- cgit v1.2.3 From 1f4cea3790bf44137192f441ee84af256e3bf809 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Mon, 13 Jul 2009 11:38:58 +0800 Subject: ocfs2: Fail ocfs2_get_block() immediately when a block needs allocation ocfs2_get_block() does no allocation. Hole filling for writes should have happened farther up in the call chain. We detect this case and print an error, but we then continue with the function. We should be exiting immediately. Signed-off-by: Wengang Wang Signed-off-by: Joel Becker --- fs/ocfs2/aops.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 25aced3b0c7d..e511df101451 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -193,6 +193,7 @@ static int ocfs2_get_block(struct inode *inode, sector_t iblock, (unsigned long long)OCFS2_I(inode)->ip_blkno); mlog(ML_ERROR, "Size %llu, clusters %u\n", (unsigned long long)i_size_read(inode), OCFS2_I(inode)->ip_clusters); dump_stack(); + goto bail; } past_eof = ocfs2_blocks_for_bytes(inode->i_sb, i_size_read(inode)); -- cgit v1.2.3 From 3c5e10683e684ef45614c9071847e48f633d9806 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 21 Jul 2009 15:42:05 +0800 Subject: ocfs2: Add extra credits and access the modified bh in update_edge_lengths. In normal tree rotation left process, we will never touch the tree branch above subtree_index and ocfs2_extend_rotate_transaction doesn't reserve the credits for them either. But when we want to delete the rightmost extent block, we have to update the rightmost records for all the rightmost branch(See ocfs2_update_edge_lengths), so we have to allocate extra credits for them. What's more, we have to access them also. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/alloc.c | 44 +++++++++++++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 9edcde4974aa..11085af71247 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2476,15 +2476,37 @@ out_ret_path: return ret; } -static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, - struct ocfs2_path *path) +static int ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, + int subtree_index, struct ocfs2_path *path) { - int i, idx; + int i, idx, ret; struct ocfs2_extent_rec *rec; struct ocfs2_extent_list *el; struct ocfs2_extent_block *eb; u32 range; + /* + * In normal tree rotation process, we will never touch the + * tree branch above subtree_index and ocfs2_extend_rotate_transaction + * doesn't reserve the credits for them either. + * + * But we do have a special case here which will update the rightmost + * records for all the bh in the path. + * So we have to allocate extra credits and access them. + */ + ret = ocfs2_extend_trans(handle, + handle->h_buffer_credits + subtree_index); + if (ret) { + mlog_errno(ret); + goto out; + } + + ret = ocfs2_journal_access_path(inode, handle, path); + if (ret) { + mlog_errno(ret); + goto out; + } + /* Path should always be rightmost. */ eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; BUG_ON(eb->h_next_leaf_blk != 0ULL); @@ -2505,6 +2527,8 @@ static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, ocfs2_journal_dirty(handle, path->p_node[i].bh); } +out: + return ret; } static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, @@ -2717,7 +2741,12 @@ static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, if (del_right_subtree) { ocfs2_unlink_subtree(inode, handle, left_path, right_path, subtree_index, dealloc); - ocfs2_update_edge_lengths(inode, handle, left_path); + ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); @@ -3034,7 +3063,12 @@ static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, ocfs2_unlink_subtree(inode, handle, left_path, path, subtree_index, dealloc); - ocfs2_update_edge_lengths(inode, handle, left_path); + ret = ocfs2_update_edge_lengths(inode, handle, subtree_index, + left_path); + if (ret) { + mlog_errno(ret); + goto out; + } eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); -- cgit v1.2.3 From f7b1aa69be138ad9d7d3f31fa56f4c9407f56b6a Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 20 Jul 2009 12:12:36 +0200 Subject: ocfs2: Fix deadlock on umount In commit ea455f8ab68338ba69f5d3362b342c115bea8e13, we moved the dentry lock put process into ocfs2_wq. This causes problems during umount because ocfs2_wq can drop references to inodes while they are being invalidated by invalidate_inodes() causing all sorts of nasty things (invalidate_inodes() ending in an infinite loop, "Busy inodes after umount" messages etc.). We fix the problem by stopping ocfs2_wq from doing any further releasing of inode references on the superblock being unmounted, wait until it finishes the current round of releasing and finally cleaning up all the references in dentry_lock_list from ocfs2_put_super(). The issue was tracked down by Tao Ma . Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/dcache.c | 35 +++++++++++++++++++++++++++-------- fs/ocfs2/dcache.h | 3 +++ fs/ocfs2/ocfs2.h | 22 ++++++++++++++++++---- fs/ocfs2/super.c | 25 ++++++++++++++++++++++--- 4 files changed, 70 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dcache.c b/fs/ocfs2/dcache.c index b574431a031d..2f28b7de2c8d 100644 --- a/fs/ocfs2/dcache.c +++ b/fs/ocfs2/dcache.c @@ -310,22 +310,19 @@ out_attach: return ret; } -static DEFINE_SPINLOCK(dentry_list_lock); +DEFINE_SPINLOCK(dentry_list_lock); /* We limit the number of dentry locks to drop in one go. We have * this limit so that we don't starve other users of ocfs2_wq. */ #define DL_INODE_DROP_COUNT 64 /* Drop inode references from dentry locks */ -void ocfs2_drop_dl_inodes(struct work_struct *work) +static void __ocfs2_drop_dl_inodes(struct ocfs2_super *osb, int drop_count) { - struct ocfs2_super *osb = container_of(work, struct ocfs2_super, - dentry_lock_work); struct ocfs2_dentry_lock *dl; - int drop_count = DL_INODE_DROP_COUNT; spin_lock(&dentry_list_lock); - while (osb->dentry_lock_list && drop_count--) { + while (osb->dentry_lock_list && (drop_count < 0 || drop_count--)) { dl = osb->dentry_lock_list; osb->dentry_lock_list = dl->dl_next; spin_unlock(&dentry_list_lock); @@ -333,11 +330,32 @@ void ocfs2_drop_dl_inodes(struct work_struct *work) kfree(dl); spin_lock(&dentry_list_lock); } - if (osb->dentry_lock_list) + spin_unlock(&dentry_list_lock); +} + +void ocfs2_drop_dl_inodes(struct work_struct *work) +{ + struct ocfs2_super *osb = container_of(work, struct ocfs2_super, + dentry_lock_work); + + __ocfs2_drop_dl_inodes(osb, DL_INODE_DROP_COUNT); + /* + * Don't queue dropping if umount is in progress. We flush the + * list in ocfs2_dismount_volume + */ + spin_lock(&dentry_list_lock); + if (osb->dentry_lock_list && + !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) queue_work(ocfs2_wq, &osb->dentry_lock_work); spin_unlock(&dentry_list_lock); } +/* Flush the whole work queue */ +void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb) +{ + __ocfs2_drop_dl_inodes(osb, -1); +} + /* * ocfs2_dentry_iput() and friends. * @@ -368,7 +386,8 @@ static void ocfs2_drop_dentry_lock(struct ocfs2_super *osb, /* We leave dropping of inode reference to ocfs2_wq as that can * possibly lead to inode deletion which gets tricky */ spin_lock(&dentry_list_lock); - if (!osb->dentry_lock_list) + if (!osb->dentry_lock_list && + !ocfs2_test_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED)) queue_work(ocfs2_wq, &osb->dentry_lock_work); dl->dl_next = osb->dentry_lock_list; osb->dentry_lock_list = dl; diff --git a/fs/ocfs2/dcache.h b/fs/ocfs2/dcache.h index faa12e75f98d..f5dd1789acf1 100644 --- a/fs/ocfs2/dcache.h +++ b/fs/ocfs2/dcache.h @@ -49,10 +49,13 @@ struct ocfs2_dentry_lock { int ocfs2_dentry_attach_lock(struct dentry *dentry, struct inode *inode, u64 parent_blkno); +extern spinlock_t dentry_list_lock; + void ocfs2_dentry_lock_put(struct ocfs2_super *osb, struct ocfs2_dentry_lock *dl); void ocfs2_drop_dl_inodes(struct work_struct *work); +void ocfs2_drop_all_dl_inodes(struct ocfs2_super *osb); struct dentry *ocfs2_find_local_alias(struct inode *inode, u64 parent_blkno, int skip_unhashed); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index c9345ebb8493..39e1d5a39505 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -224,10 +224,12 @@ enum ocfs2_mount_options OCFS2_MOUNT_GRPQUOTA = 1 << 10, /* We support group quotas */ }; -#define OCFS2_OSB_SOFT_RO 0x0001 -#define OCFS2_OSB_HARD_RO 0x0002 -#define OCFS2_OSB_ERROR_FS 0x0004 -#define OCFS2_DEFAULT_ATIME_QUANTUM 60 +#define OCFS2_OSB_SOFT_RO 0x0001 +#define OCFS2_OSB_HARD_RO 0x0002 +#define OCFS2_OSB_ERROR_FS 0x0004 +#define OCFS2_OSB_DROP_DENTRY_LOCK_IMMED 0x0008 + +#define OCFS2_DEFAULT_ATIME_QUANTUM 60 struct ocfs2_journal; struct ocfs2_slot_info; @@ -490,6 +492,18 @@ static inline void ocfs2_set_osb_flag(struct ocfs2_super *osb, spin_unlock(&osb->osb_lock); } + +static inline unsigned long ocfs2_test_osb_flag(struct ocfs2_super *osb, + unsigned long flag) +{ + unsigned long ret; + + spin_lock(&osb->osb_lock); + ret = osb->osb_flags & flag; + spin_unlock(&osb->osb_lock); + return ret; +} + static inline void ocfs2_set_ro_flag(struct ocfs2_super *osb, int hard) { diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 63af2e36d834..f2893878f8ad 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1213,14 +1213,27 @@ static int ocfs2_get_sb(struct file_system_type *fs_type, mnt); } +static void ocfs2_kill_sb(struct super_block *sb) +{ + struct ocfs2_super *osb = OCFS2_SB(sb); + + /* Prevent further queueing of inode drop events */ + spin_lock(&dentry_list_lock); + ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); + spin_unlock(&dentry_list_lock); + /* Wait for work to finish and/or remove it */ + cancel_work_sync(&osb->dentry_lock_work); + + kill_block_super(sb); +} + static struct file_system_type ocfs2_fs_type = { .owner = THIS_MODULE, .name = "ocfs2", .get_sb = ocfs2_get_sb, /* is this called when we mount * the fs? */ - .kill_sb = kill_block_super, /* set to the generic one - * right now, but do we - * need to change that? */ + .kill_sb = ocfs2_kill_sb, + .fs_flags = FS_REQUIRES_DEV|FS_RENAME_DOES_D_MOVE, .next = NULL }; @@ -1819,6 +1832,12 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) debugfs_remove(osb->osb_ctxt); + /* + * Flush inode dropping work queue so that deletes are + * performed while the filesystem is still working + */ + ocfs2_drop_all_dl_inodes(osb); + /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); -- cgit v1.2.3 From 82e12644cf5227dab15201fbcaf0ca6330ebd70f Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 23 Jul 2009 08:12:58 +0800 Subject: ocfs2: Use ocfs2_rec_clusters in ocfs2_adjust_adjacent_records. In ocfs2_adjust_adjacent_records, we will adjust adjacent records according to the extent_list in the lower level. But actually the lower level tree will either be a leaf or a branch. If we only use ocfs2_is_empty_extent we will meet with some problem if the lower tree is a branch (tree_depth > 1). So use !ocfs2_rec_clusters instead. And actually only the leaf record can have holes. So add a BUG_ON for non-leaf branch. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 11085af71247..f9a3e8942669 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1914,7 +1914,8 @@ static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, * immediately to their right. */ left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); - if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) { + if (!ocfs2_rec_clusters(right_child_el, &right_child_el->l_recs[0])) { + BUG_ON(right_child_el->l_tree_depth); BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); } -- cgit v1.2.3 From b57ac2c43e66cb4aa76c9d2d0e9e0e04f19cc5a6 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:15 +0200 Subject: ocfs2: Make global quota files blocksize aligned Change i_size of global quota files so that it always remains aligned to block size. This is mainly because the end of quota block may contain checksum (if checksumming is enabled) and it's a bit awkward for it to be "outside" of quota file (and it makes life harder for ocfs2-tools). Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/quota_global.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index edfa60cd155c..a66cb82fd6e6 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -211,14 +211,17 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, mutex_lock_nested(&gqinode->i_mutex, I_MUTEX_QUOTA); if (gqinode->i_size < off + len) { + loff_t rounded_end = + ocfs2_align_bytes_to_blocks(sb, off + len); + down_write(&OCFS2_I(gqinode)->ip_alloc_sem); - err = ocfs2_extend_no_holes(gqinode, off + len, off); + err = ocfs2_extend_no_holes(gqinode, rounded_end, off); up_write(&OCFS2_I(gqinode)->ip_alloc_sem); if (err < 0) goto out; err = ocfs2_simple_size_update(gqinode, oinfo->dqi_gqi_bh, - off + len); + rounded_end); if (err < 0) goto out; new = 1; -- cgit v1.2.3 From 4b3fa1904c0d192461ebba692e4940a334b74353 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:16 +0200 Subject: ocfs2: Mark buffer uptodate before calling ocfs2_journal_access_dq() In a code path extending local quota files we marked new header buffer uptodate only after calling ocfs2_journal_access_dq() which triggers a bug. Fix it and also call ocfs2 variant of the function marking buffer uptodate. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/quota_local.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 5a460fa82553..b624f9bc9281 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -20,6 +20,7 @@ #include "sysfile.h" #include "dlmglue.h" #include "quota.h" +#include "uptodate.h" /* Number of local quota structures per block */ static inline unsigned int ol_quota_entries_per_block(struct super_block *sb) @@ -979,6 +980,8 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( mlog_errno(status); goto out; } + ocfs2_set_new_buffer_uptodate(lqinode, bh); + dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; handle = ocfs2_start_trans(OCFS2_SB(sb), 2); @@ -999,7 +1002,6 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( memset(dchunk->dqc_bitmap, 0, sb->s_blocksize - sizeof(struct ocfs2_local_disk_chunk) - OCFS2_QBLK_RESERVED_SPACE); - set_buffer_uptodate(bh); unlock_buffer(bh); status = ocfs2_journal_dirty(handle, bh); if (status < 0) { -- cgit v1.2.3 From 0e7f387bf386c99e8ee322649fe4fc23b9cccc6c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:17 +0200 Subject: ocfs2: Initialize blocks allocated to local quota file When we extend local quota file, we should initialize data in newly allocated block. Firstly because on recovery we could parse bogus data, secondly so that block checksums are properly computed. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/quota_local.c | 98 ++++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 83 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index b624f9bc9281..9d2993de1082 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -941,7 +941,7 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( struct ocfs2_local_disk_chunk *dchunk; int status; handle_t *handle; - struct buffer_head *bh = NULL; + struct buffer_head *bh = NULL, *dbh = NULL; u64 p_blkno; /* We are protected by dqio_sem so no locking needed */ @@ -965,34 +965,32 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( mlog_errno(status); goto out; } + handle = ocfs2_start_trans(OCFS2_SB(sb), 3); + if (IS_ERR(handle)) { + status = PTR_ERR(handle); + mlog_errno(status); + goto out; + } + /* Initialize chunk header */ down_read(&OCFS2_I(lqinode)->ip_alloc_sem); status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, &p_blkno, NULL, NULL); up_read(&OCFS2_I(lqinode)->ip_alloc_sem); if (status < 0) { mlog_errno(status); - goto out; + goto out_trans; } bh = sb_getblk(sb, p_blkno); if (!bh) { status = -ENOMEM; mlog_errno(status); - goto out; + goto out_trans; } - ocfs2_set_new_buffer_uptodate(lqinode, bh); - dchunk = (struct ocfs2_local_disk_chunk *)bh->b_data; - - handle = ocfs2_start_trans(OCFS2_SB(sb), 2); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out; - } - + ocfs2_set_new_buffer_uptodate(lqinode, bh); status = ocfs2_journal_access_dq(handle, lqinode, bh, - OCFS2_JOURNAL_ACCESS_WRITE); + OCFS2_JOURNAL_ACCESS_CREATE); if (status < 0) { mlog_errno(status); goto out_trans; @@ -1009,6 +1007,38 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( goto out_trans; } + /* Initialize new block with structures */ + down_read(&OCFS2_I(lqinode)->ip_alloc_sem); + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks + 1, + &p_blkno, NULL, NULL); + up_read(&OCFS2_I(lqinode)->ip_alloc_sem); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + dbh = sb_getblk(sb, p_blkno); + if (!dbh) { + status = -ENOMEM; + mlog_errno(status); + goto out_trans; + } + ocfs2_set_new_buffer_uptodate(lqinode, dbh); + status = ocfs2_journal_access_dq(handle, lqinode, dbh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(dbh); + memset(dbh->b_data, 0, sb->s_blocksize - OCFS2_QBLK_RESERVED_SPACE); + unlock_buffer(dbh); + status = ocfs2_journal_dirty(handle, dbh); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + + /* Update local quotafile info */ oinfo->dqi_blocks += 2; oinfo->dqi_chunks++; status = ocfs2_local_write_info(sb, type); @@ -1033,6 +1063,7 @@ out_trans: ocfs2_commit_trans(OCFS2_SB(sb), handle); out: brelse(bh); + brelse(dbh); kmem_cache_free(ocfs2_qf_chunk_cachep, chunk); return ERR_PTR(status); } @@ -1050,6 +1081,8 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( struct ocfs2_local_disk_chunk *dchunk; int epb = ol_quota_entries_per_block(sb); unsigned int chunk_blocks; + struct buffer_head *bh; + u64 p_blkno; int status; handle_t *handle; @@ -1077,12 +1110,46 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( mlog_errno(status); goto out; } - handle = ocfs2_start_trans(OCFS2_SB(sb), 2); + + /* Get buffer from the just added block */ + down_read(&OCFS2_I(lqinode)->ip_alloc_sem); + status = ocfs2_extent_map_get_blocks(lqinode, oinfo->dqi_blocks, + &p_blkno, NULL, NULL); + up_read(&OCFS2_I(lqinode)->ip_alloc_sem); + if (status < 0) { + mlog_errno(status); + goto out; + } + bh = sb_getblk(sb, p_blkno); + if (!bh) { + status = -ENOMEM; + mlog_errno(status); + goto out; + } + ocfs2_set_new_buffer_uptodate(lqinode, bh); + + handle = ocfs2_start_trans(OCFS2_SB(sb), 3); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); goto out; } + /* Zero created block */ + status = ocfs2_journal_access_dq(handle, lqinode, bh, + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + unlock_buffer(bh); + status = ocfs2_journal_dirty(handle, bh); + if (status < 0) { + mlog_errno(status); + goto out_trans; + } + /* Update chunk header */ status = ocfs2_journal_access_dq(handle, lqinode, chunk->qc_headerbh, OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { @@ -1099,6 +1166,7 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( mlog_errno(status); goto out_trans; } + /* Update file header */ oinfo->dqi_blocks++; status = ocfs2_local_write_info(sb, type); if (status < 0) { -- cgit v1.2.3 From 7669f54c55df225cbb2db939a6c9f111445ffc1c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:18 +0200 Subject: ocfs2: Zero out padding of on disk dquot structure Padding fields of on-disk dquot structure were not zeroed. Zero them so that it's easier to use them later. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/quota_global.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index a66cb82fd6e6..7248db681351 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -69,6 +69,7 @@ static void ocfs2_global_mem2diskdqb(void *dp, struct dquot *dquot) d->dqb_curspace = cpu_to_le64(m->dqb_curspace); d->dqb_btime = cpu_to_le64(m->dqb_btime); d->dqb_itime = cpu_to_le64(m->dqb_itime); + d->dqb_pad1 = d->dqb_pad2 = 0; } static int ocfs2_global_is_id(void *dp, struct dquot *dquot) -- cgit v1.2.3 From 1c1d9793ff6720531c0125a28d321f283716e32f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:19 +0200 Subject: ocfs2: Fix initialization of blockcheck stats We just set blockcheck stats to zeros but we should also properly initialize the spinlock there. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/super.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index f2893878f8ad..b0ee0fdf799a 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -777,6 +777,7 @@ static int ocfs2_sb_probe(struct super_block *sb, } di = (struct ocfs2_dinode *) (*bh)->b_data; memset(stats, 0, sizeof(struct ocfs2_blockcheck_stats)); + spin_lock_init(&stats->b_lock); status = ocfs2_verify_volume(di, *bh, blksize, stats); if (status >= 0) goto bail; -- cgit v1.2.3 From 4539f1df25bcd0fdf0d8a5e2c92de6bece83c7a0 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:20 +0200 Subject: ocfs2: Remove syncjiff field from quota info syncjiff is just a converted value of syncms. Some places which are updating syncms forgot to update syncjiff as well. Since the conversion is just a simple division / multiplication and it does not happen frequently, just remove the syncjiff field to avoid forgotten conversions. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/quota.h | 1 - fs/ocfs2/quota_global.c | 5 ++--- 2 files changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index 7365e2e08706..3fb96fcd4c81 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -50,7 +50,6 @@ struct ocfs2_mem_dqinfo { unsigned int dqi_chunks; /* Number of chunks in local quota file */ unsigned int dqi_blocks; /* Number of blocks allocated for local quota file */ unsigned int dqi_syncms; /* How often should we sync with other nodes */ - unsigned int dqi_syncjiff; /* Precomputed dqi_syncms in jiffies */ struct list_head dqi_chunk; /* List of chunks */ struct inode *dqi_gqinode; /* Global quota file inode */ struct ocfs2_lock_res dqi_gqlock; /* Lock protecting quota information structure */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 7248db681351..dab45467b5fd 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -346,7 +346,6 @@ int ocfs2_global_read_info(struct super_block *sb, int type) info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); oinfo->dqi_syncms = le32_to_cpu(dinfo.dqi_syncms); - oinfo->dqi_syncjiff = msecs_to_jiffies(oinfo->dqi_syncms); oinfo->dqi_gi.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); oinfo->dqi_gi.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); oinfo->dqi_gi.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); @@ -356,7 +355,7 @@ int ocfs2_global_read_info(struct super_block *sb, int type) oinfo->dqi_gi.dqi_qtree_depth = qtree_depth(&oinfo->dqi_gi); INIT_DELAYED_WORK(&oinfo->dqi_sync_work, qsync_work_fn); queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, - oinfo->dqi_syncjiff); + msecs_to_jiffies(oinfo->dqi_syncms)); out_err: mlog_exit(status); @@ -611,7 +610,7 @@ static void qsync_work_fn(struct work_struct *work) dquot_scan_active(sb, ocfs2_sync_dquot_helper, oinfo->dqi_type); queue_delayed_work(ocfs2_quota_wq, &oinfo->dqi_sync_work, - oinfo->dqi_syncjiff); + msecs_to_jiffies(oinfo->dqi_syncms)); } /* -- cgit v1.2.3 From 0584974a77796581eb3a64b6c5005edac4a95128 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 22 Jul 2009 13:17:21 +0200 Subject: ocfs2: Define credit counts for quota operations Numbers of needed credits for some quota operations were written as raw numbers. Create appropriate defines instead. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/journal.h | 13 ++++++++++--- fs/ocfs2/quota_global.c | 18 +++++++++++++----- fs/ocfs2/quota_local.c | 16 ++++++++++++---- 3 files changed, 35 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 81e8abcd246e..4a4d3b55fd22 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -330,20 +330,27 @@ int ocfs2_journal_dirty(handle_t *handle, /* extended attribute block update */ #define OCFS2_XATTR_BLOCK_UPDATE_CREDITS 1 +/* Update of a single quota block */ +#define OCFS2_QUOTA_BLOCK_UPDATE_CREDITS 1 + /* global quotafile inode update, data block */ -#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) +#define OCFS2_QINFO_WRITE_CREDITS (OCFS2_INODE_UPDATE_CREDITS + \ + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) +#define OCFS2_LOCAL_QINFO_WRITE_CREDITS OCFS2_QUOTA_BLOCK_UPDATE_CREDITS /* * The two writes below can accidentally see global info dirty due * to set_info() quotactl so make them prepared for the writes. */ /* quota data block, global info */ /* Write to local quota file */ -#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + 1) +#define OCFS2_QWRITE_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) /* global quota data block, local quota data block, global quota inode, * global quota info */ -#define OCFS2_QSYNC_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 3) +#define OCFS2_QSYNC_CREDITS (OCFS2_QINFO_WRITE_CREDITS + \ + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS) static inline int ocfs2_quota_trans_credits(struct super_block *sb) { diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index dab45467b5fd..cc8785cf8f61 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -648,10 +648,15 @@ int ocfs2_calc_qdel_credits(struct super_block *sb, int type) return 0; oinfo = sb_dqinfo(sb, type)->dqi_priv; - /* We modify tree, leaf block, global info, local chunk header, - * global and local inode */ - return oinfo->dqi_gi.dqi_qtree_depth + 2 + 1 + - 2 * OCFS2_INODE_UPDATE_CREDITS; + /* + * We modify tree, leaf block, global info, local chunk header, + * global and local inode; OCFS2_QINFO_WRITE_CREDITS already + * accounts for inode update + */ + return oinfo->dqi_gi.dqi_qtree_depth + + OCFS2_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + + OCFS2_INODE_UPDATE_CREDITS; } static int ocfs2_release_dquot(struct dquot *dquot) @@ -701,7 +706,10 @@ int ocfs2_calc_qinit_credits(struct super_block *sb, int type) * global file we can modify info, tree and leaf block */ return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) + ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) + - 3 + oinfo->dqi_gi.dqi_qtree_depth + 2; + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + + oinfo->dqi_gi.dqi_qtree_depth + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; } static int ocfs2_acquire_dquot(struct dquot *dquot) diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 9d2993de1082..bdb09cb6e1fe 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -101,7 +101,8 @@ static int ocfs2_modify_bh(struct inode *inode, struct buffer_head *bh, handle_t *handle; int status; - handle = ocfs2_start_trans(OCFS2_SB(sb), 1); + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -611,7 +612,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, goto out_bh; /* Mark quota file as clean if we are recovering quota file of * some other node. */ - handle = ocfs2_start_trans(osb, 1); + handle = ocfs2_start_trans(osb, + OCFS2_LOCAL_QINFO_WRITE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -965,7 +967,10 @@ static struct ocfs2_quota_chunk *ocfs2_local_quota_add_chunk( mlog_errno(status); goto out; } - handle = ocfs2_start_trans(OCFS2_SB(sb), 3); + /* Local quota info and two new blocks we initialize */ + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); @@ -1128,7 +1133,10 @@ static struct ocfs2_quota_chunk *ocfs2_extend_local_quota_file( } ocfs2_set_new_buffer_uptodate(lqinode, bh); - handle = ocfs2_start_trans(OCFS2_SB(sb), 3); + /* Local quota info, chunk header and the new block we initialize */ + handle = ocfs2_start_trans(OCFS2_SB(sb), + OCFS2_LOCAL_QINFO_WRITE_CREDITS + + 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS); if (IS_ERR(handle)) { status = PTR_ERR(handle); mlog_errno(status); -- cgit v1.2.3 From 7b91e2661addd8e2419cb45f6a322aa5dab9bcee Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 23 Jul 2009 15:22:30 -0400 Subject: cifs: fix error handling in mount-time DFS referral chasing code If the referral is malformed or the hostname can't be resolved, then the current code generates an oops. Fix it to handle these errors gracefully. Reported-by: Sandro Mathys Acked-by: Igor Mammedov CC: Stable Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifs_dfs_ref.c | 12 +++++++++--- fs/cifs/connect.c | 13 ++++++++++--- 2 files changed, 19 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 3bb11be8b6a8..606912d8f2a8 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -55,7 +55,7 @@ void cifs_dfs_release_automount_timer(void) * i.e. strips from UNC trailing path that is not part of share * name and fixup missing '\' in the begining of DFS node refferal * if neccessary. - * Returns pointer to share name on success or NULL on error. + * Returns pointer to share name on success or ERR_PTR on error. * Caller is responsible for freeing returned string. */ static char *cifs_get_share_name(const char *node_name) @@ -68,7 +68,7 @@ static char *cifs_get_share_name(const char *node_name) UNC = kmalloc(len+2 /*for term null and additional \ if it's missed */, GFP_KERNEL); if (!UNC) - return NULL; + return ERR_PTR(-ENOMEM); /* get share name and server name */ if (node_name[1] != '\\') { @@ -87,7 +87,7 @@ static char *cifs_get_share_name(const char *node_name) cERROR(1, ("%s: no server name end in node name: %s", __func__, node_name)); kfree(UNC); - return NULL; + return ERR_PTR(-EINVAL); } /* find sharename end */ @@ -133,6 +133,12 @@ char *cifs_compose_mount_options(const char *sb_mountdata, return ERR_PTR(-EINVAL); *devname = cifs_get_share_name(ref->node_name); + if (IS_ERR(*devname)) { + rc = PTR_ERR(*devname); + *devname = NULL; + goto compose_mount_options_err; + } + rc = dns_resolve_server_name_to_ip(*devname, &srvIP); if (rc != 0) { cERROR(1, ("%s: Failed to resolve server part of %s to IP: %d", diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index fc44d316d0bb..f2486889b7bb 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2544,11 +2544,20 @@ remote_path_check: if (mount_data != mount_data_global) kfree(mount_data); + mount_data = cifs_compose_mount_options( cifs_sb->mountdata, full_path + 1, referrals, &fake_devname); - kfree(fake_devname); + free_dfs_info_array(referrals, num_referrals); + kfree(fake_devname); + kfree(full_path); + + if (IS_ERR(mount_data)) { + rc = PTR_ERR(mount_data); + mount_data = NULL; + goto mount_fail_check; + } if (tcon) cifs_put_tcon(tcon); @@ -2556,8 +2565,6 @@ remote_path_check: cifs_put_smb_ses(pSesInfo); cleanup_volume_info(&volume_info); - FreeXid(xid); - kfree(full_path); referral_walks_count++; goto try_mount_again; } -- cgit v1.2.3 From 5bd9052d79daa4c8beb45436c408b6de672adb82 Mon Sep 17 00:00:00 2001 From: Steve French Date: Thu, 30 Jul 2009 02:26:14 +0000 Subject: [CIFS] Updates fs/cifs/CHANGES Signed-off-by: Steve French --- fs/cifs/CHANGES | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 92888aa90749..651cefde385b 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,9 @@ +Version 1.60 +------------- +Fix memory leak in reconnect. Fix oops in DFS mount error path. +Set s_maxbytes to smaller (the max that vfs can handle) so that +sendfile will now work over cifs mounts again. + Version 1.59 ------------ Client uses server inode numbers (which are persistent) rather than -- cgit v1.2.3 From e9956fae7dbce6ac770e7a00436c496ddbbd3215 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 30 Jul 2009 16:07:10 +0800 Subject: ocfs2/quota: Release lock for error in ocfs2_quota_write. ocfs2_quota_write needs to release the lock if it fails to read quota block. So use "goto out" instead of "return err". Signed-off-by: Tao Ma Acked-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/quota_global.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index cc8785cf8f61..d604a6aa0a22 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -238,7 +238,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, } if (err) { mlog_errno(err); - return err; + goto out; } lock_buffer(bh); if (new) -- cgit v1.2.3 From 6606bb97e146a387932efee263745b7240a11193 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 31 Jul 2009 11:03:58 -0400 Subject: Btrfs: fix btrfs_remove_from_free_space corner case Yan Zheng hit a problem where we tried to remove some free space but failed because we couldn't find the free space entry. This is because the free space was held within a bitmap that had a starting offset well before the actual offset of the free space, and there were free space extents that were in the same range as that offset, so tree_search_offset returned with NULL because we couldn't find a free space extent that had that offset. This is fixed by making sure that if we fail to find the entry, we re-search again with bitmap_only set to 1 and do an offset_to_bitmap so we can get the appropriate bitmap. A similar problem happens in btrfs_alloc_from_bitmap for the clustering code, but that is not as bad since we will just go and redo our cluster allocation. Also this adds some debugging checks to make sure that the free space we are trying to remove from the bitmap is in fact there. This can probably go away after a while, but since this code is only used by the tree-logging stuff it would be nice to run with it for a while to make sure there are no problems. Signed-off-by: Josef Bacik Signed-off-by: Chris Mason --- fs/btrfs/free-space-cache.c | 73 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 64 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index af99b78b288e..5edcee3a617f 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -414,11 +414,29 @@ static noinline int remove_from_bitmap(struct btrfs_block_group_cache *block_gro u64 *offset, u64 *bytes) { u64 end; + u64 search_start, search_bytes; + int ret; again: end = bitmap_info->offset + (u64)(BITS_PER_BITMAP * block_group->sectorsize) - 1; + /* + * XXX - this can go away after a few releases. + * + * since the only user of btrfs_remove_free_space is the tree logging + * stuff, and the only way to test that is under crash conditions, we + * want to have this debug stuff here just in case somethings not + * working. Search the bitmap for the space we are trying to use to + * make sure its actually there. If its not there then we need to stop + * because something has gone wrong. + */ + search_start = *offset; + search_bytes = *bytes; + ret = search_bitmap(block_group, bitmap_info, &search_start, + &search_bytes); + BUG_ON(ret < 0 || search_start != *offset); + if (*offset > bitmap_info->offset && *offset + *bytes > end) { bitmap_clear_bits(block_group, bitmap_info, *offset, end - *offset + 1); @@ -430,6 +448,7 @@ again: } if (*bytes) { + struct rb_node *next = rb_next(&bitmap_info->offset_index); if (!bitmap_info->bytes) { unlink_free_space(block_group, bitmap_info); kfree(bitmap_info->bitmap); @@ -438,16 +457,36 @@ again: recalculate_thresholds(block_group); } - bitmap_info = tree_search_offset(block_group, - offset_to_bitmap(block_group, - *offset), - 1, 0); - if (!bitmap_info) + /* + * no entry after this bitmap, but we still have bytes to + * remove, so something has gone wrong. + */ + if (!next) return -EINVAL; + bitmap_info = rb_entry(next, struct btrfs_free_space, + offset_index); + + /* + * if the next entry isn't a bitmap we need to return to let the + * extent stuff do its work. + */ if (!bitmap_info->bitmap) return -EAGAIN; + /* + * Ok the next item is a bitmap, but it may not actually hold + * the information for the rest of this free space stuff, so + * look for it, and if we don't find it return so we can try + * everything over again. + */ + search_start = *offset; + search_bytes = *bytes; + ret = search_bitmap(block_group, bitmap_info, &search_start, + &search_bytes); + if (ret < 0 || search_start != *offset) + return -EAGAIN; + goto again; } else if (!bitmap_info->bytes) { unlink_free_space(block_group, bitmap_info); @@ -644,8 +683,17 @@ int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, again: info = tree_search_offset(block_group, offset, 0, 0); if (!info) { - WARN_ON(1); - goto out_lock; + /* + * oops didn't find an extent that matched the space we wanted + * to remove, look for a bitmap instead + */ + info = tree_search_offset(block_group, + offset_to_bitmap(block_group, offset), + 1, 0); + if (!info) { + WARN_ON(1); + goto out_lock; + } } if (info->bytes < bytes && rb_next(&info->offset_index)) { @@ -957,8 +1005,15 @@ static u64 btrfs_alloc_from_bitmap(struct btrfs_block_group_cache *block_group, if (cluster->block_group != block_group) goto out; - entry = tree_search_offset(block_group, search_start, 0, 0); - + /* + * search_start is the beginning of the bitmap, but at some point it may + * be a good idea to point to the actual start of the free area in the + * bitmap, so do the offset_to_bitmap trick anyway, and set bitmap_only + * to 1 to make sure we get the bitmap entry + */ + entry = tree_search_offset(block_group, + offset_to_bitmap(block_group, search_start), + 1, 0); if (!entry || !entry->bitmap) goto out; -- cgit v1.2.3 From 013f1b12f4fc479f697acae2f31bad220162cd03 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 31 Jul 2009 14:57:55 -0400 Subject: Btrfs: make sure the async caching thread advances the key The async caching thread can end up looping forever if a given search puts it at the last key in a leaf. It will end up calling btrfs_next_leaf and then checking if it needs to politely drop the read semaphore. Most of the time this looping isn't noticed because it is able to make progress the next time around. But, during log replay, we wait on the async caching thread to finish, and the async thread is waiting on the commit, and no progress is really made. The fix used here is to copy the key out of the next leaf, that way our search lands there properly. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index dc84daee6bc4..72a2b9c28e9f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -265,10 +265,6 @@ static int caching_kthread(void *data) atomic_inc(&block_group->space_info->caching_threads); last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); -again: - /* need to make sure the commit_root doesn't disappear */ - down_read(&fs_info->extent_commit_sem); - /* * We don't want to deadlock with somebody trying to allocate a new * extent for the extent root while also trying to search the extent @@ -282,6 +278,10 @@ again: key.objectid = last; key.offset = 0; btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); +again: + /* need to make sure the commit_root doesn't disappear */ + down_read(&fs_info->extent_commit_sem); + ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0); if (ret < 0) goto err; @@ -304,6 +304,19 @@ again: if (need_resched() || btrfs_transaction_in_commit(fs_info)) { + leaf = path->nodes[0]; + + /* this shouldn't happen, but if the + * leaf is empty just move on. + */ + if (btrfs_header_nritems(leaf) == 0) + break; + /* + * we need to copy the key out so that + * we are sure the next search advances + * us forward in the btree. + */ + btrfs_item_key_to_cpu(leaf, &key, 0); btrfs_release_path(fs_info->extent_root, path); up_read(&fs_info->extent_commit_sem); schedule_timeout(1); -- cgit v1.2.3 From ab57a40827d99e2d8e59066a56b93bf6c844c916 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 31 Jul 2009 14:28:02 -0500 Subject: ocfs2: Remove redundant BUG_ON in __dlm_queue_ast() We BUG_ON() the same thing twice. Signed-off-by: Goldwyn Rodrigues Signed-off-by: Joel Becker --- fs/ocfs2/dlm/dlmast.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmast.c b/fs/ocfs2/dlm/dlmast.c index d07ddbe4b283..81eff8e58322 100644 --- a/fs/ocfs2/dlm/dlmast.c +++ b/fs/ocfs2/dlm/dlmast.c @@ -103,7 +103,6 @@ static void __dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock) lock->ast_pending, lock->ml.type); BUG(); } - BUG_ON(!list_empty(&lock->ast_list)); if (lock->ast_pending) mlog(0, "lock has an ast getting flushed right now\n"); -- cgit v1.2.3 From a97778457f22181e8c38c4cd7d7e528378738a98 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Tue, 28 Jul 2009 17:55:29 +0900 Subject: nilfs2: fix oops due to inconsistent state in page with discrete b-tree nodes Andrea Gelmini gave me a report that a kernel oops hit on a nilfs filesystem with a 1KB block size when doing rsync. This turned out to be caused by an inconsistency of dirty state between a page and its buffers storing b-tree node blocks. If the page had multiple buffers split over multiple logs, and if the logs were written at a time, a dirty flag remained in the page even every dirty flag in the buffers was cleared. This will fix the failure by dropping the dirty flag properly for pages with the discrete multiple b-tree nodes. Reported-by: Andrea Gelmini Signed-off-by: Ryusuke Konishi Tested-by: Andrea Gelmini Cc: stable@kernel.org --- fs/nilfs2/segment.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 8b5e4778cf28..51ff3d0a4ee2 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -1859,12 +1859,26 @@ static void nilfs_end_page_io(struct page *page, int err) if (!page) return; - if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) + if (buffer_nilfs_node(page_buffers(page)) && !PageWriteback(page)) { /* * For b-tree node pages, this function may be called twice * or more because they might be split in a segment. */ + if (PageDirty(page)) { + /* + * For pages holding split b-tree node buffers, dirty + * flag on the buffers may be cleared discretely. + * In that case, the page is once redirtied for + * remaining buffers, and it must be cancelled if + * all the buffers get cleaned later. + */ + lock_page(page); + if (nilfs_page_buffers_clean(page)) + __nilfs_clear_page_dirty(page); + unlock_page(page); + } return; + } __nilfs_end_page_io(page, err); } -- cgit v1.2.3 From 9b9d6b2434fe942895c341b9a982f158529788ec Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 31 Jul 2009 06:56:09 -0400 Subject: cifs: reinstate original behavior when uid=/gid= options are specified This patch fixes the regression reported here: http://bugzilla.kernel.org/show_bug.cgi?id=13861 commit 4ae1507f6d266d0cc3dd36e474d83aad70fec9e4 changed the default behavior when the uid= or gid= option was specified for a mount. The existing behavior was to always clobber the ownership information provided by the server when these options were specified. The above commit changed this behavior so that these options simply provided defaults when the server did not provide this information (unless "forceuid" or "forcegid" were specified) This patch reverts this change so that the default behavior is restored. It also adds "noforceuid" and "noforcegid" options to make it so that ownership information from the server is preserved, even when the mount has uid= or gid= options specified. It also adds a couple of printk notices that pop up when forceuid or forcegid options are specified without a uid= or gid= option. Reported-by: Tom Chiverton Reviewed-by: Shirish Pargaonkar Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 42 ++++++++++++++++++++++++++++++------------ 1 file changed, 30 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index f2486889b7bb..1f3345d7fa79 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -803,6 +803,10 @@ cifs_parse_mount_options(char *options, const char *devname, char *data; unsigned int temp_len, i, j; char separator[2]; + short int override_uid = -1; + short int override_gid = -1; + bool uid_specified = false; + bool gid_specified = false; separator[0] = ','; separator[1] = 0; @@ -1093,18 +1097,20 @@ cifs_parse_mount_options(char *options, const char *devname, "too long.\n"); return 1; } - } else if (strnicmp(data, "uid", 3) == 0) { - if (value && *value) - vol->linux_uid = - simple_strtoul(value, &value, 0); - } else if (strnicmp(data, "forceuid", 8) == 0) { - vol->override_uid = 1; - } else if (strnicmp(data, "gid", 3) == 0) { - if (value && *value) - vol->linux_gid = - simple_strtoul(value, &value, 0); - } else if (strnicmp(data, "forcegid", 8) == 0) { - vol->override_gid = 1; + } else if (!strnicmp(data, "uid", 3) && value && *value) { + vol->linux_uid = simple_strtoul(value, &value, 0); + uid_specified = true; + } else if (!strnicmp(data, "forceuid", 8)) { + override_uid = 1; + } else if (!strnicmp(data, "noforceuid", 10)) { + override_uid = 0; + } else if (!strnicmp(data, "gid", 3) && value && *value) { + vol->linux_gid = simple_strtoul(value, &value, 0); + gid_specified = true; + } else if (!strnicmp(data, "forcegid", 8)) { + override_gid = 1; + } else if (!strnicmp(data, "noforcegid", 10)) { + override_gid = 0; } else if (strnicmp(data, "file_mode", 4) == 0) { if (value && *value) { vol->file_mode = @@ -1355,6 +1361,18 @@ cifs_parse_mount_options(char *options, const char *devname, if (vol->UNCip == NULL) vol->UNCip = &vol->UNC[2]; + if (uid_specified) + vol->override_uid = override_uid; + else if (override_uid == 1) + printk(KERN_NOTICE "CIFS: ignoring forceuid mount option " + "specified with no uid= option.\n"); + + if (gid_specified) + vol->override_gid = override_gid; + else if (override_gid == 1) + printk(KERN_NOTICE "CIFS: ignoring forcegid mount option " + "specified with no gid= option.\n"); + return 0; } -- cgit v1.2.3 From 01a261e09a21e0ba342d3907a79cf5c78ee3f37a Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Sun, 2 Aug 2009 17:45:55 +0900 Subject: nilfs2: fix missing unlock in error path of nilfs_mdt_write_page This adds a missing unlock of nilfs->ns_writer_mutex in nilfs_mdt_write_page() function. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/mdt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 3d3ddb3f5177..2dfd47714ae5 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -412,8 +412,10 @@ nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc) return 0; /* Do not request flush for shadow page cache */ if (!sb) { writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs); - if (!writer) + if (!writer) { + nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs); return -EROFS; + } sb = writer->s_super; } -- cgit v1.2.3 From 4486d6ede16b362f89b29845af6fe1a26ae78a54 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Mon, 3 Aug 2009 12:45:10 -0400 Subject: cifs: show noforceuid/noforcegid mount options (try #2) Since forceuid is the default, we now need to show when it's disabled. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsfs.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 44f30504b82d..84b75253b05a 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -376,10 +376,14 @@ cifs_show_options(struct seq_file *s, struct vfsmount *m) seq_printf(s, ",uid=%d", cifs_sb->mnt_uid); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_UID) seq_printf(s, ",forceuid"); + else + seq_printf(s, ",noforceuid"); seq_printf(s, ",gid=%d", cifs_sb->mnt_gid); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_OVERR_GID) seq_printf(s, ",forcegid"); + else + seq_printf(s, ",noforcegid"); cifs_show_address(s, tcon->ses->server); -- cgit v1.2.3 From 24e2fb615fd6b624c320cec9ea9d91a75dad902e Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Sun, 2 Aug 2009 13:00:18 +0200 Subject: cifs: Read buffer overflow Check whether index is within bounds before testing the element. Acked-by: Jeff Layton Signed-off-by: Roel Kluin Signed-off-by: Steve French --- fs/cifs/cifs_unicode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 60e3c4253de0..714a542cbafc 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -44,7 +44,7 @@ cifs_ucs2_bytes(const __le16 *from, int maxbytes, int maxwords = maxbytes / 2; char tmp[NLS_MAX_CHARSET_SIZE]; - for (i = 0; from[i] && i < maxwords; i++) { + for (i = 0; i < maxwords && from[i]; i++) { charlen = codepage->uni2char(le16_to_cpu(from[i]), tmp, NLS_MAX_CHARSET_SIZE); if (charlen > 0) -- cgit v1.2.3 From d098564f3b2b5d555e51bca765a6a9e0dda8f2cd Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 4 Aug 2009 03:53:28 +0000 Subject: [CIFS] Update readme to reflect forceuid mount parms Signed-off-by: Steve French --- fs/cifs/CHANGES | 3 ++- fs/cifs/README | 25 ++++++++++++------------- 2 files changed, 14 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 651cefde385b..e85b1e4389e0 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -2,7 +2,8 @@ Version 1.60 ------------- Fix memory leak in reconnect. Fix oops in DFS mount error path. Set s_maxbytes to smaller (the max that vfs can handle) so that -sendfile will now work over cifs mounts again. +sendfile will now work over cifs mounts again. Add noforcegid +and noforceuid mount parameters. Version 1.59 ------------ diff --git a/fs/cifs/README b/fs/cifs/README index ad92921dbde4..79c1a93400be 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -262,11 +262,11 @@ A partial list of the supported mount options follows: mount. domain Set the SMB/CIFS workgroup name prepended to the username during CIFS session establishment - forceuid Set the default uid for inodes based on the uid - passed in. For mounts to servers + forceuid Set the default uid for inodes to the uid + passed in on mount. For mounts to servers which do support the CIFS Unix extensions, such as a properly configured Samba server, the server provides - the uid, gid and mode so this parameter should not be + the uid, gid and mode so this parameter should not be specified unless the server and clients uid and gid numbering differ. If the server and client are in the same domain (e.g. running winbind or nss_ldap) and @@ -278,11 +278,7 @@ A partial list of the supported mount options follows: of existing files will be the uid (gid) of the person who executed the mount (root, except when mount.cifs is configured setuid for user mounts) unless the "uid=" - (gid) mount option is specified. For the uid (gid) of newly - created files and directories, ie files created since - the last mount of the server share, the expected uid - (gid) is cached as long as the inode remains in - memory on the client. Also note that permission + (gid) mount option is specified. Also note that permission checks (authorization checks) on accesses to a file occur at the server, but there are cases in which an administrator may want to restrict at the client as well. For those @@ -290,12 +286,15 @@ A partial list of the supported mount options follows: (such as Windows), permissions can also be checked at the client, and a crude form of client side permission checking can be enabled by specifying file_mode and dir_mode on - the client. Note that the mount.cifs helper must be - at version 1.10 or higher to support specifying the uid - (or gid) in non-numeric form. - forcegid (similar to above but for the groupid instead of uid) + the client. (default) + forcegid (similar to above but for the groupid instead of uid) (default) + noforceuid Fill in file owner information (uid) by requesting it from + the server if possible. With this option, the value given in + the uid= option (on mount) will only be used if the server + can not support returning uids on inodes. + noforcegid (similar to above but for the group owner, gid, instead of uid) uid Set the default uid for inodes, and indicate to the - cifs kernel driver which local user mounted . If the server + cifs kernel driver which local user mounted. If the server supports the unix extensions the default uid is not used to fill in the owner fields of inodes (files) unless the "forceuid" parameter is specified. -- cgit v1.2.3 From 57ca7deb062abf56168d15f000c16e25f88a9cf3 Mon Sep 17 00:00:00 2001 From: Anders Grafström Date: Tue, 4 Aug 2009 13:11:47 +0200 Subject: jffs2: Fix return value from jffs2_do_readpage_nolock() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This fixes "kernel BUG at fs/jffs2/file.c:251!". This pseudocode hopefully illustrates the scenario that triggers it: jffs2_write_begin { jffs2_do_readpage_nolock { jffs2_read_inode_range { jffs2_read_dnode { Data CRC 33c102e9 != calculated CRC 0ef77e7b for node at 005d42e4 return -EIO; } } ClearPageUptodate(pg); return 0; } } jffs2_write_end { BUG_ON(!PageUptodate(pg)); } Signed-off-by: Anders Grafström Signed-off-by: David Woodhouse --- fs/jffs2/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index 5edc2bf20581..23c947539864 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -99,7 +99,7 @@ static int jffs2_do_readpage_nolock (struct inode *inode, struct page *pg) kunmap(pg); D2(printk(KERN_DEBUG "readpage finished\n")); - return 0; + return ret; } int jffs2_do_readpage_unlock(struct inode *inode, struct page *pg) -- cgit v1.2.3 From 54e346215e4fe2ca8c94c54e546cc61902060510 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Aug 2009 14:38:25 -0300 Subject: vfs: fix inode_init_always calling convention Currently inode_init_always calls into ->destroy_inode if the additional initialization fails. That's not only counter-intuitive because inode_init_always did not allocate the inode structure, but in case of XFS it's actively harmful as ->destroy_inode might delete the inode from a radix-tree that has never been added. This in turn might end up deleting the inode for the same inum that has been instanciated by another process and cause lots of cause subtile problems. Also in the case of re-initializing a reclaimable inode in XFS it would free an inode we still want to keep alive. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen --- fs/inode.c | 30 +++++++++++++++++------------- fs/xfs/xfs_iget.c | 17 +++++------------ include/linux/fs.h | 2 +- 3 files changed, 23 insertions(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index 901bad1e5f12..af2c05235cc8 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -120,12 +120,11 @@ static void wake_up_inode(struct inode *inode) * These are initializations that need to be done on every inode * allocation as the fields are not initialised by slab allocation. */ -struct inode *inode_init_always(struct super_block *sb, struct inode *inode) +int inode_init_always(struct super_block *sb, struct inode *inode) { static const struct address_space_operations empty_aops; static struct inode_operations empty_iops; static const struct file_operations empty_fops; - struct address_space *const mapping = &inode->i_data; inode->i_sb = sb; @@ -152,7 +151,7 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->dirtied_when = 0; if (security_inode_alloc(inode)) - goto out_free_inode; + goto out; /* allocate and initialize an i_integrity */ if (ima_inode_alloc(inode)) @@ -198,16 +197,12 @@ struct inode *inode_init_always(struct super_block *sb, struct inode *inode) inode->i_fsnotify_mask = 0; #endif - return inode; + return 0; out_free_security: security_inode_free(inode); -out_free_inode: - if (inode->i_sb->s_op->destroy_inode) - inode->i_sb->s_op->destroy_inode(inode); - else - kmem_cache_free(inode_cachep, (inode)); - return NULL; +out: + return -ENOMEM; } EXPORT_SYMBOL(inode_init_always); @@ -220,9 +215,18 @@ static struct inode *alloc_inode(struct super_block *sb) else inode = kmem_cache_alloc(inode_cachep, GFP_KERNEL); - if (inode) - return inode_init_always(sb, inode); - return NULL; + if (!inode) + return NULL; + + if (unlikely(inode_init_always(sb, inode))) { + if (inode->i_sb->s_op->destroy_inode) + inode->i_sb->s_op->destroy_inode(inode); + else + kmem_cache_free(inode_cachep, inode); + return NULL; + } + + return inode; } void destroy_inode(struct inode *inode) diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 5fcec6f020a7..719c85b155f4 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -64,6 +64,10 @@ xfs_inode_alloc( ip = kmem_zone_alloc(xfs_inode_zone, KM_SLEEP); if (!ip) return NULL; + if (inode_init_always(mp->m_super, VFS_I(ip))) { + kmem_zone_free(xfs_inode_zone, ip); + return NULL; + } ASSERT(atomic_read(&ip->i_iocount) == 0); ASSERT(atomic_read(&ip->i_pincount) == 0); @@ -105,17 +109,6 @@ xfs_inode_alloc( #ifdef XFS_DIR2_TRACE ip->i_dir_trace = ktrace_alloc(XFS_DIR2_KTRACE_SIZE, KM_NOFS); #endif - /* - * Now initialise the VFS inode. We do this after the xfs_inode - * initialisation as internal failures will result in ->destroy_inode - * being called and that will pass down through the reclaim path and - * free the XFS inode. This path requires the XFS inode to already be - * initialised. Hence if this call fails, the xfs_inode has already - * been freed and we should not reference it at all in the error - * handling. - */ - if (!inode_init_always(mp->m_super, VFS_I(ip))) - return NULL; /* prevent anyone from using this yet */ VFS_I(ip)->i_state = I_NEW|I_LOCK; @@ -167,7 +160,7 @@ xfs_iget_cache_hit( * errors cleanly, then tag it so it can be set up correctly * later. */ - if (!inode_init_always(mp->m_super, VFS_I(ip))) { + if (inode_init_always(mp->m_super, VFS_I(ip))) { error = ENOMEM; goto out_error; } diff --git a/include/linux/fs.h b/include/linux/fs.h index a36ffa5a77a4..0c3b5e58a986 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2137,7 +2137,7 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int origin); extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); -extern struct inode * inode_init_always(struct super_block *, struct inode *); +extern int inode_init_always(struct super_block *, struct inode *); extern void inode_init_once(struct inode *); extern void inode_add_to_lists(struct super_block *, struct inode *); extern void iput(struct inode *); -- cgit v1.2.3 From 2e00c97e2c1d2ffc9e26252ca26b237678b0b772 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Aug 2009 14:38:29 -0300 Subject: vfs: add __destroy_inode When we want to tear down an inode that lost the add to the cache race in XFS we must not call into ->destroy_inode because that would delete the inode that won the race from the inode cache radix tree. This patch provides the __destroy_inode helper needed to fix this, the actual fix will be in th next patch. As XFS was the only reason destroy_inode was exported we shift the export to the new __destroy_inode. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen --- fs/inode.c | 10 +++++++--- include/linux/fs.h | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/inode.c b/fs/inode.c index af2c05235cc8..ae7b67e48661 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -229,7 +229,7 @@ static struct inode *alloc_inode(struct super_block *sb) return inode; } -void destroy_inode(struct inode *inode) +void __destroy_inode(struct inode *inode) { BUG_ON(inode_has_buffers(inode)); ima_inode_free(inode); @@ -241,13 +241,17 @@ void destroy_inode(struct inode *inode) if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED) posix_acl_release(inode->i_default_acl); #endif +} +EXPORT_SYMBOL(__destroy_inode); + +void destroy_inode(struct inode *inode) +{ + __destroy_inode(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); else kmem_cache_free(inode_cachep, (inode)); } -EXPORT_SYMBOL(destroy_inode); - /* * These are initializations that only need to be done diff --git a/include/linux/fs.h b/include/linux/fs.h index 0c3b5e58a986..67888a9e0655 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2164,6 +2164,7 @@ extern void __iget(struct inode * inode); extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void destroy_inode(struct inode *); +extern void __destroy_inode(struct inode *); extern struct inode *new_inode(struct super_block *); extern int should_remove_suid(struct dentry *); extern int file_remove_suid(struct file *); -- cgit v1.2.3 From b36ec0428a06fcbdb67d61e9e664154e5dd9a8c7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 7 Aug 2009 14:38:34 -0300 Subject: xfs: fix freeing of inodes not yet added to the inode cache When freeing an inode that lost race getting added to the inode cache we must not call into ->destroy_inode, because that would delete the inode that won the race from the inode cache radix tree. This patch uses splits a new xfs_inode_free helper out of xfs_ireclaim and uses that plus __destroy_inode to make sure we really only free the memory allocted for the inode that lost the race, and not mess with the inode cache state. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Reported-by: Alex Samad Reported-by: Andrew Randrianasulu Reported-by: Stephane Reported-by: Tommy Reported-by: Miah Gregory Reported-by: Gabriel Barazer Reported-by: Leandro Lucarella Reported-by: Daniel Burr Reported-by: Nickolay Reported-by: Michael Guntsche Reported-by: Dan Carley Reported-by: Michael Ole Olsen Reported-by: Michael Weissenbacher Reported-by: Martin Spott Reported-by: Christian Kujau Tested-by: Michael Guntsche Tested-by: Dan Carley Tested-by: Christian Kujau --- fs/xfs/xfs_iget.c | 125 +++++++++++++++++++++++++++++------------------------ fs/xfs/xfs_inode.h | 17 -------- 2 files changed, 68 insertions(+), 74 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 719c85b155f4..34ec86923f7e 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -116,6 +116,71 @@ xfs_inode_alloc( return ip; } +STATIC void +xfs_inode_free( + struct xfs_inode *ip) +{ + switch (ip->i_d.di_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + xfs_idestroy_fork(ip, XFS_DATA_FORK); + break; + } + + if (ip->i_afp) + xfs_idestroy_fork(ip, XFS_ATTR_FORK); + +#ifdef XFS_INODE_TRACE + ktrace_free(ip->i_trace); +#endif +#ifdef XFS_BMAP_TRACE + ktrace_free(ip->i_xtrace); +#endif +#ifdef XFS_BTREE_TRACE + ktrace_free(ip->i_btrace); +#endif +#ifdef XFS_RW_TRACE + ktrace_free(ip->i_rwtrace); +#endif +#ifdef XFS_ILOCK_TRACE + ktrace_free(ip->i_lock_trace); +#endif +#ifdef XFS_DIR2_TRACE + ktrace_free(ip->i_dir_trace); +#endif + + if (ip->i_itemp) { + /* + * Only if we are shutting down the fs will we see an + * inode still in the AIL. If it is there, we should remove + * it to prevent a use-after-free from occurring. + */ + xfs_log_item_t *lip = &ip->i_itemp->ili_item; + struct xfs_ail *ailp = lip->li_ailp; + + ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || + XFS_FORCED_SHUTDOWN(ip->i_mount)); + if (lip->li_flags & XFS_LI_IN_AIL) { + spin_lock(&ailp->xa_lock); + if (lip->li_flags & XFS_LI_IN_AIL) + xfs_trans_ail_delete(ailp, lip); + else + spin_unlock(&ailp->xa_lock); + } + xfs_inode_item_destroy(ip); + ip->i_itemp = NULL; + } + + /* asserts to verify all state is correct here */ + ASSERT(atomic_read(&ip->i_iocount) == 0); + ASSERT(atomic_read(&ip->i_pincount) == 0); + ASSERT(!spin_is_locked(&ip->i_flags_lock)); + ASSERT(completion_done(&ip->i_flush)); + + kmem_zone_free(xfs_inode_zone, ip); +} + /* * Check the validity of the inode we just found it the cache */ @@ -292,7 +357,8 @@ out_preload_end: if (lock_flags) xfs_iunlock(ip, lock_flags); out_destroy: - xfs_destroy_inode(ip); + __destroy_inode(VFS_I(ip)); + xfs_inode_free(ip); return error; } @@ -497,62 +563,7 @@ xfs_ireclaim( xfs_qm_dqdetach(ip); xfs_iunlock(ip, XFS_ILOCK_EXCL | XFS_IOLOCK_EXCL); - switch (ip->i_d.di_mode & S_IFMT) { - case S_IFREG: - case S_IFDIR: - case S_IFLNK: - xfs_idestroy_fork(ip, XFS_DATA_FORK); - break; - } - - if (ip->i_afp) - xfs_idestroy_fork(ip, XFS_ATTR_FORK); - -#ifdef XFS_INODE_TRACE - ktrace_free(ip->i_trace); -#endif -#ifdef XFS_BMAP_TRACE - ktrace_free(ip->i_xtrace); -#endif -#ifdef XFS_BTREE_TRACE - ktrace_free(ip->i_btrace); -#endif -#ifdef XFS_RW_TRACE - ktrace_free(ip->i_rwtrace); -#endif -#ifdef XFS_ILOCK_TRACE - ktrace_free(ip->i_lock_trace); -#endif -#ifdef XFS_DIR2_TRACE - ktrace_free(ip->i_dir_trace); -#endif - if (ip->i_itemp) { - /* - * Only if we are shutting down the fs will we see an - * inode still in the AIL. If it is there, we should remove - * it to prevent a use-after-free from occurring. - */ - xfs_log_item_t *lip = &ip->i_itemp->ili_item; - struct xfs_ail *ailp = lip->li_ailp; - - ASSERT(((lip->li_flags & XFS_LI_IN_AIL) == 0) || - XFS_FORCED_SHUTDOWN(ip->i_mount)); - if (lip->li_flags & XFS_LI_IN_AIL) { - spin_lock(&ailp->xa_lock); - if (lip->li_flags & XFS_LI_IN_AIL) - xfs_trans_ail_delete(ailp, lip); - else - spin_unlock(&ailp->xa_lock); - } - xfs_inode_item_destroy(ip); - ip->i_itemp = NULL; - } - /* asserts to verify all state is correct here */ - ASSERT(atomic_read(&ip->i_iocount) == 0); - ASSERT(atomic_read(&ip->i_pincount) == 0); - ASSERT(!spin_is_locked(&ip->i_flags_lock)); - ASSERT(completion_done(&ip->i_flush)); - kmem_zone_free(xfs_inode_zone, ip); + xfs_inode_free(ip); } /* diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 1804f866a71d..65f24a3cc992 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -309,23 +309,6 @@ static inline struct inode *VFS_I(struct xfs_inode *ip) return &ip->i_vnode; } -/* - * Get rid of a partially initialized inode. - * - * We have to go through destroy_inode to make sure allocations - * from init_inode_always like the security data are undone. - * - * We mark the inode bad so that it takes the short cut in - * the reclaim path instead of going through the flush path - * which doesn't make sense for an inode that has never seen the - * light of day. - */ -static inline void xfs_destroy_inode(struct xfs_inode *ip) -{ - make_bad_inode(VFS_I(ip)); - return destroy_inode(VFS_I(ip)); -} - /* * i_flags helper functions */ -- cgit v1.2.3 From 69130c7cf96ea853dc5be599dd6a4b98907d39cc Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 6 Aug 2009 15:07:37 -0700 Subject: compat_ioctl: hook up compat handler for FIEMAP ioctl The FIEMAP_IOC_FIEMAP mapping ioctl was missing a 32-bit compat handler, which means that 32-bit suerspace on 64-bit kernels cannot use this ioctl command. The structure is nicely aligned, padded, and sized, so it is just this simple. Tested w/ 32-bit ioctl tester (from Josef) on a 64-bit kernel on ext4. Signed-off-by: Eric Sandeen Cc: Cc: Mark Lord Cc: Arnd Bergmann Cc: Josef Bacik Cc: Jan Kara Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index f28f070a60fc..f91fd51b32e3 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1905,6 +1905,7 @@ COMPATIBLE_IOCTL(FIONCLEX) COMPATIBLE_IOCTL(FIOASYNC) COMPATIBLE_IOCTL(FIONBIO) COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ +COMPATIBLE_IOCTL(FS_IOC_FIEMAP) /* 0x00 */ COMPATIBLE_IOCTL(FIBMAP) COMPATIBLE_IOCTL(FIGETBSZ) -- cgit v1.2.3 From 2d8dd38a5aa0cc2490bbad9b75e77fa154e1d145 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 6 Aug 2009 15:07:39 -0700 Subject: vfs: mnt_want_write_file(): fix special file handling I suspect that mnt_want_write_file() may have wrong assumption. I think mnt_want_write_file() is assuming it increments ->mnt_writers if (file->f_mode & FMODE_WRITE). But, if it's special_file(), it is false? Signed-off-by: OGAWA Hirofumi Acked-by: Dave Hansen Cc: Al Viro Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 277c28a63ead..7230787d18b0 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -316,7 +316,8 @@ EXPORT_SYMBOL_GPL(mnt_clone_write); */ int mnt_want_write_file(struct file *file) { - if (!(file->f_mode & FMODE_WRITE)) + struct inode *inode = file->f_dentry->d_inode; + if (!(file->f_mode & FMODE_WRITE) || special_file(inode->i_mode)) return mnt_want_write(file->f_path.mnt); else return mnt_clone_write(file->f_path.mnt); -- cgit v1.2.3 From 3440625d78711bee41a84cf29c3d8c579b522666 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 6 Aug 2009 15:09:34 -0700 Subject: flat: fix uninitialized ptr with shared libs The new credentials code broke load_flat_shared_library() as it now uses an uninitialized cred pointer. Reported-by: Bernd Schmidt Tested-by: Bernd Schmidt Cc: Mike Frysinger Cc: David Howells Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_flat.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 697f6b5f1313..e92f229e3c6e 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -828,15 +828,22 @@ static int load_flat_shared_library(int id, struct lib_info *libs) if (IS_ERR(bprm.file)) return res; + bprm.cred = prepare_exec_creds(); + res = -ENOMEM; + if (!bprm.cred) + goto out; + res = prepare_binprm(&bprm); if (res <= (unsigned long)-4096) res = load_flat_file(&bprm, libs, id, NULL); - if (bprm.file) { - allow_write_access(bprm.file); - fput(bprm.file); - bprm.file = NULL; - } + + abort_creds(bprm.cred); + +out: + allow_write_access(bprm.file); + fput(bprm.file); + return(res); } -- cgit v1.2.3 From 4baf8c9201e88546918cbfa61ea8062c38bc1644 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Fri, 7 Aug 2009 13:47:08 -0400 Subject: Btrfs: remove superfluous NULL pointer check in btrfs_rename() This takes care of the following entry from Dan's list: fs/btrfs/inode.c +4788 btrfs_rename(36) warning: variable derefenced before check 'old_inode' Reported-by: Dan Carpenter Cc: Jonathan Corbet Cc: Eugene Teo Cc: Julia Lawall Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Chris Mason --- fs/btrfs/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3ea827ddf0fe..04b53b5ebe59 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4806,8 +4806,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, * and the replacement file is large. Start IO on it now so * we don't add too much work to the end of the transaction */ - if (new_inode && old_inode && S_ISREG(old_inode->i_mode) && - new_inode->i_size && + if (new_inode && S_ISREG(old_inode->i_mode) && new_inode->i_size && old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT) filemap_flush(old_inode->i_mapping); -- cgit v1.2.3 From 60f2e8f8a07331097a57ec4abcdc680405579377 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Fri, 7 Aug 2009 13:51:33 -0400 Subject: Btrfs: correct error-handling zlib error handling find_zlib_workspace returns an ERR_PTR value in an error case instead of NULL. A simplified version of the semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @match exists@ expression x, E; statement S1, S2; @@ x = find_zlib_workspace(...) ... when != x = E ( * if (x == NULL || ...) S1 else S2 | * if (x == NULL && ...) S1 else S2 ) // Signed-off-by: Julia Lawall Signed-off-by: Chris Mason --- fs/btrfs/zlib.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index ecfbce836d32..3e2b90eaa239 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -208,7 +208,7 @@ int btrfs_zlib_compress_pages(struct address_space *mapping, *total_in = 0; workspace = find_zlib_workspace(); - if (!workspace) + if (IS_ERR(workspace)) return -1; if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { @@ -366,7 +366,7 @@ int btrfs_zlib_decompress_biovec(struct page **pages_in, char *kaddr; workspace = find_zlib_workspace(); - if (!workspace) + if (IS_ERR(workspace)) return -ENOMEM; data_in = kmap(pages_in[page_in_index]); @@ -547,7 +547,7 @@ int btrfs_zlib_decompress(unsigned char *data_in, return -ENOMEM; workspace = find_zlib_workspace(); - if (!workspace) + if (IS_ERR(workspace)) return -ENOMEM; workspace->inf_strm.next_in = data_in; -- cgit v1.2.3 From ceab36edd3d3ad3ffd01d41d6d1e05ac1ff8357e Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Fri, 7 Aug 2009 13:51:33 -0400 Subject: Btrfs: fix balancing oops when invalidate_inode_pages2 returns EBUSY invalidate_inode_pages2_range may return -EBUSY occasionally which results Oops. This patch fixes the issue by moving invalidate_inode_pages2_range into a loop and keeping calling it until the return value is not -EBUSY. The EBUSY return is temporary, and can happen when the btrfs release page function is unable to release a page because the EXTENT_LOCK bit is set. Signed-off-by: Yan Zheng Signed-off-by: Chris Mason --- fs/btrfs/relocation.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index e71264d1c2c9..c04f7f212602 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2553,8 +2553,13 @@ int relocate_inode_pages(struct inode *inode, u64 start, u64 len) last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; /* make sure the dirty trick played by the caller work */ - ret = invalidate_inode_pages2_range(inode->i_mapping, - first_index, last_index); + while (1) { + ret = invalidate_inode_pages2_range(inode->i_mapping, + first_index, last_index); + if (ret != -EBUSY) + break; + schedule_timeout(HZ/10); + } if (ret) goto out_unlock; -- cgit v1.2.3 From e7432675f8ca868a4af365759a8d4c3779a3d922 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Thu, 6 Aug 2009 16:12:58 -0700 Subject: ocfs2: Initialize the cluster we're writing to in a non-sparse extend In a non-sparse extend, we correctly allocate (and zero) the clusters between the old_i_size and pos, but we don't zero the portions of the cluster we're writing to outside of pos<->len. It handles clustersize > pagesize and blocksize < pagesize. [Cleaned up by Joel Becker.] Signed-off-by: Sunil Mushran Signed-off-by: Joel Becker --- fs/ocfs2/aops.c | 66 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index e511df101451..b401654011a2 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -895,18 +895,17 @@ struct ocfs2_write_cluster_desc { */ unsigned c_new; unsigned c_unwritten; + unsigned c_needs_zero; }; -static inline int ocfs2_should_zero_cluster(struct ocfs2_write_cluster_desc *d) -{ - return d->c_new || d->c_unwritten; -} - struct ocfs2_write_ctxt { /* Logical cluster position / len of write */ u32 w_cpos; u32 w_clen; + /* First cluster allocated in a nonsparse extend */ + u32 w_first_new_cpos; + struct ocfs2_write_cluster_desc w_desc[OCFS2_MAX_CLUSTERS_PER_PAGE]; /* @@ -984,6 +983,7 @@ static int ocfs2_alloc_write_ctxt(struct ocfs2_write_ctxt **wcp, return -ENOMEM; wc->w_cpos = pos >> osb->s_clustersize_bits; + wc->w_first_new_cpos = UINT_MAX; cend = (pos + len - 1) >> osb->s_clustersize_bits; wc->w_clen = cend - wc->w_cpos + 1; get_bh(di_bh); @@ -1218,20 +1218,18 @@ out: */ static int ocfs2_write_cluster(struct address_space *mapping, u32 phys, unsigned int unwritten, + unsigned int should_zero, struct ocfs2_alloc_context *data_ac, struct ocfs2_alloc_context *meta_ac, struct ocfs2_write_ctxt *wc, u32 cpos, loff_t user_pos, unsigned user_len) { - int ret, i, new, should_zero = 0; + int ret, i, new; u64 v_blkno, p_blkno; struct inode *inode = mapping->host; struct ocfs2_extent_tree et; new = phys == 0 ? 1 : 0; - if (new || unwritten) - should_zero = 1; - if (new) { u32 tmp_pos; @@ -1342,7 +1340,9 @@ static int ocfs2_write_cluster_by_desc(struct address_space *mapping, local_len = osb->s_clustersize - cluster_off; ret = ocfs2_write_cluster(mapping, desc->c_phys, - desc->c_unwritten, data_ac, meta_ac, + desc->c_unwritten, + desc->c_needs_zero, + data_ac, meta_ac, wc, desc->c_cpos, pos, local_len); if (ret) { mlog_errno(ret); @@ -1392,14 +1392,14 @@ static void ocfs2_set_target_boundaries(struct ocfs2_super *osb, * newly allocated cluster. */ desc = &wc->w_desc[0]; - if (ocfs2_should_zero_cluster(desc)) + if (desc->c_needs_zero) ocfs2_figure_cluster_boundaries(osb, desc->c_cpos, &wc->w_target_from, NULL); desc = &wc->w_desc[wc->w_clen - 1]; - if (ocfs2_should_zero_cluster(desc)) + if (desc->c_needs_zero) ocfs2_figure_cluster_boundaries(osb, desc->c_cpos, NULL, @@ -1467,13 +1467,28 @@ static int ocfs2_populate_write_desc(struct inode *inode, phys++; } + /* + * If w_first_new_cpos is < UINT_MAX, we have a non-sparse + * file that got extended. w_first_new_cpos tells us + * where the newly allocated clusters are so we can + * zero them. + */ + if (desc->c_cpos >= wc->w_first_new_cpos) { + BUG_ON(phys == 0); + desc->c_needs_zero = 1; + } + desc->c_phys = phys; if (phys == 0) { desc->c_new = 1; + desc->c_needs_zero = 1; *clusters_to_alloc = *clusters_to_alloc + 1; } - if (ext_flags & OCFS2_EXT_UNWRITTEN) + + if (ext_flags & OCFS2_EXT_UNWRITTEN) { desc->c_unwritten = 1; + desc->c_needs_zero = 1; + } num_clusters--; } @@ -1633,10 +1648,13 @@ static int ocfs2_expand_nonsparse_inode(struct inode *inode, loff_t pos, if (newsize <= i_size_read(inode)) return 0; - ret = ocfs2_extend_no_holes(inode, newsize, newsize - len); + ret = ocfs2_extend_no_holes(inode, newsize, pos); if (ret) mlog_errno(ret); + wc->w_first_new_cpos = + ocfs2_clusters_for_bytes(inode->i_sb, i_size_read(inode)); + return ret; } @@ -1645,7 +1663,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, struct page **pagep, void **fsdata, struct buffer_head *di_bh, struct page *mmap_page) { - int ret, credits = OCFS2_INODE_UPDATE_CREDITS; + int ret, cluster_of_pages, credits = OCFS2_INODE_UPDATE_CREDITS; unsigned int clusters_to_alloc, extents_to_split; struct ocfs2_write_ctxt *wc; struct inode *inode = mapping->host; @@ -1723,8 +1741,19 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, } - ocfs2_set_target_boundaries(osb, wc, pos, len, - clusters_to_alloc + extents_to_split); + /* + * We have to zero sparse allocated clusters, unwritten extent clusters, + * and non-sparse clusters we just extended. For non-sparse writes, + * we know zeros will only be needed in the first and/or last cluster. + */ + if (clusters_to_alloc || extents_to_split || + wc->w_desc[0].c_needs_zero || + wc->w_desc[wc->w_clen - 1].c_needs_zero) + cluster_of_pages = 1; + else + cluster_of_pages = 0; + + ocfs2_set_target_boundaries(osb, wc, pos, len, cluster_of_pages); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { @@ -1757,8 +1786,7 @@ int ocfs2_write_begin_nolock(struct address_space *mapping, * extent. */ ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, - clusters_to_alloc + extents_to_split, - mmap_page); + cluster_of_pages, mmap_page); if (ret) { mlog_errno(ret); goto out_quota; -- cgit v1.2.3 From 8a57a9dda760ea7845390f1cd36f3eb2a49391d8 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 6 Aug 2009 16:07:50 -0700 Subject: ocfs2: keep index within status_map[] Do not exceed array status_map[] Signed-off-by: Roel Kluin Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Joel Becker --- fs/ocfs2/stack_o2cb.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c index 3f661376a2de..e49c41050264 100644 --- a/fs/ocfs2/stack_o2cb.c +++ b/fs/ocfs2/stack_o2cb.c @@ -17,6 +17,7 @@ * General Public License for more details. */ +#include #include #include @@ -153,7 +154,7 @@ static int status_map[] = { static int dlm_status_to_errno(enum dlm_status status) { - BUG_ON(status > (sizeof(status_map) / sizeof(status_map[0]))); + BUG_ON(status < 0 || status >= ARRAY_SIZE(status_map)); return status_map[status]; } -- cgit v1.2.3 From 13f0feafa6b8aead57a2a328e2fca6a5828bf286 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 23 Jun 2009 21:25:32 +0200 Subject: mm_for_maps: simplify, use ptrace_may_access() It would be nice to kill __ptrace_may_access(). It requires task_lock(), but this lock is only needed to read mm->flags in the middle. Convert mm_for_maps() to use ptrace_may_access(), this also simplifies the code a little bit. Also, we do not need to take ->mmap_sem in advance. In fact I think mm_for_maps() should not play with ->mmap_sem at all, the caller should take this lock. With or without this patch, without ->cred_guard_mutex held we can race with exec() and get the new ->mm but check old creds. Signed-off-by: Oleg Nesterov Reviewed-by: Serge Hallyn Signed-off-by: James Morris --- fs/proc/base.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 3ce5ae9e3d2d..917f338a6739 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -237,20 +237,19 @@ struct mm_struct *mm_for_maps(struct task_struct *task) struct mm_struct *mm = get_task_mm(task); if (!mm) return NULL; + if (mm != current->mm) { + /* + * task->mm can be changed before security check, + * in that case we must notice the change after. + */ + if (!ptrace_may_access(task, PTRACE_MODE_READ) || + mm != task->mm) { + mmput(mm); + return NULL; + } + } down_read(&mm->mmap_sem); - task_lock(task); - if (task->mm != mm) - goto out; - if (task->mm != current->mm && - __ptrace_may_access(task, PTRACE_MODE_READ) < 0) - goto out; - task_unlock(task); return mm; -out: - task_unlock(task); - up_read(&mm->mmap_sem); - mmput(mm); - return NULL; } static int proc_pid_cmdline(struct task_struct *task, char * buffer) -- cgit v1.2.3 From 00f89d218523b9bf6b522349c039d5ac80aa536d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 10 Jul 2009 03:27:38 +0200 Subject: mm_for_maps: shift down_read(mmap_sem) to the caller mm_for_maps() takes ->mmap_sem after security checks, this looks strange and obfuscates the locking rules. Move this lock to its single caller, m_start(). Signed-off-by: Oleg Nesterov Acked-by: Serge Hallyn Signed-off-by: James Morris --- fs/proc/base.c | 8 +++----- fs/proc/task_mmu.c | 1 + fs/proc/task_nommu.c | 1 + 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 917f338a6739..f3c2e4085fed 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -235,9 +235,8 @@ static int check_mem_permission(struct task_struct *task) struct mm_struct *mm_for_maps(struct task_struct *task) { struct mm_struct *mm = get_task_mm(task); - if (!mm) - return NULL; - if (mm != current->mm) { + + if (mm && mm != current->mm) { /* * task->mm can be changed before security check, * in that case we must notice the change after. @@ -245,10 +244,9 @@ struct mm_struct *mm_for_maps(struct task_struct *task) if (!ptrace_may_access(task, PTRACE_MODE_READ) || mm != task->mm) { mmput(mm); - return NULL; + mm = NULL; } } - down_read(&mm->mmap_sem); return mm; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6f61b7cc32e0..9bd8be1d235c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -119,6 +119,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) mm = mm_for_maps(priv->task); if (!mm) return NULL; + down_read(&mm->mmap_sem); tail_vma = get_gate_vma(priv->task); priv->tail_vma = tail_vma; diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 64a72e2e7650..8f5c05d3dbd3 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -189,6 +189,7 @@ static void *m_start(struct seq_file *m, loff_t *pos) priv->task = NULL; return NULL; } + down_read(&mm->mmap_sem); /* start from the Nth VMA */ for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) -- cgit v1.2.3 From 704b836cbf19e885f8366bccb2e4b0474346c02d Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 10 Jul 2009 03:27:40 +0200 Subject: mm_for_maps: take ->cred_guard_mutex to fix the race with exec The problem is minor, but without ->cred_guard_mutex held we can race with exec() and get the new ->mm but check old creds. Now we do not need to re-check task->mm after ptrace_may_access(), it can't be changed to the new mm under us. Strictly speaking, this also fixes another very minor problem. Unless security check fails or the task exits mm_for_maps() should never return NULL, the caller should get either old or new ->mm. Signed-off-by: Oleg Nesterov Acked-by: Serge Hallyn Signed-off-by: James Morris --- fs/proc/base.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index f3c2e4085fed..175db258942f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -234,19 +234,19 @@ static int check_mem_permission(struct task_struct *task) struct mm_struct *mm_for_maps(struct task_struct *task) { - struct mm_struct *mm = get_task_mm(task); + struct mm_struct *mm; - if (mm && mm != current->mm) { - /* - * task->mm can be changed before security check, - * in that case we must notice the change after. - */ - if (!ptrace_may_access(task, PTRACE_MODE_READ) || - mm != task->mm) { - mmput(mm); - mm = NULL; - } + if (mutex_lock_killable(&task->cred_guard_mutex)) + return NULL; + + mm = get_task_mm(task); + if (mm && mm != current->mm && + !ptrace_may_access(task, PTRACE_MODE_READ)) { + mmput(mm); + mm = NULL; } + mutex_unlock(&task->cred_guard_mutex); + return mm; } -- cgit v1.2.3 From b409d7a0ab46fe530efe52734984b4ed5d46c3eb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 6 Aug 2009 23:29:34 +0200 Subject: ocfs2: Fix possible deadlock when extending quota file In OCFS2, allocator locks rank above transaction start. Thus we cannot extend quota file from inside a transaction less we could deadlock. We solve the problem by starting transaction not already in ocfs2_acquire_dquot() but only in ocfs2_local_read_dquot() and ocfs2_global_read_dquot() and we allocate blocks to quota files before starting the transaction. In case we crash, quota files will just have a few blocks more but that's no problem since we just use them next time we extend the quota file. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/journal.h | 5 --- fs/ocfs2/quota_global.c | 115 ++++++++++++++++++++++++------------------------ 2 files changed, 57 insertions(+), 63 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 4a4d3b55fd22..2c3222aec622 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -363,11 +363,6 @@ static inline int ocfs2_quota_trans_credits(struct super_block *sb) return credits; } -/* Number of credits needed for removing quota structure from file */ -int ocfs2_calc_qdel_credits(struct super_block *sb, int type); -/* Number of credits needed for initialization of new quota structure */ -int ocfs2_calc_qinit_credits(struct super_block *sb, int type); - /* group extend. inode update and last group update. */ #define OCFS2_GROUP_EXTEND_CREDITS (OCFS2_INODE_UPDATE_CREDITS + 1) diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index d604a6aa0a22..bf7742d0ee3b 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -215,11 +215,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, loff_t rounded_end = ocfs2_align_bytes_to_blocks(sb, off + len); - down_write(&OCFS2_I(gqinode)->ip_alloc_sem); - err = ocfs2_extend_no_holes(gqinode, rounded_end, off); - up_write(&OCFS2_I(gqinode)->ip_alloc_sem); - if (err < 0) - goto out; + /* Space is already allocated in ocfs2_global_read_dquot() */ err = ocfs2_simple_size_update(gqinode, oinfo->dqi_gqi_bh, rounded_end); @@ -405,13 +401,36 @@ int ocfs2_global_write_info(struct super_block *sb, int type) return err; } +static int ocfs2_global_qinit_alloc(struct super_block *sb, int type) +{ + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; + + /* + * We may need to allocate tree blocks and a leaf block but not the + * root block + */ + return oinfo->dqi_gi.dqi_qtree_depth; +} + +static int ocfs2_calc_global_qinit_credits(struct super_block *sb, int type) +{ + /* We modify all the allocated blocks, tree root, and info block */ + return (ocfs2_global_qinit_alloc(sb, type) + 2) * + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; +} + /* Read in information from global quota file and acquire a reference to it. * dquot_acquire() has already started the transaction and locked quota file */ int ocfs2_global_read_dquot(struct dquot *dquot) { int err, err2, ex = 0; - struct ocfs2_mem_dqinfo *info = - sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; + struct super_block *sb = dquot->dq_sb; + int type = dquot->dq_type; + struct ocfs2_mem_dqinfo *info = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_super *osb = OCFS2_SB(sb); + struct inode *gqinode = info->dqi_gqinode; + int need_alloc = ocfs2_global_qinit_alloc(sb, type); + handle_t *handle = NULL; err = ocfs2_qinfo_lock(info, 0); if (err < 0) @@ -422,14 +441,33 @@ int ocfs2_global_read_dquot(struct dquot *dquot) OCFS2_DQUOT(dquot)->dq_use_count++; OCFS2_DQUOT(dquot)->dq_origspace = dquot->dq_dqb.dqb_curspace; OCFS2_DQUOT(dquot)->dq_originodes = dquot->dq_dqb.dqb_curinodes; + ocfs2_qinfo_unlock(info, 0); + if (!dquot->dq_off) { /* No real quota entry? */ - /* Upgrade to exclusive lock for allocation */ - ocfs2_qinfo_unlock(info, 0); - err = ocfs2_qinfo_lock(info, 1); - if (err < 0) - goto out_qlock; ex = 1; + /* + * Add blocks to quota file before we start a transaction since + * locking allocators ranks above a transaction start + */ + WARN_ON(journal_current_handle()); + down_write(&OCFS2_I(gqinode)->ip_alloc_sem); + err = ocfs2_extend_no_holes(gqinode, + gqinode->i_size + (need_alloc << sb->s_blocksize_bits), + gqinode->i_size); + up_write(&OCFS2_I(gqinode)->ip_alloc_sem); + if (err < 0) + goto out; + } + + handle = ocfs2_start_trans(osb, + ocfs2_calc_global_qinit_credits(sb, type)); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out; } + err = ocfs2_qinfo_lock(info, ex); + if (err < 0) + goto out_trans; err = qtree_write_dquot(&info->dqi_gi, dquot); if (ex && info_dirty(sb_dqinfo(dquot->dq_sb, dquot->dq_type))) { err2 = __ocfs2_global_write_info(dquot->dq_sb, dquot->dq_type); @@ -441,6 +479,9 @@ out_qlock: ocfs2_qinfo_unlock(info, 1); else ocfs2_qinfo_unlock(info, 0); +out_trans: + if (handle) + ocfs2_commit_trans(osb, handle); out: if (err < 0) mlog_errno(err); @@ -638,24 +679,17 @@ out: return status; } -int ocfs2_calc_qdel_credits(struct super_block *sb, int type) +static int ocfs2_calc_qdel_credits(struct super_block *sb, int type) { - struct ocfs2_mem_dqinfo *oinfo; - int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; - - if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) - return 0; - - oinfo = sb_dqinfo(sb, type)->dqi_priv; + struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(sb, type)->dqi_priv; /* * We modify tree, leaf block, global info, local chunk header, * global and local inode; OCFS2_QINFO_WRITE_CREDITS already * accounts for inode update */ - return oinfo->dqi_gi.dqi_qtree_depth + + return (oinfo->dqi_gi.dqi_qtree_depth + 2) * + OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + OCFS2_QINFO_WRITE_CREDITS + - 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + OCFS2_INODE_UPDATE_CREDITS; } @@ -688,36 +722,10 @@ out: return status; } -int ocfs2_calc_qinit_credits(struct super_block *sb, int type) -{ - struct ocfs2_mem_dqinfo *oinfo; - int features[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA }; - struct ocfs2_dinode *lfe, *gfe; - - if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, features[type])) - return 0; - - oinfo = sb_dqinfo(sb, type)->dqi_priv; - gfe = (struct ocfs2_dinode *)oinfo->dqi_gqi_bh->b_data; - lfe = (struct ocfs2_dinode *)oinfo->dqi_lqi_bh->b_data; - /* We can extend local file + global file. In local file we - * can modify info, chunk header block and dquot block. In - * global file we can modify info, tree and leaf block */ - return ocfs2_calc_extend_credits(sb, &lfe->id2.i_list, 0) + - ocfs2_calc_extend_credits(sb, &gfe->id2.i_list, 0) + - OCFS2_LOCAL_QINFO_WRITE_CREDITS + - 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS + - oinfo->dqi_gi.dqi_qtree_depth + - 2 * OCFS2_QUOTA_BLOCK_UPDATE_CREDITS; -} - static int ocfs2_acquire_dquot(struct dquot *dquot) { - handle_t *handle; struct ocfs2_mem_dqinfo *oinfo = sb_dqinfo(dquot->dq_sb, dquot->dq_type)->dqi_priv; - struct ocfs2_super *osb = OCFS2_SB(dquot->dq_sb); int status = 0; mlog_entry("id=%u, type=%d", dquot->dq_id, dquot->dq_type); @@ -726,16 +734,7 @@ static int ocfs2_acquire_dquot(struct dquot *dquot) status = ocfs2_lock_global_qf(oinfo, 1); if (status < 0) goto out; - handle = ocfs2_start_trans(osb, - ocfs2_calc_qinit_credits(dquot->dq_sb, dquot->dq_type)); - if (IS_ERR(handle)) { - status = PTR_ERR(handle); - mlog_errno(status); - goto out_ilock; - } status = dquot_acquire(dquot); - ocfs2_commit_trans(osb, handle); -out_ilock: ocfs2_unlock_global_qf(oinfo, 1); out: mlog_exit(status); -- cgit v1.2.3 From 0cc6eee130b0c062feec8446d9cecdb17d2cfad3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:53 -0400 Subject: xfs: avoid memory allocation under m_peraglock in growfs code Allocate the memory for the larger m_perag array before taking the per-AG lock as the per-AG lock can be taken under the i_lock which can be taken from reclaim context. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_fsops.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index cbd451bb4848..2d0b3e1da9e6 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -167,17 +167,25 @@ xfs_growfs_data_private( new = nb - mp->m_sb.sb_dblocks; oagcount = mp->m_sb.sb_agcount; if (nagcount > oagcount) { + void *new_perag, *old_perag; + xfs_filestream_flush(mp); + + new_perag = kmem_zalloc(sizeof(xfs_perag_t) * nagcount, + KM_MAYFAIL); + if (!new_perag) + return XFS_ERROR(ENOMEM); + down_write(&mp->m_peraglock); - mp->m_perag = kmem_realloc(mp->m_perag, - sizeof(xfs_perag_t) * nagcount, - sizeof(xfs_perag_t) * oagcount, - KM_SLEEP); - memset(&mp->m_perag[oagcount], 0, - (nagcount - oagcount) * sizeof(xfs_perag_t)); + memcpy(new_perag, mp->m_perag, sizeof(xfs_perag_t) * oagcount); + old_perag = mp->m_perag; + mp->m_perag = new_perag; + mp->m_flags |= XFS_MOUNT_32BITINODES; nagimax = xfs_initialize_perag(mp, nagcount); up_write(&mp->m_peraglock); + + kmem_free(old_perag); } tp = xfs_trans_alloc(mp, XFS_TRANS_GROWFS); tp->t_flags |= XFS_TRANS_RESERVE; -- cgit v1.2.3 From ca35dcd6cae7d4a780c484c53f45548c4719f82c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:54 -0400 Subject: xfs: switch to NOFS allocation under i_lock in xfs_getbmap xfs_getbmap allocates memory with i_lock held, but i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 7928b9983c1d..8ee5b5a76a2a 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -6009,7 +6009,7 @@ xfs_getbmap( */ error = ENOMEM; subnex = 16; - map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL); + map = kmem_alloc(subnex * sizeof(*map), KM_MAYFAIL | KM_NOFS); if (!map) goto out_unlock_ilock; -- cgit v1.2.3 From f41d7fb9da05b604f8a69fb6cac2a0563c8ede4e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:55 -0400 Subject: xfs: switch to NOFS allocation under i_lock in xfs_da_state_alloc xfs_da_state_alloc is always called with i_lock held, but i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_da_btree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 9ff6e57a5075..bd0bb6dfcdfa 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2201,7 +2201,7 @@ kmem_zone_t *xfs_dabuf_zone; /* dabuf zone */ xfs_da_state_t * xfs_da_state_alloc(void) { - return kmem_zone_zalloc(xfs_da_state_zone, KM_SLEEP); + return kmem_zone_zalloc(xfs_da_state_zone, KM_NOFS); } /* -- cgit v1.2.3 From 73195ed7864ae4a1fb0bea2ed9df59d19b4fde90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:56 -0400 Subject: xfs: switch to NOFS allocation under i_lock in xfs_da_buf_make i_lock is taken in the reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_da_btree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index bd0bb6dfcdfa..2847bbc1c534 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -2261,9 +2261,9 @@ xfs_da_buf_make(int nbuf, xfs_buf_t **bps, inst_t *ra) int off; if (nbuf == 1) - dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_SLEEP); + dabuf = kmem_zone_alloc(xfs_dabuf_zone, KM_NOFS); else - dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_SLEEP); + dabuf = kmem_alloc(XFS_DA_BUF_SIZE(nbuf), KM_NOFS); dabuf->dirty = 0; #ifdef XFS_DABUF_DEBUG dabuf->ra = ra; -- cgit v1.2.3 From 3f52c2f0a07c23771909cc53f2e9451a7f1bf253 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:57 -0400 Subject: xfs: switch to NOFS allocation under i_lock in xfs_dir_cilookup_result xfs_dir_cilookup_result is always called with i_lock held, but i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_dir2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_dir2.c b/fs/xfs/xfs_dir2.c index c657bec6d951..bb1d58eb3982 100644 --- a/fs/xfs/xfs_dir2.c +++ b/fs/xfs/xfs_dir2.c @@ -256,7 +256,7 @@ xfs_dir_cilookup_result( !(args->op_flags & XFS_DA_OP_CILOOKUP)) return EEXIST; - args->value = kmem_alloc(len, KM_MAYFAIL); + args->value = kmem_alloc(len, KM_NOFS | KM_MAYFAIL); if (!args->value) return ENOMEM; -- cgit v1.2.3 From 36fae17a648e0aee5d9560514d08477ef48dc87f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:58 -0400 Subject: xfs: switch to NOFS allocation under i_lock in xfs_buf_associate_memory xfs_buf_associate_memory is used for setting up the spare buffer for the log wrap case in xlog_sync which can happen under i_lock when called from xfs_fsync. The i_lock mutex is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. There are a couple more uses of xfs_buf_associate_memory in the log recovery code that are also affected by this, but I'd rather keep the code simple than passing on a gfp_mask argument. Longer term we should just stop requiring the memoery allocation in xlog_sync by some smaller rework of the buffer layer. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_buf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 1418b916fc27..178c20c13e83 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -770,7 +770,7 @@ xfs_buf_associate_memory( bp->b_pages = NULL; bp->b_addr = mem; - rval = _xfs_buf_get_pages(bp, page_count, 0); + rval = _xfs_buf_get_pages(bp, page_count, XBF_DONT_BLOCK); if (rval) return rval; -- cgit v1.2.3 From 10746e47e722b5688fcd6eba9fbf9b2e64a248a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:14:59 -0400 Subject: xfs: switch to NOFS allocation under i_lock in xfs_attr_rmtval_set xfs_attr_rmtval_set is always called with i_lock held, and i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_attr.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index db15feb906ff..bfb583791fe2 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2141,8 +2141,8 @@ xfs_attr_rmtval_set(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map.br_startblock), blkcnt = XFS_FSB_TO_BB(mp, map.br_blockcount); - bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, - blkcnt, XFS_BUF_LOCK); + bp = xfs_buf_get_flags(mp->m_ddev_targp, dblkno, blkcnt, + XFS_BUF_LOCK | XBF_DONT_BLOCK); ASSERT(bp); ASSERT(!XFS_BUF_GETERROR(bp)); -- cgit v1.2.3 From 7b02ecb3031b192823bc732ae717febc0a59aa92 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:15:00 -0400 Subject: xfs: switch to NOFS allocation under i_lock in xfs_readlink_bmap xfs_readlink_bmap is called with i_lock held, but i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_vnodeops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index c4eca5ed5dab..492d75bae2bf 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -538,7 +538,9 @@ xfs_readlink_bmap( d = XFS_FSB_TO_DADDR(mp, mval[n].br_startblock); byte_cnt = XFS_FSB_TO_B(mp, mval[n].br_blockcount); - bp = xfs_buf_read(mp->m_ddev_targp, d, BTOBB(byte_cnt), 0); + bp = xfs_buf_read_flags(mp->m_ddev_targp, d, BTOBB(byte_cnt), + XBF_LOCK | XBF_MAPPED | + XBF_DONT_BLOCK); error = XFS_BUF_GETERROR(bp); if (error) { xfs_ioerror_alert("xfs_readlink", -- cgit v1.2.3 From ddd3a14e0f030f0f7b900621f67532285b8657ef Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sat, 18 Jul 2009 18:15:01 -0400 Subject: xfs: switch to NOFS allocation under i_lock in xfs_attr_rmtval_get xfs_attr_rmtval_get is always called with i_lock held, but i_lock is taken in reclaim context so all allocations under it must avoid recursions into the filesystem. Reported by the new reclaim context tracing in lockdep. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_attr.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index bfb583791fe2..4ece1906bd41 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -2010,7 +2010,9 @@ xfs_attr_rmtval_get(xfs_da_args_t *args) dblkno = XFS_FSB_TO_DADDR(mp, map[i].br_startblock); blkcnt = XFS_FSB_TO_BB(mp, map[i].br_blockcount); error = xfs_read_buf(mp, mp->m_ddev_targp, dblkno, - blkcnt, XFS_BUF_LOCK, &bp); + blkcnt, + XFS_BUF_LOCK | XBF_DONT_BLOCK, + &bp); if (error) return(error); -- cgit v1.2.3 From e0c222c411e22f086e929cd69fdcc89336164ec1 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 20 Jul 2009 10:52:15 -0500 Subject: use XFS_CORRUPTION_ERROR in xfs_btree_check_sblock In Red Hat Bug 512552 - Can't write to XFS mount during raid5 resync a user ran into corruption while resyncing a raid, and we failed a consistency test, but didn't get much more info; it'd be nice to call XFS_CORRUPTION_ERROR here so we can see the buffer contents. Signed-off-by: Eric Sandeen Reviewed-by: Christoph Hellwig Signed-off-by: Felix Blyakher --- fs/xfs/xfs_btree.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_btree.c b/fs/xfs/xfs_btree.c index e9df99574829..26717388acf5 100644 --- a/fs/xfs/xfs_btree.c +++ b/fs/xfs/xfs_btree.c @@ -120,8 +120,8 @@ xfs_btree_check_sblock( XFS_RANDOM_BTREE_CHECK_SBLOCK))) { if (bp) xfs_buftrace("SBTREE ERROR", bp); - XFS_ERROR_REPORT("xfs_btree_check_sblock", XFS_ERRLEVEL_LOW, - cur->bc_mp); + XFS_CORRUPTION_ERROR("xfs_btree_check_sblock", + XFS_ERRLEVEL_LOW, cur->bc_mp, block); return XFS_ERROR(EFSCORRUPTED); } return 0; -- cgit v1.2.3 From b89d4208de3de442c9025919c4261be0b38e79a4 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Aug 2009 11:32:18 -0300 Subject: xfs: check for dinode realtime flag corruption Ramon tested XFS with a modified version of fsfuzzer and hit a NULL pointer dereference in __xfs_get_blocks due to the RT device target pointer being NULL. To fix this reject inode with the realtime bit set on a a filesystem without an RT subvolume during inode read. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Reviewed-by: Felix Blyakher Reported-by: Ramon de Carvalho Valle Tested-by: Ramon de Carvalho Valle Signed-off-by: Felix Blyakher --- fs/xfs/xfs_inode.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1f22d65fed0a..da428b3fe0f5 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -343,6 +343,16 @@ xfs_iformat( return XFS_ERROR(EFSCORRUPTED); } + if (unlikely((ip->i_d.di_flags & XFS_DIFLAG_REALTIME) && + !ip->i_mount->m_rtdev_targp)) { + xfs_fs_repair_cmn_err(CE_WARN, ip->i_mount, + "corrupt dinode %Lu, has realtime flag set.", + ip->i_ino); + XFS_CORRUPTION_ERROR("xfs_iformat(realtime)", + XFS_ERRLEVEL_LOW, ip->i_mount, dip); + return XFS_ERROR(EFSCORRUPTED); + } + switch (ip->i_d.di_mode & S_IFMT) { case S_IFIFO: case S_IFCHR: -- cgit v1.2.3 From a8914f3a6d72c97328597a556a99daaf5cc288ae Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 10 Aug 2009 11:32:44 -0300 Subject: xfs: fix spin_is_locked assert on uni-processor builds Without SMP or preemption spin_is_locked always returns false, so we can't do an assert with it. Instead use assert_spin_locked, which does the right thing on all builds. Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Reported-by: Johannes Engel Tested-by: Johannes Engel Signed-off-by: Felix Blyakher --- fs/xfs/xfs_log.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 3750f04ede0b..9dbdff3ea484 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -3180,7 +3180,7 @@ try_again: STATIC void xlog_state_want_sync(xlog_t *log, xlog_in_core_t *iclog) { - ASSERT(spin_is_locked(&log->l_icloglock)); + assert_spin_locked(&log->l_icloglock); if (iclog->ic_state == XLOG_STATE_ACTIVE) { xlog_state_switch_iclogs(log, iclog, 0); -- cgit v1.2.3 From 1ae88b2e446261c038f2c0c3150ffae142b227a2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 12 Aug 2009 09:12:30 -0400 Subject: NFS: Fix an O_DIRECT Oops... We can't call nfs_readdata_release()/nfs_writedata_release() without first initialising and referencing args.context. Doing so inside nfs_direct_read_schedule_segment()/nfs_direct_write_schedule_segment() causes an Oops. We should rather be calling nfs_readdata_free()/nfs_writedata_free() in those cases. Looking at the O_DIRECT code, the "struct nfs_direct_req" is already referencing the nfs_open_context for us. Since the readdata and writedata structures carry a reference to that, we can simplify things by getting rid of the extra nfs_open_context references, so that we can replace all instances of nfs_readdata_release()/nfs_writedata_release(). Reported-by: Catalin Marinas Signed-off-by: Trond Myklebust Tested-by: Catalin Marinas Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/nfs/direct.c | 20 ++++++++++---------- fs/nfs/read.c | 6 ++---- fs/nfs/write.c | 6 ++---- include/linux/nfs_fs.h | 5 ++--- 4 files changed, 16 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 489fc01a3204..e4e089a8f294 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -255,7 +255,7 @@ static void nfs_direct_read_release(void *calldata) if (put_dreq(dreq)) nfs_direct_complete(dreq); - nfs_readdata_release(calldata); + nfs_readdata_free(data); } static const struct rpc_call_ops nfs_read_direct_ops = { @@ -314,14 +314,14 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->npages, 1, 0, data->pagevec, NULL); up_read(¤t->mm->mmap_sem); if (result < 0) { - nfs_readdata_release(data); + nfs_readdata_free(data); break; } if ((unsigned)result < data->npages) { bytes = result * PAGE_SIZE; if (bytes <= pgbase) { nfs_direct_release_pages(data->pagevec, result); - nfs_readdata_release(data); + nfs_readdata_free(data); break; } bytes -= pgbase; @@ -334,7 +334,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_direct_req *dreq, data->inode = inode; data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); - data->args.context = get_nfs_open_context(ctx); + data->args.context = ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; @@ -441,7 +441,7 @@ static void nfs_direct_free_writedata(struct nfs_direct_req *dreq) struct nfs_write_data *data = list_entry(dreq->rewrite_list.next, struct nfs_write_data, pages); list_del(&data->pages); nfs_direct_release_pages(data->pagevec, data->npages); - nfs_writedata_release(data); + nfs_writedata_free(data); } } @@ -534,7 +534,7 @@ static void nfs_direct_commit_release(void *calldata) dprintk("NFS: %5u commit returned %d\n", data->task.tk_pid, status); nfs_direct_write_complete(dreq, data->inode); - nfs_commitdata_release(calldata); + nfs_commit_free(data); } static const struct rpc_call_ops nfs_commit_direct_ops = { @@ -570,7 +570,7 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq) data->args.fh = NFS_FH(data->inode); data->args.offset = 0; data->args.count = 0; - data->args.context = get_nfs_open_context(dreq->ctx); + data->args.context = dreq->ctx; data->res.count = 0; data->res.fattr = &data->fattr; data->res.verf = &data->verf; @@ -734,14 +734,14 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->npages, 0, 0, data->pagevec, NULL); up_read(¤t->mm->mmap_sem); if (result < 0) { - nfs_writedata_release(data); + nfs_writedata_free(data); break; } if ((unsigned)result < data->npages) { bytes = result * PAGE_SIZE; if (bytes <= pgbase) { nfs_direct_release_pages(data->pagevec, result); - nfs_writedata_release(data); + nfs_writedata_free(data); break; } bytes -= pgbase; @@ -756,7 +756,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_direct_req *dreq, data->inode = inode; data->cred = msg.rpc_cred; data->args.fh = NFS_FH(inode); - data->args.context = get_nfs_open_context(ctx); + data->args.context = ctx; data->args.offset = pos; data->args.pgbase = pgbase; data->args.pages = data->pagevec; diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 73ea5e8d66ce..12c9e66d3f1d 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -60,17 +60,15 @@ struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount) return p; } -static void nfs_readdata_free(struct nfs_read_data *p) +void nfs_readdata_free(struct nfs_read_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_rdata_mempool); } -void nfs_readdata_release(void *data) +static void nfs_readdata_release(struct nfs_read_data *rdata) { - struct nfs_read_data *rdata = data; - put_nfs_open_context(rdata->args.context); nfs_readdata_free(rdata); } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 0a0a2ff767c3..a34fae21fe10 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -87,17 +87,15 @@ struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount) return p; } -static void nfs_writedata_free(struct nfs_write_data *p) +void nfs_writedata_free(struct nfs_write_data *p) { if (p && (p->pagevec != &p->page_array[0])) kfree(p->pagevec); mempool_free(p, nfs_wdata_mempool); } -void nfs_writedata_release(void *data) +static void nfs_writedata_release(struct nfs_write_data *wdata) { - struct nfs_write_data *wdata = data; - put_nfs_open_context(wdata->args.context); nfs_writedata_free(wdata); } diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index fdffb413b192..f6b90240dd41 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -473,7 +473,6 @@ extern int nfs_writepages(struct address_space *, struct writeback_control *); extern int nfs_flush_incompatible(struct file *file, struct page *page); extern int nfs_updatepage(struct file *, struct page *, unsigned int, unsigned int); extern int nfs_writeback_done(struct rpc_task *, struct nfs_write_data *); -extern void nfs_writedata_release(void *); /* * Try to write back everything synchronously (but check the @@ -488,7 +487,6 @@ extern int nfs_wb_page_cancel(struct inode *inode, struct page* page); extern int nfs_commit_inode(struct inode *, int); extern struct nfs_write_data *nfs_commitdata_alloc(void); extern void nfs_commit_free(struct nfs_write_data *wdata); -extern void nfs_commitdata_release(void *wdata); #else static inline int nfs_commit_inode(struct inode *inode, int how) @@ -507,6 +505,7 @@ nfs_have_writebacks(struct inode *inode) * Allocate nfs_write_data structures */ extern struct nfs_write_data *nfs_writedata_alloc(unsigned int npages); +extern void nfs_writedata_free(struct nfs_write_data *); /* * linux/fs/nfs/read.c @@ -515,7 +514,6 @@ extern int nfs_readpage(struct file *, struct page *); extern int nfs_readpages(struct file *, struct address_space *, struct list_head *, unsigned); extern int nfs_readpage_result(struct rpc_task *, struct nfs_read_data *); -extern void nfs_readdata_release(void *data); extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, struct page *); @@ -523,6 +521,7 @@ extern int nfs_readpage_async(struct nfs_open_context *, struct inode *, * Allocate nfs_read_data structures */ extern struct nfs_read_data *nfs_readdata_alloc(unsigned int npages); +extern void nfs_readdata_free(struct nfs_read_data *); /* * linux/fs/nfs3proc.c -- cgit v1.2.3 From d7e623da1a757fbd8c117fa29190ca8bef14dab3 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 11 Aug 2009 11:20:11 +0100 Subject: GFS2: Fix permissions on "recover" file Although this file is only ever written and not read by userspace, it seems that the utils are opening this file O_RDWR, so we need to allow that. Also fixes the whitespace which seemed to be broken. Signed-off-by: Steven Whitehouse Cc: David Teigland --- fs/gfs2/sys.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 23419dc3027b..a7cbfbd340c7 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -386,16 +386,16 @@ static ssize_t jid_show(struct gfs2_sbd *sdp, char *buf) #define GDLM_ATTR(_name,_mode,_show,_store) \ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) -GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); -GDLM_ATTR(block, 0644, block_show, block_store); -GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); -GDLM_ATTR(id, 0444, lkid_show, NULL); -GDLM_ATTR(jid, 0444, jid_show, NULL); -GDLM_ATTR(first, 0444, lkfirst_show, NULL); -GDLM_ATTR(first_done, 0444, first_done_show, NULL); -GDLM_ATTR(recover, 0200, NULL, recover_store); -GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); -GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); +GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); +GDLM_ATTR(block, 0644, block_show, block_store); +GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GDLM_ATTR(id, 0444, lkid_show, NULL); +GDLM_ATTR(jid, 0444, jid_show, NULL); +GDLM_ATTR(first, 0444, lkfirst_show, NULL); +GDLM_ATTR(first_done, 0444, first_done_show, NULL); +GDLM_ATTR(recover, 0600, NULL, recover_store); +GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); +GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); static struct attribute *lock_module_attrs[] = { &gdlm_attr_proto_name.attr, -- cgit v1.2.3 From b2add73dbf93fd50f00564d7abc3e2b9aa9dd20c Mon Sep 17 00:00:00 2001 From: Guillaume Knispel Date: Sat, 15 Aug 2009 19:30:24 +0200 Subject: poll/select: initialize triggered field of struct poll_wqueues The triggered field of struct poll_wqueues introduced in commit 5f820f648c92a5ecc771a96b3c29aa6e90013bba ("poll: allow f_op->poll to sleep"). It was first set to 1 in pollwake() (now __pollwake() ), tested and later set to 0 in poll_schedule_timeout(), but not initialized before. As a result when the process needs to sleep, triggered was likely to be non-zero even if pollwake() is not called before the first poll_schedule_timeout(), meaning schedule_hrtimeout_range() would not be called and an extra loop calling all ->poll() would be done. This patch initialize triggered to 0 in poll_initwait() so the ->poll() are not called twice before the process goes to sleep when it needs to. Signed-off-by: Guillaume Knispel Acked-by: Thomas Gleixner Acked-by: Tejun Heo Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/select.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/select.c b/fs/select.c index d870237e42c7..8084834e123e 100644 --- a/fs/select.c +++ b/fs/select.c @@ -110,6 +110,7 @@ void poll_initwait(struct poll_wqueues *pwq) { init_poll_funcptr(&pwq->pt, __pollwait); pwq->polling_task = current; + pwq->triggered = 0; pwq->error = 0; pwq->table = NULL; pwq->inline_index = 0; -- cgit v1.2.3 From bc990f5cb424cdca9dda866785d088e2c2110ecc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Sun, 16 Aug 2009 20:36:34 -0400 Subject: xfs: fix locking in xfs_iget_cache_hit The locking in xfs_iget_cache_hit currently has numerous problems: - we clear the reclaim tag without i_flags_lock which protects modifications to it - we call inode_init_always which can sleep with pag_ici_lock held (this is oss.sgi.com BZ #819) - we acquire and drop i_flags_lock a lot and thus provide no consistency between the various flags we set/clear under it This patch fixes all that with a major revamp of the locking in the function. The new version acquires i_flags_lock early and only drops it once we need to call into inode_init_always or before calling xfs_ilock. This patch fixes a bug seen in the wild where we race modifying the reclaim tag. Signed-off-by: Christoph Hellwig Reviewed-by: Felix Blyakher Reviewed-by: Eric Sandeen Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_sync.c | 13 ++++- fs/xfs/linux-2.6/xfs_sync.h | 1 + fs/xfs/xfs_iget.c | 113 +++++++++++++++++++++++--------------------- 3 files changed, 70 insertions(+), 57 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_sync.c b/fs/xfs/linux-2.6/xfs_sync.c index b619d6b8ca43..98ef624d9baf 100644 --- a/fs/xfs/linux-2.6/xfs_sync.c +++ b/fs/xfs/linux-2.6/xfs_sync.c @@ -708,6 +708,16 @@ xfs_reclaim_inode( return 0; } +void +__xfs_inode_set_reclaim_tag( + struct xfs_perag *pag, + struct xfs_inode *ip) +{ + radix_tree_tag_set(&pag->pag_ici_root, + XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), + XFS_ICI_RECLAIM_TAG); +} + /* * We set the inode flag atomically with the radix tree tag. * Once we get tag lookups on the radix tree, this inode flag @@ -722,8 +732,7 @@ xfs_inode_set_reclaim_tag( read_lock(&pag->pag_ici_lock); spin_lock(&ip->i_flags_lock); - radix_tree_tag_set(&pag->pag_ici_root, - XFS_INO_TO_AGINO(mp, ip->i_ino), XFS_ICI_RECLAIM_TAG); + __xfs_inode_set_reclaim_tag(pag, ip); __xfs_iflags_set(ip, XFS_IRECLAIMABLE); spin_unlock(&ip->i_flags_lock); read_unlock(&pag->pag_ici_lock); diff --git a/fs/xfs/linux-2.6/xfs_sync.h b/fs/xfs/linux-2.6/xfs_sync.h index 2a10301c99c7..59120602588a 100644 --- a/fs/xfs/linux-2.6/xfs_sync.h +++ b/fs/xfs/linux-2.6/xfs_sync.h @@ -48,6 +48,7 @@ int xfs_reclaim_inode(struct xfs_inode *ip, int locked, int sync_mode); int xfs_reclaim_inodes(struct xfs_mount *mp, int mode); void xfs_inode_set_reclaim_tag(struct xfs_inode *ip); +void __xfs_inode_set_reclaim_tag(struct xfs_perag *pag, struct xfs_inode *ip); void xfs_inode_clear_reclaim_tag(struct xfs_inode *ip); void __xfs_inode_clear_reclaim_tag(struct xfs_mount *mp, struct xfs_perag *pag, struct xfs_inode *ip); diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 34ec86923f7e..ecbf8b4d2e2e 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -191,80 +191,82 @@ xfs_iget_cache_hit( int flags, int lock_flags) __releases(pag->pag_ici_lock) { + struct inode *inode = VFS_I(ip); struct xfs_mount *mp = ip->i_mount; - int error = EAGAIN; + int error; + + spin_lock(&ip->i_flags_lock); /* - * If INEW is set this inode is being set up - * If IRECLAIM is set this inode is being torn down - * Pause and try again. + * If we are racing with another cache hit that is currently + * instantiating this inode or currently recycling it out of + * reclaimabe state, wait for the initialisation to complete + * before continuing. + * + * XXX(hch): eventually we should do something equivalent to + * wait_on_inode to wait for these flags to be cleared + * instead of polling for it. */ - if (xfs_iflags_test(ip, (XFS_INEW|XFS_IRECLAIM))) { + if (ip->i_flags & (XFS_INEW|XFS_IRECLAIM)) { XFS_STATS_INC(xs_ig_frecycle); + error = EAGAIN; goto out_error; } - /* If IRECLAIMABLE is set, we've torn down the vfs inode part */ - if (xfs_iflags_test(ip, XFS_IRECLAIMABLE)) { - - /* - * If lookup is racing with unlink, then we should return an - * error immediately so we don't remove it from the reclaim - * list and potentially leak the inode. - */ - if ((ip->i_d.di_mode == 0) && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - goto out_error; - } + /* + * If lookup is racing with unlink return an error immediately. + */ + if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { + error = ENOENT; + goto out_error; + } + /* + * If IRECLAIMABLE is set, we've torn down the VFS inode already. + * Need to carefully get it back into useable state. + */ + if (ip->i_flags & XFS_IRECLAIMABLE) { xfs_itrace_exit_tag(ip, "xfs_iget.alloc"); /* - * We need to re-initialise the VFS inode as it has been - * 'freed' by the VFS. Do this here so we can deal with - * errors cleanly, then tag it so it can be set up correctly - * later. + * We need to set XFS_INEW atomically with clearing the + * reclaimable tag so that we do have an indicator of the + * inode still being initialized. */ - if (inode_init_always(mp->m_super, VFS_I(ip))) { - error = ENOMEM; - goto out_error; - } + ip->i_flags |= XFS_INEW; + ip->i_flags &= ~XFS_IRECLAIMABLE; + __xfs_inode_clear_reclaim_tag(mp, pag, ip); - /* - * We must set the XFS_INEW flag before clearing the - * XFS_IRECLAIMABLE flag so that if a racing lookup does - * not find the XFS_IRECLAIMABLE above but has the igrab() - * below succeed we can safely check XFS_INEW to detect - * that this inode is still being initialised. - */ - xfs_iflags_set(ip, XFS_INEW); - xfs_iflags_clear(ip, XFS_IRECLAIMABLE); + spin_unlock(&ip->i_flags_lock); + read_unlock(&pag->pag_ici_lock); - /* clear the radix tree reclaim flag as well. */ - __xfs_inode_clear_reclaim_tag(mp, pag, ip); - } else if (!igrab(VFS_I(ip))) { + error = -inode_init_always(mp->m_super, inode); + if (error) { + /* + * Re-initializing the inode failed, and we are in deep + * trouble. Try to re-add it to the reclaim list. + */ + read_lock(&pag->pag_ici_lock); + spin_lock(&ip->i_flags_lock); + + ip->i_flags &= ~XFS_INEW; + ip->i_flags |= XFS_IRECLAIMABLE; + __xfs_inode_set_reclaim_tag(pag, ip); + goto out_error; + } + inode->i_state = I_LOCK|I_NEW; + } else { /* If the VFS inode is being torn down, pause and try again. */ - XFS_STATS_INC(xs_ig_frecycle); - goto out_error; - } else if (xfs_iflags_test(ip, XFS_INEW)) { - /* - * We are racing with another cache hit that is - * currently recycling this inode out of the XFS_IRECLAIMABLE - * state. Wait for the initialisation to complete before - * continuing. - */ - wait_on_inode(VFS_I(ip)); - } + if (!igrab(inode)) { + error = EAGAIN; + goto out_error; + } - if (ip->i_d.di_mode == 0 && !(flags & XFS_IGET_CREATE)) { - error = ENOENT; - iput(VFS_I(ip)); - goto out_error; + /* We've got a live one. */ + spin_unlock(&ip->i_flags_lock); + read_unlock(&pag->pag_ici_lock); } - /* We've got a live one. */ - read_unlock(&pag->pag_ici_lock); - if (lock_flags != 0) xfs_ilock(ip, lock_flags); @@ -274,6 +276,7 @@ xfs_iget_cache_hit( return 0; out_error: + spin_unlock(&ip->i_flags_lock); read_unlock(&pag->pag_ici_lock); return error; } -- cgit v1.2.3 From ada508274b8698a33cb0e5bd037db0f9dc781795 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 3 Aug 2009 18:24:21 +0200 Subject: ocfs2: Handle quota file corruption more gracefully ocfs2_read_virt_blocks() does BUG when we try to read a block from a file beyond its end. Since this can happen due to filesystem corruption, it is not really an appropriate answer. Make ocfs2_read_quota_block() check the condition and handle it by calling ocfs2_error() and returning EIO. [ Modified to print ip_blkno in the error - Joel ] Reported-by: Tristan Ye Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/quota_global.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index bf7742d0ee3b..44f2a5e1d042 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -23,6 +23,7 @@ #include "sysfile.h" #include "dlmglue.h" #include "uptodate.h" +#include "super.h" #include "quota.h" static struct workqueue_struct *ocfs2_quota_wq = NULL; @@ -114,6 +115,15 @@ int ocfs2_read_quota_block(struct inode *inode, u64 v_block, int rc = 0; struct buffer_head *tmp = *bh; + if (i_size_read(inode) >> inode->i_sb->s_blocksize_bits <= v_block) { + ocfs2_error(inode->i_sb, + "Quota file %llu is probably corrupted! Requested " + "to read block %Lu but file has size only %Lu\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + (unsigned long long)v_block, + (unsigned long long)i_size_read(inode)); + return -EIO; + } rc = ocfs2_read_virt_blocks(inode, v_block, 1, &tmp, 0, ocfs2_validate_quota_block); if (rc) -- cgit v1.2.3 From 60e2ec48665b8495360ca4a6004c5cd52beb2bc1 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 12 Aug 2009 14:42:47 +0800 Subject: ocfs2: release the buffer head in ocfs2_do_truncate. In ocfs2_do_truncate, we forget to release last_eb_bh which will cause memleak. So call brelse in the end. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index f9a3e8942669..ab513ddaeff2 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -6851,7 +6851,7 @@ static int ocfs2_do_truncate(struct ocfs2_super *osb, } status = 0; bail: - + brelse(last_eb_bh); mlog_exit(status); return status; } -- cgit v1.2.3 From eef3a116be11d35396efb2a8cc7345fd3221e294 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Sun, 16 Aug 2009 21:51:44 -0400 Subject: notify: unused event private race inotify decides if private data it passed to get added to an event was used by checking list_empty(). But it's possible that the event may have been dequeued and the private event removed so it would look empty. The fix is to use the return code from fsnotify_add_notify_event rather than looking at the list. Signed-off-by: Eric Paris Signed-off-by: Linus Torvalds --- fs/notify/inotify/inotify_fsnotify.c | 13 +++++++------ fs/notify/inotify/inotify_user.c | 7 +++---- fs/notify/notification.c | 7 +++---- 3 files changed, 13 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 47cd258fd24d..5dcbafe72d71 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -62,13 +62,14 @@ static int inotify_handle_event(struct fsnotify_group *group, struct fsnotify_ev event_priv->wd = wd; ret = fsnotify_add_notify_event(group, event, fsn_event_priv); - /* EEXIST is not an error */ - if (ret == -EEXIST) - ret = 0; - - /* did event_priv get attached? */ - if (list_empty(&fsn_event_priv->event_list)) + if (ret) { inotify_free_event_priv(fsn_event_priv); + /* EEXIST says we tail matched, EOVERFLOW isn't something + * to report up the stack. */ + if ((ret == -EEXIST) || + (ret == -EOVERFLOW)) + ret = 0; + } /* * If we hold the entry until after the event is on the queue diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index f30d9bbc2e1b..c172a7a17b17 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -386,6 +386,7 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, struct fsnotify_event *ignored_event; struct inotify_event_private_data *event_priv; struct fsnotify_event_private_data *fsn_event_priv; + int ret; ignored_event = fsnotify_create_event(NULL, FS_IN_IGNORED, NULL, FSNOTIFY_EVENT_NONE, NULL, 0, @@ -404,10 +405,8 @@ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, fsn_event_priv->group = group; event_priv->wd = ientry->wd; - fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); - - /* did the private data get added? */ - if (list_empty(&fsn_event_priv->event_list)) + ret = fsnotify_add_notify_event(group, ignored_event, fsn_event_priv); + if (ret) inotify_free_event_priv(fsn_event_priv); skip_send_ignore: diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 521368574e97..74b3cf30bc6b 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -171,9 +171,7 @@ int fsnotify_add_notify_event(struct fsnotify_group *group, struct fsnotify_even struct list_head *list = &group->notification_list; struct fsnotify_event_holder *last_holder; struct fsnotify_event *last_event; - - /* easy to tell if priv was attached to the event */ - INIT_LIST_HEAD(&priv->event_list); + int ret = 0; /* * There is one fsnotify_event_holder embedded inside each fsnotify_event. @@ -194,6 +192,7 @@ alloc_holder: if (group->q_len >= group->max_events) { event = &q_overflow_event; + ret = -EOVERFLOW; /* sorry, no private data on the overflow event */ priv = NULL; } @@ -235,7 +234,7 @@ alloc_holder: mutex_unlock(&group->notification_mutex); wake_up(&group->notification_waitq); - return 0; + return ret; } /* -- cgit v1.2.3 From cd94c8bbef8d4b796a7ed4c551355a334604fd36 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Sun, 16 Aug 2009 21:51:49 -0400 Subject: inotify: tail drop inotify q_overflow events In f44aebcc the tail drop logic of events with no file backing (q_overflow and in_ignored) was reversed so IN_IGNORED events would never be tail dropped. This now means that Q_OVERFLOW events are NOT tail dropped. The fix is to not tail drop IN_IGNORED, but to tail drop Q_OVERFLOW. Signed-off-by: Eric Paris Signed-off-by: Linus Torvalds --- fs/notify/notification.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/notify/notification.c b/fs/notify/notification.c index 74b3cf30bc6b..3816d5750dd5 100644 --- a/fs/notify/notification.c +++ b/fs/notify/notification.c @@ -153,6 +153,10 @@ static bool event_compare(struct fsnotify_event *old, struct fsnotify_event *new return true; break; case (FSNOTIFY_EVENT_NONE): + if (old->mask & FS_Q_OVERFLOW) + return true; + else if (old->mask & FS_IN_IGNORED) + return false; return false; }; } -- cgit v1.2.3 From 08e53fcb0db34baca3db84a457b6d67faabee4c6 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Sun, 16 Aug 2009 21:51:55 -0400 Subject: inotify: start watch descriptor count at 1 The inotify_add_watch man page specifies that inotify_add_watch() will return a non-negative integer. However, historically the inotify watches started at 1, not at 0. Turns out that the inotifywait program provided by the inotify-tools package doesn't properly handle a 0 watch descriptor. In 7e790dd5 we changed from starting at 1 to starting at 0. This patch starts at 1, just like in previous kernels, but also just like in previous kernels it's possible for it to wrap back to 0. This preserves the kernel functionality exactly like it was before the patch (neither method broke the spec) Signed-off-by: Eric Paris Signed-off-by: Linus Torvalds --- fs/notify/inotify/inotify_user.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index c172a7a17b17..dc32ed8323ba 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -567,7 +567,7 @@ static struct fsnotify_group *inotify_new_group(struct user_struct *user, unsign spin_lock_init(&group->inotify_data.idr_lock); idr_init(&group->inotify_data.idr); - group->inotify_data.last_wd = 0; + group->inotify_data.last_wd = 1; group->inotify_data.user = user; group->inotify_data.fa = NULL; -- cgit v1.2.3 From 50fb6d2bd7062708892ae7147f30c3ee905b7a3d Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Sun, 19 Jul 2009 13:41:57 -0600 Subject: 9p: Check for error in return value of v9fs_fid_add Check if v9fs_fid_add was successful or not based on its return value. Signed-off-by: Abhishek Kulkarni Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 81f8bbf12f9f..1fa5f15eaddc 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -470,7 +470,10 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, dentry->d_op = &v9fs_dentry_operations; d_instantiate(dentry, inode); - v9fs_fid_add(dentry, fid); + err = v9fs_fid_add(dentry, fid); + if (err < 0) + goto error; + return ofid; error: -- cgit v1.2.3 From 2bb541157fe2602af7b9952096d0524f6f9c1e73 Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Sun, 19 Jul 2009 13:41:56 -0600 Subject: 9p: Fix possible inode leak in v9fs_get_inode. Add a missing iput when cleaning up if v9fs_get_inode fails after returning a valid inode. Signed-off-by: Abhishek Kulkarni Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 105 +++++++++++++++++++++++++++++------------------------- 1 file changed, 56 insertions(+), 49 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 1fa5f15eaddc..0c8af1abf603 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -207,65 +207,72 @@ v9fs_blank_wstat(struct p9_wstat *wstat) struct inode *v9fs_get_inode(struct super_block *sb, int mode) { + int err; struct inode *inode; struct v9fs_session_info *v9ses = sb->s_fs_info; P9_DPRINTK(P9_DEBUG_VFS, "super block: %p mode: %o\n", sb, mode); inode = new_inode(sb); - if (inode) { - inode->i_mode = mode; - inode->i_uid = current_fsuid(); - inode->i_gid = current_fsgid(); - inode->i_blocks = 0; - inode->i_rdev = 0; - inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; - inode->i_mapping->a_ops = &v9fs_addr_operations; - - switch (mode & S_IFMT) { - case S_IFIFO: - case S_IFBLK: - case S_IFCHR: - case S_IFSOCK: - if (!v9fs_extended(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, - "special files without extended mode\n"); - return ERR_PTR(-EINVAL); - } - init_special_inode(inode, inode->i_mode, - inode->i_rdev); - break; - case S_IFREG: - inode->i_op = &v9fs_file_inode_operations; - inode->i_fop = &v9fs_file_operations; - break; - case S_IFLNK: - if (!v9fs_extended(v9ses)) { - P9_DPRINTK(P9_DEBUG_ERROR, - "extended modes used w/o 9P2000.u\n"); - return ERR_PTR(-EINVAL); - } - inode->i_op = &v9fs_symlink_inode_operations; - break; - case S_IFDIR: - inc_nlink(inode); - if (v9fs_extended(v9ses)) - inode->i_op = &v9fs_dir_inode_operations_ext; - else - inode->i_op = &v9fs_dir_inode_operations; - inode->i_fop = &v9fs_dir_operations; - break; - default: + if (!inode) { + P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); + return -ENOMEM; + } + + inode->i_mode = mode; + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_blocks = 0; + inode->i_rdev = 0; + inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mapping->a_ops = &v9fs_addr_operations; + + switch (mode & S_IFMT) { + case S_IFIFO: + case S_IFBLK: + case S_IFCHR: + case S_IFSOCK: + if (!v9fs_extended(v9ses)) { P9_DPRINTK(P9_DEBUG_ERROR, - "BAD mode 0x%x S_IFMT 0x%x\n", - mode, mode & S_IFMT); - return ERR_PTR(-EINVAL); + "special files without extended mode\n"); + err = -EINVAL; + goto error; } - } else { - P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); - return ERR_PTR(-ENOMEM); + init_special_inode(inode, inode->i_mode, inode->i_rdev); + break; + case S_IFREG: + inode->i_op = &v9fs_file_inode_operations; + inode->i_fop = &v9fs_file_operations; + break; + case S_IFLNK: + if (!v9fs_extended(v9ses)) { + P9_DPRINTK(P9_DEBUG_ERROR, + "extended modes used w/o 9P2000.u\n"); + err = -EINVAL; + goto error; + } + inode->i_op = &v9fs_symlink_inode_operations; + break; + case S_IFDIR: + inc_nlink(inode); + if (v9fs_extended(v9ses)) + inode->i_op = &v9fs_dir_inode_operations_ext; + else + inode->i_op = &v9fs_dir_inode_operations; + inode->i_fop = &v9fs_dir_operations; + break; + default: + P9_DPRINTK(P9_DEBUG_ERROR, "BAD mode 0x%x S_IFMT 0x%x\n", + mode, mode & S_IFMT); + err = -EINVAL; + goto error; } + return inode; + +error: + iput(inode); + return ERR_PTR(err); } /* -- cgit v1.2.3 From 0e15597ebfe00e28857185f46aba00f400480ffe Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Sun, 19 Jul 2009 13:41:55 -0600 Subject: 9p: minor comment fixes Fix the comments -- mostly the improper and/or missing descriptions of function parameters. Signed-off-by: Abhishek Kulkarni Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 3 +-- net/9p/client.c | 14 +++++++------- net/9p/trans_fd.c | 8 ++++---- net/9p/trans_rdma.c | 9 +++++---- net/9p/trans_virtio.c | 11 ++++------- 5 files changed, 21 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 0c8af1abf603..f22668afd0d6 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -171,7 +171,6 @@ int v9fs_uflags2omode(int uflags, int extended) /** * v9fs_blank_wstat - helper function to setup a 9P stat structure - * @v9ses: 9P session info (for determining extended mode) * @wstat: structure to initialize * */ @@ -410,9 +409,9 @@ v9fs_open_created(struct inode *inode, struct file *file) * @v9ses: session information * @dir: directory that dentry is being created in * @dentry: dentry that is being created + * @extension: 9p2000.u extension string to support devices, etc. * @perm: create permissions * @mode: open mode - * @extension: 9p2000.u extension string to support devices, etc. * */ static struct p9_fid * diff --git a/net/9p/client.c b/net/9p/client.c index 787ccddb85ea..7bbd2d5ae8d3 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -60,9 +60,9 @@ static struct p9_req_t * p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...); /** - * v9fs_parse_options - parse mount options into session structure - * @options: options string passed from mount - * @v9ses: existing v9fs session information + * parse_options - parse mount options into client structure + * @opts: options string passed from mount + * @clnt: existing v9fs client information * * Return 0 upon success, -ERRNO upon failure */ @@ -232,7 +232,7 @@ EXPORT_SYMBOL(p9_tag_lookup); /** * p9_tag_init - setup tags structure and contents - * @tags: tags structure from the client struct + * @c: v9fs client struct * * This initializes the tags structure for each client instance. * @@ -258,7 +258,7 @@ error: /** * p9_tag_cleanup - cleans up tags structure and reclaims resources - * @tags: tags structure from the client struct + * @c: v9fs client struct * * This frees resources associated with the tags structure * @@ -430,8 +430,8 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) /** * p9_client_flush - flush (cancel) a request - * c: client state - * req: request to cancel + * @c: client state + * @oldreq: request to cancel * * This sents a flush for a particular requests and links * the flush request to the original request. The current diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index 8c2588e4edc0..8d934dd7fd54 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -119,8 +119,8 @@ struct p9_poll_wait { * @wpos: write position for current frame * @wsize: amount of data to write for current frame * @wbuf: current write buffer + * @poll_pending_link: pending links to be polled per conn * @poll_wait: array of wait_q's for various worker threads - * @poll_waddr: ???? * @pt: poll state * @rq: current read work * @wq: current write work @@ -700,9 +700,9 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) } /** - * parse_options - parse mount options into session structure - * @options: options string passed from mount - * @opts: transport-specific structure to parse options into + * parse_opts - parse mount options into p9_fd_opts structure + * @params: options string passed from mount + * @opts: fd transport-specific structure to parse options into * * Returns 0 upon success, -ERRNO upon failure */ diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index ac4990041ebb..65cb29db03f8 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -67,14 +67,15 @@ * @pd: Protection Domain pointer * @qp: Queue Pair pointer * @cq: Completion Queue pointer + * @dm_mr: DMA Memory Region pointer * @lkey: The local access only memory region key * @timeout: Number of uSecs to wait for connection management events * @sq_depth: The depth of the Send Queue * @sq_sem: Semaphore for the SQ * @rq_depth: The depth of the Receive Queue. + * @rq_count: Count of requests in the Receive Queue. * @addr: The remote peer's address * @req_lock: Protects the active request list - * @send_wait: Wait list when the SQ fills up * @cm_done: Completion event for connection management tracking */ struct p9_trans_rdma { @@ -154,9 +155,9 @@ static match_table_t tokens = { }; /** - * parse_options - parse mount options into session structure - * @options: options string passed from mount - * @opts: transport-specific structure to parse options into + * parse_opts - parse mount options into rdma options structure + * @params: options string passed from mount + * @opts: rdma transport-specific structure to parse options into * * Returns 0 upon success, -ERRNO upon failure */ diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index a49484e67e1d..9bf0b737aa51 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -57,11 +57,9 @@ static int chan_index; * @initialized: whether the channel is initialized * @inuse: whether the channel is in use * @lock: protects multiple elements within this structure + * @client: client instance * @vdev: virtio dev associated with this channel * @vq: virtio queue associated with this channel - * @tagpool: accounting for tag ids (and request slots) - * @reqs: array of request slots - * @max_tag: current number of request_slots allocated * @sg: scatter gather list which is used to pack a request (protected?) * * We keep all per-channel information in a structure. @@ -92,7 +90,7 @@ static unsigned int rest_of_page(void *data) /** * p9_virtio_close - reclaim resources of a channel - * @trans: transport state + * @client: client instance * * This reclaims a channel by freeing its resources and * reseting its inuse flag. @@ -181,9 +179,8 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req) /** * p9_virtio_request - issue a request - * @t: transport state - * @tc: &p9_fcall request to transmit - * @rc: &p9_fcall to put reponse into + * @client: client instance issuing the request + * @req: request to be issued * */ -- cgit v1.2.3 From 02bc35672b2fdf251e264adca5407792f63191e4 Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Sun, 19 Jul 2009 13:41:54 -0600 Subject: 9p: Fix possible memleak in v9fs_inode_from fid. Add missing p9stat_free in v9fs_inode_from_fid to avoid any possible leaks. Signed-off-by: Abhishek Kulkarni Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index f22668afd0d6..fac30d21851f 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -344,30 +344,25 @@ v9fs_inode_from_fid(struct v9fs_session_info *v9ses, struct p9_fid *fid, ret = NULL; st = p9_client_stat(fid); - if (IS_ERR(st)) { - err = PTR_ERR(st); - st = NULL; - goto error; - } + if (IS_ERR(st)) + return ERR_CAST(st); umode = p9mode2unixmode(v9ses, st->mode); ret = v9fs_get_inode(sb, umode); if (IS_ERR(ret)) { err = PTR_ERR(ret); - ret = NULL; goto error; } v9fs_stat2inode(st, ret, sb); ret->i_ino = v9fs_qid2ino(&st->qid); + p9stat_free(st); kfree(st); return ret; error: + p9stat_free(st); kfree(st); - if (ret) - iput(ret); - return ERR_PTR(err); } -- cgit v1.2.3 From 4f4038328da5eb9cc237b51d3fe68138fd3fea14 Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Sun, 19 Jul 2009 13:41:53 -0600 Subject: 9p: Fix v9fs show_options Add the delimiter ',' before the options when they are passed and check if no option parameters are passed to prevent displaying NULL in /proc/mounts. Signed-off-by: Abhishek Kulkarni Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 38d695d66a0b..a9d7d08cfbe8 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -220,8 +220,8 @@ static void v9fs_kill_super(struct super_block *s) static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt) { struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info; - - seq_printf(m, "%s", v9ses->options); + if (v9ses->options != NULL) + seq_printf(m, ",%s", v9ses->options); return 0; } -- cgit v1.2.3 From 1b5ab3e86712b6be38ebbe0d821387c1d8f91d7c Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Sun, 19 Jul 2009 13:41:52 -0600 Subject: 9p: Fix possible regressions when ->get_sb fails. ->get_sb can fail causing some badness. this patch fixes * clear sb->fs_s_info in kill_sb. * deactivate_locked_super() calls kill_sb (v9fs_kill_super) which closes the destroys the client, clunks all its fids and closes the v9fs session. Attempting to do it twice will cause an oops. Signed-off-by: Abhishek Kulkarni Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_super.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index a9d7d08cfbe8..2495af4ad9a6 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -120,7 +120,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, P9_DPRINTK(P9_DEBUG_VFS, " \n"); - st = NULL; v9ses = kzalloc(sizeof(struct v9fs_session_info), GFP_KERNEL); if (!v9ses) return -ENOMEM; @@ -173,10 +172,8 @@ P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n"); simple_set_mnt(mnt, sb); return 0; -release_sb: - deactivate_locked_super(sb); - free_stat: + p9stat_free(st); kfree(st); clunk_fid: @@ -185,7 +182,12 @@ clunk_fid: close_session: v9fs_session_close(v9ses); kfree(v9ses); + return retval; +release_sb: + p9stat_free(st); + kfree(st); + deactivate_locked_super(sb); return retval; } @@ -207,6 +209,7 @@ static void v9fs_kill_super(struct super_block *s) v9fs_session_close(v9ses); kfree(v9ses); + s->s_fs_info = NULL; P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n"); } -- cgit v1.2.3 From 4d3297ca5bf37ff5956f76fb352e009880aad62d Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Sun, 19 Jul 2009 13:41:51 -0600 Subject: 9p: Remove redundant inode uid/gid assignment Remove a redundant update of inode's i_uid and i_gid after v9fs_get_inode() since the latter already sets up a new inode and sets the proper uid and gid values. Signed-off-by: Abhishek Kulkarni Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_super.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 2495af4ad9a6..072dce094477 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -113,8 +113,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, struct v9fs_session_info *v9ses = NULL; struct p9_wstat *st = NULL; int mode = S_IRWXUGO | S_ISVTX; - uid_t uid = current_fsuid(); - gid_t gid = current_fsgid(); struct p9_fid *fid; int retval = 0; @@ -149,9 +147,6 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, goto release_sb; } - inode->i_uid = uid; - inode->i_gid = gid; - root = d_alloc_root(inode); if (!root) { iput(inode); -- cgit v1.2.3 From 5fd131893793567c361ae64cbeb28a2a753bbe35 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 30 Jul 2009 17:01:53 +0200 Subject: ocfs2: Don't oops in ocfs2_kill_sb on a failed mount If we fail to mount the filesystem, we have to be careful not to dereference uninitialized structures in ocfs2_kill_sb. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/super.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index b0ee0fdf799a..a3f8871d21fd 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1218,13 +1218,17 @@ static void ocfs2_kill_sb(struct super_block *sb) { struct ocfs2_super *osb = OCFS2_SB(sb); + /* Failed mount? */ + if (!osb || atomic_read(&osb->vol_state) == VOLUME_DISABLED) + goto out; + /* Prevent further queueing of inode drop events */ spin_lock(&dentry_list_lock); ocfs2_set_osb_flag(osb, OCFS2_OSB_DROP_DENTRY_LOCK_IMMED); spin_unlock(&dentry_list_lock); /* Wait for work to finish and/or remove it */ cancel_work_sync(&osb->dentry_lock_work); - +out: kill_block_super(sb); } -- cgit v1.2.3 From 48559b4c30708ebdc849483da9fb83ee08c6c908 Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Mon, 17 Aug 2009 16:32:18 -0500 Subject: 9p: Add missing cast for the error return value in v9fs_get_inode Cast the error return value (ENOMEM) in v9fs_get_inode() to its correct type using ERR_PTR. Signed-off-by: Abhishek Kulkarni Signed-off-by: Eric Van Hensbergen --- fs/9p/vfs_inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index fac30d21851f..06a223d50a81 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -215,7 +215,7 @@ struct inode *v9fs_get_inode(struct super_block *sb, int mode) inode = new_inode(sb); if (!inode) { P9_EPRINTK(KERN_WARNING, "Problem allocating inode\n"); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } inode->i_mode = mode; -- cgit v1.2.3 From 4b53e4b500779230aedd5355940aeaaed0b5353b Mon Sep 17 00:00:00 2001 From: Abhishek Kulkarni Date: Mon, 17 Aug 2009 16:42:28 -0500 Subject: 9p: remove unnecessary v9fses->options which duplicates the mount string The mount options string is saved in sb->s_options. This patch removes the redundant duplicating of the mount options. Also, since we are not displaying anything special in show options, we replace v9fs_show_options with generic_show_options for now. Signed-off-by: Abhishek Kulkarni Signed-off-by: Eric Van Hensbergen --- fs/9p/v9fs.c | 21 +++++---------------- fs/9p/v9fs.h | 1 - fs/9p/vfs_super.c | 23 +++++------------------ 3 files changed, 10 insertions(+), 35 deletions(-) (limited to 'fs') diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 332b5ff02fec..f7003cfac63d 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -76,7 +76,7 @@ static const match_table_t tokens = { * Return 0 upon success, -ERRNO upon failure. */ -static int v9fs_parse_options(struct v9fs_session_info *v9ses) +static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) { char *options; substring_t args[MAX_OPT_ARGS]; @@ -90,10 +90,10 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses) v9ses->debug = 0; v9ses->cache = 0; - if (!v9ses->options) + if (!opts) return 0; - options = kstrdup(v9ses->options, GFP_KERNEL); + options = kstrdup(opts, GFP_KERNEL); if (!options) { P9_DPRINTK(P9_DEBUG_ERROR, "failed to allocate copy of option string\n"); @@ -206,24 +206,14 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->uid = ~0; v9ses->dfltuid = V9FS_DEFUID; v9ses->dfltgid = V9FS_DEFGID; - if (data) { - v9ses->options = kstrdup(data, GFP_KERNEL); - if (!v9ses->options) { - P9_DPRINTK(P9_DEBUG_ERROR, - "failed to allocate copy of option string\n"); - retval = -ENOMEM; - goto error; - } - } - rc = v9fs_parse_options(v9ses); + rc = v9fs_parse_options(v9ses, data); if (rc < 0) { retval = rc; goto error; } - v9ses->clnt = p9_client_create(dev_name, v9ses->options); - + v9ses->clnt = p9_client_create(dev_name, data); if (IS_ERR(v9ses->clnt)) { retval = PTR_ERR(v9ses->clnt); v9ses->clnt = NULL; @@ -280,7 +270,6 @@ void v9fs_session_close(struct v9fs_session_info *v9ses) __putname(v9ses->uname); __putname(v9ses->aname); - kfree(v9ses->options); } /** diff --git a/fs/9p/v9fs.h b/fs/9p/v9fs.h index a7d567192998..38762bf102a9 100644 --- a/fs/9p/v9fs.h +++ b/fs/9p/v9fs.h @@ -85,7 +85,6 @@ struct v9fs_session_info { unsigned int afid; unsigned int cache; - char *options; /* copy of mount options */ char *uname; /* user name to mount as */ char *aname; /* name of remote hierarchy being mounted */ unsigned int maxdata; /* max data for client interface */ diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 072dce094477..8961f1a8f668 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -81,7 +81,7 @@ static int v9fs_set_super(struct super_block *s, void *data) static void v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, - int flags) + int flags, void *data) { sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize_bits = fls(v9ses->maxdata - 1); @@ -91,6 +91,8 @@ v9fs_fill_super(struct super_block *sb, struct v9fs_session_info *v9ses, sb->s_flags = flags | MS_ACTIVE | MS_SYNCHRONOUS | MS_DIRSYNC | MS_NOATIME; + + save_mount_options(sb, data); } /** @@ -139,7 +141,7 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags, retval = PTR_ERR(sb); goto free_stat; } - v9fs_fill_super(sb, v9ses, flags); + v9fs_fill_super(sb, v9ses, flags, data); inode = v9fs_get_inode(sb, S_IFDIR | mode); if (IS_ERR(inode)) { @@ -208,21 +210,6 @@ static void v9fs_kill_super(struct super_block *s) P9_DPRINTK(P9_DEBUG_VFS, "exiting kill_super\n"); } -/** - * v9fs_show_options - Show mount options in /proc/mounts - * @m: seq_file to write to - * @mnt: mount descriptor - * - */ - -static int v9fs_show_options(struct seq_file *m, struct vfsmount *mnt) -{ - struct v9fs_session_info *v9ses = mnt->mnt_sb->s_fs_info; - if (v9ses->options != NULL) - seq_printf(m, ",%s", v9ses->options); - return 0; -} - static void v9fs_umount_begin(struct super_block *sb) { @@ -235,7 +222,7 @@ v9fs_umount_begin(struct super_block *sb) static const struct super_operations v9fs_super_ops = { .statfs = simple_statfs, .clear_inode = v9fs_clear_inode, - .show_options = v9fs_show_options, + .show_options = generic_show_options, .umount_begin = v9fs_umount_begin, }; -- cgit v1.2.3 From 1154ecbd2f8298ef75609f5f8ed5aca96be599fb Mon Sep 17 00:00:00 2001 From: Zhang Qiang Date: Tue, 18 Aug 2009 14:58:24 +0800 Subject: nilfs2: missing a read lock for segment writer in nilfs_attach_checkpoint() 'ns_cno' of structure 'the_nilfs' must be protected from segment writer, in other words, the caller of nilfs_get_checkpoint should hold read lock for nilfs->ns_segctor_sem. This patch adds the lock/unlock operations in nilfs_attach_checkpoint() when calling nilfs_cpfile_get_checkpoint(). Signed-off-by: Zhang Qiang Signed-off-by: Ryusuke Konishi --- fs/nilfs2/super.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 8e2ec43b18f4..151964f0de4c 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -416,8 +416,10 @@ int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno) if (unlikely(err)) goto failed; + down_read(&nilfs->ns_segctor_sem); err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp, &bh_cp); + up_read(&nilfs->ns_segctor_sem); if (unlikely(err)) { if (err == -ENOENT || err == -EINVAL) { printk(KERN_ERR -- cgit v1.2.3 From a924586036833086b262a371b09d1266c23bb4d1 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Wed, 19 Aug 2009 00:29:43 +0900 Subject: nilfs2: fix oopses with doubly mounted snapshots will fix kernel oopses like the following: # mount -t nilfs2 -r -o cp=20 /dev/sdb1 /test1 # mount -t nilfs2 -r -o cp=20 /dev/sdb1 /test2 # umount /test1 # umount /test2 BUG: sleeping function called from invalid context at arch/x86/mm/fault.c:1069 in_atomic(): 0, irqs_disabled(): 1, pid: 3886, name: umount.nilfs2 1 lock held by umount.nilfs2/3886: #0: (&type->s_umount_key#31){+.+...}, at: [] deactivate_super+0x52/0x6c irq event stamp: 1219 hardirqs last enabled at (1219): [] __mutex_unlock_slowpath+0xf8/0x119 hardirqs last disabled at (1218): [] __mutex_unlock_slowpath+0x59/0x119 softirqs last enabled at (1214): [] __do_softirq+0x1a5/0x1ad softirqs last disabled at (1205): [] do_softirq+0x36/0x5a Pid: 3886, comm: umount.nilfs2 Not tainted 2.6.31-rc6 #55 Call Trace: [] __might_sleep+0x107/0x10e [] do_page_fault+0x246/0x397 [] ? do_page_fault+0x0/0x397 [] error_code+0x6b/0x70 [] ? do_page_fault+0x0/0x397 [] ? __lock_acquire+0x91/0x12fd [] ? __lock_acquire+0x12ee/0x12fd [] ? __lock_acquire+0x12ee/0x12fd [] lock_acquire+0xba/0xdd [] ? nilfs_detach_segment_constructor+0x2f/0x2fa [nilfs2] [] down_write+0x2a/0x46 [] ? nilfs_detach_segment_constructor+0x2f/0x2fa [nilfs2] [] nilfs_detach_segment_constructor+0x2f/0x2fa [nilfs2] [] ? mark_held_locks+0x43/0x5b [] ? trace_hardirqs_on_caller+0x10b/0x133 [] ? trace_hardirqs_on+0xb/0xd [] nilfs_put_super+0x2f/0xca [nilfs2] [] generic_shutdown_super+0x49/0xb8 [] kill_block_super+0x1d/0x31 [] ? vfs_quota_off+0x0/0x12 [] deactivate_super+0x57/0x6c [] mntput_no_expire+0x8c/0xb4 [] sys_umount+0x27f/0x2a4 [] sys_oldumount+0xd/0xf [] sysenter_do_call+0x12/0x38 ... This turns out to be a bug brought by an -rc1 patch ("nilfs2: simplify remaining sget() use"). In the patch, a new "put resource" function, nilfs_put_sbinfo() was introduced to delay freeing nilfs_sb_info struct. But the nilfs_put_sbinfo() mistakenly used atomic_dec_and_test() function to check the reference count, and it caused the nilfs_sb_info was freed when user mounted a snapshot twice. This bug also suggests there was unseen memory leak in usual mount /umount operations for nilfs. Signed-off-by: Ryusuke Konishi --- fs/nilfs2/the_nilfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index e8adbffc626f..1b9caafb8662 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -253,7 +253,7 @@ nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi) static inline void nilfs_put_sbinfo(struct nilfs_sb_info *sbi) { - if (!atomic_dec_and_test(&sbi->s_count)) + if (atomic_dec_and_test(&sbi->s_count)) kfree(sbi); } -- cgit v1.2.3 From 89a4eb4b66e8f4d395e14a14d262dac4d6ca52f0 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 18 Aug 2009 14:11:08 -0700 Subject: vfs: make get_sb_pseudo set s_maxbytes to value that can be cast to signed get_sb_pseudo sets s_maxbytes to ~0ULL which becomes negative when cast to a signed value. Fix it to use MAX_LFS_FILESIZE which casts properly to a positive signed value. Signed-off-by: Jeff Layton Reviewed-by: Johannes Weiner Acked-by: Steve French Reviewed-by: Christoph Hellwig Cc: Al Viro Cc: Robert Love Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/libfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/libfs.c b/fs/libfs.c index ddfa89948c3f..dcec3d3ea64f 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -217,7 +217,7 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name, return PTR_ERR(s); s->s_flags = MS_NOUSER; - s->s_maxbytes = ~0ULL; + s->s_maxbytes = MAX_LFS_FILESIZE; s->s_blocksize = PAGE_SIZE; s->s_blocksize_bits = PAGE_SHIFT; s->s_magic = magic; -- cgit v1.2.3 From 0753ba01e126020bf0f8150934903b48935b697d Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Tue, 18 Aug 2009 14:11:10 -0700 Subject: mm: revert "oom: move oom_adj value" The commit 2ff05b2b (oom: move oom_adj value) moveed the oom_adj value to the mm_struct. It was a very good first step for sanitize OOM. However Paul Menage reported the commit makes regression to his job scheduler. Current OOM logic can kill OOM_DISABLED process. Why? His program has the code of similar to the following. ... set_oom_adj(OOM_DISABLE); /* The job scheduler never killed by oom */ ... if (vfork() == 0) { set_oom_adj(0); /* Invoked child can be killed */ execve("foo-bar-cmd"); } .... vfork() parent and child are shared the same mm_struct. then above set_oom_adj(0) doesn't only change oom_adj for vfork() child, it's also change oom_adj for vfork() parent. Then, vfork() parent (job scheduler) lost OOM immune and it was killed. Actually, fork-setting-exec idiom is very frequently used in userland program. We must not break this assumption. Then, this patch revert commit 2ff05b2b and related commit. Reverted commit list --------------------- - commit 2ff05b2b4e (oom: move oom_adj value from task_struct to mm_struct) - commit 4d8b9135c3 (oom: avoid unnecessary mm locking and scanning for OOM_DISABLE) - commit 8123681022 (oom: only oom kill exiting tasks with attached memory) - commit 933b787b57 (mm: copy over oom_adj value at fork time) Signed-off-by: KOSAKI Motohiro Cc: Paul Menage Cc: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: Rik van Riel Cc: Linus Torvalds Cc: Oleg Nesterov Cc: Nick Piggin Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 15 +++------ fs/proc/base.c | 19 ++--------- include/linux/mm_types.h | 2 -- include/linux/sched.h | 1 + kernel/fork.c | 1 - mm/oom_kill.c | 64 +++++++++++++++++++++++--------------- 6 files changed, 48 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index fad18f9456e4..ffead13f9443 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -1167,13 +1167,11 @@ CHAPTER 3: PER-PROCESS PARAMETERS 3.1 /proc//oom_adj - Adjust the oom-killer score ------------------------------------------------------ -This file can be used to adjust the score used to select which processes should -be killed in an out-of-memory situation. The oom_adj value is a characteristic -of the task's mm, so all threads that share an mm with pid will have the same -oom_adj value. A high value will increase the likelihood of this process being -killed by the oom-killer. Valid values are in the range -16 to +15 as -explained below and a special value of -17, which disables oom-killing -altogether for threads sharing pid's mm. +This file can be used to adjust the score used to select which processes +should be killed in an out-of-memory situation. Giving it a high score will +increase the likelihood of this process being killed by the oom-killer. Valid +values are in the range -16 to +15, plus the special value -17, which disables +oom-killing altogether for this process. The process to be killed in an out-of-memory situation is selected among all others based on its badness score. This value equals the original memory size of the process @@ -1187,9 +1185,6 @@ the parent's score if they do not share the same memory. Thus forking servers are the prime candidates to be killed. Having only one 'hungry' child will make parent less preferable than the child. -/proc//oom_adj cannot be changed for kthreads since they are immune from -oom-killing already. - /proc//oom_score shows process' current badness score. The following heuristics are then applied: diff --git a/fs/proc/base.c b/fs/proc/base.c index 175db258942f..6f742f6658a9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1003,12 +1003,7 @@ static ssize_t oom_adjust_read(struct file *file, char __user *buf, if (!task) return -ESRCH; - task_lock(task); - if (task->mm) - oom_adjust = task->mm->oom_adj; - else - oom_adjust = OOM_DISABLE; - task_unlock(task); + oom_adjust = task->oomkilladj; put_task_struct(task); len = snprintf(buffer, sizeof(buffer), "%i\n", oom_adjust); @@ -1037,19 +1032,11 @@ static ssize_t oom_adjust_write(struct file *file, const char __user *buf, task = get_proc_task(file->f_path.dentry->d_inode); if (!task) return -ESRCH; - task_lock(task); - if (!task->mm) { - task_unlock(task); - put_task_struct(task); - return -EINVAL; - } - if (oom_adjust < task->mm->oom_adj && !capable(CAP_SYS_RESOURCE)) { - task_unlock(task); + if (oom_adjust < task->oomkilladj && !capable(CAP_SYS_RESOURCE)) { put_task_struct(task); return -EACCES; } - task->mm->oom_adj = oom_adjust; - task_unlock(task); + task->oomkilladj = oom_adjust; put_task_struct(task); if (end - buffer == 0) return -EIO; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 7acc8439d9b3..0042090a4d70 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -240,8 +240,6 @@ struct mm_struct { unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */ - s8 oom_adj; /* OOM kill score adjustment (bit shift) */ - cpumask_t cpu_vm_mask; /* Architecture-specific MM context */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3ab08e4bb6b8..0f1ea4a66957 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1198,6 +1198,7 @@ struct task_struct { * a short time */ unsigned char fpu_counter; + s8 oomkilladj; /* OOM kill score adjustment (bit shift). */ #ifdef CONFIG_BLK_DEV_IO_TRACE unsigned int btrace_seq; #endif diff --git a/kernel/fork.c b/kernel/fork.c index 021e1138556e..144326b7af50 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -426,7 +426,6 @@ static struct mm_struct * mm_init(struct mm_struct * mm, struct task_struct *p) init_rwsem(&mm->mmap_sem); INIT_LIST_HEAD(&mm->mmlist); mm->flags = (current->mm) ? current->mm->flags : default_dump_filter; - mm->oom_adj = (current->mm) ? current->mm->oom_adj : 0; mm->core_state = NULL; mm->nr_ptes = 0; set_mm_counter(mm, file_rss, 0); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 175a67a78a99..a7b2460e922b 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -58,7 +58,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) unsigned long points, cpu_time, run_time; struct mm_struct *mm; struct task_struct *child; - int oom_adj; task_lock(p); mm = p->mm; @@ -66,11 +65,6 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) task_unlock(p); return 0; } - oom_adj = mm->oom_adj; - if (oom_adj == OOM_DISABLE) { - task_unlock(p); - return 0; - } /* * The memory size of the process is the basis for the badness. @@ -154,15 +148,15 @@ unsigned long badness(struct task_struct *p, unsigned long uptime) points /= 8; /* - * Adjust the score by oom_adj. + * Adjust the score by oomkilladj. */ - if (oom_adj) { - if (oom_adj > 0) { + if (p->oomkilladj) { + if (p->oomkilladj > 0) { if (!points) points = 1; - points <<= oom_adj; + points <<= p->oomkilladj; } else - points >>= -(oom_adj); + points >>= -(p->oomkilladj); } #ifdef DEBUG @@ -257,8 +251,11 @@ static struct task_struct *select_bad_process(unsigned long *ppoints, *ppoints = ULONG_MAX; } + if (p->oomkilladj == OOM_DISABLE) + continue; + points = badness(p, uptime.tv_sec); - if (points > *ppoints) { + if (points > *ppoints || !chosen) { chosen = p; *ppoints = points; } @@ -307,7 +304,8 @@ static void dump_tasks(const struct mem_cgroup *mem) } printk(KERN_INFO "[%5d] %5d %5d %8lu %8lu %3d %3d %s\n", p->pid, __task_cred(p)->uid, p->tgid, mm->total_vm, - get_mm_rss(mm), (int)task_cpu(p), mm->oom_adj, p->comm); + get_mm_rss(mm), (int)task_cpu(p), p->oomkilladj, + p->comm); task_unlock(p); } while_each_thread(g, p); } @@ -325,8 +323,11 @@ static void __oom_kill_task(struct task_struct *p, int verbose) return; } - if (!p->mm) + if (!p->mm) { + WARN_ON(1); + printk(KERN_WARNING "tried to kill an mm-less task!\n"); return; + } if (verbose) printk(KERN_ERR "Killed process %d (%s)\n", @@ -348,13 +349,28 @@ static int oom_kill_task(struct task_struct *p) struct mm_struct *mm; struct task_struct *g, *q; - task_lock(p); mm = p->mm; - if (!mm || mm->oom_adj == OOM_DISABLE) { - task_unlock(p); + + /* WARNING: mm may not be dereferenced since we did not obtain its + * value from get_task_mm(p). This is OK since all we need to do is + * compare mm to q->mm below. + * + * Furthermore, even if mm contains a non-NULL value, p->mm may + * change to NULL at any time since we do not hold task_lock(p). + * However, this is of no concern to us. + */ + + if (mm == NULL) return 1; - } - task_unlock(p); + + /* + * Don't kill the process if any threads are set to OOM_DISABLE + */ + do_each_thread(g, q) { + if (q->mm == mm && q->oomkilladj == OOM_DISABLE) + return 1; + } while_each_thread(g, q); + __oom_kill_task(p, 1); /* @@ -377,11 +393,10 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, struct task_struct *c; if (printk_ratelimit()) { - task_lock(current); printk(KERN_WARNING "%s invoked oom-killer: " - "gfp_mask=0x%x, order=%d, oom_adj=%d\n", - current->comm, gfp_mask, order, - current->mm ? current->mm->oom_adj : OOM_DISABLE); + "gfp_mask=0x%x, order=%d, oomkilladj=%d\n", + current->comm, gfp_mask, order, current->oomkilladj); + task_lock(current); cpuset_print_task_mems_allowed(current); task_unlock(current); dump_stack(); @@ -394,9 +409,8 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * If the task is already exiting, don't alarm the sysadmin or kill * its children or threads, just set TIF_MEMDIE so it can die quickly - * if its mm is still attached. */ - if (p->mm && (p->flags & PF_EXITING)) { + if (p->flags & PF_EXITING) { __oom_kill_task(p, 0); return 0; } -- cgit v1.2.3 From a8b88d3d49623ac701b5dc996cbd61219c793c7c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 20 Aug 2009 18:26:52 +0200 Subject: ocfs2: Add missing lock name There is missing name for NFSSync cluster lock. This makes lockdep unhappy because we end up passing NULL to lockdep when initializing lock key. Fix it. Signed-off-by: Jan Kara Signed-off-by: Joel Becker --- fs/ocfs2/ocfs2_lockid.h | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index fcdba091af3d..c212cf5a2bdf 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ b/fs/ocfs2/ocfs2_lockid.h @@ -108,6 +108,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_OPEN] = "Open", [OCFS2_LOCK_TYPE_FLOCK] = "Flock", [OCFS2_LOCK_TYPE_QINFO] = "Quota", + [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", }; -- cgit v1.2.3 From c795b33ba171e41563ab7e25105c0cd4edd81cd7 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 20 Aug 2009 13:43:19 -0500 Subject: ocfs2/dlm: Wait on lockres instead of erroring cancel requests In case a downconvert is queued, and a flock receives a signal, BUG_ON(lockres->l_action != OCFS2_AST_INVALID) is triggered because a lock cancel triggers a dlmunlock while an AST is scheduled. To avoid this, allow a LKM_CANCEL to pass through, and let it wait on __dlm_wait_on_lockres(). Signed-off-by: Goldwyn Rodrigues Acked-off-by: Mark Fasheh Signed-off-by: Joel Becker --- fs/ocfs2/dlm/dlmunlock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index fcf879ed6930..756f5b0998e0 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -122,7 +122,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, * that still has AST's pending... */ in_use = !list_empty(&lock->ast_list); spin_unlock(&dlm->ast_lock); - if (in_use) { + if (in_use && !(flags & LKM_CANCEL)) { mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " "while waiting for an ast!", res->lockname.len, res->lockname.name); @@ -131,7 +131,7 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); if (res->state & DLM_LOCK_RES_IN_PROGRESS) { - if (master_node) { + if (master_node && !(flags & LKM_CANCEL)) { mlog(ML_ERROR, "lockres in progress!\n"); spin_unlock(&res->spinlock); return DLM_FORWARD; -- cgit v1.2.3 From 03e860bd9f6a3cca747b0795bed26279a8b420a0 Mon Sep 17 00:00:00 2001 From: "From: Nick Piggin" Date: Fri, 21 Aug 2009 10:09:44 +0200 Subject: btrfs: fix inode rbtree corruption Node may not be inserted over existing node. This causes inode tree corruption and I was seeing crashes in inode_tree_del which I can not reproduce after this patch. The other way to fix this would be to tie inode lifetime in the rbtree with inode while not in freeing state. I had a look at this but it is not so trivial at this point. At least this patch gets things working again. Signed-off-by: Nick Piggin Cc: Chris Mason Acked-by: Yan Zheng Signed-off-by: Jens Axboe --- fs/btrfs/inode.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 272b9b2bea86..59cba180fe83 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3099,8 +3099,12 @@ static void inode_tree_add(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_inode *entry; - struct rb_node **p = &root->inode_tree.rb_node; - struct rb_node *parent = NULL; + struct rb_node **p; + struct rb_node *parent; + +again: + p = &root->inode_tree.rb_node; + parent = NULL; spin_lock(&root->inode_lock); while (*p) { @@ -3108,13 +3112,16 @@ static void inode_tree_add(struct inode *inode) entry = rb_entry(parent, struct btrfs_inode, rb_node); if (inode->i_ino < entry->vfs_inode.i_ino) - p = &(*p)->rb_left; + p = &parent->rb_left; else if (inode->i_ino > entry->vfs_inode.i_ino) - p = &(*p)->rb_right; + p = &parent->rb_right; else { WARN_ON(!(entry->vfs_inode.i_state & (I_WILL_FREE | I_FREEING | I_CLEAR))); - break; + rb_erase(parent, &root->inode_tree); + RB_CLEAR_NODE(parent); + spin_unlock(&root->inode_lock); + goto again; } } rb_link_node(&BTRFS_I(inode)->rb_node, parent, p); @@ -3126,12 +3133,12 @@ static void inode_tree_del(struct inode *inode) { struct btrfs_root *root = BTRFS_I(inode)->root; + spin_lock(&root->inode_lock); if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { - spin_lock(&root->inode_lock); rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); - spin_unlock(&root->inode_lock); RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); } + spin_unlock(&root->inode_lock); } static noinline void init_btrfs_i(struct inode *inode) -- cgit v1.2.3 From 8e9d78edea3ce5c0036f85b93091483f2f15443a Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 21 Aug 2009 17:40:08 -0700 Subject: Re-introduce page mapping check in mark_buffer_dirty() In commit a8e7d49aa7be728c4ae241a75a2a124cdcabc0c5 ("Fix race in create_empty_buffers() vs __set_page_dirty_buffers()"), I removed a test for a NULL page mapping unintentionally when some of the code inside __set_page_dirty() was moved to the callers. That removal generally didn't matter, since a filesystem would serialize truncation (which clears the page mapping) against writing (which marks the buffer dirty), so locking at a higher level (either per-page or an inode at a time) should mean that the buffer page would be stable. And indeed, nothing bad seemed to happen. Except it turns out that apparently reiserfs does something odd when under load and writing out the journal, and we have a number of bugzilla entries that look similar: http://bugzilla.kernel.org/show_bug.cgi?id=13556 http://bugzilla.kernel.org/show_bug.cgi?id=13756 http://bugzilla.kernel.org/show_bug.cgi?id=13876 and it looks like reiserfs depended on that check (the common theme seems to be "data=journal", and a journal writeback during a truncate). I suspect reiserfs should have some additional locking, but in the meantime this should get us back to the pre-2.6.29 behavior. Pattern-pointed-out-by: Roland Kletzing Cc: stable@kernel.org (2.6.29 and 2.6.30) Cc: Jeff Mahoney Cc: Nick Piggin Cc: Al Viro Signed-off-by: Linus Torvalds --- fs/buffer.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index a3ef091a45bd..28f320fac4d4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1165,8 +1165,11 @@ void mark_buffer_dirty(struct buffer_head *bh) if (!test_set_buffer_dirty(bh)) { struct page *page = bh->b_page; - if (!TestSetPageDirty(page)) - __set_page_dirty(page, page_mapping(page), 0); + if (!TestSetPageDirty(page)) { + struct address_space *mapping = page_mapping(page); + if (mapping) + __set_page_dirty(page, mapping, 0); + } } } -- cgit v1.2.3 From 6777d773a463ac045d333b989d4e44660f8d92ad Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Fri, 21 Aug 2009 14:32:48 -0400 Subject: kernel_read: redefine offset type vfs_read() offset is defined as loff_t, but kernel_read() offset is only defined as unsigned long. Redefine kernel_read() offset as loff_t. Cc: stable@kernel.org Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- fs/exec.c | 4 ++-- include/linux/fs.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/exec.c b/fs/exec.c index 4a8849e45b21..fb4f3cdda78c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -678,8 +678,8 @@ exit: } EXPORT_SYMBOL(open_exec); -int kernel_read(struct file *file, unsigned long offset, - char *addr, unsigned long count) +int kernel_read(struct file *file, loff_t offset, + char *addr, unsigned long count) { mm_segment_t old_fs; loff_t pos = offset; diff --git a/include/linux/fs.h b/include/linux/fs.h index 67888a9e0655..73e9b643e455 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2123,7 +2123,7 @@ extern struct file *do_filp_open(int dfd, const char *pathname, int open_flag, int mode, int acc_mode); extern int may_open(struct path *, int, int); -extern int kernel_read(struct file *, unsigned long, char *, unsigned long); +extern int kernel_read(struct file *, loff_t, char *, unsigned long); extern struct file * open_exec(const char *); /* fs/dcache.c -- generic fs support functions */ -- cgit v1.2.3 From 6d41807614151829ae17a3a58bff8572af5e407e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 10 Aug 2009 16:03:43 -0400 Subject: ext3: Update Kconfig description of EXT3_DEFAULTS_TO_ORDERED The old description for this configuration option was perhaps not completely balanced in terms of describing the tradeoffs of using a default of data=writeback vs. data=ordered. Despite the fact that old description very strongly recomended disabling this feature, all of the major distributions have elected to preserve the existing 'legacy' default, which is a strong hint that it perhaps wasn't telling the whole story. This revised description has been vetted by a number of ext3 developers as being better at informing the user about the tradeoffs of enabling or disabling this configuration feature. Cc: linux-ext4@vger.kernel.org Signed-off-by: "Theodore Ts'o" Signed-off-by: Jan Kara --- fs/ext3/Kconfig | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig index fb3c1a21b135..522b15498f45 100644 --- a/fs/ext3/Kconfig +++ b/fs/ext3/Kconfig @@ -29,23 +29,25 @@ config EXT3_FS module will be called ext3. config EXT3_DEFAULTS_TO_ORDERED - bool "Default to 'data=ordered' in ext3 (legacy option)" + bool "Default to 'data=ordered' in ext3" depends on EXT3_FS help - If a filesystem does not explicitly specify a data ordering - mode, and the journal capability allowed it, ext3 used to - historically default to 'data=ordered'. - - That was a rather unfortunate choice, because it leads to all - kinds of latency problems, and the 'data=writeback' mode is more - appropriate these days. - - You should probably always answer 'n' here, and if you really - want to use 'data=ordered' mode, set it in the filesystem itself - with 'tune2fs -o journal_data_ordered'. - - But if you really want to enable the legacy default, you can do - so by answering 'y' to this question. + The journal mode options for ext3 have different tradeoffs + between when data is guaranteed to be on disk and + performance. The use of "data=writeback" can cause + unwritten data to appear in files after an system crash or + power failure, which can be a security issue. However, + "data=ordered" mode can also result in major performance + problems, including seconds-long delays before an fsync() + call returns. For details, see: + + http://ext4.wiki.kernel.org/index.php/Ext3_data_mode_tradeoffs + + If you have been historically happy with ext3's performance, + data=ordered mode will be a safe choice and you should + answer 'y' here. If you understand the reliability and data + privacy issues of data=writeback and are willing to make + that trade off, answer 'n'. config EXT3_FS_XATTR bool "Ext3 extended attributes" -- cgit v1.2.3 From 3c4cec65274481ec6332b0a91f19b4c8c5394801 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 24 Aug 2009 16:38:43 +0200 Subject: ext3: Improve error message that changing journaling mode on remount is not possible This patch makes the error message about changing journaling mode on remount more descriptive. Some people are going to hit this error now due to commit bbae8bcc49bc4d002221dab52c79a50a82e7cd1f if they configure a kernel to default to data=writeback mode. The problem happens if they have data=ordered set for the root filesystem in /etc/fstab but not in the kernel command line (and they don't use initrd). Their filesystem then gets mounted as data=writeback by kernel but then their boot fails because init scripts won't be able to remount the filesystem rw. Better error message will hopefully make it easier for them to find the error in their setup and bother us less with error reports :). Signed-off-by: Jan Kara --- fs/ext3/super.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 524b349c6299..a8d80a7f1105 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -543,6 +543,19 @@ static inline void ext3_show_quota_options(struct seq_file *seq, struct super_bl #endif } +static char *data_mode_string(unsigned long mode) +{ + switch (mode) { + case EXT3_MOUNT_JOURNAL_DATA: + return "journal"; + case EXT3_MOUNT_ORDERED_DATA: + return "ordered"; + case EXT3_MOUNT_WRITEBACK_DATA: + return "writeback"; + } + return "unknown"; +} + /* * Show an option if * - it's set to a non-default value OR @@ -616,13 +629,8 @@ static int ext3_show_options(struct seq_file *seq, struct vfsmount *vfs) if (test_opt(sb, NOBH)) seq_puts(seq, ",nobh"); - if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA) - seq_puts(seq, ",data=journal"); - else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA) - seq_puts(seq, ",data=ordered"); - else if (test_opt(sb, DATA_FLAGS) == EXT3_MOUNT_WRITEBACK_DATA) - seq_puts(seq, ",data=writeback"); - + seq_printf(seq, ",data=%s", data_mode_string(sbi->s_mount_opt & + EXT3_MOUNT_DATA_FLAGS)); if (test_opt(sb, DATA_ERR_ABORT)) seq_puts(seq, ",data_err=abort"); @@ -1024,12 +1032,18 @@ static int parse_options (char *options, struct super_block *sb, datacheck: if (is_remount) { if ((sbi->s_mount_opt & EXT3_MOUNT_DATA_FLAGS) - != data_opt) { - printk(KERN_ERR - "EXT3-fs: cannot change data " - "mode on remount\n"); - return 0; - } + == data_opt) + break; + printk(KERN_ERR + "EXT3-fs (device %s): Cannot change " + "data mode on remount. The filesystem " + "is mounted in data=%s mode and you " + "try to remount it in data=%s mode.\n", + sb->s_id, + data_mode_string(sbi->s_mount_opt & + EXT3_MOUNT_DATA_FLAGS), + data_mode_string(data_opt)); + return 0; } else { sbi->s_mount_opt &= ~EXT3_MOUNT_DATA_FLAGS; sbi->s_mount_opt |= data_opt; -- cgit v1.2.3 From 353d5c30c666580347515da609dd74a2b8e9b828 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 24 Aug 2009 16:30:28 +0100 Subject: mm: fix hugetlb bug due to user_shm_unlock call 2.6.30's commit 8a0bdec194c21c8fdef840989d0d7b742bb5d4bc removed user_shm_lock() calls in hugetlb_file_setup() but left the user_shm_unlock call in shm_destroy(). In detail: Assume that can_do_hugetlb_shm() returns true and hence user_shm_lock() is not called in hugetlb_file_setup(). However, user_shm_unlock() is called in any case in shm_destroy() and in the following atomic_dec_and_lock(&up->__count) in free_uid() is executed and if up->__count gets zero, also cleanup_user_struct() is scheduled. Note that sched_destroy_user() is empty if CONFIG_USER_SCHED is not set. However, the ref counter up->__count gets unexpectedly non-positive and the corresponding structs are freed even though there are live references to them, resulting in a kernel oops after a lots of shmget(SHM_HUGETLB)/shmctl(IPC_RMID) cycles and CONFIG_USER_SCHED set. Hugh changed Stefan's suggested patch: can_do_hugetlb_shm() at the time of shm_destroy() may give a different answer from at the time of hugetlb_file_setup(). And fixed newseg()'s no_id error path, which has missed user_shm_unlock() ever since it came in 2.6.9. Reported-by: Stefan Huber Signed-off-by: Hugh Dickins Tested-by: Stefan Huber Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 20 ++++++++++++-------- include/linux/hugetlb.h | 6 ++++-- ipc/shm.c | 8 +++++--- 3 files changed, 21 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 941c8425c10b..cb88dac8ccaa 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -935,26 +935,28 @@ static int can_do_hugetlb_shm(void) return capable(CAP_IPC_LOCK) || in_group_p(sysctl_hugetlb_shm_group); } -struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) +struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag, + struct user_struct **user) { int error = -ENOMEM; - int unlock_shm = 0; struct file *file; struct inode *inode; struct dentry *dentry, *root; struct qstr quick_string; - struct user_struct *user = current_user(); + *user = NULL; if (!hugetlbfs_vfsmount) return ERR_PTR(-ENOENT); if (!can_do_hugetlb_shm()) { - if (user_shm_lock(size, user)) { - unlock_shm = 1; + *user = current_user(); + if (user_shm_lock(size, *user)) { WARN_ONCE(1, "Using mlock ulimits for SHM_HUGETLB deprecated\n"); - } else + } else { + *user = NULL; return ERR_PTR(-EPERM); + } } root = hugetlbfs_vfsmount->mnt_root; @@ -996,8 +998,10 @@ out_inode: out_dentry: dput(dentry); out_shm_unlock: - if (unlock_shm) - user_shm_unlock(size, user); + if (*user) { + user_shm_unlock(size, *user); + *user = NULL; + } return ERR_PTR(error); } diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 2723513a5651..5cbc620bdfe0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -10,6 +10,7 @@ #include struct ctl_table; +struct user_struct; int PageHuge(struct page *page); @@ -146,7 +147,8 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) extern const struct file_operations hugetlbfs_file_operations; extern struct vm_operations_struct hugetlb_vm_ops; -struct file *hugetlb_file_setup(const char *name, size_t, int); +struct file *hugetlb_file_setup(const char *name, size_t size, int acct, + struct user_struct **user); int hugetlb_get_quota(struct address_space *mapping, long delta); void hugetlb_put_quota(struct address_space *mapping, long delta); @@ -168,7 +170,7 @@ static inline void set_file_hugepages(struct file *file) #define is_file_hugepages(file) 0 #define set_file_hugepages(file) BUG() -#define hugetlb_file_setup(name,size,acctflag) ERR_PTR(-ENOSYS) +#define hugetlb_file_setup(name,size,acct,user) ERR_PTR(-ENOSYS) #endif /* !CONFIG_HUGETLBFS */ diff --git a/ipc/shm.c b/ipc/shm.c index 15dd238e5338..1bc4701ef4f0 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -174,7 +174,7 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) shm_unlock(shp); if (!is_file_hugepages(shp->shm_file)) shmem_lock(shp->shm_file, 0, shp->mlock_user); - else + else if (shp->mlock_user) user_shm_unlock(shp->shm_file->f_path.dentry->d_inode->i_size, shp->mlock_user); fput (shp->shm_file); @@ -369,8 +369,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) /* hugetlb_file_setup applies strict accounting */ if (shmflg & SHM_NORESERVE) acctflag = VM_NORESERVE; - file = hugetlb_file_setup(name, size, acctflag); - shp->mlock_user = current_user(); + file = hugetlb_file_setup(name, size, acctflag, + &shp->mlock_user); } else { /* * Do not allow no accounting for OVERCOMMIT_NEVER, even @@ -410,6 +410,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) return error; no_id: + if (shp->mlock_user) /* shmflg & SHM_HUGETLB case */ + user_shm_unlock(size, shp->mlock_user); fput(file); no_file: security_shm_free(shp); -- cgit v1.2.3 From 7111dc73923e9737b38a3ef5b5f236109000ff28 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 24 Aug 2009 19:21:29 -0400 Subject: NFSv4: Fix an infinite looping problem with the nfs4_state_manager Commit 76db6d9500caeaa774a3e32a997eba30bbdc176b (nfs41: add session setup to the state manager) introduces an infinite loop possibility in the NFSv4 state manager. By first checking nfs4_has_session() before clearing the NFS4CLNT_SESSION_SETUP flag, it allows for a situation where someone sets that flag, but it never gets cleared, and so the state manager loops. In fact commit c3fad1b1aaf850bf692642642ace7cd0d64af0a3 (nfs41: add session reset to state manager) causes this to happen every time we get a network partition error. Signed-off-by: Trond Myklebust Tested-by: Daniel J Blueman Signed-off-by: Linus Torvalds --- fs/nfs/nfs4state.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 65ca8c18476f..1434080aefeb 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1250,8 +1250,8 @@ static void nfs4_state_manager(struct nfs_client *clp) continue; } /* Initialize or reset the session */ - if (nfs4_has_session(clp) && - test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state)) { + if (test_and_clear_bit(NFS4CLNT_SESSION_SETUP, &clp->cl_state) + && nfs4_has_session(clp)) { if (clp->cl_cons_state == NFS_CS_SESSION_INITING) status = nfs4_initialize_session(clp); else -- cgit v1.2.3 From 52cef7555adf5ca09b3b7283097466759120d901 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 24 Aug 2009 16:03:35 -0400 Subject: inotify: seperate new watch creation updating existing watches There is nothing known wrong with the inotify watch addition/modification but this patch seperates the two code paths to make them each easy to verify as correct. Signed-off-by: Eric Paris --- fs/notify/inotify/inotify_user.c | 172 +++++++++++++++++++++++---------------- 1 file changed, 103 insertions(+), 69 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index dc32ed8323ba..d8f73c253073 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -431,80 +431,29 @@ static void inotify_free_mark(struct fsnotify_mark_entry *entry) kmem_cache_free(inotify_inode_mark_cachep, ientry); } -static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) +static int inotify_update_existing_watch(struct fsnotify_group *group, + struct inode *inode, + u32 arg) { - struct fsnotify_mark_entry *entry = NULL; + struct fsnotify_mark_entry *entry; struct inotify_inode_mark_entry *ientry; - struct inotify_inode_mark_entry *tmp_ientry; - int ret = 0; - int add = (arg & IN_MASK_ADD); - __u32 mask; __u32 old_mask, new_mask; + __u32 mask; + int add = (arg & IN_MASK_ADD); + int ret; /* don't allow invalid bits: we don't want flags set */ mask = inotify_arg_to_mask(arg); if (unlikely(!mask)) return -EINVAL; - tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); - if (unlikely(!tmp_ientry)) - return -ENOMEM; - /* we set the mask at the end after attaching it */ - fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark); - tmp_ientry->wd = -1; - -find_entry: spin_lock(&inode->i_lock); entry = fsnotify_find_mark_entry(group, inode); spin_unlock(&inode->i_lock); - if (entry) { - ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); - } else { - ret = -ENOSPC; - if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) - goto out_err; -retry: - ret = -ENOMEM; - if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) - goto out_err; - - spin_lock(&group->inotify_data.idr_lock); - ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, - group->inotify_data.last_wd, - &tmp_ientry->wd); - spin_unlock(&group->inotify_data.idr_lock); - if (ret) { - if (ret == -EAGAIN) - goto retry; - goto out_err; - } + if (!entry) + return -ENOENT; - ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); - if (ret) { - inotify_remove_from_idr(group, tmp_ientry); - if (ret == -EEXIST) - goto find_entry; - goto out_err; - } - - /* tmp_ientry has been added to the inode, so we are all set up. - * now we just need to make sure tmp_ientry doesn't get freed and - * we need to set up entry and ientry so the generic code can - * do its thing. */ - ientry = tmp_ientry; - entry = &ientry->fsn_entry; - tmp_ientry = NULL; - - atomic_inc(&group->inotify_data.user->inotify_watches); - - /* update the idr hint */ - group->inotify_data.last_wd = ientry->wd; - - /* we put the mark on the idr, take a reference */ - fsnotify_get_mark(entry); - } - - ret = ientry->wd; + ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); spin_lock(&entry->lock); @@ -536,18 +485,103 @@ retry: fsnotify_recalc_group_mask(group); } - /* this either matches fsnotify_find_mark_entry, or init_mark_entry - * depending on which path we took... */ + /* return the wd */ + ret = ientry->wd; + + /* match the get from fsnotify_find_mark_entry() */ fsnotify_put_mark(entry); + return ret; +} + +static int inotify_new_watch(struct fsnotify_group *group, + struct inode *inode, + u32 arg) +{ + struct inotify_inode_mark_entry *tmp_ientry; + __u32 mask; + int ret; + + /* don't allow invalid bits: we don't want flags set */ + mask = inotify_arg_to_mask(arg); + if (unlikely(!mask)) + return -EINVAL; + + tmp_ientry = kmem_cache_alloc(inotify_inode_mark_cachep, GFP_KERNEL); + if (unlikely(!tmp_ientry)) + return -ENOMEM; + + fsnotify_init_mark(&tmp_ientry->fsn_entry, inotify_free_mark); + tmp_ientry->fsn_entry.mask = mask; + tmp_ientry->wd = -1; + + ret = -ENOSPC; + if (atomic_read(&group->inotify_data.user->inotify_watches) >= inotify_max_user_watches) + goto out_err; +retry: + ret = -ENOMEM; + if (unlikely(!idr_pre_get(&group->inotify_data.idr, GFP_KERNEL))) + goto out_err; + + spin_lock(&group->inotify_data.idr_lock); + ret = idr_get_new_above(&group->inotify_data.idr, &tmp_ientry->fsn_entry, + group->inotify_data.last_wd, + &tmp_ientry->wd); + spin_unlock(&group->inotify_data.idr_lock); + if (ret) { + /* idr was out of memory allocate and try again */ + if (ret == -EAGAIN) + goto retry; + goto out_err; + } + + /* we are on the idr, now get on the inode */ + ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); + if (ret) { + /* we failed to get on the inode, get off the idr */ + inotify_remove_from_idr(group, tmp_ientry); + goto out_err; + } + + /* we put the mark on the idr, take a reference */ + fsnotify_get_mark(&tmp_ientry->fsn_entry); + + /* update the idr hint, who cares about races, it's just a hint */ + group->inotify_data.last_wd = tmp_ientry->wd; + + /* increment the number of watches the user has */ + atomic_inc(&group->inotify_data.user->inotify_watches); + + /* return the watch descriptor for this new entry */ + ret = tmp_ientry->wd; + + /* match the ref from fsnotify_init_markentry() */ + fsnotify_put_mark(&tmp_ientry->fsn_entry); + out_err: - /* could be an error, could be that we found an existing mark */ - if (tmp_ientry) { - /* on the idr but didn't make it on the inode */ - if (tmp_ientry->wd != -1) - inotify_remove_from_idr(group, tmp_ientry); + if (ret < 0) kmem_cache_free(inotify_inode_mark_cachep, tmp_ientry); - } + + return ret; +} + +static int inotify_update_watch(struct fsnotify_group *group, struct inode *inode, u32 arg) +{ + int ret = 0; + +retry: + /* try to update and existing watch with the new arg */ + ret = inotify_update_existing_watch(group, inode, arg); + /* no mark present, try to add a new one */ + if (ret == -ENOENT) + ret = inotify_new_watch(group, inode, arg); + /* + * inotify_new_watch could race with another thread which did an + * inotify_new_watch between the update_existing and the add watch + * here, go back and try to update an existing mark again. + */ + if (ret == -EEXIST) + goto retry; return ret; } -- cgit v1.2.3 From cf4374267fbe966e8e4e7db68f5dc7b267439780 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 24 Aug 2009 16:03:35 -0400 Subject: inotify: do not BUG on idr entries at inotify destruction If an inotify watch is left in the idr when an fsnotify group is destroyed this will lead to a BUG. This is not a dangerous situation and really indicates a programming bug and leak of memory. This patch changes it to use a WARN and a printk rather than killing people's boxes. Signed-off-by: Eric Paris --- fs/notify/inotify/inotify_fsnotify.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 5dcbafe72d71..c9ee67b442e1 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -105,16 +105,45 @@ static bool inotify_should_send_event(struct fsnotify_group *group, struct inode return send; } +/* + * This is NEVER supposed to be called. Inotify marks should either have been + * removed from the idr when the watch was removed or in the + * fsnotify_destroy_mark_by_group() call when the inotify instance was being + * torn down. This is only called if the idr is about to be freed but there + * are still marks in it. + */ static int idr_callback(int id, void *p, void *data) { - BUG(); + struct fsnotify_mark_entry *entry; + struct inotify_inode_mark_entry *ientry; + static bool warned = false; + + if (warned) + return 0; + + warned = false; + entry = p; + ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); + + WARN(1, "inotify closing but id=%d for entry=%p in group=%p still in " + "idr. Probably leaking memory\n", id, p, data); + + /* + * I'm taking the liberty of assuming that the mark in question is a + * valid address and I'm dereferencing it. This might help to figure + * out why we got here and the panic is no worse than the original + * BUG() that was here. + */ + if (entry) + printk(KERN_WARNING "entry->group=%p inode=%p wd=%d\n", + entry->group, entry->inode, ientry->wd); return 0; } static void inotify_free_group_priv(struct fsnotify_group *group) { /* ideally the idr is empty and we won't hit the BUG in teh callback */ - idr_for_each(&group->inotify_data.idr, idr_callback, NULL); + idr_for_each(&group->inotify_data.idr, idr_callback, group); idr_remove_all(&group->inotify_data.idr); idr_destroy(&group->inotify_data.idr); } -- cgit v1.2.3 From dead537dd8a1c9495322c1d6f7c780697f474af0 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 24 Aug 2009 16:03:35 -0400 Subject: inotify: fix locking around inotify watching in the idr The are races around the idr storage of inotify watches. It's possible that a watch could be found from sys_inotify_rm_watch() in the idr, but it could be removed from the idr before that code does it's removal. Move the locking and the refcnt'ing so that these have to happen atomically. Signed-off-by: Eric Paris --- fs/notify/inotify/inotify_user.c | 50 ++++++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index d8f73c253073..ce1f5823e2c2 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -364,20 +364,53 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns return error; } +/* + * Remove the mark from the idr (if present) and drop the reference + * on the mark because it was in the idr. + */ static void inotify_remove_from_idr(struct fsnotify_group *group, struct inotify_inode_mark_entry *ientry) { struct idr *idr; + struct fsnotify_mark_entry *entry; + struct inotify_inode_mark_entry *found_ientry; + int wd; spin_lock(&group->inotify_data.idr_lock); idr = &group->inotify_data.idr; - idr_remove(idr, ientry->wd); - spin_unlock(&group->inotify_data.idr_lock); + wd = ientry->wd; + + if (wd == -1) + goto out; + + entry = idr_find(&group->inotify_data.idr, wd); + if (unlikely(!entry)) + goto out; + + found_ientry = container_of(entry, struct inotify_inode_mark_entry, fsn_entry); + if (unlikely(found_ientry != ientry)) { + /* We found an entry in the idr with the right wd, but it's + * not the entry we were told to remove. eparis seriously + * fucked up somewhere. */ + WARN_ON(1); + ientry->wd = -1; + goto out; + } + + /* One ref for being in the idr, one ref held by the caller */ + BUG_ON(atomic_read(&entry->refcnt) < 2); + + idr_remove(idr, wd); ientry->wd = -1; + + /* removed from the idr, drop that ref */ + fsnotify_put_mark(entry); +out: + spin_unlock(&group->inotify_data.idr_lock); } + /* - * Send IN_IGNORED for this wd, remove this wd from the idr, and drop the - * internal reference help on the mark because it is in the idr. + * Send IN_IGNORED for this wd, remove this wd from the idr. */ void inotify_ignored_and_remove_idr(struct fsnotify_mark_entry *entry, struct fsnotify_group *group) @@ -417,9 +450,6 @@ skip_send_ignore: /* remove this entry from the idr */ inotify_remove_from_idr(group, ientry); - /* removed from idr, drop that reference */ - fsnotify_put_mark(entry); - atomic_dec(&group->inotify_data.user->inotify_watches); } @@ -535,6 +565,9 @@ retry: goto out_err; } + /* we put the mark on the idr, take a reference */ + fsnotify_get_mark(&tmp_ientry->fsn_entry); + /* we are on the idr, now get on the inode */ ret = fsnotify_add_mark(&tmp_ientry->fsn_entry, group, inode); if (ret) { @@ -543,9 +576,6 @@ retry: goto out_err; } - /* we put the mark on the idr, take a reference */ - fsnotify_get_mark(&tmp_ientry->fsn_entry); - /* update the idr hint, who cares about races, it's just a hint */ group->inotify_data.last_wd = tmp_ientry->wd; -- cgit v1.2.3 From 0db501bd0610ee0c0aca84d927f90bcccd09e2bd Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 27 Aug 2009 03:20:04 -0700 Subject: inotify: Ensure we alwasy write the terminating NULL. Before the rewrite copy_event_to_user always wrote a terqminating '\0' byte to user space after the filename. Since the rewrite that terminating byte was skipped if your filename is exactly a multiple of event_size. Ouch! So add one byte to name_size before we round up and use clear_user to set userspace to zero like /dev/zero does instead of copying the strange nul_inotify_event. I can't quite convince myself len_to_zero will never exceed 16 and even if it doesn't clear_user should be more efficient and a more accurate reflection of what the code is trying to do. Signed-off-by: Eric W. Biederman Signed-off-by: Eric Paris --- fs/notify/inotify/inotify_user.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index ce1f5823e2c2..0e781bc88d1e 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -47,9 +47,6 @@ static struct vfsmount *inotify_mnt __read_mostly; -/* this just sits here and wastes global memory. used to just pad userspace messages with zeros */ -static struct inotify_event nul_inotify_event; - /* these are configurable via /proc/sys/fs/inotify/ */ static int inotify_max_user_instances __read_mostly; static int inotify_max_queued_events __read_mostly; @@ -199,8 +196,10 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, inotify_free_event_priv(fsn_priv); } - /* round up event->name_len so it is a multiple of event_size */ - name_len = roundup(event->name_len, event_size); + /* round up event->name_len so it is a multiple of event_size + * plus an extra byte for the terminating '\0'. + */ + name_len = roundup(event->name_len + 1, event_size); inotify_event.len = name_len; inotify_event.mask = inotify_mask_to_arg(event->mask); @@ -224,8 +223,8 @@ static ssize_t copy_event_to_user(struct fsnotify_group *group, return -EFAULT; buf += event->name_len; - /* fill userspace with 0's from nul_inotify_event */ - if (copy_to_user(buf, &nul_inotify_event, len_to_zero)) + /* fill userspace with 0's */ + if (clear_user(buf, len_to_zero)) return -EFAULT; buf += len_to_zero; event_size += name_len; -- cgit v1.2.3 From 9886e836a6a5dbd273dc55b17e713f0a188d137f Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 27 Aug 2009 13:09:06 +0100 Subject: AFS: Stop readlink() on AFS crashing due to NULL 'file' ptr kAFS crashes when asked to read a symbolic link because page_getlink() passes a NULL file pointer to read_mapping_page(), but afs_readpage() expects a file pointer from which to extract a key. Modify afs_readpage() to request the appropriate key from the calling process's keyrings if a file struct is not supplied with one attached. Signed-off-by: David Howells Acked-by: Anton Blanchard Signed-off-by: Linus Torvalds --- fs/afs/file.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/afs/file.c b/fs/afs/file.c index 0149dab365e7..681c2a7b013f 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -134,9 +134,16 @@ static int afs_readpage(struct file *file, struct page *page) inode = page->mapping->host; - ASSERT(file != NULL); - key = file->private_data; - ASSERT(key != NULL); + if (file) { + key = file->private_data; + ASSERT(key != NULL); + } else { + key = afs_request_key(AFS_FS_S(inode->i_sb)->volume->cell); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto error_nokey; + } + } _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); @@ -207,12 +214,17 @@ static int afs_readpage(struct file *file, struct page *page) unlock_page(page); } + if (!file) + key_put(key); _leave(" = 0"); return 0; error: SetPageError(page); unlock_page(page); + if (!file) + key_put(key); +error_nokey: _leave(" = %d", ret); return ret; } -- cgit v1.2.3