From e5880d76aea443b04e07da19830da0f6f7494eef Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 27 Jul 2010 11:56:04 -0400 Subject: ext4: fix potential NULL dereference while tracing The allocation_context pointer can be NULL. Signed-off-by: "Theodore Ts'o" --- include/trace/events/ext4.h | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index f3865c7b4166..01e9e0076a92 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -395,11 +395,12 @@ DEFINE_EVENT(ext4__mb_new_pa, ext4_mb_new_group_pa, ); TRACE_EVENT(ext4_mb_release_inode_pa, - TP_PROTO(struct ext4_allocation_context *ac, + TP_PROTO(struct super_block *sb, + struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa, unsigned long long block, unsigned int count), - TP_ARGS(ac, pa, block, count), + TP_ARGS(sb, ac, pa, block, count), TP_STRUCT__entry( __field( dev_t, dev ) @@ -410,8 +411,9 @@ TRACE_EVENT(ext4_mb_release_inode_pa, ), TP_fast_assign( - __entry->dev = ac->ac_sb->s_dev; - __entry->ino = ac->ac_inode->i_ino; + __entry->dev = sb->s_dev; + __entry->ino = (ac && ac->ac_inode) ? + ac->ac_inode->i_ino : 0; __entry->block = block; __entry->count = count; ), @@ -422,10 +424,11 @@ TRACE_EVENT(ext4_mb_release_inode_pa, ); TRACE_EVENT(ext4_mb_release_group_pa, - TP_PROTO(struct ext4_allocation_context *ac, + TP_PROTO(struct super_block *sb, + struct ext4_allocation_context *ac, struct ext4_prealloc_space *pa), - TP_ARGS(ac, pa), + TP_ARGS(sb, ac, pa), TP_STRUCT__entry( __field( dev_t, dev ) @@ -436,8 +439,9 @@ TRACE_EVENT(ext4_mb_release_group_pa, ), TP_fast_assign( - __entry->dev = ac->ac_sb->s_dev; - __entry->ino = ac->ac_inode->i_ino; + __entry->dev = sb->s_dev; + __entry->ino = (ac && ac->ac_inode) ? + ac->ac_inode->i_ino : 0; __entry->pa_pstart = pa->pa_pstart; __entry->pa_len = pa->pa_len; ), -- cgit v1.2.3 From 47def82672b3ba4e7c5e9a4fe48a556f8684d0d6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 27 Jul 2010 11:56:05 -0400 Subject: jbd2: Remove __GFP_NOFAIL from jbd2 layer __GFP_NOFAIL is going away, so add our own retry loop. Also add jbd2__journal_start() and jbd2__journal_restart() which take a gfp mask, so that file systems can optionally (re)start transaction handles using GFP_KERNEL. If they do this, then they need to be prepared to handle receiving an PTR_ERR(-ENOMEM) error, and be ready to reflect that error up to userspace. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/journal.c | 15 ++++++++++--- fs/jbd2/transaction.c | 61 +++++++++++++++++++++++++++++++++++---------------- include/linux/jbd2.h | 4 +++- 3 files changed, 57 insertions(+), 23 deletions(-) (limited to 'include') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index f7bf15787d68..a79d3345b55a 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -41,6 +41,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include @@ -48,8 +49,6 @@ #include #include -EXPORT_SYMBOL(jbd2_journal_start); -EXPORT_SYMBOL(jbd2_journal_restart); EXPORT_SYMBOL(jbd2_journal_extend); EXPORT_SYMBOL(jbd2_journal_stop); EXPORT_SYMBOL(jbd2_journal_lock_updates); @@ -311,7 +310,17 @@ int jbd2_journal_write_metadata_buffer(transaction_t *transaction, */ J_ASSERT_BH(bh_in, buffer_jbddirty(bh_in)); - new_bh = alloc_buffer_head(GFP_NOFS|__GFP_NOFAIL); +retry_alloc: + new_bh = alloc_buffer_head(GFP_NOFS); + if (!new_bh) { + /* + * Failure is not an option, but __GFP_NOFAIL is going + * away; so we retry ourselves here. + */ + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_alloc; + } + /* keep subsequent assertions sane */ new_bh->b_state = 0; init_buffer(new_bh, NULL, NULL); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index e214d68620ac..001e95fb0fe1 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh); @@ -83,30 +85,38 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) * transaction's buffer credits. */ -static int start_this_handle(journal_t *journal, handle_t *handle) +static int start_this_handle(journal_t *journal, handle_t *handle, + int gfp_mask) { transaction_t *transaction; int needed; int nblocks = handle->h_buffer_credits; transaction_t *new_transaction = NULL; - int ret = 0; unsigned long ts = jiffies; if (nblocks > journal->j_max_transaction_buffers) { printk(KERN_ERR "JBD: %s wants too many credits (%d > %d)\n", current->comm, nblocks, journal->j_max_transaction_buffers); - ret = -ENOSPC; - goto out; + return -ENOSPC; } alloc_transaction: if (!journal->j_running_transaction) { - new_transaction = kzalloc(sizeof(*new_transaction), - GFP_NOFS|__GFP_NOFAIL); + new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask); if (!new_transaction) { - ret = -ENOMEM; - goto out; + /* + * If __GFP_FS is not present, then we may be + * being called from inside the fs writeback + * layer, so we MUST NOT fail. Since + * __GFP_NOFAIL is going away, we will arrange + * to retry the allocation ourselves. + */ + if ((gfp_mask & __GFP_FS) == 0) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto alloc_transaction; + } + return -ENOMEM; } } @@ -123,8 +133,8 @@ repeat_locked: if (is_journal_aborted(journal) || (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { spin_unlock(&journal->j_state_lock); - ret = -EROFS; - goto out; + kfree(new_transaction); + return -EROFS; } /* Wait on the journal's transaction barrier if necessary */ @@ -240,10 +250,8 @@ repeat_locked: spin_unlock(&journal->j_state_lock); lock_map_acquire(&handle->h_lockdep_map); -out: - if (unlikely(new_transaction)) /* It's usually NULL */ - kfree(new_transaction); - return ret; + kfree(new_transaction); + return 0; } static struct lock_class_key jbd2_handle_key; @@ -278,7 +286,7 @@ static handle_t *new_handle(int nblocks) * * Return a pointer to a newly allocated handle, or NULL on failure */ -handle_t *jbd2_journal_start(journal_t *journal, int nblocks) +handle_t *jbd2__journal_start(journal_t *journal, int nblocks, int gfp_mask) { handle_t *handle = journal_current_handle(); int err; @@ -298,7 +306,7 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) current->journal_info = handle; - err = start_this_handle(journal, handle); + err = start_this_handle(journal, handle, gfp_mask); if (err < 0) { jbd2_free_handle(handle); current->journal_info = NULL; @@ -308,6 +316,15 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) out: return handle; } +EXPORT_SYMBOL(jbd2__journal_start); + + +handle_t *jbd2_journal_start(journal_t *journal, int nblocks) +{ + return jbd2__journal_start(journal, nblocks, GFP_NOFS); +} +EXPORT_SYMBOL(jbd2_journal_start); + /** * int jbd2_journal_extend() - extend buffer credits. @@ -394,8 +411,7 @@ out: * transaction capabable of guaranteeing the requested number of * credits. */ - -int jbd2_journal_restart(handle_t *handle, int nblocks) +int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; @@ -428,10 +444,17 @@ int jbd2_journal_restart(handle_t *handle, int nblocks) lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; - ret = start_this_handle(journal, handle); + ret = start_this_handle(journal, handle, gfp_mask); return ret; } +EXPORT_SYMBOL(jbd2__journal_restart); + +int jbd2_journal_restart(handle_t *handle, int nblocks) +{ + return jbd2__journal_restart(handle, nblocks, GFP_NOFS); +} +EXPORT_SYMBOL(jbd2_journal_restart); /** * void jbd2_journal_lock_updates () - establish a transaction barrier. diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a4d2e9f7088a..5a72bc75b273 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1081,7 +1081,9 @@ static inline handle_t *journal_current_handle(void) */ extern handle_t *jbd2_journal_start(journal_t *, int nblocks); -extern int jbd2_journal_restart (handle_t *, int nblocks); +extern handle_t *jbd2__journal_start(journal_t *, int nblocks, int gfp_mask); +extern int jbd2_journal_restart(handle_t *, int nblocks); +extern int jbd2__journal_restart(handle_t *, int nblocks, int gfp_mask); extern int jbd2_journal_extend (handle_t *, int nblocks); extern int jbd2_journal_get_write_access(handle_t *, struct buffer_head *); extern int jbd2_journal_get_create_access (handle_t *, struct buffer_head *); -- cgit v1.2.3 From 552ef8024f909d9b3a7442d0ab0d48a22de24e9e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 27 Jul 2010 11:56:06 -0400 Subject: direct-io: move aio_complete into ->end_io Filesystems with unwritten extent support must not complete an AIO request until the transaction to convert the extent has been commited. That means the aio_complete calls needs to be moved into the ->end_io callback so that the filesystem can control when to call it exactly. This makes a bit of a mess out of dio_complete and the ->end_io callback prototype even more complicated. Signed-off-by: Christoph Hellwig Reviewed-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/direct-io.c | 26 ++++++++++++++------------ fs/ext4/inode.c | 10 +++++++--- fs/ocfs2/aops.c | 7 ++++++- fs/xfs/linux-2.6/xfs_aops.c | 7 ++++++- fs/xfs/linux-2.6/xfs_aops.h | 2 ++ include/linux/fs.h | 3 ++- 6 files changed, 37 insertions(+), 18 deletions(-) (limited to 'include') diff --git a/fs/direct-io.c b/fs/direct-io.c index 7600aacf531d..a10cb91cadea 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -218,7 +218,7 @@ static struct page *dio_get_page(struct dio *dio) * filesystems can use it to hold additional state between get_block calls and * dio_complete. */ -static int dio_complete(struct dio *dio, loff_t offset, int ret) +static int dio_complete(struct dio *dio, loff_t offset, int ret, bool is_async) { ssize_t transferred = 0; @@ -239,14 +239,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) transferred = dio->i_size - offset; } - if (dio->end_io && dio->result) - dio->end_io(dio->iocb, offset, transferred, - dio->map_bh.b_private); - - if (dio->flags & DIO_LOCKING) - /* lockdep: non-owner release */ - up_read_non_owner(&dio->inode->i_alloc_sem); - if (ret == 0) ret = dio->page_errors; if (ret == 0) @@ -254,6 +246,17 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret) if (ret == 0) ret = transferred; + if (dio->end_io && dio->result) { + dio->end_io(dio->iocb, offset, transferred, + dio->map_bh.b_private, ret, is_async); + } else if (is_async) { + aio_complete(dio->iocb, ret, 0); + } + + if (dio->flags & DIO_LOCKING) + /* lockdep: non-owner release */ + up_read_non_owner(&dio->inode->i_alloc_sem); + return ret; } @@ -277,8 +280,7 @@ static void dio_bio_end_aio(struct bio *bio, int error) spin_unlock_irqrestore(&dio->bio_lock, flags); if (remaining == 0) { - int ret = dio_complete(dio, dio->iocb->ki_pos, 0); - aio_complete(dio->iocb, ret, 0); + dio_complete(dio, dio->iocb->ki_pos, 0, true); kfree(dio); } } @@ -1126,7 +1128,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode, spin_unlock_irqrestore(&dio->bio_lock, flags); if (ret2 == 0) { - ret = dio_complete(dio, offset, ret); + ret = dio_complete(dio, offset, ret, false); kfree(dio); } else BUG_ON(ret != -EIOCBQUEUED); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 699d1d01c5df..609159e990de 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3775,7 +3775,8 @@ static ext4_io_end_t *ext4_init_io_end (struct inode *inode, gfp_t flags) } static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, - ssize_t size, void *private) + ssize_t size, void *private, int ret, + bool is_async) { ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; @@ -3784,7 +3785,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, /* if not async direct IO or dio with 0 bytes write, just return */ if (!io_end || !size) - return; + goto out; ext_debug("ext4_end_io_dio(): io_end 0x%p" "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", @@ -3795,7 +3796,7 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, if (io_end->flag != EXT4_IO_UNWRITTEN){ ext4_free_io_end(io_end); iocb->private = NULL; - return; + goto out; } io_end->offset = offset; @@ -3812,6 +3813,9 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, list_add_tail(&io_end->list, &ei->i_completed_io_list); spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); iocb->private = NULL; +out: + if (is_async) + aio_complete(iocb, ret, 0); } static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 3623ca20cc18..1d2b1f156bcf 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -609,7 +609,9 @@ bail: static void ocfs2_dio_end_io(struct kiocb *iocb, loff_t offset, ssize_t bytes, - void *private) + void *private, + int ret, + bool is_async) { struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; int level; @@ -623,6 +625,9 @@ static void ocfs2_dio_end_io(struct kiocb *iocb, if (!level) up_read(&inode->i_alloc_sem); ocfs2_rw_unlock(inode, level); + + if (is_async) + aio_complete(iocb, ret, 0); } /* diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 34640d6dbdcb..5895aaf62ace 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -1599,7 +1599,9 @@ xfs_end_io_direct( struct kiocb *iocb, loff_t offset, ssize_t size, - void *private) + void *private, + int ret, + bool is_async) { xfs_ioend_t *ioend = iocb->private; @@ -1645,6 +1647,9 @@ xfs_end_io_direct( * against double-freeing. */ iocb->private = NULL; + + if (is_async) + aio_complete(iocb, ret, 0); } STATIC ssize_t diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index 4cfc6ea87df8..9f566d92ae3a 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -37,6 +37,8 @@ typedef struct xfs_ioend { size_t io_size; /* size of the extent */ xfs_off_t io_offset; /* offset in the file */ struct work_struct io_work; /* xfsdatad work queue */ + struct kiocb *io_iocb; + int io_result; } xfs_ioend_t; extern const struct address_space_operations xfs_address_space_operations; diff --git a/include/linux/fs.h b/include/linux/fs.h index 471e1ff5079a..a0912f6075e5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -415,7 +415,8 @@ struct buffer_head; typedef int (get_block_t)(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); typedef void (dio_iodone_t)(struct kiocb *iocb, loff_t offset, - ssize_t bytes, void *private); + ssize_t bytes, void *private, int ret, + bool is_async); /* * Attribute flags. These should be or-ed together to figure out what -- cgit v1.2.3 From a51dca9cd3bb4ec5a05bfb6feabf024a5c808a37 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 2 Aug 2010 08:43:25 -0400 Subject: jbd2: Use atomic variables to avoid taking t_handle_lock in jbd2_journal_stop By using an atomic_t for t_updates and t_outstanding credits, this should allow us to not need to take transaction t_handle_lock in jbd2_journal_stop(). Signed-off-by: "Theodore Ts'o" --- fs/jbd2/checkpoint.c | 2 +- fs/jbd2/commit.c | 13 +++++----- fs/jbd2/transaction.c | 69 +++++++++++++++++++++++++++++---------------------- include/linux/jbd2.h | 8 +++--- 4 files changed, 51 insertions(+), 41 deletions(-) (limited to 'include') diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 076d1cc44f95..f8cdc02520f9 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -775,7 +775,7 @@ void __jbd2_journal_drop_transaction(journal_t *journal, transaction_t *transact J_ASSERT(transaction->t_log_list == NULL); J_ASSERT(transaction->t_checkpoint_list == NULL); J_ASSERT(transaction->t_checkpoint_io_list == NULL); - J_ASSERT(transaction->t_updates == 0); + J_ASSERT(atomic_read(&transaction->t_updates) == 0); J_ASSERT(journal->j_committing_transaction != transaction); J_ASSERT(journal->j_running_transaction != transaction); diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index af056810acb6..fbd2c564e916 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -417,12 +417,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.run.rs_locked); spin_lock(&commit_transaction->t_handle_lock); - while (commit_transaction->t_updates) { + while (atomic_read(&commit_transaction->t_updates)) { DEFINE_WAIT(wait); prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); - if (commit_transaction->t_updates) { + if (atomic_read(&commit_transaction->t_updates)) { spin_unlock(&commit_transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); schedule(); @@ -433,7 +433,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) } spin_unlock(&commit_transaction->t_handle_lock); - J_ASSERT (commit_transaction->t_outstanding_credits <= + J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <= journal->j_max_transaction_buffers); /* @@ -527,11 +527,12 @@ void jbd2_journal_commit_transaction(journal_t *journal) stats.run.rs_logging = jiffies; stats.run.rs_flushing = jbd2_time_diff(stats.run.rs_flushing, stats.run.rs_logging); - stats.run.rs_blocks = commit_transaction->t_outstanding_credits; + stats.run.rs_blocks = + atomic_read(&commit_transaction->t_outstanding_credits); stats.run.rs_blocks_logged = 0; J_ASSERT(commit_transaction->t_nr_buffers <= - commit_transaction->t_outstanding_credits); + atomic_read(&commit_transaction->t_outstanding_credits)); err = 0; descriptor = NULL; @@ -616,7 +617,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) * the free space in the log, but this counter is changed * by jbd2_journal_next_log_block() also. */ - commit_transaction->t_outstanding_credits--; + atomic_dec(&commit_transaction->t_outstanding_credits); /* Bump b_count to prevent truncate from stumbling over the shadowed buffer! @@@ This can go if we ever get diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 001e95fb0fe1..9c64c7ec48d4 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -55,6 +55,8 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) transaction->t_tid = journal->j_transaction_sequence++; transaction->t_expires = jiffies + journal->j_commit_interval; spin_lock_init(&transaction->t_handle_lock); + atomic_set(&transaction->t_updates, 0); + atomic_set(&transaction->t_outstanding_credits, 0); INIT_LIST_HEAD(&transaction->t_inode_list); INIT_LIST_HEAD(&transaction->t_private_list); @@ -177,7 +179,7 @@ repeat_locked: * checkpoint to free some more log space. */ spin_lock(&transaction->t_handle_lock); - needed = transaction->t_outstanding_credits + nblocks; + needed = atomic_read(&transaction->t_outstanding_credits) + nblocks; if (needed > journal->j_max_transaction_buffers) { /* @@ -240,11 +242,12 @@ repeat_locked: } handle->h_transaction = transaction; - transaction->t_outstanding_credits += nblocks; - transaction->t_updates++; + atomic_add(nblocks, &transaction->t_outstanding_credits); + atomic_inc(&transaction->t_updates); transaction->t_handle_count++; jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", - handle, nblocks, transaction->t_outstanding_credits, + handle, nblocks, + atomic_read(&transaction->t_outstanding_credits), __jbd2_log_space_left(journal)); spin_unlock(&transaction->t_handle_lock); spin_unlock(&journal->j_state_lock); @@ -369,7 +372,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) } spin_lock(&transaction->t_handle_lock); - wanted = transaction->t_outstanding_credits + nblocks; + wanted = atomic_read(&transaction->t_outstanding_credits) + nblocks; if (wanted > journal->j_max_transaction_buffers) { jbd_debug(3, "denied handle %p %d blocks: " @@ -384,7 +387,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) } handle->h_buffer_credits += nblocks; - transaction->t_outstanding_credits += nblocks; + atomic_add(nblocks, &transaction->t_outstanding_credits); result = 0; jbd_debug(3, "extended handle %p by %d\n", handle, nblocks); @@ -426,15 +429,14 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) * First unlink the handle from its current transaction, and start the * commit on that. */ - J_ASSERT(transaction->t_updates > 0); + J_ASSERT(atomic_read(&transaction->t_updates) > 0); J_ASSERT(journal_current_handle() == handle); spin_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); - transaction->t_outstanding_credits -= handle->h_buffer_credits; - transaction->t_updates--; - - if (!transaction->t_updates) + atomic_sub(handle->h_buffer_credits, + &transaction->t_outstanding_credits); + if (atomic_dec_and_test(&transaction->t_updates)) wake_up(&journal->j_wait_updates); spin_unlock(&transaction->t_handle_lock); @@ -481,7 +483,7 @@ void jbd2_journal_lock_updates(journal_t *journal) break; spin_lock(&transaction->t_handle_lock); - if (!transaction->t_updates) { + if (!atomic_read(&transaction->t_updates)) { spin_unlock(&transaction->t_handle_lock); break; } @@ -1258,7 +1260,8 @@ int jbd2_journal_stop(handle_t *handle) { transaction_t *transaction = handle->h_transaction; journal_t *journal = transaction->t_journal; - int err; + int err, wait_for_commit = 0; + tid_t tid; pid_t pid; J_ASSERT(journal_current_handle() == handle); @@ -1266,7 +1269,7 @@ int jbd2_journal_stop(handle_t *handle) if (is_handle_aborted(handle)) err = -EIO; else { - J_ASSERT(transaction->t_updates > 0); + J_ASSERT(atomic_read(&transaction->t_updates) > 0); err = 0; } @@ -1334,14 +1337,8 @@ int jbd2_journal_stop(handle_t *handle) if (handle->h_sync) transaction->t_synchronous_commit = 1; current->journal_info = NULL; - spin_lock(&transaction->t_handle_lock); - transaction->t_outstanding_credits -= handle->h_buffer_credits; - transaction->t_updates--; - if (!transaction->t_updates) { - wake_up(&journal->j_wait_updates); - if (journal->j_barrier_count) - wake_up(&journal->j_wait_transaction_locked); - } + atomic_sub(handle->h_buffer_credits, + &transaction->t_outstanding_credits); /* * If the handle is marked SYNC, we need to set another commit @@ -1350,15 +1347,13 @@ int jbd2_journal_stop(handle_t *handle) * transaction is too old now. */ if (handle->h_sync || - transaction->t_outstanding_credits > - journal->j_max_transaction_buffers || - time_after_eq(jiffies, transaction->t_expires)) { + (atomic_read(&transaction->t_outstanding_credits) > + journal->j_max_transaction_buffers) || + time_after_eq(jiffies, transaction->t_expires)) { /* Do this even for aborted journals: an abort still * completes the commit thread, it just doesn't write * anything to disk. */ - tid_t tid = transaction->t_tid; - spin_unlock(&transaction->t_handle_lock); jbd_debug(2, "transaction too old, requesting commit for " "handle %p\n", handle); /* This is non-blocking */ @@ -1369,11 +1364,25 @@ int jbd2_journal_stop(handle_t *handle) * to wait for the commit to complete. */ if (handle->h_sync && !(current->flags & PF_MEMALLOC)) - err = jbd2_log_wait_commit(journal, tid); - } else { - spin_unlock(&transaction->t_handle_lock); + wait_for_commit = 1; } + /* + * Once we drop t_updates, if it goes to zero the transaction + * could start commiting on us and eventually disappear. So + * once we do this, we must not dereference transaction + * pointer again. + */ + tid = transaction->t_tid; + if (atomic_dec_and_test(&transaction->t_updates)) { + wake_up(&journal->j_wait_updates); + if (journal->j_barrier_count) + wake_up(&journal->j_wait_transaction_locked); + } + + if (wait_for_commit) + err = jbd2_log_wait_commit(journal, tid); + lock_map_release(&handle->h_lockdep_map); jbd2_free_handle(handle); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 5a72bc75b273..a72ce21de0e1 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -601,13 +601,13 @@ struct transaction_s * Number of outstanding updates running on this transaction * [t_handle_lock] */ - int t_updates; + atomic_t t_updates; /* * Number of buffers reserved for use by all handles in this transaction * handle but not yet modified. [t_handle_lock] */ - int t_outstanding_credits; + atomic_t t_outstanding_credits; /* * Forward and backward links for the circular list of all transactions @@ -1258,8 +1258,8 @@ static inline int jbd_space_needed(journal_t *journal) { int nblocks = journal->j_max_transaction_buffers; if (journal->j_committing_transaction) - nblocks += journal->j_committing_transaction-> - t_outstanding_credits; + nblocks += atomic_read(&journal->j_committing_transaction-> + t_outstanding_credits); return nblocks; } -- cgit v1.2.3 From a931da6ac9331a6c80dd91c199105806f2336188 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 3 Aug 2010 21:35:12 -0400 Subject: jbd2: Change j_state_lock to be a rwlock_t Lockstat reports have shown that j_state_lock is a major source of lock contention, especially on systems with more than 4 CPU cores. So change it to be a read/write spinlock. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 4 +-- fs/ext4/super.c | 4 +-- fs/jbd2/checkpoint.c | 16 ++++----- fs/jbd2/commit.c | 26 +++++++------- fs/jbd2/journal.c | 94 +++++++++++++++++++++++++-------------------------- fs/jbd2/transaction.c | 74 +++++++++++++++++++++------------------- fs/ocfs2/journal.c | 4 +-- include/linux/jbd2.h | 2 +- 8 files changed, 114 insertions(+), 110 deletions(-) (limited to 'include') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 533b607f9cb5..ab2247d642c6 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5066,7 +5066,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) transaction_t *transaction; tid_t tid; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); if (journal->j_running_transaction) transaction = journal->j_running_transaction; else @@ -5075,7 +5075,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) tid = transaction->t_tid; else tid = journal->j_commit_sequence; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); ei->i_sync_tid = tid; ei->i_datasync_tid = tid; } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3fd65eb66ccd..81cb3fc1218e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3232,7 +3232,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_min_batch_time = sbi->s_min_batch_time; journal->j_max_batch_time = sbi->s_max_batch_time; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (test_opt(sb, BARRIER)) journal->j_flags |= JBD2_BARRIER; else @@ -3241,7 +3241,7 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; else journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } static journal_t *ext4_get_journal(struct super_block *sb, diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index f8cdc02520f9..1c23a0f4e8a3 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -118,13 +118,13 @@ static int __try_to_free_cp_buf(struct journal_head *jh) void __jbd2_log_wait_for_space(journal_t *journal) { int nblocks, space_left; - assert_spin_locked(&journal->j_state_lock); + /* assert_spin_locked(&journal->j_state_lock); */ nblocks = jbd_space_needed(journal); while (__jbd2_log_space_left(journal) < nblocks) { if (journal->j_flags & JBD2_ABORT) return; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); mutex_lock(&journal->j_checkpoint_mutex); /* @@ -138,7 +138,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) * filesystem, so abort the journal and leave a stack * trace for forensic evidence. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); nblocks = jbd_space_needed(journal); space_left = __jbd2_log_space_left(journal); @@ -149,7 +149,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) if (journal->j_committing_transaction) tid = journal->j_committing_transaction->t_tid; spin_unlock(&journal->j_list_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); if (chkpt) { jbd2_log_do_checkpoint(journal); } else if (jbd2_cleanup_journal_tail(journal) == 0) { @@ -167,7 +167,7 @@ void __jbd2_log_wait_for_space(journal_t *journal) WARN_ON(1); jbd2_journal_abort(journal, 0); } - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } else { spin_unlock(&journal->j_list_lock); } @@ -474,7 +474,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * next transaction ID we will write, and where it will * start. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); transaction = journal->j_checkpoint_transactions; if (transaction) { @@ -496,7 +496,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) /* If the oldest pinned transaction is at the tail of the log already then there's not much we can do right now. */ if (journal->j_tail_sequence == first_tid) { - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return 1; } @@ -516,7 +516,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal) journal->j_free += freed; journal->j_tail_sequence = first_tid; journal->j_tail = blocknr; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); /* * If there is an external journal, we need to make sure that diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index fbd2c564e916..67bb0a2f35e5 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -152,9 +152,9 @@ static int journal_submit_commit_record(journal_t *journal, printk(KERN_WARNING "JBD2: Disabling barriers on %s, " "not supported by device\n", journal->j_devname); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_BARRIER; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); /* And try again, without the barrier */ lock_buffer(bh); @@ -182,9 +182,9 @@ retry: printk(KERN_WARNING "JBD2: %s: disabling barries on %s - not supported " "by device\n", __func__, journal->j_devname); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_BARRIER; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); lock_buffer(bh); clear_buffer_dirty(bh); @@ -400,7 +400,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) jbd_debug(1, "JBD: starting commit of transaction %d\n", commit_transaction->t_tid); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); commit_transaction->t_state = T_LOCKED; /* @@ -424,9 +424,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) TASK_UNINTERRUPTIBLE); if (atomic_read(&commit_transaction->t_updates)) { spin_unlock(&commit_transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); schedule(); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&commit_transaction->t_handle_lock); } finish_wait(&journal->j_wait_updates, &wait); @@ -497,7 +497,7 @@ void jbd2_journal_commit_transaction(journal_t *journal) start_time = ktime_get(); commit_transaction->t_log_start = journal->j_head; wake_up(&journal->j_wait_transaction_locked); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); jbd_debug (3, "JBD: commit phase 2\n"); @@ -519,9 +519,9 @@ void jbd2_journal_commit_transaction(journal_t *journal) * transaction! Now comes the tricky part: we need to write out * metadata. Loop over the transaction's entire buffer list: */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); commit_transaction->t_state = T_COMMIT; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); trace_jbd2_commit_logging(journal, commit_transaction); stats.run.rs_logging = jiffies; @@ -978,7 +978,7 @@ restart_loop: * __jbd2_journal_drop_transaction(). Otherwise we could race with * other checkpointing code processing the transaction... */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); /* * Now recheck if some buffers did not get attached to the transaction @@ -986,7 +986,7 @@ restart_loop: */ if (commit_transaction->t_forget) { spin_unlock(&journal->j_list_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); goto restart_loop; } @@ -1038,7 +1038,7 @@ restart_loop: journal->j_average_commit_time*3) / 4; else journal->j_average_commit_time = commit_time; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); if (commit_transaction->t_checkpoint_list == NULL && commit_transaction->t_checkpoint_io_list == NULL) { diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index a79d3345b55a..e7bf0fd9cec7 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -142,7 +142,7 @@ static int kjournald2(void *arg) /* * And now, wait forever for commit wakeup events. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); loop: if (journal->j_flags & JBD2_UNMOUNT) @@ -153,10 +153,10 @@ loop: if (journal->j_commit_sequence != journal->j_commit_request) { jbd_debug(1, "OK, requests differ\n"); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); del_timer_sync(&journal->j_commit_timer); jbd2_journal_commit_transaction(journal); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); goto loop; } @@ -168,9 +168,9 @@ loop: * be already stopped. */ jbd_debug(1, "Now suspending kjournald2\n"); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); refrigerator(); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } else { /* * We assume on resume that commits are already there, @@ -190,9 +190,9 @@ loop: if (journal->j_flags & JBD2_UNMOUNT) should_sleep = 0; if (should_sleep) { - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); schedule(); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } finish_wait(&journal->j_wait_commit, &wait); } @@ -210,7 +210,7 @@ loop: goto loop; end_loop: - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); del_timer_sync(&journal->j_commit_timer); journal->j_task = NULL; wake_up(&journal->j_wait_done_commit); @@ -233,16 +233,16 @@ static int jbd2_journal_start_thread(journal_t *journal) static void journal_kill_thread(journal_t *journal) { - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_flags |= JBD2_UNMOUNT; while (journal->j_task) { wake_up(&journal->j_wait_commit); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); wait_event(journal->j_wait_done_commit, journal->j_task == NULL); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* @@ -452,7 +452,7 @@ int __jbd2_log_space_left(journal_t *journal) { int left = journal->j_free; - assert_spin_locked(&journal->j_state_lock); + /* assert_spin_locked(&journal->j_state_lock); */ /* * Be pessimistic here about the number of those free blocks which @@ -497,9 +497,9 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid) { int ret; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); ret = __jbd2_log_start_commit(journal, tid); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } @@ -518,7 +518,7 @@ int jbd2_journal_force_commit_nested(journal_t *journal) transaction_t *transaction = NULL; tid_t tid; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); if (journal->j_running_transaction && !current->journal_info) { transaction = journal->j_running_transaction; __jbd2_log_start_commit(journal, transaction->t_tid); @@ -526,12 +526,12 @@ int jbd2_journal_force_commit_nested(journal_t *journal) transaction = journal->j_committing_transaction; if (!transaction) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); return 0; /* Nothing to retry */ } tid = transaction->t_tid; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); jbd2_log_wait_commit(journal, tid); return 1; } @@ -545,7 +545,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) { int ret = 0; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (journal->j_running_transaction) { tid_t tid = journal->j_running_transaction->t_tid; @@ -564,7 +564,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) *ptid = journal->j_committing_transaction->t_tid; ret = 1; } - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } @@ -576,26 +576,24 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid) { int err = 0; + read_lock(&journal->j_state_lock); #ifdef CONFIG_JBD2_DEBUG - spin_lock(&journal->j_state_lock); if (!tid_geq(journal->j_commit_request, tid)) { printk(KERN_EMERG "%s: error: j_commit_request=%d, tid=%d\n", __func__, journal->j_commit_request, tid); } - spin_unlock(&journal->j_state_lock); #endif - spin_lock(&journal->j_state_lock); while (tid_gt(tid, journal->j_commit_sequence)) { jbd_debug(1, "JBD: want %d, j_commit_sequence=%d\n", tid, journal->j_commit_sequence); wake_up(&journal->j_wait_commit); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); wait_event(journal->j_wait_done_commit, !tid_gt(tid, journal->j_commit_sequence)); - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); } - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); if (unlikely(is_journal_aborted(journal))) { printk(KERN_EMERG "journal commit I/O error\n"); @@ -612,7 +610,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) { unsigned long blocknr; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); J_ASSERT(journal->j_free > 1); blocknr = journal->j_head; @@ -620,7 +618,7 @@ int jbd2_journal_next_log_block(journal_t *journal, unsigned long long *retp) journal->j_free--; if (journal->j_head == journal->j_last) journal->j_head = journal->j_first; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return jbd2_journal_bmap(journal, blocknr, retp); } @@ -840,7 +838,7 @@ static journal_t * journal_init_common (void) mutex_init(&journal->j_checkpoint_mutex); spin_lock_init(&journal->j_revoke_lock); spin_lock_init(&journal->j_list_lock); - spin_lock_init(&journal->j_state_lock); + rwlock_init(&journal->j_state_lock); journal->j_commit_interval = (HZ * JBD2_DEFAULT_MAX_COMMIT_AGE); journal->j_min_batch_time = 0; @@ -1106,14 +1104,14 @@ void jbd2_journal_update_superblock(journal_t *journal, int wait) set_buffer_uptodate(bh); } - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); jbd_debug(1,"JBD: updating superblock (start %ld, seq %d, errno %d)\n", journal->j_tail, journal->j_tail_sequence, journal->j_errno); sb->s_sequence = cpu_to_be32(journal->j_tail_sequence); sb->s_start = cpu_to_be32(journal->j_tail); sb->s_errno = cpu_to_be32(journal->j_errno); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); BUFFER_TRACE(bh, "marking dirty"); mark_buffer_dirty(bh); @@ -1134,12 +1132,12 @@ out: * any future commit will have to be careful to update the * superblock again to re-record the true start of the log. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (sb->s_start) journal->j_flags &= ~JBD2_FLUSHED; else journal->j_flags |= JBD2_FLUSHED; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* @@ -1551,7 +1549,7 @@ int jbd2_journal_flush(journal_t *journal) transaction_t *transaction = NULL; unsigned long old_tail; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); /* Force everything buffered to the log... */ if (journal->j_running_transaction) { @@ -1564,10 +1562,10 @@ int jbd2_journal_flush(journal_t *journal) if (transaction) { tid_t tid = transaction->t_tid; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); jbd2_log_wait_commit(journal, tid); } else { - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* ...and flush everything in the log out to disk. */ @@ -1591,12 +1589,12 @@ int jbd2_journal_flush(journal_t *journal) * the magic code for a fully-recovered superblock. Any future * commits of data to the journal will restore the current * s_start value. */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); old_tail = journal->j_tail; journal->j_tail = 0; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); jbd2_journal_update_superblock(journal, 1); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_tail = old_tail; J_ASSERT(!journal->j_running_transaction); @@ -1604,7 +1602,7 @@ int jbd2_journal_flush(journal_t *journal) J_ASSERT(!journal->j_checkpoint_transactions); J_ASSERT(journal->j_head == journal->j_tail); J_ASSERT(journal->j_tail_sequence == journal->j_transaction_sequence); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return 0; } @@ -1668,12 +1666,12 @@ void __jbd2_journal_abort_hard(journal_t *journal) printk(KERN_ERR "Aborting journal on device %s.\n", journal->j_devname); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_flags |= JBD2_ABORT; transaction = journal->j_running_transaction; if (transaction) __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } /* Soft abort: record the abort error status in the journal superblock, @@ -1758,12 +1756,12 @@ int jbd2_journal_errno(journal_t *journal) { int err; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_ABORT) err = -EROFS; else err = journal->j_errno; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); return err; } @@ -1778,12 +1776,12 @@ int jbd2_journal_clear_err(journal_t *journal) { int err = 0; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (journal->j_flags & JBD2_ABORT) err = -EROFS; else journal->j_errno = 0; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return err; } @@ -1796,10 +1794,10 @@ int jbd2_journal_clear_err(journal_t *journal) */ void jbd2_journal_ack_err(journal_t *journal) { - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); if (journal->j_errno) journal->j_flags |= JBD2_ACK_ERR; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } int jbd2_journal_blocks_per_page(struct inode *inode) diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 9c64c7ec48d4..663065142b42 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -124,36 +124,38 @@ alloc_transaction: jbd_debug(3, "New handle %p going live.\n", handle); -repeat: - /* * We need to hold j_state_lock until t_updates has been incremented, * for proper journal barrier handling */ - spin_lock(&journal->j_state_lock); -repeat_locked: +repeat: + read_lock(&journal->j_state_lock); if (is_journal_aborted(journal) || (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); kfree(new_transaction); return -EROFS; } /* Wait on the journal's transaction barrier if necessary */ if (journal->j_barrier_count) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); wait_event(journal->j_wait_transaction_locked, journal->j_barrier_count == 0); goto repeat; } if (!journal->j_running_transaction) { - if (!new_transaction) { - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); + if (!new_transaction) goto alloc_transaction; + write_lock(&journal->j_state_lock); + if (!journal->j_running_transaction) { + jbd2_get_transaction(journal, new_transaction); + new_transaction = NULL; } - jbd2_get_transaction(journal, new_transaction); - new_transaction = NULL; + write_unlock(&journal->j_state_lock); + goto repeat; } transaction = journal->j_running_transaction; @@ -167,7 +169,7 @@ repeat_locked: prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); goto repeat; @@ -194,7 +196,7 @@ repeat_locked: prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_transaction_locked, &wait); goto repeat; @@ -228,8 +230,12 @@ repeat_locked: if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); spin_unlock(&transaction->t_handle_lock); - __jbd2_log_wait_for_space(journal); - goto repeat_locked; + read_unlock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); + if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) + __jbd2_log_wait_for_space(journal); + write_unlock(&journal->j_state_lock); + goto repeat; } /* OK, account for the buffers that this operation expects to @@ -250,7 +256,7 @@ repeat_locked: atomic_read(&transaction->t_outstanding_credits), __jbd2_log_space_left(journal)); spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); lock_map_acquire(&handle->h_lockdep_map); kfree(new_transaction); @@ -362,7 +368,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) result = 1; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); /* Don't extend a locked-down transaction! */ if (handle->h_transaction->t_state != T_RUNNING) { @@ -394,7 +400,7 @@ int jbd2_journal_extend(handle_t *handle, int nblocks) unlock: spin_unlock(&transaction->t_handle_lock); error_out: - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); out: return result; } @@ -432,7 +438,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) J_ASSERT(atomic_read(&transaction->t_updates) > 0); J_ASSERT(journal_current_handle() == handle); - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); spin_lock(&transaction->t_handle_lock); atomic_sub(handle->h_buffer_credits, &transaction->t_outstanding_credits); @@ -442,7 +448,7 @@ int jbd2__journal_restart(handle_t *handle, int nblocks, int gfp_mask) jbd_debug(2, "restarting handle %p\n", handle); __jbd2_log_start_commit(journal, transaction->t_tid); - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); lock_map_release(&handle->h_lockdep_map); handle->h_buffer_credits = nblocks; @@ -472,7 +478,7 @@ void jbd2_journal_lock_updates(journal_t *journal) { DEFINE_WAIT(wait); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); ++journal->j_barrier_count; /* Wait until there are no running updates */ @@ -490,12 +496,12 @@ void jbd2_journal_lock_updates(journal_t *journal) prepare_to_wait(&journal->j_wait_updates, &wait, TASK_UNINTERRUPTIBLE); spin_unlock(&transaction->t_handle_lock); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); schedule(); finish_wait(&journal->j_wait_updates, &wait); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); } - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); /* * We have now established a barrier against other normal updates, but @@ -519,9 +525,9 @@ void jbd2_journal_unlock_updates (journal_t *journal) J_ASSERT(journal->j_barrier_count != 0); mutex_unlock(&journal->j_barrier); - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); --journal->j_barrier_count; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); wake_up(&journal->j_wait_transaction_locked); } @@ -1314,9 +1320,9 @@ int jbd2_journal_stop(handle_t *handle) journal->j_last_sync_writer = pid; - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); commit_time = journal->j_average_commit_time; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); trans_time = ktime_to_ns(ktime_sub(ktime_get(), transaction->t_start_time)); @@ -1748,7 +1754,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) goto zap_buffer_unlocked; /* OK, we have data buffer in journaled mode */ - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); jbd_lock_bh_state(bh); spin_lock(&journal->j_list_lock); @@ -1801,7 +1807,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } else { /* There is no currently-running transaction. So the @@ -1815,7 +1821,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return ret; } else { /* The orphan record's transaction has @@ -1839,7 +1845,7 @@ static int journal_unmap_buffer(journal_t *journal, struct buffer_head *bh) jbd2_journal_put_journal_head(jh); spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); return 0; } else { /* Good, the buffer belongs to the running transaction. @@ -1858,7 +1864,7 @@ zap_buffer: zap_buffer_no_jh: spin_unlock(&journal->j_list_lock); jbd_unlock_bh_state(bh); - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); zap_buffer_unlocked: clear_buffer_dirty(bh); J_ASSERT_BH(bh, !buffer_jbddirty(bh)); @@ -2165,9 +2171,9 @@ int jbd2_journal_begin_ordered_truncate(journal_t *journal, /* Locks are here just to force reading of recent values, it is * enough that the transaction was not committing before we started * a transaction adding the inode to orphan list */ - spin_lock(&journal->j_state_lock); + read_lock(&journal->j_state_lock); commit_trans = journal->j_committing_transaction; - spin_unlock(&journal->j_state_lock); + read_unlock(&journal->j_state_lock); spin_lock(&journal->j_list_lock); inode_trans = jinode->i_transaction; spin_unlock(&journal->j_list_lock); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 47878cf16418..9c1b92ebeb94 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -760,13 +760,13 @@ void ocfs2_set_journal_params(struct ocfs2_super *osb) if (osb->osb_commit_interval) commit_interval = osb->osb_commit_interval; - spin_lock(&journal->j_state_lock); + write_lock(&journal->j_state_lock); journal->j_commit_interval = commit_interval; if (osb->s_mount_opt & OCFS2_MOUNT_BARRIER) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; - spin_unlock(&journal->j_state_lock); + write_unlock(&journal->j_state_lock); } int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty) diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a72ce21de0e1..15d5743ccfbb 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -764,7 +764,7 @@ struct journal_s /* * Protect the various scalars in the journal */ - spinlock_t j_state_lock; + rwlock_t j_state_lock; /* * Number of processes waiting to create a barrier lock [j_state_lock] -- cgit v1.2.3 From 8dd420466c7bfc459fa04680bd5690bfc41a4553 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 3 Aug 2010 21:38:29 -0400 Subject: jbd2: Remove t_handle_lock from start_this_handle() This should remove the last exclusive lock from start_this_handle(), so that we should now be able to start multiple transactions at the same time on large SMP systems. Signed-off-by: "Theodore Ts'o" --- fs/jbd2/commit.c | 3 ++- fs/jbd2/transaction.c | 33 ++++++++++++++++++++++----------- include/linux/jbd2.h | 2 +- 3 files changed, 25 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 67bb0a2f35e5..f52e5e8049f1 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -1004,7 +1004,8 @@ restart_loop: * File the transaction statistics */ stats.ts_tid = commit_transaction->t_tid; - stats.run.rs_handle_count = commit_transaction->t_handle_count; + stats.run.rs_handle_count = + atomic_read(&commit_transaction->t_handle_count); trace_jbd2_run_stats(journal->j_fs_dev->bd_dev, commit_transaction->t_tid, &stats.run); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 663065142b42..0752bcda535f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -57,6 +57,7 @@ jbd2_get_transaction(journal_t *journal, transaction_t *transaction) spin_lock_init(&transaction->t_handle_lock); atomic_set(&transaction->t_updates, 0); atomic_set(&transaction->t_outstanding_credits, 0); + atomic_set(&transaction->t_handle_count, 0); INIT_LIST_HEAD(&transaction->t_inode_list); INIT_LIST_HEAD(&transaction->t_private_list); @@ -180,8 +181,8 @@ repeat: * buffers requested by this operation, we need to stall pending a log * checkpoint to free some more log space. */ - spin_lock(&transaction->t_handle_lock); - needed = atomic_read(&transaction->t_outstanding_credits) + nblocks; + needed = atomic_add_return(nblocks, + &transaction->t_outstanding_credits); if (needed > journal->j_max_transaction_buffers) { /* @@ -192,7 +193,7 @@ repeat: DEFINE_WAIT(wait); jbd_debug(2, "Handle %p starting new commit...\n", handle); - spin_unlock(&transaction->t_handle_lock); + atomic_sub(nblocks, &transaction->t_outstanding_credits); prepare_to_wait(&journal->j_wait_transaction_locked, &wait, TASK_UNINTERRUPTIBLE); __jbd2_log_start_commit(journal, transaction->t_tid); @@ -229,7 +230,7 @@ repeat: */ if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) { jbd_debug(2, "Handle %p waiting for checkpoint...\n", handle); - spin_unlock(&transaction->t_handle_lock); + atomic_sub(nblocks, &transaction->t_outstanding_credits); read_unlock(&journal->j_state_lock); write_lock(&journal->j_state_lock); if (__jbd2_log_space_left(journal) < jbd_space_needed(journal)) @@ -239,23 +240,33 @@ repeat: } /* OK, account for the buffers that this operation expects to - * use and add the handle to the running transaction. */ - - if (time_after(transaction->t_start, ts)) { + * use and add the handle to the running transaction. + * + * In order for t_max_wait to be reliable, it must be + * protected by a lock. But doing so will mean that + * start_this_handle() can not be run in parallel on SMP + * systems, which limits our scalability. So we only enable + * it when debugging is enabled. We may want to use a + * separate flag, eventually, so we can enable this + * independently of debugging. + */ +#ifdef CONFIG_JBD2_DEBUG + if (jbd2_journal_enable_debug && + time_after(transaction->t_start, ts)) { ts = jbd2_time_diff(ts, transaction->t_start); + spin_lock(&transaction->t_handle_lock); if (ts > transaction->t_max_wait) transaction->t_max_wait = ts; + spin_unlock(&transaction->t_handle_lock); } - +#endif handle->h_transaction = transaction; - atomic_add(nblocks, &transaction->t_outstanding_credits); atomic_inc(&transaction->t_updates); - transaction->t_handle_count++; + atomic_inc(&transaction->t_handle_count); jbd_debug(4, "Handle %p given %d credits (total %d, free %d)\n", handle, nblocks, atomic_read(&transaction->t_outstanding_credits), __jbd2_log_space_left(journal)); - spin_unlock(&transaction->t_handle_lock); read_unlock(&journal->j_state_lock); lock_map_acquire(&handle->h_lockdep_map); diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 15d5743ccfbb..01743b5446ff 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -629,7 +629,7 @@ struct transaction_s /* * How many handles used this transaction? [t_handle_lock] */ - int t_handle_count; + atomic_t t_handle_count; /* * This transaction is being forced and some process is -- cgit v1.2.3