From fa5d11133b07053270e18fa9c18560e66e79217e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Mon, 2 Nov 2009 18:50:49 -0500 Subject: ext4: discard preallocation when restarting a transaction during truncate When restart a transaction during a truncate operation, we drop and reacquire i_data_sem. After reacquiring i_data_sem, we need to discard any inode-based preallocation that might have been grabbed while we released i_data_sem (for example, if pdflush is allocating blocks and racing against the truncate). Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5c5bc5dafff8..d1ec698a91d1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -193,7 +193,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) * so before we call here everything must be consistently dirtied against * this transaction. */ - int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, +int ext4_truncate_restart_trans(handle_t *handle, struct inode *inode, int nblocks) { int ret; @@ -209,6 +209,7 @@ static int try_to_extend_transaction(handle_t *handle, struct inode *inode) up_write(&EXT4_I(inode)->i_data_sem); ret = ext4_journal_restart(handle, blocks_for_truncate(inode)); down_write(&EXT4_I(inode)->i_data_sem); + ext4_discard_preallocations(inode); return ret; } -- cgit v1.2.3 From 109f55651954def97fa41ee71c464d268c512ab0 Mon Sep 17 00:00:00 2001 From: Mingming Date: Tue, 10 Nov 2009 10:48:08 -0500 Subject: ext4: fix ext4_ext_direct_IO()'s return value after converting uninit extents After a direct I/O request covering an uninitalized extent (i.e., created using the fallocate system call) or a hole in a file, ext4 will convert the uninitialized extent so it is marked as initialized by calling ext4_convert_unwritten_extents(). This function returns zero on success. This return value was getting returned by ext4_direct_IO(); however the file system's direct_IO function is supposed to return the number of bytes read or written on a success. By returning zero, it confused the direct I/O code into falling back to buffered I/O unnecessarily. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 1 + fs/ext4/inode.c | 10 +++++++--- 2 files changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 10539e364283..441716f33b4a 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3519,6 +3519,7 @@ retry: * * This function is called from the direct IO end io call back * function, to convert the fallocated extents after IO is completed. + * Returns 0 on success. */ int ext4_convert_unwritten_extents(struct inode *inode, loff_t offset, loff_t len) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d1ec698a91d1..12d727f8fedf 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3772,13 +3772,17 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { ext4_free_io_end(iocb->private); iocb->private = NULL; - } else if (ret > 0) + } else if (ret > 0) { + int err; /* * for non AIO case, since the IO is already * completed, we could do the convertion right here */ - ret = ext4_convert_unwritten_extents(inode, - offset, ret); + err = ext4_convert_unwritten_extents(inode, + offset, ret); + if (err < 0) + ret = err; + } return ret; } -- cgit v1.2.3 From 5f5249507e4b5c4fc0f9c93f33d133d8c95f47e1 Mon Sep 17 00:00:00 2001 From: Mingming Date: Tue, 10 Nov 2009 10:48:04 -0500 Subject: ext4: skip conversion of uninit extents after direct IO if there isn't any At the end of direct I/O operation, ext4_ext_direct_IO() always called ext4_convert_unwritten_extents(), regardless of whether there were any unwritten extents involved in the I/O or not. This commit adds a state flag so that ext4_ext_direct_IO() only calls ext4_convert_unwritten_extents() when necessary. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/extents.c | 22 +++++++++++++++++----- fs/ext4/inode.c | 4 +++- 3 files changed, 21 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 00d153f2f261..8825515eeddd 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -322,6 +322,7 @@ static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags) #define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */ #define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */ #define EXT4_STATE_EXT_MIGRATE 0x00000020 /* Inode is migrating */ +#define EXT4_STATE_DIO_UNWRITTEN 0x00000040 /* need convert on dio done*/ /* Used to pass group descriptor data when online resize is done */ struct ext4_new_group_input { diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 441716f33b4a..e991ae2b461c 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3048,12 +3048,18 @@ ext4_ext_handle_uninitialized_extents(handle_t *handle, struct inode *inode, ret = ext4_split_unwritten_extents(handle, inode, path, iblock, max_blocks, flags); - /* flag the io_end struct that we need convert when IO done */ + /* + * Flag the inode(non aio case) or end_io struct (aio case) + * that this IO needs to convertion to written when IO is + * completed + */ if (io) io->flag = DIO_AIO_UNWRITTEN; + else + EXT4_I(inode)->i_state |= EXT4_STATE_DIO_UNWRITTEN; goto out; } - /* DIO end_io complete, convert the filled extent to written */ + /* async DIO end_io complete, convert the filled extent to written */ if (flags == EXT4_GET_BLOCKS_DIO_CONVERT_EXT) { ret = ext4_convert_unwritten_extents_dio(handle, inode, path); @@ -3295,10 +3301,16 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * To avoid unecessary convertion for every aio dio rewrite * to the mid of file, here we flag the IO that is really * need the convertion. - * + * For non asycn direct IO case, flag the inode state + * that we need to perform convertion when IO is done. */ - if (io && flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) - io->flag = DIO_AIO_UNWRITTEN; + if (flags == EXT4_GET_BLOCKS_DIO_CREATE_EXT) { + if (io) + io->flag = DIO_AIO_UNWRITTEN; + else + EXT4_I(inode)->i_state |= + EXT4_STATE_DIO_UNWRITTEN;; + } } err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); if (err) { diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 12d727f8fedf..a9ed2bce74d1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3772,7 +3772,8 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, if (ret != -EIOCBQUEUED && ret <= 0 && iocb->private) { ext4_free_io_end(iocb->private); iocb->private = NULL; - } else if (ret > 0) { + } else if (ret > 0 && (EXT4_I(inode)->i_state & + EXT4_STATE_DIO_UNWRITTEN)) { int err; /* * for non AIO case, since the IO is already @@ -3782,6 +3783,7 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, offset, ret); if (err < 0) ret = err; + EXT4_I(inode)->i_state &= ~EXT4_STATE_DIO_UNWRITTEN; } return ret; } -- cgit v1.2.3 From 4b70df181611012a3556f017b57dfcef7e1d279f Mon Sep 17 00:00:00 2001 From: Mingming Date: Tue, 3 Nov 2009 14:44:54 -0500 Subject: ext4: code clean up for dio fallocate handling The ext4_debug() call in ext4_end_io_dio() should be moved after the check to make sure that io_end is non-NULL. The comment above ext4_get_block_dio_write() ("Maximum number of blocks...") is a duplicate; the original and correct comment is above the #define DIO_MAX_BLOCKS up above. Based on review comments from Curt Wohlgemuth. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a9ed2bce74d1..2c8caa51addb 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3446,8 +3446,6 @@ out: return ret; } -/* Maximum number of blocks we map for direct IO at once. */ - static int ext4_get_block_dio_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -3655,13 +3653,14 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ext4_io_end_t *io_end = iocb->private; struct workqueue_struct *wq; + /* if not async direct IO or dio with 0 bytes write, just return */ + if (!io_end || !size) + return; + ext_debug("ext4_end_io_dio(): io_end 0x%p" "for inode %lu, iocb 0x%p, offset %llu, size %llu\n", iocb->private, io_end->inode->i_ino, iocb, offset, size); - /* if not async direct IO or dio with 0 bytes write, just return */ - if (!io_end || !size) - return; /* if not aio dio with unwritten extents, just free io and return */ if (io_end->flag != DIO_AIO_UNWRITTEN){ -- cgit v1.2.3 From ba230c3f6dc88ec008806adb27b12088486d508e Mon Sep 17 00:00:00 2001 From: Mingming Date: Fri, 6 Nov 2009 04:01:23 -0500 Subject: ext4: Fix return value of ext4_split_unwritten_extents() to fix direct I/O To prepare for a direct I/O write, we need to split the unwritten extents before submitting the I/O. When no extents needed to be split, ext4_split_unwritten_extents() was incorrectly returning 0 instead of the size of uninitialized extents. This bug caused the wrong return value sent back to VFS code when it gets called from async IO path, leading to an unnecessary fall back to buffered IO. This bug also hid the fact that the check to see whether or not a split would be necessary was incorrect; we can only skip splitting the extent if the write completely covers the uninitialized extent. Signed-off-by: Mingming Cao Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e991ae2b461c..715264b4bae4 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2807,6 +2807,8 @@ fix_extent_len: * into three uninitialized extent(at most). After IO complete, the part * being filled will be convert to initialized by the end_io callback function * via ext4_convert_unwritten_extents(). + * + * Returns the size of uninitialized extent to be written on success. */ static int ext4_split_unwritten_extents(handle_t *handle, struct inode *inode, @@ -2824,7 +2826,6 @@ static int ext4_split_unwritten_extents(handle_t *handle, unsigned int allocated, ee_len, depth; ext4_fsblk_t newblock; int err = 0; - int ret = 0; ext_debug("ext4_split_unwritten_extents: inode %lu," "iblock %llu, max_blocks %u\n", inode->i_ino, @@ -2842,12 +2843,12 @@ static int ext4_split_unwritten_extents(handle_t *handle, ext4_ext_store_pblock(&orig_ex, ext_pblock(ex)); /* - * if the entire unintialized extent length less than - * the size of extent to write, there is no need to split - * uninitialized extent + * If the uninitialized extent begins at the same logical + * block where the write begins, and the write completely + * covers the extent, then we don't need to split it. */ - if (allocated <= max_blocks) - return ret; + if ((iblock == ee_block) && (allocated <= max_blocks)) + return allocated; err = ext4_ext_get_access(handle, inode, path + depth); if (err) -- cgit v1.2.3 From 1e424a348303694fabdf8b1efbfcb1a892dfa63a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 8 Nov 2009 15:45:44 -0500 Subject: ext4: partial revert to fix double brelse WARNING() This is a partial revert of commit 6487a9d (only the changes made to fs/ext4/namei.c), since it is causing the following brelse() double-free warning when running fsstress on a file system with 1k blocksize and we run into a block allocation failure while converting a single-block directory to a multi-block hash-tree indexed directory. WARNING: at fs/buffer.c:1197 __brelse+0x2e/0x33() Hardware name: VFS: brelse: Trying to free free buffer Modules linked in: Pid: 2226, comm: jbd2/sdd-8 Not tainted 2.6.32-rc6-00577-g0003f55 #101 Call Trace: [] warn_slowpath_common+0x65/0x95 [] warn_slowpath_fmt+0x29/0x2c [] __brelse+0x2e/0x33 [] jbd2_journal_refile_buffer+0x67/0x6c [] jbd2_journal_commit_transaction+0x319/0x14d8 [] ? try_to_del_timer_sync+0x58/0x60 [] ? sched_clock_cpu+0x12a/0x13e [] ? trace_hardirqs_off+0xb/0xd [] ? cpu_clock+0x3f/0x5b [] ? lock_release_holdtime+0x36/0x137 [] ? _spin_unlock_irqrestore+0x44/0x51 [] ? trace_hardirqs_on_caller+0x103/0x124 [] ? trace_hardirqs_on+0xb/0xd [] ? try_to_del_timer_sync+0x58/0x60 [] kjournald2+0x11a/0x310 [] ? autoremove_wake_function+0x0/0x38 [] ? kjournald2+0x0/0x310 [] kthread+0x66/0x6b [] ? kthread+0x0/0x6b [] kernel_thread_helper+0x7/0x10 ---[ end trace 5579351b86af61e3 ]--- Commit 6487a9d was an attempt some buffer head leaks in an ENOSPC error path, but in some cases it actually results in an excess ENOSPC, as shown above. Fixing this means cleaning up who is responsible for releasing the buffer heads from the callee to the caller of add_dirent_to_buf(). Since that's a relatively complex change, and we're late in the rcX development cycle, I'm reverting this now, and holding back a more complete fix until after 2.6.32 ships. We've lived with this buffer_head leak on ENOSPC in ext3 and ext4 for a very long time; a few more months won't kill us. Signed-off-by: "Theodore Ts'o" Cc: Curt Wohlgemuth --- fs/ext4/namei.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 7c8fe80bacdd..6d2c1b897fc7 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1518,12 +1518,8 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; if (blocks == 1 && !dx_fallback && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) { - retval = make_indexed_dir(handle, dentry, inode, bh); - if (retval == -ENOSPC) - brelse(bh); - return retval; - } + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_DIR_INDEX)) + return make_indexed_dir(handle, dentry, inode, bh); brelse(bh); } bh = ext4_append(handle, dir, &block, &retval); @@ -1532,10 +1528,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, de = (struct ext4_dir_entry_2 *) bh->b_data; de->inode = 0; de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize); - retval = add_dirent_to_buf(handle, dentry, inode, de, bh); - if (retval == -ENOSPC) - brelse(bh); - return retval; + return add_dirent_to_buf(handle, dentry, inode, de, bh); } /* @@ -1664,8 +1657,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, if (!de) goto cleanup; err = add_dirent_to_buf(handle, dentry, inode, de, bh); - if (err != -ENOSPC) - bh = NULL; + bh = NULL; goto cleanup; journal_error: -- cgit v1.2.3