From eb8ab4443aec5ffe923a471b337568a8158cd32b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:49 +0200 Subject: ext4: make ext4_forced_shutdown() take struct super_block Currently ext4_forced_shutdown() takes struct ext4_sb_info but most callers need to get it from struct super_block anyway. So just pass in struct super_block to save all callers from some boilerplate code. No functional changes. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-3-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 43775a6ca505..c6fa59e57f1e 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1114,7 +1114,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; unsigned from, to; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; trace_ext4_write_begin(inode, pos, len); @@ -2213,7 +2213,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, if (err < 0) { struct super_block *sb = inode->i_sb; - if (ext4_forced_shutdown(EXT4_SB(sb)) || + if (ext4_forced_shutdown(sb) || ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) goto invalidate_dirty_pages; /* @@ -2540,7 +2540,7 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * *never* be called, so if that ever happens, we would want * the stack trace. */ - if (unlikely(ext4_forced_shutdown(EXT4_SB(mapping->host->i_sb)) || + if (unlikely(ext4_forced_shutdown(mapping->host->i_sb) || ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) { ret = -EROFS; goto out_writepages; @@ -2759,7 +2759,7 @@ static int ext4_writepages(struct address_space *mapping, int ret; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(EXT4_SB(sb)))) + if (unlikely(ext4_forced_shutdown(sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(sb); @@ -2798,16 +2798,16 @@ static int ext4_dax_writepages(struct address_space *mapping, int ret; long nr_to_write = wbc->nr_to_write; struct inode *inode = mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); int alloc_ctx; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); - ret = dax_writeback_mapping_range(mapping, sbi->s_daxdev, wbc); + ret = dax_writeback_mapping_range(mapping, + EXT4_SB(inode->i_sb)->s_daxdev, wbc); trace_ext4_writepages_result(inode, wbc, ret, nr_to_write - wbc->nr_to_write); ext4_writepages_up_read(inode->i_sb, alloc_ctx); @@ -2857,7 +2857,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, pgoff_t index; struct inode *inode = mapping->host; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; index = pos >> PAGE_SHIFT; @@ -5135,7 +5135,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) sb_rdonly(inode->i_sb)) return 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (EXT4_SB(inode->i_sb)->s_journal) { @@ -5255,7 +5255,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; if (unlikely(IS_IMMUTABLE(inode))) @@ -5676,7 +5676,7 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) { + if (unlikely(ext4_forced_shutdown(inode->i_sb))) { put_bh(iloc->bh); return -EIO; } @@ -5702,7 +5702,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, { int err; - if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) + if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; err = ext4_get_inode_loc(inode, iloc); -- cgit v1.2.3 From 95257987a6387f02970eda707e55a06cce734e18 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:51 +0200 Subject: ext4: drop EXT4_MF_FS_ABORTED flag EXT4_MF_FS_ABORTED flag has practically the same intent as EXT4_FLAGS_SHUTDOWN flag. The shutdown flag is checked in many more places than the aborted flag which is mostly the historical artifact where we were relying on SB_RDONLY checks instead of the aborted flag checks. There are only three places - ext4_sync_file(), __ext4_remount(), and mballoc debug code - which check aborted flag and not shutdown flag and this is arguably a bug. Avoid these inconsistencies by removing EXT4_MF_FS_ABORTED flag and using EXT4_FLAGS_SHUTDOWN everywhere. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-5-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/ext4.h | 1 - fs/ext4/fsync.c | 7 +++---- fs/ext4/inode.c | 8 +++----- fs/ext4/mballoc.c | 4 ++-- fs/ext4/super.c | 4 ++-- 5 files changed, 10 insertions(+), 14 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 907829007f3f..89d76d50af4c 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1799,7 +1799,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) */ enum { EXT4_MF_MNTDIR_SAMPLED, - EXT4_MF_FS_ABORTED, /* Fatal error detected */ EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ }; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index bffc1d0994f5..b40d3b29f7e5 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -131,7 +131,6 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) int ret = 0, err; bool needs_barrier = false; struct inode *inode = file->f_mapping->host; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); if (unlikely(ext4_forced_shutdown(inode->i_sb))) return -EIO; @@ -141,14 +140,14 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_ext4_sync_file_enter(file, datasync); if (sb_rdonly(inode->i_sb)) { - /* Make sure that we read updated s_mount_flags value */ + /* Make sure that we read updated s_ext4_flags value */ smp_rmb(); - if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(inode->i_sb)) ret = -EROFS; goto out; } - if (!sbi->s_journal) { + if (!EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fsync_nojournal(file, start, end, datasync, &needs_barrier); if (needs_barrier) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c6fa59e57f1e..100c3ec6da6c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2213,8 +2213,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, if (err < 0) { struct super_block *sb = inode->i_sb; - if (ext4_forced_shutdown(sb) || - ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) goto invalidate_dirty_pages; /* * Let the uper layers retry transient errors. @@ -2534,14 +2533,13 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * If the filesystem has aborted, it is read-only, so return * right away instead of dumping stack traces later on that * will obscure the real source of the problem. We test - * EXT4_MF_FS_ABORTED instead of sb->s_flag's SB_RDONLY because + * fs shutdown state instead of sb->s_flag's SB_RDONLY because * the latter could be true if the filesystem is mounted * read-only, and in that case, ext4_writepages should * *never* be called, so if that ever happens, we would want * the stack trace. */ - if (unlikely(ext4_forced_shutdown(mapping->host->i_sb) || - ext4_test_mount_flag(inode->i_sb, EXT4_MF_FS_ABORTED))) { + if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) { ret = -EROFS; goto out_writepages; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 78a4a24e2f57..1dc63e329e64 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -5664,7 +5664,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) return; ngroups = ext4_get_groups_count(sb); @@ -5698,7 +5698,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) + if (ext4_forced_shutdown(sb)) return; mb_debug(sb, "Can't allocate:" diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f84142907cd5..20a8e64da4ac 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -657,7 +657,7 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, WARN_ON_ONCE(1); if (!continue_fs && !sb_rdonly(sb)) { - ext4_set_mount_flag(sb, EXT4_MF_FS_ABORTED); + set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); if (journal) jbd2_journal_abort(journal, -EIO); } @@ -6502,7 +6502,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) flush_work(&sbi->s_error_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { - if (ext4_test_mount_flag(sb, EXT4_MF_FS_ABORTED)) { + if (ext4_forced_shutdown(sb)) { err = -EROFS; goto restore_opts; } -- cgit v1.2.3 From f1128084b40e520bea8bb32b3ff4d03745ab7e64 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 16 Jun 2023 18:50:55 +0200 Subject: ext4: drop read-only check in ext4_write_inode() We should not have dirty inodes on read-only filesystem. Also silently bailing without writing anything would be a problem when we enable quotas during remount while the filesystem is read-only. So drop the read-only check. Signed-off-by: Jan Kara Link: https://lore.kernel.org/r/20230616165109.21695-9-jack@suse.cz Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 100c3ec6da6c..1b9003840bc1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -5129,8 +5129,7 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) { int err; - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC) || - sb_rdonly(inode->i_sb)) + if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; if (unlikely(ext4_forced_shutdown(inode->i_sb))) -- cgit v1.2.3 From 03de20bed203b0819225d4de98353c1f8755a1dd Mon Sep 17 00:00:00 2001 From: Liu Song Date: Thu, 10 Aug 2023 23:43:33 +0800 Subject: ext4: do not mark inode dirty every time when appending using delalloc In the delalloc append write scenario, if inode's i_size is extended due to buffer write, there are delalloc writes pending in the range up to i_size, and no need to touch i_disksize since writeback will push i_disksize up to i_size eventually. Offers significant performance improvement in high-frequency append write scenarios. I conducted tests in my 32-core environment by launching 32 concurrent threads to append write to the same file. Each write operation had a length of 1024 bytes and was repeated 100000 times. Without using this patch, the test was completed in 7705 ms. However, with this patch, the test was completed in 5066 ms, resulting in a performance improvement of 34%. Moreover, in test scenarios of Kafka version 2.6.2, using packet size of 2K, with this patch resulted in a 10% performance improvement. Signed-off-by: Liu Song Suggested-by: Jan Kara Reviewed-by: Jan Kara Link: https://lore.kernel.org/r/20230810154333.84921-1-liusong@linux.alibaba.com Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 88 ++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 26 deletions(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1b9003840bc1..c5d8f8933c8c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2935,14 +2935,73 @@ static int ext4_da_should_update_i_disksize(struct folio *folio, return 1; } +static int ext4_da_do_write_end(struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page) +{ + struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; + bool disksize_changed = false; + loff_t new_i_size; + + /* + * block_write_end() will mark the inode as dirty with I_DIRTY_PAGES + * flag, which all that's needed to trigger page writeback. + */ + copied = block_write_end(NULL, mapping, pos, len, copied, page, NULL); + new_i_size = pos + copied; + + /* + * It's important to update i_size while still holding page lock, + * because page writeout could otherwise come in and zero beyond + * i_size. + * + * Since we are holding inode lock, we are sure i_disksize <= + * i_size. We also know that if i_disksize < i_size, there are + * delalloc writes pending in the range up to i_size. If the end of + * the current write is <= i_size, there's no need to touch + * i_disksize since writeback will push i_disksize up to i_size + * eventually. If the end of the current write is > i_size and + * inside an allocated block which ext4_da_should_update_i_disksize() + * checked, we need to update i_disksize here as certain + * ext4_writepages() paths not allocating blocks and update i_disksize. + */ + if (new_i_size > inode->i_size) { + unsigned long end; + + i_size_write(inode, new_i_size); + end = (new_i_size - 1) & (PAGE_SIZE - 1); + if (copied && ext4_da_should_update_i_disksize(page_folio(page), end)) { + ext4_update_i_disksize(inode, new_i_size); + disksize_changed = true; + } + } + + unlock_page(page); + put_page(page); + + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); + + if (disksize_changed) { + handle_t *handle; + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + } + + return copied; +} + static int ext4_da_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, struct page *page, void *fsdata) { struct inode *inode = mapping->host; - loff_t new_i_size; - unsigned long start, end; int write_mode = (int)(unsigned long)fsdata; struct folio *folio = page_folio(page); @@ -2961,30 +3020,7 @@ static int ext4_da_write_end(struct file *file, if (unlikely(copied < len) && !PageUptodate(page)) copied = 0; - start = pos & (PAGE_SIZE - 1); - end = start + copied - 1; - - /* - * Since we are holding inode lock, we are sure i_disksize <= - * i_size. We also know that if i_disksize < i_size, there are - * delalloc writes pending in the range upto i_size. If the end of - * the current write is <= i_size, there's no need to touch - * i_disksize since writeback will push i_disksize upto i_size - * eventually. If the end of the current write is > i_size and - * inside an allocated block (ext4_da_should_update_i_disksize() - * check), we need to update i_disksize here as certain - * ext4_writepages() paths not allocating blocks update i_disksize. - * - * Note that we defer inode dirtying to generic_write_end() / - * ext4_da_write_inline_data_end(). - */ - new_i_size = pos + copied; - if (copied && new_i_size > inode->i_size && - ext4_da_should_update_i_disksize(folio, end)) - ext4_update_i_disksize(inode, new_i_size); - - return generic_write_end(file, mapping, pos, len, copied, &folio->page, - fsdata); + return ext4_da_do_write_end(mapping, pos, len, copied, &folio->page); } /* -- cgit v1.2.3 From 8216776ccff6fcd40e3fdaa109aa4150ebe760b3 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 14 Aug 2023 11:29:01 -0700 Subject: ext4: reject casefold inode flag without casefold feature It is invalid for the casefold inode flag to be set without the casefold superblock feature flag also being set. e2fsck already considers this case to be invalid and handles it by offering to clear the casefold flag on the inode. __ext4_iget() also already considered this to be invalid, sort of, but it only got so far as logging an error message; it didn't actually reject the inode. Make it reject the inode so that other code doesn't have to handle this case. This matches what f2fs does. Note: we could check 's_encoding != NULL' instead of ext4_has_feature_casefold(). This would make the check robust against the casefold feature being enabled by userspace writing to the page cache of the mounted block device. However, it's unsolvable in general for filesystems to be robust against concurrent writes to the page cache of the mounted block device. Though this very particular scenario involving the casefold feature is solvable, we should not pretend that we can support this model, so let's just check the casefold feature. tune2fs already forbids enabling casefold on a mounted filesystem. Signed-off-by: Eric Biggers Link: https://lore.kernel.org/r/20230814182903.37267-2-ebiggers@kernel.org Signed-off-by: Theodore Ts'o --- fs/ext4/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs/ext4/inode.c') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index c5d8f8933c8c..6c490f05e2ba 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4974,9 +4974,12 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, "iget: bogus i_mode (%o)", inode->i_mode); goto bad_inode; } - if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) + if (IS_CASEFOLDED(inode) && !ext4_has_feature_casefold(inode->i_sb)) { ext4_error_inode(inode, function, line, 0, "casefold flag without casefold feature"); + ret = -EFSCORRUPTED; + goto bad_inode; + } if ((err_str = check_igot_inode(inode, flags)) != NULL) { ext4_error_inode(inode, function, line, 0, err_str); ret = -EFSCORRUPTED; -- cgit v1.2.3