diff options
author | Jan Kara <jack@suse.cz> | 2021-08-16 11:57:06 +0200 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2021-08-31 05:36:51 +0200 |
commit | 02f310fcf47fa9311d6ba2946a8d19e7d7d11f37 (patch) | |
tree | 49d90610b63a045fcbe0141cfbc3089cca958f4a /fs/ext4/super.c | |
parent | ext4: Move orphan inode handling into a separate file (diff) | |
download | linux-02f310fcf47fa9311d6ba2946a8d19e7d7d11f37.tar.xz linux-02f310fcf47fa9311d6ba2946a8d19e7d7d11f37.zip |
ext4: Speedup ext4 orphan inode handling
Ext4 orphan inode handling is a bottleneck for workloads which heavily
truncate / unlink small files since it contends on the global
s_orphan_mutex lock (and generally it's difficult to improve scalability
of the ondisk linked list of orphaned inodes).
This patch implements a new way of handling orphan inodes. Instead of
linking an orphaned inode into a linked list, we store its inode number in
a new special file which we call the "orphan file". Only if there's no more
space in the orphan file (too many inodes are currently orphaned) do we
fall back to using the old-style linked list. Currently we protect
operations in the orphan file with a spinlock for simplicity but even in
this setting we can substantially reduce the length of the critical
section and thus speed up some workloads. In the next patch we improve
this by making orphan handling lockless.
Note that the change is backwards compatible when the filesystem is
clean - the existence of the orphan file is a compat feature; we set
another ro-compat feature indicating that the orphan file needs scanning
for orphaned inodes when mounting the filesystem read-write. This
ro-compat feature gets cleared on unmount / remount read-only.
Some performance data from 80 CPU Xeon Server with 512 GB of RAM,
filesystem located on SSD, average of 5 runs:
stress-orphan (microbenchmark truncating files byte-by-byte from N
processes in parallel)
Threads Time(Vanilla) Time(Patched)
1 1.057200 0.945600
2 1.680400 1.331800
4 2.547000 1.995000
8 7.049400 6.424200
16 14.827800 14.937600
32 40.948200 33.038200
64 87.787400 60.823600
128 206.504000 122.941400
So we can see significant wins across the board.
Reviewed-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Jan Kara <jack@suse.cz>
Link: https://lore.kernel.org/r/20210816095713.16537-3-jack@suse.cz
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Diffstat (limited to '')
-rw-r--r-- | fs/ext4/super.c | 34 |
1 files changed, 29 insertions, 5 deletions
diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e5efb8403f50..feca816b6bf3 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1174,6 +1174,7 @@ static void ext4_put_super(struct super_block *sb) flush_work(&sbi->s_error_work); destroy_workqueue(sbi->rsv_conversion_wq); + ext4_release_orphan_info(sb); /* * Unregister sysfs before destroying jbd2 journal. @@ -1199,6 +1200,7 @@ static void ext4_put_super(struct super_block *sb) if (!sb_rdonly(sb) && !aborted) { ext4_clear_feature_journal_needs_recovery(sb); + ext4_clear_feature_orphan_present(sb); es->s_state = cpu_to_le16(sbi->s_mount_state); } if (!sb_rdonly(sb)) @@ -2684,8 +2686,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT); le16_add_cpu(&es->s_mnt_count, 1); ext4_update_tstamp(es, s_mtime); - if (sbi->s_journal) + if (sbi->s_journal) { ext4_set_feature_journal_needs_recovery(sb); + if (ext4_has_feature_orphan_file(sb)) + ext4_set_feature_orphan_present(sb); + } err = ext4_commit_super(sb); done: @@ -3960,6 +3965,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) silent = 1; goto cantfind_ext4; } + ext4_setup_csum_trigger(sb, EXT4_JTR_ORPHAN_FILE, + ext4_orphan_file_block_trigger); /* Load the checksum driver */ sbi->s_chksum_driver = crypto_alloc_shash("crc32c", 0, 0); @@ -4624,6 +4631,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_root = NULL; needs_recovery = (es->s_last_orphan != 0 || + ext4_has_feature_orphan_present(sb) || ext4_has_feature_journal_needs_recovery(sb)); if (ext4_has_feature_mmp(sb) && !sb_rdonly(sb)) @@ -4922,12 +4930,15 @@ no_journal: if (err) goto failed_mount7; + err = ext4_init_orphan_info(sb); + if (err) + goto failed_mount8; #ifdef CONFIG_QUOTA /* Enable quota usage during mount. 
*/ if (ext4_has_feature_quota(sb) && !sb_rdonly(sb)) { err = ext4_enable_quotas(sb); if (err) - goto failed_mount8; + goto failed_mount9; } #endif /* CONFIG_QUOTA */ @@ -4946,7 +4957,7 @@ no_journal: ext4_msg(sb, KERN_INFO, "recovery complete"); err = ext4_mark_recovery_complete(sb, es); if (err) - goto failed_mount8; + goto failed_mount9; } if (EXT4_SB(sb)->s_journal) { if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) @@ -4992,6 +5003,8 @@ cantfind_ext4: ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem"); goto failed_mount; +failed_mount9: + ext4_release_orphan_info(sb); failed_mount8: ext4_unregister_sysfs(sb); kobject_put(&sbi->s_kobj); @@ -5502,8 +5515,15 @@ static int ext4_mark_recovery_complete(struct super_block *sb, if (err < 0) goto out; - if (ext4_has_feature_journal_needs_recovery(sb) && sb_rdonly(sb)) { + if (sb_rdonly(sb) && (ext4_has_feature_journal_needs_recovery(sb) || + ext4_has_feature_orphan_present(sb))) { + if (!ext4_orphan_file_empty(sb)) { + ext4_error(sb, "Orphan file not empty on read-only fs."); + err = -EFSCORRUPTED; + goto out; + } ext4_clear_feature_journal_needs_recovery(sb); + ext4_clear_feature_orphan_present(sb); ext4_commit_super(sb); } out: @@ -5646,6 +5666,8 @@ static int ext4_freeze(struct super_block *sb) /* Journal blocked and flushed, clear needs_recovery flag. */ ext4_clear_feature_journal_needs_recovery(sb); + if (ext4_orphan_file_empty(sb)) + ext4_clear_feature_orphan_present(sb); } error = ext4_commit_super(sb); @@ -5668,6 +5690,8 @@ static int ext4_unfreeze(struct super_block *sb) if (EXT4_SB(sb)->s_journal) { /* Reset the needs_recovery flag before the fs is unlocked. */ ext4_set_feature_journal_needs_recovery(sb); + if (ext4_has_feature_orphan_file(sb)) + ext4_set_feature_orphan_present(sb); } ext4_commit_super(sb); @@ -5871,7 +5895,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) * around from a previously readonly bdev mount, * require a full umount/remount for now. 
*/ - if (es->s_last_orphan) { + if (es->s_last_orphan || !ext4_orphan_file_empty(sb)) { ext4_msg(sb, KERN_WARNING, "Couldn't " "remount RDWR because of unprocessed " "orphan inode list. Please " |