summaryrefslogtreecommitdiffstats
path: root/fs/ext4
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2023-06-29 22:18:36 +0200
committerLinus Torvalds <torvalds@linux-foundation.org>2023-06-29 22:18:36 +0200
commit53ea167b212f675e40420498e46fa31553b406ac (patch)
treeab4be1f5f439c563eac041cf5902cbf184158db8 /fs/ext4
parentMerge tag 'jfs-6.5' of github.com:kleikamp/linux-shaggy (diff)
parentext4: avoid updating the superblock on a r/o mount if not needed (diff)
downloadlinux-53ea167b212f675e40420498e46fa31553b406ac.tar.xz
linux-53ea167b212f675e40420498e46fa31553b406ac.zip
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o: "Various cleanups and bug fixes in ext4's extent status tree, journalling, and block allocator subsystems. Also improve performance for parallel DIO overwrites" * tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (55 commits) ext4: avoid updating the superblock on a r/o mount if not needed jbd2: skip reading super block if it has been verified ext4: fix to check return value of freeze_bdev() in ext4_shutdown() ext4: refactoring to use the unified helper ext4_quotas_off() ext4: turn quotas off if mount failed after enabling quotas ext4: update doc about journal superblock description ext4: add journal cycled recording support jbd2: continue to record log between each mount jbd2: remove j_format_version jbd2: factor out journal initialization from journal_get_superblock() jbd2: switch to check format version in superblock directly jbd2: remove unused feature macros ext4: ext4_put_super: Remove redundant checking for 'sbi->s_journal_bdev' ext4: Fix reusing stale buffer heads from last failed mounting ext4: allow concurrent unaligned dio overwrites ext4: clean up mballoc criteria comments ext4: make ext4_zeroout_es() return void ext4: make ext4_es_insert_extent() return void ext4: make ext4_es_insert_delayed_block() return void ext4: make ext4_es_remove_extent() return void ...
Diffstat (limited to 'fs/ext4')
-rw-r--r--fs/ext4/ext4.h89
-rw-r--r--fs/ext4/extents.c49
-rw-r--r--fs/ext4/extents_status.c207
-rw-r--r--fs/ext4/extents_status.h14
-rw-r--r--fs/ext4/file.c86
-rw-r--r--fs/ext4/indirect.c8
-rw-r--r--fs/ext4/inline.c39
-rw-r--r--fs/ext4/inode.c114
-rw-r--r--fs/ext4/ioctl.c5
-rw-r--r--fs/ext4/mballoc.c644
-rw-r--r--fs/ext4/mballoc.h16
-rw-r--r--fs/ext4/readpage.c2
-rw-r--r--fs/ext4/super.c86
-rw-r--r--fs/ext4/sysfs.c2
14 files changed, 827 insertions, 534 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 02fa8a64dc3f..0a2d55faa095 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -128,6 +128,58 @@ enum SHIFT_DIRECTION {
};
/*
+ * For each criteria, mballoc has slightly different way of finding
+ * the required blocks nad usually, higher the criteria the slower the
+ * allocation. We start at lower criterias and keep falling back to
+ * higher ones if we are not able to find any blocks. Lower (earlier)
+ * criteria are faster.
+ */
+enum criteria {
+ /*
+ * Used when number of blocks needed is a power of 2. This
+ * doesn't trigger any disk IO except prefetch and is the
+ * fastest criteria.
+ */
+ CR_POWER2_ALIGNED,
+
+ /*
+ * Tries to lookup in-memory data structures to find the most
+ * suitable group that satisfies goal request. No disk IO
+ * except block prefetch.
+ */
+ CR_GOAL_LEN_FAST,
+
+ /*
+ * Same as CR_GOAL_LEN_FAST but is allowed to reduce the goal
+ * length to the best available length for faster allocation.
+ */
+ CR_BEST_AVAIL_LEN,
+
+ /*
+ * Reads each block group sequentially, performing disk IO if
+ * necessary, to find find_suitable block group. Tries to
+ * allocate goal length but might trim the request if nothing
+ * is found after enough tries.
+ */
+ CR_GOAL_LEN_SLOW,
+
+ /*
+ * Finds the first free set of blocks and allocates
+ * those. This is only used in rare cases when
+ * CR_GOAL_LEN_SLOW also fails to allocate anything.
+ */
+ CR_ANY_FREE,
+
+ /*
+ * Number of criterias defined.
+ */
+ EXT4_MB_NUM_CRS
+};
+
+/* criteria below which we use fast block scanning and avoid unnecessary IO */
+#define CR_FAST CR_GOAL_LEN_SLOW
+
+/*
* Flags used in mballoc's allocation_context flags field.
*
* Also used to show what's going on for debugging purposes when the
@@ -165,9 +217,12 @@ enum SHIFT_DIRECTION {
/* Do strict check for free blocks while retrying block allocation */
#define EXT4_MB_STRICT_CHECK 0x4000
/* Large fragment size list lookup succeeded at least once for cr = 0 */
-#define EXT4_MB_CR0_OPTIMIZED 0x8000
+#define EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED 0x8000
/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */
-#define EXT4_MB_CR1_OPTIMIZED 0x00010000
+#define EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED 0x00010000
+/* Avg fragment size rb tree lookup succeeded at least once for cr = 1.5 */
+#define EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED 0x00020000
+
struct ext4_allocation_request {
/* target inode for block we're allocating */
struct inode *inode;
@@ -1532,21 +1587,25 @@ struct ext4_sb_info {
unsigned long s_mb_last_start;
unsigned int s_mb_prefetch;
unsigned int s_mb_prefetch_limit;
+ unsigned int s_mb_best_avail_max_trim_order;
/* stats for buddy allocator */
atomic_t s_bal_reqs; /* number of reqs with len > 1 */
atomic_t s_bal_success; /* we found long enough chunks */
atomic_t s_bal_allocated; /* in blocks */
atomic_t s_bal_ex_scanned; /* total extents scanned */
+ atomic_t s_bal_cX_ex_scanned[EXT4_MB_NUM_CRS]; /* total extents scanned */
atomic_t s_bal_groups_scanned; /* number of groups scanned */
atomic_t s_bal_goals; /* goal hits */
+ atomic_t s_bal_len_goals; /* len goal hits */
atomic_t s_bal_breaks; /* too long searches */
atomic_t s_bal_2orders; /* 2^order hits */
- atomic_t s_bal_cr0_bad_suggestions;
- atomic_t s_bal_cr1_bad_suggestions;
- atomic64_t s_bal_cX_groups_considered[4];
- atomic64_t s_bal_cX_hits[4];
- atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */
+ atomic_t s_bal_p2_aligned_bad_suggestions;
+ atomic_t s_bal_goal_fast_bad_suggestions;
+ atomic_t s_bal_best_avail_bad_suggestions;
+ atomic64_t s_bal_cX_groups_considered[EXT4_MB_NUM_CRS];
+ atomic64_t s_bal_cX_hits[EXT4_MB_NUM_CRS];
+ atomic64_t s_bal_cX_failed[EXT4_MB_NUM_CRS]; /* cX loop didn't find blocks */
atomic_t s_mb_buddies_generated; /* number of buddies generated */
atomic64_t s_mb_generation_time;
atomic_t s_mb_lost_chunks;
@@ -2632,10 +2691,6 @@ extern void ext4_get_group_no_and_offset(struct super_block *sb,
extern ext4_group_t ext4_get_group_number(struct super_block *sb,
ext4_fsblk_t block);
-extern unsigned int ext4_block_group(struct super_block *sb,
- ext4_fsblk_t blocknr);
-extern ext4_grpblk_t ext4_block_group_offset(struct super_block *sb,
- ext4_fsblk_t blocknr);
extern int ext4_bg_has_super(struct super_block *sb, ext4_group_t group);
extern unsigned long ext4_bg_num_gdb(struct super_block *sb,
ext4_group_t group);
@@ -2841,8 +2896,6 @@ int ext4_fc_record_regions(struct super_block *sb, int ino,
/* mballoc.c */
extern const struct seq_operations ext4_mb_seq_groups_ops;
extern const struct seq_operations ext4_mb_seq_structs_summary_ops;
-extern long ext4_mb_stats;
-extern long ext4_mb_max_to_scan;
extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset);
extern int ext4_mb_init(struct super_block *);
extern int ext4_mb_release(struct super_block *);
@@ -3481,14 +3534,8 @@ extern int ext4_try_to_write_inline_data(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
struct page **pagep);
-extern int ext4_write_inline_data_end(struct inode *inode,
- loff_t pos, unsigned len,
- unsigned copied,
- struct page *page);
-extern struct buffer_head *
-ext4_journalled_write_inline_data(struct inode *inode,
- unsigned len,
- struct page *page);
+int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
+ unsigned copied, struct folio *folio);
extern int ext4_da_write_inline_data_begin(struct address_space *mapping,
struct inode *inode,
loff_t pos, unsigned len,
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 35703dce23a3..e4115d338f10 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -3123,7 +3123,7 @@ void ext4_ext_release(struct super_block *sb)
#endif
}
-static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
+static void ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
{
ext4_lblk_t ee_block;
ext4_fsblk_t ee_pblock;
@@ -3134,10 +3134,10 @@ static int ext4_zeroout_es(struct inode *inode, struct ext4_extent *ex)
ee_pblock = ext4_ext_pblock(ex);
if (ee_len == 0)
- return 0;
+ return;
- return ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
- EXTENT_STATUS_WRITTEN);
+ ext4_es_insert_extent(inode, ee_block, ee_len, ee_pblock,
+ EXTENT_STATUS_WRITTEN);
}
/* FIXME!! we need to try to merge to left or right after zero-out */
@@ -3287,7 +3287,7 @@ static int ext4_split_extent_at(handle_t *handle,
err = ext4_ext_dirty(handle, inode, path + path->p_depth);
if (!err)
/* update extent status tree */
- err = ext4_zeroout_es(inode, &zero_ex);
+ ext4_zeroout_es(inode, &zero_ex);
/* If we failed at this point, we don't know in which
* state the extent tree exactly is so don't try to fix
* length of the original extent as it may do even more
@@ -3640,9 +3640,8 @@ fallback:
out:
/* If we have gotten a failure, don't zero out status tree */
if (!err) {
- err = ext4_zeroout_es(inode, &zero_ex1);
- if (!err)
- err = ext4_zeroout_es(inode, &zero_ex2);
+ ext4_zeroout_es(inode, &zero_ex1);
+ ext4_zeroout_es(inode, &zero_ex2);
}
return err ? err : allocated;
}
@@ -4403,15 +4402,8 @@ int ext4_ext_truncate(handle_t *handle, struct inode *inode)
last_block = (inode->i_size + sb->s_blocksize - 1)
>> EXT4_BLOCK_SIZE_BITS(sb);
-retry:
- err = ext4_es_remove_extent(inode, last_block,
- EXT_MAX_BLOCKS - last_block);
- if (err == -ENOMEM) {
- memalloc_retry_wait(GFP_ATOMIC);
- goto retry;
- }
- if (err)
- return err;
+ ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block);
+
retry_remove_space:
err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
if (err == -ENOMEM) {
@@ -5363,13 +5355,7 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode, 0);
-
- ret = ext4_es_remove_extent(inode, punch_start,
- EXT_MAX_BLOCKS - punch_start);
- if (ret) {
- up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
- }
+ ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start);
ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1);
if (ret) {
@@ -5547,12 +5533,7 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
ext4_free_ext_path(path);
}
- ret = ext4_es_remove_extent(inode, offset_lblk,
- EXT_MAX_BLOCKS - offset_lblk);
- if (ret) {
- up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
- }
+ ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk);
/*
* if offset_lblk lies in a hole which is at start of file, use
@@ -5610,12 +5591,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1,
BUG_ON(!inode_is_locked(inode1));
BUG_ON(!inode_is_locked(inode2));
- *erp = ext4_es_remove_extent(inode1, lblk1, count);
- if (unlikely(*erp))
- return 0;
- *erp = ext4_es_remove_extent(inode2, lblk2, count);
- if (unlikely(*erp))
- return 0;
+ ext4_es_remove_extent(inode1, lblk1, count);
+ ext4_es_remove_extent(inode2, lblk2, count);
while (count) {
struct ext4_extent *ex1, *ex2, tmp_ex;
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index 595abb9e7d74..9b5b8951afb4 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -144,9 +144,11 @@
static struct kmem_cache *ext4_es_cachep;
static struct kmem_cache *ext4_pending_cachep;
-static int __es_insert_extent(struct inode *inode, struct extent_status *newes);
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
+ struct extent_status *prealloc);
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end, int *reserved);
+ ext4_lblk_t end, int *reserved,
+ struct extent_status *prealloc);
static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan);
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
struct ext4_inode_info *locked_ei);
@@ -446,22 +448,36 @@ static void ext4_es_list_del(struct inode *inode)
spin_unlock(&sbi->s_es_lock);
}
-static struct extent_status *
-ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
- ext4_fsblk_t pblk)
+/*
+ * Returns true if we cannot fail to allocate memory for this extent_status
+ * entry and cannot reclaim it until its status changes.
+ */
+static inline bool ext4_es_must_keep(struct extent_status *es)
+{
+ /* fiemap, bigalloc, and seek_data/hole need to use it. */
+ if (ext4_es_is_delayed(es))
+ return true;
+
+ return false;
+}
+
+static inline struct extent_status *__es_alloc_extent(bool nofail)
+{
+ if (!nofail)
+ return kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
+
+ return kmem_cache_zalloc(ext4_es_cachep, GFP_KERNEL | __GFP_NOFAIL);
+}
+
+static void ext4_es_init_extent(struct inode *inode, struct extent_status *es,
+ ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk)
{
- struct extent_status *es;
- es = kmem_cache_alloc(ext4_es_cachep, GFP_ATOMIC);
- if (es == NULL)
- return NULL;
es->es_lblk = lblk;
es->es_len = len;
es->es_pblk = pblk;
- /*
- * We don't count delayed extent because we never try to reclaim them
- */
- if (!ext4_es_is_delayed(es)) {
+ /* We never try to reclaim a must kept extent, so we don't count it. */
+ if (!ext4_es_must_keep(es)) {
if (!EXT4_I(inode)->i_es_shk_nr++)
ext4_es_list_add(inode);
percpu_counter_inc(&EXT4_SB(inode->i_sb)->
@@ -470,8 +486,11 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len,
EXT4_I(inode)->i_es_all_nr++;
percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
+}
- return es;
+static inline void __es_free_extent(struct extent_status *es)
+{
+ kmem_cache_free(ext4_es_cachep, es);
}
static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
@@ -479,8 +498,8 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
EXT4_I(inode)->i_es_all_nr--;
percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt);
- /* Decrease the shrink counter when this es is not delayed */
- if (!ext4_es_is_delayed(es)) {
+ /* Decrease the shrink counter when we can reclaim the extent. */
+ if (!ext4_es_must_keep(es)) {
BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0);
if (!--EXT4_I(inode)->i_es_shk_nr)
ext4_es_list_del(inode);
@@ -488,7 +507,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es)
s_es_stats.es_stats_shk_cnt);
}
- kmem_cache_free(ext4_es_cachep, es);
+ __es_free_extent(es);
}
/*
@@ -749,7 +768,8 @@ static inline void ext4_es_insert_extent_check(struct inode *inode,
}
#endif
-static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
+static int __es_insert_extent(struct inode *inode, struct extent_status *newes,
+ struct extent_status *prealloc)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node **p = &tree->root.rb_node;
@@ -789,10 +809,15 @@ static int __es_insert_extent(struct inode *inode, struct extent_status *newes)
}
}
- es = ext4_es_alloc_extent(inode, newes->es_lblk, newes->es_len,
- newes->es_pblk);
+ if (prealloc)
+ es = prealloc;
+ else
+ es = __es_alloc_extent(false);
if (!es)
return -ENOMEM;
+ ext4_es_init_extent(inode, es, newes->es_lblk, newes->es_len,
+ newes->es_pblk);
+
rb_link_node(&es->rb_node, parent, p);
rb_insert_color(&es->rb_node, &tree->root);
@@ -804,26 +829,27 @@ out:
/*
* ext4_es_insert_extent() adds information to an inode's extent
* status tree.
- *
- * Return 0 on success, error code on failure.
*/
-int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status)
+void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status)
{
struct extent_status newes;
ext4_lblk_t end = lblk + len - 1;
- int err = 0;
+ int err1 = 0;
+ int err2 = 0;
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+ struct extent_status *es1 = NULL;
+ struct extent_status *es2 = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
- return 0;
+ return;
es_debug("add [%u/%u) %llu %x to extent status tree of inode %lu\n",
lblk, len, pblk, status, inode->i_ino);
if (!len)
- return 0;
+ return;
BUG_ON(end < lblk);
@@ -842,29 +868,40 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_es_insert_extent_check(inode, &newes);
+retry:
+ if (err1 && !es1)
+ es1 = __es_alloc_extent(true);
+ if ((err1 || err2) && !es2)
+ es2 = __es_alloc_extent(true);
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end, NULL);
- if (err != 0)
+
+ err1 = __es_remove_extent(inode, lblk, end, NULL, es1);
+ if (err1 != 0)
+ goto error;
+
+ err2 = __es_insert_extent(inode, &newes, es2);
+ if (err2 == -ENOMEM && !ext4_es_must_keep(&newes))
+ err2 = 0;
+ if (err2 != 0)
goto error;
-retry:
- err = __es_insert_extent(inode, &newes);
- if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
- 128, EXT4_I(inode)))
- goto retry;
- if (err == -ENOMEM && !ext4_es_is_delayed(&newes))
- err = 0;
if (sbi->s_cluster_ratio > 1 && test_opt(inode->i_sb, DELALLOC) &&
(status & EXTENT_STATUS_WRITTEN ||
status & EXTENT_STATUS_UNWRITTEN))
__revise_pending(inode, lblk, len);
+ /* es is pre-allocated but not used, free it. */
+ if (es1 && !es1->es_len)
+ __es_free_extent(es1);
+ if (es2 && !es2->es_len)
+ __es_free_extent(es2);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
+ if (err1 || err2)
+ goto retry;
ext4_es_print_tree(inode);
-
- return err;
+ return;
}
/*
@@ -897,7 +934,7 @@ void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
es = __es_tree_search(&EXT4_I(inode)->i_es_tree.root, lblk);
if (!es || es->es_lblk > end)
- __es_insert_extent(inode, &newes);
+ __es_insert_extent(inode, &newes, NULL);
write_unlock(&EXT4_I(inode)->i_es_lock);
}
@@ -1287,6 +1324,7 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
* @lblk - first block in range
* @end - last block in range
* @reserved - number of cluster reservations released
+ * @prealloc - pre-allocated es to avoid memory allocation failures
*
* If @reserved is not NULL and delayed allocation is enabled, counts
* block/cluster reservations freed by removing range and if bigalloc
@@ -1294,7 +1332,8 @@ static unsigned int get_rsvd(struct inode *inode, ext4_lblk_t end,
* error code on failure.
*/
static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t end, int *reserved)
+ ext4_lblk_t end, int *reserved,
+ struct extent_status *prealloc)
{
struct ext4_es_tree *tree = &EXT4_I(inode)->i_es_tree;
struct rb_node *node;
@@ -1302,14 +1341,12 @@ static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
struct extent_status orig_es;
ext4_lblk_t len1, len2;
ext4_fsblk_t block;
- int err;
+ int err = 0;
bool count_reserved = true;
struct rsvd_count rc;
if (reserved == NULL || !test_opt(inode->i_sb, DELALLOC))
count_reserved = false;
-retry:
- err = 0;
es = __es_tree_search(&tree->root, lblk);
if (!es)
@@ -1343,14 +1380,13 @@ retry:
orig_es.es_len - len2;
ext4_es_store_pblock_status(&newes, block,
ext4_es_status(&orig_es));
- err = __es_insert_extent(inode, &newes);
+ err = __es_insert_extent(inode, &newes, prealloc);
if (err) {
+ if (!ext4_es_must_keep(&newes))
+ return 0;
+
es->es_lblk = orig_es.es_lblk;
es->es_len = orig_es.es_len;
- if ((err == -ENOMEM) &&
- __es_shrink(EXT4_SB(inode->i_sb),
- 128, EXT4_I(inode)))
- goto retry;
goto out;
}
} else {
@@ -1422,39 +1458,48 @@ out:
* @len - number of blocks to remove
*
* Reduces block/cluster reservation count and for bigalloc cancels pending
- * reservations as needed. Returns 0 on success, error code on failure.
+ * reservations as needed.
*/
-int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len)
+void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len)
{
ext4_lblk_t end;
int err = 0;
int reserved = 0;
+ struct extent_status *es = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
- return 0;
+ return;
trace_ext4_es_remove_extent(inode, lblk, len);
es_debug("remove [%u/%u) from extent status tree of inode %lu\n",
lblk, len, inode->i_ino);
if (!len)
- return err;
+ return;
end = lblk + len - 1;
BUG_ON(end < lblk);
+retry:
+ if (err && !es)
+ es = __es_alloc_extent(true);
/*
* ext4_clear_inode() depends on us taking i_es_lock unconditionally
* so that we are sure __es_shrink() is done with the inode before it
* is reclaimed.
*/
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, end, &reserved);
+ err = __es_remove_extent(inode, lblk, end, &reserved, es);
+ if (es && !es->es_len)
+ __es_free_extent(es);
write_unlock(&EXT4_I(inode)->i_es_lock);
+ if (err)
+ goto retry;
+
ext4_es_print_tree(inode);
ext4_da_release_space(inode, reserved);
- return err;
+ return;
}
static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan,
@@ -1702,11 +1747,8 @@ static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end,
(*nr_to_scan)--;
node = rb_next(&es->rb_node);
- /*
- * We can't reclaim delayed extent from status tree because
- * fiemap, bigallic, and seek_data/hole need to use it.
- */
- if (ext4_es_is_delayed(es))
+
+ if (ext4_es_must_keep(es))
goto next;
if (ext4_es_is_referenced(es)) {
ext4_es_clear_referenced(es);
@@ -1770,7 +1812,7 @@ void ext4_clear_inode_es(struct inode *inode)
while (node) {
es = rb_entry(node, struct extent_status, rb_node);
node = rb_next(node);
- if (!ext4_es_is_delayed(es)) {
+ if (!ext4_es_must_keep(es)) {
rb_erase(&es->rb_node, &tree->root);
ext4_es_free_extent(inode, es);
}
@@ -1972,17 +2014,18 @@ bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk)
* @lblk - logical block to be added
* @allocated - indicates whether a physical cluster has been allocated for
* the logical cluster that contains the block
- *
- * Returns 0 on success, negative error code on failure.
*/
-int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated)
+void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
+ bool allocated)
{
struct extent_status newes;
- int err = 0;
+ int err1 = 0;
+ int err2 = 0;
+ struct extent_status *es1 = NULL;
+ struct extent_status *es2 = NULL;
if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
- return 0;
+ return;
es_debug("add [%u/1) delayed to extent status tree of inode %lu\n",
lblk, inode->i_ino);
@@ -1994,29 +2037,37 @@ int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
ext4_es_insert_extent_check(inode, &newes);
+retry:
+ if (err1 && !es1)
+ es1 = __es_alloc_extent(true);
+ if ((err1 || err2) && !es2)
+ es2 = __es_alloc_extent(true);
write_lock(&EXT4_I(inode)->i_es_lock);
- err = __es_remove_extent(inode, lblk, lblk, NULL);
- if (err != 0)
+ err1 = __es_remove_extent(inode, lblk, lblk, NULL, es1);
+ if (err1 != 0)
goto error;
-retry:
- err = __es_insert_extent(inode, &newes);
- if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb),
- 128, EXT4_I(inode)))
- goto retry;
- if (err != 0)
+
+ err2 = __es_insert_extent(inode, &newes, es2);
+ if (err2 != 0)
goto error;
if (allocated)
__insert_pending(inode, lblk);
+ /* es is pre-allocated but not used, free it. */
+ if (es1 && !es1->es_len)
+ __es_free_extent(es1);
+ if (es2 && !es2->es_len)
+ __es_free_extent(es2);
error:
write_unlock(&EXT4_I(inode)->i_es_lock);
+ if (err1 || err2)
+ goto retry;
ext4_es_print_tree(inode);
ext4_print_pending_tree(inode);
-
- return err;
+ return;
}
/*
diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h
index 4ec30a798260..d9847a4a25db 100644
--- a/fs/ext4/extents_status.h
+++ b/fs/ext4/extents_status.h
@@ -127,14 +127,14 @@ extern int __init ext4_init_es(void);
extern void ext4_exit_es(void);
extern void ext4_es_init_tree(struct ext4_es_tree *tree);
-extern int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len, ext4_fsblk_t pblk,
- unsigned int status);
+extern void ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len, ext4_fsblk_t pblk,
+ unsigned int status);
extern void ext4_es_cache_extent(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len, ext4_fsblk_t pblk,
unsigned int status);
-extern int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
- ext4_lblk_t len);
+extern void ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk,
+ ext4_lblk_t len);
extern void ext4_es_find_extent_range(struct inode *inode,
int (*match_fn)(struct extent_status *es),
ext4_lblk_t lblk, ext4_lblk_t end,
@@ -249,8 +249,8 @@ extern void ext4_exit_pending(void);
extern void ext4_init_pending_tree(struct ext4_pending_tree *tree);
extern void ext4_remove_pending(struct inode *inode, ext4_lblk_t lblk);
extern bool ext4_is_pending(struct inode *inode, ext4_lblk_t lblk);
-extern int ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
- bool allocated);
+extern void ext4_es_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk,
+ bool allocated);
extern unsigned int ext4_es_delayed_clu(struct inode *inode, ext4_lblk_t lblk,
ext4_lblk_t len);
extern void ext4_clear_inode_es(struct inode *inode);
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 6a16d07965f9..c457c8517f0f 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -450,13 +450,14 @@ static const struct iomap_dio_ops ext4_dio_write_ops = {
*/
static ssize_t ext4_dio_write_checks(struct kiocb *iocb, struct iov_iter *from,
bool *ilock_shared, bool *extend,
- bool *unwritten)
+ bool *unwritten, int *dio_flags)
{
struct file *file = iocb->ki_filp;
struct inode *inode = file_inode(file);
loff_t offset;
size_t count;
ssize_t ret;
+ bool overwrite, unaligned_io;
restart:
ret = ext4_generic_write_checks(iocb, from);
@@ -465,16 +466,20 @@ restart:
offset = iocb->ki_pos;
count = ret;
- if (ext4_extending_io(inode, offset, count))
- *extend = true;
+
+ unaligned_io = ext4_unaligned_io(inode, from, offset);
+ *extend = ext4_extending_io(inode, offset, count);
+ overwrite = ext4_overwrite_io(inode, offset, count, unwritten);
+
/*
- * Determine whether the IO operation will overwrite allocated
- * and initialized blocks.
- * We need exclusive i_rwsem for changing security info
- * in file_modified().
+ * Determine whether we need to upgrade to an exclusive lock. This is
+ * required to change security info in file_modified(), for extending
+ * I/O, any form of non-overwrite I/O, and unaligned I/O to unwritten
+ * extents (as partial block zeroing may be required).
*/
- if (*ilock_shared && (!IS_NOSEC(inode) || *extend ||
- !ext4_overwrite_io(inode, offset, count, unwritten))) {
+ if (*ilock_shared &&
+ ((!IS_NOSEC(inode) || *extend || !overwrite ||
+ (unaligned_io && *unwritten)))) {
if (iocb->ki_flags & IOCB_NOWAIT) {
ret = -EAGAIN;
goto out;
@@ -485,6 +490,32 @@ restart:
goto restart;
}
+ /*
+ * Now that locking is settled, determine dio flags and exclusivity
+ * requirements. Unaligned writes are allowed under shared lock so long
+ * as they are pure overwrites. Set the iomap overwrite only flag as an
+ * added precaution in this case. Even though this is unnecessary, we
+ * can detect and warn on unexpected -EAGAIN if an unsafe unaligned
+ * write is ever submitted.
+ *
+ * Otherwise, concurrent unaligned writes risk data corruption due to
+ * partial block zeroing in the dio layer, and so the I/O must occur
+ * exclusively. The inode lock is already held exclusive if the write is
+ * non-overwrite or extending, so drain all outstanding dio and set the
+ * force wait dio flag.
+ */
+ if (*ilock_shared && unaligned_io) {
+ *dio_flags = IOMAP_DIO_OVERWRITE_ONLY;
+ } else if (!*ilock_shared && (unaligned_io || *extend)) {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ ret = -EAGAIN;
+ goto out;
+ }
+ if (unaligned_io && (!overwrite || *unwritten))
+ inode_dio_wait(inode);
+ *dio_flags = IOMAP_DIO_FORCE_WAIT;
+ }
+
ret = file_modified(file);
if (ret < 0)
goto out;
@@ -506,18 +537,11 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
loff_t offset = iocb->ki_pos;
size_t count = iov_iter_count(from);
const struct iomap_ops *iomap_ops = &ext4_iomap_ops;
- bool extend = false, unaligned_io = false, unwritten = false;
+ bool extend = false, unwritten = false;
bool ilock_shared = true;
+ int dio_flags = 0;
/*
- * We initially start with shared inode lock unless it is
- * unaligned IO which needs exclusive lock anyways.
- */
- if (ext4_unaligned_io(inode, from, offset)) {
- unaligned_io = true;
- ilock_shared = false;
- }
- /*
* Quick check here without any i_rwsem lock to see if it is extending
* IO. A more reliable check is done in ext4_dio_write_checks() with
* proper locking in place.
@@ -549,16 +573,11 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
return ext4_buffered_write_iter(iocb, from);
}
- ret = ext4_dio_write_checks(iocb, from,
- &ilock_shared, &extend, &unwritten);
+ ret = ext4_dio_write_checks(iocb, from, &ilock_shared, &extend,
+ &unwritten, &dio_flags);
if (ret <= 0)
return ret;
- /* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */
- if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) {
- ret = -EAGAIN;
- goto out;
- }
/*
* Make sure inline data cannot be created anymore since we are going
* to allocate blocks for DIO. We know the inode does not have any
@@ -569,19 +588,6 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
offset = iocb->ki_pos;
count = ret;
- /*
- * Unaligned direct IO must be serialized among each other as zeroing
- * of partial blocks of two competing unaligned IOs can result in data
- * corruption.
- *
- * So we make sure we don't allow any unaligned IO in flight.
- * For IOs where we need not wait (like unaligned non-AIO DIO),
- * below inode_dio_wait() may anyway become a no-op, since we start
- * with exclusive lock.
- */
- if (unaligned_io)
- inode_dio_wait(inode);
-
if (extend) {
handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
if (IS_ERR(handle)) {
@@ -601,8 +607,8 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (ilock_shared && !unwritten)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
- (unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
- NULL, 0);
+ dio_flags, NULL, 0);
+ WARN_ON_ONCE(ret == -EAGAIN && !(iocb->ki_flags & IOCB_NOWAIT));
if (ret == -ENOTBLK)
ret = 0;
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c
index c68bebe7ff4b..a9f3716119d3 100644
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -651,6 +651,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
ext4_update_inode_fsync_trans(handle, inode, 1);
count = ar.len;
+
+ /*
+ * Update reserved blocks/metadata blocks after successful block
+ * allocation which had been deferred till now.
+ */
+ if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
+ ext4_da_update_reserve_space(inode, count, 1);
+
got_it:
map->m_flags |= EXT4_MAP_MAPPED;
map->m_pblk = le32_to_cpu(chain[depth-1].key);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 5854bd5a3352..a4b7e4bc32d4 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -741,9 +741,8 @@ convert:
}
int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len,
- unsigned copied, struct page *page)
+ unsigned copied, struct folio *folio)
{
- struct folio *folio = page_folio(page);
handle_t *handle = ext4_journal_current_handle();
int no_expand;
void *kaddr;
@@ -823,30 +822,6 @@ out:
return ret ? ret : copied;
}
-struct buffer_head *
-ext4_journalled_write_inline_data(struct inode *inode,
- unsigned len,
- struct page *page)
-{
- int ret, no_expand;
- void *kaddr;
- struct ext4_iloc iloc;
-
- ret = ext4_get_inode_loc(inode, &iloc);
- if (ret) {
- ext4_std_error(inode->i_sb, ret);
- return NULL;
- }
-
- ext4_write_lock_xattr(inode, &no_expand);
- kaddr = kmap_atomic(page);
- ext4_write_inline_data(inode, &iloc, kaddr, 0, len);
- kunmap_atomic(kaddr);
- ext4_write_unlock_xattr(inode, &no_expand);
-
- return iloc.bh;
-}
-
/*
* Try to make the page cache and handle ready for the inline data case.
* We can call this function in 2 cases:
@@ -1964,16 +1939,8 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
* the extent status cache must be cleared to avoid leaving
* behind stale delayed allocated extent entries
*/
- if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
-retry:
- err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
- if (err == -ENOMEM) {
- memalloc_retry_wait(GFP_ATOMIC);
- goto retry;
- }
- if (err)
- goto out_error;
- }
+ if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
+ ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
/* Clear the content in the xattr space. */
if (inline_size > EXT4_MIN_INLINE_DATA_SIZE) {
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index 9ca583360166..43775a6ca505 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -567,10 +567,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
- ret = ext4_es_insert_extent(inode, map->m_lblk,
- map->m_len, map->m_pblk, status);
- if (ret < 0)
- retval = ret;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
}
up_read((&EXT4_I(inode)->i_data_sem));
@@ -632,16 +630,6 @@ found:
*/
ext4_clear_inode_state(inode, EXT4_STATE_EXT_MIGRATE);
}
-
- /*
- * Update reserved blocks/metadata blocks after successful
- * block allocation which had been deferred till now. We don't
- * support fallocate for non extent files. So we can update
- * reserve space here.
- */
- if ((retval > 0) &&
- (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE))
- ext4_da_update_reserve_space(inode, retval, 1);
}
if (retval > 0) {
@@ -689,12 +677,8 @@ found:
ext4_es_scan_range(inode, &ext4_es_is_delayed, map->m_lblk,
map->m_lblk + map->m_len - 1))
status |= EXTENT_STATUS_DELAYED;
- ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
- if (ret < 0) {
- retval = ret;
- goto out_sem;
- }
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
}
out_sem:
@@ -1287,7 +1271,8 @@ static int ext4_write_end(struct file *file,
if (ext4_has_inline_data(inode) &&
ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA))
- return ext4_write_inline_data_end(inode, pos, len, copied, page);
+ return ext4_write_inline_data_end(inode, pos, len, copied,
+ folio);
copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
/*
@@ -1395,7 +1380,8 @@ static int ext4_journalled_write_end(struct file *file,
BUG_ON(!ext4_handle_valid(handle));
if (ext4_has_inline_data(inode))
- return ext4_write_inline_data_end(inode, pos, len, copied, page);
+ return ext4_write_inline_data_end(inode, pos, len, copied,
+ folio);
if (unlikely(copied < len) && !folio_test_uptodate(folio)) {
copied = 0;
@@ -1638,7 +1624,6 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
int ret;
bool allocated = false;
- bool reserved = false;
/*
* If the cluster containing lblk is shared with a delayed,
@@ -1654,8 +1639,7 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
if (sbi->s_cluster_ratio == 1) {
ret = ext4_da_reserve_space(inode);
if (ret != 0) /* ENOSPC */
- goto errout;
- reserved = true;
+ return ret;
} else { /* bigalloc */
if (!ext4_es_scan_clu(inode, &ext4_es_is_delonly, lblk)) {
if (!ext4_es_scan_clu(inode,
@@ -1663,12 +1647,11 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
ret = ext4_clu_mapped(inode,
EXT4_B2C(sbi, lblk));
if (ret < 0)
- goto errout;
+ return ret;
if (ret == 0) {
ret = ext4_da_reserve_space(inode);
if (ret != 0) /* ENOSPC */
- goto errout;
- reserved = true;
+ return ret;
} else {
allocated = true;
}
@@ -1678,12 +1661,8 @@ static int ext4_insert_delayed_block(struct inode *inode, ext4_lblk_t lblk)
}
}
- ret = ext4_es_insert_delayed_block(inode, lblk, allocated);
- if (ret && reserved)
- ext4_da_release_space(inode, 1);
-
-errout:
- return ret;
+ ext4_es_insert_delayed_block(inode, lblk, allocated);
+ return 0;
}
/*
@@ -1780,7 +1759,6 @@ add_delayed:
set_buffer_new(bh);
set_buffer_delay(bh);
} else if (retval > 0) {
- int ret;
unsigned int status;
if (unlikely(retval != map->m_len)) {
@@ -1793,10 +1771,8 @@ add_delayed:
status = map->m_flags & EXT4_MAP_UNWRITTEN ?
EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status);
- if (ret != 0)
- retval = ret;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status);
}
out_unlock:
@@ -2321,11 +2297,11 @@ static int ext4_da_writepages_trans_blocks(struct inode *inode)
MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
}
-static int ext4_journal_page_buffers(handle_t *handle, struct page *page,
- int len)
+static int ext4_journal_folio_buffers(handle_t *handle, struct folio *folio,
+ size_t len)
{
- struct buffer_head *page_bufs = page_buffers(page);
- struct inode *inode = page->mapping->host;
+ struct buffer_head *page_bufs = folio_buffers(folio);
+ struct inode *inode = folio->mapping->host;
int ret, err;
ret = ext4_walk_page_buffers(handle, inode, page_bufs, 0, len,
@@ -2334,7 +2310,7 @@ static int ext4_journal_page_buffers(handle_t *handle, struct page *page,
NULL, write_end_fn);
if (ret == 0)
ret = err;
- err = ext4_jbd2_inode_add_write(handle, inode, page_offset(page), len);
+ err = ext4_jbd2_inode_add_write(handle, inode, folio_pos(folio), len);
if (ret == 0)
ret = err;
EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
@@ -2344,22 +2320,20 @@ static int ext4_journal_page_buffers(handle_t *handle, struct page *page,
static int mpage_journal_page_buffers(handle_t *handle,
struct mpage_da_data *mpd,
- struct page *page)
+ struct folio *folio)
{
struct inode *inode = mpd->inode;
loff_t size = i_size_read(inode);
- int len;
+ size_t len = folio_size(folio);
- ClearPageChecked(page);
+ folio_clear_checked(folio);
mpd->wbc->nr_to_write--;
- if (page->index == size >> PAGE_SHIFT &&
+ if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(inode))
- len = size & ~PAGE_MASK;
- else
- len = PAGE_SIZE;
+ len = size - folio_pos(folio);
- return ext4_journal_page_buffers(handle, page, len);
+ return ext4_journal_folio_buffers(handle, folio, len);
}
/*
@@ -2499,7 +2473,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
/* Pending dirtying of journalled data? */
if (folio_test_checked(folio)) {
err = mpage_journal_page_buffers(handle,
- mpd, &folio->page);
+ mpd, folio);
if (err < 0)
goto out;
mpd->journalled_more_data = 1;
@@ -2944,15 +2918,15 @@ retry:
* Check if we should update i_disksize
* when write to the end of file but not require block allocation
*/
-static int ext4_da_should_update_i_disksize(struct page *page,
+static int ext4_da_should_update_i_disksize(struct folio *folio,
unsigned long offset)
{
struct buffer_head *bh;
- struct inode *inode = page->mapping->host;
+ struct inode *inode = folio->mapping->host;
unsigned int idx;
int i;
- bh = page_buffers(page);
+ bh = folio_buffers(folio);
idx = offset >> inode->i_blkbits;
for (i = 0; i < idx; i++)
@@ -2972,17 +2946,19 @@ static int ext4_da_write_end(struct file *file,
loff_t new_i_size;
unsigned long start, end;
int write_mode = (int)(unsigned long)fsdata;
+ struct folio *folio = page_folio(page);
if (write_mode == FALL_BACK_TO_NONDELALLOC)
return ext4_write_end(file, mapping, pos,
- len, copied, page, fsdata);
+ len, copied, &folio->page, fsdata);
trace_ext4_da_write_end(inode, pos, len, copied);
if (write_mode != CONVERT_INLINE_DATA &&
ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA) &&
ext4_has_inline_data(inode))
- return ext4_write_inline_data_end(inode, pos, len, copied, page);
+ return ext4_write_inline_data_end(inode, pos, len, copied,
+ folio);
if (unlikely(copied < len) && !PageUptodate(page))
copied = 0;
@@ -3006,10 +2982,11 @@ static int ext4_da_write_end(struct file *file,
*/
new_i_size = pos + copied;
if (copied && new_i_size > inode->i_size &&
- ext4_da_should_update_i_disksize(page, end))
+ ext4_da_should_update_i_disksize(folio, end))
ext4_update_i_disksize(inode, new_i_size);
- return generic_write_end(file, mapping, pos, len, copied, page, fsdata);
+ return generic_write_end(file, mapping, pos, len, copied, &folio->page,
+ fsdata);
}
/*
@@ -3105,7 +3082,7 @@ static int ext4_read_folio(struct file *file, struct folio *folio)
int ret = -EAGAIN;
struct inode *inode = folio->mapping->host;
- trace_ext4_readpage(&folio->page);
+ trace_ext4_read_folio(inode, folio);
if (ext4_has_inline_data(inode))
ret = ext4_readpage_inline(inode, folio);
@@ -3164,9 +3141,10 @@ static void ext4_journalled_invalidate_folio(struct folio *folio,
static bool ext4_release_folio(struct folio *folio, gfp_t wait)
{
- journal_t *journal = EXT4_JOURNAL(folio->mapping->host);
+ struct inode *inode = folio->mapping->host;
+ journal_t *journal = EXT4_JOURNAL(inode);
- trace_ext4_releasepage(&folio->page);
+ trace_ext4_release_folio(inode, folio);
/* Page has dirty journalled data -> cannot release */
if (folio_test_checked(folio))
@@ -3992,12 +3970,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode, 0);
- ret = ext4_es_remove_extent(inode, first_block,
- stop_block - first_block);
- if (ret) {
- up_write(&EXT4_I(inode)->i_data_sem);
- goto out_stop;
- }
+ ext4_es_remove_extent(inode, first_block,
+ stop_block - first_block);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
ret = ext4_ext_remove_space(inode, first_block,
@@ -6156,7 +6130,7 @@ retry_alloc:
err = __block_write_begin(&folio->page, 0, len, ext4_get_block);
if (!err) {
ret = VM_FAULT_SIGBUS;
- if (ext4_journal_page_buffers(handle, &folio->page, len))
+ if (ext4_journal_folio_buffers(handle, folio, len))
goto out_error;
} else {
folio_unlock(folio);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 961284cc9b65..331859511f80 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -796,6 +796,7 @@ static int ext4_ioctl_setproject(struct inode *inode, __u32 projid)
int ext4_force_shutdown(struct super_block *sb, u32 flags)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
+ int ret;
if (flags > EXT4_GOING_FLAGS_NOLOGFLUSH)
return -EINVAL;
@@ -808,7 +809,9 @@ int ext4_force_shutdown(struct super_block *sb, u32 flags)
switch (flags) {
case EXT4_GOING_FLAGS_DEFAULT:
- freeze_bdev(sb->s_bdev);
+ ret = freeze_bdev(sb->s_bdev);
+ if (ret)
+ return ret;
set_bit(EXT4_FLAGS_SHUTDOWN, &sbi->s_ext4_flags);
thaw_bdev(sb->s_bdev);
break;
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 20f67a260df5..a2475b8c9fb5 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -154,19 +154,31 @@
* structures to decide the order in which groups are to be traversed for
* fulfilling an allocation request.
*
- * At CR = 0, we look for groups which have the largest_free_order >= the order
- * of the request. We directly look at the largest free order list in the data
- * structure (1) above where largest_free_order = order of the request. If that
- * list is empty, we look at remaining list in the increasing order of
- * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time.
+ * At CR_POWER2_ALIGNED , we look for groups which have the largest_free_order
+ * >= the order of the request. We directly look at the largest free order list
+ * in the data structure (1) above where largest_free_order = order of the
+ * request. If that list is empty, we look at remaining list in the increasing
+ * order of largest_free_order. This allows us to perform CR_POWER2_ALIGNED
+ * lookup in O(1) time.
*
- * At CR = 1, we only consider groups where average fragment size > request
- * size. So, we lookup a group which has average fragment size just above or
- * equal to request size using our average fragment size group lists (data
- * structure 2) in O(1) time.
+ * At CR_GOAL_LEN_FAST, we only consider groups where
+ * average fragment size > request size. So, we lookup a group which has average
+ * fragment size just above or equal to request size using our average fragment
+ * size group lists (data structure 2) in O(1) time.
+ *
+ * At CR_BEST_AVAIL_LEN, we aim to optimize allocations which can't be satisfied
+ * in CR_GOAL_LEN_FAST. The fact that we couldn't find a group in
+ * CR_GOAL_LEN_FAST suggests that there is no BG that has avg
+ * fragment size > goal length. So before falling to the slower
+ * CR_GOAL_LEN_SLOW, in CR_BEST_AVAIL_LEN we proactively trim goal length and
+ * then use the same fragment lists as CR_GOAL_LEN_FAST to find a BG with a big
+ * enough average fragment size. This increases the chances of finding a
+ * suitable block group in O(1) time and results in faster allocation at the
+ * cost of reduced size of allocation.
*
* If "mb_optimize_scan" mount option is not set, mballoc traverses groups in
- * linear order which requires O(N) search time for each CR 0 and CR 1 phase.
+ * linear order which requires O(N) search time for each CR_POWER2_ALIGNED and
+ * CR_GOAL_LEN_FAST phase.
*
* The regular allocator (using the buddy cache) supports a few tunables.
*
@@ -351,8 +363,8 @@
* - bitlock on a group (group)
* - object (inode/locality) (object)
* - per-pa lock (pa)
- * - cr0 lists lock (cr0)
- * - cr1 tree lock (cr1)
+ * - cr_power2_aligned lists lock (cr_power2_aligned)
+ * - cr_goal_len_fast lists lock (cr_goal_len_fast)
*
* Paths:
* - new pa
@@ -384,7 +396,7 @@
*
* - allocation path (ext4_mb_regular_allocator)
* group
- * cr0/cr1
+ * cr_power2_aligned/cr_goal_len_fast
*/
static struct kmem_cache *ext4_pspace_cachep;
static struct kmem_cache *ext4_ac_cachep;
@@ -409,7 +421,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac);
static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
- ext4_group_t group, int cr);
+ ext4_group_t group, enum criteria cr);
static int ext4_try_to_trim_range(struct super_block *sb,
struct ext4_buddy *e4b, ext4_grpblk_t start,
@@ -858,8 +870,8 @@ mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp)
* Choose next group by traversing largest_free_order lists. Updates *new_cr if
* cr level needs an update.
*/
-static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
- int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+static void ext4_mb_choose_next_group_p2_aligned(struct ext4_allocation_context *ac,
+ enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
struct ext4_group_info *iter, *grp;
@@ -868,8 +880,8 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
if (ac->ac_status == AC_STATUS_FOUND)
return;
- if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED))
- atomic_inc(&sbi->s_bal_cr0_bad_suggestions);
+ if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED))
+ atomic_inc(&sbi->s_bal_p2_aligned_bad_suggestions);
grp = NULL;
for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) {
@@ -884,8 +896,8 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i],
bb_largest_free_order_node) {
if (sbi->s_mb_stats)
- atomic64_inc(&sbi->s_bal_cX_groups_considered[0]);
- if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) {
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]);
+ if (likely(ext4_mb_good_group(ac, iter->bb_group, CR_POWER2_ALIGNED))) {
grp = iter;
break;
}
@@ -897,57 +909,155 @@ static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac,
if (!grp) {
/* Increment cr and search again */
- *new_cr = 1;
+ *new_cr = CR_GOAL_LEN_FAST;
} else {
*group = grp->bb_group;
- ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED;
+ ac->ac_flags |= EXT4_MB_CR_POWER2_ALIGNED_OPTIMIZED;
}
}
/*
+ * Find a suitable group of given order from the average fragments list.
+ */
+static struct ext4_group_info *
+ext4_mb_find_good_group_avg_frag_lists(struct ext4_allocation_context *ac, int order)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct list_head *frag_list = &sbi->s_mb_avg_fragment_size[order];
+ rwlock_t *frag_list_lock = &sbi->s_mb_avg_fragment_size_locks[order];
+ struct ext4_group_info *grp = NULL, *iter;
+ enum criteria cr = ac->ac_criteria;
+
+ if (list_empty(frag_list))
+ return NULL;
+ read_lock(frag_list_lock);
+ if (list_empty(frag_list)) {
+ read_unlock(frag_list_lock);
+ return NULL;
+ }
+ list_for_each_entry(iter, frag_list, bb_avg_fragment_size_node) {
+ if (sbi->s_mb_stats)
+ atomic64_inc(&sbi->s_bal_cX_groups_considered[cr]);
+ if (likely(ext4_mb_good_group(ac, iter->bb_group, cr))) {
+ grp = iter;
+ break;
+ }
+ }
+ read_unlock(frag_list_lock);
+ return grp;
+}
+
+/*
* Choose next group by traversing average fragment size list of suitable
* order. Updates *new_cr if cr level needs an update.
*/
-static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac,
- int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+static void ext4_mb_choose_next_group_goal_fast(struct ext4_allocation_context *ac,
+ enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
- struct ext4_group_info *grp = NULL, *iter;
+ struct ext4_group_info *grp = NULL;
int i;
- if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) {
+ if (unlikely(ac->ac_flags & EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED)) {
if (sbi->s_mb_stats)
- atomic_inc(&sbi->s_bal_cr1_bad_suggestions);
+ atomic_inc(&sbi->s_bal_goal_fast_bad_suggestions);
}
for (i = mb_avg_fragment_size_order(ac->ac_sb, ac->ac_g_ex.fe_len);
i < MB_NUM_ORDERS(ac->ac_sb); i++) {
- if (list_empty(&sbi->s_mb_avg_fragment_size[i]))
- continue;
- read_lock(&sbi->s_mb_avg_fragment_size_locks[i]);
- if (list_empty(&sbi->s_mb_avg_fragment_size[i])) {
- read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
- continue;
- }
- list_for_each_entry(iter, &sbi->s_mb_avg_fragment_size[i],
- bb_avg_fragment_size_node) {
- if (sbi->s_mb_stats)
- atomic64_inc(&sbi->s_bal_cX_groups_considered[1]);
- if (likely(ext4_mb_good_group(ac, iter->bb_group, 1))) {
- grp = iter;
- break;
- }
+ grp = ext4_mb_find_good_group_avg_frag_lists(ac, i);
+ if (grp)
+ break;
+ }
+
+ if (grp) {
+ *group = grp->bb_group;
+ ac->ac_flags |= EXT4_MB_CR_GOAL_LEN_FAST_OPTIMIZED;
+ } else {
+ *new_cr = CR_BEST_AVAIL_LEN;
+ }
+}
+
+/*
+ * We couldn't find a group in CR_GOAL_LEN_FAST so try to find the highest free fragment
+ * order we have and proactively trim the goal request length to that order to
+ * find a suitable group faster.
+ *
+ * This optimizes allocation speed at the cost of slightly reduced
+ * preallocations. However, we make sure that we don't trim the request too
+ * much and fall to CR_GOAL_LEN_SLOW in that case.
+ */
+static void ext4_mb_choose_next_group_best_avail(struct ext4_allocation_context *ac,
+ enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ struct ext4_group_info *grp = NULL;
+ int i, order, min_order;
+ unsigned long num_stripe_clusters = 0;
+
+ if (unlikely(ac->ac_flags & EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED)) {
+ if (sbi->s_mb_stats)
+ atomic_inc(&sbi->s_bal_best_avail_bad_suggestions);
+ }
+
+ /*
+ * mb_avg_fragment_size_order() returns order in a way that makes
+ * retrieving back the length using (1 << order) inaccurate. Hence, use
+ * fls() instead since we need to know the actual length while modifying
+ * goal length.
+ */
+ order = fls(ac->ac_g_ex.fe_len);
+ min_order = order - sbi->s_mb_best_avail_max_trim_order;
+ if (min_order < 0)
+ min_order = 0;
+
+ if (1 << min_order < ac->ac_o_ex.fe_len)
+ min_order = fls(ac->ac_o_ex.fe_len) + 1;
+
+ if (sbi->s_stripe > 0) {
+ /*
+ * We are assuming that stripe size is always a multiple of
+ * cluster ratio otherwise __ext4_fill_super exists early.
+ */
+ num_stripe_clusters = EXT4_NUM_B2C(sbi, sbi->s_stripe);
+ if (1 << min_order < num_stripe_clusters)
+ min_order = fls(num_stripe_clusters);
+ }
+
+ for (i = order; i >= min_order; i--) {
+ int frag_order;
+ /*
+ * Scale down goal len to make sure we find something
+ * in the free fragments list. Basically, reduce
+ * preallocations.
+ */
+ ac->ac_g_ex.fe_len = 1 << i;
+
+ if (num_stripe_clusters > 0) {
+ /*
+ * Try to round up the adjusted goal length to
+ * stripe size (in cluster units) multiple for
+ * efficiency.
+ */
+ ac->ac_g_ex.fe_len = roundup(ac->ac_g_ex.fe_len,
+ num_stripe_clusters);
}
- read_unlock(&sbi->s_mb_avg_fragment_size_locks[i]);
+
+ frag_order = mb_avg_fragment_size_order(ac->ac_sb,
+ ac->ac_g_ex.fe_len);
+
+ grp = ext4_mb_find_good_group_avg_frag_lists(ac, frag_order);
if (grp)
break;
}
if (grp) {
*group = grp->bb_group;
- ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED;
+ ac->ac_flags |= EXT4_MB_CR_BEST_AVAIL_LEN_OPTIMIZED;
} else {
- *new_cr = 2;
+ /* Reset goal length to original goal length before falling into CR_GOAL_LEN_SLOW */
+ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
+ *new_cr = CR_GOAL_LEN_SLOW;
}
}
@@ -955,7 +1065,7 @@ static inline int should_optimize_scan(struct ext4_allocation_context *ac)
{
if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN)))
return 0;
- if (ac->ac_criteria >= 2)
+ if (ac->ac_criteria >= CR_GOAL_LEN_SLOW)
return 0;
if (!ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS))
return 0;
@@ -1000,7 +1110,7 @@ inc_and_return:
* @ngroups Total number of groups
*/
static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
- int *new_cr, ext4_group_t *group, ext4_group_t ngroups)
+ enum criteria *new_cr, ext4_group_t *group, ext4_group_t ngroups)
{
*new_cr = ac->ac_criteria;
@@ -1009,10 +1119,12 @@ static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac,
return;
}
- if (*new_cr == 0) {
- ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups);
- } else if (*new_cr == 1) {
- ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups);
+ if (*new_cr == CR_POWER2_ALIGNED) {
+ ext4_mb_choose_next_group_p2_aligned(ac, new_cr, group, ngroups);
+ } else if (*new_cr == CR_GOAL_LEN_FAST) {
+ ext4_mb_choose_next_group_goal_fast(ac, new_cr, group, ngroups);
+ } else if (*new_cr == CR_BEST_AVAIL_LEN) {
+ ext4_mb_choose_next_group_best_avail(ac, new_cr, group, ngroups);
} else {
/*
* TODO: For CR=2, we can arrange groups in an rb tree sorted by
@@ -2103,6 +2215,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac,
BUG_ON(ac->ac_status != AC_STATUS_CONTINUE);
ac->ac_found++;
+ ac->ac_cX_found[ac->ac_criteria]++;
/*
* The special case - take what you catch first
@@ -2207,11 +2320,11 @@ int ext4_mb_find_by_goal(struct ext4_allocation_context *ac,
ac->ac_g_ex.fe_len, &ex);
ex.fe_logical = 0xDEADFA11; /* debug value */
- if (max >= ac->ac_g_ex.fe_len && ac->ac_g_ex.fe_len == sbi->s_stripe) {
+ if (max >= ac->ac_g_ex.fe_len &&
+ ac->ac_g_ex.fe_len == EXT4_B2C(sbi, sbi->s_stripe)) {
ext4_fsblk_t start;
- start = ext4_group_first_block_no(ac->ac_sb, e4b->bd_group) +
- ex.fe_start;
+ start = ext4_grp_offs_to_block(ac->ac_sb, &ex);
/* use do_div to get remainder (would be 64-bit modulo) */
if (do_div(start, sbi->s_stripe) == 0) {
ac->ac_found++;
@@ -2277,6 +2390,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac,
break;
}
ac->ac_found++;
+ ac->ac_cX_found[ac->ac_criteria]++;
ac->ac_b_ex.fe_len = 1 << i;
ac->ac_b_ex.fe_start = k << i;
@@ -2305,7 +2419,7 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
struct super_block *sb = ac->ac_sb;
void *bitmap = e4b->bd_bitmap;
struct ext4_free_extent ex;
- int i;
+ int i, j, freelen;
int free;
free = e4b->bd_info->bb_free;
@@ -2332,6 +2446,24 @@ void ext4_mb_complex_scan_group(struct ext4_allocation_context *ac,
break;
}
+ if (ac->ac_criteria < CR_FAST) {
+ /*
+ * In CR_GOAL_LEN_FAST and CR_BEST_AVAIL_LEN, we are
+ * sure that this group will have a large enough
+ * continuous free extent, so skip over the smaller free
+ * extents
+ */
+ j = mb_find_next_bit(bitmap,
+ EXT4_CLUSTERS_PER_GROUP(sb), i);
+ freelen = j - i;
+
+ if (freelen < ac->ac_g_ex.fe_len) {
+ i = j;
+ free -= freelen;
+ continue;
+ }
+ }
+
mb_find_extent(e4b, i, ac->ac_g_ex.fe_len, &ex);
if (WARN_ON(ex.fe_len <= 0))
break;
@@ -2373,7 +2505,7 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
struct ext4_free_extent ex;
ext4_fsblk_t first_group_block;
ext4_fsblk_t a;
- ext4_grpblk_t i;
+ ext4_grpblk_t i, stripe;
int max;
BUG_ON(sbi->s_stripe == 0);
@@ -2385,18 +2517,21 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
do_div(a, sbi->s_stripe);
i = (a * sbi->s_stripe) - first_group_block;
+ stripe = EXT4_B2C(sbi, sbi->s_stripe);
+ i = EXT4_B2C(sbi, i);
while (i < EXT4_CLUSTERS_PER_GROUP(sb)) {
if (!mb_test_bit(i, bitmap)) {
- max = mb_find_extent(e4b, i, sbi->s_stripe, &ex);
- if (max >= sbi->s_stripe) {
+ max = mb_find_extent(e4b, i, stripe, &ex);
+ if (max >= stripe) {
ac->ac_found++;
+ ac->ac_cX_found[ac->ac_criteria]++;
ex.fe_logical = 0xDEADF00D; /* debug value */
ac->ac_b_ex = ex;
ext4_mb_use_best_found(ac, e4b);
break;
}
}
- i += sbi->s_stripe;
+ i += stripe;
}
}
@@ -2406,13 +2541,13 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac,
* for the allocation or not.
*/
static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
- ext4_group_t group, int cr)
+ ext4_group_t group, enum criteria cr)
{
ext4_grpblk_t free, fragments;
int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
- BUG_ON(cr < 0 || cr >= 4);
+ BUG_ON(cr < CR_POWER2_ALIGNED || cr >= EXT4_MB_NUM_CRS);
if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp) || !grp))
return false;
@@ -2426,7 +2561,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
return false;
switch (cr) {
- case 0:
+ case CR_POWER2_ALIGNED:
BUG_ON(ac->ac_2order == 0);
/* Avoid using the first bg of a flexgroup for data files */
@@ -2445,15 +2580,16 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
return false;
return true;
- case 1:
+ case CR_GOAL_LEN_FAST:
+ case CR_BEST_AVAIL_LEN:
if ((free / fragments) >= ac->ac_g_ex.fe_len)
return true;
break;
- case 2:
+ case CR_GOAL_LEN_SLOW:
if (free >= ac->ac_g_ex.fe_len)
return true;
break;
- case 3:
+ case CR_ANY_FREE:
return true;
default:
BUG();
@@ -2474,7 +2610,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac,
* out"!
*/
static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
- ext4_group_t group, int cr)
+ ext4_group_t group, enum criteria cr)
{
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
struct super_block *sb = ac->ac_sb;
@@ -2494,7 +2630,7 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
free = grp->bb_free;
if (free == 0)
goto out;
- if (cr <= 2 && free < ac->ac_g_ex.fe_len)
+ if (cr <= CR_FAST && free < ac->ac_g_ex.fe_len)
goto out;
if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp)))
goto out;
@@ -2509,15 +2645,16 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac,
ext4_get_group_desc(sb, group, NULL);
int ret;
- /* cr=0/1 is a very optimistic search to find large
- * good chunks almost for free. If buddy data is not
- * ready, then this optimization makes no sense. But
- * we never skip the first block group in a flex_bg,
- * since this gets used for metadata block allocation,
- * and we want to make sure we locate metadata blocks
- * in the first block group in the flex_bg if possible.
+ /*
+ * cr=CR_POWER2_ALIGNED/CR_GOAL_LEN_FAST is a very optimistic
+ * search to find large good chunks almost for free. If buddy
+ * data is not ready, then this optimization makes no sense. But
+ * we never skip the first block group in a flex_bg, since this
+ * gets used for metadata block allocation, and we want to make
+ * sure we locate metadata blocks in the first block group in
+ * the flex_bg if possible.
*/
- if (cr < 2 &&
+ if (cr < CR_FAST &&
(!sbi->s_log_groups_per_flex ||
((group & ((1 << sbi->s_log_groups_per_flex) - 1)) != 0)) &&
!(ext4_has_group_desc_csum(sb) &&
@@ -2567,9 +2704,7 @@ ext4_group_t ext4_mb_prefetch(struct super_block *sb, ext4_group_t group,
*/
if (gdp && grp && !EXT4_MB_GRP_TEST_AND_SET_READ(grp) &&
EXT4_MB_GRP_NEED_INIT(grp) &&
- ext4_free_group_clusters(sb, gdp) > 0 &&
- !(ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ ext4_free_group_clusters(sb, gdp) > 0 ) {
bh = ext4_read_block_bitmap_nowait(sb, group, true);
if (bh && !IS_ERR(bh)) {
if (!buffer_uptodate(bh) && cnt)
@@ -2610,9 +2745,7 @@ void ext4_mb_prefetch_fini(struct super_block *sb, ext4_group_t group,
grp = ext4_get_group_info(sb, group);
if (grp && gdp && EXT4_MB_GRP_NEED_INIT(grp) &&
- ext4_free_group_clusters(sb, gdp) > 0 &&
- !(ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)))) {
+ ext4_free_group_clusters(sb, gdp) > 0) {
if (ext4_mb_init_group(sb, group, GFP_NOFS))
break;
}
@@ -2623,7 +2756,7 @@ static noinline_for_stack int
ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
{
ext4_group_t prefetch_grp = 0, ngroups, group, i;
- int cr = -1, new_cr;
+ enum criteria new_cr, cr = CR_GOAL_LEN_FAST;
int err = 0, first_err = 0;
unsigned int nr = 0, prefetch_ios = 0;
struct ext4_sb_info *sbi;
@@ -2680,14 +2813,15 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
spin_unlock(&sbi->s_md_lock);
}
- /* Let's just scan groups to find more-less suitable blocks */
- cr = ac->ac_2order ? 0 : 1;
/*
- * cr == 0 try to get exact allocation,
- * cr == 3 try to get anything
+ * Let's just scan groups to find more-less suitable blocks We
+ * start with CR_GOAL_LEN_FAST, unless it is power of 2
+ * aligned, in which case let's do that faster approach first.
*/
+ if (ac->ac_2order)
+ cr = CR_POWER2_ALIGNED;
repeat:
- for (; cr < 4 && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
+ for (; cr < EXT4_MB_NUM_CRS && ac->ac_status == AC_STATUS_CONTINUE; cr++) {
ac->ac_criteria = cr;
/*
* searching for the right group start
@@ -2714,10 +2848,8 @@ repeat:
* spend a lot of time loading imperfect groups
*/
if ((prefetch_grp == group) &&
- (cr > 1 ||
+ (cr >= CR_FAST ||
prefetch_ios < sbi->s_mb_prefetch_limit)) {
- unsigned int curr_ios = prefetch_ios;
-
nr = sbi->s_mb_prefetch;
if (ext4_has_feature_flex_bg(sb)) {
nr = 1 << sbi->s_log_groups_per_flex;
@@ -2726,8 +2858,6 @@ repeat:
}
prefetch_grp = ext4_mb_prefetch(sb, group,
nr, &prefetch_ios);
- if (prefetch_ios == curr_ios)
- nr = 0;
}
/* This now checks without needing the buddy page */
@@ -2756,10 +2886,13 @@ repeat:
}
ac->ac_groups_scanned++;
- if (cr == 0)
+ if (cr == CR_POWER2_ALIGNED)
ext4_mb_simple_scan_group(ac, &e4b);
- else if (cr == 1 && sbi->s_stripe &&
- !(ac->ac_g_ex.fe_len % sbi->s_stripe))
+ else if ((cr == CR_GOAL_LEN_FAST ||
+ cr == CR_BEST_AVAIL_LEN) &&
+ sbi->s_stripe &&
+ !(ac->ac_g_ex.fe_len %
+ EXT4_B2C(sbi, sbi->s_stripe)))
ext4_mb_scan_aligned(ac, &e4b);
else
ext4_mb_complex_scan_group(ac, &e4b);
@@ -2773,6 +2906,11 @@ repeat:
/* Processed all groups and haven't found blocks */
if (sbi->s_mb_stats && i == ngroups)
atomic64_inc(&sbi->s_bal_cX_failed[cr]);
+
+ if (i == ngroups && ac->ac_criteria == CR_BEST_AVAIL_LEN)
+ /* Reset goal length to original goal length before
+ * falling into CR_GOAL_LEN_SLOW */
+ ac->ac_g_ex.fe_len = ac->ac_orig_goal_len;
}
if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND &&
@@ -2798,7 +2936,7 @@ repeat:
ac->ac_b_ex.fe_len = 0;
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_flags |= EXT4_MB_HINT_FIRST;
- cr = 3;
+ cr = CR_ANY_FREE;
goto repeat;
}
}
@@ -2914,51 +3052,94 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
seq_puts(seq, "mballoc:\n");
if (!sbi->s_mb_stats) {
seq_puts(seq, "\tmb stats collection turned off.\n");
- seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
+ seq_puts(
+ seq,
+ "\tTo enable, please write \"1\" to sysfs file mb_stats.\n");
return 0;
}
seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs));
seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success));
- seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned));
-
- seq_puts(seq, "\tcr0_stats:\n");
- seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0]));
- seq_printf(seq, "\t\tgroups_considered: %llu\n",
- atomic64_read(&sbi->s_bal_cX_groups_considered[0]));
+ seq_printf(seq, "\tgroups_scanned: %u\n",
+ atomic_read(&sbi->s_bal_groups_scanned));
+
+ /* CR_POWER2_ALIGNED stats */
+ seq_puts(seq, "\tcr_p2_aligned_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_hits[CR_POWER2_ALIGNED]));
+ seq_printf(
+ seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(
+ &sbi->s_bal_cX_groups_considered[CR_POWER2_ALIGNED]));
+ seq_printf(seq, "\t\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_cX_ex_scanned[CR_POWER2_ALIGNED]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
- atomic64_read(&sbi->s_bal_cX_failed[0]));
+ atomic64_read(&sbi->s_bal_cX_failed[CR_POWER2_ALIGNED]));
seq_printf(seq, "\t\tbad_suggestions: %u\n",
- atomic_read(&sbi->s_bal_cr0_bad_suggestions));
+ atomic_read(&sbi->s_bal_p2_aligned_bad_suggestions));
- seq_puts(seq, "\tcr1_stats:\n");
- seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1]));
+ /* CR_GOAL_LEN_FAST stats */
+ seq_puts(seq, "\tcr_goal_fast_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_FAST]));
seq_printf(seq, "\t\tgroups_considered: %llu\n",
- atomic64_read(&sbi->s_bal_cX_groups_considered[1]));
+ atomic64_read(
+ &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_FAST]));
+ seq_printf(seq, "\t\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_FAST]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
- atomic64_read(&sbi->s_bal_cX_failed[1]));
+ atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_FAST]));
seq_printf(seq, "\t\tbad_suggestions: %u\n",
- atomic_read(&sbi->s_bal_cr1_bad_suggestions));
-
- seq_puts(seq, "\tcr2_stats:\n");
- seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2]));
- seq_printf(seq, "\t\tgroups_considered: %llu\n",
- atomic64_read(&sbi->s_bal_cX_groups_considered[2]));
+ atomic_read(&sbi->s_bal_goal_fast_bad_suggestions));
+
+ /* CR_BEST_AVAIL_LEN stats */
+ seq_puts(seq, "\tcr_best_avail_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_hits[CR_BEST_AVAIL_LEN]));
+ seq_printf(
+ seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(
+ &sbi->s_bal_cX_groups_considered[CR_BEST_AVAIL_LEN]));
+ seq_printf(seq, "\t\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_cX_ex_scanned[CR_BEST_AVAIL_LEN]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
- atomic64_read(&sbi->s_bal_cX_failed[2]));
+ atomic64_read(&sbi->s_bal_cX_failed[CR_BEST_AVAIL_LEN]));
+ seq_printf(seq, "\t\tbad_suggestions: %u\n",
+ atomic_read(&sbi->s_bal_best_avail_bad_suggestions));
- seq_puts(seq, "\tcr3_stats:\n");
- seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3]));
+ /* CR_GOAL_LEN_SLOW stats */
+ seq_puts(seq, "\tcr_goal_slow_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_hits[CR_GOAL_LEN_SLOW]));
seq_printf(seq, "\t\tgroups_considered: %llu\n",
- atomic64_read(&sbi->s_bal_cX_groups_considered[3]));
+ atomic64_read(
+ &sbi->s_bal_cX_groups_considered[CR_GOAL_LEN_SLOW]));
+ seq_printf(seq, "\t\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_cX_ex_scanned[CR_GOAL_LEN_SLOW]));
+ seq_printf(seq, "\t\tuseless_loops: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_failed[CR_GOAL_LEN_SLOW]));
+
+ /* CR_ANY_FREE stats */
+ seq_puts(seq, "\tcr_any_free_stats:\n");
+ seq_printf(seq, "\t\thits: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_hits[CR_ANY_FREE]));
+ seq_printf(
+ seq, "\t\tgroups_considered: %llu\n",
+ atomic64_read(&sbi->s_bal_cX_groups_considered[CR_ANY_FREE]));
+ seq_printf(seq, "\t\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_cX_ex_scanned[CR_ANY_FREE]));
seq_printf(seq, "\t\tuseless_loops: %llu\n",
- atomic64_read(&sbi->s_bal_cX_failed[3]));
- seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned));
+ atomic64_read(&sbi->s_bal_cX_failed[CR_ANY_FREE]));
+
+ /* Aggregates */
+ seq_printf(seq, "\textents_scanned: %u\n",
+ atomic_read(&sbi->s_bal_ex_scanned));
seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals));
+ seq_printf(seq, "\t\tlen_goal_hits: %u\n",
+ atomic_read(&sbi->s_bal_len_goals));
seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders));
seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks));
seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks));
-
seq_printf(seq, "\tbuddies_generated: %u/%u\n",
atomic_read(&sbi->s_mb_buddies_generated),
ext4_get_groups_count(sb));
@@ -2966,8 +3147,7 @@ int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset)
atomic64_read(&sbi->s_mb_generation_time));
seq_printf(seq, "\tpreallocated: %u\n",
atomic_read(&sbi->s_mb_preallocated));
- seq_printf(seq, "\tdiscarded: %u\n",
- atomic_read(&sbi->s_mb_discarded));
+ seq_printf(seq, "\tdiscarded: %u\n", atomic_read(&sbi->s_mb_discarded));
return 0;
}
@@ -3454,6 +3634,8 @@ int ext4_mb_init(struct super_block *sb)
sbi->s_mb_stats = MB_DEFAULT_STATS;
sbi->s_mb_stream_request = MB_DEFAULT_STREAM_THRESHOLD;
sbi->s_mb_order2_reqs = MB_DEFAULT_ORDER2_REQS;
+ sbi->s_mb_best_avail_max_trim_order = MB_DEFAULT_BEST_AVAIL_TRIM_ORDER;
+
/*
* The default group preallocation is 512, which for 4k block
* sizes translates to 2 megabytes. However for bigalloc file
@@ -3478,7 +3660,7 @@ int ext4_mb_init(struct super_block *sb)
*/
if (sbi->s_stripe > 1) {
sbi->s_mb_group_prealloc = roundup(
- sbi->s_mb_group_prealloc, sbi->s_stripe);
+ sbi->s_mb_group_prealloc, EXT4_B2C(sbi, sbi->s_stripe));
}
sbi->s_locality_groups = alloc_percpu(struct ext4_locality_group);
@@ -4283,7 +4465,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
(22 - bsbits)) << 22;
size = 4 * 1024 * 1024;
- } else if (NRL_CHECK_SIZE(ac->ac_o_ex.fe_len,
+ } else if (NRL_CHECK_SIZE(EXT4_C2B(sbi, ac->ac_o_ex.fe_len),
(8<<20)>>bsbits, max, 8 * 1024)) {
start_off = ((loff_t)ac->ac_o_ex.fe_logical >>
(23 - bsbits)) << 23;
@@ -4357,6 +4539,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac,
* placement or satisfy big request as is */
ac->ac_g_ex.fe_logical = start;
ac->ac_g_ex.fe_len = EXT4_NUM_B2C(sbi, size);
+ ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;
/* define goal start in order to merge */
if (ar->pright && (ar->lright == (start + size)) &&
@@ -4390,11 +4573,20 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac)
atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated);
if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len)
atomic_inc(&sbi->s_bal_success);
+
atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned);
+ for (int i=0; i<EXT4_MB_NUM_CRS; i++) {
+ atomic_add(ac->ac_cX_found[i], &sbi->s_bal_cX_ex_scanned[i]);
+ }
+
atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned);
if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start &&
ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group)
atomic_inc(&sbi->s_bal_goals);
+ /* did we allocate as much as normalizer originally wanted? */
+ if (ac->ac_f_ex.fe_len == ac->ac_orig_goal_len)
+ atomic_inc(&sbi->s_bal_len_goals);
+
if (ac->ac_found > sbi->s_mb_max_to_scan)
atomic_inc(&sbi->s_bal_breaks);
}
@@ -4529,6 +4721,37 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block,
}
/*
+ * check if found pa meets EXT4_MB_HINT_GOAL_ONLY
+ */
+static bool
+ext4_mb_pa_goal_check(struct ext4_allocation_context *ac,
+ struct ext4_prealloc_space *pa)
+{
+ struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
+ ext4_fsblk_t start;
+
+ if (likely(!(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY)))
+ return true;
+
+ /*
+ * If EXT4_MB_HINT_GOAL_ONLY is set, ac_g_ex will not be adjusted
+ * in ext4_mb_normalize_request and will keep same with ac_o_ex
+ * from ext4_mb_initialize_context. Choose ac_g_ex here to keep
+ * consistent with ext4_mb_find_by_goal.
+ */
+ start = pa->pa_pstart +
+ (ac->ac_g_ex.fe_logical - pa->pa_lstart);
+ if (ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex) != start)
+ return false;
+
+ if (ac->ac_g_ex.fe_len > pa->pa_len -
+ EXT4_B2C(sbi, ac->ac_g_ex.fe_logical - pa->pa_lstart))
+ return false;
+
+ return true;
+}
+
+/*
* search goal blocks in preallocated space
*/
static noinline_for_stack bool
@@ -4578,11 +4801,11 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
/* found preallocated blocks, use them */
spin_lock(&tmp_pa->pa_lock);
- if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free) {
+ if (tmp_pa->pa_deleted == 0 && tmp_pa->pa_free &&
+ likely(ext4_mb_pa_goal_check(ac, tmp_pa))) {
atomic_inc(&tmp_pa->pa_count);
ext4_mb_use_inode_pa(ac, tmp_pa);
spin_unlock(&tmp_pa->pa_lock);
- ac->ac_criteria = 10;
read_unlock(&ei->i_prealloc_lock);
return true;
}
@@ -4625,7 +4848,6 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
}
if (cpa) {
ext4_mb_use_group_pa(ac, cpa);
- ac->ac_criteria = 20;
return true;
}
return false;
@@ -4849,7 +5071,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
pa = ac->ac_pa;
- if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) {
+ if (ac->ac_b_ex.fe_len < ac->ac_orig_goal_len) {
int new_bex_start;
int new_bex_end;
@@ -4864,14 +5086,14 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
* fragmentation in check while ensuring logical range of best
* extent doesn't overflow out of goal extent:
*
- * 1. Check if best ex can be kept at end of goal and still
- * cover original start
+ * 1. Check if best ex can be kept at end of goal (before
+ * cr_best_avail trimmed it) and still cover original start
* 2. Else, check if best ex can be kept at start of goal and
* still cover original start
* 3. Else, keep the best ex at start of original request.
*/
new_bex_end = ac->ac_g_ex.fe_logical +
- EXT4_C2B(sbi, ac->ac_g_ex.fe_len);
+ EXT4_C2B(sbi, ac->ac_orig_goal_len);
new_bex_start = new_bex_end - EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
if (ac->ac_o_ex.fe_logical >= new_bex_start)
goto adjust_bex;
@@ -4892,7 +5114,7 @@ adjust_bex:
BUG_ON(ac->ac_o_ex.fe_logical < ac->ac_b_ex.fe_logical);
BUG_ON(ac->ac_o_ex.fe_len > ac->ac_b_ex.fe_len);
BUG_ON(new_bex_end > (ac->ac_g_ex.fe_logical +
- EXT4_C2B(sbi, ac->ac_g_ex.fe_len)));
+ EXT4_C2B(sbi, ac->ac_orig_goal_len)));
}
pa->pa_lstart = ac->ac_b_ex.fe_logical;
@@ -5399,6 +5621,10 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
(unsigned long)ac->ac_b_ex.fe_logical,
(int)ac->ac_criteria);
mb_debug(sb, "%u found", ac->ac_found);
+ mb_debug(sb, "used pa: %s, ", ac->ac_pa ? "yes" : "no");
+ if (ac->ac_pa)
+ mb_debug(sb, "pa_type %s\n", ac->ac_pa->pa_type == MB_GROUP_PA ?
+ "group pa" : "inode pa");
ext4_mb_show_pa(sb);
}
#else
@@ -5508,6 +5734,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac,
ac->ac_o_ex.fe_start = block;
ac->ac_o_ex.fe_len = len;
ac->ac_g_ex = ac->ac_o_ex;
+ ac->ac_orig_goal_len = ac->ac_g_ex.fe_len;
ac->ac_flags = ar->flags;
/* we have to define context: we'll work with a file or
@@ -5751,8 +5978,72 @@ out_dbg:
return ret;
}
-static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
- struct ext4_allocation_request *ar, int *errp);
+/*
+ * Simple allocator for Ext4 fast commit replay path. It searches for blocks
+ * linearly starting at the goal block and also excludes the blocks which
+ * are going to be in use after fast commit replay.
+ */
+static ext4_fsblk_t
+ext4_mb_new_blocks_simple(struct ext4_allocation_request *ar, int *errp)
+{
+ struct buffer_head *bitmap_bh;
+ struct super_block *sb = ar->inode->i_sb;
+ struct ext4_sb_info *sbi = EXT4_SB(sb);
+ ext4_group_t group, nr;
+ ext4_grpblk_t blkoff;
+ ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
+ ext4_grpblk_t i = 0;
+ ext4_fsblk_t goal, block;
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ goal = ar->goal;
+ if (goal < le32_to_cpu(es->s_first_data_block) ||
+ goal >= ext4_blocks_count(es))
+ goal = le32_to_cpu(es->s_first_data_block);
+
+ ar->len = 0;
+ ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
+ for (nr = ext4_get_groups_count(sb); nr > 0; nr--) {
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ *errp = PTR_ERR(bitmap_bh);
+ pr_warn("Failed to read block bitmap\n");
+ return 0;
+ }
+
+ while (1) {
+ i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
+ blkoff);
+ if (i >= max)
+ break;
+ if (ext4_fc_replay_check_excluded(sb,
+ ext4_group_first_block_no(sb, group) +
+ EXT4_C2B(sbi, i))) {
+ blkoff = i + 1;
+ } else
+ break;
+ }
+ brelse(bitmap_bh);
+ if (i < max)
+ break;
+
+ if (++group >= ext4_get_groups_count(sb))
+ group = 0;
+
+ blkoff = 0;
+ }
+
+ if (i >= max) {
+ *errp = -ENOSPC;
+ return 0;
+ }
+
+ block = ext4_group_first_block_no(sb, group) + EXT4_C2B(sbi, i);
+ ext4_mb_mark_bb(sb, block, 1, 1);
+ ar->len = 1;
+
+ return block;
+}
/*
* Main entry point into mballoc to allocate blocks
@@ -5777,7 +6068,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
trace_ext4_request_blocks(ar);
if (sbi->s_mount_state & EXT4_FC_REPLAY)
- return ext4_mb_new_blocks_simple(handle, ar, errp);
+ return ext4_mb_new_blocks_simple(ar, errp);
/* Allow to use superuser reservation for quota file */
if (ext4_is_quota_file(ar->inode))
@@ -6001,68 +6292,6 @@ ext4_mb_free_metadata(handle_t *handle, struct ext4_buddy *e4b,
spin_unlock(&sbi->s_md_lock);
}
-/*
- * Simple allocator for Ext4 fast commit replay path. It searches for blocks
- * linearly starting at the goal block and also excludes the blocks which
- * are going to be in use after fast commit replay.
- */
-static ext4_fsblk_t ext4_mb_new_blocks_simple(handle_t *handle,
- struct ext4_allocation_request *ar, int *errp)
-{
- struct buffer_head *bitmap_bh;
- struct super_block *sb = ar->inode->i_sb;
- ext4_group_t group;
- ext4_grpblk_t blkoff;
- ext4_grpblk_t max = EXT4_CLUSTERS_PER_GROUP(sb);
- ext4_grpblk_t i = 0;
- ext4_fsblk_t goal, block;
- struct ext4_super_block *es = EXT4_SB(sb)->s_es;
-
- goal = ar->goal;
- if (goal < le32_to_cpu(es->s_first_data_block) ||
- goal >= ext4_blocks_count(es))
- goal = le32_to_cpu(es->s_first_data_block);
-
- ar->len = 0;
- ext4_get_group_no_and_offset(sb, goal, &group, &blkoff);
- for (; group < ext4_get_groups_count(sb); group++) {
- bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (IS_ERR(bitmap_bh)) {
- *errp = PTR_ERR(bitmap_bh);
- pr_warn("Failed to read block bitmap\n");
- return 0;
- }
-
- while (1) {
- i = mb_find_next_zero_bit(bitmap_bh->b_data, max,
- blkoff);
- if (i >= max)
- break;
- if (ext4_fc_replay_check_excluded(sb,
- ext4_group_first_block_no(sb, group) + i)) {
- blkoff = i + 1;
- } else
- break;
- }
- brelse(bitmap_bh);
- if (i < max)
- break;
-
- blkoff = 0;
- }
-
- if (group >= ext4_get_groups_count(sb) || i >= max) {
- *errp = -ENOSPC;
- return 0;
- }
-
- block = ext4_group_first_block_no(sb, group) + i;
- ext4_mb_mark_bb(sb, block, 1, 1);
- ar->len = 1;
-
- return block;
-}
-
static void ext4_free_blocks_simple(struct inode *inode, ext4_fsblk_t block,
unsigned long count)
{
@@ -6243,8 +6472,8 @@ do_more:
* them with group lock_held
*/
if (test_opt(sb, DISCARD)) {
- err = ext4_issue_discard(sb, block_group, bit, count,
- NULL);
+ err = ext4_issue_discard(sb, block_group, bit,
+ count_clusters, NULL);
if (err && err != -EOPNOTSUPP)
ext4_msg(sb, KERN_WARNING, "discard request in"
" group:%u block:%d count:%lu failed"
@@ -6328,12 +6557,6 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
sbi = EXT4_SB(sb);
- if (sbi->s_mount_state & EXT4_FC_REPLAY) {
- ext4_free_blocks_simple(inode, block, count);
- return;
- }
-
- might_sleep();
if (bh) {
if (block)
BUG_ON(block != bh->b_blocknr);
@@ -6341,6 +6564,13 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode,
block = bh->b_blocknr;
}
+ if (sbi->s_mount_state & EXT4_FC_REPLAY) {
+ ext4_free_blocks_simple(inode, block, EXT4_NUM_B2C(sbi, count));
+ return;
+ }
+
+ might_sleep();
+
if (!(flags & EXT4_FREE_BLOCKS_VALIDATED) &&
!ext4_inode_block_valid(inode, block, count)) {
ext4_error(sb, "Freeing blocks not in datazone - "
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 6d85ee8674a6..df6b5e7c2274 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -49,7 +49,7 @@
#define MB_DEFAULT_MIN_TO_SCAN 10
/*
- * with 'ext4_mb_stats' allocator will collect stats that will be
+ * with 's_mb_stats' allocator will collect stats that will be
* shown at umount. The collecting costs though!
*/
#define MB_DEFAULT_STATS 0
@@ -86,6 +86,13 @@
#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16
/*
+ * The maximum order upto which CR_BEST_AVAIL_LEN can trim a particular
+ * allocation request. Example, if we have an order 7 request and max trim order
+ * of 3, we can trim this request upto order 4.
+ */
+#define MB_DEFAULT_BEST_AVAIL_TRIM_ORDER 3
+
+/*
* Number of valid buddy orders
*/
#define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2)
@@ -179,11 +186,18 @@ struct ext4_allocation_context {
/* copy of the best found extent taken before preallocation efforts */
struct ext4_free_extent ac_f_ex;
+ /*
+ * goal len can change in CR1.5, so save the original len. This is
+ * used while adjusting the PA window and for accounting.
+ */
+ ext4_grpblk_t ac_orig_goal_len;
+
__u32 ac_groups_considered;
__u32 ac_flags; /* allocation hints */
__u16 ac_groups_scanned;
__u16 ac_groups_linear_remaining;
__u16 ac_found;
+ __u16 ac_cX_found[EXT4_MB_NUM_CRS];
__u16 ac_tail;
__u16 ac_buddy;
__u8 ac_status;
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 6f46823fba61..3e7d160f543f 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -334,7 +334,7 @@ int ext4_mpage_readpages(struct inode *inode,
folio_size(folio));
if (first_hole == 0) {
if (ext4_need_verity(inode, folio->index) &&
- !fsverity_verify_page(&folio->page))
+ !fsverity_verify_folio(folio))
goto set_error_page;
folio_mark_uptodate(folio);
folio_unlock(folio);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index eaa5858d5285..c94ebf704616 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1133,6 +1133,12 @@ static void ext4_blkdev_remove(struct ext4_sb_info *sbi)
struct block_device *bdev;
bdev = sbi->s_journal_bdev;
if (bdev) {
+ /*
+ * Invalidate the journal device's buffers. We don't want them
+ * floating about in memory - the physical journal device may
+ * hotswapped, and it breaks the `ro-after' testing code.
+ */
+ invalidate_bdev(bdev);
blkdev_put(bdev, sbi->s_sb);
sbi->s_journal_bdev = NULL;
}
@@ -1164,12 +1170,12 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
#ifdef CONFIG_QUOTA
static int ext4_quota_off(struct super_block *sb, int type);
-static inline void ext4_quota_off_umount(struct super_block *sb)
+static inline void ext4_quotas_off(struct super_block *sb, int type)
{
- int type;
+ BUG_ON(type > EXT4_MAXQUOTAS);
/* Use our quota_off function to clear inode flags etc. */
- for (type = 0; type < EXT4_MAXQUOTAS; type++)
+ for (type--; type >= 0; type--)
ext4_quota_off(sb, type);
}
@@ -1185,7 +1191,7 @@ static inline char *get_qf_name(struct super_block *sb,
lockdep_is_held(&sb->s_umount));
}
#else
-static inline void ext4_quota_off_umount(struct super_block *sb)
+static inline void ext4_quotas_off(struct super_block *sb, int type)
{
}
#endif
@@ -1285,7 +1291,7 @@ static void ext4_put_super(struct super_block *sb)
&sb->s_uuid);
ext4_unregister_li_request(sb);
- ext4_quota_off_umount(sb);
+ ext4_quotas_off(sb, EXT4_MAXQUOTAS);
flush_work(&sbi->s_error_work);
destroy_workqueue(sbi->rsv_conversion_wq);
@@ -1332,14 +1338,8 @@ static void ext4_put_super(struct super_block *sb)
sync_blockdev(sb->s_bdev);
invalidate_bdev(sb->s_bdev);
- if (sbi->s_journal_bdev && sbi->s_journal_bdev != sb->s_bdev) {
- /*
- * Invalidate the journal device's buffers. We don't want them
- * floating about in memory - the physical journal device may
- * hotswapped, and it breaks the `ro-after' testing code.
- */
+ if (sbi->s_journal_bdev) {
sync_blockdev(sbi->s_journal_bdev);
- invalidate_bdev(sbi->s_journal_bdev);
ext4_blkdev_remove(sbi);
}
@@ -3703,16 +3703,13 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
ext4_group_t group = elr->lr_next_group;
unsigned int prefetch_ios = 0;
int ret = 0;
+ int nr = EXT4_SB(sb)->s_mb_prefetch;
u64 start_time;
if (elr->lr_mode == EXT4_LI_MODE_PREFETCH_BBITMAP) {
- elr->lr_next_group = ext4_mb_prefetch(sb, group,
- EXT4_SB(sb)->s_mb_prefetch, &prefetch_ios);
- if (prefetch_ios)
- ext4_mb_prefetch_fini(sb, elr->lr_next_group,
- prefetch_ios);
- trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group,
- prefetch_ios);
+ elr->lr_next_group = ext4_mb_prefetch(sb, group, nr, &prefetch_ios);
+ ext4_mb_prefetch_fini(sb, elr->lr_next_group, nr);
+ trace_ext4_prefetch_bitmaps(sb, group, elr->lr_next_group, nr);
if (group >= elr->lr_next_group) {
ret = 1;
if (elr->lr_first_not_zeroed != ngroups &&
@@ -5308,6 +5305,19 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
goto failed_mount3;
sbi->s_stripe = ext4_get_stripe_size(sbi);
+ /*
+ * It's hard to get stripe aligned blocks if stripe is not aligned with
+ * cluster, just disable stripe and alert user to simpfy code and avoid
+ * stripe aligned allocation which will rarely successes.
+ */
+ if (sbi->s_stripe > 0 && sbi->s_cluster_ratio > 1 &&
+ sbi->s_stripe % sbi->s_cluster_ratio != 0) {
+ ext4_msg(sb, KERN_WARNING,
+ "stripe (%lu) is not aligned with cluster size (%u), "
+ "stripe is disabled",
+ sbi->s_stripe, sbi->s_cluster_ratio);
+ sbi->s_stripe = 0;
+ }
sbi->s_extent_max_zeroout_kb = 32;
/*
@@ -5578,7 +5588,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
ext4_msg(sb, KERN_INFO, "recovery complete");
err = ext4_mark_recovery_complete(sb, es);
if (err)
- goto failed_mount9;
+ goto failed_mount10;
}
if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev))
@@ -5597,7 +5607,9 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
return 0;
-failed_mount9:
+failed_mount10:
+ ext4_quotas_off(sb, EXT4_MAXQUOTAS);
+failed_mount9: __maybe_unused
ext4_release_orphan_info(sb);
failed_mount8:
ext4_unregister_sysfs(sb);
@@ -5656,6 +5668,7 @@ failed_mount:
brelse(sbi->s_sbh);
ext4_blkdev_remove(sbi);
out_fail:
+ invalidate_bdev(sb->s_bdev);
sb->s_fs_info = NULL;
return err;
}
@@ -5738,6 +5751,11 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal)
journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR;
else
journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR;
+ /*
+ * Always enable journal cycle record option, letting the journal
+ * records log transactions continuously between each mount.
+ */
+ journal->j_flags |= JBD2_CYCLE_RECORD;
write_unlock(&journal->j_state_lock);
}
@@ -5990,19 +6008,27 @@ static int ext4_load_journal(struct super_block *sb,
err = jbd2_journal_wipe(journal, !really_read_only);
if (!err) {
char *save = kmalloc(EXT4_S_ERR_LEN, GFP_KERNEL);
+ __le16 orig_state;
+ bool changed = false;
if (save)
memcpy(save, ((char *) es) +
EXT4_S_ERR_START, EXT4_S_ERR_LEN);
err = jbd2_journal_load(journal);
- if (save)
+ if (save && memcmp(((char *) es) + EXT4_S_ERR_START,
+ save, EXT4_S_ERR_LEN)) {
memcpy(((char *) es) + EXT4_S_ERR_START,
save, EXT4_S_ERR_LEN);
+ changed = true;
+ }
kfree(save);
+ orig_state = es->s_state;
es->s_state |= cpu_to_le16(EXT4_SB(sb)->s_mount_state &
EXT4_ERROR_FS);
+ if (orig_state != es->s_state)
+ changed = true;
/* Write out restored error information to the superblock */
- if (!bdev_read_only(sb->s_bdev)) {
+ if (changed && !really_read_only) {
int err2;
err2 = ext4_commit_super(sb);
err = err ? : err2;
@@ -7037,20 +7063,8 @@ int ext4_enable_quotas(struct super_block *sb)
"(type=%d, err=%d, ino=%lu). "
"Please run e2fsck to fix.", type,
err, qf_inums[type]);
- for (type--; type >= 0; type--) {
- struct inode *inode;
-
- inode = sb_dqopt(sb)->files[type];
- if (inode)
- inode = igrab(inode);
- dquot_quota_off(sb, type);
- if (inode) {
- lockdep_set_quota_inode(inode,
- I_DATA_SEM_NORMAL);
- iput(inode);
- }
- }
+ ext4_quotas_off(sb, type);
return err;
}
}
diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c
index 3042bc605bbf..6d332dff79dd 100644
--- a/fs/ext4/sysfs.c
+++ b/fs/ext4/sysfs.c
@@ -223,6 +223,7 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int
EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
+EXT4_RW_ATTR_SBI_UI(mb_best_avail_max_trim_order, s_mb_best_avail_max_trim_order);
#ifdef CONFIG_EXT4_DEBUG
EXT4_RW_ATTR_SBI_UL(simulate_fail, s_simulate_fail);
#endif
@@ -273,6 +274,7 @@ static struct attribute *ext4_attrs[] = {
ATTR_LIST(warning_ratelimit_burst),
ATTR_LIST(msg_ratelimit_interval_ms),
ATTR_LIST(msg_ratelimit_burst),
+ ATTR_LIST(mb_best_avail_max_trim_order),
ATTR_LIST(errors_count),
ATTR_LIST(warning_count),
ATTR_LIST(msg_count),