diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-06 01:19:28 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2020-06-06 01:19:28 +0200 |
commit | 0b166a57e6222666292a481b742af92b50c3ba50 (patch) | |
tree | fdc7e8b02ecf67366f783c361dbde9bcf07f4ec0 | |
parent | Merge tag 'for-5.8/dm-changes' of git://git.kernel.org/pub/scm/linux/kernel/g... (diff) | |
parent | ext4: avoid unnecessary transaction starts during writeback (diff) | |
download | linux-0b166a57e6222666292a481b742af92b50c3ba50.tar.xz linux-0b166a57e6222666292a481b742af92b50c3ba50.zip |
Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4
Pull ext4 updates from Ted Ts'o:
"A lot of bug fixes and cleanups for ext4, including:
- Fix performance problems found in dioread_nolock now that it is the
default, caused by transaction leaks.
- Clean up fiemap handling in ext4
- Clean up and refactor multiple block allocator (mballoc) code
- Fix a problem with mballoc with a smaller file systems running out
of blocks because they couldn't properly use blocks that had been
reserved by inode preallocation.
- Fixed a race in ext4_sync_parent() versus rename()
- Simplify the error handling in the extent manipulation code
- Make sure all metadata I/O errors are felected to
ext4_ext_dirty()'s and ext4_make_inode_dirty()'s callers.
- Avoid passing an error pointer to brelse in ext4_xattr_set()
- Fix race which could result to freeing an inode on the dirty last
in data=journal mode.
- Fix refcount handling if ext4_iget() fails
- Fix a crash in generic/019 caused by a corrupted extent node"
* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (58 commits)
ext4: avoid unnecessary transaction starts during writeback
ext4: don't block for O_DIRECT if IOCB_NOWAIT is set
ext4: remove the access_ok() check in ext4_ioctl_get_es_cache
fs: remove the access_ok() check in ioctl_fiemap
fs: handle FIEMAP_FLAG_SYNC in fiemap_prep
fs: move fiemap range validation into the file systems instances
iomap: fix the iomap_fiemap prototype
fs: move the fiemap definitions out of fs.h
fs: mark __generic_block_fiemap static
ext4: remove the call to fiemap_check_flags in ext4_fiemap
ext4: split _ext4_fiemap
ext4: fix fiemap size checks for bitmap files
ext4: fix EXT4_MAX_LOGICAL_BLOCK macro
add comment for ext4_dir_entry_2 file_type member
jbd2: avoid leaking transaction credits when unreserving handle
ext4: drop ext4_journal_free_reserved()
ext4: mballoc: use lock for checking free blocks while retrying
ext4: mballoc: refactor ext4_mb_good_group()
ext4: mballoc: introduce pcpu seqcnt for freeing PA to improve ENOSPC handling
ext4: mballoc: refactor ext4_mb_discard_preallocations()
...
47 files changed, 905 insertions, 663 deletions
diff --git a/Documentation/filesystems/fiemap.rst b/Documentation/filesystems/fiemap.rst index 2a572e7edc08..93fc96f760aa 100644 --- a/Documentation/filesystems/fiemap.rst +++ b/Documentation/filesystems/fiemap.rst @@ -206,16 +206,18 @@ EINTR once fatal signal received. Flag checking should be done at the beginning of the ->fiemap callback via the -fiemap_check_flags() helper:: +fiemap_prep() helper:: - int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); + int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 *len, u32 supported_flags); The struct fieinfo should be passed in as received from ioctl_fiemap(). The set of fiemap flags which the fs understands should be passed via fs_flags. If -fiemap_check_flags finds invalid user flags, it will place the bad values in +fiemap_prep finds invalid user flags, it will place the bad values in fieinfo->fi_flags and return -EBADR. If the file system gets -EBADR, from -fiemap_check_flags(), it should immediately exit, returning that error back to -ioctl_fiemap(). +fiemap_prep(), it should immediately exit, returning that error back to +ioctl_fiemap(). Additionally the range is validate against the supported +maximum file size. For each extent in the request range, the file system should call diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 8035d2a44561..54f0ce444272 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -15,6 +15,7 @@ #include <linux/time.h> #include <linux/namei.h> #include <linux/poll.h> +#include <linux/fiemap.h> static int bad_file_open(struct inode *inode, struct file *filp) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 602bf3af9fb4..87f60a48f750 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -5,6 +5,7 @@ #include <linux/rbtree.h> #include <linux/refcount.h> +#include <linux/fiemap.h> #include "ulist.h" /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 768c8be4c765..31ac8c682f19 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7828,14 +7828,12 @@ const struct iomap_dio_ops btrfs_dops = { .submit_io = btrfs_submit_direct, }; -#define BTRFS_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) - static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { int ret; - ret = fiemap_check_flags(fieinfo, BTRFS_FIEMAP_FLAGS); + ret = fiemap_prep(inode, fieinfo, start, &len, 0); if (ret) return ret; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 5d2965a23730..115d28e7c219 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -25,6 +25,7 @@ #include <linux/freezer.h> #include <linux/sched/signal.h> #include <linux/wait_bit.h> +#include <linux/fiemap.h> #include <asm/div64.h> #include "cifsfs.h" diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f829f4165d38..300ade2acc41 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -12,6 +12,7 @@ #include <linux/uuid.h> #include <linux/sort.h> #include <crypto/aead.h> +#include <linux/fiemap.h> #include "cifsfs.h" #include "cifsglob.h" #include "smb2pdu.h" @@ -3407,8 +3408,9 @@ static int smb3_fiemap(struct cifs_tcon *tcon, int i, num, rc, flags, last_blob; u64 next; - if (fiemap_check_flags(fei, FIEMAP_FLAG_SYNC)) - return -EBADR; + rc = fiemap_prep(d_inode(cfile->dentry), fei, start, &len, 0); + if (rc) + return rc; xid = get_xid(); again: diff --git a/fs/ext2/inode.c b/fs/ext2/inode.c index 2875c0a705b5..c8b371c82b4f 100644 --- a/fs/ext2/inode.c +++ b/fs/ext2/inode.c @@ -36,6 +36,7 @@ #include <linux/iomap.h> #include <linux/namei.h> #include <linux/uio.h> +#include <linux/fiemap.h> #include "ext2.h" #include "acl.h" #include "xattr.h" diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index 2a592e38cdfe..cf9e430514c4 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -99,8 +99,7 @@ config EXT4_DEBUG Enables run-time debugging support for the ext4 filesystem. If you select Y here, then you will be able to turn on debugging - with a command such as: - echo 1 > /sys/module/ext4/parameters/mballoc_debug + using dynamic debug control for mb_debug() / ext_debug() msgs. config EXT4_KUNIT_TESTS tristate "KUnit tests for ext4" diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index 8c7bbf3e566d..76f634d185f1 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -215,9 +215,8 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, value, size, xattr_flags); kfree(value); - if (!error) { + if (!error) set_cached_acl(inode, type, acl); - } return error; } @@ -256,7 +255,7 @@ retry: if (!error && update_mode) { inode->i_mode = mode; inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + error = ext4_mark_inode_dirty(handle, inode); } out_stop: ext4_journal_stop(handle); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index a32e5f7b5385..1ba46d87cdf1 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -903,10 +903,11 @@ ext4_fsblk_t ext4_inode_to_goal_block(struct inode *inode) return bg_start; if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block) - colour = (current->pid % 16) * + colour = (task_pid_nr(current) % 16) * (EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16); else - colour = (current->pid % 16) * ((last_block - bg_start) / 16); + colour = (task_pid_nr(current) % 16) * + ((last_block - bg_start) / 16); return bg_start + colour; } diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 15b062efcff1..b08841f70b69 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -36,6 +36,7 @@ #include <crypto/hash.h> #include <linux/falloc.h> #include <linux/percpu-rwsem.h> +#include <linux/fiemap.h> #ifdef __KERNEL__ #include <linux/compat.h> #endif @@ -80,14 +81,22 @@ #define ext4_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif + /* + * Turn on EXT_DEBUG to enable ext4_ext_show_path/leaf/move in extents.c + */ +#define EXT_DEBUG__ + /* - * Turn on EXT_DEBUG to get lots of info about extents operations. + * Dynamic printk for controlled extents debugging. */ -#define EXT_DEBUG__ -#ifdef EXT_DEBUG -#define ext_debug(fmt, ...) printk(fmt, ##__VA_ARGS__) +#ifdef CONFIG_EXT4_DEBUG +#define ext_debug(ino, fmt, ...) \ + pr_debug("[%s/%d] EXT4-fs (%s): ino %lu: (%s, %d): %s:" fmt, \ + current->comm, task_pid_nr(current), \ + ino->i_sb->s_id, ino->i_ino, __FILE__, __LINE__, \ + __func__, ##__VA_ARGS__) #else -#define ext_debug(fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#define ext_debug(ino, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif /* data type for block offset of block group */ @@ -142,6 +151,8 @@ enum SHIFT_DIRECTION { #define EXT4_MB_USE_ROOT_BLOCKS 0x1000 /* Use blocks from reserved pool */ #define EXT4_MB_USE_RESERVED 0x2000 +/* Do strict check for free blocks while retrying block allocation */ +#define EXT4_MB_STRICT_CHECK 0x4000 struct ext4_allocation_request { /* target inode for block we're allocating */ @@ -171,10 +182,10 @@ struct ext4_allocation_request { * well as to store the information returned by ext4_map_blocks(). It * takes less room on the stack than a struct buffer_head. */ -#define EXT4_MAP_NEW (1 << BH_New) -#define EXT4_MAP_MAPPED (1 << BH_Mapped) -#define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) -#define EXT4_MAP_BOUNDARY (1 << BH_Boundary) +#define EXT4_MAP_NEW BIT(BH_New) +#define EXT4_MAP_MAPPED BIT(BH_Mapped) +#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) +#define EXT4_MAP_BOUNDARY BIT(BH_Boundary) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) @@ -417,7 +428,7 @@ struct flex_groups { /* 0x00400000 was formerly EXT4_EOFBLOCKS_FL */ #define EXT4_INLINE_DATA_FL 0x10000000 /* Inode has inline data. */ #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ -#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded file */ +#define EXT4_CASEFOLD_FL 0x40000000 /* Casefolded directory */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ #define EXT4_FL_USER_VISIBLE 0x705BDFFF /* User visible flags */ @@ -490,6 +501,7 @@ enum { /* 22 was formerly EXT4_INODE_EOFBLOCKS */ EXT4_INODE_INLINE_DATA = 28, /* Data in inode. */ EXT4_INODE_PROJINHERIT = 29, /* Create with parents projid */ + EXT4_INODE_CASEFOLD = 30, /* Casefolded directory */ EXT4_INODE_RESERVED = 31, /* reserved for ext4 lib */ }; @@ -535,6 +547,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(EA_INODE); CHECK_FLAG_VALUE(INLINE_DATA); CHECK_FLAG_VALUE(PROJINHERIT); + CHECK_FLAG_VALUE(CASEFOLD); CHECK_FLAG_VALUE(RESERVED); } @@ -609,8 +622,6 @@ enum { #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 /* Don't normalize allocation size (used for fallocate) */ #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 - /* Request will not result in inode size update (user for fallocate) */ -#define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 /* Convert written extents to unwritten */ #define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 /* Write zeros to newly created written extents */ @@ -632,6 +643,7 @@ enum { */ #define EXT4_EX_NOCACHE 0x40000000 #define EXT4_EX_FORCE_CACHE 0x20000000 +#define EXT4_EX_NOFAIL 0x10000000 /* * Flags used by ext4_free_blocks @@ -2051,7 +2063,7 @@ struct ext4_dir_entry_2 { __le32 inode; /* Inode number */ __le16 rec_len; /* Directory entry length */ __u8 name_len; /* Name length */ - __u8 file_type; + __u8 file_type; /* See file type macros EXT4_FT_* below */ char name[EXT4_NAME_LEN]; /* File name */ }; @@ -3354,7 +3366,7 @@ struct ext4_extent; */ #define EXT_MAX_BLOCKS 0xffffffff -extern int ext4_ext_tree_init(handle_t *handle, struct inode *); +extern void ext4_ext_tree_init(handle_t *handle, struct inode *inode); extern int ext4_ext_index_trans_blocks(struct inode *inode, int extents); extern int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index 1c216fcc202a..44e59881a1f0 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -170,10 +170,13 @@ struct partial_cluster { (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) #define EXT_LAST_INDEX(__hdr__) \ (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1) -#define EXT_MAX_EXTENT(__hdr__) \ - (EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) +#define EXT_MAX_EXTENT(__hdr__) \ + ((le16_to_cpu((__hdr__)->eh_max)) ? \ + ((EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) \ + : 0) #define EXT_MAX_INDEX(__hdr__) \ - (EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1) + ((le16_to_cpu((__hdr__)->eh_max)) ? \ + ((EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)) : 0) static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode) { diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 4b9002f0e84c..00dc668e052b 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -222,7 +222,10 @@ ext4_mark_iloc_dirty(handle_t *handle, int ext4_reserve_inode_write(handle_t *handle, struct inode *inode, struct ext4_iloc *iloc); -int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode); +#define ext4_mark_inode_dirty(__h, __i) \ + __ext4_mark_inode_dirty((__h), (__i), __func__, __LINE__) +int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, + const char *func, unsigned int line); int ext4_expand_extra_isize(struct inode *inode, unsigned int new_extra_isize, @@ -335,12 +338,6 @@ static inline handle_t *__ext4_journal_start(struct inode *inode, handle_t *__ext4_journal_start_reserved(handle_t *handle, unsigned int line, int type); -static inline void ext4_journal_free_reserved(handle_t *handle) -{ - if (ext4_handle_valid(handle)) - jbd2_journal_free_reserved(handle); -} - static inline handle_t *ext4_journal_current_handle(void) { return journal_current_handle(); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 2b4b94542e34..7d088ff1e902 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -297,11 +297,14 @@ ext4_force_split_extent_at(handle_t *handle, struct inode *inode, { struct ext4_ext_path *path = *ppath; int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); + int flags = EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO; + + if (nofail) + flags |= EXT4_GET_BLOCKS_METADATA_NOFAIL | EXT4_EX_NOFAIL; return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, - EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | - (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0)); + flags); } static int @@ -487,8 +490,12 @@ __read_extent_tree_block(const char *function, unsigned int line, { struct buffer_head *bh; int err; + gfp_t gfp_flags = __GFP_MOVABLE | GFP_NOFS; + + if (flags & EXT4_EX_NOFAIL) + gfp_flags |= __GFP_NOFAIL; - bh = sb_getblk_gfp(inode->i_sb, pblk, __GFP_MOVABLE | GFP_NOFS); + bh = sb_getblk_gfp(inode->i_sb, pblk, gfp_flags); if (unlikely(!bh)) return ERR_PTR(-ENOMEM); @@ -600,22 +607,22 @@ static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path) { int k, l = path->p_depth; - ext_debug("path:"); + ext_debug(inode, "path:"); for (k = 0; k <= l; k++, path++) { if (path->p_idx) { - ext_debug(" %d->%llu", + ext_debug(inode, " %d->%llu", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); } else if (path->p_ext) { - ext_debug(" %d:[%d]%d:%llu ", + ext_debug(inode, " %d:[%d]%d:%llu ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_is_unwritten(path->p_ext), ext4_ext_get_actual_len(path->p_ext), ext4_ext_pblock(path->p_ext)); } else - ext_debug(" []"); + ext_debug(inode, " []"); } - ext_debug("\n"); + ext_debug(inode, "\n"); } static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) @@ -631,14 +638,14 @@ static void ext4_ext_show_leaf(struct inode *inode, struct ext4_ext_path *path) eh = path[depth].p_hdr; ex = EXT_FIRST_EXTENT(eh); - ext_debug("Displaying leaf extents for inode %lu\n", inode->i_ino); + ext_debug(inode, "Displaying leaf extents\n"); for (i = 0; i < le16_to_cpu(eh->eh_entries); i++, ex++) { - ext_debug("%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), + ext_debug(inode, "%d:[%d]%d:%llu ", le32_to_cpu(ex->ee_block), ext4_ext_is_unwritten(ex), ext4_ext_get_actual_len(ex), ext4_ext_pblock(ex)); } - ext_debug("\n"); + ext_debug(inode, "\n"); } static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, @@ -651,10 +658,9 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent_idx *idx; idx = path[level].p_idx; while (idx <= EXT_MAX_INDEX(path[level].p_hdr)) { - ext_debug("%d: move %d:%llu in new index %llu\n", level, - le32_to_cpu(idx->ei_block), - ext4_idx_pblock(idx), - newblock); + ext_debug(inode, "%d: move %d:%llu in new index %llu\n", + level, le32_to_cpu(idx->ei_block), + ext4_idx_pblock(idx), newblock); idx++; } @@ -663,7 +669,7 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, ex = path[depth].p_ext; while (ex <= EXT_MAX_EXTENT(path[depth].p_hdr)) { - ext_debug("move %d:%llu:[%d]%d in new leaf %llu\n", + ext_debug(inode, "move %d:%llu:[%d]%d in new leaf %llu\n", le32_to_cpu(ex->ee_block), ext4_ext_pblock(ex), ext4_ext_is_unwritten(ex), @@ -707,7 +713,7 @@ ext4_ext_binsearch_idx(struct inode *inode, struct ext4_extent_idx *r, *l, *m; - ext_debug("binsearch for %u(idx): ", block); + ext_debug(inode, "binsearch for %u(idx): ", block); l = EXT_FIRST_INDEX(eh) + 1; r = EXT_LAST_INDEX(eh); @@ -717,13 +723,13 @@ ext4_ext_binsearch_idx(struct inode *inode, r = m - 1; else l = m + 1; - ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ei_block), - m, le32_to_cpu(m->ei_block), - r, le32_to_cpu(r->ei_block)); + ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, + le32_to_cpu(l->ei_block), m, le32_to_cpu(m->ei_block), + r, le32_to_cpu(r->ei_block)); } path->p_idx = l - 1; - ext_debug(" -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), + ext_debug(inode, " -> %u->%lld ", le32_to_cpu(path->p_idx->ei_block), ext4_idx_pblock(path->p_idx)); #ifdef CHECK_BINSEARCH @@ -774,7 +780,7 @@ ext4_ext_binsearch(struct inode *inode, return; } - ext_debug("binsearch for %u: ", block); + ext_debug(inode, "binsearch for %u: ", block); l = EXT_FIRST_EXTENT(eh) + 1; r = EXT_LAST_EXTENT(eh); @@ -785,13 +791,13 @@ ext4_ext_binsearch(struct inode *inode, r = m - 1; else l = m + 1; - ext_debug("%p(%u):%p(%u):%p(%u) ", l, le32_to_cpu(l->ee_block), - m, le32_to_cpu(m->ee_block), - r, le32_to_cpu(r->ee_block)); + ext_debug(inode, "%p(%u):%p(%u):%p(%u) ", l, + le32_to_cpu(l->ee_block), m, le32_to_cpu(m->ee_block), + r, le32_to_cpu(r->ee_block)); } path->p_ext = l - 1; - ext_debug(" -> %d:%llu:[%d]%d ", + ext_debug(inode, " -> %d:%llu:[%d]%d ", le32_to_cpu(path->p_ext->ee_block), ext4_ext_pblock(path->p_ext), ext4_ext_is_unwritten(path->p_ext), @@ -816,7 +822,7 @@ ext4_ext_binsearch(struct inode *inode, } -int ext4_ext_tree_init(handle_t *handle, struct inode *inode) +void ext4_ext_tree_init(handle_t *handle, struct inode *inode) { struct ext4_extent_header *eh; @@ -826,7 +832,6 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) eh->eh_magic = EXT4_EXT_MAGIC; eh->eh_max = cpu_to_le16(ext4_ext_space_root(inode, 0)); ext4_mark_inode_dirty(handle, inode); - return 0; } struct ext4_ext_path * @@ -838,6 +843,10 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, struct ext4_ext_path *path = orig_path ? *orig_path : NULL; short int depth, i, ppos = 0; int ret; + gfp_t gfp_flags = GFP_NOFS; + + if (flags & EXT4_EX_NOFAIL) + gfp_flags |= __GFP_NOFAIL; eh = ext_inode_hdr(inode); depth = ext_depth(inode); @@ -858,7 +867,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, if (!path) { /* account possible depth increase */ path = kcalloc(depth + 2, sizeof(struct ext4_ext_path), - GFP_NOFS); + gfp_flags); if (unlikely(!path)) return ERR_PTR(-ENOMEM); path[0].p_maxdepth = depth + 1; @@ -871,7 +880,7 @@ ext4_find_extent(struct inode *inode, ext4_lblk_t block, ext4_cache_extents(inode, eh); /* walk through the tree */ while (i) { - ext_debug("depth %d: num %d, max %d\n", + ext_debug(inode, "depth %d: num %d, max %d\n", ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); ext4_ext_binsearch_idx(inode, path + ppos, block); @@ -948,18 +957,20 @@ static int ext4_ext_insert_index(handle_t *handle, struct inode *inode, if (logical > le32_to_cpu(curp->p_idx->ei_block)) { /* insert after */ - ext_debug("insert new index %d after: %llu\n", logical, ptr); + ext_debug(inode, "insert new index %d after: %llu\n", + logical, ptr); ix = curp->p_idx + 1; } else { /* insert before */ - ext_debug("insert new index %d before: %llu\n", logical, ptr); + ext_debug(inode, "insert new index %d before: %llu\n", + logical, ptr); ix = curp->p_idx; } len = EXT_LAST_INDEX(curp->p_hdr) - ix + 1; BUG_ON(len < 0); if (len > 0) { - ext_debug("insert new index %d: " + ext_debug(inode, "insert new index %d: " "move %d indices from 0x%p to 0x%p\n", logical, len, ix, ix + 1); memmove(ix + 1, ix, len * sizeof(struct ext4_extent_idx)); @@ -1008,9 +1019,13 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, ext4_fsblk_t newblock, oldblock; __le32 border; ext4_fsblk_t *ablocks = NULL; /* array of allocated blocks */ + gfp_t gfp_flags = GFP_NOFS; int err = 0; size_t ext_size = 0; + if (flags & EXT4_EX_NOFAIL) + gfp_flags |= __GFP_NOFAIL; + /* make decision: where to split? */ /* FIXME: now decision is simplest: at current extent */ @@ -1022,12 +1037,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, } if (path[depth].p_ext != EXT_MAX_EXTENT(path[depth].p_hdr)) { border = path[depth].p_ext[1].ee_block; - ext_debug("leaf will be split." + ext_debug(inode, "leaf will be split." " next leaf starts at %d\n", le32_to_cpu(border)); } else { border = newext->ee_block; - ext_debug("leaf will be added." + ext_debug(inode, "leaf will be added." " next leaf starts at %d\n", le32_to_cpu(border)); } @@ -1044,12 +1059,12 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, * We need this to handle errors and free blocks * upon them. */ - ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), GFP_NOFS); + ablocks = kcalloc(depth, sizeof(ext4_fsblk_t), gfp_flags); if (!ablocks) return -ENOMEM; /* allocate all needed blocks */ - ext_debug("allocate %d blocks for indexes/leaf\n", depth - at); + ext_debug(inode, "allocate %d blocks for indexes/leaf\n", depth - at); for (a = 0; a < depth - at; a++) { newblock = ext4_ext_new_meta_block(handle, inode, path, newext, &err, flags); @@ -1135,7 +1150,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, goto cleanup; } if (k) - ext_debug("create %d intermediate indices\n", k); + ext_debug(inode, "create %d intermediate indices\n", k); /* insert new index into current index block */ /* current depth stored in i var */ i = depth - 1; @@ -1162,7 +1177,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, fidx->ei_block = border; ext4_idx_store_pblock(fidx, oldblock); - ext_debug("int.index at %d (block %llu): %u -> %llu\n", + ext_debug(inode, "int.index at %d (block %llu): %u -> %llu\n", i, newblock, le32_to_cpu(border), oldblock); /* move remainder of path[i] to the new index block */ @@ -1176,7 +1191,7 @@ static int ext4_ext_split(handle_t *handle, struct inode *inode, } /* start copy indexes */ m = EXT_MAX_INDEX(path[i].p_hdr) - path[i].p_idx++; - ext_debug("cur 0x%p, last 0x%p\n", path[i].p_idx, + ext_debug(inode, "cur 0x%p, last 0x%p\n", path[i].p_idx, EXT_MAX_INDEX(path[i].p_hdr)); ext4_ext_show_move(inode, path, newblock, i); if (m) { @@ -1313,13 +1328,13 @@ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, EXT_FIRST_INDEX(neh)->ei_block = EXT_FIRST_EXTENT(neh)->ee_block; } - ext_debug("new root: num %d(%d), lblock %d, ptr %llu\n", + ext_debug(inode, "new root: num %d(%d), lblock %d, ptr %llu\n", le16_to_cpu(neh->eh_entries), le16_to_cpu(neh->eh_max), le32_to_cpu(EXT_FIRST_INDEX(neh)->ei_block), ext4_idx_pblock(EXT_FIRST_INDEX(neh))); le16_add_cpu(&neh->eh_depth, 1); - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); out: brelse(bh); @@ -1955,7 +1970,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, /* Try to append newex to the ex */ if (ext4_can_extents_be_merged(inode, ex, newext)) { - ext_debug("append [%d]%d block to %u:[%d]%d" + ext_debug(inode, "append [%d]%d block to %u:[%d]%d" "(from %llu)\n", ext4_ext_is_unwritten(newext), ext4_ext_get_actual_len(newext), @@ -1980,7 +1995,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, prepend: /* Try to prepend newex to the ex */ if (ext4_can_extents_be_merged(inode, newext, ex)) { - ext_debug("prepend %u[%d]%d block to %u:[%d]%d" + ext_debug(inode, "prepend %u[%d]%d block to %u:[%d]%d" "(from %llu)\n", le32_to_cpu(newext->ee_block), ext4_ext_is_unwritten(newext), @@ -2018,20 +2033,20 @@ prepend: if (le32_to_cpu(newext->ee_block) > le32_to_cpu(fex->ee_block)) next = ext4_ext_next_leaf_block(path); if (next != EXT_MAX_BLOCKS) { - ext_debug("next leaf block - %u\n", next); + ext_debug(inode, "next leaf block - %u\n", next); BUG_ON(npath != NULL); - npath = ext4_find_extent(inode, next, NULL, 0); + npath = ext4_find_extent(inode, next, NULL, gb_flags); if (IS_ERR(npath)) return PTR_ERR(npath); BUG_ON(npath->p_depth != path->p_depth); eh = npath[depth].p_hdr; if (le16_to_cpu(eh->eh_entries) < le16_to_cpu(eh->eh_max)) { - ext_debug("next leaf isn't full(%d)\n", + ext_debug(inode, "next leaf isn't full(%d)\n", le16_to_cpu(eh->eh_entries)); path = npath; goto has_space; } - ext_debug("next leaf has no free space(%d,%d)\n", + ext_debug(inode, "next leaf has no free space(%d,%d)\n", le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max)); } @@ -2057,7 +2072,7 @@ has_space: if (!nearex) { /* there is no extent in this leaf, create first one */ - ext_debug("first extent in the leaf: %u:%llu:[%d]%d\n", + ext_debug(inode, "first extent in the leaf: %u:%llu:[%d]%d\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), ext4_ext_is_unwritten(newext), @@ -2067,7 +2082,7 @@ has_space: if (le32_to_cpu(newext->ee_block) > le32_to_cpu(nearex->ee_block)) { /* Insert after */ - ext_debug("insert %u:%llu:[%d]%d before: " + ext_debug(inode, "insert %u:%llu:[%d]%d before: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), @@ -2078,7 +2093,7 @@ has_space: } else { /* Insert before */ BUG_ON(newext->ee_block == nearex->ee_block); - ext_debug("insert %u:%llu:[%d]%d after: " + ext_debug(inode, "insert %u:%llu:[%d]%d after: " "nearest %p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), @@ -2088,7 +2103,7 @@ has_space: } len = EXT_LAST_EXTENT(eh) - nearex + 1; if (len > 0) { - ext_debug("insert %u:%llu:[%d]%d: " + ext_debug(inode, "insert %u:%llu:[%d]%d: " "move %d extents from 0x%p to 0x%p\n", le32_to_cpu(newext->ee_block), ext4_ext_pblock(newext), @@ -2232,7 +2247,7 @@ ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start, return; hole_len = min(es.es_lblk - hole_start, hole_len); } - ext_debug(" -> %u:%u\n", hole_start, hole_len); + ext_debug(inode, " -> %u:%u\n", hole_start, hole_len); ext4_es_insert_extent(inode, hole_start, hole_len, ~0, EXTENT_STATUS_HOLE); } @@ -2269,7 +2284,7 @@ static int ext4_ext_rm_idx(handle_t *handle, struct inode *inode, err = ext4_ext_dirty(handle, inode, path); if (err) return err; - ext_debug("index is empty, remove it, free block %llu\n", leaf); + ext_debug(inode, "index is empty, remove it, free block %llu\n", leaf); trace_ext4_ext_rm_idx(inode, leaf); ext4_free_blocks(handle, inode, NULL, leaf, 1, @@ -2548,7 +2563,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ext4_fsblk_t pblk; /* the header must be checked already in ext4_ext_remove_space() */ - ext_debug("truncate since %u in leaf to %u\n", start, end); + ext_debug(inode, "truncate since %u in leaf to %u\n", start, end); if (!path[depth].p_hdr) path[depth].p_hdr = ext_block_hdr(path[depth].p_bh); eh = path[depth].p_hdr; @@ -2574,7 +2589,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, else unwritten = 0; - ext_debug("remove ext %u:[%d]%d\n", ex_ee_block, + ext_debug(inode, "remove ext %u:[%d]%d\n", ex_ee_block, unwritten, ex_ee_len); path[depth].p_ext = ex; @@ -2582,7 +2597,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, b = ex_ee_block+ex_ee_len - 1 < end ? ex_ee_block+ex_ee_len - 1 : end; - ext_debug(" border %u:%u\n", a, b); + ext_debug(inode, " border %u:%u\n", a, b); /* If this extent is beyond the end of the hole, skip it */ if (end < ex_ee_block) { @@ -2691,7 +2706,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (err) goto out; - ext_debug("new extent: %u:%u:%llu\n", ex_ee_block, num, + ext_debug(inode, "new extent: %u:%u:%llu\n", ex_ee_block, num, ext4_ext_pblock(ex)); ex--; ex_ee_block = le32_to_cpu(ex->ee_block); @@ -2768,7 +2783,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, partial.lblk = 0; partial.state = initial; - ext_debug("truncate since %u to %u\n", start, end); + ext_debug(inode, "truncate since %u to %u\n", start, end); /* probably first extent we're gonna free will be last in block */ handle = ext4_journal_start_with_revoke(inode, EXT4_HT_TRUNCATE, @@ -2793,7 +2808,8 @@ again: ext4_fsblk_t pblk; /* find extent for or closest extent to this block */ - path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); + path = ext4_find_extent(inode, end, NULL, + EXT4_EX_NOCACHE | EXT4_EX_NOFAIL); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); @@ -2879,7 +2895,7 @@ again: le16_to_cpu(path[k].p_hdr->eh_entries)+1; } else { path = kcalloc(depth + 1, sizeof(struct ext4_ext_path), - GFP_NOFS); + GFP_NOFS | __GFP_NOFAIL); if (path == NULL) { ext4_journal_stop(handle); return -ENOMEM; @@ -2909,7 +2925,7 @@ again: /* this is index block */ if (!path[i].p_hdr) { - ext_debug("initialize header\n"); + ext_debug(inode, "initialize header\n"); path[i].p_hdr = ext_block_hdr(path[i].p_bh); } @@ -2917,7 +2933,7 @@ again: /* this level hasn't been touched yet */ path[i].p_idx = EXT_LAST_INDEX(path[i].p_hdr); path[i].p_block = le16_to_cpu(path[i].p_hdr->eh_entries)+1; - ext_debug("init index ptr: hdr 0x%p, num %d\n", + ext_debug(inode, "init index ptr: hdr 0x%p, num %d\n", path[i].p_hdr, le16_to_cpu(path[i].p_hdr->eh_entries)); } else { @@ -2925,13 +2941,13 @@ again: path[i].p_idx--; } - ext_debug("level %d - index, first 0x%p, cur 0x%p\n", + ext_debug(inode, "level %d - index, first 0x%p, cur 0x%p\n", i, EXT_FIRST_INDEX(path[i].p_hdr), path[i].p_idx); if (ext4_ext_more_to_rm(path + i)) { struct buffer_head *bh; /* go to the next level */ - ext_debug("move to level %d (block %llu)\n", + ext_debug(inode, "move to level %d (block %llu)\n", i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); bh = read_extent_tree_block(inode, @@ -2967,7 +2983,7 @@ again: brelse(path[i].p_bh); path[i].p_bh = NULL; i--; - ext_debug("return to level %d\n", i); + ext_debug(inode, "return to level %d\n", i); } } @@ -3135,8 +3151,7 @@ static int ext4_split_extent_at(handle_t *handle, BUG_ON((split_flag & (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)) == (EXT4_EXT_DATA_VALID1 | EXT4_EXT_DATA_VALID2)); - ext_debug("ext4_split_extents_at: inode %lu, logical" - "block %llu\n", inode->i_ino, (unsigned long long)split); + ext_debug(inode, "logical block %llu\n", (unsigned long long)split); ext4_ext_show_leaf(inode, path); @@ -3244,6 +3259,10 @@ out: fix_extent_len: ex->ee_len = orig_ex.ee_len; + /* + * Ignore ext4_ext_dirty return value since we are already in error path + * and err is a non-zero error code. + */ ext4_ext_dirty(handle, inode, path + path->p_depth); return err; } @@ -3300,7 +3319,7 @@ static int ext4_split_extent(handle_t *handle, * Update path is required because previous ext4_split_extent_at() may * result in split of original leaf or extent zeroout. */ - path = ext4_find_extent(inode, map->m_lblk, ppath, 0); + path = ext4_find_extent(inode, map->m_lblk, ppath, flags); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); @@ -3369,9 +3388,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, int err = 0; int split_flag = EXT4_EXT_DATA_VALID2; - ext_debug("ext4_ext_convert_to_initialized: inode %lu, logical" - "block %llu, max_blocks %u\n", inode->i_ino, - (unsigned long long)map->m_lblk, map_len); + ext_debug(inode, "logical block %llu, max_blocks %u\n", + (unsigned long long)map->m_lblk, map_len); sbi = EXT4_SB(inode->i_sb); eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) @@ -3503,7 +3521,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } if (allocated) { /* Mark the block containing both extents as dirty */ - ext4_ext_dirty(handle, inode, path + depth); + err = ext4_ext_dirty(handle, inode, path + depth); /* Update path to point to the right extent */ path[depth].p_ext = abut_ex; @@ -3623,8 +3641,7 @@ static int ext4_split_convert_extents(handle_t *handle, unsigned int ee_len; int split_flag = 0, depth; - ext_debug("%s: inode %lu, logical block %llu, max_blocks %u\n", - __func__, inode->i_ino, + ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)map->m_lblk, map->m_len); eof_block = (EXT4_I(inode)->i_disksize + inode->i_sb->s_blocksize - 1) @@ -3670,8 +3687,7 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - ext_debug("ext4_convert_unwritten_extents_endio: inode %lu, logical" - "block %llu, max_blocks %u\n", inode->i_ino, + ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); /* If extent is larger than requested it is a clear sign that we still @@ -3741,8 +3757,7 @@ convert_initialized_extent(handle_t *handle, struct inode *inode, ee_block = le32_to_cpu(ex->ee_block); ee_len = ext4_ext_get_actual_len(ex); - ext_debug("%s: inode %lu, logical" - "block %llu, max_blocks %u\n", __func__, inode->i_ino, + ext_debug(inode, "logical block %llu, max_blocks %u\n", (unsigned long long)ee_block, ee_len); if (ee_block != map->m_lblk || ee_len > map->m_len) { @@ -3794,16 +3809,13 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_ext_path **ppath, int flags, unsigned int allocated, ext4_fsblk_t newblock) { -#ifdef EXT_DEBUG - struct ext4_ext_path *path = *ppath; -#endif + struct ext4_ext_path __maybe_unused *path = *ppath; int ret = 0; int err = 0; - ext_debug("ext4_ext_handle_unwritten_extents: inode %lu, logical " - "block %llu, max_blocks %u, flags %x, allocated %u\n", - inode->i_ino, (unsigned long long)map->m_lblk, map->m_len, - flags, allocated); + ext_debug(inode, "logical block %llu, max_blocks %u, flags 0x%x, allocated %u\n", + (unsigned long long)map->m_lblk, map->m_len, flags, + allocated); ext4_ext_show_leaf(inode, path); /* @@ -3815,39 +3827,38 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, trace_ext4_ext_handle_unwritten_extents(inode, map, flags, allocated, newblock); - /* get_block() before submit the IO, split the extent */ + /* get_block() before submitting IO, split the extent */ if (flags & EXT4_GET_BLOCKS_PRE_IO) { ret = ext4_split_convert_extents(handle, inode, map, ppath, flags | EXT4_GET_BLOCKS_CONVERT); - if (ret <= 0) - goto out; + if (ret < 0) { + err = ret; + goto out2; + } + /* + * shouldn't get a 0 return when splitting an extent unless + * m_len is 0 (bug) or extent has been corrupted + */ + if (unlikely(ret == 0)) { + EXT4_ERROR_INODE(inode, + "unexpected ret == 0, m_len = %u", + map->m_len); + err = -EFSCORRUPTED; + goto out2; + } map->m_flags |= EXT4_MAP_UNWRITTEN; goto out; } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { - if (flags & EXT4_GET_BLOCKS_ZERO) { - if (allocated > map->m_len) - allocated = map->m_len; - err = ext4_issue_zeroout(inode, map->m_lblk, newblock, - allocated); - if (err < 0) - goto out2; - } - ret = ext4_convert_unwritten_extents_endio(handle, inode, map, + err = ext4_convert_unwritten_extents_endio(handle, inode, map, ppath); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); - else - err = ret; - map->m_flags |= EXT4_MAP_MAPPED; - map->m_pblk = newblock; - if (allocated > map->m_len) - allocated = map->m_len; - map->m_len = allocated; - goto out2; + if (err < 0) + goto out2; + ext4_update_inode_fsync_trans(handle, inode, 1); + goto map_out; } - /* buffered IO case */ + /* buffered IO cases */ /* * repeat fallocate creation request * we already have an unwritten extent @@ -3870,29 +3881,39 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, goto out1; } - /* buffered write, writepage time, convert*/ + /* + * Default case when (flags & EXT4_GET_BLOCKS_CREATE) == 1. + * For buffered writes, at writepage time, etc. Convert a + * discovered unwritten extent to written. + */ ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); -out: - if (ret <= 0) { + if (ret < 0) { err = ret; goto out2; - } else - allocated = ret; - map->m_flags |= EXT4_MAP_NEW; - if (allocated > map->m_len) - allocated = map->m_len; - map->m_len = allocated; + } + ext4_update_inode_fsync_trans(handle, inode, 1); + /* + * shouldn't get a 0 return when converting an unwritten extent + * unless m_len is 0 (bug) or extent has been corrupted + */ + if (unlikely(ret == 0)) { + EXT4_ERROR_INODE(inode, "unexpected ret == 0, m_len = %u", + map->m_len); + err = -EFSCORRUPTED; + goto out2; + } +out: + allocated = ret; + map->m_flags |= EXT4_MAP_NEW; map_out: map->m_flags |= EXT4_MAP_MAPPED; out1: + map->m_pblk = newblock; if (allocated > map->m_len) allocated = map->m_len; - ext4_ext_show_leaf(inode, path); - map->m_pblk = newblock; map->m_len = allocated; + ext4_ext_show_leaf(inode, path); out2: return err ? err : allocated; } @@ -4024,15 +4045,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, struct ext4_ext_path *path = NULL; struct ext4_extent newex, *ex, *ex2; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - ext4_fsblk_t newblock = 0; + ext4_fsblk_t newblock = 0, pblk; int err = 0, depth, ret; unsigned int allocated = 0, offset = 0; unsigned int allocated_clusters = 0; struct ext4_allocation_request ar; ext4_lblk_t cluster_offset; - ext_debug("blocks %u/%u requested for inode %lu\n", - map->m_lblk, map->m_len, inode->i_ino); + ext_debug(inode, "blocks %u/%u requested\n", map->m_lblk, map->m_len); trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ @@ -4040,7 +4060,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; - goto out2; + goto out; } depth = ext_depth(inode); @@ -4056,7 +4076,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, (unsigned long) map->m_lblk, depth, path[depth].p_block); err = -EFSCORRUPTED; - goto out2; + goto out; } ex = path[depth].p_ext; @@ -4079,8 +4099,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, newblock = map->m_lblk - ee_block + ee_start; /* number of remaining blocks in the extent */ allocated = ee_len - (map->m_lblk - ee_block); - ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk, - ee_block, ee_len, newblock); + ext_debug(inode, "%u fit into %u:%d -> %llu\n", + map->m_lblk, ee_block, ee_len, newblock); /* * If the extent is initialized check whether the @@ -4090,8 +4110,14 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { err = convert_initialized_extent(handle, inode, map, &path, &allocated); - goto out2; + goto out; } else if (!ext4_ext_is_unwritten(ex)) { + map->m_flags |= EXT4_MAP_MAPPED; + map->m_pblk = newblock; + if (allocated > map->m_len) + allocated = map->m_len; + map->m_len = allocated; + ext4_ext_show_leaf(inode, path); goto out; } @@ -4102,7 +4128,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, err = ret; else allocated = ret; - goto out2; + goto out; } } @@ -4127,7 +4153,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, map->m_pblk = 0; map->m_len = min_t(unsigned int, map->m_len, hole_len); - goto out2; + goto out; } /* @@ -4151,12 +4177,12 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.lleft = map->m_lblk; err = ext4_ext_search_left(inode, path, &ar.lleft, &ar.pleft); if (err) - goto out2; + goto out; ar.lright = map->m_lblk; ex2 = NULL; err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); if (err) - goto out2; + goto out; /* Check if the extent after searching to the right implies a * cluster we can use. */ @@ -4217,17 +4243,18 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.flags |= EXT4_MB_USE_RESERVED; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) - goto out2; - ext_debug("allocate new block: goal %llu, found %llu/%u\n", - ar.goal, newblock, allocated); + goto out; allocated_clusters = ar.len; ar.len = EXT4_C2B(sbi, ar.len) - offset; + ext_debug(inode, "allocate new block: goal %llu, found %llu/%u, requested %u\n", + ar.goal, newblock, ar.len, allocated); if (ar.len > allocated) ar.len = allocated; got_allocated_blocks: /* try to insert new extent into found leaf and return */ - ext4_ext_store_pblock(&newex, newblock + offset); + pblk = newblock + offset; + ext4_ext_store_pblock(&newex, pblk); newex.ee_len = cpu_to_le16(ar.len); /* Mark unwritten */ if (flags & EXT4_GET_BLOCKS_UNWRIT_EXT) { @@ -4252,16 +4279,9 @@ got_allocated_blocks: EXT4_C2B(sbi, allocated_clusters), fb_flags); } - goto out2; + goto out; } - /* previous routine could use block we allocated */ - newblock = ext4_ext_pblock(&newex); - allocated = ext4_ext_get_actual_len(&newex); - if (allocated > map->m_len) - allocated = map->m_len; - map->m_flags |= EXT4_MAP_NEW; - /* * Reduce the reserved cluster count to reflect successful deferred * allocation of delayed allocated clusters or direct allocation of @@ -4307,14 +4327,14 @@ got_allocated_blocks: ext4_update_inode_fsync_trans(handle, inode, 1); else ext4_update_inode_fsync_trans(handle, inode, 0); -out: - if (allocated > map->m_len) - allocated = map->m_len; + + map->m_flags |= (EXT4_MAP_NEW | EXT4_MAP_MAPPED); + map->m_pblk = pblk; + map->m_len = ar.len; + allocated = map->m_len; ext4_ext_show_leaf(inode, path); - map->m_flags |= EXT4_MAP_MAPPED; - map->m_pblk = newblock; - map->m_len = allocated; -out2: + +out: ext4_ext_drop_refs(path); kfree(path); @@ -4353,7 +4373,14 @@ retry: } if (err) return err; - return ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); +retry_remove_space: + err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); + if (err == -ENOMEM) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_remove_space; + } + return err; } static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, @@ -4363,7 +4390,7 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, struct inode *inode = file_inode(file); handle_t *handle; int ret = 0; - int ret2 = 0; + int ret2 = 0, ret3 = 0; int retries = 0; int depth = 0; struct ext4_map_blocks map; @@ -4423,10 +4450,11 @@ retry: if (ext4_update_inode_size(inode, epos) & 0x1) inode->i_mtime = inode->i_ctime; } - ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); - ret2 = ext4_journal_stop(handle); - if (ret2) + ret3 = ext4_journal_stop(handle); + ret2 = ret3 ? ret3 : ret2; + if (unlikely(ret2)) break; } if (ret == -ENOSPC && @@ -4490,7 +4518,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, inode_lock(inode); /* - * Indirect files do not support unwritten extnets + * Indirect files do not support unwritten extents */ if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { ret = -EOPNOTSUPP; @@ -4507,8 +4535,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, } flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - if (mode & FALLOC_FL_KEEP_SIZE) - flags |= EXT4_GET_BLOCKS_KEEP_SIZE; /* Wait all existing dio workers, newcomers will block on i_mutex */ inode_dio_wait(inode); @@ -4577,7 +4603,9 @@ static long ext4_zero_range(struct file *file, loff_t offset, inode->i_mtime = inode->i_ctime = current_time(inode); if (new_size) ext4_update_inode_size(inode, new_size); - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) + goto out_handle; /* Zero out partial block at the edges of the range */ ret = ext4_zero_partial_blocks(handle, inode, offset, len); @@ -4587,6 +4615,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (file->f_flags & O_SYNC) ext4_handle_sync(handle); +out_handle: ext4_journal_stop(handle); out_mutex: inode_unlock(inode); @@ -4647,8 +4676,6 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - if (mode & FALLOC_FL_KEEP_SIZE) - flags |= EXT4_GET_BLOCKS_KEEP_SIZE; inode_lock(inode); @@ -4700,8 +4727,7 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len) { unsigned int max_blocks; - int ret = 0; - int ret2 = 0; + int ret = 0, ret2 = 0, ret3 = 0; struct ext4_map_blocks map; unsigned int blkbits = inode->i_blkbits; unsigned int credits = 0; @@ -4734,9 +4760,13 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, "ext4_ext_map_blocks returned %d", inode->i_ino, map.m_lblk, map.m_len, ret); - ext4_mark_inode_dirty(handle, inode); - if (credits) - ret2 = ext4_journal_stop(handle); + ret2 = ext4_mark_inode_dirty(handle, inode); + if (credits) { + ret3 = ext4_journal_stop(handle); + if (unlikely(ret3)) + ret2 = ret3; + } + if (ret <= 0 || ret2) break; } @@ -4854,11 +4884,9 @@ static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) return 0; } -static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len, bool from_es_cache) +int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len) { - ext4_lblk_t start_blk; - u32 ext4_fiemap_flags = FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR; int error = 0; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { @@ -4868,12 +4896,6 @@ static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } - if (from_es_cache) - ext4_fiemap_flags &= FIEMAP_FLAG_XATTR; - - if (fiemap_check_flags(fieinfo, ext4_fiemap_flags)) - return -EBADR; - /* * For bitmap files the maximum size limit could be smaller than * s_maxbytes, so check len here manually instead of just relying on the @@ -4885,40 +4907,20 @@ static int _ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; - error = iomap_fiemap(inode, fieinfo, start, len, - &ext4_iomap_xattr_ops); - } else if (!from_es_cache) { - error = iomap_fiemap(inode, fieinfo, start, len, - &ext4_iomap_report_ops); - } else { - ext4_lblk_t len_blks; - __u64 last_blk; - - start_blk = start >> inode->i_sb->s_blocksize_bits; - last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; - if (last_blk >= EXT_MAX_BLOCKS) - last_blk = EXT_MAX_BLOCKS-1; - len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; - - /* - * Walk the extent tree gathering extent information - * and pushing extents back to the user. - */ - error = ext4_fill_es_cache_info(inode, start_blk, len_blks, - fieinfo); + return iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_xattr_ops); } - return error; -} -int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - __u64 start, __u64 len) -{ - return _ext4_fiemap(inode, fieinfo, start, len, false); + return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); } int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len) { + ext4_lblk_t start_blk, len_blks; + __u64 last_blk; + int error = 0; + if (ext4_has_inline_data(inode)) { int has_inline; @@ -4929,9 +4931,33 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, return 0; } - return _ext4_fiemap(inode, fieinfo, start, len, true); -} + if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + error = ext4_ext_precache(inode); + if (error) + return error; + fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; + } + + error = fiemap_prep(inode, fieinfo, start, &len, 0); + if (error) + return error; + + error = ext4_fiemap_check_ranges(inode, start, &len); + if (error) + return error; + start_blk = start >> inode->i_sb->s_blocksize_bits; + last_blk = (start + len - 1) >> inode->i_sb->s_blocksize_bits; + if (last_blk >= EXT_MAX_BLOCKS) + last_blk = EXT_MAX_BLOCKS-1; + len_blks = ((ext4_lblk_t) last_blk) - start_blk + 1; + + /* + * Walk the extent tree gathering extent information + * and pushing extents back to the user. + */ + return ext4_fill_es_cache_info(inode, start_blk, len_blks, fieinfo); +} /* * ext4_access_path: @@ -5304,7 +5330,7 @@ static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (IS_SYNC(inode)) ext4_handle_sync(handle); inode->i_mtime = inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index d996b44d2265..e75171535375 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -1054,7 +1054,7 @@ static void count_rsvd(struct inode *inode, ext4_lblk_t lblk, long len, end = (end > ext4_es_end(es)) ? ext4_es_end(es) : end; /* record the first block of the first delonly extent seen */ - if (rc->first_do_lblk_found == false) { + if (!rc->first_do_lblk_found) { rc->first_do_lblk = i; rc->first_do_lblk_found = true; } diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 0d624250a62b..2a01e31a032c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -287,6 +287,7 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, bool truncate = false; u8 blkbits = inode->i_blkbits; ext4_lblk_t written_blk, end_blk; + int ret; /* * Note that EXT4_I(inode)->i_disksize can get extended up to @@ -327,8 +328,14 @@ static ssize_t ext4_handle_inode_extension(struct inode *inode, loff_t offset, goto truncate; } - if (ext4_update_inode_size(inode, offset + written)) - ext4_mark_inode_dirty(handle, inode); + if (ext4_update_inode_size(inode, offset + written)) { + ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) { + written = ret; + ext4_journal_stop(handle); + goto truncate; + } + } /* * We may need to truncate allocated but not written blocks beyond EOF. @@ -495,6 +502,12 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from) if (ret <= 0) return ret; + /* if we're going to block and IOCB_NOWAIT is set, return -EAGAIN */ + if ((iocb->ki_flags & IOCB_NOWAIT) && (unaligned_io || extend)) { + ret = -EAGAIN; + goto out; + } + offset = iocb->ki_pos; count = ret; diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 35ff9a56db67..1d668c8f131f 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -44,30 +44,28 @@ */ static int ext4_sync_parent(struct inode *inode) { - struct dentry *dentry = NULL; - struct inode *next; + struct dentry *dentry, *next; int ret = 0; if (!ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) return 0; - inode = igrab(inode); + dentry = d_find_any_alias(inode); + if (!dentry) + return 0; while (ext4_test_inode_state(inode, EXT4_STATE_NEWENTRY)) { ext4_clear_inode_state(inode, EXT4_STATE_NEWENTRY); - dentry = d_find_any_alias(inode); - if (!dentry) - break; - next = igrab(d_inode(dentry->d_parent)); + + next = dget_parent(dentry); dput(dentry); - if (!next) - break; - iput(inode); - inode = next; + dentry = next; + inode = dentry->d_inode; + /* * The directory inode may have gone through rmdir by now. But * the inode itself and its blocks are still allocated (we hold - * a reference to the inode so it didn't go through - * ext4_evict_inode()) and so we are safe to flush metadata - * blocks and the inode. + * a reference to the inode via its dentry), so it didn't go + * through ext4_evict_inode()) and so we are safe to flush + * metadata blocks and the inode. */ ret = sync_mapping_buffers(inode->i_mapping); if (ret) @@ -76,7 +74,7 @@ static int ext4_sync_parent(struct inode *inode) if (ret) break; } - iput(inode); + dput(dentry); return ret; } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 499f08d8522e..54d324e80fe5 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1246,6 +1246,7 @@ struct inode *ext4_orphan_get(struct super_block *sb, unsigned long ino) ext4_error_err(sb, -err, "couldn't read orphan inode %lu (err %d)", ino, err); + brelse(bitmap_bh); return inode; } diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 107f0043f67f..be2b66eb65f7 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -467,7 +467,9 @@ static int ext4_splice_branch(handle_t *handle, /* * OK, we spliced it into the inode itself on a direct block. */ - ext4_mark_inode_dirty(handle, ar->inode); + err = ext4_mark_inode_dirty(handle, ar->inode); + if (unlikely(err)) + goto err_out; jbd_debug(5, "splicing direct\n"); } return err; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index f35e289e17aa..c3a1ad2db122 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -1260,7 +1260,7 @@ out: int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode) { - int ret, inline_size, no_expand; + int ret, ret2, inline_size, no_expand; void *inline_start; struct ext4_iloc iloc; @@ -1314,7 +1314,9 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, out: ext4_write_unlock_xattr(dir, &no_expand); - ext4_mark_inode_dirty(handle, dir); + ret2 = ext4_mark_inode_dirty(handle, dir); + if (unlikely(ret2 && !ret)) + ret = ret2; brelse(iloc.bh); return ret; } diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 52be85f96159..afffb75915dc 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -221,6 +221,16 @@ void ext4_evict_inode(struct inode *inode) truncate_inode_pages_final(&inode->i_data); /* + * For inodes with journalled data, transaction commit could have + * dirtied the inode. Flush worker is ignoring it because of I_FREEING + * flag but we still need to remove the inode from the writeback lists. + */ + if (!list_empty_careful(&inode->i_io_list)) { + WARN_ON_ONCE(!ext4_should_journal_data(inode)); + inode_io_list_del(inode); + } + + /* * Protect us against freezing - iput() caller didn't have to have any * protection against it */ @@ -432,11 +442,9 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, flags & - EXT4_GET_BLOCKS_KEEP_SIZE); + retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { - retval = ext4_ind_map_blocks(handle, inode, map, flags & - EXT4_GET_BLOCKS_KEEP_SIZE); + retval = ext4_ind_map_blocks(handle, inode, map, 0); } up_read((&EXT4_I(inode)->i_data_sem)); @@ -493,9 +501,8 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, #endif map->m_flags = 0; - ext_debug("ext4_map_blocks(): inode %lu, flag %d, max_blocks %u," - "logical block %lu\n", inode->i_ino, flags, map->m_len, - (unsigned long) map->m_lblk); + ext_debug(inode, "flag 0x%x, max_blocks %u, logical block %lu\n", + flags, map->m_len, (unsigned long) map->m_lblk); /* * ext4_map_blocks returns an int, and m_len is an unsigned int @@ -541,11 +548,9 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, */ down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - retval = ext4_ext_map_blocks(handle, inode, map, flags & - EXT4_GET_BLOCKS_KEEP_SIZE); + retval = ext4_ext_map_blocks(handle, inode, map, 0); } else { - retval = ext4_ind_map_blocks(handle, inode, map, flags & - EXT4_GET_BLOCKS_KEEP_SIZE); + retval = ext4_ind_map_blocks(handle, inode, map, 0); } if (retval > 0) { unsigned int status; @@ -726,6 +731,9 @@ out_sem: return ret; } } + + if (retval < 0) + ext_debug(inode, "failed with err %d\n", retval); return retval; } @@ -1296,7 +1304,7 @@ static int ext4_write_end(struct file *file, * filesystems. */ if (i_size_changed || inline_data) - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); if (pos + len > inode->i_size && !verity && ext4_can_truncate(inode)) /* if we have allocated more blocks and copied @@ -1526,6 +1534,7 @@ struct mpage_da_data { struct ext4_map_blocks map; struct ext4_io_submit io_submit; /* IO submission data */ unsigned int do_map:1; + unsigned int scanned_until_end:1; }; static void mpage_release_unused_pages(struct mpage_da_data *mpd, @@ -1541,6 +1550,7 @@ static void mpage_release_unused_pages(struct mpage_da_data *mpd, if (mpd->first_page >= mpd->next_page) return; + mpd->scanned_until_end = 0; index = mpd->first_page; end = mpd->next_page - 1; if (invalidate) { @@ -1681,8 +1691,7 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, invalid_block = ~0; map->m_flags = 0; - ext_debug("ext4_da_map_blocks(): inode %lu, max_blocks %u," - "logical block %lu\n", inode->i_ino, map->m_len, + ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); /* Lookup extent status tree firstly */ @@ -2078,7 +2087,7 @@ static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) return err; } -#define BH_FLAGS ((1 << BH_Unwritten) | (1 << BH_Delay)) +#define BH_FLAGS (BIT(BH_Unwritten) | BIT(BH_Delay)) /* * mballoc gives us at most this number of blocks... @@ -2188,7 +2197,11 @@ static int mpage_process_page_bufs(struct mpage_da_data *mpd, if (err < 0) return err; } - return lblk < blocks; + if (lblk >= blocks) { + mpd->scanned_until_end = 1; + return 0; + } + return 1; } /* @@ -2311,7 +2324,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) * mapping, or maybe the page was submitted for IO. * So we return to call further extent mapping. */ - if (err < 0 || map_bh == true) + if (err < 0 || map_bh) goto out; /* Page fully mapped - let IO run! */ err = mpage_submit_page(mpd, page); @@ -2358,7 +2371,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; - if (map->m_flags & (1 << BH_Delay)) + if (map->m_flags & BIT(BH_Delay)) get_blocks_flags |= EXT4_GET_BLOCKS_DELALLOC_RESERVE; err = ext4_map_blocks(handle, inode, map, get_blocks_flags); @@ -2546,7 +2559,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end, tag); if (nr_pages == 0) - goto out; + break; for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; @@ -2601,6 +2614,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) pagevec_release(&pvec); cond_resched(); } + mpd->scanned_until_end = 1; return 0; out: pagevec_release(&pvec); @@ -2619,7 +2633,6 @@ static int ext4_writepages(struct address_space *mapping, struct inode *inode = mapping->host; int needed_blocks, rsv_blocks = 0, ret = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); - bool done; struct blk_plug plug; bool give_up_on_write = false; @@ -2705,7 +2718,6 @@ static int ext4_writepages(struct address_space *mapping, retry: if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) tag_pages_for_writeback(mapping, mpd.first_page, mpd.last_page); - done = false; blk_start_plug(&plug); /* @@ -2715,6 +2727,7 @@ retry: * started. */ mpd.do_map = 0; + mpd.scanned_until_end = 0; mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd.io_submit.io_end) { ret = -ENOMEM; @@ -2730,7 +2743,7 @@ retry: if (ret < 0) goto unplug; - while (!done && mpd.first_page <= mpd.last_page) { + while (!mpd.scanned_until_end && wbc->nr_to_write > 0) { /* For each extent of pages we use new io_end */ mpd.io_submit.io_end = ext4_init_io_end(inode, GFP_KERNEL); if (!mpd.io_submit.io_end) { @@ -2765,20 +2778,9 @@ retry: trace_ext4_da_write_pages(inode, mpd.first_page, mpd.wbc); ret = mpage_prepare_extent_to_map(&mpd); - if (!ret) { - if (mpd.map.m_len) - ret = mpage_map_and_submit_extent(handle, &mpd, + if (!ret && mpd.map.m_len) + ret = mpage_map_and_submit_extent(handle, &mpd, &give_up_on_write); - else { - /* - * We scanned the whole range (or exhausted - * nr_to_write), submitted what was mapped and - * didn't find anything needing mapping. We are - * done. - */ - done = true; - } - } /* * Caution: If the handle is synchronous, * ext4_journal_stop() can wait for transaction commit @@ -3077,7 +3079,7 @@ static int ext4_da_write_end(struct file *file, * new_i_size is less that inode->i_size * bu greater than i_disksize.(hint delalloc) */ - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); } } @@ -3094,7 +3096,7 @@ static int ext4_da_write_end(struct file *file, if (ret2 < 0) ret = ret2; ret2 = ext4_journal_stop(handle); - if (!ret) + if (unlikely(ret2 && !ret)) ret = ret2; return ret ? ret : copied; @@ -3883,6 +3885,8 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len) { handle_t *handle; + int ret; + loff_t size = i_size_read(inode); WARN_ON(!inode_is_locked(inode)); @@ -3896,10 +3900,10 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, if (IS_ERR(handle)) return PTR_ERR(handle); ext4_update_i_disksize(inode, size); - ext4_mark_inode_dirty(handle, inode); + ret = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); - return 0; + return ret; } static void ext4_wait_dax_page(struct ext4_inode_info *ei) @@ -3951,7 +3955,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) loff_t first_block_offset, last_block_offset; handle_t *handle; unsigned int credits; - int ret = 0; + int ret = 0, ret2 = 0; trace_ext4_punch_hole(inode, offset, length, 0); @@ -4074,7 +4078,9 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) ext4_handle_sync(handle); inode->i_mtime = inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret2)) + ret = ret2; if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out_stop: @@ -4143,7 +4149,7 @@ int ext4_truncate(struct inode *inode) { struct ext4_inode_info *ei = EXT4_I(inode); unsigned int credits; - int err = 0; + int err = 0, err2; handle_t *handle; struct address_space *mapping = inode->i_mapping; @@ -4231,7 +4237,9 @@ out_stop: ext4_orphan_del(handle, inode); inode->i_mtime = inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + err2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err2 && !err)) + err = err2; ext4_journal_stop(handle); trace_ext4_truncate_exit(inode); @@ -5289,6 +5297,8 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) inode->i_gid = attr->ia_gid; error = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); + if (unlikely(error)) + return error; } if (attr->ia_valid & ATTR_SIZE) { @@ -5774,7 +5784,8 @@ out_unlock: * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync) * we start and wait on commits. */ -int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) +int __ext4_mark_inode_dirty(handle_t *handle, struct inode *inode, + const char *func, unsigned int line) { struct ext4_iloc iloc; struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); @@ -5784,13 +5795,18 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) trace_ext4_mark_inode_dirty(inode, _RET_IP_); err = ext4_reserve_inode_write(handle, inode, &iloc); if (err) - return err; + goto out; if (EXT4_I(inode)->i_extra_isize < sbi->s_want_extra_isize) ext4_try_to_expand_extra_isize(inode, sbi->s_want_extra_isize, iloc, handle); - return ext4_mark_iloc_dirty(handle, inode, &iloc); + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +out: + if (unlikely(err)) + ext4_error_inode_err(inode, func, line, 0, err, + "mark_inode_dirty error"); + return err; } /* diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0746532ba463..2162db0c747d 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -754,14 +754,6 @@ static int ext4_ioctl_get_es_cache(struct file *filp, unsigned long arg) fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; - if (fiemap.fm_extent_count != 0 && - !access_ok(fieinfo.fi_extents_start, - fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) - return -EFAULT; - - if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) - filemap_write_and_wait(inode->i_mapping); - error = ext4_get_es_cache(inode, &fieinfo, fiemap.fm_start, fiemap.fm_length); fiemap.fm_flags = fieinfo.fi_flags; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 30d5d97548c4..a9083113a8c0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -18,13 +18,6 @@ #include <linux/backing-dev.h> #include <trace/events/ext4.h> -#ifdef CONFIG_EXT4_DEBUG -ushort ext4_mballoc_debug __read_mostly; - -module_param_named(mballoc_debug, ext4_mballoc_debug, ushort, 0644); -MODULE_PARM_DESC(mballoc_debug, "Debugging level for ext4's mballoc"); -#endif - /* * MUSTDO: * - test ext4_ext_search_left() and ext4_ext_search_right() @@ -356,6 +349,36 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); +static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); + +/* + * The algorithm using this percpu seq counter goes below: + * 1. We sample the percpu discard_pa_seq counter before trying for block + * allocation in ext4_mb_new_blocks(). + * 2. We increment this percpu discard_pa_seq counter when we either allocate + * or free these blocks i.e. while marking those blocks as used/free in + * mb_mark_used()/mb_free_blocks(). + * 3. We also increment this percpu seq counter when we successfully identify + * that the bb_prealloc_list is not empty and hence proceed for discarding + * of those PAs inside ext4_mb_discard_group_preallocations(). + * + * Now to make sure that the regular fast path of block allocation is not + * affected, as a small optimization we only sample the percpu seq counter + * on that cpu. Only when the block allocation fails and when freed blocks + * found were 0, that is when we sample percpu seq counter for all cpus using + * below function ext4_get_discard_pa_seq_sum(). This happens after making + * sure that all the PAs on grp->bb_prealloc_list got freed or if it's empty. + */ +static DEFINE_PER_CPU(u64, discard_pa_seq); +static inline u64 ext4_get_discard_pa_seq_sum(void) +{ + int __cpu; + u64 __seq = 0; + + for_each_possible_cpu(__cpu) + __seq += per_cpu(discard_pa_seq, __cpu); + return __seq; +} static inline void *mb_correct_addr_and_bit(int *bit, void *addr) { @@ -493,6 +516,8 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { + if (unlikely(e4b->bd_info->bb_bitmap == NULL)) + return; if (memcmp(e4b->bd_info->bb_bitmap, bitmap, e4b->bd_sb->s_blocksize)) { unsigned char *b1, *b2; int i; @@ -511,6 +536,31 @@ static void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) } } +static void mb_group_bb_bitmap_alloc(struct super_block *sb, + struct ext4_group_info *grp, ext4_group_t group) +{ + struct buffer_head *bh; + + grp->bb_bitmap = kmalloc(sb->s_blocksize, GFP_NOFS); + if (!grp->bb_bitmap) + return; + + bh = ext4_read_block_bitmap(sb, group); + if (IS_ERR_OR_NULL(bh)) { + kfree(grp->bb_bitmap); + grp->bb_bitmap = NULL; + return; + } + + memcpy(grp->bb_bitmap, bh->b_data, sb->s_blocksize); + put_bh(bh); +} + +static void mb_group_bb_bitmap_free(struct ext4_group_info *grp) +{ + kfree(grp->bb_bitmap); +} + #else static inline void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, int first, int count) @@ -526,6 +576,17 @@ static inline void mb_cmp_bitmaps(struct ext4_buddy *e4b, void *bitmap) { return; } + +static inline void mb_group_bb_bitmap_alloc(struct super_block *sb, + struct ext4_group_info *grp, ext4_group_t group) +{ + return; +} + +static inline void mb_group_bb_bitmap_free(struct ext4_group_info *grp) +{ + return; +} #endif #ifdef AGGRESSIVE_CHECK @@ -820,14 +881,14 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) char *bitmap; struct ext4_group_info *grinfo; - mb_debug(1, "init page %lu\n", page->index); - inode = page->mapping->host; sb = inode->i_sb; ngroups = ext4_get_groups_count(sb); blocksize = i_blocksize(inode); blocks_per_page = PAGE_SIZE / blocksize; + mb_debug(sb, "init page %lu\n", page->index); + groups_per_page = blocks_per_page >> 1; if (groups_per_page == 0) groups_per_page = 1; @@ -867,7 +928,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) bh[i] = NULL; goto out; } - mb_debug(1, "read bitmap for group %u\n", group); + mb_debug(sb, "read bitmap for group %u\n", group); } /* wait for I/O completion */ @@ -912,7 +973,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) if ((first_block + i) & 1) { /* this is block of buddy */ BUG_ON(incore == NULL); - mb_debug(1, "put buddy for group %u in page %lu/%x\n", + mb_debug(sb, "put buddy for group %u in page %lu/%x\n", group, page->index, i * blocksize); trace_ext4_mb_buddy_bitmap_load(sb, group); grinfo = ext4_get_group_info(sb, group); @@ -932,7 +993,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) } else { /* this is block of bitmap */ BUG_ON(incore != NULL); - mb_debug(1, "put bitmap for group %u in page %lu/%x\n", + mb_debug(sb, "put bitmap for group %u in page %lu/%x\n", group, page->index, i * blocksize); trace_ext4_mb_bitmap_load(sb, group); @@ -1038,7 +1099,7 @@ int ext4_mb_init_group(struct super_block *sb, ext4_group_t group, gfp_t gfp) int ret = 0; might_sleep(); - mb_debug(1, "init group %u\n", group); + mb_debug(sb, "init group %u\n", group); this_grp = ext4_get_group_info(sb, group); /* * This ensures that we don't reinit the buddy cache @@ -1110,7 +1171,7 @@ ext4_mb_load_buddy_gfp(struct super_block *sb, ext4_group_t group, struct inode *inode = sbi->s_buddy_cache; might_sleep(); - mb_debug(1, "load group %u\n", group); + mb_debug(sb, "load group %u\n", group); blocks_per_page = PAGE_SIZE / sb->s_blocksize; grp = ext4_get_group_info(sb, group); @@ -1430,6 +1491,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); + this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free += count; if (first < e4b->bd_info->bb_first_free) e4b->bd_info->bb_first_free = first; @@ -1571,6 +1633,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); + this_cpu_inc(discard_pa_seq); e4b->bd_info->bb_free -= len; if (e4b->bd_info->bb_first_free == start) e4b->bd_info->bb_first_free += len; @@ -1670,6 +1733,14 @@ static void ext4_mb_use_best_found(struct ext4_allocation_context *ac, sbi->s_mb_last_start = ac->ac_f_ex.fe_start; spin_unlock(&sbi->s_md_lock); } + /* + * As we've just preallocated more space than + * user requested originally, we store allocated + * space in a special descriptor. + */ + if (ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) + ext4_mb_new_preallocation(ac); + } /* @@ -1918,7 +1989,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, ext4_mb_use_best_found(ac, e4b); - BUG_ON(ac->ac_b_ex.fe_len != ac->ac_g_ex.fe_len); + BUG_ON(ac->ac_f_ex.fe_len != ac->ac_g_ex.fe_len); if (EXT4_SB(sb)->s_mb_stats) atomic_inc(&EXT4_SB(sb)->s_bal_2orders); @@ -2035,15 +2106,14 @@ void ext4_mb_scan_aligned(struct ext4_allocation_context *ac, } /* - * This is now called BEFORE we load the buddy bitmap. + * This is also called BEFORE we load the buddy bitmap. * Returns either 1 or 0 indicating that the group is either suitable - * for the allocation or not. In addition it can also return negative - * error code when something goes wrong. + * for the allocation or not. */ -static int ext4_mb_good_group(struct ext4_allocation_context *ac, +static bool ext4_mb_good_group(struct ext4_allocation_context *ac, ext4_group_t group, int cr) { - unsigned free, fragments; + ext4_grpblk_t free, fragments; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); @@ -2051,23 +2121,16 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, free = grp->bb_free; if (free == 0) - return 0; + return false; if (cr <= 2 && free < ac->ac_g_ex.fe_len) - return 0; + return false; if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) - return 0; - - /* We only do this if the grp has never been initialized */ - if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { - int ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); - if (ret) - return ret; - } + return false; fragments = grp->bb_fragments; if (fragments == 0) - return 0; + return false; switch (cr) { case 0: @@ -2077,38 +2140,80 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, if ((ac->ac_flags & EXT4_MB_HINT_DATA) && (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) && ((group % flex_size) == 0)) - return 0; + return false; if ((ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) || (free / fragments) >= ac->ac_g_ex.fe_len) - return 1; + return true; if (grp->bb_largest_free_order < ac->ac_2order) - return 0; + return false; - return 1; + return true; case 1: if ((free / fragments) >= ac->ac_g_ex.fe_len) - return 1; + return true; break; case 2: if (free >= ac->ac_g_ex.fe_len) - return 1; + return true; break; case 3: - return 1; + return true; default: BUG(); } - return 0; + return false; +} + +/* + * This could return negative error code if something goes wrong + * during ext4_mb_init_group(). This should not be called with + * ext4_lock_group() held. + */ +static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, + ext4_group_t group, int cr) +{ + struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); + struct super_block *sb = ac->ac_sb; + bool should_lock = ac->ac_flags & EXT4_MB_STRICT_CHECK; + ext4_grpblk_t free; + int ret = 0; + + if (should_lock) + ext4_lock_group(sb, group); + free = grp->bb_free; + if (free == 0) + goto out; + if (cr <= 2 && free < ac->ac_g_ex.fe_len) + goto out; + if (unlikely(EXT4_MB_GRP_BBITMAP_CORRUPT(grp))) + goto out; + if (should_lock) + ext4_unlock_group(sb, group); + + /* We only do this if the grp has never been initialized */ + if (unlikely(EXT4_MB_GRP_NEED_INIT(grp))) { + ret = ext4_mb_init_group(ac->ac_sb, group, GFP_NOFS); + if (ret) + return ret; + } + + if (should_lock) + ext4_lock_group(sb, group); + ret = ext4_mb_good_group(ac, group, cr); +out: + if (should_lock) + ext4_unlock_group(sb, group); + return ret; } static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { ext4_group_t ngroups, group, i; - int cr; + int cr = -1; int err = 0, first_err = 0; struct ext4_sb_info *sbi; struct super_block *sb; @@ -2189,7 +2294,7 @@ repeat: group = 0; /* This now checks without needing the buddy page */ - ret = ext4_mb_good_group(ac, group, cr); + ret = ext4_mb_good_group_nolock(ac, group, cr); if (ret <= 0) { if (!first_err) first_err = ret; @@ -2207,11 +2312,9 @@ repeat: * block group */ ret = ext4_mb_good_group(ac, group, cr); - if (ret <= 0) { + if (ret == 0) { ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); - if (!first_err) - first_err = ret; continue; } @@ -2260,6 +2363,10 @@ repeat: out: if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) err = first_err; + + mb_debug(sb, "Best len %d, origin len %d, ac_status %u, ac_flags 0x%x, cr %d ret %d\n", + ac->ac_b_ex.fe_len, ac->ac_o_ex.fe_len, ac->ac_status, + ac->ac_flags, cr, err); return err; } @@ -2452,20 +2559,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, meta_group_info[i]->bb_free_root = RB_ROOT; meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ -#ifdef DOUBLE_CHECK - { - struct buffer_head *bh; - meta_group_info[i]->bb_bitmap = - kmalloc(sb->s_blocksize, GFP_NOFS); - BUG_ON(meta_group_info[i]->bb_bitmap == NULL); - bh = ext4_read_block_bitmap(sb, group); - BUG_ON(IS_ERR_OR_NULL(bh)); - memcpy(meta_group_info[i]->bb_bitmap, bh->b_data, - sb->s_blocksize); - put_bh(bh); - } -#endif - + mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); return 0; exit_group_info: @@ -2702,7 +2796,7 @@ out: } /* need to called with the ext4 group lock held */ -static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) +static int ext4_mb_cleanup_pa(struct ext4_group_info *grp) { struct ext4_prealloc_space *pa; struct list_head *cur, *tmp; @@ -2714,9 +2808,7 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) count++; kmem_cache_free(ext4_pspace_cachep, pa); } - if (count) - mb_debug(1, "mballoc: %u PAs left\n", count); - + return count; } int ext4_mb_release(struct super_block *sb) @@ -2727,16 +2819,18 @@ int ext4_mb_release(struct super_block *sb) struct ext4_group_info *grinfo, ***group_info; struct ext4_sb_info *sbi = EXT4_SB(sb); struct kmem_cache *cachep = get_groupinfo_cache(sb->s_blocksize_bits); + int count; if (sbi->s_group_info) { for (i = 0; i < ngroups; i++) { cond_resched(); grinfo = ext4_get_group_info(sb, i); -#ifdef DOUBLE_CHECK - kfree(grinfo->bb_bitmap); -#endif + mb_group_bb_bitmap_free(grinfo); ext4_lock_group(sb, i); - ext4_mb_cleanup_pa(grinfo); + count = ext4_mb_cleanup_pa(grinfo); + if (count) + mb_debug(sb, "mballoc: %d PAs left\n", + count); ext4_unlock_group(sb, i); kmem_cache_free(cachep, grinfo); } @@ -2809,7 +2903,7 @@ static void ext4_free_data_in_buddy(struct super_block *sb, struct ext4_group_info *db; int err, count = 0, count2 = 0; - mb_debug(1, "gonna free %u blocks in group %u (0x%p):", + mb_debug(sb, "gonna free %u blocks in group %u (0x%p):", entry->efd_count, entry->efd_group, entry); err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b); @@ -2849,7 +2943,8 @@ static void ext4_free_data_in_buddy(struct super_block *sb, kmem_cache_free(ext4_free_data_cachep, entry); ext4_mb_unload_buddy(&e4b); - mb_debug(1, "freed %u blocks in %u structures\n", count, count2); + mb_debug(sb, "freed %d blocks in %d structures\n", count, + count2); } /* @@ -2909,23 +3004,26 @@ int __init ext4_init_mballoc(void) ext4_pspace_cachep = KMEM_CACHE(ext4_prealloc_space, SLAB_RECLAIM_ACCOUNT); if (ext4_pspace_cachep == NULL) - return -ENOMEM; + goto out; ext4_ac_cachep = KMEM_CACHE(ext4_allocation_context, SLAB_RECLAIM_ACCOUNT); - if (ext4_ac_cachep == NULL) { - kmem_cache_destroy(ext4_pspace_cachep); - return -ENOMEM; - } + if (ext4_ac_cachep == NULL) + goto out_pa_free; ext4_free_data_cachep = KMEM_CACHE(ext4_free_data, SLAB_RECLAIM_ACCOUNT); - if (ext4_free_data_cachep == NULL) { - kmem_cache_destroy(ext4_pspace_cachep); - kmem_cache_destroy(ext4_ac_cachep); - return -ENOMEM; - } + if (ext4_free_data_cachep == NULL) + goto out_ac_free; + return 0; + +out_ac_free: + kmem_cache_destroy(ext4_ac_cachep); +out_pa_free: + kmem_cache_destroy(ext4_pspace_cachep); +out: + return -ENOMEM; } void ext4_exit_mballoc(void) @@ -3077,8 +3175,7 @@ static void ext4_mb_normalize_group_request(struct ext4_allocation_context *ac) BUG_ON(lg == NULL); ac->ac_g_ex.fe_len = EXT4_SB(sb)->s_mb_group_prealloc; - mb_debug(1, "#%u: goal %u blocks for locality group\n", - current->pid, ac->ac_g_ex.fe_len); + mb_debug(sb, "goal %u blocks for locality group\n", ac->ac_g_ex.fe_len); } /* @@ -3276,8 +3373,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, ac->ac_flags |= EXT4_MB_HINT_TRY_GOAL; } - mb_debug(1, "goal: %u(was %u) blocks at %u\n", (unsigned) size, - (unsigned) orig_size, (unsigned) start); + mb_debug(ac->ac_sb, "goal: %lld(was %lld) blocks at %u\n", size, + orig_size, start); } static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) @@ -3366,7 +3463,7 @@ static void ext4_mb_use_inode_pa(struct ext4_allocation_context *ac, BUG_ON(pa->pa_free < len); pa->pa_free -= len; - mb_debug(1, "use %llu/%u from inode pa %p\n", start, len, pa); + mb_debug(ac->ac_sb, "use %llu/%d from inode pa %p\n", start, len, pa); } /* @@ -3390,7 +3487,8 @@ static void ext4_mb_use_group_pa(struct ext4_allocation_context *ac, * in on-disk bitmap -- see ext4_mb_release_context() * Other CPUs are prevented from allocating from this pa by lg_mutex */ - mb_debug(1, "use %u/%u from group pa %p\n", pa->pa_lstart-len, len, pa); + mb_debug(ac->ac_sb, "use %u/%u from group pa %p\n", + pa->pa_lstart-len, len, pa); } /* @@ -3425,7 +3523,7 @@ ext4_mb_check_group_pa(ext4_fsblk_t goal_block, /* * search goal blocks in preallocated space */ -static noinline_for_stack int +static noinline_for_stack bool ext4_mb_use_preallocated(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); @@ -3437,7 +3535,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* only data can be preallocated */ if (!(ac->ac_flags & EXT4_MB_HINT_DATA)) - return 0; + return false; /* first, try per-file preallocation */ rcu_read_lock(); @@ -3464,7 +3562,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) spin_unlock(&pa->pa_lock); ac->ac_criteria = 10; rcu_read_unlock(); - return 1; + return true; } spin_unlock(&pa->pa_lock); } @@ -3472,12 +3570,12 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) /* can we use group allocation? */ if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC)) - return 0; + return false; /* inode may have no locality group for some reason */ lg = ac->ac_lg; if (lg == NULL) - return 0; + return false; order = fls(ac->ac_o_ex.fe_len) - 1; if (order > PREALLOC_TB_SIZE - 1) /* The max size of hash table is PREALLOC_TB_SIZE */ @@ -3506,9 +3604,9 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) if (cpa) { ext4_mb_use_group_pa(ac, cpa); ac->ac_criteria = 20; - return 1; + return true; } - return 0; + return false; } /* @@ -3573,7 +3671,7 @@ void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_set_bits(bitmap, start, len); preallocated += len; } - mb_debug(1, "preallocated %u for group %u\n", preallocated, group); + mb_debug(sb, "preallocated %d for group %u\n", preallocated, group); } static void ext4_mb_pa_callback(struct rcu_head *head) @@ -3649,7 +3747,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, /* * creates new preallocated space for given inode */ -static noinline_for_stack int +static noinline_for_stack void ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; @@ -3662,10 +3760,9 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + BUG_ON(ac->ac_pa == NULL); - pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); - if (pa == NULL) - return -ENOMEM; + pa = ac->ac_pa; if (ac->ac_b_ex.fe_len < ac->ac_g_ex.fe_len) { int winl; @@ -3709,15 +3806,14 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_pstart = ext4_grp_offs_to_block(sb, &ac->ac_b_ex); pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; - atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_INODE_PA; - mb_debug(1, "new inode pa %p: %llu/%u for %u\n", pa, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); + mb_debug(sb, "new inode pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, + pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_inode_pa(ac, pa); ext4_mb_use_inode_pa(ac, pa); @@ -3729,21 +3825,17 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_obj_lock = &ei->i_prealloc_lock; pa->pa_inode = ac->ac_inode; - ext4_lock_group(sb, ac->ac_b_ex.fe_group); list_add(&pa->pa_group_list, &grp->bb_prealloc_list); - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); spin_lock(pa->pa_obj_lock); list_add_rcu(&pa->pa_inode_list, &ei->i_prealloc_list); spin_unlock(pa->pa_obj_lock); - - return 0; } /* * creates new preallocated space for locality group inodes belongs to */ -static noinline_for_stack int +static noinline_for_stack void ext4_mb_new_group_pa(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; @@ -3755,11 +3847,9 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) BUG_ON(ac->ac_o_ex.fe_len >= ac->ac_b_ex.fe_len); BUG_ON(ac->ac_status != AC_STATUS_FOUND); BUG_ON(!S_ISREG(ac->ac_inode->i_mode)); + BUG_ON(ac->ac_pa == NULL); - BUG_ON(ext4_pspace_cachep == NULL); - pa = kmem_cache_alloc(ext4_pspace_cachep, GFP_NOFS); - if (pa == NULL) - return -ENOMEM; + pa = ac->ac_pa; /* preallocation can change ac_b_ex, thus we store actually * allocated blocks for history */ @@ -3769,15 +3859,14 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_lstart = pa->pa_pstart; pa->pa_len = ac->ac_b_ex.fe_len; pa->pa_free = pa->pa_len; - atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_inode_list); INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_type = MB_GROUP_PA; - mb_debug(1, "new group pa %p: %llu/%u for %u\n", pa, - pa->pa_pstart, pa->pa_len, pa->pa_lstart); + mb_debug(sb, "new group pa %p: %llu/%d for %u\n", pa, pa->pa_pstart, + pa->pa_len, pa->pa_lstart); trace_ext4_mb_new_group_pa(ac, pa); ext4_mb_use_group_pa(ac, pa); @@ -3790,26 +3879,20 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) pa->pa_obj_lock = &lg->lg_prealloc_lock; pa->pa_inode = NULL; - ext4_lock_group(sb, ac->ac_b_ex.fe_group); list_add(&pa->pa_group_list, &grp->bb_prealloc_list); - ext4_unlock_group(sb, ac->ac_b_ex.fe_group); /* * We will later add the new pa to the right bucket * after updating the pa_free in ext4_mb_release_context */ - return 0; } -static int ext4_mb_new_preallocation(struct ext4_allocation_context *ac) +static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac) { - int err; - if (ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC) - err = ext4_mb_new_group_pa(ac); + ext4_mb_new_group_pa(ac); else - err = ext4_mb_new_inode_pa(ac); - return err; + ext4_mb_new_inode_pa(ac); } /* @@ -3844,7 +3927,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, if (bit >= end) break; next = mb_find_next_bit(bitmap_bh->b_data, end, bit); - mb_debug(1, " free preallocated %u/%u in group %u\n", + mb_debug(sb, "free preallocated %u/%u in group %u\n", (unsigned) ext4_group_first_block_no(sb, group) + bit, (unsigned) next - bit, (unsigned) group); free += next - bit; @@ -3858,10 +3941,10 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh, } if (free != pa->pa_free) { ext4_msg(e4b->bd_sb, KERN_CRIT, - "pa %p: logic %lu, phys. %lu, len %lu", + "pa %p: logic %lu, phys. %lu, len %d", pa, (unsigned long) pa->pa_lstart, (unsigned long) pa->pa_pstart, - (unsigned long) pa->pa_len); + pa->pa_len); ext4_grp_locked_error(sb, group, 0, 0, "free %u, pa_free %u", free, pa->pa_free); /* @@ -3915,10 +3998,9 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, int busy = 0; int free = 0; - mb_debug(1, "discard preallocation for group %u\n", group); - + mb_debug(sb, "discard preallocation for group %u\n", group); if (list_empty(&grp->bb_prealloc_list)) - return 0; + goto out_dbg; bitmap_bh = ext4_read_block_bitmap(sb, group); if (IS_ERR(bitmap_bh)) { @@ -3926,7 +4008,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_error_err(sb, -err, "Error %d reading block bitmap for %u", err, group); - return 0; + goto out_dbg; } err = ext4_mb_load_buddy(sb, group, &e4b); @@ -3934,7 +4016,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, ext4_warning(sb, "Error %d loading buddy information for %u", err, group); put_bh(bitmap_bh); - return 0; + goto out_dbg; } if (needed == 0) @@ -3943,6 +4025,7 @@ ext4_mb_discard_group_preallocations(struct super_block *sb, INIT_LIST_HEAD(&list); repeat: ext4_lock_group(sb, group); + this_cpu_inc(discard_pa_seq); list_for_each_entry_safe(pa, tmp, &grp->bb_prealloc_list, pa_group_list) { spin_lock(&pa->pa_lock); @@ -3979,6 +4062,8 @@ repeat: /* found anything to free? */ if (list_empty(&list)) { BUG_ON(free != 0); + mb_debug(sb, "Someone else may have freed PA for this group %u\n", + group); goto out; } @@ -4003,6 +4088,9 @@ out: ext4_unlock_group(sb, group); ext4_mb_unload_buddy(&e4b); put_bh(bitmap_bh); +out_dbg: + mb_debug(sb, "discarded (%d) blocks preallocated for group %u bb_free (%d)\n", + free, group, grp->bb_free); return free; } @@ -4031,7 +4119,8 @@ void ext4_discard_preallocations(struct inode *inode) return; } - mb_debug(1, "discard preallocation for inode %lu\n", inode->i_ino); + mb_debug(sb, "discard preallocation for inode %lu\n", + inode->i_ino); trace_ext4_discard_preallocations(inode); INIT_LIST_HEAD(&list); @@ -4119,22 +4208,74 @@ repeat: } } +static int ext4_mb_pa_alloc(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa; + + BUG_ON(ext4_pspace_cachep == NULL); + pa = kmem_cache_zalloc(ext4_pspace_cachep, GFP_NOFS); + if (!pa) + return -ENOMEM; + atomic_set(&pa->pa_count, 1); + ac->ac_pa = pa; + return 0; +} + +static void ext4_mb_pa_free(struct ext4_allocation_context *ac) +{ + struct ext4_prealloc_space *pa = ac->ac_pa; + + BUG_ON(!pa); + ac->ac_pa = NULL; + WARN_ON(!atomic_dec_and_test(&pa->pa_count)); + kmem_cache_free(ext4_pspace_cachep, pa); +} + #ifdef CONFIG_EXT4_DEBUG +static inline void ext4_mb_show_pa(struct super_block *sb) +{ + ext4_group_t i, ngroups; + + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) + return; + + ngroups = ext4_get_groups_count(sb); + mb_debug(sb, "groups: "); + for (i = 0; i < ngroups; i++) { + struct ext4_group_info *grp = ext4_get_group_info(sb, i); + struct ext4_prealloc_space *pa; + ext4_grpblk_t start; + struct list_head *cur; + ext4_lock_group(sb, i); + list_for_each(cur, &grp->bb_prealloc_list) { + pa = list_entry(cur, struct ext4_prealloc_space, + pa_group_list); + spin_lock(&pa->pa_lock); + ext4_get_group_no_and_offset(sb, pa->pa_pstart, + NULL, &start); + spin_unlock(&pa->pa_lock); + mb_debug(sb, "PA:%u:%d:%d\n", i, start, + pa->pa_len); + } + ext4_unlock_group(sb, i); + mb_debug(sb, "%u: %d/%d\n", i, grp->bb_free, + grp->bb_fragments); + } +} + static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - ext4_group_t ngroups, i; - if (!ext4_mballoc_debug || - (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)) + if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED) return; - ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:" + mb_debug(sb, "Can't allocate:" " Allocation context details:"); - ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d", + mb_debug(sb, "status %u flags 0x%x", ac->ac_status, ac->ac_flags); - ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, " - "goal %lu/%lu/%lu@%lu, " + mb_debug(sb, "orig %lu/%lu/%lu@%lu, " + "goal %lu/%lu/%lu@%lu, " "best %lu/%lu/%lu@%lu cr %d", (unsigned long)ac->ac_o_ex.fe_group, (unsigned long)ac->ac_o_ex.fe_start, @@ -4149,37 +4290,17 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) (unsigned long)ac->ac_b_ex.fe_len, (unsigned long)ac->ac_b_ex.fe_logical, (int)ac->ac_criteria); - ext4_msg(ac->ac_sb, KERN_ERR, "%d found", ac->ac_found); - ext4_msg(ac->ac_sb, KERN_ERR, "groups: "); - ngroups = ext4_get_groups_count(sb); - for (i = 0; i < ngroups; i++) { - struct ext4_group_info *grp = ext4_get_group_info(sb, i); - struct ext4_prealloc_space *pa; - ext4_grpblk_t start; - struct list_head *cur; - ext4_lock_group(sb, i); - list_for_each(cur, &grp->bb_prealloc_list) { - pa = list_entry(cur, struct ext4_prealloc_space, - pa_group_list); - spin_lock(&pa->pa_lock); - ext4_get_group_no_and_offset(sb, pa->pa_pstart, - NULL, &start); - spin_unlock(&pa->pa_lock); - printk(KERN_ERR "PA:%u:%d:%u \n", i, - start, pa->pa_len); - } - ext4_unlock_group(sb, i); - - if (grp->bb_free == 0) - continue; - printk(KERN_ERR "%u: %d/%d \n", - i, grp->bb_free, grp->bb_fragments); - } - printk(KERN_ERR "\n"); + mb_debug(sb, "%u found", ac->ac_found); + ext4_mb_show_pa(sb); } #else +static inline void ext4_mb_show_pa(struct super_block *sb) +{ + return; +} static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac) { + ext4_mb_show_pa(ac->ac_sb); return; } #endif @@ -4282,7 +4403,7 @@ ext4_mb_initialize_context(struct ext4_allocation_context *ac, * locality group. this is a policy, actually */ ext4_mb_group_or_file(ac); - mb_debug(1, "init ac: %u blocks @ %u, goal %u, flags %x, 2^%d, " + mb_debug(sb, "init ac: %u blocks @ %u, goal %u, flags 0x%x, 2^%d, " "left: %u/%u, right %u/%u to %swritable\n", (unsigned) ar->len, (unsigned) ar->logical, (unsigned) ar->goal, ac->ac_flags, ac->ac_2order, @@ -4303,7 +4424,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb, struct list_head discard_list; struct ext4_prealloc_space *pa, *tmp; - mb_debug(1, "discard locality group preallocation\n"); + mb_debug(sb, "discard locality group preallocation\n"); INIT_LIST_HEAD(&discard_list); @@ -4486,6 +4607,30 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) return freed; } +static bool ext4_mb_discard_preallocations_should_retry(struct super_block *sb, + struct ext4_allocation_context *ac, u64 *seq) +{ + int freed; + u64 seq_retry = 0; + bool ret = false; + + freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); + if (freed) { + ret = true; + goto out_dbg; + } + seq_retry = ext4_get_discard_pa_seq_sum(); + if (!(ac->ac_flags & EXT4_MB_STRICT_CHECK) || seq_retry != *seq) { + ac->ac_flags |= EXT4_MB_STRICT_CHECK; + *seq = seq_retry; + ret = true; + } + +out_dbg: + mb_debug(sb, "freed %d, retry ? %s\n", freed, ret ? "yes" : "no"); + return ret; +} + /* * Main entry point into mballoc to allocate blocks * it tries to use preallocation first, then falls back @@ -4494,13 +4639,13 @@ static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, struct ext4_allocation_request *ar, int *errp) { - int freed; struct ext4_allocation_context *ac = NULL; struct ext4_sb_info *sbi; struct super_block *sb; ext4_fsblk_t block = 0; unsigned int inquota = 0; unsigned int reserv_clstrs = 0; + u64 seq; might_sleep(); sb = ar->inode->i_sb; @@ -4525,6 +4670,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, ar->len = ar->len >> 1; } if (!ar->len) { + ext4_mb_show_pa(sb); *errp = -ENOSPC; return 0; } @@ -4562,26 +4708,32 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, } ac->ac_op = EXT4_MB_HISTORY_PREALLOC; + seq = *this_cpu_ptr(&discard_pa_seq); if (!ext4_mb_use_preallocated(ac)) { ac->ac_op = EXT4_MB_HISTORY_ALLOC; ext4_mb_normalize_request(ac, ar); + + *errp = ext4_mb_pa_alloc(ac); + if (*errp) + goto errout; repeat: /* allocate space in core */ *errp = ext4_mb_regular_allocator(ac); - if (*errp) - goto discard_and_exit; - - /* as we've just preallocated more space than - * user requested originally, we store allocated - * space in a special descriptor */ - if (ac->ac_status == AC_STATUS_FOUND && - ac->ac_o_ex.fe_len < ac->ac_b_ex.fe_len) - *errp = ext4_mb_new_preallocation(ac); + /* + * pa allocated above is added to grp->bb_prealloc_list only + * when we were able to allocate some block i.e. when + * ac->ac_status == AC_STATUS_FOUND. + * And error from above mean ac->ac_status != AC_STATUS_FOUND + * So we have to free this pa here itself. + */ if (*errp) { - discard_and_exit: + ext4_mb_pa_free(ac); ext4_discard_allocated_blocks(ac); goto errout; } + if (ac->ac_status == AC_STATUS_FOUND && + ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len) + ext4_mb_pa_free(ac); } if (likely(ac->ac_status == AC_STATUS_FOUND)) { *errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs); @@ -4593,9 +4745,13 @@ repeat: ar->len = ac->ac_b_ex.fe_len; } } else { - freed = ext4_mb_discard_preallocations(sb, ac->ac_o_ex.fe_len); - if (freed) + if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq)) goto repeat; + /* + * If block allocation fails then the pa allocated above + * needs to be freed here itself. + */ + ext4_mb_pa_free(ac); *errp = -ENOSPC; } diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index 88c98f17e3d9..6b4d17c2935d 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -24,19 +24,15 @@ #include "ext4.h" /* + * mb_debug() dynamic printk msgs could be used to debug mballoc code. */ #ifdef CONFIG_EXT4_DEBUG -extern ushort ext4_mballoc_debug; - -#define mb_debug(n, fmt, ...) \ -do { \ - if ((n) <= ext4_mballoc_debug) { \ - printk(KERN_DEBUG "(%s, %d): %s: " fmt, \ - __FILE__, __LINE__, __func__, ##__VA_ARGS__); \ - } \ -} while (0) +#define mb_debug(sb, fmt, ...) \ + pr_debug("[%s/%d] EXT4-fs (%s): (%s, %d): %s: " fmt, \ + current->comm, task_pid_nr(current), sb->s_id, \ + __FILE__, __LINE__, __func__, ##__VA_ARGS__) #else -#define mb_debug(n, fmt, ...) no_printk(fmt, ##__VA_ARGS__) +#define mb_debug(sb, fmt, ...) no_printk(fmt, ##__VA_ARGS__) #endif #define EXT4_MB_HISTORY_ALLOC 1 /* allocation */ diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index fb6520f37135..c5e3fc998211 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -287,7 +287,7 @@ static int free_ind_block(handle_t *handle, struct inode *inode, __le32 *i_data) static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, struct inode *tmp_inode) { - int retval; + int retval, retval2 = 0; __le32 i_data[3]; struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_inode_info *tmp_ei = EXT4_I(tmp_inode); @@ -342,7 +342,9 @@ static int ext4_ext_swap_inode_data(handle_t *handle, struct inode *inode, * i_blocks when freeing the indirect meta-data blocks */ retval = free_ind_block(handle, inode, i_data); - ext4_mark_inode_dirty(handle, inode); + retval2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(retval2 && !retval)) + retval = retval2; err_out: return retval; @@ -601,7 +603,7 @@ int ext4_ind_migrate(struct inode *inode) ext4_lblk_t start, end; ext4_fsblk_t blk; handle_t *handle; - int ret; + int ret, ret2 = 0; if (!ext4_has_feature_extents(inode->i_sb) || (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) @@ -655,7 +657,9 @@ int ext4_ind_migrate(struct inode *inode) memset(ei->i_data, 0, sizeof(ei->i_data)); for (i = start; i <= end; i++) ei->i_data[i] = cpu_to_le32(blk++); - ext4_mark_inode_dirty(handle, inode); + ret2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret2 && !ret)) + ret = ret2; errout: ext4_journal_stop(handle); up_write(&EXT4_I(inode)->i_data_sem); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a8aca4772aaa..56738b538ddf 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1993,7 +1993,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, { unsigned int blocksize = dir->i_sb->s_blocksize; int csum_size = 0; - int err; + int err, err2; if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); @@ -2028,12 +2028,12 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, dir->i_mtime = dir->i_ctime = current_time(dir); ext4_update_dx_flag(dir); inode_inc_iversion(dir); - ext4_mark_inode_dirty(handle, dir); + err2 = ext4_mark_inode_dirty(handle, dir); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_dirblock(handle, dir, bh); if (err) ext4_std_error(dir->i_sb, err); - return 0; + return err ? err : err2; } /* @@ -2223,7 +2223,9 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); dx_fallback++; - ext4_mark_inode_dirty(handle, dir); + retval = ext4_mark_inode_dirty(handle, dir); + if (unlikely(retval)) + goto out; } blocks = dir->i_size >> sb->s_blocksize_bits; for (block = 0; block < blocks; block++) { @@ -2576,12 +2578,12 @@ static int ext4_add_nondir(handle_t *handle, struct inode *inode = *inodep; int err = ext4_add_entry(handle, dentry, inode); if (!err) { - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); if (IS_DIRSYNC(dir)) ext4_handle_sync(handle); d_instantiate_new(dentry, inode); *inodep = NULL; - return 0; + return err; } drop_nlink(inode); ext4_orphan_add(handle, inode); @@ -2775,7 +2777,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; - int err, credits, retries = 0; + int err, err2 = 0, credits, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) return -EMLINK; @@ -2808,7 +2810,9 @@ out_clear_inode: clear_nlink(inode); ext4_orphan_add(handle, inode); unlock_new_inode(inode); - ext4_mark_inode_dirty(handle, inode); + err2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err2)) + err = err2; ext4_journal_stop(handle); iput(inode); goto out_retry; @@ -3148,10 +3152,12 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) inode->i_size = 0; ext4_orphan_add(handle, inode); inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + retval = ext4_mark_inode_dirty(handle, inode); + if (retval) + goto end_rmdir; ext4_dec_count(handle, dir); ext4_update_dx_flag(dir); - ext4_mark_inode_dirty(handle, dir); + retval = ext4_mark_inode_dirty(handle, dir); #ifdef CONFIG_UNICODE /* VFS negative dentries are incompatible with Encoding and @@ -3221,7 +3227,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) goto end_unlink; dir->i_ctime = dir->i_mtime = current_time(dir); ext4_update_dx_flag(dir); - ext4_mark_inode_dirty(handle, dir); + retval = ext4_mark_inode_dirty(handle, dir); + if (retval) + goto end_unlink; if (inode->i_nlink == 0) ext4_warning_inode(inode, "Deleting file '%.*s' with no links", dentry->d_name.len, dentry->d_name.name); @@ -3230,7 +3238,7 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) if (!inode->i_nlink) ext4_orphan_add(handle, inode); inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + retval = ext4_mark_inode_dirty(handle, inode); #ifdef CONFIG_UNICODE /* VFS negative dentries are incompatible with Encoding and @@ -3419,7 +3427,7 @@ retry: err = ext4_add_entry(handle, dentry, inode); if (!err) { - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); /* this can happen only for tmpfile being * linked the first time */ @@ -3531,7 +3539,7 @@ static int ext4_rename_dir_finish(handle_t *handle, struct ext4_renament *ent, static int ext4_setent(handle_t *handle, struct ext4_renament *ent, unsigned ino, unsigned file_type) { - int retval; + int retval, retval2; BUFFER_TRACE(ent->bh, "get write access"); retval = ext4_journal_get_write_access(handle, ent->bh); @@ -3543,19 +3551,19 @@ static int ext4_setent(handle_t *handle, struct ext4_renament *ent, inode_inc_iversion(ent->dir); ent->dir->i_ctime = ent->dir->i_mtime = current_time(ent->dir); - ext4_mark_inode_dirty(handle, ent->dir); + retval = ext4_mark_inode_dirty(handle, ent->dir); BUFFER_TRACE(ent->bh, "call ext4_handle_dirty_metadata"); if (!ent->inlined) { - retval = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh); - if (unlikely(retval)) { - ext4_std_error(ent->dir->i_sb, retval); - return retval; + retval2 = ext4_handle_dirty_dirblock(handle, ent->dir, ent->bh); + if (unlikely(retval2)) { + ext4_std_error(ent->dir->i_sb, retval2); + return retval2; } } brelse(ent->bh); ent->bh = NULL; - return 0; + return retval; } static int ext4_find_delete_entry(handle_t *handle, struct inode *dir, @@ -3790,7 +3798,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, EXT4_FT_CHRDEV); if (retval) goto end_rename; - ext4_mark_inode_dirty(handle, whiteout); + retval = ext4_mark_inode_dirty(handle, whiteout); + if (unlikely(retval)) + goto end_rename; } if (!new.bh) { retval = ext4_add_entry(handle, new.dentry, old.inode); @@ -3811,7 +3821,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, * rename. */ old.inode->i_ctime = current_time(old.inode); - ext4_mark_inode_dirty(handle, old.inode); + retval = ext4_mark_inode_dirty(handle, old.inode); + if (unlikely(retval)) + goto end_rename; if (!whiteout) { /* @@ -3840,12 +3852,18 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, } else { ext4_inc_count(handle, new.dir); ext4_update_dx_flag(new.dir); - ext4_mark_inode_dirty(handle, new.dir); + retval = ext4_mark_inode_dirty(handle, new.dir); + if (unlikely(retval)) + goto end_rename; } } - ext4_mark_inode_dirty(handle, old.dir); + retval = ext4_mark_inode_dirty(handle, old.dir); + if (unlikely(retval)) + goto end_rename; if (new.inode) { - ext4_mark_inode_dirty(handle, new.inode); + retval = ext4_mark_inode_dirty(handle, new.inode); + if (unlikely(retval)) + goto end_rename; if (!new.inode->i_nlink) ext4_orphan_add(handle, new.inode); } @@ -3979,8 +3997,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, ctime = current_time(old.inode); old.inode->i_ctime = ctime; new.inode->i_ctime = ctime; - ext4_mark_inode_dirty(handle, old.inode); - ext4_mark_inode_dirty(handle, new.inode); + retval = ext4_mark_inode_dirty(handle, old.inode); + if (unlikely(retval)) + goto end_rename; + retval = ext4_mark_inode_dirty(handle, new.inode); + if (unlikely(retval)) + goto end_rename; if (old.dir_bh) { retval = ext4_rename_dir_finish(handle, &old, new.dir->i_ino); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 9824cd8203e8..a29e8ea1a7ab 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3718,7 +3718,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int blocksize, clustersize; unsigned int db_count; unsigned int i; - int needs_recovery, has_huge_files, has_bigalloc; + int needs_recovery, has_huge_files; __u64 blocks_count; int err = 0; unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; @@ -4010,17 +4010,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { printk_once(KERN_WARNING "EXT4-fs: Warning: mounting with data=journal disables delayed allocation, dioread_nolock, and O_DIRECT support!\n"); + /* can't mount with both data=journal and dioread_nolock. */ clear_opt(sb, DIOREAD_NOLOCK); if (test_opt2(sb, EXPLICIT_DELALLOC)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and delalloc"); goto failed_mount; } - if (test_opt(sb, DIOREAD_NOLOCK)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "both data=journal and dioread_nolock"); - goto failed_mount; - } if (test_opt(sb, DAX)) { ext4_msg(sb, KERN_ERR, "can't mount with " "both data=journal and dax"); @@ -4237,8 +4233,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* Handle clustersize */ clustersize = BLOCK_SIZE << le32_to_cpu(es->s_log_cluster_size); - has_bigalloc = ext4_has_feature_bigalloc(sb); - if (has_bigalloc) { + if (ext4_has_feature_bigalloc(sb)) { if (clustersize < blocksize) { ext4_msg(sb, KERN_ERR, "cluster size (%d) smaller than " @@ -5925,7 +5920,7 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id, EXT4_I(inode)->i_flags |= EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL; inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, S_NOATIME | S_IMMUTABLE); - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); unlock_inode: inode_unlock(inode); @@ -6027,12 +6022,14 @@ static int ext4_quota_off(struct super_block *sb, int type) * this is not a hard failure and quotas are already disabled. */ handle = ext4_journal_start(inode, EXT4_HT_QUOTA, 1); - if (IS_ERR(handle)) + if (IS_ERR(handle)) { + err = PTR_ERR(handle); goto out_unlock; + } EXT4_I(inode)->i_flags &= ~(EXT4_NOATIME_FL | EXT4_IMMUTABLE_FL); inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); inode->i_mtime = inode->i_ctime = current_time(inode); - ext4_mark_inode_dirty(handle, inode); + err = ext4_mark_inode_dirty(handle, inode); ext4_journal_stop(handle); out_unlock: inode_unlock(inode); @@ -6090,7 +6087,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); - int err, offset = off & (sb->s_blocksize - 1); + int err = 0, err2 = 0, offset = off & (sb->s_blocksize - 1); int retries = 0; struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -6138,9 +6135,11 @@ out: if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; - ext4_mark_inode_dirty(handle, inode); + err2 = ext4_mark_inode_dirty(handle, inode); + if (unlikely(err2 && !err)) + err = err2; } - return len; + return err ? err : len; } #endif diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 21df43a25328..9b29a40738ac 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1327,7 +1327,7 @@ static int ext4_xattr_inode_write(handle_t *handle, struct inode *ea_inode, int blocksize = ea_inode->i_sb->s_blocksize; int max_blocks = (bufsize + blocksize - 1) >> ea_inode->i_blkbits; int csize, wsize = 0; - int ret = 0; + int ret = 0, ret2 = 0; int retries = 0; retry: @@ -1385,7 +1385,9 @@ retry: ext4_update_i_disksize(ea_inode, wsize); inode_unlock(ea_inode); - ext4_mark_inode_dirty(handle, ea_inode); + ret2 = ext4_mark_inode_dirty(handle, ea_inode); + if (unlikely(ret2 && !ret)) + ret = ret2; out: brelse(bh); @@ -1800,8 +1802,11 @@ ext4_xattr_block_find(struct inode *inode, struct ext4_xattr_info *i, if (EXT4_I(inode)->i_file_acl) { /* The inode already has an extended attribute block. */ bs->bh = ext4_sb_bread(sb, EXT4_I(inode)->i_file_acl, REQ_PRIO); - if (IS_ERR(bs->bh)) - return PTR_ERR(bs->bh); + if (IS_ERR(bs->bh)) { + error = PTR_ERR(bs->bh); + bs->bh = NULL; + return error; + } ea_bdebug(bs->bh, "b_count=%d, refcount=%d", atomic_read(&(bs->bh->b_count)), le32_to_cpu(BHDR(bs->bh)->h_refcount)); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 03ec97f28235..072fb19555a8 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -19,6 +19,7 @@ #include <linux/uio.h> #include <linux/cleancache.h> #include <linux/sched/signal.h> +#include <linux/fiemap.h> #include "f2fs.h" #include "node.h" @@ -1824,7 +1825,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR); + ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_XATTR); if (ret) return ret; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 4167e5408151..9686ffea177e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -8,6 +8,7 @@ #include <linux/fs.h> #include <linux/f2fs_fs.h> +#include <linux/fiemap.h> #include "f2fs.h" #include "node.h" diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a750381d554a..a605c3dddabc 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1125,6 +1125,7 @@ void inode_io_list_del(struct inode *inode) inode_io_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); } +EXPORT_SYMBOL(inode_io_list_del); /* * mark an inode as under writeback on the sb diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 5acd3ce30759..a1337bf31e49 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -17,6 +17,7 @@ #include <linux/crc32.h> #include <linux/iomap.h> #include <linux/security.h> +#include <linux/fiemap.h> #include <linux/uaccess.h> #include "gfs2.h" diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c index 2de0d3492d15..077c25128eb7 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -9,6 +9,7 @@ #include "hpfs_fn.h" #include <linux/mpage.h> +#include <linux/fiemap.h> #define BLOCKS(size) (((size) + 511) >> 9) diff --git a/fs/internal.h b/fs/internal.h index b89d78f10396..9b863a7bd708 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -142,8 +142,6 @@ extern int dentry_needs_remove_privs(struct dentry *dentry); /* * fs-writeback.c */ -extern void inode_io_list_del(struct inode *inode); - extern long get_nr_dirty_inodes(void); extern int invalidate_inodes(struct super_block *, bool); diff --git a/fs/ioctl.c b/fs/ioctl.c index 5e80b40bc1b5..d69786d1dd91 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -18,6 +18,7 @@ #include <linux/buffer_head.h> #include <linux/falloc.h> #include <linux/sched/signal.h> +#include <linux/fiemap.h> #include "internal.h" @@ -148,61 +149,55 @@ int fiemap_fill_next_extent(struct fiemap_extent_info *fieinfo, u64 logical, EXPORT_SYMBOL(fiemap_fill_next_extent); /** - * fiemap_check_flags - check validity of requested flags for fiemap + * fiemap_prep - check validity of requested flags for fiemap + * @inode: Inode to operate on * @fieinfo: Fiemap context passed into ->fiemap - * @fs_flags: Set of fiemap flags that the file system understands + * @start: Start of the mapped range + * @len: Length of the mapped range, can be truncated by this function. + * @supported_flags: Set of fiemap flags that the file system understands * - * Called from file system ->fiemap callback. This will compute the - * intersection of valid fiemap flags and those that the fs supports. That - * value is then compared against the user supplied flags. In case of bad user - * flags, the invalid values will be written into the fieinfo structure, and - * -EBADR is returned, which tells ioctl_fiemap() to return those values to - * userspace. For this reason, a return code of -EBADR should be preserved. + * This function must be called from each ->fiemap instance to validate the + * fiemap request against the file system parameters. * - * Returns 0 on success, -EBADR on bad flags. + * Returns 0 on success, or a negative error on failure. */ -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags) +int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 *len, u32 supported_flags) { + u64 maxbytes = inode->i_sb->s_maxbytes; u32 incompat_flags; + int ret = 0; - incompat_flags = fieinfo->fi_flags & ~(FIEMAP_FLAGS_COMPAT & fs_flags); - if (incompat_flags) { - fieinfo->fi_flags = incompat_flags; - return -EBADR; - } - return 0; -} -EXPORT_SYMBOL(fiemap_check_flags); - -static int fiemap_check_ranges(struct super_block *sb, - u64 start, u64 len, u64 *new_len) -{ - u64 maxbytes = (u64) sb->s_maxbytes; - - *new_len = len; - - if (len == 0) + if (*len == 0) return -EINVAL; - if (start > maxbytes) return -EFBIG; /* * Shrink request scope to what the fs can actually handle. */ - if (len > maxbytes || (maxbytes - len) < start) - *new_len = maxbytes - start; + if (*len > maxbytes || (maxbytes - *len) < start) + *len = maxbytes - start; + + supported_flags |= FIEMAP_FLAG_SYNC; + supported_flags &= FIEMAP_FLAGS_COMPAT; + incompat_flags = fieinfo->fi_flags & ~supported_flags; + if (incompat_flags) { + fieinfo->fi_flags = incompat_flags; + return -EBADR; + } - return 0; + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) + ret = filemap_write_and_wait(inode->i_mapping); + return ret; } +EXPORT_SYMBOL(fiemap_prep); static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) { struct fiemap fiemap; struct fiemap_extent_info fieinfo = { 0, }; struct inode *inode = file_inode(filp); - struct super_block *sb = inode->i_sb; - u64 len; int error; if (!inode->i_op->fiemap) @@ -214,24 +209,13 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) if (fiemap.fm_extent_count > FIEMAP_MAX_EXTENTS) return -EINVAL; - error = fiemap_check_ranges(sb, fiemap.fm_start, fiemap.fm_length, - &len); - if (error) - return error; - fieinfo.fi_flags = fiemap.fm_flags; fieinfo.fi_extents_max = fiemap.fm_extent_count; fieinfo.fi_extents_start = ufiemap->fm_extents; - if (fiemap.fm_extent_count != 0 && - !access_ok(fieinfo.fi_extents_start, - fieinfo.fi_extents_max * sizeof(struct fiemap_extent))) - return -EFAULT; - - if (fieinfo.fi_flags & FIEMAP_FLAG_SYNC) - filemap_write_and_wait(inode->i_mapping); + error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, + fiemap.fm_length); - error = inode->i_op->fiemap(inode, &fieinfo, fiemap.fm_start, len); fiemap.fm_flags = fieinfo.fi_flags; fiemap.fm_mapped_extents = fieinfo.fi_extents_mapped; if (copy_to_user(ufiemap, &fiemap, sizeof(fiemap))) @@ -307,8 +291,7 @@ static inline loff_t blk_to_logical(struct inode *inode, sector_t blk) * If you use this function directly, you need to do your own locking. Use * generic_block_fiemap if you want the locking done for you. */ - -int __generic_block_fiemap(struct inode *inode, +static int __generic_block_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, loff_t start, loff_t len, get_block_t *get_block) { @@ -320,7 +303,7 @@ int __generic_block_fiemap(struct inode *inode, bool past_eof = false, whole_file = false; int ret = 0; - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(inode, fieinfo, start, &len, FIEMAP_FLAG_SYNC); if (ret) return ret; @@ -453,7 +436,6 @@ int __generic_block_fiemap(struct inode *inode, return ret; } -EXPORT_SYMBOL(__generic_block_fiemap); /** * generic_block_fiemap - FIEMAP for block based inodes diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index d55e8f491a5e..aab070df4a21 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -6,6 +6,7 @@ #include <linux/compiler.h> #include <linux/fs.h> #include <linux/iomap.h> +#include <linux/fiemap.h> struct fiemap_ctx { struct fiemap_extent_info *fi; @@ -65,7 +66,7 @@ iomap_fiemap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, } int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, - loff_t start, loff_t len, const struct iomap_ops *ops) + u64 start, u64 len, const struct iomap_ops *ops) { struct fiemap_ctx ctx; loff_t ret; @@ -74,16 +75,10 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, ctx.fi = fi; ctx.prev.type = IOMAP_HOLE; - ret = fiemap_check_flags(fi, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(inode, fi, start, &len, 0); if (ret) return ret; - if (fi->fi_flags & FIEMAP_FLAG_SYNC) { - ret = filemap_write_and_wait(inode->i_mapping); - if (ret) - return ret; - } - while (len > 0) { ret = iomap_apply(inode, start, len, IOMAP_REPORT, ops, &ctx, iomap_fiemap_actor); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 3dccc23cf010..e91aad3637a2 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -541,17 +541,24 @@ handle_t *jbd2_journal_start(journal_t *journal, int nblocks) } EXPORT_SYMBOL(jbd2_journal_start); -static void __jbd2_journal_unreserve_handle(handle_t *handle) +static void __jbd2_journal_unreserve_handle(handle_t *handle, transaction_t *t) { journal_t *journal = handle->h_journal; WARN_ON(!handle->h_reserved); sub_reserved_credits(journal, handle->h_total_credits); + if (t) + atomic_sub(handle->h_total_credits, &t->t_outstanding_credits); } void jbd2_journal_free_reserved(handle_t *handle) { - __jbd2_journal_unreserve_handle(handle); + journal_t *journal = handle->h_journal; + + /* Get j_state_lock to pin running transaction if it exists */ + read_lock(&journal->j_state_lock); + __jbd2_journal_unreserve_handle(handle, journal->j_running_transaction); + read_unlock(&journal->j_state_lock); jbd2_free_handle(handle); } EXPORT_SYMBOL(jbd2_journal_free_reserved); @@ -722,7 +729,8 @@ static void stop_this_handle(handle_t *handle) atomic_sub(handle->h_total_credits, &transaction->t_outstanding_credits); if (handle->h_rsv_handle) - __jbd2_journal_unreserve_handle(handle->h_rsv_handle); + __jbd2_journal_unreserve_handle(handle->h_rsv_handle, + transaction); if (atomic_dec_and_test(&transaction->t_updates)) wake_up(&journal->j_wait_updates); diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index ceeb3b441844..28009ec54420 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -14,6 +14,7 @@ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/uio.h> +#include <linux/fiemap.h> #include "nilfs.h" #include "btnode.h" #include "segment.h" @@ -996,7 +997,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, unsigned int blkbits = inode->i_blkbits; int ret, n; - ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); + ret = fiemap_prep(inode, fieinfo, start, &len, 0); if (ret) return ret; diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index e3e2d1b2af51..a94852af5510 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -733,8 +733,6 @@ static int ocfs2_fiemap_inline(struct inode *inode, struct buffer_head *di_bh, return 0; } -#define OCFS2_FIEMAP_FLAGS (FIEMAP_FLAG_SYNC) - int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 map_start, u64 map_len) { @@ -746,7 +744,7 @@ int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head *di_bh = NULL; struct ocfs2_extent_rec rec; - ret = fiemap_check_flags(fieinfo, OCFS2_FIEMAP_FLAGS); + ret = fiemap_prep(inode, fieinfo, map_start, &map_len, 0); if (ret) return ret; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index 981f11ec51bc..7af76b9004eb 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -10,6 +10,7 @@ #include <linux/xattr.h> #include <linux/posix_acl.h> #include <linux/ratelimit.h> +#include <linux/fiemap.h> #include "overlayfs.h" @@ -479,10 +480,6 @@ static int ovl_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return -EOPNOTSUPP; old_cred = ovl_override_creds(inode->i_sb); - - if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) - filemap_write_and_wait(realinode->i_mapping); - err = realinode->i_op->fiemap(realinode, fieinfo, start, len); revert_creds(old_cred); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index d66528fa3657..202b2c0a9e9d 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -25,6 +25,7 @@ #include <linux/posix_acl.h> #include <linux/security.h> #include <linux/iversion.h> +#include <linux/fiemap.h> /* * Directories have different lock order w.r.t. mmap_sem compared to regular diff --git a/include/linux/fiemap.h b/include/linux/fiemap.h new file mode 100644 index 000000000000..4e624c466583 --- /dev/null +++ b/include/linux/fiemap.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_FIEMAP_H +#define _LINUX_FIEMAP_H 1 + +#include <uapi/linux/fiemap.h> +#include <linux/fs.h> + +struct fiemap_extent_info { + unsigned int fi_flags; /* Flags as passed from user */ + unsigned int fi_extents_mapped; /* Number of mapped extents */ + unsigned int fi_extents_max; /* Size of fiemap_extent array */ + struct fiemap_extent __user *fi_extents_start; /* Start of + fiemap_extent array */ +}; + +int fiemap_prep(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 *len, u32 supported_flags); +int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, + u64 phys, u64 len, u32 flags); + +int generic_block_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, u64 start, u64 len, + get_block_t *get_block); + +#endif /* _LINUX_FIEMAP_H 1 */ diff --git a/include/linux/fs.h b/include/linux/fs.h index 9780ac31387c..bb0acc7e86d8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -24,7 +24,6 @@ #include <linux/capability.h> #include <linux/semaphore.h> #include <linux/fcntl.h> -#include <linux/fiemap.h> #include <linux/rculist_bl.h> #include <linux/atomic.h> #include <linux/shrinker.h> @@ -48,6 +47,7 @@ struct backing_dev_info; struct bdi_writeback; struct bio; struct export_operations; +struct fiemap_extent_info; struct hd_geometry; struct iovec; struct kiocb; @@ -1755,19 +1755,6 @@ extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, extern void inode_init_owner(struct inode *inode, const struct inode *dir, umode_t mode); extern bool may_open_dev(const struct path *path); -/* - * VFS FS_IOC_FIEMAP helper definitions. - */ -struct fiemap_extent_info { - unsigned int fi_flags; /* Flags as passed from user */ - unsigned int fi_extents_mapped; /* Number of mapped extents */ - unsigned int fi_extents_max; /* Size of fiemap_extent array */ - struct fiemap_extent __user *fi_extents_start; /* Start of - fiemap_extent array */ -}; -int fiemap_fill_next_extent(struct fiemap_extent_info *info, u64 logical, - u64 phys, u64 len, u32 flags); -int fiemap_check_flags(struct fiemap_extent_info *fieinfo, u32 fs_flags); /* * This is the "filldir" function type, used by readdir() to let @@ -3323,14 +3310,6 @@ static inline int vfs_fstat(int fd, struct kstat *stat) extern const char *vfs_get_link(struct dentry *, struct delayed_call *); extern int vfs_readlink(struct dentry *, char __user *, int); -extern int __generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, - loff_t start, loff_t len, - get_block_t *get_block); -extern int generic_block_fiemap(struct inode *inode, - struct fiemap_extent_info *fieinfo, u64 start, - u64 len, get_block_t *get_block); - extern struct file_system_type *get_filesystem(struct file_system_type *fs); extern void put_filesystem(struct file_system_type *fs); extern struct file_system_type *get_fs_type(const char *name); diff --git a/include/linux/iomap.h b/include/linux/iomap.h index a5c219c29b10..4d1d3c3469e9 100644 --- a/include/linux/iomap.h +++ b/include/linux/iomap.h @@ -177,7 +177,7 @@ int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops); int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - loff_t start, loff_t len, const struct iomap_ops *ops); + u64 start, u64 len, const struct iomap_ops *ops); loff_t iomap_seek_hole(struct inode *inode, loff_t offset, const struct iomap_ops *ops); loff_t iomap_seek_data(struct inode *inode, loff_t offset, diff --git a/include/linux/writeback.h b/include/linux/writeback.h index f8a7e1a850fb..8e5c5bb16e2d 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -197,6 +197,7 @@ void wakeup_flusher_threads(enum wb_reason reason); void wakeup_flusher_threads_bdi(struct backing_dev_info *bdi, enum wb_reason reason); void inode_wait_for_writeback(struct inode *inode); +void inode_io_list_del(struct inode *inode); /* writeback.h requires fs.h; it, too, is not included from here. */ static inline void wait_on_inode(struct inode *inode) diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 19c87661eeec..cc41d692ae8e 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -35,7 +35,8 @@ struct partial_cluster; { EXT4_MB_DELALLOC_RESERVED, "DELALLOC_RESV" }, \ { EXT4_MB_STREAM_ALLOC, "STREAM_ALLOC" }, \ { EXT4_MB_USE_ROOT_BLOCKS, "USE_ROOT_BLKS" }, \ - { EXT4_MB_USE_RESERVED, "USE_RESV" }) + { EXT4_MB_USE_RESERVED, "USE_RESV" }, \ + { EXT4_MB_STRICT_CHECK, "STRICT_CHECK" }) #define show_map_flags(flags) __print_flags(flags, "|", \ { EXT4_GET_BLOCKS_CREATE, "CREATE" }, \ @@ -45,8 +46,10 @@ struct partial_cluster; { EXT4_GET_BLOCKS_CONVERT, "CONVERT" }, \ { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ - { EXT4_GET_BLOCKS_KEEP_SIZE, "KEEP_SIZE" }, \ - { EXT4_GET_BLOCKS_ZERO, "ZERO" }) + { EXT4_GET_BLOCKS_CONVERT_UNWRITTEN, "CONVERT_UNWRITTEN" }, \ + { EXT4_GET_BLOCKS_ZERO, "ZERO" }, \ + { EXT4_GET_BLOCKS_IO_SUBMIT, "IO_SUBMIT" }, \ + { EXT4_EX_NOCACHE, "EX_NOCACHE" }) /* * __print_flags() requires that all enum values be wrapped in the diff --git a/include/uapi/linux/fiemap.h b/include/uapi/linux/fiemap.h index 8c0bc24d5d95..07c1cdcb715e 100644 --- a/include/uapi/linux/fiemap.h +++ b/include/uapi/linux/fiemap.h @@ -9,8 +9,8 @@ * Andreas Dilger <adilger@sun.com> */ -#ifndef _LINUX_FIEMAP_H -#define _LINUX_FIEMAP_H +#ifndef _UAPI_LINUX_FIEMAP_H +#define _UAPI_LINUX_FIEMAP_H #include <linux/types.h> @@ -67,4 +67,4 @@ struct fiemap { #define FIEMAP_EXTENT_SHARED 0x00002000 /* Space shared with other * files. */ -#endif /* _LINUX_FIEMAP_H */ +#endif /* _UAPI_LINUX_FIEMAP_H */ |