diff options
Diffstat (limited to 'fs/ext4')
-rw-r--r-- | fs/ext4/ext4.h | 69 | ||||
-rw-r--r-- | fs/ext4/extents.c | 219 | ||||
-rw-r--r-- | fs/ext4/extents_status.c | 321 | ||||
-rw-r--r-- | fs/ext4/extents_status.h | 82 | ||||
-rw-r--r-- | fs/ext4/file.c | 50 | ||||
-rw-r--r-- | fs/ext4/indirect.c | 123 | ||||
-rw-r--r-- | fs/ext4/inline.c | 35 | ||||
-rw-r--r-- | fs/ext4/inode.c | 203 | ||||
-rw-r--r-- | fs/ext4/ioctl.c | 2 | ||||
-rw-r--r-- | fs/ext4/mballoc.c | 15 | ||||
-rw-r--r-- | fs/ext4/migrate.c | 2 | ||||
-rw-r--r-- | fs/ext4/move_extent.c | 12 | ||||
-rw-r--r-- | fs/ext4/namei.c | 11 | ||||
-rw-r--r-- | fs/ext4/resize.c | 30 | ||||
-rw-r--r-- | fs/ext4/super.c | 169 |
15 files changed, 794 insertions, 549 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index c55a1faaed58..f63c3d5805c4 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -158,17 +158,8 @@ struct ext4_allocation_request { #define EXT4_MAP_MAPPED (1 << BH_Mapped) #define EXT4_MAP_UNWRITTEN (1 << BH_Unwritten) #define EXT4_MAP_BOUNDARY (1 << BH_Boundary) -/* Sometimes (in the bigalloc case, from ext4_da_get_block_prep) the caller of - * ext4_map_blocks wants to know whether or not the underlying cluster has - * already been accounted for. EXT4_MAP_FROM_CLUSTER conveys to the caller that - * the requested mapping was from previously mapped (or delayed allocated) - * cluster. We use BH_AllocFromCluster only for this flag. BH_AllocFromCluster - * should never appear on buffer_head's state flags. - */ -#define EXT4_MAP_FROM_CLUSTER (1 << BH_AllocFromCluster) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ - EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ - EXT4_MAP_FROM_CLUSTER) + EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -373,7 +364,8 @@ struct flex_groups { #define EXT4_DIRTY_FL 0x00000100 #define EXT4_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ #define EXT4_NOCOMPR_FL 0x00000400 /* Don't compress */ -#define EXT4_ECOMPR_FL 0x00000800 /* Compression error */ + /* nb: was previously EXT2_ECOMPR_FL */ +#define EXT4_ENCRYPT_FL 0x00000800 /* encrypted file */ /* End compression flags --- maybe not all used */ #define EXT4_INDEX_FL 0x00001000 /* hash-indexed directory */ #define EXT4_IMAGIC_FL 0x00002000 /* AFS directory */ @@ -430,7 +422,7 @@ enum { EXT4_INODE_DIRTY = 8, EXT4_INODE_COMPRBLK = 9, /* One or more compressed clusters */ EXT4_INODE_NOCOMPR = 10, /* Don't compress */ - EXT4_INODE_ECOMPR = 11, /* Compression error */ + EXT4_INODE_ENCRYPT = 11, /* Compression error */ /* End compression flags --- maybe not all used */ EXT4_INODE_INDEX = 12, /* hash-indexed directory */ EXT4_INODE_IMAGIC = 13, /* AFS directory */ @@ -475,7 +467,7 @@ static inline void ext4_check_flag_values(void) CHECK_FLAG_VALUE(DIRTY); CHECK_FLAG_VALUE(COMPRBLK); CHECK_FLAG_VALUE(NOCOMPR); - CHECK_FLAG_VALUE(ECOMPR); + CHECK_FLAG_VALUE(ENCRYPT); CHECK_FLAG_VALUE(INDEX); CHECK_FLAG_VALUE(IMAGIC); CHECK_FLAG_VALUE(JOURNAL_DATA); @@ -565,10 +557,8 @@ enum { #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 /* Do not take i_data_sem locking in ext4_map_blocks */ #define EXT4_GET_BLOCKS_NO_LOCK 0x0100 - /* Do not put hole in extent cache */ -#define EXT4_GET_BLOCKS_NO_PUT_HOLE 0x0200 /* Convert written extents to unwritten */ -#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0400 +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200 /* * The bit position of these flags must not overlap with any of the @@ -889,10 +879,12 @@ struct ext4_inode_info { /* extents status tree */ struct ext4_es_tree i_es_tree; rwlock_t i_es_lock; - struct list_head i_es_lru; + struct list_head i_es_list; unsigned int i_es_all_nr; /* protected by i_es_lock */ - unsigned int i_es_lru_nr; /* protected by i_es_lock */ - unsigned long i_touch_when; /* jiffies of last accessing */ + unsigned int i_es_shk_nr; /* protected by i_es_lock */ + ext4_lblk_t i_es_shrink_lblk; /* Offset where we start searching for + extents to shrink. Protected by + i_es_lock */ /* ialloc */ ext4_group_t i_last_alloc_group; @@ -941,6 +933,10 @@ struct ext4_inode_info { tid_t i_sync_tid; tid_t i_datasync_tid; +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; +#endif + /* Precomputed uuid+inum+igen checksum for seeding inode checksums */ __u32 i_csum_seed; }; @@ -970,6 +966,11 @@ struct ext4_inode_info { #define EXT4_MOUNT_ERRORS_MASK 0x00070 #define EXT4_MOUNT_MINIX_DF 0x00080 /* Mimics the Minix statfs */ #define EXT4_MOUNT_NOLOAD 0x00100 /* Don't use existing journal*/ +#ifdef CONFIG_FS_DAX +#define EXT4_MOUNT_DAX 0x00200 /* Direct Access */ +#else +#define EXT4_MOUNT_DAX 0 +#endif #define EXT4_MOUNT_DATA_FLAGS 0x00C00 /* Mode for data writes: */ #define EXT4_MOUNT_JOURNAL_DATA 0x00400 /* Write data to journal */ #define EXT4_MOUNT_ORDERED_DATA 0x00800 /* Flush data before commit */ @@ -1048,6 +1049,12 @@ extern void ext4_set_bits(void *bm, int cur, int len); /* Metadata checksum algorithm codes */ #define EXT4_CRC32C_CHKSUM 1 +/* Encryption algorithms */ +#define EXT4_ENCRYPTION_MODE_INVALID 0 +#define EXT4_ENCRYPTION_MODE_AES_256_XTS 1 +#define EXT4_ENCRYPTION_MODE_AES_256_GCM 2 +#define EXT4_ENCRYPTION_MODE_AES_256_CBC 3 + /* * Structure of the super block */ @@ -1161,7 +1168,8 @@ struct ext4_super_block { __le32 s_grp_quota_inum; /* inode for tracking group quota */ __le32 s_overhead_clusters; /* overhead blocks/clusters in fs */ __le32 s_backup_bgs[2]; /* groups with sparse_super2 SBs */ - __le32 s_reserved[106]; /* Padding to the end of the block */ + __u8 s_encrypt_algos[4]; /* Encryption algorithms in use */ + __le32 s_reserved[105]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -1333,10 +1341,11 @@ struct ext4_sb_info { /* Reclaim extents from extent status tree */ struct shrinker s_es_shrinker; - struct list_head s_es_lru; + struct list_head s_es_list; /* List of inodes with reclaimable extents */ + long s_es_nr_inode; struct ext4_es_stats s_es_stats; struct mb_cache *s_mb_cache; - spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; + spinlock_t s_es_lock ____cacheline_aligned_in_smp; /* Ratelimit ext4 messages. */ struct ratelimit_state s_err_ratelimit_state; @@ -1526,6 +1535,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) * GDT_CSUM bits are mutually exclusive. */ #define EXT4_FEATURE_RO_COMPAT_METADATA_CSUM 0x0400 +#define EXT4_FEATURE_RO_COMPAT_READONLY 0x1000 #define EXT4_FEATURE_INCOMPAT_COMPRESSION 0x0001 #define EXT4_FEATURE_INCOMPAT_FILETYPE 0x0002 @@ -1541,6 +1551,7 @@ static inline void ext4_clear_state_flags(struct ext4_inode_info *ei) #define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM 0x2000 /* use crc32c for bg */ #define EXT4_FEATURE_INCOMPAT_LARGEDIR 0x4000 /* >2GB or 3-lvl htree */ #define EXT4_FEATURE_INCOMPAT_INLINE_DATA 0x8000 /* data in inode */ +#define EXT4_FEATURE_INCOMPAT_ENCRYPT 0x10000 #define EXT2_FEATURE_COMPAT_SUPP EXT4_FEATURE_COMPAT_EXT_ATTR #define EXT2_FEATURE_INCOMPAT_SUPP (EXT4_FEATURE_INCOMPAT_FILETYPE| \ @@ -2192,7 +2203,6 @@ extern int ext4_calculate_overhead(struct super_block *sb); extern void ext4_superblock_csum_set(struct super_block *sb); extern void *ext4_kvmalloc(size_t size, gfp_t flags); extern void *ext4_kvzalloc(size_t size, gfp_t flags); -extern void ext4_kvfree(void *ptr); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); extern const char *ext4_decode_error(struct super_block *sb, int errno, @@ -2583,6 +2593,7 @@ extern const struct file_operations ext4_dir_operations; /* file.c */ extern const struct inode_operations ext4_file_inode_operations; extern const struct file_operations ext4_file_operations; +extern const struct file_operations ext4_dax_file_operations; extern loff_t ext4_llseek(struct file *file, loff_t offset, int origin); /* inline.c */ @@ -2643,7 +2654,7 @@ extern struct buffer_head *ext4_get_first_inline_block(struct inode *inode, int *retval); extern int ext4_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - int *has_inline); + int *has_inline, __u64 start, __u64 len); extern int ext4_try_to_evict_inline_data(handle_t *handle, struct inode *inode, int needed); @@ -2791,16 +2802,6 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io, extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t); /* - * Note that these flags will never ever appear in a buffer_head's state flag. - * See EXT4_MAP_... to see where this is used. - */ -enum ext4_state_bits { - BH_AllocFromCluster /* allocated blocks were part of already - * allocated cluster. */ - = BH_JBDPrivateStart -}; - -/* * Add new method to test whether block and inode bitmaps are properly * initialized. With uninit_bg reading the block from disk is not enough * to mark the bitmap uptodate. We need to also zero-out the bitmap diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 0b16fb4c06d3..bed43081720f 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2306,16 +2306,16 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t block) { int depth = ext_depth(inode); - unsigned long len = 0; - ext4_lblk_t lblock = 0; + ext4_lblk_t len; + ext4_lblk_t lblock; struct ext4_extent *ex; + struct extent_status es; ex = path[depth].p_ext; if (ex == NULL) { - /* - * there is no extent yet, so gap is [0;-] and we - * don't cache it - */ + /* there is no extent yet, so gap is [0;-] */ + lblock = 0; + len = EXT_MAX_BLOCKS; ext_debug("cache gap(whole file):"); } else if (block < le32_to_cpu(ex->ee_block)) { lblock = block; @@ -2324,9 +2324,6 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, block, le32_to_cpu(ex->ee_block), ext4_ext_get_actual_len(ex)); - if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) - ext4_es_insert_extent(inode, lblock, len, ~0, - EXTENT_STATUS_HOLE); } else if (block >= le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex)) { ext4_lblk_t next; @@ -2340,14 +2337,19 @@ ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path, block); BUG_ON(next == lblock); len = next - lblock; - if (!ext4_find_delalloc_range(inode, lblock, lblock + len - 1)) - ext4_es_insert_extent(inode, lblock, len, ~0, - EXTENT_STATUS_HOLE); } else { BUG(); } - ext_debug(" -> %u:%lu\n", lblock, len); + ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es); + if (es.es_len) { + /* There's delayed extent containing lblock? */ + if (es.es_lblk <= lblock) + return; + len = min(es.es_lblk - lblock, len); + } + ext_debug(" -> %u:%u\n", lblock, len); + ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE); } /* @@ -2481,7 +2483,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t from, ext4_lblk_t to) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - unsigned short ee_len = ext4_ext_get_actual_len(ex); + unsigned short ee_len = ext4_ext_get_actual_len(ex); ext4_fsblk_t pblk; int flags = get_default_free_blocks_flags(inode); @@ -2490,7 +2492,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * at the beginning of the extent. Instead, we make a note * that we tried freeing the cluster, and check to see if we * need to free it on a subsequent call to ext4_remove_blocks, - * or at the end of the ext4_truncate() operation. + * or at the end of ext4_ext_rm_leaf or ext4_ext_remove_space. */ flags |= EXT4_FREE_BLOCKS_NOFREE_FIRST_CLUSTER; @@ -2501,8 +2503,8 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * partial cluster here. */ pblk = ext4_ext_pblock(ex) + ee_len - 1; - if ((*partial_cluster > 0) && - (EXT4_B2C(sbi, pblk) != *partial_cluster)) { + if (*partial_cluster > 0 && + *partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { ext4_free_blocks(handle, inode, NULL, EXT4_C2B(sbi, *partial_cluster), sbi->s_cluster_ratio, flags); @@ -2528,7 +2530,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, && to == le32_to_cpu(ex->ee_block) + ee_len - 1) { /* tail removal */ ext4_lblk_t num; - unsigned int unaligned; + long long first_cluster; num = le32_to_cpu(ex->ee_block) + ee_len - from; pblk = ext4_ext_pblock(ex) + ee_len - num; @@ -2538,7 +2540,7 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * used by any other extent (partial_cluster is negative). */ if (*partial_cluster < 0 && - -(*partial_cluster) == EXT4_B2C(sbi, pblk + num - 1)) + *partial_cluster == -(long long) EXT4_B2C(sbi, pblk+num-1)) flags |= EXT4_FREE_BLOCKS_NOFREE_LAST_CLUSTER; ext_debug("free last %u blocks starting %llu partial %lld\n", @@ -2549,21 +2551,24 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, * beginning of a cluster, and we removed the entire * extent and the cluster is not used by any other extent, * save the partial cluster here, since we might need to - * delete if we determine that the truncate operation has - * removed all of the blocks in the cluster. + * delete if we determine that the truncate or punch hole + * operation has removed all of the blocks in the cluster. + * If that cluster is used by another extent, preserve its + * negative value so it isn't freed later on. * - * On the other hand, if we did not manage to free the whole - * extent, we have to mark the cluster as used (store negative - * cluster number in partial_cluster). + * If the whole extent wasn't freed, we've reached the + * start of the truncated/punched region and have finished + * removing blocks. If there's a partial cluster here it's + * shared with the remainder of the extent and is no longer + * a candidate for removal. */ - unaligned = EXT4_PBLK_COFF(sbi, pblk); - if (unaligned && (ee_len == num) && - (*partial_cluster != -((long long)EXT4_B2C(sbi, pblk)))) - *partial_cluster = EXT4_B2C(sbi, pblk); - else if (unaligned) - *partial_cluster = -((long long)EXT4_B2C(sbi, pblk)); - else if (*partial_cluster > 0) + if (EXT4_PBLK_COFF(sbi, pblk) && ee_len == num) { + first_cluster = (long long) EXT4_B2C(sbi, pblk); + if (first_cluster != -*partial_cluster) + *partial_cluster = first_cluster; + } else { *partial_cluster = 0; + } } else ext4_error(sbi->s_sb, "strange request: removal(2) " "%u-%u from %u:%u\n", @@ -2574,15 +2579,16 @@ static int ext4_remove_blocks(handle_t *handle, struct inode *inode, /* * ext4_ext_rm_leaf() Removes the extents associated with the - * blocks appearing between "start" and "end", and splits the extents - * if "start" and "end" appear in the same extent + * blocks appearing between "start" and "end". Both "start" + * and "end" must appear in the same extent or EIO is returned. * * @handle: The journal handle * @inode: The files inode * @path: The path to the leaf * @partial_cluster: The cluster which we'll have to free if all extents - * has been released from it. It gets negative in case - * that the cluster is still used. + * has been released from it. However, if this value is + * negative, it's a cluster just to the right of the + * punched region and it must not be freed. * @start: The first block to remove * @end: The last block to remove */ @@ -2621,27 +2627,6 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); - /* - * If we're starting with an extent other than the last one in the - * node, we need to see if it shares a cluster with the extent to - * the right (towards the end of the file). If its leftmost cluster - * is this extent's rightmost cluster and it is not cluster aligned, - * we'll mark it as a partial that is not to be deallocated. - */ - - if (ex != EXT_LAST_EXTENT(eh)) { - ext4_fsblk_t current_pblk, right_pblk; - long long current_cluster, right_cluster; - - current_pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; - current_cluster = (long long)EXT4_B2C(sbi, current_pblk); - right_pblk = ext4_ext_pblock(ex + 1); - right_cluster = (long long)EXT4_B2C(sbi, right_pblk); - if (current_cluster == right_cluster && - EXT4_PBLK_COFF(sbi, right_pblk)) - *partial_cluster = -right_cluster; - } - trace_ext4_ext_rm_leaf(inode, start, ex, *partial_cluster); while (ex >= EXT_FIRST_EXTENT(eh) && @@ -2666,14 +2651,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, if (end < ex_ee_block) { /* * We're going to skip this extent and move to another, - * so if this extent is not cluster aligned we have - * to mark the current cluster as used to avoid - * accidentally freeing it later on + * so note that its first cluster is in use to avoid + * freeing it when removing blocks. Eventually, the + * right edge of the truncated/punched region will + * be just to the left. */ - pblk = ext4_ext_pblock(ex); - if (EXT4_PBLK_COFF(sbi, pblk)) + if (sbi->s_cluster_ratio > 1) { + pblk = ext4_ext_pblock(ex); *partial_cluster = - -((long long)EXT4_B2C(sbi, pblk)); + -(long long) EXT4_B2C(sbi, pblk); + } ex--; ex_ee_block = le32_to_cpu(ex->ee_block); ex_ee_len = ext4_ext_get_actual_len(ex); @@ -2749,8 +2736,7 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, sizeof(struct ext4_extent)); } le16_add_cpu(&eh->eh_entries, -1); - } else if (*partial_cluster > 0) - *partial_cluster = 0; + } err = ext4_ext_dirty(handle, inode, path + depth); if (err) @@ -2769,20 +2755,18 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode, /* * If there's a partial cluster and at least one extent remains in * the leaf, free the partial cluster if it isn't shared with the - * current extent. If there's a partial cluster and no extents - * remain in the leaf, it can't be freed here. It can only be - * freed when it's possible to determine if it's not shared with - * any other extent - when the next leaf is processed or when space - * removal is complete. + * current extent. If it is shared with the current extent + * we zero partial_cluster because we've reached the start of the + * truncated/punched region and we're done removing blocks. */ - if (*partial_cluster > 0 && eh->eh_entries && - (EXT4_B2C(sbi, ext4_ext_pblock(ex) + ex_ee_len - 1) != - *partial_cluster)) { - int flags = get_default_free_blocks_flags(inode); - - ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(sbi, *partial_cluster), - sbi->s_cluster_ratio, flags); + if (*partial_cluster > 0 && ex >= EXT_FIRST_EXTENT(eh)) { + pblk = ext4_ext_pblock(ex) + ex_ee_len - 1; + if (*partial_cluster != (long long) EXT4_B2C(sbi, pblk)) { + ext4_free_blocks(handle, inode, NULL, + EXT4_C2B(sbi, *partial_cluster), + sbi->s_cluster_ratio, + get_default_free_blocks_flags(inode)); + } *partial_cluster = 0; } @@ -2819,7 +2803,7 @@ ext4_ext_more_to_rm(struct ext4_ext_path *path) int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, ext4_lblk_t end) { - struct super_block *sb = inode->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); int depth = ext_depth(inode); struct ext4_ext_path *path = NULL; long long partial_cluster = 0; @@ -2845,9 +2829,10 @@ again: */ if (end < EXT_MAX_BLOCKS - 1) { struct ext4_extent *ex; - ext4_lblk_t ee_block; + ext4_lblk_t ee_block, ex_end, lblk; + ext4_fsblk_t pblk; - /* find extent for this block */ + /* find extent for or closest extent to this block */ path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) { ext4_journal_stop(handle); @@ -2867,6 +2852,7 @@ again: } ee_block = le32_to_cpu(ex->ee_block); + ex_end = ee_block + ext4_ext_get_actual_len(ex) - 1; /* * See if the last block is inside the extent, if so split @@ -2874,8 +2860,19 @@ again: * tail of the first part of the split extent in * ext4_ext_rm_leaf(). */ - if (end >= ee_block && - end < ee_block + ext4_ext_get_actual_len(ex) - 1) { + if (end >= ee_block && end < ex_end) { + + /* + * If we're going to split the extent, note that + * the cluster containing the block after 'end' is + * in use to avoid freeing it when removing blocks. + */ + if (sbi->s_cluster_ratio > 1) { + pblk = ext4_ext_pblock(ex) + end - ee_block + 2; + partial_cluster = + -(long long) EXT4_B2C(sbi, pblk); + } + /* * Split the extent in two so that 'end' is the last * block in the first new extent. Also we should not @@ -2886,6 +2883,24 @@ again: end + 1, 1); if (err < 0) goto out; + + } else if (sbi->s_cluster_ratio > 1 && end >= ex_end) { + /* + * If there's an extent to the right its first cluster + * contains the immediate right boundary of the + * truncated/punched region. Set partial_cluster to + * its negative value so it won't be freed if shared + * with the current extent. The end < ee_block case + * is handled in ext4_ext_rm_leaf(). + */ + lblk = ex_end + 1; + err = ext4_ext_search_right(inode, path, &lblk, &pblk, + &ex); + if (err) + goto out; + if (pblk) + partial_cluster = + -(long long) EXT4_B2C(sbi, pblk); } } /* @@ -2996,16 +3011,18 @@ again: trace_ext4_ext_remove_space_done(inode, start, end, depth, partial_cluster, path->p_hdr->eh_entries); - /* If we still have something in the partial cluster and we have removed + /* + * If we still have something in the partial cluster and we have removed * even the first extent, then we should free the blocks in the partial - * cluster as well. */ - if (partial_cluster > 0 && path->p_hdr->eh_entries == 0) { - int flags = get_default_free_blocks_flags(inode); - + * cluster as well. (This code will only run when there are no leaves + * to the immediate left of the truncated/punched region.) + */ + if (partial_cluster > 0 && err == 0) { + /* don't zero partial_cluster since it's not used afterwards */ ext4_free_blocks(handle, inode, NULL, - EXT4_C2B(EXT4_SB(sb), partial_cluster), - EXT4_SB(sb)->s_cluster_ratio, flags); - partial_cluster = 0; + EXT4_C2B(sbi, partial_cluster), + sbi->s_cluster_ratio, + get_default_free_blocks_flags(inode)); } /* TODO: flexible tree reduction should be here */ @@ -4267,6 +4284,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ext4_io_end_t *io = ext4_inode_aio(inode); ext4_lblk_t cluster_offset; int set_unwritten = 0; + bool map_from_cluster = false; ext_debug("blocks %u/%u requested for inode %lu\n", map->m_lblk, map->m_len, inode->i_ino); @@ -4343,10 +4361,6 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, } } - if ((sbi->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk)) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; - /* * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero @@ -4356,15 +4370,13 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, * put just found gap into cache to speed up * subsequent requests */ - if ((flags & EXT4_GET_BLOCKS_NO_PUT_HOLE) == 0) - ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); + ext4_ext_put_gap_in_cache(inode, path, map->m_lblk); goto out2; } /* * Okay, we need to do block allocation. */ - map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; newex.ee_block = cpu_to_le32(map->m_lblk); cluster_offset = EXT4_LBLK_COFF(sbi, map->m_lblk); @@ -4376,7 +4388,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; - map->m_flags |= EXT4_MAP_FROM_CLUSTER; + map_from_cluster = true; goto got_allocated_blocks; } @@ -4397,7 +4409,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, get_implied_cluster_alloc(inode->i_sb, map, ex2, path)) { ar.len = allocated = map->m_len; newblock = map->m_pblk; - map->m_flags |= EXT4_MAP_FROM_CLUSTER; + map_from_cluster = true; goto got_allocated_blocks; } @@ -4523,7 +4535,7 @@ got_allocated_blocks: */ reserved_clusters = get_reserved_cluster_alloc(inode, map->m_lblk, allocated); - if (map->m_flags & EXT4_MAP_FROM_CLUSTER) { + if (map_from_cluster) { if (reserved_clusters) { /* * We have clusters reserved for this range. @@ -4620,7 +4632,6 @@ out2: trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); - ext4_es_lru_add(inode); return err ? err : allocated; } @@ -5140,7 +5151,8 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ext4_has_inline_data(inode)) { int has_inline = 1; - error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline); + error = ext4_inline_data_fiemap(inode, fieinfo, &has_inline, + start, len); if (has_inline) return error; @@ -5179,7 +5191,6 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, error = ext4_fill_fiemap_extents(inode, start_blk, len_blks, fieinfo); } - ext4_es_lru_add(inode); return error; } @@ -5239,8 +5250,6 @@ ext4_ext_shift_path_extents(struct ext4_ext_path *path, ext4_lblk_t shift, return -EIO; ex_last = EXT_LAST_EXTENT(path[depth].p_hdr); - if (!ex_last) - return -EIO; err = ext4_access_path(handle, inode, path + depth); if (err) diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 94e7855ae71b..e04d45733976 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -147,10 +147,9 @@ static struct kmem_cache *ext4_es_cachep; static int __es_insert_extent(struct inode *inode, struct extent_status *newes); static int __es_remove_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t end); -static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, - int nr_to_scan); -static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, - struct ext4_inode_info *locked_ei); +static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan); +static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei); int __init ext4_init_es(void) { @@ -298,6 +297,36 @@ out: trace_ext4_es_find_delayed_extent_range_exit(inode, es); } +static void ext4_es_list_add(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + if (!list_empty(&ei->i_es_list)) + return; + + spin_lock(&sbi->s_es_lock); + if (list_empty(&ei->i_es_list)) { + list_add_tail(&ei->i_es_list, &sbi->s_es_list); + sbi->s_es_nr_inode++; + } + spin_unlock(&sbi->s_es_lock); +} + +static void ext4_es_list_del(struct inode *inode) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + + spin_lock(&sbi->s_es_lock); + if (!list_empty(&ei->i_es_list)) { + list_del_init(&ei->i_es_list); + sbi->s_es_nr_inode--; + WARN_ON_ONCE(sbi->s_es_nr_inode < 0); + } + spin_unlock(&sbi->s_es_lock); +} + static struct extent_status * ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, ext4_fsblk_t pblk) @@ -314,9 +343,10 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, * We don't count delayed extent because we never try to reclaim them */ if (!ext4_es_is_delayed(es)) { - EXT4_I(inode)->i_es_lru_nr++; + if (!EXT4_I(inode)->i_es_shk_nr++) + ext4_es_list_add(inode); percpu_counter_inc(&EXT4_SB(inode->i_sb)-> - s_es_stats.es_stats_lru_cnt); + s_es_stats.es_stats_shk_cnt); } EXT4_I(inode)->i_es_all_nr++; @@ -330,12 +360,13 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) EXT4_I(inode)->i_es_all_nr--; percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); - /* Decrease the lru counter when this es is not delayed */ + /* Decrease the shrink counter when this es is not delayed */ if (!ext4_es_is_delayed(es)) { - BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); - EXT4_I(inode)->i_es_lru_nr--; + BUG_ON(EXT4_I(inode)->i_es_shk_nr == 0); + if (!--EXT4_I(inode)->i_es_shk_nr) + ext4_es_list_del(inode); percpu_counter_dec(&EXT4_SB(inode->i_sb)-> - s_es_stats.es_stats_lru_cnt); + s_es_stats.es_stats_shk_cnt); } kmem_cache_free(ext4_es_cachep, es); @@ -351,7 +382,7 @@ static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) static int ext4_es_can_be_merged(struct extent_status *es1, struct extent_status *es2) { - if (ext4_es_status(es1) != ext4_es_status(es2)) + if (ext4_es_type(es1) != ext4_es_type(es2)) return 0; if (((__u64) es1->es_len) + es2->es_len > EXT_MAX_BLOCKS) { @@ -394,6 +425,8 @@ ext4_es_try_to_merge_left(struct inode *inode, struct extent_status *es) es1 = rb_entry(node, struct extent_status, rb_node); if (ext4_es_can_be_merged(es1, es)) { es1->es_len += es->es_len; + if (ext4_es_is_referenced(es)) + ext4_es_set_referenced(es1); rb_erase(&es->rb_node, &tree->root); ext4_es_free_extent(inode, es); es = es1; @@ -416,6 +449,8 @@ ext4_es_try_to_merge_right(struct inode *inode, struct extent_status *es) es1 = rb_entry(node, struct extent_status, rb_node); if (ext4_es_can_be_merged(es, es1)) { es->es_len += es1->es_len; + if (ext4_es_is_referenced(es1)) + ext4_es_set_referenced(es); rb_erase(node, &tree->root); ext4_es_free_extent(inode, es1); } @@ -683,8 +718,8 @@ int ext4_es_insert_extent(struct inode *inode, ext4_lblk_t lblk, goto error; retry: err = __es_insert_extent(inode, &newes); - if (err == -ENOMEM && __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, - EXT4_I(inode))) + if (err == -ENOMEM && __es_shrink(EXT4_SB(inode->i_sb), + 128, EXT4_I(inode))) goto retry; if (err == -ENOMEM && !ext4_es_is_delayed(&newes)) err = 0; @@ -782,6 +817,8 @@ out: es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; + if (!ext4_es_is_referenced(es)) + ext4_es_set_referenced(es); stats->es_stats_cache_hits++; } else { stats->es_stats_cache_misses++; @@ -841,8 +878,8 @@ retry: es->es_lblk = orig_es.es_lblk; es->es_len = orig_es.es_len; if ((err == -ENOMEM) && - __ext4_es_shrink(EXT4_SB(inode->i_sb), 1, - EXT4_I(inode))) + __es_shrink(EXT4_SB(inode->i_sb), + 128, EXT4_I(inode))) goto retry; goto out; } @@ -914,6 +951,11 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, end = lblk + len - 1; BUG_ON(end < lblk); + /* + * ext4_clear_inode() depends on us taking i_es_lock unconditionally + * so that we are sure __es_shrink() is done with the inode before it + * is reclaimed. + */ write_lock(&EXT4_I(inode)->i_es_lock); err = __es_remove_extent(inode, lblk, end); write_unlock(&EXT4_I(inode)->i_es_lock); @@ -921,114 +963,75 @@ int ext4_es_remove_extent(struct inode *inode, ext4_lblk_t lblk, return err; } -static int ext4_inode_touch_time_cmp(void *priv, struct list_head *a, - struct list_head *b) -{ - struct ext4_inode_info *eia, *eib; - eia = list_entry(a, struct ext4_inode_info, i_es_lru); - eib = list_entry(b, struct ext4_inode_info, i_es_lru); - - if (ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && - !ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) - return 1; - if (!ext4_test_inode_state(&eia->vfs_inode, EXT4_STATE_EXT_PRECACHED) && - ext4_test_inode_state(&eib->vfs_inode, EXT4_STATE_EXT_PRECACHED)) - return -1; - if (eia->i_touch_when == eib->i_touch_when) - return 0; - if (time_after(eia->i_touch_when, eib->i_touch_when)) - return 1; - else - return -1; -} - -static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, - struct ext4_inode_info *locked_ei) +static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, + struct ext4_inode_info *locked_ei) { struct ext4_inode_info *ei; struct ext4_es_stats *es_stats; - struct list_head *cur, *tmp; - LIST_HEAD(skipped); ktime_t start_time; u64 scan_time; + int nr_to_walk; int nr_shrunk = 0; - int retried = 0, skip_precached = 1, nr_skipped = 0; + int retried = 0, nr_skipped = 0; es_stats = &sbi->s_es_stats; start_time = ktime_get(); - spin_lock(&sbi->s_es_lru_lock); retry: - list_for_each_safe(cur, tmp, &sbi->s_es_lru) { - int shrunk; - - /* - * If we have already reclaimed all extents from extent - * status tree, just stop the loop immediately. - */ - if (percpu_counter_read_positive( - &es_stats->es_stats_lru_cnt) == 0) - break; - - ei = list_entry(cur, struct ext4_inode_info, i_es_lru); + spin_lock(&sbi->s_es_lock); + nr_to_walk = sbi->s_es_nr_inode; + while (nr_to_walk-- > 0) { + if (list_empty(&sbi->s_es_list)) { + spin_unlock(&sbi->s_es_lock); + goto out; + } + ei = list_first_entry(&sbi->s_es_list, struct ext4_inode_info, + i_es_list); + /* Move the inode to the tail */ + list_move_tail(&ei->i_es_list, &sbi->s_es_list); /* - * Skip the inode that is newer than the last_sorted - * time. Normally we try hard to avoid shrinking - * precached inodes, but we will as a last resort. + * Normally we try hard to avoid shrinking precached inodes, + * but we will as a last resort. */ - if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || - (skip_precached && ext4_test_inode_state(&ei->vfs_inode, - EXT4_STATE_EXT_PRECACHED))) { + if (!retried && ext4_test_inode_state(&ei->vfs_inode, + EXT4_STATE_EXT_PRECACHED)) { nr_skipped++; - list_move_tail(cur, &skipped); continue; } - if (ei->i_es_lru_nr == 0 || ei == locked_ei || - !write_trylock(&ei->i_es_lock)) + if (ei == locked_ei || !write_trylock(&ei->i_es_lock)) { + nr_skipped++; continue; + } + /* + * Now we hold i_es_lock which protects us from inode reclaim + * freeing inode under us + */ + spin_unlock(&sbi->s_es_lock); - shrunk = __es_try_to_reclaim_extents(ei, nr_to_scan); - if (ei->i_es_lru_nr == 0) - list_del_init(&ei->i_es_lru); + nr_shrunk += es_reclaim_extents(ei, &nr_to_scan); write_unlock(&ei->i_es_lock); - nr_shrunk += shrunk; - nr_to_scan -= shrunk; - if (nr_to_scan == 0) - break; + if (nr_to_scan <= 0) + goto out; + spin_lock(&sbi->s_es_lock); } - - /* Move the newer inodes into the tail of the LRU list. */ - list_splice_tail(&skipped, &sbi->s_es_lru); - INIT_LIST_HEAD(&skipped); + spin_unlock(&sbi->s_es_lock); /* * If we skipped any inodes, and we weren't able to make any - * forward progress, sort the list and try again. + * forward progress, try again to scan precached inodes. */ if ((nr_shrunk == 0) && nr_skipped && !retried) { retried++; - list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); - es_stats->es_stats_last_sorted = jiffies; - ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, - i_es_lru); - /* - * If there are no non-precached inodes left on the - * list, start releasing precached extents. - */ - if (ext4_test_inode_state(&ei->vfs_inode, - EXT4_STATE_EXT_PRECACHED)) - skip_precached = 0; goto retry; } - spin_unlock(&sbi->s_es_lru_lock); - if (locked_ei && nr_shrunk == 0) - nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); + nr_shrunk = es_reclaim_extents(locked_ei, &nr_to_scan); +out: scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); if (likely(es_stats->es_stats_scan_time)) es_stats->es_stats_scan_time = (scan_time + @@ -1043,7 +1046,7 @@ retry: else es_stats->es_stats_shrunk = nr_shrunk; - trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached, + trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, nr_skipped, retried); return nr_shrunk; } @@ -1055,7 +1058,7 @@ static unsigned long ext4_es_count(struct shrinker *shrink, struct ext4_sb_info *sbi; sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); - nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); + nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; } @@ -1068,13 +1071,13 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; - ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_shk_cnt); trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); if (!nr_to_scan) return ret; - nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); + nr_shrunk = __es_shrink(sbi, nr_to_scan, NULL); trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); return nr_shrunk; @@ -1102,28 +1105,24 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) return 0; /* here we just find an inode that has the max nr. of objects */ - spin_lock(&sbi->s_es_lru_lock); - list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { + spin_lock(&sbi->s_es_lock); + list_for_each_entry(ei, &sbi->s_es_list, i_es_list) { inode_cnt++; if (max && max->i_es_all_nr < ei->i_es_all_nr) max = ei; else if (!max) max = ei; } - spin_unlock(&sbi->s_es_lru_lock); + spin_unlock(&sbi->s_es_lock); seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), - percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); + percpu_counter_sum_positive(&es_stats->es_stats_shk_cnt)); seq_printf(seq, " %lu/%lu cache hits/misses\n", es_stats->es_stats_cache_hits, es_stats->es_stats_cache_misses); - if (es_stats->es_stats_last_sorted != 0) - seq_printf(seq, " %u ms last sorted interval\n", - jiffies_to_msecs(jiffies - - es_stats->es_stats_last_sorted)); if (inode_cnt) - seq_printf(seq, " %d inodes on lru list\n", inode_cnt); + seq_printf(seq, " %d inodes on list\n", inode_cnt); seq_printf(seq, "average:\n %llu us scan time\n", div_u64(es_stats->es_stats_scan_time, 1000)); @@ -1132,7 +1131,7 @@ static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) seq_printf(seq, "maximum:\n %lu inode (%u objects, %u reclaimable)\n" " %llu us max scan time\n", - max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, + max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_shk_nr, div_u64(es_stats->es_stats_max_scan_time, 1000)); return 0; @@ -1181,9 +1180,11 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) { int err; - INIT_LIST_HEAD(&sbi->s_es_lru); - spin_lock_init(&sbi->s_es_lru_lock); - sbi->s_es_stats.es_stats_last_sorted = 0; + /* Make sure we have enough bits for physical block number */ + BUILD_BUG_ON(ES_SHIFT < 48); + INIT_LIST_HEAD(&sbi->s_es_list); + sbi->s_es_nr_inode = 0; + spin_lock_init(&sbi->s_es_lock); sbi->s_es_stats.es_stats_shrunk = 0; sbi->s_es_stats.es_stats_cache_hits = 0; sbi->s_es_stats.es_stats_cache_misses = 0; @@ -1192,7 +1193,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); if (err) return err; - err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0, GFP_KERNEL); + err = percpu_counter_init(&sbi->s_es_stats.es_stats_shk_cnt, 0, GFP_KERNEL); if (err) goto err1; @@ -1210,7 +1211,7 @@ int ext4_es_register_shrinker(struct ext4_sb_info *sbi) return 0; err2: - percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); err1: percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); return err; @@ -1221,71 +1222,83 @@ void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) if (sbi->s_proc) remove_proc_entry("es_shrinker_info", sbi->s_proc); percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); - percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_shk_cnt); unregister_shrinker(&sbi->s_es_shrinker); } -void ext4_es_lru_add(struct inode *inode) +/* + * Shrink extents in given inode from ei->i_es_shrink_lblk till end. Scan at + * most *nr_to_scan extents, update *nr_to_scan accordingly. + * + * Return 0 if we hit end of tree / interval, 1 if we exhausted nr_to_scan. + * Increment *nr_shrunk by the number of reclaimed extents. Also update + * ei->i_es_shrink_lblk to where we should continue scanning. + */ +static int es_do_reclaim_extents(struct ext4_inode_info *ei, ext4_lblk_t end, + int *nr_to_scan, int *nr_shrunk) { - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); - - ei->i_touch_when = jiffies; - - if (!list_empty(&ei->i_es_lru)) - return; + struct inode *inode = &ei->vfs_inode; + struct ext4_es_tree *tree = &ei->i_es_tree; + struct extent_status *es; + struct rb_node *node; - spin_lock(&sbi->s_es_lru_lock); - if (list_empty(&ei->i_es_lru)) - list_add_tail(&ei->i_es_lru, &sbi->s_es_lru); - spin_unlock(&sbi->s_es_lru_lock); -} + es = __es_tree_search(&tree->root, ei->i_es_shrink_lblk); + if (!es) + goto out_wrap; + node = &es->rb_node; + while (*nr_to_scan > 0) { + if (es->es_lblk > end) { + ei->i_es_shrink_lblk = end + 1; + return 0; + } -void ext4_es_lru_del(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + (*nr_to_scan)--; + node = rb_next(&es->rb_node); + /* + * We can't reclaim delayed extent from status tree because + * fiemap, bigallic, and seek_data/hole need to use it. + */ + if (ext4_es_is_delayed(es)) + goto next; + if (ext4_es_is_referenced(es)) { + ext4_es_clear_referenced(es); + goto next; + } - spin_lock(&sbi->s_es_lru_lock); - if (!list_empty(&ei->i_es_lru)) - list_del_init(&ei->i_es_lru); - spin_unlock(&sbi->s_es_lru_lock); + rb_erase(&es->rb_node, &tree->root); + ext4_es_free_extent(inode, es); + (*nr_shrunk)++; +next: + if (!node) + goto out_wrap; + es = rb_entry(node, struct extent_status, rb_node); + } + ei->i_es_shrink_lblk = es->es_lblk; + return 1; +out_wrap: + ei->i_es_shrink_lblk = 0; + return 0; } -static int __es_try_to_reclaim_extents(struct ext4_inode_info *ei, - int nr_to_scan) +static int es_reclaim_extents(struct ext4_inode_info *ei, int *nr_to_scan) { struct inode *inode = &ei->vfs_inode; - struct ext4_es_tree *tree = &ei->i_es_tree; - struct rb_node *node; - struct extent_status *es; - unsigned long nr_shrunk = 0; + int nr_shrunk = 0; + ext4_lblk_t start = ei->i_es_shrink_lblk; static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - if (ei->i_es_lru_nr == 0) + if (ei->i_es_shk_nr == 0) return 0; if (ext4_test_inode_state(inode, EXT4_STATE_EXT_PRECACHED) && __ratelimit(&_rs)) ext4_warning(inode->i_sb, "forced shrink of precached extents"); - node = rb_first(&tree->root); - while (node != NULL) { - es = rb_entry(node, struct extent_status, rb_node); - node = rb_next(&es->rb_node); - /* - * We can't reclaim delayed extent from status tree because - * fiemap, bigallic, and seek_data/hole need to use it. - */ - if (!ext4_es_is_delayed(es)) { - rb_erase(&es->rb_node, &tree->root); - ext4_es_free_extent(inode, es); - nr_shrunk++; - if (--nr_to_scan == 0) - break; - } - } - tree->cache_es = NULL; + if (!es_do_reclaim_extents(ei, EXT_MAX_BLOCKS, nr_to_scan, &nr_shrunk) && + start != 0) + es_do_reclaim_extents(ei, start - 1, nr_to_scan, &nr_shrunk); + + ei->i_es_tree.cache_es = NULL; return nr_shrunk; } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index efd5f970b501..691b52613ce4 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -29,25 +29,28 @@ /* * These flags live in the high bits of extent_status.es_pblk */ -#define ES_SHIFT 60 - -#define EXTENT_STATUS_WRITTEN (1 << 3) -#define EXTENT_STATUS_UNWRITTEN (1 << 2) -#define EXTENT_STATUS_DELAYED (1 << 1) -#define EXTENT_STATUS_HOLE (1 << 0) +enum { + ES_WRITTEN_B, + ES_UNWRITTEN_B, + ES_DELAYED_B, + ES_HOLE_B, + ES_REFERENCED_B, + ES_FLAGS +}; -#define EXTENT_STATUS_FLAGS (EXTENT_STATUS_WRITTEN | \ - EXTENT_STATUS_UNWRITTEN | \ - EXTENT_STATUS_DELAYED | \ - EXTENT_STATUS_HOLE) +#define ES_SHIFT (sizeof(ext4_fsblk_t)*8 - ES_FLAGS) +#define ES_MASK (~((ext4_fsblk_t)0) << ES_SHIFT) -#define ES_WRITTEN (1ULL << 63) -#define ES_UNWRITTEN (1ULL << 62) -#define ES_DELAYED (1ULL << 61) -#define ES_HOLE (1ULL << 60) +#define EXTENT_STATUS_WRITTEN (1 << ES_WRITTEN_B) +#define EXTENT_STATUS_UNWRITTEN (1 << ES_UNWRITTEN_B) +#define EXTENT_STATUS_DELAYED (1 << ES_DELAYED_B) +#define EXTENT_STATUS_HOLE (1 << ES_HOLE_B) +#define EXTENT_STATUS_REFERENCED (1 << ES_REFERENCED_B) -#define ES_MASK (ES_WRITTEN | ES_UNWRITTEN | \ - ES_DELAYED | ES_HOLE) +#define ES_TYPE_MASK ((ext4_fsblk_t)(EXTENT_STATUS_WRITTEN | \ + EXTENT_STATUS_UNWRITTEN | \ + EXTENT_STATUS_DELAYED | \ + EXTENT_STATUS_HOLE) << ES_SHIFT) struct ext4_sb_info; struct ext4_extent; @@ -65,14 +68,13 @@ struct ext4_es_tree { }; struct ext4_es_stats { - unsigned long es_stats_last_sorted; unsigned long es_stats_shrunk; unsigned long es_stats_cache_hits; unsigned long es_stats_cache_misses; u64 es_stats_scan_time; u64 es_stats_max_scan_time; struct percpu_counter es_stats_all_cnt; - struct percpu_counter es_stats_lru_cnt; + struct percpu_counter es_stats_shk_cnt; }; extern int __init ext4_init_es(void); @@ -93,29 +95,49 @@ extern void ext4_es_find_delayed_extent_range(struct inode *inode, extern int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es); +static inline unsigned int ext4_es_status(struct extent_status *es) +{ + return es->es_pblk >> ES_SHIFT; +} + +static inline unsigned int ext4_es_type(struct extent_status *es) +{ + return (es->es_pblk & ES_TYPE_MASK) >> ES_SHIFT; +} + static inline int ext4_es_is_written(struct extent_status *es) { - return (es->es_pblk & ES_WRITTEN) != 0; + return (ext4_es_type(es) & EXTENT_STATUS_WRITTEN) != 0; } static inline int ext4_es_is_unwritten(struct extent_status *es) { - return (es->es_pblk & ES_UNWRITTEN) != 0; + return (ext4_es_type(es) & EXTENT_STATUS_UNWRITTEN) != 0; } static inline int ext4_es_is_delayed(struct extent_status *es) { - return (es->es_pblk & ES_DELAYED) != 0; + return (ext4_es_type(es) & EXTENT_STATUS_DELAYED) != 0; } static inline int ext4_es_is_hole(struct extent_status *es) { - return (es->es_pblk & ES_HOLE) != 0; + return (ext4_es_type(es) & EXTENT_STATUS_HOLE) != 0; } -static inline unsigned int ext4_es_status(struct extent_status *es) +static inline void ext4_es_set_referenced(struct extent_status *es) { - return es->es_pblk >> ES_SHIFT; + es->es_pblk |= ((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT; +} + +static inline void ext4_es_clear_referenced(struct extent_status *es) +{ + es->es_pblk &= ~(((ext4_fsblk_t)EXTENT_STATUS_REFERENCED) << ES_SHIFT); +} + +static inline int ext4_es_is_referenced(struct extent_status *es) +{ + return (ext4_es_status(es) & EXTENT_STATUS_REFERENCED) != 0; } static inline ext4_fsblk_t ext4_es_pblock(struct extent_status *es) @@ -135,23 +157,19 @@ static inline void ext4_es_store_pblock(struct extent_status *es, static inline void ext4_es_store_status(struct extent_status *es, unsigned int status) { - es->es_pblk = (((ext4_fsblk_t) - (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | - (es->es_pblk & ~ES_MASK)); + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (es->es_pblk & ~ES_MASK); } static inline void ext4_es_store_pblock_status(struct extent_status *es, ext4_fsblk_t pb, unsigned int status) { - es->es_pblk = (((ext4_fsblk_t) - (status & EXTENT_STATUS_FLAGS) << ES_SHIFT) | - (pb & ~ES_MASK)); + es->es_pblk = (((ext4_fsblk_t)status << ES_SHIFT) & ES_MASK) | + (pb & ~ES_MASK); } extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); -extern void ext4_es_lru_add(struct inode *inode); -extern void ext4_es_lru_del(struct inode *inode); #endif /* _EXT4_EXTENTS_STATUS_H */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 8131be8c0af3..33a09da16c9c 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -95,7 +95,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file_inode(iocb->ki_filp); struct mutex *aio_mutex = NULL; struct blk_plug plug; - int o_direct = file->f_flags & O_DIRECT; + int o_direct = io_is_direct(file); int overwrite = 0; size_t length = iov_iter_count(from); ssize_t ret; @@ -191,17 +191,41 @@ errout: return ret; } +#ifdef CONFIG_FS_DAX +static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_fault(vma, vmf, ext4_get_block); + /* Is this the right get_block? */ +} + +static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + return dax_mkwrite(vma, vmf, ext4_get_block); +} + +static const struct vm_operations_struct ext4_dax_vm_ops = { + .fault = ext4_dax_fault, + .page_mkwrite = ext4_dax_mkwrite, +}; +#else +#define ext4_dax_vm_ops ext4_file_vm_ops +#endif + static const struct vm_operations_struct ext4_file_vm_ops = { .fault = filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, - .remap_pages = generic_file_remap_pages, }; static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { file_accessed(file); - vma->vm_ops = &ext4_file_vm_ops; + if (IS_DAX(file_inode(file))) { + vma->vm_ops = &ext4_dax_vm_ops; + vma->vm_flags |= VM_MIXEDMAP; + } else { + vma->vm_ops = &ext4_file_vm_ops; + } return 0; } @@ -600,6 +624,26 @@ const struct file_operations ext4_file_operations = { .fallocate = ext4_fallocate, }; +#ifdef CONFIG_FS_DAX +const struct file_operations ext4_dax_file_operations = { + .llseek = ext4_llseek, + .read = new_sync_read, + .write = new_sync_write, + .read_iter = generic_file_read_iter, + .write_iter = ext4_file_write_iter, + .unlocked_ioctl = ext4_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = ext4_compat_ioctl, +#endif + .mmap = ext4_file_mmap, + .open = ext4_file_open, + .release = ext4_release_file, + .fsync = ext4_sync_file, + /* Splice not yet supported with DAX */ + .fallocate = ext4_fallocate, +}; +#endif + const struct inode_operations ext4_file_inode_operations = { .setattr = ext4_setattr, .getattr = ext4_getattr, diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 36b369697a13..45fe924f82bc 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -689,14 +689,22 @@ retry: inode_dio_done(inode); goto locked; } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iter, offset, - ext4_get_block, NULL, NULL, 0); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, + ext4_get_block, NULL, 0); + else + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, + ext4_get_block, NULL, NULL, 0); inode_dio_done(inode); } else { locked: - ret = blockdev_direct_IO(rw, iocb, inode, iter, - offset, ext4_get_block); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, + ext4_get_block, NULL, DIO_LOCKING); + else + ret = blockdev_direct_IO(rw, iocb, inode, iter, + offset, ext4_get_block); if (unlikely((rw & WRITE) && ret < 0)) { loff_t isize = i_size_read(inode); @@ -1393,10 +1401,7 @@ end_range: * to free. Everything was covered by the start * of the range. */ - return 0; - } else { - /* Shared branch grows from an indirect block */ - partial2--; + goto do_indirects; } } else { /* @@ -1427,56 +1432,96 @@ end_range: /* Punch happened within the same level (n == n2) */ partial = ext4_find_shared(inode, n, offsets, chain, &nr); partial2 = ext4_find_shared(inode, n2, offsets2, chain2, &nr2); - /* - * ext4_find_shared returns Indirect structure which - * points to the last element which should not be - * removed by truncate. But this is end of the range - * in punch_hole so we need to point to the next element - */ - partial2->p++; - while ((partial > chain) || (partial2 > chain2)) { - /* We're at the same block, so we're almost finished */ - if ((partial->bh && partial2->bh) && - (partial->bh->b_blocknr == partial2->bh->b_blocknr)) { - if ((partial > chain) && (partial2 > chain2)) { + + /* Free top, but only if partial2 isn't its subtree. */ + if (nr) { + int level = min(partial - chain, partial2 - chain2); + int i; + int subtree = 1; + + for (i = 0; i <= level; i++) { + if (offsets[i] != offsets2[i]) { + subtree = 0; + break; + } + } + + if (!subtree) { + if (partial == chain) { + /* Shared branch grows from the inode */ + ext4_free_branches(handle, inode, NULL, + &nr, &nr+1, + (chain+n-1) - partial); + *partial->p = 0; + } else { + /* Shared branch grows from an indirect block */ + BUFFER_TRACE(partial->bh, "get_write_access"); ext4_free_branches(handle, inode, partial->bh, - partial->p + 1, - partial2->p, + partial->p, + partial->p+1, (chain+n-1) - partial); - BUFFER_TRACE(partial->bh, "call brelse"); - brelse(partial->bh); - BUFFER_TRACE(partial2->bh, "call brelse"); - brelse(partial2->bh); } - return 0; } + } + + if (!nr2) { /* - * Clear the ends of indirect blocks on the shared branch - * at the start of the range + * ext4_find_shared returns Indirect structure which + * points to the last element which should not be + * removed by truncate. But this is end of the range + * in punch_hole so we need to point to the next element */ - if (partial > chain) { + partial2->p++; + } + + while (partial > chain || partial2 > chain2) { + int depth = (chain+n-1) - partial; + int depth2 = (chain2+n2-1) - partial2; + + if (partial > chain && partial2 > chain2 && + partial->bh->b_blocknr == partial2->bh->b_blocknr) { + /* + * We've converged on the same block. Clear the range, + * then we're done. + */ ext4_free_branches(handle, inode, partial->bh, - partial->p + 1, - (__le32 *)partial->bh->b_data+addr_per_block, - (chain+n-1) - partial); + partial->p + 1, + partial2->p, + (chain+n-1) - partial); BUFFER_TRACE(partial->bh, "call brelse"); brelse(partial->bh); - partial--; + BUFFER_TRACE(partial2->bh, "call brelse"); + brelse(partial2->bh); + return 0; } + /* - * Clear the ends of indirect blocks on the shared branch - * at the end of the range + * The start and end partial branches may not be at the same + * level even though the punch happened within one level. So, we + * give them a chance to arrive at the same level, then walk + * them in step with each other until we converge on the same + * block. */ - if (partial2 > chain2) { + if (partial > chain && depth <= depth2) { + ext4_free_branches(handle, inode, partial->bh, + partial->p + 1, + (__le32 *)partial->bh->b_data+addr_per_block, + (chain+n-1) - partial); + BUFFER_TRACE(partial->bh, "call brelse"); + brelse(partial->bh); + partial--; + } + if (partial2 > chain2 && depth2 <= depth) { ext4_free_branches(handle, inode, partial2->bh, (__le32 *)partial2->bh->b_data, partial2->p, - (chain2+n-1) - partial2); + (chain2+n2-1) - partial2); BUFFER_TRACE(partial2->bh, "call brelse"); brelse(partial2->bh); partial2--; } } + return 0; do_indirects: /* Kill the remaining (whole) subtrees */ diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3ea62695abce..4b143febf21f 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -811,8 +811,11 @@ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, ret = __block_write_begin(page, 0, inline_size, ext4_da_get_block_prep); if (ret) { + up_read(&EXT4_I(inode)->xattr_sem); + unlock_page(page); + page_cache_release(page); ext4_truncate_failed_write(inode); - goto out; + return ret; } SetPageDirty(page); @@ -870,6 +873,12 @@ retry_journal: goto out_journal; } + /* + * We cannot recurse into the filesystem as the transaction + * is already started. + */ + flags |= AOP_FLAG_NOFS; + if (ret == -ENOSPC) { ret = ext4_da_convert_inline_data_to_extent(mapping, inode, @@ -882,11 +891,6 @@ retry_journal: goto out; } - /* - * We cannot recurse into the filesystem as the transaction - * is already started. - */ - flags |= AOP_FLAG_NOFS; page = grab_cache_page_write_begin(mapping, 0, flags); if (!page) { @@ -1807,11 +1811,12 @@ int ext4_destroy_inline_data(handle_t *handle, struct inode *inode) int ext4_inline_data_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, - int *has_inline) + int *has_inline, __u64 start, __u64 len) { __u64 physical = 0; - __u64 length; - __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_LAST; + __u64 inline_len; + __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED | + FIEMAP_EXTENT_LAST; int error = 0; struct ext4_iloc iloc; @@ -1820,6 +1825,13 @@ int ext4_inline_data_fiemap(struct inode *inode, *has_inline = 0; goto out; } + inline_len = min_t(size_t, ext4_get_inline_size(inode), + i_size_read(inode)); + if (start >= inline_len) + goto out; + if (start + len < inline_len) + inline_len = start + len; + inline_len -= start; error = ext4_get_inode_loc(inode, &iloc); if (error) @@ -1828,11 +1840,10 @@ int ext4_inline_data_fiemap(struct inode *inode, physical = (__u64)iloc.bh->b_blocknr << inode->i_sb->s_blocksize_bits; physical += (char *)ext4_raw_inode(&iloc) - iloc.bh->b_data; physical += offsetof(struct ext4_inode, i_block); - length = i_size_read(inode); if (physical) - error = fiemap_fill_next_extent(fieinfo, 0, physical, - length, flags); + error = fiemap_fill_next_extent(fieinfo, start, physical, + inline_len, flags); brelse(iloc.bh); out: up_read(&EXT4_I(inode)->xattr_sem); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3356ab5395f4..5cb9a212b86f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -416,11 +416,6 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, } if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) up_read((&EXT4_I(inode)->i_data_sem)); - /* - * Clear EXT4_MAP_FROM_CLUSTER and EXT4_MAP_BOUNDARY flag - * because it shouldn't be marked in es_map->m_flags. - */ - map->m_flags &= ~(EXT4_MAP_FROM_CLUSTER | EXT4_MAP_BOUNDARY); /* * We don't check m_len because extent will be collpased in status @@ -491,7 +486,6 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, &es)) { - ext4_es_lru_add(inode); if (ext4_es_is_written(&es) || ext4_es_is_unwritten(&es)) { map->m_pblk = ext4_es_pblock(&es) + map->m_lblk - es.es_lblk; @@ -663,6 +657,18 @@ has_zeroout: return retval; } +static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) +{ + struct inode *inode = bh->b_assoc_map->host; + /* XXX: breaks on 32-bit > 16GB. Is that even supported? */ + loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; + int err; + if (!uptodate) + return; + WARN_ON(!buffer_unwritten(bh)); + err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); +} + /* Maximum number of blocks we map for direct IO at once. */ #define DIO_MAX_BLOCKS 4096 @@ -700,6 +706,11 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; + if (IS_DAX(inode) && buffer_unwritten(bh) && !io_end) { + bh->b_assoc_map = inode->i_mapping; + bh->b_private = (void *)(unsigned long)iblock; + bh->b_end_io = ext4_end_io_unwritten; + } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; @@ -1013,6 +1024,7 @@ static int ext4_write_end(struct file *file, { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; int ret = 0, ret2; int i_size_changed = 0; @@ -1043,6 +1055,8 @@ static int ext4_write_end(struct file *file, unlock_page(page); page_cache_release(page); + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. Second, it forces lock @@ -1084,6 +1098,7 @@ static int ext4_journalled_write_end(struct file *file, { handle_t *handle = ext4_journal_current_handle(); struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; int ret = 0, ret2; int partial = 0; unsigned from, to; @@ -1116,6 +1131,9 @@ static int ext4_journalled_write_end(struct file *file, unlock_page(page); page_cache_release(page); + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); + if (size_changed) { ret2 = ext4_mark_inode_dirty(handle, inode); if (!ret) @@ -1393,7 +1411,6 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, iblock, &es)) { - ext4_es_lru_add(inode); if (ext4_es_is_hole(&es)) { retval = 0; down_read(&EXT4_I(inode)->i_data_sem); @@ -1434,24 +1451,12 @@ static int ext4_da_map_blocks(struct inode *inode, sector_t iblock, * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); - if (ext4_has_inline_data(inode)) { - /* - * We will soon create blocks for this page, and let - * us pretend as if the blocks aren't allocated yet. - * In case of clusters, we have to handle the work - * of mapping from cluster so that the reserved space - * is calculated properly. - */ - if ((EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) && - ext4_find_delalloc_cluster(inode, map->m_lblk)) - map->m_flags |= EXT4_MAP_FROM_CLUSTER; + if (ext4_has_inline_data(inode)) retval = 0; - } else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(NULL, inode, map, - EXT4_GET_BLOCKS_NO_PUT_HOLE); + else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + retval = ext4_ext_map_blocks(NULL, inode, map, 0); else - retval = ext4_ind_map_blocks(NULL, inode, map, - EXT4_GET_BLOCKS_NO_PUT_HOLE); + retval = ext4_ind_map_blocks(NULL, inode, map, 0); add_delayed: if (retval == 0) { @@ -1465,7 +1470,8 @@ add_delayed: * then we don't need to reserve it again. However we still need * to reserve metadata for every block we're going to write. */ - if (!(map->m_flags & EXT4_MAP_FROM_CLUSTER)) { + if (EXT4_SB(inode->i_sb)->s_cluster_ratio <= 1 || + !ext4_find_delalloc_cluster(inode, map->m_lblk)) { ret = ext4_da_reserve_space(inode, iblock); if (ret) { /* not enough space to reserve */ @@ -1481,11 +1487,6 @@ add_delayed: goto out_unlock; } - /* Clear EXT4_MAP_FROM_CLUSTER flag since its purpose is served - * and it should not appear on the bh->b_state. - */ - map->m_flags &= ~EXT4_MAP_FROM_CLUSTER; - map_bh(bh, inode->i_sb, invalid_block); set_buffer_new(bh); set_buffer_delay(bh); @@ -3033,13 +3034,14 @@ static ssize_t ext4_ext_direct_IO(int rw, struct kiocb *iocb, get_block_func = ext4_get_block_write; dio_flags = DIO_LOCKING; } - ret = __blockdev_direct_IO(rw, iocb, inode, - inode->i_sb->s_bdev, iter, - offset, - get_block_func, - ext4_end_io_dio, - NULL, - dio_flags); + if (IS_DAX(inode)) + ret = dax_do_io(rw, iocb, inode, iter, offset, get_block_func, + ext4_end_io_dio, dio_flags); + else + ret = __blockdev_direct_IO(rw, iocb, inode, + inode->i_sb->s_bdev, iter, offset, + get_block_func, + ext4_end_io_dio, NULL, dio_flags); /* * Put our reference to io_end. This can free the io_end structure e.g. @@ -3203,19 +3205,12 @@ void ext4_set_aops(struct inode *inode) inode->i_mapping->a_ops = &ext4_aops; } -/* - * ext4_block_zero_page_range() zeros out a mapping of length 'length' - * starting from file offset 'from'. The range to be zero'd must - * be contained with in one block. If the specified range exceeds - * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' - */ -static int ext4_block_zero_page_range(handle_t *handle, +static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { ext4_fsblk_t index = from >> PAGE_CACHE_SHIFT; unsigned offset = from & (PAGE_CACHE_SIZE-1); - unsigned blocksize, max, pos; + unsigned blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3228,14 +3223,6 @@ static int ext4_block_zero_page_range(handle_t *handle, return -ENOMEM; blocksize = inode->i_sb->s_blocksize; - max = blocksize - (offset & (blocksize - 1)); - - /* - * correct length if it does not fall between - * 'from' and the end of the block - */ - if (length > max || length < 0) - length = max; iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits); @@ -3301,6 +3288,33 @@ unlock: } /* + * ext4_block_zero_page_range() zeros out a mapping of length 'length' + * starting from file offset 'from'. The range to be zero'd must + * be contained with in one block. If the specified range exceeds + * the end of the block it will be shortened to end of the block + * that cooresponds to 'from' + */ +static int ext4_block_zero_page_range(handle_t *handle, + struct address_space *mapping, loff_t from, loff_t length) +{ + struct inode *inode = mapping->host; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + unsigned blocksize = inode->i_sb->s_blocksize; + unsigned max = blocksize - (offset & (blocksize - 1)); + + /* + * correct length if it does not fall between + * 'from' and the end of the block + */ + if (length > max || length < 0) + length = max; + + if (IS_DAX(inode)) + return dax_zero_page_range(inode, from, length, ext4_get_block); + return __ext4_block_zero_page_range(handle, mapping, from, length); +} + +/* * ext4_block_truncate_page() zeroes out a mapping from file offset `from' * up to the end of the block which corresponds to `from'. * This required during truncate. We need to physically zero the tail end @@ -3643,7 +3657,7 @@ out_stop: * If this was a simple ftruncate() and the file will remain alive, * then we need to clear up the orphan record which we created above. * However, if this was a real unlink then we were called by - * ext4_delete_inode(), and we allow that function to clean up the + * ext4_evict_inode(), and we allow that function to clean up the * orphan info for us. */ if (inode->i_nlink) @@ -3821,8 +3835,10 @@ void ext4_set_inode_flags(struct inode *inode) new_fl |= S_NOATIME; if (flags & EXT4_DIRSYNC_FL) new_fl |= S_DIRSYNC; + if (test_opt(inode->i_sb, DAX)) + new_fl |= S_DAX; inode_set_flags(inode, new_fl, - S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX); } /* Propagate flags from i_flags to EXT4_I(inode)->i_flags */ @@ -4075,7 +4091,10 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) if (S_ISREG(inode->i_mode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); } else if (S_ISDIR(inode->i_mode)) { inode->i_op = &ext4_dir_inode_operations; @@ -4162,6 +4181,65 @@ static int ext4_inode_blocks_set(handle_t *handle, return 0; } +struct other_inode { + unsigned long orig_ino; + struct ext4_inode *raw_inode; +}; + +static int other_inode_match(struct inode * inode, unsigned long ino, + void *data) +{ + struct other_inode *oi = (struct other_inode *) data; + + if ((inode->i_ino != ino) || + (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | + I_DIRTY_SYNC | I_DIRTY_DATASYNC)) || + ((inode->i_state & I_DIRTY_TIME) == 0)) + return 0; + spin_lock(&inode->i_lock); + if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW | + I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) && + (inode->i_state & I_DIRTY_TIME)) { + struct ext4_inode_info *ei = EXT4_I(inode); + + inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED); + spin_unlock(&inode->i_lock); + + spin_lock(&ei->i_raw_lock); + EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode); + EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode); + EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode); + ext4_inode_csum_set(inode, oi->raw_inode, ei); + spin_unlock(&ei->i_raw_lock); + trace_ext4_other_inode_update_time(inode, oi->orig_ino); + return -1; + } + spin_unlock(&inode->i_lock); + return -1; +} + +/* + * Opportunistically update the other time fields for other inodes in + * the same inode table block. + */ +static void ext4_update_other_inodes_time(struct super_block *sb, + unsigned long orig_ino, char *buf) +{ + struct other_inode oi; + unsigned long ino; + int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block; + int inode_size = EXT4_INODE_SIZE(sb); + + oi.orig_ino = orig_ino; + ino = orig_ino & ~(inodes_per_block - 1); + for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) { + if (ino == orig_ino) + continue; + oi.raw_inode = (struct ext4_inode *) buf; + (void) find_inode_nowait(sb, ino, other_inode_match, &oi); + } +} + /* * Post the struct inode info into an on-disk inode location in the * buffer-cache. This gobbles the caller's reference to the @@ -4271,10 +4349,11 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le16(ei->i_extra_isize); } } - ext4_inode_csum_set(inode, raw_inode, ei); - spin_unlock(&ei->i_raw_lock); + if (inode->i_sb->s_flags & MS_LAZYTIME) + ext4_update_other_inodes_time(inode->i_sb, inode->i_ino, + bh->b_data); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); rc = ext4_handle_dirty_metadata(handle, NULL, bh); @@ -4557,7 +4636,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. */ - truncate_pagecache(inode, inode->i_size); + truncate_pagecache(inode, inode->i_size); } /* * We want to call ext4_truncate() even if attr->ia_size == @@ -4863,11 +4942,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode) * If the inode is marked synchronous, we don't honour that here - doing * so would cause a commit on atime updates, which we don't bother doing. * We handle synchronous inodes at the highest possible level. + * + * If only the I_DIRTY_TIME flag is set, we can skip everything. If + * I_DIRTY_TIME and I_DIRTY_SYNC is set, the only inode fields we need + * to copy into the on-disk inode structure are the timestamp files. */ void ext4_dirty_inode(struct inode *inode, int flags) { handle_t *handle; + if (flags == I_DIRTY_TIME) + return; handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) goto out; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index bfda18a15592..f58a0d106726 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -78,8 +78,6 @@ static void swap_inode_data(struct inode *inode1, struct inode *inode2) memswap(&ei1->i_disksize, &ei2->i_disksize, sizeof(ei1->i_disksize)); ext4_es_remove_extent(inode1, 0, EXT_MAX_BLOCKS); ext4_es_remove_extent(inode2, 0, EXT_MAX_BLOCKS); - ext4_es_lru_del(inode1); - ext4_es_lru_del(inode2); isize = i_size_read(inode1); i_size_write(inode1, i_size_read(inode2)); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dbfe15c2533c..8d1e60214ef0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2358,7 +2358,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) if (sbi->s_group_info) { memcpy(new_groupinfo, sbi->s_group_info, sbi->s_group_info_size * sizeof(*sbi->s_group_info)); - ext4_kvfree(sbi->s_group_info); + kvfree(sbi->s_group_info); } sbi->s_group_info = new_groupinfo; sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); @@ -2385,7 +2385,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, if (group % EXT4_DESC_PER_BLOCK(sb) == 0) { metalen = sizeof(*meta_group_info) << EXT4_DESC_PER_BLOCK_BITS(sb); - meta_group_info = kmalloc(metalen, GFP_KERNEL); + meta_group_info = kmalloc(metalen, GFP_NOFS); if (meta_group_info == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate mem " "for a buddy group"); @@ -2399,7 +2399,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, sbi->s_group_info[group >> EXT4_DESC_PER_BLOCK_BITS(sb)]; i = group & (EXT4_DESC_PER_BLOCK(sb) - 1); - meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_KERNEL); + meta_group_info[i] = kmem_cache_zalloc(cachep, GFP_NOFS); if (meta_group_info[i] == NULL) { ext4_msg(sb, KERN_ERR, "can't allocate buddy mem"); goto exit_group_info; @@ -2428,7 +2428,7 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, { struct buffer_head *bh; meta_group_info[i]->bb_bitmap = - kmalloc(sb->s_blocksize, GFP_KERNEL); + kmalloc(sb->s_blocksize, GFP_NOFS); BUG_ON(meta_group_info[i]->bb_bitmap == NULL); bh = ext4_read_block_bitmap(sb, group); BUG_ON(bh == NULL); @@ -2495,7 +2495,7 @@ err_freebuddy: kfree(sbi->s_group_info[i]); iput(sbi->s_buddy_cache); err_freesgi: - ext4_kvfree(sbi->s_group_info); + kvfree(sbi->s_group_info); return -ENOMEM; } @@ -2708,12 +2708,11 @@ int ext4_mb_release(struct super_block *sb) EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) kfree(sbi->s_group_info[i]); - ext4_kvfree(sbi->s_group_info); + kvfree(sbi->s_group_info); } kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); - if (sbi->s_buddy_cache) - iput(sbi->s_buddy_cache); + iput(sbi->s_buddy_cache); if (sbi->s_mb_stats) { ext4_msg(sb, KERN_INFO, "mballoc: %u blocks %u reqs (%u success)", diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index a432634f2e6a..3cb267aee802 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -592,7 +592,7 @@ err_out: /* * set the i_blocks count to zero - * so that the ext4_delete_inode does the + * so that the ext4_evict_inode() does the * right job * * We don't need to take the i_lock because diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 9f2311bc9c4f..370420bfae8d 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -267,12 +267,12 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, handle_t *handle; ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; - unsigned int w_flags = 0; unsigned int tmp_data_size, data_size, replaced_size; int err2, jblocks, retries = 0; int replaced_count = 0; int from = data_offset_in_page << orig_inode->i_blkbits; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; + struct super_block *sb = orig_inode->i_sb; /* * It needs twice the amount of ordinary journal buffers because @@ -287,9 +287,6 @@ again: return 0; } - if (segment_eq(get_fs(), KERNEL_DS)) - w_flags |= AOP_FLAG_UNINTERRUPTIBLE; - orig_blk_offset = orig_page_offset * blocks_per_page + data_offset_in_page; @@ -405,10 +402,13 @@ unlock_pages: page_cache_release(pagep[1]); stop_journal: ext4_journal_stop(handle); + if (*err == -ENOSPC && + ext4_should_retry_alloc(sb, &retries)) + goto again; /* Buffer was busy because probably is pinned to journal transaction, * force transaction commit may help to free it. */ - if (*err == -EBUSY && ext4_should_retry_alloc(orig_inode->i_sb, - &retries)) + if (*err == -EBUSY && retries++ < 4 && EXT4_SB(sb)->s_journal && + jbd2_journal_force_commit_nested(EXT4_SB(sb)->s_journal)) goto again; return replaced_count; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 426211882f72..28fe71a2904c 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2235,7 +2235,10 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); err = ext4_add_nondir(handle, dentry, inode); if (!err && IS_DIRSYNC(dir)) @@ -2299,7 +2302,10 @@ retry: err = PTR_ERR(inode); if (!IS_ERR(inode)) { inode->i_op = &ext4_file_inode_operations; - inode->i_fop = &ext4_file_operations; + if (test_opt(inode->i_sb, DAX)) + inode->i_fop = &ext4_dax_file_operations; + else + inode->i_fop = &ext4_file_operations; ext4_set_aops(inode); d_tmpfile(dentry, inode); err = ext4_orphan_add(handle, inode); @@ -2814,7 +2820,6 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) ext4_orphan_add(handle, inode); inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); - retval = 0; end_unlink: brelse(bh); diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index ca4588388fc3..8a8ec6293b19 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -24,6 +24,18 @@ int ext4_resize_begin(struct super_block *sb) return -EPERM; /* + * If we are not using the primary superblock/GDT copy don't resize, + * because the user tools have no way of handling this. Probably a + * bad time to do it anyways. + */ + if (EXT4_SB(sb)->s_sbh->b_blocknr != + le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { + ext4_warning(sb, "won't resize using backup superblock at %llu", + (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); + return -EPERM; + } + + /* * We are not allowed to do online-resizing on a filesystem mounted * with error, because it can destroy the filesystem easily. */ @@ -758,18 +770,6 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, "EXT4-fs: ext4_add_new_gdb: adding group block %lu\n", gdb_num); - /* - * If we are not using the primary superblock/GDT copy don't resize, - * because the user tools have no way of handling this. Probably a - * bad time to do it anyways. - */ - if (EXT4_SB(sb)->s_sbh->b_blocknr != - le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block)) { - ext4_warning(sb, "won't resize using backup superblock at %llu", - (unsigned long long)EXT4_SB(sb)->s_sbh->b_blocknr); - return -EPERM; - } - gdb_bh = sb_bread(sb, gdblock); if (!gdb_bh) return -EIO; @@ -856,7 +856,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, n_group_desc[gdb_num] = gdb_bh; EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; - ext4_kvfree(o_group_desc); + kvfree(o_group_desc); le16_add_cpu(&es->s_reserved_gdt_blocks, -1); err = ext4_handle_dirty_super(handle, sb); @@ -866,7 +866,7 @@ static int add_new_gdb(handle_t *handle, struct inode *inode, return err; exit_inode: - ext4_kvfree(n_group_desc); + kvfree(n_group_desc); brelse(iloc.bh); exit_dind: brelse(dind); @@ -909,7 +909,7 @@ static int add_new_gdb_meta_bg(struct super_block *sb, n_group_desc[gdb_num] = gdb_bh; EXT4_SB(sb)->s_group_desc = n_group_desc; EXT4_SB(sb)->s_gdb_count++; - ext4_kvfree(o_group_desc); + kvfree(o_group_desc); BUFFER_TRACE(gdb_bh, "get_write_access"); err = ext4_journal_get_write_access(handle, gdb_bh); if (unlikely(err)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2c9e6864abd9..e061e66c8280 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -176,15 +176,6 @@ void *ext4_kvzalloc(size_t size, gfp_t flags) return ret; } -void ext4_kvfree(void *ptr) -{ - if (is_vmalloc_addr(ptr)) - vfree(ptr); - else - kfree(ptr); - -} - ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, struct ext4_group_desc *bg) { @@ -343,7 +334,7 @@ static void save_error_info(struct super_block *sb, const char *func, static int block_device_ejected(struct super_block *sb) { struct inode *bd_inode = sb->s_bdev->bd_inode; - struct backing_dev_info *bdi = bd_inode->i_mapping->backing_dev_info; + struct backing_dev_info *bdi = inode_to_bdi(bd_inode); return bdi->dev == NULL; } @@ -811,8 +802,8 @@ static void ext4_put_super(struct super_block *sb) for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); - ext4_kvfree(sbi->s_group_desc); - ext4_kvfree(sbi->s_flex_groups); + kvfree(sbi->s_group_desc); + kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -880,10 +871,10 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&ei->i_prealloc_lock); ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); - INIT_LIST_HEAD(&ei->i_es_lru); + INIT_LIST_HEAD(&ei->i_es_list); ei->i_es_all_nr = 0; - ei->i_es_lru_nr = 0; - ei->i_touch_when = 0; + ei->i_es_shk_nr = 0; + ei->i_es_shrink_lblk = 0; ei->i_reserved_data_blocks = 0; ei->i_reserved_meta_blocks = 0; ei->i_allocated_meta_blocks = 0; @@ -892,6 +883,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&(ei->i_block_reservation_lock)); #ifdef CONFIG_QUOTA ei->i_reserved_quota = 0; + memset(&ei->i_dquot, 0, sizeof(ei->i_dquot)); #endif ei->jinode = NULL; INIT_LIST_HEAD(&ei->i_rsv_conversion_list); @@ -972,7 +964,6 @@ void ext4_clear_inode(struct inode *inode) dquot_drop(inode); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); - ext4_es_lru_del(inode); if (EXT4_I(inode)->jinode) { jbd2_journal_release_jbd_inode(EXT4_JOURNAL(inode), EXT4_I(inode)->jinode); @@ -1055,10 +1046,7 @@ static int ext4_mark_dquot_dirty(struct dquot *dquot); static int ext4_write_info(struct super_block *sb, int type); static int ext4_quota_on(struct super_block *sb, int type, int format_id, struct path *path); -static int ext4_quota_on_sysfile(struct super_block *sb, int type, - int format_id); static int ext4_quota_off(struct super_block *sb, int type); -static int ext4_quota_off_sysfile(struct super_block *sb, int type); static int ext4_quota_on_mount(struct super_block *sb, int type); static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, size_t len, loff_t off); @@ -1068,6 +1056,11 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, unsigned int flags); static int ext4_enable_quotas(struct super_block *sb); +static struct dquot **ext4_get_dquots(struct inode *inode) +{ + return EXT4_I(inode)->i_dquot; +} + static const struct dquot_operations ext4_quota_operations = { .get_reserved_space = ext4_get_reserved_space, .write_dquot = ext4_write_dquot, @@ -1088,16 +1081,6 @@ static const struct quotactl_ops ext4_qctl_operations = { .get_dqblk = dquot_get_dqblk, .set_dqblk = dquot_set_dqblk }; - -static const struct quotactl_ops ext4_qctl_sysfile_operations = { - .quota_on_meta = ext4_quota_on_sysfile, - .quota_off = ext4_quota_off_sysfile, - .quota_sync = dquot_quota_sync, - .get_info = dquot_get_dqinfo, - .set_info = dquot_set_dqinfo, - .get_dqblk = dquot_get_dqblk, - .set_dqblk = dquot_set_dqblk -}; #endif static const struct super_operations ext4_sops = { @@ -1117,6 +1100,7 @@ static const struct super_operations ext4_sops = { #ifdef CONFIG_QUOTA .quota_read = ext4_quota_read, .quota_write = ext4_quota_write, + .get_dquots = ext4_get_dquots, #endif .bdev_try_to_free_page = bdev_try_to_free_page, }; @@ -1140,13 +1124,14 @@ enum { Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota, Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err, - Opt_usrquota, Opt_grpquota, Opt_i_version, + Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax, Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit, + Opt_lazytime, Opt_nolazytime, Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio, Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, - Opt_max_dir_size_kb, + Opt_max_dir_size_kb, Opt_nojournal_checksum, }; static const match_table_t tokens = { @@ -1180,6 +1165,7 @@ static const match_table_t tokens = { {Opt_journal_dev, "journal_dev=%u"}, {Opt_journal_path, "journal_path=%s"}, {Opt_journal_checksum, "journal_checksum"}, + {Opt_nojournal_checksum, "nojournal_checksum"}, {Opt_journal_async_commit, "journal_async_commit"}, {Opt_abort, "abort"}, {Opt_data_journal, "data=journal"}, @@ -1202,8 +1188,11 @@ static const match_table_t tokens = { {Opt_barrier, "barrier"}, {Opt_nobarrier, "nobarrier"}, {Opt_i_version, "i_version"}, + {Opt_dax, "dax"}, {Opt_stripe, "stripe=%u"}, {Opt_delalloc, "delalloc"}, + {Opt_lazytime, "lazytime"}, + {Opt_nolazytime, "nolazytime"}, {Opt_nodelalloc, "nodelalloc"}, {Opt_removed, "mblk_io_submit"}, {Opt_removed, "nomblk_io_submit"}, @@ -1361,6 +1350,8 @@ static const struct mount_opts { MOPT_EXT4_ONLY | MOPT_SET | MOPT_EXPLICIT}, {Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_EXT4_ONLY | MOPT_CLEAR}, + {Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, + MOPT_EXT4_ONLY | MOPT_CLEAR}, {Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_EXT4_ONLY | MOPT_SET}, {Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT | @@ -1384,6 +1375,7 @@ static const struct mount_opts { {Opt_min_batch_time, 0, MOPT_GTE0}, {Opt_inode_readahead_blks, 0, MOPT_GTE0}, {Opt_init_itable, 0, MOPT_GTE0}, + {Opt_dax, EXT4_MOUNT_DAX, MOPT_SET}, {Opt_stripe, 0, MOPT_GTE0}, {Opt_resuid, 0, MOPT_GTE0}, {Opt_resgid, 0, MOPT_GTE0}, @@ -1459,6 +1451,12 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, case Opt_i_version: sb->s_flags |= MS_I_VERSION; return 1; + case Opt_lazytime: + sb->s_flags |= MS_LAZYTIME; + return 1; + case Opt_nolazytime: + sb->s_flags &= ~MS_LAZYTIME; + return 1; } for (m = ext4_mount_opts; m->token != Opt_err; m++) @@ -1620,6 +1618,11 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } sbi->s_jquota_fmt = m->mount_opt; #endif +#ifndef CONFIG_FS_DAX + } else if (token == Opt_dax) { + ext4_msg(sb, KERN_INFO, "dax option not supported"); + return -1; +#endif } else { if (!args->from) arg = 1; @@ -1702,6 +1705,12 @@ static int parse_options(char *options, struct super_block *sb, return 0; } } + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA && + test_opt(sb, JOURNAL_ASYNC_COMMIT)) { + ext4_msg(sb, KERN_ERR, "can't mount with journal_async_commit " + "in data=ordered mode"); + return 0; + } return 1; } @@ -1939,7 +1948,7 @@ int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup) memcpy(new_groups, sbi->s_flex_groups, (sbi->s_flex_groups_allocated * sizeof(struct flex_groups))); - ext4_kvfree(sbi->s_flex_groups); + kvfree(sbi->s_flex_groups); } sbi->s_flex_groups = new_groups; sbi->s_flex_groups_allocated = size / sizeof(struct flex_groups); @@ -2770,6 +2779,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) if (readonly) return 1; + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_READONLY)) { + ext4_msg(sb, KERN_INFO, "filesystem is read-only"); + sb->s_flags |= MS_RDONLY; + return 1; + } + /* Check that feature set is OK for a read-write mount */ if (EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP)) { ext4_msg(sb, KERN_ERR, "couldn't mount RDWR because of " @@ -3310,7 +3325,7 @@ int ext4_calculate_overhead(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; - char *buf = (char *) get_zeroed_page(GFP_KERNEL); + char *buf = (char *) get_zeroed_page(GFP_NOFS); if (!buf) return -ENOMEM; @@ -3338,8 +3353,8 @@ int ext4_calculate_overhead(struct super_block *sb) memset(buf, 0, PAGE_SIZE); cond_resched(); } - /* Add the journal blocks as well */ - if (sbi->s_journal) + /* Add the internal journal blocks as well */ + if (sbi->s_journal && !sbi->journal_bdev) overhead += EXT4_NUM_B2C(sbi, sbi->s_journal->j_maxlen); sbi->s_overhead = overhead; @@ -3476,7 +3491,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) - ext4_warning(sb, KERN_INFO "metadata_csum and uninit_bg are " + ext4_warning(sb, "metadata_csum and uninit_bg are " "redundant flags; please run fsck."); /* Check for a known checksum algorithm */ @@ -3596,6 +3611,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) "both data=journal and dioread_nolock"); goto failed_mount; } + if (test_opt(sb, DAX)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + goto failed_mount; + } if (test_opt(sb, DELALLOC)) clear_opt(sb, DELALLOC); } @@ -3659,6 +3679,19 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount; } + if (sbi->s_mount_opt & EXT4_MOUNT_DAX) { + if (blocksize != PAGE_SIZE) { + ext4_msg(sb, KERN_ERR, + "error: unsupported blocksize for dax"); + goto failed_mount; + } + if (!sb->s_bdev->bd_disk->fops->direct_access) { + ext4_msg(sb, KERN_ERR, + "error: device does not support dax"); + goto failed_mount; + } + } + if (sb->s_blocksize != blocksize) { /* Validate the filesystem blocksize */ if (!sb_set_blocksize(sb, blocksize)) { @@ -3909,9 +3942,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) get_random_bytes(&sbi->s_next_generation, sizeof(u32)); spin_lock_init(&sbi->s_next_gen_lock); - init_timer(&sbi->s_err_report); - sbi->s_err_report.function = print_daily_error_info; - sbi->s_err_report.data = (unsigned long) sb; + setup_timer(&sbi->s_err_report, print_daily_error_info, + (unsigned long) sb); /* Register extent status tree shrinker */ if (ext4_es_register_shrinker(sbi)) @@ -3929,9 +3961,10 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #ifdef CONFIG_QUOTA sb->dq_op = &ext4_quota_operations; if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) - sb->s_qcop = &ext4_qctl_sysfile_operations; + sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; #endif memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); @@ -4224,7 +4257,7 @@ failed_mount7: failed_mount6: ext4_mb_release(sb); if (sbi->s_flex_groups) - ext4_kvfree(sbi->s_flex_groups); + kvfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeclusters_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -4253,7 +4286,7 @@ failed_mount3: failed_mount2: for (i = 0; i < db_count; i++) brelse(sbi->s_group_desc[i]); - ext4_kvfree(sbi->s_group_desc); + kvfree(sbi->s_group_desc); failed_mount: if (sbi->s_chksum_driver) crypto_free_shash(sbi->s_chksum_driver); @@ -4838,9 +4871,6 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal && sbi->s_journal->j_task->io_context) journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; - /* - * Allow the "check" option to be passed as a remount option. - */ if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { err = -EINVAL; goto restore_opts; @@ -4849,9 +4879,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if ((old_opts.s_mount_opt & EXT4_MOUNT_JOURNAL_CHECKSUM) ^ test_opt(sb, JOURNAL_CHECKSUM)) { ext4_msg(sb, KERN_ERR, "changing journal_checksum " - "during remount not supported"); - err = -EINVAL; - goto restore_opts; + "during remount not supported; ignoring"); + sbi->s_mount_opt ^= EXT4_MOUNT_JOURNAL_CHECKSUM; } if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) { @@ -4867,6 +4896,18 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) err = -EINVAL; goto restore_opts; } + if (test_opt(sb, DAX)) { + ext4_msg(sb, KERN_ERR, "can't mount with " + "both data=journal and dax"); + err = -EINVAL; + goto restore_opts; + } + } + + if ((sbi->s_mount_opt ^ old_opts.s_mount_opt) & EXT4_MOUNT_DAX) { + ext4_msg(sb, KERN_WARNING, "warning: refusing change of " + "dax flag with busy inodes while remounting"); + sbi->s_mount_opt ^= EXT4_MOUNT_DAX; } if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) @@ -4915,7 +4956,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) ext4_mark_recovery_complete(sb, es); } else { /* Make sure we can mount this feature set readwrite */ - if (!ext4_feature_set_ok(sb, 0)) { + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_READONLY) || + !ext4_feature_set_ok(sb, 0)) { err = -EROFS; goto restore_opts; } @@ -5005,6 +5048,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } #endif + *flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME); ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data); kfree(orig_data); return 0; @@ -5273,21 +5317,6 @@ static int ext4_enable_quotas(struct super_block *sb) return 0; } -/* - * quota_on function that is used when QUOTA feature is set. - */ -static int ext4_quota_on_sysfile(struct super_block *sb, int type, - int format_id) -{ - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) - return -EINVAL; - - /* - * USAGE was enabled at mount time. Only need to enable LIMITS now. - */ - return ext4_quota_enable(sb, type, format_id, DQUOT_LIMITS_ENABLED); -} - static int ext4_quota_off(struct super_block *sb, int type) { struct inode *inode = sb_dqopt(sb)->files[type]; @@ -5314,18 +5343,6 @@ out: return dquot_quota_off(sb, type); } -/* - * quota_off function that is used when QUOTA feature is set. - */ -static int ext4_quota_off_sysfile(struct super_block *sb, int type) -{ - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_QUOTA)) - return -EINVAL; - - /* Disable only the limits. */ - return dquot_disable(sb, type, DQUOT_LIMITS_ENABLED); -} - /* Read data from quotafile - avoid pagecache and such because we cannot afford * acquiring the locks... As quota files are never truncated and quota code * itself serializes the operations (and no one else should touch the files) |