diff options
Diffstat (limited to 'fs/iomap/buffered-io.c')
-rw-r--r-- | fs/iomap/buffered-io.c | 133 |
1 files changed, 42 insertions, 91 deletions
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 11ea747228ae..ef0b68bccbb6 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1145,10 +1145,36 @@ static void iomap_write_delalloc_scan(struct inode *inode, } /* + * When a short write occurs, the filesystem might need to use ->iomap_end + * to remove space reservations created in ->iomap_begin. + * + * For filesystems that use delayed allocation, there can be dirty pages over + * the delalloc extent outside the range of a short write but still within the + * delalloc extent allocated for this iomap if the write raced with page + * faults. + * * Punch out all the delalloc blocks in the range given except for those that * have dirty data still pending in the page cache - those are going to be * written and so must still retain the delalloc backing for writeback. * + * The punch() callback *must* only punch delalloc extents in the range passed + * to it. It must skip over all other types of extents in the range and leave + * them completely unchanged. It must do this punch atomically with respect to + * other extent modifications. + * + * The punch() callback may be called with a folio locked to prevent writeback + * extent allocation racing at the edge of the range we are currently punching. + * The locked folio may or may not cover the range being punched, so it is not + * safe for the punch() callback to lock folios itself. + * + * Lock order is: + * + * inode->i_rwsem (shared or exclusive) + * inode->i_mapping->invalidate_lock (exclusive) + * folio_lock() + * ->punch + * internal filesystem allocation lock + * * As we are scanning the page cache for data, we don't need to reimplement the * wheel - mapping_seek_hole_data() does exactly what we need to identify the * start and end of data ranges correctly even for sub-folio block sizes. This @@ -1177,7 +1203,7 @@ static void iomap_write_delalloc_scan(struct inode *inode, * require sprinkling this code with magic "+ 1" and "- 1" arithmetic and expose * the code to subtle off-by-one bugs.... */ -static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, +void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, loff_t end_byte, unsigned flags, struct iomap *iomap, iomap_punch_t punch) { @@ -1185,12 +1211,13 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, loff_t scan_end_byte = min(i_size_read(inode), end_byte); /* - * Lock the mapping to avoid races with page faults re-instantiating - * folios and dirtying them via ->page_mkwrite whilst we walk the - * cache and perform delalloc extent removal. Failing to do this can - * leave dirty pages with no space reservation in the cache. + * The caller must hold invalidate_lock to avoid races with page faults + * re-instantiating folios and dirtying them via ->page_mkwrite whilst + * we walk the cache and perform delalloc extent removal. Failing to do + * this can leave dirty pages with no space reservation in the cache. */ - filemap_invalidate_lock(inode->i_mapping); + lockdep_assert_held_write(&inode->i_mapping->invalidate_lock); + while (start_byte < scan_end_byte) { loff_t data_end; @@ -1207,7 +1234,7 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, if (start_byte == -ENXIO || start_byte == scan_end_byte) break; if (WARN_ON_ONCE(start_byte < 0)) - goto out_unlock; + return; WARN_ON_ONCE(start_byte < punch_start_byte); WARN_ON_ONCE(start_byte > scan_end_byte); @@ -1218,7 +1245,7 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, data_end = mapping_seek_hole_data(inode->i_mapping, start_byte, scan_end_byte, SEEK_HOLE); if (WARN_ON_ONCE(data_end < 0)) - goto out_unlock; + return; /* * If we race with post-direct I/O invalidation of the page cache, @@ -1240,74 +1267,8 @@ static void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, if (punch_start_byte < end_byte) punch(inode, punch_start_byte, end_byte - punch_start_byte, iomap); -out_unlock: - filemap_invalidate_unlock(inode->i_mapping); -} - -/* - * When a short write occurs, the filesystem may need to remove reserved space - * that was allocated in ->iomap_begin from it's ->iomap_end method. For - * filesystems that use delayed allocation, we need to punch out delalloc - * extents from the range that are not dirty in the page cache. As the write can - * race with page faults, there can be dirty pages over the delalloc extent - * outside the range of a short write but still within the delalloc extent - * allocated for this iomap. - * - * This function uses [start_byte, end_byte) intervals (i.e. open ended) to - * simplify range iterations. - * - * The punch() callback *must* only punch delalloc extents in the range passed - * to it. It must skip over all other types of extents in the range and leave - * them completely unchanged. It must do this punch atomically with respect to - * other extent modifications. - * - * The punch() callback may be called with a folio locked to prevent writeback - * extent allocation racing at the edge of the range we are currently punching. - * The locked folio may or may not cover the range being punched, so it is not - * safe for the punch() callback to lock folios itself. - * - * Lock order is: - * - * inode->i_rwsem (shared or exclusive) - * inode->i_mapping->invalidate_lock (exclusive) - * folio_lock() - * ->punch - * internal filesystem allocation lock - */ -void iomap_file_buffered_write_punch_delalloc(struct inode *inode, - loff_t pos, loff_t length, ssize_t written, unsigned flags, - struct iomap *iomap, iomap_punch_t punch) -{ - loff_t start_byte; - loff_t end_byte; - unsigned int blocksize = i_blocksize(inode); - - if (iomap->type != IOMAP_DELALLOC) - return; - - /* If we didn't reserve the blocks, we're not allowed to punch them. */ - if (!(iomap->flags & IOMAP_F_NEW)) - return; - - /* - * start_byte refers to the first unused block after a short write. If - * nothing was written, round offset down to point at the first block in - * the range. - */ - if (unlikely(!written)) - start_byte = round_down(pos, blocksize); - else - start_byte = round_up(pos + written, blocksize); - end_byte = round_up(pos + length, blocksize); - - /* Nothing to do if we've written the entire delalloc extent */ - if (start_byte >= end_byte) - return; - - iomap_write_delalloc_release(inode, start_byte, end_byte, flags, iomap, - punch); } -EXPORT_SYMBOL_GPL(iomap_file_buffered_write_punch_delalloc); +EXPORT_SYMBOL_GPL(iomap_write_delalloc_release); static loff_t iomap_unshare_iter(struct iomap_iter *iter) { @@ -1316,21 +1277,7 @@ static loff_t iomap_unshare_iter(struct iomap_iter *iter) loff_t length = iomap_length(iter); loff_t written = 0; - /* Don't bother with blocks that are not shared to start with. */ - if (!(iomap->flags & IOMAP_F_SHARED)) - return length; - - /* - * Don't bother with holes or unwritten extents. - * - * Note that we use srcmap directly instead of iomap_iter_srcmap as - * unsharing requires providing a separate source map, and the presence - * of one is a good indicator that unsharing is needed, unlike - * IOMAP_F_SHARED which can be set for any data that goes into the COW - * fork for XFS. - */ - if (iter->srcmap.type == IOMAP_HOLE || - iter->srcmap.type == IOMAP_UNWRITTEN) + if (!iomap_want_unshare_iter(iter)) return length; do { @@ -1374,11 +1321,15 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, struct iomap_iter iter = { .inode = inode, .pos = pos, - .len = len, .flags = IOMAP_WRITE | IOMAP_UNSHARE, }; + loff_t size = i_size_read(inode); int ret; + if (pos < 0 || pos >= size) + return 0; + + iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) iter.processed = iomap_unshare_iter(&iter); return ret; |