Diffstat (limited to 'fs/btrfs')
70 files changed, 4083 insertions, 3245 deletions
diff --git a/fs/btrfs/accessors.c b/fs/btrfs/accessors.c
index 206cf1612c1d..1925a0919ca6 100644
--- a/fs/btrfs/accessors.c
+++ b/fs/btrfs/accessors.c
@@ -27,7 +27,7 @@ static bool check_setget_bounds(const struct extent_buffer *eb,
 void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb)
 {
 	token->eb = eb;
-	token->kaddr = page_address(eb->pages[0]);
+	token->kaddr = folio_address(eb->folios[0]);
 	token->offset = 0;
 }

@@ -50,7 +50,7 @@ void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *e
  * an offset into the extent buffer page array, cast to a specific type. This
  * gives us all the type checking.
  *
- * The extent buffer pages stored in the array pages do not form a contiguous
+ * The extent buffer pages stored in the array folios may not form a contiguous
  * phyusical range, but the API functions assume the linear offset to the range
  * from 0 to metadata node size.
  */
@@ -60,28 +60,30 @@ u##bits btrfs_get_token_##bits(struct btrfs_map_token *token,	\
 			       const void *ptr, unsigned long off)	\
 {									\
 	const unsigned long member_offset = (unsigned long)ptr + off;	\
-	const unsigned long idx = get_eb_page_index(member_offset);	\
-	const unsigned long oip = get_eb_offset_in_page(token->eb,	\
-							member_offset); \
+	const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
+	const unsigned long oil = get_eb_offset_in_folio(token->eb,	\
+							 member_offset);\
+	const int unit_size = folio_size(token->eb->folios[0]);	\
+	const int unit_shift = folio_shift(token->eb->folios[0]);	\
 	const int size = sizeof(u##bits);				\
 	u8 lebytes[sizeof(u##bits)];					\
-	const int part = PAGE_SIZE - oip;				\
+	const int part = unit_size - oil;				\
									\
 	ASSERT(token);							\
 	ASSERT(token->kaddr);						\
 	ASSERT(check_setget_bounds(token->eb, ptr, off, size));	\
 	if (token->offset <= member_offset &&				\
-	    member_offset + size <= token->offset + PAGE_SIZE) {	\
-		return get_unaligned_le##bits(token->kaddr + oip);	\
+	    member_offset + size <= token->offset + unit_size) {	\
+		return get_unaligned_le##bits(token->kaddr + oil);	\
 	}								\
-	token->kaddr = page_address(token->eb->pages[idx]);		\
-	token->offset = idx << PAGE_SHIFT;				\
-	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE ) \
-		return get_unaligned_le##bits(token->kaddr + oip);	\
+	token->kaddr = folio_address(token->eb->folios[idx]);		\
+	token->offset = idx << unit_shift;				\
+	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
+		return get_unaligned_le##bits(token->kaddr + oil);	\
									\
-	memcpy(lebytes, token->kaddr + oip, part);			\
-	token->kaddr = page_address(token->eb->pages[idx + 1]);	\
-	token->offset = (idx + 1) << PAGE_SHIFT;			\
+	memcpy(lebytes, token->kaddr + oil, part);			\
+	token->kaddr = folio_address(token->eb->folios[idx + 1]);	\
+	token->offset = (idx + 1) << unit_shift;			\
 	memcpy(lebytes + part, token->kaddr, size - part);		\
 	return get_unaligned_le##bits(lebytes);			\
 }									\
@@ -89,19 +91,21 @@ u##bits btrfs_get_##bits(const struct extent_buffer *eb,		\
 			 const void *ptr, unsigned long off)		\
 {									\
 	const unsigned long member_offset = (unsigned long)ptr + off;	\
-	const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
-	const unsigned long idx = get_eb_page_index(member_offset);	\
-	char *kaddr = page_address(eb->pages[idx]);			\
+	const unsigned long idx = get_eb_folio_index(eb, member_offset);\
+	const unsigned long oil = get_eb_offset_in_folio(eb,		\
+							 member_offset);\
+	const int unit_size = folio_size(eb->folios[0]);		\
+	char *kaddr = folio_address(eb->folios[idx]);			\
 	const int size = sizeof(u##bits);				\
-	const int part = PAGE_SIZE - oip;				\
+	const int part = unit_size - oil;				\
 	u8 lebytes[sizeof(u##bits)];					\
									\
 	ASSERT(check_setget_bounds(eb, ptr, off, size));		\
-	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) \
-		return get_unaligned_le##bits(kaddr + oip);		\
+	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oil + size <= unit_size) \
+		return get_unaligned_le##bits(kaddr + oil);		\
									\
-	memcpy(lebytes, kaddr + oip, part);				\
-	kaddr = page_address(eb->pages[idx + 1]);			\
+	memcpy(lebytes, kaddr + oil, part);				\
+	kaddr = folio_address(eb->folios[idx + 1]);			\
 	memcpy(lebytes + part, kaddr, size - part);			\
 	return get_unaligned_le##bits(lebytes);			\
 }									\
@@ -110,53 +114,59 @@ void btrfs_set_token_##bits(struct btrfs_map_token *token,		\
 			    u##bits val)				\
 {									\
 	const unsigned long member_offset = (unsigned long)ptr + off;	\
-	const unsigned long idx = get_eb_page_index(member_offset);	\
-	const unsigned long oip = get_eb_offset_in_page(token->eb,	\
-							member_offset); \
+	const unsigned long idx = get_eb_folio_index(token->eb, member_offset); \
+	const unsigned long oil = get_eb_offset_in_folio(token->eb,	\
+							 member_offset);\
+	const int unit_size = folio_size(token->eb->folios[0]);	\
+	const int unit_shift = folio_shift(token->eb->folios[0]);	\
 	const int size = sizeof(u##bits);				\
 	u8 lebytes[sizeof(u##bits)];					\
-	const int part = PAGE_SIZE - oip;				\
+	const int part = unit_size - oil;				\
									\
 	ASSERT(token);							\
 	ASSERT(token->kaddr);						\
 	ASSERT(check_setget_bounds(token->eb, ptr, off, size));	\
 	if (token->offset <= member_offset &&				\
-	    member_offset + size <= token->offset + PAGE_SIZE) {	\
-		put_unaligned_le##bits(val, token->kaddr + oip);	\
+	    member_offset + size <= token->offset + unit_size) {	\
+		put_unaligned_le##bits(val, token->kaddr + oil);	\
 		return;							\
 	}								\
-	token->kaddr = page_address(token->eb->pages[idx]);		\
-	token->offset = idx << PAGE_SHIFT;				\
-	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
-		put_unaligned_le##bits(val, token->kaddr + oip);	\
+	token->kaddr = folio_address(token->eb->folios[idx]);		\
+	token->offset = idx << unit_shift;				\
+	if (INLINE_EXTENT_BUFFER_PAGES == 1 ||				\
+	    oil + size <= unit_size) {					\
+		put_unaligned_le##bits(val, token->kaddr + oil);	\
 		return;							\
 	}								\
 	put_unaligned_le##bits(val, lebytes);				\
-	memcpy(token->kaddr + oip, lebytes, part);			\
-	token->kaddr = page_address(token->eb->pages[idx + 1]);	\
-	token->offset = (idx + 1) << PAGE_SHIFT;			\
+	memcpy(token->kaddr + oil, lebytes, part);			\
+	token->kaddr = folio_address(token->eb->folios[idx + 1]);	\
+	token->offset = (idx + 1) << unit_shift;			\
 	memcpy(token->kaddr, lebytes + part, size - part);		\
 }									\
 void btrfs_set_##bits(const struct extent_buffer *eb, void *ptr,	\
 		      unsigned long off, u##bits val)			\
 {									\
 	const unsigned long member_offset = (unsigned long)ptr + off;	\
-	const unsigned long oip = get_eb_offset_in_page(eb, member_offset); \
-	const unsigned long idx = get_eb_page_index(member_offset);	\
-	char *kaddr = page_address(eb->pages[idx]);			\
+	const unsigned long idx = get_eb_folio_index(eb, member_offset);\
+	const unsigned long oil = get_eb_offset_in_folio(eb,		\
+							 member_offset);\
+	const int unit_size = folio_size(eb->folios[0]);		\
+	char *kaddr = folio_address(eb->folios[idx]);			\
 	const int size = sizeof(u##bits);				\
-	const int part = PAGE_SIZE - oip;				\
+	const int part = unit_size - oil;				\
 	u8 lebytes[sizeof(u##bits)];					\
									\
 	ASSERT(check_setget_bounds(eb, ptr, off, size));		\
-	if (INLINE_EXTENT_BUFFER_PAGES == 1 || oip + size <= PAGE_SIZE) { \
-		put_unaligned_le##bits(val, kaddr + oip);		\
+	if (INLINE_EXTENT_BUFFER_PAGES == 1 ||				\
+	    oil + size <= unit_size) {					\
+		put_unaligned_le##bits(val, kaddr + oil);		\
 		return;							\
 	}								\
									\
 	put_unaligned_le##bits(val, lebytes);				\
-	memcpy(kaddr + oip, lebytes, part);				\
-	kaddr = page_address(eb->pages[idx + 1]);			\
+	memcpy(kaddr + oil, lebytes, part);				\
+	kaddr = folio_address(eb->folios[idx + 1]);			\
 	memcpy(kaddr, lebytes + part, size - part);			\
 }
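The converted macros reduce every access to two questions: which backing unit holds the first byte of the member, and does the member spill into the next unit. A standalone sketch of that index/offset split, assuming power-of-two unit sizes; the demo_* names are illustrative stand-ins, not the kernel helpers:

#include <assert.h>
#include <stddef.h>

/* A buffer backed by equally sized, power-of-two units (pages or folios). */
struct demo_eb {
	size_t unit_size;	/* folio_size(folios[0]) in the kernel code */
};

static size_t demo_unit_index(const struct demo_eb *eb, size_t offset)
{
	return offset / eb->unit_size;
}

static size_t demo_offset_in_unit(const struct demo_eb *eb, size_t offset)
{
	return offset & (eb->unit_size - 1);
}

int main(void)
{
	struct demo_eb eb = { .unit_size = 4096 };
	size_t off = 4094;	/* a 32-bit value here straddles two units */

	assert(demo_unit_index(&eb, off) == 0);
	/* part = unit_size - oil, the bytes readable before crossing over */
	assert(eb.unit_size - demo_offset_in_unit(&eb, off) == 2);
	return 0;
}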
diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h
index aa0844535644..ed7aa32972ad 100644
--- a/fs/btrfs/accessors.h
+++ b/fs/btrfs/accessors.h
@@ -90,14 +90,14 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\
 #define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits)		\
 static inline u##bits btrfs_##name(const struct extent_buffer *eb)	\
 {									\
-	const type *p = page_address(eb->pages[0]) +			\
+	const type *p = folio_address(eb->folios[0]) +			\
 			offset_in_page(eb->start);			\
 	return get_unaligned_le##bits(&p->member);			\
 }									\
 static inline void btrfs_set_##name(const struct extent_buffer *eb,	\
 				    u##bits val)			\
 {									\
-	type *p = page_address(eb->pages[0]) + offset_in_page(eb->start); \
+	type *p = folio_address(eb->folios[0]) + offset_in_page(eb->start); \
 	put_unaligned_le##bits(val, &p->member);			\
 }

diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c
index 4f3b693a16b1..928f512cdb4a 100644
--- a/fs/btrfs/bio.c
+++ b/fs/btrfs/bio.c
@@ -194,6 +194,12 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
 	struct bio_vec *bv = bio_first_bvec_all(&repair_bbio->bio);
 	int mirror = repair_bbio->mirror_num;

+	/*
+	 * We can only trigger this for data bio, which doesn't support larger
+	 * folios yet.
+	 */
+	ASSERT(folio_order(page_folio(bv->bv_page)) == 0);
+
 	if (repair_bbio->bio.bi_status ||
 	    !btrfs_data_csum_ok(repair_bbio, dev, 0, bv)) {
 		bio_reset(&repair_bbio->bio, NULL, REQ_OP_READ);
@@ -215,7 +221,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio,
 		btrfs_repair_io_failure(fs_info, btrfs_ino(inode),
 				  repair_bbio->file_offset, fs_info->sectorsize,
 				  repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT,
-				  bv->bv_page, bv->bv_offset, mirror);
+				  page_folio(bv->bv_page), bv->bv_offset, mirror);
 	} while (mirror != fbio->bbio->mirror_num);

 done:
@@ -626,7 +632,7 @@ static bool should_async_write(struct btrfs_bio *bbio)
 /*
  * Submit bio to an async queue.
  *
- * Return true if the work has been succesfuly submitted, else false.
+ * Return true if the work has been successfully submitted, else false.
  */
 static bool btrfs_wq_submit_bio(struct btrfs_bio *bbio,
 				struct btrfs_io_context *bioc,
@@ -767,8 +773,8 @@ void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num)
  * freeing the bio.
  */
 int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
-			    u64 length, u64 logical, struct page *page,
-			    unsigned int pg_offset, int mirror_num)
+			    u64 length, u64 logical, struct folio *folio,
+			    unsigned int folio_offset, int mirror_num)
 {
 	struct btrfs_io_stripe smap = { 0 };
 	struct bio_vec bvec;
@@ -799,7 +805,8 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,

 	bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
 	bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
-	__bio_add_page(&bio, page, length, pg_offset);
+	ret = bio_add_folio(&bio, folio, length, folio_offset);
+	ASSERT(ret);
 	ret = submit_bio_wait(&bio);
 	if (ret) {
 		/* try to remap that extent elsewhere? */
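Unlike the void __bio_add_page(), bio_add_folio() reports failure, which is why the converted repair path now ASSERTs its result: the bio was initialized with room for exactly one vector, so the add can only fail on a programming error. A kernel-style sketch of that one-vector pattern; demo_write_one_folio() is illustrative, the block layer calls are the ones used by the patch:

/* Kernel-style fragment, not a standalone program. */
static int demo_write_one_folio(struct block_device *bdev, sector_t sector,
				struct folio *folio, size_t len, size_t offset)
{
	struct bio_vec bvec;
	struct bio bio;
	int ret;

	/* Room for exactly one vector, on the stack. */
	bio_init(&bio, bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
	bio.bi_iter.bi_sector = sector;
	if (!bio_add_folio(&bio, folio, len, offset))
		return -EIO;	/* only possible if the bio were already full */
	ret = submit_bio_wait(&bio);
	bio_uninit(&bio);
	return ret;
}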
diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h
index ca79decee060..bbaed317161a 100644
--- a/fs/btrfs/bio.h
+++ b/fs/btrfs/bio.h
@@ -105,7 +105,7 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status);
 void btrfs_submit_bio(struct btrfs_bio *bbio, int mirror_num);
 void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace);
 int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
-			    u64 length, u64 logical, struct page *page,
-			    unsigned int pg_offset, int mirror_num);
+			    u64 length, u64 logical, struct folio *folio,
+			    unsigned int folio_offset, int mirror_num);

 #endif

diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 6e5dc68ff661..a9be9ac99222 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -168,7 +168,7 @@ void btrfs_put_block_group(struct btrfs_block_group *cache)
 			cache);

 		kfree(cache->free_space_ctl);
-		kfree(cache->physical_map);
+		btrfs_free_chunk_map(cache->physical_map);
 		kfree(cache);
 	}
 }

@@ -1047,7 +1047,7 @@ static int remove_block_group_item(struct btrfs_trans_handle *trans,
 }

 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-			     u64 group_start, struct extent_map *em)
+			     struct btrfs_chunk_map *map)
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_path *path;
@@ -1059,10 +1059,10 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	int index;
 	int factor;
 	struct btrfs_caching_control *caching_ctl = NULL;
-	bool remove_em;
+	bool remove_map;
 	bool remove_rsv = false;

-	block_group = btrfs_lookup_block_group(fs_info, group_start);
+	block_group = btrfs_lookup_block_group(fs_info, map->start);
 	BUG_ON(!block_group);
 	BUG_ON(!block_group->ro);

@@ -1252,7 +1252,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	 * entries because we already removed them all when we called
 	 * btrfs_remove_free_space_cache().
 	 *
-	 * And we must not remove the extent map from the fs_info->mapping_tree
+	 * And we must not remove the chunk map from the fs_info->mapping_tree
 	 * to prevent the same logical address range and physical device space
 	 * ranges from being reused for a new block group. This is needed to
 	 * avoid races with trimming and scrub.
@@ -1268,19 +1268,11 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
 	 * in place until the extents have been discarded completely when
 	 * the transaction commit has completed.
	 */
-	remove_em = (atomic_read(&block_group->frozen) == 0);
+	remove_map = (atomic_read(&block_group->frozen) == 0);
 	spin_unlock(&block_group->lock);

-	if (remove_em) {
-		struct extent_map_tree *em_tree;
-
-		em_tree = &fs_info->mapping_tree;
-		write_lock(&em_tree->lock);
-		remove_extent_mapping(em_tree, em);
-		write_unlock(&em_tree->lock);
-		/* once for the tree */
-		free_extent_map(em);
-	}
+	if (remove_map)
+		btrfs_remove_chunk_map(fs_info, map);

 out:
 	/* Once for the lookup reference */
@@ -1295,15 +1287,12 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
 		struct btrfs_fs_info *fs_info, const u64 chunk_offset)
 {
 	struct btrfs_root *root = btrfs_block_group_root(fs_info);
-	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	unsigned int num_items;

-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, chunk_offset, 1);
-	read_unlock(&em_tree->lock);
-	ASSERT(em && em->start == chunk_offset);
+	map = btrfs_find_chunk_map(fs_info, chunk_offset, 1);
+	ASSERT(map != NULL);
+	ASSERT(map->start == chunk_offset);

 	/*
 	 * We need to reserve 3 + N units from the metadata space info in order
@@ -1324,9 +1313,8 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
 	 * more device items and remove one chunk item), but this is done at
 	 * btrfs_remove_chunk() through a call to check_system_chunk().
 	 */
-	map = em->map_lookup;
 	num_items = 3 + map->num_stripes;
-	free_extent_map(em);
+	btrfs_free_chunk_map(map);

 	return btrfs_start_transaction_fallback_global_rsv(root, num_items);
 }

@@ -1927,8 +1915,7 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
 static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
 			   struct btrfs_path *path)
 {
-	struct extent_map_tree *em_tree;
-	struct extent_map *em;
+	struct btrfs_chunk_map *map;
 	struct btrfs_block_group_item bg;
 	struct extent_buffer *leaf;
 	int slot;
@@ -1938,23 +1925,20 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
 	slot = path->slots[0];
 	leaf = path->nodes[0];

-	em_tree = &fs_info->mapping_tree;
-	read_lock(&em_tree->lock);
-	em = lookup_extent_mapping(em_tree, key->objectid, key->offset);
-	read_unlock(&em_tree->lock);
-	if (!em) {
+	map = btrfs_find_chunk_map(fs_info, key->objectid, key->offset);
+	if (!map) {
 		btrfs_err(fs_info,
 			  "logical %llu len %llu found bg but no related chunk",
 			  key->objectid, key->offset);
 		return -ENOENT;
 	}

-	if (em->start != key->objectid || em->len != key->offset) {
+	if (map->start != key->objectid || map->chunk_len != key->offset) {
 		btrfs_err(fs_info,
 			"block group %llu len %llu mismatch with chunk %llu len %llu",
-			  key->objectid, key->offset, em->start, em->len);
+			  key->objectid, key->offset, map->start, map->chunk_len);
 		ret = -EUCLEAN;
-		goto out_free_em;
+		goto out_free_map;
 	}

 	read_extent_buffer(leaf, &bg, btrfs_item_ptr_offset(leaf, slot),
@@ -1962,16 +1946,16 @@ static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
 	flags = btrfs_stack_block_group_flags(&bg) &
 		BTRFS_BLOCK_GROUP_TYPE_MASK;

-	if (flags != (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+	if (flags != (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
 		btrfs_err(fs_info,
"block group %llu len %llu type flags 0x%llx mismatch with chunk type flags 0x%llx",
 			  key->objectid, key->offset, flags,
-			  (BTRFS_BLOCK_GROUP_TYPE_MASK & em->map_lookup->type));
+			  (BTRFS_BLOCK_GROUP_TYPE_MASK & map->type));
 		ret = -EUCLEAN;
 	}

-out_free_em:
-	free_extent_map(em);
+out_free_map:
+	btrfs_free_chunk_map(map);
 	return ret;
 }

@@ -2024,8 +2008,7 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
 int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
 		     u64 physical, u64 **logical, int *naddrs, int *stripe_len)
 {
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	u64 *buf;
 	u64 bytenr;
 	u64 data_stripe_length;
@@ -2033,14 +2016,13 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
 	int i, nr = 0;
 	int ret = 0;

-	em = btrfs_get_chunk_map(fs_info, chunk_start, 1);
-	if (IS_ERR(em))
+	map = btrfs_get_chunk_map(fs_info, chunk_start, 1);
+	if (IS_ERR(map))
 		return -EIO;

-	map = em->map_lookup;
-	data_stripe_length = em->orig_block_len;
+	data_stripe_length = map->stripe_size;
 	io_stripe_size = BTRFS_STRIPE_LEN;
-	chunk_start = em->start;
+	chunk_start = map->start;

 	/* For RAID5/6 adjust to a full IO stripe length */
 	if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
@@ -2094,7 +2076,7 @@ int btrfs_rmap_block(struct btrfs_fs_info *fs_info, u64 chunk_start,
 	*naddrs = nr;
 	*stripe_len = io_stripe_size;
 out:
-	free_extent_map(em);
+	btrfs_free_chunk_map(map);
 	return ret;
 }

@@ -2199,49 +2181,47 @@ static struct btrfs_block_group *btrfs_create_block_group_cache(
  */
 static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info)
 {
-	struct extent_map_tree *map_tree = &fs_info->mapping_tree;
-	struct extent_map *em;
-	struct btrfs_block_group *bg;
 	u64 start = 0;
 	int ret = 0;

 	while (1) {
-		read_lock(&map_tree->lock);
+		struct btrfs_chunk_map *map;
+		struct btrfs_block_group *bg;
+
 		/*
-		 * lookup_extent_mapping will return the first extent map
-		 * intersecting the range, so setting @len to 1 is enough to
+		 * btrfs_find_chunk_map() will return the first chunk map
+		 * intersecting the range, so setting @length to 1 is enough to
 		 * get the first chunk.
 		 */
-		em = lookup_extent_mapping(map_tree, start, 1);
-		read_unlock(&map_tree->lock);
-		if (!em)
+		map = btrfs_find_chunk_map(fs_info, start, 1);
+		if (!map)
 			break;

-		bg = btrfs_lookup_block_group(fs_info, em->start);
+		bg = btrfs_lookup_block_group(fs_info, map->start);
 		if (!bg) {
 			btrfs_err(fs_info,
	"chunk start=%llu len=%llu doesn't have corresponding block group",
-				     em->start, em->len);
+				  map->start, map->chunk_len);
 			ret = -EUCLEAN;
-			free_extent_map(em);
+			btrfs_free_chunk_map(map);
 			break;
 		}
-		if (bg->start != em->start || bg->length != em->len ||
+		if (bg->start != map->start || bg->length != map->chunk_len ||
 		    (bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK) !=
-		    (em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
+		    (map->type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
 			btrfs_err(fs_info,
"chunk start=%llu len=%llu flags=0x%llx doesn't match block group start=%llu len=%llu flags=0x%llx",
-				em->start, em->len,
-				em->map_lookup->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
+				map->start, map->chunk_len,
+				map->type & BTRFS_BLOCK_GROUP_TYPE_MASK,
 				bg->start, bg->length,
 				bg->flags & BTRFS_BLOCK_GROUP_TYPE_MASK);
 			ret = -EUCLEAN;
-			free_extent_map(em);
+			btrfs_free_chunk_map(map);
 			btrfs_put_block_group(bg);
 			break;
 		}
-		start = em->start + em->len;
-		free_extent_map(em);
+		start = map->start + map->chunk_len;
+		btrfs_free_chunk_map(map);
 		btrfs_put_block_group(bg);
 	}
 	return ret;
@@ -2369,28 +2349,25 @@ error:

 static int fill_dummy_bgs(struct btrfs_fs_info *fs_info)
 {
-	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
 	struct rb_node *node;
 	int ret = 0;

-	for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
-		struct extent_map *em;
-		struct map_lookup *map;
+	for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) {
+		struct btrfs_chunk_map *map;
 		struct btrfs_block_group *bg;

-		em = rb_entry(node, struct extent_map, rb_node);
-		map = em->map_lookup;
-		bg = btrfs_create_block_group_cache(fs_info, em->start);
+		map = rb_entry(node, struct btrfs_chunk_map, rb_node);
+		bg = btrfs_create_block_group_cache(fs_info, map->start);
 		if (!bg) {
 			ret = -ENOMEM;
 			break;
 		}

 		/* Fill dummy cache as FULL */
-		bg->length = em->len;
+		bg->length = map->chunk_len;
 		bg->flags = map->type;
 		bg->cached = BTRFS_CACHE_FINISHED;
-		bg->used = em->len;
+		bg->used = map->chunk_len;
 		bg->flags = map->type;
 		ret = btrfs_add_block_group_cache(fs_info, bg);
 		/*
@@ -2618,19 +2595,14 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans,
 {
 	struct btrfs_fs_info *fs_info = trans->fs_info;
 	struct btrfs_device *device;
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	u64 dev_offset;
-	u64 stripe_size;
 	int i;
 	int ret = 0;

-	em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
-	if (IS_ERR(em))
-		return PTR_ERR(em);
-
-	map = em->map_lookup;
-	stripe_size = em->orig_block_len;
+	map = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
+	if (IS_ERR(map))
+		return PTR_ERR(map);

 	/*
 	 * Take the device list mutex to prevent races with the final phase of
@@ -2647,13 +2619,13 @@ static int insert_dev_extents(struct btrfs_trans_handle *trans,
 		dev_offset = map->stripes[i].physical;

 		ret = insert_dev_extent(trans, device, chunk_offset, dev_offset,
-					stripe_size);
+					map->stripe_size);
 		if (ret)
 			break;
 	}
 	mutex_unlock(&fs_info->fs_devices->device_list_mutex);

-	free_extent_map(em);
+	btrfs_free_chunk_map(map);
 	return ret;
 }

@@ -2910,7 +2882,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
 		goto unlock_out;

 	/*
-	 * Skip chunk alloction if the bg is SYSTEM, this is to avoid system
+	 * Skip chunk allocation if the bg is SYSTEM, this is to avoid system
 	 * chunk allocation storm to exhaust the system chunk array. Otherwise
 	 * we still want to try our best to mark the block group read-only.
 	 */
@@ -4406,8 +4378,6 @@ void btrfs_freeze_block_group(struct btrfs_block_group *cache)
 void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
 {
 	struct btrfs_fs_info *fs_info = block_group->fs_info;
-	struct extent_map_tree *em_tree;
-	struct extent_map *em;
 	bool cleanup;

 	spin_lock(&block_group->lock);
@@ -4416,17 +4386,16 @@ void btrfs_unfreeze_block_group(struct btrfs_block_group *block_group)
 	spin_unlock(&block_group->lock);

 	if (cleanup) {
-		em_tree = &fs_info->mapping_tree;
-		write_lock(&em_tree->lock);
-		em = lookup_extent_mapping(em_tree, block_group->start,
-					   1);
-		BUG_ON(!em); /* logic error, can't happen */
-		remove_extent_mapping(em_tree, em);
-		write_unlock(&em_tree->lock);
-
-		/* once for us and once for the tree */
-		free_extent_map(em);
-		free_extent_map(em);
+		struct btrfs_chunk_map *map;
+
+		map = btrfs_find_chunk_map(fs_info, block_group->start, 1);
+		/* Logic error, can't happen. */
+		ASSERT(map);
+
+		btrfs_remove_chunk_map(fs_info, map);
+
+		/* Once for our lookup reference. */
+		btrfs_free_chunk_map(map);

 		/*
 		 * We may have left one free space entry and other possible
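Throughout this conversion, every extent map lookup/free pair becomes a chunk map lookup/free pair with the same reference discipline: the lookup takes a reference, btrfs_free_chunk_map() drops it. A condensed kernel-style sketch of that discipline using the functions introduced by the series; demo_chunk_len() itself is illustrative:

/* Kernel-style fragment, not a standalone program. */
static u64 demo_chunk_len(struct btrfs_fs_info *fs_info, u64 logical)
{
	struct btrfs_chunk_map *map;
	u64 len;

	/* Returns the first chunk map intersecting [logical, logical + 1). */
	map = btrfs_find_chunk_map(fs_info, logical, 1);
	if (!map)
		return 0;
	len = map->chunk_len;
	btrfs_free_chunk_map(map);	/* drop the lookup reference */
	return len;
}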
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 2bdbcb834f95..c4a1f01cc1c2 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -5,6 +5,8 @@

 #include "free-space-cache.h"

+struct btrfs_chunk_map;
+
 enum btrfs_disk_cache_state {
 	BTRFS_DC_WRITTEN,
 	BTRFS_DC_ERROR,
@@ -243,7 +245,7 @@ struct btrfs_block_group {
 	u64 zone_unusable;
 	u64 zone_capacity;
 	u64 meta_write_pointer;
-	struct map_lookup *physical_map;
+	struct btrfs_chunk_map *physical_map;
 	struct list_head active_bg_list;
 	struct work_struct zone_finish_work;
 	struct extent_buffer *last_eb;
@@ -297,7 +299,7 @@ struct btrfs_trans_handle *btrfs_start_trans_remove_block_group(
 				struct btrfs_fs_info *fs_info,
 				const u64 chunk_offset);
 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
-			     u64 group_start, struct extent_map *em);
+			     struct btrfs_chunk_map *map);
 void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
 void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
 void btrfs_reclaim_bgs_work(struct work_struct *work);

diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 5572ae52444e..7f7c5a92d2b8 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -69,6 +69,8 @@ enum {
 	BTRFS_INODE_VERITY_IN_PROGRESS,
 	/* Set when this inode is a free space inode. */
 	BTRFS_INODE_FREE_SPACE_INODE,
+	/* Set when there are no capabilities in XATTs for the inode. */
+	BTRFS_INODE_NO_CAP_XATTR,
 };

 /* in memory btrfs inode */
@@ -107,9 +109,11 @@ struct btrfs_inode {

 	/*
 	 * Keep track of where the inode has extent items mapped in order to
-	 * make sure the i_size adjustments are accurate
+	 * make sure the i_size adjustments are accurate. Not required when the
+	 * filesystem is NO_HOLES, the status can't be set while mounted as
+	 * it's a mkfs-time feature.
	 */
-	struct extent_io_tree file_extent_tree;
+	struct extent_io_tree *file_extent_tree;

 	/* held while logging the inode in tree-log.c */
 	struct mutex log_mutex;
@@ -487,7 +491,7 @@ struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
 struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
 struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
 				    struct page *page, size_t pg_offset,
-				    u64 start, u64 end);
+				    u64 start, u64 len);
 int btrfs_update_inode(struct btrfs_trans_handle *trans,
 		       struct btrfs_inode *inode);
 int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
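Turning file_extent_tree into a pointer means it can stay unallocated on NO_HOLES filesystems, saving memory per inode. A standalone sketch of that lazily allocated member pattern; all demo_* names are illustrative:

#include <stdlib.h>

struct demo_tree { int dummy; };

struct demo_inode {
	struct demo_tree *file_extent_tree;	/* NULL when not needed */
};

static int demo_inode_init(struct demo_inode *inode, int fs_has_no_holes)
{
	if (fs_has_no_holes) {
		/* NO_HOLES makes the tracking unnecessary, skip the allocation. */
		inode->file_extent_tree = NULL;
		return 0;
	}
	inode->file_extent_tree = calloc(1, sizeof(*inode->file_extent_tree));
	return inode->file_extent_tree ? 0 : -1;
}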
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 19b22b4653c8..193168214eeb 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -20,6 +20,7 @@
 #include <linux/slab.h>
 #include <linux/sched/mm.h>
 #include <linux/log2.h>
+#include <linux/shrinker.h>
 #include <crypto/hash.h>
 #include "misc.h"
 #include "ctree.h"
@@ -163,13 +164,107 @@ static int compression_decompress(int type, struct list_head *ws,
 static void btrfs_free_compressed_pages(struct compressed_bio *cb)
 {
 	for (unsigned int i = 0; i < cb->nr_pages; i++)
-		put_page(cb->compressed_pages[i]);
+		btrfs_free_compr_page(cb->compressed_pages[i]);
 	kfree(cb->compressed_pages);
 }

 static int btrfs_decompress_bio(struct compressed_bio *cb);

-static void end_compressed_bio_read(struct btrfs_bio *bbio)
+/*
+ * Global cache of last unused pages for compression/decompression.
+ */
+static struct btrfs_compr_pool {
+	struct shrinker *shrinker;
+	spinlock_t lock;
+	struct list_head list;
+	int count;
+	int thresh;
+} compr_pool;
+
+static unsigned long btrfs_compr_pool_count(struct shrinker *sh, struct shrink_control *sc)
+{
+	int ret;
+
+	/*
+	 * We must not read the values more than once if 'ret' gets expanded in
+	 * the return statement so we don't accidentally return a negative
+	 * number, even if the first condition finds it positive.
+	 */
+	ret = READ_ONCE(compr_pool.count) - READ_ONCE(compr_pool.thresh);
+
+	return ret > 0 ? ret : 0;
+}
+
+static unsigned long btrfs_compr_pool_scan(struct shrinker *sh, struct shrink_control *sc)
+{
+	struct list_head remove;
+	struct list_head *tmp, *next;
+	int freed;
+
+	if (compr_pool.count == 0)
+		return SHRINK_STOP;
+
+	INIT_LIST_HEAD(&remove);
+
+	/* For now, just simply drain the whole list. */
+	spin_lock(&compr_pool.lock);
+	list_splice_init(&compr_pool.list, &remove);
+	freed = compr_pool.count;
+	compr_pool.count = 0;
+	spin_unlock(&compr_pool.lock);
+
+	list_for_each_safe(tmp, next, &remove) {
+		struct page *page = list_entry(tmp, struct page, lru);
+
+		ASSERT(page_ref_count(page) == 1);
+		put_page(page);
+	}
+
+	return freed;
+}
+
+/*
+ * Common wrappers for page allocation from compression wrappers
+ */
+struct page *btrfs_alloc_compr_page(void)
+{
+	struct page *page = NULL;
+
+	spin_lock(&compr_pool.lock);
+	if (compr_pool.count > 0) {
+		page = list_first_entry(&compr_pool.list, struct page, lru);
+		list_del_init(&page->lru);
+		compr_pool.count--;
+	}
+	spin_unlock(&compr_pool.lock);
+
+	if (page)
+		return page;
+
+	return alloc_page(GFP_NOFS);
+}
+
+void btrfs_free_compr_page(struct page *page)
+{
+	bool do_free = false;
+
+	spin_lock(&compr_pool.lock);
+	if (compr_pool.count > compr_pool.thresh) {
+		do_free = true;
+	} else {
+		list_add(&page->lru, &compr_pool.list);
+		compr_pool.count++;
+	}
+	spin_unlock(&compr_pool.lock);
+
+	if (!do_free)
+		return;
+
+	ASSERT(page_ref_count(page) == 1);
+	put_page(page);
+}
+
+static void end_bbio_comprssed_read(struct btrfs_bio *bbio)
 {
 	struct compressed_bio *cb = to_compressed_bio(bbio);
 	blk_status_t status = bbio->bio.bi_status;
@@ -211,8 +306,8 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb)
 		for (i = 0; i < ret; i++) {
 			struct folio *folio = fbatch.folios[i];

-			btrfs_page_clamp_clear_writeback(fs_info, &folio->page,
-							 cb->start, cb->len);
+			btrfs_folio_clamp_clear_writeback(fs_info, folio,
+							  cb->start, cb->len);
 		}
 		folio_batch_release(&fbatch);
 	}
@@ -242,7 +337,7 @@ static void btrfs_finish_compressed_write_work(struct work_struct *work)
  * This also calls the writeback end hooks for the file pages so that metadata
  * and checksums can be updated in the file.
  */
-static void end_compressed_bio_write(struct btrfs_bio *bbio)
+static void end_bbio_comprssed_write(struct btrfs_bio *bbio)
 {
 	struct compressed_bio *cb = to_compressed_bio(bbio);
 	struct btrfs_fs_info *fs_info = bbio->inode->root->fs_info;
@@ -289,7 +384,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered,

 	cb = alloc_compressed_bio(inode, ordered->file_offset,
 				  REQ_OP_WRITE | write_flags,
-				  end_compressed_bio_write);
+				  end_bbio_comprssed_write);
 	cb->start = ordered->file_offset;
 	cb->len = ordered->num_bytes;
 	cb->compressed_pages = compressed_pages;
@@ -446,7 +541,8 @@ static noinline int add_ra_bio_pages(struct inode *inode,
 		 * subpage::readers and to unlock the page.
		 */
 		if (fs_info->sectorsize < PAGE_SIZE)
-			btrfs_subpage_start_reader(fs_info, page, cur, add_size);
+			btrfs_subpage_start_reader(fs_info, page_folio(page),
+						   cur, add_size);
 		put_page(page);
 		cur += add_size;
 	}
@@ -489,11 +585,11 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
 		goto out;
 	}

-	ASSERT(em->compress_type != BTRFS_COMPRESS_NONE);
+	ASSERT(extent_map_is_compressed(em));
 	compressed_len = em->block_len;

 	cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ,
-				  end_compressed_bio_read);
+				  end_bbio_comprssed_read);

 	cb->start = em->orig_start;
 	em_len = em->len;
@@ -501,7 +597,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)

 	cb->len = bbio->bio.bi_iter.bi_size;
 	cb->compressed_len = compressed_len;
-	cb->compress_type = em->compress_type;
+	cb->compress_type = extent_map_compression(em);
 	cb->orig_bbio = bbio;

 	free_extent_map(em);
@@ -513,7 +609,7 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio)
 		goto out_free_bio;
 	}

-	ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
+	ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages, 0);
 	if (ret2) {
 		ret = BLK_STS_RESOURCE;
 		goto out_free_compressed_pages;
@@ -960,15 +1056,36 @@ int __init btrfs_init_compress(void)
 		    offsetof(struct compressed_bio, bbio.bio),
 		    BIOSET_NEED_BVECS))
 		return -ENOMEM;
+
+	compr_pool.shrinker = shrinker_alloc(SHRINKER_NONSLAB, "btrfs-compr-pages");
+	if (!compr_pool.shrinker)
+		return -ENOMEM;
+
 	btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
 	btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
 	btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
 	zstd_init_workspace_manager();
+
+	spin_lock_init(&compr_pool.lock);
+	INIT_LIST_HEAD(&compr_pool.list);
+	compr_pool.count = 0;
+	/* 128K / 4K = 32, for 8 threads is 256 pages. */
+	compr_pool.thresh = BTRFS_MAX_COMPRESSED / PAGE_SIZE * 8;
+	compr_pool.shrinker->count_objects = btrfs_compr_pool_count;
+	compr_pool.shrinker->scan_objects = btrfs_compr_pool_scan;
+	compr_pool.shrinker->batch = 32;
+	compr_pool.shrinker->seeks = DEFAULT_SEEKS;
+	shrinker_register(compr_pool.shrinker);
+
 	return 0;
 }

 void __cold btrfs_exit_compress(void)
 {
	/* For now scan drains all pages and does not touch the parameters. */
+	btrfs_compr_pool_scan(NULL, NULL);
+	shrinker_free(compr_pool.shrinker);
+
 	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_NONE);
 	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_ZLIB);
 	btrfs_cleanup_workspace_manager(BTRFS_COMPRESS_LZO);
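The new compr_pool avoids repeated page allocator round trips by caching up to a threshold of recently freed pages, while the registered shrinker lets the rest of the kernel reclaim the cache under memory pressure. A standalone sketch of the pool policy; the kernel version guards the list with a spinlock and drains it from the shrinker, both omitted here, and the demo_* names are illustrative:

#include <stdlib.h>

struct demo_pool {
	void *slots[256];
	int count;
	int thresh;	/* keep at most this many cached buffers */
};

static void *demo_pool_alloc(struct demo_pool *p, size_t size)
{
	if (p->count > 0)
		return p->slots[--p->count];	/* reuse a cached buffer */
	return malloc(size);			/* alloc_page(GFP_NOFS) in the kernel */
}

static void demo_pool_free(struct demo_pool *p, void *buf)
{
	if (p->count >= p->thresh) {
		free(buf);	/* over threshold: really release it */
		return;
	}
	p->slots[p->count++] = buf;
}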
diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h
index 03bb9d143fa7..93cc92974dee 100644
--- a/fs/btrfs/compression.h
+++ b/fs/btrfs/compression.h
@@ -32,6 +32,8 @@ static_assert((BTRFS_MAX_COMPRESSED % PAGE_SIZE) == 0);

 #define	BTRFS_ZLIB_DEFAULT_LEVEL		3

+struct page;
+
 struct compressed_bio {
 	/* Number of compressed pages in the array */
 	unsigned int nr_pages;
@@ -96,6 +98,9 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio);

 unsigned int btrfs_compress_str2level(unsigned int type, const char *str);

+struct page *btrfs_alloc_compr_page(void);
+void btrfs_free_compr_page(struct page *page);
+
 enum btrfs_compression_type {
 	BTRFS_COMPRESS_NONE  = 0,
 	BTRFS_COMPRESS_ZLIB  = 1,

diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 2a9344a3fcee..e65e012bac55 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -370,33 +370,41 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 /*
  * check if the tree block can be shared by multiple trees
  */
-int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
-			      struct extent_buffer *buf)
+bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct extent_buffer *buf)
 {
+	const u64 buf_gen = btrfs_header_generation(buf);
+
 	/*
 	 * Tree blocks not in shareable trees and tree roots are never shared.
 	 * If a block was allocated after the last snapshot and the block was
 	 * not allocated by tree relocation, we know the block is not shared.
 	 */
-	if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
-	    buf != root->node &&
-	    (btrfs_header_generation(buf) <=
-	     btrfs_root_last_snapshot(&root->root_item) ||
-	     btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))) {
-		if (buf != root->commit_root)
-			return 1;
-		/*
-		 * An extent buffer that used to be the commit root may still be
-		 * shared because the tree height may have increased and it
-		 * became a child of a higher level root. This can happen when
-		 * snapshotting a subvolume created in the current transaction.
-		 */
-		if (btrfs_header_generation(buf) == trans->transid)
-			return 1;
-	}
-	return 0;
+	if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
+		return false;
+
+	if (buf == root->node)
+		return false;
+
+	if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
+	    !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
+		return false;
+
+	if (buf != root->commit_root)
+		return true;
+
+	/*
+	 * An extent buffer that used to be the commit root may still be shared
+	 * because the tree height may have increased and it became a child of a
+	 * higher level root. This can happen when snapshotting a subvolume
+	 * created in the current transaction.
+	 */
+	if (buf_gen == trans->transid)
+		return true;
+
+	return false;
 }

 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
@@ -432,7 +440,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
 	if (btrfs_block_can_be_shared(trans, root, buf)) {
 		ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
 					       btrfs_header_level(buf), 1,
-					       &refs, &flags);
+					       &refs, &flags, NULL);
 		if (ret)
 			return ret;
 		if (unlikely(refs == 0)) {
@@ -812,7 +820,8 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
 	}

 	while (low < high) {
-		unsigned long oip;
+		const int unit_size = folio_size(eb->folios[0]);
+		unsigned long oil;
 		unsigned long offset;
 		struct btrfs_disk_key *tmp;
 		struct btrfs_disk_key unaligned;
@@ -820,14 +829,14 @@ int btrfs_bin_search(struct extent_buffer *eb, int first_slot,

 		mid = (low + high) / 2;
 		offset = p + mid * item_size;
-		oip = offset_in_page(offset);
+		oil = get_eb_offset_in_folio(eb, offset);

-		if (oip + key_size <= PAGE_SIZE) {
-			const unsigned long idx = get_eb_page_index(offset);
-			char *kaddr = page_address(eb->pages[idx]);
+		if (oil + key_size <= unit_size) {
+			const unsigned long idx = get_eb_folio_index(eb, offset);
+			char *kaddr = folio_address(eb->folios[idx]);

-			oip = get_eb_offset_in_page(eb, offset);
-			tmp = (struct btrfs_disk_key *)(kaddr + oip);
+			oil = get_eb_offset_in_folio(eb, offset);
+			tmp = (struct btrfs_disk_key *)(kaddr + oil);
 		} else {
 			read_extent_buffer(eb, &unaligned, offset, key_size);
 			tmp = &unaligned;

diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 196c005c31f6..70e828d33177 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -212,8 +212,6 @@ struct btrfs_root {

 	u64 last_trans;

-	u32 type;
-
 	u64 free_objectid;

 	struct btrfs_key defrag_progress;
@@ -224,18 +222,15 @@ struct btrfs_root {

 	struct list_head root_list;

-	spinlock_t log_extents_lock[2];
-	struct list_head logged_list[2];
-
 	spinlock_t inode_lock;
 	/* red-black tree that keeps track of in-memory inodes */
 	struct rb_root inode_tree;

 	/*
-	 * radix tree that keeps track of delayed nodes of every inode,
-	 * protected by inode_lock
+	 * Xarray that keeps track of delayed nodes of every inode, protected
+	 * by @inode_lock.
 	 */
-	struct radix_tree_root delayed_nodes_tree;
+	struct xarray delayed_nodes;
 	/*
 	 * right now this just gets used so that a root has its own devid
 	 * for stat. It may be used for more later
@@ -561,9 +556,9 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root,
 		    struct extent_buffer *buf,
 		    struct extent_buffer **cow_ret, u64 new_root_objectid);
-int btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
-			      struct btrfs_root *root,
-			      struct extent_buffer *buf);
+bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root,
+			       struct extent_buffer *buf);
 int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct btrfs_path *path, int level, int slot);
 void btrfs_extend_item(struct btrfs_trans_handle *trans,
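btrfs_bin_search() above keeps a fast path: when a fixed-size key fits inside one backing unit it is read in place, otherwise it is copied out to an aligned stack buffer first. A standalone sketch of that cross-unit read with demo types, not the kernel structures:

#include <assert.h>
#include <string.h>

struct rec { unsigned long long key; };

/* Read a record that may straddle two equally sized backing units. */
static struct rec read_rec(const unsigned char *units[], size_t unit_size,
			   size_t offset)
{
	struct rec out;
	size_t idx = offset / unit_size;
	size_t off = offset % unit_size;

	if (off + sizeof(out) <= unit_size) {
		memcpy(&out, units[idx] + off, sizeof(out));	/* fast path */
	} else {
		size_t part = unit_size - off;	/* bytes in the first unit */

		memcpy(&out, units[idx] + off, part);
		memcpy((unsigned char *)&out + part, units[idx + 1],
		       sizeof(out) - part);
	}
	return out;
}

int main(void)
{
	unsigned char a[16] = { 0 }, b[16] = { 0 };
	const unsigned char *units[] = { a, b };

	a[12] = 0xaa;	/* a record at offset 12 spans both units */
	assert(read_rec(units, sizeof(a), 12).key != 0);
	return 0;
}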
diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c
index 5244561e2016..c276b136ab63 100644
--- a/fs/btrfs/defrag.c
+++ b/fs/btrfs/defrag.c
@@ -775,7 +775,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
 	 * this em, as either we don't care about the generation, or the
 	 * merged extent map will be rejected anyway.
	 */
-	if (em && test_bit(EXTENT_FLAG_MERGED, &em->flags) &&
+	if (em && (em->flags & EXTENT_FLAG_MERGED) &&
 	    newer_than && em->generation >= newer_than) {
 		free_extent_map(em);
 		em = NULL;
@@ -802,7 +802,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
 static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info,
 				   const struct extent_map *em)
 {
-	if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags))
+	if (extent_map_is_compressed(em))
 		return BTRFS_MAX_COMPRESSED;
 	return fs_info->max_extent_size;
 }
@@ -828,7 +828,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
 	/* No more em or hole */
 	if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
 		goto out;
-	if (test_bit(EXTENT_FLAG_PREALLOC, &next->flags))
+	if (next->flags & EXTENT_FLAG_PREALLOC)
 		goto out;
 	/*
 	 * If the next extent is at its max capacity, defragging current extent
@@ -996,10 +996,9 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
 		    em->len <= inode->root->fs_info->max_inline)
 			goto next;

-		/* Skip hole/delalloc/preallocated extents */
+		/* Skip holes and preallocated extents. */
 		if (em->block_start == EXTENT_MAP_HOLE ||
-		    em->block_start == EXTENT_MAP_DELALLOC ||
-		    test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
+		    (em->flags & EXTENT_FLAG_PREALLOC))
 			goto next;

 		/* Skip older extent */
@@ -1190,7 +1189,7 @@ static int defrag_one_locked_target(struct btrfs_inode *inode,
 	/* Update the page status */
 	for (i = start_index - first_index; i <= last_index - first_index; i++) {
 		ClearPageChecked(pages[i]);
-		btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
+		btrfs_folio_clamp_set_dirty(fs_info, page_folio(pages[i]), start, len);
 	}
 	btrfs_delalloc_release_extents(inode, len);
 	extent_changeset_free(data_reserved);

diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c
index 51453d4928fa..2833e8ef4c09 100644
--- a/fs/btrfs/delalloc-space.c
+++ b/fs/btrfs/delalloc-space.c
@@ -199,7 +199,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
 	start = round_down(start, fs_info->sectorsize);

 	btrfs_free_reserved_data_space_noquota(fs_info, len);
-	btrfs_qgroup_free_data(inode, reserved, start, len);
+	btrfs_qgroup_free_data(inode, reserved, start, len, NULL);
 }

 /*

diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index 7381241334e8..08102883f560 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -71,7 +71,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 	}

 	spin_lock(&root->inode_lock);
-	node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
+	node = xa_load(&root->delayed_nodes, ino);

 	if (node) {
 		if (btrfs_inode->delayed_node) {
@@ -83,9 +83,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(

 		/*
 		 * It's possible that we're racing into the middle of removing
-		 * this node from the radix tree.  In this case, the refcount
+		 * this node from the xarray.  In this case, the refcount
 		 * was zero and it should never go back to one.  Just return
-		 * NULL like it was never in the radix at all; our release
+		 * NULL like it was never in the xarray at all; our release
 		 * function is in the process of removing it.
 		 *
 		 * Some implementations of refcount_inc refuse to bump the
@@ -93,7 +93,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
 		 * here, refcount_inc() may decide to just WARN_ONCE() instead
 		 * of actually bumping the refcount.
		 *
-		 * If this node is properly in the radix, we want to bump the
+		 * If this node is properly in the xarray, we want to bump the
 		 * refcount twice, once for the inode and once for this get
 		 * operation.
 		 */
@@ -120,6 +120,7 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
 	struct btrfs_root *root = btrfs_inode->root;
 	u64 ino = btrfs_ino(btrfs_inode);
 	int ret;
+	void *ptr;

 again:
 	node = btrfs_get_delayed_node(btrfs_inode);
@@ -131,26 +132,30 @@ again:
 		return ERR_PTR(-ENOMEM);
 	btrfs_init_delayed_node(node, root, ino);

-	/* cached in the btrfs inode and can be accessed */
+	/* Cached in the inode and can be accessed. */
 	refcount_set(&node->refs, 2);

-	ret = radix_tree_preload(GFP_NOFS);
-	if (ret) {
+	/* Allocate and reserve the slot, from now it can return a NULL from xa_load(). */
+	ret = xa_reserve(&root->delayed_nodes, ino, GFP_NOFS);
+	if (ret == -ENOMEM) {
 		kmem_cache_free(delayed_node_cache, node);
-		return ERR_PTR(ret);
+		return ERR_PTR(-ENOMEM);
 	}
-
 	spin_lock(&root->inode_lock);
-	ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
-	if (ret == -EEXIST) {
+	ptr = xa_load(&root->delayed_nodes, ino);
+	if (ptr) {
+		/* Somebody inserted it, go back and read it. */
 		spin_unlock(&root->inode_lock);
 		kmem_cache_free(delayed_node_cache, node);
-		radix_tree_preload_end();
+		node = NULL;
 		goto again;
 	}
+	ptr = xa_store(&root->delayed_nodes, ino, node, GFP_ATOMIC);
+	ASSERT(xa_err(ptr) != -EINVAL);
+	ASSERT(xa_err(ptr) != -ENOMEM);
+	ASSERT(ptr == NULL);
 	btrfs_inode->delayed_node = node;
 	spin_unlock(&root->inode_lock);
-	radix_tree_preload_end();

 	return node;
 }
@@ -269,8 +274,7 @@ static void __btrfs_release_delayed_node(
 		 * back up.  We can delete it now.
 		 */
 		ASSERT(refcount_read(&delayed_node->refs) == 0);
-		radix_tree_delete(&root->delayed_nodes_tree,
-				  delayed_node->inode_id);
+		xa_erase(&root->delayed_nodes, delayed_node->inode_id);
 		spin_unlock(&root->inode_lock);
 		kmem_cache_free(delayed_node_cache, delayed_node);
 	}
@@ -1036,14 +1040,33 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
 	if (!test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags))
 		goto out;

-	path->slots[0]++;
-	if (path->slots[0] >= btrfs_header_nritems(leaf))
-		goto search;
-again:
+	/*
+	 * Now we're going to delete the INODE_REF/EXTREF, which should be the
+	 * only one ref left.  Check if the next item is an INODE_REF/EXTREF.
+	 *
+	 * But if we're the last item already, release and search for the last
+	 * INODE_REF/EXTREF.
	 */
+	if (path->slots[0] + 1 >= btrfs_header_nritems(leaf)) {
+		key.objectid = node->inode_id;
+		key.type = BTRFS_INODE_EXTREF_KEY;
+		key.offset = (u64)-1;
+
+		btrfs_release_path(path);
+		ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+		if (ret < 0)
+			goto err_out;
+		ASSERT(ret > 0);
+		ASSERT(path->slots[0] > 0);
+		ret = 0;
+		path->slots[0]--;
+		leaf = path->nodes[0];
+	} else {
+		path->slots[0]++;
+	}
 	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 	if (key.objectid != node->inode_id)
 		goto out;
-
 	if (key.type != BTRFS_INODE_REF_KEY &&
 	    key.type != BTRFS_INODE_EXTREF_KEY)
 		goto out;
@@ -1070,22 +1093,6 @@ err_out:
 	btrfs_abort_transaction(trans, ret);

 	return ret;
-
-search:
-	btrfs_release_path(path);
-
-	key.type = BTRFS_INODE_EXTREF_KEY;
-	key.offset = -1;
-
-	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
-	if (ret < 0)
-		goto err_out;
-	ASSERT(ret);
-
-	ret = 0;
-	leaf = path->nodes[0];
-	path->slots[0]--;
-	goto again;
 }

 static inline int btrfs_update_delayed_inode(struct btrfs_trans_handle *trans,
@@ -2035,34 +2042,36 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)

 void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
 {
-	u64 inode_id = 0;
+	unsigned long index = 0;
 	struct btrfs_delayed_node *delayed_nodes[8];
-	int i, n;

 	while (1) {
+		struct btrfs_delayed_node *node;
+		int count;
+
 		spin_lock(&root->inode_lock);
-		n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
-					   (void **)delayed_nodes, inode_id,
-					   ARRAY_SIZE(delayed_nodes));
-		if (!n) {
+		if (xa_empty(&root->delayed_nodes)) {
 			spin_unlock(&root->inode_lock);
-			break;
+			return;
 		}

-		inode_id = delayed_nodes[n - 1]->inode_id + 1;
-		for (i = 0; i < n; i++) {
+		count = 0;
+		xa_for_each_start(&root->delayed_nodes, index, node, index) {
 			/*
 			 * Don't increase refs in case the node is dead and
 			 * about to be removed from the tree in the loop below
 			 */
-			if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
-				delayed_nodes[i] = NULL;
+			if (refcount_inc_not_zero(&node->refs)) {
+				delayed_nodes[count] = node;
+				count++;
+			}
+			if (count >= ARRAY_SIZE(delayed_nodes))
+				break;
 		}
 		spin_unlock(&root->inode_lock);
+		index++;

-		for (i = 0; i < n; i++) {
-			if (!delayed_nodes[i])
-				continue;
+		for (int i = 0; i < count; i++) {
 			__btrfs_kill_delayed_node(delayed_nodes[i]);
 			btrfs_release_delayed_node(delayed_nodes[i]);
 		}
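The delayed-node conversion above swaps radix_tree_preload() for the xarray's reserve-then-store idiom: reserve the slot with a sleeping GFP mask outside the lock, then fill it under the lock with GFP_ATOMIC, which cannot fail for a reserved slot. A kernel-style sketch of that idiom; demo_insert() is illustrative, and unlike the patch, which retries the whole lookup, this sketch simply returns -EEXIST:

/* Kernel-style fragment, not a standalone program. */
static int demo_insert(struct xarray *xa, unsigned long index, void *item,
		       spinlock_t *lock)
{
	void *old;
	int ret;

	/* May sleep; after this, xa_load() can legitimately return NULL. */
	ret = xa_reserve(xa, index, GFP_NOFS);
	if (ret)
		return ret;

	spin_lock(lock);
	old = xa_load(xa, index);
	if (old) {
		spin_unlock(lock);
		xa_release(xa, index);	/* someone beat us to the slot */
		return -EEXIST;
	}
	/* Cannot fail with -ENOMEM: the slot was reserved above. */
	old = xa_store(xa, index, item, GFP_ATOMIC);
	spin_unlock(lock);
	return xa_err(old);
}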
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 9223934d95f4..891ea2fa263c 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -1041,7 +1041,7 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	}

-	if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) {
+	if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
 		record = kzalloc(sizeof(*record), GFP_NOFS);
 		if (!record) {
 			kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
@@ -1144,7 +1144,7 @@ int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
 		return -ENOMEM;
 	}

-	if (btrfs_qgroup_enabled(fs_info) && !generic_ref->skip_qgroup) {
+	if (btrfs_qgroup_full_accounting(fs_info) && !generic_ref->skip_qgroup) {
 		record = kzalloc(sizeof(*record), GFP_NOFS);
 		if (!record) {
 			kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index f9544fda38e9..1502d664c892 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -550,8 +550,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
 				      u64 physical)
 {
 	struct btrfs_fs_info *fs_info = cache->fs_info;
-	struct extent_map *em;
-	struct map_lookup *map;
+	struct btrfs_chunk_map *map;
 	u64 chunk_offset = cache->start;
 	int num_extents, cur_extent;
 	int i;
@@ -567,9 +566,8 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
 	}
 	spin_unlock(&cache->lock);

-	em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
-	ASSERT(!IS_ERR(em));
-	map = em->map_lookup;
+	map = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
+	ASSERT(!IS_ERR(map));

 	num_extents = 0;
 	cur_extent = 0;
@@ -583,7 +581,7 @@ bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
 			cur_extent = i;
 	}

-	free_extent_map(em);
+	btrfs_free_chunk_map(map);

 	if (num_extents > 1 && cur_extent < num_extents - 1) {
 		/*
@@ -812,25 +810,23 @@ static void btrfs_dev_replace_update_device_in_mapping_tree(
 						struct btrfs_device *srcdev,
 						struct btrfs_device *tgtdev)
 {
-	struct extent_map_tree *em_tree = &fs_info->mapping_tree;
-	struct extent_map *em;
-	struct map_lookup *map;
 	u64 start = 0;
 	int i;

-	write_lock(&em_tree->lock);
+	write_lock(&fs_info->mapping_tree_lock);
 	do {
-		em = lookup_extent_mapping(em_tree, start, (u64)-1);
-		if (!em)
+		struct btrfs_chunk_map *map;
+
+		map = btrfs_find_chunk_map_nolock(fs_info, start, U64_MAX);
+		if (!map)
 			break;
-		map = em->map_lookup;
 		for (i = 0; i < map->num_stripes; i++)
 			if (srcdev == map->stripes[i].dev)
 				map->stripes[i].dev = tgtdev;
-		start = em->start + em->len;
-		free_extent_map(em);
+		start = map->start + map->chunk_len;
+		btrfs_free_chunk_map(map);
 	} while (start);
-	write_unlock(&em_tree->lock);
+	write_unlock(&fs_info->mapping_tree_lock);
 }

 static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,

diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 401ea09ae4b8..c6907d533fe8 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -74,20 +74,37 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
 static void csum_tree_block(struct extent_buffer *buf, u8 *result)
 {
 	struct btrfs_fs_info *fs_info = buf->fs_info;
-	const int num_pages = num_extent_pages(buf);
-	const int first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+	int num_pages;
+	u32 first_page_part;
 	SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
 	char *kaddr;
 	int i;

 	shash->tfm = fs_info->csum_shash;
 	crypto_shash_init(shash);
-	kaddr = page_address(buf->pages[0]) + offset_in_page(buf->start);
+
+	if (buf->addr) {
+		/* Pages are contiguous, handle them as a big one. */
+		kaddr = buf->addr;
+		first_page_part = fs_info->nodesize;
+		num_pages = 1;
+	} else {
+		kaddr = folio_address(buf->folios[0]);
+		first_page_part = min_t(u32, PAGE_SIZE, fs_info->nodesize);
+		num_pages = num_extent_pages(buf);
+	}
+
 	crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
 			    first_page_part - BTRFS_CSUM_SIZE);

+	/*
+	 * Multiple single-page folios case would reach here.
+	 *
+	 * nodesize <= PAGE_SIZE and large folio all handled by above
+	 * crypto_shash_update() already.
	 */
 	for (i = 1; i < num_pages && INLINE_EXTENT_BUFFER_PAGES > 1; i++) {
-		kaddr = page_address(buf->pages[i]);
+		kaddr = folio_address(buf->folios[i]);
 		crypto_shash_update(shash, kaddr, PAGE_SIZE);
 	}
 	memset(result, 0, BTRFS_CSUM_SIZE);
@@ -166,20 +183,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
 				      int mirror_num)
 {
 	struct btrfs_fs_info *fs_info = eb->fs_info;
-	int i, num_pages = num_extent_pages(eb);
+	int num_folios = num_extent_folios(eb);
 	int ret = 0;

 	if (sb_rdonly(fs_info->sb))
 		return -EROFS;

-	for (i = 0; i < num_pages; i++) {
-		struct page *p = eb->pages[i];
-		u64 start = max_t(u64, eb->start, page_offset(p));
-		u64 end = min_t(u64, eb->start + eb->len, page_offset(p) + PAGE_SIZE);
+	for (int i = 0; i < num_folios; i++) {
+		struct folio *folio = eb->folios[i];
+		u64 start = max_t(u64, eb->start, folio_pos(folio));
+		u64 end = min_t(u64, eb->start + eb->len,
+				folio_pos(folio) + folio_size(folio));
 		u32 len = end - start;

 		ret = btrfs_repair_io_failure(fs_info, 0, start, len,
-					      start, p, offset_in_page(start), mirror_num);
+					      start, folio, offset_in_folio(folio, start),
+					      mirror_num);
 		if (ret)
 			break;
 	}
@@ -254,15 +273,20 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio)
 	if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len))
 		return BLK_STS_IOERR;

-	if (test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)) {
-		WARN_ON_ONCE(found_start != 0);
+	/*
+	 * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't
+	 * checksum it but zero-out its content. This is done to preserve
+	 * ordering of I/O without unnecessarily writing out data.
+	 */
+	if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) {
+		memzero_extent_buffer(eb, 0, eb->len);
 		return BLK_STS_OK;
 	}

 	if (WARN_ON_ONCE(found_start != eb->start))
 		return BLK_STS_IOERR;
-	if (WARN_ON(!btrfs_page_test_uptodate(fs_info, eb->pages[0], eb->start,
-					      eb->len)))
+	if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0],
+					       eb->start, eb->len)))
 		return BLK_STS_IOERR;

 	ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
@@ -371,8 +395,8 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb,
 	}

 	csum_tree_block(eb, result);
-	header_csum = page_address(eb->pages[0]) +
-		      get_eb_offset_in_page(eb, offsetof(struct btrfs_header, csum));
+	header_csum = folio_address(eb->folios[0]) +
+		      get_eb_offset_in_folio(eb, offsetof(struct btrfs_header, csum));

 	if (memcmp(result, header_csum, csum_size) != 0) {
 		btrfs_warn_rl(fs_info,
@@ -639,7 +663,8 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	root->nr_delalloc_inodes = 0;
 	root->nr_ordered_extents = 0;
 	root->inode_tree = RB_ROOT;
-	INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
+	/* GFP flags are compatible with XA_FLAGS_*. */
+	xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);

 	btrfs_init_root_block_rsv(root);

@@ -650,14 +675,10 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
 	INIT_LIST_HEAD(&root->ordered_extents);
 	INIT_LIST_HEAD(&root->ordered_root);
 	INIT_LIST_HEAD(&root->reloc_dirty_list);
-	INIT_LIST_HEAD(&root->logged_list[0]);
-	INIT_LIST_HEAD(&root->logged_list[1]);
 	spin_lock_init(&root->inode_lock);
 	spin_lock_init(&root->delalloc_lock);
 	spin_lock_init(&root->ordered_extent_lock);
 	spin_lock_init(&root->accounting_lock);
-	spin_lock_init(&root->log_extents_lock[0]);
-	spin_lock_init(&root->log_extents_lock[1]);
 	spin_lock_init(&root->qgroup_meta_rsv_lock);
 	mutex_init(&root->objectid_mutex);
 	mutex_init(&root->log_mutex);
@@ -2618,9 +2639,6 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
 		 */
 		btrfs_set_super_log_root(sb, 0);

-		/* We can't trust the free space cache either */
-		btrfs_set_opt(fs_info->mount_opt, CLEAR_CACHE);
-
 		btrfs_warn(fs_info, "try to load backup roots slot %d", i);
 		ret = read_backup_root(fs_info, i);
 		backup_index = ret;
@@ -2724,7 +2742,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	INIT_LIST_HEAD(&fs_info->allocated_ebs);
 	spin_lock_init(&fs_info->eb_leak_lock);
 #endif
-	extent_map_tree_init(&fs_info->mapping_tree);
+	fs_info->mapping_tree = RB_ROOT_CACHED;
+	rwlock_init(&fs_info->mapping_tree_lock);
 	btrfs_init_block_rsv(&fs_info->global_block_rsv,
 			     BTRFS_BLOCK_RSV_GLOBAL);
 	btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS);
@@ -2794,6 +2813,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
 	fs_info->sectorsize_bits = ilog2(4096);
 	fs_info->stripesize = 4096;

+	/* Default compress algorithm when user does -o compress */
+	fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
+
 	fs_info->max_extent_size = BTRFS_MAX_EXTENT_SIZE;

 	spin_lock_init(&fs_info->swapfile_pins_lock);
@@ -2931,17 +2953,6 @@ out:
 }

 /*
- * Some options only have meaning at mount time and shouldn't persist across
- * remounts, or be displayed. Clear these at the end of mount and remount
- * code paths.
- */
-void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info)
-{
-	btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT);
-	btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE);
-}
-
-/*
  * Mounting logic specific to read-write file systems. Shared by open_ctree
  * and btrfs_remount when remounting from read-only to read-write.
 */
@@ -2953,7 +2964,11 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)

 	if (btrfs_test_opt(fs_info, CLEAR_CACHE) &&
 	    btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) {
-		rebuild_free_space_tree = true;
+		if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2))
+			btrfs_warn(fs_info,
+				   "'clear_cache' option is ignored with extent tree v2");
+		else
+			rebuild_free_space_tree = true;
 	} else if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) &&
 		   !btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE_VALID)) {
 		btrfs_warn(fs_info, "free space tree is invalid");
@@ -3213,6 +3228,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 		goto fail_alloc;
 	}

+	btrfs_info(fs_info, "first mount of filesystem %pU", disk_super->fsid);
 	/*
 	 * Verify the type first, if that or the checksum value are
 	 * corrupted, we'll find out
@@ -3275,13 +3291,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_ERROR)
 		WRITE_ONCE(fs_info->fs_error, -EUCLEAN);

-	/*
-	 * In the long term, we'll store the compression type in the super
-	 * block, and it'll be used for per file compression control.
-	 */
-	fs_info->compress_type = BTRFS_COMPRESS_ZLIB;
-
-	/* Set up fs_info before parsing mount options */
 	nodesize = btrfs_super_nodesize(disk_super);
 	sectorsize = btrfs_super_sectorsize(disk_super);
@@ -3295,28 +3304,30 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size;
 	fs_info->stripesize = stripesize;

-	ret = btrfs_parse_options(fs_info, options, sb->s_flags);
-	if (ret)
+	/*
+	 * Handle the space caching options appropriately now that we have the
+	 * super block loaded and validated.
+	 */
+	btrfs_set_free_space_cache_settings(fs_info);
+
+	if (!btrfs_check_options(fs_info, &fs_info->mount_opt, sb->s_flags)) {
+		ret = -EINVAL;
 		goto fail_alloc;
+	}

 	ret = btrfs_check_features(fs_info, !sb_rdonly(sb));
 	if (ret < 0)
 		goto fail_alloc;

+	/*
+	 * At this point our mount options are validated, if we set ->max_inline
+	 * to something non-standard make sure we truncate it to sectorsize.
+	 */
+	fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize);
+
 	if (sectorsize < PAGE_SIZE) {
 		struct btrfs_subpage_info *subpage_info;

-		/*
-		 * V1 space cache has some hardcoded PAGE_SIZE usage, and is
-		 * going to be deprecated.
-		 *
-		 * Force to use v2 cache for subpage case.
-		 */
-		btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE);
-		btrfs_set_and_info(fs_info, FREE_SPACE_TREE,
-			"forcing free space tree for sector size %u with page size %lu",
-			sectorsize, PAGE_SIZE);
-
 		btrfs_warn(fs_info,
 		"read-write for sector size %u with page size %lu is experimental",
 			   sectorsize, PAGE_SIZE);
@@ -3493,29 +3504,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 		goto fail_cleaner;
 	}

-	if (!btrfs_test_opt(fs_info, NOSSD) &&
-	    !fs_info->fs_devices->rotating) {
-		btrfs_set_and_info(fs_info, SSD, "enabling ssd optimizations");
-	}
-
-	/*
-	 * For devices supporting discard turn on discard=async automatically,
-	 * unless it's already set or disabled. This could be turned off by
-	 * nodiscard for the same mount.
-	 *
-	 * The zoned mode piggy backs on the discard functionality for
-	 * resetting a zone. There is no reason to delay the zone reset as it is
-	 * fast enough. So, do not enable async discard for zoned mode.
-	 */
-	if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
-	      btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
-	      btrfs_test_opt(fs_info, NODISCARD)) &&
-	    fs_info->fs_devices->discardable &&
-	    !btrfs_is_zoned(fs_info)) {
-		btrfs_set_and_info(fs_info, DISCARD_ASYNC,
-				   "auto enabling async discard");
-	}
-
 	ret = btrfs_read_qgroup_config(fs_info);
 	if (ret)
 		goto fail_trans_kthread;
@@ -3541,7 +3529,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	}

 	if (sb_rdonly(sb))
-		goto clear_oneshot;
+		return 0;

 	ret = btrfs_start_pre_rw_mount(fs_info);
 	if (ret) {
@@ -3569,8 +3557,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
 	if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags))
 		wake_up_process(fs_info->cleaner_kthread);

-clear_oneshot:
-	btrfs_clear_oneshot_options(fs_info);
 	return 0;

 fail_qgroup:
@@ -3607,7 +3593,7 @@ fail_sb_buffer:
 	btrfs_stop_all_workers(fs_info);
 	btrfs_free_block_groups(fs_info);
 fail_alloc:
-	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+	btrfs_mapping_tree_free(fs_info);

 	iput(fs_info->btree_inode);
 fail:
@@ -4390,7 +4376,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)

 	iput(fs_info->btree_inode);

-	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+	btrfs_mapping_tree_free(fs_info);
 	btrfs_close_devices(fs_info->fs_devices);
 }

@@ -4798,6 +4784,32 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans,
 	}
 }

+static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info)
+{
+	struct btrfs_root *gang[8];
+	int i;
+	int ret;
+
+	spin_lock(&fs_info->fs_roots_radix_lock);
+	while (1) {
+		ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
+						 (void **)gang, 0,
+						 ARRAY_SIZE(gang),
+						 BTRFS_ROOT_TRANS_TAG);
+		if (ret == 0)
+			break;
+		for (i = 0; i < ret; i++) {
+			struct btrfs_root *root = gang[i];
+
+			btrfs_qgroup_free_meta_all_pertrans(root);
+			radix_tree_tag_clear(&fs_info->fs_roots_radix,
+					(unsigned long)root->root_key.objectid,
+					BTRFS_ROOT_TRANS_TAG);
+		}
+	}
+	spin_unlock(&fs_info->fs_roots_radix_lock);
+}
+
 void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 				   struct btrfs_fs_info *fs_info)
 {
@@ -4826,6 +4838,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans,
 				     EXTENT_DIRTY);
 	btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents);

+	btrfs_free_all_qgroup_pertrans(fs_info);
+
 	cur_trans->state =TRANS_STATE_COMPLETED;
 	wake_up(&cur_trans->commit_wait);
 }
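The csum_tree_block() change above introduces a fast path: when the extent buffer is backed by a virtually contiguous mapping (buf->addr set, the large folio case), one hash update covers the whole node, otherwise the code walks the backing units. A standalone sketch of that split, with a toy hash standing in for crypto_shash_update() and all demo_* names illustrative:

#include <stddef.h>

/* Toy stand-in for crypto_shash_update(): fold data into a 4-byte state. */
static void demo_hash_update(unsigned char csum[4], const unsigned char *data,
			     size_t len)
{
	for (size_t i = 0; i < len; i++)
		csum[i % 4] ^= data[i];
}

struct demo_buf {
	unsigned char *addr;		/* non-NULL when virtually contiguous */
	unsigned char *units[16];	/* otherwise, per-unit addresses */
	size_t unit_size;
	size_t len;
};

static void demo_checksum(const struct demo_buf *buf, unsigned char csum[4])
{
	if (buf->addr) {
		/* Contiguous mapping: one update covers everything. */
		demo_hash_update(csum, buf->addr, buf->len);
		return;
	}
	/* Multiple single-unit folios: walk them one by one. */
	for (size_t i = 0; i < buf->len / buf->unit_size; i++)
		demo_hash_update(csum, buf->units[i], buf->unit_size);
}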
- */ - if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || - btrfs_test_opt(fs_info, DISCARD_ASYNC) || - btrfs_test_opt(fs_info, NODISCARD)) && - fs_info->fs_devices->discardable && - !btrfs_is_zoned(fs_info)) { - btrfs_set_and_info(fs_info, DISCARD_ASYNC, - "auto enabling async discard"); - } - ret = btrfs_read_qgroup_config(fs_info); if (ret) goto fail_trans_kthread; @@ -3541,7 +3529,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } if (sb_rdonly(sb)) - goto clear_oneshot; + return 0; ret = btrfs_start_pre_rw_mount(fs_info); if (ret) { @@ -3569,8 +3557,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (test_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags)) wake_up_process(fs_info->cleaner_kthread); -clear_oneshot: - btrfs_clear_oneshot_options(fs_info); return 0; fail_qgroup: @@ -3607,7 +3593,7 @@ fail_sb_buffer: btrfs_stop_all_workers(fs_info); btrfs_free_block_groups(fs_info); fail_alloc: - btrfs_mapping_tree_free(&fs_info->mapping_tree); + btrfs_mapping_tree_free(fs_info); iput(fs_info->btree_inode); fail: @@ -4390,7 +4376,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) iput(fs_info->btree_inode); - btrfs_mapping_tree_free(&fs_info->mapping_tree); + btrfs_mapping_tree_free(fs_info); btrfs_close_devices(fs_info->fs_devices); } @@ -4798,6 +4784,32 @@ void btrfs_cleanup_dirty_bgs(struct btrfs_transaction *cur_trans, } } +static void btrfs_free_all_qgroup_pertrans(struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *gang[8]; + int i; + int ret; + + spin_lock(&fs_info->fs_roots_radix_lock); + while (1) { + ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + struct btrfs_root *root = gang[i]; + + btrfs_qgroup_free_meta_all_pertrans(root); + radix_tree_tag_clear(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + } + } + spin_unlock(&fs_info->fs_roots_radix_lock); +} + void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, struct btrfs_fs_info *fs_info) { @@ -4826,6 +4838,8 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *cur_trans, EXTENT_DIRTY); btrfs_destroy_pinned_extent(fs_info, &cur_trans->pinned_extents); + btrfs_free_all_qgroup_pertrans(fs_info); + cur_trans->state = TRANS_STATE_COMPLETED; wake_up(&cur_trans->commit_wait); } diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 50dab8f639dc..9413726b329b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -37,9 +37,6 @@ struct extent_buffer *btrfs_find_create_tree_block( struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, int level); -void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, - struct extent_buffer *buf); -void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info); int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info); int btrfs_check_super_csum(struct btrfs_fs_info *fs_info, const struct btrfs_super_block *disk_sb); diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index ea149be28dff..e3ee5449cc4a 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -58,12 +58,13 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - struct btrfs_inode *inode = tree->inode; + const struct btrfs_inode *inode; u64 isize; - if (!inode) + if (tree->owner != IO_TREE_INODE_IO) return; + inode = 
extent_io_tree_to_inode_const(tree); isize = i_size_read(&inode->vfs_inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { btrfs_debug_rl(inode->root->fs_info, @@ -78,31 +79,46 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif + /* - * For the file_extent_tree, we want to hold the inode lock when we lookup and - * update the disk_i_size, but lockdep will complain because our io_tree we hold - * the tree lock and get the inode lock when setting delalloc. These two things - * are unrelated, so make a class for the file_extent_tree so we don't get the - * two locking patterns mixed up. + * The only tree allowed to set the inode is IO_TREE_INODE_IO. */ -static struct lock_class_key file_extent_tree_class; +static bool is_inode_io_tree(const struct extent_io_tree *tree) +{ + return tree->owner == IO_TREE_INODE_IO; +} + +/* Return the inode if it's valid for the given tree, otherwise NULL. */ +struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree) +{ + if (tree->owner == IO_TREE_INODE_IO) + return tree->inode; + return NULL; +} + +/* Read-only access to the inode. */ +const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree) +{ + if (tree->owner == IO_TREE_INODE_IO) + return tree->inode; + return NULL; +} -struct tree_entry { - u64 start; - u64 end; - struct rb_node rb_node; -}; +/* For read-only access to fs_info. */ +const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree) +{ + if (tree->owner == IO_TREE_INODE_IO) + return tree->inode->root->fs_info; + return tree->fs_info; +} void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner) { - tree->fs_info = fs_info; tree->state = RB_ROOT; spin_lock_init(&tree->lock); - tree->inode = NULL; + tree->fs_info = fs_info; tree->owner = owner; - if (owner == IO_TREE_INODE_FILE_EXTENT) - lockdep_set_class(&tree->lock, &file_extent_tree_class); } /* @@ -329,10 +345,14 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 return tree_search_for_insert(tree, offset, NULL, NULL); } -static void extent_io_tree_panic(struct extent_io_tree *tree, int err) +static void extent_io_tree_panic(const struct extent_io_tree *tree, + const struct extent_state *state, + const char *opname, + int err) { - btrfs_panic(tree->fs_info, err, - "locking error: extent tree was modified by another thread while locked"); + btrfs_panic(extent_io_tree_to_fs_info(tree), err, + "extent io tree error on %s state start %llu end %llu", + opname, state->start, state->end); } static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *state) @@ -341,8 +361,9 @@ static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *s prev = prev_state(state); if (prev && prev->end == state->start - 1 && prev->state == state->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, state, prev); + if (is_inode_io_tree(tree)) + btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree), + state, prev); state->start = prev->start; rb_erase(&prev->rb_node, &tree->state); RB_CLEAR_NODE(&prev->rb_node); @@ -356,8 +377,9 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s next = next_state(state); if (next && next->start == state->end + 1 && next->state == state->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, state, next); + if 
(is_inode_io_tree(tree)) + btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree), + state, next); state->end = next->end; rb_erase(&next->rb_node, &tree->state); RB_CLEAR_NODE(&next->rb_node); @@ -390,8 +412,8 @@ static void set_state_bits(struct extent_io_tree *tree, u32 bits_to_set = bits & ~EXTENT_CTLBITS; int ret; - if (tree->inode) - btrfs_set_delalloc_extent(tree->inode, state, bits); + if (is_inode_io_tree(tree)) + btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits); ret = add_extent_changeset(state, bits_to_set, changeset, 1); BUG_ON(ret < 0); @@ -436,9 +458,10 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, if (state->end < entry->start) { if (try_merge && end == entry->start && state->state == entry->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, - state, entry); + if (is_inode_io_tree(tree)) + btrfs_merge_delalloc_extent( + extent_io_tree_to_inode(tree), + state, entry); entry->start = state->start; merge_prev_state(tree, entry); state->state = 0; @@ -448,9 +471,10 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, } else if (state->end > entry->end) { if (try_merge && entry->end == start && state->state == entry->state) { - if (tree->inode) - btrfs_merge_delalloc_extent(tree->inode, - state, entry); + if (is_inode_io_tree(tree)) + btrfs_merge_delalloc_extent( + extent_io_tree_to_inode(tree), + state, entry); entry->end = state->end; merge_next_state(tree, entry); state->state = 0; @@ -458,9 +482,6 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, } node = &(*node)->rb_right; } else { - btrfs_err(tree->fs_info, - "found node %llu %llu on insert of %llu %llu", - entry->start, entry->end, state->start, state->end); return ERR_PTR(-EEXIST); } } @@ -505,8 +526,9 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct rb_node *parent = NULL; struct rb_node **node; - if (tree->inode) - btrfs_split_delalloc_extent(tree->inode, orig, split); + if (is_inode_io_tree(tree)) + btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig, + split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -553,8 +575,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, u32 bits_to_clear = bits & ~EXTENT_CTLBITS; int ret; - if (tree->inode) - btrfs_clear_delalloc_extent(tree->inode, state, bits); + if (is_inode_io_tree(tree)) + btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state, + bits); ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); @@ -695,7 +718,7 @@ hit_next: goto search_again; err = split_state(tree, state, prealloc, start); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); prealloc = NULL; if (err) @@ -717,7 +740,7 @@ hit_next: goto search_again; err = split_state(tree, state, prealloc, end + 1); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); if (wake) wake_up(&state->wq); @@ -939,6 +962,8 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, struct extent_state *state; int ret = 1; + ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES)); + spin_lock(&tree->lock); state = find_first_extent_bit_state(tree, start, bits); if (state) { @@ -1152,7 +1177,7 @@ hit_next: goto search_again; err = split_state(tree, state, prealloc, start); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); prealloc = 
NULL; if (err) @@ -1200,7 +1225,7 @@ hit_next: inserted_state = insert_state(tree, prealloc, bits, changeset); if (IS_ERR(inserted_state)) { err = PTR_ERR(inserted_state); - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, prealloc, "insert", err); } cache_state(inserted_state, cached_state); @@ -1228,7 +1253,7 @@ hit_next: goto search_again; err = split_state(tree, state, prealloc, end + 1); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); set_state_bits(tree, prealloc, bits, changeset); cache_state(prealloc, cached_state); @@ -1382,7 +1407,7 @@ hit_next: } err = split_state(tree, state, prealloc, start); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); prealloc = NULL; if (err) goto out; @@ -1430,7 +1455,7 @@ hit_next: inserted_state = insert_state(tree, prealloc, bits, NULL); if (IS_ERR(inserted_state)) { err = PTR_ERR(inserted_state); - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, prealloc, "insert", err); } cache_state(inserted_state, cached_state); if (inserted_state == prealloc) @@ -1453,7 +1478,7 @@ hit_next: err = split_state(tree, state, prealloc, end + 1); if (err) - extent_io_tree_panic(tree, err); + extent_io_tree_panic(tree, state, "split", err); set_state_bits(tree, prealloc, bits, NULL); cache_state(prealloc, cached_state); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 5602b0137fcd..ebe6390d65e9 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -87,9 +87,17 @@ enum { struct extent_io_tree { struct rb_root state; - struct btrfs_fs_info *fs_info; - /* Inode associated with this tree, or NULL. */ - struct btrfs_inode *inode; + /* + * The fs_info is needed for trace points, a tree attached to an inode + * needs the inode. 
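+	 * Both pointers share one union below, and the owner field decides + * which member is currently valid.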
+ * + * owner == IO_TREE_INODE_IO - then inode is valid and fs_info can be + * accessed as inode->root->fs_info + */ + union { + struct btrfs_fs_info *fs_info; + struct btrfs_inode *inode; + }; /* Who owns this io tree, should be one of IO_TREE_* */ u8 owner; @@ -112,6 +120,10 @@ struct extent_state { #endif }; +struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree); +const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree); +const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree); + void extent_io_tree_init(struct btrfs_fs_info *fs_info, struct extent_io_tree *tree, unsigned int owner); void extent_io_tree_release(struct extent_io_tree *tree); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c8e5b4715b49..f396aba92c57 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -102,7 +102,8 @@ int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len) */ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags) + u64 offset, int metadata, u64 *refs, u64 *flags, + u64 *owning_root) { struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; @@ -114,6 +115,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, u32 item_size; u64 num_refs; u64 extent_flags; + u64 owner = 0; int ret; /* @@ -167,6 +169,8 @@ search_again: struct btrfs_extent_item); num_refs = btrfs_extent_refs(leaf, ei); extent_flags = btrfs_extent_flags(leaf, ei); + owner = btrfs_get_extent_owner_root(fs_info, leaf, + path->slots[0]); } else { ret = -EUCLEAN; btrfs_err(fs_info, @@ -226,6 +230,8 @@ out: *refs = num_refs; if (flags) *flags = extent_flags; + if (owning_root) + *owning_root = owner; out_free: btrfs_free_path(path); return ret; @@ -1541,6 +1547,23 @@ out: return ret; } +static void free_head_ref_squota_rsv(struct btrfs_fs_info *fs_info, + struct btrfs_delayed_ref_head *href) +{ + u64 root = href->owning_root; + + /* + * Don't check must_insert_reserved, as this is called from contexts + * where it has already been unset. 
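+	 * + * Only a data extent owned by a subvolume tree (is_fstree()) can carry + * a simple quota reservation, hence the early return below for + * everything else.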
+ */ + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE || + !href->is_data || !is_fstree(root)) + return; + + btrfs_qgroup_free_refroot(fs_info, root, href->reserved_bytes, + BTRFS_QGROUP_RSV_DATA); +} + static int run_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_head *href, struct btrfs_delayed_ref_node *node, @@ -1563,7 +1586,6 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, struct btrfs_squota_delta delta = { .root = href->owning_root, .num_bytes = node->num_bytes, - .rsv_bytes = href->reserved_bytes, .is_data = true, .is_inc = true, .generation = trans->transid, @@ -1580,11 +1602,9 @@ static int run_delayed_data_ref(struct btrfs_trans_handle *trans, flags, ref->objectid, ref->offset, &key, node->ref_mod, href->owning_root); + free_head_ref_squota_rsv(trans->fs_info, href); if (!ret) ret = btrfs_record_squota_delta(trans->fs_info, &delta); - else - btrfs_qgroup_free_refroot(trans->fs_info, delta.root, - delta.rsv_bytes, BTRFS_QGROUP_RSV_DATA); } else if (node->action == BTRFS_ADD_DELAYED_REF) { ret = __btrfs_inc_extent_ref(trans, node, parent, ref->root, ref->objectid, ref->offset, @@ -1736,7 +1756,6 @@ static int run_delayed_tree_ref(struct btrfs_trans_handle *trans, struct btrfs_squota_delta delta = { .root = href->owning_root, .num_bytes = fs_info->nodesize, - .rsv_bytes = 0, .is_data = false, .is_inc = true, .generation = trans->transid, @@ -1768,8 +1787,10 @@ static int run_one_delayed_ref(struct btrfs_trans_handle *trans, int ret = 0; if (TRANS_ABORTED(trans)) { - if (insert_reserved) + if (insert_reserved) { btrfs_pin_extent(trans, node->bytenr, node->num_bytes, 1); + free_head_ref_squota_rsv(trans->fs_info, href); + } return 0; } @@ -1865,6 +1886,8 @@ u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, struct btrfs_delayed_ref_root *delayed_refs, struct btrfs_delayed_ref_head *head) { + u64 ret = 0; + /* * We had csum deletions accounted for in our delayed refs rsv, we need * to drop the csum leaves for this update from our delayed_refs_rsv. @@ -1879,14 +1902,13 @@ u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, btrfs_delayed_refs_rsv_release(fs_info, 0, nr_csums); - return btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); + ret = btrfs_calc_delayed_ref_csum_bytes(fs_info, nr_csums); } - if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE && - head->must_insert_reserved && head->is_data) - btrfs_qgroup_free_refroot(fs_info, head->owning_root, - head->reserved_bytes, BTRFS_QGROUP_RSV_DATA); + /* must_insert_reserved can be set only if we didn't run the head ref. */ + if (head->must_insert_reserved) + free_head_ref_squota_rsv(fs_info, head); - return 0; + return ret; } static int cleanup_ref_head(struct btrfs_trans_handle *trans, @@ -2027,6 +2049,12 @@ static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans, * spin lock. */ must_insert_reserved = locked_ref->must_insert_reserved; + /* + * Unsetting this on the head ref relinquishes ownership of + * the rsv_bytes, so it is critical that every possible code + * path from here forward frees all reserves including qgroup + * reserve. 
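+		 * + * For example, run_delayed_data_ref() and the transaction abort path + * in run_one_delayed_ref() both end up calling + * free_head_ref_squota_rsv(), so the reservation is released exactly + * once whichever path consumes the head.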
+ */ locked_ref->must_insert_reserved = false; extent_op = locked_ref->extent_op; @@ -3286,7 +3314,6 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_squota_delta delta = { .root = delayed_ref_root, .num_bytes = num_bytes, - .rsv_bytes = 0, .is_data = is_data, .is_inc = false, .generation = btrfs_extent_generation(leaf, ei), @@ -3420,6 +3447,7 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_ref generic_ref = { 0 }; + struct btrfs_block_group *bg; int ret; btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF, @@ -3433,67 +3461,64 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, BUG_ON(ret); /* -ENOMEM */ } - if (last_ref && btrfs_header_generation(buf) == trans->transid) { - struct btrfs_block_group *cache; - bool must_pin = false; - - if (root_id != BTRFS_TREE_LOG_OBJECTID) { - ret = check_ref_cleanup(trans, buf->start); - if (!ret) { - btrfs_redirty_list_add(trans->transaction, buf); - goto out; - } - } + if (!last_ref) + return; - cache = btrfs_lookup_block_group(fs_info, buf->start); + if (btrfs_header_generation(buf) != trans->transid) + goto out; - if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { - pin_down_extent(trans, cache, buf->start, buf->len, 1); - btrfs_put_block_group(cache); + if (root_id != BTRFS_TREE_LOG_OBJECTID) { + ret = check_ref_cleanup(trans, buf->start); + if (!ret) goto out; - } + } - /* - * If there are tree mod log users we may have recorded mod log - * operations for this node. If we re-allocate this node we - * could replay operations on this node that happened when it - * existed in a completely different root. For example if it - * was part of root A, then was reallocated to root B, and we - * are doing a btrfs_old_search_slot(root b), we could replay - * operations that happened when the block was part of root A, - * giving us an inconsistent view of the btree. - * - * We are safe from races here because at this point no other - * node or root points to this extent buffer, so if after this - * check a new tree mod log user joins we will not have an - * existing log of operations on this node that we have to - * contend with. - */ - if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) - must_pin = true; + bg = btrfs_lookup_block_group(fs_info, buf->start); - if (must_pin || btrfs_is_zoned(fs_info)) { - btrfs_redirty_list_add(trans->transaction, buf); - pin_down_extent(trans, cache, buf->start, buf->len, 1); - btrfs_put_block_group(cache); - goto out; - } + if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + pin_down_extent(trans, bg, buf->start, buf->len, 1); + btrfs_put_block_group(bg); + goto out; + } - WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + /* + * If there are tree mod log users we may have recorded mod log + * operations for this node. If we re-allocate this node we + * could replay operations on this node that happened when it + * existed in a completely different root. For example if it + * was part of root A, then was reallocated to root B, and we + * are doing a btrfs_old_search_slot(root b), we could replay + * operations that happened when the block was part of root A, + * giving us an inconsistent view of the btree. + * + * We are safe from races here because at this point no other + * node or root points to this extent buffer, so if after this + * check a new tree mod log user joins we will not have an + * existing log of operations on this node that we have to + * contend with. 
+ */ - btrfs_add_free_space(cache, buf->start, buf->len); - btrfs_free_reserved_bytes(cache, buf->len, 0); - btrfs_put_block_group(cache); - trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); + if (test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags) + || btrfs_is_zoned(fs_info)) { + pin_down_extent(trans, bg, buf->start, buf->len, 1); + btrfs_put_block_group(bg); + goto out; } + + WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); + + btrfs_add_free_space(bg, buf->start, buf->len); + btrfs_free_reserved_bytes(bg, buf->len, 0); + btrfs_put_block_group(bg); + trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); + out: - if (last_ref) { - /* - * Deleting the buffer, clear the corrupt flag since it doesn't - * matter anymore. - */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); - } + + /* + * Deleting the buffer, clear the corrupt flag since it doesn't + * matter anymore. + */ + clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); } /* Can return -ENOMEM */ @@ -4931,7 +4956,6 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, .root = root_objectid, .num_bytes = ins->offset, .generation = trans->transid, - .rsv_bytes = 0, .is_data = true, .is_inc = true, }; @@ -5035,7 +5059,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, __btrfs_tree_lock(buf, nest); btrfs_clear_buffer_dirty(trans, buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); - clear_bit(EXTENT_BUFFER_NO_CHECK, &buf->bflags); + clear_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &buf->bflags); set_extent_buffer_uptodate(buf); @@ -5234,7 +5258,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, /* We don't lock the tree block, it's OK to be racy here */ ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, wc->level - 1, 1, &refs, - &flags); + &flags, NULL); /* We don't care about errors in readahead. 
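		 * A failed lookup only means the child block is not prefetched; the * actual walk repeats the lookup and handles any error there.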
*/ if (ret < 0) continue; @@ -5301,7 +5325,8 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], + NULL); BUG_ON(ret == -ENOMEM); if (ret) return ret; @@ -5391,6 +5416,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, u64 bytenr; u64 generation; u64 parent; + u64 owner_root = 0; struct btrfs_tree_parent_check check = { 0 }; struct btrfs_key key; struct btrfs_ref ref = { 0 }; @@ -5434,7 +5460,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1, &wc->refs[level - 1], - &wc->flags[level - 1]); + &wc->flags[level - 1], + &owner_root); if (ret < 0) goto out_unlock; @@ -5567,8 +5594,7 @@ skip: find_next_key(path, level, &wc->drop_progress); btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr, - fs_info->nodesize, parent, - btrfs_header_owner(next)); + fs_info->nodesize, parent, owner_root); btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid, 0, false); ret = btrfs_free_extent(trans, &ref); @@ -5635,7 +5661,8 @@ static noinline int walk_up_proc(struct btrfs_trans_handle *trans, ret = btrfs_lookup_extent_info(trans, fs_info, eb->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], + NULL); if (ret < 0) { btrfs_tree_unlock_rw(eb, path->locks[level]); path->locks[level] = 0; @@ -5880,7 +5907,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc) ret = btrfs_lookup_extent_info(trans, fs_info, path->nodes[level]->start, level, 1, &wc->refs[level], - &wc->flags[level]); + &wc->flags[level], NULL); if (ret < 0) { err = ret; goto out_end_trans; diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index 0716f65d9753..2e066035ccee 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -99,7 +99,8 @@ u64 btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info, int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len); int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, - u64 offset, int metadata, u64 *refs, u64 *flags); + u64 offset, int metadata, u64 *refs, u64 *flags, + u64 *owner_root); int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num, int reserved); int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 03cef28d9e37..cfd2967f04a2 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -184,22 +184,23 @@ static void process_one_page(struct btrfs_fs_info *fs_info, struct page *page, struct page *locked_page, unsigned long page_ops, u64 start, u64 end) { + struct folio *folio = page_folio(page); u32 len; ASSERT(end + 1 - start != 0 && end + 1 - start < U32_MAX); len = end + 1 - start; if (page_ops & PAGE_SET_ORDERED) - btrfs_page_clamp_set_ordered(fs_info, page, start, len); + btrfs_folio_clamp_set_ordered(fs_info, folio, start, len); if (page_ops & PAGE_START_WRITEBACK) { - btrfs_page_clamp_clear_dirty(fs_info, page, start, len); - btrfs_page_clamp_set_writeback(fs_info, page, start, len); + btrfs_folio_clamp_clear_dirty(fs_info, folio, start, len); + btrfs_folio_clamp_set_writeback(fs_info, folio, start, len); } if (page_ops & PAGE_END_WRITEBACK) - btrfs_page_clamp_clear_writeback(fs_info, page, start, len); + btrfs_folio_clamp_clear_writeback(fs_info, folio, start, 
len); if (page != locked_page && (page_ops & PAGE_UNLOCK)) - btrfs_page_end_writer_lock(fs_info, page, start, len); + btrfs_folio_end_writer_lock(fs_info, folio, start, len); } static void __process_pages_contig(struct address_space *mapping, @@ -271,19 +272,20 @@ static noinline int lock_delalloc_pages(struct inode *inode, goto out; for (i = 0; i < found_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; + struct page *page = folio_page(folio, 0); u32 len = end + 1 - start; if (page == locked_page) continue; - if (btrfs_page_start_writer_lock(fs_info, page, start, - len)) + if (btrfs_folio_start_writer_lock(fs_info, folio, start, + len)) goto out; if (!PageDirty(page) || page->mapping != mapping) { - btrfs_page_end_writer_lock(fs_info, page, start, - len); + btrfs_folio_end_writer_lock(fs_info, folio, start, + len); goto out; } @@ -432,60 +434,65 @@ static bool btrfs_verify_page(struct page *page, u64 start) static void end_page_read(struct page *page, bool uptodate, u64 start, u32 len) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct folio *folio = page_folio(page); ASSERT(page_offset(page) <= start && start + len <= page_offset(page) + PAGE_SIZE); if (uptodate && btrfs_verify_page(page, start)) - btrfs_page_set_uptodate(fs_info, page, start, len); + btrfs_folio_set_uptodate(fs_info, folio, start, len); else - btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_folio_clear_uptodate(fs_info, folio, start, len); - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, page->mapping)) unlock_page(page); else - btrfs_subpage_end_reader(fs_info, page, start, len); + btrfs_subpage_end_reader(fs_info, folio, start, len); } /* - * after a writepage IO is done, we need to: - * clear the uptodate bits on error - * clear the writeback bits in the extent tree for this IO - * end_page_writeback if the page has no more pending IO + * After a write IO is done, we need to: + * + * - clear the uptodate bits on error + * - clear the writeback bits in the extent tree for the range + * - folio_end_writeback() if there is no more pending io for the folio * * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_writepage(struct btrfs_bio *bbio) +static void end_bbio_data_write(struct btrfs_bio *bbio) { struct bio *bio = &bbio->bio; int error = blk_status_to_errno(bio->bi_status); - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - struct inode *inode = page->mapping->host; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; - u64 start = page_offset(page) + bvec->bv_offset; - u32 len = bvec->bv_len; + u64 start = folio_pos(folio) + fi.offset; + u32 len = fi.length; + + /* Only order 0 (single page) folios are allowed for data. */ + ASSERT(folio_order(folio) == 0); /* Our read/write should always be sector aligned. 
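		 * E.g. with a 4K sector size, fi.offset and fi.length are * expected to be multiples of 4096; anything else is reported * below.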
*/ - if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + if (!IS_ALIGNED(fi.offset, sectorsize)) btrfs_err(fs_info, - "partial page write in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - else if (!IS_ALIGNED(bvec->bv_len, sectorsize)) + "partial page write in btrfs with offset %zu and length %zu", + fi.offset, fi.length); + else if (!IS_ALIGNED(fi.length, sectorsize)) btrfs_info(fs_info, - "incomplete page write with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); + "incomplete page write with offset %zu and length %zu", + fi.offset, fi.length); - btrfs_finish_ordered_extent(bbio->ordered, page, start, len, !error); + btrfs_finish_ordered_extent(bbio->ordered, + folio_page(folio, 0), start, len, !error); if (error) - mapping_set_error(page->mapping, error); - btrfs_page_clear_writeback(fs_info, page, start, len); + mapping_set_error(folio->mapping, error); + btrfs_folio_clear_writeback(fs_info, folio, start, len); } bio_put(bio); @@ -562,98 +569,102 @@ update: static void begin_page_read(struct btrfs_fs_info *fs_info, struct page *page) { - ASSERT(PageLocked(page)); - if (!btrfs_is_subpage(fs_info, page)) + struct folio *folio = page_folio(page); + + ASSERT(folio_test_locked(folio)); + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; - ASSERT(PagePrivate(page)); - btrfs_subpage_start_reader(fs_info, page, page_offset(page), PAGE_SIZE); + ASSERT(folio_test_private(folio)); + btrfs_subpage_start_reader(fs_info, folio, page_offset(page), PAGE_SIZE); } /* - * after a readpage IO is done, we need to: - * clear the uptodate bits on error - * set the uptodate bits if things worked - * set the page up to date if all extents in the tree are uptodate - * clear the lock bit in the extent tree - * unlock the page if there are no other extents locked for it + * After a data read IO is done, we need to: + * + * - clear the uptodate bits on error + * - set the uptodate bits if things worked + * - set the folio up to date if all extents in the tree are uptodate + * - clear the lock bit in the extent tree + * - unlock the folio if there are no other extents locked for it * * Scheduling is not allowed, so the extent state tree is expected * to have one and only one object corresponding to this IO. */ -static void end_bio_extent_readpage(struct btrfs_bio *bbio) +static void end_bbio_data_read(struct btrfs_bio *bbio) { struct bio *bio = &bbio->bio; - struct bio_vec *bvec; struct processed_extent processed = { 0 }; + struct folio_iter fi; /* * The offset to the beginning of a bio, since one bio can never be * larger than UINT_MAX, u32 here is enough. */ u32 bio_offset = 0; - struct bvec_iter_all iter_all; ASSERT(!bio_flagged(bio, BIO_CLONED)); - bio_for_each_segment_all(bvec, bio, iter_all) { + bio_for_each_folio_all(fi, &bbio->bio) { bool uptodate = !bio->bi_status; - struct page *page = bvec->bv_page; - struct inode *inode = page->mapping->host; + struct folio *folio = fi.folio; + struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); const u32 sectorsize = fs_info->sectorsize; u64 start; u64 end; u32 len; + /* For now only order 0 folios are supported for data. 
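+		 * With order 0, the offset_in_folio() math used below is + * equivalent to offset_in_page().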
*/ + ASSERT(folio_order(folio) == 0); btrfs_debug(fs_info, - "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", - bio->bi_iter.bi_sector, bio->bi_status, + "%s: bi_sector=%llu, err=%d, mirror=%u", + __func__, bio->bi_iter.bi_sector, bio->bi_status, bbio->mirror_num); /* * We always issue full-sector reads, but if some block in a - * page fails to read, blk_update_request() will advance + * folio fails to read, blk_update_request() will advance * bv_offset and adjust bv_len to compensate. Print a warning * for unaligned offsets, and an error if they don't add up to * a full sector. */ - if (!IS_ALIGNED(bvec->bv_offset, sectorsize)) + if (!IS_ALIGNED(fi.offset, sectorsize)) btrfs_err(fs_info, - "partial page read in btrfs with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); - else if (!IS_ALIGNED(bvec->bv_offset + bvec->bv_len, - sectorsize)) + "partial page read in btrfs with offset %zu and length %zu", + fi.offset, fi.length); + else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize)) btrfs_info(fs_info, - "incomplete page read with offset %u and length %u", - bvec->bv_offset, bvec->bv_len); + "incomplete page read with offset %zu and length %zu", + fi.offset, fi.length); - start = page_offset(page) + bvec->bv_offset; - end = start + bvec->bv_len - 1; - len = bvec->bv_len; + start = folio_pos(folio) + fi.offset; + end = start + fi.length - 1; + len = fi.length; if (likely(uptodate)) { loff_t i_size = i_size_read(inode); - pgoff_t end_index = i_size >> PAGE_SHIFT; + pgoff_t end_index = i_size >> folio_shift(folio); /* * Zero out the remaining part if this range straddles * i_size. * - * Here we should only zero the range inside the bvec, + * Here we should only zero the range inside the folio, * not touch anything else. * * NOTE: i_size is exclusive while end is inclusive. */ - if (page->index == end_index && i_size <= end) { - u32 zero_start = max(offset_in_page(i_size), - offset_in_page(start)); + if (folio_index(folio) == end_index && i_size <= end) { + u32 zero_start = max(offset_in_folio(folio, i_size), + offset_in_folio(folio, start)); + u32 zero_len = offset_in_folio(folio, end) + 1 - + zero_start; - zero_user_segment(page, zero_start, - offset_in_page(end) + 1); + folio_zero_range(folio, zero_start, zero_len); } } /* Update page status and unlock. */ - end_page_read(page, uptodate, start, len); + end_page_read(folio_page(folio, 0), uptodate, start, len); endio_readpage_release_extent(&processed, BTRFS_I(inode), start, end, uptodate); @@ -672,19 +683,22 @@ static void end_bio_extent_readpage(struct btrfs_bio *bbio) * @nr_pages: number of pages to allocate * @page_array: the array to fill with pages; any existing non-null entries in * the array will be skipped + * @extra_gfp: the extra GFP flags for the allocation. * * Return: 0 if all pages were able to be allocated; - * -ENOMEM otherwise, and the caller is responsible for freeing all - * non-null page pointers in the array. 
+ * -ENOMEM otherwise, the partially allocated pages would be freed and + * the array slots zeroed */ -int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, + gfp_t extra_gfp) { unsigned int allocated; for (allocated = 0; allocated < nr_pages;) { unsigned int last = allocated; - allocated = alloc_pages_bulk_array(GFP_NOFS, nr_pages, page_array); + allocated = alloc_pages_bulk_array(GFP_NOFS | extra_gfp, + nr_pages, page_array); if (allocated == nr_pages) return 0; @@ -694,14 +708,39 @@ int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array) * though alloc_pages_bulk_array() falls back to alloc_page() * if it could not bulk-allocate. So we must be out of memory. */ - if (allocated == last) + if (allocated == last) { + for (int i = 0; i < allocated; i++) { + __free_page(page_array[i]); + page_array[i] = NULL; + } return -ENOMEM; + } memalloc_retry_wait(GFP_NOFS); } return 0; } +/* + * Populate needed folios for the extent buffer. + * + * For now, the folios populated are always in order 0 (aka, single page). + */ +static int alloc_eb_folio_array(struct extent_buffer *eb, gfp_t extra_gfp) +{ + struct page *page_array[INLINE_EXTENT_BUFFER_PAGES] = { 0 }; + int num_pages = num_extent_pages(eb); + int ret; + + ret = btrfs_alloc_page_array(num_pages, page_array, extra_gfp); + if (ret < 0) + return ret; + + for (int i = 0; i < num_pages; i++) + eb->folios[i] = page_folio(page_array[i]); + return 0; +} + static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, struct page *page, u64 disk_bytenr, unsigned int pg_offset) @@ -856,9 +895,9 @@ static void submit_extent_page(struct btrfs_bio_ctrl *bio_ctrl, } while (size); } -static int attach_extent_buffer_page(struct extent_buffer *eb, - struct page *page, - struct btrfs_subpage *prealloc) +static int attach_extent_buffer_folio(struct extent_buffer *eb, + struct folio *folio, + struct btrfs_subpage *prealloc) { struct btrfs_fs_info *fs_info = eb->fs_info; int ret = 0; @@ -869,65 +908,66 @@ static int attach_extent_buffer_page(struct extent_buffer *eb, * For cloned or dummy extent buffers, their pages are not mapped and * will not race with any other ebs. 
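	 * (Their folio->mapping is NULL, which is why the lockdep assertion * below is conditional.)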
*/ - if (page->mapping) - lockdep_assert_held(&page->mapping->private_lock); + if (folio->mapping) + lockdep_assert_held(&folio->mapping->i_private_lock); if (fs_info->nodesize >= PAGE_SIZE) { - if (!PagePrivate(page)) - attach_page_private(page, eb); + if (!folio_test_private(folio)) + folio_attach_private(folio, eb); else - WARN_ON(page->private != (unsigned long)eb); + WARN_ON(folio_get_private(folio) != eb); return 0; } /* Already mapped, just free prealloc */ - if (PagePrivate(page)) { + if (folio_test_private(folio)) { btrfs_free_subpage(prealloc); return 0; } if (prealloc) /* Has preallocated memory for subpage */ - attach_page_private(page, prealloc); + folio_attach_private(folio, prealloc); else /* Do new allocation to attach subpage */ - ret = btrfs_attach_subpage(fs_info, page, - BTRFS_SUBPAGE_METADATA); + ret = btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); return ret; } int set_page_extent_mapped(struct page *page) { + struct folio *folio = page_folio(page); struct btrfs_fs_info *fs_info; ASSERT(page->mapping); - if (PagePrivate(page)) + if (folio_test_private(folio)) return 0; fs_info = btrfs_sb(page->mapping->host->i_sb); - if (btrfs_is_subpage(fs_info, page)) - return btrfs_attach_subpage(fs_info, page, BTRFS_SUBPAGE_DATA); + if (btrfs_is_subpage(fs_info, page->mapping)) + return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); - attach_page_private(page, (void *)EXTENT_PAGE_PRIVATE); + folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); return 0; } void clear_page_extent_mapped(struct page *page) { + struct folio *folio = page_folio(page); struct btrfs_fs_info *fs_info; ASSERT(page->mapping); - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return; fs_info = btrfs_sb(page->mapping->host->i_sb); - if (btrfs_is_subpage(fs_info, page)) - return btrfs_detach_subpage(fs_info, page); + if (btrfs_is_subpage(fs_info, page->mapping)) + return btrfs_detach_subpage(fs_info, folio); - detach_page_private(page); + folio_detach_private(folio); } static struct extent_map * @@ -996,7 +1036,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, memzero_page(page, zero_offset, iosize); } } - bio_ctrl->end_io_func = end_bio_extent_readpage; + bio_ctrl->end_io_func = end_bbio_data_read; begin_page_read(fs_info, page); while (cur <= end) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; @@ -1022,8 +1062,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, BUG_ON(extent_map_end(em) <= cur); BUG_ON(end < cur); - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) - compress_type = em->compress_type; + compress_type = extent_map_compression(em); iosize = min(extent_map_end(em) - cur, end - cur + 1); iosize = ALIGN(iosize, blocksize); @@ -1032,7 +1071,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, else disk_bytenr = em->block_start + extent_offset; block_start = em->block_start; - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + if (em->flags & EXTENT_FLAG_PREALLOC) block_start = EXTENT_MAP_HOLE; /* @@ -1069,7 +1108,7 @@ static int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, * is a corner case so we prioritize correctness over * non-optimal behavior (submitting 2 bios for the same extent). 
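	 * E.g. two ranges of one file backed by the same compressed extent: * the check below forces a submit instead of merging the second range * into the bio built for the first.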
*/ - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) && + if (compress_type != BTRFS_COMPRESS_NONE && prev_em_start && *prev_em_start != (u64)-1 && *prev_em_start != em->start) force_bio_submit = true; @@ -1235,7 +1274,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, struct page *page, u64 *start, u64 *end) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct folio *folio = page_folio(page); + struct btrfs_subpage *subpage = folio_get_private(folio); struct btrfs_subpage_info *spi = fs_info->subpage_info; u64 orig_start = *start; /* Declare as unsigned long so we can use bitmap ops */ @@ -1247,7 +1287,7 @@ static void find_next_dirty_byte(struct btrfs_fs_info *fs_info, * For regular sector size == page size case, since one page only * contains one sector, we return the page offset directly. */ - if (!btrfs_is_subpage(fs_info, page)) { + if (!btrfs_is_subpage(fs_info, page->mapping)) { *start = page_offset(page); *end = page_offset(page) + PAGE_SIZE; return; @@ -1300,7 +1340,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, return 1; } - bio_ctrl->end_io_func = end_bio_extent_writepage; + bio_ctrl->end_io_func = end_bbio_data_write; while (cur <= end) { u32 len = end - cur + 1; u64 disk_bytenr; @@ -1320,7 +1360,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * writeback the sectors with subpage dirty bits, * causing writeback without ordered extent. */ - btrfs_page_clear_dirty(fs_info, page, cur, len); + btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, len); break; } @@ -1347,7 +1387,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, block_start = em->block_start; disk_bytenr = em->block_start + extent_offset; - ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + ASSERT(!extent_map_is_compressed(em)); ASSERT(block_start != EXTENT_MAP_HOLE); ASSERT(block_start != EXTENT_MAP_INLINE); @@ -1372,7 +1412,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, * So clear subpage dirty bit here so next time we won't submit * page for range already written to disk. 
*/ - btrfs_page_clear_dirty(fs_info, page, cur, iosize); + btrfs_folio_clear_dirty(fs_info, page_folio(page), cur, iosize); submit_extent_page(bio_ctrl, disk_bytenr, page, iosize, cur - page_offset(page)); @@ -1380,7 +1420,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, nr++; } - btrfs_page_assert_not_dirty(fs_info, page); + btrfs_folio_assert_not_dirty(fs_info, page_folio(page)); *nr_ret = nr; return 0; @@ -1602,24 +1642,23 @@ static struct extent_buffer *find_extent_buffer_nolock( return NULL; } -static void extent_buffer_write_end_io(struct btrfs_bio *bbio) +static void end_bbio_meta_write(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; bool uptodate = !bbio->bio.bi_status; - struct bvec_iter_all iter_all; - struct bio_vec *bvec; + struct folio_iter fi; u32 bio_offset = 0; if (!uptodate) set_btree_ioerr(eb); - bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + bio_for_each_folio_all(fi, &bbio->bio) { u64 start = eb->start + bio_offset; - struct page *page = bvec->bv_page; - u32 len = bvec->bv_len; + struct folio *folio = fi.folio; + u32 len = fi.length; - btrfs_page_clear_writeback(fs_info, page, start, len); + btrfs_folio_clear_writeback(fs_info, folio, start, len); bio_offset += len; } @@ -1668,36 +1707,44 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_WRITE | REQ_META | wbc_to_write_flags(wbc), - eb->fs_info, extent_buffer_write_end_io, eb); + eb->fs_info, end_bbio_meta_write, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; bio_set_dev(&bbio->bio, fs_info->fs_devices->latest_dev->bdev); wbc_init_bio(wbc, &bbio->bio); bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; if (fs_info->nodesize < PAGE_SIZE) { - struct page *p = eb->pages[0]; + struct folio *folio = eb->folios[0]; + bool ret; - lock_page(p); - btrfs_subpage_set_writeback(fs_info, p, eb->start, eb->len); - if (btrfs_subpage_clear_and_test_dirty(fs_info, p, eb->start, + folio_lock(folio); + btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len); + if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len)) { - clear_page_dirty_for_io(p); + folio_clear_dirty_for_io(folio); wbc->nr_to_write--; } - __bio_add_page(&bbio->bio, p, eb->len, eb->start - page_offset(p)); - wbc_account_cgroup_owner(wbc, p, eb->len); - unlock_page(p); + ret = bio_add_folio(&bbio->bio, folio, eb->len, + eb->start - folio_pos(folio)); + ASSERT(ret); + wbc_account_cgroup_owner(wbc, folio_page(folio, 0), eb->len); + folio_unlock(folio); } else { - for (int i = 0; i < num_extent_pages(eb); i++) { - struct page *p = eb->pages[i]; - - lock_page(p); - clear_page_dirty_for_io(p); - set_page_writeback(p); - __bio_add_page(&bbio->bio, p, PAGE_SIZE, 0); - wbc_account_cgroup_owner(wbc, p, PAGE_SIZE); - wbc->nr_to_write--; - unlock_page(p); + int num_folios = num_extent_folios(eb); + + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; + bool ret; + + folio_lock(folio); + folio_clear_dirty_for_io(folio); + folio_start_writeback(folio); + ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ASSERT(ret); + wbc_account_cgroup_owner(wbc, folio_page(folio, 0), + folio_size(folio)); + wbc->nr_to_write -= folio_nr_pages(folio); + folio_unlock(folio); } } btrfs_submit_bio(bbio, 0); @@ -1720,6 +1767,7 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, static int 
submit_eb_subpage(struct page *page, struct writeback_control *wbc) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct folio *folio = page_folio(page); int submitted = 0; u64 page_start = page_offset(page); int bit_start = 0; @@ -1727,7 +1775,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) /* Lock and write each dirty extent buffers in the range */ while (bit_start < fs_info->subpage_info->bitmap_nr_bits) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage *subpage = folio_get_private(folio); struct extent_buffer *eb; unsigned long flags; u64 start; @@ -1736,16 +1784,16 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) * Take private lock to ensure the subpage won't be detached * in the meantime. */ - spin_lock(&page->mapping->private_lock); - if (!PagePrivate(page)) { - spin_unlock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); + if (!folio_test_private(folio)) { + spin_unlock(&page->mapping->i_private_lock); break; } spin_lock_irqsave(&subpage->lock, flags); if (!test_bit(bit_start + fs_info->subpage_info->dirty_offset, subpage->bitmaps)) { spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); bit_start++; continue; } @@ -1759,7 +1807,7 @@ static int submit_eb_subpage(struct page *page, struct writeback_control *wbc) */ eb = find_extent_buffer_nolock(fs_info, start); spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); /* * The eb has already reached 0 refs thus find_extent_buffer() @@ -1802,38 +1850,39 @@ static int submit_eb_page(struct page *page, struct btrfs_eb_write_context *ctx) { struct writeback_control *wbc = ctx->wbc; struct address_space *mapping = page->mapping; + struct folio *folio = page_folio(page); struct extent_buffer *eb; int ret; - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return 0; if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return submit_eb_subpage(page, wbc); - spin_lock(&mapping->private_lock); - if (!PagePrivate(page)) { - spin_unlock(&mapping->private_lock); + spin_lock(&mapping->i_private_lock); + if (!folio_test_private(folio)) { + spin_unlock(&mapping->i_private_lock); return 0; } - eb = (struct extent_buffer *)page->private; + eb = folio_get_private(folio); /* * Shouldn't happen and normally this would be a BUG_ON but no point * crashing the machine for something we can survive anyway. 
*/ if (WARN_ON(!eb)) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } if (eb == ctx->eb) { - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); return 0; } ret = atomic_inc_not_zero(&eb->refs); - spin_unlock(&mapping->private_lock); + spin_unlock(&mapping->i_private_lock); if (!ret) return 0; @@ -2196,7 +2245,7 @@ void extent_write_locked_range(struct inode *inode, struct page *locked_page, cur, cur_len, !ret); mapping_set_error(page->mapping, ret); } - btrfs_page_unlock_writer(fs_info, page, cur, cur_len); + btrfs_folio_unlock_writer(fs_info, page_folio(page), cur, cur_len); if (ret < 0) found_error = true; next_page: @@ -2297,7 +2346,8 @@ static int try_release_extent_state(struct extent_io_tree *tree, ret = 0; } else { u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | - EXTENT_DELALLOC_NEW | EXTENT_CTLBITS); + EXTENT_DELALLOC_NEW | EXTENT_CTLBITS | + EXTENT_QGROUP_RESERVED); /* * At this point we can safely clear everything except the @@ -2346,7 +2396,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) write_unlock(&map->lock); break; } - if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || + if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) { write_unlock(&map->lock); free_extent_map(em); @@ -2363,7 +2413,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) * extra reference on the em. */ if (list_empty(&em->list) || - test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + (em->flags & EXTENT_FLAG_LOGGING)) goto remove_em; /* * If it's in the list of modified extents, remove it @@ -3052,14 +3102,14 @@ static int extent_buffer_under_io(const struct extent_buffer *eb) test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); } -static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) +static bool folio_range_has_eb(struct btrfs_fs_info *fs_info, struct folio *folio) { struct btrfs_subpage *subpage; - lockdep_assert_held(&page->mapping->private_lock); + lockdep_assert_held(&folio->mapping->i_private_lock); - if (PagePrivate(page)) { - subpage = (struct btrfs_subpage *)page->private; + if (folio_test_private(folio)) { + subpage = folio_get_private(folio); if (atomic_read(&subpage->eb_refs)) return true; /* @@ -3072,21 +3122,21 @@ static bool page_range_has_eb(struct btrfs_fs_info *fs_info, struct page *page) return false; } -static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *page) +static void detach_extent_buffer_folio(struct extent_buffer *eb, struct folio *folio) { struct btrfs_fs_info *fs_info = eb->fs_info; const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); /* - * For mapped eb, we're going to change the page private, which should - * be done under the private_lock. + * For mapped eb, we're going to change the folio private, which should + * be done under the i_private_lock. */ if (mapped) - spin_lock(&page->mapping->private_lock); + spin_lock(&folio->mapping->i_private_lock); - if (!PagePrivate(page)) { + if (!folio_test_private(folio)) { if (mapped) - spin_unlock(&page->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); return; } @@ -3095,66 +3145,58 @@ static void detach_extent_buffer_page(struct extent_buffer *eb, struct page *pag * We do this since we'll remove the pages after we've * removed the eb from the radix tree, so we could race * and have this page now attached to the new eb. So - * only clear page_private if it's still connected to + * only clear folio if it's still connected to * this eb. 
*/ - if (PagePrivate(page) && - page->private == (unsigned long)eb) { + if (folio_test_private(folio) && folio_get_private(folio) == eb) { BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - BUG_ON(PageDirty(page)); - BUG_ON(PageWriteback(page)); - /* - * We need to make sure we haven't be attached - * to a new eb. - */ - detach_page_private(page); + BUG_ON(folio_test_dirty(folio)); + BUG_ON(folio_test_writeback(folio)); + /* We need to make sure we haven't be attached to a new eb. */ + folio_detach_private(folio); } if (mapped) - spin_unlock(&page->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); return; } /* - * For subpage, we can have dummy eb with page private. In this case, - * we can directly detach the private as such page is only attached to - * one dummy eb, no sharing. + * For subpage, we can have dummy eb with folio private attached. In + * this case, we can directly detach the private as such folio is only + * attached to one dummy eb, no sharing. */ if (!mapped) { - btrfs_detach_subpage(fs_info, page); + btrfs_detach_subpage(fs_info, folio); return; } - btrfs_page_dec_eb_refs(fs_info, page); + btrfs_folio_dec_eb_refs(fs_info, folio); /* - * We can only detach the page private if there are no other ebs in the + * We can only detach the folio private if there are no other ebs in the * page range and no unfinished IO. */ - if (!page_range_has_eb(fs_info, page)) - btrfs_detach_subpage(fs_info, page); + if (!folio_range_has_eb(fs_info, folio)) + btrfs_detach_subpage(fs_info, folio); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&folio->mapping->i_private_lock); } /* Release all pages attached to the extent buffer */ static void btrfs_release_extent_buffer_pages(struct extent_buffer *eb) { - int i; - int num_pages; - ASSERT(!extent_buffer_under_io(eb)); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *page = eb->pages[i]; + for (int i = 0; i < INLINE_EXTENT_BUFFER_PAGES; i++) { + struct folio *folio = eb->folios[i]; - if (!page) + if (!folio) continue; - detach_extent_buffer_page(eb, page); + detach_extent_buffer_folio(eb, folio); - /* One for when we allocated the page */ - put_page(page); + /* One for when we allocated the folio. 
*/ + folio_put(folio); } } @@ -3192,9 +3234,8 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { - int i; struct extent_buffer *new; - int num_pages = num_extent_pages(src); + int num_folios = num_extent_folios(src); int ret; new = __alloc_extent_buffer(src->fs_info, src->start, src->len); @@ -3208,22 +3249,22 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) */ set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); - ret = btrfs_alloc_page_array(num_pages, new->pages); + ret = alloc_eb_folio_array(new, 0); if (ret) { btrfs_release_extent_buffer(new); return NULL; } - for (i = 0; i < num_pages; i++) { + for (int i = 0; i < num_folios; i++) { + struct folio *folio = new->folios[i]; int ret; - struct page *p = new->pages[i]; - ret = attach_extent_buffer_page(new, p, NULL); + ret = attach_extent_buffer_folio(new, folio, NULL); if (ret < 0) { btrfs_release_extent_buffer(new); return NULL; } - WARN_ON(PageDirty(p)); + WARN_ON(folio_test_dirty(folio)); } copy_extent_buffer_full(new, src); set_extent_buffer_uptodate(new); @@ -3235,23 +3276,20 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len) { struct extent_buffer *eb; - int num_pages; - int i; + int num_folios = 0; int ret; eb = __alloc_extent_buffer(fs_info, start, len); if (!eb) return NULL; - num_pages = num_extent_pages(eb); - ret = btrfs_alloc_page_array(num_pages, eb->pages); + ret = alloc_eb_folio_array(eb, 0); if (ret) goto err; - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - - ret = attach_extent_buffer_page(eb, p, NULL); + num_folios = num_extent_folios(eb); + for (int i = 0; i < num_folios; i++) { + ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); if (ret < 0) goto err; } @@ -3262,10 +3300,10 @@ struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, return eb; err: - for (i = 0; i < num_pages; i++) { - if (eb->pages[i]) { - detach_extent_buffer_page(eb, eb->pages[i]); - __free_page(eb->pages[i]); + for (int i = 0; i < num_folios; i++) { + if (eb->folios[i]) { + detach_extent_buffer_folio(eb, eb->folios[i]); + __folio_put(eb->folios[i]); } } __free_extent_buffer(eb); @@ -3314,20 +3352,14 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) spin_unlock(&eb->refs_lock); } -static void mark_extent_buffer_accessed(struct extent_buffer *eb, - struct page *accessed) +static void mark_extent_buffer_accessed(struct extent_buffer *eb) { - int num_pages, i; + int num_folios = num_extent_folios(eb); check_buffer_tree_ref(eb); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - struct page *p = eb->pages[i]; - - if (p != accessed) - mark_page_accessed(p); - } + for (int i = 0; i < num_folios; i++) + folio_mark_accessed(eb->folios[i]); } struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, @@ -3355,7 +3387,7 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, spin_lock(&eb->refs_lock); spin_unlock(&eb->refs_lock); } - mark_extent_buffer_accessed(eb, NULL); + mark_extent_buffer_accessed(eb); return eb; } @@ -3404,6 +3436,7 @@ free_eb: static struct extent_buffer *grab_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page) { + struct folio *folio = page_folio(page); struct extent_buffer *exists; /* @@ -3415,21 +3448,21 @@ static struct extent_buffer *grab_extent_buffer( return NULL; /* Page not yet attached to an extent 
buffer */ - if (!PagePrivate(page)) + if (!folio_test_private(folio)) return NULL; /* * We could have already allocated an eb for this page and attached one * so lets see if we can get a ref on the existing eb, and if we can we * know it's good and we can just return that one, else we know we can - * just overwrite page->private. + * just overwrite folio private. */ - exists = (struct extent_buffer *)page->private; + exists = folio_get_private(folio); if (atomic_inc_not_zero(&exists->refs)) return exists; WARN_ON(PageDirty(page)); - detach_page_private(page); + folio_detach_private(folio); return NULL; } @@ -3463,19 +3496,88 @@ static int check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) return 0; } + +/* + * Return 0 if eb->folios[i] is attached to btree inode successfully. + * Return >0 if there is already another extent buffer for the range, + * and @found_eb_ret will be updated. + * Return -EAGAIN if the filemap has an existing folio but with different size + * than @eb. + * The caller needs to free the existing folios and retry using the same order. + */ +static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, + struct extent_buffer **found_eb_ret) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct address_space *mapping = fs_info->btree_inode->i_mapping; + const unsigned long index = eb->start >> PAGE_SHIFT; + struct folio *existing_folio; + int ret; + + ASSERT(found_eb_ret); + + /* Caller should ensure the folio exists. */ + ASSERT(eb->folios[i]); + +retry: + ret = filemap_add_folio(mapping, eb->folios[i], index + i, + GFP_NOFS | __GFP_NOFAIL); + if (!ret) + return 0; + + existing_folio = filemap_lock_folio(mapping, index + i); + /* The page cache only exists for a very short time, just retry. */ + if (IS_ERR(existing_folio)) + goto retry; + + /* For now, we should only have single-page folios for btree inode. */ + ASSERT(folio_nr_pages(existing_folio) == 1); + + if (folio_size(existing_folio) != folio_size(eb->folios[0])) { + folio_unlock(existing_folio); + folio_put(existing_folio); + return -EAGAIN; + } + + if (fs_info->nodesize < PAGE_SIZE) { + /* + * We're going to reuse the existing page, so we can drop our + * page and subpage structure now. + */ + __free_page(folio_page(eb->folios[i], 0)); + eb->folios[i] = existing_folio; + } else { + struct extent_buffer *existing_eb; + + existing_eb = grab_extent_buffer(fs_info, + folio_page(existing_folio, 0)); + if (existing_eb) { + /* The extent buffer still exists, we can use it directly. */ + *found_eb_ret = existing_eb; + folio_unlock(existing_folio); + folio_put(existing_folio); + return 1; + } + /* The extent buffer no longer exists, we can reuse the folio. 
*/ + __free_page(folio_page(eb->folios[i], 0)); + eb->folios[i] = existing_folio; + } + return 0; +} + struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { unsigned long len = fs_info->nodesize; - int num_pages; - int i; - unsigned long index = start >> PAGE_SHIFT; + int num_folios; + int attached = 0; struct extent_buffer *eb; - struct extent_buffer *exists = NULL; - struct page *p; + struct extent_buffer *existing_eb = NULL; struct address_space *mapping = fs_info->btree_inode->i_mapping; struct btrfs_subpage *prealloc = NULL; u64 lockdep_owner = owner_root; + bool page_contig = true; int uptodate = 1; int ret; @@ -3510,11 +3612,9 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_set_buffer_lockdep_class(lockdep_owner, eb, level); - num_pages = num_extent_pages(eb); - /* - * Preallocate page->private for subpage case, so that we won't - * allocate memory with private_lock nor page lock hold. + * Preallocate folio private for subpage case, so that we won't + * allocate memory with i_private_lock nor page lock held. * * The memory will be freed by attach_extent_buffer_page() or freed * manually if we exit earlier. @@ -3522,47 +3622,89 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (fs_info->nodesize < PAGE_SIZE) { prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { - exists = ERR_CAST(prealloc); - goto free_eb; + ret = PTR_ERR(prealloc); + goto out; } } - for (i = 0; i < num_pages; i++, index++) { - p = find_or_create_page(mapping, index, GFP_NOFS|__GFP_NOFAIL); - if (!p) { - exists = ERR_PTR(-ENOMEM); - btrfs_free_subpage(prealloc); - goto free_eb; +reallocate: + /* Allocate all pages first. */ + ret = alloc_eb_folio_array(eb, __GFP_NOFAIL); + if (ret < 0) { + btrfs_free_subpage(prealloc); + goto out; + } + + num_folios = num_extent_folios(eb); + /* Attach all pages to the filemap. */ + for (int i = 0; i < num_folios; i++) { + struct folio *folio; + + ret = attach_eb_folio_to_filemap(eb, i, &existing_eb); + if (ret > 0) { + ASSERT(existing_eb); + goto out; } - spin_lock(&mapping->private_lock); - exists = grab_extent_buffer(fs_info, p); - if (exists) { - spin_unlock(&mapping->private_lock); - unlock_page(p); - put_page(p); - mark_extent_buffer_accessed(exists, p); - btrfs_free_subpage(prealloc); - goto free_eb; + /* + * TODO: Special handling for a corner case where the order of + * folios mismatches between the new eb and the filemap. + * + * This happens when: + * + * - the new eb is using higher order folio + * + * - the filemap is still using 0-order folios for the range + * This can happen at the previous eb allocation, and we don't + * have a higher order folio for the call. + * + * - the existing eb has already been freed + * + * In this case, we have to free the existing folios first, and + * re-allocate using the same order. + * Thankfully this is not going to happen yet, as we're still + * using 0-order folios. + */ + if (unlikely(ret == -EAGAIN)) { + ASSERT(0); + goto reallocate; + } + attached++; + + /* + * Only after attach_eb_folio_to_filemap(), eb->folios[] is + * reliable, as we may choose to reuse the existing page cache + * and free the allocated page. 
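attach_eb_folio_to_filemap() above is an insert-or-reuse loop against the page cache: try to add our folio, and on conflict either reuse the cached folio or report the surviving extent buffer. A toy userspace model of the same control flow (a fixed array stands in for the filemap; all names are hypothetical):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

#define CACHE_SLOTS 16

struct folio { int order; };

static struct folio *page_cache[CACHE_SLOTS];

/* Toy filemap_add_folio(): fails with -EEXIST if the index is taken. */
static int cache_add(struct folio *folio, unsigned long index)
{
        if (page_cache[index])
                return -EEXIST;
        page_cache[index] = folio;
        return 0;
}

/* Insert-or-reuse, mirroring the structure of attach_eb_folio_to_filemap(). */
static struct folio *attach_or_reuse(struct folio *ours, unsigned long index)
{
        struct folio *existing;

        if (cache_add(ours, index) == 0)
                return ours;            /* our folio is now in the cache */

        existing = page_cache[index];
        if (existing->order != ours->order)
                return NULL;            /* caller must free and retry (-EAGAIN) */

        free(ours);                     /* drop ours, reuse the cached folio */
        return existing;
}

int main(void)
{
        struct folio *a = calloc(1, sizeof(*a));
        struct folio *b = calloc(1, sizeof(*b));

        printf("first:  %s\n", attach_or_reuse(a, 3) == a ? "inserted" : "reused");
        printf("second: %s\n", attach_or_reuse(b, 3) == a ? "reused" : "inserted");
        return 0;
}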
+ */ + folio = eb->folios[i]; + spin_lock(&mapping->i_private_lock); /* Should not fail, as we have preallocated the memory */ - ret = attach_extent_buffer_page(eb, p, prealloc); + ret = attach_extent_buffer_folio(eb, folio, prealloc); ASSERT(!ret); /* * To inform we have extra eb under allocation, so that - * detach_extent_buffer_page() won't release the page private + * detach_extent_buffer_page() won't release the folio private * when the eb hasn't yet been inserted into radix tree. * * The ref will be decreased when the eb released the page, in * detach_extent_buffer_page(). * Thus needs no special handling in error path. */ - btrfs_page_inc_eb_refs(fs_info, p); - spin_unlock(&mapping->private_lock); + btrfs_folio_inc_eb_refs(fs_info, folio); + spin_unlock(&mapping->i_private_lock); + + WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); + + /* + * Check if the current page is physically contiguous with the + * previous eb page. + * At this stage, either we allocated a large folio, thus @i + * would only be 0, or we fall back to per-page allocation. + */ + if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0)) + page_contig = false; - WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); - eb->pages[i] = p; - if (!btrfs_page_test_uptodate(fs_info, p, eb->start, eb->len)) + if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len)) uptodate = 0; /* @@ -3575,12 +3717,13 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, } if (uptodate) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); + /* All pages are physically contiguous, can skip cross page handling. */ + if (page_contig) + eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); again: ret = radix_tree_preload(GFP_NOFS); - if (ret) { - exists = ERR_PTR(ret); - goto free_eb; - } + if (ret) + goto out; spin_lock(&fs_info->buffer_lock); ret = radix_tree_insert(&fs_info->buffer_radix, @@ -3588,9 +3731,10 @@ again: spin_unlock(&fs_info->buffer_lock); radix_tree_preload_end(); if (ret == -EEXIST) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; + ret = 0; + existing_eb = find_extent_buffer(fs_info, start); + if (existing_eb) + goto out; else goto again; } @@ -3603,19 +3747,46 @@ again: * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. */ - for (i = 0; i < num_pages; i++) - unlock_page(eb->pages[i]); + for (int i = 0; i < num_folios; i++) + unlock_page(folio_page(eb->folios[i], 0)); return eb; -free_eb: +out: WARN_ON(!atomic_dec_and_test(&eb->refs)); + + /* + * Any attached folios need to be detached before we unlock them. This + * is because when we insert our new folios into the mapping, we then + * attach our eb to each folio. If we fail to insert a folio, we will + * look up the folio for that index and grab the eb attached to it. We + * do not want that lookup to grab this eb, as we're getting ready to + * free it. So we have to detach it first and then unlock it. + * + * We have to drop our reference and NULL it out here because in the + * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb. + * Below when we call btrfs_release_extent_buffer() we will call + * detach_extent_buffer_folio() on our remaining pages in the !subpage + * case. If we left eb->folios[i] populated in the subpage case we'd + * double put our reference and be super sad. 
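The page_contig check above is what makes the new eb->addr fast path possible: if each backing unit starts exactly where the previous one ends, a single base pointer can address the whole buffer with no cross-unit handling. A runnable sketch of that test (unit size assumed, not kernel code):

#include <stdio.h>
#include <stdlib.h>

#define UNIT_SIZE 4096  /* stand-in for PAGE_SIZE / folio_size() */
#define NUM_UNITS 4

int main(void)
{
        char *units[NUM_UNITS];
        char *base = malloc(UNIT_SIZE * NUM_UNITS);
        char *addr = NULL;
        int contig = 1;

        /* Carve one contiguous allocation into units; with separate
         * per-unit allocations the check below could legitimately fail. */
        for (int i = 0; i < NUM_UNITS; i++)
                units[i] = base + (size_t)i * UNIT_SIZE;

        /* Same test as the page_contig logic: each unit must start exactly
         * where the previous one ends. */
        for (int i = 1; i < NUM_UNITS; i++)
                if (units[i - 1] + UNIT_SIZE != units[i])
                        contig = 0;

        /* If contiguous, one base address lets callers skip all cross-unit
         * copy handling (the eb->addr fast path). */
        if (contig)
                addr = units[0];
        printf("contiguous: %d, addr set: %d\n", contig, addr != NULL);
        free(base);
        return 0;
}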
+ */ + for (int i = 0; i < attached; i++) { + ASSERT(eb->folios[i]); + detach_extent_buffer_folio(eb, eb->folios[i]); + unlock_page(folio_page(eb->folios[i], 0)); + folio_put(eb->folios[i]); + eb->folios[i] = NULL; } + /* + * Now all pages of that extent buffer are unmapped, set UNMAPPED flag, + * so it can be cleaned up without utilizing page->mapping. + */ + set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); btrfs_release_extent_buffer(eb); - return exists; + if (ret < 0) + return ERR_PTR(ret); + ASSERT(existing_eb); + return existing_eb; } static inline void btrfs_release_extent_buffer_rcu(struct rcu_head *head) @@ -3707,31 +3878,30 @@ void free_extent_buffer_stale(struct extent_buffer *eb) release_extent_buffer(eb); } -static void btree_clear_page_dirty(struct page *page) +static void btree_clear_folio_dirty(struct folio *folio) { - ASSERT(PageDirty(page)); - ASSERT(PageLocked(page)); - clear_page_dirty_for_io(page); - xa_lock_irq(&page->mapping->i_pages); - if (!PageDirty(page)) - __xa_clear_mark(&page->mapping->i_pages, - page_index(page), PAGECACHE_TAG_DIRTY); - xa_unlock_irq(&page->mapping->i_pages); + ASSERT(folio_test_dirty(folio)); + ASSERT(folio_test_locked(folio)); + folio_clear_dirty_for_io(folio); + xa_lock_irq(&folio->mapping->i_pages); + if (!folio_test_dirty(folio)) + __xa_clear_mark(&folio->mapping->i_pages, + folio_index(folio), PAGECACHE_TAG_DIRTY); + xa_unlock_irq(&folio->mapping->i_pages); } static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct page *page = eb->pages[0]; + struct folio *folio = eb->folios[0]; bool last; - /* btree_clear_page_dirty() needs page locked */ - lock_page(page); - last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start, - eb->len); + /* btree_clear_folio_dirty() needs page locked. */ + folio_lock(folio); + last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len); if (last) - btree_clear_page_dirty(page); - unlock_page(page); + btree_clear_folio_dirty(folio); + folio_unlock(folio); WARN_ON(atomic_read(&eb->refs) == 0); } @@ -3739,15 +3909,27 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - int i; - int num_pages; - struct page *page; + int num_folios; btrfs_assert_tree_write_locked(eb); if (trans && btrfs_header_generation(eb) != trans->transid) return; + /* + * Instead of clearing the dirty flag off of the buffer, mark it as + * EXTENT_BUFFER_ZONED_ZEROOUT. This allows us to preserve + * write-ordering in zoned mode, without the need to later re-dirty + * the extent_buffer. + * + * The actual zeroout of the buffer will happen later in + * btree_csum_one_bio. 
+ */ + if (btrfs_is_zoned(fs_info)) { + set_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags); + return; + } + if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) return; @@ -3757,30 +3939,29 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, if (eb->fs_info->nodesize < PAGE_SIZE) return clear_subpage_extent_buffer_dirty(eb); - num_pages = num_extent_pages(eb); + num_folios = num_extent_folios(eb); + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - if (!PageDirty(page)) + if (!folio_test_dirty(folio)) continue; - lock_page(page); - btree_clear_page_dirty(page); - unlock_page(page); + folio_lock(folio); + btree_clear_folio_dirty(folio); + folio_unlock(folio); } WARN_ON(atomic_read(&eb->refs) == 0); } void set_extent_buffer_dirty(struct extent_buffer *eb) { - int i; - int num_pages; + int num_folios; bool was_dirty; check_buffer_tree_ref(eb); was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - num_pages = num_extent_pages(eb); + num_folios = num_extent_folios(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); @@ -3799,34 +3980,32 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) * the above race. */ if (subpage) - lock_page(eb->pages[0]); - for (i = 0; i < num_pages; i++) - btrfs_page_set_dirty(eb->fs_info, eb->pages[i], - eb->start, eb->len); + lock_page(folio_page(eb->folios[0], 0)); + for (int i = 0; i < num_folios; i++) + btrfs_folio_set_dirty(eb->fs_info, eb->folios[i], + eb->start, eb->len); if (subpage) - unlock_page(eb->pages[0]); + unlock_page(folio_page(eb->folios[0], 0)); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, eb->len, eb->fs_info->dirty_metadata_batch); } #ifdef CONFIG_BTRFS_DEBUG - for (i = 0; i < num_pages; i++) - ASSERT(PageDirty(eb->pages[i])); + for (int i = 0; i < num_folios; i++) + ASSERT(folio_test_dirty(eb->folios[i])); #endif } void clear_extent_buffer_uptodate(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct page *page; - int num_pages; - int i; + int num_folios = num_extent_folios(eb); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; - if (!page) + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; + + if (!folio) continue; /* @@ -3834,44 +4013,40 @@ void clear_extent_buffer_uptodate(struct extent_buffer *eb) * btrfs_is_subpage() can not handle cloned/dummy metadata. */ if (fs_info->nodesize >= PAGE_SIZE) - ClearPageUptodate(page); + folio_clear_uptodate(folio); else - btrfs_subpage_clear_uptodate(fs_info, page, eb->start, - eb->len); + btrfs_subpage_clear_uptodate(fs_info, folio, + eb->start, eb->len); } } void set_extent_buffer_uptodate(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct page *page; - int num_pages; - int i; + int num_folios = num_extent_folios(eb); set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - num_pages = num_extent_pages(eb); - for (i = 0; i < num_pages; i++) { - page = eb->pages[i]; + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; /* * This is special handling for metadata subpage, as regular * btrfs_is_subpage() can not handle cloned/dummy metadata. 
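For zoned filesystems the hunk above defers the work instead of clearing the dirty bit, so write ordering is preserved and the buffer is zeroed only at submission time. A small userspace model of that flag-deferral pattern (flag names invented for the sketch):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

#define EB_DIRTY         (1U << 0)
#define EB_ZONED_ZEROOUT (1U << 1)

struct buf { unsigned int bflags; char data[16]; };

/* On zoned setups the buffer stays dirty so write ordering is kept; it is
 * only marked for zeroing at submission time. */
static void clear_buffer_dirty(struct buf *b, bool zoned)
{
        if (zoned) {
                b->bflags |= EB_ZONED_ZEROOUT;
                return;
        }
        b->bflags &= ~EB_DIRTY;
}

/* Later, at bio submission time, the marked buffer is zeroed out. */
static void submit(struct buf *b)
{
        if (b->bflags & EB_ZONED_ZEROOUT)
                memset(b->data, 0, sizeof(b->data));
}

int main(void)
{
        struct buf b = { .bflags = EB_DIRTY, .data = "stale" };

        clear_buffer_dirty(&b, true);
        submit(&b);
        printf("still dirty: %d, zeroed: %d\n",
               !!(b.bflags & EB_DIRTY), b.data[0] == 0);
        return 0;
}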
*/ if (fs_info->nodesize >= PAGE_SIZE) - SetPageUptodate(page); + folio_mark_uptodate(folio); else - btrfs_subpage_set_uptodate(fs_info, page, eb->start, - eb->len); + btrfs_subpage_set_uptodate(fs_info, folio, + eb->start, eb->len); } } -static void extent_buffer_read_end_io(struct btrfs_bio *bbio) +static void end_bbio_meta_read(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; bool uptodate = !bbio->bio.bi_status; - struct bvec_iter_all iter_all; - struct bio_vec *bvec; + struct folio_iter fi; u32 bio_offset = 0; eb->read_mirror = bbio->mirror_num; @@ -3887,15 +4062,15 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio) set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); } - bio_for_each_segment_all(bvec, &bbio->bio, iter_all) { + bio_for_each_folio_all(fi, &bbio->bio) { + struct folio *folio = fi.folio; u64 start = eb->start + bio_offset; - struct page *page = bvec->bv_page; - u32 len = bvec->bv_len; + u32 len = fi.length; if (uptodate) - btrfs_page_set_uptodate(fs_info, page, start, len); + btrfs_folio_set_uptodate(fs_info, folio, start, len); else - btrfs_page_clear_uptodate(fs_info, page, start, len); + btrfs_folio_clear_uptodate(fs_info, folio, start, len); bio_offset += len; } @@ -3911,8 +4086,8 @@ static void extent_buffer_read_end_io(struct btrfs_bio *bbio) int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, struct btrfs_tree_parent_check *check) { - int num_pages = num_extent_pages(eb), i; struct btrfs_bio *bbio; + bool ret; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -3936,17 +4111,24 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num, bbio = btrfs_bio_alloc(INLINE_EXTENT_BUFFER_PAGES, REQ_OP_READ | REQ_META, eb->fs_info, - extent_buffer_read_end_io, eb); + end_bbio_meta_read, eb); bbio->bio.bi_iter.bi_sector = eb->start >> SECTOR_SHIFT; bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; memcpy(&bbio->parent_check, check, sizeof(*check)); if (eb->fs_info->nodesize < PAGE_SIZE) { - __bio_add_page(&bbio->bio, eb->pages[0], eb->len, - eb->start - page_offset(eb->pages[0])); + ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len, + eb->start - folio_pos(eb->folios[0])); + ASSERT(ret); } else { - for (i = 0; i < num_pages; i++) - __bio_add_page(&bbio->bio, eb->pages[i], PAGE_SIZE, 0); + int num_folios = num_extent_folios(eb); + + for (int i = 0; i < num_folios; i++) { + struct folio *folio = eb->folios[i]; + + ret = bio_add_folio(&bbio->bio, folio, folio_size(folio), 0); + ASSERT(ret); + } } btrfs_submit_bio(bbio, mirror_num); @@ -3993,29 +4175,33 @@ static inline int check_eb_range(const struct extent_buffer *eb, void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { + const int unit_size = folio_size(eb->folios[0]); size_t cur; size_t offset; - struct page *page; - char *kaddr; char *dst = (char *)dstv; - unsigned long i = get_eb_page_index(start); + unsigned long i = get_eb_folio_index(eb, start); if (check_eb_range(eb, start, len)) { /* * Invalid range hit, reset the memory, so callers won't get - * some random garbage for their uninitialzed memory. + * some random garbage for their uninitialized memory. 
*/ memset(dstv, 0, len); return; } - offset = get_eb_offset_in_page(eb, start); + if (eb->addr) { + memcpy(dstv, eb->addr + start, len); + return; + } + + offset = get_eb_offset_in_folio(eb, start); while (len > 0) { - page = eb->pages[i]; + char *kaddr; - cur = min(len, (PAGE_SIZE - offset)); - kaddr = page_address(page); + cur = min(len, unit_size - offset); + kaddr = folio_address(eb->folios[i]); memcpy(dst, kaddr + offset, cur); dst += cur; @@ -4029,24 +4215,29 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, void __user *dstv, unsigned long start, unsigned long len) { + const int unit_size = folio_size(eb->folios[0]); size_t cur; size_t offset; - struct page *page; - char *kaddr; char __user *dst = (char __user *)dstv; - unsigned long i = get_eb_page_index(start); + unsigned long i = get_eb_folio_index(eb, start); int ret = 0; WARN_ON(start > eb->len); WARN_ON(start + len > eb->start + eb->len); - offset = get_eb_offset_in_page(eb, start); + if (eb->addr) { + if (copy_to_user_nofault(dstv, eb->addr + start, len)) + ret = -EFAULT; + return ret; + } + + offset = get_eb_offset_in_folio(eb, start); while (len > 0) { - page = eb->pages[i]; + char *kaddr; - cur = min(len, (PAGE_SIZE - offset)); - kaddr = page_address(page); + cur = min(len, unit_size - offset); + kaddr = folio_address(eb->folios[i]); if (copy_to_user_nofault(dst, kaddr + offset, cur)) { ret = -EFAULT; break; @@ -4064,25 +4255,25 @@ int read_extent_buffer_to_user_nofault(const struct extent_buffer *eb, int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long start, unsigned long len) { + const int unit_size = folio_size(eb->folios[0]); size_t cur; size_t offset; - struct page *page; char *kaddr; char *ptr = (char *)ptrv; - unsigned long i = get_eb_page_index(start); + unsigned long i = get_eb_folio_index(eb, start); int ret = 0; if (check_eb_range(eb, start, len)) return -EINVAL; - offset = get_eb_offset_in_page(eb, start); - - while (len > 0) { - page = eb->pages[i]; + if (eb->addr) + return memcmp(ptrv, eb->addr + start, len); - cur = min(len, (PAGE_SIZE - offset)); + offset = get_eb_offset_in_folio(eb, start); - kaddr = page_address(page); + while (len > 0) { + cur = min(len, unit_size - offset); + kaddr = folio_address(eb->folios[i]); ret = memcmp(ptr, kaddr + offset, cur); if (ret) break; @@ -4101,10 +4292,12 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, * For regular sector size == PAGE_SIZE case, check if @page is uptodate. * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE. 
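read_extent_buffer() above now takes one of two routes: a single memcpy through eb->addr when the backing memory is contiguous, or a per-unit loop. The loop's arithmetic, modeled with a deliberately tiny unit size so the copy visibly crosses a boundary (a sketch, not the kernel routine):

#include <stdio.h>
#include <string.h>

#define UNIT_SIZE 8     /* tiny stand-in for folio_size() */

/* Copy `len` bytes at logical offset `start` out of unit-backed storage,
 * mirroring the loop in read_extent_buffer(). */
static void read_units(char **units, char *dst, size_t start, size_t len)
{
        size_t i = start / UNIT_SIZE;
        size_t offset = start % UNIT_SIZE;

        while (len > 0) {
                size_t cur = len < UNIT_SIZE - offset ? len : UNIT_SIZE - offset;

                memcpy(dst, units[i] + offset, cur);
                dst += cur;
                len -= cur;
                offset = 0;     /* later units are read from their start */
                i++;
        }
}

int main(void)
{
        char u0[UNIT_SIZE + 1] = "ABCDEFGH";
        char u1[UNIT_SIZE + 1] = "IJKLMNOP";
        char *units[] = { u0, u1 };
        char out[5] = { 0 };

        read_units(units, out, 6, 4);   /* crosses the u0/u1 boundary */
        printf("%s\n", out);            /* prints GHIJ */
        return 0;
}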
*/ -static void assert_eb_page_uptodate(const struct extent_buffer *eb, - struct page *page) +static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) { struct btrfs_fs_info *fs_info = eb->fs_info; + struct folio *folio = eb->folios[i]; + + ASSERT(folio); /* * If we are using the commit root we could potentially clear a page @@ -4118,11 +4311,14 @@ static void assert_eb_page_uptodate(const struct extent_buffer *eb, return; if (fs_info->nodesize < PAGE_SIZE) { - if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, page, + struct folio *folio = eb->folios[0]; + + ASSERT(i == 0); + if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, eb->start, eb->len))) - btrfs_subpage_dump_bitmap(fs_info, page, eb->start, eb->len); + btrfs_subpage_dump_bitmap(fs_info, folio, eb->start, eb->len); } else { - WARN_ON(!PageUptodate(page)); + WARN_ON(!folio_test_uptodate(folio)); } } @@ -4130,29 +4326,34 @@ static void __write_extent_buffer(const struct extent_buffer *eb, const void *srcv, unsigned long start, unsigned long len, bool use_memmove) { + const int unit_size = folio_size(eb->folios[0]); size_t cur; size_t offset; - struct page *page; char *kaddr; char *src = (char *)srcv; - unsigned long i = get_eb_page_index(start); + unsigned long i = get_eb_folio_index(eb, start); /* For unmapped (dummy) ebs, no need to check their uptodate status. */ const bool check_uptodate = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); - WARN_ON(test_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags)); - if (check_eb_range(eb, start, len)) return; - offset = get_eb_offset_in_page(eb, start); + if (eb->addr) { + if (use_memmove) + memmove(eb->addr + start, srcv, len); + else + memcpy(eb->addr + start, srcv, len); + return; + } + + offset = get_eb_offset_in_folio(eb, start); while (len > 0) { - page = eb->pages[i]; if (check_uptodate) - assert_eb_page_uptodate(eb, page); + assert_eb_folio_uptodate(eb, i); - cur = min(len, PAGE_SIZE - offset); - kaddr = page_address(page); + cur = min(len, unit_size - offset); + kaddr = folio_address(eb->folios[i]); if (use_memmove) memmove(kaddr + offset, src, cur); else @@ -4174,16 +4375,21 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, static void memset_extent_buffer(const struct extent_buffer *eb, int c, unsigned long start, unsigned long len) { + const int unit_size = folio_size(eb->folios[0]); unsigned long cur = start; + if (eb->addr) { + memset(eb->addr + start, c, len); + return; + } + while (cur < start + len) { - unsigned long index = get_eb_page_index(cur); - unsigned int offset = get_eb_offset_in_page(eb, cur); - unsigned int cur_len = min(start + len - cur, PAGE_SIZE - offset); - struct page *page = eb->pages[index]; + unsigned long index = get_eb_folio_index(eb, cur); + unsigned int offset = get_eb_offset_in_folio(eb, cur); + unsigned int cur_len = min(start + len - cur, unit_size - offset); - assert_eb_page_uptodate(eb, page); - memset(page_address(page) + offset, c, cur_len); + assert_eb_folio_uptodate(eb, index); + memset(folio_address(eb->folios[index]) + offset, c, cur_len); cur += cur_len; } @@ -4200,15 +4406,16 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, void copy_extent_buffer_full(const struct extent_buffer *dst, const struct extent_buffer *src) { + const int unit_size = folio_size(src->folios[0]); unsigned long cur = 0; ASSERT(dst->len == src->len); while (cur < src->len) { - unsigned long index = get_eb_page_index(cur); - unsigned long offset = get_eb_offset_in_page(src, cur); - unsigned 
long cur_len = min(src->len, PAGE_SIZE - offset); - void *addr = page_address(src->pages[index]) + offset; + unsigned long index = get_eb_folio_index(src, cur); + unsigned long offset = get_eb_offset_in_folio(src, cur); + unsigned long cur_len = min(src->len, unit_size - offset); + void *addr = folio_address(src->folios[index]) + offset; write_extent_buffer(dst, addr, cur, cur_len); @@ -4221,12 +4428,12 @@ void copy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { + const int unit_size = folio_size(dst->folios[0]); u64 dst_len = dst->len; size_t cur; size_t offset; - struct page *page; char *kaddr; - unsigned long i = get_eb_page_index(dst_offset); + unsigned long i = get_eb_folio_index(dst, dst_offset); if (check_eb_range(dst, dst_offset, len) || check_eb_range(src, src_offset, len)) @@ -4234,15 +4441,14 @@ void copy_extent_buffer(const struct extent_buffer *dst, WARN_ON(src->len != dst_len); - offset = get_eb_offset_in_page(dst, dst_offset); + offset = get_eb_offset_in_folio(dst, dst_offset); while (len > 0) { - page = dst->pages[i]; - assert_eb_page_uptodate(dst, page); + assert_eb_folio_uptodate(dst, i); - cur = min(len, (unsigned long)(PAGE_SIZE - offset)); + cur = min(len, (unsigned long)(unit_size - offset)); - kaddr = page_address(page); + kaddr = folio_address(dst->folios[i]); read_extent_buffer(src, kaddr + offset, src_offset, cur); src_offset += cur; @@ -4253,22 +4459,22 @@ void copy_extent_buffer(const struct extent_buffer *dst, } /* - * Calculate the page and offset of the byte containing the given bit number. + * Calculate the folio and offset of the byte containing the given bit number. * * @eb: the extent buffer * @start: offset of the bitmap item in the extent buffer * @nr: bit number - * @page_index: return index of the page in the extent buffer that contains + * @folio_index: return index of the folio in the extent buffer that contains * the given bit number - * @page_offset: return offset into the page given by page_index + * @folio_offset: return offset into the folio given by folio_index * * This helper hides the ugliness of finding the byte in an extent buffer which * contains a given bit. */ static inline void eb_bitmap_offset(const struct extent_buffer *eb, unsigned long start, unsigned long nr, - unsigned long *page_index, - size_t *page_offset) + unsigned long *folio_index, + size_t *folio_offset) { size_t byte_offset = BIT_BYTE(nr); size_t offset; @@ -4278,10 +4484,10 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, * the bitmap item in the extent buffer + the offset of the byte in the * bitmap item. 
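eb_bitmap_offset() described above is pure arithmetic: locate the byte holding bit nr, then split the absolute buffer offset into a folio index plus an offset inside that folio. A worked example with an assumed 4K unit and made-up offsets:

#include <stdio.h>

#define BITS_PER_BYTE 8
#define FOLIO_SHIFT 12                  /* assumed 4K units, i.e. folio_shift() */
#define FOLIO_SIZE (1UL << FOLIO_SHIFT)

int main(void)
{
        unsigned long eb_start_in_folio = 0; /* offset_in_folio(folio, eb->start) */
        unsigned long start = 4090;          /* bitmap item offset inside the eb */
        unsigned long nr = 100;              /* bit number */

        /* Same math as eb_bitmap_offset(): byte holding the bit, then split
         * the absolute offset into unit index + offset inside that unit. */
        unsigned long byte_offset = nr / BITS_PER_BYTE;
        unsigned long offset = eb_start_in_folio + start + byte_offset;
        unsigned long index = offset >> FOLIO_SHIFT;
        unsigned long in_folio = offset & (FOLIO_SIZE - 1);

        /* Bit 100 lands in the second folio here: index 1, byte 6. */
        printf("bit %lu -> folio %lu, byte %lu, mask 0x%02x\n",
               nr, index, in_folio, 1U << (nr % BITS_PER_BYTE));
        return 0;
}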
*/ - offset = start + offset_in_page(eb->start) + byte_offset; + offset = start + offset_in_folio(eb->folios[0], eb->start) + byte_offset; - *page_index = offset >> PAGE_SHIFT; - *page_offset = offset_in_page(offset); + *folio_index = offset >> folio_shift(eb->folios[0]); + *folio_offset = offset_in_folio(eb->folios[0], offset); } /* @@ -4294,25 +4500,23 @@ static inline void eb_bitmap_offset(const struct extent_buffer *eb, int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start, unsigned long nr) { - u8 *kaddr; - struct page *page; unsigned long i; size_t offset; + u8 *kaddr; eb_bitmap_offset(eb, start, nr, &i, &offset); - page = eb->pages[i]; - assert_eb_page_uptodate(eb, page); - kaddr = page_address(page); + assert_eb_folio_uptodate(eb, i); + kaddr = folio_address(eb->folios[i]); return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } static u8 *extent_buffer_get_byte(const struct extent_buffer *eb, unsigned long bytenr) { - unsigned long index = get_eb_page_index(bytenr); + unsigned long index = get_eb_folio_index(eb, bytenr); if (check_eb_range(eb, bytenr, 1)) return NULL; - return page_address(eb->pages[index]) + get_eb_offset_in_page(eb, bytenr); + return folio_address(eb->folios[index]) + get_eb_offset_in_folio(eb, bytenr); } /* @@ -4397,19 +4601,30 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { + const int unit_size = folio_size(dst->folios[0]); unsigned long cur_off = 0; if (check_eb_range(dst, dst_offset, len) || check_eb_range(dst, src_offset, len)) return; + if (dst->addr) { + const bool use_memmove = areas_overlap(src_offset, dst_offset, len); + + if (use_memmove) + memmove(dst->addr + dst_offset, dst->addr + src_offset, len); + else + memcpy(dst->addr + dst_offset, dst->addr + src_offset, len); + return; + } + while (cur_off < len) { unsigned long cur_src = cur_off + src_offset; - unsigned long pg_index = get_eb_page_index(cur_src); - unsigned long pg_off = get_eb_offset_in_page(dst, cur_src); + unsigned long folio_index = get_eb_folio_index(dst, cur_src); + unsigned long folio_off = get_eb_offset_in_folio(dst, cur_src); unsigned long cur_len = min(src_offset + len - cur_src, - PAGE_SIZE - pg_off); - void *src_addr = page_address(dst->pages[pg_index]) + pg_off; + unit_size - folio_off); + void *src_addr = folio_address(dst->folios[folio_index]) + folio_off; const bool use_memmove = areas_overlap(src_offset + cur_off, dst_offset + cur_off, cur_len); @@ -4435,24 +4650,29 @@ void memmove_extent_buffer(const struct extent_buffer *dst, return; } + if (dst->addr) { + memmove(dst->addr + dst_offset, dst->addr + src_offset, len); + return; + } + while (len > 0) { unsigned long src_i; size_t cur; - size_t dst_off_in_page; - size_t src_off_in_page; + size_t dst_off_in_folio; + size_t src_off_in_folio; void *src_addr; bool use_memmove; - src_i = get_eb_page_index(src_end); + src_i = get_eb_folio_index(dst, src_end); - dst_off_in_page = get_eb_offset_in_page(dst, dst_end); - src_off_in_page = get_eb_offset_in_page(dst, src_end); + dst_off_in_folio = get_eb_offset_in_folio(dst, dst_end); + src_off_in_folio = get_eb_offset_in_folio(dst, src_end); - cur = min_t(unsigned long, len, src_off_in_page + 1); - cur = min(cur, dst_off_in_page + 1); + cur = min_t(unsigned long, len, src_off_in_folio + 1); + cur = min(cur, dst_off_in_folio + 1); - src_addr = page_address(dst->pages[src_i]) + src_off_in_page - - cur + 1; + src_addr = folio_address(dst->folios[src_i]) + src_off_in_folio 
- + cur + 1; use_memmove = areas_overlap(src_end - cur + 1, dst_end - cur + 1, cur); @@ -4514,7 +4734,7 @@ static int try_release_subpage_extent_buffer(struct page *page) struct extent_buffer *eb = NULL; /* - * Unlike try_release_extent_buffer() which uses page->private + * Unlike try_release_extent_buffer() which uses folio private * to grab buffer, for subpage case we rely on radix tree, thus * we need to ensure radix tree consistency. * @@ -4554,43 +4774,44 @@ static int try_release_subpage_extent_buffer(struct page *page) /* * Here we don't care about the return value, we will always - * check the page private at the end. And + * check the folio private at the end. And * release_extent_buffer() will release the refs_lock. */ release_extent_buffer(eb); } /* - * Finally to check if we have cleared page private, as if we have - * released all ebs in the page, the page private should be cleared now. + * Finally to check if we have cleared folio private, as if we have + * released all ebs in the page, the folio private should be cleared now. */ - spin_lock(&page->mapping->private_lock); - if (!PagePrivate(page)) + spin_lock(&page->mapping->i_private_lock); + if (!folio_test_private(page_folio(page))) ret = 1; else ret = 0; - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return ret; } int try_release_extent_buffer(struct page *page) { + struct folio *folio = page_folio(page); struct extent_buffer *eb; if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE) return try_release_subpage_extent_buffer(page); /* - * We need to make sure nobody is changing page->private, as we rely on - * page->private as the pointer to extent buffer. + * We need to make sure nobody is changing folio private, as we rely on + * folio private as the pointer to extent buffer. */ - spin_lock(&page->mapping->private_lock); - if (!PagePrivate(page)) { - spin_unlock(&page->mapping->private_lock); + spin_lock(&page->mapping->i_private_lock); + if (!folio_test_private(folio)) { + spin_unlock(&page->mapping->i_private_lock); return 1; } - eb = (struct extent_buffer *)page->private; + eb = folio_get_private(folio); BUG_ON(!eb); /* @@ -4601,10 +4822,10 @@ int try_release_extent_buffer(struct page *page) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); return 0; } - spin_unlock(&page->mapping->private_lock); + spin_unlock(&page->mapping->i_private_lock); /* * If tree ref isn't set then we know the ref on this eb is a real ref, diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 2171057a4477..46050500529b 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -28,7 +28,8 @@ enum { EXTENT_BUFFER_IN_TREE, /* write IO error */ EXTENT_BUFFER_WRITE_ERR, - EXTENT_BUFFER_NO_CHECK, + /* Indicate the extent buffer is written zeroed out (for zoned) */ + EXTENT_BUFFER_ZONED_ZEROOUT, /* Indicate that extent buffer pages a being read */ EXTENT_BUFFER_READING, }; @@ -43,10 +44,10 @@ enum { }; /* - * page->private values. Every page that is controlled by the extent - * map has page->private set to one. + * Folio private values. Every page that is controlled by the extent map has + * folio private set to this value. 
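memmove_extent_buffer() above has to walk backwards when the destination range overlaps the source at a higher offset, clamping each step so neither end crosses a unit boundary. A compact userspace model of that backwards loop (assumed 8-byte units; plain memmove stands in for the per-step overlap handling):

#include <stdio.h>
#include <string.h>

#define UNIT_SIZE 8

static void move_units(char **units, size_t dst, size_t src, size_t len)
{
        size_t dst_end = dst + len - 1;
        size_t src_end = src + len - 1;

        while (len > 0) {
                size_t src_off = src_end % UNIT_SIZE;
                size_t dst_off = dst_end % UNIT_SIZE;
                size_t cur = len;

                /* Never cross a unit boundary on either side in one step. */
                if (cur > src_off + 1)
                        cur = src_off + 1;
                if (cur > dst_off + 1)
                        cur = dst_off + 1;

                memmove(units[dst_end / UNIT_SIZE] + dst_off - cur + 1,
                        units[src_end / UNIT_SIZE] + src_off - cur + 1, cur);
                dst_end -= cur;
                src_end -= cur;
                len -= cur;
        }
}

int main(void)
{
        char buf[16] = "ABCDEFGHIJKLMNO";
        char *units[] = { buf, buf + UNIT_SIZE };

        move_units(units, 2, 0, 10);    /* overlapping, dst > src */
        printf("%.15s\n", buf);         /* prints ABABCDEFGHIJMNO */
        return 0;
}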
*/ -#define EXTENT_PAGE_PRIVATE 1 +#define EXTENT_FOLIO_PRIVATE 1 /* * The extent buffer bitmap operations are done with byte granularity instead of * @@ -77,6 +78,13 @@ struct extent_buffer { unsigned long len; unsigned long bflags; struct btrfs_fs_info *fs_info; + + /* + * The address where the eb can be accessed without any cross-page handling. + * This can be NULL if not possible. + */ + void *addr; + spinlock_t refs_lock; atomic_t refs; int read_mirror; @@ -86,7 +94,12 @@ struct extent_buffer { struct rw_semaphore lock; - struct page *pages[INLINE_EXTENT_BUFFER_PAGES]; + /* + * Pointers to all the folios of the extent buffer. + * + * For now the folio is always order 0 (aka, a single page). + */ + struct folio *folios[INLINE_EXTENT_BUFFER_PAGES]; #ifdef CONFIG_BTRFS_DEBUG struct list_head leak_list; pid_t lock_owner; @@ -108,29 +121,43 @@ struct btrfs_eb_write_context { * * Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases. */ -static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb, - unsigned long offset) +static inline size_t get_eb_offset_in_folio(const struct extent_buffer *eb, + unsigned long offset) { /* - * For sectorsize == PAGE_SIZE case, eb->start will always be aligned - * to PAGE_SIZE, thus adding it won't cause any difference. + * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case + * 1.1) One large folio covering the whole eb + * The eb->start is aligned to folio size, thus adding it + * won't cause any difference. + * 1.2) Several page sized folios + * The eb->start is aligned to folio (page) size, thus + * adding it won't cause any difference. * - * For sectorsize < PAGE_SIZE, we must only read the data that belongs - * to the eb, thus we have to take the eb->start into consideration. + * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case + * In this case there would only be one page sized folio, and there + * may be several different extent buffers in the page/folio. + * We need to add eb->start to properly access the offset inside + * that eb. */ - return offset_in_page(offset + eb->start); + return offset_in_folio(eb->folios[0], offset + eb->start); } -static inline unsigned long get_eb_page_index(unsigned long offset) +static inline unsigned long get_eb_folio_index(const struct extent_buffer *eb, + unsigned long offset) { /* - * For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough. + * 1) sectorsize == PAGE_SIZE and nodesize >= PAGE_SIZE case + * 1.1) One large folio covering the whole eb. + * The folio_shift would be large enough to always make us + * return 0 as index. + * 1.2) Several page sized folios + * The folio_shift() would be PAGE_SHIFT, giving us the correct + * index. * - * For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE, - * and have ensured that all tree blocks are contained in one page, - * thus we always get index == 0. + * 2) sectorsize < PAGE_SIZE and nodesize < PAGE_SIZE case + * The folio would only be page sized, and always give us 0 as index. */ - return offset >> PAGE_SHIFT; + return offset >> folio_shift(eb->folios[0]); } /* @@ -230,6 +257,20 @@ static inline int num_extent_pages(const struct extent_buffer *eb) return (eb->len >> PAGE_SHIFT) ?: 1; } +/* + * This can only be determined at runtime by checking eb::folios[0]. + * + * We can have either one large folio covering the whole eb + * (either nodesize <= PAGE_SIZE, or a high order folio), or multiple + * single-page folios. 
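The two helpers above differ in how they treat eb->start: the offset helper must fold it in because, in the subpage case, several extent buffers share one folio, while the index helper degenerates to 0 whenever a single folio covers the eb. A numeric illustration with an assumed 64K folio and a hypothetical bytenr:

#include <stdio.h>

#define FOLIO_SHIFT 16                  /* assumed 64K folio, the subpage setup */
#define FOLIO_SIZE (1UL << FOLIO_SHIFT)

int main(void)
{
        /* Hypothetical 16K node placed in the middle of a 64K folio. */
        unsigned long eb_start = 0x34000;       /* made-up bytenr, 16K aligned */
        unsigned long offset = 100;             /* logical offset inside the eb */

        /* get_eb_offset_in_folio(): fold in eb->start, since other ebs may
         * occupy the earlier parts of the same folio. */
        unsigned long in_folio = (offset + eb_start) & (FOLIO_SIZE - 1);
        /* get_eb_folio_index(): one folio covers the eb, so always 0. */
        unsigned long index = offset >> FOLIO_SHIFT;

        printf("index %lu, offset in folio %lu\n", index, in_folio); /* 0, 16484 */
        return 0;
}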
+ */ +static inline int num_extent_folios(const struct extent_buffer *eb) +{ + if (folio_order(eb->folios[0])) + return 1; + return num_extent_pages(eb); +} + static inline int extent_buffer_uptodate(const struct extent_buffer *eb) { return test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); @@ -294,7 +335,8 @@ int extent_invalidate_folio(struct extent_io_tree *tree, void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *buf); -int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array); +int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array, + gfp_t extra_gfp); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS bool find_lock_delalloc_range(struct inode *inode, diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index a6d8368ed0ed..b61099bf97a8 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -50,7 +50,6 @@ struct extent_map *alloc_extent_map(void) if (!em) return NULL; RB_CLEAR_NODE(&em->rb_node); - em->compress_type = BTRFS_COMPRESS_NONE; refcount_set(&em->refs, 1); INIT_LIST_HEAD(&em->list); return em; @@ -67,8 +66,6 @@ void free_extent_map(struct extent_map *em) if (refcount_dec_and_test(&em->refs)) { WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); - if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) - kfree(em->map_lookup); kmem_cache_free(extent_map_cache, em); } } @@ -182,50 +179,50 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, return NULL; } -/* Check to see if two extent_map structs are adjacent and safe to merge. */ -static int mergable_maps(struct extent_map *prev, struct extent_map *next) +static inline u64 extent_map_block_end(const struct extent_map *em) { - if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) - return 0; + if (em->block_start + em->block_len < em->block_start) + return (u64)-1; + return em->block_start + em->block_len; +} - /* - * don't merge compressed extents, we need to know their - * actual size - */ - if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) - return 0; +static bool can_merge_extent_map(const struct extent_map *em) +{ + if (em->flags & EXTENT_FLAG_PINNED) + return false; + + /* Don't merge compressed extents, we need to know their actual size. */ + if (extent_map_is_compressed(em)) + return false; - if (test_bit(EXTENT_FLAG_LOGGING, &prev->flags) || - test_bit(EXTENT_FLAG_LOGGING, &next->flags)) - return 0; + if (em->flags & EXTENT_FLAG_LOGGING) + return false; /* * We don't want to merge stuff that hasn't been written to the log yet * since it may not reflect exactly what is on disk, and that would be * bad. */ - if (!list_empty(&prev->list) || !list_empty(&next->list)) - return 0; - - ASSERT(next->block_start != EXTENT_MAP_DELALLOC && - prev->block_start != EXTENT_MAP_DELALLOC); - - if (prev->map_lookup || next->map_lookup) - ASSERT(test_bit(EXTENT_FLAG_FS_MAPPING, &prev->flags) && - test_bit(EXTENT_FLAG_FS_MAPPING, &next->flags)); - - if (extent_map_end(prev) == next->start && - prev->flags == next->flags && - prev->map_lookup == next->map_lookup && - ((next->block_start == EXTENT_MAP_HOLE && - prev->block_start == EXTENT_MAP_HOLE) || - (next->block_start == EXTENT_MAP_INLINE && - prev->block_start == EXTENT_MAP_INLINE) || - (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && - next->block_start == extent_map_block_end(prev)))) { - return 1; - } - return 0; + if (!list_empty(&em->list)) + return false; + + return true; +} + +/* Check to see if two extent_map structs are adjacent and safe to merge. 
*/ +static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next) +{ + if (extent_map_end(prev) != next->start) + return false; + + if (prev->flags != next->flags) + return false; + + if (next->block_start < EXTENT_MAP_LAST_BYTE - 1) + return next->block_start == extent_map_block_end(prev); + + /* HOLES and INLINE extents. */ + return next->block_start == prev->block_start; } static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) @@ -244,11 +241,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) if (refcount_read(&em->refs) > 2) return; + if (!can_merge_extent_map(em)) + return; + if (em->start != 0) { rb = rb_prev(&em->rb_node); if (rb) merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && mergable_maps(merge, em)) { + if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) { em->start = merge->start; em->orig_start = merge->orig_start; em->len += merge->len; @@ -257,7 +257,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) em->mod_len = (em->mod_len + em->mod_start) - merge->mod_start; em->mod_start = merge->mod_start; em->generation = max(em->generation, merge->generation); - set_bit(EXTENT_FLAG_MERGED, &em->flags); + em->flags |= EXTENT_FLAG_MERGED; rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); @@ -268,14 +268,14 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) rb = rb_next(&em->rb_node); if (rb) merge = rb_entry(rb, struct extent_map, rb_node); - if (rb && mergable_maps(em, merge)) { + if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { em->len += merge->len; em->block_len += merge->block_len; rb_erase_cached(&merge->rb_node, &tree->map); RB_CLEAR_NODE(&merge->rb_node); em->mod_len = (merge->mod_start + merge->mod_len) - em->mod_start; em->generation = max(em->generation, merge->generation); - set_bit(EXTENT_FLAG_MERGED, &em->flags); + em->flags |= EXTENT_FLAG_MERGED; free_extent_map(merge); } } @@ -283,7 +283,7 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) /* * Unpin an extent from the cache. * - * @tree: tree to unpin the extent in + * @inode: the inode from which we are unpinning an extent range * @start: logical offset in the file * @len: length of the extent * @gen: generation that this extent has been modified in @@ -292,9 +292,10 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em) * to the generation that actually added the file item to the inode so we know * we need to sync this extent when we call fsync(). 
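The refactor above replaces one monolithic mergable_maps() with a per-map gate (can_merge_extent_map()) and a pairwise adjacency test (mergeable_maps()), so try_merge_map() can bail out early on the map itself. A stripped-down model of the two predicates (pinned/logging and block adjacency only; the compressed, hole and inline cases are omitted):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define FLAG_PINNED  (1U << 0)
#define FLAG_LOGGING (1U << 1)

struct em { uint64_t start, len, block_start, block_len; uint32_t flags; };

/* Per-map check: may this map take part in any merge at all? */
static bool can_merge(const struct em *e)
{
        return !(e->flags & (FLAG_PINNED | FLAG_LOGGING));
}

/* Pairwise check: are prev/next logically and physically adjacent? */
static bool mergeable(const struct em *prev, const struct em *next)
{
        if (prev->start + prev->len != next->start)
                return false;
        if (prev->flags != next->flags)
                return false;
        return next->block_start == prev->block_start + prev->block_len;
}

int main(void)
{
        struct em a = { .start = 0,  .len = 16, .block_start = 100, .block_len = 16 };
        struct em b = { .start = 16, .len = 16, .block_start = 116, .block_len = 16 };

        printf("merge: %d\n", can_merge(&a) && can_merge(&b) && mergeable(&a, &b));
        return 0;
}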
*/ -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, - u64 gen) +int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) { + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_map_tree *tree = &inode->extent_tree; int ret = 0; struct extent_map *em; bool prealloc = false; @@ -302,19 +303,28 @@ int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, write_lock(&tree->lock); em = lookup_extent_mapping(tree, start, len); - WARN_ON(!em || em->start != start); - - if (!em) + if (WARN_ON(!em)) { + btrfs_warn(fs_info, +"no extent map found for inode %llu (root %lld) when unpinning extent range [%llu, %llu), generation %llu", + btrfs_ino(inode), btrfs_root_id(inode->root), + start, len, gen); goto out; + } + + if (WARN_ON(em->start != start)) + btrfs_warn(fs_info, +"found extent map for inode %llu (root %lld) with unexpected start offset %llu when unpinning extent range [%llu, %llu), generation %llu", + btrfs_ino(inode), btrfs_root_id(inode->root), + em->start, start, len, gen); em->generation = gen; - clear_bit(EXTENT_FLAG_PINNED, &em->flags); + em->flags &= ~EXTENT_FLAG_PINNED; em->mod_start = em->start; em->mod_len = em->len; - if (test_bit(EXTENT_FLAG_FILLING, &em->flags)) { + if (em->flags & EXTENT_FLAG_FILLING) { prealloc = true; - clear_bit(EXTENT_FLAG_FILLING, &em->flags); + em->flags &= ~EXTENT_FLAG_FILLING; } try_merge_map(tree, em); @@ -335,7 +345,7 @@ void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em) { lockdep_assert_held_write(&tree->lock); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + em->flags &= ~EXTENT_FLAG_LOGGING; if (extent_map_in_tree(em)) try_merge_map(tree, em); } @@ -348,45 +358,14 @@ static inline void setup_extent_mapping(struct extent_map_tree *tree, em->mod_start = em->start; em->mod_len = em->len; + ASSERT(list_empty(&em->list)); + if (modified) - list_move(&em->list, &tree->modified_extents); + list_add(&em->list, &tree->modified_extents); else try_merge_map(tree, em); } -static void extent_map_device_set_bits(struct extent_map *em, unsigned bits) -{ - struct map_lookup *map = em->map_lookup; - u64 stripe_size = em->orig_block_len; - int i; - - for (i = 0; i < map->num_stripes; i++) { - struct btrfs_io_stripe *stripe = &map->stripes[i]; - struct btrfs_device *device = stripe->dev; - - set_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + stripe_size - 1, - bits | EXTENT_NOWAIT, NULL); - } -} - -static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) -{ - struct map_lookup *map = em->map_lookup; - u64 stripe_size = em->orig_block_len; - int i; - - for (i = 0; i < map->num_stripes; i++) { - struct btrfs_io_stripe *stripe = &map->stripes[i]; - struct btrfs_device *device = stripe->dev; - - __clear_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + stripe_size - 1, - bits | EXTENT_NOWAIT, - NULL, NULL); - } -} - /* * Add new extent map to the extent tree * @@ -400,8 +379,8 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits) * into the tree directly, with an additional reference taken, or a * reference dropped if the merge attempt was successful. 
*/ -int add_extent_mapping(struct extent_map_tree *tree, - struct extent_map *em, int modified) +static int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em, int modified) { int ret = 0; @@ -412,10 +391,6 @@ int add_extent_mapping(struct extent_map_tree *tree, goto out; setup_extent_mapping(tree, em, modified); - if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) { - extent_map_device_set_bits(em, CHUNK_ALLOCATED); - extent_map_device_clear_bits(em, CHUNK_TRIMMED); - } out: return ret; } @@ -495,12 +470,10 @@ void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) { lockdep_assert_held_write(&tree->lock); - WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + WARN_ON(em->flags & EXTENT_FLAG_PINNED); rb_erase_cached(&em->rb_node, &tree->map); - if (!test_bit(EXTENT_FLAG_LOGGING, &em->flags)) + if (!(em->flags & EXTENT_FLAG_LOGGING)) list_del_init(&em->list); - if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) - extent_map_device_clear_bits(em, CHUNK_ALLOCATED); RB_CLEAR_NODE(&em->rb_node); } @@ -511,9 +484,9 @@ static void replace_extent_mapping(struct extent_map_tree *tree, { lockdep_assert_held_write(&tree->lock); - WARN_ON(test_bit(EXTENT_FLAG_PINNED, &cur->flags)); + WARN_ON(cur->flags & EXTENT_FLAG_PINNED); ASSERT(extent_map_in_tree(cur)); - if (!test_bit(EXTENT_FLAG_LOGGING, &cur->flags)) + if (!(cur->flags & EXTENT_FLAG_LOGGING)) list_del_init(&cur->list); rb_replace_node_cached(&cur->rb_node, &new->rb_node, &tree->map); RB_CLEAR_NODE(&cur->rb_node); @@ -576,7 +549,7 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree, em->start = start; em->len = end - start; if (em->block_start < EXTENT_MAP_LAST_BYTE && - !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + !extent_map_is_compressed(em)) { em->block_start += start_diff; em->block_len = em->len; } @@ -626,8 +599,6 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info, if (ret == -EEXIST) { struct extent_map *existing; - ret = 0; - existing = search_extent_mapping(em_tree, start, len); trace_btrfs_handle_em_exist(fs_info, existing, em, start, len); @@ -681,8 +652,7 @@ static void drop_all_extent_maps_fast(struct extent_map_tree *tree) node = rb_first_cached(&tree->map); em = rb_entry(node, struct extent_map, rb_node); - clear_bit(EXTENT_FLAG_PINNED, &em->flags); - clear_bit(EXTENT_FLAG_LOGGING, &em->flags); + em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); remove_extent_mapping(tree, em); free_extent_map(em); cond_resched_rwlock_write(&tree->lock); @@ -758,19 +728,18 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, } } - if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { + if (skip_pinned && (em->flags & EXTENT_FLAG_PINNED)) { start = em_end; goto next; } flags = em->flags; - clear_bit(EXTENT_FLAG_PINNED, &em->flags); /* * In case we split the extent map, we want to preserve the * EXTENT_FLAG_LOGGING flag on our extent map, but we don't want * it on the new extent maps. 
*/ - clear_bit(EXTENT_FLAG_LOGGING, &flags); + em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); modified = !list_empty(&em->list); /* @@ -781,7 +750,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, goto remove_em; gen = em->generation; - compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + compressed = extent_map_is_compressed(em); if (em->start < start) { if (!split) { @@ -814,7 +783,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->generation = gen; split->flags = flags; - split->compress_type = em->compress_type; replace_extent_mapping(em_tree, em, split, modified); free_extent_map(split); split = split2; @@ -831,7 +799,6 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->len = em_end - end; split->block_start = em->block_start; split->flags = flags; - split->compress_type = em->compress_type; split->generation = gen; if (em->block_start < EXTENT_MAP_LAST_BYTE) { @@ -997,14 +964,14 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, } ASSERT(em->len == len); - ASSERT(!test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)); + ASSERT(!extent_map_is_compressed(em)); ASSERT(em->block_start < EXTENT_MAP_LAST_BYTE); - ASSERT(test_bit(EXTENT_FLAG_PINNED, &em->flags)); - ASSERT(!test_bit(EXTENT_FLAG_LOGGING, &em->flags)); + ASSERT(em->flags & EXTENT_FLAG_PINNED); + ASSERT(!(em->flags & EXTENT_FLAG_LOGGING)); ASSERT(!list_empty(&em->list)); flags = em->flags; - clear_bit(EXTENT_FLAG_PINNED, &em->flags); + em->flags &= ~EXTENT_FLAG_PINNED; /* First, replace the em with a new extent_map starting from * em->start */ split_pre->start = em->start; @@ -1015,7 +982,6 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, split_pre->orig_block_len = split_pre->block_len; split_pre->ram_bytes = split_pre->len; split_pre->flags = flags; - split_pre->compress_type = em->compress_type; split_pre->generation = em->generation; replace_extent_mapping(em_tree, em, split_pre, 1); @@ -1034,7 +1000,6 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, split_mid->orig_block_len = split_mid->block_len; split_mid->ram_bytes = split_mid->len; split_mid->flags = flags; - split_mid->compress_type = em->compress_type; split_mid->generation = em->generation; add_extent_mapping(em_tree, split_mid, 1); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index 35d27c756e08..e380fc08bbe4 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -5,30 +5,33 @@ #include <linux/rbtree.h> #include <linux/refcount.h> +#include "compression.h" #define EXTENT_MAP_LAST_BYTE ((u64)-4) #define EXTENT_MAP_HOLE ((u64)-3) #define EXTENT_MAP_INLINE ((u64)-2) -/* used only during fiemap calls */ -#define EXTENT_MAP_DELALLOC ((u64)-1) /* bits for the extent_map::flags field */ enum { /* this entry not yet on disk, don't free it */ - EXTENT_FLAG_PINNED, - EXTENT_FLAG_COMPRESSED, + ENUM_BIT(EXTENT_FLAG_PINNED), + ENUM_BIT(EXTENT_FLAG_COMPRESS_ZLIB), + ENUM_BIT(EXTENT_FLAG_COMPRESS_LZO), + ENUM_BIT(EXTENT_FLAG_COMPRESS_ZSTD), /* pre-allocated extent */ - EXTENT_FLAG_PREALLOC, + ENUM_BIT(EXTENT_FLAG_PREALLOC), /* Logging this extent */ - EXTENT_FLAG_LOGGING, + ENUM_BIT(EXTENT_FLAG_LOGGING), /* Filling in a preallocated extent */ - EXTENT_FLAG_FILLING, - /* filesystem extent mapping type */ - EXTENT_FLAG_FS_MAPPING, + ENUM_BIT(EXTENT_FLAG_FILLING), /* This em is merged from two or more physically adjacent ems */ - 
EXTENT_FLAG_MERGED, + ENUM_BIT(EXTENT_FLAG_MERGED), }; +/* + * Keep this structure as compact as possible, as we can have really large + * amounts of allocated extent maps at any time. + */ struct extent_map { struct rb_node rb_node; @@ -49,11 +52,8 @@ struct extent_map { * For non-merged extents, it's from btrfs_file_extent_item::generation. */ u64 generation; - unsigned long flags; - /* Used for chunk mappings, flag EXTENT_FLAG_FS_MAPPING must be set */ - struct map_lookup *map_lookup; + u32 flags; refcount_t refs; - unsigned int compress_type; struct list_head list; }; @@ -65,30 +65,57 @@ struct extent_map_tree { struct btrfs_inode; +static inline void extent_map_set_compression(struct extent_map *em, + enum btrfs_compression_type type) +{ + if (type == BTRFS_COMPRESS_ZLIB) + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; + else if (type == BTRFS_COMPRESS_LZO) + em->flags |= EXTENT_FLAG_COMPRESS_LZO; + else if (type == BTRFS_COMPRESS_ZSTD) + em->flags |= EXTENT_FLAG_COMPRESS_ZSTD; +} + +static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em) +{ + if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB) + return BTRFS_COMPRESS_ZLIB; + + if (em->flags & EXTENT_FLAG_COMPRESS_LZO) + return BTRFS_COMPRESS_LZO; + + if (em->flags & EXTENT_FLAG_COMPRESS_ZSTD) + return BTRFS_COMPRESS_ZSTD; + + return BTRFS_COMPRESS_NONE; +} + +/* + * More efficient way to determine if extent is compressed, instead of using + * 'extent_map_compression() != BTRFS_COMPRESS_NONE'. + */ +static inline bool extent_map_is_compressed(const struct extent_map *em) +{ + return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB | + EXTENT_FLAG_COMPRESS_LZO | + EXTENT_FLAG_COMPRESS_ZSTD)) != 0; +} + static inline int extent_map_in_tree(const struct extent_map *em) { return !RB_EMPTY_NODE(&em->rb_node); } -static inline u64 extent_map_end(struct extent_map *em) +static inline u64 extent_map_end(const struct extent_map *em) { if (em->start + em->len < em->start) return (u64)-1; return em->start + em->len; } -static inline u64 extent_map_block_end(struct extent_map *em) -{ - if (em->block_start + em->block_len < em->block_start) - return (u64)-1; - return em->block_start + em->block_len; -} - void extent_map_tree_init(struct extent_map_tree *tree); struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); -int add_extent_mapping(struct extent_map_tree *tree, - struct extent_map *em, int modified); void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, u64 new_logical); @@ -97,7 +124,7 @@ struct extent_map *alloc_extent_map(void); void free_extent_map(struct extent_map *em); int __init extent_map_init(void); void __cold extent_map_exit(void); -int unpin_extent_cache(struct extent_map_tree *tree, u64 start, u64 len, u64 gen); +int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen); void clear_em_logging(struct extent_map_tree *tree, struct extent_map *em); struct extent_map *search_extent_mapping(struct extent_map_tree *tree, u64 start, u64 len); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 45cae356e89b..81ac1d474bf1 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -59,7 +59,7 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz goto out_unlock; } - ret = find_contiguous_extent_bit(&inode->file_extent_tree, 0, &start, + ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start, &end, 
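Two changes above work together: ENUM_BIT() turns each enumerator into a mask so flags become plain bit operations on a u32, and the compression algorithm is encoded in three of those bits instead of a separate compress_type field, shrinking struct extent_map. A self-contained sketch (the ENUM_BIT definition is modeled on fs/btrfs/misc.h; the rest is a toy):

#include <stdio.h>

/* ENUM_BIT(): the hidden _BIT/_SEQ enumerators keep the enum counter
 * advancing one position while the visible name is a single-bit mask. */
#define ENUM_BIT(name)                                  \
        __ ## name ## _BIT,                             \
        name = (1U << __ ## name ## _BIT),              \
        __ ## name ## _SEQ = __ ## name ## _BIT

enum {
        ENUM_BIT(FLAG_PINNED),
        ENUM_BIT(FLAG_COMPRESS_ZLIB),
        ENUM_BIT(FLAG_COMPRESS_LZO),
        ENUM_BIT(FLAG_COMPRESS_ZSTD),
};

#define FLAG_COMPRESS_MASK \
        (FLAG_COMPRESS_ZLIB | FLAG_COMPRESS_LZO | FLAG_COMPRESS_ZSTD)

enum ctype { C_NONE, C_ZLIB, C_LZO, C_ZSTD };

/* Mirrors extent_map_set_compression(): one bit per algorithm. */
static void set_compression(unsigned int *flags, enum ctype t)
{
        if (t == C_ZLIB)
                *flags |= FLAG_COMPRESS_ZLIB;
        else if (t == C_LZO)
                *flags |= FLAG_COMPRESS_LZO;
        else if (t == C_ZSTD)
                *flags |= FLAG_COMPRESS_ZSTD;
}

int main(void)
{
        unsigned int flags = 0;

        set_compression(&flags, C_ZSTD);
        flags |= FLAG_PINNED;
        flags &= ~FLAG_PINNED;          /* plain ops replace set_bit/clear_bit */

        /* One mask test answers "compressed at all?" cheaply, as in
         * extent_map_is_compressed(). */
        printf("flags=0x%x compressed=%d\n", flags,
               !!(flags & FLAG_COMPRESS_MASK));
        return 0;
}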
EXTENT_DIRTY); if (!ret && start == 0) i_size = min(i_size, end + 1); @@ -94,7 +94,7 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) return 0; - return set_extent_bit(&inode->file_extent_tree, start, start + len - 1, + return set_extent_bit(inode->file_extent_tree, start, start + len - 1, EXTENT_DIRTY, NULL); } @@ -123,7 +123,7 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, if (btrfs_fs_incompat(inode->root->fs_info, NO_HOLES)) return 0; - return clear_extent_bit(&inode->file_extent_tree, start, + return clear_extent_bit(inode->file_extent_tree, start, start + len - 1, EXTENT_DIRTY, NULL); } @@ -1294,8 +1294,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, return; } if (compress_type != BTRFS_COMPRESS_NONE) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; + extent_map_set_compression(em, compress_type); em->block_start = bytenr; em->block_len = em->orig_block_len; } else { @@ -1303,7 +1302,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, em->block_start = bytenr; em->block_len = em->len; if (type == BTRFS_FILE_EXTENT_PREALLOC) - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + em->flags |= EXTENT_FLAG_PREALLOC; } } else if (type == BTRFS_FILE_EXTENT_INLINE) { em->block_start = EXTENT_MAP_INLINE; @@ -1315,9 +1314,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, */ em->orig_start = EXTENT_MAP_HOLE; em->block_len = (u64)-1; - em->compress_type = compress_type; - if (compress_type != BTRFS_COMPRESS_NONE) - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + extent_map_set_compression(em, compress_type); } else { btrfs_err(fs_info, "unknown file extent item type %d, inode %llu, offset %llu, " diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index f47731c45bb5..38dfcac47609 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -111,8 +111,8 @@ static void btrfs_drop_pages(struct btrfs_fs_info *fs_info, * accessed as prepare_pages should have marked them accessed * in prepare_pages via find_or_create_page() */ - btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start, - block_len); + btrfs_folio_clamp_clear_checked(fs_info, page_folio(pages[i]), + block_start, block_len); unlock_page(pages[i]); put_page(pages[i]); } @@ -168,9 +168,12 @@ int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages, for (i = 0; i < num_pages; i++) { struct page *p = pages[i]; - btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes); - btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes); - btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes); + btrfs_folio_clamp_set_uptodate(fs_info, page_folio(p), + start_pos, num_bytes); + btrfs_folio_clamp_clear_checked(fs_info, page_folio(p), + start_pos, num_bytes); + btrfs_folio_clamp_set_dirty(fs_info, page_folio(p), + start_pos, num_bytes); } /* @@ -869,9 +872,9 @@ static int prepare_uptodate_page(struct inode *inode, * released. * * The private flag check is essential for subpage as we need - * to store extra bitmap using page->private. + * to store extra bitmap using folio private. 
*/ - if (page->mapping != inode->i_mapping || !PagePrivate(page)) { + if (page->mapping != inode->i_mapping || !folio_test_private(folio)) { unlock_page(page); return -EAGAIN; } @@ -2150,7 +2153,6 @@ out: hole_em->block_start = EXTENT_MAP_HOLE; hole_em->block_len = 0; hole_em->orig_block_len = 0; - hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = trans->transid; ret = btrfs_replace_extent_map_range(inode, hole_em, true); @@ -2839,7 +2841,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, if (em->block_start == EXTENT_MAP_HOLE) ret = RANGE_BOUNDARY_HOLE; - else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + else if (em->flags & EXTENT_FLAG_PREALLOC) ret = RANGE_BOUNDARY_PREALLOC_EXTENT; else ret = RANGE_BOUNDARY_WRITTEN_EXTENT; @@ -2879,8 +2881,7 @@ static int btrfs_zero_range(struct inode *inode, * extents and holes, we drop all the existing extents and allocate a * new prealloc extent, so that we get a larger contiguous disk extent. */ - if (em->start <= alloc_start && - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + if (em->start <= alloc_start && (em->flags & EXTENT_FLAG_PREALLOC)) { const u64 em_end = em->start + em->len; if (em_end >= offset + len) { @@ -2915,7 +2916,7 @@ static int btrfs_zero_range(struct inode *inode, goto out; } - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + if (em->flags & EXTENT_FLAG_PREALLOC) { free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); @@ -3136,7 +3137,7 @@ static long btrfs_fallocate(struct file *file, int mode, last_byte = ALIGN(last_byte, blocksize); if (em->block_start == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && - !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) { + !(em->flags & EXTENT_FLAG_PREALLOC))) { const u64 range_len = last_byte - cur_offset; ret = add_falloc_range(&reserve_list, cur_offset, range_len); @@ -3192,7 +3193,7 @@ static long btrfs_fallocate(struct file *file, int mode, qgroup_reserved -= range->len; } else if (qgroup_reserved > 0) { btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved, - range->start, range->len); + range->start, range->len, NULL); qgroup_reserved -= range->len; } list_del(&range->list); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 6f93c9a2c3e3..d372c7ce0e6b 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -439,8 +439,8 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) for (i = 0; i < io_ctl->num_pages; i++) { if (io_ctl->pages[i]) { - btrfs_page_clear_checked(io_ctl->fs_info, - io_ctl->pages[i], + btrfs_folio_clear_checked(io_ctl->fs_info, + page_folio(io_ctl->pages[i]), page_offset(io_ctl->pages[i]), PAGE_SIZE); unlock_page(io_ctl->pages[i]); diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index 318df6f9d9cb..f8bb73d6ab68 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -188,6 +188,7 @@ enum { BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 27), BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 28), BTRFS_MOUNT_NODISCARD = (1UL << 29), + BTRFS_MOUNT_NOSPACECACHE = (1UL << 30), }; /* @@ -398,7 +399,8 @@ struct btrfs_fs_info { struct extent_io_tree excluded_extents; /* logical->physical extent mapping */ - struct extent_map_tree mapping_tree; + struct rb_root_cached mapping_tree; + rwlock_t mapping_tree_lock; /* * Block reservation for extent, checksum, root tree and delayed dir @@ -960,20 +962,6 @@ void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag, #define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \ BTRFS_MOUNT_##opt) -#define 
btrfs_set_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (!btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_set_opt(fs_info->mount_opt, opt); \ -} while (0) - -#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \ -do { \ - if (btrfs_test_opt(fs_info, opt)) \ - btrfs_info(fs_info, fmt, ##args); \ - btrfs_clear_opt(fs_info->mount_opt, opt); \ -} while (0) - static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info) { /* Do it this way so we only ever do one test_bit in the normal case. */ diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 5e3fccddde0c..809b11472a80 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -114,6 +114,15 @@ struct data_reloc_warn { int mirror_num; }; +/* + * For the file_extent_tree, we want to hold the inode lock when we look up and + * update the disk_i_size, but lockdep will complain because in our io_tree we + * hold the tree lock and then take the inode lock when setting delalloc. These + * two things are unrelated, so make a class for the file_extent_tree so we + * don't get the two locking patterns mixed up. + */ +static struct lock_class_key file_extent_tree_class; + static const struct inode_operations btrfs_dir_inode_operations; static const struct inode_operations btrfs_symlink_inode_operations; static const struct inode_operations btrfs_special_inode_operations; @@ -447,8 +456,8 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, * range, then btrfs_mark_ordered_io_finished() will handle * the ordered extent accounting for the range. */ - btrfs_page_clamp_clear_ordered(inode->root->fs_info, page, - offset, bytes); + btrfs_folio_clamp_clear_ordered(inode->root->fs_info, + page_folio(page), offset, bytes); put_page(page); } @@ -688,7 +697,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. */ - btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -1037,7 +1046,7 @@ free_pages: if (pages) { for (i = 0; i < nr_pages; i++) { WARN_ON(pages[i]->mapping); - put_page(pages[i]); + btrfs_free_compr_page(pages[i]); } kfree(pages); } @@ -1052,7 +1061,7 @@ static void free_async_extent_pages(struct async_extent *async_extent) for (i = 0; i < async_extent->nr_pages; i++) { WARN_ON(async_extent->pages[i]->mapping); - put_page(async_extent->pages[i]); + btrfs_free_compr_page(async_extent->pages[i]); } kfree(async_extent->pages); async_extent->nr_pages = 0; @@ -2793,7 +2802,7 @@ out_page: PAGE_SIZE, !ret); clear_page_dirty_for_io(page); } - btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); + btrfs_folio_clear_checked(fs_info, page_folio(page), page_start, PAGE_SIZE); unlock_page(page); put_page(page); kfree(fixup); @@ -2848,7 +2857,7 @@ int btrfs_writepage_cow_fixup(struct page *page) * page->mapping outside of the page lock.
*/ ihold(inode); - btrfs_page_set_checked(fs_info, page, page_offset(page), PAGE_SIZE); + btrfs_folio_set_checked(fs_info, page_folio(page), page_offset(page), PAGE_SIZE); get_page(page); btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL); fixup->page = page; @@ -3118,7 +3127,7 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) ordered_extent->disk_num_bytes); } } - unpin_extent_cache(&inode->extent_tree, ordered_extent->file_offset, + unpin_extent_cache(inode, ordered_extent->file_offset, ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); @@ -3796,7 +3805,7 @@ cache_index: * cache. * * This is required for both inode re-read from disk and delayed inode - * in delayed_nodes_tree. + * in the delayed_nodes xarray. */ if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, @@ -4725,7 +4734,7 @@ again: /* * We unlock the page after the io is completed and then re-lock it * above. release_folio() could have come in between that and cleared - * PagePrivate(), but left the page in the mapping. Set the page mapped + * folio private, but left the page in the mapping. Set the page mapped * here to make sure it's properly set for the subpage stuff. */ ret = set_page_extent_mapped(page); @@ -4767,9 +4776,10 @@ again: memzero_page(page, (block_start - page_offset(page)) + offset, len); } - btrfs_page_clear_checked(fs_info, page, block_start, - block_end + 1 - block_start); - btrfs_page_set_dirty(fs_info, page, block_start, block_end + 1 - block_start); + btrfs_folio_clear_checked(fs_info, page_folio(page), block_start, + block_end + 1 - block_start); + btrfs_folio_set_dirty(fs_info, page_folio(page), block_start, + block_end + 1 - block_start); unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) @@ -4889,7 +4899,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) last_byte = ALIGN(last_byte, fs_info->sectorsize); hole_size = last_byte - cur_offset; - if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + if (!(em->flags & EXTENT_FLAG_PREALLOC)) { struct extent_map *hole_em; err = maybe_insert_hole(inode, cur_offset, hole_size); @@ -4917,7 +4927,6 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) hole_em->block_len = 0; hole_em->orig_block_len = 0; hole_em->ram_bytes = hole_size; - hole_em->compress_type = BTRFS_COMPRESS_NONE; hole_em->generation = btrfs_get_fs_generation(fs_info); err = btrfs_replace_extent_map_range(inode, hole_em, true); @@ -5132,7 +5141,7 @@ static void evict_inode_truncate_pages(struct inode *inode) */ if (state_flags & EXTENT_DELALLOC) btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, - end - start + 1); + end - start + 1, NULL); clear_extent_bit(io_tree, start, end, EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, @@ -6217,6 +6226,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, inode->i_generation = BTRFS_I(inode)->generation; /* + * We don't have any capability xattrs set here yet, shortcut any + * queries for the xattrs here. If we add them later via the inode + * security init path or any other path this flag will be cleared. + */ + set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); + + /* * Subvolumes don't inherit flags from their parent directory. * Originally this was probably by accident, but we probably can't * change it now without compatibility issues. 
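The conversions running through these hunks are all instances of one pattern: extent_map::flags shrinks from an unsigned long driven by set_bit()/test_bit()/clear_bit() to a plain u32 updated with ordinary mask arithmetic, and the old compress_type field is folded into three mutually exclusive EXTENT_FLAG_COMPRESS_* bits. Below is a minimal user-space sketch of that encoding, assuming ENUM_BIT behaves like the auto-numbering helper in fs/btrfs/misc.h; the enum names mirror the kernel's, but the harness itself is purely illustrative.

#include <assert.h>
#include <stdint.h>

/* Assumed paraphrase of the btrfs ENUM_BIT helper: each use burns one
 * auto-incremented bit position and defines @name as that bit's mask. */
#define ENUM_BIT(name)					\
	__ ## name ## _BIT,				\
	name = (1U << __ ## name ## _BIT),		\
	__ ## name ## _SEQ = __ ## name ## _BIT

enum {
	ENUM_BIT(EXTENT_FLAG_PINNED),
	ENUM_BIT(EXTENT_FLAG_COMPRESS_ZLIB),
	ENUM_BIT(EXTENT_FLAG_COMPRESS_LZO),
	ENUM_BIT(EXTENT_FLAG_COMPRESS_ZSTD),
};

#define COMPRESS_MASK (EXTENT_FLAG_COMPRESS_ZLIB |	\
		       EXTENT_FLAG_COMPRESS_LZO |	\
		       EXTENT_FLAG_COMPRESS_ZSTD)

int main(void)
{
	uint32_t flags = 0;

	/* What extent_map_set_compression(em, BTRFS_COMPRESS_LZO) boils
	 * down to: set exactly one of the three compression bits. */
	flags |= EXTENT_FLAG_COMPRESS_LZO;
	flags |= EXTENT_FLAG_PINNED;

	/* extent_map_is_compressed(): one mask test, no decoding. */
	assert((flags & COMPRESS_MASK) != 0);

	/* Unpinning becomes a plain AND-NOT instead of clear_bit(). */
	flags &= ~EXTENT_FLAG_PINNED;
	assert(!(flags & EXTENT_FLAG_PINNED));
	return 0;
}

The payoff is visible in the extent_map.h hunk earlier in this series: dropping compress_type and map_lookup while narrowing flags to u32 makes struct extent_map smaller, which matters because, as its new comment notes, a filesystem can hold very large numbers of these in memory at once.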
@@ -6983,8 +6999,15 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, int ret; alloc_hint = get_extent_allocation_hint(inode, start, len); +again: ret = btrfs_reserve_extent(root, len, len, fs_info->sectorsize, 0, alloc_hint, &ins, 1, 1); + if (ret == -EAGAIN) { + ASSERT(btrfs_is_zoned(fs_info)); + wait_on_bit_io(&inode->root->fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH, + TASK_UNINTERRUPTIBLE); + goto again; + } if (ret) return ERR_PTR(ret); @@ -7251,13 +7274,11 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, em->orig_block_len = orig_block_len; em->ram_bytes = ram_bytes; em->generation = -1; - set_bit(EXTENT_FLAG_PINNED, &em->flags); - if (type == BTRFS_ORDERED_PREALLOC) { - set_bit(EXTENT_FLAG_FILLING, &em->flags); - } else if (type == BTRFS_ORDERED_COMPRESSED) { - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); - em->compress_type = compress_type; - } + em->flags |= EXTENT_FLAG_PINNED; + if (type == BTRFS_ORDERED_PREALLOC) + em->flags |= EXTENT_FLAG_FILLING; + else if (type == BTRFS_ORDERED_COMPRESSED) + extent_map_set_compression(em, compress_type); ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { @@ -7297,10 +7318,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, * just use the extent. * */ - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + if ((em->flags & EXTENT_FLAG_PREALLOC) || ((BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && em->block_start != EXTENT_MAP_HOLE)) { - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + if (em->flags & EXTENT_FLAG_PREALLOC) type = BTRFS_ORDERED_PREALLOC; else type = BTRFS_ORDERED_NOCOW; @@ -7535,7 +7556,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, * to buffered IO. Don't blame me, this is the price we pay for using * the generic code. */ - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) || + if (extent_map_is_compressed(em) || em->block_start == EXTENT_MAP_INLINE) { free_extent_map(em); /* @@ -7631,7 +7652,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, * that, since we have locked only the parts we are performing I/O in. */ if ((em->block_start == EXTENT_MAP_HOLE) || - (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { + ((em->flags & EXTENT_FLAG_PREALLOC) && !write)) { iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; } else { @@ -7844,13 +7865,14 @@ static void btrfs_readahead(struct readahead_control *rac) static void wait_subpage_spinlock(struct page *page) { struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct folio *folio = page_folio(page); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, page->mapping)) return; - ASSERT(PagePrivate(page) && page->private); - subpage = (struct btrfs_subpage *)page->private; + ASSERT(folio_test_private(folio) && folio_get_private(folio)); + subpage = folio_get_private(folio); /* * This may look insane as we just acquire the spinlock and release it, @@ -7988,7 +8010,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, page_end); ASSERT(range_end + 1 - cur < U32_MAX); range_len = range_end + 1 - cur; - if (!btrfs_page_test_ordered(fs_info, &folio->page, cur, range_len)) { + if (!btrfs_folio_test_ordered(fs_info, folio, cur, range_len)) { /* * If Ordered (Private2) is cleared, it means endio has * already been executed for the range. 
@@ -7997,7 +8019,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, */ goto next; } - btrfs_page_clear_ordered(fs_info, &folio->page, cur, range_len); + btrfs_folio_clear_ordered(fs_info, folio, cur, range_len); /* * IO on this page will never be started, so we need to account @@ -8052,7 +8074,7 @@ next: * reserved data space. * Since the IO will never happen for this page. */ - btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur); + btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); if (!inode_evicting) { clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_UPTODATE | @@ -8067,7 +8089,7 @@ next: * did something wrong. */ ASSERT(!folio_test_ordered(folio)); - btrfs_page_clear_checked(fs_info, &folio->page, folio_pos(folio), folio_size(folio)); + btrfs_folio_clear_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); if (!inode_evicting) __btrfs_release_folio(folio, GFP_NOFS); clear_page_extent_mapped(&folio->page); @@ -8091,6 +8113,7 @@ next: vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; + struct folio *folio = page_folio(page); struct inode *inode = file_inode(vmf->vma->vm_file); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -8107,6 +8130,8 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) u64 page_end; u64 end; + ASSERT(folio_order(folio) == 0); + reserved_space = PAGE_SIZE; sb_start_pagefault(inode->i_sb); @@ -8210,9 +8235,9 @@ again: if (zero_start != PAGE_SIZE) memzero_page(page, zero_start, PAGE_SIZE - zero_start); - btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE); - btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start); - btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start); + btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); + btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); @@ -8455,10 +8480,20 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_inode *ei; struct inode *inode; + struct extent_io_tree *file_extent_tree = NULL; + + /* Self tests may pass a NULL fs_info. */ + if (fs_info && !btrfs_fs_incompat(fs_info, NO_HOLES)) { + file_extent_tree = kmalloc(sizeof(struct extent_io_tree), GFP_KERNEL); + if (!file_extent_tree) + return NULL; + } ei = alloc_inode_sb(sb, btrfs_inode_cachep, GFP_KERNEL); - if (!ei) + if (!ei) { + kfree(file_extent_tree); return NULL; + } ei->root = NULL; ei->generation = 0; @@ -8494,10 +8529,18 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) inode = &ei->vfs_inode; extent_map_tree_init(&ei->extent_tree); + + /* This io tree sets the valid inode. */ extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); ei->io_tree.inode = ei; - extent_io_tree_init(fs_info, &ei->file_extent_tree, - IO_TREE_INODE_FILE_EXTENT); + + ei->file_extent_tree = file_extent_tree; + if (file_extent_tree) { + extent_io_tree_init(fs_info, ei->file_extent_tree, + IO_TREE_INODE_FILE_EXTENT); + /* Lockdep class is set only for the file extent tree. 
*/ + lockdep_set_class(&ei->file_extent_tree->lock, &file_extent_tree_class); + } mutex_init(&ei->log_mutex); spin_lock_init(&ei->ordered_tree_lock); ei->ordered_tree = RB_ROOT; @@ -8514,12 +8557,14 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) void btrfs_test_destroy_inode(struct inode *inode) { btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); + kfree(BTRFS_I(inode)->file_extent_tree); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } #endif void btrfs_free_inode(struct inode *inode) { + kfree(BTRFS_I(inode)->file_extent_tree); kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } @@ -9484,7 +9529,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( struct btrfs_path *path; u64 start = ins->objectid; u64 len = ins->offset; - int qgroup_released; + u64 qgroup_released = 0; int ret; memset(&stack_fi, 0, sizeof(stack_fi)); @@ -9497,9 +9542,9 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( btrfs_set_stack_file_extent_compression(&stack_fi, BTRFS_COMPRESS_NONE); /* Encryption and other encoding is reserved and all 0 */ - qgroup_released = btrfs_qgroup_release_data(inode, file_offset, len); - if (qgroup_released < 0) - return ERR_PTR(qgroup_released); + ret = btrfs_qgroup_release_data(inode, file_offset, len, &qgroup_released); + if (ret < 0) + return ERR_PTR(ret); if (trans) { ret = insert_reserved_file_extent(trans, inode, @@ -9625,7 +9670,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em->block_len = ins.offset; em->orig_block_len = ins.offset; em->ram_bytes = ins.offset; - set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + em->flags |= EXTENT_FLAG_PREALLOC; em->generation = trans->transid; ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); @@ -9778,7 +9823,9 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end) page = find_get_page(inode->vfs_inode.i_mapping, index); ASSERT(page); /* Pages should be in the extent_io_tree */ - btrfs_page_set_writeback(fs_info, page, start, len); + /* This is for data, which doesn't yet support larger folio. 
*/ + ASSERT(folio_order(page_folio(page)) == 0); + btrfs_folio_set_writeback(fs_info, page_folio(page), start, len); put_page(page); index++; } @@ -9987,7 +10034,7 @@ static ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); if (!pages) return -ENOMEM; - ret = btrfs_alloc_page_array(nr_pages, pages); + ret = btrfs_alloc_page_array(nr_pages, pages, 0); if (ret) { ret = -ENOMEM; goto out; @@ -10106,12 +10153,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, encoded->len = min_t(u64, extent_map_end(em), inode->vfs_inode.i_size) - iocb->ki_pos; if (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { + (em->flags & EXTENT_FLAG_PREALLOC)) { disk_bytenr = EXTENT_MAP_HOLE; count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; - } else if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + } else if (extent_map_is_compressed(em)) { disk_bytenr = em->block_start; /* * Bail if the buffer isn't large enough to return the whole @@ -10126,7 +10173,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - em->orig_start; ret = btrfs_encoded_io_compression_from_extent(fs_info, - em->compress_type); + extent_map_compression(em)); if (ret < 0) goto out_em; encoded->compression = ret; @@ -10394,7 +10441,7 @@ out_delalloc_release: btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); out_qgroup_free_data: if (ret < 0) - btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes); + btrfs_qgroup_free_data(inode, data_reserved, start, num_bytes, NULL); out_free_data_space: /* * If btrfs_reserve_extent() succeeded, then we already decremented @@ -10557,6 +10604,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_state *cached_state = NULL; struct extent_map *em = NULL; + struct btrfs_chunk_map *map = NULL; struct btrfs_device *device = NULL; struct btrfs_swap_info bsi = { .lowest_ppage = (sector_t)-1ULL, @@ -10673,7 +10721,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, ret = -EINVAL; goto out; } - if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + if (extent_map_is_compressed(em)) { btrfs_warn(fs_info, "swapfile must not be compressed"); ret = -EINVAL; goto out; @@ -10696,13 +10744,13 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, goto out; } - em = btrfs_get_chunk_map(fs_info, logical_block_start, len); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + map = btrfs_get_chunk_map(fs_info, logical_block_start, len); + if (IS_ERR(map)) { + ret = PTR_ERR(map); goto out; } - if (em->map_lookup->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { btrfs_warn(fs_info, "swapfile must have single data profile"); ret = -EINVAL; @@ -10710,23 +10758,23 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, } if (device == NULL) { - device = em->map_lookup->stripes[0].dev; + device = map->stripes[0].dev; ret = btrfs_add_swapfile_pin(inode, device, false); if (ret == 1) ret = 0; else if (ret) goto out; - } else if (device != em->map_lookup->stripes[0].dev) { + } else if (device != map->stripes[0].dev) { btrfs_warn(fs_info, "swapfile must be on one device"); ret = -EINVAL; goto out; } - physical_block_start = 
(em->map_lookup->stripes[0].physical + - (logical_block_start - em->start)); - len = min(len, em->len - (logical_block_start - em->start)); - free_extent_map(em); - em = NULL; + physical_block_start = (map->stripes[0].physical + + (logical_block_start - map->start)); + len = min(len, map->chunk_len - (logical_block_start - map->start)); + btrfs_free_chunk_map(map); + map = NULL; bg = btrfs_lookup_block_group(fs_info, logical_block_start); if (!bg) { @@ -10779,6 +10827,8 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, out: if (!IS_ERR_OR_NULL(em)) free_extent_map(em); + if (!IS_ERR_OR_NULL(map)) + btrfs_free_chunk_map(map); unlock_extent(io_tree, 0, isize - 1, &cached_state); @@ -10923,7 +10973,7 @@ static const struct address_space_operations btrfs_aops = { .release_folio = btrfs_release_folio, .migrate_folio = btrfs_migrate_folio, .dirty_folio = filemap_dirty_folio, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .swap_activate = btrfs_swap_activate, .swap_deactivate = btrfs_swap_deactivate, }; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 752acff2c734..41b479861b3c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1290,6 +1290,15 @@ static noinline int __btrfs_ioctl_snap_create(struct file *file, * are limited to own subvolumes only */ ret = -EPERM; + } else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) { + /* + * Snapshots must be made with the src_inode referring + * to the subvolume inode, otherwise the permission + * checking above is useless because we may have + * permission on a lower directory but not the subvol + * itself. + */ + ret = -EINVAL; } else { ret = btrfs_mksnapshot(&file->f_path, idmap, name, namelen, @@ -1528,7 +1537,7 @@ static noinline int key_in_sk(struct btrfs_key *key, static noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, struct btrfs_ioctl_search_key *sk, - size_t *buf_size, + u64 *buf_size, char __user *ubuf, unsigned long *sk_offset, int *num_found) @@ -1660,7 +1669,7 @@ out: static noinline int search_ioctl(struct inode *inode, struct btrfs_ioctl_search_key *sk, - size_t *buf_size, + u64 *buf_size, char __user *ubuf) { struct btrfs_fs_info *info = btrfs_sb(inode->i_sb); @@ -1733,7 +1742,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, struct btrfs_ioctl_search_args __user *uargs = argp; struct btrfs_ioctl_search_key sk; int ret; - size_t buf_size; + u64 buf_size; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1763,8 +1772,8 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, struct btrfs_ioctl_search_args_v2 __user *uarg = argp; struct btrfs_ioctl_search_args_v2 args; int ret; - size_t buf_size; - const size_t buf_limit = SZ_16M; + u64 buf_size; + const u64 buf_limit = SZ_16M; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -4356,6 +4365,7 @@ static int _btrfs_ioctl_send(struct inode *inode, void __user *argp, bool compat arg->clone_sources = compat_ptr(args32.clone_sources); arg->parent_root = args32.parent_root; arg->flags = args32.flags; + arg->version = args32.version; memcpy(arg->reserved, args32.reserved, sizeof(args32.reserved)); #else @@ -4523,29 +4533,29 @@ static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool if (ret < 0) goto out_acct; - file_start_write(file); - if (iov_iter_count(&iter) == 0) { ret = 0; - goto out_end_write; + goto out_iov; } pos = args.offset; ret = rw_verify_area(WRITE, file, &pos, args.len); if (ret < 0) - goto 
out_end_write; + goto out_iov; init_sync_kiocb(&kiocb, file); ret = kiocb_set_rw_flags(&kiocb, 0); if (ret) - goto out_end_write; + goto out_iov; kiocb.ki_pos = pos; + file_start_write(file); + ret = btrfs_do_write_iter(&kiocb, &iter, &args); if (ret > 0) fsnotify_modify(file); -out_end_write: file_end_write(file); +out_iov: kfree(iov); out_acct: if (ret > 0) diff --git a/fs/btrfs/lru_cache.c b/fs/btrfs/lru_cache.c index 0fe0ae54ac67..fd88af17d8d9 100644 --- a/fs/btrfs/lru_cache.c +++ b/fs/btrfs/lru_cache.c @@ -9,7 +9,7 @@ * * @cache: The cache. * @max_size: Maximum size (number of entries) for the cache. - * Use 0 for unlimited size, it's the user's responsability to + * Use 0 for unlimited size, it's the user's responsibility to * trim the cache in that case. */ void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size) diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index d3fcfc628a4f..1131d5a29d61 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -152,7 +152,7 @@ static int copy_compressed_data_to_page(char *compressed_data, cur_page = out_pages[*cur_out / PAGE_SIZE]; /* Allocate a new page */ if (!cur_page) { - cur_page = alloc_page(GFP_NOFS); + cur_page = btrfs_alloc_compr_page(); if (!cur_page) return -ENOMEM; out_pages[*cur_out / PAGE_SIZE] = cur_page; @@ -178,7 +178,7 @@ static int copy_compressed_data_to_page(char *compressed_data, cur_page = out_pages[*cur_out / PAGE_SIZE]; /* Allocate a new page */ if (!cur_page) { - cur_page = alloc_page(GFP_NOFS); + cur_page = btrfs_alloc_compr_page(); if (!cur_page) return -ENOMEM; out_pages[*cur_out / PAGE_SIZE] = cur_page; diff --git a/fs/btrfs/messages.c b/fs/btrfs/messages.c index b8f9c9e56c8c..cdada4865837 100644 --- a/fs/btrfs/messages.c +++ b/fs/btrfs/messages.c @@ -287,7 +287,7 @@ void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) * panic or BUGs, depending on mount options. */ __cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, +void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int error, const char *fmt, ...) { char *s_id = "<unknown>"; diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 4d04c1fa5899..08a9272399d2 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -194,7 +194,7 @@ const char * __attribute_const__ btrfs_decode_error(int error); __printf(5, 6) __cold -void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, +void __btrfs_panic(const struct btrfs_fs_info *fs_info, const char *function, unsigned int line, int error, const char *fmt, ...); /* * If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 574e8a55e24a..59850dc17b22 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -152,11 +152,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( { struct btrfs_ordered_extent *entry; int ret; + u64 qgroup_rsv = 0; if (flags & ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { /* For nocow write, we can release the qgroup rsv right now */ - ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes); + ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv); if (ret < 0) return ERR_PTR(ret); } else { @@ -164,7 +165,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( * The ordered extent has reserved qgroup space, release now * and pass the reserved number for qgroup_record to free. 
*/ - ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes); + ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv); if (ret < 0) return ERR_PTR(ret); } @@ -182,7 +183,7 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( entry->inode = igrab(&inode->vfs_inode); entry->compress_type = compress_type; entry->truncated_len = (u64)-1; - entry->qgroup_rsv = ret; + entry->qgroup_rsv = qgroup_rsv; entry->flags = flags; refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); @@ -322,9 +323,10 @@ static bool can_finish_ordered_extent(struct btrfs_ordered_extent *ordered, * * If there's no such bit, we need to skip to next range. */ - if (!btrfs_page_test_ordered(fs_info, page, file_offset, len)) + if (!btrfs_folio_test_ordered(fs_info, page_folio(page), + file_offset, len)) return false; - btrfs_page_clear_ordered(fs_info, page, file_offset, len); + btrfs_folio_clear_ordered(fs_info, page_folio(page), file_offset, len); } /* Now we're fine to update the accounting. */ @@ -599,7 +601,9 @@ void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, release = entry->disk_num_bytes; else release = entry->num_bytes; - btrfs_delalloc_release_metadata(btrfs_inode, release, false); + btrfs_delalloc_release_metadata(btrfs_inode, release, + test_bit(BTRFS_ORDERED_IOERR, + &entry->flags)); } percpu_counter_add_batch(&fs_info->ordered_bytes, -entry->num_bytes, diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 567a6d3d4712..127ef8bf0ffd 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -97,13 +97,6 @@ struct btrfs_ordered_extent { u64 bytes_left; /* - * the end of the ordered extent which is behind it but - * didn't update disk_i_size. Please see the comment of - * btrfs_ordered_update_i_size(); - */ - u64 outstanding_isize; - - /* * If we get truncated we need to adjust the file extent we enter for * this ordered extent so that we do not expose stale data. */ diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index edb84cc03237..63b426cc7798 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -194,7 +194,7 @@ static struct btrfs_qgroup *find_qgroup_rb(struct btrfs_fs_info *fs_info, * * Must be called with qgroup_lock held and @prealloc preallocated. * - * The control on the lifespan of @prealloc would be transfered to this + * The control on the lifespan of @prealloc would be transferred to this * function, thus caller should no longer touch @prealloc. */ static struct btrfs_qgroup *add_qgroup_rb(struct btrfs_fs_info *fs_info, @@ -1888,7 +1888,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info, u64 bytenr = record->bytenr; if (!btrfs_qgroup_full_accounting(fs_info)) - return 0; + return 1; lockdep_assert_held(&delayed_refs->lock); trace_btrfs_qgroup_trace_extent(fs_info, record); @@ -2875,12 +2875,18 @@ int btrfs_qgroup_account_extent(struct btrfs_trans_handle *trans, u64 bytenr, num_bytes, seq); /* + * We're done using the iterator, release all its qgroups while holding + * fs_info->qgroup_lock so that we don't race with btrfs_remove_qgroup() + * and trigger use-after-free accesses to qgroups. 
+ */ + qgroup_iterator_nested_clean(&qgroups); + + /* * Bump qgroup_seq to avoid seq overlap */ fs_info->qgroup_seq += max(nr_old_roots, nr_new_roots) + 1; spin_unlock(&fs_info->qgroup_lock); out_free: - qgroup_iterator_nested_clean(&qgroups); ulist_free(old_roots); ulist_free(new_roots); return ret; @@ -4051,13 +4057,14 @@ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, /* Free ranges specified by @reserved, normally in error path */ static int qgroup_free_reserved_data(struct btrfs_inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) + struct extent_changeset *reserved, + u64 start, u64 len, u64 *freed_ret) { struct btrfs_root *root = inode->root; struct ulist_node *unode; struct ulist_iterator uiter; struct extent_changeset changeset; - int freed = 0; + u64 freed = 0; int ret; extent_changeset_init(&changeset); @@ -4098,7 +4105,9 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, } btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, freed, BTRFS_QGROUP_RSV_DATA); - ret = freed; + if (freed_ret) + *freed_ret = freed; + ret = 0; out: extent_changeset_release(&changeset); return ret; @@ -4106,7 +4115,7 @@ out: static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, - int free) + u64 *released, int free) { struct extent_changeset changeset; int trace_op = QGROUP_RELEASE; @@ -4122,7 +4131,7 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, /* In release case, we shouldn't have @reserved */ WARN_ON(!free && reserved); if (free && reserved) - return qgroup_free_reserved_data(inode, reserved, start, len); + return qgroup_free_reserved_data(inode, reserved, start, len, released); extent_changeset_init(&changeset); ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, EXTENT_QGROUP_RESERVED, &changeset); @@ -4137,7 +4146,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, btrfs_qgroup_free_refroot(inode->root->fs_info, inode->root->root_key.objectid, changeset.bytes_changed, BTRFS_QGROUP_RSV_DATA); - ret = changeset.bytes_changed; + if (released) + *released = changeset.bytes_changed; out: extent_changeset_release(&changeset); return ret; @@ -4156,9 +4166,10 @@ out: * NOTE: This function may sleep for memory allocation. */ int btrfs_qgroup_free_data(struct btrfs_inode *inode, - struct extent_changeset *reserved, u64 start, u64 len) + struct extent_changeset *reserved, + u64 start, u64 len, u64 *freed) { - return __btrfs_qgroup_release_data(inode, reserved, start, len, 1); + return __btrfs_qgroup_release_data(inode, reserved, start, len, freed, 1); } /* @@ -4176,9 +4187,9 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode, * * NOTE: This function may sleep for memory allocation. 
*/ -int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len) +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released) { - return __btrfs_qgroup_release_data(inode, NULL, start, len, 0); + return __btrfs_qgroup_release_data(inode, NULL, start, len, released, 0); } static void add_root_meta_rsv(struct btrfs_root *root, int num_bytes, @@ -4326,8 +4337,9 @@ static void qgroup_convert_meta(struct btrfs_fs_info *fs_info, u64 ref_root, qgroup_rsv_release(fs_info, qgroup, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); - qgroup_rsv_add(fs_info, qgroup, num_bytes, - BTRFS_QGROUP_RSV_META_PERTRANS); + if (!sb_rdonly(fs_info->sb)) + qgroup_rsv_add(fs_info, qgroup, num_bytes, + BTRFS_QGROUP_RSV_META_PERTRANS); list_for_each_entry(glist, &qgroup->groups, next_group) qgroup_iterator_add(&qgroup_list, glist->group); @@ -4649,6 +4661,17 @@ void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans) *root = RB_ROOT; } +void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes) +{ + if (btrfs_qgroup_mode(fs_info) != BTRFS_QGROUP_MODE_SIMPLE) + return; + + if (!is_fstree(root)) + return; + + btrfs_qgroup_free_refroot(fs_info, root, rsv_bytes, BTRFS_QGROUP_RSV_DATA); +} + int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, struct btrfs_squota_delta *delta) { @@ -4693,8 +4716,5 @@ int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, out: spin_unlock(&fs_info->qgroup_lock); - if (!ret && delta->rsv_bytes) - btrfs_qgroup_free_refroot(fs_info, root, delta->rsv_bytes, - BTRFS_QGROUP_RSV_DATA); return ret; } diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index 855a4f978761..be18c862e64e 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -274,8 +274,6 @@ struct btrfs_squota_delta { u64 root; /* The number of bytes in the extent being counted. */ u64 num_bytes; - /* The number of bytes reserved for this extent. */ - u64 rsv_bytes; /* The generation the extent was created in. */ u64 generation; /* Whether we are using or freeing the extent. 
*/ @@ -358,10 +356,10 @@ int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, /* New io_tree based accurate qgroup reserve API */ int btrfs_qgroup_reserve_data(struct btrfs_inode *inode, struct extent_changeset **reserved, u64 start, u64 len); -int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len); +int btrfs_qgroup_release_data(struct btrfs_inode *inode, u64 start, u64 len, u64 *released); int btrfs_qgroup_free_data(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, - u64 len); + u64 len, u64 *freed); int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, enum btrfs_qgroup_rsv_type type, bool enforce); int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, @@ -422,6 +420,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *eb); void btrfs_qgroup_destroy_extent_records(struct btrfs_transaction *trans); bool btrfs_check_quota_leak(struct btrfs_fs_info *fs_info); +void btrfs_free_squota_rsv(struct btrfs_fs_info *fs_info, u64 root, u64 rsv_bytes); int btrfs_record_squota_delta(struct btrfs_fs_info *fs_info, struct btrfs_squota_delta *delta); diff --git a/fs/btrfs/raid-stripe-tree.c b/fs/btrfs/raid-stripe-tree.c index 944e8f1862aa..9589362acfbf 100644 --- a/fs/btrfs/raid-stripe-tree.c +++ b/fs/btrfs/raid-stripe-tree.c @@ -145,7 +145,7 @@ int btrfs_insert_raid_extent(struct btrfs_trans_handle *trans, btrfs_put_bioc(bioc); } - return ret; + return 0; } int btrfs_get_raid_extent_offset(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 3e014b9370a3..792c8e17c31d 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -964,7 +964,7 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) { int ret; - ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages); + ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, 0); if (ret < 0) return ret; /* Mapping all sectors */ @@ -979,7 +979,7 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) int ret; ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages, - rbio->stripe_pages + data_pages); + rbio->stripe_pages + data_pages, 0); if (ret < 0) return ret; @@ -1530,7 +1530,7 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) const int data_pages = rbio->nr_data * rbio->stripe_npages; int ret; - ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages); + ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, 0); if (ret < 0) return ret; @@ -1549,7 +1549,6 @@ struct btrfs_plug_cb { struct blk_plug_cb cb; struct btrfs_fs_info *info; struct list_head rbio_list; - struct work_struct work; }; /* diff --git a/fs/btrfs/raid56.h b/fs/btrfs/raid56.h index 45e6ff78316f..470213688872 100644 --- a/fs/btrfs/raid56.h +++ b/fs/btrfs/raid56.h @@ -164,7 +164,7 @@ struct raid56_bio_trace_info { u8 stripe_nr; }; -static inline int nr_data_stripes(const struct map_lookup *map) +static inline int nr_data_stripes(const struct btrfs_chunk_map *map) { return map->num_stripes - btrfs_nr_parity_stripes(map->type); } diff --git a/fs/btrfs/ref-verify.c b/fs/btrfs/ref-verify.c index 1f62976bee82..6486f0d7e993 100644 --- a/fs/btrfs/ref-verify.c +++ b/fs/btrfs/ref-verify.c @@ -794,6 +794,7 @@ int btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, dump_ref_action(fs_info, ra); kfree(ref); kfree(ra); + kfree(re); goto out_unlock; } else if (be->num_refs == 0) { btrfs_err(fs_info, @@ -803,6 +804,7 @@ int 
btrfs_ref_tree_mod(struct btrfs_fs_info *fs_info, dump_ref_action(fs_info, ra); kfree(ref); kfree(ra); + kfree(re); goto out_unlock; } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f88b0c2ac3fe..ae90894dc7dc 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -141,9 +141,9 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (datal < block_size) memzero_page(page, datal, block_size - datal); - btrfs_page_set_uptodate(fs_info, page, file_offset, block_size); - btrfs_page_clear_checked(fs_info, page, file_offset, block_size); - btrfs_page_set_dirty(fs_info, page, file_offset, block_size); + btrfs_folio_set_uptodate(fs_info, page_folio(page), file_offset, block_size); + btrfs_folio_clear_checked(fs_info, page_folio(page), file_offset, block_size); + btrfs_folio_set_dirty(fs_info, page_folio(page), file_offset, block_size); out_unlock: if (page) { unlock_page(page); diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index f5d9e5f74a52..abe594f77f99 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2895,7 +2895,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( * will re-read the whole page anyway. */ if (page) { - btrfs_subpage_clear_uptodate(fs_info, page, i_size, + btrfs_subpage_clear_uptodate(fs_info, page_folio(page), i_size, round_up(i_size, PAGE_SIZE) - i_size); unlock_page(page); put_page(page); @@ -2951,7 +2951,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod em->len = end + 1 - start; em->block_len = em->len; em->block_start = block_start; - set_bit(EXTENT_FLAG_PINNED, &em->flags); + em->flags |= EXTENT_FLAG_PINNED; lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state); ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false); @@ -3070,7 +3070,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, clamped_len); goto release_page; } - btrfs_page_set_dirty(fs_info, page, clamped_start, clamped_len); + btrfs_folio_set_dirty(fs_info, page_folio(page), + clamped_start, clamped_len); /* * Set the boundary if it's inside the page. diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 9ce5be21b036..a01807cbd4d4 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -43,7 +43,7 @@ struct scrub_ctx; /* * The following value only influences the performance. * - * This detemines how many stripes would be submitted in one go, + * This determines how many stripes would be submitted in one go, * which is 512KiB (BTRFS_STRIPE_LEN * SCRUB_STRIPES_PER_GROUP). */ #define SCRUB_STRIPES_PER_GROUP 8 @@ -192,7 +192,6 @@ struct scrub_ctx { int cur_stripe; atomic_t cancel_req; int readonly; - int sectors_per_bio; /* State of IO submission throttling affecting the associated device */ ktime_t throttle_deadline; @@ -262,7 +261,7 @@ static int init_scrub_stripe(struct btrfs_fs_info *fs_info, atomic_set(&stripe->pending_io, 0); spin_lock_init(&stripe->write_error_lock); - ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages); + ret = btrfs_alloc_page_array(SCRUB_STRIPE_PAGES, stripe->pages, 0); if (ret < 0) goto error; @@ -710,7 +709,7 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) /* Metadata, verify the full tree block. */ if (sector->is_metadata) { /* - * Check if the tree block crosses the stripe boudary. If + * Check if the tree block crosses the stripe boundary. If * crossed the boundary, we cannot verify it but only give a * warning. 
* @@ -884,7 +883,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, /* * Init needed infos for error reporting. * - * Although our scrub_stripe infrastucture is mostly based on btrfs_submit_bio() + * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio() * thus no need for dev/physical, error reporting still needs dev and physical. */ if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) { @@ -1280,7 +1279,7 @@ static void scrub_throttle_dev_io(struct scrub_ctx *sctx, struct btrfs_device *d * return 0 if it is a data stripe, 1 means parity stripe. */ static int get_raid56_logic_offset(u64 physical, int num, - struct map_lookup *map, u64 *offset, + struct btrfs_chunk_map *map, u64 *offset, u64 *stripe_start) { int i; @@ -1409,14 +1408,11 @@ search_forward: if (ret > 0) break; next: - path->slots[0]++; - if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { - ret = btrfs_next_leaf(extent_root, path); - if (ret) { - /* Either no more item or fatal error */ - btrfs_release_path(path); - return ret; - } + ret = btrfs_next_item(extent_root, path); + if (ret) { + /* Either no more items or a fatal error. */ + btrfs_release_path(path); + return ret; } } btrfs_release_path(path); @@ -1816,7 +1812,7 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) if (sctx->is_dev_replace) { /* * For dev-replace, if we know there is something wrong with - * metadata, we should immedately abort. + * metadata, we should immediately abort. */ for (int i = 0; i < nr_stripes; i++) { if (stripe_has_metadata_error(&sctx->stripes[i])) { @@ -1868,6 +1864,9 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * */ ASSERT(sctx->cur_stripe < SCRUB_TOTAL_STRIPES); + /* @found_logical_ret must be specified. */ + ASSERT(found_logical_ret); + stripe = &sctx->stripes[sctx->cur_stripe]; scrub_reset_stripe(stripe); ret = scrub_find_fill_first_stripe(bg, &sctx->extent_path, @@ -1876,8 +1875,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * /* Either >0 as no more extents or <0 for error. */ if (ret) return ret; - if (found_logical_ret) - *found_logical_ret = stripe->logical; + *found_logical_ret = stripe->logical; sctx->cur_stripe++; /* We filled one group, submit it. */ @@ -1896,7 +1894,7 @@ static int queue_scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group * static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, struct btrfs_device *scrub_dev, struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, u64 full_stripe_start) { DECLARE_COMPLETION_ONSTACK(io_done); @@ -2065,7 +2063,7 @@ out: */ static int scrub_simple_mirror(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, u64 logical_start, u64 logical_length, struct btrfs_device *device, u64 physical, int mirror_num) @@ -2080,7 +2078,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, /* Go through each extent items inside the logical range */ while (cur_logical < logical_end) { - u64 found_logical; + u64 found_logical = U64_MAX; u64 cur_physical = physical + cur_logical - logical_start; /* Canceled? */ @@ -2115,6 +2113,8 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, if (ret < 0) break; + /* queue_scrub_stripe() returned 0, @found_logical must be updated. 
*/ + ASSERT(found_logical != U64_MAX); cur_logical = found_logical + BTRFS_STRIPE_LEN; /* Don't hold CPU for too long time */ @@ -2124,7 +2124,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx, } /* Calculate the full stripe length for simple stripe based profiles */ -static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) +static u64 simple_stripe_full_stripe_len(const struct btrfs_chunk_map *map) { ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)); @@ -2133,7 +2133,7 @@ static u64 simple_stripe_full_stripe_len(const struct map_lookup *map) } /* Get the logical bytenr for the stripe */ -static u64 simple_stripe_get_logical(struct map_lookup *map, +static u64 simple_stripe_get_logical(struct btrfs_chunk_map *map, struct btrfs_block_group *bg, int stripe_index) { @@ -2150,7 +2150,7 @@ static u64 simple_stripe_get_logical(struct map_lookup *map, } /* Get the mirror number for the stripe */ -static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) +static int simple_stripe_mirror_num(struct btrfs_chunk_map *map, int stripe_index) { ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)); @@ -2162,7 +2162,7 @@ static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index) static int scrub_simple_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct btrfs_device *device, int stripe_index) { @@ -2195,18 +2195,17 @@ static int scrub_simple_stripe(struct scrub_ctx *sctx, static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, struct btrfs_block_group *bg, - struct extent_map *em, + struct btrfs_chunk_map *map, struct btrfs_device *scrub_dev, int stripe_index) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct map_lookup *map = em->map_lookup; const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK; const u64 chunk_logical = bg->start; int ret; int ret2; u64 physical = map->stripes[stripe_index].physical; - const u64 dev_stripe_len = btrfs_calc_stripe_length(em); + const u64 dev_stripe_len = btrfs_calc_stripe_length(map); const u64 physical_end = physical + dev_stripe_len; u64 logical; u64 logic_end; @@ -2369,17 +2368,12 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, u64 dev_extent_len) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct map_lookup *map; - struct extent_map *em; + struct btrfs_chunk_map *map; int i; int ret = 0; - read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, bg->start, bg->length); - read_unlock(&map_tree->lock); - - if (!em) { + map = btrfs_find_chunk_map(fs_info, bg->start, bg->length); + if (!map) { /* * Might have been an unused block group deleted by the cleaner * kthread or relocation. 
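The queue_scrub_stripe() hunk above also tightens an informal contract: when it returns 0 it must have stored the stripe's logical address through @found_logical_ret, so scrub_simple_mirror() now seeds found_logical with U64_MAX and asserts that the callee really updated it before advancing cur_logical. Here is a stand-alone sketch of that poison-value pattern, with queue_one() as an illustrative stand-in for the real callee and the 64K stripe length hard-coded rather than taken from BTRFS_STRIPE_LEN.

#include <assert.h>
#include <stdint.h>

#define U64_MAX ((uint64_t)~0ULL)

/* Stand-in for queue_scrub_stripe(): returns 0 when a stripe was queued
 * (and *found_ret must be updated), >0 when there are no more extents. */
static int queue_one(uint64_t cur, uint64_t end, uint64_t *found_ret)
{
	if (cur >= end)
		return 1;
	*found_ret = cur;	/* the contract: always set on success */
	return 0;
}

int main(void)
{
	const uint64_t stripe_len = 64 * 1024;
	const uint64_t end = 4 * stripe_len;
	uint64_t cur = 0;

	while (cur < end) {
		uint64_t found = U64_MAX;	/* poison value */

		if (queue_one(cur, end, &found))
			break;
		/* Mirrors the new ASSERT(found_logical != U64_MAX): a callee
		 * that forgot to fill the out-parameter fails loudly here
		 * instead of making cur wrap around from U64_MAX. */
		assert(found != U64_MAX);
		cur = found + stripe_len;
	}
	return 0;
}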
@@ -2391,22 +2385,21 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, return ret; } - if (em->start != bg->start) + if (map->start != bg->start) goto out; - if (em->len < dev_extent_len) + if (map->chunk_len < dev_extent_len) goto out; - map = em->map_lookup; for (i = 0; i < map->num_stripes; ++i) { if (map->stripes[i].dev->bdev == scrub_dev->bdev && map->stripes[i].physical == dev_offset) { - ret = scrub_stripe(sctx, bg, em, scrub_dev, i); + ret = scrub_stripe(sctx, bg, map, scrub_dev, i); if (ret) goto out; } } out: - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 3b929f0e8f04..4e36550618e5 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -8158,7 +8158,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg) } sctx->send_filp = fget(arg->send_fd); - if (!sctx->send_filp) { + if (!sctx->send_filp || !(sctx->send_filp->f_mode & FMODE_WRITE)) { ret = -EBADF; goto out; } diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 1b999c6e4193..93511d54abf8 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -64,7 +64,7 @@ * This means a slightly higher tree locking latency. */ -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page) +bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping) { if (fs_info->sectorsize >= PAGE_SIZE) return false; @@ -74,8 +74,7 @@ bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page) * mapping. And if page->mapping->host is data inode, it's subpage. * As we have ruled our sectorsize >= PAGE_SIZE case already. */ - if (!page->mapping || !page->mapping->host || - is_data_inode(page->mapping->host)) + if (!mapping || !mapping->host || is_data_inode(mapping->host)) return true; /* @@ -116,7 +115,7 @@ void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sector } int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, - struct page *page, enum btrfs_subpage_type type) + struct folio *folio, enum btrfs_subpage_type type) { struct btrfs_subpage *subpage; @@ -124,31 +123,30 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, * We have cases like a dummy extent buffer page, which is not mapped * and doesn't need to be locked. */ - if (page->mapping) - ASSERT(PageLocked(page)); + if (folio->mapping) + ASSERT(folio_test_locked(folio)); - /* Either not subpage, or the page already has private attached */ - if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page)) + /* Either not subpage, or the folio already has private attached. */ + if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio)) return 0; subpage = btrfs_alloc_subpage(fs_info, type); if (IS_ERR(subpage)) return PTR_ERR(subpage); - attach_page_private(page, subpage); + folio_attach_private(folio, subpage); return 0; } -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, - struct page *page) +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio) { struct btrfs_subpage *subpage; - /* Either not subpage, or already detached */ - if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page)) + /* Either not subpage, or the folio already has private attached. 
+ if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio)) return; - subpage = detach_page_private(page); + subpage = folio_detach_private(folio); ASSERT(subpage); btrfs_free_subpage(subpage); } @@ -188,77 +186,78 @@ void btrfs_free_subpage(struct btrfs_subpage *subpage) * This is important for eb allocation, to prevent a race with the last eb freeing * of the same page. * With the eb_refs increased before the eb is inserted into the radix tree, - * detach_extent_buffer_page() won't detach the page private while we're still + * detach_extent_buffer_page() won't detach the folio private while we're still * allocating the extent buffer. */ -void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, - struct page *page) +void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio) { struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; - ASSERT(PagePrivate(page) && page->mapping); - lockdep_assert_held(&page->mapping->private_lock); + ASSERT(folio_test_private(folio) && folio->mapping); + lockdep_assert_held(&folio->mapping->i_private_lock); - subpage = (struct btrfs_subpage *)page->private; + subpage = folio_get_private(folio); atomic_inc(&subpage->eb_refs); } -void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, - struct page *page) +void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio) { struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, page)) + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; - ASSERT(PagePrivate(page) && page->mapping); - lockdep_assert_held(&page->mapping->private_lock); + ASSERT(folio_test_private(folio) && folio->mapping); + lockdep_assert_held(&folio->mapping->i_private_lock); - subpage = (struct btrfs_subpage *)page->private; + subpage = folio_get_private(folio); ASSERT(atomic_read(&subpage->eb_refs)); atomic_dec(&subpage->eb_refs); } static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { + /* For subpage support, the folio must be a single page. */ + ASSERT(folio_order(folio) == 0); + /* Basic checks */ - ASSERT(PagePrivate(page) && page->private); + ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && IS_ALIGNED(len, fs_info->sectorsize)); /* * The range check only works for a mapped page; we can still have * unmapped pages like dummy extent buffer pages. */
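A sketch of the eb_refs idea from the hunk above, using C11 atomics in userspace with hypothetical names: private data attached to a folio may only be torn down once no extent buffer still references it, which is why eb_refs is raised before the eb becomes visible:

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct subpage_sketch {
	atomic_int eb_refs; /* extent buffers still using this private data */
};

static struct subpage_sketch *private_data; /* stands in for folio private */

static void eb_ref_inc(void)
{
	atomic_fetch_add(&private_data->eb_refs, 1);
}

/* True if this drop released the last reference. */
static bool eb_ref_dec_and_test(void)
{
	return atomic_fetch_sub(&private_data->eb_refs, 1) == 1;
}

int main(void)
{
	private_data = calloc(1, sizeof(*private_data));

	eb_ref_inc(); /* first extent buffer on this folio */
	eb_ref_inc(); /* second one */
	if (!eb_ref_dec_and_test())
		printf("still referenced, keep private attached\n");
	if (eb_ref_dec_and_test()) {
		printf("last reference gone, safe to detach private\n");
		free(private_data);
	}
	return 0;
}
```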
- if (page->mapping) - ASSERT(page_offset(page) <= start && - start + len <= page_offset(page) + PAGE_SIZE); + if (folio->mapping) + ASSERT(folio_pos(folio) <= start && + start + len <= folio_pos(folio) + PAGE_SIZE); } void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage *subpage = folio_get_private(folio); const int nbits = len >> fs_info->sectorsize_bits; - btrfs_subpage_assert(fs_info, page, start, len); + btrfs_subpage_assert(fs_info, folio, start, len); atomic_add(nbits, &subpage->readers); } void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage *subpage = folio_get_private(folio); const int nbits = len >> fs_info->sectorsize_bits; bool is_data; bool last; - btrfs_subpage_assert(fs_info, page, start, len); - is_data = is_data_inode(page->mapping->host); + btrfs_subpage_assert(fs_info, folio, start, len); + is_data = is_data_inode(folio->mapping->host); ASSERT(atomic_read(&subpage->readers) >= nbits); last = atomic_sub_and_test(nbits, &subpage->readers); @@ -270,35 +269,35 @@ void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, * As we want the atomic_sub_and_test() to always be executed. */ if (is_data && last) - unlock_page(page); + folio_unlock(folio); } -static void btrfs_subpage_clamp_range(struct page *page, u64 *start, u32 *len) +static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) { u64 orig_start = *start; u32 orig_len = *len; - *start = max_t(u64, page_offset(page), orig_start); + *start = max_t(u64, folio_pos(folio), orig_start); /* * For certain call sites like btrfs_drop_pages(), we may have pages * beyond the target range. In that case, just set @len to 0; subpage * helpers can handle @len == 0 without any problem.
*/ - if (page_offset(page) >= orig_start + orig_len) + if (folio_pos(folio) >= orig_start + orig_len) *len = 0; else - *len = min_t(u64, page_offset(page) + PAGE_SIZE, + *len = min_t(u64, folio_pos(folio) + PAGE_SIZE, orig_start + orig_len) - *start; } void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage *subpage = folio_get_private(folio); const int nbits = (len >> fs_info->sectorsize_bits); int ret; - btrfs_subpage_assert(fs_info, page, start, len); + btrfs_subpage_assert(fs_info, folio, start, len); ASSERT(atomic_read(&subpage->readers) == 0); ret = atomic_add_return(nbits, &subpage->writers); @@ -306,12 +305,12 @@ void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, } bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage *subpage = folio_get_private(folio); const int nbits = (len >> fs_info->sectorsize_bits); - btrfs_subpage_assert(fs_info, page, start, len); + btrfs_subpage_assert(fs_info, folio, start, len); /* * We have call sites passing @lock_page into @@ -328,7 +327,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, } /* - * Lock a page for delalloc page writeback. + * Lock a folio for delalloc page writeback. * * Return -EAGAIN if the page is not properly initialized. * Return 0 with the page locked, and writer counter updated. @@ -337,38 +336,40 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, * it's really the correct page, as the caller is using * filemap_get_folios_contig(), which can race with page invalidating. 
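The clamp completed just above trims an arbitrary byte range to the part overlapping one folio. A standalone sketch of the same math, assuming a single-page folio and PAGE_SIZE of 4096 (names here are illustrative, not the kernel helpers):

```c
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096ULL

/* Trim [*start, *start + *len) to its overlap with [folio_pos, folio_pos + PAGE_SIZE). */
static void clamp_range(uint64_t folio_pos, uint64_t *start, uint32_t *len)
{
	uint64_t orig_start = *start;
	uint64_t orig_end = orig_start + *len;
	uint64_t end;

	*start = folio_pos > orig_start ? folio_pos : orig_start;
	if (folio_pos >= orig_end) {
		*len = 0; /* range ends before this folio begins */
		return;
	}
	end = folio_pos + PAGE_SIZE < orig_end ? folio_pos + PAGE_SIZE : orig_end;
	*len = (uint32_t)(end - *start);
}

int main(void)
{
	uint64_t start = 1000;
	uint32_t len = 10000;

	clamp_range(4096, &start, &len); /* folio covering [4096, 8192) */
	printf("clamped: start=%llu len=%u\n",
	       (unsigned long long)start, len); /* start=4096 len=4096 */
	return 0;
}
```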
*/ -int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) +int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { - lock_page(page); + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + folio_lock(folio); return 0; } - lock_page(page); - if (!PagePrivate(page) || !page->private) { - unlock_page(page); + folio_lock(folio); + if (!folio_test_private(folio) || !folio_get_private(folio)) { + folio_unlock(folio); return -EAGAIN; } - btrfs_subpage_clamp_range(page, &start, &len); - btrfs_subpage_start_writer(fs_info, page, start, len); + btrfs_subpage_clamp_range(folio, &start, &len); + btrfs_subpage_start_writer(fs_info, folio, start, len); return 0; } -void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) +void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) - return unlock_page(page); - btrfs_subpage_clamp_range(page, &start, &len); - if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len)) - unlock_page(page); + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + folio_unlock(folio); + return; + } + btrfs_subpage_clamp_range(folio, &start, &len); + if (btrfs_subpage_end_and_test_writer(fs_info, folio, start, len)) + folio_unlock(folio); } -#define subpage_calc_start_bit(fs_info, page, name, start, len) \ +#define subpage_calc_start_bit(fs_info, folio, name, start, len) \ ({ \ unsigned int start_bit; \ \ - btrfs_subpage_assert(fs_info, page, start, len); \ + btrfs_subpage_assert(fs_info, folio, start, len); \ start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ start_bit += fs_info->subpage_info->name##_offset; \ start_bit; \ @@ -385,46 +386,46 @@ void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, fs_info->subpage_info->bitmap_nr_bits) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate)) - SetPageUptodate(page); + folio_mark_uptodate(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_clear_uptodate(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, uptodate, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - ClearPageUptodate(page); + folio_clear_uptodate(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void 
btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&subpage->lock, flags); - set_page_dirty(page); + folio_mark_dirty(folio); } /* @@ -438,10 +439,10 @@ void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, * extra handling for tree blocks. */ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, dirty, start, len); unsigned long flags; bool last = false; @@ -455,101 +456,101 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, } void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { bool last; - last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len); + last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, start, len); if (last) - clear_page_dirty_for_io(page); + folio_clear_dirty_for_io(folio); } void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - set_page_writeback(page); + folio_start_writeback(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, writeback, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) { - ASSERT(PageWriteback(page)); - end_page_writeback(page); + ASSERT(folio_test_writeback(folio)); + folio_end_writeback(folio); } spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_set_ordered(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = 
subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - SetPageOrdered(page); + folio_set_ordered(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, ordered, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered)) - ClearPageOrdered(page); + folio_clear_ordered(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); if (subpage_test_bitmap_all_set(fs_info, subpage, checked)) - SetPageChecked(page); + folio_set_checked(folio); spin_unlock_irqrestore(&subpage->lock, flags); } void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, + struct btrfs_subpage *subpage = folio_get_private(folio); + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, checked, start, len); unsigned long flags; spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - ClearPageChecked(page); + folio_clear_checked(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -559,10 +560,10 @@ void btrfs_subpage_clear_checked(const struct btrfs_fs_info *fs_info, */ #define IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(name) \ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ + struct folio *folio, u64 start, u32 len) \ { \ - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; \ - unsigned int start_bit = subpage_calc_start_bit(fs_info, page, \ + struct btrfs_subpage *subpage = folio_get_private(folio); \ + unsigned int start_bit = subpage_calc_start_bit(fs_info, folio, \ name, start, len); \ unsigned long flags; \ bool ret; \ @@ -584,88 +585,94 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked); * in. We only test sectorsize == PAGE_SIZE cases so far, thus we can fall * back to regular sectorsize branch. 
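The subpage bitmap helpers above all index into one packed bitmap via subpage_calc_start_bit(). A userspace sketch of that indexing, assuming 4K sectors in a 64K page (16 sectors, so a 16-bit window per state); the window ordering below is illustrative, since the kernel computes the real offsets in btrfs_init_subpage_info():

```c
#include <stdio.h>

#define SECTORSIZE_BITS 12  /* 4K sectors */
#define SECTORS_PER_PAGE 16 /* 64K page / 4K sector */

enum state { UPTODATE = 0, DIRTY = 1, WRITEBACK = 2 };

/* Sector index within the page, plus the state's window offset. */
static unsigned int calc_start_bit(enum state s, unsigned long offset_in_page)
{
	unsigned int start_bit = (unsigned int)(offset_in_page >> SECTORSIZE_BITS);

	return start_bit + (unsigned int)s * SECTORS_PER_PAGE;
}

int main(void)
{
	/* Sector at byte offset 0x3000 (the 4th sector), dirty bitmap. */
	printf("bit = %u\n", calc_start_bit(DIRTY, 0x3000)); /* 16 + 3 = 19 */
	return 0;
}
```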
*/ -#define IMPLEMENT_BTRFS_PAGE_OPS(name, set_page_func, clear_page_func, \ - test_page_func) \ -void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +#define IMPLEMENT_BTRFS_PAGE_OPS(name, folio_set_func, \ + folio_clear_func, folio_test_func) \ +void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ - set_page_func(page); \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) { \ + folio_set_func(folio); \ return; \ } \ - btrfs_subpage_set_##name(fs_info, page, start, len); \ + btrfs_subpage_set_##name(fs_info, folio, start, len); \ } \ -void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ - clear_page_func(page); \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) { \ + folio_clear_func(folio); \ return; \ } \ - btrfs_subpage_clear_##name(fs_info, page, start, len); \ + btrfs_subpage_clear_##name(fs_info, folio, start, len); \ } \ -bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \ - return test_page_func(page); \ - return btrfs_subpage_test_##name(fs_info, page, start, len); \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) \ + return folio_test_func(folio); \ + return btrfs_subpage_test_##name(fs_info, folio, start, len); \ } \ -void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ - set_page_func(page); \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) { \ + folio_set_func(folio); \ return; \ } \ - btrfs_subpage_clamp_range(page, &start, &len); \ - btrfs_subpage_set_##name(fs_info, page, start, len); \ + btrfs_subpage_clamp_range(folio, &start, &len); \ + btrfs_subpage_set_##name(fs_info, folio, start, len); \ } \ -void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \ - clear_page_func(page); \ + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) { \ + folio_clear_func(folio); \ return; \ } \ - btrfs_subpage_clamp_range(page, &start, &len); \ - btrfs_subpage_clear_##name(fs_info, page, start, len); \ + btrfs_subpage_clamp_range(folio, &start, &len); \ + btrfs_subpage_clear_##name(fs_info, folio, start, len); \ } \ -bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len) \ +bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len) \ { \ - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, 
page)) \ - return test_page_func(page); \ - btrfs_subpage_clamp_range(page, &start, &len); \ - return btrfs_subpage_test_##name(fs_info, page, start, len); \ -} -IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, - PageUptodate); -IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io, - PageDirty); -IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, - PageWriteback); -IMPLEMENT_BTRFS_PAGE_OPS(ordered, SetPageOrdered, ClearPageOrdered, - PageOrdered); -IMPLEMENT_BTRFS_PAGE_OPS(checked, SetPageChecked, ClearPageChecked, PageChecked); + if (unlikely(!fs_info) || \ + !btrfs_is_subpage(fs_info, folio->mapping)) \ + return folio_test_func(folio); \ + btrfs_subpage_clamp_range(folio, &start, &len); \ + return btrfs_subpage_test_##name(fs_info, folio, start, len); \ +} +IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate, + folio_test_uptodate); +IMPLEMENT_BTRFS_PAGE_OPS(dirty, folio_mark_dirty, folio_clear_dirty_for_io, + folio_test_dirty); +IMPLEMENT_BTRFS_PAGE_OPS(writeback, folio_start_writeback, folio_end_writeback, + folio_test_writeback); +IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered, + folio_test_ordered); +IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, + folio_test_checked); /* * Make sure not only the page dirty bit is cleared, but also subpage dirty bit * is cleared. */ -void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, - struct page *page) +void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio) { - struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct btrfs_subpage *subpage = folio_get_private(folio); if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; - ASSERT(!PageDirty(page)); - if (!btrfs_is_subpage(fs_info, page)) + ASSERT(!folio_test_dirty(folio)); + if (!btrfs_is_subpage(fs_info, folio->mapping)) return; - ASSERT(PagePrivate(page) && page->private); + ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(subpage_test_bitmap_all_zero(fs_info, subpage, dirty)); } @@ -684,18 +691,20 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, * extent_write_locked_range(). * In this case, we have to call subpage helper to handle the case. */ -void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, - u64 start, u32 len) +void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage; - ASSERT(PageLocked(page)); + ASSERT(folio_test_locked(folio)); /* For non-subpage case, we just unlock the page */ - if (!btrfs_is_subpage(fs_info, page)) - return unlock_page(page); + if (!btrfs_is_subpage(fs_info, folio->mapping)) { + folio_unlock(folio); + return; + } - ASSERT(PagePrivate(page) && page->private); - subpage = (struct btrfs_subpage *)page->private; + ASSERT(folio_test_private(folio) && folio_get_private(folio)); + subpage = folio_get_private(folio); /* * For subpage case, there are two types of locked page. With or @@ -704,12 +713,14 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, * Since we own the page lock, no one else could touch subpage::writers * and we are safe to do several atomic operations without spinlock. 
*/ - if (atomic_read(&subpage->writers) == 0) + if (atomic_read(&subpage->writers) == 0) { /* No writers, locked by plain lock_page() */ - return unlock_page(page); + folio_unlock(folio); + return; + } /* Have writers, use proper subpage helper to end it */ - btrfs_page_end_writer_lock(fs_info, page, start, len); + btrfs_folio_end_writer_lock(fs_info, folio, start, len); } #define GET_SUBPAGE_BITMAP(subpage, subpage_info, name, dst) \ @@ -717,7 +728,7 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, subpage_info->name##_offset, subpage_info->bitmap_nr_bits) void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len) + struct folio *folio, u64 start, u32 len) { struct btrfs_subpage_info *subpage_info = fs_info->subpage_info; struct btrfs_subpage *subpage; @@ -729,9 +740,9 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, unsigned long checked_bitmap; unsigned long flags; - ASSERT(PagePrivate(page) && page->private); + ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(subpage_info); - subpage = (struct btrfs_subpage *)page->private; + subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); GET_SUBPAGE_BITMAP(subpage, subpage_info, uptodate, &uptodate_bitmap); @@ -741,10 +752,10 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, GET_SUBPAGE_BITMAP(subpage, subpage_info, checked, &checked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); - dump_page(page, "btrfs subpage dump"); + dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, "start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl error=%*pbl dirty=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", - start, len, page_offset(page), + start, len, folio_pos(folio), subpage_info->bitmap_nr_bits, &uptodate_bitmap, subpage_info->bitmap_nr_bits, &error_bitmap, subpage_info->bitmap_nr_bits, &dirty_bitmap, diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 5cbf67ccbdeb..793c2b314a58 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -73,71 +73,68 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page); +bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping); void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize); int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, - struct page *page, enum btrfs_subpage_type type); -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, - struct page *page); + struct folio *folio, enum btrfs_subpage_type type); +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio); /* Allocate additional data where page represents more than one sector */ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, enum btrfs_subpage_type type); void btrfs_free_subpage(struct btrfs_subpage *subpage); -void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info, - struct page *page); -void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info, - struct page *page); +void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); +void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); void btrfs_subpage_start_reader(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); 
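IMPLEMENT_BTRFS_PAGE_OPS() above and DECLARE_BTRFS_SUBPAGE_OPS() below stamp out a matched set/clear/test family per state so the five states stay in lockstep. A minimal sketch of that code-generation pattern (illustrative names, single global instead of per-folio state):

```c
#include <stdbool.h>
#include <stdio.h>

static unsigned long state_bits;

/* One macro generates the whole trio for each named state. */
#define IMPLEMENT_STATE_OPS(name, bit)                                   \
static void set_##name(void)   { state_bits |= (1UL << (bit)); }         \
static void clear_##name(void) { state_bits &= ~(1UL << (bit)); }        \
static bool test_##name(void)  { return state_bits & (1UL << (bit)); }

IMPLEMENT_STATE_OPS(uptodate, 0)
IMPLEMENT_STATE_OPS(dirty, 1)

int main(void)
{
	set_uptodate();
	set_dirty();
	printf("dirty=%d uptodate=%d\n", test_dirty(), test_uptodate());
	clear_dirty();
	clear_uptodate();
	printf("dirty=%d\n", test_dirty());
	return 0;
}
```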
void btrfs_subpage_end_reader(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); void btrfs_subpage_start_writer(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); -int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); -void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); +int btrfs_folio_start_writer_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); +void btrfs_folio_end_writer_lock(const struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); /* * Template for subpage related operations. * - * btrfs_subpage_*() are for call sites where the page has subpage attached and - * the range is ensured to be inside the page. + * btrfs_subpage_*() are for call sites where the folio has subpage attached and + * the range is ensured to be inside the folio's single page. * - * btrfs_page_*() are for call sites where the page can either be subpage - * specific or regular page. The function will handle both cases. - * But the range still needs to be inside the page. + * btrfs_folio_*() are for call sites where the folio can either be subpage + * specific or a regular folio. The function will handle both cases. + * But the range still needs to be inside a single page. * - * btrfs_page_clamp_*() are similar to btrfs_page_*(), except the range doesn't + * btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't * need to be inside the page. Those functions will truncate the range * automatically.
*/ #define DECLARE_BTRFS_SUBPAGE_OPS(name) \ void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ + struct folio *folio, u64 start, u32 len); \ void btrfs_subpage_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ + struct folio *folio, u64 start, u32 len); \ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); \ -bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); \ +void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); \ +bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ + struct folio *folio, u64 start, u32 len); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); DECLARE_BTRFS_SUBPAGE_OPS(dirty); @@ -146,13 +143,12 @@ DECLARE_BTRFS_SUBPAGE_OPS(ordered); DECLARE_BTRFS_SUBPAGE_OPS(checked); bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); -void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info, - struct page *page); -void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page, - u64 start, u32 len); +void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio); +void btrfs_folio_unlock_writer(struct btrfs_fs_info *fs_info, + struct folio *folio, u64 start, u32 len); void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, - struct page *page, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f638dc339693..896acfda1789 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -27,6 +27,7 @@ #include <linux/crc32c.h> #include <linux/btrfs.h> #include <linux/security.h> +#include <linux/fs_parser.h> #include "messages.h" #include "delayed-inode.h" #include "ctree.h" @@ -64,27 +65,32 @@ #include <trace/events/btrfs.h> static const struct super_operations btrfs_super_ops; - -/* - * Types for mounting the default subvolume and a subvolume explicitly - * requested by subvol=/path. That way the callchain is straightforward and we - * don't have to play tricks with the mount options and recursive calls to - * btrfs_mount. 
- * - * The new btrfs_root_fs_type also servers as a tag for the bdev_holder. - */ static struct file_system_type btrfs_fs_type; -static struct file_system_type btrfs_root_fs_type; - -static int btrfs_remount(struct super_block *sb, int *flags, char *data); static void btrfs_put_super(struct super_block *sb) { - close_ctree(btrfs_sb(sb)); + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + btrfs_info(fs_info, "last unmount of filesystem %pU", fs_info->fs_devices->fsid); + close_ctree(fs_info); } +/* Store the mount options related information. */ +struct btrfs_fs_context { + char *subvol_name; + u64 subvol_objectid; + u64 max_inline; + u32 commit_interval; + u32 metadata_ratio; + u32 thread_pool_size; + unsigned long mount_opt; + unsigned long compress_type:4; + unsigned int compress_level; + refcount_t refs; +}; + enum { - Opt_acl, Opt_noacl, + Opt_acl, Opt_clear_cache, Opt_commit_interval, Opt_compress, @@ -94,27 +100,26 @@ enum { Opt_degraded, Opt_device, Opt_fatal_errors, - Opt_flushoncommit, Opt_noflushoncommit, + Opt_flushoncommit, Opt_max_inline, - Opt_barrier, Opt_nobarrier, - Opt_datacow, Opt_nodatacow, - Opt_datasum, Opt_nodatasum, - Opt_defrag, Opt_nodefrag, - Opt_discard, Opt_nodiscard, + Opt_barrier, + Opt_datacow, + Opt_datasum, + Opt_defrag, + Opt_discard, Opt_discard_mode, - Opt_norecovery, Opt_ratio, Opt_rescan_uuid_tree, Opt_skip_balance, - Opt_space_cache, Opt_no_space_cache, + Opt_space_cache, Opt_space_cache_version, - Opt_ssd, Opt_nossd, - Opt_ssd_spread, Opt_nossd_spread, + Opt_ssd, + Opt_ssd_spread, Opt_subvol, Opt_subvol_empty, Opt_subvolid, Opt_thread_pool, - Opt_treelog, Opt_notreelog, + Opt_treelog, Opt_user_subvol_rm_allowed, /* Rescue options */ @@ -125,14 +130,10 @@ enum { Opt_ignoredatacsums, Opt_rescue_all, - /* Deprecated options */ - Opt_recovery, - Opt_inode_cache, Opt_noinode_cache, - /* Debugging options */ - Opt_enospc_debug, Opt_noenospc_debug, + Opt_enospc_debug, #ifdef CONFIG_BTRFS_DEBUG - Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, + Opt_fragment, Opt_fragment_data, Opt_fragment_metadata, Opt_fragment_all, #endif #ifdef CONFIG_BTRFS_FS_REF_VERIFY Opt_ref_verify, @@ -140,785 +141,611 @@ enum { Opt_err, }; -static const match_table_t tokens = { - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_clear_cache, "clear_cache"}, - {Opt_commit_interval, "commit=%u"}, - {Opt_compress, "compress"}, - {Opt_compress_type, "compress=%s"}, - {Opt_compress_force, "compress-force"}, - {Opt_compress_force_type, "compress-force=%s"}, - {Opt_degraded, "degraded"}, - {Opt_device, "device=%s"}, - {Opt_fatal_errors, "fatal_errors=%s"}, - {Opt_flushoncommit, "flushoncommit"}, - {Opt_noflushoncommit, "noflushoncommit"}, - {Opt_inode_cache, "inode_cache"}, - {Opt_noinode_cache, "noinode_cache"}, - {Opt_max_inline, "max_inline=%s"}, - {Opt_barrier, "barrier"}, - {Opt_nobarrier, "nobarrier"}, - {Opt_datacow, "datacow"}, - {Opt_nodatacow, "nodatacow"}, - {Opt_datasum, "datasum"}, - {Opt_nodatasum, "nodatasum"}, - {Opt_defrag, "autodefrag"}, - {Opt_nodefrag, "noautodefrag"}, - {Opt_discard, "discard"}, - {Opt_discard_mode, "discard=%s"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_norecovery, "norecovery"}, - {Opt_ratio, "metadata_ratio=%u"}, - {Opt_rescan_uuid_tree, "rescan_uuid_tree"}, - {Opt_skip_balance, "skip_balance"}, - {Opt_space_cache, "space_cache"}, - {Opt_no_space_cache, "nospace_cache"}, - {Opt_space_cache_version, "space_cache=%s"}, - {Opt_ssd, "ssd"}, - {Opt_nossd, "nossd"}, - {Opt_ssd_spread, "ssd_spread"}, - {Opt_nossd_spread, "nossd_spread"}, - 
{Opt_subvol, "subvol=%s"}, - {Opt_subvol_empty, "subvol="}, - {Opt_subvolid, "subvolid=%s"}, - {Opt_thread_pool, "thread_pool=%u"}, - {Opt_treelog, "treelog"}, - {Opt_notreelog, "notreelog"}, - {Opt_user_subvol_rm_allowed, "user_subvol_rm_allowed"}, +enum { + Opt_fatal_errors_panic, + Opt_fatal_errors_bug, +}; - /* Rescue options */ - {Opt_rescue, "rescue=%s"}, +static const struct constant_table btrfs_parameter_fatal_errors[] = { + { "panic", Opt_fatal_errors_panic }, + { "bug", Opt_fatal_errors_bug }, + {} +}; + +enum { + Opt_discard_sync, + Opt_discard_async, +}; + +static const struct constant_table btrfs_parameter_discard[] = { + { "sync", Opt_discard_sync }, + { "async", Opt_discard_async }, + {} +}; + +enum { + Opt_space_cache_v1, + Opt_space_cache_v2, +}; + +static const struct constant_table btrfs_parameter_space_cache[] = { + { "v1", Opt_space_cache_v1 }, + { "v2", Opt_space_cache_v2 }, + {} +}; + +enum { + Opt_rescue_usebackuproot, + Opt_rescue_nologreplay, + Opt_rescue_ignorebadroots, + Opt_rescue_ignoredatacsums, + Opt_rescue_parameter_all, +}; + +static const struct constant_table btrfs_parameter_rescue[] = { + { "usebackuproot", Opt_rescue_usebackuproot }, + { "nologreplay", Opt_rescue_nologreplay }, + { "ignorebadroots", Opt_rescue_ignorebadroots }, + { "ibadroots", Opt_rescue_ignorebadroots }, + { "ignoredatacsums", Opt_rescue_ignoredatacsums }, + { "idatacsums", Opt_rescue_ignoredatacsums }, + { "all", Opt_rescue_parameter_all }, + {} +}; + +#ifdef CONFIG_BTRFS_DEBUG +enum { + Opt_fragment_parameter_data, + Opt_fragment_parameter_metadata, + Opt_fragment_parameter_all, +}; + +static const struct constant_table btrfs_parameter_fragment[] = { + { "data", Opt_fragment_parameter_data }, + { "metadata", Opt_fragment_parameter_metadata }, + { "all", Opt_fragment_parameter_all }, + {} +}; +#endif + +static const struct fs_parameter_spec btrfs_fs_parameters[] = { + fsparam_flag_no("acl", Opt_acl), + fsparam_flag_no("autodefrag", Opt_defrag), + fsparam_flag_no("barrier", Opt_barrier), + fsparam_flag("clear_cache", Opt_clear_cache), + fsparam_u32("commit", Opt_commit_interval), + fsparam_flag("compress", Opt_compress), + fsparam_string("compress", Opt_compress_type), + fsparam_flag("compress-force", Opt_compress_force), + fsparam_string("compress-force", Opt_compress_force_type), + fsparam_flag_no("datacow", Opt_datacow), + fsparam_flag_no("datasum", Opt_datasum), + fsparam_flag("degraded", Opt_degraded), + fsparam_string("device", Opt_device), + fsparam_flag_no("discard", Opt_discard), + fsparam_enum("discard", Opt_discard_mode, btrfs_parameter_discard), + fsparam_enum("fatal_errors", Opt_fatal_errors, btrfs_parameter_fatal_errors), + fsparam_flag_no("flushoncommit", Opt_flushoncommit), + fsparam_string("max_inline", Opt_max_inline), + fsparam_u32("metadata_ratio", Opt_ratio), + fsparam_flag("rescan_uuid_tree", Opt_rescan_uuid_tree), + fsparam_flag("skip_balance", Opt_skip_balance), + fsparam_flag_no("space_cache", Opt_space_cache), + fsparam_enum("space_cache", Opt_space_cache_version, btrfs_parameter_space_cache), + fsparam_flag_no("ssd", Opt_ssd), + fsparam_flag_no("ssd_spread", Opt_ssd_spread), + fsparam_string("subvol", Opt_subvol), + fsparam_flag("subvol=", Opt_subvol_empty), + fsparam_u64("subvolid", Opt_subvolid), + fsparam_u32("thread_pool", Opt_thread_pool), + fsparam_flag_no("treelog", Opt_treelog), + fsparam_flag("user_subvol_rm_allowed", Opt_user_subvol_rm_allowed), + + /* Rescue options. 
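The constant tables introduced above (btrfs_parameter_fatal_errors, btrfs_parameter_discard, btrfs_parameter_space_cache, btrfs_parameter_rescue) all map option strings, including aliases like "ibadroots", to one enum value through a NULL-terminated table. A userspace sketch of that lookup idea, with struct const_entry as a stand-in for the kernel's struct constant_table:

```c
#include <stdio.h>
#include <string.h>

enum rescue_opt {
	RESCUE_USEBACKUPROOT,
	RESCUE_NOLOGREPLAY,
	RESCUE_IGNOREBADROOTS,
	RESCUE_UNKNOWN,
};

struct const_entry {
	const char *name;
	enum rescue_opt value;
};

static const struct const_entry rescue_table[] = {
	{ "usebackuproot",  RESCUE_USEBACKUPROOT },
	{ "nologreplay",    RESCUE_NOLOGREPLAY },
	{ "ignorebadroots", RESCUE_IGNOREBADROOTS },
	{ "ibadroots",      RESCUE_IGNOREBADROOTS }, /* alias */
	{ NULL,             RESCUE_UNKNOWN }         /* terminator */
};

static enum rescue_opt lookup_rescue(const char *s)
{
	for (const struct const_entry *e = rescue_table; e->name; e++)
		if (strcmp(e->name, s) == 0)
			return e->value;
	return RESCUE_UNKNOWN;
}

int main(void)
{
	printf("%d\n", lookup_rescue("ibadroots")); /* same value as ignorebadroots */
	printf("%d\n", lookup_rescue("bogus"));     /* RESCUE_UNKNOWN */
	return 0;
}
```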
*/ + fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue), /* Deprecated, with alias rescue=nologreplay */ - {Opt_nologreplay, "nologreplay"}, + __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL), /* Deprecated, with alias rescue=usebackuproot */ - {Opt_usebackuproot, "usebackuproot"}, - - /* Deprecated options */ - {Opt_recovery, "recovery"}, + __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL), - /* Debugging options */ - {Opt_enospc_debug, "enospc_debug"}, - {Opt_noenospc_debug, "noenospc_debug"}, + /* Debugging options. */ + fsparam_flag_no("enospc_debug", Opt_enospc_debug), #ifdef CONFIG_BTRFS_DEBUG - {Opt_fragment_data, "fragment=data"}, - {Opt_fragment_metadata, "fragment=metadata"}, - {Opt_fragment_all, "fragment=all"}, + fsparam_enum("fragment", Opt_fragment, btrfs_parameter_fragment), #endif #ifdef CONFIG_BTRFS_FS_REF_VERIFY - {Opt_ref_verify, "ref_verify"}, + fsparam_flag("ref_verify", Opt_ref_verify), #endif - {Opt_err, NULL}, -}; - -static const match_table_t rescue_tokens = { - {Opt_usebackuproot, "usebackuproot"}, - {Opt_nologreplay, "nologreplay"}, - {Opt_ignorebadroots, "ignorebadroots"}, - {Opt_ignorebadroots, "ibadroots"}, - {Opt_ignoredatacsums, "ignoredatacsums"}, - {Opt_ignoredatacsums, "idatacsums"}, - {Opt_rescue_all, "all"}, - {Opt_err, NULL}, + {} }; -static bool check_ro_option(struct btrfs_fs_info *fs_info, unsigned long opt, - const char *opt_name) +/* No support for restricting writes to btrfs devices yet... */ +static inline blk_mode_t btrfs_open_mode(struct fs_context *fc) { - if (fs_info->mount_opt & opt) { - btrfs_err(fs_info, "%s must be used with ro mount option", - opt_name); - return true; - } - return false; + return sb_open_mode(fc->sb_flags) & ~BLK_OPEN_RESTRICT_WRITES; } -static int parse_rescue_options(struct btrfs_fs_info *info, const char *options) +static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *opts; - char *orig; - char *p; - substring_t args[MAX_OPT_ARGS]; - int ret = 0; + struct btrfs_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; - opts = kstrdup(options, GFP_KERNEL); - if (!opts) - return -ENOMEM; - orig = opts; + opt = fs_parse(fc, btrfs_fs_parameters, param, &result); + if (opt < 0) + return opt; - while ((p = strsep(&opts, ":")) != NULL) { - int token; + switch (opt) { + case Opt_degraded: + btrfs_set_opt(ctx->mount_opt, DEGRADED); + break; + case Opt_subvol_empty: + /* + * This exists because we used to allow it on accident, so we're + * keeping it to maintain ABI. See 37becec95ac3 ("Btrfs: allow + * empty subvol= again"). 
+ */ + break; + case Opt_subvol: + kfree(ctx->subvol_name); + ctx->subvol_name = kstrdup(param->string, GFP_KERNEL); + if (!ctx->subvol_name) + return -ENOMEM; + break; + case Opt_subvolid: + ctx->subvol_objectid = result.uint_64; - if (!*p) - continue; - token = match_token(p, rescue_tokens, args); - switch (token){ - case Opt_usebackuproot: - btrfs_info(info, - "trying to use backup root at mount time"); - btrfs_set_opt(info->mount_opt, USEBACKUPROOT); - break; - case Opt_nologreplay: - btrfs_set_and_info(info, NOLOGREPLAY, - "disabling log replay at mount time"); - break; - case Opt_ignorebadroots: - btrfs_set_and_info(info, IGNOREBADROOTS, - "ignoring bad roots"); - break; - case Opt_ignoredatacsums: - btrfs_set_and_info(info, IGNOREDATACSUMS, - "ignoring data csums"); - break; - case Opt_rescue_all: - btrfs_info(info, "enabling all of the rescue options"); - btrfs_set_and_info(info, IGNOREDATACSUMS, - "ignoring data csums"); - btrfs_set_and_info(info, IGNOREBADROOTS, - "ignoring bad roots"); - btrfs_set_and_info(info, NOLOGREPLAY, - "disabling log replay at mount time"); - break; - case Opt_err: - btrfs_info(info, "unrecognized rescue option '%s'", p); - ret = -EINVAL; - goto out; - default: - break; - } + /* subvolid=0 means give me the original fs_tree. */ + if (!ctx->subvol_objectid) + ctx->subvol_objectid = BTRFS_FS_TREE_OBJECTID; + break; + case Opt_device: { + struct btrfs_device *device; + blk_mode_t mode = btrfs_open_mode(fc); + mutex_lock(&uuid_mutex); + device = btrfs_scan_one_device(param->string, mode, false); + mutex_unlock(&uuid_mutex); + if (IS_ERR(device)) + return PTR_ERR(device); + break; } -out: - kfree(orig); - return ret; -} - -/* - * Regular mount options parser. Everything that is needed only when - * reading in a new superblock is parsed here. - * XXX JDM: This needs to be cleaned up for remount. - */ -int btrfs_parse_options(struct btrfs_fs_info *info, char *options, - unsigned long new_flags) -{ - substring_t args[MAX_OPT_ARGS]; - char *p, *num; - int intarg; - int ret = 0; - char *compress_type; - bool compress_force = false; - enum btrfs_compression_type saved_compress_type; - int saved_compress_level; - bool saved_compress_force; - int no_compress = 0; - const bool remounting = test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state); - - if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE)) - btrfs_set_opt(info->mount_opt, FREE_SPACE_TREE); - else if (btrfs_free_space_cache_v1_active(info)) { - if (btrfs_is_zoned(info)) { - btrfs_info(info, - "zoned: clearing existing space cache"); - btrfs_set_super_cache_generation(info->super_copy, 0); + case Opt_datasum: + if (result.negated) { + btrfs_set_opt(ctx->mount_opt, NODATASUM); } else { - btrfs_set_opt(info->mount_opt, SPACE_CACHE); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); } - } - - /* - * Even the options are empty, we still need to do extra check - * against new flags - */ - if (!options) - goto check; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_degraded: - btrfs_info(info, "allowing degraded mounts"); - btrfs_set_opt(info->mount_opt, DEGRADED); - break; - case Opt_subvol: - case Opt_subvol_empty: - case Opt_subvolid: - case Opt_device: - /* - * These are parsed by btrfs_parse_subvol_options or - * btrfs_parse_device_options and can be ignored here. 
- */ - break; - case Opt_nodatasum: - btrfs_set_and_info(info, NODATASUM, - "setting nodatasum"); - break; - case Opt_datasum: - if (btrfs_test_opt(info, NODATASUM)) { - if (btrfs_test_opt(info, NODATACOW)) - btrfs_info(info, - "setting datasum, datacow enabled"); - else - btrfs_info(info, "setting datasum"); - } - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - break; - case Opt_nodatacow: - if (!btrfs_test_opt(info, NODATACOW)) { - if (!btrfs_test_opt(info, COMPRESS) || - !btrfs_test_opt(info, FORCE_COMPRESS)) { - btrfs_info(info, - "setting nodatacow, compression disabled"); - } else { - btrfs_info(info, "setting nodatacow"); - } - } - btrfs_clear_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); - btrfs_set_opt(info->mount_opt, NODATACOW); - btrfs_set_opt(info->mount_opt, NODATASUM); - break; - case Opt_datacow: - btrfs_clear_and_info(info, NODATACOW, - "setting datacow"); - break; - case Opt_compress_force: - case Opt_compress_force_type: - compress_force = true; - fallthrough; - case Opt_compress: - case Opt_compress_type: - saved_compress_type = btrfs_test_opt(info, - COMPRESS) ? - info->compress_type : BTRFS_COMPRESS_NONE; - saved_compress_force = - btrfs_test_opt(info, FORCE_COMPRESS); - saved_compress_level = info->compress_level; - if (token == Opt_compress || - token == Opt_compress_force || - strncmp(args[0].from, "zlib", 4) == 0) { - compress_type = "zlib"; - - info->compress_type = BTRFS_COMPRESS_ZLIB; - info->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; - /* - * args[0] contains uninitialized data since - * for these tokens we don't expect any - * parameter. - */ - if (token != Opt_compress && - token != Opt_compress_force) - info->compress_level = - btrfs_compress_str2level( - BTRFS_COMPRESS_ZLIB, - args[0].from + 4); - btrfs_set_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - no_compress = 0; - } else if (strncmp(args[0].from, "lzo", 3) == 0) { - compress_type = "lzo"; - info->compress_type = BTRFS_COMPRESS_LZO; - info->compress_level = 0; - btrfs_set_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - btrfs_set_fs_incompat(info, COMPRESS_LZO); - no_compress = 0; - } else if (strncmp(args[0].from, "zstd", 4) == 0) { - compress_type = "zstd"; - info->compress_type = BTRFS_COMPRESS_ZSTD; - info->compress_level = - btrfs_compress_str2level( - BTRFS_COMPRESS_ZSTD, - args[0].from + 4); - btrfs_set_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, NODATACOW); - btrfs_clear_opt(info->mount_opt, NODATASUM); - btrfs_set_fs_incompat(info, COMPRESS_ZSTD); - no_compress = 0; - } else if (strncmp(args[0].from, "no", 2) == 0) { - compress_type = "no"; - info->compress_level = 0; - info->compress_type = 0; - btrfs_clear_opt(info->mount_opt, COMPRESS); - btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); - compress_force = false; - no_compress++; - } else { - btrfs_err(info, "unrecognized compression value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - - if (compress_force) { - btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); - } else { - /* - * If we remount from compress-force=xxx to - * compress=xxx, we need clear FORCE_COMPRESS - * flag, otherwise, there is no way for users - * to disable forcible compression separately. 
- */ - btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); - } - if (no_compress == 1) { - btrfs_info(info, "use no compression"); - } else if ((info->compress_type != saved_compress_type) || - (compress_force != saved_compress_force) || - (info->compress_level != saved_compress_level)) { - btrfs_info(info, "%s %s compression, level %d", - (compress_force) ? "force" : "use", - compress_type, info->compress_level); - } - compress_force = false; - break; - case Opt_ssd: - btrfs_set_and_info(info, SSD, - "enabling ssd optimizations"); - btrfs_clear_opt(info->mount_opt, NOSSD); - break; - case Opt_ssd_spread: - btrfs_set_and_info(info, SSD, - "enabling ssd optimizations"); - btrfs_set_and_info(info, SSD_SPREAD, - "using spread ssd allocation scheme"); - btrfs_clear_opt(info->mount_opt, NOSSD); - break; - case Opt_nossd: - btrfs_set_opt(info->mount_opt, NOSSD); - btrfs_clear_and_info(info, SSD, - "not using ssd optimizations"); - fallthrough; - case Opt_nossd_spread: - btrfs_clear_and_info(info, SSD_SPREAD, - "not using spread ssd allocation scheme"); - break; - case Opt_barrier: - btrfs_clear_and_info(info, NOBARRIER, - "turning on barriers"); - break; - case Opt_nobarrier: - btrfs_set_and_info(info, NOBARRIER, - "turning off barriers"); - break; - case Opt_thread_pool: - ret = match_int(&args[0], &intarg); - if (ret) { - btrfs_err(info, "unrecognized thread_pool value %s", - args[0].from); - goto out; - } else if (intarg == 0) { - btrfs_err(info, "invalid value 0 for thread_pool"); - ret = -EINVAL; - goto out; - } - info->thread_pool_size = intarg; - break; - case Opt_max_inline: - num = match_strdup(&args[0]); - if (num) { - info->max_inline = memparse(num, NULL); - kfree(num); - - if (info->max_inline) { - info->max_inline = min_t(u64, - info->max_inline, - info->sectorsize); - } - btrfs_info(info, "max_inline at %llu", - info->max_inline); - } else { - ret = -ENOMEM; - goto out; - } - break; - case Opt_acl: + break; + case Opt_datacow: + if (result.negated) { + btrfs_clear_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + btrfs_set_opt(ctx->mount_opt, NODATACOW); + btrfs_set_opt(ctx->mount_opt, NODATASUM); + } else { + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + } + break; + case Opt_compress_force: + case Opt_compress_force_type: + btrfs_set_opt(ctx->mount_opt, FORCE_COMPRESS); + fallthrough; + case Opt_compress: + case Opt_compress_type: + if (opt == Opt_compress || opt == Opt_compress_force) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = BTRFS_ZLIB_DEFAULT_LEVEL; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (strncmp(param->string, "zlib", 4) == 0) { + ctx->compress_type = BTRFS_COMPRESS_ZLIB; + ctx->compress_level = + btrfs_compress_str2level(BTRFS_COMPRESS_ZLIB, + param->string + 4); + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (strncmp(param->string, "lzo", 3) == 0) { + ctx->compress_type = BTRFS_COMPRESS_LZO; + ctx->compress_level = 0; + btrfs_set_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (strncmp(param->string, "zstd", 4) == 0) { + ctx->compress_type = BTRFS_COMPRESS_ZSTD; + ctx->compress_level = + btrfs_compress_str2level(BTRFS_COMPRESS_ZSTD, + param->string + 4); + btrfs_set_opt(ctx->mount_opt, COMPRESS); + 
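The compress= handling in the surrounding hunk matches the type by prefix and hands the remainder (param->string + 4, e.g. ":3") to btrfs_compress_str2level(). A userspace sketch of that split, with hypothetical helper names and an assumed default level of 3 for zlib/zstd:

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse an optional ":<n>" suffix, falling back to a default level. */
static int parse_level(const char *s, int def)
{
	if (*s == ':')
		return atoi(s + 1);
	return def; /* no explicit level given */
}

static void parse_compress(const char *s)
{
	if (strncmp(s, "zstd", 4) == 0)
		printf("zstd, level %d\n", parse_level(s + 4, 3));
	else if (strncmp(s, "zlib", 4) == 0)
		printf("zlib, level %d\n", parse_level(s + 4, 3));
	else if (strncmp(s, "lzo", 3) == 0)
		printf("lzo (no levels)\n");
	else
		printf("unrecognized: %s\n", s);
}

int main(void)
{
	parse_compress("zstd:3"); /* explicit level */
	parse_compress("zlib");   /* default level */
	return 0;
}
```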
btrfs_clear_opt(ctx->mount_opt, NODATACOW); + btrfs_clear_opt(ctx->mount_opt, NODATASUM); + } else if (strncmp(param->string, "no", 2) == 0) { + ctx->compress_level = 0; + ctx->compress_type = 0; + btrfs_clear_opt(ctx->mount_opt, COMPRESS); + btrfs_clear_opt(ctx->mount_opt, FORCE_COMPRESS); + } else { + btrfs_err(NULL, "unrecognized compression value %s", + param->string); + return -EINVAL; + } + break; + case Opt_ssd: + if (result.negated) { + btrfs_set_opt(ctx->mount_opt, NOSSD); + btrfs_clear_opt(ctx->mount_opt, SSD); + btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD); + } else { + btrfs_set_opt(ctx->mount_opt, SSD); + btrfs_clear_opt(ctx->mount_opt, NOSSD); + } + break; + case Opt_ssd_spread: + if (result.negated) { + btrfs_clear_opt(ctx->mount_opt, SSD_SPREAD); + } else { + btrfs_set_opt(ctx->mount_opt, SSD); + btrfs_set_opt(ctx->mount_opt, SSD_SPREAD); + btrfs_clear_opt(ctx->mount_opt, NOSSD); + } + break; + case Opt_barrier: + if (result.negated) + btrfs_set_opt(ctx->mount_opt, NOBARRIER); + else + btrfs_clear_opt(ctx->mount_opt, NOBARRIER); + break; + case Opt_thread_pool: + if (result.uint_32 == 0) { + btrfs_err(NULL, "invalid value 0 for thread_pool"); + return -EINVAL; + } + ctx->thread_pool_size = result.uint_32; + break; + case Opt_max_inline: + ctx->max_inline = memparse(param->string, NULL); + break; + case Opt_acl: + if (result.negated) { + fc->sb_flags &= ~SB_POSIXACL; + } else { #ifdef CONFIG_BTRFS_FS_POSIX_ACL - info->sb->s_flags |= SB_POSIXACL; - break; + fc->sb_flags |= SB_POSIXACL; #else - btrfs_err(info, "support for ACL not compiled in!"); - ret = -EINVAL; - goto out; + btrfs_err(NULL, "support for ACL not compiled in"); + return -EINVAL; #endif - case Opt_noacl: - info->sb->s_flags &= ~SB_POSIXACL; - break; - case Opt_notreelog: - btrfs_set_and_info(info, NOTREELOG, - "disabling tree log"); - break; - case Opt_treelog: - btrfs_clear_and_info(info, NOTREELOG, - "enabling tree log"); - break; - case Opt_norecovery: - case Opt_nologreplay: - btrfs_warn(info, + } + /* + * VFS limits the ability to toggle ACL on and off via remount, + * despite every file system allowing this. This seems to be + * an oversight since we all allow it, but toggling will fail if we're + * remounting. So don't set the mask here; we'll check it in + * btrfs_reconfigure and do the toggling ourselves.
+ */ + if (fc->purpose != FS_CONTEXT_FOR_RECONFIGURE) + fc->sb_flags_mask |= SB_POSIXACL; + break; + case Opt_treelog: + if (result.negated) + btrfs_set_opt(ctx->mount_opt, NOTREELOG); + else + btrfs_clear_opt(ctx->mount_opt, NOTREELOG); + break; + case Opt_nologreplay: + btrfs_warn(NULL, "'nologreplay' is deprecated, use 'rescue=nologreplay' instead"); - btrfs_set_and_info(info, NOLOGREPLAY, - "disabling log replay at mount time"); - break; - case Opt_flushoncommit: - btrfs_set_and_info(info, FLUSHONCOMMIT, - "turning on flush-on-commit"); - break; - case Opt_noflushoncommit: - btrfs_clear_and_info(info, FLUSHONCOMMIT, - "turning off flush-on-commit"); - break; - case Opt_ratio: - ret = match_int(&args[0], &intarg); - if (ret) { - btrfs_err(info, "unrecognized metadata_ratio value %s", - args[0].from); - goto out; - } - info->metadata_ratio = intarg; - btrfs_info(info, "metadata ratio %u", - info->metadata_ratio); - break; - case Opt_discard: - case Opt_discard_mode: - if (token == Opt_discard || - strcmp(args[0].from, "sync") == 0) { - btrfs_clear_opt(info->mount_opt, DISCARD_ASYNC); - btrfs_set_and_info(info, DISCARD_SYNC, - "turning on sync discard"); - } else if (strcmp(args[0].from, "async") == 0) { - btrfs_clear_opt(info->mount_opt, DISCARD_SYNC); - btrfs_set_and_info(info, DISCARD_ASYNC, - "turning on async discard"); - } else { - btrfs_err(info, "unrecognized discard mode value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - btrfs_clear_opt(info->mount_opt, NODISCARD); - break; - case Opt_nodiscard: - btrfs_clear_and_info(info, DISCARD_SYNC, - "turning off discard"); - btrfs_clear_and_info(info, DISCARD_ASYNC, - "turning off async discard"); - btrfs_set_opt(info->mount_opt, NODISCARD); - break; - case Opt_space_cache: - case Opt_space_cache_version: - /* - * We already set FREE_SPACE_TREE above because we have - * compat_ro(FREE_SPACE_TREE) set, and we aren't going - * to allow v1 to be set for extent tree v2, simply - * ignore this setting if we're extent tree v2. - */ - if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) - break; - if (token == Opt_space_cache || - strcmp(args[0].from, "v1") == 0) { - btrfs_clear_opt(info->mount_opt, - FREE_SPACE_TREE); - btrfs_set_and_info(info, SPACE_CACHE, - "enabling disk space caching"); - } else if (strcmp(args[0].from, "v2") == 0) { - btrfs_clear_opt(info->mount_opt, - SPACE_CACHE); - btrfs_set_and_info(info, FREE_SPACE_TREE, - "enabling free space tree"); - } else { - btrfs_err(info, "unrecognized space_cache value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - break; - case Opt_rescan_uuid_tree: - btrfs_set_opt(info->mount_opt, RESCAN_UUID_TREE); - break; - case Opt_no_space_cache: - /* - * We cannot operate without the free space tree with - * extent tree v2, ignore this option. 
- */ - if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) - break; - if (btrfs_test_opt(info, SPACE_CACHE)) { - btrfs_clear_and_info(info, SPACE_CACHE, - "disabling disk space caching"); - } - if (btrfs_test_opt(info, FREE_SPACE_TREE)) { - btrfs_clear_and_info(info, FREE_SPACE_TREE, - "disabling free space tree"); - } + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); + break; + case Opt_flushoncommit: + if (result.negated) + btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT); + else + btrfs_set_opt(ctx->mount_opt, FLUSHONCOMMIT); + break; + case Opt_ratio: + ctx->metadata_ratio = result.uint_32; + break; + case Opt_discard: + if (result.negated) { + btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC); + btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); + btrfs_set_opt(ctx->mount_opt, NODISCARD); + } else { + btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC); + btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); + } + break; + case Opt_discard_mode: + switch (result.uint_32) { + case Opt_discard_sync: + btrfs_clear_opt(ctx->mount_opt, DISCARD_ASYNC); + btrfs_set_opt(ctx->mount_opt, DISCARD_SYNC); break; - case Opt_inode_cache: - case Opt_noinode_cache: - btrfs_warn(info, - "the 'inode_cache' option is deprecated and has no effect since 5.11"); + case Opt_discard_async: + btrfs_clear_opt(ctx->mount_opt, DISCARD_SYNC); + btrfs_set_opt(ctx->mount_opt, DISCARD_ASYNC); break; - case Opt_clear_cache: - /* - * We cannot clear the free space tree with extent tree - * v2, ignore this option. - */ - if (btrfs_fs_incompat(info, EXTENT_TREE_V2)) - break; - btrfs_set_and_info(info, CLEAR_CACHE, - "force clearing of disk cache"); - break; - case Opt_user_subvol_rm_allowed: - btrfs_set_opt(info->mount_opt, USER_SUBVOL_RM_ALLOWED); + default: + btrfs_err(NULL, "unrecognized discard mode value %s", + param->key); + return -EINVAL; + } + btrfs_clear_opt(ctx->mount_opt, NODISCARD); + break; + case Opt_space_cache: + if (result.negated) { + btrfs_set_opt(ctx->mount_opt, NOSPACECACHE); + btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE); + btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); + } else { + btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); + btrfs_set_opt(ctx->mount_opt, SPACE_CACHE); + } + break; + case Opt_space_cache_version: + switch (result.uint_32) { + case Opt_space_cache_v1: + btrfs_set_opt(ctx->mount_opt, SPACE_CACHE); + btrfs_clear_opt(ctx->mount_opt, FREE_SPACE_TREE); break; - case Opt_enospc_debug: - btrfs_set_opt(info->mount_opt, ENOSPC_DEBUG); + case Opt_space_cache_v2: + btrfs_clear_opt(ctx->mount_opt, SPACE_CACHE); + btrfs_set_opt(ctx->mount_opt, FREE_SPACE_TREE); break; - case Opt_noenospc_debug: - btrfs_clear_opt(info->mount_opt, ENOSPC_DEBUG); + default: + btrfs_err(NULL, "unrecognized space_cache value %s", + param->key); + return -EINVAL; + } + break; + case Opt_rescan_uuid_tree: + btrfs_set_opt(ctx->mount_opt, RESCAN_UUID_TREE); + break; + case Opt_clear_cache: + btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE); + break; + case Opt_user_subvol_rm_allowed: + btrfs_set_opt(ctx->mount_opt, USER_SUBVOL_RM_ALLOWED); + break; + case Opt_enospc_debug: + if (result.negated) + btrfs_clear_opt(ctx->mount_opt, ENOSPC_DEBUG); + else + btrfs_set_opt(ctx->mount_opt, ENOSPC_DEBUG); + break; + case Opt_defrag: + if (result.negated) + btrfs_clear_opt(ctx->mount_opt, AUTO_DEFRAG); + else + btrfs_set_opt(ctx->mount_opt, AUTO_DEFRAG); + break; + case Opt_usebackuproot: + btrfs_warn(NULL, + "'usebackuproot' is deprecated, use 'rescue=usebackuproot' instead"); + btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT); + + /* If we're loading the backup 
roots we can't trust the space cache. */ + btrfs_set_opt(ctx->mount_opt, CLEAR_CACHE); + break; + case Opt_skip_balance: + btrfs_set_opt(ctx->mount_opt, SKIP_BALANCE); + break; + case Opt_fatal_errors: + switch (result.uint_32) { + case Opt_fatal_errors_panic: + btrfs_set_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR); break; - case Opt_defrag: - btrfs_set_and_info(info, AUTO_DEFRAG, - "enabling auto defrag"); + case Opt_fatal_errors_bug: + btrfs_clear_opt(ctx->mount_opt, PANIC_ON_FATAL_ERROR); break; - case Opt_nodefrag: - btrfs_clear_and_info(info, AUTO_DEFRAG, - "disabling auto defrag"); + default: + btrfs_err(NULL, "unrecognized fatal_errors value %s", + param->key); + return -EINVAL; + } + break; + case Opt_commit_interval: + ctx->commit_interval = result.uint_32; + if (ctx->commit_interval == 0) + ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + break; + case Opt_rescue: + switch (result.uint_32) { + case Opt_rescue_usebackuproot: + btrfs_set_opt(ctx->mount_opt, USEBACKUPROOT); break; - case Opt_recovery: - case Opt_usebackuproot: - btrfs_warn(info, - "'%s' is deprecated, use 'rescue=usebackuproot' instead", - token == Opt_recovery ? "recovery" : - "usebackuproot"); - btrfs_info(info, - "trying to use backup root at mount time"); - btrfs_set_opt(info->mount_opt, USEBACKUPROOT); + case Opt_rescue_nologreplay: + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); break; - case Opt_skip_balance: - btrfs_set_opt(info->mount_opt, SKIP_BALANCE); + case Opt_rescue_ignorebadroots: + btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS); break; - case Opt_fatal_errors: - if (strcmp(args[0].from, "panic") == 0) { - btrfs_set_opt(info->mount_opt, - PANIC_ON_FATAL_ERROR); - } else if (strcmp(args[0].from, "bug") == 0) { - btrfs_clear_opt(info->mount_opt, - PANIC_ON_FATAL_ERROR); - } else { - btrfs_err(info, "unrecognized fatal_errors value %s", - args[0].from); - ret = -EINVAL; - goto out; - } + case Opt_rescue_ignoredatacsums: + btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS); break; - case Opt_commit_interval: - intarg = 0; - ret = match_int(&args[0], &intarg); - if (ret) { - btrfs_err(info, "unrecognized commit_interval value %s", - args[0].from); - ret = -EINVAL; - goto out; - } - if (intarg == 0) { - btrfs_info(info, - "using default commit interval %us", - BTRFS_DEFAULT_COMMIT_INTERVAL); - intarg = BTRFS_DEFAULT_COMMIT_INTERVAL; - } else if (intarg > 300) { - btrfs_warn(info, "excessive commit interval %d", - intarg); - } - info->commit_interval = intarg; - break; - case Opt_rescue: - ret = parse_rescue_options(info, args[0].from); - if (ret < 0) { - btrfs_err(info, "unrecognized rescue value %s", - args[0].from); - goto out; - } + case Opt_rescue_parameter_all: + btrfs_set_opt(ctx->mount_opt, IGNOREDATACSUMS); + btrfs_set_opt(ctx->mount_opt, IGNOREBADROOTS); + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); break; + default: + btrfs_info(NULL, "unrecognized rescue option '%s'", + param->key); + return -EINVAL; + } + break; #ifdef CONFIG_BTRFS_DEBUG - case Opt_fragment_all: - btrfs_info(info, "fragmenting all space"); - btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); - btrfs_set_opt(info->mount_opt, FRAGMENT_METADATA); + case Opt_fragment: + switch (result.uint_32) { + case Opt_fragment_parameter_all: + btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA); + btrfs_set_opt(ctx->mount_opt, FRAGMENT_METADATA); break; - case Opt_fragment_metadata: - btrfs_info(info, "fragmenting metadata"); - btrfs_set_opt(info->mount_opt, - FRAGMENT_METADATA); + case Opt_fragment_parameter_metadata: + btrfs_set_opt(ctx->mount_opt, 
FRAGMENT_METADATA); break; - case Opt_fragment_data: - btrfs_info(info, "fragmenting data"); - btrfs_set_opt(info->mount_opt, FRAGMENT_DATA); + case Opt_fragment_parameter_data: + btrfs_set_opt(ctx->mount_opt, FRAGMENT_DATA); break; + default: + btrfs_info(NULL, "unrecognized fragment option '%s'", + param->key); + return -EINVAL; + } + break; #endif #ifdef CONFIG_BTRFS_FS_REF_VERIFY - case Opt_ref_verify: - btrfs_info(info, "doing ref verification"); - btrfs_set_opt(info->mount_opt, REF_VERIFY); - break; + case Opt_ref_verify: + btrfs_set_opt(ctx->mount_opt, REF_VERIFY); + break; #endif - case Opt_err: - btrfs_err(info, "unrecognized mount option '%s'", p); - ret = -EINVAL; - goto out; - default: - break; - } + default: + btrfs_err(NULL, "unrecognized mount option '%s'", param->key); + return -EINVAL; } -check: - /* We're read-only, don't have to check. */ - if (new_flags & SB_RDONLY) - goto out; - if (check_ro_option(info, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || - check_ro_option(info, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || - check_ro_option(info, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums")) - ret = -EINVAL; -out: - if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && - !btrfs_test_opt(info, FREE_SPACE_TREE) && - !btrfs_test_opt(info, CLEAR_CACHE)) { - btrfs_err(info, "cannot disable free space tree"); - ret = -EINVAL; - } - if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) && - !btrfs_test_opt(info, FREE_SPACE_TREE)) { - btrfs_err(info, "cannot disable free space tree with block-group-tree feature"); - ret = -EINVAL; - } - if (!ret) - ret = btrfs_check_mountopts_zoned(info); - if (!ret && !remounting) { - if (btrfs_test_opt(info, SPACE_CACHE)) - btrfs_info(info, "disk space caching is enabled"); - if (btrfs_test_opt(info, FREE_SPACE_TREE)) - btrfs_info(info, "using free space tree"); - } - return ret; + return 0; } /* - * Parse mount options that are required early in the mount process. - * - * All other options will be parsed on much later in the mount process and - * only when we need to allocate a new super block. + * Some options only have meaning at mount time and shouldn't persist across + * remounts, or be displayed. Clear these at the end of mount and remount code + * paths. 
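The switch above never touches raw option strings: fs_parse() has already tokenized each parameter against the btrfs_fs_parameters table (referenced by the file_system_type at the end of this patch) and delivers the outcome in result.negated and result.uint_32. For illustration only, a table driving this kind of switch would look roughly like the sketch below; the option names mirror the cases above, but the real table is defined elsewhere in this patch and may differ:

#include <linux/fs_parser.h>

/* Illustrative sketch, not the patch's actual table. */
static const struct constant_table btrfs_parameter_fatal_errors[] = {
	{ "panic",	Opt_fatal_errors_panic },
	{ "bug",	Opt_fatal_errors_bug },
	{}
};

static const struct fs_parameter_spec btrfs_fs_parameters[] = {
	/* "ssd" and "nossd": fs_parse() sets result.negated for the latter. */
	fsparam_flag_no("ssd",		Opt_ssd),
	fsparam_flag_no("barrier",	Opt_barrier),
	/* "thread_pool=N" arrives pre-parsed in result.uint_32. */
	fsparam_u32("thread_pool",	Opt_thread_pool),
	/* "fatal_errors=panic|bug" is mapped to an enum via a constant table. */
	fsparam_enum("fatal_errors",	Opt_fatal_errors, btrfs_parameter_fatal_errors),
	{}
};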
*/ -static int btrfs_parse_device_options(const char *options, blk_mode_t flags) +static void btrfs_clear_oneshot_options(struct btrfs_fs_info *fs_info) { - substring_t args[MAX_OPT_ARGS]; - char *device_name, *opts, *orig, *p; - struct btrfs_device *device = NULL; - int error = 0; + btrfs_clear_opt(fs_info->mount_opt, USEBACKUPROOT); + btrfs_clear_opt(fs_info->mount_opt, CLEAR_CACHE); + btrfs_clear_opt(fs_info->mount_opt, NOSPACECACHE); +} - lockdep_assert_held(&uuid_mutex); +static bool check_ro_option(struct btrfs_fs_info *fs_info, + unsigned long mount_opt, unsigned long opt, + const char *opt_name) +{ + if (mount_opt & opt) { + btrfs_err(fs_info, "%s must be used with ro mount option", + opt_name); + return true; + } + return false; +} - if (!options) - return 0; +bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, + unsigned long flags) +{ + bool ret = true; - /* - * strsep changes the string, duplicate it because btrfs_parse_options - * gets called later - */ - opts = kstrdup(options, GFP_KERNEL); - if (!opts) - return -ENOMEM; - orig = opts; + if (!(flags & SB_RDONLY) && + (check_ro_option(info, *mount_opt, BTRFS_MOUNT_NOLOGREPLAY, "nologreplay") || + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREBADROOTS, "ignorebadroots") || + check_ro_option(info, *mount_opt, BTRFS_MOUNT_IGNOREDATACSUMS, "ignoredatacsums"))) + ret = false; - while ((p = strsep(&opts, ",")) != NULL) { - int token; + if (btrfs_fs_compat_ro(info, FREE_SPACE_TREE) && + !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE) && + !btrfs_raw_test_opt(*mount_opt, CLEAR_CACHE)) { + btrfs_err(info, "cannot disable free-space-tree"); + ret = false; + } + if (btrfs_fs_compat_ro(info, BLOCK_GROUP_TREE) && + !btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) { + btrfs_err(info, "cannot disable free-space-tree with block-group-tree feature"); + ret = false; + } - if (!*p) - continue; + if (btrfs_check_mountopts_zoned(info, mount_opt)) + ret = false; - token = match_token(p, tokens, args); - if (token == Opt_device) { - device_name = match_strdup(&args[0]); - if (!device_name) { - error = -ENOMEM; - goto out; - } - device = btrfs_scan_one_device(device_name, flags, false); - kfree(device_name); - if (IS_ERR(device)) { - error = PTR_ERR(device); - goto out; - } - } + if (!test_bit(BTRFS_FS_STATE_REMOUNTING, &info->fs_state)) { + if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) + btrfs_info(info, "disk space caching is enabled"); + if (btrfs_raw_test_opt(*mount_opt, FREE_SPACE_TREE)) + btrfs_info(info, "using free-space-tree"); } -out: - kfree(orig); - return error; + return ret; } /* - * Parse mount options that are related to subvolume id + * This is subtle, we only call this during open_ctree(). We need to pre-load + * the mount options with the on-disk settings. Before the new mount API took + * effect we would do this on mount and remount. With the new mount API we'll + * only do this on the initial mount. * - * The value is later passed to mount_subvol() + * This isn't a change in behavior, because we're using the current state of the + * file system to set the current mount options. If you mounted with special + * options to disable these features and then remounted we wouldn't revert the + * settings, because mounting without these features cleared the on-disk + * settings, so this being called on re-mount is not needed. 
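Note that btrfs_check_options() validates a caller-supplied mount_opt word rather than fs_info->mount_opt itself, so a failing check can reject a mount or remount without having clobbered the live options. The btrfs_*_opt() helpers used throughout are plain bit operations on that word; simplified from their definitions in fs/btrfs/fs.h, they amount to:

/* Each BTRFS_MOUNT_* option is a single bit, so a candidate option set can
 * be staged in a plain unsigned long, checked, and only then copied into
 * fs_info->mount_opt. */
#define btrfs_set_opt(o, opt)		((o) |= BTRFS_MOUNT_##opt)
#define btrfs_clear_opt(o, opt)		((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_raw_test_opt(o, opt)	((o) & BTRFS_MOUNT_##opt)
#define btrfs_test_opt(fs_info, opt)	((fs_info)->mount_opt & BTRFS_MOUNT_##opt)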
*/ -static int btrfs_parse_subvol_options(const char *options, char **subvol_name, - u64 *subvol_objectid) +void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info) { - substring_t args[MAX_OPT_ARGS]; - char *opts, *orig, *p; - int error = 0; - u64 subvolid; - - if (!options) - return 0; + if (fs_info->sectorsize < PAGE_SIZE) { + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + if (!btrfs_test_opt(fs_info, FREE_SPACE_TREE)) { + btrfs_info(fs_info, + "forcing free space tree for sector size %u with page size %lu", + fs_info->sectorsize, PAGE_SIZE); + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + } + } /* - * strsep changes the string, duplicate it because - * btrfs_parse_device_options gets called later + * At this point our mount options are populated, so we only mess with + * these settings if we don't have any settings already. */ - opts = kstrdup(options, GFP_KERNEL); - if (!opts) - return -ENOMEM; - orig = opts; + if (btrfs_test_opt(fs_info, FREE_SPACE_TREE)) + return; - while ((p = strsep(&opts, ",")) != NULL) { - int token; - if (!*p) - continue; + if (btrfs_is_zoned(fs_info) && + btrfs_free_space_cache_v1_active(fs_info)) { + btrfs_info(fs_info, "zoned: clearing existing space cache"); + btrfs_set_super_cache_generation(fs_info->super_copy, 0); + return; + } - token = match_token(p, tokens, args); - switch (token) { - case Opt_subvol: - kfree(*subvol_name); - *subvol_name = match_strdup(&args[0]); - if (!*subvol_name) { - error = -ENOMEM; - goto out; - } - break; - case Opt_subvolid: - error = match_u64(&args[0], &subvolid); - if (error) - goto out; + if (btrfs_test_opt(fs_info, SPACE_CACHE)) + return; - /* we want the original fs_tree */ - if (subvolid == 0) - subvolid = BTRFS_FS_TREE_OBJECTID; + if (btrfs_test_opt(fs_info, NOSPACECACHE)) + return; - *subvol_objectid = subvolid; - break; - default: - break; - } - } + /* + * At this point we don't have explicit options set by the user, set + * them ourselves based on the state of the file system. + */ + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + else if (btrfs_free_space_cache_v1_active(fs_info)) + btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); +} -out: - kfree(orig); - return error; +static void set_device_specific_options(struct btrfs_fs_info *fs_info) +{ + if (!btrfs_test_opt(fs_info, NOSSD) && + !fs_info->fs_devices->rotating) + btrfs_set_opt(fs_info->mount_opt, SSD); + + /* + * For devices supporting discard turn on discard=async automatically, + * unless it's already set or disabled. This could be turned off by + * nodiscard for the same mount. + * + * The zoned mode piggy backs on the discard functionality for + * resetting a zone. There is no reason to delay the zone reset as it is + * fast enough. So, do not enable async discard for zoned mode. 
+ */ + if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) || + btrfs_test_opt(fs_info, DISCARD_ASYNC) || + btrfs_test_opt(fs_info, NODISCARD)) && + fs_info->fs_devices->discardable && + !btrfs_is_zoned(fs_info)) + btrfs_set_opt(fs_info->mount_opt, DISCARD_ASYNC); } char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, @@ -1102,10 +929,6 @@ static int btrfs_fill_super(struct super_block *sb, #endif sb->s_xattr = btrfs_xattr_handlers; sb->s_time_gran = 1; -#ifdef CONFIG_BTRFS_FS_POSIX_ACL - sb->s_flags |= SB_POSIXACL; -#endif - sb->s_flags |= SB_I_VERSION; sb->s_iflags |= SB_I_CGROUPWB; err = super_setup_bdi(sb); @@ -1288,22 +1111,6 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) return 0; } -static int btrfs_test_super(struct super_block *s, void *data) -{ - struct btrfs_fs_info *p = data; - struct btrfs_fs_info *fs_info = btrfs_sb(s); - - return fs_info->fs_devices == p->fs_devices; -} - -static int btrfs_set_super(struct super_block *s, void *data) -{ - int err = set_anon_super(s, data); - if (!err) - s->s_fs_info = data; - return err; -} - /* * subvolumes are identified by ino 256 */ @@ -1379,200 +1186,6 @@ out: return root; } -/* - * Find a superblock for the given device / mount point. - * - * Note: This is based on mount_bdev from fs/super.c with a few additions - * for multiple device setup. Make sure to keep it in sync. - */ -static struct dentry *btrfs_mount_root(struct file_system_type *fs_type, - int flags, const char *device_name, void *data) -{ - struct block_device *bdev = NULL; - struct super_block *s; - struct btrfs_device *device = NULL; - struct btrfs_fs_devices *fs_devices = NULL; - struct btrfs_fs_info *fs_info = NULL; - void *new_sec_opts = NULL; - blk_mode_t mode = sb_open_mode(flags); - int error = 0; - - if (data) { - error = security_sb_eat_lsm_opts(data, &new_sec_opts); - if (error) - return ERR_PTR(error); - } - - /* - * Setup a dummy root and fs_info for test/set super. This is because - * we don't actually fill this stuff out until open_ctree, but we need - * then open_ctree will properly initialize the file system specific - * settings later. btrfs_init_fs_info initializes the static elements - * of the fs_info (locks and such) to make cleanup easier if we find a - * superblock with our given fs_devices later on at sget() time. - */ - fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); - if (!fs_info) { - error = -ENOMEM; - goto error_sec_opts; - } - btrfs_init_fs_info(fs_info); - - fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); - fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); - if (!fs_info->super_copy || !fs_info->super_for_commit) { - error = -ENOMEM; - goto error_fs_info; - } - - mutex_lock(&uuid_mutex); - error = btrfs_parse_device_options(data, mode); - if (error) { - mutex_unlock(&uuid_mutex); - goto error_fs_info; - } - - /* - * With 'true' passed to btrfs_scan_one_device() (mount time) we expect - * either a valid device or an error. 
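That scan is the same registration step userspace performs through the btrfs control device (e.g. `btrfs device scan`), which is how the module learns about all members of a multi-device filesystem before a mount. A minimal sketch of that side, assuming the uapi definitions from linux/btrfs.h:

#include <fcntl.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/btrfs.h>	/* BTRFS_IOC_SCAN_DEV, struct btrfs_ioctl_vol_args */

/* Ask the btrfs module to scan one block device for a superblock and
 * remember it, so a later mount can assemble the whole filesystem. */
static int scan_btrfs_device(const char *path)
{
	struct btrfs_ioctl_vol_args args;
	int fd, ret;

	fd = open("/dev/btrfs-control", O_RDONLY);
	if (fd < 0)
		return -1;
	memset(&args, 0, sizeof(args));
	strncpy(args.name, path, sizeof(args.name) - 1);
	ret = ioctl(fd, BTRFS_IOC_SCAN_DEV, &args);
	close(fd);
	return ret;
}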
- */ - device = btrfs_scan_one_device(device_name, mode, true); - ASSERT(device != NULL); - if (IS_ERR(device)) { - mutex_unlock(&uuid_mutex); - error = PTR_ERR(device); - goto error_fs_info; - } - - fs_devices = device->fs_devices; - fs_info->fs_devices = fs_devices; - - error = btrfs_open_devices(fs_devices, mode, fs_type); - mutex_unlock(&uuid_mutex); - if (error) - goto error_fs_info; - - if (!(flags & SB_RDONLY) && fs_devices->rw_devices == 0) { - error = -EACCES; - goto error_close_devices; - } - - bdev = fs_devices->latest_dev->bdev; - s = sget(fs_type, btrfs_test_super, btrfs_set_super, flags | SB_NOSEC, - fs_info); - if (IS_ERR(s)) { - error = PTR_ERR(s); - goto error_close_devices; - } - - if (s->s_root) { - btrfs_close_devices(fs_devices); - btrfs_free_fs_info(fs_info); - if ((flags ^ s->s_flags) & SB_RDONLY) - error = -EBUSY; - } else { - snprintf(s->s_id, sizeof(s->s_id), "%pg", bdev); - shrinker_debugfs_rename(s->s_shrink, "sb-%s:%s", fs_type->name, - s->s_id); - btrfs_sb(s)->bdev_holder = fs_type; - error = btrfs_fill_super(s, fs_devices, data); - } - if (!error) - error = security_sb_set_mnt_opts(s, new_sec_opts, 0, NULL); - security_free_mnt_opts(&new_sec_opts); - if (error) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - - return dget(s->s_root); - -error_close_devices: - btrfs_close_devices(fs_devices); -error_fs_info: - btrfs_free_fs_info(fs_info); -error_sec_opts: - security_free_mnt_opts(&new_sec_opts); - return ERR_PTR(error); -} - -/* - * Mount function which is called by VFS layer. - * - * In order to allow mounting a subvolume directly, btrfs uses mount_subtree() - * which needs vfsmount* of device's root (/). This means device's root has to - * be mounted internally in any case. - * - * Operation flow: - * 1. Parse subvol id related options for later use in mount_subvol(). - * - * 2. Mount device's root (/) by calling vfs_kern_mount(). - * - * NOTE: vfs_kern_mount() is used by VFS to call btrfs_mount() in the - * first place. In order to avoid calling btrfs_mount() again, we use - * different file_system_type which is not registered to VFS by - * register_filesystem() (btrfs_root_fs_type). As a result, - * btrfs_mount_root() is called. The return value will be used by - * mount_subtree() in mount_subvol(). - * - * 3. Call mount_subvol() to get the dentry of subvolume. Since there is - * "btrfs subvolume set-default", mount_subvol() is called always. 
- */ -static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, - const char *device_name, void *data) -{ - struct vfsmount *mnt_root; - struct dentry *root; - char *subvol_name = NULL; - u64 subvol_objectid = 0; - int error = 0; - - error = btrfs_parse_subvol_options(data, &subvol_name, - &subvol_objectid); - if (error) { - kfree(subvol_name); - return ERR_PTR(error); - } - - /* mount device's root (/) */ - mnt_root = vfs_kern_mount(&btrfs_root_fs_type, flags, device_name, data); - if (PTR_ERR_OR_ZERO(mnt_root) == -EBUSY) { - if (flags & SB_RDONLY) { - mnt_root = vfs_kern_mount(&btrfs_root_fs_type, - flags & ~SB_RDONLY, device_name, data); - } else { - mnt_root = vfs_kern_mount(&btrfs_root_fs_type, - flags | SB_RDONLY, device_name, data); - if (IS_ERR(mnt_root)) { - root = ERR_CAST(mnt_root); - kfree(subvol_name); - goto out; - } - - down_write(&mnt_root->mnt_sb->s_umount); - error = btrfs_remount(mnt_root->mnt_sb, &flags, NULL); - up_write(&mnt_root->mnt_sb->s_umount); - if (error < 0) { - root = ERR_PTR(error); - mntput(mnt_root); - kfree(subvol_name); - goto out; - } - } - } - if (IS_ERR(mnt_root)) { - root = ERR_CAST(mnt_root); - kfree(subvol_name); - goto out; - } - - /* mount_subvol() will free subvol_name and mnt_root */ - root = mount_subvol(subvol_name, subvol_objectid, mnt_root); - -out: - return root; -} - static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info, u32 new_pool_size, u32 old_pool_size) { @@ -1635,202 +1248,274 @@ static inline void btrfs_remount_cleanup(struct btrfs_fs_info *fs_info, btrfs_set_free_space_cache_v1_active(fs_info, cache_opt); } -static int btrfs_remount(struct super_block *sb, int *flags, char *data) +static int btrfs_remount_rw(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_info *fs_info = btrfs_sb(sb); - unsigned old_flags = sb->s_flags; - unsigned long old_opts = fs_info->mount_opt; - unsigned long old_compress_type = fs_info->compress_type; - u64 old_max_inline = fs_info->max_inline; - u32 old_thread_pool_size = fs_info->thread_pool_size; - u32 old_metadata_ratio = fs_info->metadata_ratio; int ret; - sync_filesystem(sb); - set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + if (BTRFS_FS_ERROR(fs_info)) { + btrfs_err(fs_info, + "remounting read-write after error is not allowed"); + return -EINVAL; + } - if (data) { - void *new_sec_opts = NULL; + if (fs_info->fs_devices->rw_devices == 0) + return -EACCES; - ret = security_sb_eat_lsm_opts(data, &new_sec_opts); - if (!ret) - ret = security_sb_remount(sb, new_sec_opts); - security_free_mnt_opts(&new_sec_opts); - if (ret) - goto restore; + if (!btrfs_check_rw_degradable(fs_info, NULL)) { + btrfs_warn(fs_info, + "too many missing devices, writable remount is not allowed"); + return -EACCES; } - ret = btrfs_parse_options(fs_info, data, *flags); + if (btrfs_super_log_root(fs_info->super_copy) != 0) { + btrfs_warn(fs_info, + "mount required to replay tree-log, cannot remount read-write"); + return -EINVAL; + } + + /* + * NOTE: when remounting with a change that does writes, don't put it + * anywhere above this point, as we are not sure to be safe to write + * until we pass the above checks. 
+ */ + ret = btrfs_start_pre_rw_mount(fs_info); if (ret) - goto restore; + return ret; - ret = btrfs_check_features(fs_info, !(*flags & SB_RDONLY)); - if (ret < 0) - goto restore; + btrfs_clear_sb_rdonly(fs_info->sb); - btrfs_remount_begin(fs_info, old_opts, *flags); - btrfs_resize_thread_pool(fs_info, - fs_info->thread_pool_size, old_thread_pool_size); + set_bit(BTRFS_FS_OPEN, &fs_info->flags); - if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) != - (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && - (!sb_rdonly(sb) || (*flags & SB_RDONLY))) { - btrfs_warn(fs_info, - "remount supports changing free space tree only from ro to rw"); - /* Make sure free space cache options match the state on disk */ - if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { - btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); - btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); - } - if (btrfs_free_space_cache_v1_active(fs_info)) { - btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE); - btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); - } - } + /* + * If we've gone from readonly -> read-write, we need to get our + * sync/async discard lists in the right state. + */ + btrfs_discard_resume(fs_info); - if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) - goto out; + return 0; +} - if (*flags & SB_RDONLY) { - /* - * this also happens on 'umount -rf' or on shutdown, when - * the filesystem is busy. - */ - cancel_work_sync(&fs_info->async_reclaim_work); - cancel_work_sync(&fs_info->async_data_reclaim_work); +static int btrfs_remount_ro(struct btrfs_fs_info *fs_info) +{ + /* + * This also happens on 'umount -rf' or on shutdown, when the + * filesystem is busy. + */ + cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); - btrfs_discard_cleanup(fs_info); + btrfs_discard_cleanup(fs_info); - /* wait for the uuid_scan task to finish */ - down(&fs_info->uuid_tree_rescan_sem); - /* avoid complains from lockdep et al. */ - up(&fs_info->uuid_tree_rescan_sem); + /* Wait for the uuid_scan task to finish */ + down(&fs_info->uuid_tree_rescan_sem); + /* Avoid complains from lockdep et al. */ + up(&fs_info->uuid_tree_rescan_sem); - btrfs_set_sb_rdonly(sb); + btrfs_set_sb_rdonly(fs_info->sb); - /* - * Setting SB_RDONLY will put the cleaner thread to - * sleep at the next loop if it's already active. - * If it's already asleep, we'll leave unused block - * groups on disk until we're mounted read-write again - * unless we clean them up here. - */ - btrfs_delete_unused_bgs(fs_info); + /* + * Setting SB_RDONLY will put the cleaner thread to sleep at the next + * loop if it's already active. If it's already asleep, we'll leave + * unused block groups on disk until we're mounted read-write again + * unless we clean them up here. + */ + btrfs_delete_unused_bgs(fs_info); - /* - * The cleaner task could be already running before we set the - * flag BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock). - * We must make sure that after we finish the remount, i.e. after - * we call btrfs_commit_super(), the cleaner can no longer start - * a transaction - either because it was dropping a dead root, - * running delayed iputs or deleting an unused block group (the - * cleaner picked a block group from the list of unused block - * groups before we were able to in the previous call to - * btrfs_delete_unused_bgs()). 
- */ - wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, - TASK_UNINTERRUPTIBLE); + /* + * The cleaner task could be already running before we set the flag + * BTRFS_FS_STATE_RO (and SB_RDONLY in the superblock). We must make + * sure that after we finish the remount, i.e. after we call + * btrfs_commit_super(), the cleaner can no longer start a transaction + * - either because it was dropping a dead root, running delayed iputs + * or deleting an unused block group (the cleaner picked a block + * group from the list of unused block groups before we were able to + * in the previous call to btrfs_delete_unused_bgs()). + */ + wait_on_bit(&fs_info->flags, BTRFS_FS_CLEANER_RUNNING, TASK_UNINTERRUPTIBLE); - /* - * We've set the superblock to RO mode, so we might have made - * the cleaner task sleep without running all pending delayed - * iputs. Go through all the delayed iputs here, so that if an - * unmount happens without remounting RW we don't end up at - * finishing close_ctree() with a non-empty list of delayed - * iputs. - */ - btrfs_run_delayed_iputs(fs_info); + /* + * We've set the superblock to RO mode, so we might have made the + * cleaner task sleep without running all pending delayed iputs. Go + * through all the delayed iputs here, so that if an unmount happens + * without remounting RW we don't end up at finishing close_ctree() + * with a non-empty list of delayed iputs. + */ + btrfs_run_delayed_iputs(fs_info); - btrfs_dev_replace_suspend_for_unmount(fs_info); - btrfs_scrub_cancel(fs_info); - btrfs_pause_balance(fs_info); + btrfs_dev_replace_suspend_for_unmount(fs_info); + btrfs_scrub_cancel(fs_info); + btrfs_pause_balance(fs_info); - /* - * Pause the qgroup rescan worker if it is running. We don't want - * it to be still running after we are in RO mode, as after that, - * by the time we unmount, it might have left a transaction open, - * so we would leak the transaction and/or crash. - */ - btrfs_qgroup_wait_for_completion(fs_info, false); + /* + * Pause the qgroup rescan worker if it is running. We don't want it to + * be still running after we are in RO mode, as after that, by the time + * we unmount, it might have left a transaction open, so we would leak + * the transaction and/or crash. 
+ */ + btrfs_qgroup_wait_for_completion(fs_info, false); - ret = btrfs_commit_super(fs_info); - if (ret) - goto restore; - } else { - if (BTRFS_FS_ERROR(fs_info)) { - btrfs_err(fs_info, - "Remounting read-write after error is not allowed"); - ret = -EINVAL; - goto restore; - } - if (fs_info->fs_devices->rw_devices == 0) { - ret = -EACCES; - goto restore; - } + return btrfs_commit_super(fs_info); +} - if (!btrfs_check_rw_degradable(fs_info, NULL)) { - btrfs_warn(fs_info, - "too many missing devices, writable remount is not allowed"); - ret = -EACCES; - goto restore; - } +static void btrfs_ctx_to_info(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx) +{ + fs_info->max_inline = ctx->max_inline; + fs_info->commit_interval = ctx->commit_interval; + fs_info->metadata_ratio = ctx->metadata_ratio; + fs_info->thread_pool_size = ctx->thread_pool_size; + fs_info->mount_opt = ctx->mount_opt; + fs_info->compress_type = ctx->compress_type; + fs_info->compress_level = ctx->compress_level; +} - if (btrfs_super_log_root(fs_info->super_copy) != 0) { - btrfs_warn(fs_info, - "mount required to replay tree-log, cannot remount read-write"); - ret = -EINVAL; - goto restore; - } +static void btrfs_info_to_ctx(struct btrfs_fs_info *fs_info, struct btrfs_fs_context *ctx) +{ + ctx->max_inline = fs_info->max_inline; + ctx->commit_interval = fs_info->commit_interval; + ctx->metadata_ratio = fs_info->metadata_ratio; + ctx->thread_pool_size = fs_info->thread_pool_size; + ctx->mount_opt = fs_info->mount_opt; + ctx->compress_type = fs_info->compress_type; + ctx->compress_level = fs_info->compress_level; +} - /* - * NOTE: when remounting with a change that does writes, don't - * put it anywhere above this point, as we are not sure to be - * safe to write until we pass the above checks. - */ - ret = btrfs_start_pre_rw_mount(fs_info); - if (ret) - goto restore; +#define btrfs_info_if_set(fs_info, old_ctx, opt, fmt, args...) \ +do { \ + if ((!old_ctx || !btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \ + btrfs_raw_test_opt(fs_info->mount_opt, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ +} while (0) + +#define btrfs_info_if_unset(fs_info, old_ctx, opt, fmt, args...) 
\ +do { \ + if ((old_ctx && btrfs_raw_test_opt(old_ctx->mount_opt, opt)) && \ + !btrfs_raw_test_opt(fs_info->mount_opt, opt)) \ + btrfs_info(fs_info, fmt, ##args); \ +} while (0) + +static void btrfs_emit_options(struct btrfs_fs_info *info, + struct btrfs_fs_context *old) +{ + btrfs_info_if_set(info, old, NODATASUM, "setting nodatasum"); + btrfs_info_if_set(info, old, DEGRADED, "allowing degraded mounts"); + btrfs_info_if_set(info, old, NODATACOW, "setting nodatacow"); + btrfs_info_if_set(info, old, SSD, "enabling ssd optimizations"); + btrfs_info_if_set(info, old, SSD_SPREAD, "using spread ssd allocation scheme"); + btrfs_info_if_set(info, old, NOBARRIER, "turning off barriers"); + btrfs_info_if_set(info, old, NOTREELOG, "disabling tree log"); + btrfs_info_if_set(info, old, NOLOGREPLAY, "disabling log replay at mount time"); + btrfs_info_if_set(info, old, FLUSHONCOMMIT, "turning on flush-on-commit"); + btrfs_info_if_set(info, old, DISCARD_SYNC, "turning on sync discard"); + btrfs_info_if_set(info, old, DISCARD_ASYNC, "turning on async discard"); + btrfs_info_if_set(info, old, FREE_SPACE_TREE, "enabling free space tree"); + btrfs_info_if_set(info, old, SPACE_CACHE, "enabling disk space caching"); + btrfs_info_if_set(info, old, CLEAR_CACHE, "force clearing of disk cache"); + btrfs_info_if_set(info, old, AUTO_DEFRAG, "enabling auto defrag"); + btrfs_info_if_set(info, old, FRAGMENT_DATA, "fragmenting data"); + btrfs_info_if_set(info, old, FRAGMENT_METADATA, "fragmenting metadata"); + btrfs_info_if_set(info, old, REF_VERIFY, "doing ref verification"); + btrfs_info_if_set(info, old, USEBACKUPROOT, "trying to use backup root at mount time"); + btrfs_info_if_set(info, old, IGNOREBADROOTS, "ignoring bad roots"); + btrfs_info_if_set(info, old, IGNOREDATACSUMS, "ignoring data csums"); + + btrfs_info_if_unset(info, old, NODATACOW, "setting datacow"); + btrfs_info_if_unset(info, old, SSD, "not using ssd optimizations"); + btrfs_info_if_unset(info, old, SSD_SPREAD, "not using spread ssd allocation scheme"); + btrfs_info_if_unset(info, old, NOBARRIER, "turning on barriers"); + btrfs_info_if_unset(info, old, NOTREELOG, "enabling tree log"); + btrfs_info_if_unset(info, old, SPACE_CACHE, "disabling disk space caching"); + btrfs_info_if_unset(info, old, FREE_SPACE_TREE, "disabling free space tree"); + btrfs_info_if_unset(info, old, AUTO_DEFRAG, "disabling auto defrag"); + btrfs_info_if_unset(info, old, COMPRESS, "use no compression"); + + /* Did the compression settings change? */ + if (btrfs_test_opt(info, COMPRESS) && + (!old || + old->compress_type != info->compress_type || + old->compress_level != info->compress_level || + (!btrfs_raw_test_opt(old->mount_opt, FORCE_COMPRESS) && + btrfs_raw_test_opt(info->mount_opt, FORCE_COMPRESS)))) { + const char *compress_type = btrfs_compress_type2str(info->compress_type); + + btrfs_info(info, "%s %s compression, level %d", + btrfs_test_opt(info, FORCE_COMPRESS) ?
"force" : "use", + compress_type, info->compress_level); + } - btrfs_clear_sb_rdonly(sb); + if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) + btrfs_info(info, "max_inline set to %llu", info->max_inline); +} - set_bit(BTRFS_FS_OPEN, &fs_info->flags); +static int btrfs_reconfigure(struct fs_context *fc) +{ + struct super_block *sb = fc->root->d_sb; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + struct btrfs_fs_context *ctx = fc->fs_private; + struct btrfs_fs_context old_ctx; + int ret = 0; + bool mount_reconfigure = (fc->s_fs_info != NULL); - /* - * If we've gone from readonly -> read/write, we need to get - * our sync/async discard lists in the right state. - */ - btrfs_discard_resume(fs_info); + btrfs_info_to_ctx(fs_info, &old_ctx); + + sync_filesystem(sb); + set_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); + + if (!mount_reconfigure && + !btrfs_check_options(fs_info, &ctx->mount_opt, fc->sb_flags)) + return -EINVAL; + + ret = btrfs_check_features(fs_info, !(fc->sb_flags & SB_RDONLY)); + if (ret < 0) + return ret; + + btrfs_ctx_to_info(fs_info, ctx); + btrfs_remount_begin(fs_info, old_ctx.mount_opt, fc->sb_flags); + btrfs_resize_thread_pool(fs_info, fs_info->thread_pool_size, + old_ctx.thread_pool_size); + + if ((bool)btrfs_test_opt(fs_info, FREE_SPACE_TREE) != + (bool)btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE) && + (!sb_rdonly(sb) || (fc->sb_flags & SB_RDONLY))) { + btrfs_warn(fs_info, + "remount supports changing free space tree only from RO to RW"); + /* Make sure free space cache options match the state on disk. */ + if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE)) { + btrfs_set_opt(fs_info->mount_opt, FREE_SPACE_TREE); + btrfs_clear_opt(fs_info->mount_opt, SPACE_CACHE); + } + if (btrfs_free_space_cache_v1_active(fs_info)) { + btrfs_clear_opt(fs_info->mount_opt, FREE_SPACE_TREE); + btrfs_set_opt(fs_info->mount_opt, SPACE_CACHE); + } } -out: + + ret = 0; + if (!sb_rdonly(sb) && (fc->sb_flags & SB_RDONLY)) + ret = btrfs_remount_ro(fs_info); + else if (sb_rdonly(sb) && !(fc->sb_flags & SB_RDONLY)) + ret = btrfs_remount_rw(fs_info); + if (ret) + goto restore; + /* - * We need to set SB_I_VERSION here otherwise it'll get cleared by VFS, - * since the absence of the flag means it can be toggled off by remount. + * If we set the mask during the parameter parsing VFS would reject the + * remount. Here we can set the mask and the value will be updated + * appropriately. 
*/ - *flags |= SB_I_VERSION; + if ((fc->sb_flags & SB_POSIXACL) != (sb->s_flags & SB_POSIXACL)) + fc->sb_flags_mask |= SB_POSIXACL; + btrfs_emit_options(fs_info, &old_ctx); wake_up_process(fs_info->transaction_kthread); - btrfs_remount_cleanup(fs_info, old_opts); + btrfs_remount_cleanup(fs_info, old_ctx.mount_opt); btrfs_clear_oneshot_options(fs_info); clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); return 0; - restore: - /* We've hit an error - don't reset SB_RDONLY */ - if (sb_rdonly(sb)) - old_flags |= SB_RDONLY; - if (!(old_flags & SB_RDONLY)) - clear_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state); - sb->s_flags = old_flags; - fs_info->mount_opt = old_opts; - fs_info->compress_type = old_compress_type; - fs_info->max_inline = old_max_inline; - btrfs_resize_thread_pool(fs_info, - old_thread_pool_size, fs_info->thread_pool_size); - fs_info->metadata_ratio = old_metadata_ratio; - btrfs_remount_cleanup(fs_info, old_opts); + btrfs_ctx_to_info(fs_info, &old_ctx); + btrfs_remount_cleanup(fs_info, old_ctx.mount_opt); clear_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state); - return ret; } @@ -2091,6 +1776,309 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +static int btrfs_fc_test_super(struct super_block *sb, struct fs_context *fc) +{ + struct btrfs_fs_info *p = fc->s_fs_info; + struct btrfs_fs_info *fs_info = btrfs_sb(sb); + + return fs_info->fs_devices == p->fs_devices; +} + +static int btrfs_get_tree_super(struct fs_context *fc) +{ + struct btrfs_fs_info *fs_info = fc->s_fs_info; + struct btrfs_fs_context *ctx = fc->fs_private; + struct btrfs_fs_devices *fs_devices = NULL; + struct block_device *bdev; + struct btrfs_device *device; + struct super_block *sb; + blk_mode_t mode = btrfs_open_mode(fc); + int ret; + + btrfs_ctx_to_info(fs_info, ctx); + mutex_lock(&uuid_mutex); + + /* + * With 'true' passed to btrfs_scan_one_device() (mount time) we expect + * either a valid device or an error. + */ + device = btrfs_scan_one_device(fc->source, mode, true); + ASSERT(device != NULL); + if (IS_ERR(device)) { + mutex_unlock(&uuid_mutex); + return PTR_ERR(device); + } + + fs_devices = device->fs_devices; + fs_info->fs_devices = fs_devices; + + ret = btrfs_open_devices(fs_devices, mode, &btrfs_fs_type); + mutex_unlock(&uuid_mutex); + if (ret) + return ret; + + if (!(fc->sb_flags & SB_RDONLY) && fs_devices->rw_devices == 0) { + ret = -EACCES; + goto error; + } + + bdev = fs_devices->latest_dev->bdev; + + /* + * From now on the error handling is not straightforward. + * + * If successful, this will transfer the fs_info into the super block, + * and fc->s_fs_info will be NULL. However if there's an existing + * super, we'll still have fc->s_fs_info populated. If we error + * completely out it'll be cleaned up when we drop the fs_context, + * otherwise it's tied to the lifetime of the super_block. 
+ */ + sb = sget_fc(fc, btrfs_fc_test_super, set_anon_super_fc); + if (IS_ERR(sb)) { + ret = PTR_ERR(sb); + goto error; + } + + set_device_specific_options(fs_info); + + if (sb->s_root) { + btrfs_close_devices(fs_devices); + if ((fc->sb_flags ^ sb->s_flags) & SB_RDONLY) + ret = -EBUSY; + } else { + snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); + shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); + btrfs_sb(sb)->bdev_holder = &btrfs_fs_type; + ret = btrfs_fill_super(sb, fs_devices, NULL); + } + + if (ret) { + deactivate_locked_super(sb); + return ret; + } + + btrfs_clear_oneshot_options(fs_info); + + fc->root = dget(sb->s_root); + return 0; + +error: + btrfs_close_devices(fs_devices); + return ret; +} + +/* + * Ever since commit 0723a0473fb4 ("btrfs: allow mounting btrfs subvolumes + * with different ro/rw options") the following works: + * + * (i) mount /dev/sda3 -o subvol=foo,ro /mnt/foo + * (ii) mount /dev/sda3 -o subvol=bar,rw /mnt/bar + * + * which looks nice and innocent but is actually pretty intricate and deserves + * a long comment. + * + * On another filesystem a subvolume mount is close to something like: + * + * (iii) # create rw superblock + initial mount + * mount -t xfs /dev/sdb /opt/ + * + * # create ro bind mount + * mount --bind -o ro /opt/foo /mnt/foo + * + * # unmount initial mount + * umount /opt + * + * Of course, there's some special subvolume sauce and there's the fact that the + * sb->s_root dentry is really swapped after mount_subtree(). But conceptually + * it's very close and will help us understand the issue. + * + * The old mount API didn't cleanly distinguish between a mount being made ro + * and a superblock being made ro. The only way to change the ro state of + * either object was by passing MS_RDONLY. If a new mount was created via + * mount(2) such as: + * + * mount("/dev/sdb", "/mnt", "xfs", MS_RDONLY, NULL); + * + * the MS_RDONLY flag being specified had two effects: + * + * (1) MNT_READONLY was raised -> the resulting mount got + * @mnt->mnt_flags |= MNT_READONLY raised. + * + * (2) MS_RDONLY was passed to the filesystem's mount method and the filesystem + * made the superblock ro. Note how SB_RDONLY has the same value as + * MS_RDONLY and is raised whenever MS_RDONLY is passed through mount(2). + * + * Creating a subtree mount via (iii) ends up leaving a rw superblock with a + * subtree mounted ro. + * + * But consider the effect of the old mount API on btrfs subvolume mounting, + * which combines the distinct steps in (iii) into a single step. + * + * By issuing (i) both the mount and the superblock are turned ro. Now when (ii) + * is issued the superblock is ro and thus even if the mount created for (ii) is + * rw it wouldn't help. Hence, btrfs needed to transition the superblock from ro + * to rw for (ii) which it did using an internal remount call. + * + * IOW, subvolume mounting was inherently complicated due to the ambiguity of + * MS_RDONLY in mount(2). Note, this ambiguity has mount(8) always translate + * "ro" to MS_RDONLY. IOW, in both (i) and (ii) "ro" becomes MS_RDONLY when + * passed by mount(8) to mount(2). + * + * Enter the new mount API. The new mount API disambiguates making a mount ro + * and making a superblock ro. + * + * (3) To turn a mount ro the MOUNT_ATTR_RDONLY flag can be used with either + * fsmount() or mount_setattr(); this is a pure VFS level change for a + * specific mount or mount tree that is never seen by the filesystem itself.
+ * + * (4) To turn a superblock ro the "ro" flag must be used with + * fsconfig(FSCONFIG_SET_FLAG, "ro"). This option is seen by the filesystem + * in fc->sb_flags. + * + * This disambiguation has rather positive consequences. Mounting a subvolume + * ro will not also turn the superblock ro. Only the mount for the subvolume + * will become ro. + * + * So, if the superblock creation request comes from the new mount API the + * caller must have explicitly done: + * + * fsconfig(FSCONFIG_SET_FLAG, "ro") + * fsmount/mount_setattr(MOUNT_ATTR_RDONLY) + * + * IOW, at some point the caller must have explicitly turned the whole + * superblock ro and we shouldn't just undo it like we did for the old mount + * API. In any case, it lets us avoid the hack in the new mount API. + * + * Consequently, the remounting hack must only be used for requests originating + * from the old mount API and should be marked for full deprecation so it can be + * turned off in a couple of years. + * + * The new mount API has no reason to support this hack. + */ +static struct vfsmount *btrfs_reconfigure_for_mount(struct fs_context *fc) +{ + struct vfsmount *mnt; + int ret; + const bool ro2rw = !(fc->sb_flags & SB_RDONLY); + + /* + * We got an EBUSY because our SB_RDONLY flag didn't match the existing + * super block, so invert our setting here and retry the mount so we + * can get our vfsmount. + */ + if (ro2rw) + fc->sb_flags |= SB_RDONLY; + else + fc->sb_flags &= ~SB_RDONLY; + + mnt = fc_mount(fc); + if (IS_ERR(mnt)) + return mnt; + + if (!fc->oldapi || !ro2rw) + return mnt; + + /* We need to convert to rw, call reconfigure. */ + fc->sb_flags &= ~SB_RDONLY; + down_write(&mnt->mnt_sb->s_umount); + ret = btrfs_reconfigure(fc); + up_write(&mnt->mnt_sb->s_umount); + if (ret) { + mntput(mnt); + return ERR_PTR(ret); + } + return mnt; +} + +static int btrfs_get_tree_subvol(struct fs_context *fc) +{ + struct btrfs_fs_info *fs_info = NULL; + struct btrfs_fs_context *ctx = fc->fs_private; + struct fs_context *dup_fc; + struct dentry *dentry; + struct vfsmount *mnt; + + /* + * Setup a dummy root and fs_info for test/set super. This is because + * we don't actually fill this stuff out until open_ctree, but we need + * then open_ctree will properly initialize the file system specific + * settings later. btrfs_init_fs_info initializes the static elements + * of the fs_info (locks and such) to make cleanup easier if we find a + * superblock with our given fs_devices later on at sget() time. + */ + fs_info = kvzalloc(sizeof(struct btrfs_fs_info), GFP_KERNEL); + if (!fs_info) + return -ENOMEM; + + fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); + fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_KERNEL); + if (!fs_info->super_copy || !fs_info->super_for_commit) { + btrfs_free_fs_info(fs_info); + return -ENOMEM; + } + btrfs_init_fs_info(fs_info); + + dup_fc = vfs_dup_fs_context(fc); + if (IS_ERR(dup_fc)) { + btrfs_free_fs_info(fs_info); + return PTR_ERR(dup_fc); + } + + /* + * When we do the sget_fc this gets transferred to the sb, so we only + * need to set it on the dup_fc as this is what creates the super block. + */ + dup_fc->s_fs_info = fs_info; + + /* + * We'll do the security settings in our btrfs_get_tree_super() mount + * loop, they were duplicated into dup_fc, we can drop the originals + * here. 
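Not part of the patch, but to make points (3) and (4) from the comment above concrete, here is roughly how the two kinds of read-only look from userspace with the new API (raw syscalls, error handling omitted):

#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

/* Sketch: mount subvolume "foo" read-only per (3). Only this mount becomes
 * ro; the superblock stays rw, so a second, rw mount of subvolume "bar"
 * no longer needs the remount hack. */
static int mount_subvol_foo_ro(const char *dev, const char *target)
{
	int fsfd, mfd;

	fsfd = syscall(SYS_fsopen, "btrfs", FSOPEN_CLOEXEC);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "source", dev, 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "subvol", "foo", 0);
	/* Per (4), a superblock-wide ro would instead be:
	 * syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "ro", NULL, 0);
	 * which btrfs then sees in fc->sb_flags. */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
	/* Per (3), MOUNT_ATTR_RDONLY is a pure VFS attribute of this mount. */
	mfd = syscall(SYS_fsmount, fsfd, FSMOUNT_CLOEXEC, MOUNT_ATTR_RDONLY);
	syscall(SYS_move_mount, mfd, "", AT_FDCWD, target, MOVE_MOUNT_F_EMPTY_PATH);
	close(mfd);
	close(fsfd);
	return 0;
}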
+ */ + security_free_mnt_opts(&fc->security); + fc->security = NULL; + + mnt = fc_mount(dup_fc); + if (PTR_ERR_OR_ZERO(mnt) == -EBUSY) + mnt = btrfs_reconfigure_for_mount(dup_fc); + put_fs_context(dup_fc); + if (IS_ERR(mnt)) + return PTR_ERR(mnt); + + /* + * This free's ->subvol_name, because if it isn't set we have to + * allocate a buffer to hold the subvol_name, so we just drop our + * reference to it here. + */ + dentry = mount_subvol(ctx->subvol_name, ctx->subvol_objectid, mnt); + ctx->subvol_name = NULL; + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + fc->root = dentry; + return 0; +} + +static int btrfs_get_tree(struct fs_context *fc) +{ + /* + * Since we use mount_subtree to mount the default/specified subvol, we + * have to do mounts in two steps. + * + * First pass through we call btrfs_get_tree_subvol(), this is just a + * wrapper around fc_mount() to call back into here again, and this time + * we'll call btrfs_get_tree_super(). This will do the open_ctree() and + * everything to open the devices and file system. Then we return back + * with a fully constructed vfsmount in btrfs_get_tree_subvol(), and + * from there we can do our mount_subvol() call, which will lookup + * whichever subvol we're mounting and setup this fc with the + * appropriate dentry for the subvol. + */ + if (fc->s_fs_info) + return btrfs_get_tree_super(fc); + return btrfs_get_tree_subvol(fc); +} + static void btrfs_kill_super(struct super_block *sb) { struct btrfs_fs_info *fs_info = btrfs_sb(sb); @@ -2098,22 +2086,85 @@ static void btrfs_kill_super(struct super_block *sb) btrfs_free_fs_info(fs_info); } -static struct file_system_type btrfs_fs_type = { - .owner = THIS_MODULE, - .name = "btrfs", - .mount = btrfs_mount, - .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, -}; +static void btrfs_free_fs_context(struct fs_context *fc) +{ + struct btrfs_fs_context *ctx = fc->fs_private; + struct btrfs_fs_info *fs_info = fc->s_fs_info; + + if (fs_info) + btrfs_free_fs_info(fs_info); + + if (ctx && refcount_dec_and_test(&ctx->refs)) { + kfree(ctx->subvol_name); + kfree(ctx); + } +} + +static int btrfs_dup_fs_context(struct fs_context *fc, struct fs_context *src_fc) +{ + struct btrfs_fs_context *ctx = src_fc->fs_private; -static struct file_system_type btrfs_root_fs_type = { - .owner = THIS_MODULE, - .name = "btrfs", - .mount = btrfs_mount_root, - .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, + /* + * Give a ref to our ctx to this dup, as we want to keep it around for + * our original fc so we can have the subvolume name or objectid. + * + * We unset ->source in the original fc because the dup needs it for + * mounting, and then once we free the dup it'll free ->source, so we + * need to make sure we're only pointing to it in one fc. 
+ */ + refcount_inc(&ctx->refs); + fc->fs_private = ctx; + fc->source = src_fc->source; + src_fc->source = NULL; + return 0; +} + +static const struct fs_context_operations btrfs_fs_context_ops = { + .parse_param = btrfs_parse_param, + .reconfigure = btrfs_reconfigure, + .get_tree = btrfs_get_tree, + .dup = btrfs_dup_fs_context, + .free = btrfs_free_fs_context, }; +static int btrfs_init_fs_context(struct fs_context *fc) +{ + struct btrfs_fs_context *ctx; + + ctx = kzalloc(sizeof(struct btrfs_fs_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + refcount_set(&ctx->refs, 1); + fc->fs_private = ctx; + fc->ops = &btrfs_fs_context_ops; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + btrfs_info_to_ctx(btrfs_sb(fc->root->d_sb), ctx); + } else { + ctx->thread_pool_size = + min_t(unsigned long, num_online_cpus() + 2, 8); + ctx->max_inline = BTRFS_DEFAULT_MAX_INLINE; + ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; + } + +#ifdef CONFIG_BTRFS_FS_POSIX_ACL + fc->sb_flags |= SB_POSIXACL; +#endif + fc->sb_flags |= SB_I_VERSION; + + return 0; +} + +static struct file_system_type btrfs_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .init_fs_context = btrfs_init_fs_context, + .parameters = btrfs_fs_parameters, + .kill_sb = btrfs_kill_super, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA | FS_ALLOW_IDMAP, + }; + MODULE_ALIAS_FS("btrfs"); static int btrfs_control_open(struct inode *inode, struct file *file) @@ -2325,7 +2376,6 @@ static const struct super_operations btrfs_super_ops = { .destroy_inode = btrfs_destroy_inode, .free_inode = btrfs_free_inode, .statfs = btrfs_statfs, - .remount_fs = btrfs_remount, .freeze_fs = btrfs_freeze, .unfreeze_fs = btrfs_unfreeze, }; diff --git a/fs/btrfs/super.h b/fs/btrfs/super.h index 8dbb909b364f..f18253ca280d 100644 --- a/fs/btrfs/super.h +++ b/fs/btrfs/super.h @@ -3,11 +3,12 @@ #ifndef BTRFS_SUPER_H #define BTRFS_SUPER_H -int btrfs_parse_options(struct btrfs_fs_info *info, char *options, - unsigned long new_flags); +bool btrfs_check_options(struct btrfs_fs_info *info, unsigned long *mount_opt, + unsigned long flags); int btrfs_sync_fs(struct super_block *sb, int wait); char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info, u64 subvol_objectid); +void btrfs_set_free_space_cache_settings(struct btrfs_fs_info *fs_info); static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) { diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index e6b51fb3ddc1..84c05246ffd8 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1783,6 +1783,10 @@ static ssize_t btrfs_devinfo_scrub_speed_max_store(struct kobject *kobj, unsigned long long limit; limit = memparse(buf, &endptr); + /* There could be trailing '\n', also catch any typos after the value. 
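The sysfs fix above relies on how memparse() behaves: it consumes the digits plus an optional single K/M/G/T/P/E suffix and leaves endptr at the first unconsumed character, so anything other than trailing whitespace means a malformed value. A standalone sketch of the same check (hypothetical helper name):

#include <linux/kernel.h>	/* memparse() */
#include <linux/string.h>	/* skip_spaces() */
#include <linux/errno.h>

/* "128m\n" parses to 134217728; "128mb" is rejected because memparse()
 * stops after the one-letter suffix and leaves "b" unconsumed. */
static long long parse_byte_limit(const char *buf)
{
	char *endptr;
	unsigned long long limit = memparse(buf, &endptr);

	endptr = skip_spaces(endptr);	/* allow the echo-appended newline */
	if (*endptr != 0)
		return -EINVAL;
	return limit;
}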
*/ + endptr = skip_spaces(endptr); + if (*endptr != 0) + return -EINVAL; WRITE_ONCE(device->scrub_speed_max, limit); return len; } diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index ca09cf9afce8..709c6cc9706a 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -28,6 +28,7 @@ const char *test_error[] = { [TEST_ALLOC_INODE] = "cannot allocate inode", [TEST_ALLOC_BLOCK_GROUP] = "cannot allocate block group", [TEST_ALLOC_EXTENT_MAP] = "cannot allocate extent map", + [TEST_ALLOC_CHUNK_MAP] = "cannot allocate chunk map", }; static const struct super_operations btrfs_test_super_ops = { @@ -102,7 +103,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) if (!dev) return ERR_PTR(-ENOMEM); - extent_io_tree_init(NULL, &dev->alloc_state, 0); + extent_io_tree_init(fs_info, &dev->alloc_state, 0); INIT_LIST_HEAD(&dev->dev_list); list_add(&dev->dev_list, &fs_info->fs_devices->devices); @@ -185,7 +186,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) } spin_unlock(&fs_info->buffer_lock); - btrfs_mapping_tree_free(&fs_info->mapping_tree); + btrfs_mapping_tree_free(fs_info); list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, dev_list) { btrfs_free_dummy_device(dev); diff --git a/fs/btrfs/tests/btrfs-tests.h b/fs/btrfs/tests/btrfs-tests.h index 7a2d7ffbe30e..dc2f2ab15fa5 100644 --- a/fs/btrfs/tests/btrfs-tests.h +++ b/fs/btrfs/tests/btrfs-tests.h @@ -23,6 +23,7 @@ enum { TEST_ALLOC_INODE, TEST_ALLOC_BLOCK_GROUP, TEST_ALLOC_EXTENT_MAP, + TEST_ALLOC_CHUNK_MAP, }; extern const char *test_error[]; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 1cc86af97dc6..25b3349595e0 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -652,7 +652,7 @@ static void dump_eb_and_memory_contents(struct extent_buffer *eb, void *memory, const char *test_name) { for (int i = 0; i < eb->len; i++) { - struct page *page = eb->pages[i >> PAGE_SHIFT]; + struct page *page = folio_page(eb->folios[i >> PAGE_SHIFT], 0); void *addr = page_address(page) + offset_in_page(i); if (memcmp(addr, memory + i, 1) != 0) { @@ -668,7 +668,7 @@ static int verify_eb_and_memory(struct extent_buffer *eb, void *memory, const char *test_name) { for (int i = 0; i < (eb->len >> PAGE_SHIFT); i++) { - void *eb_addr = page_address(eb->pages[i]); + void *eb_addr = folio_address(eb->folios[i]); if (memcmp(memory + (i << PAGE_SHIFT), eb_addr, PAGE_SIZE) != 0) { dump_eb_and_memory_contents(eb, memory, test_name); diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 29bdd08b241f..253cce7ffecf 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -25,7 +25,7 @@ static void free_extent_map_tree(struct extent_map_tree *em_tree) #ifdef CONFIG_BTRFS_DEBUG if (refcount_read(&em->refs) != 1) { test_err( -"em leak: em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx) refs %d", +"em leak: em (start %llu len %llu block_start %llu block_len %llu) refs %d", em->start, em->len, em->block_start, em->block_len, refcount_read(&em->refs)); @@ -73,7 +73,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, em->block_start = 0; em->block_len = SZ_16K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 16K)"); @@ -94,7 +94,7 @@ static int 
test_case_1(struct btrfs_fs_info *fs_info, em->block_start = SZ_32K; /* avoid merging */ em->block_len = SZ_4K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [16K, 20K)"); @@ -121,9 +121,14 @@ static int test_case_1(struct btrfs_fs_info *fs_info, test_err("case1 [%llu %llu]: ret %d", start, start + len, ret); goto out; } - if (em && - (em->start != 0 || extent_map_end(em) != SZ_16K || - em->block_start != 0 || em->block_len != SZ_16K)) { + if (!em) { + test_err("case1 [%llu %llu]: no extent map returned", + start, start + len); + ret = -ENOENT; + goto out; + } + if (em->start != 0 || extent_map_end(em) != SZ_16K || + em->block_start != 0 || em->block_len != SZ_16K) { test_err( "case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", start, start + len, ret, em->start, em->len, @@ -161,7 +166,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->block_start = EXTENT_MAP_INLINE; em->block_len = (u64)-1; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 1K)"); @@ -182,7 +187,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, em->block_start = SZ_4K; em->block_len = SZ_4K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); @@ -209,9 +214,13 @@ static int test_case_2(struct btrfs_fs_info *fs_info, test_err("case2 [0 1K]: ret %d", ret); goto out; } - if (em && - (em->start != 0 || extent_map_end(em) != SZ_1K || - em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1)) { + if (!em) { + test_err("case2 [0 1K]: no extent map returned"); + ret = -ENOENT; + goto out; + } + if (em->start != 0 || extent_map_end(em) != SZ_1K || + em->block_start != EXTENT_MAP_INLINE || em->block_len != (u64)-1) { test_err( "case2 [0 1K]: ret %d return a wrong em (start %llu len %llu block_start %llu block_len %llu", ret, em->start, em->len, em->block_start, @@ -244,7 +253,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, em->block_start = SZ_4K; em->block_len = SZ_4K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [4K, 8K)"); @@ -268,19 +277,24 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); if (ret) { - test_err("case3 [0x%llx 0x%llx): ret %d", + test_err("case3 [%llu %llu): ret %d", start, start + len, ret); goto out; } + if (!em) { + test_err("case3 [%llu %llu): no extent map returned", + start, start + len); + ret = -ENOENT; + goto out; + } /* * Since bytes within em are contiguous, em->block_start is identical to * em->start. 
*/ - if (em && - (start < em->start || start + len > extent_map_end(em) || - em->start != em->block_start || em->len != em->block_len)) { + if (start < em->start || start + len > extent_map_end(em) || + em->start != em->block_start || em->len != em->block_len) { test_err( -"case3 [0x%llx 0x%llx): ret %d em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", +"case3 [%llu %llu): ret %d em (start %llu len %llu block_start %llu block_len %llu)", start, start + len, ret, em->start, em->len, em->block_start, em->block_len); ret = -EINVAL; @@ -343,7 +357,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->block_start = 0; em->block_len = SZ_8K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [0, 8K)"); @@ -364,7 +378,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, em->block_start = SZ_16K; /* avoid merging */ em->block_len = 24 * SZ_1K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("cannot add extent range [8K, 32K)"); @@ -387,14 +401,20 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); if (ret) { - test_err("case4 [0x%llx 0x%llx): ret %d", - start, len, ret); + test_err("case4 [%llu %llu): ret %d", + start, start + len, ret); goto out; } - if (em && (start < em->start || start + len > extent_map_end(em))) { + if (!em) { + test_err("case4 [%llu %llu): no extent map returned", + start, start + len); + ret = -ENOENT; + goto out; + } + if (start < em->start || start + len > extent_map_end(em)) { test_err( -"case4 [0x%llx 0x%llx): ret %d, added wrong em (start 0x%llx len 0x%llx block_start 0x%llx block_len 0x%llx)", - start, len, ret, em->start, em->len, em->block_start, +"case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu block_start %llu block_len %llu)", + start, start + len, ret, em->start, em->len, em->block_start, em->block_len); ret = -EINVAL; } @@ -443,7 +463,8 @@ static int test_case_4(struct btrfs_fs_info *fs_info, return ret; } -static int add_compressed_extent(struct extent_map_tree *em_tree, +static int add_compressed_extent(struct btrfs_fs_info *fs_info, + struct extent_map_tree *em_tree, u64 start, u64 len, u64 block_start) { struct extent_map *em; @@ -459,9 +480,9 @@ static int add_compressed_extent(struct extent_map_tree *em_tree, em->len = len; em->block_start = block_start; em->block_len = SZ_4K; - set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); free_extent_map(em); if (ret < 0) { @@ -567,7 +588,7 @@ static int validate_range(struct extent_map_tree *em_tree, int index) * They'll have the EXTENT_FLAG_COMPRESSED flag set to keep the em tree from * merging the em's. 
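The EXTENT_FLAG_* values are now bits in the u32 extent_map::flags word rather than bit numbers for set_bit()/test_bit(), and the compression algorithm is carried in those bits instead of a separate em->compress_type field. A minimal sketch of how the extent_map_compression() and extent_map_is_compressed() helpers used throughout this series can be derived from that layout; the concrete bit values here are assumptions for illustration, since only the call sites appear in this diff:

	/* Assumed bit positions; the patch only shows the users. */
	#define EXTENT_FLAG_COMPRESS_ZLIB	(1U << 3)
	#define EXTENT_FLAG_COMPRESS_LZO	(1U << 4)
	#define EXTENT_FLAG_COMPRESS_ZSTD	(1U << 5)

	static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em)
	{
		if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB)
			return BTRFS_COMPRESS_ZLIB;
		if (em->flags & EXTENT_FLAG_COMPRESS_LZO)
			return BTRFS_COMPRESS_LZO;
		if (em->flags & EXTENT_FLAG_COMPRESS_ZSTD)
			return BTRFS_COMPRESS_ZSTD;
		return BTRFS_COMPRESS_NONE;
	}

	static inline bool extent_map_is_compressed(const struct extent_map *em)
	{
		return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB |
				     EXTENT_FLAG_COMPRESS_LZO |
				     EXTENT_FLAG_COMPRESS_ZSTD)) != 0;
	}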
*/ -static int test_case_5(void) +static int test_case_5(struct btrfs_fs_info *fs_info) { struct extent_map_tree *em_tree; struct inode *inode; @@ -585,35 +606,35 @@ static int test_case_5(void) em_tree = &BTRFS_I(inode)->extent_tree; /* [0, 12k) */ - ret = add_compressed_extent(em_tree, 0, SZ_4K * 3, 0); + ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K * 3, 0); if (ret) { test_err("cannot add extent range [0, 12K)"); goto out; } /* [12k, 24k) */ - ret = add_compressed_extent(em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K); + ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 3, SZ_4K * 3, SZ_4K); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; } /* [24k, 36k) */ - ret = add_compressed_extent(em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K); + ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 6, SZ_4K * 3, SZ_8K); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; } /* [36k, 40k) */ - ret = add_compressed_extent(em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3); + ret = add_compressed_extent(fs_info, em_tree, SZ_32K + SZ_4K, SZ_4K, SZ_4K * 3); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; } /* [40k, 64k) */ - ret = add_compressed_extent(em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K); + ret = add_compressed_extent(fs_info, em_tree, SZ_4K * 10, SZ_4K * 6, SZ_16K); if (ret) { test_err("cannot add extent range [12k, 24k)"); goto out; @@ -665,11 +686,11 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct extent_map_tree *em struct extent_map *em = NULL; int ret; - ret = add_compressed_extent(em_tree, 0, SZ_4K, 0); + ret = add_compressed_extent(fs_info, em_tree, 0, SZ_4K, 0); if (ret) goto out; - ret = add_compressed_extent(em_tree, SZ_4K, SZ_4K, 0); + ret = add_compressed_extent(fs_info, em_tree, SZ_4K, SZ_4K, 0); if (ret) goto out; @@ -713,7 +734,7 @@ out: * true would mess up the start/end calculations and subsequent splits would be * incorrect. 
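Every insertion in these tests now goes through btrfs_add_extent_mapping(), which is why test_case_5() and test_case_7() grow an fs_info parameter. The function takes the em by address because it may drop the passed map and hand back an existing or merged one. A condensed sketch of the resulting pattern; add_and_verify_em() is a hypothetical name, not part of the patch:

	static int add_and_verify_em(struct btrfs_fs_info *fs_info,
				     struct extent_map_tree *em_tree,
				     struct extent_map *em)
	{
		int ret;

		write_lock(&em_tree->lock);
		ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len);
		write_unlock(&em_tree->lock);
		if (ret < 0) {
			/* On failure we still own our reference. */
			free_extent_map(em);
			return ret;
		}
		/* New in this series: a NULL em is reported instead of silently passing. */
		if (!em)
			return -ENOENT;
		/* Drop our reference; the tree holds its own. */
		free_extent_map(em);
		return 0;
	}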
*/ -static int test_case_7(void) +static int test_case_7(struct btrfs_fs_info *fs_info) { struct extent_map_tree *em_tree; struct extent_map *em; @@ -742,9 +763,9 @@ static int test_case_7(void) em->len = SZ_16K; em->block_start = 0; em->block_len = SZ_4K; - set_bit(EXTENT_FLAG_PINNED, &em->flags); + em->flags |= EXTENT_FLAG_PINNED; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("couldn't add extent map"); @@ -765,7 +786,7 @@ static int test_case_7(void) em->block_start = SZ_32K; em->block_len = SZ_16K; write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, em->start, em->len); write_unlock(&em_tree->lock); if (ret < 0) { test_err("couldn't add extent map"); @@ -859,33 +880,21 @@ struct rmap_test_vector { static int test_rmap_block(struct btrfs_fs_info *fs_info, struct rmap_test_vector *test) { - struct extent_map *em; - struct map_lookup *map = NULL; + struct btrfs_chunk_map *map; u64 *logical = NULL; int i, out_ndaddrs, out_stripe_len; int ret; - em = alloc_extent_map(); - if (!em) { - test_std_err(TEST_ALLOC_EXTENT_MAP); - return -ENOMEM; - } - - map = kmalloc(map_lookup_size(test->num_stripes), GFP_KERNEL); + map = btrfs_alloc_chunk_map(test->num_stripes, GFP_KERNEL); if (!map) { - kfree(em); - test_std_err(TEST_ALLOC_EXTENT_MAP); + test_std_err(TEST_ALLOC_CHUNK_MAP); return -ENOMEM; } - set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); /* Start at 4GiB logical address */ - em->start = SZ_4G; - em->len = test->data_stripe_size * test->num_data_stripes; - em->block_len = em->len; - em->orig_block_len = test->data_stripe_size; - em->map_lookup = map; - + map->start = SZ_4G; + map->chunk_len = test->data_stripe_size * test->num_data_stripes; + map->stripe_size = test->data_stripe_size; map->num_stripes = test->num_stripes; map->type = test->raid_type; @@ -901,15 +910,13 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, map->stripes[i].physical = test->data_stripe_phys_start[i]; } - write_lock(&fs_info->mapping_tree.lock); - ret = add_extent_mapping(&fs_info->mapping_tree, em, 0); - write_unlock(&fs_info->mapping_tree.lock); + ret = btrfs_add_chunk_map(fs_info, map); if (ret) { - test_err("error adding block group mapping to mapping tree"); + test_err("error adding chunk map to mapping tree"); goto out_free; } - ret = btrfs_rmap_block(fs_info, em->start, btrfs_sb_offset(1), + ret = btrfs_rmap_block(fs_info, map->start, btrfs_sb_offset(1), &logical, &out_ndaddrs, &out_stripe_len); if (ret || (out_ndaddrs == 0 && test->expected_mapped_addr)) { test_err("didn't rmap anything but expected %d", @@ -938,14 +945,8 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, ret = 0; out: - write_lock(&fs_info->mapping_tree.lock); - remove_extent_mapping(&fs_info->mapping_tree, em); - write_unlock(&fs_info->mapping_tree.lock); - /* For us */ - free_extent_map(em); + btrfs_remove_chunk_map(fs_info, map); out_free: - /* For the tree */ - free_extent_map(em); kfree(logical); return ret; } @@ -1022,13 +1023,13 @@ int btrfs_test_extent_map(void) ret = test_case_4(fs_info, em_tree); if (ret) goto out; - ret = test_case_5(); + ret = test_case_5(fs_info); if (ret) goto out; ret = test_case_6(fs_info, em_tree); if (ret) goto out; - ret = test_case_7(); + ret = test_case_7(fs_info); if (ret) goto out; diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c 
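The rmap test above no longer fakes a chunk with an extent_map plus map_lookup pair; chunks get a dedicated, refcounted type. Reconstructed from the fields this diff touches (member order and exact layout are assumptions, the real definition lands in fs/btrfs/volumes.h):

	struct btrfs_chunk_map {
		struct rb_node rb_node;
		/* For mount-time dev extent verification. */
		int verified_stripes;
		refcount_t refs;
		u64 start;		/* Logical start, previously em->start. */
		u64 chunk_len;		/* Previously em->len. */
		u64 stripe_size;	/* Previously em->orig_block_len. */
		u64 type;
		int io_align;
		int io_width;
		int num_stripes;
		int sub_stripes;
		struct btrfs_io_stripe stripes[];
	};

	/* Allocation size used by btrfs_alloc_chunk_map() for @n stripes. */
	#define btrfs_chunk_map_size(n) (sizeof(struct btrfs_chunk_map) + \
					 (sizeof(struct btrfs_io_stripe) * (n)))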
index 492d69d2fa73..9957de9f7806 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -211,9 +211,9 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize) sectorsize, BTRFS_FILE_EXTENT_REG, 0, slot); } -static unsigned long prealloc_only = 0; -static unsigned long compressed_only = 0; -static unsigned long vacancy_only = 0; +static u32 prealloc_only = 0; +static u32 compressed_only = 0; +static u32 vacancy_only = 0; static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) { @@ -305,7 +305,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } /* @@ -332,7 +332,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } offset = em->start + em->len; @@ -355,7 +355,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != em->start) { @@ -383,7 +383,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != em->start) { @@ -412,7 +412,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } offset = em->start + em->len; @@ -434,7 +434,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != orig_start) { @@ -468,7 +468,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != prealloc_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", prealloc_only, em->flags); goto out; } @@ -497,7 +497,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != prealloc_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", prealloc_only, em->flags); goto out; } @@ -527,7 +527,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != orig_start) { @@ -560,7 +560,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != prealloc_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", prealloc_only, em->flags); goto out; } @@ -595,7 +595,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } 
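/* With the u32 flags rework, compressed_only is simply the EXTENT_FLAG_COMPRESS_ZLIB mask (set up in btrfs_test_inodes() at the bottom of this file), so the plain equality test keeps working. */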
if (em->flags != compressed_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", compressed_only, em->flags); goto out; } @@ -604,9 +604,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, em->orig_start); goto out; } - if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, em->compress_type); + BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); goto out; } offset = em->start + em->len; @@ -629,7 +629,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != compressed_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", compressed_only, em->flags); goto out; } @@ -638,9 +638,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, em->orig_start); goto out; } - if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, em->compress_type); + BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); goto out; } disk_bytenr = em->block_start; @@ -664,7 +664,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != em->start) { @@ -692,7 +692,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != compressed_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", compressed_only, em->flags); goto out; } @@ -701,9 +701,9 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, orig_start); goto out; } - if (em->compress_type != BTRFS_COMPRESS_ZLIB) { + if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, em->compress_type); + BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); goto out; } offset = em->start + em->len; @@ -726,7 +726,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != em->start) { @@ -758,7 +758,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != vacancy_only) { - test_err("unexpected flags set, want %lu have %lu", + test_err("unexpected flags set, want %u have %u", vacancy_only, em->flags); goto out; } @@ -786,7 +786,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, want 0 have %lu", em->flags); + test_err("unexpected flags set, want 0 have %u", em->flags); goto out; } if (em->orig_start != em->start) { @@ -866,7 +866,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != vacancy_only) { - test_err("wrong flags, wanted %lu, have %lu", vacancy_only, + test_err("wrong flags, wanted %u, have %u", vacancy_only, em->flags); goto out; } @@ -888,7 +888,7 @@ static int 
test_hole_first(u32 sectorsize, u32 nodesize) goto out; } if (em->flags != 0) { - test_err("unexpected flags set, wanted 0 got %lu", + test_err("unexpected flags set, wanted 0 got %u", em->flags); goto out; } @@ -1095,8 +1095,8 @@ int btrfs_test_inodes(u32 sectorsize, u32 nodesize) test_msg("running inode tests"); - set_bit(EXTENT_FLAG_COMPRESSED, &compressed_only); - set_bit(EXTENT_FLAG_PREALLOC, &prealloc_only); + compressed_only |= EXTENT_FLAG_COMPRESS_ZLIB; + prealloc_only |= EXTENT_FLAG_PREALLOC; ret = test_btrfs_get_extent(sectorsize, nodesize); if (ret) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 6e63816dddcb..5b3333ceef04 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -37,8 +37,6 @@ static struct kmem_cache *btrfs_trans_handle_cachep; -#define BTRFS_ROOT_TRANS_TAG 0 - /* * Transaction states and transitions * @@ -1774,7 +1772,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_release_path(path); ret = btrfs_create_qgroup(trans, objectid); - if (ret) { + if (ret && ret != -EEXIST) { btrfs_abort_transaction(trans, ret); goto fail; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 18c4f6e83b78..2bf8bbdfd0b3 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -12,6 +12,9 @@ #include "ctree.h" #include "misc.h" +/* Radix-tree tag for roots that are part of the transaction. */ +#define BTRFS_ROOT_TRANS_TAG 0 + enum btrfs_trans_state { TRANS_STATE_RUNNING, TRANS_STATE_COMMIT_PREP, diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index a416cbea75d1..50fdc69fdddf 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -31,6 +31,7 @@ #include "inode-item.h" #include "dir-item.h" #include "raid-stripe-tree.h" +#include "extent-tree.h" /* * Error message should follow the following format: * @@ -1276,6 +1277,8 @@ static int check_extent_item(struct extent_buffer *leaf, unsigned long ptr; /* Current pointer inside inline refs */ unsigned long end; /* Extent item end */ const u32 item_size = btrfs_item_size(leaf, slot); + u8 last_type = 0; + u64 last_seq = U64_MAX; u64 flags; u64 generation; u64 total_refs; /* Total refs in btrfs_extent_item */ @@ -1322,6 +1325,18 @@ static int check_extent_item(struct extent_buffer *leaf, * 2.2) Ref type specific data * Either using btrfs_extent_inline_ref::offset, or specific * data structure. + * + * All above inline items should follow the order: + * + * - All btrfs_extent_inline_ref::type should be in an ascending + * order + * + * - Within the same type, the items should follow a descending + * order by their sequence number.
The sequence number is + * determined by: + * * btrfs_extent_inline_ref::offset for all types other than + * EXTENT_DATA_REF + * * hash_extent_data_ref() for EXTENT_DATA_REF */ if (unlikely(item_size < sizeof(*ei))) { extent_err(leaf, slot, @@ -1403,6 +1418,7 @@ static int check_extent_item(struct extent_buffer *leaf, struct btrfs_extent_inline_ref *iref; struct btrfs_extent_data_ref *dref; struct btrfs_shared_data_ref *sref; + u64 seq; u64 dref_offset; u64 inline_offset; u8 inline_type; @@ -1416,6 +1432,7 @@ static int check_extent_item(struct extent_buffer *leaf, iref = (struct btrfs_extent_inline_ref *)ptr; inline_type = btrfs_extent_inline_ref_type(leaf, iref); inline_offset = btrfs_extent_inline_ref_offset(leaf, iref); + seq = inline_offset; if (unlikely(ptr + btrfs_extent_inline_ref_size(inline_type) > end)) { extent_err(leaf, slot, "inline ref item overflows extent item, ptr %lu iref size %u end %lu", @@ -1446,6 +1463,10 @@ static int check_extent_item(struct extent_buffer *leaf, case BTRFS_EXTENT_DATA_REF_KEY: dref = (struct btrfs_extent_data_ref *)(&iref->offset); dref_offset = btrfs_extent_data_ref_offset(leaf, dref); + seq = hash_extent_data_ref( + btrfs_extent_data_ref_root(leaf, dref), + btrfs_extent_data_ref_objectid(leaf, dref), + btrfs_extent_data_ref_offset(leaf, dref)); if (unlikely(!IS_ALIGNED(dref_offset, fs_info->sectorsize))) { extent_err(leaf, slot, @@ -1475,6 +1496,24 @@ static int check_extent_item(struct extent_buffer *leaf, inline_type); return -EUCLEAN; } + if (inline_type < last_type) { + extent_err(leaf, slot, + "inline ref out-of-order: has type %u, prev type %u", + inline_type, last_type); + return -EUCLEAN; + } + /* Type changed, allow the sequence starts from U64_MAX again. */ + if (inline_type > last_type) + last_seq = U64_MAX; + if (seq > last_seq) { + extent_err(leaf, slot, +"inline ref out-of-order: has type %u offset %llu seq 0x%llx, prev type %u seq 0x%llx", + inline_type, inline_offset, seq, + last_type, last_seq); + return -EUCLEAN; + } + last_type = inline_type; + last_seq = seq; ptr += btrfs_extent_inline_ref_size(inline_type); } /* No padding is allowed */ diff --git a/fs/btrfs/tree-checker.h b/fs/btrfs/tree-checker.h index 3c2a02a72f64..14b9fbe82da4 100644 --- a/fs/btrfs/tree-checker.h +++ b/fs/btrfs/tree-checker.h @@ -22,7 +22,7 @@ struct btrfs_tree_parent_check { /* * Expected transid, can be 0 to skip the check, but such skip - * should only be utlized for backref walk related code. + * should only be utilized for backref walk related code. */ u64 transid; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 7d6729d9fd2f..331fc7429952 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2575,7 +2575,6 @@ static int clean_log_buffer(struct btrfs_trans_handle *trans, ret = btrfs_pin_reserved_extent(trans, eb); if (ret) return ret; - btrfs_redirty_list_add(trans->transaction, eb); } else { unaccount_log_buffer(eb->fs_info, eb->start); } @@ -4520,7 +4519,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, int ret = 0; if (inode->flags & BTRFS_INODE_NODATASUM || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + (em->flags & EXTENT_FLAG_PREALLOC) || em->block_start == EXTENT_MAP_HOLE) return 0; @@ -4583,7 +4582,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, return 0; /* If we're compressed we have to save the entire range of csums. 
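* (Worked example: for a zlib-compressed extent with block_len 4K but
* orig_block_len 8K, the csums of the whole 8K on-disk range must be
* logged, hence csum_offset 0 and csum_len max(block_len, orig_block_len)
* below.)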
*/ - if (em->compress_type) { + if (extent_map_is_compressed(em)) { csum_offset = 0; csum_len = max(em->block_len, em->orig_block_len); } else { @@ -4623,18 +4622,20 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_file_extent_item fi = { 0 }; struct extent_buffer *leaf; struct btrfs_key key; + enum btrfs_compression_type compress_type; u64 extent_offset = em->start - em->orig_start; u64 block_len; int ret; btrfs_set_stack_file_extent_generation(&fi, trans->transid); - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + if (em->flags & EXTENT_FLAG_PREALLOC) btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_PREALLOC); else btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); block_len = max(em->block_len, em->orig_block_len); - if (em->compress_type != BTRFS_COMPRESS_NONE) { + compress_type = extent_map_compression(em); + if (compress_type != BTRFS_COMPRESS_NONE) { btrfs_set_stack_file_extent_disk_bytenr(&fi, em->block_start); btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { @@ -4646,7 +4647,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_offset(&fi, extent_offset); btrfs_set_stack_file_extent_num_bytes(&fi, em->len); btrfs_set_stack_file_extent_ram_bytes(&fi, em->ram_bytes); - btrfs_set_stack_file_extent_compression(&fi, em->compress_type); + btrfs_set_stack_file_extent_compression(&fi, compress_type); ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) @@ -4859,13 +4860,13 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, continue; /* We log prealloc extents beyond eof later. */ - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && + if ((em->flags & EXTENT_FLAG_PREALLOC) && em->start >= i_size_read(&inode->vfs_inode)) continue; /* Need a ref to keep it from getting evicted from cache */ refcount_inc(&em->refs); - set_bit(EXTENT_FLAG_LOGGING, &em->flags); + em->flags |= EXTENT_FLAG_LOGGING; list_add_tail(&em->list, &extents); num++; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c87e18827a0a..4c32497311d2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -41,6 +41,17 @@ BTRFS_BLOCK_GROUP_RAID10 | \ BTRFS_BLOCK_GROUP_RAID56_MASK) +struct btrfs_io_geometry { + u32 stripe_index; + u32 stripe_nr; + int mirror_num; + int num_stripes; + u64 stripe_offset; + u64 raid56_full_stripe_start; + int max_errors; + enum btrfs_map_op op; +}; + const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = { .sub_stripes = 2, @@ -748,13 +759,13 @@ static noinline struct btrfs_device *device_list_add(const char *path, if (!fs_devices) { fs_devices = alloc_fs_devices(disk_super->fsid); + if (IS_ERR(fs_devices)) + return ERR_CAST(fs_devices); + if (has_metadata_uuid) memcpy(fs_devices->metadata_uuid, disk_super->metadata_uuid, BTRFS_FSID_SIZE); - if (IS_ERR(fs_devices)) - return ERR_CAST(fs_devices); - if (same_fsid_diff_dev) { generate_random_uuid(fs_devices->fsid); fs_devices->temp_fsid = true; @@ -1742,19 +1753,18 @@ out: static u64 find_next_chunk(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *em_tree; - struct extent_map *em; struct rb_node *n; u64 ret = 0; - em_tree = &fs_info->mapping_tree; - read_lock(&em_tree->lock); - n = rb_last(&em_tree->map.rb_root); + read_lock(&fs_info->mapping_tree_lock); + n = rb_last(&fs_info->mapping_tree.rb_root); if (n) { - em = rb_entry(n, struct extent_map, rb_node); - ret = em->start + em->len; + struct btrfs_chunk_map *map; + 
+ map = rb_entry(n, struct btrfs_chunk_map, rb_node); + ret = map->start + map->chunk_len; } - read_unlock(&em_tree->lock); + read_unlock(&fs_info->mapping_tree_lock); return ret; } @@ -2986,6 +2996,81 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) return ret; } +struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct rb_node *node = fs_info->mapping_tree.rb_root.rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev; + struct btrfs_chunk_map *map; + struct btrfs_chunk_map *prev_map = NULL; + + while (node) { + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + prev = node; + prev_map = map; + + if (logical < map->start) { + node = node->rb_left; + } else if (logical >= map->start + map->chunk_len) { + node = node->rb_right; + } else { + refcount_inc(&map->refs); + return map; + } + } + + if (!prev) + return NULL; + + orig_prev = prev; + while (prev && logical >= prev_map->start + prev_map->chunk_len) { + prev = rb_next(prev); + prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); + } + + if (!prev) { + prev = orig_prev; + prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); + while (prev && logical < prev_map->start) { + prev = rb_prev(prev); + prev_map = rb_entry(prev, struct btrfs_chunk_map, rb_node); + } + } + + if (prev) { + u64 end = logical + length; + + /* + * Caller can pass a U64_MAX length when it wants to get any + * chunk starting at an offset of 'logical' or higher, so deal + * with underflow by resetting the end offset to U64_MAX. + */ + if (end < logical) + end = U64_MAX; + + if (end > prev_map->start && + logical < prev_map->start + prev_map->chunk_len) { + refcount_inc(&prev_map->refs); + return prev_map; + } + } + + return NULL; +} + +struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) +{ + struct btrfs_chunk_map *map; + + read_lock(&fs_info->mapping_tree_lock); + map = btrfs_find_chunk_map_nolock(fs_info, logical, length); + read_unlock(&fs_info->mapping_tree_lock); + + return map; +} + /* * Find the mapping containing the given logical extent. * @@ -2994,37 +3079,37 @@ static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) * * Return: Chunk mapping or ERR_PTR. 
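* (Note: btrfs_find_chunk_map() takes and releases mapping_tree_lock
* internally and returns the map with an extra reference held, so no lock
* is held once it returns; callers drop the reference with
* btrfs_free_chunk_map().)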
*/ -struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, - u64 logical, u64 length) +struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { - struct extent_map_tree *em_tree; - struct extent_map *em; + struct btrfs_chunk_map *map; - em_tree = &fs_info->mapping_tree; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, length); - read_unlock(&em_tree->lock); + map = btrfs_find_chunk_map(fs_info, logical, length); - if (!em) { - btrfs_crit(fs_info, "unable to find logical %llu length %llu", + if (unlikely(!map)) { + btrfs_crit(fs_info, + "unable to find chunk map for logical %llu length %llu", logical, length); return ERR_PTR(-EINVAL); } - if (em->start > logical || em->start + em->len < logical) { + if (unlikely(map->start > logical || map->start + map->chunk_len <= logical)) { btrfs_crit(fs_info, - "found a bad mapping, wanted %llu-%llu, found %llu-%llu", - logical, length, em->start, em->start + em->len); - free_extent_map(em); + "found a bad chunk map, wanted %llu-%llu, found %llu-%llu", + logical, logical + length, map->start, + map->start + map->chunk_len); + btrfs_free_chunk_map(map); return ERR_PTR(-EINVAL); } - /* callers are responsible for dropping em's ref. */ - return em; + /* Callers are responsible for dropping the reference. */ + return map; } static int remove_chunk_item(struct btrfs_trans_handle *trans, - struct map_lookup *map, u64 chunk_offset) + struct btrfs_chunk_map *map, u64 chunk_offset) { int i; @@ -3049,23 +3134,21 @@ static int remove_chunk_item(struct btrfs_trans_handle *trans, int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 dev_extent_len = 0; int i, ret = 0; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); - if (IS_ERR(em)) { + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(map)) { /* * This is a logic error, but we don't want to just rely on the * user having built with ASSERT enabled, so if ASSERT doesn't * do anything we still error out. */ ASSERT(0); - return PTR_ERR(em); + return PTR_ERR(map); } - map = em->map_lookup; /* * First delete the device extent items from the devices btree. 
@@ -3168,7 +3251,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) goto out; } - trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len); + trace_btrfs_chunk_free(fs_info, map, chunk_offset, map->chunk_len); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(fs_info, chunk_offset); @@ -3187,7 +3270,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) */ btrfs_trans_release_chunk_metadata(trans); - ret = btrfs_remove_block_group(trans, chunk_offset, em); + ret = btrfs_remove_block_group(trans, map); if (ret) { btrfs_abort_transaction(trans, ret); goto out; @@ -3199,7 +3282,7 @@ out: trans->removing_chunk = false; } /* once for us */ - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -5346,24 +5429,131 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, } } +static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int bits) +{ + for (int i = 0; i < map->num_stripes; i++) { + struct btrfs_io_stripe *stripe = &map->stripes[i]; + struct btrfs_device *device = stripe->dev; + + set_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT, NULL); + } +} + +static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned int bits) +{ + for (int i = 0; i < map->num_stripes; i++) { + struct btrfs_io_stripe *stripe = &map->stripes[i]; + struct btrfs_device *device = stripe->dev; + + __clear_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT, + NULL, NULL); + } +} + +void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) +{ + write_lock(&fs_info->mapping_tree_lock); + rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); + RB_CLEAR_NODE(&map->rb_node); + chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); + write_unlock(&fs_info->mapping_tree_lock); + + /* Once for the tree reference. 
*/ + btrfs_free_chunk_map(map); +} + +EXPORT_FOR_TESTS +int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + bool leftmost = true; + + write_lock(&fs_info->mapping_tree_lock); + p = &fs_info->mapping_tree.rb_root.rb_node; + while (*p) { + struct btrfs_chunk_map *entry; + + parent = *p; + entry = rb_entry(parent, struct btrfs_chunk_map, rb_node); + + if (map->start < entry->start) { + p = &(*p)->rb_left; + } else if (map->start > entry->start) { + p = &(*p)->rb_right; + leftmost = false; + } else { + write_unlock(&fs_info->mapping_tree_lock); + return -EEXIST; + } + } + rb_link_node(&map->rb_node, parent, p); + rb_insert_color_cached(&map->rb_node, &fs_info->mapping_tree, leftmost); + chunk_map_device_set_bits(map, CHUNK_ALLOCATED); + chunk_map_device_clear_bits(map, CHUNK_TRIMMED); + write_unlock(&fs_info->mapping_tree_lock); + + return 0; +} + +EXPORT_FOR_TESTS +struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp) +{ + struct btrfs_chunk_map *map; + + map = kmalloc(btrfs_chunk_map_size(num_stripes), gfp); + if (!map) + return NULL; + + refcount_set(&map->refs, 1); + RB_CLEAR_NODE(&map->rb_node); + + return map; +} + +struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp) +{ + const int size = btrfs_chunk_map_size(map->num_stripes); + struct btrfs_chunk_map *clone; + + clone = kmemdup(map, size, gfp); + if (!clone) + return NULL; + + refcount_set(&clone->refs, 1); + RB_CLEAR_NODE(&clone->rb_node); + + return clone; +} + static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, struct alloc_chunk_ctl *ctl, struct btrfs_device_info *devices_info) { struct btrfs_fs_info *info = trans->fs_info; - struct map_lookup *map = NULL; - struct extent_map_tree *em_tree; + struct btrfs_chunk_map *map; struct btrfs_block_group *block_group; - struct extent_map *em; u64 start = ctl->start; u64 type = ctl->type; int ret; int i; int j; - map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS); + map = btrfs_alloc_chunk_map(ctl->num_stripes, GFP_NOFS); if (!map) return ERR_PTR(-ENOMEM); + + map->start = start; + map->chunk_len = ctl->chunk_size; + map->stripe_size = ctl->stripe_size; + map->type = type; + map->io_align = BTRFS_STRIPE_LEN; + map->io_width = BTRFS_STRIPE_LEN; + map->sub_stripes = ctl->sub_stripes; map->num_stripes = ctl->num_stripes; for (i = 0; i < ctl->ndevs; ++i) { @@ -5374,41 +5564,22 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, j * ctl->stripe_size; } } - map->io_align = BTRFS_STRIPE_LEN; - map->io_width = BTRFS_STRIPE_LEN; - map->type = type; - map->sub_stripes = ctl->sub_stripes; trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size); - em = alloc_extent_map(); - if (!em) { - kfree(map); - return ERR_PTR(-ENOMEM); - } - set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->map_lookup = map; - em->start = start; - em->len = ctl->chunk_size; - em->block_start = 0; - em->block_len = em->len; - em->orig_block_len = ctl->stripe_size; - - em_tree = &info->mapping_tree; - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); + ret = btrfs_add_chunk_map(info, map); if (ret) { - write_unlock(&em_tree->lock); - free_extent_map(em); + btrfs_free_chunk_map(map); return ERR_PTR(ret); } - write_unlock(&em_tree->lock); block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); - if (IS_ERR(block_group)) - goto error_del_extent; + if (IS_ERR(block_group)) { + 
btrfs_remove_chunk_map(info, map); + return block_group; + } - for (i = 0; i < map->num_stripes; i++) { + for (int i = 0; i < map->num_stripes; i++) { struct btrfs_device *dev = map->stripes[i].dev; btrfs_device_set_bytes_used(dev, @@ -5421,23 +5592,10 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, atomic64_sub(ctl->stripe_size * map->num_stripes, &info->free_chunk_space); - free_extent_map(em); check_raid56_incompat_flag(info, type); check_raid1c34_incompat_flag(info, type); return block_group; - -error_del_extent: - write_lock(&em_tree->lock); - remove_extent_mapping(em_tree, em); - write_unlock(&em_tree->lock); - - /* One for our allocation */ - free_extent_map(em); - /* One for the tree reference */ - free_extent_map(em); - - return block_group; } struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, @@ -5513,8 +5671,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_key key; struct btrfs_chunk *chunk; struct btrfs_stripe *stripe; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; size_t item_size; int i; int ret; @@ -5543,14 +5700,13 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, */ lockdep_assert_held(&fs_info->chunk_mutex); - em = btrfs_get_chunk_map(fs_info, bg->start, bg->length); - if (IS_ERR(em)) { - ret = PTR_ERR(em); + map = btrfs_get_chunk_map(fs_info, bg->start, bg->length); + if (IS_ERR(map)) { + ret = PTR_ERR(map); btrfs_abort_transaction(trans, ret); return ret; } - map = em->map_lookup; item_size = btrfs_chunk_item_size(map->num_stripes); chunk = kzalloc(item_size, GFP_NOFS); @@ -5607,7 +5763,7 @@ int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, out: kfree(chunk); - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -5652,7 +5808,7 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) return 0; } -static inline int btrfs_chunk_max_errors(struct map_lookup *map) +static inline int btrfs_chunk_max_errors(struct btrfs_chunk_map *map) { const int index = btrfs_bg_flags_to_raid_index(map->type); @@ -5661,17 +5817,15 @@ static inline int btrfs_chunk_max_errors(struct map_lookup *map) bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; int miss_ndevs = 0; int i; bool ret = true; - em = btrfs_get_chunk_map(fs_info, chunk_offset, 1); - if (IS_ERR(em)) + map = btrfs_get_chunk_map(fs_info, chunk_offset, 1); + if (IS_ERR(map)) return false; - map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { if (test_bit(BTRFS_DEV_STATE_MISSING, &map->stripes[i].dev->dev_state)) { @@ -5692,38 +5846,37 @@ bool btrfs_chunk_writeable(struct btrfs_fs_info *fs_info, u64 chunk_offset) if (miss_ndevs > btrfs_chunk_max_errors(map)) ret = false; end: - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } -void btrfs_mapping_tree_free(struct extent_map_tree *tree) +void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info) { - struct extent_map *em; + write_lock(&fs_info->mapping_tree_lock); + while (!RB_EMPTY_ROOT(&fs_info->mapping_tree.rb_root)) { + struct btrfs_chunk_map *map; + struct rb_node *node; - while (1) { - write_lock(&tree->lock); - em = lookup_extent_mapping(tree, 0, (u64)-1); - if (em) - remove_extent_mapping(tree, em); - write_unlock(&tree->lock); - if (!em) - break; - /* once for us */ - free_extent_map(em); - /* once for the tree */ - 
free_extent_map(em); + node = rb_first_cached(&fs_info->mapping_tree); + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + rb_erase_cached(&map->rb_node, &fs_info->mapping_tree); + RB_CLEAR_NODE(&map->rb_node); + chunk_map_device_clear_bits(map, CHUNK_ALLOCATED); + /* Once for the tree ref. */ + btrfs_free_chunk_map(map); + cond_resched_rwlock_write(&fs_info->mapping_tree_lock); } + write_unlock(&fs_info->mapping_tree_lock); } int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; enum btrfs_raid_types index; int ret = 1; - em = btrfs_get_chunk_map(fs_info, logical, len); - if (IS_ERR(em)) + map = btrfs_get_chunk_map(fs_info, logical, len); + if (IS_ERR(map)) /* * We could return errors for these cases, but that could get * ugly and we'd probably do the same thing which is just not do @@ -5732,7 +5885,6 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) */ return 1; - map = em->map_lookup; index = btrfs_bg_flags_to_raid_index(map->type); /* Non-RAID56, use their ncopies from btrfs_raid_array. */ @@ -5749,53 +5901,49 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) * stripe under reconstruction. */ ret = map->num_stripes; - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; unsigned long len = fs_info->sectorsize; if (!btrfs_fs_incompat(fs_info, RAID56)) return len; - em = btrfs_get_chunk_map(fs_info, logical, len); + map = btrfs_get_chunk_map(fs_info, logical, len); - if (!WARN_ON(IS_ERR(em))) { - map = em->map_lookup; + if (!WARN_ON(IS_ERR(map))) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = btrfs_stripe_nr_to_offset(nr_data_stripes(map)); - free_extent_map(em); + btrfs_free_chunk_map(map); } return len; } int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; int ret = 0; if (!btrfs_fs_incompat(fs_info, RAID56)) return 0; - em = btrfs_get_chunk_map(fs_info, logical, len); + map = btrfs_get_chunk_map(fs_info, logical, len); - if(!WARN_ON(IS_ERR(em))) { - map = em->map_lookup; + if (!WARN_ON(IS_ERR(map))) { if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; - free_extent_map(em); + btrfs_free_chunk_map(map); } return ret; } static int find_live_mirror(struct btrfs_fs_info *fs_info, - struct map_lookup *map, int first, + struct btrfs_chunk_map *map, int first, int dev_replace_is_ongoing) { int i; @@ -5902,8 +6050,7 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, u64 logical, u64 *length_ret, u32 *num_stripes) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; struct btrfs_discard_stripe *stripes; u64 length = *length_ret; u64 offset; @@ -5921,11 +6068,9 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, int ret; int i; - em = btrfs_get_chunk_map(fs_info, logical, length); - if (IS_ERR(em)) - return ERR_CAST(em); - - map = em->map_lookup; + map = btrfs_get_chunk_map(fs_info, logical, length); + if (IS_ERR(map)) + return ERR_CAST(map); /* we don't discard raid56 yet */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { @@ -5933,8 +6078,8 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, goto out_free_map; } - 
offset = logical - em->start; - length = min_t(u64, em->start + em->len - logical, length); + offset = logical - map->start; + length = min_t(u64, map->start + map->chunk_len - logical, length); *length_ret = length; /* @@ -6031,10 +6176,10 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, } } - free_extent_map(em); + btrfs_free_chunk_map(map); return stripes; out_free_map: - free_extent_map(em); + btrfs_free_chunk_map(map); return ERR_PTR(ret); } @@ -6132,17 +6277,16 @@ static void handle_ops_on_dev_replace(enum btrfs_map_op op, bioc->replace_nr_stripes = nr_extra_stripes; } -static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, - u64 offset, u32 *stripe_nr, u64 *stripe_offset, - u64 *full_stripe_start) +static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset, + struct btrfs_io_geometry *io_geom) { /* * Stripe_nr is the stripe where this block falls. stripe_offset is * the offset of this block in its stripe. */ - *stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; - *stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; - ASSERT(*stripe_offset < U32_MAX); + io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; + io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; + ASSERT(io_geom->stripe_offset < U32_MAX); if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { unsigned long full_stripe_len = @@ -6157,18 +6301,17 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, * to go rounddown(), not round_down(), as nr_data_stripes is * not ensured to be power of 2. */ - *full_stripe_start = - btrfs_stripe_nr_to_offset( - rounddown(*stripe_nr, nr_data_stripes(map))); + io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset( + rounddown(io_geom->stripe_nr, nr_data_stripes(map))); - ASSERT(*full_stripe_start + full_stripe_len > offset); - ASSERT(*full_stripe_start <= offset); + ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset); + ASSERT(io_geom->raid56_full_stripe_start <= offset); /* * For writes to RAID56, allow to write a full stripe set, but * no straddling of stripe sets. */ - if (op == BTRFS_MAP_WRITE) - return full_stripe_len - (offset - *full_stripe_start); + if (io_geom->op == BTRFS_MAP_WRITE) + return full_stripe_len - (offset - io_geom->raid56_full_stripe_start); } /* @@ -6176,26 +6319,180 @@ static u64 btrfs_max_io_len(struct map_lookup *map, enum btrfs_map_op op, * a single disk). 
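* (Worked example, assuming the 64K BTRFS_STRIPE_LEN: offset = 3 * 64K + 4K
* gives stripe_nr 3 and stripe_offset 4K, so a striped read is capped at
* 64K - 4K = 60K to keep the I/O on a single disk.)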
*/ if (map->type & BTRFS_BLOCK_GROUP_STRIPE_MASK) - return BTRFS_STRIPE_LEN - *stripe_offset; + return BTRFS_STRIPE_LEN - io_geom->stripe_offset; return U64_MAX; } -static int set_io_stripe(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, - u64 logical, u64 *length, struct btrfs_io_stripe *dst, - struct map_lookup *map, u32 stripe_index, - u64 stripe_offset, u64 stripe_nr) +static int set_io_stripe(struct btrfs_fs_info *fs_info, u64 logical, + u64 *length, struct btrfs_io_stripe *dst, + struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) { - dst->dev = map->stripes[stripe_index].dev; + dst->dev = map->stripes[io_geom->stripe_index].dev; - if (op == BTRFS_MAP_READ && btrfs_need_stripe_tree_update(fs_info, map->type)) + if (io_geom->op == BTRFS_MAP_READ && + btrfs_need_stripe_tree_update(fs_info, map->type)) return btrfs_get_raid_extent_offset(fs_info, logical, length, - map->type, stripe_index, dst); + map->type, + io_geom->stripe_index, dst); - dst->physical = map->stripes[stripe_index].physical + - stripe_offset + btrfs_stripe_nr_to_offset(stripe_nr); + dst->physical = map->stripes[io_geom->stripe_index].physical + + io_geom->stripe_offset + + btrfs_stripe_nr_to_offset(io_geom->stripe_nr); return 0; } +static bool is_single_device_io(struct btrfs_fs_info *fs_info, + const struct btrfs_io_stripe *smap, + const struct btrfs_chunk_map *map, + int num_alloc_stripes, + enum btrfs_map_op op, int mirror_num) +{ + if (!smap) + return false; + + if (num_alloc_stripes != 1) + return false; + + if (btrfs_need_stripe_tree_update(fs_info, map->type) && op != BTRFS_MAP_READ) + return false; + + if ((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1) + return false; + + return true; +} + +static void map_blocks_raid0(const struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) +{ + io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes; + io_geom->stripe_nr /= map->num_stripes; + if (io_geom->op == BTRFS_MAP_READ) + io_geom->mirror_num = 1; +} + +static void map_blocks_raid1(struct btrfs_fs_info *fs_info, + struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom, + bool dev_replace_is_ongoing) +{ + if (io_geom->op != BTRFS_MAP_READ) { + io_geom->num_stripes = map->num_stripes; + return; + } + + if (io_geom->mirror_num) { + io_geom->stripe_index = io_geom->mirror_num - 1; + return; + } + + io_geom->stripe_index = find_live_mirror(fs_info, map, 0, + dev_replace_is_ongoing); + io_geom->mirror_num = io_geom->stripe_index + 1; +} + +static void map_blocks_dup(const struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) +{ + if (io_geom->op != BTRFS_MAP_READ) { + io_geom->num_stripes = map->num_stripes; + return; + } + + if (io_geom->mirror_num) { + io_geom->stripe_index = io_geom->mirror_num - 1; + return; + } + + io_geom->mirror_num = 1; +} + +static void map_blocks_raid10(struct btrfs_fs_info *fs_info, + struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom, + bool dev_replace_is_ongoing) +{ + u32 factor = map->num_stripes / map->sub_stripes; + int old_stripe_index; + + io_geom->stripe_index = (io_geom->stripe_nr % factor) * map->sub_stripes; + io_geom->stripe_nr /= factor; + + if (io_geom->op != BTRFS_MAP_READ) { + io_geom->num_stripes = map->sub_stripes; + return; + } + + if (io_geom->mirror_num) { + io_geom->stripe_index += io_geom->mirror_num - 1; + return; + } + + old_stripe_index = io_geom->stripe_index; + io_geom->stripe_index = find_live_mirror(fs_info, map, + io_geom->stripe_index, + dev_replace_is_ongoing); + 
io_geom->mirror_num = io_geom->stripe_index - old_stripe_index + 1; +} + +static void map_blocks_raid56_write(struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom, + u64 logical, u64 *length) +{ + int data_stripes = nr_data_stripes(map); + + /* + * Needs full stripe mapping. + * + * Push stripe_nr back to the start of the full stripe For those cases + * needing a full stripe, @stripe_nr is the full stripe number. + * + * Originally we go raid56_full_stripe_start / full_stripe_len, but + * that can be expensive. Here we just divide @stripe_nr with + * @data_stripes. + */ + io_geom->stripe_nr /= data_stripes; + + /* RAID[56] write or recovery. Return all stripes */ + io_geom->num_stripes = map->num_stripes; + io_geom->max_errors = btrfs_chunk_max_errors(map); + + /* Return the length to the full stripe end. */ + *length = min(logical + *length, + io_geom->raid56_full_stripe_start + map->start + + btrfs_stripe_nr_to_offset(data_stripes)) - + logical; + io_geom->stripe_index = 0; + io_geom->stripe_offset = 0; +} + +static void map_blocks_raid56_read(struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) +{ + int data_stripes = nr_data_stripes(map); + + ASSERT(io_geom->mirror_num <= 1); + /* Just grab the data stripe directly. */ + io_geom->stripe_index = io_geom->stripe_nr % data_stripes; + io_geom->stripe_nr /= data_stripes; + + /* We distribute the parity blocks across stripes. */ + io_geom->stripe_index = + (io_geom->stripe_nr + io_geom->stripe_index) % map->num_stripes; + + if (io_geom->op == BTRFS_MAP_READ && io_geom->mirror_num < 1) + io_geom->mirror_num = 1; +} + +static void map_blocks_single(const struct btrfs_chunk_map *map, + struct btrfs_io_geometry *io_geom) +{ + io_geom->stripe_index = io_geom->stripe_nr % map->num_stripes; + io_geom->stripe_nr /= map->num_stripes; + io_geom->mirror_num = io_geom->stripe_index + 1; +} + /* * Map one logical range to one or more physical ranges. * @@ -6236,43 +6533,37 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, struct btrfs_io_context **bioc_ret, struct btrfs_io_stripe *smap, int *mirror_num_ret) { - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; + struct btrfs_io_geometry io_geom = { 0 }; u64 map_offset; - u64 stripe_offset; - u32 stripe_nr; - u32 stripe_index; - int data_stripes; int i; int ret = 0; - int mirror_num = (mirror_num_ret ? *mirror_num_ret : 0); - int num_stripes; int num_copies; - int max_errors = 0; struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; int dev_replace_is_ongoing = 0; u16 num_alloc_stripes; - u64 raid56_full_stripe_start = (u64)-1; u64 max_len; ASSERT(bioc_ret); + io_geom.mirror_num = (mirror_num_ret ? 
*mirror_num_ret : 0); + io_geom.num_stripes = 1; + io_geom.stripe_index = 0; + io_geom.op = op; + num_copies = btrfs_num_copies(fs_info, logical, fs_info->sectorsize); - if (mirror_num > num_copies) + if (io_geom.mirror_num > num_copies) return -EINVAL; - em = btrfs_get_chunk_map(fs_info, logical, *length); - if (IS_ERR(em)) - return PTR_ERR(em); + map = btrfs_get_chunk_map(fs_info, logical, *length); + if (IS_ERR(map)) + return PTR_ERR(map); - map = em->map_lookup; - data_stripes = nr_data_stripes(map); - - map_offset = logical - em->start; - max_len = btrfs_max_io_len(map, op, map_offset, &stripe_nr, - &stripe_offset, &raid56_full_stripe_start); - *length = min_t(u64, em->len - map_offset, max_len); + map_offset = logical - map->start; + io_geom.raid56_full_stripe_start = (u64)-1; + max_len = btrfs_max_io_len(map, map_offset, &io_geom); + *length = min_t(u64, map->chunk_len - map_offset, max_len); down_read(&dev_replace->rwsem); dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace); @@ -6283,107 +6574,46 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, if (!dev_replace_is_ongoing) up_read(&dev_replace->rwsem); - num_stripes = 1; - stripe_index = 0; - if (map->type & BTRFS_BLOCK_GROUP_RAID0) { - stripe_index = stripe_nr % map->num_stripes; - stripe_nr /= map->num_stripes; - if (op == BTRFS_MAP_READ) - mirror_num = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) { - if (op != BTRFS_MAP_READ) { - num_stripes = map->num_stripes; - } else if (mirror_num) { - stripe_index = mirror_num - 1; - } else { - stripe_index = find_live_mirror(fs_info, map, 0, - dev_replace_is_ongoing); - mirror_num = stripe_index + 1; - } - - } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { - if (op != BTRFS_MAP_READ) { - num_stripes = map->num_stripes; - } else if (mirror_num) { - stripe_index = mirror_num - 1; - } else { - mirror_num = 1; - } - - } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { - u32 factor = map->num_stripes / map->sub_stripes; - - stripe_index = (stripe_nr % factor) * map->sub_stripes; - stripe_nr /= factor; - - if (op != BTRFS_MAP_READ) - num_stripes = map->sub_stripes; - else if (mirror_num) - stripe_index += mirror_num - 1; - else { - int old_stripe_index = stripe_index; - stripe_index = find_live_mirror(fs_info, map, - stripe_index, - dev_replace_is_ongoing); - mirror_num = stripe_index - old_stripe_index + 1; - } - - } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { - if (op != BTRFS_MAP_READ || mirror_num > 1) { - /* - * Needs full stripe mapping. - * - * Push stripe_nr back to the start of the full stripe - * For those cases needing a full stripe, @stripe_nr - * is the full stripe number. - * - * Originally we go raid56_full_stripe_start / full_stripe_len, - * but that can be expensive. Here we just divide - * @stripe_nr with @data_stripes. - */ - stripe_nr /= data_stripes; - - /* RAID[56] write or recovery. Return all stripes */ - num_stripes = map->num_stripes; - max_errors = btrfs_chunk_max_errors(map); - - /* Return the length to the full stripe end */ - *length = min(logical + *length, - raid56_full_stripe_start + em->start + - btrfs_stripe_nr_to_offset(data_stripes)) - - logical; - stripe_index = 0; - stripe_offset = 0; - } else { - ASSERT(mirror_num <= 1); - /* Just grab the data stripe directly. 
*/ - stripe_index = stripe_nr % data_stripes; - stripe_nr /= data_stripes; - - /* We distribute the parity blocks across stripes */ - stripe_index = (stripe_nr + stripe_index) % map->num_stripes; - if (op == BTRFS_MAP_READ && mirror_num < 1) - mirror_num = 1; - } - } else { + switch (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) { + case BTRFS_BLOCK_GROUP_RAID0: + map_blocks_raid0(map, &io_geom); + break; + case BTRFS_BLOCK_GROUP_RAID1: + case BTRFS_BLOCK_GROUP_RAID1C3: + case BTRFS_BLOCK_GROUP_RAID1C4: + map_blocks_raid1(fs_info, map, &io_geom, dev_replace_is_ongoing); + break; + case BTRFS_BLOCK_GROUP_DUP: + map_blocks_dup(map, &io_geom); + break; + case BTRFS_BLOCK_GROUP_RAID10: + map_blocks_raid10(fs_info, map, &io_geom, dev_replace_is_ongoing); + break; + case BTRFS_BLOCK_GROUP_RAID5: + case BTRFS_BLOCK_GROUP_RAID6: + if (op != BTRFS_MAP_READ || io_geom.mirror_num > 1) + map_blocks_raid56_write(map, &io_geom, logical, length); + else + map_blocks_raid56_read(map, &io_geom); + break; + default: /* * After this, stripe_nr is the number of stripes on this * device we have to walk to find the data, and stripe_index is * the number of our device in the stripe array */ - stripe_index = stripe_nr % map->num_stripes; - stripe_nr /= map->num_stripes; - mirror_num = stripe_index + 1; + map_blocks_single(map, &io_geom); + break; } - if (stripe_index >= map->num_stripes) { + if (io_geom.stripe_index >= map->num_stripes) { btrfs_crit(fs_info, "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u", - stripe_index, map->num_stripes); + io_geom.stripe_index, map->num_stripes); ret = -EINVAL; goto out; } - num_alloc_stripes = num_stripes; + num_alloc_stripes = io_geom.num_stripes; if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && op != BTRFS_MAP_READ) /* @@ -6400,14 +6630,11 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * physical block information on the stack instead of allocating an * I/O context structure. */ - if (smap && num_alloc_stripes == 1 && - !(btrfs_need_stripe_tree_update(fs_info, map->type) && - op != BTRFS_MAP_READ) && - !((map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) && mirror_num > 1)) { - ret = set_io_stripe(fs_info, op, logical, length, smap, map, - stripe_index, stripe_offset, stripe_nr); + if (is_single_device_io(fs_info, smap, map, num_alloc_stripes, op, + io_geom.mirror_num)) { + ret = set_io_stripe(fs_info, logical, length, smap, map, &io_geom); if (mirror_num_ret) - *mirror_num_ret = mirror_num; + *mirror_num_ret = io_geom.mirror_num; *bioc_ret = NULL; goto out; } @@ -6427,7 +6654,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * It's still mostly the same as other profiles, just with extra rotation. */ if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && - (op != BTRFS_MAP_READ || mirror_num > 1)) { + (op != BTRFS_MAP_READ || io_geom.mirror_num > 1)) { /* * For RAID56 @stripe_nr is already the number of full stripes * before us, which is also the rotation value (needs to modulo @@ -6436,28 +6663,31 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, * In this case, we just add @stripe_nr with @i, then do the * modulo, to reduce one modulo call. 
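* (For instance, with num_stripes 3 and full stripe number 5: i = 0, 1, 2
* map to stripes (5 + 0) % 3 = 2, (5 + 1) % 3 = 0 and (5 + 2) % 3 = 1,
* producing the parity rotation with a single modulo per stripe.)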
*/ - bioc->full_stripe_logical = em->start + - btrfs_stripe_nr_to_offset(stripe_nr * data_stripes); - for (int i = 0; i < num_stripes; i++) { - ret = set_io_stripe(fs_info, op, logical, length, - &bioc->stripes[i], map, - (i + stripe_nr) % num_stripes, - stripe_offset, stripe_nr); - if (ret < 0) - break; + bioc->full_stripe_logical = map->start + + btrfs_stripe_nr_to_offset(io_geom.stripe_nr * + nr_data_stripes(map)); + for (int i = 0; i < io_geom.num_stripes; i++) { + struct btrfs_io_stripe *dst = &bioc->stripes[i]; + u32 stripe_index; + + stripe_index = (i + io_geom.stripe_nr) % io_geom.num_stripes; + dst->dev = map->stripes[stripe_index].dev; + dst->physical = + map->stripes[stripe_index].physical + + io_geom.stripe_offset + + btrfs_stripe_nr_to_offset(io_geom.stripe_nr); } } else { /* * For all other non-RAID56 profiles, just copy the target * stripe into the bioc. */ - for (i = 0; i < num_stripes; i++) { - ret = set_io_stripe(fs_info, op, logical, length, - &bioc->stripes[i], map, stripe_index, - stripe_offset, stripe_nr); + for (i = 0; i < io_geom.num_stripes; i++) { + ret = set_io_stripe(fs_info, logical, length, + &bioc->stripes[i], map, &io_geom); if (ret < 0) break; - stripe_index++; + io_geom.stripe_index++; } } @@ -6468,18 +6698,18 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, } if (op != BTRFS_MAP_READ) - max_errors = btrfs_chunk_max_errors(map); + io_geom.max_errors = btrfs_chunk_max_errors(map); if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL && op != BTRFS_MAP_READ) { handle_ops_on_dev_replace(op, bioc, dev_replace, logical, - &num_stripes, &max_errors); + &io_geom.num_stripes, &io_geom.max_errors); } *bioc_ret = bioc; - bioc->num_stripes = num_stripes; - bioc->max_errors = max_errors; - bioc->mirror_num = mirror_num; + bioc->num_stripes = io_geom.num_stripes; + bioc->max_errors = io_geom.max_errors; + bioc->mirror_num = io_geom.mirror_num; out: if (dev_replace_is_ongoing) { @@ -6487,7 +6717,7 @@ out: /* Unlock and let waiting writers proceed */ up_read(&dev_replace->rwsem); } - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } @@ -6659,12 +6889,11 @@ static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info, devid, uuid); } -u64 btrfs_calc_stripe_length(const struct extent_map *em) +u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map) { - const struct map_lookup *map = em->map_lookup; const int data_stripes = calc_data_stripes(map->type, map->num_stripes); - return div_u64(em->len, data_stripes); + return div_u64(map->chunk_len, data_stripes); } #if BITS_PER_LONG == 32 @@ -6733,9 +6962,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, { BTRFS_DEV_LOOKUP_ARGS(args); struct btrfs_fs_info *fs_info = leaf->fs_info; - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct map_lookup *map; - struct extent_map *em; + struct btrfs_chunk_map *map; u64 logical; u64 length; u64 devid; @@ -6769,35 +6996,22 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, return ret; } - read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, logical, 1); - read_unlock(&map_tree->lock); + map = btrfs_find_chunk_map(fs_info, logical, 1); /* already mapped? 
*/ - if (em && em->start <= logical && em->start + em->len > logical) { - free_extent_map(em); + if (map && map->start <= logical && map->start + map->chunk_len > logical) { + btrfs_free_chunk_map(map); return 0; - } else if (em) { - free_extent_map(em); + } else if (map) { + btrfs_free_chunk_map(map); } - em = alloc_extent_map(); - if (!em) - return -ENOMEM; - map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); - if (!map) { - free_extent_map(em); + map = btrfs_alloc_chunk_map(num_stripes, GFP_NOFS); + if (!map) return -ENOMEM; - } - - set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->map_lookup = map; - em->start = logical; - em->len = length; - em->orig_start = 0; - em->block_start = 0; - em->block_len = em->len; + map->start = logical; + map->chunk_len = length; map->num_stripes = num_stripes; map->io_width = btrfs_chunk_io_width(leaf, chunk); map->io_align = btrfs_chunk_io_align(leaf, chunk); @@ -6812,7 +7026,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, */ map->sub_stripes = btrfs_raid_array[index].sub_stripes; map->verified_stripes = 0; - em->orig_block_len = btrfs_calc_stripe_length(em); + map->stripe_size = btrfs_calc_stripe_length(map); for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = btrfs_stripe_offset_nr(leaf, chunk, i); @@ -6828,7 +7042,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, devid, uuid); if (IS_ERR(map->stripes[i].dev)) { ret = PTR_ERR(map->stripes[i].dev); - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } } @@ -6837,15 +7051,12 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, &(map->stripes[i].dev->dev_state)); } - write_lock(&map_tree->lock); - ret = add_extent_mapping(map_tree, em, 0); - write_unlock(&map_tree->lock); + ret = btrfs_add_chunk_map(fs_info, map); if (ret < 0) { btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", - em->start, em->len, ret); + map->start, map->chunk_len, ret); } - free_extent_map(em); return ret; } @@ -7155,26 +7366,21 @@ out_short_read: bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev) { - struct extent_map_tree *map_tree = &fs_info->mapping_tree; - struct extent_map *em; - u64 next_start = 0; + struct btrfs_chunk_map *map; + u64 next_start; bool ret = true; - read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, 0, (u64)-1); - read_unlock(&map_tree->lock); + map = btrfs_find_chunk_map(fs_info, 0, U64_MAX); /* No chunk at all? 
Return false anyway */ - if (!em) { + if (!map) { ret = false; goto out; } - while (em) { - struct map_lookup *map; + while (map) { int missing = 0; int max_tolerated; int i; - map = em->map_lookup; max_tolerated = btrfs_get_num_tolerated_disk_barrier_failures( map->type); @@ -7192,18 +7398,15 @@ bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, if (!failing_dev) btrfs_warn(fs_info, "chunk %llu missing %d devices, max tolerance is %d for writable mount", - em->start, missing, max_tolerated); - free_extent_map(em); + map->start, missing, max_tolerated); + btrfs_free_chunk_map(map); ret = false; goto out; } - next_start = extent_map_end(em); - free_extent_map(em); + next_start = map->start + map->chunk_len; + btrfs_free_chunk_map(map); - read_lock(&map_tree->lock); - em = lookup_extent_mapping(map_tree, next_start, - (u64)(-1) - next_start); - read_unlock(&map_tree->lock); + map = btrfs_find_chunk_map(fs_info, next_start, U64_MAX - next_start); } out: return ret; @@ -7696,20 +7899,15 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, u64 physical_offset, u64 physical_len) { struct btrfs_dev_lookup_args args = { .devid = devid }; - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; struct btrfs_device *dev; u64 stripe_len; bool found = false; int ret = 0; int i; - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, chunk_offset, 1); - read_unlock(&em_tree->lock); - - if (!em) { + map = btrfs_find_chunk_map(fs_info, chunk_offset, 1); + if (!map) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk", physical_offset, devid); @@ -7717,12 +7915,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, goto out; } - map = em->map_lookup; - stripe_len = btrfs_calc_stripe_length(em); + stripe_len = btrfs_calc_stripe_length(map); if (physical_len != stripe_len) { btrfs_err(fs_info, "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu", - physical_offset, devid, em->start, physical_len, + physical_offset, devid, map->start, physical_len, stripe_len); ret = -EUCLEAN; goto out; @@ -7745,7 +7942,7 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, if (map->verified_stripes >= map->num_stripes) { btrfs_err(fs_info, "too many dev extents for chunk %llu found", - em->start); + map->start); ret = -EUCLEAN; goto out; } @@ -7791,32 +7988,30 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, } out: - free_extent_map(em); + btrfs_free_chunk_map(map); return ret; } static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info) { - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; struct rb_node *node; int ret = 0; - read_lock(&em_tree->lock); - for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) { - em = rb_entry(node, struct extent_map, rb_node); - if (em->map_lookup->num_stripes != - em->map_lookup->verified_stripes) { + read_lock(&fs_info->mapping_tree_lock); + for (node = rb_first_cached(&fs_info->mapping_tree); node; node = rb_next(node)) { + struct btrfs_chunk_map *map; + + map = rb_entry(node, struct btrfs_chunk_map, rb_node); + if (map->num_stripes != map->verified_stripes) { btrfs_err(fs_info, "chunk %llu has missing dev extent, have %d expect %d", - em->start, em->map_lookup->verified_stripes, - em->map_lookup->num_stripes); + map->start, map->verified_stripes, 
map->num_stripes); ret = -EUCLEAN; goto out; } } out: - read_unlock(&em_tree->lock); + read_unlock(&fs_info->mapping_tree_lock); return ret; } diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 9cc374864a79..53f87f398da7 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -426,7 +426,8 @@ struct btrfs_discard_stripe { struct btrfs_io_context { refcount_t refs; struct btrfs_fs_info *fs_info; - u64 map_type; /* get from map_lookup->type */ + /* Taken from struct btrfs_chunk_map::type. */ + u64 map_type; struct bio *orig_bio; atomic_t error; u16 max_errors; @@ -529,18 +530,32 @@ struct btrfs_raid_attr { extern const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES]; -struct map_lookup { +struct btrfs_chunk_map { + struct rb_node rb_node; + /* For mount time dev extent verification. */ + int verified_stripes; + refcount_t refs; + u64 start; + u64 chunk_len; + u64 stripe_size; u64 type; int io_align; int io_width; int num_stripes; int sub_stripes; - int verified_stripes; /* For mount time dev extent verification */ struct btrfs_io_stripe stripes[]; }; -#define map_lookup_size(n) (sizeof(struct map_lookup) + \ - (sizeof(struct btrfs_io_stripe) * (n))) +#define btrfs_chunk_map_size(n) (sizeof(struct btrfs_chunk_map) + \ + (sizeof(struct btrfs_io_stripe) * (n))) + +static inline void btrfs_free_chunk_map(struct btrfs_chunk_map *map) +{ + if (map && refcount_dec_and_test(&map->refs)) { + ASSERT(RB_EMPTY_NODE(&map->rb_node)); + kfree(map); + } +} struct btrfs_balance_args; struct btrfs_balance_progress; @@ -598,7 +613,7 @@ static inline unsigned long btrfs_chunk_item_size(int num_stripes) } /* - * Do the type safe converstion from stripe_nr to offset inside the chunk. + * Do the type safe conversion from stripe_nr to offset inside the chunk. * * @stripe_nr is u32, with left shift it can overflow u32 for chunks larger * than 4G. This does the proper type cast to avoid overflow. 
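Note on the chunk map rework above: the extent_map + map_lookup pair is replaced by a single refcounted btrfs_chunk_map that lives directly in fs_info->mapping_tree, so chunk mappings no longer drag along unused extent_map fields (orig_start, block_start, block_len and friends, which read_one_chunk() used to zero by hand). Lifetime is simple: btrfs_find_chunk_map()/btrfs_get_chunk_map() return a referenced map, and every user pairs that with btrfs_free_chunk_map(), which kfree()s once the refcount hits zero and the node is off the rb-tree. The body of btrfs_alloc_chunk_map() is not part of these hunks; the following is only a minimal sketch of what the allocation side plausibly looks like, using nothing but the struct fields and the btrfs_chunk_map_size() macro shown above (the in-tree helper may differ):

	static struct btrfs_chunk_map *chunk_map_alloc_sketch(int num_stripes, gfp_t gfp)
	{
		struct btrfs_chunk_map *map;

		/* One allocation: header plus a flexible array of num_stripes io stripes. */
		map = kzalloc(btrfs_chunk_map_size(num_stripes), gfp);
		if (!map)
			return NULL;

		/*
		 * Keep RB_EMPTY_NODE() true until the map is inserted into the
		 * mapping tree; btrfs_free_chunk_map() asserts exactly that
		 * before kfree().
		 */
		RB_CLEAR_NODE(&map->rb_node);
		map->num_stripes = num_stripes;
		refcount_set(&map->refs, 1);	/* the caller's reference */

		return map;
	}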
@@ -624,7 +639,7 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, u64 type); -void btrfs_mapping_tree_free(struct extent_map_tree *tree); +void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder); struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, @@ -680,13 +695,25 @@ int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len); unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info, u64 logical); -u64 btrfs_calc_stripe_length(const struct extent_map *em); +u64 btrfs_calc_stripe_length(const struct btrfs_chunk_map *map); int btrfs_nr_parity_stripes(u64 type); int btrfs_chunk_alloc_add_chunk_item(struct btrfs_trans_handle *trans, struct btrfs_block_group *bg); int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset); -struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, - u64 logical, u64 length); + +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +struct btrfs_chunk_map *btrfs_alloc_chunk_map(int num_stripes, gfp_t gfp); +int btrfs_add_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map); +#endif + +struct btrfs_chunk_map *btrfs_clone_chunk_map(struct btrfs_chunk_map *map, gfp_t gfp); +struct btrfs_chunk_map *btrfs_find_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); +struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); +struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, + u64 logical, u64 length); +void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map); void btrfs_release_disk_super(struct btrfs_super_block *super); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index 3cf236fb40a4..6287763fdccc 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -382,6 +382,53 @@ static int btrfs_xattr_handler_set(const struct xattr_handler *handler, return btrfs_setxattr_trans(inode, name, buffer, size, flags); } +static int btrfs_xattr_handler_get_security(const struct xattr_handler *handler, + struct dentry *unused, + struct inode *inode, + const char *name, void *buffer, + size_t size) +{ + int ret; + bool is_cap = false; + + name = xattr_full_name(handler, name); + + /* + * security.capability doesn't cache the results, so calls into us + * constantly to see if there's a capability xattr. Cache the result + * here in order to avoid wasting time doing lookups for xattrs we know + * don't exist. 
+ */ + if (strcmp(name, XATTR_NAME_CAPS) == 0) { + is_cap = true; + if (test_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags)) + return -ENODATA; + } + + ret = btrfs_getxattr(inode, name, buffer, size); + if (ret == -ENODATA && is_cap) + set_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); + return ret; +} + +static int btrfs_xattr_handler_set_security(const struct xattr_handler *handler, + struct mnt_idmap *idmap, + struct dentry *unused, + struct inode *inode, + const char *name, + const void *buffer, + size_t size, int flags) +{ + if (btrfs_root_readonly(BTRFS_I(inode)->root)) + return -EROFS; + + name = xattr_full_name(handler, name); + if (strcmp(name, XATTR_NAME_CAPS) == 0) + clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); + + return btrfs_setxattr_trans(inode, name, buffer, size, flags); +} + static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *unused, struct inode *inode, @@ -420,8 +467,8 @@ static int btrfs_xattr_handler_set_prop(const struct xattr_handler *handler, static const struct xattr_handler btrfs_security_xattr_handler = { .prefix = XATTR_SECURITY_PREFIX, - .get = btrfs_xattr_handler_get, - .set = btrfs_xattr_handler_set, + .get = btrfs_xattr_handler_get_security, + .set = btrfs_xattr_handler_set_security, }; static const struct xattr_handler btrfs_trusted_xattr_handler = { @@ -473,6 +520,10 @@ static int btrfs_initxattrs(struct inode *inode, } strcpy(name, XATTR_SECURITY_PREFIX); strcpy(name + XATTR_SECURITY_PREFIX_LEN, xattr->name); + + if (strcmp(name, XATTR_NAME_CAPS) == 0) + clear_bit(BTRFS_INODE_NO_CAP_XATTR, &BTRFS_I(inode)->runtime_flags); + err = btrfs_setxattr(trans, inode, name, xattr->value, xattr->value_len, 0); kfree(name); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index 6c231a116a29..36cf1f0e338e 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -121,7 +121,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->strm.total_in = 0; workspace->strm.total_out = 0; - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; @@ -200,7 +200,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; @@ -236,7 +236,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 3504ade30cb0..5bd76813b23f 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -578,26 +578,12 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache) kvfree(zones); - switch (bdev_zoned_model(bdev)) { - case BLK_ZONED_HM: + if (bdev_is_zoned(bdev)) { model = "host-managed zoned"; emulated = ""; - break; - case BLK_ZONED_HA: - model = "host-aware zoned"; - emulated = ""; - break; - case BLK_ZONED_NONE: + } else { model = "regular"; emulated = "emulated "; - break; - default: - /* Just in case */ - btrfs_err_in_rcu(fs_info, "zoned: unsupported model %d on %s", - bdev_zoned_model(bdev), - rcu_str_deref(device->name)); - ret = -EOPNOTSUPP; - goto out_free_zone_info; } btrfs_info_in_rcu(fs_info, @@ -609,9 +595,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device 
*device, bool populate_cache) out: kvfree(zones); -out_free_zone_info: btrfs_destroy_dev_zone_info(device); - return ret; } @@ -688,8 +672,7 @@ static int btrfs_check_for_zoned_device(struct btrfs_fs_info *fs_info) struct btrfs_device *device; list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { - if (device->bdev && - bdev_zoned_model(device->bdev) == BLK_ZONED_HM) { + if (device->bdev && bdev_is_zoned(device->bdev)) { btrfs_err(fs_info, "zoned: mode not enabled but zoned device found: %pg", device->bdev); @@ -781,7 +764,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) * Check mount options here, because we might change fs_info->zoned * from fs_info->zone_size. */ - ret = btrfs_check_mountopts_zoned(fs_info); + ret = btrfs_check_mountopts_zoned(fs_info, &fs_info->mount_opt); if (ret) return ret; @@ -789,7 +772,7 @@ int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info) return 0; } -int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) +int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt) { if (!btrfs_is_zoned(info)) return 0; @@ -798,18 +781,21 @@ int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) * Space cache writing is not COWed. Disable that to avoid write errors * in sequential zones. */ - if (btrfs_test_opt(info, SPACE_CACHE)) { + if (btrfs_raw_test_opt(*mount_opt, SPACE_CACHE)) { btrfs_err(info, "zoned: space cache v1 is not supported"); return -EINVAL; } - if (btrfs_test_opt(info, NODATACOW)) { + if (btrfs_raw_test_opt(*mount_opt, NODATACOW)) { btrfs_err(info, "zoned: NODATACOW not supported"); return -EINVAL; } - btrfs_clear_and_info(info, DISCARD_ASYNC, - "zoned: async discard ignored and disabled for zoned mode"); + if (btrfs_raw_test_opt(*mount_opt, DISCARD_ASYNC)) { + btrfs_info(info, + "zoned: async discard ignored and disabled for zoned mode"); + btrfs_clear_opt(*mount_opt, DISCARD_ASYNC); + } return 0; } @@ -1290,7 +1276,7 @@ struct zone_info { static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, struct zone_info *info, unsigned long *active, - struct map_lookup *map) + struct btrfs_chunk_map *map) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *device = map->stripes[zone_idx].dev; @@ -1393,7 +1379,7 @@ static int btrfs_load_block_group_single(struct btrfs_block_group *bg, } static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active) { @@ -1435,7 +1421,7 @@ static int btrfs_load_block_group_dup(struct btrfs_block_group *bg, } static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active) { @@ -1483,7 +1469,7 @@ static int btrfs_load_block_group_raid1(struct btrfs_block_group *bg, } static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active) { @@ -1515,7 +1501,7 @@ static int btrfs_load_block_group_raid0(struct btrfs_block_group *bg, } static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, - struct map_lookup *map, + struct btrfs_chunk_map *map, struct zone_info *zone_info, unsigned long *active) { @@ -1552,9 +1538,7 @@ static int btrfs_load_block_group_raid10(struct btrfs_block_group *bg, int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) { 
struct btrfs_fs_info *fs_info = cache->fs_info; - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; + struct btrfs_chunk_map *map; u64 logical = cache->start; u64 length = cache->length; struct zone_info *zone_info = NULL; @@ -1575,17 +1559,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) return -EIO; } - /* Get the chunk mapping */ - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, logical, length); - read_unlock(&em_tree->lock); - - if (!em) + map = btrfs_find_chunk_map(fs_info, logical, length); + if (!map) return -EINVAL; - map = em->map_lookup; - - cache->physical_map = kmemdup(map, map_lookup_size(map->num_stripes), GFP_NOFS); + cache->physical_map = btrfs_clone_chunk_map(map, GFP_NOFS); if (!cache->physical_map) { ret = -ENOMEM; goto out; @@ -1661,13 +1639,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } out: - if (cache->alloc_offset > fs_info->zone_size) { - btrfs_err(fs_info, - "zoned: invalid write pointer %llu in block group %llu", - cache->alloc_offset, cache->start); - ret = -EIO; - } - if (cache->alloc_offset > cache->zone_capacity) { btrfs_err(fs_info, "zoned: invalid write pointer %llu (larger than zone capacity %llu) in block group %llu", @@ -1694,12 +1665,11 @@ out: spin_unlock(&fs_info->zone_active_bgs_lock); } } else { - kfree(cache->physical_map); + btrfs_free_chunk_map(cache->physical_map); cache->physical_map = NULL; } bitmap_free(active); kfree(zone_info); - free_extent_map(em); return ret; } @@ -1722,22 +1692,6 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) cache->zone_unusable = unusable; } -void btrfs_redirty_list_add(struct btrfs_transaction *trans, - struct extent_buffer *eb) -{ - if (!btrfs_is_zoned(eb->fs_info) || - btrfs_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN)) - return; - - ASSERT(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - - memzero_extent_buffer(eb, 0, eb->len); - set_bit(EXTENT_BUFFER_NO_CHECK, &eb->bflags); - set_extent_buffer_dirty(eb); - set_extent_bit(&trans->dirty_pages, eb->start, eb->start + eb->len - 1, - EXTENT_DIRTY, NULL); -} - bool btrfs_use_zone_append(struct btrfs_bio *bbio) { u64 start = (bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT); @@ -2089,7 +2043,7 @@ int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical, bool btrfs_zone_activate(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct map_lookup *map; + struct btrfs_chunk_map *map; struct btrfs_device *device; u64 physical; const bool is_data = (block_group->flags & BTRFS_BLOCK_GROUP_DATA); @@ -2201,7 +2155,7 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group) static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written) { struct btrfs_fs_info *fs_info = block_group->fs_info; - struct map_lookup *map; + struct btrfs_chunk_map *map; const bool is_metadata = (block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)); int ret = 0; @@ -2650,7 +2604,7 @@ void btrfs_check_active_zone_reservation(struct btrfs_fs_info *fs_info) /* Release reservation for currently active block groups. 
*/ spin_lock(&fs_info->zone_active_bgs_lock); list_for_each_entry(block_group, &fs_info->zone_active_bgs, active_bg_list) { - struct map_lookup *map = block_group->physical_map; + struct btrfs_chunk_map *map = block_group->physical_map; if (!(block_group->flags & (BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM))) diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index b9cec523b778..f573bda496fb 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -45,7 +45,7 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache); void btrfs_destroy_dev_zone_info(struct btrfs_device *device); struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev); int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info); -int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info); +int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, unsigned long *mount_opt); int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw, u64 *bytenr_ret); int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw, @@ -59,8 +59,6 @@ int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical, int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size); int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new); void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); -void btrfs_redirty_list_add(struct btrfs_transaction *trans, - struct extent_buffer *eb); bool btrfs_use_zone_append(struct btrfs_bio *bbio); void btrfs_record_physical_zoned(struct btrfs_bio *bbio); int btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info, @@ -123,7 +121,8 @@ static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info) return -EOPNOTSUPP; } -static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info) +static inline int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info, + unsigned long *mount_opt) { return 0; } @@ -180,9 +179,6 @@ static inline int btrfs_load_block_group_zone_info( static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache) { } -static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, - struct extent_buffer *eb) { } - static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio) { return false; @@ -323,8 +319,8 @@ static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_i (bdev_zone_sectors(bdev) << SECTOR_SHIFT); } - /* Do not allow Host Manged zoned device */ - return bdev_zoned_model(bdev) != BLK_ZONED_HM; + /* Do not allow Host Managed zoned device. 
*/ + return !bdev_is_zoned(bdev); } static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos) diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 5511766485cd..0d66db8bc1d4 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -410,9 +410,8 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, workspace->in_buf.pos = 0; workspace->in_buf.size = min_t(size_t, len, PAGE_SIZE); - /* Allocate and map in the output buffer */ - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; } @@ -457,7 +456,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out; @@ -514,7 +513,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, ret = -E2BIG; goto out; } - out_page = alloc_page(GFP_NOFS); + out_page = btrfs_alloc_compr_page(); if (out_page == NULL) { ret = -ENOMEM; goto out;
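Note on the compression hunks: zlib_compress_pages() and zstd_compress_pages() now obtain output pages from btrfs_alloc_compr_page() instead of calling alloc_page(GFP_NOFS) directly. The helper's body is not shown in this diff; the name suggests a dedicated allocator for compression output, for example a small recycling pool that spares the page allocator the bursty alloc/free churn of compressing many extents back to back. A purely hypothetical sketch of such a pool follows (names and layout invented here; the real helper may work quite differently):

	/*
	 * Hypothetical: recycle compression output pages through a singly
	 * linked pool chained via page->private. A real pool would also cap
	 * its size and shrink under memory pressure.
	 */
	static struct page *compr_pool_head;
	static DEFINE_SPINLOCK(compr_pool_lock);

	static struct page *compr_page_alloc_sketch(void)
	{
		struct page *page = NULL;

		spin_lock(&compr_pool_lock);
		if (compr_pool_head) {
			page = compr_pool_head;
			compr_pool_head = (struct page *)page->private;
			page->private = 0;
		}
		spin_unlock(&compr_pool_lock);

		/* Pool empty: fall back to what the old code always did. */
		return page ?: alloc_page(GFP_NOFS);
	}

	static void compr_page_free_sketch(struct page *page)
	{
		spin_lock(&compr_pool_lock);
		page->private = (unsigned long)compr_pool_head;
		compr_pool_head = page;
		spin_unlock(&compr_pool_lock);
	}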
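Note on the xattr hunks: they add a per-inode negative cache for security.capability. The VFS probes that xattr on nearly every write, so btrfs now records a prior -ENODATA in the BTRFS_INODE_NO_CAP_XATTR runtime flag and answers the next probe with a single bit test, while any set of that name (including via btrfs_initxattrs()) clears the bit to keep the cache coherent. The pattern, condensed into one hypothetical helper (inode_flags() and NO_CAP_XATTR stand in for BTRFS_I(inode)->runtime_flags and the real flag bit):

	static int cached_get_security_sketch(struct inode *inode, const char *name,
					      void *buffer, size_t size)
	{
		bool is_cap = (strcmp(name, XATTR_NAME_CAPS) == 0);
		int ret;

		/* Fast path: a previous lookup already proved the xattr is absent. */
		if (is_cap && test_bit(NO_CAP_XATTR, inode_flags(inode)))
			return -ENODATA;

		ret = btrfs_getxattr(inode, name, buffer, size);

		/* Cache the miss; the matching setter must clear_bit() when storing. */
		if (ret == -ENODATA && is_cap)
			set_bit(NO_CAP_XATTR, inode_flags(inode));

		return ret;
	}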