diff options
Diffstat (limited to 'fs/btrfs/raid56.c')
-rw-r--r-- | fs/btrfs/raid56.c | 2052 |
1 files changed, 1082 insertions, 970 deletions
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 82c8e991300e..2d90a6b5eb00 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -13,12 +13,15 @@ #include <linux/list_sort.h> #include <linux/raid/xor.h> #include <linux/mm.h> +#include "messages.h" #include "misc.h" #include "ctree.h" #include "disk-io.h" #include "volumes.h" #include "raid56.h" #include "async-thread.h" +#include "file-item.h" +#include "btrfs_inode.h" /* set when additional merges to this rbio are not allowed */ #define RBIO_RMW_LOCKED_BIT 1 @@ -63,19 +66,45 @@ struct sector_ptr { unsigned int uptodate:8; }; -static int __raid56_parity_recover(struct btrfs_raid_bio *rbio); -static noinline void finish_rmw(struct btrfs_raid_bio *rbio); -static void rmw_work(struct work_struct *work); -static void read_rebuild_work(struct work_struct *work); -static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio); -static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed); -static void __free_raid_bio(struct btrfs_raid_bio *rbio); +static void rmw_rbio_work(struct work_struct *work); +static void rmw_rbio_work_locked(struct work_struct *work); static void index_rbio_pages(struct btrfs_raid_bio *rbio); static int alloc_rbio_pages(struct btrfs_raid_bio *rbio); -static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - int need_check); -static void scrub_parity_work(struct work_struct *work); +static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check); +static void scrub_rbio_work_locked(struct work_struct *work); + +static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio) +{ + bitmap_free(rbio->error_bitmap); + kfree(rbio->stripe_pages); + kfree(rbio->bio_sectors); + kfree(rbio->stripe_sectors); + kfree(rbio->finish_pointers); +} + +static void free_raid_bio(struct btrfs_raid_bio *rbio) +{ + int i; + + if (!refcount_dec_and_test(&rbio->refs)) + return; + + WARN_ON(!list_empty(&rbio->stripe_cache)); + WARN_ON(!list_empty(&rbio->hash_list)); + WARN_ON(!bio_list_empty(&rbio->bio_list)); + + for (i = 0; i < rbio->nr_pages; i++) { + if (rbio->stripe_pages[i]) { + __free_page(rbio->stripe_pages[i]); + rbio->stripe_pages[i] = NULL; + } + } + + btrfs_put_bioc(rbio->bioc); + free_raid_bio_pointers(rbio); + kfree(rbio); +} static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func) { @@ -146,8 +175,16 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ - if (!rbio->bio_sectors[i].page) + if (!rbio->bio_sectors[i].page) { + /* + * Even if the sector is not covered by bio, if it is + * a data sector it should still be uptodate as it is + * read from disk. + */ + if (i < rbio->nr_data * rbio->stripe_nsectors) + ASSERT(rbio->stripe_sectors[i].uptodate); continue; + } ASSERT(rbio->stripe_sectors[i].page); memcpy_page(rbio->stripe_sectors[i].page, @@ -234,6 +271,21 @@ static void steal_rbio_page(struct btrfs_raid_bio *src, dest->stripe_sectors[i].uptodate = true; } +static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr) +{ + const int sector_nr = (page_nr << PAGE_SHIFT) >> + rbio->bioc->fs_info->sectorsize_bits; + + /* + * We have ensured PAGE_SIZE is aligned with sectorsize, thus + * we won't have a page which is half data half parity. + * + * Thus if the first sector of the page belongs to data stripes, then + * the full page belongs to data stripes. + */ + return (sector_nr < rbio->nr_data * rbio->stripe_nsectors); +} + /* * Stealing an rbio means taking all the uptodate pages from the stripe array * in the source rbio and putting them into the destination rbio. @@ -244,16 +296,26 @@ static void steal_rbio_page(struct btrfs_raid_bio *src, static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest) { int i; - struct page *s; if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags)) return; for (i = 0; i < dest->nr_pages; i++) { - s = src->stripe_pages[i]; - if (!s || !full_page_sectors_uptodate(src, i)) + struct page *p = src->stripe_pages[i]; + + /* + * We don't need to steal P/Q pages as they will always be + * regenerated for RMW or full write anyway. + */ + if (!is_data_stripe_page(src, i)) continue; + /* + * If @src already has RBIO_CACHE_READY_BIT, it should have + * all data stripe pages present and uptodate. + */ + ASSERT(p); + ASSERT(full_page_sectors_uptodate(src, i)); steal_rbio_page(src, dest, i); } index_stripe_sectors(dest); @@ -336,7 +398,7 @@ static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio) spin_unlock(&h->lock); if (freeit) - __free_raid_bio(rbio); + free_raid_bio(rbio); } /* @@ -526,28 +588,10 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, if (last->operation == BTRFS_RBIO_PARITY_SCRUB) return 0; - if (last->operation == BTRFS_RBIO_REBUILD_MISSING) + if (last->operation == BTRFS_RBIO_REBUILD_MISSING || + last->operation == BTRFS_RBIO_READ_REBUILD) return 0; - if (last->operation == BTRFS_RBIO_READ_REBUILD) { - int fa = last->faila; - int fb = last->failb; - int cur_fa = cur->faila; - int cur_fb = cur->failb; - - if (last->faila >= last->failb) { - fa = last->failb; - fb = last->faila; - } - - if (cur->faila >= cur->failb) { - cur_fa = cur->failb; - cur_fb = cur->faila; - } - - if (fa != cur_fa || fb != cur_fb) - return 0; - } return 1; } @@ -684,10 +728,12 @@ out: if (cache_drop) remove_rbio_from_cache(cache_drop); if (freeit) - __free_raid_bio(freeit); + free_raid_bio(freeit); return ret; } +static void recover_rbio_work_locked(struct work_struct *work); + /* * called as rmw or parity rebuild is completed. If the plug list has more * rbios waiting for this stripe, the next one on the list will be started @@ -745,16 +791,16 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio) spin_unlock_irqrestore(&h->lock, flags); if (next->operation == BTRFS_RBIO_READ_REBUILD) - start_async_work(next, read_rebuild_work); + start_async_work(next, recover_rbio_work_locked); else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) { steal_rbio(rbio, next); - start_async_work(next, read_rebuild_work); + start_async_work(next, recover_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_WRITE) { steal_rbio(rbio, next); - start_async_work(next, rmw_work); + start_async_work(next, rmw_rbio_work_locked); } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) { steal_rbio(rbio, next); - start_async_work(next, scrub_parity_work); + start_async_work(next, scrub_rbio_work_locked); } goto done_nolock; @@ -769,28 +815,6 @@ done_nolock: remove_rbio_from_cache(rbio); } -static void __free_raid_bio(struct btrfs_raid_bio *rbio) -{ - int i; - - if (!refcount_dec_and_test(&rbio->refs)) - return; - - WARN_ON(!list_empty(&rbio->stripe_cache)); - WARN_ON(!list_empty(&rbio->hash_list)); - WARN_ON(!bio_list_empty(&rbio->bio_list)); - - for (i = 0; i < rbio->nr_pages; i++) { - if (rbio->stripe_pages[i]) { - __free_page(rbio->stripe_pages[i]); - rbio->stripe_pages[i] = NULL; - } - } - - btrfs_put_bioc(rbio->bioc); - kfree(rbio); -} - static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) { struct bio *next; @@ -813,6 +837,11 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *extra; + kfree(rbio->csum_buf); + bitmap_free(rbio->csum_bitmap); + rbio->csum_buf = NULL; + rbio->csum_bitmap = NULL; + /* * Clear the data bitmap, as the rbio may be cached for later usage. * do this before before unlock_stripe() so there will be no new bio @@ -830,7 +859,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) */ unlock_stripe(rbio); extra = bio_list_get(&rbio->bio_list); - __free_raid_bio(rbio); + free_raid_bio(rbio); rbio_endio_bio_list(cur, err); if (extra) @@ -838,36 +867,7 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) } /* - * end io function used by finish_rmw. When we finally - * get here, we've written a full stripe - */ -static void raid_write_end_io(struct bio *bio) -{ - struct btrfs_raid_bio *rbio = bio->bi_private; - blk_status_t err = bio->bi_status; - int max_errors; - - if (err) - fail_bio_stripe(rbio, bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; - - err = BLK_STS_OK; - - /* OK, we have read all the stripes we need to. */ - max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? - 0 : rbio->bioc->max_errors; - if (atomic_read(&rbio->error) > max_errors) - err = BLK_STS_IOERR; - - rbio_orig_end_io(rbio, err); -} - -/** - * Get a sector pointer specified by its @stripe_nr and @sector_nr + * Get a sector pointer specified by its @stripe_nr and @sector_nr. * * @rbio: The raid bio * @stripe_nr: Stripe number, valid range [0, real_stripe) @@ -919,7 +919,6 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits; const unsigned int num_sectors = stripe_nsectors * real_stripes; struct btrfs_raid_bio *rbio; - void *p; /* PAGE_SIZE must also be aligned to sectorsize for subpage support */ ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize)); @@ -929,16 +928,27 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, */ ASSERT(stripe_nsectors <= BITS_PER_LONG); - rbio = kzalloc(sizeof(*rbio) + - sizeof(*rbio->stripe_pages) * num_pages + - sizeof(*rbio->bio_sectors) * num_sectors + - sizeof(*rbio->stripe_sectors) * num_sectors + - sizeof(*rbio->finish_pointers) * real_stripes, - GFP_NOFS); + rbio = kzalloc(sizeof(*rbio), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); + rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *), + GFP_NOFS); + rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), + GFP_NOFS); + rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr), + GFP_NOFS); + rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS); + rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS); + + if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors || + !rbio->finish_pointers || !rbio->error_bitmap) { + free_raid_bio_pointers(rbio); + kfree(rbio); + return ERR_PTR(-ENOMEM); + } bio_list_init(&rbio->bio_list); + init_waitqueue_head(&rbio->io_wait); INIT_LIST_HEAD(&rbio->plug_list); spin_lock_init(&rbio->bio_list_lock); INIT_LIST_HEAD(&rbio->stripe_cache); @@ -950,27 +960,9 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info, rbio->real_stripes = real_stripes; rbio->stripe_npages = stripe_npages; rbio->stripe_nsectors = stripe_nsectors; - rbio->faila = -1; - rbio->failb = -1; refcount_set(&rbio->refs, 1); - atomic_set(&rbio->error, 0); atomic_set(&rbio->stripes_pending, 0); - /* - * The stripe_pages, bio_sectors, etc arrays point to the extra memory - * we allocated past the end of the rbio. - */ - p = rbio + 1; -#define CONSUME_ALLOC(ptr, count) do { \ - ptr = p; \ - p = (unsigned char *)p + sizeof(*(ptr)) * (count); \ - } while (0) - CONSUME_ALLOC(rbio->stripe_pages, num_pages); - CONSUME_ALLOC(rbio->bio_sectors, num_sectors); - CONSUME_ALLOC(rbio->stripe_sectors, num_sectors); - CONSUME_ALLOC(rbio->finish_pointers, real_stripes); -#undef CONSUME_ALLOC - ASSERT(btrfs_nr_parity_stripes(bioc->map_type)); rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type); @@ -1006,6 +998,45 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) } /* + * Return the total numer of errors found in the vertical stripe of @sector_nr. + * + * @faila and @failb will also be updated to the first and second stripe + * number of the errors. + */ +static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr, + int *faila, int *failb) +{ + int stripe_nr; + int found_errors = 0; + + if (faila || failb) { + /* + * Both @faila and @failb should be valid pointers if any of + * them is specified. + */ + ASSERT(faila && failb); + *faila = -1; + *failb = -1; + } + + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr; + + if (test_bit(total_sector_nr, rbio->error_bitmap)) { + found_errors++; + if (faila) { + /* Update faila and failb. */ + if (*faila < 0) + *faila = stripe_nr; + else if (*failb < 0) + *failb = stripe_nr; + } + } + } + return found_errors; +} + +/* * Add a single sector @sector into our list of bios for IO. * * Return 0 if everything went well. @@ -1038,8 +1069,19 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, disk_start = stripe->physical + sector_nr * sectorsize; /* if the device is missing, just fail this stripe */ - if (!stripe->dev->bdev) - return fail_rbio_index(rbio, stripe_nr); + if (!stripe->dev->bdev) { + int found_errors; + + set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr, + rbio->error_bitmap); + + /* Check if we have reached tolerance early. */ + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + NULL, NULL); + if (found_errors > rbio->bioc->max_errors) + return -EIO; + return 0; + } /* see if we can add this page onto our existing bio */ if (last) { @@ -1071,23 +1113,6 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, return 0; } -/* - * while we're doing the read/modify/write cycle, we could - * have errors in reading pages off the disk. This checks - * for errors and if we're not able to read the page it'll - * trigger parity reconstruction. The rmw will be finished - * after we've reconstructed the failed stripes - */ -static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) -{ - if (rbio->faila >= 0 || rbio->failb >= 0) { - BUG_ON(rbio->faila == rbio->real_stripes - 1); - __raid56_parity_recover(rbio); - } else { - finish_rmw(rbio); - } -} - static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; @@ -1158,109 +1183,71 @@ not_found: trace_info->stripe_nr = -1; } -/* - * this is called from one of two situations. We either - * have a full stripe from the higher layers, or we've read all - * the missing bits off disk. - * - * This will calculate the parity and then send down any - * changed blocks. - */ -static noinline void finish_rmw(struct btrfs_raid_bio *rbio) +/* Generate PQ for one veritical stripe. */ +static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { - struct btrfs_io_context *bioc = rbio->bioc; - const u32 sectorsize = bioc->fs_info->sectorsize; void **pointers = rbio->finish_pointers; - int nr_data = rbio->nr_data; + const u32 sectorsize = rbio->bioc->fs_info->sectorsize; + struct sector_ptr *sector; + int stripe; + const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6; + + /* First collect one sector from each data stripe */ + for (stripe = 0; stripe < rbio->nr_data; stripe++) { + sector = sector_in_rbio(rbio, stripe, sectornr, 0); + pointers[stripe] = kmap_local_page(sector->page) + + sector->pgoff; + } + + /* Then add the parity stripe */ + sector = rbio_pstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; + + if (has_qstripe) { + /* + * RAID6, add the qstripe and call the library function + * to fill in our p/q + */ + sector = rbio_qstripe_sector(rbio, sectornr); + sector->uptodate = 1; + pointers[stripe++] = kmap_local_page(sector->page) + + sector->pgoff; + + raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, + pointers); + } else { + /* raid5 */ + memcpy(pointers[rbio->nr_data], pointers[0], sectorsize); + run_xor(pointers + 1, rbio->nr_data - 1, sectorsize); + } + for (stripe = stripe - 1; stripe >= 0; stripe--) + kunmap_local(pointers[stripe]); +} + +static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; /* The total sector number inside the full stripe. */ int total_sector_nr; - int stripe; - /* Sector number inside a stripe. */ int sectornr; - bool has_qstripe; - struct bio_list bio_list; - struct bio *bio; + int stripe; int ret; - bio_list_init(&bio_list); - - if (rbio->real_stripes - rbio->nr_data == 1) - has_qstripe = false; - else if (rbio->real_stripes - rbio->nr_data == 2) - has_qstripe = true; - else - BUG(); + ASSERT(bio_list_size(bio_list) == 0); /* We should have at least one data sector. */ ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors)); - /* at this point we either have a full stripe, - * or we've read the full stripe from the drive. - * recalculate the parity and write the new results. - * - * We're not allowed to add any new bios to the - * bio list here, anyone else that wants to - * change this stripe needs to do their own rmw. - */ - spin_lock_irq(&rbio->bio_list_lock); - set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); - - atomic_set(&rbio->error, 0); - /* - * now that we've set rmw_locked, run through the - * bio list one last time and map the page pointers - * - * We don't cache full rbios because we're assuming - * the higher layers are unlikely to use this area of - * the disk again soon. If they do use it again, - * hopefully they will send another full bio. + * Reset errors, as we may have errors inherited from from degraded + * write. */ - index_rbio_pages(rbio); - if (!rbio_is_full(rbio)) - cache_rbio_pages(rbio); - else - clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; - - /* First collect one sector from each data stripe */ - for (stripe = 0; stripe < nr_data; stripe++) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; - } - - /* Then add the parity stripe */ - sector = rbio_pstripe_sector(rbio, sectornr); - sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; - - if (has_qstripe) { - /* - * RAID6, add the qstripe and call the library function - * to fill in our p/q - */ - sector = rbio_qstripe_sector(rbio, sectornr); - sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + - sector->pgoff; - - raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, - pointers); - } else { - /* raid5 */ - memcpy(pointers[nr_data], pointers[0], sectorsize); - run_xor(pointers + 1, nr_data - 1, sectorsize); - } - for (stripe = stripe - 1; stripe >= 0; stripe--) - kunmap_local(pointers[stripe]); - } + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* - * Start writing. Make bios for everything from the higher layers (the + * Start assembly. Make bios for everything from the higher layers (the * bio_list in our rbio) and our P/Q. Ignore everything else. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; @@ -1282,15 +1269,16 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) sector = rbio_stripe_sector(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_WRITE); if (ret) - goto cleanup; + goto error; } - if (likely(!bioc->num_tgtdevs)) - goto write_data; + if (likely(!rbio->bioc->num_tgtdevs)) + return 0; + /* Make a copy for the replace target device. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { struct sector_ptr *sector; @@ -1298,7 +1286,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) stripe = total_sector_nr / rbio->stripe_nsectors; sectornr = total_sector_nr % rbio->stripe_nsectors; - if (!bioc->tgtdev_map[stripe]) { + if (!rbio->bioc->tgtdev_map[stripe]) { /* * We can skip the whole stripe completely, note * total_sector_nr will be increased by one anyway. @@ -1320,125 +1308,52 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) sector = rbio_stripe_sector(rbio, stripe, sectornr); } - ret = rbio_add_io_sector(rbio, &bio_list, sector, + ret = rbio_add_io_sector(rbio, bio_list, sector, rbio->bioc->tgtdev_map[stripe], sectornr, REQ_OP_WRITE); if (ret) - goto cleanup; - } - -write_data: - atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list)); - BUG_ON(atomic_read(&rbio->stripes_pending) == 0); - - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_write_end_io; - - if (trace_raid56_write_stripe_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_write_stripe(rbio, bio, &trace_info); - } - submit_bio(bio); + goto error; } - return; -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - - while ((bio = bio_list_pop(&bio_list))) + return 0; +error: + while ((bio = bio_list_pop(bio_list))) bio_put(bio); + return -EIO; } -/* - * helper to find the stripe number for a given bio. Used to figure out which - * stripe has failed. This expects the bio to correspond to a physical disk, - * so it looks up based on physical sector numbers. - */ -static int find_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - u64 physical = bio->bi_iter.bi_sector; - int i; - struct btrfs_io_stripe *stripe; - - physical <<= 9; - - for (i = 0; i < rbio->bioc->num_stripes; i++) { - stripe = &rbio->bioc->stripes[i]; - if (in_range(physical, stripe->physical, BTRFS_STRIPE_LEN) && - stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) { - return i; - } - } - return -1; -} - -/* - * helper to find the stripe number for a given - * bio (before mapping). Used to figure out which stripe has - * failed. This looks up based on logical block numbers. - */ -static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - u64 logical = bio->bi_iter.bi_sector << 9; - int i; - - for (i = 0; i < rbio->nr_data; i++) { - u64 stripe_start = rbio->bioc->raid_map[i]; - - if (in_range(logical, stripe_start, BTRFS_STRIPE_LEN)) - return i; - } - return -1; -} - -/* - * returns -EIO if we had too many failures - */ -static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed) +static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) { - unsigned long flags; - int ret = 0; + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - + rbio->bioc->raid_map[0]; + int total_nr_sector = offset >> fs_info->sectorsize_bits; - spin_lock_irqsave(&rbio->bio_list_lock, flags); + ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors); - /* we already know this stripe is bad, move on */ - if (rbio->faila == failed || rbio->failb == failed) - goto out; + bitmap_set(rbio->error_bitmap, total_nr_sector, + bio->bi_iter.bi_size >> fs_info->sectorsize_bits); - if (rbio->faila == -1) { - /* first failure on this rbio */ - rbio->faila = failed; - atomic_inc(&rbio->error); - } else if (rbio->failb == -1) { - /* second failure on this rbio */ - rbio->failb = failed; - atomic_inc(&rbio->error); - } else { - ret = -EIO; + /* + * Special handling for raid56_alloc_missing_rbio() used by + * scrub/replace. Unlike call path in raid56_parity_recover(), they + * pass an empty bio here. Thus we have to find out the missing device + * and mark the stripe error instead. + */ + if (bio->bi_iter.bi_size == 0) { + bool found_missing = false; + int stripe_nr; + + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { + if (!rbio->bioc->stripes[stripe_nr].dev->bdev) { + found_missing = true; + bitmap_set(rbio->error_bitmap, + stripe_nr * rbio->stripe_nsectors, + rbio->stripe_nsectors); + } + } + ASSERT(found_missing); } -out: - spin_unlock_irqrestore(&rbio->bio_list_lock, flags); - - return ret; -} - -/* - * helper to fail a stripe based on a physical disk - * bio. - */ -static int fail_bio_stripe(struct btrfs_raid_bio *rbio, - struct bio *bio) -{ - int failed = find_bio_stripe(rbio, bio); - - if (failed < 0) - return -EIO; - - return fail_rbio_index(rbio, failed); } /* @@ -1486,191 +1401,163 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) } } -static void raid56_bio_end_io(struct bio *bio) +static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) { - struct btrfs_raid_bio *rbio = bio->bi_private; - - if (bio->bi_status) - fail_bio_stripe(rbio, bio); - else - set_bio_pages_uptodate(rbio, bio); + struct bio_vec *bv = bio_first_bvec_all(bio); + int i; - bio_put(bio); + for (i = 0; i < rbio->nr_sectors; i++) { + struct sector_ptr *sector; - if (atomic_dec_and_test(&rbio->stripes_pending)) - queue_work(rbio->bioc->fs_info->endio_raid56_workers, - &rbio->end_io_work); + sector = &rbio->stripe_sectors[i]; + if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + break; + sector = &rbio->bio_sectors[i]; + if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + break; + } + ASSERT(i < rbio->nr_sectors); + return i; } -/* - * End io handler for the read phase of the RMW cycle. All the bios here are - * physical stripe bios we've read from the disk so we can recalculate the - * parity of the stripe. - * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid56_rmw_end_io_work(struct work_struct *work) +static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio) { - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); + int total_sector_nr = get_bio_sector_nr(rbio, bio); + u32 bio_size = 0; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) { - rbio_orig_end_io(rbio, BLK_STS_IOERR); - return; - } + bio_for_each_segment_all(bvec, bio, iter_all) + bio_size += bvec->bv_len; - /* - * This will normally call finish_rmw to start our write but if there - * are any failed stripes we'll reconstruct from parity first. - */ - validate_rbio_for_rmw(rbio); + bitmap_set(rbio->error_bitmap, total_sector_nr, + bio_size >> rbio->bioc->fs_info->sectorsize_bits); } -/* - * the stripe must be locked by the caller. It will - * unlock after all the writes are done - */ -static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) +/* Verify the data sectors at read time. */ +static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, + struct bio *bio) { - int bios_to_read = 0; - struct bio_list bio_list; - const int nr_data_sectors = rbio->stripe_nsectors * rbio->nr_data; - int ret; - int total_sector_nr; - struct bio *bio; + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + int total_sector_nr = get_bio_sector_nr(rbio, bio); + struct bio_vec *bvec; + struct bvec_iter_all iter_all; - bio_list_init(&bio_list); + /* No data csum for the whole stripe, no need to verify. */ + if (!rbio->csum_bitmap || !rbio->csum_buf) + return; - ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; + /* P/Q stripes, they have no data csum to verify against. */ + if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors) + return; - index_rbio_pages(rbio); + bio_for_each_segment_all(bvec, bio, iter_all) { + int bv_offset; + + for (bv_offset = bvec->bv_offset; + bv_offset < bvec->bv_offset + bvec->bv_len; + bv_offset += fs_info->sectorsize, total_sector_nr++) { + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *expected_csum = rbio->csum_buf + + total_sector_nr * fs_info->csum_size; + int ret; - atomic_set(&rbio->error, 0); - /* Build a list of bios to read all the missing data sectors. */ - for (total_sector_nr = 0; total_sector_nr < nr_data_sectors; - total_sector_nr++) { - struct sector_ptr *sector; - int stripe = total_sector_nr / rbio->stripe_nsectors; - int sectornr = total_sector_nr % rbio->stripe_nsectors; + /* No csum for this sector, skip to the next sector. */ + if (!test_bit(total_sector_nr, rbio->csum_bitmap)) + continue; - /* - * We want to find all the sectors missing from the rbio and - * read them from the disk. If sector_in_rbio() finds a page - * in the bio list we don't need to read it off the stripe. - */ - sector = sector_in_rbio(rbio, stripe, sectornr, 1); - if (sector) - continue; + ret = btrfs_check_sector_csum(fs_info, bvec->bv_page, + bv_offset, csum_buf, expected_csum); + if (ret < 0) + set_bit(total_sector_nr, rbio->error_bitmap); + } + } +} - sector = rbio_stripe_sector(rbio, stripe, sectornr); - /* - * The bio cache may have handed us an uptodate page. If so, - * use it. - */ - if (sector->uptodate) - continue; +static void raid_wait_read_end_io(struct bio *bio) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; - ret = rbio_add_io_sector(rbio, &bio_list, sector, - stripe, sectornr, REQ_OP_READ); - if (ret) - goto cleanup; + if (bio->bi_status) { + rbio_update_error_bitmap(rbio, bio); + } else { + set_bio_pages_uptodate(rbio, bio); + verify_bio_data_sectors(rbio, bio); } - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * this can happen if others have merged with - * us, it means there is nothing left to read. - * But if there are missing devices it may not be - * safe to do the full stripe write yet. - */ - goto finish; - } + bio_put(bio); + if (atomic_dec_and_test(&rbio->stripes_pending)) + wake_up(&rbio->io_wait); +} - /* - * The bioc may be freed once we submit the last bio. Make sure not to - * touch it after that. - */ - atomic_set(&rbio->stripes_pending, bios_to_read); - INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work); - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_bio_end_io; +static void submit_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; + + atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); + while ((bio = bio_list_pop(bio_list))) { + bio->bi_end_io = raid_wait_read_end_io; - if (trace_raid56_read_partial_enabled()) { + if (trace_raid56_scrub_read_recover_enabled()) { struct raid56_bio_trace_info trace_info = { 0 }; bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_read_partial(rbio, bio, &trace_info); + trace_raid56_scrub_read_recover(rbio, bio, &trace_info); } submit_bio(bio); } - /* the actual write will happen once the reads are done */ - return 0; +} -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); +static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; + int total_sector_nr; + int ret = 0; - while ((bio = bio_list_pop(&bio_list))) - bio_put(bio); + ASSERT(bio_list_size(bio_list) == 0); - return -EIO; + /* + * Build a list of bios to read all sectors (including data and P/Q). + * + * This behaviro is to compensate the later csum verification and + * recovery. + */ + for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; + total_sector_nr++) { + struct sector_ptr *sector; + int stripe = total_sector_nr / rbio->stripe_nsectors; + int sectornr = total_sector_nr % rbio->stripe_nsectors; -finish: - validate_rbio_for_rmw(rbio); + sector = rbio_stripe_sector(rbio, stripe, sectornr); + ret = rbio_add_io_sector(rbio, bio_list, sector, + stripe, sectornr, REQ_OP_READ); + if (ret) + goto cleanup; + } return 0; + +cleanup: + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); + return ret; } -/* - * if the upper layers pass in a full stripe, we thank them by only allocating - * enough pages to hold the parity, and sending it all down quickly. - */ -static int full_stripe_write(struct btrfs_raid_bio *rbio) +static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio) { + const int data_pages = rbio->nr_data * rbio->stripe_npages; int ret; - ret = alloc_rbio_parity_pages(rbio); - if (ret) + ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages); + if (ret < 0) return ret; - ret = lock_stripe_add(rbio); - if (ret == 0) - finish_rmw(rbio); - return 0; -} - -/* - * partial stripe writes get handed over to async helpers. - * We're really hoping to merge a few more writes into this - * rbio before calculating new parity - */ -static int partial_stripe_write(struct btrfs_raid_bio *rbio) -{ - int ret; - - ret = lock_stripe_add(rbio); - if (ret == 0) - start_async_work(rbio, rmw_work); + index_stripe_sectors(rbio); return 0; } /* - * sometimes while we were reading from the drive to - * recalculate parity, enough new bios come into create - * a full stripe. So we do a check here to see if we can - * go directly to finish_rmw - */ -static int __raid56_parity_write(struct btrfs_raid_bio *rbio) -{ - /* head off into rmw land if we don't have a full stripe */ - if (!rbio_is_full(rbio)) - return partial_stripe_write(rbio); - return full_stripe_write(rbio); -} - -/* * We use plugging call backs to collect full stripes. * Any time we get a partial stripe write while plugged * we collect it into a list. When the unplug comes down, @@ -1704,71 +1591,39 @@ static int plug_cmp(void *priv, const struct list_head *a, return 0; } -static void run_plug(struct btrfs_plug_cb *plug) +static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) { + struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb); struct btrfs_raid_bio *cur; struct btrfs_raid_bio *last = NULL; - /* - * sort our plug list then try to merge - * everything we can in hopes of creating full - * stripes. - */ list_sort(NULL, &plug->rbio_list, plug_cmp); + while (!list_empty(&plug->rbio_list)) { cur = list_entry(plug->rbio_list.next, struct btrfs_raid_bio, plug_list); list_del_init(&cur->plug_list); if (rbio_is_full(cur)) { - int ret; - - /* we have a full stripe, send it down */ - ret = full_stripe_write(cur); - BUG_ON(ret); + /* We have a full stripe, queue it down. */ + start_async_work(cur, rmw_rbio_work); continue; } if (last) { if (rbio_can_merge(last, cur)) { merge_rbio(last, cur); - __free_raid_bio(cur); + free_raid_bio(cur); continue; - } - __raid56_parity_write(last); + start_async_work(last, rmw_rbio_work); } last = cur; } - if (last) { - __raid56_parity_write(last); - } + if (last) + start_async_work(last, rmw_rbio_work); kfree(plug); } -/* - * if the unplug comes from schedule, we have to push the - * work off to a helper thread - */ -static void unplug_work(struct work_struct *work) -{ - struct btrfs_plug_cb *plug; - plug = container_of(work, struct btrfs_plug_cb, work); - run_plug(plug); -} - -static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule) -{ - struct btrfs_plug_cb *plug; - plug = container_of(cb, struct btrfs_plug_cb, cb); - - if (from_schedule) { - INIT_WORK(&plug->work, unplug_work); - queue_work(plug->info->rmw_workers, &plug->work); - return; - } - run_plug(plug); -} - /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */ static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio) { @@ -1816,19 +1671,13 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) rbio_add_bio(rbio, bio); /* - * don't plug on full rbios, just get them out the door + * Don't plug on full rbios, just get them out the door * as quickly as we can */ - if (rbio_is_full(rbio)) { - ret = full_stripe_write(rbio); - if (ret) { - __free_raid_bio(rbio); - goto fail; - } - return; - } + if (rbio_is_full(rbio)) + goto queue_rbio; - cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug)); + cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug)); if (cb) { plug = container_of(cb, struct btrfs_plug_cb, cb); if (!plug->info) { @@ -1836,13 +1685,14 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc) INIT_LIST_HEAD(&plug->rbio_list); } list_add_tail(&rbio->plug_list, &plug->rbio_list); - } else { - ret = __raid56_parity_write(rbio); - if (ret) { - __free_raid_bio(rbio); - goto fail; - } + return; } +queue_rbio: + /* + * Either we don't have any existing plug, or we're doing a full stripe, + * can queue the rmw work now. + */ + start_async_work(rbio, rmw_rbio_work); return; @@ -1851,268 +1701,254 @@ fail: bio_endio(bio); } -/* - * all parity reconstruction happens here. We've read in everything - * we can find from the drives and this does the heavy lifting of - * sorting the good from the bad. - */ -static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) +static int verify_one_sector(struct btrfs_raid_bio *rbio, + int stripe_nr, int sector_nr) { - const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - int sectornr, stripe; - void **pointers; - void **unmap_array; - int faila = -1, failb = -1; - blk_status_t err; - int i; + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct sector_ptr *sector; + u8 csum_buf[BTRFS_CSUM_SIZE]; + u8 *csum_expected; + int ret; - /* - * This array stores the pointer for each sector, thus it has the extra - * pgoff value added from each sector - */ - pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); - if (!pointers) { - err = BLK_STS_RESOURCE; - goto cleanup_io; - } + if (!rbio->csum_bitmap || !rbio->csum_buf) + return 0; + /* No way to verify P/Q as they are not covered by data csum. */ + if (stripe_nr >= rbio->nr_data) + return 0; /* - * Store copy of pointers that does not get reordered during - * reconstruction so that kunmap_local works. + * If we're rebuilding a read, we have to use pages from the + * bio list if possible. */ - unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); - if (!unmap_array) { - err = BLK_STS_RESOURCE; - goto cleanup_pointers; + if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { + sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + } else { + sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } - faila = rbio->faila; - failb = rbio->failb; + ASSERT(sector->page); - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { - spin_lock_irq(&rbio->bio_list_lock); - set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); - spin_unlock_irq(&rbio->bio_list_lock); - } + csum_expected = rbio->csum_buf + + (stripe_nr * rbio->stripe_nsectors + sector_nr) * + fs_info->csum_size; + ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff, + csum_buf, csum_expected); + return ret; +} - index_rbio_pages(rbio); +/* + * Recover a vertical stripe specified by @sector_nr. + * @*pointers are the pre-allocated pointers by the caller, so we don't + * need to allocate/free the pointers again and again. + */ +static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, + void **pointers, void **unmap_array) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct sector_ptr *sector; + const u32 sectorsize = fs_info->sectorsize; + int found_errors; + int faila; + int failb; + int stripe_nr; + int ret = 0; - for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { - struct sector_ptr *sector; + /* + * Now we just use bitmap to mark the horizontal stripes in + * which we have data when doing parity scrub. + */ + if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && + !test_bit(sector_nr, &rbio->dbitmap)) + return 0; - /* - * Now we just use bitmap to mark the horizontal stripes in - * which we have data when doing parity scrub. - */ - if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB && - !test_bit(sectornr, &rbio->dbitmap)) - continue; + found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila, + &failb); + /* + * No errors in the veritical stripe, skip it. Can happen for recovery + * which only part of a stripe failed csum check. + */ + if (!found_errors) + return 0; + if (found_errors > rbio->bioc->max_errors) + return -EIO; + + /* + * Setup our array of pointers with sectors from each stripe + * + * NOTE: store a duplicate array of pointers to preserve the + * pointer order. + */ + for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) { /* - * Setup our array of pointers with sectors from each stripe - * - * NOTE: store a duplicate array of pointers to preserve the - * pointer order + * If we're rebuilding a read, we have to use pages from the + * bio list if possible. */ - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - /* - * If we're rebuilding a read, we have to use - * pages from the bio list - */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && - (stripe == faila || stripe == failb)) { - sector = sector_in_rbio(rbio, stripe, sectornr, 0); - } else { - sector = rbio_stripe_sector(rbio, stripe, sectornr); - } - ASSERT(sector->page); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; - unmap_array[stripe] = pointers[stripe]; + if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || + rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) { + sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0); + } else { + sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } + ASSERT(sector->page); + pointers[stripe_nr] = kmap_local_page(sector->page) + + sector->pgoff; + unmap_array[stripe_nr] = pointers[stripe_nr]; + } - /* All raid6 handling here */ - if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { - /* Single failure, rebuild from parity raid5 style */ - if (failb < 0) { - if (faila == rbio->nr_data) { - /* - * Just the P stripe has failed, without - * a bad data or Q stripe. - * TODO, we should redo the xor here. - */ - err = BLK_STS_IOERR; - goto cleanup; - } + /* All raid6 handling here */ + if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) { + /* Single failure, rebuild from parity raid5 style */ + if (failb < 0) { + if (faila == rbio->nr_data) /* - * a single failure in raid6 is rebuilt - * in the pstripe code below + * Just the P stripe has failed, without + * a bad data or Q stripe. + * We have nothing to do, just skip the + * recovery for this stripe. */ - goto pstripe; - } - - /* make sure our ps and qs are in order */ - if (faila > failb) - swap(faila, failb); - - /* if the q stripe is failed, do a pstripe reconstruction - * from the xors. - * If both the q stripe and the P stripe are failed, we're - * here due to a crc mismatch and we can't give them the - * data they want + goto cleanup; + /* + * a single failure in raid6 is rebuilt + * in the pstripe code below */ - if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { - if (rbio->bioc->raid_map[faila] == - RAID5_P_STRIPE) { - err = BLK_STS_IOERR; - goto cleanup; - } + goto pstripe; + } + + /* + * If the q stripe is failed, do a pstripe reconstruction from + * the xors. + * If both the q stripe and the P stripe are failed, we're + * here due to a crc mismatch and we can't give them the + * data they want. + */ + if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) { + if (rbio->bioc->raid_map[faila] == + RAID5_P_STRIPE) /* - * otherwise we have one bad data stripe and - * a good P stripe. raid5! + * Only P and Q are corrupted. + * We only care about data stripes recovery, + * can skip this vertical stripe. */ - goto pstripe; - } + goto cleanup; + /* + * Otherwise we have one bad data stripe and + * a good P stripe. raid5! + */ + goto pstripe; + } - if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { - raid6_datap_recov(rbio->real_stripes, - sectorsize, faila, pointers); - } else { - raid6_2data_recov(rbio->real_stripes, - sectorsize, faila, failb, - pointers); - } + if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) { + raid6_datap_recov(rbio->real_stripes, sectorsize, + faila, pointers); } else { - void *p; + raid6_2data_recov(rbio->real_stripes, sectorsize, + faila, failb, pointers); + } + } else { + void *p; - /* rebuild from P stripe here (raid5 or raid6) */ - BUG_ON(failb != -1); + /* Rebuild from P stripe here (raid5 or raid6). */ + ASSERT(failb == -1); pstripe: - /* Copy parity block into failed block to start with */ - memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); + /* Copy parity block into failed block to start with */ + memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize); - /* rearrange the pointer array */ - p = pointers[faila]; - for (stripe = faila; stripe < rbio->nr_data - 1; stripe++) - pointers[stripe] = pointers[stripe + 1]; - pointers[rbio->nr_data - 1] = p; + /* Rearrange the pointer array */ + p = pointers[faila]; + for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1; + stripe_nr++) + pointers[stripe_nr] = pointers[stripe_nr + 1]; + pointers[rbio->nr_data - 1] = p; - /* xor in the rest */ - run_xor(pointers, rbio->nr_data - 1, sectorsize); - } - /* if we're doing this rebuild as part of an rmw, go through - * and set all of our private rbio pages in the - * failed stripes as uptodate. This way finish_rmw will - * know they can be trusted. If this was a read reconstruction, - * other endio functions will fiddle the uptodate bits - */ - if (rbio->operation == BTRFS_RBIO_WRITE) { - for (i = 0; i < rbio->stripe_nsectors; i++) { - if (faila != -1) { - sector = rbio_stripe_sector(rbio, faila, i); - sector->uptodate = 1; - } - if (failb != -1) { - sector = rbio_stripe_sector(rbio, failb, i); - sector->uptodate = 1; - } - } - } - for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--) - kunmap_local(unmap_array[stripe]); + /* Xor in the rest */ + run_xor(pointers, rbio->nr_data - 1, sectorsize); + + } + + /* + * No matter if this is a RMW or recovery, we should have all + * failed sectors repaired in the vertical stripe, thus they are now + * uptodate. + * Especially if we determine to cache the rbio, we need to + * have at least all data sectors uptodate. + * + * If possible, also check if the repaired sector matches its data + * checksum. + */ + if (faila >= 0) { + ret = verify_one_sector(rbio, faila, sector_nr); + if (ret < 0) + goto cleanup; + + sector = rbio_stripe_sector(rbio, faila, sector_nr); + sector->uptodate = 1; + } + if (failb >= 0) { + ret = verify_one_sector(rbio, faila, sector_nr); + if (ret < 0) + goto cleanup; + + sector = rbio_stripe_sector(rbio, failb, sector_nr); + sector->uptodate = 1; } - err = BLK_STS_OK; cleanup: - kfree(unmap_array); -cleanup_pointers: - kfree(pointers); + for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--) + kunmap_local(unmap_array[stripe_nr]); + return ret; +} + +static int recover_sectors(struct btrfs_raid_bio *rbio) +{ + void **pointers = NULL; + void **unmap_array = NULL; + int sectornr; + int ret = 0; -cleanup_io: /* - * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a - * valid rbio which is consistent with ondisk content, thus such a - * valid rbio can be cached to avoid further disk reads. + * @pointers array stores the pointer for each sector. + * + * @unmap_array stores copy of pointers that does not get reordered + * during reconstruction so that kunmap_local works. */ + pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + if (!pointers || !unmap_array) { + ret = -ENOMEM; + goto out; + } + if (rbio->operation == BTRFS_RBIO_READ_REBUILD || rbio->operation == BTRFS_RBIO_REBUILD_MISSING) { - /* - * - In case of two failures, where rbio->failb != -1: - * - * Do not cache this rbio since the above read reconstruction - * (raid6_datap_recov() or raid6_2data_recov()) may have - * changed some content of stripes which are not identical to - * on-disk content any more, otherwise, a later write/recover - * may steal stripe_pages from this rbio and end up with - * corruptions or rebuild failures. - * - * - In case of single failure, where rbio->failb == -1: - * - * Cache this rbio iff the above read reconstruction is - * executed without problems. - */ - if (err == BLK_STS_OK && rbio->failb < 0) - cache_rbio_pages(rbio); - else - clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + } - rbio_orig_end_io(rbio, err); - } else if (err == BLK_STS_OK) { - rbio->faila = -1; - rbio->failb = -1; + index_rbio_pages(rbio); - if (rbio->operation == BTRFS_RBIO_WRITE) - finish_rmw(rbio); - else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) - finish_parity_scrub(rbio, 0); - else - BUG(); - } else { - rbio_orig_end_io(rbio, err); + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + ret = recover_vertical(rbio, sectornr, pointers, unmap_array); + if (ret < 0) + break; } -} - -/* - * This is called only for stripes we've read from disk to reconstruct the - * parity. - */ -static void raid_recover_end_io_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - rbio_orig_end_io(rbio, BLK_STS_IOERR); - else - __raid_recover_end_io(rbio); +out: + kfree(pointers); + kfree(unmap_array); + return ret; } -/* - * reads everything we need off the disk to reconstruct - * the parity. endio handlers trigger final reconstruction - * when the IO is done. - * - * This is used both for reads from the higher layers and for - * parity construction required to finish a rmw cycle. - */ -static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) +static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) { - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - int total_sector_nr; struct bio *bio; + int total_sector_nr; + int ret = 0; - bio_list_init(&bio_list); - - ret = alloc_rbio_pages(rbio); - if (ret) - goto cleanup; - - atomic_set(&rbio->error, 0); - + ASSERT(bio_list_size(bio_list) == 0); /* * Read everything that hasn't failed. However this time we will * not trust any cached sector. @@ -2127,64 +1963,139 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) int sectornr = total_sector_nr % rbio->stripe_nsectors; struct sector_ptr *sector; - if (rbio->faila == stripe || rbio->failb == stripe) { - atomic_inc(&rbio->error); - /* Skip the current stripe. */ - ASSERT(sectornr == 0); - total_sector_nr += rbio->stripe_nsectors - 1; + /* + * Skip the range which has error. It can be a range which is + * marked error (for csum mismatch), or it can be a missing + * device. + */ + if (!rbio->bioc->stripes[stripe].dev->bdev || + test_bit(total_sector_nr, rbio->error_bitmap)) { + /* + * Also set the error bit for missing device, which + * may not yet have its error bit set. + */ + set_bit(total_sector_nr, rbio->error_bitmap); continue; } + sector = rbio_stripe_sector(rbio, stripe, sectornr); - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret < 0) - goto cleanup; + goto error; } + return 0; +error: + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * we might have no bios to read just because the pages - * were up to date, or we might have no bios to read because - * the devices were gone. - */ - if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) { - __raid_recover_end_io(rbio); - return 0; - } else { - goto cleanup; - } - } + return -EIO; +} + +static int recover_rbio(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + struct bio *bio; + int ret; /* - * The bioc may be freed once we submit the last bio. Make sure not to - * touch it after that. + * Either we're doing recover for a read failure or degraded write, + * caller should have set error bitmap correctly. */ - atomic_set(&rbio->stripes_pending, bios_to_read); - INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work); - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_bio_end_io; + ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors)); + bio_list_init(&bio_list); - if (trace_raid56_scrub_read_recover_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; + /* For recovery, we need to read all sectors including P/Q. */ + ret = alloc_rbio_pages(rbio); + if (ret < 0) + goto out; - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_read_recover(rbio, bio, &trace_info); - } - submit_bio(bio); - } + index_rbio_pages(rbio); - return 0; + ret = recover_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto out; -cleanup: - if (rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) - rbio_orig_end_io(rbio, BLK_STS_IOERR); + submit_read_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + ret = recover_sectors(rbio); +out: while ((bio = bio_list_pop(&bio_list))) bio_put(bio); - return -EIO; + return ret; +} + +static void recover_rbio_work(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = container_of(work, struct btrfs_raid_bio, work); + + ret = lock_stripe_add(rbio); + if (ret == 0) { + ret = recover_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } +} + +static void recover_rbio_work_locked(struct work_struct *work) +{ + struct btrfs_raid_bio *rbio; + int ret; + + rbio = container_of(work, struct btrfs_raid_bio, work); + + ret = recover_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); +} + +static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num) +{ + bool found = false; + int sector_nr; + + /* + * This is for RAID6 extra recovery tries, thus mirror number should + * be large than 2. + * Mirror 1 means read from data stripes. Mirror 2 means rebuild using + * RAID5 methods. + */ + ASSERT(mirror_num > 2); + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int found_errors; + int faila; + int failb; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + &faila, &failb); + /* This vertical stripe doesn't have errors. */ + if (!found_errors) + continue; + + /* + * If we found errors, there should be only one error marked + * by previous set_rbio_range_error(). + */ + ASSERT(found_errors == 1); + found = true; + + /* Now select another stripe to mark as error. */ + failb = rbio->real_stripes - (mirror_num - 1); + if (failb <= faila) + failb--; + + /* Set the extra bit in error bitmap. */ + if (failb >= 0) + set_bit(failb * rbio->stripe_nsectors + sector_nr, + rbio->error_bitmap); + } + + /* We should found at least one vertical stripe with error.*/ + ASSERT(found); } /* @@ -2202,68 +2113,284 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc, rbio = alloc_rbio(fs_info, bioc); if (IS_ERR(rbio)) { bio->bi_status = errno_to_blk_status(PTR_ERR(rbio)); - goto out_end_bio; + bio_endio(bio); + return; } rbio->operation = BTRFS_RBIO_READ_REBUILD; rbio_add_bio(rbio, bio); - rbio->faila = find_logical_bio_stripe(rbio, bio); - if (rbio->faila == -1) { - btrfs_warn(fs_info, -"%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)", - __func__, bio->bi_iter.bi_sector << 9, - (u64)bio->bi_iter.bi_size, bioc->map_type); - __free_raid_bio(rbio); - bio->bi_status = BLK_STS_IOERR; - goto out_end_bio; - } + set_rbio_range_error(rbio, bio); /* * Loop retry: * for 'mirror == 2', reconstruct from all other stripes. * for 'mirror_num > 2', select a stripe to fail on every retry. */ - if (mirror_num > 2) { + if (mirror_num > 2) + set_rbio_raid6_extra_error(rbio, mirror_num); + + start_async_work(rbio, recover_rbio_work); +} + +static void fill_data_csums(struct btrfs_raid_bio *rbio) +{ + struct btrfs_fs_info *fs_info = rbio->bioc->fs_info; + struct btrfs_root *csum_root = btrfs_csum_root(fs_info, + rbio->bioc->raid_map[0]); + const u64 start = rbio->bioc->raid_map[0]; + const u32 len = (rbio->nr_data * rbio->stripe_nsectors) << + fs_info->sectorsize_bits; + int ret; + + /* The rbio should not have its csum buffer initialized. */ + ASSERT(!rbio->csum_buf && !rbio->csum_bitmap); + + /* + * Skip the csum search if: + * + * - The rbio doesn't belong to data block groups + * Then we are doing IO for tree blocks, no need to search csums. + * + * - The rbio belongs to mixed block groups + * This is to avoid deadlock, as we're already holding the full + * stripe lock, if we trigger a metadata read, and it needs to do + * raid56 recovery, we will deadlock. + */ + if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) || + rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA) + return; + + rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors * + fs_info->csum_size, GFP_NOFS); + rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors, + GFP_NOFS); + if (!rbio->csum_buf || !rbio->csum_bitmap) { + ret = -ENOMEM; + goto error; + } + + ret = btrfs_lookup_csums_bitmap(csum_root, start, start + len - 1, + rbio->csum_buf, rbio->csum_bitmap); + if (ret < 0) + goto error; + if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits)) + goto no_csum; + return; + +error: + /* + * We failed to allocate memory or grab the csum, but it's not fatal, + * we can still continue. But better to warn users that RMW is no + * longer safe for this particular sub-stripe write. + */ + btrfs_warn_rl(fs_info, +"sub-stripe write for full stripe %llu is not safe, failed to get csum: %d", + rbio->bioc->raid_map[0], ret); +no_csum: + kfree(rbio->csum_buf); + bitmap_free(rbio->csum_bitmap); + rbio->csum_buf = NULL; + rbio->csum_bitmap = NULL; +} + +static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + struct bio *bio; + int ret; + + bio_list_init(&bio_list); + + /* + * Fill the data csums we need for data verification. We need to fill + * the csum_bitmap/csum_buf first, as our endio function will try to + * verify the data sectors. + */ + fill_data_csums(rbio); + + ret = rmw_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto out; + + submit_read_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + /* + * We may or may not have any corrupted sectors (including missing dev + * and csum mismatch), just let recover_sectors() to handle them all. + */ + ret = recover_sectors(rbio); + return ret; +out: + while ((bio = bio_list_pop(&bio_list))) + bio_put(bio); + + return ret; +} + +static void raid_wait_write_end_io(struct bio *bio) +{ + struct btrfs_raid_bio *rbio = bio->bi_private; + blk_status_t err = bio->bi_status; + + if (err) + rbio_update_error_bitmap(rbio, bio); + bio_put(bio); + if (atomic_dec_and_test(&rbio->stripes_pending)) + wake_up(&rbio->io_wait); +} + +static void submit_write_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) +{ + struct bio *bio; + + atomic_set(&rbio->stripes_pending, bio_list_size(bio_list)); + while ((bio = bio_list_pop(bio_list))) { + bio->bi_end_io = raid_wait_write_end_io; + + if (trace_raid56_write_stripe_enabled()) { + struct raid56_bio_trace_info trace_info = { 0 }; + + bio_get_trace_info(rbio, bio, &trace_info); + trace_raid56_write_stripe(rbio, bio, &trace_info); + } + submit_bio(bio); + } +} + +/* + * To determine if we need to read any sector from the disk. + * Should only be utilized in RMW path, to skip cached rbio. + */ +static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) +{ + int i; + + for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) { + struct sector_ptr *sector = &rbio->stripe_sectors[i]; + /* - * 'mirror == 3' is to fail the p stripe and - * reconstruct from the q stripe. 'mirror > 3' is to - * fail a data stripe and reconstruct from p+q stripe. + * We have a sector which doesn't have page nor uptodate, + * thus this rbio can not be cached one, as cached one must + * have all its data sectors present and uptodate. */ - rbio->failb = rbio->real_stripes - (mirror_num - 1); - ASSERT(rbio->failb > 0); - if (rbio->failb <= rbio->faila) - rbio->failb--; + if (!sector->page || !sector->uptodate) + return true; } + return false; +} - if (lock_stripe_add(rbio)) - return; +static int rmw_rbio(struct btrfs_raid_bio *rbio) +{ + struct bio_list bio_list; + int sectornr; + int ret = 0; /* - * This adds our rbio to the list of rbios that will be handled after - * the current lock owner is done. + * Allocate the pages for parity first, as P/Q pages will always be + * needed for both full-stripe and sub-stripe writes. */ - __raid56_parity_recover(rbio); - return; + ret = alloc_rbio_parity_pages(rbio); + if (ret < 0) + return ret; -out_end_bio: - bio_endio(bio); + /* + * Either full stripe write, or we have every data sector already + * cached, can go to write path immediately. + */ + if (rbio_is_full(rbio) || !need_read_stripe_sectors(rbio)) + goto write; + + /* + * Now we're doing sub-stripe write, also need all data stripes to do + * the full RMW. + */ + ret = alloc_rbio_data_pages(rbio); + if (ret < 0) + return ret; + + index_rbio_pages(rbio); + + ret = rmw_read_wait_recover(rbio); + if (ret < 0) + return ret; + +write: + /* + * At this stage we're not allowed to add any new bios to the + * bio list any more, anyone else that wants to change this stripe + * needs to do their own rmw. + */ + spin_lock_irq(&rbio->bio_list_lock); + set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags); + spin_unlock_irq(&rbio->bio_list_lock); + + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + + index_rbio_pages(rbio); + + /* + * We don't cache full rbios because we're assuming + * the higher layers are unlikely to use this area of + * the disk again soon. If they do use it again, + * hopefully they will send another full bio. + */ + if (!rbio_is_full(rbio)) + cache_rbio_pages(rbio); + else + clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); + + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) + generate_pq_vertical(rbio, sectornr); + + bio_list_init(&bio_list); + ret = rmw_assemble_write_bios(rbio, &bio_list); + if (ret < 0) + return ret; + + /* We should have at least one bio assembled. */ + ASSERT(bio_list_size(&bio_list)); + submit_write_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + /* We may have more errors than our tolerance during the read. */ + for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) { + int found_errors; + + found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + break; + } + } + return ret; } -static void rmw_work(struct work_struct *work) +static void rmw_rbio_work(struct work_struct *work) { struct btrfs_raid_bio *rbio; + int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - raid56_rmw_stripe(rbio); + + ret = lock_stripe_add(rbio); + if (ret == 0) { + ret = rmw_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); + } } -static void read_rebuild_work(struct work_struct *work) +static void rmw_rbio_work_locked(struct work_struct *work) { struct btrfs_raid_bio *rbio; + int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - __raid56_parity_recover(rbio); + + ret = rmw_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } /* @@ -2358,8 +2485,7 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) return 0; } -static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, - int need_check) +static int finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { struct btrfs_io_context *bioc = rbio->bioc; const u32 sectorsize = bioc->fs_info->sectorsize; @@ -2402,7 +2528,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, p_sector.page = alloc_page(GFP_NOFS); if (!p_sector.page) - goto cleanup; + return -ENOMEM; p_sector.pgoff = 0; p_sector.uptodate = 1; @@ -2412,14 +2538,14 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, if (!q_sector.page) { __free_page(p_sector.page); p_sector.page = NULL; - goto cleanup; + return -ENOMEM; } q_sector.pgoff = 0; q_sector.uptodate = 1; pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); } - atomic_set(&rbio->error, 0); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ pointers[nr_data] = kmap_local_page(p_sector.page); @@ -2499,33 +2625,13 @@ writeback: } submit_write: - nr_data = bio_list_size(&bio_list); - if (!nr_data) { - /* Every parity is right */ - rbio_orig_end_io(rbio, BLK_STS_OK); - return; - } - - atomic_set(&rbio->stripes_pending, nr_data); - - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid_write_end_io; - - if (trace_raid56_scrub_write_stripe_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; - - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_write_stripe(rbio, bio, &trace_info); - } - submit_bio(bio); - } - return; + submit_write_bios(rbio, &bio_list); + return 0; cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - while ((bio = bio_list_pop(&bio_list))) bio_put(bio); + return ret; } static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) @@ -2535,102 +2641,99 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe) return 0; } -/* - * While we're doing the parity check and repair, we could have errors - * in reading pages off the disk. This checks for errors and if we're - * not able to read the page it'll trigger parity reconstruction. The - * parity scrub will be finished after we've reconstructed the failed - * stripes - */ -static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio) +static int recover_scrub_rbio(struct btrfs_raid_bio *rbio) { - if (atomic_read(&rbio->error) > rbio->bioc->max_errors) - goto cleanup; + void **pointers = NULL; + void **unmap_array = NULL; + int sector_nr; + int ret; + + /* + * @pointers array stores the pointer for each sector. + * + * @unmap_array stores copy of pointers that does not get reordered + * during reconstruction so that kunmap_local works. + */ + pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + if (!pointers || !unmap_array) { + ret = -ENOMEM; + goto out; + } - if (rbio->faila >= 0 || rbio->failb >= 0) { + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { int dfail = 0, failp = -1; + int faila; + int failb; + int found_errors; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, + &faila, &failb); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + goto out; + } + if (found_errors == 0) + continue; - if (is_data_stripe(rbio, rbio->faila)) - dfail++; - else if (is_parity_stripe(rbio->faila)) - failp = rbio->faila; + /* We should have at least one error here. */ + ASSERT(faila >= 0 || failb >= 0); - if (is_data_stripe(rbio, rbio->failb)) + if (is_data_stripe(rbio, faila)) dfail++; - else if (is_parity_stripe(rbio->failb)) - failp = rbio->failb; + else if (is_parity_stripe(faila)) + failp = faila; + if (is_data_stripe(rbio, failb)) + dfail++; + else if (is_parity_stripe(failb)) + failp = failb; /* - * Because we can not use a scrubbing parity to repair - * the data, so the capability of the repair is declined. - * (In the case of RAID5, we can not repair anything) + * Because we can not use a scrubbing parity to repair the + * data, so the capability of the repair is declined. (In the + * case of RAID5, we can not repair anything.) */ - if (dfail > rbio->bioc->max_errors - 1) - goto cleanup; - + if (dfail > rbio->bioc->max_errors - 1) { + ret = -EIO; + goto out; + } /* - * If all data is good, only parity is correctly, just - * repair the parity. + * If all data is good, only parity is correctly, just repair + * the parity, no need to recover data stripes. */ - if (dfail == 0) { - finish_parity_scrub(rbio, 0); - return; - } + if (dfail == 0) + continue; /* * Here means we got one corrupted data stripe and one - * corrupted parity on RAID6, if the corrupted parity - * is scrubbing parity, luckily, use the other one to repair - * the data, or we can not repair the data stripe. + * corrupted parity on RAID6, if the corrupted parity is + * scrubbing parity, luckily, use the other one to repair the + * data, or we can not repair the data stripe. */ - if (failp != rbio->scrubp) - goto cleanup; + if (failp != rbio->scrubp) { + ret = -EIO; + goto out; + } - __raid_recover_end_io(rbio); - } else { - finish_parity_scrub(rbio, 1); + ret = recover_vertical(rbio, sector_nr, pointers, unmap_array); + if (ret < 0) + goto out; } - return; - -cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); -} - -/* - * end io for the read phase of the rmw cycle. All the bios here are physical - * stripe bios we've read from the disk so we can recalculate the parity of the - * stripe. - * - * This will usually kick off finish_rmw once all the bios are read in, but it - * may trigger parity reconstruction if we had any errors along the way - */ -static void raid56_parity_scrub_end_io_work(struct work_struct *work) -{ - struct btrfs_raid_bio *rbio = - container_of(work, struct btrfs_raid_bio, end_io_work); - - /* - * This will normally call finish_rmw to start our write, but if there - * are any failed stripes we'll reconstruct from parity first - */ - validate_rbio_for_parity_scrub(rbio); +out: + kfree(pointers); + kfree(unmap_array); + return ret; } -static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) +static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio, + struct bio_list *bio_list) { - int bios_to_read = 0; - struct bio_list bio_list; - int ret; - int total_sector_nr; struct bio *bio; + int total_sector_nr; + int ret = 0; - bio_list_init(&bio_list); - - ret = alloc_rbio_essential_pages(rbio); - if (ret) - goto cleanup; + ASSERT(bio_list_size(bio_list) == 0); - atomic_set(&rbio->error, 0); /* Build a list of bios to read all the missing parts. */ for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors; total_sector_nr++) { @@ -2659,67 +2762,84 @@ static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio) if (sector->uptodate) continue; - ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe, + ret = rbio_add_io_sector(rbio, bio_list, sector, stripe, sectornr, REQ_OP_READ); if (ret) - goto cleanup; + goto error; } + return 0; +error: + while ((bio = bio_list_pop(bio_list))) + bio_put(bio); + return ret; +} - bios_to_read = bio_list_size(&bio_list); - if (!bios_to_read) { - /* - * this can happen if others have merged with - * us, it means there is nothing left to read. - * But if there are missing devices it may not be - * safe to do the full stripe write yet. - */ - goto finish; - } +static int scrub_rbio(struct btrfs_raid_bio *rbio) +{ + bool need_check = false; + struct bio_list bio_list; + int sector_nr; + int ret; + struct bio *bio; - /* - * The bioc may be freed once we submit the last bio. Make sure not to - * touch it after that. - */ - atomic_set(&rbio->stripes_pending, bios_to_read); - INIT_WORK(&rbio->end_io_work, raid56_parity_scrub_end_io_work); - while ((bio = bio_list_pop(&bio_list))) { - bio->bi_end_io = raid56_bio_end_io; + bio_list_init(&bio_list); - if (trace_raid56_scrub_read_enabled()) { - struct raid56_bio_trace_info trace_info = { 0 }; + ret = alloc_rbio_essential_pages(rbio); + if (ret) + goto cleanup; - bio_get_trace_info(rbio, bio, &trace_info); - trace_raid56_scrub_read(rbio, bio, &trace_info); + bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); + + ret = scrub_assemble_read_bios(rbio, &bio_list); + if (ret < 0) + goto cleanup; + + submit_read_bios(rbio, &bio_list); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + + /* We may have some failures, recover the failed sectors first. */ + ret = recover_scrub_rbio(rbio); + if (ret < 0) + goto cleanup; + + /* + * We have every sector properly prepared. Can finish the scrub + * and writeback the good content. + */ + ret = finish_parity_scrub(rbio, need_check); + wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0); + for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) { + int found_errors; + + found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL); + if (found_errors > rbio->bioc->max_errors) { + ret = -EIO; + break; } - submit_bio(bio); } - /* the actual write will happen once the reads are done */ - return; + return ret; cleanup: - rbio_orig_end_io(rbio, BLK_STS_IOERR); - while ((bio = bio_list_pop(&bio_list))) bio_put(bio); - return; - -finish: - validate_rbio_for_parity_scrub(rbio); + return ret; } -static void scrub_parity_work(struct work_struct *work) +static void scrub_rbio_work_locked(struct work_struct *work) { struct btrfs_raid_bio *rbio; + int ret; rbio = container_of(work, struct btrfs_raid_bio, work); - raid56_parity_scrub_stripe(rbio); + ret = scrub_rbio(rbio); + rbio_orig_end_io(rbio, errno_to_blk_status(ret)); } void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio) { if (!lock_stripe_add(rbio)) - start_async_work(rbio, scrub_parity_work); + start_async_work(rbio, scrub_rbio_work_locked); } /* The following code is used for dev replace of a missing RAID 5/6 device. */ @@ -2742,20 +2862,12 @@ raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc) */ ASSERT(!bio->bi_iter.bi_size); - rbio->faila = find_logical_bio_stripe(rbio, bio); - if (rbio->faila == -1) { - btrfs_warn_rl(fs_info, - "can not determine the failed stripe number for full stripe %llu", - bioc->raid_map[0]); - __free_raid_bio(rbio); - return NULL; - } + set_rbio_range_error(rbio, bio); return rbio; } void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio) { - if (!lock_stripe_add(rbio)) - start_async_work(rbio, read_rebuild_work); + start_async_work(rbio, recover_rbio_work); } |