diff options
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 248 |
1 files changed, 198 insertions, 50 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 36c13e4be9c9..2ce23b01dbb2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -281,13 +281,13 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, atomic_dec(&conf->r5c_cached_partial_stripes); list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); r5c_check_cached_full_stripe(conf); - } else { - /* partial stripe */ - if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE, - &sh->state)) - atomic_inc(&conf->r5c_cached_partial_stripes); + } else + /* + * STRIPE_R5C_PARTIAL_STRIPE is set in + * r5c_try_caching_write(). No need to + * set it again. + */ list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); - } } } } @@ -353,17 +353,15 @@ static void release_inactive_stripe_list(struct r5conf *conf, static int release_stripe_list(struct r5conf *conf, struct list_head *temp_inactive_list) { - struct stripe_head *sh; + struct stripe_head *sh, *t; int count = 0; struct llist_node *head; head = llist_del_all(&conf->released_stripes); head = llist_reverse_order(head); - while (head) { + llist_for_each_entry_safe(sh, t, head, release_list) { int hash; - sh = llist_entry(head, struct stripe_head, release_list); - head = llist_next(head); /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ smp_mb(); clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); @@ -556,7 +554,7 @@ static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, * of the two sections, and some non-in_sync devices may * be insync in the section most affected by failed devices. */ -static int calc_degraded(struct r5conf *conf) +int raid5_calc_degraded(struct r5conf *conf) { int degraded, degraded2; int i; @@ -619,7 +617,7 @@ static int has_failed(struct r5conf *conf) if (conf->mddev->reshape_position == MaxSector) return conf->mddev->degraded > conf->max_degraded; - degraded = calc_degraded(conf); + degraded = raid5_calc_degraded(conf); if (degraded > conf->max_degraded) return 1; return 0; @@ -863,6 +861,43 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) return 1; } +static void flush_deferred_bios(struct r5conf *conf) +{ + struct bio_list tmp; + struct bio *bio; + + if (!conf->batch_bio_dispatch || !conf->group_cnt) + return; + + bio_list_init(&tmp); + spin_lock(&conf->pending_bios_lock); + bio_list_merge(&tmp, &conf->pending_bios); + bio_list_init(&conf->pending_bios); + spin_unlock(&conf->pending_bios_lock); + + while ((bio = bio_list_pop(&tmp))) + generic_make_request(bio); +} + +static void defer_bio_issue(struct r5conf *conf, struct bio *bio) +{ + /* + * change group_cnt will drain all bios, so this is safe + * + * A read generally means a read-modify-write, which usually means a + * randwrite, so we don't delay it + */ + if (!conf->batch_bio_dispatch || !conf->group_cnt || + bio_op(bio) == REQ_OP_READ) { + generic_make_request(bio); + return; + } + spin_lock(&conf->pending_bios_lock); + bio_list_add(&conf->pending_bios, bio); + spin_unlock(&conf->pending_bios_lock); + md_wakeup_thread(conf->mddev->thread); +} + static void raid5_end_read_request(struct bio *bi); static void @@ -1015,7 +1050,17 @@ again: if (test_bit(R5_SkipCopy, &sh->dev[i].flags)) WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); - sh->dev[i].vec.bv_page = sh->dev[i].page; + + if (!op_is_write(op) && + test_bit(R5_InJournal, &sh->dev[i].flags)) + /* + * issuing read for a page in journal, this + * must be preparing for prexor in rmw; read + * the data into orig_page + */ + sh->dev[i].vec.bv_page = sh->dev[i].orig_page; + else + sh->dev[i].vec.bv_page = sh->dev[i].page; bi->bi_vcnt = 1; bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; @@ -1033,7 +1078,7 @@ again: trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), bi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); - generic_make_request(bi); + defer_bio_issue(conf, bi); } if (rrdev) { if (s->syncing || s->expanding || s->expanded @@ -1078,7 +1123,7 @@ again: trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), rbi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); - generic_make_request(rbi); + defer_bio_issue(conf, rbi); } if (!rdev && !rrdev) { if (op_is_write(op)) @@ -2380,6 +2425,13 @@ static void raid5_end_read_request(struct bio * bi) } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) clear_bit(R5_ReadNoMerge, &sh->dev[i].flags); + if (test_bit(R5_InJournal, &sh->dev[i].flags)) + /* + * end read for a page in journal, this + * must be preparing for prexor in rmw + */ + set_bit(R5_OrigPageUPTDODATE, &sh->dev[i].flags); + if (atomic_read(&rdev->read_errors)) atomic_set(&rdev->read_errors, 0); } else { @@ -2538,7 +2590,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) spin_lock_irqsave(&conf->device_lock, flags); clear_bit(In_sync, &rdev->flags); - mddev->degraded = calc_degraded(conf); + mddev->degraded = raid5_calc_degraded(conf); spin_unlock_irqrestore(&conf->device_lock, flags); set_bit(MD_RECOVERY_INTR, &mddev->recovery); @@ -2552,6 +2604,7 @@ static void raid5_error(struct mddev *mddev, struct md_rdev *rdev) bdevname(rdev->bdev, b), mdname(mddev), conf->raid_disks - mddev->degraded); + r5c_update_on_rdev_error(mddev); } /* @@ -2880,6 +2933,54 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) return r_sector; } +/* + * There are cases where we want handle_stripe_dirtying() and + * schedule_reconstruction() to delay towrite to some dev of a stripe. + * + * This function checks whether we want to delay the towrite. Specifically, + * we delay the towrite when: + * + * 1. degraded stripe has a non-overwrite to the missing dev, AND this + * stripe has data in journal (for other devices). + * + * In this case, when reading data for the non-overwrite dev, it is + * necessary to handle complex rmw of write back cache (prexor with + * orig_page, and xor with page). To keep read path simple, we would + * like to flush data in journal to RAID disks first, so complex rmw + * is handled in the write patch (handle_stripe_dirtying). + * + * 2. when journal space is critical (R5C_LOG_CRITICAL=1) + * + * It is important to be able to flush all stripes in raid5-cache. + * Therefore, we need reserve some space on the journal device for + * these flushes. If flush operation includes pending writes to the + * stripe, we need to reserve (conf->raid_disk + 1) pages per stripe + * for the flush out. If we exclude these pending writes from flush + * operation, we only need (conf->max_degraded + 1) pages per stripe. + * Therefore, excluding pending writes in these cases enables more + * efficient use of the journal device. + * + * Note: To make sure the stripe makes progress, we only delay + * towrite for stripes with data already in journal (injournal > 0). + * When LOG_CRITICAL, stripes with injournal == 0 will be sent to + * no_space_stripes list. + * + */ +static inline bool delay_towrite(struct r5conf *conf, + struct r5dev *dev, + struct stripe_head_state *s) +{ + /* case 1 above */ + if (!test_bit(R5_OVERWRITE, &dev->flags) && + !test_bit(R5_Insync, &dev->flags) && s->injournal) + return true; + /* case 2 above */ + if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && + s->injournal > 0) + return true; + return false; +} + static void schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, int rcw, int expand) @@ -2900,7 +3001,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->towrite) { + if (dev->towrite && !delay_towrite(conf, dev, s)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantdrain, &dev->flags); if (!expand) @@ -3295,13 +3396,6 @@ static int want_replace(struct stripe_head *sh, int disk_idx) return rv; } -/* fetch_block - checks the given member device to see if its data needs - * to be read or computed to satisfy a request. - * - * Returns 1 when no more member devices need to be checked, otherwise returns - * 0 to tell the loop in handle_stripe_fill to continue - */ - static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, int disk_idx, int disks) { @@ -3392,6 +3486,12 @@ static int need_this_block(struct stripe_head *sh, struct stripe_head_state *s, return 0; } +/* fetch_block - checks the given member device to see if its data needs + * to be read or computed to satisfy a request. + * + * Returns 1 when no more member devices need to be checked, otherwise returns + * 0 to tell the loop in handle_stripe_fill to continue + */ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, int disk_idx, int disks) { @@ -3478,10 +3578,26 @@ static void handle_stripe_fill(struct stripe_head *sh, * midst of changing due to a write */ if (!test_bit(STRIPE_COMPUTE_RUN, &sh->state) && !sh->check_state && - !sh->reconstruct_state) + !sh->reconstruct_state) { + + /* + * For degraded stripe with data in journal, do not handle + * read requests yet, instead, flush the stripe to raid + * disks first, this avoids handling complex rmw of write + * back cache (prexor with orig_page, and then xor with + * page) in the read path + */ + if (s->injournal && s->failed) { + if (test_bit(STRIPE_R5C_CACHING, &sh->state)) + r5c_make_stripe_write_out(sh); + goto out; + } + for (i = disks; i--; ) if (fetch_block(sh, s, i, disks)) break; + } +out: set_bit(STRIPE_HANDLE, &sh->state); } @@ -3594,6 +3710,21 @@ unhash: break_stripe_batch_list(head_sh, STRIPE_EXPAND_SYNC_FLAGS); } +/* + * For RMW in write back cache, we need extra page in prexor to store the + * old data. This page is stored in dev->orig_page. + * + * This function checks whether we have data for prexor. The exact logic + * is: + * R5_UPTODATE && (!R5_InJournal || R5_OrigPageUPTDODATE) + */ +static inline bool uptodate_for_rmw(struct r5dev *dev) +{ + return (test_bit(R5_UPTODATE, &dev->flags)) && + (!test_bit(R5_InJournal, &dev->flags) || + test_bit(R5_OrigPageUPTDODATE, &dev->flags)); +} + static int handle_stripe_dirtying(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s, @@ -3622,12 +3753,11 @@ static int handle_stripe_dirtying(struct r5conf *conf, } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || i == sh->pd_idx || i == sh->qd_idx || + if (((dev->towrite && !delay_towrite(conf, dev, s)) || + i == sh->pd_idx || i == sh->qd_idx || test_bit(R5_InJournal, &dev->flags)) && !test_bit(R5_LOCKED, &dev->flags) && - !((test_bit(R5_UPTODATE, &dev->flags) && - (!test_bit(R5_InJournal, &dev->flags) || - dev->page != dev->orig_page)) || + !(uptodate_for_rmw(dev) || test_bit(R5_Wantcompute, &dev->flags))) { if (test_bit(R5_Insync, &dev->flags)) rmw++; @@ -3639,7 +3769,6 @@ static int handle_stripe_dirtying(struct r5conf *conf, i != sh->pd_idx && i != sh->qd_idx && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_InJournal, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { if (test_bit(R5_Insync, &dev->flags)) rcw++; @@ -3648,8 +3777,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, } } - pr_debug("for sector %llu, rmw=%d rcw=%d\n", - (unsigned long long)sh->sector, rmw, rcw); + pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", + (unsigned long long)sh->sector, sh->state, rmw, rcw); set_bit(STRIPE_HANDLE, &sh->state); if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ @@ -3689,13 +3818,11 @@ static int handle_stripe_dirtying(struct r5conf *conf, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if ((dev->towrite || + if (((dev->towrite && !delay_towrite(conf, dev, s)) || i == sh->pd_idx || i == sh->qd_idx || test_bit(R5_InJournal, &dev->flags)) && !test_bit(R5_LOCKED, &dev->flags) && - !((test_bit(R5_UPTODATE, &dev->flags) && - (!test_bit(R5_InJournal, &dev->flags) || - dev->page != dev->orig_page)) || + !(uptodate_for_rmw(dev) || test_bit(R5_Wantcompute, &dev->flags)) && test_bit(R5_Insync, &dev->flags)) { if (test_bit(STRIPE_PREREAD_ACTIVE, @@ -3722,7 +3849,6 @@ static int handle_stripe_dirtying(struct r5conf *conf, i != sh->pd_idx && i != sh->qd_idx && !test_bit(R5_LOCKED, &dev->flags) && !(test_bit(R5_UPTODATE, &dev->flags) || - test_bit(R5_InJournal, &dev->flags) || test_bit(R5_Wantcompute, &dev->flags))) { rcw++; if (test_bit(R5_Insync, &dev->flags) && @@ -4928,9 +5054,9 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) return 0; } /* - * use bio_clone_mddev to make a copy of the bio + * use bio_clone_fast to make a copy of the bio */ - align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); + align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set); if (!align_bi) return 0; /* @@ -4958,6 +5084,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) rdev->recovery_offset >= end_sector))) rdev = NULL; } + + if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { + rcu_read_unlock(); + bio_put(align_bi); + return 0; + } + if (rdev) { sector_t first_bad; int bad_sectors; @@ -5314,7 +5447,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) * data on failed drives. */ if (rw == READ && mddev->degraded == 0 && - !r5c_is_writeback(conf->log) && mddev->reshape_position == MaxSector) { bi = chunk_aligned_read(mddev, bi); if (!bi) @@ -6059,6 +6191,8 @@ static void raid5d(struct md_thread *thread) mutex_unlock(&conf->cache_size_mutex); } + flush_deferred_bios(conf); + r5l_flush_stripe_to_raid(conf->log); async_tx_issue_pending_all(); @@ -6264,10 +6398,10 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) mddev_suspend(mddev); conf->skip_copy = new; if (new) - mddev->queue->backing_dev_info.capabilities |= + mddev->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; else - mddev->queue->backing_dev_info.capabilities &= + mddev->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; mddev_resume(mddev); } @@ -6644,6 +6778,18 @@ static struct r5conf *setup_conf(struct mddev *mddev) atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); atomic_set(&conf->active_aligned_reads, 0); + bio_list_init(&conf->pending_bios); + spin_lock_init(&conf->pending_bios_lock); + conf->batch_bio_dispatch = true; + rdev_for_each(rdev, mddev) { + if (test_bit(Journal, &rdev->flags)) + continue; + if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { + conf->batch_bio_dispatch = false; + break; + } + } + conf->bypass_threshold = BYPASS_THRESHOLD; conf->recovery_disabled = mddev->recovery_disabled - 1; @@ -6690,6 +6836,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->r5c_full_stripe_list); atomic_set(&conf->r5c_cached_partial_stripes, 0); INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); + atomic_set(&conf->r5c_flushing_full_stripes, 0); + atomic_set(&conf->r5c_flushing_partial_stripes, 0); conf->level = mddev->new_level; conf->chunk_sectors = mddev->new_chunk_sectors; @@ -7025,7 +7173,7 @@ static int raid5_run(struct mddev *mddev) /* * 0 for a fully functional array, 1 or 2 for a degraded array. */ - mddev->degraded = calc_degraded(conf); + mddev->degraded = raid5_calc_degraded(conf); if (has_failed(conf)) { pr_crit("md/raid:%s: not enough operational devices (%d/%d failed)\n", @@ -7086,8 +7234,8 @@ static int raid5_run(struct mddev *mddev) int data_disks = conf->previous_raid_disks - conf->max_degraded; int stripe = data_disks * ((mddev->chunk_sectors << 9) / PAGE_SIZE); - if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) + mddev->queue->backing_dev_info->ra_pages = 2 * stripe; chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); @@ -7272,7 +7420,7 @@ static int raid5_spare_active(struct mddev *mddev) } } spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded = calc_degraded(conf); + mddev->degraded = raid5_calc_degraded(conf); spin_unlock_irqrestore(&conf->device_lock, flags); print_raid5_conf(conf); return count; @@ -7632,7 +7780,7 @@ static int raid5_start_reshape(struct mddev *mddev) * pre and post number of devices. */ spin_lock_irqsave(&conf->device_lock, flags); - mddev->degraded = calc_degraded(conf); + mddev->degraded = raid5_calc_degraded(conf); spin_unlock_irqrestore(&conf->device_lock, flags); } mddev->raid_disks = conf->raid_disks; @@ -7696,8 +7844,8 @@ static void end_reshape(struct r5conf *conf) int data_disks = conf->raid_disks - conf->max_degraded; int stripe = data_disks * ((conf->chunk_sectors << 9) / PAGE_SIZE); - if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) + conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; } } } @@ -7720,7 +7868,7 @@ static void raid5_finish_reshape(struct mddev *mddev) } else { int d; spin_lock_irq(&conf->device_lock); - mddev->degraded = calc_degraded(conf); + mddev->degraded = raid5_calc_degraded(conf); spin_unlock_irq(&conf->device_lock); for (d = conf->raid_disks ; d < conf->raid_disks - mddev->delta_disks; |