diff options
Diffstat (limited to 'drivers/md/raid5.c')
-rw-r--r-- | drivers/md/raid5.c | 368 |
1 files changed, 348 insertions, 20 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 78ea44336e75..7f0e17a27aeb 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -53,6 +53,7 @@ #include <linux/cpu.h> #include <linux/slab.h> #include <linux/ratelimit.h> +#include <linux/nodemask.h> #include <trace/events/block.h> #include "md.h" @@ -60,6 +61,10 @@ #include "raid0.h" #include "bitmap.h" +#define cpu_to_group(cpu) cpu_to_node(cpu) +#define ANY_GROUP NUMA_NO_NODE + +static struct workqueue_struct *raid5_wq; /* * Stripe cache */ @@ -72,6 +77,7 @@ #define BYPASS_THRESHOLD 1 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) #define HASH_MASK (NR_HASH - 1) +#define MAX_STRIPE_BATCH 8 static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) { @@ -200,6 +206,49 @@ static int stripe_operations_active(struct stripe_head *sh) test_bit(STRIPE_COMPUTE_RUN, &sh->state); } +static void raid5_wakeup_stripe_thread(struct stripe_head *sh) +{ + struct r5conf *conf = sh->raid_conf; + struct r5worker_group *group; + int thread_cnt; + int i, cpu = sh->cpu; + + if (!cpu_online(cpu)) { + cpu = cpumask_any(cpu_online_mask); + sh->cpu = cpu; + } + + if (list_empty(&sh->lru)) { + struct r5worker_group *group; + group = conf->worker_groups + cpu_to_group(cpu); + list_add_tail(&sh->lru, &group->handle_list); + group->stripes_cnt++; + sh->group = group; + } + + if (conf->worker_cnt_per_group == 0) { + md_wakeup_thread(conf->mddev->thread); + return; + } + + group = conf->worker_groups + cpu_to_group(sh->cpu); + + group->workers[0].working = true; + /* at least one worker should run to avoid race */ + queue_work_on(sh->cpu, raid5_wq, &group->workers[0].work); + + thread_cnt = group->stripes_cnt / MAX_STRIPE_BATCH - 1; + /* wakeup more workers */ + for (i = 1; i < conf->worker_cnt_per_group && thread_cnt > 0; i++) { + if (group->workers[i].working == false) { + group->workers[i].working = true; + queue_work_on(sh->cpu, raid5_wq, + &group->workers[i].work); + thread_cnt--; + } + } +} + static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) { BUG_ON(!list_empty(&sh->lru)); @@ -214,7 +263,12 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh) else { clear_bit(STRIPE_DELAYED, &sh->state); clear_bit(STRIPE_BIT_DELAY, &sh->state); - list_add_tail(&sh->lru, &conf->handle_list); + if (conf->worker_cnt_per_group == 0) { + list_add_tail(&sh->lru, &conf->handle_list); + } else { + raid5_wakeup_stripe_thread(sh); + return; + } } md_wakeup_thread(conf->mddev->thread); } else { @@ -239,12 +293,48 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) do_release_stripe(conf, sh); } +/* should hold conf->device_lock already */ +static int release_stripe_list(struct r5conf *conf) +{ + struct stripe_head *sh; + int count = 0; + struct llist_node *head; + + head = llist_del_all(&conf->released_stripes); + head = llist_reverse_order(head); + while (head) { + sh = llist_entry(head, struct stripe_head, release_list); + head = llist_next(head); + /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ + smp_mb(); + clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); + /* + * Don't worry the bit is set here, because if the bit is set + * again, the count is always > 1. This is true for + * STRIPE_ON_UNPLUG_LIST bit too. + */ + __release_stripe(conf, sh); + count++; + } + + return count; +} + static void release_stripe(struct stripe_head *sh) { struct r5conf *conf = sh->raid_conf; unsigned long flags; + bool wakeup; + if (test_and_set_bit(STRIPE_ON_RELEASE_LIST, &sh->state)) + goto slow_path; + wakeup = llist_add(&sh->release_list, &conf->released_stripes); + if (wakeup) + md_wakeup_thread(conf->mddev->thread); + return; +slow_path: local_irq_save(flags); + /* we are ok here if STRIPE_ON_RELEASE_LIST is set or not */ if (atomic_dec_and_lock(&sh->count, &conf->device_lock)) { do_release_stripe(conf, sh); spin_unlock(&conf->device_lock); @@ -359,6 +449,7 @@ static void init_stripe(struct stripe_head *sh, sector_t sector, int previous) raid5_build_block(sh, i, previous); } insert_hash(conf, sh); + sh->cpu = smp_processor_id(); } static struct stripe_head *__find_stripe(struct r5conf *conf, sector_t sector, @@ -491,7 +582,8 @@ get_active_stripe(struct r5conf *conf, sector_t sector, if (atomic_read(&sh->count)) { BUG_ON(!list_empty(&sh->lru) && !test_bit(STRIPE_EXPANDING, &sh->state) - && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state)); + && !test_bit(STRIPE_ON_UNPLUG_LIST, &sh->state) + && !test_bit(STRIPE_ON_RELEASE_LIST, &sh->state)); } else { if (!test_bit(STRIPE_HANDLE, &sh->state)) atomic_inc(&conf->active_stripes); @@ -499,6 +591,10 @@ get_active_stripe(struct r5conf *conf, sector_t sector, !test_bit(STRIPE_EXPANDING, &sh->state)) BUG(); list_del_init(&sh->lru); + if (sh->group) { + sh->group->stripes_cnt--; + sh->group = NULL; + } } } } while (sh == NULL); @@ -668,6 +764,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) bi->bi_io_vec[0].bv_len = STRIPE_SIZE; bi->bi_io_vec[0].bv_offset = 0; bi->bi_size = STRIPE_SIZE; + /* + * If this is discard request, set bi_vcnt 0. We don't + * want to confuse SCSI because SCSI will replace payload + */ + if (rw & REQ_DISCARD) + bi->bi_vcnt = 0; if (rrdev) set_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags); @@ -706,6 +808,12 @@ static void ops_run_io(struct stripe_head *sh, struct stripe_head_state *s) rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; rbi->bi_io_vec[0].bv_offset = 0; rbi->bi_size = STRIPE_SIZE; + /* + * If this is discard request, set bi_vcnt 0. We don't + * want to confuse SCSI because SCSI will replace payload + */ + if (rw & REQ_DISCARD) + rbi->bi_vcnt = 0; if (conf->mddev->gendisk) trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), rbi, disk_devt(conf->mddev->gendisk), @@ -2800,6 +2908,14 @@ static void handle_stripe_clean_event(struct r5conf *conf, } /* now that discard is done we can proceed with any sync */ clear_bit(STRIPE_DISCARD, &sh->state); + /* + * SCSI discard will change some bio fields and the stripe has + * no updated data, so remove it from hash list and the stripe + * will be reinitialized + */ + spin_lock_irq(&conf->device_lock); + remove_hash(sh); + spin_unlock_irq(&conf->device_lock); if (test_bit(STRIPE_SYNC_REQUESTED, &sh->state)) set_bit(STRIPE_HANDLE, &sh->state); @@ -3779,6 +3895,7 @@ static void raid5_activate_delayed(struct r5conf *conf) if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); list_add_tail(&sh->lru, &conf->hold_list); + raid5_wakeup_stripe_thread(sh); } } } @@ -4058,18 +4175,35 @@ static int chunk_aligned_read(struct mddev *mddev, struct bio * raid_bio) * head of the hold_list has changed, i.e. the head was promoted to the * handle_list. */ -static struct stripe_head *__get_priority_stripe(struct r5conf *conf) +static struct stripe_head *__get_priority_stripe(struct r5conf *conf, int group) { - struct stripe_head *sh; + struct stripe_head *sh = NULL, *tmp; + struct list_head *handle_list = NULL; + struct r5worker_group *wg = NULL; + + if (conf->worker_cnt_per_group == 0) { + handle_list = &conf->handle_list; + } else if (group != ANY_GROUP) { + handle_list = &conf->worker_groups[group].handle_list; + wg = &conf->worker_groups[group]; + } else { + int i; + for (i = 0; i < conf->group_cnt; i++) { + handle_list = &conf->worker_groups[i].handle_list; + wg = &conf->worker_groups[i]; + if (!list_empty(handle_list)) + break; + } + } pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n", __func__, - list_empty(&conf->handle_list) ? "empty" : "busy", + list_empty(handle_list) ? "empty" : "busy", list_empty(&conf->hold_list) ? "empty" : "busy", atomic_read(&conf->pending_full_writes), conf->bypass_count); - if (!list_empty(&conf->handle_list)) { - sh = list_entry(conf->handle_list.next, typeof(*sh), lru); + if (!list_empty(handle_list)) { + sh = list_entry(handle_list->next, typeof(*sh), lru); if (list_empty(&conf->hold_list)) conf->bypass_count = 0; @@ -4087,14 +4221,32 @@ static struct stripe_head *__get_priority_stripe(struct r5conf *conf) ((conf->bypass_threshold && conf->bypass_count > conf->bypass_threshold) || atomic_read(&conf->pending_full_writes) == 0)) { - sh = list_entry(conf->hold_list.next, - typeof(*sh), lru); - conf->bypass_count -= conf->bypass_threshold; - if (conf->bypass_count < 0) - conf->bypass_count = 0; - } else + + list_for_each_entry(tmp, &conf->hold_list, lru) { + if (conf->worker_cnt_per_group == 0 || + group == ANY_GROUP || + !cpu_online(tmp->cpu) || + cpu_to_group(tmp->cpu) == group) { + sh = tmp; + break; + } + } + + if (sh) { + conf->bypass_count -= conf->bypass_threshold; + if (conf->bypass_count < 0) + conf->bypass_count = 0; + } + wg = NULL; + } + + if (!sh) return NULL; + if (wg) { + wg->stripes_cnt--; + sh->group = NULL; + } list_del_init(&sh->lru); atomic_inc(&sh->count); BUG_ON(atomic_read(&sh->count) != 1); @@ -4127,6 +4279,10 @@ static void raid5_unplug(struct blk_plug_cb *blk_cb, bool from_schedule) */ smp_mb__before_clear_bit(); clear_bit(STRIPE_ON_UNPLUG_LIST, &sh->state); + /* + * STRIPE_ON_RELEASE_LIST could be set here. In that + * case, the count is always > 1 here + */ __release_stripe(conf, sh); cnt++; } @@ -4286,8 +4442,10 @@ static void make_request(struct mddev *mddev, struct bio * bi) for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { DEFINE_WAIT(w); int previous; + int seq; retry: + seq = read_seqcount_begin(&conf->gen_lock); previous = 0; prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); if (unlikely(conf->reshape_progress != MaxSector)) { @@ -4320,7 +4478,7 @@ static void make_request(struct mddev *mddev, struct bio * bi) previous, &dd_idx, NULL); pr_debug("raid456: make_request, sector %llu logical %llu\n", - (unsigned long long)new_sector, + (unsigned long long)new_sector, (unsigned long long)logical_sector); sh = get_active_stripe(conf, new_sector, previous, @@ -4349,6 +4507,13 @@ static void make_request(struct mddev *mddev, struct bio * bi) goto retry; } } + if (read_seqcount_retry(&conf->gen_lock, seq)) { + /* Might have got the wrong stripe_head + * by accident + */ + release_stripe(sh); + goto retry; + } if (rw == WRITE && logical_sector >= mddev->suspend_lo && @@ -4788,14 +4953,14 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio) return handled; } -#define MAX_STRIPE_BATCH 8 -static int handle_active_stripes(struct r5conf *conf) +static int handle_active_stripes(struct r5conf *conf, int group, + struct r5worker *worker) { struct stripe_head *batch[MAX_STRIPE_BATCH], *sh; int i, batch_size = 0; while (batch_size < MAX_STRIPE_BATCH && - (sh = __get_priority_stripe(conf)) != NULL) + (sh = __get_priority_stripe(conf, group)) != NULL) batch[batch_size++] = sh; if (batch_size == 0) @@ -4813,6 +4978,39 @@ static int handle_active_stripes(struct r5conf *conf) return batch_size; } +static void raid5_do_work(struct work_struct *work) +{ + struct r5worker *worker = container_of(work, struct r5worker, work); + struct r5worker_group *group = worker->group; + struct r5conf *conf = group->conf; + int group_id = group - conf->worker_groups; + int handled; + struct blk_plug plug; + + pr_debug("+++ raid5worker active\n"); + + blk_start_plug(&plug); + handled = 0; + spin_lock_irq(&conf->device_lock); + while (1) { + int batch_size, released; + + released = release_stripe_list(conf); + + batch_size = handle_active_stripes(conf, group_id, worker); + worker->working = false; + if (!batch_size && !released) + break; + handled += batch_size; + } + pr_debug("%d stripes handled\n", handled); + + spin_unlock_irq(&conf->device_lock); + blk_finish_plug(&plug); + + pr_debug("--- raid5worker inactive\n"); +} + /* * This is our raid5 kernel thread. * @@ -4836,7 +5034,9 @@ static void raid5d(struct md_thread *thread) spin_lock_irq(&conf->device_lock); while (1) { struct bio *bio; - int batch_size; + int batch_size, released; + + released = release_stripe_list(conf); if ( !list_empty(&conf->bitmap_list)) { @@ -4860,8 +5060,8 @@ static void raid5d(struct md_thread *thread) handled++; } - batch_size = handle_active_stripes(conf); - if (!batch_size) + batch_size = handle_active_stripes(conf, ANY_GROUP, NULL); + if (!batch_size && !released) break; handled += batch_size; @@ -4989,10 +5189,70 @@ stripe_cache_active_show(struct mddev *mddev, char *page) static struct md_sysfs_entry raid5_stripecache_active = __ATTR_RO(stripe_cache_active); +static ssize_t +raid5_show_group_thread_cnt(struct mddev *mddev, char *page) +{ + struct r5conf *conf = mddev->private; + if (conf) + return sprintf(page, "%d\n", conf->worker_cnt_per_group); + else + return 0; +} + +static int alloc_thread_groups(struct r5conf *conf, int cnt); +static ssize_t +raid5_store_group_thread_cnt(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf = mddev->private; + unsigned long new; + int err; + struct r5worker_group *old_groups; + int old_group_cnt; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (!conf) + return -ENODEV; + + if (kstrtoul(page, 10, &new)) + return -EINVAL; + + if (new == conf->worker_cnt_per_group) + return len; + + mddev_suspend(mddev); + + old_groups = conf->worker_groups; + old_group_cnt = conf->worker_cnt_per_group; + + conf->worker_groups = NULL; + err = alloc_thread_groups(conf, new); + if (err) { + conf->worker_groups = old_groups; + conf->worker_cnt_per_group = old_group_cnt; + } else { + if (old_groups) + kfree(old_groups[0].workers); + kfree(old_groups); + } + + mddev_resume(mddev); + + if (err) + return err; + return len; +} + +static struct md_sysfs_entry +raid5_group_thread_cnt = __ATTR(group_thread_cnt, S_IRUGO | S_IWUSR, + raid5_show_group_thread_cnt, + raid5_store_group_thread_cnt); + static struct attribute *raid5_attrs[] = { &raid5_stripecache_size.attr, &raid5_stripecache_active.attr, &raid5_preread_bypass_threshold.attr, + &raid5_group_thread_cnt.attr, NULL, }; static struct attribute_group raid5_attrs_group = { @@ -5000,6 +5260,54 @@ static struct attribute_group raid5_attrs_group = { .attrs = raid5_attrs, }; +static int alloc_thread_groups(struct r5conf *conf, int cnt) +{ + int i, j; + ssize_t size; + struct r5worker *workers; + + conf->worker_cnt_per_group = cnt; + if (cnt == 0) { + conf->worker_groups = NULL; + return 0; + } + conf->group_cnt = num_possible_nodes(); + size = sizeof(struct r5worker) * cnt; + workers = kzalloc(size * conf->group_cnt, GFP_NOIO); + conf->worker_groups = kzalloc(sizeof(struct r5worker_group) * + conf->group_cnt, GFP_NOIO); + if (!conf->worker_groups || !workers) { + kfree(workers); + kfree(conf->worker_groups); + conf->worker_groups = NULL; + return -ENOMEM; + } + + for (i = 0; i < conf->group_cnt; i++) { + struct r5worker_group *group; + + group = &conf->worker_groups[i]; + INIT_LIST_HEAD(&group->handle_list); + group->conf = conf; + group->workers = workers + i * cnt; + + for (j = 0; j < cnt; j++) { + group->workers[j].group = group; + INIT_WORK(&group->workers[j].work, raid5_do_work); + } + } + + return 0; +} + +static void free_thread_groups(struct r5conf *conf) +{ + if (conf->worker_groups) + kfree(conf->worker_groups[0].workers); + kfree(conf->worker_groups); + conf->worker_groups = NULL; +} + static sector_t raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks) { @@ -5040,6 +5348,7 @@ static void raid5_free_percpu(struct r5conf *conf) static void free_conf(struct r5conf *conf) { + free_thread_groups(conf); shrink_stripes(conf); raid5_free_percpu(conf); kfree(conf->disks); @@ -5168,7 +5477,11 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); if (conf == NULL) goto abort; + /* Don't enable multi-threading by default*/ + if (alloc_thread_groups(conf, 0)) + goto abort; spin_lock_init(&conf->device_lock); + seqcount_init(&conf->gen_lock); init_waitqueue_head(&conf->wait_for_stripe); init_waitqueue_head(&conf->wait_for_overlap); INIT_LIST_HEAD(&conf->handle_list); @@ -5176,6 +5489,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->delayed_list); INIT_LIST_HEAD(&conf->bitmap_list); INIT_LIST_HEAD(&conf->inactive_list); + init_llist_head(&conf->released_stripes); atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); atomic_set(&conf->active_aligned_reads, 0); @@ -5980,6 +6294,7 @@ static int raid5_start_reshape(struct mddev *mddev) atomic_set(&conf->reshape_stripes, 0); spin_lock_irq(&conf->device_lock); + write_seqcount_begin(&conf->gen_lock); conf->previous_raid_disks = conf->raid_disks; conf->raid_disks += mddev->delta_disks; conf->prev_chunk_sectors = conf->chunk_sectors; @@ -5996,8 +6311,16 @@ static int raid5_start_reshape(struct mddev *mddev) else conf->reshape_progress = 0; conf->reshape_safe = conf->reshape_progress; + write_seqcount_end(&conf->gen_lock); spin_unlock_irq(&conf->device_lock); + /* Now make sure any requests that proceeded on the assumption + * the reshape wasn't running - like Discard or Read - have + * completed. + */ + mddev_suspend(mddev); + mddev_resume(mddev); + /* Add some new drives, as many as will fit. * We know there are enough to make the newly sized array work. * Don't add devices if we are reducing the number of @@ -6472,6 +6795,10 @@ static struct md_personality raid4_personality = static int __init raid5_init(void) { + raid5_wq = alloc_workqueue("raid5wq", + WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0); + if (!raid5_wq) + return -ENOMEM; register_md_personality(&raid6_personality); register_md_personality(&raid5_personality); register_md_personality(&raid4_personality); @@ -6483,6 +6810,7 @@ static void raid5_exit(void) unregister_md_personality(&raid6_personality); unregister_md_personality(&raid5_personality); unregister_md_personality(&raid4_personality); + destroy_workqueue(raid5_wq); } module_init(raid5_init); |