diff options
Diffstat (limited to 'drivers/md/dm.c')
-rw-r--r-- | drivers/md/dm.c | 600 |
1 files changed, 327 insertions, 273 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 5df40480228b..be4905769a45 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -106,14 +106,6 @@ struct dm_rq_clone_bio_info { struct bio clone; }; -union map_info *dm_get_rq_mapinfo(struct request *rq) -{ - if (rq && rq->end_io_data) - return &((struct dm_rq_target_io *)rq->end_io_data)->info; - return NULL; -} -EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); - #define MINOR_ALLOCED ((void *)-1) /* @@ -129,28 +121,18 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo); #define DMF_SUSPENDED_INTERNALLY 7 /* - * A dummy definition to make RCU happy. - * struct dm_table should never be dereferenced in this file. - */ -struct dm_table { - int undefined__; -}; - -/* * Work processed by per-device workqueue. */ struct mapped_device { struct srcu_struct io_barrier; struct mutex suspend_lock; - atomic_t holders; - atomic_t open_count; /* - * The current mapping. + * The current mapping (struct dm_table *). * Use dm_get_live_table{_fast} or take suspend_lock for * dereference. */ - struct dm_table __rcu *map; + void __rcu *map; struct list_head table_devices; struct mutex table_devices_lock; @@ -158,10 +140,16 @@ struct mapped_device { unsigned long flags; struct request_queue *queue; + int numa_node_id; + unsigned type; /* Protect queue and type against concurrent access. */ struct mutex type_lock; + atomic_t holders; + atomic_t open_count; + + struct dm_target *immutable_target; struct target_type *immutable_target_type; struct gendisk *disk; @@ -175,8 +163,20 @@ struct mapped_device { atomic_t pending[2]; wait_queue_head_t wait; struct work_struct work; - struct bio_list deferred; spinlock_t deferred_lock; + struct bio_list deferred; + + /* + * Event handling. + */ + wait_queue_head_t eventq; + atomic_t event_nr; + atomic_t uevent_seq; + struct list_head uevent_list; + spinlock_t uevent_lock; /* Protect access to uevent_list */ + + /* the number of internal suspends */ + unsigned internal_suspend_count; /* * Processing queue (flush) @@ -192,32 +192,21 @@ struct mapped_device { struct bio_set *bs; /* - * Event handling. - */ - atomic_t event_nr; - wait_queue_head_t eventq; - atomic_t uevent_seq; - struct list_head uevent_list; - spinlock_t uevent_lock; /* Protect access to uevent_list */ - - /* * freeze/thaw support require holding onto a super block */ struct super_block *frozen_sb; - struct block_device *bdev; /* forced geometry settings */ struct hd_geometry geometry; + struct block_device *bdev; + /* kobject and completion */ struct dm_kobject_holder kobj_holder; /* zero-length flush that will be cloned and submitted to targets */ struct bio flush_bio; - /* the number of internal suspends */ - unsigned internal_suspend_count; - struct dm_stats stats; struct kthread_worker kworker; @@ -230,8 +219,9 @@ struct mapped_device { ktime_t last_rq_start_time; /* for blk-mq request-based DM support */ - struct blk_mq_tag_set tag_set; - bool use_blk_mq; + struct blk_mq_tag_set *tag_set; + bool use_blk_mq:1; + bool init_tio_pdu:1; }; #ifdef CONFIG_DM_MQ_DEFAULT @@ -240,10 +230,19 @@ static bool use_blk_mq = true; static bool use_blk_mq = false; #endif +#define DM_MQ_NR_HW_QUEUES 1 +#define DM_MQ_QUEUE_DEPTH 2048 +#define DM_NUMA_NODE NUMA_NO_NODE + +static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES; +static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH; +static int dm_numa_node = DM_NUMA_NODE; + bool dm_use_blk_mq(struct mapped_device *md) { return md->use_blk_mq; } +EXPORT_SYMBOL_GPL(dm_use_blk_mq); /* * For mempools pre-allocation at the table loading time. @@ -277,6 +276,27 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS; */ static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS; +static int __dm_get_module_param_int(int *module_param, int min, int max) +{ + int param = ACCESS_ONCE(*module_param); + int modified_param = 0; + bool modified = true; + + if (param < min) + modified_param = min; + else if (param > max) + modified_param = max; + else + modified = false; + + if (modified) { + (void)cmpxchg(module_param, param, modified_param); + param = modified_param; + } + + return param; +} + static unsigned __dm_get_module_param(unsigned *module_param, unsigned def, unsigned max) { @@ -310,6 +330,23 @@ unsigned dm_get_reserved_rq_based_ios(void) } EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios); +static unsigned dm_get_blk_mq_nr_hw_queues(void) +{ + return __dm_get_module_param(&dm_mq_nr_hw_queues, 1, 32); +} + +static unsigned dm_get_blk_mq_queue_depth(void) +{ + return __dm_get_module_param(&dm_mq_queue_depth, + DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH); +} + +static unsigned dm_get_numa_node(void) +{ + return __dm_get_module_param_int(&dm_numa_node, + DM_NUMA_NODE, num_online_nodes() - 1); +} + static int __init local_init(void) { int r = -ENOMEM; @@ -323,7 +360,7 @@ static int __init local_init(void) if (!_rq_tio_cache) goto out_free_io_cache; - _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request), + _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request), __alignof__(struct request), 0, NULL); if (!_rq_cache) goto out_free_rq_tio_cache; @@ -556,16 +593,17 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo) return dm_get_geometry(md, geo); } -static int dm_get_live_table_for_ioctl(struct mapped_device *md, - struct dm_target **tgt, struct block_device **bdev, - fmode_t *mode, int *srcu_idx) +static int dm_grab_bdev_for_ioctl(struct mapped_device *md, + struct block_device **bdev, + fmode_t *mode) { + struct dm_target *tgt; struct dm_table *map; - int r; + int srcu_idx, r; retry: r = -ENOTTY; - map = dm_get_live_table(md, srcu_idx); + map = dm_get_live_table(md, &srcu_idx); if (!map || !dm_table_get_size(map)) goto out; @@ -573,9 +611,8 @@ retry: if (dm_table_get_num_targets(map) != 1) goto out; - *tgt = dm_table_get_target(map, 0); - - if (!(*tgt)->type->prepare_ioctl) + tgt = dm_table_get_target(map, 0); + if (!tgt->type->prepare_ioctl) goto out; if (dm_suspended_md(md)) { @@ -583,14 +620,16 @@ retry: goto out; } - r = (*tgt)->type->prepare_ioctl(*tgt, bdev, mode); + r = tgt->type->prepare_ioctl(tgt, bdev, mode); if (r < 0) goto out; + bdgrab(*bdev); + dm_put_live_table(md, srcu_idx); return r; out: - dm_put_live_table(md, *srcu_idx); + dm_put_live_table(md, srcu_idx); if (r == -ENOTCONN && !fatal_signal_pending(current)) { msleep(10); goto retry; @@ -602,11 +641,9 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { struct mapped_device *md = bdev->bd_disk->private_data; - struct dm_target *tgt; - struct block_device *tgt_bdev = NULL; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &tgt_bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -621,9 +658,9 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode, goto out; } - r = __blkdev_driver_ioctl(tgt_bdev, mode, cmd, arg); + r = __blkdev_driver_ioctl(bdev, mode, cmd, arg); out: - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } @@ -642,24 +679,24 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio) bio_put(&tio->clone); } -static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md, - gfp_t gfp_mask) +static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md, + gfp_t gfp_mask) { return mempool_alloc(md->io_pool, gfp_mask); } -static void free_rq_tio(struct dm_rq_target_io *tio) +static void free_old_rq_tio(struct dm_rq_target_io *tio) { mempool_free(tio, tio->md->io_pool); } -static struct request *alloc_clone_request(struct mapped_device *md, - gfp_t gfp_mask) +static struct request *alloc_old_clone_request(struct mapped_device *md, + gfp_t gfp_mask) { return mempool_alloc(md->rq_pool, gfp_mask); } -static void free_clone_request(struct mapped_device *md, struct request *rq) +static void free_old_clone_request(struct mapped_device *md, struct request *rq) { mempool_free(rq, md->rq_pool); } @@ -827,7 +864,7 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode, mutex_lock(&md->table_devices_lock); td = find_table_device(&md->table_devices, dev, mode); if (!td) { - td = kmalloc(sizeof(*td), GFP_KERNEL); + td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id); if (!td) { mutex_unlock(&md->table_devices_lock); return -ENOMEM; @@ -1109,12 +1146,8 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue) * back into ->request_fn() could deadlock attempting to grab the * queue lock again. */ - if (run_queue) { - if (md->queue->mq_ops) - blk_mq_run_hw_queues(md->queue, true); - else - blk_run_queue_async(md->queue); - } + if (!md->queue->mq_ops && run_queue) + blk_run_queue_async(md->queue); /* * dm_put() must be at the end of this function. See the comment above @@ -1134,15 +1167,10 @@ static void free_rq_clone(struct request *clone) tio->ti->type->release_clone_rq(clone); else if (!md->queue->mq_ops) /* request_fn queue stacked on request_fn queue(s) */ - free_clone_request(md, clone); - /* - * NOTE: for the blk-mq queue stacked on request_fn queue(s) case: - * no need to call free_clone_request() because we leverage blk-mq by - * allocating the clone at the end of the blk-mq pdu (see: clone_rq) - */ + free_old_clone_request(md, clone); if (!md->queue->mq_ops) - free_rq_tio(tio); + free_old_rq_tio(tio); } /* @@ -1191,12 +1219,14 @@ static void dm_unprep_request(struct request *rq) if (clone) free_rq_clone(clone); + else if (!tio->md->queue->mq_ops) + free_old_rq_tio(tio); } /* * Requeue the original request of a clone. */ -static void old_requeue_request(struct request *rq) +static void dm_old_requeue_request(struct request *rq) { struct request_queue *q = rq->q; unsigned long flags; @@ -1207,45 +1237,57 @@ static void old_requeue_request(struct request *rq) spin_unlock_irqrestore(q->queue_lock, flags); } +static void dm_mq_requeue_request(struct request *rq) +{ + struct request_queue *q = rq->q; + unsigned long flags; + + blk_mq_requeue_request(rq); + spin_lock_irqsave(q->queue_lock, flags); + if (!blk_queue_stopped(q)) + blk_mq_kick_requeue_list(q); + spin_unlock_irqrestore(q->queue_lock, flags); +} + static void dm_requeue_original_request(struct mapped_device *md, struct request *rq) { int rw = rq_data_dir(rq); + rq_end_stats(md, rq); dm_unprep_request(rq); - rq_end_stats(md, rq); if (!rq->q->mq_ops) - old_requeue_request(rq); - else { - blk_mq_requeue_request(rq); - blk_mq_kick_requeue_list(rq->q); - } + dm_old_requeue_request(rq); + else + dm_mq_requeue_request(rq); rq_completed(md, rw, false); } -static void old_stop_queue(struct request_queue *q) +static void dm_old_stop_queue(struct request_queue *q) { unsigned long flags; - if (blk_queue_stopped(q)) + spin_lock_irqsave(q->queue_lock, flags); + if (blk_queue_stopped(q)) { + spin_unlock_irqrestore(q->queue_lock, flags); return; + } - spin_lock_irqsave(q->queue_lock, flags); blk_stop_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } -static void stop_queue(struct request_queue *q) +static void dm_stop_queue(struct request_queue *q) { if (!q->mq_ops) - old_stop_queue(q); + dm_old_stop_queue(q); else blk_mq_stop_hw_queues(q); } -static void old_start_queue(struct request_queue *q) +static void dm_old_start_queue(struct request_queue *q) { unsigned long flags; @@ -1255,12 +1297,14 @@ static void old_start_queue(struct request_queue *q) spin_unlock_irqrestore(q->queue_lock, flags); } -static void start_queue(struct request_queue *q) +static void dm_start_queue(struct request_queue *q) { if (!q->mq_ops) - old_start_queue(q); - else + dm_old_start_queue(q); + else { blk_mq_start_stopped_hw_queues(q, true); + blk_mq_kick_requeue_list(q); + } } static void dm_done(struct request *clone, int error, bool mapped) @@ -1311,7 +1355,7 @@ static void dm_softirq_done(struct request *rq) if (!rq->q->mq_ops) { blk_end_request_all(rq, tio->error); rq_completed(tio->md, rw, false); - free_rq_tio(tio); + free_old_rq_tio(tio); } else { blk_mq_end_request(rq, tio->error); rq_completed(tio->md, rw, false); @@ -1334,7 +1378,10 @@ static void dm_complete_request(struct request *rq, int error) struct dm_rq_target_io *tio = tio_from_request(rq); tio->error = error; - blk_complete_request(rq); + if (!rq->q->mq_ops) + blk_complete_request(rq); + else + blk_mq_complete_request(rq, error); } /* @@ -1350,7 +1397,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error) } /* - * Called with the clone's queue lock held (for non-blk-mq) + * Called with the clone's queue lock held (in the case of .request_fn) */ static void end_clone_request(struct request *clone, int error) { @@ -1520,21 +1567,26 @@ static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len) /* * Creates a bio that consists of range of complete bvecs. */ -static void clone_bio(struct dm_target_io *tio, struct bio *bio, - sector_t sector, unsigned len) +static int clone_bio(struct dm_target_io *tio, struct bio *bio, + sector_t sector, unsigned len) { struct bio *clone = &tio->clone; __bio_clone_fast(clone, bio); - if (bio_integrity(bio)) - bio_integrity_clone(clone, bio, GFP_NOIO); + if (bio_integrity(bio)) { + int r = bio_integrity_clone(clone, bio, GFP_NOIO); + if (r < 0) + return r; + } bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector)); clone->bi_iter.bi_size = to_bytes(len); if (bio_integrity(bio)) bio_integrity_trim(clone, 0, len); + + return 0; } static struct dm_target_io *alloc_tio(struct clone_info *ci, @@ -1591,13 +1643,14 @@ static int __send_empty_flush(struct clone_info *ci) return 0; } -static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, +static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti, sector_t sector, unsigned *len) { struct bio *bio = ci->bio; struct dm_target_io *tio; unsigned target_bio_nr; unsigned num_target_bios = 1; + int r = 0; /* * Does the target want to receive duplicate copies of the bio? @@ -1608,9 +1661,13 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) { tio = alloc_tio(ci, ti, target_bio_nr); tio->len_ptr = len; - clone_bio(tio, bio, sector, *len); + r = clone_bio(tio, bio, sector, *len); + if (r < 0) + break; __map_bio(tio); } + + return r; } typedef unsigned (*get_num_bios_fn)(struct dm_target *ti); @@ -1687,6 +1744,7 @@ static int __split_and_process_non_flush(struct clone_info *ci) struct bio *bio = ci->bio; struct dm_target *ti; unsigned len; + int r; if (unlikely(bio->bi_rw & REQ_DISCARD)) return __send_discard(ci); @@ -1699,7 +1757,9 @@ static int __split_and_process_non_flush(struct clone_info *ci) len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count); - __clone_and_map_data_bio(ci, ti, ci->sector, &len); + r = __clone_and_map_data_bio(ci, ti, ci->sector, &len); + if (r < 0) + return r; ci->sector += len; ci->sector_count -= len; @@ -1837,28 +1897,22 @@ static int setup_clone(struct request *clone, struct request *rq, return 0; } -static struct request *clone_rq(struct request *rq, struct mapped_device *md, - struct dm_rq_target_io *tio, gfp_t gfp_mask) +static struct request *clone_old_rq(struct request *rq, struct mapped_device *md, + struct dm_rq_target_io *tio, gfp_t gfp_mask) { /* - * Do not allocate a clone if tio->clone was already set - * (see: dm_mq_queue_rq). + * Create clone for use with .request_fn request_queue */ - bool alloc_clone = !tio->clone; struct request *clone; - if (alloc_clone) { - clone = alloc_clone_request(md, gfp_mask); - if (!clone) - return NULL; - } else - clone = tio->clone; + clone = alloc_old_clone_request(md, gfp_mask); + if (!clone) + return NULL; blk_rq_init(NULL, clone); if (setup_clone(clone, rq, tio, gfp_mask)) { /* -ENOMEM */ - if (alloc_clone) - free_clone_request(md, clone); + free_old_clone_request(md, clone); return NULL; } @@ -1875,29 +1929,40 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq, tio->clone = NULL; tio->orig = rq; tio->error = 0; - memset(&tio->info, 0, sizeof(tio->info)); + /* + * Avoid initializing info for blk-mq; it passes + * target-specific data through info.ptr + * (see: dm_mq_init_request) + */ + if (!md->init_tio_pdu) + memset(&tio->info, 0, sizeof(tio->info)); if (md->kworker_task) init_kthread_work(&tio->work, map_tio_request); } -static struct dm_rq_target_io *prep_tio(struct request *rq, - struct mapped_device *md, gfp_t gfp_mask) +static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq, + struct mapped_device *md, + gfp_t gfp_mask) { struct dm_rq_target_io *tio; int srcu_idx; struct dm_table *table; - tio = alloc_rq_tio(md, gfp_mask); + tio = alloc_old_rq_tio(md, gfp_mask); if (!tio) return NULL; init_tio(tio, rq, md); table = dm_get_live_table(md, &srcu_idx); + /* + * Must clone a request if this .request_fn DM device + * is stacked on .request_fn device(s). + */ if (!dm_table_mq_request_based(table)) { - if (!clone_rq(rq, md, tio, gfp_mask)) { + if (!clone_old_rq(rq, md, tio, gfp_mask)) { dm_put_live_table(md, srcu_idx); - free_rq_tio(tio); + free_old_rq_tio(tio); return NULL; } } @@ -1909,7 +1974,7 @@ static struct dm_rq_target_io *prep_tio(struct request *rq, /* * Called with the queue lock held. */ -static int dm_prep_fn(struct request_queue *q, struct request *rq) +static int dm_old_prep_fn(struct request_queue *q, struct request *rq) { struct mapped_device *md = q->queuedata; struct dm_rq_target_io *tio; @@ -1919,7 +1984,7 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq) return BLKPREP_KILL; } - tio = prep_tio(rq, md, GFP_ATOMIC); + tio = dm_old_prep_tio(rq, md, GFP_ATOMIC); if (!tio) return BLKPREP_DEFER; @@ -2077,12 +2142,18 @@ static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md) static void dm_request_fn(struct request_queue *q) { struct mapped_device *md = q->queuedata; - int srcu_idx; - struct dm_table *map = dm_get_live_table(md, &srcu_idx); - struct dm_target *ti; + struct dm_target *ti = md->immutable_target; struct request *rq; struct dm_rq_target_io *tio; - sector_t pos; + sector_t pos = 0; + + if (unlikely(!ti)) { + int srcu_idx; + struct dm_table *map = dm_get_live_table(md, &srcu_idx); + + ti = dm_table_find_target(map, pos); + dm_put_live_table(md, srcu_idx); + } /* * For suspend, check blk_queue_stopped() and increment @@ -2093,33 +2164,21 @@ static void dm_request_fn(struct request_queue *q) while (!blk_queue_stopped(q)) { rq = blk_peek_request(q); if (!rq) - goto out; + return; /* always use block 0 to find the target for flushes for now */ pos = 0; if (!(rq->cmd_flags & REQ_FLUSH)) pos = blk_rq_pos(rq); - ti = dm_table_find_target(map, pos); - if (!dm_target_is_valid(ti)) { - /* - * Must perform setup, that rq_completed() requires, - * before calling dm_kill_unmapped_request - */ - DMERR_LIMIT("request attempted access beyond the end of device"); - dm_start_request(md, rq); - dm_kill_unmapped_request(rq, -EIO); - continue; + if ((dm_request_peeked_before_merge_deadline(md) && + md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && + md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) || + (ti->type->busy && ti->type->busy(ti))) { + blk_delay_queue(q, HZ / 100); + return; } - if (dm_request_peeked_before_merge_deadline(md) && - md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 && - md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) - goto delay_and_out; - - if (ti->type->busy && ti->type->busy(ti)) - goto delay_and_out; - dm_start_request(md, rq); tio = tio_from_request(rq); @@ -2128,13 +2187,6 @@ static void dm_request_fn(struct request_queue *q) queue_kthread_work(&md->kworker, &tio->work); BUG_ON(!irqs_disabled()); } - - goto out; - -delay_and_out: - blk_delay_queue(q, HZ / 100); -out: - dm_put_live_table(md, srcu_idx); } static int dm_any_congested(void *congested_data, int bdi_bits) @@ -2144,19 +2196,18 @@ static int dm_any_congested(void *congested_data, int bdi_bits) struct dm_table *map; if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) { - map = dm_get_live_table_fast(md); - if (map) { + if (dm_request_based(md)) { /* - * Request-based dm cares about only own queue for - * the query about congestion status of request_queue + * With request-based DM we only need to check the + * top-level queue for congestion. */ - if (dm_request_based(md)) - r = md->queue->backing_dev_info.wb.state & - bdi_bits; - else + r = md->queue->backing_dev_info.wb.state & bdi_bits; + } else { + map = dm_get_live_table_fast(md); + if (map) r = dm_table_any_congested(map, bdi_bits); + dm_put_live_table_fast(md); } - dm_put_live_table_fast(md); } return r; @@ -2236,7 +2287,7 @@ static void dm_init_md_queue(struct mapped_device *md) md->queue->backing_dev_info.congested_data = md; } -static void dm_init_old_md_queue(struct mapped_device *md) +static void dm_init_normal_md_queue(struct mapped_device *md) { md->use_blk_mq = false; dm_init_md_queue(md); @@ -2283,10 +2334,11 @@ static void cleanup_mapped_device(struct mapped_device *md) */ static struct mapped_device *alloc_dev(int minor) { - int r; - struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL); + int r, numa_node_id = dm_get_numa_node(); + struct mapped_device *md; void *old_md; + md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id); if (!md) { DMWARN("unable to allocate device, out of memory."); return NULL; @@ -2307,7 +2359,9 @@ static struct mapped_device *alloc_dev(int minor) if (r < 0) goto bad_io_barrier; + md->numa_node_id = numa_node_id; md->use_blk_mq = use_blk_mq; + md->init_tio_pdu = false; md->type = DM_TYPE_NONE; mutex_init(&md->suspend_lock); mutex_init(&md->type_lock); @@ -2321,13 +2375,13 @@ static struct mapped_device *alloc_dev(int minor) INIT_LIST_HEAD(&md->table_devices); spin_lock_init(&md->uevent_lock); - md->queue = blk_alloc_queue(GFP_KERNEL); + md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id); if (!md->queue) goto bad; dm_init_md_queue(md); - md->disk = alloc_disk(1); + md->disk = alloc_disk_node(1, numa_node_id); if (!md->disk) goto bad; @@ -2391,8 +2445,10 @@ static void free_dev(struct mapped_device *md) unlock_fs(md); cleanup_mapped_device(md); - if (md->use_blk_mq) - blk_mq_free_tag_set(&md->tag_set); + if (md->tag_set) { + blk_mq_free_tag_set(md->tag_set); + kfree(md->tag_set); + } free_table_devices(&md->table_devices); dm_stats_cleanup(&md->stats); @@ -2500,13 +2556,20 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t, * This must be done before setting the queue restrictions, * because request-based dm may be run just after the setting. */ - if (dm_table_request_based(t)) - stop_queue(q); + if (dm_table_request_based(t)) { + dm_stop_queue(q); + /* + * Leverage the fact that request-based DM targets are + * immutable singletons and establish md->immutable_target + * - used to optimize both dm_request_fn and dm_mq_queue_rq + */ + md->immutable_target = dm_table_get_immutable_target(t); + } __bind_mempools(md, t); old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock)); - rcu_assign_pointer(md->map, t); + rcu_assign_pointer(md->map, (void *)t); md->immutable_target_type = dm_table_get_immutable_target_type(t); dm_table_set_restrictions(t, q, limits); @@ -2572,7 +2635,6 @@ void dm_set_md_type(struct mapped_device *md, unsigned type) unsigned dm_get_md_type(struct mapped_device *md) { - BUG_ON(!mutex_is_locked(&md->type_lock)); return md->type; } @@ -2592,7 +2654,7 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md) } EXPORT_SYMBOL_GPL(dm_get_queue_limits); -static void init_rq_based_worker_thread(struct mapped_device *md) +static void dm_old_init_rq_based_worker_thread(struct mapped_device *md) { /* Initialize the request-based DM worker thread */ init_kthread_worker(&md->kworker); @@ -2601,26 +2663,22 @@ static void init_rq_based_worker_thread(struct mapped_device *md) } /* - * Fully initialize a request-based queue (->elevator, ->request_fn, etc). + * Fully initialize a .request_fn request-based queue. */ -static int dm_init_request_based_queue(struct mapped_device *md) +static int dm_old_init_request_queue(struct mapped_device *md) { - struct request_queue *q = NULL; - /* Fully initialize the queue */ - q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL); - if (!q) + if (!blk_init_allocated_queue(md->queue, dm_request_fn, NULL)) return -EINVAL; /* disable dm_request_fn's merge heuristic by default */ md->seq_rq_merge_deadline_usecs = 0; - md->queue = q; - dm_init_old_md_queue(md); + dm_init_normal_md_queue(md); blk_queue_softirq_done(md->queue, dm_softirq_done); - blk_queue_prep_rq(md->queue, dm_prep_fn); + blk_queue_prep_rq(md->queue, dm_old_prep_fn); - init_rq_based_worker_thread(md); + dm_old_init_rq_based_worker_thread(md); elv_register_queue(md->queue); @@ -2640,6 +2698,11 @@ static int dm_mq_init_request(void *data, struct request *rq, */ tio->md = md; + if (md->init_tio_pdu) { + /* target-specific per-io data is immediately after the tio */ + tio->info.ptr = tio + 1; + } + return 0; } @@ -2649,28 +2712,15 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq = bd->rq; struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq); struct mapped_device *md = tio->md; - int srcu_idx; - struct dm_table *map = dm_get_live_table(md, &srcu_idx); - struct dm_target *ti; - sector_t pos; + struct dm_target *ti = md->immutable_target; - /* always use block 0 to find the target for flushes for now */ - pos = 0; - if (!(rq->cmd_flags & REQ_FLUSH)) - pos = blk_rq_pos(rq); + if (unlikely(!ti)) { + int srcu_idx; + struct dm_table *map = dm_get_live_table(md, &srcu_idx); - ti = dm_table_find_target(map, pos); - if (!dm_target_is_valid(ti)) { + ti = dm_table_find_target(map, 0); dm_put_live_table(md, srcu_idx); - DMERR_LIMIT("request attempted access beyond the end of device"); - /* - * Must perform setup, that rq_completed() requires, - * before returning BLK_MQ_RQ_QUEUE_ERROR - */ - dm_start_request(md, rq); - return BLK_MQ_RQ_QUEUE_ERROR; } - dm_put_live_table(md, srcu_idx); if (ti->type->busy && ti->type->busy(ti)) return BLK_MQ_RQ_QUEUE_BUSY; @@ -2686,20 +2736,12 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx, */ tio->ti = ti; - /* Clone the request if underlying devices aren't blk-mq */ - if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) { - /* clone request is allocated at the end of the pdu */ - tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io); - (void) clone_rq(rq, md, tio, GFP_ATOMIC); - queue_kthread_work(&md->kworker, &tio->work); - } else { - /* Direct call is fine since .queue_rq allows allocations */ - if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) { - /* Undo dm_start_request() before requeuing */ - rq_end_stats(md, rq); - rq_completed(md, rq_data_dir(rq), false); - return BLK_MQ_RQ_QUEUE_BUSY; - } + /* Direct call is fine since .queue_rq allows allocations */ + if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) { + /* Undo dm_start_request() before requeuing */ + rq_end_stats(md, rq); + rq_completed(md, rq_data_dir(rq), false); + return BLK_MQ_RQ_QUEUE_BUSY; } return BLK_MQ_RQ_QUEUE_OK; @@ -2712,47 +2754,56 @@ static struct blk_mq_ops dm_mq_ops = { .init_request = dm_mq_init_request, }; -static int dm_init_request_based_blk_mq_queue(struct mapped_device *md) +static int dm_mq_init_request_queue(struct mapped_device *md, + struct dm_target *immutable_tgt) { - unsigned md_type = dm_get_md_type(md); struct request_queue *q; int err; - memset(&md->tag_set, 0, sizeof(md->tag_set)); - md->tag_set.ops = &dm_mq_ops; - md->tag_set.queue_depth = BLKDEV_MAX_RQ; - md->tag_set.numa_node = NUMA_NO_NODE; - md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; - md->tag_set.nr_hw_queues = 1; - if (md_type == DM_TYPE_REQUEST_BASED) { - /* make the memory for non-blk-mq clone part of the pdu */ - md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request); - } else - md->tag_set.cmd_size = sizeof(struct dm_rq_target_io); - md->tag_set.driver_data = md; - - err = blk_mq_alloc_tag_set(&md->tag_set); + if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) { + DMERR("request-based dm-mq may only be stacked on blk-mq device(s)"); + return -EINVAL; + } + + md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id); + if (!md->tag_set) + return -ENOMEM; + + md->tag_set->ops = &dm_mq_ops; + md->tag_set->queue_depth = dm_get_blk_mq_queue_depth(); + md->tag_set->numa_node = md->numa_node_id; + md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; + md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues(); + md->tag_set->driver_data = md; + + md->tag_set->cmd_size = sizeof(struct dm_rq_target_io); + if (immutable_tgt && immutable_tgt->per_io_data_size) { + /* any target-specific per-io data is immediately after the tio */ + md->tag_set->cmd_size += immutable_tgt->per_io_data_size; + md->init_tio_pdu = true; + } + + err = blk_mq_alloc_tag_set(md->tag_set); if (err) - return err; + goto out_kfree_tag_set; - q = blk_mq_init_allocated_queue(&md->tag_set, md->queue); + q = blk_mq_init_allocated_queue(md->tag_set, md->queue); if (IS_ERR(q)) { err = PTR_ERR(q); goto out_tag_set; } - md->queue = q; dm_init_md_queue(md); /* backfill 'mq' sysfs registration normally done in blk_register_queue */ blk_mq_register_disk(md->disk); - if (md_type == DM_TYPE_REQUEST_BASED) - init_rq_based_worker_thread(md); - return 0; out_tag_set: - blk_mq_free_tag_set(&md->tag_set); + blk_mq_free_tag_set(md->tag_set); +out_kfree_tag_set: + kfree(md->tag_set); + return err; } @@ -2767,28 +2818,28 @@ static unsigned filter_md_type(unsigned type, struct mapped_device *md) /* * Setup the DM device's queue based on md's type */ -int dm_setup_md_queue(struct mapped_device *md) +int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t) { int r; unsigned md_type = filter_md_type(dm_get_md_type(md), md); switch (md_type) { case DM_TYPE_REQUEST_BASED: - r = dm_init_request_based_queue(md); + r = dm_old_init_request_queue(md); if (r) { - DMWARN("Cannot initialize queue for request-based mapped device"); + DMERR("Cannot initialize queue for request-based mapped device"); return r; } break; case DM_TYPE_MQ_REQUEST_BASED: - r = dm_init_request_based_blk_mq_queue(md); + r = dm_mq_init_request_queue(md, dm_table_get_immutable_target(t)); if (r) { - DMWARN("Cannot initialize queue for request-based blk-mq mapped device"); + DMERR("Cannot initialize queue for request-based dm-mq mapped device"); return r; } break; case DM_TYPE_BIO_BASED: - dm_init_old_md_queue(md); + dm_init_normal_md_queue(md); blk_queue_make_request(md->queue, dm_make_request); /* * DM handles splitting bios as needed. Free the bio_split bioset @@ -3131,7 +3182,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, * dm defers requests to md->wq from md->queue. */ if (dm_request_based(md)) { - stop_queue(md->queue); + dm_stop_queue(md->queue); if (md->kworker_task) flush_kthread_worker(&md->kworker); } @@ -3155,7 +3206,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map, dm_queue_flush(md); if (dm_request_based(md)) - start_queue(md->queue); + dm_start_queue(md->queue); unlock_fs(md); dm_table_presuspend_undo_targets(map); @@ -3234,7 +3285,7 @@ static int __dm_resume(struct mapped_device *md, struct dm_table *map) * Request-based dm is queueing the deferred I/Os in its request_queue. */ if (dm_request_based(md)) - start_queue(md->queue); + dm_start_queue(md->queue); unlock_fs(md); @@ -3480,9 +3531,9 @@ int dm_noflush_suspending(struct dm_target *ti) EXPORT_SYMBOL_GPL(dm_noflush_suspending); struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type, - unsigned integrity, unsigned per_bio_data_size) + unsigned integrity, unsigned per_io_data_size) { - struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL); + struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id); struct kmem_cache *cachep = NULL; unsigned int pool_size = 0; unsigned int front_pad; @@ -3496,7 +3547,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t case DM_TYPE_BIO_BASED: cachep = _io_cache; pool_size = dm_get_reserved_bio_based_ios(); - front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); + front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone); break; case DM_TYPE_REQUEST_BASED: cachep = _rq_tio_cache; @@ -3509,8 +3560,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t if (!pool_size) pool_size = dm_get_reserved_rq_based_ios(); front_pad = offsetof(struct dm_rq_clone_bio_info, clone); - /* per_bio_data_size is not used. See __bind_mempools(). */ - WARN_ON(per_bio_data_size != 0); + /* per_io_data_size is used for blk-mq pdu at queue allocation */ break; default: BUG(); @@ -3552,15 +3602,14 @@ void dm_free_md_mempools(struct dm_md_mempools *pools) } static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, - u32 flags) + u32 flags) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3570,20 +3619,19 @@ static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key, else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, - u32 flags) + u32 flags) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3593,7 +3641,7 @@ static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type, else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } @@ -3601,11 +3649,10 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3615,20 +3662,19 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type) else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, - enum pr_type type, bool abort) + enum pr_type type, bool abort) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3638,7 +3684,7 @@ static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key, else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } @@ -3646,11 +3692,10 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) { struct mapped_device *md = bdev->bd_disk->private_data; const struct pr_ops *ops; - struct dm_target *tgt; fmode_t mode; - int srcu_idx, r; + int r; - r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx); + r = dm_grab_bdev_for_ioctl(md, &bdev, &mode); if (r < 0) return r; @@ -3660,7 +3705,7 @@ static int dm_pr_clear(struct block_device *bdev, u64 key) else r = -EOPNOTSUPP; - dm_put_live_table(md, srcu_idx); + bdput(bdev); return r; } @@ -3699,6 +3744,15 @@ MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools" module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices"); +module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices"); + +module_param(dm_mq_queue_depth, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices"); + +module_param(dm_numa_node, int, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations"); + MODULE_DESCRIPTION(DM_NAME " driver"); MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>"); MODULE_LICENSE("GPL"); |