summaryrefslogtreecommitdiffstats
path: root/drivers/md/dm.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/dm.c')
-rw-r--r--drivers/md/dm.c600
1 files changed, 327 insertions, 273 deletions
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 5df40480228b..be4905769a45 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -106,14 +106,6 @@ struct dm_rq_clone_bio_info {
struct bio clone;
};
-union map_info *dm_get_rq_mapinfo(struct request *rq)
-{
- if (rq && rq->end_io_data)
- return &((struct dm_rq_target_io *)rq->end_io_data)->info;
- return NULL;
-}
-EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
-
#define MINOR_ALLOCED ((void *)-1)
/*
@@ -129,28 +121,18 @@ EXPORT_SYMBOL_GPL(dm_get_rq_mapinfo);
#define DMF_SUSPENDED_INTERNALLY 7
/*
- * A dummy definition to make RCU happy.
- * struct dm_table should never be dereferenced in this file.
- */
-struct dm_table {
- int undefined__;
-};
-
-/*
* Work processed by per-device workqueue.
*/
struct mapped_device {
struct srcu_struct io_barrier;
struct mutex suspend_lock;
- atomic_t holders;
- atomic_t open_count;
/*
- * The current mapping.
+ * The current mapping (struct dm_table *).
* Use dm_get_live_table{_fast} or take suspend_lock for
* dereference.
*/
- struct dm_table __rcu *map;
+ void __rcu *map;
struct list_head table_devices;
struct mutex table_devices_lock;
@@ -158,10 +140,16 @@ struct mapped_device {
unsigned long flags;
struct request_queue *queue;
+ int numa_node_id;
+
unsigned type;
/* Protect queue and type against concurrent access. */
struct mutex type_lock;
+ atomic_t holders;
+ atomic_t open_count;
+
+ struct dm_target *immutable_target;
struct target_type *immutable_target_type;
struct gendisk *disk;
@@ -175,8 +163,20 @@ struct mapped_device {
atomic_t pending[2];
wait_queue_head_t wait;
struct work_struct work;
- struct bio_list deferred;
spinlock_t deferred_lock;
+ struct bio_list deferred;
+
+ /*
+ * Event handling.
+ */
+ wait_queue_head_t eventq;
+ atomic_t event_nr;
+ atomic_t uevent_seq;
+ struct list_head uevent_list;
+ spinlock_t uevent_lock; /* Protect access to uevent_list */
+
+ /* the number of internal suspends */
+ unsigned internal_suspend_count;
/*
* Processing queue (flush)
@@ -192,32 +192,21 @@ struct mapped_device {
struct bio_set *bs;
/*
- * Event handling.
- */
- atomic_t event_nr;
- wait_queue_head_t eventq;
- atomic_t uevent_seq;
- struct list_head uevent_list;
- spinlock_t uevent_lock; /* Protect access to uevent_list */
-
- /*
* freeze/thaw support require holding onto a super block
*/
struct super_block *frozen_sb;
- struct block_device *bdev;
/* forced geometry settings */
struct hd_geometry geometry;
+ struct block_device *bdev;
+
/* kobject and completion */
struct dm_kobject_holder kobj_holder;
/* zero-length flush that will be cloned and submitted to targets */
struct bio flush_bio;
- /* the number of internal suspends */
- unsigned internal_suspend_count;
-
struct dm_stats stats;
struct kthread_worker kworker;
@@ -230,8 +219,9 @@ struct mapped_device {
ktime_t last_rq_start_time;
/* for blk-mq request-based DM support */
- struct blk_mq_tag_set tag_set;
- bool use_blk_mq;
+ struct blk_mq_tag_set *tag_set;
+ bool use_blk_mq:1;
+ bool init_tio_pdu:1;
};
#ifdef CONFIG_DM_MQ_DEFAULT
@@ -240,10 +230,19 @@ static bool use_blk_mq = true;
static bool use_blk_mq = false;
#endif
+#define DM_MQ_NR_HW_QUEUES 1
+#define DM_MQ_QUEUE_DEPTH 2048
+#define DM_NUMA_NODE NUMA_NO_NODE
+
+static unsigned dm_mq_nr_hw_queues = DM_MQ_NR_HW_QUEUES;
+static unsigned dm_mq_queue_depth = DM_MQ_QUEUE_DEPTH;
+static int dm_numa_node = DM_NUMA_NODE;
+
bool dm_use_blk_mq(struct mapped_device *md)
{
return md->use_blk_mq;
}
+EXPORT_SYMBOL_GPL(dm_use_blk_mq);
/*
* For mempools pre-allocation at the table loading time.
@@ -277,6 +276,27 @@ static unsigned reserved_bio_based_ios = RESERVED_BIO_BASED_IOS;
*/
static unsigned reserved_rq_based_ios = RESERVED_REQUEST_BASED_IOS;
+static int __dm_get_module_param_int(int *module_param, int min, int max)
+{
+ int param = ACCESS_ONCE(*module_param);
+ int modified_param = 0;
+ bool modified = true;
+
+ if (param < min)
+ modified_param = min;
+ else if (param > max)
+ modified_param = max;
+ else
+ modified = false;
+
+ if (modified) {
+ (void)cmpxchg(module_param, param, modified_param);
+ param = modified_param;
+ }
+
+ return param;
+}
+
static unsigned __dm_get_module_param(unsigned *module_param,
unsigned def, unsigned max)
{
@@ -310,6 +330,23 @@ unsigned dm_get_reserved_rq_based_ios(void)
}
EXPORT_SYMBOL_GPL(dm_get_reserved_rq_based_ios);
+static unsigned dm_get_blk_mq_nr_hw_queues(void)
+{
+ return __dm_get_module_param(&dm_mq_nr_hw_queues, 1, 32);
+}
+
+static unsigned dm_get_blk_mq_queue_depth(void)
+{
+ return __dm_get_module_param(&dm_mq_queue_depth,
+ DM_MQ_QUEUE_DEPTH, BLK_MQ_MAX_DEPTH);
+}
+
+static unsigned dm_get_numa_node(void)
+{
+ return __dm_get_module_param_int(&dm_numa_node,
+ DM_NUMA_NODE, num_online_nodes() - 1);
+}
+
static int __init local_init(void)
{
int r = -ENOMEM;
@@ -323,7 +360,7 @@ static int __init local_init(void)
if (!_rq_tio_cache)
goto out_free_io_cache;
- _rq_cache = kmem_cache_create("dm_clone_request", sizeof(struct request),
+ _rq_cache = kmem_cache_create("dm_old_clone_request", sizeof(struct request),
__alignof__(struct request), 0, NULL);
if (!_rq_cache)
goto out_free_rq_tio_cache;
@@ -556,16 +593,17 @@ static int dm_blk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
return dm_get_geometry(md, geo);
}
-static int dm_get_live_table_for_ioctl(struct mapped_device *md,
- struct dm_target **tgt, struct block_device **bdev,
- fmode_t *mode, int *srcu_idx)
+static int dm_grab_bdev_for_ioctl(struct mapped_device *md,
+ struct block_device **bdev,
+ fmode_t *mode)
{
+ struct dm_target *tgt;
struct dm_table *map;
- int r;
+ int srcu_idx, r;
retry:
r = -ENOTTY;
- map = dm_get_live_table(md, srcu_idx);
+ map = dm_get_live_table(md, &srcu_idx);
if (!map || !dm_table_get_size(map))
goto out;
@@ -573,9 +611,8 @@ retry:
if (dm_table_get_num_targets(map) != 1)
goto out;
- *tgt = dm_table_get_target(map, 0);
-
- if (!(*tgt)->type->prepare_ioctl)
+ tgt = dm_table_get_target(map, 0);
+ if (!tgt->type->prepare_ioctl)
goto out;
if (dm_suspended_md(md)) {
@@ -583,14 +620,16 @@ retry:
goto out;
}
- r = (*tgt)->type->prepare_ioctl(*tgt, bdev, mode);
+ r = tgt->type->prepare_ioctl(tgt, bdev, mode);
if (r < 0)
goto out;
+ bdgrab(*bdev);
+ dm_put_live_table(md, srcu_idx);
return r;
out:
- dm_put_live_table(md, *srcu_idx);
+ dm_put_live_table(md, srcu_idx);
if (r == -ENOTCONN && !fatal_signal_pending(current)) {
msleep(10);
goto retry;
@@ -602,11 +641,9 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
struct mapped_device *md = bdev->bd_disk->private_data;
- struct dm_target *tgt;
- struct block_device *tgt_bdev = NULL;
- int srcu_idx, r;
+ int r;
- r = dm_get_live_table_for_ioctl(md, &tgt, &tgt_bdev, &mode, &srcu_idx);
+ r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
if (r < 0)
return r;
@@ -621,9 +658,9 @@ static int dm_blk_ioctl(struct block_device *bdev, fmode_t mode,
goto out;
}
- r = __blkdev_driver_ioctl(tgt_bdev, mode, cmd, arg);
+ r = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
out:
- dm_put_live_table(md, srcu_idx);
+ bdput(bdev);
return r;
}
@@ -642,24 +679,24 @@ static void free_tio(struct mapped_device *md, struct dm_target_io *tio)
bio_put(&tio->clone);
}
-static struct dm_rq_target_io *alloc_rq_tio(struct mapped_device *md,
- gfp_t gfp_mask)
+static struct dm_rq_target_io *alloc_old_rq_tio(struct mapped_device *md,
+ gfp_t gfp_mask)
{
return mempool_alloc(md->io_pool, gfp_mask);
}
-static void free_rq_tio(struct dm_rq_target_io *tio)
+static void free_old_rq_tio(struct dm_rq_target_io *tio)
{
mempool_free(tio, tio->md->io_pool);
}
-static struct request *alloc_clone_request(struct mapped_device *md,
- gfp_t gfp_mask)
+static struct request *alloc_old_clone_request(struct mapped_device *md,
+ gfp_t gfp_mask)
{
return mempool_alloc(md->rq_pool, gfp_mask);
}
-static void free_clone_request(struct mapped_device *md, struct request *rq)
+static void free_old_clone_request(struct mapped_device *md, struct request *rq)
{
mempool_free(rq, md->rq_pool);
}
@@ -827,7 +864,7 @@ int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
mutex_lock(&md->table_devices_lock);
td = find_table_device(&md->table_devices, dev, mode);
if (!td) {
- td = kmalloc(sizeof(*td), GFP_KERNEL);
+ td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
if (!td) {
mutex_unlock(&md->table_devices_lock);
return -ENOMEM;
@@ -1109,12 +1146,8 @@ static void rq_completed(struct mapped_device *md, int rw, bool run_queue)
* back into ->request_fn() could deadlock attempting to grab the
* queue lock again.
*/
- if (run_queue) {
- if (md->queue->mq_ops)
- blk_mq_run_hw_queues(md->queue, true);
- else
- blk_run_queue_async(md->queue);
- }
+ if (!md->queue->mq_ops && run_queue)
+ blk_run_queue_async(md->queue);
/*
* dm_put() must be at the end of this function. See the comment above
@@ -1134,15 +1167,10 @@ static void free_rq_clone(struct request *clone)
tio->ti->type->release_clone_rq(clone);
else if (!md->queue->mq_ops)
/* request_fn queue stacked on request_fn queue(s) */
- free_clone_request(md, clone);
- /*
- * NOTE: for the blk-mq queue stacked on request_fn queue(s) case:
- * no need to call free_clone_request() because we leverage blk-mq by
- * allocating the clone at the end of the blk-mq pdu (see: clone_rq)
- */
+ free_old_clone_request(md, clone);
if (!md->queue->mq_ops)
- free_rq_tio(tio);
+ free_old_rq_tio(tio);
}
/*
@@ -1191,12 +1219,14 @@ static void dm_unprep_request(struct request *rq)
if (clone)
free_rq_clone(clone);
+ else if (!tio->md->queue->mq_ops)
+ free_old_rq_tio(tio);
}
/*
* Requeue the original request of a clone.
*/
-static void old_requeue_request(struct request *rq)
+static void dm_old_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
unsigned long flags;
@@ -1207,45 +1237,57 @@ static void old_requeue_request(struct request *rq)
spin_unlock_irqrestore(q->queue_lock, flags);
}
+static void dm_mq_requeue_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ unsigned long flags;
+
+ blk_mq_requeue_request(rq);
+ spin_lock_irqsave(q->queue_lock, flags);
+ if (!blk_queue_stopped(q))
+ blk_mq_kick_requeue_list(q);
+ spin_unlock_irqrestore(q->queue_lock, flags);
+}
+
static void dm_requeue_original_request(struct mapped_device *md,
struct request *rq)
{
int rw = rq_data_dir(rq);
+ rq_end_stats(md, rq);
dm_unprep_request(rq);
- rq_end_stats(md, rq);
if (!rq->q->mq_ops)
- old_requeue_request(rq);
- else {
- blk_mq_requeue_request(rq);
- blk_mq_kick_requeue_list(rq->q);
- }
+ dm_old_requeue_request(rq);
+ else
+ dm_mq_requeue_request(rq);
rq_completed(md, rw, false);
}
-static void old_stop_queue(struct request_queue *q)
+static void dm_old_stop_queue(struct request_queue *q)
{
unsigned long flags;
- if (blk_queue_stopped(q))
+ spin_lock_irqsave(q->queue_lock, flags);
+ if (blk_queue_stopped(q)) {
+ spin_unlock_irqrestore(q->queue_lock, flags);
return;
+ }
- spin_lock_irqsave(q->queue_lock, flags);
blk_stop_queue(q);
spin_unlock_irqrestore(q->queue_lock, flags);
}
-static void stop_queue(struct request_queue *q)
+static void dm_stop_queue(struct request_queue *q)
{
if (!q->mq_ops)
- old_stop_queue(q);
+ dm_old_stop_queue(q);
else
blk_mq_stop_hw_queues(q);
}
-static void old_start_queue(struct request_queue *q)
+static void dm_old_start_queue(struct request_queue *q)
{
unsigned long flags;
@@ -1255,12 +1297,14 @@ static void old_start_queue(struct request_queue *q)
spin_unlock_irqrestore(q->queue_lock, flags);
}
-static void start_queue(struct request_queue *q)
+static void dm_start_queue(struct request_queue *q)
{
if (!q->mq_ops)
- old_start_queue(q);
- else
+ dm_old_start_queue(q);
+ else {
blk_mq_start_stopped_hw_queues(q, true);
+ blk_mq_kick_requeue_list(q);
+ }
}
static void dm_done(struct request *clone, int error, bool mapped)
@@ -1311,7 +1355,7 @@ static void dm_softirq_done(struct request *rq)
if (!rq->q->mq_ops) {
blk_end_request_all(rq, tio->error);
rq_completed(tio->md, rw, false);
- free_rq_tio(tio);
+ free_old_rq_tio(tio);
} else {
blk_mq_end_request(rq, tio->error);
rq_completed(tio->md, rw, false);
@@ -1334,7 +1378,10 @@ static void dm_complete_request(struct request *rq, int error)
struct dm_rq_target_io *tio = tio_from_request(rq);
tio->error = error;
- blk_complete_request(rq);
+ if (!rq->q->mq_ops)
+ blk_complete_request(rq);
+ else
+ blk_mq_complete_request(rq, error);
}
/*
@@ -1350,7 +1397,7 @@ static void dm_kill_unmapped_request(struct request *rq, int error)
}
/*
- * Called with the clone's queue lock held (for non-blk-mq)
+ * Called with the clone's queue lock held (in the case of .request_fn)
*/
static void end_clone_request(struct request *clone, int error)
{
@@ -1520,21 +1567,26 @@ static void bio_setup_sector(struct bio *bio, sector_t sector, unsigned len)
/*
* Creates a bio that consists of range of complete bvecs.
*/
-static void clone_bio(struct dm_target_io *tio, struct bio *bio,
- sector_t sector, unsigned len)
+static int clone_bio(struct dm_target_io *tio, struct bio *bio,
+ sector_t sector, unsigned len)
{
struct bio *clone = &tio->clone;
__bio_clone_fast(clone, bio);
- if (bio_integrity(bio))
- bio_integrity_clone(clone, bio, GFP_NOIO);
+ if (bio_integrity(bio)) {
+ int r = bio_integrity_clone(clone, bio, GFP_NOIO);
+ if (r < 0)
+ return r;
+ }
bio_advance(clone, to_bytes(sector - clone->bi_iter.bi_sector));
clone->bi_iter.bi_size = to_bytes(len);
if (bio_integrity(bio))
bio_integrity_trim(clone, 0, len);
+
+ return 0;
}
static struct dm_target_io *alloc_tio(struct clone_info *ci,
@@ -1591,13 +1643,14 @@ static int __send_empty_flush(struct clone_info *ci)
return 0;
}
-static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
+static int __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti,
sector_t sector, unsigned *len)
{
struct bio *bio = ci->bio;
struct dm_target_io *tio;
unsigned target_bio_nr;
unsigned num_target_bios = 1;
+ int r = 0;
/*
* Does the target want to receive duplicate copies of the bio?
@@ -1608,9 +1661,13 @@ static void __clone_and_map_data_bio(struct clone_info *ci, struct dm_target *ti
for (target_bio_nr = 0; target_bio_nr < num_target_bios; target_bio_nr++) {
tio = alloc_tio(ci, ti, target_bio_nr);
tio->len_ptr = len;
- clone_bio(tio, bio, sector, *len);
+ r = clone_bio(tio, bio, sector, *len);
+ if (r < 0)
+ break;
__map_bio(tio);
}
+
+ return r;
}
typedef unsigned (*get_num_bios_fn)(struct dm_target *ti);
@@ -1687,6 +1744,7 @@ static int __split_and_process_non_flush(struct clone_info *ci)
struct bio *bio = ci->bio;
struct dm_target *ti;
unsigned len;
+ int r;
if (unlikely(bio->bi_rw & REQ_DISCARD))
return __send_discard(ci);
@@ -1699,7 +1757,9 @@ static int __split_and_process_non_flush(struct clone_info *ci)
len = min_t(sector_t, max_io_len(ci->sector, ti), ci->sector_count);
- __clone_and_map_data_bio(ci, ti, ci->sector, &len);
+ r = __clone_and_map_data_bio(ci, ti, ci->sector, &len);
+ if (r < 0)
+ return r;
ci->sector += len;
ci->sector_count -= len;
@@ -1837,28 +1897,22 @@ static int setup_clone(struct request *clone, struct request *rq,
return 0;
}
-static struct request *clone_rq(struct request *rq, struct mapped_device *md,
- struct dm_rq_target_io *tio, gfp_t gfp_mask)
+static struct request *clone_old_rq(struct request *rq, struct mapped_device *md,
+ struct dm_rq_target_io *tio, gfp_t gfp_mask)
{
/*
- * Do not allocate a clone if tio->clone was already set
- * (see: dm_mq_queue_rq).
+ * Create clone for use with .request_fn request_queue
*/
- bool alloc_clone = !tio->clone;
struct request *clone;
- if (alloc_clone) {
- clone = alloc_clone_request(md, gfp_mask);
- if (!clone)
- return NULL;
- } else
- clone = tio->clone;
+ clone = alloc_old_clone_request(md, gfp_mask);
+ if (!clone)
+ return NULL;
blk_rq_init(NULL, clone);
if (setup_clone(clone, rq, tio, gfp_mask)) {
/* -ENOMEM */
- if (alloc_clone)
- free_clone_request(md, clone);
+ free_old_clone_request(md, clone);
return NULL;
}
@@ -1875,29 +1929,40 @@ static void init_tio(struct dm_rq_target_io *tio, struct request *rq,
tio->clone = NULL;
tio->orig = rq;
tio->error = 0;
- memset(&tio->info, 0, sizeof(tio->info));
+ /*
+ * Avoid initializing info for blk-mq; it passes
+ * target-specific data through info.ptr
+ * (see: dm_mq_init_request)
+ */
+ if (!md->init_tio_pdu)
+ memset(&tio->info, 0, sizeof(tio->info));
if (md->kworker_task)
init_kthread_work(&tio->work, map_tio_request);
}
-static struct dm_rq_target_io *prep_tio(struct request *rq,
- struct mapped_device *md, gfp_t gfp_mask)
+static struct dm_rq_target_io *dm_old_prep_tio(struct request *rq,
+ struct mapped_device *md,
+ gfp_t gfp_mask)
{
struct dm_rq_target_io *tio;
int srcu_idx;
struct dm_table *table;
- tio = alloc_rq_tio(md, gfp_mask);
+ tio = alloc_old_rq_tio(md, gfp_mask);
if (!tio)
return NULL;
init_tio(tio, rq, md);
table = dm_get_live_table(md, &srcu_idx);
+ /*
+ * Must clone a request if this .request_fn DM device
+ * is stacked on .request_fn device(s).
+ */
if (!dm_table_mq_request_based(table)) {
- if (!clone_rq(rq, md, tio, gfp_mask)) {
+ if (!clone_old_rq(rq, md, tio, gfp_mask)) {
dm_put_live_table(md, srcu_idx);
- free_rq_tio(tio);
+ free_old_rq_tio(tio);
return NULL;
}
}
@@ -1909,7 +1974,7 @@ static struct dm_rq_target_io *prep_tio(struct request *rq,
/*
* Called with the queue lock held.
*/
-static int dm_prep_fn(struct request_queue *q, struct request *rq)
+static int dm_old_prep_fn(struct request_queue *q, struct request *rq)
{
struct mapped_device *md = q->queuedata;
struct dm_rq_target_io *tio;
@@ -1919,7 +1984,7 @@ static int dm_prep_fn(struct request_queue *q, struct request *rq)
return BLKPREP_KILL;
}
- tio = prep_tio(rq, md, GFP_ATOMIC);
+ tio = dm_old_prep_tio(rq, md, GFP_ATOMIC);
if (!tio)
return BLKPREP_DEFER;
@@ -2077,12 +2142,18 @@ static bool dm_request_peeked_before_merge_deadline(struct mapped_device *md)
static void dm_request_fn(struct request_queue *q)
{
struct mapped_device *md = q->queuedata;
- int srcu_idx;
- struct dm_table *map = dm_get_live_table(md, &srcu_idx);
- struct dm_target *ti;
+ struct dm_target *ti = md->immutable_target;
struct request *rq;
struct dm_rq_target_io *tio;
- sector_t pos;
+ sector_t pos = 0;
+
+ if (unlikely(!ti)) {
+ int srcu_idx;
+ struct dm_table *map = dm_get_live_table(md, &srcu_idx);
+
+ ti = dm_table_find_target(map, pos);
+ dm_put_live_table(md, srcu_idx);
+ }
/*
* For suspend, check blk_queue_stopped() and increment
@@ -2093,33 +2164,21 @@ static void dm_request_fn(struct request_queue *q)
while (!blk_queue_stopped(q)) {
rq = blk_peek_request(q);
if (!rq)
- goto out;
+ return;
/* always use block 0 to find the target for flushes for now */
pos = 0;
if (!(rq->cmd_flags & REQ_FLUSH))
pos = blk_rq_pos(rq);
- ti = dm_table_find_target(map, pos);
- if (!dm_target_is_valid(ti)) {
- /*
- * Must perform setup, that rq_completed() requires,
- * before calling dm_kill_unmapped_request
- */
- DMERR_LIMIT("request attempted access beyond the end of device");
- dm_start_request(md, rq);
- dm_kill_unmapped_request(rq, -EIO);
- continue;
+ if ((dm_request_peeked_before_merge_deadline(md) &&
+ md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
+ md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq)) ||
+ (ti->type->busy && ti->type->busy(ti))) {
+ blk_delay_queue(q, HZ / 100);
+ return;
}
- if (dm_request_peeked_before_merge_deadline(md) &&
- md_in_flight(md) && rq->bio && rq->bio->bi_vcnt == 1 &&
- md->last_rq_pos == pos && md->last_rq_rw == rq_data_dir(rq))
- goto delay_and_out;
-
- if (ti->type->busy && ti->type->busy(ti))
- goto delay_and_out;
-
dm_start_request(md, rq);
tio = tio_from_request(rq);
@@ -2128,13 +2187,6 @@ static void dm_request_fn(struct request_queue *q)
queue_kthread_work(&md->kworker, &tio->work);
BUG_ON(!irqs_disabled());
}
-
- goto out;
-
-delay_and_out:
- blk_delay_queue(q, HZ / 100);
-out:
- dm_put_live_table(md, srcu_idx);
}
static int dm_any_congested(void *congested_data, int bdi_bits)
@@ -2144,19 +2196,18 @@ static int dm_any_congested(void *congested_data, int bdi_bits)
struct dm_table *map;
if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
- map = dm_get_live_table_fast(md);
- if (map) {
+ if (dm_request_based(md)) {
/*
- * Request-based dm cares about only own queue for
- * the query about congestion status of request_queue
+ * With request-based DM we only need to check the
+ * top-level queue for congestion.
*/
- if (dm_request_based(md))
- r = md->queue->backing_dev_info.wb.state &
- bdi_bits;
- else
+ r = md->queue->backing_dev_info.wb.state & bdi_bits;
+ } else {
+ map = dm_get_live_table_fast(md);
+ if (map)
r = dm_table_any_congested(map, bdi_bits);
+ dm_put_live_table_fast(md);
}
- dm_put_live_table_fast(md);
}
return r;
@@ -2236,7 +2287,7 @@ static void dm_init_md_queue(struct mapped_device *md)
md->queue->backing_dev_info.congested_data = md;
}
-static void dm_init_old_md_queue(struct mapped_device *md)
+static void dm_init_normal_md_queue(struct mapped_device *md)
{
md->use_blk_mq = false;
dm_init_md_queue(md);
@@ -2283,10 +2334,11 @@ static void cleanup_mapped_device(struct mapped_device *md)
*/
static struct mapped_device *alloc_dev(int minor)
{
- int r;
- struct mapped_device *md = kzalloc(sizeof(*md), GFP_KERNEL);
+ int r, numa_node_id = dm_get_numa_node();
+ struct mapped_device *md;
void *old_md;
+ md = kzalloc_node(sizeof(*md), GFP_KERNEL, numa_node_id);
if (!md) {
DMWARN("unable to allocate device, out of memory.");
return NULL;
@@ -2307,7 +2359,9 @@ static struct mapped_device *alloc_dev(int minor)
if (r < 0)
goto bad_io_barrier;
+ md->numa_node_id = numa_node_id;
md->use_blk_mq = use_blk_mq;
+ md->init_tio_pdu = false;
md->type = DM_TYPE_NONE;
mutex_init(&md->suspend_lock);
mutex_init(&md->type_lock);
@@ -2321,13 +2375,13 @@ static struct mapped_device *alloc_dev(int minor)
INIT_LIST_HEAD(&md->table_devices);
spin_lock_init(&md->uevent_lock);
- md->queue = blk_alloc_queue(GFP_KERNEL);
+ md->queue = blk_alloc_queue_node(GFP_KERNEL, numa_node_id);
if (!md->queue)
goto bad;
dm_init_md_queue(md);
- md->disk = alloc_disk(1);
+ md->disk = alloc_disk_node(1, numa_node_id);
if (!md->disk)
goto bad;
@@ -2391,8 +2445,10 @@ static void free_dev(struct mapped_device *md)
unlock_fs(md);
cleanup_mapped_device(md);
- if (md->use_blk_mq)
- blk_mq_free_tag_set(&md->tag_set);
+ if (md->tag_set) {
+ blk_mq_free_tag_set(md->tag_set);
+ kfree(md->tag_set);
+ }
free_table_devices(&md->table_devices);
dm_stats_cleanup(&md->stats);
@@ -2500,13 +2556,20 @@ static struct dm_table *__bind(struct mapped_device *md, struct dm_table *t,
* This must be done before setting the queue restrictions,
* because request-based dm may be run just after the setting.
*/
- if (dm_table_request_based(t))
- stop_queue(q);
+ if (dm_table_request_based(t)) {
+ dm_stop_queue(q);
+ /*
+ * Leverage the fact that request-based DM targets are
+ * immutable singletons and establish md->immutable_target
+ * - used to optimize both dm_request_fn and dm_mq_queue_rq
+ */
+ md->immutable_target = dm_table_get_immutable_target(t);
+ }
__bind_mempools(md, t);
old_map = rcu_dereference_protected(md->map, lockdep_is_held(&md->suspend_lock));
- rcu_assign_pointer(md->map, t);
+ rcu_assign_pointer(md->map, (void *)t);
md->immutable_target_type = dm_table_get_immutable_target_type(t);
dm_table_set_restrictions(t, q, limits);
@@ -2572,7 +2635,6 @@ void dm_set_md_type(struct mapped_device *md, unsigned type)
unsigned dm_get_md_type(struct mapped_device *md)
{
- BUG_ON(!mutex_is_locked(&md->type_lock));
return md->type;
}
@@ -2592,7 +2654,7 @@ struct queue_limits *dm_get_queue_limits(struct mapped_device *md)
}
EXPORT_SYMBOL_GPL(dm_get_queue_limits);
-static void init_rq_based_worker_thread(struct mapped_device *md)
+static void dm_old_init_rq_based_worker_thread(struct mapped_device *md)
{
/* Initialize the request-based DM worker thread */
init_kthread_worker(&md->kworker);
@@ -2601,26 +2663,22 @@ static void init_rq_based_worker_thread(struct mapped_device *md)
}
/*
- * Fully initialize a request-based queue (->elevator, ->request_fn, etc).
+ * Fully initialize a .request_fn request-based queue.
*/
-static int dm_init_request_based_queue(struct mapped_device *md)
+static int dm_old_init_request_queue(struct mapped_device *md)
{
- struct request_queue *q = NULL;
-
/* Fully initialize the queue */
- q = blk_init_allocated_queue(md->queue, dm_request_fn, NULL);
- if (!q)
+ if (!blk_init_allocated_queue(md->queue, dm_request_fn, NULL))
return -EINVAL;
/* disable dm_request_fn's merge heuristic by default */
md->seq_rq_merge_deadline_usecs = 0;
- md->queue = q;
- dm_init_old_md_queue(md);
+ dm_init_normal_md_queue(md);
blk_queue_softirq_done(md->queue, dm_softirq_done);
- blk_queue_prep_rq(md->queue, dm_prep_fn);
+ blk_queue_prep_rq(md->queue, dm_old_prep_fn);
- init_rq_based_worker_thread(md);
+ dm_old_init_rq_based_worker_thread(md);
elv_register_queue(md->queue);
@@ -2640,6 +2698,11 @@ static int dm_mq_init_request(void *data, struct request *rq,
*/
tio->md = md;
+ if (md->init_tio_pdu) {
+ /* target-specific per-io data is immediately after the tio */
+ tio->info.ptr = tio + 1;
+ }
+
return 0;
}
@@ -2649,28 +2712,15 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
struct request *rq = bd->rq;
struct dm_rq_target_io *tio = blk_mq_rq_to_pdu(rq);
struct mapped_device *md = tio->md;
- int srcu_idx;
- struct dm_table *map = dm_get_live_table(md, &srcu_idx);
- struct dm_target *ti;
- sector_t pos;
+ struct dm_target *ti = md->immutable_target;
- /* always use block 0 to find the target for flushes for now */
- pos = 0;
- if (!(rq->cmd_flags & REQ_FLUSH))
- pos = blk_rq_pos(rq);
+ if (unlikely(!ti)) {
+ int srcu_idx;
+ struct dm_table *map = dm_get_live_table(md, &srcu_idx);
- ti = dm_table_find_target(map, pos);
- if (!dm_target_is_valid(ti)) {
+ ti = dm_table_find_target(map, 0);
dm_put_live_table(md, srcu_idx);
- DMERR_LIMIT("request attempted access beyond the end of device");
- /*
- * Must perform setup, that rq_completed() requires,
- * before returning BLK_MQ_RQ_QUEUE_ERROR
- */
- dm_start_request(md, rq);
- return BLK_MQ_RQ_QUEUE_ERROR;
}
- dm_put_live_table(md, srcu_idx);
if (ti->type->busy && ti->type->busy(ti))
return BLK_MQ_RQ_QUEUE_BUSY;
@@ -2686,20 +2736,12 @@ static int dm_mq_queue_rq(struct blk_mq_hw_ctx *hctx,
*/
tio->ti = ti;
- /* Clone the request if underlying devices aren't blk-mq */
- if (dm_table_get_type(map) == DM_TYPE_REQUEST_BASED) {
- /* clone request is allocated at the end of the pdu */
- tio->clone = (void *)blk_mq_rq_to_pdu(rq) + sizeof(struct dm_rq_target_io);
- (void) clone_rq(rq, md, tio, GFP_ATOMIC);
- queue_kthread_work(&md->kworker, &tio->work);
- } else {
- /* Direct call is fine since .queue_rq allows allocations */
- if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
- /* Undo dm_start_request() before requeuing */
- rq_end_stats(md, rq);
- rq_completed(md, rq_data_dir(rq), false);
- return BLK_MQ_RQ_QUEUE_BUSY;
- }
+ /* Direct call is fine since .queue_rq allows allocations */
+ if (map_request(tio, rq, md) == DM_MAPIO_REQUEUE) {
+ /* Undo dm_start_request() before requeuing */
+ rq_end_stats(md, rq);
+ rq_completed(md, rq_data_dir(rq), false);
+ return BLK_MQ_RQ_QUEUE_BUSY;
}
return BLK_MQ_RQ_QUEUE_OK;
@@ -2712,47 +2754,56 @@ static struct blk_mq_ops dm_mq_ops = {
.init_request = dm_mq_init_request,
};
-static int dm_init_request_based_blk_mq_queue(struct mapped_device *md)
+static int dm_mq_init_request_queue(struct mapped_device *md,
+ struct dm_target *immutable_tgt)
{
- unsigned md_type = dm_get_md_type(md);
struct request_queue *q;
int err;
- memset(&md->tag_set, 0, sizeof(md->tag_set));
- md->tag_set.ops = &dm_mq_ops;
- md->tag_set.queue_depth = BLKDEV_MAX_RQ;
- md->tag_set.numa_node = NUMA_NO_NODE;
- md->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
- md->tag_set.nr_hw_queues = 1;
- if (md_type == DM_TYPE_REQUEST_BASED) {
- /* make the memory for non-blk-mq clone part of the pdu */
- md->tag_set.cmd_size = sizeof(struct dm_rq_target_io) + sizeof(struct request);
- } else
- md->tag_set.cmd_size = sizeof(struct dm_rq_target_io);
- md->tag_set.driver_data = md;
-
- err = blk_mq_alloc_tag_set(&md->tag_set);
+ if (dm_get_md_type(md) == DM_TYPE_REQUEST_BASED) {
+ DMERR("request-based dm-mq may only be stacked on blk-mq device(s)");
+ return -EINVAL;
+ }
+
+ md->tag_set = kzalloc_node(sizeof(struct blk_mq_tag_set), GFP_KERNEL, md->numa_node_id);
+ if (!md->tag_set)
+ return -ENOMEM;
+
+ md->tag_set->ops = &dm_mq_ops;
+ md->tag_set->queue_depth = dm_get_blk_mq_queue_depth();
+ md->tag_set->numa_node = md->numa_node_id;
+ md->tag_set->flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE;
+ md->tag_set->nr_hw_queues = dm_get_blk_mq_nr_hw_queues();
+ md->tag_set->driver_data = md;
+
+ md->tag_set->cmd_size = sizeof(struct dm_rq_target_io);
+ if (immutable_tgt && immutable_tgt->per_io_data_size) {
+ /* any target-specific per-io data is immediately after the tio */
+ md->tag_set->cmd_size += immutable_tgt->per_io_data_size;
+ md->init_tio_pdu = true;
+ }
+
+ err = blk_mq_alloc_tag_set(md->tag_set);
if (err)
- return err;
+ goto out_kfree_tag_set;
- q = blk_mq_init_allocated_queue(&md->tag_set, md->queue);
+ q = blk_mq_init_allocated_queue(md->tag_set, md->queue);
if (IS_ERR(q)) {
err = PTR_ERR(q);
goto out_tag_set;
}
- md->queue = q;
dm_init_md_queue(md);
/* backfill 'mq' sysfs registration normally done in blk_register_queue */
blk_mq_register_disk(md->disk);
- if (md_type == DM_TYPE_REQUEST_BASED)
- init_rq_based_worker_thread(md);
-
return 0;
out_tag_set:
- blk_mq_free_tag_set(&md->tag_set);
+ blk_mq_free_tag_set(md->tag_set);
+out_kfree_tag_set:
+ kfree(md->tag_set);
+
return err;
}
@@ -2767,28 +2818,28 @@ static unsigned filter_md_type(unsigned type, struct mapped_device *md)
/*
* Setup the DM device's queue based on md's type
*/
-int dm_setup_md_queue(struct mapped_device *md)
+int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
{
int r;
unsigned md_type = filter_md_type(dm_get_md_type(md), md);
switch (md_type) {
case DM_TYPE_REQUEST_BASED:
- r = dm_init_request_based_queue(md);
+ r = dm_old_init_request_queue(md);
if (r) {
- DMWARN("Cannot initialize queue for request-based mapped device");
+ DMERR("Cannot initialize queue for request-based mapped device");
return r;
}
break;
case DM_TYPE_MQ_REQUEST_BASED:
- r = dm_init_request_based_blk_mq_queue(md);
+ r = dm_mq_init_request_queue(md, dm_table_get_immutable_target(t));
if (r) {
- DMWARN("Cannot initialize queue for request-based blk-mq mapped device");
+ DMERR("Cannot initialize queue for request-based dm-mq mapped device");
return r;
}
break;
case DM_TYPE_BIO_BASED:
- dm_init_old_md_queue(md);
+ dm_init_normal_md_queue(md);
blk_queue_make_request(md->queue, dm_make_request);
/*
* DM handles splitting bios as needed. Free the bio_split bioset
@@ -3131,7 +3182,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
* dm defers requests to md->wq from md->queue.
*/
if (dm_request_based(md)) {
- stop_queue(md->queue);
+ dm_stop_queue(md->queue);
if (md->kworker_task)
flush_kthread_worker(&md->kworker);
}
@@ -3155,7 +3206,7 @@ static int __dm_suspend(struct mapped_device *md, struct dm_table *map,
dm_queue_flush(md);
if (dm_request_based(md))
- start_queue(md->queue);
+ dm_start_queue(md->queue);
unlock_fs(md);
dm_table_presuspend_undo_targets(map);
@@ -3234,7 +3285,7 @@ static int __dm_resume(struct mapped_device *md, struct dm_table *map)
* Request-based dm is queueing the deferred I/Os in its request_queue.
*/
if (dm_request_based(md))
- start_queue(md->queue);
+ dm_start_queue(md->queue);
unlock_fs(md);
@@ -3480,9 +3531,9 @@ int dm_noflush_suspending(struct dm_target *ti)
EXPORT_SYMBOL_GPL(dm_noflush_suspending);
struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned type,
- unsigned integrity, unsigned per_bio_data_size)
+ unsigned integrity, unsigned per_io_data_size)
{
- struct dm_md_mempools *pools = kzalloc(sizeof(*pools), GFP_KERNEL);
+ struct dm_md_mempools *pools = kzalloc_node(sizeof(*pools), GFP_KERNEL, md->numa_node_id);
struct kmem_cache *cachep = NULL;
unsigned int pool_size = 0;
unsigned int front_pad;
@@ -3496,7 +3547,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
case DM_TYPE_BIO_BASED:
cachep = _io_cache;
pool_size = dm_get_reserved_bio_based_ios();
- front_pad = roundup(per_bio_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
+ front_pad = roundup(per_io_data_size, __alignof__(struct dm_target_io)) + offsetof(struct dm_target_io, clone);
break;
case DM_TYPE_REQUEST_BASED:
cachep = _rq_tio_cache;
@@ -3509,8 +3560,7 @@ struct dm_md_mempools *dm_alloc_md_mempools(struct mapped_device *md, unsigned t
if (!pool_size)
pool_size = dm_get_reserved_rq_based_ios();
front_pad = offsetof(struct dm_rq_clone_bio_info, clone);
- /* per_bio_data_size is not used. See __bind_mempools(). */
- WARN_ON(per_bio_data_size != 0);
+ /* per_io_data_size is used for blk-mq pdu at queue allocation */
break;
default:
BUG();
@@ -3552,15 +3602,14 @@ void dm_free_md_mempools(struct dm_md_mempools *pools)
}
static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
- u32 flags)
+ u32 flags)
{
struct mapped_device *md = bdev->bd_disk->private_data;
const struct pr_ops *ops;
- struct dm_target *tgt;
fmode_t mode;
- int srcu_idx, r;
+ int r;
- r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
if (r < 0)
return r;
@@ -3570,20 +3619,19 @@ static int dm_pr_register(struct block_device *bdev, u64 old_key, u64 new_key,
else
r = -EOPNOTSUPP;
- dm_put_live_table(md, srcu_idx);
+ bdput(bdev);
return r;
}
static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
- u32 flags)
+ u32 flags)
{
struct mapped_device *md = bdev->bd_disk->private_data;
const struct pr_ops *ops;
- struct dm_target *tgt;
fmode_t mode;
- int srcu_idx, r;
+ int r;
- r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
if (r < 0)
return r;
@@ -3593,7 +3641,7 @@ static int dm_pr_reserve(struct block_device *bdev, u64 key, enum pr_type type,
else
r = -EOPNOTSUPP;
- dm_put_live_table(md, srcu_idx);
+ bdput(bdev);
return r;
}
@@ -3601,11 +3649,10 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
{
struct mapped_device *md = bdev->bd_disk->private_data;
const struct pr_ops *ops;
- struct dm_target *tgt;
fmode_t mode;
- int srcu_idx, r;
+ int r;
- r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
if (r < 0)
return r;
@@ -3615,20 +3662,19 @@ static int dm_pr_release(struct block_device *bdev, u64 key, enum pr_type type)
else
r = -EOPNOTSUPP;
- dm_put_live_table(md, srcu_idx);
+ bdput(bdev);
return r;
}
static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
- enum pr_type type, bool abort)
+ enum pr_type type, bool abort)
{
struct mapped_device *md = bdev->bd_disk->private_data;
const struct pr_ops *ops;
- struct dm_target *tgt;
fmode_t mode;
- int srcu_idx, r;
+ int r;
- r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
if (r < 0)
return r;
@@ -3638,7 +3684,7 @@ static int dm_pr_preempt(struct block_device *bdev, u64 old_key, u64 new_key,
else
r = -EOPNOTSUPP;
- dm_put_live_table(md, srcu_idx);
+ bdput(bdev);
return r;
}
@@ -3646,11 +3692,10 @@ static int dm_pr_clear(struct block_device *bdev, u64 key)
{
struct mapped_device *md = bdev->bd_disk->private_data;
const struct pr_ops *ops;
- struct dm_target *tgt;
fmode_t mode;
- int srcu_idx, r;
+ int r;
- r = dm_get_live_table_for_ioctl(md, &tgt, &bdev, &mode, &srcu_idx);
+ r = dm_grab_bdev_for_ioctl(md, &bdev, &mode);
if (r < 0)
return r;
@@ -3660,7 +3705,7 @@ static int dm_pr_clear(struct block_device *bdev, u64 key)
else
r = -EOPNOTSUPP;
- dm_put_live_table(md, srcu_idx);
+ bdput(bdev);
return r;
}
@@ -3699,6 +3744,15 @@ MODULE_PARM_DESC(reserved_rq_based_ios, "Reserved IOs in request-based mempools"
module_param(use_blk_mq, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(use_blk_mq, "Use block multiqueue for request-based DM devices");
+module_param(dm_mq_nr_hw_queues, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_mq_nr_hw_queues, "Number of hardware queues for request-based dm-mq devices");
+
+module_param(dm_mq_queue_depth, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_mq_queue_depth, "Queue depth for request-based dm-mq devices");
+
+module_param(dm_numa_node, int, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(dm_numa_node, "NUMA node for DM device memory allocations");
+
MODULE_DESCRIPTION(DM_NAME " driver");
MODULE_AUTHOR("Joe Thornber <dm-devel@redhat.com>");
MODULE_LICENSE("GPL");