diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2016-10-07 23:42:05 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2016-10-07 23:42:05 +0200 |
commit | 513a4befae06c4469abfb836e8f71977de58c636 (patch) | |
tree | 18cc7d0b01a7fd2352de734e99a4ca5c29ad5fac | |
parent | Merge branch 'i2c/for-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/w... (diff) | |
parent | fs/block_dev.c: return the right error in thaw_bdev() (diff) | |
download | linux-513a4befae06c4469abfb836e8f71977de58c636.tar.xz linux-513a4befae06c4469abfb836e8f71977de58c636.zip |
Merge branch 'for-4.9/block' of git://git.kernel.dk/linux-block
Pull block layer updates from Jens Axboe:
"This is the main pull request for block layer changes in 4.9.
As mentioned at the last merge window, I've changed things up and now
do just one branch for core block layer changes, and driver changes.
This avoids dependencies between the two branches. Outside of this
main pull request, there are two topical branches coming as well.
This pull request contains:
- A set of fixes, and a conversion to blk-mq, of nbd. From Josef.
- Set of fixes and updates for lightnvm from Matias, Simon, and Arnd.
Followup dependency fix from Geert.
- General fixes from Bart, Baoyou, Guoqing, and Linus W.
- CFQ async write starvation fix from Glauber.
- Add supprot for delayed kick of the requeue list, from Mike.
- Pull out the scalable bitmap code from blk-mq-tag.c and make it
generally available under the name of sbitmap. Only blk-mq-tag uses
it for now, but the blk-mq scheduling bits will use it as well.
From Omar.
- bdev thaw error progagation from Pierre.
- Improve the blk polling statistics, and allow the user to clear
them. From Stephen.
- Set of minor cleanups from Christoph in block/blk-mq.
- Set of cleanups and optimizations from me for block/blk-mq.
- Various nvme/nvmet/nvmeof fixes from the various folks"
* 'for-4.9/block' of git://git.kernel.dk/linux-block: (54 commits)
fs/block_dev.c: return the right error in thaw_bdev()
nvme: Pass pointers, not dma addresses, to nvme_get/set_features()
nvme/scsi: Remove power management support
nvmet: Make dsm number of ranges zero based
nvmet: Use direct IO for writes
admin-cmd: Added smart-log command support.
nvme-fabrics: Add host_traddr options field to host infrastructure
nvme-fabrics: revise host transport option descriptions
nvme-fabrics: rework nvmf_get_address() for variable options
nbd: use BLK_MQ_F_BLOCKING
blkcg: Annotate blkg_hint correctly
cfq: fix starvation of asynchronous writes
blk-mq: add flag for drivers wanting blocking ->queue_rq()
blk-mq: remove non-blocking pass in blk_mq_map_request
blk-mq: get rid of manual run of queue with __blk_mq_run_hw_queue()
block: export bio_free_pages to other modules
lightnvm: propagate device_add() error code
lightnvm: expose device geometry through sysfs
lightnvm: control life of nvm_dev in driver
blk-mq: register device instead of disk
...
53 files changed, 1828 insertions, 1170 deletions
diff --git a/Documentation/block/biodoc.txt b/Documentation/block/biodoc.txt index bcdb2b4c1f12..918e1e0d0e78 100644 --- a/Documentation/block/biodoc.txt +++ b/Documentation/block/biodoc.txt @@ -115,7 +115,7 @@ i. Per-queue limits/values exported to the generic layer by the driver Various parameters that the generic i/o scheduler logic uses are set at a per-queue level (e.g maximum request size, maximum number of segments in -a scatter-gather list, hardsect size) +a scatter-gather list, logical block size) Some parameters that were earlier available as global arrays indexed by major/minor are now directly associated with the queue. Some of these may @@ -156,7 +156,7 @@ Some new queue property settings: blk_queue_max_segment_size(q, max_seg_size) Maximum size of a clustered segment, 64kB default. - blk_queue_hardsect_size(q, hardsect_size) + blk_queue_logical_block_size(q, logical_block_size) Lowest possible sector size that the hardware can operate on, 512 bytes default. diff --git a/MAINTAINERS b/MAINTAINERS index 255655880881..fb4d381a6ecf 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -2472,6 +2472,7 @@ T: git git://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git S: Maintained F: block/ F: kernel/trace/blktrace.c +F: lib/sbitmap.c BLOCK2MTD DRIVER M: Joern Engel <joern@lazybastard.org> diff --git a/block/Kconfig b/block/Kconfig index 161491d0a879..5136ad4bb6d5 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -4,6 +4,7 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y + select SBITMAP help Provide block layer support for the kernel. diff --git a/block/bio.c b/block/bio.c index aa7354088008..db85c5753a76 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1068,7 +1068,7 @@ static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) return 0; } -static void bio_free_pages(struct bio *bio) +void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; int i; @@ -1076,6 +1076,7 @@ static void bio_free_pages(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) __free_page(bvec->bv_page); } +EXPORT_SYMBOL(bio_free_pages); /** * bio_uncopy_user - finish previously mapped bio @@ -1274,7 +1275,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, nr_pages += end - start; /* - * buffer must be aligned to at least hardsector size for now + * buffer must be aligned to at least logical block size for now */ if (uaddr & queue_dma_alignment(q)) return ERR_PTR(-EINVAL); diff --git a/block/blk-core.c b/block/blk-core.c index 36c7ac328d8c..14d7c0740dc0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -288,7 +288,7 @@ void blk_sync_queue(struct request_queue *q) int i; queue_for_each_hw_ctx(q, hctx, i) { - cancel_delayed_work_sync(&hctx->run_work); + cancel_work_sync(&hctx->run_work); cancel_delayed_work_sync(&hctx->delay_work); } } else { @@ -3097,6 +3097,12 @@ int kblockd_schedule_work(struct work_struct *work) } EXPORT_SYMBOL(kblockd_schedule_work); +int kblockd_schedule_work_on(int cpu, struct work_struct *work) +{ + return queue_work_on(cpu, kblockd_workqueue, work); +} +EXPORT_SYMBOL(kblockd_schedule_work_on); + int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay) { @@ -3301,19 +3307,23 @@ bool blk_poll(struct request_queue *q, blk_qc_t cookie) { struct blk_plug *plug; long state; + unsigned int queue_num; + struct blk_mq_hw_ctx *hctx; if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) || !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return false; + queue_num = blk_qc_t_to_queue_num(cookie); + hctx = q->queue_hw_ctx[queue_num]; + hctx->poll_considered++; + plug = current->plug; if (plug) blk_flush_plug_list(plug, false); state = current->state; while (!need_resched()) { - unsigned int queue_num = blk_qc_t_to_queue_num(cookie); - struct blk_mq_hw_ctx *hctx = q->queue_hw_ctx[queue_num]; int ret; hctx->poll_invoked++; diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index fe822aa5b8e4..01fb455d3377 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -176,7 +176,17 @@ static ssize_t blk_mq_sysfs_rq_list_show(struct blk_mq_ctx *ctx, char *page) static ssize_t blk_mq_hw_sysfs_poll_show(struct blk_mq_hw_ctx *hctx, char *page) { - return sprintf(page, "invoked=%lu, success=%lu\n", hctx->poll_invoked, hctx->poll_success); + return sprintf(page, "considered=%lu, invoked=%lu, success=%lu\n", + hctx->poll_considered, hctx->poll_invoked, + hctx->poll_success); +} + +static ssize_t blk_mq_hw_sysfs_poll_store(struct blk_mq_hw_ctx *hctx, + const char *page, size_t size) +{ + hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0; + + return size; } static ssize_t blk_mq_hw_sysfs_queued_show(struct blk_mq_hw_ctx *hctx, @@ -198,12 +208,14 @@ static ssize_t blk_mq_hw_sysfs_dispatched_show(struct blk_mq_hw_ctx *hctx, page += sprintf(page, "%8u\t%lu\n", 0U, hctx->dispatched[0]); - for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) { - unsigned long d = 1U << (i - 1); + for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) { + unsigned int d = 1U << (i - 1); - page += sprintf(page, "%8lu\t%lu\n", d, hctx->dispatched[i]); + page += sprintf(page, "%8u\t%lu\n", d, hctx->dispatched[i]); } + page += sprintf(page, "%8u+\t%lu\n", 1U << (i - 1), + hctx->dispatched[i]); return page - start_page; } @@ -301,8 +313,9 @@ static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { .show = blk_mq_hw_sysfs_cpus_show, }; static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_poll = { - .attr = {.name = "io_poll", .mode = S_IRUGO }, + .attr = {.name = "io_poll", .mode = S_IWUSR | S_IRUGO }, .show = blk_mq_hw_sysfs_poll_show, + .store = blk_mq_hw_sysfs_poll_store, }; static struct attribute *default_hw_ctx_attrs[] = { @@ -380,9 +393,8 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) return ret; } -static void __blk_mq_unregister_disk(struct gendisk *disk) +static void __blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { - struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; int i, j; @@ -400,15 +412,15 @@ static void __blk_mq_unregister_disk(struct gendisk *disk) kobject_del(&q->mq_kobj); kobject_put(&q->mq_kobj); - kobject_put(&disk_to_dev(disk)->kobj); + kobject_put(&dev->kobj); q->mq_sysfs_init_done = false; } -void blk_mq_unregister_disk(struct gendisk *disk) +void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) { blk_mq_disable_hotplug(); - __blk_mq_unregister_disk(disk); + __blk_mq_unregister_dev(dev, q); blk_mq_enable_hotplug(); } @@ -430,10 +442,8 @@ static void blk_mq_sysfs_init(struct request_queue *q) } } -int blk_mq_register_disk(struct gendisk *disk) +int blk_mq_register_dev(struct device *dev, struct request_queue *q) { - struct device *dev = disk_to_dev(disk); - struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; int ret, i; @@ -454,7 +464,7 @@ int blk_mq_register_disk(struct gendisk *disk) } if (ret) - __blk_mq_unregister_disk(disk); + __blk_mq_unregister_dev(dev, q); else q->mq_sysfs_init_done = true; out: @@ -462,7 +472,7 @@ out: return ret; } -EXPORT_SYMBOL_GPL(blk_mq_register_disk); +EXPORT_SYMBOL_GPL(blk_mq_register_dev); void blk_mq_sysfs_unregister(struct request_queue *q) { diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index 729bac3a673b..cef618f6fc92 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -1,58 +1,24 @@ /* - * Fast and scalable bitmap tagging variant. Uses sparser bitmaps spread - * over multiple cachelines to avoid ping-pong between multiple submitters - * or submitter and completer. Uses rolling wakeups to avoid falling of - * the scaling cliff when we run out of tags and have to start putting - * submitters to sleep. - * - * Uses active queue tracking to support fairer distribution of tags - * between multiple submitters when a shared tag map is used. + * Tag allocation using scalable bitmaps. Uses active queue tracking to support + * fairer distribution of tags between multiple submitters when a shared tag map + * is used. * * Copyright (C) 2013-2014 Jens Axboe */ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/random.h> #include <linux/blk-mq.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-tag.h" -static bool bt_has_free_tags(struct blk_mq_bitmap_tags *bt) -{ - int i; - - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - int ret; - - ret = find_first_zero_bit(&bm->word, bm->depth); - if (ret < bm->depth) - return true; - } - - return false; -} - bool blk_mq_has_free_tags(struct blk_mq_tags *tags) { if (!tags) return true; - return bt_has_free_tags(&tags->bitmap_tags); -} - -static inline int bt_index_inc(int index) -{ - return (index + 1) & (BT_WAIT_QUEUES - 1); -} - -static inline void bt_index_atomic_inc(atomic_t *index) -{ - int old = atomic_read(index); - int new = bt_index_inc(old); - atomic_cmpxchg(index, old, new); + return sbitmap_any_bit_clear(&tags->bitmap_tags.sb); } /* @@ -72,29 +38,9 @@ bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) */ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) { - struct blk_mq_bitmap_tags *bt; - int i, wake_index; - - /* - * Make sure all changes prior to this are visible from other CPUs. - */ - smp_mb(); - bt = &tags->bitmap_tags; - wake_index = atomic_read(&bt->wake_index); - for (i = 0; i < BT_WAIT_QUEUES; i++) { - struct bt_wait_state *bs = &bt->bs[wake_index]; - - if (waitqueue_active(&bs->wait)) - wake_up(&bs->wait); - - wake_index = bt_index_inc(wake_index); - } - - if (include_reserve) { - bt = &tags->breserved_tags; - if (waitqueue_active(&bt->bs[0].wait)) - wake_up(&bt->bs[0].wait); - } + sbitmap_queue_wake_all(&tags->bitmap_tags); + if (include_reserve) + sbitmap_queue_wake_all(&tags->breserved_tags); } /* @@ -118,7 +64,7 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) * and attempt to provide a fair share of the tag depth for each of them. */ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, - struct blk_mq_bitmap_tags *bt) + struct sbitmap_queue *bt) { unsigned int depth, users; @@ -130,7 +76,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, /* * Don't try dividing an ant */ - if (bt->depth == 1) + if (bt->sb.depth == 1) return true; users = atomic_read(&hctx->tags->active_queues); @@ -140,142 +86,36 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, /* * Allow at least some tags */ - depth = max((bt->depth + users - 1) / users, 4U); + depth = max((bt->sb.depth + users - 1) / users, 4U); return atomic_read(&hctx->nr_active) < depth; } -static int __bt_get_word(struct blk_align_bitmap *bm, unsigned int last_tag, - bool nowrap) -{ - int tag, org_last_tag = last_tag; - - while (1) { - tag = find_next_zero_bit(&bm->word, bm->depth, last_tag); - if (unlikely(tag >= bm->depth)) { - /* - * We started with an offset, and we didn't reset the - * offset to 0 in a failure case, so start from 0 to - * exhaust the map. - */ - if (org_last_tag && last_tag && !nowrap) { - last_tag = org_last_tag = 0; - continue; - } - return -1; - } - - if (!test_and_set_bit(tag, &bm->word)) - break; - - last_tag = tag + 1; - if (last_tag >= bm->depth - 1) - last_tag = 0; - } - - return tag; -} - -#define BT_ALLOC_RR(tags) (tags->alloc_policy == BLK_TAG_ALLOC_RR) - -/* - * Straight forward bitmap tag implementation, where each bit is a tag - * (cleared == free, and set == busy). The small twist is using per-cpu - * last_tag caches, which blk-mq stores in the blk_mq_ctx software queue - * contexts. This enables us to drastically limit the space searched, - * without dirtying an extra shared cacheline like we would if we stored - * the cache value inside the shared blk_mq_bitmap_tags structure. On top - * of that, each word of tags is in a separate cacheline. This means that - * multiple users will tend to stick to different cachelines, at least - * until the map is exhausted. - */ -static int __bt_get(struct blk_mq_hw_ctx *hctx, struct blk_mq_bitmap_tags *bt, - unsigned int *tag_cache, struct blk_mq_tags *tags) +static int __bt_get(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt) { - unsigned int last_tag, org_last_tag; - int index, i, tag; - if (!hctx_may_queue(hctx, bt)) return -1; - - last_tag = org_last_tag = *tag_cache; - index = TAG_TO_INDEX(bt, last_tag); - - for (i = 0; i < bt->map_nr; i++) { - tag = __bt_get_word(&bt->map[index], TAG_TO_BIT(bt, last_tag), - BT_ALLOC_RR(tags)); - if (tag != -1) { - tag += (index << bt->bits_per_word); - goto done; - } - - /* - * Jump to next index, and reset the last tag to be the - * first tag of that index - */ - index++; - last_tag = (index << bt->bits_per_word); - - if (index >= bt->map_nr) { - index = 0; - last_tag = 0; - } - } - - *tag_cache = 0; - return -1; - - /* - * Only update the cache from the allocation path, if we ended - * up using the specific cached tag. - */ -done: - if (tag == org_last_tag || unlikely(BT_ALLOC_RR(tags))) { - last_tag = tag + 1; - if (last_tag >= bt->depth - 1) - last_tag = 0; - - *tag_cache = last_tag; - } - - return tag; + return __sbitmap_queue_get(bt); } -static struct bt_wait_state *bt_wait_ptr(struct blk_mq_bitmap_tags *bt, - struct blk_mq_hw_ctx *hctx) +static int bt_get(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt, + struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags) { - struct bt_wait_state *bs; - int wait_index; - - if (!hctx) - return &bt->bs[0]; - - wait_index = atomic_read(&hctx->wait_index); - bs = &bt->bs[wait_index]; - bt_index_atomic_inc(&hctx->wait_index); - return bs; -} - -static int bt_get(struct blk_mq_alloc_data *data, - struct blk_mq_bitmap_tags *bt, - struct blk_mq_hw_ctx *hctx, - unsigned int *last_tag, struct blk_mq_tags *tags) -{ - struct bt_wait_state *bs; + struct sbq_wait_state *ws; DEFINE_WAIT(wait); int tag; - tag = __bt_get(hctx, bt, last_tag, tags); + tag = __bt_get(hctx, bt); if (tag != -1) return tag; if (data->flags & BLK_MQ_REQ_NOWAIT) return -1; - bs = bt_wait_ptr(bt, hctx); + ws = bt_wait_ptr(bt, hctx); do { - prepare_to_wait(&bs->wait, &wait, TASK_UNINTERRUPTIBLE); + prepare_to_wait(&ws->wait, &wait, TASK_UNINTERRUPTIBLE); - tag = __bt_get(hctx, bt, last_tag, tags); + tag = __bt_get(hctx, bt); if (tag != -1) break; @@ -292,7 +132,7 @@ static int bt_get(struct blk_mq_alloc_data *data, * Retry tag allocation after running the hardware queue, * as running the queue may also have found completions. */ - tag = __bt_get(hctx, bt, last_tag, tags); + tag = __bt_get(hctx, bt); if (tag != -1) break; @@ -306,15 +146,14 @@ static int bt_get(struct blk_mq_alloc_data *data, if (data->flags & BLK_MQ_REQ_RESERVED) { bt = &data->hctx->tags->breserved_tags; } else { - last_tag = &data->ctx->last_tag; hctx = data->hctx; bt = &hctx->tags->bitmap_tags; } - finish_wait(&bs->wait, &wait); - bs = bt_wait_ptr(bt, hctx); + finish_wait(&ws->wait, &wait); + ws = bt_wait_ptr(bt, hctx); } while (1); - finish_wait(&bs->wait, &wait); + finish_wait(&ws->wait, &wait); return tag; } @@ -323,7 +162,7 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) int tag; tag = bt_get(data, &data->hctx->tags->bitmap_tags, data->hctx, - &data->ctx->last_tag, data->hctx->tags); + data->hctx->tags); if (tag >= 0) return tag + data->hctx->tags->nr_reserved_tags; @@ -332,15 +171,15 @@ static unsigned int __blk_mq_get_tag(struct blk_mq_alloc_data *data) static unsigned int __blk_mq_get_reserved_tag(struct blk_mq_alloc_data *data) { - int tag, zero = 0; + int tag; if (unlikely(!data->hctx->tags->nr_reserved_tags)) { WARN_ON_ONCE(1); return BLK_MQ_TAG_FAIL; } - tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, &zero, - data->hctx->tags); + tag = bt_get(data, &data->hctx->tags->breserved_tags, NULL, + data->hctx->tags); if (tag < 0) return BLK_MQ_TAG_FAIL; @@ -354,55 +193,8 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) return __blk_mq_get_tag(data); } -static struct bt_wait_state *bt_wake_ptr(struct blk_mq_bitmap_tags *bt) -{ - int i, wake_index; - - wake_index = atomic_read(&bt->wake_index); - for (i = 0; i < BT_WAIT_QUEUES; i++) { - struct bt_wait_state *bs = &bt->bs[wake_index]; - - if (waitqueue_active(&bs->wait)) { - int o = atomic_read(&bt->wake_index); - if (wake_index != o) - atomic_cmpxchg(&bt->wake_index, o, wake_index); - - return bs; - } - - wake_index = bt_index_inc(wake_index); - } - - return NULL; -} - -static void bt_clear_tag(struct blk_mq_bitmap_tags *bt, unsigned int tag) -{ - const int index = TAG_TO_INDEX(bt, tag); - struct bt_wait_state *bs; - int wait_cnt; - - clear_bit(TAG_TO_BIT(bt, tag), &bt->map[index].word); - - /* Ensure that the wait list checks occur after clear_bit(). */ - smp_mb(); - - bs = bt_wake_ptr(bt); - if (!bs) - return; - - wait_cnt = atomic_dec_return(&bs->wait_cnt); - if (unlikely(wait_cnt < 0)) - wait_cnt = atomic_inc_return(&bs->wait_cnt); - if (wait_cnt == 0) { - atomic_add(bt->wake_cnt, &bs->wait_cnt); - bt_index_atomic_inc(&bt->wake_index); - wake_up(&bs->wait); - } -} - -void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, - unsigned int *last_tag) +void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + unsigned int tag) { struct blk_mq_tags *tags = hctx->tags; @@ -410,67 +202,92 @@ void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, const int real_tag = tag - tags->nr_reserved_tags; BUG_ON(real_tag >= tags->nr_tags); - bt_clear_tag(&tags->bitmap_tags, real_tag); - if (likely(tags->alloc_policy == BLK_TAG_ALLOC_FIFO)) - *last_tag = real_tag; + sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); } else { BUG_ON(tag >= tags->nr_reserved_tags); - bt_clear_tag(&tags->breserved_tags, tag); + sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); } } -static void bt_for_each(struct blk_mq_hw_ctx *hctx, - struct blk_mq_bitmap_tags *bt, unsigned int off, - busy_iter_fn *fn, void *data, bool reserved) +struct bt_iter_data { + struct blk_mq_hw_ctx *hctx; + busy_iter_fn *fn; + void *data; + bool reserved; +}; + +static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { + struct bt_iter_data *iter_data = data; + struct blk_mq_hw_ctx *hctx = iter_data->hctx; + struct blk_mq_tags *tags = hctx->tags; + bool reserved = iter_data->reserved; struct request *rq; - int bit, i; - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; + if (!reserved) + bitnr += tags->nr_reserved_tags; + rq = tags->rqs[bitnr]; - for (bit = find_first_bit(&bm->word, bm->depth); - bit < bm->depth; - bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { - rq = hctx->tags->rqs[off + bit]; - if (rq->q == hctx->queue) - fn(hctx, rq, data, reserved); - } + if (rq->q == hctx->queue) + iter_data->fn(hctx, rq, iter_data->data, reserved); + return true; +} - off += (1 << bt->bits_per_word); - } +static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, + busy_iter_fn *fn, void *data, bool reserved) +{ + struct bt_iter_data iter_data = { + .hctx = hctx, + .fn = fn, + .data = data, + .reserved = reserved, + }; + + sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); } -static void bt_tags_for_each(struct blk_mq_tags *tags, - struct blk_mq_bitmap_tags *bt, unsigned int off, - busy_tag_iter_fn *fn, void *data, bool reserved) +struct bt_tags_iter_data { + struct blk_mq_tags *tags; + busy_tag_iter_fn *fn; + void *data; + bool reserved; +}; + +static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { + struct bt_tags_iter_data *iter_data = data; + struct blk_mq_tags *tags = iter_data->tags; + bool reserved = iter_data->reserved; struct request *rq; - int bit, i; - if (!tags->rqs) - return; - for (i = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - - for (bit = find_first_bit(&bm->word, bm->depth); - bit < bm->depth; - bit = find_next_bit(&bm->word, bm->depth, bit + 1)) { - rq = tags->rqs[off + bit]; - fn(rq, data, reserved); - } + if (!reserved) + bitnr += tags->nr_reserved_tags; + rq = tags->rqs[bitnr]; - off += (1 << bt->bits_per_word); - } + iter_data->fn(rq, iter_data->data, reserved); + return true; +} + +static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, + busy_tag_iter_fn *fn, void *data, bool reserved) +{ + struct bt_tags_iter_data iter_data = { + .tags = tags, + .fn = fn, + .data = data, + .reserved = reserved, + }; + + if (tags->rqs) + sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data); } static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, void *priv) { if (tags->nr_reserved_tags) - bt_tags_for_each(tags, &tags->breserved_tags, 0, fn, priv, true); - bt_tags_for_each(tags, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, - false); + bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true); + bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false); } void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, @@ -529,124 +346,40 @@ void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, continue; if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, 0, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, tags->nr_reserved_tags, fn, priv, - false); + bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); + bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); } } -static unsigned int bt_unused_tags(struct blk_mq_bitmap_tags *bt) +static unsigned int bt_unused_tags(const struct sbitmap_queue *bt) { - unsigned int i, used; - - for (i = 0, used = 0; i < bt->map_nr; i++) { - struct blk_align_bitmap *bm = &bt->map[i]; - - used += bitmap_weight(&bm->word, bm->depth); - } - - return bt->depth - used; + return bt->sb.depth - sbitmap_weight(&bt->sb); } -static void bt_update_count(struct blk_mq_bitmap_tags *bt, - unsigned int depth) +static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, + bool round_robin, int node) { - unsigned int tags_per_word = 1U << bt->bits_per_word; - unsigned int map_depth = depth; - - if (depth) { - int i; - - for (i = 0; i < bt->map_nr; i++) { - bt->map[i].depth = min(map_depth, tags_per_word); - map_depth -= bt->map[i].depth; - } - } - - bt->wake_cnt = BT_WAIT_BATCH; - if (bt->wake_cnt > depth / BT_WAIT_QUEUES) - bt->wake_cnt = max(1U, depth / BT_WAIT_QUEUES); - - bt->depth = depth; -} - -static int bt_alloc(struct blk_mq_bitmap_tags *bt, unsigned int depth, - int node, bool reserved) -{ - int i; - - bt->bits_per_word = ilog2(BITS_PER_LONG); - - /* - * Depth can be zero for reserved tags, that's not a failure - * condition. - */ - if (depth) { - unsigned int nr, tags_per_word; - - tags_per_word = (1 << bt->bits_per_word); - - /* - * If the tag space is small, shrink the number of tags - * per word so we spread over a few cachelines, at least. - * If less than 4 tags, just forget about it, it's not - * going to work optimally anyway. - */ - if (depth >= 4) { - while (tags_per_word * 4 > depth) { - bt->bits_per_word--; - tags_per_word = (1 << bt->bits_per_word); - } - } - - nr = ALIGN(depth, tags_per_word) / tags_per_word; - bt->map = kzalloc_node(nr * sizeof(struct blk_align_bitmap), - GFP_KERNEL, node); - if (!bt->map) - return -ENOMEM; - - bt->map_nr = nr; - } - - bt->bs = kzalloc(BT_WAIT_QUEUES * sizeof(*bt->bs), GFP_KERNEL); - if (!bt->bs) { - kfree(bt->map); - bt->map = NULL; - return -ENOMEM; - } - - bt_update_count(bt, depth); - - for (i = 0; i < BT_WAIT_QUEUES; i++) { - init_waitqueue_head(&bt->bs[i].wait); - atomic_set(&bt->bs[i].wait_cnt, bt->wake_cnt); - } - - return 0; -} - -static void bt_free(struct blk_mq_bitmap_tags *bt) -{ - kfree(bt->map); - kfree(bt->bs); + return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL, + node); } static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, int node, int alloc_policy) { unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - tags->alloc_policy = alloc_policy; - - if (bt_alloc(&tags->bitmap_tags, depth, node, false)) - goto enomem; - if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, node, true)) - goto enomem; + if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) + goto free_tags; + if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin, + node)) + goto free_bitmap_tags; return tags; -enomem: - bt_free(&tags->bitmap_tags); +free_bitmap_tags: + sbitmap_queue_free(&tags->bitmap_tags); +free_tags: kfree(tags); return NULL; } @@ -679,19 +412,12 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, void blk_mq_free_tags(struct blk_mq_tags *tags) { - bt_free(&tags->bitmap_tags); - bt_free(&tags->breserved_tags); + sbitmap_queue_free(&tags->bitmap_tags); + sbitmap_queue_free(&tags->breserved_tags); free_cpumask_var(tags->cpumask); kfree(tags); } -void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *tag) -{ - unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; - - *tag = prandom_u32() % depth; -} - int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) { tdepth -= tags->nr_reserved_tags; @@ -702,7 +428,8 @@ int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int tdepth) * Don't need (or can't) update reserved tags here, they remain * static and should never need resizing. */ - bt_update_count(&tags->bitmap_tags, tdepth); + sbitmap_queue_resize(&tags->bitmap_tags, tdepth); + blk_mq_tag_wakeup_all(tags, false); return 0; } @@ -746,7 +473,7 @@ ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page) page += sprintf(page, "nr_tags=%u, reserved_tags=%u, " "bits_per_word=%u\n", tags->nr_tags, tags->nr_reserved_tags, - tags->bitmap_tags.bits_per_word); + 1U << tags->bitmap_tags.sb.shift); free = bt_unused_tags(&tags->bitmap_tags); res = bt_unused_tags(&tags->breserved_tags); diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h index d468a79f2c4a..09f4cc0aaa84 100644 --- a/block/blk-mq-tag.h +++ b/block/blk-mq-tag.h @@ -3,31 +3,6 @@ #include "blk-mq.h" -enum { - BT_WAIT_QUEUES = 8, - BT_WAIT_BATCH = 8, -}; - -struct bt_wait_state { - atomic_t wait_cnt; - wait_queue_head_t wait; -} ____cacheline_aligned_in_smp; - -#define TAG_TO_INDEX(bt, tag) ((tag) >> (bt)->bits_per_word) -#define TAG_TO_BIT(bt, tag) ((tag) & ((1 << (bt)->bits_per_word) - 1)) - -struct blk_mq_bitmap_tags { - unsigned int depth; - unsigned int wake_cnt; - unsigned int bits_per_word; - - unsigned int map_nr; - struct blk_align_bitmap *map; - - atomic_t wake_index; - struct bt_wait_state *bs; -}; - /* * Tag address space map. */ @@ -37,13 +12,12 @@ struct blk_mq_tags { atomic_t active_queues; - struct blk_mq_bitmap_tags bitmap_tags; - struct blk_mq_bitmap_tags breserved_tags; + struct sbitmap_queue bitmap_tags; + struct sbitmap_queue breserved_tags; struct request **rqs; struct list_head page_list; - int alloc_policy; cpumask_var_t cpumask; }; @@ -52,15 +26,23 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r extern void blk_mq_free_tags(struct blk_mq_tags *tags); extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); -extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, unsigned int tag, unsigned int *last_tag); +extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + unsigned int tag); extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags); extern ssize_t blk_mq_tag_sysfs_show(struct blk_mq_tags *tags, char *page); -extern void blk_mq_tag_init_last_tag(struct blk_mq_tags *tags, unsigned int *last_tag); extern int blk_mq_tag_update_depth(struct blk_mq_tags *tags, unsigned int depth); extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, void *priv); +static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, + struct blk_mq_hw_ctx *hctx) +{ + if (!hctx) + return &bt->ws[0]; + return sbq_wait_ptr(bt, &hctx->wait_index); +} + enum { BLK_MQ_TAG_CACHE_MIN = 1, BLK_MQ_TAG_CACHE_MAX = 64, diff --git a/block/blk-mq.c b/block/blk-mq.c index c207fa9870eb..dc5f47f60931 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -22,6 +22,7 @@ #include <linux/sched/sysctl.h> #include <linux/delay.h> #include <linux/crash_dump.h> +#include <linux/prefetch.h> #include <trace/events/block.h> @@ -33,49 +34,28 @@ static DEFINE_MUTEX(all_q_mutex); static LIST_HEAD(all_q_list); -static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); - /* * Check if any of the ctx's have pending work in this hardware queue */ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) { - unsigned int i; - - for (i = 0; i < hctx->ctx_map.size; i++) - if (hctx->ctx_map.map[i].word) - return true; - - return false; -} - -static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx) -{ - return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; + return sbitmap_any_bit_set(&hctx->ctx_map); } -#define CTX_TO_BIT(hctx, ctx) \ - ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) - /* * Mark this ctx as having pending work in this hardware queue */ static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - struct blk_align_bitmap *bm = get_bm(hctx, ctx); - - if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) - set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); + if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) + sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); } static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - struct blk_align_bitmap *bm = get_bm(hctx, ctx); - - clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); + sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); } void blk_mq_freeze_queue_start(struct request_queue *q) @@ -246,19 +226,9 @@ struct request *blk_mq_alloc_request(struct request_queue *q, int rw, ctx = blk_mq_get_ctx(q); hctx = q->mq_ops->map_queue(q, ctx->cpu); blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); - if (!rq && !(flags & BLK_MQ_REQ_NOWAIT)) { - __blk_mq_run_hw_queue(hctx); - blk_mq_put_ctx(ctx); - - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, flags, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, rw, 0); - ctx = alloc_data.ctx; - } blk_mq_put_ctx(ctx); + if (!rq) { blk_queue_exit(q); return ERR_PTR(-EWOULDBLOCK); @@ -333,7 +303,7 @@ static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, rq->cmd_flags = 0; clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); - blk_mq_put_tag(hctx, tag, &ctx->last_tag); + blk_mq_put_tag(hctx, ctx, tag); blk_queue_exit(q); } @@ -513,7 +483,7 @@ EXPORT_SYMBOL(blk_mq_requeue_request); static void blk_mq_requeue_work(struct work_struct *work) { struct request_queue *q = - container_of(work, struct request_queue, requeue_work); + container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); struct request *rq, *next; unsigned long flags; @@ -568,16 +538,24 @@ EXPORT_SYMBOL(blk_mq_add_to_requeue_list); void blk_mq_cancel_requeue_work(struct request_queue *q) { - cancel_work_sync(&q->requeue_work); + cancel_delayed_work_sync(&q->requeue_work); } EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work); void blk_mq_kick_requeue_list(struct request_queue *q) { - kblockd_schedule_work(&q->requeue_work); + kblockd_schedule_delayed_work(&q->requeue_work, 0); } EXPORT_SYMBOL(blk_mq_kick_requeue_list); +void blk_mq_delay_kick_requeue_list(struct request_queue *q, + unsigned long msecs) +{ + kblockd_schedule_delayed_work(&q->requeue_work, + msecs_to_jiffies(msecs)); +} +EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); + void blk_mq_abort_requeue_list(struct request_queue *q) { unsigned long flags; @@ -600,8 +578,10 @@ EXPORT_SYMBOL(blk_mq_abort_requeue_list); struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) { - if (tag < tags->nr_tags) + if (tag < tags->nr_tags) { + prefetch(tags->rqs[tag]); return tags->rqs[tag]; + } return NULL; } @@ -756,38 +736,44 @@ static bool blk_mq_attempt_merge(struct request_queue *q, return false; } +struct flush_busy_ctx_data { + struct blk_mq_hw_ctx *hctx; + struct list_head *list; +}; + +static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) +{ + struct flush_busy_ctx_data *flush_data = data; + struct blk_mq_hw_ctx *hctx = flush_data->hctx; + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + + sbitmap_clear_bit(sb, bitnr); + spin_lock(&ctx->lock); + list_splice_tail_init(&ctx->rq_list, flush_data->list); + spin_unlock(&ctx->lock); + return true; +} + /* * Process software queues that have been marked busy, splicing them * to the for-dispatch */ static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) { - struct blk_mq_ctx *ctx; - int i; - - for (i = 0; i < hctx->ctx_map.size; i++) { - struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; - unsigned int off, bit; - - if (!bm->word) - continue; + struct flush_busy_ctx_data data = { + .hctx = hctx, + .list = list, + }; - bit = 0; - off = i * hctx->ctx_map.bits_per_word; - do { - bit = find_next_bit(&bm->word, bm->depth, bit); - if (bit >= bm->depth) - break; + sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); +} - ctx = hctx->ctxs[bit + off]; - clear_bit(bit, &bm->word); - spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, list); - spin_unlock(&ctx->lock); +static inline unsigned int queued_to_index(unsigned int queued) +{ + if (!queued) + return 0; - bit++; - } while (1); - } + return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); } /* @@ -878,10 +864,7 @@ static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) dptr = &driver_list; } - if (!queued) - hctx->dispatched[0]++; - else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) - hctx->dispatched[ilog2(queued) + 1]++; + hctx->dispatched[queued_to_index(queued)]++; /* * Any items that need requeuing? Stuff them into hctx->dispatch, @@ -937,7 +920,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) !blk_mq_hw_queue_mapped(hctx))) return; - if (!async) { + if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, hctx->cpumask)) { __blk_mq_run_hw_queue(hctx); @@ -948,8 +931,7 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) put_cpu(); } - kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), - &hctx->run_work, 0); + kblockd_schedule_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work); } void blk_mq_run_hw_queues(struct request_queue *q, bool async) @@ -970,7 +952,7 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) { - cancel_delayed_work(&hctx->run_work); + cancel_work(&hctx->run_work); cancel_delayed_work(&hctx->delay_work); set_bit(BLK_MQ_S_STOPPED, &hctx->state); } @@ -1023,7 +1005,7 @@ static void blk_mq_run_work_fn(struct work_struct *work) { struct blk_mq_hw_ctx *hctx; - hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); + hctx = container_of(work, struct blk_mq_hw_ctx, run_work); __blk_mq_run_hw_queue(hctx); } @@ -1240,20 +1222,8 @@ static struct request *blk_mq_map_request(struct request_queue *q, op_flags |= REQ_SYNC; trace_block_getrq(q, bio, op); - blk_mq_set_alloc_data(&alloc_data, q, BLK_MQ_REQ_NOWAIT, ctx, hctx); + blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); - if (unlikely(!rq)) { - __blk_mq_run_hw_queue(hctx); - blk_mq_put_ctx(ctx); - trace_block_sleeprq(q, bio, op); - - ctx = blk_mq_get_ctx(q); - hctx = q->mq_ops->map_queue(q, ctx->cpu); - blk_mq_set_alloc_data(&alloc_data, q, 0, ctx, hctx); - rq = __blk_mq_alloc_request(&alloc_data, op, op_flags); - ctx = alloc_data.ctx; - hctx = alloc_data.hctx; - } hctx->queued++; data->hctx = hctx; @@ -1606,32 +1576,6 @@ fail: return NULL; } -static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) -{ - kfree(bitmap->map); -} - -static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) -{ - unsigned int bpw = 8, total, num_maps, i; - - bitmap->bits_per_word = bpw; - - num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; - bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), - GFP_KERNEL, node); - if (!bitmap->map) - return -ENOMEM; - - total = nr_cpu_ids; - for (i = 0; i < num_maps; i++) { - bitmap->map[i].depth = min(total, bitmap->bits_per_word); - total -= bitmap->map[i].depth; - } - - return 0; -} - /* * 'cpu' is going away. splice any existing rq_list entries from this * software queue to the hw queue dispatch list, and ensure that it @@ -1697,7 +1641,7 @@ static void blk_mq_exit_hctx(struct request_queue *q, blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); blk_free_flush_queue(hctx->fq); - blk_mq_free_bitmap(&hctx->ctx_map); + sbitmap_free(&hctx->ctx_map); } static void blk_mq_exit_hw_queues(struct request_queue *q, @@ -1734,7 +1678,7 @@ static int blk_mq_init_hctx(struct request_queue *q, if (node == NUMA_NO_NODE) node = hctx->numa_node = set->numa_node; - INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); + INIT_WORK(&hctx->run_work, blk_mq_run_work_fn); INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); @@ -1757,7 +1701,8 @@ static int blk_mq_init_hctx(struct request_queue *q, if (!hctx->ctxs) goto unregister_cpu_notifier; - if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) + if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL, + node)) goto free_ctxs; hctx->nr_ctx = 0; @@ -1784,7 +1729,7 @@ static int blk_mq_init_hctx(struct request_queue *q, if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); free_bitmap: - blk_mq_free_bitmap(&hctx->ctx_map); + sbitmap_free(&hctx->ctx_map); free_ctxs: kfree(hctx->ctxs); unregister_cpu_notifier: @@ -1860,8 +1805,6 @@ static void blk_mq_map_swqueue(struct request_queue *q, mutex_unlock(&q->sysfs_lock); queue_for_each_hw_ctx(q, hctx, i) { - struct blk_mq_ctxmap *map = &hctx->ctx_map; - /* * If no software queues are mapped to this hardware queue, * disable it and free the request entries. @@ -1887,7 +1830,7 @@ static void blk_mq_map_swqueue(struct request_queue *q, * This is more accurate and more efficient than looping * over all possibly mapped software queues. */ - map->size = DIV_ROUND_UP(hctx->nr_ctx, map->bits_per_word); + sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); /* * Initialize batch roundrobin counts @@ -2094,7 +2037,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->sg_reserved_size = INT_MAX; - INIT_WORK(&q->requeue_work, blk_mq_requeue_work); + INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); diff --git a/block/blk-mq.h b/block/blk-mq.h index 9087b11037b7..9b15d2ef7f7b 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -12,8 +12,6 @@ struct blk_mq_ctx { unsigned int cpu; unsigned int index_hw; - unsigned int last_tag ____cacheline_aligned_in_smp; - /* incremented at dispatch time */ unsigned long rq_dispatched[2]; unsigned long rq_merged; @@ -63,15 +61,6 @@ extern void blk_mq_rq_timed_out(struct request *req, bool reserved); void blk_mq_release(struct request_queue *q); -/* - * Basic implementation of sparser bitmap, allowing the user to spread - * the bits over more cachelines. - */ -struct blk_align_bitmap { - unsigned long word; - unsigned long depth; -} ____cacheline_aligned_in_smp; - static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, unsigned int cpu) { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index f87a7e747d36..9cc8d7c5439a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -704,7 +704,7 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->kobj, KOBJ_ADD); if (q->mq_ops) - blk_mq_register_disk(disk); + blk_mq_register_dev(dev, q); if (!q->request_fn) return 0; @@ -729,7 +729,7 @@ void blk_unregister_queue(struct gendisk *disk) return; if (q->mq_ops) - blk_mq_unregister_disk(disk); + blk_mq_unregister_dev(disk_to_dev(disk), q); if (q->request_fn) elv_unregister_queue(q); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index cc2f6dbd4303..5e24d880306c 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3042,7 +3042,6 @@ static struct request *cfq_check_fifo(struct cfq_queue *cfqq) if (ktime_get_ns() < rq->fifo_time) rq = NULL; - cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); return rq; } @@ -3420,6 +3419,9 @@ static bool cfq_may_dispatch(struct cfq_data *cfqd, struct cfq_queue *cfqq) { unsigned int max_dispatch; + if (cfq_cfqq_must_dispatch(cfqq)) + return true; + /* * Drain async requests before we start sync IO */ @@ -3511,15 +3513,20 @@ static bool cfq_dispatch_request(struct cfq_data *cfqd, struct cfq_queue *cfqq) BUG_ON(RB_EMPTY_ROOT(&cfqq->sort_list)); + rq = cfq_check_fifo(cfqq); + if (rq) + cfq_mark_cfqq_must_dispatch(cfqq); + if (!cfq_may_dispatch(cfqd, cfqq)) return false; /* * follow expired path, else get first next available */ - rq = cfq_check_fifo(cfqq); if (!rq) rq = cfqq->next_rq; + else + cfq_log_cfqq(cfqq->cfqd, cfqq, "fifo=%p", rq); /* * insert request into driver dispatch list @@ -3989,7 +3996,7 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, * if the new request is sync, but the currently running queue is * not, let the sync request have priority. */ - if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq)) + if (rq_is_sync(rq) && !cfq_cfqq_sync(cfqq) && !cfq_cfqq_must_dispatch(cfqq)) return true; /* diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 2aca98e8e427..88c46853dbb5 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3686,7 +3686,7 @@ static int mtip_block_open(struct block_device *dev, fmode_t mode) return -ENODEV; } -void mtip_block_release(struct gendisk *disk, fmode_t mode) +static void mtip_block_release(struct gendisk *disk, fmode_t mode) { } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a9e398019f38..ccfcfc11399a 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -34,33 +34,29 @@ #include <linux/kthread.h> #include <linux/types.h> #include <linux/debugfs.h> +#include <linux/blk-mq.h> #include <asm/uaccess.h> #include <asm/types.h> #include <linux/nbd.h> +#define NBD_TIMEDOUT 0 +#define NBD_DISCONNECT_REQUESTED 1 + struct nbd_device { u32 flags; + unsigned long runtime_flags; struct socket * sock; /* If == NULL, device is not ready, yet */ int magic; - spinlock_t queue_lock; - struct list_head queue_head; /* Requests waiting result */ - struct request *active_req; - wait_queue_head_t active_wq; - struct list_head waiting_queue; /* Requests to be sent */ - wait_queue_head_t waiting_wq; + struct blk_mq_tag_set tag_set; struct mutex tx_lock; struct gendisk *disk; int blksize; loff_t bytesize; - int xmit_timeout; - bool timedout; - bool disconnect; /* a disconnect has been requested by user */ - struct timer_list timeout_timer; /* protects initialization and shutdown of the socket */ spinlock_t sock_lock; struct task_struct *task_recv; @@ -71,6 +67,11 @@ struct nbd_device { #endif }; +struct nbd_cmd { + struct nbd_device *nbd; + struct list_head list; +}; + #if IS_ENABLED(CONFIG_DEBUG_FS) static struct dentry *nbd_dbg_dir; #endif @@ -83,18 +84,6 @@ static unsigned int nbds_max = 16; static struct nbd_device *nbd_dev; static int max_part; -/* - * Use just one lock (or at most 1 per NIC). Two arguments for this: - * 1. Each NIC is essentially a synchronization point for all servers - * accessed through that NIC so there's no need to have more locks - * than NICs anyway. - * 2. More locks lead to more "Dirty cache line bouncing" which will slow - * down each lock to the point where they're actually slower than just - * a single lock. - * Thanks go to Jens Axboe and Al Viro for their LKML emails explaining this! - */ -static DEFINE_SPINLOCK(nbd_lock); - static inline struct device *nbd_to_dev(struct nbd_device *nbd) { return disk_to_dev(nbd->disk); @@ -153,18 +142,16 @@ static int nbd_size_set(struct nbd_device *nbd, struct block_device *bdev, return 0; } -static void nbd_end_request(struct nbd_device *nbd, struct request *req) +static void nbd_end_request(struct nbd_cmd *cmd) { + struct nbd_device *nbd = cmd->nbd; + struct request *req = blk_mq_rq_from_pdu(cmd); int error = req->errors ? -EIO : 0; - struct request_queue *q = req->q; - unsigned long flags; - dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", req, + dev_dbg(nbd_to_dev(nbd), "request %p: %s\n", cmd, error ? "failed" : "done"); - spin_lock_irqsave(q->queue_lock, flags); - __blk_end_request_all(req, error); - spin_unlock_irqrestore(q->queue_lock, flags); + blk_mq_complete_request(req, error); } /* @@ -172,40 +159,49 @@ static void nbd_end_request(struct nbd_device *nbd, struct request *req) */ static void sock_shutdown(struct nbd_device *nbd) { - spin_lock_irq(&nbd->sock_lock); + struct socket *sock; + + spin_lock(&nbd->sock_lock); if (!nbd->sock) { spin_unlock_irq(&nbd->sock_lock); return; } + sock = nbd->sock; dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); - kernel_sock_shutdown(nbd->sock, SHUT_RDWR); - sockfd_put(nbd->sock); nbd->sock = NULL; - spin_unlock_irq(&nbd->sock_lock); + spin_unlock(&nbd->sock_lock); - del_timer(&nbd->timeout_timer); + kernel_sock_shutdown(sock, SHUT_RDWR); + sockfd_put(sock); } -static void nbd_xmit_timeout(unsigned long arg) +static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, + bool reserved) { - struct nbd_device *nbd = (struct nbd_device *)arg; - unsigned long flags; - - if (list_empty(&nbd->queue_head)) - return; + struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); + struct nbd_device *nbd = cmd->nbd; + struct socket *sock = NULL; - spin_lock_irqsave(&nbd->sock_lock, flags); + spin_lock(&nbd->sock_lock); - nbd->timedout = true; + set_bit(NBD_TIMEDOUT, &nbd->runtime_flags); - if (nbd->sock) - kernel_sock_shutdown(nbd->sock, SHUT_RDWR); + if (nbd->sock) { + sock = nbd->sock; + get_file(sock->file); + } - spin_unlock_irqrestore(&nbd->sock_lock, flags); + spin_unlock(&nbd->sock_lock); + if (sock) { + kernel_sock_shutdown(sock, SHUT_RDWR); + sockfd_put(sock); + } + req->errors++; dev_err(nbd_to_dev(nbd), "Connection timed out, shutting down connection\n"); + return BLK_EH_HANDLED; } /* @@ -255,9 +251,6 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, tsk_restore_flags(current, pflags, PF_MEMALLOC); - if (!send && nbd->xmit_timeout) - mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout); - return result; } @@ -273,8 +266,9 @@ static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec, } /* always call with the tx_lock held */ -static int nbd_send_req(struct nbd_device *nbd, struct request *req) +static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd) { + struct request *req = blk_mq_rq_from_pdu(cmd); int result, flags; struct nbd_request request; unsigned long size = blk_rq_bytes(req); @@ -298,10 +292,10 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); request.len = htonl(size); } - memcpy(request.handle, &req, sizeof(req)); + memcpy(request.handle, &req->tag, sizeof(req->tag)); dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n", - req, nbdcmd_to_ascii(type), + cmd, nbdcmd_to_ascii(type), (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); result = sock_xmit(nbd, 1, &request, sizeof(request), (type == NBD_CMD_WRITE) ? MSG_MORE : 0); @@ -323,7 +317,7 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) if (!rq_iter_last(bvec, iter)) flags = MSG_MORE; dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n", - req, bvec.bv_len); + cmd, bvec.bv_len); result = sock_send_bvec(nbd, &bvec, flags); if (result <= 0) { dev_err(disk_to_dev(nbd->disk), @@ -336,29 +330,6 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) return 0; } -static struct request *nbd_find_request(struct nbd_device *nbd, - struct request *xreq) -{ - struct request *req, *tmp; - int err; - - err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq); - if (unlikely(err)) - return ERR_PTR(err); - - spin_lock(&nbd->queue_lock); - list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) { - if (req != xreq) - continue; - list_del_init(&req->queuelist); - spin_unlock(&nbd->queue_lock); - return req; - } - spin_unlock(&nbd->queue_lock); - - return ERR_PTR(-ENOENT); -} - static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec) { int result; @@ -370,11 +341,14 @@ static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec) } /* NULL returned = something went wrong, inform userspace */ -static struct request *nbd_read_stat(struct nbd_device *nbd) +static struct nbd_cmd *nbd_read_stat(struct nbd_device *nbd) { int result; struct nbd_reply reply; - struct request *req; + struct nbd_cmd *cmd; + struct request *req = NULL; + u16 hwq; + int tag; reply.magic = 0; result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL); @@ -390,25 +364,27 @@ static struct request *nbd_read_stat(struct nbd_device *nbd) return ERR_PTR(-EPROTO); } - req = nbd_find_request(nbd, *(struct request **)reply.handle); - if (IS_ERR(req)) { - result = PTR_ERR(req); - if (result != -ENOENT) - return ERR_PTR(result); + memcpy(&tag, reply.handle, sizeof(int)); - dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n", - reply.handle); - return ERR_PTR(-EBADR); + hwq = blk_mq_unique_tag_to_hwq(tag); + if (hwq < nbd->tag_set.nr_hw_queues) + req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq], + blk_mq_unique_tag_to_tag(tag)); + if (!req || !blk_mq_request_started(req)) { + dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n", + tag, req); + return ERR_PTR(-ENOENT); } + cmd = blk_mq_rq_to_pdu(req); if (ntohl(reply.error)) { dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", ntohl(reply.error)); req->errors++; - return req; + return cmd; } - dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req); + dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", cmd); if (rq_data_dir(req) != WRITE) { struct req_iterator iter; struct bio_vec bvec; @@ -419,13 +395,13 @@ static struct request *nbd_read_stat(struct nbd_device *nbd) dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", result); req->errors++; - return req; + return cmd; } dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n", - req, bvec.bv_len); + cmd, bvec.bv_len); } } - return req; + return cmd; } static ssize_t pid_show(struct device *dev, @@ -444,7 +420,7 @@ static struct device_attribute pid_attr = { static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) { - struct request *req; + struct nbd_cmd *cmd; int ret; BUG_ON(nbd->magic != NBD_MAGIC); @@ -460,13 +436,13 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) nbd_size_update(nbd, bdev); while (1) { - req = nbd_read_stat(nbd); - if (IS_ERR(req)) { - ret = PTR_ERR(req); + cmd = nbd_read_stat(nbd); + if (IS_ERR(cmd)) { + ret = PTR_ERR(cmd); break; } - nbd_end_request(nbd, req); + nbd_end_request(cmd); } nbd_size_clear(nbd, bdev); @@ -475,44 +451,37 @@ static int nbd_thread_recv(struct nbd_device *nbd, struct block_device *bdev) return ret; } -static void nbd_clear_que(struct nbd_device *nbd) +static void nbd_clear_req(struct request *req, void *data, bool reserved) { - struct request *req; + struct nbd_cmd *cmd; + + if (!blk_mq_request_started(req)) + return; + cmd = blk_mq_rq_to_pdu(req); + req->errors++; + nbd_end_request(cmd); +} +static void nbd_clear_que(struct nbd_device *nbd) +{ BUG_ON(nbd->magic != NBD_MAGIC); /* * Because we have set nbd->sock to NULL under the tx_lock, all - * modifications to the list must have completed by now. For - * the same reason, the active_req must be NULL. - * - * As a consequence, we don't need to take the spin lock while - * purging the list here. + * modifications to the list must have completed by now. */ BUG_ON(nbd->sock); - BUG_ON(nbd->active_req); - while (!list_empty(&nbd->queue_head)) { - req = list_entry(nbd->queue_head.next, struct request, - queuelist); - list_del_init(&req->queuelist); - req->errors++; - nbd_end_request(nbd, req); - } - - while (!list_empty(&nbd->waiting_queue)) { - req = list_entry(nbd->waiting_queue.next, struct request, - queuelist); - list_del_init(&req->queuelist); - req->errors++; - nbd_end_request(nbd, req); - } + blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL); dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n"); } -static void nbd_handle_req(struct nbd_device *nbd, struct request *req) +static void nbd_handle_cmd(struct nbd_cmd *cmd) { + struct request *req = blk_mq_rq_from_pdu(cmd); + struct nbd_device *nbd = cmd->nbd; + if (req->cmd_type != REQ_TYPE_FS) goto error_out; @@ -526,6 +495,7 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req) req->errors = 0; mutex_lock(&nbd->tx_lock); + nbd->task_send = current; if (unlikely(!nbd->sock)) { mutex_unlock(&nbd->tx_lock); dev_err(disk_to_dev(nbd->disk), @@ -533,106 +503,30 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req) goto error_out; } - nbd->active_req = req; - - if (nbd->xmit_timeout && list_empty_careful(&nbd->queue_head)) - mod_timer(&nbd->timeout_timer, jiffies + nbd->xmit_timeout); - - if (nbd_send_req(nbd, req) != 0) { + if (nbd_send_cmd(nbd, cmd) != 0) { dev_err(disk_to_dev(nbd->disk), "Request send failed\n"); req->errors++; - nbd_end_request(nbd, req); - } else { - spin_lock(&nbd->queue_lock); - list_add_tail(&req->queuelist, &nbd->queue_head); - spin_unlock(&nbd->queue_lock); + nbd_end_request(cmd); } - nbd->active_req = NULL; + nbd->task_send = NULL; mutex_unlock(&nbd->tx_lock); - wake_up_all(&nbd->active_wq); return; error_out: req->errors++; - nbd_end_request(nbd, req); + nbd_end_request(cmd); } -static int nbd_thread_send(void *data) +static int nbd_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct nbd_device *nbd = data; - struct request *req; - - nbd->task_send = current; - - set_user_nice(current, MIN_NICE); - while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { - /* wait for something to do */ - wait_event_interruptible(nbd->waiting_wq, - kthread_should_stop() || - !list_empty(&nbd->waiting_queue)); - - /* extract request */ - if (list_empty(&nbd->waiting_queue)) - continue; - - spin_lock_irq(&nbd->queue_lock); - req = list_entry(nbd->waiting_queue.next, struct request, - queuelist); - list_del_init(&req->queuelist); - spin_unlock_irq(&nbd->queue_lock); - - /* handle request */ - nbd_handle_req(nbd, req); - } + struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); - nbd->task_send = NULL; - - return 0; -} - -/* - * We always wait for result of write, for now. It would be nice to make it optional - * in future - * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK)) - * { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); } - */ - -static void nbd_request_handler(struct request_queue *q) - __releases(q->queue_lock) __acquires(q->queue_lock) -{ - struct request *req; - - while ((req = blk_fetch_request(q)) != NULL) { - struct nbd_device *nbd; - - spin_unlock_irq(q->queue_lock); - - nbd = req->rq_disk->private_data; - - BUG_ON(nbd->magic != NBD_MAGIC); - - dev_dbg(nbd_to_dev(nbd), "request %p: dequeued (flags=%x)\n", - req, req->cmd_type); - - if (unlikely(!nbd->sock)) { - dev_err_ratelimited(disk_to_dev(nbd->disk), - "Attempted send on closed socket\n"); - req->errors++; - nbd_end_request(nbd, req); - spin_lock_irq(q->queue_lock); - continue; - } - - spin_lock_irq(&nbd->queue_lock); - list_add_tail(&req->queuelist, &nbd->waiting_queue); - spin_unlock_irq(&nbd->queue_lock); - - wake_up(&nbd->waiting_wq); - - spin_lock_irq(q->queue_lock); - } + blk_mq_start_request(bd->rq); + nbd_handle_cmd(cmd); + return BLK_MQ_RQ_QUEUE_OK; } static int nbd_set_socket(struct nbd_device *nbd, struct socket *sock) @@ -657,15 +551,13 @@ out: /* Reset all properties of an NBD device */ static void nbd_reset(struct nbd_device *nbd) { - nbd->disconnect = false; - nbd->timedout = false; + nbd->runtime_flags = 0; nbd->blksize = 1024; nbd->bytesize = 0; set_capacity(nbd->disk, 0); nbd->flags = 0; - nbd->xmit_timeout = 0; + nbd->tag_set.timeout = 0; queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, nbd->disk->queue); - del_timer_sync(&nbd->timeout_timer); } static void nbd_bdev_reset(struct block_device *bdev) @@ -700,33 +592,37 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, { switch (cmd) { case NBD_DISCONNECT: { - struct request sreq; + struct request *sreq; dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); if (!nbd->sock) return -EINVAL; + sreq = blk_mq_alloc_request(bdev_get_queue(bdev), WRITE, 0); + if (!sreq) + return -ENOMEM; + mutex_unlock(&nbd->tx_lock); fsync_bdev(bdev); mutex_lock(&nbd->tx_lock); - blk_rq_init(NULL, &sreq); - sreq.cmd_type = REQ_TYPE_DRV_PRIV; + sreq->cmd_type = REQ_TYPE_DRV_PRIV; /* Check again after getting mutex back. */ - if (!nbd->sock) + if (!nbd->sock) { + blk_mq_free_request(sreq); return -EINVAL; + } - nbd->disconnect = true; + set_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags); - nbd_send_req(nbd, &sreq); + nbd_send_cmd(nbd, blk_mq_rq_to_pdu(sreq)); + blk_mq_free_request(sreq); return 0; } case NBD_CLEAR_SOCK: sock_shutdown(nbd); nbd_clear_que(nbd); - BUG_ON(!list_empty(&nbd->queue_head)); - BUG_ON(!list_empty(&nbd->waiting_queue)); kill_bdev(bdev); return 0; @@ -758,13 +654,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return nbd_size_set(nbd, bdev, nbd->blksize, arg); case NBD_SET_TIMEOUT: - nbd->xmit_timeout = arg * HZ; - if (arg) - mod_timer(&nbd->timeout_timer, - jiffies + nbd->xmit_timeout); - else - del_timer_sync(&nbd->timeout_timer); - + nbd->tag_set.timeout = arg * HZ; return 0; case NBD_SET_FLAGS: @@ -772,7 +662,6 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return 0; case NBD_DO_IT: { - struct task_struct *thread; int error; if (nbd->task_recv) @@ -786,18 +675,9 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, nbd_parse_flags(nbd, bdev); - thread = kthread_run(nbd_thread_send, nbd, "%s", - nbd_name(nbd)); - if (IS_ERR(thread)) { - mutex_lock(&nbd->tx_lock); - nbd->task_recv = NULL; - return PTR_ERR(thread); - } - nbd_dev_dbg_init(nbd); error = nbd_thread_recv(nbd, bdev); nbd_dev_dbg_close(nbd); - kthread_stop(thread); mutex_lock(&nbd->tx_lock); nbd->task_recv = NULL; @@ -807,9 +687,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, kill_bdev(bdev); nbd_bdev_reset(bdev); - if (nbd->disconnect) /* user requested, ignore socket errors */ + /* user requested, ignore socket errors */ + if (test_bit(NBD_DISCONNECT_REQUESTED, &nbd->runtime_flags)) error = 0; - if (nbd->timedout) + if (test_bit(NBD_TIMEDOUT, &nbd->runtime_flags)) error = -ETIMEDOUT; nbd_reset(nbd); @@ -825,10 +706,10 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, return 0; case NBD_PRINT_DEBUG: - dev_info(disk_to_dev(nbd->disk), - "next = %p, prev = %p, head = %p\n", - nbd->queue_head.next, nbd->queue_head.prev, - &nbd->queue_head); + /* + * For compatibility only, we no longer keep a list of + * outstanding requests. + */ return 0; } return -ENOTTY; @@ -935,7 +816,7 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd) debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_ops); debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize); - debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout); + debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout); debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize); debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops); @@ -987,6 +868,24 @@ static void nbd_dbg_close(void) #endif +static int nbd_init_request(void *data, struct request *rq, + unsigned int hctx_idx, unsigned int request_idx, + unsigned int numa_node) +{ + struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq); + + cmd->nbd = data; + INIT_LIST_HEAD(&cmd->list); + return 0; +} + +static struct blk_mq_ops nbd_mq_ops = { + .queue_rq = nbd_queue_rq, + .map_queue = blk_mq_map_queue, + .init_request = nbd_init_request, + .timeout = nbd_xmit_timeout, +}; + /* * And here should be modules and kernel interface * (Just smiley confuses emacs :-) @@ -1035,16 +934,34 @@ static int __init nbd_init(void) if (!disk) goto out; nbd_dev[i].disk = disk; + + nbd_dev[i].tag_set.ops = &nbd_mq_ops; + nbd_dev[i].tag_set.nr_hw_queues = 1; + nbd_dev[i].tag_set.queue_depth = 128; + nbd_dev[i].tag_set.numa_node = NUMA_NO_NODE; + nbd_dev[i].tag_set.cmd_size = sizeof(struct nbd_cmd); + nbd_dev[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE | + BLK_MQ_F_SG_MERGE | BLK_MQ_F_BLOCKING; + nbd_dev[i].tag_set.driver_data = &nbd_dev[i]; + + err = blk_mq_alloc_tag_set(&nbd_dev[i].tag_set); + if (err) { + put_disk(disk); + goto out; + } + /* * The new linux 2.5 block layer implementation requires * every gendisk to have its very own request_queue struct. * These structs are big so we dynamically allocate them. */ - disk->queue = blk_init_queue(nbd_request_handler, &nbd_lock); + disk->queue = blk_mq_init_queue(&nbd_dev[i].tag_set); if (!disk->queue) { + blk_mq_free_tag_set(&nbd_dev[i].tag_set); put_disk(disk); goto out; } + /* * Tell the block layer that we are not a rotational device */ @@ -1069,16 +986,8 @@ static int __init nbd_init(void) for (i = 0; i < nbds_max; i++) { struct gendisk *disk = nbd_dev[i].disk; nbd_dev[i].magic = NBD_MAGIC; - INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); - spin_lock_init(&nbd_dev[i].queue_lock); spin_lock_init(&nbd_dev[i].sock_lock); - INIT_LIST_HEAD(&nbd_dev[i].queue_head); mutex_init(&nbd_dev[i].tx_lock); - init_timer(&nbd_dev[i].timeout_timer); - nbd_dev[i].timeout_timer.function = nbd_xmit_timeout; - nbd_dev[i].timeout_timer.data = (unsigned long)&nbd_dev[i]; - init_waitqueue_head(&nbd_dev[i].active_wq); - init_waitqueue_head(&nbd_dev[i].waiting_wq); disk->major = NBD_MAJOR; disk->first_minor = i << part_shift; disk->fops = &nbd_fops; @@ -1091,6 +1000,7 @@ static int __init nbd_init(void) return 0; out: while (i--) { + blk_mq_free_tag_set(&nbd_dev[i].tag_set); blk_cleanup_queue(nbd_dev[i].disk->queue); put_disk(nbd_dev[i].disk); } @@ -1110,6 +1020,7 @@ static void __exit nbd_cleanup(void) if (disk) { del_gendisk(disk); blk_cleanup_queue(disk->queue); + blk_mq_free_tag_set(&nbd_dev[i].tag_set); put_disk(disk); } } diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 75a7f88d6717..91e1de898daf 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -34,6 +34,7 @@ struct nullb { unsigned int index; struct request_queue *q; struct gendisk *disk; + struct nvm_dev *ndev; struct blk_mq_tag_set tag_set; struct hrtimer timer; unsigned int queue_depth; @@ -414,23 +415,6 @@ static void cleanup_queues(struct nullb *nullb) kfree(nullb->queues); } -static void null_del_dev(struct nullb *nullb) -{ - list_del_init(&nullb->list); - - if (use_lightnvm) - nvm_unregister(nullb->disk_name); - else - del_gendisk(nullb->disk); - blk_cleanup_queue(nullb->q); - if (queue_mode == NULL_Q_MQ) - blk_mq_free_tag_set(&nullb->tag_set); - if (!use_lightnvm) - put_disk(nullb->disk); - cleanup_queues(nullb); - kfree(nullb); -} - #ifdef CONFIG_NVM static void null_lnvm_end_io(struct request *rq, int error) @@ -564,10 +548,58 @@ static struct nvm_dev_ops null_lnvm_dev_ops = { /* Simulate nvme protocol restriction */ .max_phys_sect = 64, }; + +static int null_nvm_register(struct nullb *nullb) +{ + struct nvm_dev *dev; + int rv; + + dev = nvm_alloc_dev(0); + if (!dev) + return -ENOMEM; + + dev->q = nullb->q; + memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN); + dev->ops = &null_lnvm_dev_ops; + + rv = nvm_register(dev); + if (rv) { + kfree(dev); + return rv; + } + nullb->ndev = dev; + return 0; +} + +static void null_nvm_unregister(struct nullb *nullb) +{ + nvm_unregister(nullb->ndev); +} #else -static struct nvm_dev_ops null_lnvm_dev_ops; +static int null_nvm_register(struct nullb *nullb) +{ + return -EINVAL; +} +static void null_nvm_unregister(struct nullb *nullb) {} #endif /* CONFIG_NVM */ +static void null_del_dev(struct nullb *nullb) +{ + list_del_init(&nullb->list); + + if (use_lightnvm) + null_nvm_unregister(nullb); + else + del_gendisk(nullb->disk); + blk_cleanup_queue(nullb->q); + if (queue_mode == NULL_Q_MQ) + blk_mq_free_tag_set(&nullb->tag_set); + if (!use_lightnvm) + put_disk(nullb->disk); + cleanup_queues(nullb); + kfree(nullb); +} + static int null_open(struct block_device *bdev, fmode_t mode) { return 0; @@ -640,11 +672,32 @@ static int init_driver_queues(struct nullb *nullb) return 0; } -static int null_add_dev(void) +static int null_gendisk_register(struct nullb *nullb) { struct gendisk *disk; - struct nullb *nullb; sector_t size; + + disk = nullb->disk = alloc_disk_node(1, home_node); + if (!disk) + return -ENOMEM; + size = gb * 1024 * 1024 * 1024ULL; + set_capacity(disk, size >> 9); + + disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->major = null_major; + disk->first_minor = nullb->index; + disk->fops = &null_fops; + disk->private_data = nullb; + disk->queue = nullb->q; + strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); + + add_disk(disk); + return 0; +} + +static int null_add_dev(void) +{ + struct nullb *nullb; int rv; nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, home_node); @@ -716,42 +769,19 @@ static int null_add_dev(void) sprintf(nullb->disk_name, "nullb%d", nullb->index); - if (use_lightnvm) { - rv = nvm_register(nullb->q, nullb->disk_name, - &null_lnvm_dev_ops); - if (rv) - goto out_cleanup_blk_queue; - goto done; - } - - disk = nullb->disk = alloc_disk_node(1, home_node); - if (!disk) { - rv = -ENOMEM; - goto out_cleanup_lightnvm; - } - size = gb * 1024 * 1024 * 1024ULL; - set_capacity(disk, size >> 9); - - disk->flags |= GENHD_FL_EXT_DEVT | GENHD_FL_SUPPRESS_PARTITION_INFO; - disk->major = null_major; - disk->first_minor = nullb->index; - disk->fops = &null_fops; - disk->private_data = nullb; - disk->queue = nullb->q; - strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN); + if (use_lightnvm) + rv = null_nvm_register(nullb); + else + rv = null_gendisk_register(nullb); - add_disk(disk); + if (rv) + goto out_cleanup_blk_queue; -done: mutex_lock(&lock); list_add_tail(&nullb->list, &nullb_list); mutex_unlock(&lock); return 0; - -out_cleanup_lightnvm: - if (use_lightnvm) - nvm_unregister(nullb->disk_name); out_cleanup_blk_queue: blk_cleanup_queue(nullb->q); out_cleanup_tags: diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig index 61c68a1f054a..2f5d5f4a4c75 100644 --- a/drivers/lightnvm/Kconfig +++ b/drivers/lightnvm/Kconfig @@ -4,7 +4,7 @@ menuconfig NVM bool "Open-Channel SSD target support" - depends on BLOCK + depends on BLOCK && HAS_DMA help Say Y here to get to enable Open-channel SSDs. diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile index a7a0a22cf1a5..1f6b6521016a 100644 --- a/drivers/lightnvm/Makefile +++ b/drivers/lightnvm/Makefile @@ -2,6 +2,6 @@ # Makefile for Open-Channel SSDs. # -obj-$(CONFIG_NVM) := core.o sysblk.o +obj-$(CONFIG_NVM) := core.o sysblk.o sysfs.o obj-$(CONFIG_NVM_GENNVM) += gennvm.o obj-$(CONFIG_NVM_RRPC) += rrpc.o diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index c784ddcd4405..1cac0f8bc0dc 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -27,6 +27,8 @@ #include <linux/lightnvm.h> #include <linux/sched/sysctl.h> +#include "lightnvm.h" + static LIST_HEAD(nvm_tgt_types); static DECLARE_RWSEM(nvm_tgtt_lock); static LIST_HEAD(nvm_mgrs); @@ -581,6 +583,8 @@ static int nvm_core_init(struct nvm_dev *dev) mutex_init(&dev->mlock); spin_lock_init(&dev->lock); + blk_queue_logical_block_size(dev->q, dev->sec_size); + return 0; err_fmtype: kfree(dev->lun_map); @@ -596,15 +600,19 @@ static void nvm_free_mgr(struct nvm_dev *dev) dev->mt = NULL; } -static void nvm_free(struct nvm_dev *dev) +void nvm_free(struct nvm_dev *dev) { if (!dev) return; nvm_free_mgr(dev); + if (dev->dma_pool) + dev->ops->destroy_dma_pool(dev->dma_pool); + kfree(dev->lptbl); kfree(dev->lun_map); + kfree(dev); } static int nvm_init(struct nvm_dev *dev) @@ -651,30 +659,19 @@ err: static void nvm_exit(struct nvm_dev *dev) { - if (dev->dma_pool) - dev->ops->destroy_dma_pool(dev->dma_pool); - nvm_free(dev); + nvm_sysfs_unregister_dev(dev); +} - pr_info("nvm: successfully unloaded\n"); +struct nvm_dev *nvm_alloc_dev(int node) +{ + return kzalloc_node(sizeof(struct nvm_dev), GFP_KERNEL, node); } +EXPORT_SYMBOL(nvm_alloc_dev); -int nvm_register(struct request_queue *q, char *disk_name, - struct nvm_dev_ops *ops) +int nvm_register(struct nvm_dev *dev) { - struct nvm_dev *dev; int ret; - if (!ops->identity) - return -EINVAL; - - dev = kzalloc(sizeof(struct nvm_dev), GFP_KERNEL); - if (!dev) - return -ENOMEM; - - dev->q = q; - dev->ops = ops; - strncpy(dev->name, disk_name, DISK_NAME_LEN); - ret = nvm_init(dev); if (ret) goto err_init; @@ -694,6 +691,10 @@ int nvm_register(struct request_queue *q, char *disk_name, } } + ret = nvm_sysfs_register_dev(dev); + if (ret) + goto err_ppalist; + if (dev->identity.cap & NVM_ID_DCAP_BBLKMGMT) { ret = nvm_get_sysblock(dev, &dev->sb); if (!ret) @@ -710,31 +711,21 @@ int nvm_register(struct request_queue *q, char *disk_name, up_write(&nvm_lock); return 0; +err_ppalist: + dev->ops->destroy_dma_pool(dev->dma_pool); err_init: kfree(dev->lun_map); - kfree(dev); return ret; } EXPORT_SYMBOL(nvm_register); -void nvm_unregister(char *disk_name) +void nvm_unregister(struct nvm_dev *dev) { - struct nvm_dev *dev; - down_write(&nvm_lock); - dev = nvm_find_nvm_dev(disk_name); - if (!dev) { - pr_err("nvm: could not find device %s to unregister\n", - disk_name); - up_write(&nvm_lock); - return; - } - list_del(&dev->devices); up_write(&nvm_lock); nvm_exit(dev); - kfree(dev); } EXPORT_SYMBOL(nvm_unregister); diff --git a/drivers/lightnvm/lightnvm.h b/drivers/lightnvm/lightnvm.h new file mode 100644 index 000000000000..305c181509a6 --- /dev/null +++ b/drivers/lightnvm/lightnvm.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2016 CNEX Labs. All rights reserved. + * Initial release: Matias Bjorling <matias@cnexlabs.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#ifndef LIGHTNVM_H +#define LIGHTNVM_H + +#include <linux/lightnvm.h> + +/* core -> sysfs.c */ +int __must_check nvm_sysfs_register_dev(struct nvm_dev *); +void nvm_sysfs_unregister_dev(struct nvm_dev *); +int nvm_sysfs_register(void); +void nvm_sysfs_unregister(void); + +/* sysfs > core */ +void nvm_free(struct nvm_dev *); + +#endif diff --git a/drivers/lightnvm/sysfs.c b/drivers/lightnvm/sysfs.c new file mode 100644 index 000000000000..0338c27ab95a --- /dev/null +++ b/drivers/lightnvm/sysfs.c @@ -0,0 +1,198 @@ +#include <linux/kernel.h> +#include <linux/lightnvm.h> +#include <linux/miscdevice.h> +#include <linux/kobject.h> +#include <linux/blk-mq.h> + +#include "lightnvm.h" + +static ssize_t nvm_dev_attr_show(struct device *dev, + struct device_attribute *dattr, char *page) +{ + struct nvm_dev *ndev = container_of(dev, struct nvm_dev, dev); + struct nvm_id *id = &ndev->identity; + struct nvm_id_group *grp = &id->groups[0]; + struct attribute *attr = &dattr->attr; + + if (strcmp(attr->name, "version") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", id->ver_id); + } else if (strcmp(attr->name, "vendor_opcode") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", id->vmnt); + } else if (strcmp(attr->name, "capabilities") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", id->cap); + } else if (strcmp(attr->name, "device_mode") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", id->dom); + } else if (strcmp(attr->name, "media_manager") == 0) { + if (!ndev->mt) + return scnprintf(page, PAGE_SIZE, "%s\n", "none"); + return scnprintf(page, PAGE_SIZE, "%s\n", ndev->mt->name); + } else if (strcmp(attr->name, "ppa_format") == 0) { + return scnprintf(page, PAGE_SIZE, + "0x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x%02x\n", + id->ppaf.ch_offset, id->ppaf.ch_len, + id->ppaf.lun_offset, id->ppaf.lun_len, + id->ppaf.pln_offset, id->ppaf.pln_len, + id->ppaf.blk_offset, id->ppaf.blk_len, + id->ppaf.pg_offset, id->ppaf.pg_len, + id->ppaf.sect_offset, id->ppaf.sect_len); + } else if (strcmp(attr->name, "media_type") == 0) { /* u8 */ + return scnprintf(page, PAGE_SIZE, "%u\n", grp->mtype); + } else if (strcmp(attr->name, "flash_media_type") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->fmtype); + } else if (strcmp(attr->name, "num_channels") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_ch); + } else if (strcmp(attr->name, "num_luns") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_lun); + } else if (strcmp(attr->name, "num_planes") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln); + } else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */ + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk); + } else if (strcmp(attr->name, "num_pages") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg); + } else if (strcmp(attr->name, "page_size") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->fpg_sz); + } else if (strcmp(attr->name, "hw_sector_size") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->csecs); + } else if (strcmp(attr->name, "oob_sector_size") == 0) {/* u32 */ + return scnprintf(page, PAGE_SIZE, "%u\n", grp->sos); + } else if (strcmp(attr->name, "read_typ") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdt); + } else if (strcmp(attr->name, "read_max") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->trdm); + } else if (strcmp(attr->name, "prog_typ") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprt); + } else if (strcmp(attr->name, "prog_max") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->tprm); + } else if (strcmp(attr->name, "erase_typ") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->tbet); + } else if (strcmp(attr->name, "erase_max") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", grp->tbem); + } else if (strcmp(attr->name, "multiplane_modes") == 0) { + return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mpos); + } else if (strcmp(attr->name, "media_capabilities") == 0) { + return scnprintf(page, PAGE_SIZE, "0x%08x\n", grp->mccap); + } else if (strcmp(attr->name, "max_phys_secs") == 0) { + return scnprintf(page, PAGE_SIZE, "%u\n", + ndev->ops->max_phys_sect); + } else { + return scnprintf(page, + PAGE_SIZE, + "Unhandled attr(%s) in `nvm_dev_attr_show`\n", + attr->name); + } +} + +#define NVM_DEV_ATTR_RO(_name) \ + DEVICE_ATTR(_name, S_IRUGO, nvm_dev_attr_show, NULL) + +static NVM_DEV_ATTR_RO(version); +static NVM_DEV_ATTR_RO(vendor_opcode); +static NVM_DEV_ATTR_RO(capabilities); +static NVM_DEV_ATTR_RO(device_mode); +static NVM_DEV_ATTR_RO(ppa_format); +static NVM_DEV_ATTR_RO(media_manager); + +static NVM_DEV_ATTR_RO(media_type); +static NVM_DEV_ATTR_RO(flash_media_type); +static NVM_DEV_ATTR_RO(num_channels); +static NVM_DEV_ATTR_RO(num_luns); +static NVM_DEV_ATTR_RO(num_planes); +static NVM_DEV_ATTR_RO(num_blocks); +static NVM_DEV_ATTR_RO(num_pages); +static NVM_DEV_ATTR_RO(page_size); +static NVM_DEV_ATTR_RO(hw_sector_size); +static NVM_DEV_ATTR_RO(oob_sector_size); +static NVM_DEV_ATTR_RO(read_typ); +static NVM_DEV_ATTR_RO(read_max); +static NVM_DEV_ATTR_RO(prog_typ); +static NVM_DEV_ATTR_RO(prog_max); +static NVM_DEV_ATTR_RO(erase_typ); +static NVM_DEV_ATTR_RO(erase_max); +static NVM_DEV_ATTR_RO(multiplane_modes); +static NVM_DEV_ATTR_RO(media_capabilities); +static NVM_DEV_ATTR_RO(max_phys_secs); + +#define NVM_DEV_ATTR(_name) (dev_attr_##_name##) + +static struct attribute *nvm_dev_attrs[] = { + &dev_attr_version.attr, + &dev_attr_vendor_opcode.attr, + &dev_attr_capabilities.attr, + &dev_attr_device_mode.attr, + &dev_attr_media_manager.attr, + + &dev_attr_ppa_format.attr, + &dev_attr_media_type.attr, + &dev_attr_flash_media_type.attr, + &dev_attr_num_channels.attr, + &dev_attr_num_luns.attr, + &dev_attr_num_planes.attr, + &dev_attr_num_blocks.attr, + &dev_attr_num_pages.attr, + &dev_attr_page_size.attr, + &dev_attr_hw_sector_size.attr, + &dev_attr_oob_sector_size.attr, + &dev_attr_read_typ.attr, + &dev_attr_read_max.attr, + &dev_attr_prog_typ.attr, + &dev_attr_prog_max.attr, + &dev_attr_erase_typ.attr, + &dev_attr_erase_max.attr, + &dev_attr_multiplane_modes.attr, + &dev_attr_media_capabilities.attr, + &dev_attr_max_phys_secs.attr, + NULL, +}; + +static struct attribute_group nvm_dev_attr_group = { + .name = "lightnvm", + .attrs = nvm_dev_attrs, +}; + +static const struct attribute_group *nvm_dev_attr_groups[] = { + &nvm_dev_attr_group, + NULL, +}; + +static void nvm_dev_release(struct device *device) +{ + struct nvm_dev *dev = container_of(device, struct nvm_dev, dev); + struct request_queue *q = dev->q; + + pr_debug("nvm/sysfs: `nvm_dev_release`\n"); + + blk_mq_unregister_dev(device, q); + + nvm_free(dev); +} + +static struct device_type nvm_type = { + .name = "lightnvm", + .groups = nvm_dev_attr_groups, + .release = nvm_dev_release, +}; + +int nvm_sysfs_register_dev(struct nvm_dev *dev) +{ + int ret; + + if (!dev->parent_dev) + return 0; + + dev->dev.parent = dev->parent_dev; + dev_set_name(&dev->dev, "%s", dev->name); + dev->dev.type = &nvm_type; + device_initialize(&dev->dev); + ret = device_add(&dev->dev); + + if (!ret) + blk_mq_register_dev(&dev->dev, dev->q); + + return ret; +} + +void nvm_sysfs_unregister_dev(struct nvm_dev *dev) +{ + if (dev && dev->parent_dev) + kobject_put(&dev->dev.kobj); +} diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 76f7534d1dd1..81d3db40cd7b 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -361,12 +361,8 @@ static void __btree_node_write_done(struct closure *cl) static void btree_node_write_done(struct closure *cl) { struct btree *b = container_of(cl, struct btree, io); - struct bio_vec *bv; - int n; - - bio_for_each_segment_all(bv, b->bio, n) - __free_page(bv->bv_page); + bio_free_pages(b->bio); __btree_node_write_done(cl); } diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c index c28df164701e..333a1e5f6ae6 100644 --- a/drivers/md/bcache/debug.c +++ b/drivers/md/bcache/debug.c @@ -107,9 +107,8 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) { char name[BDEVNAME_SIZE]; struct bio *check; - struct bio_vec bv, *bv2; + struct bio_vec bv; struct bvec_iter iter; - int i; check = bio_clone(bio, GFP_NOIO); if (!check) @@ -136,8 +135,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio) kunmap_atomic(p1); } - bio_for_each_segment_all(bv2, check, i) - __free_page(bv2->bv_page); + bio_free_pages(check); out_put: bio_put(check); } diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c index 1881319f2298..5c4bddecfaf0 100644 --- a/drivers/md/bcache/movinggc.c +++ b/drivers/md/bcache/movinggc.c @@ -44,11 +44,8 @@ static void write_moving_finish(struct closure *cl) { struct moving_io *io = container_of(cl, struct moving_io, cl); struct bio *bio = &io->bio.bio; - struct bio_vec *bv; - int i; - bio_for_each_segment_all(bv, bio, i) - __free_page(bv->bv_page); + bio_free_pages(bio); if (io->op.replace_collision) trace_bcache_gc_copy_collision(&io->w->key); diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 4b177fe11ebb..40ffe5e424b3 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -694,13 +694,8 @@ static void cached_dev_cache_miss_done(struct closure *cl) if (s->iop.replace_collision) bch_mark_cache_miss_collision(s->iop.c, s->d); - if (s->iop.bio) { - int i; - struct bio_vec *bv; - - bio_for_each_segment_all(bv, s->iop.bio, i) - __free_page(bv->bv_page); - } + if (s->iop.bio) + bio_free_pages(s->iop.bio); cached_dev_bio_complete(cl); } diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index d9fd2a62e5f6..e51644e503a5 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -128,11 +128,8 @@ static void write_dirty_finish(struct closure *cl) struct dirty_io *io = container_of(cl, struct dirty_io, cl); struct keybuf_key *w = io->bio.bi_private; struct cached_dev *dc = io->dc; - struct bio_vec *bv; - int i; - bio_for_each_segment_all(bv, &io->bio, i) - __free_page(bv->bv_page); + bio_free_pages(&io->bio); /* This is kind of a dumb way of signalling errors. */ if (KEY_DIRTY(&w->key)) { diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c index 874295757caa..0448e7e35c8c 100644 --- a/drivers/md/dm-crypt.c +++ b/drivers/md/dm-crypt.c @@ -1136,7 +1136,7 @@ static void clone_init(struct dm_crypt_io *io, struct bio *clone) clone->bi_private = io; clone->bi_end_io = crypt_endio; clone->bi_bdev = cc->dev->bdev; - bio_set_op_attrs(clone, bio_op(io->base_bio), io->base_bio->bi_opf); + bio_set_op_attrs(clone, bio_op(io->base_bio), bio_flags(io->base_bio)); } static int kcryptd_io_read(struct dm_crypt_io *io, gfp_t gfp) diff --git a/drivers/md/dm-log-writes.c b/drivers/md/dm-log-writes.c index 49e4d8d4558f..4dfe38655a49 100644 --- a/drivers/md/dm-log-writes.c +++ b/drivers/md/dm-log-writes.c @@ -149,8 +149,6 @@ static void put_io_block(struct log_writes_c *lc) static void log_end_io(struct bio *bio) { struct log_writes_c *lc = bio->bi_private; - struct bio_vec *bvec; - int i; if (bio->bi_error) { unsigned long flags; @@ -161,9 +159,7 @@ static void log_end_io(struct bio *bio) spin_unlock_irqrestore(&lc->blocks_lock, flags); } - bio_for_each_segment_all(bvec, bio, i) - __free_page(bvec->bv_page); - + bio_free_pages(bio); put_io_block(lc); bio_put(bio); } diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c index 1ca7463e8bb2..ee48230a2952 100644 --- a/drivers/md/dm-rq.c +++ b/drivers/md/dm-rq.c @@ -955,7 +955,7 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t) dm_init_md_queue(md); /* backfill 'mq' sysfs registration normally done in blk_register_queue */ - blk_mq_register_disk(md->disk); + blk_mq_register_dev(disk_to_dev(md->disk), q); return 0; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 21dc00eb1989..1961d827dbd1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -145,12 +145,8 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) return r1_bio; out_free_pages: - while (--j >= 0) { - struct bio_vec *bv; - - bio_for_each_segment_all(bv, r1_bio->bios[j], i) - __free_page(bv->bv_page); - } + while (--j >= 0) + bio_free_pages(r1_bio->bios[j]); out_free_bio: while (++j < pi->raid_disks) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2feacc70bf61..4669c052239e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -156,12 +156,14 @@ static void nvme_free_ns(struct kref *kref) { struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); - if (ns->type == NVME_NS_LIGHTNVM) - nvme_nvm_unregister(ns->queue, ns->disk->disk_name); + if (ns->ndev) + nvme_nvm_unregister(ns); - spin_lock(&dev_list_lock); - ns->disk->private_data = NULL; - spin_unlock(&dev_list_lock); + if (ns->disk) { + spin_lock(&dev_list_lock); + ns->disk->private_data = NULL; + spin_unlock(&dev_list_lock); + } put_disk(ns->disk); ida_simple_remove(&ns->ctrl->ns_ida, ns->instance); @@ -597,7 +599,7 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, } int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, - dma_addr_t dma_addr, u32 *result) + void *buffer, size_t buflen, u32 *result) { struct nvme_command c; struct nvme_completion cqe; @@ -606,10 +608,9 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_get_features; c.features.nsid = cpu_to_le32(nsid); - c.features.dptr.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); - ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0, + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, buffer, buflen, 0, NVME_QID_ANY, 0, 0); if (ret >= 0 && result) *result = le32_to_cpu(cqe.result); @@ -617,7 +618,7 @@ int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, } int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, - dma_addr_t dma_addr, u32 *result) + void *buffer, size_t buflen, u32 *result) { struct nvme_command c; struct nvme_completion cqe; @@ -625,12 +626,11 @@ int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, memset(&c, 0, sizeof(c)); c.features.opcode = nvme_admin_set_features; - c.features.dptr.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); c.features.dword11 = cpu_to_le32(dword11); - ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, NULL, 0, 0, - NVME_QID_ANY, 0, 0); + ret = __nvme_submit_sync_cmd(dev->admin_q, &c, &cqe, + buffer, buflen, 0, NVME_QID_ANY, 0, 0); if (ret >= 0 && result) *result = le32_to_cpu(cqe.result); return ret; @@ -664,7 +664,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) u32 result; int status, nr_io_queues; - status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0, + status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, NULL, 0, &result); if (status < 0) return status; @@ -888,42 +888,32 @@ static void nvme_config_discard(struct nvme_ns *ns) queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); } -static int nvme_revalidate_disk(struct gendisk *disk) +static int nvme_revalidate_ns(struct nvme_ns *ns, struct nvme_id_ns **id) { - struct nvme_ns *ns = disk->private_data; - struct nvme_id_ns *id; - u8 lbaf, pi_type; - u16 old_ms; - unsigned short bs; - - if (test_bit(NVME_NS_DEAD, &ns->flags)) { - set_capacity(disk, 0); - return -ENODEV; - } - if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) { - dev_warn(disk_to_dev(ns->disk), "%s: Identify failure\n", - __func__); - return -ENODEV; - } - if (id->ncap == 0) { - kfree(id); + if (nvme_identify_ns(ns->ctrl, ns->ns_id, id)) { + dev_warn(ns->ctrl->dev, "%s: Identify failure\n", __func__); return -ENODEV; } - if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) { - if (nvme_nvm_register(ns->queue, disk->disk_name)) { - dev_warn(disk_to_dev(ns->disk), - "%s: LightNVM init failure\n", __func__); - kfree(id); - return -ENODEV; - } - ns->type = NVME_NS_LIGHTNVM; + if ((*id)->ncap == 0) { + kfree(*id); + return -ENODEV; } if (ns->ctrl->vs >= NVME_VS(1, 1)) - memcpy(ns->eui, id->eui64, sizeof(ns->eui)); + memcpy(ns->eui, (*id)->eui64, sizeof(ns->eui)); if (ns->ctrl->vs >= NVME_VS(1, 2)) - memcpy(ns->uuid, id->nguid, sizeof(ns->uuid)); + memcpy(ns->uuid, (*id)->nguid, sizeof(ns->uuid)); + + return 0; +} + +static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id) +{ + struct nvme_ns *ns = disk->private_data; + u8 lbaf, pi_type; + u16 old_ms; + unsigned short bs; old_ms = ns->ms; lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; @@ -962,8 +952,26 @@ static int nvme_revalidate_disk(struct gendisk *disk) if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) nvme_config_discard(ns); blk_mq_unfreeze_queue(disk->queue); +} + +static int nvme_revalidate_disk(struct gendisk *disk) +{ + struct nvme_ns *ns = disk->private_data; + struct nvme_id_ns *id = NULL; + int ret; + + if (test_bit(NVME_NS_DEAD, &ns->flags)) { + set_capacity(disk, 0); + return -ENODEV; + } + + ret = nvme_revalidate_ns(ns, &id); + if (ret) + return ret; + __nvme_revalidate_disk(disk, id); kfree(id); + return 0; } @@ -1425,7 +1433,7 @@ static DEVICE_ATTR(rescan_controller, S_IWUSR, NULL, nvme_sysfs_rescan); static ssize_t wwid_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); struct nvme_ctrl *ctrl = ns->ctrl; int serial_len = sizeof(ctrl->serial); int model_len = sizeof(ctrl->model); @@ -1449,7 +1457,7 @@ static DEVICE_ATTR(wwid, S_IRUGO, wwid_show, NULL); static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); return sprintf(buf, "%pU\n", ns->uuid); } static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); @@ -1457,7 +1465,7 @@ static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); static ssize_t eui_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); return sprintf(buf, "%8phd\n", ns->eui); } static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); @@ -1465,7 +1473,7 @@ static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); return sprintf(buf, "%d\n", ns->ns_id); } static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); @@ -1482,7 +1490,7 @@ static umode_t nvme_ns_attrs_are_visible(struct kobject *kobj, struct attribute *a, int n) { struct device *dev = container_of(kobj, struct device, kobj); - struct nvme_ns *ns = dev_to_disk(dev)->private_data; + struct nvme_ns *ns = nvme_get_ns_from_dev(dev); if (a == &dev_attr_uuid.attr) { if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) @@ -1642,6 +1650,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns *ns; struct gendisk *disk; + struct nvme_id_ns *id; + char disk_name[DISK_NAME_LEN]; int node = dev_to_node(ctrl->dev); ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); @@ -1659,34 +1669,49 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns->queue->queuedata = ns; ns->ctrl = ctrl; - disk = alloc_disk_node(0, node); - if (!disk) - goto out_free_queue; - kref_init(&ns->kref); ns->ns_id = nsid; - ns->disk = disk; ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ - blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); nvme_set_queue_limits(ctrl, ns->queue); - disk->fops = &nvme_fops; - disk->private_data = ns; - disk->queue = ns->queue; - disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance); + sprintf(disk_name, "nvme%dn%d", ctrl->instance, ns->instance); + + if (nvme_revalidate_ns(ns, &id)) + goto out_free_queue; - if (nvme_revalidate_disk(ns->disk)) - goto out_free_disk; + if (nvme_nvm_ns_supported(ns, id)) { + if (nvme_nvm_register(ns, disk_name, node, + &nvme_ns_attr_group)) { + dev_warn(ctrl->dev, "%s: LightNVM init failure\n", + __func__); + goto out_free_id; + } + } else { + disk = alloc_disk_node(0, node); + if (!disk) + goto out_free_id; + + disk->fops = &nvme_fops; + disk->private_data = ns; + disk->queue = ns->queue; + disk->flags = GENHD_FL_EXT_DEVT; + memcpy(disk->disk_name, disk_name, DISK_NAME_LEN); + ns->disk = disk; + + __nvme_revalidate_disk(disk, id); + } mutex_lock(&ctrl->namespaces_mutex); list_add_tail(&ns->list, &ctrl->namespaces); mutex_unlock(&ctrl->namespaces_mutex); kref_get(&ctrl->kref); - if (ns->type == NVME_NS_LIGHTNVM) + + kfree(id); + + if (ns->ndev) return; device_add_disk(ctrl->device, ns->disk); @@ -1695,8 +1720,8 @@ static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) pr_warn("%s: failed to create sysfs group for identification\n", ns->disk->disk_name); return; - out_free_disk: - kfree(disk); + out_free_id: + kfree(id); out_free_queue: blk_cleanup_queue(ns->queue); out_release_instance: @@ -1710,7 +1735,7 @@ static void nvme_ns_remove(struct nvme_ns *ns) if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags)) return; - if (ns->disk->flags & GENHD_FL_UP) { + if (ns->disk && ns->disk->flags & GENHD_FL_UP) { if (blk_get_integrity(ns->disk)) blk_integrity_unregister(ns->disk); sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, @@ -1733,7 +1758,7 @@ static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) ns = nvme_find_get_ns(ctrl, nsid); if (ns) { - if (revalidate_disk(ns->disk)) + if (ns->disk && revalidate_disk(ns->disk)) nvme_ns_remove(ns); nvme_put_ns(ns); } else @@ -2038,7 +2063,7 @@ void nvme_kill_queues(struct nvme_ctrl *ctrl) * Revalidating a dead namespace sets capacity to 0. This will * end buffered writers dirtying pages that can't be synced. */ - if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags)) + if (ns->disk && !test_and_set_bit(NVME_NS_DEAD, &ns->flags)) revalidate_disk(ns->disk); blk_set_queue_dying(ns->queue); diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 4eff49174466..5a3f008d3480 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -111,8 +111,19 @@ static void nvmf_host_put(struct nvmf_host *host) */ int nvmf_get_address(struct nvme_ctrl *ctrl, char *buf, int size) { - return snprintf(buf, size, "traddr=%s,trsvcid=%s\n", - ctrl->opts->traddr, ctrl->opts->trsvcid); + int len = 0; + + if (ctrl->opts->mask & NVMF_OPT_TRADDR) + len += snprintf(buf, size, "traddr=%s", ctrl->opts->traddr); + if (ctrl->opts->mask & NVMF_OPT_TRSVCID) + len += snprintf(buf + len, size - len, "%strsvcid=%s", + (len) ? "," : "", ctrl->opts->trsvcid); + if (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR) + len += snprintf(buf + len, size - len, "%shost_traddr=%s", + (len) ? "," : "", ctrl->opts->host_traddr); + len += snprintf(buf + len, size - len, "\n"); + + return len; } EXPORT_SYMBOL_GPL(nvmf_get_address); @@ -519,6 +530,7 @@ static const match_table_t opt_tokens = { { NVMF_OPT_RECONNECT_DELAY, "reconnect_delay=%d" }, { NVMF_OPT_KATO, "keep_alive_tmo=%d" }, { NVMF_OPT_HOSTNQN, "hostnqn=%s" }, + { NVMF_OPT_HOST_TRADDR, "host_traddr=%s" }, { NVMF_OPT_ERR, NULL } }; @@ -675,6 +687,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, } opts->reconnect_delay = token; break; + case NVMF_OPT_HOST_TRADDR: + p = match_strdup(args); + if (!p) { + ret = -ENOMEM; + goto out; + } + opts->host_traddr = p; + break; default: pr_warn("unknown parameter or missing value '%s' in ctrl creation request\n", p); @@ -741,6 +761,7 @@ void nvmf_free_options(struct nvmf_ctrl_options *opts) kfree(opts->traddr); kfree(opts->trsvcid); kfree(opts->subsysnqn); + kfree(opts->host_traddr); kfree(opts); } EXPORT_SYMBOL_GPL(nvmf_free_options); diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index 46e460aee52d..924145c979f1 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -52,6 +52,7 @@ enum { NVMF_OPT_KATO = 1 << 7, NVMF_OPT_HOSTNQN = 1 << 8, NVMF_OPT_RECONNECT_DELAY = 1 << 9, + NVMF_OPT_HOST_TRADDR = 1 << 10, }; /** @@ -64,9 +65,12 @@ enum { * being added. * @subsysnqn: Hold the fully qualified NQN subystem name (format defined * in the NVMe specification, "NVMe Qualified Names"). - * @traddr: network address that will be used by the host to communicate - * to the added NVMe controller. - * @trsvcid: network port used for host-controller communication. + * @traddr: The transport-specific TRADDR field for a port on the + * subsystem which is adding a controller. + * @trsvcid: The transport-specific TRSVCID field for a port on the + * subsystem which is adding a controller. + * @host_traddr: A transport-specific field identifying the NVME host port + * to use for the connection to the controller. * @queue_size: Number of IO queue elements. * @nr_io_queues: Number of controller IO queues that will be established. * @reconnect_delay: Time between two consecutive reconnect attempts. @@ -80,6 +84,7 @@ struct nvmf_ctrl_options { char *subsysnqn; char *traddr; char *trsvcid; + char *host_traddr; size_t queue_size; unsigned int nr_io_queues; unsigned int reconnect_delay; diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 63f483daf930..f5e3011e31fc 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -475,7 +475,7 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd, if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD) c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns, - rqd->bio->bi_iter.bi_sector)); + rqd->bio->bi_iter.bi_sector)); } static void nvme_nvm_end_io(struct request *rq, int error) @@ -592,14 +592,37 @@ static struct nvm_dev_ops nvme_nvm_dev_ops = { .max_phys_sect = 64, }; -int nvme_nvm_register(struct request_queue *q, char *disk_name) +int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node, + const struct attribute_group *attrs) { - return nvm_register(q, disk_name, &nvme_nvm_dev_ops); + struct request_queue *q = ns->queue; + struct nvm_dev *dev; + int ret; + + dev = nvm_alloc_dev(node); + if (!dev) + return -ENOMEM; + + dev->q = q; + memcpy(dev->name, disk_name, DISK_NAME_LEN); + dev->ops = &nvme_nvm_dev_ops; + dev->parent_dev = ns->ctrl->device; + dev->private_data = ns; + ns->ndev = dev; + + ret = nvm_register(dev); + + ns->lba_shift = ilog2(dev->sec_size) - 9; + + if (sysfs_create_group(&dev->dev.kobj, attrs)) + pr_warn("%s: failed to create sysfs group for identification\n", + disk_name); + return ret; } -void nvme_nvm_unregister(struct request_queue *q, char *disk_name) +void nvme_nvm_unregister(struct nvme_ns *ns) { - nvm_unregister(disk_name); + nvm_unregister(ns->ndev); } /* move to shared place when used in multiple places. */ diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index ab18b78102bf..b0a9ec681685 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -18,6 +18,7 @@ #include <linux/pci.h> #include <linux/kref.h> #include <linux/blk-mq.h> +#include <linux/lightnvm.h> enum { /* @@ -154,6 +155,7 @@ struct nvme_ns { struct nvme_ctrl *ctrl; struct request_queue *queue; struct gendisk *disk; + struct nvm_dev *ndev; struct kref kref; int instance; @@ -165,7 +167,6 @@ struct nvme_ns { u16 ms; bool ext; u8 pi_type; - int type; unsigned long flags; #define NVME_NS_REMOVING 0 @@ -292,9 +293,9 @@ int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, struct nvme_id_ns **id); int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log); int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, - dma_addr_t dma_addr, u32 *result); + void *buffer, size_t buflen, u32 *result); int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, - dma_addr_t dma_addr, u32 *result); + void *buffer, size_t buflen, u32 *result); int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); @@ -307,20 +308,35 @@ int nvme_sg_get_version_num(int __user *ip); #ifdef CONFIG_NVM int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id); -int nvme_nvm_register(struct request_queue *q, char *disk_name); -void nvme_nvm_unregister(struct request_queue *q, char *disk_name); +int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, int node, + const struct attribute_group *attrs); +void nvme_nvm_unregister(struct nvme_ns *ns); + +static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) +{ + if (dev->type->devnode) + return dev_to_disk(dev)->private_data; + + return (container_of(dev, struct nvm_dev, dev))->private_data; +} #else -static inline int nvme_nvm_register(struct request_queue *q, char *disk_name) +static inline int nvme_nvm_register(struct nvme_ns *ns, char *disk_name, + int node, + const struct attribute_group *attrs) { return 0; } -static inline void nvme_nvm_unregister(struct request_queue *q, char *disk_name) {}; +static inline void nvme_nvm_unregister(struct nvme_ns *ns) {}; static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) { return 0; } +static inline struct nvme_ns *nvme_get_ns_from_dev(struct device *dev) +{ + return dev_to_disk(dev)->private_data; +} #endif /* CONFIG_NVM */ int __init nvme_core_init(void); diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index e947e298a737..c2a0a1c7d05d 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -72,15 +72,6 @@ static int sg_version_num = 30534; /* 2 digits for each component */ #define ALL_LUNS_RETURNED 0x02 #define ALL_WELL_KNOWN_LUNS_RETURNED 0x01 #define RESTRICTED_LUNS_RETURNED 0x00 -#define NVME_POWER_STATE_START_VALID 0x00 -#define NVME_POWER_STATE_ACTIVE 0x01 -#define NVME_POWER_STATE_IDLE 0x02 -#define NVME_POWER_STATE_STANDBY 0x03 -#define NVME_POWER_STATE_LU_CONTROL 0x07 -#define POWER_STATE_0 0 -#define POWER_STATE_1 1 -#define POWER_STATE_2 2 -#define POWER_STATE_3 3 #define DOWNLOAD_SAVE_ACTIVATE 0x05 #define DOWNLOAD_SAVE_DEFER_ACTIVATE 0x0E #define ACTIVATE_DEFERRED_MICROCODE 0x0F @@ -915,7 +906,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, kfree(smart_log); /* Get Features for Temp Threshold */ - res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, 0, + res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, NULL, 0, &feature_resp); if (res != NVME_SC_SUCCESS) temp_c_thresh = LOG_TEMP_UNKNOWN; @@ -1048,7 +1039,7 @@ static int nvme_trans_fill_caching_page(struct nvme_ns *ns, if (len < MODE_PAGE_CACHING_LEN) return -EINVAL; - nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, 0, + nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, NULL, 0, &feature_resp); res = nvme_trans_status_code(hdr, nvme_sc); if (res) @@ -1229,64 +1220,6 @@ static void nvme_trans_fill_read_cap(u8 *response, struct nvme_id_ns *id_ns, /* Start Stop Unit Helper Functions */ -static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 pc, u8 pcmod, u8 start) -{ - int res; - int nvme_sc; - struct nvme_id_ctrl *id_ctrl; - int lowest_pow_st; /* max npss = lowest power consumption */ - unsigned ps_desired = 0; - - nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; - - lowest_pow_st = max(POWER_STATE_0, (int)(id_ctrl->npss - 1)); - kfree(id_ctrl); - - switch (pc) { - case NVME_POWER_STATE_START_VALID: - /* Action unspecified if POWER CONDITION MODIFIER != 0 */ - if (pcmod == 0 && start == 0x1) - ps_desired = POWER_STATE_0; - if (pcmod == 0 && start == 0x0) - ps_desired = lowest_pow_st; - break; - case NVME_POWER_STATE_ACTIVE: - /* Action unspecified if POWER CONDITION MODIFIER != 0 */ - if (pcmod == 0) - ps_desired = POWER_STATE_0; - break; - case NVME_POWER_STATE_IDLE: - /* Action unspecified if POWER CONDITION MODIFIER != [0,1,2] */ - if (pcmod == 0x0) - ps_desired = POWER_STATE_1; - else if (pcmod == 0x1) - ps_desired = POWER_STATE_2; - else if (pcmod == 0x2) - ps_desired = POWER_STATE_3; - break; - case NVME_POWER_STATE_STANDBY: - /* Action unspecified if POWER CONDITION MODIFIER != [0,1] */ - if (pcmod == 0x0) - ps_desired = max(POWER_STATE_0, (lowest_pow_st - 2)); - else if (pcmod == 0x1) - ps_desired = max(POWER_STATE_0, (lowest_pow_st - 1)); - break; - case NVME_POWER_STATE_LU_CONTROL: - default: - res = nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - break; - } - nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_POWER_MGMT, ps_desired, 0, - NULL); - return nvme_trans_status_code(hdr, nvme_sc); -} - static int nvme_trans_send_activate_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 buffer_id) { @@ -1395,7 +1328,7 @@ static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, case MODE_PAGE_CACHING: dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0); nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, - dword11, 0, NULL); + dword11, NULL, 0, NULL); res = nvme_trans_status_code(hdr, nvme_sc); break; case MODE_PAGE_CONTROL: @@ -2235,11 +2168,10 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns, static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - u8 immed, pcmod, pc, no_flush, start; + u8 immed, pcmod, no_flush, start; immed = cmd[1] & 0x01; pcmod = cmd[3] & 0x0f; - pc = (cmd[4] & 0xf0) >> 4; no_flush = cmd[4] & 0x04; start = cmd[4] & 0x01; @@ -2254,8 +2186,8 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, if (res) return res; } - /* Setup the expected power state transition */ - return nvme_trans_power_state(ns, hdr, pc, pcmod, start); + + return 0; } } diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index 47c564b5a289..7ab9c9381b98 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -14,6 +14,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/module.h> #include <generated/utsrelease.h> +#include <asm/unaligned.h> #include "nvmet.h" u32 nvmet_get_log_page_len(struct nvme_command *cmd) @@ -29,8 +30,84 @@ u32 nvmet_get_log_page_len(struct nvme_command *cmd) return len; } +static u16 nvmet_get_smart_log_nsid(struct nvmet_req *req, + struct nvme_smart_log *slog) +{ + u16 status; + struct nvmet_ns *ns; + u64 host_reads, host_writes, data_units_read, data_units_written; + + status = NVME_SC_SUCCESS; + ns = nvmet_find_namespace(req->sq->ctrl, req->cmd->get_log_page.nsid); + if (!ns) { + status = NVME_SC_INVALID_NS; + pr_err("nvmet : Counld not find namespace id : %d\n", + le32_to_cpu(req->cmd->get_log_page.nsid)); + goto out; + } + + host_reads = part_stat_read(ns->bdev->bd_part, ios[READ]); + data_units_read = part_stat_read(ns->bdev->bd_part, sectors[READ]); + host_writes = part_stat_read(ns->bdev->bd_part, ios[WRITE]); + data_units_written = part_stat_read(ns->bdev->bd_part, sectors[WRITE]); + + put_unaligned_le64(host_reads, &slog->host_reads[0]); + put_unaligned_le64(data_units_read, &slog->data_units_read[0]); + put_unaligned_le64(host_writes, &slog->host_writes[0]); + put_unaligned_le64(data_units_written, &slog->data_units_written[0]); + nvmet_put_namespace(ns); +out: + return status; +} + +static u16 nvmet_get_smart_log_all(struct nvmet_req *req, + struct nvme_smart_log *slog) +{ + u16 status; + u64 host_reads = 0, host_writes = 0; + u64 data_units_read = 0, data_units_written = 0; + struct nvmet_ns *ns; + struct nvmet_ctrl *ctrl; + + status = NVME_SC_SUCCESS; + ctrl = req->sq->ctrl; + + rcu_read_lock(); + list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link) { + host_reads += part_stat_read(ns->bdev->bd_part, ios[READ]); + data_units_read += + part_stat_read(ns->bdev->bd_part, sectors[READ]); + host_writes += part_stat_read(ns->bdev->bd_part, ios[WRITE]); + data_units_written += + part_stat_read(ns->bdev->bd_part, sectors[WRITE]); + + } + rcu_read_unlock(); + + put_unaligned_le64(host_reads, &slog->host_reads[0]); + put_unaligned_le64(data_units_read, &slog->data_units_read[0]); + put_unaligned_le64(host_writes, &slog->host_writes[0]); + put_unaligned_le64(data_units_written, &slog->data_units_written[0]); + + return status; +} + +static u16 nvmet_get_smart_log(struct nvmet_req *req, + struct nvme_smart_log *slog) +{ + u16 status; + + WARN_ON(req == NULL || slog == NULL); + if (req->cmd->get_log_page.nsid == 0xFFFFFFFF) + status = nvmet_get_smart_log_all(req, slog); + else + status = nvmet_get_smart_log_nsid(req, slog); + return status; +} + static void nvmet_execute_get_log_page(struct nvmet_req *req) { + struct nvme_smart_log *smart_log; size_t data_len = nvmet_get_log_page_len(req->cmd); void *buf; u16 status = 0; @@ -59,6 +136,16 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) * available (e.g. units or commands read/written) those aren't * persistent over power loss. */ + if (data_len != sizeof(*smart_log)) { + status = NVME_SC_INTERNAL; + goto err; + } + smart_log = buf; + status = nvmet_get_smart_log(req, smart_log); + if (status) { + memset(buf, '\0', data_len); + goto err; + } break; case 0x03: /* @@ -73,6 +160,7 @@ static void nvmet_execute_get_log_page(struct nvmet_req *req) status = nvmet_copy_to_sgl(req, 0, buf, data_len); +err: kfree(buf); out: nvmet_req_complete(req, status); diff --git a/drivers/nvme/target/io-cmd.c b/drivers/nvme/target/io-cmd.c index 2cd069b691ae..4a96c2049b7b 100644 --- a/drivers/nvme/target/io-cmd.c +++ b/drivers/nvme/target/io-cmd.c @@ -58,6 +58,7 @@ static void nvmet_execute_rw(struct nvmet_req *req) if (req->cmd->rw.opcode == nvme_cmd_write) { op = REQ_OP_WRITE; + op_flags = WRITE_ODIRECT; if (req->cmd->rw.control & cpu_to_le16(NVME_RW_FUA)) op_flags |= REQ_FUA; } else { @@ -205,7 +206,7 @@ int nvmet_parse_io_cmd(struct nvmet_req *req) return 0; case nvme_cmd_dsm: req->execute = nvmet_execute_dsm; - req->data_len = le32_to_cpu(cmd->dsm.nr) * + req->data_len = le32_to_cpu(cmd->dsm.nr + 1) * sizeof(struct nvme_dsm_range); return 0; default: diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c index 7da05b159ade..bfe9f9994935 100644 --- a/fs/befs/linuxvfs.c +++ b/fs/befs/linuxvfs.c @@ -789,7 +789,7 @@ befs_fill_super(struct super_block *sb, void *data, int silent) * Will be set to real fs blocksize later. * * Linux 2.4.10 and later refuse to read blocks smaller than - * the hardsect size for the device. But we also need to read at + * the logical block size for the device. But we also need to read at * least 1k to get the second 512 bytes of the volume. * -WD 10-26-01 */ diff --git a/fs/block_dev.c b/fs/block_dev.c index 08ae99343d92..376e4e426324 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -180,9 +180,6 @@ blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) struct file *file = iocb->ki_filp; struct inode *inode = bdev_file_inode(file); - if (IS_DAX(inode)) - return dax_do_io(iocb, inode, iter, blkdev_get_block, - NULL, DIO_SKIP_DIO_COUNT); return __blockdev_direct_IO(iocb, inode, I_BDEV(inode), iter, blkdev_get_block, NULL, NULL, DIO_SKIP_DIO_COUNT); @@ -302,14 +299,11 @@ int thaw_bdev(struct block_device *bdev, struct super_block *sb) error = sb->s_op->thaw_super(sb); else error = thaw_super(sb); - if (error) { + if (error) bdev->bd_fsfreeze_count++; - mutex_unlock(&bdev->bd_fsfreeze_mutex); - return error; - } out: mutex_unlock(&bdev->bd_fsfreeze_mutex); - return 0; + return error; } EXPORT_SYMBOL(thaw_bdev); @@ -1275,7 +1269,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = disk; bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; - bdev->bd_inode->i_flags = 0; if (!partno) { ret = -ENXIO; @@ -1303,11 +1296,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) } } - if (!ret) { + if (!ret) bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); - if (!bdev_dax_capable(bdev)) - bdev->bd_inode->i_flags &= ~S_DAX; - } /* * If the device is invalidated, rescan partition @@ -1342,8 +1332,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) goto out_clear; } bd_set_size(bdev, (loff_t)bdev->bd_part->nr_sects << 9); - if (!bdev_dax_capable(bdev)) - bdev->bd_inode->i_flags &= ~S_DAX; } } else { if (bdev->bd_contains == bdev) { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e6811c42e41e..ca01106795ea 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8412,7 +8412,7 @@ static int btrfs_submit_direct_hook(struct btrfs_dio_private *dip, if (!bio) return -ENOMEM; - bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); + bio_set_op_attrs(bio, bio_op(orig_bio), bio_flags(orig_bio)); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; @@ -8450,7 +8450,8 @@ next_block: start_sector, GFP_NOFS); if (!bio) goto out_err; - bio_set_op_attrs(bio, bio_op(orig_bio), orig_bio->bi_opf); + bio_set_op_attrs(bio, bio_op(orig_bio), + bio_flags(orig_bio)); bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; diff --git a/include/linux/bio.h b/include/linux/bio.h index 23ddf4b46a9b..97cb48f03dc7 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -1,6 +1,4 @@ /* - * 2.5 block I/O model - * * Copyright (C) 2001 Jens Axboe <axboe@suse.de> * * This program is free software; you can redistribute it and/or modify @@ -461,6 +459,7 @@ static inline void bio_flush_dcache_pages(struct bio *bi) extern void bio_copy_data(struct bio *dst, struct bio *src); extern int bio_alloc_pages(struct bio *bio, gfp_t gfp); +extern void bio_free_pages(struct bio *bio); extern struct bio *bio_copy_user_iov(struct request_queue *, struct rq_map_data *, diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h index 10648e300c93..cbdbf34de5b6 100644 --- a/include/linux/blk-cgroup.h +++ b/include/linux/blk-cgroup.h @@ -45,7 +45,7 @@ struct blkcg { spinlock_t lock; struct radix_tree_root blkg_tree; - struct blkcg_gq *blkg_hint; + struct blkcg_gq __rcu *blkg_hint; struct hlist_head blkg_list; struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index e43bbffb5b7a..5daa0ef756dd 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -2,6 +2,7 @@ #define BLK_MQ_H #include <linux/blkdev.h> +#include <linux/sbitmap.h> struct blk_mq_tags; struct blk_flush_queue; @@ -12,21 +13,14 @@ struct blk_mq_cpu_notifier { int (*notify)(void *data, unsigned long action, unsigned int cpu); }; -struct blk_mq_ctxmap { - unsigned int size; - unsigned int bits_per_word; - struct blk_align_bitmap *map; -}; - struct blk_mq_hw_ctx { struct { spinlock_t lock; struct list_head dispatch; + unsigned long state; /* BLK_MQ_S_* flags */ } ____cacheline_aligned_in_smp; - unsigned long state; /* BLK_MQ_S_* flags */ - struct delayed_work run_work; - struct delayed_work delay_work; + struct work_struct run_work; cpumask_var_t cpumask; int next_cpu; int next_cpu_batch; @@ -38,10 +32,10 @@ struct blk_mq_hw_ctx { void *driver_data; - struct blk_mq_ctxmap ctx_map; + struct sbitmap ctx_map; - unsigned int nr_ctx; struct blk_mq_ctx **ctxs; + unsigned int nr_ctx; atomic_t wait_index; @@ -49,7 +43,7 @@ struct blk_mq_hw_ctx { unsigned long queued; unsigned long run; -#define BLK_MQ_MAX_DISPATCH_ORDER 10 +#define BLK_MQ_MAX_DISPATCH_ORDER 7 unsigned long dispatched[BLK_MQ_MAX_DISPATCH_ORDER]; unsigned int numa_node; @@ -57,9 +51,12 @@ struct blk_mq_hw_ctx { atomic_t nr_active; + struct delayed_work delay_work; + struct blk_mq_cpu_notifier cpu_notifier; struct kobject kobj; + unsigned long poll_considered; unsigned long poll_invoked; unsigned long poll_success; }; @@ -158,6 +155,7 @@ enum { BLK_MQ_F_TAG_SHARED = 1 << 1, BLK_MQ_F_SG_MERGE = 1 << 2, BLK_MQ_F_DEFER_ISSUE = 1 << 4, + BLK_MQ_F_BLOCKING = 1 << 5, BLK_MQ_F_ALLOC_POLICY_START_BIT = 8, BLK_MQ_F_ALLOC_POLICY_BITS = 1, @@ -178,8 +176,8 @@ enum { struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *); struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, struct request_queue *q); -int blk_mq_register_disk(struct gendisk *); -void blk_mq_unregister_disk(struct gendisk *); +int blk_mq_register_dev(struct device *, struct request_queue *); +void blk_mq_unregister_dev(struct device *, struct request_queue *); int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set); void blk_mq_free_tag_set(struct blk_mq_tag_set *set); @@ -221,7 +219,6 @@ static inline u16 blk_mq_unique_tag_to_tag(u32 unique_tag) } struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *, const int ctx_index); -struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_tag_set *, unsigned int, int); int blk_mq_request_started(struct request *rq); void blk_mq_start_request(struct request *rq); @@ -232,6 +229,7 @@ void blk_mq_requeue_request(struct request *rq); void blk_mq_add_to_requeue_list(struct request *rq, bool at_head); void blk_mq_cancel_requeue_work(struct request_queue *q); void blk_mq_kick_requeue_list(struct request_queue *q); +void blk_mq_delay_kick_requeue_list(struct request_queue *q, unsigned long msecs); void blk_mq_abort_requeue_list(struct request_queue *q); void blk_mq_complete_request(struct request *rq, int error); diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 436f43f87da9..cd395ecec99d 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -16,7 +16,6 @@ struct block_device; struct io_context; struct cgroup_subsys_state; typedef void (bio_end_io_t) (struct bio *); -typedef void (bio_destructor_t) (struct bio *); #ifdef CONFIG_BLOCK /* @@ -89,14 +88,22 @@ struct bio { struct bio_vec bi_inline_vecs[0]; }; -#define BIO_OP_SHIFT (8 * sizeof(unsigned int) - REQ_OP_BITS) +#define BIO_OP_SHIFT (8 * FIELD_SIZEOF(struct bio, bi_opf) - REQ_OP_BITS) +#define bio_flags(bio) ((bio)->bi_opf & ((1 << BIO_OP_SHIFT) - 1)) #define bio_op(bio) ((bio)->bi_opf >> BIO_OP_SHIFT) -#define bio_set_op_attrs(bio, op, op_flags) do { \ - WARN_ON(op >= (1 << REQ_OP_BITS)); \ - (bio)->bi_opf &= ((1 << BIO_OP_SHIFT) - 1); \ - (bio)->bi_opf |= ((unsigned int) (op) << BIO_OP_SHIFT); \ - (bio)->bi_opf |= op_flags; \ +#define bio_set_op_attrs(bio, op, op_flags) do { \ + if (__builtin_constant_p(op)) \ + BUILD_BUG_ON((op) + 0U >= (1U << REQ_OP_BITS)); \ + else \ + WARN_ON_ONCE((op) + 0U >= (1U << REQ_OP_BITS)); \ + if (__builtin_constant_p(op_flags)) \ + BUILD_BUG_ON((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ + else \ + WARN_ON_ONCE((op_flags) + 0U >= (1U << BIO_OP_SHIFT)); \ + (bio)->bi_opf = bio_flags(bio); \ + (bio)->bi_opf |= (((op) + 0U) << BIO_OP_SHIFT); \ + (bio)->bi_opf |= (op_flags); \ } while (0) #define BIO_RESET_BYTES offsetof(struct bio, bi_max_vecs) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e79055c8b577..c47c358ba052 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -449,7 +449,7 @@ struct request_queue { struct list_head requeue_list; spinlock_t requeue_lock; - struct work_struct requeue_work; + struct delayed_work requeue_work; struct mutex sysfs_lock; @@ -1440,8 +1440,8 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio) return bio_will_gap(req->q, bio, req->bio); } -struct work_struct; int kblockd_schedule_work(struct work_struct *work); +int kblockd_schedule_work_on(int cpu, struct work_struct *work); int kblockd_schedule_delayed_work(struct delayed_work *dwork, unsigned long delay); int kblockd_schedule_delayed_work_on(int cpu, struct delayed_work *dwork, unsigned long delay); diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index beb9ce1c2c23..8c1239020d79 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -7,7 +7,6 @@ /* * Gives us 8 prio classes with 13-bits of data for each class */ -#define IOPRIO_BITS (16) #define IOPRIO_CLASS_SHIFT (13) #define IOPRIO_PRIO_MASK ((1UL << IOPRIO_CLASS_SHIFT) - 1) diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index ba78b8306674..d190786e4ad8 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -352,7 +352,10 @@ struct nvm_dev { /* Backend device */ struct request_queue *q; + struct device dev; + struct device *parent_dev; char name[DISK_NAME_LEN]; + void *private_data; struct mutex mlock; spinlock_t lock; @@ -524,9 +527,9 @@ extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *, unsigned long); extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *); -extern int nvm_register(struct request_queue *, char *, - struct nvm_dev_ops *); -extern void nvm_unregister(char *); +extern struct nvm_dev *nvm_alloc_dev(int); +extern int nvm_register(struct nvm_dev *); +extern void nvm_unregister(struct nvm_dev *); void nvm_mark_blk(struct nvm_dev *dev, struct ppa_addr ppa, int type); @@ -575,11 +578,14 @@ extern int nvm_dev_factory(struct nvm_dev *, int flags); #else /* CONFIG_NVM */ struct nvm_dev_ops; -static inline int nvm_register(struct request_queue *q, char *disk_name, - struct nvm_dev_ops *ops) +static inline struct nvm_dev *nvm_alloc_dev(int node) +{ + return ERR_PTR(-EINVAL); +} +static inline int nvm_register(struct nvm_dev *dev) { return -EINVAL; } -static inline void nvm_unregister(char *disk_name) {} +static inline void nvm_unregister(struct nvm_dev *dev) {} #endif /* CONFIG_NVM */ #endif /* LIGHTNVM.H */ diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h new file mode 100644 index 000000000000..f017fd6e69c4 --- /dev/null +++ b/include/linux/sbitmap.h @@ -0,0 +1,373 @@ +/* + * Fast and scalable bitmaps. + * + * Copyright (C) 2016 Facebook + * Copyright (C) 2013-2014 Jens Axboe + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +#ifndef __LINUX_SCALE_BITMAP_H +#define __LINUX_SCALE_BITMAP_H + +#include <linux/kernel.h> +#include <linux/slab.h> + +/** + * struct sbitmap_word - Word in a &struct sbitmap. + */ +struct sbitmap_word { + /** + * @word: The bitmap word itself. + */ + unsigned long word; + + /** + * @depth: Number of bits being used in @word. + */ + unsigned long depth; +} ____cacheline_aligned_in_smp; + +/** + * struct sbitmap - Scalable bitmap. + * + * A &struct sbitmap is spread over multiple cachelines to avoid ping-pong. This + * trades off higher memory usage for better scalability. + */ +struct sbitmap { + /** + * @depth: Number of bits used in the whole bitmap. + */ + unsigned int depth; + + /** + * @shift: log2(number of bits used per word) + */ + unsigned int shift; + + /** + * @map_nr: Number of words (cachelines) being used for the bitmap. + */ + unsigned int map_nr; + + /** + * @map: Allocated bitmap. + */ + struct sbitmap_word *map; +}; + +#define SBQ_WAIT_QUEUES 8 +#define SBQ_WAKE_BATCH 8 + +/** + * struct sbq_wait_state - Wait queue in a &struct sbitmap_queue. + */ +struct sbq_wait_state { + /** + * @wait_cnt: Number of frees remaining before we wake up. + */ + atomic_t wait_cnt; + + /** + * @wait: Wait queue. + */ + wait_queue_head_t wait; +} ____cacheline_aligned_in_smp; + +/** + * struct sbitmap_queue - Scalable bitmap with the added ability to wait on free + * bits. + * + * A &struct sbitmap_queue uses multiple wait queues and rolling wakeups to + * avoid contention on the wait queue spinlock. This ensures that we don't hit a + * scalability wall when we run out of free bits and have to start putting tasks + * to sleep. + */ +struct sbitmap_queue { + /** + * @sb: Scalable bitmap. + */ + struct sbitmap sb; + + /* + * @alloc_hint: Cache of last successfully allocated or freed bit. + * + * This is per-cpu, which allows multiple users to stick to different + * cachelines until the map is exhausted. + */ + unsigned int __percpu *alloc_hint; + + /** + * @wake_batch: Number of bits which must be freed before we wake up any + * waiters. + */ + unsigned int wake_batch; + + /** + * @wake_index: Next wait queue in @ws to wake up. + */ + atomic_t wake_index; + + /** + * @ws: Wait queues. + */ + struct sbq_wait_state *ws; + + /** + * @round_robin: Allocate bits in strict round-robin order. + */ + bool round_robin; +}; + +/** + * sbitmap_init_node() - Initialize a &struct sbitmap on a specific memory node. + * @sb: Bitmap to initialize. + * @depth: Number of bits to allocate. + * @shift: Use 2^@shift bits per word in the bitmap; if a negative number if + * given, a good default is chosen. + * @flags: Allocation flags. + * @node: Memory node to allocate on. + * + * Return: Zero on success or negative errno on failure. + */ +int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, + gfp_t flags, int node); + +/** + * sbitmap_free() - Free memory used by a &struct sbitmap. + * @sb: Bitmap to free. + */ +static inline void sbitmap_free(struct sbitmap *sb) +{ + kfree(sb->map); + sb->map = NULL; +} + +/** + * sbitmap_resize() - Resize a &struct sbitmap. + * @sb: Bitmap to resize. + * @depth: New number of bits to resize to. + * + * Doesn't reallocate anything. It's up to the caller to ensure that the new + * depth doesn't exceed the depth that the sb was initialized with. + */ +void sbitmap_resize(struct sbitmap *sb, unsigned int depth); + +/** + * sbitmap_get() - Try to allocate a free bit from a &struct sbitmap. + * @sb: Bitmap to allocate from. + * @alloc_hint: Hint for where to start searching for a free bit. + * @round_robin: If true, be stricter about allocation order; always allocate + * starting from the last allocated bit. This is less efficient + * than the default behavior (false). + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin); + +/** + * sbitmap_any_bit_set() - Check for a set bit in a &struct sbitmap. + * @sb: Bitmap to check. + * + * Return: true if any bit in the bitmap is set, false otherwise. + */ +bool sbitmap_any_bit_set(const struct sbitmap *sb); + +/** + * sbitmap_any_bit_clear() - Check for an unset bit in a &struct + * sbitmap. + * @sb: Bitmap to check. + * + * Return: true if any bit in the bitmap is clear, false otherwise. + */ +bool sbitmap_any_bit_clear(const struct sbitmap *sb); + +typedef bool (*sb_for_each_fn)(struct sbitmap *, unsigned int, void *); + +/** + * sbitmap_for_each_set() - Iterate over each set bit in a &struct sbitmap. + * @sb: Bitmap to iterate over. + * @fn: Callback. Should return true to continue or false to break early. + * @data: Pointer to pass to callback. + * + * This is inline even though it's non-trivial so that the function calls to the + * callback will hopefully get optimized away. + */ +static inline void sbitmap_for_each_set(struct sbitmap *sb, sb_for_each_fn fn, + void *data) +{ + unsigned int i; + + for (i = 0; i < sb->map_nr; i++) { + struct sbitmap_word *word = &sb->map[i]; + unsigned int off, nr; + + if (!word->word) + continue; + + nr = 0; + off = i << sb->shift; + while (1) { + nr = find_next_bit(&word->word, word->depth, nr); + if (nr >= word->depth) + break; + + if (!fn(sb, off + nr, data)) + return; + + nr++; + } + } +} + +#define SB_NR_TO_INDEX(sb, bitnr) ((bitnr) >> (sb)->shift) +#define SB_NR_TO_BIT(sb, bitnr) ((bitnr) & ((1U << (sb)->shift) - 1U)) + +static inline unsigned long *__sbitmap_word(struct sbitmap *sb, + unsigned int bitnr) +{ + return &sb->map[SB_NR_TO_INDEX(sb, bitnr)].word; +} + +/* Helpers equivalent to the operations in asm/bitops.h and linux/bitmap.h */ + +static inline void sbitmap_set_bit(struct sbitmap *sb, unsigned int bitnr) +{ + set_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); +} + +static inline void sbitmap_clear_bit(struct sbitmap *sb, unsigned int bitnr) +{ + clear_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); +} + +static inline int sbitmap_test_bit(struct sbitmap *sb, unsigned int bitnr) +{ + return test_bit(SB_NR_TO_BIT(sb, bitnr), __sbitmap_word(sb, bitnr)); +} + +unsigned int sbitmap_weight(const struct sbitmap *sb); + +/** + * sbitmap_queue_init_node() - Initialize a &struct sbitmap_queue on a specific + * memory node. + * @sbq: Bitmap queue to initialize. + * @depth: See sbitmap_init_node(). + * @shift: See sbitmap_init_node(). + * @round_robin: See sbitmap_get(). + * @flags: Allocation flags. + * @node: Memory node to allocate on. + * + * Return: Zero on success or negative errno on failure. + */ +int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, + int shift, bool round_robin, gfp_t flags, int node); + +/** + * sbitmap_queue_free() - Free memory used by a &struct sbitmap_queue. + * + * @sbq: Bitmap queue to free. + */ +static inline void sbitmap_queue_free(struct sbitmap_queue *sbq) +{ + kfree(sbq->ws); + free_percpu(sbq->alloc_hint); + sbitmap_free(&sbq->sb); +} + +/** + * sbitmap_queue_resize() - Resize a &struct sbitmap_queue. + * @sbq: Bitmap queue to resize. + * @depth: New number of bits to resize to. + * + * Like sbitmap_resize(), this doesn't reallocate anything. It has to do + * some extra work on the &struct sbitmap_queue, so it's not safe to just + * resize the underlying &struct sbitmap. + */ +void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth); + +/** + * __sbitmap_queue_get() - Try to allocate a free bit from a &struct + * sbitmap_queue with preemption already disabled. + * @sbq: Bitmap queue to allocate from. + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +int __sbitmap_queue_get(struct sbitmap_queue *sbq); + +/** + * sbitmap_queue_get() - Try to allocate a free bit from a &struct + * sbitmap_queue. + * @sbq: Bitmap queue to allocate from. + * @cpu: Output parameter; will contain the CPU we ran on (e.g., to be passed to + * sbitmap_queue_clear()). + * + * Return: Non-negative allocated bit number if successful, -1 otherwise. + */ +static inline int sbitmap_queue_get(struct sbitmap_queue *sbq, + unsigned int *cpu) +{ + int nr; + + *cpu = get_cpu(); + nr = __sbitmap_queue_get(sbq); + put_cpu(); + return nr; +} + +/** + * sbitmap_queue_clear() - Free an allocated bit and wake up waiters on a + * &struct sbitmap_queue. + * @sbq: Bitmap to free from. + * @nr: Bit number to free. + * @cpu: CPU the bit was allocated on. + */ +void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, + unsigned int cpu); + +static inline int sbq_index_inc(int index) +{ + return (index + 1) & (SBQ_WAIT_QUEUES - 1); +} + +static inline void sbq_index_atomic_inc(atomic_t *index) +{ + int old = atomic_read(index); + int new = sbq_index_inc(old); + atomic_cmpxchg(index, old, new); +} + +/** + * sbq_wait_ptr() - Get the next wait queue to use for a &struct + * sbitmap_queue. + * @sbq: Bitmap queue to wait on. + * @wait_index: A counter per "user" of @sbq. + */ +static inline struct sbq_wait_state *sbq_wait_ptr(struct sbitmap_queue *sbq, + atomic_t *wait_index) +{ + struct sbq_wait_state *ws; + + ws = &sbq->ws[atomic_read(wait_index)]; + sbq_index_atomic_inc(wait_index); + return ws; +} + +/** + * sbitmap_queue_wake_all() - Wake up everything waiting on a &struct + * sbitmap_queue. + * @sbq: Bitmap queue to wake up. + */ +void sbitmap_queue_wake_all(struct sbitmap_queue *sbq); + +#endif /* __LINUX_SCALE_BITMAP_H */ diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 26cc1df280d6..fc6e22186405 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -442,6 +442,7 @@ extern int schedule_on_each_cpu(work_func_t func); int execute_in_process_context(work_func_t fn, struct execute_work *); extern bool flush_work(struct work_struct *work); +extern bool cancel_work(struct work_struct *work); extern bool cancel_work_sync(struct work_struct *work); extern bool flush_delayed_work(struct delayed_work *dwork); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ef071ca73fc3..bd81f0390277 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2974,6 +2974,31 @@ bool flush_delayed_work(struct delayed_work *dwork) } EXPORT_SYMBOL(flush_delayed_work); +static bool __cancel_work(struct work_struct *work, bool is_dwork) +{ + unsigned long flags; + int ret; + + do { + ret = try_to_grab_pending(work, is_dwork, &flags); + } while (unlikely(ret == -EAGAIN)); + + if (unlikely(ret < 0)) + return false; + + set_work_pool_and_clear_pending(work, get_work_pool_id(work)); + local_irq_restore(flags); + return ret; +} + +/* + * See cancel_delayed_work() + */ +bool cancel_work(struct work_struct *work) +{ + return __cancel_work(work, false); +} + /** * cancel_delayed_work - cancel a delayed work * @dwork: delayed_work to cancel @@ -2992,20 +3017,7 @@ EXPORT_SYMBOL(flush_delayed_work); */ bool cancel_delayed_work(struct delayed_work *dwork) { - unsigned long flags; - int ret; - - do { - ret = try_to_grab_pending(&dwork->work, true, &flags); - } while (unlikely(ret == -EAGAIN)); - - if (unlikely(ret < 0)) - return false; - - set_work_pool_and_clear_pending(&dwork->work, - get_work_pool_id(&dwork->work)); - local_irq_restore(flags); - return ret; + return __cancel_work(&dwork->work, true); } EXPORT_SYMBOL(cancel_delayed_work); diff --git a/lib/Kconfig b/lib/Kconfig index d79909dc01ec..942fb8091a86 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -550,4 +550,7 @@ config STACKDEPOT bool select STACKTRACE +config SBITMAP + bool + endmenu diff --git a/lib/Makefile b/lib/Makefile index df747e5eeb7a..f3ca8c0ab634 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -227,3 +227,5 @@ obj-$(CONFIG_UCS2_STRING) += ucs2_string.o obj-$(CONFIG_UBSAN) += ubsan.o UBSAN_SANITIZE_ubsan.o := n + +obj-$(CONFIG_SBITMAP) += sbitmap.o diff --git a/lib/sbitmap.c b/lib/sbitmap.c new file mode 100644 index 000000000000..2cecf05c82fd --- /dev/null +++ b/lib/sbitmap.c @@ -0,0 +1,347 @@ +/* + * Copyright (C) 2016 Facebook + * Copyright (C) 2013-2014 Jens Axboe + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <https://www.gnu.org/licenses/>. + */ + +#include <linux/random.h> +#include <linux/sbitmap.h> + +int sbitmap_init_node(struct sbitmap *sb, unsigned int depth, int shift, + gfp_t flags, int node) +{ + unsigned int bits_per_word; + unsigned int i; + + if (shift < 0) { + shift = ilog2(BITS_PER_LONG); + /* + * If the bitmap is small, shrink the number of bits per word so + * we spread over a few cachelines, at least. If less than 4 + * bits, just forget about it, it's not going to work optimally + * anyway. + */ + if (depth >= 4) { + while ((4U << shift) > depth) + shift--; + } + } + bits_per_word = 1U << shift; + if (bits_per_word > BITS_PER_LONG) + return -EINVAL; + + sb->shift = shift; + sb->depth = depth; + sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); + + if (depth == 0) { + sb->map = NULL; + return 0; + } + + sb->map = kzalloc_node(sb->map_nr * sizeof(*sb->map), flags, node); + if (!sb->map) + return -ENOMEM; + + for (i = 0; i < sb->map_nr; i++) { + sb->map[i].depth = min(depth, bits_per_word); + depth -= sb->map[i].depth; + } + return 0; +} +EXPORT_SYMBOL_GPL(sbitmap_init_node); + +void sbitmap_resize(struct sbitmap *sb, unsigned int depth) +{ + unsigned int bits_per_word = 1U << sb->shift; + unsigned int i; + + sb->depth = depth; + sb->map_nr = DIV_ROUND_UP(sb->depth, bits_per_word); + + for (i = 0; i < sb->map_nr; i++) { + sb->map[i].depth = min(depth, bits_per_word); + depth -= sb->map[i].depth; + } +} +EXPORT_SYMBOL_GPL(sbitmap_resize); + +static int __sbitmap_get_word(struct sbitmap_word *word, unsigned int hint, + bool wrap) +{ + unsigned int orig_hint = hint; + int nr; + + while (1) { + nr = find_next_zero_bit(&word->word, word->depth, hint); + if (unlikely(nr >= word->depth)) { + /* + * We started with an offset, and we didn't reset the + * offset to 0 in a failure case, so start from 0 to + * exhaust the map. + */ + if (orig_hint && hint && wrap) { + hint = orig_hint = 0; + continue; + } + return -1; + } + + if (!test_and_set_bit(nr, &word->word)) + break; + + hint = nr + 1; + if (hint >= word->depth - 1) + hint = 0; + } + + return nr; +} + +int sbitmap_get(struct sbitmap *sb, unsigned int alloc_hint, bool round_robin) +{ + unsigned int i, index; + int nr = -1; + + index = SB_NR_TO_INDEX(sb, alloc_hint); + + for (i = 0; i < sb->map_nr; i++) { + nr = __sbitmap_get_word(&sb->map[index], + SB_NR_TO_BIT(sb, alloc_hint), + !round_robin); + if (nr != -1) { + nr += index << sb->shift; + break; + } + + /* Jump to next index. */ + index++; + alloc_hint = index << sb->shift; + + if (index >= sb->map_nr) { + index = 0; + alloc_hint = 0; + } + } + + return nr; +} +EXPORT_SYMBOL_GPL(sbitmap_get); + +bool sbitmap_any_bit_set(const struct sbitmap *sb) +{ + unsigned int i; + + for (i = 0; i < sb->map_nr; i++) { + if (sb->map[i].word) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(sbitmap_any_bit_set); + +bool sbitmap_any_bit_clear(const struct sbitmap *sb) +{ + unsigned int i; + + for (i = 0; i < sb->map_nr; i++) { + const struct sbitmap_word *word = &sb->map[i]; + unsigned long ret; + + ret = find_first_zero_bit(&word->word, word->depth); + if (ret < word->depth) + return true; + } + return false; +} +EXPORT_SYMBOL_GPL(sbitmap_any_bit_clear); + +unsigned int sbitmap_weight(const struct sbitmap *sb) +{ + unsigned int i, weight = 0; + + for (i = 0; i < sb->map_nr; i++) { + const struct sbitmap_word *word = &sb->map[i]; + + weight += bitmap_weight(&word->word, word->depth); + } + return weight; +} +EXPORT_SYMBOL_GPL(sbitmap_weight); + +static unsigned int sbq_calc_wake_batch(unsigned int depth) +{ + unsigned int wake_batch; + + /* + * For each batch, we wake up one queue. We need to make sure that our + * batch size is small enough that the full depth of the bitmap is + * enough to wake up all of the queues. + */ + wake_batch = SBQ_WAKE_BATCH; + if (wake_batch > depth / SBQ_WAIT_QUEUES) + wake_batch = max(1U, depth / SBQ_WAIT_QUEUES); + + return wake_batch; +} + +int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth, + int shift, bool round_robin, gfp_t flags, int node) +{ + int ret; + int i; + + ret = sbitmap_init_node(&sbq->sb, depth, shift, flags, node); + if (ret) + return ret; + + sbq->alloc_hint = alloc_percpu_gfp(unsigned int, flags); + if (!sbq->alloc_hint) { + sbitmap_free(&sbq->sb); + return -ENOMEM; + } + + if (depth && !round_robin) { + for_each_possible_cpu(i) + *per_cpu_ptr(sbq->alloc_hint, i) = prandom_u32() % depth; + } + + sbq->wake_batch = sbq_calc_wake_batch(depth); + atomic_set(&sbq->wake_index, 0); + + sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node); + if (!sbq->ws) { + free_percpu(sbq->alloc_hint); + sbitmap_free(&sbq->sb); + return -ENOMEM; + } + + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + init_waitqueue_head(&sbq->ws[i].wait); + atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch); + } + + sbq->round_robin = round_robin; + return 0; +} +EXPORT_SYMBOL_GPL(sbitmap_queue_init_node); + +void sbitmap_queue_resize(struct sbitmap_queue *sbq, unsigned int depth) +{ + sbq->wake_batch = sbq_calc_wake_batch(depth); + sbitmap_resize(&sbq->sb, depth); +} +EXPORT_SYMBOL_GPL(sbitmap_queue_resize); + +int __sbitmap_queue_get(struct sbitmap_queue *sbq) +{ + unsigned int hint, depth; + int nr; + + hint = this_cpu_read(*sbq->alloc_hint); + depth = READ_ONCE(sbq->sb.depth); + if (unlikely(hint >= depth)) { + hint = depth ? prandom_u32() % depth : 0; + this_cpu_write(*sbq->alloc_hint, hint); + } + nr = sbitmap_get(&sbq->sb, hint, sbq->round_robin); + + if (nr == -1) { + /* If the map is full, a hint won't do us much good. */ + this_cpu_write(*sbq->alloc_hint, 0); + } else if (nr == hint || unlikely(sbq->round_robin)) { + /* Only update the hint if we used it. */ + hint = nr + 1; + if (hint >= depth - 1) + hint = 0; + this_cpu_write(*sbq->alloc_hint, hint); + } + + return nr; +} +EXPORT_SYMBOL_GPL(__sbitmap_queue_get); + +static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq) +{ + int i, wake_index; + + wake_index = atomic_read(&sbq->wake_index); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + struct sbq_wait_state *ws = &sbq->ws[wake_index]; + + if (waitqueue_active(&ws->wait)) { + int o = atomic_read(&sbq->wake_index); + + if (wake_index != o) + atomic_cmpxchg(&sbq->wake_index, o, wake_index); + return ws; + } + + wake_index = sbq_index_inc(wake_index); + } + + return NULL; +} + +static void sbq_wake_up(struct sbitmap_queue *sbq) +{ + struct sbq_wait_state *ws; + int wait_cnt; + + /* Ensure that the wait list checks occur after clear_bit(). */ + smp_mb(); + + ws = sbq_wake_ptr(sbq); + if (!ws) + return; + + wait_cnt = atomic_dec_return(&ws->wait_cnt); + if (unlikely(wait_cnt < 0)) + wait_cnt = atomic_inc_return(&ws->wait_cnt); + if (wait_cnt == 0) { + atomic_add(sbq->wake_batch, &ws->wait_cnt); + sbq_index_atomic_inc(&sbq->wake_index); + wake_up(&ws->wait); + } +} + +void sbitmap_queue_clear(struct sbitmap_queue *sbq, unsigned int nr, + unsigned int cpu) +{ + sbitmap_clear_bit(&sbq->sb, nr); + sbq_wake_up(sbq); + if (likely(!sbq->round_robin && nr < sbq->sb.depth)) + *per_cpu_ptr(sbq->alloc_hint, cpu) = nr; +} +EXPORT_SYMBOL_GPL(sbitmap_queue_clear); + +void sbitmap_queue_wake_all(struct sbitmap_queue *sbq) +{ + int i, wake_index; + + /* + * Make sure all changes prior to this are visible from other CPUs. + */ + smp_mb(); + wake_index = atomic_read(&sbq->wake_index); + for (i = 0; i < SBQ_WAIT_QUEUES; i++) { + struct sbq_wait_state *ws = &sbq->ws[wake_index]; + + if (waitqueue_active(&ws->wait)) + wake_up(&ws->wait); + + wake_index = sbq_index_inc(wake_index); + } +} +EXPORT_SYMBOL_GPL(sbitmap_queue_wake_all); |