diff options
author | Jens Axboe <axboe@fb.com> | 2016-01-13 16:20:36 +0100 |
---|---|---|
committer | Jens Axboe <axboe@fb.com> | 2016-01-13 16:20:36 +0100 |
commit | 038a75afc54d4b4dc9794213bb16e88c1a31a752 (patch) | |
tree | b092390b1fa88956571577e8ed9edb391383c37d /drivers/block/xen-blkfront.c | |
parent | mtip32xx: restrict variables visible in current code module (diff) | |
parent | xen/blkfront: Fix crash if backend doesn't follow the right states. (diff) | |
download | linux-038a75afc54d4b4dc9794213bb16e88c1a31a752.tar.xz linux-038a75afc54d4b4dc9794213bb16e88c1a31a752.zip |
Merge branch 'stable/for-jens-4.5' of git://git.kernel.org/pub/scm/linux/kernel/git/konrad/xen into for-4.5/drivers
Konrad writes:
The pull is based on converting the backend driver into an multiqueue
driver and exposing more than one queue to the frontend. As such we had
to modify the frontend and also fix a bunch of bugs around this.
The original work is based on Arianna Avanzini's work as an OPW intern.
Bob took over the work and had been massaging it for quite some time.
Also included are are features to 64KB page support for ARM and various
bug-fixes.
Diffstat (limited to 'drivers/block/xen-blkfront.c')
-rw-r--r-- | drivers/block/xen-blkfront.c | 1061 |
1 files changed, 715 insertions, 346 deletions
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2fee2eef988d..8a8dc91c39f7 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -60,6 +60,20 @@ #include <asm/xen/hypervisor.h> +/* + * The minimal size of segment supported by the block framework is PAGE_SIZE. + * When Linux is using a different page size than Xen, it may not be possible + * to put all the data in a single segment. + * This can happen when the backend doesn't support indirect descriptor and + * therefore the maximum amount of data that a request can carry is + * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB + * + * Note that we only support one extra request. So the Linux page size + * should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) = + * 88KB. + */ +#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE) + enum blkif_state { BLKIF_STATE_DISCONNECTED, BLKIF_STATE_CONNECTED, @@ -72,6 +86,13 @@ struct grant { struct list_head node; }; +enum blk_req_status { + REQ_WAITING, + REQ_DONE, + REQ_ERROR, + REQ_EOPNOTSUPP, +}; + struct blk_shadow { struct blkif_request req; struct request *request; @@ -79,6 +100,14 @@ struct blk_shadow { struct grant **indirect_grants; struct scatterlist *sg; unsigned int num_sg; + enum blk_req_status status; + + #define NO_ASSOCIATED_ID ~0UL + /* + * Id of the sibling if we ever need 2 requests when handling a + * block I/O request + */ + unsigned long associated_id; }; struct split_bio { @@ -99,6 +128,10 @@ static unsigned int xen_blkif_max_segments = 32; module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); +static unsigned int xen_blkif_max_queues = 4; +module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO); +MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk"); + /* * Maximum order of pages to be used for the shared ring between front and * backend, 4KB page granularity is used. @@ -114,10 +147,35 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS) /* - * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 - * characters are enough. Define to 20 to keep consist with backend. + * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 + * characters are enough. Define to 20 to keep consistent with backend. */ #define RINGREF_NAME_LEN (20) +/* + * queue-%u would take 7 + 10(UINT_MAX) = 17 characters. + */ +#define QUEUE_NAME_LEN (17) + +/* + * Per-ring info. + * Every blkfront device can associate with one or more blkfront_ring_info, + * depending on how many hardware queues/rings to be used. + */ +struct blkfront_ring_info { + /* Lock to protect data in every ring buffer. */ + spinlock_t ring_lock; + struct blkif_front_ring ring; + unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; + unsigned int evtchn, irq; + struct work_struct work; + struct gnttab_free_callback callback; + struct blk_shadow shadow[BLK_MAX_RING_SIZE]; + struct list_head indirect_pages; + struct list_head grants; + unsigned int persistent_gnts_c; + unsigned long shadow_free; + struct blkfront_info *dev_info; +}; /* * We have one of these per vbd, whether ide, scsi or 'other'. They @@ -126,25 +184,15 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the */ struct blkfront_info { - spinlock_t io_lock; struct mutex mutex; struct xenbus_device *xbdev; struct gendisk *gd; int vdevice; blkif_vdev_t handle; enum blkif_state connected; - int ring_ref[XENBUS_MAX_RING_GRANTS]; + /* Number of pages per ring buffer. */ unsigned int nr_ring_pages; - struct blkif_front_ring ring; - unsigned int evtchn, irq; struct request_queue *rq; - struct work_struct work; - struct gnttab_free_callback callback; - struct blk_shadow shadow[BLK_MAX_RING_SIZE]; - struct list_head grants; - struct list_head indirect_pages; - unsigned int persistent_gnts_c; - unsigned long shadow_free; unsigned int feature_flush; unsigned int feature_discard:1; unsigned int feature_secdiscard:1; @@ -155,6 +203,8 @@ struct blkfront_info unsigned int max_indirect_segments; int is_ready; struct blk_mq_tag_set tag_set; + struct blkfront_ring_info *rinfo; + unsigned int nr_rings; }; static unsigned int nr_minors; @@ -198,38 +248,40 @@ static DEFINE_SPINLOCK(minor_lock); #define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG) -static int blkfront_setup_indirect(struct blkfront_info *info); -static int blkfront_gather_backend_features(struct blkfront_info *info); +static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); +static void blkfront_gather_backend_features(struct blkfront_info *info); -static int get_id_from_freelist(struct blkfront_info *info) +static int get_id_from_freelist(struct blkfront_ring_info *rinfo) { - unsigned long free = info->shadow_free; - BUG_ON(free >= BLK_RING_SIZE(info)); - info->shadow_free = info->shadow[free].req.u.rw.id; - info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ + unsigned long free = rinfo->shadow_free; + + BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info)); + rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id; + rinfo->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ return free; } -static int add_id_to_freelist(struct blkfront_info *info, - unsigned long id) +static int add_id_to_freelist(struct blkfront_ring_info *rinfo, + unsigned long id) { - if (info->shadow[id].req.u.rw.id != id) + if (rinfo->shadow[id].req.u.rw.id != id) return -EINVAL; - if (info->shadow[id].request == NULL) + if (rinfo->shadow[id].request == NULL) return -EINVAL; - info->shadow[id].req.u.rw.id = info->shadow_free; - info->shadow[id].request = NULL; - info->shadow_free = id; + rinfo->shadow[id].req.u.rw.id = rinfo->shadow_free; + rinfo->shadow[id].request = NULL; + rinfo->shadow_free = id; return 0; } -static int fill_grant_buffer(struct blkfront_info *info, int num) +static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) { + struct blkfront_info *info = rinfo->dev_info; struct page *granted_page; struct grant *gnt_list_entry, *n; int i = 0; - while(i < num) { + while (i < num) { gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO); if (!gnt_list_entry) goto out_of_memory; @@ -244,7 +296,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) } gnt_list_entry->gref = GRANT_INVALID_REF; - list_add(&gnt_list_entry->node, &info->grants); + list_add(&gnt_list_entry->node, &rinfo->grants); i++; } @@ -252,7 +304,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) out_of_memory: list_for_each_entry_safe(gnt_list_entry, n, - &info->grants, node) { + &rinfo->grants, node) { list_del(&gnt_list_entry->node); if (info->feature_persistent) __free_page(gnt_list_entry->page); @@ -263,17 +315,17 @@ out_of_memory: return -ENOMEM; } -static struct grant *get_free_grant(struct blkfront_info *info) +static struct grant *get_free_grant(struct blkfront_ring_info *rinfo) { struct grant *gnt_list_entry; - BUG_ON(list_empty(&info->grants)); - gnt_list_entry = list_first_entry(&info->grants, struct grant, + BUG_ON(list_empty(&rinfo->grants)); + gnt_list_entry = list_first_entry(&rinfo->grants, struct grant, node); list_del(&gnt_list_entry->node); if (gnt_list_entry->gref != GRANT_INVALID_REF) - info->persistent_gnts_c--; + rinfo->persistent_gnts_c--; return gnt_list_entry; } @@ -289,9 +341,10 @@ static inline void grant_foreign_access(const struct grant *gnt_list_entry, static struct grant *get_grant(grant_ref_t *gref_head, unsigned long gfn, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { - struct grant *gnt_list_entry = get_free_grant(info); + struct grant *gnt_list_entry = get_free_grant(rinfo); + struct blkfront_info *info = rinfo->dev_info; if (gnt_list_entry->gref != GRANT_INVALID_REF) return gnt_list_entry; @@ -312,9 +365,10 @@ static struct grant *get_grant(grant_ref_t *gref_head, } static struct grant *get_indirect_grant(grant_ref_t *gref_head, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { - struct grant *gnt_list_entry = get_free_grant(info); + struct grant *gnt_list_entry = get_free_grant(rinfo); + struct blkfront_info *info = rinfo->dev_info; if (gnt_list_entry->gref != GRANT_INVALID_REF) return gnt_list_entry; @@ -326,8 +380,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head, struct page *indirect_page; /* Fetch a pre-allocated page to use for indirect grefs */ - BUG_ON(list_empty(&info->indirect_pages)); - indirect_page = list_first_entry(&info->indirect_pages, + BUG_ON(list_empty(&rinfo->indirect_pages)); + indirect_page = list_first_entry(&rinfo->indirect_pages, struct page, lru); list_del(&indirect_page->lru); gnt_list_entry->page = indirect_page; @@ -403,8 +457,8 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr) static void blkif_restart_queue_callback(void *arg) { - struct blkfront_info *info = (struct blkfront_info *)arg; - schedule_work(&info->work); + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg; + schedule_work(&rinfo->work); } static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) @@ -456,16 +510,33 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, return 0; } -static int blkif_queue_discard_req(struct request *req) +static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo, + struct request *req, + struct blkif_request **ring_req) { - struct blkfront_info *info = req->rq_disk->private_data; + unsigned long id; + + *ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt); + rinfo->ring.req_prod_pvt++; + + id = get_id_from_freelist(rinfo); + rinfo->shadow[id].request = req; + rinfo->shadow[id].status = REQ_WAITING; + rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID; + + (*ring_req)->u.rw.id = id; + + return id; +} + +static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo) +{ + struct blkfront_info *info = rinfo->dev_info; struct blkif_request *ring_req; unsigned long id; /* Fill out a communications ring structure. */ - ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); - id = get_id_from_freelist(info); - info->shadow[id].request = req; + id = blkif_ring_get_request(rinfo, req, &ring_req); ring_req->operation = BLKIF_OP_DISCARD; ring_req->u.discard.nr_sectors = blk_rq_sectors(req); @@ -476,10 +547,8 @@ static int blkif_queue_discard_req(struct request *req) else ring_req->u.discard.flag = 0; - info->ring.req_prod_pvt++; - /* Keep a private copy so we can reissue requests when recovering. */ - info->shadow[id].req = *ring_req; + rinfo->shadow[id].req = *ring_req; return 0; } @@ -487,7 +556,7 @@ static int blkif_queue_discard_req(struct request *req) struct setup_rw_req { unsigned int grant_idx; struct blkif_request_segment *segments; - struct blkfront_info *info; + struct blkfront_ring_info *rinfo; struct blkif_request *ring_req; grant_ref_t gref_head; unsigned int id; @@ -495,6 +564,9 @@ struct setup_rw_req { bool need_copy; unsigned int bvec_off; char *bvec_data; + + bool require_extra_req; + struct blkif_request *extra_ring_req; }; static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, @@ -507,8 +579,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, /* Convenient aliases */ unsigned int grant_idx = setup->grant_idx; struct blkif_request *ring_req = setup->ring_req; - struct blkfront_info *info = setup->info; - struct blk_shadow *shadow = &info->shadow[setup->id]; + struct blkfront_ring_info *rinfo = setup->rinfo; + /* + * We always use the shadow of the first request to store the list + * of grant associated to the block I/O request. This made the + * completion more easy to handle even if the block I/O request is + * split. + */ + struct blk_shadow *shadow = &rinfo->shadow[setup->id]; + + if (unlikely(setup->require_extra_req && + grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + /* + * We are using the second request, setup grant_idx + * to be the index of the segment array. + */ + grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST; + ring_req = setup->extra_ring_req; + } if ((ring_req->operation == BLKIF_OP_INDIRECT) && (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { @@ -516,15 +604,19 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, kunmap_atomic(setup->segments); n = grant_idx / GRANTS_PER_INDIRECT_FRAME; - gnt_list_entry = get_indirect_grant(&setup->gref_head, info); + gnt_list_entry = get_indirect_grant(&setup->gref_head, rinfo); shadow->indirect_grants[n] = gnt_list_entry; setup->segments = kmap_atomic(gnt_list_entry->page); ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; } - gnt_list_entry = get_grant(&setup->gref_head, gfn, info); + gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo); ref = gnt_list_entry->gref; - shadow->grants_used[grant_idx] = gnt_list_entry; + /* + * All the grants are stored in the shadow of the first + * request. Therefore we have to use the global index. + */ + shadow->grants_used[setup->grant_idx] = gnt_list_entry; if (setup->need_copy) { void *shared_data; @@ -566,16 +658,36 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, (setup->grant_idx)++; } -static int blkif_queue_rw_req(struct request *req) +static void blkif_setup_extra_req(struct blkif_request *first, + struct blkif_request *second) { - struct blkfront_info *info = req->rq_disk->private_data; - struct blkif_request *ring_req; - unsigned long id; + uint16_t nr_segments = first->u.rw.nr_segments; + + /* + * The second request is only present when the first request uses + * all its segments. It's always the continuity of the first one. + */ + first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; + + second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST; + second->u.rw.sector_number = first->u.rw.sector_number + + (BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512; + + second->u.rw.handle = first->u.rw.handle; + second->operation = first->operation; +} + +static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo) +{ + struct blkfront_info *info = rinfo->dev_info; + struct blkif_request *ring_req, *extra_ring_req = NULL; + unsigned long id, extra_id = NO_ASSOCIATED_ID; + bool require_extra_req = false; int i; struct setup_rw_req setup = { .grant_idx = 0, .segments = NULL, - .info = info, + .rinfo = rinfo, .need_copy = rq_data_dir(req) && info->feature_persistent, }; @@ -584,7 +696,6 @@ static int blkif_queue_rw_req(struct request *req) * existing persistent grants, or if we have to get new grants, * as there are not sufficiently many free. */ - bool new_persistent_gnts; struct scatterlist *sg; int num_sg, max_grefs, num_grant; @@ -596,41 +707,36 @@ static int blkif_queue_rw_req(struct request *req) */ max_grefs += INDIRECT_GREFS(max_grefs); - /* Check if we have enough grants to allocate a requests */ - if (info->persistent_gnts_c < max_grefs) { - new_persistent_gnts = 1; - if (gnttab_alloc_grant_references( - max_grefs - info->persistent_gnts_c, - &setup.gref_head) < 0) { + /* + * We have to reserve 'max_grefs' grants because persistent + * grants are shared by all rings. + */ + if (max_grefs > 0) + if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) { gnttab_request_free_callback( - &info->callback, + &rinfo->callback, blkif_restart_queue_callback, - info, + rinfo, max_grefs); return 1; } - } else - new_persistent_gnts = 0; /* Fill out a communications ring structure. */ - ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); - id = get_id_from_freelist(info); - info->shadow[id].request = req; - - BUG_ON(info->max_indirect_segments == 0 && - GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST); - BUG_ON(info->max_indirect_segments && - GREFS(req->nr_phys_segments) > info->max_indirect_segments); + id = blkif_ring_get_request(rinfo, req, &ring_req); - num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); + num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg); num_grant = 0; /* Calculate the number of grant used */ - for_each_sg(info->shadow[id].sg, sg, num_sg, i) + for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) num_grant += gnttab_count_grant(sg->offset, sg->length); - ring_req->u.rw.id = id; - info->shadow[id].num_sg = num_sg; - if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) { + require_extra_req = info->max_indirect_segments == 0 && + num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST; + BUG_ON(!HAS_EXTRA_REQ && require_extra_req); + + rinfo->shadow[id].num_sg = num_sg; + if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST && + likely(!require_extra_req)) { /* * The indirect operation can only be a BLKIF_OP_READ or * BLKIF_OP_WRITE @@ -670,11 +776,31 @@ static int blkif_queue_rw_req(struct request *req) } } ring_req->u.rw.nr_segments = num_grant; + if (unlikely(require_extra_req)) { + extra_id = blkif_ring_get_request(rinfo, req, + &extra_ring_req); + /* + * Only the first request contains the scatter-gather + * list. + */ + rinfo->shadow[extra_id].num_sg = 0; + + blkif_setup_extra_req(ring_req, extra_ring_req); + + /* Link the 2 requests together */ + rinfo->shadow[extra_id].associated_id = id; + rinfo->shadow[id].associated_id = extra_id; + } } setup.ring_req = ring_req; setup.id = id; - for_each_sg(info->shadow[id].sg, sg, num_sg, i) { + + setup.require_extra_req = require_extra_req; + if (unlikely(require_extra_req)) + setup.extra_ring_req = extra_ring_req; + + for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) { BUG_ON(sg->offset + sg->length > PAGE_SIZE); if (setup.need_copy) { @@ -694,12 +820,12 @@ static int blkif_queue_rw_req(struct request *req) if (setup.segments) kunmap_atomic(setup.segments); - info->ring.req_prod_pvt++; - /* Keep a private copy so we can reissue requests when recovering. */ - info->shadow[id].req = *ring_req; + rinfo->shadow[id].req = *ring_req; + if (unlikely(require_extra_req)) + rinfo->shadow[extra_id].req = *extra_ring_req; - if (new_persistent_gnts) + if (max_grefs > 0) gnttab_free_grant_references(setup.gref_head); return 0; @@ -711,27 +837,25 @@ static int blkif_queue_rw_req(struct request *req) * * @req: a request struct */ -static int blkif_queue_request(struct request *req) +static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo) { - struct blkfront_info *info = req->rq_disk->private_data; - - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) + if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED)) return 1; if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) - return blkif_queue_discard_req(req); + return blkif_queue_discard_req(req, rinfo); else - return blkif_queue_rw_req(req); + return blkif_queue_rw_req(req, rinfo); } -static inline void flush_requests(struct blkfront_info *info) +static inline void flush_requests(struct blkfront_ring_info *rinfo) { int notify; - RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify); if (notify) - notify_remote_via_irq(info->irq); + notify_remote_via_irq(rinfo->irq); } static inline bool blkif_request_flush_invalid(struct request *req, @@ -745,38 +869,50 @@ static inline bool blkif_request_flush_invalid(struct request *req, } static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *qd) + const struct blk_mq_queue_data *qd) { - struct blkfront_info *info = qd->rq->rq_disk->private_data; + unsigned long flags; + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data; blk_mq_start_request(qd->rq); - spin_lock_irq(&info->io_lock); - if (RING_FULL(&info->ring)) + spin_lock_irqsave(&rinfo->ring_lock, flags); + if (RING_FULL(&rinfo->ring)) goto out_busy; - if (blkif_request_flush_invalid(qd->rq, info)) + if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info)) goto out_err; - if (blkif_queue_request(qd->rq)) + if (blkif_queue_request(qd->rq, rinfo)) goto out_busy; - flush_requests(info); - spin_unlock_irq(&info->io_lock); + flush_requests(rinfo); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_MQ_RQ_QUEUE_OK; out_err: - spin_unlock_irq(&info->io_lock); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_MQ_RQ_QUEUE_ERROR; out_busy: - spin_unlock_irq(&info->io_lock); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); blk_mq_stop_hw_queue(hctx); return BLK_MQ_RQ_QUEUE_BUSY; } +static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int index) +{ + struct blkfront_info *info = (struct blkfront_info *)data; + + BUG_ON(info->nr_rings <= index); + hctx->driver_data = &info->rinfo[index]; + return 0; +} + static struct blk_mq_ops blkfront_mq_ops = { .queue_rq = blkif_queue_rq, .map_queue = blk_mq_map_queue, + .init_hctx = blk_mq_init_hctx, }; static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, @@ -788,19 +924,28 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, memset(&info->tag_set, 0, sizeof(info->tag_set)); info->tag_set.ops = &blkfront_mq_ops; - info->tag_set.nr_hw_queues = 1; - info->tag_set.queue_depth = BLK_RING_SIZE(info); + info->tag_set.nr_hw_queues = info->nr_rings; + if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) { + /* + * When indirect descriptior is not supported, the I/O request + * will be split between multiple request in the ring. + * To avoid problems when sending the request, divide by + * 2 the depth of the queue. + */ + info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2; + } else + info->tag_set.queue_depth = BLK_RING_SIZE(info); info->tag_set.numa_node = NUMA_NO_NODE; info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; info->tag_set.cmd_size = 0; info->tag_set.driver_data = info; if (blk_mq_alloc_tag_set(&info->tag_set)) - return -1; + return -EINVAL; rq = blk_mq_init_queue(&info->tag_set); if (IS_ERR(rq)) { blk_mq_free_tag_set(&info->tag_set); - return -1; + return PTR_ERR(rq); } queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); @@ -1028,7 +1173,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, static void xlvbd_release_gendisk(struct blkfront_info *info) { - unsigned int minor, nr_minors; + unsigned int minor, nr_minors, i; if (info->rq == NULL) return; @@ -1036,11 +1181,15 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) /* No more blkif_request(). */ blk_mq_stop_hw_queues(info->rq); - /* No more gnttab callback work. */ - gnttab_cancel_free_callback(&info->callback); + for (i = 0; i < info->nr_rings; i++) { + struct blkfront_ring_info *rinfo = &info->rinfo[i]; - /* Flush gnttab callback work. Must be done with no locks held. */ - flush_work(&info->work); + /* No more gnttab callback work. */ + gnttab_cancel_free_callback(&rinfo->callback); + + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_work(&rinfo->work); + } del_gendisk(info->gd); @@ -1056,88 +1205,87 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) info->gd = NULL; } -/* Must be called with io_lock holded */ -static void kick_pending_request_queues(struct blkfront_info *info) +/* Already hold rinfo->ring_lock. */ +static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo) { - if (!RING_FULL(&info->ring)) - blk_mq_start_stopped_hw_queues(info->rq, true); + if (!RING_FULL(&rinfo->ring)) + blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true); } -static void blkif_restart_queue(struct work_struct *work) +static void kick_pending_request_queues(struct blkfront_ring_info *rinfo) { - struct blkfront_info *info = container_of(work, struct blkfront_info, work); + unsigned long flags; - spin_lock_irq(&info->io_lock); - if (info->connected == BLKIF_STATE_CONNECTED) - kick_pending_request_queues(info); - spin_unlock_irq(&info->io_lock); + spin_lock_irqsave(&rinfo->ring_lock, flags); + kick_pending_request_queues_locked(rinfo); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); } -static void blkif_free(struct blkfront_info *info, int suspend) +static void blkif_restart_queue(struct work_struct *work) { - struct grant *persistent_gnt; - struct grant *n; - int i, j, segs; + struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work); - /* Prevent new requests being issued until we fix things up. */ - spin_lock_irq(&info->io_lock); - info->connected = suspend ? - BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; - /* No more blkif_request(). */ - if (info->rq) - blk_mq_stop_hw_queues(info->rq); + if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED) + kick_pending_request_queues(rinfo); +} - /* Remove all persistent grants */ - if (!list_empty(&info->grants)) { - list_for_each_entry_safe(persistent_gnt, n, - &info->grants, node) { - list_del(&persistent_gnt->node); - if (persistent_gnt->gref != GRANT_INVALID_REF) { - gnttab_end_foreign_access(persistent_gnt->gref, - 0, 0UL); - info->persistent_gnts_c--; - } - if (info->feature_persistent) - __free_page(persistent_gnt->page); - kfree(persistent_gnt); - } - } - BUG_ON(info->persistent_gnts_c != 0); +static void blkif_free_ring(struct blkfront_ring_info *rinfo) +{ + struct grant *persistent_gnt, *n; + struct blkfront_info *info = rinfo->dev_info; + int i, j, segs; /* * Remove indirect pages, this only happens when using indirect * descriptors but not persistent grants */ - if (!list_empty(&info->indirect_pages)) { + if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n; BUG_ON(info->feature_persistent); - list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { list_del(&indirect_page->lru); __free_page(indirect_page); } } + /* Remove all persistent grants. */ + if (!list_empty(&rinfo->grants)) { + list_for_each_entry_safe(persistent_gnt, n, + &rinfo->grants, node) { + list_del(&persistent_gnt->node); + if (persistent_gnt->gref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(persistent_gnt->gref, + 0, 0UL); + rinfo->persistent_gnts_c--; + } + if (info->feature_persistent) + __free_page(persistent_gnt->page); + kfree(persistent_gnt); + } + } + BUG_ON(rinfo->persistent_gnts_c != 0); + for (i = 0; i < BLK_RING_SIZE(info); i++) { /* * Clear persistent grants present in requests already * on the shared ring */ - if (!info->shadow[i].request) + if (!rinfo->shadow[i].request) goto free_shadow; - segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? - info->shadow[i].req.u.indirect.nr_segments : - info->shadow[i].req.u.rw.nr_segments; + segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ? + rinfo->shadow[i].req.u.indirect.nr_segments : + rinfo->shadow[i].req.u.rw.nr_segments; for (j = 0; j < segs; j++) { - persistent_gnt = info->shadow[i].grants_used[j]; + persistent_gnt = rinfo->shadow[i].grants_used[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); if (info->feature_persistent) __free_page(persistent_gnt->page); kfree(persistent_gnt); } - if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) + if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT) /* * If this is not an indirect operation don't try to * free indirect segments @@ -1145,42 +1293,59 @@ static void blkif_free(struct blkfront_info *info, int suspend) goto free_shadow; for (j = 0; j < INDIRECT_GREFS(segs); j++) { - persistent_gnt = info->shadow[i].indirect_grants[j]; + persistent_gnt = rinfo->shadow[i].indirect_grants[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); __free_page(persistent_gnt->page); kfree(persistent_gnt); } free_shadow: - kfree(info->shadow[i].grants_used); - info->shadow[i].grants_used = NULL; - kfree(info->shadow[i].indirect_grants); - info->shadow[i].indirect_grants = NULL; - kfree(info->shadow[i].sg); - info->shadow[i].sg = NULL; + kfree(rinfo->shadow[i].grants_used); + rinfo->shadow[i].grants_used = NULL; + kfree(rinfo->shadow[i].indirect_grants); + rinfo->shadow[i].indirect_grants = NULL; + kfree(rinfo->shadow[i].sg); + rinfo->shadow[i].sg = NULL; } /* No more gnttab callback work. */ - gnttab_cancel_free_callback(&info->callback); - spin_unlock_irq(&info->io_lock); + gnttab_cancel_free_callback(&rinfo->callback); /* Flush gnttab callback work. Must be done with no locks held. */ - flush_work(&info->work); + flush_work(&rinfo->work); /* Free resources associated with old device channel. */ for (i = 0; i < info->nr_ring_pages; i++) { - if (info->ring_ref[i] != GRANT_INVALID_REF) { - gnttab_end_foreign_access(info->ring_ref[i], 0, 0); - info->ring_ref[i] = GRANT_INVALID_REF; + if (rinfo->ring_ref[i] != GRANT_INVALID_REF) { + gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0); + rinfo->ring_ref[i] = GRANT_INVALID_REF; } } - free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); - info->ring.sring = NULL; + free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); + rinfo->ring.sring = NULL; - if (info->irq) - unbind_from_irqhandler(info->irq, info); - info->evtchn = info->irq = 0; + if (rinfo->irq) + unbind_from_irqhandler(rinfo->irq, rinfo); + rinfo->evtchn = rinfo->irq = 0; +} +static void blkif_free(struct blkfront_info *info, int suspend) +{ + unsigned int i; + + /* Prevent new requests being issued until we fix things up. */ + info->connected = suspend ? + BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; + /* No more blkif_request(). */ + if (info->rq) + blk_mq_stop_hw_queues(info->rq); + + for (i = 0; i < info->nr_rings; i++) + blkif_free_ring(&info->rinfo[i]); + + kfree(info->rinfo); + info->rinfo = NULL; + info->nr_rings = 0; } struct copy_from_grant { @@ -1209,19 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset, kunmap_atomic(shared_data); } -static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, +static enum blk_req_status blkif_rsp_to_req_status(int rsp) +{ + switch (rsp) + { + case BLKIF_RSP_OKAY: + return REQ_DONE; + case BLKIF_RSP_EOPNOTSUPP: + return REQ_EOPNOTSUPP; + case BLKIF_RSP_ERROR: + /* Fallthrough. */ + default: + return REQ_ERROR; + } +} + +/* + * Get the final status of the block request based on two ring response + */ +static int blkif_get_final_status(enum blk_req_status s1, + enum blk_req_status s2) +{ + BUG_ON(s1 == REQ_WAITING); + BUG_ON(s2 == REQ_WAITING); + + if (s1 == REQ_ERROR || s2 == REQ_ERROR) + return BLKIF_RSP_ERROR; + else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP) + return BLKIF_RSP_EOPNOTSUPP; + return BLKIF_RSP_OKAY; +} + +static bool blkif_completion(unsigned long *id, + struct blkfront_ring_info *rinfo, struct blkif_response *bret) { int i = 0; struct scatterlist *sg; int num_sg, num_grant; + struct blkfront_info *info = rinfo->dev_info; + struct blk_shadow *s = &rinfo->shadow[*id]; struct copy_from_grant data = { - .s = s, .grant_idx = 0, }; num_grant = s->req.operation == BLKIF_OP_INDIRECT ? s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; + + /* The I/O request may be split in two. */ + if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) { + struct blk_shadow *s2 = &rinfo->shadow[s->associated_id]; + + /* Keep the status of the current response in shadow. */ + s->status = blkif_rsp_to_req_status(bret->status); + + /* Wait the second response if not yet here. */ + if (s2->status == REQ_WAITING) + return 0; + + bret->status = blkif_get_final_status(s->status, + s2->status); + + /* + * All the grants is stored in the first shadow in order + * to make the completion code simpler. + */ + num_grant += s2->req.u.rw.nr_segments; + + /* + * The two responses may not come in order. Only the + * first request will store the scatter-gather list. + */ + if (s2->num_sg != 0) { + /* Update "id" with the ID of the first response. */ + *id = s->associated_id; + s = s2; + } + + /* + * We don't need anymore the second request, so recycling + * it now. + */ + if (add_id_to_freelist(rinfo, s->associated_id)) + WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n", + info->gd->disk_name, s->associated_id); + } + + data.s = s; num_sg = s->num_sg; if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { @@ -1252,8 +1491,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", s->grants_used[i]->gref); - list_add(&s->grants_used[i]->node, &info->grants); - info->persistent_gnts_c++; + list_add(&s->grants_used[i]->node, &rinfo->grants); + rinfo->persistent_gnts_c++; } else { /* * If the grant is not mapped by the backend we end the @@ -1263,7 +1502,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, */ gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); s->grants_used[i]->gref = GRANT_INVALID_REF; - list_add_tail(&s->grants_used[i]->node, &info->grants); + list_add_tail(&s->grants_used[i]->node, &rinfo->grants); } } if (s->req.operation == BLKIF_OP_INDIRECT) { @@ -1272,8 +1511,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", s->indirect_grants[i]->gref); - list_add(&s->indirect_grants[i]->node, &info->grants); - info->persistent_gnts_c++; + list_add(&s->indirect_grants[i]->node, &rinfo->grants); + rinfo->persistent_gnts_c++; } else { struct page *indirect_page; @@ -1284,13 +1523,15 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, */ if (!info->feature_persistent) { indirect_page = s->indirect_grants[i]->page; - list_add(&indirect_page->lru, &info->indirect_pages); + list_add(&indirect_page->lru, &rinfo->indirect_pages); } s->indirect_grants[i]->gref = GRANT_INVALID_REF; - list_add_tail(&s->indirect_grants[i]->node, &info->grants); + list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants); } } } + + return 1; } static irqreturn_t blkif_interrupt(int irq, void *dev_id) @@ -1299,24 +1540,22 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) struct blkif_response *bret; RING_IDX i, rp; unsigned long flags; - struct blkfront_info *info = (struct blkfront_info *)dev_id; + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id; + struct blkfront_info *info = rinfo->dev_info; int error; - spin_lock_irqsave(&info->io_lock, flags); - - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { - spin_unlock_irqrestore(&info->io_lock, flags); + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) return IRQ_HANDLED; - } + spin_lock_irqsave(&rinfo->ring_lock, flags); again: - rp = info->ring.sring->rsp_prod; + rp = rinfo->ring.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. */ - for (i = info->ring.rsp_cons; i != rp; i++) { + for (i = rinfo->ring.rsp_cons; i != rp; i++) { unsigned long id; - bret = RING_GET_RESPONSE(&info->ring, i); + bret = RING_GET_RESPONSE(&rinfo->ring, i); id = bret->id; /* * The backend has messed up and given us an id that we would @@ -1330,12 +1569,18 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) * the id is busted. */ continue; } - req = info->shadow[id].request; + req = rinfo->shadow[id].request; - if (bret->operation != BLKIF_OP_DISCARD) - blkif_completion(&info->shadow[id], info, bret); + if (bret->operation != BLKIF_OP_DISCARD) { + /* + * We may need to wait for an extra response if the + * I/O request is split in 2 + */ + if (!blkif_completion(&id, rinfo, bret)) + continue; + } - if (add_id_to_freelist(info, id)) { + if (add_id_to_freelist(rinfo, id)) { WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", info->gd->disk_name, op_name(bret->operation), id); continue; @@ -1364,7 +1609,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) error = -EOPNOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && - info->shadow[id].req.u.rw.nr_segments == 0)) { + rinfo->shadow[id].req.u.rw.nr_segments == 0)) { printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", info->gd->disk_name, op_name(bret->operation)); error = -EOPNOTSUPP; @@ -1389,34 +1634,35 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) } } - info->ring.rsp_cons = i; + rinfo->ring.rsp_cons = i; - if (i != info->ring.req_prod_pvt) { + if (i != rinfo->ring.req_prod_pvt) { int more_to_do; - RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); + RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do); if (more_to_do) goto again; } else - info->ring.sring->rsp_event = i + 1; + rinfo->ring.sring->rsp_event = i + 1; - kick_pending_request_queues(info); + kick_pending_request_queues_locked(rinfo); - spin_unlock_irqrestore(&info->io_lock, flags); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return IRQ_HANDLED; } static int setup_blkring(struct xenbus_device *dev, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { struct blkif_sring *sring; int err, i; + struct blkfront_info *info = rinfo->dev_info; unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; grant_ref_t gref[XENBUS_MAX_RING_GRANTS]; for (i = 0; i < info->nr_ring_pages; i++) - info->ring_ref[i] = GRANT_INVALID_REF; + rinfo->ring_ref[i] = GRANT_INVALID_REF; sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, get_order(ring_size)); @@ -1425,29 +1671,29 @@ static int setup_blkring(struct xenbus_device *dev, return -ENOMEM; } SHARED_RING_INIT(sring); - FRONT_RING_INIT(&info->ring, sring, ring_size); + FRONT_RING_INIT(&rinfo->ring, sring, ring_size); - err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref); + err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref); if (err < 0) { free_pages((unsigned long)sring, get_order(ring_size)); - info->ring.sring = NULL; + rinfo->ring.sring = NULL; goto fail; } for (i = 0; i < info->nr_ring_pages; i++) - info->ring_ref[i] = gref[i]; + rinfo->ring_ref[i] = gref[i]; - err = xenbus_alloc_evtchn(dev, &info->evtchn); + err = xenbus_alloc_evtchn(dev, &rinfo->evtchn); if (err) goto fail; - err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0, - "blkif", info); + err = bind_evtchn_to_irqhandler(rinfo->evtchn, blkif_interrupt, 0, + "blkif", rinfo); if (err <= 0) { xenbus_dev_fatal(dev, err, "bind_evtchn_to_irqhandler failed"); goto fail; } - info->irq = err; + rinfo->irq = err; return 0; fail: @@ -1455,6 +1701,53 @@ fail: return err; } +/* + * Write out per-ring/queue nodes including ring-ref and event-channel, and each + * ring buffer may have multi pages depending on ->nr_ring_pages. + */ +static int write_per_ring_nodes(struct xenbus_transaction xbt, + struct blkfront_ring_info *rinfo, const char *dir) +{ + int err; + unsigned int i; + const char *message = NULL; + struct blkfront_info *info = rinfo->dev_info; + + if (info->nr_ring_pages == 1) { + err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } else { + for (i = 0; i < info->nr_ring_pages; i++) { + char ring_ref_name[RINGREF_NAME_LEN]; + + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); + err = xenbus_printf(xbt, dir, ring_ref_name, + "%u", rinfo->ring_ref[i]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } + } + + err = xenbus_printf(xbt, dir, "event-channel", "%u", rinfo->evtchn); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + + return 0; + +abort_transaction: + xenbus_transaction_end(xbt, 1); + if (message) + xenbus_dev_fatal(info->xbdev, err, "%s", message); + + return err; +} /* Common code used when first setting up, and when resuming. */ static int talk_to_blkback(struct xenbus_device *dev, @@ -1462,8 +1755,8 @@ static int talk_to_blkback(struct xenbus_device *dev, { const char *message = NULL; struct xenbus_transaction xbt; - int err, i; - unsigned int max_page_order = 0; + int err; + unsigned int i, max_page_order = 0; unsigned int ring_page_order = 0; err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, @@ -1475,10 +1768,14 @@ static int talk_to_blkback(struct xenbus_device *dev, info->nr_ring_pages = 1 << ring_page_order; } - /* Create shared ring, alloc event channel. */ - err = setup_blkring(dev, info); - if (err) - goto out; + for (i = 0; i < info->nr_rings; i++) { + struct blkfront_ring_info *rinfo = &info->rinfo[i]; + + /* Create shared ring, alloc event channel. */ + err = setup_blkring(dev, rinfo); + if (err) + goto destroy_blkring; + } again: err = xenbus_transaction_start(&xbt); @@ -1487,38 +1784,49 @@ again: goto destroy_blkring; } - if (info->nr_ring_pages == 1) { - err = xenbus_printf(xbt, dev->nodename, - "ring-ref", "%u", info->ring_ref[0]); + if (info->nr_ring_pages > 1) { + err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u", + ring_page_order); if (err) { - message = "writing ring-ref"; + message = "writing ring-page-order"; goto abort_transaction; } + } + + /* We already got the number of queues/rings in _probe */ + if (info->nr_rings == 1) { + err = write_per_ring_nodes(xbt, &info->rinfo[0], dev->nodename); + if (err) + goto destroy_blkring; } else { - err = xenbus_printf(xbt, dev->nodename, - "ring-page-order", "%u", ring_page_order); + char *path; + size_t pathsize; + + err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues", "%u", + info->nr_rings); if (err) { - message = "writing ring-page-order"; + message = "writing multi-queue-num-queues"; goto abort_transaction; } - for (i = 0; i < info->nr_ring_pages; i++) { - char ring_ref_name[RINGREF_NAME_LEN]; + pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN; + path = kmalloc(pathsize, GFP_KERNEL); + if (!path) { + err = -ENOMEM; + message = "ENOMEM while writing ring references"; + goto abort_transaction; + } - snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); - err = xenbus_printf(xbt, dev->nodename, ring_ref_name, - "%u", info->ring_ref[i]); + for (i = 0; i < info->nr_rings; i++) { + memset(path, 0, pathsize); + snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i); + err = write_per_ring_nodes(xbt, &info->rinfo[i], path); if (err) { - message = "writing ring-ref"; - goto abort_transaction; + kfree(path); + goto destroy_blkring; } } - } - err = xenbus_printf(xbt, dev->nodename, - "event-channel", "%u", info->evtchn); - if (err) { - message = "writing event-channel"; - goto abort_transaction; + kfree(path); } err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE); @@ -1540,9 +1848,14 @@ again: goto destroy_blkring; } - for (i = 0; i < BLK_RING_SIZE(info); i++) - info->shadow[i].req.u.rw.id = i+1; - info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + for (i = 0; i < info->nr_rings; i++) { + unsigned int j; + struct blkfront_ring_info *rinfo = &info->rinfo[i]; + + for (j = 0; j < BLK_RING_SIZE(info); j++) + rinfo->shadow[j].req.u.rw.id = j + 1; + rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + } xenbus_switch_state(dev, XenbusStateInitialised); return 0; @@ -1553,7 +1866,10 @@ again: xenbus_dev_fatal(dev, err, "%s", message); destroy_blkring: blkif_free(info, 0); - out: + + kfree(info); + dev_set_drvdata(&dev->dev, NULL); + return err; } @@ -1567,7 +1883,9 @@ static int blkfront_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) { int err, vdevice; + unsigned int r_index; struct blkfront_info *info; + unsigned int backend_max_queues = 0; /* FIXME: Use dynamic device id if this is not set. */ err = xenbus_scanf(XBT_NIL, dev->nodename, @@ -1617,15 +1935,39 @@ static int blkfront_probe(struct xenbus_device *dev, return -ENOMEM; } - mutex_init(&info->mutex); - spin_lock_init(&info->io_lock); info->xbdev = dev; + /* Check if backend supports multiple queues. */ + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "multi-queue-max-queues", "%u", &backend_max_queues); + if (err < 0) + backend_max_queues = 1; + + info->nr_rings = min(backend_max_queues, xen_blkif_max_queues); + /* We need at least one ring. */ + if (!info->nr_rings) + info->nr_rings = 1; + + info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL); + if (!info->rinfo) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure"); + kfree(info); + return -ENOMEM; + } + + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + INIT_LIST_HEAD(&rinfo->indirect_pages); + INIT_LIST_HEAD(&rinfo->grants); + rinfo->dev_info = info; + INIT_WORK(&rinfo->work, blkif_restart_queue); + spin_lock_init(&rinfo->ring_lock); + } + + mutex_init(&info->mutex); info->vdevice = vdevice; - INIT_LIST_HEAD(&info->grants); - INIT_LIST_HEAD(&info->indirect_pages); - info->persistent_gnts_c = 0; info->connected = BLKIF_STATE_DISCONNECTED; - INIT_WORK(&info->work, blkif_restart_queue); /* Front end dir is a number, which is used as the id. */ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); @@ -1649,7 +1991,7 @@ static void split_bio_end(struct bio *bio) static int blkif_recover(struct blkfront_info *info) { - int i; + unsigned int i, r_index; struct request *req, *n; struct blk_shadow *copy; int rc; @@ -1660,64 +2002,73 @@ static int blkif_recover(struct blkfront_info *info) struct split_bio *split_bio; struct list_head requests; - /* Stage 1: Make a safe copy of the shadow state. */ - copy = kmemdup(info->shadow, sizeof(info->shadow), - GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); - if (!copy) - return -ENOMEM; - - /* Stage 2: Set up free list. */ - memset(&info->shadow, 0, sizeof(info->shadow)); - for (i = 0; i < BLK_RING_SIZE(info); i++) - info->shadow[i].req.u.rw.id = i+1; - info->shadow_free = info->ring.req_prod_pvt; - info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; - - rc = blkfront_gather_backend_features(info); - if (rc) { - kfree(copy); - return rc; - } - + blkfront_gather_backend_features(info); segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; blk_queue_max_segments(info->rq, segs); bio_list_init(&bio_list); INIT_LIST_HEAD(&requests); - for (i = 0; i < BLK_RING_SIZE(info); i++) { - /* Not in use? */ - if (!copy[i].request) - continue; - /* - * Get the bios in the request so we can re-queue them. - */ - if (copy[i].request->cmd_flags & - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + /* Stage 1: Make a safe copy of the shadow state. */ + copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow), + GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); + if (!copy) + return -ENOMEM; + + /* Stage 2: Set up free list. */ + memset(&rinfo->shadow, 0, sizeof(rinfo->shadow)); + for (i = 0; i < BLK_RING_SIZE(info); i++) + rinfo->shadow[i].req.u.rw.id = i+1; + rinfo->shadow_free = rinfo->ring.req_prod_pvt; + rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + + rc = blkfront_setup_indirect(rinfo); + if (rc) { + kfree(copy); + return rc; + } + + for (i = 0; i < BLK_RING_SIZE(info); i++) { + /* Not in use? */ + if (!copy[i].request) + continue; + /* - * Flush operations don't contain bios, so - * we need to requeue the whole request + * Get the bios in the request so we can re-queue them. */ - list_add(©[i].request->queuelist, &requests); - continue; + if (copy[i].request->cmd_flags & + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + /* + * Flush operations don't contain bios, so + * we need to requeue the whole request + */ + list_add(©[i].request->queuelist, &requests); + continue; + } + merge_bio.head = copy[i].request->bio; + merge_bio.tail = copy[i].request->biotail; + bio_list_merge(&bio_list, &merge_bio); + copy[i].request->bio = NULL; + blk_end_request_all(copy[i].request, 0); } - merge_bio.head = copy[i].request->bio; - merge_bio.tail = copy[i].request->biotail; - bio_list_merge(&bio_list, &merge_bio); - copy[i].request->bio = NULL; - blk_end_request_all(copy[i].request, 0); - } - - kfree(copy); + kfree(copy); + } xenbus_switch_state(info->xbdev, XenbusStateConnected); - spin_lock_irq(&info->io_lock); - /* Now safe for us to use the shared ring */ info->connected = BLKIF_STATE_CONNECTED; - /* Kick any other new requests queued since we resumed */ - kick_pending_request_queues(info); + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + /* Kick any other new requests queued since we resumed */ + kick_pending_request_queues(rinfo); + } list_for_each_entry_safe(req, n, &requests, queuelist) { /* Requeue pending requests (flush or discard) */ @@ -1725,7 +2076,6 @@ static int blkif_recover(struct blkfront_info *info) BUG_ON(req->nr_phys_segments > segs); blk_mq_requeue_request(req); } - spin_unlock_irq(&info->io_lock); blk_mq_kick_requeue_list(info->rq); while ((bio = bio_list_pop(&bio_list)) != NULL) { @@ -1790,8 +2140,7 @@ static int blkfront_resume(struct xenbus_device *dev) return err; } -static void -blkfront_closing(struct blkfront_info *info) +static void blkfront_closing(struct blkfront_info *info) { struct xenbus_device *xbdev = info->xbdev; struct block_device *bdev = NULL; @@ -1851,18 +2200,29 @@ static void blkfront_setup_discard(struct blkfront_info *info) info->feature_secdiscard = !!discard_secure; } -static int blkfront_setup_indirect(struct blkfront_info *info) +static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo) { unsigned int psegs, grants; int err, i; + struct blkfront_info *info = rinfo->dev_info; - if (info->max_indirect_segments == 0) - grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; + if (info->max_indirect_segments == 0) { + if (!HAS_EXTRA_REQ) + grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; + else { + /* + * When an extra req is required, the maximum + * grants supported is related to the size of the + * Linux block segment. + */ + grants = GRANTS_PER_PSEG; + } + } else grants = info->max_indirect_segments; psegs = grants / GRANTS_PER_PSEG; - err = fill_grant_buffer(info, + err = fill_grant_buffer(rinfo, (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info)); if (err) goto out_of_memory; @@ -1875,31 +2235,31 @@ static int blkfront_setup_indirect(struct blkfront_info *info) */ int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); - BUG_ON(!list_empty(&info->indirect_pages)); + BUG_ON(!list_empty(&rinfo->indirect_pages)); for (i = 0; i < num; i++) { struct page *indirect_page = alloc_page(GFP_NOIO); if (!indirect_page) goto out_of_memory; - list_add(&indirect_page->lru, &info->indirect_pages); + list_add(&indirect_page->lru, &rinfo->indirect_pages); } } for (i = 0; i < BLK_RING_SIZE(info); i++) { - info->shadow[i].grants_used = kzalloc( - sizeof(info->shadow[i].grants_used[0]) * grants, + rinfo->shadow[i].grants_used = kzalloc( + sizeof(rinfo->shadow[i].grants_used[0]) * grants, GFP_NOIO); - info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO); + rinfo->shadow[i].sg = kzalloc(sizeof(rinfo->shadow[i].sg[0]) * psegs, GFP_NOIO); if (info->max_indirect_segments) - info->shadow[i].indirect_grants = kzalloc( - sizeof(info->shadow[i].indirect_grants[0]) * + rinfo->shadow[i].indirect_grants = kzalloc( + sizeof(rinfo->shadow[i].indirect_grants[0]) * INDIRECT_GREFS(grants), GFP_NOIO); - if ((info->shadow[i].grants_used == NULL) || - (info->shadow[i].sg == NULL) || + if ((rinfo->shadow[i].grants_used == NULL) || + (rinfo->shadow[i].sg == NULL) || (info->max_indirect_segments && - (info->shadow[i].indirect_grants == NULL))) + (rinfo->shadow[i].indirect_grants == NULL))) goto out_of_memory; - sg_init_table(info->shadow[i].sg, psegs); + sg_init_table(rinfo->shadow[i].sg, psegs); } @@ -1907,16 +2267,16 @@ static int blkfront_setup_indirect(struct blkfront_info *info) out_of_memory: for (i = 0; i < BLK_RING_SIZE(info); i++) { - kfree(info->shadow[i].grants_used); - info->shadow[i].grants_used = NULL; - kfree(info->shadow[i].sg); - info->shadow[i].sg = NULL; - kfree(info->shadow[i].indirect_grants); - info->shadow[i].indirect_grants = NULL; - } - if (!list_empty(&info->indirect_pages)) { + kfree(rinfo->shadow[i].grants_used); + rinfo->shadow[i].grants_used = NULL; + kfree(rinfo->shadow[i].sg); + rinfo->shadow[i].sg = NULL; + kfree(rinfo->shadow[i].indirect_grants); + rinfo->shadow[i].indirect_grants = NULL; + } + if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n; - list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { list_del(&indirect_page->lru); __free_page(indirect_page); } @@ -1927,7 +2287,7 @@ out_of_memory: /* * Gather all backend feature-* */ -static int blkfront_gather_backend_features(struct blkfront_info *info) +static void blkfront_gather_backend_features(struct blkfront_info *info) { int err; int barrier, flush, discard, persistent; @@ -1982,8 +2342,6 @@ static int blkfront_gather_backend_features(struct blkfront_info *info) else info->max_indirect_segments = min(indirect_segments, xen_blkif_max_segments); - - return blkfront_setup_indirect(info); } /* @@ -1996,7 +2354,7 @@ static void blkfront_connect(struct blkfront_info *info) unsigned long sector_size; unsigned int physical_sector_size; unsigned int binfo; - int err; + int err, i; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -2053,11 +2411,15 @@ static void blkfront_connect(struct blkfront_info *info) if (err != 1) physical_sector_size = sector_size; - err = blkfront_gather_backend_features(info); - if (err) { - xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", - info->xbdev->otherend); - return; + blkfront_gather_backend_features(info); + for (i = 0; i < info->nr_rings; i++) { + err = blkfront_setup_indirect(&info->rinfo[i]); + if (err) { + xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", + info->xbdev->otherend); + blkif_free(info, 0); + break; + } } err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, @@ -2071,10 +2433,9 @@ static void blkfront_connect(struct blkfront_info *info) xenbus_switch_state(info->xbdev, XenbusStateConnected); /* Kick pending requests. */ - spin_lock_irq(&info->io_lock); info->connected = BLKIF_STATE_CONNECTED; - kick_pending_request_queues(info); - spin_unlock_irq(&info->io_lock); + for (i = 0; i < info->nr_rings; i++) + kick_pending_request_queues(&info->rinfo[i]); add_disk(info->gd); @@ -2095,11 +2456,8 @@ static void blkback_changed(struct xenbus_device *dev, case XenbusStateInitWait: if (dev->state != XenbusStateInitialising) break; - if (talk_to_blkback(dev, info)) { - kfree(info); - dev_set_drvdata(&dev->dev, NULL); + if (talk_to_blkback(dev, info)) break; - } case XenbusStateInitialising: case XenbusStateInitialised: case XenbusStateReconfiguring: @@ -2108,6 +2466,10 @@ static void blkback_changed(struct xenbus_device *dev, break; case XenbusStateConnected: + if (dev->state != XenbusStateInitialised) { + if (talk_to_blkback(dev, info)) + break; + } blkfront_connect(info); break; @@ -2281,6 +2643,7 @@ static struct xenbus_driver blkfront_driver = { static int __init xlblk_init(void) { int ret; + int nr_cpus = num_online_cpus(); if (!xen_domain()) return -ENODEV; @@ -2288,7 +2651,13 @@ static int __init xlblk_init(void) if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); - xen_blkif_max_ring_order = 0; + xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; + } + + if (xen_blkif_max_queues > nr_cpus) { + pr_info("Invalid max_queues (%d), will use default max: %d.\n", + xen_blkif_max_queues, nr_cpus); + xen_blkif_max_queues = nr_cpus; } if (!xen_has_pv_disk_devices()) |