diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-02 22:46:35 +0200 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-08-02 22:46:35 +0200 |
commit | c013d0af81f60cc7dbe357c4e2a925fb6738dbfe (patch) | |
tree | 171dfdf928d0450a3fa98a58b2297d857804bb35 /drivers/block | |
parent | Merge tag 'for-5.20/io_uring-zerocopy-send-2022-07-29' of git://git.kernel.dk... (diff) | |
parent | ublk_drv: fix double shift bug (diff) | |
download | linux-c013d0af81f60cc7dbe357c4e2a925fb6738dbfe.tar.xz linux-c013d0af81f60cc7dbe357c4e2a925fb6738dbfe.zip |
Merge tag 'for-5.20/block-2022-07-29' of git://git.kernel.dk/linux-block
Pull block updates from Jens Axboe:
- Improve the type checking of request flags (Bart)
- Ensure queue mapping for a single queue always picks the right queue
(Bart)
- Sanitize the io priority handling (Jan)
- rq-qos race fix (Jinke)
- Reserved tags handling improvements (John)
- Separate memory alignment from file/disk offset alignment for O_DIRECT
(Keith)
- Add new ublk driver, userspace block driver using io_uring for
communication with the userspace backend (Ming)
- Use try_cmpxchg() to cleanup the code in various spots (Uros)
- Finally remove bdevname() (Christoph)
- Clean up the zoned device handling (Christoph)
- Clean up independent access range support (Christoph)
- Clean up and improve block sysfs handling (Christoph)
- Clean up and improve teardown of block devices.
This turns the usual two step process into something that is simpler
to implement and handle in block drivers (Christoph)
- Clean up chunk size handling (Christoph)
- Misc cleanups and fixes (Bart, Bo, Dan, GuoYong, Jason, Keith, Liu,
Ming, Sebastian, Yang, Ying)
* tag 'for-5.20/block-2022-07-29' of git://git.kernel.dk/linux-block: (178 commits)
ublk_drv: fix double shift bug
ublk_drv: make sure that correct flags(features) returned to userspace
ublk_drv: fix error handling of ublk_add_dev
ublk_drv: fix lockdep warning
block: remove __blk_get_queue
block: call blk_mq_exit_queue from disk_release for never added disks
blk-mq: fix error handling in __blk_mq_alloc_disk
ublk: defer disk allocation
ublk: rewrite ublk_ctrl_get_queue_affinity to not rely on hctx->cpumask
ublk: fold __ublk_create_dev into ublk_ctrl_add_dev
ublk: cleanup ublk_ctrl_uring_cmd
ublk: simplify ublk_ch_open and ublk_ch_release
ublk: remove the empty open and release block device operations
ublk: remove UBLK_IO_F_PREFLUSH
ublk: add a MAINTAINERS entry
block: don't allow the same type rq_qos add more than once
mmc: fix disk/queue leak in case of adding disk failure
ublk_drv: fix an IS_ERR() vs NULL check
ublk: remove UBLK_IO_F_INTEGRITY
ublk_drv: remove unneeded semicolon
...
Diffstat (limited to 'drivers/block')
48 files changed, 1717 insertions, 393 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index fdb81f2794cd..e19fcab016ba 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -408,6 +408,15 @@ config BLK_DEV_RBD If unsure, say N. +config BLK_DEV_UBLK + tristate "Userspace block driver (Experimental)" + select IO_URING + help + io_uring based userspace block driver. Together with ublk server, ublk + has been working well, but interface with userspace or command data + definition isn't finalized yet, and might change according to future + requirement, so mark is as experimental now. + source "drivers/block/rnbd/Kconfig" endif # BLK_DEV diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 934a9c7c3a7c..be631352567e 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -39,4 +39,6 @@ obj-$(CONFIG_BLK_DEV_RNBD) += rnbd/ obj-$(CONFIG_BLK_DEV_NULL_BLK) += null_blk/ +obj-$(CONFIG_BLK_DEV_UBLK) += ublk_drv.o + swim_mod-y := swim.o swim_asm.o diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 5a566f2fd533..4c8b2ba579ee 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1802,7 +1802,7 @@ static int fd_alloc_disk(int drive, int system) unit[drive].gendisk[system] = disk; err = add_disk(disk); if (err) - blk_cleanup_disk(disk); + put_disk(disk); return err; } diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index 348adf335217..12b3ca8f6f4a 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -427,7 +427,7 @@ aoeblk_gdalloc(void *vp) return; out_disk_cleanup: - blk_cleanup_disk(gd); + put_disk(gd); err_tagset: blk_mq_free_tag_set(set); err_mempool: diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c index b381d1c3ef32..3523dd82d7a0 100644 --- a/drivers/block/aoe/aoedev.c +++ b/drivers/block/aoe/aoedev.c @@ -277,7 +277,7 @@ freedev(struct aoedev *d) if (d->gd) { aoedisk_rm_debugfs(d); del_gendisk(d->gd); - blk_cleanup_disk(d->gd); + put_disk(d->gd); 
blk_mq_free_tag_set(&d->tag_set); } t = d->targets; diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index e232cc4fd444..9deb4df6bdb8 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -2031,7 +2031,7 @@ static void ataflop_probe(dev_t dev) return; cleanup_disk: - blk_cleanup_disk(unit[drive].disk[type]); + put_disk(unit[drive].disk[type]); unit[drive].disk[type] = NULL; } @@ -2045,7 +2045,6 @@ static void atari_floppy_cleanup(void) if (!unit[i].disk[type]) continue; del_gendisk(unit[i].disk[type]); - blk_cleanup_queue(unit[i].disk[type]->queue); put_disk(unit[i].disk[type]); } blk_mq_free_tag_set(&unit[i].tag_set); @@ -2064,7 +2063,7 @@ static void atari_cleanup_floppy_disk(struct atari_floppy_struct *fs) continue; if (fs->registered[type]) del_gendisk(fs->disk[type]); - blk_cleanup_disk(fs->disk[type]); + put_disk(fs->disk[type]); } blk_mq_free_tag_set(&fs->tag_set); } diff --git a/drivers/block/brd.c b/drivers/block/brd.c index 6e3f2f0d2352..859499cd1ff8 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -256,7 +256,7 @@ static void copy_from_brd(void *dst, struct brd_device *brd, * Process a single bvec of a bio. 
*/ static int brd_do_bvec(struct brd_device *brd, struct page *page, - unsigned int len, unsigned int off, unsigned int op, + unsigned int len, unsigned int off, enum req_op op, sector_t sector) { void *mem; @@ -310,7 +310,7 @@ static void brd_submit_bio(struct bio *bio) } static int brd_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, unsigned int op) + struct page *page, enum req_op op) { struct brd_device *brd = bdev->bd_disk->private_data; int err; @@ -419,7 +419,7 @@ static int brd_alloc(int i) return 0; out_cleanup_disk: - blk_cleanup_disk(disk); + put_disk(disk); out_free_dev: list_del(&brd->brd_list); kfree(brd); @@ -439,7 +439,7 @@ static void brd_cleanup(void) list_for_each_entry_safe(brd, next, &brd_devices, brd_list) { del_gendisk(brd->brd_disk); - blk_cleanup_disk(brd->brd_disk); + put_disk(brd->brd_disk); brd_free_pages(brd); list_del(&brd->brd_list); kfree(brd); diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index f5bcded3640d..e27478ae579c 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -124,12 +124,13 @@ void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_b static int _drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev, - sector_t sector, int op) + sector_t sector, enum req_op op) { struct bio *bio; /* we do all our meta data IO in aligned 4k blocks. 
*/ const int size = 4096; - int err, op_flags = 0; + int err; + blk_opf_t op_flags = 0; device->md_io.done = 0; device->md_io.error = -ENODEV; @@ -174,7 +175,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, } int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev, - sector_t sector, int op) + sector_t sector, enum req_op op) { int err; D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1); @@ -385,7 +386,7 @@ static int __al_write_transaction(struct drbd_device *device, struct al_transact write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; rcu_read_unlock(); if (write_al_updates) { - if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { + if (drbd_md_sync_page_io(device, device->ldev, sector, REQ_OP_WRITE)) { err = -EIO; drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); } else { diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 9e060e49b3f8..603f6828dd79 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -977,7 +977,7 @@ static void drbd_bm_endio(struct bio *bio) static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local) { struct drbd_device *device = ctx->device; - unsigned int op = (ctx->flags & BM_AIO_READ) ? REQ_OP_READ : REQ_OP_WRITE; + enum req_op op = ctx->flags & BM_AIO_READ ? 
REQ_OP_READ : REQ_OP_WRITE; struct bio *bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op, GFP_NOIO, &drbd_md_io_bio_set); struct drbd_bitmap *b = device->bitmap; diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 4d3efaa20b7b..f15f2f041596 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1495,7 +1495,7 @@ extern int drbd_resync_finished(struct drbd_device *device); extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent); extern void drbd_md_put_buffer(struct drbd_device *device); extern int drbd_md_sync_page_io(struct drbd_device *device, - struct drbd_backing_dev *bdev, sector_t sector, int op); + struct drbd_backing_dev *bdev, sector_t sector, enum req_op op); extern void drbd_ov_out_of_sync_found(struct drbd_device *, sector_t, int); extern void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev, unsigned int *done); @@ -1547,8 +1547,7 @@ extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, bool throttle_if_app_is_waiting); extern int drbd_submit_peer_request(struct drbd_device *, - struct drbd_peer_request *, const unsigned, - const unsigned, const int); + struct drbd_peer_request *, blk_opf_t, int); extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *); extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64, sector_t, unsigned int, diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 2887350ae010..f3e4db16fd07 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2207,7 +2207,7 @@ void drbd_destroy_device(struct kref *kref) if (device->bitmap) /* should no longer be there. 
*/ drbd_bm_cleanup(device); __free_page(device->md_io.page); - blk_cleanup_disk(device->vdisk); + put_disk(device->vdisk); kfree(device->rs_plan_s); /* not for_each_connection(connection, resource): @@ -2807,7 +2807,7 @@ out_no_minor_idr: out_no_bitmap: __free_page(device->md_io.page); out_no_io_page: - blk_cleanup_disk(disk); + put_disk(disk); out_no_disk: kref_put(&resource->kref, drbd_destroy_resource); kfree(device); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 6762be53f409..af4c7d65490b 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -1621,8 +1621,7 @@ static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, stru /* TODO allocate from our own bio_set. */ int drbd_submit_peer_request(struct drbd_device *device, struct drbd_peer_request *peer_req, - const unsigned op, const unsigned op_flags, - const int fault_type) + const blk_opf_t opf, const int fault_type) { struct bio *bios = NULL; struct bio *bio; @@ -1668,8 +1667,7 @@ int drbd_submit_peer_request(struct drbd_device *device, * generated bio, but a bio allocated on behalf of the peer. 
*/ next_bio: - bio = bio_alloc(device->ldev->backing_bdev, nr_pages, op | op_flags, - GFP_NOIO); + bio = bio_alloc(device->ldev->backing_bdev, nr_pages, opf, GFP_NOIO); /* > peer_req->i.sector, unless this is the first bio */ bio->bi_iter.bi_sector = sector; bio->bi_private = peer_req; @@ -2060,7 +2058,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto spin_unlock_irq(&device->resource->req_lock); atomic_add(pi->size >> 9, &device->rs_sect_ev); - if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE, 0, + if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE, DRBD_FAULT_RS_WR) == 0) return 0; @@ -2383,14 +2381,14 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co /* see also bio_flags_to_wire() * DRBD_REQ_*, because we need to semantically map the flags to data packet * flags and back. We may replicate to other kernel versions. */ -static unsigned long wire_flags_to_bio_flags(u32 dpf) +static blk_opf_t wire_flags_to_bio_flags(u32 dpf) { return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | (dpf & DP_FUA ? REQ_FUA : 0) | (dpf & DP_FLUSH ? 
REQ_PREFLUSH : 0); } -static unsigned long wire_flags_to_bio_op(u32 dpf) +static enum req_op wire_flags_to_bio_op(u32 dpf) { if (dpf & DP_ZEROES) return REQ_OP_WRITE_ZEROES; @@ -2543,7 +2541,8 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * struct drbd_peer_request *peer_req; struct p_data *p = pi->data; u32 peer_seq = be32_to_cpu(p->seq_num); - int op, op_flags; + enum req_op op; + blk_opf_t op_flags; u32 dp_flags; int err, tp; @@ -2681,7 +2680,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * peer_req->flags |= EE_CALL_AL_COMPLETE_IO; } - err = drbd_submit_peer_request(device, peer_req, op, op_flags, + err = drbd_submit_peer_request(device, peer_req, op | op_flags, DRBD_FAULT_DT_WR); if (!err) return 0; @@ -2979,7 +2978,7 @@ submit_for_resync: submit: update_receiver_timing_details(connection, drbd_submit_peer_request); inc_unacked(device); - if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, 0, + if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, fault_type) == 0) return 0; @@ -4951,7 +4950,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac if (get_ldev(device)) { struct drbd_peer_request *peer_req; - const int op = REQ_OP_WRITE_ZEROES; + const enum req_op op = REQ_OP_WRITE_ZEROES; peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector, size, 0, GFP_NOIO); @@ -4969,7 +4968,8 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac spin_unlock_irq(&device->resource->req_lock); atomic_add(pi->size >> 9, &device->rs_sect_ev); - err = drbd_submit_peer_request(device, peer_req, op, 0, DRBD_FAULT_RS_WR); + err = drbd_submit_peer_request(device, peer_req, op, + DRBD_FAULT_RS_WR); if (err) { spin_lock_irq(&device->resource->req_lock); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index e64bcfba30ef..6d8dd14458c6 100644 --- a/drivers/block/drbd/drbd_req.c +++ 
b/drivers/block/drbd/drbd_req.c @@ -523,16 +523,14 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req) { - char b[BDEVNAME_SIZE]; - if (!__ratelimit(&drbd_ratelimit_state)) return; - drbd_warn(device, "local %s IO error sector %llu+%u on %s\n", + drbd_warn(device, "local %s IO error sector %llu+%u on %pg\n", (req->rq_state & RQ_WRITE) ? "WRITE" : "READ", (unsigned long long)req->i.sector, req->i.size >> 9, - bdevname(device->ldev->backing_bdev, b)); + device->ldev->backing_bdev); } /* Helper for HANDED_OVER_TO_NETWORK. diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index af3051dd8912..0bb1a900c2d5 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -405,7 +405,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, spin_unlock_irq(&device->resource->req_lock); atomic_add(size >> 9, &device->rs_sect_ev); - if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, 0, + if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, DRBD_FAULT_RS_RD) == 0) return 0; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 015841f50f4e..ccad3d7b3ddd 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2859,7 +2859,7 @@ static blk_status_t floppy_queue_rq(struct blk_mq_hw_ctx *hctx, if (WARN(atomic_read(&usage_count) == 0, "warning: usage count=0, current_req=%p sect=%ld flags=%llx\n", current_req, (long)blk_rq_pos(current_req), - (unsigned long long) current_req->cmd_flags)) + (__force unsigned long long) current_req->cmd_flags)) return BLK_STS_IOERR; if (test_and_set_bit(0, &fdc_busy)) { @@ -4557,7 +4557,7 @@ out: return; cleanup_disk: - blk_cleanup_disk(disks[drive][type]); + put_disk(disks[drive][type]); disks[drive][type] = NULL; mutex_unlock(&floppy_probe_lock); } @@ -4753,7 +4753,7 @@ out_put_disk: if (!disks[drive][0]) break; 
del_timer_sync(&motor_off_timer[drive]); - blk_cleanup_disk(disks[drive][0]); + put_disk(disks[drive][0]); blk_mq_free_tag_set(&tag_sets[drive]); } return err; @@ -4985,7 +4985,7 @@ static void __exit floppy_module_exit(void) } for (i = 0; i < ARRAY_SIZE(floppy_type); i++) { if (disks[drive][i]) - blk_cleanup_disk(disks[drive][i]); + put_disk(disks[drive][i]); } blk_mq_free_tag_set(&tag_sets[drive]); } diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 084f9b8a0ba3..e3c0ba93c1a3 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -2040,7 +2040,7 @@ static int loop_add(int i) return i; out_cleanup_disk: - blk_cleanup_disk(disk); + put_disk(disk); out_cleanup_tags: blk_mq_free_tag_set(&lo->tag_set); out_free_idr: @@ -2057,7 +2057,6 @@ static void loop_remove(struct loop_device *lo) { /* Make this loop device unreachable from pathname. */ del_gendisk(lo->lo_disk); - blk_cleanup_queue(lo->lo_disk->queue); blk_mq_free_tag_set(&lo->tag_set); mutex_lock(&loop_ctl_mutex); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 27386a572ba4..562725d222a7 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -94,17 +94,12 @@ /* Device instance number, incremented each time a device is probed. */ static int instance; -static LIST_HEAD(online_list); -static LIST_HEAD(removing_list); -static DEFINE_SPINLOCK(dev_lock); - /* * Global variable used to hold the major block device number * allocated in mtip_init(). 
*/ static int mtip_major; static struct dentry *dfs_parent; -static struct dentry *dfs_device_status; static u32 cpu_use[NR_CPUS]; @@ -146,11 +141,8 @@ static bool mtip_check_surprise_removal(struct driver_data *dd) pci_read_config_word(dd->pdev, 0x00, &vendor_id); if (vendor_id == 0xFFFF) { dd->sr = true; - if (dd->queue) - blk_queue_flag_set(QUEUE_FLAG_DEAD, dd->queue); - else - dev_warn(&dd->pdev->dev, - "%s: dd->queue is NULL\n", __func__); + if (dd->disk) + blk_mark_disk_dead(dd->disk); return true; /* device removed */ } @@ -2170,106 +2162,6 @@ static const struct attribute_group *mtip_disk_attr_groups[] = { NULL, }; -/* debugsfs entries */ - -static ssize_t show_device_status(struct device_driver *drv, char *buf) -{ - int size = 0; - struct driver_data *dd, *tmp; - unsigned long flags; - char id_buf[42]; - u16 status = 0; - - spin_lock_irqsave(&dev_lock, flags); - size += sprintf(&buf[size], "Devices Present:\n"); - list_for_each_entry_safe(dd, tmp, &online_list, online_list) { - if (dd->pdev) { - if (dd->port && - dd->port->identify && - dd->port->identify_valid) { - strlcpy(id_buf, - (char *) (dd->port->identify + 10), 21); - status = *(dd->port->identify + 141); - } else { - memset(id_buf, 0, 42); - status = 0; - } - - if (dd->port && - test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags)) { - size += sprintf(&buf[size], - " device %s %s (ftl rebuild %d %%)\n", - dev_name(&dd->pdev->dev), - id_buf, - status); - } else { - size += sprintf(&buf[size], - " device %s %s\n", - dev_name(&dd->pdev->dev), - id_buf); - } - } - } - - size += sprintf(&buf[size], "Devices Being Removed:\n"); - list_for_each_entry_safe(dd, tmp, &removing_list, remove_list) { - if (dd->pdev) { - if (dd->port && - dd->port->identify && - dd->port->identify_valid) { - strlcpy(id_buf, - (char *) (dd->port->identify+10), 21); - status = *(dd->port->identify + 141); - } else { - memset(id_buf, 0, 42); - status = 0; - } - - if (dd->port && - test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags)) { - 
size += sprintf(&buf[size], - " device %s %s (ftl rebuild %d %%)\n", - dev_name(&dd->pdev->dev), - id_buf, - status); - } else { - size += sprintf(&buf[size], - " device %s %s\n", - dev_name(&dd->pdev->dev), - id_buf); - } - } - } - spin_unlock_irqrestore(&dev_lock, flags); - - return size; -} - -static ssize_t mtip_hw_read_device_status(struct file *f, char __user *ubuf, - size_t len, loff_t *offset) -{ - int size = *offset; - char *buf; - int rv = 0; - - if (!len || *offset) - return 0; - - buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - size += show_device_status(NULL, buf); - - *offset = size <= len ? size : len; - size = copy_to_user(ubuf, buf, *offset); - if (size) - rv = -EFAULT; - - kfree(buf); - return rv ? rv : *offset; -} - static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf, size_t len, loff_t *offset) { @@ -2363,13 +2255,6 @@ static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf, return rv ? rv : *offset; } -static const struct file_operations mtip_device_status_fops = { - .owner = THIS_MODULE, - .open = simple_open, - .read = mtip_hw_read_device_status, - .llseek = no_llseek, -}; - static const struct file_operations mtip_regs_fops = { .owner = THIS_MODULE, .open = simple_open, @@ -2556,7 +2441,7 @@ static void mtip_softirq_done_fn(struct request *rq) blk_mq_end_request(rq, cmd->status); } -static bool mtip_abort_cmd(struct request *req, void *data, bool reserved) +static bool mtip_abort_cmd(struct request *req, void *data) { struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req); struct driver_data *dd = data; @@ -2569,7 +2454,7 @@ static bool mtip_abort_cmd(struct request *req, void *data, bool reserved) return true; } -static bool mtip_queue_cmd(struct request *req, void *data, bool reserved) +static bool mtip_queue_cmd(struct request *req, void *data) { struct driver_data *dd = data; @@ -3297,26 +3182,12 @@ static int mtip_block_getgeo(struct block_device *dev, return 0; } -static int 
mtip_block_open(struct block_device *dev, fmode_t mode) +static void mtip_block_free_disk(struct gendisk *disk) { - struct driver_data *dd; + struct driver_data *dd = disk->private_data; - if (dev && dev->bd_disk) { - dd = (struct driver_data *) dev->bd_disk->private_data; - - if (dd) { - if (test_bit(MTIP_DDF_REMOVAL_BIT, - &dd->dd_flag)) { - return -ENODEV; - } - return 0; - } - } - return -ENODEV; -} - -static void mtip_block_release(struct gendisk *disk, fmode_t mode) -{ + ida_free(&rssd_index_ida, dd->index); + kfree(dd); } /* @@ -3326,13 +3197,12 @@ static void mtip_block_release(struct gendisk *disk, fmode_t mode) * layer. */ static const struct block_device_operations mtip_block_ops = { - .open = mtip_block_open, - .release = mtip_block_release, .ioctl = mtip_block_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = mtip_block_compat_ioctl, #endif .getgeo = mtip_block_getgeo, + .free_disk = mtip_block_free_disk, .owner = THIS_MODULE }; @@ -3487,12 +3357,11 @@ static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq, return 0; } -static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req, - bool reserved) +static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req) { struct driver_data *dd = req->q->queuedata; - if (reserved) { + if (blk_mq_is_reserved_rq(req)) { struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req); cmd->status = BLK_STS_TIMEOUT; @@ -3664,7 +3533,7 @@ init_hw_cmds_error: disk_index_error: ida_free(&rssd_index_ida, index); ida_get_error: - blk_cleanup_disk(dd->disk); + put_disk(dd->disk); block_queue_alloc_init_error: blk_mq_free_tag_set(&dd->tags); block_queue_alloc_tag_error: @@ -3673,72 +3542,6 @@ protocol_init_error: return rv; } -static bool mtip_no_dev_cleanup(struct request *rq, void *data, bool reserv) -{ - struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq); - - cmd->status = BLK_STS_IOERR; - blk_mq_complete_request(rq); - return true; -} - -/* - * Block layer deinitialization function. 
- * - * Called by the PCI layer as each P320 device is removed. - * - * @dd Pointer to the driver data structure. - * - * return value - * 0 - */ -static int mtip_block_remove(struct driver_data *dd) -{ - mtip_hw_debugfs_exit(dd); - - if (dd->mtip_svc_handler) { - set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags); - wake_up_interruptible(&dd->port->svc_wait); - kthread_stop(dd->mtip_svc_handler); - } - - if (!dd->sr) { - /* - * Explicitly wait here for IOs to quiesce, - * as mtip_standby_drive usually won't wait for IOs. - */ - if (!mtip_quiesce_io(dd->port, MTIP_QUIESCE_IO_TIMEOUT_MS)) - mtip_standby_drive(dd); - } - else - dev_info(&dd->pdev->dev, "device %s surprise removal\n", - dd->disk->disk_name); - - blk_freeze_queue_start(dd->queue); - blk_mq_quiesce_queue(dd->queue); - blk_mq_tagset_busy_iter(&dd->tags, mtip_no_dev_cleanup, dd); - blk_mq_unquiesce_queue(dd->queue); - - if (dd->disk) { - if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) - del_gendisk(dd->disk); - if (dd->disk->queue) { - blk_cleanup_queue(dd->queue); - blk_mq_free_tag_set(&dd->tags); - dd->queue = NULL; - } - put_disk(dd->disk); - } - dd->disk = NULL; - - ida_free(&rssd_index_ida, dd->index); - - /* De-initialize the protocol layer. */ - mtip_hw_exit(dd); - - return 0; -} - /* * Function called by the PCI layer when just before the * machine shuts down. @@ -3755,23 +3558,14 @@ static int mtip_block_shutdown(struct driver_data *dd) { mtip_hw_shutdown(dd); - /* Delete our gendisk structure, and cleanup the blk queue. 
*/ - if (dd->disk) { - dev_info(&dd->pdev->dev, - "Shutting down %s ...\n", dd->disk->disk_name); + dev_info(&dd->pdev->dev, + "Shutting down %s ...\n", dd->disk->disk_name); - if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) - del_gendisk(dd->disk); - if (dd->disk->queue) { - blk_cleanup_queue(dd->queue); - blk_mq_free_tag_set(&dd->tags); - } - put_disk(dd->disk); - dd->disk = NULL; - dd->queue = NULL; - } + if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) + del_gendisk(dd->disk); - ida_free(&rssd_index_ida, dd->index); + blk_mq_free_tag_set(&dd->tags); + put_disk(dd->disk); return 0; } @@ -3905,7 +3699,6 @@ static int mtip_pci_probe(struct pci_dev *pdev, const struct cpumask *node_mask; int cpu, i = 0, j = 0; int my_node = NUMA_NO_NODE; - unsigned long flags; /* Allocate memory for this devices private data. */ my_node = pcibus_to_node(pdev->bus); @@ -3952,9 +3745,6 @@ static int mtip_pci_probe(struct pci_dev *pdev, dd->pdev = pdev; dd->numa_node = my_node; - INIT_LIST_HEAD(&dd->online_list); - INIT_LIST_HEAD(&dd->remove_list); - memset(dd->workq_name, 0, 32); snprintf(dd->workq_name, 31, "mtipq%d", dd->instance); @@ -4047,11 +3837,6 @@ static int mtip_pci_probe(struct pci_dev *pdev, else rv = 0; /* device in rebuild state, return 0 from probe */ - /* Add to online list even if in ftl rebuild */ - spin_lock_irqsave(&dev_lock, flags); - list_add(&dd->online_list, &online_list); - spin_unlock_irqrestore(&dev_lock, flags); - goto done; block_initialize_err: @@ -4085,14 +3870,7 @@ done: static void mtip_pci_remove(struct pci_dev *pdev) { struct driver_data *dd = pci_get_drvdata(pdev); - unsigned long flags, to; - - set_bit(MTIP_DDF_REMOVAL_BIT, &dd->dd_flag); - - spin_lock_irqsave(&dev_lock, flags); - list_del_init(&dd->online_list); - list_add(&dd->remove_list, &removing_list); - spin_unlock_irqrestore(&dev_lock, flags); + unsigned long to; mtip_check_surprise_removal(dd); synchronize_irq(dd->pdev->irq); @@ -4109,11 +3887,35 @@ static void 
mtip_pci_remove(struct pci_dev *pdev) "Completion workers still active!\n"); } - blk_mark_disk_dead(dd->disk); set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag); - /* Clean up the block layer. */ - mtip_block_remove(dd); + if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) + del_gendisk(dd->disk); + + mtip_hw_debugfs_exit(dd); + + if (dd->mtip_svc_handler) { + set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags); + wake_up_interruptible(&dd->port->svc_wait); + kthread_stop(dd->mtip_svc_handler); + } + + if (!dd->sr) { + /* + * Explicitly wait here for IOs to quiesce, + * as mtip_standby_drive usually won't wait for IOs. + */ + if (!mtip_quiesce_io(dd->port, MTIP_QUIESCE_IO_TIMEOUT_MS)) + mtip_standby_drive(dd); + } + else + dev_info(&dd->pdev->dev, "device %s surprise removal\n", + dd->disk->disk_name); + + blk_mq_free_tag_set(&dd->tags); + + /* De-initialize the protocol layer. */ + mtip_hw_exit(dd); if (dd->isr_workq) { destroy_workqueue(dd->isr_workq); @@ -4124,14 +3926,10 @@ static void mtip_pci_remove(struct pci_dev *pdev) pci_disable_msi(pdev); - spin_lock_irqsave(&dev_lock, flags); - list_del_init(&dd->remove_list); - spin_unlock_irqrestore(&dev_lock, flags); - - kfree(dd); - pcim_iounmap_regions(pdev, 1 << MTIP_ABAR); pci_set_drvdata(pdev, NULL); + + put_disk(dd->disk); } /* @@ -4250,15 +4048,6 @@ static int __init mtip_init(void) pr_warn("Error creating debugfs parent\n"); dfs_parent = NULL; } - if (dfs_parent) { - dfs_device_status = debugfs_create_file("device_status", - 0444, dfs_parent, NULL, - &mtip_device_status_fops); - if (IS_ERR_OR_NULL(dfs_device_status)) { - pr_err("Error creating device_status node\n"); - dfs_device_status = NULL; - } - } /* Register our PCI operations. 
*/ error = pci_register_driver(&mtip_pci_driver); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index 6816beb45352..f7328f19ac5c 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -149,7 +149,6 @@ enum { MTIP_DDF_RESUME_BIT = 6, MTIP_DDF_INIT_DONE_BIT = 7, MTIP_DDF_REBUILD_FAILED_BIT = 8, - MTIP_DDF_REMOVAL_BIT = 9, MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | (1 << MTIP_DDF_SEC_LOCK_BIT) | @@ -462,10 +461,6 @@ struct driver_data { int isr_binding; - struct list_head online_list; /* linkage for online list */ - - struct list_head remove_list; /* linkage for removing list */ - int unal_qdepth; /* qdepth of unaligned IO queue */ }; diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c index e094d2b8b5a9..d914156db2d8 100644 --- a/drivers/block/n64cart.c +++ b/drivers/block/n64cart.c @@ -157,7 +157,7 @@ static int __init n64cart_probe(struct platform_device *pdev) return 0; out_cleanup_disk: - blk_cleanup_disk(disk); + put_disk(disk); out: return err; } diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 07f3c139a3d7..f5d098a148cb 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -250,7 +250,7 @@ static void nbd_dev_remove(struct nbd_device *nbd) struct gendisk *disk = nbd->disk; del_gendisk(disk); - blk_cleanup_disk(disk); + put_disk(disk); blk_mq_free_tag_set(&nbd->tag_set); /* @@ -393,8 +393,7 @@ static u32 req_to_nbd_cmd_type(struct request *req) } } -static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req, - bool reserved) +static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); struct nbd_device *nbd = cmd->nbd; @@ -880,7 +879,7 @@ static void recv_work(struct work_struct *work) kfree(args); } -static bool nbd_clear_req(struct request *req, void *data, bool reserved) +static bool nbd_clear_req(struct request *req, void *data) { struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req); 
@@ -1833,7 +1832,7 @@ static struct nbd_device *nbd_dev_add(int index, unsigned int refs) out_free_work: destroy_workqueue(nbd->recv_workq); out_err_disk: - blk_cleanup_disk(disk); + put_disk(disk); out_free_idr: mutex_lock(&nbd_index_mutex); idr_remove(&nbd_index_idr, index); diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 6b67088f4ea7..8b224ede2e33 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1310,7 +1310,7 @@ static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd, } static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd, - enum req_opf op, + enum req_op op, sector_t sector, sector_t nr_sectors) { @@ -1381,9 +1381,8 @@ static inline void nullb_complete_cmd(struct nullb_cmd *cmd) } } -blk_status_t null_process_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, - unsigned int nr_sectors) +blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, + sector_t sector, unsigned int nr_sectors) { struct nullb_device *dev = cmd->nq->dev; blk_status_t ret; @@ -1401,7 +1400,7 @@ blk_status_t null_process_cmd(struct nullb_cmd *cmd, } static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector, - sector_t nr_sectors, enum req_opf op) + sector_t nr_sectors, enum req_op op) { struct nullb_device *dev = cmd->nq->dev; struct nullb *nullb = dev->nullb; @@ -1578,7 +1577,7 @@ static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) return nr; } -static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res) +static enum blk_eh_timer_return null_timeout_rq(struct request *rq) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); @@ -1737,7 +1736,7 @@ static void null_del_dev(struct nullb *nullb) null_restart_queue_async(nullb); } - blk_cleanup_disk(nullb->disk); + put_disk(nullb->disk); if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) 
blk_mq_free_tag_set(nullb->tag_set); @@ -2082,7 +2081,7 @@ static int null_add_dev(struct nullb_device *dev) out_cleanup_zone: null_free_zoned_dev(dev); out_cleanup_disk: - blk_cleanup_disk(nullb->disk); + put_disk(nullb->disk); out_cleanup_tags: if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set) blk_mq_free_tag_set(nullb->tag_set); diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h index 8359b43842f2..6fbf0a1b2622 100644 --- a/drivers/block/null_blk/null_blk.h +++ b/drivers/block/null_blk/null_blk.h @@ -136,9 +136,8 @@ struct nullb { blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector, sector_t nr_sectors); -blk_status_t null_process_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, - unsigned int nr_sectors); +blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op, + sector_t sector, unsigned int nr_sectors); #ifdef CONFIG_BLK_DEV_ZONED int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q); @@ -146,9 +145,8 @@ int null_register_zoned_dev(struct nullb *nullb); void null_free_zoned_dev(struct nullb_device *dev); int null_report_zones(struct gendisk *disk, sector_t sector, unsigned int nr_zones, report_zones_cb cb, void *data); -blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, - sector_t nr_sectors); +blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op, + sector_t sector, sector_t nr_sectors); size_t null_zone_valid_read_len(struct nullb *nullb, sector_t sector, unsigned int len); #else @@ -164,7 +162,7 @@ static inline int null_register_zoned_dev(struct nullb *nullb) } static inline void null_free_zoned_dev(struct nullb_device *dev) {} static inline blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, - enum req_opf op, sector_t sector, sector_t nr_sectors) + enum req_op op, sector_t sector, sector_t nr_sectors) { return BLK_STS_NOTSUPP; } diff --git 
a/drivers/block/null_blk/trace.h b/drivers/block/null_blk/trace.h index 86d6c12c603c..6b2b370e786f 100644 --- a/drivers/block/null_blk/trace.h +++ b/drivers/block/null_blk/trace.h @@ -36,7 +36,7 @@ TRACE_EVENT(nullb_zone_op, TP_ARGS(cmd, zone_no, zone_cond), TP_STRUCT__entry( __array(char, disk, DISK_NAME_LEN) - __field(enum req_opf, op) + __field(enum req_op, op) __field(unsigned int, zone_no) __field(unsigned int, zone_cond) ), diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c index 2fdd7b20c224..55a69e48ef8b 100644 --- a/drivers/block/null_blk/zoned.c +++ b/drivers/block/null_blk/zoned.c @@ -159,7 +159,7 @@ int null_register_zoned_dev(struct nullb *nullb) struct nullb_device *dev = nullb->dev; struct request_queue *q = nullb->q; - blk_queue_set_zoned(nullb->disk, BLK_ZONED_HM); + disk_set_zoned(nullb->disk, BLK_ZONED_HM); blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q); blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE); @@ -170,12 +170,12 @@ int null_register_zoned_dev(struct nullb *nullb) return ret; } else { blk_queue_chunk_sectors(q, dev->zone_size_sects); - q->nr_zones = blkdev_nr_zones(nullb->disk); + nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0); } blk_queue_max_zone_append_sectors(q, dev->zone_size_sects); - blk_queue_max_open_zones(q, dev->zone_max_open); - blk_queue_max_active_zones(q, dev->zone_max_active); + disk_set_max_open_zones(nullb->disk, dev->zone_max_open); + disk_set_max_active_zones(nullb->disk, dev->zone_max_active); return 0; } @@ -600,7 +600,7 @@ static blk_status_t null_reset_zone(struct nullb_device *dev, return BLK_STS_OK; } -static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, +static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_op op, sector_t sector) { struct nullb_device *dev = cmd->nq->dev; @@ -653,7 +653,7 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_opf op, return ret; } -blk_status_t null_process_zoned_cmd(struct 
nullb_cmd *cmd, enum req_opf op, +blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op, sector_t sector, sector_t nr_sectors) { struct nullb_device *dev; diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index f462ad67931a..a5ab40784119 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -956,7 +956,7 @@ out_unreg_cdrom: out_pi_release: pi_release(cd->pi); out_free_disk: - blk_cleanup_disk(cd->disk); + put_disk(cd->disk); out_free_tag_set: blk_mq_free_tag_set(&cd->tag_set); return ret; @@ -1029,7 +1029,7 @@ static void __exit pcd_exit(void) unregister_cdrom(&cd->info); del_gendisk(cd->disk); pi_release(cd->pi); - blk_cleanup_disk(cd->disk); + put_disk(cd->disk); blk_mq_free_tag_set(&cd->tag_set); } diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 3637c38c72f9..f8a75bc90f70 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -501,6 +501,8 @@ static enum action do_pd_io_start(void) return do_pd_read_start(); else return do_pd_write_start(); + default: + break; } return Fail; } @@ -943,7 +945,7 @@ static int pd_probe_drive(struct pd_unit *disk, int autoprobe, int port, goto cleanup_disk; return 0; cleanup_disk: - blk_cleanup_disk(disk->gd); + put_disk(disk->gd); put_disk: put_disk(p); disk->gd = NULL; @@ -1018,7 +1020,7 @@ static void __exit pd_exit(void) if (p) { disk->gd = NULL; del_gendisk(p); - blk_cleanup_disk(p); + put_disk(p); blk_mq_free_tag_set(&disk->tag_set); pi_release(disk->pi); } diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index 292e9a4ce1b9..eec1b9fde245 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -975,7 +975,7 @@ static int __init pf_init_unit(struct pf_unit *pf, bool autoprobe, int port, out_pi_release: pi_release(pf->pi); out_free_disk: - blk_cleanup_disk(pf->disk); + put_disk(pf->disk); out_free_tag_set: blk_mq_free_tag_set(&pf->tag_set); return ret; @@ -1044,7 +1044,7 @@ static void 
__exit pf_exit(void) if (!pf->present) continue; del_gendisk(pf->disk); - blk_cleanup_disk(pf->disk); + put_disk(pf->disk); blk_mq_free_tag_set(&pf->tag_set); pi_release(pf->pi); } diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 789093375344..01a15dbd9cde 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2460,11 +2460,9 @@ static int pkt_seq_show(struct seq_file *m, void *p) { struct pktcdvd_device *pd = m->private; char *msg; - char bdev_buf[BDEVNAME_SIZE]; int states[PACKET_NUM_STATES]; - seq_printf(m, "Writer %s mapped to %s:\n", pd->name, - bdevname(pd->bdev, bdev_buf)); + seq_printf(m, "Writer %s mapped to %pg:\n", pd->name, pd->bdev); seq_printf(m, "\nSettings:\n"); seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2); @@ -2521,7 +2519,6 @@ static int pkt_seq_show(struct seq_file *m, void *p) static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) { int i; - char b[BDEVNAME_SIZE]; struct block_device *bdev; struct scsi_device *sdev; @@ -2534,8 +2531,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) if (!pd2) continue; if (pd2->bdev->bd_dev == dev) { - pkt_err(pd, "%s already setup\n", - bdevname(pd2->bdev, b)); + pkt_err(pd, "%pg already setup\n", pd2->bdev); return -EBUSY; } if (pd2->pkt_dev == dev) { @@ -2570,7 +2566,7 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev) } proc_create_single_data(pd->name, 0, pkt_proc, pkt_seq_show, pd); - pkt_dbg(1, pd, "writer mapped to %s\n", bdevname(bdev, b)); + pkt_dbg(1, pd, "writer mapped to %pg\n", bdev); return 0; out_mem: @@ -2733,7 +2729,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev) return 0; out_mem2: - blk_cleanup_disk(disk); + put_disk(disk); out_mem: mempool_exit(&pd->rb_pool); kfree(pd); @@ -2783,7 +2779,7 @@ static int pkt_remove_dev(dev_t pkt_dev) pkt_dbg(1, pd, "writer unmapped\n"); del_gendisk(pd->disk); - blk_cleanup_disk(pd->disk); + put_disk(pd->disk); mempool_exit(&pd->rb_pool); kfree(pd); diff --git 
a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 3054adf77460..36d7b36c60c7 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -473,7 +473,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) return 0; fail_cleanup_disk: - blk_cleanup_disk(gendisk); + put_disk(gendisk); fail_free_tag_set: blk_mq_free_tag_set(&priv->tag_set); fail_teardown: @@ -500,7 +500,7 @@ static void ps3disk_remove(struct ps3_system_bus_device *_dev) &ps3disk_mask); mutex_unlock(&ps3disk_mask_mutex); del_gendisk(priv->gendisk); - blk_cleanup_disk(priv->gendisk); + put_disk(priv->gendisk); blk_mq_free_tag_set(&priv->tag_set); dev_notice(&dev->sbd.core, "Synchronizing disk cache\n"); ps3disk_sync_cache(dev); diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 4f90819e245e..d1e0fefec90b 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -761,7 +761,7 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) return 0; out_cleanup_disk: - blk_cleanup_disk(gendisk); + put_disk(gendisk); out_cache_cleanup: remove_proc_entry(DEVICE_NAME, NULL); ps3vram_cache_cleanup(dev); @@ -792,7 +792,7 @@ static void ps3vram_remove(struct ps3_system_bus_device *dev) struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev); del_gendisk(priv->gendisk); - blk_cleanup_disk(priv->gendisk); + put_disk(priv->gendisk); remove_proc_entry(DEVICE_NAME, NULL); ps3vram_cache_cleanup(dev); iounmap(priv->reports); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index ef9bc62e9afd..0d8ec2fe5740 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -4729,7 +4729,7 @@ static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx, static void rbd_free_disk(struct rbd_device *rbd_dev) { - blk_cleanup_disk(rbd_dev->disk); + put_disk(rbd_dev->disk); blk_mq_free_tag_set(&rbd_dev->tag_set); rbd_dev->disk = NULL; } diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c index 409c76b81aed..b8d9e2824d9c 100644 --- 
a/drivers/block/rnbd/rnbd-clt.c +++ b/drivers/block/rnbd/rnbd-clt.c @@ -1408,7 +1408,7 @@ static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev, int idx) blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue); err = add_disk(dev->gd); if (err) - blk_cleanup_disk(dev->gd); + put_disk(dev->gd); return err; } @@ -1630,7 +1630,7 @@ put_sess: static void destroy_gen_disk(struct rnbd_clt_dev *dev) { del_gendisk(dev->gd); - blk_cleanup_disk(dev->gd); + put_disk(dev->gd); } static void destroy_sysfs(struct rnbd_clt_dev *dev, @@ -1755,7 +1755,7 @@ static void rnbd_destroy_sessions(void) list_for_each_entry_safe(dev, tn, &sess->devs_list, list) { /* * Here unmap happens in parallel for only one reason: - * blk_cleanup_queue() takes around half a second, so + * del_gendisk() takes around half a second, so * on huge amount of devices the whole module unload * procedure takes minutes. */ diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h index bfb08dd434d1..ea7ac8bca63c 100644 --- a/drivers/block/rnbd/rnbd-proto.h +++ b/drivers/block/rnbd/rnbd-proto.h @@ -229,9 +229,9 @@ static inline bool rnbd_flags_supported(u32 flags) return true; } -static inline u32 rnbd_to_bio_flags(u32 rnbd_opf) +static inline blk_opf_t rnbd_to_bio_flags(u32 rnbd_opf) { - u32 bio_opf; + blk_opf_t bio_opf; switch (rnbd_op(rnbd_opf)) { case RNBD_OP_READ: @@ -286,7 +286,8 @@ static inline u32 rq_to_rnbd_flags(struct request *rq) break; default: WARN(1, "Unknown request type %d (flags %llu)\n", - req_op(rq), (unsigned long long)rq->cmd_flags); + (__force u32)req_op(rq), + (__force unsigned long long)rq->cmd_flags); rnbd_opf = 0; } diff --git a/drivers/block/rnbd/rnbd-srv-dev.c b/drivers/block/rnbd/rnbd-srv-dev.c index c5d0a0391165..c63017f6e421 100644 --- a/drivers/block/rnbd/rnbd-srv-dev.c +++ b/drivers/block/rnbd/rnbd-srv-dev.c @@ -28,7 +28,6 @@ struct rnbd_dev *rnbd_dev_open(const char *path, fmode_t flags) goto err; dev->blk_open_flags = flags; - bdevname(dev->bdev, 
dev->name); return dev; diff --git a/drivers/block/rnbd/rnbd-srv-dev.h b/drivers/block/rnbd/rnbd-srv-dev.h index 4309e5252469..8407d12f70af 100644 --- a/drivers/block/rnbd/rnbd-srv-dev.h +++ b/drivers/block/rnbd/rnbd-srv-dev.h @@ -15,7 +15,6 @@ struct rnbd_dev { struct block_device *bdev; fmode_t blk_open_flags; - char name[BDEVNAME_SIZE]; }; /** diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c index feaa76c5a342..297a6924ff4e 100644 --- a/drivers/block/rnbd/rnbd-srv-sysfs.c +++ b/drivers/block/rnbd/rnbd-srv-sysfs.c @@ -38,14 +38,13 @@ static struct kobj_type dev_ktype = { }; int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, - struct block_device *bdev, - const char *dev_name) + struct block_device *bdev) { struct kobject *bdev_kobj; int ret; ret = kobject_init_and_add(&dev->dev_kobj, &dev_ktype, - rnbd_devs_kobj, dev_name); + rnbd_devs_kobj, "%pg", bdev); if (ret) { kobject_put(&dev->dev_kobj); return ret; diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c index beaef43a67b9..0713014bf423 100644 --- a/drivers/block/rnbd/rnbd-srv.c +++ b/drivers/block/rnbd/rnbd-srv.c @@ -419,7 +419,7 @@ static struct rnbd_srv_sess_dev return sess_dev; } -static struct rnbd_srv_dev *rnbd_srv_init_srv_dev(const char *id) +static struct rnbd_srv_dev *rnbd_srv_init_srv_dev(struct block_device *bdev) { struct rnbd_srv_dev *dev; @@ -427,7 +427,7 @@ static struct rnbd_srv_dev *rnbd_srv_init_srv_dev(const char *id) if (!dev) return ERR_PTR(-ENOMEM); - strscpy(dev->id, id, sizeof(dev->id)); + snprintf(dev->id, sizeof(dev->id), "%pg", bdev); kref_init(&dev->kref); INIT_LIST_HEAD(&dev->sess_dev_list); mutex_init(&dev->lock); @@ -512,7 +512,7 @@ rnbd_srv_get_or_create_srv_dev(struct rnbd_dev *rnbd_dev, int ret; struct rnbd_srv_dev *new_dev, *dev; - new_dev = rnbd_srv_init_srv_dev(rnbd_dev->name); + new_dev = rnbd_srv_init_srv_dev(rnbd_dev->bdev); if (IS_ERR(new_dev)) return new_dev; @@ -758,8 +758,7 @@ static int 
process_msg_open(struct rnbd_srv_session *srv_sess, */ mutex_lock(&srv_dev->lock); if (!srv_dev->dev_kobj.state_in_sysfs) { - ret = rnbd_srv_create_dev_sysfs(srv_dev, rnbd_dev->bdev, - rnbd_dev->name); + ret = rnbd_srv_create_dev_sysfs(srv_dev, rnbd_dev->bdev); if (ret) { mutex_unlock(&srv_dev->lock); rnbd_srv_err(srv_sess_dev, diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h index be2ae486d407..6926f9069dc4 100644 --- a/drivers/block/rnbd/rnbd-srv.h +++ b/drivers/block/rnbd/rnbd-srv.h @@ -68,8 +68,7 @@ void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev, /* rnbd-srv-sysfs.c */ int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev, - struct block_device *bdev, - const char *dir_name); + struct block_device *bdev); void rnbd_srv_destroy_dev_sysfs(struct rnbd_srv_dev *dev); int rnbd_srv_create_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev); void rnbd_srv_destroy_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev); diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index dd0a1a6fed29..fb855da971ee 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -886,7 +886,7 @@ static int probe_disk(struct vdc_port *port) return 0; out_cleanup_disk: - blk_cleanup_disk(g); + put_disk(g); out_free_tag: blk_mq_free_tag_set(&port->tag_set); return err; @@ -1070,7 +1070,7 @@ static void vdc_port_remove(struct vio_dev *vdev) del_timer_sync(&port->vio.timer); del_gendisk(port->disk); - blk_cleanup_disk(port->disk); + put_disk(port->disk); blk_mq_free_tag_set(&port->tag_set); vdc_free_tx_ring(port); diff --git a/drivers/block/swim.c b/drivers/block/swim.c index fef65a18d56f..42b4b6828690 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -783,7 +783,7 @@ static void swim_cleanup_floppy_disk(struct floppy_state *fs) if (fs->registered) del_gendisk(fs->disk); - blk_cleanup_disk(disk); + put_disk(disk); blk_mq_free_tag_set(&fs->tag_set); } diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c 
index 6c39f2c9f806..da811a7da03f 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -1238,7 +1238,7 @@ static int swim3_attach(struct macio_dev *mdev, return 0; out_cleanup_disk: - blk_cleanup_disk(disk); + put_disk(disk); out_free_tag_set: blk_mq_free_tag_set(&fs->tag_set); out_unregister: diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 63b4f6431d2e..0e1a484cab0b 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -1377,7 +1377,7 @@ static void carm_free_disk(struct carm_host *host, unsigned int port_no) if (host->state > HST_DEV_ACTIVATE) del_gendisk(disk); - blk_cleanup_disk(disk); + put_disk(disk); } static int carm_init_shm(struct carm_host *host) @@ -1536,7 +1536,7 @@ err_out_free_majors: clear_bit(0, &carm_major_alloc); else if (host->major == 161) clear_bit(1, &carm_major_alloc); - blk_cleanup_queue(host->oob_q); + blk_mq_destroy_queue(host->oob_q); blk_mq_free_tag_set(&host->tag_set); err_out_dma_free: dma_free_coherent(&pdev->dev, CARM_SHM_SIZE, host->shm, host->shm_dma); @@ -1570,7 +1570,7 @@ static void carm_remove_one (struct pci_dev *pdev) clear_bit(0, &carm_major_alloc); else if (host->major == 161) clear_bit(1, &carm_major_alloc); - blk_cleanup_queue(host->oob_q); + blk_mq_destroy_queue(host->oob_q); blk_mq_free_tag_set(&host->tag_set); dma_free_coherent(&pdev->dev, CARM_SHM_SIZE, host->shm, host->shm_dma); iounmap(host->mmio); diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c new file mode 100644 index 000000000000..3f1906965ac8 --- /dev/null +++ b/drivers/block/ublk_drv.c @@ -0,0 +1,1545 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Userspace block device - block device which IO is handled from userspace + * + * Take full use of io_uring passthrough command for communicating with + * ublk userspace daemon(ublksrvd) for handling basic IO request. 
+ * + * Copyright 2022 Ming Lei <ming.lei@redhat.com> + * + * (part of code stolen from loop.c) + */ +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/file.h> +#include <linux/stat.h> +#include <linux/errno.h> +#include <linux/major.h> +#include <linux/wait.h> +#include <linux/blkdev.h> +#include <linux/init.h> +#include <linux/swap.h> +#include <linux/slab.h> +#include <linux/compat.h> +#include <linux/mutex.h> +#include <linux/writeback.h> +#include <linux/completion.h> +#include <linux/highmem.h> +#include <linux/sysfs.h> +#include <linux/miscdevice.h> +#include <linux/falloc.h> +#include <linux/uio.h> +#include <linux/ioprio.h> +#include <linux/sched/mm.h> +#include <linux/uaccess.h> +#include <linux/cdev.h> +#include <linux/io_uring.h> +#include <linux/blk-mq.h> +#include <linux/delay.h> +#include <linux/mm.h> +#include <asm/page.h> +#include <linux/task_work.h> +#include <uapi/linux/ublk_cmd.h> + +#define UBLK_MINORS (1U << MINORBITS) + +/* All UBLK_F_* have to be included into UBLK_F_ALL */ +#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY | UBLK_F_URING_CMD_COMP_IN_TASK) + +struct ublk_rq_data { + struct callback_head work; +}; + +struct ublk_uring_cmd_pdu { + struct request *req; +}; + +/* + * io command is active: sqe cmd is received, and its cqe isn't done + * + * If the flag is set, the io command is owned by ublk driver, and waited + * for incoming blk-mq request from the ublk block device. + * + * If the flag is cleared, the io command will be completed, and owned by + * ublk server. 
+ */ +#define UBLK_IO_FLAG_ACTIVE 0x01 + +/* + * IO command is completed via cqe, and it is being handled by ublksrv, and + * not committed yet + * + * Basically mutually exclusive with UBLK_IO_FLAG_ACTIVE, so it can be used + * for cross verification + */ +#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02 + +/* + * IO command is aborted, so this flag is set in case of + * !UBLK_IO_FLAG_ACTIVE. + * + * After this flag is observed, any pending or new incoming request + * associated with this io command will be failed immediately + */ +#define UBLK_IO_FLAG_ABORTED 0x04 + +struct ublk_io { + /* userspace buffer address from io cmd */ + __u64 addr; + unsigned int flags; + int res; + + struct io_uring_cmd *cmd; +}; + +struct ublk_queue { + int q_id; + int q_depth; + + unsigned long flags; + struct task_struct *ubq_daemon; + char *io_cmd_buf; + + unsigned long io_addr; /* mapped vm address */ + unsigned int max_io_sz; + bool abort_work_pending; + unsigned short nr_io_ready; /* how many ios setup */ + struct ublk_device *dev; + struct ublk_io ios[0]; +}; + +#define UBLK_DAEMON_MONITOR_PERIOD (5 * HZ) + +struct ublk_device { + struct gendisk *ub_disk; + + char *__queues; + + unsigned short queue_size; + unsigned short bs_shift; + struct ublksrv_ctrl_dev_info dev_info; + + struct blk_mq_tag_set tag_set; + + struct cdev cdev; + struct device cdev_dev; + +#define UB_STATE_OPEN 0 +#define UB_STATE_USED 1 + unsigned long state; + int ub_number; + + struct mutex mutex; + + spinlock_t mm_lock; + struct mm_struct *mm; + + struct completion completion; + unsigned int nr_queues_ready; + atomic_t nr_aborted_queues; + + /* + * Our ubq->ubq_daemon may be killed without any notification, so + * monitor each queue's daemon periodically + */ + struct delayed_work monitor_work; + struct work_struct stop_work; +}; + +static dev_t ublk_chr_devt; +static struct class *ublk_chr_class; + +static DEFINE_IDR(ublk_index_idr); +static DEFINE_SPINLOCK(ublk_idr_lock); +static wait_queue_head_t ublk_idr_wq; /* wait until 
one idr is freed */ + +static DEFINE_MUTEX(ublk_ctl_mutex); + +static struct miscdevice ublk_misc; + +static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq) +{ + if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) && + !(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK)) + return true; + return false; +} + +static struct ublk_device *ublk_get_device(struct ublk_device *ub) +{ + if (kobject_get_unless_zero(&ub->cdev_dev.kobj)) + return ub; + return NULL; +} + +static void ublk_put_device(struct ublk_device *ub) +{ + put_device(&ub->cdev_dev); +} + +static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev, + int qid) +{ + return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]); +} + +static inline bool ublk_rq_has_data(const struct request *rq) +{ + return rq->bio && bio_has_data(rq->bio); +} + +static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq, + int tag) +{ + return (struct ublksrv_io_desc *) + &(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]); +} + +static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id) +{ + return ublk_get_queue(ub, q_id)->io_cmd_buf; +} + +static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id) +{ + struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + + return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc), + PAGE_SIZE); +} + +static void ublk_free_disk(struct gendisk *disk) +{ + struct ublk_device *ub = disk->private_data; + + clear_bit(UB_STATE_USED, &ub->state); + put_device(&ub->cdev_dev); +} + +static const struct block_device_operations ub_fops = { + .owner = THIS_MODULE, + .free_disk = ublk_free_disk, +}; + +#define UBLK_MAX_PIN_PAGES 32 + +struct ublk_map_data { + const struct ublk_queue *ubq; + const struct request *rq; + const struct ublk_io *io; + unsigned max_bytes; +}; + +struct ublk_io_iter { + struct page *pages[UBLK_MAX_PIN_PAGES]; + unsigned pg_off; /* offset in the 1st page in pages */ + int nr_pages; /* how many page pointers in 
pages */ + struct bio *bio; + struct bvec_iter iter; +}; + +static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data, + unsigned max_bytes, bool to_vm) +{ + const unsigned total = min_t(unsigned, max_bytes, + PAGE_SIZE - data->pg_off + + ((data->nr_pages - 1) << PAGE_SHIFT)); + unsigned done = 0; + unsigned pg_idx = 0; + + while (done < total) { + struct bio_vec bv = bio_iter_iovec(data->bio, data->iter); + const unsigned int bytes = min3(bv.bv_len, total - done, + (unsigned)(PAGE_SIZE - data->pg_off)); + void *bv_buf = bvec_kmap_local(&bv); + void *pg_buf = kmap_local_page(data->pages[pg_idx]); + + if (to_vm) + memcpy(pg_buf + data->pg_off, bv_buf, bytes); + else + memcpy(bv_buf, pg_buf + data->pg_off, bytes); + + kunmap_local(pg_buf); + kunmap_local(bv_buf); + + /* advance page array */ + data->pg_off += bytes; + if (data->pg_off == PAGE_SIZE) { + pg_idx += 1; + data->pg_off = 0; + } + + done += bytes; + + /* advance bio */ + bio_advance_iter_single(data->bio, &data->iter, bytes); + if (!data->iter.bi_size) { + data->bio = data->bio->bi_next; + if (data->bio == NULL) + break; + data->iter = data->bio->bi_iter; + } + } + + return done; +} + +static inline int ublk_copy_user_pages(struct ublk_map_data *data, + bool to_vm) +{ + const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0; + const unsigned long start_vm = data->io->addr; + unsigned int done = 0; + struct ublk_io_iter iter = { + .pg_off = start_vm & (PAGE_SIZE - 1), + .bio = data->rq->bio, + .iter = data->rq->bio->bi_iter, + }; + const unsigned int nr_pages = round_up(data->max_bytes + + (start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT; + + while (done < nr_pages) { + const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES, + nr_pages - done); + unsigned i, len; + + iter.nr_pages = get_user_pages_fast(start_vm + + (done << PAGE_SHIFT), to_pin, gup_flags, + iter.pages); + if (iter.nr_pages <= 0) + return done == 0 ? 
iter.nr_pages : done; + len = ublk_copy_io_pages(&iter, data->max_bytes, to_vm); + for (i = 0; i < iter.nr_pages; i++) { + if (to_vm) + set_page_dirty(iter.pages[i]); + put_page(iter.pages[i]); + } + data->max_bytes -= len; + done += iter.nr_pages; + } + + return done; +} + +static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req, + struct ublk_io *io) +{ + const unsigned int rq_bytes = blk_rq_bytes(req); + /* + * no zero copy, we delay copy WRITE request data into ublksrv + * context and the big benefit is that pinning pages in current + * context is pretty fast, see ublk_pin_user_pages + */ + if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH) + return rq_bytes; + + if (ublk_rq_has_data(req)) { + struct ublk_map_data data = { + .ubq = ubq, + .rq = req, + .io = io, + .max_bytes = rq_bytes, + }; + + ublk_copy_user_pages(&data, true); + + return rq_bytes - data.max_bytes; + } + return rq_bytes; +} + +static int ublk_unmap_io(const struct ublk_queue *ubq, + const struct request *req, + struct ublk_io *io) +{ + const unsigned int rq_bytes = blk_rq_bytes(req); + + if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) { + struct ublk_map_data data = { + .ubq = ubq, + .rq = req, + .io = io, + .max_bytes = io->res, + }; + + WARN_ON_ONCE(io->res > rq_bytes); + + ublk_copy_user_pages(&data, false); + + return io->res - data.max_bytes; + } + return rq_bytes; +} + +static inline unsigned int ublk_req_build_flags(struct request *req) +{ + unsigned flags = 0; + + if (req->cmd_flags & REQ_FAILFAST_DEV) + flags |= UBLK_IO_F_FAILFAST_DEV; + + if (req->cmd_flags & REQ_FAILFAST_TRANSPORT) + flags |= UBLK_IO_F_FAILFAST_TRANSPORT; + + if (req->cmd_flags & REQ_FAILFAST_DRIVER) + flags |= UBLK_IO_F_FAILFAST_DRIVER; + + if (req->cmd_flags & REQ_META) + flags |= UBLK_IO_F_META; + + if (req->cmd_flags & REQ_FUA) + flags |= UBLK_IO_F_FUA; + + if (req->cmd_flags & REQ_NOUNMAP) + flags |= UBLK_IO_F_NOUNMAP; + + if (req->cmd_flags & REQ_SWAP) + flags |= 
UBLK_IO_F_SWAP; + + return flags; +} + +static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req) +{ + struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag); + struct ublk_io *io = &ubq->ios[req->tag]; + u32 ublk_op; + + switch (req_op(req)) { + case REQ_OP_READ: + ublk_op = UBLK_IO_OP_READ; + break; + case REQ_OP_WRITE: + ublk_op = UBLK_IO_OP_WRITE; + break; + case REQ_OP_FLUSH: + ublk_op = UBLK_IO_OP_FLUSH; + break; + case REQ_OP_DISCARD: + ublk_op = UBLK_IO_OP_DISCARD; + break; + case REQ_OP_WRITE_ZEROES: + ublk_op = UBLK_IO_OP_WRITE_ZEROES; + break; + default: + return BLK_STS_IOERR; + } + + /* need to translate since kernel may change */ + iod->op_flags = ublk_op | ublk_req_build_flags(req); + iod->nr_sectors = blk_rq_sectors(req); + iod->start_sector = blk_rq_pos(req); + iod->addr = io->addr; + + return BLK_STS_OK; +} + +static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu( + struct io_uring_cmd *ioucmd) +{ + return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu; +} + +static bool ubq_daemon_is_dying(struct ublk_queue *ubq) +{ + return ubq->ubq_daemon->flags & PF_EXITING; +} + +/* todo: handle partial completion */ +static void ublk_complete_rq(struct request *req) +{ + struct ublk_queue *ubq = req->mq_hctx->driver_data; + struct ublk_io *io = &ubq->ios[req->tag]; + unsigned int unmapped_bytes; + + /* failed read IO if nothing is read */ + if (!io->res && req_op(req) == REQ_OP_READ) + io->res = -EIO; + + if (io->res < 0) { + blk_mq_end_request(req, errno_to_blk_status(io->res)); + return; + } + + /* + * FLUSH or DISCARD usually won't return a byte count, so end them + * directly. + * + * Neither of them needs unmapping. 
+ */ + if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) { + blk_mq_end_request(req, BLK_STS_OK); + return; + } + + /* for READ request, writing data in iod->addr to rq buffers */ + unmapped_bytes = ublk_unmap_io(ubq, req, io); + + /* + * Extremely unlikely since we got data filled in just before + * + * Re-read simply for this unlikely case. + */ + if (unlikely(unmapped_bytes < io->res)) + io->res = unmapped_bytes; + + if (blk_update_request(req, BLK_STS_OK, io->res)) + blk_mq_requeue_request(req, true); + else + __blk_mq_end_request(req, BLK_STS_OK); +} + +/* + * __ublk_fail_req() may be called from abort context or ->ubq_daemon + * context during exiting, so lock is required. + * + * Also aborting may not be started yet, keep in mind that one failed + * request may be issued by block layer again. + */ +static void __ublk_fail_req(struct ublk_io *io, struct request *req) +{ + WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE); + + if (!(io->flags & UBLK_IO_FLAG_ABORTED)) { + io->flags |= UBLK_IO_FLAG_ABORTED; + blk_mq_end_request(req, BLK_STS_IOERR); + } +} + +#define UBLK_REQUEUE_DELAY_MS 3 + +static inline void __ublk_rq_task_work(struct request *req) +{ + struct ublk_queue *ubq = req->mq_hctx->driver_data; + struct ublk_device *ub = ubq->dev; + int tag = req->tag; + struct ublk_io *io = &ubq->ios[tag]; + bool task_exiting = current != ubq->ubq_daemon || + (current->flags & PF_EXITING); + unsigned int mapped_bytes; + + pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n", + __func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags, + ublk_get_iod(ubq, req->tag)->addr); + + if (unlikely(task_exiting)) { + blk_mq_end_request(req, BLK_STS_IOERR); + mod_delayed_work(system_wq, &ub->monitor_work, 0); + return; + } + + mapped_bytes = ublk_map_io(ubq, req, io); + + /* partially mapped, update io descriptor */ + if (unlikely(mapped_bytes != blk_rq_bytes(req))) { + /* + * Nothing mapped, retry until we succeed. 
+ * + * We may never succeed in mapping any bytes here because + * of OOM. TODO: reserve one buffer with single page pinned + * for providing forward progress guarantee. + */ + if (unlikely(!mapped_bytes)) { + blk_mq_requeue_request(req, false); + blk_mq_delay_kick_requeue_list(req->q, + UBLK_REQUEUE_DELAY_MS); + return; + } + + ublk_get_iod(ubq, req->tag)->nr_sectors = + mapped_bytes >> 9; + } + + /* mark this cmd owned by ublksrv */ + io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV; + + /* + * clear ACTIVE since we are done with this sqe/cmd slot + * We can only accept io cmd in case of being not active. + */ + io->flags &= ~UBLK_IO_FLAG_ACTIVE; + + /* tell ublksrv one io request is coming */ + io_uring_cmd_done(io->cmd, UBLK_IO_RES_OK, 0); +} + +static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd) +{ + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + __ublk_rq_task_work(pdu->req); +} + +static void ublk_rq_task_work_fn(struct callback_head *work) +{ + struct ublk_rq_data *data = container_of(work, + struct ublk_rq_data, work); + struct request *req = blk_mq_rq_from_pdu(data); + + __ublk_rq_task_work(req); +} + +static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct ublk_queue *ubq = hctx->driver_data; + struct request *rq = bd->rq; + blk_status_t res; + + /* fill iod to slot in io cmd buffer */ + res = ublk_setup_iod(ubq, rq); + if (unlikely(res != BLK_STS_OK)) + return BLK_STS_IOERR; + + blk_mq_start_request(bd->rq); + + if (unlikely(ubq_daemon_is_dying(ubq))) { + fail: + mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); + return BLK_STS_IOERR; + } + + if (ublk_can_use_task_work(ubq)) { + struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq); + enum task_work_notify_mode notify_mode = bd->last ? 
+ TWA_SIGNAL_NO_IPI : TWA_NONE; + + if (task_work_add(ubq->ubq_daemon, &data->work, notify_mode)) + goto fail; + } else { + struct io_uring_cmd *cmd = ubq->ios[rq->tag].cmd; + struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd); + + pdu->req = rq; + io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb); + } + + return BLK_STS_OK; +} + +static void ublk_commit_rqs(struct blk_mq_hw_ctx *hctx) +{ + struct ublk_queue *ubq = hctx->driver_data; + + if (ublk_can_use_task_work(ubq)) + __set_notify_signal(ubq->ubq_daemon); +} + +static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data, + unsigned int hctx_idx) +{ + struct ublk_device *ub = driver_data; + struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num); + + hctx->driver_data = ubq; + return 0; +} + +static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req, + unsigned int hctx_idx, unsigned int numa_node) +{ + struct ublk_rq_data *data = blk_mq_rq_to_pdu(req); + + init_task_work(&data->work, ublk_rq_task_work_fn); + return 0; +} + +static const struct blk_mq_ops ublk_mq_ops = { + .queue_rq = ublk_queue_rq, + .commit_rqs = ublk_commit_rqs, + .init_hctx = ublk_init_hctx, + .init_request = ublk_init_rq, +}; + +static int ublk_ch_open(struct inode *inode, struct file *filp) +{ + struct ublk_device *ub = container_of(inode->i_cdev, + struct ublk_device, cdev); + + if (test_and_set_bit(UB_STATE_OPEN, &ub->state)) + return -EBUSY; + filp->private_data = ub; + return 0; +} + +static int ublk_ch_release(struct inode *inode, struct file *filp) +{ + struct ublk_device *ub = filp->private_data; + + clear_bit(UB_STATE_OPEN, &ub->state); + return 0; +} + +/* map pre-allocated per-queue cmd buffer to ublksrv daemon */ +static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct ublk_device *ub = filp->private_data; + size_t sz = vma->vm_end - vma->vm_start; + unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc); + unsigned long pfn, end, 
phys_off = vma->vm_pgoff << PAGE_SHIFT; + int q_id, ret = 0; + + spin_lock(&ub->mm_lock); + if (!ub->mm) + ub->mm = current->mm; + if (current->mm != ub->mm) + ret = -EINVAL; + spin_unlock(&ub->mm_lock); + + if (ret) + return ret; + + if (vma->vm_flags & VM_WRITE) + return -EPERM; + + end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz; + if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end) + return -EINVAL; + + q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz; + pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n", + __func__, q_id, current->pid, vma->vm_start, + phys_off, (unsigned long)sz); + + if (sz != ublk_queue_cmd_buf_size(ub, q_id)) + return -EINVAL; + + pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT; + return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot); +} + +static void ublk_commit_completion(struct ublk_device *ub, + struct ublksrv_io_cmd *ub_cmd) +{ + u32 qid = ub_cmd->q_id, tag = ub_cmd->tag; + struct ublk_queue *ubq = ublk_get_queue(ub, qid); + struct ublk_io *io = &ubq->ios[tag]; + struct request *req; + + /* now this cmd slot is owned by nbd driver */ + io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV; + io->res = ub_cmd->result; + + /* find the io request and complete */ + req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag); + + if (req && likely(!blk_should_fake_timeout(req->q))) + ublk_complete_rq(req); +} + +/* + * When ->ubq_daemon is exiting, either new request is ended immediately, + * or any queued io command is drained, so it is safe to abort queue + * lockless + */ +static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq) +{ + int i; + + if (!ublk_get_device(ub)) + return; + + for (i = 0; i < ubq->q_depth; i++) { + struct ublk_io *io = &ubq->ios[i]; + + if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) { + struct request *rq; + + /* + * Either we fail the request or ublk_rq_task_work_fn + * will do it + */ + rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i); + if (rq) + 
__ublk_fail_req(io, rq); + } + } + ublk_put_device(ub); +} + +static void ublk_daemon_monitor_work(struct work_struct *work) +{ + struct ublk_device *ub = + container_of(work, struct ublk_device, monitor_work.work); + int i; + + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) { + struct ublk_queue *ubq = ublk_get_queue(ub, i); + + if (ubq_daemon_is_dying(ubq)) { + schedule_work(&ub->stop_work); + + /* abort queue is for making forward progress */ + ublk_abort_queue(ub, ubq); + } + } + + /* + * We can't schedule monitor work after ublk_remove() is started. + * + * No need ub->mutex, monitor work are canceled after state is marked + * as DEAD, so DEAD state is observed reliably. + */ + if (ub->dev_info.state != UBLK_S_DEV_DEAD) + schedule_delayed_work(&ub->monitor_work, + UBLK_DAEMON_MONITOR_PERIOD); +} + +static void ublk_cancel_queue(struct ublk_queue *ubq) +{ + int i; + + for (i = 0; i < ubq->q_depth; i++) { + struct ublk_io *io = &ubq->ios[i]; + + if (io->flags & UBLK_IO_FLAG_ACTIVE) + io_uring_cmd_done(io->cmd, UBLK_IO_RES_ABORT, 0); + } +} + +/* Cancel all pending commands, must be called after del_gendisk() returns */ +static void ublk_cancel_dev(struct ublk_device *ub) +{ + int i; + + for (i = 0; i < ub->dev_info.nr_hw_queues; i++) + ublk_cancel_queue(ublk_get_queue(ub, i)); +} + +static void ublk_stop_dev(struct ublk_device *ub) +{ + mutex_lock(&ub->mutex); + if (ub->dev_info.state != UBLK_S_DEV_LIVE) + goto unlock; + + del_gendisk(ub->ub_disk); + ub->dev_info.state = UBLK_S_DEV_DEAD; + ub->dev_info.ublksrv_pid = -1; + ublk_cancel_dev(ub); + put_disk(ub->ub_disk); + ub->ub_disk = NULL; + unlock: + mutex_unlock(&ub->mutex); + cancel_delayed_work_sync(&ub->monitor_work); +} + +static inline bool ublk_queue_ready(struct ublk_queue *ubq) +{ + return ubq->nr_io_ready == ubq->q_depth; +} + +/* device can only be started after all IOs are ready */ +static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq) +{ + mutex_lock(&ub->mutex); + 
ubq->nr_io_ready++; + if (ublk_queue_ready(ubq)) { + ubq->ubq_daemon = current; + get_task_struct(ubq->ubq_daemon); + ub->nr_queues_ready++; + } + if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) + complete_all(&ub->completion); + mutex_unlock(&ub->mutex); +} + +static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) +{ + struct ublksrv_io_cmd *ub_cmd = (struct ublksrv_io_cmd *)cmd->cmd; + struct ublk_device *ub = cmd->file->private_data; + struct ublk_queue *ubq; + struct ublk_io *io; + u32 cmd_op = cmd->cmd_op; + unsigned tag = ub_cmd->tag; + int ret = -EINVAL; + + pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n", + __func__, cmd->cmd_op, ub_cmd->q_id, tag, + ub_cmd->result); + + if (!(issue_flags & IO_URING_F_SQE128)) + goto out; + + if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) + goto out; + + ubq = ublk_get_queue(ub, ub_cmd->q_id); + if (!ubq || ub_cmd->q_id != ubq->q_id) + goto out; + + if (ubq->ubq_daemon && ubq->ubq_daemon != current) + goto out; + + if (tag >= ubq->q_depth) + goto out; + + io = &ubq->ios[tag]; + + /* there is pending io cmd, something must be wrong */ + if (io->flags & UBLK_IO_FLAG_ACTIVE) { + ret = -EBUSY; + goto out; + } + + switch (cmd_op) { + case UBLK_IO_FETCH_REQ: + /* UBLK_IO_FETCH_REQ is only allowed before queue is setup */ + if (ublk_queue_ready(ubq)) { + ret = -EBUSY; + goto out; + } + /* + * The io is being handled by server, so COMMIT_RQ is expected + * instead of FETCH_REQ + */ + if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV) + goto out; + /* FETCH_RQ has to provide IO buffer */ + if (!ub_cmd->addr) + goto out; + io->cmd = cmd; + io->flags |= UBLK_IO_FLAG_ACTIVE; + io->addr = ub_cmd->addr; + + ublk_mark_io_ready(ub, ubq); + break; + case UBLK_IO_COMMIT_AND_FETCH_REQ: + /* FETCH_RQ has to provide IO buffer */ + if (!ub_cmd->addr) + goto out; + if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)) + goto out; + io->addr = ub_cmd->addr; + io->flags |= UBLK_IO_FLAG_ACTIVE; + io->cmd = cmd; + 
ublk_commit_completion(ub, ub_cmd); + break; + default: + goto out; + } + return -EIOCBQUEUED; + + out: + io_uring_cmd_done(cmd, ret, 0); + pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n", + __func__, cmd_op, tag, ret, io->flags); + return -EIOCBQUEUED; +} + +static const struct file_operations ublk_ch_fops = { + .owner = THIS_MODULE, + .open = ublk_ch_open, + .release = ublk_ch_release, + .llseek = no_llseek, + .uring_cmd = ublk_ch_uring_cmd, + .mmap = ublk_ch_mmap, +}; + +static void ublk_deinit_queue(struct ublk_device *ub, int q_id) +{ + int size = ublk_queue_cmd_buf_size(ub, q_id); + struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + + if (ubq->ubq_daemon) + put_task_struct(ubq->ubq_daemon); + if (ubq->io_cmd_buf) + free_pages((unsigned long)ubq->io_cmd_buf, get_order(size)); +} + +static int ublk_init_queue(struct ublk_device *ub, int q_id) +{ + struct ublk_queue *ubq = ublk_get_queue(ub, q_id); + gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO; + void *ptr; + int size; + + ubq->flags = ub->dev_info.flags; + ubq->q_id = q_id; + ubq->q_depth = ub->dev_info.queue_depth; + size = ublk_queue_cmd_buf_size(ub, q_id); + + ptr = (void *) __get_free_pages(gfp_flags, get_order(size)); + if (!ptr) + return -ENOMEM; + + ubq->io_cmd_buf = ptr; + ubq->dev = ub; + return 0; +} + +static void ublk_deinit_queues(struct ublk_device *ub) +{ + int nr_queues = ub->dev_info.nr_hw_queues; + int i; + + if (!ub->__queues) + return; + + for (i = 0; i < nr_queues; i++) + ublk_deinit_queue(ub, i); + kfree(ub->__queues); +} + +static int ublk_init_queues(struct ublk_device *ub) +{ + int nr_queues = ub->dev_info.nr_hw_queues; + int depth = ub->dev_info.queue_depth; + int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io); + int i, ret = -ENOMEM; + + ub->queue_size = ubq_size; + ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL); + if (!ub->__queues) + return ret; + + for (i = 0; i < nr_queues; i++) { + if (ublk_init_queue(ub, i)) + goto fail; + } + + 
init_completion(&ub->completion); + return 0; + + fail: + ublk_deinit_queues(ub); + return ret; +} + +static int ublk_alloc_dev_number(struct ublk_device *ub, int idx) +{ + int i = idx; + int err; + + spin_lock(&ublk_idr_lock); + /* allocate id, if @id >= 0, we're requesting that specific id */ + if (i >= 0) { + err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT); + if (err == -ENOSPC) + err = -EEXIST; + } else { + err = idr_alloc(&ublk_index_idr, ub, 0, 0, GFP_NOWAIT); + } + spin_unlock(&ublk_idr_lock); + + if (err >= 0) + ub->ub_number = err; + + return err; +} + +static void ublk_free_dev_number(struct ublk_device *ub) +{ + spin_lock(&ublk_idr_lock); + idr_remove(&ublk_index_idr, ub->ub_number); + wake_up_all(&ublk_idr_wq); + spin_unlock(&ublk_idr_lock); +} + +static void ublk_cdev_rel(struct device *dev) +{ + struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev); + + blk_mq_free_tag_set(&ub->tag_set); + ublk_deinit_queues(ub); + ublk_free_dev_number(ub); + mutex_destroy(&ub->mutex); + kfree(ub); +} + +static int ublk_add_chdev(struct ublk_device *ub) +{ + struct device *dev = &ub->cdev_dev; + int minor = ub->ub_number; + int ret; + + dev->parent = ublk_misc.this_device; + dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor); + dev->class = ublk_chr_class; + dev->release = ublk_cdev_rel; + device_initialize(dev); + + ret = dev_set_name(dev, "ublkc%d", minor); + if (ret) + goto fail; + + cdev_init(&ub->cdev, &ublk_ch_fops); + ret = cdev_device_add(&ub->cdev, dev); + if (ret) + goto fail; + return 0; + fail: + put_device(dev); + return ret; +} + +static void ublk_stop_work_fn(struct work_struct *work) +{ + struct ublk_device *ub = + container_of(work, struct ublk_device, stop_work); + + ublk_stop_dev(ub); +} + +/* align maximum I/O size to PAGE_SIZE */ +static void ublk_align_max_io_size(struct ublk_device *ub) +{ + unsigned int max_rq_bytes = ub->dev_info.rq_max_blocks << ub->bs_shift; + + ub->dev_info.rq_max_blocks = + 
round_down(max_rq_bytes, PAGE_SIZE) >> ub->bs_shift; +} + +static int ublk_add_tag_set(struct ublk_device *ub) +{ + ub->tag_set.ops = &ublk_mq_ops; + ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues; + ub->tag_set.queue_depth = ub->dev_info.queue_depth; + ub->tag_set.numa_node = NUMA_NO_NODE; + ub->tag_set.cmd_size = sizeof(struct ublk_rq_data); + ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE; + ub->tag_set.driver_data = ub; + return blk_mq_alloc_tag_set(&ub->tag_set); +} + +static void ublk_remove(struct ublk_device *ub) +{ + ublk_stop_dev(ub); + cancel_work_sync(&ub->stop_work); + cdev_device_del(&ub->cdev, &ub->cdev_dev); + put_device(&ub->cdev_dev); +} + +static struct ublk_device *ublk_get_device_from_id(int idx) +{ + struct ublk_device *ub = NULL; + + if (idx < 0) + return NULL; + + spin_lock(&ublk_idr_lock); + ub = idr_find(&ublk_index_idr, idx); + if (ub) + ub = ublk_get_device(ub); + spin_unlock(&ublk_idr_lock); + + return ub; +} + +static int ublk_ctrl_start_dev(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + int ublksrv_pid = (int)header->data[0]; + unsigned long dev_blocks = header->data[1]; + struct ublk_device *ub; + struct gendisk *disk; + int ret = -EINVAL; + + if (ublksrv_pid <= 0) + return -EINVAL; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return -EINVAL; + + wait_for_completion_interruptible(&ub->completion); + + schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD); + + mutex_lock(&ub->mutex); + if (ub->dev_info.state == UBLK_S_DEV_LIVE || + test_bit(UB_STATE_USED, &ub->state)) { + ret = -EEXIST; + goto out_unlock; + } + + /* We may get disk size updated */ + if (dev_blocks) + ub->dev_info.dev_blocks = dev_blocks; + + disk = blk_mq_alloc_disk(&ub->tag_set, ub); + if (IS_ERR(disk)) { + ret = PTR_ERR(disk); + goto out_unlock; + } + sprintf(disk->disk_name, "ublkb%d", ub->ub_number); + disk->fops = &ub_fops; + disk->private_data = ub; + + 
blk_queue_logical_block_size(disk->queue, ub->dev_info.block_size); + blk_queue_physical_block_size(disk->queue, ub->dev_info.block_size); + blk_queue_io_min(disk->queue, ub->dev_info.block_size); + blk_queue_max_hw_sectors(disk->queue, + ub->dev_info.rq_max_blocks << (ub->bs_shift - 9)); + disk->queue->limits.discard_granularity = PAGE_SIZE; + blk_queue_max_discard_sectors(disk->queue, UINT_MAX >> 9); + blk_queue_max_write_zeroes_sectors(disk->queue, UINT_MAX >> 9); + + set_capacity(disk, ub->dev_info.dev_blocks << (ub->bs_shift - 9)); + + ub->dev_info.ublksrv_pid = ublksrv_pid; + ub->ub_disk = disk; + get_device(&ub->cdev_dev); + ret = add_disk(disk); + if (ret) { + put_disk(disk); + goto out_unlock; + } + set_bit(UB_STATE_USED, &ub->state); + ub->dev_info.state = UBLK_S_DEV_LIVE; +out_unlock: + mutex_unlock(&ub->mutex); + ublk_put_device(ub); + return ret; +} + +static int ublk_ctrl_get_queue_affinity(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + void __user *argp = (void __user *)(unsigned long)header->addr; + struct ublk_device *ub; + cpumask_var_t cpumask; + unsigned long queue; + unsigned int retlen; + unsigned int i; + int ret = -EINVAL; + + if (header->len * BITS_PER_BYTE < nr_cpu_ids) + return -EINVAL; + if (header->len & (sizeof(unsigned long)-1)) + return -EINVAL; + if (!header->addr) + return -EINVAL; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return -EINVAL; + + queue = header->data[0]; + if (queue >= ub->dev_info.nr_hw_queues) + goto out_put_device; + + ret = -ENOMEM; + if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL)) + goto out_put_device; + + for_each_possible_cpu(i) { + if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue) + cpumask_set_cpu(i, cpumask); + } + + ret = -EFAULT; + retlen = min_t(unsigned short, header->len, cpumask_size()); + if (copy_to_user(argp, cpumask, retlen)) + goto out_free_cpumask; + if (retlen != header->len && + clear_user(argp + retlen, 
header->len - retlen)) + goto out_free_cpumask; + + ret = 0; +out_free_cpumask: + free_cpumask_var(cpumask); +out_put_device: + ublk_put_device(ub); + return ret; +} + +static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info) +{ + pr_devel("%s: dev id %d flags %llx\n", __func__, + info->dev_id, info->flags); + pr_devel("\t nr_hw_queues %d queue_depth %d block size %d dev_capacity %lld\n", + info->nr_hw_queues, info->queue_depth, + info->block_size, info->dev_blocks); +} + +static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + void __user *argp = (void __user *)(unsigned long)header->addr; + struct ublksrv_ctrl_dev_info info; + struct ublk_device *ub; + int ret = -EINVAL; + + if (header->len < sizeof(info) || !header->addr) + return -EINVAL; + if (header->queue_id != (u16)-1) { + pr_warn("%s: queue_id is wrong %x\n", + __func__, header->queue_id); + return -EINVAL; + } + if (copy_from_user(&info, argp, sizeof(info))) + return -EFAULT; + ublk_dump_dev_info(&info); + if (header->dev_id != info.dev_id) { + pr_warn("%s: dev id not match %u %u\n", + __func__, header->dev_id, info.dev_id); + return -EINVAL; + } + + ret = mutex_lock_killable(&ublk_ctl_mutex); + if (ret) + return ret; + + ret = -ENOMEM; + ub = kzalloc(sizeof(*ub), GFP_KERNEL); + if (!ub) + goto out_unlock; + mutex_init(&ub->mutex); + spin_lock_init(&ub->mm_lock); + INIT_WORK(&ub->stop_work, ublk_stop_work_fn); + INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work); + + ret = ublk_alloc_dev_number(ub, header->dev_id); + if (ret < 0) + goto out_free_ub; + + memcpy(&ub->dev_info, &info, sizeof(info)); + + /* update device id */ + ub->dev_info.dev_id = ub->ub_number; + + /* + * 64bit flags will be copied back to userspace as feature + * negotiation result, so have to clear flags which driver + * doesn't support yet, then userspace can get correct flags + * (features) to handle. 
+ */ + ub->dev_info.flags &= UBLK_F_ALL; + + /* We are not ready to support zero copy */ + ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY; + + ub->bs_shift = ilog2(ub->dev_info.block_size); + ub->dev_info.nr_hw_queues = min_t(unsigned int, + ub->dev_info.nr_hw_queues, nr_cpu_ids); + ublk_align_max_io_size(ub); + + ret = ublk_init_queues(ub); + if (ret) + goto out_free_dev_number; + + ret = ublk_add_tag_set(ub); + if (ret) + goto out_deinit_queues; + + ret = -EFAULT; + if (copy_to_user(argp, &ub->dev_info, sizeof(info))) + goto out_free_tag_set; + + /* + * Add the char dev so that ublksrv daemon can be setup. + * ublk_add_chdev() will cleanup everything if it fails. + */ + ret = ublk_add_chdev(ub); + goto out_unlock; + +out_free_tag_set: + blk_mq_free_tag_set(&ub->tag_set); +out_deinit_queues: + ublk_deinit_queues(ub); +out_free_dev_number: + ublk_free_dev_number(ub); +out_free_ub: + mutex_destroy(&ub->mutex); + kfree(ub); +out_unlock: + mutex_unlock(&ublk_ctl_mutex); + return ret; +} + +static inline bool ublk_idr_freed(int id) +{ + void *ptr; + + spin_lock(&ublk_idr_lock); + ptr = idr_find(&ublk_index_idr, id); + spin_unlock(&ublk_idr_lock); + + return ptr == NULL; +} + +static int ublk_ctrl_del_dev(int idx) +{ + struct ublk_device *ub; + int ret; + + ret = mutex_lock_killable(&ublk_ctl_mutex); + if (ret) + return ret; + + ub = ublk_get_device_from_id(idx); + if (ub) { + ublk_remove(ub); + ublk_put_device(ub); + ret = 0; + } else { + ret = -ENODEV; + } + + /* + * Wait until the idr is removed, then it can be reused after + * DEL_DEV command is returned. 
+ */ + if (!ret) + wait_event(ublk_idr_wq, ublk_idr_freed(idx)); + mutex_unlock(&ublk_ctl_mutex); + + return ret; +} + +static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + + pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n", + __func__, cmd->cmd_op, header->dev_id, header->queue_id, + header->data[0], header->addr, header->len); +} + +static int ublk_ctrl_stop_dev(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + struct ublk_device *ub; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return -EINVAL; + + ublk_stop_dev(ub); + cancel_work_sync(&ub->stop_work); + + ublk_put_device(ub); + return 0; +} + +static int ublk_ctrl_get_dev_info(struct io_uring_cmd *cmd) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + void __user *argp = (void __user *)(unsigned long)header->addr; + struct ublk_device *ub; + int ret = 0; + + if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr) + return -EINVAL; + + ub = ublk_get_device_from_id(header->dev_id); + if (!ub) + return -EINVAL; + + if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info))) + ret = -EFAULT; + ublk_put_device(ub); + + return ret; +} + +static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd, + unsigned int issue_flags) +{ + struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd; + int ret = -EINVAL; + + ublk_ctrl_cmd_dump(cmd); + + if (!(issue_flags & IO_URING_F_SQE128)) + goto out; + + ret = -EPERM; + if (!capable(CAP_SYS_ADMIN)) + goto out; + + ret = -ENODEV; + switch (cmd->cmd_op) { + case UBLK_CMD_START_DEV: + ret = ublk_ctrl_start_dev(cmd); + break; + case UBLK_CMD_STOP_DEV: + ret = ublk_ctrl_stop_dev(cmd); + break; + case UBLK_CMD_GET_DEV_INFO: + ret = ublk_ctrl_get_dev_info(cmd); + break; + case UBLK_CMD_ADD_DEV: + ret = ublk_ctrl_add_dev(cmd); + break; + case 
UBLK_CMD_DEL_DEV: + ret = ublk_ctrl_del_dev(header->dev_id); + break; + case UBLK_CMD_GET_QUEUE_AFFINITY: + ret = ublk_ctrl_get_queue_affinity(cmd); + break; + default: + break; + } + out: + io_uring_cmd_done(cmd, ret, 0); + pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n", + __func__, ret, cmd->cmd_op, header->dev_id, header->queue_id); + return -EIOCBQUEUED; +} + +static const struct file_operations ublk_ctl_fops = { + .open = nonseekable_open, + .uring_cmd = ublk_ctrl_uring_cmd, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice ublk_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "ublk-control", + .fops = &ublk_ctl_fops, +}; + +static int __init ublk_init(void) +{ + int ret; + + init_waitqueue_head(&ublk_idr_wq); + + ret = misc_register(&ublk_misc); + if (ret) + return ret; + + ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char"); + if (ret) + goto unregister_mis; + + ublk_chr_class = class_create(THIS_MODULE, "ublk-char"); + if (IS_ERR(ublk_chr_class)) { + ret = PTR_ERR(ublk_chr_class); + goto free_chrdev_region; + } + return 0; + +free_chrdev_region: + unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS); +unregister_mis: + misc_deregister(&ublk_misc); + return ret; +} + +static void __exit ublk_exit(void) +{ + struct ublk_device *ub; + int id; + + class_destroy(ublk_chr_class); + + misc_deregister(&ublk_misc); + + idr_for_each_entry(&ublk_index_idr, ub, id) + ublk_remove(ub); + + idr_destroy(&ublk_index_idr); + unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS); +} + +module_init(ublk_init); +module_exit(ublk_exit); + +MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>"); +MODULE_LICENSE("GPL"); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6fc7850c2b0a..d7d72e8f6e55 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -1089,7 +1089,7 @@ static int virtblk_probe(struct virtio_device *vdev) return 0; out_cleanup_disk: - blk_cleanup_disk(vblk->disk); 
+ put_disk(vblk->disk); out_free_tags: blk_mq_free_tag_set(&vblk->tag_set); out_free_vq: @@ -1111,7 +1111,6 @@ static void virtblk_remove(struct virtio_device *vdev) flush_work(&vblk->config_work); del_gendisk(vblk->disk); - blk_cleanup_queue(vblk->disk->queue); blk_mq_free_tag_set(&vblk->tag_set); mutex_lock(&vblk->vdev_mutex); diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index a97f2bf5b01b..a5cf7f1e871c 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -442,7 +442,7 @@ static void free_req(struct xen_blkif_ring *ring, struct pending_req *req) * Routines for managing virtual block devices (vbds). */ static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif, - int operation) + enum req_op operation) { struct xen_vbd *vbd = &blkif->vbd; int rc = -EACCES; @@ -1193,8 +1193,8 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, struct bio *bio = NULL; struct bio **biolist = pending_req->biolist; int i, nbio = 0; - int operation; - int operation_flags = 0; + enum req_op operation; + blk_opf_t operation_flags = 0; struct blk_plug plug; bool drain = false; struct grant_page **pages = pending_req->segments; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 3646c0cae672..dc48298225a6 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -2397,7 +2397,7 @@ static void blkfront_connect(struct blkfront_info *info) err = device_add_disk(&info->xbdev->dev, info->gd, NULL); if (err) { - blk_cleanup_disk(info->gd); + put_disk(info->gd); blk_mq_free_tag_set(&info->tag_set); info->rq = NULL; goto fail; @@ -2482,7 +2482,7 @@ static int blkfront_remove(struct xenbus_device *xbdev) blkif_free(info, 0); if (info->gd) { xlbd_release_minors(info->gd->first_minor, info->gd->minors); - blk_cleanup_disk(info->gd); + put_disk(info->gd); blk_mq_free_tag_set(&info->tag_set); } diff --git a/drivers/block/z2ram.c 
b/drivers/block/z2ram.c index 7a6ed83481b8..c1e85f356e4d 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -337,7 +337,7 @@ static int z2ram_register_disk(int minor) z2ram_gendisk[minor] = disk; err = add_disk(disk); if (err) - blk_cleanup_disk(disk); + put_disk(disk); return err; } @@ -384,7 +384,6 @@ static void __exit z2_exit(void) for (i = 0; i < Z2MINOR_COUNT; i++) { del_gendisk(z2ram_gendisk[i]); - blk_cleanup_queue(z2ram_gendisk[i]->queue); put_disk(z2ram_gendisk[i]); } blk_mq_free_tag_set(&tag_set); diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index b8549c61ff2c..4abeb261b833 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1523,7 +1523,7 @@ static void zram_bio_discard(struct zram *zram, u32 index, * Returns 1 if IO request was successfully submitted. */ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, unsigned int op, struct bio *bio) + int offset, enum req_op op, struct bio *bio) { int ret; @@ -1631,7 +1631,7 @@ static void zram_slot_free_notify(struct block_device *bdev, } static int zram_rw_page(struct block_device *bdev, sector_t sector, - struct page *page, unsigned int op) + struct page *page, enum req_op op) { int offset, ret; u32 index; @@ -1957,7 +1957,7 @@ static int zram_add(void) return device_id; out_cleanup_disk: - blk_cleanup_disk(zram->disk); + put_disk(zram->disk); out_free_idr: idr_remove(&zram_index_idr, device_id); out_free_dev: @@ -2008,7 +2008,7 @@ static int zram_remove(struct zram *zram) */ zram_reset_device(zram); - blk_cleanup_disk(zram->disk); + put_disk(zram->disk); kfree(zram); return 0; } |