summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2022-12-13 19:43:59 +0100
committerLinus Torvalds <torvalds@linux-foundation.org>2022-12-13 19:43:59 +0100
commitce8a79d5601aab94c02ed4539c48e8605422ac94 (patch)
tree7830a97a475d57284640c8e2d3516521722708b6
parentMerge tag 'for-6.2/io_uring-next-2022-12-08' of git://git.kernel.dk/linux (diff)
parentblktrace: Fix output non-blktrace event when blk_classic option enabled (diff)
downloadlinux-ce8a79d5601aab94c02ed4539c48e8605422ac94.tar.xz
linux-ce8a79d5601aab94c02ed4539c48e8605422ac94.zip
Merge tag 'for-6.2/block-2022-12-08' of git://git.kernel.dk/linux
Pull block updates from Jens Axboe: - NVMe pull requests via Christoph: - Support some passthrough commands without CAP_SYS_ADMIN (Kanchan Joshi) - Refactor PCIe probing and reset (Christoph Hellwig) - Various fabrics authentication fixes and improvements (Sagi Grimberg) - Avoid fallback to sequential scan due to transient issues (Uday Shankar) - Implement support for the DEAC bit in Write Zeroes (Christoph Hellwig) - Allow overriding the IEEE OUI and firmware revision in configfs for nvmet (Aleksandr Miloserdov) - Force reconnect when number of queue changes in nvmet (Daniel Wagner) - Minor fixes and improvements (Uros Bizjak, Joel Granados, Sagi Grimberg, Christoph Hellwig, Christophe JAILLET) - Fix and cleanup nvme-fc req allocation (Chaitanya Kulkarni) - Use the common tagset helpers in nvme-pci driver (Christoph Hellwig) - Cleanup the nvme-pci removal path (Christoph Hellwig) - Use kstrtobool() instead of strtobool (Christophe JAILLET) - Allow unprivileged passthrough of Identify Controller (Joel Granados) - Support io stats on the mpath device (Sagi Grimberg) - Minor nvmet cleanup (Sagi Grimberg) - MD pull requests via Song: - Code cleanups (Christoph) - Various fixes - Floppy pull request from Denis: - Fix a memory leak in the init error path (Yuan) - Series fixing some batch wakeup issues with sbitmap (Gabriel) - Removal of the pktcdvd driver that was deprecated more than 5 years ago, and subsequent removal of the devnode callback in struct block_device_operations as no users are now left (Greg) - Fix for partition read on an exclusively opened bdev (Jan) - Series of elevator API cleanups (Jinlong, Christoph) - Series of fixes and cleanups for blk-iocost (Kemeng) - Series of fixes and cleanups for blk-throttle (Kemeng) - Series adding concurrent support for sync queues in BFQ (Yu) - Series bringing drbd a bit closer to the out-of-tree maintained version (Christian, Joel, Lars, Philipp) - Misc drbd fixes (Wang) - blk-wbt fixes and tweaks for enable/disable (Yu) - Fixes for mq-deadline for zoned devices (Damien) - Add support for read-only and offline zones for null_blk (Shin'ichiro) - Series fixing the delayed holder tracking, as used by DM (Yu, Christoph) - Series enabling bio alloc caching for IRQ based IO (Pavel) - Series enabling userspace peer-to-peer DMA (Logan) - BFQ waker fixes (Khazhismel) - Series fixing elevator refcount issues (Christoph, Jinlong) - Series cleaning up references around queue destruction (Christoph) - Series doing quiesce by tagset, enabling cleanups in drivers (Christoph, Chao) - Series untangling the queue kobject and queue references (Christoph) - Misc fixes and cleanups (Bart, David, Dawei, Jinlong, Kemeng, Ye, Yang, Waiman, Shin'ichiro, Randy, Pankaj, Christoph) * tag 'for-6.2/block-2022-12-08' of git://git.kernel.dk/linux: (247 commits) blktrace: Fix output non-blktrace event when blk_classic option enabled block: sed-opal: Don't include <linux/kernel.h> sed-opal: allow using IOC_OPAL_SAVE for locking too blk-cgroup: Fix typo in comment block: remove bio_set_op_attrs nvmet: don't open-code NVME_NS_ATTR_RO enumeration nvme-pci: use the tagset alloc/free helpers nvme: add the Apple shared tag workaround to nvme_alloc_io_tag_set nvme: only set reserved_tags in nvme_alloc_io_tag_set for fabrics controllers nvme: consolidate setting the tagset flags nvme: pass nr_maps explicitly to nvme_alloc_io_tag_set block: bio_copy_data_iter nvme-pci: split out a nvme_pci_ctrl_is_dead helper nvme-pci: return early on ctrl state mismatch in nvme_reset_work nvme-pci: rename nvme_disable_io_queues nvme-pci: cleanup nvme_suspend_queue nvme-pci: remove nvme_pci_disable nvme-pci: remove nvme_disable_admin_queue nvme: merge nvme_shutdown_ctrl into nvme_disable_ctrl nvme: use nvme_wait_ready in nvme_shutdown_ctrl ...
-rw-r--r--Documentation/ABI/testing/debugfs-pktcdvd18
-rw-r--r--Documentation/ABI/testing/sysfs-bus-pci10
-rw-r--r--Documentation/ABI/testing/sysfs-class-pktcdvd97
-rw-r--r--Documentation/block/inline-encryption.rst12
-rw-r--r--MAINTAINERS7
-rw-r--r--block/bdev.c4
-rw-r--r--block/bfq-cgroup.c12
-rw-r--r--block/bfq-iosched.c102
-rw-r--r--block/bfq-iosched.h32
-rw-r--r--block/bfq-wf2q.c157
-rw-r--r--block/bio.c146
-rw-r--r--block/blk-cgroup.c94
-rw-r--r--block/blk-cgroup.h10
-rw-r--r--block/blk-core.c83
-rw-r--r--block/blk-crypto-internal.h22
-rw-r--r--block/blk-crypto-profile.c1
-rw-r--r--block/blk-crypto-sysfs.c11
-rw-r--r--block/blk-crypto.c37
-rw-r--r--block/blk-ia-ranges.c3
-rw-r--r--block/blk-iocost.c57
-rw-r--r--block/blk-iolatency.c37
-rw-r--r--block/blk-map.c14
-rw-r--r--block/blk-merge.c44
-rw-r--r--block/blk-mq-sched.c8
-rw-r--r--block/blk-mq-sysfs.c11
-rw-r--r--block/blk-mq.c229
-rw-r--r--block/blk-mq.h14
-rw-r--r--block/blk-settings.c6
-rw-r--r--block/blk-sysfs.c137
-rw-r--r--block/blk-throttle.c102
-rw-r--r--block/blk-wbt.c26
-rw-r--r--block/blk-wbt.h17
-rw-r--r--block/blk.h27
-rw-r--r--block/bsg-lib.c2
-rw-r--r--block/bsg.c11
-rw-r--r--block/elevator.c254
-rw-r--r--block/elevator.h20
-rw-r--r--block/fops.c7
-rw-r--r--block/genhd.c35
-rw-r--r--block/holder.c103
-rw-r--r--block/ioctl.c12
-rw-r--r--block/mq-deadline.c83
-rw-r--r--block/sed-opal.c39
-rw-r--r--drivers/block/Kconfig43
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/block/drbd/Kconfig2
-rw-r--r--drivers/block/drbd/Makefile2
-rw-r--r--drivers/block/drbd/drbd_actlog.c8
-rw-r--r--drivers/block/drbd/drbd_bitmap.c62
-rw-r--r--drivers/block/drbd/drbd_debugfs.c2
-rw-r--r--drivers/block/drbd/drbd_debugfs.h2
-rw-r--r--drivers/block/drbd/drbd_int.h78
-rw-r--r--drivers/block/drbd/drbd_interval.c2
-rw-r--r--drivers/block/drbd/drbd_interval.h2
-rw-r--r--drivers/block/drbd/drbd_main.c21
-rw-r--r--drivers/block/drbd/drbd_nl.c27
-rw-r--r--drivers/block/drbd/drbd_nla.c2
-rw-r--r--drivers/block/drbd/drbd_nla.h2
-rw-r--r--drivers/block/drbd/drbd_polymorph_printk.h141
-rw-r--r--drivers/block/drbd/drbd_proc.c2
-rw-r--r--drivers/block/drbd/drbd_protocol.h2
-rw-r--r--drivers/block/drbd/drbd_receiver.c99
-rw-r--r--drivers/block/drbd/drbd_req.c8
-rw-r--r--drivers/block/drbd/drbd_req.h2
-rw-r--r--drivers/block/drbd/drbd_state.c2
-rw-r--r--drivers/block/drbd/drbd_state.h2
-rw-r--r--drivers/block/drbd/drbd_state_change.h2
-rw-r--r--drivers/block/drbd/drbd_strings.c2
-rw-r--r--drivers/block/drbd/drbd_strings.h2
-rw-r--r--drivers/block/drbd/drbd_vli.h2
-rw-r--r--drivers/block/drbd/drbd_worker.c18
-rw-r--r--drivers/block/floppy.c4
-rw-r--r--drivers/block/null_blk/main.c22
-rw-r--r--drivers/block/null_blk/null_blk.h8
-rw-r--r--drivers/block/null_blk/zoned.c95
-rw-r--r--drivers/block/pktcdvd.c2944
-rw-r--r--drivers/block/virtio_blk.c8
-rw-r--r--drivers/block/xen-blkfront.c1
-rw-r--r--drivers/md/bcache/movinggc.c2
-rw-r--r--drivers/md/bcache/request.c2
-rw-r--r--drivers/md/bcache/writeback.c4
-rw-r--r--drivers/md/dm-table.c2
-rw-r--r--drivers/md/dm-thin.c2
-rw-r--r--drivers/md/dm.c138
-rw-r--r--drivers/md/md-bitmap.c47
-rw-r--r--drivers/md/md.c323
-rw-r--r--drivers/md/md.h1
-rw-r--r--drivers/md/raid0.c1
-rw-r--r--drivers/md/raid1.c13
-rw-r--r--drivers/md/raid10.c20
-rw-r--r--drivers/md/raid5-cache.c10
-rw-r--r--drivers/md/raid5-ppl.c5
-rw-r--r--drivers/nvme/host/apple.c30
-rw-r--r--drivers/nvme/host/auth.c258
-rw-r--r--drivers/nvme/host/core.c319
-rw-r--r--drivers/nvme/host/fc.c59
-rw-r--r--drivers/nvme/host/ioctl.c118
-rw-r--r--drivers/nvme/host/multipath.c26
-rw-r--r--drivers/nvme/host/nvme.h69
-rw-r--r--drivers/nvme/host/pci.c606
-rw-r--r--drivers/nvme/host/rdma.c42
-rw-r--r--drivers/nvme/host/tcp.c45
-rw-r--r--drivers/nvme/target/admin-cmd.c11
-rw-r--r--drivers/nvme/target/configfs.c138
-rw-r--r--drivers/nvme/target/core.c44
-rw-r--r--drivers/nvme/target/io-cmd-file.c16
-rw-r--r--drivers/nvme/target/loop.c16
-rw-r--r--drivers/nvme/target/nvmet.h6
-rw-r--r--drivers/pci/p2pdma.c124
-rw-r--r--drivers/scsi/scsi_lib.c2
-rw-r--r--drivers/scsi/scsi_scan.c1
-rw-r--r--drivers/ufs/core/ufshcd.c2
-rw-r--r--fs/crypto/inline_crypt.c14
-rw-r--r--include/linux/bio.h2
-rw-r--r--include/linux/blk-crypto-profile.h12
-rw-r--r--include/linux/blk-crypto.h13
-rw-r--r--include/linux/blk-mq.h9
-rw-r--r--include/linux/blk_types.h7
-rw-r--r--include/linux/blkdev.h32
-rw-r--r--include/linux/lru_cache.h3
-rw-r--r--include/linux/mempool.h5
-rw-r--r--include/linux/mm.h3
-rw-r--r--include/linux/mmzone.h24
-rw-r--r--include/linux/nvme.h2
-rw-r--r--include/linux/pktcdvd.h197
-rw-r--r--include/linux/raid/pq.h8
-rw-r--r--include/linux/sbitmap.h16
-rw-r--r--include/linux/sed-opal.h3
-rw-r--r--include/linux/uio.h6
-rw-r--r--include/linux/wait.h2
-rw-r--r--include/trace/events/iocost.h4
-rw-r--r--include/uapi/linux/pktcdvd.h112
-rw-r--r--include/uapi/linux/sed-opal.h8
-rw-r--r--io_uring/rw.c3
-rw-r--r--kernel/sched/wait.c18
-rw-r--r--kernel/trace/blktrace.c7
-rw-r--r--lib/iov_iter.c32
-rw-r--r--lib/lru_cache.c59
-rw-r--r--lib/raid6/algos.c2
-rw-r--r--lib/sbitmap.c144
-rw-r--r--lib/scatterlist.c25
-rw-r--r--mm/gup.c45
-rw-r--r--mm/huge_memory.c19
-rw-r--r--mm/hugetlb.c23
144 files changed, 3229 insertions, 5907 deletions
diff --git a/Documentation/ABI/testing/debugfs-pktcdvd b/Documentation/ABI/testing/debugfs-pktcdvd
deleted file mode 100644
index f6f65a4faea0..000000000000
--- a/Documentation/ABI/testing/debugfs-pktcdvd
+++ /dev/null
@@ -1,18 +0,0 @@
-What: /sys/kernel/debug/pktcdvd/pktcdvd[0-7]
-Date: Oct. 2006
-KernelVersion: 2.6.20
-Contact: Thomas Maier <balagi@justmail.de>
-Description:
-
-The pktcdvd module (packet writing driver) creates
-these files in debugfs:
-
-/sys/kernel/debug/pktcdvd/pktcdvd[0-7]/
-
- ==== ====== ====================================
- info 0444 Lots of driver statistics and infos.
- ==== ====== ====================================
-
-Example::
-
- cat /sys/kernel/debug/pktcdvd/pktcdvd0/info
diff --git a/Documentation/ABI/testing/sysfs-bus-pci b/Documentation/ABI/testing/sysfs-bus-pci
index 840727fc75dc..ecf47559f495 100644
--- a/Documentation/ABI/testing/sysfs-bus-pci
+++ b/Documentation/ABI/testing/sysfs-bus-pci
@@ -407,6 +407,16 @@ Description:
file contains a '1' if the memory has been published for
use outside the driver that owns the device.
+What: /sys/bus/pci/devices/.../p2pmem/allocate
+Date: August 2022
+Contact: Logan Gunthorpe <logang@deltatee.com>
+Description:
+ This file allows mapping p2pmem into userspace. For each
+ mmap() call on this file, the kernel will allocate a chunk
+ of Peer-to-Peer memory for use in Peer-to-Peer transactions.
+ This memory can be used in O_DIRECT calls to NVMe backed
+ files for Peer-to-Peer copies.
+
What: /sys/bus/pci/devices/.../link/clkpm
/sys/bus/pci/devices/.../link/l0s_aspm
/sys/bus/pci/devices/.../link/l1_aspm
diff --git a/Documentation/ABI/testing/sysfs-class-pktcdvd b/Documentation/ABI/testing/sysfs-class-pktcdvd
deleted file mode 100644
index ba1ce626591d..000000000000
--- a/Documentation/ABI/testing/sysfs-class-pktcdvd
+++ /dev/null
@@ -1,97 +0,0 @@
-sysfs interface
----------------
-The pktcdvd module (packet writing driver) creates the following files in the
-sysfs: (<devid> is in the format major:minor)
-
-What: /sys/class/pktcdvd/add
-What: /sys/class/pktcdvd/remove
-What: /sys/class/pktcdvd/device_map
-Date: Oct. 2006
-KernelVersion: 2.6.20
-Contact: Thomas Maier <balagi@justmail.de>
-Description:
-
- ========== ==============================================
- add (WO) Write a block device id (major:minor) to
- create a new pktcdvd device and map it to the
- block device.
-
- remove (WO) Write the pktcdvd device id (major:minor)
- to remove the pktcdvd device.
-
- device_map (RO) Shows the device mapping in format:
- pktcdvd[0-7] <pktdevid> <blkdevid>
- ========== ==============================================
-
-
-What: /sys/class/pktcdvd/pktcdvd[0-7]/dev
-What: /sys/class/pktcdvd/pktcdvd[0-7]/uevent
-Date: Oct. 2006
-KernelVersion: 2.6.20
-Contact: Thomas Maier <balagi@justmail.de>
-Description:
- dev: (RO) Device id
-
- uevent: (WO) To send a uevent
-
-
-What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/packets_started
-What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/packets_finished
-What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_written
-What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_read
-What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/kb_read_gather
-What: /sys/class/pktcdvd/pktcdvd[0-7]/stat/reset
-Date: Oct. 2006
-KernelVersion: 2.6.20
-Contact: Thomas Maier <balagi@justmail.de>
-Description:
- packets_started: (RO) Number of started packets.
-
- packets_finished: (RO) Number of finished packets.
-
- kb_written: (RO) kBytes written.
-
- kb_read: (RO) kBytes read.
-
- kb_read_gather: (RO) kBytes read to fill write packets.
-
- reset: (WO) Write any value to it to reset
- pktcdvd device statistic values, like
- bytes read/written.
-
-
-What: /sys/class/pktcdvd/pktcdvd[0-7]/write_queue/size
-What: /sys/class/pktcdvd/pktcdvd[0-7]/write_queue/congestion_off
-What: /sys/class/pktcdvd/pktcdvd[0-7]/write_queue/congestion_on
-Date: Oct. 2006
-KernelVersion: 2.6.20
-Contact: Thomas Maier <balagi@justmail.de>
-Description:
- ============== ================================================
- size (RO) Contains the size of the bio write queue.
-
- congestion_off (RW) If bio write queue size is below this mark,
- accept new bio requests from the block layer.
-
- congestion_on (RW) If bio write queue size is higher as this
- mark, do no longer accept bio write requests
- from the block layer and wait till the pktcdvd
- device has processed enough bio's so that bio
- write queue size is below congestion off mark.
- A value of <= 0 disables congestion control.
- ============== ================================================
-
-
-Example:
---------
-To use the pktcdvd sysfs interface directly, you can do::
-
- # create a new pktcdvd device mapped to /dev/hdc
- echo "22:0" >/sys/class/pktcdvd/add
- cat /sys/class/pktcdvd/device_map
- # assuming device pktcdvd0 was created, look at stat's
- cat /sys/class/pktcdvd/pktcdvd0/stat/kb_written
- # print the device id of the mapped block device
- fgrep pktcdvd0 /sys/class/pktcdvd/device_map
- # remove device, using pktcdvd0 device id 253:0
- echo "253:0" >/sys/class/pktcdvd/remove
diff --git a/Documentation/block/inline-encryption.rst b/Documentation/block/inline-encryption.rst
index 4d151fbe2058..f9bf18ea6509 100644
--- a/Documentation/block/inline-encryption.rst
+++ b/Documentation/block/inline-encryption.rst
@@ -142,7 +142,7 @@ Therefore, we also introduce *blk-crypto-fallback*, which is an implementation
of inline encryption using the kernel crypto API. blk-crypto-fallback is built
into the block layer, so it works on any block device without any special setup.
Essentially, when a bio with an encryption context is submitted to a
-request_queue that doesn't support that encryption context, the block layer will
+block_device that doesn't support that encryption context, the block layer will
handle en/decryption of the bio using blk-crypto-fallback.
For encryption, the data cannot be encrypted in-place, as callers usually rely
@@ -187,7 +187,7 @@ API presented to users of the block layer
``blk_crypto_config_supported()`` allows users to check ahead of time whether
inline encryption with particular crypto settings will work on a particular
-request_queue -- either via hardware or via blk-crypto-fallback. This function
+block_device -- either via hardware or via blk-crypto-fallback. This function
takes in a ``struct blk_crypto_config`` which is like blk_crypto_key, but omits
the actual bytes of the key and instead just contains the algorithm, data unit
size, etc. This function can be useful if blk-crypto-fallback is disabled.
@@ -195,7 +195,7 @@ size, etc. This function can be useful if blk-crypto-fallback is disabled.
``blk_crypto_init_key()`` allows users to initialize a blk_crypto_key.
Users must call ``blk_crypto_start_using_key()`` before actually starting to use
-a blk_crypto_key on a request_queue (even if ``blk_crypto_config_supported()``
+a blk_crypto_key on a block_device (even if ``blk_crypto_config_supported()``
was called earlier). This is needed to initialize blk-crypto-fallback if it
will be needed. This must not be called from the data path, as this may have to
allocate resources, which may deadlock in that case.
@@ -207,7 +207,7 @@ for en/decryption. Users don't need to worry about freeing the bio_crypt_ctx
later, as that happens automatically when the bio is freed or reset.
Finally, when done using inline encryption with a blk_crypto_key on a
-request_queue, users must call ``blk_crypto_evict_key()``. This ensures that
+block_device, users must call ``blk_crypto_evict_key()``. This ensures that
the key is evicted from all keyslots it may be programmed into and unlinked from
any kernel data structures it may be linked into.
@@ -221,9 +221,9 @@ as follows:
5. ``blk_crypto_evict_key()`` (after all I/O has completed)
6. Zeroize the blk_crypto_key (this has no dedicated function)
-If a blk_crypto_key is being used on multiple request_queues, then
+If a blk_crypto_key is being used on multiple block_devices, then
``blk_crypto_config_supported()`` (if used), ``blk_crypto_start_using_key()``,
-and ``blk_crypto_evict_key()`` must be called on each request_queue.
+and ``blk_crypto_evict_key()`` must be called on each block_device.
API presented to device drivers
===============================
diff --git a/MAINTAINERS b/MAINTAINERS
index a8c8f6b42436..3b335ae0f9db 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -16430,13 +16430,6 @@ S: Supported
F: Documentation/devicetree/bindings/input/pine64,pinephone-keyboard.yaml
F: drivers/input/keyboard/pinephone-keyboard.c
-PKTCDVD DRIVER
-M: linux-block@vger.kernel.org
-S: Orphan
-F: drivers/block/pktcdvd.c
-F: include/linux/pktcdvd.h
-F: include/uapi/linux/pktcdvd.h
-
PLANTOWER PMS7003 AIR POLLUTION SENSOR DRIVER
M: Tomasz Duszynski <tduszyns@gmail.com>
S: Maintained
diff --git a/block/bdev.c b/block/bdev.c
index d699ecdb3260..edc110d90df4 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -224,7 +224,7 @@ int fsync_bdev(struct block_device *bdev)
EXPORT_SYMBOL(fsync_bdev);
/**
- * freeze_bdev -- lock a filesystem and force it into a consistent state
+ * freeze_bdev - lock a filesystem and force it into a consistent state
* @bdev: blockdevice to lock
*
* If a superblock is found on this device, we take the s_umount semaphore
@@ -268,7 +268,7 @@ done:
EXPORT_SYMBOL(freeze_bdev);
/**
- * thaw_bdev -- unlock filesystem
+ * thaw_bdev - unlock filesystem
* @bdev: blockdevice to unlock
*
* Unlocks the filesystem and marks it writeable again after freeze_bdev().
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 7d624a3a3f0f..627476bc6495 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -224,7 +224,7 @@ void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
{
blkg_rwstat_add(&bfqg->stats.queued, opf, 1);
bfqg_stats_end_empty_time(&bfqg->stats);
- if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
+ if (!(bfqq == bfqg->bfqd->in_service_queue))
bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
}
@@ -552,6 +552,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd)
*/
bfqg->bfqd = bfqd;
bfqg->active_entities = 0;
+ bfqg->num_queues_with_pending_reqs = 0;
bfqg->online = true;
bfqg->rq_pos_tree = RB_ROOT;
}
@@ -645,6 +646,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
{
struct bfq_entity *entity = &bfqq->entity;
struct bfq_group *old_parent = bfqq_group(bfqq);
+ bool has_pending_reqs = false;
/*
* No point to move bfqq to the same group, which can happen when
@@ -665,6 +667,11 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
*/
bfqq->ref++;
+ if (entity->in_groups_with_pending_reqs) {
+ has_pending_reqs = true;
+ bfq_del_bfqq_in_groups_with_pending_reqs(bfqq);
+ }
+
/* If bfqq is empty, then bfq_bfqq_expire also invokes
* bfq_del_bfqq_busy, thereby removing bfqq and its entity
* from data structures related to current group. Otherwise we
@@ -692,6 +699,9 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
/* pin down bfqg and its associated blkg */
bfqg_and_blkg_get(bfqg);
+ if (has_pending_reqs)
+ bfq_add_bfqq_in_groups_with_pending_reqs(bfqq);
+
if (bfq_bfqq_busy(bfqq)) {
if (unlikely(!bfqd->nonrot_with_queueing))
bfq_pos_tree_add_move(bfqd, bfqq);
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 7ea427817f7f..a72304c728fc 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -820,7 +820,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
* much easier to maintain the needed state:
* 1) all active queues have the same weight,
* 2) all active queues belong to the same I/O-priority class,
- * 3) there are no active groups.
+ * 3) there is at most one active group.
* In particular, the last condition is always true if hierarchical
* support or the cgroups interface are not enabled, thus no state
* needs to be maintained in this case.
@@ -852,7 +852,7 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
return varied_queue_weights || multiple_classes_busy
#ifdef CONFIG_BFQ_GROUP_IOSCHED
- || bfqd->num_groups_with_pending_reqs > 0
+ || bfqd->num_groups_with_pending_reqs > 1
#endif
;
}
@@ -870,9 +870,9 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
* In most scenarios, the rate at which nodes are created/destroyed
* should be low too.
*/
-void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- struct rb_root_cached *root)
+void bfq_weights_tree_add(struct bfq_queue *bfqq)
{
+ struct rb_root_cached *root = &bfqq->bfqd->queue_weights_tree;
struct bfq_entity *entity = &bfqq->entity;
struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
bool leftmost = true;
@@ -944,13 +944,14 @@ inc_counter:
* See the comments to the function bfq_weights_tree_add() for considerations
* about overhead.
*/
-void __bfq_weights_tree_remove(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- struct rb_root_cached *root)
+void bfq_weights_tree_remove(struct bfq_queue *bfqq)
{
+ struct rb_root_cached *root;
+
if (!bfqq->weight_counter)
return;
+ root = &bfqq->bfqd->queue_weights_tree;
bfqq->weight_counter->num_active--;
if (bfqq->weight_counter->num_active > 0)
goto reset_entity_pointer;
@@ -964,59 +965,6 @@ reset_entity_pointer:
}
/*
- * Invoke __bfq_weights_tree_remove on bfqq and decrement the number
- * of active groups for each queue's inactive parent entity.
- */
-void bfq_weights_tree_remove(struct bfq_data *bfqd,
- struct bfq_queue *bfqq)
-{
- struct bfq_entity *entity = bfqq->entity.parent;
-
- for_each_entity(entity) {
- struct bfq_sched_data *sd = entity->my_sched_data;
-
- if (sd->next_in_service || sd->in_service_entity) {
- /*
- * entity is still active, because either
- * next_in_service or in_service_entity is not
- * NULL (see the comments on the definition of
- * next_in_service for details on why
- * in_service_entity must be checked too).
- *
- * As a consequence, its parent entities are
- * active as well, and thus this loop must
- * stop here.
- */
- break;
- }
-
- /*
- * The decrement of num_groups_with_pending_reqs is
- * not performed immediately upon the deactivation of
- * entity, but it is delayed to when it also happens
- * that the first leaf descendant bfqq of entity gets
- * all its pending requests completed. The following
- * instructions perform this delayed decrement, if
- * needed. See the comments on
- * num_groups_with_pending_reqs for details.
- */
- if (entity->in_groups_with_pending_reqs) {
- entity->in_groups_with_pending_reqs = false;
- bfqd->num_groups_with_pending_reqs--;
- }
- }
-
- /*
- * Next function is invoked last, because it causes bfqq to be
- * freed if the following holds: bfqq is not in service and
- * has no dispatched request. DO NOT use bfqq after the next
- * function invocation.
- */
- __bfq_weights_tree_remove(bfqd, bfqq,
- &bfqd->queue_weights_tree);
-}
-
-/*
* Return expired entry, or NULL to just start from scratch in rbtree.
*/
static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
@@ -2135,7 +2083,9 @@ static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (!bfqd->last_completed_rq_bfqq ||
bfqd->last_completed_rq_bfqq == bfqq ||
bfq_bfqq_has_short_ttime(bfqq) ||
- now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC)
+ now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC ||
+ bfqd->last_completed_rq_bfqq == &bfqd->oom_bfqq ||
+ bfqq == &bfqd->oom_bfqq)
return;
/*
@@ -2373,22 +2323,6 @@ static sector_t get_sdist(sector_t last_pos, struct request *rq)
return 0;
}
-#if 0 /* Still not clear if we can do without next two functions */
-static void bfq_activate_request(struct request_queue *q, struct request *rq)
-{
- struct bfq_data *bfqd = q->elevator->elevator_data;
-
- bfqd->rq_in_driver++;
-}
-
-static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
-{
- struct bfq_data *bfqd = q->elevator->elevator_data;
-
- bfqd->rq_in_driver--;
-}
-#endif
-
static void bfq_remove_request(struct request_queue *q,
struct request *rq)
{
@@ -6261,7 +6195,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
*/
bfqq->budget_timeout = jiffies;
- bfq_weights_tree_remove(bfqd, bfqq);
+ bfq_del_bfqq_in_groups_with_pending_reqs(bfqq);
+ bfq_weights_tree_remove(bfqq);
}
now_ns = ktime_get_ns();
@@ -6784,6 +6719,12 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
true, is_sync,
NULL);
+ if (unlikely(bfqq == &bfqd->oom_bfqq))
+ bfqq_already_existing = true;
+ } else
+ bfqq_already_existing = true;
+
+ if (!bfqq_already_existing) {
bfqq->waker_bfqq = old_bfqq->waker_bfqq;
bfqq->tentative_waker_bfqq = NULL;
@@ -6797,8 +6738,7 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
if (bfqq->waker_bfqq)
hlist_add_head(&bfqq->woken_list_node,
&bfqq->waker_bfqq->woken_list);
- } else
- bfqq_already_existing = true;
+ }
}
}
@@ -7045,6 +6985,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
#endif
blk_stat_disable_accounting(bfqd->queue);
+ clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags);
wbt_enable_default(bfqd->queue);
kfree(bfqd);
@@ -7190,6 +7131,7 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
/* We dispatch from request queue wide instead of hw queue */
blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
+ set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags);
wbt_disable_default(q);
blk_stat_enable_accounting(q);
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index 71f721670ab6..9fa89577322d 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -492,27 +492,27 @@ struct bfq_data {
struct rb_root_cached queue_weights_tree;
/*
- * Number of groups with at least one descendant process that
+ * Number of groups with at least one process that
* has at least one request waiting for completion. Note that
* this accounts for also requests already dispatched, but not
* yet completed. Therefore this number of groups may differ
* (be larger) than the number of active groups, as a group is
* considered active only if its corresponding entity has
- * descendant queues with at least one request queued. This
+ * queues with at least one request queued. This
* number is used to decide whether a scenario is symmetric.
* For a detailed explanation see comments on the computation
* of the variable asymmetric_scenario in the function
* bfq_better_to_idle().
*
* However, it is hard to compute this number exactly, for
- * groups with multiple descendant processes. Consider a group
- * that is inactive, i.e., that has no descendant process with
+ * groups with multiple processes. Consider a group
+ * that is inactive, i.e., that has no process with
* pending I/O inside BFQ queues. Then suppose that
* num_groups_with_pending_reqs is still accounting for this
- * group, because the group has descendant processes with some
+ * group, because the group has processes with some
* I/O request still in flight. num_groups_with_pending_reqs
* should be decremented when the in-flight request of the
- * last descendant process is finally completed (assuming that
+ * last process is finally completed (assuming that
* nothing else has changed for the group in the meantime, in
* terms of composition of the group and active/inactive state of child
* groups and processes). To accomplish this, an additional
@@ -521,7 +521,7 @@ struct bfq_data {
* we resort to the following tradeoff between simplicity and
* accuracy: for an inactive group that is still counted in
* num_groups_with_pending_reqs, we decrement
- * num_groups_with_pending_reqs when the first descendant
+ * num_groups_with_pending_reqs when the first
* process of the group remains with no request waiting for
* completion.
*
@@ -529,12 +529,12 @@ struct bfq_data {
* carefulness: to avoid multiple decrements, we flag a group,
* more precisely an entity representing a group, as still
* counted in num_groups_with_pending_reqs when it becomes
- * inactive. Then, when the first descendant queue of the
+ * inactive. Then, when the first queue of the
* entity remains with no request waiting for completion,
* num_groups_with_pending_reqs is decremented, and this flag
* is reset. After this flag is reset for the entity,
* num_groups_with_pending_reqs won't be decremented any
- * longer in case a new descendant queue of the entity remains
+ * longer in case a new queue of the entity remains
* with no request waiting for completion.
*/
unsigned int num_groups_with_pending_reqs;
@@ -931,7 +931,7 @@ struct bfq_group {
struct bfq_entity entity;
struct bfq_sched_data sched_data;
- void *bfqd;
+ struct bfq_data *bfqd;
struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS];
struct bfq_queue *async_idle_bfqq;
@@ -939,6 +939,7 @@ struct bfq_group {
struct bfq_entity *my_entity;
int active_entities;
+ int num_queues_with_pending_reqs;
struct rb_root rq_pos_tree;
@@ -968,13 +969,8 @@ struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync);
void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
-void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- struct rb_root_cached *root);
-void __bfq_weights_tree_remove(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- struct rb_root_cached *root);
-void bfq_weights_tree_remove(struct bfq_data *bfqd,
- struct bfq_queue *bfqq);
+void bfq_weights_tree_add(struct bfq_queue *bfqq);
+void bfq_weights_tree_remove(struct bfq_queue *bfqq);
void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool compensate, enum bfqq_expiration reason);
void bfq_put_queue(struct bfq_queue *bfqq);
@@ -1078,6 +1074,8 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool expiration);
void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration);
void bfq_add_bfqq_busy(struct bfq_queue *bfqq);
+void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
+void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
/* --------------- end of interface of B-WF2Q+ ---------------- */
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index 8fc3da4c23bb..b02b53658ed4 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -218,6 +218,24 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
return false;
}
+static void bfq_inc_active_entities(struct bfq_entity *entity)
+{
+ struct bfq_sched_data *sd = entity->sched_data;
+ struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data);
+
+ if (bfqg != bfqg->bfqd->root_group)
+ bfqg->active_entities++;
+}
+
+static void bfq_dec_active_entities(struct bfq_entity *entity)
+{
+ struct bfq_sched_data *sd = entity->sched_data;
+ struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data);
+
+ if (bfqg != bfqg->bfqd->root_group)
+ bfqg->active_entities--;
+}
+
#else /* CONFIG_BFQ_GROUP_IOSCHED */
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
@@ -230,6 +248,14 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
return true;
}
+static void bfq_inc_active_entities(struct bfq_entity *entity)
+{
+}
+
+static void bfq_dec_active_entities(struct bfq_entity *entity)
+{
+}
+
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
/*
@@ -456,11 +482,6 @@ static void bfq_active_insert(struct bfq_service_tree *st,
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
struct rb_node *node = &entity->rb_node;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- struct bfq_sched_data *sd = NULL;
- struct bfq_group *bfqg = NULL;
- struct bfq_data *bfqd = NULL;
-#endif
bfq_insert(&st->active, entity);
@@ -471,17 +492,10 @@ static void bfq_active_insert(struct bfq_service_tree *st,
bfq_update_active_tree(node);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- sd = entity->sched_data;
- bfqg = container_of(sd, struct bfq_group, sched_data);
- bfqd = (struct bfq_data *)bfqg->bfqd;
-#endif
if (bfqq)
list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- if (bfqg != bfqd->root_group)
- bfqg->active_entities++;
-#endif
+
+ bfq_inc_active_entities(entity);
}
/**
@@ -558,29 +572,16 @@ static void bfq_active_extract(struct bfq_service_tree *st,
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
struct rb_node *node;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- struct bfq_sched_data *sd = NULL;
- struct bfq_group *bfqg = NULL;
- struct bfq_data *bfqd = NULL;
-#endif
node = bfq_find_deepest(&entity->rb_node);
bfq_extract(&st->active, entity);
if (node)
bfq_update_active_tree(node);
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- sd = entity->sched_data;
- bfqg = container_of(sd, struct bfq_group, sched_data);
- bfqd = (struct bfq_data *)bfqg->bfqd;
-#endif
if (bfqq)
list_del(&bfqq->bfqq_list);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- if (bfqg != bfqd->root_group)
- bfqg->active_entities--;
-#endif
+
+ bfq_dec_active_entities(entity);
}
/**
@@ -706,22 +707,6 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
if (entity->prio_changed) {
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
unsigned int prev_weight, new_weight;
- struct bfq_data *bfqd = NULL;
- struct rb_root_cached *root;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- struct bfq_sched_data *sd;
- struct bfq_group *bfqg;
-#endif
-
- if (bfqq)
- bfqd = bfqq->bfqd;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else {
- sd = entity->my_sched_data;
- bfqg = container_of(sd, struct bfq_group, sched_data);
- bfqd = (struct bfq_data *)bfqg->bfqd;
- }
-#endif
/* Matches the smp_wmb() in bfq_group_set_weight. */
smp_rmb();
@@ -770,19 +755,15 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
* queue, remove the entity from its old weight counter (if
* there is a counter associated with the entity).
*/
- if (prev_weight != new_weight && bfqq) {
- root = &bfqd->queue_weights_tree;
- __bfq_weights_tree_remove(bfqd, bfqq, root);
- }
+ if (prev_weight != new_weight && bfqq)
+ bfq_weights_tree_remove(bfqq);
entity->weight = new_weight;
/*
* Add the entity, if it is not a weight-raised queue,
* to the counter associated with its new weight.
*/
- if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) {
- /* If we get here, root has been initialized. */
- bfq_weights_tree_add(bfqd, bfqq, root);
- }
+ if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1)
+ bfq_weights_tree_add(bfqq);
new_st->wsum += entity->weight;
@@ -984,19 +965,6 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
entity->on_st_or_in_serv = true;
}
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */
- struct bfq_group *bfqg =
- container_of(entity, struct bfq_group, entity);
- struct bfq_data *bfqd = bfqg->bfqd;
-
- if (!entity->in_groups_with_pending_reqs) {
- entity->in_groups_with_pending_reqs = true;
- bfqd->num_groups_with_pending_reqs++;
- }
- }
-#endif
-
bfq_update_fin_time_enqueue(entity, st, backshifted);
}
@@ -1082,12 +1050,12 @@ static void __bfq_requeue_entity(struct bfq_entity *entity)
}
static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
- struct bfq_sched_data *sd,
bool non_blocking_wait_rq)
{
struct bfq_service_tree *st = bfq_entity_service_tree(entity);
- if (sd->in_service_entity == entity || entity->tree == &st->active)
+ if (entity->sched_data->in_service_entity == entity ||
+ entity->tree == &st->active)
/*
* in service or already queued on the active tree,
* requeue or reposition
@@ -1119,14 +1087,10 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity,
bool non_blocking_wait_rq,
bool requeue, bool expiration)
{
- struct bfq_sched_data *sd;
-
for_each_entity(entity) {
- sd = entity->sched_data;
- __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
-
- if (!bfq_update_next_in_service(sd, entity, expiration) &&
- !requeue)
+ __bfq_activate_requeue_entity(entity, non_blocking_wait_rq);
+ if (!bfq_update_next_in_service(entity->sched_data, entity,
+ expiration) && !requeue)
break;
}
}
@@ -1646,6 +1610,32 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqq == bfqd->in_service_queue, expiration);
}
+void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ if (!entity->in_groups_with_pending_reqs) {
+ entity->in_groups_with_pending_reqs = true;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ if (!(bfqq_group(bfqq)->num_queues_with_pending_reqs++))
+ bfqq->bfqd->num_groups_with_pending_reqs++;
+#endif
+ }
+}
+
+void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ if (entity->in_groups_with_pending_reqs) {
+ entity->in_groups_with_pending_reqs = false;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ if (!(--bfqq_group(bfqq)->num_queues_with_pending_reqs))
+ bfqq->bfqd->num_groups_with_pending_reqs--;
+#endif
+ }
+}
+
/*
* Called when the bfqq no longer has requests pending, remove it from
* the service tree. As a special case, it can be invoked during an
@@ -1668,8 +1658,14 @@ void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration)
bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
- if (!bfqq->dispatched)
- bfq_weights_tree_remove(bfqd, bfqq);
+ if (!bfqq->dispatched) {
+ bfq_del_bfqq_in_groups_with_pending_reqs(bfqq);
+ /*
+ * Next function is invoked last, because it causes bfqq to be
+ * freed. DO NOT use bfqq after the next function invocation.
+ */
+ bfq_weights_tree_remove(bfqq);
+ }
}
/*
@@ -1686,10 +1682,11 @@ void bfq_add_bfqq_busy(struct bfq_queue *bfqq)
bfq_mark_bfqq_busy(bfqq);
bfqd->busy_queues[bfqq->ioprio_class - 1]++;
- if (!bfqq->dispatched)
+ if (!bfqq->dispatched) {
+ bfq_add_bfqq_in_groups_with_pending_reqs(bfqq);
if (bfqq->wr_coeff == 1)
- bfq_weights_tree_add(bfqd, bfqq,
- &bfqd->queue_weights_tree);
+ bfq_weights_tree_add(bfqq);
+ }
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues++;
diff --git a/block/bio.c b/block/bio.c
index 57c2f327225b..5f96fcae3f75 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -25,9 +25,15 @@
#include "blk-rq-qos.h"
#include "blk-cgroup.h"
+#define ALLOC_CACHE_THRESHOLD 16
+#define ALLOC_CACHE_SLACK 64
+#define ALLOC_CACHE_MAX 256
+
struct bio_alloc_cache {
struct bio *free_list;
+ struct bio *free_list_irq;
unsigned int nr;
+ unsigned int nr_irq;
};
static struct biovec_slab {
@@ -408,6 +414,22 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
queue_work(bs->rescue_workqueue, &bs->rescue_work);
}
+static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache)
+{
+ unsigned long flags;
+
+ /* cache->free_list must be empty */
+ if (WARN_ON_ONCE(cache->free_list))
+ return;
+
+ local_irq_save(flags);
+ cache->free_list = cache->free_list_irq;
+ cache->free_list_irq = NULL;
+ cache->nr += cache->nr_irq;
+ cache->nr_irq = 0;
+ local_irq_restore(flags);
+}
+
static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp,
struct bio_set *bs)
@@ -417,8 +439,12 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
cache = per_cpu_ptr(bs->cache, get_cpu());
if (!cache->free_list) {
- put_cpu();
- return NULL;
+ if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD)
+ bio_alloc_irq_cache_splice(cache);
+ if (!cache->free_list) {
+ put_cpu();
+ return NULL;
+ }
}
bio = cache->free_list;
cache->free_list = bio->bi_next;
@@ -462,9 +488,6 @@ static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
* submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
* for per bio allocations.
*
- * If REQ_ALLOC_CACHE is set, the final put of the bio MUST be done from process
- * context, not hard/soft IRQ.
- *
* Returns: Pointer to new bio on success, NULL on failure.
*/
struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
@@ -526,6 +549,8 @@ struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
}
if (unlikely(!p))
return NULL;
+ if (!mempool_is_saturated(&bs->bio_pool))
+ opf &= ~REQ_ALLOC_CACHE;
bio = p + bs->front_pad;
if (nr_vecs > BIO_INLINE_VECS) {
@@ -676,11 +701,8 @@ void guard_bio_eod(struct bio *bio)
bio_truncate(bio, maxsector << 9);
}
-#define ALLOC_CACHE_MAX 512
-#define ALLOC_CACHE_SLACK 64
-
-static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
- unsigned int nr)
+static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache,
+ unsigned int nr)
{
unsigned int i = 0;
struct bio *bio;
@@ -692,6 +714,17 @@ static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
if (++i == nr)
break;
}
+ return i;
+}
+
+static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
+ unsigned int nr)
+{
+ nr -= __bio_alloc_cache_prune(cache, nr);
+ if (!READ_ONCE(cache->free_list)) {
+ bio_alloc_irq_cache_splice(cache);
+ __bio_alloc_cache_prune(cache, nr);
+ }
}
static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
@@ -725,6 +758,35 @@ static void bio_alloc_cache_destroy(struct bio_set *bs)
bs->cache = NULL;
}
+static inline void bio_put_percpu_cache(struct bio *bio)
+{
+ struct bio_alloc_cache *cache;
+
+ cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
+ if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) {
+ put_cpu();
+ bio_free(bio);
+ return;
+ }
+
+ bio_uninit(bio);
+
+ if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) {
+ bio->bi_next = cache->free_list;
+ cache->free_list = bio;
+ cache->nr++;
+ } else {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ bio->bi_next = cache->free_list_irq;
+ cache->free_list_irq = bio;
+ cache->nr_irq++;
+ local_irq_restore(flags);
+ }
+ put_cpu();
+}
+
/**
* bio_put - release a reference to a bio
* @bio: bio to release reference to
@@ -740,20 +802,10 @@ void bio_put(struct bio *bio)
if (!atomic_dec_and_test(&bio->__bi_cnt))
return;
}
-
- if ((bio->bi_opf & REQ_ALLOC_CACHE) && !WARN_ON_ONCE(in_interrupt())) {
- struct bio_alloc_cache *cache;
-
- bio_uninit(bio);
- cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
- bio->bi_next = cache->free_list;
- cache->free_list = bio;
- if (++cache->nr > ALLOC_CACHE_MAX + ALLOC_CACHE_SLACK)
- bio_alloc_cache_prune(cache, ALLOC_CACHE_SLACK);
- put_cpu();
- } else {
+ if (bio->bi_opf & REQ_ALLOC_CACHE)
+ bio_put_percpu_cache(bio);
+ else
bio_free(bio);
- }
}
EXPORT_SYMBOL(bio_put);
@@ -863,6 +915,8 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
return false;
if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
return false;
+ if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
+ return false;
*same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
if (*same_page)
@@ -1195,6 +1249,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
+ unsigned int gup_flags = 0;
ssize_t size, left;
unsigned len, i = 0;
size_t offset, trim;
@@ -1208,6 +1263,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
+ if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
+ gup_flags |= FOLL_PCI_P2PDMA;
+
/*
* Each segment in the iov is required to be a block size multiple.
* However, we may not be able to get the entire segment if it spans
@@ -1215,8 +1273,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
* result to ensure the bio's total size is correct. The remainder of
* the iov data will be picked up in the next bio iteration.
*/
- size = iov_iter_get_pages2(iter, pages, UINT_MAX - bio->bi_iter.bi_size,
- nr_pages, &offset);
+ size = iov_iter_get_pages(iter, pages,
+ UINT_MAX - bio->bi_iter.bi_size,
+ nr_pages, &offset, gup_flags);
if (unlikely(size <= 0))
return size ? size : -EFAULT;
@@ -1342,27 +1401,6 @@ void __bio_advance(struct bio *bio, unsigned bytes)
}
EXPORT_SYMBOL(__bio_advance);
-void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
- struct bio *src, struct bvec_iter *src_iter)
-{
- while (src_iter->bi_size && dst_iter->bi_size) {
- struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
- struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
- unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
- void *src_buf = bvec_kmap_local(&src_bv);
- void *dst_buf = bvec_kmap_local(&dst_bv);
-
- memcpy(dst_buf, src_buf, bytes);
-
- kunmap_local(dst_buf);
- kunmap_local(src_buf);
-
- bio_advance_iter_single(src, src_iter, bytes);
- bio_advance_iter_single(dst, dst_iter, bytes);
- }
-}
-EXPORT_SYMBOL(bio_copy_data_iter);
-
/**
* bio_copy_data - copy contents of data buffers from one bio to another
* @src: source bio
@@ -1376,7 +1414,21 @@ void bio_copy_data(struct bio *dst, struct bio *src)
struct bvec_iter src_iter = src->bi_iter;
struct bvec_iter dst_iter = dst->bi_iter;
- bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+ while (src_iter.bi_size && dst_iter.bi_size) {
+ struct bio_vec src_bv = bio_iter_iovec(src, src_iter);
+ struct bio_vec dst_bv = bio_iter_iovec(dst, dst_iter);
+ unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
+ void *src_buf = bvec_kmap_local(&src_bv);
+ void *dst_buf = bvec_kmap_local(&dst_bv);
+
+ memcpy(dst_buf, src_buf, bytes);
+
+ kunmap_local(dst_buf);
+ kunmap_local(src_buf);
+
+ bio_advance_iter_single(src, &src_iter, bytes);
+ bio_advance_iter_single(dst, &dst_iter, bytes);
+ }
}
EXPORT_SYMBOL(bio_copy_data);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index ed761c62ad0a..50ac0dce95b8 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -59,6 +59,37 @@ static struct workqueue_struct *blkcg_punt_bio_wq;
#define BLKG_DESTROY_BATCH_SIZE 64
+/*
+ * Lockless lists for tracking IO stats update
+ *
+ * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
+ * There are multiple blkg's (one for each block device) attached to each
+ * blkcg. The rstat code keeps track of which cpu has IO stats updated,
+ * but it doesn't know which blkg has the updated stats. If there are many
+ * block devices in a system, the cost of iterating all the blkg's to flush
+ * out the IO stats can be high. To reduce such overhead, a set of percpu
+ * lockless lists (lhead) per blkcg are used to track the set of recently
+ * updated iostat_cpu's since the last flush. An iostat_cpu will be put
+ * onto the lockless list on the update side [blk_cgroup_bio_start()] if
+ * not there yet and then removed when being flushed [blkcg_rstat_flush()].
+ * References to blkg are gotten and then put back in the process to
+ * protect against blkg removal.
+ *
+ * Return: 0 if successful or -ENOMEM if allocation fails.
+ */
+static int init_blkcg_llists(struct blkcg *blkcg)
+{
+ int cpu;
+
+ blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
+ if (!blkcg->lhead)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu)
+ init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
+ return 0;
+}
+
/**
* blkcg_css - find the current css
*
@@ -236,8 +267,10 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
blkg->blkcg = blkcg;
u64_stats_init(&blkg->iostat.sync);
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
+ per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
+ }
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
@@ -577,7 +610,7 @@ EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
* @pd: policy private data of interest
* @v: value to print
*
- * Print @v to @sf for the device assocaited with @pd.
+ * Print @v to @sf for the device associated with @pd.
*/
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
@@ -765,7 +798,7 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep);
/**
* blkg_conf_finish - finish up per-blkg config update
- * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep()
+ * @ctx: blkg_conf_ctx initialized by blkg_conf_prep()
*
* Finish up after per-blkg config update. This function must be paired
* with blkg_conf_prep().
@@ -827,7 +860,9 @@ static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
{
struct blkcg *blkcg = css_to_blkcg(css);
- struct blkcg_gq *blkg;
+ struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+ struct llist_node *lnode;
+ struct blkg_iostat_set *bisc, *next_bisc;
/* Root-level stats are sourced from system-wide IO stats */
if (!cgroup_parent(css->cgroup))
@@ -835,12 +870,21 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
rcu_read_lock();
- hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+ lnode = llist_del_all(lhead);
+ if (!lnode)
+ goto out;
+
+ /*
+ * Iterate only the iostat_cpu's queued in the lockless list.
+ */
+ llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
+ struct blkcg_gq *blkg = bisc->blkg;
struct blkcg_gq *parent = blkg->parent;
- struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
struct blkg_iostat cur;
unsigned int seq;
+ WRITE_ONCE(bisc->lqueued, false);
+
/* fetch the current per-cpu values */
do {
seq = u64_stats_fetch_begin(&bisc->sync);
@@ -853,8 +897,10 @@ static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
if (parent && parent->parent)
blkcg_iostat_update(parent, &blkg->iostat.cur,
&blkg->iostat.last);
+ percpu_ref_put(&blkg->refcnt);
}
+out:
rcu_read_unlock();
}
@@ -1132,6 +1178,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
mutex_unlock(&blkcg_pol_mutex);
+ free_percpu(blkcg->lhead);
kfree(blkcg);
}
@@ -1139,7 +1186,6 @@ static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct blkcg *blkcg;
- struct cgroup_subsys_state *ret;
int i;
mutex_lock(&blkcg_pol_mutex);
@@ -1148,12 +1194,13 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
blkcg = &blkcg_root;
} else {
blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
- if (!blkcg) {
- ret = ERR_PTR(-ENOMEM);
+ if (!blkcg)
goto unlock;
- }
}
+ if (init_blkcg_llists(blkcg))
+ goto free_blkcg;
+
for (i = 0; i < BLKCG_MAX_POLS ; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
struct blkcg_policy_data *cpd;
@@ -1168,10 +1215,9 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
continue;
cpd = pol->cpd_alloc_fn(GFP_KERNEL);
- if (!cpd) {
- ret = ERR_PTR(-ENOMEM);
+ if (!cpd)
goto free_pd_blkcg;
- }
+
blkcg->cpd[i] = cpd;
cpd->blkcg = blkcg;
cpd->plid = i;
@@ -1195,12 +1241,13 @@ free_pd_blkcg:
for (i--; i >= 0; i--)
if (blkcg->cpd[i])
blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
-
+ free_percpu(blkcg->lhead);
+free_blkcg:
if (blkcg != &blkcg_root)
kfree(blkcg);
unlock:
mutex_unlock(&blkcg_pol_mutex);
- return ret;
+ return ERR_PTR(-ENOMEM);
}
static int blkcg_css_online(struct cgroup_subsys_state *css)
@@ -1784,7 +1831,7 @@ out:
/**
* blkcg_schedule_throttle - this task needs to check for throttling
- * @gendisk: disk to throttle
+ * @disk: disk to throttle
* @use_memdelay: do we charge this to memory delay for PSI
*
* This is called by the IO controller when we know there's delay accumulated
@@ -1943,6 +1990,7 @@ static int blk_cgroup_io_type(struct bio *bio)
void blk_cgroup_bio_start(struct bio *bio)
{
+ struct blkcg *blkcg = bio->bi_blkg->blkcg;
int rwd = blk_cgroup_io_type(bio), cpu;
struct blkg_iostat_set *bis;
unsigned long flags;
@@ -1961,9 +2009,21 @@ void blk_cgroup_bio_start(struct bio *bio)
}
bis->cur.ios[rwd]++;
+ /*
+ * If the iostat_cpu isn't in a lockless list, put it into the
+ * list to indicate that a stat update is pending.
+ */
+ if (!READ_ONCE(bis->lqueued)) {
+ struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
+
+ llist_add(&bis->lnode, lhead);
+ WRITE_ONCE(bis->lqueued, true);
+ percpu_ref_get(&bis->blkg->refcnt);
+ }
+
u64_stats_update_end_irqrestore(&bis->sync, flags);
if (cgroup_subsys_on_dfl(io_cgrp_subsys))
- cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu);
+ cgroup_rstat_updated(blkcg->css.cgroup, cpu);
put_cpu();
}
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index aa2b286bc825..1e94e404eaa8 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -18,6 +18,7 @@
#include <linux/cgroup.h>
#include <linux/kthread.h>
#include <linux/blk-mq.h>
+#include <linux/llist.h>
struct blkcg_gq;
struct blkg_policy_data;
@@ -43,6 +44,9 @@ struct blkg_iostat {
struct blkg_iostat_set {
struct u64_stats_sync sync;
+ struct blkcg_gq *blkg;
+ struct llist_node lnode;
+ int lqueued; /* queued in llist */
struct blkg_iostat cur;
struct blkg_iostat last;
};
@@ -97,6 +101,12 @@ struct blkcg {
struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
struct list_head all_blkcgs_node;
+
+ /*
+ * List of updated percpu blkg_iostat_set's since the last flush.
+ */
+ struct llist_head __percpu *lhead;
+
#ifdef CONFIG_BLK_CGROUP_FC_APPID
char fc_app_id[FC_APPID_LEN];
#endif
diff --git a/block/blk-core.c b/block/blk-core.c
index 5487912befe8..3866b6c4cd88 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -59,13 +59,12 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);
-DEFINE_IDA(blk_queue_ida);
+static DEFINE_IDA(blk_queue_ida);
/*
* For queue allocation
*/
-struct kmem_cache *blk_requestq_cachep;
-struct kmem_cache *blk_requestq_srcu_cachep;
+static struct kmem_cache *blk_requestq_cachep;
/*
* Controlling structure to kblockd
@@ -253,19 +252,44 @@ void blk_clear_pm_only(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_clear_pm_only);
+static void blk_free_queue_rcu(struct rcu_head *rcu_head)
+{
+ kmem_cache_free(blk_requestq_cachep,
+ container_of(rcu_head, struct request_queue, rcu_head));
+}
+
+static void blk_free_queue(struct request_queue *q)
+{
+ percpu_ref_exit(&q->q_usage_counter);
+
+ if (q->poll_stat)
+ blk_stat_remove_callback(q, q->poll_cb);
+ blk_stat_free_callback(q->poll_cb);
+
+ blk_free_queue_stats(q->stats);
+ kfree(q->poll_stat);
+
+ if (queue_is_mq(q))
+ blk_mq_release(q);
+
+ ida_free(&blk_queue_ida, q->id);
+ call_rcu(&q->rcu_head, blk_free_queue_rcu);
+}
+
/**
* blk_put_queue - decrement the request_queue refcount
* @q: the request_queue structure to decrement the refcount for
*
- * Decrements the refcount of the request_queue kobject. When this reaches 0
- * we'll have blk_release_queue() called.
+ * Decrements the refcount of the request_queue and free it when the refcount
+ * reaches 0.
*
- * Context: Any context, but the last reference must not be dropped from
- * atomic context.
+ * Context: Can sleep.
*/
void blk_put_queue(struct request_queue *q)
{
- kobject_put(&q->kobj);
+ might_sleep();
+ if (refcount_dec_and_test(&q->refs))
+ blk_free_queue(q);
}
EXPORT_SYMBOL(blk_put_queue);
@@ -373,26 +397,20 @@ static void blk_timeout_work(struct work_struct *work)
{
}
-struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
+struct request_queue *blk_alloc_queue(int node_id)
{
struct request_queue *q;
- q = kmem_cache_alloc_node(blk_get_queue_kmem_cache(alloc_srcu),
- GFP_KERNEL | __GFP_ZERO, node_id);
+ q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO,
+ node_id);
if (!q)
return NULL;
- if (alloc_srcu) {
- blk_queue_flag_set(QUEUE_FLAG_HAS_SRCU, q);
- if (init_srcu_struct(q->srcu) != 0)
- goto fail_q;
- }
-
q->last_merge = NULL;
q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
if (q->id < 0)
- goto fail_srcu;
+ goto fail_q;
q->stats = blk_alloc_queue_stats();
if (!q->stats)
@@ -406,8 +424,7 @@ struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu)
INIT_WORK(&q->timeout_work, blk_timeout_work);
INIT_LIST_HEAD(&q->icq_list);
- kobject_init(&q->kobj, &blk_queue_ktype);
-
+ refcount_set(&q->refs, 1);
mutex_init(&q->debugfs_mutex);
mutex_init(&q->sysfs_lock);
mutex_init(&q->sysfs_dir_lock);
@@ -434,11 +451,8 @@ fail_stats:
blk_free_queue_stats(q->stats);
fail_id:
ida_free(&blk_queue_ida, q->id);
-fail_srcu:
- if (alloc_srcu)
- cleanup_srcu_struct(q->srcu);
fail_q:
- kmem_cache_free(blk_get_queue_kmem_cache(alloc_srcu), q);
+ kmem_cache_free(blk_requestq_cachep, q);
return NULL;
}
@@ -454,7 +468,7 @@ bool blk_get_queue(struct request_queue *q)
{
if (unlikely(blk_queue_dying(q)))
return false;
- kobject_get(&q->kobj);
+ refcount_inc(&q->refs);
return true;
}
EXPORT_SYMBOL(blk_get_queue);
@@ -945,18 +959,6 @@ unsigned long bdev_start_io_acct(struct block_device *bdev,
EXPORT_SYMBOL(bdev_start_io_acct);
/**
- * bio_start_io_acct_time - start I/O accounting for bio based drivers
- * @bio: bio to start account for
- * @start_time: start time that should be passed back to bio_end_io_acct().
- */
-void bio_start_io_acct_time(struct bio *bio, unsigned long start_time)
-{
- bdev_start_io_acct(bio->bi_bdev, bio_sectors(bio),
- bio_op(bio), start_time);
-}
-EXPORT_SYMBOL_GPL(bio_start_io_acct_time);
-
-/**
* bio_start_io_acct - start I/O accounting for bio based drivers
* @bio: bio to start account for
*
@@ -1183,9 +1185,6 @@ int __init blk_dev_init(void)
sizeof_field(struct request, cmd_flags));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
sizeof_field(struct bio, bi_opf));
- BUILD_BUG_ON(ALIGN(offsetof(struct request_queue, srcu),
- __alignof__(struct request_queue)) !=
- sizeof(struct request_queue));
/* used for unplugging and affects IO latency/throughput - HIGHPRI */
kblockd_workqueue = alloc_workqueue("kblockd",
@@ -1196,10 +1195,6 @@ int __init blk_dev_init(void)
blk_requestq_cachep = kmem_cache_create("request_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
- blk_requestq_srcu_cachep = kmem_cache_create("request_queue_srcu",
- sizeof(struct request_queue) +
- sizeof(struct srcu_struct), 0, SLAB_PANIC, NULL);
-
blk_debugfs_root = debugfs_create_dir("block", NULL);
return 0;
diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h
index e6818ffaddbf..a8cdaf26851e 100644
--- a/block/blk-crypto-internal.h
+++ b/block/blk-crypto-internal.h
@@ -21,9 +21,9 @@ extern const struct blk_crypto_mode blk_crypto_modes[];
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
-int blk_crypto_sysfs_register(struct request_queue *q);
+int blk_crypto_sysfs_register(struct gendisk *disk);
-void blk_crypto_sysfs_unregister(struct request_queue *q);
+void blk_crypto_sysfs_unregister(struct gendisk *disk);
void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
unsigned int inc);
@@ -65,14 +65,28 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
return rq->crypt_ctx;
}
+blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key,
+ struct blk_crypto_keyslot **slot_ptr);
+
+void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot);
+
+int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key);
+
+bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
+ const struct blk_crypto_config *cfg);
+
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
-static inline int blk_crypto_sysfs_register(struct request_queue *q)
+static inline int blk_crypto_sysfs_register(struct gendisk *disk)
{
return 0;
}
-static inline void blk_crypto_sysfs_unregister(struct request_queue *q) { }
+static inline void blk_crypto_sysfs_unregister(struct gendisk *disk)
+{
+}
static inline bool bio_crypt_rq_ctx_compatible(struct request *rq,
struct bio *bio)
diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c
index 96c511967386..0307fb0d95d3 100644
--- a/block/blk-crypto-profile.c
+++ b/block/blk-crypto-profile.c
@@ -32,6 +32,7 @@
#include <linux/wait.h>
#include <linux/blkdev.h>
#include <linux/blk-integrity.h>
+#include "blk-crypto-internal.h"
struct blk_crypto_keyslot {
atomic_t slot_refs;
diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c
index fd93bd2f33b7..55268edc0625 100644
--- a/block/blk-crypto-sysfs.c
+++ b/block/blk-crypto-sysfs.c
@@ -126,8 +126,9 @@ static struct kobj_type blk_crypto_ktype = {
* If the request_queue has a blk_crypto_profile, create the "crypto"
* subdirectory in sysfs (/sys/block/$disk/queue/crypto/).
*/
-int blk_crypto_sysfs_register(struct request_queue *q)
+int blk_crypto_sysfs_register(struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
struct blk_crypto_kobj *obj;
int err;
@@ -139,8 +140,8 @@ int blk_crypto_sysfs_register(struct request_queue *q)
return -ENOMEM;
obj->profile = q->crypto_profile;
- err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, &q->kobj,
- "crypto");
+ err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype,
+ &disk->queue_kobj, "crypto");
if (err) {
kobject_put(&obj->kobj);
return err;
@@ -149,9 +150,9 @@ int blk_crypto_sysfs_register(struct request_queue *q)
return 0;
}
-void blk_crypto_sysfs_unregister(struct request_queue *q)
+void blk_crypto_sysfs_unregister(struct gendisk *disk)
{
- kobject_put(q->crypto_kobject);
+ kobject_put(disk->queue->crypto_kobject);
}
static int __init blk_crypto_sysfs_init(void)
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index e44709fc6a08..45378586151f 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -273,7 +273,6 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
{
struct bio *bio = *bio_ptr;
const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key;
- struct blk_crypto_profile *profile;
/* Error if bio has no data. */
if (WARN_ON_ONCE(!bio_has_data(bio))) {
@@ -290,10 +289,9 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
* Success if device supports the encryption context, or if we succeeded
* in falling back to the crypto API.
*/
- profile = bdev_get_queue(bio->bi_bdev)->crypto_profile;
- if (__blk_crypto_cfg_supported(profile, &bc_key->crypto_cfg))
+ if (blk_crypto_config_supported_natively(bio->bi_bdev,
+ &bc_key->crypto_cfg))
return true;
-
if (blk_crypto_fallback_bio_prep(bio_ptr))
return true;
fail:
@@ -358,22 +356,29 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
return 0;
}
+bool blk_crypto_config_supported_natively(struct block_device *bdev,
+ const struct blk_crypto_config *cfg)
+{
+ return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile,
+ cfg);
+}
+
/*
* Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the
- * request queue it's submitted to supports inline crypto, or the
+ * block_device it's submitted to supports inline crypto, or the
* blk-crypto-fallback is enabled and supports the cfg).
*/
-bool blk_crypto_config_supported(struct request_queue *q,
+bool blk_crypto_config_supported(struct block_device *bdev,
const struct blk_crypto_config *cfg)
{
return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
- __blk_crypto_cfg_supported(q->crypto_profile, cfg);
+ blk_crypto_config_supported_natively(bdev, cfg);
}
/**
* blk_crypto_start_using_key() - Start using a blk_crypto_key on a device
+ * @bdev: block device to operate on
* @key: A key to use on the device
- * @q: the request queue for the device
*
* Upper layers must call this function to ensure that either the hardware
* supports the key's crypto settings, or the crypto API fallback has transforms
@@ -385,10 +390,10 @@ bool blk_crypto_config_supported(struct request_queue *q,
* blk-crypto-fallback is either disabled or the needed algorithm
* is disabled in the crypto API; or another -errno code.
*/
-int blk_crypto_start_using_key(const struct blk_crypto_key *key,
- struct request_queue *q)
+int blk_crypto_start_using_key(struct block_device *bdev,
+ const struct blk_crypto_key *key)
{
- if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
+ if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
return 0;
return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
}
@@ -396,7 +401,7 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
/**
* blk_crypto_evict_key() - Evict a key from any inline encryption hardware
* it may have been programmed into
- * @q: The request queue who's associated inline encryption hardware this key
+ * @bdev: The block_device who's associated inline encryption hardware this key
* might have been programmed into
* @key: The key to evict
*
@@ -406,14 +411,16 @@ int blk_crypto_start_using_key(const struct blk_crypto_key *key,
*
* Return: 0 on success or if the key wasn't in any keyslot; -errno on error.
*/
-int blk_crypto_evict_key(struct request_queue *q,
+int blk_crypto_evict_key(struct block_device *bdev,
const struct blk_crypto_key *key)
{
- if (__blk_crypto_cfg_supported(q->crypto_profile, &key->crypto_cfg))
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
return __blk_crypto_evict_key(q->crypto_profile, key);
/*
- * If the request_queue didn't support the key, then blk-crypto-fallback
+ * If the block_device didn't support the key, then blk-crypto-fallback
* may have been used, so try to evict the key from blk-crypto-fallback.
*/
return blk_crypto_fallback_evict_key(key);
diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c
index 2bd1d311033b..2141931ddd37 100644
--- a/block/blk-ia-ranges.c
+++ b/block/blk-ia-ranges.c
@@ -123,7 +123,8 @@ int disk_register_independent_access_ranges(struct gendisk *disk)
*/
WARN_ON(iars->sysfs_registered);
ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype,
- &q->kobj, "%s", "independent_access_ranges");
+ &disk->queue_kobj, "%s",
+ "independent_access_ranges");
if (ret) {
disk->ia_ranges = NULL;
kobject_put(&iars->kobj);
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 495396425bad..d1bdc12deaa7 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -111,7 +111,7 @@
* busy signal.
*
* As devices can have deep queues and be unfair in how the queued commands
- * are executed, soley depending on rq wait may not result in satisfactory
+ * are executed, solely depending on rq wait may not result in satisfactory
* control quality. For a better control quality, completion latency QoS
* parameters can be configured so that the device is considered saturated
* if N'th percentile completion latency rises above the set point.
@@ -556,7 +556,6 @@ struct ioc_now {
u64 now_ns;
u64 now;
u64 vnow;
- u64 vrate;
};
struct iocg_wait {
@@ -906,8 +905,10 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force)
if (idx == ioc->autop_idx && !force)
return false;
- if (idx != ioc->autop_idx)
+ if (idx != ioc->autop_idx) {
atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
+ ioc->vtime_base_rate = VTIME_PER_USEC;
+ }
ioc->autop_idx = idx;
ioc->autop_too_fast_at = 0;
@@ -975,7 +976,7 @@ static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
if (ioc->busy_level != prev_busy_level || nr_lagging)
- trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
+ trace_iocost_ioc_vrate_adj(ioc, vrate,
missed_ppm, rq_wait_pct,
nr_lagging, nr_shortages);
@@ -1018,10 +1019,11 @@ static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
static void ioc_now(struct ioc *ioc, struct ioc_now *now)
{
unsigned seq;
+ u64 vrate;
now->now_ns = ktime_get();
now->now = ktime_to_us(now->now_ns);
- now->vrate = atomic64_read(&ioc->vtime_rate);
+ vrate = atomic64_read(&ioc->vtime_rate);
/*
* The current vtime is
@@ -1034,7 +1036,7 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now)
do {
seq = read_seqcount_begin(&ioc->period_seqcount);
now->vnow = ioc->period_at_vtime +
- (now->now - ioc->period_at) * now->vrate;
+ (now->now - ioc->period_at) * vrate;
} while (read_seqcount_retry(&ioc->period_seqcount, seq));
}
@@ -2203,8 +2205,8 @@ static void ioc_timer_fn(struct timer_list *timer)
LIST_HEAD(surpluses);
int nr_debtors, nr_shortages = 0, nr_lagging = 0;
u64 usage_us_sum = 0;
- u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
- u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
+ u32 ppm_rthr;
+ u32 ppm_wthr;
u32 missed_ppm[2], rq_wait_pct;
u64 period_vtime;
int prev_busy_level;
@@ -2215,6 +2217,8 @@ static void ioc_timer_fn(struct timer_list *timer)
/* take care of active iocgs */
spin_lock_irq(&ioc->lock);
+ ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
+ ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
ioc_now(ioc, &now);
period_vtime = now.vnow - ioc->period_at_vtime;
@@ -2878,7 +2882,7 @@ static int blk_iocost_init(struct gendisk *disk)
spin_unlock_irq(&ioc->lock);
/*
- * rqos must be added before activation to allow iocg_pd_init() to
+ * rqos must be added before activation to allow ioc_pd_init() to
* lookup the ioc from q. This means that the rqos methods may get
* called before policy activation completion, can't assume that the
* target bio has an iocg associated and need to test for NULL iocg.
@@ -3187,11 +3191,13 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
ioc = q_to_ioc(disk->queue);
}
+ blk_mq_freeze_queue(disk->queue);
+ blk_mq_quiesce_queue(disk->queue);
+
spin_lock_irq(&ioc->lock);
memcpy(qos, ioc->params.qos, sizeof(qos));
enable = ioc->enabled;
user = ioc->user_qos_params;
- spin_unlock_irq(&ioc->lock);
while ((p = strsep(&input, " \t\n"))) {
substring_t args[MAX_OPT_ARGS];
@@ -3258,15 +3264,15 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
if (qos[QOS_MIN] > qos[QOS_MAX])
goto einval;
- spin_lock_irq(&ioc->lock);
-
if (enable) {
blk_stat_enable_accounting(disk->queue);
blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
ioc->enabled = true;
+ wbt_disable_default(disk->queue);
} else {
blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
ioc->enabled = false;
+ wbt_enable_default(disk->queue);
}
if (user) {
@@ -3279,9 +3285,17 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
ioc_refresh_params(ioc, true);
spin_unlock_irq(&ioc->lock);
+ blk_mq_unquiesce_queue(disk->queue);
+ blk_mq_unfreeze_queue(disk->queue);
+
blkdev_put_no_open(bdev);
return nbytes;
einval:
+ spin_unlock_irq(&ioc->lock);
+
+ blk_mq_unquiesce_queue(disk->queue);
+ blk_mq_unfreeze_queue(disk->queue);
+
ret = -EINVAL;
err:
blkdev_put_no_open(bdev);
@@ -3336,6 +3350,7 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
size_t nbytes, loff_t off)
{
struct block_device *bdev;
+ struct request_queue *q;
struct ioc *ioc;
u64 u[NR_I_LCOEFS];
bool user;
@@ -3346,18 +3361,21 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
if (IS_ERR(bdev))
return PTR_ERR(bdev);
- ioc = q_to_ioc(bdev_get_queue(bdev));
+ q = bdev_get_queue(bdev);
+ ioc = q_to_ioc(q);
if (!ioc) {
ret = blk_iocost_init(bdev->bd_disk);
if (ret)
goto err;
- ioc = q_to_ioc(bdev_get_queue(bdev));
+ ioc = q_to_ioc(q);
}
+ blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
+
spin_lock_irq(&ioc->lock);
memcpy(u, ioc->params.i_lcoefs, sizeof(u));
user = ioc->user_cost_model;
- spin_unlock_irq(&ioc->lock);
while ((p = strsep(&input, " \t\n"))) {
substring_t args[MAX_OPT_ARGS];
@@ -3394,7 +3412,6 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
user = true;
}
- spin_lock_irq(&ioc->lock);
if (user) {
memcpy(ioc->params.i_lcoefs, u, sizeof(u));
ioc->user_cost_model = true;
@@ -3404,10 +3421,18 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
ioc_refresh_params(ioc, true);
spin_unlock_irq(&ioc->lock);
+ blk_mq_unquiesce_queue(q);
+ blk_mq_unfreeze_queue(q);
+
blkdev_put_no_open(bdev);
return nbytes;
einval:
+ spin_unlock_irq(&ioc->lock);
+
+ blk_mq_unquiesce_queue(q);
+ blk_mq_unfreeze_queue(q);
+
ret = -EINVAL;
err:
blkdev_put_no_open(bdev);
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index 571fa95aafe9..778a0057193e 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -141,7 +141,7 @@ struct iolatency_grp {
struct latency_stat __percpu *stats;
struct latency_stat cur_stat;
struct blk_iolatency *blkiolat;
- struct rq_depth rq_depth;
+ unsigned int max_depth;
struct rq_wait rq_wait;
atomic64_t window_start;
atomic_t scale_cookie;
@@ -280,7 +280,7 @@ static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data)
static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data)
{
struct iolatency_grp *iolat = private_data;
- return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
+ return rq_wait_inc_below(rqw, iolat->max_depth);
}
static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
@@ -364,15 +364,17 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat,
}
/*
- * Change the queue depth of the iolatency_grp. We add/subtract 1/16th of the
+ * Change the queue depth of the iolatency_grp. We add 1/16th of the
* queue depth at a time so we don't get wild swings and hopefully dial in to
- * fairer distribution of the overall queue depth.
+ * fairer distribution of the overall queue depth. We halve the queue depth
+ * at a time so we can scale down queue depth quickly from default unlimited
+ * to target.
*/
static void scale_change(struct iolatency_grp *iolat, bool up)
{
unsigned long qd = iolat->blkiolat->rqos.q->nr_requests;
unsigned long scale = scale_amount(qd, up);
- unsigned long old = iolat->rq_depth.max_depth;
+ unsigned long old = iolat->max_depth;
if (old > qd)
old = qd;
@@ -384,12 +386,12 @@ static void scale_change(struct iolatency_grp *iolat, bool up)
if (old < qd) {
old += scale;
old = min(old, qd);
- iolat->rq_depth.max_depth = old;
+ iolat->max_depth = old;
wake_up_all(&iolat->rq_wait.wait);
}
} else {
old >>= 1;
- iolat->rq_depth.max_depth = max(old, 1UL);
+ iolat->max_depth = max(old, 1UL);
}
}
@@ -403,9 +405,6 @@ static void check_scale_change(struct iolatency_grp *iolat)
u64 scale_lat;
int direction = 0;
- if (lat_to_blkg(iolat)->parent == NULL)
- return;
-
parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
if (!parent)
return;
@@ -445,7 +444,7 @@ static void check_scale_change(struct iolatency_grp *iolat)
}
/* We're as low as we can go. */
- if (iolat->rq_depth.max_depth == 1 && direction < 0) {
+ if (iolat->max_depth == 1 && direction < 0) {
blkcg_use_delay(lat_to_blkg(iolat));
return;
}
@@ -453,7 +452,7 @@ static void check_scale_change(struct iolatency_grp *iolat)
/* We're back to the default cookie, unthrottle all the things. */
if (cur_cookie == DEFAULT_SCALE_COOKIE) {
blkcg_clear_delay(lat_to_blkg(iolat));
- iolat->rq_depth.max_depth = UINT_MAX;
+ iolat->max_depth = UINT_MAX;
wake_up_all(&iolat->rq_wait.wait);
return;
}
@@ -508,7 +507,7 @@ static void iolatency_record_time(struct iolatency_grp *iolat,
* We don't want to count issue_as_root bio's in the cgroups latency
* statistics as it could skew the numbers downwards.
*/
- if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) {
+ if (unlikely(issue_as_root && iolat->max_depth != UINT_MAX)) {
u64 sub = iolat->min_lat_nsec;
if (req_time < sub)
blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
@@ -920,7 +919,7 @@ static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
}
preempt_enable();
- if (iolat->rq_depth.max_depth == UINT_MAX)
+ if (iolat->max_depth == UINT_MAX)
seq_printf(s, " missed=%llu total=%llu depth=max",
(unsigned long long)stat.ps.missed,
(unsigned long long)stat.ps.total);
@@ -928,7 +927,7 @@ static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
seq_printf(s, " missed=%llu total=%llu depth=%u",
(unsigned long long)stat.ps.missed,
(unsigned long long)stat.ps.total,
- iolat->rq_depth.max_depth);
+ iolat->max_depth);
}
static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
@@ -945,12 +944,12 @@ static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
- if (iolat->rq_depth.max_depth == UINT_MAX)
+ if (iolat->max_depth == UINT_MAX)
seq_printf(s, " depth=max avg_lat=%llu win=%llu",
avg_lat, cur_win);
else
seq_printf(s, " depth=%u avg_lat=%llu win=%llu",
- iolat->rq_depth.max_depth, avg_lat, cur_win);
+ iolat->max_depth, avg_lat, cur_win);
}
static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp,
@@ -994,9 +993,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
latency_stat_init(iolat, &iolat->cur_stat);
rq_wait_init(&iolat->rq_wait);
spin_lock_init(&iolat->child_lat.lock);
- iolat->rq_depth.queue_depth = blkg->q->nr_requests;
- iolat->rq_depth.max_depth = UINT_MAX;
- iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
+ iolat->max_depth = UINT_MAX;
iolat->blkiolat = blkiolat;
iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
atomic64_set(&iolat->window_start, now);
diff --git a/block/blk-map.c b/block/blk-map.c
index 34735626b00f..19940c978c73 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -267,6 +267,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
{
unsigned int max_sectors = queue_max_hw_sectors(rq->q);
unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS);
+ unsigned int gup_flags = 0;
struct bio *bio;
int ret;
int j;
@@ -278,6 +279,9 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
if (bio == NULL)
return -ENOMEM;
+ if (blk_queue_pci_p2pdma(rq->q))
+ gup_flags |= FOLL_PCI_P2PDMA;
+
while (iov_iter_count(iter)) {
struct page **pages, *stack_pages[UIO_FASTIOV];
ssize_t bytes;
@@ -286,11 +290,11 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
if (nr_vecs <= ARRAY_SIZE(stack_pages)) {
pages = stack_pages;
- bytes = iov_iter_get_pages2(iter, pages, LONG_MAX,
- nr_vecs, &offs);
+ bytes = iov_iter_get_pages(iter, pages, LONG_MAX,
+ nr_vecs, &offs, gup_flags);
} else {
- bytes = iov_iter_get_pages_alloc2(iter, &pages,
- LONG_MAX, &offs);
+ bytes = iov_iter_get_pages_alloc(iter, &pages,
+ LONG_MAX, &offs, gup_flags);
}
if (unlikely(bytes <= 0)) {
ret = bytes ? bytes : -EFAULT;
@@ -555,7 +559,7 @@ static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
size_t nr_iter = iov_iter_count(iter);
size_t nr_segs = iter->nr_segs;
struct bio_vec *bvecs, *bvprvp = NULL;
- struct queue_limits *lim = &q->limits;
+ const struct queue_limits *lim = &q->limits;
unsigned int nsegs = 0, bytes = 0;
struct bio *bio;
size_t i;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index ff04e9290715..35a8f75cc45d 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -100,13 +100,14 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
* is defined as 'unsigned int', meantime it has to be aligned to with the
* logical block size, which is the minimum accepted unit by hardware.
*/
-static unsigned int bio_allowed_max_sectors(struct queue_limits *lim)
+static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
{
return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
}
-static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim,
- unsigned *nsegs, struct bio_set *bs)
+static struct bio *bio_split_discard(struct bio *bio,
+ const struct queue_limits *lim,
+ unsigned *nsegs, struct bio_set *bs)
{
unsigned int max_discard_sectors, granularity;
sector_t tmp;
@@ -146,7 +147,8 @@ static struct bio *bio_split_discard(struct bio *bio, struct queue_limits *lim,
}
static struct bio *bio_split_write_zeroes(struct bio *bio,
- struct queue_limits *lim, unsigned *nsegs, struct bio_set *bs)
+ const struct queue_limits *lim,
+ unsigned *nsegs, struct bio_set *bs)
{
*nsegs = 0;
if (!lim->max_write_zeroes_sectors)
@@ -165,7 +167,7 @@ static struct bio *bio_split_write_zeroes(struct bio *bio,
* aligned to a physical block boundary.
*/
static inline unsigned get_max_io_size(struct bio *bio,
- struct queue_limits *lim)
+ const struct queue_limits *lim)
{
unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT;
unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT;
@@ -184,7 +186,15 @@ static inline unsigned get_max_io_size(struct bio *bio,
return max_sectors & ~(lbs - 1);
}
-static inline unsigned get_max_segment_size(struct queue_limits *lim,
+/**
+ * get_max_segment_size() - maximum number of bytes to add as a single segment
+ * @lim: Request queue limits.
+ * @start_page: See below.
+ * @offset: Offset from @start_page where to add a segment.
+ *
+ * Returns the maximum number of bytes that can be added as a single segment.
+ */
+static inline unsigned get_max_segment_size(const struct queue_limits *lim,
struct page *start_page, unsigned long offset)
{
unsigned long mask = lim->seg_boundary_mask;
@@ -192,11 +202,10 @@ static inline unsigned get_max_segment_size(struct queue_limits *lim,
offset = mask & (page_to_phys(start_page) + offset);
/*
- * overflow may be triggered in case of zero page physical address
- * on 32bit arch, use queue's max segment size when that happens.
+ * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
+ * after having calculated the minimum.
*/
- return min_not_zero(mask - offset + 1,
- (unsigned long)lim->max_segment_size);
+ return min(mask - offset, (unsigned long)lim->max_segment_size - 1) + 1;
}
/**
@@ -219,9 +228,9 @@ static inline unsigned get_max_segment_size(struct queue_limits *lim,
* *@nsegs segments and *@sectors sectors would make that bio unacceptable for
* the block driver.
*/
-static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv,
- unsigned *nsegs, unsigned *bytes, unsigned max_segs,
- unsigned max_bytes)
+static bool bvec_split_segs(const struct queue_limits *lim,
+ const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes,
+ unsigned max_segs, unsigned max_bytes)
{
unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
unsigned len = min(bv->bv_len, max_len);
@@ -267,7 +276,7 @@ static bool bvec_split_segs(struct queue_limits *lim, const struct bio_vec *bv,
* responsible for ensuring that @bs is only destroyed after processing of the
* split bio has finished.
*/
-static struct bio *bio_split_rw(struct bio *bio, struct queue_limits *lim,
+static struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
unsigned *segs, struct bio_set *bs, unsigned max_bytes)
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
@@ -331,8 +340,9 @@ split:
* The split bio is allocated from @q->bio_split, which is provided by the
* block layer.
*/
-struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
- unsigned int *nr_segs)
+struct bio *__bio_split_to_limits(struct bio *bio,
+ const struct queue_limits *lim,
+ unsigned int *nr_segs)
{
struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
struct bio *split;
@@ -377,7 +387,7 @@ struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
*/
struct bio *bio_split_to_limits(struct bio *bio)
{
- struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
+ const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
unsigned int nr_segs;
if (bio_may_exceed_limits(bio, lim))
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index a4f7c101b53b..23d1a90fec42 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -555,6 +555,7 @@ static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
return 0;
}
+/* caller must have a reference to @e, will grab another one if successful */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
unsigned int flags = q->tag_set->flags;
@@ -563,13 +564,6 @@ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
unsigned long i;
int ret;
- if (!e) {
- blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
- q->elevator = NULL;
- q->nr_requests = q->tag_set->queue_depth;
- return 0;
- }
-
/*
* Default to double of smaller one between hw queue_depth and 128,
* since we don't split into sync/async like the old code did.
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 93997d297d42..4515288fbe35 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -185,7 +185,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct blk_mq_ctx *ctx;
- int i, ret;
+ int i, j, ret;
if (!hctx->nr_ctx)
return 0;
@@ -197,9 +197,16 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
hctx_for_each_ctx(hctx, ctx, i) {
ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
if (ret)
- break;
+ goto out;
}
+ return 0;
+out:
+ hctx_for_each_ctx(hctx, ctx, j) {
+ if (j < i)
+ kobject_del(&ctx->kobj);
+ }
+ kobject_del(&hctx->kobj);
return ret;
}
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 228a6696d835..c5cf0dbca1db 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -254,15 +254,17 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
/**
* blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
- * @q: request queue.
+ * @set: tag_set to wait on
*
* Note: it is driver's responsibility for making sure that quiesce has
- * been started.
+ * been started on or more of the request_queues of the tag_set. This
+ * function only waits for the quiesce on those request_queues that had
+ * the quiesce flag set using blk_mq_quiesce_queue_nowait.
*/
-void blk_mq_wait_quiesce_done(struct request_queue *q)
+void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
{
- if (blk_queue_has_srcu(q))
- synchronize_srcu(q->srcu);
+ if (set->flags & BLK_MQ_F_BLOCKING)
+ synchronize_srcu(set->srcu);
else
synchronize_rcu();
}
@@ -280,7 +282,9 @@ EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
void blk_mq_quiesce_queue(struct request_queue *q)
{
blk_mq_quiesce_queue_nowait(q);
- blk_mq_wait_quiesce_done(q);
+ /* nothing to wait for non-mq queues */
+ if (queue_is_mq(q))
+ blk_mq_wait_quiesce_done(q->tag_set);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
@@ -311,6 +315,33 @@ void blk_mq_unquiesce_queue(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
+void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
+{
+ struct request_queue *q;
+
+ mutex_lock(&set->tag_list_lock);
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ if (!blk_queue_skip_tagset_quiesce(q))
+ blk_mq_quiesce_queue_nowait(q);
+ }
+ blk_mq_wait_quiesce_done(set);
+ mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
+
+void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
+{
+ struct request_queue *q;
+
+ mutex_lock(&set->tag_list_lock);
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ if (!blk_queue_skip_tagset_quiesce(q))
+ blk_mq_unquiesce_queue(q);
+ }
+ mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
+
void blk_mq_wake_waiters(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
@@ -544,25 +575,26 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
if (!plug)
return NULL;
+
if (rq_list_empty(plug->cached_rq)) {
if (plug->nr_ios == 1)
return NULL;
rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
- if (rq)
- goto got_it;
- return NULL;
- }
- rq = rq_list_peek(&plug->cached_rq);
- if (!rq || rq->q != q)
- return NULL;
+ if (!rq)
+ return NULL;
+ } else {
+ rq = rq_list_peek(&plug->cached_rq);
+ if (!rq || rq->q != q)
+ return NULL;
- if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
- return NULL;
- if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
- return NULL;
+ if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
+ return NULL;
+ if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
+ return NULL;
+
+ plug->cached_rq = rq_list_next(rq);
+ }
- plug->cached_rq = rq_list_next(rq);
-got_it:
rq->cmd_flags = opf;
INIT_LIST_HEAD(&rq->queuelist);
return rq;
@@ -1529,7 +1561,13 @@ static void blk_mq_rq_timed_out(struct request *req)
blk_add_timer(req);
}
-static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
+struct blk_expired_data {
+ bool has_timedout_rq;
+ unsigned long next;
+ unsigned long timeout_start;
+};
+
+static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired)
{
unsigned long deadline;
@@ -1539,13 +1577,13 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
return false;
deadline = READ_ONCE(rq->deadline);
- if (time_after_eq(jiffies, deadline))
+ if (time_after_eq(expired->timeout_start, deadline))
return true;
- if (*next == 0)
- *next = deadline;
- else if (time_after(*next, deadline))
- *next = deadline;
+ if (expired->next == 0)
+ expired->next = deadline;
+ else if (time_after(expired->next, deadline))
+ expired->next = deadline;
return false;
}
@@ -1561,7 +1599,7 @@ void blk_mq_put_rq_ref(struct request *rq)
static bool blk_mq_check_expired(struct request *rq, void *priv)
{
- unsigned long *next = priv;
+ struct blk_expired_data *expired = priv;
/*
* blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
@@ -1570,7 +1608,18 @@ static bool blk_mq_check_expired(struct request *rq, void *priv)
* it was completed and reallocated as a new request after returning
* from blk_mq_check_expired().
*/
- if (blk_mq_req_expired(rq, next))
+ if (blk_mq_req_expired(rq, expired)) {
+ expired->has_timedout_rq = true;
+ return false;
+ }
+ return true;
+}
+
+static bool blk_mq_handle_expired(struct request *rq, void *priv)
+{
+ struct blk_expired_data *expired = priv;
+
+ if (blk_mq_req_expired(rq, expired))
blk_mq_rq_timed_out(rq);
return true;
}
@@ -1579,7 +1628,9 @@ static void blk_mq_timeout_work(struct work_struct *work)
{
struct request_queue *q =
container_of(work, struct request_queue, timeout_work);
- unsigned long next = 0;
+ struct blk_expired_data expired = {
+ .timeout_start = jiffies,
+ };
struct blk_mq_hw_ctx *hctx;
unsigned long i;
@@ -1599,10 +1650,23 @@ static void blk_mq_timeout_work(struct work_struct *work)
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
+ /* check if there is any timed-out request */
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired);
+ if (expired.has_timedout_rq) {
+ /*
+ * Before walking tags, we must ensure any submit started
+ * before the current time has finished. Since the submit
+ * uses srcu or rcu, wait for a synchronization point to
+ * ensure all running submits have finished
+ */
+ blk_mq_wait_quiesce_done(q->tag_set);
+
+ expired.next = 0;
+ blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired);
+ }
- if (next != 0) {
- mod_timer(&q->timeout, next);
+ if (expired.next != 0) {
+ mod_timer(&q->timeout, expired.next);
} else {
/*
* Request timeouts are handled as a forward rolling timer. If
@@ -3248,21 +3312,22 @@ static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node);
- if (!tags->rqs) {
- blk_mq_free_tags(tags);
- return NULL;
- }
+ if (!tags->rqs)
+ goto err_free_tags;
tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node);
- if (!tags->static_rqs) {
- kfree(tags->rqs);
- blk_mq_free_tags(tags);
- return NULL;
- }
+ if (!tags->static_rqs)
+ goto err_free_rqs;
return tags;
+
+err_free_rqs:
+ kfree(tags->rqs);
+err_free_tags:
+ blk_mq_free_tags(tags);
+ return NULL;
}
static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
@@ -3975,7 +4040,7 @@ static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
struct request_queue *q;
int ret;
- q = blk_alloc_queue(set->numa_node, set->flags & BLK_MQ_F_BLOCKING);
+ q = blk_alloc_queue(set->numa_node);
if (!q)
return ERR_PTR(-ENOMEM);
q->queuedata = queuedata;
@@ -4011,14 +4076,11 @@ void blk_mq_destroy_queue(struct request_queue *q)
blk_queue_flag_set(QUEUE_FLAG_DYING, q);
blk_queue_start_drain(q);
- blk_freeze_queue(q);
+ blk_mq_freeze_queue_wait(q);
blk_sync_queue(q);
blk_mq_cancel_work_sync(q);
blk_mq_exit_queue(q);
-
- /* @q is and will stay empty, shutdown and put */
- blk_put_queue(q);
}
EXPORT_SYMBOL(blk_mq_destroy_queue);
@@ -4035,6 +4097,7 @@ struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
disk = __alloc_disk_node(q, set->numa_node, lkclass);
if (!disk) {
blk_mq_destroy_queue(q);
+ blk_put_queue(q);
return ERR_PTR(-ENOMEM);
}
set_bit(GD_OWNS_QUEUE, &disk->state);
@@ -4147,9 +4210,6 @@ static void blk_mq_update_poll_flag(struct request_queue *q)
int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
struct request_queue *q)
{
- WARN_ON_ONCE(blk_queue_has_srcu(q) !=
- !!(set->flags & BLK_MQ_F_BLOCKING));
-
/* mark the queue as mq asap */
q->mq_ops = set->ops;
@@ -4325,12 +4385,12 @@ static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
}
static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
- int cur_nr_hw_queues, int new_nr_hw_queues)
+ int new_nr_hw_queues)
{
struct blk_mq_tags **new_tags;
- if (cur_nr_hw_queues >= new_nr_hw_queues)
- return 0;
+ if (set->nr_hw_queues >= new_nr_hw_queues)
+ goto done;
new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
GFP_KERNEL, set->numa_node);
@@ -4338,21 +4398,15 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
return -ENOMEM;
if (set->tags)
- memcpy(new_tags, set->tags, cur_nr_hw_queues *
+ memcpy(new_tags, set->tags, set->nr_hw_queues *
sizeof(*set->tags));
kfree(set->tags);
set->tags = new_tags;
+done:
set->nr_hw_queues = new_nr_hw_queues;
-
return 0;
}
-static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set,
- int new_nr_hw_queues)
-{
- return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues);
-}
-
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
@@ -4406,10 +4460,22 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
set->nr_hw_queues = nr_cpu_ids;
- if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0)
- return -ENOMEM;
+ if (set->flags & BLK_MQ_F_BLOCKING) {
+ set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL);
+ if (!set->srcu)
+ return -ENOMEM;
+ ret = init_srcu_struct(set->srcu);
+ if (ret)
+ goto out_free_srcu;
+ }
ret = -ENOMEM;
+ set->tags = kcalloc_node(set->nr_hw_queues,
+ sizeof(struct blk_mq_tags *), GFP_KERNEL,
+ set->numa_node);
+ if (!set->tags)
+ goto out_cleanup_srcu;
+
for (i = 0; i < set->nr_maps; i++) {
set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
sizeof(set->map[i].mq_map[0]),
@@ -4437,6 +4503,12 @@ out_free_mq_map:
}
kfree(set->tags);
set->tags = NULL;
+out_cleanup_srcu:
+ if (set->flags & BLK_MQ_F_BLOCKING)
+ cleanup_srcu_struct(set->srcu);
+out_free_srcu:
+ if (set->flags & BLK_MQ_F_BLOCKING)
+ kfree(set->srcu);
return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);
@@ -4476,6 +4548,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
kfree(set->tags);
set->tags = NULL;
+ if (set->flags & BLK_MQ_F_BLOCKING) {
+ cleanup_srcu_struct(set->srcu);
+ kfree(set->srcu);
+ }
}
EXPORT_SYMBOL(blk_mq_free_tag_set);
@@ -4564,17 +4640,10 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
INIT_LIST_HEAD(&qe->node);
qe->q = q;
qe->type = q->elevator->type;
+ /* keep a reference to the elevator module as we'll switch back */
+ __elevator_get(qe->type);
list_add(&qe->node, head);
-
- /*
- * After elevator_switch, the previous elevator_queue will be
- * released by elevator_release. The reference of the io scheduler
- * module get by elevator_get will also be put. So we need to get
- * a reference of the io scheduler module here to prevent it to be
- * removed.
- */
- __module_get(qe->type->elevator_owner);
- elevator_switch(q, NULL);
+ elevator_disable(q);
mutex_unlock(&q->sysfs_lock);
return true;
@@ -4607,6 +4676,8 @@ static void blk_mq_elv_switch_back(struct list_head *head,
mutex_lock(&q->sysfs_lock);
elevator_switch(q, t);
+ /* drop the reference acquired in blk_mq_elv_switch_none */
+ elevator_put(t);
mutex_unlock(&q->sysfs_lock);
}
@@ -4643,11 +4714,9 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
}
prev_nr_hw_queues = set->nr_hw_queues;
- if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
- 0)
+ if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
goto reregister;
- set->nr_hw_queues = nr_hw_queues;
fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
@@ -4867,15 +4936,13 @@ EXPORT_SYMBOL(blk_mq_rq_cpu);
void blk_mq_cancel_work_sync(struct request_queue *q)
{
- if (queue_is_mq(q)) {
- struct blk_mq_hw_ctx *hctx;
- unsigned long i;
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
- cancel_delayed_work_sync(&q->requeue_work);
+ cancel_delayed_work_sync(&q->requeue_work);
- queue_for_each_hw_ctx(q, hctx, i)
- cancel_delayed_work_sync(&hctx->run_work);
- }
+ queue_for_each_hw_ctx(q, hctx, i)
+ cancel_delayed_work_sync(&hctx->run_work);
}
static int __init blk_mq_init(void)
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 0b2870839cdd..ef59fee62780 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -377,17 +377,17 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
/* run the code block in @dispatch_ops with rcu/srcu read lock held */
#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \
do { \
- if (!blk_queue_has_srcu(q)) { \
- rcu_read_lock(); \
- (dispatch_ops); \
- rcu_read_unlock(); \
- } else { \
+ if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \
int srcu_idx; \
\
might_sleep_if(check_sleep); \
- srcu_idx = srcu_read_lock((q)->srcu); \
+ srcu_idx = srcu_read_lock((q)->tag_set->srcu); \
(dispatch_ops); \
- srcu_read_unlock((q)->srcu, srcu_idx); \
+ srcu_read_unlock((q)->tag_set->srcu, srcu_idx); \
+ } else { \
+ rcu_read_lock(); \
+ (dispatch_ops); \
+ rcu_read_unlock(); \
} \
} while (0)
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 8ac1038d0c79..0477c4d527fe 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -481,7 +481,7 @@ void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
}
EXPORT_SYMBOL(blk_queue_io_opt);
-static int queue_limit_alignment_offset(struct queue_limits *lim,
+static int queue_limit_alignment_offset(const struct queue_limits *lim,
sector_t sector)
{
unsigned int granularity = max(lim->physical_block_size, lim->io_min);
@@ -491,8 +491,8 @@ static int queue_limit_alignment_offset(struct queue_limits *lim,
return (granularity + lim->alignment_offset - alignment) % granularity;
}
-static unsigned int queue_limit_discard_alignment(struct queue_limits *lim,
- sector_t sector)
+static unsigned int queue_limit_discard_alignment(
+ const struct queue_limits *lim, sector_t sector)
{
unsigned int alignment, granularity, offset;
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index e71b3b43927c..93d9e9c9a6ea 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -470,6 +470,9 @@ static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
if (!wbt_rq_qos(q))
return -EINVAL;
+ if (wbt_disabled(q))
+ return sprintf(page, "0\n");
+
return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
}
@@ -680,8 +683,8 @@ static struct attribute *queue_attrs[] = {
static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr,
int n)
{
- struct request_queue *q =
- container_of(kobj, struct request_queue, kobj);
+ struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
+ struct request_queue *q = disk->queue;
if (attr == &queue_io_timeout_entry.attr &&
(!q->mq_ops || !q->mq_ops->timeout))
@@ -707,8 +710,8 @@ static ssize_t
queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
struct queue_sysfs_entry *entry = to_queue(attr);
- struct request_queue *q =
- container_of(kobj, struct request_queue, kobj);
+ struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
+ struct request_queue *q = disk->queue;
ssize_t res;
if (!entry->show)
@@ -724,68 +727,19 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
const char *page, size_t length)
{
struct queue_sysfs_entry *entry = to_queue(attr);
- struct request_queue *q;
+ struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
+ struct request_queue *q = disk->queue;
ssize_t res;
if (!entry->store)
return -EIO;
- q = container_of(kobj, struct request_queue, kobj);
mutex_lock(&q->sysfs_lock);
res = entry->store(q, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
}
-static void blk_free_queue_rcu(struct rcu_head *rcu_head)
-{
- struct request_queue *q = container_of(rcu_head, struct request_queue,
- rcu_head);
-
- kmem_cache_free(blk_get_queue_kmem_cache(blk_queue_has_srcu(q)), q);
-}
-
-/**
- * blk_release_queue - releases all allocated resources of the request_queue
- * @kobj: pointer to a kobject, whose container is a request_queue
- *
- * This function releases all allocated resources of the request queue.
- *
- * The struct request_queue refcount is incremented with blk_get_queue() and
- * decremented with blk_put_queue(). Once the refcount reaches 0 this function
- * is called.
- *
- * Drivers exist which depend on the release of the request_queue to be
- * synchronous, it should not be deferred.
- *
- * Context: can sleep
- */
-static void blk_release_queue(struct kobject *kobj)
-{
- struct request_queue *q =
- container_of(kobj, struct request_queue, kobj);
-
- might_sleep();
-
- percpu_ref_exit(&q->q_usage_counter);
-
- if (q->poll_stat)
- blk_stat_remove_callback(q, q->poll_cb);
- blk_stat_free_callback(q->poll_cb);
-
- blk_free_queue_stats(q->stats);
- kfree(q->poll_stat);
-
- if (queue_is_mq(q))
- blk_mq_release(q);
-
- if (blk_queue_has_srcu(q))
- cleanup_srcu_struct(q->srcu);
-
- ida_free(&blk_queue_ida, q->id);
- call_rcu(&q->rcu_head, blk_free_queue_rcu);
-}
-
static const struct sysfs_ops queue_sysfs_ops = {
.show = queue_attr_show,
.store = queue_attr_store,
@@ -796,12 +750,30 @@ static const struct attribute_group *blk_queue_attr_groups[] = {
NULL
};
-struct kobj_type blk_queue_ktype = {
+static void blk_queue_release(struct kobject *kobj)
+{
+ /* nothing to do here, all data is associated with the parent gendisk */
+}
+
+static struct kobj_type blk_queue_ktype = {
.default_groups = blk_queue_attr_groups,
.sysfs_ops = &queue_sysfs_ops,
- .release = blk_release_queue,
+ .release = blk_queue_release,
};
+static void blk_debugfs_remove(struct gendisk *disk)
+{
+ struct request_queue *q = disk->queue;
+
+ mutex_lock(&q->debugfs_mutex);
+ blk_trace_shutdown(q);
+ debugfs_remove_recursive(q->debugfs_dir);
+ q->debugfs_dir = NULL;
+ q->sched_debugfs_dir = NULL;
+ q->rqos_debugfs_dir = NULL;
+ mutex_unlock(&q->debugfs_mutex);
+}
+
/**
* blk_register_queue - register a block layer queue with sysfs
* @disk: Disk of which the request queue should be registered with sysfs.
@@ -812,47 +784,47 @@ int blk_register_queue(struct gendisk *disk)
int ret;
mutex_lock(&q->sysfs_dir_lock);
-
- ret = kobject_add(&q->kobj, &disk_to_dev(disk)->kobj, "queue");
+ kobject_init(&disk->queue_kobj, &blk_queue_ktype);
+ ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue");
if (ret < 0)
- goto unlock;
+ goto out_put_queue_kobj;
- if (queue_is_mq(q))
- blk_mq_sysfs_register(disk);
+ if (queue_is_mq(q)) {
+ ret = blk_mq_sysfs_register(disk);
+ if (ret)
+ goto out_put_queue_kobj;
+ }
mutex_lock(&q->sysfs_lock);
mutex_lock(&q->debugfs_mutex);
- q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
- blk_debugfs_root);
+ q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root);
if (queue_is_mq(q))
blk_mq_debugfs_register(q);
mutex_unlock(&q->debugfs_mutex);
ret = disk_register_independent_access_ranges(disk);
if (ret)
- goto put_dev;
+ goto out_debugfs_remove;
if (q->elevator) {
ret = elv_register_queue(q, false);
if (ret)
- goto put_dev;
+ goto out_unregister_ia_ranges;
}
- ret = blk_crypto_sysfs_register(q);
+ ret = blk_crypto_sysfs_register(disk);
if (ret)
- goto put_dev;
+ goto out_elv_unregister;
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(q);
blk_throtl_register(disk);
/* Now everything is ready and send out KOBJ_ADD uevent */
- kobject_uevent(&q->kobj, KOBJ_ADD);
+ kobject_uevent(&disk->queue_kobj, KOBJ_ADD);
if (q->elevator)
kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
mutex_unlock(&q->sysfs_lock);
-
-unlock:
mutex_unlock(&q->sysfs_dir_lock);
/*
@@ -871,13 +843,16 @@ unlock:
return ret;
-put_dev:
+out_elv_unregister:
elv_unregister_queue(q);
+out_unregister_ia_ranges:
disk_unregister_independent_access_ranges(disk);
+out_debugfs_remove:
+ blk_debugfs_remove(disk);
mutex_unlock(&q->sysfs_lock);
+out_put_queue_kobj:
+ kobject_put(&disk->queue_kobj);
mutex_unlock(&q->sysfs_dir_lock);
- kobject_del(&q->kobj);
-
return ret;
}
@@ -915,7 +890,7 @@ void blk_unregister_queue(struct gendisk *disk)
*/
if (queue_is_mq(q))
blk_mq_sysfs_unregister(disk);
- blk_crypto_sysfs_unregister(q);
+ blk_crypto_sysfs_unregister(disk);
mutex_lock(&q->sysfs_lock);
elv_unregister_queue(q);
@@ -923,15 +898,9 @@ void blk_unregister_queue(struct gendisk *disk)
mutex_unlock(&q->sysfs_lock);
/* Now that we've deleted all child objects, we can delete the queue. */
- kobject_uevent(&q->kobj, KOBJ_REMOVE);
- kobject_del(&q->kobj);
+ kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE);
+ kobject_del(&disk->queue_kobj);
mutex_unlock(&q->sysfs_dir_lock);
- mutex_lock(&q->debugfs_mutex);
- blk_trace_shutdown(q);
- debugfs_remove_recursive(q->debugfs_dir);
- q->debugfs_dir = NULL;
- q->sched_debugfs_dir = NULL;
- q->rqos_debugfs_dir = NULL;
- mutex_unlock(&q->debugfs_mutex);
+ blk_debugfs_remove(disk);
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 847721dc2b2b..6fb5a2f9e1ee 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -129,7 +129,7 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
/*
* cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
* make the IO dispatch more smooth.
- * Scale up: linearly scale up according to lapsed time since upgrade. For
+ * Scale up: linearly scale up according to elapsed time since upgrade. For
* every throtl_slice, the limit scales up 1/2 .low limit till the
* limit hits .max limit
* Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
@@ -395,8 +395,9 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
* If on the default hierarchy, we switch to properly hierarchical
* behavior where limits on a given throtl_grp are applied to the
* whole subtree rather than just the group itself. e.g. If 16M
- * read_bps limit is set on the root group, the whole system can't
- * exceed 16M for the device.
+ * read_bps limit is set on a parent group, summary bps of
+ * parent group and its subtree groups can't exceed 16M for the
+ * device.
*
* If not on the default hierarchy, the broken flat hierarchy
* behavior is retained where all throtl_grps are treated as if
@@ -644,7 +645,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
* that bandwidth. Do try to make use of that bandwidth while giving
* credit.
*/
- if (time_after_eq(start, tg->slice_start[rw]))
+ if (time_after(start, tg->slice_start[rw]))
tg->slice_start[rw] = start;
tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
@@ -821,17 +822,15 @@ static void tg_update_carryover(struct throtl_grp *tg)
tg->carryover_ios[READ], tg->carryover_ios[WRITE]);
}
-static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
- u32 iops_limit, unsigned long *wait)
+static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
+ u32 iops_limit)
{
bool rw = bio_data_dir(bio);
unsigned int io_allowed;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
if (iops_limit == UINT_MAX) {
- if (wait)
- *wait = 0;
- return true;
+ return 0;
}
jiffy_elapsed = jiffies - tg->slice_start[rw];
@@ -841,21 +840,16 @@ static bool tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) +
tg->carryover_ios[rw];
if (tg->io_disp[rw] + 1 <= io_allowed) {
- if (wait)
- *wait = 0;
- return true;
+ return 0;
}
/* Calc approx time to dispatch */
jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;
-
- if (wait)
- *wait = jiffy_wait;
- return false;
+ return jiffy_wait;
}
-static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
- u64 bps_limit, unsigned long *wait)
+static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
+ u64 bps_limit)
{
bool rw = bio_data_dir(bio);
u64 bytes_allowed, extra_bytes;
@@ -864,9 +858,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
/* no need to throttle if this bio's bytes have been accounted */
if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) {
- if (wait)
- *wait = 0;
- return true;
+ return 0;
}
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
@@ -879,9 +871,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) +
tg->carryover_bytes[rw];
if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
- if (wait)
- *wait = 0;
- return true;
+ return 0;
}
/* Calc approx time to dispatch */
@@ -896,9 +886,7 @@ static bool tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
* up we did. Add that time also.
*/
jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
- if (wait)
- *wait = jiffy_wait;
- return false;
+ return jiffy_wait;
}
/*
@@ -946,8 +934,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
jiffies + tg->td->throtl_slice);
}
- if (tg_within_bps_limit(tg, bio, bps_limit, &bps_wait) &&
- tg_within_iops_limit(tg, bio, iops_limit, &iops_wait)) {
+ bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
+ iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
+ if (bps_wait + iops_wait == 0) {
if (wait)
*wait = 0;
return true;
@@ -1066,7 +1055,6 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
sq->nr_queued[rw]--;
throtl_charge_bio(tg, bio);
- bio_set_flag(bio, BIO_BPS_THROTTLED);
/*
* If our parent is another tg, we just need to transfer @bio to
@@ -1079,6 +1067,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
start_parent_slice_with_credit(tg, parent_tg, rw);
} else {
+ bio_set_flag(bio, BIO_BPS_THROTTLED);
throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
&parent_sq->queued[rw]);
BUG_ON(tg->td->nr_queued[rw] <= 0);
@@ -1737,7 +1726,18 @@ void blk_throtl_cancel_bios(struct gendisk *disk)
* Set the flag to make sure throtl_pending_timer_fn() won't
* stop until all throttled bios are dispatched.
*/
- blkg_to_tg(blkg)->flags |= THROTL_TG_CANCELING;
+ tg->flags |= THROTL_TG_CANCELING;
+
+ /*
+ * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup
+ * will be inserted to service queue without THROTL_TG_PENDING
+ * set in tg_update_disptime below. Then IO dispatched from
+ * child in tg_dispatch_one_bio will trigger double insertion
+ * and corrupt the tree.
+ */
+ if (!(tg->flags & THROTL_TG_PENDING))
+ continue;
+
/*
* Update disptime after setting the above flag to make sure
* throtl_select_dispatch() won't exit without dispatching.
@@ -1762,7 +1762,6 @@ static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
return min(rtime, wtime);
}
-/* tg should not be an intermediate node */
static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
{
struct throtl_service_queue *parent_sq;
@@ -1816,24 +1815,29 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg)
return ret;
}
-static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
+static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw)
{
struct throtl_service_queue *sq = &tg->service_queue;
- bool read_limit, write_limit;
+ bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW];
/*
- * if cgroup reaches low limit (if low limit is 0, the cgroup always
- * reaches), it's ok to upgrade to next limit
+ * if low limit is zero, low limit is always reached.
+ * if low limit is non-zero, we can check if there is any request
+ * is queued to determine if low limit is reached as we throttle
+ * request according to limit.
*/
- read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
- write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
- if (!read_limit && !write_limit)
- return true;
- if (read_limit && sq->nr_queued[READ] &&
- (!write_limit || sq->nr_queued[WRITE]))
- return true;
- if (write_limit && sq->nr_queued[WRITE] &&
- (!read_limit || sq->nr_queued[READ]))
+ return !limit || sq->nr_queued[rw];
+}
+
+static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
+{
+ /*
+ * cgroup reaches low limit when low limit of READ and WRITE are
+ * both reached, it's ok to upgrade to next limit if cgroup reaches
+ * low limit
+ */
+ if (throtl_low_limit_reached(tg, READ) &&
+ throtl_low_limit_reached(tg, WRITE))
return true;
if (time_after_eq(jiffies,
@@ -1951,8 +1955,7 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
* If cgroup is below low limit, consider downgrade and throttle other
* cgroups
*/
- if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
- time_after_eq(now, tg_last_low_overflow_time(tg) +
+ if (time_after_eq(now, tg_last_low_overflow_time(tg) +
td->throtl_slice) &&
(!throtl_tg_is_idle(tg) ||
!list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
@@ -1962,6 +1965,11 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
{
+ struct throtl_data *td = tg->td;
+
+ if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice))
+ return false;
+
while (true) {
if (!throtl_tg_can_downgrade(tg))
return false;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index c293e08b301f..68a774d7a7c9 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -27,6 +27,7 @@
#include "blk-wbt.h"
#include "blk-rq-qos.h"
+#include "elevator.h"
#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>
@@ -422,6 +423,14 @@ static void wbt_update_limits(struct rq_wb *rwb)
rwb_wake_all(rwb);
}
+bool wbt_disabled(struct request_queue *q)
+{
+ struct rq_qos *rqos = wbt_rq_qos(q);
+
+ return !rqos || RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT ||
+ RQWB(rqos)->enable_state == WBT_STATE_OFF_MANUAL;
+}
+
u64 wbt_get_min_lat(struct request_queue *q)
{
struct rq_qos *rqos = wbt_rq_qos(q);
@@ -435,8 +444,13 @@ void wbt_set_min_lat(struct request_queue *q, u64 val)
struct rq_qos *rqos = wbt_rq_qos(q);
if (!rqos)
return;
+
RQWB(rqos)->min_lat_nsec = val;
- RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
+ if (val)
+ RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
+ else
+ RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL;
+
wbt_update_limits(RQWB(rqos));
}
@@ -638,11 +652,15 @@ void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
*/
void wbt_enable_default(struct request_queue *q)
{
- struct rq_qos *rqos = wbt_rq_qos(q);
+ struct rq_qos *rqos;
+ bool disable_flag = q->elevator &&
+ test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags);
/* Throttling already enabled? */
+ rqos = wbt_rq_qos(q);
if (rqos) {
- if (RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
+ if (!disable_flag &&
+ RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
return;
}
@@ -651,7 +669,7 @@ void wbt_enable_default(struct request_queue *q)
if (!blk_queue_registered(q))
return;
- if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ))
+ if (queue_is_mq(q) && !disable_flag)
wbt_init(q);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 7e44eccc676d..e3ea6e7e2900 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -28,13 +28,15 @@ enum {
};
/*
- * Enable states. Either off, or on by default (done at init time),
- * or on through manual setup in sysfs.
+ * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other
+ * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered
+ * to WBT_STATE_OFF/ON_MANUAL.
*/
enum {
- WBT_STATE_ON_DEFAULT = 1,
- WBT_STATE_ON_MANUAL = 2,
- WBT_STATE_OFF_DEFAULT
+ WBT_STATE_ON_DEFAULT = 1, /* on by default */
+ WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */
+ WBT_STATE_OFF_DEFAULT = 3, /* off by default */
+ WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */
};
struct rq_wb {
@@ -94,6 +96,7 @@ void wbt_enable_default(struct request_queue *);
u64 wbt_get_min_lat(struct request_queue *q);
void wbt_set_min_lat(struct request_queue *q, u64 val);
+bool wbt_disabled(struct request_queue *);
void wbt_set_write_cache(struct request_queue *, bool);
@@ -125,6 +128,10 @@ static inline u64 wbt_default_latency_nsec(struct request_queue *q)
{
return 0;
}
+static inline bool wbt_disabled(struct request_queue *q)
+{
+ return true;
+}
#endif /* CONFIG_BLK_WBT */
diff --git a/block/blk.h b/block/blk.h
index a186ea20f39d..4c3b3325219a 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -26,11 +26,6 @@ struct blk_flush_queue {
spinlock_t mq_flush_lock;
};
-extern struct kmem_cache *blk_requestq_cachep;
-extern struct kmem_cache *blk_requestq_srcu_cachep;
-extern struct kobj_type blk_queue_ktype;
-extern struct ida blk_queue_ida;
-
bool is_flush_rq(struct request *req);
struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
@@ -104,7 +99,7 @@ static inline bool biovec_phys_mergeable(struct request_queue *q,
return true;
}
-static inline bool __bvec_gap_to_prev(struct queue_limits *lim,
+static inline bool __bvec_gap_to_prev(const struct queue_limits *lim,
struct bio_vec *bprv, unsigned int offset)
{
return (offset & lim->virt_boundary_mask) ||
@@ -115,7 +110,7 @@ static inline bool __bvec_gap_to_prev(struct queue_limits *lim,
* Check if adding a bio_vec after bprv with offset would create a gap in
* the SG list. Most drivers don't care about this, but some do.
*/
-static inline bool bvec_gap_to_prev(struct queue_limits *lim,
+static inline bool bvec_gap_to_prev(const struct queue_limits *lim,
struct bio_vec *bprv, unsigned int offset)
{
if (!lim->virt_boundary_mask)
@@ -278,6 +273,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
void blk_insert_flush(struct request *rq);
int elevator_switch(struct request_queue *q, struct elevator_type *new_e);
+void elevator_disable(struct request_queue *q);
void elevator_exit(struct request_queue *q);
int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q);
@@ -297,7 +293,7 @@ ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t);
static inline bool bio_may_exceed_limits(struct bio *bio,
- struct queue_limits *lim)
+ const struct queue_limits *lim)
{
switch (bio_op(bio)) {
case REQ_OP_DISCARD:
@@ -320,8 +316,9 @@ static inline bool bio_may_exceed_limits(struct bio *bio,
bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
}
-struct bio *__bio_split_to_limits(struct bio *bio, struct queue_limits *lim,
- unsigned int *nr_segs);
+struct bio *__bio_split_to_limits(struct bio *bio,
+ const struct queue_limits *lim,
+ unsigned int *nr_segs);
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
@@ -428,15 +425,9 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page);
-static inline struct kmem_cache *blk_get_queue_kmem_cache(bool srcu)
-{
- if (srcu)
- return blk_requestq_srcu_cachep;
- return blk_requestq_cachep;
-}
-struct request_queue *blk_alloc_queue(int node_id, bool alloc_srcu);
+struct request_queue *blk_alloc_queue(int node_id);
-int disk_scan_partitions(struct gendisk *disk, fmode_t mode);
+int disk_scan_partitions(struct gendisk *disk, fmode_t mode, void *owner);
int disk_alloc_events(struct gendisk *disk);
void disk_add_events(struct gendisk *disk);
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index d6f5dcdce748..435c32373cd6 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -325,6 +325,7 @@ void bsg_remove_queue(struct request_queue *q)
bsg_unregister_queue(bset->bd);
blk_mq_destroy_queue(q);
+ blk_put_queue(q);
blk_mq_free_tag_set(&bset->tag_set);
kfree(bset);
}
@@ -400,6 +401,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
return q;
out_cleanup_queue:
blk_mq_destroy_queue(q);
+ blk_put_queue(q);
out_queue:
blk_mq_free_tag_set(set);
out_tag_set:
diff --git a/block/bsg.c b/block/bsg.c
index 2ab1351eb082..8eba57b9bb46 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -175,8 +175,10 @@ static void bsg_device_release(struct device *dev)
void bsg_unregister_queue(struct bsg_device *bd)
{
- if (bd->queue->kobj.sd)
- sysfs_remove_link(&bd->queue->kobj, "bsg");
+ struct gendisk *disk = bd->queue->disk;
+
+ if (disk && disk->queue_kobj.sd)
+ sysfs_remove_link(&disk->queue_kobj, "bsg");
cdev_device_del(&bd->cdev, &bd->device);
put_device(&bd->device);
}
@@ -216,8 +218,9 @@ struct bsg_device *bsg_register_queue(struct request_queue *q,
if (ret)
goto out_put_device;
- if (q->kobj.sd) {
- ret = sysfs_create_link(&q->kobj, &bd->device.kobj, "bsg");
+ if (q->disk && q->disk->queue_kobj.sd) {
+ ret = sysfs_create_link(&q->disk->queue_kobj, &bd->device.kobj,
+ "bsg");
if (ret)
goto out_device_del;
}
diff --git a/block/elevator.c b/block/elevator.c
index bd71f0fc4e4b..adee58e48e2d 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -57,7 +57,7 @@ static LIST_HEAD(elv_list);
* Query io scheduler to see if the current process issuing bio may be
* merged with rq.
*/
-static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
+static bool elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
{
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
@@ -65,7 +65,7 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
if (e->type->ops.allow_merge)
return e->type->ops.allow_merge(q, rq, bio);
- return 1;
+ return true;
}
/*
@@ -83,78 +83,45 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(elv_bio_merge_ok);
-static inline bool elv_support_features(unsigned int elv_features,
- unsigned int required_features)
+static inline bool elv_support_features(struct request_queue *q,
+ const struct elevator_type *e)
{
- return (required_features & elv_features) == required_features;
+ return (q->required_elevator_features & e->elevator_features) ==
+ q->required_elevator_features;
}
/**
- * elevator_match - Test an elevator name and features
+ * elevator_match - Check whether @e's name or alias matches @name
* @e: Scheduler to test
* @name: Elevator name to test
- * @required_features: Features that the elevator must provide
*
- * Return true if the elevator @e name matches @name and if @e provides all
- * the features specified by @required_features.
+ * Return true if the elevator @e's name or alias matches @name.
*/
-static bool elevator_match(const struct elevator_type *e, const char *name,
- unsigned int required_features)
+static bool elevator_match(const struct elevator_type *e, const char *name)
{
- if (!elv_support_features(e->elevator_features, required_features))
- return false;
- if (!strcmp(e->elevator_name, name))
- return true;
- if (e->elevator_alias && !strcmp(e->elevator_alias, name))
- return true;
-
- return false;
+ return !strcmp(e->elevator_name, name) ||
+ (e->elevator_alias && !strcmp(e->elevator_alias, name));
}
-/**
- * elevator_find - Find an elevator
- * @name: Name of the elevator to find
- * @required_features: Features that the elevator must provide
- *
- * Return the first registered scheduler with name @name and supporting the
- * features @required_features and NULL otherwise.
- */
-static struct elevator_type *elevator_find(const char *name,
- unsigned int required_features)
+static struct elevator_type *__elevator_find(const char *name)
{
struct elevator_type *e;
- list_for_each_entry(e, &elv_list, list) {
- if (elevator_match(e, name, required_features))
+ list_for_each_entry(e, &elv_list, list)
+ if (elevator_match(e, name))
return e;
- }
-
return NULL;
}
-static void elevator_put(struct elevator_type *e)
-{
- module_put(e->elevator_owner);
-}
-
-static struct elevator_type *elevator_get(struct request_queue *q,
- const char *name, bool try_loading)
+static struct elevator_type *elevator_find_get(struct request_queue *q,
+ const char *name)
{
struct elevator_type *e;
spin_lock(&elv_list_lock);
-
- e = elevator_find(name, q->required_elevator_features);
- if (!e && try_loading) {
- spin_unlock(&elv_list_lock);
- request_module("%s-iosched", name);
- spin_lock(&elv_list_lock);
- e = elevator_find(name, q->required_elevator_features);
- }
-
- if (e && !try_module_get(e->elevator_owner))
+ e = __elevator_find(name);
+ if (e && (!elv_support_features(q, e) || !elevator_tryget(e)))
e = NULL;
-
spin_unlock(&elv_list_lock);
return e;
}
@@ -170,6 +137,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
if (unlikely(!eq))
return NULL;
+ __elevator_get(e);
eq->type = e;
kobject_init(&eq->kobj, &elv_ktype);
mutex_init(&eq->sysfs_lock);
@@ -499,7 +467,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
lockdep_assert_held(&q->sysfs_lock);
- error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
+ error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
if (!error) {
struct elv_fs_entry *attr = e->type->elevator_attrs;
if (attr) {
@@ -512,7 +480,7 @@ int elv_register_queue(struct request_queue *q, bool uevent)
if (uevent)
kobject_uevent(&e->kobj, KOBJ_ADD);
- e->registered = 1;
+ set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags);
}
return error;
}
@@ -523,13 +491,9 @@ void elv_unregister_queue(struct request_queue *q)
lockdep_assert_held(&q->sysfs_lock);
- if (e && e->registered) {
- struct elevator_queue *e = q->elevator;
-
+ if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
kobject_uevent(&e->kobj, KOBJ_REMOVE);
kobject_del(&e->kobj);
-
- e->registered = 0;
}
}
@@ -555,7 +519,7 @@ int elv_register(struct elevator_type *e)
/* register, don't allow duplicate names */
spin_lock(&elv_list_lock);
- if (elevator_find(e->elevator_name, 0)) {
+ if (__elevator_find(e->elevator_name)) {
spin_unlock(&elv_list_lock);
kmem_cache_destroy(e->icq_cache);
return -EBUSY;
@@ -588,39 +552,6 @@ void elv_unregister(struct elevator_type *e)
}
EXPORT_SYMBOL_GPL(elv_unregister);
-static int elevator_switch_mq(struct request_queue *q,
- struct elevator_type *new_e)
-{
- int ret;
-
- lockdep_assert_held(&q->sysfs_lock);
-
- if (q->elevator) {
- elv_unregister_queue(q);
- elevator_exit(q);
- }
-
- ret = blk_mq_init_sched(q, new_e);
- if (ret)
- goto out;
-
- if (new_e) {
- ret = elv_register_queue(q, true);
- if (ret) {
- elevator_exit(q);
- goto out;
- }
- }
-
- if (new_e)
- blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
- else
- blk_add_trace_msg(q, "elv switch: none");
-
-out:
- return ret;
-}
-
static inline bool elv_support_iosched(struct request_queue *q)
{
if (!queue_is_mq(q) ||
@@ -642,7 +573,7 @@ static struct elevator_type *elevator_get_default(struct request_queue *q)
!blk_mq_is_shared_tags(q->tag_set->flags))
return NULL;
- return elevator_get(q, "mq-deadline", false);
+ return elevator_find_get(q, "mq-deadline");
}
/*
@@ -656,14 +587,13 @@ static struct elevator_type *elevator_get_by_features(struct request_queue *q)
spin_lock(&elv_list_lock);
list_for_each_entry(e, &elv_list, list) {
- if (elv_support_features(e->elevator_features,
- q->required_elevator_features)) {
+ if (elv_support_features(q, e)) {
found = e;
break;
}
}
- if (found && !try_module_get(found->elevator_owner))
+ if (found && !elevator_tryget(found))
found = NULL;
spin_unlock(&elv_list_lock);
@@ -713,115 +643,147 @@ void elevator_init_mq(struct request_queue *q)
if (err) {
pr_warn("\"%s\" elevator initialization failed, "
"falling back to \"none\"\n", e->elevator_name);
- elevator_put(e);
}
+
+ elevator_put(e);
}
/*
- * switch to new_e io scheduler. be careful not to introduce deadlocks -
- * we don't free the old io scheduler, before we have allocated what we
- * need for the new one. this way we have a chance of going back to the old
- * one, if the new one fails init for some reason.
+ * Switch to new_e io scheduler.
+ *
+ * If switching fails, we are most likely running out of memory and not able
+ * to restore the old io scheduler, so leaving the io scheduler being none.
*/
int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
- int err;
+ int ret;
lockdep_assert_held(&q->sysfs_lock);
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
- err = elevator_switch_mq(q, new_e);
+ if (q->elevator) {
+ elv_unregister_queue(q);
+ elevator_exit(q);
+ }
+
+ ret = blk_mq_init_sched(q, new_e);
+ if (ret)
+ goto out_unfreeze;
+
+ ret = elv_register_queue(q, true);
+ if (ret) {
+ elevator_exit(q);
+ goto out_unfreeze;
+ }
+ blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+out_unfreeze:
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
- return err;
+ if (ret) {
+ pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
+ new_e->elevator_name);
+ }
+
+ return ret;
+}
+
+void elevator_disable(struct request_queue *q)
+{
+ lockdep_assert_held(&q->sysfs_lock);
+
+ blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
+
+ elv_unregister_queue(q);
+ elevator_exit(q);
+ blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
+ q->elevator = NULL;
+ q->nr_requests = q->tag_set->queue_depth;
+ blk_add_trace_msg(q, "elv switch: none");
+
+ blk_mq_unquiesce_queue(q);
+ blk_mq_unfreeze_queue(q);
}
/*
* Switch this queue to the given IO scheduler.
*/
-static int __elevator_change(struct request_queue *q, const char *name)
+static int elevator_change(struct request_queue *q, const char *elevator_name)
{
- char elevator_name[ELV_NAME_MAX];
struct elevator_type *e;
+ int ret;
/* Make sure queue is not in the middle of being removed */
if (!blk_queue_registered(q))
return -ENOENT;
- /*
- * Special case for mq, turn off scheduling
- */
- if (!strncmp(name, "none", 4)) {
- if (!q->elevator)
- return 0;
- return elevator_switch(q, NULL);
+ if (!strncmp(elevator_name, "none", 4)) {
+ if (q->elevator)
+ elevator_disable(q);
+ return 0;
}
- strlcpy(elevator_name, name, sizeof(elevator_name));
- e = elevator_get(q, strstrip(elevator_name), true);
- if (!e)
- return -EINVAL;
-
- if (q->elevator &&
- elevator_match(q->elevator->type, elevator_name, 0)) {
- elevator_put(e);
+ if (q->elevator && elevator_match(q->elevator->type, elevator_name))
return 0;
- }
- return elevator_switch(q, e);
+ e = elevator_find_get(q, elevator_name);
+ if (!e) {
+ request_module("%s-iosched", elevator_name);
+ e = elevator_find_get(q, elevator_name);
+ if (!e)
+ return -EINVAL;
+ }
+ ret = elevator_switch(q, e);
+ elevator_put(e);
+ return ret;
}
-ssize_t elv_iosched_store(struct request_queue *q, const char *name,
+ssize_t elv_iosched_store(struct request_queue *q, const char *buf,
size_t count)
{
+ char elevator_name[ELV_NAME_MAX];
int ret;
if (!elv_support_iosched(q))
return count;
- ret = __elevator_change(q, name);
+ strlcpy(elevator_name, buf, sizeof(elevator_name));
+ ret = elevator_change(q, strstrip(elevator_name));
if (!ret)
return count;
-
return ret;
}
ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
- struct elevator_queue *e = q->elevator;
- struct elevator_type *elv = NULL;
- struct elevator_type *__e;
+ struct elevator_queue *eq = q->elevator;
+ struct elevator_type *cur = NULL, *e;
int len = 0;
- if (!queue_is_mq(q))
+ if (!elv_support_iosched(q))
return sprintf(name, "none\n");
- if (!q->elevator)
+ if (!q->elevator) {
len += sprintf(name+len, "[none] ");
- else
- elv = e->type;
+ } else {
+ len += sprintf(name+len, "none ");
+ cur = eq->type;
+ }
spin_lock(&elv_list_lock);
- list_for_each_entry(__e, &elv_list, list) {
- if (elv && elevator_match(elv, __e->elevator_name, 0)) {
- len += sprintf(name+len, "[%s] ", elv->elevator_name);
- continue;
- }
- if (elv_support_iosched(q) &&
- elevator_match(__e, __e->elevator_name,
- q->required_elevator_features))
- len += sprintf(name+len, "%s ", __e->elevator_name);
+ list_for_each_entry(e, &elv_list, list) {
+ if (e == cur)
+ len += sprintf(name+len, "[%s] ", e->elevator_name);
+ else if (elv_support_features(q, e))
+ len += sprintf(name+len, "%s ", e->elevator_name);
}
spin_unlock(&elv_list_lock);
- if (q->elevator)
- len += sprintf(name+len, "none");
-
- len += sprintf(len+name, "\n");
+ len += sprintf(name+len, "\n");
return len;
}
diff --git a/block/elevator.h b/block/elevator.h
index 3f0593b3bf9d..774a8f6b99e6 100644
--- a/block/elevator.h
+++ b/block/elevator.h
@@ -84,6 +84,21 @@ struct elevator_type
struct list_head list;
};
+static inline bool elevator_tryget(struct elevator_type *e)
+{
+ return try_module_get(e->elevator_owner);
+}
+
+static inline void __elevator_get(struct elevator_type *e)
+{
+ __module_get(e->elevator_owner);
+}
+
+static inline void elevator_put(struct elevator_type *e)
+{
+ module_put(e->elevator_owner);
+}
+
#define ELV_HASH_BITS 6
void elv_rqhash_del(struct request_queue *q, struct request *rq);
@@ -100,10 +115,13 @@ struct elevator_queue
void *elevator_data;
struct kobject kobj;
struct mutex sysfs_lock;
- unsigned int registered:1;
+ unsigned long flags;
DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
};
+#define ELEVATOR_FLAG_REGISTERED 0
+#define ELEVATOR_FLAG_DISABLE_WBT 1
+
/*
* block elevator interface
*/
diff --git a/block/fops.c b/block/fops.c
index b90742595317..50d245e8c913 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -405,12 +405,6 @@ static int blkdev_write_end(struct file *file, struct address_space *mapping,
return ret;
}
-static int blkdev_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- return generic_writepages(mapping, wbc);
-}
-
const struct address_space_operations def_blk_aops = {
.dirty_folio = block_dirty_folio,
.invalidate_folio = block_invalidate_folio,
@@ -419,7 +413,6 @@ const struct address_space_operations def_blk_aops = {
.writepage = blkdev_writepage,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
- .writepages = blkdev_writepages,
.direct_IO = blkdev_direct_IO,
.migrate_folio = buffer_migrate_folio_norefs,
.is_dirty_writeback = buffer_check_dirty_writeback,
diff --git a/block/genhd.c b/block/genhd.c
index 0f9769db2de8..08f76135a637 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -356,7 +356,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action)
}
EXPORT_SYMBOL_GPL(disk_uevent);
-int disk_scan_partitions(struct gendisk *disk, fmode_t mode)
+int disk_scan_partitions(struct gendisk *disk, fmode_t mode, void *owner)
{
struct block_device *bdev;
@@ -366,6 +366,9 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode)
return -EINVAL;
if (disk->open_partitions)
return -EBUSY;
+ /* Someone else has bdev exclusively open? */
+ if (disk->part0->bd_holder && disk->part0->bd_holder != owner)
+ return -EBUSY;
set_bit(GD_NEED_PART_SCAN, &disk->state);
bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL);
@@ -479,10 +482,6 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
goto out_put_holder_dir;
}
- ret = bd_register_pending_holders(disk);
- if (ret < 0)
- goto out_put_slave_dir;
-
ret = blk_register_queue(disk);
if (ret)
goto out_put_slave_dir;
@@ -500,7 +499,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
bdev_add(disk->part0, ddev->devt);
if (get_capacity(disk))
- disk_scan_partitions(disk, FMODE_READ);
+ disk_scan_partitions(disk, FMODE_READ, NULL);
/*
* Announce the disk and partitions after all partitions are
@@ -530,6 +529,7 @@ out_unregister_queue:
rq_qos_exit(disk->queue);
out_put_slave_dir:
kobject_put(disk->slave_dir);
+ disk->slave_dir = NULL;
out_put_holder_dir:
kobject_put(disk->part0->bd_holder_dir);
out_del_integrity:
@@ -560,6 +560,11 @@ void blk_mark_disk_dead(struct gendisk *disk)
{
set_bit(GD_DEAD, &disk->state);
blk_queue_start_drain(disk->queue);
+
+ /*
+ * Stop buffered writers from dirtying pages that can't be written out.
+ */
+ set_capacity_and_notify(disk, 0);
}
EXPORT_SYMBOL_GPL(blk_mark_disk_dead);
@@ -629,6 +634,7 @@ void del_gendisk(struct gendisk *disk)
kobject_put(disk->part0->bd_holder_dir);
kobject_put(disk->slave_dir);
+ disk->slave_dir = NULL;
part_stat_set_all(disk->part0, 0);
disk->part0->bd_stamp = 0;
@@ -643,7 +649,9 @@ void del_gendisk(struct gendisk *disk)
blk_sync_queue(q);
blk_flush_integrity();
- blk_mq_cancel_work_sync(q);
+
+ if (queue_is_mq(q))
+ blk_mq_cancel_work_sync(q);
blk_mq_quiesce_queue(q);
if (q->elevator) {
@@ -1193,21 +1201,10 @@ struct class block_class = {
.dev_uevent = block_uevent,
};
-static char *block_devnode(struct device *dev, umode_t *mode,
- kuid_t *uid, kgid_t *gid)
-{
- struct gendisk *disk = dev_to_disk(dev);
-
- if (disk->fops->devnode)
- return disk->fops->devnode(disk, mode);
- return NULL;
-}
-
const struct device_type disk_type = {
.name = "disk",
.groups = disk_attr_groups,
.release = disk_release,
- .devnode = block_devnode,
};
#ifdef CONFIG_PROC_FS
@@ -1412,7 +1409,7 @@ struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
struct request_queue *q;
struct gendisk *disk;
- q = blk_alloc_queue(node, false);
+ q = blk_alloc_queue(node);
if (!q)
return NULL;
diff --git a/block/holder.c b/block/holder.c
index 5283bc804cc1..37d18c13d958 100644
--- a/block/holder.c
+++ b/block/holder.c
@@ -4,7 +4,7 @@
struct bd_holder_disk {
struct list_head list;
- struct block_device *bdev;
+ struct kobject *holder_dir;
int refcnt;
};
@@ -14,7 +14,7 @@ static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
struct bd_holder_disk *holder;
list_for_each_entry(holder, &disk->slave_bdevs, list)
- if (holder->bdev == bdev)
+ if (holder->holder_dir == bdev->bd_holder_dir)
return holder;
return NULL;
}
@@ -29,19 +29,6 @@ static void del_symlink(struct kobject *from, struct kobject *to)
sysfs_remove_link(from, kobject_name(to));
}
-static int __link_disk_holder(struct block_device *bdev, struct gendisk *disk)
-{
- int ret;
-
- ret = add_symlink(disk->slave_dir, bdev_kobj(bdev));
- if (ret)
- return ret;
- ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
- if (ret)
- del_symlink(disk->slave_dir, bdev_kobj(bdev));
- return ret;
-}
-
/**
* bd_link_disk_holder - create symlinks between holding disk and slave bdev
* @bdev: the claimed slave bdev
@@ -75,12 +62,30 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
struct bd_holder_disk *holder;
int ret = 0;
- mutex_lock(&disk->open_mutex);
+ if (WARN_ON_ONCE(!disk->slave_dir))
+ return -EINVAL;
+
+ if (bdev->bd_disk == disk)
+ return -EINVAL;
+
+ /*
+ * del_gendisk drops the initial reference to bd_holder_dir, so we
+ * need to keep our own here to allow for cleanup past that point.
+ */
+ mutex_lock(&bdev->bd_disk->open_mutex);
+ if (!disk_live(bdev->bd_disk)) {
+ mutex_unlock(&bdev->bd_disk->open_mutex);
+ return -ENODEV;
+ }
+ kobject_get(bdev->bd_holder_dir);
+ mutex_unlock(&bdev->bd_disk->open_mutex);
+ mutex_lock(&disk->open_mutex);
WARN_ON_ONCE(!bdev->bd_holder);
holder = bd_find_holder_disk(bdev, disk);
if (holder) {
+ kobject_put(bdev->bd_holder_dir);
holder->refcnt++;
goto out_unlock;
}
@@ -92,36 +97,32 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
}
INIT_LIST_HEAD(&holder->list);
- holder->bdev = bdev;
holder->refcnt = 1;
- if (disk->slave_dir) {
- ret = __link_disk_holder(bdev, disk);
- if (ret) {
- kfree(holder);
- goto out_unlock;
- }
- }
+ holder->holder_dir = bdev->bd_holder_dir;
+ ret = add_symlink(disk->slave_dir, bdev_kobj(bdev));
+ if (ret)
+ goto out_free_holder;
+ ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
+ if (ret)
+ goto out_del_symlink;
list_add(&holder->list, &disk->slave_bdevs);
- /*
- * del_gendisk drops the initial reference to bd_holder_dir, so we need
- * to keep our own here to allow for cleanup past that point.
- */
- kobject_get(bdev->bd_holder_dir);
+ mutex_unlock(&disk->open_mutex);
+ return 0;
+
+out_del_symlink:
+ del_symlink(disk->slave_dir, bdev_kobj(bdev));
+out_free_holder:
+ kfree(holder);
out_unlock:
mutex_unlock(&disk->open_mutex);
+ if (ret)
+ kobject_put(bdev->bd_holder_dir);
return ret;
}
EXPORT_SYMBOL_GPL(bd_link_disk_holder);
-static void __unlink_disk_holder(struct block_device *bdev,
- struct gendisk *disk)
-{
- del_symlink(disk->slave_dir, bdev_kobj(bdev));
- del_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
-}
-
/**
* bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
* @bdev: the calimed slave bdev
@@ -136,36 +137,18 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
{
struct bd_holder_disk *holder;
+ if (WARN_ON_ONCE(!disk->slave_dir))
+ return;
+
mutex_lock(&disk->open_mutex);
holder = bd_find_holder_disk(bdev, disk);
if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
- if (disk->slave_dir)
- __unlink_disk_holder(bdev, disk);
- kobject_put(bdev->bd_holder_dir);
+ del_symlink(disk->slave_dir, bdev_kobj(bdev));
+ del_symlink(holder->holder_dir, &disk_to_dev(disk)->kobj);
+ kobject_put(holder->holder_dir);
list_del_init(&holder->list);
kfree(holder);
}
mutex_unlock(&disk->open_mutex);
}
EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
-
-int bd_register_pending_holders(struct gendisk *disk)
-{
- struct bd_holder_disk *holder;
- int ret;
-
- mutex_lock(&disk->open_mutex);
- list_for_each_entry(holder, &disk->slave_bdevs, list) {
- ret = __link_disk_holder(holder->bdev, disk);
- if (ret)
- goto out_undo;
- }
- mutex_unlock(&disk->open_mutex);
- return 0;
-
-out_undo:
- list_for_each_entry_continue_reverse(holder, &disk->slave_bdevs, list)
- __unlink_disk_holder(holder->bdev, disk);
- mutex_unlock(&disk->open_mutex);
- return ret;
-}
diff --git a/block/ioctl.c b/block/ioctl.c
index 60121e89052b..96617512982e 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -467,9 +467,10 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode,
* user space. Note the separate arg/argp parameters that are needed
* to deal with the compat_ptr() conversion.
*/
-static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned cmd, unsigned long arg, void __user *argp)
+static int blkdev_common_ioctl(struct file *file, fmode_t mode, unsigned cmd,
+ unsigned long arg, void __user *argp)
{
+ struct block_device *bdev = I_BDEV(file->f_mapping->host);
unsigned int max_sectors;
switch (cmd) {
@@ -527,7 +528,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
return -EACCES;
if (bdev_is_partition(bdev))
return -EINVAL;
- return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL);
+ return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL,
+ file);
case BLKTRACESTART:
case BLKTRACESTOP:
case BLKTRACETEARDOWN:
@@ -605,7 +607,7 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
break;
}
- ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp);
+ ret = blkdev_common_ioctl(file, mode, cmd, arg, argp);
if (ret != -ENOIOCTLCMD)
return ret;
@@ -674,7 +676,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
break;
}
- ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp);
+ ret = blkdev_common_ioctl(file, mode, cmd, arg, argp);
if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl)
ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg);
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 5639921dfa92..f10c2a0d18d4 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -131,6 +131,20 @@ static u8 dd_rq_ioclass(struct request *rq)
}
/*
+ * get the request before `rq' in sector-sorted order
+ */
+static inline struct request *
+deadline_earlier_request(struct request *rq)
+{
+ struct rb_node *node = rb_prev(&rq->rb_node);
+
+ if (node)
+ return rb_entry_rq(node);
+
+ return NULL;
+}
+
+/*
* get the request after `rq' in sector-sorted order
*/
static inline struct request *
@@ -278,6 +292,39 @@ static inline int deadline_check_fifo(struct dd_per_prio *per_prio,
}
/*
+ * Check if rq has a sequential request preceding it.
+ */
+static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq)
+{
+ struct request *prev = deadline_earlier_request(rq);
+
+ if (!prev)
+ return false;
+
+ return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq);
+}
+
+/*
+ * Skip all write requests that are sequential from @rq, even if we cross
+ * a zone boundary.
+ */
+static struct request *deadline_skip_seq_writes(struct deadline_data *dd,
+ struct request *rq)
+{
+ sector_t pos = blk_rq_pos(rq);
+ sector_t skipped_sectors = 0;
+
+ while (rq) {
+ if (blk_rq_pos(rq) != pos + skipped_sectors)
+ break;
+ skipped_sectors += blk_rq_sectors(rq);
+ rq = deadline_latter_request(rq);
+ }
+
+ return rq;
+}
+
+/*
* For the specified data direction, return the next request to
* dispatch using arrival ordered lists.
*/
@@ -297,11 +344,16 @@ deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
/*
* Look for a write request that can be dispatched, that is one with
- * an unlocked target zone.
+ * an unlocked target zone. For some HDDs, breaking a sequential
+ * write stream can lead to lower throughput, so make sure to preserve
+ * sequential write streams, even if that stream crosses into the next
+ * zones and these zones are unlocked.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) {
- if (blk_req_can_dispatch_to_zone(rq))
+ if (blk_req_can_dispatch_to_zone(rq) &&
+ (blk_queue_nonrot(rq->q) ||
+ !deadline_is_seq_write(dd, rq)))
goto out;
}
rq = NULL;
@@ -331,13 +383,19 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
/*
* Look for a write request that can be dispatched, that is one with
- * an unlocked target zone.
+ * an unlocked target zone. For some HDDs, breaking a sequential
+ * write stream can lead to lower throughput, so make sure to preserve
+ * sequential write streams, even if that stream crosses into the next
+ * zones and these zones are unlocked.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
while (rq) {
if (blk_req_can_dispatch_to_zone(rq))
break;
- rq = deadline_latter_request(rq);
+ if (blk_queue_nonrot(rq->q))
+ rq = deadline_latter_request(rq);
+ else
+ rq = deadline_skip_seq_writes(dd, rq);
}
spin_unlock_irqrestore(&dd->zone_lock, flags);
@@ -789,6 +847,18 @@ static void dd_prepare_request(struct request *rq)
rq->elv.priv[0] = NULL;
}
+static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx)
+{
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+ enum dd_prio p;
+
+ for (p = 0; p <= DD_PRIO_MAX; p++)
+ if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE]))
+ return true;
+
+ return false;
+}
+
/*
* Callback from inside blk_mq_free_request().
*
@@ -828,9 +898,10 @@ static void dd_finish_request(struct request *rq)
spin_lock_irqsave(&dd->zone_lock, flags);
blk_req_zone_write_unlock(rq);
- if (!list_empty(&per_prio->fifo_list[DD_WRITE]))
- blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+ if (dd_has_write_work(rq->mq_hctx))
+ blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
}
}
diff --git a/block/sed-opal.c b/block/sed-opal.c
index 9bdb833e5817..463873f61e01 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -2461,6 +2461,44 @@ static int __opal_set_mbr_done(struct opal_dev *dev, struct opal_key *key)
return execute_steps(dev, mbrdone_step, ARRAY_SIZE(mbrdone_step));
}
+static void opal_lock_check_for_saved_key(struct opal_dev *dev,
+ struct opal_lock_unlock *lk_unlk)
+{
+ struct opal_suspend_data *iter;
+
+ if (lk_unlk->l_state != OPAL_LK ||
+ lk_unlk->session.opal_key.key_len > 0)
+ return;
+
+ /*
+ * Usually when closing a crypto device (eg: dm-crypt with LUKS) the
+ * volume key is not required, as it requires root privileges anyway,
+ * and root can deny access to a disk in many ways regardless.
+ * Requiring the volume key to lock the device is a peculiarity of the
+ * OPAL specification. Given we might already have saved the key if
+ * the user requested it via the 'IOC_OPAL_SAVE' ioctl, we can use
+ * that key to lock the device if no key was provided here, the
+ * locking range matches and the appropriate flag was passed with
+ * 'IOC_OPAL_SAVE'.
+ * This allows integrating OPAL with tools and libraries that are used
+ * to the common behaviour and do not ask for the volume key when
+ * closing a device.
+ */
+ setup_opal_dev(dev);
+ list_for_each_entry(iter, &dev->unlk_lst, node) {
+ if ((iter->unlk.flags & OPAL_SAVE_FOR_LOCK) &&
+ iter->lr == lk_unlk->session.opal_key.lr &&
+ iter->unlk.session.opal_key.key_len > 0) {
+ lk_unlk->session.opal_key.key_len =
+ iter->unlk.session.opal_key.key_len;
+ memcpy(lk_unlk->session.opal_key.key,
+ iter->unlk.session.opal_key.key,
+ iter->unlk.session.opal_key.key_len);
+ break;
+ }
+ }
+}
+
static int opal_lock_unlock(struct opal_dev *dev,
struct opal_lock_unlock *lk_unlk)
{
@@ -2470,6 +2508,7 @@ static int opal_lock_unlock(struct opal_dev *dev,
return -EINVAL;
mutex_lock(&dev->dev_lock);
+ opal_lock_check_for_saved_key(dev, lk_unlk);
ret = __opal_lock_unlock(dev, lk_unlk);
mutex_unlock(&dev->dev_lock);
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index a41145d52de9..a2184b428493 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -285,49 +285,6 @@ config BLK_DEV_RAM_SIZE
The default value is 4096 kilobytes. Only change this if you know
what you are doing.
-config CDROM_PKTCDVD
- tristate "Packet writing on CD/DVD media (DEPRECATED)"
- depends on !UML
- depends on SCSI
- select CDROM
- help
- Note: This driver is deprecated and will be removed from the
- kernel in the near future!
-
- If you have a CDROM/DVD drive that supports packet writing, say
- Y to include support. It should work with any MMC/Mt Fuji
- compliant ATAPI or SCSI drive, which is just about any newer
- DVD/CD writer.
-
- Currently only writing to CD-RW, DVD-RW, DVD+RW and DVDRAM discs
- is possible.
- DVD-RW disks must be in restricted overwrite mode.
-
- See the file <file:Documentation/cdrom/packet-writing.rst>
- for further information on the use of this driver.
-
- To compile this driver as a module, choose M here: the
- module will be called pktcdvd.
-
-config CDROM_PKTCDVD_BUFFERS
- int "Free buffers for data gathering"
- depends on CDROM_PKTCDVD
- default "8"
- help
- This controls the maximum number of active concurrent packets. More
- concurrent packets can increase write performance, but also require
- more memory. Each concurrent packet will require approximately 64Kb
- of non-swappable kernel memory, memory which will be allocated when
- a disc is opened for writing.
-
-config CDROM_PKTCDVD_WCACHE
- bool "Enable write caching"
- depends on CDROM_PKTCDVD
- help
- If enabled, write caching will be set for the CD-R/W device. For now
- this option is dangerous unless the CD-RW media is known good, as we
- don't do deferred write error handling yet.
-
config ATA_OVER_ETH
tristate "ATA over Ethernet support"
depends on NET
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 101612cba303..962ee65d8ca3 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -20,7 +20,6 @@ obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o
obj-$(CONFIG_N64CART) += n64cart.o
obj-$(CONFIG_BLK_DEV_RAM) += brd.o
obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
-obj-$(CONFIG_CDROM_PKTCDVD) += pktcdvd.o
obj-$(CONFIG_SUNVDC) += sunvdc.o
obj-$(CONFIG_BLK_DEV_NBD) += nbd.o
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
index cbacddc55a1d..6fb4e38fca88 100644
--- a/drivers/block/drbd/Kconfig
+++ b/drivers/block/drbd/Kconfig
@@ -1,4 +1,4 @@
-# SPDX-License-Identifier: GPL-2.0
+# SPDX-License-Identifier: GPL-2.0-only
#
# DRBD device driver configuration
#
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
index 8bd534697d1b..c93e462130ff 100644
--- a/drivers/block/drbd/Makefile
+++ b/drivers/block/drbd/Makefile
@@ -1,4 +1,4 @@
-# SPDX-License-Identifier: GPL-2.0
+# SPDX-License-Identifier: GPL-2.0-only
drbd-y := drbd_bitmap.o drbd_proc.o
drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index e27478ae579c..429255876800 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-License-Identifier: GPL-2.0-only
/*
drbd_actlog.c
@@ -868,9 +868,9 @@ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
nr_sectors = get_capacity(device->vdisk);
esector = sector + (size >> 9) - 1;
- if (!expect(sector < nr_sectors))
+ if (!expect(device, sector < nr_sectors))
goto out;
- if (!expect(esector < nr_sectors))
+ if (!expect(device, esector < nr_sectors))
esector = nr_sectors - 1;
lbnr = BM_SECT_TO_BIT(nr_sectors-1);
@@ -1143,7 +1143,7 @@ void drbd_rs_complete_io(struct drbd_device *device, sector_t sector)
bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
if (!bm_ext) {
spin_unlock_irqrestore(&device->al_lock, flags);
- if (__ratelimit(&drbd_ratelimit_state))
+ if (drbd_ratelimit())
drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n");
return;
}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 7d9db33363de..289876ffbc31 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-License-Identifier: GPL-2.0-only
/*
drbd_bitmap.c
@@ -113,7 +113,7 @@ struct drbd_bitmap {
static void __bm_print_lock_info(struct drbd_device *device, const char *func)
{
struct drbd_bitmap *b = device->bitmap;
- if (!__ratelimit(&drbd_ratelimit_state))
+ if (!drbd_ratelimit())
return;
drbd_err(device, "FIXME %s[%d] in %s, bitmap locked for '%s' by %s[%d]\n",
current->comm, task_pid_nr(current),
@@ -448,7 +448,7 @@ int drbd_bm_init(struct drbd_device *device)
sector_t drbd_bm_capacity(struct drbd_device *device)
{
- if (!expect(device->bitmap))
+ if (!expect(device, device->bitmap))
return 0;
return device->bitmap->bm_dev_capacity;
}
@@ -457,7 +457,7 @@ sector_t drbd_bm_capacity(struct drbd_device *device)
*/
void drbd_bm_cleanup(struct drbd_device *device)
{
- if (!expect(device->bitmap))
+ if (!expect(device, device->bitmap))
return;
bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages);
bm_vk_free(device->bitmap->bm_pages);
@@ -636,7 +636,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi
int err = 0;
bool growing;
- if (!expect(b))
+ if (!expect(device, b))
return -ENOMEM;
drbd_bm_lock(device, "resize", BM_LOCKED_MASK);
@@ -757,9 +757,9 @@ unsigned long _drbd_bm_total_weight(struct drbd_device *device)
unsigned long s;
unsigned long flags;
- if (!expect(b))
+ if (!expect(device, b))
return 0;
- if (!expect(b->bm_pages))
+ if (!expect(device, b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
@@ -783,9 +783,9 @@ unsigned long drbd_bm_total_weight(struct drbd_device *device)
size_t drbd_bm_words(struct drbd_device *device)
{
struct drbd_bitmap *b = device->bitmap;
- if (!expect(b))
+ if (!expect(device, b))
return 0;
- if (!expect(b->bm_pages))
+ if (!expect(device, b->bm_pages))
return 0;
return b->bm_words;
@@ -794,7 +794,7 @@ size_t drbd_bm_words(struct drbd_device *device)
unsigned long drbd_bm_bits(struct drbd_device *device)
{
struct drbd_bitmap *b = device->bitmap;
- if (!expect(b))
+ if (!expect(device, b))
return 0;
return b->bm_bits;
@@ -816,9 +816,9 @@ void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number,
end = offset + number;
- if (!expect(b))
+ if (!expect(device, b))
return;
- if (!expect(b->bm_pages))
+ if (!expect(device, b->bm_pages))
return;
if (number == 0)
return;
@@ -863,9 +863,9 @@ void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number,
end = offset + number;
- if (!expect(b))
+ if (!expect(device, b))
return;
- if (!expect(b->bm_pages))
+ if (!expect(device, b->bm_pages))
return;
spin_lock_irq(&b->bm_lock);
@@ -894,9 +894,9 @@ void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number,
void drbd_bm_set_all(struct drbd_device *device)
{
struct drbd_bitmap *b = device->bitmap;
- if (!expect(b))
+ if (!expect(device, b))
return;
- if (!expect(b->bm_pages))
+ if (!expect(device, b->bm_pages))
return;
spin_lock_irq(&b->bm_lock);
@@ -910,9 +910,9 @@ void drbd_bm_set_all(struct drbd_device *device)
void drbd_bm_clear_all(struct drbd_device *device)
{
struct drbd_bitmap *b = device->bitmap;
- if (!expect(b))
+ if (!expect(device, b))
return;
- if (!expect(b->bm_pages))
+ if (!expect(device, b->bm_pages))
return;
spin_lock_irq(&b->bm_lock);
@@ -952,7 +952,7 @@ static void drbd_bm_endio(struct bio *bio)
bm_set_page_io_err(b->bm_pages[idx]);
/* Not identical to on disk version of it.
* Is BM_PAGE_IO_ERROR enough? */
- if (__ratelimit(&drbd_ratelimit_state))
+ if (drbd_ratelimit())
drbd_err(device, "IO ERROR %d on bitmap page idx %u\n",
bio->bi_status, idx);
} else {
@@ -1013,7 +1013,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho
else
len = PAGE_SIZE;
} else {
- if (__ratelimit(&drbd_ratelimit_state)) {
+ if (drbd_ratelimit()) {
drbd_err(device, "Invalid offset during on-disk bitmap access: "
"page idx %u, sector %llu\n", page_nr, on_disk_sector);
}
@@ -1332,9 +1332,9 @@ static unsigned long bm_find_next(struct drbd_device *device,
struct drbd_bitmap *b = device->bitmap;
unsigned long i = DRBD_END_OF_BITMAP;
- if (!expect(b))
+ if (!expect(device, b))
return i;
- if (!expect(b->bm_pages))
+ if (!expect(device, b->bm_pages))
return i;
spin_lock_irq(&b->bm_lock);
@@ -1436,9 +1436,9 @@ static int bm_change_bits_to(struct drbd_device *device, const unsigned long s,
struct drbd_bitmap *b = device->bitmap;
int c = 0;
- if (!expect(b))
+ if (!expect(device, b))
return 1;
- if (!expect(b->bm_pages))
+ if (!expect(device, b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
@@ -1582,9 +1582,9 @@ int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr)
unsigned long *p_addr;
int i;
- if (!expect(b))
+ if (!expect(device, b))
return 0;
- if (!expect(b->bm_pages))
+ if (!expect(device, b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
@@ -1619,9 +1619,9 @@ int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const
* robust in case we screwed up elsewhere, in that case pretend there
* was one dirty bit in the requested area, so we won't try to do a
* local read there (no bitmap probably implies no disk) */
- if (!expect(b))
+ if (!expect(device, b))
return 1;
- if (!expect(b->bm_pages))
+ if (!expect(device, b->bm_pages))
return 1;
spin_lock_irqsave(&b->bm_lock, flags);
@@ -1635,7 +1635,7 @@ int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const
bm_unmap(p_addr);
p_addr = bm_map_pidx(b, idx);
}
- if (expect(bitnr < b->bm_bits))
+ if (expect(device, bitnr < b->bm_bits))
c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
else
drbd_err(device, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
@@ -1668,9 +1668,9 @@ int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
unsigned long flags;
unsigned long *p_addr, *bm;
- if (!expect(b))
+ if (!expect(device, b))
return 0;
- if (!expect(b->bm_pages))
+ if (!expect(device, b->bm_pages))
return 0;
spin_lock_irqsave(&b->bm_lock, flags);
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
index b3b9cd5628fd..a72c096aa5b1 100644
--- a/drivers/block/drbd/drbd_debugfs.c
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
#define pr_fmt(fmt) "drbd debugfs: " fmt
#include <linux/kernel.h>
#include <linux/module.h>
diff --git a/drivers/block/drbd/drbd_debugfs.h b/drivers/block/drbd/drbd_debugfs.h
index 58e31cef0844..ee3d66eb40c6 100644
--- a/drivers/block/drbd/drbd_debugfs.h
+++ b/drivers/block/drbd/drbd_debugfs.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0-only */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/debugfs.h>
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 4d661282ff41..ae713338aa46 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
drbd_int.h
@@ -37,6 +37,7 @@
#include "drbd_strings.h"
#include "drbd_state.h"
#include "drbd_protocol.h"
+#include "drbd_polymorph_printk.h"
#ifdef __CHECKER__
# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
@@ -75,71 +76,6 @@ extern int drbd_proc_details;
struct drbd_device;
struct drbd_connection;
-#define __drbd_printk_device(level, device, fmt, args...) \
- dev_printk(level, disk_to_dev((device)->vdisk), fmt, ## args)
-#define __drbd_printk_peer_device(level, peer_device, fmt, args...) \
- dev_printk(level, disk_to_dev((peer_device)->device->vdisk), fmt, ## args)
-#define __drbd_printk_resource(level, resource, fmt, args...) \
- printk(level "drbd %s: " fmt, (resource)->name, ## args)
-#define __drbd_printk_connection(level, connection, fmt, args...) \
- printk(level "drbd %s: " fmt, (connection)->resource->name, ## args)
-
-void drbd_printk_with_wrong_object_type(void);
-
-#define __drbd_printk_if_same_type(obj, type, func, level, fmt, args...) \
- (__builtin_types_compatible_p(typeof(obj), type) || \
- __builtin_types_compatible_p(typeof(obj), const type)), \
- func(level, (const type)(obj), fmt, ## args)
-
-#define drbd_printk(level, obj, fmt, args...) \
- __builtin_choose_expr( \
- __drbd_printk_if_same_type(obj, struct drbd_device *, \
- __drbd_printk_device, level, fmt, ## args), \
- __builtin_choose_expr( \
- __drbd_printk_if_same_type(obj, struct drbd_resource *, \
- __drbd_printk_resource, level, fmt, ## args), \
- __builtin_choose_expr( \
- __drbd_printk_if_same_type(obj, struct drbd_connection *, \
- __drbd_printk_connection, level, fmt, ## args), \
- __builtin_choose_expr( \
- __drbd_printk_if_same_type(obj, struct drbd_peer_device *, \
- __drbd_printk_peer_device, level, fmt, ## args), \
- drbd_printk_with_wrong_object_type()))))
-
-#define drbd_dbg(obj, fmt, args...) \
- drbd_printk(KERN_DEBUG, obj, fmt, ## args)
-#define drbd_alert(obj, fmt, args...) \
- drbd_printk(KERN_ALERT, obj, fmt, ## args)
-#define drbd_err(obj, fmt, args...) \
- drbd_printk(KERN_ERR, obj, fmt, ## args)
-#define drbd_warn(obj, fmt, args...) \
- drbd_printk(KERN_WARNING, obj, fmt, ## args)
-#define drbd_info(obj, fmt, args...) \
- drbd_printk(KERN_INFO, obj, fmt, ## args)
-#define drbd_emerg(obj, fmt, args...) \
- drbd_printk(KERN_EMERG, obj, fmt, ## args)
-
-#define dynamic_drbd_dbg(device, fmt, args...) \
- dynamic_dev_dbg(disk_to_dev(device->vdisk), fmt, ## args)
-
-#define D_ASSERT(device, exp) do { \
- if (!(exp)) \
- drbd_err(device, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__); \
- } while (0)
-
-/**
- * expect - Make an assertion
- *
- * Unlike the assert macro, this macro returns a boolean result.
- */
-#define expect(exp) ({ \
- bool _bool = (exp); \
- if (!_bool) \
- drbd_err(device, "ASSERTION %s FAILED in %s\n", \
- #exp, __func__); \
- _bool; \
- })
-
/* Defines to control fault insertion */
enum {
DRBD_FAULT_MD_WR = 0, /* meta data write */
@@ -395,6 +331,7 @@ struct drbd_peer_request {
struct drbd_peer_device *peer_device;
struct drbd_epoch *epoch; /* for writes */
struct page *pages;
+ blk_opf_t opf;
atomic_t pending_bios;
struct drbd_interval i;
/* see comments on ee flag bits below */
@@ -406,6 +343,10 @@ struct drbd_peer_request {
};
};
+/* Equivalent to bio_op and req_op. */
+#define peer_req_op(peer_req) \
+ ((peer_req)->opf & REQ_OP_MASK)
+
/* ee flag bits.
* While corresponding bios are in flight, the only modification will be
* set_bit WAS_ERROR, which has to be atomic.
@@ -1545,8 +1486,7 @@ extern void drbd_send_acks_wf(struct work_struct *ws);
extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
bool throttle_if_app_is_waiting);
-extern int drbd_submit_peer_request(struct drbd_device *,
- struct drbd_peer_request *, blk_opf_t, int);
+extern int drbd_submit_peer_request(struct drbd_peer_request *peer_req);
extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
sector_t, unsigned int,
@@ -1718,7 +1658,7 @@ static inline void __drbd_chk_io_error_(struct drbd_device *device,
switch (ep) {
case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */
if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) {
- if (__ratelimit(&drbd_ratelimit_state))
+ if (drbd_ratelimit())
drbd_err(device, "Local IO failed in %s.\n", where);
if (device->state.disk > D_INCONSISTENT)
_drbd_set_state(_NS(device, disk, D_INCONSISTENT), CS_HARD, NULL);
diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c
index f07b4378388b..5024ffd6143d 100644
--- a/drivers/block/drbd/drbd_interval.c
+++ b/drivers/block/drbd/drbd_interval.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
#include <asm/bug.h>
#include <linux/rbtree_augmented.h>
#include "drbd_interval.h"
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h
index b8c2dee5edc8..366489b72fe9 100644
--- a/drivers/block/drbd/drbd_interval.h
+++ b/drivers/block/drbd/drbd_interval.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __DRBD_INTERVAL_H
#define __DRBD_INTERVAL_H
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 0aa1dde07a98..2f16e1bfb6e7 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-License-Identifier: GPL-2.0-only
/*
drbd.c
@@ -1259,7 +1259,7 @@ static int _drbd_send_bitmap(struct drbd_device *device)
struct bm_xfer_ctx c;
int err;
- if (!expect(device->bitmap))
+ if (!expect(device, device->bitmap))
return false;
if (get_ldev(device)) {
@@ -2217,7 +2217,8 @@ void drbd_destroy_device(struct kref *kref)
kref_put(&peer_device->connection->kref, drbd_destroy_connection);
kfree(peer_device);
}
- memset(device, 0xfd, sizeof(*device));
+ if (device->submit.wq)
+ destroy_workqueue(device->submit.wq);
kfree(device);
kref_put(&resource->kref, drbd_destroy_resource);
}
@@ -2249,9 +2250,9 @@ static void do_retry(struct work_struct *ws)
bool expected;
expected =
- expect(atomic_read(&req->completion_ref) == 0) &&
- expect(req->rq_state & RQ_POSTPONED) &&
- expect((req->rq_state & RQ_LOCAL_PENDING) == 0 ||
+ expect(device, atomic_read(&req->completion_ref) == 0) &&
+ expect(device, req->rq_state & RQ_POSTPONED) &&
+ expect(device, (req->rq_state & RQ_LOCAL_PENDING) == 0 ||
(req->rq_state & RQ_LOCAL_ABORTED) != 0);
if (!expected)
@@ -2309,7 +2310,6 @@ void drbd_destroy_resource(struct kref *kref)
idr_destroy(&resource->devices);
free_cpumask_var(resource->cpu_mask);
kfree(resource->name);
- memset(resource, 0xf2, sizeof(*resource));
kfree(resource);
}
@@ -2650,7 +2650,6 @@ void drbd_destroy_connection(struct kref *kref)
drbd_free_socket(&connection->data);
kfree(connection->int_dig_in);
kfree(connection->int_dig_vv);
- memset(connection, 0xfc, sizeof(*connection));
kfree(connection);
kref_put(&resource->kref, drbd_destroy_resource);
}
@@ -2774,7 +2773,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
err = add_disk(disk);
if (err)
- goto out_idr_remove_from_resource;
+ goto out_destroy_workqueue;
/* inherit the connection state */
device->state.conn = first_connection(resource)->cstate;
@@ -2788,6 +2787,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig
drbd_debugfs_device_add(device);
return NO_ERROR;
+out_destroy_workqueue:
+ destroy_workqueue(device->submit.wq);
out_idr_remove_from_resource:
for_each_connection_safe(connection, n, resource) {
peer_device = idr_remove(&connection->peer_devices, vnr);
@@ -3766,7 +3767,7 @@ _drbd_insert_fault(struct drbd_device *device, unsigned int type)
if (ret) {
drbd_fault_count++;
- if (__ratelimit(&drbd_ratelimit_state))
+ if (drbd_ratelimit())
drbd_warn(device, "***Simulating %s failure\n",
_drbd_fault_str(type));
}
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 864c98e74875..60757ac31701 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-License-Identifier: GPL-2.0-only
/*
drbd_nl.c
@@ -1210,6 +1210,7 @@ static void decide_on_discard_support(struct drbd_device *device,
struct drbd_connection *connection =
first_peer_device(device)->connection;
struct request_queue *q = device->rq_queue;
+ unsigned int max_discard_sectors;
if (bdev && !bdev_max_discard_sectors(bdev->backing_bdev))
goto not_supported;
@@ -1230,15 +1231,14 @@ static void decide_on_discard_support(struct drbd_device *device,
* topology on all peers.
*/
blk_queue_discard_granularity(q, 512);
- q->limits.max_discard_sectors = drbd_max_discard_sectors(connection);
- q->limits.max_write_zeroes_sectors =
- drbd_max_discard_sectors(connection);
+ max_discard_sectors = drbd_max_discard_sectors(connection);
+ blk_queue_max_discard_sectors(q, max_discard_sectors);
+ blk_queue_max_write_zeroes_sectors(q, max_discard_sectors);
return;
not_supported:
blk_queue_discard_granularity(q, 0);
- q->limits.max_discard_sectors = 0;
- q->limits.max_write_zeroes_sectors = 0;
+ blk_queue_max_discard_sectors(q, 0);
}
static void fixup_write_zeroes(struct drbd_device *device, struct request_queue *q)
@@ -1256,6 +1256,18 @@ static void fixup_write_zeroes(struct drbd_device *device, struct request_queue
q->limits.max_write_zeroes_sectors = 0;
}
+static void fixup_discard_support(struct drbd_device *device, struct request_queue *q)
+{
+ unsigned int max_discard = device->rq_queue->limits.max_discard_sectors;
+ unsigned int discard_granularity =
+ device->rq_queue->limits.discard_granularity >> SECTOR_SHIFT;
+
+ if (discard_granularity > max_discard) {
+ blk_queue_discard_granularity(q, 0);
+ blk_queue_max_discard_sectors(q, 0);
+ }
+}
+
static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
unsigned int max_bio_size, struct o_qlim *o)
{
@@ -1288,6 +1300,7 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi
disk_update_readahead(device->vdisk);
}
fixup_write_zeroes(device, q);
+ fixup_discard_support(device, q);
}
void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o)
@@ -1530,7 +1543,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
goto fail_unlock;
}
- if (!expect(new_disk_conf->resync_rate >= 1))
+ if (!expect(device, new_disk_conf->resync_rate >= 1))
new_disk_conf->resync_rate = 1;
sanitize_disk_conf(device, new_disk_conf, device->ldev);
diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c
index 6a09b0b98018..df0d241d3f6a 100644
--- a/drivers/block/drbd/drbd_nla.c
+++ b/drivers/block/drbd/drbd_nla.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0
+// SPDX-License-Identifier: GPL-2.0-only
#include <linux/kernel.h>
#include <net/netlink.h>
#include <linux/drbd_genl_api.h>
diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h
index f5eaffb6474e..d3555df0d353 100644
--- a/drivers/block/drbd/drbd_nla.h
+++ b/drivers/block/drbd/drbd_nla.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __DRBD_NLA_H
#define __DRBD_NLA_H
diff --git a/drivers/block/drbd/drbd_polymorph_printk.h b/drivers/block/drbd/drbd_polymorph_printk.h
new file mode 100644
index 000000000000..8e0082d139ba
--- /dev/null
+++ b/drivers/block/drbd/drbd_polymorph_printk.h
@@ -0,0 +1,141 @@
+/* SPDX-License-Identifier: GPL-2.0-only */
+#ifndef DRBD_POLYMORPH_PRINTK_H
+#define DRBD_POLYMORPH_PRINTK_H
+
+#if !defined(CONFIG_DYNAMIC_DEBUG)
+#undef DEFINE_DYNAMIC_DEBUG_METADATA
+#undef __dynamic_pr_debug
+#undef DYNAMIC_DEBUG_BRANCH
+#define DEFINE_DYNAMIC_DEBUG_METADATA(D, F) const char *D = F; ((void)D)
+#define __dynamic_pr_debug(D, F, args...) do { (void)(D); if (0) printk(F, ## args); } while (0)
+#define DYNAMIC_DEBUG_BRANCH(D) false
+#endif
+
+
+#define __drbd_printk_drbd_device_prep(device) \
+ const struct drbd_device *__d = (device); \
+ const struct drbd_resource *__r = __d->resource
+#define __drbd_printk_drbd_device_fmt(fmt) "drbd %s/%u drbd%u: " fmt
+#define __drbd_printk_drbd_device_args() __r->name, __d->vnr, __d->minor
+#define __drbd_printk_drbd_device_unprep()
+
+#define __drbd_printk_drbd_peer_device_prep(peer_device) \
+ const struct drbd_device *__d; \
+ const struct drbd_resource *__r; \
+ __d = (peer_device)->device; \
+ __r = __d->resource
+#define __drbd_printk_drbd_peer_device_fmt(fmt) \
+ "drbd %s/%u drbd%u: " fmt
+#define __drbd_printk_drbd_peer_device_args() \
+ __r->name, __d->vnr, __d->minor
+#define __drbd_printk_drbd_peer_device_unprep()
+
+#define __drbd_printk_drbd_resource_prep(resource) \
+ const struct drbd_resource *__r = resource
+#define __drbd_printk_drbd_resource_fmt(fmt) "drbd %s: " fmt
+#define __drbd_printk_drbd_resource_args() __r->name
+#define __drbd_printk_drbd_resource_unprep(resource)
+
+#define __drbd_printk_drbd_connection_prep(connection) \
+ const struct drbd_connection *__c = (connection); \
+ const struct drbd_resource *__r = __c->resource
+#define __drbd_printk_drbd_connection_fmt(fmt) \
+ "drbd %s: " fmt
+#define __drbd_printk_drbd_connection_args() \
+ __r->name
+#define __drbd_printk_drbd_connection_unprep()
+
+void drbd_printk_with_wrong_object_type(void);
+void drbd_dyn_dbg_with_wrong_object_type(void);
+
+#define __drbd_printk_choose_cond(obj, struct_name) \
+ (__builtin_types_compatible_p(typeof(obj), struct struct_name *) || \
+ __builtin_types_compatible_p(typeof(obj), const struct struct_name *))
+#define __drbd_printk_if_same_type(obj, struct_name, level, fmt, args...) \
+ __drbd_printk_choose_cond(obj, struct_name), \
+({ \
+ __drbd_printk_ ## struct_name ## _prep((const struct struct_name *)(obj)); \
+ printk(level __drbd_printk_ ## struct_name ## _fmt(fmt), \
+ __drbd_printk_ ## struct_name ## _args(), ## args); \
+ __drbd_printk_ ## struct_name ## _unprep(); \
+})
+
+#define drbd_printk(level, obj, fmt, args...) \
+ __builtin_choose_expr( \
+ __drbd_printk_if_same_type(obj, drbd_device, level, fmt, ## args), \
+ __builtin_choose_expr( \
+ __drbd_printk_if_same_type(obj, drbd_resource, level, fmt, ## args), \
+ __builtin_choose_expr( \
+ __drbd_printk_if_same_type(obj, drbd_connection, level, fmt, ## args), \
+ __builtin_choose_expr( \
+ __drbd_printk_if_same_type(obj, drbd_peer_device, level, fmt, ## args), \
+ drbd_printk_with_wrong_object_type()))))
+
+#define __drbd_dyn_dbg_if_same_type(obj, struct_name, fmt, args...) \
+ __drbd_printk_choose_cond(obj, struct_name), \
+({ \
+ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \
+ if (DYNAMIC_DEBUG_BRANCH(descriptor)) { \
+ __drbd_printk_ ## struct_name ## _prep((const struct struct_name *)(obj)); \
+ __dynamic_pr_debug(&descriptor, __drbd_printk_ ## struct_name ## _fmt(fmt), \
+ __drbd_printk_ ## struct_name ## _args(), ## args); \
+ __drbd_printk_ ## struct_name ## _unprep(); \
+ } \
+})
+
+#define dynamic_drbd_dbg(obj, fmt, args...) \
+ __builtin_choose_expr( \
+ __drbd_dyn_dbg_if_same_type(obj, drbd_device, fmt, ## args), \
+ __builtin_choose_expr( \
+ __drbd_dyn_dbg_if_same_type(obj, drbd_resource, fmt, ## args), \
+ __builtin_choose_expr( \
+ __drbd_dyn_dbg_if_same_type(obj, drbd_connection, fmt, ## args), \
+ __builtin_choose_expr( \
+ __drbd_dyn_dbg_if_same_type(obj, drbd_peer_device, fmt, ## args), \
+ drbd_dyn_dbg_with_wrong_object_type()))))
+
+#define drbd_emerg(device, fmt, args...) \
+ drbd_printk(KERN_EMERG, device, fmt, ## args)
+#define drbd_alert(device, fmt, args...) \
+ drbd_printk(KERN_ALERT, device, fmt, ## args)
+#define drbd_crit(device, fmt, args...) \
+ drbd_printk(KERN_CRIT, device, fmt, ## args)
+#define drbd_err(device, fmt, args...) \
+ drbd_printk(KERN_ERR, device, fmt, ## args)
+#define drbd_warn(device, fmt, args...) \
+ drbd_printk(KERN_WARNING, device, fmt, ## args)
+#define drbd_notice(device, fmt, args...) \
+ drbd_printk(KERN_NOTICE, device, fmt, ## args)
+#define drbd_info(device, fmt, args...) \
+ drbd_printk(KERN_INFO, device, fmt, ## args)
+
+
+#define drbd_ratelimit() \
+({ \
+ static DEFINE_RATELIMIT_STATE(_rs, \
+ DEFAULT_RATELIMIT_INTERVAL, \
+ DEFAULT_RATELIMIT_BURST); \
+ __ratelimit(&_rs); \
+})
+
+#define D_ASSERT(x, exp) \
+ do { \
+ if (!(exp)) \
+ drbd_err(x, "ASSERTION %s FAILED in %s\n", \
+ #exp, __func__); \
+ } while (0)
+
+/**
+ * expect - Make an assertion
+ *
+ * Unlike the assert macro, this macro returns a boolean result.
+ */
+#define expect(x, exp) ({ \
+ bool _bool = (exp); \
+ if (!_bool && drbd_ratelimit()) \
+ drbd_err(x, "ASSERTION %s FAILED in %s\n", \
+ #exp, __func__); \
+ _bool; \
+ })
+
+#endif
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 3c0193de2498..2227fb0db1ce 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-License-Identifier: GPL-2.0-only
/*
drbd_proc.c
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
index a882b65ab5d2..56bbca9d7700 100644
--- a/drivers/block/drbd/drbd_protocol.h
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __DRBD_PROTOCOL_H
#define __DRBD_PROTOCOL_H
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 9ace76156e4b..0e58a3187345 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-License-Identifier: GPL-2.0-only
/*
drbd_receiver.c
@@ -413,7 +413,7 @@ void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *
drbd_free_pages(device, peer_req->pages, is_net);
D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
D_ASSERT(device, drbd_interval_empty(&peer_req->i));
- if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
+ if (!expect(device, !(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
drbd_al_complete_io(device, &peer_req->i);
}
@@ -1603,9 +1603,19 @@ static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, stru
drbd_endio_write_sec_final(peer_req);
}
+static int peer_request_fault_type(struct drbd_peer_request *peer_req)
+{
+ if (peer_req_op(peer_req) == REQ_OP_READ) {
+ return peer_req->flags & EE_APPLICATION ?
+ DRBD_FAULT_DT_RD : DRBD_FAULT_RS_RD;
+ } else {
+ return peer_req->flags & EE_APPLICATION ?
+ DRBD_FAULT_DT_WR : DRBD_FAULT_RS_WR;
+ }
+}
+
/**
* drbd_submit_peer_request()
- * @device: DRBD device.
* @peer_req: peer request
*
* May spread the pages to multiple bios,
@@ -1619,10 +1629,9 @@ static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, stru
* on certain Xen deployments.
*/
/* TODO allocate from our own bio_set. */
-int drbd_submit_peer_request(struct drbd_device *device,
- struct drbd_peer_request *peer_req,
- const blk_opf_t opf, const int fault_type)
+int drbd_submit_peer_request(struct drbd_peer_request *peer_req)
{
+ struct drbd_device *device = peer_req->peer_device->device;
struct bio *bios = NULL;
struct bio *bio;
struct page *page = peer_req->pages;
@@ -1667,7 +1676,18 @@ int drbd_submit_peer_request(struct drbd_device *device,
* generated bio, but a bio allocated on behalf of the peer.
*/
next_bio:
- bio = bio_alloc(device->ldev->backing_bdev, nr_pages, opf, GFP_NOIO);
+ /* _DISCARD, _WRITE_ZEROES handled above.
+ * REQ_OP_FLUSH (empty flush) not expected,
+ * should have been mapped to a "drbd protocol barrier".
+ * REQ_OP_SECURE_ERASE: I don't see how we could ever support that.
+ */
+ if (!(peer_req_op(peer_req) == REQ_OP_WRITE ||
+ peer_req_op(peer_req) == REQ_OP_READ)) {
+ drbd_err(device, "Invalid bio op received: 0x%x\n", peer_req->opf);
+ return -EINVAL;
+ }
+
+ bio = bio_alloc(device->ldev->backing_bdev, nr_pages, peer_req->opf, GFP_NOIO);
/* > peer_req->i.sector, unless this is the first bio */
bio->bi_iter.bi_sector = sector;
bio->bi_private = peer_req;
@@ -1697,7 +1717,7 @@ next_bio:
bios = bios->bi_next;
bio->bi_next = NULL;
- drbd_submit_bio_noacct(device, fault_type, bio);
+ drbd_submit_bio_noacct(device, peer_request_fault_type(peer_req), bio);
} while (bios);
return 0;
}
@@ -1853,21 +1873,21 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
/* assume request_size == data_size, but special case trim. */
ds = data_size;
if (trim) {
- if (!expect(data_size == 0))
+ if (!expect(peer_device, data_size == 0))
return NULL;
ds = be32_to_cpu(trim->size);
} else if (zeroes) {
- if (!expect(data_size == 0))
+ if (!expect(peer_device, data_size == 0))
return NULL;
ds = be32_to_cpu(zeroes->size);
}
- if (!expect(IS_ALIGNED(ds, 512)))
+ if (!expect(peer_device, IS_ALIGNED(ds, 512)))
return NULL;
if (trim || zeroes) {
- if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
+ if (!expect(peer_device, ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
return NULL;
- } else if (!expect(ds <= DRBD_MAX_BIO_SIZE))
+ } else if (!expect(peer_device, ds <= DRBD_MAX_BIO_SIZE))
return NULL;
/* even though we trust out peer,
@@ -2051,6 +2071,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
* respective _drbd_clear_done_ee */
peer_req->w.cb = e_end_resync_block;
+ peer_req->opf = REQ_OP_WRITE;
peer_req->submit_jif = jiffies;
spin_lock_irq(&device->resource->req_lock);
@@ -2058,8 +2079,7 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto
spin_unlock_irq(&device->resource->req_lock);
atomic_add(pi->size >> 9, &device->rs_sect_ev);
- if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE,
- DRBD_FAULT_RS_WR) == 0)
+ if (drbd_submit_peer_request(peer_req) == 0)
return 0;
/* don't care for the reason here */
@@ -2145,7 +2165,7 @@ static int receive_RSDataReply(struct drbd_connection *connection, struct packet
* or in drbd_peer_request_endio. */
err = recv_resync_read(peer_device, sector, pi);
} else {
- if (__ratelimit(&drbd_ratelimit_state))
+ if (drbd_ratelimit())
drbd_err(device, "Can not write resync data to local disk.\n");
err = drbd_drain_block(peer_device, pi->size);
@@ -2375,16 +2395,6 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co
return ret;
}
-/* see also bio_flags_to_wire()
- * DRBD_REQ_*, because we need to semantically map the flags to data packet
- * flags and back. We may replicate to other kernel versions. */
-static blk_opf_t wire_flags_to_bio_flags(u32 dpf)
-{
- return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
- (dpf & DP_FUA ? REQ_FUA : 0) |
- (dpf & DP_FLUSH ? REQ_PREFLUSH : 0);
-}
-
static enum req_op wire_flags_to_bio_op(u32 dpf)
{
if (dpf & DP_ZEROES)
@@ -2395,6 +2405,15 @@ static enum req_op wire_flags_to_bio_op(u32 dpf)
return REQ_OP_WRITE;
}
+/* see also bio_flags_to_wire() */
+static blk_opf_t wire_flags_to_bio(struct drbd_connection *connection, u32 dpf)
+{
+ return wire_flags_to_bio_op(dpf) |
+ (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
+ (dpf & DP_FUA ? REQ_FUA : 0) |
+ (dpf & DP_FLUSH ? REQ_PREFLUSH : 0);
+}
+
static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
unsigned int size)
{
@@ -2538,8 +2557,6 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
struct drbd_peer_request *peer_req;
struct p_data *p = pi->data;
u32 peer_seq = be32_to_cpu(p->seq_num);
- enum req_op op;
- blk_opf_t op_flags;
u32 dp_flags;
int err, tp;
@@ -2578,11 +2595,10 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
peer_req->flags |= EE_APPLICATION;
dp_flags = be32_to_cpu(p->dp_flags);
- op = wire_flags_to_bio_op(dp_flags);
- op_flags = wire_flags_to_bio_flags(dp_flags);
+ peer_req->opf = wire_flags_to_bio(connection, dp_flags);
if (pi->cmd == P_TRIM) {
D_ASSERT(peer_device, peer_req->i.size > 0);
- D_ASSERT(peer_device, op == REQ_OP_DISCARD);
+ D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_DISCARD);
D_ASSERT(peer_device, peer_req->pages == NULL);
/* need to play safe: an older DRBD sender
* may mean zero-out while sending P_TRIM. */
@@ -2590,7 +2606,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
peer_req->flags |= EE_ZEROOUT;
} else if (pi->cmd == P_ZEROES) {
D_ASSERT(peer_device, peer_req->i.size > 0);
- D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES);
+ D_ASSERT(peer_device, peer_req_op(peer_req) == REQ_OP_WRITE_ZEROES);
D_ASSERT(peer_device, peer_req->pages == NULL);
/* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */
if (dp_flags & DP_DISCARD)
@@ -2677,8 +2693,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info *
peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
}
- err = drbd_submit_peer_request(device, peer_req, op | op_flags,
- DRBD_FAULT_DT_WR);
+ err = drbd_submit_peer_request(peer_req);
if (!err)
return 0;
@@ -2789,7 +2804,6 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
struct drbd_peer_request *peer_req;
struct digest_info *di = NULL;
int size, verb;
- unsigned int fault_type;
struct p_block_req *p = pi->data;
peer_device = conn_peer_device(connection, pi->vnr);
@@ -2832,7 +2846,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
default:
BUG();
}
- if (verb && __ratelimit(&drbd_ratelimit_state))
+ if (verb && drbd_ratelimit())
drbd_err(device, "Can not satisfy peer's read request, "
"no local data.\n");
@@ -2849,11 +2863,11 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
put_ldev(device);
return -ENOMEM;
}
+ peer_req->opf = REQ_OP_READ;
switch (pi->cmd) {
case P_DATA_REQUEST:
peer_req->w.cb = w_e_end_data_req;
- fault_type = DRBD_FAULT_DT_RD;
/* application IO, don't drbd_rs_begin_io */
peer_req->flags |= EE_APPLICATION;
goto submit;
@@ -2867,14 +2881,12 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
fallthrough;
case P_RS_DATA_REQUEST:
peer_req->w.cb = w_e_end_rsdata_req;
- fault_type = DRBD_FAULT_RS_RD;
/* used in the sector offset progress display */
device->bm_resync_fo = BM_SECT_TO_BIT(sector);
break;
case P_OV_REPLY:
case P_CSUM_RS_REQUEST:
- fault_type = DRBD_FAULT_RS_RD;
di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
if (!di)
goto out_free_e;
@@ -2923,7 +2935,6 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet
(unsigned long long)sector);
}
peer_req->w.cb = w_e_end_ov_req;
- fault_type = DRBD_FAULT_RS_RD;
break;
default:
@@ -2975,8 +2986,7 @@ submit_for_resync:
submit:
update_receiver_timing_details(connection, drbd_submit_peer_request);
inc_unacked(device);
- if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ,
- fault_type) == 0)
+ if (drbd_submit_peer_request(peer_req) == 0)
return 0;
/* don't care for the reason here */
@@ -4947,7 +4957,6 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
if (get_ldev(device)) {
struct drbd_peer_request *peer_req;
- const enum req_op op = REQ_OP_WRITE_ZEROES;
peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
size, 0, GFP_NOIO);
@@ -4957,6 +4966,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
}
peer_req->w.cb = e_end_resync_block;
+ peer_req->opf = REQ_OP_DISCARD;
peer_req->submit_jif = jiffies;
peer_req->flags |= EE_TRIM;
@@ -4965,8 +4975,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac
spin_unlock_irq(&device->resource->req_lock);
atomic_add(pi->size >> 9, &device->rs_sect_ev);
- err = drbd_submit_peer_request(device, peer_req, op,
- DRBD_FAULT_RS_WR);
+ err = drbd_submit_peer_request(peer_req);
if (err) {
spin_lock_irq(&device->resource->req_lock);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 7f9bcc82fc9c..eb14ec8ec04c 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-License-Identifier: GPL-2.0-only
/*
drbd_req.c
@@ -144,7 +144,7 @@ void drbd_req_destroy(struct kref *kref)
if (get_ldev_if_state(device, D_FAILED)) {
drbd_al_complete_io(device, &req->i);
put_ldev(device);
- } else if (__ratelimit(&drbd_ratelimit_state)) {
+ } else if (drbd_ratelimit()) {
drbd_warn(device, "Should have called drbd_al_complete_io(, %llu, %u), "
"but my Disk seems to have failed :(\n",
(unsigned long long) req->i.sector, req->i.size);
@@ -518,7 +518,7 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req)
{
- if (!__ratelimit(&drbd_ratelimit_state))
+ if (!drbd_ratelimit())
return;
drbd_warn(device, "local %s IO error sector %llu+%u on %pg\n",
@@ -1402,7 +1402,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request
submit_private_bio = true;
} else if (no_remote) {
nodata:
- if (__ratelimit(&drbd_ratelimit_state))
+ if (drbd_ratelimit())
drbd_err(device, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
(unsigned long long)req->i.sector, req->i.size >> 9);
/* A write may have been queued for send_oos, however.
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 6237fa1dcb0e..b4017b5c3fbc 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
drbd_req.h
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 3f7bf9f2d874..75d13ea0024f 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-License-Identifier: GPL-2.0-only
/*
drbd_state.c
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h
index f87371e55e68..cbaeb8018dbf 100644
--- a/drivers/block/drbd/drbd_state.h
+++ b/drivers/block/drbd/drbd_state.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef DRBD_STATE_H
#define DRBD_STATE_H
diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h
index d5b0479bc9a6..9d78d8e3912e 100644
--- a/drivers/block/drbd/drbd_state_change.h
+++ b/drivers/block/drbd/drbd_state_change.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef DRBD_STATE_CHANGE_H
#define DRBD_STATE_CHANGE_H
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
index fc01307607ea..0a06f744b096 100644
--- a/drivers/block/drbd/drbd_strings.c
+++ b/drivers/block/drbd/drbd_strings.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-License-Identifier: GPL-2.0-only
/*
drbd.h
diff --git a/drivers/block/drbd/drbd_strings.h b/drivers/block/drbd/drbd_strings.h
index 87b94a27358a..0201f6590f6a 100644
--- a/drivers/block/drbd/drbd_strings.h
+++ b/drivers/block/drbd/drbd_strings.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0 */
+/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef __DRBD_STRINGS_H
#define __DRBD_STRINGS_H
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h
index 01e3babc5277..1ee81e3c2152 100644
--- a/drivers/block/drbd/drbd_vli.h
+++ b/drivers/block/drbd/drbd_vli.h
@@ -1,4 +1,4 @@
-/* SPDX-License-Identifier: GPL-2.0-or-later */
+/* SPDX-License-Identifier: GPL-2.0-only */
/*
-*- linux-c -*-
drbd_receiver.c
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 0bb1a900c2d5..f46738040d6b 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -1,4 +1,4 @@
-// SPDX-License-Identifier: GPL-2.0-or-later
+// SPDX-License-Identifier: GPL-2.0-only
/*
drbd_worker.c
@@ -176,7 +176,7 @@ void drbd_peer_request_endio(struct bio *bio)
bool is_discard = bio_op(bio) == REQ_OP_WRITE_ZEROES ||
bio_op(bio) == REQ_OP_DISCARD;
- if (bio->bi_status && __ratelimit(&drbd_ratelimit_state))
+ if (bio->bi_status && drbd_ratelimit())
drbd_warn(device, "%s: error=%d s=%llus\n",
is_write ? (is_discard ? "discard" : "write")
: "read", bio->bi_status,
@@ -240,7 +240,7 @@ void drbd_request_endio(struct bio *bio)
* though we still will complain noisily about it.
*/
if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
- if (__ratelimit(&drbd_ratelimit_state))
+ if (drbd_ratelimit())
drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
if (!bio->bi_status)
@@ -400,13 +400,13 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
goto defer;
peer_req->w.cb = w_e_send_csum;
+ peer_req->opf = REQ_OP_READ;
spin_lock_irq(&device->resource->req_lock);
list_add_tail(&peer_req->w.list, &device->read_ee);
spin_unlock_irq(&device->resource->req_lock);
atomic_add(size >> 9, &device->rs_sect_ev);
- if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ,
- DRBD_FAULT_RS_RD) == 0)
+ if (drbd_submit_peer_request(peer_req) == 0)
return 0;
/* If it failed because of ENOMEM, retry should help. If it failed
@@ -1062,7 +1062,7 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req);
} else {
- if (__ratelimit(&drbd_ratelimit_state))
+ if (drbd_ratelimit())
drbd_err(device, "Sending NegDReply. sector=%llus.\n",
(unsigned long long)peer_req->i.sector);
@@ -1135,13 +1135,13 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
else
err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
} else {
- if (__ratelimit(&drbd_ratelimit_state))
+ if (drbd_ratelimit())
drbd_err(device, "Not sending RSDataReply, "
"partner DISKLESS!\n");
err = 0;
}
} else {
- if (__ratelimit(&drbd_ratelimit_state))
+ if (drbd_ratelimit())
drbd_err(device, "Sending NegRSDReply. sector %llus.\n",
(unsigned long long)peer_req->i.sector);
@@ -1212,7 +1212,7 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
}
} else {
err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
- if (__ratelimit(&drbd_ratelimit_state))
+ if (drbd_ratelimit())
drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
}
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index ccad3d7b3ddd..487840e3564d 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -4593,8 +4593,10 @@ static int __init do_floppy_init(void)
goto out_put_disk;
err = floppy_alloc_disk(drive, 0);
- if (err)
+ if (err) {
+ blk_mq_free_tag_set(&tag_sets[drive]);
goto out_put_disk;
+ }
timer_setup(&motor_off_timer[drive], motor_off_callback, 0);
}
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
index 1f154f92f4c2..7d28e3aa406c 100644
--- a/drivers/block/null_blk/main.c
+++ b/drivers/block/null_blk/main.c
@@ -523,6 +523,24 @@ out:
}
CONFIGFS_ATTR(nullb_device_, badblocks);
+static ssize_t nullb_device_zone_readonly_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nullb_device *dev = to_nullb_device(item);
+
+ return zone_cond_store(dev, page, count, BLK_ZONE_COND_READONLY);
+}
+CONFIGFS_ATTR_WO(nullb_device_, zone_readonly);
+
+static ssize_t nullb_device_zone_offline_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nullb_device *dev = to_nullb_device(item);
+
+ return zone_cond_store(dev, page, count, BLK_ZONE_COND_OFFLINE);
+}
+CONFIGFS_ATTR_WO(nullb_device_, zone_offline);
+
static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_size,
&nullb_device_attr_completion_nsec,
@@ -549,6 +567,8 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_zone_nr_conv,
&nullb_device_attr_zone_max_open,
&nullb_device_attr_zone_max_active,
+ &nullb_device_attr_zone_readonly,
+ &nullb_device_attr_zone_offline,
&nullb_device_attr_virt_boundary,
&nullb_device_attr_no_sched,
&nullb_device_attr_shared_tag_bitmap,
@@ -614,7 +634,7 @@ static ssize_t memb_group_features_show(struct config_item *item, char *page)
"poll_queues,power,queue_mode,shared_tag_bitmap,size,"
"submit_queues,use_per_node_hctx,virt_boundary,zoned,"
"zone_capacity,zone_max_active,zone_max_open,"
- "zone_nr_conv,zone_size\n");
+ "zone_nr_conv,zone_offline,zone_readonly,zone_size\n");
}
CONFIGFS_ATTR_RO(memb_group_, features);
diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
index 94ff68052b1e..eb5972c50be8 100644
--- a/drivers/block/null_blk/null_blk.h
+++ b/drivers/block/null_blk/null_blk.h
@@ -151,6 +151,8 @@ blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
sector_t sector, sector_t nr_sectors);
size_t null_zone_valid_read_len(struct nullb *nullb,
sector_t sector, unsigned int len);
+ssize_t zone_cond_store(struct nullb_device *dev, const char *page,
+ size_t count, enum blk_zone_cond cond);
#else
static inline int null_init_zoned_dev(struct nullb_device *dev,
struct request_queue *q)
@@ -174,6 +176,12 @@ static inline size_t null_zone_valid_read_len(struct nullb *nullb,
{
return len;
}
+static inline ssize_t zone_cond_store(struct nullb_device *dev,
+ const char *page, size_t count,
+ enum blk_zone_cond cond)
+{
+ return -EOPNOTSUPP;
+}
#define null_report_zones NULL
#endif /* CONFIG_BLK_DEV_ZONED */
#endif /* __NULL_BLK_H */
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
index 55a69e48ef8b..635ce0648133 100644
--- a/drivers/block/null_blk/zoned.c
+++ b/drivers/block/null_blk/zoned.c
@@ -384,8 +384,10 @@ static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
null_lock_zone(dev, zone);
- if (zone->cond == BLK_ZONE_COND_FULL) {
- /* Cannot write to a full zone */
+ if (zone->cond == BLK_ZONE_COND_FULL ||
+ zone->cond == BLK_ZONE_COND_READONLY ||
+ zone->cond == BLK_ZONE_COND_OFFLINE) {
+ /* Cannot write to the zone */
ret = BLK_STS_IOERR;
goto unlock;
}
@@ -613,7 +615,9 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_op op,
for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
zone = &dev->zones[i];
null_lock_zone(dev, zone);
- if (zone->cond != BLK_ZONE_COND_EMPTY) {
+ if (zone->cond != BLK_ZONE_COND_EMPTY &&
+ zone->cond != BLK_ZONE_COND_READONLY &&
+ zone->cond != BLK_ZONE_COND_OFFLINE) {
null_reset_zone(dev, zone);
trace_nullb_zone_op(cmd, i, zone->cond);
}
@@ -627,6 +631,12 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_op op,
null_lock_zone(dev, zone);
+ if (zone->cond == BLK_ZONE_COND_READONLY ||
+ zone->cond == BLK_ZONE_COND_OFFLINE) {
+ ret = BLK_STS_IOERR;
+ goto unlock;
+ }
+
switch (op) {
case REQ_OP_ZONE_RESET:
ret = null_reset_zone(dev, zone);
@@ -648,6 +658,7 @@ static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_op op,
if (ret == BLK_STS_OK)
trace_nullb_zone_op(cmd, zone_no, zone->cond);
+unlock:
null_unlock_zone(dev, zone);
return ret;
@@ -674,6 +685,8 @@ blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
default:
dev = cmd->nq->dev;
zone = &dev->zones[null_zone_no(dev, sector)];
+ if (zone->cond == BLK_ZONE_COND_OFFLINE)
+ return BLK_STS_IOERR;
null_lock_zone(dev, zone);
sts = null_process_cmd(cmd, op, sector, nr_sectors);
@@ -681,3 +694,79 @@ blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
return sts;
}
}
+
+/*
+ * Set a zone in the read-only or offline condition.
+ */
+static void null_set_zone_cond(struct nullb_device *dev,
+ struct nullb_zone *zone, enum blk_zone_cond cond)
+{
+ if (WARN_ON_ONCE(cond != BLK_ZONE_COND_READONLY &&
+ cond != BLK_ZONE_COND_OFFLINE))
+ return;
+
+ null_lock_zone(dev, zone);
+
+ /*
+ * If the read-only condition is requested again to zones already in
+ * read-only condition, restore back normal empty condition. Do the same
+ * if the offline condition is requested for offline zones. Otherwise,
+ * set the specified zone condition to the zones. Finish the zones
+ * beforehand to free up zone resources.
+ */
+ if (zone->cond == cond) {
+ zone->cond = BLK_ZONE_COND_EMPTY;
+ zone->wp = zone->start;
+ if (dev->memory_backed)
+ null_handle_discard(dev, zone->start, zone->len);
+ } else {
+ if (zone->cond != BLK_ZONE_COND_READONLY &&
+ zone->cond != BLK_ZONE_COND_OFFLINE)
+ null_finish_zone(dev, zone);
+ zone->cond = cond;
+ zone->wp = (sector_t)-1;
+ }
+
+ null_unlock_zone(dev, zone);
+}
+
+/*
+ * Identify a zone from the sector written to configfs file. Then set zone
+ * condition to the zone.
+ */
+ssize_t zone_cond_store(struct nullb_device *dev, const char *page,
+ size_t count, enum blk_zone_cond cond)
+{
+ unsigned long long sector;
+ unsigned int zone_no;
+ int ret;
+
+ if (!dev->zoned) {
+ pr_err("null_blk device is not zoned\n");
+ return -EINVAL;
+ }
+
+ if (!dev->zones) {
+ pr_err("null_blk device is not yet powered\n");
+ return -EINVAL;
+ }
+
+ ret = kstrtoull(page, 0, &sector);
+ if (ret < 0)
+ return ret;
+
+ zone_no = null_zone_no(dev, sector);
+ if (zone_no >= dev->nr_zones) {
+ pr_err("Sector out of range\n");
+ return -EINVAL;
+ }
+
+ if (dev->zones[zone_no].type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ pr_err("Can not change condition of conventional zones\n");
+ return -EINVAL;
+ }
+
+ null_set_zone_cond(dev, &dev->zones[zone_no], cond);
+
+ return count;
+}
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
deleted file mode 100644
index 4cea3b08087e..000000000000
--- a/drivers/block/pktcdvd.c
+++ /dev/null
@@ -1,2944 +0,0 @@
-/*
- * Copyright (C) 2000 Jens Axboe <axboe@suse.de>
- * Copyright (C) 2001-2004 Peter Osterlund <petero2@telia.com>
- * Copyright (C) 2006 Thomas Maier <balagi@justmail.de>
- *
- * May be copied or modified under the terms of the GNU General Public
- * License. See linux/COPYING for more information.
- *
- * Packet writing layer for ATAPI and SCSI CD-RW, DVD+RW, DVD-RW and
- * DVD-RAM devices.
- *
- * Theory of operation:
- *
- * At the lowest level, there is the standard driver for the CD/DVD device,
- * such as drivers/scsi/sr.c. This driver can handle read and write requests,
- * but it doesn't know anything about the special restrictions that apply to
- * packet writing. One restriction is that write requests must be aligned to
- * packet boundaries on the physical media, and the size of a write request
- * must be equal to the packet size. Another restriction is that a
- * GPCMD_FLUSH_CACHE command has to be issued to the drive before a read
- * command, if the previous command was a write.
- *
- * The purpose of the packet writing driver is to hide these restrictions from
- * higher layers, such as file systems, and present a block device that can be
- * randomly read and written using 2kB-sized blocks.
- *
- * The lowest layer in the packet writing driver is the packet I/O scheduler.
- * Its data is defined by the struct packet_iosched and includes two bio
- * queues with pending read and write requests. These queues are processed
- * by the pkt_iosched_process_queue() function. The write requests in this
- * queue are already properly aligned and sized. This layer is responsible for
- * issuing the flush cache commands and scheduling the I/O in a good order.
- *
- * The next layer transforms unaligned write requests to aligned writes. This
- * transformation requires reading missing pieces of data from the underlying
- * block device, assembling the pieces to full packets and queuing them to the
- * packet I/O scheduler.
- *
- * At the top layer there is a custom ->submit_bio function that forwards
- * read requests directly to the iosched queue and puts write requests in the
- * unaligned write queue. A kernel thread performs the necessary read
- * gathering to convert the unaligned writes to aligned writes and then feeds
- * them to the packet I/O scheduler.
- *
- *************************************************************************/
-
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
-#include <linux/pktcdvd.h>
-#include <linux/module.h>
-#include <linux/types.h>
-#include <linux/kernel.h>
-#include <linux/compat.h>
-#include <linux/kthread.h>
-#include <linux/errno.h>
-#include <linux/spinlock.h>
-#include <linux/file.h>
-#include <linux/proc_fs.h>
-#include <linux/seq_file.h>
-#include <linux/miscdevice.h>
-#include <linux/freezer.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-#include <linux/backing-dev.h>
-#include <scsi/scsi_cmnd.h>
-#include <scsi/scsi_ioctl.h>
-#include <scsi/scsi.h>
-#include <linux/debugfs.h>
-#include <linux/device.h>
-#include <linux/nospec.h>
-#include <linux/uaccess.h>
-
-#define DRIVER_NAME "pktcdvd"
-
-#define pkt_err(pd, fmt, ...) \
- pr_err("%s: " fmt, pd->name, ##__VA_ARGS__)
-#define pkt_notice(pd, fmt, ...) \
- pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__)
-#define pkt_info(pd, fmt, ...) \
- pr_info("%s: " fmt, pd->name, ##__VA_ARGS__)
-
-#define pkt_dbg(level, pd, fmt, ...) \
-do { \
- if (level == 2 && PACKET_DEBUG >= 2) \
- pr_notice("%s: %s():" fmt, \
- pd->name, __func__, ##__VA_ARGS__); \
- else if (level == 1 && PACKET_DEBUG >= 1) \
- pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__); \
-} while (0)
-
-#define MAX_SPEED 0xffff
-
-static DEFINE_MUTEX(pktcdvd_mutex);
-static struct pktcdvd_device *pkt_devs[MAX_WRITERS];
-static struct proc_dir_entry *pkt_proc;
-static int pktdev_major;
-static int write_congestion_on = PKT_WRITE_CONGESTION_ON;
-static int write_congestion_off = PKT_WRITE_CONGESTION_OFF;
-static struct mutex ctl_mutex; /* Serialize open/close/setup/teardown */
-static mempool_t psd_pool;
-static struct bio_set pkt_bio_set;
-
-static struct class *class_pktcdvd = NULL; /* /sys/class/pktcdvd */
-static struct dentry *pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */
-
-/* forward declaration */
-static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev);
-static int pkt_remove_dev(dev_t pkt_dev);
-static int pkt_seq_show(struct seq_file *m, void *p);
-
-static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd)
-{
- return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1);
-}
-
-/**********************************************************
- * sysfs interface for pktcdvd
- * by (C) 2006 Thomas Maier <balagi@justmail.de>
-
- /sys/class/pktcdvd/pktcdvd[0-7]/
- stat/reset
- stat/packets_started
- stat/packets_finished
- stat/kb_written
- stat/kb_read
- stat/kb_read_gather
- write_queue/size
- write_queue/congestion_off
- write_queue/congestion_on
- **********************************************************/
-
-static ssize_t packets_started_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
- return sysfs_emit(buf, "%lu\n", pd->stats.pkt_started);
-}
-static DEVICE_ATTR_RO(packets_started);
-
-static ssize_t packets_finished_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
- return sysfs_emit(buf, "%lu\n", pd->stats.pkt_ended);
-}
-static DEVICE_ATTR_RO(packets_finished);
-
-static ssize_t kb_written_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
- return sysfs_emit(buf, "%lu\n", pd->stats.secs_w >> 1);
-}
-static DEVICE_ATTR_RO(kb_written);
-
-static ssize_t kb_read_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
- return sysfs_emit(buf, "%lu\n", pd->stats.secs_r >> 1);
-}
-static DEVICE_ATTR_RO(kb_read);
-
-static ssize_t kb_read_gather_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
- return sysfs_emit(buf, "%lu\n", pd->stats.secs_rg >> 1);
-}
-static DEVICE_ATTR_RO(kb_read_gather);
-
-static ssize_t reset_store(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t len)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
-
- if (len > 0) {
- pd->stats.pkt_started = 0;
- pd->stats.pkt_ended = 0;
- pd->stats.secs_w = 0;
- pd->stats.secs_rg = 0;
- pd->stats.secs_r = 0;
- }
- return len;
-}
-static DEVICE_ATTR_WO(reset);
-
-static struct attribute *pkt_stat_attrs[] = {
- &dev_attr_packets_finished.attr,
- &dev_attr_packets_started.attr,
- &dev_attr_kb_read.attr,
- &dev_attr_kb_written.attr,
- &dev_attr_kb_read_gather.attr,
- &dev_attr_reset.attr,
- NULL,
-};
-
-static const struct attribute_group pkt_stat_group = {
- .name = "stat",
- .attrs = pkt_stat_attrs,
-};
-
-static ssize_t size_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
- int n;
-
- spin_lock(&pd->lock);
- n = sysfs_emit(buf, "%d\n", pd->bio_queue_size);
- spin_unlock(&pd->lock);
- return n;
-}
-static DEVICE_ATTR_RO(size);
-
-static void init_write_congestion_marks(int* lo, int* hi)
-{
- if (*hi > 0) {
- *hi = max(*hi, 500);
- *hi = min(*hi, 1000000);
- if (*lo <= 0)
- *lo = *hi - 100;
- else {
- *lo = min(*lo, *hi - 100);
- *lo = max(*lo, 100);
- }
- } else {
- *hi = -1;
- *lo = -1;
- }
-}
-
-static ssize_t congestion_off_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
- int n;
-
- spin_lock(&pd->lock);
- n = sysfs_emit(buf, "%d\n", pd->write_congestion_off);
- spin_unlock(&pd->lock);
- return n;
-}
-
-static ssize_t congestion_off_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t len)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
- int val;
-
- if (sscanf(buf, "%d", &val) == 1) {
- spin_lock(&pd->lock);
- pd->write_congestion_off = val;
- init_write_congestion_marks(&pd->write_congestion_off,
- &pd->write_congestion_on);
- spin_unlock(&pd->lock);
- }
- return len;
-}
-static DEVICE_ATTR_RW(congestion_off);
-
-static ssize_t congestion_on_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
- int n;
-
- spin_lock(&pd->lock);
- n = sysfs_emit(buf, "%d\n", pd->write_congestion_on);
- spin_unlock(&pd->lock);
- return n;
-}
-
-static ssize_t congestion_on_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t len)
-{
- struct pktcdvd_device *pd = dev_get_drvdata(dev);
- int val;
-
- if (sscanf(buf, "%d", &val) == 1) {
- spin_lock(&pd->lock);
- pd->write_congestion_on = val;
- init_write_congestion_marks(&pd->write_congestion_off,
- &pd->write_congestion_on);
- spin_unlock(&pd->lock);
- }
- return len;
-}
-static DEVICE_ATTR_RW(congestion_on);
-
-static struct attribute *pkt_wq_attrs[] = {
- &dev_attr_congestion_on.attr,
- &dev_attr_congestion_off.attr,
- &dev_attr_size.attr,
- NULL,
-};
-
-static const struct attribute_group pkt_wq_group = {
- .name = "write_queue",
- .attrs = pkt_wq_attrs,
-};
-
-static const struct attribute_group *pkt_groups[] = {
- &pkt_stat_group,
- &pkt_wq_group,
- NULL,
-};
-
-static void pkt_sysfs_dev_new(struct pktcdvd_device *pd)
-{
- if (class_pktcdvd) {
- pd->dev = device_create_with_groups(class_pktcdvd, NULL,
- MKDEV(0, 0), pd, pkt_groups,
- "%s", pd->name);
- if (IS_ERR(pd->dev))
- pd->dev = NULL;
- }
-}
-
-static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd)
-{
- if (class_pktcdvd)
- device_unregister(pd->dev);
-}
-
-
-/********************************************************************
- /sys/class/pktcdvd/
- add map block device
- remove unmap packet dev
- device_map show mappings
- *******************************************************************/
-
-static void class_pktcdvd_release(struct class *cls)
-{
- kfree(cls);
-}
-
-static ssize_t device_map_show(struct class *c, struct class_attribute *attr,
- char *data)
-{
- int n = 0;
- int idx;
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
- for (idx = 0; idx < MAX_WRITERS; idx++) {
- struct pktcdvd_device *pd = pkt_devs[idx];
- if (!pd)
- continue;
- n += sprintf(data+n, "%s %u:%u %u:%u\n",
- pd->name,
- MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev),
- MAJOR(pd->bdev->bd_dev),
- MINOR(pd->bdev->bd_dev));
- }
- mutex_unlock(&ctl_mutex);
- return n;
-}
-static CLASS_ATTR_RO(device_map);
-
-static ssize_t add_store(struct class *c, struct class_attribute *attr,
- const char *buf, size_t count)
-{
- unsigned int major, minor;
-
- if (sscanf(buf, "%u:%u", &major, &minor) == 2) {
- /* pkt_setup_dev() expects caller to hold reference to self */
- if (!try_module_get(THIS_MODULE))
- return -ENODEV;
-
- pkt_setup_dev(MKDEV(major, minor), NULL);
-
- module_put(THIS_MODULE);
-
- return count;
- }
-
- return -EINVAL;
-}
-static CLASS_ATTR_WO(add);
-
-static ssize_t remove_store(struct class *c, struct class_attribute *attr,
- const char *buf, size_t count)
-{
- unsigned int major, minor;
- if (sscanf(buf, "%u:%u", &major, &minor) == 2) {
- pkt_remove_dev(MKDEV(major, minor));
- return count;
- }
- return -EINVAL;
-}
-static CLASS_ATTR_WO(remove);
-
-static struct attribute *class_pktcdvd_attrs[] = {
- &class_attr_add.attr,
- &class_attr_remove.attr,
- &class_attr_device_map.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(class_pktcdvd);
-
-static int pkt_sysfs_init(void)
-{
- int ret = 0;
-
- /*
- * create control files in sysfs
- * /sys/class/pktcdvd/...
- */
- class_pktcdvd = kzalloc(sizeof(*class_pktcdvd), GFP_KERNEL);
- if (!class_pktcdvd)
- return -ENOMEM;
- class_pktcdvd->name = DRIVER_NAME;
- class_pktcdvd->owner = THIS_MODULE;
- class_pktcdvd->class_release = class_pktcdvd_release;
- class_pktcdvd->class_groups = class_pktcdvd_groups;
- ret = class_register(class_pktcdvd);
- if (ret) {
- kfree(class_pktcdvd);
- class_pktcdvd = NULL;
- pr_err("failed to create class pktcdvd\n");
- return ret;
- }
- return 0;
-}
-
-static void pkt_sysfs_cleanup(void)
-{
- if (class_pktcdvd)
- class_destroy(class_pktcdvd);
- class_pktcdvd = NULL;
-}
-
-/********************************************************************
- entries in debugfs
-
- /sys/kernel/debug/pktcdvd[0-7]/
- info
-
- *******************************************************************/
-
-static int pkt_debugfs_seq_show(struct seq_file *m, void *p)
-{
- return pkt_seq_show(m, p);
-}
-
-static int pkt_debugfs_fops_open(struct inode *inode, struct file *file)
-{
- return single_open(file, pkt_debugfs_seq_show, inode->i_private);
-}
-
-static const struct file_operations debug_fops = {
- .open = pkt_debugfs_fops_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .owner = THIS_MODULE,
-};
-
-static void pkt_debugfs_dev_new(struct pktcdvd_device *pd)
-{
- if (!pkt_debugfs_root)
- return;
- pd->dfs_d_root = debugfs_create_dir(pd->name, pkt_debugfs_root);
- if (!pd->dfs_d_root)
- return;
-
- pd->dfs_f_info = debugfs_create_file("info", 0444,
- pd->dfs_d_root, pd, &debug_fops);
-}
-
-static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd)
-{
- if (!pkt_debugfs_root)
- return;
- debugfs_remove(pd->dfs_f_info);
- debugfs_remove(pd->dfs_d_root);
- pd->dfs_f_info = NULL;
- pd->dfs_d_root = NULL;
-}
-
-static void pkt_debugfs_init(void)
-{
- pkt_debugfs_root = debugfs_create_dir(DRIVER_NAME, NULL);
-}
-
-static void pkt_debugfs_cleanup(void)
-{
- debugfs_remove(pkt_debugfs_root);
- pkt_debugfs_root = NULL;
-}
-
-/* ----------------------------------------------------------*/
-
-
-static void pkt_bio_finished(struct pktcdvd_device *pd)
-{
- BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0);
- if (atomic_dec_and_test(&pd->cdrw.pending_bios)) {
- pkt_dbg(2, pd, "queue empty\n");
- atomic_set(&pd->iosched.attention, 1);
- wake_up(&pd->wqueue);
- }
-}
-
-/*
- * Allocate a packet_data struct
- */
-static struct packet_data *pkt_alloc_packet_data(int frames)
-{
- int i;
- struct packet_data *pkt;
-
- pkt = kzalloc(sizeof(struct packet_data), GFP_KERNEL);
- if (!pkt)
- goto no_pkt;
-
- pkt->frames = frames;
- pkt->w_bio = bio_kmalloc(frames, GFP_KERNEL);
- if (!pkt->w_bio)
- goto no_bio;
-
- for (i = 0; i < frames / FRAMES_PER_PAGE; i++) {
- pkt->pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
- if (!pkt->pages[i])
- goto no_page;
- }
-
- spin_lock_init(&pkt->lock);
- bio_list_init(&pkt->orig_bios);
-
- for (i = 0; i < frames; i++) {
- pkt->r_bios[i] = bio_kmalloc(1, GFP_KERNEL);
- if (!pkt->r_bios[i])
- goto no_rd_bio;
- }
-
- return pkt;
-
-no_rd_bio:
- for (i = 0; i < frames; i++)
- kfree(pkt->r_bios[i]);
-no_page:
- for (i = 0; i < frames / FRAMES_PER_PAGE; i++)
- if (pkt->pages[i])
- __free_page(pkt->pages[i]);
- kfree(pkt->w_bio);
-no_bio:
- kfree(pkt);
-no_pkt:
- return NULL;
-}
-
-/*
- * Free a packet_data struct
- */
-static void pkt_free_packet_data(struct packet_data *pkt)
-{
- int i;
-
- for (i = 0; i < pkt->frames; i++)
- kfree(pkt->r_bios[i]);
- for (i = 0; i < pkt->frames / FRAMES_PER_PAGE; i++)
- __free_page(pkt->pages[i]);
- kfree(pkt->w_bio);
- kfree(pkt);
-}
-
-static void pkt_shrink_pktlist(struct pktcdvd_device *pd)
-{
- struct packet_data *pkt, *next;
-
- BUG_ON(!list_empty(&pd->cdrw.pkt_active_list));
-
- list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_free_list, list) {
- pkt_free_packet_data(pkt);
- }
- INIT_LIST_HEAD(&pd->cdrw.pkt_free_list);
-}
-
-static int pkt_grow_pktlist(struct pktcdvd_device *pd, int nr_packets)
-{
- struct packet_data *pkt;
-
- BUG_ON(!list_empty(&pd->cdrw.pkt_free_list));
-
- while (nr_packets > 0) {
- pkt = pkt_alloc_packet_data(pd->settings.size >> 2);
- if (!pkt) {
- pkt_shrink_pktlist(pd);
- return 0;
- }
- pkt->id = nr_packets;
- pkt->pd = pd;
- list_add(&pkt->list, &pd->cdrw.pkt_free_list);
- nr_packets--;
- }
- return 1;
-}
-
-static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node)
-{
- struct rb_node *n = rb_next(&node->rb_node);
- if (!n)
- return NULL;
- return rb_entry(n, struct pkt_rb_node, rb_node);
-}
-
-static void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node)
-{
- rb_erase(&node->rb_node, &pd->bio_queue);
- mempool_free(node, &pd->rb_pool);
- pd->bio_queue_size--;
- BUG_ON(pd->bio_queue_size < 0);
-}
-
-/*
- * Find the first node in the pd->bio_queue rb tree with a starting sector >= s.
- */
-static struct pkt_rb_node *pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s)
-{
- struct rb_node *n = pd->bio_queue.rb_node;
- struct rb_node *next;
- struct pkt_rb_node *tmp;
-
- if (!n) {
- BUG_ON(pd->bio_queue_size > 0);
- return NULL;
- }
-
- for (;;) {
- tmp = rb_entry(n, struct pkt_rb_node, rb_node);
- if (s <= tmp->bio->bi_iter.bi_sector)
- next = n->rb_left;
- else
- next = n->rb_right;
- if (!next)
- break;
- n = next;
- }
-
- if (s > tmp->bio->bi_iter.bi_sector) {
- tmp = pkt_rbtree_next(tmp);
- if (!tmp)
- return NULL;
- }
- BUG_ON(s > tmp->bio->bi_iter.bi_sector);
- return tmp;
-}
-
-/*
- * Insert a node into the pd->bio_queue rb tree.
- */
-static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *node)
-{
- struct rb_node **p = &pd->bio_queue.rb_node;
- struct rb_node *parent = NULL;
- sector_t s = node->bio->bi_iter.bi_sector;
- struct pkt_rb_node *tmp;
-
- while (*p) {
- parent = *p;
- tmp = rb_entry(parent, struct pkt_rb_node, rb_node);
- if (s < tmp->bio->bi_iter.bi_sector)
- p = &(*p)->rb_left;
- else
- p = &(*p)->rb_right;
- }
- rb_link_node(&node->rb_node, parent, p);
- rb_insert_color(&node->rb_node, &pd->bio_queue);
- pd->bio_queue_size++;
-}
-
-/*
- * Send a packet_command to the underlying block device and
- * wait for completion.
- */
-static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc)
-{
- struct request_queue *q = bdev_get_queue(pd->bdev);
- struct scsi_cmnd *scmd;
- struct request *rq;
- int ret = 0;
-
- rq = scsi_alloc_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
- REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
- if (IS_ERR(rq))
- return PTR_ERR(rq);
- scmd = blk_mq_rq_to_pdu(rq);
-
- if (cgc->buflen) {
- ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
- GFP_NOIO);
- if (ret)
- goto out;
- }
-
- scmd->cmd_len = COMMAND_SIZE(cgc->cmd[0]);
- memcpy(scmd->cmnd, cgc->cmd, CDROM_PACKET_SIZE);
-
- rq->timeout = 60*HZ;
- if (cgc->quiet)
- rq->rq_flags |= RQF_QUIET;
-
- blk_execute_rq(rq, false);
- if (scmd->result)
- ret = -EIO;
-out:
- blk_mq_free_request(rq);
- return ret;
-}
-
-static const char *sense_key_string(__u8 index)
-{
- static const char * const info[] = {
- "No sense", "Recovered error", "Not ready",
- "Medium error", "Hardware error", "Illegal request",
- "Unit attention", "Data protect", "Blank check",
- };
-
- return index < ARRAY_SIZE(info) ? info[index] : "INVALID";
-}
-
-/*
- * A generic sense dump / resolve mechanism should be implemented across
- * all ATAPI + SCSI devices.
- */
-static void pkt_dump_sense(struct pktcdvd_device *pd,
- struct packet_command *cgc)
-{
- struct scsi_sense_hdr *sshdr = cgc->sshdr;
-
- if (sshdr)
- pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n",
- CDROM_PACKET_SIZE, cgc->cmd,
- sshdr->sense_key, sshdr->asc, sshdr->ascq,
- sense_key_string(sshdr->sense_key));
- else
- pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd);
-}
-
-/*
- * flush the drive cache to media
- */
-static int pkt_flush_cache(struct pktcdvd_device *pd)
-{
- struct packet_command cgc;
-
- init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
- cgc.cmd[0] = GPCMD_FLUSH_CACHE;
- cgc.quiet = 1;
-
- /*
- * the IMMED bit -- we default to not setting it, although that
- * would allow a much faster close, this is safer
- */
-#if 0
- cgc.cmd[1] = 1 << 1;
-#endif
- return pkt_generic_packet(pd, &cgc);
-}
-
-/*
- * speed is given as the normal factor, e.g. 4 for 4x
- */
-static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
- unsigned write_speed, unsigned read_speed)
-{
- struct packet_command cgc;
- struct scsi_sense_hdr sshdr;
- int ret;
-
- init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
- cgc.sshdr = &sshdr;
- cgc.cmd[0] = GPCMD_SET_SPEED;
- cgc.cmd[2] = (read_speed >> 8) & 0xff;
- cgc.cmd[3] = read_speed & 0xff;
- cgc.cmd[4] = (write_speed >> 8) & 0xff;
- cgc.cmd[5] = write_speed & 0xff;
-
- ret = pkt_generic_packet(pd, &cgc);
- if (ret)
- pkt_dump_sense(pd, &cgc);
-
- return ret;
-}
-
-/*
- * Queue a bio for processing by the low-level CD device. Must be called
- * from process context.
- */
-static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio)
-{
- spin_lock(&pd->iosched.lock);
- if (bio_data_dir(bio) == READ)
- bio_list_add(&pd->iosched.read_queue, bio);
- else
- bio_list_add(&pd->iosched.write_queue, bio);
- spin_unlock(&pd->iosched.lock);
-
- atomic_set(&pd->iosched.attention, 1);
- wake_up(&pd->wqueue);
-}
-
-/*
- * Process the queued read/write requests. This function handles special
- * requirements for CDRW drives:
- * - A cache flush command must be inserted before a read request if the
- * previous request was a write.
- * - Switching between reading and writing is slow, so don't do it more often
- * than necessary.
- * - Optimize for throughput at the expense of latency. This means that streaming
- * writes will never be interrupted by a read, but if the drive has to seek
- * before the next write, switch to reading instead if there are any pending
- * read requests.
- * - Set the read speed according to current usage pattern. When only reading
- * from the device, it's best to use the highest possible read speed, but
- * when switching often between reading and writing, it's better to have the
- * same read and write speeds.
- */
-static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
-{
-
- if (atomic_read(&pd->iosched.attention) == 0)
- return;
- atomic_set(&pd->iosched.attention, 0);
-
- for (;;) {
- struct bio *bio;
- int reads_queued, writes_queued;
-
- spin_lock(&pd->iosched.lock);
- reads_queued = !bio_list_empty(&pd->iosched.read_queue);
- writes_queued = !bio_list_empty(&pd->iosched.write_queue);
- spin_unlock(&pd->iosched.lock);
-
- if (!reads_queued && !writes_queued)
- break;
-
- if (pd->iosched.writing) {
- int need_write_seek = 1;
- spin_lock(&pd->iosched.lock);
- bio = bio_list_peek(&pd->iosched.write_queue);
- spin_unlock(&pd->iosched.lock);
- if (bio && (bio->bi_iter.bi_sector ==
- pd->iosched.last_write))
- need_write_seek = 0;
- if (need_write_seek && reads_queued) {
- if (atomic_read(&pd->cdrw.pending_bios) > 0) {
- pkt_dbg(2, pd, "write, waiting\n");
- break;
- }
- pkt_flush_cache(pd);
- pd->iosched.writing = 0;
- }
- } else {
- if (!reads_queued && writes_queued) {
- if (atomic_read(&pd->cdrw.pending_bios) > 0) {
- pkt_dbg(2, pd, "read, waiting\n");
- break;
- }
- pd->iosched.writing = 1;
- }
- }
-
- spin_lock(&pd->iosched.lock);
- if (pd->iosched.writing)
- bio = bio_list_pop(&pd->iosched.write_queue);
- else
- bio = bio_list_pop(&pd->iosched.read_queue);
- spin_unlock(&pd->iosched.lock);
-
- if (!bio)
- continue;
-
- if (bio_data_dir(bio) == READ)
- pd->iosched.successive_reads +=
- bio->bi_iter.bi_size >> 10;
- else {
- pd->iosched.successive_reads = 0;
- pd->iosched.last_write = bio_end_sector(bio);
- }
- if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) {
- if (pd->read_speed == pd->write_speed) {
- pd->read_speed = MAX_SPEED;
- pkt_set_speed(pd, pd->write_speed, pd->read_speed);
- }
- } else {
- if (pd->read_speed != pd->write_speed) {
- pd->read_speed = pd->write_speed;
- pkt_set_speed(pd, pd->write_speed, pd->read_speed);
- }
- }
-
- atomic_inc(&pd->cdrw.pending_bios);
- submit_bio_noacct(bio);
- }
-}
-
-/*
- * Special care is needed if the underlying block device has a small
- * max_phys_segments value.
- */
-static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q)
-{
- if ((pd->settings.size << 9) / CD_FRAMESIZE
- <= queue_max_segments(q)) {
- /*
- * The cdrom device can handle one segment/frame
- */
- clear_bit(PACKET_MERGE_SEGS, &pd->flags);
- return 0;
- } else if ((pd->settings.size << 9) / PAGE_SIZE
- <= queue_max_segments(q)) {
- /*
- * We can handle this case at the expense of some extra memory
- * copies during write operations
- */
- set_bit(PACKET_MERGE_SEGS, &pd->flags);
- return 0;
- } else {
- pkt_err(pd, "cdrom max_phys_segments too small\n");
- return -EIO;
- }
-}
-
-static void pkt_end_io_read(struct bio *bio)
-{
- struct packet_data *pkt = bio->bi_private;
- struct pktcdvd_device *pd = pkt->pd;
- BUG_ON(!pd);
-
- pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n",
- bio, (unsigned long long)pkt->sector,
- (unsigned long long)bio->bi_iter.bi_sector, bio->bi_status);
-
- if (bio->bi_status)
- atomic_inc(&pkt->io_errors);
- bio_uninit(bio);
- if (atomic_dec_and_test(&pkt->io_wait)) {
- atomic_inc(&pkt->run_sm);
- wake_up(&pd->wqueue);
- }
- pkt_bio_finished(pd);
-}
-
-static void pkt_end_io_packet_write(struct bio *bio)
-{
- struct packet_data *pkt = bio->bi_private;
- struct pktcdvd_device *pd = pkt->pd;
- BUG_ON(!pd);
-
- pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, bio->bi_status);
-
- pd->stats.pkt_ended++;
-
- bio_uninit(bio);
- pkt_bio_finished(pd);
- atomic_dec(&pkt->io_wait);
- atomic_inc(&pkt->run_sm);
- wake_up(&pd->wqueue);
-}
-
-/*
- * Schedule reads for the holes in a packet
- */
-static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
-{
- int frames_read = 0;
- struct bio *bio;
- int f;
- char written[PACKET_MAX_SIZE];
-
- BUG_ON(bio_list_empty(&pkt->orig_bios));
-
- atomic_set(&pkt->io_wait, 0);
- atomic_set(&pkt->io_errors, 0);
-
- /*
- * Figure out which frames we need to read before we can write.
- */
- memset(written, 0, sizeof(written));
- spin_lock(&pkt->lock);
- bio_list_for_each(bio, &pkt->orig_bios) {
- int first_frame = (bio->bi_iter.bi_sector - pkt->sector) /
- (CD_FRAMESIZE >> 9);
- int num_frames = bio->bi_iter.bi_size / CD_FRAMESIZE;
- pd->stats.secs_w += num_frames * (CD_FRAMESIZE >> 9);
- BUG_ON(first_frame < 0);
- BUG_ON(first_frame + num_frames > pkt->frames);
- for (f = first_frame; f < first_frame + num_frames; f++)
- written[f] = 1;
- }
- spin_unlock(&pkt->lock);
-
- if (pkt->cache_valid) {
- pkt_dbg(2, pd, "zone %llx cached\n",
- (unsigned long long)pkt->sector);
- goto out_account;
- }
-
- /*
- * Schedule reads for missing parts of the packet.
- */
- for (f = 0; f < pkt->frames; f++) {
- int p, offset;
-
- if (written[f])
- continue;
-
- bio = pkt->r_bios[f];
- bio_init(bio, pd->bdev, bio->bi_inline_vecs, 1, REQ_OP_READ);
- bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9);
- bio->bi_end_io = pkt_end_io_read;
- bio->bi_private = pkt;
-
- p = (f * CD_FRAMESIZE) / PAGE_SIZE;
- offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
- pkt_dbg(2, pd, "Adding frame %d, page:%p offs:%d\n",
- f, pkt->pages[p], offset);
- if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset))
- BUG();
-
- atomic_inc(&pkt->io_wait);
- pkt_queue_bio(pd, bio);
- frames_read++;
- }
-
-out_account:
- pkt_dbg(2, pd, "need %d frames for zone %llx\n",
- frames_read, (unsigned long long)pkt->sector);
- pd->stats.pkt_started++;
- pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9);
-}
-
-/*
- * Find a packet matching zone, or the least recently used packet if
- * there is no match.
- */
-static struct packet_data *pkt_get_packet_data(struct pktcdvd_device *pd, int zone)
-{
- struct packet_data *pkt;
-
- list_for_each_entry(pkt, &pd->cdrw.pkt_free_list, list) {
- if (pkt->sector == zone || pkt->list.next == &pd->cdrw.pkt_free_list) {
- list_del_init(&pkt->list);
- if (pkt->sector != zone)
- pkt->cache_valid = 0;
- return pkt;
- }
- }
- BUG();
- return NULL;
-}
-
-static void pkt_put_packet_data(struct pktcdvd_device *pd, struct packet_data *pkt)
-{
- if (pkt->cache_valid) {
- list_add(&pkt->list, &pd->cdrw.pkt_free_list);
- } else {
- list_add_tail(&pkt->list, &pd->cdrw.pkt_free_list);
- }
-}
-
-static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state state)
-{
-#if PACKET_DEBUG > 1
- static const char *state_name[] = {
- "IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED"
- };
- enum packet_data_state old_state = pkt->state;
- pkt_dbg(2, pd, "pkt %2d : s=%6llx %s -> %s\n",
- pkt->id, (unsigned long long)pkt->sector,
- state_name[old_state], state_name[state]);
-#endif
- pkt->state = state;
-}
-
-/*
- * Scan the work queue to see if we can start a new packet.
- * returns non-zero if any work was done.
- */
-static int pkt_handle_queue(struct pktcdvd_device *pd)
-{
- struct packet_data *pkt, *p;
- struct bio *bio = NULL;
- sector_t zone = 0; /* Suppress gcc warning */
- struct pkt_rb_node *node, *first_node;
- struct rb_node *n;
-
- atomic_set(&pd->scan_queue, 0);
-
- if (list_empty(&pd->cdrw.pkt_free_list)) {
- pkt_dbg(2, pd, "no pkt\n");
- return 0;
- }
-
- /*
- * Try to find a zone we are not already working on.
- */
- spin_lock(&pd->lock);
- first_node = pkt_rbtree_find(pd, pd->current_sector);
- if (!first_node) {
- n = rb_first(&pd->bio_queue);
- if (n)
- first_node = rb_entry(n, struct pkt_rb_node, rb_node);
- }
- node = first_node;
- while (node) {
- bio = node->bio;
- zone = get_zone(bio->bi_iter.bi_sector, pd);
- list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) {
- if (p->sector == zone) {
- bio = NULL;
- goto try_next_bio;
- }
- }
- break;
-try_next_bio:
- node = pkt_rbtree_next(node);
- if (!node) {
- n = rb_first(&pd->bio_queue);
- if (n)
- node = rb_entry(n, struct pkt_rb_node, rb_node);
- }
- if (node == first_node)
- node = NULL;
- }
- spin_unlock(&pd->lock);
- if (!bio) {
- pkt_dbg(2, pd, "no bio\n");
- return 0;
- }
-
- pkt = pkt_get_packet_data(pd, zone);
-
- pd->current_sector = zone + pd->settings.size;
- pkt->sector = zone;
- BUG_ON(pkt->frames != pd->settings.size >> 2);
- pkt->write_size = 0;
-
- /*
- * Scan work queue for bios in the same zone and link them
- * to this packet.
- */
- spin_lock(&pd->lock);
- pkt_dbg(2, pd, "looking for zone %llx\n", (unsigned long long)zone);
- while ((node = pkt_rbtree_find(pd, zone)) != NULL) {
- bio = node->bio;
- pkt_dbg(2, pd, "found zone=%llx\n", (unsigned long long)
- get_zone(bio->bi_iter.bi_sector, pd));
- if (get_zone(bio->bi_iter.bi_sector, pd) != zone)
- break;
- pkt_rbtree_erase(pd, node);
- spin_lock(&pkt->lock);
- bio_list_add(&pkt->orig_bios, bio);
- pkt->write_size += bio->bi_iter.bi_size / CD_FRAMESIZE;
- spin_unlock(&pkt->lock);
- }
- /* check write congestion marks, and if bio_queue_size is
- * below, wake up any waiters
- */
- if (pd->congested &&
- pd->bio_queue_size <= pd->write_congestion_off) {
- pd->congested = false;
- wake_up_var(&pd->congested);
- }
- spin_unlock(&pd->lock);
-
- pkt->sleep_time = max(PACKET_WAIT_TIME, 1);
- pkt_set_state(pkt, PACKET_WAITING_STATE);
- atomic_set(&pkt->run_sm, 1);
-
- spin_lock(&pd->cdrw.active_list_lock);
- list_add(&pkt->list, &pd->cdrw.pkt_active_list);
- spin_unlock(&pd->cdrw.active_list_lock);
-
- return 1;
-}
-
-/**
- * bio_list_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @src: source bio list
- * @dst: destination bio list
- *
- * Stops when it reaches the end of either the @src list or @dst list - that is,
- * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of
- * bios).
- */
-static void bio_list_copy_data(struct bio *dst, struct bio *src)
-{
- struct bvec_iter src_iter = src->bi_iter;
- struct bvec_iter dst_iter = dst->bi_iter;
-
- while (1) {
- if (!src_iter.bi_size) {
- src = src->bi_next;
- if (!src)
- break;
-
- src_iter = src->bi_iter;
- }
-
- if (!dst_iter.bi_size) {
- dst = dst->bi_next;
- if (!dst)
- break;
-
- dst_iter = dst->bi_iter;
- }
-
- bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
- }
-}
-
-/*
- * Assemble a bio to write one packet and queue the bio for processing
- * by the underlying block device.
- */
-static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
-{
- int f;
-
- bio_init(pkt->w_bio, pd->bdev, pkt->w_bio->bi_inline_vecs, pkt->frames,
- REQ_OP_WRITE);
- pkt->w_bio->bi_iter.bi_sector = pkt->sector;
- pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
- pkt->w_bio->bi_private = pkt;
-
- /* XXX: locking? */
- for (f = 0; f < pkt->frames; f++) {
- struct page *page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE];
- unsigned offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
-
- if (!bio_add_page(pkt->w_bio, page, CD_FRAMESIZE, offset))
- BUG();
- }
- pkt_dbg(2, pd, "vcnt=%d\n", pkt->w_bio->bi_vcnt);
-
- /*
- * Fill-in bvec with data from orig_bios.
- */
- spin_lock(&pkt->lock);
- bio_list_copy_data(pkt->w_bio, pkt->orig_bios.head);
-
- pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE);
- spin_unlock(&pkt->lock);
-
- pkt_dbg(2, pd, "Writing %d frames for zone %llx\n",
- pkt->write_size, (unsigned long long)pkt->sector);
-
- if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames))
- pkt->cache_valid = 1;
- else
- pkt->cache_valid = 0;
-
- /* Start the write request */
- atomic_set(&pkt->io_wait, 1);
- pkt_queue_bio(pd, pkt->w_bio);
-}
-
-static void pkt_finish_packet(struct packet_data *pkt, blk_status_t status)
-{
- struct bio *bio;
-
- if (status)
- pkt->cache_valid = 0;
-
- /* Finish all bios corresponding to this packet */
- while ((bio = bio_list_pop(&pkt->orig_bios))) {
- bio->bi_status = status;
- bio_endio(bio);
- }
-}
-
-static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt)
-{
- pkt_dbg(2, pd, "pkt %d\n", pkt->id);
-
- for (;;) {
- switch (pkt->state) {
- case PACKET_WAITING_STATE:
- if ((pkt->write_size < pkt->frames) && (pkt->sleep_time > 0))
- return;
-
- pkt->sleep_time = 0;
- pkt_gather_data(pd, pkt);
- pkt_set_state(pkt, PACKET_READ_WAIT_STATE);
- break;
-
- case PACKET_READ_WAIT_STATE:
- if (atomic_read(&pkt->io_wait) > 0)
- return;
-
- if (atomic_read(&pkt->io_errors) > 0) {
- pkt_set_state(pkt, PACKET_RECOVERY_STATE);
- } else {
- pkt_start_write(pd, pkt);
- }
- break;
-
- case PACKET_WRITE_WAIT_STATE:
- if (atomic_read(&pkt->io_wait) > 0)
- return;
-
- if (!pkt->w_bio->bi_status) {
- pkt_set_state(pkt, PACKET_FINISHED_STATE);
- } else {
- pkt_set_state(pkt, PACKET_RECOVERY_STATE);
- }
- break;
-
- case PACKET_RECOVERY_STATE:
- pkt_dbg(2, pd, "No recovery possible\n");
- pkt_set_state(pkt, PACKET_FINISHED_STATE);
- break;
-
- case PACKET_FINISHED_STATE:
- pkt_finish_packet(pkt, pkt->w_bio->bi_status);
- return;
-
- default:
- BUG();
- break;
- }
- }
-}
-
-static void pkt_handle_packets(struct pktcdvd_device *pd)
-{
- struct packet_data *pkt, *next;
-
- /*
- * Run state machine for active packets
- */
- list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
- if (atomic_read(&pkt->run_sm) > 0) {
- atomic_set(&pkt->run_sm, 0);
- pkt_run_state_machine(pd, pkt);
- }
- }
-
- /*
- * Move no longer active packets to the free list
- */
- spin_lock(&pd->cdrw.active_list_lock);
- list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_active_list, list) {
- if (pkt->state == PACKET_FINISHED_STATE) {
- list_del(&pkt->list);
- pkt_put_packet_data(pd, pkt);
- pkt_set_state(pkt, PACKET_IDLE_STATE);
- atomic_set(&pd->scan_queue, 1);
- }
- }
- spin_unlock(&pd->cdrw.active_list_lock);
-}
-
-static void pkt_count_states(struct pktcdvd_device *pd, int *states)
-{
- struct packet_data *pkt;
- int i;
-
- for (i = 0; i < PACKET_NUM_STATES; i++)
- states[i] = 0;
-
- spin_lock(&pd->cdrw.active_list_lock);
- list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
- states[pkt->state]++;
- }
- spin_unlock(&pd->cdrw.active_list_lock);
-}
-
-/*
- * kcdrwd is woken up when writes have been queued for one of our
- * registered devices
- */
-static int kcdrwd(void *foobar)
-{
- struct pktcdvd_device *pd = foobar;
- struct packet_data *pkt;
- long min_sleep_time, residue;
-
- set_user_nice(current, MIN_NICE);
- set_freezable();
-
- for (;;) {
- DECLARE_WAITQUEUE(wait, current);
-
- /*
- * Wait until there is something to do
- */
- add_wait_queue(&pd->wqueue, &wait);
- for (;;) {
- set_current_state(TASK_INTERRUPTIBLE);
-
- /* Check if we need to run pkt_handle_queue */
- if (atomic_read(&pd->scan_queue) > 0)
- goto work_to_do;
-
- /* Check if we need to run the state machine for some packet */
- list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
- if (atomic_read(&pkt->run_sm) > 0)
- goto work_to_do;
- }
-
- /* Check if we need to process the iosched queues */
- if (atomic_read(&pd->iosched.attention) != 0)
- goto work_to_do;
-
- /* Otherwise, go to sleep */
- if (PACKET_DEBUG > 1) {
- int states[PACKET_NUM_STATES];
- pkt_count_states(pd, states);
- pkt_dbg(2, pd, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n",
- states[0], states[1], states[2],
- states[3], states[4], states[5]);
- }
-
- min_sleep_time = MAX_SCHEDULE_TIMEOUT;
- list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
- if (pkt->sleep_time && pkt->sleep_time < min_sleep_time)
- min_sleep_time = pkt->sleep_time;
- }
-
- pkt_dbg(2, pd, "sleeping\n");
- residue = schedule_timeout(min_sleep_time);
- pkt_dbg(2, pd, "wake up\n");
-
- /* make swsusp happy with our thread */
- try_to_freeze();
-
- list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
- if (!pkt->sleep_time)
- continue;
- pkt->sleep_time -= min_sleep_time - residue;
- if (pkt->sleep_time <= 0) {
- pkt->sleep_time = 0;
- atomic_inc(&pkt->run_sm);
- }
- }
-
- if (kthread_should_stop())
- break;
- }
-work_to_do:
- set_current_state(TASK_RUNNING);
- remove_wait_queue(&pd->wqueue, &wait);
-
- if (kthread_should_stop())
- break;
-
- /*
- * if pkt_handle_queue returns true, we can queue
- * another request.
- */
- while (pkt_handle_queue(pd))
- ;
-
- /*
- * Handle packet state machine
- */
- pkt_handle_packets(pd);
-
- /*
- * Handle iosched queues
- */
- pkt_iosched_process_queue(pd);
- }
-
- return 0;
-}
-
-static void pkt_print_settings(struct pktcdvd_device *pd)
-{
- pkt_info(pd, "%s packets, %u blocks, Mode-%c disc\n",
- pd->settings.fp ? "Fixed" : "Variable",
- pd->settings.size >> 2,
- pd->settings.block_mode == 8 ? '1' : '2');
-}
-
-static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control)
-{
- memset(cgc->cmd, 0, sizeof(cgc->cmd));
-
- cgc->cmd[0] = GPCMD_MODE_SENSE_10;
- cgc->cmd[2] = page_code | (page_control << 6);
- cgc->cmd[7] = cgc->buflen >> 8;
- cgc->cmd[8] = cgc->buflen & 0xff;
- cgc->data_direction = CGC_DATA_READ;
- return pkt_generic_packet(pd, cgc);
-}
-
-static int pkt_mode_select(struct pktcdvd_device *pd, struct packet_command *cgc)
-{
- memset(cgc->cmd, 0, sizeof(cgc->cmd));
- memset(cgc->buffer, 0, 2);
- cgc->cmd[0] = GPCMD_MODE_SELECT_10;
- cgc->cmd[1] = 0x10; /* PF */
- cgc->cmd[7] = cgc->buflen >> 8;
- cgc->cmd[8] = cgc->buflen & 0xff;
- cgc->data_direction = CGC_DATA_WRITE;
- return pkt_generic_packet(pd, cgc);
-}
-
-static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di)
-{
- struct packet_command cgc;
- int ret;
-
- /* set up command and get the disc info */
- init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ);
- cgc.cmd[0] = GPCMD_READ_DISC_INFO;
- cgc.cmd[8] = cgc.buflen = 2;
- cgc.quiet = 1;
-
- ret = pkt_generic_packet(pd, &cgc);
- if (ret)
- return ret;
-
- /* not all drives have the same disc_info length, so requeue
- * packet with the length the drive tells us it can supply
- */
- cgc.buflen = be16_to_cpu(di->disc_information_length) +
- sizeof(di->disc_information_length);
-
- if (cgc.buflen > sizeof(disc_information))
- cgc.buflen = sizeof(disc_information);
-
- cgc.cmd[8] = cgc.buflen;
- return pkt_generic_packet(pd, &cgc);
-}
-
-static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type, track_information *ti)
-{
- struct packet_command cgc;
- int ret;
-
- init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ);
- cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO;
- cgc.cmd[1] = type & 3;
- cgc.cmd[4] = (track & 0xff00) >> 8;
- cgc.cmd[5] = track & 0xff;
- cgc.cmd[8] = 8;
- cgc.quiet = 1;
-
- ret = pkt_generic_packet(pd, &cgc);
- if (ret)
- return ret;
-
- cgc.buflen = be16_to_cpu(ti->track_information_length) +
- sizeof(ti->track_information_length);
-
- if (cgc.buflen > sizeof(track_information))
- cgc.buflen = sizeof(track_information);
-
- cgc.cmd[8] = cgc.buflen;
- return pkt_generic_packet(pd, &cgc);
-}
-
-static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd,
- long *last_written)
-{
- disc_information di;
- track_information ti;
- __u32 last_track;
- int ret;
-
- ret = pkt_get_disc_info(pd, &di);
- if (ret)
- return ret;
-
- last_track = (di.last_track_msb << 8) | di.last_track_lsb;
- ret = pkt_get_track_info(pd, last_track, 1, &ti);
- if (ret)
- return ret;
-
- /* if this track is blank, try the previous. */
- if (ti.blank) {
- last_track--;
- ret = pkt_get_track_info(pd, last_track, 1, &ti);
- if (ret)
- return ret;
- }
-
- /* if last recorded field is valid, return it. */
- if (ti.lra_v) {
- *last_written = be32_to_cpu(ti.last_rec_address);
- } else {
- /* make it up instead */
- *last_written = be32_to_cpu(ti.track_start) +
- be32_to_cpu(ti.track_size);
- if (ti.free_blocks)
- *last_written -= (be32_to_cpu(ti.free_blocks) + 7);
- }
- return 0;
-}
-
-/*
- * write mode select package based on pd->settings
- */
-static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
-{
- struct packet_command cgc;
- struct scsi_sense_hdr sshdr;
- write_param_page *wp;
- char buffer[128];
- int ret, size;
-
- /* doesn't apply to DVD+RW or DVD-RAM */
- if ((pd->mmc3_profile == 0x1a) || (pd->mmc3_profile == 0x12))
- return 0;
-
- memset(buffer, 0, sizeof(buffer));
- init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ);
- cgc.sshdr = &sshdr;
- ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
- if (ret) {
- pkt_dump_sense(pd, &cgc);
- return ret;
- }
-
- size = 2 + ((buffer[0] << 8) | (buffer[1] & 0xff));
- pd->mode_offset = (buffer[6] << 8) | (buffer[7] & 0xff);
- if (size > sizeof(buffer))
- size = sizeof(buffer);
-
- /*
- * now get it all
- */
- init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ);
- cgc.sshdr = &sshdr;
- ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
- if (ret) {
- pkt_dump_sense(pd, &cgc);
- return ret;
- }
-
- /*
- * write page is offset header + block descriptor length
- */
- wp = (write_param_page *) &buffer[sizeof(struct mode_page_header) + pd->mode_offset];
-
- wp->fp = pd->settings.fp;
- wp->track_mode = pd->settings.track_mode;
- wp->write_type = pd->settings.write_type;
- wp->data_block_type = pd->settings.block_mode;
-
- wp->multi_session = 0;
-
-#ifdef PACKET_USE_LS
- wp->link_size = 7;
- wp->ls_v = 1;
-#endif
-
- if (wp->data_block_type == PACKET_BLOCK_MODE1) {
- wp->session_format = 0;
- wp->subhdr2 = 0x20;
- } else if (wp->data_block_type == PACKET_BLOCK_MODE2) {
- wp->session_format = 0x20;
- wp->subhdr2 = 8;
-#if 0
- wp->mcn[0] = 0x80;
- memcpy(&wp->mcn[1], PACKET_MCN, sizeof(wp->mcn) - 1);
-#endif
- } else {
- /*
- * paranoia
- */
- pkt_err(pd, "write mode wrong %d\n", wp->data_block_type);
- return 1;
- }
- wp->packet_size = cpu_to_be32(pd->settings.size >> 2);
-
- cgc.buflen = cgc.cmd[8] = size;
- ret = pkt_mode_select(pd, &cgc);
- if (ret) {
- pkt_dump_sense(pd, &cgc);
- return ret;
- }
-
- pkt_print_settings(pd);
- return 0;
-}
-
-/*
- * 1 -- we can write to this track, 0 -- we can't
- */
-static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti)
-{
- switch (pd->mmc3_profile) {
- case 0x1a: /* DVD+RW */
- case 0x12: /* DVD-RAM */
- /* The track is always writable on DVD+RW/DVD-RAM */
- return 1;
- default:
- break;
- }
-
- if (!ti->packet || !ti->fp)
- return 0;
-
- /*
- * "good" settings as per Mt Fuji.
- */
- if (ti->rt == 0 && ti->blank == 0)
- return 1;
-
- if (ti->rt == 0 && ti->blank == 1)
- return 1;
-
- if (ti->rt == 1 && ti->blank == 0)
- return 1;
-
- pkt_err(pd, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet);
- return 0;
-}
-
-/*
- * 1 -- we can write to this disc, 0 -- we can't
- */
-static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di)
-{
- switch (pd->mmc3_profile) {
- case 0x0a: /* CD-RW */
- case 0xffff: /* MMC3 not supported */
- break;
- case 0x1a: /* DVD+RW */
- case 0x13: /* DVD-RW */
- case 0x12: /* DVD-RAM */
- return 1;
- default:
- pkt_dbg(2, pd, "Wrong disc profile (%x)\n",
- pd->mmc3_profile);
- return 0;
- }
-
- /*
- * for disc type 0xff we should probably reserve a new track.
- * but i'm not sure, should we leave this to user apps? probably.
- */
- if (di->disc_type == 0xff) {
- pkt_notice(pd, "unknown disc - no track?\n");
- return 0;
- }
-
- if (di->disc_type != 0x20 && di->disc_type != 0) {
- pkt_err(pd, "wrong disc type (%x)\n", di->disc_type);
- return 0;
- }
-
- if (di->erasable == 0) {
- pkt_notice(pd, "disc not erasable\n");
- return 0;
- }
-
- if (di->border_status == PACKET_SESSION_RESERVED) {
- pkt_err(pd, "can't write to last track (reserved)\n");
- return 0;
- }
-
- return 1;
-}
-
-static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
-{
- struct packet_command cgc;
- unsigned char buf[12];
- disc_information di;
- track_information ti;
- int ret, track;
-
- init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ);
- cgc.cmd[0] = GPCMD_GET_CONFIGURATION;
- cgc.cmd[8] = 8;
- ret = pkt_generic_packet(pd, &cgc);
- pd->mmc3_profile = ret ? 0xffff : buf[6] << 8 | buf[7];
-
- memset(&di, 0, sizeof(disc_information));
- memset(&ti, 0, sizeof(track_information));
-
- ret = pkt_get_disc_info(pd, &di);
- if (ret) {
- pkt_err(pd, "failed get_disc\n");
- return ret;
- }
-
- if (!pkt_writable_disc(pd, &di))
- return -EROFS;
-
- pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR;
-
- track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */
- ret = pkt_get_track_info(pd, track, 1, &ti);
- if (ret) {
- pkt_err(pd, "failed get_track\n");
- return ret;
- }
-
- if (!pkt_writable_track(pd, &ti)) {
- pkt_err(pd, "can't write to this track\n");
- return -EROFS;
- }
-
- /*
- * we keep packet size in 512 byte units, makes it easier to
- * deal with request calculations.
- */
- pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2;
- if (pd->settings.size == 0) {
- pkt_notice(pd, "detected zero packet size!\n");
- return -ENXIO;
- }
- if (pd->settings.size > PACKET_MAX_SECTORS) {
- pkt_err(pd, "packet size is too big\n");
- return -EROFS;
- }
- pd->settings.fp = ti.fp;
- pd->offset = (be32_to_cpu(ti.track_start) << 2) & (pd->settings.size - 1);
-
- if (ti.nwa_v) {
- pd->nwa = be32_to_cpu(ti.next_writable);
- set_bit(PACKET_NWA_VALID, &pd->flags);
- }
-
- /*
- * in theory we could use lra on -RW media as well and just zero
- * blocks that haven't been written yet, but in practice that
- * is just a no-go. we'll use that for -R, naturally.
- */
- if (ti.lra_v) {
- pd->lra = be32_to_cpu(ti.last_rec_address);
- set_bit(PACKET_LRA_VALID, &pd->flags);
- } else {
- pd->lra = 0xffffffff;
- set_bit(PACKET_LRA_VALID, &pd->flags);
- }
-
- /*
- * fine for now
- */
- pd->settings.link_loss = 7;
- pd->settings.write_type = 0; /* packet */
- pd->settings.track_mode = ti.track_mode;
-
- /*
- * mode1 or mode2 disc
- */
- switch (ti.data_mode) {
- case PACKET_MODE1:
- pd->settings.block_mode = PACKET_BLOCK_MODE1;
- break;
- case PACKET_MODE2:
- pd->settings.block_mode = PACKET_BLOCK_MODE2;
- break;
- default:
- pkt_err(pd, "unknown data mode\n");
- return -EROFS;
- }
- return 0;
-}
-
-/*
- * enable/disable write caching on drive
- */
-static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd,
- int set)
-{
- struct packet_command cgc;
- struct scsi_sense_hdr sshdr;
- unsigned char buf[64];
- int ret;
-
- init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ);
- cgc.sshdr = &sshdr;
- cgc.buflen = pd->mode_offset + 12;
-
- /*
- * caching mode page might not be there, so quiet this command
- */
- cgc.quiet = 1;
-
- ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0);
- if (ret)
- return ret;
-
- buf[pd->mode_offset + 10] |= (!!set << 2);
-
- cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff));
- ret = pkt_mode_select(pd, &cgc);
- if (ret) {
- pkt_err(pd, "write caching control failed\n");
- pkt_dump_sense(pd, &cgc);
- } else if (!ret && set)
- pkt_notice(pd, "enabled write caching\n");
- return ret;
-}
-
-static int pkt_lock_door(struct pktcdvd_device *pd, int lockflag)
-{
- struct packet_command cgc;
-
- init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
- cgc.cmd[0] = GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL;
- cgc.cmd[4] = lockflag ? 1 : 0;
- return pkt_generic_packet(pd, &cgc);
-}
-
-/*
- * Returns drive maximum write speed
- */
-static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd,
- unsigned *write_speed)
-{
- struct packet_command cgc;
- struct scsi_sense_hdr sshdr;
- unsigned char buf[256+18];
- unsigned char *cap_buf;
- int ret, offset;
-
- cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset];
- init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN);
- cgc.sshdr = &sshdr;
-
- ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
- if (ret) {
- cgc.buflen = pd->mode_offset + cap_buf[1] + 2 +
- sizeof(struct mode_page_header);
- ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
- if (ret) {
- pkt_dump_sense(pd, &cgc);
- return ret;
- }
- }
-
- offset = 20; /* Obsoleted field, used by older drives */
- if (cap_buf[1] >= 28)
- offset = 28; /* Current write speed selected */
- if (cap_buf[1] >= 30) {
- /* If the drive reports at least one "Logical Unit Write
- * Speed Performance Descriptor Block", use the information
- * in the first block. (contains the highest speed)
- */
- int num_spdb = (cap_buf[30] << 8) + cap_buf[31];
- if (num_spdb > 0)
- offset = 34;
- }
-
- *write_speed = (cap_buf[offset] << 8) | cap_buf[offset + 1];
- return 0;
-}
-
-/* These tables from cdrecord - I don't have orange book */
-/* standard speed CD-RW (1-4x) */
-static char clv_to_speed[16] = {
- /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */
- 0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-/* high speed CD-RW (-10x) */
-static char hs_clv_to_speed[16] = {
- /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */
- 0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
-};
-/* ultra high speed CD-RW */
-static char us_clv_to_speed[16] = {
- /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */
- 0, 2, 4, 8, 0, 0,16, 0,24,32,40,48, 0, 0, 0, 0
-};
-
-/*
- * reads the maximum media speed from ATIP
- */
-static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
- unsigned *speed)
-{
- struct packet_command cgc;
- struct scsi_sense_hdr sshdr;
- unsigned char buf[64];
- unsigned int size, st, sp;
- int ret;
-
- init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ);
- cgc.sshdr = &sshdr;
- cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
- cgc.cmd[1] = 2;
- cgc.cmd[2] = 4; /* READ ATIP */
- cgc.cmd[8] = 2;
- ret = pkt_generic_packet(pd, &cgc);
- if (ret) {
- pkt_dump_sense(pd, &cgc);
- return ret;
- }
- size = ((unsigned int) buf[0]<<8) + buf[1] + 2;
- if (size > sizeof(buf))
- size = sizeof(buf);
-
- init_cdrom_command(&cgc, buf, size, CGC_DATA_READ);
- cgc.sshdr = &sshdr;
- cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
- cgc.cmd[1] = 2;
- cgc.cmd[2] = 4;
- cgc.cmd[8] = size;
- ret = pkt_generic_packet(pd, &cgc);
- if (ret) {
- pkt_dump_sense(pd, &cgc);
- return ret;
- }
-
- if (!(buf[6] & 0x40)) {
- pkt_notice(pd, "disc type is not CD-RW\n");
- return 1;
- }
- if (!(buf[6] & 0x4)) {
- pkt_notice(pd, "A1 values on media are not valid, maybe not CDRW?\n");
- return 1;
- }
-
- st = (buf[6] >> 3) & 0x7; /* disc sub-type */
-
- sp = buf[16] & 0xf; /* max speed from ATIP A1 field */
-
- /* Info from cdrecord */
- switch (st) {
- case 0: /* standard speed */
- *speed = clv_to_speed[sp];
- break;
- case 1: /* high speed */
- *speed = hs_clv_to_speed[sp];
- break;
- case 2: /* ultra high speed */
- *speed = us_clv_to_speed[sp];
- break;
- default:
- pkt_notice(pd, "unknown disc sub-type %d\n", st);
- return 1;
- }
- if (*speed) {
- pkt_info(pd, "maximum media speed: %d\n", *speed);
- return 0;
- } else {
- pkt_notice(pd, "unknown speed %d for sub-type %d\n", sp, st);
- return 1;
- }
-}
-
-static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd)
-{
- struct packet_command cgc;
- struct scsi_sense_hdr sshdr;
- int ret;
-
- pkt_dbg(2, pd, "Performing OPC\n");
-
- init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
- cgc.sshdr = &sshdr;
- cgc.timeout = 60*HZ;
- cgc.cmd[0] = GPCMD_SEND_OPC;
- cgc.cmd[1] = 1;
- ret = pkt_generic_packet(pd, &cgc);
- if (ret)
- pkt_dump_sense(pd, &cgc);
- return ret;
-}
-
-static int pkt_open_write(struct pktcdvd_device *pd)
-{
- int ret;
- unsigned int write_speed, media_write_speed, read_speed;
-
- ret = pkt_probe_settings(pd);
- if (ret) {
- pkt_dbg(2, pd, "failed probe\n");
- return ret;
- }
-
- ret = pkt_set_write_settings(pd);
- if (ret) {
- pkt_dbg(1, pd, "failed saving write settings\n");
- return -EIO;
- }
-
- pkt_write_caching(pd, USE_WCACHING);
-
- ret = pkt_get_max_speed(pd, &write_speed);
- if (ret)
- write_speed = 16 * 177;
- switch (pd->mmc3_profile) {
- case 0x13: /* DVD-RW */
- case 0x1a: /* DVD+RW */
- case 0x12: /* DVD-RAM */
- pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed);
- break;
- default:
- ret = pkt_media_speed(pd, &media_write_speed);
- if (ret)
- media_write_speed = 16;
- write_speed = min(write_speed, media_write_speed * 177);
- pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176);
- break;
- }
- read_speed = write_speed;
-
- ret = pkt_set_speed(pd, write_speed, read_speed);
- if (ret) {
- pkt_dbg(1, pd, "couldn't set write speed\n");
- return -EIO;
- }
- pd->write_speed = write_speed;
- pd->read_speed = read_speed;
-
- ret = pkt_perform_opc(pd);
- if (ret) {
- pkt_dbg(1, pd, "Optimum Power Calibration failed\n");
- }
-
- return 0;
-}
-
-/*
- * called at open time.
- */
-static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
-{
- int ret;
- long lba;
- struct request_queue *q;
- struct block_device *bdev;
-
- /*
- * We need to re-open the cdrom device without O_NONBLOCK to be able
- * to read/write from/to it. It is already opened in O_NONBLOCK mode
- * so open should not fail.
- */
- bdev = blkdev_get_by_dev(pd->bdev->bd_dev, FMODE_READ | FMODE_EXCL, pd);
- if (IS_ERR(bdev)) {
- ret = PTR_ERR(bdev);
- goto out;
- }
-
- ret = pkt_get_last_written(pd, &lba);
- if (ret) {
- pkt_err(pd, "pkt_get_last_written failed\n");
- goto out_putdev;
- }
-
- set_capacity(pd->disk, lba << 2);
- set_capacity_and_notify(pd->bdev->bd_disk, lba << 2);
-
- q = bdev_get_queue(pd->bdev);
- if (write) {
- ret = pkt_open_write(pd);
- if (ret)
- goto out_putdev;
- /*
- * Some CDRW drives can not handle writes larger than one packet,
- * even if the size is a multiple of the packet size.
- */
- blk_queue_max_hw_sectors(q, pd->settings.size);
- set_bit(PACKET_WRITABLE, &pd->flags);
- } else {
- pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
- clear_bit(PACKET_WRITABLE, &pd->flags);
- }
-
- ret = pkt_set_segment_merging(pd, q);
- if (ret)
- goto out_putdev;
-
- if (write) {
- if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) {
- pkt_err(pd, "not enough memory for buffers\n");
- ret = -ENOMEM;
- goto out_putdev;
- }
- pkt_info(pd, "%lukB available on disc\n", lba << 1);
- }
-
- return 0;
-
-out_putdev:
- blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
-out:
- return ret;
-}
-
-/*
- * called when the device is closed. makes sure that the device flushes
- * the internal cache before we close.
- */
-static void pkt_release_dev(struct pktcdvd_device *pd, int flush)
-{
- if (flush && pkt_flush_cache(pd))
- pkt_dbg(1, pd, "not flushing cache\n");
-
- pkt_lock_door(pd, 0);
-
- pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
- blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
-
- pkt_shrink_pktlist(pd);
-}
-
-static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor)
-{
- if (dev_minor >= MAX_WRITERS)
- return NULL;
-
- dev_minor = array_index_nospec(dev_minor, MAX_WRITERS);
- return pkt_devs[dev_minor];
-}
-
-static int pkt_open(struct block_device *bdev, fmode_t mode)
-{
- struct pktcdvd_device *pd = NULL;
- int ret;
-
- mutex_lock(&pktcdvd_mutex);
- mutex_lock(&ctl_mutex);
- pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev));
- if (!pd) {
- ret = -ENODEV;
- goto out;
- }
- BUG_ON(pd->refcnt < 0);
-
- pd->refcnt++;
- if (pd->refcnt > 1) {
- if ((mode & FMODE_WRITE) &&
- !test_bit(PACKET_WRITABLE, &pd->flags)) {
- ret = -EBUSY;
- goto out_dec;
- }
- } else {
- ret = pkt_open_dev(pd, mode & FMODE_WRITE);
- if (ret)
- goto out_dec;
- /*
- * needed here as well, since ext2 (among others) may change
- * the blocksize at mount time
- */
- set_blocksize(bdev, CD_FRAMESIZE);
- }
-
- mutex_unlock(&ctl_mutex);
- mutex_unlock(&pktcdvd_mutex);
- return 0;
-
-out_dec:
- pd->refcnt--;
-out:
- mutex_unlock(&ctl_mutex);
- mutex_unlock(&pktcdvd_mutex);
- return ret;
-}
-
-static void pkt_close(struct gendisk *disk, fmode_t mode)
-{
- struct pktcdvd_device *pd = disk->private_data;
-
- mutex_lock(&pktcdvd_mutex);
- mutex_lock(&ctl_mutex);
- pd->refcnt--;
- BUG_ON(pd->refcnt < 0);
- if (pd->refcnt == 0) {
- int flush = test_bit(PACKET_WRITABLE, &pd->flags);
- pkt_release_dev(pd, flush);
- }
- mutex_unlock(&ctl_mutex);
- mutex_unlock(&pktcdvd_mutex);
-}
-
-
-static void pkt_end_io_read_cloned(struct bio *bio)
-{
- struct packet_stacked_data *psd = bio->bi_private;
- struct pktcdvd_device *pd = psd->pd;
-
- psd->bio->bi_status = bio->bi_status;
- bio_put(bio);
- bio_endio(psd->bio);
- mempool_free(psd, &psd_pool);
- pkt_bio_finished(pd);
-}
-
-static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio)
-{
- struct bio *cloned_bio =
- bio_alloc_clone(pd->bdev, bio, GFP_NOIO, &pkt_bio_set);
- struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO);
-
- psd->pd = pd;
- psd->bio = bio;
- cloned_bio->bi_private = psd;
- cloned_bio->bi_end_io = pkt_end_io_read_cloned;
- pd->stats.secs_r += bio_sectors(bio);
- pkt_queue_bio(pd, cloned_bio);
-}
-
-static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
-{
- struct pktcdvd_device *pd = q->queuedata;
- sector_t zone;
- struct packet_data *pkt;
- int was_empty, blocked_bio;
- struct pkt_rb_node *node;
-
- zone = get_zone(bio->bi_iter.bi_sector, pd);
-
- /*
- * If we find a matching packet in state WAITING or READ_WAIT, we can
- * just append this bio to that packet.
- */
- spin_lock(&pd->cdrw.active_list_lock);
- blocked_bio = 0;
- list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
- if (pkt->sector == zone) {
- spin_lock(&pkt->lock);
- if ((pkt->state == PACKET_WAITING_STATE) ||
- (pkt->state == PACKET_READ_WAIT_STATE)) {
- bio_list_add(&pkt->orig_bios, bio);
- pkt->write_size +=
- bio->bi_iter.bi_size / CD_FRAMESIZE;
- if ((pkt->write_size >= pkt->frames) &&
- (pkt->state == PACKET_WAITING_STATE)) {
- atomic_inc(&pkt->run_sm);
- wake_up(&pd->wqueue);
- }
- spin_unlock(&pkt->lock);
- spin_unlock(&pd->cdrw.active_list_lock);
- return;
- } else {
- blocked_bio = 1;
- }
- spin_unlock(&pkt->lock);
- }
- }
- spin_unlock(&pd->cdrw.active_list_lock);
-
- /*
- * Test if there is enough room left in the bio work queue
- * (queue size >= congestion on mark).
- * If not, wait till the work queue size is below the congestion off mark.
- */
- spin_lock(&pd->lock);
- if (pd->write_congestion_on > 0
- && pd->bio_queue_size >= pd->write_congestion_on) {
- struct wait_bit_queue_entry wqe;
-
- init_wait_var_entry(&wqe, &pd->congested, 0);
- for (;;) {
- prepare_to_wait_event(__var_waitqueue(&pd->congested),
- &wqe.wq_entry,
- TASK_UNINTERRUPTIBLE);
- if (pd->bio_queue_size <= pd->write_congestion_off)
- break;
- pd->congested = true;
- spin_unlock(&pd->lock);
- schedule();
- spin_lock(&pd->lock);
- }
- }
- spin_unlock(&pd->lock);
-
- /*
- * No matching packet found. Store the bio in the work queue.
- */
- node = mempool_alloc(&pd->rb_pool, GFP_NOIO);
- node->bio = bio;
- spin_lock(&pd->lock);
- BUG_ON(pd->bio_queue_size < 0);
- was_empty = (pd->bio_queue_size == 0);
- pkt_rbtree_insert(pd, node);
- spin_unlock(&pd->lock);
-
- /*
- * Wake up the worker thread.
- */
- atomic_set(&pd->scan_queue, 1);
- if (was_empty) {
- /* This wake_up is required for correct operation */
- wake_up(&pd->wqueue);
- } else if (!list_empty(&pd->cdrw.pkt_free_list) && !blocked_bio) {
- /*
- * This wake up is not required for correct operation,
- * but improves performance in some cases.
- */
- wake_up(&pd->wqueue);
- }
-}
-
-static void pkt_submit_bio(struct bio *bio)
-{
- struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata;
- struct bio *split;
-
- bio = bio_split_to_limits(bio);
-
- pkt_dbg(2, pd, "start = %6llx stop = %6llx\n",
- (unsigned long long)bio->bi_iter.bi_sector,
- (unsigned long long)bio_end_sector(bio));
-
- /*
- * Clone READ bios so we can have our own bi_end_io callback.
- */
- if (bio_data_dir(bio) == READ) {
- pkt_make_request_read(pd, bio);
- return;
- }
-
- if (!test_bit(PACKET_WRITABLE, &pd->flags)) {
- pkt_notice(pd, "WRITE for ro device (%llu)\n",
- (unsigned long long)bio->bi_iter.bi_sector);
- goto end_io;
- }
-
- if (!bio->bi_iter.bi_size || (bio->bi_iter.bi_size % CD_FRAMESIZE)) {
- pkt_err(pd, "wrong bio size\n");
- goto end_io;
- }
-
- do {
- sector_t zone = get_zone(bio->bi_iter.bi_sector, pd);
- sector_t last_zone = get_zone(bio_end_sector(bio) - 1, pd);
-
- if (last_zone != zone) {
- BUG_ON(last_zone != zone + pd->settings.size);
-
- split = bio_split(bio, last_zone -
- bio->bi_iter.bi_sector,
- GFP_NOIO, &pkt_bio_set);
- bio_chain(split, bio);
- } else {
- split = bio;
- }
-
- pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split);
- } while (split != bio);
-
- return;
-end_io:
- bio_io_error(bio);
-}
-
-static void pkt_init_queue(struct pktcdvd_device *pd)
-{
- struct request_queue *q = pd->disk->queue;
-
- blk_queue_logical_block_size(q, CD_FRAMESIZE);
- blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS);
- q->queuedata = pd;
-}
-
-static int pkt_seq_show(struct seq_file *m, void *p)
-{
- struct pktcdvd_device *pd = m->private;
- char *msg;
- int states[PACKET_NUM_STATES];
-
- seq_printf(m, "Writer %s mapped to %pg:\n", pd->name, pd->bdev);
-
- seq_printf(m, "\nSettings:\n");
- seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2);
-
- if (pd->settings.write_type == 0)
- msg = "Packet";
- else
- msg = "Unknown";
- seq_printf(m, "\twrite type:\t\t%s\n", msg);
-
- seq_printf(m, "\tpacket type:\t\t%s\n", pd->settings.fp ? "Fixed" : "Variable");
- seq_printf(m, "\tlink loss:\t\t%d\n", pd->settings.link_loss);
-
- seq_printf(m, "\ttrack mode:\t\t%d\n", pd->settings.track_mode);
-
- if (pd->settings.block_mode == PACKET_BLOCK_MODE1)
- msg = "Mode 1";
- else if (pd->settings.block_mode == PACKET_BLOCK_MODE2)
- msg = "Mode 2";
- else
- msg = "Unknown";
- seq_printf(m, "\tblock mode:\t\t%s\n", msg);
-
- seq_printf(m, "\nStatistics:\n");
- seq_printf(m, "\tpackets started:\t%lu\n", pd->stats.pkt_started);
- seq_printf(m, "\tpackets ended:\t\t%lu\n", pd->stats.pkt_ended);
- seq_printf(m, "\twritten:\t\t%lukB\n", pd->stats.secs_w >> 1);
- seq_printf(m, "\tread gather:\t\t%lukB\n", pd->stats.secs_rg >> 1);
- seq_printf(m, "\tread:\t\t\t%lukB\n", pd->stats.secs_r >> 1);
-
- seq_printf(m, "\nMisc:\n");
- seq_printf(m, "\treference count:\t%d\n", pd->refcnt);
- seq_printf(m, "\tflags:\t\t\t0x%lx\n", pd->flags);
- seq_printf(m, "\tread speed:\t\t%ukB/s\n", pd->read_speed);
- seq_printf(m, "\twrite speed:\t\t%ukB/s\n", pd->write_speed);
- seq_printf(m, "\tstart offset:\t\t%lu\n", pd->offset);
- seq_printf(m, "\tmode page offset:\t%u\n", pd->mode_offset);
-
- seq_printf(m, "\nQueue state:\n");
- seq_printf(m, "\tbios queued:\t\t%d\n", pd->bio_queue_size);
- seq_printf(m, "\tbios pending:\t\t%d\n", atomic_read(&pd->cdrw.pending_bios));
- seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", (unsigned long long)pd->current_sector);
-
- pkt_count_states(pd, states);
- seq_printf(m, "\tstate:\t\t\ti:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n",
- states[0], states[1], states[2], states[3], states[4], states[5]);
-
- seq_printf(m, "\twrite congestion marks:\toff=%d on=%d\n",
- pd->write_congestion_off,
- pd->write_congestion_on);
- return 0;
-}
-
-static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
-{
- int i;
- struct block_device *bdev;
- struct scsi_device *sdev;
-
- if (pd->pkt_dev == dev) {
- pkt_err(pd, "recursive setup not allowed\n");
- return -EBUSY;
- }
- for (i = 0; i < MAX_WRITERS; i++) {
- struct pktcdvd_device *pd2 = pkt_devs[i];
- if (!pd2)
- continue;
- if (pd2->bdev->bd_dev == dev) {
- pkt_err(pd, "%pg already setup\n", pd2->bdev);
- return -EBUSY;
- }
- if (pd2->pkt_dev == dev) {
- pkt_err(pd, "can't chain pktcdvd devices\n");
- return -EBUSY;
- }
- }
-
- bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_NDELAY, NULL);
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
- sdev = scsi_device_from_queue(bdev->bd_disk->queue);
- if (!sdev) {
- blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
- return -EINVAL;
- }
- put_device(&sdev->sdev_gendev);
-
- /* This is safe, since we have a reference from open(). */
- __module_get(THIS_MODULE);
-
- pd->bdev = bdev;
- set_blocksize(bdev, CD_FRAMESIZE);
-
- pkt_init_queue(pd);
-
- atomic_set(&pd->cdrw.pending_bios, 0);
- pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name);
- if (IS_ERR(pd->cdrw.thread)) {
- pkt_err(pd, "can't start kernel thread\n");
- goto out_mem;
- }
-
- proc_create_single_data(pd->name, 0, pkt_proc, pkt_seq_show, pd);
- pkt_dbg(1, pd, "writer mapped to %pg\n", bdev);
- return 0;
-
-out_mem:
- blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
- /* This is safe: open() is still holding a reference. */
- module_put(THIS_MODULE);
- return -ENOMEM;
-}
-
-static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
-{
- struct pktcdvd_device *pd = bdev->bd_disk->private_data;
- int ret;
-
- pkt_dbg(2, pd, "cmd %x, dev %d:%d\n",
- cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
-
- mutex_lock(&pktcdvd_mutex);
- switch (cmd) {
- case CDROMEJECT:
- /*
- * The door gets locked when the device is opened, so we
- * have to unlock it or else the eject command fails.
- */
- if (pd->refcnt == 1)
- pkt_lock_door(pd, 0);
- fallthrough;
- /*
- * forward selected CDROM ioctls to CD-ROM, for UDF
- */
- case CDROMMULTISESSION:
- case CDROMREADTOCENTRY:
- case CDROM_LAST_WRITTEN:
- case CDROM_SEND_PACKET:
- case SCSI_IOCTL_SEND_COMMAND:
- if (!bdev->bd_disk->fops->ioctl)
- ret = -ENOTTY;
- else
- ret = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
- break;
- default:
- pkt_dbg(2, pd, "Unknown ioctl (%x)\n", cmd);
- ret = -ENOTTY;
- }
- mutex_unlock(&pktcdvd_mutex);
-
- return ret;
-}
-
-static unsigned int pkt_check_events(struct gendisk *disk,
- unsigned int clearing)
-{
- struct pktcdvd_device *pd = disk->private_data;
- struct gendisk *attached_disk;
-
- if (!pd)
- return 0;
- if (!pd->bdev)
- return 0;
- attached_disk = pd->bdev->bd_disk;
- if (!attached_disk || !attached_disk->fops->check_events)
- return 0;
- return attached_disk->fops->check_events(attached_disk, clearing);
-}
-
-static char *pkt_devnode(struct gendisk *disk, umode_t *mode)
-{
- return kasprintf(GFP_KERNEL, "pktcdvd/%s", disk->disk_name);
-}
-
-static const struct block_device_operations pktcdvd_ops = {
- .owner = THIS_MODULE,
- .submit_bio = pkt_submit_bio,
- .open = pkt_open,
- .release = pkt_close,
- .ioctl = pkt_ioctl,
- .compat_ioctl = blkdev_compat_ptr_ioctl,
- .check_events = pkt_check_events,
- .devnode = pkt_devnode,
-};
-
-/*
- * Set up mapping from pktcdvd device to CD-ROM device.
- */
-static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
-{
- int idx;
- int ret = -ENOMEM;
- struct pktcdvd_device *pd;
- struct gendisk *disk;
-
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
- for (idx = 0; idx < MAX_WRITERS; idx++)
- if (!pkt_devs[idx])
- break;
- if (idx == MAX_WRITERS) {
- pr_err("max %d writers supported\n", MAX_WRITERS);
- ret = -EBUSY;
- goto out_mutex;
- }
-
- pd = kzalloc(sizeof(struct pktcdvd_device), GFP_KERNEL);
- if (!pd)
- goto out_mutex;
-
- ret = mempool_init_kmalloc_pool(&pd->rb_pool, PKT_RB_POOL_SIZE,
- sizeof(struct pkt_rb_node));
- if (ret)
- goto out_mem;
-
- INIT_LIST_HEAD(&pd->cdrw.pkt_free_list);
- INIT_LIST_HEAD(&pd->cdrw.pkt_active_list);
- spin_lock_init(&pd->cdrw.active_list_lock);
-
- spin_lock_init(&pd->lock);
- spin_lock_init(&pd->iosched.lock);
- bio_list_init(&pd->iosched.read_queue);
- bio_list_init(&pd->iosched.write_queue);
- sprintf(pd->name, DRIVER_NAME"%d", idx);
- init_waitqueue_head(&pd->wqueue);
- pd->bio_queue = RB_ROOT;
-
- pd->write_congestion_on = write_congestion_on;
- pd->write_congestion_off = write_congestion_off;
-
- ret = -ENOMEM;
- disk = blk_alloc_disk(NUMA_NO_NODE);
- if (!disk)
- goto out_mem;
- pd->disk = disk;
- disk->major = pktdev_major;
- disk->first_minor = idx;
- disk->minors = 1;
- disk->fops = &pktcdvd_ops;
- disk->flags = GENHD_FL_REMOVABLE | GENHD_FL_NO_PART;
- strcpy(disk->disk_name, pd->name);
- disk->private_data = pd;
-
- pd->pkt_dev = MKDEV(pktdev_major, idx);
- ret = pkt_new_dev(pd, dev);
- if (ret)
- goto out_mem2;
-
- /* inherit events of the host device */
- disk->events = pd->bdev->bd_disk->events;
-
- ret = add_disk(disk);
- if (ret)
- goto out_mem2;
-
- pkt_sysfs_dev_new(pd);
- pkt_debugfs_dev_new(pd);
-
- pkt_devs[idx] = pd;
- if (pkt_dev)
- *pkt_dev = pd->pkt_dev;
-
- mutex_unlock(&ctl_mutex);
- return 0;
-
-out_mem2:
- put_disk(disk);
-out_mem:
- mempool_exit(&pd->rb_pool);
- kfree(pd);
-out_mutex:
- mutex_unlock(&ctl_mutex);
- pr_err("setup of pktcdvd device failed\n");
- return ret;
-}
-
-/*
- * Tear down mapping from pktcdvd device to CD-ROM device.
- */
-static int pkt_remove_dev(dev_t pkt_dev)
-{
- struct pktcdvd_device *pd;
- int idx;
- int ret = 0;
-
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
- for (idx = 0; idx < MAX_WRITERS; idx++) {
- pd = pkt_devs[idx];
- if (pd && (pd->pkt_dev == pkt_dev))
- break;
- }
- if (idx == MAX_WRITERS) {
- pr_debug("dev not setup\n");
- ret = -ENXIO;
- goto out;
- }
-
- if (pd->refcnt > 0) {
- ret = -EBUSY;
- goto out;
- }
- if (!IS_ERR(pd->cdrw.thread))
- kthread_stop(pd->cdrw.thread);
-
- pkt_devs[idx] = NULL;
-
- pkt_debugfs_dev_remove(pd);
- pkt_sysfs_dev_remove(pd);
-
- blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY);
-
- remove_proc_entry(pd->name, pkt_proc);
- pkt_dbg(1, pd, "writer unmapped\n");
-
- del_gendisk(pd->disk);
- put_disk(pd->disk);
-
- mempool_exit(&pd->rb_pool);
- kfree(pd);
-
- /* This is safe: open() is still holding a reference. */
- module_put(THIS_MODULE);
-
-out:
- mutex_unlock(&ctl_mutex);
- return ret;
-}
-
-static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd)
-{
- struct pktcdvd_device *pd;
-
- mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
-
- pd = pkt_find_dev_from_minor(ctrl_cmd->dev_index);
- if (pd) {
- ctrl_cmd->dev = new_encode_dev(pd->bdev->bd_dev);
- ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev);
- } else {
- ctrl_cmd->dev = 0;
- ctrl_cmd->pkt_dev = 0;
- }
- ctrl_cmd->num_devices = MAX_WRITERS;
-
- mutex_unlock(&ctl_mutex);
-}
-
-static long pkt_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- void __user *argp = (void __user *)arg;
- struct pkt_ctrl_command ctrl_cmd;
- int ret = 0;
- dev_t pkt_dev = 0;
-
- if (cmd != PACKET_CTRL_CMD)
- return -ENOTTY;
-
- if (copy_from_user(&ctrl_cmd, argp, sizeof(struct pkt_ctrl_command)))
- return -EFAULT;
-
- switch (ctrl_cmd.command) {
- case PKT_CTRL_CMD_SETUP:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- ret = pkt_setup_dev(new_decode_dev(ctrl_cmd.dev), &pkt_dev);
- ctrl_cmd.pkt_dev = new_encode_dev(pkt_dev);
- break;
- case PKT_CTRL_CMD_TEARDOWN:
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- ret = pkt_remove_dev(new_decode_dev(ctrl_cmd.pkt_dev));
- break;
- case PKT_CTRL_CMD_STATUS:
- pkt_get_status(&ctrl_cmd);
- break;
- default:
- return -ENOTTY;
- }
-
- if (copy_to_user(argp, &ctrl_cmd, sizeof(struct pkt_ctrl_command)))
- return -EFAULT;
- return ret;
-}
-
-#ifdef CONFIG_COMPAT
-static long pkt_ctl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
-{
- return pkt_ctl_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
-}
-#endif
-
-static const struct file_operations pkt_ctl_fops = {
- .open = nonseekable_open,
- .unlocked_ioctl = pkt_ctl_ioctl,
-#ifdef CONFIG_COMPAT
- .compat_ioctl = pkt_ctl_compat_ioctl,
-#endif
- .owner = THIS_MODULE,
- .llseek = no_llseek,
-};
-
-static struct miscdevice pkt_misc = {
- .minor = MISC_DYNAMIC_MINOR,
- .name = DRIVER_NAME,
- .nodename = "pktcdvd/control",
- .fops = &pkt_ctl_fops
-};
-
-static int __init pkt_init(void)
-{
- int ret;
-
- mutex_init(&ctl_mutex);
-
- ret = mempool_init_kmalloc_pool(&psd_pool, PSD_POOL_SIZE,
- sizeof(struct packet_stacked_data));
- if (ret)
- return ret;
- ret = bioset_init(&pkt_bio_set, BIO_POOL_SIZE, 0, 0);
- if (ret) {
- mempool_exit(&psd_pool);
- return ret;
- }
-
- ret = register_blkdev(pktdev_major, DRIVER_NAME);
- if (ret < 0) {
- pr_err("unable to register block device\n");
- goto out2;
- }
- if (!pktdev_major)
- pktdev_major = ret;
-
- ret = pkt_sysfs_init();
- if (ret)
- goto out;
-
- pkt_debugfs_init();
-
- ret = misc_register(&pkt_misc);
- if (ret) {
- pr_err("unable to register misc device\n");
- goto out_misc;
- }
-
- pkt_proc = proc_mkdir("driver/"DRIVER_NAME, NULL);
-
- return 0;
-
-out_misc:
- pkt_debugfs_cleanup();
- pkt_sysfs_cleanup();
-out:
- unregister_blkdev(pktdev_major, DRIVER_NAME);
-out2:
- mempool_exit(&psd_pool);
- bioset_exit(&pkt_bio_set);
- return ret;
-}
-
-static void __exit pkt_exit(void)
-{
- remove_proc_entry("driver/"DRIVER_NAME, NULL);
- misc_deregister(&pkt_misc);
-
- pkt_debugfs_cleanup();
- pkt_sysfs_cleanup();
-
- unregister_blkdev(pktdev_major, DRIVER_NAME);
- mempool_exit(&psd_pool);
- bioset_exit(&pkt_bio_set);
-}
-
-MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives");
-MODULE_AUTHOR("Jens Axboe <axboe@suse.de>");
-MODULE_LICENSE("GPL");
-
-module_init(pkt_init);
-module_exit(pkt_exit);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 19da5defd734..68bd2f7961b3 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -512,7 +512,7 @@ static void virtblk_free_disk(struct gendisk *disk)
{
struct virtio_blk *vblk = disk->private_data;
- ida_simple_remove(&vd_index_ida, vblk->index);
+ ida_free(&vd_index_ida, vblk->index);
mutex_destroy(&vblk->vdev_mutex);
kfree(vblk);
}
@@ -902,8 +902,8 @@ static int virtblk_probe(struct virtio_device *vdev)
return -EINVAL;
}
- err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
- GFP_KERNEL);
+ err = ida_alloc_range(&vd_index_ida, 0,
+ minor_to_index(1 << MINORBITS) - 1, GFP_KERNEL);
if (err < 0)
goto out;
index = err;
@@ -1163,7 +1163,7 @@ out_free_vq:
out_free_vblk:
kfree(vblk);
out_free_index:
- ida_simple_remove(&vd_index_ida, index);
+ ida_free(&vd_index_ida, index);
out:
return err;
}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 35b9bcad9db9..b28489290323 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -2129,7 +2129,6 @@ static void blkfront_closing(struct blkfront_info *info)
if (info->rq && info->gd) {
blk_mq_stop_hw_queues(info->rq);
blk_mark_disk_dead(info->gd);
- set_capacity(info->gd, 0);
}
for_each_rinfo(info, rinfo, i) {
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index 99499d1f6e66..9f32901fdad1 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -160,7 +160,7 @@ static void read_moving(struct cache_set *c)
moving_init(io);
bio = &io->bio.bio;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio->bi_opf = REQ_OP_READ;
bio->bi_end_io = read_moving_endio;
if (bch_bio_alloc_pages(bio, GFP_KERNEL))
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 32e21ba64357..67a2e29e0b40 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -244,7 +244,7 @@ static void bch_data_insert_start(struct closure *cl)
trace_bcache_cache_insert(k);
bch_keylist_push(&op->insert_keys);
- bio_set_op_attrs(n, REQ_OP_WRITE, 0);
+ n->bi_opf = REQ_OP_WRITE;
bch_submit_bbio(n, op->c, k, 0);
} while (n != bio);
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 0285b676e983..d4a5fc0650bb 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -434,7 +434,7 @@ static void write_dirty(struct closure *cl)
*/
if (KEY_DIRTY(&w->key)) {
dirty_init(w);
- bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
+ io->bio.bi_opf = REQ_OP_WRITE;
io->bio.bi_iter.bi_sector = KEY_START(&w->key);
bio_set_dev(&io->bio, io->dc->bdev);
io->bio.bi_end_io = dirty_endio;
@@ -547,7 +547,7 @@ static void read_dirty(struct cached_dev *dc)
io->sequence = sequence++;
dirty_init(w);
- bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
+ io->bio.bi_opf = REQ_OP_READ;
io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
bio_set_dev(&io->bio, dc->disk.c->cache->bdev);
io->bio.bi_end_io = read_dirty_endio;
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 078da18bb86d..8541d5688f3a 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1215,7 +1215,7 @@ static int dm_keyslot_evict_callback(struct dm_target *ti, struct dm_dev *dev,
struct dm_keyslot_evict_args *args = data;
int err;
- err = blk_crypto_evict_key(bdev_get_queue(dev->bdev), args->key);
+ err = blk_crypto_evict_key(dev->bdev, args->key);
if (!args->err)
args->err = err;
/* Always try to evict the key from all devices. */
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index e76c96c760a9..c2b5a537f5b8 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -410,7 +410,7 @@ static void end_discard(struct discard_op *op, int r)
* need to wait for the chain to complete.
*/
bio_chain(op->bio, op->parent_bio);
- bio_set_op_attrs(op->bio, REQ_OP_DISCARD, 0);
+ op->bio->bi_opf = REQ_OP_DISCARD;
submit_bio(op->bio);
}
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 95a1ee3d314e..e1ea3a7bd9d9 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -732,28 +732,48 @@ static char *_dm_claim_ptr = "I belong to device-mapper";
/*
* Open a table device so we can use it as a map destination.
*/
-static int open_table_device(struct table_device *td, dev_t dev,
- struct mapped_device *md)
+static struct table_device *open_table_device(struct mapped_device *md,
+ dev_t dev, fmode_t mode)
{
+ struct table_device *td;
struct block_device *bdev;
u64 part_off;
int r;
- BUG_ON(td->dm_dev.bdev);
+ td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
+ if (!td)
+ return ERR_PTR(-ENOMEM);
+ refcount_set(&td->count, 1);
- bdev = blkdev_get_by_dev(dev, td->dm_dev.mode | FMODE_EXCL, _dm_claim_ptr);
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
+ bdev = blkdev_get_by_dev(dev, mode | FMODE_EXCL, _dm_claim_ptr);
+ if (IS_ERR(bdev)) {
+ r = PTR_ERR(bdev);
+ goto out_free_td;
+ }
- r = bd_link_disk_holder(bdev, dm_disk(md));
- if (r) {
- blkdev_put(bdev, td->dm_dev.mode | FMODE_EXCL);
- return r;
+ /*
+ * We can be called before the dm disk is added. In that case we can't
+ * register the holder relation here. It will be done once add_disk was
+ * called.
+ */
+ if (md->disk->slave_dir) {
+ r = bd_link_disk_holder(bdev, md->disk);
+ if (r)
+ goto out_blkdev_put;
}
+ td->dm_dev.mode = mode;
td->dm_dev.bdev = bdev;
td->dm_dev.dax_dev = fs_dax_get_by_bdev(bdev, &part_off, NULL, NULL);
- return 0;
+ format_dev_t(td->dm_dev.name, dev);
+ list_add(&td->list, &md->table_devices);
+ return td;
+
+out_blkdev_put:
+ blkdev_put(bdev, mode | FMODE_EXCL);
+out_free_td:
+ kfree(td);
+ return ERR_PTR(r);
}
/*
@@ -761,14 +781,12 @@ static int open_table_device(struct table_device *td, dev_t dev,
*/
static void close_table_device(struct table_device *td, struct mapped_device *md)
{
- if (!td->dm_dev.bdev)
- return;
-
- bd_unlink_disk_holder(td->dm_dev.bdev, dm_disk(md));
+ if (md->disk->slave_dir)
+ bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
blkdev_put(td->dm_dev.bdev, td->dm_dev.mode | FMODE_EXCL);
put_dax(td->dm_dev.dax_dev);
- td->dm_dev.bdev = NULL;
- td->dm_dev.dax_dev = NULL;
+ list_del(&td->list);
+ kfree(td);
}
static struct table_device *find_table_device(struct list_head *l, dev_t dev,
@@ -786,31 +804,16 @@ static struct table_device *find_table_device(struct list_head *l, dev_t dev,
int dm_get_table_device(struct mapped_device *md, dev_t dev, fmode_t mode,
struct dm_dev **result)
{
- int r;
struct table_device *td;
mutex_lock(&md->table_devices_lock);
td = find_table_device(&md->table_devices, dev, mode);
if (!td) {
- td = kmalloc_node(sizeof(*td), GFP_KERNEL, md->numa_node_id);
- if (!td) {
+ td = open_table_device(md, dev, mode);
+ if (IS_ERR(td)) {
mutex_unlock(&md->table_devices_lock);
- return -ENOMEM;
+ return PTR_ERR(td);
}
-
- td->dm_dev.mode = mode;
- td->dm_dev.bdev = NULL;
-
- if ((r = open_table_device(td, dev, md))) {
- mutex_unlock(&md->table_devices_lock);
- kfree(td);
- return r;
- }
-
- format_dev_t(td->dm_dev.name, dev);
-
- refcount_set(&td->count, 1);
- list_add(&td->list, &md->table_devices);
} else {
refcount_inc(&td->count);
}
@@ -825,27 +828,11 @@ void dm_put_table_device(struct mapped_device *md, struct dm_dev *d)
struct table_device *td = container_of(d, struct table_device, dm_dev);
mutex_lock(&md->table_devices_lock);
- if (refcount_dec_and_test(&td->count)) {
+ if (refcount_dec_and_test(&td->count))
close_table_device(td, md);
- list_del(&td->list);
- kfree(td);
- }
mutex_unlock(&md->table_devices_lock);
}
-static void free_table_devices(struct list_head *devices)
-{
- struct list_head *tmp, *next;
-
- list_for_each_safe(tmp, next, devices) {
- struct table_device *td = list_entry(tmp, struct table_device, list);
-
- DMWARN("dm_destroy: %s still exists with %d references",
- td->dm_dev.name, refcount_read(&td->count));
- kfree(td);
- }
-}
-
/*
* Get the geometry associated with a dm device
*/
@@ -1972,8 +1959,21 @@ static void cleanup_mapped_device(struct mapped_device *md)
md->disk->private_data = NULL;
spin_unlock(&_minor_lock);
if (dm_get_md_type(md) != DM_TYPE_NONE) {
+ struct table_device *td;
+
dm_sysfs_exit(md);
+ list_for_each_entry(td, &md->table_devices, list) {
+ bd_unlink_disk_holder(td->dm_dev.bdev,
+ md->disk);
+ }
+
+ /*
+ * Hold lock to make sure del_gendisk() won't concurrent
+ * with open/close_table_device().
+ */
+ mutex_lock(&md->table_devices_lock);
del_gendisk(md->disk);
+ mutex_unlock(&md->table_devices_lock);
}
dm_queue_destroy_crypto_profile(md->queue);
put_disk(md->disk);
@@ -2122,7 +2122,7 @@ static void free_dev(struct mapped_device *md)
cleanup_mapped_device(md);
- free_table_devices(&md->table_devices);
+ WARN_ON_ONCE(!list_empty(&md->table_devices));
dm_stats_cleanup(&md->stats);
free_minor(minor);
@@ -2305,6 +2305,7 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
{
enum dm_queue_mode type = dm_table_get_type(t);
struct queue_limits limits;
+ struct table_device *td;
int r;
switch (type) {
@@ -2333,17 +2334,40 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
if (r)
return r;
+ /*
+ * Hold lock to make sure add_disk() and del_gendisk() won't concurrent
+ * with open_table_device() and close_table_device().
+ */
+ mutex_lock(&md->table_devices_lock);
r = add_disk(md->disk);
+ mutex_unlock(&md->table_devices_lock);
if (r)
return r;
- r = dm_sysfs_init(md);
- if (r) {
- del_gendisk(md->disk);
- return r;
+ /*
+ * Register the holder relationship for devices added before the disk
+ * was live.
+ */
+ list_for_each_entry(td, &md->table_devices, list) {
+ r = bd_link_disk_holder(td->dm_dev.bdev, md->disk);
+ if (r)
+ goto out_undo_holders;
}
+
+ r = dm_sysfs_init(md);
+ if (r)
+ goto out_undo_holders;
+
md->type = type;
return 0;
+
+out_undo_holders:
+ list_for_each_entry_continue_reverse(td, &md->table_devices, list)
+ bd_unlink_disk_holder(td->dm_dev.bdev, md->disk);
+ mutex_lock(&md->table_devices_lock);
+ del_gendisk(md->disk);
+ mutex_unlock(&md->table_devices_lock);
+ return r;
}
struct mapped_device *dm_get_md(dev_t dev)
diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index bf6dffadbe6f..e7cc6ba1b657 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -486,7 +486,7 @@ void md_bitmap_print_sb(struct bitmap *bitmap)
sb = kmap_atomic(bitmap->storage.sb_page);
pr_debug("%s: bitmap file superblock:\n", bmname(bitmap));
pr_debug(" magic: %08x\n", le32_to_cpu(sb->magic));
- pr_debug(" version: %d\n", le32_to_cpu(sb->version));
+ pr_debug(" version: %u\n", le32_to_cpu(sb->version));
pr_debug(" uuid: %08x.%08x.%08x.%08x\n",
le32_to_cpu(*(__le32 *)(sb->uuid+0)),
le32_to_cpu(*(__le32 *)(sb->uuid+4)),
@@ -497,11 +497,11 @@ void md_bitmap_print_sb(struct bitmap *bitmap)
pr_debug("events cleared: %llu\n",
(unsigned long long) le64_to_cpu(sb->events_cleared));
pr_debug(" state: %08x\n", le32_to_cpu(sb->state));
- pr_debug(" chunksize: %d B\n", le32_to_cpu(sb->chunksize));
- pr_debug(" daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep));
+ pr_debug(" chunksize: %u B\n", le32_to_cpu(sb->chunksize));
+ pr_debug(" daemon sleep: %us\n", le32_to_cpu(sb->daemon_sleep));
pr_debug(" sync size: %llu KB\n",
(unsigned long long)le64_to_cpu(sb->sync_size)/2);
- pr_debug("max write behind: %d\n", le32_to_cpu(sb->write_behind));
+ pr_debug("max write behind: %u\n", le32_to_cpu(sb->write_behind));
kunmap_atomic(sb);
}
@@ -2105,7 +2105,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
bytes = DIV_ROUND_UP(chunks, 8);
if (!bitmap->mddev->bitmap_info.external)
bytes += sizeof(bitmap_super_t);
- } while (bytes > (space << 9));
+ } while (bytes > (space << 9) && (chunkshift + BITMAP_BLOCK_SHIFT) <
+ (BITS_PER_BYTE * sizeof(((bitmap_super_t *)0)->chunksize) - 1));
} else
chunkshift = ffz(~chunksize) - BITMAP_BLOCK_SHIFT;
@@ -2150,7 +2151,7 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
bitmap->counts.missing_pages = pages;
bitmap->counts.chunkshift = chunkshift;
bitmap->counts.chunks = chunks;
- bitmap->mddev->bitmap_info.chunksize = 1 << (chunkshift +
+ bitmap->mddev->bitmap_info.chunksize = 1UL << (chunkshift +
BITMAP_BLOCK_SHIFT);
blocks = min(old_counts.chunks << old_counts.chunkshift,
@@ -2176,8 +2177,8 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
bitmap->counts.missing_pages = old_counts.pages;
bitmap->counts.chunkshift = old_counts.chunkshift;
bitmap->counts.chunks = old_counts.chunks;
- bitmap->mddev->bitmap_info.chunksize = 1 << (old_counts.chunkshift +
- BITMAP_BLOCK_SHIFT);
+ bitmap->mddev->bitmap_info.chunksize =
+ 1UL << (old_counts.chunkshift + BITMAP_BLOCK_SHIFT);
blocks = old_counts.chunks << old_counts.chunkshift;
pr_warn("Could not pre-allocate in-memory bitmap for cluster raid\n");
break;
@@ -2195,20 +2196,23 @@ int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks,
if (set) {
bmc_new = md_bitmap_get_counter(&bitmap->counts, block, &new_blocks, 1);
- if (*bmc_new == 0) {
- /* need to set on-disk bits too. */
- sector_t end = block + new_blocks;
- sector_t start = block >> chunkshift;
- start <<= chunkshift;
- while (start < end) {
- md_bitmap_file_set_bit(bitmap, block);
- start += 1 << chunkshift;
+ if (bmc_new) {
+ if (*bmc_new == 0) {
+ /* need to set on-disk bits too. */
+ sector_t end = block + new_blocks;
+ sector_t start = block >> chunkshift;
+
+ start <<= chunkshift;
+ while (start < end) {
+ md_bitmap_file_set_bit(bitmap, block);
+ start += 1 << chunkshift;
+ }
+ *bmc_new = 2;
+ md_bitmap_count_page(&bitmap->counts, block, 1);
+ md_bitmap_set_pending(&bitmap->counts, block);
}
- *bmc_new = 2;
- md_bitmap_count_page(&bitmap->counts, block, 1);
- md_bitmap_set_pending(&bitmap->counts, block);
+ *bmc_new |= NEEDED_MASK;
}
- *bmc_new |= NEEDED_MASK;
if (new_blocks < old_blocks)
old_blocks = new_blocks;
}
@@ -2534,6 +2538,9 @@ chunksize_store(struct mddev *mddev, const char *buf, size_t len)
if (csize < 512 ||
!is_power_of_2(csize))
return -EINVAL;
+ if (BITS_PER_LONG > 32 && csize >= (1ULL << (BITS_PER_BYTE *
+ sizeof(((bitmap_super_t *)0)->chunksize))))
+ return -EOVERFLOW;
mddev->bitmap_info.chunksize = csize;
return len;
}
diff --git a/drivers/md/md.c b/drivers/md/md.c
index a467b492d4ad..775f1dde190a 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -93,6 +93,18 @@ static int remove_and_add_spares(struct mddev *mddev,
struct md_rdev *this);
static void mddev_detach(struct mddev *mddev);
+enum md_ro_state {
+ MD_RDWR,
+ MD_RDONLY,
+ MD_AUTO_READ,
+ MD_MAX_STATE
+};
+
+static bool md_is_rdwr(struct mddev *mddev)
+{
+ return (mddev->ro == MD_RDWR);
+}
+
/*
* Default number of read corrections we'll attempt on an rdev
* before ejecting it from the array. We divide the read error
@@ -444,7 +456,7 @@ static void md_submit_bio(struct bio *bio)
bio = bio_split_to_limits(bio);
- if (mddev->ro == 1 && unlikely(rw == WRITE)) {
+ if (mddev->ro == MD_RDONLY && unlikely(rw == WRITE)) {
if (bio_sectors(bio) != 0)
bio->bi_status = BLK_STS_IOERR;
bio_endio(bio);
@@ -509,13 +521,14 @@ static void md_end_flush(struct bio *bio)
struct md_rdev *rdev = bio->bi_private;
struct mddev *mddev = rdev->mddev;
+ bio_put(bio);
+
rdev_dec_pending(rdev, mddev);
if (atomic_dec_and_test(&mddev->flush_pending)) {
/* The pre-request flush has finished */
queue_work(md_wq, &mddev->flush_work);
}
- bio_put(bio);
}
static void md_submit_flush_data(struct work_struct *ws);
@@ -913,10 +926,12 @@ static void super_written(struct bio *bio)
} else
clear_bit(LastDev, &rdev->flags);
+ bio_put(bio);
+
+ rdev_dec_pending(rdev, mddev);
+
if (atomic_dec_and_test(&mddev->pending_writes))
wake_up(&mddev->sb_wait);
- rdev_dec_pending(rdev, mddev);
- bio_put(bio);
}
void md_super_write(struct mddev *mddev, struct md_rdev *rdev,
@@ -2453,7 +2468,22 @@ static void rdev_delayed_delete(struct work_struct *ws)
kobject_put(&rdev->kobj);
}
-static void unbind_rdev_from_array(struct md_rdev *rdev)
+void md_autodetect_dev(dev_t dev);
+
+static void export_rdev(struct md_rdev *rdev)
+{
+ pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
+ md_rdev_clear(rdev);
+#ifndef MODULE
+ if (test_bit(AutoDetected, &rdev->flags))
+ md_autodetect_dev(rdev->bdev->bd_dev);
+#endif
+ blkdev_put(rdev->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+ rdev->bdev = NULL;
+ kobject_put(&rdev->kobj);
+}
+
+static void md_kick_rdev_from_array(struct md_rdev *rdev)
{
bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
@@ -2476,56 +2506,8 @@ static void unbind_rdev_from_array(struct md_rdev *rdev)
INIT_WORK(&rdev->del_work, rdev_delayed_delete);
kobject_get(&rdev->kobj);
queue_work(md_rdev_misc_wq, &rdev->del_work);
-}
-
-/*
- * prevent the device from being mounted, repartitioned or
- * otherwise reused by a RAID array (or any other kernel
- * subsystem), by bd_claiming the device.
- */
-static int lock_rdev(struct md_rdev *rdev, dev_t dev, int shared)
-{
- int err = 0;
- struct block_device *bdev;
-
- bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
- shared ? (struct md_rdev *)lock_rdev : rdev);
- if (IS_ERR(bdev)) {
- pr_warn("md: could not open device unknown-block(%u,%u).\n",
- MAJOR(dev), MINOR(dev));
- return PTR_ERR(bdev);
- }
- rdev->bdev = bdev;
- return err;
-}
-
-static void unlock_rdev(struct md_rdev *rdev)
-{
- struct block_device *bdev = rdev->bdev;
- rdev->bdev = NULL;
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
-}
-
-void md_autodetect_dev(dev_t dev);
-
-static void export_rdev(struct md_rdev *rdev)
-{
- pr_debug("md: export_rdev(%pg)\n", rdev->bdev);
- md_rdev_clear(rdev);
-#ifndef MODULE
- if (test_bit(AutoDetected, &rdev->flags))
- md_autodetect_dev(rdev->bdev->bd_dev);
-#endif
- unlock_rdev(rdev);
- kobject_put(&rdev->kobj);
-}
-
-void md_kick_rdev_from_array(struct md_rdev *rdev)
-{
- unbind_rdev_from_array(rdev);
export_rdev(rdev);
}
-EXPORT_SYMBOL_GPL(md_kick_rdev_from_array);
static void export_array(struct mddev *mddev)
{
@@ -2639,7 +2621,7 @@ void md_update_sb(struct mddev *mddev, int force_change)
int any_badblocks_changed = 0;
int ret = -1;
- if (mddev->ro) {
+ if (!md_is_rdwr(mddev)) {
if (force_change)
set_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags);
return;
@@ -3660,9 +3642,10 @@ EXPORT_SYMBOL_GPL(md_rdev_init);
*/
static struct md_rdev *md_import_device(dev_t newdev, int super_format, int super_minor)
{
- int err;
+ static struct md_rdev *claim_rdev; /* just for claiming the bdev */
struct md_rdev *rdev;
sector_t size;
+ int err;
rdev = kzalloc(sizeof(*rdev), GFP_KERNEL);
if (!rdev)
@@ -3670,14 +3653,20 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
err = md_rdev_init(rdev);
if (err)
- goto abort_free;
+ goto out_free_rdev;
err = alloc_disk_sb(rdev);
if (err)
- goto abort_free;
+ goto out_clear_rdev;
- err = lock_rdev(rdev, newdev, super_format == -2);
- if (err)
- goto abort_free;
+ rdev->bdev = blkdev_get_by_dev(newdev,
+ FMODE_READ | FMODE_WRITE | FMODE_EXCL,
+ super_format == -2 ? claim_rdev : rdev);
+ if (IS_ERR(rdev->bdev)) {
+ pr_warn("md: could not open device unknown-block(%u,%u).\n",
+ MAJOR(newdev), MINOR(newdev));
+ err = PTR_ERR(rdev->bdev);
+ goto out_clear_rdev;
+ }
kobject_init(&rdev->kobj, &rdev_ktype);
@@ -3686,7 +3675,7 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
pr_warn("md: %pg has zero or unknown size, marking faulty!\n",
rdev->bdev);
err = -EINVAL;
- goto abort_free;
+ goto out_blkdev_put;
}
if (super_format >= 0) {
@@ -3696,21 +3685,22 @@ static struct md_rdev *md_import_device(dev_t newdev, int super_format, int supe
pr_warn("md: %pg does not have a valid v%d.%d superblock, not importing!\n",
rdev->bdev,
super_format, super_minor);
- goto abort_free;
+ goto out_blkdev_put;
}
if (err < 0) {
pr_warn("md: could not read %pg's sb, not importing!\n",
rdev->bdev);
- goto abort_free;
+ goto out_blkdev_put;
}
}
return rdev;
-abort_free:
- if (rdev->bdev)
- unlock_rdev(rdev);
+out_blkdev_put:
+ blkdev_put(rdev->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+out_clear_rdev:
md_rdev_clear(rdev);
+out_free_rdev:
kfree(rdev);
return ERR_PTR(err);
}
@@ -3901,7 +3891,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
goto out_unlock;
}
rv = -EROFS;
- if (mddev->ro)
+ if (!md_is_rdwr(mddev))
goto out_unlock;
/* request to change the personality. Need to ensure:
@@ -4107,7 +4097,7 @@ layout_store(struct mddev *mddev, const char *buf, size_t len)
if (mddev->pers) {
if (mddev->pers->check_reshape == NULL)
err = -EBUSY;
- else if (mddev->ro)
+ else if (!md_is_rdwr(mddev))
err = -EROFS;
else {
mddev->new_layout = n;
@@ -4216,7 +4206,7 @@ chunk_size_store(struct mddev *mddev, const char *buf, size_t len)
if (mddev->pers) {
if (mddev->pers->check_reshape == NULL)
err = -EBUSY;
- else if (mddev->ro)
+ else if (!md_is_rdwr(mddev))
err = -EROFS;
else {
mddev->new_chunk_sectors = n >> 9;
@@ -4339,13 +4329,13 @@ array_state_show(struct mddev *mddev, char *page)
if (mddev->pers && !test_bit(MD_NOT_READY, &mddev->flags)) {
switch(mddev->ro) {
- case 1:
+ case MD_RDONLY:
st = readonly;
break;
- case 2:
+ case MD_AUTO_READ:
st = read_auto;
break;
- case 0:
+ case MD_RDWR:
spin_lock(&mddev->lock);
if (test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags))
st = write_pending;
@@ -4381,7 +4371,8 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
int err = 0;
enum array_state st = match_word(buf, array_states);
- if (mddev->pers && (st == active || st == clean) && mddev->ro != 1) {
+ if (mddev->pers && (st == active || st == clean) &&
+ mddev->ro != MD_RDONLY) {
/* don't take reconfig_mutex when toggling between
* clean and active
*/
@@ -4425,23 +4416,23 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
if (mddev->pers)
err = md_set_readonly(mddev, NULL);
else {
- mddev->ro = 1;
+ mddev->ro = MD_RDONLY;
set_disk_ro(mddev->gendisk, 1);
err = do_md_run(mddev);
}
break;
case read_auto:
if (mddev->pers) {
- if (mddev->ro == 0)
+ if (md_is_rdwr(mddev))
err = md_set_readonly(mddev, NULL);
- else if (mddev->ro == 1)
+ else if (mddev->ro == MD_RDONLY)
err = restart_array(mddev);
if (err == 0) {
- mddev->ro = 2;
+ mddev->ro = MD_AUTO_READ;
set_disk_ro(mddev->gendisk, 0);
}
} else {
- mddev->ro = 2;
+ mddev->ro = MD_AUTO_READ;
err = do_md_run(mddev);
}
break;
@@ -4466,7 +4457,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len)
wake_up(&mddev->sb_wait);
err = 0;
} else {
- mddev->ro = 0;
+ mddev->ro = MD_RDWR;
set_disk_ro(mddev->gendisk, 0);
err = do_md_run(mddev);
}
@@ -4765,7 +4756,7 @@ action_show(struct mddev *mddev, char *page)
if (test_bit(MD_RECOVERY_FROZEN, &recovery))
type = "frozen";
else if (test_bit(MD_RECOVERY_RUNNING, &recovery) ||
- (!mddev->ro && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
+ (md_is_rdwr(mddev) && test_bit(MD_RECOVERY_NEEDED, &recovery))) {
if (test_bit(MD_RECOVERY_RESHAPE, &recovery))
type = "reshape";
else if (test_bit(MD_RECOVERY_SYNC, &recovery)) {
@@ -4851,11 +4842,11 @@ action_store(struct mddev *mddev, const char *page, size_t len)
set_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
set_bit(MD_RECOVERY_SYNC, &mddev->recovery);
}
- if (mddev->ro == 2) {
+ if (mddev->ro == MD_AUTO_READ) {
/* A write to sync_action is enough to justify
* canceling read-auto mode
*/
- mddev->ro = 0;
+ mddev->ro = MD_RDWR;
md_wakeup_thread(mddev->sync_thread);
}
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -5083,8 +5074,7 @@ max_sync_store(struct mddev *mddev, const char *buf, size_t len)
goto out_unlock;
err = -EBUSY;
- if (max < mddev->resync_max &&
- mddev->ro == 0 &&
+ if (max < mddev->resync_max && md_is_rdwr(mddev) &&
test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
goto out_unlock;
@@ -5813,8 +5803,8 @@ int md_run(struct mddev *mddev)
continue;
sync_blockdev(rdev->bdev);
invalidate_bdev(rdev->bdev);
- if (mddev->ro != 1 && rdev_read_only(rdev)) {
- mddev->ro = 1;
+ if (mddev->ro != MD_RDONLY && rdev_read_only(rdev)) {
+ mddev->ro = MD_RDONLY;
if (mddev->gendisk)
set_disk_ro(mddev->gendisk, 1);
}
@@ -5917,8 +5907,8 @@ int md_run(struct mddev *mddev)
mddev->ok_start_degraded = start_dirty_degraded;
- if (start_readonly && mddev->ro == 0)
- mddev->ro = 2; /* read-only, but switch on first write */
+ if (start_readonly && md_is_rdwr(mddev))
+ mddev->ro = MD_AUTO_READ; /* read-only, but switch on first write */
err = pers->run(mddev);
if (err)
@@ -5996,8 +5986,8 @@ int md_run(struct mddev *mddev)
mddev->sysfs_action = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_action");
mddev->sysfs_completed = sysfs_get_dirent_safe(mddev->kobj.sd, "sync_completed");
mddev->sysfs_degraded = sysfs_get_dirent_safe(mddev->kobj.sd, "degraded");
- } else if (mddev->ro == 2) /* auto-readonly not meaningful */
- mddev->ro = 0;
+ } else if (mddev->ro == MD_AUTO_READ)
+ mddev->ro = MD_RDWR;
atomic_set(&mddev->max_corr_read_errors,
MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
@@ -6015,7 +6005,7 @@ int md_run(struct mddev *mddev)
if (rdev->raid_disk >= 0)
sysfs_link_rdev(mddev, rdev); /* failure here is OK */
- if (mddev->degraded && !mddev->ro)
+ if (mddev->degraded && md_is_rdwr(mddev))
/* This ensures that recovering status is reported immediately
* via sysfs - until a lack of spares is confirmed.
*/
@@ -6105,7 +6095,7 @@ static int restart_array(struct mddev *mddev)
return -ENXIO;
if (!mddev->pers)
return -EINVAL;
- if (!mddev->ro)
+ if (md_is_rdwr(mddev))
return -EBUSY;
rcu_read_lock();
@@ -6124,7 +6114,7 @@ static int restart_array(struct mddev *mddev)
return -EROFS;
mddev->safemode = 0;
- mddev->ro = 0;
+ mddev->ro = MD_RDWR;
set_disk_ro(disk, 0);
pr_debug("md: %s switched to read-write mode.\n", mdname(mddev));
/* Kick recovery or resync if necessary */
@@ -6151,7 +6141,7 @@ static void md_clean(struct mddev *mddev)
mddev->clevel[0] = 0;
mddev->flags = 0;
mddev->sb_flags = 0;
- mddev->ro = 0;
+ mddev->ro = MD_RDWR;
mddev->metadata_type[0] = 0;
mddev->chunk_sectors = 0;
mddev->ctime = mddev->utime = 0;
@@ -6203,7 +6193,7 @@ static void __md_stop_writes(struct mddev *mddev)
}
md_bitmap_flush(mddev);
- if (mddev->ro == 0 &&
+ if (md_is_rdwr(mddev) &&
((!mddev->in_sync && !mddev_is_clustered(mddev)) ||
mddev->sb_flags)) {
/* mark array as shutdown cleanly */
@@ -6312,9 +6302,9 @@ static int md_set_readonly(struct mddev *mddev, struct block_device *bdev)
__md_stop_writes(mddev);
err = -ENXIO;
- if (mddev->ro==1)
+ if (mddev->ro == MD_RDONLY)
goto out;
- mddev->ro = 1;
+ mddev->ro = MD_RDONLY;
set_disk_ro(mddev->gendisk, 1);
clear_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
@@ -6371,7 +6361,7 @@ static int do_md_stop(struct mddev *mddev, int mode,
return -EBUSY;
}
if (mddev->pers) {
- if (mddev->ro)
+ if (!md_is_rdwr(mddev))
set_disk_ro(disk, 0);
__md_stop_writes(mddev);
@@ -6388,8 +6378,8 @@ static int do_md_stop(struct mddev *mddev, int mode,
mutex_unlock(&mddev->open_mutex);
mddev->changed = 1;
- if (mddev->ro)
- mddev->ro = 0;
+ if (!md_is_rdwr(mddev))
+ mddev->ro = MD_RDWR;
} else
mutex_unlock(&mddev->open_mutex);
/*
@@ -7204,7 +7194,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
mddev->sync_thread)
return -EBUSY;
- if (mddev->ro)
+ if (!md_is_rdwr(mddev))
return -EROFS;
rdev_for_each(rdev, mddev) {
@@ -7234,7 +7224,7 @@ static int update_raid_disks(struct mddev *mddev, int raid_disks)
/* change the number of raid disks */
if (mddev->pers->check_reshape == NULL)
return -EINVAL;
- if (mddev->ro)
+ if (!md_is_rdwr(mddev))
return -EROFS;
if (raid_disks <= 0 ||
(mddev->max_disks && raid_disks >= mddev->max_disks))
@@ -7464,6 +7454,40 @@ static inline bool md_ioctl_valid(unsigned int cmd)
}
}
+static int __md_set_array_info(struct mddev *mddev, void __user *argp)
+{
+ mdu_array_info_t info;
+ int err;
+
+ if (!argp)
+ memset(&info, 0, sizeof(info));
+ else if (copy_from_user(&info, argp, sizeof(info)))
+ return -EFAULT;
+
+ if (mddev->pers) {
+ err = update_array_info(mddev, &info);
+ if (err)
+ pr_warn("md: couldn't update array info. %d\n", err);
+ return err;
+ }
+
+ if (!list_empty(&mddev->disks)) {
+ pr_warn("md: array %s already has disks!\n", mdname(mddev));
+ return -EBUSY;
+ }
+
+ if (mddev->raid_disks) {
+ pr_warn("md: array %s already initialised!\n", mdname(mddev));
+ return -EBUSY;
+ }
+
+ err = md_set_array_info(mddev, &info);
+ if (err)
+ pr_warn("md: couldn't set array info. %d\n", err);
+
+ return err;
+}
+
static int md_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
@@ -7569,36 +7593,7 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
}
if (cmd == SET_ARRAY_INFO) {
- mdu_array_info_t info;
- if (!arg)
- memset(&info, 0, sizeof(info));
- else if (copy_from_user(&info, argp, sizeof(info))) {
- err = -EFAULT;
- goto unlock;
- }
- if (mddev->pers) {
- err = update_array_info(mddev, &info);
- if (err) {
- pr_warn("md: couldn't update array info. %d\n", err);
- goto unlock;
- }
- goto unlock;
- }
- if (!list_empty(&mddev->disks)) {
- pr_warn("md: array %s already has disks!\n", mdname(mddev));
- err = -EBUSY;
- goto unlock;
- }
- if (mddev->raid_disks) {
- pr_warn("md: array %s already initialised!\n", mdname(mddev));
- err = -EBUSY;
- goto unlock;
- }
- err = md_set_array_info(mddev, &info);
- if (err) {
- pr_warn("md: couldn't set array info. %d\n", err);
- goto unlock;
- }
+ err = __md_set_array_info(mddev, argp);
goto unlock;
}
@@ -7658,26 +7653,25 @@ static int md_ioctl(struct block_device *bdev, fmode_t mode,
* The remaining ioctls are changing the state of the
* superblock, so we do not allow them on read-only arrays.
*/
- if (mddev->ro && mddev->pers) {
- if (mddev->ro == 2) {
- mddev->ro = 0;
- sysfs_notify_dirent_safe(mddev->sysfs_state);
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- /* mddev_unlock will wake thread */
- /* If a device failed while we were read-only, we
- * need to make sure the metadata is updated now.
- */
- if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
- mddev_unlock(mddev);
- wait_event(mddev->sb_wait,
- !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
- !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
- mddev_lock_nointr(mddev);
- }
- } else {
+ if (!md_is_rdwr(mddev) && mddev->pers) {
+ if (mddev->ro != MD_AUTO_READ) {
err = -EROFS;
goto unlock;
}
+ mddev->ro = MD_RDWR;
+ sysfs_notify_dirent_safe(mddev->sysfs_state);
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ /* mddev_unlock will wake thread */
+ /* If a device failed while we were read-only, we
+ * need to make sure the metadata is updated now.
+ */
+ if (test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags)) {
+ mddev_unlock(mddev);
+ wait_event(mddev->sb_wait,
+ !test_bit(MD_SB_CHANGE_DEVS, &mddev->sb_flags) &&
+ !test_bit(MD_SB_CHANGE_PENDING, &mddev->sb_flags));
+ mddev_lock_nointr(mddev);
+ }
}
switch (cmd) {
@@ -7763,11 +7757,11 @@ static int md_set_read_only(struct block_device *bdev, bool ro)
* Transitioning to read-auto need only happen for arrays that call
* md_write_start and which are not ready for writes yet.
*/
- if (!ro && mddev->ro == 1 && mddev->pers) {
+ if (!ro && mddev->ro == MD_RDONLY && mddev->pers) {
err = restart_array(mddev);
if (err)
goto out_unlock;
- mddev->ro = 2;
+ mddev->ro = MD_AUTO_READ;
}
out_unlock:
@@ -8241,9 +8235,9 @@ static int md_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%s : %sactive", mdname(mddev),
mddev->pers ? "" : "in");
if (mddev->pers) {
- if (mddev->ro==1)
+ if (mddev->ro == MD_RDONLY)
seq_printf(seq, " (read-only)");
- if (mddev->ro==2)
+ if (mddev->ro == MD_AUTO_READ)
seq_printf(seq, " (auto-read-only)");
seq_printf(seq, " %s", mddev->pers->name);
}
@@ -8502,10 +8496,10 @@ bool md_write_start(struct mddev *mddev, struct bio *bi)
if (bio_data_dir(bi) != WRITE)
return true;
- BUG_ON(mddev->ro == 1);
- if (mddev->ro == 2) {
+ BUG_ON(mddev->ro == MD_RDONLY);
+ if (mddev->ro == MD_AUTO_READ) {
/* need to switch to read/write */
- mddev->ro = 0;
+ mddev->ro = MD_RDWR;
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
md_wakeup_thread(mddev->sync_thread);
@@ -8556,7 +8550,7 @@ void md_write_inc(struct mddev *mddev, struct bio *bi)
{
if (bio_data_dir(bi) != WRITE)
return;
- WARN_ON_ONCE(mddev->in_sync || mddev->ro);
+ WARN_ON_ONCE(mddev->in_sync || !md_is_rdwr(mddev));
percpu_ref_get(&mddev->writes_pending);
}
EXPORT_SYMBOL(md_write_inc);
@@ -8661,7 +8655,7 @@ void md_allow_write(struct mddev *mddev)
{
if (!mddev->pers)
return;
- if (mddev->ro)
+ if (!md_is_rdwr(mddev))
return;
if (!mddev->pers->sync_request)
return;
@@ -8709,7 +8703,7 @@ void md_do_sync(struct md_thread *thread)
if (test_bit(MD_RECOVERY_DONE, &mddev->recovery) ||
test_bit(MD_RECOVERY_WAIT, &mddev->recovery))
return;
- if (mddev->ro) {/* never try to sync a read-only array */
+ if (!md_is_rdwr(mddev)) {/* never try to sync a read-only array */
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
return;
}
@@ -9178,9 +9172,9 @@ static int remove_and_add_spares(struct mddev *mddev,
if (test_bit(Faulty, &rdev->flags))
continue;
if (!test_bit(Journal, &rdev->flags)) {
- if (mddev->ro &&
- ! (rdev->saved_raid_disk >= 0 &&
- !test_bit(Bitmap_sync, &rdev->flags)))
+ if (!md_is_rdwr(mddev) &&
+ !(rdev->saved_raid_disk >= 0 &&
+ !test_bit(Bitmap_sync, &rdev->flags)))
continue;
rdev->recovery_offset = 0;
@@ -9278,7 +9272,8 @@ void md_check_recovery(struct mddev *mddev)
flush_signals(current);
}
- if (mddev->ro && !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
+ if (!md_is_rdwr(mddev) &&
+ !test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
return;
if ( ! (
(mddev->sb_flags & ~ (1<<MD_SB_CHANGE_PENDING)) ||
@@ -9297,7 +9292,7 @@ void md_check_recovery(struct mddev *mddev)
if (!mddev->external && mddev->safemode == 1)
mddev->safemode = 0;
- if (mddev->ro) {
+ if (!md_is_rdwr(mddev)) {
struct md_rdev *rdev;
if (!mddev->external && mddev->in_sync)
/* 'Blocked' flag not needed as failed devices
diff --git a/drivers/md/md.h b/drivers/md/md.h
index b4e2d8b87b61..554a9026669a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -782,7 +782,6 @@ extern void mddev_resume(struct mddev *mddev);
extern void md_reload_sb(struct mddev *mddev, int raid_disk);
extern void md_update_sb(struct mddev *mddev, int force);
-extern void md_kick_rdev_from_array(struct md_rdev * rdev);
extern void mddev_create_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
bool is_suspend);
extern void mddev_destroy_serial_pool(struct mddev *mddev, struct md_rdev *rdev,
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 857c49399c28..b536befd8898 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -398,7 +398,6 @@ static int raid0_run(struct mddev *mddev)
blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
blk_queue_max_write_zeroes_sectors(mddev->queue, mddev->chunk_sectors);
- blk_queue_max_discard_sectors(mddev->queue, UINT_MAX);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
blk_queue_io_opt(mddev->queue,
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 05d8438cfec8..68a9e2d9985b 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1321,7 +1321,7 @@ static void raid1_read_request(struct mddev *mddev, struct bio *bio,
read_bio->bi_iter.bi_sector = r1_bio->sector +
mirror->rdev->data_offset;
read_bio->bi_end_io = raid1_end_read_request;
- bio_set_op_attrs(read_bio, op, do_sync);
+ read_bio->bi_opf = op | do_sync;
if (test_bit(FailFast, &mirror->rdev->flags) &&
test_bit(R1BIO_FailFast, &r1_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
@@ -2254,7 +2254,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
continue;
}
- bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
+ wbio->bi_opf = REQ_OP_WRITE;
if (test_bit(FailFast, &conf->mirrors[i].rdev->flags))
wbio->bi_opf |= MD_FAILFAST;
@@ -2419,7 +2419,7 @@ static int narrow_write_error(struct r1bio *r1_bio, int i)
GFP_NOIO, &mddev->bio_set);
}
- bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
+ wbio->bi_opf = REQ_OP_WRITE;
wbio->bi_iter.bi_sector = r1_bio->sector;
wbio->bi_iter.bi_size = r1_bio->sectors << 9;
@@ -2770,7 +2770,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (i < conf->raid_disks)
still_degraded = 1;
} else if (!test_bit(In_sync, &rdev->flags)) {
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ bio->bi_opf = REQ_OP_WRITE;
bio->bi_end_io = end_sync_write;
write_targets ++;
} else {
@@ -2797,7 +2797,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
if (disk < 0)
disk = i;
}
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio->bi_opf = REQ_OP_READ;
bio->bi_end_io = end_sync_read;
read_targets++;
} else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
@@ -2809,7 +2809,7 @@ static sector_t raid1_sync_request(struct mddev *mddev, sector_t sector_nr,
* if we are doing resync or repair. Otherwise, leave
* this device alone for this sync request.
*/
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ bio->bi_opf = REQ_OP_WRITE;
bio->bi_end_io = end_sync_write;
write_targets++;
}
@@ -3159,6 +3159,7 @@ static int raid1_run(struct mddev *mddev)
* RAID1 needs at least one disk in active
*/
if (conf->raid_disks - mddev->degraded < 1) {
+ md_unregister_thread(&conf->thread);
ret = -EINVAL;
goto abort;
}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 3aa8b6e11d58..6c66357f92f5 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1254,7 +1254,7 @@ static void raid10_read_request(struct mddev *mddev, struct bio *bio,
read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
choose_data_offset(r10_bio, rdev);
read_bio->bi_end_io = raid10_end_read_request;
- bio_set_op_attrs(read_bio, op, do_sync);
+ read_bio->bi_opf = op | do_sync;
if (test_bit(FailFast, &rdev->flags) &&
test_bit(R10BIO_FailFast, &r10_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
@@ -1301,7 +1301,7 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio,
mbio->bi_iter.bi_sector = (r10_bio->devs[n_copy].addr +
choose_data_offset(r10_bio, rdev));
mbio->bi_end_io = raid10_end_write_request;
- bio_set_op_attrs(mbio, op, do_sync | do_fua);
+ mbio->bi_opf = op | do_sync | do_fua;
if (!replacement && test_bit(FailFast,
&conf->mirrors[devnum].rdev->flags)
&& enough(conf, devnum))
@@ -2933,7 +2933,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i)
wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector);
wbio->bi_iter.bi_sector = wsector +
choose_data_offset(r10_bio, rdev);
- bio_set_op_attrs(wbio, REQ_OP_WRITE, 0);
+ wbio->bi_opf = REQ_OP_WRITE;
if (submit_bio_wait(wbio) < 0)
/* Failure! */
@@ -3542,7 +3542,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_read;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio->bi_opf = REQ_OP_READ;
if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
from_addr = r10_bio->devs[j].addr;
@@ -3567,7 +3567,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_write;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ bio->bi_opf = REQ_OP_WRITE;
bio->bi_iter.bi_sector = to_addr
+ mrdev->data_offset;
bio_set_dev(bio, mrdev->bdev);
@@ -3588,7 +3588,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_write;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ bio->bi_opf = REQ_OP_WRITE;
bio->bi_iter.bi_sector = to_addr +
mreplace->data_offset;
bio_set_dev(bio, mreplace->bdev);
@@ -3742,7 +3742,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_read;
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
+ bio->bi_opf = REQ_OP_READ;
if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
bio->bi_iter.bi_sector = sector + rdev->data_offset;
@@ -3764,7 +3764,7 @@ static sector_t raid10_sync_request(struct mddev *mddev, sector_t sector_nr,
bio->bi_next = biolist;
biolist = bio;
bio->bi_end_io = end_sync_write;
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
+ bio->bi_opf = REQ_OP_WRITE;
if (test_bit(FailFast, &rdev->flags))
bio->bi_opf |= MD_FAILFAST;
bio->bi_iter.bi_sector = sector + rdev->data_offset;
@@ -4145,8 +4145,6 @@ static int raid10_run(struct mddev *mddev)
conf->thread = NULL;
if (mddev->queue) {
- blk_queue_max_discard_sectors(mddev->queue,
- UINT_MAX);
blk_queue_max_write_zeroes_sectors(mddev->queue, 0);
blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
raid10_set_io_opt(conf);
@@ -4972,7 +4970,7 @@ read_more:
b->bi_iter.bi_sector = r10_bio->devs[s/2].addr +
rdev2->new_data_offset;
b->bi_end_io = end_reshape_write;
- bio_set_op_attrs(b, REQ_OP_WRITE, 0);
+ b->bi_opf = REQ_OP_WRITE;
b->bi_next = blist;
blist = b;
}
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 832d8566e165..46182b955aef 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -1565,11 +1565,12 @@ void r5l_wake_reclaim(struct r5l_log *log, sector_t space)
if (!log)
return;
+
+ target = READ_ONCE(log->reclaim_target);
do {
- target = log->reclaim_target;
if (new < target)
return;
- } while (cmpxchg(&log->reclaim_target, target, new) != target);
+ } while (!try_cmpxchg(&log->reclaim_target, &target, new));
md_wakeup_thread(log->reclaim_thread);
}
@@ -3061,7 +3062,6 @@ void r5c_update_on_rdev_error(struct mddev *mddev, struct md_rdev *rdev)
int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
{
- struct request_queue *q = bdev_get_queue(rdev->bdev);
struct r5l_log *log;
int ret;
@@ -3090,9 +3090,7 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
if (!log)
return -ENOMEM;
log->rdev = rdev;
-
- log->need_cache_flush = test_bit(QUEUE_FLAG_WC, &q->queue_flags) != 0;
-
+ log->need_cache_flush = bdev_write_cache(rdev->bdev);
log->uuid_checksum = crc32c_le(~0, rdev->mddev->uuid,
sizeof(rdev->mddev->uuid));
diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c
index 31b9157bc9ae..e495939bb3e0 100644
--- a/drivers/md/raid5-ppl.c
+++ b/drivers/md/raid5-ppl.c
@@ -1301,8 +1301,6 @@ static int ppl_validate_rdev(struct md_rdev *rdev)
static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
{
- struct request_queue *q;
-
if ((rdev->ppl.size << 9) >= (PPL_SPACE_SIZE +
PPL_HEADER_SIZE) * 2) {
log->use_multippl = true;
@@ -1316,8 +1314,7 @@ static void ppl_init_child_log(struct ppl_log *log, struct md_rdev *rdev)
}
log->next_io_sector = rdev->ppl.sector;
- q = bdev_get_queue(rdev->bdev);
- if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+ if (bdev_write_cache(rdev->bdev))
log->wb_cache_on = true;
}
diff --git a/drivers/nvme/host/apple.c b/drivers/nvme/host/apple.c
index ff8b083dc5c6..e36aeb50b4ed 100644
--- a/drivers/nvme/host/apple.c
+++ b/drivers/nvme/host/apple.c
@@ -763,7 +763,7 @@ static blk_status_t apple_nvme_queue_rq(struct blk_mq_hw_ctx *hctx,
goto out_free_cmd;
}
- blk_mq_start_request(req);
+ nvme_start_request(req);
apple_nvme_submit_cmd(q, cmnd);
return BLK_STS_OK;
@@ -821,7 +821,7 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown)
if (!dead && shutdown && freeze)
nvme_wait_freeze_timeout(&anv->ctrl, NVME_IO_TIMEOUT);
- nvme_stop_queues(&anv->ctrl);
+ nvme_quiesce_io_queues(&anv->ctrl);
if (!dead) {
if (READ_ONCE(anv->ioq.enabled)) {
@@ -829,15 +829,13 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown)
apple_nvme_remove_cq(anv);
}
- if (shutdown)
- nvme_shutdown_ctrl(&anv->ctrl);
- nvme_disable_ctrl(&anv->ctrl);
+ nvme_disable_ctrl(&anv->ctrl, shutdown);
}
WRITE_ONCE(anv->ioq.enabled, false);
WRITE_ONCE(anv->adminq.enabled, false);
mb(); /* ensure that nvme_queue_rq() sees that enabled is cleared */
- nvme_stop_admin_queue(&anv->ctrl);
+ nvme_quiesce_admin_queue(&anv->ctrl);
/* last chance to complete any requests before nvme_cancel_request */
spin_lock_irqsave(&anv->lock, flags);
@@ -854,8 +852,8 @@ static void apple_nvme_disable(struct apple_nvme *anv, bool shutdown)
* deadlocking blk-mq hot-cpu notifier.
*/
if (shutdown) {
- nvme_start_queues(&anv->ctrl);
- nvme_start_admin_queue(&anv->ctrl);
+ nvme_unquiesce_io_queues(&anv->ctrl);
+ nvme_unquiesce_admin_queue(&anv->ctrl);
}
}
@@ -1093,7 +1091,7 @@ static void apple_nvme_reset_work(struct work_struct *work)
dev_dbg(anv->dev, "Starting admin queue");
apple_nvme_init_queue(&anv->adminq);
- nvme_start_admin_queue(&anv->ctrl);
+ nvme_unquiesce_admin_queue(&anv->ctrl);
if (!nvme_change_ctrl_state(&anv->ctrl, NVME_CTRL_CONNECTING)) {
dev_warn(anv->ctrl.device,
@@ -1102,7 +1100,7 @@ static void apple_nvme_reset_work(struct work_struct *work)
goto out;
}
- ret = nvme_init_ctrl_finish(&anv->ctrl);
+ ret = nvme_init_ctrl_finish(&anv->ctrl, false);
if (ret)
goto out;
@@ -1127,7 +1125,7 @@ static void apple_nvme_reset_work(struct work_struct *work)
anv->ctrl.queue_count = nr_io_queues + 1;
- nvme_start_queues(&anv->ctrl);
+ nvme_unquiesce_io_queues(&anv->ctrl);
nvme_wait_freeze(&anv->ctrl);
blk_mq_update_nr_hw_queues(&anv->tagset, 1);
nvme_unfreeze(&anv->ctrl);
@@ -1153,7 +1151,7 @@ out:
nvme_change_ctrl_state(&anv->ctrl, NVME_CTRL_DELETING);
nvme_get_ctrl(&anv->ctrl);
apple_nvme_disable(anv, false);
- nvme_kill_queues(&anv->ctrl);
+ nvme_mark_namespaces_dead(&anv->ctrl);
if (!queue_work(nvme_wq, &anv->remove_work))
nvme_put_ctrl(&anv->ctrl);
}
@@ -1507,14 +1505,6 @@ static int apple_nvme_probe(struct platform_device *pdev)
goto put_dev;
}
- if (!blk_get_queue(anv->ctrl.admin_q)) {
- nvme_start_admin_queue(&anv->ctrl);
- blk_mq_destroy_queue(anv->ctrl.admin_q);
- anv->ctrl.admin_q = NULL;
- ret = -ENODEV;
- goto put_dev;
- }
-
nvme_reset_ctrl(&anv->ctrl);
async_schedule(apple_nvme_async_probe, anv);
diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c
index c8a6db7c4498..bb0abbe4491c 100644
--- a/drivers/nvme/host/auth.c
+++ b/drivers/nvme/host/auth.c
@@ -13,6 +13,10 @@
#include "fabrics.h"
#include <linux/nvme-auth.h>
+#define CHAP_BUF_SIZE 4096
+static struct kmem_cache *nvme_chap_buf_cache;
+static mempool_t *nvme_chap_buf_pool;
+
struct nvme_dhchap_queue_context {
struct list_head entry;
struct work_struct auth_work;
@@ -20,7 +24,6 @@ struct nvme_dhchap_queue_context {
struct crypto_shash *shash_tfm;
struct crypto_kpp *dh_tfm;
void *buf;
- size_t buf_size;
int qid;
int error;
u32 s1;
@@ -47,6 +50,12 @@ struct nvme_dhchap_queue_context {
#define nvme_auth_queue_from_qid(ctrl, qid) \
(qid == 0) ? (ctrl)->fabrics_q : (ctrl)->connect_q
+static inline int ctrl_max_dhchaps(struct nvme_ctrl *ctrl)
+{
+ return ctrl->opts->nr_io_queues + ctrl->opts->nr_write_queues +
+ ctrl->opts->nr_poll_queues + 1;
+}
+
static int nvme_auth_submit(struct nvme_ctrl *ctrl, int qid,
void *data, size_t data_len, bool auth_send)
{
@@ -112,7 +121,7 @@ static int nvme_auth_set_dhchap_negotiate_data(struct nvme_ctrl *ctrl,
struct nvmf_auth_dhchap_negotiate_data *data = chap->buf;
size_t size = sizeof(*data) + sizeof(union nvmf_auth_protocol);
- if (chap->buf_size < size) {
+ if (size > CHAP_BUF_SIZE) {
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
return -EINVAL;
}
@@ -147,7 +156,7 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl,
const char *gid_name = nvme_auth_dhgroup_name(data->dhgid);
const char *hmac_name, *kpp_name;
- if (chap->buf_size < size) {
+ if (size > CHAP_BUF_SIZE) {
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
return NVME_SC_INVALID_FIELD;
}
@@ -197,12 +206,6 @@ static int nvme_auth_process_dhchap_challenge(struct nvme_ctrl *ctrl,
return NVME_SC_AUTH_REQUIRED;
}
- /* Reset host response if the hash had been changed */
- if (chap->hash_id != data->hashid) {
- kfree(chap->host_response);
- chap->host_response = NULL;
- }
-
chap->hash_id = data->hashid;
chap->hash_len = data->hl;
dev_dbg(ctrl->device, "qid %d: selected hash %s\n",
@@ -219,14 +222,6 @@ select_kpp:
return NVME_SC_AUTH_REQUIRED;
}
- /* Clear host and controller key to avoid accidental reuse */
- kfree_sensitive(chap->host_key);
- chap->host_key = NULL;
- chap->host_key_len = 0;
- kfree_sensitive(chap->ctrl_key);
- chap->ctrl_key = NULL;
- chap->ctrl_key_len = 0;
-
if (chap->dhgroup_id == data->dhgid &&
(data->dhgid == NVME_AUTH_DHGROUP_NULL || chap->dh_tfm)) {
dev_dbg(ctrl->device,
@@ -302,7 +297,7 @@ static int nvme_auth_set_dhchap_reply_data(struct nvme_ctrl *ctrl,
if (chap->host_key_len)
size += chap->host_key_len;
- if (chap->buf_size < size) {
+ if (size > CHAP_BUF_SIZE) {
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
return -EINVAL;
}
@@ -344,10 +339,10 @@ static int nvme_auth_process_dhchap_success1(struct nvme_ctrl *ctrl,
struct nvmf_auth_dhchap_success1_data *data = chap->buf;
size_t size = sizeof(*data);
- if (ctrl->ctrl_key)
+ if (chap->ctrl_key)
size += chap->hash_len;
- if (chap->buf_size < size) {
+ if (size > CHAP_BUF_SIZE) {
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
return NVME_SC_INVALID_FIELD;
}
@@ -521,6 +516,7 @@ static int nvme_auth_dhchap_setup_ctrl_response(struct nvme_ctrl *ctrl,
ret = PTR_ERR(ctrl_response);
return ret;
}
+
ret = crypto_shash_setkey(chap->shash_tfm,
ctrl_response, ctrl->ctrl_key->len);
if (ret) {
@@ -621,9 +617,6 @@ static int nvme_auth_dhchap_exponential(struct nvme_ctrl *ctrl,
if (ret) {
dev_dbg(ctrl->device,
"failed to generate public key, error %d\n", ret);
- kfree(chap->host_key);
- chap->host_key = NULL;
- chap->host_key_len = 0;
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
return ret;
}
@@ -643,9 +636,6 @@ gen_sesskey:
if (ret) {
dev_dbg(ctrl->device,
"failed to generate shared secret, error %d\n", ret);
- kfree_sensitive(chap->sess_key);
- chap->sess_key = NULL;
- chap->sess_key_len = 0;
chap->status = NVME_AUTH_DHCHAP_FAILURE_INCORRECT_PAYLOAD;
return ret;
}
@@ -654,7 +644,7 @@ gen_sesskey:
return 0;
}
-static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap)
+static void nvme_auth_reset_dhchap(struct nvme_dhchap_queue_context *chap)
{
kfree_sensitive(chap->host_response);
chap->host_response = NULL;
@@ -674,24 +664,20 @@ static void __nvme_auth_reset(struct nvme_dhchap_queue_context *chap)
chap->transaction = 0;
memset(chap->c1, 0, sizeof(chap->c1));
memset(chap->c2, 0, sizeof(chap->c2));
+ mempool_free(chap->buf, nvme_chap_buf_pool);
+ chap->buf = NULL;
}
-static void __nvme_auth_free(struct nvme_dhchap_queue_context *chap)
+static void nvme_auth_free_dhchap(struct nvme_dhchap_queue_context *chap)
{
- __nvme_auth_reset(chap);
+ nvme_auth_reset_dhchap(chap);
if (chap->shash_tfm)
crypto_free_shash(chap->shash_tfm);
if (chap->dh_tfm)
crypto_free_kpp(chap->dh_tfm);
- kfree_sensitive(chap->ctrl_key);
- kfree_sensitive(chap->host_key);
- kfree_sensitive(chap->sess_key);
- kfree_sensitive(chap->host_response);
- kfree(chap->buf);
- kfree(chap);
}
-static void __nvme_auth_work(struct work_struct *work)
+static void nvme_queue_auth_work(struct work_struct *work)
{
struct nvme_dhchap_queue_context *chap =
container_of(work, struct nvme_dhchap_queue_context, auth_work);
@@ -699,6 +685,16 @@ static void __nvme_auth_work(struct work_struct *work)
size_t tl;
int ret = 0;
+ /*
+ * Allocate a large enough buffer for the entire negotiation:
+ * 4k is enough to ffdhe8192.
+ */
+ chap->buf = mempool_alloc(nvme_chap_buf_pool, GFP_KERNEL);
+ if (!chap->buf) {
+ chap->error = -ENOMEM;
+ return;
+ }
+
chap->transaction = ctrl->transaction++;
/* DH-HMAC-CHAP Step 1: send negotiate */
@@ -720,8 +716,9 @@ static void __nvme_auth_work(struct work_struct *work)
dev_dbg(ctrl->device, "%s: qid %d receive challenge\n",
__func__, chap->qid);
- memset(chap->buf, 0, chap->buf_size);
- ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false);
+ memset(chap->buf, 0, CHAP_BUF_SIZE);
+ ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, CHAP_BUF_SIZE,
+ false);
if (ret) {
dev_warn(ctrl->device,
"qid %d failed to receive challenge, %s %d\n",
@@ -757,11 +754,14 @@ static void __nvme_auth_work(struct work_struct *work)
dev_dbg(ctrl->device, "%s: qid %d host response\n",
__func__, chap->qid);
+ mutex_lock(&ctrl->dhchap_auth_mutex);
ret = nvme_auth_dhchap_setup_host_response(ctrl, chap);
if (ret) {
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
chap->error = ret;
goto fail2;
}
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
/* DH-HMAC-CHAP Step 3: send reply */
dev_dbg(ctrl->device, "%s: qid %d send reply\n",
@@ -783,8 +783,9 @@ static void __nvme_auth_work(struct work_struct *work)
dev_dbg(ctrl->device, "%s: qid %d receive success1\n",
__func__, chap->qid);
- memset(chap->buf, 0, chap->buf_size);
- ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, chap->buf_size, false);
+ memset(chap->buf, 0, CHAP_BUF_SIZE);
+ ret = nvme_auth_submit(ctrl, chap->qid, chap->buf, CHAP_BUF_SIZE,
+ false);
if (ret) {
dev_warn(ctrl->device,
"qid %d failed to receive success1, %s %d\n",
@@ -801,16 +802,19 @@ static void __nvme_auth_work(struct work_struct *work)
return;
}
+ mutex_lock(&ctrl->dhchap_auth_mutex);
if (ctrl->ctrl_key) {
dev_dbg(ctrl->device,
"%s: qid %d controller response\n",
__func__, chap->qid);
ret = nvme_auth_dhchap_setup_ctrl_response(ctrl, chap);
if (ret) {
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
chap->error = ret;
goto fail2;
}
}
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
ret = nvme_auth_process_dhchap_success1(ctrl, chap);
if (ret) {
@@ -819,7 +823,7 @@ static void __nvme_auth_work(struct work_struct *work)
goto fail2;
}
- if (ctrl->ctrl_key) {
+ if (chap->ctrl_key) {
/* DH-HMAC-CHAP Step 5: send success2 */
dev_dbg(ctrl->device, "%s: qid %d send success2\n",
__func__, chap->qid);
@@ -860,42 +864,8 @@ int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
return -ENOKEY;
}
- mutex_lock(&ctrl->dhchap_auth_mutex);
- /* Check if the context is already queued */
- list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
- WARN_ON(!chap->buf);
- if (chap->qid == qid) {
- dev_dbg(ctrl->device, "qid %d: re-using context\n", qid);
- mutex_unlock(&ctrl->dhchap_auth_mutex);
- flush_work(&chap->auth_work);
- __nvme_auth_reset(chap);
- queue_work(nvme_wq, &chap->auth_work);
- return 0;
- }
- }
- chap = kzalloc(sizeof(*chap), GFP_KERNEL);
- if (!chap) {
- mutex_unlock(&ctrl->dhchap_auth_mutex);
- return -ENOMEM;
- }
- chap->qid = (qid == NVME_QID_ANY) ? 0 : qid;
- chap->ctrl = ctrl;
-
- /*
- * Allocate a large enough buffer for the entire negotiation:
- * 4k should be enough to ffdhe8192.
- */
- chap->buf_size = 4096;
- chap->buf = kzalloc(chap->buf_size, GFP_KERNEL);
- if (!chap->buf) {
- mutex_unlock(&ctrl->dhchap_auth_mutex);
- kfree(chap);
- return -ENOMEM;
- }
-
- INIT_WORK(&chap->auth_work, __nvme_auth_work);
- list_add(&chap->entry, &ctrl->dhchap_auth_list);
- mutex_unlock(&ctrl->dhchap_auth_mutex);
+ chap = &ctrl->dhchap_ctxs[qid];
+ cancel_work_sync(&chap->auth_work);
queue_work(nvme_wq, &chap->auth_work);
return 0;
}
@@ -906,40 +876,28 @@ int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid)
struct nvme_dhchap_queue_context *chap;
int ret;
- mutex_lock(&ctrl->dhchap_auth_mutex);
- list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
- if (chap->qid != qid)
- continue;
- mutex_unlock(&ctrl->dhchap_auth_mutex);
- flush_work(&chap->auth_work);
- ret = chap->error;
- return ret;
- }
- mutex_unlock(&ctrl->dhchap_auth_mutex);
- return -ENXIO;
+ chap = &ctrl->dhchap_ctxs[qid];
+ flush_work(&chap->auth_work);
+ ret = chap->error;
+ /* clear sensitive info */
+ nvme_auth_reset_dhchap(chap);
+ return ret;
}
EXPORT_SYMBOL_GPL(nvme_auth_wait);
-void nvme_auth_reset(struct nvme_ctrl *ctrl)
-{
- struct nvme_dhchap_queue_context *chap;
-
- mutex_lock(&ctrl->dhchap_auth_mutex);
- list_for_each_entry(chap, &ctrl->dhchap_auth_list, entry) {
- mutex_unlock(&ctrl->dhchap_auth_mutex);
- flush_work(&chap->auth_work);
- __nvme_auth_reset(chap);
- }
- mutex_unlock(&ctrl->dhchap_auth_mutex);
-}
-EXPORT_SYMBOL_GPL(nvme_auth_reset);
-
-static void nvme_dhchap_auth_work(struct work_struct *work)
+static void nvme_ctrl_auth_work(struct work_struct *work)
{
struct nvme_ctrl *ctrl =
container_of(work, struct nvme_ctrl, dhchap_auth_work);
int ret, q;
+ /*
+ * If the ctrl is no connected, bail as reconnect will handle
+ * authentication.
+ */
+ if (ctrl->state != NVME_CTRL_LIVE)
+ return;
+
/* Authenticate admin queue first */
ret = nvme_auth_negotiate(ctrl, 0);
if (ret) {
@@ -968,43 +926,75 @@ static void nvme_dhchap_auth_work(struct work_struct *work)
* Failure is a soft-state; credentials remain valid until
* the controller terminates the connection.
*/
+ for (q = 1; q < ctrl->queue_count; q++) {
+ ret = nvme_auth_wait(ctrl, q);
+ if (ret)
+ dev_warn(ctrl->device,
+ "qid %d: authentication failed\n", q);
+ }
}
-void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
+int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
{
- INIT_LIST_HEAD(&ctrl->dhchap_auth_list);
- INIT_WORK(&ctrl->dhchap_auth_work, nvme_dhchap_auth_work);
+ struct nvme_dhchap_queue_context *chap;
+ int i, ret;
+
mutex_init(&ctrl->dhchap_auth_mutex);
+ INIT_WORK(&ctrl->dhchap_auth_work, nvme_ctrl_auth_work);
if (!ctrl->opts)
- return;
- nvme_auth_generate_key(ctrl->opts->dhchap_secret, &ctrl->host_key);
- nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret, &ctrl->ctrl_key);
+ return 0;
+ ret = nvme_auth_generate_key(ctrl->opts->dhchap_secret,
+ &ctrl->host_key);
+ if (ret)
+ return ret;
+ ret = nvme_auth_generate_key(ctrl->opts->dhchap_ctrl_secret,
+ &ctrl->ctrl_key);
+ if (ret)
+ goto err_free_dhchap_secret;
+
+ if (!ctrl->opts->dhchap_secret && !ctrl->opts->dhchap_ctrl_secret)
+ return ret;
+
+ ctrl->dhchap_ctxs = kvcalloc(ctrl_max_dhchaps(ctrl),
+ sizeof(*chap), GFP_KERNEL);
+ if (!ctrl->dhchap_ctxs) {
+ ret = -ENOMEM;
+ goto err_free_dhchap_ctrl_secret;
+ }
+
+ for (i = 0; i < ctrl_max_dhchaps(ctrl); i++) {
+ chap = &ctrl->dhchap_ctxs[i];
+ chap->qid = i;
+ chap->ctrl = ctrl;
+ INIT_WORK(&chap->auth_work, nvme_queue_auth_work);
+ }
+
+ return 0;
+err_free_dhchap_ctrl_secret:
+ nvme_auth_free_key(ctrl->ctrl_key);
+ ctrl->ctrl_key = NULL;
+err_free_dhchap_secret:
+ nvme_auth_free_key(ctrl->host_key);
+ ctrl->host_key = NULL;
+ return ret;
}
EXPORT_SYMBOL_GPL(nvme_auth_init_ctrl);
void nvme_auth_stop(struct nvme_ctrl *ctrl)
{
- struct nvme_dhchap_queue_context *chap = NULL, *tmp;
-
cancel_work_sync(&ctrl->dhchap_auth_work);
- mutex_lock(&ctrl->dhchap_auth_mutex);
- list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry)
- cancel_work_sync(&chap->auth_work);
- mutex_unlock(&ctrl->dhchap_auth_mutex);
}
EXPORT_SYMBOL_GPL(nvme_auth_stop);
void nvme_auth_free(struct nvme_ctrl *ctrl)
{
- struct nvme_dhchap_queue_context *chap = NULL, *tmp;
+ int i;
- mutex_lock(&ctrl->dhchap_auth_mutex);
- list_for_each_entry_safe(chap, tmp, &ctrl->dhchap_auth_list, entry) {
- list_del_init(&chap->entry);
- flush_work(&chap->auth_work);
- __nvme_auth_free(chap);
+ if (ctrl->dhchap_ctxs) {
+ for (i = 0; i < ctrl_max_dhchaps(ctrl); i++)
+ nvme_auth_free_dhchap(&ctrl->dhchap_ctxs[i]);
+ kfree(ctrl->dhchap_ctxs);
}
- mutex_unlock(&ctrl->dhchap_auth_mutex);
if (ctrl->host_key) {
nvme_auth_free_key(ctrl->host_key);
ctrl->host_key = NULL;
@@ -1015,3 +1005,27 @@ void nvme_auth_free(struct nvme_ctrl *ctrl)
}
}
EXPORT_SYMBOL_GPL(nvme_auth_free);
+
+int __init nvme_init_auth(void)
+{
+ nvme_chap_buf_cache = kmem_cache_create("nvme-chap-buf-cache",
+ CHAP_BUF_SIZE, 0, SLAB_HWCACHE_ALIGN, NULL);
+ if (!nvme_chap_buf_cache)
+ return -ENOMEM;
+
+ nvme_chap_buf_pool = mempool_create(16, mempool_alloc_slab,
+ mempool_free_slab, nvme_chap_buf_cache);
+ if (!nvme_chap_buf_pool)
+ goto err_destroy_chap_buf_cache;
+
+ return 0;
+err_destroy_chap_buf_cache:
+ kmem_cache_destroy(nvme_chap_buf_cache);
+ return -ENOMEM;
+}
+
+void __exit nvme_exit_auth(void)
+{
+ mempool_destroy(nvme_chap_buf_pool);
+ kmem_cache_destroy(nvme_chap_buf_cache);
+}
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 7e3893d06bab..e26b085a007a 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -384,6 +384,8 @@ static inline void nvme_end_req(struct request *req)
nvme_log_error(req);
nvme_end_req_zoned(req);
nvme_trace_bio_complete(req);
+ if (req->cmd_flags & REQ_NVME_MPATH)
+ nvme_mpath_end_request(req);
blk_mq_end_request(req, status);
}
@@ -851,8 +853,11 @@ static inline blk_status_t nvme_setup_write_zeroes(struct nvme_ns *ns,
cmnd->write_zeroes.length =
cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1);
+ if (!(req->cmd_flags & REQ_NOUNMAP) && (ns->features & NVME_NS_DEAC))
+ cmnd->write_zeroes.control |= cpu_to_le16(NVME_WZ_DEAC);
+
if (nvme_ns_has_pi(ns)) {
- cmnd->write_zeroes.control = cpu_to_le16(NVME_RW_PRINFO_PRACT);
+ cmnd->write_zeroes.control |= cpu_to_le16(NVME_RW_PRINFO_PRACT);
switch (ns->pi_type) {
case NVME_NS_DPS_PI_TYPE1:
@@ -1118,11 +1123,12 @@ void nvme_passthru_end(struct nvme_ctrl *ctrl, u32 effects,
nvme_unfreeze(ctrl);
nvme_mpath_unfreeze(ctrl->subsys);
mutex_unlock(&ctrl->subsys->lock);
- nvme_remove_invalid_namespaces(ctrl, NVME_NSID_ALL);
mutex_unlock(&ctrl->scan_lock);
}
- if (effects & NVME_CMD_EFFECTS_CCC)
- nvme_init_ctrl_finish(ctrl);
+ if (effects & NVME_CMD_EFFECTS_CCC) {
+ dev_info(ctrl->device,
+"controller capabilities changed, reset may be required to take effect.\n");
+ }
if (effects & (NVME_CMD_EFFECTS_NIC | NVME_CMD_EFFECTS_NCC)) {
nvme_queue_scan(ctrl);
flush_work(&ctrl->scan_work);
@@ -2003,6 +2009,14 @@ static int nvme_update_ns_info_block(struct nvme_ns *ns,
}
}
+ /*
+ * Only set the DEAC bit if the device guarantees that reads from
+ * deallocated data return zeroes. While the DEAC bit does not
+ * require that, it must be a no-op if reads from deallocated data
+ * do not return zeroes.
+ */
+ if ((id->dlfeat & 0x7) == 0x1 && (id->dlfeat & (1 << 3)))
+ ns->features |= NVME_NS_DEAC;
set_disk_ro(ns->disk, nvme_ns_is_readonly(ns, info));
set_bit(NVME_NS_READY, &ns->flags);
blk_mq_unfreeze_queue(ns->disk->queue);
@@ -2179,7 +2193,7 @@ const struct pr_ops nvme_pr_ops = {
};
#ifdef CONFIG_BLK_SED_OPAL
-int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
+static int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
bool send)
{
struct nvme_ctrl *ctrl = data;
@@ -2196,7 +2210,23 @@ int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
return __nvme_submit_sync_cmd(ctrl->admin_q, &cmd, NULL, buffer, len,
NVME_QID_ANY, 1, 0);
}
-EXPORT_SYMBOL_GPL(nvme_sec_submit);
+
+static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
+{
+ if (ctrl->oacs & NVME_CTRL_OACS_SEC_SUPP) {
+ if (!ctrl->opal_dev)
+ ctrl->opal_dev = init_opal_dev(ctrl, &nvme_sec_submit);
+ else if (was_suspended)
+ opal_unlock_from_suspend(ctrl->opal_dev);
+ } else {
+ free_opal_dev(ctrl->opal_dev);
+ ctrl->opal_dev = NULL;
+ }
+}
+#else
+static void nvme_configure_opal(struct nvme_ctrl *ctrl, bool was_suspended)
+{
+}
#endif /* CONFIG_BLK_SED_OPAL */
#ifdef CONFIG_BLK_DEV_ZONED
@@ -2221,16 +2251,17 @@ static const struct block_device_operations nvme_bdev_ops = {
.pr_ops = &nvme_pr_ops,
};
-static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled)
+static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 mask, u32 val,
+ u32 timeout, const char *op)
{
- unsigned long timeout_jiffies = ((timeout + 1) * HZ / 2) + jiffies;
- u32 csts, bit = enabled ? NVME_CSTS_RDY : 0;
+ unsigned long timeout_jiffies = jiffies + timeout * HZ;
+ u32 csts;
int ret;
while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
if (csts == ~0)
return -ENODEV;
- if ((csts & NVME_CSTS_RDY) == bit)
+ if ((csts & mask) == val)
break;
usleep_range(1000, 2000);
@@ -2239,7 +2270,7 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled)
if (time_after(jiffies, timeout_jiffies)) {
dev_err(ctrl->device,
"Device not ready; aborting %s, CSTS=0x%x\n",
- enabled ? "initialisation" : "reset", csts);
+ op, csts);
return -ENODEV;
}
}
@@ -2247,27 +2278,29 @@ static int nvme_wait_ready(struct nvme_ctrl *ctrl, u32 timeout, bool enabled)
return ret;
}
-/*
- * If the device has been passed off to us in an enabled state, just clear
- * the enabled bit. The spec says we should set the 'shutdown notification
- * bits', but doing so may cause the device to complete commands to the
- * admin queue ... and we don't know what memory that might be pointing at!
- */
-int nvme_disable_ctrl(struct nvme_ctrl *ctrl)
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
int ret;
ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
- ctrl->ctrl_config &= ~NVME_CC_ENABLE;
+ if (shutdown)
+ ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
+ else
+ ctrl->ctrl_config &= ~NVME_CC_ENABLE;
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
+ if (shutdown) {
+ return nvme_wait_ready(ctrl, NVME_CSTS_SHST_MASK,
+ NVME_CSTS_SHST_CMPLT,
+ ctrl->shutdown_timeout, "shutdown");
+ }
if (ctrl->quirks & NVME_QUIRK_DELAY_BEFORE_CHK_RDY)
msleep(NVME_QUIRK_DELAY_AMOUNT);
-
- return nvme_wait_ready(ctrl, NVME_CAP_TIMEOUT(ctrl->cap), false);
+ return nvme_wait_ready(ctrl, NVME_CSTS_RDY, 0,
+ (NVME_CAP_TIMEOUT(ctrl->cap) + 1) / 2, "reset");
}
EXPORT_SYMBOL_GPL(nvme_disable_ctrl);
@@ -2332,41 +2365,11 @@ int nvme_enable_ctrl(struct nvme_ctrl *ctrl)
ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
if (ret)
return ret;
- return nvme_wait_ready(ctrl, timeout, true);
+ return nvme_wait_ready(ctrl, NVME_CSTS_RDY, NVME_CSTS_RDY,
+ (timeout + 1) / 2, "initialisation");
}
EXPORT_SYMBOL_GPL(nvme_enable_ctrl);
-int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl)
-{
- unsigned long timeout = jiffies + (ctrl->shutdown_timeout * HZ);
- u32 csts;
- int ret;
-
- ctrl->ctrl_config &= ~NVME_CC_SHN_MASK;
- ctrl->ctrl_config |= NVME_CC_SHN_NORMAL;
-
- ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config);
- if (ret)
- return ret;
-
- while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) {
- if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT)
- break;
-
- msleep(100);
- if (fatal_signal_pending(current))
- return -EINTR;
- if (time_after(jiffies, timeout)) {
- dev_err(ctrl->device,
- "Device shutdown incomplete; abort shutdown\n");
- return -ENODEV;
- }
- }
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(nvme_shutdown_ctrl);
-
static int nvme_configure_timestamp(struct nvme_ctrl *ctrl)
{
__le64 ts;
@@ -3049,7 +3052,7 @@ static int nvme_init_non_mdts_limits(struct nvme_ctrl *ctrl)
id = kzalloc(sizeof(*id), GFP_KERNEL);
if (!id)
- return 0;
+ return -ENOMEM;
c.identify.opcode = nvme_admin_identify;
c.identify.cns = NVME_ID_CNS_CS_CTRL;
@@ -3229,7 +3232,7 @@ out_free:
* register in our nvme_ctrl structure. This should be called as soon as
* the admin queue is fully up and running.
*/
-int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl)
+int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended)
{
int ret;
@@ -3260,6 +3263,8 @@ int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl)
if (ret < 0)
return ret;
+ nvme_configure_opal(ctrl, was_suspended);
+
if (!ctrl->identified && !nvme_discovery_ctrl(ctrl)) {
/*
* Do not return errors unless we are in a controller reset,
@@ -3745,15 +3750,19 @@ static ssize_t nvme_ctrl_dhchap_secret_store(struct device *dev,
memcpy(dhchap_secret, buf, count);
nvme_auth_stop(ctrl);
if (strcmp(dhchap_secret, opts->dhchap_secret)) {
+ struct nvme_dhchap_key *key, *host_key;
int ret;
- ret = nvme_auth_generate_key(dhchap_secret, &ctrl->host_key);
+ ret = nvme_auth_generate_key(dhchap_secret, &key);
if (ret)
return ret;
kfree(opts->dhchap_secret);
opts->dhchap_secret = dhchap_secret;
- /* Key has changed; re-authentication with new key */
- nvme_auth_reset(ctrl);
+ host_key = ctrl->host_key;
+ mutex_lock(&ctrl->dhchap_auth_mutex);
+ ctrl->host_key = key;
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
+ nvme_auth_free_key(host_key);
}
/* Start re-authentication */
dev_info(ctrl->device, "re-authenticating controller\n");
@@ -3795,15 +3804,19 @@ static ssize_t nvme_ctrl_dhchap_ctrl_secret_store(struct device *dev,
memcpy(dhchap_secret, buf, count);
nvme_auth_stop(ctrl);
if (strcmp(dhchap_secret, opts->dhchap_ctrl_secret)) {
+ struct nvme_dhchap_key *key, *ctrl_key;
int ret;
- ret = nvme_auth_generate_key(dhchap_secret, &ctrl->ctrl_key);
+ ret = nvme_auth_generate_key(dhchap_secret, &key);
if (ret)
return ret;
kfree(opts->dhchap_ctrl_secret);
opts->dhchap_ctrl_secret = dhchap_secret;
- /* Key has changed; re-authentication with new key */
- nvme_auth_reset(ctrl);
+ ctrl_key = ctrl->ctrl_key;
+ mutex_lock(&ctrl->dhchap_auth_mutex);
+ ctrl->ctrl_key = key;
+ mutex_unlock(&ctrl->dhchap_auth_mutex);
+ nvme_auth_free_key(ctrl_key);
}
/* Start re-authentication */
dev_info(ctrl->device, "re-authenticating controller\n");
@@ -3875,10 +3888,11 @@ static umode_t nvme_dev_attrs_are_visible(struct kobject *kobj,
return a->mode;
}
-static const struct attribute_group nvme_dev_attrs_group = {
+const struct attribute_group nvme_dev_attrs_group = {
.attrs = nvme_dev_attrs,
.is_visible = nvme_dev_attrs_are_visible,
};
+EXPORT_SYMBOL_GPL(nvme_dev_attrs_group);
static const struct attribute_group *nvme_dev_attr_groups[] = {
&nvme_dev_attrs_group,
@@ -4333,10 +4347,6 @@ static void nvme_validate_ns(struct nvme_ns *ns, struct nvme_ns_info *info)
{
int ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
- if (test_bit(NVME_NS_DEAD, &ns->flags))
- goto out;
-
- ret = NVME_SC_INVALID_NS | NVME_SC_DNR;
if (!nvme_ns_ids_equal(&ns->head->ids, &info->ids)) {
dev_err(ns->ctrl->device,
"identifiers changed for nsid %d\n", ns->head->ns_id);
@@ -4407,7 +4417,7 @@ static void nvme_remove_invalid_namespaces(struct nvme_ctrl *ctrl,
down_write(&ctrl->namespaces_rwsem);
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) {
- if (ns->head->ns_id > nsid || test_bit(NVME_NS_DEAD, &ns->flags))
+ if (ns->head->ns_id > nsid)
list_move_tail(&ns->list, &rm_list);
}
up_write(&ctrl->namespaces_rwsem);
@@ -4424,9 +4434,6 @@ static int nvme_scan_ns_list(struct nvme_ctrl *ctrl)
u32 prev = 0;
int ret = 0, i;
- if (nvme_ctrl_limited_cns(ctrl))
- return -EOPNOTSUPP;
-
ns_list = kzalloc(NVME_IDENTIFY_DATA_SIZE, GFP_KERNEL);
if (!ns_list)
return -ENOMEM;
@@ -4534,8 +4541,18 @@ static void nvme_scan_work(struct work_struct *work)
}
mutex_lock(&ctrl->scan_lock);
- if (nvme_scan_ns_list(ctrl) != 0)
+ if (nvme_ctrl_limited_cns(ctrl)) {
nvme_scan_ns_sequential(ctrl);
+ } else {
+ /*
+ * Fall back to sequential scan if DNR is set to handle broken
+ * devices which should support Identify NS List (as per the VS
+ * they report) but don't actually support it.
+ */
+ ret = nvme_scan_ns_list(ctrl);
+ if (ret > 0 && ret & NVME_SC_DNR)
+ nvme_scan_ns_sequential(ctrl);
+ }
mutex_unlock(&ctrl->scan_lock);
}
@@ -4565,8 +4582,10 @@ void nvme_remove_namespaces(struct nvme_ctrl *ctrl)
* removing the namespaces' disks; fail all the queues now to avoid
* potentially having to clean up the failed sync later.
*/
- if (ctrl->state == NVME_CTRL_DEAD)
- nvme_kill_queues(ctrl);
+ if (ctrl->state == NVME_CTRL_DEAD) {
+ nvme_mark_namespaces_dead(ctrl);
+ nvme_unquiesce_io_queues(ctrl);
+ }
/* this is a no-op when called from the controller reset handler */
nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING_NOIO);
@@ -4692,7 +4711,7 @@ static void nvme_fw_act_work(struct work_struct *work)
fw_act_timeout = jiffies +
msecs_to_jiffies(admin_timeout * 1000);
- nvme_stop_queues(ctrl);
+ nvme_quiesce_io_queues(ctrl);
while (nvme_ctrl_pp_status(ctrl)) {
if (time_after(jiffies, fw_act_timeout)) {
dev_warn(ctrl->device,
@@ -4706,7 +4725,7 @@ static void nvme_fw_act_work(struct work_struct *work)
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE))
return;
- nvme_start_queues(ctrl);
+ nvme_unquiesce_io_queues(ctrl);
/* read FW slot information to clear the AER */
nvme_get_fw_slot_info(ctrl);
@@ -4811,8 +4830,7 @@ void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
EXPORT_SYMBOL_GPL(nvme_complete_async_event);
int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
- const struct blk_mq_ops *ops, unsigned int flags,
- unsigned int cmd_size)
+ const struct blk_mq_ops *ops, unsigned int cmd_size)
{
int ret;
@@ -4822,7 +4840,9 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
if (ctrl->ops->flags & NVME_F_FABRICS)
set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = ctrl->numa_node;
- set->flags = flags;
+ set->flags = BLK_MQ_F_NO_SCHED;
+ if (ctrl->ops->flags & NVME_F_BLOCKING)
+ set->flags |= BLK_MQ_F_BLOCKING;
set->cmd_size = cmd_size;
set->driver_data = ctrl;
set->nr_hw_queues = 1;
@@ -4850,6 +4870,7 @@ int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
out_cleanup_admin_q:
blk_mq_destroy_queue(ctrl->admin_q);
+ blk_put_queue(ctrl->admin_q);
out_free_tagset:
blk_mq_free_tag_set(ctrl->admin_tagset);
return ret;
@@ -4859,14 +4880,17 @@ EXPORT_SYMBOL_GPL(nvme_alloc_admin_tag_set);
void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl)
{
blk_mq_destroy_queue(ctrl->admin_q);
- if (ctrl->ops->flags & NVME_F_FABRICS)
+ blk_put_queue(ctrl->admin_q);
+ if (ctrl->ops->flags & NVME_F_FABRICS) {
blk_mq_destroy_queue(ctrl->fabrics_q);
+ blk_put_queue(ctrl->fabrics_q);
+ }
blk_mq_free_tag_set(ctrl->admin_tagset);
}
EXPORT_SYMBOL_GPL(nvme_remove_admin_tag_set);
int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
- const struct blk_mq_ops *ops, unsigned int flags,
+ const struct blk_mq_ops *ops, unsigned int nr_maps,
unsigned int cmd_size)
{
int ret;
@@ -4874,15 +4898,23 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
memset(set, 0, sizeof(*set));
set->ops = ops;
set->queue_depth = ctrl->sqsize + 1;
- set->reserved_tags = NVMF_RESERVED_TAGS;
+ /*
+ * Some Apple controllers requires tags to be unique across admin and
+ * the (only) I/O queue, so reserve the first 32 tags of the I/O queue.
+ */
+ if (ctrl->quirks & NVME_QUIRK_SHARED_TAGS)
+ set->reserved_tags = NVME_AQ_DEPTH;
+ else if (ctrl->ops->flags & NVME_F_FABRICS)
+ set->reserved_tags = NVMF_RESERVED_TAGS;
set->numa_node = ctrl->numa_node;
- set->flags = flags;
+ set->flags = BLK_MQ_F_SHOULD_MERGE;
+ if (ctrl->ops->flags & NVME_F_BLOCKING)
+ set->flags |= BLK_MQ_F_BLOCKING;
set->cmd_size = cmd_size,
set->driver_data = ctrl;
set->nr_hw_queues = ctrl->queue_count - 1;
set->timeout = NVME_IO_TIMEOUT;
- if (ops->map_queues)
- set->nr_maps = ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2;
+ set->nr_maps = nr_maps;
ret = blk_mq_alloc_tag_set(set);
if (ret)
return ret;
@@ -4893,6 +4925,8 @@ int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
}
+ blk_queue_flag_set(QUEUE_FLAG_SKIP_TAGSET_QUIESCE,
+ ctrl->connect_q);
}
ctrl->tagset = set;
@@ -4906,8 +4940,10 @@ EXPORT_SYMBOL_GPL(nvme_alloc_io_tag_set);
void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl)
{
- if (ctrl->ops->flags & NVME_F_FABRICS)
+ if (ctrl->ops->flags & NVME_F_FABRICS) {
blk_mq_destroy_queue(ctrl->connect_q);
+ blk_put_queue(ctrl->connect_q);
+ }
blk_mq_free_tag_set(ctrl->tagset);
}
EXPORT_SYMBOL_GPL(nvme_remove_io_tag_set);
@@ -4943,7 +4979,7 @@ void nvme_start_ctrl(struct nvme_ctrl *ctrl)
if (ctrl->queue_count > 1) {
nvme_queue_scan(ctrl);
- nvme_start_queues(ctrl);
+ nvme_unquiesce_io_queues(ctrl);
nvme_mpath_update(ctrl);
}
@@ -4988,6 +5024,7 @@ static void nvme_free_ctrl(struct device *dev)
nvme_auth_stop(ctrl);
nvme_auth_free(ctrl);
__free_page(ctrl->discard_page);
+ free_opal_dev(ctrl->opal_dev);
if (subsys) {
mutex_lock(&nvme_subsystems_lock);
@@ -5053,7 +5090,10 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
ctrl->instance);
ctrl->device->class = nvme_class;
ctrl->device->parent = ctrl->dev;
- ctrl->device->groups = nvme_dev_attr_groups;
+ if (ops->dev_attr_groups)
+ ctrl->device->groups = ops->dev_attr_groups;
+ else
+ ctrl->device->groups = nvme_dev_attr_groups;
ctrl->device->release = nvme_free_ctrl;
dev_set_drvdata(ctrl->device, ctrl);
ret = dev_set_name(ctrl->device, "nvme%d", ctrl->instance);
@@ -5077,9 +5117,13 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
nvme_fault_inject_init(&ctrl->fault_inject, dev_name(ctrl->device));
nvme_mpath_init_ctrl(ctrl);
- nvme_auth_init_ctrl(ctrl);
+ ret = nvme_auth_init_ctrl(ctrl);
+ if (ret)
+ goto out_free_cdev;
return 0;
+out_free_cdev:
+ cdev_device_del(&ctrl->cdev, ctrl->device);
out_free_name:
nvme_put_ctrl(ctrl);
kfree_const(ctrl->device->kobj.name);
@@ -5092,62 +5136,17 @@ out:
}
EXPORT_SYMBOL_GPL(nvme_init_ctrl);
-static void nvme_start_ns_queue(struct nvme_ns *ns)
-{
- if (test_and_clear_bit(NVME_NS_STOPPED, &ns->flags))
- blk_mq_unquiesce_queue(ns->queue);
-}
-
-static void nvme_stop_ns_queue(struct nvme_ns *ns)
-{
- if (!test_and_set_bit(NVME_NS_STOPPED, &ns->flags))
- blk_mq_quiesce_queue(ns->queue);
- else
- blk_mq_wait_quiesce_done(ns->queue);
-}
-
-/*
- * Prepare a queue for teardown.
- *
- * This must forcibly unquiesce queues to avoid blocking dispatch, and only set
- * the capacity to 0 after that to avoid blocking dispatchers that may be
- * holding bd_butex. This will end buffered writers dirtying pages that can't
- * be synced.
- */
-static void nvme_set_queue_dying(struct nvme_ns *ns)
-{
- if (test_and_set_bit(NVME_NS_DEAD, &ns->flags))
- return;
-
- blk_mark_disk_dead(ns->disk);
- nvme_start_ns_queue(ns);
-
- set_capacity_and_notify(ns->disk, 0);
-}
-
-/**
- * nvme_kill_queues(): Ends all namespace queues
- * @ctrl: the dead controller that needs to end
- *
- * Call this function when the driver determines it is unable to get the
- * controller in a state capable of servicing IO.
- */
-void nvme_kill_queues(struct nvme_ctrl *ctrl)
+/* let I/O to all namespaces fail in preparation for surprise removal */
+void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
down_read(&ctrl->namespaces_rwsem);
-
- /* Forcibly unquiesce queues to avoid blocking dispatch */
- if (ctrl->admin_q && !blk_queue_dying(ctrl->admin_q))
- nvme_start_admin_queue(ctrl);
-
list_for_each_entry(ns, &ctrl->namespaces, list)
- nvme_set_queue_dying(ns);
-
+ blk_mark_disk_dead(ns->disk);
up_read(&ctrl->namespaces_rwsem);
}
-EXPORT_SYMBOL_GPL(nvme_kill_queues);
+EXPORT_SYMBOL_GPL(nvme_mark_namespaces_dead);
void nvme_unfreeze(struct nvme_ctrl *ctrl)
{
@@ -5197,43 +5196,41 @@ void nvme_start_freeze(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_start_freeze);
-void nvme_stop_queues(struct nvme_ctrl *ctrl)
+void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl)
{
- struct nvme_ns *ns;
-
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list)
- nvme_stop_ns_queue(ns);
- up_read(&ctrl->namespaces_rwsem);
+ if (!ctrl->tagset)
+ return;
+ if (!test_and_set_bit(NVME_CTRL_STOPPED, &ctrl->flags))
+ blk_mq_quiesce_tagset(ctrl->tagset);
+ else
+ blk_mq_wait_quiesce_done(ctrl->tagset);
}
-EXPORT_SYMBOL_GPL(nvme_stop_queues);
+EXPORT_SYMBOL_GPL(nvme_quiesce_io_queues);
-void nvme_start_queues(struct nvme_ctrl *ctrl)
+void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl)
{
- struct nvme_ns *ns;
-
- down_read(&ctrl->namespaces_rwsem);
- list_for_each_entry(ns, &ctrl->namespaces, list)
- nvme_start_ns_queue(ns);
- up_read(&ctrl->namespaces_rwsem);
+ if (!ctrl->tagset)
+ return;
+ if (test_and_clear_bit(NVME_CTRL_STOPPED, &ctrl->flags))
+ blk_mq_unquiesce_tagset(ctrl->tagset);
}
-EXPORT_SYMBOL_GPL(nvme_start_queues);
+EXPORT_SYMBOL_GPL(nvme_unquiesce_io_queues);
-void nvme_stop_admin_queue(struct nvme_ctrl *ctrl)
+void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl)
{
if (!test_and_set_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
blk_mq_quiesce_queue(ctrl->admin_q);
else
- blk_mq_wait_quiesce_done(ctrl->admin_q);
+ blk_mq_wait_quiesce_done(ctrl->admin_q->tag_set);
}
-EXPORT_SYMBOL_GPL(nvme_stop_admin_queue);
+EXPORT_SYMBOL_GPL(nvme_quiesce_admin_queue);
-void nvme_start_admin_queue(struct nvme_ctrl *ctrl)
+void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl)
{
if (test_and_clear_bit(NVME_CTRL_ADMIN_Q_STOPPED, &ctrl->flags))
blk_mq_unquiesce_queue(ctrl->admin_q);
}
-EXPORT_SYMBOL_GPL(nvme_start_admin_queue);
+EXPORT_SYMBOL_GPL(nvme_unquiesce_admin_queue);
void nvme_sync_io_queues(struct nvme_ctrl *ctrl)
{
@@ -5344,8 +5341,13 @@ static int __init nvme_core_init(void)
goto unregister_generic_ns;
}
+ result = nvme_init_auth();
+ if (result)
+ goto destroy_ns_chr;
return 0;
+destroy_ns_chr:
+ class_destroy(nvme_ns_chr_class);
unregister_generic_ns:
unregister_chrdev_region(nvme_ns_chr_devt, NVME_MINORS);
destroy_subsys_class:
@@ -5366,6 +5368,7 @@ out:
static void __exit nvme_core_exit(void)
{
+ nvme_exit_auth();
class_destroy(nvme_ns_chr_class);
class_destroy(nvme_subsys_class);
class_destroy(nvme_class);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 5d57a042dbca..4564f16a0b20 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -1475,6 +1475,8 @@ nvme_fc_xmt_ls_rsp_done(struct nvmefc_ls_rsp *lsrsp)
fc_dma_unmap_single(lport->dev, lsop->rspdma,
sizeof(*lsop->rspbuf), DMA_TO_DEVICE);
+ kfree(lsop->rspbuf);
+ kfree(lsop->rqstbuf);
kfree(lsop);
nvme_fc_rport_put(rport);
@@ -1699,6 +1701,15 @@ restart:
spin_unlock_irqrestore(&rport->lock, flags);
}
+static
+void nvme_fc_rcv_ls_req_err_msg(struct nvme_fc_lport *lport,
+ struct fcnvme_ls_rqst_w0 *w0)
+{
+ dev_info(lport->dev, "RCV %s LS failed: No memory\n",
+ (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ?
+ nvmefc_ls_names[w0->ls_cmd] : "");
+}
+
/**
* nvme_fc_rcv_ls_req - transport entry point called by an LLDD
* upon the reception of a NVME LS request.
@@ -1751,20 +1762,20 @@ nvme_fc_rcv_ls_req(struct nvme_fc_remote_port *portptr,
goto out_put;
}
- lsop = kzalloc(sizeof(*lsop) +
- sizeof(union nvmefc_ls_requests) +
- sizeof(union nvmefc_ls_responses),
- GFP_KERNEL);
+ lsop = kzalloc(sizeof(*lsop), GFP_KERNEL);
if (!lsop) {
- dev_info(lport->dev,
- "RCV %s LS failed: No memory\n",
- (w0->ls_cmd <= NVME_FC_LAST_LS_CMD_VALUE) ?
- nvmefc_ls_names[w0->ls_cmd] : "");
+ nvme_fc_rcv_ls_req_err_msg(lport, w0);
ret = -ENOMEM;
goto out_put;
}
- lsop->rqstbuf = (union nvmefc_ls_requests *)&lsop[1];
- lsop->rspbuf = (union nvmefc_ls_responses *)&lsop->rqstbuf[1];
+
+ lsop->rqstbuf = kzalloc(sizeof(*lsop->rqstbuf), GFP_KERNEL);
+ lsop->rspbuf = kzalloc(sizeof(*lsop->rspbuf), GFP_KERNEL);
+ if (!lsop->rqstbuf || !lsop->rspbuf) {
+ nvme_fc_rcv_ls_req_err_msg(lport, w0);
+ ret = -ENOMEM;
+ goto out_free;
+ }
lsop->rspdma = fc_dma_map_single(lport->dev, lsop->rspbuf,
sizeof(*lsop->rspbuf),
@@ -1801,6 +1812,8 @@ out_unmap:
fc_dma_unmap_single(lport->dev, lsop->rspdma,
sizeof(*lsop->rspbuf), DMA_TO_DEVICE);
out_free:
+ kfree(lsop->rspbuf);
+ kfree(lsop->rqstbuf);
kfree(lsop);
out_put:
nvme_fc_rport_put(rport);
@@ -2391,7 +2404,7 @@ nvme_fc_ctrl_free(struct kref *ref)
list_del(&ctrl->ctrl_list);
spin_unlock_irqrestore(&ctrl->rport->lock, flags);
- nvme_start_admin_queue(&ctrl->ctrl);
+ nvme_unquiesce_admin_queue(&ctrl->ctrl);
nvme_remove_admin_tag_set(&ctrl->ctrl);
kfree(ctrl->queues);
@@ -2492,20 +2505,20 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
* (but with error status).
*/
if (ctrl->ctrl.queue_count > 1) {
- nvme_stop_queues(&ctrl->ctrl);
+ nvme_quiesce_io_queues(&ctrl->ctrl);
nvme_sync_io_queues(&ctrl->ctrl);
blk_mq_tagset_busy_iter(&ctrl->tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
blk_mq_tagset_wait_completed_request(&ctrl->tag_set);
if (start_queues)
- nvme_start_queues(&ctrl->ctrl);
+ nvme_unquiesce_io_queues(&ctrl->ctrl);
}
/*
* Other transports, which don't have link-level contexts bound
* to sqe's, would try to gracefully shutdown the controller by
* writing the registers for shutdown and polling (call
- * nvme_shutdown_ctrl()). Given a bunch of i/o was potentially
+ * nvme_disable_ctrl()). Given a bunch of i/o was potentially
* just aborted and we will wait on those contexts, and given
* there was no indication of how live the controlelr is on the
* link, don't send more io to create more contexts for the
@@ -2516,13 +2529,13 @@ __nvme_fc_abort_outstanding_ios(struct nvme_fc_ctrl *ctrl, bool start_queues)
/*
* clean up the admin queue. Same thing as above.
*/
- nvme_stop_admin_queue(&ctrl->ctrl);
+ nvme_quiesce_admin_queue(&ctrl->ctrl);
blk_sync_queue(ctrl->ctrl.admin_q);
blk_mq_tagset_busy_iter(&ctrl->admin_tag_set,
nvme_fc_terminate_exchange, &ctrl->ctrl);
blk_mq_tagset_wait_completed_request(&ctrl->admin_tag_set);
if (start_queues)
- nvme_start_admin_queue(&ctrl->ctrl);
+ nvme_unquiesce_admin_queue(&ctrl->ctrl);
}
static void
@@ -2732,7 +2745,7 @@ nvme_fc_start_fcp_op(struct nvme_fc_ctrl *ctrl, struct nvme_fc_queue *queue,
atomic_set(&op->state, FCPOP_STATE_ACTIVE);
if (!(op->flags & FCOP_FLAGS_AEN))
- blk_mq_start_request(op->rq);
+ nvme_start_request(op->rq);
cmdiu->csn = cpu_to_be32(atomic_inc_return(&queue->csn));
ret = ctrl->lport->ops->fcp_io(&ctrl->lport->localport,
@@ -2903,7 +2916,7 @@ nvme_fc_create_io_queues(struct nvme_fc_ctrl *ctrl)
nvme_fc_init_io_queues(ctrl);
ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set,
- &nvme_fc_mq_ops, BLK_MQ_F_SHOULD_MERGE,
+ &nvme_fc_mq_ops, 1,
struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
ctrl->lport->ops->fcprqst_priv_sz));
if (ret)
@@ -3104,9 +3117,9 @@ nvme_fc_create_association(struct nvme_fc_ctrl *ctrl)
ctrl->ctrl.max_hw_sectors = ctrl->ctrl.max_segments <<
(ilog2(SZ_4K) - 9);
- nvme_start_admin_queue(&ctrl->ctrl);
+ nvme_unquiesce_admin_queue(&ctrl->ctrl);
- ret = nvme_init_ctrl_finish(&ctrl->ctrl);
+ ret = nvme_init_ctrl_finish(&ctrl->ctrl, false);
if (ret || test_bit(ASSOC_FAILED, &ctrl->flags))
goto out_disconnect_admin_queue;
@@ -3250,10 +3263,10 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
nvme_fc_free_queue(&ctrl->queues[0]);
/* re-enable the admin_q so anything new can fast fail */
- nvme_start_admin_queue(&ctrl->ctrl);
+ nvme_unquiesce_admin_queue(&ctrl->ctrl);
/* resume the io queues so that things will fast fail */
- nvme_start_queues(&ctrl->ctrl);
+ nvme_unquiesce_io_queues(&ctrl->ctrl);
nvme_fc_ctlr_inactive_on_rport(ctrl);
}
@@ -3509,7 +3522,7 @@ nvme_fc_init_ctrl(struct device *dev, struct nvmf_ctrl_options *opts,
nvme_fc_init_queue(ctrl, 0);
ret = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set,
- &nvme_fc_admin_mq_ops, BLK_MQ_F_NO_SCHED,
+ &nvme_fc_admin_mq_ops,
struct_size((struct nvme_fcp_op_w_sgl *)NULL, priv,
ctrl->lport->ops->fcprqst_priv_sz));
if (ret)
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index 81f5550b670d..9ddda571f046 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -8,6 +8,50 @@
#include <linux/io_uring.h>
#include "nvme.h"
+static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
+ fmode_t mode)
+{
+ if (capable(CAP_SYS_ADMIN))
+ return true;
+
+ /*
+ * Do not allow unprivileged processes to send vendor specific or fabrics
+ * commands as we can't be sure about their effects.
+ */
+ if (c->common.opcode >= nvme_cmd_vendor_start ||
+ c->common.opcode == nvme_fabrics_command)
+ return false;
+
+ /*
+ * Do not allow unprivileged passthrough of admin commands except
+ * for a subset of identify commands that contain information required
+ * to form proper I/O commands in userspace and do not expose any
+ * potentially sensitive information.
+ */
+ if (!ns) {
+ if (c->common.opcode == nvme_admin_identify) {
+ switch (c->identify.cns) {
+ case NVME_ID_CNS_NS:
+ case NVME_ID_CNS_CS_NS:
+ case NVME_ID_CNS_NS_CS_INDEP:
+ case NVME_ID_CNS_CS_CTRL:
+ case NVME_ID_CNS_CTRL:
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /*
+ * Only allow I/O commands that transfer data to the controller if the
+ * special file is open for writing, but always allow I/O commands that
+ * transfer data from the controller.
+ */
+ if (nvme_is_write(c))
+ return mode & FMODE_WRITE;
+ return true;
+}
+
/*
* Convert integer values from ioctl structures to user pointers, silently
* ignoring the upper bits in the compat case to match behaviour of 32-bit
@@ -261,7 +305,7 @@ static bool nvme_validate_passthru_nsid(struct nvme_ctrl *ctrl,
}
static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
- struct nvme_passthru_cmd __user *ucmd)
+ struct nvme_passthru_cmd __user *ucmd, fmode_t mode)
{
struct nvme_passthru_cmd cmd;
struct nvme_command c;
@@ -269,8 +313,6 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
u64 result;
int status;
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
return -EFAULT;
if (cmd.flags)
@@ -291,6 +333,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
c.common.cdw14 = cpu_to_le32(cmd.cdw14);
c.common.cdw15 = cpu_to_le32(cmd.cdw15);
+ if (!nvme_cmd_allowed(ns, &c, mode))
+ return -EACCES;
+
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
@@ -308,15 +353,14 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
}
static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
- struct nvme_passthru_cmd64 __user *ucmd, bool vec)
+ struct nvme_passthru_cmd64 __user *ucmd, bool vec,
+ fmode_t mode)
{
struct nvme_passthru_cmd64 cmd;
struct nvme_command c;
unsigned timeout = 0;
int status;
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
return -EFAULT;
if (cmd.flags)
@@ -337,6 +381,9 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
c.common.cdw14 = cpu_to_le32(cmd.cdw14);
c.common.cdw15 = cpu_to_le32(cmd.cdw15);
+ if (!nvme_cmd_allowed(ns, &c, mode))
+ return -EACCES;
+
if (cmd.timeout_ms)
timeout = msecs_to_jiffies(cmd.timeout_ms);
@@ -483,9 +530,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
void *meta = NULL;
int ret;
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
-
c.common.opcode = READ_ONCE(cmd->opcode);
c.common.flags = READ_ONCE(cmd->flags);
if (c.common.flags)
@@ -507,6 +551,9 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
c.common.cdw14 = cpu_to_le32(READ_ONCE(cmd->cdw14));
c.common.cdw15 = cpu_to_le32(READ_ONCE(cmd->cdw15));
+ if (!nvme_cmd_allowed(ns, &c, ioucmd->file->f_mode))
+ return -EACCES;
+
d.metadata = READ_ONCE(cmd->metadata);
d.addr = READ_ONCE(cmd->addr);
d.data_len = READ_ONCE(cmd->data_len);
@@ -570,13 +617,13 @@ static bool is_ctrl_ioctl(unsigned int cmd)
}
static int nvme_ctrl_ioctl(struct nvme_ctrl *ctrl, unsigned int cmd,
- void __user *argp)
+ void __user *argp, fmode_t mode)
{
switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
- return nvme_user_cmd(ctrl, NULL, argp);
+ return nvme_user_cmd(ctrl, NULL, argp, mode);
case NVME_IOCTL_ADMIN64_CMD:
- return nvme_user_cmd64(ctrl, NULL, argp, false);
+ return nvme_user_cmd64(ctrl, NULL, argp, false, mode);
default:
return sed_ioctl(ctrl->opal_dev, cmd, argp);
}
@@ -601,14 +648,14 @@ struct nvme_user_io32 {
#endif /* COMPAT_FOR_U64_ALIGNMENT */
static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
- void __user *argp)
+ void __user *argp, fmode_t mode)
{
switch (cmd) {
case NVME_IOCTL_ID:
force_successful_syscall_return();
return ns->head->ns_id;
case NVME_IOCTL_IO_CMD:
- return nvme_user_cmd(ns->ctrl, ns, argp);
+ return nvme_user_cmd(ns->ctrl, ns, argp, mode);
/*
* struct nvme_user_io can have different padding on some 32-bit ABIs.
* Just accept the compat version as all fields that are used are the
@@ -620,19 +667,20 @@ static int nvme_ns_ioctl(struct nvme_ns *ns, unsigned int cmd,
case NVME_IOCTL_SUBMIT_IO:
return nvme_submit_io(ns, argp);
case NVME_IOCTL_IO64_CMD:
- return nvme_user_cmd64(ns->ctrl, ns, argp, false);
+ return nvme_user_cmd64(ns->ctrl, ns, argp, false, mode);
case NVME_IOCTL_IO64_CMD_VEC:
- return nvme_user_cmd64(ns->ctrl, ns, argp, true);
+ return nvme_user_cmd64(ns->ctrl, ns, argp, true, mode);
default:
return -ENOTTY;
}
}
-static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg)
+static int __nvme_ioctl(struct nvme_ns *ns, unsigned int cmd, void __user *arg,
+ fmode_t mode)
{
- if (is_ctrl_ioctl(cmd))
- return nvme_ctrl_ioctl(ns->ctrl, cmd, arg);
- return nvme_ns_ioctl(ns, cmd, arg);
+ if (is_ctrl_ioctl(cmd))
+ return nvme_ctrl_ioctl(ns->ctrl, cmd, arg, mode);
+ return nvme_ns_ioctl(ns, cmd, arg, mode);
}
int nvme_ioctl(struct block_device *bdev, fmode_t mode,
@@ -640,7 +688,7 @@ int nvme_ioctl(struct block_device *bdev, fmode_t mode,
{
struct nvme_ns *ns = bdev->bd_disk->private_data;
- return __nvme_ioctl(ns, cmd, (void __user *)arg);
+ return __nvme_ioctl(ns, cmd, (void __user *)arg, mode);
}
long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
@@ -648,7 +696,7 @@ long nvme_ns_chr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
struct nvme_ns *ns =
container_of(file_inode(file)->i_cdev, struct nvme_ns, cdev);
- return __nvme_ioctl(ns, cmd, (void __user *)arg);
+ return __nvme_ioctl(ns, cmd, (void __user *)arg, file->f_mode);
}
static int nvme_uring_cmd_checks(unsigned int issue_flags)
@@ -716,7 +764,8 @@ int nvme_ns_chr_uring_cmd_iopoll(struct io_uring_cmd *ioucmd,
}
#ifdef CONFIG_NVME_MULTIPATH
static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
- void __user *argp, struct nvme_ns_head *head, int srcu_idx)
+ void __user *argp, struct nvme_ns_head *head, int srcu_idx,
+ fmode_t mode)
__releases(&head->srcu)
{
struct nvme_ctrl *ctrl = ns->ctrl;
@@ -724,7 +773,7 @@ static int nvme_ns_head_ctrl_ioctl(struct nvme_ns *ns, unsigned int cmd,
nvme_get_ctrl(ns->ctrl);
srcu_read_unlock(&head->srcu, srcu_idx);
- ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp);
+ ret = nvme_ctrl_ioctl(ns->ctrl, cmd, argp, mode);
nvme_put_ctrl(ctrl);
return ret;
@@ -749,9 +798,10 @@ int nvme_ns_head_ioctl(struct block_device *bdev, fmode_t mode,
* deadlock when deleting namespaces using the passthrough interface.
*/
if (is_ctrl_ioctl(cmd))
- return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
+ return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
+ mode);
- ret = nvme_ns_ioctl(ns, cmd, argp);
+ ret = nvme_ns_ioctl(ns, cmd, argp, mode);
out_unlock:
srcu_read_unlock(&head->srcu, srcu_idx);
return ret;
@@ -773,9 +823,10 @@ long nvme_ns_head_chr_ioctl(struct file *file, unsigned int cmd,
goto out_unlock;
if (is_ctrl_ioctl(cmd))
- return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx);
+ return nvme_ns_head_ctrl_ioctl(ns, cmd, argp, head, srcu_idx,
+ file->f_mode);
- ret = nvme_ns_ioctl(ns, cmd, argp);
+ ret = nvme_ns_ioctl(ns, cmd, argp, file->f_mode);
out_unlock:
srcu_read_unlock(&head->srcu, srcu_idx);
return ret;
@@ -849,7 +900,8 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags)
return ret;
}
-static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
+static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp,
+ fmode_t mode)
{
struct nvme_ns *ns;
int ret;
@@ -873,7 +925,7 @@ static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp)
kref_get(&ns->kref);
up_read(&ctrl->namespaces_rwsem);
- ret = nvme_user_cmd(ctrl, ns, argp);
+ ret = nvme_user_cmd(ctrl, ns, argp, mode);
nvme_put_ns(ns);
return ret;
@@ -890,11 +942,11 @@ long nvme_dev_ioctl(struct file *file, unsigned int cmd,
switch (cmd) {
case NVME_IOCTL_ADMIN_CMD:
- return nvme_user_cmd(ctrl, NULL, argp);
+ return nvme_user_cmd(ctrl, NULL, argp, file->f_mode);
case NVME_IOCTL_ADMIN64_CMD:
- return nvme_user_cmd64(ctrl, NULL, argp, false);
+ return nvme_user_cmd64(ctrl, NULL, argp, false, file->f_mode);
case NVME_IOCTL_IO_CMD:
- return nvme_dev_user_cmd(ctrl, argp);
+ return nvme_dev_user_cmd(ctrl, argp, file->f_mode);
case NVME_IOCTL_RESET:
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 7e025b8948cb..c03093b6813c 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -114,6 +114,31 @@ void nvme_failover_req(struct request *req)
kblockd_schedule_work(&ns->head->requeue_work);
}
+void nvme_mpath_start_request(struct request *rq)
+{
+ struct nvme_ns *ns = rq->q->queuedata;
+ struct gendisk *disk = ns->head->disk;
+
+ if (!blk_queue_io_stat(disk->queue) || blk_rq_is_passthrough(rq))
+ return;
+
+ nvme_req(rq)->flags |= NVME_MPATH_IO_STATS;
+ nvme_req(rq)->start_time = bdev_start_io_acct(disk->part0,
+ blk_rq_bytes(rq) >> SECTOR_SHIFT,
+ req_op(rq), jiffies);
+}
+EXPORT_SYMBOL_GPL(nvme_mpath_start_request);
+
+void nvme_mpath_end_request(struct request *rq)
+{
+ struct nvme_ns *ns = rq->q->queuedata;
+
+ if (!(nvme_req(rq)->flags & NVME_MPATH_IO_STATS))
+ return;
+ bdev_end_io_acct(ns->head->disk->part0, req_op(rq),
+ nvme_req(rq)->start_time);
+}
+
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
@@ -506,6 +531,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head)
blk_queue_flag_set(QUEUE_FLAG_NONROT, head->disk->queue);
blk_queue_flag_set(QUEUE_FLAG_NOWAIT, head->disk->queue);
+ blk_queue_flag_set(QUEUE_FLAG_IO_STAT, head->disk->queue);
/*
* This assumes all controllers that refer to a namespace either
* support poll queues or not. That is not a strict guarantee,
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a29877217ee6..6bbb73ef8b25 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -162,6 +162,9 @@ struct nvme_request {
u8 retries;
u8 flags;
u16 status;
+#ifdef CONFIG_NVME_MULTIPATH
+ unsigned long start_time;
+#endif
struct nvme_ctrl *ctrl;
};
@@ -173,6 +176,7 @@ struct nvme_request {
enum {
NVME_REQ_CANCELLED = (1 << 0),
NVME_REQ_USERCMD = (1 << 1),
+ NVME_MPATH_IO_STATS = (1 << 2),
};
static inline struct nvme_request *nvme_req(struct request *req)
@@ -237,6 +241,7 @@ enum nvme_ctrl_flags {
NVME_CTRL_FAILFAST_EXPIRED = 0,
NVME_CTRL_ADMIN_Q_STOPPED = 1,
NVME_CTRL_STARTED_ONCE = 2,
+ NVME_CTRL_STOPPED = 3,
};
struct nvme_ctrl {
@@ -336,8 +341,8 @@ struct nvme_ctrl {
#ifdef CONFIG_NVME_AUTH
struct work_struct dhchap_auth_work;
- struct list_head dhchap_auth_list;
struct mutex dhchap_auth_mutex;
+ struct nvme_dhchap_queue_context *dhchap_ctxs;
struct nvme_dhchap_key *host_key;
struct nvme_dhchap_key *ctrl_key;
u16 transaction;
@@ -454,6 +459,7 @@ static inline bool nvme_ns_head_multipath(struct nvme_ns_head *head)
enum nvme_ns_features {
NVME_NS_EXT_LBAS = 1 << 0, /* support extended LBA format */
NVME_NS_METADATA_SUPPORTED = 1 << 1, /* support getting generated md */
+ NVME_NS_DEAC, /* DEAC bit in Write Zeores supported */
};
struct nvme_ns {
@@ -483,11 +489,9 @@ struct nvme_ns {
unsigned long features;
unsigned long flags;
#define NVME_NS_REMOVING 0
-#define NVME_NS_DEAD 1
#define NVME_NS_ANA_PENDING 2
#define NVME_NS_FORCE_RO 3
#define NVME_NS_READY 4
-#define NVME_NS_STOPPED 5
struct cdev cdev;
struct device cdev_device;
@@ -508,6 +512,9 @@ struct nvme_ctrl_ops {
unsigned int flags;
#define NVME_F_FABRICS (1 << 0)
#define NVME_F_METADATA_SUPPORTED (1 << 1)
+#define NVME_F_BLOCKING (1 << 2)
+
+ const struct attribute_group **dev_attr_groups;
int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val);
int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val);
int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val);
@@ -728,37 +735,32 @@ void nvme_cancel_tagset(struct nvme_ctrl *ctrl);
void nvme_cancel_admin_tagset(struct nvme_ctrl *ctrl);
bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
enum nvme_ctrl_state new_state);
-int nvme_disable_ctrl(struct nvme_ctrl *ctrl);
+int nvme_disable_ctrl(struct nvme_ctrl *ctrl, bool shutdown);
int nvme_enable_ctrl(struct nvme_ctrl *ctrl);
-int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl);
int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
const struct nvme_ctrl_ops *ops, unsigned long quirks);
void nvme_uninit_ctrl(struct nvme_ctrl *ctrl);
void nvme_start_ctrl(struct nvme_ctrl *ctrl);
void nvme_stop_ctrl(struct nvme_ctrl *ctrl);
-int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl);
+int nvme_init_ctrl_finish(struct nvme_ctrl *ctrl, bool was_suspended);
int nvme_alloc_admin_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
- const struct blk_mq_ops *ops, unsigned int flags,
- unsigned int cmd_size);
+ const struct blk_mq_ops *ops, unsigned int cmd_size);
void nvme_remove_admin_tag_set(struct nvme_ctrl *ctrl);
int nvme_alloc_io_tag_set(struct nvme_ctrl *ctrl, struct blk_mq_tag_set *set,
- const struct blk_mq_ops *ops, unsigned int flags,
+ const struct blk_mq_ops *ops, unsigned int nr_maps,
unsigned int cmd_size);
void nvme_remove_io_tag_set(struct nvme_ctrl *ctrl);
void nvme_remove_namespaces(struct nvme_ctrl *ctrl);
-int nvme_sec_submit(void *data, u16 spsp, u8 secp, void *buffer, size_t len,
- bool send);
-
void nvme_complete_async_event(struct nvme_ctrl *ctrl, __le16 status,
volatile union nvme_result *res);
-void nvme_stop_queues(struct nvme_ctrl *ctrl);
-void nvme_start_queues(struct nvme_ctrl *ctrl);
-void nvme_stop_admin_queue(struct nvme_ctrl *ctrl);
-void nvme_start_admin_queue(struct nvme_ctrl *ctrl);
-void nvme_kill_queues(struct nvme_ctrl *ctrl);
+void nvme_quiesce_io_queues(struct nvme_ctrl *ctrl);
+void nvme_unquiesce_io_queues(struct nvme_ctrl *ctrl);
+void nvme_quiesce_admin_queue(struct nvme_ctrl *ctrl);
+void nvme_unquiesce_admin_queue(struct nvme_ctrl *ctrl);
+void nvme_mark_namespaces_dead(struct nvme_ctrl *ctrl);
void nvme_sync_queues(struct nvme_ctrl *ctrl);
void nvme_sync_io_queues(struct nvme_ctrl *ctrl);
void nvme_unfreeze(struct nvme_ctrl *ctrl);
@@ -857,6 +859,7 @@ int nvme_dev_uring_cmd(struct io_uring_cmd *ioucmd, unsigned int issue_flags);
extern const struct attribute_group *nvme_ns_id_attr_groups[];
extern const struct pr_ops nvme_pr_ops;
extern const struct block_device_operations nvme_ns_head_ops;
+extern const struct attribute_group nvme_dev_attrs_group;
struct nvme_ns *nvme_find_path(struct nvme_ns_head *head);
#ifdef CONFIG_NVME_MULTIPATH
@@ -883,6 +886,8 @@ bool nvme_mpath_clear_current_path(struct nvme_ns *ns);
void nvme_mpath_revalidate_paths(struct nvme_ns *ns);
void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl);
void nvme_mpath_shutdown_disk(struct nvme_ns_head *head);
+void nvme_mpath_start_request(struct request *rq);
+void nvme_mpath_end_request(struct request *rq);
static inline void nvme_trace_bio_complete(struct request *req)
{
@@ -968,6 +973,12 @@ static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
static inline void nvme_mpath_default_iopolicy(struct nvme_subsystem *subsys)
{
}
+static inline void nvme_mpath_start_request(struct request *rq)
+{
+}
+static inline void nvme_mpath_end_request(struct request *rq)
+{
+}
#endif /* CONFIG_NVME_MULTIPATH */
int nvme_revalidate_zones(struct nvme_ns *ns);
@@ -1013,20 +1024,38 @@ static inline void nvme_hwmon_exit(struct nvme_ctrl *ctrl)
}
#endif
+static inline void nvme_start_request(struct request *rq)
+{
+ if (rq->cmd_flags & REQ_NVME_MPATH)
+ nvme_mpath_start_request(rq);
+ blk_mq_start_request(rq);
+}
+
static inline bool nvme_ctrl_sgl_supported(struct nvme_ctrl *ctrl)
{
return ctrl->sgls & ((1 << 0) | (1 << 1));
}
#ifdef CONFIG_NVME_AUTH
-void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl);
+int __init nvme_init_auth(void);
+void __exit nvme_exit_auth(void);
+int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl);
void nvme_auth_stop(struct nvme_ctrl *ctrl);
int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid);
int nvme_auth_wait(struct nvme_ctrl *ctrl, int qid);
-void nvme_auth_reset(struct nvme_ctrl *ctrl);
void nvme_auth_free(struct nvme_ctrl *ctrl);
#else
-static inline void nvme_auth_init_ctrl(struct nvme_ctrl *ctrl) {};
+static inline int nvme_auth_init_ctrl(struct nvme_ctrl *ctrl)
+{
+ return 0;
+}
+static inline int __init nvme_init_auth(void)
+{
+ return 0;
+}
+static inline void __exit nvme_exit_auth(void)
+{
+}
static inline void nvme_auth_stop(struct nvme_ctrl *ctrl) {};
static inline int nvme_auth_negotiate(struct nvme_ctrl *ctrl, int qid)
{
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 488ad7dabeb8..f0f8027644bb 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -15,6 +15,7 @@
#include <linux/init.h>
#include <linux/interrupt.h>
#include <linux/io.h>
+#include <linux/kstrtox.h>
#include <linux/memremap.h>
#include <linux/mm.h>
#include <linux/module.h>
@@ -108,7 +109,7 @@ struct nvme_dev;
struct nvme_queue;
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
-static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode);
+static void nvme_delete_io_queues(struct nvme_dev *dev);
/*
* Represents an NVM Express device. Each nvme_dev is a PCI function.
@@ -130,7 +131,6 @@ struct nvme_dev {
u32 db_stride;
void __iomem *bar;
unsigned long bar_mapped_size;
- struct work_struct remove_work;
struct mutex shutdown_lock;
bool subsystem;
u64 cmb_size;
@@ -158,8 +158,6 @@ struct nvme_dev {
unsigned int nr_allocated_queues;
unsigned int nr_write_queues;
unsigned int nr_poll_queues;
-
- bool attrs_added;
};
static int io_queue_depth_set(const char *val, const struct kernel_param *kp)
@@ -241,10 +239,13 @@ static inline unsigned int nvme_dbbuf_size(struct nvme_dev *dev)
return dev->nr_allocated_queues * 8 * dev->db_stride;
}
-static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
+static void nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
{
unsigned int mem_size = nvme_dbbuf_size(dev);
+ if (!(dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP))
+ return;
+
if (dev->dbbuf_dbs) {
/*
* Clear the dbbuf memory so the driver doesn't observe stale
@@ -252,25 +253,27 @@ static int nvme_dbbuf_dma_alloc(struct nvme_dev *dev)
*/
memset(dev->dbbuf_dbs, 0, mem_size);
memset(dev->dbbuf_eis, 0, mem_size);
- return 0;
+ return;
}
dev->dbbuf_dbs = dma_alloc_coherent(dev->dev, mem_size,
&dev->dbbuf_dbs_dma_addr,
GFP_KERNEL);
if (!dev->dbbuf_dbs)
- return -ENOMEM;
+ goto fail;
dev->dbbuf_eis = dma_alloc_coherent(dev->dev, mem_size,
&dev->dbbuf_eis_dma_addr,
GFP_KERNEL);
- if (!dev->dbbuf_eis) {
- dma_free_coherent(dev->dev, mem_size,
- dev->dbbuf_dbs, dev->dbbuf_dbs_dma_addr);
- dev->dbbuf_dbs = NULL;
- return -ENOMEM;
- }
+ if (!dev->dbbuf_eis)
+ goto fail_free_dbbuf_dbs;
+ return;
- return 0;
+fail_free_dbbuf_dbs:
+ dma_free_coherent(dev->dev, mem_size, dev->dbbuf_dbs,
+ dev->dbbuf_dbs_dma_addr);
+ dev->dbbuf_dbs = NULL;
+fail:
+ dev_warn(dev->dev, "unable to allocate dma for dbbuf\n");
}
static void nvme_dbbuf_dma_free(struct nvme_dev *dev)
@@ -392,18 +395,10 @@ static int nvme_pci_npages_sgl(void)
PAGE_SIZE);
}
-static size_t nvme_pci_iod_alloc_size(void)
-{
- size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
-
- return sizeof(__le64 *) * npages +
- sizeof(struct scatterlist) * NVME_MAX_SEGS;
-}
-
static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
- struct nvme_dev *dev = data;
+ struct nvme_dev *dev = to_nvme_dev(data);
struct nvme_queue *nvmeq = &dev->queues[0];
WARN_ON(hctx_idx != 0);
@@ -416,7 +411,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
- struct nvme_dev *dev = data;
+ struct nvme_dev *dev = to_nvme_dev(data);
struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
@@ -428,7 +423,7 @@ static int nvme_pci_init_request(struct blk_mq_tag_set *set,
struct request *req, unsigned int hctx_idx,
unsigned int numa_node)
{
- struct nvme_dev *dev = set->driver_data;
+ struct nvme_dev *dev = to_nvme_dev(set->driver_data);
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
nvme_req(req)->ctrl = &dev->ctrl;
@@ -447,7 +442,7 @@ static int queue_irq_offset(struct nvme_dev *dev)
static void nvme_pci_map_queues(struct blk_mq_tag_set *set)
{
- struct nvme_dev *dev = set->driver_data;
+ struct nvme_dev *dev = to_nvme_dev(set->driver_data);
int i, qoff, offset;
offset = queue_irq_offset(dev);
@@ -914,7 +909,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req)
goto out_unmap_data;
}
- blk_mq_start_request(req);
+ nvme_start_request(req);
return BLK_STS_OK;
out_unmap_data:
nvme_unmap_data(dev, req);
@@ -1474,24 +1469,21 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
}
}
-/**
- * nvme_suspend_queue - put queue into suspended state
- * @nvmeq: queue to suspend
- */
-static int nvme_suspend_queue(struct nvme_queue *nvmeq)
+static void nvme_suspend_queue(struct nvme_dev *dev, unsigned int qid)
{
+ struct nvme_queue *nvmeq = &dev->queues[qid];
+
if (!test_and_clear_bit(NVMEQ_ENABLED, &nvmeq->flags))
- return 1;
+ return;
/* ensure that nvme_queue_rq() sees NVMEQ_ENABLED cleared */
mb();
nvmeq->dev->online_queues--;
if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q)
- nvme_stop_admin_queue(&nvmeq->dev->ctrl);
+ nvme_quiesce_admin_queue(&nvmeq->dev->ctrl);
if (!test_and_clear_bit(NVMEQ_POLLED, &nvmeq->flags))
- pci_free_irq(to_pci_dev(nvmeq->dev->dev), nvmeq->cq_vector, nvmeq);
- return 0;
+ pci_free_irq(to_pci_dev(dev->dev), nvmeq->cq_vector, nvmeq);
}
static void nvme_suspend_io_queues(struct nvme_dev *dev)
@@ -1499,19 +1491,7 @@ static void nvme_suspend_io_queues(struct nvme_dev *dev)
int i;
for (i = dev->ctrl.queue_count - 1; i > 0; i--)
- nvme_suspend_queue(&dev->queues[i]);
-}
-
-static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
-{
- struct nvme_queue *nvmeq = &dev->queues[0];
-
- if (shutdown)
- nvme_shutdown_ctrl(&dev->ctrl);
- else
- nvme_disable_ctrl(&dev->ctrl);
-
- nvme_poll_irqdisable(nvmeq);
+ nvme_suspend_queue(dev, i);
}
/*
@@ -1748,44 +1728,11 @@ static void nvme_dev_remove_admin(struct nvme_dev *dev)
* user requests may be waiting on a stopped queue. Start the
* queue to flush these to completion.
*/
- nvme_start_admin_queue(&dev->ctrl);
- blk_mq_destroy_queue(dev->ctrl.admin_q);
- blk_mq_free_tag_set(&dev->admin_tagset);
+ nvme_unquiesce_admin_queue(&dev->ctrl);
+ nvme_remove_admin_tag_set(&dev->ctrl);
}
}
-static int nvme_pci_alloc_admin_tag_set(struct nvme_dev *dev)
-{
- struct blk_mq_tag_set *set = &dev->admin_tagset;
-
- set->ops = &nvme_mq_admin_ops;
- set->nr_hw_queues = 1;
-
- set->queue_depth = NVME_AQ_MQ_TAG_DEPTH;
- set->timeout = NVME_ADMIN_TIMEOUT;
- set->numa_node = dev->ctrl.numa_node;
- set->cmd_size = sizeof(struct nvme_iod);
- set->flags = BLK_MQ_F_NO_SCHED;
- set->driver_data = dev;
-
- if (blk_mq_alloc_tag_set(set))
- return -ENOMEM;
- dev->ctrl.admin_tagset = set;
-
- dev->ctrl.admin_q = blk_mq_init_queue(set);
- if (IS_ERR(dev->ctrl.admin_q)) {
- blk_mq_free_tag_set(set);
- dev->ctrl.admin_q = NULL;
- return -ENOMEM;
- }
- if (!blk_get_queue(dev->ctrl.admin_q)) {
- nvme_dev_remove_admin(dev);
- dev->ctrl.admin_q = NULL;
- return -ENODEV;
- }
- return 0;
-}
-
static unsigned long db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues)
{
return NVME_REG_DBS + ((nr_io_queues + 1) * 8 * dev->db_stride);
@@ -1829,7 +1776,14 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
(readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO))
writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS);
- result = nvme_disable_ctrl(&dev->ctrl);
+ /*
+ * If the device has been passed off to us in an enabled state, just
+ * clear the enabled bit. The spec says we should set the 'shutdown
+ * notification bits', but doing so may cause the device to complete
+ * commands to the admin queue ... and we don't know what memory that
+ * might be pointing at!
+ */
+ result = nvme_disable_ctrl(&dev->ctrl, false);
if (result < 0)
return result;
@@ -2112,6 +2066,9 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
u32 enable_bits = NVME_HOST_MEM_ENABLE;
int ret;
+ if (!dev->ctrl.hmpre)
+ return 0;
+
preferred = min(preferred, max);
if (min > max) {
dev_warn(dev->ctrl.device,
@@ -2192,7 +2149,7 @@ static ssize_t hmb_store(struct device *dev, struct device_attribute *attr,
bool new;
int ret;
- if (strtobool(buf, &new) < 0)
+ if (kstrtobool(buf, &new) < 0)
return -EINVAL;
if (new == ndev->hmb)
@@ -2240,11 +2197,17 @@ static struct attribute *nvme_pci_attrs[] = {
NULL,
};
-static const struct attribute_group nvme_pci_attr_group = {
+static const struct attribute_group nvme_pci_dev_attrs_group = {
.attrs = nvme_pci_attrs,
.is_visible = nvme_pci_attrs_are_visible,
};
+static const struct attribute_group *nvme_pci_dev_attr_groups[] = {
+ &nvme_dev_attrs_group,
+ &nvme_pci_dev_attrs_group,
+ NULL,
+};
+
/*
* nirqs is the number of interrupts available for write and read
* queues. The core already reserved an interrupt for the admin queue.
@@ -2319,12 +2282,6 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
PCI_IRQ_ALL_TYPES | PCI_IRQ_AFFINITY, &affd);
}
-static void nvme_disable_io_queues(struct nvme_dev *dev)
-{
- if (__nvme_disable_io_queues(dev, nvme_admin_delete_sq))
- __nvme_disable_io_queues(dev, nvme_admin_delete_cq);
-}
-
static unsigned int nvme_max_io_queues(struct nvme_dev *dev)
{
/*
@@ -2432,7 +2389,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
if (dev->online_queues - 1 < dev->max_qid) {
nr_io_queues = dev->online_queues - 1;
- nvme_disable_io_queues(dev);
+ nvme_delete_io_queues(dev);
result = nvme_setup_io_queues_trylock(dev);
if (result)
return result;
@@ -2495,7 +2452,7 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
return 0;
}
-static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
+static bool __nvme_delete_io_queues(struct nvme_dev *dev, u8 opcode)
{
int nr_queues = dev->online_queues - 1, sent = 0;
unsigned long timeout;
@@ -2523,40 +2480,19 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
return true;
}
-static void nvme_pci_alloc_tag_set(struct nvme_dev *dev)
+static void nvme_delete_io_queues(struct nvme_dev *dev)
{
- struct blk_mq_tag_set * set = &dev->tagset;
- int ret;
+ if (__nvme_delete_io_queues(dev, nvme_admin_delete_sq))
+ __nvme_delete_io_queues(dev, nvme_admin_delete_cq);
+}
- set->ops = &nvme_mq_ops;
- set->nr_hw_queues = dev->online_queues - 1;
- set->nr_maps = 1;
- if (dev->io_queues[HCTX_TYPE_READ])
- set->nr_maps = 2;
+static unsigned int nvme_pci_nr_maps(struct nvme_dev *dev)
+{
if (dev->io_queues[HCTX_TYPE_POLL])
- set->nr_maps = 3;
- set->timeout = NVME_IO_TIMEOUT;
- set->numa_node = dev->ctrl.numa_node;
- set->queue_depth = min_t(unsigned, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1;
- set->cmd_size = sizeof(struct nvme_iod);
- set->flags = BLK_MQ_F_SHOULD_MERGE;
- set->driver_data = dev;
-
- /*
- * Some Apple controllers requires tags to be unique
- * across admin and IO queue, so reserve the first 32
- * tags of the IO queue.
- */
- if (dev->ctrl.quirks & NVME_QUIRK_SHARED_TAGS)
- set->reserved_tags = NVME_AQ_DEPTH;
-
- ret = blk_mq_alloc_tag_set(set);
- if (ret) {
- dev_warn(dev->ctrl.device,
- "IO queues tagset allocation failed %d\n", ret);
- return;
- }
- dev->ctrl.tagset = set;
+ return 3;
+ if (dev->io_queues[HCTX_TYPE_READ])
+ return 2;
+ return 1;
}
static void nvme_pci_update_nr_queues(struct nvme_dev *dev)
@@ -2647,7 +2583,8 @@ static int nvme_pci_enable(struct nvme_dev *dev)
pci_enable_pcie_error_reporting(pdev);
pci_save_state(pdev);
- return 0;
+
+ return nvme_pci_configure_admin_queue(dev);
disable:
pci_disable_device(pdev);
@@ -2661,57 +2598,53 @@ static void nvme_dev_unmap(struct nvme_dev *dev)
pci_release_mem_regions(to_pci_dev(dev->dev));
}
-static void nvme_pci_disable(struct nvme_dev *dev)
+static bool nvme_pci_ctrl_is_dead(struct nvme_dev *dev)
{
struct pci_dev *pdev = to_pci_dev(dev->dev);
+ u32 csts;
- pci_free_irq_vectors(pdev);
+ if (!pci_is_enabled(pdev) || !pci_device_is_present(pdev))
+ return true;
+ if (pdev->error_state != pci_channel_io_normal)
+ return true;
- if (pci_is_enabled(pdev)) {
- pci_disable_pcie_error_reporting(pdev);
- pci_disable_device(pdev);
- }
+ csts = readl(dev->bar + NVME_REG_CSTS);
+ return (csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY);
}
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
- bool dead = true, freeze = false;
struct pci_dev *pdev = to_pci_dev(dev->dev);
+ bool dead;
mutex_lock(&dev->shutdown_lock);
- if (pci_is_enabled(pdev)) {
- u32 csts;
-
- if (pci_device_is_present(pdev))
- csts = readl(dev->bar + NVME_REG_CSTS);
- else
- csts = ~0;
-
- if (dev->ctrl.state == NVME_CTRL_LIVE ||
- dev->ctrl.state == NVME_CTRL_RESETTING) {
- freeze = true;
+ dead = nvme_pci_ctrl_is_dead(dev);
+ if (dev->ctrl.state == NVME_CTRL_LIVE ||
+ dev->ctrl.state == NVME_CTRL_RESETTING) {
+ if (pci_is_enabled(pdev))
nvme_start_freeze(&dev->ctrl);
- }
- dead = !!((csts & NVME_CSTS_CFS) || !(csts & NVME_CSTS_RDY) ||
- pdev->error_state != pci_channel_io_normal);
+ /*
+ * Give the controller a chance to complete all entered requests
+ * if doing a safe shutdown.
+ */
+ if (!dead && shutdown)
+ nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
}
- /*
- * Give the controller a chance to complete all entered requests if
- * doing a safe shutdown.
- */
- if (!dead && shutdown && freeze)
- nvme_wait_freeze_timeout(&dev->ctrl, NVME_IO_TIMEOUT);
-
- nvme_stop_queues(&dev->ctrl);
+ nvme_quiesce_io_queues(&dev->ctrl);
if (!dead && dev->ctrl.queue_count > 0) {
- nvme_disable_io_queues(dev);
- nvme_disable_admin_queue(dev, shutdown);
+ nvme_delete_io_queues(dev);
+ nvme_disable_ctrl(&dev->ctrl, shutdown);
+ nvme_poll_irqdisable(&dev->queues[0]);
}
nvme_suspend_io_queues(dev);
- nvme_suspend_queue(&dev->queues[0]);
- nvme_pci_disable(dev);
+ nvme_suspend_queue(dev, 0);
+ pci_free_irq_vectors(pdev);
+ if (pci_is_enabled(pdev)) {
+ pci_disable_pcie_error_reporting(pdev);
+ pci_disable_device(pdev);
+ }
nvme_reap_pending_cqes(dev);
nvme_cancel_tagset(&dev->ctrl);
@@ -2723,9 +2656,9 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
* deadlocking blk-mq hot-cpu notifier.
*/
if (shutdown) {
- nvme_start_queues(&dev->ctrl);
+ nvme_unquiesce_io_queues(&dev->ctrl);
if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q))
- nvme_start_admin_queue(&dev->ctrl);
+ nvme_unquiesce_admin_queue(&dev->ctrl);
}
mutex_unlock(&dev->shutdown_lock);
}
@@ -2762,42 +2695,40 @@ static void nvme_release_prp_pools(struct nvme_dev *dev)
dma_pool_destroy(dev->prp_small_pool);
}
+static int nvme_pci_alloc_iod_mempool(struct nvme_dev *dev)
+{
+ size_t npages = max(nvme_pci_npages_prp(), nvme_pci_npages_sgl());
+ size_t alloc_size = sizeof(__le64 *) * npages +
+ sizeof(struct scatterlist) * NVME_MAX_SEGS;
+
+ WARN_ON_ONCE(alloc_size > PAGE_SIZE);
+ dev->iod_mempool = mempool_create_node(1,
+ mempool_kmalloc, mempool_kfree,
+ (void *)alloc_size, GFP_KERNEL,
+ dev_to_node(dev->dev));
+ if (!dev->iod_mempool)
+ return -ENOMEM;
+ return 0;
+}
+
static void nvme_free_tagset(struct nvme_dev *dev)
{
if (dev->tagset.tags)
- blk_mq_free_tag_set(&dev->tagset);
+ nvme_remove_io_tag_set(&dev->ctrl);
dev->ctrl.tagset = NULL;
}
+/* pairs with nvme_pci_alloc_dev */
static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
- nvme_dbbuf_dma_free(dev);
nvme_free_tagset(dev);
- if (dev->ctrl.admin_q)
- blk_put_queue(dev->ctrl.admin_q);
- free_opal_dev(dev->ctrl.opal_dev);
- mempool_destroy(dev->iod_mempool);
put_device(dev->dev);
kfree(dev->queues);
kfree(dev);
}
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
-{
- /*
- * Set state to deleting now to avoid blocking nvme_wait_reset(), which
- * may be holding this pci_dev's device lock.
- */
- nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
- nvme_get_ctrl(&dev->ctrl);
- nvme_dev_disable(dev, false);
- nvme_kill_queues(&dev->ctrl);
- if (!queue_work(nvme_wq, &dev->remove_work))
- nvme_put_ctrl(&dev->ctrl);
-}
-
static void nvme_reset_work(struct work_struct *work)
{
struct nvme_dev *dev =
@@ -2808,8 +2739,7 @@ static void nvme_reset_work(struct work_struct *work)
if (dev->ctrl.state != NVME_CTRL_RESETTING) {
dev_warn(dev->ctrl.device, "ctrl state %d is not RESETTING\n",
dev->ctrl.state);
- result = -ENODEV;
- goto out;
+ return;
}
/*
@@ -2824,34 +2754,7 @@ static void nvme_reset_work(struct work_struct *work)
result = nvme_pci_enable(dev);
if (result)
goto out_unlock;
-
- result = nvme_pci_configure_admin_queue(dev);
- if (result)
- goto out_unlock;
-
- if (!dev->ctrl.admin_q) {
- result = nvme_pci_alloc_admin_tag_set(dev);
- if (result)
- goto out_unlock;
- } else {
- nvme_start_admin_queue(&dev->ctrl);
- }
-
- dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1);
-
- /*
- * Limit the max command size to prevent iod->sg allocations going
- * over a single page.
- */
- dev->ctrl.max_hw_sectors = min_t(u32,
- NVME_MAX_KB_SZ << 1, dma_max_mapping_size(dev->dev) >> 9);
- dev->ctrl.max_segments = NVME_MAX_SEGS;
-
- /*
- * Don't limit the IOMMU merged segment size.
- */
- dma_set_max_seg_size(dev->dev, 0xffffffff);
-
+ nvme_unquiesce_admin_queue(&dev->ctrl);
mutex_unlock(&dev->shutdown_lock);
/*
@@ -2865,62 +2768,37 @@ static void nvme_reset_work(struct work_struct *work)
goto out;
}
- /*
- * We do not support an SGL for metadata (yet), so we are limited to a
- * single integrity segment for the separate metadata pointer.
- */
- dev->ctrl.max_integrity_segments = 1;
-
- result = nvme_init_ctrl_finish(&dev->ctrl);
+ result = nvme_init_ctrl_finish(&dev->ctrl, was_suspend);
if (result)
goto out;
- if (dev->ctrl.oacs & NVME_CTRL_OACS_SEC_SUPP) {
- if (!dev->ctrl.opal_dev)
- dev->ctrl.opal_dev =
- init_opal_dev(&dev->ctrl, &nvme_sec_submit);
- else if (was_suspend)
- opal_unlock_from_suspend(dev->ctrl.opal_dev);
- } else {
- free_opal_dev(dev->ctrl.opal_dev);
- dev->ctrl.opal_dev = NULL;
- }
-
- if (dev->ctrl.oacs & NVME_CTRL_OACS_DBBUF_SUPP) {
- result = nvme_dbbuf_dma_alloc(dev);
- if (result)
- dev_warn(dev->dev,
- "unable to allocate dma for dbbuf\n");
- }
+ nvme_dbbuf_dma_alloc(dev);
- if (dev->ctrl.hmpre) {
- result = nvme_setup_host_mem(dev);
- if (result < 0)
- goto out;
- }
+ result = nvme_setup_host_mem(dev);
+ if (result < 0)
+ goto out;
result = nvme_setup_io_queues(dev);
if (result)
goto out;
/*
- * Keep the controller around but remove all namespaces if we don't have
- * any working I/O queue.
+ * Freeze and update the number of I/O queues as thos might have
+ * changed. If there are no I/O queues left after this reset, keep the
+ * controller around but remove all namespaces.
*/
- if (dev->online_queues < 2) {
- dev_warn(dev->ctrl.device, "IO queues not created\n");
- nvme_kill_queues(&dev->ctrl);
- nvme_remove_namespaces(&dev->ctrl);
- nvme_free_tagset(dev);
- } else {
- nvme_start_queues(&dev->ctrl);
+ if (dev->online_queues > 1) {
+ nvme_unquiesce_io_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl);
- if (!dev->ctrl.tagset)
- nvme_pci_alloc_tag_set(dev);
- else
- nvme_pci_update_nr_queues(dev);
+ nvme_pci_update_nr_queues(dev);
nvme_dbbuf_set(dev);
nvme_unfreeze(&dev->ctrl);
+ } else {
+ dev_warn(dev->ctrl.device, "IO queues lost\n");
+ nvme_mark_namespaces_dead(&dev->ctrl);
+ nvme_unquiesce_io_queues(&dev->ctrl);
+ nvme_remove_namespaces(&dev->ctrl);
+ nvme_free_tagset(dev);
}
/*
@@ -2934,30 +2812,22 @@ static void nvme_reset_work(struct work_struct *work)
goto out;
}
- if (!dev->attrs_added && !sysfs_create_group(&dev->ctrl.device->kobj,
- &nvme_pci_attr_group))
- dev->attrs_added = true;
-
nvme_start_ctrl(&dev->ctrl);
return;
out_unlock:
mutex_unlock(&dev->shutdown_lock);
out:
- if (result)
- dev_warn(dev->ctrl.device,
- "Removing after probe failure status: %d\n", result);
- nvme_remove_dead_ctrl(dev);
-}
-
-static void nvme_remove_dead_ctrl_work(struct work_struct *work)
-{
- struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
- struct pci_dev *pdev = to_pci_dev(dev->dev);
-
- if (pci_get_drvdata(pdev))
- device_release_driver(&pdev->dev);
- nvme_put_ctrl(&dev->ctrl);
+ /*
+ * Set state to deleting now to avoid blocking nvme_wait_reset(), which
+ * may be holding this pci_dev's device lock.
+ */
+ dev_warn(dev->ctrl.device, "Disabling device after reset failure: %d\n",
+ result);
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+ nvme_dev_disable(dev, true);
+ nvme_mark_namespaces_dead(&dev->ctrl);
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DEAD);
}
static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val)
@@ -3010,6 +2880,7 @@ static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = {
.name = "pcie",
.module = THIS_MODULE,
.flags = NVME_F_METADATA_SUPPORTED,
+ .dev_attr_groups = nvme_pci_dev_attr_groups,
.reg_read32 = nvme_pci_reg_read32,
.reg_write32 = nvme_pci_reg_write32,
.reg_read64 = nvme_pci_reg_read64,
@@ -3079,29 +2950,22 @@ static unsigned long check_vendor_combination_bug(struct pci_dev *pdev)
return 0;
}
-static void nvme_async_probe(void *data, async_cookie_t cookie)
+static struct nvme_dev *nvme_pci_alloc_dev(struct pci_dev *pdev,
+ const struct pci_device_id *id)
{
- struct nvme_dev *dev = data;
-
- flush_work(&dev->ctrl.reset_work);
- flush_work(&dev->ctrl.scan_work);
- nvme_put_ctrl(&dev->ctrl);
-}
-
-static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
-{
- int node, result = -ENOMEM;
- struct nvme_dev *dev;
unsigned long quirks = id->driver_data;
- size_t alloc_size;
+ int node = dev_to_node(&pdev->dev);
+ struct nvme_dev *dev;
+ int ret = -ENOMEM;
- node = dev_to_node(&pdev->dev);
if (node == NUMA_NO_NODE)
set_dev_node(&pdev->dev, first_memory_node);
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
if (!dev)
- return -ENOMEM;
+ return NULL;
+ INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
+ mutex_init(&dev->shutdown_lock);
dev->nr_write_queues = write_queues;
dev->nr_poll_queues = poll_queues;
@@ -3109,25 +2973,11 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
dev->queues = kcalloc_node(dev->nr_allocated_queues,
sizeof(struct nvme_queue), GFP_KERNEL, node);
if (!dev->queues)
- goto free;
+ goto out_free_dev;
dev->dev = get_device(&pdev->dev);
- pci_set_drvdata(pdev, dev);
-
- result = nvme_dev_map(dev);
- if (result)
- goto put_pci;
-
- INIT_WORK(&dev->ctrl.reset_work, nvme_reset_work);
- INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work);
- mutex_init(&dev->shutdown_lock);
-
- result = nvme_setup_prp_pools(dev);
- if (result)
- goto unmap;
quirks |= check_vendor_combination_bug(pdev);
-
if (!noacpi && acpi_storage_d3(&pdev->dev)) {
/*
* Some systems use a bios work around to ask for D3 on
@@ -3137,46 +2987,131 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
"platform quirk: setting simple suspend\n");
quirks |= NVME_QUIRK_SIMPLE_SUSPEND;
}
+ ret = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
+ quirks);
+ if (ret)
+ goto out_put_device;
+
+ dma_set_min_align_mask(&pdev->dev, NVME_CTRL_PAGE_SIZE - 1);
+ dma_set_max_seg_size(&pdev->dev, 0xffffffff);
/*
- * Double check that our mempool alloc size will cover the biggest
- * command we support.
+ * Limit the max command size to prevent iod->sg allocations going
+ * over a single page.
*/
- alloc_size = nvme_pci_iod_alloc_size();
- WARN_ON_ONCE(alloc_size > PAGE_SIZE);
+ dev->ctrl.max_hw_sectors = min_t(u32,
+ NVME_MAX_KB_SZ << 1, dma_max_mapping_size(&pdev->dev) >> 9);
+ dev->ctrl.max_segments = NVME_MAX_SEGS;
- dev->iod_mempool = mempool_create_node(1, mempool_kmalloc,
- mempool_kfree,
- (void *) alloc_size,
- GFP_KERNEL, node);
- if (!dev->iod_mempool) {
- result = -ENOMEM;
- goto release_pools;
- }
+ /*
+ * There is no support for SGLs for metadata (yet), so we are limited to
+ * a single integrity segment for the separate metadata pointer.
+ */
+ dev->ctrl.max_integrity_segments = 1;
+ return dev;
+
+out_put_device:
+ put_device(dev->dev);
+ kfree(dev->queues);
+out_free_dev:
+ kfree(dev);
+ return ERR_PTR(ret);
+}
- result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops,
- quirks);
+static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+ struct nvme_dev *dev;
+ int result = -ENOMEM;
+
+ dev = nvme_pci_alloc_dev(pdev, id);
+ if (!dev)
+ return -ENOMEM;
+
+ result = nvme_dev_map(dev);
if (result)
- goto release_mempool;
+ goto out_uninit_ctrl;
+
+ result = nvme_setup_prp_pools(dev);
+ if (result)
+ goto out_dev_unmap;
+
+ result = nvme_pci_alloc_iod_mempool(dev);
+ if (result)
+ goto out_release_prp_pools;
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
- nvme_reset_ctrl(&dev->ctrl);
- async_schedule(nvme_async_probe, dev);
+ result = nvme_pci_enable(dev);
+ if (result)
+ goto out_release_iod_mempool;
+
+ result = nvme_alloc_admin_tag_set(&dev->ctrl, &dev->admin_tagset,
+ &nvme_mq_admin_ops, sizeof(struct nvme_iod));
+ if (result)
+ goto out_disable;
+
+ /*
+ * Mark the controller as connecting before sending admin commands to
+ * allow the timeout handler to do the right thing.
+ */
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_CONNECTING)) {
+ dev_warn(dev->ctrl.device,
+ "failed to mark controller CONNECTING\n");
+ result = -EBUSY;
+ goto out_disable;
+ }
+
+ result = nvme_init_ctrl_finish(&dev->ctrl, false);
+ if (result)
+ goto out_disable;
+
+ nvme_dbbuf_dma_alloc(dev);
+
+ result = nvme_setup_host_mem(dev);
+ if (result < 0)
+ goto out_disable;
+
+ result = nvme_setup_io_queues(dev);
+ if (result)
+ goto out_disable;
+ if (dev->online_queues > 1) {
+ nvme_alloc_io_tag_set(&dev->ctrl, &dev->tagset, &nvme_mq_ops,
+ nvme_pci_nr_maps(dev), sizeof(struct nvme_iod));
+ nvme_dbbuf_set(dev);
+ }
+
+ if (!dev->ctrl.tagset)
+ dev_warn(dev->ctrl.device, "IO queues not created\n");
+
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
+ dev_warn(dev->ctrl.device,
+ "failed to mark controller live state\n");
+ result = -ENODEV;
+ goto out_disable;
+ }
+
+ pci_set_drvdata(pdev, dev);
+
+ nvme_start_ctrl(&dev->ctrl);
+ nvme_put_ctrl(&dev->ctrl);
return 0;
- release_mempool:
+out_disable:
+ nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_DELETING);
+ nvme_dev_disable(dev, true);
+ nvme_free_host_mem(dev);
+ nvme_dev_remove_admin(dev);
+ nvme_dbbuf_dma_free(dev);
+ nvme_free_queues(dev, 0);
+out_release_iod_mempool:
mempool_destroy(dev->iod_mempool);
- release_pools:
+out_release_prp_pools:
nvme_release_prp_pools(dev);
- unmap:
+out_dev_unmap:
nvme_dev_unmap(dev);
- put_pci:
- put_device(dev->dev);
- free:
- kfree(dev->queues);
- kfree(dev);
+out_uninit_ctrl:
+ nvme_uninit_ctrl(&dev->ctrl);
return result;
}
@@ -3208,13 +3143,6 @@ static void nvme_shutdown(struct pci_dev *pdev)
nvme_disable_prepare_reset(dev, true);
}
-static void nvme_remove_attrs(struct nvme_dev *dev)
-{
- if (dev->attrs_added)
- sysfs_remove_group(&dev->ctrl.device->kobj,
- &nvme_pci_attr_group);
-}
-
/*
* The driver's remove may be called on a device in a partially initialized
* state. This function must not have any dependencies on the device state in
@@ -3236,10 +3164,11 @@ static void nvme_remove(struct pci_dev *pdev)
nvme_stop_ctrl(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
nvme_dev_disable(dev, true);
- nvme_remove_attrs(dev);
nvme_free_host_mem(dev);
nvme_dev_remove_admin(dev);
+ nvme_dbbuf_dma_free(dev);
nvme_free_queues(dev, 0);
+ mempool_destroy(dev->iod_mempool);
nvme_release_prp_pools(dev);
nvme_dev_unmap(dev);
nvme_uninit_ctrl(&dev->ctrl);
@@ -3576,11 +3505,12 @@ static struct pci_driver nvme_driver = {
.probe = nvme_probe,
.remove = nvme_remove,
.shutdown = nvme_shutdown,
-#ifdef CONFIG_PM_SLEEP
.driver = {
- .pm = &nvme_dev_pm_ops,
- },
+ .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+#ifdef CONFIG_PM_SLEEP
+ .pm = &nvme_dev_pm_ops,
#endif
+ },
.sriov_configure = pci_sriov_configure_simple,
.err_handler = &nvme_err_handler,
};
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 6e079abb22ee..bbad26b82b56 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -798,7 +798,9 @@ static int nvme_rdma_alloc_tag_set(struct nvme_ctrl *ctrl)
NVME_RDMA_METADATA_SGL_SIZE;
return nvme_alloc_io_tag_set(ctrl, &to_rdma_ctrl(ctrl)->tag_set,
- &nvme_rdma_mq_ops, BLK_MQ_F_SHOULD_MERGE, cmd_size);
+ &nvme_rdma_mq_ops,
+ ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2,
+ cmd_size);
}
static void nvme_rdma_destroy_admin_queue(struct nvme_rdma_ctrl *ctrl)
@@ -846,7 +848,6 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
if (new) {
error = nvme_alloc_admin_tag_set(&ctrl->ctrl,
&ctrl->admin_tag_set, &nvme_rdma_admin_mq_ops,
- BLK_MQ_F_NO_SCHED,
sizeof(struct nvme_rdma_request) +
NVME_RDMA_DATA_SGL_SIZE);
if (error)
@@ -869,16 +870,16 @@ static int nvme_rdma_configure_admin_queue(struct nvme_rdma_ctrl *ctrl,
else
ctrl->ctrl.max_integrity_segments = 0;
- nvme_start_admin_queue(&ctrl->ctrl);
+ nvme_unquiesce_admin_queue(&ctrl->ctrl);
- error = nvme_init_ctrl_finish(&ctrl->ctrl);
+ error = nvme_init_ctrl_finish(&ctrl->ctrl, false);
if (error)
goto out_quiesce_queue;
return 0;
out_quiesce_queue:
- nvme_stop_admin_queue(&ctrl->ctrl);
+ nvme_quiesce_admin_queue(&ctrl->ctrl);
blk_sync_queue(ctrl->ctrl.admin_q);
out_stop_queue:
nvme_rdma_stop_queue(&ctrl->queues[0]);
@@ -922,7 +923,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
goto out_cleanup_tagset;
if (!new) {
- nvme_start_queues(&ctrl->ctrl);
+ nvme_unquiesce_io_queues(&ctrl->ctrl);
if (!nvme_wait_freeze_timeout(&ctrl->ctrl, NVME_IO_TIMEOUT)) {
/*
* If we timed out waiting for freeze we are likely to
@@ -949,7 +950,7 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
return 0;
out_wait_freeze_timed_out:
- nvme_stop_queues(&ctrl->ctrl);
+ nvme_quiesce_io_queues(&ctrl->ctrl);
nvme_sync_io_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
out_cleanup_tagset:
@@ -964,12 +965,12 @@ out_free_io_queues:
static void nvme_rdma_teardown_admin_queue(struct nvme_rdma_ctrl *ctrl,
bool remove)
{
- nvme_stop_admin_queue(&ctrl->ctrl);
+ nvme_quiesce_admin_queue(&ctrl->ctrl);
blk_sync_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
nvme_cancel_admin_tagset(&ctrl->ctrl);
if (remove) {
- nvme_start_admin_queue(&ctrl->ctrl);
+ nvme_unquiesce_admin_queue(&ctrl->ctrl);
nvme_remove_admin_tag_set(&ctrl->ctrl);
}
nvme_rdma_destroy_admin_queue(ctrl);
@@ -980,12 +981,12 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
{
if (ctrl->ctrl.queue_count > 1) {
nvme_start_freeze(&ctrl->ctrl);
- nvme_stop_queues(&ctrl->ctrl);
+ nvme_quiesce_io_queues(&ctrl->ctrl);
nvme_sync_io_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
nvme_cancel_tagset(&ctrl->ctrl);
if (remove) {
- nvme_start_queues(&ctrl->ctrl);
+ nvme_unquiesce_io_queues(&ctrl->ctrl);
nvme_remove_io_tag_set(&ctrl->ctrl);
}
nvme_rdma_free_io_queues(ctrl);
@@ -1106,7 +1107,7 @@ static int nvme_rdma_setup_ctrl(struct nvme_rdma_ctrl *ctrl, bool new)
destroy_io:
if (ctrl->ctrl.queue_count > 1) {
- nvme_stop_queues(&ctrl->ctrl);
+ nvme_quiesce_io_queues(&ctrl->ctrl);
nvme_sync_io_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
nvme_cancel_tagset(&ctrl->ctrl);
@@ -1115,7 +1116,7 @@ destroy_io:
nvme_rdma_free_io_queues(ctrl);
}
destroy_admin:
- nvme_stop_admin_queue(&ctrl->ctrl);
+ nvme_quiesce_admin_queue(&ctrl->ctrl);
blk_sync_queue(ctrl->ctrl.admin_q);
nvme_rdma_stop_queue(&ctrl->queues[0]);
nvme_cancel_admin_tagset(&ctrl->ctrl);
@@ -1153,13 +1154,13 @@ static void nvme_rdma_error_recovery_work(struct work_struct *work)
struct nvme_rdma_ctrl *ctrl = container_of(work,
struct nvme_rdma_ctrl, err_work);
- nvme_auth_stop(&ctrl->ctrl);
nvme_stop_keep_alive(&ctrl->ctrl);
flush_work(&ctrl->ctrl.async_event_work);
nvme_rdma_teardown_io_queues(ctrl, false);
- nvme_start_queues(&ctrl->ctrl);
+ nvme_unquiesce_io_queues(&ctrl->ctrl);
nvme_rdma_teardown_admin_queue(ctrl, false);
- nvme_start_admin_queue(&ctrl->ctrl);
+ nvme_unquiesce_admin_queue(&ctrl->ctrl);
+ nvme_auth_stop(&ctrl->ctrl);
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we started ctrl delete */
@@ -2040,7 +2041,7 @@ static blk_status_t nvme_rdma_queue_rq(struct blk_mq_hw_ctx *hctx,
if (ret)
goto unmap_qe;
- blk_mq_start_request(rq);
+ nvme_start_request(rq);
if (IS_ENABLED(CONFIG_BLK_DEV_INTEGRITY) &&
queue->pi_support &&
@@ -2207,11 +2208,8 @@ static const struct blk_mq_ops nvme_rdma_admin_mq_ops = {
static void nvme_rdma_shutdown_ctrl(struct nvme_rdma_ctrl *ctrl, bool shutdown)
{
nvme_rdma_teardown_io_queues(ctrl, shutdown);
- nvme_stop_admin_queue(&ctrl->ctrl);
- if (shutdown)
- nvme_shutdown_ctrl(&ctrl->ctrl);
- else
- nvme_disable_ctrl(&ctrl->ctrl);
+ nvme_quiesce_admin_queue(&ctrl->ctrl);
+ nvme_disable_ctrl(&ctrl->ctrl, shutdown);
nvme_rdma_teardown_admin_queue(ctrl, shutdown);
}
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index 3d13f6f08388..b69b89166b6b 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -1867,7 +1867,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
if (new) {
ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set,
&nvme_tcp_mq_ops,
- BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING,
+ ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2,
sizeof(struct nvme_tcp_request));
if (ret)
goto out_free_io_queues;
@@ -1884,7 +1884,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
goto out_cleanup_connect_q;
if (!new) {
- nvme_start_queues(ctrl);
+ nvme_unquiesce_io_queues(ctrl);
if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
/*
* If we timed out waiting for freeze we are likely to
@@ -1911,7 +1911,7 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
return 0;
out_wait_freeze_timed_out:
- nvme_stop_queues(ctrl);
+ nvme_quiesce_io_queues(ctrl);
nvme_sync_io_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
out_cleanup_connect_q:
@@ -1942,7 +1942,7 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
if (new) {
error = nvme_alloc_admin_tag_set(ctrl,
&to_tcp_ctrl(ctrl)->admin_tag_set,
- &nvme_tcp_admin_mq_ops, BLK_MQ_F_BLOCKING,
+ &nvme_tcp_admin_mq_ops,
sizeof(struct nvme_tcp_request));
if (error)
goto out_free_queue;
@@ -1956,16 +1956,16 @@ static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
if (error)
goto out_stop_queue;
- nvme_start_admin_queue(ctrl);
+ nvme_unquiesce_admin_queue(ctrl);
- error = nvme_init_ctrl_finish(ctrl);
+ error = nvme_init_ctrl_finish(ctrl, false);
if (error)
goto out_quiesce_queue;
return 0;
out_quiesce_queue:
- nvme_stop_admin_queue(ctrl);
+ nvme_quiesce_admin_queue(ctrl);
blk_sync_queue(ctrl->admin_q);
out_stop_queue:
nvme_tcp_stop_queue(ctrl, 0);
@@ -1981,12 +1981,12 @@ out_free_queue:
static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
bool remove)
{
- nvme_stop_admin_queue(ctrl);
+ nvme_quiesce_admin_queue(ctrl);
blk_sync_queue(ctrl->admin_q);
nvme_tcp_stop_queue(ctrl, 0);
nvme_cancel_admin_tagset(ctrl);
if (remove)
- nvme_start_admin_queue(ctrl);
+ nvme_unquiesce_admin_queue(ctrl);
nvme_tcp_destroy_admin_queue(ctrl, remove);
}
@@ -1995,14 +1995,14 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
{
if (ctrl->queue_count <= 1)
return;
- nvme_stop_admin_queue(ctrl);
+ nvme_quiesce_admin_queue(ctrl);
nvme_start_freeze(ctrl);
- nvme_stop_queues(ctrl);
+ nvme_quiesce_io_queues(ctrl);
nvme_sync_io_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
nvme_cancel_tagset(ctrl);
if (remove)
- nvme_start_queues(ctrl);
+ nvme_unquiesce_io_queues(ctrl);
nvme_tcp_destroy_io_queues(ctrl, remove);
}
@@ -2083,14 +2083,14 @@ static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
destroy_io:
if (ctrl->queue_count > 1) {
- nvme_stop_queues(ctrl);
+ nvme_quiesce_io_queues(ctrl);
nvme_sync_io_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
nvme_cancel_tagset(ctrl);
nvme_tcp_destroy_io_queues(ctrl, new);
}
destroy_admin:
- nvme_stop_admin_queue(ctrl);
+ nvme_quiesce_admin_queue(ctrl);
blk_sync_queue(ctrl->admin_q);
nvme_tcp_stop_queue(ctrl, 0);
nvme_cancel_admin_tagset(ctrl);
@@ -2128,14 +2128,14 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
struct nvme_tcp_ctrl, err_work);
struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;
- nvme_auth_stop(ctrl);
nvme_stop_keep_alive(ctrl);
flush_work(&ctrl->async_event_work);
nvme_tcp_teardown_io_queues(ctrl, false);
/* unquiesce to fail fast pending requests */
- nvme_start_queues(ctrl);
+ nvme_unquiesce_io_queues(ctrl);
nvme_tcp_teardown_admin_queue(ctrl, false);
- nvme_start_admin_queue(ctrl);
+ nvme_unquiesce_admin_queue(ctrl);
+ nvme_auth_stop(ctrl);
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
/* state change failure is ok if we started ctrl delete */
@@ -2150,11 +2150,8 @@ static void nvme_tcp_error_recovery_work(struct work_struct *work)
static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
nvme_tcp_teardown_io_queues(ctrl, shutdown);
- nvme_stop_admin_queue(ctrl);
- if (shutdown)
- nvme_shutdown_ctrl(ctrl);
- else
- nvme_disable_ctrl(ctrl);
+ nvme_quiesce_admin_queue(ctrl);
+ nvme_disable_ctrl(ctrl, shutdown);
nvme_tcp_teardown_admin_queue(ctrl, shutdown);
}
@@ -2414,7 +2411,7 @@ static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
if (unlikely(ret))
return ret;
- blk_mq_start_request(rq);
+ nvme_start_request(rq);
nvme_tcp_queue_request(req, true, bd->last);
@@ -2523,7 +2520,7 @@ static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
.name = "tcp",
.module = THIS_MODULE,
- .flags = NVME_F_FABRICS,
+ .flags = NVME_F_FABRICS | NVME_F_BLOCKING,
.reg_read32 = nvmf_reg_read32,
.reg_read64 = nvmf_reg_read64,
.reg_write32 = nvmf_reg_write32,
diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c
index c8a061ce3ee5..53a004ea320c 100644
--- a/drivers/nvme/target/admin-cmd.c
+++ b/drivers/nvme/target/admin-cmd.c
@@ -370,7 +370,9 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
memcpy_and_pad(id->mn, sizeof(id->mn), subsys->model_number,
strlen(subsys->model_number), ' ');
memcpy_and_pad(id->fr, sizeof(id->fr),
- UTS_RELEASE, strlen(UTS_RELEASE), ' ');
+ subsys->firmware_rev, strlen(subsys->firmware_rev), ' ');
+
+ put_unaligned_le24(subsys->ieee_oui, id->ieee);
id->rab = 6;
@@ -379,11 +381,6 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req)
else
id->cntrltype = NVME_CTRL_IO;
- /*
- * XXX: figure out how we can assign a IEEE OUI, but until then
- * the safest is to leave it as zeroes.
- */
-
/* we support multiple ports, multiples hosts and ANA: */
id->cmic = NVME_CTRL_CMIC_MULTI_PORT | NVME_CTRL_CMIC_MULTI_CTRL |
NVME_CTRL_CMIC_ANA;
@@ -564,7 +561,7 @@ static void nvmet_execute_identify_ns(struct nvmet_req *req)
}
if (req->ns->readonly)
- id->nsattr |= (1 << 0);
+ id->nsattr |= NVME_NS_ATTR_RO;
done:
if (!status)
status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id));
diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c
index 6a2816f3b4e8..907143870da5 100644
--- a/drivers/nvme/target/configfs.c
+++ b/drivers/nvme/target/configfs.c
@@ -4,6 +4,7 @@
* Copyright (c) 2015-2016 HGST, a Western Digital Company.
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/kstrtox.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -267,7 +268,7 @@ static ssize_t nvmet_param_pi_enable_store(struct config_item *item,
struct nvmet_port *port = to_nvmet_port(item);
bool val;
- if (strtobool(page, &val))
+ if (kstrtobool(page, &val))
return -EINVAL;
if (nvmet_is_port_enabled(port, __func__))
@@ -532,7 +533,7 @@ static ssize_t nvmet_ns_enable_store(struct config_item *item,
bool enable;
int ret = 0;
- if (strtobool(page, &enable))
+ if (kstrtobool(page, &enable))
return -EINVAL;
if (enable)
@@ -556,7 +557,7 @@ static ssize_t nvmet_ns_buffered_io_store(struct config_item *item,
struct nvmet_ns *ns = to_nvmet_ns(item);
bool val;
- if (strtobool(page, &val))
+ if (kstrtobool(page, &val))
return -EINVAL;
mutex_lock(&ns->subsys->lock);
@@ -579,7 +580,7 @@ static ssize_t nvmet_ns_revalidate_size_store(struct config_item *item,
struct nvmet_ns *ns = to_nvmet_ns(item);
bool val;
- if (strtobool(page, &val))
+ if (kstrtobool(page, &val))
return -EINVAL;
if (!val)
@@ -728,7 +729,7 @@ static ssize_t nvmet_passthru_enable_store(struct config_item *item,
bool enable;
int ret = 0;
- if (strtobool(page, &enable))
+ if (kstrtobool(page, &enable))
return -EINVAL;
if (enable)
@@ -995,7 +996,7 @@ static ssize_t nvmet_subsys_attr_allow_any_host_store(struct config_item *item,
bool allow_any_host;
int ret = 0;
- if (strtobool(page, &allow_any_host))
+ if (kstrtobool(page, &allow_any_host))
return -EINVAL;
down_write(&nvmet_config_sem);
@@ -1262,6 +1263,116 @@ static ssize_t nvmet_subsys_attr_model_store(struct config_item *item,
}
CONFIGFS_ATTR(nvmet_subsys_, attr_model);
+static ssize_t nvmet_subsys_attr_ieee_oui_show(struct config_item *item,
+ char *page)
+{
+ struct nvmet_subsys *subsys = to_subsys(item);
+
+ return sysfs_emit(page, "0x%06x\n", subsys->ieee_oui);
+}
+
+static ssize_t nvmet_subsys_attr_ieee_oui_store_locked(struct nvmet_subsys *subsys,
+ const char *page, size_t count)
+{
+ uint32_t val = 0;
+ int ret;
+
+ if (subsys->subsys_discovered) {
+ pr_err("Can't set IEEE OUI. 0x%06x is already assigned\n",
+ subsys->ieee_oui);
+ return -EINVAL;
+ }
+
+ ret = kstrtou32(page, 0, &val);
+ if (ret < 0)
+ return ret;
+
+ if (val >= 0x1000000)
+ return -EINVAL;
+
+ subsys->ieee_oui = val;
+
+ return count;
+}
+
+static ssize_t nvmet_subsys_attr_ieee_oui_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_subsys *subsys = to_subsys(item);
+ ssize_t ret;
+
+ down_write(&nvmet_config_sem);
+ mutex_lock(&subsys->lock);
+ ret = nvmet_subsys_attr_ieee_oui_store_locked(subsys, page, count);
+ mutex_unlock(&subsys->lock);
+ up_write(&nvmet_config_sem);
+
+ return ret;
+}
+CONFIGFS_ATTR(nvmet_subsys_, attr_ieee_oui);
+
+static ssize_t nvmet_subsys_attr_firmware_show(struct config_item *item,
+ char *page)
+{
+ struct nvmet_subsys *subsys = to_subsys(item);
+
+ return sysfs_emit(page, "%s\n", subsys->firmware_rev);
+}
+
+static ssize_t nvmet_subsys_attr_firmware_store_locked(struct nvmet_subsys *subsys,
+ const char *page, size_t count)
+{
+ int pos = 0, len;
+ char *val;
+
+ if (subsys->subsys_discovered) {
+ pr_err("Can't set firmware revision. %s is already assigned\n",
+ subsys->firmware_rev);
+ return -EINVAL;
+ }
+
+ len = strcspn(page, "\n");
+ if (!len)
+ return -EINVAL;
+
+ if (len > NVMET_FR_MAX_SIZE) {
+ pr_err("Firmware revision size can not exceed %d Bytes\n",
+ NVMET_FR_MAX_SIZE);
+ return -EINVAL;
+ }
+
+ for (pos = 0; pos < len; pos++) {
+ if (!nvmet_is_ascii(page[pos]))
+ return -EINVAL;
+ }
+
+ val = kmemdup_nul(page, len, GFP_KERNEL);
+ if (!val)
+ return -ENOMEM;
+
+ kfree(subsys->firmware_rev);
+
+ subsys->firmware_rev = val;
+
+ return count;
+}
+
+static ssize_t nvmet_subsys_attr_firmware_store(struct config_item *item,
+ const char *page, size_t count)
+{
+ struct nvmet_subsys *subsys = to_subsys(item);
+ ssize_t ret;
+
+ down_write(&nvmet_config_sem);
+ mutex_lock(&subsys->lock);
+ ret = nvmet_subsys_attr_firmware_store_locked(subsys, page, count);
+ mutex_unlock(&subsys->lock);
+ up_write(&nvmet_config_sem);
+
+ return ret;
+}
+CONFIGFS_ATTR(nvmet_subsys_, attr_firmware);
+
#ifdef CONFIG_BLK_DEV_INTEGRITY
static ssize_t nvmet_subsys_attr_pi_enable_show(struct config_item *item,
char *page)
@@ -1275,7 +1386,7 @@ static ssize_t nvmet_subsys_attr_pi_enable_store(struct config_item *item,
struct nvmet_subsys *subsys = to_subsys(item);
bool pi_enable;
- if (strtobool(page, &pi_enable))
+ if (kstrtobool(page, &pi_enable))
return -EINVAL;
subsys->pi_support = pi_enable;
@@ -1293,6 +1404,8 @@ static ssize_t nvmet_subsys_attr_qid_max_show(struct config_item *item,
static ssize_t nvmet_subsys_attr_qid_max_store(struct config_item *item,
const char *page, size_t cnt)
{
+ struct nvmet_subsys *subsys = to_subsys(item);
+ struct nvmet_ctrl *ctrl;
u16 qid_max;
if (sscanf(page, "%hu\n", &qid_max) != 1)
@@ -1302,8 +1415,13 @@ static ssize_t nvmet_subsys_attr_qid_max_store(struct config_item *item,
return -EINVAL;
down_write(&nvmet_config_sem);
- to_subsys(item)->max_qid = qid_max;
+ subsys->max_qid = qid_max;
+
+ /* Force reconnect */
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry)
+ ctrl->ops->delete_ctrl(ctrl);
up_write(&nvmet_config_sem);
+
return cnt;
}
CONFIGFS_ATTR(nvmet_subsys_, attr_qid_max);
@@ -1316,6 +1434,8 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = {
&nvmet_subsys_attr_attr_cntlid_max,
&nvmet_subsys_attr_attr_model,
&nvmet_subsys_attr_attr_qid_max,
+ &nvmet_subsys_attr_attr_ieee_oui,
+ &nvmet_subsys_attr_attr_firmware,
#ifdef CONFIG_BLK_DEV_INTEGRITY
&nvmet_subsys_attr_attr_pi_enable,
#endif
@@ -1395,7 +1515,7 @@ static ssize_t nvmet_referral_enable_store(struct config_item *item,
struct nvmet_port *port = to_nvmet_port(item);
bool enable;
- if (strtobool(page, &enable))
+ if (kstrtobool(page, &enable))
goto inval;
if (enable)
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index aecb5853f8da..f66ed13d7c11 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -10,11 +10,14 @@
#include <linux/pci-p2pdma.h>
#include <linux/scatterlist.h>
+#include <generated/utsrelease.h>
+
#define CREATE_TRACE_POINTS
#include "trace.h"
#include "nvmet.h"
+struct kmem_cache *nvmet_bvec_cache;
struct workqueue_struct *buffered_io_wq;
struct workqueue_struct *zbd_wq;
static const struct nvmet_fabrics_ops *nvmet_transports[NVMF_TRTYPE_MAX];
@@ -695,11 +698,10 @@ static void nvmet_update_sq_head(struct nvmet_req *req)
if (req->sq->size) {
u32 old_sqhd, new_sqhd;
+ old_sqhd = READ_ONCE(req->sq->sqhd);
do {
- old_sqhd = req->sq->sqhd;
new_sqhd = (old_sqhd + 1) % req->sq->size;
- } while (cmpxchg(&req->sq->sqhd, old_sqhd, new_sqhd) !=
- old_sqhd);
+ } while (!try_cmpxchg(&req->sq->sqhd, &old_sqhd, new_sqhd));
}
req->cqe->sq_head = cpu_to_le16(req->sq->sqhd & 0x0000FFFF);
}
@@ -1561,6 +1563,14 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
goto free_subsys;
}
+ subsys->ieee_oui = 0;
+
+ subsys->firmware_rev = kstrndup(UTS_RELEASE, NVMET_FR_MAX_SIZE, GFP_KERNEL);
+ if (!subsys->firmware_rev) {
+ ret = -ENOMEM;
+ goto free_mn;
+ }
+
switch (type) {
case NVME_NQN_NVME:
subsys->max_qid = NVMET_NR_QUEUES;
@@ -1572,14 +1582,14 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
default:
pr_err("%s: Unknown Subsystem type - %d\n", __func__, type);
ret = -EINVAL;
- goto free_mn;
+ goto free_fr;
}
subsys->type = type;
subsys->subsysnqn = kstrndup(subsysnqn, NVMF_NQN_SIZE,
GFP_KERNEL);
if (!subsys->subsysnqn) {
ret = -ENOMEM;
- goto free_mn;
+ goto free_fr;
}
subsys->cntlid_min = NVME_CNTLID_MIN;
subsys->cntlid_max = NVME_CNTLID_MAX;
@@ -1592,6 +1602,8 @@ struct nvmet_subsys *nvmet_subsys_alloc(const char *subsysnqn,
return subsys;
+free_fr:
+ kfree(subsys->firmware_rev);
free_mn:
kfree(subsys->model_number);
free_subsys:
@@ -1611,6 +1623,7 @@ static void nvmet_subsys_free(struct kref *ref)
kfree(subsys->subsysnqn);
kfree(subsys->model_number);
+ kfree(subsys->firmware_rev);
kfree(subsys);
}
@@ -1631,26 +1644,28 @@ void nvmet_subsys_put(struct nvmet_subsys *subsys)
static int __init nvmet_init(void)
{
- int error;
+ int error = -ENOMEM;
nvmet_ana_group_enabled[NVMET_DEFAULT_ANA_GRPID] = 1;
+ nvmet_bvec_cache = kmem_cache_create("nvmet-bvec",
+ NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec), 0,
+ SLAB_HWCACHE_ALIGN, NULL);
+ if (!nvmet_bvec_cache)
+ return -ENOMEM;
+
zbd_wq = alloc_workqueue("nvmet-zbd-wq", WQ_MEM_RECLAIM, 0);
if (!zbd_wq)
- return -ENOMEM;
+ goto out_destroy_bvec_cache;
buffered_io_wq = alloc_workqueue("nvmet-buffered-io-wq",
WQ_MEM_RECLAIM, 0);
- if (!buffered_io_wq) {
- error = -ENOMEM;
+ if (!buffered_io_wq)
goto out_free_zbd_work_queue;
- }
nvmet_wq = alloc_workqueue("nvmet-wq", WQ_MEM_RECLAIM, 0);
- if (!nvmet_wq) {
- error = -ENOMEM;
+ if (!nvmet_wq)
goto out_free_buffered_work_queue;
- }
error = nvmet_init_discovery();
if (error)
@@ -1669,6 +1684,8 @@ out_free_buffered_work_queue:
destroy_workqueue(buffered_io_wq);
out_free_zbd_work_queue:
destroy_workqueue(zbd_wq);
+out_destroy_bvec_cache:
+ kmem_cache_destroy(nvmet_bvec_cache);
return error;
}
@@ -1680,6 +1697,7 @@ static void __exit nvmet_exit(void)
destroy_workqueue(nvmet_wq);
destroy_workqueue(buffered_io_wq);
destroy_workqueue(zbd_wq);
+ kmem_cache_destroy(nvmet_bvec_cache);
BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_entry) != 1024);
BUILD_BUG_ON(sizeof(struct nvmf_disc_rsp_page_hdr) != 1024);
diff --git a/drivers/nvme/target/io-cmd-file.c b/drivers/nvme/target/io-cmd-file.c
index 946ad0240ee5..871c4f32f443 100644
--- a/drivers/nvme/target/io-cmd-file.c
+++ b/drivers/nvme/target/io-cmd-file.c
@@ -11,7 +11,6 @@
#include <linux/fs.h>
#include "nvmet.h"
-#define NVMET_MAX_MPOOL_BVEC 16
#define NVMET_MIN_MPOOL_OBJ 16
void nvmet_file_ns_revalidate(struct nvmet_ns *ns)
@@ -26,8 +25,6 @@ void nvmet_file_ns_disable(struct nvmet_ns *ns)
flush_workqueue(buffered_io_wq);
mempool_destroy(ns->bvec_pool);
ns->bvec_pool = NULL;
- kmem_cache_destroy(ns->bvec_cache);
- ns->bvec_cache = NULL;
fput(ns->file);
ns->file = NULL;
}
@@ -59,16 +56,8 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns)
ns->blksize_shift = min_t(u8,
file_inode(ns->file)->i_blkbits, 12);
- ns->bvec_cache = kmem_cache_create("nvmet-bvec",
- NVMET_MAX_MPOOL_BVEC * sizeof(struct bio_vec),
- 0, SLAB_HWCACHE_ALIGN, NULL);
- if (!ns->bvec_cache) {
- ret = -ENOMEM;
- goto err;
- }
-
ns->bvec_pool = mempool_create(NVMET_MIN_MPOOL_OBJ, mempool_alloc_slab,
- mempool_free_slab, ns->bvec_cache);
+ mempool_free_slab, nvmet_bvec_cache);
if (!ns->bvec_pool) {
ret = -ENOMEM;
@@ -77,9 +66,10 @@ int nvmet_file_ns_enable(struct nvmet_ns *ns)
return ret;
err:
+ fput(ns->file);
+ ns->file = NULL;
ns->size = 0;
ns->blksize_shift = 0;
- nvmet_file_ns_disable(ns);
return ret;
}
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index b45fe3adf015..f2d24b2d992f 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -145,7 +145,7 @@ static blk_status_t nvme_loop_queue_rq(struct blk_mq_hw_ctx *hctx,
if (ret)
return ret;
- blk_mq_start_request(req);
+ nvme_start_request(req);
iod->cmd.common.flags |= NVME_CMD_SGL_METABUF;
iod->req.port = queue->ctrl->port;
if (!nvmet_req_init(&iod->req, &queue->nvme_cq,
@@ -353,7 +353,7 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
ctrl->ctrl.queue_count = 1;
error = nvme_alloc_admin_tag_set(&ctrl->ctrl, &ctrl->admin_tag_set,
- &nvme_loop_admin_mq_ops, BLK_MQ_F_NO_SCHED,
+ &nvme_loop_admin_mq_ops,
sizeof(struct nvme_loop_iod) +
NVME_INLINE_SG_CNT * sizeof(struct scatterlist));
if (error)
@@ -375,9 +375,9 @@ static int nvme_loop_configure_admin_queue(struct nvme_loop_ctrl *ctrl)
ctrl->ctrl.max_hw_sectors =
(NVME_LOOP_MAX_SEGMENTS - 1) << (PAGE_SHIFT - 9);
- nvme_start_admin_queue(&ctrl->ctrl);
+ nvme_unquiesce_admin_queue(&ctrl->ctrl);
- error = nvme_init_ctrl_finish(&ctrl->ctrl);
+ error = nvme_init_ctrl_finish(&ctrl->ctrl, false);
if (error)
goto out_cleanup_tagset;
@@ -394,14 +394,14 @@ out_free_sq:
static void nvme_loop_shutdown_ctrl(struct nvme_loop_ctrl *ctrl)
{
if (ctrl->ctrl.queue_count > 1) {
- nvme_stop_queues(&ctrl->ctrl);
+ nvme_quiesce_io_queues(&ctrl->ctrl);
nvme_cancel_tagset(&ctrl->ctrl);
nvme_loop_destroy_io_queues(ctrl);
}
- nvme_stop_admin_queue(&ctrl->ctrl);
+ nvme_quiesce_admin_queue(&ctrl->ctrl);
if (ctrl->ctrl.state == NVME_CTRL_LIVE)
- nvme_shutdown_ctrl(&ctrl->ctrl);
+ nvme_disable_ctrl(&ctrl->ctrl, true);
nvme_cancel_admin_tagset(&ctrl->ctrl);
nvme_loop_destroy_admin_queue(ctrl);
@@ -494,7 +494,7 @@ static int nvme_loop_create_io_queues(struct nvme_loop_ctrl *ctrl)
return ret;
ret = nvme_alloc_io_tag_set(&ctrl->ctrl, &ctrl->tag_set,
- &nvme_loop_mq_ops, BLK_MQ_F_SHOULD_MERGE,
+ &nvme_loop_mq_ops, 1,
sizeof(struct nvme_loop_iod) +
NVME_INLINE_SG_CNT * sizeof(struct scatterlist));
if (ret)
diff --git a/drivers/nvme/target/nvmet.h b/drivers/nvme/target/nvmet.h
index dfe3894205aa..89bedfcd974c 100644
--- a/drivers/nvme/target/nvmet.h
+++ b/drivers/nvme/target/nvmet.h
@@ -29,6 +29,7 @@
#define NVMET_DEFAULT_CTRL_MODEL "Linux"
#define NVMET_MN_MAX_SIZE 40
#define NVMET_SN_MAX_SIZE 20
+#define NVMET_FR_MAX_SIZE 8
/*
* Supported optional AENs:
@@ -77,7 +78,6 @@ struct nvmet_ns {
struct completion disable_done;
mempool_t *bvec_pool;
- struct kmem_cache *bvec_cache;
int use_p2pmem;
struct pci_dev *p2p_dev;
@@ -264,6 +264,8 @@ struct nvmet_subsys {
struct config_group allowed_hosts_group;
char *model_number;
+ u32 ieee_oui;
+ char *firmware_rev;
#ifdef CONFIG_NVME_TARGET_PASSTHRU
struct nvme_ctrl *passthru_ctrl;
@@ -393,6 +395,8 @@ struct nvmet_req {
u64 error_slba;
};
+#define NVMET_MAX_MPOOL_BVEC 16
+extern struct kmem_cache *nvmet_bvec_cache;
extern struct workqueue_struct *buffered_io_wq;
extern struct workqueue_struct *zbd_wq;
extern struct workqueue_struct *nvmet_wq;
diff --git a/drivers/pci/p2pdma.c b/drivers/pci/p2pdma.c
index 5565f67d6537..86812d2073ea 100644
--- a/drivers/pci/p2pdma.c
+++ b/drivers/pci/p2pdma.c
@@ -89,6 +89,90 @@ static ssize_t published_show(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RO(published);
+static int p2pmem_alloc_mmap(struct file *filp, struct kobject *kobj,
+ struct bin_attribute *attr, struct vm_area_struct *vma)
+{
+ struct pci_dev *pdev = to_pci_dev(kobj_to_dev(kobj));
+ size_t len = vma->vm_end - vma->vm_start;
+ struct pci_p2pdma *p2pdma;
+ struct percpu_ref *ref;
+ unsigned long vaddr;
+ void *kaddr;
+ int ret;
+
+ /* prevent private mappings from being established */
+ if ((vma->vm_flags & VM_MAYSHARE) != VM_MAYSHARE) {
+ pci_info_ratelimited(pdev,
+ "%s: fail, attempted private mapping\n",
+ current->comm);
+ return -EINVAL;
+ }
+
+ if (vma->vm_pgoff) {
+ pci_info_ratelimited(pdev,
+ "%s: fail, attempted mapping with non-zero offset\n",
+ current->comm);
+ return -EINVAL;
+ }
+
+ rcu_read_lock();
+ p2pdma = rcu_dereference(pdev->p2pdma);
+ if (!p2pdma) {
+ ret = -ENODEV;
+ goto out;
+ }
+
+ kaddr = (void *)gen_pool_alloc_owner(p2pdma->pool, len, (void **)&ref);
+ if (!kaddr) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ /*
+ * vm_insert_page() can sleep, so a reference is taken to mapping
+ * such that rcu_read_unlock() can be done before inserting the
+ * pages
+ */
+ if (unlikely(!percpu_ref_tryget_live_rcu(ref))) {
+ ret = -ENODEV;
+ goto out_free_mem;
+ }
+ rcu_read_unlock();
+
+ for (vaddr = vma->vm_start; vaddr < vma->vm_end; vaddr += PAGE_SIZE) {
+ ret = vm_insert_page(vma, vaddr, virt_to_page(kaddr));
+ if (ret) {
+ gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
+ return ret;
+ }
+ percpu_ref_get(ref);
+ put_page(virt_to_page(kaddr));
+ kaddr += PAGE_SIZE;
+ len -= PAGE_SIZE;
+ }
+
+ percpu_ref_put(ref);
+
+ return 0;
+out_free_mem:
+ gen_pool_free(p2pdma->pool, (uintptr_t)kaddr, len);
+out:
+ rcu_read_unlock();
+ return ret;
+}
+
+static struct bin_attribute p2pmem_alloc_attr = {
+ .attr = { .name = "allocate", .mode = 0660 },
+ .mmap = p2pmem_alloc_mmap,
+ /*
+ * Some places where we want to call mmap (ie. python) will check
+ * that the file size is greater than the mmap size before allowing
+ * the mmap to continue. To work around this, just set the size
+ * to be very large.
+ */
+ .size = SZ_1T,
+};
+
static struct attribute *p2pmem_attrs[] = {
&dev_attr_size.attr,
&dev_attr_available.attr,
@@ -96,11 +180,32 @@ static struct attribute *p2pmem_attrs[] = {
NULL,
};
+static struct bin_attribute *p2pmem_bin_attrs[] = {
+ &p2pmem_alloc_attr,
+ NULL,
+};
+
static const struct attribute_group p2pmem_group = {
.attrs = p2pmem_attrs,
+ .bin_attrs = p2pmem_bin_attrs,
.name = "p2pmem",
};
+static void p2pdma_page_free(struct page *page)
+{
+ struct pci_p2pdma_pagemap *pgmap = to_p2p_pgmap(page->pgmap);
+ struct percpu_ref *ref;
+
+ gen_pool_free_owner(pgmap->provider->p2pdma->pool,
+ (uintptr_t)page_to_virt(page), PAGE_SIZE,
+ (void **)&ref);
+ percpu_ref_put(ref);
+}
+
+static const struct dev_pagemap_ops p2pdma_pgmap_ops = {
+ .page_free = p2pdma_page_free,
+};
+
static void pci_p2pdma_release(void *data)
{
struct pci_dev *pdev = data;
@@ -152,6 +257,19 @@ out:
return error;
}
+static void pci_p2pdma_unmap_mappings(void *data)
+{
+ struct pci_dev *pdev = data;
+
+ /*
+ * Removing the alloc attribute from sysfs will call
+ * unmap_mapping_range() on the inode, teardown any existing userspace
+ * mappings and prevent new ones from being created.
+ */
+ sysfs_remove_file_from_group(&pdev->dev.kobj, &p2pmem_alloc_attr.attr,
+ p2pmem_group.name);
+}
+
/**
* pci_p2pdma_add_resource - add memory for use as p2p memory
* @pdev: the device to add the memory to
@@ -198,6 +316,7 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
pgmap->range.end = pgmap->range.start + size - 1;
pgmap->nr_range = 1;
pgmap->type = MEMORY_DEVICE_PCI_P2PDMA;
+ pgmap->ops = &p2pdma_pgmap_ops;
p2p_pgmap->provider = pdev;
p2p_pgmap->bus_offset = pci_bus_address(pdev, bar) -
@@ -209,6 +328,11 @@ int pci_p2pdma_add_resource(struct pci_dev *pdev, int bar, size_t size,
goto pgmap_free;
}
+ error = devm_add_action_or_reset(&pdev->dev, pci_p2pdma_unmap_mappings,
+ pdev);
+ if (error)
+ goto pages_free;
+
p2pdma = rcu_dereference_protected(pdev->p2pdma, 1);
error = gen_pool_add_owner(p2pdma->pool, (unsigned long)addr,
pci_bus_address(pdev, bar) + offset,
diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c
index 8b89fab7c420..249757ddd8fe 100644
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -2735,7 +2735,7 @@ static void scsi_stop_queue(struct scsi_device *sdev, bool nowait)
blk_mq_quiesce_queue(sdev->request_queue);
} else {
if (!nowait)
- blk_mq_wait_quiesce_done(sdev->request_queue);
+ blk_mq_wait_quiesce_done(sdev->request_queue->tag_set);
}
}
diff --git a/drivers/scsi/scsi_scan.c b/drivers/scsi/scsi_scan.c
index 5d27f5196de6..0a95fa787fdf 100644
--- a/drivers/scsi/scsi_scan.c
+++ b/drivers/scsi/scsi_scan.c
@@ -344,7 +344,6 @@ static struct scsi_device *scsi_alloc_sdev(struct scsi_target *starget,
sdev->request_queue = q;
q->queuedata = sdev;
__scsi_init_queue(sdev->host, q);
- WARN_ON_ONCE(!blk_get_queue(q));
depth = sdev->host->cmd_per_lun ?: 1;
diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c
index b1f59a5fe632..d2b11d5b91ce 100644
--- a/drivers/ufs/core/ufshcd.c
+++ b/drivers/ufs/core/ufshcd.c
@@ -9544,6 +9544,7 @@ void ufshcd_remove(struct ufs_hba *hba)
ufshpb_remove(hba);
ufs_sysfs_remove_nodes(hba->dev);
blk_mq_destroy_queue(hba->tmf_queue);
+ blk_put_queue(hba->tmf_queue);
blk_mq_free_tag_set(&hba->tmf_tag_set);
scsi_remove_host(hba->host);
/* disable interrupts */
@@ -9840,6 +9841,7 @@ int ufshcd_init(struct ufs_hba *hba, void __iomem *mmio_base, unsigned int irq)
free_tmf_queue:
blk_mq_destroy_queue(hba->tmf_queue);
+ blk_put_queue(hba->tmf_queue);
free_tmf_tag_set:
blk_mq_free_tag_set(&hba->tmf_tag_set);
out_remove_scsi_host:
diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c
index cea8b14007e6..8bfb3ce86476 100644
--- a/fs/crypto/inline_crypt.c
+++ b/fs/crypto/inline_crypt.c
@@ -12,7 +12,7 @@
* provides the key and IV to use.
*/
-#include <linux/blk-crypto-profile.h>
+#include <linux/blk-crypto.h>
#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/sched/mm.h>
@@ -77,10 +77,8 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode,
unsigned int i;
for (i = 0; i < num_devs; i++) {
- struct request_queue *q = bdev_get_queue(devs[i]);
-
if (!IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
- __blk_crypto_cfg_supported(q->crypto_profile, cfg)) {
+ blk_crypto_config_supported_natively(devs[i], cfg)) {
if (!xchg(&mode->logged_blk_crypto_native, 1))
pr_info("fscrypt: %s using blk-crypto (native)\n",
mode->friendly_name);
@@ -139,8 +137,7 @@ int fscrypt_select_encryption_impl(struct fscrypt_info *ci)
return PTR_ERR(devs);
for (i = 0; i < num_devs; i++) {
- if (!blk_crypto_config_supported(bdev_get_queue(devs[i]),
- &crypto_cfg))
+ if (!blk_crypto_config_supported(devs[i], &crypto_cfg))
goto out_free_devs;
}
@@ -184,8 +181,7 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key,
goto fail;
}
for (i = 0; i < num_devs; i++) {
- err = blk_crypto_start_using_key(blk_key,
- bdev_get_queue(devs[i]));
+ err = blk_crypto_start_using_key(devs[i], blk_key);
if (err)
break;
}
@@ -224,7 +220,7 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb,
devs = fscrypt_get_devices(sb, &num_devs);
if (!IS_ERR(devs)) {
for (i = 0; i < num_devs; i++)
- blk_crypto_evict_key(bdev_get_queue(devs[i]), blk_key);
+ blk_crypto_evict_key(devs[i], blk_key);
kfree(devs);
}
kfree_sensitive(blk_key);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 2c5806997bbf..b231a665682a 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -475,8 +475,6 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty);
extern void bio_set_pages_dirty(struct bio *bio);
extern void bio_check_pages_dirty(struct bio *bio);
-extern void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
- struct bio *src, struct bvec_iter *src_iter);
extern void bio_copy_data(struct bio *dst, struct bio *src);
extern void bio_free_pages(struct bio *bio);
void guard_bio_eod(struct bio *bio);
diff --git a/include/linux/blk-crypto-profile.h b/include/linux/blk-crypto-profile.h
index bbab65bd5428..e6802b69cdd6 100644
--- a/include/linux/blk-crypto-profile.h
+++ b/include/linux/blk-crypto-profile.h
@@ -138,18 +138,6 @@ int devm_blk_crypto_profile_init(struct device *dev,
unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot);
-blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
- const struct blk_crypto_key *key,
- struct blk_crypto_keyslot **slot_ptr);
-
-void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot);
-
-bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
- const struct blk_crypto_config *cfg);
-
-int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
- const struct blk_crypto_key *key);
-
void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile);
void blk_crypto_profile_destroy(struct blk_crypto_profile *profile);
diff --git a/include/linux/blk-crypto.h b/include/linux/blk-crypto.h
index 26b1b71c3091..1e3e5d0adf12 100644
--- a/include/linux/blk-crypto.h
+++ b/include/linux/blk-crypto.h
@@ -72,9 +72,6 @@ struct bio_crypt_ctx {
#include <linux/blk_types.h>
#include <linux/blkdev.h>
-struct request;
-struct request_queue;
-
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
static inline bool bio_has_crypt_ctx(struct bio *bio)
@@ -95,13 +92,15 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
unsigned int dun_bytes,
unsigned int data_unit_size);
-int blk_crypto_start_using_key(const struct blk_crypto_key *key,
- struct request_queue *q);
+int blk_crypto_start_using_key(struct block_device *bdev,
+ const struct blk_crypto_key *key);
-int blk_crypto_evict_key(struct request_queue *q,
+int blk_crypto_evict_key(struct block_device *bdev,
const struct blk_crypto_key *key);
-bool blk_crypto_config_supported(struct request_queue *q,
+bool blk_crypto_config_supported_natively(struct block_device *bdev,
+ const struct blk_crypto_config *cfg);
+bool blk_crypto_config_supported(struct block_device *bdev,
const struct blk_crypto_config *cfg);
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h
index d6119c5d1069..779fba613bd0 100644
--- a/include/linux/blk-mq.h
+++ b/include/linux/blk-mq.h
@@ -7,6 +7,7 @@
#include <linux/lockdep.h>
#include <linux/scatterlist.h>
#include <linux/prefetch.h>
+#include <linux/srcu.h>
struct blk_mq_tags;
struct blk_flush_queue;
@@ -140,7 +141,6 @@ struct request {
struct blk_crypto_keyslot *crypt_keyslot;
#endif
- unsigned short write_hint;
unsigned short ioprio;
enum mq_rq_state state;
@@ -501,6 +501,8 @@ enum hctx_type {
* @tag_list_lock: Serializes tag_list accesses.
* @tag_list: List of the request queues that use this tag set. See also
* request_queue.tag_set_list.
+ * @srcu: Use as lock when type of the request queue is blocking
+ * (BLK_MQ_F_BLOCKING).
*/
struct blk_mq_tag_set {
struct blk_mq_queue_map map[HCTX_MAX_TYPES];
@@ -521,6 +523,7 @@ struct blk_mq_tag_set {
struct mutex tag_list_lock;
struct list_head tag_list;
+ struct srcu_struct *srcu;
};
/**
@@ -878,7 +881,9 @@ void blk_mq_start_hw_queues(struct request_queue *q);
void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async);
void blk_mq_quiesce_queue(struct request_queue *q);
-void blk_mq_wait_quiesce_done(struct request_queue *q);
+void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set);
+void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set);
+void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set);
void blk_mq_unquiesce_queue(struct request_queue *q);
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs);
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async);
diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h
index e0b098089ef2..99be590f952f 100644
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -472,13 +472,6 @@ static inline enum req_op bio_op(const struct bio *bio)
return bio->bi_opf & REQ_OP_MASK;
}
-/* obsolete, don't use in new code */
-static inline void bio_set_op_attrs(struct bio *bio, enum req_op op,
- blk_opf_t op_flags)
-{
- bio->bi_opf = op | op_flags;
-}
-
static inline bool op_is_write(blk_opf_t op)
{
return !!(op & (__force blk_opf_t)1);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 891f8cbcd043..301cf1cf4f2f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -22,7 +22,6 @@
#include <linux/blkzoned.h>
#include <linux/sched.h>
#include <linux/sbitmap.h>
-#include <linux/srcu.h>
#include <linux/uuid.h>
#include <linux/xarray.h>
@@ -156,6 +155,7 @@ struct gendisk {
unsigned open_partitions; /* number of open partitions */
struct backing_dev_info *bdi;
+ struct kobject queue_kobj; /* the queue/ directory */
struct kobject *slave_dir;
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
struct list_head slave_bdevs;
@@ -438,10 +438,7 @@ struct request_queue {
struct gendisk *disk;
- /*
- * queue kobject
- */
- struct kobject kobj;
+ refcount_t refs;
/*
* mq queue kobject
@@ -544,18 +541,11 @@ struct request_queue {
struct mutex debugfs_mutex;
bool mq_sysfs_init_done;
-
- /**
- * @srcu: Sleepable RCU. Use as lock when type of the request queue
- * is blocking (BLK_MQ_F_BLOCKING). Must be the last member
- */
- struct srcu_struct srcu[];
};
/* Keep blk_queue_flag_name[] in sync with the definitions below */
#define QUEUE_FLAG_STOPPED 0 /* queue is stopped */
#define QUEUE_FLAG_DYING 1 /* queue being torn down */
-#define QUEUE_FLAG_HAS_SRCU 2 /* SRCU is allocated */
#define QUEUE_FLAG_NOMERGES 3 /* disable merge attempts */
#define QUEUE_FLAG_SAME_COMP 4 /* complete on same CPU-group */
#define QUEUE_FLAG_FAIL_IO 5 /* fake timeout */
@@ -580,6 +570,7 @@ struct request_queue {
#define QUEUE_FLAG_HCTX_ACTIVE 28 /* at least one blk-mq hctx is active */
#define QUEUE_FLAG_NOWAIT 29 /* device supports NOWAIT */
#define QUEUE_FLAG_SQ_SCHED 30 /* single queue style io dispatch */
+#define QUEUE_FLAG_SKIP_TAGSET_QUIESCE 31 /* quiesce_tagset skip the queue*/
#define QUEUE_FLAG_MQ_DEFAULT ((1UL << QUEUE_FLAG_IO_STAT) | \
(1UL << QUEUE_FLAG_SAME_COMP) | \
@@ -591,7 +582,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
#define blk_queue_stopped(q) test_bit(QUEUE_FLAG_STOPPED, &(q)->queue_flags)
#define blk_queue_dying(q) test_bit(QUEUE_FLAG_DYING, &(q)->queue_flags)
-#define blk_queue_has_srcu(q) test_bit(QUEUE_FLAG_HAS_SRCU, &(q)->queue_flags)
#define blk_queue_init_done(q) test_bit(QUEUE_FLAG_INIT_DONE, &(q)->queue_flags)
#define blk_queue_nomerges(q) test_bit(QUEUE_FLAG_NOMERGES, &(q)->queue_flags)
#define blk_queue_noxmerges(q) \
@@ -620,6 +610,8 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q);
#define blk_queue_pm_only(q) atomic_read(&(q)->pm_only)
#define blk_queue_registered(q) test_bit(QUEUE_FLAG_REGISTERED, &(q)->queue_flags)
#define blk_queue_sq_sched(q) test_bit(QUEUE_FLAG_SQ_SCHED, &(q)->queue_flags)
+#define blk_queue_skip_tagset_quiesce(q) \
+ test_bit(QUEUE_FLAG_SKIP_TAGSET_QUIESCE, &(q)->queue_flags)
extern void blk_set_pm_only(struct request_queue *q);
extern void blk_clear_pm_only(struct request_queue *q);
@@ -840,7 +832,6 @@ void set_capacity(struct gendisk *disk, sector_t size);
#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk);
void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk);
-int bd_register_pending_holders(struct gendisk *disk);
#else
static inline int bd_link_disk_holder(struct block_device *bdev,
struct gendisk *disk)
@@ -851,10 +842,6 @@ static inline void bd_unlink_disk_holder(struct block_device *bdev,
struct gendisk *disk)
{
}
-static inline int bd_register_pending_holders(struct gendisk *disk)
-{
- return 0;
-}
#endif /* CONFIG_BLOCK_HOLDER_DEPRECATED */
dev_t part_devt(struct gendisk *disk, u8 partno);
@@ -1349,12 +1336,7 @@ static inline int blk_rq_aligned(struct request_queue *q, unsigned long addr,
/* assumes size > 256 */
static inline unsigned int blksize_bits(unsigned int size)
{
- unsigned int bits = 8;
- do {
- bits++;
- size >>= 1;
- } while (size > 256);
- return bits;
+ return order_base_2(size >> SECTOR_SHIFT) + SECTOR_SHIFT;
}
static inline unsigned int block_size(struct block_device *bdev)
@@ -1413,7 +1395,6 @@ struct block_device_operations {
void (*swap_slot_free_notify) (struct block_device *, unsigned long);
int (*report_zones)(struct gendisk *, sector_t sector,
unsigned int nr_zones, report_zones_cb cb, void *data);
- char *(*devnode)(struct gendisk *disk, umode_t *mode);
/* returns the length of the identifier or a negative errno: */
int (*get_unique_id)(struct gendisk *disk, u8 id[16],
enum blk_unique_id id_type);
@@ -1458,7 +1439,6 @@ unsigned long bdev_start_io_acct(struct block_device *bdev,
void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
unsigned long start_time);
-void bio_start_io_acct_time(struct bio *bio, unsigned long start_time);
unsigned long bio_start_io_acct(struct bio *bio);
void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
struct block_device *orig_bdev);
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h
index 07add7882a5d..c9afcdd9324c 100644
--- a/include/linux/lru_cache.h
+++ b/include/linux/lru_cache.h
@@ -199,7 +199,6 @@ struct lru_cache {
unsigned long flags;
- void *lc_private;
const char *name;
/* nr_elements there */
@@ -241,7 +240,6 @@ extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
unsigned e_count, size_t e_size, size_t e_off);
extern void lc_reset(struct lru_cache *lc);
extern void lc_destroy(struct lru_cache *lc);
-extern void lc_set(struct lru_cache *lc, unsigned int enr, int index);
extern void lc_del(struct lru_cache *lc, struct lc_element *element);
extern struct lc_element *lc_get_cumulative(struct lru_cache *lc, unsigned int enr);
@@ -297,6 +295,5 @@ extern bool lc_is_used(struct lru_cache *lc, unsigned int enr);
container_of(ptr, type, member)
extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i);
-extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e);
#endif
diff --git a/include/linux/mempool.h b/include/linux/mempool.h
index 0c964ac107c2..4aae6c06c5f2 100644
--- a/include/linux/mempool.h
+++ b/include/linux/mempool.h
@@ -30,6 +30,11 @@ static inline bool mempool_initialized(mempool_t *pool)
return pool->elements != NULL;
}
+static inline bool mempool_is_saturated(mempool_t *pool)
+{
+ return READ_ONCE(pool->curr_nr) >= pool->min_nr;
+}
+
void mempool_exit(mempool_t *pool);
int mempool_init_node(mempool_t *pool, int min_nr, mempool_alloc_t *alloc_fn,
mempool_free_t *free_fn, void *pool_data,
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 974ccca609d2..6a05a3bc0a28 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1129,7 +1129,7 @@ static inline void get_page(struct page *page)
folio_get(page_folio(page));
}
-bool __must_check try_grab_page(struct page *page, unsigned int flags);
+int __must_check try_grab_page(struct page *page, unsigned int flags);
static inline __must_check bool try_get_page(struct page *page)
{
@@ -2979,6 +2979,7 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
#define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */
#define FOLL_PIN 0x40000 /* pages must be released via unpin_user_page */
#define FOLL_FAST_ONLY 0x80000 /* gup_fast: prevent fall-back to slow gup */
+#define FOLL_PCI_P2PDMA 0x100000 /* allow returning PCI P2PDMA pages */
/*
* FOLL_PIN and FOLL_LONGTERM may be used in various combinations with each
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 5f74891556f3..9c49ec5d0e25 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -986,6 +986,25 @@ static inline bool is_zone_device_page(const struct page *page)
{
return page_zonenum(page) == ZONE_DEVICE;
}
+
+/*
+ * Consecutive zone device pages should not be merged into the same sgl
+ * or bvec segment with other types of pages or if they belong to different
+ * pgmaps. Otherwise getting the pgmap of a given segment is not possible
+ * without scanning the entire segment. This helper returns true either if
+ * both pages are not zone device pages or both pages are zone device pages
+ * with the same pgmap.
+ */
+static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
+ const struct page *b)
+{
+ if (is_zone_device_page(a) != is_zone_device_page(b))
+ return false;
+ if (!is_zone_device_page(a))
+ return true;
+ return a->pgmap == b->pgmap;
+}
+
extern void memmap_init_zone_device(struct zone *, unsigned long,
unsigned long, struct dev_pagemap *);
#else
@@ -993,6 +1012,11 @@ static inline bool is_zone_device_page(const struct page *page)
{
return false;
}
+static inline bool zone_device_pages_have_same_pgmap(const struct page *a,
+ const struct page *b)
+{
+ return true;
+}
#endif
static inline bool folio_is_zone_device(const struct folio *folio)
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index 050d7d0cd81b..d6be2a686100 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -797,6 +797,7 @@ enum nvme_opcode {
nvme_cmd_zone_mgmt_send = 0x79,
nvme_cmd_zone_mgmt_recv = 0x7a,
nvme_cmd_zone_append = 0x7d,
+ nvme_cmd_vendor_start = 0x80,
};
#define nvme_opcode_name(opcode) { opcode, #opcode }
@@ -963,6 +964,7 @@ enum {
NVME_RW_PRINFO_PRCHK_GUARD = 1 << 12,
NVME_RW_PRINFO_PRACT = 1 << 13,
NVME_RW_DTYPE_STREAMS = 1 << 4,
+ NVME_WZ_DEAC = 1 << 9,
};
struct nvme_dsm_cmd {
diff --git a/include/linux/pktcdvd.h b/include/linux/pktcdvd.h
deleted file mode 100644
index f9c5ac80d59b..000000000000
--- a/include/linux/pktcdvd.h
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Copyright (C) 2000 Jens Axboe <axboe@suse.de>
- * Copyright (C) 2001-2004 Peter Osterlund <petero2@telia.com>
- *
- * May be copied or modified under the terms of the GNU General Public
- * License. See linux/COPYING for more information.
- *
- * Packet writing layer for ATAPI and SCSI CD-R, CD-RW, DVD-R, and
- * DVD-RW devices.
- *
- */
-#ifndef __PKTCDVD_H
-#define __PKTCDVD_H
-
-#include <linux/blkdev.h>
-#include <linux/completion.h>
-#include <linux/cdrom.h>
-#include <linux/kobject.h>
-#include <linux/sysfs.h>
-#include <linux/mempool.h>
-#include <uapi/linux/pktcdvd.h>
-
-/* default bio write queue congestion marks */
-#define PKT_WRITE_CONGESTION_ON 10000
-#define PKT_WRITE_CONGESTION_OFF 9000
-
-
-struct packet_settings
-{
- __u32 size; /* packet size in (512 byte) sectors */
- __u8 fp; /* fixed packets */
- __u8 link_loss; /* the rest is specified
- * as per Mt Fuji */
- __u8 write_type;
- __u8 track_mode;
- __u8 block_mode;
-};
-
-/*
- * Very crude stats for now
- */
-struct packet_stats
-{
- unsigned long pkt_started;
- unsigned long pkt_ended;
- unsigned long secs_w;
- unsigned long secs_rg;
- unsigned long secs_r;
-};
-
-struct packet_cdrw
-{
- struct list_head pkt_free_list;
- struct list_head pkt_active_list;
- spinlock_t active_list_lock; /* Serialize access to pkt_active_list */
- struct task_struct *thread;
- atomic_t pending_bios;
-};
-
-/*
- * Switch to high speed reading after reading this many kilobytes
- * with no interspersed writes.
- */
-#define HI_SPEED_SWITCH 512
-
-struct packet_iosched
-{
- atomic_t attention; /* Set to non-zero when queue processing is needed */
- int writing; /* Non-zero when writing, zero when reading */
- spinlock_t lock; /* Protecting read/write queue manipulations */
- struct bio_list read_queue;
- struct bio_list write_queue;
- sector_t last_write; /* The sector where the last write ended */
- int successive_reads;
-};
-
-/*
- * 32 buffers of 2048 bytes
- */
-#if (PAGE_SIZE % CD_FRAMESIZE) != 0
-#error "PAGE_SIZE must be a multiple of CD_FRAMESIZE"
-#endif
-#define PACKET_MAX_SIZE 128
-#define FRAMES_PER_PAGE (PAGE_SIZE / CD_FRAMESIZE)
-#define PACKET_MAX_SECTORS (PACKET_MAX_SIZE * CD_FRAMESIZE >> 9)
-
-enum packet_data_state {
- PACKET_IDLE_STATE, /* Not used at the moment */
- PACKET_WAITING_STATE, /* Waiting for more bios to arrive, so */
- /* we don't have to do as much */
- /* data gathering */
- PACKET_READ_WAIT_STATE, /* Waiting for reads to fill in holes */
- PACKET_WRITE_WAIT_STATE, /* Waiting for the write to complete */
- PACKET_RECOVERY_STATE, /* Recover after read/write errors */
- PACKET_FINISHED_STATE, /* After write has finished */
-
- PACKET_NUM_STATES /* Number of possible states */
-};
-
-/*
- * Information needed for writing a single packet
- */
-struct pktcdvd_device;
-
-struct packet_data
-{
- struct list_head list;
-
- spinlock_t lock; /* Lock protecting state transitions and */
- /* orig_bios list */
-
- struct bio_list orig_bios; /* Original bios passed to pkt_make_request */
- /* that will be handled by this packet */
- int write_size; /* Total size of all bios in the orig_bios */
- /* list, measured in number of frames */
-
- struct bio *w_bio; /* The bio we will send to the real CD */
- /* device once we have all data for the */
- /* packet we are going to write */
- sector_t sector; /* First sector in this packet */
- int frames; /* Number of frames in this packet */
-
- enum packet_data_state state; /* Current state */
- atomic_t run_sm; /* Incremented whenever the state */
- /* machine needs to be run */
- long sleep_time; /* Set this to non-zero to make the state */
- /* machine run after this many jiffies. */
-
- atomic_t io_wait; /* Number of pending IO operations */
- atomic_t io_errors; /* Number of read/write errors during IO */
-
- struct bio *r_bios[PACKET_MAX_SIZE]; /* bios to use during data gathering */
- struct page *pages[PACKET_MAX_SIZE / FRAMES_PER_PAGE];
-
- int cache_valid; /* If non-zero, the data for the zone defined */
- /* by the sector variable is completely cached */
- /* in the pages[] vector. */
-
- int id; /* ID number for debugging */
- struct pktcdvd_device *pd;
-};
-
-struct pkt_rb_node {
- struct rb_node rb_node;
- struct bio *bio;
-};
-
-struct packet_stacked_data
-{
- struct bio *bio; /* Original read request bio */
- struct pktcdvd_device *pd;
-};
-#define PSD_POOL_SIZE 64
-
-struct pktcdvd_device
-{
- struct block_device *bdev; /* dev attached */
- dev_t pkt_dev; /* our dev */
- char name[20];
- struct packet_settings settings;
- struct packet_stats stats;
- int refcnt; /* Open count */
- int write_speed; /* current write speed, kB/s */
- int read_speed; /* current read speed, kB/s */
- unsigned long offset; /* start offset */
- __u8 mode_offset; /* 0 / 8 */
- __u8 type;
- unsigned long flags;
- __u16 mmc3_profile;
- __u32 nwa; /* next writable address */
- __u32 lra; /* last recorded address */
- struct packet_cdrw cdrw;
- wait_queue_head_t wqueue;
-
- spinlock_t lock; /* Serialize access to bio_queue */
- struct rb_root bio_queue; /* Work queue of bios we need to handle */
- int bio_queue_size; /* Number of nodes in bio_queue */
- bool congested; /* Someone is waiting for bio_queue_size
- * to drop. */
- sector_t current_sector; /* Keep track of where the elevator is */
- atomic_t scan_queue; /* Set to non-zero when pkt_handle_queue */
- /* needs to be run. */
- mempool_t rb_pool; /* mempool for pkt_rb_node allocations */
-
- struct packet_iosched iosched;
- struct gendisk *disk;
-
- int write_congestion_off;
- int write_congestion_on;
-
- struct device *dev; /* sysfs pktcdvd[0-7] dev */
-
- struct dentry *dfs_d_root; /* debugfs: devname directory */
- struct dentry *dfs_f_info; /* debugfs: info file */
-};
-
-#endif /* __PKTCDVD_H */
diff --git a/include/linux/raid/pq.h b/include/linux/raid/pq.h
index d6e5a1feb947..f29aaaf2eb21 100644
--- a/include/linux/raid/pq.h
+++ b/include/linux/raid/pq.h
@@ -10,17 +10,9 @@
#ifdef __KERNEL__
-/* Set to 1 to use kernel-wide empty_zero_page */
-#define RAID6_USE_EMPTY_ZERO_PAGE 0
#include <linux/blkdev.h>
-/* We need a pre-zeroed page... if we don't want to use the kernel-provided
- one define it here */
-#if RAID6_USE_EMPTY_ZERO_PAGE
-# define raid6_empty_zero_page empty_zero_page
-#else
extern const char raid6_empty_zero_page[PAGE_SIZE];
-#endif
#else /* ! __KERNEL__ */
/* Used for testing in user space */
diff --git a/include/linux/sbitmap.h b/include/linux/sbitmap.h
index 4d2d5205ab58..d662cf136021 100644
--- a/include/linux/sbitmap.h
+++ b/include/linux/sbitmap.h
@@ -87,11 +87,6 @@ struct sbitmap {
*/
struct sbq_wait_state {
/**
- * @wait_cnt: Number of frees remaining before we wake up.
- */
- atomic_t wait_cnt;
-
- /**
* @wait: Wait queue.
*/
wait_queue_head_t wait;
@@ -138,6 +133,17 @@ struct sbitmap_queue {
* sbitmap_queue_get_shallow()
*/
unsigned int min_shallow_depth;
+
+ /**
+ * @completion_cnt: Number of bits cleared passed to the
+ * wakeup function.
+ */
+ atomic_t completion_cnt;
+
+ /**
+ * @wakeup_cnt: Number of thread wake ups issued.
+ */
+ atomic_t wakeup_cnt;
};
/**
diff --git a/include/linux/sed-opal.h b/include/linux/sed-opal.h
index 6f837bb6c715..31ac562a17d7 100644
--- a/include/linux/sed-opal.h
+++ b/include/linux/sed-opal.h
@@ -11,7 +11,8 @@
#define LINUX_OPAL_H
#include <uapi/linux/sed-opal.h>
-#include <linux/kernel.h>
+#include <linux/compiler_types.h>
+#include <linux/types.h>
struct opal_dev;
diff --git a/include/linux/uio.h b/include/linux/uio.h
index 87fc3d0dda98..9f158238edba 100644
--- a/include/linux/uio.h
+++ b/include/linux/uio.h
@@ -250,8 +250,14 @@ void iov_iter_pipe(struct iov_iter *i, unsigned int direction, struct pipe_inode
void iov_iter_discard(struct iov_iter *i, unsigned int direction, size_t count);
void iov_iter_xarray(struct iov_iter *i, unsigned int direction, struct xarray *xarray,
loff_t start, size_t count);
+ssize_t iov_iter_get_pages(struct iov_iter *i, struct page **pages,
+ size_t maxsize, unsigned maxpages, size_t *start,
+ unsigned gup_flags);
ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
size_t maxsize, unsigned maxpages, size_t *start);
+ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
+ struct page ***pages, size_t maxsize, size_t *start,
+ unsigned gup_flags);
ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i, struct page ***pages,
size_t maxsize, size_t *start);
int iov_iter_npages(const struct iov_iter *i, int maxpages);
diff --git a/include/linux/wait.h b/include/linux/wait.h
index 7f5a51aae0a7..a0307b516b09 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -209,7 +209,7 @@ __remove_wait_queue(struct wait_queue_head *wq_head, struct wait_queue_entry *wq
list_del(&wq_entry->entry);
}
-void __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
+int __wake_up(struct wait_queue_head *wq_head, unsigned int mode, int nr, void *key);
void __wake_up_locked_key(struct wait_queue_head *wq_head, unsigned int mode, void *key);
void __wake_up_locked_key_bookmark(struct wait_queue_head *wq_head,
unsigned int mode, void *key, wait_queue_entry_t *bookmark);
diff --git a/include/trace/events/iocost.h b/include/trace/events/iocost.h
index 6d1626e7a4ce..af8bfed528fc 100644
--- a/include/trace/events/iocost.h
+++ b/include/trace/events/iocost.h
@@ -38,7 +38,7 @@ DECLARE_EVENT_CLASS(iocost_iocg_state,
__assign_str(cgroup, path);
__entry->now = now->now;
__entry->vnow = now->vnow;
- __entry->vrate = now->vrate;
+ __entry->vrate = iocg->ioc->vtime_base_rate;
__entry->last_period = last_period;
__entry->cur_period = cur_period;
__entry->vtime = vtime;
@@ -160,7 +160,7 @@ TRACE_EVENT(iocost_ioc_vrate_adj,
TP_fast_assign(
__assign_str(devname, ioc_name(ioc));
- __entry->old_vrate = atomic64_read(&ioc->vtime_rate);
+ __entry->old_vrate = ioc->vtime_base_rate;
__entry->new_vrate = new_vrate;
__entry->busy_level = ioc->busy_level;
__entry->read_missed_ppm = missed_ppm[READ];
diff --git a/include/uapi/linux/pktcdvd.h b/include/uapi/linux/pktcdvd.h
deleted file mode 100644
index 9cbb55d21c94..000000000000
--- a/include/uapi/linux/pktcdvd.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */
-/*
- * Copyright (C) 2000 Jens Axboe <axboe@suse.de>
- * Copyright (C) 2001-2004 Peter Osterlund <petero2@telia.com>
- *
- * May be copied or modified under the terms of the GNU General Public
- * License. See linux/COPYING for more information.
- *
- * Packet writing layer for ATAPI and SCSI CD-R, CD-RW, DVD-R, and
- * DVD-RW devices.
- *
- */
-#ifndef _UAPI__PKTCDVD_H
-#define _UAPI__PKTCDVD_H
-
-#include <linux/types.h>
-
-/*
- * 1 for normal debug messages, 2 is very verbose. 0 to turn it off.
- */
-#define PACKET_DEBUG 1
-
-#define MAX_WRITERS 8
-
-#define PKT_RB_POOL_SIZE 512
-
-/*
- * How long we should hold a non-full packet before starting data gathering.
- */
-#define PACKET_WAIT_TIME (HZ * 5 / 1000)
-
-/*
- * use drive write caching -- we need deferred error handling to be
- * able to successfully recover with this option (drive will return good
- * status as soon as the cdb is validated).
- */
-#if defined(CONFIG_CDROM_PKTCDVD_WCACHE)
-#define USE_WCACHING 1
-#else
-#define USE_WCACHING 0
-#endif
-
-/*
- * No user-servicable parts beyond this point ->
- */
-
-/*
- * device types
- */
-#define PACKET_CDR 1
-#define PACKET_CDRW 2
-#define PACKET_DVDR 3
-#define PACKET_DVDRW 4
-
-/*
- * flags
- */
-#define PACKET_WRITABLE 1 /* pd is writable */
-#define PACKET_NWA_VALID 2 /* next writable address valid */
-#define PACKET_LRA_VALID 3 /* last recorded address valid */
-#define PACKET_MERGE_SEGS 4 /* perform segment merging to keep */
- /* underlying cdrom device happy */
-
-/*
- * Disc status -- from READ_DISC_INFO
- */
-#define PACKET_DISC_EMPTY 0
-#define PACKET_DISC_INCOMPLETE 1
-#define PACKET_DISC_COMPLETE 2
-#define PACKET_DISC_OTHER 3
-
-/*
- * write type, and corresponding data block type
- */
-#define PACKET_MODE1 1
-#define PACKET_MODE2 2
-#define PACKET_BLOCK_MODE1 8
-#define PACKET_BLOCK_MODE2 10
-
-/*
- * Last session/border status
- */
-#define PACKET_SESSION_EMPTY 0
-#define PACKET_SESSION_INCOMPLETE 1
-#define PACKET_SESSION_RESERVED 2
-#define PACKET_SESSION_COMPLETE 3
-
-#define PACKET_MCN "4a656e734178626f65323030300000"
-
-#undef PACKET_USE_LS
-
-#define PKT_CTRL_CMD_SETUP 0
-#define PKT_CTRL_CMD_TEARDOWN 1
-#define PKT_CTRL_CMD_STATUS 2
-
-struct pkt_ctrl_command {
- __u32 command; /* in: Setup, teardown, status */
- __u32 dev_index; /* in/out: Device index */
- __u32 dev; /* in/out: Device nr for cdrw device */
- __u32 pkt_dev; /* in/out: Device nr for packet device */
- __u32 num_devices; /* out: Largest device index + 1 */
- __u32 padding; /* Not used */
-};
-
-/*
- * packet ioctls
- */
-#define PACKET_IOCTL_MAGIC ('X')
-#define PACKET_CTRL_CMD _IOWR(PACKET_IOCTL_MAGIC, 1, struct pkt_ctrl_command)
-
-
-#endif /* _UAPI__PKTCDVD_H */
diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h
index 2573772e2fb3..1fed3c9294fc 100644
--- a/include/uapi/linux/sed-opal.h
+++ b/include/uapi/linux/sed-opal.h
@@ -44,6 +44,11 @@ enum opal_lock_state {
OPAL_LK = 0x04, /* 0100 */
};
+enum opal_lock_flags {
+ /* IOC_OPAL_SAVE will also store the provided key for locking */
+ OPAL_SAVE_FOR_LOCK = 0x01,
+};
+
struct opal_key {
__u8 lr;
__u8 key_len;
@@ -76,7 +81,8 @@ struct opal_user_lr_setup {
struct opal_lock_unlock {
struct opal_session_info session;
__u32 l_state;
- __u8 __align[4];
+ __u16 flags;
+ __u8 __align[2];
};
struct opal_new_pw {
diff --git a/io_uring/rw.c b/io_uring/rw.c
index 77576835a848..b9cac5706e8d 100644
--- a/io_uring/rw.c
+++ b/io_uring/rw.c
@@ -671,6 +671,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
ret = kiocb_set_rw_flags(kiocb, rw->flags);
if (unlikely(ret))
return ret;
+ kiocb->ki_flags |= IOCB_ALLOC_CACHE;
/*
* If the file is marked O_NONBLOCK, still allow retry for it if it
@@ -686,7 +687,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
return -EOPNOTSUPP;
kiocb->private = NULL;
- kiocb->ki_flags |= IOCB_HIPRI | IOCB_ALLOC_CACHE;
+ kiocb->ki_flags |= IOCB_HIPRI;
kiocb->ki_complete = io_complete_rw_iopoll;
req->iopoll_completed = 0;
} else {
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
index 9860bb9a847c..133b74730738 100644
--- a/kernel/sched/wait.c
+++ b/kernel/sched/wait.c
@@ -121,11 +121,12 @@ static int __wake_up_common(struct wait_queue_head *wq_head, unsigned int mode,
return nr_exclusive;
}
-static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
+static int __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int mode,
int nr_exclusive, int wake_flags, void *key)
{
unsigned long flags;
wait_queue_entry_t bookmark;
+ int remaining = nr_exclusive;
bookmark.flags = 0;
bookmark.private = NULL;
@@ -134,10 +135,12 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
do {
spin_lock_irqsave(&wq_head->lock, flags);
- nr_exclusive = __wake_up_common(wq_head, mode, nr_exclusive,
+ remaining = __wake_up_common(wq_head, mode, remaining,
wake_flags, key, &bookmark);
spin_unlock_irqrestore(&wq_head->lock, flags);
} while (bookmark.flags & WQ_FLAG_BOOKMARK);
+
+ return nr_exclusive - remaining;
}
/**
@@ -147,13 +150,14 @@ static void __wake_up_common_lock(struct wait_queue_head *wq_head, unsigned int
* @nr_exclusive: how many wake-one or wake-many threads to wake up
* @key: is directly passed to the wakeup function
*
- * If this function wakes up a task, it executes a full memory barrier before
- * accessing the task state.
+ * If this function wakes up a task, it executes a full memory barrier
+ * before accessing the task state. Returns the number of exclusive
+ * tasks that were awaken.
*/
-void __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
- int nr_exclusive, void *key)
+int __wake_up(struct wait_queue_head *wq_head, unsigned int mode,
+ int nr_exclusive, void *key)
{
- __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
+ return __wake_up_common_lock(wq_head, mode, nr_exclusive, 0, key);
}
EXPORT_SYMBOL(__wake_up);
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index a995ea1ef849..918a7d12df8f 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -721,7 +721,7 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop);
*/
/**
- * blk_trace_ioctl: - handle the ioctls associated with tracing
+ * blk_trace_ioctl - handle the ioctls associated with tracing
* @bdev: the block device
* @cmd: the ioctl cmd
* @arg: the argument data, if any
@@ -769,7 +769,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
}
/**
- * blk_trace_shutdown: - stop and cleanup trace structures
+ * blk_trace_shutdown - stop and cleanup trace structures
* @q: the request queue associated with the device
*
**/
@@ -1548,7 +1548,8 @@ blk_trace_event_print_binary(struct trace_iterator *iter, int flags,
static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
{
- if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
+ if ((iter->ent->type != TRACE_BLK) ||
+ !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
return TRACE_TYPE_UNHANDLED;
return print_one_line(iter, true);
diff --git a/lib/iov_iter.c b/lib/iov_iter.c
index 98e8425b060d..f9a3ff37ecd1 100644
--- a/lib/iov_iter.c
+++ b/lib/iov_iter.c
@@ -1431,7 +1431,8 @@ static struct page *first_bvec_segment(const struct iov_iter *i,
static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
struct page ***pages, size_t maxsize,
- unsigned int maxpages, size_t *start)
+ unsigned int maxpages, size_t *start,
+ unsigned int gup_flags)
{
unsigned int n;
@@ -1443,7 +1444,6 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
maxsize = MAX_RW_COUNT;
if (likely(user_backed_iter(i))) {
- unsigned int gup_flags = 0;
unsigned long addr;
int res;
@@ -1493,33 +1493,49 @@ static ssize_t __iov_iter_get_pages_alloc(struct iov_iter *i,
return -EFAULT;
}
-ssize_t iov_iter_get_pages2(struct iov_iter *i,
+ssize_t iov_iter_get_pages(struct iov_iter *i,
struct page **pages, size_t maxsize, unsigned maxpages,
- size_t *start)
+ size_t *start, unsigned gup_flags)
{
if (!maxpages)
return 0;
BUG_ON(!pages);
- return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages, start);
+ return __iov_iter_get_pages_alloc(i, &pages, maxsize, maxpages,
+ start, gup_flags);
+}
+EXPORT_SYMBOL_GPL(iov_iter_get_pages);
+
+ssize_t iov_iter_get_pages2(struct iov_iter *i, struct page **pages,
+ size_t maxsize, unsigned maxpages, size_t *start)
+{
+ return iov_iter_get_pages(i, pages, maxsize, maxpages, start, 0);
}
EXPORT_SYMBOL(iov_iter_get_pages2);
-ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
+ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
struct page ***pages, size_t maxsize,
- size_t *start)
+ size_t *start, unsigned gup_flags)
{
ssize_t len;
*pages = NULL;
- len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start);
+ len = __iov_iter_get_pages_alloc(i, pages, maxsize, ~0U, start,
+ gup_flags);
if (len <= 0) {
kvfree(*pages);
*pages = NULL;
}
return len;
}
+EXPORT_SYMBOL_GPL(iov_iter_get_pages_alloc);
+
+ssize_t iov_iter_get_pages_alloc2(struct iov_iter *i,
+ struct page ***pages, size_t maxsize, size_t *start)
+{
+ return iov_iter_get_pages_alloc(i, pages, maxsize, start, 0);
+}
EXPORT_SYMBOL(iov_iter_get_pages_alloc2);
size_t csum_and_copy_from_iter(void *addr, size_t bytes, __wsum *csum,
diff --git a/lib/lru_cache.c b/lib/lru_cache.c
index dc35464216d3..b3d9187611de 100644
--- a/lib/lru_cache.c
+++ b/lib/lru_cache.c
@@ -60,17 +60,6 @@ int lc_try_lock(struct lru_cache *lc)
} while (unlikely (val == LC_PARANOIA));
/* Spin until no-one is inside a PARANOIA_ENTRY()/RETURN() section. */
return 0 == val;
-#if 0
- /* Alternative approach, spin in case someone enters or leaves a
- * PARANOIA_ENTRY()/RETURN() section. */
- unsigned long old, new, val;
- do {
- old = lc->flags & LC_PARANOIA;
- new = old | LC_LOCKED;
- val = cmpxchg(&lc->flags, old, new);
- } while (unlikely (val == (old ^ LC_PARANOIA)));
- return old == val;
-#endif
}
/**
@@ -364,7 +353,7 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, unsig
struct lc_element *e;
PARANOIA_ENTRY();
- if (lc->flags & LC_STARVING) {
+ if (test_bit(__LC_STARVING, &lc->flags)) {
++lc->starving;
RETURN(NULL);
}
@@ -417,7 +406,7 @@ static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, unsig
* the LRU element, we have to wait ...
*/
if (!lc_unused_element_available(lc)) {
- __set_bit(__LC_STARVING, &lc->flags);
+ set_bit(__LC_STARVING, &lc->flags);
RETURN(NULL);
}
@@ -586,48 +575,6 @@ struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i)
}
/**
- * lc_index_of
- * @lc: the lru cache to operate on
- * @e: the element to query for its index position in lc->element
- */
-unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e)
-{
- PARANOIA_LC_ELEMENT(lc, e);
- return e->lc_index;
-}
-
-/**
- * lc_set - associate index with label
- * @lc: the lru cache to operate on
- * @enr: the label to set
- * @index: the element index to associate label with.
- *
- * Used to initialize the active set to some previously recorded state.
- */
-void lc_set(struct lru_cache *lc, unsigned int enr, int index)
-{
- struct lc_element *e;
- struct list_head *lh;
-
- if (index < 0 || index >= lc->nr_elements)
- return;
-
- e = lc_element_by_index(lc, index);
- BUG_ON(e->lc_number != e->lc_new_number);
- BUG_ON(e->refcnt != 0);
-
- e->lc_number = e->lc_new_number = enr;
- hlist_del_init(&e->colision);
- if (enr == LC_FREE)
- lh = &lc->free;
- else {
- hlist_add_head(&e->colision, lc_hash_slot(lc, enr));
- lh = &lc->lru;
- }
- list_move(&e->list, lh);
-}
-
-/**
* lc_seq_dump_details - Dump a complete LRU cache to seq in textual form.
* @lc: the lru cache to operate on
* @seq: the &struct seq_file pointer to seq_printf into
@@ -661,7 +608,6 @@ void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext
EXPORT_SYMBOL(lc_create);
EXPORT_SYMBOL(lc_reset);
EXPORT_SYMBOL(lc_destroy);
-EXPORT_SYMBOL(lc_set);
EXPORT_SYMBOL(lc_del);
EXPORT_SYMBOL(lc_try_get);
EXPORT_SYMBOL(lc_find);
@@ -669,7 +615,6 @@ EXPORT_SYMBOL(lc_get);
EXPORT_SYMBOL(lc_put);
EXPORT_SYMBOL(lc_committed);
EXPORT_SYMBOL(lc_element_by_index);
-EXPORT_SYMBOL(lc_index_of);
EXPORT_SYMBOL(lc_seq_printf_stats);
EXPORT_SYMBOL(lc_seq_dump_details);
EXPORT_SYMBOL(lc_try_lock);
diff --git a/lib/raid6/algos.c b/lib/raid6/algos.c
index 39b74221f4a7..a22a05c9af8a 100644
--- a/lib/raid6/algos.c
+++ b/lib/raid6/algos.c
@@ -18,12 +18,10 @@
#else
#include <linux/module.h>
#include <linux/gfp.h>
-#if !RAID6_USE_EMPTY_ZERO_PAGE
/* In .bss so it's zeroed */
const char raid6_empty_zero_page[PAGE_SIZE] __attribute__((aligned(256)));
EXPORT_SYMBOL(raid6_empty_zero_page);
#endif
-#endif
struct raid6_calls raid6_call;
EXPORT_SYMBOL_GPL(raid6_call);
diff --git a/lib/sbitmap.c b/lib/sbitmap.c
index 58de526ff051..1fcede228fa2 100644
--- a/lib/sbitmap.c
+++ b/lib/sbitmap.c
@@ -434,6 +434,8 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
sbq->wake_batch = sbq_calc_wake_batch(sbq, depth);
atomic_set(&sbq->wake_index, 0);
atomic_set(&sbq->ws_active, 0);
+ atomic_set(&sbq->completion_cnt, 0);
+ atomic_set(&sbq->wakeup_cnt, 0);
sbq->ws = kzalloc_node(SBQ_WAIT_QUEUES * sizeof(*sbq->ws), flags, node);
if (!sbq->ws) {
@@ -441,40 +443,21 @@ int sbitmap_queue_init_node(struct sbitmap_queue *sbq, unsigned int depth,
return -ENOMEM;
}
- for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
+ for (i = 0; i < SBQ_WAIT_QUEUES; i++)
init_waitqueue_head(&sbq->ws[i].wait);
- atomic_set(&sbq->ws[i].wait_cnt, sbq->wake_batch);
- }
return 0;
}
EXPORT_SYMBOL_GPL(sbitmap_queue_init_node);
-static inline void __sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
- unsigned int wake_batch)
-{
- int i;
-
- if (sbq->wake_batch != wake_batch) {
- WRITE_ONCE(sbq->wake_batch, wake_batch);
- /*
- * Pairs with the memory barrier in sbitmap_queue_wake_up()
- * to ensure that the batch size is updated before the wait
- * counts.
- */
- smp_mb();
- for (i = 0; i < SBQ_WAIT_QUEUES; i++)
- atomic_set(&sbq->ws[i].wait_cnt, 1);
- }
-}
-
static void sbitmap_queue_update_wake_batch(struct sbitmap_queue *sbq,
unsigned int depth)
{
unsigned int wake_batch;
wake_batch = sbq_calc_wake_batch(sbq, depth);
- __sbitmap_queue_update_wake_batch(sbq, wake_batch);
+ if (sbq->wake_batch != wake_batch)
+ WRITE_ONCE(sbq->wake_batch, wake_batch);
}
void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
@@ -488,7 +471,8 @@ void sbitmap_queue_recalculate_wake_batch(struct sbitmap_queue *sbq,
wake_batch = clamp_val(depth / SBQ_WAIT_QUEUES,
min_batch, SBQ_WAKE_BATCH);
- __sbitmap_queue_update_wake_batch(sbq, wake_batch);
+
+ WRITE_ONCE(sbq->wake_batch, wake_batch);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_recalculate_wake_batch);
@@ -576,106 +560,56 @@ void sbitmap_queue_min_shallow_depth(struct sbitmap_queue *sbq,
}
EXPORT_SYMBOL_GPL(sbitmap_queue_min_shallow_depth);
-static struct sbq_wait_state *sbq_wake_ptr(struct sbitmap_queue *sbq)
+static void __sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr)
{
int i, wake_index;
if (!atomic_read(&sbq->ws_active))
- return NULL;
+ return;
wake_index = atomic_read(&sbq->wake_index);
for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
struct sbq_wait_state *ws = &sbq->ws[wake_index];
- if (waitqueue_active(&ws->wait) && atomic_read(&ws->wait_cnt)) {
- if (wake_index != atomic_read(&sbq->wake_index))
- atomic_set(&sbq->wake_index, wake_index);
- return ws;
- }
-
+ /*
+ * Advance the index before checking the current queue.
+ * It improves fairness, by ensuring the queue doesn't
+ * need to be fully emptied before trying to wake up
+ * from the next one.
+ */
wake_index = sbq_index_inc(wake_index);
+
+ /*
+ * It is sufficient to wake up at least one waiter to
+ * guarantee forward progress.
+ */
+ if (waitqueue_active(&ws->wait) &&
+ wake_up_nr(&ws->wait, nr))
+ break;
}
- return NULL;
+ if (wake_index != atomic_read(&sbq->wake_index))
+ atomic_set(&sbq->wake_index, wake_index);
}
-static bool __sbq_wake_up(struct sbitmap_queue *sbq, int *nr)
+void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr)
{
- struct sbq_wait_state *ws;
- unsigned int wake_batch;
- int wait_cnt, cur, sub;
- bool ret;
+ unsigned int wake_batch = READ_ONCE(sbq->wake_batch);
+ unsigned int wakeups;
- if (*nr <= 0)
- return false;
+ if (!atomic_read(&sbq->ws_active))
+ return;
- ws = sbq_wake_ptr(sbq);
- if (!ws)
- return false;
+ atomic_add(nr, &sbq->completion_cnt);
+ wakeups = atomic_read(&sbq->wakeup_cnt);
- cur = atomic_read(&ws->wait_cnt);
do {
- /*
- * For concurrent callers of this, callers should call this
- * function again to wakeup a new batch on a different 'ws'.
- */
- if (cur == 0)
- return true;
- sub = min(*nr, cur);
- wait_cnt = cur - sub;
- } while (!atomic_try_cmpxchg(&ws->wait_cnt, &cur, wait_cnt));
-
- /*
- * If we decremented queue without waiters, retry to avoid lost
- * wakeups.
- */
- if (wait_cnt > 0)
- return !waitqueue_active(&ws->wait);
-
- *nr -= sub;
-
- /*
- * When wait_cnt == 0, we have to be particularly careful as we are
- * responsible to reset wait_cnt regardless whether we've actually
- * woken up anybody. But in case we didn't wakeup anybody, we still
- * need to retry.
- */
- ret = !waitqueue_active(&ws->wait);
- wake_batch = READ_ONCE(sbq->wake_batch);
-
- /*
- * Wake up first in case that concurrent callers decrease wait_cnt
- * while waitqueue is empty.
- */
- wake_up_nr(&ws->wait, wake_batch);
+ if (atomic_read(&sbq->completion_cnt) - wakeups < wake_batch)
+ return;
+ } while (!atomic_try_cmpxchg(&sbq->wakeup_cnt,
+ &wakeups, wakeups + wake_batch));
- /*
- * Pairs with the memory barrier in sbitmap_queue_resize() to
- * ensure that we see the batch size update before the wait
- * count is reset.
- *
- * Also pairs with the implicit barrier between decrementing wait_cnt
- * and checking for waitqueue_active() to make sure waitqueue_active()
- * sees result of the wakeup if atomic_dec_return() has seen the result
- * of atomic_set().
- */
- smp_mb__before_atomic();
-
- /*
- * Increase wake_index before updating wait_cnt, otherwise concurrent
- * callers can see valid wait_cnt in old waitqueue, which can cause
- * invalid wakeup on the old waitqueue.
- */
- sbq_index_atomic_inc(&sbq->wake_index);
- atomic_set(&ws->wait_cnt, wake_batch);
-
- return ret || *nr;
-}
-
-void sbitmap_queue_wake_up(struct sbitmap_queue *sbq, int nr)
-{
- while (__sbq_wake_up(sbq, &nr))
- ;
+ __sbitmap_queue_wake_up(sbq, wake_batch);
}
EXPORT_SYMBOL_GPL(sbitmap_queue_wake_up);
@@ -792,9 +726,7 @@ void sbitmap_queue_show(struct sbitmap_queue *sbq, struct seq_file *m)
seq_puts(m, "ws={\n");
for (i = 0; i < SBQ_WAIT_QUEUES; i++) {
struct sbq_wait_state *ws = &sbq->ws[i];
-
- seq_printf(m, "\t{.wait_cnt=%d, .wait=%s},\n",
- atomic_read(&ws->wait_cnt),
+ seq_printf(m, "\t{.wait=%s},\n",
waitqueue_active(&ws->wait) ? "active" : "inactive");
}
seq_puts(m, "}\n");
diff --git a/lib/scatterlist.c b/lib/scatterlist.c
index c8c3d675845c..a0ad2a7959b5 100644
--- a/lib/scatterlist.c
+++ b/lib/scatterlist.c
@@ -410,6 +410,15 @@ static struct scatterlist *get_next_sg(struct sg_append_table *table,
return new_sg;
}
+static bool pages_are_mergeable(struct page *a, struct page *b)
+{
+ if (page_to_pfn(a) != page_to_pfn(b) + 1)
+ return false;
+ if (!zone_device_pages_have_same_pgmap(a, b))
+ return false;
+ return true;
+}
+
/**
* sg_alloc_append_table_from_pages - Allocate and initialize an append sg
* table from an array of pages
@@ -447,6 +456,7 @@ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append,
unsigned int chunks, cur_page, seg_len, i, prv_len = 0;
unsigned int added_nents = 0;
struct scatterlist *s = sgt_append->prv;
+ struct page *last_pg;
/*
* The algorithm below requires max_segment to be aligned to PAGE_SIZE
@@ -460,21 +470,17 @@ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append,
return -EOPNOTSUPP;
if (sgt_append->prv) {
- unsigned long paddr =
- (page_to_pfn(sg_page(sgt_append->prv)) * PAGE_SIZE +
- sgt_append->prv->offset + sgt_append->prv->length) /
- PAGE_SIZE;
-
if (WARN_ON(offset))
return -EINVAL;
/* Merge contiguous pages into the last SG */
prv_len = sgt_append->prv->length;
- while (n_pages && page_to_pfn(pages[0]) == paddr) {
+ last_pg = sg_page(sgt_append->prv);
+ while (n_pages && pages_are_mergeable(last_pg, pages[0])) {
if (sgt_append->prv->length + PAGE_SIZE > max_segment)
break;
sgt_append->prv->length += PAGE_SIZE;
- paddr++;
+ last_pg = pages[0];
pages++;
n_pages--;
}
@@ -488,7 +494,7 @@ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append,
for (i = 1; i < n_pages; i++) {
seg_len += PAGE_SIZE;
if (seg_len >= max_segment ||
- page_to_pfn(pages[i]) != page_to_pfn(pages[i - 1]) + 1) {
+ !pages_are_mergeable(pages[i], pages[i - 1])) {
chunks++;
seg_len = 0;
}
@@ -504,8 +510,7 @@ int sg_alloc_append_table_from_pages(struct sg_append_table *sgt_append,
for (j = cur_page + 1; j < n_pages; j++) {
seg_len += PAGE_SIZE;
if (seg_len >= max_segment ||
- page_to_pfn(pages[j]) !=
- page_to_pfn(pages[j - 1]) + 1)
+ !pages_are_mergeable(pages[j], pages[j - 1]))
break;
}
diff --git a/mm/gup.c b/mm/gup.c
index 3b7bc2c1fd44..98aac6201e1b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -123,6 +123,9 @@ retry:
*/
struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
{
+ if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
+ return NULL;
+
if (flags & FOLL_GET)
return try_get_folio(page, refs);
else if (flags & FOLL_PIN) {
@@ -202,17 +205,22 @@ static void gup_put_folio(struct folio *folio, int refs, unsigned int flags)
* time. Cases: please see the try_grab_folio() documentation, with
* "refs=1".
*
- * Return: true for success, or if no action was required (if neither FOLL_PIN
- * nor FOLL_GET was set, nothing is done). False for failure: FOLL_GET or
- * FOLL_PIN was set, but the page could not be grabbed.
+ * Return: 0 for success, or if no action was required (if neither FOLL_PIN
+ * nor FOLL_GET was set, nothing is done). A negative error code for failure:
+ *
+ * -ENOMEM FOLL_GET or FOLL_PIN was set, but the page could not
+ * be grabbed.
*/
-bool __must_check try_grab_page(struct page *page, unsigned int flags)
+int __must_check try_grab_page(struct page *page, unsigned int flags)
{
struct folio *folio = page_folio(page);
WARN_ON_ONCE((flags & (FOLL_GET | FOLL_PIN)) == (FOLL_GET | FOLL_PIN));
if (WARN_ON_ONCE(folio_ref_count(folio) <= 0))
- return false;
+ return -ENOMEM;
+
+ if (unlikely(!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)))
+ return -EREMOTEIO;
if (flags & FOLL_GET)
folio_ref_inc(folio);
@@ -232,7 +240,7 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags)
node_stat_mod_folio(folio, NR_FOLL_PIN_ACQUIRED, 1);
}
- return true;
+ return 0;
}
/**
@@ -624,10 +632,12 @@ retry:
!PageAnonExclusive(page), page);
/* try_grab_page() does nothing unless FOLL_GET or FOLL_PIN is set. */
- if (unlikely(!try_grab_page(page, flags))) {
- page = ERR_PTR(-ENOMEM);
+ ret = try_grab_page(page, flags);
+ if (unlikely(ret)) {
+ page = ERR_PTR(ret);
goto out;
}
+
/*
* We need to make the page accessible if and only if we are going
* to access its content (the FOLL_PIN case). Please see
@@ -960,10 +970,9 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
goto unmap;
*page = pte_page(*pte);
}
- if (unlikely(!try_grab_page(*page, gup_flags))) {
- ret = -ENOMEM;
+ ret = try_grab_page(*page, gup_flags);
+ if (unlikely(ret))
goto unmap;
- }
out:
ret = 0;
unmap:
@@ -1058,6 +1067,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
return -EOPNOTSUPP;
+ if ((gup_flags & FOLL_LONGTERM) && (gup_flags & FOLL_PCI_P2PDMA))
+ return -EOPNOTSUPP;
+
if (vma_is_secretmem(vma))
return -EFAULT;
@@ -2534,9 +2546,15 @@ static int __gup_device_huge(unsigned long pfn, unsigned long addr,
undo_dev_pagemap(nr, nr_start, flags, pages);
break;
}
+
+ if (!(flags & FOLL_PCI_P2PDMA) && is_pci_p2pdma_page(page)) {
+ undo_dev_pagemap(nr, nr_start, flags, pages);
+ break;
+ }
+
SetPageReferenced(page);
pages[*nr] = page;
- if (unlikely(!try_grab_page(page, flags))) {
+ if (unlikely(try_grab_page(page, flags))) {
undo_dev_pagemap(nr, nr_start, flags, pages);
break;
}
@@ -3018,7 +3036,8 @@ static int internal_get_user_pages_fast(unsigned long start,
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
FOLL_FORCE | FOLL_PIN | FOLL_GET |
- FOLL_FAST_ONLY | FOLL_NOFAULT)))
+ FOLL_FAST_ONLY | FOLL_NOFAULT |
+ FOLL_PCI_P2PDMA)))
return -EINVAL;
if (gup_flags & FOLL_PIN)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 811d19b5c4f6..ffbea56a8711 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1035,6 +1035,7 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn = pmd_pfn(*pmd);
struct mm_struct *mm = vma->vm_mm;
struct page *page;
+ int ret;
assert_spin_locked(pmd_lockptr(mm, pmd));
@@ -1066,8 +1067,9 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr,
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- if (!try_grab_page(page, flags))
- page = ERR_PTR(-ENOMEM);
+ ret = try_grab_page(page, flags);
+ if (ret)
+ page = ERR_PTR(ret);
return page;
}
@@ -1193,6 +1195,7 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
unsigned long pfn = pud_pfn(*pud);
struct mm_struct *mm = vma->vm_mm;
struct page *page;
+ int ret;
assert_spin_locked(pud_lockptr(mm, pud));
@@ -1226,8 +1229,10 @@ struct page *follow_devmap_pud(struct vm_area_struct *vma, unsigned long addr,
if (!*pgmap)
return ERR_PTR(-EFAULT);
page = pfn_to_page(pfn);
- if (!try_grab_page(page, flags))
- page = ERR_PTR(-ENOMEM);
+
+ ret = try_grab_page(page, flags);
+ if (ret)
+ page = ERR_PTR(ret);
return page;
}
@@ -1435,6 +1440,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
{
struct mm_struct *mm = vma->vm_mm;
struct page *page;
+ int ret;
assert_spin_locked(pmd_lockptr(mm, pmd));
@@ -1459,8 +1465,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
VM_BUG_ON_PAGE((flags & FOLL_PIN) && PageAnon(page) &&
!PageAnonExclusive(page), page);
- if (!try_grab_page(page, flags))
- return ERR_PTR(-ENOMEM);
+ ret = try_grab_page(page, flags);
+ if (ret)
+ return ERR_PTR(ret);
if (flags & FOLL_TOUCH)
touch_pmd(vma, addr, pmd, flags & FOLL_WRITE);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index e36ca75311a5..3d9f4abec17c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -6372,8 +6372,10 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
* tables. If the huge page is present, then the tail
* pages must also be present. The ptl prevents the
* head page and tail pages from being rearranged in
- * any way. So this page must be available at this
- * point, unless the page refcount overflowed:
+ * any way. As this is hugetlb, the pages will never
+ * be p2pdma or not longterm pinable. So this page
+ * must be available at this point, unless the page
+ * refcount overflowed:
*/
if (WARN_ON_ONCE(!try_grab_folio(pages[i], refs,
flags))) {
@@ -7254,14 +7256,15 @@ retry:
page = pte_page(pte) +
((address & ~huge_page_mask(h)) >> PAGE_SHIFT);
/*
- * try_grab_page() should always succeed here, because: a) we
- * hold the pmd (ptl) lock, and b) we've just checked that the
- * huge pmd (head) page is present in the page tables. The ptl
- * prevents the head page and tail pages from being rearranged
- * in any way. So this page must be available at this point,
- * unless the page refcount overflowed:
+ * try_grab_page() should always be able to get the page here,
+ * because: a) we hold the pmd (ptl) lock, and b) we've just
+ * checked that the huge pmd (head) page is present in the
+ * page tables. The ptl prevents the head page and tail pages
+ * from being rearranged in any way. So this page must be
+ * available at this point, unless the page refcount
+ * overflowed:
*/
- if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+ if (try_grab_page(page, flags)) {
page = NULL;
goto out;
}
@@ -7299,7 +7302,7 @@ retry:
pte = huge_ptep_get((pte_t *)pud);
if (pte_present(pte)) {
page = pud_page(*pud) + ((address & ~PUD_MASK) >> PAGE_SHIFT);
- if (WARN_ON_ONCE(!try_grab_page(page, flags))) {
+ if (try_grab_page(page, flags)) {
page = NULL;
goto out;
}