summaryrefslogtreecommitdiffstats
path: root/drivers
diff options
context:
space:
mode:
Diffstat (limited to 'drivers')
-rw-r--r--drivers/block/DAC960.c160
-rw-r--r--drivers/block/Kconfig4
-rw-r--r--drivers/block/aoe/aoe.h3
-rw-r--r--drivers/block/aoe/aoecmd.c48
-rw-r--r--drivers/block/drbd/drbd_bitmap.c2
-rw-r--r--drivers/block/null_blk.c290
-rw-r--r--drivers/block/pktcdvd.c12
-rw-r--r--drivers/block/smart1,2.h278
-rw-r--r--drivers/block/zram/zram_drv.c2
-rw-r--r--drivers/lightnvm/Kconfig7
-rw-r--r--drivers/lightnvm/Makefile1
-rw-r--r--drivers/lightnvm/core.c462
-rw-r--r--drivers/lightnvm/pblk-cache.c5
-rw-r--r--drivers/lightnvm/pblk-core.c55
-rw-r--r--drivers/lightnvm/pblk-gc.c23
-rw-r--r--drivers/lightnvm/pblk-init.c104
-rw-r--r--drivers/lightnvm/pblk-map.c2
-rw-r--r--drivers/lightnvm/pblk-rb.c111
-rw-r--r--drivers/lightnvm/pblk-read.c35
-rw-r--r--drivers/lightnvm/pblk-recovery.c43
-rw-r--r--drivers/lightnvm/pblk-rl.c54
-rw-r--r--drivers/lightnvm/pblk-sysfs.c15
-rw-r--r--drivers/lightnvm/pblk-write.c23
-rw-r--r--drivers/lightnvm/pblk.h163
-rw-r--r--drivers/lightnvm/rrpc.c1625
-rw-r--r--drivers/lightnvm/rrpc.h290
-rw-r--r--drivers/md/bcache/alloc.c19
-rw-r--r--drivers/md/bcache/bcache.h24
-rw-r--r--drivers/md/bcache/btree.c10
-rw-r--r--drivers/md/bcache/closure.c47
-rw-r--r--drivers/md/bcache/closure.h60
-rw-r--r--drivers/md/bcache/debug.c7
-rw-r--r--drivers/md/bcache/io.c13
-rw-r--r--drivers/md/bcache/movinggc.c2
-rw-r--r--drivers/md/bcache/request.c29
-rw-r--r--drivers/md/bcache/super.c27
-rw-r--r--drivers/md/bcache/util.c34
-rw-r--r--drivers/md/bcache/util.h1
-rw-r--r--drivers/md/bcache/writeback.c203
-rw-r--r--drivers/md/bcache/writeback.h12
-rw-r--r--drivers/md/dm-crypt.c1
-rw-r--r--drivers/md/dm-mpath.c19
-rw-r--r--drivers/md/dm-rq.c28
-rw-r--r--drivers/md/dm.c21
-rw-r--r--drivers/nvme/host/Makefile4
-rw-r--r--drivers/nvme/host/core.c134
-rw-r--r--drivers/nvme/host/fabrics.c22
-rw-r--r--drivers/nvme/host/fabrics.h2
-rw-r--r--drivers/nvme/host/fc.c7
-rw-r--r--drivers/nvme/host/lightnvm.c185
-rw-r--r--drivers/nvme/host/multipath.c44
-rw-r--r--drivers/nvme/host/nvme.h9
-rw-r--r--drivers/nvme/host/pci.c216
-rw-r--r--drivers/nvme/host/rdma.c6
-rw-r--r--drivers/nvme/host/trace.c130
-rw-r--r--drivers/nvme/host/trace.h165
-rw-r--r--drivers/nvme/target/Kconfig2
-rw-r--r--drivers/nvme/target/core.c14
-rw-r--r--drivers/nvme/target/fabrics-cmd.c2
-rw-r--r--drivers/nvme/target/fc.c60
-rw-r--r--drivers/nvme/target/fcloop.c244
-rw-r--r--drivers/nvme/target/loop.c3
-rw-r--r--drivers/nvme/target/rdma.c83
-rw-r--r--drivers/target/Kconfig1
-rw-r--r--drivers/target/target_core_transport.c46
65 files changed, 1909 insertions, 3844 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 442e777bdfb2..728075214959 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -6619,43 +6619,27 @@ static void DAC960_DestroyProcEntries(DAC960_Controller_T *Controller)
#ifdef DAC960_GAM_MINOR
-/*
- * DAC960_gam_ioctl is the ioctl function for performing RAID operations.
-*/
-
-static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
- unsigned long Argument)
+static long DAC960_gam_get_controller_info(DAC960_ControllerInfo_T __user *UserSpaceControllerInfo)
{
- long ErrorCode = 0;
- if (!capable(CAP_SYS_ADMIN)) return -EACCES;
-
- mutex_lock(&DAC960_mutex);
- switch (Request)
- {
- case DAC960_IOCTL_GET_CONTROLLER_COUNT:
- ErrorCode = DAC960_ControllerCount;
- break;
- case DAC960_IOCTL_GET_CONTROLLER_INFO:
- {
- DAC960_ControllerInfo_T __user *UserSpaceControllerInfo =
- (DAC960_ControllerInfo_T __user *) Argument;
DAC960_ControllerInfo_T ControllerInfo;
DAC960_Controller_T *Controller;
int ControllerNumber;
+ long ErrorCode;
+
if (UserSpaceControllerInfo == NULL)
ErrorCode = -EINVAL;
else ErrorCode = get_user(ControllerNumber,
&UserSpaceControllerInfo->ControllerNumber);
if (ErrorCode != 0)
- break;
+ goto out;
ErrorCode = -ENXIO;
if (ControllerNumber < 0 ||
ControllerNumber > DAC960_ControllerCount - 1) {
- break;
+ goto out;
}
Controller = DAC960_Controllers[ControllerNumber];
if (Controller == NULL)
- break;
+ goto out;
memset(&ControllerInfo, 0, sizeof(DAC960_ControllerInfo_T));
ControllerInfo.ControllerNumber = ControllerNumber;
ControllerInfo.FirmwareType = Controller->FirmwareType;
@@ -6670,12 +6654,12 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
strcpy(ControllerInfo.FirmwareVersion, Controller->FirmwareVersion);
ErrorCode = (copy_to_user(UserSpaceControllerInfo, &ControllerInfo,
sizeof(DAC960_ControllerInfo_T)) ? -EFAULT : 0);
- break;
- }
- case DAC960_IOCTL_V1_EXECUTE_COMMAND:
- {
- DAC960_V1_UserCommand_T __user *UserSpaceUserCommand =
- (DAC960_V1_UserCommand_T __user *) Argument;
+out:
+ return ErrorCode;
+}
+
+static long DAC960_gam_v1_execute_command(DAC960_V1_UserCommand_T __user *UserSpaceUserCommand)
+{
DAC960_V1_UserCommand_T UserCommand;
DAC960_Controller_T *Controller;
DAC960_Command_T *Command = NULL;
@@ -6688,39 +6672,41 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
int ControllerNumber, DataTransferLength;
unsigned char *DataTransferBuffer = NULL;
dma_addr_t DataTransferBufferDMA;
+ long ErrorCode;
+
if (UserSpaceUserCommand == NULL) {
ErrorCode = -EINVAL;
- break;
+ goto out;
}
if (copy_from_user(&UserCommand, UserSpaceUserCommand,
sizeof(DAC960_V1_UserCommand_T))) {
ErrorCode = -EFAULT;
- break;
+ goto out;
}
ControllerNumber = UserCommand.ControllerNumber;
ErrorCode = -ENXIO;
if (ControllerNumber < 0 ||
ControllerNumber > DAC960_ControllerCount - 1)
- break;
+ goto out;
Controller = DAC960_Controllers[ControllerNumber];
if (Controller == NULL)
- break;
+ goto out;
ErrorCode = -EINVAL;
if (Controller->FirmwareType != DAC960_V1_Controller)
- break;
+ goto out;
CommandOpcode = UserCommand.CommandMailbox.Common.CommandOpcode;
DataTransferLength = UserCommand.DataTransferLength;
if (CommandOpcode & 0x80)
- break;
+ goto out;
if (CommandOpcode == DAC960_V1_DCDB)
{
if (copy_from_user(&DCDB, UserCommand.DCDB,
sizeof(DAC960_V1_DCDB_T))) {
ErrorCode = -EFAULT;
- break;
+ goto out;
}
if (DCDB.Channel >= DAC960_V1_MaxChannels)
- break;
+ goto out;
if (!((DataTransferLength == 0 &&
DCDB.Direction
== DAC960_V1_DCDB_NoDataTransfer) ||
@@ -6730,15 +6716,15 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
(DataTransferLength < 0 &&
DCDB.Direction
== DAC960_V1_DCDB_DataTransferSystemToDevice)))
- break;
+ goto out;
if (((DCDB.TransferLengthHigh4 << 16) | DCDB.TransferLength)
!= abs(DataTransferLength))
- break;
+ goto out;
DCDB_IOBUF = pci_alloc_consistent(Controller->PCIDevice,
sizeof(DAC960_V1_DCDB_T), &DCDB_IOBUFDMA);
if (DCDB_IOBUF == NULL) {
ErrorCode = -ENOMEM;
- break;
+ goto out;
}
}
ErrorCode = -ENOMEM;
@@ -6748,19 +6734,19 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
DataTransferLength,
&DataTransferBufferDMA);
if (DataTransferBuffer == NULL)
- break;
+ goto out;
}
else if (DataTransferLength < 0)
{
DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice,
-DataTransferLength, &DataTransferBufferDMA);
if (DataTransferBuffer == NULL)
- break;
+ goto out;
if (copy_from_user(DataTransferBuffer,
UserCommand.DataTransferBuffer,
-DataTransferLength)) {
ErrorCode = -EFAULT;
- break;
+ goto out;
}
}
if (CommandOpcode == DAC960_V1_DCDB)
@@ -6837,12 +6823,12 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
if (DCDB_IOBUF != NULL)
pci_free_consistent(Controller->PCIDevice, sizeof(DAC960_V1_DCDB_T),
DCDB_IOBUF, DCDB_IOBUFDMA);
- break;
- }
- case DAC960_IOCTL_V2_EXECUTE_COMMAND:
- {
- DAC960_V2_UserCommand_T __user *UserSpaceUserCommand =
- (DAC960_V2_UserCommand_T __user *) Argument;
+ out:
+ return ErrorCode;
+}
+
+static long DAC960_gam_v2_execute_command(DAC960_V2_UserCommand_T __user *UserSpaceUserCommand)
+{
DAC960_V2_UserCommand_T UserCommand;
DAC960_Controller_T *Controller;
DAC960_Command_T *Command = NULL;
@@ -6855,26 +6841,26 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
dma_addr_t DataTransferBufferDMA;
unsigned char *RequestSenseBuffer = NULL;
dma_addr_t RequestSenseBufferDMA;
+ long ErrorCode = -EINVAL;
- ErrorCode = -EINVAL;
if (UserSpaceUserCommand == NULL)
- break;
+ goto out;
if (copy_from_user(&UserCommand, UserSpaceUserCommand,
sizeof(DAC960_V2_UserCommand_T))) {
ErrorCode = -EFAULT;
- break;
+ goto out;
}
ErrorCode = -ENXIO;
ControllerNumber = UserCommand.ControllerNumber;
if (ControllerNumber < 0 ||
ControllerNumber > DAC960_ControllerCount - 1)
- break;
+ goto out;
Controller = DAC960_Controllers[ControllerNumber];
if (Controller == NULL)
- break;
+ goto out;
if (Controller->FirmwareType != DAC960_V2_Controller){
ErrorCode = -EINVAL;
- break;
+ goto out;
}
DataTransferLength = UserCommand.DataTransferLength;
ErrorCode = -ENOMEM;
@@ -6884,14 +6870,14 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
DataTransferLength,
&DataTransferBufferDMA);
if (DataTransferBuffer == NULL)
- break;
+ goto out;
}
else if (DataTransferLength < 0)
{
DataTransferBuffer = pci_alloc_consistent(Controller->PCIDevice,
-DataTransferLength, &DataTransferBufferDMA);
if (DataTransferBuffer == NULL)
- break;
+ goto out;
if (copy_from_user(DataTransferBuffer,
UserCommand.DataTransferBuffer,
-DataTransferLength)) {
@@ -7001,42 +6987,44 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
if (RequestSenseBuffer != NULL)
pci_free_consistent(Controller->PCIDevice, RequestSenseLength,
RequestSenseBuffer, RequestSenseBufferDMA);
- break;
- }
- case DAC960_IOCTL_V2_GET_HEALTH_STATUS:
- {
- DAC960_V2_GetHealthStatus_T __user *UserSpaceGetHealthStatus =
- (DAC960_V2_GetHealthStatus_T __user *) Argument;
+out:
+ return ErrorCode;
+}
+
+static long DAC960_gam_v2_get_health_status(DAC960_V2_GetHealthStatus_T __user *UserSpaceGetHealthStatus)
+{
DAC960_V2_GetHealthStatus_T GetHealthStatus;
DAC960_V2_HealthStatusBuffer_T HealthStatusBuffer;
DAC960_Controller_T *Controller;
int ControllerNumber;
+ long ErrorCode;
+
if (UserSpaceGetHealthStatus == NULL) {
ErrorCode = -EINVAL;
- break;
+ goto out;
}
if (copy_from_user(&GetHealthStatus, UserSpaceGetHealthStatus,
sizeof(DAC960_V2_GetHealthStatus_T))) {
ErrorCode = -EFAULT;
- break;
+ goto out;
}
ErrorCode = -ENXIO;
ControllerNumber = GetHealthStatus.ControllerNumber;
if (ControllerNumber < 0 ||
ControllerNumber > DAC960_ControllerCount - 1)
- break;
+ goto out;
Controller = DAC960_Controllers[ControllerNumber];
if (Controller == NULL)
- break;
+ goto out;
if (Controller->FirmwareType != DAC960_V2_Controller) {
ErrorCode = -EINVAL;
- break;
+ goto out;
}
if (copy_from_user(&HealthStatusBuffer,
GetHealthStatus.HealthStatusBuffer,
sizeof(DAC960_V2_HealthStatusBuffer_T))) {
ErrorCode = -EFAULT;
- break;
+ goto out;
}
ErrorCode = wait_event_interruptible_timeout(Controller->HealthStatusWaitQueue,
!(Controller->V2.HealthStatusBuffer->StatusChangeCounter
@@ -7046,7 +7034,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
DAC960_MonitoringTimerInterval);
if (ErrorCode == -ERESTARTSYS) {
ErrorCode = -EINTR;
- break;
+ goto out;
}
if (copy_to_user(GetHealthStatus.HealthStatusBuffer,
Controller->V2.HealthStatusBuffer,
@@ -7054,7 +7042,39 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
ErrorCode = -EFAULT;
else
ErrorCode = 0;
- }
+
+out:
+ return ErrorCode;
+}
+
+/*
+ * DAC960_gam_ioctl is the ioctl function for performing RAID operations.
+*/
+
+static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
+ unsigned long Argument)
+{
+ long ErrorCode = 0;
+ void __user *argp = (void __user *)Argument;
+ if (!capable(CAP_SYS_ADMIN)) return -EACCES;
+
+ mutex_lock(&DAC960_mutex);
+ switch (Request)
+ {
+ case DAC960_IOCTL_GET_CONTROLLER_COUNT:
+ ErrorCode = DAC960_ControllerCount;
+ break;
+ case DAC960_IOCTL_GET_CONTROLLER_INFO:
+ ErrorCode = DAC960_gam_get_controller_info(argp);
+ break;
+ case DAC960_IOCTL_V1_EXECUTE_COMMAND:
+ ErrorCode = DAC960_gam_v1_execute_command(argp);
+ break;
+ case DAC960_IOCTL_V2_EXECUTE_COMMAND:
+ ErrorCode = DAC960_gam_v2_execute_command(argp);
+ break;
+ case DAC960_IOCTL_V2_GET_HEALTH_STATUS:
+ ErrorCode = DAC960_gam_v2_get_health_status(argp);
break;
default:
ErrorCode = -ENOTTY;
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 40579d0cb3d1..ad9b687a236a 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -20,6 +20,10 @@ config BLK_DEV_NULL_BLK
tristate "Null test block driver"
select CONFIGFS_FS
+config BLK_DEV_NULL_BLK_FAULT_INJECTION
+ bool "Support fault injection for Null test block driver"
+ depends on BLK_DEV_NULL_BLK && FAULT_INJECTION
+
config BLK_DEV_FD
tristate "Normal floppy disk support"
depends on ARCH_MAY_HAVE_PC_FDC
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 9220f8e833d0..c0ebda1283cc 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -112,8 +112,7 @@ enum frame_flags {
struct frame {
struct list_head head;
u32 tag;
- struct timeval sent; /* high-res time packet was sent */
- u32 sent_jiffs; /* low-res jiffies-based sent time */
+ ktime_t sent; /* high-res time packet was sent */
ulong waited;
ulong waited_total;
struct aoetgt *t; /* parent target I belong to */
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 812fed069708..540bb60cd071 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -398,8 +398,7 @@ aoecmd_ata_rw(struct aoedev *d)
skb = skb_clone(f->skb, GFP_ATOMIC);
if (skb) {
- do_gettimeofday(&f->sent);
- f->sent_jiffs = (u32) jiffies;
+ f->sent = ktime_get();
__skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb);
aoenet_xmit(&queue);
@@ -489,8 +488,7 @@ resend(struct aoedev *d, struct frame *f)
skb = skb_clone(skb, GFP_ATOMIC);
if (skb == NULL)
return;
- do_gettimeofday(&f->sent);
- f->sent_jiffs = (u32) jiffies;
+ f->sent = ktime_get();
__skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb);
aoenet_xmit(&queue);
@@ -499,33 +497,17 @@ resend(struct aoedev *d, struct frame *f)
static int
tsince_hr(struct frame *f)
{
- struct timeval now;
- int n;
+ u64 delta = ktime_to_ns(ktime_sub(ktime_get(), f->sent));
- do_gettimeofday(&now);
- n = now.tv_usec - f->sent.tv_usec;
- n += (now.tv_sec - f->sent.tv_sec) * USEC_PER_SEC;
+ /* delta is normally under 4.2 seconds, avoid 64-bit division */
+ if (likely(delta <= UINT_MAX))
+ return (u32)delta / NSEC_PER_USEC;
- if (n < 0)
- n = -n;
+ /* avoid overflow after 71 minutes */
+ if (delta > ((u64)INT_MAX * NSEC_PER_USEC))
+ return INT_MAX;
- /* For relatively long periods, use jiffies to avoid
- * discrepancies caused by updates to the system time.
- *
- * On system with HZ of 1000, 32-bits is over 49 days
- * worth of jiffies, or over 71 minutes worth of usecs.
- *
- * Jiffies overflow is handled by subtraction of unsigned ints:
- * (gdb) print (unsigned) 2 - (unsigned) 0xfffffffe
- * $3 = 4
- * (gdb)
- */
- if (n > USEC_PER_SEC / 4) {
- n = ((u32) jiffies) - f->sent_jiffs;
- n *= USEC_PER_SEC / HZ;
- }
-
- return n;
+ return div_u64(delta, NSEC_PER_USEC);
}
static int
@@ -589,7 +571,6 @@ reassign_frame(struct frame *f)
nf->waited = 0;
nf->waited_total = f->waited_total;
nf->sent = f->sent;
- nf->sent_jiffs = f->sent_jiffs;
f->skb = skb;
return nf;
@@ -633,8 +614,7 @@ probe(struct aoetgt *t)
skb = skb_clone(f->skb, GFP_ATOMIC);
if (skb) {
- do_gettimeofday(&f->sent);
- f->sent_jiffs = (u32) jiffies;
+ f->sent = ktime_get();
__skb_queue_head_init(&queue);
__skb_queue_tail(&queue, skb);
aoenet_xmit(&queue);
@@ -1432,10 +1412,8 @@ aoecmd_ata_id(struct aoedev *d)
d->timer.function = rexmit_timer;
skb = skb_clone(skb, GFP_ATOMIC);
- if (skb) {
- do_gettimeofday(&f->sent);
- f->sent_jiffs = (u32) jiffies;
- }
+ if (skb)
+ f->sent = ktime_get();
return skb;
}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index bd97908c766f..9f4e6f502b84 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -953,7 +953,7 @@ static void drbd_bm_endio(struct bio *bio)
struct drbd_bm_aio_ctx *ctx = bio->bi_private;
struct drbd_device *device = ctx->device;
struct drbd_bitmap *b = device->bitmap;
- unsigned int idx = bm_page_to_idx(bio->bi_io_vec[0].bv_page);
+ unsigned int idx = bm_page_to_idx(bio_first_page_all(bio));
if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
!bm_test_page_unchanged(b->bm_pages[idx]))
diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c
index ad0477ae820f..6655893a3a7a 100644
--- a/drivers/block/null_blk.c
+++ b/drivers/block/null_blk.c
@@ -12,9 +12,9 @@
#include <linux/slab.h>
#include <linux/blk-mq.h>
#include <linux/hrtimer.h>
-#include <linux/lightnvm.h>
#include <linux/configfs.h>
#include <linux/badblocks.h>
+#include <linux/fault-inject.h>
#define SECTOR_SHIFT 9
#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
@@ -27,6 +27,10 @@
#define TICKS_PER_SEC 50ULL
#define TIMER_INTERVAL (NSEC_PER_SEC / TICKS_PER_SEC)
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+static DECLARE_FAULT_ATTR(null_timeout_attr);
+#endif
+
static inline u64 mb_per_tick(int mbps)
{
return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
@@ -107,7 +111,6 @@ struct nullb_device {
unsigned int hw_queue_depth; /* queue depth */
unsigned int index; /* index of the disk, only valid with a disk */
unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
- bool use_lightnvm; /* register as a LightNVM device */
bool blocking; /* blocking blk-mq device */
bool use_per_node_hctx; /* use per-node allocation for hardware context */
bool power; /* power on/off the device */
@@ -121,7 +124,6 @@ struct nullb {
unsigned int index;
struct request_queue *q;
struct gendisk *disk;
- struct nvm_dev *ndev;
struct blk_mq_tag_set *tag_set;
struct blk_mq_tag_set __tag_set;
unsigned int queue_depth;
@@ -139,7 +141,6 @@ static LIST_HEAD(nullb_list);
static struct mutex lock;
static int null_major;
static DEFINE_IDA(nullb_indexes);
-static struct kmem_cache *ppa_cache;
static struct blk_mq_tag_set tag_set;
enum {
@@ -166,6 +167,11 @@ static int g_home_node = NUMA_NO_NODE;
module_param_named(home_node, g_home_node, int, S_IRUGO);
MODULE_PARM_DESC(home_node, "Home node for the device");
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+static char g_timeout_str[80];
+module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), S_IRUGO);
+#endif
+
static int g_queue_mode = NULL_Q_MQ;
static int null_param_store_val(const char *str, int *val, int min, int max)
@@ -208,10 +214,6 @@ static int nr_devices = 1;
module_param(nr_devices, int, S_IRUGO);
MODULE_PARM_DESC(nr_devices, "Number of devices to register");
-static bool g_use_lightnvm;
-module_param_named(use_lightnvm, g_use_lightnvm, bool, S_IRUGO);
-MODULE_PARM_DESC(use_lightnvm, "Register as a LightNVM device");
-
static bool g_blocking;
module_param_named(blocking, g_blocking, bool, S_IRUGO);
MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
@@ -345,7 +347,6 @@ NULLB_DEVICE_ATTR(blocksize, uint);
NULLB_DEVICE_ATTR(irqmode, uint);
NULLB_DEVICE_ATTR(hw_queue_depth, uint);
NULLB_DEVICE_ATTR(index, uint);
-NULLB_DEVICE_ATTR(use_lightnvm, bool);
NULLB_DEVICE_ATTR(blocking, bool);
NULLB_DEVICE_ATTR(use_per_node_hctx, bool);
NULLB_DEVICE_ATTR(memory_backed, bool);
@@ -455,7 +456,6 @@ static struct configfs_attribute *nullb_device_attrs[] = {
&nullb_device_attr_irqmode,
&nullb_device_attr_hw_queue_depth,
&nullb_device_attr_index,
- &nullb_device_attr_use_lightnvm,
&nullb_device_attr_blocking,
&nullb_device_attr_use_per_node_hctx,
&nullb_device_attr_power,
@@ -573,7 +573,6 @@ static struct nullb_device *null_alloc_dev(void)
dev->blocksize = g_bs;
dev->irqmode = g_irqmode;
dev->hw_queue_depth = g_hw_queue_depth;
- dev->use_lightnvm = g_use_lightnvm;
dev->blocking = g_blocking;
dev->use_per_node_hctx = g_use_per_node_hctx;
return dev;
@@ -1352,6 +1351,12 @@ static blk_qc_t null_queue_bio(struct request_queue *q, struct bio *bio)
return BLK_QC_T_NONE;
}
+static enum blk_eh_timer_return null_rq_timed_out_fn(struct request *rq)
+{
+ pr_info("null: rq %p timed out\n", rq);
+ return BLK_EH_HANDLED;
+}
+
static int null_rq_prep_fn(struct request_queue *q, struct request *req)
{
struct nullb *nullb = q->queuedata;
@@ -1369,6 +1374,16 @@ static int null_rq_prep_fn(struct request_queue *q, struct request *req)
return BLKPREP_DEFER;
}
+static bool should_timeout_request(struct request *rq)
+{
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+ if (g_timeout_str[0])
+ return should_fail(&null_timeout_attr, 1);
+#endif
+
+ return false;
+}
+
static void null_request_fn(struct request_queue *q)
{
struct request *rq;
@@ -1376,12 +1391,20 @@ static void null_request_fn(struct request_queue *q)
while ((rq = blk_fetch_request(q)) != NULL) {
struct nullb_cmd *cmd = rq->special;
- spin_unlock_irq(q->queue_lock);
- null_handle_cmd(cmd);
- spin_lock_irq(q->queue_lock);
+ if (!should_timeout_request(rq)) {
+ spin_unlock_irq(q->queue_lock);
+ null_handle_cmd(cmd);
+ spin_lock_irq(q->queue_lock);
+ }
}
}
+static enum blk_eh_timer_return null_timeout_rq(struct request *rq, bool res)
+{
+ pr_info("null: rq %p timed out\n", rq);
+ return BLK_EH_HANDLED;
+}
+
static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
const struct blk_mq_queue_data *bd)
{
@@ -1399,12 +1422,16 @@ static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
blk_mq_start_request(bd->rq);
- return null_handle_cmd(cmd);
+ if (!should_timeout_request(bd->rq))
+ return null_handle_cmd(cmd);
+
+ return BLK_STS_OK;
}
static const struct blk_mq_ops null_mq_ops = {
.queue_rq = null_queue_rq,
.complete = null_softirq_done_fn,
+ .timeout = null_timeout_rq,
};
static void cleanup_queue(struct nullb_queue *nq)
@@ -1423,170 +1450,6 @@ static void cleanup_queues(struct nullb *nullb)
kfree(nullb->queues);
}
-#ifdef CONFIG_NVM
-
-static void null_lnvm_end_io(struct request *rq, blk_status_t status)
-{
- struct nvm_rq *rqd = rq->end_io_data;
-
- /* XXX: lighnvm core seems to expect NVM_RSP_* values here.. */
- rqd->error = status ? -EIO : 0;
- nvm_end_io(rqd);
-
- blk_put_request(rq);
-}
-
-static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd)
-{
- struct request_queue *q = dev->q;
- struct request *rq;
- struct bio *bio = rqd->bio;
-
- rq = blk_mq_alloc_request(q,
- op_is_write(bio_op(bio)) ? REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
- if (IS_ERR(rq))
- return -ENOMEM;
-
- blk_init_request_from_bio(rq, bio);
-
- rq->end_io_data = rqd;
-
- blk_execute_rq_nowait(q, NULL, rq, 0, null_lnvm_end_io);
-
- return 0;
-}
-
-static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id)
-{
- struct nullb *nullb = dev->q->queuedata;
- sector_t size = (sector_t)nullb->dev->size * 1024 * 1024ULL;
- sector_t blksize;
- struct nvm_id_group *grp;
-
- id->ver_id = 0x1;
- id->vmnt = 0;
- id->cap = 0x2;
- id->dom = 0x1;
-
- id->ppaf.blk_offset = 0;
- id->ppaf.blk_len = 16;
- id->ppaf.pg_offset = 16;
- id->ppaf.pg_len = 16;
- id->ppaf.sect_offset = 32;
- id->ppaf.sect_len = 8;
- id->ppaf.pln_offset = 40;
- id->ppaf.pln_len = 8;
- id->ppaf.lun_offset = 48;
- id->ppaf.lun_len = 8;
- id->ppaf.ch_offset = 56;
- id->ppaf.ch_len = 8;
-
- sector_div(size, nullb->dev->blocksize); /* convert size to pages */
- size >>= 8; /* concert size to pgs pr blk */
- grp = &id->grp;
- grp->mtype = 0;
- grp->fmtype = 0;
- grp->num_ch = 1;
- grp->num_pg = 256;
- blksize = size;
- size >>= 16;
- grp->num_lun = size + 1;
- sector_div(blksize, grp->num_lun);
- grp->num_blk = blksize;
- grp->num_pln = 1;
-
- grp->fpg_sz = nullb->dev->blocksize;
- grp->csecs = nullb->dev->blocksize;
- grp->trdt = 25000;
- grp->trdm = 25000;
- grp->tprt = 500000;
- grp->tprm = 500000;
- grp->tbet = 1500000;
- grp->tbem = 1500000;
- grp->mpos = 0x010101; /* single plane rwe */
- grp->cpar = nullb->dev->hw_queue_depth;
-
- return 0;
-}
-
-static void *null_lnvm_create_dma_pool(struct nvm_dev *dev, char *name)
-{
- mempool_t *virtmem_pool;
-
- virtmem_pool = mempool_create_slab_pool(64, ppa_cache);
- if (!virtmem_pool) {
- pr_err("null_blk: Unable to create virtual memory pool\n");
- return NULL;
- }
-
- return virtmem_pool;
-}
-
-static void null_lnvm_destroy_dma_pool(void *pool)
-{
- mempool_destroy(pool);
-}
-
-static void *null_lnvm_dev_dma_alloc(struct nvm_dev *dev, void *pool,
- gfp_t mem_flags, dma_addr_t *dma_handler)
-{
- return mempool_alloc(pool, mem_flags);
-}
-
-static void null_lnvm_dev_dma_free(void *pool, void *entry,
- dma_addr_t dma_handler)
-{
- mempool_free(entry, pool);
-}
-
-static struct nvm_dev_ops null_lnvm_dev_ops = {
- .identity = null_lnvm_id,
- .submit_io = null_lnvm_submit_io,
-
- .create_dma_pool = null_lnvm_create_dma_pool,
- .destroy_dma_pool = null_lnvm_destroy_dma_pool,
- .dev_dma_alloc = null_lnvm_dev_dma_alloc,
- .dev_dma_free = null_lnvm_dev_dma_free,
-
- /* Simulate nvme protocol restriction */
- .max_phys_sect = 64,
-};
-
-static int null_nvm_register(struct nullb *nullb)
-{
- struct nvm_dev *dev;
- int rv;
-
- dev = nvm_alloc_dev(0);
- if (!dev)
- return -ENOMEM;
-
- dev->q = nullb->q;
- memcpy(dev->name, nullb->disk_name, DISK_NAME_LEN);
- dev->ops = &null_lnvm_dev_ops;
-
- rv = nvm_register(dev);
- if (rv) {
- kfree(dev);
- return rv;
- }
- nullb->ndev = dev;
- return 0;
-}
-
-static void null_nvm_unregister(struct nullb *nullb)
-{
- nvm_unregister(nullb->ndev);
-}
-#else
-static int null_nvm_register(struct nullb *nullb)
-{
- pr_err("null_blk: CONFIG_NVM needs to be enabled for LightNVM\n");
- return -EINVAL;
-}
-static void null_nvm_unregister(struct nullb *nullb) {}
-#endif /* CONFIG_NVM */
-
static void null_del_dev(struct nullb *nullb)
{
struct nullb_device *dev = nullb->dev;
@@ -1595,10 +1458,7 @@ static void null_del_dev(struct nullb *nullb)
list_del_init(&nullb->list);
- if (dev->use_lightnvm)
- null_nvm_unregister(nullb);
- else
- del_gendisk(nullb->disk);
+ del_gendisk(nullb->disk);
if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
hrtimer_cancel(&nullb->bw_timer);
@@ -1610,8 +1470,7 @@ static void null_del_dev(struct nullb *nullb)
if (dev->queue_mode == NULL_Q_MQ &&
nullb->tag_set == &nullb->__tag_set)
blk_mq_free_tag_set(nullb->tag_set);
- if (!dev->use_lightnvm)
- put_disk(nullb->disk);
+ put_disk(nullb->disk);
cleanup_queues(nullb);
if (null_cache_active(nullb))
null_free_device_storage(nullb->dev, true);
@@ -1775,11 +1634,6 @@ static void null_validate_conf(struct nullb_device *dev)
{
dev->blocksize = round_down(dev->blocksize, 512);
dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
- if (dev->use_lightnvm && dev->blocksize != 4096)
- dev->blocksize = 4096;
-
- if (dev->use_lightnvm && dev->queue_mode != NULL_Q_MQ)
- dev->queue_mode = NULL_Q_MQ;
if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
if (dev->submit_queues != nr_online_nodes)
@@ -1805,6 +1659,20 @@ static void null_validate_conf(struct nullb_device *dev)
dev->mbps = 0;
}
+static bool null_setup_fault(void)
+{
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+ if (!g_timeout_str[0])
+ return true;
+
+ if (!setup_fault_attr(&null_timeout_attr, g_timeout_str))
+ return false;
+
+ null_timeout_attr.verbose = 0;
+#endif
+ return true;
+}
+
static int null_add_dev(struct nullb_device *dev)
{
struct nullb *nullb;
@@ -1838,6 +1706,10 @@ static int null_add_dev(struct nullb_device *dev)
if (rv)
goto out_cleanup_queues;
+ if (!null_setup_fault())
+ goto out_cleanup_queues;
+
+ nullb->tag_set->timeout = 5 * HZ;
nullb->q = blk_mq_init_queue(nullb->tag_set);
if (IS_ERR(nullb->q)) {
rv = -ENOMEM;
@@ -1861,8 +1733,14 @@ static int null_add_dev(struct nullb_device *dev)
rv = -ENOMEM;
goto out_cleanup_queues;
}
+
+ if (!null_setup_fault())
+ goto out_cleanup_blk_queue;
+
blk_queue_prep_rq(nullb->q, null_rq_prep_fn);
blk_queue_softirq_done(nullb->q, null_softirq_done_fn);
+ blk_queue_rq_timed_out(nullb->q, null_rq_timed_out_fn);
+ nullb->q->rq_timeout = 5 * HZ;
rv = init_driver_queues(nullb);
if (rv)
goto out_cleanup_blk_queue;
@@ -1895,11 +1773,7 @@ static int null_add_dev(struct nullb_device *dev)
sprintf(nullb->disk_name, "nullb%d", nullb->index);
- if (dev->use_lightnvm)
- rv = null_nvm_register(nullb);
- else
- rv = null_gendisk_register(nullb);
-
+ rv = null_gendisk_register(nullb);
if (rv)
goto out_cleanup_blk_queue;
@@ -1938,18 +1812,6 @@ static int __init null_init(void)
g_bs = PAGE_SIZE;
}
- if (g_use_lightnvm && g_bs != 4096) {
- pr_warn("null_blk: LightNVM only supports 4k block size\n");
- pr_warn("null_blk: defaults block size to 4k\n");
- g_bs = 4096;
- }
-
- if (g_use_lightnvm && g_queue_mode != NULL_Q_MQ) {
- pr_warn("null_blk: LightNVM only supported for blk-mq\n");
- pr_warn("null_blk: defaults queue mode to blk-mq\n");
- g_queue_mode = NULL_Q_MQ;
- }
-
if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
if (g_submit_queues != nr_online_nodes) {
pr_warn("null_blk: submit_queues param is set to %u.\n",
@@ -1982,16 +1844,6 @@ static int __init null_init(void)
goto err_conf;
}
- if (g_use_lightnvm) {
- ppa_cache = kmem_cache_create("ppa_cache", 64 * sizeof(u64),
- 0, 0, NULL);
- if (!ppa_cache) {
- pr_err("null_blk: unable to create ppa cache\n");
- ret = -ENOMEM;
- goto err_ppa;
- }
- }
-
for (i = 0; i < nr_devices; i++) {
dev = null_alloc_dev();
if (!dev) {
@@ -2015,8 +1867,6 @@ err_dev:
null_del_dev(nullb);
null_free_dev(dev);
}
- kmem_cache_destroy(ppa_cache);
-err_ppa:
unregister_blkdev(null_major, "nullb");
err_conf:
configfs_unregister_subsystem(&nullb_subsys);
@@ -2047,8 +1897,6 @@ static void __exit null_exit(void)
if (g_queue_mode == NULL_Q_MQ && shared_tags)
blk_mq_free_tag_set(&tag_set);
-
- kmem_cache_destroy(ppa_cache);
}
module_init(null_init);
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 67974796c350..531a0915066b 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -2579,14 +2579,14 @@ static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
bdev = bdget(dev);
if (!bdev)
return -ENOMEM;
+ ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
+ if (ret)
+ return ret;
if (!blk_queue_scsi_passthrough(bdev_get_queue(bdev))) {
WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
- bdput(bdev);
+ blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
return -EINVAL;
}
- ret = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL);
- if (ret)
- return ret;
/* This is safe, since we have a reference from open(). */
__module_get(THIS_MODULE);
@@ -2745,7 +2745,7 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
pd->pkt_dev = MKDEV(pktdev_major, idx);
ret = pkt_new_dev(pd, dev);
if (ret)
- goto out_new_dev;
+ goto out_mem2;
/* inherit events of the host device */
disk->events = pd->bdev->bd_disk->events;
@@ -2763,8 +2763,6 @@ static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
mutex_unlock(&ctl_mutex);
return 0;
-out_new_dev:
- blk_cleanup_queue(disk->queue);
out_mem2:
put_disk(disk);
out_mem:
diff --git a/drivers/block/smart1,2.h b/drivers/block/smart1,2.h
deleted file mode 100644
index e5565fbaeb30..000000000000
--- a/drivers/block/smart1,2.h
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- * Disk Array driver for Compaq SMART2 Controllers
- * Copyright 1998 Compaq Computer Corporation
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2 of the License, or
- * (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
- * NON INFRINGEMENT. See the GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- *
- * Questions/Comments/Bugfixes to iss_storagedev@hp.com
- *
- * If you want to make changes, improve or add functionality to this
- * driver, you'll probably need the Compaq Array Controller Interface
- * Specificiation (Document number ECG086/1198)
- */
-
-/*
- * This file contains the controller communication implementation for
- * Compaq SMART-1 and SMART-2 controllers. To the best of my knowledge,
- * this should support:
- *
- * PCI:
- * SMART-2/P, SMART-2DH, SMART-2SL, SMART-221, SMART-3100ES, SMART-3200
- * Integerated SMART Array Controller, SMART-4200, SMART-4250ES
- *
- * EISA:
- * SMART-2/E, SMART, IAES, IDA-2, IDA
- */
-
-/*
- * Memory mapped FIFO interface (SMART 42xx cards)
- */
-static void smart4_submit_command(ctlr_info_t *h, cmdlist_t *c)
-{
- writel(c->busaddr, h->vaddr + S42XX_REQUEST_PORT_OFFSET);
-}
-
-/*
- * This card is the opposite of the other cards.
- * 0 turns interrupts on...
- * 0x08 turns them off...
- */
-static void smart4_intr_mask(ctlr_info_t *h, unsigned long val)
-{
- if (val)
- { /* Turn interrupts on */
- writel(0, h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET);
- } else /* Turn them off */
- {
- writel( S42XX_INTR_OFF,
- h->vaddr + S42XX_REPLY_INTR_MASK_OFFSET);
- }
-}
-
-/*
- * For older cards FIFO Full = 0.
- * On this card 0 means there is room, anything else FIFO Full.
- *
- */
-static unsigned long smart4_fifo_full(ctlr_info_t *h)
-{
-
- return (!readl(h->vaddr + S42XX_REQUEST_PORT_OFFSET));
-}
-
-/* This type of controller returns -1 if the fifo is empty,
- * Not 0 like the others.
- * And we need to let it know we read a value out
- */
-static unsigned long smart4_completed(ctlr_info_t *h)
-{
- long register_value
- = readl(h->vaddr + S42XX_REPLY_PORT_OFFSET);
-
- /* Fifo is empty */
- if( register_value == 0xffffffff)
- return 0;
-
- /* Need to let it know we got the reply */
- /* We do this by writing a 0 to the port we just read from */
- writel(0, h->vaddr + S42XX_REPLY_PORT_OFFSET);
-
- return ((unsigned long) register_value);
-}
-
- /*
- * This hardware returns interrupt pending at a different place and
- * it does not tell us if the fifo is empty, we will have check
- * that by getting a 0 back from the command_completed call.
- */
-static unsigned long smart4_intr_pending(ctlr_info_t *h)
-{
- unsigned long register_value =
- readl(h->vaddr + S42XX_INTR_STATUS);
-
- if( register_value & S42XX_INTR_PENDING)
- return FIFO_NOT_EMPTY;
- return 0 ;
-}
-
-static struct access_method smart4_access = {
- smart4_submit_command,
- smart4_intr_mask,
- smart4_fifo_full,
- smart4_intr_pending,
- smart4_completed,
-};
-
-/*
- * Memory mapped FIFO interface (PCI SMART2 and SMART 3xxx cards)
- */
-static void smart2_submit_command(ctlr_info_t *h, cmdlist_t *c)
-{
- writel(c->busaddr, h->vaddr + COMMAND_FIFO);
-}
-
-static void smart2_intr_mask(ctlr_info_t *h, unsigned long val)
-{
- writel(val, h->vaddr + INTR_MASK);
-}
-
-static unsigned long smart2_fifo_full(ctlr_info_t *h)
-{
- return readl(h->vaddr + COMMAND_FIFO);
-}
-
-static unsigned long smart2_completed(ctlr_info_t *h)
-{
- return readl(h->vaddr + COMMAND_COMPLETE_FIFO);
-}
-
-static unsigned long smart2_intr_pending(ctlr_info_t *h)
-{
- return readl(h->vaddr + INTR_PENDING);
-}
-
-static struct access_method smart2_access = {
- smart2_submit_command,
- smart2_intr_mask,
- smart2_fifo_full,
- smart2_intr_pending,
- smart2_completed,
-};
-
-/*
- * IO access for SMART-2/E cards
- */
-static void smart2e_submit_command(ctlr_info_t *h, cmdlist_t *c)
-{
- outl(c->busaddr, h->io_mem_addr + COMMAND_FIFO);
-}
-
-static void smart2e_intr_mask(ctlr_info_t *h, unsigned long val)
-{
- outl(val, h->io_mem_addr + INTR_MASK);
-}
-
-static unsigned long smart2e_fifo_full(ctlr_info_t *h)
-{
- return inl(h->io_mem_addr + COMMAND_FIFO);
-}
-
-static unsigned long smart2e_completed(ctlr_info_t *h)
-{
- return inl(h->io_mem_addr + COMMAND_COMPLETE_FIFO);
-}
-
-static unsigned long smart2e_intr_pending(ctlr_info_t *h)
-{
- return inl(h->io_mem_addr + INTR_PENDING);
-}
-
-static struct access_method smart2e_access = {
- smart2e_submit_command,
- smart2e_intr_mask,
- smart2e_fifo_full,
- smart2e_intr_pending,
- smart2e_completed,
-};
-
-/*
- * IO access for older SMART-1 type cards
- */
-#define SMART1_SYSTEM_MASK 0xC8E
-#define SMART1_SYSTEM_DOORBELL 0xC8F
-#define SMART1_LOCAL_MASK 0xC8C
-#define SMART1_LOCAL_DOORBELL 0xC8D
-#define SMART1_INTR_MASK 0xC89
-#define SMART1_LISTADDR 0xC90
-#define SMART1_LISTLEN 0xC94
-#define SMART1_TAG 0xC97
-#define SMART1_COMPLETE_ADDR 0xC98
-#define SMART1_LISTSTATUS 0xC9E
-
-#define CHANNEL_BUSY 0x01
-#define CHANNEL_CLEAR 0x02
-
-static void smart1_submit_command(ctlr_info_t *h, cmdlist_t *c)
-{
- /*
- * This __u16 is actually a bunch of control flags on SMART
- * and below. We want them all to be zero.
- */
- c->hdr.size = 0;
-
- outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
-
- outl(c->busaddr, h->io_mem_addr + SMART1_LISTADDR);
- outw(c->size, h->io_mem_addr + SMART1_LISTLEN);
-
- outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
-}
-
-static void smart1_intr_mask(ctlr_info_t *h, unsigned long val)
-{
- if (val == 1) {
- outb(0xFD, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
- outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
- outb(0x01, h->io_mem_addr + SMART1_INTR_MASK);
- outb(0x01, h->io_mem_addr + SMART1_SYSTEM_MASK);
- } else {
- outb(0, h->io_mem_addr + 0xC8E);
- }
-}
-
-static unsigned long smart1_fifo_full(ctlr_info_t *h)
-{
- unsigned char chan;
- chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_CLEAR;
- return chan;
-}
-
-static unsigned long smart1_completed(ctlr_info_t *h)
-{
- unsigned char status;
- unsigned long cmd;
-
- if (inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY) {
- outb(CHANNEL_BUSY, h->io_mem_addr + SMART1_SYSTEM_DOORBELL);
-
- cmd = inl(h->io_mem_addr + SMART1_COMPLETE_ADDR);
- status = inb(h->io_mem_addr + SMART1_LISTSTATUS);
-
- outb(CHANNEL_CLEAR, h->io_mem_addr + SMART1_LOCAL_DOORBELL);
-
- /*
- * this is x86 (actually compaq x86) only, so it's ok
- */
- if (cmd) ((cmdlist_t*)bus_to_virt(cmd))->req.hdr.rcode = status;
- } else {
- cmd = 0;
- }
- return cmd;
-}
-
-static unsigned long smart1_intr_pending(ctlr_info_t *h)
-{
- unsigned char chan;
- chan = inb(h->io_mem_addr + SMART1_SYSTEM_DOORBELL) & CHANNEL_BUSY;
- return chan;
-}
-
-static struct access_method smart1_access = {
- smart1_submit_command,
- smart1_intr_mask,
- smart1_fifo_full,
- smart1_intr_pending,
- smart1_completed,
-};
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
index d70eba30003a..0afa6c8c3857 100644
--- a/drivers/block/zram/zram_drv.c
+++ b/drivers/block/zram/zram_drv.c
@@ -430,7 +430,7 @@ static void put_entry_bdev(struct zram *zram, unsigned long entry)
static void zram_page_end_io(struct bio *bio)
{
- struct page *page = bio->bi_io_vec[0].bv_page;
+ struct page *page = bio_first_page_all(bio);
page_endio(page, op_is_write(bio_op(bio)),
blk_status_to_errno(bio->bi_status));
diff --git a/drivers/lightnvm/Kconfig b/drivers/lightnvm/Kconfig
index 2a953efec4e1..10c08982185a 100644
--- a/drivers/lightnvm/Kconfig
+++ b/drivers/lightnvm/Kconfig
@@ -27,13 +27,6 @@ config NVM_DEBUG
It is required to create/remove targets without IOCTLs.
-config NVM_RRPC
- tristate "Round-robin Hybrid Open-Channel SSD target"
- ---help---
- Allows an open-channel SSD to be exposed as a block device to the
- host. The target is implemented using a linear mapping table and
- cost-based garbage collection. It is optimized for 4K IO sizes.
-
config NVM_PBLK
tristate "Physical Block Device Open-Channel SSD target"
---help---
diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile
index 2c3fd9d2c08c..97d9d7c71550 100644
--- a/drivers/lightnvm/Makefile
+++ b/drivers/lightnvm/Makefile
@@ -4,7 +4,6 @@
#
obj-$(CONFIG_NVM) := core.o
-obj-$(CONFIG_NVM_RRPC) += rrpc.o
obj-$(CONFIG_NVM_PBLK) += pblk.o
pblk-y := pblk-init.o pblk-core.o pblk-rb.o \
pblk-write.o pblk-cache.o pblk-read.o \
diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c
index 83249b43dd06..dcc9e621e651 100644
--- a/drivers/lightnvm/core.c
+++ b/drivers/lightnvm/core.c
@@ -45,12 +45,6 @@ struct nvm_dev_map {
int nr_chnls;
};
-struct nvm_area {
- struct list_head list;
- sector_t begin;
- sector_t end; /* end is excluded */
-};
-
static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
{
struct nvm_target *tgt;
@@ -62,6 +56,30 @@ static struct nvm_target *nvm_find_target(struct nvm_dev *dev, const char *name)
return NULL;
}
+static bool nvm_target_exists(const char *name)
+{
+ struct nvm_dev *dev;
+ struct nvm_target *tgt;
+ bool ret = false;
+
+ down_write(&nvm_lock);
+ list_for_each_entry(dev, &nvm_devices, devices) {
+ mutex_lock(&dev->mlock);
+ list_for_each_entry(tgt, &dev->targets, list) {
+ if (!strcmp(name, tgt->disk->disk_name)) {
+ ret = true;
+ mutex_unlock(&dev->mlock);
+ goto out;
+ }
+ }
+ mutex_unlock(&dev->mlock);
+ }
+
+out:
+ up_write(&nvm_lock);
+ return ret;
+}
+
static int nvm_reserve_luns(struct nvm_dev *dev, int lun_begin, int lun_end)
{
int i;
@@ -104,7 +122,7 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear)
if (clear) {
for (j = 0; j < ch_map->nr_luns; j++) {
int lun = j + lun_offs[j];
- int lunid = (ch * dev->geo.luns_per_chnl) + lun;
+ int lunid = (ch * dev->geo.nr_luns) + lun;
WARN_ON(!test_and_clear_bit(lunid,
dev->lun_map));
@@ -122,7 +140,8 @@ static void nvm_remove_tgt_dev(struct nvm_tgt_dev *tgt_dev, int clear)
}
static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
- int lun_begin, int lun_end)
+ u16 lun_begin, u16 lun_end,
+ u16 op)
{
struct nvm_tgt_dev *tgt_dev = NULL;
struct nvm_dev_map *dev_rmap = dev->rmap;
@@ -130,10 +149,10 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
struct ppa_addr *luns;
int nr_luns = lun_end - lun_begin + 1;
int luns_left = nr_luns;
- int nr_chnls = nr_luns / dev->geo.luns_per_chnl;
- int nr_chnls_mod = nr_luns % dev->geo.luns_per_chnl;
- int bch = lun_begin / dev->geo.luns_per_chnl;
- int blun = lun_begin % dev->geo.luns_per_chnl;
+ int nr_chnls = nr_luns / dev->geo.nr_luns;
+ int nr_chnls_mod = nr_luns % dev->geo.nr_luns;
+ int bch = lun_begin / dev->geo.nr_luns;
+ int blun = lun_begin % dev->geo.nr_luns;
int lunid = 0;
int lun_balanced = 1;
int prev_nr_luns;
@@ -154,15 +173,15 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
if (!luns)
goto err_luns;
- prev_nr_luns = (luns_left > dev->geo.luns_per_chnl) ?
- dev->geo.luns_per_chnl : luns_left;
+ prev_nr_luns = (luns_left > dev->geo.nr_luns) ?
+ dev->geo.nr_luns : luns_left;
for (i = 0; i < nr_chnls; i++) {
struct nvm_ch_map *ch_rmap = &dev_rmap->chnls[i + bch];
int *lun_roffs = ch_rmap->lun_offs;
struct nvm_ch_map *ch_map = &dev_map->chnls[i];
int *lun_offs;
- int luns_in_chnl = (luns_left > dev->geo.luns_per_chnl) ?
- dev->geo.luns_per_chnl : luns_left;
+ int luns_in_chnl = (luns_left > dev->geo.nr_luns) ?
+ dev->geo.nr_luns : luns_left;
if (lun_balanced && prev_nr_luns != luns_in_chnl)
lun_balanced = 0;
@@ -199,8 +218,9 @@ static struct nvm_tgt_dev *nvm_create_tgt_dev(struct nvm_dev *dev,
memcpy(&tgt_dev->geo, &dev->geo, sizeof(struct nvm_geo));
/* Target device only owns a portion of the physical device */
tgt_dev->geo.nr_chnls = nr_chnls;
- tgt_dev->geo.nr_luns = nr_luns;
- tgt_dev->geo.luns_per_chnl = (lun_balanced) ? prev_nr_luns : -1;
+ tgt_dev->geo.all_luns = nr_luns;
+ tgt_dev->geo.nr_luns = (lun_balanced) ? prev_nr_luns : -1;
+ tgt_dev->geo.op = op;
tgt_dev->total_secs = nr_luns * tgt_dev->geo.sec_per_lun;
tgt_dev->q = dev->q;
tgt_dev->map = dev_map;
@@ -226,27 +246,79 @@ static const struct block_device_operations nvm_fops = {
.owner = THIS_MODULE,
};
-static struct nvm_tgt_type *nvm_find_target_type(const char *name, int lock)
+static struct nvm_tgt_type *__nvm_find_target_type(const char *name)
{
- struct nvm_tgt_type *tmp, *tt = NULL;
+ struct nvm_tgt_type *tt;
- if (lock)
- down_write(&nvm_tgtt_lock);
+ list_for_each_entry(tt, &nvm_tgt_types, list)
+ if (!strcmp(name, tt->name))
+ return tt;
- list_for_each_entry(tmp, &nvm_tgt_types, list)
- if (!strcmp(name, tmp->name)) {
- tt = tmp;
- break;
- }
+ return NULL;
+}
+
+static struct nvm_tgt_type *nvm_find_target_type(const char *name)
+{
+ struct nvm_tgt_type *tt;
+
+ down_write(&nvm_tgtt_lock);
+ tt = __nvm_find_target_type(name);
+ up_write(&nvm_tgtt_lock);
- if (lock)
- up_write(&nvm_tgtt_lock);
return tt;
}
+static int nvm_config_check_luns(struct nvm_geo *geo, int lun_begin,
+ int lun_end)
+{
+ if (lun_begin > lun_end || lun_end >= geo->all_luns) {
+ pr_err("nvm: lun out of bound (%u:%u > %u)\n",
+ lun_begin, lun_end, geo->all_luns - 1);
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+static int __nvm_config_simple(struct nvm_dev *dev,
+ struct nvm_ioctl_create_simple *s)
+{
+ struct nvm_geo *geo = &dev->geo;
+
+ if (s->lun_begin == -1 && s->lun_end == -1) {
+ s->lun_begin = 0;
+ s->lun_end = geo->all_luns - 1;
+ }
+
+ return nvm_config_check_luns(geo, s->lun_begin, s->lun_end);
+}
+
+static int __nvm_config_extended(struct nvm_dev *dev,
+ struct nvm_ioctl_create_extended *e)
+{
+ struct nvm_geo *geo = &dev->geo;
+
+ if (e->lun_begin == 0xFFFF && e->lun_end == 0xFFFF) {
+ e->lun_begin = 0;
+ e->lun_end = dev->geo.all_luns - 1;
+ }
+
+ /* op not set falls into target's default */
+ if (e->op == 0xFFFF)
+ e->op = NVM_TARGET_DEFAULT_OP;
+
+ if (e->op < NVM_TARGET_MIN_OP ||
+ e->op > NVM_TARGET_MAX_OP) {
+ pr_err("nvm: invalid over provisioning value\n");
+ return -EINVAL;
+ }
+
+ return nvm_config_check_luns(geo, e->lun_begin, e->lun_end);
+}
+
static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
{
- struct nvm_ioctl_create_simple *s = &create->conf.s;
+ struct nvm_ioctl_create_extended e;
struct request_queue *tqueue;
struct gendisk *tdisk;
struct nvm_tgt_type *tt;
@@ -255,22 +327,41 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
void *targetdata;
int ret;
- tt = nvm_find_target_type(create->tgttype, 1);
+ switch (create->conf.type) {
+ case NVM_CONFIG_TYPE_SIMPLE:
+ ret = __nvm_config_simple(dev, &create->conf.s);
+ if (ret)
+ return ret;
+
+ e.lun_begin = create->conf.s.lun_begin;
+ e.lun_end = create->conf.s.lun_end;
+ e.op = NVM_TARGET_DEFAULT_OP;
+ break;
+ case NVM_CONFIG_TYPE_EXTENDED:
+ ret = __nvm_config_extended(dev, &create->conf.e);
+ if (ret)
+ return ret;
+
+ e = create->conf.e;
+ break;
+ default:
+ pr_err("nvm: config type not valid\n");
+ return -EINVAL;
+ }
+
+ tt = nvm_find_target_type(create->tgttype);
if (!tt) {
pr_err("nvm: target type %s not found\n", create->tgttype);
return -EINVAL;
}
- mutex_lock(&dev->mlock);
- t = nvm_find_target(dev, create->tgtname);
- if (t) {
- pr_err("nvm: target name already exists.\n");
- mutex_unlock(&dev->mlock);
+ if (nvm_target_exists(create->tgtname)) {
+ pr_err("nvm: target name already exists (%s)\n",
+ create->tgtname);
return -EINVAL;
}
- mutex_unlock(&dev->mlock);
- ret = nvm_reserve_luns(dev, s->lun_begin, s->lun_end);
+ ret = nvm_reserve_luns(dev, e.lun_begin, e.lun_end);
if (ret)
return ret;
@@ -280,7 +371,7 @@ static int nvm_create_tgt(struct nvm_dev *dev, struct nvm_ioctl_create *create)
goto err_reserve;
}
- tgt_dev = nvm_create_tgt_dev(dev, s->lun_begin, s->lun_end);
+ tgt_dev = nvm_create_tgt_dev(dev, e.lun_begin, e.lun_end, e.op);
if (!tgt_dev) {
pr_err("nvm: could not create target device\n");
ret = -ENOMEM;
@@ -350,7 +441,7 @@ err_dev:
err_t:
kfree(t);
err_reserve:
- nvm_release_luns_err(dev, s->lun_begin, s->lun_end);
+ nvm_release_luns_err(dev, e.lun_begin, e.lun_end);
return ret;
}
@@ -420,7 +511,7 @@ static int nvm_register_map(struct nvm_dev *dev)
for (i = 0; i < dev->geo.nr_chnls; i++) {
struct nvm_ch_map *ch_rmap;
int *lun_roffs;
- int luns_in_chnl = dev->geo.luns_per_chnl;
+ int luns_in_chnl = dev->geo.nr_luns;
ch_rmap = &rmap->chnls[i];
@@ -524,41 +615,12 @@ static void nvm_rq_dev_to_tgt(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
nvm_ppa_dev_to_tgt(tgt_dev, rqd->ppa_list, rqd->nr_ppas);
}
-void nvm_part_to_tgt(struct nvm_dev *dev, sector_t *entries,
- int len)
-{
- struct nvm_geo *geo = &dev->geo;
- struct nvm_dev_map *dev_rmap = dev->rmap;
- u64 i;
-
- for (i = 0; i < len; i++) {
- struct nvm_ch_map *ch_rmap;
- int *lun_roffs;
- struct ppa_addr gaddr;
- u64 pba = le64_to_cpu(entries[i]);
- u64 diff;
-
- if (!pba)
- continue;
-
- gaddr = linear_to_generic_addr(geo, pba);
- ch_rmap = &dev_rmap->chnls[gaddr.g.ch];
- lun_roffs = ch_rmap->lun_offs;
-
- diff = ((ch_rmap->ch_off * geo->luns_per_chnl) +
- (lun_roffs[gaddr.g.lun])) * geo->sec_per_lun;
-
- entries[i] -= cpu_to_le64(diff);
- }
-}
-EXPORT_SYMBOL(nvm_part_to_tgt);
-
int nvm_register_tgt_type(struct nvm_tgt_type *tt)
{
int ret = 0;
down_write(&nvm_tgtt_lock);
- if (nvm_find_target_type(tt->name, 0))
+ if (__nvm_find_target_type(tt->name))
ret = -EEXIST;
else
list_add(&tt->list, &nvm_tgt_types);
@@ -726,112 +788,6 @@ int nvm_submit_io_sync(struct nvm_tgt_dev *tgt_dev, struct nvm_rq *rqd)
}
EXPORT_SYMBOL(nvm_submit_io_sync);
-int nvm_erase_sync(struct nvm_tgt_dev *tgt_dev, struct ppa_addr *ppas,
- int nr_ppas)
-{
- struct nvm_geo *geo = &tgt_dev->geo;
- struct nvm_rq rqd;
- int ret;
-
- memset(&rqd, 0, sizeof(struct nvm_rq));
-
- rqd.opcode = NVM_OP_ERASE;
- rqd.flags = geo->plane_mode >> 1;
-
- ret = nvm_set_rqd_ppalist(tgt_dev, &rqd, ppas, nr_ppas);
- if (ret)
- return ret;
-
- ret = nvm_submit_io_sync(tgt_dev, &rqd);
- if (ret) {
- pr_err("rrpr: erase I/O submission failed: %d\n", ret);
- goto free_ppa_list;
- }
-
-free_ppa_list:
- nvm_free_rqd_ppalist(tgt_dev, &rqd);
-
- return ret;
-}
-EXPORT_SYMBOL(nvm_erase_sync);
-
-int nvm_get_l2p_tbl(struct nvm_tgt_dev *tgt_dev, u64 slba, u32 nlb,
- nvm_l2p_update_fn *update_l2p, void *priv)
-{
- struct nvm_dev *dev = tgt_dev->parent;
-
- if (!dev->ops->get_l2p_tbl)
- return 0;
-
- return dev->ops->get_l2p_tbl(dev, slba, nlb, update_l2p, priv);
-}
-EXPORT_SYMBOL(nvm_get_l2p_tbl);
-
-int nvm_get_area(struct nvm_tgt_dev *tgt_dev, sector_t *lba, sector_t len)
-{
- struct nvm_dev *dev = tgt_dev->parent;
- struct nvm_geo *geo = &dev->geo;
- struct nvm_area *area, *prev, *next;
- sector_t begin = 0;
- sector_t max_sectors = (geo->sec_size * dev->total_secs) >> 9;
-
- if (len > max_sectors)
- return -EINVAL;
-
- area = kmalloc(sizeof(struct nvm_area), GFP_KERNEL);
- if (!area)
- return -ENOMEM;
-
- prev = NULL;
-
- spin_lock(&dev->lock);
- list_for_each_entry(next, &dev->area_list, list) {
- if (begin + len > next->begin) {
- begin = next->end;
- prev = next;
- continue;
- }
- break;
- }
-
- if ((begin + len) > max_sectors) {
- spin_unlock(&dev->lock);
- kfree(area);
- return -EINVAL;
- }
-
- area->begin = *lba = begin;
- area->end = begin + len;
-
- if (prev) /* insert into sorted order */
- list_add(&area->list, &prev->list);
- else
- list_add(&area->list, &dev->area_list);
- spin_unlock(&dev->lock);
-
- return 0;
-}
-EXPORT_SYMBOL(nvm_get_area);
-
-void nvm_put_area(struct nvm_tgt_dev *tgt_dev, sector_t begin)
-{
- struct nvm_dev *dev = tgt_dev->parent;
- struct nvm_area *area;
-
- spin_lock(&dev->lock);
- list_for_each_entry(area, &dev->area_list, list) {
- if (area->begin != begin)
- continue;
-
- list_del(&area->list);
- spin_unlock(&dev->lock);
- kfree(area);
- return;
- }
- spin_unlock(&dev->lock);
-}
-EXPORT_SYMBOL(nvm_put_area);
-
void nvm_end_io(struct nvm_rq *rqd)
{
struct nvm_tgt_dev *tgt_dev = rqd->dev;
@@ -858,10 +814,10 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
struct nvm_geo *geo = &dev->geo;
int blk, offset, pl, blktype;
- if (nr_blks != geo->blks_per_lun * geo->plane_mode)
+ if (nr_blks != geo->nr_chks * geo->plane_mode)
return -EINVAL;
- for (blk = 0; blk < geo->blks_per_lun; blk++) {
+ for (blk = 0; blk < geo->nr_chks; blk++) {
offset = blk * geo->plane_mode;
blktype = blks[offset];
@@ -877,7 +833,7 @@ int nvm_bb_tbl_fold(struct nvm_dev *dev, u8 *blks, int nr_blks)
blks[blk] = blktype;
}
- return geo->blks_per_lun;
+ return geo->nr_chks;
}
EXPORT_SYMBOL(nvm_bb_tbl_fold);
@@ -892,53 +848,6 @@ int nvm_get_tgt_bb_tbl(struct nvm_tgt_dev *tgt_dev, struct ppa_addr ppa,
}
EXPORT_SYMBOL(nvm_get_tgt_bb_tbl);
-static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
-{
- struct nvm_geo *geo = &dev->geo;
- int i;
-
- dev->lps_per_blk = geo->pgs_per_blk;
- dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL);
- if (!dev->lptbl)
- return -ENOMEM;
-
- /* Just a linear array */
- for (i = 0; i < dev->lps_per_blk; i++)
- dev->lptbl[i] = i;
-
- return 0;
-}
-
-static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp)
-{
- int i, p;
- struct nvm_id_lp_mlc *mlc = &grp->lptbl.mlc;
-
- if (!mlc->num_pairs)
- return 0;
-
- dev->lps_per_blk = mlc->num_pairs;
- dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL);
- if (!dev->lptbl)
- return -ENOMEM;
-
- /* The lower page table encoding consists of a list of bytes, where each
- * has a lower and an upper half. The first half byte maintains the
- * increment value and every value after is an offset added to the
- * previous incrementation value
- */
- dev->lptbl[0] = mlc->pairs[0] & 0xF;
- for (i = 1; i < dev->lps_per_blk; i++) {
- p = mlc->pairs[i >> 1];
- if (i & 0x1) /* upper */
- dev->lptbl[i] = dev->lptbl[i - 1] + ((p & 0xF0) >> 4);
- else /* lower */
- dev->lptbl[i] = dev->lptbl[i - 1] + (p & 0xF);
- }
-
- return 0;
-}
-
static int nvm_core_init(struct nvm_dev *dev)
{
struct nvm_id *id = &dev->identity;
@@ -946,66 +855,44 @@ static int nvm_core_init(struct nvm_dev *dev)
struct nvm_geo *geo = &dev->geo;
int ret;
+ memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format));
+
+ if (grp->mtype != 0) {
+ pr_err("nvm: memory type not supported\n");
+ return -EINVAL;
+ }
+
/* Whole device values */
geo->nr_chnls = grp->num_ch;
- geo->luns_per_chnl = grp->num_lun;
-
- /* Generic device values */
- geo->pgs_per_blk = grp->num_pg;
- geo->blks_per_lun = grp->num_blk;
- geo->nr_planes = grp->num_pln;
- geo->fpg_size = grp->fpg_sz;
- geo->pfpg_size = grp->fpg_sz * grp->num_pln;
+ geo->nr_luns = grp->num_lun;
+
+ /* Generic device geometry values */
+ geo->ws_min = grp->ws_min;
+ geo->ws_opt = grp->ws_opt;
+ geo->ws_seq = grp->ws_seq;
+ geo->ws_per_chk = grp->ws_per_chk;
+ geo->nr_chks = grp->num_chk;
geo->sec_size = grp->csecs;
geo->oob_size = grp->sos;
- geo->sec_per_pg = grp->fpg_sz / grp->csecs;
geo->mccap = grp->mccap;
- memcpy(&geo->ppaf, &id->ppaf, sizeof(struct nvm_addr_format));
-
- geo->plane_mode = NVM_PLANE_SINGLE;
geo->max_rq_size = dev->ops->max_phys_sect * geo->sec_size;
- if (grp->mpos & 0x020202)
- geo->plane_mode = NVM_PLANE_DOUBLE;
- if (grp->mpos & 0x040404)
- geo->plane_mode = NVM_PLANE_QUAD;
+ geo->sec_per_chk = grp->clba;
+ geo->sec_per_lun = geo->sec_per_chk * geo->nr_chks;
+ geo->all_luns = geo->nr_luns * geo->nr_chnls;
- if (grp->mtype != 0) {
- pr_err("nvm: memory type not supported\n");
- return -EINVAL;
- }
-
- /* calculated values */
+ /* 1.2 spec device geometry values */
+ geo->plane_mode = 1 << geo->ws_seq;
+ geo->nr_planes = geo->ws_opt / geo->ws_min;
+ geo->sec_per_pg = geo->ws_min;
geo->sec_per_pl = geo->sec_per_pg * geo->nr_planes;
- geo->sec_per_blk = geo->sec_per_pl * geo->pgs_per_blk;
- geo->sec_per_lun = geo->sec_per_blk * geo->blks_per_lun;
- geo->nr_luns = geo->luns_per_chnl * geo->nr_chnls;
- dev->total_secs = geo->nr_luns * geo->sec_per_lun;
- dev->lun_map = kcalloc(BITS_TO_LONGS(geo->nr_luns),
+ dev->total_secs = geo->all_luns * geo->sec_per_lun;
+ dev->lun_map = kcalloc(BITS_TO_LONGS(geo->all_luns),
sizeof(unsigned long), GFP_KERNEL);
if (!dev->lun_map)
return -ENOMEM;
- switch (grp->fmtype) {
- case NVM_ID_FMTYPE_SLC:
- if (nvm_init_slc_tbl(dev, grp)) {
- ret = -ENOMEM;
- goto err_fmtype;
- }
- break;
- case NVM_ID_FMTYPE_MLC:
- if (nvm_init_mlc_tbl(dev, grp)) {
- ret = -ENOMEM;
- goto err_fmtype;
- }
- break;
- default:
- pr_err("nvm: flash type not supported\n");
- ret = -EINVAL;
- goto err_fmtype;
- }
-
INIT_LIST_HEAD(&dev->area_list);
INIT_LIST_HEAD(&dev->targets);
mutex_init(&dev->mlock);
@@ -1031,7 +918,6 @@ static void nvm_free(struct nvm_dev *dev)
dev->ops->destroy_dma_pool(dev->dma_pool);
nvm_unregister_map(dev);
- kfree(dev->lptbl);
kfree(dev->lun_map);
kfree(dev);
}
@@ -1062,8 +948,8 @@ static int nvm_init(struct nvm_dev *dev)
pr_info("nvm: registered %s [%u/%u/%u/%u/%u/%u]\n",
dev->name, geo->sec_per_pg, geo->nr_planes,
- geo->pgs_per_blk, geo->blks_per_lun,
- geo->nr_luns, geo->nr_chnls);
+ geo->ws_per_chk, geo->nr_chks,
+ geo->all_luns, geo->nr_chnls);
return 0;
err:
pr_err("nvm: failed to initialize nvm\n");
@@ -1135,7 +1021,6 @@ EXPORT_SYMBOL(nvm_unregister);
static int __nvm_configure_create(struct nvm_ioctl_create *create)
{
struct nvm_dev *dev;
- struct nvm_ioctl_create_simple *s;
down_write(&nvm_lock);
dev = nvm_find_nvm_dev(create->dev);
@@ -1146,23 +1031,6 @@ static int __nvm_configure_create(struct nvm_ioctl_create *create)
return -EINVAL;
}
- if (create->conf.type != NVM_CONFIG_TYPE_SIMPLE) {
- pr_err("nvm: config type not valid\n");
- return -EINVAL;
- }
- s = &create->conf.s;
-
- if (s->lun_begin == -1 && s->lun_end == -1) {
- s->lun_begin = 0;
- s->lun_end = dev->geo.nr_luns - 1;
- }
-
- if (s->lun_begin > s->lun_end || s->lun_end >= dev->geo.nr_luns) {
- pr_err("nvm: lun out of bound (%u:%u > %u)\n",
- s->lun_begin, s->lun_end, dev->geo.nr_luns - 1);
- return -EINVAL;
- }
-
return nvm_create_tgt(dev, create);
}
@@ -1262,6 +1130,12 @@ static long nvm_ioctl_dev_create(struct file *file, void __user *arg)
if (copy_from_user(&create, arg, sizeof(struct nvm_ioctl_create)))
return -EFAULT;
+ if (create.conf.type == NVM_CONFIG_TYPE_EXTENDED &&
+ create.conf.e.rsv != 0) {
+ pr_err("nvm: reserved config field in use\n");
+ return -EINVAL;
+ }
+
create.dev[DISK_NAME_LEN - 1] = '\0';
create.tgttype[NVM_TTYPE_NAME_MAX - 1] = '\0';
create.tgtname[DISK_NAME_LEN - 1] = '\0';
diff --git a/drivers/lightnvm/pblk-cache.c b/drivers/lightnvm/pblk-cache.c
index 0d227ef7d1b9..000fcad38136 100644
--- a/drivers/lightnvm/pblk-cache.c
+++ b/drivers/lightnvm/pblk-cache.c
@@ -19,12 +19,16 @@
int pblk_write_to_cache(struct pblk *pblk, struct bio *bio, unsigned long flags)
{
+ struct request_queue *q = pblk->dev->q;
struct pblk_w_ctx w_ctx;
sector_t lba = pblk_get_lba(bio);
+ unsigned long start_time = jiffies;
unsigned int bpos, pos;
int nr_entries = pblk_get_secs(bio);
int i, ret;
+ generic_start_io_acct(q, WRITE, bio_sectors(bio), &pblk->disk->part0);
+
/* Update the write buffer head (mem) with the entries that we can
* write. The write in itself cannot fail, so there is no need to
* rollback from here on.
@@ -67,6 +71,7 @@ retry:
pblk_rl_inserted(&pblk->rl, nr_entries);
out:
+ generic_end_io_acct(q, WRITE, &pblk->disk->part0, start_time);
pblk_write_should_kick(pblk);
return ret;
}
diff --git a/drivers/lightnvm/pblk-core.c b/drivers/lightnvm/pblk-core.c
index 76516ee84e9a..0487b9340c1d 100644
--- a/drivers/lightnvm/pblk-core.c
+++ b/drivers/lightnvm/pblk-core.c
@@ -32,8 +32,8 @@ static void pblk_line_mark_bb(struct work_struct *work)
struct pblk_line *line;
int pos;
- line = &pblk->lines[pblk_dev_ppa_to_line(*ppa)];
- pos = pblk_dev_ppa_to_pos(&dev->geo, *ppa);
+ line = &pblk->lines[pblk_ppa_to_line(*ppa)];
+ pos = pblk_ppa_to_pos(&dev->geo, *ppa);
pr_err("pblk: failed to mark bb, line:%d, pos:%d\n",
line->id, pos);
@@ -48,7 +48,7 @@ static void pblk_mark_bb(struct pblk *pblk, struct pblk_line *line,
{
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
- int pos = pblk_dev_ppa_to_pos(geo, *ppa);
+ int pos = pblk_ppa_to_pos(geo, *ppa);
pr_debug("pblk: erase failed: line:%d, pos:%d\n", line->id, pos);
atomic_long_inc(&pblk->erase_failed);
@@ -66,7 +66,7 @@ static void __pblk_end_io_erase(struct pblk *pblk, struct nvm_rq *rqd)
{
struct pblk_line *line;
- line = &pblk->lines[pblk_dev_ppa_to_line(rqd->ppa_addr)];
+ line = &pblk->lines[pblk_ppa_to_line(rqd->ppa_addr)];
atomic_dec(&line->left_seblks);
if (rqd->error) {
@@ -144,7 +144,7 @@ void pblk_map_invalidate(struct pblk *pblk, struct ppa_addr ppa)
BUG_ON(pblk_ppa_empty(ppa));
#endif
- line_id = pblk_tgt_ppa_to_line(ppa);
+ line_id = pblk_ppa_to_line(ppa);
line = &pblk->lines[line_id];
paddr = pblk_dev_ppa_to_line_addr(pblk, ppa);
@@ -650,7 +650,7 @@ next_rq:
} else {
for (i = 0; i < rqd.nr_ppas; ) {
struct ppa_addr ppa = addr_to_gen_ppa(pblk, paddr, id);
- int pos = pblk_dev_ppa_to_pos(geo, ppa);
+ int pos = pblk_ppa_to_pos(geo, ppa);
int read_type = PBLK_READ_RANDOM;
if (pblk_io_aligned(pblk, rq_ppas))
@@ -668,7 +668,7 @@ next_rq:
}
ppa = addr_to_gen_ppa(pblk, paddr, id);
- pos = pblk_dev_ppa_to_pos(geo, ppa);
+ pos = pblk_ppa_to_pos(geo, ppa);
}
if (pblk_boundary_paddr_checks(pblk, paddr + min)) {
@@ -742,7 +742,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
cmd_op = NVM_OP_PWRITE;
flags = pblk_set_progr_mode(pblk, PBLK_WRITE);
lba_list = emeta_to_lbas(pblk, line->emeta->buf);
- } else if (dir == PBLK_READ) {
+ } else if (dir == PBLK_READ_RECOV || dir == PBLK_READ) {
bio_op = REQ_OP_READ;
cmd_op = NVM_OP_PREAD;
flags = pblk_set_read_mode(pblk, PBLK_READ_SEQUENTIAL);
@@ -802,7 +802,7 @@ static int pblk_line_submit_smeta_io(struct pblk *pblk, struct pblk_line *line,
if (rqd.error) {
if (dir == PBLK_WRITE)
pblk_log_write_err(pblk, &rqd);
- else
+ else if (dir == PBLK_READ)
pblk_log_read_err(pblk, &rqd);
}
@@ -816,7 +816,7 @@ int pblk_line_read_smeta(struct pblk *pblk, struct pblk_line *line)
{
u64 bpaddr = pblk_line_smeta_start(pblk, line);
- return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ);
+ return pblk_line_submit_smeta_io(pblk, line, bpaddr, PBLK_READ_RECOV);
}
int pblk_line_read_emeta(struct pblk *pblk, struct pblk_line *line,
@@ -854,8 +854,8 @@ static int pblk_blk_erase_sync(struct pblk *pblk, struct ppa_addr ppa)
struct nvm_geo *geo = &dev->geo;
pr_err("pblk: could not sync erase line:%d,blk:%d\n",
- pblk_dev_ppa_to_line(ppa),
- pblk_dev_ppa_to_pos(geo, ppa));
+ pblk_ppa_to_line(ppa),
+ pblk_ppa_to_pos(geo, ppa));
rqd.error = ret;
goto out;
@@ -979,7 +979,7 @@ static int pblk_line_init_metadata(struct pblk *pblk, struct pblk_line *line,
/* Start metadata */
smeta_buf->seq_nr = cpu_to_le64(line->seq_nr);
- smeta_buf->window_wr_lun = cpu_to_le32(geo->nr_luns);
+ smeta_buf->window_wr_lun = cpu_to_le32(geo->all_luns);
/* Fill metadata among lines */
if (cur) {
@@ -1032,7 +1032,7 @@ static int pblk_line_init_bb(struct pblk *pblk, struct pblk_line *line,
lm->sec_per_line);
bitmap_or(line->map_bitmap, line->map_bitmap, l_mg->bb_aux,
lm->sec_per_line);
- line->sec_in_line -= geo->sec_per_blk;
+ line->sec_in_line -= geo->sec_per_chk;
if (bit >= lm->emeta_bb)
nr_bb++;
}
@@ -1145,7 +1145,7 @@ int pblk_line_recov_alloc(struct pblk *pblk, struct pblk_line *line)
}
spin_unlock(&l_mg->free_lock);
- pblk_rl_free_lines_dec(&pblk->rl, line);
+ pblk_rl_free_lines_dec(&pblk->rl, line, true);
if (!pblk_line_init_bb(pblk, line, 0)) {
list_add(&line->list, &l_mg->free_list);
@@ -1233,7 +1233,7 @@ retry:
l_mg->data_line = retry_line;
spin_unlock(&l_mg->free_lock);
- pblk_rl_free_lines_dec(&pblk->rl, retry_line);
+ pblk_rl_free_lines_dec(&pblk->rl, line, false);
if (pblk_line_erase(pblk, retry_line))
goto retry;
@@ -1252,7 +1252,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
{
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line *line;
- int is_next = 0;
spin_lock(&l_mg->free_lock);
line = pblk_line_get(pblk);
@@ -1280,7 +1279,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
} else {
l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
l_mg->data_next->type = PBLK_LINETYPE_DATA;
- is_next = 1;
}
spin_unlock(&l_mg->free_lock);
@@ -1290,10 +1288,6 @@ struct pblk_line *pblk_line_get_first_data(struct pblk *pblk)
return NULL;
}
- pblk_rl_free_lines_dec(&pblk->rl, line);
- if (is_next)
- pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
-
retry_setup:
if (!pblk_line_init_metadata(pblk, line, NULL)) {
line = pblk_line_retry(pblk, line);
@@ -1311,6 +1305,8 @@ retry_setup:
goto retry_setup;
}
+ pblk_rl_free_lines_dec(&pblk->rl, line, true);
+
return line;
}
@@ -1395,7 +1391,6 @@ struct pblk_line *pblk_line_replace_data(struct pblk *pblk)
struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line *cur, *new = NULL;
unsigned int left_seblks;
- int is_next = 0;
cur = l_mg->data_line;
new = l_mg->data_next;
@@ -1444,6 +1439,8 @@ retry_setup:
goto retry_setup;
}
+ pblk_rl_free_lines_dec(&pblk->rl, new, true);
+
/* Allocate next line for preparation */
spin_lock(&l_mg->free_lock);
l_mg->data_next = pblk_line_get(pblk);
@@ -1457,13 +1454,9 @@ retry_setup:
} else {
l_mg->data_next->seq_nr = l_mg->d_seq_nr++;
l_mg->data_next->type = PBLK_LINETYPE_DATA;
- is_next = 1;
}
spin_unlock(&l_mg->free_lock);
- if (is_next)
- pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
-
out:
return new;
}
@@ -1561,8 +1554,8 @@ int pblk_blk_erase_async(struct pblk *pblk, struct ppa_addr ppa)
struct nvm_geo *geo = &dev->geo;
pr_err("pblk: could not async erase line:%d,blk:%d\n",
- pblk_dev_ppa_to_line(ppa),
- pblk_dev_ppa_to_pos(geo, ppa));
+ pblk_ppa_to_line(ppa),
+ pblk_ppa_to_pos(geo, ppa));
}
return err;
@@ -1746,7 +1739,7 @@ void pblk_up_rq(struct pblk *pblk, struct ppa_addr *ppa_list, int nr_ppas,
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
struct pblk_lun *rlun;
- int nr_luns = geo->nr_luns;
+ int nr_luns = geo->all_luns;
int bit = -1;
while ((bit = find_next_bit(lun_bitmap, nr_luns, bit + 1)) < nr_luns) {
@@ -1884,7 +1877,7 @@ void pblk_lookup_l2p_seq(struct pblk *pblk, struct ppa_addr *ppas,
/* If the L2P entry maps to a line, the reference is valid */
if (!pblk_ppa_empty(ppa) && !pblk_addr_in_cache(ppa)) {
- int line_id = pblk_dev_ppa_to_line(ppa);
+ int line_id = pblk_ppa_to_line(ppa);
struct pblk_line *line = &pblk->lines[line_id];
kref_get(&line->ref);
diff --git a/drivers/lightnvm/pblk-gc.c b/drivers/lightnvm/pblk-gc.c
index 9c8e114c8a54..3d899383666e 100644
--- a/drivers/lightnvm/pblk-gc.c
+++ b/drivers/lightnvm/pblk-gc.c
@@ -169,7 +169,14 @@ static void pblk_gc_line_prepare_ws(struct work_struct *work)
* the line untouched. TODO: Implement a recovery routine that scans and
* moves all sectors on the line.
*/
- lba_list = pblk_recov_get_lba_list(pblk, emeta_buf);
+
+ ret = pblk_recov_check_emeta(pblk, emeta_buf);
+ if (ret) {
+ pr_err("pblk: inconsistent emeta (line %d)\n", line->id);
+ goto fail_free_emeta;
+ }
+
+ lba_list = emeta_to_lbas(pblk, emeta_buf);
if (!lba_list) {
pr_err("pblk: could not interpret emeta (line %d)\n", line->id);
goto fail_free_emeta;
@@ -519,22 +526,12 @@ void pblk_gc_should_start(struct pblk *pblk)
}
}
-/*
- * If flush_wq == 1 then no lock should be held by the caller since
- * flush_workqueue can sleep
- */
-static void pblk_gc_stop(struct pblk *pblk, int flush_wq)
-{
- pblk->gc.gc_active = 0;
- pr_debug("pblk: gc stop\n");
-}
-
void pblk_gc_should_stop(struct pblk *pblk)
{
struct pblk_gc *gc = &pblk->gc;
if (gc->gc_active && !gc->gc_forced)
- pblk_gc_stop(pblk, 0);
+ gc->gc_active = 0;
}
void pblk_gc_should_kick(struct pblk *pblk)
@@ -660,7 +657,7 @@ void pblk_gc_exit(struct pblk *pblk)
gc->gc_enabled = 0;
del_timer_sync(&gc->gc_timer);
- pblk_gc_stop(pblk, 1);
+ gc->gc_active = 0;
if (gc->gc_ts)
kthread_stop(gc->gc_ts);
diff --git a/drivers/lightnvm/pblk-init.c b/drivers/lightnvm/pblk-init.c
index 695826a06b5d..93d671ca518e 100644
--- a/drivers/lightnvm/pblk-init.c
+++ b/drivers/lightnvm/pblk-init.c
@@ -169,8 +169,8 @@ static int pblk_set_ppaf(struct pblk *pblk)
}
ppaf.ch_len = power_len;
- power_len = get_count_order(geo->luns_per_chnl);
- if (1 << power_len != geo->luns_per_chnl) {
+ power_len = get_count_order(geo->nr_luns);
+ if (1 << power_len != geo->nr_luns) {
pr_err("pblk: supports only power-of-two LUN config.\n");
return -EINVAL;
}
@@ -254,7 +254,7 @@ static int pblk_core_init(struct pblk *pblk)
struct nvm_geo *geo = &dev->geo;
pblk->pgs_in_buffer = NVM_MEM_PAGE_WRITE * geo->sec_per_pg *
- geo->nr_planes * geo->nr_luns;
+ geo->nr_planes * geo->all_luns;
if (pblk_init_global_caches(pblk))
return -ENOMEM;
@@ -270,21 +270,22 @@ static int pblk_core_init(struct pblk *pblk)
if (!pblk->gen_ws_pool)
goto free_page_bio_pool;
- pblk->rec_pool = mempool_create_slab_pool(geo->nr_luns, pblk_rec_cache);
+ pblk->rec_pool = mempool_create_slab_pool(geo->all_luns,
+ pblk_rec_cache);
if (!pblk->rec_pool)
goto free_gen_ws_pool;
- pblk->r_rq_pool = mempool_create_slab_pool(geo->nr_luns,
+ pblk->r_rq_pool = mempool_create_slab_pool(geo->all_luns,
pblk_g_rq_cache);
if (!pblk->r_rq_pool)
goto free_rec_pool;
- pblk->e_rq_pool = mempool_create_slab_pool(geo->nr_luns,
+ pblk->e_rq_pool = mempool_create_slab_pool(geo->all_luns,
pblk_g_rq_cache);
if (!pblk->e_rq_pool)
goto free_r_rq_pool;
- pblk->w_rq_pool = mempool_create_slab_pool(geo->nr_luns,
+ pblk->w_rq_pool = mempool_create_slab_pool(geo->all_luns,
pblk_w_rq_cache);
if (!pblk->w_rq_pool)
goto free_e_rq_pool;
@@ -354,6 +355,8 @@ static void pblk_core_free(struct pblk *pblk)
mempool_destroy(pblk->e_rq_pool);
mempool_destroy(pblk->w_rq_pool);
+ pblk_rwb_free(pblk);
+
pblk_free_global_caches(pblk);
}
@@ -409,7 +412,7 @@ static int pblk_bb_discovery(struct nvm_tgt_dev *dev, struct pblk_lun *rlun)
u8 *blks;
int nr_blks, ret;
- nr_blks = geo->blks_per_lun * geo->plane_mode;
+ nr_blks = geo->nr_chks * geo->plane_mode;
blks = kmalloc(nr_blks, GFP_KERNEL);
if (!blks)
return -ENOMEM;
@@ -482,20 +485,21 @@ static int pblk_luns_init(struct pblk *pblk, struct ppa_addr *luns)
int i, ret;
/* TODO: Implement unbalanced LUN support */
- if (geo->luns_per_chnl < 0) {
+ if (geo->nr_luns < 0) {
pr_err("pblk: unbalanced LUN config.\n");
return -EINVAL;
}
- pblk->luns = kcalloc(geo->nr_luns, sizeof(struct pblk_lun), GFP_KERNEL);
+ pblk->luns = kcalloc(geo->all_luns, sizeof(struct pblk_lun),
+ GFP_KERNEL);
if (!pblk->luns)
return -ENOMEM;
- for (i = 0; i < geo->nr_luns; i++) {
+ for (i = 0; i < geo->all_luns; i++) {
/* Stripe across channels */
int ch = i % geo->nr_chnls;
int lun_raw = i / geo->nr_chnls;
- int lunid = lun_raw + ch * geo->luns_per_chnl;
+ int lunid = lun_raw + ch * geo->nr_luns;
rlun = &pblk->luns[i];
rlun->bppa = luns[lunid];
@@ -577,22 +581,37 @@ static unsigned int calc_emeta_len(struct pblk *pblk)
static void pblk_set_provision(struct pblk *pblk, long nr_free_blks)
{
struct nvm_tgt_dev *dev = pblk->dev;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
+ struct pblk_line_meta *lm = &pblk->lm;
struct nvm_geo *geo = &dev->geo;
sector_t provisioned;
+ int sec_meta, blk_meta;
- pblk->over_pct = 20;
+ if (geo->op == NVM_TARGET_DEFAULT_OP)
+ pblk->op = PBLK_DEFAULT_OP;
+ else
+ pblk->op = geo->op;
provisioned = nr_free_blks;
- provisioned *= (100 - pblk->over_pct);
+ provisioned *= (100 - pblk->op);
sector_div(provisioned, 100);
+ pblk->op_blks = nr_free_blks - provisioned;
+
/* Internally pblk manages all free blocks, but all calculations based
* on user capacity consider only provisioned blocks
*/
pblk->rl.total_blocks = nr_free_blks;
- pblk->rl.nr_secs = nr_free_blks * geo->sec_per_blk;
- pblk->capacity = provisioned * geo->sec_per_blk;
+ pblk->rl.nr_secs = nr_free_blks * geo->sec_per_chk;
+
+ /* Consider sectors used for metadata */
+ sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
+ blk_meta = DIV_ROUND_UP(sec_meta, geo->sec_per_chk);
+
+ pblk->capacity = (provisioned - blk_meta) * geo->sec_per_chk;
+
atomic_set(&pblk->rl.free_blocks, nr_free_blks);
+ atomic_set(&pblk->rl.free_user_blocks, nr_free_blks);
}
static int pblk_lines_alloc_metadata(struct pblk *pblk)
@@ -683,7 +702,7 @@ static int pblk_lines_init(struct pblk *pblk)
int i, ret;
pblk->min_write_pgs = geo->sec_per_pl * (geo->sec_size / PAGE_SIZE);
- max_write_ppas = pblk->min_write_pgs * geo->nr_luns;
+ max_write_ppas = pblk->min_write_pgs * geo->all_luns;
pblk->max_write_pgs = (max_write_ppas < nvm_max_phys_sects(dev)) ?
max_write_ppas : nvm_max_phys_sects(dev);
pblk_set_sec_per_write(pblk, pblk->min_write_pgs);
@@ -693,26 +712,26 @@ static int pblk_lines_init(struct pblk *pblk)
return -EINVAL;
}
- div_u64_rem(geo->sec_per_blk, pblk->min_write_pgs, &mod);
+ div_u64_rem(geo->sec_per_chk, pblk->min_write_pgs, &mod);
if (mod) {
pr_err("pblk: bad configuration of sectors/pages\n");
return -EINVAL;
}
- l_mg->nr_lines = geo->blks_per_lun;
+ l_mg->nr_lines = geo->nr_chks;
l_mg->log_line = l_mg->data_line = NULL;
l_mg->l_seq_nr = l_mg->d_seq_nr = 0;
l_mg->nr_free_lines = 0;
bitmap_zero(&l_mg->meta_bitmap, PBLK_DATA_LINES);
- lm->sec_per_line = geo->sec_per_blk * geo->nr_luns;
- lm->blk_per_line = geo->nr_luns;
- lm->blk_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
+ lm->sec_per_line = geo->sec_per_chk * geo->all_luns;
+ lm->blk_per_line = geo->all_luns;
+ lm->blk_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
lm->sec_bitmap_len = BITS_TO_LONGS(lm->sec_per_line) * sizeof(long);
- lm->lun_bitmap_len = BITS_TO_LONGS(geo->nr_luns) * sizeof(long);
+ lm->lun_bitmap_len = BITS_TO_LONGS(geo->all_luns) * sizeof(long);
lm->mid_thrs = lm->sec_per_line / 2;
lm->high_thrs = lm->sec_per_line / 4;
- lm->meta_distance = (geo->nr_luns / 2) * pblk->min_write_pgs;
+ lm->meta_distance = (geo->all_luns / 2) * pblk->min_write_pgs;
/* Calculate necessary pages for smeta. See comment over struct
* line_smeta definition
@@ -742,12 +761,12 @@ add_emeta_page:
goto add_emeta_page;
}
- lm->emeta_bb = geo->nr_luns > i ? geo->nr_luns - i : 0;
+ lm->emeta_bb = geo->all_luns > i ? geo->all_luns - i : 0;
lm->min_blk_line = 1;
- if (geo->nr_luns > 1)
+ if (geo->all_luns > 1)
lm->min_blk_line += DIV_ROUND_UP(lm->smeta_sec +
- lm->emeta_sec[0], geo->sec_per_blk);
+ lm->emeta_sec[0], geo->sec_per_chk);
if (lm->min_blk_line > lm->blk_per_line) {
pr_err("pblk: config. not supported. Min. LUN in line:%d\n",
@@ -772,7 +791,7 @@ add_emeta_page:
goto fail_free_bb_template;
}
- bb_distance = (geo->nr_luns) * geo->sec_per_pl;
+ bb_distance = (geo->all_luns) * geo->sec_per_pl;
for (i = 0; i < lm->sec_per_line; i += bb_distance)
bitmap_set(l_mg->bb_template, i, geo->sec_per_pl);
@@ -844,7 +863,7 @@ add_emeta_page:
pblk_set_provision(pblk, nr_free_blks);
/* Cleanup per-LUN bad block lists - managed within lines on run-time */
- for (i = 0; i < geo->nr_luns; i++)
+ for (i = 0; i < geo->all_luns; i++)
kfree(pblk->luns[i].bb_list);
return 0;
@@ -858,7 +877,7 @@ fail_free_bb_template:
fail_free_meta:
pblk_line_meta_free(pblk);
fail:
- for (i = 0; i < geo->nr_luns; i++)
+ for (i = 0; i < geo->all_luns; i++)
kfree(pblk->luns[i].bb_list);
return ret;
@@ -866,15 +885,19 @@ fail:
static int pblk_writer_init(struct pblk *pblk)
{
- timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
- mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
-
pblk->writer_ts = kthread_create(pblk_write_ts, pblk, "pblk-writer-t");
if (IS_ERR(pblk->writer_ts)) {
- pr_err("pblk: could not allocate writer kthread\n");
- return PTR_ERR(pblk->writer_ts);
+ int err = PTR_ERR(pblk->writer_ts);
+
+ if (err != -EINTR)
+ pr_err("pblk: could not allocate writer kthread (%d)\n",
+ err);
+ return err;
}
+ timer_setup(&pblk->wtimer, pblk_write_timer_fn, 0);
+ mod_timer(&pblk->wtimer, jiffies + msecs_to_jiffies(100));
+
return 0;
}
@@ -910,7 +933,6 @@ static void pblk_tear_down(struct pblk *pblk)
pblk_pipeline_stop(pblk);
pblk_writer_stop(pblk);
pblk_rb_sync_l2p(&pblk->rwb);
- pblk_rwb_free(pblk);
pblk_rl_free(&pblk->rl);
pr_debug("pblk: consistent tear down\n");
@@ -1025,7 +1047,8 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
ret = pblk_writer_init(pblk);
if (ret) {
- pr_err("pblk: could not initialize write thread\n");
+ if (ret != -EINTR)
+ pr_err("pblk: could not initialize write thread\n");
goto fail_free_lines;
}
@@ -1041,13 +1064,14 @@ static void *pblk_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
blk_queue_write_cache(tqueue, true, false);
- tqueue->limits.discard_granularity = geo->pgs_per_blk * geo->pfpg_size;
+ tqueue->limits.discard_granularity = geo->sec_per_chk * geo->sec_size;
tqueue->limits.discard_alignment = 0;
blk_queue_max_discard_sectors(tqueue, UINT_MAX >> 9);
queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, tqueue);
- pr_info("pblk init: luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
- geo->nr_luns, pblk->l_mg.nr_lines,
+ pr_info("pblk(%s): luns:%u, lines:%d, secs:%llu, buf entries:%u\n",
+ tdisk->disk_name,
+ geo->all_luns, pblk->l_mg.nr_lines,
(unsigned long long)pblk->rl.nr_secs,
pblk->rwb.nr_entries);
diff --git a/drivers/lightnvm/pblk-map.c b/drivers/lightnvm/pblk-map.c
index 6f3ecde2140f..7445e6430c52 100644
--- a/drivers/lightnvm/pblk-map.c
+++ b/drivers/lightnvm/pblk-map.c
@@ -146,7 +146,7 @@ void pblk_map_erase_rq(struct pblk *pblk, struct nvm_rq *rqd,
return;
/* Erase blocks that are bad in this line but might not be in next */
- if (unlikely(ppa_empty(*erase_ppa)) &&
+ if (unlikely(pblk_ppa_empty(*erase_ppa)) &&
bitmap_weight(d_line->blk_bitmap, lm->blk_per_line)) {
int bit = -1;
diff --git a/drivers/lightnvm/pblk-rb.c b/drivers/lightnvm/pblk-rb.c
index b8f78e401482..ec8fc314646b 100644
--- a/drivers/lightnvm/pblk-rb.c
+++ b/drivers/lightnvm/pblk-rb.c
@@ -54,7 +54,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
rb->seg_size = (1 << power_seg_sz);
rb->nr_entries = (1 << power_size);
rb->mem = rb->subm = rb->sync = rb->l2p_update = 0;
- rb->sync_point = EMPTY_ENTRY;
+ rb->flush_point = EMPTY_ENTRY;
spin_lock_init(&rb->w_lock);
spin_lock_init(&rb->s_lock);
@@ -112,7 +112,7 @@ int pblk_rb_init(struct pblk_rb *rb, struct pblk_rb_entry *rb_entry_base,
up_write(&pblk_rb_lock);
#ifdef CONFIG_NVM_DEBUG
- atomic_set(&rb->inflight_sync_point, 0);
+ atomic_set(&rb->inflight_flush_point, 0);
#endif
/*
@@ -226,7 +226,7 @@ static int __pblk_rb_update_l2p(struct pblk_rb *rb, unsigned int to_update)
pblk_update_map_dev(pblk, w_ctx->lba, w_ctx->ppa,
entry->cacheline);
- line = &pblk->lines[pblk_tgt_ppa_to_line(w_ctx->ppa)];
+ line = &pblk->lines[pblk_ppa_to_line(w_ctx->ppa)];
kref_put(&line->ref, pblk_line_put);
clean_wctx(w_ctx);
rb->l2p_update = (rb->l2p_update + 1) & (rb->nr_entries - 1);
@@ -349,35 +349,35 @@ void pblk_rb_write_entry_gc(struct pblk_rb *rb, void *data,
smp_store_release(&entry->w_ctx.flags, flags);
}
-static int pblk_rb_sync_point_set(struct pblk_rb *rb, struct bio *bio,
+static int pblk_rb_flush_point_set(struct pblk_rb *rb, struct bio *bio,
unsigned int pos)
{
struct pblk_rb_entry *entry;
- unsigned int subm, sync_point;
+ unsigned int sync, flush_point;
- subm = READ_ONCE(rb->subm);
+ sync = READ_ONCE(rb->sync);
+
+ if (pos == sync)
+ return 0;
#ifdef CONFIG_NVM_DEBUG
- atomic_inc(&rb->inflight_sync_point);
+ atomic_inc(&rb->inflight_flush_point);
#endif
- if (pos == subm)
- return 0;
+ flush_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
+ entry = &rb->entries[flush_point];
- sync_point = (pos == 0) ? (rb->nr_entries - 1) : (pos - 1);
- entry = &rb->entries[sync_point];
+ pblk_rb_sync_init(rb, NULL);
- /* Protect syncs */
- smp_store_release(&rb->sync_point, sync_point);
+ /* Protect flush points */
+ smp_store_release(&rb->flush_point, flush_point);
- if (!bio)
- return 0;
+ if (bio)
+ bio_list_add(&entry->w_ctx.bios, bio);
- spin_lock_irq(&rb->s_lock);
- bio_list_add(&entry->w_ctx.bios, bio);
- spin_unlock_irq(&rb->s_lock);
+ pblk_rb_sync_end(rb, NULL);
- return 1;
+ return bio ? 1 : 0;
}
static int __pblk_rb_may_write(struct pblk_rb *rb, unsigned int nr_entries,
@@ -416,7 +416,7 @@ void pblk_rb_flush(struct pblk_rb *rb)
struct pblk *pblk = container_of(rb, struct pblk, rwb);
unsigned int mem = READ_ONCE(rb->mem);
- if (pblk_rb_sync_point_set(rb, NULL, mem))
+ if (pblk_rb_flush_point_set(rb, NULL, mem))
return;
pblk_write_should_kick(pblk);
@@ -440,7 +440,7 @@ static int pblk_rb_may_write_flush(struct pblk_rb *rb, unsigned int nr_entries,
#ifdef CONFIG_NVM_DEBUG
atomic_long_inc(&pblk->nr_flush);
#endif
- if (pblk_rb_sync_point_set(&pblk->rwb, bio, mem))
+ if (pblk_rb_flush_point_set(&pblk->rwb, bio, mem))
*io_ret = NVM_IO_OK;
}
@@ -606,21 +606,6 @@ try:
return NVM_IO_ERR;
}
- if (flags & PBLK_FLUSH_ENTRY) {
- unsigned int sync_point;
-
- sync_point = READ_ONCE(rb->sync_point);
- if (sync_point == pos) {
- /* Protect syncs */
- smp_store_release(&rb->sync_point, EMPTY_ENTRY);
- }
-
- flags &= ~PBLK_FLUSH_ENTRY;
-#ifdef CONFIG_NVM_DEBUG
- atomic_dec(&rb->inflight_sync_point);
-#endif
- }
-
flags &= ~PBLK_WRITTEN_DATA;
flags |= PBLK_SUBMITTED_ENTRY;
@@ -730,15 +715,24 @@ void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags)
unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
{
- unsigned int sync;
- unsigned int i;
-
+ unsigned int sync, flush_point;
lockdep_assert_held(&rb->s_lock);
sync = READ_ONCE(rb->sync);
+ flush_point = READ_ONCE(rb->flush_point);
- for (i = 0; i < nr_entries; i++)
- sync = (sync + 1) & (rb->nr_entries - 1);
+ if (flush_point != EMPTY_ENTRY) {
+ unsigned int secs_to_flush;
+
+ secs_to_flush = pblk_rb_ring_count(flush_point, sync,
+ rb->nr_entries);
+ if (secs_to_flush < nr_entries) {
+ /* Protect flush points */
+ smp_store_release(&rb->flush_point, EMPTY_ENTRY);
+ }
+ }
+
+ sync = (sync + nr_entries) & (rb->nr_entries - 1);
/* Protect from counts */
smp_store_release(&rb->sync, sync);
@@ -746,22 +740,27 @@ unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries)
return sync;
}
-unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb)
+/* Calculate how many sectors to submit up to the current flush point. */
+unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb)
{
- unsigned int subm, sync_point;
- unsigned int count;
+ unsigned int subm, sync, flush_point;
+ unsigned int submitted, to_flush;
- /* Protect syncs */
- sync_point = smp_load_acquire(&rb->sync_point);
- if (sync_point == EMPTY_ENTRY)
+ /* Protect flush points */
+ flush_point = smp_load_acquire(&rb->flush_point);
+ if (flush_point == EMPTY_ENTRY)
return 0;
+ /* Protect syncs */
+ sync = smp_load_acquire(&rb->sync);
+
subm = READ_ONCE(rb->subm);
+ submitted = pblk_rb_ring_count(subm, sync, rb->nr_entries);
/* The sync point itself counts as a sector to sync */
- count = pblk_rb_ring_count(sync_point, subm, rb->nr_entries) + 1;
+ to_flush = pblk_rb_ring_count(flush_point, sync, rb->nr_entries) + 1;
- return count;
+ return (submitted < to_flush) ? (to_flush - submitted) : 0;
}
/*
@@ -801,7 +800,7 @@ int pblk_rb_tear_down_check(struct pblk_rb *rb)
if ((rb->mem == rb->subm) && (rb->subm == rb->sync) &&
(rb->sync == rb->l2p_update) &&
- (rb->sync_point == EMPTY_ENTRY)) {
+ (rb->flush_point == EMPTY_ENTRY)) {
goto out;
}
@@ -848,7 +847,7 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
queued_entries++;
spin_unlock_irq(&rb->s_lock);
- if (rb->sync_point != EMPTY_ENTRY)
+ if (rb->flush_point != EMPTY_ENTRY)
offset = scnprintf(buf, PAGE_SIZE,
"%u\t%u\t%u\t%u\t%u\t%u\t%u - %u/%u/%u - %d\n",
rb->nr_entries,
@@ -857,14 +856,14 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
rb->sync,
rb->l2p_update,
#ifdef CONFIG_NVM_DEBUG
- atomic_read(&rb->inflight_sync_point),
+ atomic_read(&rb->inflight_flush_point),
#else
0,
#endif
- rb->sync_point,
+ rb->flush_point,
pblk_rb_read_count(rb),
pblk_rb_space(rb),
- pblk_rb_sync_point_count(rb),
+ pblk_rb_flush_point_count(rb),
queued_entries);
else
offset = scnprintf(buf, PAGE_SIZE,
@@ -875,13 +874,13 @@ ssize_t pblk_rb_sysfs(struct pblk_rb *rb, char *buf)
rb->sync,
rb->l2p_update,
#ifdef CONFIG_NVM_DEBUG
- atomic_read(&rb->inflight_sync_point),
+ atomic_read(&rb->inflight_flush_point),
#else
0,
#endif
pblk_rb_read_count(rb),
pblk_rb_space(rb),
- pblk_rb_sync_point_count(rb),
+ pblk_rb_flush_point_count(rb),
queued_entries);
return offset;
diff --git a/drivers/lightnvm/pblk-read.c b/drivers/lightnvm/pblk-read.c
index ca79d8fb3e60..2f761283f43e 100644
--- a/drivers/lightnvm/pblk-read.c
+++ b/drivers/lightnvm/pblk-read.c
@@ -141,7 +141,7 @@ static void pblk_read_put_rqd_kref(struct pblk *pblk, struct nvm_rq *rqd)
struct ppa_addr ppa = ppa_list[i];
struct pblk_line *line;
- line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
+ line = &pblk->lines[pblk_ppa_to_line(ppa)];
kref_put(&line->ref, pblk_line_put_wq);
}
}
@@ -158,8 +158,12 @@ static void pblk_end_user_read(struct bio *bio)
static void __pblk_end_io_read(struct pblk *pblk, struct nvm_rq *rqd,
bool put_line)
{
+ struct nvm_tgt_dev *dev = pblk->dev;
struct pblk_g_ctx *r_ctx = nvm_rq_to_pdu(rqd);
struct bio *bio = rqd->bio;
+ unsigned long start_time = r_ctx->start_time;
+
+ generic_end_io_acct(dev->q, READ, &pblk->disk->part0, start_time);
if (rqd->error)
pblk_log_read_err(pblk, rqd);
@@ -193,9 +197,9 @@ static void pblk_end_io_read(struct nvm_rq *rqd)
__pblk_end_io_read(pblk, rqd, true);
}
-static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
- unsigned int bio_init_idx,
- unsigned long *read_bitmap)
+static int pblk_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
+ unsigned int bio_init_idx,
+ unsigned long *read_bitmap)
{
struct bio *new_bio, *bio = rqd->bio;
struct pblk_sec_meta *meta_list = rqd->meta_list;
@@ -270,7 +274,7 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
i = 0;
hole = find_first_zero_bit(read_bitmap, nr_secs);
do {
- int line_id = pblk_dev_ppa_to_line(rqd->ppa_list[i]);
+ int line_id = pblk_ppa_to_line(rqd->ppa_list[i]);
struct pblk_line *line = &pblk->lines[line_id];
kref_put(&line->ref, pblk_line_put);
@@ -306,6 +310,8 @@ static int pblk_fill_partial_read_bio(struct pblk *pblk, struct nvm_rq *rqd,
return NVM_IO_OK;
err:
+ pr_err("pblk: failed to perform partial read\n");
+
/* Free allocated pages in new bio */
pblk_bio_free_pages(pblk, bio, 0, new_bio->bi_vcnt);
__pblk_end_io_read(pblk, rqd, false);
@@ -357,6 +363,7 @@ retry:
int pblk_submit_read(struct pblk *pblk, struct bio *bio)
{
struct nvm_tgt_dev *dev = pblk->dev;
+ struct request_queue *q = dev->q;
sector_t blba = pblk_get_lba(bio);
unsigned int nr_secs = pblk_get_secs(bio);
struct pblk_g_ctx *r_ctx;
@@ -372,6 +379,8 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
return NVM_IO_ERR;
}
+ generic_start_io_acct(q, READ, bio_sectors(bio), &pblk->disk->part0);
+
bitmap_zero(&read_bitmap, nr_secs);
rqd = pblk_alloc_rqd(pblk, PBLK_READ);
@@ -383,6 +392,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
rqd->end_io = pblk_end_io_read;
r_ctx = nvm_rq_to_pdu(rqd);
+ r_ctx->start_time = jiffies;
r_ctx->lba = blba;
/* Save the index for this bio's start. This is needed in case
@@ -422,7 +432,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
int_bio = bio_clone_fast(bio, GFP_KERNEL, pblk_bio_set);
if (!int_bio) {
pr_err("pblk: could not clone read bio\n");
- return NVM_IO_ERR;
+ goto fail_end_io;
}
rqd->bio = int_bio;
@@ -433,7 +443,7 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
pr_err("pblk: read IO submission failed\n");
if (int_bio)
bio_put(int_bio);
- return ret;
+ goto fail_end_io;
}
return NVM_IO_OK;
@@ -442,17 +452,14 @@ int pblk_submit_read(struct pblk *pblk, struct bio *bio)
/* The read bio request could be partially filled by the write buffer,
* but there are some holes that need to be read from the drive.
*/
- ret = pblk_fill_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap);
- if (ret) {
- pr_err("pblk: failed to perform partial read\n");
- return ret;
- }
-
- return NVM_IO_OK;
+ return pblk_partial_read_bio(pblk, rqd, bio_init_idx, &read_bitmap);
fail_rqd_free:
pblk_free_rqd(pblk, rqd, PBLK_READ);
return ret;
+fail_end_io:
+ __pblk_end_io_read(pblk, rqd, false);
+ return ret;
}
static int read_ppalist_rq_gc(struct pblk *pblk, struct nvm_rq *rqd,
diff --git a/drivers/lightnvm/pblk-recovery.c b/drivers/lightnvm/pblk-recovery.c
index eadb3eb5d4dc..1d5e961bf5e0 100644
--- a/drivers/lightnvm/pblk-recovery.c
+++ b/drivers/lightnvm/pblk-recovery.c
@@ -111,18 +111,18 @@ int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
return 0;
}
-__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta_buf)
+int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta_buf)
{
u32 crc;
crc = pblk_calc_emeta_crc(pblk, emeta_buf);
if (le32_to_cpu(emeta_buf->crc) != crc)
- return NULL;
+ return 1;
if (le32_to_cpu(emeta_buf->header.identifier) != PBLK_MAGIC)
- return NULL;
+ return 1;
- return emeta_to_lbas(pblk, emeta_buf);
+ return 0;
}
static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
@@ -137,7 +137,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
u64 nr_valid_lbas, nr_lbas = 0;
u64 i;
- lba_list = pblk_recov_get_lba_list(pblk, emeta_buf);
+ lba_list = emeta_to_lbas(pblk, emeta_buf);
if (!lba_list)
return 1;
@@ -149,7 +149,7 @@ static int pblk_recov_l2p_from_emeta(struct pblk *pblk, struct pblk_line *line)
struct ppa_addr ppa;
int pos;
- ppa = addr_to_pblk_ppa(pblk, i, line->id);
+ ppa = addr_to_gen_ppa(pblk, i, line->id);
pos = pblk_ppa_to_pos(geo, ppa);
/* Do not update bad blocks */
@@ -188,7 +188,7 @@ static int pblk_calc_sec_in_line(struct pblk *pblk, struct pblk_line *line)
int nr_bb = bitmap_weight(line->blk_bitmap, lm->blk_per_line);
return lm->sec_per_line - lm->smeta_sec - lm->emeta_sec[0] -
- nr_bb * geo->sec_per_blk;
+ nr_bb * geo->sec_per_chk;
}
struct pblk_recov_alloc {
@@ -263,12 +263,12 @@ next_read_rq:
int pos;
ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
- pos = pblk_dev_ppa_to_pos(geo, ppa);
+ pos = pblk_ppa_to_pos(geo, ppa);
while (test_bit(pos, line->blk_bitmap)) {
r_ptr_int += pblk->min_write_pgs;
ppa = addr_to_gen_ppa(pblk, r_ptr_int, line->id);
- pos = pblk_dev_ppa_to_pos(geo, ppa);
+ pos = pblk_ppa_to_pos(geo, ppa);
}
for (j = 0; j < pblk->min_write_pgs; j++, i++, r_ptr_int++)
@@ -288,7 +288,7 @@ next_read_rq:
/* At this point, the read should not fail. If it does, it is a problem
* we cannot recover from here. Need FTL log.
*/
- if (rqd->error) {
+ if (rqd->error && rqd->error != NVM_RSP_WARN_HIGHECC) {
pr_err("pblk: L2P recovery failed (%d)\n", rqd->error);
return -EINTR;
}
@@ -411,12 +411,12 @@ next_pad_rq:
int pos;
w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
- ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id);
+ ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
pos = pblk_ppa_to_pos(geo, ppa);
while (test_bit(pos, line->blk_bitmap)) {
w_ptr += pblk->min_write_pgs;
- ppa = addr_to_pblk_ppa(pblk, w_ptr, line->id);
+ ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
pos = pblk_ppa_to_pos(geo, ppa);
}
@@ -541,12 +541,12 @@ next_rq:
w_ptr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
- pos = pblk_dev_ppa_to_pos(geo, ppa);
+ pos = pblk_ppa_to_pos(geo, ppa);
while (test_bit(pos, line->blk_bitmap)) {
w_ptr += pblk->min_write_pgs;
ppa = addr_to_gen_ppa(pblk, w_ptr, line->id);
- pos = pblk_dev_ppa_to_pos(geo, ppa);
+ pos = pblk_ppa_to_pos(geo, ppa);
}
for (j = 0; j < pblk->min_write_pgs; j++, i++, w_ptr++)
@@ -672,12 +672,12 @@ next_rq:
paddr = pblk_alloc_page(pblk, line, pblk->min_write_pgs);
ppa = addr_to_gen_ppa(pblk, paddr, line->id);
- pos = pblk_dev_ppa_to_pos(geo, ppa);
+ pos = pblk_ppa_to_pos(geo, ppa);
while (test_bit(pos, line->blk_bitmap)) {
paddr += pblk->min_write_pgs;
ppa = addr_to_gen_ppa(pblk, paddr, line->id);
- pos = pblk_dev_ppa_to_pos(geo, ppa);
+ pos = pblk_ppa_to_pos(geo, ppa);
}
for (j = 0; j < pblk->min_write_pgs; j++, i++, paddr++)
@@ -817,7 +817,7 @@ static u64 pblk_line_emeta_start(struct pblk *pblk, struct pblk_line *line)
while (emeta_secs) {
emeta_start--;
- ppa = addr_to_pblk_ppa(pblk, emeta_start, line->id);
+ ppa = addr_to_gen_ppa(pblk, emeta_start, line->id);
pos = pblk_ppa_to_pos(geo, ppa);
if (!test_bit(pos, line->blk_bitmap))
emeta_secs--;
@@ -938,6 +938,11 @@ struct pblk_line *pblk_recov_l2p(struct pblk *pblk)
goto next;
}
+ if (pblk_recov_check_emeta(pblk, line->emeta->buf)) {
+ pblk_recov_l2p_from_oob(pblk, line);
+ goto next;
+ }
+
if (pblk_recov_l2p_from_emeta(pblk, line))
pblk_recov_l2p_from_oob(pblk, line);
@@ -984,10 +989,8 @@ next:
}
spin_unlock(&l_mg->free_lock);
- if (is_next) {
+ if (is_next)
pblk_line_erase(pblk, l_mg->data_next);
- pblk_rl_free_lines_dec(&pblk->rl, l_mg->data_next);
- }
out:
if (found_lines != recovered_lines)
diff --git a/drivers/lightnvm/pblk-rl.c b/drivers/lightnvm/pblk-rl.c
index dacc71922260..0d457b162f23 100644
--- a/drivers/lightnvm/pblk-rl.c
+++ b/drivers/lightnvm/pblk-rl.c
@@ -89,17 +89,15 @@ unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl)
return atomic_read(&rl->free_blocks);
}
-/*
- * We check for (i) the number of free blocks in the current LUN and (ii) the
- * total number of free blocks in the pblk instance. This is to even out the
- * number of free blocks on each LUN when GC kicks in.
- *
- * Only the total number of free blocks is used to configure the rate limiter.
- */
-void pblk_rl_update_rates(struct pblk_rl *rl)
+unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl)
+{
+ return atomic_read(&rl->free_user_blocks);
+}
+
+static void __pblk_rl_update_rates(struct pblk_rl *rl,
+ unsigned long free_blocks)
{
struct pblk *pblk = container_of(rl, struct pblk, rl);
- unsigned long free_blocks = pblk_rl_nr_free_blks(rl);
int max = rl->rb_budget;
if (free_blocks >= rl->high) {
@@ -132,20 +130,37 @@ void pblk_rl_update_rates(struct pblk_rl *rl)
pblk_gc_should_stop(pblk);
}
+void pblk_rl_update_rates(struct pblk_rl *rl)
+{
+ __pblk_rl_update_rates(rl, pblk_rl_nr_user_free_blks(rl));
+}
+
void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line)
{
int blk_in_line = atomic_read(&line->blk_in_line);
+ int free_blocks;
atomic_add(blk_in_line, &rl->free_blocks);
- pblk_rl_update_rates(rl);
+ free_blocks = atomic_add_return(blk_in_line, &rl->free_user_blocks);
+
+ __pblk_rl_update_rates(rl, free_blocks);
}
-void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line)
+void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line,
+ bool used)
{
int blk_in_line = atomic_read(&line->blk_in_line);
+ int free_blocks;
atomic_sub(blk_in_line, &rl->free_blocks);
- pblk_rl_update_rates(rl);
+
+ if (used)
+ free_blocks = atomic_sub_return(blk_in_line,
+ &rl->free_user_blocks);
+ else
+ free_blocks = atomic_read(&rl->free_user_blocks);
+
+ __pblk_rl_update_rates(rl, free_blocks);
}
int pblk_rl_high_thrs(struct pblk_rl *rl)
@@ -174,16 +189,21 @@ void pblk_rl_free(struct pblk_rl *rl)
void pblk_rl_init(struct pblk_rl *rl, int budget)
{
struct pblk *pblk = container_of(rl, struct pblk, rl);
+ struct nvm_tgt_dev *dev = pblk->dev;
+ struct nvm_geo *geo = &dev->geo;
+ struct pblk_line_mgmt *l_mg = &pblk->l_mg;
struct pblk_line_meta *lm = &pblk->lm;
int min_blocks = lm->blk_per_line * PBLK_GC_RSV_LINE;
+ int sec_meta, blk_meta;
+
unsigned int rb_windows;
- rl->high = rl->total_blocks / PBLK_USER_HIGH_THRS;
- rl->high_pw = get_count_order(rl->high);
+ /* Consider sectors used for metadata */
+ sec_meta = (lm->smeta_sec + lm->emeta_sec[0]) * l_mg->nr_free_lines;
+ blk_meta = DIV_ROUND_UP(sec_meta, geo->sec_per_chk);
- rl->low = rl->total_blocks / PBLK_USER_LOW_THRS;
- if (rl->low < min_blocks)
- rl->low = min_blocks;
+ rl->high = pblk->op_blks - blk_meta - lm->blk_per_line;
+ rl->high_pw = get_count_order(rl->high);
rl->rsv_blocks = min_blocks;
diff --git a/drivers/lightnvm/pblk-sysfs.c b/drivers/lightnvm/pblk-sysfs.c
index cd49e8875d4e..620bab853579 100644
--- a/drivers/lightnvm/pblk-sysfs.c
+++ b/drivers/lightnvm/pblk-sysfs.c
@@ -28,7 +28,7 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
ssize_t sz = 0;
int i;
- for (i = 0; i < geo->nr_luns; i++) {
+ for (i = 0; i < geo->all_luns; i++) {
int active = 1;
rlun = &pblk->luns[i];
@@ -49,11 +49,12 @@ static ssize_t pblk_sysfs_luns_show(struct pblk *pblk, char *page)
static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
{
- int free_blocks, total_blocks;
+ int free_blocks, free_user_blocks, total_blocks;
int rb_user_max, rb_user_cnt;
int rb_gc_max, rb_gc_cnt, rb_budget, rb_state;
- free_blocks = atomic_read(&pblk->rl.free_blocks);
+ free_blocks = pblk_rl_nr_free_blks(&pblk->rl);
+ free_user_blocks = pblk_rl_nr_user_free_blks(&pblk->rl);
rb_user_max = pblk->rl.rb_user_max;
rb_user_cnt = atomic_read(&pblk->rl.rb_user_cnt);
rb_gc_max = pblk->rl.rb_gc_max;
@@ -64,16 +65,16 @@ static ssize_t pblk_sysfs_rate_limiter(struct pblk *pblk, char *page)
total_blocks = pblk->rl.total_blocks;
return snprintf(page, PAGE_SIZE,
- "u:%u/%u,gc:%u/%u(%u/%u)(stop:<%u,full:>%u,free:%d/%d)-%d\n",
+ "u:%u/%u,gc:%u/%u(%u)(stop:<%u,full:>%u,free:%d/%d/%d)-%d\n",
rb_user_cnt,
rb_user_max,
rb_gc_cnt,
rb_gc_max,
rb_state,
rb_budget,
- pblk->rl.low,
pblk->rl.high,
free_blocks,
+ free_user_blocks,
total_blocks,
READ_ONCE(pblk->rl.rb_user_active));
}
@@ -238,7 +239,7 @@ static ssize_t pblk_sysfs_lines(struct pblk *pblk, char *page)
sz = snprintf(page, PAGE_SIZE - sz,
"line: nluns:%d, nblks:%d, nsecs:%d\n",
- geo->nr_luns, lm->blk_per_line, lm->sec_per_line);
+ geo->all_luns, lm->blk_per_line, lm->sec_per_line);
sz += snprintf(page + sz, PAGE_SIZE - sz,
"lines:d:%d,l:%d-f:%d,m:%d/%d,c:%d,b:%d,co:%d(d:%d,l:%d)t:%d\n",
@@ -287,7 +288,7 @@ static ssize_t pblk_sysfs_lines_info(struct pblk *pblk, char *page)
"blk_line:%d, sec_line:%d, sec_blk:%d\n",
lm->blk_per_line,
lm->sec_per_line,
- geo->sec_per_blk);
+ geo->sec_per_chk);
return sz;
}
diff --git a/drivers/lightnvm/pblk-write.c b/drivers/lightnvm/pblk-write.c
index 6c1cafafef53..aae86ed60b98 100644
--- a/drivers/lightnvm/pblk-write.c
+++ b/drivers/lightnvm/pblk-write.c
@@ -21,13 +21,28 @@ static unsigned long pblk_end_w_bio(struct pblk *pblk, struct nvm_rq *rqd,
struct pblk_c_ctx *c_ctx)
{
struct bio *original_bio;
+ struct pblk_rb *rwb = &pblk->rwb;
unsigned long ret;
int i;
for (i = 0; i < c_ctx->nr_valid; i++) {
struct pblk_w_ctx *w_ctx;
+ int pos = c_ctx->sentry + i;
+ int flags;
+
+ w_ctx = pblk_rb_w_ctx(rwb, pos);
+ flags = READ_ONCE(w_ctx->flags);
+
+ if (flags & PBLK_FLUSH_ENTRY) {
+ flags &= ~PBLK_FLUSH_ENTRY;
+ /* Release flags on context. Protect from writes */
+ smp_store_release(&w_ctx->flags, flags);
+
+#ifdef CONFIG_NVM_DEBUG
+ atomic_dec(&rwb->inflight_flush_point);
+#endif
+ }
- w_ctx = pblk_rb_w_ctx(&pblk->rwb, c_ctx->sentry + i);
while ((original_bio = bio_list_pop(&w_ctx->bios)))
bio_endio(original_bio);
}
@@ -439,7 +454,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
struct pblk_line *meta_line;
int err;
- ppa_set_empty(&erase_ppa);
+ pblk_ppa_set_empty(&erase_ppa);
/* Assign lbas to ppas and populate request structure */
err = pblk_setup_w_rq(pblk, rqd, &erase_ppa);
@@ -457,7 +472,7 @@ static int pblk_submit_io_set(struct pblk *pblk, struct nvm_rq *rqd)
return NVM_IO_ERR;
}
- if (!ppa_empty(erase_ppa)) {
+ if (!pblk_ppa_empty(erase_ppa)) {
/* Submit erase for next data line */
if (pblk_blk_erase_async(pblk, erase_ppa)) {
struct pblk_line *e_line = pblk_line_get_erase(pblk);
@@ -508,7 +523,7 @@ static int pblk_submit_write(struct pblk *pblk)
if (!secs_avail)
return 1;
- secs_to_flush = pblk_rb_sync_point_count(&pblk->rwb);
+ secs_to_flush = pblk_rb_flush_point_count(&pblk->rwb);
if (!secs_to_flush && secs_avail < pblk->min_write_pgs)
return 1;
diff --git a/drivers/lightnvm/pblk.h b/drivers/lightnvm/pblk.h
index 59a64d461a5d..8c357fb6538e 100644
--- a/drivers/lightnvm/pblk.h
+++ b/drivers/lightnvm/pblk.h
@@ -51,17 +51,16 @@
#define NR_PHY_IN_LOG (PBLK_EXPOSED_PAGE_SIZE / PBLK_SECTOR)
-#define pblk_for_each_lun(pblk, rlun, i) \
- for ((i) = 0, rlun = &(pblk)->luns[0]; \
- (i) < (pblk)->nr_luns; (i)++, rlun = &(pblk)->luns[(i)])
-
/* Static pool sizes */
#define PBLK_GEN_WS_POOL_SIZE (2)
+#define PBLK_DEFAULT_OP (11)
+
enum {
PBLK_READ = READ,
PBLK_WRITE = WRITE,/* Write from write buffer */
PBLK_WRITE_INT, /* Internal write - no write buffer */
+ PBLK_READ_RECOV, /* Recovery read - errors allowed */
PBLK_ERASE,
};
@@ -114,6 +113,7 @@ struct pblk_c_ctx {
/* read context */
struct pblk_g_ctx {
void *private;
+ unsigned long start_time;
u64 lba;
};
@@ -170,7 +170,7 @@ struct pblk_rb {
* the last submitted entry that has
* been successfully persisted to media
*/
- unsigned int sync_point; /* Sync point - last entry that must be
+ unsigned int flush_point; /* Sync point - last entry that must be
* flushed to the media. Used with
* REQ_FLUSH and REQ_FUA
*/
@@ -193,7 +193,7 @@ struct pblk_rb {
spinlock_t s_lock; /* Sync lock */
#ifdef CONFIG_NVM_DEBUG
- atomic_t inflight_sync_point; /* Not served REQ_FLUSH | REQ_FUA */
+ atomic_t inflight_flush_point; /* Not served REQ_FLUSH | REQ_FUA */
#endif
};
@@ -256,9 +256,6 @@ struct pblk_rl {
unsigned int high; /* Upper threshold for rate limiter (free run -
* user I/O rate limiter
*/
- unsigned int low; /* Lower threshold for rate limiter (user I/O
- * rate limiter - stall)
- */
unsigned int high_pw; /* High rounded up as a power of 2 */
#define PBLK_USER_HIGH_THRS 8 /* Begin write limit at 12% available blks */
@@ -292,7 +289,9 @@ struct pblk_rl {
unsigned long long nr_secs;
unsigned long total_blocks;
- atomic_t free_blocks;
+
+ atomic_t free_blocks; /* Total number of free blocks (+ OP) */
+ atomic_t free_user_blocks; /* Number of user free blocks (no OP) */
};
#define PBLK_LINE_EMPTY (~0U)
@@ -583,7 +582,9 @@ struct pblk {
*/
sector_t capacity; /* Device capacity when bad blocks are subtracted */
- int over_pct; /* Percentage of device used for over-provisioning */
+
+ int op; /* Percentage of device used for over-provisioning */
+ int op_blks; /* Number of blocks used for over-provisioning */
/* pblk provisioning values. Used by rate limiter */
struct pblk_rl rl;
@@ -691,7 +692,7 @@ unsigned int pblk_rb_sync_advance(struct pblk_rb *rb, unsigned int nr_entries);
struct pblk_rb_entry *pblk_rb_sync_scan_entry(struct pblk_rb *rb,
struct ppa_addr *ppa);
void pblk_rb_sync_end(struct pblk_rb *rb, unsigned long *flags);
-unsigned int pblk_rb_sync_point_count(struct pblk_rb *rb);
+unsigned int pblk_rb_flush_point_count(struct pblk_rb *rb);
unsigned int pblk_rb_read_count(struct pblk_rb *rb);
unsigned int pblk_rb_sync_count(struct pblk_rb *rb);
@@ -812,7 +813,7 @@ int pblk_submit_read_gc(struct pblk *pblk, struct pblk_gc_rq *gc_rq);
void pblk_submit_rec(struct work_struct *work);
struct pblk_line *pblk_recov_l2p(struct pblk *pblk);
int pblk_recov_pad(struct pblk *pblk);
-__le64 *pblk_recov_get_lba_list(struct pblk *pblk, struct line_emeta *emeta);
+int pblk_recov_check_emeta(struct pblk *pblk, struct line_emeta *emeta);
int pblk_recov_setup_rq(struct pblk *pblk, struct pblk_c_ctx *c_ctx,
struct pblk_rec_ctx *recovery, u64 *comp_bits,
unsigned int comp);
@@ -843,6 +844,7 @@ void pblk_rl_free(struct pblk_rl *rl);
void pblk_rl_update_rates(struct pblk_rl *rl);
int pblk_rl_high_thrs(struct pblk_rl *rl);
unsigned long pblk_rl_nr_free_blks(struct pblk_rl *rl);
+unsigned long pblk_rl_nr_user_free_blks(struct pblk_rl *rl);
int pblk_rl_user_may_insert(struct pblk_rl *rl, int nr_entries);
void pblk_rl_inserted(struct pblk_rl *rl, int nr_entries);
void pblk_rl_user_in(struct pblk_rl *rl, int nr_entries);
@@ -851,7 +853,8 @@ void pblk_rl_gc_in(struct pblk_rl *rl, int nr_entries);
void pblk_rl_out(struct pblk_rl *rl, int nr_user, int nr_gc);
int pblk_rl_max_io(struct pblk_rl *rl);
void pblk_rl_free_lines_inc(struct pblk_rl *rl, struct pblk_line *line);
-void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line);
+void pblk_rl_free_lines_dec(struct pblk_rl *rl, struct pblk_line *line,
+ bool used);
int pblk_rl_is_limit(struct pblk_rl *rl);
/*
@@ -907,28 +910,47 @@ static inline int pblk_pad_distance(struct pblk *pblk)
struct nvm_tgt_dev *dev = pblk->dev;
struct nvm_geo *geo = &dev->geo;
- return NVM_MEM_PAGE_WRITE * geo->nr_luns * geo->sec_per_pl;
+ return NVM_MEM_PAGE_WRITE * geo->all_luns * geo->sec_per_pl;
}
-static inline int pblk_dev_ppa_to_line(struct ppa_addr p)
+static inline int pblk_ppa_to_line(struct ppa_addr p)
{
return p.g.blk;
}
-static inline int pblk_tgt_ppa_to_line(struct ppa_addr p)
+static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
{
- return p.g.blk;
+ return p.g.lun * geo->nr_chnls + p.g.ch;
}
-static inline int pblk_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
+static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
+ u64 line_id)
{
- return p.g.lun * geo->nr_chnls + p.g.ch;
+ struct ppa_addr ppa;
+
+ ppa.ppa = 0;
+ ppa.g.blk = line_id;
+ ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
+ ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
+ ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
+ ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
+ ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
+
+ return ppa;
}
-/* A block within a line corresponds to the lun */
-static inline int pblk_dev_ppa_to_pos(struct nvm_geo *geo, struct ppa_addr p)
+static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
+ struct ppa_addr p)
{
- return p.g.lun * geo->nr_chnls + p.g.ch;
+ u64 paddr;
+
+ paddr = (u64)p.g.pg << pblk->ppaf.pg_offset;
+ paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset;
+ paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset;
+ paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset;
+ paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset;
+
+ return paddr;
}
static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32)
@@ -960,24 +982,6 @@ static inline struct ppa_addr pblk_ppa32_to_ppa64(struct pblk *pblk, u32 ppa32)
return ppa64;
}
-static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
- sector_t lba)
-{
- struct ppa_addr ppa;
-
- if (pblk->ppaf_bitsize < 32) {
- u32 *map = (u32 *)pblk->trans_map;
-
- ppa = pblk_ppa32_to_ppa64(pblk, map[lba]);
- } else {
- struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map;
-
- ppa = map[lba];
- }
-
- return ppa;
-}
-
static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64)
{
u32 ppa32 = 0;
@@ -999,33 +1003,36 @@ static inline u32 pblk_ppa64_to_ppa32(struct pblk *pblk, struct ppa_addr ppa64)
return ppa32;
}
-static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
- struct ppa_addr ppa)
+static inline struct ppa_addr pblk_trans_map_get(struct pblk *pblk,
+ sector_t lba)
{
+ struct ppa_addr ppa;
+
if (pblk->ppaf_bitsize < 32) {
u32 *map = (u32 *)pblk->trans_map;
- map[lba] = pblk_ppa64_to_ppa32(pblk, ppa);
+ ppa = pblk_ppa32_to_ppa64(pblk, map[lba]);
} else {
- u64 *map = (u64 *)pblk->trans_map;
+ struct ppa_addr *map = (struct ppa_addr *)pblk->trans_map;
- map[lba] = ppa.ppa;
+ ppa = map[lba];
}
+
+ return ppa;
}
-static inline u64 pblk_dev_ppa_to_line_addr(struct pblk *pblk,
- struct ppa_addr p)
+static inline void pblk_trans_map_set(struct pblk *pblk, sector_t lba,
+ struct ppa_addr ppa)
{
- u64 paddr;
+ if (pblk->ppaf_bitsize < 32) {
+ u32 *map = (u32 *)pblk->trans_map;
- paddr = 0;
- paddr |= (u64)p.g.pg << pblk->ppaf.pg_offset;
- paddr |= (u64)p.g.lun << pblk->ppaf.lun_offset;
- paddr |= (u64)p.g.ch << pblk->ppaf.ch_offset;
- paddr |= (u64)p.g.pl << pblk->ppaf.pln_offset;
- paddr |= (u64)p.g.sec << pblk->ppaf.sec_offset;
+ map[lba] = pblk_ppa64_to_ppa32(pblk, ppa);
+ } else {
+ u64 *map = (u64 *)pblk->trans_map;
- return paddr;
+ map[lba] = ppa.ppa;
+ }
}
static inline int pblk_ppa_empty(struct ppa_addr ppa_addr)
@@ -1040,10 +1047,7 @@ static inline void pblk_ppa_set_empty(struct ppa_addr *ppa_addr)
static inline bool pblk_ppa_comp(struct ppa_addr lppa, struct ppa_addr rppa)
{
- if (lppa.ppa == rppa.ppa)
- return true;
-
- return false;
+ return (lppa.ppa == rppa.ppa);
}
static inline int pblk_addr_in_cache(struct ppa_addr ppa)
@@ -1066,32 +1070,6 @@ static inline struct ppa_addr pblk_cacheline_to_addr(int addr)
return p;
}
-static inline struct ppa_addr addr_to_gen_ppa(struct pblk *pblk, u64 paddr,
- u64 line_id)
-{
- struct ppa_addr ppa;
-
- ppa.ppa = 0;
- ppa.g.blk = line_id;
- ppa.g.pg = (paddr & pblk->ppaf.pg_mask) >> pblk->ppaf.pg_offset;
- ppa.g.lun = (paddr & pblk->ppaf.lun_mask) >> pblk->ppaf.lun_offset;
- ppa.g.ch = (paddr & pblk->ppaf.ch_mask) >> pblk->ppaf.ch_offset;
- ppa.g.pl = (paddr & pblk->ppaf.pln_mask) >> pblk->ppaf.pln_offset;
- ppa.g.sec = (paddr & pblk->ppaf.sec_mask) >> pblk->ppaf.sec_offset;
-
- return ppa;
-}
-
-static inline struct ppa_addr addr_to_pblk_ppa(struct pblk *pblk, u64 paddr,
- u64 line_id)
-{
- struct ppa_addr ppa;
-
- ppa = addr_to_gen_ppa(pblk, paddr, line_id);
-
- return ppa;
-}
-
static inline u32 pblk_calc_meta_header_crc(struct pblk *pblk,
struct line_header *header)
{
@@ -1212,10 +1190,10 @@ static inline int pblk_boundary_ppa_checks(struct nvm_tgt_dev *tgt_dev,
if (!ppa->c.is_cached &&
ppa->g.ch < geo->nr_chnls &&
- ppa->g.lun < geo->luns_per_chnl &&
+ ppa->g.lun < geo->nr_luns &&
ppa->g.pl < geo->nr_planes &&
- ppa->g.blk < geo->blks_per_lun &&
- ppa->g.pg < geo->pgs_per_blk &&
+ ppa->g.blk < geo->nr_chks &&
+ ppa->g.pg < geo->ws_per_chk &&
ppa->g.sec < geo->sec_per_pg)
continue;
@@ -1245,7 +1223,7 @@ static inline int pblk_check_io(struct pblk *pblk, struct nvm_rq *rqd)
for (i = 0; i < rqd->nr_ppas; i++) {
ppa = ppa_list[i];
- line = &pblk->lines[pblk_dev_ppa_to_line(ppa)];
+ line = &pblk->lines[pblk_ppa_to_line(ppa)];
spin_lock(&line->lock);
if (line->state != PBLK_LINESTATE_OPEN) {
@@ -1288,11 +1266,6 @@ static inline unsigned int pblk_get_secs(struct bio *bio)
return bio->bi_iter.bi_size / PBLK_EXPOSED_PAGE_SIZE;
}
-static inline sector_t pblk_get_sector(sector_t lba)
-{
- return lba * NR_PHY_IN_LOG;
-}
-
static inline void pblk_setup_uuid(struct pblk *pblk)
{
uuid_le uuid;
diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c
deleted file mode 100644
index 0993c14be860..000000000000
--- a/drivers/lightnvm/rrpc.c
+++ /dev/null
@@ -1,1625 +0,0 @@
-/*
- * Copyright (C) 2015 IT University of Copenhagen
- * Initial release: Matias Bjorling <m@bjorling.me>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Implementation of a Round-robin page-based Hybrid FTL for Open-channel SSDs.
- */
-
-#include "rrpc.h"
-
-static struct kmem_cache *rrpc_gcb_cache, *rrpc_rq_cache;
-static DECLARE_RWSEM(rrpc_lock);
-
-static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
- struct nvm_rq *rqd, unsigned long flags);
-
-#define rrpc_for_each_lun(rrpc, rlun, i) \
- for ((i) = 0, rlun = &(rrpc)->luns[0]; \
- (i) < (rrpc)->nr_luns; (i)++, rlun = &(rrpc)->luns[(i)])
-
-static void rrpc_page_invalidate(struct rrpc *rrpc, struct rrpc_addr *a)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
- struct rrpc_block *rblk = a->rblk;
- unsigned int pg_offset;
-
- lockdep_assert_held(&rrpc->rev_lock);
-
- if (a->addr == ADDR_EMPTY || !rblk)
- return;
-
- spin_lock(&rblk->lock);
-
- div_u64_rem(a->addr, dev->geo.sec_per_blk, &pg_offset);
- WARN_ON(test_and_set_bit(pg_offset, rblk->invalid_pages));
- rblk->nr_invalid_pages++;
-
- spin_unlock(&rblk->lock);
-
- rrpc->rev_trans_map[a->addr].addr = ADDR_EMPTY;
-}
-
-static void rrpc_invalidate_range(struct rrpc *rrpc, sector_t slba,
- unsigned int len)
-{
- sector_t i;
-
- spin_lock(&rrpc->rev_lock);
- for (i = slba; i < slba + len; i++) {
- struct rrpc_addr *gp = &rrpc->trans_map[i];
-
- rrpc_page_invalidate(rrpc, gp);
- gp->rblk = NULL;
- }
- spin_unlock(&rrpc->rev_lock);
-}
-
-static struct nvm_rq *rrpc_inflight_laddr_acquire(struct rrpc *rrpc,
- sector_t laddr, unsigned int pages)
-{
- struct nvm_rq *rqd;
- struct rrpc_inflight_rq *inf;
-
- rqd = mempool_alloc(rrpc->rq_pool, GFP_ATOMIC);
- if (!rqd)
- return ERR_PTR(-ENOMEM);
-
- inf = rrpc_get_inflight_rq(rqd);
- if (rrpc_lock_laddr(rrpc, laddr, pages, inf)) {
- mempool_free(rqd, rrpc->rq_pool);
- return NULL;
- }
-
- return rqd;
-}
-
-static void rrpc_inflight_laddr_release(struct rrpc *rrpc, struct nvm_rq *rqd)
-{
- struct rrpc_inflight_rq *inf = rrpc_get_inflight_rq(rqd);
-
- rrpc_unlock_laddr(rrpc, inf);
-
- mempool_free(rqd, rrpc->rq_pool);
-}
-
-static void rrpc_discard(struct rrpc *rrpc, struct bio *bio)
-{
- sector_t slba = bio->bi_iter.bi_sector / NR_PHY_IN_LOG;
- sector_t len = bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE;
- struct nvm_rq *rqd;
-
- while (1) {
- rqd = rrpc_inflight_laddr_acquire(rrpc, slba, len);
- if (rqd)
- break;
-
- schedule();
- }
-
- if (IS_ERR(rqd)) {
- pr_err("rrpc: unable to acquire inflight IO\n");
- bio_io_error(bio);
- return;
- }
-
- rrpc_invalidate_range(rrpc, slba, len);
- rrpc_inflight_laddr_release(rrpc, rqd);
-}
-
-static int block_is_full(struct rrpc *rrpc, struct rrpc_block *rblk)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
-
- return (rblk->next_page == dev->geo.sec_per_blk);
-}
-
-/* Calculate relative addr for the given block, considering instantiated LUNs */
-static u64 block_to_rel_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
- struct rrpc_lun *rlun = rblk->rlun;
-
- return rlun->id * dev->geo.sec_per_blk;
-}
-
-static struct ppa_addr rrpc_ppa_to_gaddr(struct nvm_tgt_dev *dev,
- struct rrpc_addr *gp)
-{
- struct rrpc_block *rblk = gp->rblk;
- struct rrpc_lun *rlun = rblk->rlun;
- u64 addr = gp->addr;
- struct ppa_addr paddr;
-
- paddr.ppa = addr;
- paddr = rrpc_linear_to_generic_addr(&dev->geo, paddr);
- paddr.g.ch = rlun->bppa.g.ch;
- paddr.g.lun = rlun->bppa.g.lun;
- paddr.g.blk = rblk->id;
-
- return paddr;
-}
-
-/* requires lun->lock taken */
-static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *new_rblk,
- struct rrpc_block **cur_rblk)
-{
- struct rrpc *rrpc = rlun->rrpc;
-
- if (*cur_rblk) {
- spin_lock(&(*cur_rblk)->lock);
- WARN_ON(!block_is_full(rrpc, *cur_rblk));
- spin_unlock(&(*cur_rblk)->lock);
- }
- *cur_rblk = new_rblk;
-}
-
-static struct rrpc_block *__rrpc_get_blk(struct rrpc *rrpc,
- struct rrpc_lun *rlun)
-{
- struct rrpc_block *rblk = NULL;
-
- if (list_empty(&rlun->free_list))
- goto out;
-
- rblk = list_first_entry(&rlun->free_list, struct rrpc_block, list);
-
- list_move_tail(&rblk->list, &rlun->used_list);
- rblk->state = NVM_BLK_ST_TGT;
- rlun->nr_free_blocks--;
-
-out:
- return rblk;
-}
-
-static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun,
- unsigned long flags)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
- struct rrpc_block *rblk;
- int is_gc = flags & NVM_IOTYPE_GC;
-
- spin_lock(&rlun->lock);
- if (!is_gc && rlun->nr_free_blocks < rlun->reserved_blocks) {
- pr_err("nvm: rrpc: cannot give block to non GC request\n");
- spin_unlock(&rlun->lock);
- return NULL;
- }
-
- rblk = __rrpc_get_blk(rrpc, rlun);
- if (!rblk) {
- pr_err("nvm: rrpc: cannot get new block\n");
- spin_unlock(&rlun->lock);
- return NULL;
- }
- spin_unlock(&rlun->lock);
-
- bitmap_zero(rblk->invalid_pages, dev->geo.sec_per_blk);
- rblk->next_page = 0;
- rblk->nr_invalid_pages = 0;
- atomic_set(&rblk->data_cmnt_size, 0);
-
- return rblk;
-}
-
-static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk)
-{
- struct rrpc_lun *rlun = rblk->rlun;
-
- spin_lock(&rlun->lock);
- if (rblk->state & NVM_BLK_ST_TGT) {
- list_move_tail(&rblk->list, &rlun->free_list);
- rlun->nr_free_blocks++;
- rblk->state = NVM_BLK_ST_FREE;
- } else if (rblk->state & NVM_BLK_ST_BAD) {
- list_move_tail(&rblk->list, &rlun->bb_list);
- rblk->state = NVM_BLK_ST_BAD;
- } else {
- WARN_ON_ONCE(1);
- pr_err("rrpc: erroneous type (ch:%d,lun:%d,blk%d-> %u)\n",
- rlun->bppa.g.ch, rlun->bppa.g.lun,
- rblk->id, rblk->state);
- list_move_tail(&rblk->list, &rlun->bb_list);
- }
- spin_unlock(&rlun->lock);
-}
-
-static void rrpc_put_blks(struct rrpc *rrpc)
-{
- struct rrpc_lun *rlun;
- int i;
-
- for (i = 0; i < rrpc->nr_luns; i++) {
- rlun = &rrpc->luns[i];
- if (rlun->cur)
- rrpc_put_blk(rrpc, rlun->cur);
- if (rlun->gc_cur)
- rrpc_put_blk(rrpc, rlun->gc_cur);
- }
-}
-
-static struct rrpc_lun *get_next_lun(struct rrpc *rrpc)
-{
- int next = atomic_inc_return(&rrpc->next_lun);
-
- return &rrpc->luns[next % rrpc->nr_luns];
-}
-
-static void rrpc_gc_kick(struct rrpc *rrpc)
-{
- struct rrpc_lun *rlun;
- unsigned int i;
-
- for (i = 0; i < rrpc->nr_luns; i++) {
- rlun = &rrpc->luns[i];
- queue_work(rrpc->krqd_wq, &rlun->ws_gc);
- }
-}
-
-/*
- * timed GC every interval.
- */
-static void rrpc_gc_timer(struct timer_list *t)
-{
- struct rrpc *rrpc = from_timer(rrpc, t, gc_timer);
-
- rrpc_gc_kick(rrpc);
- mod_timer(&rrpc->gc_timer, jiffies + msecs_to_jiffies(10));
-}
-
-static void rrpc_end_sync_bio(struct bio *bio)
-{
- struct completion *waiting = bio->bi_private;
-
- if (bio->bi_status)
- pr_err("nvm: gc request failed (%u).\n", bio->bi_status);
-
- complete(waiting);
-}
-
-/*
- * rrpc_move_valid_pages -- migrate live data off the block
- * @rrpc: the 'rrpc' structure
- * @block: the block from which to migrate live pages
- *
- * Description:
- * GC algorithms may call this function to migrate remaining live
- * pages off the block prior to erasing it. This function blocks
- * further execution until the operation is complete.
- */
-static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
- struct request_queue *q = dev->q;
- struct rrpc_rev_addr *rev;
- struct nvm_rq *rqd;
- struct bio *bio;
- struct page *page;
- int slot;
- int nr_sec_per_blk = dev->geo.sec_per_blk;
- u64 phys_addr;
- DECLARE_COMPLETION_ONSTACK(wait);
-
- if (bitmap_full(rblk->invalid_pages, nr_sec_per_blk))
- return 0;
-
- bio = bio_alloc(GFP_NOIO, 1);
- if (!bio) {
- pr_err("nvm: could not alloc bio to gc\n");
- return -ENOMEM;
- }
-
- page = mempool_alloc(rrpc->page_pool, GFP_NOIO);
-
- while ((slot = find_first_zero_bit(rblk->invalid_pages,
- nr_sec_per_blk)) < nr_sec_per_blk) {
-
- /* Lock laddr */
- phys_addr = rrpc_blk_to_ppa(rrpc, rblk) + slot;
-
-try:
- spin_lock(&rrpc->rev_lock);
- /* Get logical address from physical to logical table */
- rev = &rrpc->rev_trans_map[phys_addr];
- /* already updated by previous regular write */
- if (rev->addr == ADDR_EMPTY) {
- spin_unlock(&rrpc->rev_lock);
- continue;
- }
-
- rqd = rrpc_inflight_laddr_acquire(rrpc, rev->addr, 1);
- if (IS_ERR_OR_NULL(rqd)) {
- spin_unlock(&rrpc->rev_lock);
- schedule();
- goto try;
- }
-
- spin_unlock(&rrpc->rev_lock);
-
- /* Perform read to do GC */
- bio->bi_iter.bi_sector = rrpc_get_sector(rev->addr);
- bio_set_op_attrs(bio, REQ_OP_READ, 0);
- bio->bi_private = &wait;
- bio->bi_end_io = rrpc_end_sync_bio;
-
- /* TODO: may fail when EXP_PG_SIZE > PAGE_SIZE */
- bio_add_pc_page(q, bio, page, RRPC_EXPOSED_PAGE_SIZE, 0);
-
- if (rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_GC)) {
- pr_err("rrpc: gc read failed.\n");
- rrpc_inflight_laddr_release(rrpc, rqd);
- goto finished;
- }
- wait_for_completion_io(&wait);
- if (bio->bi_status) {
- rrpc_inflight_laddr_release(rrpc, rqd);
- goto finished;
- }
-
- bio_reset(bio);
- reinit_completion(&wait);
-
- bio->bi_iter.bi_sector = rrpc_get_sector(rev->addr);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
- bio->bi_private = &wait;
- bio->bi_end_io = rrpc_end_sync_bio;
-
- bio_add_pc_page(q, bio, page, RRPC_EXPOSED_PAGE_SIZE, 0);
-
- /* turn the command around and write the data back to a new
- * address
- */
- if (rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_GC)) {
- pr_err("rrpc: gc write failed.\n");
- rrpc_inflight_laddr_release(rrpc, rqd);
- goto finished;
- }
- wait_for_completion_io(&wait);
-
- rrpc_inflight_laddr_release(rrpc, rqd);
- if (bio->bi_status)
- goto finished;
-
- bio_reset(bio);
- }
-
-finished:
- mempool_free(page, rrpc->page_pool);
- bio_put(bio);
-
- if (!bitmap_full(rblk->invalid_pages, nr_sec_per_blk)) {
- pr_err("nvm: failed to garbage collect block\n");
- return -EIO;
- }
-
- return 0;
-}
-
-static void rrpc_block_gc(struct work_struct *work)
-{
- struct rrpc_block_gc *gcb = container_of(work, struct rrpc_block_gc,
- ws_gc);
- struct rrpc *rrpc = gcb->rrpc;
- struct rrpc_block *rblk = gcb->rblk;
- struct rrpc_lun *rlun = rblk->rlun;
- struct ppa_addr ppa;
-
- mempool_free(gcb, rrpc->gcb_pool);
- pr_debug("nvm: block 'ch:%d,lun:%d,blk:%d' being reclaimed\n",
- rlun->bppa.g.ch, rlun->bppa.g.lun,
- rblk->id);
-
- if (rrpc_move_valid_pages(rrpc, rblk))
- goto put_back;
-
- ppa.ppa = 0;
- ppa.g.ch = rlun->bppa.g.ch;
- ppa.g.lun = rlun->bppa.g.lun;
- ppa.g.blk = rblk->id;
-
- if (nvm_erase_sync(rrpc->dev, &ppa, 1))
- goto put_back;
-
- rrpc_put_blk(rrpc, rblk);
-
- return;
-
-put_back:
- spin_lock(&rlun->lock);
- list_add_tail(&rblk->prio, &rlun->prio_list);
- spin_unlock(&rlun->lock);
-}
-
-/* the block with highest number of invalid pages, will be in the beginning
- * of the list
- */
-static struct rrpc_block *rblk_max_invalid(struct rrpc_block *ra,
- struct rrpc_block *rb)
-{
- if (ra->nr_invalid_pages == rb->nr_invalid_pages)
- return ra;
-
- return (ra->nr_invalid_pages < rb->nr_invalid_pages) ? rb : ra;
-}
-
-/* linearly find the block with highest number of invalid pages
- * requires lun->lock
- */
-static struct rrpc_block *block_prio_find_max(struct rrpc_lun *rlun)
-{
- struct list_head *prio_list = &rlun->prio_list;
- struct rrpc_block *rblk, *max;
-
- BUG_ON(list_empty(prio_list));
-
- max = list_first_entry(prio_list, struct rrpc_block, prio);
- list_for_each_entry(rblk, prio_list, prio)
- max = rblk_max_invalid(max, rblk);
-
- return max;
-}
-
-static void rrpc_lun_gc(struct work_struct *work)
-{
- struct rrpc_lun *rlun = container_of(work, struct rrpc_lun, ws_gc);
- struct rrpc *rrpc = rlun->rrpc;
- struct nvm_tgt_dev *dev = rrpc->dev;
- struct rrpc_block_gc *gcb;
- unsigned int nr_blocks_need;
-
- nr_blocks_need = dev->geo.blks_per_lun / GC_LIMIT_INVERSE;
-
- if (nr_blocks_need < rrpc->nr_luns)
- nr_blocks_need = rrpc->nr_luns;
-
- spin_lock(&rlun->lock);
- while (nr_blocks_need > rlun->nr_free_blocks &&
- !list_empty(&rlun->prio_list)) {
- struct rrpc_block *rblk = block_prio_find_max(rlun);
-
- if (!rblk->nr_invalid_pages)
- break;
-
- gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC);
- if (!gcb)
- break;
-
- list_del_init(&rblk->prio);
-
- WARN_ON(!block_is_full(rrpc, rblk));
-
- pr_debug("rrpc: selected block 'ch:%d,lun:%d,blk:%d' for GC\n",
- rlun->bppa.g.ch, rlun->bppa.g.lun,
- rblk->id);
-
- gcb->rrpc = rrpc;
- gcb->rblk = rblk;
- INIT_WORK(&gcb->ws_gc, rrpc_block_gc);
-
- queue_work(rrpc->kgc_wq, &gcb->ws_gc);
-
- nr_blocks_need--;
- }
- spin_unlock(&rlun->lock);
-
- /* TODO: Hint that request queue can be started again */
-}
-
-static void rrpc_gc_queue(struct work_struct *work)
-{
- struct rrpc_block_gc *gcb = container_of(work, struct rrpc_block_gc,
- ws_gc);
- struct rrpc *rrpc = gcb->rrpc;
- struct rrpc_block *rblk = gcb->rblk;
- struct rrpc_lun *rlun = rblk->rlun;
-
- spin_lock(&rlun->lock);
- list_add_tail(&rblk->prio, &rlun->prio_list);
- spin_unlock(&rlun->lock);
-
- mempool_free(gcb, rrpc->gcb_pool);
- pr_debug("nvm: block 'ch:%d,lun:%d,blk:%d' full, allow GC (sched)\n",
- rlun->bppa.g.ch, rlun->bppa.g.lun,
- rblk->id);
-}
-
-static const struct block_device_operations rrpc_fops = {
- .owner = THIS_MODULE,
-};
-
-static struct rrpc_lun *rrpc_get_lun_rr(struct rrpc *rrpc, int is_gc)
-{
- unsigned int i;
- struct rrpc_lun *rlun, *max_free;
-
- if (!is_gc)
- return get_next_lun(rrpc);
-
- /* during GC, we don't care about RR, instead we want to make
- * sure that we maintain evenness between the block luns.
- */
- max_free = &rrpc->luns[0];
- /* prevent GC-ing lun from devouring pages of a lun with
- * little free blocks. We don't take the lock as we only need an
- * estimate.
- */
- rrpc_for_each_lun(rrpc, rlun, i) {
- if (rlun->nr_free_blocks > max_free->nr_free_blocks)
- max_free = rlun;
- }
-
- return max_free;
-}
-
-static struct rrpc_addr *rrpc_update_map(struct rrpc *rrpc, sector_t laddr,
- struct rrpc_block *rblk, u64 paddr)
-{
- struct rrpc_addr *gp;
- struct rrpc_rev_addr *rev;
-
- BUG_ON(laddr >= rrpc->nr_sects);
-
- gp = &rrpc->trans_map[laddr];
- spin_lock(&rrpc->rev_lock);
- if (gp->rblk)
- rrpc_page_invalidate(rrpc, gp);
-
- gp->addr = paddr;
- gp->rblk = rblk;
-
- rev = &rrpc->rev_trans_map[gp->addr];
- rev->addr = laddr;
- spin_unlock(&rrpc->rev_lock);
-
- return gp;
-}
-
-static u64 rrpc_alloc_addr(struct rrpc *rrpc, struct rrpc_block *rblk)
-{
- u64 addr = ADDR_EMPTY;
-
- spin_lock(&rblk->lock);
- if (block_is_full(rrpc, rblk))
- goto out;
-
- addr = rblk->next_page;
-
- rblk->next_page++;
-out:
- spin_unlock(&rblk->lock);
- return addr;
-}
-
-/* Map logical address to a physical page. The mapping implements a round robin
- * approach and allocates a page from the next lun available.
- *
- * Returns rrpc_addr with the physical address and block. Returns NULL if no
- * blocks in the next rlun are available.
- */
-static struct ppa_addr rrpc_map_page(struct rrpc *rrpc, sector_t laddr,
- int is_gc)
-{
- struct nvm_tgt_dev *tgt_dev = rrpc->dev;
- struct rrpc_lun *rlun;
- struct rrpc_block *rblk, **cur_rblk;
- struct rrpc_addr *p;
- struct ppa_addr ppa;
- u64 paddr;
- int gc_force = 0;
-
- ppa.ppa = ADDR_EMPTY;
- rlun = rrpc_get_lun_rr(rrpc, is_gc);
-
- if (!is_gc && rlun->nr_free_blocks < rrpc->nr_luns * 4)
- return ppa;
-
- /*
- * page allocation steps:
- * 1. Try to allocate new page from current rblk
- * 2a. If succeed, proceed to map it in and return
- * 2b. If fail, first try to allocate a new block from media manger,
- * and then retry step 1. Retry until the normal block pool is
- * exhausted.
- * 3. If exhausted, and garbage collector is requesting the block,
- * go to the reserved block and retry step 1.
- * In the case that this fails as well, or it is not GC
- * requesting, report not able to retrieve a block and let the
- * caller handle further processing.
- */
-
- spin_lock(&rlun->lock);
- cur_rblk = &rlun->cur;
- rblk = rlun->cur;
-retry:
- paddr = rrpc_alloc_addr(rrpc, rblk);
-
- if (paddr != ADDR_EMPTY)
- goto done;
-
- if (!list_empty(&rlun->wblk_list)) {
-new_blk:
- rblk = list_first_entry(&rlun->wblk_list, struct rrpc_block,
- prio);
- rrpc_set_lun_cur(rlun, rblk, cur_rblk);
- list_del(&rblk->prio);
- goto retry;
- }
- spin_unlock(&rlun->lock);
-
- rblk = rrpc_get_blk(rrpc, rlun, gc_force);
- if (rblk) {
- spin_lock(&rlun->lock);
- list_add_tail(&rblk->prio, &rlun->wblk_list);
- /*
- * another thread might already have added a new block,
- * Therefore, make sure that one is used, instead of the
- * one just added.
- */
- goto new_blk;
- }
-
- if (unlikely(is_gc) && !gc_force) {
- /* retry from emergency gc block */
- cur_rblk = &rlun->gc_cur;
- rblk = rlun->gc_cur;
- gc_force = 1;
- spin_lock(&rlun->lock);
- goto retry;
- }
-
- pr_err("rrpc: failed to allocate new block\n");
- return ppa;
-done:
- spin_unlock(&rlun->lock);
- p = rrpc_update_map(rrpc, laddr, rblk, paddr);
- if (!p)
- return ppa;
-
- /* return global address */
- return rrpc_ppa_to_gaddr(tgt_dev, p);
-}
-
-static void rrpc_run_gc(struct rrpc *rrpc, struct rrpc_block *rblk)
-{
- struct rrpc_block_gc *gcb;
-
- gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC);
- if (!gcb) {
- pr_err("rrpc: unable to queue block for gc.");
- return;
- }
-
- gcb->rrpc = rrpc;
- gcb->rblk = rblk;
-
- INIT_WORK(&gcb->ws_gc, rrpc_gc_queue);
- queue_work(rrpc->kgc_wq, &gcb->ws_gc);
-}
-
-static struct rrpc_lun *rrpc_ppa_to_lun(struct rrpc *rrpc, struct ppa_addr p)
-{
- struct rrpc_lun *rlun = NULL;
- int i;
-
- for (i = 0; i < rrpc->nr_luns; i++) {
- if (rrpc->luns[i].bppa.g.ch == p.g.ch &&
- rrpc->luns[i].bppa.g.lun == p.g.lun) {
- rlun = &rrpc->luns[i];
- break;
- }
- }
-
- return rlun;
-}
-
-static void __rrpc_mark_bad_block(struct rrpc *rrpc, struct ppa_addr ppa)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
- struct rrpc_lun *rlun;
- struct rrpc_block *rblk;
-
- rlun = rrpc_ppa_to_lun(rrpc, ppa);
- rblk = &rlun->blocks[ppa.g.blk];
- rblk->state = NVM_BLK_ST_BAD;
-
- nvm_set_tgt_bb_tbl(dev, &ppa, 1, NVM_BLK_T_GRWN_BAD);
-}
-
-static void rrpc_mark_bad_block(struct rrpc *rrpc, struct nvm_rq *rqd)
-{
- void *comp_bits = &rqd->ppa_status;
- struct ppa_addr ppa, prev_ppa;
- int nr_ppas = rqd->nr_ppas;
- int bit;
-
- if (rqd->nr_ppas == 1)
- __rrpc_mark_bad_block(rrpc, rqd->ppa_addr);
-
- ppa_set_empty(&prev_ppa);
- bit = -1;
- while ((bit = find_next_bit(comp_bits, nr_ppas, bit + 1)) < nr_ppas) {
- ppa = rqd->ppa_list[bit];
- if (ppa_cmp_blk(ppa, prev_ppa))
- continue;
-
- __rrpc_mark_bad_block(rrpc, ppa);
- }
-}
-
-static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd,
- sector_t laddr, uint8_t npages)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
- struct rrpc_addr *p;
- struct rrpc_block *rblk;
- int cmnt_size, i;
-
- for (i = 0; i < npages; i++) {
- p = &rrpc->trans_map[laddr + i];
- rblk = p->rblk;
-
- cmnt_size = atomic_inc_return(&rblk->data_cmnt_size);
- if (unlikely(cmnt_size == dev->geo.sec_per_blk))
- rrpc_run_gc(rrpc, rblk);
- }
-}
-
-static void rrpc_end_io(struct nvm_rq *rqd)
-{
- struct rrpc *rrpc = rqd->private;
- struct nvm_tgt_dev *dev = rrpc->dev;
- struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
- uint8_t npages = rqd->nr_ppas;
- sector_t laddr = rrpc_get_laddr(rqd->bio) - npages;
-
- if (bio_data_dir(rqd->bio) == WRITE) {
- if (rqd->error == NVM_RSP_ERR_FAILWRITE)
- rrpc_mark_bad_block(rrpc, rqd);
-
- rrpc_end_io_write(rrpc, rrqd, laddr, npages);
- }
-
- bio_put(rqd->bio);
-
- if (rrqd->flags & NVM_IOTYPE_GC)
- return;
-
- rrpc_unlock_rq(rrpc, rqd);
-
- if (npages > 1)
- nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
-
- mempool_free(rqd, rrpc->rq_pool);
-}
-
-static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
- struct nvm_rq *rqd, unsigned long flags, int npages)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
- struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
- struct rrpc_addr *gp;
- sector_t laddr = rrpc_get_laddr(bio);
- int is_gc = flags & NVM_IOTYPE_GC;
- int i;
-
- if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) {
- nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
- return NVM_IO_REQUEUE;
- }
-
- for (i = 0; i < npages; i++) {
- /* We assume that mapping occurs at 4KB granularity */
- BUG_ON(!(laddr + i < rrpc->nr_sects));
- gp = &rrpc->trans_map[laddr + i];
-
- if (gp->rblk) {
- rqd->ppa_list[i] = rrpc_ppa_to_gaddr(dev, gp);
- } else {
- BUG_ON(is_gc);
- rrpc_unlock_laddr(rrpc, r);
- nvm_dev_dma_free(dev->parent, rqd->ppa_list,
- rqd->dma_ppa_list);
- return NVM_IO_DONE;
- }
- }
-
- rqd->opcode = NVM_OP_HBREAD;
-
- return NVM_IO_OK;
-}
-
-static int rrpc_read_rq(struct rrpc *rrpc, struct bio *bio, struct nvm_rq *rqd,
- unsigned long flags)
-{
- int is_gc = flags & NVM_IOTYPE_GC;
- sector_t laddr = rrpc_get_laddr(bio);
- struct rrpc_addr *gp;
-
- if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd))
- return NVM_IO_REQUEUE;
-
- BUG_ON(!(laddr < rrpc->nr_sects));
- gp = &rrpc->trans_map[laddr];
-
- if (gp->rblk) {
- rqd->ppa_addr = rrpc_ppa_to_gaddr(rrpc->dev, gp);
- } else {
- BUG_ON(is_gc);
- rrpc_unlock_rq(rrpc, rqd);
- return NVM_IO_DONE;
- }
-
- rqd->opcode = NVM_OP_HBREAD;
-
- return NVM_IO_OK;
-}
-
-static int rrpc_write_ppalist_rq(struct rrpc *rrpc, struct bio *bio,
- struct nvm_rq *rqd, unsigned long flags, int npages)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
- struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
- struct ppa_addr p;
- sector_t laddr = rrpc_get_laddr(bio);
- int is_gc = flags & NVM_IOTYPE_GC;
- int i;
-
- if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd)) {
- nvm_dev_dma_free(dev->parent, rqd->ppa_list, rqd->dma_ppa_list);
- return NVM_IO_REQUEUE;
- }
-
- for (i = 0; i < npages; i++) {
- /* We assume that mapping occurs at 4KB granularity */
- p = rrpc_map_page(rrpc, laddr + i, is_gc);
- if (p.ppa == ADDR_EMPTY) {
- BUG_ON(is_gc);
- rrpc_unlock_laddr(rrpc, r);
- nvm_dev_dma_free(dev->parent, rqd->ppa_list,
- rqd->dma_ppa_list);
- rrpc_gc_kick(rrpc);
- return NVM_IO_REQUEUE;
- }
-
- rqd->ppa_list[i] = p;
- }
-
- rqd->opcode = NVM_OP_HBWRITE;
-
- return NVM_IO_OK;
-}
-
-static int rrpc_write_rq(struct rrpc *rrpc, struct bio *bio,
- struct nvm_rq *rqd, unsigned long flags)
-{
- struct ppa_addr p;
- int is_gc = flags & NVM_IOTYPE_GC;
- sector_t laddr = rrpc_get_laddr(bio);
-
- if (!is_gc && rrpc_lock_rq(rrpc, bio, rqd))
- return NVM_IO_REQUEUE;
-
- p = rrpc_map_page(rrpc, laddr, is_gc);
- if (p.ppa == ADDR_EMPTY) {
- BUG_ON(is_gc);
- rrpc_unlock_rq(rrpc, rqd);
- rrpc_gc_kick(rrpc);
- return NVM_IO_REQUEUE;
- }
-
- rqd->ppa_addr = p;
- rqd->opcode = NVM_OP_HBWRITE;
-
- return NVM_IO_OK;
-}
-
-static int rrpc_setup_rq(struct rrpc *rrpc, struct bio *bio,
- struct nvm_rq *rqd, unsigned long flags, uint8_t npages)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
-
- if (npages > 1) {
- rqd->ppa_list = nvm_dev_dma_alloc(dev->parent, GFP_KERNEL,
- &rqd->dma_ppa_list);
- if (!rqd->ppa_list) {
- pr_err("rrpc: not able to allocate ppa list\n");
- return NVM_IO_ERR;
- }
-
- if (bio_op(bio) == REQ_OP_WRITE)
- return rrpc_write_ppalist_rq(rrpc, bio, rqd, flags,
- npages);
-
- return rrpc_read_ppalist_rq(rrpc, bio, rqd, flags, npages);
- }
-
- if (bio_op(bio) == REQ_OP_WRITE)
- return rrpc_write_rq(rrpc, bio, rqd, flags);
-
- return rrpc_read_rq(rrpc, bio, rqd, flags);
-}
-
-static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio,
- struct nvm_rq *rqd, unsigned long flags)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
- struct rrpc_rq *rrq = nvm_rq_to_pdu(rqd);
- uint8_t nr_pages = rrpc_get_pages(bio);
- int bio_size = bio_sectors(bio) << 9;
- int err;
-
- if (bio_size < dev->geo.sec_size)
- return NVM_IO_ERR;
- else if (bio_size > dev->geo.max_rq_size)
- return NVM_IO_ERR;
-
- err = rrpc_setup_rq(rrpc, bio, rqd, flags, nr_pages);
- if (err)
- return err;
-
- bio_get(bio);
- rqd->bio = bio;
- rqd->private = rrpc;
- rqd->nr_ppas = nr_pages;
- rqd->end_io = rrpc_end_io;
- rrq->flags = flags;
-
- err = nvm_submit_io(dev, rqd);
- if (err) {
- pr_err("rrpc: I/O submission failed: %d\n", err);
- bio_put(bio);
- if (!(flags & NVM_IOTYPE_GC)) {
- rrpc_unlock_rq(rrpc, rqd);
- if (rqd->nr_ppas > 1)
- nvm_dev_dma_free(dev->parent, rqd->ppa_list,
- rqd->dma_ppa_list);
- }
- return NVM_IO_ERR;
- }
-
- return NVM_IO_OK;
-}
-
-static blk_qc_t rrpc_make_rq(struct request_queue *q, struct bio *bio)
-{
- struct rrpc *rrpc = q->queuedata;
- struct nvm_rq *rqd;
- int err;
-
- blk_queue_split(q, &bio);
-
- if (bio_op(bio) == REQ_OP_DISCARD) {
- rrpc_discard(rrpc, bio);
- return BLK_QC_T_NONE;
- }
-
- rqd = mempool_alloc(rrpc->rq_pool, GFP_KERNEL);
- memset(rqd, 0, sizeof(struct nvm_rq));
-
- err = rrpc_submit_io(rrpc, bio, rqd, NVM_IOTYPE_NONE);
- switch (err) {
- case NVM_IO_OK:
- return BLK_QC_T_NONE;
- case NVM_IO_ERR:
- bio_io_error(bio);
- break;
- case NVM_IO_DONE:
- bio_endio(bio);
- break;
- case NVM_IO_REQUEUE:
- spin_lock(&rrpc->bio_lock);
- bio_list_add(&rrpc->requeue_bios, bio);
- spin_unlock(&rrpc->bio_lock);
- queue_work(rrpc->kgc_wq, &rrpc->ws_requeue);
- break;
- }
-
- mempool_free(rqd, rrpc->rq_pool);
- return BLK_QC_T_NONE;
-}
-
-static void rrpc_requeue(struct work_struct *work)
-{
- struct rrpc *rrpc = container_of(work, struct rrpc, ws_requeue);
- struct bio_list bios;
- struct bio *bio;
-
- bio_list_init(&bios);
-
- spin_lock(&rrpc->bio_lock);
- bio_list_merge(&bios, &rrpc->requeue_bios);
- bio_list_init(&rrpc->requeue_bios);
- spin_unlock(&rrpc->bio_lock);
-
- while ((bio = bio_list_pop(&bios)))
- rrpc_make_rq(rrpc->disk->queue, bio);
-}
-
-static void rrpc_gc_free(struct rrpc *rrpc)
-{
- if (rrpc->krqd_wq)
- destroy_workqueue(rrpc->krqd_wq);
-
- if (rrpc->kgc_wq)
- destroy_workqueue(rrpc->kgc_wq);
-}
-
-static int rrpc_gc_init(struct rrpc *rrpc)
-{
- rrpc->krqd_wq = alloc_workqueue("rrpc-lun", WQ_MEM_RECLAIM|WQ_UNBOUND,
- rrpc->nr_luns);
- if (!rrpc->krqd_wq)
- return -ENOMEM;
-
- rrpc->kgc_wq = alloc_workqueue("rrpc-bg", WQ_MEM_RECLAIM, 1);
- if (!rrpc->kgc_wq)
- return -ENOMEM;
-
- timer_setup(&rrpc->gc_timer, rrpc_gc_timer, 0);
-
- return 0;
-}
-
-static void rrpc_map_free(struct rrpc *rrpc)
-{
- vfree(rrpc->rev_trans_map);
- vfree(rrpc->trans_map);
-}
-
-static int rrpc_l2p_update(u64 slba, u32 nlb, __le64 *entries, void *private)
-{
- struct rrpc *rrpc = (struct rrpc *)private;
- struct nvm_tgt_dev *dev = rrpc->dev;
- struct rrpc_addr *addr = rrpc->trans_map + slba;
- struct rrpc_rev_addr *raddr = rrpc->rev_trans_map;
- struct rrpc_lun *rlun;
- struct rrpc_block *rblk;
- u64 i;
-
- for (i = 0; i < nlb; i++) {
- struct ppa_addr gaddr;
- u64 pba = le64_to_cpu(entries[i]);
- unsigned int mod;
-
- /* LNVM treats address-spaces as silos, LBA and PBA are
- * equally large and zero-indexed.
- */
- if (unlikely(pba >= dev->total_secs && pba != U64_MAX)) {
- pr_err("nvm: L2P data entry is out of bounds!\n");
- pr_err("nvm: Maybe loaded an old target L2P\n");
- return -EINVAL;
- }
-
- /* Address zero is a special one. The first page on a disk is
- * protected. As it often holds internal device boot
- * information.
- */
- if (!pba)
- continue;
-
- div_u64_rem(pba, rrpc->nr_sects, &mod);
-
- gaddr = rrpc_recov_addr(dev, pba);
- rlun = rrpc_ppa_to_lun(rrpc, gaddr);
- if (!rlun) {
- pr_err("rrpc: l2p corruption on lba %llu\n",
- slba + i);
- return -EINVAL;
- }
-
- rblk = &rlun->blocks[gaddr.g.blk];
- if (!rblk->state) {
- /* at this point, we don't know anything about the
- * block. It's up to the FTL on top to re-etablish the
- * block state. The block is assumed to be open.
- */
- list_move_tail(&rblk->list, &rlun->used_list);
- rblk->state = NVM_BLK_ST_TGT;
- rlun->nr_free_blocks--;
- }
-
- addr[i].addr = pba;
- addr[i].rblk = rblk;
- raddr[mod].addr = slba + i;
- }
-
- return 0;
-}
-
-static int rrpc_map_init(struct rrpc *rrpc)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
- sector_t i;
- int ret;
-
- rrpc->trans_map = vzalloc(sizeof(struct rrpc_addr) * rrpc->nr_sects);
- if (!rrpc->trans_map)
- return -ENOMEM;
-
- rrpc->rev_trans_map = vmalloc(sizeof(struct rrpc_rev_addr)
- * rrpc->nr_sects);
- if (!rrpc->rev_trans_map)
- return -ENOMEM;
-
- for (i = 0; i < rrpc->nr_sects; i++) {
- struct rrpc_addr *p = &rrpc->trans_map[i];
- struct rrpc_rev_addr *r = &rrpc->rev_trans_map[i];
-
- p->addr = ADDR_EMPTY;
- r->addr = ADDR_EMPTY;
- }
-
- /* Bring up the mapping table from device */
- ret = nvm_get_l2p_tbl(dev, rrpc->soffset, rrpc->nr_sects,
- rrpc_l2p_update, rrpc);
- if (ret) {
- pr_err("nvm: rrpc: could not read L2P table.\n");
- return -EINVAL;
- }
-
- return 0;
-}
-
-/* Minimum pages needed within a lun */
-#define PAGE_POOL_SIZE 16
-#define ADDR_POOL_SIZE 64
-
-static int rrpc_core_init(struct rrpc *rrpc)
-{
- down_write(&rrpc_lock);
- if (!rrpc_gcb_cache) {
- rrpc_gcb_cache = kmem_cache_create("rrpc_gcb",
- sizeof(struct rrpc_block_gc), 0, 0, NULL);
- if (!rrpc_gcb_cache) {
- up_write(&rrpc_lock);
- return -ENOMEM;
- }
-
- rrpc_rq_cache = kmem_cache_create("rrpc_rq",
- sizeof(struct nvm_rq) + sizeof(struct rrpc_rq),
- 0, 0, NULL);
- if (!rrpc_rq_cache) {
- kmem_cache_destroy(rrpc_gcb_cache);
- up_write(&rrpc_lock);
- return -ENOMEM;
- }
- }
- up_write(&rrpc_lock);
-
- rrpc->page_pool = mempool_create_page_pool(PAGE_POOL_SIZE, 0);
- if (!rrpc->page_pool)
- return -ENOMEM;
-
- rrpc->gcb_pool = mempool_create_slab_pool(rrpc->dev->geo.nr_luns,
- rrpc_gcb_cache);
- if (!rrpc->gcb_pool)
- return -ENOMEM;
-
- rrpc->rq_pool = mempool_create_slab_pool(64, rrpc_rq_cache);
- if (!rrpc->rq_pool)
- return -ENOMEM;
-
- spin_lock_init(&rrpc->inflights.lock);
- INIT_LIST_HEAD(&rrpc->inflights.reqs);
-
- return 0;
-}
-
-static void rrpc_core_free(struct rrpc *rrpc)
-{
- mempool_destroy(rrpc->page_pool);
- mempool_destroy(rrpc->gcb_pool);
- mempool_destroy(rrpc->rq_pool);
-}
-
-static void rrpc_luns_free(struct rrpc *rrpc)
-{
- struct rrpc_lun *rlun;
- int i;
-
- if (!rrpc->luns)
- return;
-
- for (i = 0; i < rrpc->nr_luns; i++) {
- rlun = &rrpc->luns[i];
- vfree(rlun->blocks);
- }
-
- kfree(rrpc->luns);
-}
-
-static int rrpc_bb_discovery(struct nvm_tgt_dev *dev, struct rrpc_lun *rlun)
-{
- struct nvm_geo *geo = &dev->geo;
- struct rrpc_block *rblk;
- struct ppa_addr ppa;
- u8 *blks;
- int nr_blks;
- int i;
- int ret;
-
- if (!dev->parent->ops->get_bb_tbl)
- return 0;
-
- nr_blks = geo->blks_per_lun * geo->plane_mode;
- blks = kmalloc(nr_blks, GFP_KERNEL);
- if (!blks)
- return -ENOMEM;
-
- ppa.ppa = 0;
- ppa.g.ch = rlun->bppa.g.ch;
- ppa.g.lun = rlun->bppa.g.lun;
-
- ret = nvm_get_tgt_bb_tbl(dev, ppa, blks);
- if (ret) {
- pr_err("rrpc: could not get BB table\n");
- goto out;
- }
-
- nr_blks = nvm_bb_tbl_fold(dev->parent, blks, nr_blks);
- if (nr_blks < 0) {
- ret = nr_blks;
- goto out;
- }
-
- for (i = 0; i < nr_blks; i++) {
- if (blks[i] == NVM_BLK_T_FREE)
- continue;
-
- rblk = &rlun->blocks[i];
- list_move_tail(&rblk->list, &rlun->bb_list);
- rblk->state = NVM_BLK_ST_BAD;
- rlun->nr_free_blocks--;
- }
-
-out:
- kfree(blks);
- return ret;
-}
-
-static void rrpc_set_lun_ppa(struct rrpc_lun *rlun, struct ppa_addr ppa)
-{
- rlun->bppa.ppa = 0;
- rlun->bppa.g.ch = ppa.g.ch;
- rlun->bppa.g.lun = ppa.g.lun;
-}
-
-static int rrpc_luns_init(struct rrpc *rrpc, struct ppa_addr *luns)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
- struct nvm_geo *geo = &dev->geo;
- struct rrpc_lun *rlun;
- int i, j, ret = -EINVAL;
-
- if (geo->sec_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) {
- pr_err("rrpc: number of pages per block too high.");
- return -EINVAL;
- }
-
- spin_lock_init(&rrpc->rev_lock);
-
- rrpc->luns = kcalloc(rrpc->nr_luns, sizeof(struct rrpc_lun),
- GFP_KERNEL);
- if (!rrpc->luns)
- return -ENOMEM;
-
- /* 1:1 mapping */
- for (i = 0; i < rrpc->nr_luns; i++) {
- rlun = &rrpc->luns[i];
- rlun->id = i;
- rrpc_set_lun_ppa(rlun, luns[i]);
- rlun->blocks = vzalloc(sizeof(struct rrpc_block) *
- geo->blks_per_lun);
- if (!rlun->blocks) {
- ret = -ENOMEM;
- goto err;
- }
-
- INIT_LIST_HEAD(&rlun->free_list);
- INIT_LIST_HEAD(&rlun->used_list);
- INIT_LIST_HEAD(&rlun->bb_list);
-
- for (j = 0; j < geo->blks_per_lun; j++) {
- struct rrpc_block *rblk = &rlun->blocks[j];
-
- rblk->id = j;
- rblk->rlun = rlun;
- rblk->state = NVM_BLK_T_FREE;
- INIT_LIST_HEAD(&rblk->prio);
- INIT_LIST_HEAD(&rblk->list);
- spin_lock_init(&rblk->lock);
-
- list_add_tail(&rblk->list, &rlun->free_list);
- }
-
- rlun->rrpc = rrpc;
- rlun->nr_free_blocks = geo->blks_per_lun;
- rlun->reserved_blocks = 2; /* for GC only */
-
- INIT_LIST_HEAD(&rlun->prio_list);
- INIT_LIST_HEAD(&rlun->wblk_list);
-
- INIT_WORK(&rlun->ws_gc, rrpc_lun_gc);
- spin_lock_init(&rlun->lock);
-
- if (rrpc_bb_discovery(dev, rlun))
- goto err;
-
- }
-
- return 0;
-err:
- return ret;
-}
-
-/* returns 0 on success and stores the beginning address in *begin */
-static int rrpc_area_init(struct rrpc *rrpc, sector_t *begin)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
- sector_t size = rrpc->nr_sects * dev->geo.sec_size;
- int ret;
-
- size >>= 9;
-
- ret = nvm_get_area(dev, begin, size);
- if (!ret)
- *begin >>= (ilog2(dev->geo.sec_size) - 9);
-
- return ret;
-}
-
-static void rrpc_area_free(struct rrpc *rrpc)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
- sector_t begin = rrpc->soffset << (ilog2(dev->geo.sec_size) - 9);
-
- nvm_put_area(dev, begin);
-}
-
-static void rrpc_free(struct rrpc *rrpc)
-{
- rrpc_gc_free(rrpc);
- rrpc_map_free(rrpc);
- rrpc_core_free(rrpc);
- rrpc_luns_free(rrpc);
- rrpc_area_free(rrpc);
-
- kfree(rrpc);
-}
-
-static void rrpc_exit(void *private)
-{
- struct rrpc *rrpc = private;
-
- del_timer(&rrpc->gc_timer);
-
- flush_workqueue(rrpc->krqd_wq);
- flush_workqueue(rrpc->kgc_wq);
-
- rrpc_free(rrpc);
-}
-
-static sector_t rrpc_capacity(void *private)
-{
- struct rrpc *rrpc = private;
- struct nvm_tgt_dev *dev = rrpc->dev;
- sector_t reserved, provisioned;
-
- /* cur, gc, and two emergency blocks for each lun */
- reserved = rrpc->nr_luns * dev->geo.sec_per_blk * 4;
- provisioned = rrpc->nr_sects - reserved;
-
- if (reserved > rrpc->nr_sects) {
- pr_err("rrpc: not enough space available to expose storage.\n");
- return 0;
- }
-
- sector_div(provisioned, 10);
- return provisioned * 9 * NR_PHY_IN_LOG;
-}
-
-/*
- * Looks up the logical address from reverse trans map and check if its valid by
- * comparing the logical to physical address with the physical address.
- * Returns 0 on free, otherwise 1 if in use
- */
-static void rrpc_block_map_update(struct rrpc *rrpc, struct rrpc_block *rblk)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
- int offset;
- struct rrpc_addr *laddr;
- u64 bpaddr, paddr, pladdr;
-
- bpaddr = block_to_rel_addr(rrpc, rblk);
- for (offset = 0; offset < dev->geo.sec_per_blk; offset++) {
- paddr = bpaddr + offset;
-
- pladdr = rrpc->rev_trans_map[paddr].addr;
- if (pladdr == ADDR_EMPTY)
- continue;
-
- laddr = &rrpc->trans_map[pladdr];
-
- if (paddr == laddr->addr) {
- laddr->rblk = rblk;
- } else {
- set_bit(offset, rblk->invalid_pages);
- rblk->nr_invalid_pages++;
- }
- }
-}
-
-static int rrpc_blocks_init(struct rrpc *rrpc)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
- struct rrpc_lun *rlun;
- struct rrpc_block *rblk;
- int lun_iter, blk_iter;
-
- for (lun_iter = 0; lun_iter < rrpc->nr_luns; lun_iter++) {
- rlun = &rrpc->luns[lun_iter];
-
- for (blk_iter = 0; blk_iter < dev->geo.blks_per_lun;
- blk_iter++) {
- rblk = &rlun->blocks[blk_iter];
- rrpc_block_map_update(rrpc, rblk);
- }
- }
-
- return 0;
-}
-
-static int rrpc_luns_configure(struct rrpc *rrpc)
-{
- struct rrpc_lun *rlun;
- struct rrpc_block *rblk;
- int i;
-
- for (i = 0; i < rrpc->nr_luns; i++) {
- rlun = &rrpc->luns[i];
-
- rblk = rrpc_get_blk(rrpc, rlun, 0);
- if (!rblk)
- goto err;
- rrpc_set_lun_cur(rlun, rblk, &rlun->cur);
-
- /* Emergency gc block */
- rblk = rrpc_get_blk(rrpc, rlun, 1);
- if (!rblk)
- goto err;
- rrpc_set_lun_cur(rlun, rblk, &rlun->gc_cur);
- }
-
- return 0;
-err:
- rrpc_put_blks(rrpc);
- return -EINVAL;
-}
-
-static struct nvm_tgt_type tt_rrpc;
-
-static void *rrpc_init(struct nvm_tgt_dev *dev, struct gendisk *tdisk,
- int flags)
-{
- struct request_queue *bqueue = dev->q;
- struct request_queue *tqueue = tdisk->queue;
- struct nvm_geo *geo = &dev->geo;
- struct rrpc *rrpc;
- sector_t soffset;
- int ret;
-
- if (!(dev->identity.dom & NVM_RSP_L2P)) {
- pr_err("nvm: rrpc: device does not support l2p (%x)\n",
- dev->identity.dom);
- return ERR_PTR(-EINVAL);
- }
-
- rrpc = kzalloc(sizeof(struct rrpc), GFP_KERNEL);
- if (!rrpc)
- return ERR_PTR(-ENOMEM);
-
- rrpc->dev = dev;
- rrpc->disk = tdisk;
-
- bio_list_init(&rrpc->requeue_bios);
- spin_lock_init(&rrpc->bio_lock);
- INIT_WORK(&rrpc->ws_requeue, rrpc_requeue);
-
- rrpc->nr_luns = geo->nr_luns;
- rrpc->nr_sects = (unsigned long long)geo->sec_per_lun * rrpc->nr_luns;
-
- /* simple round-robin strategy */
- atomic_set(&rrpc->next_lun, -1);
-
- ret = rrpc_area_init(rrpc, &soffset);
- if (ret < 0) {
- pr_err("nvm: rrpc: could not initialize area\n");
- return ERR_PTR(ret);
- }
- rrpc->soffset = soffset;
-
- ret = rrpc_luns_init(rrpc, dev->luns);
- if (ret) {
- pr_err("nvm: rrpc: could not initialize luns\n");
- goto err;
- }
-
- ret = rrpc_core_init(rrpc);
- if (ret) {
- pr_err("nvm: rrpc: could not initialize core\n");
- goto err;
- }
-
- ret = rrpc_map_init(rrpc);
- if (ret) {
- pr_err("nvm: rrpc: could not initialize maps\n");
- goto err;
- }
-
- ret = rrpc_blocks_init(rrpc);
- if (ret) {
- pr_err("nvm: rrpc: could not initialize state for blocks\n");
- goto err;
- }
-
- ret = rrpc_luns_configure(rrpc);
- if (ret) {
- pr_err("nvm: rrpc: not enough blocks available in LUNs.\n");
- goto err;
- }
-
- ret = rrpc_gc_init(rrpc);
- if (ret) {
- pr_err("nvm: rrpc: could not initialize gc\n");
- goto err;
- }
-
- /* inherit the size from the underlying device */
- blk_queue_logical_block_size(tqueue, queue_physical_block_size(bqueue));
- blk_queue_max_hw_sectors(tqueue, queue_max_hw_sectors(bqueue));
-
- pr_info("nvm: rrpc initialized with %u luns and %llu pages.\n",
- rrpc->nr_luns, (unsigned long long)rrpc->nr_sects);
-
- mod_timer(&rrpc->gc_timer, jiffies + msecs_to_jiffies(10));
-
- return rrpc;
-err:
- rrpc_free(rrpc);
- return ERR_PTR(ret);
-}
-
-/* round robin, page-based FTL, and cost-based GC */
-static struct nvm_tgt_type tt_rrpc = {
- .name = "rrpc",
- .version = {1, 0, 0},
-
- .make_rq = rrpc_make_rq,
- .capacity = rrpc_capacity,
-
- .init = rrpc_init,
- .exit = rrpc_exit,
-};
-
-static int __init rrpc_module_init(void)
-{
- return nvm_register_tgt_type(&tt_rrpc);
-}
-
-static void rrpc_module_exit(void)
-{
- nvm_unregister_tgt_type(&tt_rrpc);
-}
-
-module_init(rrpc_module_init);
-module_exit(rrpc_module_exit);
-MODULE_LICENSE("GPL v2");
-MODULE_DESCRIPTION("Block-Device Target for Open-Channel SSDs");
diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h
deleted file mode 100644
index fdb6ff902903..000000000000
--- a/drivers/lightnvm/rrpc.h
+++ /dev/null
@@ -1,290 +0,0 @@
-/*
- * Copyright (C) 2015 IT University of Copenhagen
- * Initial release: Matias Bjorling <m@bjorling.me>
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License version
- * 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
- * Implementation of a Round-robin page-based Hybrid FTL for Open-channel SSDs.
- */
-
-#ifndef RRPC_H_
-#define RRPC_H_
-
-#include <linux/blkdev.h>
-#include <linux/blk-mq.h>
-#include <linux/bio.h>
-#include <linux/module.h>
-#include <linux/kthread.h>
-#include <linux/vmalloc.h>
-
-#include <linux/lightnvm.h>
-
-/* Run only GC if less than 1/X blocks are free */
-#define GC_LIMIT_INVERSE 10
-#define GC_TIME_SECS 100
-
-#define RRPC_SECTOR (512)
-#define RRPC_EXPOSED_PAGE_SIZE (4096)
-
-#define NR_PHY_IN_LOG (RRPC_EXPOSED_PAGE_SIZE / RRPC_SECTOR)
-
-struct rrpc_inflight {
- struct list_head reqs;
- spinlock_t lock;
-};
-
-struct rrpc_inflight_rq {
- struct list_head list;
- sector_t l_start;
- sector_t l_end;
-};
-
-struct rrpc_rq {
- struct rrpc_inflight_rq inflight_rq;
- unsigned long flags;
-};
-
-struct rrpc_block {
- int id; /* id inside of LUN */
- struct rrpc_lun *rlun;
-
- struct list_head prio; /* LUN CG list */
- struct list_head list; /* LUN free, used, bb list */
-
-#define MAX_INVALID_PAGES_STORAGE 8
- /* Bitmap for invalid page intries */
- unsigned long invalid_pages[MAX_INVALID_PAGES_STORAGE];
- /* points to the next writable page within a block */
- unsigned int next_page;
- /* number of pages that are invalid, wrt host page size */
- unsigned int nr_invalid_pages;
-
- int state;
-
- spinlock_t lock;
- atomic_t data_cmnt_size; /* data pages committed to stable storage */
-};
-
-struct rrpc_lun {
- struct rrpc *rrpc;
-
- int id;
- struct ppa_addr bppa;
-
- struct rrpc_block *cur, *gc_cur;
- struct rrpc_block *blocks; /* Reference to block allocation */
-
- struct list_head prio_list; /* Blocks that may be GC'ed */
- struct list_head wblk_list; /* Queued blocks to be written to */
-
- /* lun block lists */
- struct list_head used_list; /* In-use blocks */
- struct list_head free_list; /* Not used blocks i.e. released
- * and ready for use
- */
- struct list_head bb_list; /* Bad blocks. Mutually exclusive with
- * free_list and used_list
- */
- unsigned int nr_free_blocks; /* Number of unused blocks */
-
- struct work_struct ws_gc;
-
- int reserved_blocks;
-
- spinlock_t lock;
-};
-
-struct rrpc {
- struct nvm_tgt_dev *dev;
- struct gendisk *disk;
-
- sector_t soffset; /* logical sector offset */
-
- int nr_luns;
- struct rrpc_lun *luns;
-
- /* calculated values */
- unsigned long long nr_sects;
-
- /* Write strategy variables. Move these into each for structure for each
- * strategy
- */
- atomic_t next_lun; /* Whenever a page is written, this is updated
- * to point to the next write lun
- */
-
- spinlock_t bio_lock;
- struct bio_list requeue_bios;
- struct work_struct ws_requeue;
-
- /* Simple translation map of logical addresses to physical addresses.
- * The logical addresses is known by the host system, while the physical
- * addresses are used when writing to the disk block device.
- */
- struct rrpc_addr *trans_map;
- /* also store a reverse map for garbage collection */
- struct rrpc_rev_addr *rev_trans_map;
- spinlock_t rev_lock;
-
- struct rrpc_inflight inflights;
-
- mempool_t *addr_pool;
- mempool_t *page_pool;
- mempool_t *gcb_pool;
- mempool_t *rq_pool;
-
- struct timer_list gc_timer;
- struct workqueue_struct *krqd_wq;
- struct workqueue_struct *kgc_wq;
-};
-
-struct rrpc_block_gc {
- struct rrpc *rrpc;
- struct rrpc_block *rblk;
- struct work_struct ws_gc;
-};
-
-/* Logical to physical mapping */
-struct rrpc_addr {
- u64 addr;
- struct rrpc_block *rblk;
-};
-
-/* Physical to logical mapping */
-struct rrpc_rev_addr {
- u64 addr;
-};
-
-static inline struct ppa_addr rrpc_linear_to_generic_addr(struct nvm_geo *geo,
- struct ppa_addr r)
-{
- struct ppa_addr l;
- int secs, pgs;
- sector_t ppa = r.ppa;
-
- l.ppa = 0;
-
- div_u64_rem(ppa, geo->sec_per_pg, &secs);
- l.g.sec = secs;
-
- sector_div(ppa, geo->sec_per_pg);
- div_u64_rem(ppa, geo->pgs_per_blk, &pgs);
- l.g.pg = pgs;
-
- return l;
-}
-
-static inline struct ppa_addr rrpc_recov_addr(struct nvm_tgt_dev *dev, u64 pba)
-{
- return linear_to_generic_addr(&dev->geo, pba);
-}
-
-static inline u64 rrpc_blk_to_ppa(struct rrpc *rrpc, struct rrpc_block *rblk)
-{
- struct nvm_tgt_dev *dev = rrpc->dev;
- struct nvm_geo *geo = &dev->geo;
- struct rrpc_lun *rlun = rblk->rlun;
-
- return (rlun->id * geo->sec_per_lun) + (rblk->id * geo->sec_per_blk);
-}
-
-static inline sector_t rrpc_get_laddr(struct bio *bio)
-{
- return bio->bi_iter.bi_sector / NR_PHY_IN_LOG;
-}
-
-static inline unsigned int rrpc_get_pages(struct bio *bio)
-{
- return bio->bi_iter.bi_size / RRPC_EXPOSED_PAGE_SIZE;
-}
-
-static inline sector_t rrpc_get_sector(sector_t laddr)
-{
- return laddr * NR_PHY_IN_LOG;
-}
-
-static inline int request_intersects(struct rrpc_inflight_rq *r,
- sector_t laddr_start, sector_t laddr_end)
-{
- return (laddr_end >= r->l_start) && (laddr_start <= r->l_end);
-}
-
-static int __rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
- unsigned int pages, struct rrpc_inflight_rq *r)
-{
- sector_t laddr_end = laddr + pages - 1;
- struct rrpc_inflight_rq *rtmp;
-
- WARN_ON(irqs_disabled());
-
- spin_lock_irq(&rrpc->inflights.lock);
- list_for_each_entry(rtmp, &rrpc->inflights.reqs, list) {
- if (unlikely(request_intersects(rtmp, laddr, laddr_end))) {
- /* existing, overlapping request, come back later */
- spin_unlock_irq(&rrpc->inflights.lock);
- return 1;
- }
- }
-
- r->l_start = laddr;
- r->l_end = laddr_end;
-
- list_add_tail(&r->list, &rrpc->inflights.reqs);
- spin_unlock_irq(&rrpc->inflights.lock);
- return 0;
-}
-
-static inline int rrpc_lock_laddr(struct rrpc *rrpc, sector_t laddr,
- unsigned int pages,
- struct rrpc_inflight_rq *r)
-{
- BUG_ON((laddr + pages) > rrpc->nr_sects);
-
- return __rrpc_lock_laddr(rrpc, laddr, pages, r);
-}
-
-static inline struct rrpc_inflight_rq *rrpc_get_inflight_rq(struct nvm_rq *rqd)
-{
- struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd);
-
- return &rrqd->inflight_rq;
-}
-
-static inline int rrpc_lock_rq(struct rrpc *rrpc, struct bio *bio,
- struct nvm_rq *rqd)
-{
- sector_t laddr = rrpc_get_laddr(bio);
- unsigned int pages = rrpc_get_pages(bio);
- struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
-
- return rrpc_lock_laddr(rrpc, laddr, pages, r);
-}
-
-static inline void rrpc_unlock_laddr(struct rrpc *rrpc,
- struct rrpc_inflight_rq *r)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&rrpc->inflights.lock, flags);
- list_del_init(&r->list);
- spin_unlock_irqrestore(&rrpc->inflights.lock, flags);
-}
-
-static inline void rrpc_unlock_rq(struct rrpc *rrpc, struct nvm_rq *rqd)
-{
- struct rrpc_inflight_rq *r = rrpc_get_inflight_rq(rqd);
- uint8_t pages = rqd->nr_ppas;
-
- BUG_ON((r->l_start + pages) > rrpc->nr_sects);
-
- rrpc_unlock_laddr(rrpc, r);
-}
-
-#endif /* RRPC_H_ */
diff --git a/drivers/md/bcache/alloc.c b/drivers/md/bcache/alloc.c
index a0cc1bc6d884..6cc6c0f9c3a9 100644
--- a/drivers/md/bcache/alloc.c
+++ b/drivers/md/bcache/alloc.c
@@ -525,15 +525,21 @@ struct open_bucket {
/*
* We keep multiple buckets open for writes, and try to segregate different
- * write streams for better cache utilization: first we look for a bucket where
- * the last write to it was sequential with the current write, and failing that
- * we look for a bucket that was last used by the same task.
+ * write streams for better cache utilization: first we try to segregate flash
+ * only volume write streams from cached devices, secondly we look for a bucket
+ * where the last write to it was sequential with the current write, and
+ * failing that we look for a bucket that was last used by the same task.
*
* The ideas is if you've got multiple tasks pulling data into the cache at the
* same time, you'll get better cache utilization if you try to segregate their
* data and preserve locality.
*
- * For example, say you've starting Firefox at the same time you're copying a
+ * For example, dirty sectors of flash only volume is not reclaimable, if their
+ * dirty sectors mixed with dirty sectors of cached device, such buckets will
+ * be marked as dirty and won't be reclaimed, though the dirty data of cached
+ * device have been written back to backend device.
+ *
+ * And say you've starting Firefox at the same time you're copying a
* bunch of files. Firefox will likely end up being fairly hot and stay in the
* cache awhile, but the data you copied might not be; if you wrote all that
* data to the same buckets it'd get invalidated at the same time.
@@ -550,7 +556,10 @@ static struct open_bucket *pick_data_bucket(struct cache_set *c,
struct open_bucket *ret, *ret_task = NULL;
list_for_each_entry_reverse(ret, &c->data_buckets, list)
- if (!bkey_cmp(&ret->key, search))
+ if (UUID_FLASH_ONLY(&c->uuids[KEY_INODE(&ret->key)]) !=
+ UUID_FLASH_ONLY(&c->uuids[KEY_INODE(search)]))
+ continue;
+ else if (!bkey_cmp(&ret->key, search))
goto found;
else if (ret->last_write_point == write_point)
ret_task = ret;
diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h
index 843877e017e1..5e2d4e80198e 100644
--- a/drivers/md/bcache/bcache.h
+++ b/drivers/md/bcache/bcache.h
@@ -320,14 +320,15 @@ struct cached_dev {
*/
atomic_t has_dirty;
- struct bch_ratelimit writeback_rate;
- struct delayed_work writeback_rate_update;
-
/*
- * Internal to the writeback code, so read_dirty() can keep track of
- * where it's at.
+ * Set to zero by things that touch the backing volume-- except
+ * writeback. Incremented by writeback. Used to determine when to
+ * accelerate idle writeback.
*/
- sector_t last_read;
+ atomic_t backing_idle;
+
+ struct bch_ratelimit writeback_rate;
+ struct delayed_work writeback_rate_update;
/* Limit number of writeback bios in flight */
struct semaphore in_flight;
@@ -336,6 +337,14 @@ struct cached_dev {
struct keybuf writeback_keys;
+ /*
+ * Order the write-half of writeback operations strongly in dispatch
+ * order. (Maintain LBA order; don't allow reads completing out of
+ * order to re-order the writes...)
+ */
+ struct closure_waitlist writeback_ordering_wait;
+ atomic_t writeback_sequence_next;
+
/* For tracking sequential IO */
#define RECENT_IO_BITS 7
#define RECENT_IO (1 << RECENT_IO_BITS)
@@ -488,6 +497,7 @@ struct cache_set {
int caches_loaded;
struct bcache_device **devices;
+ unsigned devices_max_used;
struct list_head cached_devs;
uint64_t cached_dev_sectors;
struct closure caching;
@@ -852,7 +862,7 @@ static inline void wake_up_allocators(struct cache_set *c)
/* Forward declarations */
-void bch_count_io_errors(struct cache *, blk_status_t, const char *);
+void bch_count_io_errors(struct cache *, blk_status_t, int, const char *);
void bch_bbio_count_io_errors(struct cache_set *, struct bio *,
blk_status_t, const char *);
void bch_bbio_endio(struct cache_set *, struct bio *, blk_status_t,
diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c
index 81e8dc3dbe5e..bf3a48aa9a9a 100644
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -419,7 +419,7 @@ static void do_btree_node_write(struct btree *b)
SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
bset_sector_offset(&b->keys, i));
- if (!bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
+ if (!bch_bio_alloc_pages(b->bio, __GFP_NOWARN|GFP_NOWAIT)) {
int j;
struct bio_vec *bv;
void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
@@ -432,6 +432,7 @@ static void do_btree_node_write(struct btree *b)
continue_at(cl, btree_node_write_done, NULL);
} else {
+ /* No problem for multipage bvec since the bio is just allocated */
b->bio->bi_vcnt = 0;
bch_bio_map(b->bio, i);
@@ -1678,7 +1679,7 @@ static void bch_btree_gc_finish(struct cache_set *c)
/* don't reclaim buckets to which writeback keys point */
rcu_read_lock();
- for (i = 0; i < c->nr_uuids; i++) {
+ for (i = 0; i < c->devices_max_used; i++) {
struct bcache_device *d = c->devices[i];
struct cached_dev *dc;
struct keybuf_key *w, *n;
@@ -1803,10 +1804,7 @@ static int bch_gc_thread(void *arg)
int bch_gc_thread_start(struct cache_set *c)
{
c->gc_thread = kthread_run(bch_gc_thread, c, "bcache_gc");
- if (IS_ERR(c->gc_thread))
- return PTR_ERR(c->gc_thread);
-
- return 0;
+ return PTR_ERR_OR_ZERO(c->gc_thread);
}
/* Initial partial gc */
diff --git a/drivers/md/bcache/closure.c b/drivers/md/bcache/closure.c
index 1841d0359bac..7f12920c14f7 100644
--- a/drivers/md/bcache/closure.c
+++ b/drivers/md/bcache/closure.c
@@ -8,6 +8,7 @@
#include <linux/debugfs.h>
#include <linux/module.h>
#include <linux/seq_file.h>
+#include <linux/sched/debug.h>
#include "closure.h"
@@ -18,10 +19,6 @@ static inline void closure_put_after_sub(struct closure *cl, int flags)
BUG_ON(flags & CLOSURE_GUARD_MASK);
BUG_ON(!r && (flags & ~CLOSURE_DESTRUCTOR));
- /* Must deliver precisely one wakeup */
- if (r == 1 && (flags & CLOSURE_SLEEPING))
- wake_up_process(cl->task);
-
if (!r) {
if (cl->fn && !(flags & CLOSURE_DESTRUCTOR)) {
atomic_set(&cl->remaining,
@@ -100,28 +97,34 @@ bool closure_wait(struct closure_waitlist *waitlist, struct closure *cl)
}
EXPORT_SYMBOL(closure_wait);
-/**
- * closure_sync - sleep until a closure has nothing left to wait on
- *
- * Sleeps until the refcount hits 1 - the thread that's running the closure owns
- * the last refcount.
- */
-void closure_sync(struct closure *cl)
+struct closure_syncer {
+ struct task_struct *task;
+ int done;
+};
+
+static void closure_sync_fn(struct closure *cl)
{
- while (1) {
- __closure_start_sleep(cl);
- closure_set_ret_ip(cl);
+ cl->s->done = 1;
+ wake_up_process(cl->s->task);
+}
- if ((atomic_read(&cl->remaining) &
- CLOSURE_REMAINING_MASK) == 1)
- break;
+void __sched __closure_sync(struct closure *cl)
+{
+ struct closure_syncer s = { .task = current };
+ cl->s = &s;
+ continue_at(cl, closure_sync_fn, NULL);
+
+ while (1) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (s.done)
+ break;
schedule();
}
- __closure_end_sleep(cl);
+ __set_current_state(TASK_RUNNING);
}
-EXPORT_SYMBOL(closure_sync);
+EXPORT_SYMBOL(__closure_sync);
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
@@ -168,12 +171,10 @@ static int debug_seq_show(struct seq_file *f, void *data)
cl, (void *) cl->ip, cl->fn, cl->parent,
r & CLOSURE_REMAINING_MASK);
- seq_printf(f, "%s%s%s%s\n",
+ seq_printf(f, "%s%s\n",
test_bit(WORK_STRUCT_PENDING_BIT,
work_data_bits(&cl->work)) ? "Q" : "",
- r & CLOSURE_RUNNING ? "R" : "",
- r & CLOSURE_STACK ? "S" : "",
- r & CLOSURE_SLEEPING ? "Sl" : "");
+ r & CLOSURE_RUNNING ? "R" : "");
if (r & CLOSURE_WAITING)
seq_printf(f, " W %pF\n",
diff --git a/drivers/md/bcache/closure.h b/drivers/md/bcache/closure.h
index ccfbea6f9f6b..3b9dfc9962ad 100644
--- a/drivers/md/bcache/closure.h
+++ b/drivers/md/bcache/closure.h
@@ -103,6 +103,7 @@
*/
struct closure;
+struct closure_syncer;
typedef void (closure_fn) (struct closure *);
struct closure_waitlist {
@@ -115,10 +116,6 @@ enum closure_state {
* the thread that owns the closure, and cleared by the thread that's
* waking up the closure.
*
- * CLOSURE_SLEEPING: Must be set before a thread uses a closure to sleep
- * - indicates that cl->task is valid and closure_put() may wake it up.
- * Only set or cleared by the thread that owns the closure.
- *
* The rest are for debugging and don't affect behaviour:
*
* CLOSURE_RUNNING: Set when a closure is running (i.e. by
@@ -128,22 +125,16 @@ enum closure_state {
* continue_at() and closure_return() clear it for you, if you're doing
* something unusual you can use closure_set_dead() which also helps
* annotate where references are being transferred.
- *
- * CLOSURE_STACK: Sanity check - remaining should never hit 0 on a
- * closure with this flag set
*/
- CLOSURE_BITS_START = (1 << 23),
- CLOSURE_DESTRUCTOR = (1 << 23),
- CLOSURE_WAITING = (1 << 25),
- CLOSURE_SLEEPING = (1 << 27),
- CLOSURE_RUNNING = (1 << 29),
- CLOSURE_STACK = (1 << 31),
+ CLOSURE_BITS_START = (1U << 26),
+ CLOSURE_DESTRUCTOR = (1U << 26),
+ CLOSURE_WAITING = (1U << 28),
+ CLOSURE_RUNNING = (1U << 30),
};
#define CLOSURE_GUARD_MASK \
- ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_SLEEPING| \
- CLOSURE_RUNNING|CLOSURE_STACK) << 1)
+ ((CLOSURE_DESTRUCTOR|CLOSURE_WAITING|CLOSURE_RUNNING) << 1)
#define CLOSURE_REMAINING_MASK (CLOSURE_BITS_START - 1)
#define CLOSURE_REMAINING_INITIALIZER (1|CLOSURE_RUNNING)
@@ -152,7 +143,7 @@ struct closure {
union {
struct {
struct workqueue_struct *wq;
- struct task_struct *task;
+ struct closure_syncer *s;
struct llist_node list;
closure_fn *fn;
};
@@ -178,7 +169,19 @@ void closure_sub(struct closure *cl, int v);
void closure_put(struct closure *cl);
void __closure_wake_up(struct closure_waitlist *list);
bool closure_wait(struct closure_waitlist *list, struct closure *cl);
-void closure_sync(struct closure *cl);
+void __closure_sync(struct closure *cl);
+
+/**
+ * closure_sync - sleep until a closure has nothing left to wait on
+ *
+ * Sleeps until the refcount hits 1 - the thread that's running the closure owns
+ * the last refcount.
+ */
+static inline void closure_sync(struct closure *cl)
+{
+ if ((atomic_read(&cl->remaining) & CLOSURE_REMAINING_MASK) != 1)
+ __closure_sync(cl);
+}
#ifdef CONFIG_BCACHE_CLOSURES_DEBUG
@@ -215,24 +218,6 @@ static inline void closure_set_waiting(struct closure *cl, unsigned long f)
#endif
}
-static inline void __closure_end_sleep(struct closure *cl)
-{
- __set_current_state(TASK_RUNNING);
-
- if (atomic_read(&cl->remaining) & CLOSURE_SLEEPING)
- atomic_sub(CLOSURE_SLEEPING, &cl->remaining);
-}
-
-static inline void __closure_start_sleep(struct closure *cl)
-{
- closure_set_ip(cl);
- cl->task = current;
- set_current_state(TASK_UNINTERRUPTIBLE);
-
- if (!(atomic_read(&cl->remaining) & CLOSURE_SLEEPING))
- atomic_add(CLOSURE_SLEEPING, &cl->remaining);
-}
-
static inline void closure_set_stopped(struct closure *cl)
{
atomic_sub(CLOSURE_RUNNING, &cl->remaining);
@@ -241,7 +226,6 @@ static inline void closure_set_stopped(struct closure *cl)
static inline void set_closure_fn(struct closure *cl, closure_fn *fn,
struct workqueue_struct *wq)
{
- BUG_ON(object_is_on_stack(cl));
closure_set_ip(cl);
cl->fn = fn;
cl->wq = wq;
@@ -300,7 +284,7 @@ static inline void closure_init(struct closure *cl, struct closure *parent)
static inline void closure_init_stack(struct closure *cl)
{
memset(cl, 0, sizeof(struct closure));
- atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER|CLOSURE_STACK);
+ atomic_set(&cl->remaining, CLOSURE_REMAINING_INITIALIZER);
}
/**
@@ -322,6 +306,8 @@ static inline void closure_wake_up(struct closure_waitlist *list)
* This is because after calling continue_at() you no longer have a ref on @cl,
* and whatever @cl owns may be freed out from under you - a running closure fn
* has a ref on its own closure which continue_at() drops.
+ *
+ * Note you are expected to immediately return after using this macro.
*/
#define continue_at(_cl, _fn, _wq) \
do { \
diff --git a/drivers/md/bcache/debug.c b/drivers/md/bcache/debug.c
index c7a02c4900da..af89408befe8 100644
--- a/drivers/md/bcache/debug.c
+++ b/drivers/md/bcache/debug.c
@@ -116,7 +116,7 @@ void bch_data_verify(struct cached_dev *dc, struct bio *bio)
return;
check->bi_opf = REQ_OP_READ;
- if (bio_alloc_pages(check, GFP_NOIO))
+ if (bch_bio_alloc_pages(check, GFP_NOIO))
goto out_put;
submit_bio_wait(check);
@@ -251,8 +251,7 @@ void bch_debug_exit(void)
int __init bch_debug_init(struct kobject *kobj)
{
- int ret = 0;
-
debug = debugfs_create_dir("bcache", NULL);
- return ret;
+
+ return IS_ERR_OR_NULL(debug);
}
diff --git a/drivers/md/bcache/io.c b/drivers/md/bcache/io.c
index fac97ec2d0e2..a783c5a41ff1 100644
--- a/drivers/md/bcache/io.c
+++ b/drivers/md/bcache/io.c
@@ -51,7 +51,10 @@ void bch_submit_bbio(struct bio *bio, struct cache_set *c,
/* IO errors */
-void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m)
+void bch_count_io_errors(struct cache *ca,
+ blk_status_t error,
+ int is_read,
+ const char *m)
{
/*
* The halflife of an error is:
@@ -94,8 +97,9 @@ void bch_count_io_errors(struct cache *ca, blk_status_t error, const char *m)
errors >>= IO_ERROR_SHIFT;
if (errors < ca->set->error_limit)
- pr_err("%s: IO error on %s, recovering",
- bdevname(ca->bdev, buf), m);
+ pr_err("%s: IO error on %s%s",
+ bdevname(ca->bdev, buf), m,
+ is_read ? ", recovering." : ".");
else
bch_cache_set_error(ca->set,
"%s: too many IO errors %s",
@@ -108,6 +112,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
{
struct bbio *b = container_of(bio, struct bbio, bio);
struct cache *ca = PTR_CACHE(c, &b->key, 0);
+ int is_read = (bio_data_dir(bio) == READ ? 1 : 0);
unsigned threshold = op_is_write(bio_op(bio))
? c->congested_write_threshold_us
@@ -129,7 +134,7 @@ void bch_bbio_count_io_errors(struct cache_set *c, struct bio *bio,
atomic_inc(&c->congested);
}
- bch_count_io_errors(ca, error, m);
+ bch_count_io_errors(ca, error, is_read, m);
}
void bch_bbio_endio(struct cache_set *c, struct bio *bio,
diff --git a/drivers/md/bcache/movinggc.c b/drivers/md/bcache/movinggc.c
index d50c1c97da68..a24c3a95b2c0 100644
--- a/drivers/md/bcache/movinggc.c
+++ b/drivers/md/bcache/movinggc.c
@@ -162,7 +162,7 @@ static void read_moving(struct cache_set *c)
bio_set_op_attrs(bio, REQ_OP_READ, 0);
bio->bi_end_io = read_moving_endio;
- if (bio_alloc_pages(bio, GFP_KERNEL))
+ if (bch_bio_alloc_pages(bio, GFP_KERNEL))
goto err;
trace_bcache_gc_copy(&w->key);
diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c
index 643c3021624f..1a46b41dac70 100644
--- a/drivers/md/bcache/request.c
+++ b/drivers/md/bcache/request.c
@@ -576,6 +576,7 @@ static void cache_lookup(struct closure *cl)
{
struct search *s = container_of(cl, struct search, iop.cl);
struct bio *bio = &s->bio.bio;
+ struct cached_dev *dc;
int ret;
bch_btree_op_init(&s->op, -1);
@@ -588,6 +589,27 @@ static void cache_lookup(struct closure *cl)
return;
}
+ /*
+ * We might hit an error when searching the btree. If that happens, we will
+ * get a negative ret; in this scenario we should not recover data from the
+ * backing device (when the cache device is dirty) because we don't know
+ * whether the bkeys the read request covered are all clean.
+ *
+ * And after that has happened, s->iop.status is still its initial value
+ * from before we submitted s->bio.bio
+ */
+ if (ret < 0) {
+ BUG_ON(ret == -EINTR);
+ if (s->d && s->d->c &&
+ !UUID_FLASH_ONLY(&s->d->c->uuids[s->d->id])) {
+ dc = container_of(s->d, struct cached_dev, disk);
+ if (dc && atomic_read(&dc->has_dirty))
+ s->recoverable = false;
+ }
+ if (!s->iop.status)
+ s->iop.status = BLK_STS_IOERR;
+ }
+
closure_return(cl);
}
@@ -611,8 +633,8 @@ static void request_endio(struct bio *bio)
static void bio_complete(struct search *s)
{
if (s->orig_bio) {
- struct request_queue *q = s->orig_bio->bi_disk->queue;
- generic_end_io_acct(q, bio_data_dir(s->orig_bio),
+ generic_end_io_acct(s->d->disk->queue,
+ bio_data_dir(s->orig_bio),
&s->d->disk->part0, s->start_time);
trace_bcache_request_end(s->d, s->orig_bio);
@@ -841,7 +863,7 @@ static int cached_dev_cache_miss(struct btree *b, struct search *s,
cache_bio->bi_private = &s->cl;
bch_bio_map(cache_bio, NULL);
- if (bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
+ if (bch_bio_alloc_pages(cache_bio, __GFP_NOWARN|GFP_NOIO))
goto out_put;
if (reada)
@@ -974,6 +996,7 @@ static blk_qc_t cached_dev_make_request(struct request_queue *q,
struct cached_dev *dc = container_of(d, struct cached_dev, disk);
int rw = bio_data_dir(bio);
+ atomic_set(&dc->backing_idle, 0);
generic_start_io_acct(q, rw, bio_sectors(bio), &d->disk->part0);
bio_set_dev(bio, dc->bdev);
diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c
index b4d28928dec5..133b81225ea9 100644
--- a/drivers/md/bcache/super.c
+++ b/drivers/md/bcache/super.c
@@ -211,7 +211,7 @@ static void write_bdev_super_endio(struct bio *bio)
static void __write_super(struct cache_sb *sb, struct bio *bio)
{
- struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
+ struct cache_sb *out = page_address(bio_first_page_all(bio));
unsigned i;
bio->bi_iter.bi_sector = SB_SECTOR;
@@ -274,7 +274,9 @@ static void write_super_endio(struct bio *bio)
{
struct cache *ca = bio->bi_private;
- bch_count_io_errors(ca, bio->bi_status, "writing superblock");
+ /* is_read = 0 */
+ bch_count_io_errors(ca, bio->bi_status, 0,
+ "writing superblock");
closure_put(&ca->set->sb_write);
}
@@ -721,6 +723,9 @@ static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
d->c = c;
c->devices[id] = d;
+ if (id >= c->devices_max_used)
+ c->devices_max_used = id + 1;
+
closure_get(&c->caching);
}
@@ -906,6 +911,12 @@ static void cached_dev_detach_finish(struct work_struct *w)
mutex_lock(&bch_register_lock);
+ cancel_delayed_work_sync(&dc->writeback_rate_update);
+ if (!IS_ERR_OR_NULL(dc->writeback_thread)) {
+ kthread_stop(dc->writeback_thread);
+ dc->writeback_thread = NULL;
+ }
+
memset(&dc->sb.set_uuid, 0, 16);
SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);
@@ -1166,7 +1177,7 @@ static void register_bdev(struct cache_sb *sb, struct page *sb_page,
dc->bdev->bd_holder = dc;
bio_init(&dc->sb_bio, dc->sb_bio.bi_inline_vecs, 1);
- dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
+ bio_first_bvec_all(&dc->sb_bio)->bv_page = sb_page;
get_page(sb_page);
if (cached_dev_init(dc, sb->block_size << 9))
@@ -1261,7 +1272,7 @@ static int flash_devs_run(struct cache_set *c)
struct uuid_entry *u;
for (u = c->uuids;
- u < c->uuids + c->nr_uuids && !ret;
+ u < c->uuids + c->devices_max_used && !ret;
u++)
if (UUID_FLASH_ONLY(u))
ret = flash_dev_run(c, u);
@@ -1427,7 +1438,7 @@ static void __cache_set_unregister(struct closure *cl)
mutex_lock(&bch_register_lock);
- for (i = 0; i < c->nr_uuids; i++)
+ for (i = 0; i < c->devices_max_used; i++)
if (c->devices[i]) {
if (!UUID_FLASH_ONLY(&c->uuids[i]) &&
test_bit(CACHE_SET_UNREGISTERING, &c->flags)) {
@@ -1490,7 +1501,7 @@ struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
c->bucket_bits = ilog2(sb->bucket_size);
c->block_bits = ilog2(sb->block_size);
c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);
-
+ c->devices_max_used = 0;
c->btree_pages = bucket_pages(c);
if (c->btree_pages > BTREE_MAX_PAGES)
c->btree_pages = max_t(int, c->btree_pages / 4,
@@ -1810,7 +1821,7 @@ void bch_cache_release(struct kobject *kobj)
free_fifo(&ca->free[i]);
if (ca->sb_bio.bi_inline_vecs[0].bv_page)
- put_page(ca->sb_bio.bi_io_vec[0].bv_page);
+ put_page(bio_first_page_all(&ca->sb_bio));
if (!IS_ERR_OR_NULL(ca->bdev))
blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
@@ -1864,7 +1875,7 @@ static int register_cache(struct cache_sb *sb, struct page *sb_page,
ca->bdev->bd_holder = ca;
bio_init(&ca->sb_bio, ca->sb_bio.bi_inline_vecs, 1);
- ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
+ bio_first_bvec_all(&ca->sb_bio)->bv_page = sb_page;
get_page(sb_page);
if (blk_queue_discard(bdev_get_queue(ca->bdev)))
diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c
index e548b8b51322..a23cd6a14b74 100644
--- a/drivers/md/bcache/util.c
+++ b/drivers/md/bcache/util.c
@@ -249,6 +249,13 @@ uint64_t bch_next_delay(struct bch_ratelimit *d, uint64_t done)
: 0;
}
+/*
+ * Generally it isn't good to access .bi_io_vec and .bi_vcnt directly,
+ * the preferred way is bio_add_page, but in this case, bch_bio_map()
+ * supposes that the bvec table is empty, so it is safe to access
+ * .bi_vcnt & .bi_io_vec in this way even after multipage bvec is
+ * supported.
+ */
void bch_bio_map(struct bio *bio, void *base)
{
size_t size = bio->bi_iter.bi_size;
@@ -276,6 +283,33 @@ start: bv->bv_len = min_t(size_t, PAGE_SIZE - bv->bv_offset,
}
}
+/**
+ * bch_bio_alloc_pages - allocates a single page for each bvec in a bio
+ * @bio: bio to allocate pages for
+ * @gfp_mask: flags for allocation
+ *
+ * Allocates pages up to @bio->bi_vcnt.
+ *
+ * Returns 0 on success, -ENOMEM on failure. On failure, any allocated pages are
+ * freed.
+ */
+int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask)
+{
+ int i;
+ struct bio_vec *bv;
+
+ bio_for_each_segment_all(bv, bio, i) {
+ bv->bv_page = alloc_page(gfp_mask);
+ if (!bv->bv_page) {
+ while (--bv >= bio->bi_io_vec)
+ __free_page(bv->bv_page);
+ return -ENOMEM;
+ }
+ }
+
+ return 0;
+}
+
/*
* Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any
* use permitted, subject to terms of PostgreSQL license; see.)
diff --git a/drivers/md/bcache/util.h b/drivers/md/bcache/util.h
index ed5e8a412eb8..4df4c5c1cab2 100644
--- a/drivers/md/bcache/util.h
+++ b/drivers/md/bcache/util.h
@@ -558,6 +558,7 @@ static inline unsigned fract_exp_two(unsigned x, unsigned fract_bits)
}
void bch_bio_map(struct bio *bio, void *base);
+int bch_bio_alloc_pages(struct bio *bio, gfp_t gfp_mask);
static inline sector_t bdev_sectors(struct block_device *bdev)
{
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c
index 56a37884ca8b..51306a19ab03 100644
--- a/drivers/md/bcache/writeback.c
+++ b/drivers/md/bcache/writeback.c
@@ -18,17 +18,39 @@
#include <trace/events/bcache.h>
/* Rate limiting */
-
-static void __update_writeback_rate(struct cached_dev *dc)
+static uint64_t __calc_target_rate(struct cached_dev *dc)
{
struct cache_set *c = dc->disk.c;
+
+ /*
+ * This is the size of the cache, minus the amount used for
+ * flash-only devices
+ */
uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
bcache_flash_devs_sectors_dirty(c);
+
+ /*
+ * Unfortunately there is no control of global dirty data. If the
+ * user states that they want 10% dirty data in the cache, and has,
+ * e.g., 5 backing volumes of equal size, we try and ensure each
+ * backing volume uses about 2% of the cache for dirty data.
+ */
+ uint32_t bdev_share =
+ div64_u64(bdev_sectors(dc->bdev) << WRITEBACK_SHARE_SHIFT,
+ c->cached_dev_sectors);
+
uint64_t cache_dirty_target =
div_u64(cache_sectors * dc->writeback_percent, 100);
- int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
- c->cached_dev_sectors);
+ /* Ensure each backing dev gets at least one dirty share */
+ if (bdev_share < 1)
+ bdev_share = 1;
+
+ return (cache_dirty_target * bdev_share) >> WRITEBACK_SHARE_SHIFT;
+}
+
+static void __update_writeback_rate(struct cached_dev *dc)
+{
/*
* PI controller:
* Figures out the amount that should be written per second.
@@ -49,6 +71,7 @@ static void __update_writeback_rate(struct cached_dev *dc)
* This acts as a slow, long-term average that is not subject to
* variations in usage like the p term.
*/
+ int64_t target = __calc_target_rate(dc);
int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
int64_t error = dirty - target;
int64_t proportional_scaled =
@@ -116,6 +139,7 @@ static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
struct dirty_io {
struct closure cl;
struct cached_dev *dc;
+ uint16_t sequence;
struct bio bio;
};
@@ -194,6 +218,27 @@ static void write_dirty(struct closure *cl)
{
struct dirty_io *io = container_of(cl, struct dirty_io, cl);
struct keybuf_key *w = io->bio.bi_private;
+ struct cached_dev *dc = io->dc;
+
+ uint16_t next_sequence;
+
+ if (atomic_read(&dc->writeback_sequence_next) != io->sequence) {
+ /* Not our turn to write; wait for a write to complete */
+ closure_wait(&dc->writeback_ordering_wait, cl);
+
+ if (atomic_read(&dc->writeback_sequence_next) == io->sequence) {
+ /*
+ * Edge case-- it happened in indeterminate order
+ * relative to when we were added to wait list..
+ */
+ closure_wake_up(&dc->writeback_ordering_wait);
+ }
+
+ continue_at(cl, write_dirty, io->dc->writeback_write_wq);
+ return;
+ }
+
+ next_sequence = io->sequence + 1;
/*
* IO errors are signalled using the dirty bit on the key.
@@ -211,6 +256,9 @@ static void write_dirty(struct closure *cl)
closure_bio_submit(&io->bio, cl);
}
+ atomic_set(&dc->writeback_sequence_next, next_sequence);
+ closure_wake_up(&dc->writeback_ordering_wait);
+
continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}
@@ -219,8 +267,10 @@ static void read_dirty_endio(struct bio *bio)
struct keybuf_key *w = bio->bi_private;
struct dirty_io *io = w->private;
+ /* is_read = 1 */
bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
- bio->bi_status, "reading dirty data from cache");
+ bio->bi_status, 1,
+ "reading dirty data from cache");
dirty_endio(bio);
}
@@ -237,10 +287,15 @@ static void read_dirty_submit(struct closure *cl)
static void read_dirty(struct cached_dev *dc)
{
unsigned delay = 0;
- struct keybuf_key *w;
+ struct keybuf_key *next, *keys[MAX_WRITEBACKS_IN_PASS], *w;
+ size_t size;
+ int nk, i;
struct dirty_io *io;
struct closure cl;
+ uint16_t sequence = 0;
+ BUG_ON(!llist_empty(&dc->writeback_ordering_wait.list));
+ atomic_set(&dc->writeback_sequence_next, sequence);
closure_init_stack(&cl);
/*
@@ -248,45 +303,109 @@ static void read_dirty(struct cached_dev *dc)
* mempools.
*/
- while (!kthread_should_stop()) {
-
- w = bch_keybuf_next(&dc->writeback_keys);
- if (!w)
- break;
-
- BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));
-
- if (KEY_START(&w->key) != dc->last_read ||
- jiffies_to_msecs(delay) > 50)
- while (!kthread_should_stop() && delay)
- delay = schedule_timeout_interruptible(delay);
-
- dc->last_read = KEY_OFFSET(&w->key);
-
- io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
- * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
- GFP_KERNEL);
- if (!io)
- goto err;
-
- w->private = io;
- io->dc = dc;
-
- dirty_init(w);
- bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
- io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
- bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
- io->bio.bi_end_io = read_dirty_endio;
-
- if (bio_alloc_pages(&io->bio, GFP_KERNEL))
- goto err_free;
+ next = bch_keybuf_next(&dc->writeback_keys);
+
+ while (!kthread_should_stop() && next) {
+ size = 0;
+ nk = 0;
+
+ do {
+ BUG_ON(ptr_stale(dc->disk.c, &next->key, 0));
+
+ /*
+ * Don't combine too many operations, even if they
+ * are all small.
+ */
+ if (nk >= MAX_WRITEBACKS_IN_PASS)
+ break;
+
+ /*
+ * If the current operation is very large, don't
+ * further combine operations.
+ */
+ if (size >= MAX_WRITESIZE_IN_PASS)
+ break;
+
+ /*
+ * Operations are only eligible to be combined
+ * if they are contiguous.
+ *
+ * TODO: add a heuristic willing to fire a
+ * certain amount of non-contiguous IO per pass,
+ * so that we can benefit from backing device
+ * command queueing.
+ */
+ if ((nk != 0) && bkey_cmp(&keys[nk-1]->key,
+ &START_KEY(&next->key)))
+ break;
+
+ size += KEY_SIZE(&next->key);
+ keys[nk++] = next;
+ } while ((next = bch_keybuf_next(&dc->writeback_keys)));
+
+ /* Now we have gathered a set of 1..5 keys to write back. */
+ for (i = 0; i < nk; i++) {
+ w = keys[i];
+
+ io = kzalloc(sizeof(struct dirty_io) +
+ sizeof(struct bio_vec) *
+ DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
+ GFP_KERNEL);
+ if (!io)
+ goto err;
+
+ w->private = io;
+ io->dc = dc;
+ io->sequence = sequence++;
+
+ dirty_init(w);
+ bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
+ io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
+ bio_set_dev(&io->bio,
+ PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
+ io->bio.bi_end_io = read_dirty_endio;
+
+ if (bch_bio_alloc_pages(&io->bio, GFP_KERNEL))
+ goto err_free;
+
+ trace_bcache_writeback(&w->key);
+
+ down(&dc->in_flight);
+
+ /* We've acquired a semaphore for the maximum
+ * simultaneous number of writebacks; from here
+ * everything happens asynchronously.
+ */
+ closure_call(&io->cl, read_dirty_submit, NULL, &cl);
+ }
- trace_bcache_writeback(&w->key);
+ delay = writeback_delay(dc, size);
- down(&dc->in_flight);
- closure_call(&io->cl, read_dirty_submit, NULL, &cl);
+ /* If the control system would wait for at least half a
+ * second, and there's been no reqs hitting the backing disk
+ * for awhile: use an alternate mode where we have at most
+ * one contiguous set of writebacks in flight at a time. If
+ * someone wants to do IO it will be quick, as it will only
+ * have to contend with one operation in flight, and we'll
+ * be round-tripping data to the backing disk as quickly as
+ * it can accept it.
+ */
+ if (delay >= HZ / 2) {
+ /* 3 means at least 1.5 seconds, up to 7.5 if we
+ * have slowed way down.
+ */
+ if (atomic_inc_return(&dc->backing_idle) >= 3) {
+ /* Wait for current I/Os to finish */
+ closure_sync(&cl);
+ /* And immediately launch a new set. */
+ delay = 0;
+ }
+ }
- delay = writeback_delay(dc, KEY_SIZE(&w->key));
+ while (!kthread_should_stop() && delay) {
+ schedule_timeout_interruptible(delay);
+ delay = writeback_delay(dc, 0);
+ }
}
if (0) {
diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h
index a9e3ffb4b03c..66f1c527fa24 100644
--- a/drivers/md/bcache/writeback.h
+++ b/drivers/md/bcache/writeback.h
@@ -5,6 +5,16 @@
#define CUTOFF_WRITEBACK 40
#define CUTOFF_WRITEBACK_SYNC 70
+#define MAX_WRITEBACKS_IN_PASS 5
+#define MAX_WRITESIZE_IN_PASS 5000 /* *512b */
+
+/*
+ * 14 (16384ths) is chosen here so that each backing device's share
+ * is a reasonable fraction of the whole, and so as not to blow up
+ * until individual backing devices are a petabyte.
+ */
+#define WRITEBACK_SHARE_SHIFT 14
+
static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d)
{
uint64_t i, ret = 0;
@@ -21,7 +31,7 @@ static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c)
mutex_lock(&bch_register_lock);
- for (i = 0; i < c->nr_uuids; i++) {
+ for (i = 0; i < c->devices_max_used; i++) {
struct bcache_device *d = c->devices[i];
if (!d || !UUID_FLASH_ONLY(&c->uuids[i]))
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 554d60394c06..2ad429100d25 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -1446,7 +1446,6 @@ static void crypt_free_buffer_pages(struct crypt_config *cc, struct bio *clone)
bio_for_each_segment_all(bv, clone, i) {
BUG_ON(!bv->bv_page);
mempool_free(bv->bv_page, cc->page_pool);
- bv->bv_page = NULL;
}
}
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index f7810cc869ac..ef57c6d1c887 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -1475,21 +1475,6 @@ static void activate_path_work(struct work_struct *work)
activate_or_offline_path(pgpath);
}
-static int noretry_error(blk_status_t error)
-{
- switch (error) {
- case BLK_STS_NOTSUPP:
- case BLK_STS_NOSPC:
- case BLK_STS_TARGET:
- case BLK_STS_NEXUS:
- case BLK_STS_MEDIUM:
- return 1;
- }
-
- /* Anything else could be a path failure, so should be retried */
- return 0;
-}
-
static int multipath_end_io(struct dm_target *ti, struct request *clone,
blk_status_t error, union map_info *map_context)
{
@@ -1508,7 +1493,7 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
* request into dm core, which will remake a clone request and
* clone bios for it and resubmit it later.
*/
- if (error && !noretry_error(error)) {
+ if (error && blk_path_error(error)) {
struct multipath *m = ti->private;
r = DM_ENDIO_REQUEUE;
@@ -1544,7 +1529,7 @@ static int multipath_end_io_bio(struct dm_target *ti, struct bio *clone,
unsigned long flags;
int r = DM_ENDIO_DONE;
- if (!*error || noretry_error(*error))
+ if (!*error || !blk_path_error(*error))
goto done;
if (pgpath)
diff --git a/drivers/md/dm-rq.c b/drivers/md/dm-rq.c
index 9d32f25489c2..b7d175e94a02 100644
--- a/drivers/md/dm-rq.c
+++ b/drivers/md/dm-rq.c
@@ -395,7 +395,7 @@ static void end_clone_request(struct request *clone, blk_status_t error)
dm_complete_request(tio->orig, error);
}
-static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
+static blk_status_t dm_dispatch_clone_request(struct request *clone, struct request *rq)
{
blk_status_t r;
@@ -404,9 +404,10 @@ static void dm_dispatch_clone_request(struct request *clone, struct request *rq)
clone->start_time = jiffies;
r = blk_insert_cloned_request(clone->q, clone);
- if (r)
+ if (r != BLK_STS_OK && r != BLK_STS_RESOURCE)
/* must complete clone in terms of original request */
dm_complete_request(rq, r);
+ return r;
}
static int dm_rq_bio_constructor(struct bio *bio, struct bio *bio_orig,
@@ -476,8 +477,10 @@ static int map_request(struct dm_rq_target_io *tio)
struct mapped_device *md = tio->md;
struct request *rq = tio->orig;
struct request *clone = NULL;
+ blk_status_t ret;
r = ti->type->clone_and_map_rq(ti, rq, &tio->info, &clone);
+check_again:
switch (r) {
case DM_MAPIO_SUBMITTED:
/* The target has taken the I/O to submit by itself later */
@@ -492,7 +495,17 @@ static int map_request(struct dm_rq_target_io *tio)
/* The target has remapped the I/O so dispatch it */
trace_block_rq_remap(clone->q, clone, disk_devt(dm_disk(md)),
blk_rq_pos(rq));
- dm_dispatch_clone_request(clone, rq);
+ ret = dm_dispatch_clone_request(clone, rq);
+ if (ret == BLK_STS_RESOURCE) {
+ blk_rq_unprep_clone(clone);
+ tio->ti->type->release_clone_rq(clone);
+ tio->clone = NULL;
+ if (!rq->q->mq_ops)
+ r = DM_MAPIO_DELAY_REQUEUE;
+ else
+ r = DM_MAPIO_REQUEUE;
+ goto check_again;
+ }
break;
case DM_MAPIO_REQUEUE:
/* The target wants to requeue the I/O */
@@ -713,8 +726,6 @@ int dm_old_init_request_queue(struct mapped_device *md, struct dm_table *t)
return error;
}
- elv_register_queue(md->queue);
-
return 0;
}
@@ -812,15 +823,8 @@ int dm_mq_init_request_queue(struct mapped_device *md, struct dm_table *t)
}
dm_init_md_queue(md);
- /* backfill 'mq' sysfs registration normally done in blk_register_queue */
- err = blk_mq_register_dev(disk_to_dev(md->disk), q);
- if (err)
- goto out_cleanup_queue;
-
return 0;
-out_cleanup_queue:
- blk_cleanup_queue(q);
out_tag_set:
blk_mq_free_tag_set(md->tag_set);
out_kfree_tag_set:
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index de17b7193299..8c26bfc35335 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -920,7 +920,15 @@ int dm_set_target_max_io_len(struct dm_target *ti, sector_t len)
return -EINVAL;
}
- ti->max_io_len = (uint32_t) len;
+ /*
+ * BIO based queue uses its own splitting. When multipage bvecs
+ * are switched on, the size of the incoming bio may be too big to
+ * be handled in some targets, such as crypt.
+ *
+ * When these targets are ready for the big bio, we can remove
+ * the limit.
+ */
+ ti->max_io_len = min_t(uint32_t, len, BIO_MAX_PAGES * PAGE_SIZE);
return 0;
}
@@ -1753,7 +1761,7 @@ static struct mapped_device *alloc_dev(int minor)
goto bad;
md->dax_dev = dax_dev;
- add_disk(md->disk);
+ add_disk_no_queue_reg(md->disk);
format_dev_t(md->name, MKDEV(_major, minor));
md->wq = alloc_workqueue("kdmflush", WQ_MEM_RECLAIM, 0);
@@ -2013,6 +2021,7 @@ EXPORT_SYMBOL_GPL(dm_get_queue_limits);
int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
{
int r;
+ struct queue_limits limits;
enum dm_queue_mode type = dm_get_md_type(md);
switch (type) {
@@ -2049,6 +2058,14 @@ int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t)
break;
}
+ r = dm_calculate_queue_limits(t, &limits);
+ if (r) {
+ DMERR("Cannot calculate initial queue limits");
+ return r;
+ }
+ dm_table_set_restrictions(t, md->queue, &limits);
+ blk_register_queue(md->disk);
+
return 0;
}
diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile
index a25fd43650ad..441e67e3a9d7 100644
--- a/drivers/nvme/host/Makefile
+++ b/drivers/nvme/host/Makefile
@@ -1,4 +1,7 @@
# SPDX-License-Identifier: GPL-2.0
+
+ccflags-y += -I$(src)
+
obj-$(CONFIG_NVME_CORE) += nvme-core.o
obj-$(CONFIG_BLK_DEV_NVME) += nvme.o
obj-$(CONFIG_NVME_FABRICS) += nvme-fabrics.o
@@ -6,6 +9,7 @@ obj-$(CONFIG_NVME_RDMA) += nvme-rdma.o
obj-$(CONFIG_NVME_FC) += nvme-fc.o
nvme-core-y := core.o
+nvme-core-$(CONFIG_TRACING) += trace.o
nvme-core-$(CONFIG_NVME_MULTIPATH) += multipath.o
nvme-core-$(CONFIG_NVM) += lightnvm.o
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 839650e0926a..e8104871cbbf 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -29,6 +29,9 @@
#include <linux/pm_qos.h>
#include <asm/unaligned.h>
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
#include "nvme.h"
#include "fabrics.h"
@@ -65,9 +68,26 @@ static bool streams;
module_param(streams, bool, 0644);
MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
+/*
+ * nvme_wq - hosts nvme related works that are not reset or delete
+ * nvme_reset_wq - hosts nvme reset works
+ * nvme_delete_wq - hosts nvme delete works
+ *
+ * nvme_wq will host works such as scan, aen handling, fw activation,
+ * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq
+ * runs reset works which also flush works hosted on nvme_wq for
+ * serialization purposes. nvme_delete_wq hosts controller deletion
+ * works which flush reset works for serialization.
+ */
struct workqueue_struct *nvme_wq;
EXPORT_SYMBOL_GPL(nvme_wq);
+struct workqueue_struct *nvme_reset_wq;
+EXPORT_SYMBOL_GPL(nvme_reset_wq);
+
+struct workqueue_struct *nvme_delete_wq;
+EXPORT_SYMBOL_GPL(nvme_delete_wq);
+
static DEFINE_IDA(nvme_subsystems_ida);
static LIST_HEAD(nvme_subsystems);
static DEFINE_MUTEX(nvme_subsystems_lock);
@@ -89,13 +109,13 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl)
{
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
return -EBUSY;
- if (!queue_work(nvme_wq, &ctrl->reset_work))
+ if (!queue_work(nvme_reset_wq, &ctrl->reset_work))
return -EBUSY;
return 0;
}
EXPORT_SYMBOL_GPL(nvme_reset_ctrl);
-static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
+int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
{
int ret;
@@ -104,6 +124,7 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl)
flush_work(&ctrl->reset_work);
return ret;
}
+EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync);
static void nvme_delete_ctrl_work(struct work_struct *work)
{
@@ -122,7 +143,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl)
{
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING))
return -EBUSY;
- if (!queue_work(nvme_wq, &ctrl->delete_work))
+ if (!queue_work(nvme_delete_wq, &ctrl->delete_work))
return -EBUSY;
return 0;
}
@@ -157,13 +178,20 @@ static blk_status_t nvme_error_status(struct request *req)
return BLK_STS_OK;
case NVME_SC_CAP_EXCEEDED:
return BLK_STS_NOSPC;
+ case NVME_SC_LBA_RANGE:
+ return BLK_STS_TARGET;
+ case NVME_SC_BAD_ATTRIBUTES:
case NVME_SC_ONCS_NOT_SUPPORTED:
+ case NVME_SC_INVALID_OPCODE:
+ case NVME_SC_INVALID_FIELD:
+ case NVME_SC_INVALID_NS:
return BLK_STS_NOTSUPP;
case NVME_SC_WRITE_FAULT:
case NVME_SC_READ_ERROR:
case NVME_SC_UNWRITTEN_BLOCK:
case NVME_SC_ACCESS_DENIED:
case NVME_SC_READ_ONLY:
+ case NVME_SC_COMPARE_FAILED:
return BLK_STS_MEDIUM;
case NVME_SC_GUARD_CHECK:
case NVME_SC_APPTAG_CHECK:
@@ -190,8 +218,12 @@ static inline bool nvme_req_needs_retry(struct request *req)
void nvme_complete_rq(struct request *req)
{
- if (unlikely(nvme_req(req)->status && nvme_req_needs_retry(req))) {
- if (nvme_req_needs_failover(req)) {
+ blk_status_t status = nvme_error_status(req);
+
+ trace_nvme_complete_rq(req);
+
+ if (unlikely(status != BLK_STS_OK && nvme_req_needs_retry(req))) {
+ if (nvme_req_needs_failover(req, status)) {
nvme_failover_req(req);
return;
}
@@ -202,8 +234,7 @@ void nvme_complete_rq(struct request *req)
return;
}
}
-
- blk_mq_end_request(req, nvme_error_status(req));
+ blk_mq_end_request(req, status);
}
EXPORT_SYMBOL_GPL(nvme_complete_rq);
@@ -232,6 +263,15 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
old_state = ctrl->state;
switch (new_state) {
+ case NVME_CTRL_ADMIN_ONLY:
+ switch (old_state) {
+ case NVME_CTRL_RECONNECTING:
+ changed = true;
+ /* FALLTHRU */
+ default:
+ break;
+ }
+ break;
case NVME_CTRL_LIVE:
switch (old_state) {
case NVME_CTRL_NEW:
@@ -247,6 +287,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
switch (old_state) {
case NVME_CTRL_NEW:
case NVME_CTRL_LIVE:
+ case NVME_CTRL_ADMIN_ONLY:
changed = true;
/* FALLTHRU */
default:
@@ -266,6 +307,7 @@ bool nvme_change_ctrl_state(struct nvme_ctrl *ctrl,
case NVME_CTRL_DELETING:
switch (old_state) {
case NVME_CTRL_LIVE:
+ case NVME_CTRL_ADMIN_ONLY:
case NVME_CTRL_RESETTING:
case NVME_CTRL_RECONNECTING:
changed = true;
@@ -591,6 +633,10 @@ blk_status_t nvme_setup_cmd(struct nvme_ns *ns, struct request *req,
}
cmd->common.command_id = req->tag;
+ if (ns)
+ trace_nvme_setup_nvm_cmd(req->q->id, cmd);
+ else
+ trace_nvme_setup_admin_cmd(cmd);
return ret;
}
EXPORT_SYMBOL_GPL(nvme_setup_cmd);
@@ -1217,16 +1263,27 @@ static int nvme_open(struct block_device *bdev, fmode_t mode)
#ifdef CONFIG_NVME_MULTIPATH
/* should never be called due to GENHD_FL_HIDDEN */
if (WARN_ON_ONCE(ns->head->disk))
- return -ENXIO;
+ goto fail;
#endif
if (!kref_get_unless_zero(&ns->kref))
- return -ENXIO;
+ goto fail;
+ if (!try_module_get(ns->ctrl->ops->module))
+ goto fail_put_ns;
+
return 0;
+
+fail_put_ns:
+ nvme_put_ns(ns);
+fail:
+ return -ENXIO;
}
static void nvme_release(struct gendisk *disk, fmode_t mode)
{
- nvme_put_ns(disk->private_data);
+ struct nvme_ns *ns = disk->private_data;
+
+ module_put(ns->ctrl->ops->module);
+ nvme_put_ns(ns);
}
static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo)
@@ -2052,6 +2109,22 @@ static const struct attribute_group *nvme_subsys_attrs_groups[] = {
NULL,
};
+static int nvme_active_ctrls(struct nvme_subsystem *subsys)
+{
+ int count = 0;
+ struct nvme_ctrl *ctrl;
+
+ mutex_lock(&subsys->lock);
+ list_for_each_entry(ctrl, &subsys->ctrls, subsys_entry) {
+ if (ctrl->state != NVME_CTRL_DELETING &&
+ ctrl->state != NVME_CTRL_DEAD)
+ count++;
+ }
+ mutex_unlock(&subsys->lock);
+
+ return count;
+}
+
static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
{
struct nvme_subsystem *subsys, *found;
@@ -2090,7 +2163,7 @@ static int nvme_init_subsystem(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
* Verify that the subsystem actually supports multiple
* controllers, else bail out.
*/
- if (!(id->cmic & (1 << 1))) {
+ if (nvme_active_ctrls(found) && !(id->cmic & (1 << 1))) {
dev_err(ctrl->device,
"ignoring ctrl due to duplicate subnqn (%s).\n",
found->subnqn);
@@ -2257,7 +2330,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
shutdown_timeout, 60);
if (ctrl->shutdown_timeout != shutdown_timeout)
- dev_warn(ctrl->device,
+ dev_info(ctrl->device,
"Shutdown timeout set to %u seconds\n",
ctrl->shutdown_timeout);
} else
@@ -2341,8 +2414,14 @@ static int nvme_dev_open(struct inode *inode, struct file *file)
struct nvme_ctrl *ctrl =
container_of(inode->i_cdev, struct nvme_ctrl, cdev);
- if (ctrl->state != NVME_CTRL_LIVE)
+ switch (ctrl->state) {
+ case NVME_CTRL_LIVE:
+ case NVME_CTRL_ADMIN_ONLY:
+ break;
+ default:
return -EWOULDBLOCK;
+ }
+
file->private_data = ctrl;
return 0;
}
@@ -2606,6 +2685,7 @@ static ssize_t nvme_sysfs_show_state(struct device *dev,
static const char *const state_name[] = {
[NVME_CTRL_NEW] = "new",
[NVME_CTRL_LIVE] = "live",
+ [NVME_CTRL_ADMIN_ONLY] = "only-admin",
[NVME_CTRL_RESETTING] = "resetting",
[NVME_CTRL_RECONNECTING]= "reconnecting",
[NVME_CTRL_DELETING] = "deleting",
@@ -3079,6 +3159,8 @@ static void nvme_scan_work(struct work_struct *work)
if (ctrl->state != NVME_CTRL_LIVE)
return;
+ WARN_ON_ONCE(!ctrl->tagset);
+
if (nvme_identify_ctrl(ctrl, &id))
return;
@@ -3099,8 +3181,7 @@ static void nvme_scan_work(struct work_struct *work)
void nvme_queue_scan(struct nvme_ctrl *ctrl)
{
/*
- * Do not queue new scan work when a controller is reset during
- * removal.
+ * Only queue new scan work when admin and IO queues are both alive
*/
if (ctrl->state == NVME_CTRL_LIVE)
queue_work(nvme_wq, &ctrl->scan_work);
@@ -3477,16 +3558,26 @@ EXPORT_SYMBOL_GPL(nvme_reinit_tagset);
int __init nvme_core_init(void)
{
- int result;
+ int result = -ENOMEM;
nvme_wq = alloc_workqueue("nvme-wq",
WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
if (!nvme_wq)
- return -ENOMEM;
+ goto out;
+
+ nvme_reset_wq = alloc_workqueue("nvme-reset-wq",
+ WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ if (!nvme_reset_wq)
+ goto destroy_wq;
+
+ nvme_delete_wq = alloc_workqueue("nvme-delete-wq",
+ WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0);
+ if (!nvme_delete_wq)
+ goto destroy_reset_wq;
result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme");
if (result < 0)
- goto destroy_wq;
+ goto destroy_delete_wq;
nvme_class = class_create(THIS_MODULE, "nvme");
if (IS_ERR(nvme_class)) {
@@ -3505,8 +3596,13 @@ destroy_class:
class_destroy(nvme_class);
unregister_chrdev:
unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
+destroy_delete_wq:
+ destroy_workqueue(nvme_delete_wq);
+destroy_reset_wq:
+ destroy_workqueue(nvme_reset_wq);
destroy_wq:
destroy_workqueue(nvme_wq);
+out:
return result;
}
@@ -3516,6 +3612,8 @@ void nvme_core_exit(void)
class_destroy(nvme_subsys_class);
class_destroy(nvme_class);
unregister_chrdev_region(nvme_chr_devt, NVME_MINORS);
+ destroy_workqueue(nvme_delete_wq);
+ destroy_workqueue(nvme_reset_wq);
destroy_workqueue(nvme_wq);
}
diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c
index 894c2ccb3891..5dd4ceefed8f 100644
--- a/drivers/nvme/host/fabrics.c
+++ b/drivers/nvme/host/fabrics.c
@@ -493,7 +493,7 @@ EXPORT_SYMBOL_GPL(nvmf_should_reconnect);
*/
int nvmf_register_transport(struct nvmf_transport_ops *ops)
{
- if (!ops->create_ctrl)
+ if (!ops->create_ctrl || !ops->module)
return -EINVAL;
down_write(&nvmf_transports_rwsem);
@@ -739,11 +739,14 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts,
ret = -ENOMEM;
goto out;
}
- if (uuid_parse(p, &hostid)) {
+ ret = uuid_parse(p, &hostid);
+ if (ret) {
pr_err("Invalid hostid %s\n", p);
ret = -EINVAL;
+ kfree(p);
goto out;
}
+ kfree(p);
break;
case NVMF_OPT_DUP_CONNECT:
opts->duplicate_connect = true;
@@ -869,32 +872,41 @@ nvmf_create_ctrl(struct device *dev, const char *buf, size_t count)
goto out_unlock;
}
+ if (!try_module_get(ops->module)) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
ret = nvmf_check_required_opts(opts, ops->required_opts);
if (ret)
- goto out_unlock;
+ goto out_module_put;
ret = nvmf_check_allowed_opts(opts, NVMF_ALLOWED_OPTS |
ops->allowed_opts | ops->required_opts);
if (ret)
- goto out_unlock;
+ goto out_module_put;
ctrl = ops->create_ctrl(dev, opts);
if (IS_ERR(ctrl)) {
ret = PTR_ERR(ctrl);
- goto out_unlock;
+ goto out_module_put;
}
if (strcmp(ctrl->subsys->subnqn, opts->subsysnqn)) {
dev_warn(ctrl->device,
"controller returned incorrect NQN: \"%s\".\n",
ctrl->subsys->subnqn);
+ module_put(ops->module);
up_read(&nvmf_transports_rwsem);
nvme_delete_ctrl_sync(ctrl);
return ERR_PTR(-EINVAL);
}
+ module_put(ops->module);
up_read(&nvmf_transports_rwsem);
return ctrl;
+out_module_put:
+ module_put(ops->module);
out_unlock:
up_read(&nvmf_transports_rwsem);
out_free_opts:
diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h
index 9ba614953607..25b19f722f5b 100644
--- a/drivers/nvme/host/fabrics.h
+++ b/drivers/nvme/host/fabrics.h
@@ -108,6 +108,7 @@ struct nvmf_ctrl_options {
* fabric implementation of NVMe fabrics.
* @entry: Used by the fabrics library to add the new
* registration entry to its linked-list internal tree.
+ * @module: Transport module reference
* @name: Name of the NVMe fabric driver implementation.
* @required_opts: sysfs command-line options that must be specified
* when adding a new NVMe controller.
@@ -126,6 +127,7 @@ struct nvmf_ctrl_options {
*/
struct nvmf_transport_ops {
struct list_head entry;
+ struct module *module;
const char *name;
int required_opts;
int allowed_opts;
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index 794e66e4aa20..99bf51c7e513 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2921,6 +2921,9 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl)
__nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0);
nvme_fc_free_queue(&ctrl->queues[0]);
+ /* re-enable the admin_q so anything new can fast fail */
+ blk_mq_unquiesce_queue(ctrl->ctrl.admin_q);
+
nvme_fc_ctlr_inactive_on_rport(ctrl);
}
@@ -2935,6 +2938,9 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl)
* waiting for io to terminate
*/
nvme_fc_delete_association(ctrl);
+
+ /* resume the io queues so that things will fast fail */
+ nvme_start_queues(nctrl);
}
static void
@@ -3380,6 +3386,7 @@ nvme_fc_create_ctrl(struct device *dev, struct nvmf_ctrl_options *opts)
static struct nvmf_transport_ops nvme_fc_transport = {
.name = "fc",
+ .module = THIS_MODULE,
.required_opts = NVMF_OPT_TRADDR | NVMF_OPT_HOST_TRADDR,
.allowed_opts = NVMF_OPT_RECONNECT_DELAY | NVMF_OPT_CTRL_LOSS_TMO,
.create_ctrl = nvme_fc_create_ctrl,
diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c
index ba3d7f3349e5..50ef71ee3d86 100644
--- a/drivers/nvme/host/lightnvm.c
+++ b/drivers/nvme/host/lightnvm.c
@@ -31,27 +31,10 @@
enum nvme_nvm_admin_opcode {
nvme_nvm_admin_identity = 0xe2,
- nvme_nvm_admin_get_l2p_tbl = 0xea,
nvme_nvm_admin_get_bb_tbl = 0xf2,
nvme_nvm_admin_set_bb_tbl = 0xf1,
};
-struct nvme_nvm_hb_rw {
- __u8 opcode;
- __u8 flags;
- __u16 command_id;
- __le32 nsid;
- __u64 rsvd2;
- __le64 metadata;
- __le64 prp1;
- __le64 prp2;
- __le64 spba;
- __le16 length;
- __le16 control;
- __le32 dsmgmt;
- __le64 slba;
-};
-
struct nvme_nvm_ph_rw {
__u8 opcode;
__u8 flags;
@@ -80,19 +63,6 @@ struct nvme_nvm_identity {
__u32 rsvd11[5];
};
-struct nvme_nvm_l2ptbl {
- __u8 opcode;
- __u8 flags;
- __u16 command_id;
- __le32 nsid;
- __le32 cdw2[4];
- __le64 prp1;
- __le64 prp2;
- __le64 slba;
- __le32 nlb;
- __le16 cdw14[6];
-};
-
struct nvme_nvm_getbbtbl {
__u8 opcode;
__u8 flags;
@@ -139,9 +109,7 @@ struct nvme_nvm_command {
union {
struct nvme_common_command common;
struct nvme_nvm_identity identity;
- struct nvme_nvm_hb_rw hb_rw;
struct nvme_nvm_ph_rw ph_rw;
- struct nvme_nvm_l2ptbl l2p;
struct nvme_nvm_getbbtbl get_bb;
struct nvme_nvm_setbbtbl set_bb;
struct nvme_nvm_erase_blk erase;
@@ -167,7 +135,7 @@ struct nvme_nvm_id_group {
__u8 num_lun;
__u8 num_pln;
__u8 rsvd1;
- __le16 num_blk;
+ __le16 num_chk;
__le16 num_pg;
__le16 fpg_sz;
__le16 csecs;
@@ -234,11 +202,9 @@ struct nvme_nvm_bb_tbl {
static inline void _nvme_nvm_check_size(void)
{
BUILD_BUG_ON(sizeof(struct nvme_nvm_identity) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_nvm_hb_rw) != 64);
BUILD_BUG_ON(sizeof(struct nvme_nvm_ph_rw) != 64);
BUILD_BUG_ON(sizeof(struct nvme_nvm_getbbtbl) != 64);
BUILD_BUG_ON(sizeof(struct nvme_nvm_setbbtbl) != 64);
- BUILD_BUG_ON(sizeof(struct nvme_nvm_l2ptbl) != 64);
BUILD_BUG_ON(sizeof(struct nvme_nvm_erase_blk) != 64);
BUILD_BUG_ON(sizeof(struct nvme_nvm_id_group) != 960);
BUILD_BUG_ON(sizeof(struct nvme_nvm_addr_format) != 16);
@@ -249,51 +215,58 @@ static inline void _nvme_nvm_check_size(void)
static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id)
{
struct nvme_nvm_id_group *src;
- struct nvm_id_group *dst;
+ struct nvm_id_group *grp;
+ int sec_per_pg, sec_per_pl, pg_per_blk;
if (nvme_nvm_id->cgrps != 1)
return -EINVAL;
src = &nvme_nvm_id->groups[0];
- dst = &nvm_id->grp;
-
- dst->mtype = src->mtype;
- dst->fmtype = src->fmtype;
- dst->num_ch = src->num_ch;
- dst->num_lun = src->num_lun;
- dst->num_pln = src->num_pln;
-
- dst->num_pg = le16_to_cpu(src->num_pg);
- dst->num_blk = le16_to_cpu(src->num_blk);
- dst->fpg_sz = le16_to_cpu(src->fpg_sz);
- dst->csecs = le16_to_cpu(src->csecs);
- dst->sos = le16_to_cpu(src->sos);
-
- dst->trdt = le32_to_cpu(src->trdt);
- dst->trdm = le32_to_cpu(src->trdm);
- dst->tprt = le32_to_cpu(src->tprt);
- dst->tprm = le32_to_cpu(src->tprm);
- dst->tbet = le32_to_cpu(src->tbet);
- dst->tbem = le32_to_cpu(src->tbem);
- dst->mpos = le32_to_cpu(src->mpos);
- dst->mccap = le32_to_cpu(src->mccap);
-
- dst->cpar = le16_to_cpu(src->cpar);
-
- if (dst->fmtype == NVM_ID_FMTYPE_MLC) {
- memcpy(dst->lptbl.id, src->lptbl.id, 8);
- dst->lptbl.mlc.num_pairs =
- le16_to_cpu(src->lptbl.mlc.num_pairs);
-
- if (dst->lptbl.mlc.num_pairs > NVME_NVM_LP_MLC_PAIRS) {
- pr_err("nvm: number of MLC pairs not supported\n");
- return -EINVAL;
- }
+ grp = &nvm_id->grp;
+
+ grp->mtype = src->mtype;
+ grp->fmtype = src->fmtype;
+
+ grp->num_ch = src->num_ch;
+ grp->num_lun = src->num_lun;
+
+ grp->num_chk = le16_to_cpu(src->num_chk);
+ grp->csecs = le16_to_cpu(src->csecs);
+ grp->sos = le16_to_cpu(src->sos);
+
+ pg_per_blk = le16_to_cpu(src->num_pg);
+ sec_per_pg = le16_to_cpu(src->fpg_sz) / grp->csecs;
+ sec_per_pl = sec_per_pg * src->num_pln;
+ grp->clba = sec_per_pl * pg_per_blk;
+ grp->ws_per_chk = pg_per_blk;
- memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs,
- dst->lptbl.mlc.num_pairs);
+ grp->mpos = le32_to_cpu(src->mpos);
+ grp->cpar = le16_to_cpu(src->cpar);
+ grp->mccap = le32_to_cpu(src->mccap);
+
+ grp->ws_opt = grp->ws_min = sec_per_pg;
+ grp->ws_seq = NVM_IO_SNGL_ACCESS;
+
+ if (grp->mpos & 0x020202) {
+ grp->ws_seq = NVM_IO_DUAL_ACCESS;
+ grp->ws_opt <<= 1;
+ } else if (grp->mpos & 0x040404) {
+ grp->ws_seq = NVM_IO_QUAD_ACCESS;
+ grp->ws_opt <<= 2;
}
+ grp->trdt = le32_to_cpu(src->trdt);
+ grp->trdm = le32_to_cpu(src->trdm);
+ grp->tprt = le32_to_cpu(src->tprt);
+ grp->tprm = le32_to_cpu(src->tprm);
+ grp->tbet = le32_to_cpu(src->tbet);
+ grp->tbem = le32_to_cpu(src->tbem);
+
+ /* 1.2 compatibility */
+ grp->num_pln = src->num_pln;
+ grp->num_pg = le16_to_cpu(src->num_pg);
+ grp->fpg_sz = le16_to_cpu(src->fpg_sz);
+
return 0;
}
@@ -332,62 +305,6 @@ out:
return ret;
}
-static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb,
- nvm_l2p_update_fn *update_l2p, void *priv)
-{
- struct nvme_ns *ns = nvmdev->q->queuedata;
- struct nvme_nvm_command c = {};
- u32 len = queue_max_hw_sectors(ns->ctrl->admin_q) << 9;
- u32 nlb_pr_rq = len / sizeof(u64);
- u64 cmd_slba = slba;
- void *entries;
- int ret = 0;
-
- c.l2p.opcode = nvme_nvm_admin_get_l2p_tbl;
- c.l2p.nsid = cpu_to_le32(ns->head->ns_id);
- entries = kmalloc(len, GFP_KERNEL);
- if (!entries)
- return -ENOMEM;
-
- while (nlb) {
- u32 cmd_nlb = min(nlb_pr_rq, nlb);
- u64 elba = slba + cmd_nlb;
-
- c.l2p.slba = cpu_to_le64(cmd_slba);
- c.l2p.nlb = cpu_to_le32(cmd_nlb);
-
- ret = nvme_submit_sync_cmd(ns->ctrl->admin_q,
- (struct nvme_command *)&c, entries, len);
- if (ret) {
- dev_err(ns->ctrl->device,
- "L2P table transfer failed (%d)\n", ret);
- ret = -EIO;
- goto out;
- }
-
- if (unlikely(elba > nvmdev->total_secs)) {
- pr_err("nvm: L2P data from device is out of bounds!\n");
- ret = -EINVAL;
- goto out;
- }
-
- /* Transform physical address to target address space */
- nvm_part_to_tgt(nvmdev, entries, cmd_nlb);
-
- if (update_l2p(cmd_slba, cmd_nlb, entries, priv)) {
- ret = -EINTR;
- goto out;
- }
-
- cmd_slba += cmd_nlb;
- nlb -= cmd_nlb;
- }
-
-out:
- kfree(entries);
- return ret;
-}
-
static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
u8 *blks)
{
@@ -397,7 +314,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
struct nvme_ctrl *ctrl = ns->ctrl;
struct nvme_nvm_command c = {};
struct nvme_nvm_bb_tbl *bb_tbl;
- int nr_blks = geo->blks_per_lun * geo->plane_mode;
+ int nr_blks = geo->nr_chks * geo->plane_mode;
int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blks;
int ret = 0;
@@ -438,7 +355,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa,
goto out;
}
- memcpy(blks, bb_tbl->blk, geo->blks_per_lun * geo->plane_mode);
+ memcpy(blks, bb_tbl->blk, geo->nr_chks * geo->plane_mode);
out:
kfree(bb_tbl);
return ret;
@@ -474,10 +391,6 @@ static inline void nvme_nvm_rqtocmd(struct nvm_rq *rqd, struct nvme_ns *ns,
c->ph_rw.metadata = cpu_to_le64(rqd->dma_meta_list);
c->ph_rw.control = cpu_to_le16(rqd->flags);
c->ph_rw.length = cpu_to_le16(rqd->nr_ppas - 1);
-
- if (rqd->opcode == NVM_OP_HBWRITE || rqd->opcode == NVM_OP_HBREAD)
- c->hb_rw.slba = cpu_to_le64(nvme_block_nr(ns,
- rqd->bio->bi_iter.bi_sector));
}
static void nvme_nvm_end_io(struct request *rq, blk_status_t status)
@@ -597,8 +510,6 @@ static void nvme_nvm_dev_dma_free(void *pool, void *addr,
static struct nvm_dev_ops nvme_nvm_dev_ops = {
.identity = nvme_nvm_identity,
- .get_l2p_tbl = nvme_nvm_get_l2p_tbl,
-
.get_bb_tbl = nvme_nvm_get_bb_tbl,
.set_bb_tbl = nvme_nvm_set_bb_tbl,
@@ -883,7 +794,7 @@ static ssize_t nvm_dev_attr_show(struct device *dev,
} else if (strcmp(attr->name, "num_planes") == 0) {
return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pln);
} else if (strcmp(attr->name, "num_blocks") == 0) { /* u16 */
- return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_blk);
+ return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_chk);
} else if (strcmp(attr->name, "num_pages") == 0) {
return scnprintf(page, PAGE_SIZE, "%u\n", grp->num_pg);
} else if (strcmp(attr->name, "page_size") == 0) {
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index 1218a9fca846..3b211d9e58b8 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -33,51 +33,11 @@ void nvme_failover_req(struct request *req)
kblockd_schedule_work(&ns->head->requeue_work);
}
-bool nvme_req_needs_failover(struct request *req)
+bool nvme_req_needs_failover(struct request *req, blk_status_t error)
{
if (!(req->cmd_flags & REQ_NVME_MPATH))
return false;
-
- switch (nvme_req(req)->status & 0x7ff) {
- /*
- * Generic command status:
- */
- case NVME_SC_INVALID_OPCODE:
- case NVME_SC_INVALID_FIELD:
- case NVME_SC_INVALID_NS:
- case NVME_SC_LBA_RANGE:
- case NVME_SC_CAP_EXCEEDED:
- case NVME_SC_RESERVATION_CONFLICT:
- return false;
-
- /*
- * I/O command set specific error. Unfortunately these values are
- * reused for fabrics commands, but those should never get here.
- */
- case NVME_SC_BAD_ATTRIBUTES:
- case NVME_SC_INVALID_PI:
- case NVME_SC_READ_ONLY:
- case NVME_SC_ONCS_NOT_SUPPORTED:
- WARN_ON_ONCE(nvme_req(req)->cmd->common.opcode ==
- nvme_fabrics_command);
- return false;
-
- /*
- * Media and Data Integrity Errors:
- */
- case NVME_SC_WRITE_FAULT:
- case NVME_SC_READ_ERROR:
- case NVME_SC_GUARD_CHECK:
- case NVME_SC_APPTAG_CHECK:
- case NVME_SC_REFTAG_CHECK:
- case NVME_SC_COMPARE_FAILED:
- case NVME_SC_ACCESS_DENIED:
- case NVME_SC_UNWRITTEN_BLOCK:
- return false;
- }
-
- /* Everything else could be a path failure, so should be retried */
- return true;
+ return blk_path_error(error);
}
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl)
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index a00eabd06427..8e4550fa08f8 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -32,6 +32,8 @@ extern unsigned int admin_timeout;
#define NVME_KATO_GRACE 10
extern struct workqueue_struct *nvme_wq;
+extern struct workqueue_struct *nvme_reset_wq;
+extern struct workqueue_struct *nvme_delete_wq;
enum {
NVME_NS_LBA = 0,
@@ -119,6 +121,7 @@ static inline struct nvme_request *nvme_req(struct request *req)
enum nvme_ctrl_state {
NVME_CTRL_NEW,
NVME_CTRL_LIVE,
+ NVME_CTRL_ADMIN_ONLY, /* Only admin queue live */
NVME_CTRL_RESETTING,
NVME_CTRL_RECONNECTING,
NVME_CTRL_DELETING,
@@ -393,6 +396,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count);
void nvme_start_keep_alive(struct nvme_ctrl *ctrl);
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl);
int nvme_reset_ctrl(struct nvme_ctrl *ctrl);
+int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl(struct nvme_ctrl *ctrl);
int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl);
@@ -401,7 +405,7 @@ extern const struct block_device_operations nvme_ns_head_ops;
#ifdef CONFIG_NVME_MULTIPATH
void nvme_failover_req(struct request *req);
-bool nvme_req_needs_failover(struct request *req);
+bool nvme_req_needs_failover(struct request *req, blk_status_t error);
void nvme_kick_requeue_lists(struct nvme_ctrl *ctrl);
int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl,struct nvme_ns_head *head);
void nvme_mpath_add_disk(struct nvme_ns_head *head);
@@ -430,7 +434,8 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
static inline void nvme_failover_req(struct request *req)
{
}
-static inline bool nvme_req_needs_failover(struct request *req)
+static inline bool nvme_req_needs_failover(struct request *req,
+ blk_status_t error)
{
return false;
}
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 4276ebfff22b..6fe7af00a1f4 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -75,7 +75,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
* Represents an NVM Express device. Each nvme_dev is a PCI function.
*/
struct nvme_dev {
- struct nvme_queue **queues;
+ struct nvme_queue *queues;
struct blk_mq_tag_set tagset;
struct blk_mq_tag_set admin_tagset;
u32 __iomem *dbs;
@@ -365,7 +365,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_dev *dev = data;
- struct nvme_queue *nvmeq = dev->queues[0];
+ struct nvme_queue *nvmeq = &dev->queues[0];
WARN_ON(hctx_idx != 0);
WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
@@ -387,7 +387,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_dev *dev = data;
- struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1];
+ struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
if (!nvmeq->tags)
nvmeq->tags = &dev->tagset.tags[hctx_idx];
@@ -403,7 +403,7 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req,
struct nvme_dev *dev = set->driver_data;
struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0;
- struct nvme_queue *nvmeq = dev->queues[queue_idx];
+ struct nvme_queue *nvmeq = &dev->queues[queue_idx];
BUG_ON(!nvmeq);
iod->nvmeq = nvmeq;
@@ -1044,7 +1044,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag)
static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl)
{
struct nvme_dev *dev = to_nvme_dev(ctrl);
- struct nvme_queue *nvmeq = dev->queues[0];
+ struct nvme_queue *nvmeq = &dev->queues[0];
struct nvme_command c;
memset(&c, 0, sizeof(c));
@@ -1138,9 +1138,14 @@ static bool nvme_should_reset(struct nvme_dev *dev, u32 csts)
*/
bool nssro = dev->subsystem && (csts & NVME_CSTS_NSSRO);
- /* If there is a reset ongoing, we shouldn't reset again. */
- if (dev->ctrl.state == NVME_CTRL_RESETTING)
+ /* If there is a reset/reinit ongoing, we shouldn't reset again. */
+ switch (dev->ctrl.state) {
+ case NVME_CTRL_RESETTING:
+ case NVME_CTRL_RECONNECTING:
return false;
+ default:
+ break;
+ }
/* We shouldn't reset unless the controller is on fatal error state
* _or_ if we lost the communication with it.
@@ -1280,7 +1285,6 @@ static void nvme_free_queue(struct nvme_queue *nvmeq)
if (nvmeq->sq_cmds)
dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth),
nvmeq->sq_cmds, nvmeq->sq_dma_addr);
- kfree(nvmeq);
}
static void nvme_free_queues(struct nvme_dev *dev, int lowest)
@@ -1288,10 +1292,8 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest)
int i;
for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) {
- struct nvme_queue *nvmeq = dev->queues[i];
dev->ctrl.queue_count--;
- dev->queues[i] = NULL;
- nvme_free_queue(nvmeq);
+ nvme_free_queue(&dev->queues[i]);
}
}
@@ -1323,12 +1325,7 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq)
static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
{
- struct nvme_queue *nvmeq = dev->queues[0];
-
- if (!nvmeq)
- return;
- if (nvme_suspend_queue(nvmeq))
- return;
+ struct nvme_queue *nvmeq = &dev->queues[0];
if (shutdown)
nvme_shutdown_ctrl(&dev->ctrl);
@@ -1367,7 +1364,7 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
int qid, int depth)
{
- if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) {
+ if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) {
unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth),
dev->ctrl.page_size);
nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset;
@@ -1382,13 +1379,13 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq,
return 0;
}
-static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
- int depth, int node)
+static int nvme_alloc_queue(struct nvme_dev *dev, int qid,
+ int depth, int node)
{
- struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL,
- node);
- if (!nvmeq)
- return NULL;
+ struct nvme_queue *nvmeq = &dev->queues[qid];
+
+ if (dev->ctrl.queue_count > qid)
+ return 0;
nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth),
&nvmeq->cq_dma_addr, GFP_KERNEL);
@@ -1407,17 +1404,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid,
nvmeq->q_depth = depth;
nvmeq->qid = qid;
nvmeq->cq_vector = -1;
- dev->queues[qid] = nvmeq;
dev->ctrl.queue_count++;
- return nvmeq;
+ return 0;
free_cqdma:
dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes,
nvmeq->cq_dma_addr);
free_nvmeq:
- kfree(nvmeq);
- return NULL;
+ return -ENOMEM;
}
static int queue_request_irq(struct nvme_queue *nvmeq)
@@ -1590,14 +1585,12 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev)
if (result < 0)
return result;
- nvmeq = dev->queues[0];
- if (!nvmeq) {
- nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
- dev_to_node(dev->dev));
- if (!nvmeq)
- return -ENOMEM;
- }
+ result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH,
+ dev_to_node(dev->dev));
+ if (result)
+ return result;
+ nvmeq = &dev->queues[0];
aqa = nvmeq->q_depth - 1;
aqa |= aqa << 16;
@@ -1627,7 +1620,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) {
/* vector == qid - 1, match nvme_create_queue */
- if (!nvme_alloc_queue(dev, i, dev->q_depth,
+ if (nvme_alloc_queue(dev, i, dev->q_depth,
pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) {
ret = -ENOMEM;
break;
@@ -1636,15 +1629,15 @@ static int nvme_create_io_queues(struct nvme_dev *dev)
max = min(dev->max_qid, dev->ctrl.queue_count - 1);
for (i = dev->online_queues; i <= max; i++) {
- ret = nvme_create_queue(dev->queues[i], i);
+ ret = nvme_create_queue(&dev->queues[i], i);
if (ret)
break;
}
/*
* Ignore failing Create SQ/CQ commands, we can continue with less
- * than the desired aount of queues, and even a controller without
- * I/O queues an still be used to issue admin commands. This might
+ * than the desired amount of queues, and even a controller without
+ * I/O queues can still be used to issue admin commands. This might
* be useful to upgrade a buggy firmware for example.
*/
return ret >= 0 ? 0 : ret;
@@ -1661,30 +1654,40 @@ static ssize_t nvme_cmb_show(struct device *dev,
}
static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL);
-static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
+static u64 nvme_cmb_size_unit(struct nvme_dev *dev)
{
- u64 szu, size, offset;
+ u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK;
+
+ return 1ULL << (12 + 4 * szu);
+}
+
+static u32 nvme_cmb_size(struct nvme_dev *dev)
+{
+ return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK;
+}
+
+static void nvme_map_cmb(struct nvme_dev *dev)
+{
+ u64 size, offset;
resource_size_t bar_size;
struct pci_dev *pdev = to_pci_dev(dev->dev);
- void __iomem *cmb;
int bar;
dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ);
- if (!(NVME_CMB_SZ(dev->cmbsz)))
- return NULL;
+ if (!dev->cmbsz)
+ return;
dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC);
if (!use_cmb_sqes)
- return NULL;
+ return;
- szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz));
- size = szu * NVME_CMB_SZ(dev->cmbsz);
- offset = szu * NVME_CMB_OFST(dev->cmbloc);
+ size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev);
+ offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc);
bar = NVME_CMB_BIR(dev->cmbloc);
bar_size = pci_resource_len(pdev, bar);
if (offset > bar_size)
- return NULL;
+ return;
/*
* Controllers may support a CMB size larger than their BAR,
@@ -1694,13 +1697,16 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev)
if (size > bar_size - offset)
size = bar_size - offset;
- cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
- if (!cmb)
- return NULL;
-
+ dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size);
+ if (!dev->cmb)
+ return;
dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset;
dev->cmb_size = size;
- return cmb;
+
+ if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
+ &dev_attr_cmb.attr, NULL))
+ dev_warn(dev->ctrl.device,
+ "failed to add sysfs attribute for CMB\n");
}
static inline void nvme_release_cmb(struct nvme_dev *dev)
@@ -1768,7 +1774,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
dma_addr_t descs_dma;
int i = 0;
void **bufs;
- u64 size = 0, tmp;
+ u64 size, tmp;
tmp = (preferred + chunk_size - 1);
do_div(tmp, chunk_size);
@@ -1851,7 +1857,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
u64 preferred = (u64)dev->ctrl.hmpre * 4096;
u64 min = (u64)dev->ctrl.hmmin * 4096;
u32 enable_bits = NVME_HOST_MEM_ENABLE;
- int ret = 0;
+ int ret;
preferred = min(preferred, max);
if (min > max) {
@@ -1892,7 +1898,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev)
static int nvme_setup_io_queues(struct nvme_dev *dev)
{
- struct nvme_queue *adminq = dev->queues[0];
+ struct nvme_queue *adminq = &dev->queues[0];
struct pci_dev *pdev = to_pci_dev(dev->dev);
int result, nr_io_queues;
unsigned long size;
@@ -1905,7 +1911,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev)
if (nr_io_queues == 0)
return 0;
- if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) {
+ if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) {
result = nvme_cmb_qdepth(dev, nr_io_queues,
sizeof(struct nvme_command));
if (result > 0)
@@ -2005,9 +2011,9 @@ static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode)
return 0;
}
-static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
+static void nvme_disable_io_queues(struct nvme_dev *dev)
{
- int pass;
+ int pass, queues = dev->online_queues - 1;
unsigned long timeout;
u8 opcode = nvme_admin_delete_sq;
@@ -2018,7 +2024,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
retry:
timeout = ADMIN_TIMEOUT;
for (; i > 0; i--, sent++)
- if (nvme_delete_queue(dev->queues[i], opcode))
+ if (nvme_delete_queue(&dev->queues[i], opcode))
break;
while (sent--) {
@@ -2033,13 +2039,12 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues)
}
/*
- * Return: error value if an error occurred setting up the queues or calling
- * Identify Device. 0 if these succeeded, even if adding some of the
- * namespaces failed. At the moment, these failures are silent. TBD which
- * failures should be reported.
+ * Return an error value only when tagset allocation fails.
*/
static int nvme_dev_add(struct nvme_dev *dev)
{
+ int ret;
+
if (!dev->ctrl.tagset) {
dev->tagset.ops = &nvme_mq_ops;
dev->tagset.nr_hw_queues = dev->online_queues - 1;
@@ -2055,8 +2060,12 @@ static int nvme_dev_add(struct nvme_dev *dev)
dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE;
dev->tagset.driver_data = dev;
- if (blk_mq_alloc_tag_set(&dev->tagset))
- return 0;
+ ret = blk_mq_alloc_tag_set(&dev->tagset);
+ if (ret) {
+ dev_warn(dev->ctrl.device,
+ "IO queues tagset allocation failed %d\n", ret);
+ return ret;
+ }
dev->ctrl.tagset = &dev->tagset;
nvme_dbbuf_set(dev);
@@ -2122,22 +2131,7 @@ static int nvme_pci_enable(struct nvme_dev *dev)
"set queue depth=%u\n", dev->q_depth);
}
- /*
- * CMBs can currently only exist on >=1.2 PCIe devices. We only
- * populate sysfs if a CMB is implemented. Since nvme_dev_attrs_group
- * has no name we can pass NULL as final argument to
- * sysfs_add_file_to_group.
- */
-
- if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) {
- dev->cmb = nvme_map_cmb(dev);
- if (dev->cmb) {
- if (sysfs_add_file_to_group(&dev->ctrl.device->kobj,
- &dev_attr_cmb.attr, NULL))
- dev_warn(dev->ctrl.device,
- "failed to add sysfs attribute for CMB\n");
- }
- }
+ nvme_map_cmb(dev);
pci_enable_pcie_error_reporting(pdev);
pci_save_state(pdev);
@@ -2170,7 +2164,7 @@ static void nvme_pci_disable(struct nvme_dev *dev)
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
{
- int i, queues;
+ int i;
bool dead = true;
struct pci_dev *pdev = to_pci_dev(dev->dev);
@@ -2205,21 +2199,13 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
}
nvme_stop_queues(&dev->ctrl);
- queues = dev->online_queues - 1;
- for (i = dev->ctrl.queue_count - 1; i > 0; i--)
- nvme_suspend_queue(dev->queues[i]);
-
- if (dead) {
- /* A device might become IO incapable very soon during
- * probe, before the admin queue is configured. Thus,
- * queue_count can be 0 here.
- */
- if (dev->ctrl.queue_count)
- nvme_suspend_queue(dev->queues[0]);
- } else {
- nvme_disable_io_queues(dev, queues);
+ if (!dead) {
+ nvme_disable_io_queues(dev);
nvme_disable_admin_queue(dev, shutdown);
}
+ for (i = dev->ctrl.queue_count - 1; i >= 0; i--)
+ nvme_suspend_queue(&dev->queues[i]);
+
nvme_pci_disable(dev);
blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
@@ -2289,6 +2275,7 @@ static void nvme_reset_work(struct work_struct *work)
container_of(work, struct nvme_dev, ctrl.reset_work);
bool was_suspend = !!(dev->ctrl.ctrl_config & NVME_CC_SHN_NORMAL);
int result = -ENODEV;
+ enum nvme_ctrl_state new_state = NVME_CTRL_LIVE;
if (WARN_ON(dev->ctrl.state != NVME_CTRL_RESETTING))
goto out;
@@ -2300,6 +2287,16 @@ static void nvme_reset_work(struct work_struct *work)
if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false);
+ /*
+ * Borrow the RECONNECTING state from the nvme-fc/rdma transports to mark
+ * the initialization procedure here.
+ */
+ if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RECONNECTING)) {
+ dev_warn(dev->ctrl.device,
+ "failed to mark controller RECONNECTING\n");
+ goto out;
+ }
+
result = nvme_pci_enable(dev);
if (result)
goto out;
@@ -2352,15 +2349,23 @@ static void nvme_reset_work(struct work_struct *work)
dev_warn(dev->ctrl.device, "IO queues not created\n");
nvme_kill_queues(&dev->ctrl);
nvme_remove_namespaces(&dev->ctrl);
+ new_state = NVME_CTRL_ADMIN_ONLY;
} else {
nvme_start_queues(&dev->ctrl);
nvme_wait_freeze(&dev->ctrl);
- nvme_dev_add(dev);
+ /* hit this only when tagset allocation fails */
+ if (nvme_dev_add(dev))
+ new_state = NVME_CTRL_ADMIN_ONLY;
nvme_unfreeze(&dev->ctrl);
}
- if (!nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_LIVE)) {
- dev_warn(dev->ctrl.device, "failed to mark controller live\n");
+ /*
+ * If only the admin queue is live, keep it for further investigation or
+ * recovery.
+ */
+ if (!nvme_change_ctrl_state(&dev->ctrl, new_state)) {
+ dev_warn(dev->ctrl.device,
+ "failed to mark controller state %d\n", new_state);
goto out;
}
@@ -2468,8 +2473,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node);
if (!dev)
return -ENOMEM;
- dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *),
- GFP_KERNEL, node);
+
+ dev->queues = kcalloc_node(num_possible_cpus() + 1,
+ sizeof(struct nvme_queue), GFP_KERNEL, node);
if (!dev->queues)
goto free;
@@ -2496,10 +2502,10 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
if (result)
goto release_pools;
- nvme_change_ctrl_state(&dev->ctrl, NVME_CTRL_RESETTING);
dev_info(dev->ctrl.device, "pci function %s\n", dev_name(&pdev->dev));
- queue_work(nvme_wq, &dev->ctrl.reset_work);
+ nvme_reset_ctrl(&dev->ctrl);
+
return 0;
release_pools:
@@ -2523,7 +2529,7 @@ static void nvme_reset_prepare(struct pci_dev *pdev)
static void nvme_reset_done(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
- nvme_reset_ctrl(&dev->ctrl);
+ nvme_reset_ctrl_sync(&dev->ctrl);
}
static void nvme_shutdown(struct pci_dev *pdev)
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 2a0bba7f50cf..2bc059f7d73c 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -66,7 +66,6 @@ struct nvme_rdma_request {
struct ib_sge sge[1 + NVME_RDMA_MAX_INLINE_SEGMENTS];
u32 num_sge;
int nents;
- bool inline_data;
struct ib_reg_wr reg_wr;
struct ib_cqe reg_cqe;
struct nvme_rdma_queue *queue;
@@ -1092,7 +1091,6 @@ static int nvme_rdma_map_sg_inline(struct nvme_rdma_queue *queue,
sg->length = cpu_to_le32(sg_dma_len(req->sg_table.sgl));
sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
- req->inline_data = true;
req->num_sge++;
return 0;
}
@@ -1164,7 +1162,6 @@ static int nvme_rdma_map_data(struct nvme_rdma_queue *queue,
int count, ret;
req->num_sge = 1;
- req->inline_data = false;
refcount_set(&req->ref, 2); /* send and recv completions */
c->common.flags |= NVME_CMD_SGL_METABUF;
@@ -2018,6 +2015,7 @@ out_free_ctrl:
static struct nvmf_transport_ops nvme_rdma_transport = {
.name = "rdma",
+ .module = THIS_MODULE,
.required_opts = NVMF_OPT_TRADDR,
.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO,
@@ -2040,7 +2038,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data)
}
mutex_unlock(&nvme_rdma_ctrl_mutex);
- flush_workqueue(nvme_wq);
+ flush_workqueue(nvme_delete_wq);
}
static struct ib_client nvme_rdma_ib_client = {
diff --git a/drivers/nvme/host/trace.c b/drivers/nvme/host/trace.c
new file mode 100644
index 000000000000..41944bbef835
--- /dev/null
+++ b/drivers/nvme/host/trace.c
@@ -0,0 +1,130 @@
+/*
+ * NVM Express device driver tracepoints
+ * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#include <asm/unaligned.h>
+#include "trace.h"
+
+static const char *nvme_trace_create_sq(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u16 sqid = get_unaligned_le16(cdw10);
+ u16 qsize = get_unaligned_le16(cdw10 + 2);
+ u16 sq_flags = get_unaligned_le16(cdw10 + 4);
+ u16 cqid = get_unaligned_le16(cdw10 + 6);
+
+
+ trace_seq_printf(p, "sqid=%u, qsize=%u, sq_flags=0x%x, cqid=%u",
+ sqid, qsize, sq_flags, cqid);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+static const char *nvme_trace_create_cq(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u16 cqid = get_unaligned_le16(cdw10);
+ u16 qsize = get_unaligned_le16(cdw10 + 2);
+ u16 cq_flags = get_unaligned_le16(cdw10 + 4);
+ u16 irq_vector = get_unaligned_le16(cdw10 + 6);
+
+ trace_seq_printf(p, "cqid=%u, qsize=%u, cq_flags=0x%x, irq_vector=%u",
+ cqid, qsize, cq_flags, irq_vector);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+static const char *nvme_trace_admin_identify(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u8 cns = cdw10[0];
+ u16 ctrlid = get_unaligned_le16(cdw10 + 2);
+
+ trace_seq_printf(p, "cns=%u, ctrlid=%u", cns, ctrlid);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+
+
+static const char *nvme_trace_read_write(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+ u64 slba = get_unaligned_le64(cdw10);
+ u16 length = get_unaligned_le16(cdw10 + 8);
+ u16 control = get_unaligned_le16(cdw10 + 10);
+ u32 dsmgmt = get_unaligned_le32(cdw10 + 12);
+ u32 reftag = get_unaligned_le32(cdw10 + 16);
+
+ trace_seq_printf(p,
+ "slba=%llu, len=%u, ctrl=0x%x, dsmgmt=%u, reftag=%u",
+ slba, length, control, dsmgmt, reftag);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+static const char *nvme_trace_dsm(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+
+ trace_seq_printf(p, "nr=%u, attributes=%u",
+ get_unaligned_le32(cdw10),
+ get_unaligned_le32(cdw10 + 4));
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+static const char *nvme_trace_common(struct trace_seq *p, u8 *cdw10)
+{
+ const char *ret = trace_seq_buffer_ptr(p);
+
+ trace_seq_printf(p, "cdw10=%*ph", 24, cdw10);
+ trace_seq_putc(p, 0);
+
+ return ret;
+}
+
+const char *nvme_trace_parse_admin_cmd(struct trace_seq *p,
+ u8 opcode, u8 *cdw10)
+{
+ switch (opcode) {
+ case nvme_admin_create_sq:
+ return nvme_trace_create_sq(p, cdw10);
+ case nvme_admin_create_cq:
+ return nvme_trace_create_cq(p, cdw10);
+ case nvme_admin_identify:
+ return nvme_trace_admin_identify(p, cdw10);
+ default:
+ return nvme_trace_common(p, cdw10);
+ }
+}
+
+const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p,
+ u8 opcode, u8 *cdw10)
+{
+ switch (opcode) {
+ case nvme_cmd_read:
+ case nvme_cmd_write:
+ case nvme_cmd_write_zeroes:
+ return nvme_trace_read_write(p, cdw10);
+ case nvme_cmd_dsm:
+ return nvme_trace_dsm(p, cdw10);
+ default:
+ return nvme_trace_common(p, cdw10);
+ }
+}
diff --git a/drivers/nvme/host/trace.h b/drivers/nvme/host/trace.h
new file mode 100644
index 000000000000..ea91fccd1bc0
--- /dev/null
+++ b/drivers/nvme/host/trace.h
@@ -0,0 +1,165 @@
+/*
+ * NVM Express device driver tracepoints
+ * Copyright (c) 2018 Johannes Thumshirn, SUSE Linux GmbH
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nvme
+
+#if !defined(_TRACE_NVME_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NVME_H
+
+#include <linux/nvme.h>
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "nvme.h"
+
+#define nvme_admin_opcode_name(opcode) { opcode, #opcode }
+#define show_admin_opcode_name(val) \
+ __print_symbolic(val, \
+ nvme_admin_opcode_name(nvme_admin_delete_sq), \
+ nvme_admin_opcode_name(nvme_admin_create_sq), \
+ nvme_admin_opcode_name(nvme_admin_get_log_page), \
+ nvme_admin_opcode_name(nvme_admin_delete_cq), \
+ nvme_admin_opcode_name(nvme_admin_create_cq), \
+ nvme_admin_opcode_name(nvme_admin_identify), \
+ nvme_admin_opcode_name(nvme_admin_abort_cmd), \
+ nvme_admin_opcode_name(nvme_admin_set_features), \
+ nvme_admin_opcode_name(nvme_admin_get_features), \
+ nvme_admin_opcode_name(nvme_admin_async_event), \
+ nvme_admin_opcode_name(nvme_admin_ns_mgmt), \
+ nvme_admin_opcode_name(nvme_admin_activate_fw), \
+ nvme_admin_opcode_name(nvme_admin_download_fw), \
+ nvme_admin_opcode_name(nvme_admin_ns_attach), \
+ nvme_admin_opcode_name(nvme_admin_keep_alive), \
+ nvme_admin_opcode_name(nvme_admin_directive_send), \
+ nvme_admin_opcode_name(nvme_admin_directive_recv), \
+ nvme_admin_opcode_name(nvme_admin_dbbuf), \
+ nvme_admin_opcode_name(nvme_admin_format_nvm), \
+ nvme_admin_opcode_name(nvme_admin_security_send), \
+ nvme_admin_opcode_name(nvme_admin_security_recv), \
+ nvme_admin_opcode_name(nvme_admin_sanitize_nvm))
+
+const char *nvme_trace_parse_admin_cmd(struct trace_seq *p, u8 opcode,
+ u8 *cdw10);
+#define __parse_nvme_admin_cmd(opcode, cdw10) \
+ nvme_trace_parse_admin_cmd(p, opcode, cdw10)
+
+#define nvme_opcode_name(opcode) { opcode, #opcode }
+#define show_opcode_name(val) \
+ __print_symbolic(val, \
+ nvme_opcode_name(nvme_cmd_flush), \
+ nvme_opcode_name(nvme_cmd_write), \
+ nvme_opcode_name(nvme_cmd_read), \
+ nvme_opcode_name(nvme_cmd_write_uncor), \
+ nvme_opcode_name(nvme_cmd_compare), \
+ nvme_opcode_name(nvme_cmd_write_zeroes), \
+ nvme_opcode_name(nvme_cmd_dsm), \
+ nvme_opcode_name(nvme_cmd_resv_register), \
+ nvme_opcode_name(nvme_cmd_resv_report), \
+ nvme_opcode_name(nvme_cmd_resv_acquire), \
+ nvme_opcode_name(nvme_cmd_resv_release))
+
+const char *nvme_trace_parse_nvm_cmd(struct trace_seq *p, u8 opcode,
+ u8 *cdw10);
+#define __parse_nvme_cmd(opcode, cdw10) \
+ nvme_trace_parse_nvm_cmd(p, opcode, cdw10)
+
+TRACE_EVENT(nvme_setup_admin_cmd,
+ TP_PROTO(struct nvme_command *cmd),
+ TP_ARGS(cmd),
+ TP_STRUCT__entry(
+ __field(u8, opcode)
+ __field(u8, flags)
+ __field(u16, cid)
+ __field(u64, metadata)
+ __array(u8, cdw10, 24)
+ ),
+ TP_fast_assign(
+ __entry->opcode = cmd->common.opcode;
+ __entry->flags = cmd->common.flags;
+ __entry->cid = cmd->common.command_id;
+ __entry->metadata = le64_to_cpu(cmd->common.metadata);
+ memcpy(__entry->cdw10, cmd->common.cdw10,
+ sizeof(__entry->cdw10));
+ ),
+ TP_printk(" cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
+ __entry->cid, __entry->flags, __entry->metadata,
+ show_admin_opcode_name(__entry->opcode),
+ __parse_nvme_admin_cmd(__entry->opcode, __entry->cdw10))
+);
+
+
+TRACE_EVENT(nvme_setup_nvm_cmd,
+ TP_PROTO(int qid, struct nvme_command *cmd),
+ TP_ARGS(qid, cmd),
+ TP_STRUCT__entry(
+ __field(int, qid)
+ __field(u8, opcode)
+ __field(u8, flags)
+ __field(u16, cid)
+ __field(u32, nsid)
+ __field(u64, metadata)
+ __array(u8, cdw10, 24)
+ ),
+ TP_fast_assign(
+ __entry->qid = qid;
+ __entry->opcode = cmd->common.opcode;
+ __entry->flags = cmd->common.flags;
+ __entry->cid = cmd->common.command_id;
+ __entry->nsid = le32_to_cpu(cmd->common.nsid);
+ __entry->metadata = le64_to_cpu(cmd->common.metadata);
+ memcpy(__entry->cdw10, cmd->common.cdw10,
+ sizeof(__entry->cdw10));
+ ),
+ TP_printk("qid=%d, nsid=%u, cmdid=%u, flags=0x%x, meta=0x%llx, cmd=(%s %s)",
+ __entry->qid, __entry->nsid, __entry->cid,
+ __entry->flags, __entry->metadata,
+ show_opcode_name(__entry->opcode),
+ __parse_nvme_cmd(__entry->opcode, __entry->cdw10))
+);
+
+TRACE_EVENT(nvme_complete_rq,
+ TP_PROTO(struct request *req),
+ TP_ARGS(req),
+ TP_STRUCT__entry(
+ __field(int, qid)
+ __field(int, cid)
+ __field(u64, result)
+ __field(u8, retries)
+ __field(u8, flags)
+ __field(u16, status)
+ ),
+ TP_fast_assign(
+ __entry->qid = req->q->id;
+ __entry->cid = req->tag;
+ __entry->result = le64_to_cpu(nvme_req(req)->result.u64);
+ __entry->retries = nvme_req(req)->retries;
+ __entry->flags = nvme_req(req)->flags;
+ __entry->status = nvme_req(req)->status;
+ ),
+ TP_printk("cmdid=%u, qid=%d, res=%llu, retries=%u, flags=0x%x, status=%u",
+ __entry->cid, __entry->qid, __entry->result,
+ __entry->retries, __entry->flags, __entry->status)
+
+);
+
+#endif /* _TRACE_NVME_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/drivers/nvme/target/Kconfig b/drivers/nvme/target/Kconfig
index 03e4ab65fe77..5f4f8b16685f 100644
--- a/drivers/nvme/target/Kconfig
+++ b/drivers/nvme/target/Kconfig
@@ -29,6 +29,7 @@ config NVME_TARGET_RDMA
tristate "NVMe over Fabrics RDMA target support"
depends on INFINIBAND
depends on NVME_TARGET
+ select SGL_ALLOC
help
This enables the NVMe RDMA target support, which allows exporting NVMe
devices over RDMA.
@@ -39,6 +40,7 @@ config NVME_TARGET_FC
tristate "NVMe over Fabrics FC target driver"
depends on NVME_TARGET
depends on HAS_DMA
+ select SGL_ALLOC
help
This enables the NVMe FC target support, which allows exporting NVMe
devices over FC.
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index b54748ad5f48..0bd737117a80 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -512,6 +512,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq,
req->sg_cnt = 0;
req->transfer_len = 0;
req->rsp->status = 0;
+ req->ns = NULL;
/* no support for fused commands yet */
if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) {
@@ -557,6 +558,8 @@ EXPORT_SYMBOL_GPL(nvmet_req_init);
void nvmet_req_uninit(struct nvmet_req *req)
{
percpu_ref_put(&req->sq->ref);
+ if (req->ns)
+ nvmet_put_namespace(req->ns);
}
EXPORT_SYMBOL_GPL(nvmet_req_uninit);
@@ -830,7 +833,7 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
/* Don't accept keep-alive timeout for discovery controllers */
if (kato) {
status = NVME_SC_INVALID_FIELD | NVME_SC_DNR;
- goto out_free_sqs;
+ goto out_remove_ida;
}
/*
@@ -860,6 +863,8 @@ u16 nvmet_alloc_ctrl(const char *subsysnqn, const char *hostnqn,
*ctrlp = ctrl;
return 0;
+out_remove_ida:
+ ida_simple_remove(&cntlid_ida, ctrl->cntlid);
out_free_sqs:
kfree(ctrl->sqs);
out_free_cqs:
@@ -877,21 +882,22 @@ static void nvmet_ctrl_free(struct kref *ref)
struct nvmet_ctrl *ctrl = container_of(ref, struct nvmet_ctrl, ref);
struct nvmet_subsys *subsys = ctrl->subsys;
- nvmet_stop_keep_alive_timer(ctrl);
-
mutex_lock(&subsys->lock);
list_del(&ctrl->subsys_entry);
mutex_unlock(&subsys->lock);
+ nvmet_stop_keep_alive_timer(ctrl);
+
flush_work(&ctrl->async_event_work);
cancel_work_sync(&ctrl->fatal_err_work);
ida_simple_remove(&cntlid_ida, ctrl->cntlid);
- nvmet_subsys_put(subsys);
kfree(ctrl->sqs);
kfree(ctrl->cqs);
kfree(ctrl);
+
+ nvmet_subsys_put(subsys);
}
void nvmet_ctrl_put(struct nvmet_ctrl *ctrl)
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index db3bf6b8bf9e..19e9e42ae943 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -225,7 +225,7 @@ static void nvmet_execute_io_connect(struct nvmet_req *req)
goto out_ctrl_put;
}
- pr_info("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
+ pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid);
out:
kfree(d);
diff --git a/drivers/nvme/target/fc.c b/drivers/nvme/target/fc.c
index 5fd86039e353..9b39a6cb1935 100644
--- a/drivers/nvme/target/fc.c
+++ b/drivers/nvme/target/fc.c
@@ -1697,31 +1697,12 @@ static int
nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
{
struct scatterlist *sg;
- struct page *page;
unsigned int nent;
- u32 page_len, length;
- int i = 0;
- length = fod->req.transfer_len;
- nent = DIV_ROUND_UP(length, PAGE_SIZE);
- sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL);
+ sg = sgl_alloc(fod->req.transfer_len, GFP_KERNEL, &nent);
if (!sg)
goto out;
- sg_init_table(sg, nent);
-
- while (length) {
- page_len = min_t(u32, length, PAGE_SIZE);
-
- page = alloc_page(GFP_KERNEL);
- if (!page)
- goto out_free_pages;
-
- sg_set_page(&sg[i], page, page_len, 0);
- length -= page_len;
- i++;
- }
-
fod->data_sg = sg;
fod->data_sg_cnt = nent;
fod->data_sg_cnt = fc_dma_map_sg(fod->tgtport->dev, sg, nent,
@@ -1731,14 +1712,6 @@ nvmet_fc_alloc_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
return 0;
-out_free_pages:
- while (i > 0) {
- i--;
- __free_page(sg_page(&sg[i]));
- }
- kfree(sg);
- fod->data_sg = NULL;
- fod->data_sg_cnt = 0;
out:
return NVME_SC_INTERNAL;
}
@@ -1746,18 +1719,13 @@ out:
static void
nvmet_fc_free_tgt_pgs(struct nvmet_fc_fcp_iod *fod)
{
- struct scatterlist *sg;
- int count;
-
if (!fod->data_sg || !fod->data_sg_cnt)
return;
fc_dma_unmap_sg(fod->tgtport->dev, fod->data_sg, fod->data_sg_cnt,
((fod->io_dir == NVMET_FCP_WRITE) ?
DMA_FROM_DEVICE : DMA_TO_DEVICE));
- for_each_sg(fod->data_sg, sg, fod->data_sg_cnt, count)
- __free_page(sg_page(sg));
- kfree(fod->data_sg);
+ sgl_free(fod->data_sg);
fod->data_sg = NULL;
fod->data_sg_cnt = 0;
}
@@ -2522,14 +2490,8 @@ nvmet_fc_add_port(struct nvmet_port *port)
list_for_each_entry(tgtport, &nvmet_fc_target_list, tgt_list) {
if ((tgtport->fc_target_port.node_name == traddr.nn) &&
(tgtport->fc_target_port.port_name == traddr.pn)) {
- /* a FC port can only be 1 nvmet port id */
- if (!tgtport->port) {
- tgtport->port = port;
- port->priv = tgtport;
- nvmet_fc_tgtport_get(tgtport);
- ret = 0;
- } else
- ret = -EALREADY;
+ tgtport->port = port;
+ ret = 0;
break;
}
}
@@ -2540,19 +2502,7 @@ nvmet_fc_add_port(struct nvmet_port *port)
static void
nvmet_fc_remove_port(struct nvmet_port *port)
{
- struct nvmet_fc_tgtport *tgtport = port->priv;
- unsigned long flags;
- bool matched = false;
-
- spin_lock_irqsave(&nvmet_fc_tgtlock, flags);
- if (tgtport->port == port) {
- matched = true;
- tgtport->port = NULL;
- }
- spin_unlock_irqrestore(&nvmet_fc_tgtlock, flags);
-
- if (matched)
- nvmet_fc_tgtport_put(tgtport);
+ /* nothing to do */
}
static struct nvmet_fabrics_ops nvmet_fc_tgt_fcp_ops = {
diff --git a/drivers/nvme/target/fcloop.c b/drivers/nvme/target/fcloop.c
index 6a018a0bd6ce..34712def81b1 100644
--- a/drivers/nvme/target/fcloop.c
+++ b/drivers/nvme/target/fcloop.c
@@ -204,6 +204,10 @@ struct fcloop_lport {
struct completion unreg_done;
};
+struct fcloop_lport_priv {
+ struct fcloop_lport *lport;
+};
+
struct fcloop_rport {
struct nvme_fc_remote_port *remoteport;
struct nvmet_fc_target_port *targetport;
@@ -238,21 +242,32 @@ struct fcloop_lsreq {
int status;
};
+enum {
+ INI_IO_START = 0,
+ INI_IO_ACTIVE = 1,
+ INI_IO_ABORTED = 2,
+ INI_IO_COMPLETED = 3,
+};
+
struct fcloop_fcpreq {
struct fcloop_tport *tport;
struct nvmefc_fcp_req *fcpreq;
spinlock_t reqlock;
u16 status;
+ u32 inistate;
bool active;
bool aborted;
- struct work_struct work;
+ struct kref ref;
+ struct work_struct fcp_rcv_work;
+ struct work_struct abort_rcv_work;
+ struct work_struct tio_done_work;
struct nvmefc_tgt_fcp_req tgt_fcp_req;
};
struct fcloop_ini_fcpreq {
struct nvmefc_fcp_req *fcpreq;
struct fcloop_fcpreq *tfcp_req;
- struct work_struct iniwork;
+ spinlock_t inilock;
};
static inline struct fcloop_lsreq *
@@ -343,17 +358,122 @@ fcloop_xmt_ls_rsp(struct nvmet_fc_target_port *tport,
return 0;
}
-/*
- * FCP IO operation done by initiator abort.
- * call back up initiator "done" flows.
- */
static void
-fcloop_tgt_fcprqst_ini_done_work(struct work_struct *work)
+fcloop_tfcp_req_free(struct kref *ref)
{
- struct fcloop_ini_fcpreq *inireq =
- container_of(work, struct fcloop_ini_fcpreq, iniwork);
+ struct fcloop_fcpreq *tfcp_req =
+ container_of(ref, struct fcloop_fcpreq, ref);
+
+ kfree(tfcp_req);
+}
+
+static void
+fcloop_tfcp_req_put(struct fcloop_fcpreq *tfcp_req)
+{
+ kref_put(&tfcp_req->ref, fcloop_tfcp_req_free);
+}
+
+static int
+fcloop_tfcp_req_get(struct fcloop_fcpreq *tfcp_req)
+{
+ return kref_get_unless_zero(&tfcp_req->ref);
+}
+
+static void
+fcloop_call_host_done(struct nvmefc_fcp_req *fcpreq,
+ struct fcloop_fcpreq *tfcp_req, int status)
+{
+ struct fcloop_ini_fcpreq *inireq = NULL;
+
+ if (fcpreq) {
+ inireq = fcpreq->private;
+ spin_lock(&inireq->inilock);
+ inireq->tfcp_req = NULL;
+ spin_unlock(&inireq->inilock);
+
+ fcpreq->status = status;
+ fcpreq->done(fcpreq);
+ }
+
+ /* release original io reference on tgt struct */
+ fcloop_tfcp_req_put(tfcp_req);
+}
+
+static void
+fcloop_fcp_recv_work(struct work_struct *work)
+{
+ struct fcloop_fcpreq *tfcp_req =
+ container_of(work, struct fcloop_fcpreq, fcp_rcv_work);
+ struct nvmefc_fcp_req *fcpreq = tfcp_req->fcpreq;
+ int ret = 0;
+ bool aborted = false;
+
+ spin_lock(&tfcp_req->reqlock);
+ switch (tfcp_req->inistate) {
+ case INI_IO_START:
+ tfcp_req->inistate = INI_IO_ACTIVE;
+ break;
+ case INI_IO_ABORTED:
+ aborted = true;
+ break;
+ default:
+ spin_unlock(&tfcp_req->reqlock);
+ WARN_ON(1);
+ return;
+ }
+ spin_unlock(&tfcp_req->reqlock);
+
+ if (unlikely(aborted))
+ ret = -ECANCELED;
+ else
+ ret = nvmet_fc_rcv_fcp_req(tfcp_req->tport->targetport,
+ &tfcp_req->tgt_fcp_req,
+ fcpreq->cmdaddr, fcpreq->cmdlen);
+ if (ret)
+ fcloop_call_host_done(fcpreq, tfcp_req, ret);
+
+ return;
+}
+
+static void
+fcloop_fcp_abort_recv_work(struct work_struct *work)
+{
+ struct fcloop_fcpreq *tfcp_req =
+ container_of(work, struct fcloop_fcpreq, abort_rcv_work);
+ struct nvmefc_fcp_req *fcpreq;
+ bool completed = false;
+
+ spin_lock(&tfcp_req->reqlock);
+ fcpreq = tfcp_req->fcpreq;
+ switch (tfcp_req->inistate) {
+ case INI_IO_ABORTED:
+ break;
+ case INI_IO_COMPLETED:
+ completed = true;
+ break;
+ default:
+ spin_unlock(&tfcp_req->reqlock);
+ WARN_ON(1);
+ return;
+ }
+ spin_unlock(&tfcp_req->reqlock);
+
+ if (unlikely(completed)) {
+ /* remove reference taken in original abort downcall */
+ fcloop_tfcp_req_put(tfcp_req);
+ return;
+ }
- inireq->fcpreq->done(inireq->fcpreq);
+ if (tfcp_req->tport->targetport)
+ nvmet_fc_rcv_fcp_abort(tfcp_req->tport->targetport,
+ &tfcp_req->tgt_fcp_req);
+
+ spin_lock(&tfcp_req->reqlock);
+ tfcp_req->fcpreq = NULL;
+ spin_unlock(&tfcp_req->reqlock);
+
+ fcloop_call_host_done(fcpreq, tfcp_req, -ECANCELED);
+ /* call_host_done releases reference for abort downcall */
}
/*
@@ -364,20 +484,15 @@ static void
fcloop_tgt_fcprqst_done_work(struct work_struct *work)
{
struct fcloop_fcpreq *tfcp_req =
- container_of(work, struct fcloop_fcpreq, work);
- struct fcloop_tport *tport = tfcp_req->tport;
+ container_of(work, struct fcloop_fcpreq, tio_done_work);
struct nvmefc_fcp_req *fcpreq;
spin_lock(&tfcp_req->reqlock);
fcpreq = tfcp_req->fcpreq;
+ tfcp_req->inistate = INI_IO_COMPLETED;
spin_unlock(&tfcp_req->reqlock);
- if (tport->remoteport && fcpreq) {
- fcpreq->status = tfcp_req->status;
- fcpreq->done(fcpreq);
- }
-
- kfree(tfcp_req);
+ fcloop_call_host_done(fcpreq, tfcp_req, tfcp_req->status);
}
@@ -390,7 +505,6 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
struct fcloop_rport *rport = remoteport->private;
struct fcloop_ini_fcpreq *inireq = fcpreq->private;
struct fcloop_fcpreq *tfcp_req;
- int ret = 0;
if (!rport->targetport)
return -ECONNREFUSED;
@@ -401,16 +515,20 @@ fcloop_fcp_req(struct nvme_fc_local_port *localport,
inireq->fcpreq = fcpreq;
inireq->tfcp_req = tfcp_req;
- INIT_WORK(&inireq->iniwork, fcloop_tgt_fcprqst_ini_done_work);
+ spin_lock_init(&inireq->inilock);
+
tfcp_req->fcpreq = fcpreq;
tfcp_req->tport = rport->targetport->private;
+ tfcp_req->inistate = INI_IO_START;
spin_lock_init(&tfcp_req->reqlock);
- INIT_WORK(&tfcp_req->work, fcloop_tgt_fcprqst_done_work);
+ INIT_WORK(&tfcp_req->fcp_rcv_work, fcloop_fcp_recv_work);
+ INIT_WORK(&tfcp_req->abort_rcv_work, fcloop_fcp_abort_recv_work);
+ INIT_WORK(&tfcp_req->tio_done_work, fcloop_tgt_fcprqst_done_work);
+ kref_init(&tfcp_req->ref);
- ret = nvmet_fc_rcv_fcp_req(rport->targetport, &tfcp_req->tgt_fcp_req,
- fcpreq->cmdaddr, fcpreq->cmdlen);
+ schedule_work(&tfcp_req->fcp_rcv_work);
- return ret;
+ return 0;
}
static void
@@ -589,7 +707,7 @@ fcloop_fcp_req_release(struct nvmet_fc_target_port *tgtport,
{
struct fcloop_fcpreq *tfcp_req = tgt_fcp_req_to_fcpreq(tgt_fcpreq);
- schedule_work(&tfcp_req->work);
+ schedule_work(&tfcp_req->tio_done_work);
}
static void
@@ -605,27 +723,47 @@ fcloop_fcp_abort(struct nvme_fc_local_port *localport,
void *hw_queue_handle,
struct nvmefc_fcp_req *fcpreq)
{
- struct fcloop_rport *rport = remoteport->private;
struct fcloop_ini_fcpreq *inireq = fcpreq->private;
- struct fcloop_fcpreq *tfcp_req = inireq->tfcp_req;
+ struct fcloop_fcpreq *tfcp_req;
+ bool abortio = true;
+
+ spin_lock(&inireq->inilock);
+ tfcp_req = inireq->tfcp_req;
+ if (tfcp_req)
+ fcloop_tfcp_req_get(tfcp_req);
+ spin_unlock(&inireq->inilock);
if (!tfcp_req)
/* abort has already been called */
return;
- if (rport->targetport)
- nvmet_fc_rcv_fcp_abort(rport->targetport,
- &tfcp_req->tgt_fcp_req);
-
/* break initiator/target relationship for io */
spin_lock(&tfcp_req->reqlock);
- inireq->tfcp_req = NULL;
- tfcp_req->fcpreq = NULL;
+ switch (tfcp_req->inistate) {
+ case INI_IO_START:
+ case INI_IO_ACTIVE:
+ tfcp_req->inistate = INI_IO_ABORTED;
+ break;
+ case INI_IO_COMPLETED:
+ abortio = false;
+ break;
+ default:
+ spin_unlock(&tfcp_req->reqlock);
+ WARN_ON(1);
+ return;
+ }
spin_unlock(&tfcp_req->reqlock);
- /* post the aborted io completion */
- fcpreq->status = -ECANCELED;
- schedule_work(&inireq->iniwork);
+ if (abortio)
+ /* leave the reference while the work item is scheduled */
+ WARN_ON(!schedule_work(&tfcp_req->abort_rcv_work));
+ else {
+ /*
+ * as the io has already had the done callback made,
+ * nothing more to do. So release the reference taken above
+ */
+ fcloop_tfcp_req_put(tfcp_req);
+ }
}
static void
@@ -657,7 +795,8 @@ fcloop_nport_get(struct fcloop_nport *nport)
static void
fcloop_localport_delete(struct nvme_fc_local_port *localport)
{
- struct fcloop_lport *lport = localport->private;
+ struct fcloop_lport_priv *lport_priv = localport->private;
+ struct fcloop_lport *lport = lport_priv->lport;
/* release any threads waiting for the unreg to complete */
complete(&lport->unreg_done);
@@ -697,7 +836,7 @@ static struct nvme_fc_port_template fctemplate = {
.max_dif_sgl_segments = FCLOOP_SGL_SEGS,
.dma_boundary = FCLOOP_DMABOUND_4G,
/* sizes of additional private data for data structures */
- .local_priv_sz = sizeof(struct fcloop_lport),
+ .local_priv_sz = sizeof(struct fcloop_lport_priv),
.remote_priv_sz = sizeof(struct fcloop_rport),
.lsrqst_priv_sz = sizeof(struct fcloop_lsreq),
.fcprqst_priv_sz = sizeof(struct fcloop_ini_fcpreq),
@@ -714,8 +853,7 @@ static struct nvmet_fc_target_template tgttemplate = {
.max_dif_sgl_segments = FCLOOP_SGL_SEGS,
.dma_boundary = FCLOOP_DMABOUND_4G,
/* optional features */
- .target_features = NVMET_FCTGTFEAT_CMD_IN_ISR |
- NVMET_FCTGTFEAT_OPDONE_IN_ISR,
+ .target_features = 0,
/* sizes of additional private data for data structures */
.target_priv_sz = sizeof(struct fcloop_tport),
};
@@ -728,11 +866,17 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr,
struct fcloop_ctrl_options *opts;
struct nvme_fc_local_port *localport;
struct fcloop_lport *lport;
- int ret;
+ struct fcloop_lport_priv *lport_priv;
+ unsigned long flags;
+ int ret = -ENOMEM;
+
+ lport = kzalloc(sizeof(*lport), GFP_KERNEL);
+ if (!lport)
+ return -ENOMEM;
opts = kzalloc(sizeof(*opts), GFP_KERNEL);
if (!opts)
- return -ENOMEM;
+ goto out_free_lport;
ret = fcloop_parse_options(opts, buf);
if (ret)
@@ -752,23 +896,25 @@ fcloop_create_local_port(struct device *dev, struct device_attribute *attr,
ret = nvme_fc_register_localport(&pinfo, &fctemplate, NULL, &localport);
if (!ret) {
- unsigned long flags;
-
/* success */
- lport = localport->private;
+ lport_priv = localport->private;
+ lport_priv->lport = lport;
+
lport->localport = localport;
INIT_LIST_HEAD(&lport->lport_list);
spin_lock_irqsave(&fcloop_lock, flags);
list_add_tail(&lport->lport_list, &fcloop_lports);
spin_unlock_irqrestore(&fcloop_lock, flags);
-
- /* mark all of the input buffer consumed */
- ret = count;
}
out_free_opts:
kfree(opts);
+out_free_lport:
+ /* free only if we're going to fail */
+ if (ret)
+ kfree(lport);
+
return ret ? ret : count;
}
@@ -790,6 +936,8 @@ __wait_localport_unreg(struct fcloop_lport *lport)
wait_for_completion(&lport->unreg_done);
+ kfree(lport);
+
return ret;
}
diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c
index 1e21b286f299..7991ec3a17db 100644
--- a/drivers/nvme/target/loop.c
+++ b/drivers/nvme/target/loop.c
@@ -686,6 +686,7 @@ static struct nvmet_fabrics_ops nvme_loop_ops = {
static struct nvmf_transport_ops nvme_loop_transport = {
.name = "loop",
+ .module = THIS_MODULE,
.create_ctrl = nvme_loop_create_ctrl,
};
@@ -716,7 +717,7 @@ static void __exit nvme_loop_cleanup_module(void)
nvme_delete_ctrl(&ctrl->ctrl);
mutex_unlock(&nvme_loop_ctrl_mutex);
- flush_workqueue(nvme_wq);
+ flush_workqueue(nvme_delete_wq);
}
module_init(nvme_loop_init_module);
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 49912909c298..978e169c11bf 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -185,59 +185,6 @@ nvmet_rdma_put_rsp(struct nvmet_rdma_rsp *rsp)
spin_unlock_irqrestore(&rsp->queue->rsps_lock, flags);
}
-static void nvmet_rdma_free_sgl(struct scatterlist *sgl, unsigned int nents)
-{
- struct scatterlist *sg;
- int count;
-
- if (!sgl || !nents)
- return;
-
- for_each_sg(sgl, sg, nents, count)
- __free_page(sg_page(sg));
- kfree(sgl);
-}
-
-static int nvmet_rdma_alloc_sgl(struct scatterlist **sgl, unsigned int *nents,
- u32 length)
-{
- struct scatterlist *sg;
- struct page *page;
- unsigned int nent;
- int i = 0;
-
- nent = DIV_ROUND_UP(length, PAGE_SIZE);
- sg = kmalloc_array(nent, sizeof(struct scatterlist), GFP_KERNEL);
- if (!sg)
- goto out;
-
- sg_init_table(sg, nent);
-
- while (length) {
- u32 page_len = min_t(u32, length, PAGE_SIZE);
-
- page = alloc_page(GFP_KERNEL);
- if (!page)
- goto out_free_pages;
-
- sg_set_page(&sg[i], page, page_len, 0);
- length -= page_len;
- i++;
- }
- *sgl = sg;
- *nents = nent;
- return 0;
-
-out_free_pages:
- while (i > 0) {
- i--;
- __free_page(sg_page(&sg[i]));
- }
- kfree(sg);
-out:
- return NVME_SC_INTERNAL;
-}
-
static int nvmet_rdma_alloc_cmd(struct nvmet_rdma_device *ndev,
struct nvmet_rdma_cmd *c, bool admin)
{
@@ -484,7 +431,7 @@ static void nvmet_rdma_release_rsp(struct nvmet_rdma_rsp *rsp)
}
if (rsp->req.sg != &rsp->cmd->inline_sg)
- nvmet_rdma_free_sgl(rsp->req.sg, rsp->req.sg_cnt);
+ sgl_free(rsp->req.sg);
if (unlikely(!list_empty_careful(&queue->rsp_wr_wait_list)))
nvmet_rdma_process_wr_wait_list(queue);
@@ -621,16 +568,14 @@ static u16 nvmet_rdma_map_sgl_keyed(struct nvmet_rdma_rsp *rsp,
u32 len = get_unaligned_le24(sgl->length);
u32 key = get_unaligned_le32(sgl->key);
int ret;
- u16 status;
/* no data command? */
if (!len)
return 0;
- status = nvmet_rdma_alloc_sgl(&rsp->req.sg, &rsp->req.sg_cnt,
- len);
- if (status)
- return status;
+ rsp->req.sg = sgl_alloc(len, GFP_KERNEL, &rsp->req.sg_cnt);
+ if (!rsp->req.sg)
+ return NVME_SC_INTERNAL;
ret = rdma_rw_ctx_init(&rsp->rw, cm_id->qp, cm_id->port_num,
rsp->req.sg, rsp->req.sg_cnt, 0, addr, key,
@@ -976,7 +921,7 @@ static void nvmet_rdma_destroy_queue_ib(struct nvmet_rdma_queue *queue)
static void nvmet_rdma_free_queue(struct nvmet_rdma_queue *queue)
{
- pr_info("freeing queue %d\n", queue->idx);
+ pr_debug("freeing queue %d\n", queue->idx);
nvmet_sq_destroy(&queue->nvme_sq);
@@ -1558,25 +1503,9 @@ err_ib_client:
static void __exit nvmet_rdma_exit(void)
{
- struct nvmet_rdma_queue *queue;
-
nvmet_unregister_transport(&nvmet_rdma_ops);
-
- flush_scheduled_work();
-
- mutex_lock(&nvmet_rdma_queue_mutex);
- while ((queue = list_first_entry_or_null(&nvmet_rdma_queue_list,
- struct nvmet_rdma_queue, queue_list))) {
- list_del_init(&queue->queue_list);
-
- mutex_unlock(&nvmet_rdma_queue_mutex);
- __nvmet_rdma_queue_disconnect(queue);
- mutex_lock(&nvmet_rdma_queue_mutex);
- }
- mutex_unlock(&nvmet_rdma_queue_mutex);
-
- flush_scheduled_work();
ib_unregister_client(&nvmet_rdma_ib_client);
+ WARN_ON_ONCE(!list_empty(&nvmet_rdma_queue_list));
ida_destroy(&nvmet_rdma_queue_ida);
}
diff --git a/drivers/target/Kconfig b/drivers/target/Kconfig
index e2bc99980f75..4c44d7bed01a 100644
--- a/drivers/target/Kconfig
+++ b/drivers/target/Kconfig
@@ -5,6 +5,7 @@ menuconfig TARGET_CORE
select CONFIGFS_FS
select CRC_T10DIF
select BLK_SCSI_REQUEST # only for scsi_command_size_tbl..
+ select SGL_ALLOC
default n
help
Say Y or M here to enable the TCM Storage Engine and ConfigFS enabled
diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c
index 58caacd54a3b..c03a78ee26cd 100644
--- a/drivers/target/target_core_transport.c
+++ b/drivers/target/target_core_transport.c
@@ -2300,13 +2300,7 @@ queue_full:
void target_free_sgl(struct scatterlist *sgl, int nents)
{
- struct scatterlist *sg;
- int count;
-
- for_each_sg(sgl, sg, nents, count)
- __free_page(sg_page(sg));
-
- kfree(sgl);
+ sgl_free_n_order(sgl, nents, 0);
}
EXPORT_SYMBOL(target_free_sgl);
@@ -2414,42 +2408,10 @@ int
target_alloc_sgl(struct scatterlist **sgl, unsigned int *nents, u32 length,
bool zero_page, bool chainable)
{
- struct scatterlist *sg;
- struct page *page;
- gfp_t zero_flag = (zero_page) ? __GFP_ZERO : 0;
- unsigned int nalloc, nent;
- int i = 0;
-
- nalloc = nent = DIV_ROUND_UP(length, PAGE_SIZE);
- if (chainable)
- nalloc++;
- sg = kmalloc_array(nalloc, sizeof(struct scatterlist), GFP_KERNEL);
- if (!sg)
- return -ENOMEM;
+ gfp_t gfp = GFP_KERNEL | (zero_page ? __GFP_ZERO : 0);
- sg_init_table(sg, nalloc);
-
- while (length) {
- u32 page_len = min_t(u32, length, PAGE_SIZE);
- page = alloc_page(GFP_KERNEL | zero_flag);
- if (!page)
- goto out;
-
- sg_set_page(&sg[i], page, page_len, 0);
- length -= page_len;
- i++;
- }
- *sgl = sg;
- *nents = nent;
- return 0;
-
-out:
- while (i > 0) {
- i--;
- __free_page(sg_page(&sg[i]));
- }
- kfree(sg);
- return -ENOMEM;
+ *sgl = sgl_alloc_order(length, 0, chainable, gfp, nents);
+ return *sgl ? 0 : -ENOMEM;
}
EXPORT_SYMBOL(target_alloc_sgl);