From 8c9ce606a60e4a0cb447bdc082ce383b96b227b4 Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 25 May 2012 16:11:09 -0400 Subject: xen/blkback: Copy id field when doing BLKIF_DISCARD. We weren't copying the id field so when we sent the response back to the frontend (especially with a 64-bit host and 32-bit guest), we ended up using a random value. This lead to the frontend crashing as it would try to pass to __blk_end_request_all a NULL 'struct request' (b/c it would use the 'id' to find the proper 'struct request' in its shadow array) and end up crashing: BUG: unable to handle kernel NULL pointer dereference at 000000e4 IP: [] __blk_end_request_all+0xc/0x40 .. snip.. EIP is at __blk_end_request_all+0xc/0x40 .. snip.. [] blkif_interrupt+0x172/0x330 [xen_blkfront] This fixes the bug by passing in the proper id for the response. Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=824641 CC: stable@kernel.org Tested-by: William Dauchy Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkback/common.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'drivers') diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index 773cf27dc23f..9ad3b5ec1dc1 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -257,6 +257,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, break; case BLKIF_OP_DISCARD: dst->u.discard.flag = src->u.discard.flag; + dst->u.discard.id = src->u.discard.id; dst->u.discard.sector_number = src->u.discard.sector_number; dst->u.discard.nr_sectors = src->u.discard.nr_sectors; break; @@ -287,6 +288,7 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst, break; case BLKIF_OP_DISCARD: dst->u.discard.flag = src->u.discard.flag; + dst->u.discard.id = src->u.discard.id; dst->u.discard.sector_number = src->u.discard.sector_number; dst->u.discard.nr_sectors = src->u.discard.nr_sectors; break; -- cgit v1.2.3 From 87c9ea76a242c2f9063e2a8f3e90846c932c61a7 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Sun, 3 Jun 2012 21:56:21 +0530 Subject: mtip32xx: Remove version.h header file inclusion version.h header file inclusion is no longer required. Signed-off-by: Sachin Kamat --- drivers/block/mtip32xx/mtip32xx.h | 1 - 1 file changed, 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index b2c88da26b2a..adb1aae3b752 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -26,7 +26,6 @@ #include #include #include -#include /* Offset of Subsystem Device ID in pci confoguration space */ #define PCI_SUBSYSTEM_DEVICEID 0x2E -- cgit v1.2.3 From 7412ff139d73f5561492478e89a22aede7252b7b Mon Sep 17 00:00:00 2001 From: Asai Thambi S P Date: Mon, 4 Jun 2012 12:43:03 -0700 Subject: mtip32xx: Remove 'registers' and 'flags' from sysfs This patch removes entries 'registers' and 'flags' from sysfs. Updated ABI file to reflect this change. Reported-by: Greg Kroah-Hartman Signed-off-by: Asai Thambi S P Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block-rssd | 21 ------- drivers/block/mtip32xx/mtip32xx.c | 92 +----------------------------- 2 files changed, 1 insertion(+), 112 deletions(-) (limited to 'drivers') diff --git a/Documentation/ABI/testing/sysfs-block-rssd b/Documentation/ABI/testing/sysfs-block-rssd index 679ce3543122..beef30c046b0 100644 --- a/Documentation/ABI/testing/sysfs-block-rssd +++ b/Documentation/ABI/testing/sysfs-block-rssd @@ -1,26 +1,5 @@ -What: /sys/block/rssd*/registers -Date: March 2012 -KernelVersion: 3.3 -Contact: Asai Thambi S P -Description: This is a read-only file. Dumps below driver information and - hardware registers. - - S ACTive - - Command Issue - - Completed - - PORT IRQ STAT - - HOST IRQ STAT - - Allocated - - Commands in Q - What: /sys/block/rssd*/status Date: April 2012 KernelVersion: 3.4 Contact: Asai Thambi S P Description: This is a read-only file. Indicates the status of the device. - -What: /sys/block/rssd*/flags -Date: May 2012 -KernelVersion: 3.5 -Contact: Asai Thambi S P -Description: This is a read-only file. Dumps the flags in port and driver - data structure diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 264bc77dcb91..b6e95b911b8c 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -2546,7 +2546,7 @@ static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, } /* - * Sysfs register/status dump. + * Sysfs status dump. * * @dev Pointer to the device structure, passed by the kernrel. * @attr Pointer to the device_attribute structure passed by the kernel. @@ -2555,71 +2555,6 @@ static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, * return value * The size, in bytes, of the data copied into buf. */ -static ssize_t mtip_hw_show_registers(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - u32 group_allocated; - struct driver_data *dd = dev_to_disk(dev)->private_data; - int size = 0; - int n; - - size += sprintf(&buf[size], "Hardware\n--------\n"); - size += sprintf(&buf[size], "S ACTive : [ 0x"); - - for (n = dd->slot_groups-1; n >= 0; n--) - size += sprintf(&buf[size], "%08X ", - readl(dd->port->s_active[n])); - - size += sprintf(&buf[size], "]\n"); - size += sprintf(&buf[size], "Command Issue : [ 0x"); - - for (n = dd->slot_groups-1; n >= 0; n--) - size += sprintf(&buf[size], "%08X ", - readl(dd->port->cmd_issue[n])); - - size += sprintf(&buf[size], "]\n"); - size += sprintf(&buf[size], "Completed : [ 0x"); - - for (n = dd->slot_groups-1; n >= 0; n--) - size += sprintf(&buf[size], "%08X ", - readl(dd->port->completed[n])); - - size += sprintf(&buf[size], "]\n"); - size += sprintf(&buf[size], "PORT IRQ STAT : [ 0x%08X ]\n", - readl(dd->port->mmio + PORT_IRQ_STAT)); - size += sprintf(&buf[size], "HOST IRQ STAT : [ 0x%08X ]\n", - readl(dd->mmio + HOST_IRQ_STAT)); - size += sprintf(&buf[size], "\n"); - - size += sprintf(&buf[size], "Local\n-----\n"); - size += sprintf(&buf[size], "Allocated : [ 0x"); - - for (n = dd->slot_groups-1; n >= 0; n--) { - if (sizeof(long) > sizeof(u32)) - group_allocated = - dd->port->allocated[n/2] >> (32*(n&1)); - else - group_allocated = dd->port->allocated[n]; - size += sprintf(&buf[size], "%08X ", group_allocated); - } - size += sprintf(&buf[size], "]\n"); - - size += sprintf(&buf[size], "Commands in Q: [ 0x"); - - for (n = dd->slot_groups-1; n >= 0; n--) { - if (sizeof(long) > sizeof(u32)) - group_allocated = - dd->port->cmds_to_issue[n/2] >> (32*(n&1)); - else - group_allocated = dd->port->cmds_to_issue[n]; - size += sprintf(&buf[size], "%08X ", group_allocated); - } - size += sprintf(&buf[size], "]\n"); - - return size; -} - static ssize_t mtip_hw_show_status(struct device *dev, struct device_attribute *attr, char *buf) @@ -2637,24 +2572,7 @@ static ssize_t mtip_hw_show_status(struct device *dev, return size; } -static ssize_t mtip_hw_show_flags(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct driver_data *dd = dev_to_disk(dev)->private_data; - int size = 0; - - size += sprintf(&buf[size], "Flag in port struct : [ %08lX ]\n", - dd->port->flags); - size += sprintf(&buf[size], "Flag in dd struct : [ %08lX ]\n", - dd->dd_flag); - - return size; -} - -static DEVICE_ATTR(registers, S_IRUGO, mtip_hw_show_registers, NULL); static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL); -static DEVICE_ATTR(flags, S_IRUGO, mtip_hw_show_flags, NULL); /* * Create the sysfs related attributes. @@ -2671,15 +2589,9 @@ static int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj) if (!kobj || !dd) return -EINVAL; - if (sysfs_create_file(kobj, &dev_attr_registers.attr)) - dev_warn(&dd->pdev->dev, - "Error creating 'registers' sysfs entry\n"); if (sysfs_create_file(kobj, &dev_attr_status.attr)) dev_warn(&dd->pdev->dev, "Error creating 'status' sysfs entry\n"); - if (sysfs_create_file(kobj, &dev_attr_flags.attr)) - dev_warn(&dd->pdev->dev, - "Error creating 'flags' sysfs entry\n"); return 0; } @@ -2698,9 +2610,7 @@ static int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj) if (!kobj || !dd) return -EINVAL; - sysfs_remove_file(kobj, &dev_attr_registers.attr); sysfs_remove_file(kobj, &dev_attr_status.attr); - sysfs_remove_file(kobj, &dev_attr_flags.attr); return 0; } -- cgit v1.2.3 From 7b421d24eac79800ee68905f732300a291f72f00 Mon Sep 17 00:00:00 2001 From: Asai Thambi S P Date: Mon, 4 Jun 2012 12:44:02 -0700 Subject: mtip32xx: Create debugfs entries for troubleshooting On module load, creates a debugfs parent 'rssd' in debugfs root. Then for each device, create a new node with corresponding disk name. Under the new node, two entries 'registers' and 'flags' are created. NOTE: These entries were removed from sysfs in the previous patch Signed-off-by: Asai Thambi S P Signed-off-by: Jens Axboe --- drivers/block/mtip32xx/mtip32xx.c | 162 +++++++++++++++++++++++++++++++++++++- drivers/block/mtip32xx/mtip32xx.h | 4 + 2 files changed, 165 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index b6e95b911b8c..a8fddeb3d638 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -37,6 +37,7 @@ #include #include <../drivers/ata/ahci.h> #include +#include #include "mtip32xx.h" #define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) @@ -85,6 +86,7 @@ static int instance; * allocated in mtip_init(). */ static int mtip_major; +static struct dentry *dfs_parent; static DEFINE_SPINLOCK(rssd_index_lock); static DEFINE_IDA(rssd_index_ida); @@ -2574,6 +2576,120 @@ static ssize_t mtip_hw_show_status(struct device *dev, static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL); +static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf, + size_t len, loff_t *offset) +{ + struct driver_data *dd = (struct driver_data *)f->private_data; + char buf[MTIP_DFS_MAX_BUF_SIZE]; + u32 group_allocated; + int size = *offset; + int n; + + if (!len || size) + return 0; + + if (size < 0) + return -EINVAL; + + size += sprintf(&buf[size], "H/ S ACTive : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) + size += sprintf(&buf[size], "%08X ", + readl(dd->port->s_active[n])); + + size += sprintf(&buf[size], "]\n"); + size += sprintf(&buf[size], "H/ Command Issue : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) + size += sprintf(&buf[size], "%08X ", + readl(dd->port->cmd_issue[n])); + + size += sprintf(&buf[size], "]\n"); + size += sprintf(&buf[size], "H/ Completed : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) + size += sprintf(&buf[size], "%08X ", + readl(dd->port->completed[n])); + + size += sprintf(&buf[size], "]\n"); + size += sprintf(&buf[size], "H/ PORT IRQ STAT : [ 0x%08X ]\n", + readl(dd->port->mmio + PORT_IRQ_STAT)); + size += sprintf(&buf[size], "H/ HOST IRQ STAT : [ 0x%08X ]\n", + readl(dd->mmio + HOST_IRQ_STAT)); + size += sprintf(&buf[size], "\n"); + + size += sprintf(&buf[size], "L/ Allocated : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) { + if (sizeof(long) > sizeof(u32)) + group_allocated = + dd->port->allocated[n/2] >> (32*(n&1)); + else + group_allocated = dd->port->allocated[n]; + size += sprintf(&buf[size], "%08X ", group_allocated); + } + size += sprintf(&buf[size], "]\n"); + + size += sprintf(&buf[size], "L/ Commands in Q : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) { + if (sizeof(long) > sizeof(u32)) + group_allocated = + dd->port->cmds_to_issue[n/2] >> (32*(n&1)); + else + group_allocated = dd->port->cmds_to_issue[n]; + size += sprintf(&buf[size], "%08X ", group_allocated); + } + size += sprintf(&buf[size], "]\n"); + + *offset = size <= len ? size : len; + size = copy_to_user(ubuf, buf, *offset); + if (size) + return -EFAULT; + + return *offset; +} + +static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf, + size_t len, loff_t *offset) +{ + struct driver_data *dd = (struct driver_data *)f->private_data; + char buf[MTIP_DFS_MAX_BUF_SIZE]; + int size = *offset; + + if (!len || size) + return 0; + + if (size < 0) + return -EINVAL; + + size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n", + dd->port->flags); + size += sprintf(&buf[size], "Flag-dd : [ %08lX ]\n", + dd->dd_flag); + + *offset = size <= len ? size : len; + size = copy_to_user(ubuf, buf, *offset); + if (size) + return -EFAULT; + + return *offset; +} + +static const struct file_operations mtip_regs_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = mtip_hw_read_registers, + .llseek = no_llseek, +}; + +static const struct file_operations mtip_flags_fops = { + .owner = THIS_MODULE, + .open = simple_open, + .read = mtip_hw_read_flags, + .llseek = no_llseek, +}; + /* * Create the sysfs related attributes. * @@ -2615,6 +2731,34 @@ static int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj) return 0; } +static int mtip_hw_debugfs_init(struct driver_data *dd) +{ + if (!dfs_parent) + return -1; + + dd->dfs_node = debugfs_create_dir(dd->disk->disk_name, dfs_parent); + if (IS_ERR_OR_NULL(dd->dfs_node)) { + dev_warn(&dd->pdev->dev, + "Error creating node %s under debugfs\n", + dd->disk->disk_name); + dd->dfs_node = NULL; + return -1; + } + + debugfs_create_file("flags", S_IRUGO, dd->dfs_node, dd, + &mtip_flags_fops); + debugfs_create_file("registers", S_IRUGO, dd->dfs_node, dd, + &mtip_regs_fops); + + return 0; +} + +static void mtip_hw_debugfs_exit(struct driver_data *dd) +{ + debugfs_remove_recursive(dd->dfs_node); +} + + /* * Perform any init/resume time hardware setup * @@ -3640,6 +3784,7 @@ skip_create_disk: mtip_hw_sysfs_init(dd, kobj); kobject_put(kobj); } + mtip_hw_debugfs_init(dd); if (dd->mtip_svc_handler) { set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag); @@ -3665,6 +3810,8 @@ start_service_thread: return rv; kthread_run_error: + mtip_hw_debugfs_exit(dd); + /* Delete our gendisk. This also removes the device from /dev */ del_gendisk(dd->disk); @@ -3715,6 +3862,7 @@ static int mtip_block_remove(struct driver_data *dd) kobject_put(kobj); } } + mtip_hw_debugfs_exit(dd); /* * Delete our gendisk structure. This also removes the device @@ -4062,10 +4210,20 @@ static int __init mtip_init(void) } mtip_major = error; + if (!dfs_parent) { + dfs_parent = debugfs_create_dir("rssd", NULL); + if (IS_ERR_OR_NULL(dfs_parent)) { + printk(KERN_WARNING "Error creating debugfs parent\n"); + dfs_parent = NULL; + } + } + /* Register our PCI operations. */ error = pci_register_driver(&mtip_pci_driver); - if (error) + if (error) { + debugfs_remove(dfs_parent); unregister_blkdev(mtip_major, MTIP_DRV_NAME); + } return error; } @@ -4082,6 +4240,8 @@ static int __init mtip_init(void) */ static void __exit mtip_exit(void) { + debugfs_remove_recursive(dfs_parent); + /* Release the allocated major block device number. */ unregister_blkdev(mtip_major, MTIP_DRV_NAME); diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index adb1aae3b752..f51fc23d17bb 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -110,6 +110,8 @@ #define dbg_printk(format, arg...) #endif +#define MTIP_DFS_MAX_BUF_SIZE 1024 + #define __force_bit2int (unsigned int __force) enum { @@ -446,6 +448,8 @@ struct driver_data { unsigned long dd_flag; /* NOTE: use atomic bit operations on this */ struct task_struct *mtip_svc_handler; /* task_struct of svc thd */ + + struct dentry *dfs_node; }; #endif -- cgit v1.2.3 From 6878c32e5cc0e40980abe51d1f02fb453e27493e Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Fri, 25 May 2012 17:34:51 -0400 Subject: xen/blkfront: Add WARN to deal with misbehaving backends. Part of the ring structure is the 'id' field which is under control of the frontend. The frontend stamps it with "some" value (this some in this implementation being a value less than BLK_RING_SIZE), and when it gets a response expects said value to be in the response structure. We have a check for the id field when spolling new requests but not when de-spolling responses. We also add an extra check in add_id_to_freelist to make sure that the 'struct request' was not NULL - as we cannot pass a NULL to __blk_end_request_all, otherwise that crashes (and all the operations that the response is dealing with end up with __blk_end_request_all). Lastly we also print the name of the operation that failed. [v1: s/BUG/WARN/ suggested by Stefano] [v2: Add extra check in add_id_to_freelist] [v3: Redid op_name per Jan's suggestion] [v4: add const * and add WARN on failure returns] Acked-by: Jan Beulich Acked-by: Stefano Stabellini Signed-off-by: Konrad Rzeszutek Wilk --- drivers/block/xen-blkfront.c | 58 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 46 insertions(+), 12 deletions(-) (limited to 'drivers') diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 60eed4bdd2e4..e4fb3374dcd2 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -141,14 +141,36 @@ static int get_id_from_freelist(struct blkfront_info *info) return free; } -static void add_id_to_freelist(struct blkfront_info *info, +static int add_id_to_freelist(struct blkfront_info *info, unsigned long id) { + if (info->shadow[id].req.u.rw.id != id) + return -EINVAL; + if (info->shadow[id].request == NULL) + return -EINVAL; info->shadow[id].req.u.rw.id = info->shadow_free; info->shadow[id].request = NULL; info->shadow_free = id; + return 0; } +static const char *op_name(int op) +{ + static const char *const names[] = { + [BLKIF_OP_READ] = "read", + [BLKIF_OP_WRITE] = "write", + [BLKIF_OP_WRITE_BARRIER] = "barrier", + [BLKIF_OP_FLUSH_DISKCACHE] = "flush", + [BLKIF_OP_DISCARD] = "discard" }; + + if (op < 0 || op >= ARRAY_SIZE(names)) + return "unknown"; + + if (!names[op]) + return "reserved"; + + return names[op]; +} static int xlbd_reserve_minors(unsigned int minor, unsigned int nr) { unsigned int end = minor + nr; @@ -746,20 +768,36 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) bret = RING_GET_RESPONSE(&info->ring, i); id = bret->id; + /* + * The backend has messed up and given us an id that we would + * never have given to it (we stamp it up to BLK_RING_SIZE - + * look in get_id_from_freelist. + */ + if (id >= BLK_RING_SIZE) { + WARN(1, "%s: response to %s has incorrect id (%ld)\n", + info->gd->disk_name, op_name(bret->operation), id); + /* We can't safely get the 'struct request' as + * the id is busted. */ + continue; + } req = info->shadow[id].request; if (bret->operation != BLKIF_OP_DISCARD) blkif_completion(&info->shadow[id]); - add_id_to_freelist(info, id); + if (add_id_to_freelist(info, id)) { + WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", + info->gd->disk_name, op_name(bret->operation), id); + continue; + } error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; switch (bret->operation) { case BLKIF_OP_DISCARD: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { struct request_queue *rq = info->rq; - printk(KERN_WARNING "blkfront: %s: discard op failed\n", - info->gd->disk_name); + printk(KERN_WARNING "blkfront: %s: %s op failed\n", + info->gd->disk_name, op_name(bret->operation)); error = -EOPNOTSUPP; info->feature_discard = 0; info->feature_secdiscard = 0; @@ -771,18 +809,14 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_WRITE_BARRIER: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { - printk(KERN_WARNING "blkfront: %s: write %s op failed\n", - info->flush_op == BLKIF_OP_WRITE_BARRIER ? - "barrier" : "flush disk cache", - info->gd->disk_name); + printk(KERN_WARNING "blkfront: %s: %s op failed\n", + info->gd->disk_name, op_name(bret->operation)); error = -EOPNOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && info->shadow[id].req.u.rw.nr_segments == 0)) { - printk(KERN_WARNING "blkfront: %s: empty write %s op failed\n", - info->flush_op == BLKIF_OP_WRITE_BARRIER ? - "barrier" : "flush disk cache", - info->gd->disk_name); + printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", + info->gd->disk_name, op_name(bret->operation)); error = -EOPNOTSUPP; } if (unlikely(error)) { -- cgit v1.2.3 From 4eccc579795290a58e2262fa4e9d083d7672e699 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 8 Jun 2012 13:18:51 +0200 Subject: drbd: fix access of unallocated pages and kernel panic BUG: unable to handle kernel NULL pointer dereference at (null) ... [] ? _drbd_bm_set_bits+0x151/0x240 [drbd] [] ? receive_bitmap+0x4f8/0xbc0 [drbd] This fixes an off-by-one error in the receive_bitmap() path, if run-length encoded bitmap transfer is enabled. If the bitmap is an exact multiple of PAGE_SIZE, which means the visible capacity of the drbd device is an exact multiple of 128 MiB (for 4k page size), and bitmap compression (use-rle) is enabled (which became default with 8.4), and the very last bit is dirty and reported in an rle comressed bitmap packet, we ended up trying to kmap_atomic a page pointer that does not exist (bitmap->bm_pages[last index + 1]). bug introduced by: Date: Fri Jul 24 15:33:24 2009 +0200 set bits: optimize for complete last word, fix off-by-one-word corner case made effective by: Date: Thu Dec 16 00:32:38 2010 +0100 drbd: get rid of unused debug code Long time ago, we had paranoia code in the bitmap that allocated one extra word, assigned a magic value, and checked on every occasion that the magic value was still unchanged. That debug code is unused, the extra long word complicates code a bit. Get rid of it. No-one triggered this bug in the last few years, because a large subset of our userbase is unaffected: * typically the last few blocks of a device are not modified frequently, and remain unset * use-rle was disabled by default in drbd < 8.4 * those with slightly "odd" device sizes, or * drbd internal meta data (which will skew the device size slightly, thus makes it harder to have a bug relevant device size) Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_bitmap.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'drivers') diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index b5c5ff53cb57..fcb956bb4b4c 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -1475,10 +1475,17 @@ void _drbd_bm_set_bits(struct drbd_conf *mdev, const unsigned long s, const unsi first_word = 0; spin_lock_irq(&b->bm_lock); } - /* last page (respectively only page, for first page == last page) */ last_word = MLPP(el >> LN2_BPL); - bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word); + + /* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1. (or multiples). + * ==> e = 32767, el = 32768, last_page = 2, + * and now last_word = 0. + * We do not want to touch last_page in this case, + * as we did not allocate it, it is not present in bitmap->bm_pages. + */ + if (last_word) + bm_set_full_words_within_one_page(mdev->bitmap, last_page, first_word, last_word); /* possibly trailing bits. * example: (e & 63) == 63, el will be e+1. -- cgit v1.2.3 From 1ed25b269e3dd5ecc64f17beef9ea21745c39ca6 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 8 Jun 2012 14:09:54 +0200 Subject: drbd: fix list corruption by failing but already aborted reads If a read is aborted due to force-detach of a supposedly unresponsive local backing device, and retried on the peer, it can happen that the local request later still completes (hopefully with an error). As it may already have been completed to upper layers meanwhile, it must not be retried again now. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'drivers') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 9c5c84946b05..773f4e2d3c1b 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -472,12 +472,17 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; - D_ASSERT(!(req->rq_state & RQ_NET_MASK)); + if (req->rq_state & RQ_LOCAL_ABORTED) { + _req_may_be_done(req, m); + break; + } __drbd_chk_io_error(mdev, false); goto_queue_for_net_read: + D_ASSERT(!(req->rq_state & RQ_NET_MASK)); + /* no point in retrying if there is no good remote data, * or we have no connection. */ if (mdev->state.pdsk != D_UP_TO_DATE) { -- cgit v1.2.3 From 0d5934e3c258fc5decc4103600c597086fd95a52 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Fri, 8 Jun 2012 14:17:36 +0200 Subject: drbd: fix null pointer dereference with on-congestion policy when diskless We must not look at mdev->actlog, unless we have a get_ldev() reference. It also does not make much sense to try to disconnect or pull-ahead of the peer, if we don't have good local data. Only even consider congestion policies, if our local disk is D_UP_TO_DATE. Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- drivers/block/drbd/drbd_req.c | 59 ++++++++++++++++++++++++++----------------- 1 file changed, 36 insertions(+), 23 deletions(-) (limited to 'drivers') diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 773f4e2d3c1b..8e93a6ac9bb6 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -770,6 +770,40 @@ static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int s return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr); } +static void maybe_pull_ahead(struct drbd_conf *mdev) +{ + int congested = 0; + + /* If I don't even have good local storage, we can not reasonably try + * to pull ahead of the peer. We also need the local reference to make + * sure mdev->act_log is there. + * Note: caller has to make sure that net_conf is there. + */ + if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) + return; + + if (mdev->net_conf->cong_fill && + atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) { + dev_info(DEV, "Congestion-fill threshold reached\n"); + congested = 1; + } + + if (mdev->act_log->used >= mdev->net_conf->cong_extents) { + dev_info(DEV, "Congestion-extents threshold reached\n"); + congested = 1; + } + + if (congested) { + queue_barrier(mdev); /* last barrier, after mirrored writes */ + + if (mdev->net_conf->on_congestion == OC_PULL_AHEAD) + _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL); + else /*mdev->net_conf->on_congestion == OC_DISCONNECT */ + _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL); + } + put_ldev(mdev); +} + static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time) { const int rw = bio_rw(bio); @@ -977,29 +1011,8 @@ allocate_barrier: _req_mod(req, queue_for_send_oos); if (remote && - mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) { - int congested = 0; - - if (mdev->net_conf->cong_fill && - atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) { - dev_info(DEV, "Congestion-fill threshold reached\n"); - congested = 1; - } - - if (mdev->act_log->used >= mdev->net_conf->cong_extents) { - dev_info(DEV, "Congestion-extents threshold reached\n"); - congested = 1; - } - - if (congested) { - queue_barrier(mdev); /* last barrier, after mirrored writes */ - - if (mdev->net_conf->on_congestion == OC_PULL_AHEAD) - _drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL); - else /*mdev->net_conf->on_congestion == OC_DISCONNECT */ - _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL); - } - } + mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) + maybe_pull_ahead(mdev); spin_unlock_irq(&mdev->req_lock); kfree(b); /* if someone else has beaten us to it... */ -- cgit v1.2.3 From 32587371ad3db2f9d335de10dbd8cffd4fff5669 Mon Sep 17 00:00:00 2001 From: Tao Guo Date: Wed, 13 Jun 2012 21:17:21 +0200 Subject: umem: fix up unplugging Fix a regression introduced by 7eaceaccab5f40 ("block: remove per-queue plugging"). In that patch, Jens removed the whole mm_unplug_device() function, which used to be the trigger to make umem start to work. We need to implement unplugging to make umem start to work, or I/O will never be triggered. Signed-off-by: Tao Guo Cc: Neil Brown Cc: Jens Axboe Cc: Shaohua Li Cc: Acked-by: NeilBrown Signed-off-by: Jens Axboe --- drivers/block/umem.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'drivers') diff --git a/drivers/block/umem.c b/drivers/block/umem.c index aa2712060bfb..9a72277a31df 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -513,6 +513,44 @@ static void process_page(unsigned long data) } } +struct mm_plug_cb { + struct blk_plug_cb cb; + struct cardinfo *card; +}; + +static void mm_unplug(struct blk_plug_cb *cb) +{ + struct mm_plug_cb *mmcb = container_of(cb, struct mm_plug_cb, cb); + + spin_lock_irq(&mmcb->card->lock); + activate(mmcb->card); + spin_unlock_irq(&mmcb->card->lock); + kfree(mmcb); +} + +static int mm_check_plugged(struct cardinfo *card) +{ + struct blk_plug *plug = current->plug; + struct mm_plug_cb *mmcb; + + if (!plug) + return 0; + + list_for_each_entry(mmcb, &plug->cb_list, cb.list) { + if (mmcb->cb.callback == mm_unplug && mmcb->card == card) + return 1; + } + /* Not currently on the callback list */ + mmcb = kmalloc(sizeof(*mmcb), GFP_ATOMIC); + if (!mmcb) + return 0; + + mmcb->card = card; + mmcb->cb.callback = mm_unplug; + list_add(&mmcb->cb.list, &plug->cb_list); + return 1; +} + static void mm_make_request(struct request_queue *q, struct bio *bio) { struct cardinfo *card = q->queuedata; @@ -523,6 +561,8 @@ static void mm_make_request(struct request_queue *q, struct bio *bio) *card->biotail = bio; bio->bi_next = NULL; card->biotail = &bio->bi_next; + if (bio->bi_rw & REQ_SYNC || !mm_check_plugged(card)) + activate(card); spin_unlock_irq(&card->lock); return; -- cgit v1.2.3