diff options
Diffstat (limited to 'drivers/block')
25 files changed, 2698 insertions, 323 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 83c32cb72582..717d6e4e18d3 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -470,6 +470,27 @@ config XEN_BLKDEV_FRONTEND block device driver. It communicates with a back-end driver in another domain which drives the actual block device. +config XEN_BLKDEV_BACKEND + tristate "Block-device backend driver" + depends on XEN_BACKEND + help + The block-device backend driver allows the kernel to export its + block devices to other guests via a high-performance shared-memory + interface. + + The corresponding Linux frontend driver is enabled by the + CONFIG_XEN_BLKDEV_FRONTEND configuration option. + + The backend driver attaches itself to a any block device specified + in the XenBus configuration. There are no limits to what the block + device as long as it has a major and minor. + + If you are compiling a kernel to run in a Xen block backend driver + domain (often this is domain 0) you should say Y here. To + compile this driver as a module, chose M here: the module + will be called xen-blkback. + + config VIRTIO_BLK tristate "Virtio block driver (EXPERIMENTAL)" depends on EXPERIMENTAL && VIRTIO diff --git a/drivers/block/Makefile b/drivers/block/Makefile index 40528ba56d1b..76646e9a1c91 100644 --- a/drivers/block/Makefile +++ b/drivers/block/Makefile @@ -36,6 +36,7 @@ obj-$(CONFIG_BLK_DEV_UB) += ub.o obj-$(CONFIG_BLK_DEV_HD) += hd.o obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o +obj-$(CONFIG_XEN_BLKDEV_BACKEND) += xen-blkback/ obj-$(CONFIG_BLK_DEV_DRBD) += drbd/ obj-$(CONFIG_BLK_DEV_RBD) += rbd.o diff --git a/drivers/block/brd.c b/drivers/block/brd.c index b7f51e4594f8..dba1c32e1ddf 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -35,10 +35,6 @@ */ struct brd_device { int brd_number; - int brd_refcnt; - loff_t brd_offset; - loff_t brd_sizelimit; - unsigned brd_blocksize; struct request_queue *brd_queue; struct gendisk *brd_disk; @@ -440,11 +436,11 @@ static int rd_nr; int rd_size = CONFIG_BLK_DEV_RAM_SIZE; static int max_part; static int part_shift; -module_param(rd_nr, int, 0); +module_param(rd_nr, int, S_IRUGO); MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices"); -module_param(rd_size, int, 0); +module_param(rd_size, int, S_IRUGO); MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes."); -module_param(max_part, int, 0); +module_param(max_part, int, S_IRUGO); MODULE_PARM_DESC(max_part, "Maximum number of partitions per RAM disk"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR); @@ -552,7 +548,7 @@ static struct kobject *brd_probe(dev_t dev, int *part, void *data) struct kobject *kobj; mutex_lock(&brd_devices_mutex); - brd = brd_init_one(dev & MINORMASK); + brd = brd_init_one(MINOR(dev) >> part_shift); kobj = brd ? get_disk(brd->brd_disk) : ERR_PTR(-ENOMEM); mutex_unlock(&brd_devices_mutex); @@ -575,25 +571,39 @@ static int __init brd_init(void) * * (1) if rd_nr is specified, create that many upfront, and this * also becomes a hard limit. - * (2) if rd_nr is not specified, create 1 rd device on module - * load, user can further extend brd device by create dev node - * themselves and have kernel automatically instantiate actual - * device on-demand. + * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT + * (default 16) rd device on module load, user can further + * extend brd device by create dev node themselves and have + * kernel automatically instantiate actual device on-demand. */ part_shift = 0; - if (max_part > 0) + if (max_part > 0) { part_shift = fls(max_part); + /* + * Adjust max_part according to part_shift as it is exported + * to user space so that user can decide correct minor number + * if [s]he want to create more devices. + * + * Note that -1 is required because partition 0 is reserved + * for the whole disk. + */ + max_part = (1UL << part_shift) - 1; + } + + if ((1UL << part_shift) > DISK_MAX_PARTS) + return -EINVAL; + if (rd_nr > 1UL << (MINORBITS - part_shift)) return -EINVAL; if (rd_nr) { nr = rd_nr; - range = rd_nr; + range = rd_nr << part_shift; } else { nr = CONFIG_BLK_DEV_RAM_COUNT; - range = 1UL << (MINORBITS - part_shift); + range = 1UL << MINORBITS; } if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) @@ -632,7 +642,7 @@ static void __exit brd_exit(void) unsigned long range; struct brd_device *brd, *next; - range = rd_nr ? rd_nr : 1UL << (MINORBITS - part_shift); + range = rd_nr ? rd_nr << part_shift : 1UL << MINORBITS; list_for_each_entry_safe(brd, next, &brd_devices, brd_list) brd_del_one(brd); diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 9bf13988f1a2..8f4ef656a1af 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -64,6 +64,10 @@ MODULE_DESCRIPTION("Driver for HP Smart Array Controllers"); MODULE_SUPPORTED_DEVICE("HP Smart Array Controllers"); MODULE_VERSION("3.6.26"); MODULE_LICENSE("GPL"); +static int cciss_tape_cmds = 6; +module_param(cciss_tape_cmds, int, 0644); +MODULE_PARM_DESC(cciss_tape_cmds, + "number of commands to allocate for tape devices (default: 6)"); static DEFINE_MUTEX(cciss_mutex); static struct proc_dir_entry *proc_cciss; @@ -194,6 +198,8 @@ static int __devinit cciss_find_cfg_addrs(struct pci_dev *pdev, static int __devinit cciss_pci_find_memory_BAR(struct pci_dev *pdev, unsigned long *memory_bar); static inline u32 cciss_tag_discard_error_bits(ctlr_info_t *h, u32 tag); +static __devinit int write_driver_ver_to_cfgtable( + CfgTable_struct __iomem *cfgtable); /* performant mode helper functions */ static void calc_bucket_map(int *bucket, int num_buckets, int nsgs, @@ -556,7 +562,7 @@ static void __devinit cciss_procinit(ctlr_info_t *h) #define to_hba(n) container_of(n, struct ctlr_info, dev) #define to_drv(n) container_of(n, drive_info_struct, dev) -/* List of controllers which cannot be reset on kexec with reset_devices */ +/* List of controllers which cannot be hard reset on kexec with reset_devices */ static u32 unresettable_controller[] = { 0x324a103C, /* Smart Array P712m */ 0x324b103C, /* SmartArray P711m */ @@ -574,23 +580,45 @@ static u32 unresettable_controller[] = { 0x409D0E11, /* Smart Array 6400 EM */ }; -static int ctlr_is_resettable(struct ctlr_info *h) +/* List of controllers which cannot even be soft reset */ +static u32 soft_unresettable_controller[] = { + 0x409C0E11, /* Smart Array 6400 */ + 0x409D0E11, /* Smart Array 6400 EM */ +}; + +static int ctlr_is_hard_resettable(u32 board_id) { int i; for (i = 0; i < ARRAY_SIZE(unresettable_controller); i++) - if (unresettable_controller[i] == h->board_id) + if (unresettable_controller[i] == board_id) return 0; return 1; } +static int ctlr_is_soft_resettable(u32 board_id) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(soft_unresettable_controller); i++) + if (soft_unresettable_controller[i] == board_id) + return 0; + return 1; +} + +static int ctlr_is_resettable(u32 board_id) +{ + return ctlr_is_hard_resettable(board_id) || + ctlr_is_soft_resettable(board_id); +} + static ssize_t host_show_resettable(struct device *dev, struct device_attribute *attr, char *buf) { struct ctlr_info *h = to_hba(dev); - return snprintf(buf, 20, "%d\n", ctlr_is_resettable(h)); + return snprintf(buf, 20, "%d\n", ctlr_is_resettable(h->board_id)); } static DEVICE_ATTR(resettable, S_IRUGO, host_show_resettable, NULL); @@ -2567,7 +2595,7 @@ static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff, } } else if (cmd_type == TYPE_MSG) { switch (cmd) { - case 0: /* ABORT message */ + case CCISS_ABORT_MSG: c->Request.CDBLen = 12; c->Request.Type.Attribute = ATTR_SIMPLE; c->Request.Type.Direction = XFER_WRITE; @@ -2577,16 +2605,16 @@ static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff, /* buff contains the tag of the command to abort */ memcpy(&c->Request.CDB[4], buff, 8); break; - case 1: /* RESET message */ + case CCISS_RESET_MSG: c->Request.CDBLen = 16; c->Request.Type.Attribute = ATTR_SIMPLE; c->Request.Type.Direction = XFER_NONE; c->Request.Timeout = 0; memset(&c->Request.CDB[0], 0, sizeof(c->Request.CDB)); c->Request.CDB[0] = cmd; /* reset */ - c->Request.CDB[1] = 0x03; /* reset a target */ + c->Request.CDB[1] = CCISS_RESET_TYPE_TARGET; break; - case 3: /* No-Op message */ + case CCISS_NOOP_MSG: c->Request.CDBLen = 1; c->Request.Type.Attribute = ATTR_SIMPLE; c->Request.Type.Direction = XFER_WRITE; @@ -2615,6 +2643,31 @@ static int fill_cmd(ctlr_info_t *h, CommandList_struct *c, __u8 cmd, void *buff, return status; } +static int __devinit cciss_send_reset(ctlr_info_t *h, unsigned char *scsi3addr, + u8 reset_type) +{ + CommandList_struct *c; + int return_status; + + c = cmd_alloc(h); + if (!c) + return -ENOMEM; + return_status = fill_cmd(h, c, CCISS_RESET_MSG, NULL, 0, 0, + CTLR_LUNID, TYPE_MSG); + c->Request.CDB[1] = reset_type; /* fill_cmd defaults to target reset */ + if (return_status != IO_OK) { + cmd_special_free(h, c); + return return_status; + } + c->waiting = NULL; + enqueue_cmd_and_start_io(h, c); + /* Don't wait for completion, the reset won't complete. Don't free + * the command either. This is the last command we will send before + * re-initializing everything, so it doesn't matter and won't leak. + */ + return 0; +} + static int check_target_status(ctlr_info_t *h, CommandList_struct *c) { switch (c->err_info->ScsiStatus) { @@ -3461,6 +3514,63 @@ static inline u32 process_nonindexed_cmd(ctlr_info_t *h, u32 raw_tag) return next_command(h); } +/* Some controllers, like p400, will give us one interrupt + * after a soft reset, even if we turned interrupts off. + * Only need to check for this in the cciss_xxx_discard_completions + * functions. + */ +static int ignore_bogus_interrupt(ctlr_info_t *h) +{ + if (likely(!reset_devices)) + return 0; + + if (likely(h->interrupts_enabled)) + return 0; + + dev_info(&h->pdev->dev, "Received interrupt while interrupts disabled " + "(known firmware bug.) Ignoring.\n"); + + return 1; +} + +static irqreturn_t cciss_intx_discard_completions(int irq, void *dev_id) +{ + ctlr_info_t *h = dev_id; + unsigned long flags; + u32 raw_tag; + + if (ignore_bogus_interrupt(h)) + return IRQ_NONE; + + if (interrupt_not_for_us(h)) + return IRQ_NONE; + spin_lock_irqsave(&h->lock, flags); + while (interrupt_pending(h)) { + raw_tag = get_next_completion(h); + while (raw_tag != FIFO_EMPTY) + raw_tag = next_command(h); + } + spin_unlock_irqrestore(&h->lock, flags); + return IRQ_HANDLED; +} + +static irqreturn_t cciss_msix_discard_completions(int irq, void *dev_id) +{ + ctlr_info_t *h = dev_id; + unsigned long flags; + u32 raw_tag; + + if (ignore_bogus_interrupt(h)) + return IRQ_NONE; + + spin_lock_irqsave(&h->lock, flags); + raw_tag = get_next_completion(h); + while (raw_tag != FIFO_EMPTY) + raw_tag = next_command(h); + spin_unlock_irqrestore(&h->lock, flags); + return IRQ_HANDLED; +} + static irqreturn_t do_cciss_intx(int irq, void *dev_id) { ctlr_info_t *h = dev_id; @@ -4078,6 +4188,9 @@ static int __devinit cciss_find_cfgtables(ctlr_info_t *h) cfg_base_addr_index) + cfg_offset, sizeof(h->cfgtable)); if (!h->cfgtable) return -ENOMEM; + rc = write_driver_ver_to_cfgtable(h->cfgtable); + if (rc) + return rc; /* Find performant mode table. */ trans_offset = readl(&h->cfgtable->TransMethodOffset); h->transtable = remap_pci_mem(pci_resource_start(h->pdev, @@ -4112,7 +4225,7 @@ static void __devinit cciss_get_max_perf_mode_cmds(struct ctlr_info *h) static void __devinit cciss_find_board_params(ctlr_info_t *h) { cciss_get_max_perf_mode_cmds(h); - h->nr_cmds = h->max_commands - 4; /* Allow room for some ioctls */ + h->nr_cmds = h->max_commands - 4 - cciss_tape_cmds; h->maxsgentries = readl(&(h->cfgtable->MaxSGElements)); /* * Limit in-command s/g elements to 32 save dma'able memory. @@ -4348,7 +4461,7 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u tag = readl(vaddr + SA5_REPLY_PORT_OFFSET); if ((tag & ~3) == paddr32) break; - schedule_timeout_uninterruptible(HZ); + msleep(CCISS_POST_RESET_NOOP_TIMEOUT_MSECS); } iounmap(vaddr); @@ -4375,11 +4488,10 @@ static __devinit int cciss_message(struct pci_dev *pdev, unsigned char opcode, u return 0; } -#define cciss_soft_reset_controller(p) cciss_message(p, 1, 0) #define cciss_noop(p) cciss_message(p, 3, 0) static int cciss_controller_hard_reset(struct pci_dev *pdev, - void * __iomem vaddr, bool use_doorbell) + void * __iomem vaddr, u32 use_doorbell) { u16 pmcsr; int pos; @@ -4390,8 +4502,7 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev, * other way using the doorbell register. */ dev_info(&pdev->dev, "using doorbell to reset controller\n"); - writel(DOORBELL_CTLR_RESET, vaddr + SA5_DOORBELL); - msleep(1000); + writel(use_doorbell, vaddr + SA5_DOORBELL); } else { /* Try to do it the PCI power state way */ /* Quoting from the Open CISS Specification: "The Power @@ -4422,12 +4533,64 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev, pmcsr &= ~PCI_PM_CTRL_STATE_MASK; pmcsr |= PCI_D0; pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr); - - msleep(500); } return 0; } +static __devinit void init_driver_version(char *driver_version, int len) +{ + memset(driver_version, 0, len); + strncpy(driver_version, "cciss " DRIVER_NAME, len - 1); +} + +static __devinit int write_driver_ver_to_cfgtable( + CfgTable_struct __iomem *cfgtable) +{ + char *driver_version; + int i, size = sizeof(cfgtable->driver_version); + + driver_version = kmalloc(size, GFP_KERNEL); + if (!driver_version) + return -ENOMEM; + + init_driver_version(driver_version, size); + for (i = 0; i < size; i++) + writeb(driver_version[i], &cfgtable->driver_version[i]); + kfree(driver_version); + return 0; +} + +static __devinit void read_driver_ver_from_cfgtable( + CfgTable_struct __iomem *cfgtable, unsigned char *driver_ver) +{ + int i; + + for (i = 0; i < sizeof(cfgtable->driver_version); i++) + driver_ver[i] = readb(&cfgtable->driver_version[i]); +} + +static __devinit int controller_reset_failed( + CfgTable_struct __iomem *cfgtable) +{ + + char *driver_ver, *old_driver_ver; + int rc, size = sizeof(cfgtable->driver_version); + + old_driver_ver = kmalloc(2 * size, GFP_KERNEL); + if (!old_driver_ver) + return -ENOMEM; + driver_ver = old_driver_ver + size; + + /* After a reset, the 32 bytes of "driver version" in the cfgtable + * should have been changed, otherwise we know the reset failed. + */ + init_driver_version(old_driver_ver, size); + read_driver_ver_from_cfgtable(cfgtable, driver_ver); + rc = !memcmp(driver_ver, old_driver_ver, size); + kfree(old_driver_ver); + return rc; +} + /* This does a hard reset of the controller using PCI power management * states or using the doorbell register. */ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) @@ -4437,10 +4600,10 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) u64 cfg_base_addr_index; void __iomem *vaddr; unsigned long paddr; - u32 misc_fw_support, active_transport; + u32 misc_fw_support; int rc; CfgTable_struct __iomem *cfgtable; - bool use_doorbell; + u32 use_doorbell; u32 board_id; u16 command_register; @@ -4464,12 +4627,16 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) * likely not be happy. Just forbid resetting this conjoined mess. */ cciss_lookup_board_id(pdev, &board_id); - if (board_id == 0x409C0E11 || board_id == 0x409D0E11) { + if (!ctlr_is_resettable(board_id)) { dev_warn(&pdev->dev, "Cannot reset Smart Array 640x " "due to shared cache module."); return -ENODEV; } + /* if controller is soft- but not hard resettable... */ + if (!ctlr_is_hard_resettable(board_id)) + return -ENOTSUPP; /* try soft reset later. */ + /* Save the PCI command register */ pci_read_config_word(pdev, 4, &command_register); /* Turn the board off. This is so that later pci_restore_state() @@ -4497,16 +4664,28 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) rc = -ENOMEM; goto unmap_vaddr; } + rc = write_driver_ver_to_cfgtable(cfgtable); + if (rc) + goto unmap_vaddr; - /* If reset via doorbell register is supported, use that. */ - misc_fw_support = readl(&cfgtable->misc_fw_support); - use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET; - - /* The doorbell reset seems to cause lockups on some Smart - * Arrays (e.g. P410, P410i, maybe others). Until this is - * fixed or at least isolated, avoid the doorbell reset. + /* If reset via doorbell register is supported, use that. + * There are two such methods. Favor the newest method. */ - use_doorbell = 0; + misc_fw_support = readl(&cfgtable->misc_fw_support); + use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET2; + if (use_doorbell) { + use_doorbell = DOORBELL_CTLR_RESET2; + } else { + use_doorbell = misc_fw_support & MISC_FW_DOORBELL_RESET; + if (use_doorbell) { + dev_warn(&pdev->dev, "Controller claims that " + "'Bit 2 doorbell reset' is " + "supported, but not 'bit 5 doorbell reset'. " + "Firmware update is recommended.\n"); + rc = -ENOTSUPP; /* use the soft reset */ + goto unmap_cfgtable; + } + } rc = cciss_controller_hard_reset(pdev, vaddr, use_doorbell); if (rc) @@ -4524,30 +4703,31 @@ static __devinit int cciss_kdump_hard_reset_controller(struct pci_dev *pdev) msleep(CCISS_POST_RESET_PAUSE_MSECS); /* Wait for board to become not ready, then ready. */ - dev_info(&pdev->dev, "Waiting for board to become ready.\n"); + dev_info(&pdev->dev, "Waiting for board to reset.\n"); rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_NOT_READY); - if (rc) /* Don't bail, might be E500, etc. which can't be reset */ - dev_warn(&pdev->dev, - "failed waiting for board to become not ready\n"); + if (rc) { + dev_warn(&pdev->dev, "Failed waiting for board to hard reset." + " Will try soft reset.\n"); + rc = -ENOTSUPP; /* Not expected, but try soft reset later */ + goto unmap_cfgtable; + } rc = cciss_wait_for_board_state(pdev, vaddr, BOARD_READY); if (rc) { dev_warn(&pdev->dev, - "failed waiting for board to become ready\n"); + "failed waiting for board to become ready " + "after hard reset\n"); goto unmap_cfgtable; } - dev_info(&pdev->dev, "board ready.\n"); - /* Controller should be in simple mode at this point. If it's not, - * It means we're on one of those controllers which doesn't support - * the doorbell reset method and on which the PCI power management reset - * method doesn't work (P800, for example.) - * In those cases, don't try to proceed, as it generally doesn't work. - */ - active_transport = readl(&cfgtable->TransportActive); - if (active_transport & PERFORMANT_MODE) { - dev_warn(&pdev->dev, "Unable to successfully reset controller," - " Ignoring controller.\n"); - rc = -ENODEV; + rc = controller_reset_failed(vaddr); + if (rc < 0) + goto unmap_cfgtable; + if (rc) { + dev_warn(&pdev->dev, "Unable to successfully hard reset " + "controller. Will try soft reset.\n"); + rc = -ENOTSUPP; /* Not expected, but try soft reset later */ + } else { + dev_info(&pdev->dev, "Board ready after hard reset.\n"); } unmap_cfgtable: @@ -4574,11 +4754,12 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev) * due to concerns about shared bbwc between 6402/6404 pair. */ if (rc == -ENOTSUPP) - return 0; /* just try to do the kdump anyhow. */ + return rc; /* just try to do the kdump anyhow. */ if (rc) return -ENODEV; /* Now try to get the controller to respond to a no-op */ + dev_warn(&pdev->dev, "Waiting for controller to respond to no-op\n"); for (i = 0; i < CCISS_POST_RESET_NOOP_RETRIES; i++) { if (cciss_noop(pdev) == 0) break; @@ -4591,6 +4772,148 @@ static __devinit int cciss_init_reset_devices(struct pci_dev *pdev) return 0; } +static __devinit int cciss_allocate_cmd_pool(ctlr_info_t *h) +{ + h->cmd_pool_bits = kmalloc( + DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG) * + sizeof(unsigned long), GFP_KERNEL); + h->cmd_pool = pci_alloc_consistent(h->pdev, + h->nr_cmds * sizeof(CommandList_struct), + &(h->cmd_pool_dhandle)); + h->errinfo_pool = pci_alloc_consistent(h->pdev, + h->nr_cmds * sizeof(ErrorInfo_struct), + &(h->errinfo_pool_dhandle)); + if ((h->cmd_pool_bits == NULL) + || (h->cmd_pool == NULL) + || (h->errinfo_pool == NULL)) { + dev_err(&h->pdev->dev, "out of memory"); + return -ENOMEM; + } + return 0; +} + +static __devinit int cciss_allocate_scatterlists(ctlr_info_t *h) +{ + int i; + + /* zero it, so that on free we need not know how many were alloc'ed */ + h->scatter_list = kzalloc(h->max_commands * + sizeof(struct scatterlist *), GFP_KERNEL); + if (!h->scatter_list) + return -ENOMEM; + + for (i = 0; i < h->nr_cmds; i++) { + h->scatter_list[i] = kmalloc(sizeof(struct scatterlist) * + h->maxsgentries, GFP_KERNEL); + if (h->scatter_list[i] == NULL) { + dev_err(&h->pdev->dev, "could not allocate " + "s/g lists\n"); + return -ENOMEM; + } + } + return 0; +} + +static void cciss_free_scatterlists(ctlr_info_t *h) +{ + int i; + + if (h->scatter_list) { + for (i = 0; i < h->nr_cmds; i++) + kfree(h->scatter_list[i]); + kfree(h->scatter_list); + } +} + +static void cciss_free_cmd_pool(ctlr_info_t *h) +{ + kfree(h->cmd_pool_bits); + if (h->cmd_pool) + pci_free_consistent(h->pdev, + h->nr_cmds * sizeof(CommandList_struct), + h->cmd_pool, h->cmd_pool_dhandle); + if (h->errinfo_pool) + pci_free_consistent(h->pdev, + h->nr_cmds * sizeof(ErrorInfo_struct), + h->errinfo_pool, h->errinfo_pool_dhandle); +} + +static int cciss_request_irq(ctlr_info_t *h, + irqreturn_t (*msixhandler)(int, void *), + irqreturn_t (*intxhandler)(int, void *)) +{ + if (h->msix_vector || h->msi_vector) { + if (!request_irq(h->intr[PERF_MODE_INT], msixhandler, + IRQF_DISABLED, h->devname, h)) + return 0; + dev_err(&h->pdev->dev, "Unable to get msi irq %d" + " for %s\n", h->intr[PERF_MODE_INT], + h->devname); + return -1; + } + + if (!request_irq(h->intr[PERF_MODE_INT], intxhandler, + IRQF_DISABLED, h->devname, h)) + return 0; + dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n", + h->intr[PERF_MODE_INT], h->devname); + return -1; +} + +static int __devinit cciss_kdump_soft_reset(ctlr_info_t *h) +{ + if (cciss_send_reset(h, CTLR_LUNID, CCISS_RESET_TYPE_CONTROLLER)) { + dev_warn(&h->pdev->dev, "Resetting array controller failed.\n"); + return -EIO; + } + + dev_info(&h->pdev->dev, "Waiting for board to soft reset.\n"); + if (cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_NOT_READY)) { + dev_warn(&h->pdev->dev, "Soft reset had no effect.\n"); + return -1; + } + + dev_info(&h->pdev->dev, "Board reset, awaiting READY status.\n"); + if (cciss_wait_for_board_state(h->pdev, h->vaddr, BOARD_READY)) { + dev_warn(&h->pdev->dev, "Board failed to become ready " + "after soft reset.\n"); + return -1; + } + + return 0; +} + +static void cciss_undo_allocations_after_kdump_soft_reset(ctlr_info_t *h) +{ + int ctlr = h->ctlr; + + free_irq(h->intr[PERF_MODE_INT], h); +#ifdef CONFIG_PCI_MSI + if (h->msix_vector) + pci_disable_msix(h->pdev); + else if (h->msi_vector) + pci_disable_msi(h->pdev); +#endif /* CONFIG_PCI_MSI */ + cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); + cciss_free_scatterlists(h); + cciss_free_cmd_pool(h); + kfree(h->blockFetchTable); + if (h->reply_pool) + pci_free_consistent(h->pdev, h->max_commands * sizeof(__u64), + h->reply_pool, h->reply_pool_dhandle); + if (h->transtable) + iounmap(h->transtable); + if (h->cfgtable) + iounmap(h->cfgtable); + if (h->vaddr) + iounmap(h->vaddr); + unregister_blkdev(h->major, h->devname); + cciss_destroy_hba_sysfs_entry(h); + pci_release_regions(h->pdev); + kfree(h); + hba[ctlr] = NULL; +} + /* * This is it. Find all the controllers and register them. I really hate * stealing all these major device numbers. @@ -4601,15 +4924,28 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, { int i; int j = 0; - int k = 0; int rc; + int try_soft_reset = 0; int dac, return_code; InquiryData_struct *inq_buff; ctlr_info_t *h; + unsigned long flags; rc = cciss_init_reset_devices(pdev); - if (rc) - return rc; + if (rc) { + if (rc != -ENOTSUPP) + return rc; + /* If the reset fails in a particular way (it has no way to do + * a proper hard reset, so returns -ENOTSUPP) we can try to do + * a soft reset once we get the controller configured up to the + * point that it can accept a command. + */ + try_soft_reset = 1; + rc = 0; + } + +reinit_after_soft_reset: + i = alloc_cciss_hba(pdev); if (i < 0) return -1; @@ -4627,6 +4963,11 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, sprintf(h->devname, "cciss%d", i); h->ctlr = i; + if (cciss_tape_cmds < 2) + cciss_tape_cmds = 2; + if (cciss_tape_cmds > 16) + cciss_tape_cmds = 16; + init_completion(&h->scan_wait); if (cciss_create_hba_sysfs_entry(h)) @@ -4662,62 +5003,20 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, /* make sure the board interrupts are off */ h->access.set_intr_mask(h, CCISS_INTR_OFF); - if (h->msi_vector || h->msix_vector) { - if (request_irq(h->intr[PERF_MODE_INT], - do_cciss_msix_intr, - IRQF_DISABLED, h->devname, h)) { - dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n", - h->intr[PERF_MODE_INT], h->devname); - goto clean2; - } - } else { - if (request_irq(h->intr[PERF_MODE_INT], do_cciss_intx, - IRQF_DISABLED, h->devname, h)) { - dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n", - h->intr[PERF_MODE_INT], h->devname); - goto clean2; - } - } + rc = cciss_request_irq(h, do_cciss_msix_intr, do_cciss_intx); + if (rc) + goto clean2; dev_info(&h->pdev->dev, "%s: <0x%x> at PCI %s IRQ %d%s using DAC\n", h->devname, pdev->device, pci_name(pdev), h->intr[PERF_MODE_INT], dac ? "" : " not"); - h->cmd_pool_bits = - kmalloc(DIV_ROUND_UP(h->nr_cmds, BITS_PER_LONG) - * sizeof(unsigned long), GFP_KERNEL); - h->cmd_pool = (CommandList_struct *) - pci_alloc_consistent(h->pdev, - h->nr_cmds * sizeof(CommandList_struct), - &(h->cmd_pool_dhandle)); - h->errinfo_pool = (ErrorInfo_struct *) - pci_alloc_consistent(h->pdev, - h->nr_cmds * sizeof(ErrorInfo_struct), - &(h->errinfo_pool_dhandle)); - if ((h->cmd_pool_bits == NULL) - || (h->cmd_pool == NULL) - || (h->errinfo_pool == NULL)) { - dev_err(&h->pdev->dev, "out of memory"); + if (cciss_allocate_cmd_pool(h)) goto clean4; - } - /* Need space for temp scatter list */ - h->scatter_list = kmalloc(h->max_commands * - sizeof(struct scatterlist *), - GFP_KERNEL); - if (!h->scatter_list) + if (cciss_allocate_scatterlists(h)) goto clean4; - for (k = 0; k < h->nr_cmds; k++) { - h->scatter_list[k] = kmalloc(sizeof(struct scatterlist) * - h->maxsgentries, - GFP_KERNEL); - if (h->scatter_list[k] == NULL) { - dev_err(&h->pdev->dev, - "could not allocate s/g lists\n"); - goto clean4; - } - } h->cmd_sg_list = cciss_allocate_sg_chain_blocks(h, h->chainsize, h->nr_cmds); if (!h->cmd_sg_list && h->chainsize > 0) @@ -4741,6 +5040,62 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, h->gendisk[j] = NULL; } + /* At this point, the controller is ready to take commands. + * Now, if reset_devices and the hard reset didn't work, try + * the soft reset and see if that works. + */ + if (try_soft_reset) { + + /* This is kind of gross. We may or may not get a completion + * from the soft reset command, and if we do, then the value + * from the fifo may or may not be valid. So, we wait 10 secs + * after the reset throwing away any completions we get during + * that time. Unregister the interrupt handler and register + * fake ones to scoop up any residual completions. + */ + spin_lock_irqsave(&h->lock, flags); + h->access.set_intr_mask(h, CCISS_INTR_OFF); + spin_unlock_irqrestore(&h->lock, flags); + free_irq(h->intr[PERF_MODE_INT], h); + rc = cciss_request_irq(h, cciss_msix_discard_completions, + cciss_intx_discard_completions); + if (rc) { + dev_warn(&h->pdev->dev, "Failed to request_irq after " + "soft reset.\n"); + goto clean4; + } + + rc = cciss_kdump_soft_reset(h); + if (rc) { + dev_warn(&h->pdev->dev, "Soft reset failed.\n"); + goto clean4; + } + + dev_info(&h->pdev->dev, "Board READY.\n"); + dev_info(&h->pdev->dev, + "Waiting for stale completions to drain.\n"); + h->access.set_intr_mask(h, CCISS_INTR_ON); + msleep(10000); + h->access.set_intr_mask(h, CCISS_INTR_OFF); + + rc = controller_reset_failed(h->cfgtable); + if (rc) + dev_info(&h->pdev->dev, + "Soft reset appears to have failed.\n"); + + /* since the controller's reset, we have to go back and re-init + * everything. Easiest to just forget what we've done and do it + * all over again. + */ + cciss_undo_allocations_after_kdump_soft_reset(h); + try_soft_reset = 0; + if (rc) + /* don't go to clean4, we already unallocated */ + return -ENODEV; + + goto reinit_after_soft_reset; + } + cciss_scsi_setup(h); /* Turn the interrupts on so we can service requests */ @@ -4775,21 +5130,9 @@ static int __devinit cciss_init_one(struct pci_dev *pdev, return 1; clean4: - kfree(h->cmd_pool_bits); - /* Free up sg elements */ - for (k-- ; k >= 0; k--) - kfree(h->scatter_list[k]); - kfree(h->scatter_list); + cciss_free_cmd_pool(h); + cciss_free_scatterlists(h); cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); - if (h->cmd_pool) - pci_free_consistent(h->pdev, - h->nr_cmds * sizeof(CommandList_struct), - h->cmd_pool, h->cmd_pool_dhandle); - if (h->errinfo_pool) - pci_free_consistent(h->pdev, - h->nr_cmds * sizeof(ErrorInfo_struct), - h->errinfo_pool, - h->errinfo_pool_dhandle); free_irq(h->intr[PERF_MODE_INT], h); clean2: unregister_blkdev(h->major, h->devname); @@ -4887,16 +5230,16 @@ static void __devexit cciss_remove_one(struct pci_dev *pdev) iounmap(h->cfgtable); iounmap(h->vaddr); - pci_free_consistent(h->pdev, h->nr_cmds * sizeof(CommandList_struct), - h->cmd_pool, h->cmd_pool_dhandle); - pci_free_consistent(h->pdev, h->nr_cmds * sizeof(ErrorInfo_struct), - h->errinfo_pool, h->errinfo_pool_dhandle); - kfree(h->cmd_pool_bits); + cciss_free_cmd_pool(h); /* Free up sg elements */ for (j = 0; j < h->nr_cmds; j++) kfree(h->scatter_list[j]); kfree(h->scatter_list); cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); + kfree(h->blockFetchTable); + if (h->reply_pool) + pci_free_consistent(h->pdev, h->max_commands * sizeof(__u64), + h->reply_pool, h->reply_pool_dhandle); /* * Deliberately omit pci_disable_device(): it does something nasty to * Smart Array controllers that pci_enable_device does not undo diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index 554bbd907d14..16b4d58d84dd 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -200,7 +200,7 @@ struct ctlr_info * the above. */ #define CCISS_BOARD_READY_WAIT_SECS (120) -#define CCISS_BOARD_NOT_READY_WAIT_SECS (10) +#define CCISS_BOARD_NOT_READY_WAIT_SECS (100) #define CCISS_BOARD_READY_POLL_INTERVAL_MSECS (100) #define CCISS_BOARD_READY_ITERATIONS \ ((CCISS_BOARD_READY_WAIT_SECS * 1000) / \ @@ -209,8 +209,9 @@ struct ctlr_info ((CCISS_BOARD_NOT_READY_WAIT_SECS * 1000) / \ CCISS_BOARD_READY_POLL_INTERVAL_MSECS) #define CCISS_POST_RESET_PAUSE_MSECS (3000) -#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (1000) +#define CCISS_POST_RESET_NOOP_INTERVAL_MSECS (4000) #define CCISS_POST_RESET_NOOP_RETRIES (12) +#define CCISS_POST_RESET_NOOP_TIMEOUT_MSECS (10000) /* Send the command to the hardware @@ -239,11 +240,13 @@ static void SA5_intr_mask(ctlr_info_t *h, unsigned long val) { /* Turn interrupts on */ h->interrupts_enabled = 1; writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); } else /* Turn them off */ { h->interrupts_enabled = 0; writel( SA5_INTR_OFF, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); } } /* @@ -257,11 +260,13 @@ static void SA5B_intr_mask(ctlr_info_t *h, unsigned long val) { /* Turn interrupts on */ h->interrupts_enabled = 1; writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); } else /* Turn them off */ { h->interrupts_enabled = 0; writel( SA5B_INTR_OFF, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); } } @@ -271,10 +276,12 @@ static void SA5_performant_intr_mask(ctlr_info_t *h, unsigned long val) if (val) { /* turn on interrupts */ h->interrupts_enabled = 1; writel(0, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); } else { h->interrupts_enabled = 0; writel(SA5_PERF_INTR_OFF, h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); + (void) readl(h->vaddr + SA5_REPLY_INTR_MASK_OFFSET); } } diff --git a/drivers/block/cciss_cmd.h b/drivers/block/cciss_cmd.h index cd441bef031f..d9be6b4d49a6 100644 --- a/drivers/block/cciss_cmd.h +++ b/drivers/block/cciss_cmd.h @@ -53,6 +53,7 @@ #define CFGTBL_ChangeReq 0x00000001l #define CFGTBL_AccCmds 0x00000001l #define DOORBELL_CTLR_RESET 0x00000004l +#define DOORBELL_CTLR_RESET2 0x00000020l #define CFGTBL_Trans_Simple 0x00000002l #define CFGTBL_Trans_Performant 0x00000004l @@ -142,6 +143,14 @@ typedef struct _ReadCapdata_struct_16 #define BMIC_CACHE_FLUSH 0xc2 #define CCISS_CACHE_FLUSH 0x01 /* C2 was already being used by CCISS */ +#define CCISS_ABORT_MSG 0x00 +#define CCISS_RESET_MSG 0x01 +#define CCISS_RESET_TYPE_CONTROLLER 0x00 +#define CCISS_RESET_TYPE_BUS 0x01 +#define CCISS_RESET_TYPE_TARGET 0x03 +#define CCISS_RESET_TYPE_LUN 0x04 +#define CCISS_NOOP_MSG 0x03 + /* Command List Structure */ #define CTLR_LUNID "\0\0\0\0\0\0\0\0" @@ -235,6 +244,8 @@ typedef struct _CfgTable_struct { u8 reserved[0x78 - 0x58]; u32 misc_fw_support; /* offset 0x78 */ #define MISC_FW_DOORBELL_RESET (0x02) +#define MISC_FW_DOORBELL_RESET2 (0x10) + u8 driver_version[32]; } CfgTable_struct; struct TransTable_struct { diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index df793803f5ae..696100241a6f 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -84,7 +84,6 @@ static struct scsi_host_template cciss_driver_template = { .proc_name = "cciss", .proc_info = cciss_scsi_proc_info, .queuecommand = cciss_scsi_queue_command, - .can_queue = SCSI_CCISS_CAN_QUEUE, .this_id = 7, .cmd_per_lun = 1, .use_clustering = DISABLE_CLUSTERING, @@ -108,16 +107,13 @@ struct cciss_scsi_cmd_stack_elem_t { #pragma pack() -#define CMD_STACK_SIZE (SCSI_CCISS_CAN_QUEUE * \ - CCISS_MAX_SCSI_DEVS_PER_HBA + 2) - // plus two for init time usage - #pragma pack(1) struct cciss_scsi_cmd_stack_t { struct cciss_scsi_cmd_stack_elem_t *pool; - struct cciss_scsi_cmd_stack_elem_t *elem[CMD_STACK_SIZE]; + struct cciss_scsi_cmd_stack_elem_t **elem; dma_addr_t cmd_pool_handle; int top; + int nelems; }; #pragma pack() @@ -191,7 +187,7 @@ scsi_cmd_free(ctlr_info_t *h, CommandList_struct *c) sa = h->scsi_ctlr; stk = &sa->cmd_stack; stk->top++; - if (stk->top >= CMD_STACK_SIZE) { + if (stk->top >= stk->nelems) { dev_err(&h->pdev->dev, "scsi_cmd_free called too many times.\n"); BUG(); @@ -206,13 +202,14 @@ scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa) struct cciss_scsi_cmd_stack_t *stk; size_t size; + stk = &sa->cmd_stack; + stk->nelems = cciss_tape_cmds + 2; sa->cmd_sg_list = cciss_allocate_sg_chain_blocks(h, - h->chainsize, CMD_STACK_SIZE); + h->chainsize, stk->nelems); if (!sa->cmd_sg_list && h->chainsize > 0) return -ENOMEM; - stk = &sa->cmd_stack; - size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * CMD_STACK_SIZE; + size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * stk->nelems; /* Check alignment, see cciss_cmd.h near CommandList_struct def. */ BUILD_BUG_ON((sizeof(*stk->pool) % COMMANDLIST_ALIGNMENT) != 0); @@ -221,18 +218,23 @@ scsi_cmd_stack_setup(ctlr_info_t *h, struct cciss_scsi_adapter_data_t *sa) pci_alloc_consistent(h->pdev, size, &stk->cmd_pool_handle); if (stk->pool == NULL) { - cciss_free_sg_chain_blocks(sa->cmd_sg_list, CMD_STACK_SIZE); + cciss_free_sg_chain_blocks(sa->cmd_sg_list, stk->nelems); sa->cmd_sg_list = NULL; return -ENOMEM; } - - for (i=0; i<CMD_STACK_SIZE; i++) { + stk->elem = kmalloc(sizeof(stk->elem[0]) * stk->nelems, GFP_KERNEL); + if (!stk->elem) { + pci_free_consistent(h->pdev, size, stk->pool, + stk->cmd_pool_handle); + return -1; + } + for (i = 0; i < stk->nelems; i++) { stk->elem[i] = &stk->pool[i]; stk->elem[i]->busaddr = (__u32) (stk->cmd_pool_handle + (sizeof(struct cciss_scsi_cmd_stack_elem_t) * i)); stk->elem[i]->cmdindex = i; } - stk->top = CMD_STACK_SIZE-1; + stk->top = stk->nelems-1; return 0; } @@ -245,16 +247,18 @@ scsi_cmd_stack_free(ctlr_info_t *h) sa = h->scsi_ctlr; stk = &sa->cmd_stack; - if (stk->top != CMD_STACK_SIZE-1) { + if (stk->top != stk->nelems-1) { dev_warn(&h->pdev->dev, "bug: %d scsi commands are still outstanding.\n", - CMD_STACK_SIZE - stk->top); + stk->nelems - stk->top); } - size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * CMD_STACK_SIZE; + size = sizeof(struct cciss_scsi_cmd_stack_elem_t) * stk->nelems; pci_free_consistent(h->pdev, size, stk->pool, stk->cmd_pool_handle); stk->pool = NULL; - cciss_free_sg_chain_blocks(sa->cmd_sg_list, CMD_STACK_SIZE); + cciss_free_sg_chain_blocks(sa->cmd_sg_list, stk->nelems); + kfree(stk->elem); + stk->elem = NULL; } #if 0 @@ -859,6 +863,7 @@ cciss_scsi_detect(ctlr_info_t *h) sh->io_port = 0; // good enough? FIXME, sh->n_io_port = 0; // I don't think we use these two... sh->this_id = SELF_SCSI_ID; + sh->can_queue = cciss_tape_cmds; sh->sg_tablesize = h->maxsgentries; sh->max_cmd_len = MAX_COMMAND_SIZE; diff --git a/drivers/block/cciss_scsi.h b/drivers/block/cciss_scsi.h index 6d5822fe851a..e71d986727ca 100644 --- a/drivers/block/cciss_scsi.h +++ b/drivers/block/cciss_scsi.h @@ -36,13 +36,9 @@ addressible natively, and may in fact turn out to be not scsi at all. */ -#define SCSI_CCISS_CAN_QUEUE 2 /* -Note, cmd_per_lun could give us some trouble, so I'm setting it very low. -Likewise, SCSI_CCISS_CAN_QUEUE is set very conservatively. - If the upper scsi layer tries to track how many commands we have outstanding, it will be operating under the misapprehension that it is the only one sending us requests. We also have the block interface, diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index c6828b68d77b..09ef9a878ef0 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -28,7 +28,7 @@ #include "drbd_int.h" #include "drbd_wrappers.h" -/* We maintain a trivial check sum in our on disk activity log. +/* We maintain a trivial checksum in our on disk activity log. * With that we can ensure correct operation even when the storage * device might do a partial (last) sector write while losing power. */ diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 76210ba401ac..f440a02dfdb1 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -74,7 +74,7 @@ * as we are "attached" to a local disk, which at 32 GiB for 1PiB storage * seems excessive. * - * We plan to reduce the amount of in-core bitmap pages by pageing them in + * We plan to reduce the amount of in-core bitmap pages by paging them in * and out against their on-disk location as necessary, but need to make * sure we don't cause too much meta data IO, and must not deadlock in * tight memory situations. This needs some more work. @@ -200,7 +200,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev) * we if bits have been cleared since last IO. */ #define BM_PAGE_LAZY_WRITEOUT 28 -/* store_page_idx uses non-atomic assingment. It is only used directly after +/* store_page_idx uses non-atomic assignment. It is only used directly after * allocating the page. All other bm_set_page_* and bm_clear_page_* need to * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap * changes) may happen from various contexts, and wait_on_bit/wake_up_bit @@ -318,7 +318,7 @@ static void bm_unmap(unsigned long *p_addr) /* word offset from start of bitmap to word number _in_page_ * modulo longs per page #define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long)) - hm, well, Philipp thinks gcc might not optimze the % into & (... - 1) + hm, well, Philipp thinks gcc might not optimize the % into & (... - 1) so do it explicitly: */ #define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1)) diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index b2699bb2e530..ef2ceed3be4b 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -42,6 +42,7 @@ #include <linux/genhd.h> #include <net/tcp.h> #include <linux/lru_cache.h> +#include <linux/prefetch.h> #ifdef __CHECKER__ # define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) @@ -699,7 +700,7 @@ struct drbd_request { * see drbd_endio_pri(). */ struct bio *private_bio; - struct hlist_node colision; + struct hlist_node collision; sector_t sector; unsigned int size; unsigned int epoch; /* barrier_nr */ @@ -765,7 +766,7 @@ struct digest_info { struct drbd_epoch_entry { struct drbd_work w; - struct hlist_node colision; + struct hlist_node collision; struct drbd_epoch *epoch; /* for writes */ struct drbd_conf *mdev; struct page *pages; @@ -1128,6 +1129,8 @@ struct drbd_conf { int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ int rs_planed; /* resync sectors already planned */ atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ + int peer_max_bio_size; + int local_max_bio_size; }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) @@ -1217,8 +1220,6 @@ extern void drbd_free_resources(struct drbd_conf *mdev); extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, unsigned int set_size); extern void tl_clear(struct drbd_conf *mdev); -enum drbd_req_event; -extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what); extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *); extern void drbd_free_sock(struct drbd_conf *mdev); extern int drbd_send(struct drbd_conf *mdev, struct socket *sock, @@ -1433,6 +1434,7 @@ struct bm_extent { * hash table. */ #define HT_SHIFT 8 #define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT)) +#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12) /* Works always = 4k */ #define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */ @@ -1517,9 +1519,9 @@ extern void drbd_resume_io(struct drbd_conf *mdev); extern char *ppsize(char *buf, unsigned long long size); extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int); enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; -extern enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); +extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local); extern void resync_after_online_grow(struct drbd_conf *); -extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local); +extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev); extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force); @@ -1827,6 +1829,8 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, if (!forcedetach) { if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Local IO failed in %s.\n", where); + if (mdev->state.disk > D_INCONSISTENT) + _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_HARD, NULL); break; } /* NOTE fall through to detach case if forcedetach set */ @@ -2152,6 +2156,10 @@ static inline int get_net_conf(struct drbd_conf *mdev) static inline void put_ldev(struct drbd_conf *mdev) { int i = atomic_dec_return(&mdev->local_cnt); + + /* This may be called from some endio handler, + * so we must not sleep here. */ + __release(local); D_ASSERT(i >= 0); if (i == 0) { diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 5b525c179f39..0358e55356c8 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -745,6 +745,9 @@ is_valid_state(struct drbd_conf *mdev, union drbd_state ns) mdev->agreed_pro_version < 88) rv = SS_NOT_SUPPORTED; + else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) + rv = SS_CONNECTED_OUTDATES; + return rv; } @@ -1565,6 +1568,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, put_ldev(mdev); } + /* Notify peer that I had a local IO error, and did not detached.. */ + if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT) + drbd_send_state(mdev); + /* Disks got bigger while they were detached */ if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { @@ -2064,7 +2071,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl { struct p_sizes p; sector_t d_size, u_size; - int q_order_type; + int q_order_type, max_bio_size; int ok; if (get_ldev_if_state(mdev, D_NEGOTIATING)) { @@ -2072,17 +2079,20 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl d_size = drbd_get_max_capacity(mdev->ldev); u_size = mdev->ldev->dc.disk_size; q_order_type = drbd_queue_order_type(mdev); + max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; + max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE); put_ldev(mdev); } else { d_size = 0; u_size = 0; q_order_type = QUEUE_ORDERED_NONE; + max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ } p.d_size = cpu_to_be64(d_size); p.u_size = cpu_to_be64(u_size); p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev)); - p.max_bio_size = cpu_to_be32(queue_max_hw_sectors(mdev->rq_queue) << 9); + p.max_bio_size = cpu_to_be32(max_bio_size); p.queue_order_type = cpu_to_be16(q_order_type); p.dds_flags = cpu_to_be16(flags); @@ -2722,7 +2732,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) /* double check digest, sometimes buffers have been modified in flight. */ if (dgs > 0 && dgs <= 64) { - /* 64 byte, 512 bit, is the larges digest size + /* 64 byte, 512 bit, is the largest digest size * currently supported in kernel crypto. */ unsigned char digest[64]; drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest); @@ -3041,6 +3051,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) mdev->agreed_pro_version = PRO_VERSION_MAX; mdev->write_ordering = WO_bdev_flush; mdev->resync_wenr = LC_FREE; + mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; + mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE; } void drbd_mdev_cleanup(struct drbd_conf *mdev) @@ -3275,7 +3287,7 @@ static void drbd_delete_device(unsigned int minor) drbd_release_ee_lists(mdev); - /* should be free'd on disconnect? */ + /* should be freed on disconnect? */ kfree(mdev->ee_hash); /* mdev->ee_hash_s = 0; @@ -3415,7 +3427,9 @@ struct drbd_conf *drbd_new_device(unsigned int minor) q->backing_dev_info.congested_data = mdev; blk_queue_make_request(q, drbd_make_request); - blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE >> 9); + /* Setting the max_hw_sectors to an odd value of 8kibyte here + This triggers a max_bio_size message upon first attach or connect */ + blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); blk_queue_bounce_limit(q, BLK_BOUNCE_ANY); blk_queue_merge_bvec(q, drbd_merge_bvec); q->queue_lock = &mdev->req_lock; @@ -3627,7 +3641,8 @@ struct meta_data_on_disk { /* `-- act_log->nr_elements <-- sync_conf.al_extents */ u32 bm_offset; /* offset to the bitmap, from here */ u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */ - u32 reserved_u32[4]; + u32 la_peer_max_bio_size; /* last peer max_bio_size */ + u32 reserved_u32[3]; } __packed; @@ -3668,6 +3683,7 @@ void drbd_md_sync(struct drbd_conf *mdev) buffer->device_uuid = cpu_to_be64(mdev->ldev->md.device_uuid); buffer->bm_offset = cpu_to_be32(mdev->ldev->md.bm_offset); + buffer->la_peer_max_bio_size = cpu_to_be32(mdev->peer_max_bio_size); D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset); sector = mdev->ldev->md.md_offset; @@ -3751,6 +3767,15 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents); bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid); + spin_lock_irq(&mdev->req_lock); + if (mdev->state.conn < C_CONNECTED) { + int peer; + peer = be32_to_cpu(buffer->la_peer_max_bio_size); + peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE); + mdev->peer_max_bio_size = peer; + } + spin_unlock_irq(&mdev->req_lock); + if (mdev->sync_conf.al_extents < 7) mdev->sync_conf.al_extents = 127; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 03b29f78a37d..515bcd948a43 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -272,9 +272,28 @@ static int _try_outdate_peer_async(void *data) { struct drbd_conf *mdev = (struct drbd_conf *)data; enum drbd_disk_state nps; + union drbd_state ns; nps = drbd_try_outdate_peer(mdev); - drbd_request_state(mdev, NS(pdsk, nps)); + + /* Not using + drbd_request_state(mdev, NS(pdsk, nps)); + here, because we might were able to re-establish the connection + in the meantime. This can only partially be solved in the state's + engine is_valid_state() and is_valid_state_transition() + functions. + + nps can be D_INCONSISTENT, D_OUTDATED or D_UNKNOWN. + pdsk == D_INCONSISTENT while conn >= C_CONNECTED is valid, + therefore we have to have the pre state change check here. + */ + spin_lock_irq(&mdev->req_lock); + ns = mdev->state; + if (ns.conn < C_WF_REPORT_PARAMS) { + ns.pdsk = nps; + _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); + } + spin_unlock_irq(&mdev->req_lock); return 0; } @@ -577,7 +596,7 @@ void drbd_resume_io(struct drbd_conf *mdev) * Returns 0 on success, negative return values indicate errors. * You should call drbd_md_sync() after calling this function. */ -enum determine_dev_size drbd_determin_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) +enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local) { sector_t prev_first_sect, prev_size; /* previous meta location */ sector_t la_size; @@ -773,30 +792,78 @@ static int drbd_check_al_size(struct drbd_conf *mdev) return 0; } -void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size) __must_hold(local) +static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size) { struct request_queue * const q = mdev->rq_queue; - struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; - int max_segments = mdev->ldev->dc.max_bio_bvecs; - int max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); + int max_hw_sectors = max_bio_size >> 9; + int max_segments = 0; + + if (get_ldev_if_state(mdev, D_ATTACHING)) { + struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; + + max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); + max_segments = mdev->ldev->dc.max_bio_bvecs; + put_ldev(mdev); + } blk_queue_logical_block_size(q, 512); blk_queue_max_hw_sectors(q, max_hw_sectors); /* This is the workaround for "bio would need to, but cannot, be split" */ blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); blk_queue_segment_boundary(q, PAGE_CACHE_SIZE-1); - blk_queue_stack_limits(q, b); - dev_info(DEV, "max BIO size = %u\n", queue_max_hw_sectors(q) << 9); + if (get_ldev_if_state(mdev, D_ATTACHING)) { + struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; + + blk_queue_stack_limits(q, b); - if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { - dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", - q->backing_dev_info.ra_pages, - b->backing_dev_info.ra_pages); - q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; + if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { + dev_info(DEV, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", + q->backing_dev_info.ra_pages, + b->backing_dev_info.ra_pages); + q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; + } + put_ldev(mdev); } } +void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) +{ + int now, new, local, peer; + + now = queue_max_hw_sectors(mdev->rq_queue) << 9; + local = mdev->local_max_bio_size; /* Eventually last known value, from volatile memory */ + peer = mdev->peer_max_bio_size; /* Eventually last known value, from meta data */ + + if (get_ldev_if_state(mdev, D_ATTACHING)) { + local = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; + mdev->local_max_bio_size = local; + put_ldev(mdev); + } + + /* We may ignore peer limits if the peer is modern enough. + Because new from 8.3.8 onwards the peer can use multiple + BIOs for a single peer_request */ + if (mdev->state.conn >= C_CONNECTED) { + if (mdev->agreed_pro_version < 94) + peer = mdev->peer_max_bio_size; + else if (mdev->agreed_pro_version == 94) + peer = DRBD_MAX_SIZE_H80_PACKET; + else /* drbd 8.3.8 onwards */ + peer = DRBD_MAX_BIO_SIZE; + } + + new = min_t(int, local, peer); + + if (mdev->state.role == R_PRIMARY && new < now) + dev_err(DEV, "ASSERT FAILED new < now; (%d < %d)\n", new, now); + + if (new != now) + dev_info(DEV, "max BIO size = %u\n", new); + + drbd_setup_queue_param(mdev, new); +} + /* serialize deconfig (worker exiting, doing cleanup) * and reconfig (drbdsetup disk, drbdsetup net) * @@ -865,7 +932,6 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp struct block_device *bdev; struct lru_cache *resync_lru = NULL; union drbd_state ns, os; - unsigned int max_bio_size; enum drbd_state_rv rv; int cp_discovered = 0; int logical_block_size; @@ -1117,20 +1183,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp mdev->read_cnt = 0; mdev->writ_cnt = 0; - max_bio_size = DRBD_MAX_BIO_SIZE; - if (mdev->state.conn == C_CONNECTED) { - /* We are Primary, Connected, and now attach a new local - * backing store. We must not increase the user visible maximum - * bio size on this device to something the peer may not be - * able to handle. */ - if (mdev->agreed_pro_version < 94) - max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9; - else if (mdev->agreed_pro_version == 94) - max_bio_size = DRBD_MAX_SIZE_H80_PACKET; - /* else: drbd 8.3.9 and later, stay with default */ - } - - drbd_setup_queue_param(mdev, max_bio_size); + drbd_reconsider_max_bio_size(mdev); /* If I am currently not R_PRIMARY, * but meta data primary indicator is set, @@ -1152,7 +1205,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND)) set_bit(USE_DEGR_WFC_T, &mdev->flags); - dd = drbd_determin_dev_size(mdev, 0); + dd = drbd_determine_dev_size(mdev, 0); if (dd == dev_size_error) { retcode = ERR_NOMEM_BITMAP; goto force_diskless_dec; @@ -1281,11 +1334,19 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, struct drbd_nl_cfg_reply *reply) { + enum drbd_ret_code retcode; + int ret; drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */ - reply->ret_code = drbd_request_state(mdev, NS(disk, D_DISKLESS)); - if (mdev->state.disk == D_DISKLESS) - wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + retcode = drbd_request_state(mdev, NS(disk, D_FAILED)); + /* D_FAILED will transition to DISKLESS. */ + ret = wait_event_interruptible(mdev->misc_wait, + mdev->state.disk != D_FAILED); drbd_resume_io(mdev); + if ((int)retcode == (int)SS_IS_DISKLESS) + retcode = SS_NOTHING_TO_DO; + if (ret) + retcode = ERR_INTR; + reply->ret_code = retcode; return 0; } @@ -1658,7 +1719,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, mdev->ldev->dc.disk_size = (sector_t)rs.resize_size; ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); - dd = drbd_determin_dev_size(mdev, ddsf); + dd = drbd_determine_dev_size(mdev, ddsf); drbd_md_sync(mdev); put_ldev(mdev); if (dd == dev_size_error) { diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index fd26666c0b08..25d32c5aa50a 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -333,7 +333,7 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, if (!page) goto fail; - INIT_HLIST_NODE(&e->colision); + INIT_HLIST_NODE(&e->collision); e->epoch = NULL; e->mdev = mdev; e->pages = page; @@ -356,7 +356,7 @@ void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int i kfree(e->digest); drbd_pp_free(mdev, e->pages, is_net); D_ASSERT(atomic_read(&e->pending_bios) == 0); - D_ASSERT(hlist_unhashed(&e->colision)); + D_ASSERT(hlist_unhashed(&e->collision)); mempool_free(e, drbd_ee_mempool); } @@ -787,7 +787,7 @@ static int drbd_connect(struct drbd_conf *mdev) } if (sock && msock) { - schedule_timeout_interruptible(HZ / 10); + schedule_timeout_interruptible(mdev->net_conf->ping_timeo*HZ/10); ok = drbd_socket_okay(mdev, &sock); ok = drbd_socket_okay(mdev, &msock) && ok; if (ok) @@ -899,11 +899,6 @@ retry: drbd_thread_start(&mdev->asender); - if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) { - drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET); - put_ldev(mdev); - } - if (drbd_send_protocol(mdev) == -1) return -1; drbd_send_sync_param(mdev, &mdev->sync_conf); @@ -1418,7 +1413,7 @@ static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int u sector_t sector = e->sector; int ok; - D_ASSERT(hlist_unhashed(&e->colision)); + D_ASSERT(hlist_unhashed(&e->collision)); if (likely((e->flags & EE_WAS_ERROR) == 0)) { drbd_set_in_sync(mdev, sector, e->size); @@ -1487,7 +1482,7 @@ static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsi return false; } - /* hlist_del(&req->colision) is done in _req_may_be_done, to avoid + /* hlist_del(&req->collision) is done in _req_may_be_done, to avoid * special casing it there for the various failure cases. * still no race with drbd_fail_pending_reads */ ok = recv_dless_read(mdev, req, sector, data_size); @@ -1558,11 +1553,11 @@ static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel) * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */ if (mdev->net_conf->two_primaries) { spin_lock_irq(&mdev->req_lock); - D_ASSERT(!hlist_unhashed(&e->colision)); - hlist_del_init(&e->colision); + D_ASSERT(!hlist_unhashed(&e->collision)); + hlist_del_init(&e->collision); spin_unlock_irq(&mdev->req_lock); } else { - D_ASSERT(hlist_unhashed(&e->colision)); + D_ASSERT(hlist_unhashed(&e->collision)); } drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); @@ -1579,8 +1574,8 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u ok = drbd_send_ack(mdev, P_DISCARD_ACK, e); spin_lock_irq(&mdev->req_lock); - D_ASSERT(!hlist_unhashed(&e->colision)); - hlist_del_init(&e->colision); + D_ASSERT(!hlist_unhashed(&e->collision)); + hlist_del_init(&e->collision); spin_unlock_irq(&mdev->req_lock); dec_unacked(mdev); @@ -1755,7 +1750,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned spin_lock_irq(&mdev->req_lock); - hlist_add_head(&e->colision, ee_hash_slot(mdev, sector)); + hlist_add_head(&e->collision, ee_hash_slot(mdev, sector)); #define OVERLAPS overlaps(i->sector, i->size, sector, size) slot = tl_hash_slot(mdev, sector); @@ -1765,7 +1760,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int have_conflict = 0; prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE); - hlist_for_each_entry(i, n, slot, colision) { + hlist_for_each_entry(i, n, slot, collision) { if (OVERLAPS) { /* only ALERT on first iteration, * we may be woken up early... */ @@ -1804,7 +1799,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned } if (signal_pending(current)) { - hlist_del_init(&e->colision); + hlist_del_init(&e->collision); spin_unlock_irq(&mdev->req_lock); @@ -1862,7 +1857,7 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned dev_err(DEV, "submit failed, triggering re-connect\n"); spin_lock_irq(&mdev->req_lock); list_del(&e->w.list); - hlist_del_init(&e->colision); + hlist_del_init(&e->collision); spin_unlock_irq(&mdev->req_lock); if (e->flags & EE_CALL_AL_COMPLETE_IO) drbd_al_complete_io(mdev, e->sector); @@ -2916,12 +2911,6 @@ disconnect: return false; } -static void drbd_setup_order_type(struct drbd_conf *mdev, int peer) -{ - /* sorry, we currently have no working implementation - * of distributed TCQ */ -} - /* warn if the arguments differ by more than 12.5% */ static void warn_if_differ_considerably(struct drbd_conf *mdev, const char *s, sector_t a, sector_t b) @@ -2939,7 +2928,6 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned { struct p_sizes *p = &mdev->data.rbuf.sizes; enum determine_dev_size dd = unchanged; - unsigned int max_bio_size; sector_t p_size, p_usize, my_usize; int ldsc = 0; /* local disk size changed */ enum dds_flags ddsf; @@ -2994,7 +2982,7 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned ddsf = be16_to_cpu(p->dds_flags); if (get_ldev(mdev)) { - dd = drbd_determin_dev_size(mdev, ddsf); + dd = drbd_determine_dev_size(mdev, ddsf); put_ldev(mdev); if (dd == dev_size_error) return false; @@ -3004,23 +2992,15 @@ static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned drbd_set_my_capacity(mdev, p_size); } + mdev->peer_max_bio_size = be32_to_cpu(p->max_bio_size); + drbd_reconsider_max_bio_size(mdev); + if (get_ldev(mdev)) { if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) { mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev); ldsc = 1; } - if (mdev->agreed_pro_version < 94) - max_bio_size = be32_to_cpu(p->max_bio_size); - else if (mdev->agreed_pro_version == 94) - max_bio_size = DRBD_MAX_SIZE_H80_PACKET; - else /* drbd 8.3.8 onwards */ - max_bio_size = DRBD_MAX_BIO_SIZE; - - if (max_bio_size != queue_max_hw_sectors(mdev->rq_queue) << 9) - drbd_setup_queue_param(mdev, max_bio_size); - - drbd_setup_order_type(mdev, be16_to_cpu(p->queue_order_type)); put_ldev(mdev); } @@ -4275,7 +4255,7 @@ static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev, struct hlist_node *n; struct drbd_request *req; - hlist_for_each_entry(req, n, slot, colision) { + hlist_for_each_entry(req, n, slot, collision) { if ((unsigned long)req == (unsigned long)id) { if (req->sector != sector) { dev_err(DEV, "_ack_id_to_req: found req %p but it has " @@ -4554,6 +4534,7 @@ int drbd_asender(struct drbd_thread *thi) int received = 0; int expect = sizeof(struct p_header80); int empty; + int ping_timeout_active = 0; sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev)); @@ -4566,6 +4547,7 @@ int drbd_asender(struct drbd_thread *thi) ERR_IF(!drbd_send_ping(mdev)) goto reconnect; mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*HZ/10; + ping_timeout_active = 1; } /* conditionally cork; @@ -4620,8 +4602,7 @@ int drbd_asender(struct drbd_thread *thi) dev_err(DEV, "meta connection shut down by peer.\n"); goto reconnect; } else if (rv == -EAGAIN) { - if (mdev->meta.socket->sk->sk_rcvtimeo == - mdev->net_conf->ping_timeo*HZ/10) { + if (ping_timeout_active) { dev_err(DEV, "PingAck did not arrive in time.\n"); goto reconnect; } @@ -4660,6 +4641,11 @@ int drbd_asender(struct drbd_thread *thi) if (!cmd->process(mdev, h)) goto reconnect; + /* the idle_timeout (ping-int) + * has been restored in got_PingAck() */ + if (cmd == get_asender_cmd(P_PING_ACK)) + ping_timeout_active = 0; + buf = h; received = 0; expect = sizeof(struct p_header80); diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 5c0c8be1bb0a..3424d675b769 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -163,7 +163,7 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev, * they must have been failed on the spot */ #define OVERLAPS overlaps(sector, size, i->sector, i->size) slot = tl_hash_slot(mdev, sector); - hlist_for_each_entry(i, n, slot, colision) { + hlist_for_each_entry(i, n, slot, collision) { if (OVERLAPS) { dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; " "other: %p %llus +%u\n", @@ -187,7 +187,7 @@ static void _about_to_complete_local_write(struct drbd_conf *mdev, #undef OVERLAPS #define OVERLAPS overlaps(sector, size, e->sector, e->size) slot = ee_hash_slot(mdev, req->sector); - hlist_for_each_entry(e, n, slot, colision) { + hlist_for_each_entry(e, n, slot, collision) { if (OVERLAPS) { wake_up(&mdev->misc_wait); break; @@ -260,8 +260,8 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) /* remove the request from the conflict detection * respective block_id verification hash */ - if (!hlist_unhashed(&req->colision)) - hlist_del(&req->colision); + if (!hlist_unhashed(&req->collision)) + hlist_del(&req->collision); else D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0); @@ -329,7 +329,7 @@ static int _req_conflicts(struct drbd_request *req) struct hlist_node *n; struct hlist_head *slot; - D_ASSERT(hlist_unhashed(&req->colision)); + D_ASSERT(hlist_unhashed(&req->collision)); if (!get_net_conf(mdev)) return 0; @@ -341,7 +341,7 @@ static int _req_conflicts(struct drbd_request *req) #define OVERLAPS overlaps(i->sector, i->size, sector, size) slot = tl_hash_slot(mdev, sector); - hlist_for_each_entry(i, n, slot, colision) { + hlist_for_each_entry(i, n, slot, collision) { if (OVERLAPS) { dev_alert(DEV, "%s[%u] Concurrent local write detected! " "[DISCARD L] new: %llus +%u; " @@ -359,7 +359,7 @@ static int _req_conflicts(struct drbd_request *req) #undef OVERLAPS #define OVERLAPS overlaps(e->sector, e->size, sector, size) slot = ee_hash_slot(mdev, sector); - hlist_for_each_entry(e, n, slot, colision) { + hlist_for_each_entry(e, n, slot, collision) { if (OVERLAPS) { dev_alert(DEV, "%s[%u] Concurrent remote write detected!" " [DISCARD L] new: %llus +%u; " @@ -491,7 +491,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, /* so we can verify the handle in the answer packet * corresponding hlist_del is in _req_may_be_done() */ - hlist_add_head(&req->colision, ar_hash_slot(mdev, req->sector)); + hlist_add_head(&req->collision, ar_hash_slot(mdev, req->sector)); set_bit(UNPLUG_REMOTE, &mdev->flags); @@ -507,7 +507,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, /* assert something? */ /* from drbd_make_request_common only */ - hlist_add_head(&req->colision, tl_hash_slot(mdev, req->sector)); + hlist_add_head(&req->collision, tl_hash_slot(mdev, req->sector)); /* corresponding hlist_del is in _req_may_be_done() */ /* NOTE @@ -1033,7 +1033,7 @@ fail_conflicting: err = 0; fail_free_complete: - if (rw == WRITE && local) + if (req->rq_state & RQ_IN_ACT_LOG) drbd_al_complete_io(mdev, sector); fail_and_free_req: if (local) { diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 32e2c3e6a813..68a234a5fdc5 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -256,7 +256,7 @@ static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev, struct hlist_node *n; struct drbd_request *req; - hlist_for_each_entry(req, n, slot, colision) { + hlist_for_each_entry(req, n, slot, collision) { if ((unsigned long)req == (unsigned long)id) { D_ASSERT(req->sector == sector); return req; @@ -291,7 +291,7 @@ static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev, req->epoch = 0; req->sector = bio_src->bi_sector; req->size = bio_src->bi_size; - INIT_HLIST_NODE(&req->colision); + INIT_HLIST_NODE(&req->collision); INIT_LIST_HEAD(&req->tl_requests); INIT_LIST_HEAD(&req->w.list); } @@ -323,6 +323,7 @@ extern int __req_mod(struct drbd_request *req, enum drbd_req_event what, extern void complete_master_bio(struct drbd_conf *mdev, struct bio_and_error *m); extern void request_timer_fn(unsigned long data); +extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what); /* use this if you don't want to deal with calling complete_master_bio() * outside the spinlock, e.g. when walking some list on cleanup. */ diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index f7e6c92f8d03..4d76b06b6b20 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -126,7 +126,7 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo list_del(&e->w.list); /* has been on active_ee or sync_ee */ list_add_tail(&e->w.list, &mdev->done_ee); - /* No hlist_del_init(&e->colision) here, we did not send the Ack yet, + /* No hlist_del_init(&e->collision) here, we did not send the Ack yet, * neither did we wake possibly waiting conflicting requests. * done from "drbd_process_done_ee" within the appropriate w.cb * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */ @@ -297,42 +297,48 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio * crypto_hash_final(&desc, digest); } -static int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) +/* TODO merge common code with w_e_end_ov_req */ +int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); int digest_size; void *digest; - int ok; + int ok = 1; D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef); - if (unlikely(cancel)) { - drbd_free_ee(mdev, e); - return 1; - } + if (unlikely(cancel)) + goto out; - if (likely((e->flags & EE_WAS_ERROR) == 0)) { - digest_size = crypto_hash_digestsize(mdev->csums_tfm); - digest = kmalloc(digest_size, GFP_NOIO); - if (digest) { - drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); + if (likely((e->flags & EE_WAS_ERROR) != 0)) + goto out; - inc_rs_pending(mdev); - ok = drbd_send_drequest_csum(mdev, - e->sector, - e->size, - digest, - digest_size, - P_CSUM_RS_REQUEST); - kfree(digest); - } else { - dev_err(DEV, "kmalloc() of digest failed.\n"); - ok = 0; - } - } else - ok = 1; + digest_size = crypto_hash_digestsize(mdev->csums_tfm); + digest = kmalloc(digest_size, GFP_NOIO); + if (digest) { + sector_t sector = e->sector; + unsigned int size = e->size; + drbd_csum_ee(mdev, mdev->csums_tfm, e, digest); + /* Free e and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_pp_alloc due to pp_in_use > max_buffers. */ + drbd_free_ee(mdev, e); + e = NULL; + inc_rs_pending(mdev); + ok = drbd_send_drequest_csum(mdev, sector, size, + digest, digest_size, + P_CSUM_RS_REQUEST); + kfree(digest); + } else { + dev_err(DEV, "kmalloc() of digest failed.\n"); + ok = 0; + } - drbd_free_ee(mdev, e); +out: + if (e) + drbd_free_ee(mdev, e); if (unlikely(!ok)) dev_err(DEV, "drbd_send_drequest(..., csum) failed\n"); @@ -834,7 +840,7 @@ int drbd_resync_finished(struct drbd_conf *mdev) const int ratio = (t == 0) ? 0 : (t < 100000) ? ((s*100)/t) : (s/(t/100)); - dev_info(DEV, "%u %% had equal check sums, eliminated: %luK; " + dev_info(DEV, "%u %% had equal checksums, eliminated: %luK; " "transferred %luK total %luK\n", ratio, Bit2KB(mdev->rs_same_csum), @@ -1071,9 +1077,12 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) return ok; } +/* TODO merge common code with w_e_send_csum */ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); + sector_t sector = e->sector; + unsigned int size = e->size; int digest_size; void *digest; int ok = 1; @@ -1093,17 +1102,25 @@ int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel) else memset(digest, 0, digest_size); + /* Free e and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_pp_alloc due to pp_in_use > max_buffers. */ + drbd_free_ee(mdev, e); + e = NULL; inc_rs_pending(mdev); - ok = drbd_send_drequest_csum(mdev, e->sector, e->size, - digest, digest_size, P_OV_REPLY); + ok = drbd_send_drequest_csum(mdev, sector, size, + digest, digest_size, + P_OV_REPLY); if (!ok) dec_rs_pending(mdev); kfree(digest); out: - drbd_free_ee(mdev, e); + if (e) + drbd_free_ee(mdev, e); dec_unacked(mdev); - return ok; } @@ -1122,8 +1139,10 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) { struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w); struct digest_info *di; - int digest_size; void *digest; + sector_t sector = e->sector; + unsigned int size = e->size; + int digest_size; int ok, eq = 0; if (unlikely(cancel)) { @@ -1153,16 +1172,21 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel) } } - dec_unacked(mdev); + /* Free e and pages before send. + * In case we block on congestion, we could otherwise run into + * some distributed deadlock, if the other side blocks on + * congestion as well, because our receiver blocks in + * drbd_pp_alloc due to pp_in_use > max_buffers. */ + drbd_free_ee(mdev, e); if (!eq) - drbd_ov_oos_found(mdev, e->sector, e->size); + drbd_ov_oos_found(mdev, sector, size); else ov_oos_print(mdev); - ok = drbd_send_ack_ex(mdev, P_OV_RESULT, e->sector, e->size, + ok = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, eq ? ID_IN_SYNC : ID_OUT_OF_SYNC); - drbd_free_ee(mdev, e); + dec_unacked(mdev); --mdev->ov_left; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index a076a14ca72d..76c8da78212b 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -1540,9 +1540,9 @@ static const struct block_device_operations lo_fops = { * And now the modules code and kernel interface. */ static int max_loop; -module_param(max_loop, int, 0); +module_param(max_loop, int, S_IRUGO); MODULE_PARM_DESC(max_loop, "Maximum number of loop devices"); -module_param(max_part, int, 0); +module_param(max_part, int, S_IRUGO); MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device"); MODULE_LICENSE("GPL"); MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR); @@ -1658,7 +1658,7 @@ static struct kobject *loop_probe(dev_t dev, int *part, void *data) struct kobject *kobj; mutex_lock(&loop_devices_mutex); - lo = loop_init_one(dev & MINORMASK); + lo = loop_init_one(MINOR(dev) >> part_shift); kobj = lo ? get_disk(lo->lo_disk) : ERR_PTR(-ENOMEM); mutex_unlock(&loop_devices_mutex); @@ -1688,18 +1688,32 @@ static int __init loop_init(void) */ part_shift = 0; - if (max_part > 0) + if (max_part > 0) { part_shift = fls(max_part); + /* + * Adjust max_part according to part_shift as it is exported + * to user space so that user can decide correct minor number + * if [s]he want to create more devices. + * + * Note that -1 is required because partition 0 is reserved + * for the whole disk. + */ + max_part = (1UL << part_shift) - 1; + } + + if ((1UL << part_shift) > DISK_MAX_PARTS) + return -EINVAL; + if (max_loop > 1UL << (MINORBITS - part_shift)) return -EINVAL; if (max_loop) { nr = max_loop; - range = max_loop; + range = max_loop << part_shift; } else { nr = 8; - range = 1UL << (MINORBITS - part_shift); + range = 1UL << MINORBITS; } if (register_blkdev(LOOP_MAJOR, "loop")) @@ -1738,7 +1752,7 @@ static void __exit loop_exit(void) unsigned long range; struct loop_device *lo, *next; - range = max_loop ? max_loop : 1UL << (MINORBITS - part_shift); + range = max_loop ? max_loop << part_shift : 1UL << MINORBITS; list_for_each_entry_safe(lo, next, &loop_devices, lo_list) loop_del_one(lo); diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 8690e31d9932..a0aabd904a51 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -320,6 +320,8 @@ static void pcd_init_units(void) disk->first_minor = unit; strcpy(disk->disk_name, cd->name); /* umm... */ disk->fops = &pcd_bdops; + disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; + disk->events = DISK_EVENT_MEDIA_CHANGE; } } diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 9712fad82bc6..1278098624e6 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -1191,14 +1191,19 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev, static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) { struct rbd_device *dev = (struct rbd_device *)data; + int rc; + if (!dev) return; dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, notify_id, (int)opcode); mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - __rbd_update_snaps(dev); + rc = __rbd_update_snaps(dev); mutex_unlock(&ctl_mutex); + if (rc) + pr_warning(DRV_NAME "%d got notification but failed to update" + " snaps: %d\n", dev->major, rc); rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name); } @@ -1597,7 +1602,7 @@ static int rbd_header_add_snap(struct rbd_device *dev, int name_len = strlen(snap_name); u64 new_snapid; int ret; - void *data, *data_start, *data_end; + void *data, *p, *e; u64 ver; /* we should create a snapshot only if we're pointing at the head */ @@ -1614,16 +1619,16 @@ static int rbd_header_add_snap(struct rbd_device *dev, if (!data) return -ENOMEM; - data_start = data; - data_end = data + name_len + 16; + p = data; + e = data + name_len + 16; - ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad); - ceph_encode_64_safe(&data, data_end, new_snapid, bad); + ceph_encode_string_safe(&p, e, snap_name, name_len, bad); + ceph_encode_64_safe(&p, e, new_snapid, bad); ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add", - data_start, data - data_start, &ver); + data, p - data, &ver); - kfree(data_start); + kfree(data); if (ret < 0) return ret; @@ -1659,6 +1664,9 @@ static int __rbd_update_snaps(struct rbd_device *rbd_dev) if (ret < 0) return ret; + /* resized? */ + set_capacity(rbd_dev->disk, h.image_size / 512ULL); + down_write(&rbd_dev->header.snap_rwsem); snap_seq = rbd_dev->header.snapc->seq; @@ -1716,7 +1724,8 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) if (!disk) goto out; - sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id); + snprintf(disk->disk_name, sizeof(disk->disk_name), DRV_NAME "%d", + rbd_dev->id); disk->major = rbd_dev->major; disk->first_minor = 0; disk->fops = &rbd_bd_ops; diff --git a/drivers/block/xen-blkback/Makefile b/drivers/block/xen-blkback/Makefile new file mode 100644 index 000000000000..e491c1b76878 --- /dev/null +++ b/drivers/block/xen-blkback/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o + +xen-blkback-y := blkback.o xenbus.o diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c new file mode 100644 index 000000000000..c73910cc28c9 --- /dev/null +++ b/drivers/block/xen-blkback/blkback.c @@ -0,0 +1,824 @@ +/****************************************************************************** + * + * Back-end of the driver for virtual block devices. This portion of the + * driver exports a 'unified' block-device interface that can be accessed + * by any operating system that implements a compatible front end. A + * reference front-end implementation can be found in: + * drivers/block/xen-blkfront.c + * + * Copyright (c) 2003-2004, Keir Fraser & Steve Hand + * Copyright (c) 2005, Christopher Clark + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include <linux/spinlock.h> +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/delay.h> +#include <linux/freezer.h> + +#include <xen/events.h> +#include <xen/page.h> +#include <asm/xen/hypervisor.h> +#include <asm/xen/hypercall.h> +#include "common.h" + +/* + * These are rather arbitrary. They are fairly large because adjacent requests + * pulled from a communication ring are quite likely to end up being part of + * the same scatter/gather request at the disc. + * + * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** + * + * This will increase the chances of being able to write whole tracks. + * 64 should be enough to keep us competitive with Linux. + */ +static int xen_blkif_reqs = 64; +module_param_named(reqs, xen_blkif_reqs, int, 0); +MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); + +/* Run-time switchable: /sys/module/blkback/parameters/ */ +static unsigned int log_stats; +module_param(log_stats, int, 0644); + +/* + * Each outstanding request that we've passed to the lower device layers has a + * 'pending_req' allocated to it. Each buffer_head that completes decrements + * the pendcnt towards zero. When it hits zero, the specified domain has a + * response queued for it, with the saved 'id' passed back. + */ +struct pending_req { + struct xen_blkif *blkif; + u64 id; + int nr_pages; + atomic_t pendcnt; + unsigned short operation; + int status; + struct list_head free_list; +}; + +#define BLKBACK_INVALID_HANDLE (~0) + +struct xen_blkbk { + struct pending_req *pending_reqs; + /* List of all 'pending_req' available */ + struct list_head pending_free; + /* And its spinlock. */ + spinlock_t pending_free_lock; + wait_queue_head_t pending_free_wq; + /* The list of all pages that are available. */ + struct page **pending_pages; + /* And the grant handles that are available. */ + grant_handle_t *pending_grant_handles; +}; + +static struct xen_blkbk *blkbk; + +/* + * Little helpful macro to figure out the index and virtual address of the + * pending_pages[..]. For each 'pending_req' we have have up to + * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through + * 10 and would index in the pending_pages[..]. + */ +static inline int vaddr_pagenr(struct pending_req *req, int seg) +{ + return (req - blkbk->pending_reqs) * + BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; +} + +#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)] + +static inline unsigned long vaddr(struct pending_req *req, int seg) +{ + unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg)); + return (unsigned long)pfn_to_kaddr(pfn); +} + +#define pending_handle(_req, _seg) \ + (blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)]) + + +static int do_block_io_op(struct xen_blkif *blkif); +static int dispatch_rw_block_io(struct xen_blkif *blkif, + struct blkif_request *req, + struct pending_req *pending_req); +static void make_response(struct xen_blkif *blkif, u64 id, + unsigned short op, int st); + +/* + * Retrieve from the 'pending_reqs' a free pending_req structure to be used. + */ +static struct pending_req *alloc_req(void) +{ + struct pending_req *req = NULL; + unsigned long flags; + + spin_lock_irqsave(&blkbk->pending_free_lock, flags); + if (!list_empty(&blkbk->pending_free)) { + req = list_entry(blkbk->pending_free.next, struct pending_req, + free_list); + list_del(&req->free_list); + } + spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); + return req; +} + +/* + * Return the 'pending_req' structure back to the freepool. We also + * wake up the thread if it was waiting for a free page. + */ +static void free_req(struct pending_req *req) +{ + unsigned long flags; + int was_empty; + + spin_lock_irqsave(&blkbk->pending_free_lock, flags); + was_empty = list_empty(&blkbk->pending_free); + list_add(&req->free_list, &blkbk->pending_free); + spin_unlock_irqrestore(&blkbk->pending_free_lock, flags); + if (was_empty) + wake_up(&blkbk->pending_free_wq); +} + +/* + * Routines for managing virtual block devices (vbds). + */ +static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif, + int operation) +{ + struct xen_vbd *vbd = &blkif->vbd; + int rc = -EACCES; + + if ((operation != READ) && vbd->readonly) + goto out; + + if (likely(req->nr_sects)) { + blkif_sector_t end = req->sector_number + req->nr_sects; + + if (unlikely(end < req->sector_number)) + goto out; + if (unlikely(end > vbd_sz(vbd))) + goto out; + } + + req->dev = vbd->pdevice; + req->bdev = vbd->bdev; + rc = 0; + + out: + return rc; +} + +static void xen_vbd_resize(struct xen_blkif *blkif) +{ + struct xen_vbd *vbd = &blkif->vbd; + struct xenbus_transaction xbt; + int err; + struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be); + unsigned long long new_size = vbd_sz(vbd); + + pr_info(DRV_PFX "VBD Resize: Domid: %d, Device: (%d, %d)\n", + blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice)); + pr_info(DRV_PFX "VBD Resize: new size %llu\n", new_size); + vbd->size = new_size; +again: + err = xenbus_transaction_start(&xbt); + if (err) { + pr_warn(DRV_PFX "Error starting transaction"); + return; + } + err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", + (unsigned long long)vbd_sz(vbd)); + if (err) { + pr_warn(DRV_PFX "Error writing new size"); + goto abort; + } + /* + * Write the current state; we will use this to synchronize + * the front-end. If the current state is "connected" the + * front-end will get the new size information online. + */ + err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state); + if (err) { + pr_warn(DRV_PFX "Error writing the state"); + goto abort; + } + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + if (err) + pr_warn(DRV_PFX "Error ending transaction"); + return; +abort: + xenbus_transaction_end(xbt, 1); +} + +/* + * Notification from the guest OS. + */ +static void blkif_notify_work(struct xen_blkif *blkif) +{ + blkif->waiting_reqs = 1; + wake_up(&blkif->wq); +} + +irqreturn_t xen_blkif_be_int(int irq, void *dev_id) +{ + blkif_notify_work(dev_id); + return IRQ_HANDLED; +} + +/* + * SCHEDULER FUNCTIONS + */ + +static void print_stats(struct xen_blkif *blkif) +{ + pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d\n", + current->comm, blkif->st_oo_req, + blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req); + blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); + blkif->st_rd_req = 0; + blkif->st_wr_req = 0; + blkif->st_oo_req = 0; +} + +int xen_blkif_schedule(void *arg) +{ + struct xen_blkif *blkif = arg; + struct xen_vbd *vbd = &blkif->vbd; + + xen_blkif_get(blkif); + + while (!kthread_should_stop()) { + if (try_to_freeze()) + continue; + if (unlikely(vbd->size != vbd_sz(vbd))) + xen_vbd_resize(blkif); + + wait_event_interruptible( + blkif->wq, + blkif->waiting_reqs || kthread_should_stop()); + wait_event_interruptible( + blkbk->pending_free_wq, + !list_empty(&blkbk->pending_free) || + kthread_should_stop()); + + blkif->waiting_reqs = 0; + smp_mb(); /* clear flag *before* checking for work */ + + if (do_block_io_op(blkif)) + blkif->waiting_reqs = 1; + + if (log_stats && time_after(jiffies, blkif->st_print)) + print_stats(blkif); + } + + if (log_stats) + print_stats(blkif); + + blkif->xenblkd = NULL; + xen_blkif_put(blkif); + + return 0; +} + +struct seg_buf { + unsigned long buf; + unsigned int nsec; +}; +/* + * Unmap the grant references, and also remove the M2P over-rides + * used in the 'pending_req'. + */ +static void xen_blkbk_unmap(struct pending_req *req) +{ + struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int i, invcount = 0; + grant_handle_t handle; + int ret; + + for (i = 0; i < req->nr_pages; i++) { + handle = pending_handle(req, i); + if (handle == BLKBACK_INVALID_HANDLE) + continue; + gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), + GNTMAP_host_map, handle); + pending_handle(req, i) = BLKBACK_INVALID_HANDLE; + invcount++; + } + + ret = HYPERVISOR_grant_table_op( + GNTTABOP_unmap_grant_ref, unmap, invcount); + BUG_ON(ret); + /* + * Note, we use invcount, so nr->pages, so we can't index + * using vaddr(req, i). + */ + for (i = 0; i < invcount; i++) { + ret = m2p_remove_override( + virt_to_page(unmap[i].host_addr), false); + if (ret) { + pr_alert(DRV_PFX "Failed to remove M2P override for %lx\n", + (unsigned long)unmap[i].host_addr); + continue; + } + } +} + +static int xen_blkbk_map(struct blkif_request *req, + struct pending_req *pending_req, + struct seg_buf seg[]) +{ + struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + int i; + int nseg = req->nr_segments; + int ret = 0; + + /* + * Fill out preq.nr_sects with proper amount of sectors, and setup + * assign map[..] with the PFN of the page in our domain with the + * corresponding grant reference for each page. + */ + for (i = 0; i < nseg; i++) { + uint32_t flags; + + flags = GNTMAP_host_map; + if (pending_req->operation != BLKIF_OP_READ) + flags |= GNTMAP_readonly; + gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags, + req->u.rw.seg[i].gref, + pending_req->blkif->domid); + } + + ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); + BUG_ON(ret); + + /* + * Now swizzle the MFN in our domain with the MFN from the other domain + * so that when we access vaddr(pending_req,i) it has the contents of + * the page from the other domain. + */ + for (i = 0; i < nseg; i++) { + if (unlikely(map[i].status != 0)) { + pr_debug(DRV_PFX "invalid buffer -- could not remap it\n"); + map[i].handle = BLKBACK_INVALID_HANDLE; + ret |= 1; + } + + pending_handle(pending_req, i) = map[i].handle; + + if (ret) + continue; + + ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr), + blkbk->pending_page(pending_req, i), false); + if (ret) { + pr_alert(DRV_PFX "Failed to install M2P override for %lx (ret: %d)\n", + (unsigned long)map[i].dev_bus_addr, ret); + /* We could switch over to GNTTABOP_copy */ + continue; + } + + seg[i].buf = map[i].dev_bus_addr | + (req->u.rw.seg[i].first_sect << 9); + } + return ret; +} + +/* + * Completion callback on the bio's. Called as bh->b_end_io() + */ + +static void __end_block_io_op(struct pending_req *pending_req, int error) +{ + /* An error fails the entire request. */ + if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && + (error == -EOPNOTSUPP)) { + pr_debug(DRV_PFX "flush diskcache op failed, not supported\n"); + xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); + pending_req->status = BLKIF_RSP_EOPNOTSUPP; + } else if (error) { + pr_debug(DRV_PFX "Buffer not up-to-date at end of operation," + " error=%d\n", error); + pending_req->status = BLKIF_RSP_ERROR; + } + + /* + * If all of the bio's have completed it is time to unmap + * the grant references associated with 'request' and provide + * the proper response on the ring. + */ + if (atomic_dec_and_test(&pending_req->pendcnt)) { + xen_blkbk_unmap(pending_req); + make_response(pending_req->blkif, pending_req->id, + pending_req->operation, pending_req->status); + xen_blkif_put(pending_req->blkif); + free_req(pending_req); + } +} + +/* + * bio callback. + */ +static void end_block_io_op(struct bio *bio, int error) +{ + __end_block_io_op(bio->bi_private, error); + bio_put(bio); +} + + + +/* + * Function to copy the from the ring buffer the 'struct blkif_request' + * (which has the sectors we want, number of them, grant references, etc), + * and transmute it to the block API to hand it over to the proper block disk. + */ +static int do_block_io_op(struct xen_blkif *blkif) +{ + union blkif_back_rings *blk_rings = &blkif->blk_rings; + struct blkif_request req; + struct pending_req *pending_req; + RING_IDX rc, rp; + int more_to_do = 0; + + rc = blk_rings->common.req_cons; + rp = blk_rings->common.sring->req_prod; + rmb(); /* Ensure we see queued requests up to 'rp'. */ + + while (rc != rp) { + + if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) + break; + + if (kthread_should_stop()) { + more_to_do = 1; + break; + } + + pending_req = alloc_req(); + if (NULL == pending_req) { + blkif->st_oo_req++; + more_to_do = 1; + break; + } + + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); + break; + case BLKIF_PROTOCOL_X86_32: + blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc)); + break; + case BLKIF_PROTOCOL_X86_64: + blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc)); + break; + default: + BUG(); + } + blk_rings->common.req_cons = ++rc; /* before make_response() */ + + /* Apply all sanity checks to /private copy/ of request. */ + barrier(); + + if (dispatch_rw_block_io(blkif, &req, pending_req)) + break; + + /* Yield point for this unbounded loop. */ + cond_resched(); + } + + return more_to_do; +} + +/* + * Transmutation of the 'struct blkif_request' to a proper 'struct bio' + * and call the 'submit_bio' to pass it to the underlying storage. + */ +static int dispatch_rw_block_io(struct xen_blkif *blkif, + struct blkif_request *req, + struct pending_req *pending_req) +{ + struct phys_req preq; + struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + unsigned int nseg; + struct bio *bio = NULL; + struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + int i, nbio = 0; + int operation; + struct blk_plug plug; + + switch (req->operation) { + case BLKIF_OP_READ: + blkif->st_rd_req++; + operation = READ; + break; + case BLKIF_OP_WRITE: + blkif->st_wr_req++; + operation = WRITE_ODIRECT; + break; + case BLKIF_OP_FLUSH_DISKCACHE: + blkif->st_f_req++; + operation = WRITE_FLUSH; + break; + case BLKIF_OP_WRITE_BARRIER: + default: + operation = 0; /* make gcc happy */ + goto fail_response; + break; + } + + /* Check that the number of segments is sane. */ + nseg = req->nr_segments; + if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || + unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", + nseg); + /* Haven't submitted any bio's yet. */ + goto fail_response; + } + + preq.dev = req->handle; + preq.sector_number = req->u.rw.sector_number; + preq.nr_sects = 0; + + pending_req->blkif = blkif; + pending_req->id = req->id; + pending_req->operation = req->operation; + pending_req->status = BLKIF_RSP_OKAY; + pending_req->nr_pages = nseg; + + for (i = 0; i < nseg; i++) { + seg[i].nsec = req->u.rw.seg[i].last_sect - + req->u.rw.seg[i].first_sect + 1; + if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) || + (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect)) + goto fail_response; + preq.nr_sects += seg[i].nsec; + + } + + if (xen_vbd_translate(&preq, blkif, operation) != 0) { + pr_debug(DRV_PFX "access denied: %s of [%llu,%llu] on dev=%04x\n", + operation == READ ? "read" : "write", + preq.sector_number, + preq.sector_number + preq.nr_sects, preq.dev); + goto fail_response; + } + + /* + * This check _MUST_ be done after xen_vbd_translate as the preq.bdev + * is set there. + */ + for (i = 0; i < nseg; i++) { + if (((int)preq.sector_number|(int)seg[i].nsec) & + ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { + pr_debug(DRV_PFX "Misaligned I/O request from domain %d", + blkif->domid); + goto fail_response; + } + } + + /* + * If we have failed at this point, we need to undo the M2P override, + * set gnttab_set_unmap_op on all of the grant references and perform + * the hypercall to unmap the grants - that is all done in + * xen_blkbk_unmap. + */ + if (xen_blkbk_map(req, pending_req, seg)) + goto fail_flush; + + /* This corresponding xen_blkif_put is done in __end_block_io_op */ + xen_blkif_get(blkif); + + for (i = 0; i < nseg; i++) { + while ((bio == NULL) || + (bio_add_page(bio, + blkbk->pending_page(pending_req, i), + seg[i].nsec << 9, + seg[i].buf & ~PAGE_MASK) == 0)) { + + bio = bio_alloc(GFP_KERNEL, nseg-i); + if (unlikely(bio == NULL)) + goto fail_put_bio; + + biolist[nbio++] = bio; + bio->bi_bdev = preq.bdev; + bio->bi_private = pending_req; + bio->bi_end_io = end_block_io_op; + bio->bi_sector = preq.sector_number; + } + + preq.sector_number += seg[i].nsec; + } + + /* This will be hit if the operation was a flush. */ + if (!bio) { + BUG_ON(operation != WRITE_FLUSH); + + bio = bio_alloc(GFP_KERNEL, 0); + if (unlikely(bio == NULL)) + goto fail_put_bio; + + biolist[nbio++] = bio; + bio->bi_bdev = preq.bdev; + bio->bi_private = pending_req; + bio->bi_end_io = end_block_io_op; + } + + /* + * We set it one so that the last submit_bio does not have to call + * atomic_inc. + */ + atomic_set(&pending_req->pendcnt, nbio); + + /* Get a reference count for the disk queue and start sending I/O */ + blk_start_plug(&plug); + + for (i = 0; i < nbio; i++) + submit_bio(operation, biolist[i]); + + /* Let the I/Os go.. */ + blk_finish_plug(&plug); + + if (operation == READ) + blkif->st_rd_sect += preq.nr_sects; + else if (operation == WRITE || operation == WRITE_FLUSH) + blkif->st_wr_sect += preq.nr_sects; + + return 0; + + fail_flush: + xen_blkbk_unmap(pending_req); + fail_response: + /* Haven't submitted any bio's yet. */ + make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); + free_req(pending_req); + msleep(1); /* back off a bit */ + return -EIO; + + fail_put_bio: + for (i = 0; i < nbio; i++) + bio_put(biolist[i]); + __end_block_io_op(pending_req, -EINVAL); + msleep(1); /* back off a bit */ + return -EIO; +} + + + +/* + * Put a response on the ring on how the operation fared. + */ +static void make_response(struct xen_blkif *blkif, u64 id, + unsigned short op, int st) +{ + struct blkif_response resp; + unsigned long flags; + union blkif_back_rings *blk_rings = &blkif->blk_rings; + int more_to_do = 0; + int notify; + + resp.id = id; + resp.operation = op; + resp.status = st; + + spin_lock_irqsave(&blkif->blk_ring_lock, flags); + /* Place on the response ring for the relevant domain. */ + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + case BLKIF_PROTOCOL_X86_32: + memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + case BLKIF_PROTOCOL_X86_64: + memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt), + &resp, sizeof(resp)); + break; + default: + BUG(); + } + blk_rings->common.rsp_prod_pvt++; + RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); + if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) { + /* + * Tail check for pending requests. Allows frontend to avoid + * notifications if requests are already in flight (lower + * overheads and promotes batching). + */ + RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do); + + } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) { + more_to_do = 1; + } + + spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + + if (more_to_do) + blkif_notify_work(blkif); + if (notify) + notify_remote_via_irq(blkif->irq); +} + +static int __init xen_blkif_init(void) +{ + int i, mmap_pages; + int rc = 0; + + if (!xen_pv_domain()) + return -ENODEV; + + blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL); + if (!blkbk) { + pr_alert(DRV_PFX "%s: out of memory!\n", __func__); + return -ENOMEM; + } + + mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; + + blkbk->pending_reqs = kmalloc(sizeof(blkbk->pending_reqs[0]) * + xen_blkif_reqs, GFP_KERNEL); + blkbk->pending_grant_handles = kzalloc(sizeof(blkbk->pending_grant_handles[0]) * + mmap_pages, GFP_KERNEL); + blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) * + mmap_pages, GFP_KERNEL); + + if (!blkbk->pending_reqs || !blkbk->pending_grant_handles || + !blkbk->pending_pages) { + rc = -ENOMEM; + goto out_of_memory; + } + + for (i = 0; i < mmap_pages; i++) { + blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; + blkbk->pending_pages[i] = alloc_page(GFP_KERNEL); + if (blkbk->pending_pages[i] == NULL) { + rc = -ENOMEM; + goto out_of_memory; + } + } + rc = xen_blkif_interface_init(); + if (rc) + goto failed_init; + + memset(blkbk->pending_reqs, 0, sizeof(blkbk->pending_reqs)); + + INIT_LIST_HEAD(&blkbk->pending_free); + spin_lock_init(&blkbk->pending_free_lock); + init_waitqueue_head(&blkbk->pending_free_wq); + + for (i = 0; i < xen_blkif_reqs; i++) + list_add_tail(&blkbk->pending_reqs[i].free_list, + &blkbk->pending_free); + + rc = xen_blkif_xenbus_init(); + if (rc) + goto failed_init; + + return 0; + + out_of_memory: + pr_alert(DRV_PFX "%s: out of memory\n", __func__); + failed_init: + kfree(blkbk->pending_reqs); + kfree(blkbk->pending_grant_handles); + for (i = 0; i < mmap_pages; i++) { + if (blkbk->pending_pages[i]) + __free_page(blkbk->pending_pages[i]); + } + kfree(blkbk->pending_pages); + kfree(blkbk); + blkbk = NULL; + return rc; +} + +module_init(xen_blkif_init); + +MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h new file mode 100644 index 000000000000..9e40b283a468 --- /dev/null +++ b/drivers/block/xen-blkback/common.h @@ -0,0 +1,233 @@ +/* + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __XEN_BLKIF__BACKEND__COMMON_H__ +#define __XEN_BLKIF__BACKEND__COMMON_H__ + +#include <linux/version.h> +#include <linux/module.h> +#include <linux/interrupt.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/vmalloc.h> +#include <linux/wait.h> +#include <linux/io.h> +#include <asm/setup.h> +#include <asm/pgalloc.h> +#include <asm/hypervisor.h> +#include <xen/grant_table.h> +#include <xen/xenbus.h> +#include <xen/interface/io/ring.h> +#include <xen/interface/io/blkif.h> +#include <xen/interface/io/protocols.h> + +#define DRV_PFX "xen-blkback:" +#define DPRINTK(fmt, args...) \ + pr_debug(DRV_PFX "(%s:%d) " fmt ".\n", \ + __func__, __LINE__, ##args) + + +/* Not a real protocol. Used to generate ring structs which contain + * the elements common to all protocols only. This way we get a + * compiler-checkable way to use common struct elements, so we can + * avoid using switch(protocol) in a number of places. */ +struct blkif_common_request { + char dummy; +}; +struct blkif_common_response { + char dummy; +}; + +/* i386 protocol version */ +#pragma pack(push, 4) +struct blkif_x86_32_request { + uint8_t operation; /* BLKIF_OP_??? */ + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t id; /* private guest value, echoed in resp */ + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; +struct blkif_x86_32_response { + uint64_t id; /* copied from request */ + uint8_t operation; /* copied from request */ + int16_t status; /* BLKIF_RSP_??? */ +}; +#pragma pack(pop) + +/* x86_64 protocol version */ +struct blkif_x86_64_request { + uint8_t operation; /* BLKIF_OP_??? */ + uint8_t nr_segments; /* number of segments */ + blkif_vdev_t handle; /* only for read/write requests */ + uint64_t __attribute__((__aligned__(8))) id; + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; +struct blkif_x86_64_response { + uint64_t __attribute__((__aligned__(8))) id; + uint8_t operation; /* copied from request */ + int16_t status; /* BLKIF_RSP_??? */ +}; + +DEFINE_RING_TYPES(blkif_common, struct blkif_common_request, + struct blkif_common_response); +DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request, + struct blkif_x86_32_response); +DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request, + struct blkif_x86_64_response); + +union blkif_back_rings { + struct blkif_back_ring native; + struct blkif_common_back_ring common; + struct blkif_x86_32_back_ring x86_32; + struct blkif_x86_64_back_ring x86_64; +}; + +enum blkif_protocol { + BLKIF_PROTOCOL_NATIVE = 1, + BLKIF_PROTOCOL_X86_32 = 2, + BLKIF_PROTOCOL_X86_64 = 3, +}; + +struct xen_vbd { + /* What the domain refers to this vbd as. */ + blkif_vdev_t handle; + /* Non-zero -> read-only */ + unsigned char readonly; + /* VDISK_xxx */ + unsigned char type; + /* phys device that this vbd maps to. */ + u32 pdevice; + struct block_device *bdev; + /* Cached size parameter. */ + sector_t size; + bool flush_support; +}; + +struct backend_info; + +struct xen_blkif { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Physical parameters of the comms window. */ + unsigned int irq; + /* Comms information. */ + enum blkif_protocol blk_protocol; + union blkif_back_rings blk_rings; + struct vm_struct *blk_ring_area; + /* The VBD attached to this interface. */ + struct xen_vbd vbd; + /* Back pointer to the backend_info. */ + struct backend_info *be; + /* Private fields. */ + spinlock_t blk_ring_lock; + atomic_t refcnt; + + wait_queue_head_t wq; + /* One thread per one blkif. */ + struct task_struct *xenblkd; + unsigned int waiting_reqs; + + /* statistics */ + unsigned long st_print; + int st_rd_req; + int st_wr_req; + int st_oo_req; + int st_f_req; + int st_rd_sect; + int st_wr_sect; + + wait_queue_head_t waiting_to_free; + + grant_handle_t shmem_handle; + grant_ref_t shmem_ref; +}; + + +#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ + (_v)->bdev->bd_part->nr_sects : \ + get_capacity((_v)->bdev->bd_disk)) + +#define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt)) +#define xen_blkif_put(_b) \ + do { \ + if (atomic_dec_and_test(&(_b)->refcnt)) \ + wake_up(&(_b)->waiting_to_free);\ + } while (0) + +struct phys_req { + unsigned short dev; + unsigned short nr_sects; + struct block_device *bdev; + blkif_sector_t sector_number; +}; +int xen_blkif_interface_init(void); + +int xen_blkif_xenbus_init(void); + +irqreturn_t xen_blkif_be_int(int irq, void *dev_id); +int xen_blkif_schedule(void *arg); + +int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, + struct backend_info *be, int state); + +struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be); + +static inline void blkif_get_x86_32_req(struct blkif_request *dst, + struct blkif_x86_32_request *src) +{ + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; + dst->operation = src->operation; + dst->nr_segments = src->nr_segments; + dst->handle = src->handle; + dst->id = src->id; + dst->u.rw.sector_number = src->sector_number; + barrier(); + if (n > dst->nr_segments) + n = dst->nr_segments; + for (i = 0; i < n; i++) + dst->u.rw.seg[i] = src->seg[i]; +} + +static inline void blkif_get_x86_64_req(struct blkif_request *dst, + struct blkif_x86_64_request *src) +{ + int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST; + dst->operation = src->operation; + dst->nr_segments = src->nr_segments; + dst->handle = src->handle; + dst->id = src->id; + dst->u.rw.sector_number = src->sector_number; + barrier(); + if (n > dst->nr_segments) + n = dst->nr_segments; + for (i = 0; i < n; i++) + dst->u.rw.seg[i] = src->seg[i]; +} + +#endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */ diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c new file mode 100644 index 000000000000..34570823355b --- /dev/null +++ b/drivers/block/xen-blkback/xenbus.c @@ -0,0 +1,768 @@ +/* Xenbus code for blkif backend + Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au> + Copyright (C) 2005 XenSource Ltd + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + +*/ + +#include <stdarg.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <xen/events.h> +#include <xen/grant_table.h> +#include "common.h" + +struct backend_info { + struct xenbus_device *dev; + struct xen_blkif *blkif; + struct xenbus_watch backend_watch; + unsigned major; + unsigned minor; + char *mode; +}; + +static struct kmem_cache *xen_blkif_cachep; +static void connect(struct backend_info *); +static int connect_ring(struct backend_info *); +static void backend_changed(struct xenbus_watch *, const char **, + unsigned int); + +struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be) +{ + return be->dev; +} + +static int blkback_name(struct xen_blkif *blkif, char *buf) +{ + char *devpath, *devname; + struct xenbus_device *dev = blkif->be->dev; + + devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL); + if (IS_ERR(devpath)) + return PTR_ERR(devpath); + + devname = strstr(devpath, "/dev/"); + if (devname != NULL) + devname += strlen("/dev/"); + else + devname = devpath; + + snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname); + kfree(devpath); + + return 0; +} + +static void xen_update_blkif_status(struct xen_blkif *blkif) +{ + int err; + char name[TASK_COMM_LEN]; + + /* Not ready to connect? */ + if (!blkif->irq || !blkif->vbd.bdev) + return; + + /* Already connected? */ + if (blkif->be->dev->state == XenbusStateConnected) + return; + + /* Attempt to connect: exit if we fail to. */ + connect(blkif->be); + if (blkif->be->dev->state != XenbusStateConnected) + return; + + err = blkback_name(blkif, name); + if (err) { + xenbus_dev_error(blkif->be->dev, err, "get blkback dev name"); + return; + } + + err = filemap_write_and_wait(blkif->vbd.bdev->bd_inode->i_mapping); + if (err) { + xenbus_dev_error(blkif->be->dev, err, "block flush"); + return; + } + invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); + + blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, name); + if (IS_ERR(blkif->xenblkd)) { + err = PTR_ERR(blkif->xenblkd); + blkif->xenblkd = NULL; + xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); + } +} + +static struct xen_blkif *xen_blkif_alloc(domid_t domid) +{ + struct xen_blkif *blkif; + + blkif = kmem_cache_alloc(xen_blkif_cachep, GFP_KERNEL); + if (!blkif) + return ERR_PTR(-ENOMEM); + + memset(blkif, 0, sizeof(*blkif)); + blkif->domid = domid; + spin_lock_init(&blkif->blk_ring_lock); + atomic_set(&blkif->refcnt, 1); + init_waitqueue_head(&blkif->wq); + blkif->st_print = jiffies; + init_waitqueue_head(&blkif->waiting_to_free); + + return blkif; +} + +static int map_frontend_page(struct xen_blkif *blkif, unsigned long shared_page) +{ + struct gnttab_map_grant_ref op; + + gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr, + GNTMAP_host_map, shared_page, blkif->domid); + + if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) + BUG(); + + if (op.status) { + DPRINTK("Grant table operation failure !\n"); + return op.status; + } + + blkif->shmem_ref = shared_page; + blkif->shmem_handle = op.handle; + + return 0; +} + +static void unmap_frontend_page(struct xen_blkif *blkif) +{ + struct gnttab_unmap_grant_ref op; + + gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr, + GNTMAP_host_map, blkif->shmem_handle); + + if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) + BUG(); +} + +static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page, + unsigned int evtchn) +{ + int err; + + /* Already connected through? */ + if (blkif->irq) + return 0; + + blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE); + if (!blkif->blk_ring_area) + return -ENOMEM; + + err = map_frontend_page(blkif, shared_page); + if (err) { + free_vm_area(blkif->blk_ring_area); + return err; + } + + switch (blkif->blk_protocol) { + case BLKIF_PROTOCOL_NATIVE: + { + struct blkif_sring *sring; + sring = (struct blkif_sring *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_32: + { + struct blkif_x86_32_sring *sring_x86_32; + sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); + break; + } + case BLKIF_PROTOCOL_X86_64: + { + struct blkif_x86_64_sring *sring_x86_64; + sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring_area->addr; + BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); + break; + } + default: + BUG(); + } + + err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn, + xen_blkif_be_int, 0, + "blkif-backend", blkif); + if (err < 0) { + unmap_frontend_page(blkif); + free_vm_area(blkif->blk_ring_area); + blkif->blk_rings.common.sring = NULL; + return err; + } + blkif->irq = err; + + return 0; +} + +static void xen_blkif_disconnect(struct xen_blkif *blkif) +{ + if (blkif->xenblkd) { + kthread_stop(blkif->xenblkd); + blkif->xenblkd = NULL; + } + + atomic_dec(&blkif->refcnt); + wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0); + atomic_inc(&blkif->refcnt); + + if (blkif->irq) { + unbind_from_irqhandler(blkif->irq, blkif); + blkif->irq = 0; + } + + if (blkif->blk_rings.common.sring) { + unmap_frontend_page(blkif); + free_vm_area(blkif->blk_ring_area); + blkif->blk_rings.common.sring = NULL; + } +} + +void xen_blkif_free(struct xen_blkif *blkif) +{ + if (!atomic_dec_and_test(&blkif->refcnt)) + BUG(); + kmem_cache_free(xen_blkif_cachep, blkif); +} + +int __init xen_blkif_interface_init(void) +{ + xen_blkif_cachep = kmem_cache_create("blkif_cache", + sizeof(struct xen_blkif), + 0, 0, NULL); + if (!xen_blkif_cachep) + return -ENOMEM; + + return 0; +} + +/* + * sysfs interface for VBD I/O requests + */ + +#define VBD_SHOW(name, format, args...) \ + static ssize_t show_##name(struct device *_dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + struct xenbus_device *dev = to_xenbus_device(_dev); \ + struct backend_info *be = dev_get_drvdata(&dev->dev); \ + \ + return sprintf(buf, format, ##args); \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + +VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req); +VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); +VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); +VBD_SHOW(f_req, "%d\n", be->blkif->st_f_req); +VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect); +VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect); + +static struct attribute *xen_vbdstat_attrs[] = { + &dev_attr_oo_req.attr, + &dev_attr_rd_req.attr, + &dev_attr_wr_req.attr, + &dev_attr_f_req.attr, + &dev_attr_rd_sect.attr, + &dev_attr_wr_sect.attr, + NULL +}; + +static struct attribute_group xen_vbdstat_group = { + .name = "statistics", + .attrs = xen_vbdstat_attrs, +}; + +VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); +VBD_SHOW(mode, "%s\n", be->mode); + +int xenvbd_sysfs_addif(struct xenbus_device *dev) +{ + int error; + + error = device_create_file(&dev->dev, &dev_attr_physical_device); + if (error) + goto fail1; + + error = device_create_file(&dev->dev, &dev_attr_mode); + if (error) + goto fail2; + + error = sysfs_create_group(&dev->dev.kobj, &xen_vbdstat_group); + if (error) + goto fail3; + + return 0; + +fail3: sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group); +fail2: device_remove_file(&dev->dev, &dev_attr_mode); +fail1: device_remove_file(&dev->dev, &dev_attr_physical_device); + return error; +} + +void xenvbd_sysfs_delif(struct xenbus_device *dev) +{ + sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group); + device_remove_file(&dev->dev, &dev_attr_mode); + device_remove_file(&dev->dev, &dev_attr_physical_device); +} + + +static void xen_vbd_free(struct xen_vbd *vbd) +{ + if (vbd->bdev) + blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE); + vbd->bdev = NULL; +} + +static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, + unsigned major, unsigned minor, int readonly, + int cdrom) +{ + struct xen_vbd *vbd; + struct block_device *bdev; + struct request_queue *q; + + vbd = &blkif->vbd; + vbd->handle = handle; + vbd->readonly = readonly; + vbd->type = 0; + + vbd->pdevice = MKDEV(major, minor); + + bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ? + FMODE_READ : FMODE_WRITE, NULL); + + if (IS_ERR(bdev)) { + DPRINTK("xen_vbd_create: device %08x could not be opened.\n", + vbd->pdevice); + return -ENOENT; + } + + vbd->bdev = bdev; + vbd->size = vbd_sz(vbd); + + if (vbd->bdev->bd_disk == NULL) { + DPRINTK("xen_vbd_create: device %08x doesn't exist.\n", + vbd->pdevice); + xen_vbd_free(vbd); + return -ENOENT; + } + + if (vbd->bdev->bd_disk->flags & GENHD_FL_CD || cdrom) + vbd->type |= VDISK_CDROM; + if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE) + vbd->type |= VDISK_REMOVABLE; + + q = bdev_get_queue(bdev); + if (q && q->flush_flags) + vbd->flush_support = true; + + DPRINTK("Successful creation of handle=%04x (dom=%u)\n", + handle, blkif->domid); + return 0; +} +static int xen_blkbk_remove(struct xenbus_device *dev) +{ + struct backend_info *be = dev_get_drvdata(&dev->dev); + + DPRINTK(""); + + if (be->major || be->minor) + xenvbd_sysfs_delif(dev); + + if (be->backend_watch.node) { + unregister_xenbus_watch(&be->backend_watch); + kfree(be->backend_watch.node); + be->backend_watch.node = NULL; + } + + if (be->blkif) { + xen_blkif_disconnect(be->blkif); + xen_vbd_free(&be->blkif->vbd); + xen_blkif_free(be->blkif); + be->blkif = NULL; + } + + kfree(be); + dev_set_drvdata(&dev->dev, NULL); + return 0; +} + +int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, + struct backend_info *be, int state) +{ + struct xenbus_device *dev = be->dev; + int err; + + err = xenbus_printf(xbt, dev->nodename, "feature-flush-cache", + "%d", state); + if (err) + xenbus_dev_fatal(dev, err, "writing feature-flush-cache"); + + return err; +} + +/* + * Entry point to this code when a new device is created. Allocate the basic + * structures, and watch the store waiting for the hotplug scripts to tell us + * the device's physical major and minor numbers. Switch to InitWait. + */ +static int xen_blkbk_probe(struct xenbus_device *dev, + const struct xenbus_device_id *id) +{ + int err; + struct backend_info *be = kzalloc(sizeof(struct backend_info), + GFP_KERNEL); + if (!be) { + xenbus_dev_fatal(dev, -ENOMEM, + "allocating backend structure"); + return -ENOMEM; + } + be->dev = dev; + dev_set_drvdata(&dev->dev, be); + + be->blkif = xen_blkif_alloc(dev->otherend_id); + if (IS_ERR(be->blkif)) { + err = PTR_ERR(be->blkif); + be->blkif = NULL; + xenbus_dev_fatal(dev, err, "creating block interface"); + goto fail; + } + + /* setup back pointer */ + be->blkif->be = be; + + err = xenbus_watch_pathfmt(dev, &be->backend_watch, backend_changed, + "%s/%s", dev->nodename, "physical-device"); + if (err) + goto fail; + + err = xenbus_switch_state(dev, XenbusStateInitWait); + if (err) + goto fail; + + return 0; + +fail: + DPRINTK("failed"); + xen_blkbk_remove(dev); + return err; +} + + +/* + * Callback received when the hotplug scripts have placed the physical-device + * node. Read it and the mode node, and create a vbd. If the frontend is + * ready, connect. + */ +static void backend_changed(struct xenbus_watch *watch, + const char **vec, unsigned int len) +{ + int err; + unsigned major; + unsigned minor; + struct backend_info *be + = container_of(watch, struct backend_info, backend_watch); + struct xenbus_device *dev = be->dev; + int cdrom = 0; + char *device_type; + + DPRINTK(""); + + err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x", + &major, &minor); + if (XENBUS_EXIST_ERR(err)) { + /* + * Since this watch will fire once immediately after it is + * registered, we expect this. Ignore it, and wait for the + * hotplug scripts. + */ + return; + } + if (err != 2) { + xenbus_dev_fatal(dev, err, "reading physical-device"); + return; + } + + if ((be->major || be->minor) && + ((be->major != major) || (be->minor != minor))) { + pr_warn(DRV_PFX "changing physical device (from %x:%x to %x:%x) not supported.\n", + be->major, be->minor, major, minor); + return; + } + + be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL); + if (IS_ERR(be->mode)) { + err = PTR_ERR(be->mode); + be->mode = NULL; + xenbus_dev_fatal(dev, err, "reading mode"); + return; + } + + device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL); + if (!IS_ERR(device_type)) { + cdrom = strcmp(device_type, "cdrom") == 0; + kfree(device_type); + } + + if (be->major == 0 && be->minor == 0) { + /* Front end dir is a number, which is used as the handle. */ + + char *p = strrchr(dev->otherend, '/') + 1; + long handle; + err = strict_strtoul(p, 0, &handle); + if (err) + return; + + be->major = major; + be->minor = minor; + + err = xen_vbd_create(be->blkif, handle, major, minor, + (NULL == strchr(be->mode, 'w')), cdrom); + if (err) { + be->major = 0; + be->minor = 0; + xenbus_dev_fatal(dev, err, "creating vbd structure"); + return; + } + + err = xenvbd_sysfs_addif(dev); + if (err) { + xen_vbd_free(&be->blkif->vbd); + be->major = 0; + be->minor = 0; + xenbus_dev_fatal(dev, err, "creating sysfs entries"); + return; + } + + /* We're potentially connected now */ + xen_update_blkif_status(be->blkif); + } +} + + +/* + * Callback received when the frontend's state changes. + */ +static void frontend_changed(struct xenbus_device *dev, + enum xenbus_state frontend_state) +{ + struct backend_info *be = dev_get_drvdata(&dev->dev); + int err; + + DPRINTK("%s", xenbus_strstate(frontend_state)); + + switch (frontend_state) { + case XenbusStateInitialising: + if (dev->state == XenbusStateClosed) { + pr_info(DRV_PFX "%s: prepare for reconnect\n", + dev->nodename); + xenbus_switch_state(dev, XenbusStateInitWait); + } + break; + + case XenbusStateInitialised: + case XenbusStateConnected: + /* + * Ensure we connect even when two watches fire in + * close successsion and we miss the intermediate value + * of frontend_state. + */ + if (dev->state == XenbusStateConnected) + break; + + /* + * Enforce precondition before potential leak point. + * blkif_disconnect() is idempotent. + */ + xen_blkif_disconnect(be->blkif); + + err = connect_ring(be); + if (err) + break; + xen_update_blkif_status(be->blkif); + break; + + case XenbusStateClosing: + xen_blkif_disconnect(be->blkif); + xenbus_switch_state(dev, XenbusStateClosing); + break; + + case XenbusStateClosed: + xenbus_switch_state(dev, XenbusStateClosed); + if (xenbus_dev_is_online(dev)) + break; + /* fall through if not online */ + case XenbusStateUnknown: + /* implies blkif_disconnect() via blkback_remove() */ + device_unregister(&dev->dev); + break; + + default: + xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", + frontend_state); + break; + } +} + + +/* ** Connection ** */ + + +/* + * Write the physical details regarding the block device to the store, and + * switch to Connected state. + */ +static void connect(struct backend_info *be) +{ + struct xenbus_transaction xbt; + int err; + struct xenbus_device *dev = be->dev; + + DPRINTK("%s", dev->otherend); + + /* Supply the information about the device the frontend needs */ +again: + err = xenbus_transaction_start(&xbt); + if (err) { + xenbus_dev_fatal(dev, err, "starting transaction"); + return; + } + + err = xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support); + if (err) + goto abort; + + err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", + (unsigned long long)vbd_sz(&be->blkif->vbd)); + if (err) { + xenbus_dev_fatal(dev, err, "writing %s/sectors", + dev->nodename); + goto abort; + } + + /* FIXME: use a typename instead */ + err = xenbus_printf(xbt, dev->nodename, "info", "%u", + be->blkif->vbd.type | + (be->blkif->vbd.readonly ? VDISK_READONLY : 0)); + if (err) { + xenbus_dev_fatal(dev, err, "writing %s/info", + dev->nodename); + goto abort; + } + err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu", + (unsigned long) + bdev_logical_block_size(be->blkif->vbd.bdev)); + if (err) { + xenbus_dev_fatal(dev, err, "writing %s/sector-size", + dev->nodename); + goto abort; + } + + err = xenbus_transaction_end(xbt, 0); + if (err == -EAGAIN) + goto again; + if (err) + xenbus_dev_fatal(dev, err, "ending transaction"); + + err = xenbus_switch_state(dev, XenbusStateConnected); + if (err) + xenbus_dev_fatal(dev, err, "switching to Connected state", + dev->nodename); + + return; + abort: + xenbus_transaction_end(xbt, 1); +} + + +static int connect_ring(struct backend_info *be) +{ + struct xenbus_device *dev = be->dev; + unsigned long ring_ref; + unsigned int evtchn; + char protocol[64] = ""; + int err; + + DPRINTK("%s", dev->otherend); + + err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", + &ring_ref, "event-channel", "%u", &evtchn, NULL); + if (err) { + xenbus_dev_fatal(dev, err, + "reading %s/ring-ref and event-channel", + dev->otherend); + return err; + } + + be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", + "%63s", protocol, NULL); + if (err) + strcpy(protocol, "unspecified, assuming native"); + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; + else { + xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); + return -1; + } + pr_info(DRV_PFX "ring-ref %ld, event-channel %d, protocol %d (%s)\n", + ring_ref, evtchn, be->blkif->blk_protocol, protocol); + + /* Map the shared frame, irq etc. */ + err = xen_blkif_map(be->blkif, ring_ref, evtchn); + if (err) { + xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u", + ring_ref, evtchn); + return err; + } + + return 0; +} + + +/* ** Driver Registration ** */ + + +static const struct xenbus_device_id xen_blkbk_ids[] = { + { "vbd" }, + { "" } +}; + + +static struct xenbus_driver xen_blkbk = { + .name = "vbd", + .owner = THIS_MODULE, + .ids = xen_blkbk_ids, + .probe = xen_blkbk_probe, + .remove = xen_blkbk_remove, + .otherend_changed = frontend_changed +}; + + +int xen_blkif_xenbus_init(void) +{ + return xenbus_register_backend(&xen_blkbk); +} diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 9cb8668ff5f4..b536a9cef917 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -97,6 +97,7 @@ struct blkfront_info struct blk_shadow shadow[BLK_RING_SIZE]; unsigned long shadow_free; unsigned int feature_flush; + unsigned int flush_op; int is_ready; }; @@ -250,8 +251,7 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, /* * Generate a Xen blkfront IO request from a blk layer request. Reads - * and writes are handled as expected. Since we lack a loose flush - * request, we map flushes into a full ordered barrier. + * and writes are handled as expected. * * @req: a request struct */ @@ -293,14 +293,13 @@ static int blkif_queue_request(struct request *req) if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { /* - * Ideally we could just do an unordered - * flush-to-disk, but all we have is a full write - * barrier at the moment. However, a barrier write is + * Ideally we can do an unordered flush-to-disk. In case the + * backend onlysupports barriers, use that. A barrier request * a superset of FUA, so we can implement it the same * way. (It's also a FLUSH+FUA, since it is * guaranteed ordered WRT previous writes.) */ - ring_req->operation = BLKIF_OP_WRITE_BARRIER; + ring_req->operation = info->flush_op; } ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); @@ -433,8 +432,11 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) static void xlvbd_flush(struct blkfront_info *info) { blk_queue_flush(info->rq, info->feature_flush); - printk(KERN_INFO "blkfront: %s: barriers %s\n", + printk(KERN_INFO "blkfront: %s: %s: %s\n", info->gd->disk_name, + info->flush_op == BLKIF_OP_WRITE_BARRIER ? + "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? + "flush diskcache" : "barrier or flush"), info->feature_flush ? "enabled" : "disabled"); } @@ -720,15 +722,20 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; switch (bret->operation) { + case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_WRITE_BARRIER: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { - printk(KERN_WARNING "blkfront: %s: write barrier op failed\n", + printk(KERN_WARNING "blkfront: %s: write %s op failed\n", + info->flush_op == BLKIF_OP_WRITE_BARRIER ? + "barrier" : "flush disk cache", info->gd->disk_name); error = -EOPNOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && info->shadow[id].req.nr_segments == 0)) { - printk(KERN_WARNING "blkfront: %s: empty write barrier op failed\n", + printk(KERN_WARNING "blkfront: %s: empty write %s op failed\n", + info->flush_op == BLKIF_OP_WRITE_BARRIER ? + "barrier" : "flush disk cache", info->gd->disk_name); error = -EOPNOTSUPP; } @@ -736,6 +743,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) if (error == -EOPNOTSUPP) error = 0; info->feature_flush = 0; + info->flush_op = 0; xlvbd_flush(info); } /* fall through */ @@ -1100,7 +1108,7 @@ static void blkfront_connect(struct blkfront_info *info) unsigned long sector_size; unsigned int binfo; int err; - int barrier; + int barrier, flush; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -1140,8 +1148,11 @@ static void blkfront_connect(struct blkfront_info *info) return; } + info->feature_flush = 0; + info->flush_op = 0; + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-barrier", "%lu", &barrier, + "feature-barrier", "%d", &barrier, NULL); /* @@ -1151,11 +1162,23 @@ static void blkfront_connect(struct blkfront_info *info) * * If there are barriers, then we use flush. */ - info->feature_flush = 0; - - if (!err && barrier) + if (!err && barrier) { info->feature_flush = REQ_FLUSH | REQ_FUA; + info->flush_op = BLKIF_OP_WRITE_BARRIER; + } + /* + * And if there is "feature-flush-cache" use that above + * barriers. + */ + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-flush-cache", "%d", &flush, + NULL); + if (!err && flush) { + info->feature_flush = REQ_FLUSH; + info->flush_op = BLKIF_OP_FLUSH_DISKCACHE; + } + err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", |