diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-11-05 01:22:14 +0100 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-11-05 01:22:14 +0100 |
commit | 3d0a8d10cfb4cc3d1877c29a866ee7d8a46aa2fa (patch) | |
tree | 11a85044d1472f5972ae47ce10a2f446ad981e9f /drivers/block | |
parent | Merge branch 'for-3.2/core' of git://git.kernel.dk/linux-block (diff) | |
parent | virtio-blk: use ida to allocate disk index (diff) | |
download | linux-3d0a8d10cfb4cc3d1877c29a866ee7d8a46aa2fa.tar.xz linux-3d0a8d10cfb4cc3d1877c29a866ee7d8a46aa2fa.zip |
Merge branch 'for-3.2/drivers' of git://git.kernel.dk/linux-block
* 'for-3.2/drivers' of git://git.kernel.dk/linux-block: (30 commits)
virtio-blk: use ida to allocate disk index
hpsa: add small delay when using PCI Power Management to reset for kump
cciss: add small delay when using PCI Power Management to reset for kump
xen/blkback: Fix two races in the handling of barrier requests.
xen/blkback: Check for proper operation.
xen/blkback: Fix the inhibition to map pages when discarding sector ranges.
xen/blkback: Report VBD_WSECT (wr_sect) properly.
xen/blkback: Support 'feature-barrier' aka old-style BARRIER requests.
xen-blkfront: plug device number leak in xlblk_init() error path
xen-blkfront: If no barrier or flush is supported, use invalid operation.
xen-blkback: use kzalloc() in favor of kmalloc()+memset()
xen-blkback: fixed indentation and comments
xen-blkfront: fix a deadlock while handling discard response
xen-blkfront: Handle discard requests.
xen-blkback: Implement discard requests ('feature-discard')
xen-blkfront: add BLKIF_OP_DISCARD and discard request struct
drivers/block/loop.c: remove unnecessary bdev argument from loop_clr_fd()
drivers/block/loop.c: emit uevent on auto release
drivers/block/cpqarray.c: use pci_dev->revision
loop: always allow userspace partitions and optionally support automatic scanning
...
Fic up trivial header file includsion conflict in drivers/block/loop.c
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/cciss.c | 76 | ||||
-rw-r--r-- | drivers/block/cciss.h | 1 | ||||
-rw-r--r-- | drivers/block/cpqarray.c | 2 | ||||
-rw-r--r-- | drivers/block/loop.c | 111 | ||||
-rw-r--r-- | drivers/block/nbd.c | 69 | ||||
-rw-r--r-- | drivers/block/xen-blkback/blkback.c | 130 | ||||
-rw-r--r-- | drivers/block/xen-blkback/common.h | 98 | ||||
-rw-r--r-- | drivers/block/xen-blkback/xenbus.c | 76 | ||||
-rw-r--r-- | drivers/block/xen-blkfront.c | 123 |
9 files changed, 569 insertions, 117 deletions
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 8f4ef656a1af..486f94ef24d4 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -68,6 +68,10 @@ static int cciss_tape_cmds = 6; module_param(cciss_tape_cmds, int, 0644); MODULE_PARM_DESC(cciss_tape_cmds, "number of commands to allocate for tape devices (default: 6)"); +static int cciss_simple_mode; +module_param(cciss_simple_mode, int, S_IRUGO|S_IWUSR); +MODULE_PARM_DESC(cciss_simple_mode, + "Use 'simple mode' rather than 'performant mode'"); static DEFINE_MUTEX(cciss_mutex); static struct proc_dir_entry *proc_cciss; @@ -176,6 +180,7 @@ static void cciss_geometry_inquiry(ctlr_info_t *h, int logvol, unsigned int block_size, InquiryData_struct *inq_buff, drive_info_struct *drv); static void __devinit cciss_interrupt_mode(ctlr_info_t *); +static int __devinit cciss_enter_simple_mode(struct ctlr_info *h); static void start_io(ctlr_info_t *h); static int sendcmd_withirq(ctlr_info_t *h, __u8 cmd, void *buff, size_t size, __u8 page_code, unsigned char scsi3addr[], @@ -388,7 +393,7 @@ static void cciss_seq_show_header(struct seq_file *seq) h->product_name, (unsigned long)h->board_id, h->firm_ver[0], h->firm_ver[1], h->firm_ver[2], - h->firm_ver[3], (unsigned int)h->intr[PERF_MODE_INT], + h->firm_ver[3], (unsigned int)h->intr[h->intr_mode], h->num_luns, h->Qdepth, h->commands_outstanding, h->maxQsinceinit, h->max_outstanding, h->maxSG); @@ -636,6 +641,18 @@ static ssize_t host_store_rescan(struct device *dev, } static DEVICE_ATTR(rescan, S_IWUSR, NULL, host_store_rescan); +static ssize_t host_show_transport_mode(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct ctlr_info *h = to_hba(dev); + + return snprintf(buf, 20, "%s\n", + h->transMethod & CFGTBL_Trans_Performant ? + "performant" : "simple"); +} +static DEVICE_ATTR(transport_mode, S_IRUGO, host_show_transport_mode, NULL); + static ssize_t dev_show_unique_id(struct device *dev, struct device_attribute *attr, char *buf) @@ -808,6 +825,7 @@ static DEVICE_ATTR(usage_count, S_IRUGO, cciss_show_usage_count, NULL); static struct attribute *cciss_host_attrs[] = { &dev_attr_rescan.attr, &dev_attr_resettable.attr, + &dev_attr_transport_mode.attr, NULL }; @@ -3984,6 +4002,9 @@ static void __devinit cciss_put_controller_into_performant_mode(ctlr_info_t *h) { __u32 trans_support; + if (cciss_simple_mode) + return; + dev_dbg(&h->pdev->dev, "Trying to put board into Performant mode\n"); /* Attempt to put controller into performant mode if supported */ /* Does board support performant mode? */ @@ -4081,7 +4102,7 @@ static void __devinit cciss_interrupt_mode(ctlr_info_t *h) default_int_mode: #endif /* CONFIG_PCI_MSI */ /* if we get here we're going to use the default interrupt mode */ - h->intr[PERF_MODE_INT] = h->pdev->irq; + h->intr[h->intr_mode] = h->pdev->irq; return; } @@ -4341,6 +4362,9 @@ static int __devinit cciss_pci_init(ctlr_info_t *h) } cciss_enable_scsi_prefetch(h); cciss_p600_dma_prefetch_quirk(h); + err = cciss_enter_simple_mode(h); + if (err) + goto err_out_free_res; cciss_put_controller_into_performant_mode(h); return 0; @@ -4533,6 +4557,13 @@ static int cciss_controller_hard_reset(struct pci_dev *pdev, pmcsr &= ~PCI_PM_CTRL_STATE_MASK; pmcsr |= PCI_D0; pci_write_config_word(pdev, pos + PCI_PM_CTRL, pmcsr); + + /* + * The P600 requires a small delay when changing states. + * Otherwise we may think the board did not reset and we bail. + * This for kdump only and is particular to the P600. + */ + msleep(500); } return 0; } @@ -4843,20 +4874,20 @@ static int cciss_request_irq(ctlr_info_t *h, irqreturn_t (*intxhandler)(int, void *)) { if (h->msix_vector || h->msi_vector) { - if (!request_irq(h->intr[PERF_MODE_INT], msixhandler, + if (!request_irq(h->intr[h->intr_mode], msixhandler, IRQF_DISABLED, h->devname, h)) return 0; dev_err(&h->pdev->dev, "Unable to get msi irq %d" - " for %s\n", h->intr[PERF_MODE_INT], + " for %s\n", h->intr[h->intr_mode], h->devname); return -1; } - if (!request_irq(h->intr[PERF_MODE_INT], intxhandler, + if (!request_irq(h->intr[h->intr_mode], intxhandler, IRQF_DISABLED, h->devname, h)) return 0; dev_err(&h->pdev->dev, "Unable to get irq %d for %s\n", - h->intr[PERF_MODE_INT], h->devname); + h->intr[h->intr_mode], h->devname); return -1; } @@ -4887,7 +4918,7 @@ static void cciss_undo_allocations_after_kdump_soft_reset(ctlr_info_t *h) { int ctlr = h->ctlr; - free_irq(h->intr[PERF_MODE_INT], h); + free_irq(h->intr[h->intr_mode], h); #ifdef CONFIG_PCI_MSI if (h->msix_vector) pci_disable_msix(h->pdev); @@ -4953,6 +4984,7 @@ reinit_after_soft_reset: h = hba[i]; h->pdev = pdev; h->busy_initializing = 1; + h->intr_mode = cciss_simple_mode ? SIMPLE_MODE_INT : PERF_MODE_INT; INIT_LIST_HEAD(&h->cmpQ); INIT_LIST_HEAD(&h->reqQ); mutex_init(&h->busy_shutting_down); @@ -5009,7 +5041,7 @@ reinit_after_soft_reset: dev_info(&h->pdev->dev, "%s: <0x%x> at PCI %s IRQ %d%s using DAC\n", h->devname, pdev->device, pci_name(pdev), - h->intr[PERF_MODE_INT], dac ? "" : " not"); + h->intr[h->intr_mode], dac ? "" : " not"); if (cciss_allocate_cmd_pool(h)) goto clean4; @@ -5056,7 +5088,7 @@ reinit_after_soft_reset: spin_lock_irqsave(&h->lock, flags); h->access.set_intr_mask(h, CCISS_INTR_OFF); spin_unlock_irqrestore(&h->lock, flags); - free_irq(h->intr[PERF_MODE_INT], h); + free_irq(h->intr[h->intr_mode], h); rc = cciss_request_irq(h, cciss_msix_discard_completions, cciss_intx_discard_completions); if (rc) { @@ -5133,7 +5165,7 @@ clean4: cciss_free_cmd_pool(h); cciss_free_scatterlists(h); cciss_free_sg_chain_blocks(h->cmd_sg_list, h->nr_cmds); - free_irq(h->intr[PERF_MODE_INT], h); + free_irq(h->intr[h->intr_mode], h); clean2: unregister_blkdev(h->major, h->devname); clean1: @@ -5172,9 +5204,31 @@ static void cciss_shutdown(struct pci_dev *pdev) if (return_code != IO_OK) dev_warn(&h->pdev->dev, "Error flushing cache\n"); h->access.set_intr_mask(h, CCISS_INTR_OFF); - free_irq(h->intr[PERF_MODE_INT], h); + free_irq(h->intr[h->intr_mode], h); +} + +static int __devinit cciss_enter_simple_mode(struct ctlr_info *h) +{ + u32 trans_support; + + trans_support = readl(&(h->cfgtable->TransportSupport)); + if (!(trans_support & SIMPLE_MODE)) + return -ENOTSUPP; + + h->max_commands = readl(&(h->cfgtable->CmdsOutMax)); + writel(CFGTBL_Trans_Simple, &(h->cfgtable->HostWrite.TransportRequest)); + writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL); + cciss_wait_for_mode_change_ack(h); + print_cfg_table(h); + if (!(readl(&(h->cfgtable->TransportActive)) & CFGTBL_Trans_Simple)) { + dev_warn(&h->pdev->dev, "unable to get board into simple mode\n"); + return -ENODEV; + } + h->transMethod = CFGTBL_Trans_Simple; + return 0; } + static void __devexit cciss_remove_one(struct pci_dev *pdev) { ctlr_info_t *h; diff --git a/drivers/block/cciss.h b/drivers/block/cciss.h index c049548e68b7..7fda30e4a241 100644 --- a/drivers/block/cciss.h +++ b/drivers/block/cciss.h @@ -92,6 +92,7 @@ struct ctlr_info unsigned int intr[4]; unsigned int msix_vector; unsigned int msi_vector; + int intr_mode; int cciss_max_sectors; BYTE cciss_read; BYTE cciss_write; diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index b2fceb53e809..9125bbeacd4d 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -620,6 +620,7 @@ static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev) } vendor_id = pdev->vendor; device_id = pdev->device; + revision = pdev->revision; irq = pdev->irq; for(i=0; i<6; i++) @@ -632,7 +633,6 @@ static int cpqarray_pci_init(ctlr_info_t *c, struct pci_dev *pdev) } pci_read_config_word(pdev, PCI_COMMAND, &command); - pci_read_config_byte(pdev, PCI_CLASS_REVISION, &revision); pci_read_config_byte(pdev, PCI_CACHE_LINE_SIZE, &cache_line_size); pci_read_config_byte(pdev, PCI_LATENCY_TIMER, &latency_timer); diff --git a/drivers/block/loop.c b/drivers/block/loop.c index c77983ea86c8..3d806820280e 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -76,6 +76,8 @@ #include <linux/splice.h> #include <linux/sysfs.h> #include <linux/miscdevice.h> +#include <linux/falloc.h> + #include <asm/uaccess.h> static DEFINE_IDR(loop_index_idr); @@ -407,6 +409,29 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio) } } + /* + * We use punch hole to reclaim the free space used by the + * image a.k.a. discard. However we do support discard if + * encryption is enabled, because it may give an attacker + * useful information. + */ + if (bio->bi_rw & REQ_DISCARD) { + struct file *file = lo->lo_backing_file; + int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; + + if ((!file->f_op->fallocate) || + lo->lo_encrypt_key_size) { + ret = -EOPNOTSUPP; + goto out; + } + ret = file->f_op->fallocate(file, mode, pos, + bio->bi_size); + if (unlikely(ret && ret != -EINVAL && + ret != -EOPNOTSUPP)) + ret = -EIO; + goto out; + } + ret = lo_send(lo, bio, pos); if ((bio->bi_rw & REQ_FUA) && !ret) { @@ -622,7 +647,7 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, goto out_putf; fput(old_file); - if (max_part > 0) + if (lo->lo_flags & LO_FLAGS_PARTSCAN) ioctl_by_bdev(bdev, BLKRRPART, 0); return 0; @@ -699,16 +724,25 @@ static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf) return sprintf(buf, "%s\n", autoclear ? "1" : "0"); } +static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf) +{ + int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN); + + return sprintf(buf, "%s\n", partscan ? "1" : "0"); +} + LOOP_ATTR_RO(backing_file); LOOP_ATTR_RO(offset); LOOP_ATTR_RO(sizelimit); LOOP_ATTR_RO(autoclear); +LOOP_ATTR_RO(partscan); static struct attribute *loop_attrs[] = { &loop_attr_backing_file.attr, &loop_attr_offset.attr, &loop_attr_sizelimit.attr, &loop_attr_autoclear.attr, + &loop_attr_partscan.attr, NULL, }; @@ -729,6 +763,35 @@ static void loop_sysfs_exit(struct loop_device *lo) &loop_attribute_group); } +static void loop_config_discard(struct loop_device *lo) +{ + struct file *file = lo->lo_backing_file; + struct inode *inode = file->f_mapping->host; + struct request_queue *q = lo->lo_queue; + + /* + * We use punch hole to reclaim the free space used by the + * image a.k.a. discard. However we do support discard if + * encryption is enabled, because it may give an attacker + * useful information. + */ + if ((!file->f_op->fallocate) || + lo->lo_encrypt_key_size) { + q->limits.discard_granularity = 0; + q->limits.discard_alignment = 0; + q->limits.max_discard_sectors = 0; + q->limits.discard_zeroes_data = 0; + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + return; + } + + q->limits.discard_granularity = inode->i_sb->s_blocksize; + q->limits.discard_alignment = inode->i_sb->s_blocksize; + q->limits.max_discard_sectors = UINT_MAX >> 9; + q->limits.discard_zeroes_data = 1; + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); +} + static int loop_set_fd(struct loop_device *lo, fmode_t mode, struct block_device *bdev, unsigned int arg) { @@ -829,7 +892,9 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, } lo->lo_state = Lo_bound; wake_up_process(lo->lo_thread); - if (max_part > 0) + if (part_shift) + lo->lo_flags |= LO_FLAGS_PARTSCAN; + if (lo->lo_flags & LO_FLAGS_PARTSCAN) ioctl_by_bdev(bdev, BLKRRPART, 0); return 0; @@ -890,10 +955,11 @@ loop_init_xfer(struct loop_device *lo, struct loop_func_table *xfer, return err; } -static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev) +static int loop_clr_fd(struct loop_device *lo) { struct file *filp = lo->lo_backing_file; gfp_t gfp = lo->old_gfp_mask; + struct block_device *bdev = lo->lo_device; if (lo->lo_state != Lo_bound) return -ENXIO; @@ -922,7 +988,6 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev) lo->lo_offset = 0; lo->lo_sizelimit = 0; lo->lo_encrypt_key_size = 0; - lo->lo_flags = 0; lo->lo_thread = NULL; memset(lo->lo_encrypt_key, 0, LO_KEY_SIZE); memset(lo->lo_crypt_name, 0, LO_NAME_SIZE); @@ -940,8 +1005,11 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev) lo->lo_state = Lo_unbound; /* This is safe: open() is still holding a reference. */ module_put(THIS_MODULE); - if (max_part > 0 && bdev) + if (lo->lo_flags & LO_FLAGS_PARTSCAN && bdev) ioctl_by_bdev(bdev, BLKRRPART, 0); + lo->lo_flags = 0; + if (!part_shift) + lo->lo_disk->flags |= GENHD_FL_NO_PART_SCAN; mutex_unlock(&lo->lo_ctl_mutex); /* * Need not hold lo_ctl_mutex to fput backing file. @@ -995,6 +1063,7 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) if (figure_loop_size(lo)) return -EFBIG; } + loop_config_discard(lo); memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); memcpy(lo->lo_crypt_name, info->lo_crypt_name, LO_NAME_SIZE); @@ -1010,6 +1079,13 @@ loop_set_status(struct loop_device *lo, const struct loop_info64 *info) (info->lo_flags & LO_FLAGS_AUTOCLEAR)) lo->lo_flags ^= LO_FLAGS_AUTOCLEAR; + if ((info->lo_flags & LO_FLAGS_PARTSCAN) && + !(lo->lo_flags & LO_FLAGS_PARTSCAN)) { + lo->lo_flags |= LO_FLAGS_PARTSCAN; + lo->lo_disk->flags &= ~GENHD_FL_NO_PART_SCAN; + ioctl_by_bdev(lo->lo_device, BLKRRPART, 0); + } + lo->lo_encrypt_key_size = info->lo_encrypt_key_size; lo->lo_init[0] = info->lo_init[0]; lo->lo_init[1] = info->lo_init[1]; @@ -1203,7 +1279,7 @@ static int lo_ioctl(struct block_device *bdev, fmode_t mode, break; case LOOP_CLR_FD: /* loop_clr_fd would have unlocked lo_ctl_mutex on success */ - err = loop_clr_fd(lo, bdev); + err = loop_clr_fd(lo); if (!err) goto out_unlocked; break; @@ -1423,7 +1499,7 @@ static int lo_release(struct gendisk *disk, fmode_t mode) * In autoclear mode, stop the loop thread * and remove configuration after last close. */ - err = loop_clr_fd(lo, NULL); + err = loop_clr_fd(lo); if (!err) goto out_unlocked; } else { @@ -1545,6 +1621,27 @@ static int loop_add(struct loop_device **l, int i) if (!disk) goto out_free_queue; + /* + * Disable partition scanning by default. The in-kernel partition + * scanning can be requested individually per-device during its + * setup. Userspace can always add and remove partitions from all + * devices. The needed partition minors are allocated from the + * extended minor space, the main loop device numbers will continue + * to match the loop minors, regardless of the number of partitions + * used. + * + * If max_part is given, partition scanning is globally enabled for + * all loop devices. The minors for the main loop devices will be + * multiples of max_part. + * + * Note: Global-for-all-devices, set-only-at-init, read-only module + * parameteters like 'max_loop' and 'max_part' make things needlessly + * complicated, are too static, inflexible and may surprise + * userspace tools. Parameters like this in general should be avoided. + */ + if (!part_shift) + disk->flags |= GENHD_FL_NO_PART_SCAN; + disk->flags |= GENHD_FL_EXT_DEVT; mutex_init(&lo->lo_ctl_mutex); lo->lo_number = i; lo->lo_thread = NULL; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index f533f3375e24..c3f0ee16594d 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -127,8 +127,7 @@ static void sock_shutdown(struct nbd_device *lo, int lock) if (lock) mutex_lock(&lo->tx_lock); if (lo->sock) { - printk(KERN_WARNING "%s: shutting down socket\n", - lo->disk->disk_name); + dev_warn(disk_to_dev(lo->disk), "shutting down socket\n"); kernel_sock_shutdown(lo->sock, SHUT_RDWR); lo->sock = NULL; } @@ -158,8 +157,9 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, sigset_t blocked, oldset; if (unlikely(!sock)) { - printk(KERN_ERR "%s: Attempted %s on closed socket in sock_xmit\n", - lo->disk->disk_name, (send ? "send" : "recv")); + dev_err(disk_to_dev(lo->disk), + "Attempted %s on closed socket in sock_xmit\n", + (send ? "send" : "recv")); return -EINVAL; } @@ -250,8 +250,8 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req) result = sock_xmit(lo, 1, &request, sizeof(request), (nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0); if (result <= 0) { - printk(KERN_ERR "%s: Send control failed (result %d)\n", - lo->disk->disk_name, result); + dev_err(disk_to_dev(lo->disk), + "Send control failed (result %d)\n", result); goto error_out; } @@ -270,8 +270,9 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req) lo->disk->disk_name, req, bvec->bv_len); result = sock_send_bvec(lo, bvec, flags); if (result <= 0) { - printk(KERN_ERR "%s: Send data failed (result %d)\n", - lo->disk->disk_name, result); + dev_err(disk_to_dev(lo->disk), + "Send data failed (result %d)\n", + result); goto error_out; } } @@ -328,14 +329,13 @@ static struct request *nbd_read_stat(struct nbd_device *lo) reply.magic = 0; result = sock_xmit(lo, 0, &reply, sizeof(reply), MSG_WAITALL); if (result <= 0) { - printk(KERN_ERR "%s: Receive control failed (result %d)\n", - lo->disk->disk_name, result); + dev_err(disk_to_dev(lo->disk), + "Receive control failed (result %d)\n", result); goto harderror; } if (ntohl(reply.magic) != NBD_REPLY_MAGIC) { - printk(KERN_ERR "%s: Wrong magic (0x%lx)\n", - lo->disk->disk_name, + dev_err(disk_to_dev(lo->disk), "Wrong magic (0x%lx)\n", (unsigned long)ntohl(reply.magic)); result = -EPROTO; goto harderror; @@ -347,15 +347,15 @@ static struct request *nbd_read_stat(struct nbd_device *lo) if (result != -ENOENT) goto harderror; - printk(KERN_ERR "%s: Unexpected reply (%p)\n", - lo->disk->disk_name, reply.handle); + dev_err(disk_to_dev(lo->disk), "Unexpected reply (%p)\n", + reply.handle); result = -EBADR; goto harderror; } if (ntohl(reply.error)) { - printk(KERN_ERR "%s: Other side returned error (%d)\n", - lo->disk->disk_name, ntohl(reply.error)); + dev_err(disk_to_dev(lo->disk), "Other side returned error (%d)\n", + ntohl(reply.error)); req->errors++; return req; } @@ -369,8 +369,8 @@ static struct request *nbd_read_stat(struct nbd_device *lo) rq_for_each_segment(bvec, req, iter) { result = sock_recv_bvec(lo, bvec); if (result <= 0) { - printk(KERN_ERR "%s: Receive data failed (result %d)\n", - lo->disk->disk_name, result); + dev_err(disk_to_dev(lo->disk), "Receive data failed (result %d)\n", + result); req->errors++; return req; } @@ -405,10 +405,10 @@ static int nbd_do_it(struct nbd_device *lo) BUG_ON(lo->magic != LO_MAGIC); - lo->pid = current->pid; - ret = sysfs_create_file(&disk_to_dev(lo->disk)->kobj, &pid_attr.attr); + lo->pid = task_pid_nr(current); + ret = device_create_file(disk_to_dev(lo->disk), &pid_attr); if (ret) { - printk(KERN_ERR "nbd: sysfs_create_file failed!"); + dev_err(disk_to_dev(lo->disk), "device_create_file failed!\n"); lo->pid = 0; return ret; } @@ -416,7 +416,7 @@ static int nbd_do_it(struct nbd_device *lo) while ((req = nbd_read_stat(lo)) != NULL) nbd_end_request(req); - sysfs_remove_file(&disk_to_dev(lo->disk)->kobj, &pid_attr.attr); + device_remove_file(disk_to_dev(lo->disk), &pid_attr); lo->pid = 0; return 0; } @@ -457,8 +457,8 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req) if (rq_data_dir(req) == WRITE) { nbd_cmd(req) = NBD_CMD_WRITE; if (lo->flags & NBD_READ_ONLY) { - printk(KERN_ERR "%s: Write on read-only\n", - lo->disk->disk_name); + dev_err(disk_to_dev(lo->disk), + "Write on read-only\n"); goto error_out; } } @@ -468,16 +468,15 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req) mutex_lock(&lo->tx_lock); if (unlikely(!lo->sock)) { mutex_unlock(&lo->tx_lock); - printk(KERN_ERR "%s: Attempted send on closed socket\n", - lo->disk->disk_name); + dev_err(disk_to_dev(lo->disk), + "Attempted send on closed socket\n"); goto error_out; } lo->active_req = req; if (nbd_send_req(lo, req) != 0) { - printk(KERN_ERR "%s: Request send failed\n", - lo->disk->disk_name); + dev_err(disk_to_dev(lo->disk), "Request send failed\n"); req->errors++; nbd_end_request(req); } else { @@ -549,8 +548,8 @@ static void do_nbd_request(struct request_queue *q) BUG_ON(lo->magic != LO_MAGIC); if (unlikely(!lo->sock)) { - printk(KERN_ERR "%s: Attempted send on closed socket\n", - lo->disk->disk_name); + dev_err(disk_to_dev(lo->disk), + "Attempted send on closed socket\n"); req->errors++; nbd_end_request(req); spin_lock_irq(q->queue_lock); @@ -576,7 +575,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, case NBD_DISCONNECT: { struct request sreq; - printk(KERN_INFO "%s: NBD_DISCONNECT\n", lo->disk->disk_name); + dev_info(disk_to_dev(lo->disk), "NBD_DISCONNECT\n"); blk_rq_init(NULL, &sreq); sreq.cmd_type = REQ_TYPE_SPECIAL; @@ -674,7 +673,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, file = lo->file; lo->file = NULL; nbd_clear_que(lo); - printk(KERN_WARNING "%s: queue cleared\n", lo->disk->disk_name); + dev_warn(disk_to_dev(lo->disk), "queue cleared\n"); if (file) fput(file); lo->bytesize = 0; @@ -694,8 +693,8 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, return 0; case NBD_PRINT_DEBUG: - printk(KERN_INFO "%s: next = %p, prev = %p, head = %p\n", - bdev->bd_disk->disk_name, + dev_info(disk_to_dev(lo->disk), + "next = %p, prev = %p, head = %p\n", lo->queue_head.next, lo->queue_head.prev, &lo->queue_head); return 0; @@ -745,7 +744,7 @@ static int __init nbd_init(void) BUILD_BUG_ON(sizeof(struct nbd_request) != 28); if (max_part < 0) { - printk(KERN_CRIT "nbd: max_part must be >= 0\n"); + printk(KERN_ERR "nbd: max_part must be >= 0\n"); return -EINVAL; } diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 1540792b1e54..15ec4db194d1 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -39,6 +39,9 @@ #include <linux/list.h> #include <linux/delay.h> #include <linux/freezer.h> +#include <linux/loop.h> +#include <linux/falloc.h> +#include <linux/fs.h> #include <xen/events.h> #include <xen/page.h> @@ -258,13 +261,16 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) static void print_stats(struct xen_blkif *blkif) { - pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d\n", + pr_info("xen-blkback (%s): oo %3d | rd %4d | wr %4d | f %4d" + " | ds %4d\n", current->comm, blkif->st_oo_req, - blkif->st_rd_req, blkif->st_wr_req, blkif->st_f_req); + blkif->st_rd_req, blkif->st_wr_req, + blkif->st_f_req, blkif->st_ds_req); blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); blkif->st_rd_req = 0; blkif->st_wr_req = 0; blkif->st_oo_req = 0; + blkif->st_ds_req = 0; } int xen_blkif_schedule(void *arg) @@ -410,6 +416,59 @@ static int xen_blkbk_map(struct blkif_request *req, return ret; } +static void xen_blk_discard(struct xen_blkif *blkif, struct blkif_request *req) +{ + int err = 0; + int status = BLKIF_RSP_OKAY; + struct block_device *bdev = blkif->vbd.bdev; + + if (blkif->blk_backend_type == BLKIF_BACKEND_PHY) + /* just forward the discard request */ + err = blkdev_issue_discard(bdev, + req->u.discard.sector_number, + req->u.discard.nr_sectors, + GFP_KERNEL, 0); + else if (blkif->blk_backend_type == BLKIF_BACKEND_FILE) { + /* punch a hole in the backing file */ + struct loop_device *lo = bdev->bd_disk->private_data; + struct file *file = lo->lo_backing_file; + + if (file->f_op->fallocate) + err = file->f_op->fallocate(file, + FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, + req->u.discard.sector_number << 9, + req->u.discard.nr_sectors << 9); + else + err = -EOPNOTSUPP; + } else + err = -EOPNOTSUPP; + + if (err == -EOPNOTSUPP) { + pr_debug(DRV_PFX "discard op failed, not supported\n"); + status = BLKIF_RSP_EOPNOTSUPP; + } else if (err) + status = BLKIF_RSP_ERROR; + + make_response(blkif, req->id, req->operation, status); +} + +static void xen_blk_drain_io(struct xen_blkif *blkif) +{ + atomic_set(&blkif->drain, 1); + do { + /* The initial value is one, and one refcnt taken at the + * start of the xen_blkif_schedule thread. */ + if (atomic_read(&blkif->refcnt) <= 2) + break; + wait_for_completion_interruptible_timeout( + &blkif->drain_complete, HZ); + + if (!atomic_read(&blkif->drain)) + break; + } while (!kthread_should_stop()); + atomic_set(&blkif->drain, 0); +} + /* * Completion callback on the bio's. Called as bh->b_end_io() */ @@ -422,6 +481,11 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) pr_debug(DRV_PFX "flush diskcache op failed, not supported\n"); xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; + } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && + (error == -EOPNOTSUPP)) { + pr_debug(DRV_PFX "write barrier op failed, not supported\n"); + xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); + pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if (error) { pr_debug(DRV_PFX "Buffer not up-to-date at end of operation," " error=%d\n", error); @@ -438,6 +502,10 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) make_response(pending_req->blkif, pending_req->id, pending_req->operation, pending_req->status); xen_blkif_put(pending_req->blkif); + if (atomic_read(&pending_req->blkif->refcnt) <= 2) { + if (atomic_read(&pending_req->blkif->drain)) + complete(&pending_req->blkif->drain_complete); + } free_req(pending_req); } } @@ -532,7 +600,6 @@ do_block_io_op(struct xen_blkif *blkif) return more_to_do; } - /* * Transmutation of the 'struct blkif_request' to a proper 'struct bio' * and call the 'submit_bio' to pass it to the underlying storage. @@ -549,6 +616,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, int i, nbio = 0; int operation; struct blk_plug plug; + bool drain = false; switch (req->operation) { case BLKIF_OP_READ: @@ -559,11 +627,16 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, blkif->st_wr_req++; operation = WRITE_ODIRECT; break; + case BLKIF_OP_WRITE_BARRIER: + drain = true; case BLKIF_OP_FLUSH_DISKCACHE: blkif->st_f_req++; operation = WRITE_FLUSH; break; - case BLKIF_OP_WRITE_BARRIER: + case BLKIF_OP_DISCARD: + blkif->st_ds_req++; + operation = REQ_DISCARD; + break; default: operation = 0; /* make gcc happy */ goto fail_response; @@ -572,7 +645,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, /* Check that the number of segments is sane. */ nseg = req->nr_segments; - if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || + if (unlikely(nseg == 0 && operation != WRITE_FLUSH && + operation != REQ_DISCARD) || unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { pr_debug(DRV_PFX "Bad number of segments in request (%d)\n", nseg); @@ -621,16 +695,25 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, } } + /* Wait on all outstanding I/O's and once that has been completed + * issue the WRITE_FLUSH. + */ + if (drain) + xen_blk_drain_io(pending_req->blkif); + /* * If we have failed at this point, we need to undo the M2P override, * set gnttab_set_unmap_op on all of the grant references and perform * the hypercall to unmap the grants - that is all done in * xen_blkbk_unmap. */ - if (xen_blkbk_map(req, pending_req, seg)) + if (operation != REQ_DISCARD && xen_blkbk_map(req, pending_req, seg)) goto fail_flush; - /* This corresponding xen_blkif_put is done in __end_block_io_op */ + /* + * This corresponding xen_blkif_put is done in __end_block_io_op, or + * below (in "!bio") if we are handling a BLKIF_OP_DISCARD. + */ xen_blkif_get(blkif); for (i = 0; i < nseg; i++) { @@ -654,18 +737,25 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, preq.sector_number += seg[i].nsec; } - /* This will be hit if the operation was a flush. */ + /* This will be hit if the operation was a flush or discard. */ if (!bio) { - BUG_ON(operation != WRITE_FLUSH); + BUG_ON(operation != WRITE_FLUSH && operation != REQ_DISCARD); - bio = bio_alloc(GFP_KERNEL, 0); - if (unlikely(bio == NULL)) - goto fail_put_bio; + if (operation == WRITE_FLUSH) { + bio = bio_alloc(GFP_KERNEL, 0); + if (unlikely(bio == NULL)) + goto fail_put_bio; - biolist[nbio++] = bio; - bio->bi_bdev = preq.bdev; - bio->bi_private = pending_req; - bio->bi_end_io = end_block_io_op; + biolist[nbio++] = bio; + bio->bi_bdev = preq.bdev; + bio->bi_private = pending_req; + bio->bi_end_io = end_block_io_op; + } else if (operation == REQ_DISCARD) { + xen_blk_discard(blkif, req); + xen_blkif_put(blkif); + free_req(pending_req); + return 0; + } } /* @@ -685,7 +775,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, if (operation == READ) blkif->st_rd_sect += preq.nr_sects; - else if (operation == WRITE || operation == WRITE_FLUSH) + else if (operation & WRITE) blkif->st_wr_sect += preq.nr_sects; return 0; @@ -765,9 +855,9 @@ static int __init xen_blkif_init(void) mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; - blkbk->pending_reqs = kmalloc(sizeof(blkbk->pending_reqs[0]) * + blkbk->pending_reqs = kzalloc(sizeof(blkbk->pending_reqs[0]) * xen_blkif_reqs, GFP_KERNEL); - blkbk->pending_grant_handles = kzalloc(sizeof(blkbk->pending_grant_handles[0]) * + blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) * mmap_pages, GFP_KERNEL); blkbk->pending_pages = kzalloc(sizeof(blkbk->pending_pages[0]) * mmap_pages, GFP_KERNEL); @@ -790,8 +880,6 @@ static int __init xen_blkif_init(void) if (rc) goto failed_init; - memset(blkbk->pending_reqs, 0, sizeof(blkbk->pending_reqs)); - INIT_LIST_HEAD(&blkbk->pending_free); spin_lock_init(&blkbk->pending_free_lock); init_waitqueue_head(&blkbk->pending_free_wq); diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index c4bd34063ecc..de09f525d6c1 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -62,13 +62,26 @@ struct blkif_common_response { /* i386 protocol version */ #pragma pack(push, 4) + +struct blkif_x86_32_request_rw { + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; + +struct blkif_x86_32_request_discard { + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + uint64_t nr_sectors; +}; + struct blkif_x86_32_request { uint8_t operation; /* BLKIF_OP_??? */ uint8_t nr_segments; /* number of segments */ blkif_vdev_t handle; /* only for read/write requests */ uint64_t id; /* private guest value, echoed in resp */ - blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ - struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + union { + struct blkif_x86_32_request_rw rw; + struct blkif_x86_32_request_discard discard; + } u; }; struct blkif_x86_32_response { uint64_t id; /* copied from request */ @@ -78,13 +91,26 @@ struct blkif_x86_32_response { #pragma pack(pop) /* x86_64 protocol version */ + +struct blkif_x86_64_request_rw { + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; +}; + +struct blkif_x86_64_request_discard { + blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ + uint64_t nr_sectors; +}; + struct blkif_x86_64_request { uint8_t operation; /* BLKIF_OP_??? */ uint8_t nr_segments; /* number of segments */ blkif_vdev_t handle; /* only for read/write requests */ uint64_t __attribute__((__aligned__(8))) id; - blkif_sector_t sector_number;/* start sector idx on disk (r/w only) */ - struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + union { + struct blkif_x86_64_request_rw rw; + struct blkif_x86_64_request_discard discard; + } u; }; struct blkif_x86_64_response { uint64_t __attribute__((__aligned__(8))) id; @@ -112,6 +138,11 @@ enum blkif_protocol { BLKIF_PROTOCOL_X86_64 = 3, }; +enum blkif_backend_type { + BLKIF_BACKEND_PHY = 1, + BLKIF_BACKEND_FILE = 2, +}; + struct xen_vbd { /* What the domain refers to this vbd as. */ blkif_vdev_t handle; @@ -137,6 +168,7 @@ struct xen_blkif { unsigned int irq; /* Comms information. */ enum blkif_protocol blk_protocol; + enum blkif_backend_type blk_backend_type; union blkif_back_rings blk_rings; struct vm_struct *blk_ring_area; /* The VBD attached to this interface. */ @@ -148,6 +180,9 @@ struct xen_blkif { atomic_t refcnt; wait_queue_head_t wq; + /* for barrier (drain) requests */ + struct completion drain_complete; + atomic_t drain; /* One thread per one blkif. */ struct task_struct *xenblkd; unsigned int waiting_reqs; @@ -158,6 +193,7 @@ struct xen_blkif { int st_wr_req; int st_oo_req; int st_f_req; + int st_ds_req; int st_rd_sect; int st_wr_sect; @@ -181,7 +217,7 @@ struct xen_blkif { struct phys_req { unsigned short dev; - unsigned short nr_sects; + blkif_sector_t nr_sects; struct block_device *bdev; blkif_sector_t sector_number; }; @@ -195,6 +231,8 @@ int xen_blkif_schedule(void *arg); int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, struct backend_info *be, int state); +int xen_blkbk_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state); struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be); static inline void blkif_get_x86_32_req(struct blkif_request *dst, @@ -205,12 +243,25 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst, dst->nr_segments = src->nr_segments; dst->handle = src->handle; dst->id = src->id; - dst->u.rw.sector_number = src->sector_number; - barrier(); - if (n > dst->nr_segments) - n = dst->nr_segments; - for (i = 0; i < n; i++) - dst->u.rw.seg[i] = src->seg[i]; + switch (src->operation) { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + case BLKIF_OP_WRITE_BARRIER: + case BLKIF_OP_FLUSH_DISKCACHE: + dst->u.rw.sector_number = src->u.rw.sector_number; + barrier(); + if (n > dst->nr_segments) + n = dst->nr_segments; + for (i = 0; i < n; i++) + dst->u.rw.seg[i] = src->u.rw.seg[i]; + break; + case BLKIF_OP_DISCARD: + dst->u.discard.sector_number = src->u.discard.sector_number; + dst->u.discard.nr_sectors = src->u.discard.nr_sectors; + break; + default: + break; + } } static inline void blkif_get_x86_64_req(struct blkif_request *dst, @@ -221,12 +272,25 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst, dst->nr_segments = src->nr_segments; dst->handle = src->handle; dst->id = src->id; - dst->u.rw.sector_number = src->sector_number; - barrier(); - if (n > dst->nr_segments) - n = dst->nr_segments; - for (i = 0; i < n; i++) - dst->u.rw.seg[i] = src->seg[i]; + switch (src->operation) { + case BLKIF_OP_READ: + case BLKIF_OP_WRITE: + case BLKIF_OP_WRITE_BARRIER: + case BLKIF_OP_FLUSH_DISKCACHE: + dst->u.rw.sector_number = src->u.rw.sector_number; + barrier(); + if (n > dst->nr_segments) + n = dst->nr_segments; + for (i = 0; i < n; i++) + dst->u.rw.seg[i] = src->u.rw.seg[i]; + break; + case BLKIF_OP_DISCARD: + dst->u.discard.sector_number = src->u.discard.sector_number; + dst->u.discard.nr_sectors = src->u.discard.nr_sectors; + break; + default: + break; + } } #endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */ diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 5fd2010f7d2b..2c008afe63d9 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -114,6 +114,8 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) spin_lock_init(&blkif->blk_ring_lock); atomic_set(&blkif->refcnt, 1); init_waitqueue_head(&blkif->wq); + init_completion(&blkif->drain_complete); + atomic_set(&blkif->drain, 0); blkif->st_print = jiffies; init_waitqueue_head(&blkif->waiting_to_free); @@ -272,6 +274,7 @@ VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req); VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); VBD_SHOW(f_req, "%d\n", be->blkif->st_f_req); +VBD_SHOW(ds_req, "%d\n", be->blkif->st_ds_req); VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect); VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect); @@ -280,6 +283,7 @@ static struct attribute *xen_vbdstat_attrs[] = { &dev_attr_rd_req.attr, &dev_attr_wr_req.attr, &dev_attr_f_req.attr, + &dev_attr_ds_req.attr, &dev_attr_rd_sect.attr, &dev_attr_wr_sect.attr, NULL @@ -419,6 +423,73 @@ int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, return err; } +int xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be) +{ + struct xenbus_device *dev = be->dev; + struct xen_blkif *blkif = be->blkif; + char *type; + int err; + int state = 0; + + type = xenbus_read(XBT_NIL, dev->nodename, "type", NULL); + if (!IS_ERR(type)) { + if (strncmp(type, "file", 4) == 0) { + state = 1; + blkif->blk_backend_type = BLKIF_BACKEND_FILE; + } + if (strncmp(type, "phy", 3) == 0) { + struct block_device *bdev = be->blkif->vbd.bdev; + struct request_queue *q = bdev_get_queue(bdev); + if (blk_queue_discard(q)) { + err = xenbus_printf(xbt, dev->nodename, + "discard-granularity", "%u", + q->limits.discard_granularity); + if (err) { + xenbus_dev_fatal(dev, err, + "writing discard-granularity"); + goto kfree; + } + err = xenbus_printf(xbt, dev->nodename, + "discard-alignment", "%u", + q->limits.discard_alignment); + if (err) { + xenbus_dev_fatal(dev, err, + "writing discard-alignment"); + goto kfree; + } + state = 1; + blkif->blk_backend_type = BLKIF_BACKEND_PHY; + } + } + } else { + err = PTR_ERR(type); + xenbus_dev_fatal(dev, err, "reading type"); + goto out; + } + + err = xenbus_printf(xbt, dev->nodename, "feature-discard", + "%d", state); + if (err) + xenbus_dev_fatal(dev, err, "writing feature-discard"); +kfree: + kfree(type); +out: + return err; +} +int xen_blkbk_barrier(struct xenbus_transaction xbt, + struct backend_info *be, int state) +{ + struct xenbus_device *dev = be->dev; + int err; + + err = xenbus_printf(xbt, dev->nodename, "feature-barrier", + "%d", state); + if (err) + xenbus_dev_fatal(dev, err, "writing feature-barrier"); + + return err; +} + /* * Entry point to this code when a new device is created. Allocate the basic * structures, and watch the store waiting for the hotplug scripts to tell us @@ -650,6 +721,11 @@ again: if (err) goto abort; + err = xen_blkbk_discard(xbt, be); + + /* If we can't advertise it is OK. */ + err = xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support); + err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", (unsigned long long)vbd_sz(&be->blkif->vbd)); if (err) { diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 9ea8c2576c70..7b2ec5908413 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -98,6 +98,9 @@ struct blkfront_info unsigned long shadow_free; unsigned int feature_flush; unsigned int flush_op; + unsigned int feature_discard; + unsigned int discard_granularity; + unsigned int discard_alignment; int is_ready; }; @@ -302,29 +305,36 @@ static int blkif_queue_request(struct request *req) ring_req->operation = info->flush_op; } - ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); - BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); + if (unlikely(req->cmd_flags & REQ_DISCARD)) { + /* id, sector_number and handle are set above. */ + ring_req->operation = BLKIF_OP_DISCARD; + ring_req->nr_segments = 0; + ring_req->u.discard.nr_sectors = blk_rq_sectors(req); + } else { + ring_req->nr_segments = blk_rq_map_sg(req->q, req, info->sg); + BUG_ON(ring_req->nr_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST); - for_each_sg(info->sg, sg, ring_req->nr_segments, i) { - buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg))); - fsect = sg->offset >> 9; - lsect = fsect + (sg->length >> 9) - 1; - /* install a grant reference. */ - ref = gnttab_claim_grant_reference(&gref_head); - BUG_ON(ref == -ENOSPC); + for_each_sg(info->sg, sg, ring_req->nr_segments, i) { + buffer_mfn = pfn_to_mfn(page_to_pfn(sg_page(sg))); + fsect = sg->offset >> 9; + lsect = fsect + (sg->length >> 9) - 1; + /* install a grant reference. */ + ref = gnttab_claim_grant_reference(&gref_head); + BUG_ON(ref == -ENOSPC); - gnttab_grant_foreign_access_ref( - ref, - info->xbdev->otherend_id, - buffer_mfn, - rq_data_dir(req) ); - - info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); - ring_req->u.rw.seg[i] = - (struct blkif_request_segment) { - .gref = ref, - .first_sect = fsect, - .last_sect = lsect }; + gnttab_grant_foreign_access_ref( + ref, + info->xbdev->otherend_id, + buffer_mfn, + rq_data_dir(req)); + + info->shadow[id].frame[i] = mfn_to_pfn(buffer_mfn); + ring_req->u.rw.seg[i] = + (struct blkif_request_segment) { + .gref = ref, + .first_sect = fsect, + .last_sect = lsect }; + } } info->ring.req_prod_pvt++; @@ -370,7 +380,9 @@ static void do_blkif_request(struct request_queue *rq) blk_start_request(req); - if (req->cmd_type != REQ_TYPE_FS) { + if ((req->cmd_type != REQ_TYPE_FS) || + ((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) && + !info->flush_op)) { __blk_end_request_all(req, -EIO); continue; } @@ -399,6 +411,7 @@ wait: static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) { struct request_queue *rq; + struct blkfront_info *info = gd->private_data; rq = blk_init_queue(do_blkif_request, &blkif_io_lock); if (rq == NULL) @@ -406,6 +419,13 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); + if (info->feature_discard) { + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, rq); + blk_queue_max_discard_sectors(rq, get_capacity(gd)); + rq->limits.discard_granularity = info->discard_granularity; + rq->limits.discard_alignment = info->discard_alignment; + } + /* Hard sector size and max sectors impersonate the equiv. hardware. */ blk_queue_logical_block_size(rq, sector_size); blk_queue_max_hw_sectors(rq, 512); @@ -722,6 +742,17 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) error = (bret->status == BLKIF_RSP_OKAY) ? 0 : -EIO; switch (bret->operation) { + case BLKIF_OP_DISCARD: + if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { + struct request_queue *rq = info->rq; + printk(KERN_WARNING "blkfront: %s: discard op failed\n", + info->gd->disk_name); + error = -EOPNOTSUPP; + info->feature_discard = 0; + queue_flag_clear(QUEUE_FLAG_DISCARD, rq); + } + __blk_end_request_all(req, error); + break; case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_WRITE_BARRIER: if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { @@ -1098,6 +1129,33 @@ blkfront_closing(struct blkfront_info *info) bdput(bdev); } +static void blkfront_setup_discard(struct blkfront_info *info) +{ + int err; + char *type; + unsigned int discard_granularity; + unsigned int discard_alignment; + + type = xenbus_read(XBT_NIL, info->xbdev->otherend, "type", NULL); + if (IS_ERR(type)) + return; + + if (strncmp(type, "phy", 3) == 0) { + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "discard-granularity", "%u", &discard_granularity, + "discard-alignment", "%u", &discard_alignment, + NULL); + if (!err) { + info->feature_discard = 1; + info->discard_granularity = discard_granularity; + info->discard_alignment = discard_alignment; + } + } else if (strncmp(type, "file", 4) == 0) + info->feature_discard = 1; + + kfree(type); +} + /* * Invoked when the backend is finally 'ready' (and has told produced * the details about the physical device - #sectors, size, etc). @@ -1108,7 +1166,7 @@ static void blkfront_connect(struct blkfront_info *info) unsigned long sector_size; unsigned int binfo; int err; - int barrier, flush; + int barrier, flush, discard; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -1178,7 +1236,14 @@ static void blkfront_connect(struct blkfront_info *info) info->feature_flush = REQ_FLUSH; info->flush_op = BLKIF_OP_FLUSH_DISKCACHE; } - + + err = xenbus_gather(XBT_NIL, info->xbdev->otherend, + "feature-discard", "%d", &discard, + NULL); + + if (!err && discard) + blkfront_setup_discard(info); + err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size); if (err) { xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s", @@ -1385,6 +1450,8 @@ static struct xenbus_driver blkfront = { static int __init xlblk_init(void) { + int ret; + if (!xen_domain()) return -ENODEV; @@ -1394,7 +1461,13 @@ static int __init xlblk_init(void) return -ENODEV; } - return xenbus_register_frontend(&blkfront); + ret = xenbus_register_frontend(&blkfront); + if (ret) { + unregister_blkdev(XENVBD_MAJOR, DEV_NAME); + return ret; + } + + return 0; } module_init(xlblk_init); |