summaryrefslogtreecommitdiffstats
path: root/drivers/block
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block')
-rw-r--r--drivers/block/DAC960.c14
-rw-r--r--drivers/block/Kconfig17
-rw-r--r--drivers/block/Makefile1
-rw-r--r--drivers/block/amiflop.c79
-rw-r--r--drivers/block/aoe/aoeblk.c13
-rw-r--r--drivers/block/aoe/aoechr.c10
-rw-r--r--drivers/block/aoe/aoedev.c4
-rw-r--r--drivers/block/ataflop.c65
-rw-r--r--drivers/block/brd.c8
-rw-r--r--drivers/block/cciss.c891
-rw-r--r--drivers/block/cpqarray.c15
-rw-r--r--drivers/block/drbd/drbd_actlog.c41
-rw-r--r--drivers/block/drbd/drbd_bitmap.c2
-rw-r--r--drivers/block/drbd/drbd_int.h219
-rw-r--r--drivers/block/drbd/drbd_main.c606
-rw-r--r--drivers/block/drbd/drbd_nl.c270
-rw-r--r--drivers/block/drbd/drbd_proc.c34
-rw-r--r--drivers/block/drbd/drbd_receiver.c949
-rw-r--r--drivers/block/drbd/drbd_req.c165
-rw-r--r--drivers/block/drbd/drbd_req.h62
-rw-r--r--drivers/block/drbd/drbd_worker.c292
-rw-r--r--drivers/block/floppy.c82
-rw-r--r--drivers/block/loop.c136
-rw-r--r--drivers/block/nbd.c7
-rw-r--r--drivers/block/osdblk.c5
-rw-r--r--drivers/block/paride/pcd.c15
-rw-r--r--drivers/block/paride/pd.c15
-rw-r--r--drivers/block/paride/pf.c17
-rw-r--r--drivers/block/paride/pg.c8
-rw-r--r--drivers/block/paride/pt.c20
-rw-r--r--drivers/block/pktcdvd.c18
-rw-r--r--drivers/block/ps3disk.c4
-rw-r--r--drivers/block/rbd.c1841
-rw-r--r--drivers/block/rbd_types.h73
-rw-r--r--drivers/block/swim.c15
-rw-r--r--drivers/block/swim3.c15
-rw-r--r--drivers/block/ub.c17
-rw-r--r--drivers/block/viodasd.c11
-rw-r--r--drivers/block/virtio_blk.c54
-rw-r--r--drivers/block/xd.c7
-rw-r--r--drivers/block/xen-blkfront.c65
-rw-r--r--drivers/block/xsysace.c14
-rw-r--r--drivers/block/z2ram.c21
43 files changed, 4553 insertions, 1664 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 4e2c367fec11..1f286ab461d3 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -36,7 +36,7 @@
#include <linux/ioport.h>
#include <linux/mm.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/reboot.h>
@@ -54,6 +54,7 @@
#define DAC960_GAM_MINOR 252
+static DEFINE_MUTEX(DAC960_mutex);
static DAC960_Controller_T *DAC960_Controllers[DAC960_MaxControllers];
static int DAC960_ControllerCount;
static struct proc_dir_entry *DAC960_ProcDirectoryEntry;
@@ -81,7 +82,7 @@ static int DAC960_open(struct block_device *bdev, fmode_t mode)
int drive_nr = (long)disk->private_data;
int ret = -ENXIO;
- lock_kernel();
+ mutex_lock(&DAC960_mutex);
if (p->FirmwareType == DAC960_V1_Controller) {
if (p->V1.LogicalDriveInformation[drive_nr].
LogicalDriveState == DAC960_V1_LogicalDrive_Offline)
@@ -99,7 +100,7 @@ static int DAC960_open(struct block_device *bdev, fmode_t mode)
goto out;
ret = 0;
out:
- unlock_kernel();
+ mutex_unlock(&DAC960_mutex);
return ret;
}
@@ -6625,7 +6626,7 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
long ErrorCode = 0;
if (!capable(CAP_SYS_ADMIN)) return -EACCES;
- lock_kernel();
+ mutex_lock(&DAC960_mutex);
switch (Request)
{
case DAC960_IOCTL_GET_CONTROLLER_COUNT:
@@ -7056,13 +7057,14 @@ static long DAC960_gam_ioctl(struct file *file, unsigned int Request,
default:
ErrorCode = -ENOTTY;
}
- unlock_kernel();
+ mutex_unlock(&DAC960_mutex);
return ErrorCode;
}
static const struct file_operations DAC960_gam_fops = {
.owner = THIS_MODULE,
- .unlocked_ioctl = DAC960_gam_ioctl
+ .unlocked_ioctl = DAC960_gam_ioctl,
+ .llseek = noop_llseek,
};
static struct miscdevice DAC960_gam_dev = {
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index de277689da61..4b9359a6f6ca 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -488,4 +488,21 @@ config BLK_DEV_HD
If unsure, say N.
+config BLK_DEV_RBD
+ tristate "Rados block device (RBD)"
+ depends on INET && EXPERIMENTAL && BLOCK
+ select CEPH_LIB
+ select LIBCRC32C
+ select CRYPTO_AES
+ select CRYPTO
+ default n
+ help
+ Say Y here if you want include the Rados block device, which stripes
+ a block device over objects stored in the Ceph distributed object
+ store.
+
+ More information at http://ceph.newdream.net/.
+
+ If unsure, say N.
+
endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index aff5ac925c34..d7f463d6312d 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -37,5 +37,6 @@ obj-$(CONFIG_BLK_DEV_HD) += hd.o
obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += xen-blkfront.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd/
+obj-$(CONFIG_BLK_DEV_RBD) += rbd.o
swim_mod-objs := swim.o swim_asm.o
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
index 76f114f0bba3..a1725e6488d3 100644
--- a/drivers/block/amiflop.c
+++ b/drivers/block/amiflop.c
@@ -60,7 +60,7 @@
#include <linux/hdreg.h>
#include <linux/delay.h>
#include <linux/init.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/amifdreg.h>
#include <linux/amifd.h>
#include <linux/buffer_head.h>
@@ -109,13 +109,12 @@
#define FD_HD_3 0x55555555 /* high-density 3.5" (1760K) drive */
#define FD_DD_5 0xaaaaaaaa /* double-density 5.25" (440K) drive */
+static DEFINE_MUTEX(amiflop_mutex);
static unsigned long int fd_def_df0 = FD_DD_3; /* default for df0 if it doesn't identify */
module_param(fd_def_df0, ulong, 0);
MODULE_LICENSE("GPL");
-static struct request_queue *floppy_queue;
-
/*
* Macros
*/
@@ -164,6 +163,7 @@ static volatile int selected = -1; /* currently selected drive */
static int writepending;
static int writefromint;
static char *raw_buf;
+static int fdc_queue;
static DEFINE_SPINLOCK(amiflop_lock);
@@ -1334,6 +1334,42 @@ static int get_track(int drive, int track)
return -1;
}
+/*
+ * Round-robin between our available drives, doing one request from each
+ */
+static struct request *set_next_request(void)
+{
+ struct request_queue *q;
+ int cnt = FD_MAX_UNITS;
+ struct request *rq;
+
+ /* Find next queue we can dispatch from */
+ fdc_queue = fdc_queue + 1;
+ if (fdc_queue == FD_MAX_UNITS)
+ fdc_queue = 0;
+
+ for(cnt = FD_MAX_UNITS; cnt > 0; cnt--) {
+
+ if (unit[fdc_queue].type->code == FD_NODRIVE) {
+ if (++fdc_queue == FD_MAX_UNITS)
+ fdc_queue = 0;
+ continue;
+ }
+
+ q = unit[fdc_queue].gendisk->queue;
+ if (q) {
+ rq = blk_fetch_request(q);
+ if (rq)
+ break;
+ }
+
+ if (++fdc_queue == FD_MAX_UNITS)
+ fdc_queue = 0;
+ }
+
+ return rq;
+}
+
static void redo_fd_request(void)
{
struct request *rq;
@@ -1345,7 +1381,7 @@ static void redo_fd_request(void)
int err;
next_req:
- rq = blk_fetch_request(floppy_queue);
+ rq = set_next_request();
if (!rq) {
/* Nothing left to do */
return;
@@ -1506,9 +1542,9 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
{
int ret;
- lock_kernel();
+ mutex_lock(&amiflop_mutex);
ret = fd_locked_ioctl(bdev, mode, cmd, param);
- unlock_kernel();
+ mutex_unlock(&amiflop_mutex);
return ret;
}
@@ -1555,11 +1591,11 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
int old_dev;
unsigned long flags;
- lock_kernel();
+ mutex_lock(&amiflop_mutex);
old_dev = fd_device[drive];
if (fd_ref[drive] && old_dev != system) {
- unlock_kernel();
+ mutex_unlock(&amiflop_mutex);
return -EBUSY;
}
@@ -1575,7 +1611,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
rel_fdc();
if (wrprot) {
- unlock_kernel();
+ mutex_unlock(&amiflop_mutex);
return -EROFS;
}
}
@@ -1594,7 +1630,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
printk(KERN_INFO "fd%d: accessing %s-disk with %s-layout\n",drive,
unit[drive].type->name, data_types[system].name);
- unlock_kernel();
+ mutex_unlock(&amiflop_mutex);
return 0;
}
@@ -1603,7 +1639,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
struct amiga_floppy_struct *p = disk->private_data;
int drive = p - unit;
- lock_kernel();
+ mutex_lock(&amiflop_mutex);
if (unit[drive].dirty == 1) {
del_timer (flush_track_timer + drive);
non_int_flush_track (drive);
@@ -1617,7 +1653,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
/* the mod_use counter is handled this way */
floppy_off (drive | 0x40000000);
#endif
- unlock_kernel();
+ mutex_unlock(&amiflop_mutex);
return 0;
}
@@ -1682,6 +1718,13 @@ static int __init fd_probe_drives(void)
continue;
}
unit[drive].gendisk = disk;
+
+ disk->queue = blk_init_queue(do_fd_request, &amiflop_lock);
+ if (!disk->queue) {
+ unit[drive].type->code = FD_NODRIVE;
+ continue;
+ }
+
drives++;
if ((unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL)) == NULL) {
printk("no mem for ");
@@ -1695,7 +1738,6 @@ static int __init fd_probe_drives(void)
disk->fops = &floppy_fops;
sprintf(disk->disk_name, "fd%d", drive);
disk->private_data = &unit[drive];
- disk->queue = floppy_queue;
set_capacity(disk, 880*2);
add_disk(disk);
}
@@ -1743,11 +1785,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev)
goto out_irq2;
}
- ret = -ENOMEM;
- floppy_queue = blk_init_queue(do_fd_request, &amiflop_lock);
- if (!floppy_queue)
- goto out_queue;
-
ret = -ENODEV;
if (fd_probe_drives() < 1) /* No usable drives */
goto out_probe;
@@ -1791,8 +1828,6 @@ static int __init amiga_floppy_probe(struct platform_device *pdev)
return 0;
out_probe:
- blk_cleanup_queue(floppy_queue);
-out_queue:
free_irq(IRQ_AMIGA_CIAA_TB, NULL);
out_irq2:
free_irq(IRQ_AMIGA_DSKBLK, NULL);
@@ -1810,9 +1845,12 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev)
for( i = 0; i < FD_MAX_UNITS; i++) {
if (unit[i].type->code != FD_NODRIVE) {
+ struct request_queue *q = unit[i].gendisk->queue;
del_gendisk(unit[i].gendisk);
put_disk(unit[i].gendisk);
kfree(unit[i].trackbuf);
+ if (q)
+ blk_cleanup_queue(q);
}
}
blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
@@ -1820,7 +1858,6 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev)
free_irq(IRQ_AMIGA_DSKBLK, NULL);
custom.dmacon = DMAF_DISK; /* disable DMA */
amiga_chip_free(raw_buf);
- blk_cleanup_queue(floppy_queue);
unregister_blkdev(FLOPPY_MAJOR, "fd");
}
#endif
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index a946929735a5..541e18879965 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -4,17 +4,20 @@
* block device routines
*/
+#include <linux/kernel.h>
#include <linux/hdreg.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/fs.h>
#include <linux/ioctl.h>
#include <linux/slab.h>
+#include <linux/ratelimit.h>
#include <linux/genhd.h>
#include <linux/netdevice.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include "aoe.h"
+static DEFINE_MUTEX(aoeblk_mutex);
static struct kmem_cache *buf_pool_cache;
static ssize_t aoedisk_show_state(struct device *dev,
@@ -125,16 +128,16 @@ aoeblk_open(struct block_device *bdev, fmode_t mode)
struct aoedev *d = bdev->bd_disk->private_data;
ulong flags;
- lock_kernel();
+ mutex_lock(&aoeblk_mutex);
spin_lock_irqsave(&d->lock, flags);
if (d->flags & DEVFL_UP) {
d->nopen++;
spin_unlock_irqrestore(&d->lock, flags);
- unlock_kernel();
+ mutex_unlock(&aoeblk_mutex);
return 0;
}
spin_unlock_irqrestore(&d->lock, flags);
- unlock_kernel();
+ mutex_unlock(&aoeblk_mutex);
return -ENODEV;
}
@@ -206,7 +209,7 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
spin_lock_irqsave(&d->lock, flags);
if ((d->flags & DEVFL_UP) == 0) {
- printk(KERN_INFO "aoe: device %ld.%d is not up\n",
+ pr_info_ratelimited("aoe: device %ld.%d is not up\n",
d->aoemajor, d->aoeminor);
spin_unlock_irqrestore(&d->lock, flags);
mempool_free(buf, d->bufpool);
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index 4a1b9e7464aa..146296ca4965 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -9,7 +9,7 @@
#include <linux/completion.h>
#include <linux/delay.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/skbuff.h>
#include "aoe.h"
@@ -37,6 +37,7 @@ struct ErrMsg {
char *msg;
};
+static DEFINE_MUTEX(aoechr_mutex);
static struct ErrMsg emsgs[NMSG];
static int emsgs_head_idx, emsgs_tail_idx;
static struct completion emsgs_comp;
@@ -183,16 +184,16 @@ aoechr_open(struct inode *inode, struct file *filp)
{
int n, i;
- lock_kernel();
+ mutex_lock(&aoechr_mutex);
n = iminor(inode);
filp->private_data = (void *) (unsigned long) n;
for (i = 0; i < ARRAY_SIZE(chardevs); ++i)
if (chardevs[i].minor == n) {
- unlock_kernel();
+ mutex_unlock(&aoechr_mutex);
return 0;
}
- unlock_kernel();
+ mutex_unlock(&aoechr_mutex);
return -EINVAL;
}
@@ -265,6 +266,7 @@ static const struct file_operations aoe_fops = {
.open = aoechr_open,
.release = aoechr_rel,
.owner = THIS_MODULE,
+ .llseek = noop_llseek,
};
static char *aoe_devnode(struct device *dev, mode_t *mode)
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 0849280bfc1c..6b5110a47458 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -102,6 +102,7 @@ aoedev_freedev(struct aoedev *d)
{
struct aoetgt **t, **e;
+ cancel_work_sync(&d->work);
if (d->gd) {
aoedisk_rm_sysfs(d);
del_gendisk(d->gd);
@@ -135,7 +136,6 @@ aoedev_flush(const char __user *str, size_t cnt)
all = !strncmp(buf, "all", 3);
}
- flush_scheduled_work();
spin_lock_irqsave(&devlist_lock, flags);
dd = &devlist;
while ((d = *dd)) {
@@ -257,8 +257,6 @@ aoedev_exit(void)
struct aoedev *d;
ulong flags;
- flush_scheduled_work();
-
while ((d = devlist)) {
devlist = d->next;
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
index aceb96476524..4e4cc6c828cb 100644
--- a/drivers/block/ataflop.c
+++ b/drivers/block/ataflop.c
@@ -67,7 +67,7 @@
#include <linux/delay.h>
#include <linux/init.h>
#include <linux/blkdev.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <asm/atafd.h>
#include <asm/atafdreg.h>
@@ -79,8 +79,9 @@
#undef DEBUG
-static struct request_queue *floppy_queue;
+static DEFINE_MUTEX(ataflop_mutex);
static struct request *fd_request;
+static int fdc_queue;
/* Disk types: DD, HD, ED */
static struct atari_disk_type {
@@ -1391,6 +1392,29 @@ static void setup_req_params( int drive )
ReqTrack, ReqSector, (unsigned long)ReqData ));
}
+/*
+ * Round-robin between our available drives, doing one request from each
+ */
+static struct request *set_next_request(void)
+{
+ struct request_queue *q;
+ int old_pos = fdc_queue;
+ struct request *rq;
+
+ do {
+ q = unit[fdc_queue].disk->queue;
+ if (++fdc_queue == FD_MAX_UNITS)
+ fdc_queue = 0;
+ if (q) {
+ rq = blk_fetch_request(q);
+ if (rq)
+ break;
+ }
+ } while (fdc_queue != old_pos);
+
+ return rq;
+}
+
static void redo_fd_request(void)
{
@@ -1405,7 +1429,7 @@ static void redo_fd_request(void)
repeat:
if (!fd_request) {
- fd_request = blk_fetch_request(floppy_queue);
+ fd_request = set_next_request();
if (!fd_request)
goto the_end;
}
@@ -1671,9 +1695,9 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
{
int ret;
- lock_kernel();
+ mutex_lock(&ataflop_mutex);
ret = fd_locked_ioctl(bdev, mode, cmd, arg);
- unlock_kernel();
+ mutex_unlock(&ataflop_mutex);
return ret;
}
@@ -1854,9 +1878,9 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
{
int ret;
- lock_kernel();
+ mutex_lock(&ataflop_mutex);
ret = floppy_open(bdev, mode);
- unlock_kernel();
+ mutex_unlock(&ataflop_mutex);
return ret;
}
@@ -1864,14 +1888,14 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
static int floppy_release(struct gendisk *disk, fmode_t mode)
{
struct atari_floppy_struct *p = disk->private_data;
- lock_kernel();
+ mutex_lock(&ataflop_mutex);
if (p->ref < 0)
p->ref = 0;
else if (!p->ref--) {
printk(KERN_ERR "floppy_release with fd_ref == 0");
p->ref = 0;
}
- unlock_kernel();
+ mutex_unlock(&ataflop_mutex);
return 0;
}
@@ -1932,10 +1956,6 @@ static int __init atari_floppy_init (void)
PhysTrackBuffer = virt_to_phys(TrackBuffer);
BufferDrive = BufferSide = BufferTrack = -1;
- floppy_queue = blk_init_queue(do_fd_request, &ataflop_lock);
- if (!floppy_queue)
- goto Enomem;
-
for (i = 0; i < FD_MAX_UNITS; i++) {
unit[i].track = -1;
unit[i].flags = 0;
@@ -1944,7 +1964,10 @@ static int __init atari_floppy_init (void)
sprintf(unit[i].disk->disk_name, "fd%d", i);
unit[i].disk->fops = &floppy_fops;
unit[i].disk->private_data = &unit[i];
- unit[i].disk->queue = floppy_queue;
+ unit[i].disk->queue = blk_init_queue(do_fd_request,
+ &ataflop_lock);
+ if (!unit[i].disk->queue)
+ goto Enomem;
set_capacity(unit[i].disk, MAX_DISK_SIZE * 2);
add_disk(unit[i].disk);
}
@@ -1959,10 +1982,14 @@ static int __init atari_floppy_init (void)
return 0;
Enomem:
- while (i--)
+ while (i--) {
+ struct request_queue *q = unit[i].disk->queue;
+
put_disk(unit[i].disk);
- if (floppy_queue)
- blk_cleanup_queue(floppy_queue);
+ if (q)
+ blk_cleanup_queue(q);
+ }
+
unregister_blkdev(FLOPPY_MAJOR, "fd");
return -ENOMEM;
}
@@ -2011,12 +2038,14 @@ static void __exit atari_floppy_exit(void)
int i;
blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
for (i = 0; i < FD_MAX_UNITS; i++) {
+ struct request_queue *q = unit[i].disk->queue;
+
del_gendisk(unit[i].disk);
put_disk(unit[i].disk);
+ blk_cleanup_queue(q);
}
unregister_blkdev(FLOPPY_MAJOR, "fd");
- blk_cleanup_queue(floppy_queue);
del_timer_sync(&fd_timer);
atari_stram_free( DMABuffer );
}
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index 1c7f63792ff8..b7f51e4594f8 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -15,7 +15,7 @@
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/highmem.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/radix-tree.h>
#include <linux/buffer_head.h> /* invalidate_bh_lrus() */
#include <linux/slab.h>
@@ -55,6 +55,7 @@ struct brd_device {
/*
* Look up and return a brd's page for a given sector.
*/
+static DEFINE_MUTEX(brd_mutex);
static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
{
pgoff_t idx;
@@ -402,7 +403,7 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode,
* ram device BLKFLSBUF has special semantics, we want to actually
* release and destroy the ramdisk data.
*/
- lock_kernel();
+ mutex_lock(&brd_mutex);
mutex_lock(&bdev->bd_mutex);
error = -EBUSY;
if (bdev->bd_openers <= 1) {
@@ -419,7 +420,7 @@ static int brd_ioctl(struct block_device *bdev, fmode_t mode,
error = 0;
}
mutex_unlock(&bdev->bd_mutex);
- unlock_kernel();
+ mutex_unlock(&brd_mutex);
return error;
}
@@ -482,7 +483,6 @@ static struct brd_device *brd_alloc(int i)
if (!brd->brd_queue)
goto out_free_dev;
blk_queue_make_request(brd->brd_queue, brd_make_request);
- blk_queue_ordered(brd->brd_queue, QUEUE_ORDERED_TAG);
blk_queue_max_hw_sectors(brd->brd_queue, 1024);
blk_queue_bounce_limit(brd->brd_queue, BLK_BOUNCE_ANY);
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 5e4fadcdece9..2cc4dda46279 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -26,7 +26,6 @@
#include <linux/pci.h>
#include <linux/kernel.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
#include <linux/delay.h>
#include <linux/major.h>
#include <linux/fs.h>
@@ -66,11 +65,7 @@ MODULE_SUPPORTED_DEVICE("HP Smart Array Controllers");
MODULE_VERSION("3.6.26");
MODULE_LICENSE("GPL");
-static int cciss_allow_hpsa;
-module_param(cciss_allow_hpsa, int, S_IRUGO|S_IWUSR);
-MODULE_PARM_DESC(cciss_allow_hpsa,
- "Prevent cciss driver from accessing hardware known to be "
- " supported by the hpsa driver");
+static DEFINE_MUTEX(cciss_mutex);
#include "cciss_cmd.h"
#include "cciss.h"
@@ -98,18 +93,6 @@ static const struct pci_device_id cciss_pci_device_id[] = {
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSD, 0x103C, 0x3215},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x3237},
{PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSC, 0x103C, 0x323D},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3241},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3243},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3245},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3247},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3249},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324A},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x324B},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3250},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3251},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3252},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3253},
- {PCI_VENDOR_ID_HP, PCI_DEVICE_ID_HP_CISSE, 0x103C, 0x3254},
{0,}
};
@@ -137,23 +120,9 @@ static struct board_type products[] = {
{0x3214103C, "Smart Array E200i", &SA5_access},
{0x3215103C, "Smart Array E200i", &SA5_access},
{0x3237103C, "Smart Array E500", &SA5_access},
-/* controllers below this line are also supported by the hpsa driver. */
-#define HPSA_BOUNDARY 0x3223103C
{0x3223103C, "Smart Array P800", &SA5_access},
{0x3234103C, "Smart Array P400", &SA5_access},
{0x323D103C, "Smart Array P700m", &SA5_access},
- {0x3241103C, "Smart Array P212", &SA5_access},
- {0x3243103C, "Smart Array P410", &SA5_access},
- {0x3245103C, "Smart Array P410i", &SA5_access},
- {0x3247103C, "Smart Array P411", &SA5_access},
- {0x3249103C, "Smart Array P812", &SA5_access},
- {0x324A103C, "Smart Array P712m", &SA5_access},
- {0x324B103C, "Smart Array P711m", &SA5_access},
- {0x3250103C, "Smart Array", &SA5_access},
- {0x3251103C, "Smart Array", &SA5_access},
- {0x3252103C, "Smart Array", &SA5_access},
- {0x3253103C, "Smart Array", &SA5_access},
- {0x3254103C, "Smart Array", &SA5_access},
};
/* How long to wait (in milliseconds) for board to go into simple mode */
@@ -1059,9 +1028,9 @@ static int cciss_unlocked_open(struct block_device *bdev, fmode_t mode)
{
int ret;
- lock_kernel();
+ mutex_lock(&cciss_mutex);
ret = cciss_open(bdev, mode);
- unlock_kernel();
+ mutex_unlock(&cciss_mutex);
return ret;
}
@@ -1074,13 +1043,13 @@ static int cciss_release(struct gendisk *disk, fmode_t mode)
ctlr_info_t *h;
drive_info_struct *drv;
- lock_kernel();
+ mutex_lock(&cciss_mutex);
h = get_host(disk);
drv = get_drv(disk);
dev_dbg(&h->pdev->dev, "cciss_release %s\n", disk->disk_name);
drv->usage_count--;
h->usage_count--;
- unlock_kernel();
+ mutex_unlock(&cciss_mutex);
return 0;
}
@@ -1088,9 +1057,9 @@ static int do_ioctl(struct block_device *bdev, fmode_t mode,
unsigned cmd, unsigned long arg)
{
int ret;
- lock_kernel();
+ mutex_lock(&cciss_mutex);
ret = cciss_ioctl(bdev, mode, cmd, arg);
- unlock_kernel();
+ mutex_unlock(&cciss_mutex);
return ret;
}
@@ -1182,6 +1151,7 @@ static int cciss_ioctl32_big_passthru(struct block_device *bdev, fmode_t mode,
int err;
u32 cp;
+ memset(&arg64, 0, sizeof(arg64));
err = 0;
err |=
copy_from_user(&arg64.LUN_info, &arg32->LUN_info,
@@ -1232,470 +1202,452 @@ static void check_ioctl_unit_attention(ctlr_info_t *h, CommandList_struct *c)
c->err_info->ScsiStatus != SAM_STAT_CHECK_CONDITION)
(void)check_for_unit_attention(h, c);
}
-/*
- * ioctl
- */
-static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
+
+static int cciss_getpciinfo(ctlr_info_t *h, void __user *argp)
{
- struct gendisk *disk = bdev->bd_disk;
- ctlr_info_t *h = get_host(disk);
- drive_info_struct *drv = get_drv(disk);
- void __user *argp = (void __user *)arg;
+ cciss_pci_info_struct pciinfo;
- dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n",
- cmd, arg);
- switch (cmd) {
- case CCISS_GETPCIINFO:
- {
- cciss_pci_info_struct pciinfo;
-
- if (!arg)
- return -EINVAL;
- pciinfo.domain = pci_domain_nr(h->pdev->bus);
- pciinfo.bus = h->pdev->bus->number;
- pciinfo.dev_fn = h->pdev->devfn;
- pciinfo.board_id = h->board_id;
- if (copy_to_user
- (argp, &pciinfo, sizeof(cciss_pci_info_struct)))
- return -EFAULT;
- return 0;
- }
- case CCISS_GETINTINFO:
- {
- cciss_coalint_struct intinfo;
- if (!arg)
- return -EINVAL;
- intinfo.delay =
- readl(&h->cfgtable->HostWrite.CoalIntDelay);
- intinfo.count =
- readl(&h->cfgtable->HostWrite.CoalIntCount);
- if (copy_to_user
- (argp, &intinfo, sizeof(cciss_coalint_struct)))
- return -EFAULT;
- return 0;
- }
- case CCISS_SETINTINFO:
- {
- cciss_coalint_struct intinfo;
- unsigned long flags;
- int i;
-
- if (!arg)
- return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
- if (copy_from_user
- (&intinfo, argp, sizeof(cciss_coalint_struct)))
- return -EFAULT;
- if ((intinfo.delay == 0) && (intinfo.count == 0))
- return -EINVAL;
- spin_lock_irqsave(&h->lock, flags);
- /* Update the field, and then ring the doorbell */
- writel(intinfo.delay,
- &(h->cfgtable->HostWrite.CoalIntDelay));
- writel(intinfo.count,
- &(h->cfgtable->HostWrite.CoalIntCount));
- writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
-
- for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
- if (!(readl(h->vaddr + SA5_DOORBELL)
- & CFGTBL_ChangeReq))
- break;
- /* delay and try again */
- udelay(1000);
- }
- spin_unlock_irqrestore(&h->lock, flags);
- if (i >= MAX_IOCTL_CONFIG_WAIT)
- return -EAGAIN;
- return 0;
- }
- case CCISS_GETNODENAME:
- {
- NodeName_type NodeName;
- int i;
-
- if (!arg)
- return -EINVAL;
- for (i = 0; i < 16; i++)
- NodeName[i] =
- readb(&h->cfgtable->ServerName[i]);
- if (copy_to_user(argp, NodeName, sizeof(NodeName_type)))
- return -EFAULT;
- return 0;
- }
- case CCISS_SETNODENAME:
- {
- NodeName_type NodeName;
- unsigned long flags;
- int i;
+ if (!argp)
+ return -EINVAL;
+ pciinfo.domain = pci_domain_nr(h->pdev->bus);
+ pciinfo.bus = h->pdev->bus->number;
+ pciinfo.dev_fn = h->pdev->devfn;
+ pciinfo.board_id = h->board_id;
+ if (copy_to_user(argp, &pciinfo, sizeof(cciss_pci_info_struct)))
+ return -EFAULT;
+ return 0;
+}
- if (!arg)
- return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
- return -EPERM;
+static int cciss_getintinfo(ctlr_info_t *h, void __user *argp)
+{
+ cciss_coalint_struct intinfo;
- if (copy_from_user
- (NodeName, argp, sizeof(NodeName_type)))
- return -EFAULT;
+ if (!argp)
+ return -EINVAL;
+ intinfo.delay = readl(&h->cfgtable->HostWrite.CoalIntDelay);
+ intinfo.count = readl(&h->cfgtable->HostWrite.CoalIntCount);
+ if (copy_to_user
+ (argp, &intinfo, sizeof(cciss_coalint_struct)))
+ return -EFAULT;
+ return 0;
+}
- spin_lock_irqsave(&h->lock, flags);
+static int cciss_setintinfo(ctlr_info_t *h, void __user *argp)
+{
+ cciss_coalint_struct intinfo;
+ unsigned long flags;
+ int i;
- /* Update the field, and then ring the doorbell */
- for (i = 0; i < 16; i++)
- writeb(NodeName[i],
- &h->cfgtable->ServerName[i]);
+ if (!argp)
+ return -EINVAL;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(&intinfo, argp, sizeof(intinfo)))
+ return -EFAULT;
+ if ((intinfo.delay == 0) && (intinfo.count == 0))
+ return -EINVAL;
+ spin_lock_irqsave(&h->lock, flags);
+ /* Update the field, and then ring the doorbell */
+ writel(intinfo.delay, &(h->cfgtable->HostWrite.CoalIntDelay));
+ writel(intinfo.count, &(h->cfgtable->HostWrite.CoalIntCount));
+ writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
- writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
+ for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
+ if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
+ break;
+ udelay(1000); /* delay and try again */
+ }
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (i >= MAX_IOCTL_CONFIG_WAIT)
+ return -EAGAIN;
+ return 0;
+}
- for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
- if (!(readl(h->vaddr + SA5_DOORBELL)
- & CFGTBL_ChangeReq))
- break;
- /* delay and try again */
- udelay(1000);
- }
- spin_unlock_irqrestore(&h->lock, flags);
- if (i >= MAX_IOCTL_CONFIG_WAIT)
- return -EAGAIN;
- return 0;
- }
+static int cciss_getnodename(ctlr_info_t *h, void __user *argp)
+{
+ NodeName_type NodeName;
+ int i;
- case CCISS_GETHEARTBEAT:
- {
- Heartbeat_type heartbeat;
-
- if (!arg)
- return -EINVAL;
- heartbeat = readl(&h->cfgtable->HeartBeat);
- if (copy_to_user
- (argp, &heartbeat, sizeof(Heartbeat_type)))
- return -EFAULT;
- return 0;
- }
- case CCISS_GETBUSTYPES:
- {
- BusTypes_type BusTypes;
-
- if (!arg)
- return -EINVAL;
- BusTypes = readl(&h->cfgtable->BusTypes);
- if (copy_to_user
- (argp, &BusTypes, sizeof(BusTypes_type)))
- return -EFAULT;
- return 0;
- }
- case CCISS_GETFIRMVER:
- {
- FirmwareVer_type firmware;
+ if (!argp)
+ return -EINVAL;
+ for (i = 0; i < 16; i++)
+ NodeName[i] = readb(&h->cfgtable->ServerName[i]);
+ if (copy_to_user(argp, NodeName, sizeof(NodeName_type)))
+ return -EFAULT;
+ return 0;
+}
- if (!arg)
- return -EINVAL;
- memcpy(firmware, h->firm_ver, 4);
+static int cciss_setnodename(ctlr_info_t *h, void __user *argp)
+{
+ NodeName_type NodeName;
+ unsigned long flags;
+ int i;
- if (copy_to_user
- (argp, firmware, sizeof(FirmwareVer_type)))
- return -EFAULT;
- return 0;
- }
- case CCISS_GETDRIVVER:
- {
- DriverVer_type DriverVer = DRIVER_VERSION;
+ if (!argp)
+ return -EINVAL;
+ if (!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+ if (copy_from_user(NodeName, argp, sizeof(NodeName_type)))
+ return -EFAULT;
+ spin_lock_irqsave(&h->lock, flags);
+ /* Update the field, and then ring the doorbell */
+ for (i = 0; i < 16; i++)
+ writeb(NodeName[i], &h->cfgtable->ServerName[i]);
+ writel(CFGTBL_ChangeReq, h->vaddr + SA5_DOORBELL);
+ for (i = 0; i < MAX_IOCTL_CONFIG_WAIT; i++) {
+ if (!(readl(h->vaddr + SA5_DOORBELL) & CFGTBL_ChangeReq))
+ break;
+ udelay(1000); /* delay and try again */
+ }
+ spin_unlock_irqrestore(&h->lock, flags);
+ if (i >= MAX_IOCTL_CONFIG_WAIT)
+ return -EAGAIN;
+ return 0;
+}
- if (!arg)
- return -EINVAL;
+static int cciss_getheartbeat(ctlr_info_t *h, void __user *argp)
+{
+ Heartbeat_type heartbeat;
- if (copy_to_user
- (argp, &DriverVer, sizeof(DriverVer_type)))
- return -EFAULT;
- return 0;
- }
+ if (!argp)
+ return -EINVAL;
+ heartbeat = readl(&h->cfgtable->HeartBeat);
+ if (copy_to_user(argp, &heartbeat, sizeof(Heartbeat_type)))
+ return -EFAULT;
+ return 0;
+}
- case CCISS_DEREGDISK:
- case CCISS_REGNEWD:
- case CCISS_REVALIDVOLS:
- return rebuild_lun_table(h, 0, 1);
+static int cciss_getbustypes(ctlr_info_t *h, void __user *argp)
+{
+ BusTypes_type BusTypes;
+
+ if (!argp)
+ return -EINVAL;
+ BusTypes = readl(&h->cfgtable->BusTypes);
+ if (copy_to_user(argp, &BusTypes, sizeof(BusTypes_type)))
+ return -EFAULT;
+ return 0;
+}
- case CCISS_GETLUNINFO:{
- LogvolInfo_struct luninfo;
+static int cciss_getfirmver(ctlr_info_t *h, void __user *argp)
+{
+ FirmwareVer_type firmware;
- memcpy(&luninfo.LunID, drv->LunID,
- sizeof(luninfo.LunID));
- luninfo.num_opens = drv->usage_count;
- luninfo.num_parts = 0;
- if (copy_to_user(argp, &luninfo,
- sizeof(LogvolInfo_struct)))
- return -EFAULT;
- return 0;
+ if (!argp)
+ return -EINVAL;
+ memcpy(firmware, h->firm_ver, 4);
+
+ if (copy_to_user
+ (argp, firmware, sizeof(FirmwareVer_type)))
+ return -EFAULT;
+ return 0;
+}
+
+static int cciss_getdrivver(ctlr_info_t *h, void __user *argp)
+{
+ DriverVer_type DriverVer = DRIVER_VERSION;
+
+ if (!argp)
+ return -EINVAL;
+ if (copy_to_user(argp, &DriverVer, sizeof(DriverVer_type)))
+ return -EFAULT;
+ return 0;
+}
+
+static int cciss_getluninfo(ctlr_info_t *h,
+ struct gendisk *disk, void __user *argp)
+{
+ LogvolInfo_struct luninfo;
+ drive_info_struct *drv = get_drv(disk);
+
+ if (!argp)
+ return -EINVAL;
+ memcpy(&luninfo.LunID, drv->LunID, sizeof(luninfo.LunID));
+ luninfo.num_opens = drv->usage_count;
+ luninfo.num_parts = 0;
+ if (copy_to_user(argp, &luninfo, sizeof(LogvolInfo_struct)))
+ return -EFAULT;
+ return 0;
+}
+
+static int cciss_passthru(ctlr_info_t *h, void __user *argp)
+{
+ IOCTL_Command_struct iocommand;
+ CommandList_struct *c;
+ char *buff = NULL;
+ u64bit temp64;
+ DECLARE_COMPLETION_ONSTACK(wait);
+
+ if (!argp)
+ return -EINVAL;
+
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+
+ if (copy_from_user
+ (&iocommand, argp, sizeof(IOCTL_Command_struct)))
+ return -EFAULT;
+ if ((iocommand.buf_size < 1) &&
+ (iocommand.Request.Type.Direction != XFER_NONE)) {
+ return -EINVAL;
+ }
+ if (iocommand.buf_size > 0) {
+ buff = kmalloc(iocommand.buf_size, GFP_KERNEL);
+ if (buff == NULL)
+ return -EFAULT;
+ }
+ if (iocommand.Request.Type.Direction == XFER_WRITE) {
+ /* Copy the data into the buffer we created */
+ if (copy_from_user(buff, iocommand.buf, iocommand.buf_size)) {
+ kfree(buff);
+ return -EFAULT;
}
- case CCISS_PASSTHRU:
- {
- IOCTL_Command_struct iocommand;
- CommandList_struct *c;
- char *buff = NULL;
- u64bit temp64;
- DECLARE_COMPLETION_ONSTACK(wait);
-
- if (!arg)
- return -EINVAL;
-
- if (!capable(CAP_SYS_RAWIO))
- return -EPERM;
-
- if (copy_from_user
- (&iocommand, argp, sizeof(IOCTL_Command_struct)))
- return -EFAULT;
- if ((iocommand.buf_size < 1) &&
- (iocommand.Request.Type.Direction != XFER_NONE)) {
- return -EINVAL;
- }
-#if 0 /* 'buf_size' member is 16-bits, and always smaller than kmalloc limit */
- /* Check kmalloc limits */
- if (iocommand.buf_size > 128000)
- return -EINVAL;
-#endif
- if (iocommand.buf_size > 0) {
- buff = kmalloc(iocommand.buf_size, GFP_KERNEL);
- if (buff == NULL)
- return -EFAULT;
- }
- if (iocommand.Request.Type.Direction == XFER_WRITE) {
- /* Copy the data into the buffer we created */
- if (copy_from_user
- (buff, iocommand.buf, iocommand.buf_size)) {
- kfree(buff);
- return -EFAULT;
- }
- } else {
- memset(buff, 0, iocommand.buf_size);
- }
- c = cmd_special_alloc(h);
- if (!c) {
- kfree(buff);
- return -ENOMEM;
- }
- /* Fill in the command type */
- c->cmd_type = CMD_IOCTL_PEND;
- /* Fill in Command Header */
- c->Header.ReplyQueue = 0; /* unused in simple mode */
- if (iocommand.buf_size > 0) /* buffer to fill */
- {
- c->Header.SGList = 1;
- c->Header.SGTotal = 1;
- } else /* no buffers to fill */
- {
- c->Header.SGList = 0;
- c->Header.SGTotal = 0;
- }
- c->Header.LUN = iocommand.LUN_info;
- /* use the kernel address the cmd block for tag */
- c->Header.Tag.lower = c->busaddr;
-
- /* Fill in Request block */
- c->Request = iocommand.Request;
-
- /* Fill in the scatter gather information */
- if (iocommand.buf_size > 0) {
- temp64.val = pci_map_single(h->pdev, buff,
- iocommand.buf_size,
- PCI_DMA_BIDIRECTIONAL);
- c->SG[0].Addr.lower = temp64.val32.lower;
- c->SG[0].Addr.upper = temp64.val32.upper;
- c->SG[0].Len = iocommand.buf_size;
- c->SG[0].Ext = 0; /* we are not chaining */
- }
- c->waiting = &wait;
+ } else {
+ memset(buff, 0, iocommand.buf_size);
+ }
+ c = cmd_special_alloc(h);
+ if (!c) {
+ kfree(buff);
+ return -ENOMEM;
+ }
+ /* Fill in the command type */
+ c->cmd_type = CMD_IOCTL_PEND;
+ /* Fill in Command Header */
+ c->Header.ReplyQueue = 0; /* unused in simple mode */
+ if (iocommand.buf_size > 0) { /* buffer to fill */
+ c->Header.SGList = 1;
+ c->Header.SGTotal = 1;
+ } else { /* no buffers to fill */
+ c->Header.SGList = 0;
+ c->Header.SGTotal = 0;
+ }
+ c->Header.LUN = iocommand.LUN_info;
+ /* use the kernel address the cmd block for tag */
+ c->Header.Tag.lower = c->busaddr;
- enqueue_cmd_and_start_io(h, c);
- wait_for_completion(&wait);
+ /* Fill in Request block */
+ c->Request = iocommand.Request;
- /* unlock the buffers from DMA */
- temp64.val32.lower = c->SG[0].Addr.lower;
- temp64.val32.upper = c->SG[0].Addr.upper;
- pci_unmap_single(h->pdev, (dma_addr_t) temp64.val,
- iocommand.buf_size,
- PCI_DMA_BIDIRECTIONAL);
+ /* Fill in the scatter gather information */
+ if (iocommand.buf_size > 0) {
+ temp64.val = pci_map_single(h->pdev, buff,
+ iocommand.buf_size, PCI_DMA_BIDIRECTIONAL);
+ c->SG[0].Addr.lower = temp64.val32.lower;
+ c->SG[0].Addr.upper = temp64.val32.upper;
+ c->SG[0].Len = iocommand.buf_size;
+ c->SG[0].Ext = 0; /* we are not chaining */
+ }
+ c->waiting = &wait;
- check_ioctl_unit_attention(h, c);
+ enqueue_cmd_and_start_io(h, c);
+ wait_for_completion(&wait);
- /* Copy the error information out */
- iocommand.error_info = *(c->err_info);
- if (copy_to_user
- (argp, &iocommand, sizeof(IOCTL_Command_struct))) {
- kfree(buff);
- cmd_special_free(h, c);
- return -EFAULT;
- }
+ /* unlock the buffers from DMA */
+ temp64.val32.lower = c->SG[0].Addr.lower;
+ temp64.val32.upper = c->SG[0].Addr.upper;
+ pci_unmap_single(h->pdev, (dma_addr_t) temp64.val, iocommand.buf_size,
+ PCI_DMA_BIDIRECTIONAL);
+ check_ioctl_unit_attention(h, c);
+
+ /* Copy the error information out */
+ iocommand.error_info = *(c->err_info);
+ if (copy_to_user(argp, &iocommand, sizeof(IOCTL_Command_struct))) {
+ kfree(buff);
+ cmd_special_free(h, c);
+ return -EFAULT;
+ }
- if (iocommand.Request.Type.Direction == XFER_READ) {
- /* Copy the data out of the buffer we created */
- if (copy_to_user
- (iocommand.buf, buff, iocommand.buf_size)) {
- kfree(buff);
- cmd_special_free(h, c);
- return -EFAULT;
- }
- }
+ if (iocommand.Request.Type.Direction == XFER_READ) {
+ /* Copy the data out of the buffer we created */
+ if (copy_to_user(iocommand.buf, buff, iocommand.buf_size)) {
kfree(buff);
cmd_special_free(h, c);
- return 0;
+ return -EFAULT;
}
- case CCISS_BIG_PASSTHRU:{
- BIG_IOCTL_Command_struct *ioc;
- CommandList_struct *c;
- unsigned char **buff = NULL;
- int *buff_size = NULL;
- u64bit temp64;
- BYTE sg_used = 0;
- int status = 0;
- int i;
- DECLARE_COMPLETION_ONSTACK(wait);
- __u32 left;
- __u32 sz;
- BYTE __user *data_ptr;
-
- if (!arg)
- return -EINVAL;
- if (!capable(CAP_SYS_RAWIO))
- return -EPERM;
- ioc = (BIG_IOCTL_Command_struct *)
- kmalloc(sizeof(*ioc), GFP_KERNEL);
- if (!ioc) {
- status = -ENOMEM;
- goto cleanup1;
- }
- if (copy_from_user(ioc, argp, sizeof(*ioc))) {
+ }
+ kfree(buff);
+ cmd_special_free(h, c);
+ return 0;
+}
+
+static int cciss_bigpassthru(ctlr_info_t *h, void __user *argp)
+{
+ BIG_IOCTL_Command_struct *ioc;
+ CommandList_struct *c;
+ unsigned char **buff = NULL;
+ int *buff_size = NULL;
+ u64bit temp64;
+ BYTE sg_used = 0;
+ int status = 0;
+ int i;
+ DECLARE_COMPLETION_ONSTACK(wait);
+ __u32 left;
+ __u32 sz;
+ BYTE __user *data_ptr;
+
+ if (!argp)
+ return -EINVAL;
+ if (!capable(CAP_SYS_RAWIO))
+ return -EPERM;
+ ioc = (BIG_IOCTL_Command_struct *)
+ kmalloc(sizeof(*ioc), GFP_KERNEL);
+ if (!ioc) {
+ status = -ENOMEM;
+ goto cleanup1;
+ }
+ if (copy_from_user(ioc, argp, sizeof(*ioc))) {
+ status = -EFAULT;
+ goto cleanup1;
+ }
+ if ((ioc->buf_size < 1) &&
+ (ioc->Request.Type.Direction != XFER_NONE)) {
+ status = -EINVAL;
+ goto cleanup1;
+ }
+ /* Check kmalloc limits using all SGs */
+ if (ioc->malloc_size > MAX_KMALLOC_SIZE) {
+ status = -EINVAL;
+ goto cleanup1;
+ }
+ if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) {
+ status = -EINVAL;
+ goto cleanup1;
+ }
+ buff = kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL);
+ if (!buff) {
+ status = -ENOMEM;
+ goto cleanup1;
+ }
+ buff_size = kmalloc(MAXSGENTRIES * sizeof(int), GFP_KERNEL);
+ if (!buff_size) {
+ status = -ENOMEM;
+ goto cleanup1;
+ }
+ left = ioc->buf_size;
+ data_ptr = ioc->buf;
+ while (left) {
+ sz = (left > ioc->malloc_size) ? ioc->malloc_size : left;
+ buff_size[sg_used] = sz;
+ buff[sg_used] = kmalloc(sz, GFP_KERNEL);
+ if (buff[sg_used] == NULL) {
+ status = -ENOMEM;
+ goto cleanup1;
+ }
+ if (ioc->Request.Type.Direction == XFER_WRITE) {
+ if (copy_from_user(buff[sg_used], data_ptr, sz)) {
status = -EFAULT;
goto cleanup1;
}
- if ((ioc->buf_size < 1) &&
- (ioc->Request.Type.Direction != XFER_NONE)) {
- status = -EINVAL;
- goto cleanup1;
- }
- /* Check kmalloc limits using all SGs */
- if (ioc->malloc_size > MAX_KMALLOC_SIZE) {
- status = -EINVAL;
- goto cleanup1;
- }
- if (ioc->buf_size > ioc->malloc_size * MAXSGENTRIES) {
- status = -EINVAL;
- goto cleanup1;
- }
- buff =
- kzalloc(MAXSGENTRIES * sizeof(char *), GFP_KERNEL);
- if (!buff) {
- status = -ENOMEM;
- goto cleanup1;
- }
- buff_size = kmalloc(MAXSGENTRIES * sizeof(int),
- GFP_KERNEL);
- if (!buff_size) {
- status = -ENOMEM;
- goto cleanup1;
- }
- left = ioc->buf_size;
- data_ptr = ioc->buf;
- while (left) {
- sz = (left >
- ioc->malloc_size) ? ioc->
- malloc_size : left;
- buff_size[sg_used] = sz;
- buff[sg_used] = kmalloc(sz, GFP_KERNEL);
- if (buff[sg_used] == NULL) {
- status = -ENOMEM;
- goto cleanup1;
- }
- if (ioc->Request.Type.Direction == XFER_WRITE) {
- if (copy_from_user
- (buff[sg_used], data_ptr, sz)) {
- status = -EFAULT;
- goto cleanup1;
- }
- } else {
- memset(buff[sg_used], 0, sz);
- }
- left -= sz;
- data_ptr += sz;
- sg_used++;
- }
- c = cmd_special_alloc(h);
- if (!c) {
- status = -ENOMEM;
- goto cleanup1;
- }
- c->cmd_type = CMD_IOCTL_PEND;
- c->Header.ReplyQueue = 0;
+ } else {
+ memset(buff[sg_used], 0, sz);
+ }
+ left -= sz;
+ data_ptr += sz;
+ sg_used++;
+ }
+ c = cmd_special_alloc(h);
+ if (!c) {
+ status = -ENOMEM;
+ goto cleanup1;
+ }
+ c->cmd_type = CMD_IOCTL_PEND;
+ c->Header.ReplyQueue = 0;
+ c->Header.SGList = sg_used;
+ c->Header.SGTotal = sg_used;
+ c->Header.LUN = ioc->LUN_info;
+ c->Header.Tag.lower = c->busaddr;
- if (ioc->buf_size > 0) {
- c->Header.SGList = sg_used;
- c->Header.SGTotal = sg_used;
- } else {
- c->Header.SGList = 0;
- c->Header.SGTotal = 0;
- }
- c->Header.LUN = ioc->LUN_info;
- c->Header.Tag.lower = c->busaddr;
-
- c->Request = ioc->Request;
- if (ioc->buf_size > 0) {
- for (i = 0; i < sg_used; i++) {
- temp64.val =
- pci_map_single(h->pdev, buff[i],
- buff_size[i],
- PCI_DMA_BIDIRECTIONAL);
- c->SG[i].Addr.lower =
- temp64.val32.lower;
- c->SG[i].Addr.upper =
- temp64.val32.upper;
- c->SG[i].Len = buff_size[i];
- c->SG[i].Ext = 0; /* we are not chaining */
- }
- }
- c->waiting = &wait;
- enqueue_cmd_and_start_io(h, c);
- wait_for_completion(&wait);
- /* unlock the buffers from DMA */
- for (i = 0; i < sg_used; i++) {
- temp64.val32.lower = c->SG[i].Addr.lower;
- temp64.val32.upper = c->SG[i].Addr.upper;
- pci_unmap_single(h->pdev,
- (dma_addr_t) temp64.val, buff_size[i],
- PCI_DMA_BIDIRECTIONAL);
- }
- check_ioctl_unit_attention(h, c);
- /* Copy the error information out */
- ioc->error_info = *(c->err_info);
- if (copy_to_user(argp, ioc, sizeof(*ioc))) {
+ c->Request = ioc->Request;
+ for (i = 0; i < sg_used; i++) {
+ temp64.val = pci_map_single(h->pdev, buff[i], buff_size[i],
+ PCI_DMA_BIDIRECTIONAL);
+ c->SG[i].Addr.lower = temp64.val32.lower;
+ c->SG[i].Addr.upper = temp64.val32.upper;
+ c->SG[i].Len = buff_size[i];
+ c->SG[i].Ext = 0; /* we are not chaining */
+ }
+ c->waiting = &wait;
+ enqueue_cmd_and_start_io(h, c);
+ wait_for_completion(&wait);
+ /* unlock the buffers from DMA */
+ for (i = 0; i < sg_used; i++) {
+ temp64.val32.lower = c->SG[i].Addr.lower;
+ temp64.val32.upper = c->SG[i].Addr.upper;
+ pci_unmap_single(h->pdev,
+ (dma_addr_t) temp64.val, buff_size[i],
+ PCI_DMA_BIDIRECTIONAL);
+ }
+ check_ioctl_unit_attention(h, c);
+ /* Copy the error information out */
+ ioc->error_info = *(c->err_info);
+ if (copy_to_user(argp, ioc, sizeof(*ioc))) {
+ cmd_special_free(h, c);
+ status = -EFAULT;
+ goto cleanup1;
+ }
+ if (ioc->Request.Type.Direction == XFER_READ) {
+ /* Copy the data out of the buffer we created */
+ BYTE __user *ptr = ioc->buf;
+ for (i = 0; i < sg_used; i++) {
+ if (copy_to_user(ptr, buff[i], buff_size[i])) {
cmd_special_free(h, c);
status = -EFAULT;
goto cleanup1;
}
- if (ioc->Request.Type.Direction == XFER_READ) {
- /* Copy the data out of the buffer we created */
- BYTE __user *ptr = ioc->buf;
- for (i = 0; i < sg_used; i++) {
- if (copy_to_user
- (ptr, buff[i], buff_size[i])) {
- cmd_special_free(h, c);
- status = -EFAULT;
- goto cleanup1;
- }
- ptr += buff_size[i];
- }
- }
- cmd_special_free(h, c);
- status = 0;
- cleanup1:
- if (buff) {
- for (i = 0; i < sg_used; i++)
- kfree(buff[i]);
- kfree(buff);
- }
- kfree(buff_size);
- kfree(ioc);
- return status;
+ ptr += buff_size[i];
}
+ }
+ cmd_special_free(h, c);
+ status = 0;
+cleanup1:
+ if (buff) {
+ for (i = 0; i < sg_used; i++)
+ kfree(buff[i]);
+ kfree(buff);
+ }
+ kfree(buff_size);
+ kfree(ioc);
+ return status;
+}
+
+static int cciss_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ ctlr_info_t *h = get_host(disk);
+ void __user *argp = (void __user *)arg;
+
+ dev_dbg(&h->pdev->dev, "cciss_ioctl: Called with cmd=%x %lx\n",
+ cmd, arg);
+ switch (cmd) {
+ case CCISS_GETPCIINFO:
+ return cciss_getpciinfo(h, argp);
+ case CCISS_GETINTINFO:
+ return cciss_getintinfo(h, argp);
+ case CCISS_SETINTINFO:
+ return cciss_setintinfo(h, argp);
+ case CCISS_GETNODENAME:
+ return cciss_getnodename(h, argp);
+ case CCISS_SETNODENAME:
+ return cciss_setnodename(h, argp);
+ case CCISS_GETHEARTBEAT:
+ return cciss_getheartbeat(h, argp);
+ case CCISS_GETBUSTYPES:
+ return cciss_getbustypes(h, argp);
+ case CCISS_GETFIRMVER:
+ return cciss_getfirmver(h, argp);
+ case CCISS_GETDRIVVER:
+ return cciss_getdrivver(h, argp);
+ case CCISS_DEREGDISK:
+ case CCISS_REGNEWD:
+ case CCISS_REVALIDVOLS:
+ return rebuild_lun_table(h, 0, 1);
+ case CCISS_GETLUNINFO:
+ return cciss_getluninfo(h, disk, argp);
+ case CCISS_PASSTHRU:
+ return cciss_passthru(h, argp);
+ case CCISS_BIG_PASSTHRU:
+ return cciss_bigpassthru(h, argp);
/* scsi_cmd_ioctl handles these, below, though some are not */
/* very meaningful for cciss. SG_IO is the main one people want. */
@@ -3986,9 +3938,6 @@ static int __devinit cciss_lookup_board_id(struct pci_dev *pdev, u32 *board_id)
subsystem_vendor_id;
for (i = 0; i < ARRAY_SIZE(products); i++) {
- /* Stand aside for hpsa driver on request */
- if (cciss_allow_hpsa && products[i].board_id == HPSA_BOUNDARY)
- return -ENODEV;
if (*board_id == products[i].board_id)
return i;
}
diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c
index d53b0291c44b..946dad4caef3 100644
--- a/drivers/block/cpqarray.c
+++ b/drivers/block/cpqarray.c
@@ -35,7 +35,7 @@
#include <linux/seq_file.h>
#include <linux/init.h>
#include <linux/hdreg.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
@@ -68,6 +68,7 @@ MODULE_LICENSE("GPL");
#define CPQARRAY_DMA_MASK 0xFFFFFFFF /* 32 bit DMA */
+static DEFINE_MUTEX(cpqarray_mutex);
static int nr_ctlr;
static ctlr_info_t *hba[MAX_CTLR];
@@ -845,9 +846,9 @@ static int ida_unlocked_open(struct block_device *bdev, fmode_t mode)
{
int ret;
- lock_kernel();
+ mutex_lock(&cpqarray_mutex);
ret = ida_open(bdev, mode);
- unlock_kernel();
+ mutex_unlock(&cpqarray_mutex);
return ret;
}
@@ -859,10 +860,10 @@ static int ida_release(struct gendisk *disk, fmode_t mode)
{
ctlr_info_t *host;
- lock_kernel();
+ mutex_lock(&cpqarray_mutex);
host = get_host(disk);
host->usage_count--;
- unlock_kernel();
+ mutex_unlock(&cpqarray_mutex);
return 0;
}
@@ -1217,9 +1218,9 @@ static int ida_ioctl(struct block_device *bdev, fmode_t mode,
{
int ret;
- lock_kernel();
+ mutex_lock(&cpqarray_mutex);
ret = ida_locked_ioctl(bdev, mode, cmd, param);
- unlock_kernel();
+ mutex_unlock(&cpqarray_mutex);
return ret;
}
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 9400845d602e..ac04ef97eac2 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -965,29 +965,30 @@ void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector, int size,
* ok, (capacity & 7) != 0 sometimes, but who cares...
* we count rs_{total,left} in bits, not sectors.
*/
- spin_lock_irqsave(&mdev->al_lock, flags);
count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
- if (count) {
- /* we need the lock for drbd_try_clear_on_disk_bm */
- if (jiffies - mdev->rs_mark_time > HZ*10) {
- /* should be rolling marks,
- * but we estimate only anyways. */
- if (mdev->rs_mark_left != drbd_bm_total_weight(mdev) &&
+ if (count && get_ldev(mdev)) {
+ unsigned long now = jiffies;
+ unsigned long last = mdev->rs_mark_time[mdev->rs_last_mark];
+ int next = (mdev->rs_last_mark + 1) % DRBD_SYNC_MARKS;
+ if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
+ unsigned long tw = drbd_bm_total_weight(mdev);
+ if (mdev->rs_mark_left[mdev->rs_last_mark] != tw &&
mdev->state.conn != C_PAUSED_SYNC_T &&
mdev->state.conn != C_PAUSED_SYNC_S) {
- mdev->rs_mark_time = jiffies;
- mdev->rs_mark_left = drbd_bm_total_weight(mdev);
+ mdev->rs_mark_time[next] = now;
+ mdev->rs_mark_left[next] = tw;
+ mdev->rs_last_mark = next;
}
}
- if (get_ldev(mdev)) {
- drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
- put_ldev(mdev);
- }
+ spin_lock_irqsave(&mdev->al_lock, flags);
+ drbd_try_clear_on_disk_bm(mdev, sector, count, TRUE);
+ spin_unlock_irqrestore(&mdev->al_lock, flags);
+
/* just wake_up unconditional now, various lc_chaged(),
* lc_put() in drbd_try_clear_on_disk_bm(). */
wake_up = 1;
+ put_ldev(mdev);
}
- spin_unlock_irqrestore(&mdev->al_lock, flags);
if (wake_up)
wake_up(&mdev->al_wait);
}
@@ -1118,7 +1119,7 @@ static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
* @mdev: DRBD device.
* @sector: The sector number.
*
- * This functions sleeps on al_wait. Returns 1 on success, 0 if interrupted.
+ * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
*/
int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
{
@@ -1129,10 +1130,10 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
sig = wait_event_interruptible(mdev->al_wait,
(bm_ext = _bme_get(mdev, enr)));
if (sig)
- return 0;
+ return -EINTR;
if (test_bit(BME_LOCKED, &bm_ext->flags))
- return 1;
+ return 0;
for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
sig = wait_event_interruptible(mdev->al_wait,
@@ -1145,13 +1146,11 @@ int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector)
wake_up(&mdev->al_wait);
}
spin_unlock_irq(&mdev->al_lock);
- return 0;
+ return -EINTR;
}
}
-
set_bit(BME_LOCKED, &bm_ext->flags);
-
- return 1;
+ return 0;
}
/**
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index e3f88d6e1412..fd42832f785b 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -569,7 +569,7 @@ int drbd_bm_resize(struct drbd_conf *mdev, sector_t capacity, int set_new_bits)
*
* maybe bm_set should be atomic_t ?
*/
-static unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
+unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
unsigned long s;
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 352441b0f92f..9bdcf4393c0a 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -337,13 +337,25 @@ static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
* NOTE that the payload starts at a long aligned offset,
* regardless of 32 or 64 bit arch!
*/
-struct p_header {
+struct p_header80 {
u32 magic;
u16 command;
u16 length; /* bytes of data after this header */
u8 payload[0];
} __packed;
-/* 8 bytes. packet FIXED for the next century! */
+
+/* Header for big packets, Used for data packets exceeding 64kB */
+struct p_header95 {
+ u16 magic; /* use DRBD_MAGIC_BIG here */
+ u16 command;
+ u32 length; /* Use only 24 bits of that. Ignore the highest 8 bit. */
+ u8 payload[0];
+} __packed;
+
+union p_header {
+ struct p_header80 h80;
+ struct p_header95 h95;
+};
/*
* short commands, packets without payload, plain p_header:
@@ -362,12 +374,16 @@ struct p_header {
*/
/* these defines must not be changed without changing the protocol version */
-#define DP_HARDBARRIER 1
-#define DP_RW_SYNC 2
+#define DP_HARDBARRIER 1 /* depricated */
+#define DP_RW_SYNC 2 /* equals REQ_SYNC */
#define DP_MAY_SET_IN_SYNC 4
+#define DP_UNPLUG 8 /* equals REQ_UNPLUG */
+#define DP_FUA 16 /* equals REQ_FUA */
+#define DP_FLUSH 32 /* equals REQ_FLUSH */
+#define DP_DISCARD 64 /* equals REQ_DISCARD */
struct p_data {
- struct p_header head;
+ union p_header head;
u64 sector; /* 64 bits sector number */
u64 block_id; /* to identify the request in protocol B&C */
u32 seq_num;
@@ -383,7 +399,7 @@ struct p_data {
* P_DATA_REQUEST, P_RS_DATA_REQUEST
*/
struct p_block_ack {
- struct p_header head;
+ struct p_header80 head;
u64 sector;
u64 block_id;
u32 blksize;
@@ -392,7 +408,7 @@ struct p_block_ack {
struct p_block_req {
- struct p_header head;
+ struct p_header80 head;
u64 sector;
u64 block_id;
u32 blksize;
@@ -409,7 +425,7 @@ struct p_block_req {
*/
struct p_handshake {
- struct p_header head; /* 8 bytes */
+ struct p_header80 head; /* 8 bytes */
u32 protocol_min;
u32 feature_flags;
u32 protocol_max;
@@ -424,19 +440,19 @@ struct p_handshake {
/* 80 bytes, FIXED for the next century */
struct p_barrier {
- struct p_header head;
+ struct p_header80 head;
u32 barrier; /* barrier number _handle_ only */
u32 pad; /* to multiple of 8 Byte */
} __packed;
struct p_barrier_ack {
- struct p_header head;
+ struct p_header80 head;
u32 barrier;
u32 set_size;
} __packed;
struct p_rs_param {
- struct p_header head;
+ struct p_header80 head;
u32 rate;
/* Since protocol version 88 and higher. */
@@ -444,20 +460,31 @@ struct p_rs_param {
} __packed;
struct p_rs_param_89 {
- struct p_header head;
+ struct p_header80 head;
u32 rate;
/* protocol version 89: */
char verify_alg[SHARED_SECRET_MAX];
char csums_alg[SHARED_SECRET_MAX];
} __packed;
+struct p_rs_param_95 {
+ struct p_header80 head;
+ u32 rate;
+ char verify_alg[SHARED_SECRET_MAX];
+ char csums_alg[SHARED_SECRET_MAX];
+ u32 c_plan_ahead;
+ u32 c_delay_target;
+ u32 c_fill_target;
+ u32 c_max_rate;
+} __packed;
+
enum drbd_conn_flags {
CF_WANT_LOSE = 1,
CF_DRY_RUN = 2,
};
struct p_protocol {
- struct p_header head;
+ struct p_header80 head;
u32 protocol;
u32 after_sb_0p;
u32 after_sb_1p;
@@ -471,17 +498,17 @@ struct p_protocol {
} __packed;
struct p_uuids {
- struct p_header head;
+ struct p_header80 head;
u64 uuid[UI_EXTENDED_SIZE];
} __packed;
struct p_rs_uuid {
- struct p_header head;
+ struct p_header80 head;
u64 uuid;
} __packed;
struct p_sizes {
- struct p_header head;
+ struct p_header80 head;
u64 d_size; /* size of disk */
u64 u_size; /* user requested size */
u64 c_size; /* current exported size */
@@ -491,18 +518,18 @@ struct p_sizes {
} __packed;
struct p_state {
- struct p_header head;
+ struct p_header80 head;
u32 state;
} __packed;
struct p_req_state {
- struct p_header head;
+ struct p_header80 head;
u32 mask;
u32 val;
} __packed;
struct p_req_state_reply {
- struct p_header head;
+ struct p_header80 head;
u32 retcode;
} __packed;
@@ -517,7 +544,7 @@ struct p_drbd06_param {
} __packed;
struct p_discard {
- struct p_header head;
+ struct p_header80 head;
u64 block_id;
u32 seq_num;
u32 pad;
@@ -533,7 +560,7 @@ enum drbd_bitmap_code {
};
struct p_compressed_bm {
- struct p_header head;
+ struct p_header80 head;
/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
* (encoding & 0x80): polarity (set/unset) of first runlength
* ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
@@ -544,10 +571,10 @@ struct p_compressed_bm {
u8 code[0];
} __packed;
-struct p_delay_probe {
- struct p_header head;
- u32 seq_num; /* sequence number to match the two probe packets */
- u32 offset; /* usecs the probe got sent after the reference time point */
+struct p_delay_probe93 {
+ struct p_header80 head;
+ u32 seq_num; /* sequence number to match the two probe packets */
+ u32 offset; /* usecs the probe got sent after the reference time point */
} __packed;
/* DCBP: Drbd Compressed Bitmap Packet ... */
@@ -594,7 +621,7 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
* so we need to use the fixed size 4KiB page size
* most architechtures have used for a long time.
*/
-#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header))
+#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80))
#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
#if (PAGE_SIZE < 4096)
@@ -603,13 +630,14 @@ DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
#endif
union p_polymorph {
- struct p_header header;
+ union p_header header;
struct p_handshake handshake;
struct p_data data;
struct p_block_ack block_ack;
struct p_barrier barrier;
struct p_barrier_ack barrier_ack;
struct p_rs_param_89 rs_param_89;
+ struct p_rs_param_95 rs_param_95;
struct p_protocol protocol;
struct p_sizes sizes;
struct p_uuids uuids;
@@ -617,6 +645,8 @@ union p_polymorph {
struct p_req_state req_state;
struct p_req_state_reply req_state_reply;
struct p_block_req block_req;
+ struct p_delay_probe93 delay_probe93;
+ struct p_rs_uuid rs_uuid;
} __packed;
/**********************************************************************/
@@ -697,7 +727,7 @@ struct drbd_tl_epoch {
struct list_head requests; /* requests before */
struct drbd_tl_epoch *next; /* pointer to the next barrier */
unsigned int br_number; /* the barriers identifier. */
- int n_req; /* number of requests attached before this barrier */
+ int n_writes; /* number of requests attached before this barrier */
};
struct drbd_request;
@@ -747,7 +777,7 @@ struct digest_info {
struct drbd_epoch_entry {
struct drbd_work w;
struct hlist_node colision;
- struct drbd_epoch *epoch;
+ struct drbd_epoch *epoch; /* for writes */
struct drbd_conf *mdev;
struct page *pages;
atomic_t pending_bios;
@@ -755,7 +785,10 @@ struct drbd_epoch_entry {
/* see comments on ee flag bits below */
unsigned long flags;
sector_t sector;
- u64 block_id;
+ union {
+ u64 block_id;
+ struct digest_info *digest;
+ };
};
/* ee flag bits.
@@ -781,12 +814,16 @@ enum {
* if any of those fail, we set this flag atomically
* from the endio callback */
__EE_WAS_ERROR,
+
+ /* This ee has a pointer to a digest instead of a block id */
+ __EE_HAS_DIGEST,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
#define EE_IS_BARRIER (1<<__EE_IS_BARRIER)
#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
+#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
/* global flag bits */
enum {
@@ -794,7 +831,6 @@ enum {
SIGNAL_ASENDER, /* whether asender wants to be interrupted */
SEND_PING, /* whether asender should send a ping asap */
- STOP_SYNC_TIMER, /* tell timer to cancel itself */
UNPLUG_QUEUED, /* only relevant with kernel 2.4 */
UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
MD_DIRTY, /* current uuids and flags not yet on disk */
@@ -816,6 +852,7 @@ enum {
BITMAP_IO, /* suspend application io;
once no more io in flight, start bitmap io */
BITMAP_IO_QUEUED, /* Started bitmap IO */
+ GO_DISKLESS, /* Disk failed, local_cnt reached zero, we are going diskless */
RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
NET_CONGESTED, /* The data socket is congested */
@@ -829,6 +866,8 @@ enum {
* the peer, if it changed there as well. */
CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */
+ NEW_CUR_UUID, /* Create new current UUID when thawing IO */
+ AL_SUSPENDED, /* Activity logging is currently suspended. */
};
struct drbd_bitmap; /* opaque for drbd_conf */
@@ -838,10 +877,6 @@ struct drbd_bitmap; /* opaque for drbd_conf */
/* THINK maybe we actually want to use the default "event/%s" worker threads
* or similar in linux 2.6, which uses per cpu data and threads.
- *
- * To be general, this might need a spin_lock member.
- * For now, please use the mdev->req_lock to protect list_head,
- * see drbd_queue_work below.
*/
struct drbd_work_queue {
struct list_head q;
@@ -915,6 +950,12 @@ enum write_ordering_e {
WO_bio_barrier
};
+struct fifo_buffer {
+ int *values;
+ unsigned int head_index;
+ unsigned int size;
+};
+
struct drbd_conf {
/* things that are stored as / read from meta data on disk */
unsigned long flags;
@@ -936,9 +977,16 @@ struct drbd_conf {
unsigned int ko_count;
struct drbd_work resync_work,
unplug_work,
+ go_diskless,
md_sync_work;
struct timer_list resync_timer;
struct timer_list md_sync_timer;
+#ifdef DRBD_DEBUG_MD_SYNC
+ struct {
+ unsigned int line;
+ const char* func;
+ } last_md_mark_dirty;
+#endif
/* Used after attach while negotiating new disk state. */
union drbd_state new_state_tmp;
@@ -946,6 +994,7 @@ struct drbd_conf {
union drbd_state state;
wait_queue_head_t misc_wait;
wait_queue_head_t state_wait; /* upon each state change. */
+ wait_queue_head_t net_cnt_wait;
unsigned int send_cnt;
unsigned int recv_cnt;
unsigned int read_cnt;
@@ -974,12 +1023,16 @@ struct drbd_conf {
unsigned long rs_start;
/* cumulated time in PausedSyncX state [unit jiffies] */
unsigned long rs_paused;
+ /* skipped because csum was equal [unit BM_BLOCK_SIZE] */
+ unsigned long rs_same_csum;
+#define DRBD_SYNC_MARKS 8
+#define DRBD_SYNC_MARK_STEP (3*HZ)
/* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
- unsigned long rs_mark_left;
+ unsigned long rs_mark_left[DRBD_SYNC_MARKS];
/* marks's time [unit jiffies] */
- unsigned long rs_mark_time;
- /* skipped because csum was equeal [unit BM_BLOCK_SIZE] */
- unsigned long rs_same_csum;
+ unsigned long rs_mark_time[DRBD_SYNC_MARKS];
+ /* current index into rs_mark_{left,time} */
+ int rs_last_mark;
/* where does the admin want us to start? (sector) */
sector_t ov_start_sector;
@@ -1012,10 +1065,10 @@ struct drbd_conf {
spinlock_t epoch_lock;
unsigned int epochs;
enum write_ordering_e write_ordering;
- struct list_head active_ee; /* IO in progress */
- struct list_head sync_ee; /* IO in progress */
+ struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
+ struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
struct list_head done_ee; /* send ack */
- struct list_head read_ee; /* IO in progress */
+ struct list_head read_ee; /* IO in progress (any read) */
struct list_head net_ee; /* zero-copy network send in progress */
struct hlist_head *ee_hash; /* is proteced by req_lock! */
unsigned int ee_hash_s;
@@ -1026,7 +1079,8 @@ struct drbd_conf {
int next_barrier_nr;
struct hlist_head *app_reads_hash; /* is proteced by req_lock */
struct list_head resync_reads;
- atomic_t pp_in_use;
+ atomic_t pp_in_use; /* allocated from page pool */
+ atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
wait_queue_head_t ee_wait;
struct page *md_io_page; /* one page buffer for md_io */
struct page *md_io_tmpp; /* for logical_block_size != 512 */
@@ -1054,6 +1108,15 @@ struct drbd_conf {
u64 ed_uuid; /* UUID of the exposed data */
struct mutex state_mutex;
char congestion_reason; /* Why we where congested... */
+ atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
+ atomic_t rs_sect_ev; /* for submitted resync data rate, both */
+ int rs_last_sect_ev; /* counter to compare with */
+ int rs_last_events; /* counter of read or write "events" (unit sectors)
+ * on the lower level device when we last looked. */
+ int c_sync_rate; /* current resync rate after syncer throttle magic */
+ struct fifo_buffer rs_plan_s; /* correction values of resync planer */
+ int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
+ int rs_planed; /* resync sectors already planed */
};
static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
@@ -1138,6 +1201,8 @@ extern void drbd_free_resources(struct drbd_conf *mdev);
extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
unsigned int set_size);
extern void tl_clear(struct drbd_conf *mdev);
+enum drbd_req_event;
+extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
extern void drbd_free_sock(struct drbd_conf *mdev);
extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
@@ -1150,12 +1215,12 @@ extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_f
extern int _drbd_send_state(struct drbd_conf *mdev);
extern int drbd_send_state(struct drbd_conf *mdev);
extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
- enum drbd_packets cmd, struct p_header *h,
+ enum drbd_packets cmd, struct p_header80 *h,
size_t size, unsigned msg_flags);
#define USE_DATA_SOCKET 1
#define USE_META_SOCKET 0
extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
- enum drbd_packets cmd, struct p_header *h,
+ enum drbd_packets cmd, struct p_header80 *h,
size_t size);
extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
char *data, size_t size);
@@ -1167,7 +1232,7 @@ extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
struct p_block_req *rp);
extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_data *dp);
+ struct p_data *dp, int data_size);
extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
sector_t sector, int blksize, u64 block_id);
extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
@@ -1201,7 +1266,13 @@ extern void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
extern void drbd_md_set_flag(struct drbd_conf *mdev, int flags) __must_hold(local);
extern void drbd_md_clear_flag(struct drbd_conf *mdev, int flags)__must_hold(local);
extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
+#ifndef DRBD_DEBUG_MD_SYNC
extern void drbd_md_mark_dirty(struct drbd_conf *mdev);
+#else
+#define drbd_md_mark_dirty(m) drbd_md_mark_dirty_(m, __LINE__ , __func__ )
+extern void drbd_md_mark_dirty_(struct drbd_conf *mdev,
+ unsigned int line, const char *func);
+#endif
extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
int (*io_fn)(struct drbd_conf *),
void (*done)(struct drbd_conf *, int),
@@ -1209,6 +1280,7 @@ extern void drbd_queue_bitmap_io(struct drbd_conf *mdev,
extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
extern int drbd_bitmap_io(struct drbd_conf *mdev, int (*io_fn)(struct drbd_conf *), char *why);
+extern void drbd_go_diskless(struct drbd_conf *mdev);
/* Meta data layout
@@ -1264,6 +1336,8 @@ struct bm_extent {
* Bit 1 ==> local node thinks this block needs to be synced.
*/
+#define SLEEP_TIME (HZ/10)
+
#define BM_BLOCK_SHIFT 12 /* 4k per bit */
#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
@@ -1335,11 +1409,13 @@ struct bm_extent {
#endif
/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
- * With a value of 6 all IO in one 32K block make it to the same slot of the
+ * With a value of 8 all IO in one 128K block make it to the same slot of the
* hash table. */
-#define HT_SHIFT 6
+#define HT_SHIFT 8
#define DRBD_MAX_SEGMENT_SIZE (1U<<(9+HT_SHIFT))
+#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */
+
/* Number of elements in the app_reads_hash */
#define APP_R_HSIZE 15
@@ -1369,6 +1445,7 @@ extern unsigned long drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_
/* bm_find_next variants for use while you hold drbd_bm_lock() */
extern unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo);
extern unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo);
+extern unsigned long _drbd_bm_total_weight(struct drbd_conf *mdev);
extern unsigned long drbd_bm_total_weight(struct drbd_conf *mdev);
extern int drbd_bm_rs_done(struct drbd_conf *mdev);
/* for receive_bitmap */
@@ -1421,7 +1498,8 @@ extern void resync_after_online_grow(struct drbd_conf *);
extern void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int) __must_hold(local);
extern int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role,
int force);
-enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
+extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
+extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
/* drbd_worker.c */
@@ -1467,10 +1545,12 @@ extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
+extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
extern void resync_timer_fn(unsigned long data);
/* drbd_receiver.c */
+extern int drbd_rs_should_slow_down(struct drbd_conf *mdev);
extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
const unsigned rw, const int fault_type);
extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
@@ -1479,7 +1559,10 @@ extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
sector_t sector,
unsigned int data_size,
gfp_t gfp_mask) __must_hold(local);
-extern void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e);
+extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
+ int is_net);
+#define drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0)
+#define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1)
extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
struct list_head *head);
extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
@@ -1487,6 +1570,7 @@ extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
extern void drbd_flush_workqueue(struct drbd_conf *mdev);
+extern void drbd_free_tl_hash(struct drbd_conf *mdev);
/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
* mess with get_fs/set_fs, we know we are KERNEL_DS always. */
@@ -1600,6 +1684,8 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
#define susp_MASK 1
#define user_isp_MASK 1
#define aftr_isp_MASK 1
+#define susp_nod_MASK 1
+#define susp_fen_MASK 1
#define NS(T, S) \
({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
@@ -1856,13 +1942,6 @@ static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
}
static inline void
-_drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
-{
- list_add_tail(&w->list, &q->q);
- up(&q->s);
-}
-
-static inline void
drbd_queue_work_front(struct drbd_work_queue *q, struct drbd_work *w)
{
unsigned long flags;
@@ -1899,19 +1978,19 @@ static inline void request_ping(struct drbd_conf *mdev)
static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
enum drbd_packets cmd)
{
- struct p_header h;
+ struct p_header80 h;
return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
}
static inline int drbd_send_ping(struct drbd_conf *mdev)
{
- struct p_header h;
+ struct p_header80 h;
return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
}
static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
{
- struct p_header h;
+ struct p_header80 h;
return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
}
@@ -2013,7 +2092,7 @@ static inline void inc_unacked(struct drbd_conf *mdev)
static inline void put_net_conf(struct drbd_conf *mdev)
{
if (atomic_dec_and_test(&mdev->net_cnt))
- wake_up(&mdev->misc_wait);
+ wake_up(&mdev->net_cnt_wait);
}
/**
@@ -2044,10 +2123,14 @@ static inline int get_net_conf(struct drbd_conf *mdev)
static inline void put_ldev(struct drbd_conf *mdev)
{
+ int i = atomic_dec_return(&mdev->local_cnt);
__release(local);
- if (atomic_dec_and_test(&mdev->local_cnt))
+ D_ASSERT(i >= 0);
+ if (i == 0) {
+ if (mdev->state.disk == D_FAILED)
+ drbd_go_diskless(mdev);
wake_up(&mdev->misc_wait);
- D_ASSERT(atomic_read(&mdev->local_cnt) >= 0);
+ }
}
#ifndef __CHECKER__
@@ -2179,11 +2262,16 @@ static inline int drbd_state_is_stable(union drbd_state s)
return 1;
}
+static inline int is_susp(union drbd_state s)
+{
+ return s.susp || s.susp_nod || s.susp_fen;
+}
+
static inline int __inc_ap_bio_cond(struct drbd_conf *mdev)
{
int mxb = drbd_get_max_buffers(mdev);
- if (mdev->state.susp)
+ if (is_susp(mdev->state))
return 0;
if (test_bit(SUSPEND_IO, &mdev->flags))
return 0;
@@ -2321,8 +2409,7 @@ static inline void drbd_md_flush(struct drbd_conf *mdev)
if (test_bit(MD_NO_BARRIER, &mdev->flags))
return;
- r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL,
- BLKDEV_IFL_WAIT);
+ r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_KERNEL, NULL);
if (r) {
set_bit(MD_NO_BARRIER, &mdev->flags);
dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index fa650dd85b90..25c7a73c5062 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -32,7 +32,7 @@
#include <asm/types.h>
#include <net/sock.h>
#include <linux/ctype.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/proc_fs.h>
@@ -64,6 +64,7 @@ struct after_state_chg_work {
struct completion *done;
};
+static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
int drbd_asender(struct drbd_thread *);
@@ -77,6 +78,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
"Lars Ellenberg <lars@linbit.com>");
@@ -199,7 +201,7 @@ static int tl_init(struct drbd_conf *mdev)
INIT_LIST_HEAD(&b->w.list);
b->next = NULL;
b->br_number = 4711;
- b->n_req = 0;
+ b->n_writes = 0;
b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
mdev->oldest_tle = b;
@@ -240,7 +242,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
INIT_LIST_HEAD(&new->w.list);
new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
new->next = NULL;
- new->n_req = 0;
+ new->n_writes = 0;
newest_before = mdev->newest_tle;
/* never send a barrier number == 0, because that is special-cased
@@ -284,9 +286,9 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
barrier_nr, b->br_number);
goto bail;
}
- if (b->n_req != set_size) {
- dev_err(DEV, "BAD! BarrierAck #%u received with n_req=%u, expected n_req=%u!\n",
- barrier_nr, set_size, b->n_req);
+ if (b->n_writes != set_size) {
+ dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
+ barrier_nr, set_size, b->n_writes);
goto bail;
}
@@ -333,6 +335,82 @@ bail:
drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
}
+/**
+ * _tl_restart() - Walks the transfer log, and applies an action to all requests
+ * @mdev: DRBD device.
+ * @what: The action/event to perform with all request objects
+ *
+ * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
+ * restart_frozen_disk_io.
+ */
+static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
+{
+ struct drbd_tl_epoch *b, *tmp, **pn;
+ struct list_head *le, *tle, carry_reads;
+ struct drbd_request *req;
+ int rv, n_writes, n_reads;
+
+ b = mdev->oldest_tle;
+ pn = &mdev->oldest_tle;
+ while (b) {
+ n_writes = 0;
+ n_reads = 0;
+ INIT_LIST_HEAD(&carry_reads);
+ list_for_each_safe(le, tle, &b->requests) {
+ req = list_entry(le, struct drbd_request, tl_requests);
+ rv = _req_mod(req, what);
+
+ n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
+ n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
+ }
+ tmp = b->next;
+
+ if (n_writes) {
+ if (what == resend) {
+ b->n_writes = n_writes;
+ if (b->w.cb == NULL) {
+ b->w.cb = w_send_barrier;
+ inc_ap_pending(mdev);
+ set_bit(CREATE_BARRIER, &mdev->flags);
+ }
+
+ drbd_queue_work(&mdev->data.work, &b->w);
+ }
+ pn = &b->next;
+ } else {
+ if (n_reads)
+ list_add(&carry_reads, &b->requests);
+ /* there could still be requests on that ring list,
+ * in case local io is still pending */
+ list_del(&b->requests);
+
+ /* dec_ap_pending corresponding to queue_barrier.
+ * the newest barrier may not have been queued yet,
+ * in which case w.cb is still NULL. */
+ if (b->w.cb != NULL)
+ dec_ap_pending(mdev);
+
+ if (b == mdev->newest_tle) {
+ /* recycle, but reinit! */
+ D_ASSERT(tmp == NULL);
+ INIT_LIST_HEAD(&b->requests);
+ list_splice(&carry_reads, &b->requests);
+ INIT_LIST_HEAD(&b->w.list);
+ b->w.cb = NULL;
+ b->br_number = net_random();
+ b->n_writes = 0;
+
+ *pn = b;
+ break;
+ }
+ *pn = tmp;
+ kfree(b);
+ }
+ b = tmp;
+ list_splice(&carry_reads, &b->requests);
+ }
+}
+
/**
* tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
@@ -344,48 +422,12 @@ bail:
*/
void tl_clear(struct drbd_conf *mdev)
{
- struct drbd_tl_epoch *b, *tmp;
struct list_head *le, *tle;
struct drbd_request *r;
- int new_initial_bnr = net_random();
spin_lock_irq(&mdev->req_lock);
- b = mdev->oldest_tle;
- while (b) {
- list_for_each_safe(le, tle, &b->requests) {
- r = list_entry(le, struct drbd_request, tl_requests);
- /* It would be nice to complete outside of spinlock.
- * But this is easier for now. */
- _req_mod(r, connection_lost_while_pending);
- }
- tmp = b->next;
-
- /* there could still be requests on that ring list,
- * in case local io is still pending */
- list_del(&b->requests);
-
- /* dec_ap_pending corresponding to queue_barrier.
- * the newest barrier may not have been queued yet,
- * in which case w.cb is still NULL. */
- if (b->w.cb != NULL)
- dec_ap_pending(mdev);
-
- if (b == mdev->newest_tle) {
- /* recycle, but reinit! */
- D_ASSERT(tmp == NULL);
- INIT_LIST_HEAD(&b->requests);
- INIT_LIST_HEAD(&b->w.list);
- b->w.cb = NULL;
- b->br_number = new_initial_bnr;
- b->n_req = 0;
-
- mdev->oldest_tle = b;
- break;
- }
- kfree(b);
- b = tmp;
- }
+ _tl_restart(mdev, connection_lost_while_pending);
/* we expect this list to be empty. */
D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
@@ -401,6 +443,15 @@ void tl_clear(struct drbd_conf *mdev)
/* ensure bit indicating barrier is required is clear */
clear_bit(CREATE_BARRIER, &mdev->flags);
+ memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
+
+ spin_unlock_irq(&mdev->req_lock);
+}
+
+void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
+{
+ spin_lock_irq(&mdev->req_lock);
+ _tl_restart(mdev, what);
spin_unlock_irq(&mdev->req_lock);
}
@@ -455,7 +506,7 @@ static int is_valid_state(struct drbd_conf *mdev, union drbd_state ns);
static int is_valid_state_transition(struct drbd_conf *,
union drbd_state, union drbd_state);
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, int *warn_sync_abort);
+ union drbd_state ns, const char **warn_sync_abort);
int drbd_send_state_req(struct drbd_conf *,
union drbd_state, union drbd_state);
@@ -605,7 +656,7 @@ static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
drbd_role_str(ns.peer),
drbd_disk_str(ns.disk),
drbd_disk_str(ns.pdsk),
- ns.susp ? 's' : 'r',
+ is_susp(ns) ? 's' : 'r',
ns.aftr_isp ? 'a' : '-',
ns.peer_isp ? 'p' : '-',
ns.user_isp ? 'u' : '-'
@@ -763,7 +814,7 @@ static int is_valid_state_transition(struct drbd_conf *mdev,
* to D_UNKNOWN. This rule and many more along those lines are in this function.
*/
static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, int *warn_sync_abort)
+ union drbd_state ns, const char **warn_sync_abort)
{
enum drbd_fencing_p fp;
@@ -778,9 +829,10 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
os.conn <= C_DISCONNECTING)
ns.conn = os.conn;
- /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow */
+ /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
+ * If you try to go into some Sync* state, that shall fail (elsewhere). */
if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
- ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING)
+ ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
ns.conn = os.conn;
/* After C_DISCONNECTING only C_STANDALONE may follow */
@@ -798,14 +850,13 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
ns.aftr_isp = 0;
- if (ns.conn <= C_DISCONNECTING && ns.disk == D_DISKLESS)
- ns.pdsk = D_UNKNOWN;
-
/* Abort resync if a disk fails/detaches */
if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
(ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
if (warn_sync_abort)
- *warn_sync_abort = 1;
+ *warn_sync_abort =
+ os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
+ "Online-verify" : "Resync";
ns.conn = C_CONNECTED;
}
@@ -876,7 +927,12 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
if (fp == FP_STONITH &&
(ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
!(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
- ns.susp = 1;
+ ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
+
+ if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
+ (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
+ !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
+ ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
if (ns.conn == C_SYNC_SOURCE)
@@ -912,6 +968,12 @@ static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
}
}
+static void drbd_resume_al(struct drbd_conf *mdev)
+{
+ if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
+ dev_info(DEV, "Resumed AL updates\n");
+}
+
/**
* __drbd_set_state() - Set a new DRBD state
* @mdev: DRBD device.
@@ -927,7 +989,7 @@ int __drbd_set_state(struct drbd_conf *mdev,
{
union drbd_state os;
int rv = SS_SUCCESS;
- int warn_sync_abort = 0;
+ const char *warn_sync_abort = NULL;
struct after_state_chg_work *ascw;
os = mdev->state;
@@ -946,14 +1008,8 @@ int __drbd_set_state(struct drbd_conf *mdev,
/* If the old state was illegal as well, then let
this happen...*/
- if (is_valid_state(mdev, os) == rv) {
- dev_err(DEV, "Considering state change from bad state. "
- "Error would be: '%s'\n",
- drbd_set_st_err_str(rv));
- print_st(mdev, "old", os);
- print_st(mdev, "new", ns);
+ if (is_valid_state(mdev, os) == rv)
rv = is_valid_state_transition(mdev, ns, os);
- }
} else
rv = is_valid_state_transition(mdev, ns, os);
}
@@ -965,7 +1021,7 @@ int __drbd_set_state(struct drbd_conf *mdev,
}
if (warn_sync_abort)
- dev_warn(DEV, "Resync aborted.\n");
+ dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
{
char *pbp, pb[300];
@@ -976,7 +1032,10 @@ int __drbd_set_state(struct drbd_conf *mdev,
PSC(conn);
PSC(disk);
PSC(pdsk);
- PSC(susp);
+ if (is_susp(ns) != is_susp(os))
+ pbp += sprintf(pbp, "susp( %s -> %s ) ",
+ drbd_susp_str(is_susp(os)),
+ drbd_susp_str(is_susp(ns)));
PSC(aftr_isp);
PSC(peer_isp);
PSC(user_isp);
@@ -1001,12 +1060,6 @@ int __drbd_set_state(struct drbd_conf *mdev,
wake_up(&mdev->misc_wait);
wake_up(&mdev->state_wait);
- /* post-state-change actions */
- if (os.conn >= C_SYNC_SOURCE && ns.conn <= C_CONNECTED) {
- set_bit(STOP_SYNC_TIMER, &mdev->flags);
- mod_timer(&mdev->resync_timer, jiffies);
- }
-
/* aborted verify run. log the last position */
if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
ns.conn < C_CONNECTED) {
@@ -1019,41 +1072,42 @@ int __drbd_set_state(struct drbd_conf *mdev,
if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
(ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
dev_info(DEV, "Syncer continues.\n");
- mdev->rs_paused += (long)jiffies-(long)mdev->rs_mark_time;
- if (ns.conn == C_SYNC_TARGET) {
- if (!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))
- mod_timer(&mdev->resync_timer, jiffies);
- /* This if (!test_bit) is only needed for the case
- that a device that has ceased to used its timer,
- i.e. it is already in drbd_resync_finished() gets
- paused and resumed. */
- }
+ mdev->rs_paused += (long)jiffies
+ -(long)mdev->rs_mark_time[mdev->rs_last_mark];
+ if (ns.conn == C_SYNC_TARGET)
+ mod_timer(&mdev->resync_timer, jiffies);
}
if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
(ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
dev_info(DEV, "Resync suspended\n");
- mdev->rs_mark_time = jiffies;
- if (ns.conn == C_PAUSED_SYNC_T)
- set_bit(STOP_SYNC_TIMER, &mdev->flags);
+ mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
}
if (os.conn == C_CONNECTED &&
(ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
+ unsigned long now = jiffies;
+ int i;
+
mdev->ov_position = 0;
- mdev->rs_total =
- mdev->rs_mark_left = drbd_bm_bits(mdev);
+ mdev->rs_total = drbd_bm_bits(mdev);
if (mdev->agreed_pro_version >= 90)
set_ov_position(mdev, ns.conn);
else
mdev->ov_start_sector = 0;
mdev->ov_left = mdev->rs_total
- BM_SECT_TO_BIT(mdev->ov_position);
- mdev->rs_start =
- mdev->rs_mark_time = jiffies;
+ mdev->rs_start = now;
+ mdev->rs_last_events = 0;
+ mdev->rs_last_sect_ev = 0;
mdev->ov_last_oos_size = 0;
mdev->ov_last_oos_start = 0;
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ mdev->rs_mark_left[i] = mdev->rs_total;
+ mdev->rs_mark_time[i] = now;
+ }
+
if (ns.conn == C_VERIFY_S) {
dev_info(DEV, "Starting Online Verify from sector %llu\n",
(unsigned long long)mdev->ov_position);
@@ -1106,6 +1160,10 @@ int __drbd_set_state(struct drbd_conf *mdev,
ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
drbd_thread_restart_nowait(&mdev->receiver);
+ /* Resume AL writing if we get a connection */
+ if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
+ drbd_resume_al(mdev);
+
ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
if (ascw) {
ascw->os = os;
@@ -1164,6 +1222,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
union drbd_state ns, enum chg_state_flags flags)
{
enum drbd_fencing_p fp;
+ enum drbd_req_event what = nothing;
+ union drbd_state nsm = (union drbd_state){ .i = -1 };
if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
clear_bit(CRASHED_PRIMARY, &mdev->flags);
@@ -1187,17 +1247,49 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
/* Here we have the actions that are performed after a
state change. This function might sleep */
- if (fp == FP_STONITH && ns.susp) {
- /* case1: The outdate peer handler is successful:
- * case2: The connection was established again: */
- if ((os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) ||
- (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)) {
+ nsm.i = -1;
+ if (ns.susp_nod) {
+ if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
+ if (ns.conn == C_CONNECTED)
+ what = resend, nsm.susp_nod = 0;
+ else /* ns.conn > C_CONNECTED */
+ dev_err(DEV, "Unexpected Resynd going on!\n");
+ }
+
+ if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
+ what = restart_frozen_disk_io, nsm.susp_nod = 0;
+
+ }
+
+ if (ns.susp_fen) {
+ /* case1: The outdate peer handler is successful: */
+ if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
tl_clear(mdev);
+ if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
+ drbd_uuid_new_current(mdev);
+ clear_bit(NEW_CUR_UUID, &mdev->flags);
+ drbd_md_sync(mdev);
+ }
spin_lock_irq(&mdev->req_lock);
- _drbd_set_state(_NS(mdev, susp, 0), CS_VERBOSE, NULL);
+ _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
spin_unlock_irq(&mdev->req_lock);
}
+ /* case2: The connection was established again: */
+ if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
+ clear_bit(NEW_CUR_UUID, &mdev->flags);
+ what = resend;
+ nsm.susp_fen = 0;
+ }
+ }
+
+ if (what != nothing) {
+ spin_lock_irq(&mdev->req_lock);
+ _tl_restart(mdev, what);
+ nsm.i &= mdev->state.i;
+ _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
+ spin_unlock_irq(&mdev->req_lock);
}
+
/* Do not change the order of the if above and the two below... */
if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
drbd_send_uuids(mdev);
@@ -1216,16 +1308,22 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
if (get_ldev(mdev)) {
if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
- drbd_uuid_new_current(mdev);
- drbd_send_uuids(mdev);
+ if (is_susp(mdev->state)) {
+ set_bit(NEW_CUR_UUID, &mdev->flags);
+ } else {
+ drbd_uuid_new_current(mdev);
+ drbd_send_uuids(mdev);
+ }
}
put_ldev(mdev);
}
}
if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
- if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0)
+ if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
drbd_uuid_new_current(mdev);
+ drbd_send_uuids(mdev);
+ }
/* D_DISKLESS Peer becomes secondary */
if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
@@ -1267,42 +1365,51 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL, "set_n_write from invalidate");
+ /* first half of local IO error */
if (os.disk > D_FAILED && ns.disk == D_FAILED) {
- enum drbd_io_error_p eh;
+ enum drbd_io_error_p eh = EP_PASS_ON;
+
+ if (drbd_send_state(mdev))
+ dev_warn(DEV, "Notified peer that my disk is broken.\n");
+ else
+ dev_err(DEV, "Sending state for drbd_io_error() failed\n");
+
+ drbd_rs_cancel_all(mdev);
- eh = EP_PASS_ON;
if (get_ldev_if_state(mdev, D_FAILED)) {
eh = mdev->ldev->dc.on_io_error;
put_ldev(mdev);
}
+ if (eh == EP_CALL_HELPER)
+ drbd_khelper(mdev, "local-io-error");
+ }
- drbd_rs_cancel_all(mdev);
- /* since get_ldev() only works as long as disk>=D_INCONSISTENT,
- and it is D_DISKLESS here, local_cnt can only go down, it can
- not increase... It will reach zero */
- wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
+
+ /* second half of local IO error handling,
+ * after local_cnt references have reached zero: */
+ if (os.disk == D_FAILED && ns.disk == D_DISKLESS) {
mdev->rs_total = 0;
mdev->rs_failed = 0;
atomic_set(&mdev->rs_pending_cnt, 0);
-
- spin_lock_irq(&mdev->req_lock);
- _drbd_set_state(_NS(mdev, disk, D_DISKLESS), CS_HARD, NULL);
- spin_unlock_irq(&mdev->req_lock);
-
- if (eh == EP_CALL_HELPER)
- drbd_khelper(mdev, "local-io-error");
}
if (os.disk > D_DISKLESS && ns.disk == D_DISKLESS) {
+ /* We must still be diskless,
+ * re-attach has to be serialized with this! */
+ if (mdev->state.disk != D_DISKLESS)
+ dev_err(DEV,
+ "ASSERT FAILED: disk is %s while going diskless\n",
+ drbd_disk_str(mdev->state.disk));
+
+ /* we cannot assert local_cnt == 0 here, as get_ldev_if_state
+ * will inc/dec it frequently. Since we became D_DISKLESS, no
+ * one has touched the protected members anymore, though, so we
+ * are safe to free them here. */
+ if (drbd_send_state(mdev))
+ dev_warn(DEV, "Notified peer that I detached my disk.\n");
+ else
+ dev_err(DEV, "Sending state for detach failed\n");
- if (os.disk == D_FAILED) /* && ns.disk == D_DISKLESS*/ {
- if (drbd_send_state(mdev))
- dev_warn(DEV, "Notified peer that my disk is broken.\n");
- else
- dev_err(DEV, "Sending state in drbd_io_error() failed\n");
- }
-
- wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
lc_destroy(mdev->resync);
mdev->resync = NULL;
lc_destroy(mdev->act_log);
@@ -1311,8 +1418,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
drbd_free_bc(mdev->ldev);
mdev->ldev = NULL;);
- if (mdev->md_io_tmpp)
+ if (mdev->md_io_tmpp) {
__free_page(mdev->md_io_tmpp);
+ mdev->md_io_tmpp = NULL;
+ }
}
/* Disks got bigger while they were detached */
@@ -1328,6 +1437,15 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
(os.user_isp && !ns.user_isp))
resume_next_sg(mdev);
+ /* sync target done with resync. Explicitly notify peer, even though
+ * it should (at least for non-empty resyncs) already know itself. */
+ if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
+ drbd_send_state(mdev);
+
+ /* free tl_hash if we Got thawed and are C_STANDALONE */
+ if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
+ drbd_free_tl_hash(mdev);
+
/* Upon network connection, we need to start the receiver */
if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
drbd_thread_start(&mdev->receiver);
@@ -1554,7 +1672,7 @@ void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
/* the appropriate socket mutex must be held already */
int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
- enum drbd_packets cmd, struct p_header *h,
+ enum drbd_packets cmd, struct p_header80 *h,
size_t size, unsigned msg_flags)
{
int sent, ok;
@@ -1564,7 +1682,7 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
h->magic = BE_DRBD_MAGIC;
h->command = cpu_to_be16(cmd);
- h->length = cpu_to_be16(size-sizeof(struct p_header));
+ h->length = cpu_to_be16(size-sizeof(struct p_header80));
sent = drbd_send(mdev, sock, h, size, msg_flags);
@@ -1579,7 +1697,7 @@ int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
* when we hold the appropriate socket mutex.
*/
int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
- enum drbd_packets cmd, struct p_header *h, size_t size)
+ enum drbd_packets cmd, struct p_header80 *h, size_t size)
{
int ok = 0;
struct socket *sock;
@@ -1607,7 +1725,7 @@ int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
size_t size)
{
- struct p_header h;
+ struct p_header80 h;
int ok;
h.magic = BE_DRBD_MAGIC;
@@ -1629,7 +1747,7 @@ int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
{
- struct p_rs_param_89 *p;
+ struct p_rs_param_95 *p;
struct socket *sock;
int size, rv;
const int apv = mdev->agreed_pro_version;
@@ -1637,7 +1755,8 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
size = apv <= 87 ? sizeof(struct p_rs_param)
: apv == 88 ? sizeof(struct p_rs_param)
+ strlen(mdev->sync_conf.verify_alg) + 1
- : /* 89 */ sizeof(struct p_rs_param_89);
+ : apv <= 94 ? sizeof(struct p_rs_param_89)
+ : /* apv >= 95 */ sizeof(struct p_rs_param_95);
/* used from admin command context and receiver/worker context.
* to avoid kmalloc, grab the socket right here,
@@ -1648,12 +1767,16 @@ int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
if (likely(sock != NULL)) {
enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
- p = &mdev->data.sbuf.rs_param_89;
+ p = &mdev->data.sbuf.rs_param_95;
/* initialize verify_alg and csums_alg */
memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
p->rate = cpu_to_be32(sc->rate);
+ p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
+ p->c_delay_target = cpu_to_be32(sc->c_delay_target);
+ p->c_fill_target = cpu_to_be32(sc->c_fill_target);
+ p->c_max_rate = cpu_to_be32(sc->c_max_rate);
if (apv >= 88)
strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
@@ -1709,7 +1832,7 @@ int drbd_send_protocol(struct drbd_conf *mdev)
strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
- (struct p_header *)p, size);
+ (struct p_header80 *)p, size);
kfree(p);
return rv;
}
@@ -1735,7 +1858,7 @@ int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
put_ldev(mdev);
return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
}
int drbd_send_uuids(struct drbd_conf *mdev)
@@ -1756,7 +1879,7 @@ int drbd_send_sync_uuid(struct drbd_conf *mdev, u64 val)
p.uuid = cpu_to_be64(val);
return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
}
int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
@@ -1786,7 +1909,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
p.dds_flags = cpu_to_be16(flags);
ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
return ok;
}
@@ -1811,7 +1934,7 @@ int drbd_send_state(struct drbd_conf *mdev)
if (likely(sock != NULL)) {
ok = _drbd_send_cmd(mdev, sock, P_STATE,
- (struct p_header *)&p, sizeof(p), 0);
+ (struct p_header80 *)&p, sizeof(p), 0);
}
mutex_unlock(&mdev->data.mutex);
@@ -1829,7 +1952,7 @@ int drbd_send_state_req(struct drbd_conf *mdev,
p.val = cpu_to_be32(val.i);
return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
}
int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
@@ -1839,7 +1962,7 @@ int drbd_send_sr_reply(struct drbd_conf *mdev, int retcode)
p.retcode = cpu_to_be32(retcode);
return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
}
int fill_bitmap_rle_bits(struct drbd_conf *mdev,
@@ -1938,7 +2061,7 @@ int fill_bitmap_rle_bits(struct drbd_conf *mdev,
enum { OK, FAILED, DONE }
send_bitmap_rle_or_plain(struct drbd_conf *mdev,
- struct p_header *h, struct bm_xfer_ctx *c)
+ struct p_header80 *h, struct bm_xfer_ctx *c)
{
struct p_compressed_bm *p = (void*)h;
unsigned long num_words;
@@ -1968,12 +2091,12 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev,
if (len)
drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
- h, sizeof(struct p_header) + len, 0);
+ h, sizeof(struct p_header80) + len, 0);
c->word_offset += num_words;
c->bit_offset = c->word_offset * BITS_PER_LONG;
c->packets[1]++;
- c->bytes[1] += sizeof(struct p_header) + len;
+ c->bytes[1] += sizeof(struct p_header80) + len;
if (c->bit_offset > c->bm_bits)
c->bit_offset = c->bm_bits;
@@ -1989,14 +2112,14 @@ send_bitmap_rle_or_plain(struct drbd_conf *mdev,
int _drbd_send_bitmap(struct drbd_conf *mdev)
{
struct bm_xfer_ctx c;
- struct p_header *p;
+ struct p_header80 *p;
int ret;
ERR_IF(!mdev->bitmap) return FALSE;
/* maybe we should use some per thread scratch page,
* and allocate that during initial device creation? */
- p = (struct p_header *) __get_free_page(GFP_NOIO);
+ p = (struct p_header80 *) __get_free_page(GFP_NOIO);
if (!p) {
dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
return FALSE;
@@ -2054,7 +2177,7 @@ int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
if (mdev->state.conn < C_CONNECTED)
return FALSE;
ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
return ok;
}
@@ -2082,17 +2205,18 @@ static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
return FALSE;
ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
return ok;
}
+/* dp->sector and dp->block_id already/still in network byte order,
+ * data_size is payload size according to dp->head,
+ * and may need to be corrected for digest size. */
int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_data *dp)
+ struct p_data *dp, int data_size)
{
- const int header_size = sizeof(struct p_data)
- - sizeof(struct p_header);
- int data_size = ((struct p_header *)dp)->length - header_size;
-
+ data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
+ crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
dp->block_id);
}
@@ -2140,7 +2264,7 @@ int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
p.blksize = cpu_to_be32(size);
ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
return ok;
}
@@ -2158,7 +2282,7 @@ int drbd_send_drequest_csum(struct drbd_conf *mdev,
p.head.magic = BE_DRBD_MAGIC;
p.head.command = cpu_to_be16(cmd);
- p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header) + digest_size);
+ p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
mutex_lock(&mdev->data.mutex);
@@ -2180,7 +2304,7 @@ int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
p.blksize = cpu_to_be32(size);
ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
- (struct p_header *)&p, sizeof(p));
+ (struct p_header80 *)&p, sizeof(p));
return ok;
}
@@ -2332,6 +2456,18 @@ static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
return 1;
}
+static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
+{
+ if (mdev->agreed_pro_version >= 95)
+ return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
+ (bi_rw & REQ_UNPLUG ? DP_UNPLUG : 0) |
+ (bi_rw & REQ_FUA ? DP_FUA : 0) |
+ (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
+ (bi_rw & REQ_DISCARD ? DP_DISCARD : 0);
+ else
+ return bi_rw & (REQ_SYNC | REQ_UNPLUG) ? DP_RW_SYNC : 0;
+}
+
/* Used to send write requests
* R_PRIMARY -> Peer (P_DATA)
*/
@@ -2349,30 +2485,25 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
- p.head.magic = BE_DRBD_MAGIC;
- p.head.command = cpu_to_be16(P_DATA);
- p.head.length =
- cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + req->size);
+ if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
+ p.head.h80.magic = BE_DRBD_MAGIC;
+ p.head.h80.command = cpu_to_be16(P_DATA);
+ p.head.h80.length =
+ cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
+ } else {
+ p.head.h95.magic = BE_DRBD_MAGIC_BIG;
+ p.head.h95.command = cpu_to_be16(P_DATA);
+ p.head.h95.length =
+ cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
+ }
p.sector = cpu_to_be64(req->sector);
p.block_id = (unsigned long)req;
p.seq_num = cpu_to_be32(req->seq_num =
atomic_add_return(1, &mdev->packet_seq));
- dp_flags = 0;
- /* NOTE: no need to check if barriers supported here as we would
- * not pass the test in make_request_common in that case
- */
- if (req->master_bio->bi_rw & REQ_HARDBARRIER) {
- dev_err(DEV, "ASSERT FAILED would have set DP_HARDBARRIER\n");
- /* dp_flags |= DP_HARDBARRIER; */
- }
- if (req->master_bio->bi_rw & REQ_SYNC)
- dp_flags |= DP_RW_SYNC;
- /* for now handle SYNCIO and UNPLUG
- * as if they still were one and the same flag */
- if (req->master_bio->bi_rw & REQ_UNPLUG)
- dp_flags |= DP_RW_SYNC;
+ dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
+
if (mdev->state.conn >= C_SYNC_SOURCE &&
mdev->state.conn <= C_PAUSED_SYNC_T)
dp_flags |= DP_MAY_SET_IN_SYNC;
@@ -2413,10 +2544,17 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
- p.head.magic = BE_DRBD_MAGIC;
- p.head.command = cpu_to_be16(cmd);
- p.head.length =
- cpu_to_be16(sizeof(p) - sizeof(struct p_header) + dgs + e->size);
+ if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
+ p.head.h80.magic = BE_DRBD_MAGIC;
+ p.head.h80.command = cpu_to_be16(cmd);
+ p.head.h80.length =
+ cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
+ } else {
+ p.head.h95.magic = BE_DRBD_MAGIC_BIG;
+ p.head.h95.command = cpu_to_be16(cmd);
+ p.head.h95.length =
+ cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
+ }
p.sector = cpu_to_be64(e->sector);
p.block_id = e->block_id;
@@ -2429,8 +2567,7 @@ int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
if (!drbd_get_data_sock(mdev))
return 0;
- ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p,
- sizeof(p), dgs ? MSG_MORE : 0);
+ ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
if (ok && dgs) {
dgb = mdev->int_dig_out;
drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
@@ -2536,7 +2673,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode)
unsigned long flags;
int rv = 0;
- lock_kernel();
+ mutex_lock(&drbd_main_mutex);
spin_lock_irqsave(&mdev->req_lock, flags);
/* to have a stable mdev->state.role
* and no race with updating open_cnt */
@@ -2551,7 +2688,7 @@ static int drbd_open(struct block_device *bdev, fmode_t mode)
if (!rv)
mdev->open_cnt++;
spin_unlock_irqrestore(&mdev->req_lock, flags);
- unlock_kernel();
+ mutex_unlock(&drbd_main_mutex);
return rv;
}
@@ -2559,9 +2696,9 @@ static int drbd_open(struct block_device *bdev, fmode_t mode)
static int drbd_release(struct gendisk *gd, fmode_t mode)
{
struct drbd_conf *mdev = gd->private_data;
- lock_kernel();
+ mutex_lock(&drbd_main_mutex);
mdev->open_cnt--;
- unlock_kernel();
+ mutex_unlock(&drbd_main_mutex);
return 0;
}
@@ -2605,7 +2742,13 @@ static void drbd_set_defaults(struct drbd_conf *mdev)
/* .verify_alg = */ {}, 0,
/* .cpu_mask = */ {}, 0,
/* .csums_alg = */ {}, 0,
- /* .use_rle = */ 0
+ /* .use_rle = */ 0,
+ /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
+ /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
+ /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
+ /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
+ /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
+ /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
};
/* Have to use that way, because the layout differs between
@@ -2616,7 +2759,9 @@ static void drbd_set_defaults(struct drbd_conf *mdev)
.conn = C_STANDALONE,
.disk = D_DISKLESS,
.pdsk = D_UNKNOWN,
- .susp = 0
+ .susp = 0,
+ .susp_nod = 0,
+ .susp_fen = 0
} };
}
@@ -2640,6 +2785,9 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
atomic_set(&mdev->net_cnt, 0);
atomic_set(&mdev->packet_seq, 0);
atomic_set(&mdev->pp_in_use, 0);
+ atomic_set(&mdev->pp_in_use_by_net, 0);
+ atomic_set(&mdev->rs_sect_in, 0);
+ atomic_set(&mdev->rs_sect_ev, 0);
mutex_init(&mdev->md_io_mutex);
mutex_init(&mdev->data.mutex);
@@ -2666,11 +2814,13 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
INIT_LIST_HEAD(&mdev->meta.work.q);
INIT_LIST_HEAD(&mdev->resync_work.list);
INIT_LIST_HEAD(&mdev->unplug_work.list);
+ INIT_LIST_HEAD(&mdev->go_diskless.list);
INIT_LIST_HEAD(&mdev->md_sync_work.list);
INIT_LIST_HEAD(&mdev->bm_io_work.w.list);
mdev->resync_work.cb = w_resync_inactive;
mdev->unplug_work.cb = w_send_write_hint;
+ mdev->go_diskless.cb = w_go_diskless;
mdev->md_sync_work.cb = w_md_sync;
mdev->bm_io_work.w.cb = w_bitmap_io;
init_timer(&mdev->resync_timer);
@@ -2682,6 +2832,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
init_waitqueue_head(&mdev->misc_wait);
init_waitqueue_head(&mdev->state_wait);
+ init_waitqueue_head(&mdev->net_cnt_wait);
init_waitqueue_head(&mdev->ee_wait);
init_waitqueue_head(&mdev->al_wait);
init_waitqueue_head(&mdev->seq_wait);
@@ -2697,6 +2848,7 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
void drbd_mdev_cleanup(struct drbd_conf *mdev)
{
+ int i;
if (mdev->receiver.t_state != None)
dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
mdev->receiver.t_state);
@@ -2713,9 +2865,13 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
mdev->p_size =
mdev->rs_start =
mdev->rs_total =
- mdev->rs_failed =
- mdev->rs_mark_left =
- mdev->rs_mark_time = 0;
+ mdev->rs_failed = 0;
+ mdev->rs_last_events = 0;
+ mdev->rs_last_sect_ev = 0;
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ mdev->rs_mark_left[i] = 0;
+ mdev->rs_mark_time[i] = 0;
+ }
D_ASSERT(mdev->net_conf == NULL);
drbd_set_my_capacity(mdev, 0);
@@ -2726,6 +2882,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
}
drbd_free_resources(mdev);
+ clear_bit(AL_SUSPENDED, &mdev->flags);
/*
* currently we drbd_init_ee only on module load, so
@@ -2741,6 +2898,7 @@ void drbd_mdev_cleanup(struct drbd_conf *mdev)
D_ASSERT(list_empty(&mdev->meta.work.q));
D_ASSERT(list_empty(&mdev->resync_work.list));
D_ASSERT(list_empty(&mdev->unplug_work.list));
+ D_ASSERT(list_empty(&mdev->go_diskless.list));
}
@@ -2824,7 +2982,7 @@ static int drbd_create_mempools(void)
drbd_ee_mempool = mempool_create(number,
mempool_alloc_slab, mempool_free_slab, drbd_ee_cache);
- if (drbd_request_mempool == NULL)
+ if (drbd_ee_mempool == NULL)
goto Enomem;
/* drbd's page pool */
@@ -3280,9 +3438,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
sector_t sector;
int i;
+ del_timer(&mdev->md_sync_timer);
+ /* timer may be rearmed by drbd_md_mark_dirty() now. */
if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
return;
- del_timer(&mdev->md_sync_timer);
/* We use here D_FAILED and not D_ATTACHING because we try to write
* metadata even if we detach due to a disk failure! */
@@ -3310,12 +3469,9 @@ void drbd_md_sync(struct drbd_conf *mdev)
D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
sector = mdev->ldev->md.md_offset;
- if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
- clear_bit(MD_DIRTY, &mdev->flags);
- } else {
+ if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
/* this was a try anyways ... */
dev_err(DEV, "meta data update failed!\n");
-
drbd_chk_io_error(mdev, 1, TRUE);
}
@@ -3402,6 +3558,28 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
return rv;
}
+static void debug_drbd_uuid(struct drbd_conf *mdev, enum drbd_uuid_index index)
+{
+ static char *uuid_str[UI_EXTENDED_SIZE] = {
+ [UI_CURRENT] = "CURRENT",
+ [UI_BITMAP] = "BITMAP",
+ [UI_HISTORY_START] = "HISTORY_START",
+ [UI_HISTORY_END] = "HISTORY_END",
+ [UI_SIZE] = "SIZE",
+ [UI_FLAGS] = "FLAGS",
+ };
+
+ if (index >= UI_EXTENDED_SIZE) {
+ dev_warn(DEV, " uuid_index >= EXTENDED_SIZE\n");
+ return;
+ }
+
+ dynamic_dev_dbg(DEV, " uuid[%s] now %016llX\n",
+ uuid_str[index],
+ (unsigned long long)mdev->ldev->md.uuid[index]);
+}
+
+
/**
* drbd_md_mark_dirty() - Mark meta data super block as dirty
* @mdev: DRBD device.
@@ -3410,19 +3588,31 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
* the meta-data super block. This function sets MD_DIRTY, and starts a
* timer that ensures that within five seconds you have to call drbd_md_sync().
*/
+#ifdef DEBUG
+void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
+{
+ if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
+ mod_timer(&mdev->md_sync_timer, jiffies + HZ);
+ mdev->last_md_mark_dirty.line = line;
+ mdev->last_md_mark_dirty.func = func;
+ }
+}
+#else
void drbd_md_mark_dirty(struct drbd_conf *mdev)
{
- set_bit(MD_DIRTY, &mdev->flags);
- mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
+ if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
+ mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
}
-
+#endif
static void drbd_uuid_move_history(struct drbd_conf *mdev) __must_hold(local)
{
int i;
- for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
+ for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++) {
mdev->ldev->md.uuid[i+1] = mdev->ldev->md.uuid[i];
+ debug_drbd_uuid(mdev, i+1);
+ }
}
void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
@@ -3437,6 +3627,7 @@ void _drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
}
mdev->ldev->md.uuid[idx] = val;
+ debug_drbd_uuid(mdev, idx);
drbd_md_mark_dirty(mdev);
}
@@ -3446,6 +3637,7 @@ void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local)
if (mdev->ldev->md.uuid[idx]) {
drbd_uuid_move_history(mdev);
mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[idx];
+ debug_drbd_uuid(mdev, UI_HISTORY_START);
}
_drbd_uuid_set(mdev, idx, val);
}
@@ -3464,6 +3656,7 @@ void drbd_uuid_new_current(struct drbd_conf *mdev) __must_hold(local)
dev_info(DEV, "Creating new current UUID\n");
D_ASSERT(mdev->ldev->md.uuid[UI_BITMAP] == 0);
mdev->ldev->md.uuid[UI_BITMAP] = mdev->ldev->md.uuid[UI_CURRENT];
+ debug_drbd_uuid(mdev, UI_BITMAP);
get_random_bytes(&val, sizeof(u64));
_drbd_uuid_set(mdev, UI_CURRENT, val);
@@ -3478,6 +3671,8 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
drbd_uuid_move_history(mdev);
mdev->ldev->md.uuid[UI_HISTORY_START] = mdev->ldev->md.uuid[UI_BITMAP];
mdev->ldev->md.uuid[UI_BITMAP] = 0;
+ debug_drbd_uuid(mdev, UI_HISTORY_START);
+ debug_drbd_uuid(mdev, UI_BITMAP);
} else {
if (mdev->ldev->md.uuid[UI_BITMAP])
dev_warn(DEV, "bm UUID already set");
@@ -3485,6 +3680,7 @@ void drbd_uuid_set_bm(struct drbd_conf *mdev, u64 val) __must_hold(local)
mdev->ldev->md.uuid[UI_BITMAP] = val;
mdev->ldev->md.uuid[UI_BITMAP] &= ~((u64)1);
+ debug_drbd_uuid(mdev, UI_BITMAP);
}
drbd_md_mark_dirty(mdev);
}
@@ -3527,6 +3723,7 @@ int drbd_bmio_clear_n_write(struct drbd_conf *mdev)
{
int rv = -EIO;
+ drbd_resume_al(mdev);
if (get_ldev_if_state(mdev, D_ATTACHING)) {
drbd_bm_clear_all(mdev);
rv = drbd_bm_write(mdev);
@@ -3559,6 +3756,32 @@ static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
return 1;
}
+static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+{
+ D_ASSERT(mdev->state.disk == D_FAILED);
+ /* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
+ * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
+ * the protected members anymore, though, so in the after_state_ch work
+ * it will be safe to free them. */
+ drbd_force_state(mdev, NS(disk, D_DISKLESS));
+ /* We need to wait for return of references checked out while we still
+ * have been D_FAILED, though (drbd_md_sync, bitmap io). */
+ wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
+
+ clear_bit(GO_DISKLESS, &mdev->flags);
+ return 1;
+}
+
+void drbd_go_diskless(struct drbd_conf *mdev)
+{
+ D_ASSERT(mdev->state.disk == D_FAILED);
+ if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
+ drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
+ /* don't drbd_queue_work_front,
+ * we need to serialize with the after_state_ch work
+ * of the -> D_FAILED transition. */
+}
+
/**
* drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
* @mdev: DRBD device.
@@ -3655,8 +3878,11 @@ static void md_sync_timer_fn(unsigned long data)
static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
{
dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+#ifdef DEBUG
+ dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
+ mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
+#endif
drbd_md_sync(mdev);
-
return 1;
}
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 73131c5ae339..87925e97e613 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -33,10 +33,13 @@
#include <linux/blkpg.h>
#include <linux/cpumask.h>
#include "drbd_int.h"
+#include "drbd_req.h"
#include "drbd_wrappers.h"
#include <asm/unaligned.h>
#include <linux/drbd_tag_magic.h>
#include <linux/drbd_limits.h>
+#include <linux/compiler.h>
+#include <linux/kthread.h>
static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
@@ -169,6 +172,10 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
put_net_conf(mdev);
}
+ /* The helper may take some time.
+ * write out any unsynced meta data changes now */
+ drbd_md_sync(mdev);
+
dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
drbd_bcast_ev_helper(mdev, cmd);
@@ -202,12 +209,10 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
put_ldev(mdev);
} else {
dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
- return mdev->state.pdsk;
+ nps = mdev->state.pdsk;
+ goto out;
}
- if (fp == FP_STONITH)
- _drbd_request_state(mdev, NS(susp, 1), CS_WAIT_COMPLETE);
-
r = drbd_khelper(mdev, "fence-peer");
switch ((r>>8) & 0xff) {
@@ -252,9 +257,36 @@ enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
dev_info(DEV, "fence-peer helper returned %d (%s)\n",
(r>>8) & 0xff, ex_to_string);
+
+out:
+ if (mdev->state.susp_fen && nps >= D_UNKNOWN) {
+ /* The handler was not successful... unfreeze here, the
+ state engine can not unfreeze... */
+ _drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE);
+ }
+
return nps;
}
+static int _try_outdate_peer_async(void *data)
+{
+ struct drbd_conf *mdev = (struct drbd_conf *)data;
+ enum drbd_disk_state nps;
+
+ nps = drbd_try_outdate_peer(mdev);
+ drbd_request_state(mdev, NS(pdsk, nps));
+
+ return 0;
+}
+
+void drbd_try_outdate_peer_async(struct drbd_conf *mdev)
+{
+ struct task_struct *opa;
+
+ opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev));
+ if (IS_ERR(opa))
+ dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n");
+}
int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
{
@@ -394,6 +426,39 @@ int drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
return r;
}
+static struct drbd_conf *ensure_mdev(int minor, int create)
+{
+ struct drbd_conf *mdev;
+
+ if (minor >= minor_count)
+ return NULL;
+
+ mdev = minor_to_mdev(minor);
+
+ if (!mdev && create) {
+ struct gendisk *disk = NULL;
+ mdev = drbd_new_device(minor);
+
+ spin_lock_irq(&drbd_pp_lock);
+ if (minor_table[minor] == NULL) {
+ minor_table[minor] = mdev;
+ disk = mdev->vdisk;
+ mdev = NULL;
+ } /* else: we lost the race */
+ spin_unlock_irq(&drbd_pp_lock);
+
+ if (disk) /* we won the race above */
+ /* in case we ever add a drbd_delete_device(),
+ * don't forget the del_gendisk! */
+ add_disk(disk);
+ else /* we lost the race above */
+ drbd_free_mdev(mdev);
+
+ mdev = minor_to_mdev(minor);
+ }
+
+ return mdev;
+}
static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
@@ -494,6 +559,8 @@ char *ppsize(char *buf, unsigned long long size)
void drbd_suspend_io(struct drbd_conf *mdev)
{
set_bit(SUSPEND_IO, &mdev->flags);
+ if (is_susp(mdev->state))
+ return;
wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
}
@@ -713,9 +780,6 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu
blk_queue_segment_boundary(q, PAGE_SIZE-1);
blk_stack_limits(&q->limits, &b->limits, 0);
- if (b->merge_bvec_fn)
- dev_warn(DEV, "Backing device's merge_bvec_fn() = %p\n",
- b->merge_bvec_fn);
dev_info(DEV, "max_segment_size ( = BIO size ) = %u\n", queue_max_segment_size(q));
if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) {
@@ -729,14 +793,16 @@ void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_seg_s) __mu
/* serialize deconfig (worker exiting, doing cleanup)
* and reconfig (drbdsetup disk, drbdsetup net)
*
- * wait for a potentially exiting worker, then restart it,
- * or start a new one.
+ * Wait for a potentially exiting worker, then restart it,
+ * or start a new one. Flush any pending work, there may still be an
+ * after_state_change queued.
*/
static void drbd_reconfig_start(struct drbd_conf *mdev)
{
wait_event(mdev->state_wait, !test_and_set_bit(CONFIG_PENDING, &mdev->flags));
wait_event(mdev->state_wait, !test_bit(DEVICE_DYING, &mdev->flags));
drbd_thread_start(&mdev->worker);
+ drbd_flush_workqueue(mdev);
}
/* if still unconfigured, stops worker again.
@@ -756,6 +822,29 @@ static void drbd_reconfig_done(struct drbd_conf *mdev)
wake_up(&mdev->state_wait);
}
+/* Make sure IO is suspended before calling this function(). */
+static void drbd_suspend_al(struct drbd_conf *mdev)
+{
+ int s = 0;
+
+ if (lc_try_lock(mdev->act_log)) {
+ drbd_al_shrink(mdev);
+ lc_unlock(mdev->act_log);
+ } else {
+ dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n");
+ return;
+ }
+
+ spin_lock_irq(&mdev->req_lock);
+ if (mdev->state.conn < C_CONNECTED)
+ s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags);
+
+ spin_unlock_irq(&mdev->req_lock);
+
+ if (s)
+ dev_info(DEV, "Suspended AL updates\n");
+}
+
/* does always return 0;
* interesting return code is in reply->ret_code */
static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
@@ -769,6 +858,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
struct inode *inode, *inode2;
struct lru_cache *resync_lru = NULL;
union drbd_state ns, os;
+ unsigned int max_seg_s;
int rv;
int cp_discovered = 0;
int logical_block_size;
@@ -803,6 +893,15 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
goto fail;
}
+ if (get_net_conf(mdev)) {
+ int prot = mdev->net_conf->wire_protocol;
+ put_net_conf(mdev);
+ if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) {
+ retcode = ERR_STONITH_AND_PROT_A;
+ goto fail;
+ }
+ }
+
nbc->lo_file = filp_open(nbc->dc.backing_dev, O_RDWR, 0);
if (IS_ERR(nbc->lo_file)) {
dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
@@ -924,7 +1023,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
drbd_suspend_io(mdev);
/* also wait for the last barrier ack. */
- wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt));
+ wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state));
/* and for any other previously queued work */
drbd_flush_workqueue(mdev);
@@ -1021,7 +1120,8 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
else
clear_bit(CRASHED_PRIMARY, &mdev->flags);
- if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND)) {
+ if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
+ !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) {
set_bit(CRASHED_PRIMARY, &mdev->flags);
cp_discovered = 1;
}
@@ -1031,7 +1131,20 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
mdev->read_cnt = 0;
mdev->writ_cnt = 0;
- drbd_setup_queue_param(mdev, DRBD_MAX_SEGMENT_SIZE);
+ max_seg_s = DRBD_MAX_SEGMENT_SIZE;
+ if (mdev->state.conn == C_CONNECTED) {
+ /* We are Primary, Connected, and now attach a new local
+ * backing store. We must not increase the user visible maximum
+ * bio size on this device to something the peer may not be
+ * able to handle. */
+ if (mdev->agreed_pro_version < 94)
+ max_seg_s = queue_max_segment_size(mdev->rq_queue);
+ else if (mdev->agreed_pro_version == 94)
+ max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
+ /* else: drbd 8.3.9 and later, stay with default */
+ }
+
+ drbd_setup_queue_param(mdev, max_seg_s);
/* If I am currently not R_PRIMARY,
* but meta data primary indicator is set,
@@ -1079,6 +1192,9 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
drbd_al_to_on_disk_bm(mdev);
}
+ if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
+ drbd_suspend_al(mdev); /* IO is still suspended here... */
+
spin_lock_irq(&mdev->req_lock);
os = mdev->state;
ns.i = os.i;
@@ -1235,7 +1351,16 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
&& (new_conf->wire_protocol != DRBD_PROT_C)) {
retcode = ERR_NOT_PROTO_C;
goto fail;
- };
+ }
+
+ if (get_ldev(mdev)) {
+ enum drbd_fencing_p fp = mdev->ldev->dc.fencing;
+ put_ldev(mdev);
+ if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) {
+ retcode = ERR_STONITH_AND_PROT_A;
+ goto fail;
+ }
+ }
if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
retcode = ERR_DISCARD;
@@ -1350,6 +1475,7 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
}
}
+ drbd_flush_workqueue(mdev);
spin_lock_irq(&mdev->req_lock);
if (mdev->net_conf != NULL) {
retcode = ERR_NET_CONFIGURED;
@@ -1388,10 +1514,9 @@ static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
mdev->int_dig_out=int_dig_out;
mdev->int_dig_in=int_dig_in;
mdev->int_dig_vv=int_dig_vv;
+ retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL);
spin_unlock_irq(&mdev->req_lock);
- retcode = _drbd_request_state(mdev, NS(conn, C_UNCONNECTED), CS_VERBOSE);
-
kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
reply->ret_code = retcode;
drbd_reconfig_done(mdev);
@@ -1546,6 +1671,8 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
struct crypto_hash *csums_tfm = NULL;
struct syncer_conf sc;
cpumask_var_t new_cpu_mask;
+ int *rs_plan_s = NULL;
+ int fifo_size;
if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
retcode = ERR_NOMEM;
@@ -1557,6 +1684,12 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
sc.rate = DRBD_RATE_DEF;
sc.after = DRBD_AFTER_DEF;
sc.al_extents = DRBD_AL_EXTENTS_DEF;
+ sc.on_no_data = DRBD_ON_NO_DATA_DEF;
+ sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF;
+ sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF;
+ sc.c_fill_target = DRBD_C_FILL_TARGET_DEF;
+ sc.c_max_rate = DRBD_C_MAX_RATE_DEF;
+ sc.c_min_rate = DRBD_C_MIN_RATE_DEF;
} else
memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
@@ -1634,6 +1767,12 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
}
#undef AL_MAX
+ /* to avoid spurious errors when configuring minors before configuring
+ * the minors they depend on: if necessary, first create the minor we
+ * depend on */
+ if (sc.after >= 0)
+ ensure_mdev(sc.after, 1);
+
/* most sanity checks done, try to assign the new sync-after
* dependency. need to hold the global lock in there,
* to avoid a race in the dependency loop check. */
@@ -1641,6 +1780,16 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
if (retcode != NO_ERROR)
goto fail;
+ fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+ if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
+ rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
+ if (!rs_plan_s) {
+ dev_err(DEV, "kmalloc of fifo_buffer failed");
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ }
+
/* ok, assign the rest of it as well.
* lock against receive_SyncParam() */
spin_lock(&mdev->peer_seq_lock);
@@ -1657,6 +1806,15 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
mdev->verify_tfm = verify_tfm;
verify_tfm = NULL;
}
+
+ if (fifo_size != mdev->rs_plan_s.size) {
+ kfree(mdev->rs_plan_s.values);
+ mdev->rs_plan_s.values = rs_plan_s;
+ mdev->rs_plan_s.size = fifo_size;
+ mdev->rs_planed = 0;
+ rs_plan_s = NULL;
+ }
+
spin_unlock(&mdev->peer_seq_lock);
if (get_ldev(mdev)) {
@@ -1688,6 +1846,7 @@ static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *n
kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
fail:
+ kfree(rs_plan_s);
free_cpumask_var(new_cpu_mask);
crypto_free_hash(csums_tfm);
crypto_free_hash(verify_tfm);
@@ -1721,12 +1880,38 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
return 0;
}
+static int drbd_bmio_set_susp_al(struct drbd_conf *mdev)
+{
+ int rv;
+
+ rv = drbd_bmio_set_n_write(mdev);
+ drbd_suspend_al(mdev);
+ return rv;
+}
+
static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
+ int retcode;
- reply->ret_code = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
+ retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
+
+ if (retcode < SS_SUCCESS) {
+ if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) {
+ /* The peer will get a resync upon connect anyways. Just make that
+ into a full resync. */
+ retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
+ if (retcode >= SS_SUCCESS) {
+ /* open coded drbd_bitmap_io() */
+ if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
+ "set_n_write from invalidate_peer"))
+ retcode = ERR_IO_MD_DISK;
+ }
+ } else
+ retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
+ }
+ reply->ret_code = retcode;
return 0;
}
@@ -1765,7 +1950,21 @@ static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
struct drbd_nl_cfg_reply *reply)
{
- reply->ret_code = drbd_request_state(mdev, NS(susp, 0));
+ if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
+ drbd_uuid_new_current(mdev);
+ clear_bit(NEW_CUR_UUID, &mdev->flags);
+ drbd_md_sync(mdev);
+ }
+ drbd_suspend_io(mdev);
+ reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
+ if (reply->ret_code == SS_SUCCESS) {
+ if (mdev->state.conn < C_CONNECTED)
+ tl_clear(mdev);
+ if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED)
+ tl_restart(mdev, fail_frozen_disk_io);
+ }
+ drbd_resume_io(mdev);
+
return 0;
}
@@ -1941,40 +2140,6 @@ out:
return 0;
}
-static struct drbd_conf *ensure_mdev(struct drbd_nl_cfg_req *nlp)
-{
- struct drbd_conf *mdev;
-
- if (nlp->drbd_minor >= minor_count)
- return NULL;
-
- mdev = minor_to_mdev(nlp->drbd_minor);
-
- if (!mdev && (nlp->flags & DRBD_NL_CREATE_DEVICE)) {
- struct gendisk *disk = NULL;
- mdev = drbd_new_device(nlp->drbd_minor);
-
- spin_lock_irq(&drbd_pp_lock);
- if (minor_table[nlp->drbd_minor] == NULL) {
- minor_table[nlp->drbd_minor] = mdev;
- disk = mdev->vdisk;
- mdev = NULL;
- } /* else: we lost the race */
- spin_unlock_irq(&drbd_pp_lock);
-
- if (disk) /* we won the race above */
- /* in case we ever add a drbd_delete_device(),
- * don't forget the del_gendisk! */
- add_disk(disk);
- else /* we lost the race above */
- drbd_free_mdev(mdev);
-
- mdev = minor_to_mdev(nlp->drbd_minor);
- }
-
- return mdev;
-}
-
struct cn_handler_struct {
int (*function)(struct drbd_conf *,
struct drbd_nl_cfg_req *,
@@ -2035,7 +2200,8 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms
goto fail;
}
- mdev = ensure_mdev(nlp);
+ mdev = ensure_mdev(nlp->drbd_minor,
+ (nlp->flags & DRBD_NL_CREATE_DEVICE));
if (!mdev) {
retcode = ERR_MINOR_INVALID;
goto fail;
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index be3374b68460..ad325c5d0ce1 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -57,6 +57,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
unsigned long db, dt, dbdt, rt, rs_left;
unsigned int res;
int i, x, y;
+ int stalled = 0;
drbd_get_syncer_progress(mdev, &rs_left, &res);
@@ -90,18 +91,17 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
* db: blocks written from mark until now
* rt: remaining time
*/
- dt = (jiffies - mdev->rs_mark_time) / HZ;
-
- if (dt > 20) {
- /* if we made no update to rs_mark_time for too long,
- * we are stalled. show that. */
- seq_printf(seq, "stalled\n");
- return;
- }
+ /* Rolling marks. last_mark+1 may just now be modified. last_mark+2 is
+ * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at
+ * least DRBD_SYNC_MARK_STEP time before it will be modified. */
+ i = (mdev->rs_last_mark + 2) % DRBD_SYNC_MARKS;
+ dt = (jiffies - mdev->rs_mark_time[i]) / HZ;
+ if (dt > (DRBD_SYNC_MARK_STEP * DRBD_SYNC_MARKS))
+ stalled = 1;
if (!dt)
dt++;
- db = mdev->rs_mark_left - rs_left;
+ db = mdev->rs_mark_left[i] - rs_left;
rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
seq_printf(seq, "finish: %lu:%02lu:%02lu",
@@ -118,7 +118,7 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
/* mean speed since syncer started
* we do account for PausedSync periods */
dt = (jiffies - mdev->rs_start - mdev->rs_paused) / HZ;
- if (dt <= 0)
+ if (dt == 0)
dt = 1;
db = mdev->rs_total - rs_left;
dbdt = Bit2KB(db/dt);
@@ -128,7 +128,14 @@ static void drbd_syncer_progress(struct drbd_conf *mdev, struct seq_file *seq)
else
seq_printf(seq, " (%ld)", dbdt);
- seq_printf(seq, " K/sec\n");
+ if (mdev->state.conn == C_SYNC_TARGET) {
+ if (mdev->c_sync_rate > 1000)
+ seq_printf(seq, " want: %d,%03d",
+ mdev->c_sync_rate / 1000, mdev->c_sync_rate % 1000);
+ else
+ seq_printf(seq, " want: %d", mdev->c_sync_rate);
+ }
+ seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
}
static void resync_dump_detail(struct seq_file *seq, struct lc_element *e)
@@ -196,7 +203,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "%2d: cs:Unconfigured\n", i);
} else {
seq_printf(seq,
- "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c\n"
+ "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
" ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
"lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
i, sn,
@@ -206,11 +213,12 @@ static int drbd_seq_show(struct seq_file *seq, void *v)
drbd_disk_str(mdev->state.pdsk),
(mdev->net_conf == NULL ? ' ' :
(mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
- mdev->state.susp ? 's' : 'r',
+ is_susp(mdev->state) ? 's' : 'r',
mdev->state.aftr_isp ? 'a' : '-',
mdev->state.peer_isp ? 'p' : '-',
mdev->state.user_isp ? 'u' : '-',
mdev->congestion_reason ?: '-',
+ test_bit(AL_SUSPENDED, &mdev->flags) ? 's' : '-',
mdev->send_cnt/2,
mdev->recv_cnt/2,
mdev->writ_cnt/2,
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 081522d3c742..efd6169acf2f 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -241,7 +241,7 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
spin_unlock_irq(&mdev->req_lock);
list_for_each_entry_safe(e, t, &reclaimed, w.list)
- drbd_free_ee(mdev, e);
+ drbd_free_net_ee(mdev, e);
}
/**
@@ -298,9 +298,11 @@ static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool
* Is also used from inside an other spin_lock_irq(&mdev->req_lock);
* Either links the page chain back to the global pool,
* or returns all pages to the system. */
-static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
+static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
{
+ atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
int i;
+
if (drbd_pp_vacant > (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE)*minor_count)
i = page_chain_free(page);
else {
@@ -311,10 +313,10 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page)
drbd_pp_vacant += i;
spin_unlock(&drbd_pp_lock);
}
- atomic_sub(i, &mdev->pp_in_use);
- i = atomic_read(&mdev->pp_in_use);
+ i = atomic_sub_return(i, a);
if (i < 0)
- dev_warn(DEV, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
+ dev_warn(DEV, "ASSERTION FAILED: %s: %d < 0\n",
+ is_net ? "pp_in_use_by_net" : "pp_in_use", i);
wake_up(&drbd_pp_wait);
}
@@ -365,7 +367,6 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
e->size = data_size;
e->flags = 0;
e->sector = sector;
- e->sector = sector;
e->block_id = id;
return e;
@@ -375,9 +376,11 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
return NULL;
}
-void drbd_free_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
{
- drbd_pp_free(mdev, e->pages);
+ if (e->flags & EE_HAS_DIGEST)
+ kfree(e->digest);
+ drbd_pp_free(mdev, e->pages, is_net);
D_ASSERT(atomic_read(&e->pending_bios) == 0);
D_ASSERT(hlist_unhashed(&e->colision));
mempool_free(e, drbd_ee_mempool);
@@ -388,13 +391,14 @@ int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
LIST_HEAD(work_list);
struct drbd_epoch_entry *e, *t;
int count = 0;
+ int is_net = list == &mdev->net_ee;
spin_lock_irq(&mdev->req_lock);
list_splice_init(list, &work_list);
spin_unlock_irq(&mdev->req_lock);
list_for_each_entry_safe(e, t, &work_list, w.list) {
- drbd_free_ee(mdev, e);
+ drbd_free_some_ee(mdev, e, is_net);
count++;
}
return count;
@@ -423,7 +427,7 @@ static int drbd_process_done_ee(struct drbd_conf *mdev)
spin_unlock_irq(&mdev->req_lock);
list_for_each_entry_safe(e, t, &reclaimed, w.list)
- drbd_free_ee(mdev, e);
+ drbd_free_net_ee(mdev, e);
/* possible callbacks here:
* e_end_block, and e_end_resync_block, e_send_discard_ack.
@@ -719,14 +723,14 @@ out:
static int drbd_send_fp(struct drbd_conf *mdev,
struct socket *sock, enum drbd_packets cmd)
{
- struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
+ struct p_header80 *h = &mdev->data.sbuf.header.h80;
return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
}
static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
{
- struct p_header *h = (struct p_header *) &mdev->data.sbuf.header;
+ struct p_header80 *h = &mdev->data.rbuf.header.h80;
int rr;
rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
@@ -776,9 +780,6 @@ static int drbd_connect(struct drbd_conf *mdev)
D_ASSERT(!mdev->data.socket);
- if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags))
- dev_err(DEV, "CREATE_BARRIER flag was set in drbd_connect - now cleared!\n");
-
if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
return -2;
@@ -927,6 +928,11 @@ retry:
drbd_thread_start(&mdev->asender);
+ if (mdev->agreed_pro_version < 95 && get_ldev(mdev)) {
+ drbd_setup_queue_param(mdev, DRBD_MAX_SIZE_H80_PACKET);
+ put_ldev(mdev);
+ }
+
if (!drbd_send_protocol(mdev))
return -1;
drbd_send_sync_param(mdev, &mdev->sync_conf);
@@ -946,22 +952,28 @@ out_release_sockets:
return -1;
}
-static int drbd_recv_header(struct drbd_conf *mdev, struct p_header *h)
+static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
{
+ union p_header *h = &mdev->data.rbuf.header;
int r;
r = drbd_recv(mdev, h, sizeof(*h));
-
if (unlikely(r != sizeof(*h))) {
dev_err(DEV, "short read expecting header on sock: r=%d\n", r);
return FALSE;
- };
- h->command = be16_to_cpu(h->command);
- h->length = be16_to_cpu(h->length);
- if (unlikely(h->magic != BE_DRBD_MAGIC)) {
- dev_err(DEV, "magic?? on data m: 0x%lx c: %d l: %d\n",
- (long)be32_to_cpu(h->magic),
- h->command, h->length);
+ }
+
+ if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
+ *cmd = be16_to_cpu(h->h80.command);
+ *packet_size = be16_to_cpu(h->h80.length);
+ } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
+ *cmd = be16_to_cpu(h->h95.command);
+ *packet_size = be32_to_cpu(h->h95.length);
+ } else {
+ dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
+ be32_to_cpu(h->h80.magic),
+ be16_to_cpu(h->h80.command),
+ be16_to_cpu(h->h80.length));
return FALSE;
}
mdev->last_received = jiffies;
@@ -975,7 +987,7 @@ static enum finish_epoch drbd_flush_after_epoch(struct drbd_conf *mdev, struct d
if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
- NULL, BLKDEV_IFL_WAIT);
+ NULL);
if (rv) {
dev_err(DEV, "local disk flush failed with status %d\n", rv);
/* would rather check on EOPNOTSUPP, but that is not reliable.
@@ -1268,17 +1280,12 @@ int w_e_reissue(struct drbd_conf *mdev, struct drbd_work *w, int cancel) __relea
return 1;
}
-static int receive_Barrier(struct drbd_conf *mdev, struct p_header *h)
+static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
int rv, issue_flush;
- struct p_barrier *p = (struct p_barrier *)h;
+ struct p_barrier *p = &mdev->data.rbuf.barrier;
struct drbd_epoch *epoch;
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
-
- rv = drbd_recv(mdev, h->payload, h->length);
- ERR_IF(rv != h->length) return FALSE;
-
inc_unacked(mdev);
if (mdev->net_conf->wire_protocol != DRBD_PROT_C)
@@ -1457,7 +1464,7 @@ static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
data_size -= rr;
}
kunmap(page);
- drbd_pp_free(mdev, page);
+ drbd_pp_free(mdev, page, 0);
return rv;
}
@@ -1562,30 +1569,29 @@ static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_si
list_add(&e->w.list, &mdev->sync_ee);
spin_unlock_irq(&mdev->req_lock);
+ atomic_add(data_size >> 9, &mdev->rs_sect_ev);
if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
return TRUE;
+ /* drbd_submit_ee currently fails for one reason only:
+ * not being able to allocate enough bios.
+ * Is dropping the connection going to help? */
+ spin_lock_irq(&mdev->req_lock);
+ list_del(&e->w.list);
+ spin_unlock_irq(&mdev->req_lock);
+
drbd_free_ee(mdev, e);
fail:
put_ldev(mdev);
return FALSE;
}
-static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
+static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
struct drbd_request *req;
sector_t sector;
- unsigned int header_size, data_size;
int ok;
- struct p_data *p = (struct p_data *)h;
-
- header_size = sizeof(*p) - sizeof(*h);
- data_size = h->length - header_size;
-
- ERR_IF(data_size == 0) return FALSE;
-
- if (drbd_recv(mdev, h->payload, header_size) != header_size)
- return FALSE;
+ struct p_data *p = &mdev->data.rbuf.data;
sector = be64_to_cpu(p->sector);
@@ -1611,20 +1617,11 @@ static int receive_DataReply(struct drbd_conf *mdev, struct p_header *h)
return ok;
}
-static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
+static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
sector_t sector;
- unsigned int header_size, data_size;
int ok;
- struct p_data *p = (struct p_data *)h;
-
- header_size = sizeof(*p) - sizeof(*h);
- data_size = h->length - header_size;
-
- ERR_IF(data_size == 0) return FALSE;
-
- if (drbd_recv(mdev, h->payload, header_size) != header_size)
- return FALSE;
+ struct p_data *p = &mdev->data.rbuf.data;
sector = be64_to_cpu(p->sector);
D_ASSERT(p->block_id == ID_SYNCER);
@@ -1640,9 +1637,11 @@ static int receive_RSDataReply(struct drbd_conf *mdev, struct p_header *h)
ok = drbd_drain_block(mdev, data_size);
- drbd_send_ack_dp(mdev, P_NEG_ACK, p);
+ drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
}
+ atomic_add(data_size >> 9, &mdev->rs_sect_in);
+
return ok;
}
@@ -1765,24 +1764,27 @@ static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
return ret;
}
+static unsigned long write_flags_to_bio(struct drbd_conf *mdev, u32 dpf)
+{
+ if (mdev->agreed_pro_version >= 95)
+ return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
+ (dpf & DP_UNPLUG ? REQ_UNPLUG : 0) |
+ (dpf & DP_FUA ? REQ_FUA : 0) |
+ (dpf & DP_FLUSH ? REQ_FUA : 0) |
+ (dpf & DP_DISCARD ? REQ_DISCARD : 0);
+ else
+ return dpf & DP_RW_SYNC ? (REQ_SYNC | REQ_UNPLUG) : 0;
+}
+
/* mirrored write */
-static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
+static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
sector_t sector;
struct drbd_epoch_entry *e;
- struct p_data *p = (struct p_data *)h;
- int header_size, data_size;
+ struct p_data *p = &mdev->data.rbuf.data;
int rw = WRITE;
u32 dp_flags;
- header_size = sizeof(*p) - sizeof(*h);
- data_size = h->length - header_size;
-
- ERR_IF(data_size == 0) return FALSE;
-
- if (drbd_recv(mdev, h->payload, header_size) != header_size)
- return FALSE;
-
if (!get_ldev(mdev)) {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Can not write mirrored data block "
@@ -1792,7 +1794,7 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
mdev->peer_seq++;
spin_unlock(&mdev->peer_seq_lock);
- drbd_send_ack_dp(mdev, P_NEG_ACK, p);
+ drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
atomic_inc(&mdev->current_epoch->epoch_size);
return drbd_drain_block(mdev, data_size);
}
@@ -1839,12 +1841,8 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
spin_unlock(&mdev->epoch_lock);
dp_flags = be32_to_cpu(p->dp_flags);
- if (dp_flags & DP_HARDBARRIER) {
- dev_err(DEV, "ASSERT FAILED would have submitted barrier request\n");
- /* rw |= REQ_HARDBARRIER; */
- }
- if (dp_flags & DP_RW_SYNC)
- rw |= REQ_SYNC | REQ_UNPLUG;
+ rw |= write_flags_to_bio(mdev, dp_flags);
+
if (dp_flags & DP_MAY_SET_IN_SYNC)
e->flags |= EE_MAY_SET_IN_SYNC;
@@ -2007,6 +2005,16 @@ static int receive_Data(struct drbd_conf *mdev, struct p_header *h)
if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
return TRUE;
+ /* drbd_submit_ee currently fails for one reason only:
+ * not being able to allocate enough bios.
+ * Is dropping the connection going to help? */
+ spin_lock_irq(&mdev->req_lock);
+ list_del(&e->w.list);
+ hlist_del_init(&e->colision);
+ spin_unlock_irq(&mdev->req_lock);
+ if (e->flags & EE_CALL_AL_COMPLETE_IO)
+ drbd_al_complete_io(mdev, e->sector);
+
out_interrupted:
/* yes, the epoch_size now is imbalanced.
* but we drop the connection anyways, so we don't have a chance to
@@ -2016,20 +2024,64 @@ out_interrupted:
return FALSE;
}
-static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
+/* We may throttle resync, if the lower device seems to be busy,
+ * and current sync rate is above c_min_rate.
+ *
+ * To decide whether or not the lower device is busy, we use a scheme similar
+ * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
+ * (more than 64 sectors) of activity we cannot account for with our own resync
+ * activity, it obviously is "busy".
+ *
+ * The current sync rate used here uses only the most recent two step marks,
+ * to have a short time average so we can react faster.
+ */
+int drbd_rs_should_slow_down(struct drbd_conf *mdev)
+{
+ struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
+ unsigned long db, dt, dbdt;
+ int curr_events;
+ int throttle = 0;
+
+ /* feature disabled? */
+ if (mdev->sync_conf.c_min_rate == 0)
+ return 0;
+
+ curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
+ (int)part_stat_read(&disk->part0, sectors[1]) -
+ atomic_read(&mdev->rs_sect_ev);
+ if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
+ unsigned long rs_left;
+ int i;
+
+ mdev->rs_last_events = curr_events;
+
+ /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
+ * approx. */
+ i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS;
+ rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
+
+ dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
+ if (!dt)
+ dt++;
+ db = mdev->rs_mark_left[i] - rs_left;
+ dbdt = Bit2KB(db/dt);
+
+ if (dbdt > mdev->sync_conf.c_min_rate)
+ throttle = 1;
+ }
+ return throttle;
+}
+
+
+static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
{
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
struct drbd_epoch_entry *e;
struct digest_info *di = NULL;
- int size, digest_size;
+ int size, verb;
unsigned int fault_type;
- struct p_block_req *p =
- (struct p_block_req *)h;
- const int brps = sizeof(*p)-sizeof(*h);
-
- if (drbd_recv(mdev, h->payload, brps) != brps)
- return FALSE;
+ struct p_block_req *p = &mdev->data.rbuf.block_req;
sector = be64_to_cpu(p->sector);
size = be32_to_cpu(p->blksize);
@@ -2046,12 +2098,31 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
}
if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
- if (__ratelimit(&drbd_ratelimit_state))
+ verb = 1;
+ switch (cmd) {
+ case P_DATA_REQUEST:
+ drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
+ break;
+ case P_RS_DATA_REQUEST:
+ case P_CSUM_RS_REQUEST:
+ case P_OV_REQUEST:
+ drbd_send_ack_rp(mdev, P_NEG_RS_DREPLY , p);
+ break;
+ case P_OV_REPLY:
+ verb = 0;
+ dec_rs_pending(mdev);
+ drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
+ break;
+ default:
+ dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
+ cmdname(cmd));
+ }
+ if (verb && __ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Can not satisfy peer's read request, "
"no local data.\n");
- drbd_send_ack_rp(mdev, h->command == P_DATA_REQUEST ? P_NEG_DREPLY :
- P_NEG_RS_DREPLY , p);
- return drbd_drain_block(mdev, h->length - brps);
+
+ /* drain possibly payload */
+ return drbd_drain_block(mdev, digest_size);
}
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
@@ -2063,31 +2134,21 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
return FALSE;
}
- switch (h->command) {
+ switch (cmd) {
case P_DATA_REQUEST:
e->w.cb = w_e_end_data_req;
fault_type = DRBD_FAULT_DT_RD;
- break;
+ /* application IO, don't drbd_rs_begin_io */
+ goto submit;
+
case P_RS_DATA_REQUEST:
e->w.cb = w_e_end_rsdata_req;
fault_type = DRBD_FAULT_RS_RD;
- /* Eventually this should become asynchronously. Currently it
- * blocks the whole receiver just to delay the reading of a
- * resync data block.
- * the drbd_work_queue mechanism is made for this...
- */
- if (!drbd_rs_begin_io(mdev, sector)) {
- /* we have been interrupted,
- * probably connection lost! */
- D_ASSERT(signal_pending(current));
- goto out_free_e;
- }
break;
case P_OV_REPLY:
case P_CSUM_RS_REQUEST:
fault_type = DRBD_FAULT_RS_RD;
- digest_size = h->length - brps ;
di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
if (!di)
goto out_free_e;
@@ -2095,31 +2156,25 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
di->digest_size = digest_size;
di->digest = (((char *)di)+sizeof(struct digest_info));
+ e->digest = di;
+ e->flags |= EE_HAS_DIGEST;
+
if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
goto out_free_e;
- e->block_id = (u64)(unsigned long)di;
- if (h->command == P_CSUM_RS_REQUEST) {
+ if (cmd == P_CSUM_RS_REQUEST) {
D_ASSERT(mdev->agreed_pro_version >= 89);
e->w.cb = w_e_end_csum_rs_req;
- } else if (h->command == P_OV_REPLY) {
+ } else if (cmd == P_OV_REPLY) {
e->w.cb = w_e_end_ov_reply;
dec_rs_pending(mdev);
- break;
- }
-
- if (!drbd_rs_begin_io(mdev, sector)) {
- /* we have been interrupted, probably connection lost! */
- D_ASSERT(signal_pending(current));
- goto out_free_e;
+ /* drbd_rs_begin_io done when we sent this request,
+ * but accounting still needs to be done. */
+ goto submit_for_resync;
}
break;
case P_OV_REQUEST:
- if (mdev->state.conn >= C_CONNECTED &&
- mdev->state.conn != C_VERIFY_T)
- dev_warn(DEV, "ASSERT FAILED: got P_OV_REQUEST while being %s\n",
- drbd_conn_str(mdev->state.conn));
if (mdev->ov_start_sector == ~(sector_t)0 &&
mdev->agreed_pro_version >= 90) {
mdev->ov_start_sector = sector;
@@ -2130,37 +2185,63 @@ static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
}
e->w.cb = w_e_end_ov_req;
fault_type = DRBD_FAULT_RS_RD;
- /* Eventually this should become asynchronous. Currently it
- * blocks the whole receiver just to delay the reading of a
- * resync data block.
- * the drbd_work_queue mechanism is made for this...
- */
- if (!drbd_rs_begin_io(mdev, sector)) {
- /* we have been interrupted,
- * probably connection lost! */
- D_ASSERT(signal_pending(current));
- goto out_free_e;
- }
break;
-
default:
dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
- cmdname(h->command));
+ cmdname(cmd));
fault_type = DRBD_FAULT_MAX;
+ goto out_free_e;
}
- spin_lock_irq(&mdev->req_lock);
- list_add(&e->w.list, &mdev->read_ee);
- spin_unlock_irq(&mdev->req_lock);
+ /* Throttle, drbd_rs_begin_io and submit should become asynchronous
+ * wrt the receiver, but it is not as straightforward as it may seem.
+ * Various places in the resync start and stop logic assume resync
+ * requests are processed in order, requeuing this on the worker thread
+ * introduces a bunch of new code for synchronization between threads.
+ *
+ * Unlimited throttling before drbd_rs_begin_io may stall the resync
+ * "forever", throttling after drbd_rs_begin_io will lock that extent
+ * for application writes for the same time. For now, just throttle
+ * here, where the rest of the code expects the receiver to sleep for
+ * a while, anyways.
+ */
+
+ /* Throttle before drbd_rs_begin_io, as that locks out application IO;
+ * this defers syncer requests for some time, before letting at least
+ * on request through. The resync controller on the receiving side
+ * will adapt to the incoming rate accordingly.
+ *
+ * We cannot throttle here if remote is Primary/SyncTarget:
+ * we would also throttle its application reads.
+ * In that case, throttling is done on the SyncTarget only.
+ */
+ if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev))
+ msleep(100);
+ if (drbd_rs_begin_io(mdev, e->sector))
+ goto out_free_e;
+submit_for_resync:
+ atomic_add(size >> 9, &mdev->rs_sect_ev);
+
+submit:
inc_unacked(mdev);
+ spin_lock_irq(&mdev->req_lock);
+ list_add_tail(&e->w.list, &mdev->read_ee);
+ spin_unlock_irq(&mdev->req_lock);
if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
return TRUE;
+ /* drbd_submit_ee currently fails for one reason only:
+ * not being able to allocate enough bios.
+ * Is dropping the connection going to help? */
+ spin_lock_irq(&mdev->req_lock);
+ list_del(&e->w.list);
+ spin_unlock_irq(&mdev->req_lock);
+ /* no drbd_rs_complete_io(), we are dropping the connection anyways */
+
out_free_e:
- kfree(di);
put_ldev(mdev);
drbd_free_ee(mdev, e);
return FALSE;
@@ -2699,20 +2780,13 @@ static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
return 1;
}
-static int receive_protocol(struct drbd_conf *mdev, struct p_header *h)
+static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_protocol *p = (struct p_protocol *)h;
- int header_size, data_size;
+ struct p_protocol *p = &mdev->data.rbuf.protocol;
int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
int p_want_lose, p_two_primaries, cf;
char p_integrity_alg[SHARED_SECRET_MAX] = "";
- header_size = sizeof(*p) - sizeof(*h);
- data_size = h->length - header_size;
-
- if (drbd_recv(mdev, h->payload, header_size) != header_size)
- return FALSE;
-
p_proto = be32_to_cpu(p->protocol);
p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
p_after_sb_1p = be32_to_cpu(p->after_sb_1p);
@@ -2805,39 +2879,46 @@ struct crypto_hash *drbd_crypto_alloc_digest_safe(const struct drbd_conf *mdev,
return tfm;
}
-static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
+static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
{
int ok = TRUE;
- struct p_rs_param_89 *p = (struct p_rs_param_89 *)h;
+ struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
unsigned int header_size, data_size, exp_max_sz;
struct crypto_hash *verify_tfm = NULL;
struct crypto_hash *csums_tfm = NULL;
const int apv = mdev->agreed_pro_version;
+ int *rs_plan_s = NULL;
+ int fifo_size = 0;
exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
: apv == 88 ? sizeof(struct p_rs_param)
+ SHARED_SECRET_MAX
- : /* 89 */ sizeof(struct p_rs_param_89);
+ : apv <= 94 ? sizeof(struct p_rs_param_89)
+ : /* apv >= 95 */ sizeof(struct p_rs_param_95);
- if (h->length > exp_max_sz) {
+ if (packet_size > exp_max_sz) {
dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
- h->length, exp_max_sz);
+ packet_size, exp_max_sz);
return FALSE;
}
if (apv <= 88) {
- header_size = sizeof(struct p_rs_param) - sizeof(*h);
- data_size = h->length - header_size;
- } else /* apv >= 89 */ {
- header_size = sizeof(struct p_rs_param_89) - sizeof(*h);
- data_size = h->length - header_size;
+ header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
+ data_size = packet_size - header_size;
+ } else if (apv <= 94) {
+ header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
+ data_size = packet_size - header_size;
+ D_ASSERT(data_size == 0);
+ } else {
+ header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
+ data_size = packet_size - header_size;
D_ASSERT(data_size == 0);
}
/* initialize verify_alg and csums_alg */
memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
- if (drbd_recv(mdev, h->payload, header_size) != header_size)
+ if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
return FALSE;
mdev->sync_conf.rate = be32_to_cpu(p->rate);
@@ -2896,6 +2977,22 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
}
}
+ if (apv > 94) {
+ mdev->sync_conf.rate = be32_to_cpu(p->rate);
+ mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
+ mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
+ mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
+ mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
+
+ fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+ if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
+ rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
+ if (!rs_plan_s) {
+ dev_err(DEV, "kmalloc of fifo_buffer failed");
+ goto disconnect;
+ }
+ }
+ }
spin_lock(&mdev->peer_seq_lock);
/* lock against drbd_nl_syncer_conf() */
@@ -2913,6 +3010,12 @@ static int receive_SyncParam(struct drbd_conf *mdev, struct p_header *h)
mdev->csums_tfm = csums_tfm;
dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
}
+ if (fifo_size != mdev->rs_plan_s.size) {
+ kfree(mdev->rs_plan_s.values);
+ mdev->rs_plan_s.values = rs_plan_s;
+ mdev->rs_plan_s.size = fifo_size;
+ mdev->rs_planed = 0;
+ }
spin_unlock(&mdev->peer_seq_lock);
}
@@ -2946,19 +3049,15 @@ static void warn_if_differ_considerably(struct drbd_conf *mdev,
(unsigned long long)a, (unsigned long long)b);
}
-static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
+static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_sizes *p = (struct p_sizes *)h;
+ struct p_sizes *p = &mdev->data.rbuf.sizes;
enum determine_dev_size dd = unchanged;
unsigned int max_seg_s;
sector_t p_size, p_usize, my_usize;
int ldsc = 0; /* local disk size changed */
enum dds_flags ddsf;
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
- if (drbd_recv(mdev, h->payload, h->length) != h->length)
- return FALSE;
-
p_size = be64_to_cpu(p->d_size);
p_usize = be64_to_cpu(p->u_size);
@@ -2972,7 +3071,6 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
* we still need to figure out whether we accept that. */
mdev->p_size = p_size;
-#define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? l : min(l, r))
if (get_ldev(mdev)) {
warn_if_differ_considerably(mdev, "lower level device sizes",
p_size, drbd_get_max_capacity(mdev->ldev));
@@ -3029,6 +3127,8 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
if (mdev->agreed_pro_version < 94)
max_seg_s = be32_to_cpu(p->max_segment_size);
+ else if (mdev->agreed_pro_version == 94)
+ max_seg_s = DRBD_MAX_SIZE_H80_PACKET;
else /* drbd 8.3.8 onwards */
max_seg_s = DRBD_MAX_SEGMENT_SIZE;
@@ -3062,16 +3162,12 @@ static int receive_sizes(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
+static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_uuids *p = (struct p_uuids *)h;
+ struct p_uuids *p = &mdev->data.rbuf.uuids;
u64 *p_uuid;
int i;
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
- if (drbd_recv(mdev, h->payload, h->length) != h->length)
- return FALSE;
-
p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
@@ -3107,6 +3203,11 @@ static int receive_uuids(struct drbd_conf *mdev, struct p_header *h)
drbd_md_sync(mdev);
}
put_ldev(mdev);
+ } else if (mdev->state.disk < D_INCONSISTENT &&
+ mdev->state.role == R_PRIMARY) {
+ /* I am a diskless primary, the peer just created a new current UUID
+ for me. */
+ drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
}
/* Before we test for the disk state, we should wait until an eventually
@@ -3150,16 +3251,12 @@ static union drbd_state convert_state(union drbd_state ps)
return ms;
}
-static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
+static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_req_state *p = (struct p_req_state *)h;
+ struct p_req_state *p = &mdev->data.rbuf.req_state;
union drbd_state mask, val;
int rv;
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
- if (drbd_recv(mdev, h->payload, h->length) != h->length)
- return FALSE;
-
mask.i = be32_to_cpu(p->mask);
val.i = be32_to_cpu(p->val);
@@ -3180,20 +3277,14 @@ static int receive_req_state(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-static int receive_state(struct drbd_conf *mdev, struct p_header *h)
+static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_state *p = (struct p_state *)h;
- enum drbd_conns nconn, oconn;
- union drbd_state ns, peer_state;
+ struct p_state *p = &mdev->data.rbuf.state;
+ union drbd_state os, ns, peer_state;
enum drbd_disk_state real_peer_disk;
+ enum chg_state_flags cs_flags;
int rv;
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h)))
- return FALSE;
-
- if (drbd_recv(mdev, h->payload, h->length) != h->length)
- return FALSE;
-
peer_state.i = be32_to_cpu(p->state);
real_peer_disk = peer_state.disk;
@@ -3204,38 +3295,72 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
spin_lock_irq(&mdev->req_lock);
retry:
- oconn = nconn = mdev->state.conn;
+ os = ns = mdev->state;
spin_unlock_irq(&mdev->req_lock);
- if (nconn == C_WF_REPORT_PARAMS)
- nconn = C_CONNECTED;
+ /* peer says his disk is uptodate, while we think it is inconsistent,
+ * and this happens while we think we have a sync going on. */
+ if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
+ os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
+ /* If we are (becoming) SyncSource, but peer is still in sync
+ * preparation, ignore its uptodate-ness to avoid flapping, it
+ * will change to inconsistent once the peer reaches active
+ * syncing states.
+ * It may have changed syncer-paused flags, however, so we
+ * cannot ignore this completely. */
+ if (peer_state.conn > C_CONNECTED &&
+ peer_state.conn < C_SYNC_SOURCE)
+ real_peer_disk = D_INCONSISTENT;
+
+ /* if peer_state changes to connected at the same time,
+ * it explicitly notifies us that it finished resync.
+ * Maybe we should finish it up, too? */
+ else if (os.conn >= C_SYNC_SOURCE &&
+ peer_state.conn == C_CONNECTED) {
+ if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
+ drbd_resync_finished(mdev);
+ return TRUE;
+ }
+ }
+
+ /* peer says his disk is inconsistent, while we think it is uptodate,
+ * and this happens while the peer still thinks we have a sync going on,
+ * but we think we are already done with the sync.
+ * We ignore this to avoid flapping pdsk.
+ * This should not happen, if the peer is a recent version of drbd. */
+ if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
+ os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
+ real_peer_disk = D_UP_TO_DATE;
+
+ if (ns.conn == C_WF_REPORT_PARAMS)
+ ns.conn = C_CONNECTED;
if (mdev->p_uuid && peer_state.disk >= D_NEGOTIATING &&
get_ldev_if_state(mdev, D_NEGOTIATING)) {
int cr; /* consider resync */
/* if we established a new connection */
- cr = (oconn < C_CONNECTED);
+ cr = (os.conn < C_CONNECTED);
/* if we had an established connection
* and one of the nodes newly attaches a disk */
- cr |= (oconn == C_CONNECTED &&
+ cr |= (os.conn == C_CONNECTED &&
(peer_state.disk == D_NEGOTIATING ||
- mdev->state.disk == D_NEGOTIATING));
+ os.disk == D_NEGOTIATING));
/* if we have both been inconsistent, and the peer has been
* forced to be UpToDate with --overwrite-data */
cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
/* if we had been plain connected, and the admin requested to
* start a sync by "invalidate" or "invalidate-remote" */
- cr |= (oconn == C_CONNECTED &&
+ cr |= (os.conn == C_CONNECTED &&
(peer_state.conn >= C_STARTING_SYNC_S &&
peer_state.conn <= C_WF_BITMAP_T));
if (cr)
- nconn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
+ ns.conn = drbd_sync_handshake(mdev, peer_state.role, real_peer_disk);
put_ldev(mdev);
- if (nconn == C_MASK) {
- nconn = C_CONNECTED;
+ if (ns.conn == C_MASK) {
+ ns.conn = C_CONNECTED;
if (mdev->state.disk == D_NEGOTIATING) {
drbd_force_state(mdev, NS(disk, D_DISKLESS));
} else if (peer_state.disk == D_NEGOTIATING) {
@@ -3245,7 +3370,7 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
} else {
if (test_and_clear_bit(CONN_DRY_RUN, &mdev->flags))
return FALSE;
- D_ASSERT(oconn == C_WF_REPORT_PARAMS);
+ D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
return FALSE;
}
@@ -3253,18 +3378,28 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
}
spin_lock_irq(&mdev->req_lock);
- if (mdev->state.conn != oconn)
+ if (mdev->state.i != os.i)
goto retry;
clear_bit(CONSIDER_RESYNC, &mdev->flags);
- ns.i = mdev->state.i;
- ns.conn = nconn;
ns.peer = peer_state.role;
ns.pdsk = real_peer_disk;
ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
- if ((nconn == C_CONNECTED || nconn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
+ if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
ns.disk = mdev->new_state_tmp.disk;
-
- rv = _drbd_set_state(mdev, ns, CS_VERBOSE | CS_HARD, NULL);
+ cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
+ if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
+ test_bit(NEW_CUR_UUID, &mdev->flags)) {
+ /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
+ for temporal network outages! */
+ spin_unlock_irq(&mdev->req_lock);
+ dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
+ tl_clear(mdev);
+ drbd_uuid_new_current(mdev);
+ clear_bit(NEW_CUR_UUID, &mdev->flags);
+ drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
+ return FALSE;
+ }
+ rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
ns = mdev->state;
spin_unlock_irq(&mdev->req_lock);
@@ -3273,8 +3408,8 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
return FALSE;
}
- if (oconn > C_WF_REPORT_PARAMS) {
- if (nconn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
+ if (os.conn > C_WF_REPORT_PARAMS) {
+ if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
peer_state.disk != D_NEGOTIATING ) {
/* we want resync, peer has not yet decided to sync... */
/* Nowadays only used when forcing a node into primary role and
@@ -3291,9 +3426,9 @@ static int receive_state(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
+static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
- struct p_rs_uuid *p = (struct p_rs_uuid *)h;
+ struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
wait_event(mdev->misc_wait,
mdev->state.conn == C_WF_SYNC_UUID ||
@@ -3302,10 +3437,6 @@ static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
/* D_ASSERT( mdev->state.conn == C_WF_SYNC_UUID ); */
- ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE;
- if (drbd_recv(mdev, h->payload, h->length) != h->length)
- return FALSE;
-
/* Here the _drbd_uuid_ functions are right, current should
_not_ be rotated into the history */
if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
@@ -3324,14 +3455,14 @@ static int receive_sync_uuid(struct drbd_conf *mdev, struct p_header *h)
enum receive_bitmap_ret { OK, DONE, FAILED };
static enum receive_bitmap_ret
-receive_bitmap_plain(struct drbd_conf *mdev, struct p_header *h,
- unsigned long *buffer, struct bm_xfer_ctx *c)
+receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
+ unsigned long *buffer, struct bm_xfer_ctx *c)
{
unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
unsigned want = num_words * sizeof(long);
- if (want != h->length) {
- dev_err(DEV, "%s:want (%u) != h->length (%u)\n", __func__, want, h->length);
+ if (want != data_size) {
+ dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
return FAILED;
}
if (want == 0)
@@ -3360,7 +3491,7 @@ recv_bm_rle_bits(struct drbd_conf *mdev,
u64 tmp;
unsigned long s = c->bit_offset;
unsigned long e;
- int len = p->head.length - (sizeof(*p) - sizeof(p->head));
+ int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
int toggle = DCBP_get_start(p);
int have;
int bits;
@@ -3429,7 +3560,7 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev,
const char *direction, struct bm_xfer_ctx *c)
{
/* what would it take to transfer it "plaintext" */
- unsigned plain = sizeof(struct p_header) *
+ unsigned plain = sizeof(struct p_header80) *
((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
+ c->bm_words * sizeof(long);
unsigned total = c->bytes[0] + c->bytes[1];
@@ -3467,12 +3598,13 @@ void INFO_bm_xfer_stats(struct drbd_conf *mdev,
in order to be agnostic to the 32 vs 64 bits issue.
returns 0 on failure, 1 if we successfully received it. */
-static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
+static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
struct bm_xfer_ctx c;
void *buffer;
enum receive_bitmap_ret ret;
int ok = FALSE;
+ struct p_header80 *h = &mdev->data.rbuf.header.h80;
wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
@@ -3492,39 +3624,39 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
};
do {
- if (h->command == P_BITMAP) {
- ret = receive_bitmap_plain(mdev, h, buffer, &c);
- } else if (h->command == P_COMPRESSED_BITMAP) {
+ if (cmd == P_BITMAP) {
+ ret = receive_bitmap_plain(mdev, data_size, buffer, &c);
+ } else if (cmd == P_COMPRESSED_BITMAP) {
/* MAYBE: sanity check that we speak proto >= 90,
* and the feature is enabled! */
struct p_compressed_bm *p;
- if (h->length > BM_PACKET_PAYLOAD_BYTES) {
+ if (data_size > BM_PACKET_PAYLOAD_BYTES) {
dev_err(DEV, "ReportCBitmap packet too large\n");
goto out;
}
/* use the page buff */
p = buffer;
memcpy(p, h, sizeof(*h));
- if (drbd_recv(mdev, p->head.payload, h->length) != h->length)
+ if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
goto out;
- if (p->head.length <= (sizeof(*p) - sizeof(p->head))) {
- dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", p->head.length);
+ if (data_size <= (sizeof(*p) - sizeof(p->head))) {
+ dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
return FAILED;
}
ret = decode_bitmap_c(mdev, p, &c);
} else {
- dev_warn(DEV, "receive_bitmap: h->command neither ReportBitMap nor ReportCBitMap (is 0x%x)", h->command);
+ dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
goto out;
}
- c.packets[h->command == P_BITMAP]++;
- c.bytes[h->command == P_BITMAP] += sizeof(struct p_header) + h->length;
+ c.packets[cmd == P_BITMAP]++;
+ c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
if (ret != OK)
break;
- if (!drbd_recv_header(mdev, h))
+ if (!drbd_recv_header(mdev, &cmd, &data_size))
goto out;
} while (ret == OK);
if (ret == FAILED)
@@ -3555,17 +3687,16 @@ static int receive_bitmap(struct drbd_conf *mdev, struct p_header *h)
return ok;
}
-static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent)
+static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
/* TODO zero copy sink :) */
static char sink[128];
int size, want, r;
- if (!silent)
- dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
- h->command, h->length);
+ dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
+ cmd, data_size);
- size = h->length;
+ size = data_size;
while (size > 0) {
want = min_t(int, size, sizeof(sink));
r = drbd_recv(mdev, sink, want);
@@ -3575,17 +3706,7 @@ static int receive_skip_(struct drbd_conf *mdev, struct p_header *h, int silent)
return size == 0;
}
-static int receive_skip(struct drbd_conf *mdev, struct p_header *h)
-{
- return receive_skip_(mdev, h, 0);
-}
-
-static int receive_skip_silent(struct drbd_conf *mdev, struct p_header *h)
-{
- return receive_skip_(mdev, h, 1);
-}
-
-static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
+static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
{
if (mdev->state.disk >= D_INCONSISTENT)
drbd_kick_lo(mdev);
@@ -3597,108 +3718,94 @@ static int receive_UnplugRemote(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, struct p_header *);
-
-static drbd_cmd_handler_f drbd_default_handler[] = {
- [P_DATA] = receive_Data,
- [P_DATA_REPLY] = receive_DataReply,
- [P_RS_DATA_REPLY] = receive_RSDataReply,
- [P_BARRIER] = receive_Barrier,
- [P_BITMAP] = receive_bitmap,
- [P_COMPRESSED_BITMAP] = receive_bitmap,
- [P_UNPLUG_REMOTE] = receive_UnplugRemote,
- [P_DATA_REQUEST] = receive_DataRequest,
- [P_RS_DATA_REQUEST] = receive_DataRequest,
- [P_SYNC_PARAM] = receive_SyncParam,
- [P_SYNC_PARAM89] = receive_SyncParam,
- [P_PROTOCOL] = receive_protocol,
- [P_UUIDS] = receive_uuids,
- [P_SIZES] = receive_sizes,
- [P_STATE] = receive_state,
- [P_STATE_CHG_REQ] = receive_req_state,
- [P_SYNC_UUID] = receive_sync_uuid,
- [P_OV_REQUEST] = receive_DataRequest,
- [P_OV_REPLY] = receive_DataRequest,
- [P_CSUM_RS_REQUEST] = receive_DataRequest,
- [P_DELAY_PROBE] = receive_skip_silent,
+typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
+
+struct data_cmd {
+ int expect_payload;
+ size_t pkt_size;
+ drbd_cmd_handler_f function;
+};
+
+static struct data_cmd drbd_cmd_handler[] = {
+ [P_DATA] = { 1, sizeof(struct p_data), receive_Data },
+ [P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
+ [P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
+ [P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
+ [P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
+ [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
+ [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote },
+ [P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+ [P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+ [P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam },
+ [P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam },
+ [P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
+ [P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
+ [P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
+ [P_STATE] = { 0, sizeof(struct p_state), receive_state },
+ [P_STATE_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_state },
+ [P_SYNC_UUID] = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
+ [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+ [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest },
+ [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
+ [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
/* anything missing from this table is in
* the asender_tbl, see get_asender_cmd */
- [P_MAX_CMD] = NULL,
+ [P_MAX_CMD] = { 0, 0, NULL },
};
-static drbd_cmd_handler_f *drbd_cmd_handler = drbd_default_handler;
-static drbd_cmd_handler_f *drbd_opt_cmd_handler;
+/* All handler functions that expect a sub-header get that sub-heder in
+ mdev->data.rbuf.header.head.payload.
+
+ Usually in mdev->data.rbuf.header.head the callback can find the usual
+ p_header, but they may not rely on that. Since there is also p_header95 !
+ */
static void drbdd(struct drbd_conf *mdev)
{
- drbd_cmd_handler_f handler;
- struct p_header *header = &mdev->data.rbuf.header;
+ union p_header *header = &mdev->data.rbuf.header;
+ unsigned int packet_size;
+ enum drbd_packets cmd;
+ size_t shs; /* sub header size */
+ int rv;
while (get_t_state(&mdev->receiver) == Running) {
drbd_thread_current_set_cpu(mdev);
- if (!drbd_recv_header(mdev, header)) {
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
- break;
- }
+ if (!drbd_recv_header(mdev, &cmd, &packet_size))
+ goto err_out;
- if (header->command < P_MAX_CMD)
- handler = drbd_cmd_handler[header->command];
- else if (P_MAY_IGNORE < header->command
- && header->command < P_MAX_OPT_CMD)
- handler = drbd_opt_cmd_handler[header->command-P_MAY_IGNORE];
- else if (header->command > P_MAX_OPT_CMD)
- handler = receive_skip;
- else
- handler = NULL;
+ if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
+ dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
+ goto err_out;
+ }
- if (unlikely(!handler)) {
- dev_err(DEV, "unknown packet type %d, l: %d!\n",
- header->command, header->length);
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
- break;
+ shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
+ rv = drbd_recv(mdev, &header->h80.payload, shs);
+ if (unlikely(rv != shs)) {
+ dev_err(DEV, "short read while reading sub header: rv=%d\n", rv);
+ goto err_out;
}
- if (unlikely(!handler(mdev, header))) {
- dev_err(DEV, "error receiving %s, l: %d!\n",
- cmdname(header->command), header->length);
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
- break;
+
+ if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
+ dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
+ goto err_out;
}
- }
-}
-static void drbd_fail_pending_reads(struct drbd_conf *mdev)
-{
- struct hlist_head *slot;
- struct hlist_node *pos;
- struct hlist_node *tmp;
- struct drbd_request *req;
- int i;
+ rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
- /*
- * Application READ requests
- */
- spin_lock_irq(&mdev->req_lock);
- for (i = 0; i < APP_R_HSIZE; i++) {
- slot = mdev->app_reads_hash+i;
- hlist_for_each_entry_safe(req, pos, tmp, slot, colision) {
- /* it may (but should not any longer!)
- * be on the work queue; if that assert triggers,
- * we need to also grab the
- * spin_lock_irq(&mdev->data.work.q_lock);
- * and list_del_init here. */
- D_ASSERT(list_empty(&req->w.list));
- /* It would be nice to complete outside of spinlock.
- * But this is easier for now. */
- _req_mod(req, connection_lost_while_pending);
+ if (unlikely(!rv)) {
+ dev_err(DEV, "error receiving %s, l: %d!\n",
+ cmdname(cmd), packet_size);
+ goto err_out;
}
}
- for (i = 0; i < APP_R_HSIZE; i++)
- if (!hlist_empty(mdev->app_reads_hash+i))
- dev_warn(DEV, "ASSERT FAILED: app_reads_hash[%d].first: "
- "%p, should be NULL\n", i, mdev->app_reads_hash[i].first);
- memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
- spin_unlock_irq(&mdev->req_lock);
+ if (0) {
+ err_out:
+ drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+ }
+ /* If we leave here, we probably want to update at least the
+ * "Connected" indicator on stable storage. Do so explicitly here. */
+ drbd_md_sync(mdev);
}
void drbd_flush_workqueue(struct drbd_conf *mdev)
@@ -3711,6 +3818,36 @@ void drbd_flush_workqueue(struct drbd_conf *mdev)
wait_for_completion(&barr.done);
}
+void drbd_free_tl_hash(struct drbd_conf *mdev)
+{
+ struct hlist_head *h;
+
+ spin_lock_irq(&mdev->req_lock);
+
+ if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
+ spin_unlock_irq(&mdev->req_lock);
+ return;
+ }
+ /* paranoia code */
+ for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
+ if (h->first)
+ dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
+ (int)(h - mdev->ee_hash), h->first);
+ kfree(mdev->ee_hash);
+ mdev->ee_hash = NULL;
+ mdev->ee_hash_s = 0;
+
+ /* paranoia code */
+ for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
+ if (h->first)
+ dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
+ (int)(h - mdev->tl_hash), h->first);
+ kfree(mdev->tl_hash);
+ mdev->tl_hash = NULL;
+ mdev->tl_hash_s = 0;
+ spin_unlock_irq(&mdev->req_lock);
+}
+
static void drbd_disconnect(struct drbd_conf *mdev)
{
enum drbd_fencing_p fp;
@@ -3728,6 +3865,7 @@ static void drbd_disconnect(struct drbd_conf *mdev)
drbd_thread_stop(&mdev->asender);
drbd_free_sock(mdev);
+ /* wait for current activity to cease. */
spin_lock_irq(&mdev->req_lock);
_drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
_drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
@@ -3752,7 +3890,6 @@ static void drbd_disconnect(struct drbd_conf *mdev)
/* make sure syncer is stopped and w_resume_next_sg queued */
del_timer_sync(&mdev->resync_timer);
- set_bit(STOP_SYNC_TIMER, &mdev->flags);
resync_timer_fn((unsigned long)mdev);
/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
@@ -3767,11 +3904,9 @@ static void drbd_disconnect(struct drbd_conf *mdev)
kfree(mdev->p_uuid);
mdev->p_uuid = NULL;
- if (!mdev->state.susp)
+ if (!is_susp(mdev->state))
tl_clear(mdev);
- drbd_fail_pending_reads(mdev);
-
dev_info(DEV, "Connection closed\n");
drbd_md_sync(mdev);
@@ -3782,12 +3917,8 @@ static void drbd_disconnect(struct drbd_conf *mdev)
put_ldev(mdev);
}
- if (mdev->state.role == R_PRIMARY) {
- if (fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN) {
- enum drbd_disk_state nps = drbd_try_outdate_peer(mdev);
- drbd_request_state(mdev, NS(pdsk, nps));
- }
- }
+ if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
+ drbd_try_outdate_peer_async(mdev);
spin_lock_irq(&mdev->req_lock);
os = mdev->state;
@@ -3800,32 +3931,14 @@ static void drbd_disconnect(struct drbd_conf *mdev)
spin_unlock_irq(&mdev->req_lock);
if (os.conn == C_DISCONNECTING) {
- struct hlist_head *h;
- wait_event(mdev->misc_wait, atomic_read(&mdev->net_cnt) == 0);
+ wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
- /* we must not free the tl_hash
- * while application io is still on the fly */
- wait_event(mdev->misc_wait, atomic_read(&mdev->ap_bio_cnt) == 0);
-
- spin_lock_irq(&mdev->req_lock);
- /* paranoia code */
- for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
- if (h->first)
- dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
- (int)(h - mdev->ee_hash), h->first);
- kfree(mdev->ee_hash);
- mdev->ee_hash = NULL;
- mdev->ee_hash_s = 0;
-
- /* paranoia code */
- for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++)
- if (h->first)
- dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n",
- (int)(h - mdev->tl_hash), h->first);
- kfree(mdev->tl_hash);
- mdev->tl_hash = NULL;
- mdev->tl_hash_s = 0;
- spin_unlock_irq(&mdev->req_lock);
+ if (!is_susp(mdev->state)) {
+ /* we must not free the tl_hash
+ * while application io is still on the fly */
+ wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
+ drbd_free_tl_hash(mdev);
+ }
crypto_free_hash(mdev->cram_hmac_tfm);
mdev->cram_hmac_tfm = NULL;
@@ -3845,6 +3958,9 @@ static void drbd_disconnect(struct drbd_conf *mdev)
i = drbd_release_ee(mdev, &mdev->net_ee);
if (i)
dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
+ i = atomic_read(&mdev->pp_in_use_by_net);
+ if (i)
+ dev_info(DEV, "pp_in_use_by_net = %d, expected 0\n", i);
i = atomic_read(&mdev->pp_in_use);
if (i)
dev_info(DEV, "pp_in_use = %d, expected 0\n", i);
@@ -3888,7 +4004,7 @@ static int drbd_send_handshake(struct drbd_conf *mdev)
p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
- (struct p_header *)p, sizeof(*p), 0 );
+ (struct p_header80 *)p, sizeof(*p), 0 );
mutex_unlock(&mdev->data.mutex);
return ok;
}
@@ -3904,27 +4020,28 @@ static int drbd_do_handshake(struct drbd_conf *mdev)
{
/* ASSERT current == mdev->receiver ... */
struct p_handshake *p = &mdev->data.rbuf.handshake;
- const int expect = sizeof(struct p_handshake)
- -sizeof(struct p_header);
+ const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
+ unsigned int length;
+ enum drbd_packets cmd;
int rv;
rv = drbd_send_handshake(mdev);
if (!rv)
return 0;
- rv = drbd_recv_header(mdev, &p->head);
+ rv = drbd_recv_header(mdev, &cmd, &length);
if (!rv)
return 0;
- if (p->head.command != P_HAND_SHAKE) {
+ if (cmd != P_HAND_SHAKE) {
dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
- cmdname(p->head.command), p->head.command);
+ cmdname(cmd), cmd);
return -1;
}
- if (p->head.length != expect) {
+ if (length != expect) {
dev_err(DEV, "expected HandShake length: %u, received: %u\n",
- expect, p->head.length);
+ expect, length);
return -1;
}
@@ -3982,10 +4099,11 @@ static int drbd_do_auth(struct drbd_conf *mdev)
char *response = NULL;
char *right_response = NULL;
char *peers_ch = NULL;
- struct p_header p;
unsigned int key_len = strlen(mdev->net_conf->shared_secret);
unsigned int resp_size;
struct hash_desc desc;
+ enum drbd_packets cmd;
+ unsigned int length;
int rv;
desc.tfm = mdev->cram_hmac_tfm;
@@ -4005,33 +4123,33 @@ static int drbd_do_auth(struct drbd_conf *mdev)
if (!rv)
goto fail;
- rv = drbd_recv_header(mdev, &p);
+ rv = drbd_recv_header(mdev, &cmd, &length);
if (!rv)
goto fail;
- if (p.command != P_AUTH_CHALLENGE) {
+ if (cmd != P_AUTH_CHALLENGE) {
dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
- cmdname(p.command), p.command);
+ cmdname(cmd), cmd);
rv = 0;
goto fail;
}
- if (p.length > CHALLENGE_LEN*2) {
+ if (length > CHALLENGE_LEN * 2) {
dev_err(DEV, "expected AuthChallenge payload too big.\n");
rv = -1;
goto fail;
}
- peers_ch = kmalloc(p.length, GFP_NOIO);
+ peers_ch = kmalloc(length, GFP_NOIO);
if (peers_ch == NULL) {
dev_err(DEV, "kmalloc of peers_ch failed\n");
rv = -1;
goto fail;
}
- rv = drbd_recv(mdev, peers_ch, p.length);
+ rv = drbd_recv(mdev, peers_ch, length);
- if (rv != p.length) {
+ if (rv != length) {
dev_err(DEV, "short read AuthChallenge: l=%u\n", rv);
rv = 0;
goto fail;
@@ -4046,7 +4164,7 @@ static int drbd_do_auth(struct drbd_conf *mdev)
}
sg_init_table(&sg, 1);
- sg_set_buf(&sg, peers_ch, p.length);
+ sg_set_buf(&sg, peers_ch, length);
rv = crypto_hash_digest(&desc, &sg, sg.length, response);
if (rv) {
@@ -4059,18 +4177,18 @@ static int drbd_do_auth(struct drbd_conf *mdev)
if (!rv)
goto fail;
- rv = drbd_recv_header(mdev, &p);
+ rv = drbd_recv_header(mdev, &cmd, &length);
if (!rv)
goto fail;
- if (p.command != P_AUTH_RESPONSE) {
+ if (cmd != P_AUTH_RESPONSE) {
dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
- cmdname(p.command), p.command);
+ cmdname(cmd), cmd);
rv = 0;
goto fail;
}
- if (p.length != resp_size) {
+ if (length != resp_size) {
dev_err(DEV, "expected AuthResponse payload of wrong size\n");
rv = 0;
goto fail;
@@ -4155,7 +4273,7 @@ int drbdd_init(struct drbd_thread *thi)
/* ********* acknowledge sender ******** */
-static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
+static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_req_state_reply *p = (struct p_req_state_reply *)h;
@@ -4173,13 +4291,13 @@ static int got_RqSReply(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-static int got_Ping(struct drbd_conf *mdev, struct p_header *h)
+static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
{
return drbd_send_ping_ack(mdev);
}
-static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
+static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
{
/* restore idle timeout */
mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
@@ -4189,7 +4307,7 @@ static int got_PingAck(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
+static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
sector_t sector = be64_to_cpu(p->sector);
@@ -4199,11 +4317,15 @@ static int got_IsInSync(struct drbd_conf *mdev, struct p_header *h)
update_peer_seq(mdev, be32_to_cpu(p->seq_num));
- drbd_rs_complete_io(mdev, sector);
- drbd_set_in_sync(mdev, sector, blksize);
- /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
- mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+ if (get_ldev(mdev)) {
+ drbd_rs_complete_io(mdev, sector);
+ drbd_set_in_sync(mdev, sector, blksize);
+ /* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
+ mdev->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+ put_ldev(mdev);
+ }
dec_rs_pending(mdev);
+ atomic_add(blksize >> 9, &mdev->rs_sect_in);
return TRUE;
}
@@ -4259,7 +4381,7 @@ static int validate_req_change_req_state(struct drbd_conf *mdev,
return TRUE;
}
-static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
+static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
sector_t sector = be64_to_cpu(p->sector);
@@ -4299,7 +4421,7 @@ static int got_BlockAck(struct drbd_conf *mdev, struct p_header *h)
_ack_id_to_req, __func__ , what);
}
-static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
+static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
sector_t sector = be64_to_cpu(p->sector);
@@ -4319,7 +4441,7 @@ static int got_NegAck(struct drbd_conf *mdev, struct p_header *h)
_ack_id_to_req, __func__ , neg_acked);
}
-static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
+static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
sector_t sector = be64_to_cpu(p->sector);
@@ -4332,7 +4454,7 @@ static int got_NegDReply(struct drbd_conf *mdev, struct p_header *h)
_ar_id_to_req, __func__ , neg_acked);
}
-static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
+static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
{
sector_t sector;
int size;
@@ -4354,7 +4476,7 @@ static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
+static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_barrier_ack *p = (struct p_barrier_ack *)h;
@@ -4363,7 +4485,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header *h)
return TRUE;
}
-static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
+static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
{
struct p_block_ack *p = (struct p_block_ack *)h;
struct drbd_work *w;
@@ -4380,6 +4502,9 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
else
ov_oos_print(mdev);
+ if (!get_ldev(mdev))
+ return TRUE;
+
drbd_rs_complete_io(mdev, sector);
dec_rs_pending(mdev);
@@ -4394,18 +4519,18 @@ static int got_OVResult(struct drbd_conf *mdev, struct p_header *h)
drbd_resync_finished(mdev);
}
}
+ put_ldev(mdev);
return TRUE;
}
-static int got_something_to_ignore_m(struct drbd_conf *mdev, struct p_header *h)
+static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
{
- /* IGNORE */
return TRUE;
}
struct asender_cmd {
size_t pkt_size;
- int (*process)(struct drbd_conf *mdev, struct p_header *h);
+ int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
};
static struct asender_cmd *get_asender_cmd(int cmd)
@@ -4414,8 +4539,8 @@ static struct asender_cmd *get_asender_cmd(int cmd)
/* anything missing from this table is in
* the drbd_cmd_handler (drbd_default_handler) table,
* see the beginning of drbdd() */
- [P_PING] = { sizeof(struct p_header), got_Ping },
- [P_PING_ACK] = { sizeof(struct p_header), got_PingAck },
+ [P_PING] = { sizeof(struct p_header80), got_Ping },
+ [P_PING_ACK] = { sizeof(struct p_header80), got_PingAck },
[P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
[P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
[P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
@@ -4427,7 +4552,7 @@ static struct asender_cmd *get_asender_cmd(int cmd)
[P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
[P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
- [P_DELAY_PROBE] = { sizeof(struct p_delay_probe), got_something_to_ignore_m },
+ [P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
[P_MAX_CMD] = { 0, NULL },
};
if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
@@ -4438,13 +4563,13 @@ static struct asender_cmd *get_asender_cmd(int cmd)
int drbd_asender(struct drbd_thread *thi)
{
struct drbd_conf *mdev = thi->mdev;
- struct p_header *h = &mdev->meta.rbuf.header;
+ struct p_header80 *h = &mdev->meta.rbuf.header.h80;
struct asender_cmd *cmd = NULL;
int rv, len;
void *buf = h;
int received = 0;
- int expect = sizeof(struct p_header);
+ int expect = sizeof(struct p_header80);
int empty;
sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
@@ -4468,10 +4593,8 @@ int drbd_asender(struct drbd_thread *thi)
while (1) {
clear_bit(SIGNAL_ASENDER, &mdev->flags);
flush_signals(current);
- if (!drbd_process_done_ee(mdev)) {
- dev_err(DEV, "process_done_ee() = NOT_OK\n");
+ if (!drbd_process_done_ee(mdev))
goto reconnect;
- }
/* to avoid race with newly queued ACKs */
set_bit(SIGNAL_ASENDER, &mdev->flags);
spin_lock_irq(&mdev->req_lock);
@@ -4530,21 +4653,23 @@ int drbd_asender(struct drbd_thread *thi)
if (received == expect && cmd == NULL) {
if (unlikely(h->magic != BE_DRBD_MAGIC)) {
- dev_err(DEV, "magic?? on meta m: 0x%lx c: %d l: %d\n",
- (long)be32_to_cpu(h->magic),
- h->command, h->length);
+ dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n",
+ be32_to_cpu(h->magic),
+ be16_to_cpu(h->command),
+ be16_to_cpu(h->length));
goto reconnect;
}
cmd = get_asender_cmd(be16_to_cpu(h->command));
len = be16_to_cpu(h->length);
if (unlikely(cmd == NULL)) {
- dev_err(DEV, "unknown command?? on meta m: 0x%lx c: %d l: %d\n",
- (long)be32_to_cpu(h->magic),
- h->command, h->length);
+ dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n",
+ be32_to_cpu(h->magic),
+ be16_to_cpu(h->command),
+ be16_to_cpu(h->length));
goto disconnect;
}
expect = cmd->pkt_size;
- ERR_IF(len != expect-sizeof(struct p_header))
+ ERR_IF(len != expect-sizeof(struct p_header80))
goto reconnect;
}
if (received == expect) {
@@ -4554,7 +4679,7 @@ int drbd_asender(struct drbd_thread *thi)
buf = h;
received = 0;
- expect = sizeof(struct p_header);
+ expect = sizeof(struct p_header80);
cmd = NULL;
}
}
@@ -4562,10 +4687,12 @@ int drbd_asender(struct drbd_thread *thi)
if (0) {
reconnect:
drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+ drbd_md_sync(mdev);
}
if (0) {
disconnect:
drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ drbd_md_sync(mdev);
}
clear_bit(SIGNAL_ASENDER, &mdev->flags);
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index f761d98a4e90..9e91a2545fc8 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -59,17 +59,19 @@ static void _drbd_end_io_acct(struct drbd_conf *mdev, struct drbd_request *req)
static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
{
const unsigned long s = req->rq_state;
+
+ /* remove it from the transfer log.
+ * well, only if it had been there in the first
+ * place... if it had not (local only or conflicting
+ * and never sent), it should still be "empty" as
+ * initialized in drbd_req_new(), so we can list_del() it
+ * here unconditionally */
+ list_del(&req->tl_requests);
+
/* if it was a write, we may have to set the corresponding
* bit(s) out-of-sync first. If it had a local part, we need to
* release the reference to the activity log. */
if (rw == WRITE) {
- /* remove it from the transfer log.
- * well, only if it had been there in the first
- * place... if it had not (local only or conflicting
- * and never sent), it should still be "empty" as
- * initialized in drbd_req_new(), so we can list_del() it
- * here unconditionally */
- list_del(&req->tl_requests);
/* Set out-of-sync unless both OK flags are set
* (local only or remote failed).
* Other places where we set out-of-sync:
@@ -92,7 +94,8 @@ static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const
*/
if (s & RQ_LOCAL_MASK) {
if (get_ldev_if_state(mdev, D_FAILED)) {
- drbd_al_complete_io(mdev, req->sector);
+ if (s & RQ_IN_ACT_LOG)
+ drbd_al_complete_io(mdev, req->sector);
put_ldev(mdev);
} else if (__ratelimit(&drbd_ratelimit_state)) {
dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
@@ -280,6 +283,14 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
* protocol A or B, barrier ack still pending... */
}
+static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m)
+{
+ struct drbd_conf *mdev = req->mdev;
+
+ if (!is_susp(mdev->state))
+ _req_may_be_done(req, m);
+}
+
/*
* checks whether there was an overlapping request
* or ee already registered.
@@ -380,10 +391,11 @@ out_conflict:
* and it enforces that we have to think in a very structured manner
* about the "events" that may happen to a request during its life time ...
*/
-void __req_mod(struct drbd_request *req, enum drbd_req_event what,
+int __req_mod(struct drbd_request *req, enum drbd_req_event what,
struct bio_and_error *m)
{
struct drbd_conf *mdev = req->mdev;
+ int rv = 0;
m->bio = NULL;
switch (what) {
@@ -420,7 +432,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
req->rq_state &= ~RQ_LOCAL_PENDING;
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
put_ldev(mdev);
break;
@@ -429,7 +441,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
req->rq_state &= ~RQ_LOCAL_PENDING;
__drbd_chk_io_error(mdev, FALSE);
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
put_ldev(mdev);
break;
@@ -437,7 +449,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
/* it is legal to fail READA */
req->rq_state |= RQ_LOCAL_COMPLETED;
req->rq_state &= ~RQ_LOCAL_PENDING;
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
put_ldev(mdev);
break;
@@ -455,7 +467,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
/* no point in retrying if there is no good remote data,
* or we have no connection. */
if (mdev->state.pdsk != D_UP_TO_DATE) {
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
break;
}
@@ -517,11 +529,9 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
req->epoch = mdev->newest_tle->br_number;
- list_add_tail(&req->tl_requests,
- &mdev->newest_tle->requests);
/* increment size of current epoch */
- mdev->newest_tle->n_req++;
+ mdev->newest_tle->n_writes++;
/* queue work item to send data */
D_ASSERT(req->rq_state & RQ_NET_PENDING);
@@ -530,7 +540,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
drbd_queue_work(&mdev->data.work, &req->w);
/* close the epoch, in case it outgrew the limit */
- if (mdev->newest_tle->n_req >= mdev->net_conf->max_epoch_size)
+ if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size)
queue_barrier(mdev);
break;
@@ -543,7 +553,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
req->rq_state &= ~RQ_NET_QUEUED;
/* if we did it right, tl_clear should be scheduled only after
* this, so this should not be necessary! */
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
break;
case handed_over_to_network:
@@ -568,7 +578,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
* "completed_ok" events came in, once we return from
* _drbd_send_zc_bio (drbd_send_dblock), we have to check
* whether it is done already, and end it. */
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
break;
case read_retry_remote_canceled:
@@ -584,7 +594,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
/* if it is still queued, we may not complete it here.
* it will be canceled soon. */
if (!(req->rq_state & RQ_NET_QUEUED))
- _req_may_be_done(req, m);
+ _req_may_be_done(req, m); /* Allowed while state.susp */
break;
case write_acked_by_peer_and_sis:
@@ -619,7 +629,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
D_ASSERT(req->rq_state & RQ_NET_PENDING);
dec_ap_pending(mdev);
req->rq_state &= ~RQ_NET_PENDING;
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
break;
case neg_acked:
@@ -629,11 +639,50 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
req->rq_state |= RQ_NET_DONE;
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
/* else: done by handed_over_to_network */
break;
+ case fail_frozen_disk_io:
+ if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+ break;
+
+ _req_may_be_done(req, m); /* Allowed while state.susp */
+ break;
+
+ case restart_frozen_disk_io:
+ if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+ break;
+
+ req->rq_state &= ~RQ_LOCAL_COMPLETED;
+
+ rv = MR_READ;
+ if (bio_data_dir(req->master_bio) == WRITE)
+ rv = MR_WRITE;
+
+ get_ldev(mdev);
+ req->w.cb = w_restart_disk_io;
+ drbd_queue_work(&mdev->data.work, &req->w);
+ break;
+
+ case resend:
+ /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
+ before the connection loss (B&C only); only P_BARRIER_ACK was missing.
+ Trowing them out of the TL here by pretending we got a BARRIER_ACK
+ We ensure that the peer was not rebooted */
+ if (!(req->rq_state & RQ_NET_OK)) {
+ if (req->w.cb) {
+ drbd_queue_work(&mdev->data.work, &req->w);
+ rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
+ }
+ break;
+ }
+ /* else, fall through to barrier_acked */
+
case barrier_acked:
+ if (!(req->rq_state & RQ_WRITE))
+ break;
+
if (req->rq_state & RQ_NET_PENDING) {
/* barrier came in before all requests have been acked.
* this is bad, because if the connection is lost now,
@@ -643,7 +692,7 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
}
D_ASSERT(req->rq_state & RQ_NET_SENT);
req->rq_state |= RQ_NET_DONE;
- _req_may_be_done(req, m);
+ _req_may_be_done(req, m); /* Allowed while state.susp */
break;
case data_received:
@@ -651,9 +700,11 @@ void __req_mod(struct drbd_request *req, enum drbd_req_event what,
dec_ap_pending(mdev);
req->rq_state &= ~RQ_NET_PENDING;
req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
- _req_may_be_done(req, m);
+ _req_may_be_done_not_susp(req, m);
break;
};
+
+ return rv;
}
/* we may do a local read if:
@@ -752,14 +803,16 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio)
* resync extent to finish, and, if necessary, pulls in the target
* extent into the activity log, which involves further disk io because
* of transactional on-disk meta data updates. */
- if (rw == WRITE && local)
+ if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+ req->rq_state |= RQ_IN_ACT_LOG;
drbd_al_begin_io(mdev, sector);
+ }
remote = remote && (mdev->state.pdsk == D_UP_TO_DATE ||
(mdev->state.pdsk == D_INCONSISTENT &&
mdev->state.conn >= C_CONNECTED));
- if (!(local || remote) && !mdev->state.susp) {
+ if (!(local || remote) && !is_susp(mdev->state)) {
dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
goto fail_free_complete;
}
@@ -785,7 +838,7 @@ allocate_barrier:
/* GOOD, everything prepared, grab the spin_lock */
spin_lock_irq(&mdev->req_lock);
- if (mdev->state.susp) {
+ if (is_susp(mdev->state)) {
/* If we got suspended, use the retry mechanism of
generic_make_request() to restart processing of this
bio. In the next call to drbd_make_request_26
@@ -867,30 +920,10 @@ allocate_barrier:
/* check this request on the collision detection hash tables.
* if we have a conflict, just complete it here.
* THINK do we want to check reads, too? (I don't think so...) */
- if (rw == WRITE && _req_conflicts(req)) {
- /* this is a conflicting request.
- * even though it may have been only _partially_
- * overlapping with one of the currently pending requests,
- * without even submitting or sending it, we will
- * pretend that it was successfully served right now.
- */
- if (local) {
- bio_put(req->private_bio);
- req->private_bio = NULL;
- drbd_al_complete_io(mdev, req->sector);
- put_ldev(mdev);
- local = 0;
- }
- if (remote)
- dec_ap_pending(mdev);
- _drbd_end_io_acct(mdev, req);
- /* THINK: do we want to fail it (-EIO), or pretend success? */
- bio_endio(req->master_bio, 0);
- req->master_bio = NULL;
- dec_ap_bio(mdev);
- drbd_req_free(req);
- remote = 0;
- }
+ if (rw == WRITE && _req_conflicts(req))
+ goto fail_conflicting;
+
+ list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
/* NOTE remote first: to get the concurrent write detection right,
* we must register the request before start of local IO. */
@@ -923,6 +956,21 @@ allocate_barrier:
return 0;
+fail_conflicting:
+ /* this is a conflicting request.
+ * even though it may have been only _partially_
+ * overlapping with one of the currently pending requests,
+ * without even submitting or sending it, we will
+ * pretend that it was successfully served right now.
+ */
+ _drbd_end_io_acct(mdev, req);
+ spin_unlock_irq(&mdev->req_lock);
+ if (remote)
+ dec_ap_pending(mdev);
+ /* THINK: do we want to fail it (-EIO), or pretend success?
+ * this pretends success. */
+ err = 0;
+
fail_free_complete:
if (rw == WRITE && local)
drbd_al_complete_io(mdev, sector);
@@ -961,21 +1009,6 @@ static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
return 1;
}
- /*
- * Paranoia: we might have been primary, but sync target, or
- * even diskless, then lost the connection.
- * This should have been handled (panic? suspend?) somewhere
- * else. But maybe it was not, so check again here.
- * Caution: as long as we do not have a read/write lock on mdev,
- * to serialize state changes, this is racy, since we may lose
- * the connection *after* we test for the cstate.
- */
- if (mdev->state.disk < D_UP_TO_DATE && mdev->state.pdsk < D_UP_TO_DATE) {
- if (__ratelimit(&drbd_ratelimit_state))
- dev_err(DEV, "Sorry, I have no access to good data anymore.\n");
- return 1;
- }
-
return 0;
}
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 02d575d24518..181ea0364822 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -104,6 +104,9 @@ enum drbd_req_event {
read_ahead_completed_with_error,
write_completed_with_error,
completed_ok,
+ resend,
+ fail_frozen_disk_io,
+ restart_frozen_disk_io,
nothing, /* for tracing only */
};
@@ -183,6 +186,12 @@ enum drbd_req_state_bits {
/* keep this last, its for the RQ_NET_MASK */
__RQ_NET_MAX,
+
+ /* Set when this is a write, clear for a read */
+ __RQ_WRITE,
+
+ /* Should call drbd_al_complete_io() for this request... */
+ __RQ_IN_ACT_LOG,
};
#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
@@ -201,6 +210,16 @@ enum drbd_req_state_bits {
/* 0x1f8 */
#define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
+#define RQ_WRITE (1UL << __RQ_WRITE)
+#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
+
+/* For waking up the frozen transfer log mod_req() has to return if the request
+ should be counted in the epoch object*/
+#define MR_WRITE_SHIFT 0
+#define MR_WRITE (1 << MR_WRITE_SHIFT)
+#define MR_READ_SHIFT 1
+#define MR_READ (1 << MR_READ_SHIFT)
+
/* epoch entries */
static inline
struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
@@ -244,30 +263,36 @@ static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
return NULL;
}
+static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
+{
+ struct bio *bio;
+ bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
+
+ req->private_bio = bio;
+
+ bio->bi_private = req;
+ bio->bi_end_io = drbd_endio_pri;
+ bio->bi_next = NULL;
+}
+
static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
struct bio *bio_src)
{
- struct bio *bio;
struct drbd_request *req =
mempool_alloc(drbd_request_mempool, GFP_NOIO);
if (likely(req)) {
- bio = bio_clone(bio_src, GFP_NOIO); /* XXX cannot fail?? */
+ drbd_req_make_private_bio(req, bio_src);
- req->rq_state = 0;
+ req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
req->mdev = mdev;
req->master_bio = bio_src;
- req->private_bio = bio;
req->epoch = 0;
- req->sector = bio->bi_sector;
- req->size = bio->bi_size;
+ req->sector = bio_src->bi_sector;
+ req->size = bio_src->bi_size;
req->start_time = jiffies;
INIT_HLIST_NODE(&req->colision);
INIT_LIST_HEAD(&req->tl_requests);
INIT_LIST_HEAD(&req->w.list);
-
- bio->bi_private = req;
- bio->bi_end_io = drbd_endio_pri;
- bio->bi_next = NULL;
}
return req;
}
@@ -292,36 +317,43 @@ struct bio_and_error {
extern void _req_may_be_done(struct drbd_request *req,
struct bio_and_error *m);
-extern void __req_mod(struct drbd_request *req, enum drbd_req_event what,
+extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
struct bio_and_error *m);
extern void complete_master_bio(struct drbd_conf *mdev,
struct bio_and_error *m);
/* use this if you don't want to deal with calling complete_master_bio()
* outside the spinlock, e.g. when walking some list on cleanup. */
-static inline void _req_mod(struct drbd_request *req, enum drbd_req_event what)
+static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
{
struct drbd_conf *mdev = req->mdev;
struct bio_and_error m;
+ int rv;
/* __req_mod possibly frees req, do not touch req after that! */
- __req_mod(req, what, &m);
+ rv = __req_mod(req, what, &m);
if (m.bio)
complete_master_bio(mdev, &m);
+
+ return rv;
}
/* completion of master bio is outside of spinlock.
* If you need it irqsave, do it your self! */
-static inline void req_mod(struct drbd_request *req,
+static inline int req_mod(struct drbd_request *req,
enum drbd_req_event what)
{
struct drbd_conf *mdev = req->mdev;
struct bio_and_error m;
+ int rv;
+
spin_lock_irq(&mdev->req_lock);
- __req_mod(req, what, &m);
+ rv = __req_mod(req, what, &m);
spin_unlock_irq(&mdev->req_lock);
if (m.bio)
complete_master_bio(mdev, &m);
+
+ return rv;
}
#endif
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index ca4a16cea2d8..108d58015cd1 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -39,8 +39,6 @@
#include "drbd_int.h"
#include "drbd_req.h"
-#define SLEEP_TIME (HZ/10)
-
static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
@@ -217,10 +215,8 @@ void drbd_endio_sec(struct bio *bio, int error)
*/
void drbd_endio_pri(struct bio *bio, int error)
{
- unsigned long flags;
struct drbd_request *req = bio->bi_private;
struct drbd_conf *mdev = req->mdev;
- struct bio_and_error m;
enum drbd_req_event what;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
@@ -246,12 +242,7 @@ void drbd_endio_pri(struct bio *bio, int error)
bio_put(req->private_bio);
req->private_bio = ERR_PTR(error);
- spin_lock_irqsave(&mdev->req_lock, flags);
- __req_mod(req, what, &m);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- if (m.bio)
- complete_master_bio(mdev, &m);
+ req_mod(req, what);
}
int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
@@ -376,54 +367,145 @@ static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
struct drbd_epoch_entry *e;
if (!get_ldev(mdev))
- return 0;
+ return -EIO;
+
+ if (drbd_rs_should_slow_down(mdev))
+ goto defer;
/* GFP_TRY, because if there is no memory available right now, this may
* be rescheduled for later. It is "only" background resync, after all. */
e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
if (!e)
- goto fail;
+ goto defer;
+ e->w.cb = w_e_send_csum;
spin_lock_irq(&mdev->req_lock);
list_add(&e->w.list, &mdev->read_ee);
spin_unlock_irq(&mdev->req_lock);
- e->w.cb = w_e_send_csum;
+ atomic_add(size >> 9, &mdev->rs_sect_ev);
if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
- return 1;
+ return 0;
+
+ /* drbd_submit_ee currently fails for one reason only:
+ * not being able to allocate enough bios.
+ * Is dropping the connection going to help? */
+ spin_lock_irq(&mdev->req_lock);
+ list_del(&e->w.list);
+ spin_unlock_irq(&mdev->req_lock);
drbd_free_ee(mdev, e);
-fail:
+defer:
put_ldev(mdev);
- return 2;
+ return -EAGAIN;
}
void resync_timer_fn(unsigned long data)
{
- unsigned long flags;
struct drbd_conf *mdev = (struct drbd_conf *) data;
int queue;
- spin_lock_irqsave(&mdev->req_lock, flags);
-
- if (likely(!test_and_clear_bit(STOP_SYNC_TIMER, &mdev->flags))) {
- queue = 1;
- if (mdev->state.conn == C_VERIFY_S)
- mdev->resync_work.cb = w_make_ov_request;
- else
- mdev->resync_work.cb = w_make_resync_request;
- } else {
+ queue = 1;
+ switch (mdev->state.conn) {
+ case C_VERIFY_S:
+ mdev->resync_work.cb = w_make_ov_request;
+ break;
+ case C_SYNC_TARGET:
+ mdev->resync_work.cb = w_make_resync_request;
+ break;
+ default:
queue = 0;
mdev->resync_work.cb = w_resync_inactive;
}
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
/* harmless race: list_empty outside data.work.q_lock */
if (list_empty(&mdev->resync_work.list) && queue)
drbd_queue_work(&mdev->data.work, &mdev->resync_work);
}
+static void fifo_set(struct fifo_buffer *fb, int value)
+{
+ int i;
+
+ for (i = 0; i < fb->size; i++)
+ fb->values[i] = value;
+}
+
+static int fifo_push(struct fifo_buffer *fb, int value)
+{
+ int ov;
+
+ ov = fb->values[fb->head_index];
+ fb->values[fb->head_index++] = value;
+
+ if (fb->head_index >= fb->size)
+ fb->head_index = 0;
+
+ return ov;
+}
+
+static void fifo_add_val(struct fifo_buffer *fb, int value)
+{
+ int i;
+
+ for (i = 0; i < fb->size; i++)
+ fb->values[i] += value;
+}
+
+int drbd_rs_controller(struct drbd_conf *mdev)
+{
+ unsigned int sect_in; /* Number of sectors that came in since the last turn */
+ unsigned int want; /* The number of sectors we want in the proxy */
+ int req_sect; /* Number of sectors to request in this turn */
+ int correction; /* Number of sectors more we need in the proxy*/
+ int cps; /* correction per invocation of drbd_rs_controller() */
+ int steps; /* Number of time steps to plan ahead */
+ int curr_corr;
+ int max_sect;
+
+ sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
+ mdev->rs_in_flight -= sect_in;
+
+ spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */
+
+ steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
+
+ if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
+ want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps;
+ } else { /* normal path */
+ want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target :
+ sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10);
+ }
+
+ correction = want - mdev->rs_in_flight - mdev->rs_planed;
+
+ /* Plan ahead */
+ cps = correction / steps;
+ fifo_add_val(&mdev->rs_plan_s, cps);
+ mdev->rs_planed += cps * steps;
+
+ /* What we do in this step */
+ curr_corr = fifo_push(&mdev->rs_plan_s, 0);
+ spin_unlock(&mdev->peer_seq_lock);
+ mdev->rs_planed -= curr_corr;
+
+ req_sect = sect_in + curr_corr;
+ if (req_sect < 0)
+ req_sect = 0;
+
+ max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ;
+ if (req_sect > max_sect)
+ req_sect = max_sect;
+
+ /*
+ dev_warn(DEV, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
+ sect_in, mdev->rs_in_flight, want, correction,
+ steps, cps, mdev->rs_planed, curr_corr, req_sect);
+ */
+
+ return req_sect;
+}
+
int w_make_resync_request(struct drbd_conf *mdev,
struct drbd_work *w, int cancel)
{
@@ -431,8 +513,9 @@ int w_make_resync_request(struct drbd_conf *mdev,
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
int max_segment_size;
- int number, i, size, pe, mx;
+ int number, rollback_i, size, pe, mx;
int align, queued, sndbuf;
+ int i = 0;
if (unlikely(cancel))
return 1;
@@ -446,6 +529,12 @@ int w_make_resync_request(struct drbd_conf *mdev,
dev_err(DEV, "%s in w_make_resync_request\n",
drbd_conn_str(mdev->state.conn));
+ if (mdev->rs_total == 0) {
+ /* empty resync? */
+ drbd_resync_finished(mdev);
+ return 1;
+ }
+
if (!get_ldev(mdev)) {
/* Since we only need to access mdev->rsync a
get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
@@ -458,11 +547,25 @@ int w_make_resync_request(struct drbd_conf *mdev,
/* starting with drbd 8.3.8, we can handle multi-bio EEs,
* if it should be necessary */
- max_segment_size = mdev->agreed_pro_version < 94 ?
- queue_max_segment_size(mdev->rq_queue) : DRBD_MAX_SEGMENT_SIZE;
+ max_segment_size =
+ mdev->agreed_pro_version < 94 ? queue_max_segment_size(mdev->rq_queue) :
+ mdev->agreed_pro_version < 95 ? DRBD_MAX_SIZE_H80_PACKET : DRBD_MAX_SEGMENT_SIZE;
- number = SLEEP_TIME * mdev->sync_conf.rate / ((BM_BLOCK_SIZE / 1024) * HZ);
- pe = atomic_read(&mdev->rs_pending_cnt);
+ if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
+ number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
+ mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
+ } else {
+ mdev->c_sync_rate = mdev->sync_conf.rate;
+ number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
+ }
+
+ /* Throttle resync on lower level disk activity, which may also be
+ * caused by application IO on Primary/SyncTarget.
+ * Keep this after the call to drbd_rs_controller, as that assumes
+ * to be called as precisely as possible every SLEEP_TIME,
+ * and would be confused otherwise. */
+ if (drbd_rs_should_slow_down(mdev))
+ goto requeue;
mutex_lock(&mdev->data.mutex);
if (mdev->data.socket)
@@ -476,6 +579,7 @@ int w_make_resync_request(struct drbd_conf *mdev,
mx = number;
/* Limit the number of pending RS requests to no more than the peer's receive buffer */
+ pe = atomic_read(&mdev->rs_pending_cnt);
if ((pe + number) > mx) {
number = mx - pe;
}
@@ -526,6 +630,7 @@ next_sector:
* be prepared for all stripe sizes of software RAIDs.
*/
align = 1;
+ rollback_i = i;
for (;;) {
if (size + BM_BLOCK_SIZE > max_segment_size)
break;
@@ -561,14 +666,19 @@ next_sector:
size = (capacity-sector)<<9;
if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
switch (read_for_csum(mdev, sector, size)) {
- case 0: /* Disk failure*/
+ case -EIO: /* Disk failure */
put_ldev(mdev);
return 0;
- case 2: /* Allocation failed */
+ case -EAGAIN: /* allocation failed, or ldev busy */
drbd_rs_complete_io(mdev, sector);
mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
+ i = rollback_i;
goto requeue;
- /* case 1: everything ok */
+ case 0:
+ /* everything ok */
+ break;
+ default:
+ BUG();
}
} else {
inc_rs_pending(mdev);
@@ -595,6 +705,7 @@ next_sector:
}
requeue:
+ mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
put_ldev(mdev);
return 1;
@@ -670,6 +781,14 @@ static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int ca
return 1;
}
+static void ping_peer(struct drbd_conf *mdev)
+{
+ clear_bit(GOT_PING_ACK, &mdev->flags);
+ request_ping(mdev);
+ wait_event(mdev->misc_wait,
+ test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
+}
+
int drbd_resync_finished(struct drbd_conf *mdev)
{
unsigned long db, dt, dbdt;
@@ -709,6 +828,8 @@ int drbd_resync_finished(struct drbd_conf *mdev)
if (!get_ldev(mdev))
goto out;
+ ping_peer(mdev);
+
spin_lock_irq(&mdev->req_lock);
os = mdev->state;
@@ -801,6 +922,8 @@ out:
mdev->rs_paused = 0;
mdev->ov_start_sector = 0;
+ drbd_md_sync(mdev);
+
if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC, &mdev->flags)) {
dev_warn(DEV, "Writing the whole bitmap, due to failed kmalloc\n");
drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, "write from resync_finished");
@@ -817,9 +940,13 @@ static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_ent
{
if (drbd_ee_has_active_page(e)) {
/* This might happen if sendpage() has not finished */
+ int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT;
+ atomic_add(i, &mdev->pp_in_use_by_net);
+ atomic_sub(i, &mdev->pp_in_use);
spin_lock_irq(&mdev->req_lock);
list_add_tail(&e->w.list, &mdev->net_ee);
spin_unlock_irq(&mdev->req_lock);
+ wake_up(&drbd_pp_wait);
} else
drbd_free_ee(mdev, e);
}
@@ -926,9 +1053,12 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
return 1;
}
- drbd_rs_complete_io(mdev, e->sector);
+ if (get_ldev(mdev)) {
+ drbd_rs_complete_io(mdev, e->sector);
+ put_ldev(mdev);
+ }
- di = (struct digest_info *)(unsigned long)e->block_id;
+ di = e->digest;
if (likely((e->flags & EE_WAS_ERROR) == 0)) {
/* quick hack to try to avoid a race against reconfiguration.
@@ -952,7 +1082,9 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
} else {
inc_rs_pending(mdev);
- e->block_id = ID_SYNCER;
+ e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
+ e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */
+ kfree(di);
ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
}
} else {
@@ -962,9 +1094,6 @@ int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
}
dec_unacked(mdev);
-
- kfree(di);
-
move_to_net_ee_or_free(mdev, e);
if (unlikely(!ok))
@@ -1034,9 +1163,12 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
* the resync lru has been cleaned up already */
- drbd_rs_complete_io(mdev, e->sector);
+ if (get_ldev(mdev)) {
+ drbd_rs_complete_io(mdev, e->sector);
+ put_ldev(mdev);
+ }
- di = (struct digest_info *)(unsigned long)e->block_id;
+ di = e->digest;
if (likely((e->flags & EE_WAS_ERROR) == 0)) {
digest_size = crypto_hash_digestsize(mdev->verify_tfm);
@@ -1055,9 +1187,6 @@ int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
}
dec_unacked(mdev);
-
- kfree(di);
-
if (!eq)
drbd_ov_oos_found(mdev, e->sector, e->size);
else
@@ -1108,7 +1237,7 @@ int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
* dec_ap_pending will be done in got_BarrierAck
* or (on connection loss) in w_clear_epoch. */
ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
- (struct p_header *)p, sizeof(*p), 0);
+ (struct p_header80 *)p, sizeof(*p), 0);
drbd_put_data_sock(mdev);
return ok;
@@ -1173,6 +1302,24 @@ int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
return ok;
}
+int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+{
+ struct drbd_request *req = container_of(w, struct drbd_request, w);
+
+ if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
+ drbd_al_begin_io(mdev, req->sector);
+ /* Calling drbd_al_begin_io() out of the worker might deadlocks
+ theoretically. Practically it can not deadlock, since this is
+ only used when unfreezing IOs. All the extents of the requests
+ that made it into the TL are already active */
+
+ drbd_req_make_private_bio(req, req->master_bio);
+ req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
+ generic_make_request(req->private_bio);
+
+ return 1;
+}
+
static int _drbd_may_sync_now(struct drbd_conf *mdev)
{
struct drbd_conf *odev = mdev;
@@ -1298,14 +1445,6 @@ int drbd_alter_sa(struct drbd_conf *mdev, int na)
return retcode;
}
-static void ping_peer(struct drbd_conf *mdev)
-{
- clear_bit(GOT_PING_ACK, &mdev->flags);
- request_ping(mdev);
- wait_event(mdev->misc_wait,
- test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
-}
-
/**
* drbd_start_resync() - Start the resync process
* @mdev: DRBD device.
@@ -1379,13 +1518,21 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
r = SS_UNKNOWN_ERROR;
if (r == SS_SUCCESS) {
- mdev->rs_total =
- mdev->rs_mark_left = drbd_bm_total_weight(mdev);
+ unsigned long tw = drbd_bm_total_weight(mdev);
+ unsigned long now = jiffies;
+ int i;
+
mdev->rs_failed = 0;
mdev->rs_paused = 0;
- mdev->rs_start =
- mdev->rs_mark_time = jiffies;
mdev->rs_same_csum = 0;
+ mdev->rs_last_events = 0;
+ mdev->rs_last_sect_ev = 0;
+ mdev->rs_total = tw;
+ mdev->rs_start = now;
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ mdev->rs_mark_left[i] = tw;
+ mdev->rs_mark_time[i] = now;
+ }
_drbd_pause_after(mdev);
}
write_unlock_irq(&global_state_lock);
@@ -1397,12 +1544,31 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
(unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
(unsigned long) mdev->rs_total);
- if (mdev->rs_total == 0) {
- /* Peer still reachable? Beware of failing before-resync-target handlers! */
- ping_peer(mdev);
+ if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
+ /* This still has a race (about when exactly the peers
+ * detect connection loss) that can lead to a full sync
+ * on next handshake. In 8.3.9 we fixed this with explicit
+ * resync-finished notifications, but the fix
+ * introduces a protocol change. Sleeping for some
+ * time longer than the ping interval + timeout on the
+ * SyncSource, to give the SyncTarget the chance to
+ * detect connection loss, then waiting for a ping
+ * response (implicit in drbd_resync_finished) reduces
+ * the race considerably, but does not solve it. */
+ if (side == C_SYNC_SOURCE)
+ schedule_timeout_interruptible(
+ mdev->net_conf->ping_int * HZ +
+ mdev->net_conf->ping_timeo*HZ/9);
drbd_resync_finished(mdev);
}
+ atomic_set(&mdev->rs_sect_in, 0);
+ atomic_set(&mdev->rs_sect_ev, 0);
+ mdev->rs_in_flight = 0;
+ mdev->rs_planed = 0;
+ spin_lock(&mdev->peer_seq_lock);
+ fifo_set(&mdev->rs_plan_s, 0);
+ spin_unlock(&mdev->peer_seq_lock);
/* ns.conn may already be != mdev->state.conn,
* we may have been paused in between, or become paused until
* the timer triggers.
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index cf04c1b234ed..767107cce982 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -178,7 +178,6 @@ static int print_unex = 1;
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/bio.h>
-#include <linux/smp_lock.h>
#include <linux/string.h>
#include <linux/jiffies.h>
#include <linux/fcntl.h>
@@ -199,6 +198,7 @@ static int print_unex = 1;
* It's been recommended that take about 1/4 of the default speed
* in some more extreme cases.
*/
+static DEFINE_MUTEX(floppy_mutex);
static int slow_floppy;
#include <asm/dma.h>
@@ -258,8 +258,8 @@ static int irqdma_allocated;
#include <linux/completion.h>
static struct request *current_req;
-static struct request_queue *floppy_queue;
static void do_fd_request(struct request_queue *q);
+static int set_next_request(void);
#ifndef fd_get_dma_residue
#define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA)
@@ -413,6 +413,7 @@ static struct gendisk *disks[N_DRIVE];
static struct block_device *opened_bdev[N_DRIVE];
static DEFINE_MUTEX(open_lock);
static struct floppy_raw_cmd *raw_cmd, default_raw_cmd;
+static int fdc_queue;
/*
* This struct defines the different floppy types.
@@ -890,8 +891,8 @@ static void unlock_fdc(void)
del_timer(&fd_timeout);
cont = NULL;
clear_bit(0, &fdc_busy);
- if (current_req || blk_peek_request(floppy_queue))
- do_fd_request(floppy_queue);
+ if (current_req || set_next_request())
+ do_fd_request(current_req->q);
spin_unlock_irqrestore(&floppy_lock, flags);
wake_up(&fdc_wait);
}
@@ -2243,8 +2244,8 @@ static void floppy_end_request(struct request *req, int error)
* logical buffer */
static void request_done(int uptodate)
{
- struct request_queue *q = floppy_queue;
struct request *req = current_req;
+ struct request_queue *q;
unsigned long flags;
int block;
char msg[sizeof("request done ") + sizeof(int) * 3];
@@ -2258,6 +2259,8 @@ static void request_done(int uptodate)
return;
}
+ q = req->q;
+
if (uptodate) {
/* maintain values for invalidation on geometry
* change */
@@ -2811,6 +2814,28 @@ static int make_raw_rw_request(void)
return 2;
}
+/*
+ * Round-robin between our available drives, doing one request from each
+ */
+static int set_next_request(void)
+{
+ struct request_queue *q;
+ int old_pos = fdc_queue;
+
+ do {
+ q = disks[fdc_queue]->queue;
+ if (++fdc_queue == N_DRIVE)
+ fdc_queue = 0;
+ if (q) {
+ current_req = blk_fetch_request(q);
+ if (current_req)
+ break;
+ }
+ } while (fdc_queue != old_pos);
+
+ return current_req != NULL;
+}
+
static void redo_fd_request(void)
{
int drive;
@@ -2822,17 +2847,17 @@ static void redo_fd_request(void)
do_request:
if (!current_req) {
- struct request *req;
+ int pending;
+
+ spin_lock_irq(&floppy_lock);
+ pending = set_next_request();
+ spin_unlock_irq(&floppy_lock);
- spin_lock_irq(floppy_queue->queue_lock);
- req = blk_fetch_request(floppy_queue);
- spin_unlock_irq(floppy_queue->queue_lock);
- if (!req) {
+ if (!pending) {
do_floppy = NULL;
unlock_fdc();
return;
}
- current_req = req;
}
drive = (long)current_req->rq_disk->private_data;
set_fdc(drive);
@@ -3553,9 +3578,9 @@ static int fd_ioctl(struct block_device *bdev, fmode_t mode,
{
int ret;
- lock_kernel();
+ mutex_lock(&floppy_mutex);
ret = fd_locked_ioctl(bdev, mode, cmd, param);
- unlock_kernel();
+ mutex_unlock(&floppy_mutex);
return ret;
}
@@ -3616,7 +3641,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
{
int drive = (long)disk->private_data;
- lock_kernel();
+ mutex_lock(&floppy_mutex);
mutex_lock(&open_lock);
if (UDRS->fd_ref < 0)
UDRS->fd_ref = 0;
@@ -3627,7 +3652,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
if (!UDRS->fd_ref)
opened_bdev[drive] = NULL;
mutex_unlock(&open_lock);
- unlock_kernel();
+ mutex_unlock(&floppy_mutex);
return 0;
}
@@ -3645,7 +3670,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
int res = -EBUSY;
char *tmp;
- lock_kernel();
+ mutex_lock(&floppy_mutex);
mutex_lock(&open_lock);
old_dev = UDRS->fd_device;
if (opened_bdev[drive] && opened_bdev[drive] != bdev)
@@ -3722,7 +3747,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
goto out;
}
mutex_unlock(&open_lock);
- unlock_kernel();
+ mutex_unlock(&floppy_mutex);
return 0;
out:
if (UDRS->fd_ref < 0)
@@ -3733,7 +3758,7 @@ out:
opened_bdev[drive] = NULL;
out2:
mutex_unlock(&open_lock);
- unlock_kernel();
+ mutex_unlock(&floppy_mutex);
return res;
}
@@ -4165,6 +4190,13 @@ static int __init floppy_init(void)
goto out_put_disk;
}
+ disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock);
+ if (!disks[dr]->queue) {
+ err = -ENOMEM;
+ goto out_put_disk;
+ }
+
+ blk_queue_max_hw_sectors(disks[dr]->queue, 64);
disks[dr]->major = FLOPPY_MAJOR;
disks[dr]->first_minor = TOMINOR(dr);
disks[dr]->fops = &floppy_fops;
@@ -4183,13 +4215,6 @@ static int __init floppy_init(void)
if (err)
goto out_unreg_blkdev;
- floppy_queue = blk_init_queue(do_fd_request, &floppy_lock);
- if (!floppy_queue) {
- err = -ENOMEM;
- goto out_unreg_driver;
- }
- blk_queue_max_hw_sectors(floppy_queue, 64);
-
blk_register_region(MKDEV(FLOPPY_MAJOR, 0), 256, THIS_MODULE,
floppy_find, NULL, NULL);
@@ -4317,7 +4342,6 @@ static int __init floppy_init(void)
/* to be cleaned up... */
disks[drive]->private_data = (void *)(long)drive;
- disks[drive]->queue = floppy_queue;
disks[drive]->flags |= GENHD_FL_REMOVABLE;
disks[drive]->driverfs_dev = &floppy_device[drive].dev;
add_disk(disks[drive]);
@@ -4333,8 +4357,6 @@ out_flush_work:
floppy_release_irq_and_dma();
out_unreg_region:
blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
- blk_cleanup_queue(floppy_queue);
-out_unreg_driver:
platform_driver_unregister(&floppy_driver);
out_unreg_blkdev:
unregister_blkdev(FLOPPY_MAJOR, "fd");
@@ -4342,6 +4364,8 @@ out_put_disk:
while (dr--) {
del_timer(&motor_off_timer[dr]);
put_disk(disks[dr]);
+ if (disks[dr]->queue)
+ blk_cleanup_queue(disks[dr]->queue);
}
return err;
}
@@ -4550,11 +4574,11 @@ static void __exit floppy_module_exit(void)
platform_device_unregister(&floppy_device[drive]);
}
put_disk(disks[drive]);
+ blk_cleanup_queue(disks[drive]->queue);
}
del_timer_sync(&fd_timeout);
del_timer_sync(&fd_timer);
- blk_cleanup_queue(floppy_queue);
if (atomic_read(&usage_count))
floppy_release_irq_and_dma();
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 91797bbbe702..1e5284ef65fa 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -67,16 +67,18 @@
#include <linux/compat.h>
#include <linux/suspend.h>
#include <linux/freezer.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/writeback.h>
#include <linux/buffer_head.h> /* for invalidate_bdev() */
#include <linux/completion.h>
#include <linux/highmem.h>
#include <linux/kthread.h>
#include <linux/splice.h>
+#include <linux/sysfs.h>
#include <asm/uaccess.h>
+static DEFINE_MUTEX(loop_mutex);
static LIST_HEAD(loop_devices);
static DEFINE_MUTEX(loop_devices_mutex);
@@ -99,8 +101,8 @@ static int transfer_none(struct loop_device *lo, int cmd,
else
memcpy(raw_buf, loop_buf, size);
- kunmap_atomic(raw_buf, KM_USER0);
kunmap_atomic(loop_buf, KM_USER1);
+ kunmap_atomic(raw_buf, KM_USER0);
cond_resched();
return 0;
}
@@ -128,8 +130,8 @@ static int transfer_xor(struct loop_device *lo, int cmd,
for (i = 0; i < size; i++)
*out++ = *in++ ^ key[(i & 511) % keysize];
- kunmap_atomic(raw_buf, KM_USER0);
kunmap_atomic(loop_buf, KM_USER1);
+ kunmap_atomic(raw_buf, KM_USER0);
cond_resched();
return 0;
}
@@ -477,17 +479,17 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
pos = ((loff_t) bio->bi_sector << 9) + lo->lo_offset;
if (bio_rw(bio) == WRITE) {
- bool barrier = !!(bio->bi_rw & REQ_HARDBARRIER);
struct file *file = lo->lo_backing_file;
- if (barrier) {
- if (unlikely(!file->f_op->fsync)) {
- ret = -EOPNOTSUPP;
- goto out;
- }
+ /* REQ_HARDBARRIER is deprecated */
+ if (bio->bi_rw & REQ_HARDBARRIER) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
+ if (bio->bi_rw & REQ_FLUSH) {
ret = vfs_fsync(file, 0);
- if (unlikely(ret)) {
+ if (unlikely(ret && ret != -EINVAL)) {
ret = -EIO;
goto out;
}
@@ -495,9 +497,9 @@ static int do_bio_filebacked(struct loop_device *lo, struct bio *bio)
ret = lo_send(lo, bio, pos);
- if (barrier && !ret) {
+ if ((bio->bi_rw & REQ_FUA) && !ret) {
ret = vfs_fsync(file, 0);
- if (unlikely(ret))
+ if (unlikely(ret && ret != -EINVAL))
ret = -EIO;
}
} else
@@ -737,6 +739,103 @@ static inline int is_loop_device(struct file *file)
return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR;
}
+/* loop sysfs attributes */
+
+static ssize_t loop_attr_show(struct device *dev, char *page,
+ ssize_t (*callback)(struct loop_device *, char *))
+{
+ struct loop_device *l, *lo = NULL;
+
+ mutex_lock(&loop_devices_mutex);
+ list_for_each_entry(l, &loop_devices, lo_list)
+ if (disk_to_dev(l->lo_disk) == dev) {
+ lo = l;
+ break;
+ }
+ mutex_unlock(&loop_devices_mutex);
+
+ return lo ? callback(lo, page) : -EIO;
+}
+
+#define LOOP_ATTR_RO(_name) \
+static ssize_t loop_attr_##_name##_show(struct loop_device *, char *); \
+static ssize_t loop_attr_do_show_##_name(struct device *d, \
+ struct device_attribute *attr, char *b) \
+{ \
+ return loop_attr_show(d, b, loop_attr_##_name##_show); \
+} \
+static struct device_attribute loop_attr_##_name = \
+ __ATTR(_name, S_IRUGO, loop_attr_do_show_##_name, NULL);
+
+static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
+{
+ ssize_t ret;
+ char *p = NULL;
+
+ mutex_lock(&lo->lo_ctl_mutex);
+ if (lo->lo_backing_file)
+ p = d_path(&lo->lo_backing_file->f_path, buf, PAGE_SIZE - 1);
+ mutex_unlock(&lo->lo_ctl_mutex);
+
+ if (IS_ERR_OR_NULL(p))
+ ret = PTR_ERR(p);
+ else {
+ ret = strlen(p);
+ memmove(buf, p, ret);
+ buf[ret++] = '\n';
+ buf[ret] = 0;
+ }
+
+ return ret;
+}
+
+static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf)
+{
+ return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_offset);
+}
+
+static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf)
+{
+ return sprintf(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit);
+}
+
+static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
+{
+ int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR);
+
+ return sprintf(buf, "%s\n", autoclear ? "1" : "0");
+}
+
+LOOP_ATTR_RO(backing_file);
+LOOP_ATTR_RO(offset);
+LOOP_ATTR_RO(sizelimit);
+LOOP_ATTR_RO(autoclear);
+
+static struct attribute *loop_attrs[] = {
+ &loop_attr_backing_file.attr,
+ &loop_attr_offset.attr,
+ &loop_attr_sizelimit.attr,
+ &loop_attr_autoclear.attr,
+ NULL,
+};
+
+static struct attribute_group loop_attribute_group = {
+ .name = "loop",
+ .attrs= loop_attrs,
+};
+
+static int loop_sysfs_init(struct loop_device *lo)
+{
+ return sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
+ &loop_attribute_group);
+}
+
+static void loop_sysfs_exit(struct loop_device *lo)
+{
+ sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
+ &loop_attribute_group);
+}
+
static int loop_set_fd(struct loop_device *lo, fmode_t mode,
struct block_device *bdev, unsigned int arg)
{
@@ -832,10 +931,11 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
lo->lo_queue->unplug_fn = loop_unplug;
if (!(lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
- blk_queue_ordered(lo->lo_queue, QUEUE_ORDERED_DRAIN);
+ blk_queue_flush(lo->lo_queue, REQ_FLUSH);
set_capacity(lo->lo_disk, size);
bd_set_size(bdev, size << 9);
+ loop_sysfs_init(lo);
/* let user-space know about the new size */
kobject_uevent(&disk_to_dev(bdev->bd_disk)->kobj, KOBJ_CHANGE);
@@ -854,6 +954,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode,
return 0;
out_clr:
+ loop_sysfs_exit(lo);
lo->lo_thread = NULL;
lo->lo_device = NULL;
lo->lo_backing_file = NULL;
@@ -948,6 +1049,7 @@ static int loop_clr_fd(struct loop_device *lo, struct block_device *bdev)
if (bdev)
invalidate_bdev(bdev);
set_capacity(lo->lo_disk, 0);
+ loop_sysfs_exit(lo);
if (bdev) {
bd_set_size(bdev, 0);
/* let user-space know about this change */
@@ -1409,11 +1511,11 @@ static int lo_open(struct block_device *bdev, fmode_t mode)
{
struct loop_device *lo = bdev->bd_disk->private_data;
- lock_kernel();
+ mutex_lock(&loop_mutex);
mutex_lock(&lo->lo_ctl_mutex);
lo->lo_refcnt++;
mutex_unlock(&lo->lo_ctl_mutex);
- unlock_kernel();
+ mutex_unlock(&loop_mutex);
return 0;
}
@@ -1423,7 +1525,7 @@ static int lo_release(struct gendisk *disk, fmode_t mode)
struct loop_device *lo = disk->private_data;
int err;
- lock_kernel();
+ mutex_lock(&loop_mutex);
mutex_lock(&lo->lo_ctl_mutex);
if (--lo->lo_refcnt)
@@ -1448,7 +1550,7 @@ static int lo_release(struct gendisk *disk, fmode_t mode)
out:
mutex_unlock(&lo->lo_ctl_mutex);
out_unlocked:
- lock_kernel();
+ mutex_unlock(&loop_mutex);
return 0;
}
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index 0daa422aa281..a32fb41246f8 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -24,7 +24,7 @@
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/ioctl.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/compiler.h>
#include <linux/err.h>
#include <linux/kernel.h>
@@ -53,6 +53,7 @@
#define DBG_BLKDEV 0x0100
#define DBG_RX 0x0200
#define DBG_TX 0x0400
+static DEFINE_MUTEX(nbd_mutex);
static unsigned int debugflags;
#endif /* NDEBUG */
@@ -717,11 +718,11 @@ static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n",
lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
- lock_kernel();
+ mutex_lock(&nbd_mutex);
mutex_lock(&lo->tx_lock);
error = __nbd_ioctl(bdev, lo, cmd, arg);
mutex_unlock(&lo->tx_lock);
- unlock_kernel();
+ mutex_unlock(&nbd_mutex);
return error;
}
diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c
index 2284b4f05c62..87311ebac0db 100644
--- a/drivers/block/osdblk.c
+++ b/drivers/block/osdblk.c
@@ -310,8 +310,7 @@ static void osdblk_rq_fn(struct request_queue *q)
break;
/* filter out block requests we don't understand */
- if (rq->cmd_type != REQ_TYPE_FS &&
- !(rq->cmd_flags & REQ_HARDBARRIER)) {
+ if (rq->cmd_type != REQ_TYPE_FS) {
blk_end_request_all(rq, 0);
continue;
}
@@ -439,7 +438,7 @@ static int osdblk_init_disk(struct osdblk_device *osdev)
blk_queue_stack_limits(q, osd_request_queue(osdev->osd));
blk_queue_prep_rq(q, blk_queue_start_tag);
- blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
+ blk_queue_flush(q, REQ_FLUSH);
disk->queue = q;
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
index 76f8565e1e8d..62cec6afd7ad 100644
--- a/drivers/block/paride/pcd.c
+++ b/drivers/block/paride/pcd.c
@@ -138,9 +138,10 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
#include <linux/cdrom.h>
#include <linux/spinlock.h>
#include <linux/blkdev.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <asm/uaccess.h>
+static DEFINE_MUTEX(pcd_mutex);
static DEFINE_SPINLOCK(pcd_lock);
module_param(verbose, bool, 0644);
@@ -227,9 +228,9 @@ static int pcd_block_open(struct block_device *bdev, fmode_t mode)
struct pcd_unit *cd = bdev->bd_disk->private_data;
int ret;
- lock_kernel();
+ mutex_lock(&pcd_mutex);
ret = cdrom_open(&cd->info, bdev, mode);
- unlock_kernel();
+ mutex_unlock(&pcd_mutex);
return ret;
}
@@ -237,9 +238,9 @@ static int pcd_block_open(struct block_device *bdev, fmode_t mode)
static int pcd_block_release(struct gendisk *disk, fmode_t mode)
{
struct pcd_unit *cd = disk->private_data;
- lock_kernel();
+ mutex_lock(&pcd_mutex);
cdrom_release(&cd->info, mode);
- unlock_kernel();
+ mutex_unlock(&pcd_mutex);
return 0;
}
@@ -249,9 +250,9 @@ static int pcd_block_ioctl(struct block_device *bdev, fmode_t mode,
struct pcd_unit *cd = bdev->bd_disk->private_data;
int ret;
- lock_kernel();
+ mutex_lock(&pcd_mutex);
ret = cdrom_ioctl(&cd->info, bdev, mode, cmd, arg);
- unlock_kernel();
+ mutex_unlock(&pcd_mutex);
return ret;
}
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
index 985f0d4f1d1e..c0ee1558b9bb 100644
--- a/drivers/block/paride/pd.c
+++ b/drivers/block/paride/pd.c
@@ -153,10 +153,11 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_GEO, D_SBY, D_DLY, D_SLV};
#include <linux/blkdev.h>
#include <linux/blkpg.h>
#include <linux/kernel.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <asm/uaccess.h>
#include <linux/workqueue.h>
+static DEFINE_MUTEX(pd_mutex);
static DEFINE_SPINLOCK(pd_lock);
module_param(verbose, bool, 0);
@@ -736,14 +737,14 @@ static int pd_open(struct block_device *bdev, fmode_t mode)
{
struct pd_unit *disk = bdev->bd_disk->private_data;
- lock_kernel();
+ mutex_lock(&pd_mutex);
disk->access++;
if (disk->removable) {
pd_special_command(disk, pd_media_check);
pd_special_command(disk, pd_door_lock);
}
- unlock_kernel();
+ mutex_unlock(&pd_mutex);
return 0;
}
@@ -771,10 +772,10 @@ static int pd_ioctl(struct block_device *bdev, fmode_t mode,
switch (cmd) {
case CDROMEJECT:
- lock_kernel();
+ mutex_lock(&pd_mutex);
if (disk->access == 1)
pd_special_command(disk, pd_eject);
- unlock_kernel();
+ mutex_unlock(&pd_mutex);
return 0;
default:
return -EINVAL;
@@ -785,10 +786,10 @@ static int pd_release(struct gendisk *p, fmode_t mode)
{
struct pd_unit *disk = p->private_data;
- lock_kernel();
+ mutex_lock(&pd_mutex);
if (!--disk->access && disk->removable)
pd_special_command(disk, pd_door_unlock);
- unlock_kernel();
+ mutex_unlock(&pd_mutex);
return 0;
}
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
index 4457b494882a..635f25dd9e10 100644
--- a/drivers/block/paride/pf.c
+++ b/drivers/block/paride/pf.c
@@ -152,9 +152,10 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_LUN, D_DLY};
#include <linux/spinlock.h>
#include <linux/blkdev.h>
#include <linux/blkpg.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <asm/uaccess.h>
+static DEFINE_MUTEX(pf_mutex);
static DEFINE_SPINLOCK(pf_spin_lock);
module_param(verbose, bool, 0644);
@@ -302,7 +303,7 @@ static int pf_open(struct block_device *bdev, fmode_t mode)
struct pf_unit *pf = bdev->bd_disk->private_data;
int ret;
- lock_kernel();
+ mutex_lock(&pf_mutex);
pf_identify(pf);
ret = -ENODEV;
@@ -318,7 +319,7 @@ static int pf_open(struct block_device *bdev, fmode_t mode)
if (pf->removable)
pf_lock(pf, 1);
out:
- unlock_kernel();
+ mutex_unlock(&pf_mutex);
return ret;
}
@@ -349,9 +350,9 @@ static int pf_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, u
if (pf->access != 1)
return -EBUSY;
- lock_kernel();
+ mutex_lock(&pf_mutex);
pf_eject(pf);
- unlock_kernel();
+ mutex_unlock(&pf_mutex);
return 0;
}
@@ -360,9 +361,9 @@ static int pf_release(struct gendisk *disk, fmode_t mode)
{
struct pf_unit *pf = disk->private_data;
- lock_kernel();
+ mutex_lock(&pf_mutex);
if (pf->access <= 0) {
- unlock_kernel();
+ mutex_unlock(&pf_mutex);
return -EINVAL;
}
@@ -371,7 +372,7 @@ static int pf_release(struct gendisk *disk, fmode_t mode)
if (!pf->access && pf->removable)
pf_lock(pf, 0);
- unlock_kernel();
+ mutex_unlock(&pf_mutex);
return 0;
}
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
index c397b3ddba9b..6b9a2000d56a 100644
--- a/drivers/block/paride/pg.c
+++ b/drivers/block/paride/pg.c
@@ -162,7 +162,7 @@ enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
#include <linux/pg.h>
#include <linux/device.h>
#include <linux/sched.h> /* current, TASK_* */
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/jiffies.h>
#include <asm/uaccess.h>
@@ -193,6 +193,7 @@ module_param_array(drive3, int, NULL, 0);
#define ATAPI_IDENTIFY 0x12
+static DEFINE_MUTEX(pg_mutex);
static int pg_open(struct inode *inode, struct file *file);
static int pg_release(struct inode *inode, struct file *file);
static ssize_t pg_read(struct file *filp, char __user *buf,
@@ -234,6 +235,7 @@ static const struct file_operations pg_fops = {
.write = pg_write,
.open = pg_open,
.release = pg_release,
+ .llseek = noop_llseek,
};
static void pg_init_units(void)
@@ -518,7 +520,7 @@ static int pg_open(struct inode *inode, struct file *file)
struct pg *dev = &devices[unit];
int ret = 0;
- lock_kernel();
+ mutex_lock(&pg_mutex);
if ((unit >= PG_UNITS) || (!dev->present)) {
ret = -ENODEV;
goto out;
@@ -547,7 +549,7 @@ static int pg_open(struct inode *inode, struct file *file)
file->private_data = dev;
out:
- unlock_kernel();
+ mutex_unlock(&pg_mutex);
return ret;
}
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
index bc5825fdeaab..7179f79d7468 100644
--- a/drivers/block/paride/pt.c
+++ b/drivers/block/paride/pt.c
@@ -146,7 +146,7 @@ static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3};
#include <linux/mtio.h>
#include <linux/device.h>
#include <linux/sched.h> /* current, TASK_*, schedule_timeout() */
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <asm/uaccess.h>
@@ -189,6 +189,7 @@ module_param_array(drive3, int, NULL, 0);
#define ATAPI_MODE_SENSE 0x1a
#define ATAPI_LOG_SENSE 0x4d
+static DEFINE_MUTEX(pt_mutex);
static int pt_open(struct inode *inode, struct file *file);
static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
static int pt_release(struct inode *inode, struct file *file);
@@ -239,6 +240,7 @@ static const struct file_operations pt_fops = {
.unlocked_ioctl = pt_ioctl,
.open = pt_open,
.release = pt_release,
+ .llseek = noop_llseek,
};
/* sysfs class support */
@@ -650,9 +652,9 @@ static int pt_open(struct inode *inode, struct file *file)
struct pt_unit *tape = pt + unit;
int err;
- lock_kernel();
+ mutex_lock(&pt_mutex);
if (unit >= PT_UNITS || (!tape->present)) {
- unlock_kernel();
+ mutex_unlock(&pt_mutex);
return -ENODEV;
}
@@ -681,12 +683,12 @@ static int pt_open(struct inode *inode, struct file *file)
}
file->private_data = tape;
- unlock_kernel();
+ mutex_unlock(&pt_mutex);
return 0;
out:
atomic_inc(&tape->available);
- unlock_kernel();
+ mutex_unlock(&pt_mutex);
return err;
}
@@ -704,15 +706,15 @@ static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
switch (mtop.mt_op) {
case MTREW:
- lock_kernel();
+ mutex_lock(&pt_mutex);
pt_rewind(tape);
- unlock_kernel();
+ mutex_unlock(&pt_mutex);
return 0;
case MTWEOF:
- lock_kernel();
+ mutex_lock(&pt_mutex);
pt_write_fm(tape);
- unlock_kernel();
+ mutex_unlock(&pt_mutex);
return 0;
default:
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index 37a2bb595076..19b3568e9326 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -57,7 +57,6 @@
#include <linux/seq_file.h>
#include <linux/miscdevice.h>
#include <linux/freezer.h>
-#include <linux/smp_lock.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <scsi/scsi_cmnd.h>
@@ -86,6 +85,7 @@
#define ZONE(sector, pd) (((sector) + (pd)->offset) & ~((pd)->settings.size - 1))
+static DEFINE_MUTEX(pktcdvd_mutex);
static struct pktcdvd_device *pkt_devs[MAX_WRITERS];
static struct proc_dir_entry *pkt_proc;
static int pktdev_major;
@@ -753,7 +753,6 @@ static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *
rq->timeout = 60*HZ;
rq->cmd_type = REQ_TYPE_BLOCK_PC;
- rq->cmd_flags |= REQ_HARDBARRIER;
if (cgc->quiet)
rq->cmd_flags |= REQ_QUIET;
@@ -2383,7 +2382,7 @@ static int pkt_open(struct block_device *bdev, fmode_t mode)
VPRINTK(DRIVER_NAME": entering open\n");
- lock_kernel();
+ mutex_lock(&pktcdvd_mutex);
mutex_lock(&ctl_mutex);
pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev));
if (!pd) {
@@ -2411,7 +2410,7 @@ static int pkt_open(struct block_device *bdev, fmode_t mode)
}
mutex_unlock(&ctl_mutex);
- unlock_kernel();
+ mutex_unlock(&pktcdvd_mutex);
return 0;
out_dec:
@@ -2419,7 +2418,7 @@ out_dec:
out:
VPRINTK(DRIVER_NAME": failed open (%d)\n", ret);
mutex_unlock(&ctl_mutex);
- unlock_kernel();
+ mutex_unlock(&pktcdvd_mutex);
return ret;
}
@@ -2428,7 +2427,7 @@ static int pkt_close(struct gendisk *disk, fmode_t mode)
struct pktcdvd_device *pd = disk->private_data;
int ret = 0;
- lock_kernel();
+ mutex_lock(&pktcdvd_mutex);
mutex_lock(&ctl_mutex);
pd->refcnt--;
BUG_ON(pd->refcnt < 0);
@@ -2437,7 +2436,7 @@ static int pkt_close(struct gendisk *disk, fmode_t mode)
pkt_release_dev(pd, flush);
}
mutex_unlock(&ctl_mutex);
- unlock_kernel();
+ mutex_unlock(&pktcdvd_mutex);
return ret;
}
@@ -2773,7 +2772,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
VPRINTK("pkt_ioctl: cmd %x, dev %d:%d\n", cmd,
MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
- lock_kernel();
+ mutex_lock(&pktcdvd_mutex);
switch (cmd) {
case CDROMEJECT:
/*
@@ -2798,7 +2797,7 @@ static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
VPRINTK(DRIVER_NAME": Unknown ioctl for %s (%x)\n", pd->name, cmd);
ret = -ENOTTY;
}
- unlock_kernel();
+ mutex_unlock(&pktcdvd_mutex);
return ret;
}
@@ -3046,6 +3045,7 @@ static const struct file_operations pkt_ctl_fops = {
.compat_ioctl = pkt_ctl_compat_ioctl,
#endif
.owner = THIS_MODULE,
+ .llseek = no_llseek,
};
static struct miscdevice pkt_misc = {
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
index e9da874d0419..8e1ce2e2916a 100644
--- a/drivers/block/ps3disk.c
+++ b/drivers/block/ps3disk.c
@@ -113,7 +113,7 @@ static void ps3disk_scatter_gather(struct ps3_storage_device *dev,
memcpy(buf, dev->bounce_buf+offset, size);
offset += size;
flush_kernel_dcache_page(bvec->bv_page);
- bvec_kunmap_irq(bvec, &flags);
+ bvec_kunmap_irq(buf, &flags);
i++;
}
}
@@ -468,7 +468,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev)
blk_queue_dma_alignment(queue, dev->blk_size-1);
blk_queue_logical_block_size(queue, dev->blk_size);
- blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH);
+ blk_queue_flush(queue, REQ_FLUSH);
blk_queue_max_segments(queue, -1);
blk_queue_max_segment_size(queue, dev->bounce_size);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
new file mode 100644
index 000000000000..6ec9d53806c5
--- /dev/null
+++ b/drivers/block/rbd.c
@@ -0,0 +1,1841 @@
+/*
+ rbd.c -- Export ceph rados objects as a Linux block device
+
+
+ based on drivers/block/osdblk.c:
+
+ Copyright 2009 Red Hat, Inc.
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+
+ Instructions for use
+ --------------------
+
+ 1) Map a Linux block device to an existing rbd image.
+
+ Usage: <mon ip addr> <options> <pool name> <rbd image name> [snap name]
+
+ $ echo "192.168.0.1 name=admin rbd foo" > /sys/class/rbd/add
+
+ The snapshot name can be "-" or omitted to map the image read/write.
+
+ 2) List all active blkdev<->object mappings.
+
+ In this example, we have performed step #1 twice, creating two blkdevs,
+ mapped to two separate rados objects in the rados rbd pool
+
+ $ cat /sys/class/rbd/list
+ #id major client_name pool name snap KB
+ 0 254 client4143 rbd foo - 1024000
+
+ The columns, in order, are:
+ - blkdev unique id
+ - blkdev assigned major
+ - rados client id
+ - rados pool name
+ - rados block device name
+ - mapped snapshot ("-" if none)
+ - device size in KB
+
+
+ 3) Create a snapshot.
+
+ Usage: <blkdev id> <snapname>
+
+ $ echo "0 mysnap" > /sys/class/rbd/snap_create
+
+
+ 4) Listing a snapshot.
+
+ $ cat /sys/class/rbd/snaps_list
+ #id snap KB
+ 0 - 1024000 (*)
+ 0 foo 1024000
+
+ The columns, in order, are:
+ - blkdev unique id
+ - snapshot name, '-' means none (active read/write version)
+ - size of device at time of snapshot
+ - the (*) indicates this is the active version
+
+ 5) Rollback to snapshot.
+
+ Usage: <blkdev id> <snapname>
+
+ $ echo "0 mysnap" > /sys/class/rbd/snap_rollback
+
+
+ 6) Mapping an image using snapshot.
+
+ A snapshot mapping is read-only. This is being done by passing
+ snap=<snapname> to the options when adding a device.
+
+ $ echo "192.168.0.1 name=admin,snap=mysnap rbd foo" > /sys/class/rbd/add
+
+
+ 7) Remove an active blkdev<->rbd image mapping.
+
+ In this example, we remove the mapping with blkdev unique id 1.
+
+ $ echo 1 > /sys/class/rbd/remove
+
+
+ NOTE: The actual creation and deletion of rados objects is outside the scope
+ of this driver.
+
+ */
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/decode.h>
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+
+#include "rbd_types.h"
+
+#define DRV_NAME "rbd"
+#define DRV_NAME_LONG "rbd (rados block device)"
+
+#define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */
+
+#define RBD_MAX_MD_NAME_LEN (96 + sizeof(RBD_SUFFIX))
+#define RBD_MAX_POOL_NAME_LEN 64
+#define RBD_MAX_SNAP_NAME_LEN 32
+#define RBD_MAX_OPT_LEN 1024
+
+#define RBD_SNAP_HEAD_NAME "-"
+
+#define DEV_NAME_LEN 32
+
+/*
+ * block device image metadata (in-memory version)
+ */
+struct rbd_image_header {
+ u64 image_size;
+ char block_name[32];
+ __u8 obj_order;
+ __u8 crypt_type;
+ __u8 comp_type;
+ struct rw_semaphore snap_rwsem;
+ struct ceph_snap_context *snapc;
+ size_t snap_names_len;
+ u64 snap_seq;
+ u32 total_snaps;
+
+ char *snap_names;
+ u64 *snap_sizes;
+};
+
+/*
+ * an instance of the client. multiple devices may share a client.
+ */
+struct rbd_client {
+ struct ceph_client *client;
+ struct kref kref;
+ struct list_head node;
+};
+
+/*
+ * a single io request
+ */
+struct rbd_request {
+ struct request *rq; /* blk layer request */
+ struct bio *bio; /* cloned bio */
+ struct page **pages; /* list of used pages */
+ u64 len;
+};
+
+/*
+ * a single device
+ */
+struct rbd_device {
+ int id; /* blkdev unique id */
+
+ int major; /* blkdev assigned major */
+ struct gendisk *disk; /* blkdev's gendisk and rq */
+ struct request_queue *q;
+
+ struct ceph_client *client;
+ struct rbd_client *rbd_client;
+
+ char name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
+
+ spinlock_t lock; /* queue lock */
+
+ struct rbd_image_header header;
+ char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */
+ int obj_len;
+ char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */
+ char pool_name[RBD_MAX_POOL_NAME_LEN];
+ int poolid;
+
+ char snap_name[RBD_MAX_SNAP_NAME_LEN];
+ u32 cur_snap; /* index+1 of current snapshot within snap context
+ 0 - for the head */
+ int read_only;
+
+ struct list_head node;
+};
+
+static spinlock_t node_lock; /* protects client get/put */
+
+static struct class *class_rbd; /* /sys/class/rbd */
+static DEFINE_MUTEX(ctl_mutex); /* Serialize open/close/setup/teardown */
+static LIST_HEAD(rbd_dev_list); /* devices */
+static LIST_HEAD(rbd_client_list); /* clients */
+
+
+static int rbd_open(struct block_device *bdev, fmode_t mode)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ struct rbd_device *rbd_dev = disk->private_data;
+
+ set_device_ro(bdev, rbd_dev->read_only);
+
+ if ((mode & FMODE_WRITE) && rbd_dev->read_only)
+ return -EROFS;
+
+ return 0;
+}
+
+static const struct block_device_operations rbd_bd_ops = {
+ .owner = THIS_MODULE,
+ .open = rbd_open,
+};
+
+/*
+ * Initialize an rbd client instance.
+ * We own *opt.
+ */
+static struct rbd_client *rbd_client_create(struct ceph_options *opt)
+{
+ struct rbd_client *rbdc;
+ int ret = -ENOMEM;
+
+ dout("rbd_client_create\n");
+ rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
+ if (!rbdc)
+ goto out_opt;
+
+ kref_init(&rbdc->kref);
+ INIT_LIST_HEAD(&rbdc->node);
+
+ rbdc->client = ceph_create_client(opt, rbdc);
+ if (IS_ERR(rbdc->client))
+ goto out_rbdc;
+ opt = NULL; /* Now rbdc->client is responsible for opt */
+
+ ret = ceph_open_session(rbdc->client);
+ if (ret < 0)
+ goto out_err;
+
+ spin_lock(&node_lock);
+ list_add_tail(&rbdc->node, &rbd_client_list);
+ spin_unlock(&node_lock);
+
+ dout("rbd_client_create created %p\n", rbdc);
+ return rbdc;
+
+out_err:
+ ceph_destroy_client(rbdc->client);
+out_rbdc:
+ kfree(rbdc);
+out_opt:
+ if (opt)
+ ceph_destroy_options(opt);
+ return ERR_PTR(ret);
+}
+
+/*
+ * Find a ceph client with specific addr and configuration.
+ */
+static struct rbd_client *__rbd_client_find(struct ceph_options *opt)
+{
+ struct rbd_client *client_node;
+
+ if (opt->flags & CEPH_OPT_NOSHARE)
+ return NULL;
+
+ list_for_each_entry(client_node, &rbd_client_list, node)
+ if (ceph_compare_options(opt, client_node->client) == 0)
+ return client_node;
+ return NULL;
+}
+
+/*
+ * Get a ceph client with specific addr and configuration, if one does
+ * not exist create it.
+ */
+static int rbd_get_client(struct rbd_device *rbd_dev, const char *mon_addr,
+ char *options)
+{
+ struct rbd_client *rbdc;
+ struct ceph_options *opt;
+ int ret;
+
+ ret = ceph_parse_options(&opt, options, mon_addr,
+ mon_addr + strlen(mon_addr), NULL, NULL);
+ if (ret < 0)
+ return ret;
+
+ spin_lock(&node_lock);
+ rbdc = __rbd_client_find(opt);
+ if (rbdc) {
+ ceph_destroy_options(opt);
+
+ /* using an existing client */
+ kref_get(&rbdc->kref);
+ rbd_dev->rbd_client = rbdc;
+ rbd_dev->client = rbdc->client;
+ spin_unlock(&node_lock);
+ return 0;
+ }
+ spin_unlock(&node_lock);
+
+ rbdc = rbd_client_create(opt);
+ if (IS_ERR(rbdc))
+ return PTR_ERR(rbdc);
+
+ rbd_dev->rbd_client = rbdc;
+ rbd_dev->client = rbdc->client;
+ return 0;
+}
+
+/*
+ * Destroy ceph client
+ */
+static void rbd_client_release(struct kref *kref)
+{
+ struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
+
+ dout("rbd_release_client %p\n", rbdc);
+ spin_lock(&node_lock);
+ list_del(&rbdc->node);
+ spin_unlock(&node_lock);
+
+ ceph_destroy_client(rbdc->client);
+ kfree(rbdc);
+}
+
+/*
+ * Drop reference to ceph client node. If it's not referenced anymore, release
+ * it.
+ */
+static void rbd_put_client(struct rbd_device *rbd_dev)
+{
+ kref_put(&rbd_dev->rbd_client->kref, rbd_client_release);
+ rbd_dev->rbd_client = NULL;
+ rbd_dev->client = NULL;
+}
+
+
+/*
+ * Create a new header structure, translate header format from the on-disk
+ * header.
+ */
+static int rbd_header_from_disk(struct rbd_image_header *header,
+ struct rbd_image_header_ondisk *ondisk,
+ int allocated_snaps,
+ gfp_t gfp_flags)
+{
+ int i;
+ u32 snap_count = le32_to_cpu(ondisk->snap_count);
+ int ret = -ENOMEM;
+
+ init_rwsem(&header->snap_rwsem);
+
+ header->snap_names_len = le64_to_cpu(ondisk->snap_names_len);
+ header->snapc = kmalloc(sizeof(struct ceph_snap_context) +
+ snap_count *
+ sizeof(struct rbd_image_snap_ondisk),
+ gfp_flags);
+ if (!header->snapc)
+ return -ENOMEM;
+ if (snap_count) {
+ header->snap_names = kmalloc(header->snap_names_len,
+ GFP_KERNEL);
+ if (!header->snap_names)
+ goto err_snapc;
+ header->snap_sizes = kmalloc(snap_count * sizeof(u64),
+ GFP_KERNEL);
+ if (!header->snap_sizes)
+ goto err_names;
+ } else {
+ header->snap_names = NULL;
+ header->snap_sizes = NULL;
+ }
+ memcpy(header->block_name, ondisk->block_name,
+ sizeof(ondisk->block_name));
+
+ header->image_size = le64_to_cpu(ondisk->image_size);
+ header->obj_order = ondisk->options.order;
+ header->crypt_type = ondisk->options.crypt_type;
+ header->comp_type = ondisk->options.comp_type;
+
+ atomic_set(&header->snapc->nref, 1);
+ header->snap_seq = le64_to_cpu(ondisk->snap_seq);
+ header->snapc->num_snaps = snap_count;
+ header->total_snaps = snap_count;
+
+ if (snap_count &&
+ allocated_snaps == snap_count) {
+ for (i = 0; i < snap_count; i++) {
+ header->snapc->snaps[i] =
+ le64_to_cpu(ondisk->snaps[i].id);
+ header->snap_sizes[i] =
+ le64_to_cpu(ondisk->snaps[i].image_size);
+ }
+
+ /* copy snapshot names */
+ memcpy(header->snap_names, &ondisk->snaps[i],
+ header->snap_names_len);
+ }
+
+ return 0;
+
+err_names:
+ kfree(header->snap_names);
+err_snapc:
+ kfree(header->snapc);
+ return ret;
+}
+
+static int snap_index(struct rbd_image_header *header, int snap_num)
+{
+ return header->total_snaps - snap_num;
+}
+
+static u64 cur_snap_id(struct rbd_device *rbd_dev)
+{
+ struct rbd_image_header *header = &rbd_dev->header;
+
+ if (!rbd_dev->cur_snap)
+ return 0;
+
+ return header->snapc->snaps[snap_index(header, rbd_dev->cur_snap)];
+}
+
+static int snap_by_name(struct rbd_image_header *header, const char *snap_name,
+ u64 *seq, u64 *size)
+{
+ int i;
+ char *p = header->snap_names;
+
+ for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
+ if (strcmp(snap_name, p) == 0)
+ break;
+ }
+ if (i == header->total_snaps)
+ return -ENOENT;
+ if (seq)
+ *seq = header->snapc->snaps[i];
+
+ if (size)
+ *size = header->snap_sizes[i];
+
+ return i;
+}
+
+static int rbd_header_set_snap(struct rbd_device *dev,
+ const char *snap_name,
+ u64 *size)
+{
+ struct rbd_image_header *header = &dev->header;
+ struct ceph_snap_context *snapc = header->snapc;
+ int ret = -ENOENT;
+
+ down_write(&header->snap_rwsem);
+
+ if (!snap_name ||
+ !*snap_name ||
+ strcmp(snap_name, "-") == 0 ||
+ strcmp(snap_name, RBD_SNAP_HEAD_NAME) == 0) {
+ if (header->total_snaps)
+ snapc->seq = header->snap_seq;
+ else
+ snapc->seq = 0;
+ dev->cur_snap = 0;
+ dev->read_only = 0;
+ if (size)
+ *size = header->image_size;
+ } else {
+ ret = snap_by_name(header, snap_name, &snapc->seq, size);
+ if (ret < 0)
+ goto done;
+
+ dev->cur_snap = header->total_snaps - ret;
+ dev->read_only = 1;
+ }
+
+ ret = 0;
+done:
+ up_write(&header->snap_rwsem);
+ return ret;
+}
+
+static void rbd_header_free(struct rbd_image_header *header)
+{
+ kfree(header->snapc);
+ kfree(header->snap_names);
+ kfree(header->snap_sizes);
+}
+
+/*
+ * get the actual striped segment name, offset and length
+ */
+static u64 rbd_get_segment(struct rbd_image_header *header,
+ const char *block_name,
+ u64 ofs, u64 len,
+ char *seg_name, u64 *segofs)
+{
+ u64 seg = ofs >> header->obj_order;
+
+ if (seg_name)
+ snprintf(seg_name, RBD_MAX_SEG_NAME_LEN,
+ "%s.%012llx", block_name, seg);
+
+ ofs = ofs & ((1 << header->obj_order) - 1);
+ len = min_t(u64, len, (1 << header->obj_order) - ofs);
+
+ if (segofs)
+ *segofs = ofs;
+
+ return len;
+}
+
+/*
+ * bio helpers
+ */
+
+static void bio_chain_put(struct bio *chain)
+{
+ struct bio *tmp;
+
+ while (chain) {
+ tmp = chain;
+ chain = chain->bi_next;
+ bio_put(tmp);
+ }
+}
+
+/*
+ * zeros a bio chain, starting at specific offset
+ */
+static void zero_bio_chain(struct bio *chain, int start_ofs)
+{
+ struct bio_vec *bv;
+ unsigned long flags;
+ void *buf;
+ int i;
+ int pos = 0;
+
+ while (chain) {
+ bio_for_each_segment(bv, chain, i) {
+ if (pos + bv->bv_len > start_ofs) {
+ int remainder = max(start_ofs - pos, 0);
+ buf = bvec_kmap_irq(bv, &flags);
+ memset(buf + remainder, 0,
+ bv->bv_len - remainder);
+ bvec_kunmap_irq(buf, &flags);
+ }
+ pos += bv->bv_len;
+ }
+
+ chain = chain->bi_next;
+ }
+}
+
+/*
+ * bio_chain_clone - clone a chain of bios up to a certain length.
+ * might return a bio_pair that will need to be released.
+ */
+static struct bio *bio_chain_clone(struct bio **old, struct bio **next,
+ struct bio_pair **bp,
+ int len, gfp_t gfpmask)
+{
+ struct bio *tmp, *old_chain = *old, *new_chain = NULL, *tail = NULL;
+ int total = 0;
+
+ if (*bp) {
+ bio_pair_release(*bp);
+ *bp = NULL;
+ }
+
+ while (old_chain && (total < len)) {
+ tmp = bio_kmalloc(gfpmask, old_chain->bi_max_vecs);
+ if (!tmp)
+ goto err_out;
+
+ if (total + old_chain->bi_size > len) {
+ struct bio_pair *bp;
+
+ /*
+ * this split can only happen with a single paged bio,
+ * split_bio will BUG_ON if this is not the case
+ */
+ dout("bio_chain_clone split! total=%d remaining=%d"
+ "bi_size=%d\n",
+ (int)total, (int)len-total,
+ (int)old_chain->bi_size);
+
+ /* split the bio. We'll release it either in the next
+ call, or it will have to be released outside */
+ bp = bio_split(old_chain, (len - total) / 512ULL);
+ if (!bp)
+ goto err_out;
+
+ __bio_clone(tmp, &bp->bio1);
+
+ *next = &bp->bio2;
+ } else {
+ __bio_clone(tmp, old_chain);
+ *next = old_chain->bi_next;
+ }
+
+ tmp->bi_bdev = NULL;
+ gfpmask &= ~__GFP_WAIT;
+ tmp->bi_next = NULL;
+
+ if (!new_chain) {
+ new_chain = tail = tmp;
+ } else {
+ tail->bi_next = tmp;
+ tail = tmp;
+ }
+ old_chain = old_chain->bi_next;
+
+ total += tmp->bi_size;
+ }
+
+ BUG_ON(total < len);
+
+ if (tail)
+ tail->bi_next = NULL;
+
+ *old = old_chain;
+
+ return new_chain;
+
+err_out:
+ dout("bio_chain_clone with err\n");
+ bio_chain_put(new_chain);
+ return NULL;
+}
+
+/*
+ * helpers for osd request op vectors.
+ */
+static int rbd_create_rw_ops(struct ceph_osd_req_op **ops,
+ int num_ops,
+ int opcode,
+ u32 payload_len)
+{
+ *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1),
+ GFP_NOIO);
+ if (!*ops)
+ return -ENOMEM;
+ (*ops)[0].op = opcode;
+ /*
+ * op extent offset and length will be set later on
+ * in calc_raw_layout()
+ */
+ (*ops)[0].payload_len = payload_len;
+ return 0;
+}
+
+static void rbd_destroy_ops(struct ceph_osd_req_op *ops)
+{
+ kfree(ops);
+}
+
+/*
+ * Send ceph osd request
+ */
+static int rbd_do_request(struct request *rq,
+ struct rbd_device *dev,
+ struct ceph_snap_context *snapc,
+ u64 snapid,
+ const char *obj, u64 ofs, u64 len,
+ struct bio *bio,
+ struct page **pages,
+ int num_pages,
+ int flags,
+ struct ceph_osd_req_op *ops,
+ int num_reply,
+ void (*rbd_cb)(struct ceph_osd_request *req,
+ struct ceph_msg *msg))
+{
+ struct ceph_osd_request *req;
+ struct ceph_file_layout *layout;
+ int ret;
+ u64 bno;
+ struct timespec mtime = CURRENT_TIME;
+ struct rbd_request *req_data;
+ struct ceph_osd_request_head *reqhead;
+ struct rbd_image_header *header = &dev->header;
+
+ ret = -ENOMEM;
+ req_data = kzalloc(sizeof(*req_data), GFP_NOIO);
+ if (!req_data)
+ goto done;
+
+ dout("rbd_do_request len=%lld ofs=%lld\n", len, ofs);
+
+ down_read(&header->snap_rwsem);
+
+ req = ceph_osdc_alloc_request(&dev->client->osdc, flags,
+ snapc,
+ ops,
+ false,
+ GFP_NOIO, pages, bio);
+ if (IS_ERR(req)) {
+ up_read(&header->snap_rwsem);
+ ret = PTR_ERR(req);
+ goto done_pages;
+ }
+
+ req->r_callback = rbd_cb;
+
+ req_data->rq = rq;
+ req_data->bio = bio;
+ req_data->pages = pages;
+ req_data->len = len;
+
+ req->r_priv = req_data;
+
+ reqhead = req->r_request->front.iov_base;
+ reqhead->snapid = cpu_to_le64(CEPH_NOSNAP);
+
+ strncpy(req->r_oid, obj, sizeof(req->r_oid));
+ req->r_oid_len = strlen(req->r_oid);
+
+ layout = &req->r_file_layout;
+ memset(layout, 0, sizeof(*layout));
+ layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+ layout->fl_stripe_count = cpu_to_le32(1);
+ layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER);
+ layout->fl_pg_preferred = cpu_to_le32(-1);
+ layout->fl_pg_pool = cpu_to_le32(dev->poolid);
+ ceph_calc_raw_layout(&dev->client->osdc, layout, snapid,
+ ofs, &len, &bno, req, ops);
+
+ ceph_osdc_build_request(req, ofs, &len,
+ ops,
+ snapc,
+ &mtime,
+ req->r_oid, req->r_oid_len);
+ up_read(&header->snap_rwsem);
+
+ ret = ceph_osdc_start_request(&dev->client->osdc, req, false);
+ if (ret < 0)
+ goto done_err;
+
+ if (!rbd_cb) {
+ ret = ceph_osdc_wait_request(&dev->client->osdc, req);
+ ceph_osdc_put_request(req);
+ }
+ return ret;
+
+done_err:
+ bio_chain_put(req_data->bio);
+ ceph_osdc_put_request(req);
+done_pages:
+ kfree(req_data);
+done:
+ if (rq)
+ blk_end_request(rq, ret, len);
+ return ret;
+}
+
+/*
+ * Ceph osd op callback
+ */
+static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg)
+{
+ struct rbd_request *req_data = req->r_priv;
+ struct ceph_osd_reply_head *replyhead;
+ struct ceph_osd_op *op;
+ __s32 rc;
+ u64 bytes;
+ int read_op;
+
+ /* parse reply */
+ replyhead = msg->front.iov_base;
+ WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+ op = (void *)(replyhead + 1);
+ rc = le32_to_cpu(replyhead->result);
+ bytes = le64_to_cpu(op->extent.length);
+ read_op = (le32_to_cpu(op->op) == CEPH_OSD_OP_READ);
+
+ dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc);
+
+ if (rc == -ENOENT && read_op) {
+ zero_bio_chain(req_data->bio, 0);
+ rc = 0;
+ } else if (rc == 0 && read_op && bytes < req_data->len) {
+ zero_bio_chain(req_data->bio, bytes);
+ bytes = req_data->len;
+ }
+
+ blk_end_request(req_data->rq, rc, bytes);
+
+ if (req_data->bio)
+ bio_chain_put(req_data->bio);
+
+ ceph_osdc_put_request(req);
+ kfree(req_data);
+}
+
+/*
+ * Do a synchronous ceph osd operation
+ */
+static int rbd_req_sync_op(struct rbd_device *dev,
+ struct ceph_snap_context *snapc,
+ u64 snapid,
+ int opcode,
+ int flags,
+ struct ceph_osd_req_op *orig_ops,
+ int num_reply,
+ const char *obj,
+ u64 ofs, u64 len,
+ char *buf)
+{
+ int ret;
+ struct page **pages;
+ int num_pages;
+ struct ceph_osd_req_op *ops = orig_ops;
+ u32 payload_len;
+
+ num_pages = calc_pages_for(ofs , len);
+ pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ if (!orig_ops) {
+ payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0);
+ ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+ if (ret < 0)
+ goto done;
+
+ if ((flags & CEPH_OSD_FLAG_WRITE) && buf) {
+ ret = ceph_copy_to_page_vector(pages, buf, ofs, len);
+ if (ret < 0)
+ goto done_ops;
+ }
+ }
+
+ ret = rbd_do_request(NULL, dev, snapc, snapid,
+ obj, ofs, len, NULL,
+ pages, num_pages,
+ flags,
+ ops,
+ 2,
+ NULL);
+ if (ret < 0)
+ goto done_ops;
+
+ if ((flags & CEPH_OSD_FLAG_READ) && buf)
+ ret = ceph_copy_from_page_vector(pages, buf, ofs, ret);
+
+done_ops:
+ if (!orig_ops)
+ rbd_destroy_ops(ops);
+done:
+ ceph_release_page_vector(pages, num_pages);
+ return ret;
+}
+
+/*
+ * Do an asynchronous ceph osd operation
+ */
+static int rbd_do_op(struct request *rq,
+ struct rbd_device *rbd_dev ,
+ struct ceph_snap_context *snapc,
+ u64 snapid,
+ int opcode, int flags, int num_reply,
+ u64 ofs, u64 len,
+ struct bio *bio)
+{
+ char *seg_name;
+ u64 seg_ofs;
+ u64 seg_len;
+ int ret;
+ struct ceph_osd_req_op *ops;
+ u32 payload_len;
+
+ seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+ if (!seg_name)
+ return -ENOMEM;
+
+ seg_len = rbd_get_segment(&rbd_dev->header,
+ rbd_dev->header.block_name,
+ ofs, len,
+ seg_name, &seg_ofs);
+
+ payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0);
+
+ ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len);
+ if (ret < 0)
+ goto done;
+
+ /* we've taken care of segment sizes earlier when we
+ cloned the bios. We should never have a segment
+ truncated at this point */
+ BUG_ON(seg_len < len);
+
+ ret = rbd_do_request(rq, rbd_dev, snapc, snapid,
+ seg_name, seg_ofs, seg_len,
+ bio,
+ NULL, 0,
+ flags,
+ ops,
+ num_reply,
+ rbd_req_cb);
+done:
+ kfree(seg_name);
+ return ret;
+}
+
+/*
+ * Request async osd write
+ */
+static int rbd_req_write(struct request *rq,
+ struct rbd_device *rbd_dev,
+ struct ceph_snap_context *snapc,
+ u64 ofs, u64 len,
+ struct bio *bio)
+{
+ return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+ 2,
+ ofs, len, bio);
+}
+
+/*
+ * Request async osd read
+ */
+static int rbd_req_read(struct request *rq,
+ struct rbd_device *rbd_dev,
+ u64 snapid,
+ u64 ofs, u64 len,
+ struct bio *bio)
+{
+ return rbd_do_op(rq, rbd_dev, NULL,
+ (snapid ? snapid : CEPH_NOSNAP),
+ CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ,
+ 2,
+ ofs, len, bio);
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_read(struct rbd_device *dev,
+ struct ceph_snap_context *snapc,
+ u64 snapid,
+ const char *obj,
+ u64 ofs, u64 len,
+ char *buf)
+{
+ return rbd_req_sync_op(dev, NULL,
+ (snapid ? snapid : CEPH_NOSNAP),
+ CEPH_OSD_OP_READ,
+ CEPH_OSD_FLAG_READ,
+ NULL,
+ 1, obj, ofs, len, buf);
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_rollback_obj(struct rbd_device *dev,
+ u64 snapid,
+ const char *obj)
+{
+ struct ceph_osd_req_op *ops;
+ int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_ROLLBACK, 0);
+ if (ret < 0)
+ return ret;
+
+ ops[0].snap.snapid = snapid;
+
+ ret = rbd_req_sync_op(dev, NULL,
+ CEPH_NOSNAP,
+ 0,
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+ ops,
+ 1, obj, 0, 0, NULL);
+
+ rbd_destroy_ops(ops);
+
+ if (ret < 0)
+ return ret;
+
+ return ret;
+}
+
+/*
+ * Request sync osd read
+ */
+static int rbd_req_sync_exec(struct rbd_device *dev,
+ const char *obj,
+ const char *cls,
+ const char *method,
+ const char *data,
+ int len)
+{
+ struct ceph_osd_req_op *ops;
+ int cls_len = strlen(cls);
+ int method_len = strlen(method);
+ int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL,
+ cls_len + method_len + len);
+ if (ret < 0)
+ return ret;
+
+ ops[0].cls.class_name = cls;
+ ops[0].cls.class_len = (__u8)cls_len;
+ ops[0].cls.method_name = method;
+ ops[0].cls.method_len = (__u8)method_len;
+ ops[0].cls.argc = 0;
+ ops[0].cls.indata = data;
+ ops[0].cls.indata_len = len;
+
+ ret = rbd_req_sync_op(dev, NULL,
+ CEPH_NOSNAP,
+ 0,
+ CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK,
+ ops,
+ 1, obj, 0, 0, NULL);
+
+ rbd_destroy_ops(ops);
+
+ dout("cls_exec returned %d\n", ret);
+ return ret;
+}
+
+/*
+ * block device queue callback
+ */
+static void rbd_rq_fn(struct request_queue *q)
+{
+ struct rbd_device *rbd_dev = q->queuedata;
+ struct request *rq;
+ struct bio_pair *bp = NULL;
+
+ rq = blk_fetch_request(q);
+
+ while (1) {
+ struct bio *bio;
+ struct bio *rq_bio, *next_bio = NULL;
+ bool do_write;
+ int size, op_size = 0;
+ u64 ofs;
+
+ /* peek at request from block layer */
+ if (!rq)
+ break;
+
+ dout("fetched request\n");
+
+ /* filter out block requests we don't understand */
+ if ((rq->cmd_type != REQ_TYPE_FS)) {
+ __blk_end_request_all(rq, 0);
+ goto next;
+ }
+
+ /* deduce our operation (read, write) */
+ do_write = (rq_data_dir(rq) == WRITE);
+
+ size = blk_rq_bytes(rq);
+ ofs = blk_rq_pos(rq) * 512ULL;
+ rq_bio = rq->bio;
+ if (do_write && rbd_dev->read_only) {
+ __blk_end_request_all(rq, -EROFS);
+ goto next;
+ }
+
+ spin_unlock_irq(q->queue_lock);
+
+ dout("%s 0x%x bytes at 0x%llx\n",
+ do_write ? "write" : "read",
+ size, blk_rq_pos(rq) * 512ULL);
+
+ do {
+ /* a bio clone to be passed down to OSD req */
+ dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt);
+ op_size = rbd_get_segment(&rbd_dev->header,
+ rbd_dev->header.block_name,
+ ofs, size,
+ NULL, NULL);
+ bio = bio_chain_clone(&rq_bio, &next_bio, &bp,
+ op_size, GFP_ATOMIC);
+ if (!bio) {
+ spin_lock_irq(q->queue_lock);
+ __blk_end_request_all(rq, -ENOMEM);
+ goto next;
+ }
+
+ /* init OSD command: write or read */
+ if (do_write)
+ rbd_req_write(rq, rbd_dev,
+ rbd_dev->header.snapc,
+ ofs,
+ op_size, bio);
+ else
+ rbd_req_read(rq, rbd_dev,
+ cur_snap_id(rbd_dev),
+ ofs,
+ op_size, bio);
+
+ size -= op_size;
+ ofs += op_size;
+
+ rq_bio = next_bio;
+ } while (size > 0);
+
+ if (bp)
+ bio_pair_release(bp);
+
+ spin_lock_irq(q->queue_lock);
+next:
+ rq = blk_fetch_request(q);
+ }
+}
+
+/*
+ * a queue callback. Makes sure that we don't create a bio that spans across
+ * multiple osd objects. One exception would be with a single page bios,
+ * which we handle later at bio_chain_clone
+ */
+static int rbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bmd,
+ struct bio_vec *bvec)
+{
+ struct rbd_device *rbd_dev = q->queuedata;
+ unsigned int chunk_sectors = 1 << (rbd_dev->header.obj_order - 9);
+ sector_t sector = bmd->bi_sector + get_start_sect(bmd->bi_bdev);
+ unsigned int bio_sectors = bmd->bi_size >> 9;
+ int max;
+
+ max = (chunk_sectors - ((sector & (chunk_sectors - 1))
+ + bio_sectors)) << 9;
+ if (max < 0)
+ max = 0; /* bio_add cannot handle a negative return */
+ if (max <= bvec->bv_len && bio_sectors == 0)
+ return bvec->bv_len;
+ return max;
+}
+
+static void rbd_free_disk(struct rbd_device *rbd_dev)
+{
+ struct gendisk *disk = rbd_dev->disk;
+
+ if (!disk)
+ return;
+
+ rbd_header_free(&rbd_dev->header);
+
+ if (disk->flags & GENHD_FL_UP)
+ del_gendisk(disk);
+ if (disk->queue)
+ blk_cleanup_queue(disk->queue);
+ put_disk(disk);
+}
+
+/*
+ * reload the ondisk the header
+ */
+static int rbd_read_header(struct rbd_device *rbd_dev,
+ struct rbd_image_header *header)
+{
+ ssize_t rc;
+ struct rbd_image_header_ondisk *dh;
+ int snap_count = 0;
+ u64 snap_names_len = 0;
+
+ while (1) {
+ int len = sizeof(*dh) +
+ snap_count * sizeof(struct rbd_image_snap_ondisk) +
+ snap_names_len;
+
+ rc = -ENOMEM;
+ dh = kmalloc(len, GFP_KERNEL);
+ if (!dh)
+ return -ENOMEM;
+
+ rc = rbd_req_sync_read(rbd_dev,
+ NULL, CEPH_NOSNAP,
+ rbd_dev->obj_md_name,
+ 0, len,
+ (char *)dh);
+ if (rc < 0)
+ goto out_dh;
+
+ rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL);
+ if (rc < 0)
+ goto out_dh;
+
+ if (snap_count != header->total_snaps) {
+ snap_count = header->total_snaps;
+ snap_names_len = header->snap_names_len;
+ rbd_header_free(header);
+ kfree(dh);
+ continue;
+ }
+ break;
+ }
+
+out_dh:
+ kfree(dh);
+ return rc;
+}
+
+/*
+ * create a snapshot
+ */
+static int rbd_header_add_snap(struct rbd_device *dev,
+ const char *snap_name,
+ gfp_t gfp_flags)
+{
+ int name_len = strlen(snap_name);
+ u64 new_snapid;
+ int ret;
+ void *data, *data_start, *data_end;
+
+ /* we should create a snapshot only if we're pointing at the head */
+ if (dev->cur_snap)
+ return -EINVAL;
+
+ ret = ceph_monc_create_snapid(&dev->client->monc, dev->poolid,
+ &new_snapid);
+ dout("created snapid=%lld\n", new_snapid);
+ if (ret < 0)
+ return ret;
+
+ data = kmalloc(name_len + 16, gfp_flags);
+ if (!data)
+ return -ENOMEM;
+
+ data_start = data;
+ data_end = data + name_len + 16;
+
+ ceph_encode_string_safe(&data, data_end, snap_name, name_len, bad);
+ ceph_encode_64_safe(&data, data_end, new_snapid, bad);
+
+ ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add",
+ data_start, data - data_start);
+
+ kfree(data_start);
+
+ if (ret < 0)
+ return ret;
+
+ dev->header.snapc->seq = new_snapid;
+
+ return 0;
+bad:
+ return -ERANGE;
+}
+
+/*
+ * only read the first part of the ondisk header, without the snaps info
+ */
+static int rbd_update_snaps(struct rbd_device *rbd_dev)
+{
+ int ret;
+ struct rbd_image_header h;
+ u64 snap_seq;
+
+ ret = rbd_read_header(rbd_dev, &h);
+ if (ret < 0)
+ return ret;
+
+ down_write(&rbd_dev->header.snap_rwsem);
+
+ snap_seq = rbd_dev->header.snapc->seq;
+
+ kfree(rbd_dev->header.snapc);
+ kfree(rbd_dev->header.snap_names);
+ kfree(rbd_dev->header.snap_sizes);
+
+ rbd_dev->header.total_snaps = h.total_snaps;
+ rbd_dev->header.snapc = h.snapc;
+ rbd_dev->header.snap_names = h.snap_names;
+ rbd_dev->header.snap_sizes = h.snap_sizes;
+ rbd_dev->header.snapc->seq = snap_seq;
+
+ up_write(&rbd_dev->header.snap_rwsem);
+
+ return 0;
+}
+
+static int rbd_init_disk(struct rbd_device *rbd_dev)
+{
+ struct gendisk *disk;
+ struct request_queue *q;
+ int rc;
+ u64 total_size = 0;
+
+ /* contact OSD, request size info about the object being mapped */
+ rc = rbd_read_header(rbd_dev, &rbd_dev->header);
+ if (rc)
+ return rc;
+
+ rc = rbd_header_set_snap(rbd_dev, rbd_dev->snap_name, &total_size);
+ if (rc)
+ return rc;
+
+ /* create gendisk info */
+ rc = -ENOMEM;
+ disk = alloc_disk(RBD_MINORS_PER_MAJOR);
+ if (!disk)
+ goto out;
+
+ sprintf(disk->disk_name, DRV_NAME "%d", rbd_dev->id);
+ disk->major = rbd_dev->major;
+ disk->first_minor = 0;
+ disk->fops = &rbd_bd_ops;
+ disk->private_data = rbd_dev;
+
+ /* init rq */
+ rc = -ENOMEM;
+ q = blk_init_queue(rbd_rq_fn, &rbd_dev->lock);
+ if (!q)
+ goto out_disk;
+ blk_queue_merge_bvec(q, rbd_merge_bvec);
+ disk->queue = q;
+
+ q->queuedata = rbd_dev;
+
+ rbd_dev->disk = disk;
+ rbd_dev->q = q;
+
+ /* finally, announce the disk to the world */
+ set_capacity(disk, total_size / 512ULL);
+ add_disk(disk);
+
+ pr_info("%s: added with size 0x%llx\n",
+ disk->disk_name, (unsigned long long)total_size);
+ return 0;
+
+out_disk:
+ put_disk(disk);
+out:
+ return rc;
+}
+
+/********************************************************************
+ * /sys/class/rbd/
+ * add map rados objects to blkdev
+ * remove unmap rados objects
+ * list show mappings
+ *******************************************************************/
+
+static void class_rbd_release(struct class *cls)
+{
+ kfree(cls);
+}
+
+static ssize_t class_rbd_list(struct class *c,
+ struct class_attribute *attr,
+ char *data)
+{
+ int n = 0;
+ struct list_head *tmp;
+ int max = PAGE_SIZE;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ n += snprintf(data, max,
+ "#id\tmajor\tclient_name\tpool\tname\tsnap\tKB\n");
+
+ list_for_each(tmp, &rbd_dev_list) {
+ struct rbd_device *rbd_dev;
+
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ n += snprintf(data+n, max-n,
+ "%d\t%d\tclient%lld\t%s\t%s\t%s\t%lld\n",
+ rbd_dev->id,
+ rbd_dev->major,
+ ceph_client_id(rbd_dev->client),
+ rbd_dev->pool_name,
+ rbd_dev->obj, rbd_dev->snap_name,
+ rbd_dev->header.image_size >> 10);
+ if (n == max)
+ break;
+ }
+
+ mutex_unlock(&ctl_mutex);
+ return n;
+}
+
+static ssize_t class_rbd_add(struct class *c,
+ struct class_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct ceph_osd_client *osdc;
+ struct rbd_device *rbd_dev;
+ ssize_t rc = -ENOMEM;
+ int irc, new_id = 0;
+ struct list_head *tmp;
+ char *mon_dev_name;
+ char *options;
+
+ if (!try_module_get(THIS_MODULE))
+ return -ENODEV;
+
+ mon_dev_name = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
+ if (!mon_dev_name)
+ goto err_out_mod;
+
+ options = kmalloc(RBD_MAX_OPT_LEN, GFP_KERNEL);
+ if (!options)
+ goto err_mon_dev;
+
+ /* new rbd_device object */
+ rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
+ if (!rbd_dev)
+ goto err_out_opt;
+
+ /* static rbd_device initialization */
+ spin_lock_init(&rbd_dev->lock);
+ INIT_LIST_HEAD(&rbd_dev->node);
+
+ /* generate unique id: find highest unique id, add one */
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ list_for_each(tmp, &rbd_dev_list) {
+ struct rbd_device *rbd_dev;
+
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ if (rbd_dev->id >= new_id)
+ new_id = rbd_dev->id + 1;
+ }
+
+ rbd_dev->id = new_id;
+
+ /* add to global list */
+ list_add_tail(&rbd_dev->node, &rbd_dev_list);
+
+ /* parse add command */
+ if (sscanf(buf, "%" __stringify(RBD_MAX_OPT_LEN) "s "
+ "%" __stringify(RBD_MAX_OPT_LEN) "s "
+ "%" __stringify(RBD_MAX_POOL_NAME_LEN) "s "
+ "%" __stringify(RBD_MAX_OBJ_NAME_LEN) "s"
+ "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+ mon_dev_name, options, rbd_dev->pool_name,
+ rbd_dev->obj, rbd_dev->snap_name) < 4) {
+ rc = -EINVAL;
+ goto err_out_slot;
+ }
+
+ if (rbd_dev->snap_name[0] == 0)
+ rbd_dev->snap_name[0] = '-';
+
+ rbd_dev->obj_len = strlen(rbd_dev->obj);
+ snprintf(rbd_dev->obj_md_name, sizeof(rbd_dev->obj_md_name), "%s%s",
+ rbd_dev->obj, RBD_SUFFIX);
+
+ /* initialize rest of new object */
+ snprintf(rbd_dev->name, DEV_NAME_LEN, DRV_NAME "%d", rbd_dev->id);
+ rc = rbd_get_client(rbd_dev, mon_dev_name, options);
+ if (rc < 0)
+ goto err_out_slot;
+
+ mutex_unlock(&ctl_mutex);
+
+ /* pick the pool */
+ osdc = &rbd_dev->client->osdc;
+ rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name);
+ if (rc < 0)
+ goto err_out_client;
+ rbd_dev->poolid = rc;
+
+ /* register our block device */
+ irc = register_blkdev(0, rbd_dev->name);
+ if (irc < 0) {
+ rc = irc;
+ goto err_out_client;
+ }
+ rbd_dev->major = irc;
+
+ /* set up and announce blkdev mapping */
+ rc = rbd_init_disk(rbd_dev);
+ if (rc)
+ goto err_out_blkdev;
+
+ return count;
+
+err_out_blkdev:
+ unregister_blkdev(rbd_dev->major, rbd_dev->name);
+err_out_client:
+ rbd_put_client(rbd_dev);
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+err_out_slot:
+ list_del_init(&rbd_dev->node);
+ mutex_unlock(&ctl_mutex);
+
+ kfree(rbd_dev);
+err_out_opt:
+ kfree(options);
+err_mon_dev:
+ kfree(mon_dev_name);
+err_out_mod:
+ dout("Error adding device %s\n", buf);
+ module_put(THIS_MODULE);
+ return rc;
+}
+
+static struct rbd_device *__rbd_get_dev(unsigned long id)
+{
+ struct list_head *tmp;
+ struct rbd_device *rbd_dev;
+
+ list_for_each(tmp, &rbd_dev_list) {
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ if (rbd_dev->id == id)
+ return rbd_dev;
+ }
+ return NULL;
+}
+
+static ssize_t class_rbd_remove(struct class *c,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = NULL;
+ int target_id, rc;
+ unsigned long ul;
+
+ rc = strict_strtoul(buf, 10, &ul);
+ if (rc)
+ return rc;
+
+ /* convert to int; abort if we lost anything in the conversion */
+ target_id = (int) ul;
+ if (target_id != ul)
+ return -EINVAL;
+
+ /* remove object from list immediately */
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ rbd_dev = __rbd_get_dev(target_id);
+ if (rbd_dev)
+ list_del_init(&rbd_dev->node);
+
+ mutex_unlock(&ctl_mutex);
+
+ if (!rbd_dev)
+ return -ENOENT;
+
+ rbd_put_client(rbd_dev);
+
+ /* clean up and free blkdev */
+ rbd_free_disk(rbd_dev);
+ unregister_blkdev(rbd_dev->major, rbd_dev->name);
+ kfree(rbd_dev);
+
+ /* release module ref */
+ module_put(THIS_MODULE);
+
+ return count;
+}
+
+static ssize_t class_rbd_snaps_list(struct class *c,
+ struct class_attribute *attr,
+ char *data)
+{
+ struct rbd_device *rbd_dev = NULL;
+ struct list_head *tmp;
+ struct rbd_image_header *header;
+ int i, n = 0, max = PAGE_SIZE;
+ int ret;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ n += snprintf(data, max, "#id\tsnap\tKB\n");
+
+ list_for_each(tmp, &rbd_dev_list) {
+ char *names, *p;
+ struct ceph_snap_context *snapc;
+
+ rbd_dev = list_entry(tmp, struct rbd_device, node);
+ header = &rbd_dev->header;
+
+ down_read(&header->snap_rwsem);
+
+ names = header->snap_names;
+ snapc = header->snapc;
+
+ n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
+ rbd_dev->id, RBD_SNAP_HEAD_NAME,
+ header->image_size >> 10,
+ (!rbd_dev->cur_snap ? " (*)" : ""));
+ if (n == max)
+ break;
+
+ p = names;
+ for (i = 0; i < header->total_snaps; i++, p += strlen(p) + 1) {
+ n += snprintf(data + n, max - n, "%d\t%s\t%lld%s\n",
+ rbd_dev->id, p, header->snap_sizes[i] >> 10,
+ (rbd_dev->cur_snap &&
+ (snap_index(header, i) == rbd_dev->cur_snap) ?
+ " (*)" : ""));
+ if (n == max)
+ break;
+ }
+
+ up_read(&header->snap_rwsem);
+ }
+
+
+ ret = n;
+ mutex_unlock(&ctl_mutex);
+ return ret;
+}
+
+static ssize_t class_rbd_snaps_refresh(struct class *c,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = NULL;
+ int target_id, rc;
+ unsigned long ul;
+ int ret = count;
+
+ rc = strict_strtoul(buf, 10, &ul);
+ if (rc)
+ return rc;
+
+ /* convert to int; abort if we lost anything in the conversion */
+ target_id = (int) ul;
+ if (target_id != ul)
+ return -EINVAL;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ rbd_dev = __rbd_get_dev(target_id);
+ if (!rbd_dev) {
+ ret = -ENOENT;
+ goto done;
+ }
+
+ rc = rbd_update_snaps(rbd_dev);
+ if (rc < 0)
+ ret = rc;
+
+done:
+ mutex_unlock(&ctl_mutex);
+ return ret;
+}
+
+static ssize_t class_rbd_snap_create(struct class *c,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = NULL;
+ int target_id, ret;
+ char *name;
+
+ name = kmalloc(RBD_MAX_SNAP_NAME_LEN + 1, GFP_KERNEL);
+ if (!name)
+ return -ENOMEM;
+
+ /* parse snaps add command */
+ if (sscanf(buf, "%d "
+ "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+ &target_id,
+ name) != 2) {
+ ret = -EINVAL;
+ goto done;
+ }
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ rbd_dev = __rbd_get_dev(target_id);
+ if (!rbd_dev) {
+ ret = -ENOENT;
+ goto done_unlock;
+ }
+
+ ret = rbd_header_add_snap(rbd_dev,
+ name, GFP_KERNEL);
+ if (ret < 0)
+ goto done_unlock;
+
+ ret = rbd_update_snaps(rbd_dev);
+ if (ret < 0)
+ goto done_unlock;
+
+ ret = count;
+done_unlock:
+ mutex_unlock(&ctl_mutex);
+done:
+ kfree(name);
+ return ret;
+}
+
+static ssize_t class_rbd_rollback(struct class *c,
+ struct class_attribute *attr,
+ const char *buf,
+ size_t count)
+{
+ struct rbd_device *rbd_dev = NULL;
+ int target_id, ret;
+ u64 snapid;
+ char snap_name[RBD_MAX_SNAP_NAME_LEN];
+ u64 cur_ofs;
+ char *seg_name;
+
+ /* parse snaps add command */
+ if (sscanf(buf, "%d "
+ "%" __stringify(RBD_MAX_SNAP_NAME_LEN) "s",
+ &target_id,
+ snap_name) != 2) {
+ return -EINVAL;
+ }
+
+ ret = -ENOMEM;
+ seg_name = kmalloc(RBD_MAX_SEG_NAME_LEN + 1, GFP_NOIO);
+ if (!seg_name)
+ return ret;
+
+ mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+ rbd_dev = __rbd_get_dev(target_id);
+ if (!rbd_dev) {
+ ret = -ENOENT;
+ goto done_unlock;
+ }
+
+ ret = snap_by_name(&rbd_dev->header, snap_name, &snapid, NULL);
+ if (ret < 0)
+ goto done_unlock;
+
+ dout("snapid=%lld\n", snapid);
+
+ cur_ofs = 0;
+ while (cur_ofs < rbd_dev->header.image_size) {
+ cur_ofs += rbd_get_segment(&rbd_dev->header,
+ rbd_dev->obj,
+ cur_ofs, (u64)-1,
+ seg_name, NULL);
+ dout("seg_name=%s\n", seg_name);
+
+ ret = rbd_req_sync_rollback_obj(rbd_dev, snapid, seg_name);
+ if (ret < 0)
+ pr_warning("could not roll back obj %s err=%d\n",
+ seg_name, ret);
+ }
+
+ ret = rbd_update_snaps(rbd_dev);
+ if (ret < 0)
+ goto done_unlock;
+
+ ret = count;
+
+done_unlock:
+ mutex_unlock(&ctl_mutex);
+ kfree(seg_name);
+
+ return ret;
+}
+
+static struct class_attribute class_rbd_attrs[] = {
+ __ATTR(add, 0200, NULL, class_rbd_add),
+ __ATTR(remove, 0200, NULL, class_rbd_remove),
+ __ATTR(list, 0444, class_rbd_list, NULL),
+ __ATTR(snaps_refresh, 0200, NULL, class_rbd_snaps_refresh),
+ __ATTR(snap_create, 0200, NULL, class_rbd_snap_create),
+ __ATTR(snaps_list, 0444, class_rbd_snaps_list, NULL),
+ __ATTR(snap_rollback, 0200, NULL, class_rbd_rollback),
+ __ATTR_NULL
+};
+
+/*
+ * create control files in sysfs
+ * /sys/class/rbd/...
+ */
+static int rbd_sysfs_init(void)
+{
+ int ret = -ENOMEM;
+
+ class_rbd = kzalloc(sizeof(*class_rbd), GFP_KERNEL);
+ if (!class_rbd)
+ goto out;
+
+ class_rbd->name = DRV_NAME;
+ class_rbd->owner = THIS_MODULE;
+ class_rbd->class_release = class_rbd_release;
+ class_rbd->class_attrs = class_rbd_attrs;
+
+ ret = class_register(class_rbd);
+ if (ret)
+ goto out_class;
+ return 0;
+
+out_class:
+ kfree(class_rbd);
+ class_rbd = NULL;
+ pr_err(DRV_NAME ": failed to create class rbd\n");
+out:
+ return ret;
+}
+
+static void rbd_sysfs_cleanup(void)
+{
+ if (class_rbd)
+ class_destroy(class_rbd);
+ class_rbd = NULL;
+}
+
+int __init rbd_init(void)
+{
+ int rc;
+
+ rc = rbd_sysfs_init();
+ if (rc)
+ return rc;
+ spin_lock_init(&node_lock);
+ pr_info("loaded " DRV_NAME_LONG "\n");
+ return 0;
+}
+
+void __exit rbd_exit(void)
+{
+ rbd_sysfs_cleanup();
+}
+
+module_init(rbd_init);
+module_exit(rbd_exit);
+
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+MODULE_DESCRIPTION("rados block device");
+
+/* following authorship retained from original osdblk.c */
+MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
new file mode 100644
index 000000000000..fc6c678aa2cb
--- /dev/null
+++ b/drivers/block/rbd_types.h
@@ -0,0 +1,73 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_TYPES_H
+#define CEPH_RBD_TYPES_H
+
+#include <linux/types.h>
+
+/*
+ * rbd image 'foo' consists of objects
+ * foo.rbd - image metadata
+ * foo.00000000
+ * foo.00000001
+ * ... - data
+ */
+
+#define RBD_SUFFIX ".rbd"
+#define RBD_DIRECTORY "rbd_directory"
+#define RBD_INFO "rbd_info"
+
+#define RBD_DEFAULT_OBJ_ORDER 22 /* 4MB */
+#define RBD_MIN_OBJ_ORDER 16
+#define RBD_MAX_OBJ_ORDER 30
+
+#define RBD_MAX_OBJ_NAME_LEN 96
+#define RBD_MAX_SEG_NAME_LEN 128
+
+#define RBD_COMP_NONE 0
+#define RBD_CRYPT_NONE 0
+
+#define RBD_HEADER_TEXT "<<< Rados Block Device Image >>>\n"
+#define RBD_HEADER_SIGNATURE "RBD"
+#define RBD_HEADER_VERSION "001.005"
+
+struct rbd_info {
+ __le64 max_id;
+} __attribute__ ((packed));
+
+struct rbd_image_snap_ondisk {
+ __le64 id;
+ __le64 image_size;
+} __attribute__((packed));
+
+struct rbd_image_header_ondisk {
+ char text[40];
+ char block_name[24];
+ char signature[4];
+ char version[8];
+ struct {
+ __u8 order;
+ __u8 crypt_type;
+ __u8 comp_type;
+ __u8 unused;
+ } __attribute__((packed)) options;
+ __le64 image_size;
+ __le64 snap_seq;
+ __le32 snap_count;
+ __le32 reserved;
+ __le64 snap_names_len;
+ struct rbd_image_snap_ondisk snaps[0];
+} __attribute__((packed));
+
+
+#endif
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
index 2e46815876df..75333d0a3327 100644
--- a/drivers/block/swim.c
+++ b/drivers/block/swim.c
@@ -20,7 +20,7 @@
#include <linux/fd.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/hdreg.h>
#include <linux/kernel.h>
#include <linux/delay.h>
@@ -222,6 +222,7 @@ extern int swim_read_sector_header(struct swim __iomem *base,
extern int swim_read_sector_data(struct swim __iomem *base,
unsigned char *data);
+static DEFINE_MUTEX(swim_mutex);
static inline void set_swim_mode(struct swim __iomem *base, int enable)
{
struct iwm __iomem *iwm_base;
@@ -666,9 +667,9 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
{
int ret;
- lock_kernel();
+ mutex_lock(&swim_mutex);
ret = floppy_open(bdev, mode);
- unlock_kernel();
+ mutex_unlock(&swim_mutex);
return ret;
}
@@ -678,7 +679,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
struct floppy_state *fs = disk->private_data;
struct swim __iomem *base = fs->swd->base;
- lock_kernel();
+ mutex_lock(&swim_mutex);
if (fs->ref_count < 0)
fs->ref_count = 0;
else if (fs->ref_count > 0)
@@ -686,7 +687,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
if (fs->ref_count == 0)
swim_motor(base, OFF);
- unlock_kernel();
+ mutex_unlock(&swim_mutex);
return 0;
}
@@ -704,9 +705,9 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
case FDEJECT:
if (fs->ref_count != 1)
return -EBUSY;
- lock_kernel();
+ mutex_lock(&swim_mutex);
err = floppy_eject(fs);
- unlock_kernel();
+ mutex_unlock(&swim_mutex);
return err;
case FDGETPRM:
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
index cc6a3864822c..bf3a5b859299 100644
--- a/drivers/block/swim3.c
+++ b/drivers/block/swim3.c
@@ -25,7 +25,7 @@
#include <linux/ioctl.h>
#include <linux/blkdev.h>
#include <linux/interrupt.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <asm/io.h>
@@ -36,6 +36,7 @@
#include <asm/machdep.h>
#include <asm/pmac_feature.h>
+static DEFINE_MUTEX(swim3_mutex);
static struct request_queue *swim3_queue;
static struct gendisk *disks[2];
static struct request *fd_req;
@@ -873,9 +874,9 @@ static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
{
int ret;
- lock_kernel();
+ mutex_lock(&swim3_mutex);
ret = floppy_locked_ioctl(bdev, mode, cmd, param);
- unlock_kernel();
+ mutex_unlock(&swim3_mutex);
return ret;
}
@@ -953,9 +954,9 @@ static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
{
int ret;
- lock_kernel();
+ mutex_lock(&swim3_mutex);
ret = floppy_open(bdev, mode);
- unlock_kernel();
+ mutex_unlock(&swim3_mutex);
return ret;
}
@@ -964,13 +965,13 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
{
struct floppy_state *fs = disk->private_data;
struct swim3 __iomem *sw = fs->swim3;
- lock_kernel();
+ mutex_lock(&swim3_mutex);
if (fs->ref_count > 0 && --fs->ref_count == 0) {
swim3_action(fs, MOTOR_OFF);
out_8(&sw->control_bic, 0xff);
swim3_select(fs, RELAX);
}
- unlock_kernel();
+ mutex_unlock(&swim3_mutex);
return 0;
}
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index c48e14878582..9ae3bb713286 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -28,7 +28,7 @@
#include <linux/timer.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <scsi/scsi.h>
#define DRV_NAME "ub"
@@ -248,6 +248,7 @@ struct ub_completion {
spinlock_t lock;
};
+static DEFINE_MUTEX(ub_mutex);
static inline void ub_init_completion(struct ub_completion *x)
{
x->done = 0;
@@ -396,7 +397,7 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum);
#else
static const struct usb_device_id ub_usb_ids[] = {
- { USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, US_SC_SCSI, US_PR_BULK) },
+ { USB_INTERFACE_INFO(USB_CLASS_MASS_STORAGE, USB_SC_SCSI, USB_PR_BULK) },
{ }
};
@@ -1715,9 +1716,9 @@ static int ub_bd_unlocked_open(struct block_device *bdev, fmode_t mode)
{
int ret;
- lock_kernel();
+ mutex_lock(&ub_mutex);
ret = ub_bd_open(bdev, mode);
- unlock_kernel();
+ mutex_unlock(&ub_mutex);
return ret;
}
@@ -1730,9 +1731,9 @@ static int ub_bd_release(struct gendisk *disk, fmode_t mode)
struct ub_lun *lun = disk->private_data;
struct ub_dev *sc = lun->udev;
- lock_kernel();
+ mutex_lock(&ub_mutex);
ub_put(sc);
- unlock_kernel();
+ mutex_unlock(&ub_mutex);
return 0;
}
@@ -1747,9 +1748,9 @@ static int ub_bd_ioctl(struct block_device *bdev, fmode_t mode,
void __user *usermem = (void __user *) arg;
int ret;
- lock_kernel();
+ mutex_lock(&ub_mutex);
ret = scsi_cmd_ioctl(disk->queue, disk, mode, cmd, usermem);
- unlock_kernel();
+ mutex_unlock(&ub_mutex);
return ret;
}
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
index f651e51a3319..e2ff697697c2 100644
--- a/drivers/block/viodasd.c
+++ b/drivers/block/viodasd.c
@@ -41,7 +41,7 @@
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/string.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/dma-mapping.h>
#include <linux/completion.h>
#include <linux/device.h>
@@ -73,6 +73,7 @@ enum {
MAX_DISK_NAME = FIELD_SIZEOF(struct gendisk, disk_name)
};
+static DEFINE_MUTEX(viodasd_mutex);
static DEFINE_SPINLOCK(viodasd_spinlock);
#define VIOMAXREQ 16
@@ -180,9 +181,9 @@ static int viodasd_unlocked_open(struct block_device *bdev, fmode_t mode)
{
int ret;
- lock_kernel();
+ mutex_lock(&viodasd_mutex);
ret = viodasd_open(bdev, mode);
- unlock_kernel();
+ mutex_unlock(&viodasd_mutex);
return ret;
}
@@ -196,7 +197,7 @@ static int viodasd_release(struct gendisk *disk, fmode_t mode)
struct viodasd_device *d = disk->private_data;
HvLpEvent_Rc hvrc;
- lock_kernel();
+ mutex_lock(&viodasd_mutex);
/* Send the event to OS/400. We DON'T expect a response */
hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
HvLpEvent_Type_VirtualIo,
@@ -210,7 +211,7 @@ static int viodasd_release(struct gendisk *disk, fmode_t mode)
if (hvrc != 0)
pr_warning("HV close call failed %d\n", (int)hvrc);
- unlock_kernel();
+ mutex_unlock(&viodasd_mutex);
return 0;
}
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index 1101e251a629..6ecf89cdf006 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -2,7 +2,6 @@
#include <linux/spinlock.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
-#include <linux/smp_lock.h>
#include <linux/hdreg.h>
#include <linux/virtio.h>
#include <linux/virtio_blk.h>
@@ -128,9 +127,6 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
}
}
- if (vbr->req->cmd_flags & REQ_HARDBARRIER)
- vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER;
-
sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr));
/*
@@ -222,8 +218,8 @@ static int virtblk_get_id(struct gendisk *disk, char *id_str)
return err;
}
-static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned cmd, unsigned long data)
+static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long data)
{
struct gendisk *disk = bdev->bd_disk;
struct virtio_blk *vblk = disk->private_data;
@@ -238,18 +234,6 @@ static int virtblk_locked_ioctl(struct block_device *bdev, fmode_t mode,
(void __user *)data);
}
-static int virtblk_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long param)
-{
- int ret;
-
- lock_kernel();
- ret = virtblk_locked_ioctl(bdev, mode, cmd, param);
- unlock_kernel();
-
- return ret;
-}
-
/* We provide getgeo only to please some old bootloader/partitioning tools */
static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
{
@@ -392,31 +376,9 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
vblk->disk->driverfs_dev = &vdev->dev;
index++;
- if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) {
- /*
- * If the FLUSH feature is supported we do have support for
- * flushing a volatile write cache on the host. Use that
- * to implement write barrier support.
- */
- blk_queue_ordered(q, QUEUE_ORDERED_DRAIN_FLUSH);
- } else if (virtio_has_feature(vdev, VIRTIO_BLK_F_BARRIER)) {
- /*
- * If the BARRIER feature is supported the host expects us
- * to order request by tags. This implies there is not
- * volatile write cache on the host, and that the host
- * never re-orders outstanding I/O. This feature is not
- * useful for real life scenarious and deprecated.
- */
- blk_queue_ordered(q, QUEUE_ORDERED_TAG);
- } else {
- /*
- * If the FLUSH feature is not supported we must assume that
- * the host does not perform any kind of volatile write
- * caching. We still need to drain the queue to provider
- * proper barrier semantics.
- */
- blk_queue_ordered(q, QUEUE_ORDERED_DRAIN);
- }
+ /* configure queue flush support */
+ if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH))
+ blk_queue_flush(q, REQ_FLUSH);
/* If disk is read-only in the host, the guest should obey */
if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
@@ -535,9 +497,9 @@ static const struct virtio_device_id id_table[] = {
};
static unsigned int features[] = {
- VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX,
- VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
- VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
+ VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
+ VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI,
+ VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY
};
/*
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index d5a3cd750561..4abd2bcd20fb 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -46,7 +46,7 @@
#include <linux/init.h>
#include <linux/wait.h>
#include <linux/blkdev.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/blkpg.h>
#include <linux/delay.h>
#include <linux/io.h>
@@ -58,6 +58,7 @@
#include "xd.h"
+static DEFINE_MUTEX(xd_mutex);
static void __init do_xd_setup (int *integers);
#ifdef MODULE
static int xd[5] = { -1,-1,-1,-1, };
@@ -381,9 +382,9 @@ static int xd_ioctl(struct block_device *bdev, fmode_t mode,
{
int ret;
- lock_kernel();
+ mutex_lock(&xd_mutex);
ret = xd_locked_ioctl(bdev, mode, cmd, param);
- unlock_kernel();
+ mutex_unlock(&xd_mutex);
return ret;
}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index c4e9d817caaa..06e2812ba124 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -41,7 +41,7 @@
#include <linux/cdrom.h>
#include <linux/module.h>
#include <linux/slab.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/scatterlist.h>
#include <xen/xen.h>
@@ -69,6 +69,7 @@ struct blk_shadow {
unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST];
};
+static DEFINE_MUTEX(blkfront_mutex);
static const struct block_device_operations xlvbd_block_fops;
#define BLK_RING_SIZE __RING_SIZE((struct blkif_sring *)0, PAGE_SIZE)
@@ -95,7 +96,7 @@ struct blkfront_info
struct gnttab_free_callback callback;
struct blk_shadow shadow[BLK_RING_SIZE];
unsigned long shadow_free;
- int feature_barrier;
+ unsigned int feature_flush;
int is_ready;
};
@@ -418,26 +419,12 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
}
-static int xlvbd_barrier(struct blkfront_info *info)
+static void xlvbd_flush(struct blkfront_info *info)
{
- int err;
- const char *barrier;
-
- switch (info->feature_barrier) {
- case QUEUE_ORDERED_DRAIN: barrier = "enabled (drain)"; break;
- case QUEUE_ORDERED_TAG: barrier = "enabled (tag)"; break;
- case QUEUE_ORDERED_NONE: barrier = "disabled"; break;
- default: return -EINVAL;
- }
-
- err = blk_queue_ordered(info->rq, info->feature_barrier);
-
- if (err)
- return err;
-
+ blk_queue_flush(info->rq, info->feature_flush);
printk(KERN_INFO "blkfront: %s: barriers %s\n",
- info->gd->disk_name, barrier);
- return 0;
+ info->gd->disk_name,
+ info->feature_flush ? "enabled" : "disabled");
}
@@ -516,7 +503,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
info->rq = gd->queue;
info->gd = gd;
- xlvbd_barrier(info);
+ xlvbd_flush(info);
if (vdisk_info & VDISK_READONLY)
set_disk_ro(gd, 1);
@@ -662,8 +649,8 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
printk(KERN_WARNING "blkfront: %s: write barrier op failed\n",
info->gd->disk_name);
error = -EOPNOTSUPP;
- info->feature_barrier = QUEUE_ORDERED_NONE;
- xlvbd_barrier(info);
+ info->feature_flush = 0;
+ xlvbd_flush(info);
}
/* fall through */
case BLKIF_OP_READ:
@@ -1076,20 +1063,20 @@ static void blkfront_connect(struct blkfront_info *info)
/*
* If there's no "feature-barrier" defined, then it means
* we're dealing with a very old backend which writes
- * synchronously; draining will do what needs to get done.
+ * synchronously; nothing to do.
*
- * If there are barriers, then we can do full queued writes
- * with tagged barriers.
- *
- * If barriers are not supported, then there's no much we can
- * do, so just set ordering to NONE.
+ * If there are barriers, then we use flush.
*/
- if (err)
- info->feature_barrier = QUEUE_ORDERED_DRAIN;
- else if (barrier)
- info->feature_barrier = QUEUE_ORDERED_TAG;
- else
- info->feature_barrier = QUEUE_ORDERED_NONE;
+ info->feature_flush = 0;
+
+ /*
+ * The driver doesn't properly handled empty flushes, so
+ * lets disable barrier support for now.
+ */
+#if 0
+ if (!err && barrier)
+ info->feature_flush = REQ_FLUSH;
+#endif
err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
if (err) {
@@ -1203,7 +1190,7 @@ static int blkif_open(struct block_device *bdev, fmode_t mode)
struct blkfront_info *info;
int err = 0;
- lock_kernel();
+ mutex_lock(&blkfront_mutex);
info = disk->private_data;
if (!info) {
@@ -1221,7 +1208,7 @@ static int blkif_open(struct block_device *bdev, fmode_t mode)
mutex_unlock(&info->mutex);
out:
- unlock_kernel();
+ mutex_unlock(&blkfront_mutex);
return err;
}
@@ -1231,7 +1218,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
struct block_device *bdev;
struct xenbus_device *xbdev;
- lock_kernel();
+ mutex_lock(&blkfront_mutex);
bdev = bdget_disk(disk, 0);
bdput(bdev);
@@ -1265,7 +1252,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
}
out:
- unlock_kernel();
+ mutex_unlock(&blkfront_mutex);
return 0;
}
diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c
index 057413bb16e2..829161edae53 100644
--- a/drivers/block/xsysace.c
+++ b/drivers/block/xsysace.c
@@ -89,7 +89,7 @@
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/ata.h>
#include <linux/hdreg.h>
#include <linux/platform_device.h>
@@ -214,6 +214,7 @@ struct ace_device {
u16 cf_id[ATA_ID_WORDS];
};
+static DEFINE_MUTEX(xsysace_mutex);
static int ace_major;
/* ---------------------------------------------------------------------
@@ -903,13 +904,13 @@ static int ace_open(struct block_device *bdev, fmode_t mode)
dev_dbg(ace->dev, "ace_open() users=%i\n", ace->users + 1);
- lock_kernel();
+ mutex_lock(&xsysace_mutex);
spin_lock_irqsave(&ace->lock, flags);
ace->users++;
spin_unlock_irqrestore(&ace->lock, flags);
check_disk_change(bdev);
- unlock_kernel();
+ mutex_unlock(&xsysace_mutex);
return 0;
}
@@ -922,7 +923,7 @@ static int ace_release(struct gendisk *disk, fmode_t mode)
dev_dbg(ace->dev, "ace_release() users=%i\n", ace->users - 1);
- lock_kernel();
+ mutex_lock(&xsysace_mutex);
spin_lock_irqsave(&ace->lock, flags);
ace->users--;
if (ace->users == 0) {
@@ -930,7 +931,7 @@ static int ace_release(struct gendisk *disk, fmode_t mode)
ace_out(ace, ACE_CTRL, val & ~ACE_CTRL_LOCKREQ);
}
spin_unlock_irqrestore(&ace->lock, flags);
- unlock_kernel();
+ mutex_unlock(&xsysace_mutex);
return 0;
}
@@ -1224,7 +1225,8 @@ ace_of_probe(struct platform_device *op, const struct of_device_id *match)
bus_width = ACE_BUS_WIDTH_8;
/* Call the bus-independant setup code */
- return ace_alloc(&op->dev, id ? *id : 0, physaddr, irq, bus_width);
+ return ace_alloc(&op->dev, id ? be32_to_cpup(id) : 0,
+ physaddr, irq, bus_width);
}
static int __devexit ace_of_remove(struct platform_device *op)
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
index d75b2bb601ad..a22e3f895947 100644
--- a/drivers/block/z2ram.c
+++ b/drivers/block/z2ram.c
@@ -33,7 +33,7 @@
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/bitops.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/slab.h>
#include <asm/setup.h>
@@ -57,6 +57,7 @@ extern struct mem_info m68k_memory[NUM_MEMINFO];
#define Z2RAM_CHUNK1024 ( Z2RAM_CHUNKSIZE >> 10 )
+static DEFINE_MUTEX(z2ram_mutex);
static u_long *z2ram_map = NULL;
static u_long z2ram_size = 0;
static int z2_count = 0;
@@ -79,8 +80,10 @@ static void do_z2_request(struct request_queue *q)
int err = 0;
if (start + len > z2ram_size) {
- printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n",
- blk_rq_pos(req), blk_rq_cur_sectors(req));
+ pr_err(DEVICE_NAME ": bad access: block=%llu, "
+ "count=%u\n",
+ (unsigned long long)blk_rq_pos(req),
+ blk_rq_cur_sectors(req));
err = -EIO;
goto done;
}
@@ -154,7 +157,7 @@ static int z2_open(struct block_device *bdev, fmode_t mode)
device = MINOR(bdev->bd_dev);
- lock_kernel();
+ mutex_lock(&z2ram_mutex);
if ( current_device != -1 && current_device != device )
{
rc = -EBUSY;
@@ -296,25 +299,25 @@ static int z2_open(struct block_device *bdev, fmode_t mode)
set_capacity(z2ram_gendisk, z2ram_size >> 9);
}
- unlock_kernel();
+ mutex_unlock(&z2ram_mutex);
return 0;
err_out_kfree:
kfree(z2ram_map);
err_out:
- unlock_kernel();
+ mutex_unlock(&z2ram_mutex);
return rc;
}
static int
z2_release(struct gendisk *disk, fmode_t mode)
{
- lock_kernel();
+ mutex_lock(&z2ram_mutex);
if ( current_device == -1 ) {
- unlock_kernel();
+ mutex_unlock(&z2ram_mutex);
return 0;
}
- unlock_kernel();
+ mutex_unlock(&z2ram_mutex);
/*
* FIXME: unmap memory
*/