Merge tag 'v3.5-rc1'

Linux 3.5-rc1 Conflicts: net/ceph/messenger.c
author: Sage Weil <sage@inktank.com> 2012-06-15 21:32:04 +0200
committer: Sage Weil <sage@inktank.com> 2012-06-15 21:32:04 +0200
commit: 9a64e8e0ace51b309fdcff4b4754b3649250382a (patch)
tree: 1f0d75c196c5ab0408c55ed6cf3a152f1f921e15 /drivers/block
parent: libceph: flush msgr queue during mon_client shutdown (diff)
parent: Linux 3.5-rc1 (diff)
download: linux-9a64e8e0ace51b309fdcff4b4754b3649250382a.tar.xz
linux-9a64e8e0ace51b309fdcff4b4754b3649250382a.zip
32 files changed, 2026 insertions, 1920 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c
index 8db9089127c5..9a13e889837e 100644
--- a/drivers/block/DAC960.c
+++ b/drivers/block/DAC960.c
@@ -6580,24 +6580,21 @@ static const struct file_operations dac960_user_command_proc_fops = {
 
 static void DAC960_CreateProcEntries(DAC960_Controller_T *Controller)
 {
-	struct proc_dir_entry *StatusProcEntry;
 	struct proc_dir_entry *ControllerProcEntry;
-	struct proc_dir_entry *UserCommandProcEntry;
 
 	if (DAC960_ProcDirectoryEntry == NULL) {
-  		DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL);
-  		StatusProcEntry = proc_create("status", 0,
-					   DAC960_ProcDirectoryEntry,
-					   &dac960_proc_fops);
+		DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL);
+		proc_create("status", 0, DAC960_ProcDirectoryEntry,
+			    &dac960_proc_fops);
 	}
 
-      sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber);
-      ControllerProcEntry = proc_mkdir(Controller->ControllerName,
-				       DAC960_ProcDirectoryEntry);
-      proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller);
-      proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller);
-      UserCommandProcEntry = proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller);
-      Controller->ControllerProcEntry = ControllerProcEntry;
+	sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber);
+	ControllerProcEntry = proc_mkdir(Controller->ControllerName,
+					 DAC960_ProcDirectoryEntry);
+	proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller);
+	proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller);
+	proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller);
+	Controller->ControllerProcEntry = ControllerProcEntry;
 }
 
 
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 4e4c8a4a5fd3..a796407123c7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -354,7 +354,7 @@ config BLK_DEV_SX8
 	  Use devices /dev/sx8/$N and /dev/sx8/$Np$M.
 
 config BLK_DEV_UB
-	tristate "Low Performance USB Block driver"
+	tristate "Low Performance USB Block driver (deprecated)"
 	depends on USB
 	help
 	  This driver supports certain USB attached storage devices
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
index ec246437f5a4..531ceb31d0ff 100644
--- a/drivers/block/brd.c
+++ b/drivers/block/brd.c
@@ -242,9 +242,9 @@ static void copy_to_brd(struct brd_device *brd, const void *src,
 	page = brd_lookup_page(brd, sector);
 	BUG_ON(!page);
 
-	dst = kmap_atomic(page, KM_USER1);
+	dst = kmap_atomic(page);
 	memcpy(dst + offset, src, copy);
-	kunmap_atomic(dst, KM_USER1);
+	kunmap_atomic(dst);
 
 	if (copy < n) {
 		src += copy;
@@ -253,9 +253,9 @@ static void copy_to_brd(struct brd_device *brd, const void *src,
 		page = brd_lookup_page(brd, sector);
 		BUG_ON(!page);
 
-		dst = kmap_atomic(page, KM_USER1);
+		dst = kmap_atomic(page);
 		memcpy(dst, src, copy);
-		kunmap_atomic(dst, KM_USER1);
+		kunmap_atomic(dst);
 	}
 }
 
@@ -273,9 +273,9 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
 	copy = min_t(size_t, n, PAGE_SIZE - offset);
 	page = brd_lookup_page(brd, sector);
 	if (page) {
-		src = kmap_atomic(page, KM_USER1);
+		src = kmap_atomic(page);
 		memcpy(dst, src + offset, copy);
-		kunmap_atomic(src, KM_USER1);
+		kunmap_atomic(src);
 	} else
 		memset(dst, 0, copy);
 
@@ -285,9 +285,9 @@ static void copy_from_brd(void *dst, struct brd_device *brd,
 		copy = n - copy;
 		page = brd_lookup_page(brd, sector);
 		if (page) {
-			src = kmap_atomic(page, KM_USER1);
+			src = kmap_atomic(page);
 			memcpy(dst, src, copy);
-			kunmap_atomic(src, KM_USER1);
+			kunmap_atomic(src);
 		} else
 			memset(dst, 0, copy);
 	}
@@ -309,7 +309,7 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page,
 			goto out;
 	}
 
-	mem = kmap_atomic(page, KM_USER0);
+	mem = kmap_atomic(page);
 	if (rw == READ) {
 		copy_from_brd(mem + off, brd, sector, len);
 		flush_dcache_page(page);
@@ -317,7 +317,7 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page,
 		flush_dcache_page(page);
 		copy_to_brd(brd, mem + off, sector, len);
 	}
-	kunmap_atomic(mem, KM_USER0);
+	kunmap_atomic(mem);
 
 out:
 	return err;
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c
index e820b68d2f6c..acda773b3720 100644
--- a/drivers/block/cciss_scsi.c
+++ b/drivers/block/cciss_scsi.c
@@ -866,6 +866,7 @@ cciss_scsi_detect(ctlr_info_t *h)
 	sh->can_queue = cciss_tape_cmds;
 	sh->sg_tablesize = h->maxsgentries;
 	sh->max_cmd_len = MAX_COMMAND_SIZE;
+	sh->max_sectors = h->cciss_max_sectors;
 
 	((struct cciss_scsi_adapter_data_t *) 
 		h->scsi_ctlr)->scsi_host = sh;
@@ -1410,7 +1411,7 @@ static void cciss_scatter_gather(ctlr_info_t *h, CommandList_struct *c,
 	/* track how many SG entries we are using */
 	if (request_nsgs > h->maxSG)
 		h->maxSG = request_nsgs;
-	c->Header.SGTotal = (__u8) request_nsgs + chained;
+	c->Header.SGTotal = (u16) request_nsgs + chained;
 	if (request_nsgs > h->max_cmd_sgentries)
 		c->Header.SGList = h->max_cmd_sgentries;
 	else
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index cf0e63dd97da..e54e31b02b88 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -65,39 +65,80 @@ struct drbd_atodb_wait {
 
 int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
 
+void *drbd_md_get_buffer(struct drbd_conf *mdev)
+{
+	int r;
+
+	wait_event(mdev->misc_wait,
+		   (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 ||
+		   mdev->state.disk <= D_FAILED);
+
+	return r ? NULL : page_address(mdev->md_io_page);
+}
+
+void drbd_md_put_buffer(struct drbd_conf *mdev)
+{
+	if (atomic_dec_and_test(&mdev->md_io_in_use))
+		wake_up(&mdev->misc_wait);
+}
+
+static bool md_io_allowed(struct drbd_conf *mdev)
+{
+	enum drbd_disk_state ds = mdev->state.disk;
+	return ds >= D_NEGOTIATING || ds == D_ATTACHING;
+}
+
+void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+				     unsigned int *done)
+{
+	long dt = bdev->dc.disk_timeout * HZ / 10;
+	if (dt == 0)
+		dt = MAX_SCHEDULE_TIMEOUT;
+
+	dt = wait_event_timeout(mdev->misc_wait, *done || !md_io_allowed(mdev), dt);
+	if (dt == 0)
+		dev_err(DEV, "meta-data IO operation timed out\n");
+}
+
 static int _drbd_md_sync_page_io(struct drbd_conf *mdev,
 				 struct drbd_backing_dev *bdev,
 				 struct page *page, sector_t sector,
 				 int rw, int size)
 {
 	struct bio *bio;
-	struct drbd_md_io md_io;
 	int ok;
 
-	md_io.mdev = mdev;
-	init_completion(&md_io.event);
-	md_io.error = 0;
+	mdev->md_io.done = 0;
+	mdev->md_io.error = -ENODEV;
 
 	if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
 		rw |= REQ_FUA | REQ_FLUSH;
 	rw |= REQ_SYNC;
 
-	bio = bio_alloc(GFP_NOIO, 1);
+	bio = bio_alloc_drbd(GFP_NOIO);
 	bio->bi_bdev = bdev->md_bdev;
 	bio->bi_sector = sector;
 	ok = (bio_add_page(bio, page, size, 0) == size);
 	if (!ok)
 		goto out;
-	bio->bi_private = &md_io;
+	bio->bi_private = &mdev->md_io;
 	bio->bi_end_io = drbd_md_io_complete;
 	bio->bi_rw = rw;
 
+	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* Corresponding put_ldev in drbd_md_io_complete() */
+		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
+		ok = 0;
+		goto out;
+	}
+
+	bio_get(bio); /* one bio_put() is in the completion handler */
+	atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */
 	if (drbd_insert_fault(mdev, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
 		bio_endio(bio, -EIO);
 	else
 		submit_bio(rw, bio);
-	wait_for_completion(&md_io.event);
-	ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0;
+	wait_until_done_or_disk_failure(mdev, bdev, &mdev->md_io.done);
+	ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0;
 
  out:
 	bio_put(bio);
@@ -111,7 +152,7 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
 	int offset = 0;
 	struct page *iop = mdev->md_io_page;
 
-	D_ASSERT(mutex_is_locked(&mdev->md_io_mutex));
+	D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1);
 
 	BUG_ON(!bdev->md_bdev);
 
@@ -328,8 +369,13 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 		return 1;
 	}
 
-	mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */
-	buffer = (struct al_transaction *)page_address(mdev->md_io_page);
+	buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */
+	if (!buffer) {
+		dev_err(DEV, "disk failed while waiting for md_io buffer\n");
+		complete(&((struct update_al_work *)w)->event);
+		put_ldev(mdev);
+		return 1;
+	}
 
 	buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
 	buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
@@ -374,7 +420,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
 	D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
 	mdev->al_tr_number++;
 
-	mutex_unlock(&mdev->md_io_mutex);
+	drbd_md_put_buffer(mdev);
 
 	complete(&((struct update_al_work *)w)->event);
 	put_ldev(mdev);
@@ -443,8 +489,9 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 	/* lock out all other meta data io for now,
 	 * and make sure the page is mapped.
 	 */
-	mutex_lock(&mdev->md_io_mutex);
-	buffer = page_address(mdev->md_io_page);
+	buffer = drbd_md_get_buffer(mdev);
+	if (!buffer)
+		return 0;
 
 	/* Find the valid transaction in the log */
 	for (i = 0; i <= mx; i++) {
@@ -452,7 +499,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 		if (rv == 0)
 			continue;
 		if (rv == -1) {
-			mutex_unlock(&mdev->md_io_mutex);
+			drbd_md_put_buffer(mdev);
 			return 0;
 		}
 		cnr = be32_to_cpu(buffer->tr_number);
@@ -478,7 +525,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 
 	if (!found_valid) {
 		dev_warn(DEV, "No usable activity log found.\n");
-		mutex_unlock(&mdev->md_io_mutex);
+		drbd_md_put_buffer(mdev);
 		return 1;
 	}
 
@@ -493,7 +540,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 		rv = drbd_al_read_tr(mdev, bdev, buffer, i);
 		ERR_IF(rv == 0) goto cancel;
 		if (rv == -1) {
-			mutex_unlock(&mdev->md_io_mutex);
+			drbd_md_put_buffer(mdev);
 			return 0;
 		}
 
@@ -534,7 +581,7 @@ cancel:
 		mdev->al_tr_pos = 0;
 
 	/* ok, we are done with it */
-	mutex_unlock(&mdev->md_io_mutex);
+	drbd_md_put_buffer(mdev);
 
 	dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
 	     transactions, active_extents);
@@ -671,16 +718,20 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector,
 			else
 				ext->rs_failed += count;
 			if (ext->rs_left < ext->rs_failed) {
-				dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
-				    "rs_failed=%d count=%d\n",
+				dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d "
+				    "rs_failed=%d count=%d cstate=%s\n",
 				     (unsigned long long)sector,
 				     ext->lce.lc_number, ext->rs_left,
-				     ext->rs_failed, count);
-				dump_stack();
-
-				lc_put(mdev->resync, &ext->lce);
-				drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
-				return;
+				     ext->rs_failed, count,
+				     drbd_conn_str(mdev->state.conn));
+
+				/* We don't expect to be able to clear more bits
+				 * than have been set when we originally counted
+				 * the set bits to cache that value in ext->rs_left.
+				 * Whatever the reason (disconnect during resync,
+				 * delayed local completion of an application write),
+				 * try to fix it up by recounting here. */
+				ext->rs_left = drbd_bm_e_weight(mdev, enr);
 			}
 		} else {
 			/* Normally this element should be in the cache,
@@ -1192,6 +1243,7 @@ int drbd_rs_del_all(struct drbd_conf *mdev)
 		put_ldev(mdev);
 	}
 	spin_unlock_irq(&mdev->al_lock);
+	wake_up(&mdev->al_wait);
 
 	return 0;
 }
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 912f585a760f..b5c5ff53cb57 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -205,7 +205,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev)
 static void bm_store_page_idx(struct page *page, unsigned long idx)
 {
 	BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
-	page_private(page) |= idx;
+	set_page_private(page, idx);
 }
 
 static unsigned long bm_page_to_idx(struct page *page)
@@ -289,25 +289,25 @@ static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
 	return page_nr;
 }
 
-static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km)
+static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
 {
 	struct page *page = b->bm_pages[idx];
-	return (unsigned long *) kmap_atomic(page, km);
+	return (unsigned long *) kmap_atomic(page);
 }
 
 static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
 {
-	return __bm_map_pidx(b, idx, KM_IRQ1);
+	return __bm_map_pidx(b, idx);
 }
 
-static void __bm_unmap(unsigned long *p_addr, const enum km_type km)
+static void __bm_unmap(unsigned long *p_addr)
 {
-	kunmap_atomic(p_addr, km);
+	kunmap_atomic(p_addr);
 };
 
 static void bm_unmap(unsigned long *p_addr)
 {
-	return __bm_unmap(p_addr, KM_IRQ1);
+	return __bm_unmap(p_addr);
 }
 
 /* long word offset of _bitmap_ sector */
@@ -543,15 +543,15 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
 
 	/* all but last page */
 	for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
-		p_addr = __bm_map_pidx(b, idx, KM_USER0);
+		p_addr = __bm_map_pidx(b, idx);
 		for (i = 0; i < LWPP; i++)
 			bits += hweight_long(p_addr[i]);
-		__bm_unmap(p_addr, KM_USER0);
+		__bm_unmap(p_addr);
 		cond_resched();
 	}
 	/* last (or only) page */
 	last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
-	p_addr = __bm_map_pidx(b, idx, KM_USER0);
+	p_addr = __bm_map_pidx(b, idx);
 	for (i = 0; i < last_word; i++)
 		bits += hweight_long(p_addr[i]);
 	p_addr[last_word] &= cpu_to_lel(mask);
@@ -559,7 +559,7 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b)
 	/* 32bit arch, may have an unused padding long */
 	if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
 		p_addr[last_word+1] = 0;
-	__bm_unmap(p_addr, KM_USER0);
+	__bm_unmap(p_addr);
 	return bits;
 }
 
@@ -886,12 +886,21 @@ void drbd_bm_clear_all(struct drbd_conf *mdev)
 struct bm_aio_ctx {
 	struct drbd_conf *mdev;
 	atomic_t in_flight;
-	struct completion done;
+	unsigned int done;
 	unsigned flags;
 #define BM_AIO_COPY_PAGES	1
 	int error;
+	struct kref kref;
 };
 
+static void bm_aio_ctx_destroy(struct kref *kref)
+{
+	struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref);
+
+	put_ldev(ctx->mdev);
+	kfree(ctx);
+}
+
 /* bv_page may be a copy, or may be the original */
 static void bm_async_io_complete(struct bio *bio, int error)
 {
@@ -930,20 +939,21 @@ static void bm_async_io_complete(struct bio *bio, int error)
 
 	bm_page_unlock_io(mdev, idx);
 
-	/* FIXME give back to page pool */
 	if (ctx->flags & BM_AIO_COPY_PAGES)
-		put_page(bio->bi_io_vec[0].bv_page);
+		mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool);
 
 	bio_put(bio);
 
-	if (atomic_dec_and_test(&ctx->in_flight))
-		complete(&ctx->done);
+	if (atomic_dec_and_test(&ctx->in_flight)) {
+		ctx->done = 1;
+		wake_up(&mdev->misc_wait);
+		kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+	}
 }
 
 static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local)
 {
-	/* we are process context. we always get a bio */
-	struct bio *bio = bio_alloc(GFP_KERNEL, 1);
+	struct bio *bio = bio_alloc_drbd(GFP_NOIO);
 	struct drbd_conf *mdev = ctx->mdev;
 	struct drbd_bitmap *b = mdev->bitmap;
 	struct page *page;
@@ -966,21 +976,21 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
 	bm_set_page_unchanged(b->bm_pages[page_nr]);
 
 	if (ctx->flags & BM_AIO_COPY_PAGES) {
-		/* FIXME alloc_page is good enough for now, but actually needs
-		 * to use pre-allocated page pool */
 		void *src, *dest;
-		page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT);
-		dest = kmap_atomic(page, KM_USER0);
-		src = kmap_atomic(b->bm_pages[page_nr], KM_USER1);
+		page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT);
+		dest = kmap_atomic(page);
+		src = kmap_atomic(b->bm_pages[page_nr]);
 		memcpy(dest, src, PAGE_SIZE);
-		kunmap_atomic(src, KM_USER1);
-		kunmap_atomic(dest, KM_USER0);
+		kunmap_atomic(src);
+		kunmap_atomic(dest);
 		bm_store_page_idx(page, page_nr);
 	} else
 		page = b->bm_pages[page_nr];
 
 	bio->bi_bdev = mdev->ldev->md_bdev;
 	bio->bi_sector = on_disk_sector;
+	/* bio_add_page of a single page to an empty bio will always succeed,
+	 * according to api.  Do we want to assert that? */
 	bio_add_page(bio, page, len, 0);
 	bio->bi_private = ctx;
 	bio->bi_end_io = bm_async_io_complete;
@@ -999,14 +1009,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must
 /*
  * bm_rw: read/write the whole bitmap from/to its on disk location.
  */
-static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local)
+static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
 {
-	struct bm_aio_ctx ctx = {
-		.mdev = mdev,
-		.in_flight = ATOMIC_INIT(1),
-		.done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
-		.flags = lazy_writeout_upper_idx ? BM_AIO_COPY_PAGES : 0,
-	};
+	struct bm_aio_ctx *ctx;
 	struct drbd_bitmap *b = mdev->bitmap;
 	int num_pages, i, count = 0;
 	unsigned long now;
@@ -1021,7 +1026,27 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
 	 * For lazy writeout, we don't care for ongoing changes to the bitmap,
 	 * as we submit copies of pages anyways.
 	 */
-	if (!ctx.flags)
+
+	ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+	if (!ctx)
+		return -ENOMEM;
+
+	*ctx = (struct bm_aio_ctx) {
+		.mdev = mdev,
+		.in_flight = ATOMIC_INIT(1),
+		.done = 0,
+		.flags = flags,
+		.error = 0,
+		.kref = { ATOMIC_INIT(2) },
+	};
+
+	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
+		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
+		kfree(ctx);
+		return -ENODEV;
+	}
+
+	if (!ctx->flags)
 		WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
 
 	num_pages = b->bm_number_of_pages;
@@ -1046,29 +1071,38 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
 				continue;
 			}
 		}
-		atomic_inc(&ctx.in_flight);
-		bm_page_io_async(&ctx, i, rw);
+		atomic_inc(&ctx->in_flight);
+		bm_page_io_async(ctx, i, rw);
 		++count;
 		cond_resched();
 	}
 
 	/*
-	 * We initialize ctx.in_flight to one to make sure bm_async_io_complete
-	 * will not complete() early, and decrement / test it here.  If there
+	 * We initialize ctx->in_flight to one to make sure bm_async_io_complete
+	 * will not set ctx->done early, and decrement / test it here.  If there
 	 * are still some bios in flight, we need to wait for them here.
+	 * If all IO is done already (or nothing had been submitted), there is
+	 * no need to wait.  Still, we need to put the kref associated with the
+	 * "in_flight reached zero, all done" event.
 	 */
-	if (!atomic_dec_and_test(&ctx.in_flight))
-		wait_for_completion(&ctx.done);
+	if (!atomic_dec_and_test(&ctx->in_flight))
+		wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
+	else
+		kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+
 	dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
 			rw == WRITE ? "WRITE" : "READ",
 			count, jiffies - now);
 
-	if (ctx.error) {
+	if (ctx->error) {
 		dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
 		drbd_chk_io_error(mdev, 1, true);
-		err = -EIO; /* ctx.error ? */
+		err = -EIO; /* ctx->error ? */
 	}
 
+	if (atomic_read(&ctx->in_flight))
+		err = -EIO; /* Disk failed during IO... */
+
 	now = jiffies;
 	if (rw == WRITE) {
 		drbd_md_flush(mdev);
@@ -1082,6 +1116,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
 	dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
 	     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
 
+	kref_put(&ctx->kref, &bm_aio_ctx_destroy);
 	return err;
 }
 
@@ -1091,7 +1126,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id
  */
 int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
 {
-	return bm_rw(mdev, READ, 0);
+	return bm_rw(mdev, READ, 0, 0);
 }
 
 /**
@@ -1102,7 +1137,7 @@ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local)
  */
 int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
 {
-	return bm_rw(mdev, WRITE, 0);
+	return bm_rw(mdev, WRITE, 0, 0);
 }
 
 /**
@@ -1112,7 +1147,23 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
  */
 int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local)
 {
-	return bm_rw(mdev, WRITE, upper_idx);
+	return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, upper_idx);
+}
+
+/**
+ * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
+ * @mdev:	DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
+ * In contrast to drbd_bm_write(), this will copy the bitmap pages
+ * to temporary writeout pages. It is intended to trigger a full write-out
+ * while still allowing the bitmap to change, for example if a resync or online
+ * verify is aborted due to a failed peer disk, while local IO continues, or
+ * pending resync acks are still being processed.
+ */
+int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local)
+{
+	return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0);
 }
 
 
@@ -1130,28 +1181,45 @@ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(l
  */
 int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local)
 {
-	struct bm_aio_ctx ctx = {
+	struct bm_aio_ctx *ctx;
+	int err;
+
+	if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
+		dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
+		return 0;
+	}
+
+	ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO);
+	if (!ctx)
+		return -ENOMEM;
+
+	*ctx = (struct bm_aio_ctx) {
 		.mdev = mdev,
 		.in_flight = ATOMIC_INIT(1),
-		.done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
+		.done = 0,
 		.flags = BM_AIO_COPY_PAGES,
+		.error = 0,
+		.kref = { ATOMIC_INIT(2) },
 	};
 
-	if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) {
-		dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx);
-		return 0;
+	if (!get_ldev_if_state(mdev, D_ATTACHING)) {  /* put is in bm_aio_ctx_destroy() */
+		dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n");
+		kfree(ctx);
+		return -ENODEV;
 	}
 
-	bm_page_io_async(&ctx, idx, WRITE_SYNC);
-	wait_for_completion(&ctx.done);
+	bm_page_io_async(ctx, idx, WRITE_SYNC);
+	wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done);
 
-	if (ctx.error)
+	if (ctx->error)
 		drbd_chk_io_error(mdev, 1, true);
 		/* that should force detach, so the in memory bitmap will be
 		 * gone in a moment as well. */
 
 	mdev->bm_writ_cnt++;
-	return ctx.error;
+	err = atomic_read(&ctx->in_flight) ? -EIO : ctx->error;
+	kref_put(&ctx->kref, &bm_aio_ctx_destroy);
+	return err;
 }
 
 /* NOTE
@@ -1163,7 +1231,7 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc
  * this returns a bit number, NOT a sector!
  */
 static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
-	const int find_zero_bit, const enum km_type km)
+	const int find_zero_bit)
 {
 	struct drbd_bitmap *b = mdev->bitmap;
 	unsigned long *p_addr;
@@ -1178,7 +1246,7 @@ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
 		while (bm_fo < b->bm_bits) {
 			/* bit offset of the first bit in the page */
 			bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
-			p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km);
+			p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo));
 
 			if (find_zero_bit)
 				i = find_next_zero_bit_le(p_addr,
@@ -1187,7 +1255,7 @@ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo,
 				i = find_next_bit_le(p_addr,
 						PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
 
-			__bm_unmap(p_addr, km);
+			__bm_unmap(p_addr);
 			if (i < PAGE_SIZE*8) {
 				bm_fo = bit_offset + i;
 				if (bm_fo >= b->bm_bits)
@@ -1215,7 +1283,7 @@ static unsigned long bm_find_next(struct drbd_conf *mdev,
 	if (BM_DONT_TEST & b->bm_flags)
 		bm_print_lock_info(mdev);
 
-	i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1);
+	i = __bm_find_next(mdev, bm_fo, find_zero_bit);
 
 	spin_unlock_irq(&b->bm_lock);
 	return i;
@@ -1239,13 +1307,13 @@ unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo
 unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo)
 {
 	/* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
-	return __bm_find_next(mdev, bm_fo, 0, KM_USER1);
+	return __bm_find_next(mdev, bm_fo, 0);
 }
 
 unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo)
 {
 	/* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */
-	return __bm_find_next(mdev, bm_fo, 1, KM_USER1);
+	return __bm_find_next(mdev, bm_fo, 1);
 }
 
 /* returns number of bits actually changed.
@@ -1273,14 +1341,14 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 		unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
 		if (page_nr != last_page_nr) {
 			if (p_addr)
-				__bm_unmap(p_addr, KM_IRQ1);
+				__bm_unmap(p_addr);
 			if (c < 0)
 				bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
 			else if (c > 0)
 				bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
 			changed_total += c;
 			c = 0;
-			p_addr = __bm_map_pidx(b, page_nr, KM_IRQ1);
+			p_addr = __bm_map_pidx(b, page_nr);
 			last_page_nr = page_nr;
 		}
 		if (val)
@@ -1289,7 +1357,7 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s,
 			c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
 	}
 	if (p_addr)
-		__bm_unmap(p_addr, KM_IRQ1);
+		__bm_unmap(p_addr);
 	if (c < 0)
 		bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
 	else if (c > 0)
@@ -1342,13 +1410,13 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
 {
 	int i;
 	int bits;
-	unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1);
+	unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
 	for (i = first_word; i < last_word; i++) {
 		bits = hweight_long(paddr[i]);
 		paddr[i] = ~0UL;
 		b->bm_set += BITS_PER_LONG - bits;
 	}
-	kunmap_atomic(paddr, KM_IRQ1);
+	kunmap_atomic(paddr);
 }
 
 /* Same thing as drbd_bm_set_bits,
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 8d680562ba73..02f013a073a7 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -712,7 +712,6 @@ struct drbd_request {
 	struct list_head tl_requests; /* ring list in the transfer log */
 	struct bio *master_bio;       /* master bio pointer */
 	unsigned long rq_state; /* see comments above _req_mod() */
-	int seq_num;
 	unsigned long start_time;
 };
 
@@ -851,6 +850,7 @@ enum {
 	NEW_CUR_UUID,		/* Create new current UUID when thawing IO */
 	AL_SUSPENDED,		/* Activity logging is currently suspended. */
 	AHEAD_TO_SYNC_SOURCE,   /* Ahead -> SyncSource queued */
+	STATE_SENT,		/* Do not change state/UUIDs while this is set */
 };
 
 struct drbd_bitmap; /* opaque for drbd_conf */
@@ -862,31 +862,30 @@ enum bm_flag {
 	BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */
 
 	/* currently locked for bulk operation */
-	BM_LOCKED_MASK = 0x7,
+	BM_LOCKED_MASK = 0xf,
 
 	/* in detail, that is: */
 	BM_DONT_CLEAR = 0x1,
 	BM_DONT_SET   = 0x2,
 	BM_DONT_TEST  = 0x4,
 
+	/* so we can mark it locked for bulk operation,
+	 * and still allow all non-bulk operations */
+	BM_IS_LOCKED  = 0x8,
+
 	/* (test bit, count bit) allowed (common case) */
-	BM_LOCKED_TEST_ALLOWED = 0x3,
+	BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED,
 
 	/* testing bits, as well as setting new bits allowed, but clearing bits
 	 * would be unexpected.  Used during bitmap receive.  Setting new bits
 	 * requires sending of "out-of-sync" information, though. */
-	BM_LOCKED_SET_ALLOWED = 0x1,
+	BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED,
 
-	/* clear is not expected while bitmap is locked for bulk operation */
+	/* for drbd_bm_write_copy_pages, everything is allowed,
+	 * only concurrent bulk operations are locked out. */
+	BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED,
 };
 
-
-/* TODO sort members for performance
- * MAYBE group them further */
-
-/* THINK maybe we actually want to use the default "event/%s" worker threads
- * or similar in linux 2.6, which uses per cpu data and threads.
- */
 struct drbd_work_queue {
 	struct list_head q;
 	struct semaphore s; /* producers up it, worker down()s it */
@@ -938,8 +937,7 @@ struct drbd_backing_dev {
 };
 
 struct drbd_md_io {
-	struct drbd_conf *mdev;
-	struct completion event;
+	unsigned int done;
 	int error;
 };
 
@@ -1022,6 +1020,7 @@ struct drbd_conf {
 	struct drbd_tl_epoch *newest_tle;
 	struct drbd_tl_epoch *oldest_tle;
 	struct list_head out_of_sequence_requests;
+	struct list_head barrier_acked_requests;
 	struct hlist_head *tl_hash;
 	unsigned int tl_hash_s;
 
@@ -1056,6 +1055,8 @@ struct drbd_conf {
 	struct crypto_hash *csums_tfm;
 	struct crypto_hash *verify_tfm;
 
+	unsigned long last_reattach_jif;
+	unsigned long last_reconnect_jif;
 	struct drbd_thread receiver;
 	struct drbd_thread worker;
 	struct drbd_thread asender;
@@ -1094,7 +1095,8 @@ struct drbd_conf {
 	wait_queue_head_t ee_wait;
 	struct page *md_io_page;	/* one page buffer for md_io */
 	struct page *md_io_tmpp;	/* for logical_block_size != 512 */
-	struct mutex md_io_mutex;	/* protects the md_io_buffer */
+	struct drbd_md_io md_io;
+	atomic_t md_io_in_use;		/* protects the md_io, md_io_page and md_io_tmpp */
 	spinlock_t al_lock;
 	wait_queue_head_t al_wait;
 	struct lru_cache *act_log;	/* activity log */
@@ -1228,8 +1230,8 @@ extern int drbd_send_uuids(struct drbd_conf *mdev);
 extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
 extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
 extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
-extern int _drbd_send_state(struct drbd_conf *mdev);
-extern int drbd_send_state(struct drbd_conf *mdev);
+extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s);
+extern int drbd_send_current_state(struct drbd_conf *mdev);
 extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
 			enum drbd_packets cmd, struct p_header80 *h,
 			size_t size, unsigned msg_flags);
@@ -1461,6 +1463,7 @@ extern int  drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
 extern int  drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
 extern int  drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
 extern int  drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
+extern int  drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
 extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
 		unsigned long al_enr);
 extern size_t	     drbd_bm_words(struct drbd_conf *mdev);
@@ -1493,11 +1496,38 @@ extern struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
 extern mempool_t *drbd_request_mempool;
 extern mempool_t *drbd_ee_mempool;
 
-extern struct page *drbd_pp_pool; /* drbd's page pool */
+/* drbd's page pool, used to buffer data received from the peer,
+ * or data requested by the peer.
+ *
+ * This does not have an emergency reserve.
+ *
+ * When allocating from this pool, it first takes pages from the pool.
+ * Only if the pool is depleted will try to allocate from the system.
+ *
+ * The assumption is that pages taken from this pool will be processed,
+ * and given back, "quickly", and then can be recycled, so we can avoid
+ * frequent calls to alloc_page(), and still will be able to make progress even
+ * under memory pressure.
+ */
+extern struct page *drbd_pp_pool;
 extern spinlock_t   drbd_pp_lock;
 extern int	    drbd_pp_vacant;
 extern wait_queue_head_t drbd_pp_wait;
 
+/* We also need a standard (emergency-reserve backed) page pool
+ * for meta data IO (activity log, bitmap).
+ * We can keep it global, as long as it is used as "N pages at a time".
+ * 128 should be plenty, currently we probably can get away with as few as 1.
+ */
+#define DRBD_MIN_POOL_PAGES	128
+extern mempool_t *drbd_md_io_page_pool;
+
+/* We also need to make sure we get a bio
+ * when we need it for housekeeping purposes */
+extern struct bio_set *drbd_md_io_bio_set;
+/* to allocate from that set */
+extern struct bio *bio_alloc_drbd(gfp_t gfp_mask);
+
 extern rwlock_t global_state_lock;
 
 extern struct drbd_conf *drbd_new_device(unsigned int minor);
@@ -1536,8 +1566,12 @@ extern void resume_next_sg(struct drbd_conf *mdev);
 extern void suspend_other_sg(struct drbd_conf *mdev);
 extern int drbd_resync_finished(struct drbd_conf *mdev);
 /* maybe rather drbd_main.c ? */
+extern void *drbd_md_get_buffer(struct drbd_conf *mdev);
+extern void drbd_md_put_buffer(struct drbd_conf *mdev);
 extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
-		struct drbd_backing_dev *bdev, sector_t sector, int rw);
+				struct drbd_backing_dev *bdev, sector_t sector, int rw);
+extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+					    unsigned int *done);
 extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
 extern void drbd_rs_controller_reset(struct drbd_conf *mdev);
 
@@ -1754,19 +1788,6 @@ static inline struct page *page_chain_next(struct page *page)
 #define page_chain_for_each_safe(page, n) \
 	for (; page && ({ n = page_chain_next(page); 1; }); page = n)
 
-static inline int drbd_bio_has_active_page(struct bio *bio)
-{
-	struct bio_vec *bvec;
-	int i;
-
-	__bio_for_each_segment(bvec, bio, i, 0) {
-		if (page_count(bvec->bv_page) > 1)
-			return 1;
-	}
-
-	return 0;
-}
-
 static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
 {
 	struct page *page = e->pages;
@@ -1777,7 +1798,6 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
 	return 0;
 }
 
-
 static inline void drbd_state_lock(struct drbd_conf *mdev)
 {
 	wait_event(mdev->misc_wait,
@@ -2230,7 +2250,7 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev,
 		 * Note: currently we don't support such large bitmaps on 32bit
 		 * arch anyways, but no harm done to be prepared for it here.
 		 */
-		unsigned int shift = mdev->rs_total >= (1ULL << 32) ? 16 : 10;
+		unsigned int shift = mdev->rs_total > UINT_MAX ? 16 : 10;
 		unsigned long left = *bits_left >> shift;
 		unsigned long total = 1UL + (mdev->rs_total >> shift);
 		unsigned long tmp = 1000UL - left * 1000UL/total;
@@ -2306,12 +2326,12 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev)
 	case D_OUTDATED:
 	case D_CONSISTENT:
 	case D_UP_TO_DATE:
+	case D_FAILED:
 		/* disk state is stable as well. */
 		break;
 
 	/* no new io accepted during tansitional states */
 	case D_ATTACHING:
-	case D_FAILED:
 	case D_NEGOTIATING:
 	case D_UNKNOWN:
 	case D_MASK:
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 211fc44f84be..920ede2829d6 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -139,6 +139,8 @@ struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
 struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
 mempool_t *drbd_request_mempool;
 mempool_t *drbd_ee_mempool;
+mempool_t *drbd_md_io_page_pool;
+struct bio_set *drbd_md_io_bio_set;
 
 /* I do not use a standard mempool, because:
    1) I want to hand out the pre-allocated objects first.
@@ -159,7 +161,24 @@ static const struct block_device_operations drbd_ops = {
 	.release = drbd_release,
 };
 
-#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0]))
+static void bio_destructor_drbd(struct bio *bio)
+{
+	bio_free(bio, drbd_md_io_bio_set);
+}
+
+struct bio *bio_alloc_drbd(gfp_t gfp_mask)
+{
+	struct bio *bio;
+
+	if (!drbd_md_io_bio_set)
+		return bio_alloc(gfp_mask, 1);
+
+	bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+	if (!bio)
+		return NULL;
+	bio->bi_destructor = bio_destructor_drbd;
+	return bio;
+}
 
 #ifdef __CHECKER__
 /* When checking with sparse, and this is an inline function, sparse will
@@ -208,6 +227,7 @@ static int tl_init(struct drbd_conf *mdev)
 	mdev->oldest_tle = b;
 	mdev->newest_tle = b;
 	INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
+	INIT_LIST_HEAD(&mdev->barrier_acked_requests);
 
 	mdev->tl_hash = NULL;
 	mdev->tl_hash_s = 0;
@@ -246,9 +266,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
 	new->n_writes = 0;
 
 	newest_before = mdev->newest_tle;
-	/* never send a barrier number == 0, because that is special-cased
-	 * when using TCQ for our write ordering code */
-	new->br_number = (newest_before->br_number+1) ?: 1;
+	new->br_number = newest_before->br_number+1;
 	if (mdev->newest_tle != new) {
 		mdev->newest_tle->next = new;
 		mdev->newest_tle = new;
@@ -311,7 +329,7 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
 	   These have been list_move'd to the out_of_sequence_requests list in
 	   _req_mod(, barrier_acked) above.
 	   */
-	list_del_init(&b->requests);
+	list_splice_init(&b->requests, &mdev->barrier_acked_requests);
 
 	nob = b->next;
 	if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) {
@@ -411,6 +429,23 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 		b = tmp;
 		list_splice(&carry_reads, &b->requests);
 	}
+
+	/* Actions operating on the disk state, also want to work on
+	   requests that got barrier acked. */
+	switch (what) {
+	case fail_frozen_disk_io:
+	case restart_frozen_disk_io:
+		list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
+			req = list_entry(le, struct drbd_request, tl_requests);
+			_req_mod(req, what);
+		}
+
+	case connection_lost_while_pending:
+	case resend:
+		break;
+	default:
+		dev_err(DEV, "what = %d in _tl_restart()\n", what);
+	}
 }
 
 
@@ -458,6 +493,38 @@ void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 }
 
 /**
+ * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL
+ * @mdev:	DRBD device.
+ */
+void tl_abort_disk_io(struct drbd_conf *mdev)
+{
+	struct drbd_tl_epoch *b;
+	struct list_head *le, *tle;
+	struct drbd_request *req;
+
+	spin_lock_irq(&mdev->req_lock);
+	b = mdev->oldest_tle;
+	while (b) {
+		list_for_each_safe(le, tle, &b->requests) {
+			req = list_entry(le, struct drbd_request, tl_requests);
+			if (!(req->rq_state & RQ_LOCAL_PENDING))
+				continue;
+			_req_mod(req, abort_disk_io);
+		}
+		b = b->next;
+	}
+
+	list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
+		req = list_entry(le, struct drbd_request, tl_requests);
+		if (!(req->rq_state & RQ_LOCAL_PENDING))
+			continue;
+		_req_mod(req, abort_disk_io);
+	}
+
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+/**
  * cl_wide_st_chg() - true if the state change is a cluster wide one
  * @mdev:	DRBD device.
  * @os:		old (current) state.
@@ -470,7 +537,7 @@ static int cl_wide_st_chg(struct drbd_conf *mdev,
 		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
 		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
 		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
-		  (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) ||
+		  (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
 		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
 		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
 }
@@ -509,8 +576,16 @@ static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
 static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
 						    union drbd_state,
 						    union drbd_state);
+enum sanitize_state_warnings {
+	NO_WARNING,
+	ABORTED_ONLINE_VERIFY,
+	ABORTED_RESYNC,
+	CONNECTION_LOST_NEGOTIATING,
+	IMPLICITLY_UPGRADED_DISK,
+	IMPLICITLY_UPGRADED_PDSK,
+};
 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
-				       union drbd_state ns, const char **warn_sync_abort);
+				       union drbd_state ns, enum sanitize_state_warnings *warn);
 int drbd_send_state_req(struct drbd_conf *,
 			union drbd_state, union drbd_state);
 
@@ -785,6 +860,13 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
 	if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
 		rv = SS_IN_TRANSIENT_STATE;
 
+	/* While establishing a connection only allow cstate to change.
+	   Delay/refuse role changes, detach attach etc... */
+	if (test_bit(STATE_SENT, &mdev->flags) &&
+	    !(os.conn == C_WF_REPORT_PARAMS ||
+	      (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
+		rv = SS_IN_TRANSIENT_STATE;
+
 	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
 		rv = SS_NEED_CONNECTION;
 
@@ -803,6 +885,21 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
 	return rv;
 }
 
+static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
+{
+	static const char *msg_table[] = {
+		[NO_WARNING] = "",
+		[ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
+		[ABORTED_RESYNC] = "Resync aborted.",
+		[CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
+		[IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
+		[IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
+	};
+
+	if (warn != NO_WARNING)
+		dev_warn(DEV, "%s\n", msg_table[warn]);
+}
+
 /**
  * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
  * @mdev:	DRBD device.
@@ -814,11 +911,14 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
  * to D_UNKNOWN. This rule and many more along those lines are in this function.
  */
 static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
-				       union drbd_state ns, const char **warn_sync_abort)
+				       union drbd_state ns, enum sanitize_state_warnings *warn)
 {
 	enum drbd_fencing_p fp;
 	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
 
+	if (warn)
+		*warn = NO_WARNING;
+
 	fp = FP_DONT_CARE;
 	if (get_ldev(mdev)) {
 		fp = mdev->ldev->dc.fencing;
@@ -833,18 +933,13 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 	/* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
 	 * If you try to go into some Sync* state, that shall fail (elsewhere). */
 	if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
-	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN)
+	    ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
 		ns.conn = os.conn;
 
 	/* we cannot fail (again) if we already detached */
 	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
 		ns.disk = D_DISKLESS;
 
-	/* if we are only D_ATTACHING yet,
-	 * we can (and should) go directly to D_DISKLESS. */
-	if (ns.disk == D_FAILED && os.disk == D_ATTACHING)
-		ns.disk = D_DISKLESS;
-
 	/* After C_DISCONNECTING only C_STANDALONE may follow */
 	if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
 		ns.conn = os.conn;
@@ -863,10 +958,9 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 	/* Abort resync if a disk fails/detaches */
 	if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
 	    (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
-		if (warn_sync_abort)
-			*warn_sync_abort =
-				os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
-				"Online-verify" : "Resync";
+		if (warn)
+			*warn =	os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
+				ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
 		ns.conn = C_CONNECTED;
 	}
 
@@ -877,7 +971,8 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 			ns.disk = mdev->new_state_tmp.disk;
 			ns.pdsk = mdev->new_state_tmp.pdsk;
 		} else {
-			dev_alert(DEV, "Connection lost while negotiating, no data!\n");
+			if (warn)
+				*warn = CONNECTION_LOST_NEGOTIATING;
 			ns.disk = D_DISKLESS;
 			ns.pdsk = D_UNKNOWN;
 		}
@@ -959,16 +1054,16 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state
 		ns.disk = disk_max;
 
 	if (ns.disk < disk_min) {
-		dev_warn(DEV, "Implicitly set disk from %s to %s\n",
-			 drbd_disk_str(ns.disk), drbd_disk_str(disk_min));
+		if (warn)
+			*warn = IMPLICITLY_UPGRADED_DISK;
 		ns.disk = disk_min;
 	}
 	if (ns.pdsk > pdsk_max)
 		ns.pdsk = pdsk_max;
 
 	if (ns.pdsk < pdsk_min) {
-		dev_warn(DEV, "Implicitly set pdsk from %s to %s\n",
-			 drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min));
+		if (warn)
+			*warn = IMPLICITLY_UPGRADED_PDSK;
 		ns.pdsk = pdsk_min;
 	}
 
@@ -1045,12 +1140,12 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
 {
 	union drbd_state os;
 	enum drbd_state_rv rv = SS_SUCCESS;
-	const char *warn_sync_abort = NULL;
+	enum sanitize_state_warnings ssw;
 	struct after_state_chg_work *ascw;
 
 	os = mdev->state;
 
-	ns = sanitize_state(mdev, os, ns, &warn_sync_abort);
+	ns = sanitize_state(mdev, os, ns, &ssw);
 
 	if (ns.i == os.i)
 		return SS_NOTHING_TO_DO;
@@ -1076,8 +1171,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
 		return rv;
 	}
 
-	if (warn_sync_abort)
-		dev_warn(DEV, "%s aborted.\n", warn_sync_abort);
+	print_sanitize_warnings(mdev, ssw);
 
 	{
 	char *pbp, pb[300];
@@ -1243,7 +1337,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
 		drbd_thread_stop_nowait(&mdev->receiver);
 
 	/* Upon network failure, we need to restart the receiver. */
-	if (os.conn > C_TEAR_DOWN &&
+	if (os.conn > C_WF_CONNECTION &&
 	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
 		drbd_thread_restart_nowait(&mdev->receiver);
 
@@ -1251,6 +1345,15 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
 	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
 		drbd_resume_al(mdev);
 
+	/* remember last connect and attach times so request_timer_fn() won't
+	 * kill newly established sessions while we are still trying to thaw
+	 * previously frozen IO */
+	if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
+		mdev->last_reconnect_jif = jiffies;
+	if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+	    ns.disk > D_NEGOTIATING)
+		mdev->last_reattach_jif = jiffies;
+
 	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
 	if (ascw) {
 		ascw->os = os;
@@ -1354,12 +1457,16 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	/* Here we have the actions that are performed after a
 	   state change. This function might sleep */
 
+	if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
+		mod_timer(&mdev->request_timer, jiffies + HZ);
+
 	nsm.i = -1;
 	if (ns.susp_nod) {
 		if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
 			what = resend;
 
-		if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING)
+		if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+		    ns.disk > D_NEGOTIATING)
 			what = restart_frozen_disk_io;
 
 		if (what != nothing)
@@ -1408,7 +1515,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	/* Do not change the order of the if above and the two below... */
 	if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) {      /* attach on the peer */
 		drbd_send_uuids(mdev);
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
 	}
 	/* No point in queuing send_bitmap if we don't have a connection
 	 * anymore, so check also the _current_ state, not only the new state
@@ -1441,11 +1548,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	}
 
 	if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
-		if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) {
+		if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
+		    mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
 			drbd_uuid_new_current(mdev);
 			drbd_send_uuids(mdev);
 		}
-
 		/* D_DISKLESS Peer becomes secondary */
 		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
 			/* We may still be Primary ourselves.
@@ -1473,14 +1580,14 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
 		drbd_send_sizes(mdev, 0, 0);  /* to start sync... */
 		drbd_send_uuids(mdev);
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
 	}
 
 	/* We want to pause/continue resync, tell peer. */
 	if (ns.conn >= C_CONNECTED &&
 	     ((os.aftr_isp != ns.aftr_isp) ||
 	      (os.user_isp != ns.user_isp)))
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
 
 	/* In case one of the isp bits got set, suspend other devices. */
 	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
@@ -1490,10 +1597,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	/* Make sure the peer gets informed about eventual state
 	   changes (ISP bits) while we were in WFReportParams. */
 	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
 
 	if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
 
 	/* We are in the progress to start a full sync... */
 	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
@@ -1513,33 +1620,38 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	/* first half of local IO error, failure to attach,
 	 * or administrative detach */
 	if (os.disk != D_FAILED && ns.disk == D_FAILED) {
-		enum drbd_io_error_p eh;
-		int was_io_error;
+		enum drbd_io_error_p eh = EP_PASS_ON;
+		int was_io_error = 0;
 		/* corresponding get_ldev was in __drbd_set_state, to serialize
-		 * our cleanup here with the transition to D_DISKLESS,
-		 * so it is safe to dreference ldev here. */
-		eh = mdev->ldev->dc.on_io_error;
-		was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
-
-		/* current state still has to be D_FAILED,
-		 * there is only one way out: to D_DISKLESS,
-		 * and that may only happen after our put_ldev below. */
-		if (mdev->state.disk != D_FAILED)
-			dev_err(DEV,
-				"ASSERT FAILED: disk is %s during detach\n",
-				drbd_disk_str(mdev->state.disk));
-
-		if (drbd_send_state(mdev))
-			dev_warn(DEV, "Notified peer that I am detaching my disk\n");
-		else
-			dev_err(DEV, "Sending state for detaching disk failed\n");
-
-		drbd_rs_cancel_all(mdev);
-
-		/* In case we want to get something to stable storage still,
-		 * this may be the last chance.
-		 * Following put_ldev may transition to D_DISKLESS. */
-		drbd_md_sync(mdev);
+		 * our cleanup here with the transition to D_DISKLESS.
+		 * But is is still not save to dreference ldev here, since
+		 * we might come from an failed Attach before ldev was set. */
+		if (mdev->ldev) {
+			eh = mdev->ldev->dc.on_io_error;
+			was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
+
+			/* Immediately allow completion of all application IO, that waits
+			   for completion from the local disk. */
+			tl_abort_disk_io(mdev);
+
+			/* current state still has to be D_FAILED,
+			 * there is only one way out: to D_DISKLESS,
+			 * and that may only happen after our put_ldev below. */
+			if (mdev->state.disk != D_FAILED)
+				dev_err(DEV,
+					"ASSERT FAILED: disk is %s during detach\n",
+					drbd_disk_str(mdev->state.disk));
+
+			if (ns.conn >= C_CONNECTED)
+				drbd_send_state(mdev, ns);
+
+			drbd_rs_cancel_all(mdev);
+
+			/* In case we want to get something to stable storage still,
+			 * this may be the last chance.
+			 * Following put_ldev may transition to D_DISKLESS. */
+			drbd_md_sync(mdev);
+		}
 		put_ldev(mdev);
 
 		if (was_io_error && eh == EP_CALL_HELPER)
@@ -1561,16 +1673,17 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
                 mdev->rs_failed = 0;
                 atomic_set(&mdev->rs_pending_cnt, 0);
 
-		if (drbd_send_state(mdev))
-			dev_warn(DEV, "Notified peer that I'm now diskless.\n");
+		if (ns.conn >= C_CONNECTED)
+			drbd_send_state(mdev, ns);
+
 		/* corresponding get_ldev in __drbd_set_state
 		 * this may finally trigger drbd_ldev_destroy. */
 		put_ldev(mdev);
 	}
 
 	/* Notify peer that I had a local IO error, and did not detached.. */
-	if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT)
-		drbd_send_state(mdev);
+	if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
+		drbd_send_state(mdev, ns);
 
 	/* Disks got bigger while they were detached */
 	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
@@ -1588,7 +1701,13 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	/* sync target done with resync.  Explicitly notify peer, even though
 	 * it should (at least for non-empty resyncs) already know itself. */
 	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
-		drbd_send_state(mdev);
+		drbd_send_state(mdev, ns);
+
+	/* Wake up role changes, that were delayed because of connection establishing */
+	if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
+		clear_bit(STATE_SENT, &mdev->flags);
+		wake_up(&mdev->state_wait);
+	}
 
 	/* This triggers bitmap writeout of potentially still unwritten pages
 	 * if the resync finished cleanly, or aborted because of peer disk
@@ -1598,8 +1717,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	 * No harm done if some bits change during this phase.
 	 */
 	if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
-		drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL,
-			"write from resync_finished", BM_LOCKED_SET_ALLOWED);
+		drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
+			"write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
 		put_ldev(mdev);
 	}
 
@@ -2057,7 +2176,11 @@ int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
 
 	D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
 
-	uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET;
+	uuid = mdev->ldev->md.uuid[UI_BITMAP];
+	if (uuid && uuid != UUID_JUST_CREATED)
+		uuid = uuid + UUID_NEW_BM_OFFSET;
+	else
+		get_random_bytes(&uuid, sizeof(u64));
 	drbd_uuid_set(mdev, UI_BITMAP, uuid);
 	drbd_print_uuids(mdev, "updated sync UUID");
 	drbd_md_sync(mdev);
@@ -2089,6 +2212,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
 		max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
 	}
 
+	/* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
+	if (mdev->agreed_pro_version <= 94)
+		max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+
 	p.d_size = cpu_to_be64(d_size);
 	p.u_size = cpu_to_be64(u_size);
 	p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
@@ -2102,10 +2229,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl
 }
 
 /**
- * drbd_send_state() - Sends the drbd state to the peer
+ * drbd_send_current_state() - Sends the drbd state to the peer
  * @mdev:	DRBD device.
  */
-int drbd_send_state(struct drbd_conf *mdev)
+int drbd_send_current_state(struct drbd_conf *mdev)
 {
 	struct socket *sock;
 	struct p_state p;
@@ -2131,6 +2258,37 @@ int drbd_send_state(struct drbd_conf *mdev)
 	return ok;
 }
 
+/**
+ * drbd_send_state() - After a state change, sends the new state to the peer
+ * @mdev:	DRBD device.
+ * @state:	the state to send, not necessarily the current state.
+ *
+ * Each state change queues an "after_state_ch" work, which will eventually
+ * send the resulting new state to the peer. If more state changes happen
+ * between queuing and processing of the after_state_ch work, we still
+ * want to send each intermediary state in the order it occurred.
+ */
+int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
+{
+	struct socket *sock;
+	struct p_state p;
+	int ok = 0;
+
+	mutex_lock(&mdev->data.mutex);
+
+	p.state = cpu_to_be32(state.i);
+	sock = mdev->data.socket;
+
+	if (likely(sock != NULL)) {
+		ok = _drbd_send_cmd(mdev, sock, P_STATE,
+				    (struct p_header80 *)&p, sizeof(p), 0);
+	}
+
+	mutex_unlock(&mdev->data.mutex);
+
+	return ok;
+}
+
 int drbd_send_state_req(struct drbd_conf *mdev,
 	union drbd_state mask, union drbd_state val)
 {
@@ -2615,7 +2773,7 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
 	struct bio_vec *bvec;
 	int i;
 	/* hint all but last page with MSG_MORE */
-	__bio_for_each_segment(bvec, bio, i, 0) {
+	bio_for_each_segment(bvec, bio, i) {
 		if (!_drbd_no_send_page(mdev, bvec->bv_page,
 				     bvec->bv_offset, bvec->bv_len,
 				     i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
@@ -2629,7 +2787,7 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
 	struct bio_vec *bvec;
 	int i;
 	/* hint all but last page with MSG_MORE */
-	__bio_for_each_segment(bvec, bio, i, 0) {
+	bio_for_each_segment(bvec, bio, i) {
 		if (!_drbd_send_page(mdev, bvec->bv_page,
 				     bvec->bv_offset, bvec->bv_len,
 				     i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
@@ -2695,8 +2853,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
 
 	p.sector   = cpu_to_be64(req->sector);
 	p.block_id = (unsigned long)req;
-	p.seq_num  = cpu_to_be32(req->seq_num =
-				 atomic_add_return(1, &mdev->packet_seq));
+	p.seq_num  = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
 
 	dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
 
@@ -2987,8 +3144,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev)
 	atomic_set(&mdev->rs_sect_in, 0);
 	atomic_set(&mdev->rs_sect_ev, 0);
 	atomic_set(&mdev->ap_in_flight, 0);
+	atomic_set(&mdev->md_io_in_use, 0);
 
-	mutex_init(&mdev->md_io_mutex);
 	mutex_init(&mdev->data.mutex);
 	mutex_init(&mdev->meta.mutex);
 	sema_init(&mdev->data.work.s, 0);
@@ -3126,6 +3283,10 @@ static void drbd_destroy_mempools(void)
 
 	/* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */
 
+	if (drbd_md_io_bio_set)
+		bioset_free(drbd_md_io_bio_set);
+	if (drbd_md_io_page_pool)
+		mempool_destroy(drbd_md_io_page_pool);
 	if (drbd_ee_mempool)
 		mempool_destroy(drbd_ee_mempool);
 	if (drbd_request_mempool)
@@ -3139,6 +3300,8 @@ static void drbd_destroy_mempools(void)
 	if (drbd_al_ext_cache)
 		kmem_cache_destroy(drbd_al_ext_cache);
 
+	drbd_md_io_bio_set   = NULL;
+	drbd_md_io_page_pool = NULL;
 	drbd_ee_mempool      = NULL;
 	drbd_request_mempool = NULL;
 	drbd_ee_cache        = NULL;
@@ -3162,6 +3325,8 @@ static int drbd_create_mempools(void)
 	drbd_bm_ext_cache    = NULL;
 	drbd_al_ext_cache    = NULL;
 	drbd_pp_pool         = NULL;
+	drbd_md_io_page_pool = NULL;
+	drbd_md_io_bio_set   = NULL;
 
 	/* caches */
 	drbd_request_cache = kmem_cache_create(
@@ -3185,6 +3350,16 @@ static int drbd_create_mempools(void)
 		goto Enomem;
 
 	/* mempools */
+#ifdef COMPAT_HAVE_BIOSET_CREATE
+	drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
+	if (drbd_md_io_bio_set == NULL)
+		goto Enomem;
+#endif
+
+	drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
+	if (drbd_md_io_page_pool == NULL)
+		goto Enomem;
+
 	drbd_request_mempool = mempool_create(number,
 		mempool_alloc_slab, mempool_free_slab, drbd_request_cache);
 	if (drbd_request_mempool == NULL)
@@ -3262,6 +3437,8 @@ static void drbd_delete_device(unsigned int minor)
 	if (!mdev)
 		return;
 
+	del_timer_sync(&mdev->request_timer);
+
 	/* paranoia asserts */
 	if (mdev->open_cnt != 0)
 		dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
@@ -3666,8 +3843,10 @@ void drbd_md_sync(struct drbd_conf *mdev)
 	if (!get_ldev_if_state(mdev, D_FAILED))
 		return;
 
-	mutex_lock(&mdev->md_io_mutex);
-	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
+	buffer = drbd_md_get_buffer(mdev);
+	if (!buffer)
+		goto out;
+
 	memset(buffer, 0, 512);
 
 	buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
@@ -3698,7 +3877,8 @@ void drbd_md_sync(struct drbd_conf *mdev)
 	 * since we updated it on metadata. */
 	mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev);
 
-	mutex_unlock(&mdev->md_io_mutex);
+	drbd_md_put_buffer(mdev);
+out:
 	put_ldev(mdev);
 }
 
@@ -3718,8 +3898,9 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 	if (!get_ldev_if_state(mdev, D_ATTACHING))
 		return ERR_IO_MD_DISK;
 
-	mutex_lock(&mdev->md_io_mutex);
-	buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page);
+	buffer = drbd_md_get_buffer(mdev);
+	if (!buffer)
+		goto out;
 
 	if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
 		/* NOTE: can't do normal error processing here as this is
@@ -3780,7 +3961,8 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
 		mdev->sync_conf.al_extents = 127;
 
  err:
-	mutex_unlock(&mdev->md_io_mutex);
+	drbd_md_put_buffer(mdev);
+ out:
 	put_ldev(mdev);
 
 	return rv;
@@ -4183,12 +4365,11 @@ const char *drbd_buildtag(void)
 	static char buildtag[38] = "\0uilt-in";
 
 	if (buildtag[0] == 0) {
-#ifdef CONFIG_MODULES
-		if (THIS_MODULE != NULL)
-			sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
-		else
+#ifdef MODULE
+		sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
+#else
+		buildtag[0] = 'b';
 #endif
-			buildtag[0] = 'b';
 	}
 
 	return buildtag;
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index af2a25049bce..6d4de6a72e80 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -179,7 +179,7 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd)
 	dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
 
 	drbd_bcast_ev_helper(mdev, cmd);
-	ret = call_usermodehelper(usermode_helper, argv, envp, 1);
+	ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
 	if (ret)
 		dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
 				usermode_helper, cmd, mb,
@@ -289,7 +289,7 @@ static int _try_outdate_peer_async(void *data)
 	*/
 	spin_lock_irq(&mdev->req_lock);
 	ns = mdev->state;
-	if (ns.conn < C_WF_REPORT_PARAMS) {
+	if (ns.conn < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &mdev->flags)) {
 		ns.pdsk = nps;
 		_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
 	}
@@ -432,7 +432,7 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force)
 		/* if this was forced, we should consider sync */
 		if (forced)
 			drbd_send_uuids(mdev);
-		drbd_send_state(mdev);
+		drbd_send_current_state(mdev);
 	}
 
 	drbd_md_sync(mdev);
@@ -845,9 +845,10 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev)
 	   Because new from 8.3.8 onwards the peer can use multiple
 	   BIOs for a single peer_request */
 	if (mdev->state.conn >= C_CONNECTED) {
-		if (mdev->agreed_pro_version < 94)
-			peer = mdev->peer_max_bio_size;
-		else if (mdev->agreed_pro_version == 94)
+		if (mdev->agreed_pro_version < 94) {
+			peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+			/* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
+		} else if (mdev->agreed_pro_version == 94)
 			peer = DRBD_MAX_SIZE_H80_PACKET;
 		else /* drbd 8.3.8 onwards */
 			peer = DRBD_MAX_BIO_SIZE;
@@ -1032,7 +1033,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 		dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
 			(unsigned long long) drbd_get_max_capacity(nbc),
 			(unsigned long long) nbc->dc.disk_size);
-		retcode = ERR_DISK_TO_SMALL;
+		retcode = ERR_DISK_TOO_SMALL;
 		goto fail;
 	}
 
@@ -1046,7 +1047,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	}
 
 	if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
-		retcode = ERR_MD_DISK_TO_SMALL;
+		retcode = ERR_MD_DISK_TOO_SMALL;
 		dev_warn(DEV, "refusing attach: md-device too small, "
 		     "at least %llu sectors needed for this meta-disk type\n",
 		     (unsigned long long) min_md_device_sectors);
@@ -1057,7 +1058,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	 * (we may currently be R_PRIMARY with no local disk...) */
 	if (drbd_get_max_capacity(nbc) <
 	    drbd_get_capacity(mdev->this_bdev)) {
-		retcode = ERR_DISK_TO_SMALL;
+		retcode = ERR_DISK_TOO_SMALL;
 		goto fail;
 	}
 
@@ -1138,7 +1139,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp
 	if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
 	    drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) {
 		dev_warn(DEV, "refusing to truncate a consistent device\n");
-		retcode = ERR_DISK_TO_SMALL;
+		retcode = ERR_DISK_TOO_SMALL;
 		goto force_diskless_dec;
 	}
 
@@ -1336,17 +1337,34 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 {
 	enum drbd_ret_code retcode;
 	int ret;
+	struct detach dt = {};
+
+	if (!detach_from_tags(mdev, nlp->tag_list, &dt)) {
+		reply->ret_code = ERR_MANDATORY_TAG;
+		goto out;
+	}
+
+	if (dt.detach_force) {
+		drbd_force_state(mdev, NS(disk, D_FAILED));
+		reply->ret_code = SS_SUCCESS;
+		goto out;
+	}
+
 	drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */
+	drbd_md_get_buffer(mdev); /* make sure there is no in-flight meta-data IO */
 	retcode = drbd_request_state(mdev, NS(disk, D_FAILED));
+	drbd_md_put_buffer(mdev);
 	/* D_FAILED will transition to DISKLESS. */
 	ret = wait_event_interruptible(mdev->misc_wait,
 			mdev->state.disk != D_FAILED);
 	drbd_resume_io(mdev);
+
 	if ((int)retcode == (int)SS_IS_DISKLESS)
 		retcode = SS_NOTHING_TO_DO;
 	if (ret)
 		retcode = ERR_INTR;
 	reply->ret_code = retcode;
+out:
 	return 0;
 }
 
@@ -1711,7 +1729,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 
 	if (rs.no_resync && mdev->agreed_pro_version < 93) {
 		retcode = ERR_NEED_APV_93;
-		goto fail;
+		goto fail_ldev;
 	}
 
 	if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
@@ -1738,6 +1756,10 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
  fail:
 	reply->ret_code = retcode;
 	return 0;
+
+ fail_ldev:
+	put_ldev(mdev);
+	goto fail;
 }
 
 static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
@@ -1941,6 +1963,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
 
 	/* If there is still bitmap IO pending, probably because of a previous
 	 * resync just being finished, wait for it before requesting a new resync. */
+	drbd_suspend_io(mdev);
 	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
 
 	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
@@ -1959,6 +1982,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl
 
 		retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
 	}
+	drbd_resume_io(mdev);
 
 	reply->ret_code = retcode;
 	return 0;
@@ -1980,6 +2004,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re
 
 	/* If there is still bitmap IO pending, probably because of a previous
 	 * resync just being finished, wait for it before requesting a new resync. */
+	drbd_suspend_io(mdev);
 	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
 
 	retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
@@ -1998,6 +2023,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re
 		} else
 			retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S));
 	}
+	drbd_resume_io(mdev);
 
 	reply->ret_code = retcode;
 	return 0;
@@ -2170,11 +2196,13 @@ static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
 
 	/* If there is still bitmap IO pending, e.g. previous resync or verify
 	 * just being finished, wait for it before requesting a new resync. */
+	drbd_suspend_io(mdev);
 	wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
 
 	/* w_make_ov_request expects position to be aligned */
 	mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT;
 	reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
+	drbd_resume_io(mdev);
 	return 0;
 }
 
@@ -2297,7 +2325,7 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms
 		return;
 	}
 
-	if (!cap_raised(current_cap(), CAP_SYS_ADMIN)) {
+	if (!capable(CAP_SYS_ADMIN)) {
 		retcode = ERR_PERM;
 		goto fail;
 	}
@@ -2526,10 +2554,10 @@ void drbd_bcast_ee(struct drbd_conf *mdev,
 
 	page = e->pages;
 	page_chain_for_each(page) {
-		void *d = kmap_atomic(page, KM_USER0);
+		void *d = kmap_atomic(page);
 		unsigned l = min_t(unsigned, len, PAGE_SIZE);
 		memcpy(tl, d, l);
-		kunmap_atomic(d, KM_USER0);
+		kunmap_atomic(d);
 		tl = (unsigned short*)((char*)tl + l);
 		len -= l;
 		if (len == 0)
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 2959cdfb77f5..869bada2ed06 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -52,7 +52,7 @@ void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
 	if (unlikely(v >= 1000000)) {
 		/* cool: > GiByte/s */
 		seq_printf(seq, "%ld,", v / 1000000);
-		v /= 1000000;
+		v %= 1000000;
 		seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000);
 	} else if (likely(v >= 1000))
 		seq_printf(seq, "%ld,%03ld", v/1000, v % 1000);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 43beaca53179..ea4836e0ae98 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -466,6 +466,7 @@ static int drbd_accept(struct drbd_conf *mdev, const char **what,
 		goto out;
 	}
 	(*newsock)->ops  = sock->ops;
+	__module_get((*newsock)->ops->owner);
 
 out:
 	return err;
@@ -664,7 +665,7 @@ static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
 	timeo = mdev->net_conf->try_connect_int * HZ;
 	timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
 
-	s_listen->sk->sk_reuse    = 1; /* SO_REUSEADDR */
+	s_listen->sk->sk_reuse    = SK_CAN_REUSE; /* SO_REUSEADDR */
 	s_listen->sk->sk_rcvtimeo = timeo;
 	s_listen->sk->sk_sndtimeo = timeo;
 	drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
@@ -750,6 +751,7 @@ static int drbd_connect(struct drbd_conf *mdev)
 {
 	struct socket *s, *sock, *msock;
 	int try, h, ok;
+	enum drbd_state_rv rv;
 
 	D_ASSERT(!mdev->data.socket);
 
@@ -841,8 +843,8 @@ retry:
 		}
 	} while (1);
 
-	msock->sk->sk_reuse = 1; /* SO_REUSEADDR */
-	sock->sk->sk_reuse = 1; /* SO_REUSEADDR */
+	msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+	sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
 
 	sock->sk->sk_allocation = GFP_NOIO;
 	msock->sk->sk_allocation = GFP_NOIO;
@@ -888,25 +890,32 @@ retry:
 		}
 	}
 
-	if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS)
-		return 0;
-
 	sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
 	sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
 
 	atomic_set(&mdev->packet_seq, 0);
 	mdev->peer_seq = 0;
 
-	drbd_thread_start(&mdev->asender);
-
 	if (drbd_send_protocol(mdev) == -1)
 		return -1;
+	set_bit(STATE_SENT, &mdev->flags);
 	drbd_send_sync_param(mdev, &mdev->sync_conf);
 	drbd_send_sizes(mdev, 0, 0);
 	drbd_send_uuids(mdev);
-	drbd_send_state(mdev);
+	drbd_send_current_state(mdev);
 	clear_bit(USE_DEGR_WFC_T, &mdev->flags);
 	clear_bit(RESIZE_PENDING, &mdev->flags);
+
+	spin_lock_irq(&mdev->req_lock);
+	rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL);
+	if (mdev->state.conn != C_WF_REPORT_PARAMS)
+		clear_bit(STATE_SENT, &mdev->flags);
+	spin_unlock_irq(&mdev->req_lock);
+
+	if (rv < SS_SUCCESS)
+		return 0;
+
+	drbd_thread_start(&mdev->asender);
 	mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
 
 	return 1;
@@ -957,7 +966,7 @@ static void drbd_flush(struct drbd_conf *mdev)
 		rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL,
 					NULL);
 		if (rv) {
-			dev_err(DEV, "local disk flush failed with status %d\n", rv);
+			dev_info(DEV, "local disk flush failed with status %d\n", rv);
 			/* would rather check on EOPNOTSUPP, but that is not reliable.
 			 * don't try again for ANY return value != 0
 			 * if (rv == -EOPNOTSUPP) */
@@ -1001,13 +1010,14 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
 
 		if (epoch_size != 0 &&
 		    atomic_read(&epoch->active) == 0 &&
-		    test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) {
+		    (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
 			if (!(ev & EV_CLEANUP)) {
 				spin_unlock(&mdev->epoch_lock);
 				drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
 				spin_lock(&mdev->epoch_lock);
 			}
-			dec_unacked(mdev);
+			if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
+				dec_unacked(mdev);
 
 			if (mdev->current_epoch != epoch) {
 				next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
@@ -1096,7 +1106,11 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
 	/* In most cases, we will only need one bio.  But in case the lower
 	 * level restrictions happen to be different at this offset on this
 	 * side than those of the sending peer, we may need to submit the
-	 * request in more than one bio. */
+	 * request in more than one bio.
+	 *
+	 * Plain bio_alloc is good enough here, this is no DRBD internally
+	 * generated bio, but a bio allocated on behalf of the peer.
+	 */
 next_bio:
 	bio = bio_alloc(GFP_NOIO, nr_pages);
 	if (!bio) {
@@ -1583,6 +1597,24 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u
 	return ok;
 }
 
+static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e)
+{
+
+	struct drbd_epoch_entry *rs_e;
+	bool rv = 0;
+
+	spin_lock_irq(&mdev->req_lock);
+	list_for_each_entry(rs_e, &mdev->sync_ee, w.list) {
+		if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) {
+			rv = 1;
+			break;
+		}
+	}
+	spin_unlock_irq(&mdev->req_lock);
+
+	return rv;
+}
+
 /* Called from receive_Data.
  * Synchronize packets on sock with packets on msock.
  *
@@ -1826,6 +1858,9 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 	list_add(&e->w.list, &mdev->active_ee);
 	spin_unlock_irq(&mdev->req_lock);
 
+	if (mdev->state.conn == C_SYNC_TARGET)
+		wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e));
+
 	switch (mdev->net_conf->wire_protocol) {
 	case DRBD_PROT_C:
 		inc_unacked(mdev);
@@ -2420,7 +2455,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l
 			mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
 			mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1];
 
-			dev_info(DEV, "Did not got last syncUUID packet, corrected:\n");
+			dev_info(DEV, "Lost last syncUUID packet, corrected:\n");
 			drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]);
 
 			return -1;
@@ -2806,10 +2841,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi
 
 	if (apv >= 88) {
 		if (apv == 88) {
-			if (data_size > SHARED_SECRET_MAX) {
-				dev_err(DEV, "verify-alg too long, "
-				    "peer wants %u, accepting only %u byte\n",
-						data_size, SHARED_SECRET_MAX);
+			if (data_size > SHARED_SECRET_MAX || data_size == 0) {
+				dev_err(DEV, "verify-alg of wrong size, "
+					"peer wants %u, accepting only up to %u byte\n",
+					data_size, SHARED_SECRET_MAX);
 				return false;
 			}
 
@@ -3168,9 +3203,20 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 	os = ns = mdev->state;
 	spin_unlock_irq(&mdev->req_lock);
 
-	/* peer says his disk is uptodate, while we think it is inconsistent,
-	 * and this happens while we think we have a sync going on. */
-	if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE &&
+	/* If some other part of the code (asender thread, timeout)
+	 * already decided to close the connection again,
+	 * we must not "re-establish" it here. */
+	if (os.conn <= C_TEAR_DOWN)
+		return false;
+
+	/* If this is the "end of sync" confirmation, usually the peer disk
+	 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
+	 * set) resync started in PausedSyncT, or if the timing of pause-/
+	 * unpause-sync events has been "just right", the peer disk may
+	 * transition from D_CONSISTENT to D_UP_TO_DATE as well.
+	 */
+	if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
+	    real_peer_disk == D_UP_TO_DATE &&
 	    os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
 		/* If we are (becoming) SyncSource, but peer is still in sync
 		 * preparation, ignore its uptodate-ness to avoid flapping, it
@@ -3288,7 +3334,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned
 			/* Nowadays only used when forcing a node into primary role and
 			   setting its disk to UpToDate with that */
 			drbd_send_uuids(mdev);
-			drbd_send_state(mdev);
+			drbd_send_current_state(mdev);
 		}
 	}
 
@@ -3776,6 +3822,13 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 	if (mdev->state.conn == C_STANDALONE)
 		return;
 
+	/* We are about to start the cleanup after connection loss.
+	 * Make sure drbd_make_request knows about that.
+	 * Usually we should be in some network failure state already,
+	 * but just in case we are not, we fix it up here.
+	 */
+	drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+
 	/* asender does not clean up anything. it must not interfere, either */
 	drbd_thread_stop(&mdev->asender);
 	drbd_free_sock(mdev);
@@ -3803,8 +3856,6 @@ static void drbd_disconnect(struct drbd_conf *mdev)
 	atomic_set(&mdev->rs_pending_cnt, 0);
 	wake_up(&mdev->misc_wait);
 
-	del_timer(&mdev->request_timer);
-
 	/* make sure syncer is stopped and w_resume_next_sg queued */
 	del_timer_sync(&mdev->resync_timer);
 	resync_timer_fn((unsigned long)mdev);
@@ -4433,7 +4484,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
 
 	if (mdev->state.conn == C_AHEAD &&
 	    atomic_read(&mdev->ap_in_flight) == 0 &&
-	    !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags)) {
+	    !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) {
 		mdev->start_resync_timer.expires = jiffies + HZ;
 		add_timer(&mdev->start_resync_timer);
 	}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 4a0f314086e5..9c5c84946b05 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -37,6 +37,7 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req
 	const int rw = bio_data_dir(bio);
 	int cpu;
 	cpu = part_stat_lock();
+	part_round_stats(cpu, &mdev->vdisk->part0);
 	part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
 	part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
 	part_inc_in_flight(&mdev->vdisk->part0, rw);
@@ -214,8 +215,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
 {
 	const unsigned long s = req->rq_state;
 	struct drbd_conf *mdev = req->mdev;
-	/* only WRITES may end up here without a master bio (on barrier ack) */
-	int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE;
+	int rw = req->rq_state & RQ_WRITE ? WRITE : READ;
 
 	/* we must not complete the master bio, while it is
 	 *	still being processed by _drbd_send_zc_bio (drbd_send_dblock)
@@ -230,7 +230,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
 		return;
 	if (s & RQ_NET_PENDING)
 		return;
-	if (s & RQ_LOCAL_PENDING)
+	if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED))
 		return;
 
 	if (req->master_bio) {
@@ -277,6 +277,9 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
 		req->master_bio = NULL;
 	}
 
+	if (s & RQ_LOCAL_PENDING)
+		return;
+
 	if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
 		/* this is disconnected (local only) operation,
 		 * or protocol C P_WRITE_ACK,
@@ -429,7 +432,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		break;
 
 	case completed_ok:
-		if (bio_data_dir(req->master_bio) == WRITE)
+		if (req->rq_state & RQ_WRITE)
 			mdev->writ_cnt += req->size>>9;
 		else
 			mdev->read_cnt += req->size>>9;
@@ -438,7 +441,14 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		req->rq_state &= ~RQ_LOCAL_PENDING;
 
 		_req_may_be_done_not_susp(req, m);
-		put_ldev(mdev);
+		break;
+
+	case abort_disk_io:
+		req->rq_state |= RQ_LOCAL_ABORTED;
+		if (req->rq_state & RQ_WRITE)
+			_req_may_be_done_not_susp(req, m);
+		else
+			goto goto_queue_for_net_read;
 		break;
 
 	case write_completed_with_error:
@@ -447,7 +457,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 
 		__drbd_chk_io_error(mdev, false);
 		_req_may_be_done_not_susp(req, m);
-		put_ldev(mdev);
 		break;
 
 	case read_ahead_completed_with_error:
@@ -455,7 +464,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		req->rq_state |= RQ_LOCAL_COMPLETED;
 		req->rq_state &= ~RQ_LOCAL_PENDING;
 		_req_may_be_done_not_susp(req, m);
-		put_ldev(mdev);
 		break;
 
 	case read_completed_with_error:
@@ -467,7 +475,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		D_ASSERT(!(req->rq_state & RQ_NET_MASK));
 
 		__drbd_chk_io_error(mdev, false);
-		put_ldev(mdev);
+
+	goto_queue_for_net_read:
 
 		/* no point in retrying if there is no good remote data,
 		 * or we have no connection. */
@@ -556,10 +565,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		drbd_queue_work(&mdev->data.work, &req->w);
 		break;
 
-	case oos_handed_to_network:
-		/* actually the same */
+	case read_retry_remote_canceled:
 	case send_canceled:
-		/* treat it the same */
 	case send_failed:
 		/* real cleanup will be done from tl_clear.  just update flags
 		 * so it is no longer marked as on the worker queue */
@@ -589,17 +596,17 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		}
 		req->rq_state &= ~RQ_NET_QUEUED;
 		req->rq_state |= RQ_NET_SENT;
-		/* because _drbd_send_zc_bio could sleep, and may want to
-		 * dereference the bio even after the "write_acked_by_peer" and
-		 * "completed_ok" events came in, once we return from
-		 * _drbd_send_zc_bio (drbd_send_dblock), we have to check
-		 * whether it is done already, and end it.  */
 		_req_may_be_done_not_susp(req, m);
 		break;
 
-	case read_retry_remote_canceled:
+	case oos_handed_to_network:
+		/* Was not set PENDING, no longer QUEUED, so is now DONE
+		 * as far as this connection is concerned. */
 		req->rq_state &= ~RQ_NET_QUEUED;
-		/* fall through, in case we raced with drbd_disconnect */
+		req->rq_state |= RQ_NET_DONE;
+		_req_may_be_done_not_susp(req, m);
+		break;
+
 	case connection_lost_while_pending:
 		/* transfer log cleanup after connection loss */
 		/* assert something? */
@@ -616,8 +623,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 			_req_may_be_done(req, m); /* Allowed while state.susp */
 		break;
 
-	case write_acked_by_peer_and_sis:
-		req->rq_state |= RQ_NET_SIS;
 	case conflict_discarded_by_peer:
 		/* for discarded conflicting writes of multiple primaries,
 		 * there is no need to keep anything in the tl, potential
@@ -628,18 +633,15 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 			      (unsigned long long)req->sector, req->size);
 		req->rq_state |= RQ_NET_DONE;
 		/* fall through */
+	case write_acked_by_peer_and_sis:
 	case write_acked_by_peer:
+		if (what == write_acked_by_peer_and_sis)
+			req->rq_state |= RQ_NET_SIS;
 		/* protocol C; successfully written on peer.
-		 * Nothing to do here.
+		 * Nothing more to do here.
 		 * We want to keep the tl in place for all protocols, to cater
-		 * for volatile write-back caches on lower level devices.
-		 *
-		 * A barrier request is expected to have forced all prior
-		 * requests onto stable storage, so completion of a barrier
-		 * request could set NET_DONE right here, and not wait for the
-		 * P_BARRIER_ACK, but that is an unnecessary optimization. */
+		 * for volatile write-back caches on lower level devices. */
 
-		/* this makes it effectively the same as for: */
 	case recv_acked_by_peer:
 		/* protocol B; pretends to be successfully written on peer.
 		 * see also notes above in handed_over_to_network about
@@ -773,6 +775,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 	int local, remote, send_oos = 0;
 	int err = -EIO;
 	int ret = 0;
+	union drbd_state s;
 
 	/* allocate outside of all locks; */
 	req = drbd_req_new(mdev, bio);
@@ -834,8 +837,9 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 		drbd_al_begin_io(mdev, sector);
 	}
 
-	remote = remote && drbd_should_do_remote(mdev->state);
-	send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
+	s = mdev->state;
+	remote = remote && drbd_should_do_remote(s);
+	send_oos = rw == WRITE && drbd_should_send_oos(s);
 	D_ASSERT(!(remote && send_oos));
 
 	if (!(local || remote) && !is_susp(mdev->state)) {
@@ -867,7 +871,7 @@ allocate_barrier:
 
 	if (is_susp(mdev->state)) {
 		/* If we got suspended, use the retry mechanism of
-		   generic_make_request() to restart processing of this
+		   drbd_make_request() to restart processing of this
 		   bio. In the next call to drbd_make_request
 		   we sleep in inc_ap_bio() */
 		ret = 1;
@@ -1091,7 +1095,6 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
 	 */
 	D_ASSERT(bio->bi_size > 0);
 	D_ASSERT((bio->bi_size & 0x1ff) == 0);
-	D_ASSERT(bio->bi_idx == 0);
 
 	/* to make some things easier, force alignment of requests within the
 	 * granularity of our hash tables */
@@ -1099,8 +1102,9 @@ void drbd_make_request(struct request_queue *q, struct bio *bio)
 	e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT;
 
 	if (likely(s_enr == e_enr)) {
-		inc_ap_bio(mdev, 1);
-		drbd_make_request_common(mdev, bio, start_time);
+		do {
+			inc_ap_bio(mdev, 1);
+		} while (drbd_make_request_common(mdev, bio, start_time));
 		return;
 	}
 
@@ -1196,36 +1200,66 @@ void request_timer_fn(unsigned long data)
 	struct drbd_conf *mdev = (struct drbd_conf *) data;
 	struct drbd_request *req; /* oldest request */
 	struct list_head *le;
-	unsigned long et = 0; /* effective timeout = ko_count * timeout */
+	unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
+	unsigned long now;
 
 	if (get_net_conf(mdev)) {
-		et = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count;
+		if (mdev->state.conn >= C_WF_REPORT_PARAMS)
+			ent = mdev->net_conf->timeout*HZ/10
+				* mdev->net_conf->ko_count;
 		put_net_conf(mdev);
 	}
-	if (!et || mdev->state.conn < C_WF_REPORT_PARAMS)
+	if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */
+		dt = mdev->ldev->dc.disk_timeout * HZ / 10;
+		put_ldev(mdev);
+	}
+	et = min_not_zero(dt, ent);
+
+	if (!et)
 		return; /* Recurring timer stopped */
 
+	now = jiffies;
+
 	spin_lock_irq(&mdev->req_lock);
 	le = &mdev->oldest_tle->requests;
 	if (list_empty(le)) {
 		spin_unlock_irq(&mdev->req_lock);
-		mod_timer(&mdev->request_timer, jiffies + et);
+		mod_timer(&mdev->request_timer, now + et);
 		return;
 	}
 
 	le = le->prev;
 	req = list_entry(le, struct drbd_request, tl_requests);
-	if (time_is_before_eq_jiffies(req->start_time + et)) {
-		if (req->rq_state & RQ_NET_PENDING) {
-			dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
-			_drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL);
-		} else {
-			dev_warn(DEV, "Local backing block device frozen?\n");
-			mod_timer(&mdev->request_timer, jiffies + et);
-		}
-	} else {
-		mod_timer(&mdev->request_timer, req->start_time + et);
-	}
 
+	/* The request is considered timed out, if
+	 * - we have some effective timeout from the configuration,
+	 *   with above state restrictions applied,
+	 * - the oldest request is waiting for a response from the network
+	 *   resp. the local disk,
+	 * - the oldest request is in fact older than the effective timeout,
+	 * - the connection was established (resp. disk was attached)
+	 *   for longer than the timeout already.
+	 * Note that for 32bit jiffies and very stable connections/disks,
+	 * we may have a wrap around, which is catched by
+	 *   !time_in_range(now, last_..._jif, last_..._jif + timeout).
+	 *
+	 * Side effect: once per 32bit wrap-around interval, which means every
+	 * ~198 days with 250 HZ, we have a window where the timeout would need
+	 * to expire twice (worst case) to become effective. Good enough.
+	 */
+	if (ent && req->rq_state & RQ_NET_PENDING &&
+		 time_after(now, req->start_time + ent) &&
+		!time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) {
+		dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
+		_drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
+	}
+	if (dt && req->rq_state & RQ_LOCAL_PENDING &&
+		 time_after(now, req->start_time + dt) &&
+		!time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) {
+		dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n");
+		__drbd_chk_io_error(mdev, 1);
+	}
+	nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et;
 	spin_unlock_irq(&mdev->req_lock);
+	mod_timer(&mdev->request_timer, nt);
 }
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 68a234a5fdc5..3d2111919486 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -105,6 +105,7 @@ enum drbd_req_event {
 	read_completed_with_error,
 	read_ahead_completed_with_error,
 	write_completed_with_error,
+	abort_disk_io,
 	completed_ok,
 	resend,
 	fail_frozen_disk_io,
@@ -118,18 +119,21 @@ enum drbd_req_event {
  * same time, so we should hold the request lock anyways.
  */
 enum drbd_req_state_bits {
-	/* 210
-	 * 000: no local possible
-	 * 001: to be submitted
+	/* 3210
+	 * 0000: no local possible
+	 * 0001: to be submitted
 	 *    UNUSED, we could map: 011: submitted, completion still pending
-	 * 110: completed ok
-	 * 010: completed with error
+	 * 0110: completed ok
+	 * 0010: completed with error
+	 * 1001: Aborted (before completion)
+	 * 1x10: Aborted and completed -> free
 	 */
 	__RQ_LOCAL_PENDING,
 	__RQ_LOCAL_COMPLETED,
 	__RQ_LOCAL_OK,
+	__RQ_LOCAL_ABORTED,
 
-	/* 76543
+	/* 87654
 	 * 00000: no network possible
 	 * 00001: to be send
 	 * 00011: to be send, on worker queue
@@ -199,8 +203,9 @@ enum drbd_req_state_bits {
 #define RQ_LOCAL_PENDING   (1UL << __RQ_LOCAL_PENDING)
 #define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
 #define RQ_LOCAL_OK        (1UL << __RQ_LOCAL_OK)
+#define RQ_LOCAL_ABORTED   (1UL << __RQ_LOCAL_ABORTED)
 
-#define RQ_LOCAL_MASK      ((RQ_LOCAL_OK << 1)-1) /* 0x07 */
+#define RQ_LOCAL_MASK      ((RQ_LOCAL_ABORTED << 1)-1)
 
 #define RQ_NET_PENDING     (1UL << __RQ_NET_PENDING)
 #define RQ_NET_QUEUED      (1UL << __RQ_NET_QUEUED)
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 4d3e6f6213ba..620c70ff2231 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -70,11 +70,29 @@ rwlock_t global_state_lock;
 void drbd_md_io_complete(struct bio *bio, int error)
 {
 	struct drbd_md_io *md_io;
+	struct drbd_conf *mdev;
 
 	md_io = (struct drbd_md_io *)bio->bi_private;
+	mdev = container_of(md_io, struct drbd_conf, md_io);
+
 	md_io->error = error;
 
-	complete(&md_io->event);
+	/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
+	 * to timeout on the lower level device, and eventually detach from it.
+	 * If this io completion runs after that timeout expired, this
+	 * drbd_md_put_buffer() may allow us to finally try and re-attach.
+	 * During normal operation, this only puts that extra reference
+	 * down to 1 again.
+	 * Make sure we first drop the reference, and only then signal
+	 * completion, or we may (in drbd_al_read_log()) cycle so fast into the
+	 * next drbd_md_sync_page_io(), that we trigger the
+	 * ASSERT(atomic_read(&mdev->md_io_in_use) == 1) there.
+	 */
+	drbd_md_put_buffer(mdev);
+	md_io->done = 1;
+	wake_up(&mdev->misc_wait);
+	bio_put(bio);
+	put_ldev(mdev);
 }
 
 /* reads on behalf of the partner,
@@ -226,6 +244,7 @@ void drbd_endio_pri(struct bio *bio, int error)
 	spin_lock_irqsave(&mdev->req_lock, flags);
 	__req_mod(req, what, &m);
 	spin_unlock_irqrestore(&mdev->req_lock, flags);
+	put_ldev(mdev);
 
 	if (m.bio)
 		complete_master_bio(mdev, &m);
@@ -290,7 +309,7 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio *
 	sg_init_table(&sg, 1);
 	crypto_hash_init(&desc);
 
-	__bio_for_each_segment(bvec, bio, i, 0) {
+	bio_for_each_segment(bvec, bio, i) {
 		sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset);
 		crypto_hash_update(&desc, &sg, sg.length);
 	}
@@ -728,7 +747,7 @@ int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
 	}
 
 	drbd_start_resync(mdev, C_SYNC_SOURCE);
-	clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags);
+	clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags);
 	return 1;
 }
 
@@ -1519,14 +1538,14 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
 	}
 
 	drbd_state_lock(mdev);
-
+	write_lock_irq(&global_state_lock);
 	if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
+		write_unlock_irq(&global_state_lock);
 		drbd_state_unlock(mdev);
 		return;
 	}
 
-	write_lock_irq(&global_state_lock);
-	ns = mdev->state;
+	ns.i = mdev->state.i;
 
 	ns.aftr_isp = !_drbd_may_sync_now(mdev);
 
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
index 744f078f4dd8..cce7df367b79 100644
--- a/drivers/block/floppy.c
+++ b/drivers/block/floppy.c
@@ -202,7 +202,6 @@ static int slow_floppy;
 
 #include <asm/dma.h>
 #include <asm/irq.h>
-#include <asm/system.h>
 
 static int FLOPPY_IRQ = 6;
 static int FLOPPY_DMA = 2;
@@ -552,7 +551,7 @@ static void floppy_ready(void);
 static void floppy_start(void);
 static void process_fd_request(void);
 static void recalibrate_floppy(void);
-static void floppy_shutdown(unsigned long);
+static void floppy_shutdown(struct work_struct *);
 
 static int floppy_request_regions(int);
 static void floppy_release_regions(int);
@@ -589,6 +588,8 @@ static int buffer_max = -1;
 static struct floppy_fdc_state fdc_state[N_FDC];
 static int fdc;			/* current fdc */
 
+static struct workqueue_struct *floppy_wq;
+
 static struct floppy_struct *_floppy = floppy_type;
 static unsigned char current_drive;
 static long current_count_sectors;
@@ -630,16 +631,15 @@ static inline void set_debugt(void) { }
 static inline void debugt(const char *func, const char *msg) { }
 #endif /* DEBUGT */
 
-typedef void (*timeout_fn)(unsigned long);
-static DEFINE_TIMER(fd_timeout, floppy_shutdown, 0, 0);
 
+static DECLARE_DELAYED_WORK(fd_timeout, floppy_shutdown);
 static const char *timeout_message;
 
 static void is_alive(const char *func, const char *message)
 {
 	/* this routine checks whether the floppy driver is "alive" */
 	if (test_bit(0, &fdc_busy) && command_status < 2 &&
-	    !timer_pending(&fd_timeout)) {
+	    !delayed_work_pending(&fd_timeout)) {
 		DPRINT("%s: timeout handler died.  %s\n", func, message);
 	}
 }
@@ -667,15 +667,18 @@ static int output_log_pos;
 
 static void __reschedule_timeout(int drive, const char *message)
 {
+	unsigned long delay;
+
 	if (drive == current_reqD)
 		drive = current_drive;
-	del_timer(&fd_timeout);
+
 	if (drive < 0 || drive >= N_DRIVE) {
-		fd_timeout.expires = jiffies + 20UL * HZ;
+		delay = 20UL * HZ;
 		drive = 0;
 	} else
-		fd_timeout.expires = jiffies + UDP->timeout;
-	add_timer(&fd_timeout);
+		delay = UDP->timeout;
+
+	queue_delayed_work(floppy_wq, &fd_timeout, delay);
 	if (UDP->flags & FD_DEBUG)
 		DPRINT("reschedule timeout %s\n", message);
 	timeout_message = message;
@@ -873,7 +876,7 @@ static int lock_fdc(int drive, bool interruptible)
 
 	command_status = FD_COMMAND_NONE;
 
-	__reschedule_timeout(drive, "lock fdc");
+	reschedule_timeout(drive, "lock fdc");
 	set_fdc(drive);
 	return 0;
 }
@@ -881,23 +884,15 @@ static int lock_fdc(int drive, bool interruptible)
 /* unlocks the driver */
 static void unlock_fdc(void)
 {
-	unsigned long flags;
-
-	raw_cmd = NULL;
 	if (!test_bit(0, &fdc_busy))
 		DPRINT("FDC access conflict!\n");
 
-	if (do_floppy)
-		DPRINT("device interrupt still active at FDC release: %pf!\n",
-		       do_floppy);
+	raw_cmd = NULL;
 	command_status = FD_COMMAND_NONE;
-	spin_lock_irqsave(&floppy_lock, flags);
-	del_timer(&fd_timeout);
+	__cancel_delayed_work(&fd_timeout);
+	do_floppy = NULL;
 	cont = NULL;
 	clear_bit(0, &fdc_busy);
-	if (current_req || set_next_request())
-		do_fd_request(current_req->q);
-	spin_unlock_irqrestore(&floppy_lock, flags);
 	wake_up(&fdc_wait);
 }
 
@@ -969,26 +964,24 @@ static DECLARE_WORK(floppy_work, NULL);
 
 static void schedule_bh(void (*handler)(void))
 {
+	WARN_ON(work_pending(&floppy_work));
+
 	PREPARE_WORK(&floppy_work, (work_func_t)handler);
-	schedule_work(&floppy_work);
+	queue_work(floppy_wq, &floppy_work);
 }
 
-static DEFINE_TIMER(fd_timer, NULL, 0, 0);
+static DECLARE_DELAYED_WORK(fd_timer, NULL);
 
 static void cancel_activity(void)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&floppy_lock, flags);
 	do_floppy = NULL;
-	PREPARE_WORK(&floppy_work, (work_func_t)empty);
-	del_timer(&fd_timer);
-	spin_unlock_irqrestore(&floppy_lock, flags);
+	cancel_delayed_work_sync(&fd_timer);
+	cancel_work_sync(&floppy_work);
 }
 
 /* this function makes sure that the disk stays in the drive during the
  * transfer */
-static void fd_watchdog(void)
+static void fd_watchdog(struct work_struct *arg)
 {
 	debug_dcl(DP->flags, "calling disk change from watchdog\n");
 
@@ -998,21 +991,20 @@ static void fd_watchdog(void)
 		cont->done(0);
 		reset_fdc();
 	} else {
-		del_timer(&fd_timer);
-		fd_timer.function = (timeout_fn)fd_watchdog;
-		fd_timer.expires = jiffies + HZ / 10;
-		add_timer(&fd_timer);
+		cancel_delayed_work(&fd_timer);
+		PREPARE_DELAYED_WORK(&fd_timer, fd_watchdog);
+		queue_delayed_work(floppy_wq, &fd_timer, HZ / 10);
 	}
 }
 
 static void main_command_interrupt(void)
 {
-	del_timer(&fd_timer);
+	cancel_delayed_work(&fd_timer);
 	cont->interrupt();
 }
 
 /* waits for a delay (spinup or select) to pass */
-static int fd_wait_for_completion(unsigned long delay, timeout_fn function)
+static int fd_wait_for_completion(unsigned long expires, work_func_t function)
 {
 	if (FDCS->reset) {
 		reset_fdc();	/* do the reset during sleep to win time
@@ -1021,47 +1013,15 @@ static int fd_wait_for_completion(unsigned long delay, timeout_fn function)
 		return 1;
 	}
 
-	if (time_before(jiffies, delay)) {
-		del_timer(&fd_timer);
-		fd_timer.function = function;
-		fd_timer.expires = delay;
-		add_timer(&fd_timer);
+	if (time_before(jiffies, expires)) {
+		cancel_delayed_work(&fd_timer);
+		PREPARE_DELAYED_WORK(&fd_timer, function);
+		queue_delayed_work(floppy_wq, &fd_timer, expires - jiffies);
 		return 1;
 	}
 	return 0;
 }
 
-static DEFINE_SPINLOCK(floppy_hlt_lock);
-static int hlt_disabled;
-static void floppy_disable_hlt(void)
-{
-	unsigned long flags;
-
-	WARN_ONCE(1, "floppy_disable_hlt() scheduled for removal in 2012");
-	spin_lock_irqsave(&floppy_hlt_lock, flags);
-	if (!hlt_disabled) {
-		hlt_disabled = 1;
-#ifdef HAVE_DISABLE_HLT
-		disable_hlt();
-#endif
-	}
-	spin_unlock_irqrestore(&floppy_hlt_lock, flags);
-}
-
-static void floppy_enable_hlt(void)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&floppy_hlt_lock, flags);
-	if (hlt_disabled) {
-		hlt_disabled = 0;
-#ifdef HAVE_DISABLE_HLT
-		enable_hlt();
-#endif
-	}
-	spin_unlock_irqrestore(&floppy_hlt_lock, flags);
-}
-
 static void setup_DMA(void)
 {
 	unsigned long f;
@@ -1106,7 +1066,6 @@ static void setup_DMA(void)
 	fd_enable_dma();
 	release_dma_lock(f);
 #endif
-	floppy_disable_hlt();
 }
 
 static void show_floppy(void);
@@ -1375,7 +1334,7 @@ static int fdc_dtr(void)
 	 */
 	FDCS->dtr = raw_cmd->rate & 3;
 	return fd_wait_for_completion(jiffies + 2UL * HZ / 100,
-				      (timeout_fn)floppy_ready);
+				      (work_func_t)floppy_ready);
 }				/* fdc_dtr */
 
 static void tell_sector(void)
@@ -1480,7 +1439,7 @@ static void setup_rw_floppy(void)
 	int flags;
 	int dflags;
 	unsigned long ready_date;
-	timeout_fn function;
+	work_func_t function;
 
 	flags = raw_cmd->flags;
 	if (flags & (FD_RAW_READ | FD_RAW_WRITE))
@@ -1494,9 +1453,9 @@ static void setup_rw_floppy(void)
 		 */
 		if (time_after(ready_date, jiffies + DP->select_delay)) {
 			ready_date -= DP->select_delay;
-			function = (timeout_fn)floppy_start;
+			function = (work_func_t)floppy_start;
 		} else
-			function = (timeout_fn)setup_rw_floppy;
+			function = (work_func_t)setup_rw_floppy;
 
 		/* wait until the floppy is spinning fast enough */
 		if (fd_wait_for_completion(ready_date, function))
@@ -1526,7 +1485,7 @@ static void setup_rw_floppy(void)
 		inr = result();
 		cont->interrupt();
 	} else if (flags & FD_RAW_NEED_DISK)
-		fd_watchdog();
+		fd_watchdog(NULL);
 }
 
 static int blind_seek;
@@ -1708,7 +1667,6 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id)
 	fd_disable_dma();
 	release_dma_lock(f);
 
-	floppy_enable_hlt();
 	do_floppy = NULL;
 	if (fdc >= N_FDC || FDCS->address == -1) {
 		/* we don't even know which FDC is the culprit */
@@ -1836,20 +1794,22 @@ static void show_floppy(void)
 		pr_info("do_floppy=%pf\n", do_floppy);
 	if (work_pending(&floppy_work))
 		pr_info("floppy_work.func=%pf\n", floppy_work.func);
-	if (timer_pending(&fd_timer))
-		pr_info("fd_timer.function=%pf\n", fd_timer.function);
-	if (timer_pending(&fd_timeout)) {
-		pr_info("timer_function=%pf\n", fd_timeout.function);
-		pr_info("expires=%lu\n", fd_timeout.expires - jiffies);
-		pr_info("now=%lu\n", jiffies);
-	}
+	if (delayed_work_pending(&fd_timer))
+		pr_info("delayed work.function=%p expires=%ld\n",
+		       fd_timer.work.func,
+		       fd_timer.timer.expires - jiffies);
+	if (delayed_work_pending(&fd_timeout))
+		pr_info("timer_function=%p expires=%ld\n",
+		       fd_timeout.work.func,
+		       fd_timeout.timer.expires - jiffies);
+
 	pr_info("cont=%p\n", cont);
 	pr_info("current_req=%p\n", current_req);
 	pr_info("command_status=%d\n", command_status);
 	pr_info("\n");
 }
 
-static void floppy_shutdown(unsigned long data)
+static void floppy_shutdown(struct work_struct *arg)
 {
 	unsigned long flags;
 
@@ -1857,8 +1817,6 @@ static void floppy_shutdown(unsigned long data)
 		show_floppy();
 	cancel_activity();
 
-	floppy_enable_hlt();
-
 	flags = claim_dma_lock();
 	fd_disable_dma();
 	release_dma_lock(flags);
@@ -1904,7 +1862,7 @@ static int start_motor(void (*function)(void))
 
 	/* wait_for_completion also schedules reset if needed. */
 	return fd_wait_for_completion(DRS->select_date + DP->select_delay,
-				      (timeout_fn)function);
+				      (work_func_t)function);
 }
 
 static void floppy_ready(void)
@@ -2857,7 +2815,6 @@ do_request:
 		spin_lock_irq(&floppy_lock);
 		pending = set_next_request();
 		spin_unlock_irq(&floppy_lock);
-
 		if (!pending) {
 			do_floppy = NULL;
 			unlock_fdc();
@@ -2934,13 +2891,15 @@ static void do_fd_request(struct request_queue *q)
 		 current_req->cmd_flags))
 		return;
 
-	if (test_bit(0, &fdc_busy)) {
+	if (test_and_set_bit(0, &fdc_busy)) {
 		/* fdc busy, this new request will be treated when the
 		   current one is done */
 		is_alive(__func__, "old request running");
 		return;
 	}
-	lock_fdc(MAXTIMEOUT, false);
+	command_status = FD_COMMAND_NONE;
+	__reschedule_timeout(MAXTIMEOUT, "fd_request");
+	set_fdc(0);
 	process_fd_request();
 	is_alive(__func__, "");
 }
@@ -3648,9 +3607,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode)
 
 	mutex_lock(&floppy_mutex);
 	mutex_lock(&open_lock);
-	if (UDRS->fd_ref < 0)
-		UDRS->fd_ref = 0;
-	else if (!UDRS->fd_ref--) {
+	if (!UDRS->fd_ref--) {
 		DPRINT("floppy_release with fd_ref == 0");
 		UDRS->fd_ref = 0;
 	}
@@ -3686,13 +3643,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 		set_bit(FD_VERIFY_BIT, &UDRS->flags);
 	}
 
-	if (UDRS->fd_ref == -1 || (UDRS->fd_ref && (mode & FMODE_EXCL)))
-		goto out2;
-
-	if (mode & FMODE_EXCL)
-		UDRS->fd_ref = -1;
-	else
-		UDRS->fd_ref++;
+	UDRS->fd_ref++;
 
 	opened_bdev[drive] = bdev;
 
@@ -3755,10 +3706,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode)
 	mutex_unlock(&floppy_mutex);
 	return 0;
 out:
-	if (UDRS->fd_ref < 0)
-		UDRS->fd_ref = 0;
-	else
-		UDRS->fd_ref--;
+	UDRS->fd_ref--;
+
 	if (!UDRS->fd_ref)
 		opened_bdev[drive] = NULL;
 out2:
@@ -4195,10 +4144,16 @@ static int __init floppy_init(void)
 			goto out_put_disk;
 		}
 
+		floppy_wq = alloc_ordered_workqueue("floppy", 0);
+		if (!floppy_wq) {
+			err = -ENOMEM;
+			goto out_put_disk;
+		}
+
 		disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock);
 		if (!disks[dr]->queue) {
 			err = -ENOMEM;
-			goto out_put_disk;
+			goto out_destroy_workq;
 		}
 
 		blk_queue_max_hw_sectors(disks[dr]->queue, 64);
@@ -4249,7 +4204,7 @@ static int __init floppy_init(void)
 	use_virtual_dma = can_use_virtual_dma & 1;
 	fdc_state[0].address = FDC1;
 	if (fdc_state[0].address == -1) {
-		del_timer_sync(&fd_timeout);
+		cancel_delayed_work(&fd_timeout);
 		err = -ENODEV;
 		goto out_unreg_region;
 	}
@@ -4260,7 +4215,7 @@ static int __init floppy_init(void)
 	fdc = 0;		/* reset fdc in case of unexpected interrupt */
 	err = floppy_grab_irq_and_dma();
 	if (err) {
-		del_timer_sync(&fd_timeout);
+		cancel_delayed_work(&fd_timeout);
 		err = -EBUSY;
 		goto out_unreg_region;
 	}
@@ -4317,13 +4272,13 @@ static int __init floppy_init(void)
 		user_reset_fdc(-1, FD_RESET_ALWAYS, false);
 	}
 	fdc = 0;
-	del_timer_sync(&fd_timeout);
+	cancel_delayed_work(&fd_timeout);
 	current_drive = 0;
 	initialized = true;
 	if (have_no_fdc) {
 		DPRINT("no floppy controllers found\n");
 		err = have_no_fdc;
-		goto out_flush_work;
+		goto out_release_dma;
 	}
 
 	for (drive = 0; drive < N_DRIVE; drive++) {
@@ -4338,7 +4293,7 @@ static int __init floppy_init(void)
 
 		err = platform_device_register(&floppy_device[drive]);
 		if (err)
-			goto out_flush_work;
+			goto out_release_dma;
 
 		err = device_create_file(&floppy_device[drive].dev,
 					 &dev_attr_cmos);
@@ -4356,13 +4311,14 @@ static int __init floppy_init(void)
 
 out_unreg_platform_dev:
 	platform_device_unregister(&floppy_device[drive]);
-out_flush_work:
-	flush_work_sync(&floppy_work);
+out_release_dma:
 	if (atomic_read(&usage_count))
 		floppy_release_irq_and_dma();
 out_unreg_region:
 	blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256);
 	platform_driver_unregister(&floppy_driver);
+out_destroy_workq:
+	destroy_workqueue(floppy_wq);
 out_unreg_blkdev:
 	unregister_blkdev(FLOPPY_MAJOR, "fd");
 out_put_disk:
@@ -4433,7 +4389,7 @@ static int floppy_grab_irq_and_dma(void)
 	 * We might have scheduled a free_irq(), wait it to
 	 * drain first:
 	 */
-	flush_work_sync(&floppy_work);
+	flush_workqueue(floppy_wq);
 
 	if (fd_request_irq()) {
 		DPRINT("Unable to grab IRQ%d for the floppy driver\n",
@@ -4509,7 +4465,6 @@ static void floppy_release_irq_and_dma(void)
 #if N_FDC > 1
 	set_dor(1, ~8, 0);
 #endif
-	floppy_enable_hlt();
 
 	if (floppy_track_buffer && max_buffer_sectors) {
 		tmpsize = max_buffer_sectors * 1024;
@@ -4525,9 +4480,9 @@ static void floppy_release_irq_and_dma(void)
 			pr_info("motor off timer %d still active\n", drive);
 #endif
 
-	if (timer_pending(&fd_timeout))
+	if (delayed_work_pending(&fd_timeout))
 		pr_info("floppy timer still active:%s\n", timeout_message);
-	if (timer_pending(&fd_timer))
+	if (delayed_work_pending(&fd_timer))
 		pr_info("auxiliary floppy timer still active\n");
 	if (work_pending(&floppy_work))
 		pr_info("work still pending\n");
@@ -4597,8 +4552,9 @@ static void __exit floppy_module_exit(void)
 		put_disk(disks[drive]);
 	}
 
-	del_timer_sync(&fd_timeout);
-	del_timer_sync(&fd_timer);
+	cancel_delayed_work_sync(&fd_timeout);
+	cancel_delayed_work_sync(&fd_timer);
+	destroy_workqueue(floppy_wq);
 
 	if (atomic_read(&usage_count))
 		floppy_release_irq_and_dma();
diff --git a/drivers/block/hd.c b/drivers/block/hd.c
index b52c9ca146fc..bf397bf108b7 100644
--- a/drivers/block/hd.c
+++ b/drivers/block/hd.c
@@ -44,7 +44,6 @@
 #define HD_IRQ 14
 
 #define REALLY_SLOW_IO
-#include <asm/system.h>
 #include <asm/io.h>
 #include <asm/uaccess.h>
 
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index cd504353b278..bbca966f8f66 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -93,16 +93,16 @@ static int transfer_none(struct loop_device *lo, int cmd,
 			 struct page *loop_page, unsigned loop_off,
 			 int size, sector_t real_block)
 {
-	char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
-	char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;
+	char *raw_buf = kmap_atomic(raw_page) + raw_off;
+	char *loop_buf = kmap_atomic(loop_page) + loop_off;
 
 	if (cmd == READ)
 		memcpy(loop_buf, raw_buf, size);
 	else
 		memcpy(raw_buf, loop_buf, size);
 
-	kunmap_atomic(loop_buf, KM_USER1);
-	kunmap_atomic(raw_buf, KM_USER0);
+	kunmap_atomic(loop_buf);
+	kunmap_atomic(raw_buf);
 	cond_resched();
 	return 0;
 }
@@ -112,8 +112,8 @@ static int transfer_xor(struct loop_device *lo, int cmd,
 			struct page *loop_page, unsigned loop_off,
 			int size, sector_t real_block)
 {
-	char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off;
-	char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off;
+	char *raw_buf = kmap_atomic(raw_page) + raw_off;
+	char *loop_buf = kmap_atomic(loop_page) + loop_off;
 	char *in, *out, *key;
 	int i, keysize;
 
@@ -130,8 +130,8 @@ static int transfer_xor(struct loop_device *lo, int cmd,
 	for (i = 0; i < size; i++)
 		*out++ = *in++ ^ key[(i & 511) % keysize];
 
-	kunmap_atomic(loop_buf, KM_USER1);
-	kunmap_atomic(raw_buf, KM_USER0);
+	kunmap_atomic(loop_buf);
+	kunmap_atomic(raw_buf);
 	cond_resched();
 	return 0;
 }
diff --git a/drivers/block/mtip32xx/Kconfig b/drivers/block/mtip32xx/Kconfig
index b5dd14e072f2..0ba837fc62a8 100644
--- a/drivers/block/mtip32xx/Kconfig
+++ b/drivers/block/mtip32xx/Kconfig
@@ -4,6 +4,6 @@
 
 config BLK_DEV_PCIESSD_MTIP32XX
 	tristate "Block Device Driver for Micron PCIe SSDs"
-	depends on HOTPLUG_PCI_PCIE
+	depends on PCI
 	help
           This enables the block driver for Micron PCIe SSDs.
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
index 8eb81c96608f..264bc77dcb91 100644
--- a/drivers/block/mtip32xx/mtip32xx.c
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -36,6 +36,7 @@
 #include <linux/idr.h>
 #include <linux/kthread.h>
 #include <../drivers/ata/ahci.h>
+#include <linux/export.h>
 #include "mtip32xx.h"
 
 #define HW_CMD_SLOT_SZ		(MTIP_MAX_COMMAND_SLOTS * 32)
@@ -44,6 +45,7 @@
 #define HW_PORT_PRIV_DMA_SZ \
 		(HW_CMD_SLOT_SZ + HW_CMD_TBL_AR_SZ + AHCI_RX_FIS_SZ)
 
+#define HOST_CAP_NZDMA		(1 << 19)
 #define HOST_HSORG		0xFC
 #define HSORG_DISABLE_SLOTGRP_INTR (1<<24)
 #define HSORG_DISABLE_SLOTGRP_PXIS (1<<16)
@@ -139,6 +141,12 @@ static void mtip_command_cleanup(struct driver_data *dd)
 	int group = 0, commandslot = 0, commandindex = 0;
 	struct mtip_cmd *command;
 	struct mtip_port *port = dd->port;
+	static int in_progress;
+
+	if (in_progress)
+		return;
+
+	in_progress = 1;
 
 	for (group = 0; group < 4; group++) {
 		for (commandslot = 0; commandslot < 32; commandslot++) {
@@ -165,7 +173,8 @@ static void mtip_command_cleanup(struct driver_data *dd)
 
 	up(&port->cmd_slot);
 
-	atomic_set(&dd->drv_cleanup_done, true);
+	set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag);
+	in_progress = 0;
 }
 
 /*
@@ -262,6 +271,9 @@ static int hba_reset_nosleep(struct driver_data *dd)
 		 && time_before(jiffies, timeout))
 		mdelay(1);
 
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))
+		return -1;
+
 	if (readl(dd->mmio + HOST_CTL) & HOST_RESET)
 		return -1;
 
@@ -282,18 +294,20 @@ static int hba_reset_nosleep(struct driver_data *dd)
  */
 static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag)
 {
-	unsigned long flags = 0;
-
 	atomic_set(&port->commands[tag].active, 1);
 
-	spin_lock_irqsave(&port->cmd_issue_lock, flags);
+	spin_lock(&port->cmd_issue_lock);
 
 	writel((1 << MTIP_TAG_BIT(tag)),
 			port->s_active[MTIP_TAG_INDEX(tag)]);
 	writel((1 << MTIP_TAG_BIT(tag)),
 			port->cmd_issue[MTIP_TAG_INDEX(tag)]);
 
-	spin_unlock_irqrestore(&port->cmd_issue_lock, flags);
+	spin_unlock(&port->cmd_issue_lock);
+
+	/* Set the command's timeout value.*/
+	port->commands[tag].comp_time = jiffies + msecs_to_jiffies(
+					MTIP_NCQ_COMMAND_TIMEOUT_MS);
 }
 
 /*
@@ -422,6 +436,10 @@ static void mtip_init_port(struct mtip_port *port)
 	/* Clear any pending interrupts for this port */
 	writel(readl(port->mmio + PORT_IRQ_STAT), port->mmio + PORT_IRQ_STAT);
 
+	/* Clear any pending interrupts on the HBA. */
+	writel(readl(port->dd->mmio + HOST_IRQ_STAT),
+					port->dd->mmio + HOST_IRQ_STAT);
+
 	/* Enable port interrupts */
 	writel(DEF_PORT_IRQ, port->mmio + PORT_IRQ_MASK);
 }
@@ -447,6 +465,9 @@ static void mtip_restart_port(struct mtip_port *port)
 		 && time_before(jiffies, timeout))
 		;
 
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+		return;
+
 	/*
 	 * Chip quirk: escalate to hba reset if
 	 * PxCMD.CR not clear after 500 ms
@@ -475,6 +496,9 @@ static void mtip_restart_port(struct mtip_port *port)
 	while (time_before(jiffies, timeout))
 		;
 
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+		return;
+
 	/* Clear PxSCTL.DET */
 	writel(readl(port->mmio + PORT_SCR_CTL) & ~1,
 			 port->mmio + PORT_SCR_CTL);
@@ -486,15 +510,35 @@ static void mtip_restart_port(struct mtip_port *port)
 			 && time_before(jiffies, timeout))
 		;
 
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+		return;
+
 	if ((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0)
 		dev_warn(&port->dd->pdev->dev,
 			"COM reset failed\n");
 
-	/* Clear SError, the PxSERR.DIAG.x should be set so clear it */
-	writel(readl(port->mmio + PORT_SCR_ERR), port->mmio + PORT_SCR_ERR);
+	mtip_init_port(port);
+	mtip_start_port(port);
 
-	/* Enable the DMA engine */
-	mtip_enable_engine(port, 1);
+}
+
+/*
+ * Helper function for tag logging
+ */
+static void print_tags(struct driver_data *dd,
+			char *msg,
+			unsigned long *tagbits,
+			int cnt)
+{
+	unsigned char tagmap[128];
+	int group, tagmap_len = 0;
+
+	memset(tagmap, 0, sizeof(tagmap));
+	for (group = SLOTBITS_IN_LONGS; group > 0; group--)
+		tagmap_len = sprintf(tagmap + tagmap_len, "%016lX ",
+						tagbits[group-1]);
+	dev_warn(&dd->pdev->dev,
+			"%d command(s) %s: tagmap [%s]", cnt, msg, tagmap);
 }
 
 /*
@@ -514,15 +558,18 @@ static void mtip_timeout_function(unsigned long int data)
 	int tag, cmdto_cnt = 0;
 	unsigned int bit, group;
 	unsigned int num_command_slots = port->dd->slot_groups * 32;
+	unsigned long to, tagaccum[SLOTBITS_IN_LONGS];
 
 	if (unlikely(!port))
 		return;
 
-	if (atomic_read(&port->dd->resumeflag) == true) {
+	if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) {
 		mod_timer(&port->cmd_timer,
 			jiffies + msecs_to_jiffies(30000));
 		return;
 	}
+	/* clear the tag accumulator */
+	memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
 
 	for (tag = 0; tag < num_command_slots; tag++) {
 		/*
@@ -540,12 +587,10 @@ static void mtip_timeout_function(unsigned long int data)
 			command = &port->commands[tag];
 			fis = (struct host_to_dev_fis *) command->command;
 
-			dev_warn(&port->dd->pdev->dev,
-				"Timeout for command tag %d\n", tag);
-
+			set_bit(tag, tagaccum);
 			cmdto_cnt++;
 			if (cmdto_cnt == 1)
-				set_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags);
+				set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
 
 			/*
 			 * Clear the completed bit. This should prevent
@@ -578,15 +623,29 @@ static void mtip_timeout_function(unsigned long int data)
 		}
 	}
 
-	if (cmdto_cnt) {
-		dev_warn(&port->dd->pdev->dev,
-			"%d commands timed out: restarting port",
-			cmdto_cnt);
+	if (cmdto_cnt && !test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) {
+		print_tags(port->dd, "timed out", tagaccum, cmdto_cnt);
+
 		mtip_restart_port(port);
-		clear_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags);
+		clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
 		wake_up_interruptible(&port->svc_wait);
 	}
 
+	if (port->ic_pause_timer) {
+		to  = port->ic_pause_timer + msecs_to_jiffies(1000);
+		if (time_after(jiffies, to)) {
+			if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) {
+				port->ic_pause_timer = 0;
+				clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+				clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+				clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+				wake_up_interruptible(&port->svc_wait);
+			}
+
+
+		}
+	}
+
 	/* Restart the timer */
 	mod_timer(&port->cmd_timer,
 		jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
@@ -681,23 +740,18 @@ static void mtip_completion(struct mtip_port *port,
 	complete(waiting);
 }
 
-/*
- * Helper function for tag logging
- */
-static void print_tags(struct driver_data *dd,
-			char *msg,
-			unsigned long *tagbits)
+static void mtip_null_completion(struct mtip_port *port,
+			    int tag,
+			    void *data,
+			    int status)
 {
-	unsigned int tag, count = 0;
-
-	for (tag = 0; tag < (dd->slot_groups) * 32; tag++) {
-		if (test_bit(tag, tagbits))
-			count++;
-	}
-	if (count)
-		dev_info(&dd->pdev->dev, "%s [%i tags]\n", msg, count);
+	return;
 }
 
+static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
+				dma_addr_t buffer_dma, unsigned int sectors);
+static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
+						struct smart_attr *attrib);
 /*
  * Handle an error.
  *
@@ -708,12 +762,16 @@ static void print_tags(struct driver_data *dd,
  */
 static void mtip_handle_tfe(struct driver_data *dd)
 {
-	int group, tag, bit, reissue;
+	int group, tag, bit, reissue, rv;
 	struct mtip_port *port;
-	struct mtip_cmd  *command;
+	struct mtip_cmd  *cmd;
 	u32 completed;
 	struct host_to_dev_fis *fis;
 	unsigned long tagaccum[SLOTBITS_IN_LONGS];
+	unsigned int cmd_cnt = 0;
+	unsigned char *buf;
+	char *fail_reason = NULL;
+	int fail_all_ncq_write = 0, fail_all_ncq_cmds = 0;
 
 	dev_warn(&dd->pdev->dev, "Taskfile error\n");
 
@@ -721,9 +779,23 @@ static void mtip_handle_tfe(struct driver_data *dd)
 
 	/* Stop the timer to prevent command timeouts. */
 	del_timer(&port->cmd_timer);
+	set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
 
-	/* Set eh_active */
-	set_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags);
+	if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) &&
+			test_bit(MTIP_TAG_INTERNAL, port->allocated)) {
+		cmd = &port->commands[MTIP_TAG_INTERNAL];
+		dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n");
+
+		atomic_inc(&cmd->active); /* active > 1 indicates error */
+		if (cmd->comp_data && cmd->comp_func) {
+			cmd->comp_func(port, MTIP_TAG_INTERNAL,
+					cmd->comp_data, PORT_IRQ_TF_ERR);
+		}
+		goto handle_tfe_exit;
+	}
+
+	/* clear the tag accumulator */
+	memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
 
 	/* Loop through all the groups */
 	for (group = 0; group < dd->slot_groups; group++) {
@@ -732,9 +804,6 @@ static void mtip_handle_tfe(struct driver_data *dd)
 		/* clear completed status register in the hardware.*/
 		writel(completed, port->completed[group]);
 
-		/* clear the tag accumulator */
-		memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
-
 		/* Process successfully completed commands */
 		for (bit = 0; bit < 32 && completed; bit++) {
 			if (!(completed & (1<<bit)))
@@ -745,13 +814,14 @@ static void mtip_handle_tfe(struct driver_data *dd)
 			if (tag == MTIP_TAG_INTERNAL)
 				continue;
 
-			command = &port->commands[tag];
-			if (likely(command->comp_func)) {
+			cmd = &port->commands[tag];
+			if (likely(cmd->comp_func)) {
 				set_bit(tag, tagaccum);
-				atomic_set(&port->commands[tag].active, 0);
-				command->comp_func(port,
+				cmd_cnt++;
+				atomic_set(&cmd->active, 0);
+				cmd->comp_func(port,
 					 tag,
-					 command->comp_data,
+					 cmd->comp_data,
 					 0);
 			} else {
 				dev_err(&port->dd->pdev->dev,
@@ -765,12 +835,45 @@ static void mtip_handle_tfe(struct driver_data *dd)
 			}
 		}
 	}
-	print_tags(dd, "TFE tags completed:", tagaccum);
+
+	print_tags(dd, "completed (TFE)", tagaccum, cmd_cnt);
 
 	/* Restart the port */
 	mdelay(20);
 	mtip_restart_port(port);
 
+	/* Trying to determine the cause of the error */
+	rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
+				dd->port->log_buf,
+				dd->port->log_buf_dma, 1);
+	if (rv) {
+		dev_warn(&dd->pdev->dev,
+			"Error in READ LOG EXT (10h) command\n");
+		/* non-critical error, don't fail the load */
+	} else {
+		buf = (unsigned char *)dd->port->log_buf;
+		if (buf[259] & 0x1) {
+			dev_info(&dd->pdev->dev,
+				"Write protect bit is set.\n");
+			set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
+			fail_all_ncq_write = 1;
+			fail_reason = "write protect";
+		}
+		if (buf[288] == 0xF7) {
+			dev_info(&dd->pdev->dev,
+				"Exceeded Tmax, drive in thermal shutdown.\n");
+			set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
+			fail_all_ncq_cmds = 1;
+			fail_reason = "thermal shutdown";
+		}
+		if (buf[288] == 0xBF) {
+			dev_info(&dd->pdev->dev,
+				"Drive indicates rebuild has failed.\n");
+			fail_all_ncq_cmds = 1;
+			fail_reason = "rebuild failed";
+		}
+	}
+
 	/* clear the tag accumulator */
 	memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
 
@@ -779,32 +882,47 @@ static void mtip_handle_tfe(struct driver_data *dd)
 		for (bit = 0; bit < 32; bit++) {
 			reissue = 1;
 			tag = (group << 5) + bit;
+			cmd = &port->commands[tag];
 
 			/* If the active bit is set re-issue the command */
-			if (atomic_read(&port->commands[tag].active) == 0)
+			if (atomic_read(&cmd->active) == 0)
 				continue;
 
-			fis = (struct host_to_dev_fis *)
-				port->commands[tag].command;
+			fis = (struct host_to_dev_fis *)cmd->command;
 
 			/* Should re-issue? */
 			if (tag == MTIP_TAG_INTERNAL ||
 			    fis->command == ATA_CMD_SET_FEATURES)
 				reissue = 0;
+			else {
+				if (fail_all_ncq_cmds ||
+					(fail_all_ncq_write &&
+					fis->command == ATA_CMD_FPDMA_WRITE)) {
+					dev_warn(&dd->pdev->dev,
+					"  Fail: %s w/tag %d [%s].\n",
+					fis->command == ATA_CMD_FPDMA_WRITE ?
+						"write" : "read",
+					tag,
+					fail_reason != NULL ?
+						fail_reason : "unknown");
+					atomic_set(&cmd->active, 0);
+					if (cmd->comp_func) {
+						cmd->comp_func(port, tag,
+							cmd->comp_data,
+							-ENODATA);
+					}
+					continue;
+				}
+			}
 
 			/*
 			 * First check if this command has
 			 *  exceeded its retries.
 			 */
-			if (reissue &&
-			    (port->commands[tag].retries-- > 0)) {
+			if (reissue && (cmd->retries-- > 0)) {
 
 				set_bit(tag, tagaccum);
 
-				/* Update the timeout value. */
-				port->commands[tag].comp_time =
-					jiffies + msecs_to_jiffies(
-					MTIP_NCQ_COMMAND_TIMEOUT_MS);
 				/* Re-issue the command. */
 				mtip_issue_ncq_command(port, tag);
 
@@ -814,13 +932,13 @@ static void mtip_handle_tfe(struct driver_data *dd)
 			/* Retire a command that will not be reissued */
 			dev_warn(&port->dd->pdev->dev,
 				"retiring tag %d\n", tag);
-			atomic_set(&port->commands[tag].active, 0);
+			atomic_set(&cmd->active, 0);
 
-			if (port->commands[tag].comp_func)
-				port->commands[tag].comp_func(
+			if (cmd->comp_func)
+				cmd->comp_func(
 					port,
 					tag,
-					port->commands[tag].comp_data,
+					cmd->comp_data,
 					PORT_IRQ_TF_ERR);
 			else
 				dev_warn(&port->dd->pdev->dev,
@@ -828,10 +946,11 @@ static void mtip_handle_tfe(struct driver_data *dd)
 					tag);
 		}
 	}
-	print_tags(dd, "TFE tags reissued:", tagaccum);
+	print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt);
 
+handle_tfe_exit:
 	/* clear eh_active */
-	clear_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags);
+	clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
 	wake_up_interruptible(&port->svc_wait);
 
 	mod_timer(&port->cmd_timer,
@@ -851,6 +970,8 @@ static inline void mtip_process_sdbf(struct driver_data *dd)
 	/* walk all bits in all slot groups */
 	for (group = 0; group < dd->slot_groups; group++) {
 		completed = readl(port->completed[group]);
+		if (!completed)
+			continue;
 
 		/* clear completed status register in the hardware.*/
 		writel(completed, port->completed[group]);
@@ -899,7 +1020,7 @@ static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat)
 	struct mtip_port *port = dd->port;
 	struct mtip_cmd *cmd = &port->commands[MTIP_TAG_INTERNAL];
 
-	if (test_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags) &&
+	if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) &&
 	    (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL])
 		& (1 << MTIP_TAG_INTERNAL))) {
 		if (cmd->comp_func) {
@@ -911,8 +1032,6 @@ static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat)
 		}
 	}
 
-	dev_warn(&dd->pdev->dev, "IRQ status 0x%x ignored.\n", port_stat);
-
 	return;
 }
 
@@ -968,6 +1087,9 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
 				/* don't proceed further */
 				return IRQ_HANDLED;
 			}
+			if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+							&dd->dd_flag))
+				return rv;
 
 			mtip_process_errors(dd, port_stat & PORT_IRQ_ERR);
 		}
@@ -1015,6 +1137,39 @@ static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag)
 		port->cmd_issue[MTIP_TAG_INDEX(tag)]);
 }
 
+static bool mtip_pause_ncq(struct mtip_port *port,
+				struct host_to_dev_fis *fis)
+{
+	struct host_to_dev_fis *reply;
+	unsigned long task_file_data;
+
+	reply = port->rxfis + RX_FIS_D2H_REG;
+	task_file_data = readl(port->mmio+PORT_TFDATA);
+
+	if ((task_file_data & 1) || (fis->command == ATA_CMD_SEC_ERASE_UNIT))
+		return false;
+
+	if (fis->command == ATA_CMD_SEC_ERASE_PREP) {
+		set_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+		port->ic_pause_timer = jiffies;
+		return true;
+	} else if ((fis->command == ATA_CMD_DOWNLOAD_MICRO) &&
+					(fis->features == 0x03)) {
+		set_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+		port->ic_pause_timer = jiffies;
+		return true;
+	} else if ((fis->command == ATA_CMD_SEC_ERASE_UNIT) ||
+		((fis->command == 0xFC) &&
+			(fis->features == 0x27 || fis->features == 0x72 ||
+			 fis->features == 0x62 || fis->features == 0x26))) {
+		/* Com reset after secure erase or lowlevel format */
+		mtip_restart_port(port);
+		return false;
+	}
+
+	return false;
+}
+
 /*
  * Wait for port to quiesce
  *
@@ -1033,11 +1188,13 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
 
 	to = jiffies + msecs_to_jiffies(timeout);
 	do {
-		if (test_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags) &&
-			test_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags)) {
+		if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) &&
+			test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
 			msleep(20);
 			continue; /* svc thd is actively issuing commands */
 		}
+		if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+			return -EFAULT;
 		/*
 		 * Ignore s_active bit 0 of array element 0.
 		 * This bit will always be set
@@ -1074,7 +1231,7 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
  *	-EAGAIN  Time out waiting for command to complete.
  */
 static int mtip_exec_internal_command(struct mtip_port *port,
-					void *fis,
+					struct host_to_dev_fis *fis,
 					int fis_len,
 					dma_addr_t buffer,
 					int buf_len,
@@ -1084,8 +1241,9 @@ static int mtip_exec_internal_command(struct mtip_port *port,
 {
 	struct mtip_cmd_sg *command_sg;
 	DECLARE_COMPLETION_ONSTACK(wait);
-	int rv = 0;
+	int rv = 0, ready2go = 1;
 	struct mtip_cmd *int_cmd = &port->commands[MTIP_TAG_INTERNAL];
+	unsigned long to;
 
 	/* Make sure the buffer is 8 byte aligned. This is asic specific. */
 	if (buffer & 0x00000007) {
@@ -1094,23 +1252,38 @@ static int mtip_exec_internal_command(struct mtip_port *port,
 		return -EFAULT;
 	}
 
-	/* Only one internal command should be running at a time */
-	if (test_and_set_bit(MTIP_TAG_INTERNAL, port->allocated)) {
+	to = jiffies + msecs_to_jiffies(timeout);
+	do {
+		ready2go = !test_and_set_bit(MTIP_TAG_INTERNAL,
+						port->allocated);
+		if (ready2go)
+			break;
+		mdelay(100);
+	} while (time_before(jiffies, to));
+	if (!ready2go) {
 		dev_warn(&port->dd->pdev->dev,
-			"Internal command already active\n");
+			"Internal cmd active. new cmd [%02X]\n", fis->command);
 		return -EBUSY;
 	}
-	set_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags);
+	set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+	port->ic_pause_timer = 0;
+
+	if (fis->command == ATA_CMD_SEC_ERASE_UNIT)
+		clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+	else if (fis->command == ATA_CMD_DOWNLOAD_MICRO)
+		clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
 
 	if (atomic == GFP_KERNEL) {
-		/* wait for io to complete if non atomic */
-		if (mtip_quiesce_io(port, 5000) < 0) {
-			dev_warn(&port->dd->pdev->dev,
-				"Failed to quiesce IO\n");
-			release_slot(port, MTIP_TAG_INTERNAL);
-			clear_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags);
-			wake_up_interruptible(&port->svc_wait);
-			return -EBUSY;
+		if (fis->command != ATA_CMD_STANDBYNOW1) {
+			/* wait for io to complete if non atomic */
+			if (mtip_quiesce_io(port, 5000) < 0) {
+				dev_warn(&port->dd->pdev->dev,
+					"Failed to quiesce IO\n");
+				release_slot(port, MTIP_TAG_INTERNAL);
+				clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+				wake_up_interruptible(&port->svc_wait);
+				return -EBUSY;
+			}
 		}
 
 		/* Set the completion function and data for the command. */
@@ -1120,7 +1293,7 @@ static int mtip_exec_internal_command(struct mtip_port *port,
 	} else {
 		/* Clear completion - we're going to poll */
 		int_cmd->comp_data = NULL;
-		int_cmd->comp_func = NULL;
+		int_cmd->comp_func = mtip_null_completion;
 	}
 
 	/* Copy the command to the command table */
@@ -1159,38 +1332,60 @@ static int mtip_exec_internal_command(struct mtip_port *port,
 				"Internal command did not complete [%d] "
 				"within timeout of  %lu ms\n",
 				atomic, timeout);
+			if (mtip_check_surprise_removal(port->dd->pdev) ||
+				test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+						&port->dd->dd_flag)) {
+				rv = -ENXIO;
+				goto exec_ic_exit;
+			}
 			rv = -EAGAIN;
 		}
-
-		if (readl(port->cmd_issue[MTIP_TAG_INTERNAL])
-			& (1 << MTIP_TAG_INTERNAL)) {
-			dev_warn(&port->dd->pdev->dev,
-				"Retiring internal command but CI is 1.\n");
-		}
-
 	} else {
 		/* Spin for <timeout> checking if command still outstanding */
 		timeout = jiffies + msecs_to_jiffies(timeout);
+		while ((readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+				& (1 << MTIP_TAG_INTERNAL))
+				&& time_before(jiffies, timeout)) {
+			if (mtip_check_surprise_removal(port->dd->pdev)) {
+				rv = -ENXIO;
+				goto exec_ic_exit;
+			}
+			if ((fis->command != ATA_CMD_STANDBYNOW1) &&
+				test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+						&port->dd->dd_flag)) {
+				rv = -ENXIO;
+				goto exec_ic_exit;
+			}
+			if (readl(port->mmio + PORT_IRQ_STAT) & PORT_IRQ_ERR) {
+				atomic_inc(&int_cmd->active); /* error */
+				break;
+			}
+		}
+	}
 
-		while ((readl(
-			port->cmd_issue[MTIP_TAG_INTERNAL])
-			& (1 << MTIP_TAG_INTERNAL))
-			&& time_before(jiffies, timeout))
-			;
-
-		if (readl(port->cmd_issue[MTIP_TAG_INTERNAL])
+	if (atomic_read(&int_cmd->active) > 1) {
+		dev_err(&port->dd->pdev->dev,
+			"Internal command [%02X] failed\n", fis->command);
+		rv = -EIO;
+	}
+	if (readl(port->cmd_issue[MTIP_TAG_INTERNAL])
 			& (1 << MTIP_TAG_INTERNAL)) {
-			dev_err(&port->dd->pdev->dev,
-				"Internal command did not complete [%d]\n",
-				atomic);
+		rv = -ENXIO;
+		if (!test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+					&port->dd->dd_flag)) {
+			mtip_restart_port(port);
 			rv = -EAGAIN;
 		}
 	}
-
+exec_ic_exit:
 	/* Clear the allocated and active bits for the internal command. */
 	atomic_set(&int_cmd->active, 0);
 	release_slot(port, MTIP_TAG_INTERNAL);
-	clear_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags);
+	if (rv >= 0 && mtip_pause_ncq(port, fis)) {
+		/* NCQ paused */
+		return rv;
+	}
+	clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
 	wake_up_interruptible(&port->svc_wait);
 
 	return rv;
@@ -1240,6 +1435,9 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
 	int rv = 0;
 	struct host_to_dev_fis fis;
 
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+		return -EFAULT;
+
 	/* Build the FIS. */
 	memset(&fis, 0, sizeof(struct host_to_dev_fis));
 	fis.type	= 0x27;
@@ -1313,6 +1511,7 @@ static int mtip_standby_immediate(struct mtip_port *port)
 {
 	int rv;
 	struct host_to_dev_fis	fis;
+	unsigned long start;
 
 	/* Build the FIS. */
 	memset(&fis, 0, sizeof(struct host_to_dev_fis));
@@ -1320,15 +1519,150 @@ static int mtip_standby_immediate(struct mtip_port *port)
 	fis.opts	= 1 << 7;
 	fis.command	= ATA_CMD_STANDBYNOW1;
 
-	/* Execute the command.  Use a 15-second timeout for large drives. */
+	start = jiffies;
 	rv = mtip_exec_internal_command(port,
 					&fis,
 					5,
 					0,
 					0,
 					0,
-					GFP_KERNEL,
+					GFP_ATOMIC,
 					15000);
+	dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n",
+			jiffies_to_msecs(jiffies - start));
+	if (rv)
+		dev_warn(&port->dd->pdev->dev,
+			"STANDBY IMMEDIATE command failed.\n");
+
+	return rv;
+}
+
+/*
+ * Issue a READ LOG EXT command to the device.
+ *
+ * @port	pointer to the port structure.
+ * @page	page number to fetch
+ * @buffer	pointer to buffer
+ * @buffer_dma	dma address corresponding to @buffer
+ * @sectors	page length to fetch, in sectors
+ *
+ * return value
+ *	@rv	return value from mtip_exec_internal_command()
+ */
+static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
+				dma_addr_t buffer_dma, unsigned int sectors)
+{
+	struct host_to_dev_fis fis;
+
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= ATA_CMD_READ_LOG_EXT;
+	fis.sect_count	= sectors & 0xFF;
+	fis.sect_cnt_ex	= (sectors >> 8) & 0xFF;
+	fis.lba_low	= page;
+	fis.lba_mid	= 0;
+	fis.device	= ATA_DEVICE_OBS;
+
+	memset(buffer, 0, sectors * ATA_SECT_SIZE);
+
+	return mtip_exec_internal_command(port,
+					&fis,
+					5,
+					buffer_dma,
+					sectors * ATA_SECT_SIZE,
+					0,
+					GFP_ATOMIC,
+					MTIP_INTERNAL_COMMAND_TIMEOUT_MS);
+}
+
+/*
+ * Issue a SMART READ DATA command to the device.
+ *
+ * @port	pointer to the port structure.
+ * @buffer	pointer to buffer
+ * @buffer_dma	dma address corresponding to @buffer
+ *
+ * return value
+ *	@rv	return value from mtip_exec_internal_command()
+ */
+static int mtip_get_smart_data(struct mtip_port *port, u8 *buffer,
+					dma_addr_t buffer_dma)
+{
+	struct host_to_dev_fis fis;
+
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= ATA_CMD_SMART;
+	fis.features	= 0xD0;
+	fis.sect_count	= 1;
+	fis.lba_mid	= 0x4F;
+	fis.lba_hi	= 0xC2;
+	fis.device	= ATA_DEVICE_OBS;
+
+	return mtip_exec_internal_command(port,
+					&fis,
+					5,
+					buffer_dma,
+					ATA_SECT_SIZE,
+					0,
+					GFP_ATOMIC,
+					15000);
+}
+
+/*
+ * Get the value of a smart attribute
+ *
+ * @port	pointer to the port structure
+ * @id		attribute number
+ * @attrib	pointer to return attrib information corresponding to @id
+ *
+ * return value
+ *	-EINVAL	NULL buffer passed or unsupported attribute @id.
+ *	-EPERM	Identify data not valid, SMART not supported or not enabled
+ */
+static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
+						struct smart_attr *attrib)
+{
+	int rv, i;
+	struct smart_attr *pattr;
+
+	if (!attrib)
+		return -EINVAL;
+
+	if (!port->identify_valid) {
+		dev_warn(&port->dd->pdev->dev, "IDENTIFY DATA not valid\n");
+		return -EPERM;
+	}
+	if (!(port->identify[82] & 0x1)) {
+		dev_warn(&port->dd->pdev->dev, "SMART not supported\n");
+		return -EPERM;
+	}
+	if (!(port->identify[85] & 0x1)) {
+		dev_warn(&port->dd->pdev->dev, "SMART not enabled\n");
+		return -EPERM;
+	}
+
+	memset(port->smart_buf, 0, ATA_SECT_SIZE);
+	rv = mtip_get_smart_data(port, port->smart_buf, port->smart_buf_dma);
+	if (rv) {
+		dev_warn(&port->dd->pdev->dev, "Failed to ge SMART data\n");
+		return rv;
+	}
+
+	pattr = (struct smart_attr *)(port->smart_buf + 2);
+	for (i = 0; i < 29; i++, pattr++)
+		if (pattr->attr_id == id) {
+			memcpy(attrib, pattr, sizeof(struct smart_attr));
+			break;
+		}
+
+	if (i == 29) {
+		dev_warn(&port->dd->pdev->dev,
+			"Query for invalid SMART attribute ID\n");
+		rv = -EINVAL;
+	}
 
 	return rv;
 }
@@ -1504,10 +1838,7 @@ static int exec_drive_task(struct mtip_port *port, u8 *command)
 	fis.cyl_hi	= command[5];
 	fis.device	= command[6] & ~0x10; /* Clear the dev bit*/
 
-
-	dbg_printk(MTIP_DRV_NAME "%s: User Command: cmd %x, feat %x, "
-		"nsect %x, sect %x, lcyl %x, "
-		"hcyl %x, sel %x\n",
+	dbg_printk(MTIP_DRV_NAME " %s: User Command: cmd %x, feat %x, nsect %x, sect %x, lcyl %x, hcyl %x, sel %x\n",
 		__func__,
 		command[0],
 		command[1],
@@ -1534,8 +1865,7 @@ static int exec_drive_task(struct mtip_port *port, u8 *command)
 	command[4] = reply->cyl_low;
 	command[5] = reply->cyl_hi;
 
-	dbg_printk(MTIP_DRV_NAME "%s: Completion Status: stat %x, "
-		"err %x , cyl_lo %x cyl_hi %x\n",
+	dbg_printk(MTIP_DRV_NAME " %s: Completion Status: stat %x, err %x , cyl_lo %x cyl_hi %x\n",
 		__func__,
 		command[0],
 		command[1],
@@ -1562,13 +1892,33 @@ static int exec_drive_command(struct mtip_port *port, u8 *command,
 				void __user *user_buffer)
 {
 	struct host_to_dev_fis	fis;
-	struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG);
+	struct host_to_dev_fis *reply;
+	u8 *buf = NULL;
+	dma_addr_t dma_addr = 0;
+	int rv = 0, xfer_sz = command[3];
+
+	if (xfer_sz) {
+		if (user_buffer)
+			return -EFAULT;
+
+		buf = dmam_alloc_coherent(&port->dd->pdev->dev,
+				ATA_SECT_SIZE * xfer_sz,
+				&dma_addr,
+				GFP_KERNEL);
+		if (!buf) {
+			dev_err(&port->dd->pdev->dev,
+				"Memory allocation failed (%d bytes)\n",
+				ATA_SECT_SIZE * xfer_sz);
+			return -ENOMEM;
+		}
+		memset(buf, 0, ATA_SECT_SIZE * xfer_sz);
+	}
 
 	/* Build the FIS. */
 	memset(&fis, 0, sizeof(struct host_to_dev_fis));
-	fis.type		= 0x27;
-	fis.opts		= 1 << 7;
-	fis.command		= command[0];
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= command[0];
 	fis.features	= command[2];
 	fis.sect_count	= command[3];
 	if (fis.command == ATA_CMD_SMART) {
@@ -1577,8 +1927,13 @@ static int exec_drive_command(struct mtip_port *port, u8 *command,
 		fis.cyl_hi	= 0xC2;
 	}
 
+	if (xfer_sz)
+		reply = (port->rxfis + RX_FIS_PIO_SETUP);
+	else
+		reply = (port->rxfis + RX_FIS_D2H_REG);
+
 	dbg_printk(MTIP_DRV_NAME
-		"%s: User Command: cmd %x, sect %x, "
+		" %s: User Command: cmd %x, sect %x, "
 		"feat %x, sectcnt %x\n",
 		__func__,
 		command[0],
@@ -1586,43 +1941,46 @@ static int exec_drive_command(struct mtip_port *port, u8 *command,
 		command[2],
 		command[3]);
 
-	memset(port->sector_buffer, 0x00, ATA_SECT_SIZE);
-
 	/* Execute the command. */
 	if (mtip_exec_internal_command(port,
 				&fis,
 				 5,
-				 port->sector_buffer_dma,
-				 (command[3] != 0) ? ATA_SECT_SIZE : 0,
+				 (xfer_sz ? dma_addr : 0),
+				 (xfer_sz ? ATA_SECT_SIZE * xfer_sz : 0),
 				 0,
 				 GFP_KERNEL,
 				 MTIP_IOCTL_COMMAND_TIMEOUT_MS)
 				 < 0) {
-		return -1;
+		rv = -EFAULT;
+		goto exit_drive_command;
 	}
 
 	/* Collect the completion status. */
 	command[0] = reply->command; /* Status*/
 	command[1] = reply->features; /* Error*/
-	command[2] = command[3];
+	command[2] = reply->sect_count;
 
 	dbg_printk(MTIP_DRV_NAME
-		"%s: Completion Status: stat %x, "
-		"err %x, cmd %x\n",
+		" %s: Completion Status: stat %x, "
+		"err %x, nsect %x\n",
 		__func__,
 		command[0],
 		command[1],
 		command[2]);
 
-	if (user_buffer && command[3]) {
+	if (xfer_sz) {
 		if (copy_to_user(user_buffer,
-				 port->sector_buffer,
+				 buf,
 				 ATA_SECT_SIZE * command[3])) {
-			return -EFAULT;
+			rv = -EFAULT;
+			goto exit_drive_command;
 		}
 	}
-
-	return 0;
+exit_drive_command:
+	if (buf)
+		dmam_free_coherent(&port->dd->pdev->dev,
+				ATA_SECT_SIZE * xfer_sz, buf, dma_addr);
+	return rv;
 }
 
 /*
@@ -1672,6 +2030,32 @@ static unsigned int implicit_sector(unsigned char command,
 	return rv;
 }
 
+static void mtip_set_timeout(struct host_to_dev_fis *fis, unsigned int *timeout)
+{
+	switch (fis->command) {
+	case ATA_CMD_DOWNLOAD_MICRO:
+		*timeout = 120000; /* 2 minutes */
+		break;
+	case ATA_CMD_SEC_ERASE_UNIT:
+	case 0xFC:
+		*timeout = 240000; /* 4 minutes */
+		break;
+	case ATA_CMD_STANDBYNOW1:
+		*timeout = 10000;  /* 10 seconds */
+		break;
+	case 0xF7:
+	case 0xFA:
+		*timeout = 60000;  /* 60 seconds */
+		break;
+	case ATA_CMD_SMART:
+		*timeout = 15000;  /* 15 seconds */
+		break;
+	default:
+		*timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS;
+		break;
+	}
+}
+
 /*
  * Executes a taskfile
  * See ide_taskfile_ioctl() for derivation
@@ -1692,7 +2076,7 @@ static int exec_drive_taskfile(struct driver_data *dd,
 	unsigned int taskin = 0;
 	unsigned int taskout = 0;
 	u8 nsect = 0;
-	unsigned int timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS;
+	unsigned int timeout;
 	unsigned int force_single_sector;
 	unsigned int transfer_size;
 	unsigned long task_file_data;
@@ -1810,9 +2194,10 @@ static int exec_drive_taskfile(struct driver_data *dd,
 	}
 
 	dbg_printk(MTIP_DRV_NAME
-		"taskfile: cmd %x, feat %x, nsect %x,"
+		" %s: cmd %x, feat %x, nsect %x,"
 		" sect/lbal %x, lcyl/lbam %x, hcyl/lbah %x,"
 		" head/dev %x\n",
+		__func__,
 		fis.command,
 		fis.features,
 		fis.sect_count,
@@ -1821,32 +2206,7 @@ static int exec_drive_taskfile(struct driver_data *dd,
 		fis.lba_hi,
 		fis.device);
 
-	switch (fis.command) {
-	case ATA_CMD_DOWNLOAD_MICRO:
-		/* Change timeout for Download Microcode to 60 seconds.*/
-		timeout = 60000;
-		break;
-	case ATA_CMD_SEC_ERASE_UNIT:
-		/* Change timeout for Security Erase Unit to 4 minutes.*/
-		timeout = 240000;
-		break;
-	case ATA_CMD_STANDBYNOW1:
-		/* Change timeout for standby immediate to 10 seconds.*/
-		timeout = 10000;
-		break;
-	case 0xF7:
-	case 0xFA:
-		/* Change timeout for vendor unique command to 10 secs */
-		timeout = 10000;
-		break;
-	case ATA_CMD_SMART:
-		/* Change timeout for vendor unique command to 10 secs */
-		timeout = 10000;
-		break;
-	default:
-		timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS;
-		break;
-	}
+	mtip_set_timeout(&fis, &timeout);
 
 	/* Determine the correct transfer size.*/
 	if (force_single_sector)
@@ -1903,18 +2263,8 @@ static int exec_drive_taskfile(struct driver_data *dd,
 		req_task->hob_ports[1] = reply->features_ex;
 		req_task->hob_ports[2] = reply->sect_cnt_ex;
 	}
-
-	/* Com rest after secure erase or lowlevel format */
-	if (((fis.command == ATA_CMD_SEC_ERASE_UNIT) ||
-		((fis.command == 0xFC) &&
-			(fis.features == 0x27 || fis.features == 0x72 ||
-			 fis.features == 0x62 || fis.features == 0x26))) &&
-			 !(reply->command & 1)) {
-		mtip_restart_port(dd->port);
-	}
-
 	dbg_printk(MTIP_DRV_NAME
-		"%s: Completion: stat %x,"
+		" %s: Completion: stat %x,"
 		"err %x, sect_cnt %x, lbalo %x,"
 		"lbamid %x, lbahi %x, dev %x\n",
 		__func__,
@@ -1973,13 +2323,12 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
 {
 	switch (cmd) {
 	case HDIO_GET_IDENTITY:
-		if (mtip_get_identify(dd->port, (void __user *) arg) < 0) {
-			dev_warn(&dd->pdev->dev,
-				"Unable to read identity\n");
-			return -EIO;
-		}
-
+	{
+		if (copy_to_user((void __user *)arg, dd->port->identify,
+						sizeof(u16) * ATA_ID_WORDS))
+			return -EFAULT;
 		break;
+	}
 	case HDIO_DRIVE_CMD:
 	{
 		u8 drive_command[4];
@@ -2080,14 +2429,10 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t start,
 	struct host_to_dev_fis	*fis;
 	struct mtip_port *port = dd->port;
 	struct mtip_cmd *command = &port->commands[tag];
+	int dma_dir = (dir == READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
 
 	/* Map the scatter list for DMA access */
-	if (dir == READ)
-		nents = dma_map_sg(&dd->pdev->dev, command->sg,
-					nents, DMA_FROM_DEVICE);
-	else
-		nents = dma_map_sg(&dd->pdev->dev, command->sg,
-					nents, DMA_TO_DEVICE);
+	nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir);
 
 	command->scatter_ents = nents;
 
@@ -2127,7 +2472,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t start,
 	 */
 	command->comp_data = dd;
 	command->comp_func = mtip_async_complete;
-	command->direction = (dir == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE);
+	command->direction = dma_dir;
 
 	/*
 	 * Set the completion function and data for the command passed
@@ -2140,19 +2485,16 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t start,
 	 * To prevent this command from being issued
 	 * if an internal command is in progress or error handling is active.
 	 */
-	if (unlikely(test_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags) ||
-			test_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags))) {
+	if (port->flags & MTIP_PF_PAUSE_IO) {
 		set_bit(tag, port->cmds_to_issue);
-		set_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags);
+		set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
 		return;
 	}
 
 	/* Issue the command to the hardware */
 	mtip_issue_ncq_command(port, tag);
 
-	/* Set the command's timeout value.*/
-	port->commands[tag].comp_time = jiffies + msecs_to_jiffies(
-					MTIP_NCQ_COMMAND_TIMEOUT_MS);
+	return;
 }
 
 /*
@@ -2191,8 +2533,14 @@ static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd,
 	down(&dd->port->cmd_slot);
 	*tag = get_slot(dd->port);
 
-	if (unlikely(*tag < 0))
+	if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) {
+		up(&dd->port->cmd_slot);
+		return NULL;
+	}
+	if (unlikely(*tag < 0)) {
+		up(&dd->port->cmd_slot);
 		return NULL;
+	}
 
 	return dd->port->commands[*tag].sg;
 }
@@ -2207,7 +2555,7 @@ static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd,
  * return value
  *	The size, in bytes, of the data copied into buf.
  */
-static ssize_t hw_show_registers(struct device *dev,
+static ssize_t mtip_hw_show_registers(struct device *dev,
 				struct device_attribute *attr,
 				char *buf)
 {
@@ -2216,44 +2564,97 @@ static ssize_t hw_show_registers(struct device *dev,
 	int size = 0;
 	int n;
 
-	size += sprintf(&buf[size], "%s:\ns_active:\n", __func__);
+	size += sprintf(&buf[size], "Hardware\n--------\n");
+	size += sprintf(&buf[size], "S ACTive      : [ 0x");
 
-	for (n = 0; n < dd->slot_groups; n++)
-		size += sprintf(&buf[size], "0x%08x\n",
+	for (n = dd->slot_groups-1; n >= 0; n--)
+		size += sprintf(&buf[size], "%08X ",
 					 readl(dd->port->s_active[n]));
 
-	size += sprintf(&buf[size], "Command Issue:\n");
+	size += sprintf(&buf[size], "]\n");
+	size += sprintf(&buf[size], "Command Issue : [ 0x");
 
-	for (n = 0; n < dd->slot_groups; n++)
-		size += sprintf(&buf[size], "0x%08x\n",
+	for (n = dd->slot_groups-1; n >= 0; n--)
+		size += sprintf(&buf[size], "%08X ",
 					readl(dd->port->cmd_issue[n]));
 
-	size += sprintf(&buf[size], "Allocated:\n");
+	size += sprintf(&buf[size], "]\n");
+	size += sprintf(&buf[size], "Completed     : [ 0x");
+
+	for (n = dd->slot_groups-1; n >= 0; n--)
+		size += sprintf(&buf[size], "%08X ",
+				readl(dd->port->completed[n]));
+
+	size += sprintf(&buf[size], "]\n");
+	size += sprintf(&buf[size], "PORT IRQ STAT : [ 0x%08X ]\n",
+				readl(dd->port->mmio + PORT_IRQ_STAT));
+	size += sprintf(&buf[size], "HOST IRQ STAT : [ 0x%08X ]\n",
+				readl(dd->mmio + HOST_IRQ_STAT));
+	size += sprintf(&buf[size], "\n");
+
+	size += sprintf(&buf[size], "Local\n-----\n");
+	size += sprintf(&buf[size], "Allocated    : [ 0x");
 
-	for (n = 0; n < dd->slot_groups; n++) {
+	for (n = dd->slot_groups-1; n >= 0; n--) {
 		if (sizeof(long) > sizeof(u32))
 			group_allocated =
 				dd->port->allocated[n/2] >> (32*(n&1));
 		else
 			group_allocated = dd->port->allocated[n];
-		size += sprintf(&buf[size], "0x%08x\n",
-				 group_allocated);
+		size += sprintf(&buf[size], "%08X ", group_allocated);
 	}
+	size += sprintf(&buf[size], "]\n");
 
-	size += sprintf(&buf[size], "completed:\n");
+	size += sprintf(&buf[size], "Commands in Q: [ 0x");
 
-	for (n = 0; n < dd->slot_groups; n++)
-		size += sprintf(&buf[size], "0x%08x\n",
-				readl(dd->port->completed[n]));
+	for (n = dd->slot_groups-1; n >= 0; n--) {
+		if (sizeof(long) > sizeof(u32))
+			group_allocated =
+				dd->port->cmds_to_issue[n/2] >> (32*(n&1));
+		else
+			group_allocated = dd->port->cmds_to_issue[n];
+		size += sprintf(&buf[size], "%08X ", group_allocated);
+	}
+	size += sprintf(&buf[size], "]\n");
 
-	size += sprintf(&buf[size], "PORT_IRQ_STAT 0x%08x\n",
-				readl(dd->port->mmio + PORT_IRQ_STAT));
-	size += sprintf(&buf[size], "HOST_IRQ_STAT 0x%08x\n",
-				readl(dd->mmio + HOST_IRQ_STAT));
+	return size;
+}
+
+static ssize_t mtip_hw_show_status(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	struct driver_data *dd = dev_to_disk(dev)->private_data;
+	int size = 0;
+
+	if (test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))
+		size += sprintf(buf, "%s", "thermal_shutdown\n");
+	else if (test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag))
+		size += sprintf(buf, "%s", "write_protect\n");
+	else
+		size += sprintf(buf, "%s", "online\n");
+
+	return size;
+}
+
+static ssize_t mtip_hw_show_flags(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	struct driver_data *dd = dev_to_disk(dev)->private_data;
+	int size = 0;
+
+	size += sprintf(&buf[size], "Flag in port struct : [ %08lX ]\n",
+							dd->port->flags);
+	size += sprintf(&buf[size], "Flag in dd struct   : [ %08lX ]\n",
+							dd->dd_flag);
 
 	return size;
 }
-static DEVICE_ATTR(registers, S_IRUGO, hw_show_registers, NULL);
+
+static DEVICE_ATTR(registers, S_IRUGO, mtip_hw_show_registers, NULL);
+static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL);
+static DEVICE_ATTR(flags, S_IRUGO, mtip_hw_show_flags, NULL);
 
 /*
  * Create the sysfs related attributes.
@@ -2272,7 +2673,13 @@ static int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj)
 
 	if (sysfs_create_file(kobj, &dev_attr_registers.attr))
 		dev_warn(&dd->pdev->dev,
-			"Error creating registers sysfs entry\n");
+			"Error creating 'registers' sysfs entry\n");
+	if (sysfs_create_file(kobj, &dev_attr_status.attr))
+		dev_warn(&dd->pdev->dev,
+			"Error creating 'status' sysfs entry\n");
+	if (sysfs_create_file(kobj, &dev_attr_flags.attr))
+		dev_warn(&dd->pdev->dev,
+			"Error creating 'flags' sysfs entry\n");
 	return 0;
 }
 
@@ -2292,6 +2699,8 @@ static int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj)
 		return -EINVAL;
 
 	sysfs_remove_file(kobj, &dev_attr_registers.attr);
+	sysfs_remove_file(kobj, &dev_attr_status.attr);
+	sysfs_remove_file(kobj, &dev_attr_flags.attr);
 
 	return 0;
 }
@@ -2384,10 +2793,12 @@ static int mtip_ftl_rebuild_poll(struct driver_data *dd)
 		"FTL rebuild in progress. Polling for completion.\n");
 
 	start = jiffies;
-	dd->ftlrebuildflag = 1;
 	timeout = jiffies + msecs_to_jiffies(MTIP_FTL_REBUILD_TIMEOUT_MS);
 
 	do {
+		if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+				&dd->dd_flag)))
+			return -EFAULT;
 		if (mtip_check_surprise_removal(dd->pdev))
 			return -EFAULT;
 
@@ -2408,22 +2819,17 @@ static int mtip_ftl_rebuild_poll(struct driver_data *dd)
 			dev_warn(&dd->pdev->dev,
 				"FTL rebuild complete (%d secs).\n",
 			jiffies_to_msecs(jiffies - start) / 1000);
-			dd->ftlrebuildflag = 0;
 			mtip_block_initialize(dd);
-			break;
+			return 0;
 		}
 		ssleep(10);
 	} while (time_before(jiffies, timeout));
 
 	/* Check for timeout */
-	if (dd->ftlrebuildflag) {
-		dev_err(&dd->pdev->dev,
+	dev_err(&dd->pdev->dev,
 		"Timed out waiting for FTL rebuild to complete (%d secs).\n",
 		jiffies_to_msecs(jiffies - start) / 1000);
-		return -EFAULT;
-	}
-
-	return 0;
+	return -EFAULT;
 }
 
 /*
@@ -2448,14 +2854,17 @@ static int mtip_service_thread(void *data)
 		 * is in progress nor error handling is active
 		 */
 		wait_event_interruptible(port->svc_wait, (port->flags) &&
-			!test_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags) &&
-			!test_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags));
+			!(port->flags & MTIP_PF_PAUSE_IO));
 
 		if (kthread_should_stop())
 			break;
 
-		set_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags);
-		if (test_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags)) {
+		if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+				&dd->dd_flag)))
+			break;
+
+		set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
+		if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
 			slot = 1;
 			/* used to restrict the loop to one iteration */
 			slot_start = num_cmd_slots;
@@ -2480,21 +2889,19 @@ static int mtip_service_thread(void *data)
 				/* Issue the command to the hardware */
 				mtip_issue_ncq_command(port, slot);
 
-				/* Set the command's timeout value.*/
-				port->commands[slot].comp_time = jiffies +
-				msecs_to_jiffies(MTIP_NCQ_COMMAND_TIMEOUT_MS);
-
 				clear_bit(slot, port->cmds_to_issue);
 			}
 
-			clear_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags);
-		} else if (test_bit(MTIP_FLAG_REBUILD_BIT, &port->flags)) {
-			mtip_ftl_rebuild_poll(dd);
-			clear_bit(MTIP_FLAG_REBUILD_BIT, &port->flags);
+			clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
+		} else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) {
+			if (!mtip_ftl_rebuild_poll(dd))
+				set_bit(MTIP_DDF_REBUILD_FAILED_BIT,
+							&dd->dd_flag);
+			clear_bit(MTIP_PF_REBUILD_BIT, &port->flags);
 		}
-		clear_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags);
+		clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
 
-		if (test_bit(MTIP_FLAG_SVC_THD_SHOULD_STOP_BIT, &port->flags))
+		if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
 			break;
 	}
 	return 0;
@@ -2513,6 +2920,9 @@ static int mtip_hw_init(struct driver_data *dd)
 	int i;
 	int rv;
 	unsigned int num_command_slots;
+	unsigned long timeout, timetaken;
+	unsigned char *buf;
+	struct smart_attr attr242;
 
 	dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR];
 
@@ -2547,7 +2957,7 @@ static int mtip_hw_init(struct driver_data *dd)
 	/* Allocate memory for the command list. */
 	dd->port->command_list =
 		dmam_alloc_coherent(&dd->pdev->dev,
-			HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2),
+			HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
 			&dd->port->command_list_dma,
 			GFP_KERNEL);
 	if (!dd->port->command_list) {
@@ -2560,7 +2970,7 @@ static int mtip_hw_init(struct driver_data *dd)
 	/* Clear the memory we have allocated. */
 	memset(dd->port->command_list,
 		0,
-		HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2));
+		HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4));
 
 	/* Setup the addresse of the RX FIS. */
 	dd->port->rxfis	    = dd->port->command_list + HW_CMD_SLOT_SZ;
@@ -2576,10 +2986,19 @@ static int mtip_hw_init(struct driver_data *dd)
 	dd->port->identify_dma = dd->port->command_tbl_dma +
 					HW_CMD_TBL_AR_SZ;
 
-	/* Setup the address of the sector buffer. */
+	/* Setup the address of the sector buffer - for some non-ncq cmds */
 	dd->port->sector_buffer	= (void *) dd->port->identify + ATA_SECT_SIZE;
 	dd->port->sector_buffer_dma = dd->port->identify_dma + ATA_SECT_SIZE;
 
+	/* Setup the address of the log buf - for read log command */
+	dd->port->log_buf = (void *)dd->port->sector_buffer  + ATA_SECT_SIZE;
+	dd->port->log_buf_dma = dd->port->sector_buffer_dma + ATA_SECT_SIZE;
+
+	/* Setup the address of the smart buf - for smart read data command */
+	dd->port->smart_buf = (void *)dd->port->log_buf  + ATA_SECT_SIZE;
+	dd->port->smart_buf_dma = dd->port->log_buf_dma + ATA_SECT_SIZE;
+
+
 	/* Point the command headers at the command tables. */
 	for (i = 0; i < num_command_slots; i++) {
 		dd->port->commands[i].command_header =
@@ -2623,14 +3042,43 @@ static int mtip_hw_init(struct driver_data *dd)
 			dd->port->mmio + i*0x80 + PORT_SDBV;
 	}
 
-	/* Reset the HBA. */
-	if (mtip_hba_reset(dd) < 0) {
-		dev_err(&dd->pdev->dev,
-			"Card did not reset within timeout\n");
-		rv = -EIO;
+	timetaken = jiffies;
+	timeout = jiffies + msecs_to_jiffies(30000);
+	while (((readl(dd->port->mmio + PORT_SCR_STAT) & 0x0F) != 0x03) &&
+		 time_before(jiffies, timeout)) {
+		mdelay(100);
+	}
+	if (unlikely(mtip_check_surprise_removal(dd->pdev))) {
+		timetaken = jiffies - timetaken;
+		dev_warn(&dd->pdev->dev,
+			"Surprise removal detected at %u ms\n",
+			jiffies_to_msecs(timetaken));
+		rv = -ENODEV;
+		goto out2 ;
+	}
+	if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) {
+		timetaken = jiffies - timetaken;
+		dev_warn(&dd->pdev->dev,
+			"Removal detected at %u ms\n",
+			jiffies_to_msecs(timetaken));
+		rv = -EFAULT;
 		goto out2;
 	}
 
+	/* Conditionally reset the HBA. */
+	if (!(readl(dd->mmio + HOST_CAP) & HOST_CAP_NZDMA)) {
+		if (mtip_hba_reset(dd) < 0) {
+			dev_err(&dd->pdev->dev,
+				"Card did not reset within timeout\n");
+			rv = -EIO;
+			goto out2;
+		}
+	} else {
+		/* Clear any pending interrupts on the HBA */
+		writel(readl(dd->mmio + HOST_IRQ_STAT),
+			dd->mmio + HOST_IRQ_STAT);
+	}
+
 	mtip_init_port(dd->port);
 	mtip_start_port(dd->port);
 
@@ -2660,6 +3108,12 @@ static int mtip_hw_init(struct driver_data *dd)
 	mod_timer(&dd->port->cmd_timer,
 		jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD));
 
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) {
+		rv = -EFAULT;
+		goto out3;
+	}
+
 	if (mtip_get_identify(dd->port, NULL) < 0) {
 		rv = -EFAULT;
 		goto out3;
@@ -2667,10 +3121,47 @@ static int mtip_hw_init(struct driver_data *dd)
 
 	if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
 		MTIP_FTL_REBUILD_MAGIC) {
-		set_bit(MTIP_FLAG_REBUILD_BIT, &dd->port->flags);
+		set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags);
 		return MTIP_FTL_REBUILD_MAGIC;
 	}
 	mtip_dump_identify(dd->port);
+
+	/* check write protect, over temp and rebuild statuses */
+	rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
+				dd->port->log_buf,
+				dd->port->log_buf_dma, 1);
+	if (rv) {
+		dev_warn(&dd->pdev->dev,
+			"Error in READ LOG EXT (10h) command\n");
+		/* non-critical error, don't fail the load */
+	} else {
+		buf = (unsigned char *)dd->port->log_buf;
+		if (buf[259] & 0x1) {
+			dev_info(&dd->pdev->dev,
+				"Write protect bit is set.\n");
+			set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
+		}
+		if (buf[288] == 0xF7) {
+			dev_info(&dd->pdev->dev,
+				"Exceeded Tmax, drive in thermal shutdown.\n");
+			set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
+		}
+		if (buf[288] == 0xBF) {
+			dev_info(&dd->pdev->dev,
+				"Drive indicates rebuild has failed.\n");
+			/* TODO */
+		}
+	}
+
+	/* get write protect progess */
+	memset(&attr242, 0, sizeof(struct smart_attr));
+	if (mtip_get_smart_attr(dd->port, 242, &attr242))
+		dev_warn(&dd->pdev->dev,
+				"Unable to check write protect progress\n");
+	else
+		dev_info(&dd->pdev->dev,
+				"Write protect progress: %d%% (%d blocks)\n",
+				attr242.cur, attr242.data);
 	return rv;
 
 out3:
@@ -2688,7 +3179,7 @@ out2:
 
 	/* Free the command/command header memory. */
 	dmam_free_coherent(&dd->pdev->dev,
-				HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2),
+				HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
 				dd->port->command_list,
 				dd->port->command_list_dma);
 out1:
@@ -2712,9 +3203,12 @@ static int mtip_hw_exit(struct driver_data *dd)
 	 * Send standby immediate (E0h) to the drive so that it
 	 * saves its state.
 	 */
-	if (atomic_read(&dd->drv_cleanup_done) != true) {
+	if (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) {
 
-		mtip_standby_immediate(dd->port);
+		if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags))
+			if (mtip_standby_immediate(dd->port))
+				dev_warn(&dd->pdev->dev,
+					"STANDBY IMMEDIATE failed\n");
 
 		/* de-initialize the port. */
 		mtip_deinit_port(dd->port);
@@ -2734,7 +3228,7 @@ static int mtip_hw_exit(struct driver_data *dd)
 
 	/* Free the command/command header memory. */
 	dmam_free_coherent(&dd->pdev->dev,
-			HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2),
+			HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4),
 			dd->port->command_list,
 			dd->port->command_list_dma);
 	/* Free the memory allocated for the for structure. */
@@ -2892,6 +3386,9 @@ static int mtip_block_ioctl(struct block_device *dev,
 	if (!dd)
 		return -ENOTTY;
 
+	if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)))
+		return -ENOTTY;
+
 	switch (cmd) {
 	case BLKFLSBUF:
 		return -ENOTTY;
@@ -2927,6 +3424,9 @@ static int mtip_block_compat_ioctl(struct block_device *dev,
 	if (!dd)
 		return -ENOTTY;
 
+	if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)))
+		return -ENOTTY;
+
 	switch (cmd) {
 	case BLKFLSBUF:
 		return -ENOTTY;
@@ -3049,6 +3549,24 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
 	int nents = 0;
 	int tag = 0;
 
+	if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) {
+		if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+							&dd->dd_flag))) {
+			bio_endio(bio, -ENXIO);
+			return;
+		}
+		if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) {
+			bio_endio(bio, -ENODATA);
+			return;
+		}
+		if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT,
+							&dd->dd_flag) &&
+				bio_data_dir(bio))) {
+			bio_endio(bio, -ENODATA);
+			return;
+		}
+	}
+
 	if (unlikely(!bio_has_data(bio))) {
 		blk_queue_flush(queue, 0);
 		bio_endio(bio, 0);
@@ -3061,7 +3579,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio)
 
 		if (unlikely((bio)->bi_vcnt > MTIP_MAX_SG)) {
 			dev_warn(&dd->pdev->dev,
-				"Maximum number of SGL entries exceeded");
+				"Maximum number of SGL entries exceeded\n");
 			bio_io_error(bio);
 			mtip_hw_release_scatterlist(dd, tag);
 			return;
@@ -3181,7 +3699,10 @@ skip_create_disk:
 	set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags);
 	blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
 	blk_queue_physical_block_size(dd->queue, 4096);
+	blk_queue_max_hw_sectors(dd->queue, 0xffff);
+	blk_queue_max_segment_size(dd->queue, 0x400000);
 	blk_queue_io_min(dd->queue, 4096);
+
 	/*
 	 * write back cache is not supported in the device. FUA depends on
 	 * write back cache support, hence setting flush support to zero.
@@ -3210,8 +3731,10 @@ skip_create_disk:
 		kobject_put(kobj);
 	}
 
-	if (dd->mtip_svc_handler)
+	if (dd->mtip_svc_handler) {
+		set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
 		return rv; /* service thread created for handling rebuild */
+	}
 
 start_service_thread:
 	sprintf(thd_name, "mtip_svc_thd_%02d", index);
@@ -3220,12 +3743,15 @@ start_service_thread:
 						dd, thd_name);
 
 	if (IS_ERR(dd->mtip_svc_handler)) {
-		printk(KERN_ERR "mtip32xx: service thread failed to start\n");
+		dev_err(&dd->pdev->dev, "service thread failed to start\n");
 		dd->mtip_svc_handler = NULL;
 		rv = -EFAULT;
 		goto kthread_run_error;
 	}
 
+	if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
+		rv = wait_for_rebuild;
+
 	return rv;
 
 kthread_run_error:
@@ -3266,16 +3792,18 @@ static int mtip_block_remove(struct driver_data *dd)
 	struct kobject *kobj;
 
 	if (dd->mtip_svc_handler) {
-		set_bit(MTIP_FLAG_SVC_THD_SHOULD_STOP_BIT, &dd->port->flags);
+		set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags);
 		wake_up_interruptible(&dd->port->svc_wait);
 		kthread_stop(dd->mtip_svc_handler);
 	}
 
-	/* Clean up the sysfs attributes managed by the protocol layer. */
-	kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
-	if (kobj) {
-		mtip_hw_sysfs_exit(dd, kobj);
-		kobject_put(kobj);
+	/* Clean up the sysfs attributes, if created */
+	if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) {
+		kobj = kobject_get(&disk_to_dev(dd->disk)->kobj);
+		if (kobj) {
+			mtip_hw_sysfs_exit(dd, kobj);
+			kobject_put(kobj);
+		}
 	}
 
 	/*
@@ -3283,6 +3811,11 @@ static int mtip_block_remove(struct driver_data *dd)
 	 * from /dev
 	 */
 	del_gendisk(dd->disk);
+
+	spin_lock(&rssd_index_lock);
+	ida_remove(&rssd_index_ida, dd->index);
+	spin_unlock(&rssd_index_lock);
+
 	blk_cleanup_queue(dd->queue);
 	dd->disk  = NULL;
 	dd->queue = NULL;
@@ -3312,6 +3845,11 @@ static int mtip_block_shutdown(struct driver_data *dd)
 
 	/* Delete our gendisk structure, and cleanup the blk queue. */
 	del_gendisk(dd->disk);
+
+	spin_lock(&rssd_index_lock);
+	ida_remove(&rssd_index_ida, dd->index);
+	spin_unlock(&rssd_index_lock);
+
 	blk_cleanup_queue(dd->queue);
 	dd->disk  = NULL;
 	dd->queue = NULL;
@@ -3359,11 +3897,6 @@ static int mtip_pci_probe(struct pci_dev *pdev,
 		return -ENOMEM;
 	}
 
-	/* Set the atomic variable as 1 in case of SRSI */
-	atomic_set(&dd->drv_cleanup_done, true);
-
-	atomic_set(&dd->resumeflag, false);
-
 	/* Attach the private data to this PCI device.  */
 	pci_set_drvdata(pdev, dd);
 
@@ -3420,7 +3953,8 @@ static int mtip_pci_probe(struct pci_dev *pdev,
 	 * instance number.
 	 */
 	instance++;
-
+	if (rv != MTIP_FTL_REBUILD_MAGIC)
+		set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
 	goto done;
 
 block_initialize_err:
@@ -3434,9 +3968,6 @@ iomap_err:
 	pci_set_drvdata(pdev, NULL);
 	return rv;
 done:
-	/* Set the atomic variable as 0 in case of SRSI */
-	atomic_set(&dd->drv_cleanup_done, true);
-
 	return rv;
 }
 
@@ -3452,8 +3983,10 @@ static void mtip_pci_remove(struct pci_dev *pdev)
 	struct driver_data *dd = pci_get_drvdata(pdev);
 	int counter = 0;
 
+	set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag);
+
 	if (mtip_check_surprise_removal(pdev)) {
-		while (atomic_read(&dd->drv_cleanup_done) == false) {
+		while (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) {
 			counter++;
 			msleep(20);
 			if (counter == 10) {
@@ -3463,8 +3996,6 @@ static void mtip_pci_remove(struct pci_dev *pdev)
 			}
 		}
 	}
-	/* Set the atomic variable as 1 in case of SRSI */
-	atomic_set(&dd->drv_cleanup_done, true);
 
 	/* Clean up the block layer. */
 	mtip_block_remove(dd);
@@ -3493,7 +4024,7 @@ static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg)
 		return -EFAULT;
 	}
 
-	atomic_set(&dd->resumeflag, true);
+	set_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag);
 
 	/* Disable ports & interrupts then send standby immediate */
 	rv = mtip_block_suspend(dd);
@@ -3559,7 +4090,7 @@ static int mtip_pci_resume(struct pci_dev *pdev)
 		dev_err(&pdev->dev, "Unable to resume\n");
 
 err:
-	atomic_set(&dd->resumeflag, false);
+	clear_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag);
 
 	return rv;
 }
@@ -3608,18 +4139,25 @@ MODULE_DEVICE_TABLE(pci, mtip_pci_tbl);
  */
 static int __init mtip_init(void)
 {
+	int error;
+
 	printk(KERN_INFO MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n");
 
 	/* Allocate a major block device number to use with this driver. */
-	mtip_major = register_blkdev(0, MTIP_DRV_NAME);
-	if (mtip_major < 0) {
+	error = register_blkdev(0, MTIP_DRV_NAME);
+	if (error <= 0) {
 		printk(KERN_ERR "Unable to register block device (%d)\n",
-		mtip_major);
+		error);
 		return -EBUSY;
 	}
+	mtip_major = error;
 
 	/* Register our PCI operations. */
-	return pci_register_driver(&mtip_pci_driver);
+	error = pci_register_driver(&mtip_pci_driver);
+	if (error)
+		unregister_blkdev(mtip_major, MTIP_DRV_NAME);
+
+	return error;
 }
 
 /*
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
index e0554a8f2233..b2c88da26b2a 100644
--- a/drivers/block/mtip32xx/mtip32xx.h
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -34,8 +34,8 @@
 /* offset of Device Control register in PCIe extended capabilites space */
 #define PCIE_CONFIG_EXT_DEVICE_CONTROL_OFFSET	0x48
 
-/* # of times to retry timed out IOs */
-#define MTIP_MAX_RETRIES	5
+/* # of times to retry timed out/failed IOs */
+#define MTIP_MAX_RETRIES	2
 
 /* Various timeout values in ms */
 #define MTIP_NCQ_COMMAND_TIMEOUT_MS       5000
@@ -113,13 +113,44 @@
 
 #define __force_bit2int (unsigned int __force)
 
-/* below are bit numbers in 'flags' defined in mtip_port */
-#define MTIP_FLAG_IC_ACTIVE_BIT			0
-#define MTIP_FLAG_EH_ACTIVE_BIT			1
-#define MTIP_FLAG_SVC_THD_ACTIVE_BIT		2
-#define MTIP_FLAG_ISSUE_CMDS_BIT		4
-#define MTIP_FLAG_REBUILD_BIT			5
-#define MTIP_FLAG_SVC_THD_SHOULD_STOP_BIT	8
+enum {
+	/* below are bit numbers in 'flags' defined in mtip_port */
+	MTIP_PF_IC_ACTIVE_BIT       = 0, /* pio/ioctl */
+	MTIP_PF_EH_ACTIVE_BIT       = 1, /* error handling */
+	MTIP_PF_SE_ACTIVE_BIT       = 2, /* secure erase */
+	MTIP_PF_DM_ACTIVE_BIT       = 3, /* download microcde */
+	MTIP_PF_PAUSE_IO      =	((1 << MTIP_PF_IC_ACTIVE_BIT) | \
+				(1 << MTIP_PF_EH_ACTIVE_BIT) | \
+				(1 << MTIP_PF_SE_ACTIVE_BIT) | \
+				(1 << MTIP_PF_DM_ACTIVE_BIT)),
+
+	MTIP_PF_SVC_THD_ACTIVE_BIT  = 4,
+	MTIP_PF_ISSUE_CMDS_BIT      = 5,
+	MTIP_PF_REBUILD_BIT         = 6,
+	MTIP_PF_SVC_THD_STOP_BIT    = 8,
+
+	/* below are bit numbers in 'dd_flag' defined in driver_data */
+	MTIP_DDF_REMOVE_PENDING_BIT = 1,
+	MTIP_DDF_OVER_TEMP_BIT      = 2,
+	MTIP_DDF_WRITE_PROTECT_BIT  = 3,
+	MTIP_DDF_STOP_IO      = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | \
+				(1 << MTIP_DDF_OVER_TEMP_BIT) | \
+				(1 << MTIP_DDF_WRITE_PROTECT_BIT)),
+
+	MTIP_DDF_CLEANUP_BIT        = 5,
+	MTIP_DDF_RESUME_BIT         = 6,
+	MTIP_DDF_INIT_DONE_BIT      = 7,
+	MTIP_DDF_REBUILD_FAILED_BIT = 8,
+};
+
+__packed struct smart_attr{
+	u8 attr_id;
+	u16 flags;
+	u8 cur;
+	u8 worst;
+	u32 data;
+	u8 res[3];
+};
 
 /* Register Frame Information Structure (FIS), host to device. */
 struct host_to_dev_fis {
@@ -345,6 +376,12 @@ struct mtip_port {
 	 * when the command slot and all associated data structures
 	 * are no longer needed.
 	 */
+	u16 *log_buf;
+	dma_addr_t log_buf_dma;
+
+	u8 *smart_buf;
+	dma_addr_t smart_buf_dma;
+
 	unsigned long allocated[SLOTBITS_IN_LONGS];
 	/*
 	 * used to queue commands when an internal command is in progress
@@ -368,6 +405,7 @@ struct mtip_port {
 	 * Timer used to complete commands that have been active for too long.
 	 */
 	struct timer_list cmd_timer;
+	unsigned long ic_pause_timer;
 	/*
 	 * Semaphore used to block threads if there are no
 	 * command slots available.
@@ -404,13 +442,9 @@ struct driver_data {
 
 	unsigned slot_groups; /* number of slot groups the product supports */
 
-	atomic_t drv_cleanup_done; /* Atomic variable for SRSI */
-
 	unsigned long index; /* Index to determine the disk name */
 
-	unsigned int ftlrebuildflag; /* FTL rebuild flag */
-
-	atomic_t resumeflag; /* Atomic variable to track suspend/resume */
+	unsigned long dd_flag; /* NOTE: use atomic bit operations on this */
 
 	struct task_struct *mtip_svc_handler; /* task_struct of svc thd */
 };
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index c3f0ee16594d..061427a75d37 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -34,12 +34,11 @@
 #include <linux/kthread.h>
 
 #include <asm/uaccess.h>
-#include <asm/system.h>
 #include <asm/types.h>
 
 #include <linux/nbd.h>
 
-#define LO_MAGIC 0x68797548
+#define NBD_MAGIC 0x68797548
 
 #ifdef NDEBUG
 #define dprintk(flags, fmt...)
@@ -116,7 +115,7 @@ static void nbd_end_request(struct request *req)
 	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
-static void sock_shutdown(struct nbd_device *lo, int lock)
+static void sock_shutdown(struct nbd_device *nbd, int lock)
 {
 	/* Forcibly shutdown the socket causing all listeners
 	 * to error
@@ -125,14 +124,14 @@ static void sock_shutdown(struct nbd_device *lo, int lock)
 	 * there should be a more generic interface rather than
 	 * calling socket ops directly here */
 	if (lock)
-		mutex_lock(&lo->tx_lock);
-	if (lo->sock) {
-		dev_warn(disk_to_dev(lo->disk), "shutting down socket\n");
-		kernel_sock_shutdown(lo->sock, SHUT_RDWR);
-		lo->sock = NULL;
+		mutex_lock(&nbd->tx_lock);
+	if (nbd->sock) {
+		dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n");
+		kernel_sock_shutdown(nbd->sock, SHUT_RDWR);
+		nbd->sock = NULL;
 	}
 	if (lock)
-		mutex_unlock(&lo->tx_lock);
+		mutex_unlock(&nbd->tx_lock);
 }
 
 static void nbd_xmit_timeout(unsigned long arg)
@@ -147,17 +146,17 @@ static void nbd_xmit_timeout(unsigned long arg)
 /*
  *  Send or receive packet.
  */
-static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
+static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size,
 		int msg_flags)
 {
-	struct socket *sock = lo->sock;
+	struct socket *sock = nbd->sock;
 	int result;
 	struct msghdr msg;
 	struct kvec iov;
 	sigset_t blocked, oldset;
 
 	if (unlikely(!sock)) {
-		dev_err(disk_to_dev(lo->disk),
+		dev_err(disk_to_dev(nbd->disk),
 			"Attempted %s on closed socket in sock_xmit\n",
 			(send ? "send" : "recv"));
 		return -EINVAL;
@@ -181,15 +180,15 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 		if (send) {
 			struct timer_list ti;
 
-			if (lo->xmit_timeout) {
+			if (nbd->xmit_timeout) {
 				init_timer(&ti);
 				ti.function = nbd_xmit_timeout;
 				ti.data = (unsigned long)current;
-				ti.expires = jiffies + lo->xmit_timeout;
+				ti.expires = jiffies + nbd->xmit_timeout;
 				add_timer(&ti);
 			}
 			result = kernel_sendmsg(sock, &msg, &iov, 1, size);
-			if (lo->xmit_timeout)
+			if (nbd->xmit_timeout)
 				del_timer_sync(&ti);
 		} else
 			result = kernel_recvmsg(sock, &msg, &iov, 1, size,
@@ -201,7 +200,7 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 				task_pid_nr(current), current->comm,
 				dequeue_signal_lock(current, &current->blocked, &info));
 			result = -EINTR;
-			sock_shutdown(lo, !send);
+			sock_shutdown(nbd, !send);
 			break;
 		}
 
@@ -219,18 +218,19 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size,
 	return result;
 }
 
-static inline int sock_send_bvec(struct nbd_device *lo, struct bio_vec *bvec,
+static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec,
 		int flags)
 {
 	int result;
 	void *kaddr = kmap(bvec->bv_page);
-	result = sock_xmit(lo, 1, kaddr + bvec->bv_offset, bvec->bv_len, flags);
+	result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset,
+			   bvec->bv_len, flags);
 	kunmap(bvec->bv_page);
 	return result;
 }
 
 /* always call with the tx_lock held */
-static int nbd_send_req(struct nbd_device *lo, struct request *req)
+static int nbd_send_req(struct nbd_device *nbd, struct request *req)
 {
 	int result, flags;
 	struct nbd_request request;
@@ -243,14 +243,14 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
 	memcpy(request.handle, &req, sizeof(req));
 
 	dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n",
-			lo->disk->disk_name, req,
+			nbd->disk->disk_name, req,
 			nbdcmd_to_ascii(nbd_cmd(req)),
 			(unsigned long long)blk_rq_pos(req) << 9,
 			blk_rq_bytes(req));
-	result = sock_xmit(lo, 1, &request, sizeof(request),
+	result = sock_xmit(nbd, 1, &request, sizeof(request),
 			(nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0);
 	if (result <= 0) {
-		dev_err(disk_to_dev(lo->disk),
+		dev_err(disk_to_dev(nbd->disk),
 			"Send control failed (result %d)\n", result);
 		goto error_out;
 	}
@@ -267,10 +267,10 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req)
 			if (!rq_iter_last(req, iter))
 				flags = MSG_MORE;
 			dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n",
-					lo->disk->disk_name, req, bvec->bv_len);
-			result = sock_send_bvec(lo, bvec, flags);
+					nbd->disk->disk_name, req, bvec->bv_len);
+			result = sock_send_bvec(nbd, bvec, flags);
 			if (result <= 0) {
-				dev_err(disk_to_dev(lo->disk),
+				dev_err(disk_to_dev(nbd->disk),
 					"Send data failed (result %d)\n",
 					result);
 				goto error_out;
@@ -283,25 +283,25 @@ error_out:
 	return -EIO;
 }
 
-static struct request *nbd_find_request(struct nbd_device *lo,
+static struct request *nbd_find_request(struct nbd_device *nbd,
 					struct request *xreq)
 {
 	struct request *req, *tmp;
 	int err;
 
-	err = wait_event_interruptible(lo->active_wq, lo->active_req != xreq);
+	err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq);
 	if (unlikely(err))
 		goto out;
 
-	spin_lock(&lo->queue_lock);
-	list_for_each_entry_safe(req, tmp, &lo->queue_head, queuelist) {
+	spin_lock(&nbd->queue_lock);
+	list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) {
 		if (req != xreq)
 			continue;
 		list_del_init(&req->queuelist);
-		spin_unlock(&lo->queue_lock);
+		spin_unlock(&nbd->queue_lock);
 		return req;
 	}
-	spin_unlock(&lo->queue_lock);
+	spin_unlock(&nbd->queue_lock);
 
 	err = -ENOENT;
 
@@ -309,78 +309,78 @@ out:
 	return ERR_PTR(err);
 }
 
-static inline int sock_recv_bvec(struct nbd_device *lo, struct bio_vec *bvec)
+static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec)
 {
 	int result;
 	void *kaddr = kmap(bvec->bv_page);
-	result = sock_xmit(lo, 0, kaddr + bvec->bv_offset, bvec->bv_len,
+	result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, bvec->bv_len,
 			MSG_WAITALL);
 	kunmap(bvec->bv_page);
 	return result;
 }
 
 /* NULL returned = something went wrong, inform userspace */
-static struct request *nbd_read_stat(struct nbd_device *lo)
+static struct request *nbd_read_stat(struct nbd_device *nbd)
 {
 	int result;
 	struct nbd_reply reply;
 	struct request *req;
 
 	reply.magic = 0;
-	result = sock_xmit(lo, 0, &reply, sizeof(reply), MSG_WAITALL);
+	result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL);
 	if (result <= 0) {
-		dev_err(disk_to_dev(lo->disk),
+		dev_err(disk_to_dev(nbd->disk),
 			"Receive control failed (result %d)\n", result);
 		goto harderror;
 	}
 
 	if (ntohl(reply.magic) != NBD_REPLY_MAGIC) {
-		dev_err(disk_to_dev(lo->disk), "Wrong magic (0x%lx)\n",
+		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
 				(unsigned long)ntohl(reply.magic));
 		result = -EPROTO;
 		goto harderror;
 	}
 
-	req = nbd_find_request(lo, *(struct request **)reply.handle);
+	req = nbd_find_request(nbd, *(struct request **)reply.handle);
 	if (IS_ERR(req)) {
 		result = PTR_ERR(req);
 		if (result != -ENOENT)
 			goto harderror;
 
-		dev_err(disk_to_dev(lo->disk), "Unexpected reply (%p)\n",
+		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n",
 			reply.handle);
 		result = -EBADR;
 		goto harderror;
 	}
 
 	if (ntohl(reply.error)) {
-		dev_err(disk_to_dev(lo->disk), "Other side returned error (%d)\n",
+		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
 			ntohl(reply.error));
 		req->errors++;
 		return req;
 	}
 
 	dprintk(DBG_RX, "%s: request %p: got reply\n",
-			lo->disk->disk_name, req);
+			nbd->disk->disk_name, req);
 	if (nbd_cmd(req) == NBD_CMD_READ) {
 		struct req_iterator iter;
 		struct bio_vec *bvec;
 
 		rq_for_each_segment(bvec, req, iter) {
-			result = sock_recv_bvec(lo, bvec);
+			result = sock_recv_bvec(nbd, bvec);
 			if (result <= 0) {
-				dev_err(disk_to_dev(lo->disk), "Receive data failed (result %d)\n",
+				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
 					result);
 				req->errors++;
 				return req;
 			}
 			dprintk(DBG_RX, "%s: request %p: got %d bytes data\n",
-				lo->disk->disk_name, req, bvec->bv_len);
+				nbd->disk->disk_name, req, bvec->bv_len);
 		}
 	}
 	return req;
 harderror:
-	lo->harderror = result;
+	nbd->harderror = result;
 	return NULL;
 }
 
@@ -398,48 +398,48 @@ static struct device_attribute pid_attr = {
 	.show = pid_show,
 };
 
-static int nbd_do_it(struct nbd_device *lo)
+static int nbd_do_it(struct nbd_device *nbd)
 {
 	struct request *req;
 	int ret;
 
-	BUG_ON(lo->magic != LO_MAGIC);
+	BUG_ON(nbd->magic != NBD_MAGIC);
 
-	lo->pid = task_pid_nr(current);
-	ret = device_create_file(disk_to_dev(lo->disk), &pid_attr);
+	nbd->pid = task_pid_nr(current);
+	ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
 	if (ret) {
-		dev_err(disk_to_dev(lo->disk), "device_create_file failed!\n");
-		lo->pid = 0;
+		dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n");
+		nbd->pid = 0;
 		return ret;
 	}
 
-	while ((req = nbd_read_stat(lo)) != NULL)
+	while ((req = nbd_read_stat(nbd)) != NULL)
 		nbd_end_request(req);
 
-	device_remove_file(disk_to_dev(lo->disk), &pid_attr);
-	lo->pid = 0;
+	device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
+	nbd->pid = 0;
 	return 0;
 }
 
-static void nbd_clear_que(struct nbd_device *lo)
+static void nbd_clear_que(struct nbd_device *nbd)
 {
 	struct request *req;
 
-	BUG_ON(lo->magic != LO_MAGIC);
+	BUG_ON(nbd->magic != NBD_MAGIC);
 
 	/*
-	 * Because we have set lo->sock to NULL under the tx_lock, all
+	 * Because we have set nbd->sock to NULL under the tx_lock, all
 	 * modifications to the list must have completed by now.  For
 	 * the same reason, the active_req must be NULL.
 	 *
 	 * As a consequence, we don't need to take the spin lock while
 	 * purging the list here.
 	 */
-	BUG_ON(lo->sock);
-	BUG_ON(lo->active_req);
+	BUG_ON(nbd->sock);
+	BUG_ON(nbd->active_req);
 
-	while (!list_empty(&lo->queue_head)) {
-		req = list_entry(lo->queue_head.next, struct request,
+	while (!list_empty(&nbd->queue_head)) {
+		req = list_entry(nbd->queue_head.next, struct request,
 				 queuelist);
 		list_del_init(&req->queuelist);
 		req->errors++;
@@ -448,7 +448,7 @@ static void nbd_clear_que(struct nbd_device *lo)
 }
 
 
-static void nbd_handle_req(struct nbd_device *lo, struct request *req)
+static void nbd_handle_req(struct nbd_device *nbd, struct request *req)
 {
 	if (req->cmd_type != REQ_TYPE_FS)
 		goto error_out;
@@ -456,8 +456,8 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req)
 	nbd_cmd(req) = NBD_CMD_READ;
 	if (rq_data_dir(req) == WRITE) {
 		nbd_cmd(req) = NBD_CMD_WRITE;
-		if (lo->flags & NBD_READ_ONLY) {
-			dev_err(disk_to_dev(lo->disk),
+		if (nbd->flags & NBD_READ_ONLY) {
+			dev_err(disk_to_dev(nbd->disk),
 				"Write on read-only\n");
 			goto error_out;
 		}
@@ -465,29 +465,29 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req)
 
 	req->errors = 0;
 
-	mutex_lock(&lo->tx_lock);
-	if (unlikely(!lo->sock)) {
-		mutex_unlock(&lo->tx_lock);
-		dev_err(disk_to_dev(lo->disk),
+	mutex_lock(&nbd->tx_lock);
+	if (unlikely(!nbd->sock)) {
+		mutex_unlock(&nbd->tx_lock);
+		dev_err(disk_to_dev(nbd->disk),
 			"Attempted send on closed socket\n");
 		goto error_out;
 	}
 
-	lo->active_req = req;
+	nbd->active_req = req;
 
-	if (nbd_send_req(lo, req) != 0) {
-		dev_err(disk_to_dev(lo->disk), "Request send failed\n");
+	if (nbd_send_req(nbd, req) != 0) {
+		dev_err(disk_to_dev(nbd->disk), "Request send failed\n");
 		req->errors++;
 		nbd_end_request(req);
 	} else {
-		spin_lock(&lo->queue_lock);
-		list_add(&req->queuelist, &lo->queue_head);
-		spin_unlock(&lo->queue_lock);
+		spin_lock(&nbd->queue_lock);
+		list_add(&req->queuelist, &nbd->queue_head);
+		spin_unlock(&nbd->queue_lock);
 	}
 
-	lo->active_req = NULL;
-	mutex_unlock(&lo->tx_lock);
-	wake_up_all(&lo->active_wq);
+	nbd->active_req = NULL;
+	mutex_unlock(&nbd->tx_lock);
+	wake_up_all(&nbd->active_wq);
 
 	return;
 
@@ -498,28 +498,28 @@ error_out:
 
 static int nbd_thread(void *data)
 {
-	struct nbd_device *lo = data;
+	struct nbd_device *nbd = data;
 	struct request *req;
 
 	set_user_nice(current, -20);
-	while (!kthread_should_stop() || !list_empty(&lo->waiting_queue)) {
+	while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) {
 		/* wait for something to do */
-		wait_event_interruptible(lo->waiting_wq,
+		wait_event_interruptible(nbd->waiting_wq,
 					 kthread_should_stop() ||
-					 !list_empty(&lo->waiting_queue));
+					 !list_empty(&nbd->waiting_queue));
 
 		/* extract request */
-		if (list_empty(&lo->waiting_queue))
+		if (list_empty(&nbd->waiting_queue))
 			continue;
 
-		spin_lock_irq(&lo->queue_lock);
-		req = list_entry(lo->waiting_queue.next, struct request,
+		spin_lock_irq(&nbd->queue_lock);
+		req = list_entry(nbd->waiting_queue.next, struct request,
 				 queuelist);
 		list_del_init(&req->queuelist);
-		spin_unlock_irq(&lo->queue_lock);
+		spin_unlock_irq(&nbd->queue_lock);
 
 		/* handle request */
-		nbd_handle_req(lo, req);
+		nbd_handle_req(nbd, req);
 	}
 	return 0;
 }
@@ -527,7 +527,7 @@ static int nbd_thread(void *data)
 /*
  * We always wait for result of write, for now. It would be nice to make it optional
  * in future
- * if ((rq_data_dir(req) == WRITE) && (lo->flags & NBD_WRITE_NOCHK))
+ * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK))
  *   { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); }
  */
 
@@ -536,19 +536,19 @@ static void do_nbd_request(struct request_queue *q)
 	struct request *req;
 	
 	while ((req = blk_fetch_request(q)) != NULL) {
-		struct nbd_device *lo;
+		struct nbd_device *nbd;
 
 		spin_unlock_irq(q->queue_lock);
 
 		dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%x)\n",
 				req->rq_disk->disk_name, req, req->cmd_type);
 
-		lo = req->rq_disk->private_data;
+		nbd = req->rq_disk->private_data;
 
-		BUG_ON(lo->magic != LO_MAGIC);
+		BUG_ON(nbd->magic != NBD_MAGIC);
 
-		if (unlikely(!lo->sock)) {
-			dev_err(disk_to_dev(lo->disk),
+		if (unlikely(!nbd->sock)) {
+			dev_err(disk_to_dev(nbd->disk),
 				"Attempted send on closed socket\n");
 			req->errors++;
 			nbd_end_request(req);
@@ -556,11 +556,11 @@ static void do_nbd_request(struct request_queue *q)
 			continue;
 		}
 
-		spin_lock_irq(&lo->queue_lock);
-		list_add_tail(&req->queuelist, &lo->waiting_queue);
-		spin_unlock_irq(&lo->queue_lock);
+		spin_lock_irq(&nbd->queue_lock);
+		list_add_tail(&req->queuelist, &nbd->waiting_queue);
+		spin_unlock_irq(&nbd->queue_lock);
 
-		wake_up(&lo->waiting_wq);
+		wake_up(&nbd->waiting_wq);
 
 		spin_lock_irq(q->queue_lock);
 	}
@@ -568,32 +568,32 @@ static void do_nbd_request(struct request_queue *q)
 
 /* Must be called with tx_lock held */
 
-static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
+static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		       unsigned int cmd, unsigned long arg)
 {
 	switch (cmd) {
 	case NBD_DISCONNECT: {
 		struct request sreq;
 
-		dev_info(disk_to_dev(lo->disk), "NBD_DISCONNECT\n");
+		dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
 
 		blk_rq_init(NULL, &sreq);
 		sreq.cmd_type = REQ_TYPE_SPECIAL;
 		nbd_cmd(&sreq) = NBD_CMD_DISC;
-		if (!lo->sock)
+		if (!nbd->sock)
 			return -EINVAL;
-		nbd_send_req(lo, &sreq);
+		nbd_send_req(nbd, &sreq);
                 return 0;
 	}
  
 	case NBD_CLEAR_SOCK: {
 		struct file *file;
 
-		lo->sock = NULL;
-		file = lo->file;
-		lo->file = NULL;
-		nbd_clear_que(lo);
-		BUG_ON(!list_empty(&lo->queue_head));
+		nbd->sock = NULL;
+		file = nbd->file;
+		nbd->file = NULL;
+		nbd_clear_que(nbd);
+		BUG_ON(!list_empty(&nbd->queue_head));
 		if (file)
 			fput(file);
 		return 0;
@@ -601,14 +601,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 
 	case NBD_SET_SOCK: {
 		struct file *file;
-		if (lo->file)
+		if (nbd->file)
 			return -EBUSY;
 		file = fget(arg);
 		if (file) {
 			struct inode *inode = file->f_path.dentry->d_inode;
 			if (S_ISSOCK(inode->i_mode)) {
-				lo->file = file;
-				lo->sock = SOCKET_I(inode);
+				nbd->file = file;
+				nbd->sock = SOCKET_I(inode);
 				if (max_part > 0)
 					bdev->bd_invalidated = 1;
 				return 0;
@@ -620,29 +620,29 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 	}
 
 	case NBD_SET_BLKSIZE:
-		lo->blksize = arg;
-		lo->bytesize &= ~(lo->blksize-1);
-		bdev->bd_inode->i_size = lo->bytesize;
-		set_blocksize(bdev, lo->blksize);
-		set_capacity(lo->disk, lo->bytesize >> 9);
+		nbd->blksize = arg;
+		nbd->bytesize &= ~(nbd->blksize-1);
+		bdev->bd_inode->i_size = nbd->bytesize;
+		set_blocksize(bdev, nbd->blksize);
+		set_capacity(nbd->disk, nbd->bytesize >> 9);
 		return 0;
 
 	case NBD_SET_SIZE:
-		lo->bytesize = arg & ~(lo->blksize-1);
-		bdev->bd_inode->i_size = lo->bytesize;
-		set_blocksize(bdev, lo->blksize);
-		set_capacity(lo->disk, lo->bytesize >> 9);
+		nbd->bytesize = arg & ~(nbd->blksize-1);
+		bdev->bd_inode->i_size = nbd->bytesize;
+		set_blocksize(bdev, nbd->blksize);
+		set_capacity(nbd->disk, nbd->bytesize >> 9);
 		return 0;
 
 	case NBD_SET_TIMEOUT:
-		lo->xmit_timeout = arg * HZ;
+		nbd->xmit_timeout = arg * HZ;
 		return 0;
 
 	case NBD_SET_SIZE_BLOCKS:
-		lo->bytesize = ((u64) arg) * lo->blksize;
-		bdev->bd_inode->i_size = lo->bytesize;
-		set_blocksize(bdev, lo->blksize);
-		set_capacity(lo->disk, lo->bytesize >> 9);
+		nbd->bytesize = ((u64) arg) * nbd->blksize;
+		bdev->bd_inode->i_size = nbd->bytesize;
+		set_blocksize(bdev, nbd->blksize);
+		set_capacity(nbd->disk, nbd->bytesize >> 9);
 		return 0;
 
 	case NBD_DO_IT: {
@@ -650,38 +650,38 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 		struct file *file;
 		int error;
 
-		if (lo->pid)
+		if (nbd->pid)
 			return -EBUSY;
-		if (!lo->file)
+		if (!nbd->file)
 			return -EINVAL;
 
-		mutex_unlock(&lo->tx_lock);
+		mutex_unlock(&nbd->tx_lock);
 
-		thread = kthread_create(nbd_thread, lo, lo->disk->disk_name);
+		thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name);
 		if (IS_ERR(thread)) {
-			mutex_lock(&lo->tx_lock);
+			mutex_lock(&nbd->tx_lock);
 			return PTR_ERR(thread);
 		}
 		wake_up_process(thread);
-		error = nbd_do_it(lo);
+		error = nbd_do_it(nbd);
 		kthread_stop(thread);
 
-		mutex_lock(&lo->tx_lock);
+		mutex_lock(&nbd->tx_lock);
 		if (error)
 			return error;
-		sock_shutdown(lo, 0);
-		file = lo->file;
-		lo->file = NULL;
-		nbd_clear_que(lo);
-		dev_warn(disk_to_dev(lo->disk), "queue cleared\n");
+		sock_shutdown(nbd, 0);
+		file = nbd->file;
+		nbd->file = NULL;
+		nbd_clear_que(nbd);
+		dev_warn(disk_to_dev(nbd->disk), "queue cleared\n");
 		if (file)
 			fput(file);
-		lo->bytesize = 0;
+		nbd->bytesize = 0;
 		bdev->bd_inode->i_size = 0;
-		set_capacity(lo->disk, 0);
+		set_capacity(nbd->disk, 0);
 		if (max_part > 0)
 			ioctl_by_bdev(bdev, BLKRRPART, 0);
-		return lo->harderror;
+		return nbd->harderror;
 	}
 
 	case NBD_CLEAR_QUE:
@@ -689,14 +689,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 		 * This is for compatibility only.  The queue is always cleared
 		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
 		 */
-		BUG_ON(!lo->sock && !list_empty(&lo->queue_head));
+		BUG_ON(!nbd->sock && !list_empty(&nbd->queue_head));
 		return 0;
 
 	case NBD_PRINT_DEBUG:
-		dev_info(disk_to_dev(lo->disk),
+		dev_info(disk_to_dev(nbd->disk),
 			"next = %p, prev = %p, head = %p\n",
-			lo->queue_head.next, lo->queue_head.prev,
-			&lo->queue_head);
+			nbd->queue_head.next, nbd->queue_head.prev,
+			&nbd->queue_head);
 		return 0;
 	}
 	return -ENOTTY;
@@ -705,21 +705,21 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo,
 static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
 		     unsigned int cmd, unsigned long arg)
 {
-	struct nbd_device *lo = bdev->bd_disk->private_data;
+	struct nbd_device *nbd = bdev->bd_disk->private_data;
 	int error;
 
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	BUG_ON(lo->magic != LO_MAGIC);
+	BUG_ON(nbd->magic != NBD_MAGIC);
 
 	/* Anyone capable of this syscall can do *real bad* things */
 	dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n",
-			lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
+		nbd->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg);
 
-	mutex_lock(&lo->tx_lock);
-	error = __nbd_ioctl(bdev, lo, cmd, arg);
-	mutex_unlock(&lo->tx_lock);
+	mutex_lock(&nbd->tx_lock);
+	error = __nbd_ioctl(bdev, nbd, cmd, arg);
+	mutex_unlock(&nbd->tx_lock);
 
 	return error;
 }
@@ -805,7 +805,7 @@ static int __init nbd_init(void)
 	for (i = 0; i < nbds_max; i++) {
 		struct gendisk *disk = nbd_dev[i].disk;
 		nbd_dev[i].file = NULL;
-		nbd_dev[i].magic = LO_MAGIC;
+		nbd_dev[i].magic = NBD_MAGIC;
 		nbd_dev[i].flags = 0;
 		INIT_LIST_HEAD(&nbd_dev[i].waiting_queue);
 		spin_lock_init(&nbd_dev[i].queue_lock);
diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 1f3c1a7d132a..38a2d0631882 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -39,7 +39,6 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/types.h>
-#include <linux/version.h>
 
 #include <asm-generic/io-64-nonatomic-lo-hi.h>
 
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
index d59edeabd93f..ba66e4445f41 100644
--- a/drivers/block/pktcdvd.c
+++ b/drivers/block/pktcdvd.c
@@ -987,14 +987,14 @@ static void pkt_copy_bio_data(struct bio *src_bio, int seg, int offs, struct pag
 
 	while (copy_size > 0) {
 		struct bio_vec *src_bvl = bio_iovec_idx(src_bio, seg);
-		void *vfrom = kmap_atomic(src_bvl->bv_page, KM_USER0) +
+		void *vfrom = kmap_atomic(src_bvl->bv_page) +
 			src_bvl->bv_offset + offs;
 		void *vto = page_address(dst_page) + dst_offs;
 		int len = min_t(int, copy_size, src_bvl->bv_len - offs);
 
 		BUG_ON(len < 0);
 		memcpy(vto, vfrom, len);
-		kunmap_atomic(vfrom, KM_USER0);
+		kunmap_atomic(vfrom);
 
 		seg++;
 		offs = 0;
@@ -1019,10 +1019,10 @@ static void pkt_make_local_copy(struct packet_data *pkt, struct bio_vec *bvec)
 	offs = 0;
 	for (f = 0; f < pkt->frames; f++) {
 		if (bvec[f].bv_page != pkt->pages[p]) {
-			void *vfrom = kmap_atomic(bvec[f].bv_page, KM_USER0) + bvec[f].bv_offset;
+			void *vfrom = kmap_atomic(bvec[f].bv_page) + bvec[f].bv_offset;
 			void *vto = page_address(pkt->pages[p]) + offs;
 			memcpy(vto, vfrom, CD_FRAMESIZE);
-			kunmap_atomic(vfrom, KM_USER0);
+			kunmap_atomic(vfrom);
 			bvec[f].bv_page = pkt->pages[p];
 			bvec[f].bv_offset = offs;
 		} else {
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
index 48e8fee9f2d4..9dcf76a10bb6 100644
--- a/drivers/block/sunvdc.c
+++ b/drivers/block/sunvdc.c
@@ -839,10 +839,7 @@ static struct vio_driver vdc_port_driver = {
 	.id_table	= vdc_port_match,
 	.probe		= vdc_port_probe,
 	.remove		= vdc_port_remove,
-	.driver		= {
-		.name	= "vdc_port",
-		.owner	= THIS_MODULE,
-	}
+	.name		= "vdc_port",
 };
 
 static int __init vdc_init(void)
diff --git a/drivers/block/ub.c b/drivers/block/ub.c
index 7333b9e44411..fcec0225ac76 100644
--- a/drivers/block/ub.c
+++ b/drivers/block/ub.c
@@ -119,43 +119,6 @@
 
 /*
  */
-
-/* command block wrapper */
-struct bulk_cb_wrap {
-	__le32	Signature;		/* contains 'USBC' */
-	u32	Tag;			/* unique per command id */
-	__le32	DataTransferLength;	/* size of data */
-	u8	Flags;			/* direction in bit 0 */
-	u8	Lun;			/* LUN */
-	u8	Length;			/* of of the CDB */
-	u8	CDB[UB_MAX_CDB_SIZE];	/* max command */
-};
-
-#define US_BULK_CB_WRAP_LEN	31
-#define US_BULK_CB_SIGN		0x43425355	/*spells out USBC */
-#define US_BULK_FLAG_IN		1
-#define US_BULK_FLAG_OUT	0
-
-/* command status wrapper */
-struct bulk_cs_wrap {
-	__le32	Signature;		/* should = 'USBS' */
-	u32	Tag;			/* same as original command */
-	__le32	Residue;		/* amount not transferred */
-	u8	Status;			/* see below */
-};
-
-#define US_BULK_CS_WRAP_LEN	13
-#define US_BULK_CS_SIGN		0x53425355	/* spells out 'USBS' */
-#define US_BULK_STAT_OK		0
-#define US_BULK_STAT_FAIL	1
-#define US_BULK_STAT_PHASE	2
-
-/* bulk-only class specific requests */
-#define US_BULK_RESET_REQUEST	0xff
-#define US_BULK_GET_MAX_LUN	0xfe
-
-/*
- */
 struct ub_dev;
 
 #define UB_MAX_REQ_SG	9	/* cdrecord requires 32KB and maybe a header */
@@ -2477,6 +2440,8 @@ static int __init ub_init(void)
 	int rc;
 	int i;
 
+	pr_info("'Low Performance USB Block' driver is deprecated. "
+			"Please switch to usb-storage\n");
 	for (i = 0; i < UB_QLOCK_NUM; i++)
 		spin_lock_init(&ub_qlockv[i]);
 
diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c
deleted file mode 100644
index 9a5b2a2d616d..000000000000
--- a/drivers/block/viodasd.c
+++ /dev/null
@@ -1,809 +0,0 @@
-/* -*- linux-c -*-
- * viodasd.c
- *  Authors: Dave Boutcher <boutcher@us.ibm.com>
- *           Ryan Arnold <ryanarn@us.ibm.com>
- *           Colin Devilbiss <devilbis@us.ibm.com>
- *           Stephen Rothwell
- *
- * (C) Copyright 2000-2004 IBM Corporation
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License as
- * published by the Free Software Foundation; either version 2 of the
- * License, or (at your option) any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
- *
- * This routine provides access to disk space (termed "DASD" in historical
- * IBM terms) owned and managed by an OS/400 partition running on the
- * same box as this Linux partition.
- *
- * All disk operations are performed by sending messages back and forth to
- * the OS/400 partition.
- */
-
-#define pr_fmt(fmt) "viod: " fmt
-
-#include <linux/major.h>
-#include <linux/fs.h>
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/blkdev.h>
-#include <linux/genhd.h>
-#include <linux/hdreg.h>
-#include <linux/errno.h>
-#include <linux/init.h>
-#include <linux/string.h>
-#include <linux/mutex.h>
-#include <linux/dma-mapping.h>
-#include <linux/completion.h>
-#include <linux/device.h>
-#include <linux/scatterlist.h>
-
-#include <asm/uaccess.h>
-#include <asm/vio.h>
-#include <asm/iseries/hv_types.h>
-#include <asm/iseries/hv_lp_event.h>
-#include <asm/iseries/hv_lp_config.h>
-#include <asm/iseries/vio.h>
-#include <asm/firmware.h>
-
-MODULE_DESCRIPTION("iSeries Virtual DASD");
-MODULE_AUTHOR("Dave Boutcher");
-MODULE_LICENSE("GPL");
-
-/*
- * We only support 7 partitions per physical disk....so with minor
- * numbers 0-255 we get a maximum of 32 disks.
- */
-#define VIOD_GENHD_NAME		"iseries/vd"
-
-#define VIOD_VERS		"1.64"
-
-enum {
-	PARTITION_SHIFT = 3,
-	MAX_DISKNO = HVMAXARCHITECTEDVIRTUALDISKS,
-	MAX_DISK_NAME = FIELD_SIZEOF(struct gendisk, disk_name)
-};
-
-static DEFINE_MUTEX(viodasd_mutex);
-static DEFINE_SPINLOCK(viodasd_spinlock);
-
-#define VIOMAXREQ		16
-
-#define DEVICE_NO(cell)	((struct viodasd_device *)(cell) - &viodasd_devices[0])
-
-struct viodasd_waitevent {
-	struct completion	com;
-	int			rc;
-	u16			sub_result;
-	int			max_disk;	/* open */
-};
-
-static const struct vio_error_entry viodasd_err_table[] = {
-	{ 0x0201, EINVAL, "Invalid Range" },
-	{ 0x0202, EINVAL, "Invalid Token" },
-	{ 0x0203, EIO, "DMA Error" },
-	{ 0x0204, EIO, "Use Error" },
-	{ 0x0205, EIO, "Release Error" },
-	{ 0x0206, EINVAL, "Invalid Disk" },
-	{ 0x0207, EBUSY, "Can't Lock" },
-	{ 0x0208, EIO, "Already Locked" },
-	{ 0x0209, EIO, "Already Unlocked" },
-	{ 0x020A, EIO, "Invalid Arg" },
-	{ 0x020B, EIO, "Bad IFS File" },
-	{ 0x020C, EROFS, "Read Only Device" },
-	{ 0x02FF, EIO, "Internal Error" },
-	{ 0x0000, 0, NULL },
-};
-
-/*
- * Figure out the biggest I/O request (in sectors) we can accept
- */
-#define VIODASD_MAXSECTORS (4096 / 512 * VIOMAXBLOCKDMA)
-
-/*
- * Number of disk I/O requests we've sent to OS/400
- */
-static int num_req_outstanding;
-
-/*
- * This is our internal structure for keeping track of disk devices
- */
-struct viodasd_device {
-	u16		cylinders;
-	u16		tracks;
-	u16		sectors;
-	u16		bytes_per_sector;
-	u64		size;
-	int		read_only;
-	spinlock_t	q_lock;
-	struct gendisk	*disk;
-	struct device	*dev;
-} viodasd_devices[MAX_DISKNO];
-
-/*
- * External open entry point.
- */
-static int viodasd_open(struct block_device *bdev, fmode_t mode)
-{
-	struct viodasd_device *d = bdev->bd_disk->private_data;
-	HvLpEvent_Rc hvrc;
-	struct viodasd_waitevent we;
-	u16 flags = 0;
-
-	if (d->read_only) {
-		if (mode & FMODE_WRITE)
-			return -EROFS;
-		flags = vioblockflags_ro;
-	}
-
-	init_completion(&we.com);
-
-	/* Send the open event to OS/400 */
-	hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
-			HvLpEvent_Type_VirtualIo,
-			viomajorsubtype_blockio | vioblockopen,
-			HvLpEvent_AckInd_DoAck, HvLpEvent_AckType_ImmediateAck,
-			viopath_sourceinst(viopath_hostLp),
-			viopath_targetinst(viopath_hostLp),
-			(u64)(unsigned long)&we, VIOVERSION << 16,
-			((u64)DEVICE_NO(d) << 48) | ((u64)flags << 32),
-			0, 0, 0);
-	if (hvrc != 0) {
-		pr_warning("HV open failed %d\n", (int)hvrc);
-		return -EIO;
-	}
-
-	wait_for_completion(&we.com);
-
-	/* Check the return code */
-	if (we.rc != 0) {
-		const struct vio_error_entry *err =
-			vio_lookup_rc(viodasd_err_table, we.sub_result);
-
-		pr_warning("bad rc opening disk: %d:0x%04x (%s)\n",
-			   (int)we.rc, we.sub_result, err->msg);
-		return -EIO;
-	}
-
-	return 0;
-}
-
-static int viodasd_unlocked_open(struct block_device *bdev, fmode_t mode)
-{
-	int ret;
-
-	mutex_lock(&viodasd_mutex);
-	ret = viodasd_open(bdev, mode);
-	mutex_unlock(&viodasd_mutex);
-
-	return ret;
-}
-
-
-/*
- * External release entry point.
- */
-static int viodasd_release(struct gendisk *disk, fmode_t mode)
-{
-	struct viodasd_device *d = disk->private_data;
-	HvLpEvent_Rc hvrc;
-
-	mutex_lock(&viodasd_mutex);
-	/* Send the event to OS/400.  We DON'T expect a response */
-	hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
-			HvLpEvent_Type_VirtualIo,
-			viomajorsubtype_blockio | vioblockclose,
-			HvLpEvent_AckInd_NoAck, HvLpEvent_AckType_ImmediateAck,
-			viopath_sourceinst(viopath_hostLp),
-			viopath_targetinst(viopath_hostLp),
-			0, VIOVERSION << 16,
-			((u64)DEVICE_NO(d) << 48) /* | ((u64)flags << 32) */,
-			0, 0, 0);
-	if (hvrc != 0)
-		pr_warning("HV close call failed %d\n", (int)hvrc);
-
-	mutex_unlock(&viodasd_mutex);
-
-	return 0;
-}
-
-
-/* External ioctl entry point.
- */
-static int viodasd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
-{
-	struct gendisk *disk = bdev->bd_disk;
-	struct viodasd_device *d = disk->private_data;
-
-	geo->sectors = d->sectors ? d->sectors : 32;
-	geo->heads = d->tracks ? d->tracks  : 64;
-	geo->cylinders = d->cylinders ? d->cylinders :
-		get_capacity(disk) / (geo->sectors * geo->heads);
-
-	return 0;
-}
-
-/*
- * Our file operations table
- */
-static const struct block_device_operations viodasd_fops = {
-	.owner = THIS_MODULE,
-	.open = viodasd_unlocked_open,
-	.release = viodasd_release,
-	.getgeo = viodasd_getgeo,
-};
-
-/*
- * End a request
- */
-static void viodasd_end_request(struct request *req, int error,
-		int num_sectors)
-{
-	__blk_end_request(req, error, num_sectors << 9);
-}
-
-/*
- * Send an actual I/O request to OS/400
- */
-static int send_request(struct request *req)
-{
-	u64 start;
-	int direction;
-	int nsg;
-	u16 viocmd;
-	HvLpEvent_Rc hvrc;
-	struct vioblocklpevent *bevent;
-	struct HvLpEvent *hev;
-	struct scatterlist sg[VIOMAXBLOCKDMA];
-	int sgindex;
-	struct viodasd_device *d;
-	unsigned long flags;
-
-	start = (u64)blk_rq_pos(req) << 9;
-
-	if (rq_data_dir(req) == READ) {
-		direction = DMA_FROM_DEVICE;
-		viocmd = viomajorsubtype_blockio | vioblockread;
-	} else {
-		direction = DMA_TO_DEVICE;
-		viocmd = viomajorsubtype_blockio | vioblockwrite;
-	}
-
-        d = req->rq_disk->private_data;
-
-	/* Now build the scatter-gather list */
-	sg_init_table(sg, VIOMAXBLOCKDMA);
-	nsg = blk_rq_map_sg(req->q, req, sg);
-	nsg = dma_map_sg(d->dev, sg, nsg, direction);
-
-	spin_lock_irqsave(&viodasd_spinlock, flags);
-	num_req_outstanding++;
-
-	/* This optimization handles a single DMA block */
-	if (nsg == 1)
-		hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
-				HvLpEvent_Type_VirtualIo, viocmd,
-				HvLpEvent_AckInd_DoAck,
-				HvLpEvent_AckType_ImmediateAck,
-				viopath_sourceinst(viopath_hostLp),
-				viopath_targetinst(viopath_hostLp),
-				(u64)(unsigned long)req, VIOVERSION << 16,
-				((u64)DEVICE_NO(d) << 48), start,
-				((u64)sg_dma_address(&sg[0])) << 32,
-				sg_dma_len(&sg[0]));
-	else {
-		bevent = (struct vioblocklpevent *)
-			vio_get_event_buffer(viomajorsubtype_blockio);
-		if (bevent == NULL) {
-			pr_warning("error allocating disk event buffer\n");
-			goto error_ret;
-		}
-
-		/*
-		 * Now build up the actual request.  Note that we store
-		 * the pointer to the request in the correlation
-		 * token so we can match the response up later
-		 */
-		memset(bevent, 0, sizeof(struct vioblocklpevent));
-		hev = &bevent->event;
-		hev->flags = HV_LP_EVENT_VALID | HV_LP_EVENT_DO_ACK |
-			HV_LP_EVENT_INT;
-		hev->xType = HvLpEvent_Type_VirtualIo;
-		hev->xSubtype = viocmd;
-		hev->xSourceLp = HvLpConfig_getLpIndex();
-		hev->xTargetLp = viopath_hostLp;
-		hev->xSizeMinus1 =
-			offsetof(struct vioblocklpevent, u.rw_data.dma_info) +
-			(sizeof(bevent->u.rw_data.dma_info[0]) * nsg) - 1;
-		hev->xSourceInstanceId = viopath_sourceinst(viopath_hostLp);
-		hev->xTargetInstanceId = viopath_targetinst(viopath_hostLp);
-		hev->xCorrelationToken = (u64)req;
-		bevent->version = VIOVERSION;
-		bevent->disk = DEVICE_NO(d);
-		bevent->u.rw_data.offset = start;
-
-		/*
-		 * Copy just the dma information from the sg list
-		 * into the request
-		 */
-		for (sgindex = 0; sgindex < nsg; sgindex++) {
-			bevent->u.rw_data.dma_info[sgindex].token =
-				sg_dma_address(&sg[sgindex]);
-			bevent->u.rw_data.dma_info[sgindex].len =
-				sg_dma_len(&sg[sgindex]);
-		}
-
-		/* Send the request */
-		hvrc = HvCallEvent_signalLpEvent(&bevent->event);
-		vio_free_event_buffer(viomajorsubtype_blockio, bevent);
-	}
-
-	if (hvrc != HvLpEvent_Rc_Good) {
-		pr_warning("error sending disk event to OS/400 (rc %d)\n",
-			   (int)hvrc);
-		goto error_ret;
-	}
-	spin_unlock_irqrestore(&viodasd_spinlock, flags);
-	return 0;
-
-error_ret:
-	num_req_outstanding--;
-	spin_unlock_irqrestore(&viodasd_spinlock, flags);
-	dma_unmap_sg(d->dev, sg, nsg, direction);
-	return -1;
-}
-
-/*
- * This is the external request processing routine
- */
-static void do_viodasd_request(struct request_queue *q)
-{
-	struct request *req;
-
-	/*
-	 * If we already have the maximum number of requests
-	 * outstanding to OS/400 just bail out. We'll come
-	 * back later.
-	 */
-	while (num_req_outstanding < VIOMAXREQ) {
-		req = blk_fetch_request(q);
-		if (req == NULL)
-			return;
-		/* check that request contains a valid command */
-		if (req->cmd_type != REQ_TYPE_FS) {
-			viodasd_end_request(req, -EIO, blk_rq_sectors(req));
-			continue;
-		}
-		/* Try sending the request */
-		if (send_request(req) != 0)
-			viodasd_end_request(req, -EIO, blk_rq_sectors(req));
-	}
-}
-
-/*
- * Probe a single disk and fill in the viodasd_device structure
- * for it.
- */
-static int probe_disk(struct viodasd_device *d)
-{
-	HvLpEvent_Rc hvrc;
-	struct viodasd_waitevent we;
-	int dev_no = DEVICE_NO(d);
-	struct gendisk *g;
-	struct request_queue *q;
-	u16 flags = 0;
-
-retry:
-	init_completion(&we.com);
-
-	/* Send the open event to OS/400 */
-	hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
-			HvLpEvent_Type_VirtualIo,
-			viomajorsubtype_blockio | vioblockopen,
-			HvLpEvent_AckInd_DoAck, HvLpEvent_AckType_ImmediateAck,
-			viopath_sourceinst(viopath_hostLp),
-			viopath_targetinst(viopath_hostLp),
-			(u64)(unsigned long)&we, VIOVERSION << 16,
-			((u64)dev_no << 48) | ((u64)flags<< 32),
-			0, 0, 0);
-	if (hvrc != 0) {
-		pr_warning("bad rc on HV open %d\n", (int)hvrc);
-		return 0;
-	}
-
-	wait_for_completion(&we.com);
-
-	if (we.rc != 0) {
-		if (flags != 0)
-			return 0;
-		/* try again with read only flag set */
-		flags = vioblockflags_ro;
-		goto retry;
-	}
-	if (we.max_disk > (MAX_DISKNO - 1)) {
-		printk_once(KERN_INFO pr_fmt("Only examining the first %d of %d disks connected\n"),
-			    MAX_DISKNO, we.max_disk + 1);
-	}
-
-	/* Send the close event to OS/400.  We DON'T expect a response */
-	hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp,
-			HvLpEvent_Type_VirtualIo,
-			viomajorsubtype_blockio | vioblockclose,
-			HvLpEvent_AckInd_NoAck, HvLpEvent_AckType_ImmediateAck,
-			viopath_sourceinst(viopath_hostLp),
-			viopath_targetinst(viopath_hostLp),
-			0, VIOVERSION << 16,
-			((u64)dev_no << 48) | ((u64)flags << 32),
-			0, 0, 0);
-	if (hvrc != 0) {
-		pr_warning("bad rc sending event to OS/400 %d\n", (int)hvrc);
-		return 0;
-	}
-
-	if (d->dev == NULL) {
-		/* this is when we reprobe for new disks */
-		if (vio_create_viodasd(dev_no) == NULL) {
-			pr_warning("cannot allocate virtual device for disk %d\n",
-				   dev_no);
-			return 0;
-		}
-		/*
-		 * The vio_create_viodasd will have recursed into this
-		 * routine with d->dev set to the new vio device and
-		 * will finish the setup of the disk below.
-		 */
-		return 1;
-	}
-
-	/* create the request queue for the disk */
-	spin_lock_init(&d->q_lock);
-	q = blk_init_queue(do_viodasd_request, &d->q_lock);
-	if (q == NULL) {
-		pr_warning("cannot allocate queue for disk %d\n", dev_no);
-		return 0;
-	}
-	g = alloc_disk(1 << PARTITION_SHIFT);
-	if (g == NULL) {
-		pr_warning("cannot allocate disk structure for disk %d\n",
-			   dev_no);
-		blk_cleanup_queue(q);
-		return 0;
-	}
-
-	d->disk = g;
-	blk_queue_max_segments(q, VIOMAXBLOCKDMA);
-	blk_queue_max_hw_sectors(q, VIODASD_MAXSECTORS);
-	g->major = VIODASD_MAJOR;
-	g->first_minor = dev_no << PARTITION_SHIFT;
-	if (dev_no >= 26)
-		snprintf(g->disk_name, sizeof(g->disk_name),
-				VIOD_GENHD_NAME "%c%c",
-				'a' + (dev_no / 26) - 1, 'a' + (dev_no % 26));
-	else
-		snprintf(g->disk_name, sizeof(g->disk_name),
-				VIOD_GENHD_NAME "%c", 'a' + (dev_no % 26));
-	g->fops = &viodasd_fops;
-	g->queue = q;
-	g->private_data = d;
-	g->driverfs_dev = d->dev;
-	set_capacity(g, d->size >> 9);
-
-	pr_info("disk %d: %lu sectors (%lu MB) CHS=%d/%d/%d sector size %d%s\n",
-		dev_no, (unsigned long)(d->size >> 9),
-		(unsigned long)(d->size >> 20),
-		(int)d->cylinders, (int)d->tracks,
-		(int)d->sectors, (int)d->bytes_per_sector,
-		d->read_only ? " (RO)" : "");
-
-	/* register us in the global list */
-	add_disk(g);
-	return 1;
-}
-
-/* returns the total number of scatterlist elements converted */
-static int block_event_to_scatterlist(const struct vioblocklpevent *bevent,
-		struct scatterlist *sg, int *total_len)
-{
-	int i, numsg;
-	const struct rw_data *rw_data = &bevent->u.rw_data;
-	static const int offset =
-		offsetof(struct vioblocklpevent, u.rw_data.dma_info);
-	static const int element_size = sizeof(rw_data->dma_info[0]);
-
-	numsg = ((bevent->event.xSizeMinus1 + 1) - offset) / element_size;
-	if (numsg > VIOMAXBLOCKDMA)
-		numsg = VIOMAXBLOCKDMA;
-
-	*total_len = 0;
-	sg_init_table(sg, VIOMAXBLOCKDMA);
-	for (i = 0; (i < numsg) && (rw_data->dma_info[i].len > 0); ++i) {
-		sg_dma_address(&sg[i]) = rw_data->dma_info[i].token;
-		sg_dma_len(&sg[i]) = rw_data->dma_info[i].len;
-		*total_len += rw_data->dma_info[i].len;
-	}
-	return i;
-}
-
-/*
- * Restart all queues, starting with the one _after_ the disk given,
- * thus reducing the chance of starvation of higher numbered disks.
- */
-static void viodasd_restart_all_queues_starting_from(int first_index)
-{
-	int i;
-
-	for (i = first_index + 1; i < MAX_DISKNO; ++i)
-		if (viodasd_devices[i].disk)
-			blk_run_queue(viodasd_devices[i].disk->queue);
-	for (i = 0; i <= first_index; ++i)
-		if (viodasd_devices[i].disk)
-			blk_run_queue(viodasd_devices[i].disk->queue);
-}
-
-/*
- * For read and write requests, decrement the number of outstanding requests,
- * Free the DMA buffers we allocated.
- */
-static int viodasd_handle_read_write(struct vioblocklpevent *bevent)
-{
-	int num_sg, num_sect, pci_direction, total_len;
-	struct request *req;
-	struct scatterlist sg[VIOMAXBLOCKDMA];
-	struct HvLpEvent *event = &bevent->event;
-	unsigned long irq_flags;
-	struct viodasd_device *d;
-	int error;
-	spinlock_t *qlock;
-
-	num_sg = block_event_to_scatterlist(bevent, sg, &total_len);
-	num_sect = total_len >> 9;
-	if (event->xSubtype == (viomajorsubtype_blockio | vioblockread))
-		pci_direction = DMA_FROM_DEVICE;
-	else
-		pci_direction = DMA_TO_DEVICE;
-	req = (struct request *)bevent->event.xCorrelationToken;
-	d = req->rq_disk->private_data;
-
-	dma_unmap_sg(d->dev, sg, num_sg, pci_direction);
-
-	/*
-	 * Since this is running in interrupt mode, we need to make sure
-	 * we're not stepping on any global I/O operations
-	 */
-	spin_lock_irqsave(&viodasd_spinlock, irq_flags);
-	num_req_outstanding--;
-	spin_unlock_irqrestore(&viodasd_spinlock, irq_flags);
-
-	error = (event->xRc == HvLpEvent_Rc_Good) ? 0 : -EIO;
-	if (error) {
-		const struct vio_error_entry *err;
-		err = vio_lookup_rc(viodasd_err_table, bevent->sub_result);
-		pr_warning("read/write error %d:0x%04x (%s)\n",
-			   event->xRc, bevent->sub_result, err->msg);
-		num_sect = blk_rq_sectors(req);
-	}
-	qlock = req->q->queue_lock;
-	spin_lock_irqsave(qlock, irq_flags);
-	viodasd_end_request(req, error, num_sect);
-	spin_unlock_irqrestore(qlock, irq_flags);
-
-	/* Finally, try to get more requests off of this device's queue */
-	viodasd_restart_all_queues_starting_from(DEVICE_NO(d));
-
-	return 0;
-}
-
-/* This routine handles incoming block LP events */
-static void handle_block_event(struct HvLpEvent *event)
-{
-	struct vioblocklpevent *bevent = (struct vioblocklpevent *)event;
-	struct viodasd_waitevent *pwe;
-
-	if (event == NULL)
-		/* Notification that a partition went away! */
-		return;
-	/* First, we should NEVER get an int here...only acks */
-	if (hvlpevent_is_int(event)) {
-		pr_warning("Yikes! got an int in viodasd event handler!\n");
-		if (hvlpevent_need_ack(event)) {
-			event->xRc = HvLpEvent_Rc_InvalidSubtype;
-			HvCallEvent_ackLpEvent(event);
-		}
-	}
-
-	switch (event->xSubtype & VIOMINOR_SUBTYPE_MASK) {
-	case vioblockopen:
-		/*
-		 * Handle a response to an open request.  We get all the
-		 * disk information in the response, so update it.  The
-		 * correlation token contains a pointer to a waitevent
-		 * structure that has a completion in it.  update the
-		 * return code in the waitevent structure and post the
-		 * completion to wake up the guy who sent the request
-		 */
-		pwe = (struct viodasd_waitevent *)event->xCorrelationToken;
-		pwe->rc = event->xRc;
-		pwe->sub_result = bevent->sub_result;
-		if (event->xRc == HvLpEvent_Rc_Good) {
-			const struct open_data *data = &bevent->u.open_data;
-			struct viodasd_device *device =
-				&viodasd_devices[bevent->disk];
-			device->read_only =
-				bevent->flags & vioblockflags_ro;
-			device->size = data->disk_size;
-			device->cylinders = data->cylinders;
-			device->tracks = data->tracks;
-			device->sectors = data->sectors;
-			device->bytes_per_sector = data->bytes_per_sector;
-			pwe->max_disk = data->max_disk;
-		}
-		complete(&pwe->com);
-		break;
-	case vioblockclose:
-		break;
-	case vioblockread:
-	case vioblockwrite:
-		viodasd_handle_read_write(bevent);
-		break;
-
-	default:
-		pr_warning("invalid subtype!");
-		if (hvlpevent_need_ack(event)) {
-			event->xRc = HvLpEvent_Rc_InvalidSubtype;
-			HvCallEvent_ackLpEvent(event);
-		}
-	}
-}
-
-/*
- * Get the driver to reprobe for more disks.
- */
-static ssize_t probe_disks(struct device_driver *drv, const char *buf,
-		size_t count)
-{
-	struct viodasd_device *d;
-
-	for (d = viodasd_devices; d < &viodasd_devices[MAX_DISKNO]; d++) {
-		if (d->disk == NULL)
-			probe_disk(d);
-	}
-	return count;
-}
-static DRIVER_ATTR(probe, S_IWUSR, NULL, probe_disks);
-
-static int viodasd_probe(struct vio_dev *vdev, const struct vio_device_id *id)
-{
-	struct viodasd_device *d = &viodasd_devices[vdev->unit_address];
-
-	d->dev = &vdev->dev;
-	if (!probe_disk(d))
-		return -ENODEV;
-	return 0;
-}
-
-static int viodasd_remove(struct vio_dev *vdev)
-{
-	struct viodasd_device *d;
-
-	d = &viodasd_devices[vdev->unit_address];
-	if (d->disk) {
-		del_gendisk(d->disk);
-		blk_cleanup_queue(d->disk->queue);
-		put_disk(d->disk);
-		d->disk = NULL;
-	}
-	d->dev = NULL;
-	return 0;
-}
-
-/**
- * viodasd_device_table: Used by vio.c to match devices that we
- * support.
- */
-static struct vio_device_id viodasd_device_table[] __devinitdata = {
-	{ "block", "IBM,iSeries-viodasd" },
-	{ "", "" }
-};
-MODULE_DEVICE_TABLE(vio, viodasd_device_table);
-
-static struct vio_driver viodasd_driver = {
-	.id_table = viodasd_device_table,
-	.probe = viodasd_probe,
-	.remove = viodasd_remove,
-	.driver = {
-		.name = "viodasd",
-		.owner = THIS_MODULE,
-	}
-};
-
-static int need_delete_probe;
-
-/*
- * Initialize the whole device driver.  Handle module and non-module
- * versions
- */
-static int __init viodasd_init(void)
-{
-	int rc;
-
-	if (!firmware_has_feature(FW_FEATURE_ISERIES)) {
-		rc = -ENODEV;
-		goto early_fail;
-	}
-
-	/* Try to open to our host lp */
-	if (viopath_hostLp == HvLpIndexInvalid)
-		vio_set_hostlp();
-
-	if (viopath_hostLp == HvLpIndexInvalid) {
-		pr_warning("invalid hosting partition\n");
-		rc = -EIO;
-		goto early_fail;
-	}
-
-	pr_info("vers " VIOD_VERS ", hosting partition %d\n", viopath_hostLp);
-
-        /* register the block device */
-	rc =  register_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME);
-	if (rc) {
-		pr_warning("Unable to get major number %d for %s\n",
-			   VIODASD_MAJOR, VIOD_GENHD_NAME);
-		goto early_fail;
-	}
-	/* Actually open the path to the hosting partition */
-	rc = viopath_open(viopath_hostLp, viomajorsubtype_blockio,
-				VIOMAXREQ + 2);
-	if (rc) {
-		pr_warning("error opening path to host partition %d\n",
-			   viopath_hostLp);
-		goto unregister_blk;
-	}
-
-	/* Initialize our request handler */
-	vio_setHandler(viomajorsubtype_blockio, handle_block_event);
-
-	rc = vio_register_driver(&viodasd_driver);
-	if (rc) {
-		pr_warning("vio_register_driver failed\n");
-		goto unset_handler;
-	}
-
-	/*
-	 * If this call fails, it just means that we cannot dynamically
-	 * add virtual disks, but the driver will still work fine for
-	 * all existing disk, so ignore the failure.
-	 */
-	if (!driver_create_file(&viodasd_driver.driver, &driver_attr_probe))
-		need_delete_probe = 1;
-
-	return 0;
-
-unset_handler:
-	vio_clearHandler(viomajorsubtype_blockio);
-	viopath_close(viopath_hostLp, viomajorsubtype_blockio, VIOMAXREQ + 2);
-unregister_blk:
-	unregister_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME);
-early_fail:
-	return rc;
-}
-module_init(viodasd_init);
-
-void __exit viodasd_exit(void)
-{
-	if (need_delete_probe)
-		driver_remove_file(&viodasd_driver.driver, &driver_attr_probe);
-	vio_unregister_driver(&viodasd_driver);
-	vio_clearHandler(viomajorsubtype_blockio);
-	viopath_close(viopath_hostLp, viomajorsubtype_blockio, VIOMAXREQ + 2);
-	unregister_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME);
-}
-module_exit(viodasd_exit);
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
index c4a60badf252..693187df7601 100644
--- a/drivers/block/virtio_blk.c
+++ b/drivers/block/virtio_blk.c
@@ -29,9 +29,6 @@ struct virtio_blk
 	/* The disk structure for the kernel. */
 	struct gendisk *disk;
 
-	/* Request tracking. */
-	struct list_head reqs;
-
 	mempool_t *pool;
 
 	/* Process context for config space updates */
@@ -55,7 +52,6 @@ struct virtio_blk
 
 struct virtblk_req
 {
-	struct list_head list;
 	struct request *req;
 	struct virtio_blk_outhdr out_hdr;
 	struct virtio_scsi_inhdr in_hdr;
@@ -99,7 +95,6 @@ static void blk_done(struct virtqueue *vq)
 		}
 
 		__blk_end_request_all(vbr->req, error);
-		list_del(&vbr->list);
 		mempool_free(vbr, vblk->pool);
 	}
 	/* In case queue is stopped waiting for more buffers. */
@@ -184,7 +179,6 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk,
 		return false;
 	}
 
-	list_add_tail(&vbr->list, &vblk->reqs);
 	return true;
 }
 
@@ -351,6 +345,7 @@ static void virtblk_config_changed_work(struct work_struct *work)
 		  cap_str_10, cap_str_2);
 
 	set_capacity(vblk->disk, capacity);
+	revalidate_disk(vblk->disk);
 done:
 	mutex_unlock(&vblk->config_lock);
 }
@@ -374,6 +369,34 @@ static int init_vq(struct virtio_blk *vblk)
 	return err;
 }
 
+/*
+ * Legacy naming scheme used for virtio devices.  We are stuck with it for
+ * virtio blk but don't ever use it for any new driver.
+ */
+static int virtblk_name_format(char *prefix, int index, char *buf, int buflen)
+{
+	const int base = 'z' - 'a' + 1;
+	char *begin = buf + strlen(prefix);
+	char *end = buf + buflen;
+	char *p;
+	int unit;
+
+	p = end - 1;
+	*p = '\0';
+	unit = base;
+	do {
+		if (p == begin)
+			return -EINVAL;
+		*--p = 'a' + (index % unit);
+		index = (index / unit) - 1;
+	} while (index >= 0);
+
+	memmove(begin, p, end - p);
+	memcpy(buf, prefix, strlen(prefix));
+
+	return 0;
+}
+
 static int __devinit virtblk_probe(struct virtio_device *vdev)
 {
 	struct virtio_blk *vblk;
@@ -408,7 +431,6 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 		goto out_free_index;
 	}
 
-	INIT_LIST_HEAD(&vblk->reqs);
 	spin_lock_init(&vblk->lock);
 	vblk->vdev = vdev;
 	vblk->sg_elems = sg_elems;
@@ -442,18 +464,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev)
 
 	q->queuedata = vblk;
 
-	if (index < 26) {
-		sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26);
-	} else if (index < (26 + 1) * 26) {
-		sprintf(vblk->disk->disk_name, "vd%c%c",
-			'a' + index / 26 - 1, 'a' + index % 26);
-	} else {
-		const unsigned int m1 = (index / 26 - 1) / 26 - 1;
-		const unsigned int m2 = (index / 26 - 1) % 26;
-		const unsigned int m3 =  index % 26;
-		sprintf(vblk->disk->disk_name, "vd%c%c%c",
-			'a' + m1, 'a' + m2, 'a' + m3);
-	}
+	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
 
 	vblk->disk->major = major;
 	vblk->disk->first_minor = index_to_minor(index);
@@ -565,21 +576,29 @@ static void __devexit virtblk_remove(struct virtio_device *vdev)
 {
 	struct virtio_blk *vblk = vdev->priv;
 	int index = vblk->index;
+	struct virtblk_req *vbr;
+	unsigned long flags;
 
 	/* Prevent config work handler from accessing the device. */
 	mutex_lock(&vblk->config_lock);
 	vblk->config_enable = false;
 	mutex_unlock(&vblk->config_lock);
 
-	/* Nothing should be pending. */
-	BUG_ON(!list_empty(&vblk->reqs));
-
 	/* Stop all the virtqueues. */
 	vdev->config->reset(vdev);
 
 	flush_work(&vblk->config_work);
 
 	del_gendisk(vblk->disk);
+
+	/* Abort requests dispatched to driver. */
+	spin_lock_irqsave(&vblk->lock, flags);
+	while ((vbr = virtqueue_detach_unused_buf(vblk->vq))) {
+		__blk_end_request_all(vbr->req, -EIO);
+		mempool_free(vbr, vblk->pool);
+	}
+	spin_unlock_irqrestore(&vblk->lock, flags);
+
 	blk_cleanup_queue(vblk->disk->queue);
 	put_disk(vblk->disk);
 	mempool_destroy(vblk->pool);
diff --git a/drivers/block/xd.c b/drivers/block/xd.c
index 51a972704db5..ff540520bada 100644
--- a/drivers/block/xd.c
+++ b/drivers/block/xd.c
@@ -52,7 +52,6 @@
 #include <linux/io.h>
 #include <linux/gfp.h>
 
-#include <asm/system.h>
 #include <asm/uaccess.h>
 #include <asm/dma.h>
 
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index 0088bf60f368..73f196ca713f 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -321,6 +321,7 @@ struct seg_buf {
 static void xen_blkbk_unmap(struct pending_req *req)
 {
 	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int i, invcount = 0;
 	grant_handle_t handle;
 	int ret;
@@ -332,25 +333,12 @@ static void xen_blkbk_unmap(struct pending_req *req)
 		gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
 				    GNTMAP_host_map, handle);
 		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
+		pages[invcount] = virt_to_page(vaddr(req, i));
 		invcount++;
 	}
 
-	ret = HYPERVISOR_grant_table_op(
-		GNTTABOP_unmap_grant_ref, unmap, invcount);
+	ret = gnttab_unmap_refs(unmap, pages, invcount, false);
 	BUG_ON(ret);
-	/*
-	 * Note, we use invcount, so nr->pages, so we can't index
-	 * using vaddr(req, i).
-	 */
-	for (i = 0; i < invcount; i++) {
-		ret = m2p_remove_override(
-			virt_to_page(unmap[i].host_addr), false);
-		if (ret) {
-			pr_alert(DRV_PFX "Failed to remove M2P override for %lx\n",
-				 (unsigned long)unmap[i].host_addr);
-			continue;
-		}
-	}
 }
 
 static int xen_blkbk_map(struct blkif_request *req,
@@ -378,7 +366,7 @@ static int xen_blkbk_map(struct blkif_request *req,
 				  pending_req->blkif->domid);
 	}
 
-	ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg);
+	ret = gnttab_map_refs(map, NULL, &blkbk->pending_page(pending_req, 0), nseg);
 	BUG_ON(ret);
 
 	/*
@@ -398,15 +386,6 @@ static int xen_blkbk_map(struct blkif_request *req,
 		if (ret)
 			continue;
 
-		ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr),
-			blkbk->pending_page(pending_req, i), NULL);
-		if (ret) {
-			pr_alert(DRV_PFX "Failed to install M2P override for %lx (ret: %d)\n",
-				 (unsigned long)map[i].dev_bus_addr, ret);
-			/* We could switch over to GNTTABOP_copy */
-			continue;
-		}
-
 		seg[i].buf  = map[i].dev_bus_addr |
 			(req->u.rw.seg[i].first_sect << 9);
 	}
@@ -419,21 +398,18 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
 	int err = 0;
 	int status = BLKIF_RSP_OKAY;
 	struct block_device *bdev = blkif->vbd.bdev;
+	unsigned long secure;
 
 	blkif->st_ds_req++;
 
 	xen_blkif_get(blkif);
-	if (blkif->blk_backend_type == BLKIF_BACKEND_PHY ||
-	    blkif->blk_backend_type == BLKIF_BACKEND_FILE) {
-		unsigned long secure = (blkif->vbd.discard_secure &&
-			(req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
-			BLKDEV_DISCARD_SECURE : 0;
-		err = blkdev_issue_discard(bdev,
-				req->u.discard.sector_number,
-				req->u.discard.nr_sectors,
-				GFP_KERNEL, secure);
-	} else
-		err = -EOPNOTSUPP;
+	secure = (blkif->vbd.discard_secure &&
+		 (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ?
+		 BLKDEV_DISCARD_SECURE : 0;
+
+	err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
+				   req->u.discard.nr_sectors,
+				   GFP_KERNEL, secure);
 
 	if (err == -EOPNOTSUPP) {
 		pr_debug(DRV_PFX "discard op failed, not supported\n");
@@ -830,7 +806,7 @@ static int __init xen_blkif_init(void)
 	int i, mmap_pages;
 	int rc = 0;
 
-	if (!xen_pv_domain())
+	if (!xen_domain())
 		return -ENODEV;
 
 	blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index d0ee7edc9be8..773cf27dc23f 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -146,11 +146,6 @@ enum blkif_protocol {
 	BLKIF_PROTOCOL_X86_64 = 3,
 };
 
-enum blkif_backend_type {
-	BLKIF_BACKEND_PHY  = 1,
-	BLKIF_BACKEND_FILE = 2,
-};
-
 struct xen_vbd {
 	/* What the domain refers to this vbd as. */
 	blkif_vdev_t		handle;
@@ -177,7 +172,6 @@ struct xen_blkif {
 	unsigned int		irq;
 	/* Comms information. */
 	enum blkif_protocol	blk_protocol;
-	enum blkif_backend_type blk_backend_type;
 	union blkif_back_rings	blk_rings;
 	void			*blk_ring;
 	/* The VBD attached to this interface. */
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 24a2fb57e5d0..4f66171c6683 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -381,72 +381,49 @@ int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
 	err = xenbus_printf(xbt, dev->nodename, "feature-flush-cache",
 			    "%d", state);
 	if (err)
-		xenbus_dev_fatal(dev, err, "writing feature-flush-cache");
+		dev_warn(&dev->dev, "writing feature-flush-cache (%d)", err);
 
 	return err;
 }
 
-int xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be)
+static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be)
 {
 	struct xenbus_device *dev = be->dev;
 	struct xen_blkif *blkif = be->blkif;
-	char *type;
 	int err;
 	int state = 0;
+	struct block_device *bdev = be->blkif->vbd.bdev;
+	struct request_queue *q = bdev_get_queue(bdev);
 
-	type = xenbus_read(XBT_NIL, dev->nodename, "type", NULL);
-	if (!IS_ERR(type)) {
-		if (strncmp(type, "file", 4) == 0) {
-			state = 1;
-			blkif->blk_backend_type = BLKIF_BACKEND_FILE;
+	if (blk_queue_discard(q)) {
+		err = xenbus_printf(xbt, dev->nodename,
+			"discard-granularity", "%u",
+			q->limits.discard_granularity);
+		if (err) {
+			dev_warn(&dev->dev, "writing discard-granularity (%d)", err);
+			return;
 		}
-		if (strncmp(type, "phy", 3) == 0) {
-			struct block_device *bdev = be->blkif->vbd.bdev;
-			struct request_queue *q = bdev_get_queue(bdev);
-			if (blk_queue_discard(q)) {
-				err = xenbus_printf(xbt, dev->nodename,
-					"discard-granularity", "%u",
-					q->limits.discard_granularity);
-				if (err) {
-					xenbus_dev_fatal(dev, err,
-						"writing discard-granularity");
-					goto kfree;
-				}
-				err = xenbus_printf(xbt, dev->nodename,
-					"discard-alignment", "%u",
-					q->limits.discard_alignment);
-				if (err) {
-					xenbus_dev_fatal(dev, err,
-						"writing discard-alignment");
-					goto kfree;
-				}
-				state = 1;
-				blkif->blk_backend_type = BLKIF_BACKEND_PHY;
-			}
-			/* Optional. */
-			err = xenbus_printf(xbt, dev->nodename,
-				"discard-secure", "%d",
-				blkif->vbd.discard_secure);
-			if (err) {
-				xenbus_dev_fatal(dev, err,
-					"writting discard-secure");
-				goto kfree;
-			}
+		err = xenbus_printf(xbt, dev->nodename,
+			"discard-alignment", "%u",
+			q->limits.discard_alignment);
+		if (err) {
+			dev_warn(&dev->dev, "writing discard-alignment (%d)", err);
+			return;
+		}
+		state = 1;
+		/* Optional. */
+		err = xenbus_printf(xbt, dev->nodename,
+				    "discard-secure", "%d",
+				    blkif->vbd.discard_secure);
+		if (err) {
+			dev_warn(&dev->dev, "writing discard-secure (%d)", err);
+			return;
 		}
-	} else {
-		err = PTR_ERR(type);
-		xenbus_dev_fatal(dev, err, "reading type");
-		goto out;
 	}
-
 	err = xenbus_printf(xbt, dev->nodename, "feature-discard",
 			    "%d", state);
 	if (err)
-		xenbus_dev_fatal(dev, err, "writing feature-discard");
-kfree:
-	kfree(type);
-out:
-	return err;
+		dev_warn(&dev->dev, "writing feature-discard (%d)", err);
 }
 int xen_blkbk_barrier(struct xenbus_transaction xbt,
 		      struct backend_info *be, int state)
@@ -457,7 +434,7 @@ int xen_blkbk_barrier(struct xenbus_transaction xbt,
 	err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
 			    "%d", state);
 	if (err)
-		xenbus_dev_fatal(dev, err, "writing feature-barrier");
+		dev_warn(&dev->dev, "writing feature-barrier (%d)", err);
 
 	return err;
 }
@@ -689,14 +666,12 @@ again:
 		return;
 	}
 
-	err = xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support);
-	if (err)
-		goto abort;
+	/* If we can't advertise it is OK. */
+	xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support);
 
-	err = xen_blkbk_discard(xbt, be);
+	xen_blkbk_discard(xbt, be);
 
-	/* If we can't advertise it is OK. */
-	err = xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support);
+	xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support);
 
 	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
 			    (unsigned long long)vbd_sz(&be->blkif->vbd));
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index 2f22874c0a37..60eed4bdd2e4 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -43,6 +43,7 @@
 #include <linux/slab.h>
 #include <linux/mutex.h>
 #include <linux/scatterlist.h>
+#include <linux/bitmap.h>
 
 #include <xen/xen.h>
 #include <xen/xenbus.h>
@@ -81,6 +82,7 @@ static const struct block_device_operations xlvbd_block_fops;
  */
 struct blkfront_info
 {
+	spinlock_t io_lock;
 	struct mutex mutex;
 	struct xenbus_device *xbdev;
 	struct gendisk *gd;
@@ -105,8 +107,6 @@ struct blkfront_info
 	int is_ready;
 };
 
-static DEFINE_SPINLOCK(blkif_io_lock);
-
 static unsigned int nr_minors;
 static unsigned long *minors;
 static DEFINE_SPINLOCK(minor_lock);
@@ -177,8 +177,7 @@ static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
 
 	spin_lock(&minor_lock);
 	if (find_next_bit(minors, end, minor) >= end) {
-		for (; minor < end; ++minor)
-			__set_bit(minor, minors);
+		bitmap_set(minors, minor, nr);
 		rc = 0;
 	} else
 		rc = -EBUSY;
@@ -193,8 +192,7 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr)
 
 	BUG_ON(end > nr_minors);
 	spin_lock(&minor_lock);
-	for (; minor < end; ++minor)
-		__clear_bit(minor, minors);
+	bitmap_clear(minors,  minor, nr);
 	spin_unlock(&minor_lock);
 }
 
@@ -419,7 +417,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 	struct request_queue *rq;
 	struct blkfront_info *info = gd->private_data;
 
-	rq = blk_init_queue(do_blkif_request, &blkif_io_lock);
+	rq = blk_init_queue(do_blkif_request, &info->io_lock);
 	if (rq == NULL)
 		return -1;
 
@@ -528,6 +526,14 @@ static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
 	return 0;
 }
 
+static char *encode_disk_name(char *ptr, unsigned int n)
+{
+	if (n >= 26)
+		ptr = encode_disk_name(ptr, n / 26 - 1);
+	*ptr = 'a' + n % 26;
+	return ptr + 1;
+}
+
 static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 			       struct blkfront_info *info,
 			       u16 vdisk_info, u16 sector_size)
@@ -538,6 +544,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 	unsigned int offset;
 	int minor;
 	int nr_parts;
+	char *ptr;
 
 	BUG_ON(info->gd != NULL);
 	BUG_ON(info->rq != NULL);
@@ -562,7 +569,11 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 					"emulated IDE disks,\n\t choose an xvd device name"
 					"from xvde on\n", info->vdevice);
 	}
-	err = -ENODEV;
+	if (minor >> MINORBITS) {
+		pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n",
+			info->vdevice, minor);
+		return -ENODEV;
+	}
 
 	if ((minor % nr_parts) == 0)
 		nr_minors = nr_parts;
@@ -576,23 +587,14 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 	if (gd == NULL)
 		goto release;
 
-	if (nr_minors > 1) {
-		if (offset < 26)
-			sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset);
-		else
-			sprintf(gd->disk_name, "%s%c%c", DEV_NAME,
-				'a' + ((offset / 26)-1), 'a' + (offset % 26));
-	} else {
-		if (offset < 26)
-			sprintf(gd->disk_name, "%s%c%d", DEV_NAME,
-				'a' + offset,
-				minor & (nr_parts - 1));
-		else
-			sprintf(gd->disk_name, "%s%c%c%d", DEV_NAME,
-				'a' + ((offset / 26) - 1),
-				'a' + (offset % 26),
-				minor & (nr_parts - 1));
-	}
+	strcpy(gd->disk_name, DEV_NAME);
+	ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
+	BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN);
+	if (nr_minors > 1)
+		*ptr = 0;
+	else
+		snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr,
+			 "%d", minor & (nr_parts - 1));
 
 	gd->major = XENVBD_MAJOR;
 	gd->first_minor = minor;
@@ -636,14 +638,14 @@ static void xlvbd_release_gendisk(struct blkfront_info *info)
 	if (info->rq == NULL)
 		return;
 
-	spin_lock_irqsave(&blkif_io_lock, flags);
+	spin_lock_irqsave(&info->io_lock, flags);
 
 	/* No more blkif_request(). */
 	blk_stop_queue(info->rq);
 
 	/* No more gnttab callback work. */
 	gnttab_cancel_free_callback(&info->callback);
-	spin_unlock_irqrestore(&blkif_io_lock, flags);
+	spin_unlock_irqrestore(&info->io_lock, flags);
 
 	/* Flush gnttab callback work. Must be done with no locks held. */
 	flush_work_sync(&info->work);
@@ -675,16 +677,16 @@ static void blkif_restart_queue(struct work_struct *work)
 {
 	struct blkfront_info *info = container_of(work, struct blkfront_info, work);
 
-	spin_lock_irq(&blkif_io_lock);
+	spin_lock_irq(&info->io_lock);
 	if (info->connected == BLKIF_STATE_CONNECTED)
 		kick_pending_request_queues(info);
-	spin_unlock_irq(&blkif_io_lock);
+	spin_unlock_irq(&info->io_lock);
 }
 
 static void blkif_free(struct blkfront_info *info, int suspend)
 {
 	/* Prevent new requests being issued until we fix things up. */
-	spin_lock_irq(&blkif_io_lock);
+	spin_lock_irq(&info->io_lock);
 	info->connected = suspend ?
 		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
 	/* No more blkif_request(). */
@@ -692,7 +694,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 		blk_stop_queue(info->rq);
 	/* No more gnttab callback work. */
 	gnttab_cancel_free_callback(&info->callback);
-	spin_unlock_irq(&blkif_io_lock);
+	spin_unlock_irq(&info->io_lock);
 
 	/* Flush gnttab callback work. Must be done with no locks held. */
 	flush_work_sync(&info->work);
@@ -728,10 +730,10 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 	struct blkfront_info *info = (struct blkfront_info *)dev_id;
 	int error;
 
-	spin_lock_irqsave(&blkif_io_lock, flags);
+	spin_lock_irqsave(&info->io_lock, flags);
 
 	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
-		spin_unlock_irqrestore(&blkif_io_lock, flags);
+		spin_unlock_irqrestore(&info->io_lock, flags);
 		return IRQ_HANDLED;
 	}
 
@@ -816,7 +818,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id)
 
 	kick_pending_request_queues(info);
 
-	spin_unlock_irqrestore(&blkif_io_lock, flags);
+	spin_unlock_irqrestore(&info->io_lock, flags);
 
 	return IRQ_HANDLED;
 }
@@ -991,6 +993,7 @@ static int blkfront_probe(struct xenbus_device *dev,
 	}
 
 	mutex_init(&info->mutex);
+	spin_lock_init(&info->io_lock);
 	info->xbdev = dev;
 	info->vdevice = vdevice;
 	info->connected = BLKIF_STATE_DISCONNECTED;
@@ -1068,7 +1071,7 @@ static int blkif_recover(struct blkfront_info *info)
 
 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
-	spin_lock_irq(&blkif_io_lock);
+	spin_lock_irq(&info->io_lock);
 
 	/* Now safe for us to use the shared ring */
 	info->connected = BLKIF_STATE_CONNECTED;
@@ -1079,7 +1082,7 @@ static int blkif_recover(struct blkfront_info *info)
 	/* Kick any other new requests queued since we resumed */
 	kick_pending_request_queues(info);
 
-	spin_unlock_irq(&blkif_io_lock);
+	spin_unlock_irq(&info->io_lock);
 
 	return 0;
 }
@@ -1277,10 +1280,10 @@ static void blkfront_connect(struct blkfront_info *info)
 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
 	/* Kick pending requests. */
-	spin_lock_irq(&blkif_io_lock);
+	spin_lock_irq(&info->io_lock);
 	info->connected = BLKIF_STATE_CONNECTED;
 	kick_pending_request_queues(info);
-	spin_unlock_irq(&blkif_io_lock);
+	spin_unlock_irq(&info->io_lock);
 
 	add_disk(info->gd);
 
@@ -1410,7 +1413,6 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
 	mutex_lock(&blkfront_mutex);
 
 	bdev = bdget_disk(disk, 0);
-	bdput(bdev);
 
 	if (bdev->bd_openers)
 		goto out;
@@ -1441,6 +1443,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode)
 	}
 
 out:
+	bdput(bdev);
 	mutex_unlock(&blkfront_mutex);
 	return 0;
 }
@@ -1475,6 +1478,9 @@ static int __init xlblk_init(void)
 	if (!xen_domain())
 		return -ENODEV;
 
+	if (xen_hvm_domain() && !xen_platform_pci_unplug)
+		return -ENODEV;
+
 	if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
 		printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n",
 		       XENVBD_MAJOR, DEV_NAME);
@@ -1494,7 +1500,9 @@ module_init(xlblk_init);
 
 static void __exit xlblk_exit(void)
 {
-	return xenbus_unregister_driver(&blkfront_driver);
+	xenbus_unregister_driver(&blkfront_driver);
+	unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
+	kfree(minors);
 }
 module_exit(xlblk_exit);
author	Sage Weil <sage@inktank.com>	2012-06-15 21:32:04 +0200
committer	Sage Weil <sage@inktank.com>	2012-06-15 21:32:04 +0200
commit	9a64e8e0ace51b309fdcff4b4754b3649250382a (patch)
tree	1f0d75c196c5ab0408c55ed6cf3a152f1f921e15 /drivers/block
parent	libceph: flush msgr queue during mon_client shutdown (diff)
parent	Linux 3.5-rc1 (diff)
download	linux-9a64e8e0ace51b309fdcff4b4754b3649250382a.tar.xz linux-9a64e8e0ace51b309fdcff4b4754b3649250382a.zip