Diffstat (limited to 'drivers/block/drbd/drbd_worker.c')
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 107
1 file changed, 55 insertions, 52 deletions
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 2c4ce42c3657..d8f57b6305cd 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -118,7 +118,7 @@ static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __rele
 
 /* writes on behalf of the partner, or resync writes,
  * "submitted" by the receiver, final stage. */
-static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
+void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
 {
         unsigned long flags = 0;
         struct drbd_peer_device *peer_device = peer_req->peer_device;
@@ -150,7 +150,9 @@ static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __rel
 
         do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);
 
-        if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
+        /* FIXME do we want to detach for failed REQ_DISCARD?
+         * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */
+        if (peer_req->flags & EE_WAS_ERROR)
                 __drbd_chk_io_error(device, DRBD_WRITE_ERROR);
         spin_unlock_irqrestore(&device->resource->req_lock, flags);
 
@@ -176,10 +178,12 @@ void drbd_peer_request_endio(struct bio *bio, int error)
         struct drbd_device *device = peer_req->peer_device->device;
         int uptodate = bio_flagged(bio, BIO_UPTODATE);
         int is_write = bio_data_dir(bio) == WRITE;
+        int is_discard = !!(bio->bi_rw & REQ_DISCARD);
 
         if (error && __ratelimit(&drbd_ratelimit_state))
                 drbd_warn(device, "%s: error=%d s=%llus\n",
-                                is_write ? "write" : "read", error,
+                                is_write ? (is_discard ? "discard" : "write")
+                                         : "read", error,
                                 (unsigned long long)peer_req->i.sector);
         if (!error && !uptodate) {
                 if (__ratelimit(&drbd_ratelimit_state))
@@ -263,7 +267,12 @@ void drbd_request_endio(struct bio *bio, int error)
 
         /* to avoid recursion in __req_mod */
         if (unlikely(error)) {
-                what = (bio_data_dir(bio) == WRITE)
+                if (bio->bi_rw & REQ_DISCARD)
+                        what = (error == -EOPNOTSUPP)
+                                ? DISCARD_COMPLETED_NOTSUPP
+                                : DISCARD_COMPLETED_WITH_ERROR;
+                else
+                        what = (bio_data_dir(bio) == WRITE)
                         ? WRITE_COMPLETED_WITH_ERROR
                         : (bio_rw(bio) == READ)
                           ? READ_COMPLETED_WITH_ERROR
@@ -395,7 +404,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector,
         /* GFP_TRY, because if there is no memory available right now, this may
          * be rescheduled for later. It is "only" background resync, after all. */
         peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
-                                       size, GFP_TRY);
+                                       size, true /* has real payload */, GFP_TRY);
         if (!peer_req)
                 goto defer;
 
@@ -492,10 +501,9 @@ struct fifo_buffer *fifo_alloc(int fifo_size)
         return fb;
 }
 
-static int drbd_rs_controller(struct drbd_device *device)
+static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
 {
         struct disk_conf *dc;
-        unsigned int sect_in;  /* Number of sectors that came in since the last turn */
         unsigned int want;     /* The number of sectors we want in the proxy */
         int req_sect; /* Number of sectors to request in this turn */
         int correction; /* Number of sectors more we need in the proxy*/
@@ -505,9 +513,6 @@ static int drbd_rs_controller(struct drbd_device *device)
         int max_sect;
         struct fifo_buffer *plan;
 
-        sect_in = atomic_xchg(&device->rs_sect_in, 0); /* Number of sectors that came in */
-        device->rs_in_flight -= sect_in;
-
         dc = rcu_dereference(device->ldev->disk_conf);
         plan = rcu_dereference(device->rs_plan_s);
 
@@ -550,11 +555,16 @@ static int drbd_rs_controller(struct drbd_device *device)
 
 static int drbd_rs_number_requests(struct drbd_device *device)
 {
-        int number;
+        unsigned int sect_in;  /* Number of sectors that came in since the last turn */
+        int number, mxb;
+
+        sect_in = atomic_xchg(&device->rs_sect_in, 0);
+        device->rs_in_flight -= sect_in;
 
         rcu_read_lock();
+        mxb = drbd_get_max_buffers(device) / 2;
         if (rcu_dereference(device->rs_plan_s)->size) {
-                number = drbd_rs_controller(device) >> (BM_BLOCK_SHIFT - 9);
+                number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);
                 device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
         } else {
                 device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
@@ -562,8 +572,14 @@ static int drbd_rs_number_requests(struct drbd_device *device)
         }
         rcu_read_unlock();
 
-        /* ignore the amount of pending requests, the resync controller should
-         * throttle down to incoming reply rate soon enough anyways. */
+        /* Don't have more than "max-buffers"/2 in-flight.
+         * Otherwise we may cause the remote site to stall on drbd_alloc_pages(),
+         * potentially causing a distributed deadlock on congestion during
+         * online-verify or (checksum-based) resync, if max-buffers,
+         * socket buffer sizes and resync rate settings are mis-configured. */
+        if (mxb - device->rs_in_flight < number)
+                number = mxb - device->rs_in_flight;
+
         return number;
 }
 
@@ -597,7 +613,7 @@ static int make_resync_request(struct drbd_device *device, int cancel)
 
         max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
         number = drbd_rs_number_requests(device);
-        if (number == 0)
+        if (number <= 0)
                 goto requeue;
 
         for (i = 0; i < number; i++) {
@@ -647,7 +663,7 @@
                  */
                 align = 1;
                 rollback_i = i;
-                for (;;) {
+                while (i < number) {
                         if (size + BM_BLOCK_SIZE > max_bio_size)
                                 break;
 
@@ -1670,11 +1686,15 @@
         }
         clear_bit(B_RS_H_DONE, &device->flags);
 
-        write_lock_irq(&global_state_lock);
+        /* req_lock: serialize with drbd_send_and_submit() and others
+         * global_state_lock: for stable sync-after dependencies */
+        spin_lock_irq(&device->resource->req_lock);
+        write_lock(&global_state_lock);
         /* Did some connection breakage or IO error race with us? */
         if (device->state.conn < C_CONNECTED
         || !get_ldev_if_state(device, D_NEGOTIATING)) {
-                write_unlock_irq(&global_state_lock);
+                write_unlock(&global_state_lock);
+                spin_unlock_irq(&device->resource->req_lock);
                 mutex_unlock(device->state_mutex);
                 return;
         }
@@ -1714,7 +1734,8 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
                 }
                 _drbd_pause_after(device);
         }
-        write_unlock_irq(&global_state_lock);
+        write_unlock(&global_state_lock);
+        spin_unlock_irq(&device->resource->req_lock);
 
         if (r == SS_SUCCESS) {
                 /* reset rs_last_bcast when a resync or verify is started,
@@ -1778,34 +1799,6 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
         mutex_unlock(device->state_mutex);
 }
 
-/* If the resource already closed the current epoch, but we did not
- * (because we have not yet seen new requests), we should send the
- * corresponding barrier now. Must be checked within the same spinlock
- * that is used to check for new requests. */
-static bool need_to_send_barrier(struct drbd_connection *connection)
-{
-        if (!connection->send.seen_any_write_yet)
-                return false;
-
-        /* Skip barriers that do not contain any writes.
-         * This may happen during AHEAD mode. */
-        if (!connection->send.current_epoch_writes)
-                return false;
-
-        /* ->req_lock is held when requests are queued on
-         * connection->sender_work, and put into ->transfer_log.
-         * It is also held when ->current_tle_nr is increased.
-         * So either there are already new requests queued,
-         * and corresponding barriers will be send there.
-         * Or nothing new is queued yet, so the difference will be 1.
-         */
-        if (atomic_read(&connection->current_tle_nr) !=
-            connection->send.current_epoch_nr + 1)
-                return false;
-
-        return true;
-}
-
 static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
 {
         spin_lock_irq(&queue->q_lock);
@@ -1864,12 +1857,22 @@ static void wait_for_work(struct drbd_connection *connection, struct list_head *
                         spin_unlock_irq(&connection->resource->req_lock);
                         break;
                 }
-                send_barrier = need_to_send_barrier(connection);
+
+                /* We found nothing new to do, no to-be-communicated request,
+                 * no other work item. We may still need to close the last
+                 * epoch. Next incoming request epoch will be connection ->
+                 * current transfer log epoch number. If that is different
+                 * from the epoch of the last request we communicated, it is
+                 * safe to send the epoch separating barrier now.
+                 */
+                send_barrier =
+                        atomic_read(&connection->current_tle_nr) !=
+                        connection->send.current_epoch_nr;
                 spin_unlock_irq(&connection->resource->req_lock);
-                if (send_barrier) {
-                        drbd_send_barrier(connection);
-                        connection->send.current_epoch_nr++;
-                }
+
+                if (send_barrier)
+                        maybe_send_barrier(connection,
+                                        connection->send.current_epoch_nr + 1);
                 schedule();
                 /* may be woken up for other things but new work, too,
                  * e.g. if the current epoch got closed.
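
Note on the resync throttle above: the new mxb / rs_in_flight clamp in drbd_rs_number_requests() keeps the number of outstanding resync requests below half of the configured max-buffers, so the peer cannot be starved in drbd_alloc_pages(). Below is a minimal, self-contained sketch of just that arithmetic; clamp_resync_requests() and its plain-int parameters are invented for illustration, while the real code works on struct drbd_device, drbd_get_max_buffers() and device->rs_in_flight.

/* Standalone model of the in-flight clamp added to drbd_rs_number_requests()
 * above.  clamp_resync_requests() and its parameters are made up for
 * illustration only. */
#include <stdio.h>

static int clamp_resync_requests(int wanted, int max_buffers, int in_flight)
{
        int mxb = max_buffers / 2;      /* never occupy more than half of max-buffers */

        /* Cut the controller's request down so that in_flight + result never
         * exceeds mxb.  The result may become negative; the caller
         * (make_resync_request() in the hunk above) now treats any
         * "number <= 0" as "nothing to do this turn". */
        if (mxb - in_flight < wanted)
                wanted = mxb - in_flight;
        return wanted;
}

int main(void)
{
        /* max-buffers 2048 -> mxb 1024; with 900 requests already in flight,
         * a controller asking for 400 more is trimmed to 124. */
        printf("%d\n", clamp_resync_requests(400, 2048, 900));
        /* Already over the limit -> negative result, treated like 0 by the caller. */
        printf("%d\n", clamp_resync_requests(200, 2048, 1100));
        return 0;
}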
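
The rewritten idle path in wait_for_work() above drops need_to_send_barrier() and decides from a single epoch-number comparison whether the last communicated epoch still needs its closing barrier; the remaining checks (seen_any_write_yet, epochs without writes) are presumably handled inside maybe_send_barrier(), which is not part of this diff. A minimal model of that comparison, using an invented struct in place of struct drbd_connection:

/* Illustrative model of the idle-path barrier decision in wait_for_work()
 * above.  struct conn_model and epoch_needs_barrier() are hypothetical; the
 * real code compares connection->current_tle_nr (an atomic_t) with
 * connection->send.current_epoch_nr while holding resource->req_lock. */
#include <stdbool.h>
#include <stdio.h>

struct conn_model {
        int current_tle_nr;     /* epoch number of the newest transfer-log entry */
        int sent_epoch_nr;      /* epoch of the last request communicated to the peer */
};

static bool epoch_needs_barrier(const struct conn_model *c)
{
        /* Nothing new is queued.  If the transfer log has already moved on to
         * a newer epoch than the one we last sent requests for, that older
         * epoch is complete on the wire and may be closed with a barrier. */
        return c->current_tle_nr != c->sent_epoch_nr;
}

int main(void)
{
        struct conn_model idle  = { .current_tle_nr = 7, .sent_epoch_nr = 7 };
        struct conn_model stale = { .current_tle_nr = 8, .sent_epoch_nr = 7 };

        printf("idle:  %s\n", epoch_needs_barrier(&idle)  ? "send barrier" : "nothing to do");
        printf("stale: %s\n", epoch_needs_barrier(&stale) ? "send barrier" : "nothing to do");
        return 0;
}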