diff options
Diffstat (limited to 'drivers/block/drbd/drbd_receiver.c')
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 171 |
1 files changed, 156 insertions, 15 deletions
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 02a327891568..47d2d6f87c2c 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -50,7 +50,7 @@ #include "drbd_req.h" #include "drbd_vli.h" -#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME) +#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME|DRBD_FF_WZEROES) struct packet_info { enum drbd_packet cmd; @@ -1490,14 +1490,129 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); } -static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req) +/* + * Mapping "discard" to ZEROOUT with UNMAP does not work for us: + * Drivers have to "announce" q->limits.max_write_zeroes_sectors, or it + * will directly go to fallback mode, submitting normal writes, and + * never even try to UNMAP. + * + * And dm-thin does not do this (yet), mostly because in general it has + * to assume that "skip_block_zeroing" is set. See also: + * https://www.mail-archive.com/dm-devel%40redhat.com/msg07965.html + * https://www.redhat.com/archives/dm-devel/2018-January/msg00271.html + * + * We *may* ignore the discard-zeroes-data setting, if so configured. + * + * Assumption is that this "discard_zeroes_data=0" is only because the backend + * may ignore partial unaligned discards. + * + * LVM/DM thin as of at least + * LVM version: 2.02.115(2)-RHEL7 (2015-01-28) + * Library version: 1.02.93-RHEL7 (2015-01-28) + * Driver version: 4.29.0 + * still behaves this way. + * + * For unaligned (wrt. alignment and granularity) or too small discards, + * we zero-out the initial (and/or) trailing unaligned partial chunks, + * but discard all the aligned full chunks. + * + * At least for LVM/DM thin, with skip_block_zeroing=false, + * the result is effectively "discard_zeroes_data=1". + */ +/* flags: EE_TRIM|EE_ZEROOUT */ +int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags) { struct block_device *bdev = device->ldev->backing_bdev; + struct request_queue *q = bdev_get_queue(bdev); + sector_t tmp, nr; + unsigned int max_discard_sectors, granularity; + int alignment; + int err = 0; - if (blkdev_issue_zeroout(bdev, peer_req->i.sector, peer_req->i.size >> 9, - GFP_NOIO, 0)) - peer_req->flags |= EE_WAS_ERROR; + if ((flags & EE_ZEROOUT) || !(flags & EE_TRIM)) + goto zero_out; + + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; + + max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22)); + max_discard_sectors -= max_discard_sectors % granularity; + if (unlikely(!max_discard_sectors)) + goto zero_out; + + if (nr_sectors < granularity) + goto zero_out; + + tmp = start; + if (sector_div(tmp, granularity) != alignment) { + if (nr_sectors < 2*granularity) + goto zero_out; + /* start + gran - (start + gran - align) % gran */ + tmp = start + granularity - alignment; + tmp = start + granularity - sector_div(tmp, granularity); + + nr = tmp - start; + /* don't flag BLKDEV_ZERO_NOUNMAP, we don't know how many + * layers are below us, some may have smaller granularity */ + err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0); + nr_sectors -= nr; + start = tmp; + } + while (nr_sectors >= max_discard_sectors) { + err |= blkdev_issue_discard(bdev, start, max_discard_sectors, GFP_NOIO, 0); + nr_sectors -= max_discard_sectors; + start += max_discard_sectors; + } + if (nr_sectors) { + /* max_discard_sectors is unsigned int (and a multiple of + * granularity, we made sure of that above already); + * nr is < max_discard_sectors; + * I don't need sector_div here, even though nr is sector_t */ + nr = nr_sectors; + nr -= (unsigned int)nr % granularity; + if (nr) { + err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0); + nr_sectors -= nr; + start += nr; + } + } + zero_out: + if (nr_sectors) { + err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, + (flags & EE_TRIM) ? 0 : BLKDEV_ZERO_NOUNMAP); + } + return err != 0; +} + +static bool can_do_reliable_discards(struct drbd_device *device) +{ + struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); + struct disk_conf *dc; + bool can_do; + if (!blk_queue_discard(q)) + return false; + + rcu_read_lock(); + dc = rcu_dereference(device->ldev->disk_conf); + can_do = dc->discard_zeroes_if_aligned; + rcu_read_unlock(); + return can_do; +} + +static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, struct drbd_peer_request *peer_req) +{ + /* If the backend cannot discard, or does not guarantee + * read-back zeroes in discarded ranges, we fall back to + * zero-out. Unless configuration specifically requested + * otherwise. */ + if (!can_do_reliable_discards(device)) + peer_req->flags |= EE_ZEROOUT; + + if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector, + peer_req->i.size >> 9, peer_req->flags & (EE_ZEROOUT|EE_TRIM))) + peer_req->flags |= EE_WAS_ERROR; drbd_endio_write_sec_final(peer_req); } @@ -1550,7 +1665,7 @@ int drbd_submit_peer_request(struct drbd_device *device, * Correctness first, performance later. Next step is to code an * asynchronous variant of the same. */ - if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) { + if (peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) { /* wait for all pending IO completions, before we start * zeroing things out. */ conn_wait_active_ee_empty(peer_req->peer_device->connection); @@ -1567,8 +1682,8 @@ int drbd_submit_peer_request(struct drbd_device *device, spin_unlock_irq(&device->resource->req_lock); } - if (peer_req->flags & EE_IS_TRIM) - drbd_issue_peer_discard(device, peer_req); + if (peer_req->flags & (EE_TRIM|EE_ZEROOUT)) + drbd_issue_peer_discard_or_zero_out(device, peer_req); else /* EE_WRITE_SAME */ drbd_issue_peer_wsame(device, peer_req); return 0; @@ -1765,6 +1880,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, void *dig_vv = peer_device->connection->int_dig_vv; unsigned long *data; struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; + struct p_trim *zeroes = (pi->cmd == P_ZEROES) ? pi->data : NULL; struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL; digest_size = 0; @@ -1786,6 +1902,10 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, if (!expect(data_size == 0)) return NULL; ds = be32_to_cpu(trim->size); + } else if (zeroes) { + if (!expect(data_size == 0)) + return NULL; + ds = be32_to_cpu(zeroes->size); } else if (wsame) { if (data_size != queue_logical_block_size(device->rq_queue)) { drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n", @@ -1802,7 +1922,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, if (!expect(IS_ALIGNED(ds, 512))) return NULL; - if (trim || wsame) { + if (trim || wsame || zeroes) { if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9))) return NULL; } else if (!expect(ds <= DRBD_MAX_BIO_SIZE)) @@ -1827,7 +1947,11 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, peer_req->flags |= EE_WRITE; if (trim) { - peer_req->flags |= EE_IS_TRIM; + peer_req->flags |= EE_TRIM; + return peer_req; + } + if (zeroes) { + peer_req->flags |= EE_ZEROOUT; return peer_req; } if (wsame) @@ -2326,8 +2450,12 @@ static unsigned long wire_flags_to_bio_flags(u32 dpf) static unsigned long wire_flags_to_bio_op(u32 dpf) { - if (dpf & DP_DISCARD) + if (dpf & DP_ZEROES) return REQ_OP_WRITE_ZEROES; + if (dpf & DP_DISCARD) + return REQ_OP_DISCARD; + if (dpf & DP_WSAME) + return REQ_OP_WRITE_SAME; else return REQ_OP_WRITE; } @@ -2518,8 +2646,19 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * op_flags = wire_flags_to_bio_flags(dp_flags); if (pi->cmd == P_TRIM) { D_ASSERT(peer_device, peer_req->i.size > 0); + D_ASSERT(peer_device, op == REQ_OP_DISCARD); + D_ASSERT(peer_device, peer_req->pages == NULL); + /* need to play safe: an older DRBD sender + * may mean zero-out while sending P_TRIM. */ + if (0 == (connection->agreed_features & DRBD_FF_WZEROES)) + peer_req->flags |= EE_ZEROOUT; + } else if (pi->cmd == P_ZEROES) { + D_ASSERT(peer_device, peer_req->i.size > 0); D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES); D_ASSERT(peer_device, peer_req->pages == NULL); + /* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */ + if (dp_flags & DP_DISCARD) + peer_req->flags |= EE_TRIM; } else if (peer_req->pages == NULL) { D_ASSERT(device, peer_req->i.size == 0); D_ASSERT(device, dp_flags & DP_FLUSH); @@ -2587,7 +2726,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * * we wait for all pending requests, respectively wait for * active_ee to become empty in drbd_submit_peer_request(); * better not add ourselves here. */ - if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0) + if ((peer_req->flags & (EE_TRIM|EE_WRITE_SAME|EE_ZEROOUT)) == 0) list_add_tail(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); @@ -4893,7 +5032,7 @@ static int receive_rs_deallocated(struct drbd_connection *connection, struct pac peer_req->w.cb = e_end_resync_block; peer_req->submit_jif = jiffies; - peer_req->flags |= EE_IS_TRIM; + peer_req->flags |= EE_TRIM; spin_lock_irq(&device->resource->req_lock); list_add_tail(&peer_req->w.list, &device->sync_ee); @@ -4961,6 +5100,7 @@ static struct data_cmd drbd_cmd_handler[] = { [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, + [P_ZEROES] = { 0, sizeof(struct p_trim), receive_Data }, [P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated }, [P_WSAME] = { 1, sizeof(struct p_wsame), receive_Data }, }; @@ -5245,11 +5385,12 @@ static int drbd_do_features(struct drbd_connection *connection) drbd_info(connection, "Handshake successful: " "Agreed network protocol version %d\n", connection->agreed_pro_version); - drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s.\n", + drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s%s.\n", connection->agreed_features, connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "", connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "", - connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : + connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : "", + connection->agreed_features & DRBD_FF_WZEROES ? " WRITE_ZEROES" : connection->agreed_features ? "" : " none"); return 1; |