diff options
Diffstat (limited to 'drivers/block')
49 files changed, 1625 insertions, 848 deletions
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index d597e432e195..ab19adb07a12 100644 --- a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -1750,7 +1750,7 @@ aoecmd_init(void) int ret; /* get_zeroed_page returns page with ref count 1 */ - p = (void *) get_zeroed_page(GFP_KERNEL | __GFP_REPEAT); + p = (void *) get_zeroed_page(GFP_KERNEL); if (!p) return -ENOMEM; empty_page = virt_to_page(p); diff --git a/drivers/block/brd.c b/drivers/block/brd.c index c04bd9bc39fd..ba5145d384d8 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -339,7 +339,7 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) if (bio_end_sector(bio) > get_capacity(bdev->bd_disk)) goto io_error; - if (unlikely(bio->bi_rw & REQ_DISCARD)) { + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { if (sector & ((PAGE_SIZE >> SECTOR_SHIFT) - 1) || bio->bi_iter.bi_size & ~PAGE_MASK) goto io_error; @@ -347,9 +347,7 @@ static blk_qc_t brd_make_request(struct request_queue *q, struct bio *bio) goto out; } - rw = bio_rw(bio); - if (rw == READA) - rw = READ; + rw = bio_data_dir(bio); bio_for_each_segment(bvec, bio, iter) { unsigned int len = bvec.bv_len; @@ -509,7 +507,9 @@ static struct brd_device *brd_alloc(int i) blk_queue_max_discard_sectors(brd->brd_queue, UINT_MAX); brd->brd_queue->limits.discard_zeroes_data = 1; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, brd->brd_queue); - +#ifdef CONFIG_BLK_DEV_RAM_DAX + queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue); +#endif disk = brd->brd_disk = alloc_disk(max_part); if (!disk) goto out_free_queue; diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 63c2064689f8..db9d6bb6352d 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1951,7 +1951,6 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk, if (cciss_create_ld_sysfs_entry(h, drv_index)) goto cleanup_queue; disk->private_data = h->drv[drv_index]; - disk->driverfs_dev = &h->drv[drv_index]->dev; /* Set up queue information */ blk_queue_bounce_limit(disk->queue, h->pdev->dma_mask); @@ -1973,7 +1972,7 @@ static int cciss_add_disk(ctlr_info_t *h, struct gendisk *disk, /* allows the interrupt handler to start the queue */ wmb(); h->drv[drv_index]->queue = disk->queue; - add_disk(disk); + device_add_disk(&h->drv[drv_index]->dev, disk); return 0; cleanup_queue: diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index 10459a145062..0a1aaf8c24c4 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -137,19 +137,19 @@ void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_b static int _drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev, - sector_t sector, int rw) + sector_t sector, int op) { struct bio *bio; /* we do all our meta data IO in aligned 4k blocks. */ const int size = 4096; - int err; + int err, op_flags = 0; device->md_io.done = 0; device->md_io.error = -ENODEV; - if ((rw & WRITE) && !test_bit(MD_NO_FUA, &device->flags)) - rw |= REQ_FUA | REQ_FLUSH; - rw |= REQ_SYNC | REQ_NOIDLE; + if ((op == REQ_OP_WRITE) && !test_bit(MD_NO_FUA, &device->flags)) + op_flags |= REQ_FUA | REQ_PREFLUSH; + op_flags |= REQ_SYNC | REQ_NOIDLE; bio = bio_alloc_drbd(GFP_NOIO); bio->bi_bdev = bdev->md_bdev; @@ -159,9 +159,9 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, goto out; bio->bi_private = device; bio->bi_end_io = drbd_md_endio; - bio->bi_rw = rw; + bio_set_op_attrs(bio, op, op_flags); - if (!(rw & WRITE) && device->state.disk == D_DISKLESS && device->ldev == NULL) + if (op != REQ_OP_WRITE && device->state.disk == D_DISKLESS && device->ldev == NULL) /* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */ ; else if (!get_ldev_if_state(device, D_ATTACHING)) { @@ -174,10 +174,10 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, bio_get(bio); /* one bio_put() is in the completion handler */ atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */ device->md_io.submit_jif = jiffies; - if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) + if (drbd_insert_fault(device, (op == REQ_OP_WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) bio_io_error(bio); else - submit_bio(rw, bio); + submit_bio(bio); wait_until_done_or_force_detached(device, bdev, &device->md_io.done); if (!bio->bi_error) err = device->md_io.error; @@ -188,7 +188,7 @@ static int _drbd_md_sync_page_io(struct drbd_device *device, } int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev, - sector_t sector, int rw) + sector_t sector, int op) { int err; D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1); @@ -197,19 +197,21 @@ int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bd dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n", current->comm, current->pid, __func__, - (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", + (unsigned long long)sector, (op == REQ_OP_WRITE) ? "WRITE" : "READ", (void*)_RET_IP_ ); if (sector < drbd_md_first_sector(bdev) || sector + 7 > drbd_md_last_sector(bdev)) drbd_alert(device, "%s [%d]:%s(,%llus,%s) out of range md access!\n", current->comm, current->pid, __func__, - (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ"); + (unsigned long long)sector, + (op == REQ_OP_WRITE) ? "WRITE" : "READ"); - err = _drbd_md_sync_page_io(device, bdev, sector, rw); + err = _drbd_md_sync_page_io(device, bdev, sector, op); if (err) { drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n", - (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err); + (unsigned long long)sector, + (op == REQ_OP_WRITE) ? "WRITE" : "READ", err); } return err; } @@ -256,7 +258,7 @@ bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval unsigned first = i->sector >> (AL_EXTENT_SHIFT-9); unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9); - D_ASSERT(device, (unsigned)(last - first) <= 1); + D_ASSERT(device, first <= last); D_ASSERT(device, atomic_read(&device->local_cnt) > 0); /* FIXME figure out a fast path for bios crossing AL extent boundaries */ @@ -339,6 +341,8 @@ static int __al_write_transaction(struct drbd_device *device, struct al_transact i = 0; + drbd_bm_reset_al_hints(device); + /* Even though no one can start to change this list * once we set the LC_LOCKED -- from drbd_al_begin_io(), * lc_try_lock_for_transaction() --, someone may still @@ -768,10 +772,18 @@ static bool lazy_bitmap_update_due(struct drbd_device *device) static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done) { - if (rs_done) - set_bit(RS_DONE, &device->flags); - /* and also set RS_PROGRESS below */ - else if (!lazy_bitmap_update_due(device)) + if (rs_done) { + struct drbd_connection *connection = first_peer_device(device)->connection; + if (connection->agreed_pro_version <= 95 || + is_sync_target_state(device->state.conn)) + set_bit(RS_DONE, &device->flags); + /* and also set RS_PROGRESS below */ + + /* Else: rather wait for explicit notification via receive_state, + * to avoid uuids-rotated-too-fast causing full resync + * in next handshake, in case the replication link breaks + * at the most unfortunate time... */ + } else if (!lazy_bitmap_update_due(device)) return; drbd_device_post_work(device, RS_PROGRESS); @@ -830,6 +842,13 @@ static int update_sync_bits(struct drbd_device *device, return count; } +static bool plausible_request_size(int size) +{ + return size > 0 + && size <= DRBD_MAX_BATCH_BIO_SIZE + && IS_ALIGNED(size, 512); +} + /* clear the bit corresponding to the piece of storage in question: * size byte of data starting from sector. Only clear a bits of the affected * one ore more _aligned_ BM_BLOCK_SIZE blocks. @@ -845,11 +864,11 @@ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, unsigned long count = 0; sector_t esector, nr_sectors; - /* This would be an empty REQ_FLUSH, be silent. */ + /* This would be an empty REQ_PREFLUSH, be silent. */ if ((mode == SET_OUT_OF_SYNC) && size == 0) return 0; - if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_DISCARD_SIZE) { + if (!plausible_request_size(size)) { drbd_err(device, "%s: sector=%llus size=%d nonsense!\n", drbd_change_sync_fname[mode], (unsigned long long)sector, size); diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 92d6fc020a65..ab62b81c2ca7 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -96,6 +96,13 @@ struct drbd_bitmap { struct page **bm_pages; spinlock_t bm_lock; + /* exclusively to be used by __al_write_transaction(), + * drbd_bm_mark_for_writeout() and + * and drbd_bm_write_hinted() -> bm_rw() called from there. + */ + unsigned int n_bitmap_hints; + unsigned int al_bitmap_hints[AL_UPDATES_PER_TRANSACTION]; + /* see LIMITATIONS: above */ unsigned long bm_set; /* nr of set bits; THINK maybe atomic_t? */ @@ -242,6 +249,11 @@ static void bm_set_page_need_writeout(struct page *page) set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page)); } +void drbd_bm_reset_al_hints(struct drbd_device *device) +{ + device->bitmap->n_bitmap_hints = 0; +} + /** * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout * @device: DRBD device. @@ -253,6 +265,7 @@ static void bm_set_page_need_writeout(struct page *page) */ void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr) { + struct drbd_bitmap *b = device->bitmap; struct page *page; if (page_nr >= device->bitmap->bm_number_of_pages) { drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n", @@ -260,7 +273,9 @@ void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr) return; } page = device->bitmap->bm_pages[page_nr]; - set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)); + BUG_ON(b->n_bitmap_hints >= ARRAY_SIZE(b->al_bitmap_hints)); + if (!test_and_set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page))) + b->al_bitmap_hints[b->n_bitmap_hints++] = page_nr; } static int bm_test_page_unchanged(struct page *page) @@ -427,8 +442,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) } /* - * called on driver init only. TODO call when a device is created. - * allocates the drbd_bitmap, and stores it in device->bitmap. + * allocates the drbd_bitmap and stores it in device->bitmap. */ int drbd_bm_init(struct drbd_device *device) { @@ -633,7 +647,8 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi unsigned long bits, words, owords, obits; unsigned long want, have, onpages; /* number of pages */ struct page **npages, **opages = NULL; - int err = 0, growing; + int err = 0; + bool growing; if (!expect(b)) return -ENOMEM; @@ -980,7 +995,7 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho struct drbd_bitmap *b = device->bitmap; struct page *page; unsigned int len; - unsigned int rw = (ctx->flags & BM_AIO_READ) ? READ : WRITE; + unsigned int op = (ctx->flags & BM_AIO_READ) ? REQ_OP_READ : REQ_OP_WRITE; sector_t on_disk_sector = device->ldev->md.md_offset + device->ldev->md.bm_offset; @@ -1011,12 +1026,12 @@ static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_ho bio_add_page(bio, page, len, 0); bio->bi_private = ctx; bio->bi_end_io = drbd_bm_endio; + bio_set_op_attrs(bio, op, 0); - if (drbd_insert_fault(device, (rw & WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { - bio->bi_rw |= rw; + if (drbd_insert_fault(device, (op == REQ_OP_WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) { bio_io_error(bio); } else { - submit_bio(rw, bio); + submit_bio(bio); /* this should not count as user activity and cause the * resync to throttle -- see drbd_rs_should_slow_down(). */ atomic_add(len >> 9, &device->rs_sect_ev); @@ -1030,7 +1045,7 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned { struct drbd_bm_aio_ctx *ctx; struct drbd_bitmap *b = device->bitmap; - int num_pages, i, count = 0; + unsigned int num_pages, i, count = 0; unsigned long now; char ppb[10]; int err = 0; @@ -1078,16 +1093,37 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned now = jiffies; /* let the layers below us try to merge these bios... */ - for (i = 0; i < num_pages; i++) { - /* ignore completely unchanged pages */ - if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) - break; - if (!(flags & BM_AIO_READ)) { - if ((flags & BM_AIO_WRITE_HINTED) && - !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT, - &page_private(b->bm_pages[i]))) - continue; + if (flags & BM_AIO_READ) { + for (i = 0; i < num_pages; i++) { + atomic_inc(&ctx->in_flight); + bm_page_io_async(ctx, i); + ++count; + cond_resched(); + } + } else if (flags & BM_AIO_WRITE_HINTED) { + /* ASSERT: BM_AIO_WRITE_ALL_PAGES is not set. */ + unsigned int hint; + for (hint = 0; hint < b->n_bitmap_hints; hint++) { + i = b->al_bitmap_hints[hint]; + if (i >= num_pages) /* == -1U: no hint here. */ + continue; + /* Several AL-extents may point to the same page. */ + if (!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT, + &page_private(b->bm_pages[i]))) + continue; + /* Has it even changed? */ + if (bm_test_page_unchanged(b->bm_pages[i])) + continue; + atomic_inc(&ctx->in_flight); + bm_page_io_async(ctx, i); + ++count; + } + } else { + for (i = 0; i < num_pages; i++) { + /* ignore completely unchanged pages */ + if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) + break; if (!(flags & BM_AIO_WRITE_ALL_PAGES) && bm_test_page_unchanged(b->bm_pages[i])) { dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i); @@ -1100,11 +1136,11 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i); continue; } + atomic_inc(&ctx->in_flight); + bm_page_io_async(ctx, i); + ++count; + cond_resched(); } - atomic_inc(&ctx->in_flight); - bm_page_io_async(ctx, i); - ++count; - cond_resched(); } /* @@ -1121,10 +1157,14 @@ static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy); /* summary for global bitmap IO */ - if (flags == 0) - drbd_info(device, "bitmap %s of %u pages took %lu jiffies\n", - (flags & BM_AIO_READ) ? "READ" : "WRITE", - count, jiffies - now); + if (flags == 0) { + unsigned int ms = jiffies_to_msecs(jiffies - now); + if (ms > 5) { + drbd_info(device, "bitmap %s of %u pages took %u ms\n", + (flags & BM_AIO_READ) ? "READ" : "WRITE", + count, ms); + } + } if (ctx->error) { drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n"); diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index 4de95bbff486..be91a8d7c22a 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -237,14 +237,9 @@ static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_re seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C"); seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync"); - if (f & EE_IS_TRIM) { - seq_putc(m, sep); - sep = '|'; - if (f & EE_IS_TRIM_USE_ZEROOUT) - seq_puts(m, "zero-out"); - else - seq_puts(m, "trim"); - } + if (f & EE_IS_TRIM) + __seq_print_rq_state_bit(m, f & EE_IS_TRIM_USE_ZEROOUT, &sep, "zero-out", "trim"); + seq_print_rq_state_bit(m, f & EE_WRITE_SAME, &sep, "write-same"); seq_putc(m, '\n'); } @@ -908,7 +903,7 @@ static int drbd_version_open(struct inode *inode, struct file *file) return single_open(file, drbd_version_show, NULL); } -static struct file_operations drbd_version_fops = { +static const struct file_operations drbd_version_fops = { .owner = THIS_MODULE, .open = drbd_version_open, .llseek = seq_lseek, diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 7a1cf7eaa71d..7b54354976a5 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -468,9 +468,15 @@ enum { /* this is/was a write request */ __EE_WRITE, + /* this is/was a write same request */ + __EE_WRITE_SAME, + /* this originates from application on peer * (not some resync or verify or other DRBD internal request) */ __EE_APPLICATION, + + /* If it contains only 0 bytes, send back P_RS_DEALLOCATED */ + __EE_RS_THIN_REQ, }; #define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO) #define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC) @@ -484,7 +490,9 @@ enum { #define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE) #define EE_SUBMITTED (1<<__EE_SUBMITTED) #define EE_WRITE (1<<__EE_WRITE) +#define EE_WRITE_SAME (1<<__EE_WRITE_SAME) #define EE_APPLICATION (1<<__EE_APPLICATION) +#define EE_RS_THIN_REQ (1<<__EE_RS_THIN_REQ) /* flag bits per device */ enum { @@ -1123,6 +1131,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int extern int drbd_send_bitmap(struct drbd_device *device); extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); +extern int drbd_send_rs_deallocated(struct drbd_peer_device *, struct drbd_peer_request *); extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev); extern void drbd_device_cleanup(struct drbd_device *device); void drbd_print_uuids(struct drbd_device *device, const char *text); @@ -1327,14 +1336,14 @@ struct bm_extent { #endif #endif -/* BIO_MAX_SIZE is 256 * PAGE_SIZE, +/* Estimate max bio size as 256 * PAGE_SIZE, * so for typical PAGE_SIZE of 4k, that is (1<<20) Byte. * Since we may live in a mixed-platform cluster, * we limit us to a platform agnostic constant here for now. * A followup commit may allow even bigger BIO sizes, * once we thought that through. */ #define DRBD_MAX_BIO_SIZE (1U << 20) -#if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE +#if DRBD_MAX_BIO_SIZE > (BIO_MAX_PAGES << PAGE_SHIFT) #error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE #endif #define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ @@ -1342,11 +1351,11 @@ struct bm_extent { #define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */ #define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */ -/* For now, don't allow more than one activity log extent worth of data - * to be discarded in one go. We may need to rework drbd_al_begin_io() - * to allow for even larger discard ranges */ -#define DRBD_MAX_DISCARD_SIZE AL_EXTENT_SIZE -#define DRBD_MAX_DISCARD_SECTORS (DRBD_MAX_DISCARD_SIZE >> 9) +/* For now, don't allow more than half of what we can "activate" in one + * activity log transaction to be discarded in one go. We may need to rework + * drbd_al_begin_io() to allow for even larger discard ranges */ +#define DRBD_MAX_BATCH_BIO_SIZE (AL_UPDATES_PER_TRANSACTION/2*AL_EXTENT_SIZE) +#define DRBD_MAX_BBIO_SECTORS (DRBD_MAX_BATCH_BIO_SIZE >> 9) extern int drbd_bm_init(struct drbd_device *device); extern int drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits); @@ -1369,6 +1378,7 @@ extern int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr); extern int drbd_bm_read(struct drbd_device *device) __must_hold(local); extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr); extern int drbd_bm_write(struct drbd_device *device) __must_hold(local); +extern void drbd_bm_reset_al_hints(struct drbd_device *device) __must_hold(local); extern int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local); extern int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local); extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local); @@ -1483,12 +1493,14 @@ enum determine_dev_size { extern enum determine_dev_size drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local); extern void resync_after_online_grow(struct drbd_device *); -extern void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev); +extern void drbd_reconsider_queue_parameters(struct drbd_device *device, + struct drbd_backing_dev *bdev, struct o_qlim *o); extern enum drbd_state_rv drbd_set_role(struct drbd_device *device, enum drbd_role new_role, int force); extern bool conn_try_outdate_peer(struct drbd_connection *connection); extern void conn_try_outdate_peer_async(struct drbd_connection *connection); +extern enum drbd_peer_state conn_khelper(struct drbd_connection *connection, char *cmd); extern int drbd_khelper(struct drbd_device *device, char *cmd); /* drbd_worker.c */ @@ -1507,7 +1519,7 @@ extern int drbd_resync_finished(struct drbd_device *device); extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent); extern void drbd_md_put_buffer(struct drbd_device *device); extern int drbd_md_sync_page_io(struct drbd_device *device, - struct drbd_backing_dev *bdev, sector_t sector, int rw); + struct drbd_backing_dev *bdev, sector_t sector, int op); extern void drbd_ov_out_of_sync_found(struct drbd_device *, sector_t, int); extern void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev, unsigned int *done); @@ -1548,6 +1560,8 @@ extern void start_resync_timer_fn(unsigned long data); extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); /* drbd_receiver.c */ +extern int drbd_issue_discard_or_zero_out(struct drbd_device *device, + sector_t start, unsigned int nr_sectors, bool discard); extern int drbd_receiver(struct drbd_thread *thi); extern int drbd_ack_receiver(struct drbd_thread *thi); extern void drbd_send_ping_wf(struct work_struct *ws); @@ -1557,11 +1571,11 @@ extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector bool throttle_if_app_is_waiting); extern int drbd_submit_peer_request(struct drbd_device *, struct drbd_peer_request *, const unsigned, - const int); + const unsigned, const int); extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *); extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64, sector_t, unsigned int, - bool, + unsigned int, gfp_t) __must_hold(local); extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *, int); @@ -1635,8 +1649,6 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin /* drbd_proc.c */ extern struct proc_dir_entry *drbd_proc; extern const struct file_operations drbd_proc_fops; -extern const char *drbd_conn_str(enum drbd_conns s); -extern const char *drbd_role_str(enum drbd_role s); /* drbd_actlog.c */ extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i); @@ -2095,13 +2107,22 @@ static inline void _sub_unacked(struct drbd_device *device, int n, const char *f ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line); } +static inline bool is_sync_target_state(enum drbd_conns connection_state) +{ + return connection_state == C_SYNC_TARGET || + connection_state == C_PAUSED_SYNC_T; +} + +static inline bool is_sync_source_state(enum drbd_conns connection_state) +{ + return connection_state == C_SYNC_SOURCE || + connection_state == C_PAUSED_SYNC_S; +} + static inline bool is_sync_state(enum drbd_conns connection_state) { - return - (connection_state == C_SYNC_SOURCE - || connection_state == C_SYNC_TARGET - || connection_state == C_PAUSED_SYNC_S - || connection_state == C_PAUSED_SYNC_T); + return is_sync_source_state(connection_state) || + is_sync_target_state(connection_state); } /** diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h index f210543f05f4..23c5a94428d2 100644 --- a/drivers/block/drbd/drbd_interval.h +++ b/drivers/block/drbd/drbd_interval.h @@ -6,13 +6,13 @@ struct drbd_interval { struct rb_node rb; - sector_t sector; /* start sector of the interval */ - unsigned int size; /* size in bytes */ - sector_t end; /* highest interval end in subtree */ - int local:1 /* local or remote request? */; - int waiting:1; /* someone is waiting for this to complete */ - int completed:1; /* this has been completed already; - * ignore for conflict detection */ + sector_t sector; /* start sector of the interval */ + unsigned int size; /* size in bytes */ + sector_t end; /* highest interval end in subtree */ + unsigned int local:1 /* local or remote request? */; + unsigned int waiting:1; /* someone is waiting for completion */ + unsigned int completed:1; /* this has been completed already; + * ignore for conflict detection */ }; static inline void drbd_clear_interval(struct drbd_interval *i) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 2ba1494b2799..0501ae0c517b 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -31,7 +31,7 @@ #include <linux/module.h> #include <linux/jiffies.h> #include <linux/drbd.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <asm/types.h> #include <net/sock.h> #include <linux/ctype.h> @@ -920,6 +920,31 @@ void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device) } } +/* communicated if (agreed_features & DRBD_FF_WSAME) */ +void assign_p_sizes_qlim(struct drbd_device *device, struct p_sizes *p, struct request_queue *q) +{ + if (q) { + p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q)); + p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q)); + p->qlim->alignment_offset = cpu_to_be32(queue_alignment_offset(q)); + p->qlim->io_min = cpu_to_be32(queue_io_min(q)); + p->qlim->io_opt = cpu_to_be32(queue_io_opt(q)); + p->qlim->discard_enabled = blk_queue_discard(q); + p->qlim->discard_zeroes_data = queue_discard_zeroes_data(q); + p->qlim->write_same_capable = !!q->limits.max_write_same_sectors; + } else { + q = device->rq_queue; + p->qlim->physical_block_size = cpu_to_be32(queue_physical_block_size(q)); + p->qlim->logical_block_size = cpu_to_be32(queue_logical_block_size(q)); + p->qlim->alignment_offset = 0; + p->qlim->io_min = cpu_to_be32(queue_io_min(q)); + p->qlim->io_opt = cpu_to_be32(queue_io_opt(q)); + p->qlim->discard_enabled = 0; + p->qlim->discard_zeroes_data = 0; + p->qlim->write_same_capable = 0; + } +} + int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags) { struct drbd_device *device = peer_device->device; @@ -928,29 +953,37 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu sector_t d_size, u_size; int q_order_type; unsigned int max_bio_size; + unsigned int packet_size; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + packet_size = sizeof(*p); + if (peer_device->connection->agreed_features & DRBD_FF_WSAME) + packet_size += sizeof(p->qlim[0]); + + memset(p, 0, packet_size); if (get_ldev_if_state(device, D_NEGOTIATING)) { - D_ASSERT(device, device->ldev->backing_bdev); + struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); d_size = drbd_get_max_capacity(device->ldev); rcu_read_lock(); u_size = rcu_dereference(device->ldev->disk_conf)->disk_size; rcu_read_unlock(); q_order_type = drbd_queue_order_type(device); - max_bio_size = queue_max_hw_sectors(device->ldev->backing_bdev->bd_disk->queue) << 9; + max_bio_size = queue_max_hw_sectors(q) << 9; max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); + assign_p_sizes_qlim(device, p, q); put_ldev(device); } else { d_size = 0; u_size = 0; q_order_type = QUEUE_ORDERED_NONE; max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ + assign_p_sizes_qlim(device, p, NULL); } - sock = &peer_device->connection->data; - p = drbd_prepare_command(peer_device, sock); - if (!p) - return -EIO; - if (peer_device->connection->agreed_pro_version <= 94) max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); else if (peer_device->connection->agreed_pro_version < 100) @@ -962,7 +995,8 @@ int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enu p->max_bio_size = cpu_to_be32(max_bio_size); p->queue_order_type = cpu_to_be16(q_order_type); p->dds_flags = cpu_to_be16(flags); - return drbd_send_command(peer_device, sock, P_SIZES, sizeof(*p), NULL, 0); + + return drbd_send_command(peer_device, sock, P_SIZES, packet_size, NULL, 0); } /** @@ -1377,6 +1411,22 @@ int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd, cpu_to_be64(block_id)); } +int drbd_send_rs_deallocated(struct drbd_peer_device *peer_device, + struct drbd_peer_request *peer_req) +{ + struct drbd_socket *sock; + struct p_block_desc *p; + + sock = &peer_device->connection->data; + p = drbd_prepare_command(peer_device, sock); + if (!p) + return -EIO; + p->sector = cpu_to_be64(peer_req->i.sector); + p->blksize = cpu_to_be32(peer_req->i.size); + p->pad = 0; + return drbd_send_command(peer_device, sock, P_RS_DEALLOCATED, sizeof(*p), NULL, 0); +} + int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd, sector_t sector, int size, u64 block_id) { @@ -1561,6 +1611,9 @@ static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio) ? 0 : MSG_MORE); if (err) return err; + /* REQ_OP_WRITE_SAME has only one segment */ + if (bio_op(bio) == REQ_OP_WRITE_SAME) + break; } return 0; } @@ -1579,6 +1632,9 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b bio_iter_last(bvec, iter) ? 0 : MSG_MORE); if (err) return err; + /* REQ_OP_WRITE_SAME has only one segment */ + if (bio_op(bio) == REQ_OP_WRITE_SAME) + break; } return 0; } @@ -1603,15 +1659,17 @@ static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device, return 0; } -static u32 bio_flags_to_wire(struct drbd_connection *connection, unsigned long bi_rw) +static u32 bio_flags_to_wire(struct drbd_connection *connection, + struct bio *bio) { if (connection->agreed_pro_version >= 95) - return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | - (bi_rw & REQ_FUA ? DP_FUA : 0) | - (bi_rw & REQ_FLUSH ? DP_FLUSH : 0) | - (bi_rw & REQ_DISCARD ? DP_DISCARD : 0); + return (bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) | + (bio->bi_rw & REQ_FUA ? DP_FUA : 0) | + (bio->bi_rw & REQ_PREFLUSH ? DP_FLUSH : 0) | + (bio_op(bio) == REQ_OP_WRITE_SAME ? DP_WSAME : 0) | + (bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0); else - return bi_rw & REQ_SYNC ? DP_RW_SYNC : 0; + return bio->bi_rw & REQ_SYNC ? DP_RW_SYNC : 0; } /* Used to send write or TRIM aka REQ_DISCARD requests @@ -1622,6 +1680,8 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * struct drbd_device *device = peer_device->device; struct drbd_socket *sock; struct p_data *p; + struct p_wsame *wsame = NULL; + void *digest_out; unsigned int dp_flags = 0; int digest_size; int err; @@ -1636,7 +1696,7 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * p->sector = cpu_to_be64(req->i.sector); p->block_id = (unsigned long)req; p->seq_num = cpu_to_be32(atomic_inc_return(&device->packet_seq)); - dp_flags = bio_flags_to_wire(peer_device->connection, req->master_bio->bi_rw); + dp_flags = bio_flags_to_wire(peer_device->connection, req->master_bio); if (device->state.conn >= C_SYNC_SOURCE && device->state.conn <= C_PAUSED_SYNC_T) dp_flags |= DP_MAY_SET_IN_SYNC; @@ -1657,12 +1717,29 @@ int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request * err = __send_command(peer_device->connection, device->vnr, sock, P_TRIM, sizeof(*t), NULL, 0); goto out; } + if (dp_flags & DP_WSAME) { + /* this will only work if DRBD_FF_WSAME is set AND the + * handshake agreed that all nodes and backend devices are + * WRITE_SAME capable and agree on logical_block_size */ + wsame = (struct p_wsame*)p; + digest_out = wsame + 1; + wsame->size = cpu_to_be32(req->i.size); + } else + digest_out = p + 1; /* our digest is still only over the payload. * TRIM does not carry any payload. */ if (digest_size) - drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, p + 1); - err = __send_command(peer_device->connection, device->vnr, sock, P_DATA, sizeof(*p) + digest_size, NULL, req->i.size); + drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest_out); + if (wsame) { + err = + __send_command(peer_device->connection, device->vnr, sock, P_WSAME, + sizeof(*wsame) + digest_size, NULL, + bio_iovec(req->master_bio).bv_len); + } else + err = + __send_command(peer_device->connection, device->vnr, sock, P_DATA, + sizeof(*p) + digest_size, NULL, req->i.size); if (!err) { /* For protocol A, we have to memcpy the payload into * socket buffers, as we may complete right away @@ -3061,7 +3138,7 @@ void drbd_md_write(struct drbd_device *device, void *b) D_ASSERT(device, drbd_md_ss(device->ldev) == device->ldev->md.md_offset); sector = device->ldev->md.md_offset; - if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { + if (drbd_md_sync_page_io(device, device->ldev, sector, REQ_OP_WRITE)) { /* this was a try anyways ... */ drbd_err(device, "meta data update failed!\n"); drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); @@ -3263,7 +3340,8 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev) * Affects the paranoia out-of-range access check in drbd_md_sync_page_io(). */ bdev->md.md_size_sect = 8; - if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) { + if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, + REQ_OP_READ)) { /* NOTE: can't do normal error processing here as this is called BEFORE disk is attached */ drbd_err(device, "Error while reading metadata.\n"); @@ -3505,7 +3583,12 @@ static int w_bitmap_io(struct drbd_work *w, int unused) struct bm_io_work *work = &device->bm_io_work; int rv = -EIO; - D_ASSERT(device, atomic_read(&device->ap_bio_cnt) == 0); + if (work->flags != BM_LOCKED_CHANGE_ALLOWED) { + int cnt = atomic_read(&device->ap_bio_cnt); + if (cnt) + drbd_err(device, "FIXME: ap_bio_cnt %d, expected 0; queued for '%s'\n", + cnt, work->why); + } if (get_ldev(device)) { drbd_bm_lock(device, work->why, work->flags); @@ -3585,18 +3668,20 @@ void drbd_queue_bitmap_io(struct drbd_device *device, int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *), char *why, enum bm_flag flags) { + /* Only suspend io, if some operation is supposed to be locked out */ + const bool do_suspend_io = flags & (BM_DONT_CLEAR|BM_DONT_SET|BM_DONT_TEST); int rv; D_ASSERT(device, current != first_peer_device(device)->connection->worker.task); - if ((flags & BM_LOCKED_SET_ALLOWED) == 0) + if (do_suspend_io) drbd_suspend_io(device); drbd_bm_lock(device, why, flags); rv = io_fn(device); drbd_bm_unlock(device); - if ((flags & BM_LOCKED_SET_ALLOWED) == 0) + if (do_suspend_io) drbd_resume_io(device); return rv; @@ -3635,6 +3720,8 @@ const char *cmdname(enum drbd_packet cmd) * one PRO_VERSION */ static const char *cmdnames[] = { [P_DATA] = "Data", + [P_WSAME] = "WriteSame", + [P_TRIM] = "Trim", [P_DATA_REPLY] = "DataReply", [P_RS_DATA_REPLY] = "RSDataReply", [P_BARRIER] = "Barrier", @@ -3679,6 +3766,8 @@ const char *cmdname(enum drbd_packet cmd) [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply", [P_RETRY_WRITE] = "retry_write", [P_PROTOCOL_UPDATE] = "protocol_update", + [P_RS_THIN_REQ] = "rs_thin_req", + [P_RS_DEALLOCATED] = "rs_deallocated", /* enum drbd_packet, but not commands - obsoleted flags: * P_MAY_IGNORE diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 0bac9c8246bc..f35db29cac76 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -343,7 +343,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) (char[20]) { }, /* address family */ (char[60]) { }, /* address */ NULL }; - char mb[12]; + char mb[14]; char *argv[] = {usermode_helper, cmd, mb, NULL }; struct drbd_connection *connection = first_peer_device(device)->connection; struct sib_info sib; @@ -352,7 +352,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) if (current == connection->worker.task) set_bit(CALLBACK_PENDING, &connection->flags); - snprintf(mb, 12, "minor-%d", device_to_minor(device)); + snprintf(mb, 14, "minor-%d", device_to_minor(device)); setup_khelper_env(connection, envp); /* The helper may take some time. @@ -387,7 +387,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) return ret; } -static int conn_khelper(struct drbd_connection *connection, char *cmd) +enum drbd_peer_state conn_khelper(struct drbd_connection *connection, char *cmd) { char *envp[] = { "HOME=/", "TERM=linux", @@ -442,19 +442,17 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connec } rcu_read_unlock(); - if (fp == FP_NOT_AVAIL) { - /* IO Suspending works on the whole resource. - Do it only for one device. */ - vnr = 0; - peer_device = idr_get_next(&connection->peer_devices, &vnr); - drbd_change_state(peer_device->device, CS_VERBOSE | CS_HARD, NS(susp_fen, 0)); - } - return fp; } +static bool resource_is_supended(struct drbd_resource *resource) +{ + return resource->susp || resource->susp_fen || resource->susp_nod; +} + bool conn_try_outdate_peer(struct drbd_connection *connection) { + struct drbd_resource * const resource = connection->resource; unsigned int connect_cnt; union drbd_state mask = { }; union drbd_state val = { }; @@ -462,21 +460,41 @@ bool conn_try_outdate_peer(struct drbd_connection *connection) char *ex_to_string; int r; - spin_lock_irq(&connection->resource->req_lock); + spin_lock_irq(&resource->req_lock); if (connection->cstate >= C_WF_REPORT_PARAMS) { drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n"); - spin_unlock_irq(&connection->resource->req_lock); + spin_unlock_irq(&resource->req_lock); return false; } connect_cnt = connection->connect_cnt; - spin_unlock_irq(&connection->resource->req_lock); + spin_unlock_irq(&resource->req_lock); fp = highest_fencing_policy(connection); switch (fp) { case FP_NOT_AVAIL: drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n"); - goto out; + spin_lock_irq(&resource->req_lock); + if (connection->cstate < C_WF_REPORT_PARAMS) { + _conn_request_state(connection, + (union drbd_state) { { .susp_fen = 1 } }, + (union drbd_state) { { .susp_fen = 0 } }, + CS_VERBOSE | CS_HARD | CS_DC_SUSP); + /* We are no longer suspended due to the fencing policy. + * We may still be suspended due to the on-no-data-accessible policy. + * If that was OND_IO_ERROR, fail pending requests. */ + if (!resource_is_supended(resource)) + _tl_restart(connection, CONNECTION_LOST_WHILE_PENDING); + } + /* Else: in case we raced with a connection handshake, + * let the handshake figure out if we maybe can RESEND, + * and do not resume/fail pending requests here. + * Worst case is we stay suspended for now, which may be + * resolved by either re-establishing the replication link, or + * the next link failure, or eventually the administrator. */ + spin_unlock_irq(&resource->req_lock); + return false; + case FP_DONT_CARE: return true; default: ; @@ -485,17 +503,17 @@ bool conn_try_outdate_peer(struct drbd_connection *connection) r = conn_khelper(connection, "fence-peer"); switch ((r>>8) & 0xff) { - case 3: /* peer is inconsistent */ + case P_INCONSISTENT: /* peer is inconsistent */ ex_to_string = "peer is inconsistent or worse"; mask.pdsk = D_MASK; val.pdsk = D_INCONSISTENT; break; - case 4: /* peer got outdated, or was already outdated */ + case P_OUTDATED: /* peer got outdated, or was already outdated */ ex_to_string = "peer was fenced"; mask.pdsk = D_MASK; val.pdsk = D_OUTDATED; break; - case 5: /* peer was down */ + case P_DOWN: /* peer was down */ if (conn_highest_disk(connection) == D_UP_TO_DATE) { /* we will(have) create(d) a new UUID anyways... */ ex_to_string = "peer is unreachable, assumed to be dead"; @@ -505,7 +523,7 @@ bool conn_try_outdate_peer(struct drbd_connection *connection) ex_to_string = "peer unreachable, doing nothing since disk != UpToDate"; } break; - case 6: /* Peer is primary, voluntarily outdate myself. + case P_PRIMARY: /* Peer is primary, voluntarily outdate myself. * This is useful when an unconnected R_SECONDARY is asked to * become R_PRIMARY, but finds the other peer being active. */ ex_to_string = "peer is active"; @@ -513,7 +531,9 @@ bool conn_try_outdate_peer(struct drbd_connection *connection) mask.disk = D_MASK; val.disk = D_OUTDATED; break; - case 7: + case P_FENCING: + /* THINK: do we need to handle this + * like case 4, or more like case 5? */ if (fp != FP_STONITH) drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n"); ex_to_string = "peer was stonithed"; @@ -529,13 +549,11 @@ bool conn_try_outdate_peer(struct drbd_connection *connection) drbd_info(connection, "fence-peer helper returned %d (%s)\n", (r>>8) & 0xff, ex_to_string); - out: - /* Not using conn_request_state(connection, mask, val, CS_VERBOSE); here, because we might were able to re-establish the connection in the meantime. */ - spin_lock_irq(&connection->resource->req_lock); + spin_lock_irq(&resource->req_lock); if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) { if (connection->connect_cnt != connect_cnt) /* In case the connection was established and droped @@ -544,7 +562,7 @@ bool conn_try_outdate_peer(struct drbd_connection *connection) else _conn_request_state(connection, mask, val, CS_VERBOSE); } - spin_unlock_irq(&connection->resource->req_lock); + spin_unlock_irq(&resource->req_lock); return conn_highest_pdsk(connection) <= D_OUTDATED; } @@ -1154,51 +1172,160 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc) return 0; } +static void blk_queue_discard_granularity(struct request_queue *q, unsigned int granularity) +{ + q->limits.discard_granularity = granularity; +} + +static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection) +{ + /* when we introduced REQ_WRITE_SAME support, we also bumped + * our maximum supported batch bio size used for discards. */ + if (connection->agreed_features & DRBD_FF_WSAME) + return DRBD_MAX_BBIO_SECTORS; + /* before, with DRBD <= 8.4.6, we only allowed up to one AL_EXTENT_SIZE. */ + return AL_EXTENT_SIZE >> 9; +} + +static void decide_on_discard_support(struct drbd_device *device, + struct request_queue *q, + struct request_queue *b, + bool discard_zeroes_if_aligned) +{ + /* q = drbd device queue (device->rq_queue) + * b = backing device queue (device->ldev->backing_bdev->bd_disk->queue), + * or NULL if diskless + */ + struct drbd_connection *connection = first_peer_device(device)->connection; + bool can_do = b ? blk_queue_discard(b) : true; + + if (can_do && b && !b->limits.discard_zeroes_data && !discard_zeroes_if_aligned) { + can_do = false; + drbd_info(device, "discard_zeroes_data=0 and discard_zeroes_if_aligned=no: disabling discards\n"); + } + if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_TRIM)) { + can_do = false; + drbd_info(connection, "peer DRBD too old, does not support TRIM: disabling discards\n"); + } + if (can_do) { + /* We don't care for the granularity, really. + * Stacking limits below should fix it for the local + * device. Whether or not it is a suitable granularity + * on the remote device is not our problem, really. If + * you care, you need to use devices with similar + * topology on all peers. */ + blk_queue_discard_granularity(q, 512); + q->limits.max_discard_sectors = drbd_max_discard_sectors(connection); + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); + } else { + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); + blk_queue_discard_granularity(q, 0); + q->limits.max_discard_sectors = 0; + } +} + +static void fixup_discard_if_not_supported(struct request_queue *q) +{ + /* To avoid confusion, if this queue does not support discard, clear + * max_discard_sectors, which is what lsblk -D reports to the user. + * Older kernels got this wrong in "stack limits". + * */ + if (!blk_queue_discard(q)) { + blk_queue_max_discard_sectors(q, 0); + blk_queue_discard_granularity(q, 0); + } +} + +static void decide_on_write_same_support(struct drbd_device *device, + struct request_queue *q, + struct request_queue *b, struct o_qlim *o) +{ + struct drbd_peer_device *peer_device = first_peer_device(device); + struct drbd_connection *connection = peer_device->connection; + bool can_do = b ? b->limits.max_write_same_sectors : true; + + if (can_do && connection->cstate >= C_CONNECTED && !(connection->agreed_features & DRBD_FF_WSAME)) { + can_do = false; + drbd_info(peer_device, "peer does not support WRITE_SAME\n"); + } + + if (o) { + /* logical block size; queue_logical_block_size(NULL) is 512 */ + unsigned int peer_lbs = be32_to_cpu(o->logical_block_size); + unsigned int me_lbs_b = queue_logical_block_size(b); + unsigned int me_lbs = queue_logical_block_size(q); + + if (me_lbs_b != me_lbs) { + drbd_warn(device, + "logical block size of local backend does not match (drbd:%u, backend:%u); was this a late attach?\n", + me_lbs, me_lbs_b); + /* rather disable write same than trigger some BUG_ON later in the scsi layer. */ + can_do = false; + } + if (me_lbs_b != peer_lbs) { + drbd_warn(peer_device, "logical block sizes do not match (me:%u, peer:%u); this may cause problems.\n", + me_lbs, peer_lbs); + if (can_do) { + drbd_dbg(peer_device, "logical block size mismatch: WRITE_SAME disabled.\n"); + can_do = false; + } + me_lbs = max(me_lbs, me_lbs_b); + /* We cannot change the logical block size of an in-use queue. + * We can only hope that access happens to be properly aligned. + * If not, the peer will likely produce an IO error, and detach. */ + if (peer_lbs > me_lbs) { + if (device->state.role != R_PRIMARY) { + blk_queue_logical_block_size(q, peer_lbs); + drbd_warn(peer_device, "logical block size set to %u\n", peer_lbs); + } else { + drbd_warn(peer_device, + "current Primary must NOT adjust logical block size (%u -> %u); hope for the best.\n", + me_lbs, peer_lbs); + } + } + } + if (can_do && !o->write_same_capable) { + /* If we introduce an open-coded write-same loop on the receiving side, + * the peer would present itself as "capable". */ + drbd_dbg(peer_device, "WRITE_SAME disabled (peer device not capable)\n"); + can_do = false; + } + } + + blk_queue_max_write_same_sectors(q, can_do ? DRBD_MAX_BBIO_SECTORS : 0); +} + static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev, - unsigned int max_bio_size) + unsigned int max_bio_size, struct o_qlim *o) { struct request_queue * const q = device->rq_queue; unsigned int max_hw_sectors = max_bio_size >> 9; unsigned int max_segments = 0; struct request_queue *b = NULL; + struct disk_conf *dc; + bool discard_zeroes_if_aligned = true; if (bdev) { b = bdev->backing_bdev->bd_disk->queue; max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9); rcu_read_lock(); - max_segments = rcu_dereference(device->ldev->disk_conf)->max_bio_bvecs; + dc = rcu_dereference(device->ldev->disk_conf); + max_segments = dc->max_bio_bvecs; + discard_zeroes_if_aligned = dc->discard_zeroes_if_aligned; rcu_read_unlock(); blk_set_stacking_limits(&q->limits); - blk_queue_max_write_same_sectors(q, 0); } - blk_queue_logical_block_size(q, 512); blk_queue_max_hw_sectors(q, max_hw_sectors); /* This is the workaround for "bio would need to, but cannot, be split" */ blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS); blk_queue_segment_boundary(q, PAGE_SIZE-1); + decide_on_discard_support(device, q, b, discard_zeroes_if_aligned); + decide_on_write_same_support(device, q, b, o); if (b) { - struct drbd_connection *connection = first_peer_device(device)->connection; - - blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS); - - if (blk_queue_discard(b) && - (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { - /* We don't care, stacking below should fix it for the local device. - * Whether or not it is a suitable granularity on the remote device - * is not our problem, really. If you care, you need to - * use devices with similar topology on all peers. */ - q->limits.discard_granularity = 512; - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); - } else { - blk_queue_max_discard_sectors(q, 0); - queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); - q->limits.discard_granularity = 0; - } - blk_queue_stack_limits(q, b); if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { @@ -1208,15 +1335,10 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; } } - /* To avoid confusion, if this queue does not support discard, clear - * max_discard_sectors, which is what lsblk -D reports to the user. */ - if (!blk_queue_discard(q)) { - blk_queue_max_discard_sectors(q, 0); - q->limits.discard_granularity = 0; - } + fixup_discard_if_not_supported(q); } -void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) +void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o) { unsigned int now, new, local, peer; @@ -1259,7 +1381,7 @@ void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backin if (new != now) drbd_info(device, "max BIO size = %u\n", new); - drbd_setup_queue_param(device, bdev, new); + drbd_setup_queue_param(device, bdev, new, o); } /* Starts the worker thread */ @@ -1348,6 +1470,43 @@ static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b) a->disk_drain != b->disk_drain; } +static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *disk_conf, + struct drbd_backing_dev *nbc) +{ + struct request_queue * const q = nbc->backing_bdev->bd_disk->queue; + + if (disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) + disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; + if (disk_conf->al_extents > drbd_al_extents_max(nbc)) + disk_conf->al_extents = drbd_al_extents_max(nbc); + + if (!blk_queue_discard(q) + || (!q->limits.discard_zeroes_data && !disk_conf->discard_zeroes_if_aligned)) { + if (disk_conf->rs_discard_granularity) { + disk_conf->rs_discard_granularity = 0; /* disable feature */ + drbd_info(device, "rs_discard_granularity feature disabled\n"); + } + } + + if (disk_conf->rs_discard_granularity) { + int orig_value = disk_conf->rs_discard_granularity; + int remainder; + + if (q->limits.discard_granularity > disk_conf->rs_discard_granularity) + disk_conf->rs_discard_granularity = q->limits.discard_granularity; + + remainder = disk_conf->rs_discard_granularity % q->limits.discard_granularity; + disk_conf->rs_discard_granularity += remainder; + + if (disk_conf->rs_discard_granularity > q->limits.max_discard_sectors << 9) + disk_conf->rs_discard_granularity = q->limits.max_discard_sectors << 9; + + if (disk_conf->rs_discard_granularity != orig_value) + drbd_info(device, "rs_discard_granularity changed to %d\n", + disk_conf->rs_discard_granularity); + } +} + int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -1395,10 +1554,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) if (!expect(new_disk_conf->resync_rate >= 1)) new_disk_conf->resync_rate = 1; - if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) - new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; - if (new_disk_conf->al_extents > drbd_al_extents_max(device->ldev)) - new_disk_conf->al_extents = drbd_al_extents_max(device->ldev); + sanitize_disk_conf(device, new_disk_conf, device->ldev); if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX) new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX; @@ -1457,6 +1613,9 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) if (write_ordering_changed(old_disk_conf, new_disk_conf)) drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH); + if (old_disk_conf->discard_zeroes_if_aligned != new_disk_conf->discard_zeroes_if_aligned) + drbd_reconsider_queue_parameters(device, device->ldev, NULL); + drbd_md_sync(device); if (device->state.conn >= C_CONNECTED) { @@ -1693,10 +1852,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) if (retcode != NO_ERROR) goto fail; - if (new_disk_conf->al_extents < DRBD_AL_EXTENTS_MIN) - new_disk_conf->al_extents = DRBD_AL_EXTENTS_MIN; - if (new_disk_conf->al_extents > drbd_al_extents_max(nbc)) - new_disk_conf->al_extents = drbd_al_extents_max(nbc); + sanitize_disk_conf(device, new_disk_conf, nbc); if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) { drbd_err(device, "max capacity %llu smaller than disk size %llu\n", @@ -1838,7 +1994,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) device->read_cnt = 0; device->writ_cnt = 0; - drbd_reconsider_max_bio_size(device, device->ldev); + drbd_reconsider_queue_parameters(device, device->ldev, NULL); /* If I am currently not R_PRIMARY, * but meta data primary indicator is set, diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 6537b25db9c1..be2b93fd2c11 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -25,7 +25,7 @@ #include <linux/module.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/fs.h> #include <linux/file.h> #include <linux/proc_fs.h> @@ -122,18 +122,18 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se x = res/50; y = 20-x; - seq_printf(seq, "\t["); + seq_puts(seq, "\t["); for (i = 1; i < x; i++) - seq_printf(seq, "="); - seq_printf(seq, ">"); + seq_putc(seq, '='); + seq_putc(seq, '>'); for (i = 0; i < y; i++) seq_printf(seq, "."); - seq_printf(seq, "] "); + seq_puts(seq, "] "); if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T) - seq_printf(seq, "verified:"); + seq_puts(seq, "verified:"); else - seq_printf(seq, "sync'ed:"); + seq_puts(seq, "sync'ed:"); seq_printf(seq, "%3u.%u%% ", res / 10, res % 10); /* if more than a few GB, display in MB */ @@ -146,7 +146,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se (unsigned long) Bit2KB(rs_left), (unsigned long) Bit2KB(rs_total)); - seq_printf(seq, "\n\t"); + seq_puts(seq, "\n\t"); /* see drivers/md/md.c * We do not want to overflow, so the order of operands and @@ -175,9 +175,9 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se rt / 3600, (rt % 3600) / 60, rt % 60); dbdt = Bit2KB(db/dt); - seq_printf(seq, " speed: "); + seq_puts(seq, " speed: "); seq_printf_with_thousands_grouping(seq, dbdt); - seq_printf(seq, " ("); + seq_puts(seq, " ("); /* ------------------------- ~3s average ------------------------ */ if (proc_details >= 1) { /* this is what drbd_rs_should_slow_down() uses */ @@ -188,7 +188,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se db = device->rs_mark_left[i] - rs_left; dbdt = Bit2KB(db/dt); seq_printf_with_thousands_grouping(seq, dbdt); - seq_printf(seq, " -- "); + seq_puts(seq, " -- "); } /* --------------------- long term average ---------------------- */ @@ -200,11 +200,11 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se db = rs_total - rs_left; dbdt = Bit2KB(db/dt); seq_printf_with_thousands_grouping(seq, dbdt); - seq_printf(seq, ")"); + seq_putc(seq, ')'); if (state.conn == C_SYNC_TARGET || state.conn == C_VERIFY_S) { - seq_printf(seq, " want: "); + seq_puts(seq, " want: "); seq_printf_with_thousands_grouping(seq, device->c_sync_rate); } seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : ""); @@ -231,7 +231,7 @@ static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *se (unsigned long long)bm_bits * BM_SECT_PER_BIT); if (stop_sector != 0 && stop_sector != ULLONG_MAX) seq_printf(seq, " stop sector: %llu", stop_sector); - seq_printf(seq, "\n"); + seq_putc(seq, '\n'); } } @@ -276,7 +276,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v) rcu_read_lock(); idr_for_each_entry(&drbd_devices, device, i) { if (prev_i != i - 1) - seq_printf(seq, "\n"); + seq_putc(seq, '\n'); prev_i = i; state = device->state; diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index ef9245363dcc..4d296800f706 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -60,6 +60,15 @@ enum drbd_packet { * which is why I chose TRIM here, to disambiguate. */ P_TRIM = 0x31, + /* Only use these two if both support FF_THIN_RESYNC */ + P_RS_THIN_REQ = 0x32, /* Request a block for resync or reply P_RS_DEALLOCATED */ + P_RS_DEALLOCATED = 0x33, /* Contains only zeros on sync source node */ + + /* REQ_WRITE_SAME. + * On a receiving side without REQ_WRITE_SAME, + * we may fall back to an opencoded loop instead. */ + P_WSAME = 0x34, + P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */ P_MAX_OPT_CMD = 0x101, @@ -106,16 +115,20 @@ struct p_header100 { u32 pad; } __packed; -/* these defines must not be changed without changing the protocol version */ -#define DP_HARDBARRIER 1 /* depricated */ +/* These defines must not be changed without changing the protocol version. + * New defines may only be introduced together with protocol version bump or + * new protocol feature flags. + */ +#define DP_HARDBARRIER 1 /* no longer used */ #define DP_RW_SYNC 2 /* equals REQ_SYNC */ #define DP_MAY_SET_IN_SYNC 4 #define DP_UNPLUG 8 /* not used anymore */ #define DP_FUA 16 /* equals REQ_FUA */ -#define DP_FLUSH 32 /* equals REQ_FLUSH */ +#define DP_FLUSH 32 /* equals REQ_PREFLUSH */ #define DP_DISCARD 64 /* equals REQ_DISCARD */ #define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */ #define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */ +#define DP_WSAME 512 /* equiv. REQ_WRITE_SAME */ struct p_data { u64 sector; /* 64 bits sector number */ @@ -129,6 +142,11 @@ struct p_trim { u32 size; /* == bio->bi_size */ } __packed; +struct p_wsame { + struct p_data p_data; + u32 size; /* == bio->bi_size */ +} __packed; + /* * commands which share a struct: * p_block_ack: @@ -160,7 +178,23 @@ struct p_block_req { * ReportParams */ -#define FF_TRIM 1 +/* supports TRIM/DISCARD on the "wire" protocol */ +#define DRBD_FF_TRIM 1 + +/* Detect all-zeros during resync, and rather TRIM/UNMAP/DISCARD those blocks + * instead of fully allocate a supposedly thin volume on initial resync */ +#define DRBD_FF_THIN_RESYNC 2 + +/* supports REQ_WRITE_SAME on the "wire" protocol. + * Note: this flag is overloaded, + * its presence also + * - indicates support for 128 MiB "batch bios", + * max discard size of 128 MiB + * instead of 4M before that. + * - indicates that we exchange additional settings in p_sizes + * drbd_send_sizes()/receive_sizes() + */ +#define DRBD_FF_WSAME 4 struct p_connection_features { u32 protocol_min; @@ -235,6 +269,40 @@ struct p_rs_uuid { u64 uuid; } __packed; +/* optional queue_limits if (agreed_features & DRBD_FF_WSAME) + * see also struct queue_limits, as of late 2015 */ +struct o_qlim { + /* we don't need it yet, but we may as well communicate it now */ + u32 physical_block_size; + + /* so the original in struct queue_limits is unsigned short, + * but I'd have to put in padding anyways. */ + u32 logical_block_size; + + /* One incoming bio becomes one DRBD request, + * which may be translated to several bio on the receiving side. + * We don't need to communicate chunk/boundary/segment ... limits. + */ + + /* various IO hints may be useful with "diskless client" setups */ + u32 alignment_offset; + u32 io_min; + u32 io_opt; + + /* We may need to communicate integrity stuff at some point, + * but let's not get ahead of ourselves. */ + + /* Backend discard capabilities. + * Receiving side uses "blkdev_issue_discard()", no need to communicate + * more specifics. If the backend cannot do discards, the DRBD peer + * may fall back to blkdev_issue_zeroout(). + */ + u8 discard_enabled; + u8 discard_zeroes_data; + u8 write_same_capable; + u8 _pad; +} __packed; + struct p_sizes { u64 d_size; /* size of disk */ u64 u_size; /* user requested size */ @@ -242,6 +310,9 @@ struct p_sizes { u32 max_bio_size; /* Maximal size of a BIO */ u16 queue_order_type; /* not yet implemented in DRBD*/ u16 dds_flags; /* use enum dds_flags here. */ + + /* optional queue_limits if (agreed_features & DRBD_FF_WSAME) */ + struct o_qlim qlim[0]; } __packed; struct p_state { diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 050aaa1c0350..df45713dfbe8 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -25,7 +25,7 @@ #include <linux/module.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <net/sock.h> #include <linux/drbd.h> @@ -48,7 +48,7 @@ #include "drbd_req.h" #include "drbd_vli.h" -#define PRO_FEATURES (FF_TRIM) +#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME) struct packet_info { enum drbd_packet cmd; @@ -361,14 +361,17 @@ You must not have the req_lock: drbd_wait_ee_list_empty() */ +/* normal: payload_size == request size (bi_size) + * w_same: payload_size == logical_block_size + * trim: payload_size == 0 */ struct drbd_peer_request * drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector, - unsigned int data_size, bool has_payload, gfp_t gfp_mask) __must_hold(local) + unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local) { struct drbd_device *device = peer_device->device; struct drbd_peer_request *peer_req; struct page *page = NULL; - unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; + unsigned nr_pages = (payload_size + PAGE_SIZE -1) >> PAGE_SHIFT; if (drbd_insert_fault(device, DRBD_FAULT_AL_EE)) return NULL; @@ -380,7 +383,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto return NULL; } - if (has_payload && data_size) { + if (nr_pages) { page = drbd_alloc_pages(peer_device, nr_pages, gfpflags_allow_blocking(gfp_mask)); if (!page) @@ -390,7 +393,7 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto memset(peer_req, 0, sizeof(*peer_req)); INIT_LIST_HEAD(&peer_req->w.list); drbd_clear_interval(&peer_req->i); - peer_req->i.size = data_size; + peer_req->i.size = request_size; peer_req->i.sector = sector; peer_req->submit_jif = jiffies; peer_req->peer_device = peer_device; @@ -1204,13 +1207,84 @@ static int drbd_recv_header(struct drbd_connection *connection, struct packet_in return err; } -static void drbd_flush(struct drbd_connection *connection) +/* This is blkdev_issue_flush, but asynchronous. + * We want to submit to all component volumes in parallel, + * then wait for all completions. + */ +struct issue_flush_context { + atomic_t pending; + int error; + struct completion done; +}; +struct one_flush_context { + struct drbd_device *device; + struct issue_flush_context *ctx; +}; + +void one_flush_endio(struct bio *bio) { - int rv; - struct drbd_peer_device *peer_device; - int vnr; + struct one_flush_context *octx = bio->bi_private; + struct drbd_device *device = octx->device; + struct issue_flush_context *ctx = octx->ctx; + + if (bio->bi_error) { + ctx->error = bio->bi_error; + drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_error); + } + kfree(octx); + bio_put(bio); + + clear_bit(FLUSH_PENDING, &device->flags); + put_ldev(device); + kref_put(&device->kref, drbd_destroy_device); + + if (atomic_dec_and_test(&ctx->pending)) + complete(&ctx->done); +} + +static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx) +{ + struct bio *bio = bio_alloc(GFP_NOIO, 0); + struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO); + if (!bio || !octx) { + drbd_warn(device, "Could not allocate a bio, CANNOT ISSUE FLUSH\n"); + /* FIXME: what else can I do now? disconnecting or detaching + * really does not help to improve the state of the world, either. + */ + kfree(octx); + if (bio) + bio_put(bio); + + ctx->error = -ENOMEM; + put_ldev(device); + kref_put(&device->kref, drbd_destroy_device); + return; + } + octx->device = device; + octx->ctx = ctx; + bio->bi_bdev = device->ldev->backing_bdev; + bio->bi_private = octx; + bio->bi_end_io = one_flush_endio; + bio_set_op_attrs(bio, REQ_OP_FLUSH, WRITE_FLUSH); + + device->flush_jif = jiffies; + set_bit(FLUSH_PENDING, &device->flags); + atomic_inc(&ctx->pending); + submit_bio(bio); +} + +static void drbd_flush(struct drbd_connection *connection) +{ if (connection->resource->write_ordering >= WO_BDEV_FLUSH) { + struct drbd_peer_device *peer_device; + struct issue_flush_context ctx; + int vnr; + + atomic_set(&ctx.pending, 1); + ctx.error = 0; + init_completion(&ctx.done); + rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; @@ -1220,31 +1294,24 @@ static void drbd_flush(struct drbd_connection *connection) kref_get(&device->kref); rcu_read_unlock(); - /* Right now, we have only this one synchronous code path - * for flushes between request epochs. - * We may want to make those asynchronous, - * or at least parallelize the flushes to the volume devices. - */ - device->flush_jif = jiffies; - set_bit(FLUSH_PENDING, &device->flags); - rv = blkdev_issue_flush(device->ldev->backing_bdev, - GFP_NOIO, NULL); - clear_bit(FLUSH_PENDING, &device->flags); - if (rv) { - drbd_info(device, "local disk flush failed with status %d\n", rv); - /* would rather check on EOPNOTSUPP, but that is not reliable. - * don't try again for ANY return value != 0 - * if (rv == -EOPNOTSUPP) */ - drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO); - } - put_ldev(device); - kref_put(&device->kref, drbd_destroy_device); + submit_one_flush(device, &ctx); rcu_read_lock(); - if (rv) - break; } rcu_read_unlock(); + + /* Do we want to add a timeout, + * if disk-timeout is set? */ + if (!atomic_dec_and_test(&ctx.pending)) + wait_for_completion(&ctx.done); + + if (ctx.error) { + /* would rather check on EOPNOTSUPP, but that is not reliable. + * don't try again for ANY return value != 0 + * if (rv == -EOPNOTSUPP) */ + /* Any error is already reported by bio_endio callback. */ + drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO); + } } } @@ -1379,6 +1446,120 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); } +/* + * We *may* ignore the discard-zeroes-data setting, if so configured. + * + * Assumption is that it "discard_zeroes_data=0" is only because the backend + * may ignore partial unaligned discards. + * + * LVM/DM thin as of at least + * LVM version: 2.02.115(2)-RHEL7 (2015-01-28) + * Library version: 1.02.93-RHEL7 (2015-01-28) + * Driver version: 4.29.0 + * still behaves this way. + * + * For unaligned (wrt. alignment and granularity) or too small discards, + * we zero-out the initial (and/or) trailing unaligned partial chunks, + * but discard all the aligned full chunks. + * + * At least for LVM/DM thin, the result is effectively "discard_zeroes_data=1". + */ +int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, bool discard) +{ + struct block_device *bdev = device->ldev->backing_bdev; + struct request_queue *q = bdev_get_queue(bdev); + sector_t tmp, nr; + unsigned int max_discard_sectors, granularity; + int alignment; + int err = 0; + + if (!discard) + goto zero_out; + + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; + + max_discard_sectors = min(q->limits.max_discard_sectors, (1U << 22)); + max_discard_sectors -= max_discard_sectors % granularity; + if (unlikely(!max_discard_sectors)) + goto zero_out; + + if (nr_sectors < granularity) + goto zero_out; + + tmp = start; + if (sector_div(tmp, granularity) != alignment) { + if (nr_sectors < 2*granularity) + goto zero_out; + /* start + gran - (start + gran - align) % gran */ + tmp = start + granularity - alignment; + tmp = start + granularity - sector_div(tmp, granularity); + + nr = tmp - start; + err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0); + nr_sectors -= nr; + start = tmp; + } + while (nr_sectors >= granularity) { + nr = min_t(sector_t, nr_sectors, max_discard_sectors); + err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO, 0); + nr_sectors -= nr; + start += nr; + } + zero_out: + if (nr_sectors) { + err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO, 0); + } + return err != 0; +} + +static bool can_do_reliable_discards(struct drbd_device *device) +{ + struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); + struct disk_conf *dc; + bool can_do; + + if (!blk_queue_discard(q)) + return false; + + if (q->limits.discard_zeroes_data) + return true; + + rcu_read_lock(); + dc = rcu_dereference(device->ldev->disk_conf); + can_do = dc->discard_zeroes_if_aligned; + rcu_read_unlock(); + return can_do; +} + +static void drbd_issue_peer_discard(struct drbd_device *device, struct drbd_peer_request *peer_req) +{ + /* If the backend cannot discard, or does not guarantee + * read-back zeroes in discarded ranges, we fall back to + * zero-out. Unless configuration specifically requested + * otherwise. */ + if (!can_do_reliable_discards(device)) + peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT; + + if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector, + peer_req->i.size >> 9, !(peer_req->flags & EE_IS_TRIM_USE_ZEROOUT))) + peer_req->flags |= EE_WAS_ERROR; + drbd_endio_write_sec_final(peer_req); +} + +static void drbd_issue_peer_wsame(struct drbd_device *device, + struct drbd_peer_request *peer_req) +{ + struct block_device *bdev = device->ldev->backing_bdev; + sector_t s = peer_req->i.sector; + sector_t nr = peer_req->i.size >> 9; + if (blkdev_issue_write_same(bdev, s, nr, GFP_NOIO, peer_req->pages)) + peer_req->flags |= EE_WAS_ERROR; + drbd_endio_write_sec_final(peer_req); +} + + /** * drbd_submit_peer_request() * @device: DRBD device. @@ -1398,7 +1579,8 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin /* TODO allocate from our own bio_set. */ int drbd_submit_peer_request(struct drbd_device *device, struct drbd_peer_request *peer_req, - const unsigned rw, const int fault_type) + const unsigned op, const unsigned op_flags, + const int fault_type) { struct bio *bios = NULL; struct bio *bio; @@ -1409,7 +1591,13 @@ int drbd_submit_peer_request(struct drbd_device *device, unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; int err = -ENOMEM; - if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { + /* TRIM/DISCARD: for now, always use the helper function + * blkdev_issue_zeroout(..., discard=true). + * It's synchronous, but it does the right thing wrt. bio splitting. + * Correctness first, performance later. Next step is to code an + * asynchronous variant of the same. + */ + if (peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) { /* wait for all pending IO completions, before we start * zeroing things out. */ conn_wait_active_ee_empty(peer_req->peer_device->connection); @@ -1417,22 +1605,22 @@ int drbd_submit_peer_request(struct drbd_device *device, * so we can find it to present it in debugfs */ peer_req->submit_jif = jiffies; peer_req->flags |= EE_SUBMITTED; - spin_lock_irq(&device->resource->req_lock); - list_add_tail(&peer_req->w.list, &device->active_ee); - spin_unlock_irq(&device->resource->req_lock); - if (blkdev_issue_zeroout(device->ldev->backing_bdev, - sector, data_size >> 9, GFP_NOIO, false)) - peer_req->flags |= EE_WAS_ERROR; - drbd_endio_write_sec_final(peer_req); + + /* If this was a resync request from receive_rs_deallocated(), + * it is already on the sync_ee list */ + if (list_empty(&peer_req->w.list)) { + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->active_ee); + spin_unlock_irq(&device->resource->req_lock); + } + + if (peer_req->flags & EE_IS_TRIM) + drbd_issue_peer_discard(device, peer_req); + else /* EE_WRITE_SAME */ + drbd_issue_peer_wsame(device, peer_req); return 0; } - /* Discards don't have any payload. - * But the scsi layer still expects a bio_vec it can use internally, - * see sd_setup_discard_cmnd() and blk_add_request_payload(). */ - if (peer_req->flags & EE_IS_TRIM) - nr_pages = 1; - /* In most cases, we will only need one bio. But in case the lower * level restrictions happen to be different at this offset on this * side than those of the sending peer, we may need to submit the @@ -1450,7 +1638,7 @@ next_bio: /* > peer_req->i.sector, unless this is the first bio */ bio->bi_iter.bi_sector = sector; bio->bi_bdev = device->ldev->backing_bdev; - bio->bi_rw = rw; + bio_set_op_attrs(bio, op, op_flags); bio->bi_private = peer_req; bio->bi_end_io = drbd_peer_request_endio; @@ -1458,11 +1646,6 @@ next_bio: bios = bio; ++n_bios; - if (rw & REQ_DISCARD) { - bio->bi_iter.bi_size = data_size; - goto submit; - } - page_chain_for_each(page) { unsigned len = min_t(unsigned, data_size, PAGE_SIZE); if (!bio_add_page(bio, page, len, 0)) { @@ -1484,7 +1667,6 @@ next_bio: --nr_pages; } D_ASSERT(device, data_size == 0); -submit: D_ASSERT(device, page == NULL); atomic_set(&peer_req->pending_bios, n_bios); @@ -1608,8 +1790,26 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf return 0; } +/* quick wrapper in case payload size != request_size (write same) */ +static void drbd_csum_ee_size(struct crypto_ahash *h, + struct drbd_peer_request *r, void *d, + unsigned int payload_size) +{ + unsigned int tmp = r->i.size; + r->i.size = payload_size; + drbd_csum_ee(h, r, d); + r->i.size = tmp; +} + /* used from receive_RSDataReply (recv_resync_read) - * and from receive_Data */ + * and from receive_Data. + * data_size: actual payload ("data in") + * for normal writes that is bi_size. + * for discards, that is zero. + * for write same, it is logical_block_size. + * both trim and write same have the bi_size ("data len to be affected") + * as extra argument in the packet header. + */ static struct drbd_peer_request * read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, struct packet_info *pi) __must_hold(local) @@ -1624,6 +1824,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, void *dig_vv = peer_device->connection->int_dig_vv; unsigned long *data; struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL; + struct p_trim *wsame = (pi->cmd == P_WSAME) ? pi->data : NULL; digest_size = 0; if (!trim && peer_device->connection->peer_integrity_tfm) { @@ -1638,38 +1839,60 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, data_size -= digest_size; } + /* assume request_size == data_size, but special case trim and wsame. */ + ds = data_size; if (trim) { - D_ASSERT(peer_device, data_size == 0); - data_size = be32_to_cpu(trim->size); + if (!expect(data_size == 0)) + return NULL; + ds = be32_to_cpu(trim->size); + } else if (wsame) { + if (data_size != queue_logical_block_size(device->rq_queue)) { + drbd_err(peer_device, "data size (%u) != drbd logical block size (%u)\n", + data_size, queue_logical_block_size(device->rq_queue)); + return NULL; + } + if (data_size != bdev_logical_block_size(device->ldev->backing_bdev)) { + drbd_err(peer_device, "data size (%u) != backend logical block size (%u)\n", + data_size, bdev_logical_block_size(device->ldev->backing_bdev)); + return NULL; + } + ds = be32_to_cpu(wsame->size); } - if (!expect(IS_ALIGNED(data_size, 512))) + if (!expect(IS_ALIGNED(ds, 512))) return NULL; - /* prepare for larger trim requests. */ - if (!trim && !expect(data_size <= DRBD_MAX_BIO_SIZE)) + if (trim || wsame) { + if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9))) + return NULL; + } else if (!expect(ds <= DRBD_MAX_BIO_SIZE)) return NULL; /* even though we trust out peer, * we sometimes have to double check. */ - if (sector + (data_size>>9) > capacity) { + if (sector + (ds>>9) > capacity) { drbd_err(device, "request from peer beyond end of local disk: " "capacity: %llus < sector: %llus + size: %u\n", (unsigned long long)capacity, - (unsigned long long)sector, data_size); + (unsigned long long)sector, ds); return NULL; } /* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ - peer_req = drbd_alloc_peer_req(peer_device, id, sector, data_size, trim == NULL, GFP_NOIO); + peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO); if (!peer_req) return NULL; peer_req->flags |= EE_WRITE; - if (trim) + if (trim) { + peer_req->flags |= EE_IS_TRIM; return peer_req; + } + if (wsame) + peer_req->flags |= EE_WRITE_SAME; + /* receive payload size bytes into page chain */ ds = data_size; page = peer_req->pages; page_chain_for_each(page) { @@ -1689,7 +1912,7 @@ read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector, } if (digest_size) { - drbd_csum_ee(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv); + drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size); if (memcmp(dig_in, dig_vv, digest_size)) { drbd_err(device, "Digest integrity check FAILED: %llus +%u\n", (unsigned long long)sector, data_size); @@ -1830,7 +2053,8 @@ static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t secto spin_unlock_irq(&device->resource->req_lock); atomic_add(pi->size >> 9, &device->rs_sect_ev); - if (drbd_submit_peer_request(device, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0) + if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE, 0, + DRBD_FAULT_RS_WR) == 0) return 0; /* don't care for the reason here */ @@ -2065,13 +2289,13 @@ static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2) static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req) { struct drbd_peer_request *rs_req; - bool rv = 0; + bool rv = false; spin_lock_irq(&device->resource->req_lock); list_for_each_entry(rs_req, &device->sync_ee, w.list) { if (overlaps(peer_req->i.sector, peer_req->i.size, rs_req->i.sector, rs_req->i.size)) { - rv = 1; + rv = true; break; } } @@ -2152,12 +2376,19 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co /* see also bio_flags_to_wire() * DRBD_REQ_*, because we need to semantically map the flags to data packet * flags and back. We may replicate to other kernel versions. */ -static unsigned long wire_flags_to_bio(u32 dpf) +static unsigned long wire_flags_to_bio_flags(u32 dpf) { return (dpf & DP_RW_SYNC ? REQ_SYNC : 0) | (dpf & DP_FUA ? REQ_FUA : 0) | - (dpf & DP_FLUSH ? REQ_FLUSH : 0) | - (dpf & DP_DISCARD ? REQ_DISCARD : 0); + (dpf & DP_FLUSH ? REQ_PREFLUSH : 0); +} + +static unsigned long wire_flags_to_bio_op(u32 dpf) +{ + if (dpf & DP_DISCARD) + return REQ_OP_DISCARD; + else + return REQ_OP_WRITE; } static void fail_postponed_requests(struct drbd_device *device, sector_t sector, @@ -2303,7 +2534,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * struct drbd_peer_request *peer_req; struct p_data *p = pi->data; u32 peer_seq = be32_to_cpu(p->seq_num); - int rw = WRITE; + int op, op_flags; u32 dp_flags; int err, tp; @@ -2342,14 +2573,11 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * peer_req->flags |= EE_APPLICATION; dp_flags = be32_to_cpu(p->dp_flags); - rw |= wire_flags_to_bio(dp_flags); + op = wire_flags_to_bio_op(dp_flags); + op_flags = wire_flags_to_bio_flags(dp_flags); if (pi->cmd == P_TRIM) { - struct request_queue *q = bdev_get_queue(device->ldev->backing_bdev); - peer_req->flags |= EE_IS_TRIM; - if (!blk_queue_discard(q)) - peer_req->flags |= EE_IS_TRIM_USE_ZEROOUT; D_ASSERT(peer_device, peer_req->i.size > 0); - D_ASSERT(peer_device, rw & REQ_DISCARD); + D_ASSERT(peer_device, op == REQ_OP_DISCARD); D_ASSERT(peer_device, peer_req->pages == NULL); } else if (peer_req->pages == NULL) { D_ASSERT(device, peer_req->i.size == 0); @@ -2414,11 +2642,11 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * update_peer_seq(peer_device, peer_seq); spin_lock_irq(&device->resource->req_lock); } - /* if we use the zeroout fallback code, we process synchronously - * and we wait for all pending requests, respectively wait for + /* TRIM and WRITE_SAME are processed synchronously, + * we wait for all pending requests, respectively wait for * active_ee to become empty in drbd_submit_peer_request(); * better not add ourselves here. */ - if ((peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) == 0) + if ((peer_req->flags & (EE_IS_TRIM|EE_WRITE_SAME)) == 0) list_add_tail(&peer_req->w.list, &device->active_ee); spin_unlock_irq(&device->resource->req_lock); @@ -2433,7 +2661,8 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * peer_req->flags |= EE_CALL_AL_COMPLETE_IO; } - err = drbd_submit_peer_request(device, peer_req, rw, DRBD_FAULT_DT_WR); + err = drbd_submit_peer_request(device, peer_req, op, op_flags, + DRBD_FAULT_DT_WR); if (!err) return 0; @@ -2449,7 +2678,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * } out_interrupted: - drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT + EV_CLEANUP); + drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP); put_ldev(device); drbd_free_peer_req(device, peer_req); return err; @@ -2574,6 +2803,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet case P_DATA_REQUEST: drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p); break; + case P_RS_THIN_REQ: case P_RS_DATA_REQUEST: case P_CSUM_RS_REQUEST: case P_OV_REQUEST: @@ -2599,7 +2829,7 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet * "criss-cross" setup, that might cause write-out on some other DRBD, * which in turn might block on the other node at this very place. */ peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size, - true /* has real payload */, GFP_NOIO); + size, GFP_NOIO); if (!peer_req) { put_ldev(device); return -ENOMEM; @@ -2613,6 +2843,12 @@ static int receive_DataRequest(struct drbd_connection *connection, struct packet peer_req->flags |= EE_APPLICATION; goto submit; + case P_RS_THIN_REQ: + /* If at some point in the future we have a smart way to + find out if this data block is completely deallocated, + then we would do something smarter here than reading + the block... */ + peer_req->flags |= EE_RS_THIN_REQ; case P_RS_DATA_REQUEST: peer_req->w.cb = w_e_end_rsdata_req; fault_type = DRBD_FAULT_RS_RD; @@ -2723,7 +2959,8 @@ submit_for_resync: submit: update_receiver_timing_details(connection, drbd_submit_peer_request); inc_unacked(device); - if (drbd_submit_peer_request(device, peer_req, READ, fault_type) == 0) + if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, 0, + fault_type) == 0) return 0; /* don't care for the reason here */ @@ -2957,7 +3194,8 @@ static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid, -1091 requires proto 91 -1096 requires proto 96 */ -static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __must_hold(local) + +static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local) { struct drbd_peer_device *const peer_device = first_peer_device(device); struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL; @@ -3037,8 +3275,39 @@ static int drbd_uuid_compare(struct drbd_device *const device, int *rule_nr) __m * next bit (weight 2) is set when peer was primary */ *rule_nr = 40; + /* Neither has the "crashed primary" flag set, + * only a replication link hickup. */ + if (rct == 0) + return 0; + + /* Current UUID equal and no bitmap uuid; does not necessarily + * mean this was a "simultaneous hard crash", maybe IO was + * frozen, so no UUID-bump happened. + * This is a protocol change, overload DRBD_FF_WSAME as flag + * for "new-enough" peer DRBD version. */ + if (device->state.role == R_PRIMARY || peer_role == R_PRIMARY) { + *rule_nr = 41; + if (!(connection->agreed_features & DRBD_FF_WSAME)) { + drbd_warn(peer_device, "Equivalent unrotated UUIDs, but current primary present.\n"); + return -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8)); + } + if (device->state.role == R_PRIMARY && peer_role == R_PRIMARY) { + /* At least one has the "crashed primary" bit set, + * both are primary now, but neither has rotated its UUIDs? + * "Can not happen." */ + drbd_err(peer_device, "Equivalent unrotated UUIDs, but both are primary. Can not resolve this.\n"); + return -100; + } + if (device->state.role == R_PRIMARY) + return 1; + return -1; + } + + /* Both are secondary. + * Really looks like recovery from simultaneous hard crash. + * Check which had been primary before, and arbitrate. */ switch (rct) { - case 0: /* !self_pri && !peer_pri */ return 0; + case 0: /* !self_pri && !peer_pri */ return 0; /* already handled */ case 1: /* self_pri && !peer_pri */ return 1; case 2: /* !self_pri && peer_pri */ return -1; case 3: /* self_pri && peer_pri */ @@ -3165,7 +3434,7 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]); - hg = drbd_uuid_compare(device, &rule_nr); + hg = drbd_uuid_compare(device, peer_role, &rule_nr); spin_unlock_irq(&device->ldev->md.uuid_lock); drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr); @@ -3174,6 +3443,15 @@ static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device, drbd_alert(device, "Unrelated data, aborting!\n"); return C_MASK; } + if (hg < -0x10000) { + int proto, fflags; + hg = -hg; + proto = hg & 0xff; + fflags = (hg >> 8) & 0xff; + drbd_alert(device, "To resolve this both sides have to support at least protocol %d and feature flags 0x%x\n", + proto, fflags); + return C_MASK; + } if (hg < -1000) { drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000); return C_MASK; @@ -3403,7 +3681,8 @@ static int receive_protocol(struct drbd_connection *connection, struct packet_in */ peer_integrity_tfm = crypto_alloc_ahash(integrity_alg, 0, CRYPTO_ALG_ASYNC); - if (!peer_integrity_tfm) { + if (IS_ERR(peer_integrity_tfm)) { + peer_integrity_tfm = NULL; drbd_err(connection, "peer data-integrity-alg %s not supported\n", integrity_alg); goto disconnect; @@ -3754,6 +4033,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info struct drbd_peer_device *peer_device; struct drbd_device *device; struct p_sizes *p = pi->data; + struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL; enum determine_dev_size dd = DS_UNCHANGED; sector_t p_size, p_usize, p_csize, my_usize; int ldsc = 0; /* local disk size changed */ @@ -3773,6 +4053,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info device->p_size = p_size; if (get_ldev(device)) { + sector_t new_size, cur_size; rcu_read_lock(); my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size; rcu_read_unlock(); @@ -3789,11 +4070,13 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info /* Never shrink a device with usable data during connect. But allow online shrinking if we are connected. */ - if (drbd_new_dev_size(device, device->ldev, p_usize, 0) < - drbd_get_capacity(device->this_bdev) && + new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0); + cur_size = drbd_get_capacity(device->this_bdev); + if (new_size < cur_size && device->state.disk >= D_OUTDATED && device->state.conn < C_CONNECTED) { - drbd_err(device, "The peer's disk size is too small!\n"); + drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n", + (unsigned long long)new_size, (unsigned long long)cur_size); conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD); put_ldev(device); return -EIO; @@ -3827,14 +4110,14 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info } device->peer_max_bio_size = be32_to_cpu(p->max_bio_size); - /* Leave drbd_reconsider_max_bio_size() before drbd_determine_dev_size(). + /* Leave drbd_reconsider_queue_parameters() before drbd_determine_dev_size(). In case we cleared the QUEUE_FLAG_DISCARD from our queue in - drbd_reconsider_max_bio_size(), we can be sure that after + drbd_reconsider_queue_parameters(), we can be sure that after drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */ ddsf = be16_to_cpu(p->dds_flags); if (get_ldev(device)) { - drbd_reconsider_max_bio_size(device, device->ldev); + drbd_reconsider_queue_parameters(device, device->ldev, o); dd = drbd_determine_dev_size(device, ddsf, NULL); put_ldev(device); if (dd == DS_ERROR) @@ -3854,7 +4137,7 @@ static int receive_sizes(struct drbd_connection *connection, struct packet_info * However, if he sends a zero current size, * take his (user-capped or) backing disk size anyways. */ - drbd_reconsider_max_bio_size(device, NULL); + drbd_reconsider_queue_parameters(device, NULL, o); drbd_set_my_capacity(device, p_csize ?: p_usize ?: p_size); } @@ -4587,9 +4870,75 @@ static int receive_out_of_sync(struct drbd_connection *connection, struct packet return 0; } +static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi) +{ + struct drbd_peer_device *peer_device; + struct p_block_desc *p = pi->data; + struct drbd_device *device; + sector_t sector; + int size, err = 0; + + peer_device = conn_peer_device(connection, pi->vnr); + if (!peer_device) + return -EIO; + device = peer_device->device; + + sector = be64_to_cpu(p->sector); + size = be32_to_cpu(p->blksize); + + dec_rs_pending(device); + + if (get_ldev(device)) { + struct drbd_peer_request *peer_req; + const int op = REQ_OP_DISCARD; + + peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector, + size, 0, GFP_NOIO); + if (!peer_req) { + put_ldev(device); + return -ENOMEM; + } + + peer_req->w.cb = e_end_resync_block; + peer_req->submit_jif = jiffies; + peer_req->flags |= EE_IS_TRIM; + + spin_lock_irq(&device->resource->req_lock); + list_add_tail(&peer_req->w.list, &device->sync_ee); + spin_unlock_irq(&device->resource->req_lock); + + atomic_add(pi->size >> 9, &device->rs_sect_ev); + err = drbd_submit_peer_request(device, peer_req, op, 0, DRBD_FAULT_RS_WR); + + if (err) { + spin_lock_irq(&device->resource->req_lock); + list_del(&peer_req->w.list); + spin_unlock_irq(&device->resource->req_lock); + + drbd_free_peer_req(device, peer_req); + put_ldev(device); + err = 0; + goto fail; + } + + inc_unacked(device); + + /* No put_ldev() here. Gets called in drbd_endio_write_sec_final(), + as well as drbd_rs_complete_io() */ + } else { + fail: + drbd_rs_complete_io(device, sector); + drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER); + } + + atomic_add(size >> 9, &device->rs_sect_in); + + return err; +} + struct data_cmd { int expect_payload; - size_t pkt_size; + unsigned int pkt_size; int (*fn)(struct drbd_connection *, struct packet_info *); }; @@ -4614,11 +4963,14 @@ static struct data_cmd drbd_cmd_handler[] = { [P_OV_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest }, [P_OV_REPLY] = { 1, sizeof(struct p_block_req), receive_DataRequest }, [P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest }, + [P_RS_THIN_REQ] = { 0, sizeof(struct p_block_req), receive_DataRequest }, [P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip }, [P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync }, [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state }, [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol }, [P_TRIM] = { 0, sizeof(struct p_trim), receive_Data }, + [P_RS_DEALLOCATED] = { 0, sizeof(struct p_block_desc), receive_rs_deallocated }, + [P_WSAME] = { 1, sizeof(struct p_wsame), receive_Data }, }; static void drbdd(struct drbd_connection *connection) @@ -4628,7 +4980,7 @@ static void drbdd(struct drbd_connection *connection) int err; while (get_t_state(&connection->receiver) == RUNNING) { - struct data_cmd *cmd; + struct data_cmd const *cmd; drbd_thread_current_set_cpu(&connection->receiver); update_receiver_timing_details(connection, drbd_recv_header); @@ -4643,11 +4995,18 @@ static void drbdd(struct drbd_connection *connection) } shs = cmd->pkt_size; + if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME) + shs += sizeof(struct o_qlim); if (pi.size > shs && !cmd->expect_payload) { drbd_err(connection, "No payload expected %s l:%d\n", cmdname(pi.cmd), pi.size); goto err_out; } + if (pi.size < shs) { + drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n", + cmdname(pi.cmd), (int)shs, pi.size); + goto err_out; + } if (shs) { update_receiver_timing_details(connection, drbd_recv_all_warn); @@ -4783,9 +5142,11 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device) drbd_md_sync(device); - /* serialize with bitmap writeout triggered by the state change, - * if any. */ - wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags)); + if (get_ldev(device)) { + drbd_bitmap_io(device, &drbd_bm_write_copy_pages, + "write from disconnected", BM_LOCKED_CHANGE_ALLOWED); + put_ldev(device); + } /* tcp_close and release of sendpage pages can be deferred. I don't * want to use SO_LINGER, because apparently it can be deferred for @@ -4892,8 +5253,12 @@ static int drbd_do_features(struct drbd_connection *connection) drbd_info(connection, "Handshake successful: " "Agreed network protocol version %d\n", connection->agreed_pro_version); - drbd_info(connection, "Agreed to%ssupport TRIM on protocol level\n", - connection->agreed_features & FF_TRIM ? " " : " not "); + drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s.\n", + connection->agreed_features, + connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "", + connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "", + connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : + connection->agreed_features ? "" : " none"); return 1; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 2255dcfebd2b..66b8e4bb74d8 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -47,8 +47,7 @@ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *r &device->vdisk->part0, req->start_jif); } -static struct drbd_request *drbd_req_new(struct drbd_device *device, - struct bio *bio_src) +static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio *bio_src) { struct drbd_request *req; @@ -58,10 +57,12 @@ static struct drbd_request *drbd_req_new(struct drbd_device *device, memset(req, 0, sizeof(*req)); drbd_req_make_private_bio(req, bio_src); - req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0; - req->device = device; - req->master_bio = bio_src; - req->epoch = 0; + req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0) + | (bio_op(bio_src) == REQ_OP_WRITE_SAME ? RQ_WSAME : 0) + | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0); + req->device = device; + req->master_bio = bio_src; + req->epoch = 0; drbd_clear_interval(&req->i); req->i.sector = bio_src->bi_iter.bi_sector; @@ -218,7 +219,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) { const unsigned s = req->rq_state; struct drbd_device *device = req->device; - int rw; int error, ok; /* we must not complete the master bio, while it is @@ -242,8 +242,6 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) return; } - rw = bio_rw(req->master_bio); - /* * figure out whether to report success or failure. * @@ -267,7 +265,7 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) * epoch number. If they match, increase the current_tle_nr, * and reset the transfer log epoch write_cnt. */ - if (rw == WRITE && + if (op_is_write(bio_op(req->master_bio)) && req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr)) start_new_tl_epoch(first_peer_device(device)->connection); @@ -284,11 +282,14 @@ void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m) * because no path was available, in which case * it was not even added to the transfer_log. * - * READA may fail, and will not be retried. + * read-ahead may fail, and will not be retried. * * WRITE should have used all available paths already. */ - if (!ok && rw == READ && !list_empty(&req->tl_requests)) + if (!ok && + bio_op(req->master_bio) == REQ_OP_READ && + !(req->master_bio->bi_rw & REQ_RAHEAD) && + !list_empty(&req->tl_requests)) req->rq_state |= RQ_POSTPONED; if (!(req->rq_state & RQ_POSTPONED)) { @@ -644,7 +645,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, __drbd_chk_io_error(device, DRBD_READ_ERROR); /* fall through. */ case READ_AHEAD_COMPLETED_WITH_ERROR: - /* it is legal to fail READA, no __drbd_chk_io_error in that case. */ + /* it is legal to fail read-ahead, no __drbd_chk_io_error in that case. */ mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED); break; @@ -656,7 +657,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; case QUEUE_FOR_NET_READ: - /* READ or READA, and + /* READ, and * no local disk, * or target area marked as invalid, * or just got an io-error. */ @@ -977,16 +978,20 @@ static void complete_conflicting_writes(struct drbd_request *req) sector_t sector = req->i.sector; int size = req->i.size; - i = drbd_find_overlap(&device->write_requests, sector, size); - if (!i) - return; - for (;;) { - prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE); - i = drbd_find_overlap(&device->write_requests, sector, size); - if (!i) + drbd_for_each_overlap(i, &device->write_requests, sector, size) { + /* Ignore, if already completed to upper layers. */ + if (i->completed) + continue; + /* Handle the first found overlap. After the schedule + * we have to restart the tree walk. */ + break; + } + if (!i) /* if any */ break; + /* Indicate to wake up device->misc_wait on progress. */ + prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE); i->waiting = true; spin_unlock_irq(&device->resource->req_lock); schedule(); @@ -995,7 +1000,7 @@ static void complete_conflicting_writes(struct drbd_request *req) finish_wait(&device->misc_wait, &wait); } -/* called within req_lock and rcu_read_lock() */ +/* called within req_lock */ static void maybe_pull_ahead(struct drbd_device *device) { struct drbd_connection *connection = first_peer_device(device)->connection; @@ -1132,7 +1137,7 @@ static int drbd_process_write_request(struct drbd_request *req) * replicating, in which case there is no point. */ if (unlikely(req->i.size == 0)) { /* The only size==0 bios we expect are empty flushes. */ - D_ASSERT(device, req->master_bio->bi_rw & REQ_FLUSH); + D_ASSERT(device, req->master_bio->bi_rw & REQ_PREFLUSH); if (remote) _req_mod(req, QUEUE_AS_DRBD_BARRIER); return remote; @@ -1152,12 +1157,29 @@ static int drbd_process_write_request(struct drbd_request *req) return remote; } +static void drbd_process_discard_req(struct drbd_request *req) +{ + int err = drbd_issue_discard_or_zero_out(req->device, + req->i.sector, req->i.size >> 9, true); + + if (err) + req->private_bio->bi_error = -EIO; + bio_endio(req->private_bio); +} + static void drbd_submit_req_private_bio(struct drbd_request *req) { struct drbd_device *device = req->device; struct bio *bio = req->private_bio; - const int rw = bio_rw(bio); + unsigned int type; + + if (bio_op(bio) != REQ_OP_READ) + type = DRBD_FAULT_DT_WR; + else if (bio->bi_rw & REQ_RAHEAD) + type = DRBD_FAULT_DT_RA; + else + type = DRBD_FAULT_DT_RD; bio->bi_bdev = device->ldev->backing_bdev; @@ -1167,11 +1189,10 @@ drbd_submit_req_private_bio(struct drbd_request *req) * stable storage, and this is a WRITE, we may not even submit * this bio. */ if (get_ldev(device)) { - if (drbd_insert_fault(device, - rw == WRITE ? DRBD_FAULT_DT_WR - : rw == READ ? DRBD_FAULT_DT_RD - : DRBD_FAULT_DT_RA)) + if (drbd_insert_fault(device, type)) bio_io_error(bio); + else if (bio_op(bio) == REQ_OP_DISCARD) + drbd_process_discard_req(req); else generic_make_request(bio); put_ldev(device); @@ -1223,24 +1244,45 @@ drbd_request_prepare(struct drbd_device *device, struct bio *bio, unsigned long /* Update disk stats */ _drbd_start_io_acct(device, req); + /* process discards always from our submitter thread */ + if (bio_op(bio) & REQ_OP_DISCARD) + goto queue_for_submitter_thread; + if (rw == WRITE && req->private_bio && req->i.size && !test_bit(AL_SUSPENDED, &device->flags)) { - if (!drbd_al_begin_io_fastpath(device, &req->i)) { - atomic_inc(&device->ap_actlog_cnt); - drbd_queue_write(device, req); - return NULL; - } + if (!drbd_al_begin_io_fastpath(device, &req->i)) + goto queue_for_submitter_thread; req->rq_state |= RQ_IN_ACT_LOG; req->in_actlog_jif = jiffies; } - return req; + + queue_for_submitter_thread: + atomic_inc(&device->ap_actlog_cnt); + drbd_queue_write(device, req); + return NULL; +} + +/* Require at least one path to current data. + * We don't want to allow writes on C_STANDALONE D_INCONSISTENT: + * We would not allow to read what was written, + * we would not have bumped the data generation uuids, + * we would cause data divergence for all the wrong reasons. + * + * If we don't see at least one D_UP_TO_DATE, we will fail this request, + * which either returns EIO, or, if OND_SUSPEND_IO is set, suspends IO, + * and queues for retry later. + */ +static bool may_do_writes(struct drbd_device *device) +{ + const union drbd_dev_state s = device->state; + return s.disk == D_UP_TO_DATE || s.pdsk == D_UP_TO_DATE; } static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req) { struct drbd_resource *resource = device->resource; - const int rw = bio_rw(req->master_bio); + const int rw = bio_data_dir(req->master_bio); struct bio_and_error m = { NULL, }; bool no_remote = false; bool submit_private_bio = false; @@ -1270,7 +1312,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request goto out; } - /* We fail READ/READA early, if we can not serve it. + /* We fail READ early, if we can not serve it. * We must do this before req is registered on any lists. * Otherwise, drbd_req_complete() will queue failed READ for retry. */ if (rw != WRITE) { @@ -1291,6 +1333,12 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request } if (rw == WRITE) { + if (req->private_bio && !may_do_writes(device)) { + bio_put(req->private_bio); + req->private_bio = NULL; + put_ldev(device); + goto nodata; + } if (!drbd_process_write_request(req)) no_remote = true; } else { diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index bb2ef78165e5..eb49e7f2da91 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -206,6 +206,8 @@ enum drbd_req_state_bits { /* Set when this is a write, clear for a read */ __RQ_WRITE, + __RQ_WSAME, + __RQ_UNMAP, /* Should call drbd_al_complete_io() for this request... */ __RQ_IN_ACT_LOG, @@ -241,10 +243,11 @@ enum drbd_req_state_bits { #define RQ_NET_OK (1UL << __RQ_NET_OK) #define RQ_NET_SIS (1UL << __RQ_NET_SIS) -/* 0x1f8 */ #define RQ_NET_MASK (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK) #define RQ_WRITE (1UL << __RQ_WRITE) +#define RQ_WSAME (1UL << __RQ_WSAME) +#define RQ_UNMAP (1UL << __RQ_UNMAP) #define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG) #define RQ_POSTPONED (1UL << __RQ_POSTPONED) #define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP) diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 5a7ef7873b67..eea0c4aec978 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -814,7 +814,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns) } if (rv <= 0) - /* already found a reason to abort */; + goto out; /* already found a reason to abort */ else if (ns.role == R_SECONDARY && device->open_cnt) rv = SS_DEVICE_IN_USE; @@ -862,6 +862,7 @@ is_valid_state(struct drbd_device *device, union drbd_state ns) else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN) rv = SS_CONNECTED_OUTDATES; +out: rcu_read_unlock(); return rv; @@ -906,6 +907,15 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_c (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS))) rv = SS_IN_TRANSIENT_STATE; + /* Do not promote during resync handshake triggered by "force primary". + * This is a hack. It should really be rejected by the peer during the + * cluster wide state change request. */ + if (os.role != R_PRIMARY && ns.role == R_PRIMARY + && ns.pdsk == D_UP_TO_DATE + && ns.disk != D_UP_TO_DATE && ns.disk != D_DISKLESS + && (ns.conn <= C_WF_SYNC_UUID || ns.conn != os.conn)) + rv = SS_IN_TRANSIENT_STATE; + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) rv = SS_NEED_CONNECTION; @@ -1628,6 +1638,26 @@ static void broadcast_state_change(struct drbd_state_change *state_change) #undef REMEMBER_STATE_CHANGE } +/* takes old and new peer disk state */ +static bool lost_contact_to_peer_data(enum drbd_disk_state os, enum drbd_disk_state ns) +{ + if ((os >= D_INCONSISTENT && os != D_UNKNOWN && os != D_OUTDATED) + && (ns < D_INCONSISTENT || ns == D_UNKNOWN || ns == D_OUTDATED)) + return true; + + /* Scenario, starting with normal operation + * Connected Primary/Secondary UpToDate/UpToDate + * NetworkFailure Primary/Unknown UpToDate/DUnknown (frozen) + * ... + * Connected Primary/Secondary UpToDate/Diskless (resumed; needs to bump uuid!) + */ + if (os == D_UNKNOWN + && (ns == D_DISKLESS || ns == D_FAILED || ns == D_OUTDATED)) + return true; + + return false; +} + /** * after_state_ch() - Perform after state change actions that may sleep * @device: DRBD device. @@ -1675,7 +1705,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, what = RESEND; if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && - conn_lowest_disk(connection) > D_NEGOTIATING) + conn_lowest_disk(connection) == D_UP_TO_DATE) what = RESTART_FROZEN_DISK_IO; if (resource->susp_nod && what != NOTHING) { @@ -1699,6 +1729,13 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, idr_for_each_entry(&connection->peer_devices, peer_device, vnr) clear_bit(NEW_CUR_UUID, &peer_device->device->flags); rcu_read_unlock(); + + /* We should actively create a new uuid, _before_ + * we resume/resent, if the peer is diskless + * (recovery from a multiple error scenario). + * Currently, this happens with a slight delay + * below when checking lost_contact_to_peer_data() ... + */ _tl_restart(connection, RESEND); _conn_request_state(connection, (union drbd_state) { { .susp_fen = 1 } }, @@ -1742,12 +1779,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, BM_LOCKED_TEST_ALLOWED); /* Lost contact to peer's copy of the data */ - if ((os.pdsk >= D_INCONSISTENT && - os.pdsk != D_UNKNOWN && - os.pdsk != D_OUTDATED) - && (ns.pdsk < D_INCONSISTENT || - ns.pdsk == D_UNKNOWN || - ns.pdsk == D_OUTDATED)) { + if (lost_contact_to_peer_data(os.pdsk, ns.pdsk)) { if (get_ldev(device)) { if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) && device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { @@ -1934,12 +1966,17 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, /* This triggers bitmap writeout of potentially still unwritten pages * if the resync finished cleanly, or aborted because of peer disk - * failure, or because of connection loss. + * failure, or on transition from resync back to AHEAD/BEHIND. + * + * Connection loss is handled in drbd_disconnected() by the receiver. + * * For resync aborted because of local disk failure, we cannot do * any bitmap writeout anymore. + * * No harm done if some bits change during this phase. */ - if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(device)) { + if ((os.conn > C_CONNECTED && os.conn < C_AHEAD) && + (ns.conn == C_CONNECTED || ns.conn >= C_AHEAD) && get_ldev(device)) { drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL, "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); put_ldev(device); @@ -2160,9 +2197,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union ns.disk = os.disk; rv = _drbd_set_state(device, ns, flags, NULL); - if (rv < SS_SUCCESS) - BUG(); - + BUG_ON(rv < SS_SUCCESS); ns.i = device->state.i; ns_max.role = max_role(ns.role, ns_max.role); ns_max.peer = max_role(ns.peer, ns_max.peer); diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h index bd989536f888..6c9d5d4a8a75 100644 --- a/drivers/block/drbd/drbd_state.h +++ b/drivers/block/drbd/drbd_state.h @@ -140,7 +140,7 @@ extern void drbd_resume_al(struct drbd_device *device); extern bool conn_all_vols_unconf(struct drbd_connection *connection); /** - * drbd_request_state() - Reqest a state change + * drbd_request_state() - Request a state change * @device: DRBD device. * @mask: mask of state bits to change. * @val: value of new state bits. diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c index 80b0f63c7075..0eeab14776e9 100644 --- a/drivers/block/drbd/drbd_strings.c +++ b/drivers/block/drbd/drbd_strings.c @@ -26,7 +26,7 @@ #include <linux/drbd.h> #include "drbd_strings.h" -static const char *drbd_conn_s_names[] = { +static const char * const drbd_conn_s_names[] = { [C_STANDALONE] = "StandAlone", [C_DISCONNECTING] = "Disconnecting", [C_UNCONNECTED] = "Unconnected", @@ -53,13 +53,13 @@ static const char *drbd_conn_s_names[] = { [C_BEHIND] = "Behind", }; -static const char *drbd_role_s_names[] = { +static const char * const drbd_role_s_names[] = { [R_PRIMARY] = "Primary", [R_SECONDARY] = "Secondary", [R_UNKNOWN] = "Unknown" }; -static const char *drbd_disk_s_names[] = { +static const char * const drbd_disk_s_names[] = { [D_DISKLESS] = "Diskless", [D_ATTACHING] = "Attaching", [D_FAILED] = "Failed", @@ -71,7 +71,7 @@ static const char *drbd_disk_s_names[] = { [D_UP_TO_DATE] = "UpToDate", }; -static const char *drbd_state_sw_errors[] = { +static const char * const drbd_state_sw_errors[] = { [-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config", [-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data", [-SS_NO_LOCAL_DISK] = "Can not resync without local disk", diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 4d87499f0d54..35dbb3dca47e 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -173,8 +173,8 @@ void drbd_peer_request_endio(struct bio *bio) { struct drbd_peer_request *peer_req = bio->bi_private; struct drbd_device *device = peer_req->peer_device->device; - int is_write = bio_data_dir(bio) == WRITE; - int is_discard = !!(bio->bi_rw & REQ_DISCARD); + bool is_write = bio_data_dir(bio) == WRITE; + bool is_discard = !!(bio_op(bio) == REQ_OP_DISCARD); if (bio->bi_error && __ratelimit(&drbd_ratelimit_state)) drbd_warn(device, "%s: error=%d s=%llus\n", @@ -248,18 +248,26 @@ void drbd_request_endio(struct bio *bio) /* to avoid recursion in __req_mod */ if (unlikely(bio->bi_error)) { - if (bio->bi_rw & REQ_DISCARD) - what = (bio->bi_error == -EOPNOTSUPP) - ? DISCARD_COMPLETED_NOTSUPP - : DISCARD_COMPLETED_WITH_ERROR; - else - what = (bio_data_dir(bio) == WRITE) - ? WRITE_COMPLETED_WITH_ERROR - : (bio_rw(bio) == READ) - ? READ_COMPLETED_WITH_ERROR - : READ_AHEAD_COMPLETED_WITH_ERROR; - } else + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + if (bio->bi_error == -EOPNOTSUPP) + what = DISCARD_COMPLETED_NOTSUPP; + else + what = DISCARD_COMPLETED_WITH_ERROR; + break; + case REQ_OP_READ: + if (bio->bi_rw & REQ_RAHEAD) + what = READ_AHEAD_COMPLETED_WITH_ERROR; + else + what = READ_COMPLETED_WITH_ERROR; + break; + default: + what = WRITE_COMPLETED_WITH_ERROR; + break; + } + } else { what = COMPLETED_OK; + } bio_put(req->private_bio); req->private_bio = ERR_PTR(bio->bi_error); @@ -320,6 +328,10 @@ void drbd_csum_bio(struct crypto_ahash *tfm, struct bio *bio, void *digest) sg_set_page(&sg, bvec.bv_page, bvec.bv_len, bvec.bv_offset); ahash_request_set_crypt(req, &sg, NULL, sg.length); crypto_ahash_update(req); + /* REQ_OP_WRITE_SAME has only one segment, + * checksum the payload only once. */ + if (bio_op(bio) == REQ_OP_WRITE_SAME) + break; } ahash_request_set_crypt(req, NULL, digest, 0); crypto_ahash_final(req); @@ -387,7 +399,7 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, /* GFP_TRY, because if there is no memory available right now, this may * be rescheduled for later. It is "only" background resync, after all. */ peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector, - size, true /* has real payload */, GFP_TRY); + size, size, GFP_TRY); if (!peer_req) goto defer; @@ -397,7 +409,8 @@ static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, spin_unlock_irq(&device->resource->req_lock); atomic_add(size >> 9, &device->rs_sect_ev); - if (drbd_submit_peer_request(device, peer_req, READ, DRBD_FAULT_RS_RD) == 0) + if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ, 0, + DRBD_FAULT_RS_RD) == 0) return 0; /* If it failed because of ENOMEM, retry should help. If it failed @@ -582,6 +595,7 @@ static int make_resync_request(struct drbd_device *const device, int cancel) int number, rollback_i, size; int align, requeue = 0; int i = 0; + int discard_granularity = 0; if (unlikely(cancel)) return 0; @@ -601,6 +615,12 @@ static int make_resync_request(struct drbd_device *const device, int cancel) return 0; } + if (connection->agreed_features & DRBD_FF_THIN_RESYNC) { + rcu_read_lock(); + discard_granularity = rcu_dereference(device->ldev->disk_conf)->rs_discard_granularity; + rcu_read_unlock(); + } + max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9; number = drbd_rs_number_requests(device); if (number <= 0) @@ -665,6 +685,9 @@ next_sector: if (sector & ((1<<(align+3))-1)) break; + if (discard_granularity && size == discard_granularity) + break; + /* do not cross extent boundaries */ if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0) break; @@ -711,7 +734,8 @@ next_sector: int err; inc_rs_pending(device); - err = drbd_send_drequest(peer_device, P_RS_DATA_REQUEST, + err = drbd_send_drequest(peer_device, + size == discard_granularity ? P_RS_THIN_REQ : P_RS_DATA_REQUEST, sector, size, ID_SYNCER); if (err) { drbd_err(device, "drbd_send_drequest() failed, aborting...\n"); @@ -828,6 +852,7 @@ static void ping_peer(struct drbd_device *device) int drbd_resync_finished(struct drbd_device *device) { + struct drbd_connection *connection = first_peer_device(device)->connection; unsigned long db, dt, dbdt; unsigned long n_oos; union drbd_state os, ns; @@ -849,8 +874,7 @@ int drbd_resync_finished(struct drbd_device *device) if (dw) { dw->w.cb = w_resync_finished; dw->device = device; - drbd_queue_work(&first_peer_device(device)->connection->sender_work, - &dw->w); + drbd_queue_work(&connection->sender_work, &dw->w); return 1; } drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n"); @@ -963,6 +987,30 @@ int drbd_resync_finished(struct drbd_device *device) _drbd_set_state(device, ns, CS_VERBOSE, NULL); out_unlock: spin_unlock_irq(&device->resource->req_lock); + + /* If we have been sync source, and have an effective fencing-policy, + * once *all* volumes are back in sync, call "unfence". */ + if (os.conn == C_SYNC_SOURCE) { + enum drbd_disk_state disk_state = D_MASK; + enum drbd_disk_state pdsk_state = D_MASK; + enum drbd_fencing_p fp = FP_DONT_CARE; + + rcu_read_lock(); + fp = rcu_dereference(device->ldev->disk_conf)->fencing; + if (fp != FP_DONT_CARE) { + struct drbd_peer_device *peer_device; + int vnr; + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk); + pdsk_state = min_t(enum drbd_disk_state, pdsk_state, device->state.pdsk); + } + } + rcu_read_unlock(); + if (disk_state == D_UP_TO_DATE && pdsk_state == D_UP_TO_DATE) + conn_khelper(connection, "unfence-peer"); + } + put_ldev(device); out: device->rs_total = 0; @@ -999,7 +1047,6 @@ static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_ /** * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST - * @device: DRBD device. * @w: work object. * @cancel: The connection will be closed anyways */ @@ -1035,6 +1082,30 @@ int w_e_end_data_req(struct drbd_work *w, int cancel) return err; } +static bool all_zero(struct drbd_peer_request *peer_req) +{ + struct page *page = peer_req->pages; + unsigned int len = peer_req->i.size; + + page_chain_for_each(page) { + unsigned int l = min_t(unsigned int, len, PAGE_SIZE); + unsigned int i, words = l / sizeof(long); + unsigned long *d; + + d = kmap_atomic(page); + for (i = 0; i < words; i++) { + if (d[i]) { + kunmap_atomic(d); + return false; + } + } + kunmap_atomic(d); + len -= l; + } + + return true; +} + /** * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST * @w: work object. @@ -1063,7 +1134,10 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel) } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) { if (likely(device->state.pdsk >= D_INCONSISTENT)) { inc_rs_pending(device); - err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req); + if (peer_req->flags & EE_RS_THIN_REQ && all_zero(peer_req)) + err = drbd_send_rs_deallocated(peer_device, peer_req); + else + err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req); } else { if (__ratelimit(&drbd_ratelimit_state)) drbd_err(device, "Not sending RSDataReply, " @@ -1633,7 +1707,7 @@ static bool use_checksum_based_resync(struct drbd_connection *connection, struct rcu_read_unlock(); return connection->agreed_pro_version >= 89 && /* supported? */ connection->csums_tfm && /* configured? */ - (csums_after_crash_only == 0 /* use for each resync? */ + (csums_after_crash_only == false /* use for each resync? */ || test_bit(CRASHED_PRIMARY, &device->flags)); /* or only after Primary crash? */ } @@ -1768,7 +1842,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) device->bm_resync_fo = 0; device->use_csums = use_checksum_based_resync(connection, device); } else { - device->use_csums = 0; + device->use_csums = false; } /* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 84708a5f8c52..c557057fe8ae 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -3822,8 +3822,9 @@ static int __floppy_read_block_0(struct block_device *bdev, int drive) bio.bi_flags |= (1 << BIO_QUIET); bio.bi_private = &cbdata; bio.bi_end_io = floppy_rb0_cb; + bio_set_op_attrs(&bio, REQ_OP_READ, 0); - submit_bio(READ, &bio); + submit_bio(&bio); process_fd_request(); init_completion(&cbdata.complete); @@ -4349,8 +4350,7 @@ static int __init do_floppy_init(void) /* to be cleaned up... */ disks[drive]->private_data = (void *)(long)drive; disks[drive]->flags |= GENHD_FL_REMOVABLE; - disks[drive]->driverfs_dev = &floppy_device[drive].dev; - add_disk(disks[drive]); + device_add_disk(&floppy_device[drive].dev, disks[drive]); } return 0; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 1fa8cc235977..075377eee0c0 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -447,7 +447,7 @@ static int lo_req_flush(struct loop_device *lo, struct request *rq) static inline void handle_partial_read(struct loop_cmd *cmd, long bytes) { - if (bytes < 0 || (cmd->rq->cmd_flags & REQ_WRITE)) + if (bytes < 0 || op_is_write(req_op(cmd->rq))) return; if (unlikely(bytes < blk_rq_bytes(cmd->rq))) { @@ -541,10 +541,10 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq) pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; - if (rq->cmd_flags & REQ_WRITE) { - if (rq->cmd_flags & REQ_FLUSH) + if (op_is_write(req_op(rq))) { + if (req_op(rq) == REQ_OP_FLUSH) ret = lo_req_flush(lo, rq); - else if (rq->cmd_flags & REQ_DISCARD) + else if (req_op(rq) == REQ_OP_DISCARD) ret = lo_discard(lo, rq, pos); else if (lo->transfer) ret = lo_write_transfer(lo, rq, pos); @@ -1659,8 +1659,8 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, if (lo->lo_state != Lo_bound) return -EIO; - if (lo->use_dio && !(cmd->rq->cmd_flags & (REQ_FLUSH | - REQ_DISCARD))) + if (lo->use_dio && (req_op(cmd->rq) != REQ_OP_FLUSH || + req_op(cmd->rq) == REQ_OP_DISCARD)) cmd->use_aio = true; else cmd->use_aio = false; @@ -1672,7 +1672,7 @@ static int loop_queue_rq(struct blk_mq_hw_ctx *hctx, static void loop_handle_cmd(struct loop_cmd *cmd) { - const bool write = cmd->rq->cmd_flags & REQ_WRITE; + const bool write = op_is_write(req_op(cmd->rq)); struct loop_device *lo = cmd->rq->q->queuedata; int ret = 0; @@ -1765,6 +1765,7 @@ static int loop_add(struct loop_device **l, int i) */ queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, lo->lo_queue); + err = -ENOMEM; disk = lo->lo_disk = alloc_disk(1 << part_shift); if (!disk) goto out_free_queue; diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 145ce2aa2e78..e937fcf71769 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -687,15 +687,13 @@ static unsigned int mg_issue_req(struct request *req, unsigned int sect_num, unsigned int sect_cnt) { - switch (rq_data_dir(req)) { - case READ: + if (rq_data_dir(req) == READ) { if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr) != MG_ERR_NONE) { mg_bad_rw_intr(host); return host->error; } - break; - case WRITE: + } else { /* TODO : handler */ outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr) @@ -714,7 +712,6 @@ static unsigned int mg_issue_req(struct request *req, mod_timer(&host->timer, jiffies + 3 * HZ); outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); - break; } return MG_ERR_NONE; } @@ -1018,7 +1015,7 @@ probe_err_7: probe_err_6: blk_cleanup_queue(host->breq); probe_err_5: - unregister_blkdev(MG_DISK_MAJ, MG_DISK_NAME); + unregister_blkdev(host->major, MG_DISK_NAME); probe_err_4: if (!prv_data->use_polling) free_irq(host->irq, host); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 6053e4659fa2..2aca98e8e427 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3765,7 +3765,7 @@ static int mtip_submit_request(struct blk_mq_hw_ctx *hctx, struct request *rq) return -ENODATA; } - if (rq->cmd_flags & REQ_DISCARD) { + if (req_op(rq) == REQ_OP_DISCARD) { int err; err = mtip_send_trim(dd, blk_rq_pos(rq), blk_rq_sectors(rq)); @@ -3956,7 +3956,6 @@ static int mtip_block_initialize(struct driver_data *dd) if (rv) goto disk_index_error; - dd->disk->driverfs_dev = &dd->pdev->dev; dd->disk->major = dd->major; dd->disk->first_minor = index * MTIP_MAX_MINORS; dd->disk->minors = MTIP_MAX_MINORS; @@ -4008,7 +4007,7 @@ skip_create_disk: /* * if rebuild pending, start the service thread, and delay the block - * queue creation and add_disk() + * queue creation and device_add_disk() */ if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) goto start_service_thread; @@ -4042,7 +4041,7 @@ skip_create_disk: set_capacity(dd->disk, capacity); /* Enable the block device and add it to /dev */ - add_disk(dd->disk); + device_add_disk(&dd->pdev->dev, dd->disk); dd->bdev = bdget_disk(dd->disk, 0); /* diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 31e73a7a40f2..6f55b262b5ce 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -282,9 +282,9 @@ static int nbd_send_req(struct nbd_device *nbd, struct request *req) if (req->cmd_type == REQ_TYPE_DRV_PRIV) type = NBD_CMD_DISC; - else if (req->cmd_flags & REQ_DISCARD) + else if (req_op(req) == REQ_OP_DISCARD) type = NBD_CMD_TRIM; - else if (req->cmd_flags & REQ_FLUSH) + else if (req_op(req) == REQ_OP_FLUSH) type = NBD_CMD_FLUSH; else if (rq_data_dir(req) == WRITE) type = NBD_CMD_WRITE; @@ -941,7 +941,7 @@ static int nbd_dev_dbg_init(struct nbd_device *nbd) debugfs_create_u64("size_bytes", 0444, dir, &nbd->bytesize); debugfs_create_u32("timeout", 0444, dir, &nbd->xmit_timeout); debugfs_create_u32("blocksize", 0444, dir, &nbd->blksize); - debugfs_create_file("flags", 0444, dir, &nbd, &nbd_dbg_flags_ops); + debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_ops); return 0; } diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index cab97593ba54..75a7f88d6717 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -448,7 +448,7 @@ static int null_lnvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) struct request *rq; struct bio *bio = rqd->bio; - rq = blk_mq_alloc_request(q, bio_rw(bio), 0); + rq = blk_mq_alloc_request(q, bio_data_dir(bio), 0); if (IS_ERR(rq)) return -ENOMEM; diff --git a/drivers/block/osdblk.c b/drivers/block/osdblk.c index c2854a2bfdb0..92900f5f0b47 100644 --- a/drivers/block/osdblk.c +++ b/drivers/block/osdblk.c @@ -321,7 +321,7 @@ static void osdblk_rq_fn(struct request_queue *q) * driver-specific, etc. */ - do_flush = rq->cmd_flags & REQ_FLUSH; + do_flush = (req_op(rq) == REQ_OP_FLUSH); do_write = (rq_data_dir(rq) == WRITE); if (!do_flush) { /* osd_flush does not use a bio */ diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index d06c62eccdf0..9393bc730acf 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1074,7 +1074,7 @@ static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt) BUG(); atomic_inc(&pkt->io_wait); - bio->bi_rw = READ; + bio_set_op_attrs(bio, REQ_OP_READ, 0); pkt_queue_bio(pd, bio); frames_read++; } @@ -1336,7 +1336,7 @@ static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt) /* Start the write request */ atomic_set(&pkt->io_wait, 1); - pkt->w_bio->bi_rw = WRITE; + bio_set_op_attrs(pkt->w_bio, REQ_OP_WRITE, 0); pkt_queue_bio(pd, pkt->w_bio); } diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 4b7e405830d7..76f33c84ce3d 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -196,7 +196,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev, dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__); while ((req = blk_fetch_request(q))) { - if (req->cmd_flags & REQ_FLUSH) { + if (req_op(req) == REQ_OP_FLUSH) { if (ps3disk_submit_flush_request(dev, req)) break; } else if (req->cmd_type == REQ_TYPE_FS) { @@ -256,7 +256,7 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data) return IRQ_HANDLED; } - if (req->cmd_flags & REQ_FLUSH) { + if (req_op(req) == REQ_OP_FLUSH) { read = 0; op = "flush"; } else { @@ -487,7 +487,6 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) gendisk->fops = &ps3disk_fops; gendisk->queue = queue; gendisk->private_data = dev; - gendisk->driverfs_dev = &dev->sbd.core; snprintf(gendisk->disk_name, sizeof(gendisk->disk_name), PS3DISK_NAME, devidx+'a'); priv->blocking_factor = dev->blk_size >> 9; @@ -499,7 +498,7 @@ static int ps3disk_probe(struct ps3_system_bus_device *_dev) gendisk->disk_name, priv->model, priv->raw_capacity >> 11, get_capacity(gendisk) >> 11); - add_disk(gendisk); + device_add_disk(&dev->sbd.core, gendisk); return 0; fail_cleanup_queue: diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c index 56847fcda086..456b4fe21559 100644 --- a/drivers/block/ps3vram.c +++ b/drivers/block/ps3vram.c @@ -773,14 +773,13 @@ static int ps3vram_probe(struct ps3_system_bus_device *dev) gendisk->fops = &ps3vram_fops; gendisk->queue = queue; gendisk->private_data = dev; - gendisk->driverfs_dev = &dev->core; strlcpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name)); set_capacity(gendisk, priv->size >> 9); dev_info(&dev->core, "%s: Using %lu MiB of GPU memory\n", gendisk->disk_name, get_capacity(gendisk) >> 11); - add_disk(gendisk); + device_add_disk(&dev->core, gendisk); return 0; fail_cleanup_queue: diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 81666a56415e..450662055d97 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3286,9 +3286,9 @@ static void rbd_queue_workfn(struct work_struct *work) goto err; } - if (rq->cmd_flags & REQ_DISCARD) + if (req_op(rq) == REQ_OP_DISCARD) op_type = OBJ_OP_DISCARD; - else if (rq->cmd_flags & REQ_WRITE) + else if (req_op(rq) == REQ_OP_WRITE) op_type = OBJ_OP_WRITE; else op_type = OBJ_OP_READ; diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index e1b8b7061d2f..f81d70b39d10 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -230,8 +230,7 @@ int rsxx_attach_dev(struct rsxx_cardinfo *card) set_capacity(card->gendisk, card->size8 >> 9); else set_capacity(card->gendisk, 0); - add_disk(card->gendisk); - + device_add_disk(CARD_TO_DEV(card), card->gendisk); card->bdev_attached = 1; } @@ -308,7 +307,6 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card) snprintf(card->gendisk->disk_name, sizeof(card->gendisk->disk_name), "rsxx%d", card->disk_id); - card->gendisk->driverfs_dev = &card->dev->dev; card->gendisk->major = card->major; card->gendisk->first_minor = 0; card->gendisk->fops = &rsxx_fops; diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c index cf8cd293abb5..5a20385f87d0 100644 --- a/drivers/block/rsxx/dma.c +++ b/drivers/block/rsxx/dma.c @@ -705,7 +705,7 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card, dma_cnt[i] = 0; } - if (bio->bi_rw & REQ_DISCARD) { + if (bio_op(bio) == REQ_OP_DISCARD) { bv_len = bio->bi_iter.bi_size; while (bv_len > 0) { diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 910e065918af..3822eae102db 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -597,7 +597,7 @@ static void skd_request_fn(struct request_queue *q) data_dir = rq_data_dir(req); io_flags = req->cmd_flags; - if (io_flags & REQ_FLUSH) + if (req_op(req) == REQ_OP_FLUSH) flush++; if (io_flags & REQ_FUA) @@ -4690,10 +4690,10 @@ static int skd_bdev_getgeo(struct block_device *bdev, struct hd_geometry *geo) return -EIO; } -static int skd_bdev_attach(struct skd_device *skdev) +static int skd_bdev_attach(struct device *parent, struct skd_device *skdev) { pr_debug("%s:%s:%d add_disk\n", skdev->name, __func__, __LINE__); - add_disk(skdev->disk); + device_add_disk(parent, skdev->disk); return 0; } @@ -4812,8 +4812,6 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_drvdata(pdev, skdev); - skdev->disk->driverfs_dev = &pdev->dev; - for (i = 0; i < SKD_MAX_BARS; i++) { skdev->mem_phys[i] = pci_resource_start(pdev, i); skdev->mem_size[i] = (u32)pci_resource_len(pdev, i); @@ -4851,7 +4849,7 @@ static int skd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) (SKD_START_WAIT_SECONDS * HZ)); if (skdev->gendisk_on > 0) { /* device came on-line after reset */ - skd_bdev_attach(skdev); + skd_bdev_attach(&pdev->dev, skdev); rc = 0; } else { /* we timed out, something is wrong with the device, diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 4b911ed96ea3..cab157331c4e 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -804,7 +804,6 @@ static int probe_disk(struct vdc_port *port) g->fops = &vdc_fops; g->queue = q; g->private_data = port; - g->driverfs_dev = &port->vio.vdev->dev; set_capacity(g, port->vdisk_size); @@ -835,7 +834,7 @@ static int probe_disk(struct vdc_port *port) port->vdisk_size, (port->vdisk_size >> (20 - 9)), port->vio.ver.major, port->vio.ver.minor); - add_disk(g); + device_add_disk(&port->vio.vdev->dev, g); return 0; } diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 7939b9f87441..d0a3e6d4515f 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -344,7 +344,6 @@ static int add_bio(struct cardinfo *card) int offset; struct bio *bio; struct bio_vec vec; - int rw; bio = card->currentbio; if (!bio && card->bio) { @@ -359,7 +358,6 @@ static int add_bio(struct cardinfo *card) if (!bio) return 0; - rw = bio_rw(bio); if (card->mm_pages[card->Ready].cnt >= DESC_PER_PAGE) return 0; @@ -369,7 +367,7 @@ static int add_bio(struct cardinfo *card) vec.bv_page, vec.bv_offset, vec.bv_len, - (rw == READ) ? + bio_op(bio) == REQ_OP_READ ? PCI_DMA_FROMDEVICE : PCI_DMA_TODEVICE); p = &card->mm_pages[card->Ready]; @@ -398,7 +396,7 @@ static int add_bio(struct cardinfo *card) DMASCR_CHAIN_EN | DMASCR_SEM_EN | pci_cmds); - if (rw == WRITE) + if (bio_op(bio) == REQ_OP_WRITE) desc->control_bits |= cpu_to_le32(DMASCR_TRANSFER_READ); desc->sem_control_bits = desc->control_bits; @@ -462,7 +460,7 @@ static void process_page(unsigned long data) le32_to_cpu(desc->local_addr)>>9, le32_to_cpu(desc->transfer_size)); dump_dmastat(card, control); - } else if ((bio->bi_rw & REQ_WRITE) && + } else if (op_is_write(bio_op(bio)) && le32_to_cpu(desc->local_addr) >> 9 == card->init_size) { card->init_size += le32_to_cpu(desc->transfer_size) >> 9; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 42758b52768c..1523e05c46fc 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -172,7 +172,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); vbr->req = req; - if (req->cmd_flags & REQ_FLUSH) { + if (req_op(req) == REQ_OP_FLUSH) { vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_FLUSH); vbr->out_hdr.sector = 0; vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); @@ -236,25 +236,22 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, static int virtblk_get_id(struct gendisk *disk, char *id_str) { struct virtio_blk *vblk = disk->private_data; + struct request_queue *q = vblk->disk->queue; struct request *req; - struct bio *bio; int err; - bio = bio_map_kern(vblk->disk->queue, id_str, VIRTIO_BLK_ID_BYTES, - GFP_KERNEL); - if (IS_ERR(bio)) - return PTR_ERR(bio); - - req = blk_make_request(vblk->disk->queue, bio, GFP_KERNEL); - if (IS_ERR(req)) { - bio_put(bio); + req = blk_get_request(q, READ, GFP_KERNEL); + if (IS_ERR(req)) return PTR_ERR(req); - } - req->cmd_type = REQ_TYPE_DRV_PRIV; + + err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL); + if (err) + goto out; + err = blk_execute_rq(vblk->disk->queue, vblk->disk, req, false); +out: blk_put_request(req); - return err; } @@ -656,7 +653,6 @@ static int virtblk_probe(struct virtio_device *vdev) vblk->disk->first_minor = index_to_minor(index); vblk->disk->private_data = vblk; vblk->disk->fops = &virtblk_fops; - vblk->disk->driverfs_dev = &vdev->dev; vblk->disk->flags |= GENHD_FL_EXT_DEVT; vblk->index = index; @@ -733,7 +729,7 @@ static int virtblk_probe(struct virtio_device *vdev) virtio_device_ready(vdev); - add_disk(vblk->disk); + device_add_disk(&vdev->dev, vblk->disk); err = device_create_file(disk_to_dev(vblk->disk), &dev_attr_serial); if (err) goto out_del_disk; diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 4809c1501d7e..4a80ee752597 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -501,7 +501,7 @@ static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif, struct xen_vbd *vbd = &blkif->vbd; int rc = -EACCES; - if ((operation != READ) && vbd->readonly) + if ((operation != REQ_OP_READ) && vbd->readonly) goto out; if (likely(req->nr_sects)) { @@ -1014,7 +1014,7 @@ static int dispatch_discard_io(struct xen_blkif_ring *ring, preq.sector_number = req->u.discard.sector_number; preq.nr_sects = req->u.discard.nr_sectors; - err = xen_vbd_translate(&preq, blkif, WRITE); + err = xen_vbd_translate(&preq, blkif, REQ_OP_WRITE); if (err) { pr_warn("access denied: DISCARD [%llu->%llu] on dev=%04x\n", preq.sector_number, @@ -1229,6 +1229,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, struct bio **biolist = pending_req->biolist; int i, nbio = 0; int operation; + int operation_flags = 0; struct blk_plug plug; bool drain = false; struct grant_page **pages = pending_req->segments; @@ -1247,17 +1248,19 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, switch (req_operation) { case BLKIF_OP_READ: ring->st_rd_req++; - operation = READ; + operation = REQ_OP_READ; break; case BLKIF_OP_WRITE: ring->st_wr_req++; - operation = WRITE_ODIRECT; + operation = REQ_OP_WRITE; + operation_flags = WRITE_ODIRECT; break; case BLKIF_OP_WRITE_BARRIER: drain = true; case BLKIF_OP_FLUSH_DISKCACHE: ring->st_f_req++; - operation = WRITE_FLUSH; + operation = REQ_OP_WRITE; + operation_flags = WRITE_FLUSH; break; default: operation = 0; /* make gcc happy */ @@ -1269,7 +1272,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, nseg = req->operation == BLKIF_OP_INDIRECT ? req->u.indirect.nr_segments : req->u.rw.nr_segments; - if (unlikely(nseg == 0 && operation != WRITE_FLUSH) || + if (unlikely(nseg == 0 && operation_flags != WRITE_FLUSH) || unlikely((req->operation != BLKIF_OP_INDIRECT) && (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) || unlikely((req->operation == BLKIF_OP_INDIRECT) && @@ -1310,7 +1313,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) { pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n", - operation == READ ? "read" : "write", + operation == REQ_OP_READ ? "read" : "write", preq.sector_number, preq.sector_number + preq.nr_sects, ring->blkif->vbd.pdevice); @@ -1369,6 +1372,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; bio->bi_iter.bi_sector = preq.sector_number; + bio_set_op_attrs(bio, operation, operation_flags); } preq.sector_number += seg[i].nsec; @@ -1376,7 +1380,7 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, /* This will be hit if the operation was a flush or discard. */ if (!bio) { - BUG_ON(operation != WRITE_FLUSH); + BUG_ON(operation_flags != WRITE_FLUSH); bio = bio_alloc(GFP_KERNEL, 0); if (unlikely(bio == NULL)) @@ -1386,20 +1390,21 @@ static int dispatch_rw_block_io(struct xen_blkif_ring *ring, bio->bi_bdev = preq.bdev; bio->bi_private = pending_req; bio->bi_end_io = end_block_io_op; + bio_set_op_attrs(bio, operation, operation_flags); } atomic_set(&pending_req->pendcnt, nbio); blk_start_plug(&plug); for (i = 0; i < nbio; i++) - submit_bio(operation, biolist[i]); + submit_bio(biolist[i]); /* Let the I/Os go.. */ blk_finish_plug(&plug); - if (operation == READ) + if (operation == REQ_OP_READ) ring->st_rd_sect += preq.nr_sects; - else if (operation & WRITE) + else if (operation == REQ_OP_WRITE) ring->st_wr_sect += preq.nr_sects; return 0; diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 3355f1cdd4e5..3cc6d1d86f1e 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -379,7 +379,7 @@ static struct attribute *xen_vbdstat_attrs[] = { NULL }; -static struct attribute_group xen_vbdstat_group = { +static const struct attribute_group xen_vbdstat_group = { .name = "statistics", .attrs = xen_vbdstat_attrs, }; @@ -480,7 +480,7 @@ static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle, if (q && test_bit(QUEUE_FLAG_WC, &q->queue_flags)) vbd->flush_support = true; - if (q && blk_queue_secdiscard(q)) + if (q && blk_queue_secure_erase(q)) vbd->discard_secure = true; pr_debug("Successful creation of handle=%04x (dom=%u)\n", @@ -715,8 +715,11 @@ static void backend_changed(struct xenbus_watch *watch, /* Front end dir is a number, which is used as the handle. */ err = kstrtoul(strrchr(dev->otherend, '/') + 1, 0, &handle); - if (err) + if (err) { + kfree(be->mode); + be->mode = NULL; return; + } be->major = major; be->minor = minor; @@ -1022,9 +1025,9 @@ static int connect_ring(struct backend_info *be) pr_debug("%s %s\n", __func__, dev->otherend); be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; - err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", - "%63s", protocol, NULL); - if (err) + err = xenbus_scanf(XBT_NIL, dev->otherend, "protocol", + "%63s", protocol); + if (err <= 0) strcpy(protocol, "unspecified, assuming default"); else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; @@ -1036,10 +1039,9 @@ static int connect_ring(struct backend_info *be) xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); return -ENOSYS; } - err = xenbus_gather(XBT_NIL, dev->otherend, - "feature-persistent", "%u", - &pers_grants, NULL); - if (err) + err = xenbus_scanf(XBT_NIL, dev->otherend, + "feature-persistent", "%u", &pers_grants); + if (err <= 0) pers_grants = 0; be->blkif->vbd.feature_gnt_persistent = pers_grants; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index ca13df854639..be4fea6a5dd3 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -196,6 +196,7 @@ struct blkfront_info unsigned int nr_ring_pages; struct request_queue *rq; unsigned int feature_flush; + unsigned int feature_fua; unsigned int feature_discard:1; unsigned int feature_secdiscard:1; unsigned int discard_granularity; @@ -207,6 +208,9 @@ struct blkfront_info struct blk_mq_tag_set tag_set; struct blkfront_ring_info *rinfo; unsigned int nr_rings; + /* Save uncomplete reqs and bios for migration. */ + struct list_head requests; + struct bio_list bio_list; }; static unsigned int nr_minors; @@ -544,7 +548,7 @@ static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_inf ring_req->u.discard.nr_sectors = blk_rq_sectors(req); ring_req->u.discard.id = id; ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req); - if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard) + if (req_op(req) == REQ_OP_SECURE_ERASE && info->feature_secdiscard) ring_req->u.discard.flag = BLKIF_DISCARD_SECURE; else ring_req->u.discard.flag = 0; @@ -743,7 +747,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri * The indirect operation can only be a BLKIF_OP_READ or * BLKIF_OP_WRITE */ - BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA)); + BUG_ON(req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA); ring_req->operation = BLKIF_OP_INDIRECT; ring_req->u.indirect.indirect_op = rq_data_dir(req) ? BLKIF_OP_WRITE : BLKIF_OP_READ; @@ -755,7 +759,7 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri ring_req->u.rw.handle = info->handle; ring_req->operation = rq_data_dir(req) ? BLKIF_OP_WRITE : BLKIF_OP_READ; - if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) { + if (req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA) { /* * Ideally we can do an unordered flush-to-disk. * In case the backend onlysupports barriers, use that. @@ -763,19 +767,14 @@ static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *ri * implement it the same way. (It's also a FLUSH+FUA, * since it is guaranteed ordered WRT previous writes.) */ - switch (info->feature_flush & - ((REQ_FLUSH|REQ_FUA))) { - case REQ_FLUSH|REQ_FUA: + if (info->feature_flush && info->feature_fua) ring_req->operation = BLKIF_OP_WRITE_BARRIER; - break; - case REQ_FLUSH: + else if (info->feature_flush) ring_req->operation = BLKIF_OP_FLUSH_DISKCACHE; - break; - default: + else ring_req->operation = 0; - } } ring_req->u.rw.nr_segments = num_grant; if (unlikely(require_extra_req)) { @@ -844,7 +843,8 @@ static int blkif_queue_request(struct request *req, struct blkfront_ring_info *r if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED)) return 1; - if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) + if (unlikely(req_op(req) == REQ_OP_DISCARD || + req_op(req) == REQ_OP_SECURE_ERASE)) return blkif_queue_discard_req(req, rinfo); else return blkif_queue_rw_req(req, rinfo); @@ -864,18 +864,22 @@ static inline bool blkif_request_flush_invalid(struct request *req, struct blkfront_info *info) { return ((req->cmd_type != REQ_TYPE_FS) || - ((req->cmd_flags & REQ_FLUSH) && - !(info->feature_flush & REQ_FLUSH)) || + ((req_op(req) == REQ_OP_FLUSH) && + !info->feature_flush) || ((req->cmd_flags & REQ_FUA) && - !(info->feature_flush & REQ_FUA))); + !info->feature_fua)); } static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *qd) { unsigned long flags; - struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data; + int qid = hctx->queue_num; + struct blkfront_info *info = hctx->queue->queuedata; + struct blkfront_ring_info *rinfo = NULL; + BUG_ON(info->nr_rings <= qid); + rinfo = &info->rinfo[qid]; blk_mq_start_request(qd->rq); spin_lock_irqsave(&rinfo->ring_lock, flags); if (RING_FULL(&rinfo->ring)) @@ -901,20 +905,9 @@ out_busy: return BLK_MQ_RQ_QUEUE_BUSY; } -static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, - unsigned int index) -{ - struct blkfront_info *info = (struct blkfront_info *)data; - - BUG_ON(info->nr_rings <= index); - hctx->driver_data = &info->rinfo[index]; - return 0; -} - static struct blk_mq_ops blkfront_mq_ops = { .queue_rq = blkif_queue_rq, .map_queue = blk_mq_map_queue, - .init_hctx = blk_mq_init_hctx, }; static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, @@ -950,6 +943,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, return PTR_ERR(rq); } + rq->queuedata = info; queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); if (info->feature_discard) { @@ -958,7 +952,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, rq->limits.discard_granularity = info->discard_granularity; rq->limits.discard_alignment = info->discard_alignment; if (info->feature_secdiscard) - queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, rq); + queue_flag_set_unlocked(QUEUE_FLAG_SECERASE, rq); } /* Hard sector size and max sectors impersonate the equiv. hardware. */ @@ -984,24 +978,22 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, return 0; } -static const char *flush_info(unsigned int feature_flush) +static const char *flush_info(struct blkfront_info *info) { - switch (feature_flush & ((REQ_FLUSH | REQ_FUA))) { - case REQ_FLUSH|REQ_FUA: + if (info->feature_flush && info->feature_fua) return "barrier: enabled;"; - case REQ_FLUSH: + else if (info->feature_flush) return "flush diskcache: enabled;"; - default: + else return "barrier or flush: disabled;"; - } } static void xlvbd_flush(struct blkfront_info *info) { - blk_queue_write_cache(info->rq, info->feature_flush & REQ_FLUSH, - info->feature_flush & REQ_FUA); + blk_queue_write_cache(info->rq, info->feature_flush ? true : false, + info->feature_fua ? true : false); pr_info("blkfront: %s: %s %s %s %s %s\n", - info->gd->disk_name, flush_info(info->feature_flush), + info->gd->disk_name, flush_info(info), "persistent grants:", info->feature_persistent ? "enabled;" : "disabled;", "indirect descriptors:", info->max_indirect_segments ? "enabled;" : "disabled;"); @@ -1142,7 +1134,6 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, gd->first_minor = minor; gd->fops = &xlvbd_block_fops; gd->private_data = info; - gd->driverfs_dev = &(info->xbdev->dev); set_capacity(gd, capacity); if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size, @@ -1600,7 +1591,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) info->feature_discard = 0; info->feature_secdiscard = 0; queue_flag_clear(QUEUE_FLAG_DISCARD, rq); - queue_flag_clear(QUEUE_FLAG_SECDISCARD, rq); + queue_flag_clear(QUEUE_FLAG_SECERASE, rq); } blk_mq_complete_request(req, error); break; @@ -1620,6 +1611,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) if (unlikely(error)) { if (error == -EOPNOTSUPP) error = 0; + info->feature_fua = 0; info->feature_flush = 0; xlvbd_flush(info); } @@ -2008,69 +2000,22 @@ static int blkif_recover(struct blkfront_info *info) { unsigned int i, r_index; struct request *req, *n; - struct blk_shadow *copy; int rc; struct bio *bio, *cloned_bio; - struct bio_list bio_list, merge_bio; unsigned int segs, offset; int pending, size; struct split_bio *split_bio; - struct list_head requests; blkfront_gather_backend_features(info); segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; blk_queue_max_segments(info->rq, segs); - bio_list_init(&bio_list); - INIT_LIST_HEAD(&requests); for (r_index = 0; r_index < info->nr_rings; r_index++) { - struct blkfront_ring_info *rinfo; - - rinfo = &info->rinfo[r_index]; - /* Stage 1: Make a safe copy of the shadow state. */ - copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow), - GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); - if (!copy) - return -ENOMEM; - - /* Stage 2: Set up free list. */ - memset(&rinfo->shadow, 0, sizeof(rinfo->shadow)); - for (i = 0; i < BLK_RING_SIZE(info); i++) - rinfo->shadow[i].req.u.rw.id = i+1; - rinfo->shadow_free = rinfo->ring.req_prod_pvt; - rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + struct blkfront_ring_info *rinfo = &info->rinfo[r_index]; rc = blkfront_setup_indirect(rinfo); - if (rc) { - kfree(copy); + if (rc) return rc; - } - - for (i = 0; i < BLK_RING_SIZE(info); i++) { - /* Not in use? */ - if (!copy[i].request) - continue; - - /* - * Get the bios in the request so we can re-queue them. - */ - if (copy[i].request->cmd_flags & - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { - /* - * Flush operations don't contain bios, so - * we need to requeue the whole request - */ - list_add(©[i].request->queuelist, &requests); - continue; - } - merge_bio.head = copy[i].request->bio; - merge_bio.tail = copy[i].request->biotail; - bio_list_merge(&bio_list, &merge_bio); - copy[i].request->bio = NULL; - blk_end_request_all(copy[i].request, 0); - } - - kfree(copy); } xenbus_switch_state(info->xbdev, XenbusStateConnected); @@ -2085,7 +2030,7 @@ static int blkif_recover(struct blkfront_info *info) kick_pending_request_queues(rinfo); } - list_for_each_entry_safe(req, n, &requests, queuelist) { + list_for_each_entry_safe(req, n, &info->requests, queuelist) { /* Requeue pending requests (flush or discard) */ list_del_init(&req->queuelist); BUG_ON(req->nr_phys_segments > segs); @@ -2093,7 +2038,7 @@ static int blkif_recover(struct blkfront_info *info) } blk_mq_kick_requeue_list(info->rq); - while ((bio = bio_list_pop(&bio_list)) != NULL) { + while ((bio = bio_list_pop(&info->bio_list)) != NULL) { /* Traverse the list of pending bios and re-queue them */ if (bio_segments(bio) > segs) { /* @@ -2114,7 +2059,7 @@ static int blkif_recover(struct blkfront_info *info) bio_trim(cloned_bio, offset, size); cloned_bio->bi_private = split_bio; cloned_bio->bi_end_io = split_bio_end; - submit_bio(cloned_bio->bi_rw, cloned_bio); + submit_bio(cloned_bio); } /* * Now we have to wait for all those smaller bios to @@ -2123,7 +2068,7 @@ static int blkif_recover(struct blkfront_info *info) continue; } /* We don't need to split this bio */ - submit_bio(bio->bi_rw, bio); + submit_bio(bio); } return 0; @@ -2139,9 +2084,47 @@ static int blkfront_resume(struct xenbus_device *dev) { struct blkfront_info *info = dev_get_drvdata(&dev->dev); int err = 0; + unsigned int i, j; dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename); + bio_list_init(&info->bio_list); + INIT_LIST_HEAD(&info->requests); + for (i = 0; i < info->nr_rings; i++) { + struct blkfront_ring_info *rinfo = &info->rinfo[i]; + struct bio_list merge_bio; + struct blk_shadow *shadow = rinfo->shadow; + + for (j = 0; j < BLK_RING_SIZE(info); j++) { + /* Not in use? */ + if (!shadow[j].request) + continue; + + /* + * Get the bios in the request so we can re-queue them. + */ + if (req_op(shadow[i].request) == REQ_OP_FLUSH || + req_op(shadow[i].request) == REQ_OP_DISCARD || + req_op(shadow[i].request) == REQ_OP_SECURE_ERASE || + shadow[j].request->cmd_flags & REQ_FUA) { + /* + * Flush operations don't contain bios, so + * we need to requeue the whole request + * + * XXX: but this doesn't make any sense for a + * write with the FUA flag set.. + */ + list_add(&shadow[j].request->queuelist, &info->requests); + continue; + } + merge_bio.head = shadow[j].request->bio; + merge_bio.tail = shadow[j].request->biotail; + bio_list_merge(&info->bio_list, &merge_bio); + shadow[j].request->bio = NULL; + blk_mq_end_request(shadow[j].request, 0); + } + } + blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); err = negotiate_mq(info); @@ -2149,6 +2132,8 @@ static int blkfront_resume(struct xenbus_device *dev) return err; err = talk_to_blkback(dev, info); + if (!err) + blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings); /* * We have to wait for the backend to switch to @@ -2212,10 +2197,9 @@ static void blkfront_setup_discard(struct blkfront_info *info) info->discard_granularity = discard_granularity; info->discard_alignment = discard_alignment; } - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "discard-secure", "%d", &discard_secure, - NULL); - if (!err) + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "discard-secure", "%u", &discard_secure); + if (err > 0) info->feature_secdiscard = !!discard_secure; } @@ -2313,10 +2297,10 @@ static void blkfront_gather_backend_features(struct blkfront_info *info) unsigned int indirect_segments; info->feature_flush = 0; + info->feature_fua = 0; - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-barrier", "%d", &barrier, - NULL); + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "feature-barrier", "%d", &barrier); /* * If there's no "feature-barrier" defined, then it means @@ -2325,38 +2309,40 @@ static void blkfront_gather_backend_features(struct blkfront_info *info) * * If there are barriers, then we use flush. */ - if (!err && barrier) - info->feature_flush = REQ_FLUSH | REQ_FUA; + if (err > 0 && barrier) { + info->feature_flush = 1; + info->feature_fua = 1; + } + /* * And if there is "feature-flush-cache" use that above * barriers. */ - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-flush-cache", "%d", &flush, - NULL); + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "feature-flush-cache", "%d", &flush); - if (!err && flush) - info->feature_flush = REQ_FLUSH; + if (err > 0 && flush) { + info->feature_flush = 1; + info->feature_fua = 0; + } - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-discard", "%d", &discard, - NULL); + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "feature-discard", "%d", &discard); - if (!err && discard) + if (err > 0 && discard) blkfront_setup_discard(info); - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-persistent", "%u", &persistent, - NULL); - if (err) + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "feature-persistent", "%d", &persistent); + if (err <= 0) info->feature_persistent = 0; else info->feature_persistent = persistent; - err = xenbus_gather(XBT_NIL, info->xbdev->otherend, - "feature-max-indirect-segments", "%u", &indirect_segments, - NULL); - if (err) + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "feature-max-indirect-segments", "%u", + &indirect_segments); + if (err <= 0) info->max_indirect_segments = 0; else info->max_indirect_segments = min(indirect_segments, @@ -2456,7 +2442,7 @@ static void blkfront_connect(struct blkfront_info *info) for (i = 0; i < info->nr_rings; i++) kick_pending_request_queues(&info->rinfo[i]); - add_disk(info->gd); + device_add_disk(&info->xbdev->dev, info->gd); info->is_ready = 1; } @@ -2485,10 +2471,23 @@ static void blkback_changed(struct xenbus_device *dev, break; case XenbusStateConnected: - if (dev->state != XenbusStateInitialised) { + /* + * talk_to_blkback sets state to XenbusStateInitialised + * and blkfront_connect sets it to XenbusStateConnected + * (if connection went OK). + * + * If the backend (or toolstack) decides to poke at backend + * state (and re-trigger the watch by setting the state repeatedly + * to XenbusStateConnected (4)) we need to deal with this. + * This is allowed as this is used to communicate to the guest + * that the size of disk has changed! + */ + if ((dev->state != XenbusStateInitialised) && + (dev->state != XenbusStateConnected)) { if (talk_to_blkback(dev, info)) break; } + blkfront_connect(info); break; diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index 386ba3d1a6ee..b8ecba6dcd3b 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -1,8 +1,7 @@ config ZRAM tristate "Compressed RAM block device support" - depends on BLOCK && SYSFS && ZSMALLOC - select LZO_COMPRESS - select LZO_DECOMPRESS + depends on BLOCK && SYSFS && ZSMALLOC && CRYPTO + select CRYPTO_LZO default n help Creates virtual block devices called /dev/zramX (X = 0, 1, ...). @@ -14,13 +13,3 @@ config ZRAM disks and maybe many more. See zram.txt for more information. - -config ZRAM_LZ4_COMPRESS - bool "Enable LZ4 algorithm support" - depends on ZRAM - select LZ4_COMPRESS - select LZ4_DECOMPRESS - default n - help - This option enables LZ4 compression algorithm support. Compression - algorithm can be changed using `comp_algorithm' device attribute.
\ No newline at end of file diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile index be0763ff57a2..9e2b79e9a990 100644 --- a/drivers/block/zram/Makefile +++ b/drivers/block/zram/Makefile @@ -1,5 +1,3 @@ -zram-y := zcomp_lzo.o zcomp.o zram_drv.o - -zram-$(CONFIG_ZRAM_LZ4_COMPRESS) += zcomp_lz4.o +zram-y := zcomp.o zram_drv.o obj-$(CONFIG_ZRAM) += zram.o diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c index b51a816d766b..4b5cd3a7b2b6 100644 --- a/drivers/block/zram/zcomp.c +++ b/drivers/block/zram/zcomp.c @@ -14,108 +14,150 @@ #include <linux/wait.h> #include <linux/sched.h> #include <linux/cpu.h> +#include <linux/crypto.h> #include "zcomp.h" -#include "zcomp_lzo.h" -#ifdef CONFIG_ZRAM_LZ4_COMPRESS -#include "zcomp_lz4.h" -#endif -static struct zcomp_backend *backends[] = { - &zcomp_lzo, -#ifdef CONFIG_ZRAM_LZ4_COMPRESS - &zcomp_lz4, +static const char * const backends[] = { + "lzo", +#if IS_ENABLED(CONFIG_CRYPTO_LZ4) + "lz4", +#endif +#if IS_ENABLED(CONFIG_CRYPTO_DEFLATE) + "deflate", +#endif +#if IS_ENABLED(CONFIG_CRYPTO_LZ4HC) + "lz4hc", +#endif +#if IS_ENABLED(CONFIG_CRYPTO_842) + "842", #endif NULL }; -static struct zcomp_backend *find_backend(const char *compress) -{ - int i = 0; - while (backends[i]) { - if (sysfs_streq(compress, backends[i]->name)) - break; - i++; - } - return backends[i]; -} - -static void zcomp_strm_free(struct zcomp *comp, struct zcomp_strm *zstrm) +static void zcomp_strm_free(struct zcomp_strm *zstrm) { - if (zstrm->private) - comp->backend->destroy(zstrm->private); + if (!IS_ERR_OR_NULL(zstrm->tfm)) + crypto_free_comp(zstrm->tfm); free_pages((unsigned long)zstrm->buffer, 1); kfree(zstrm); } /* - * allocate new zcomp_strm structure with ->private initialized by + * allocate new zcomp_strm structure with ->tfm initialized by * backend, return NULL on error */ -static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp, gfp_t flags) +static struct zcomp_strm *zcomp_strm_alloc(struct zcomp *comp) { - struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), flags); + struct zcomp_strm *zstrm = kmalloc(sizeof(*zstrm), GFP_KERNEL); if (!zstrm) return NULL; - zstrm->private = comp->backend->create(flags); + zstrm->tfm = crypto_alloc_comp(comp->name, 0, 0); /* * allocate 2 pages. 1 for compressed data, plus 1 extra for the * case when compressed size is larger than the original one */ - zstrm->buffer = (void *)__get_free_pages(flags | __GFP_ZERO, 1); - if (!zstrm->private || !zstrm->buffer) { - zcomp_strm_free(comp, zstrm); + zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1); + if (IS_ERR_OR_NULL(zstrm->tfm) || !zstrm->buffer) { + zcomp_strm_free(zstrm); zstrm = NULL; } return zstrm; } +bool zcomp_available_algorithm(const char *comp) +{ + int i = 0; + + while (backends[i]) { + if (sysfs_streq(comp, backends[i])) + return true; + i++; + } + + /* + * Crypto does not ignore a trailing new line symbol, + * so make sure you don't supply a string containing + * one. + * This also means that we permit zcomp initialisation + * with any compressing algorithm known to crypto api. + */ + return crypto_has_comp(comp, 0, 0) == 1; +} + /* show available compressors */ ssize_t zcomp_available_show(const char *comp, char *buf) { + bool known_algorithm = false; ssize_t sz = 0; int i = 0; - while (backends[i]) { - if (!strcmp(comp, backends[i]->name)) + for (; backends[i]; i++) { + if (!strcmp(comp, backends[i])) { + known_algorithm = true; sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "[%s] ", backends[i]->name); - else + "[%s] ", backends[i]); + } else { sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, - "%s ", backends[i]->name); - i++; + "%s ", backends[i]); + } } + + /* + * Out-of-tree module known to crypto api or a missing + * entry in `backends'. + */ + if (!known_algorithm && crypto_has_comp(comp, 0, 0) == 1) + sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2, + "[%s] ", comp); + sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n"); return sz; } -bool zcomp_available_algorithm(const char *comp) -{ - return find_backend(comp) != NULL; -} - -struct zcomp_strm *zcomp_strm_find(struct zcomp *comp) +struct zcomp_strm *zcomp_stream_get(struct zcomp *comp) { return *get_cpu_ptr(comp->stream); } -void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm) +void zcomp_stream_put(struct zcomp *comp) { put_cpu_ptr(comp->stream); } -int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, - const unsigned char *src, size_t *dst_len) +int zcomp_compress(struct zcomp_strm *zstrm, + const void *src, unsigned int *dst_len) { - return comp->backend->compress(src, zstrm->buffer, dst_len, - zstrm->private); + /* + * Our dst memory (zstrm->buffer) is always `2 * PAGE_SIZE' sized + * because sometimes we can endup having a bigger compressed data + * due to various reasons: for example compression algorithms tend + * to add some padding to the compressed buffer. Speaking of padding, + * comp algorithm `842' pads the compressed length to multiple of 8 + * and returns -ENOSP when the dst memory is not big enough, which + * is not something that ZRAM wants to see. We can handle the + * `compressed_size > PAGE_SIZE' case easily in ZRAM, but when we + * receive -ERRNO from the compressing backend we can't help it + * anymore. To make `842' happy we need to tell the exact size of + * the dst buffer, zram_drv will take care of the fact that + * compressed buffer is too big. + */ + *dst_len = PAGE_SIZE * 2; + + return crypto_comp_compress(zstrm->tfm, + src, PAGE_SIZE, + zstrm->buffer, dst_len); } -int zcomp_decompress(struct zcomp *comp, const unsigned char *src, - size_t src_len, unsigned char *dst) +int zcomp_decompress(struct zcomp_strm *zstrm, + const void *src, unsigned int src_len, void *dst) { - return comp->backend->decompress(src, src_len, dst); + unsigned int dst_len = PAGE_SIZE; + + return crypto_comp_decompress(zstrm->tfm, + src, src_len, + dst, &dst_len); } static int __zcomp_cpu_notifier(struct zcomp *comp, @@ -127,7 +169,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp, case CPU_UP_PREPARE: if (WARN_ON(*per_cpu_ptr(comp->stream, cpu))) break; - zstrm = zcomp_strm_alloc(comp, GFP_KERNEL); + zstrm = zcomp_strm_alloc(comp); if (IS_ERR_OR_NULL(zstrm)) { pr_err("Can't allocate a compression stream\n"); return NOTIFY_BAD; @@ -138,7 +180,7 @@ static int __zcomp_cpu_notifier(struct zcomp *comp, case CPU_UP_CANCELED: zstrm = *per_cpu_ptr(comp->stream, cpu); if (!IS_ERR_OR_NULL(zstrm)) - zcomp_strm_free(comp, zstrm); + zcomp_strm_free(zstrm); *per_cpu_ptr(comp->stream, cpu) = NULL; break; default: @@ -209,18 +251,16 @@ void zcomp_destroy(struct zcomp *comp) struct zcomp *zcomp_create(const char *compress) { struct zcomp *comp; - struct zcomp_backend *backend; int error; - backend = find_backend(compress); - if (!backend) + if (!zcomp_available_algorithm(compress)) return ERR_PTR(-EINVAL); comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL); if (!comp) return ERR_PTR(-ENOMEM); - comp->backend = backend; + comp->name = compress; error = zcomp_init(comp); if (error) { kfree(comp); diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h index ffd88cb747fe..478cac2ed465 100644 --- a/drivers/block/zram/zcomp.h +++ b/drivers/block/zram/zcomp.h @@ -13,33 +13,15 @@ struct zcomp_strm { /* compression/decompression buffer */ void *buffer; - /* - * The private data of the compression stream, only compression - * stream backend can touch this (e.g. compression algorithm - * working memory) - */ - void *private; -}; - -/* static compression backend */ -struct zcomp_backend { - int (*compress)(const unsigned char *src, unsigned char *dst, - size_t *dst_len, void *private); - - int (*decompress)(const unsigned char *src, size_t src_len, - unsigned char *dst); - - void *(*create)(gfp_t flags); - void (*destroy)(void *private); - - const char *name; + struct crypto_comp *tfm; }; /* dynamic per-device compression frontend */ struct zcomp { struct zcomp_strm * __percpu *stream; - struct zcomp_backend *backend; struct notifier_block notifier; + + const char *name; }; ssize_t zcomp_available_show(const char *comp, char *buf); @@ -48,14 +30,14 @@ bool zcomp_available_algorithm(const char *comp); struct zcomp *zcomp_create(const char *comp); void zcomp_destroy(struct zcomp *comp); -struct zcomp_strm *zcomp_strm_find(struct zcomp *comp); -void zcomp_strm_release(struct zcomp *comp, struct zcomp_strm *zstrm); +struct zcomp_strm *zcomp_stream_get(struct zcomp *comp); +void zcomp_stream_put(struct zcomp *comp); -int zcomp_compress(struct zcomp *comp, struct zcomp_strm *zstrm, - const unsigned char *src, size_t *dst_len); +int zcomp_compress(struct zcomp_strm *zstrm, + const void *src, unsigned int *dst_len); -int zcomp_decompress(struct zcomp *comp, const unsigned char *src, - size_t src_len, unsigned char *dst); +int zcomp_decompress(struct zcomp_strm *zstrm, + const void *src, unsigned int src_len, void *dst); bool zcomp_set_max_streams(struct zcomp *comp, int num_strm); #endif /* _ZCOMP_H_ */ diff --git a/drivers/block/zram/zcomp_lz4.c b/drivers/block/zram/zcomp_lz4.c deleted file mode 100644 index 0110086accba..000000000000 --- a/drivers/block/zram/zcomp_lz4.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 2014 Sergey Senozhatsky. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/lz4.h> -#include <linux/vmalloc.h> -#include <linux/mm.h> - -#include "zcomp_lz4.h" - -static void *zcomp_lz4_create(gfp_t flags) -{ - void *ret; - - ret = kmalloc(LZ4_MEM_COMPRESS, flags); - if (!ret) - ret = __vmalloc(LZ4_MEM_COMPRESS, - flags | __GFP_HIGHMEM, - PAGE_KERNEL); - return ret; -} - -static void zcomp_lz4_destroy(void *private) -{ - kvfree(private); -} - -static int zcomp_lz4_compress(const unsigned char *src, unsigned char *dst, - size_t *dst_len, void *private) -{ - /* return : Success if return 0 */ - return lz4_compress(src, PAGE_SIZE, dst, dst_len, private); -} - -static int zcomp_lz4_decompress(const unsigned char *src, size_t src_len, - unsigned char *dst) -{ - size_t dst_len = PAGE_SIZE; - /* return : Success if return 0 */ - return lz4_decompress_unknownoutputsize(src, src_len, dst, &dst_len); -} - -struct zcomp_backend zcomp_lz4 = { - .compress = zcomp_lz4_compress, - .decompress = zcomp_lz4_decompress, - .create = zcomp_lz4_create, - .destroy = zcomp_lz4_destroy, - .name = "lz4", -}; diff --git a/drivers/block/zram/zcomp_lz4.h b/drivers/block/zram/zcomp_lz4.h deleted file mode 100644 index 60613fb29dd8..000000000000 --- a/drivers/block/zram/zcomp_lz4.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) 2014 Sergey Senozhatsky. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _ZCOMP_LZ4_H_ -#define _ZCOMP_LZ4_H_ - -#include "zcomp.h" - -extern struct zcomp_backend zcomp_lz4; - -#endif /* _ZCOMP_LZ4_H_ */ diff --git a/drivers/block/zram/zcomp_lzo.c b/drivers/block/zram/zcomp_lzo.c deleted file mode 100644 index ed7a1f0549ec..000000000000 --- a/drivers/block/zram/zcomp_lzo.c +++ /dev/null @@ -1,56 +0,0 @@ -/* - * Copyright (C) 2014 Sergey Senozhatsky. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#include <linux/kernel.h> -#include <linux/slab.h> -#include <linux/lzo.h> -#include <linux/vmalloc.h> -#include <linux/mm.h> - -#include "zcomp_lzo.h" - -static void *lzo_create(gfp_t flags) -{ - void *ret; - - ret = kmalloc(LZO1X_MEM_COMPRESS, flags); - if (!ret) - ret = __vmalloc(LZO1X_MEM_COMPRESS, - flags | __GFP_HIGHMEM, - PAGE_KERNEL); - return ret; -} - -static void lzo_destroy(void *private) -{ - kvfree(private); -} - -static int lzo_compress(const unsigned char *src, unsigned char *dst, - size_t *dst_len, void *private) -{ - int ret = lzo1x_1_compress(src, PAGE_SIZE, dst, dst_len, private); - return ret == LZO_E_OK ? 0 : ret; -} - -static int lzo_decompress(const unsigned char *src, size_t src_len, - unsigned char *dst) -{ - size_t dst_len = PAGE_SIZE; - int ret = lzo1x_decompress_safe(src, src_len, dst, &dst_len); - return ret == LZO_E_OK ? 0 : ret; -} - -struct zcomp_backend zcomp_lzo = { - .compress = lzo_compress, - .decompress = lzo_decompress, - .create = lzo_create, - .destroy = lzo_destroy, - .name = "lzo", -}; diff --git a/drivers/block/zram/zcomp_lzo.h b/drivers/block/zram/zcomp_lzo.h deleted file mode 100644 index 128c5807fa14..000000000000 --- a/drivers/block/zram/zcomp_lzo.h +++ /dev/null @@ -1,17 +0,0 @@ -/* - * Copyright (C) 2014 Sergey Senozhatsky. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License - * as published by the Free Software Foundation; either version - * 2 of the License, or (at your option) any later version. - */ - -#ifndef _ZCOMP_LZO_H_ -#define _ZCOMP_LZO_H_ - -#include "zcomp.h" - -extern struct zcomp_backend zcomp_lzo; - -#endif /* _ZCOMP_LZO_H_ */ diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 8fcad8b761f1..7454cf188c8e 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -342,9 +342,16 @@ static ssize_t comp_algorithm_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) { struct zram *zram = dev_to_zram(dev); + char compressor[CRYPTO_MAX_ALG_NAME]; size_t sz; - if (!zcomp_available_algorithm(buf)) + strlcpy(compressor, buf, sizeof(compressor)); + /* ignore trailing newline */ + sz = strlen(compressor); + if (sz > 0 && compressor[sz - 1] == '\n') + compressor[sz - 1] = 0x00; + + if (!zcomp_available_algorithm(compressor)) return -EINVAL; down_write(&zram->init_lock); @@ -353,13 +360,8 @@ static ssize_t comp_algorithm_store(struct device *dev, pr_info("Can't change algorithm for initialized device\n"); return -EBUSY; } - strlcpy(zram->compressor, buf, sizeof(zram->compressor)); - - /* ignore trailing newline */ - sz = strlen(zram->compressor); - if (sz > 0 && zram->compressor[sz - 1] == '\n') - zram->compressor[sz - 1] = 0x00; + strlcpy(zram->compressor, compressor, sizeof(compressor)); up_write(&zram->init_lock); return len; } @@ -563,7 +565,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) unsigned char *cmem; struct zram_meta *meta = zram->meta; unsigned long handle; - size_t size; + unsigned int size; bit_spin_lock(ZRAM_ACCESS, &meta->table[index].value); handle = meta->table[index].handle; @@ -576,10 +578,14 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) } cmem = zs_map_object(meta->mem_pool, handle, ZS_MM_RO); - if (size == PAGE_SIZE) + if (size == PAGE_SIZE) { copy_page(mem, cmem); - else - ret = zcomp_decompress(zram->comp, cmem, size, mem); + } else { + struct zcomp_strm *zstrm = zcomp_stream_get(zram->comp); + + ret = zcomp_decompress(zstrm, cmem, size, mem); + zcomp_stream_put(zram->comp); + } zs_unmap_object(meta->mem_pool, handle); bit_spin_unlock(ZRAM_ACCESS, &meta->table[index].value); @@ -646,7 +652,7 @@ static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec, u32 index, int offset) { int ret = 0; - size_t clen; + unsigned int clen; unsigned long handle = 0; struct page *page; unsigned char *user_mem, *cmem, *src, *uncmem = NULL; @@ -695,8 +701,8 @@ compress_again: goto out; } - zstrm = zcomp_strm_find(zram->comp); - ret = zcomp_compress(zram->comp, zstrm, uncmem, &clen); + zstrm = zcomp_stream_get(zram->comp); + ret = zcomp_compress(zstrm, uncmem, &clen); if (!is_partial_io(bvec)) { kunmap_atomic(user_mem); user_mem = NULL; @@ -732,19 +738,21 @@ compress_again: handle = zs_malloc(meta->mem_pool, clen, __GFP_KSWAPD_RECLAIM | __GFP_NOWARN | - __GFP_HIGHMEM); + __GFP_HIGHMEM | + __GFP_MOVABLE); if (!handle) { - zcomp_strm_release(zram->comp, zstrm); + zcomp_stream_put(zram->comp); zstrm = NULL; atomic64_inc(&zram->stats.writestall); handle = zs_malloc(meta->mem_pool, clen, - GFP_NOIO | __GFP_HIGHMEM); + GFP_NOIO | __GFP_HIGHMEM | + __GFP_MOVABLE); if (handle) goto compress_again; - pr_err("Error allocating memory for compressed page: %u, size=%zu\n", + pr_err("Error allocating memory for compressed page: %u, size=%u\n", index, clen); ret = -ENOMEM; goto out; @@ -769,7 +777,7 @@ compress_again: memcpy(cmem, src, clen); } - zcomp_strm_release(zram->comp, zstrm); + zcomp_stream_put(zram->comp); zstrm = NULL; zs_unmap_object(meta->mem_pool, handle); @@ -789,7 +797,7 @@ compress_again: atomic64_inc(&zram->stats.pages_stored); out: if (zstrm) - zcomp_strm_release(zram->comp, zstrm); + zcomp_stream_put(zram->comp); if (is_partial_io(bvec)) kfree(uncmem); return ret; @@ -874,7 +882,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) offset = (bio->bi_iter.bi_sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT; - if (unlikely(bio->bi_rw & REQ_DISCARD)) { + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) { zram_bio_discard(zram, index, offset, bio); bio_endio(bio); return; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 3f5bf66a27e4..74fcf10da374 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -15,8 +15,9 @@ #ifndef _ZRAM_DRV_H_ #define _ZRAM_DRV_H_ -#include <linux/spinlock.h> +#include <linux/rwsem.h> #include <linux/zsmalloc.h> +#include <linux/crypto.h> #include "zcomp.h" @@ -113,7 +114,7 @@ struct zram { * we can store in a disk. */ u64 disksize; /* bytes */ - char compressor[10]; + char compressor[CRYPTO_MAX_ALG_NAME]; /* * zram is claimed so open request will be failed */ |