diff options
Diffstat (limited to 'drivers/block')
-rw-r--r-- | drivers/block/cciss_scsi.c | 11 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_actlog.c | 8 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_bitmap.c | 19 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_int.h | 45 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_main.c | 97 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_nl.c | 40 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_proc.c | 3 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_receiver.c | 38 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_req.c | 45 | ||||
-rw-r--r-- | drivers/block/drbd/drbd_worker.c | 12 | ||||
-rw-r--r-- | drivers/block/floppy.c | 24 | ||||
-rw-r--r-- | drivers/block/mg_disk.c | 13 | ||||
-rw-r--r-- | drivers/block/nbd.c | 8 | ||||
-rw-r--r-- | drivers/block/rbd.c | 816 | ||||
-rw-r--r-- | drivers/block/rbd_types.h | 1 | ||||
-rw-r--r-- | drivers/block/umem.c | 37 | ||||
-rw-r--r-- | drivers/block/virtio_blk.c | 115 | ||||
-rw-r--r-- | drivers/block/xen-blkfront.c | 5 |
18 files changed, 815 insertions, 522 deletions
diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index acda773b3720..38aa6dda6b81 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -763,16 +763,7 @@ static void complete_scsi_command(CommandList_struct *c, int timeout, { case CMD_TARGET_STATUS: /* Pass it up to the upper layers... */ - if( ei->ScsiStatus) - { -#if 0 - printk(KERN_WARNING "cciss: cmd %p " - "has SCSI Status = %x\n", - c, ei->ScsiStatus); -#endif - cmd->result |= (ei->ScsiStatus << 1); - } - else { /* scsi status is zero??? How??? */ + if (!ei->ScsiStatus) { /* Ordinarily, this case should never happen, but there is a bug in some released firmware revisions that allows it to happen diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index e54e31b02b88..3fbef018ce55 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -411,7 +411,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) + mdev->ldev->md.al_offset + mdev->al_tr_pos; if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) - drbd_chk_io_error(mdev, 1, true); + drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); if (++mdev->al_tr_pos > div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT)) @@ -876,7 +876,11 @@ int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size, unsigned int enr, count = 0; struct lc_element *e; - if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { + /* this should be an empty REQ_FLUSH */ + if (size == 0) + return 0; + + if (size < 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) { dev_err(DEV, "sector: %llus, size: %d\n", (unsigned long long)sector, size); return 0; diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index fcb956bb4b4c..d84566496746 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -889,6 +889,7 @@ struct bm_aio_ctx { unsigned int done; unsigned flags; #define BM_AIO_COPY_PAGES 1 +#define BM_WRITE_ALL_PAGES 2 int error; struct kref kref; }; @@ -1059,7 +1060,8 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx) break; if (rw & WRITE) { - if (bm_test_page_unchanged(b->bm_pages[i])) { + if (!(flags & BM_WRITE_ALL_PAGES) && + bm_test_page_unchanged(b->bm_pages[i])) { dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i); continue; } @@ -1096,7 +1098,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w if (ctx->error) { dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); - drbd_chk_io_error(mdev, 1, true); + drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); err = -EIO; /* ctx->error ? */ } @@ -1141,6 +1143,17 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) } /** + * drbd_bm_write_all() - Write the whole bitmap to its on disk location. + * @mdev: DRBD device. + * + * Will write all pages. + */ +int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local) +{ + return bm_rw(mdev, WRITE, BM_WRITE_ALL_PAGES, 0); +} + +/** * drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed. * @mdev: DRBD device. * @upper_idx: 0: write all changed pages; +ve: page index to stop scanning for changed pages @@ -1212,7 +1225,7 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); if (ctx->error) - drbd_chk_io_error(mdev, 1, true); + drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); /* that should force detach, so the in memory bitmap will be * gone in a moment as well. */ diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 02f013a073a7..b953cc7c9c00 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -813,7 +813,6 @@ enum { SIGNAL_ASENDER, /* whether asender wants to be interrupted */ SEND_PING, /* whether asender should send a ping asap */ - UNPLUG_QUEUED, /* only relevant with kernel 2.4 */ UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */ MD_DIRTY, /* current uuids and flags not yet on disk */ DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */ @@ -824,7 +823,6 @@ enum { CRASHED_PRIMARY, /* This node was a crashed primary. * Gets cleared when the state.conn * goes into C_CONNECTED state. */ - NO_BARRIER_SUPP, /* underlying block device doesn't implement barriers */ CONSIDER_RESYNC, MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ @@ -834,6 +832,7 @@ enum { BITMAP_IO_QUEUED, /* Started bitmap IO */ GO_DISKLESS, /* Disk is being detached, on io-error or admin request. */ WAS_IO_ERROR, /* Local disk failed returned IO error */ + FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */ RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ NET_CONGESTED, /* The data socket is congested */ @@ -851,6 +850,13 @@ enum { AL_SUSPENDED, /* Activity logging is currently suspended. */ AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ STATE_SENT, /* Do not change state/UUIDs while this is set */ + + CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC) + * pending, from drbd worker context. + * If set, bdi_write_congested() returns true, + * so shrink_page_list() would not recurse into, + * and potentially deadlock on, this drbd worker. + */ }; struct drbd_bitmap; /* opaque for drbd_conf */ @@ -1130,8 +1136,8 @@ struct drbd_conf { int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */ int rs_planed; /* resync sectors already planned */ atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */ - int peer_max_bio_size; - int local_max_bio_size; + unsigned int peer_max_bio_size; + unsigned int local_max_bio_size; }; static inline struct drbd_conf *minor_to_mdev(unsigned int minor) @@ -1435,9 +1441,9 @@ struct bm_extent { * hash table. */ #define HT_SHIFT 8 #define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT)) -#define DRBD_MAX_BIO_SIZE_SAFE (1 << 12) /* Works always = 4k */ +#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */ -#define DRBD_MAX_SIZE_H80_PACKET (1 << 15) /* The old header only allows packets up to 32Kib data */ +#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* The old header only allows packets up to 32Kib data */ /* Number of elements in the app_reads_hash */ #define APP_R_HSIZE 15 @@ -1463,6 +1469,7 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr); extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); +extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local); extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr); @@ -1840,12 +1847,20 @@ static inline int drbd_request_state(struct drbd_conf *mdev, return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED); } +enum drbd_force_detach_flags { + DRBD_IO_ERROR, + DRBD_META_IO_ERROR, + DRBD_FORCE_DETACH, +}; + #define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__) -static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, const char *where) +static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, + enum drbd_force_detach_flags forcedetach, + const char *where) { switch (mdev->ldev->dc.on_io_error) { case EP_PASS_ON: - if (!forcedetach) { + if (forcedetach == DRBD_IO_ERROR) { if (__ratelimit(&drbd_ratelimit_state)) dev_err(DEV, "Local IO failed in %s.\n", where); if (mdev->state.disk > D_INCONSISTENT) @@ -1856,6 +1871,8 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, case EP_DETACH: case EP_CALL_HELPER: set_bit(WAS_IO_ERROR, &mdev->flags); + if (forcedetach == DRBD_FORCE_DETACH) + set_bit(FORCE_DETACH, &mdev->flags); if (mdev->state.disk > D_FAILED) { _drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL); dev_err(DEV, @@ -1875,7 +1892,7 @@ static inline void __drbd_chk_io_error_(struct drbd_conf *mdev, int forcedetach, */ #define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__) static inline void drbd_chk_io_error_(struct drbd_conf *mdev, - int error, int forcedetach, const char *where) + int error, enum drbd_force_detach_flags forcedetach, const char *where) { if (error) { unsigned long flags; @@ -2405,15 +2422,17 @@ static inline void dec_ap_bio(struct drbd_conf *mdev) int ap_bio = atomic_dec_return(&mdev->ap_bio_cnt); D_ASSERT(ap_bio >= 0); + + if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { + if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) + drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); + } + /* this currently does wake_up for every dec_ap_bio! * maybe rather introduce some type of hysteresis? * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */ if (ap_bio < mxb) wake_up(&mdev->misc_wait); - if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { - if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) - drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); - } } static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val) diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 920ede2829d6..f93a0320e952 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -79,6 +79,7 @@ static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused); static void md_sync_timer_fn(unsigned long data); static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused); static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused); +static void _tl_clear(struct drbd_conf *mdev); MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, " "Lars Ellenberg <lars@linbit.com>"); @@ -432,19 +433,10 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) /* Actions operating on the disk state, also want to work on requests that got barrier acked. */ - switch (what) { - case fail_frozen_disk_io: - case restart_frozen_disk_io: - list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { - req = list_entry(le, struct drbd_request, tl_requests); - _req_mod(req, what); - } - case connection_lost_while_pending: - case resend: - break; - default: - dev_err(DEV, "what = %d in _tl_restart()\n", what); + list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { + req = list_entry(le, struct drbd_request, tl_requests); + _req_mod(req, what); } } @@ -459,11 +451,16 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) */ void tl_clear(struct drbd_conf *mdev) { + spin_lock_irq(&mdev->req_lock); + _tl_clear(mdev); + spin_unlock_irq(&mdev->req_lock); +} + +static void _tl_clear(struct drbd_conf *mdev) +{ struct list_head *le, *tle; struct drbd_request *r; - spin_lock_irq(&mdev->req_lock); - _tl_restart(mdev, connection_lost_while_pending); /* we expect this list to be empty. */ @@ -482,7 +479,6 @@ void tl_clear(struct drbd_conf *mdev) memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *)); - spin_unlock_irq(&mdev->req_lock); } void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) @@ -1476,12 +1472,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, if (ns.susp_fen) { /* case1: The outdate peer handler is successful: */ if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) { - tl_clear(mdev); if (test_bit(NEW_CUR_UUID, &mdev->flags)) { drbd_uuid_new_current(mdev); clear_bit(NEW_CUR_UUID, &mdev->flags); } spin_lock_irq(&mdev->req_lock); + _tl_clear(mdev); _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL); spin_unlock_irq(&mdev->req_lock); } @@ -1514,6 +1510,13 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Do not change the order of the if above and the two below... */ if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ + /* we probably will start a resync soon. + * make sure those things are properly reset. */ + mdev->rs_total = 0; + mdev->rs_failed = 0; + atomic_set(&mdev->rs_pending_cnt, 0); + drbd_rs_cancel_all(mdev); + drbd_send_uuids(mdev); drbd_send_state(mdev, ns); } @@ -1630,9 +1633,24 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, eh = mdev->ldev->dc.on_io_error; was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); - /* Immediately allow completion of all application IO, that waits - for completion from the local disk. */ - tl_abort_disk_io(mdev); + if (was_io_error && eh == EP_CALL_HELPER) + drbd_khelper(mdev, "local-io-error"); + + /* Immediately allow completion of all application IO, + * that waits for completion from the local disk, + * if this was a force-detach due to disk_timeout + * or administrator request (drbdsetup detach --force). + * Do NOT abort otherwise. + * Aborting local requests may cause serious problems, + * if requests are completed to upper layers already, + * and then later the already submitted local bio completes. + * This can cause DMA into former bio pages that meanwhile + * have been re-used for other things. + * So aborting local requests may cause crashes, + * or even worse, silent data corruption. + */ + if (test_and_clear_bit(FORCE_DETACH, &mdev->flags)) + tl_abort_disk_io(mdev); /* current state still has to be D_FAILED, * there is only one way out: to D_DISKLESS, @@ -1653,9 +1671,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, drbd_md_sync(mdev); } put_ldev(mdev); - - if (was_io_error && eh == EP_CALL_HELPER) - drbd_khelper(mdev, "local-io-error"); } /* second half of local IO error, failure to attach, @@ -1669,10 +1684,6 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, "ASSERT FAILED: disk is %s while going diskless\n", drbd_disk_str(mdev->state.disk)); - mdev->rs_total = 0; - mdev->rs_failed = 0; - atomic_set(&mdev->rs_pending_cnt, 0); - if (ns.conn >= C_CONNECTED) drbd_send_state(mdev, ns); @@ -2194,7 +2205,8 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl { struct p_sizes p; sector_t d_size, u_size; - int q_order_type, max_bio_size; + int q_order_type; + unsigned int max_bio_size; int ok; if (get_ldev_if_state(mdev, D_NEGOTIATING)) { @@ -2203,7 +2215,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl u_size = mdev->ldev->dc.disk_size; q_order_type = drbd_queue_order_type(mdev); max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9; - max_bio_size = min_t(int, max_bio_size, DRBD_MAX_BIO_SIZE); + max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE); put_ldev(mdev); } else { d_size = 0; @@ -2214,7 +2226,7 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */ if (mdev->agreed_pro_version <= 94) - max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET); p.d_size = cpu_to_be64(d_size); p.u_size = cpu_to_be64(u_size); @@ -3521,9 +3533,9 @@ static void drbd_cleanup(void) } /** - * drbd_congested() - Callback for pdflush + * drbd_congested() - Callback for the flusher thread * @congested_data: User data - * @bdi_bits: Bits pdflush is currently interested in + * @bdi_bits: Bits the BDI flusher thread is currently interested in * * Returns 1<<BDI_async_congested and/or 1<<BDI_sync_congested if we are congested. */ @@ -3541,6 +3553,22 @@ static int drbd_congested(void *congested_data, int bdi_bits) goto out; } + if (test_bit(CALLBACK_PENDING, &mdev->flags)) { + r |= (1 << BDI_async_congested); + /* Without good local data, we would need to read from remote, + * and that would need the worker thread as well, which is + * currently blocked waiting for that usermode helper to + * finish. + */ + if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) + r |= (1 << BDI_sync_congested); + else + put_ldev(mdev); + r &= bdi_bits; + reason = 'c'; + goto out; + } + if (get_ldev(mdev)) { q = bdev_get_queue(mdev->ldev->backing_bdev); r = bdi_congested(&q->backing_dev_info, bdi_bits); @@ -3604,6 +3632,7 @@ struct drbd_conf *drbd_new_device(unsigned int minor) q->backing_dev_info.congested_data = mdev; blk_queue_make_request(q, drbd_make_request); + blk_queue_flush(q, REQ_FLUSH | REQ_FUA); /* Setting the max_hw_sectors to an odd value of 8kibyte here This triggers a max_bio_size message upon first attach or connect */ blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8); @@ -3870,7 +3899,7 @@ void drbd_md_sync(struct drbd_conf *mdev) if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) { /* this was a try anyways ... */ dev_err(DEV, "meta data update failed!\n"); - drbd_chk_io_error(mdev, 1, true); + drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR); } /* Update mdev->ldev->md.la_size_sect, @@ -3950,9 +3979,9 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) spin_lock_irq(&mdev->req_lock); if (mdev->state.conn < C_CONNECTED) { - int peer; + unsigned int peer; peer = be32_to_cpu(buffer->la_peer_max_bio_size); - peer = max_t(int, peer, DRBD_MAX_BIO_SIZE_SAFE); + peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE); mdev->peer_max_bio_size = peer; } spin_unlock_irq(&mdev->req_lock); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 6d4de6a72e80..edb490aad8b4 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -147,6 +147,9 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) char *argv[] = {usermode_helper, cmd, mb, NULL }; int ret; + if (current == mdev->worker.task) + set_bit(CALLBACK_PENDING, &mdev->flags); + snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev)); if (get_net_conf(mdev)) { @@ -189,6 +192,9 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) usermode_helper, cmd, mb, (ret >> 8) & 0xff, ret); + if (current == mdev->worker.task) + clear_bit(CALLBACK_PENDING, &mdev->flags); + if (ret < 0) /* Ignore any ERRNOs we got. */ ret = 0; @@ -668,8 +674,8 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds la_size_changed && md_moved ? "size changed and md moved" : la_size_changed ? "size changed" : "md moved"); /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ - err = drbd_bitmap_io(mdev, &drbd_bm_write, - "size changed", BM_LOCKED_MASK); + err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write, + "size changed", BM_LOCKED_MASK); if (err) { rv = dev_size_error; goto out; @@ -795,8 +801,8 @@ static int drbd_check_al_size(struct drbd_conf *mdev) static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_size) { struct request_queue * const q = mdev->rq_queue; - int max_hw_sectors = max_bio_size >> 9; - int max_segments = 0; + unsigned int max_hw_sectors = max_bio_size >> 9; + unsigned int max_segments = 0; if (get_ldev_if_state(mdev, D_ATTACHING)) { struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue; @@ -829,7 +835,7 @@ static void drbd_setup_queue_param(struct drbd_conf *mdev, unsigned int max_bio_ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) { - int now, new, local, peer; + unsigned int now, new, local, peer; now = queue_max_hw_sectors(mdev->rq_queue) << 9; local = mdev->local_max_bio_size; /* Eventually last known value, from volatile memory */ @@ -840,13 +846,14 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) mdev->local_max_bio_size = local; put_ldev(mdev); } + local = min(local, DRBD_MAX_BIO_SIZE); /* We may ignore peer limits if the peer is modern enough. Because new from 8.3.8 onwards the peer can use multiple BIOs for a single peer_request */ if (mdev->state.conn >= C_CONNECTED) { if (mdev->agreed_pro_version < 94) { - peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ } else if (mdev->agreed_pro_version == 94) peer = DRBD_MAX_SIZE_H80_PACKET; @@ -854,10 +861,10 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) peer = DRBD_MAX_BIO_SIZE; } - new = min_t(int, local, peer); + new = min(local, peer); if (mdev->state.role == R_PRIMARY && new < now) - dev_err(DEV, "ASSERT FAILED new < now; (%d < %d)\n", new, now); + dev_err(DEV, "ASSERT FAILED new < now; (%u < %u)\n", new, now); if (new != now) dev_info(DEV, "max BIO size = %u\n", new); @@ -950,6 +957,14 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp * to realize a "hot spare" feature (not that I'd recommend that) */ wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); + /* make sure there is no leftover from previous force-detach attempts */ + clear_bit(FORCE_DETACH, &mdev->flags); + + /* and no leftover from previously aborted resync or verify, either */ + mdev->rs_total = 0; + mdev->rs_failed = 0; + atomic_set(&mdev->rs_pending_cnt, 0); + /* allocation not in the IO path, cqueue thread context */ nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL); if (!nbc) { @@ -1345,6 +1360,7 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, } if (dt.detach_force) { + set_bit(FORCE_DETACH, &mdev->flags); drbd_force_state(mdev, NS(disk, D_FAILED)); reply->ret_code = SS_SUCCESS; goto out; @@ -1962,9 +1978,11 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl int retcode; /* If there is still bitmap IO pending, probably because of a previous - * resync just being finished, wait for it before requesting a new resync. */ + * resync just being finished, wait for it before requesting a new resync. + * Also wait for it's after_state_ch(). */ drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); + drbd_flush_workqueue(mdev); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); @@ -2003,9 +2021,11 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re int retcode; /* If there is still bitmap IO pending, probably because of a previous - * resync just being finished, wait for it before requesting a new resync. */ + * resync just being finished, wait for it before requesting a new resync. + * Also wait for it's after_state_ch(). */ drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); + drbd_flush_workqueue(mdev); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 869bada2ed06..5496104f90b9 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -245,6 +245,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v) mdev->state.role == R_SECONDARY) { seq_printf(seq, "%2d: cs:Unconfigured\n", i); } else { + /* reset mdev->congestion_reason */ + bdi_rw_congested(&mdev->rq_queue->backing_dev_info); + seq_printf(seq, "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n" " ns:%u nr:%u dw:%u dr:%u al:%u bm:%u " diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index ea4836e0ae98..c74ca2df7431 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -277,6 +277,9 @@ static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net) atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use; int i; + if (page == NULL) + return; + if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count) i = page_chain_free(page); else { @@ -316,7 +319,7 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, gfp_t gfp_mask) __must_hold(local) { struct drbd_epoch_entry *e; - struct page *page; + struct page *page = NULL; unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT; if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE)) @@ -329,9 +332,11 @@ struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev, return NULL; } - page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); - if (!page) - goto fail; + if (data_size) { + page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT)); + if (!page) + goto fail; + } INIT_HLIST_NODE(&e->collision); e->epoch = NULL; @@ -1270,7 +1275,6 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ data_size -= dgs; - ERR_IF(data_size == 0) return NULL; ERR_IF(data_size & 0x1ff) return NULL; ERR_IF(data_size > DRBD_MAX_BIO_SIZE) return NULL; @@ -1291,6 +1295,9 @@ read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __ if (!e) return NULL; + if (!data_size) + return e; + ds = data_size; page = e->pages; page_chain_for_each(page) { @@ -1715,6 +1722,10 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned dp_flags = be32_to_cpu(p->dp_flags); rw |= wire_flags_to_bio(mdev, dp_flags); + if (e->pages == NULL) { + D_ASSERT(e->size == 0); + D_ASSERT(dp_flags & DP_FLUSH); + } if (dp_flags & DP_MAY_SET_IN_SYNC) e->flags |= EE_MAY_SET_IN_SYNC; @@ -3801,11 +3812,18 @@ void drbd_free_tl_hash(struct drbd_conf *mdev) mdev->ee_hash = NULL; mdev->ee_hash_s = 0; - /* paranoia code */ - for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) - if (h->first) - dev_err(DEV, "ASSERT FAILED tl_hash[%u] == %p, expected NULL\n", - (int)(h - mdev->tl_hash), h->first); + /* We may not have had the chance to wait for all locally pending + * application requests. The hlist_add_fake() prevents access after + * free on master bio completion. */ + for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) { + struct drbd_request *req; + struct hlist_node *pos, *n; + hlist_for_each_entry_safe(req, pos, n, h, collision) { + hlist_del_init(&req->collision); + hlist_add_fake(&req->collision); + } + } + kfree(mdev->tl_hash); mdev->tl_hash = NULL; mdev->tl_hash_s = 0; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 8e93a6ac9bb6..01b2ac641c7b 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -455,7 +455,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; - __drbd_chk_io_error(mdev, false); + __drbd_chk_io_error(mdev, DRBD_IO_ERROR); _req_may_be_done_not_susp(req, m); break; @@ -477,7 +477,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; } - __drbd_chk_io_error(mdev, false); + __drbd_chk_io_error(mdev, DRBD_IO_ERROR); goto_queue_for_net_read: @@ -695,6 +695,12 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; case resend: + /* Simply complete (local only) READs. */ + if (!(req->rq_state & RQ_WRITE) && !req->w.cb) { + _req_may_be_done(req, m); + break; + } + /* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK before the connection loss (B&C only); only P_BARRIER_ACK was missing. Trowing them out of the TL here by pretending we got a BARRIER_ACK @@ -834,7 +840,15 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns req->private_bio = NULL; } if (rw == WRITE) { - remote = 1; + /* Need to replicate writes. Unless it is an empty flush, + * which is better mapped to a DRBD P_BARRIER packet, + * also for drbd wire protocol compatibility reasons. */ + if (unlikely(size == 0)) { + /* The only size==0 bios we expect are empty flushes. */ + D_ASSERT(bio->bi_rw & REQ_FLUSH); + remote = 0; + } else + remote = 1; } else { /* READ || READA */ if (local) { @@ -870,8 +884,11 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns * extent. This waits for any resync activity in the corresponding * resync extent to finish, and, if necessary, pulls in the target * extent into the activity log, which involves further disk io because - * of transactional on-disk meta data updates. */ - if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) { + * of transactional on-disk meta data updates. + * Empty flushes don't need to go into the activity log, they can only + * flush data for pending writes which are already in there. */ + if (rw == WRITE && local && size + && !test_bit(AL_SUSPENDED, &mdev->flags)) { req->rq_state |= RQ_IN_ACT_LOG; drbd_al_begin_io(mdev, sector); } @@ -994,7 +1011,10 @@ allocate_barrier: if (rw == WRITE && _req_conflicts(req)) goto fail_conflicting; - list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); + /* no point in adding empty flushes to the transfer log, + * they are mapped to drbd barriers already. */ + if (likely(size!=0)) + list_add_tail(&req->tl_requests, &mdev->newest_tle->requests); /* NOTE remote first: to get the concurrent write detection right, * we must register the request before start of local IO. */ @@ -1014,6 +1034,14 @@ allocate_barrier: mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96) maybe_pull_ahead(mdev); + /* If this was a flush, queue a drbd barrier/start a new epoch. + * Unless the current epoch was empty anyways, or we are not currently + * replicating, in which case there is no point. */ + if (unlikely(bio->bi_rw & REQ_FLUSH) + && mdev->newest_tle->n_writes + && drbd_should_do_remote(mdev->state)) + queue_barrier(mdev); + spin_unlock_irq(&mdev->req_lock); kfree(b); /* if someone else has beaten us to it... */ @@ -1111,13 +1139,12 @@ void drbd_make_request(struct request_queue *q, struct bio *bio) /* * what we "blindly" assume: */ - D_ASSERT(bio->bi_size > 0); D_ASSERT((bio->bi_size & 0x1ff) == 0); /* to make some things easier, force alignment of requests within the * granularity of our hash tables */ s_enr = bio->bi_sector >> HT_SHIFT; - e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; + e_enr = bio->bi_size ? (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT : s_enr; if (likely(s_enr == e_enr)) { do { @@ -1275,7 +1302,7 @@ void request_timer_fn(unsigned long data) time_after(now, req->start_time + dt) && !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) { dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); - __drbd_chk_io_error(mdev, 1); + __drbd_chk_io_error(mdev, DRBD_FORCE_DETACH); } nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; spin_unlock_irq(&mdev->req_lock); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 620c70ff2231..6bce2cc179d4 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -111,7 +111,7 @@ void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local) if (list_empty(&mdev->read_ee)) wake_up(&mdev->ee_wait); if (test_bit(__EE_WAS_ERROR, &e->flags)) - __drbd_chk_io_error(mdev, false); + __drbd_chk_io_error(mdev, DRBD_IO_ERROR); spin_unlock_irqrestore(&mdev->req_lock, flags); drbd_queue_work(&mdev->data.work, &e->w); @@ -154,7 +154,7 @@ static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(lo : list_empty(&mdev->active_ee); if (test_bit(__EE_WAS_ERROR, &e->flags)) - __drbd_chk_io_error(mdev, false); + __drbd_chk_io_error(mdev, DRBD_IO_ERROR); spin_unlock_irqrestore(&mdev->req_lock, flags); if (is_syncer_req) @@ -1501,14 +1501,6 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) return; } - if (mdev->state.conn < C_AHEAD) { - /* In case a previous resync run was aborted by an IO error/detach on the peer. */ - drbd_rs_cancel_all(mdev); - /* This should be done when we abort the resync. We definitely do not - want to have this for connections going back and forth between - Ahead/Behind and SyncSource/SyncTarget */ - } - if (side == C_SYNC_TARGET) { /* Since application IO was locked out during C_WF_BITMAP_T and C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 553f43a90953..a7d6347aaa79 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -191,6 +191,7 @@ static int print_unex = 1; #include <linux/mutex.h> #include <linux/io.h> #include <linux/uaccess.h> +#include <linux/async.h> /* * PS/2 floppies have much slower step rates than regular floppies. @@ -2516,8 +2517,7 @@ static int make_raw_rw_request(void) set_fdc((long)current_req->rq_disk->private_data); raw_cmd = &default_raw_cmd; - raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_DISK | - FD_RAW_NEED_SEEK; + raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK; raw_cmd->cmd_count = NR_RW; if (rq_data_dir(current_req) == READ) { raw_cmd->flags |= FD_RAW_READ; @@ -4123,7 +4123,7 @@ static struct kobject *floppy_find(dev_t dev, int *part, void *data) return get_disk(disks[drive]); } -static int __init floppy_init(void) +static int __init do_floppy_init(void) { int i, unit, drive; int err, dr; @@ -4338,6 +4338,24 @@ out_put_disk: return err; } +#ifndef MODULE +static __init void floppy_async_init(void *data, async_cookie_t cookie) +{ + do_floppy_init(); +} +#endif + +static int __init floppy_init(void) +{ +#ifdef MODULE + return do_floppy_init(); +#else + /* Don't hold up the bootup by the floppy initialization */ + async_schedule(floppy_async_init, NULL); + return 0; +#endif +} + static const struct io_region { int offset; int size; diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 76fa3deaee84..1788f491e0fb 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -780,9 +780,9 @@ static const struct block_device_operations mg_disk_ops = { .getgeo = mg_getgeo }; -static int mg_suspend(struct platform_device *plat_dev, pm_message_t state) +static int mg_suspend(struct device *dev) { - struct mg_drv_data *prv_data = plat_dev->dev.platform_data; + struct mg_drv_data *prv_data = dev->platform_data; struct mg_host *host = prv_data->host; if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) @@ -804,9 +804,9 @@ static int mg_suspend(struct platform_device *plat_dev, pm_message_t state) return 0; } -static int mg_resume(struct platform_device *plat_dev) +static int mg_resume(struct device *dev) { - struct mg_drv_data *prv_data = plat_dev->dev.platform_data; + struct mg_drv_data *prv_data = dev->platform_data; struct mg_host *host = prv_data->host; if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) @@ -825,6 +825,8 @@ static int mg_resume(struct platform_device *plat_dev) return 0; } +static SIMPLE_DEV_PM_OPS(mg_pm, mg_suspend, mg_resume); + static int mg_probe(struct platform_device *plat_dev) { struct mg_host *host; @@ -1074,11 +1076,10 @@ static int mg_remove(struct platform_device *plat_dev) static struct platform_driver mg_disk_driver = { .probe = mg_probe, .remove = mg_remove, - .suspend = mg_suspend, - .resume = mg_resume, .driver = { .name = MG_DEV_NAME, .owner = THIS_MODULE, + .pm = &mg_pm, } }; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 061427a75d37..d07c9f7fded6 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -154,6 +154,7 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, struct msghdr msg; struct kvec iov; sigset_t blocked, oldset; + unsigned long pflags = current->flags; if (unlikely(!sock)) { dev_err(disk_to_dev(nbd->disk), @@ -167,8 +168,9 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, siginitsetinv(&blocked, sigmask(SIGKILL)); sigprocmask(SIG_SETMASK, &blocked, &oldset); + current->flags |= PF_MEMALLOC; do { - sock->sk->sk_allocation = GFP_NOIO; + sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC; iov.iov_base = buf; iov.iov_len = size; msg.msg_name = NULL; @@ -214,6 +216,7 @@ static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, } while (size > 0); sigprocmask(SIG_SETMASK, &oldset, NULL); + tsk_restore_flags(current, pflags, PF_MEMALLOC); return result; } @@ -405,6 +408,7 @@ static int nbd_do_it(struct nbd_device *nbd) BUG_ON(nbd->magic != NBD_MAGIC); + sk_set_memalloc(nbd->sock->sk); nbd->pid = task_pid_nr(current); ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); if (ret) { @@ -481,7 +485,7 @@ static void nbd_handle_req(struct nbd_device *nbd, struct request *req) nbd_end_request(req); } else { spin_lock(&nbd->queue_lock); - list_add(&req->queuelist, &nbd->queue_head); + list_add_tail(&req->queuelist, &nbd->queue_head); spin_unlock(&nbd->queue_lock); } diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 8f428a8ab003..9917943a3572 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -55,8 +55,6 @@ #define RBD_MINORS_PER_MAJOR 256 /* max minors per blkdev */ -#define RBD_MAX_MD_NAME_LEN (RBD_MAX_OBJ_NAME_LEN + sizeof(RBD_SUFFIX)) -#define RBD_MAX_POOL_NAME_LEN 64 #define RBD_MAX_SNAP_NAME_LEN 32 #define RBD_MAX_OPT_LEN 1024 @@ -78,13 +76,12 @@ */ struct rbd_image_header { u64 image_size; - char block_name[32]; + char *object_prefix; __u8 obj_order; __u8 crypt_type; __u8 comp_type; struct ceph_snap_context *snapc; size_t snap_names_len; - u64 snap_seq; u32 total_snaps; char *snap_names; @@ -150,7 +147,7 @@ struct rbd_snap { * a single device */ struct rbd_device { - int id; /* blkdev unique id */ + int dev_id; /* blkdev unique id */ int major; /* blkdev assigned major */ struct gendisk *disk; /* blkdev's gendisk and rq */ @@ -163,20 +160,24 @@ struct rbd_device { spinlock_t lock; /* queue lock */ struct rbd_image_header header; - char obj[RBD_MAX_OBJ_NAME_LEN]; /* rbd image name */ - int obj_len; - char obj_md_name[RBD_MAX_MD_NAME_LEN]; /* hdr nm. */ - char pool_name[RBD_MAX_POOL_NAME_LEN]; - int poolid; + char *image_name; + size_t image_name_len; + char *header_name; + char *pool_name; + int pool_id; struct ceph_osd_event *watch_event; struct ceph_osd_request *watch_request; /* protects updating the header */ struct rw_semaphore header_rwsem; - char snap_name[RBD_MAX_SNAP_NAME_LEN]; + /* name of the snapshot this device reads from */ + char *snap_name; + /* id of the snapshot this device reads from */ u64 snap_id; /* current snapshot id */ - int read_only; + /* whether the snap_id this device reads from still exists */ + bool snap_exists; + int read_only; struct list_head node; @@ -201,8 +202,7 @@ static ssize_t rbd_snap_add(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); -static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev, - struct rbd_snap *snap); +static void __rbd_remove_snap_dev(struct rbd_snap *snap); static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count); @@ -240,7 +240,7 @@ static void rbd_put_dev(struct rbd_device *rbd_dev) put_device(&rbd_dev->dev); } -static int __rbd_refresh_header(struct rbd_device *rbd_dev); +static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver); static int rbd_open(struct block_device *bdev, fmode_t mode) { @@ -273,9 +273,9 @@ static const struct block_device_operations rbd_bd_ops = { /* * Initialize an rbd client instance. - * We own *opt. + * We own *ceph_opts. */ -static struct rbd_client *rbd_client_create(struct ceph_options *opt, +static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts, struct rbd_options *rbd_opts) { struct rbd_client *rbdc; @@ -291,10 +291,10 @@ static struct rbd_client *rbd_client_create(struct ceph_options *opt, mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - rbdc->client = ceph_create_client(opt, rbdc, 0, 0); + rbdc->client = ceph_create_client(ceph_opts, rbdc, 0, 0); if (IS_ERR(rbdc->client)) goto out_mutex; - opt = NULL; /* Now rbdc->client is responsible for opt */ + ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */ ret = ceph_open_session(rbdc->client); if (ret < 0) @@ -317,23 +317,23 @@ out_mutex: mutex_unlock(&ctl_mutex); kfree(rbdc); out_opt: - if (opt) - ceph_destroy_options(opt); + if (ceph_opts) + ceph_destroy_options(ceph_opts); return ERR_PTR(ret); } /* * Find a ceph client with specific addr and configuration. */ -static struct rbd_client *__rbd_client_find(struct ceph_options *opt) +static struct rbd_client *__rbd_client_find(struct ceph_options *ceph_opts) { struct rbd_client *client_node; - if (opt->flags & CEPH_OPT_NOSHARE) + if (ceph_opts->flags & CEPH_OPT_NOSHARE) return NULL; list_for_each_entry(client_node, &rbd_client_list, node) - if (ceph_compare_options(opt, client_node->client) == 0) + if (!ceph_compare_options(ceph_opts, client_node->client)) return client_node; return NULL; } @@ -349,7 +349,7 @@ enum { /* string args above */ }; -static match_table_t rbdopt_tokens = { +static match_table_t rbd_opts_tokens = { {Opt_notify_timeout, "notify_timeout=%d"}, /* int args above */ /* string args above */ @@ -358,11 +358,11 @@ static match_table_t rbdopt_tokens = { static int parse_rbd_opts_token(char *c, void *private) { - struct rbd_options *rbdopt = private; + struct rbd_options *rbd_opts = private; substring_t argstr[MAX_OPT_ARGS]; int token, intval, ret; - token = match_token(c, rbdopt_tokens, argstr); + token = match_token(c, rbd_opts_tokens, argstr); if (token < 0) return -EINVAL; @@ -383,7 +383,7 @@ static int parse_rbd_opts_token(char *c, void *private) switch (token) { case Opt_notify_timeout: - rbdopt->notify_timeout = intval; + rbd_opts->notify_timeout = intval; break; default: BUG_ON(token); @@ -400,7 +400,7 @@ static struct rbd_client *rbd_get_client(const char *mon_addr, char *options) { struct rbd_client *rbdc; - struct ceph_options *opt; + struct ceph_options *ceph_opts; struct rbd_options *rbd_opts; rbd_opts = kzalloc(sizeof(*rbd_opts), GFP_KERNEL); @@ -409,29 +409,29 @@ static struct rbd_client *rbd_get_client(const char *mon_addr, rbd_opts->notify_timeout = RBD_NOTIFY_TIMEOUT_DEFAULT; - opt = ceph_parse_options(options, mon_addr, - mon_addr + mon_addr_len, - parse_rbd_opts_token, rbd_opts); - if (IS_ERR(opt)) { + ceph_opts = ceph_parse_options(options, mon_addr, + mon_addr + mon_addr_len, + parse_rbd_opts_token, rbd_opts); + if (IS_ERR(ceph_opts)) { kfree(rbd_opts); - return ERR_CAST(opt); + return ERR_CAST(ceph_opts); } spin_lock(&rbd_client_list_lock); - rbdc = __rbd_client_find(opt); + rbdc = __rbd_client_find(ceph_opts); if (rbdc) { /* using an existing client */ kref_get(&rbdc->kref); spin_unlock(&rbd_client_list_lock); - ceph_destroy_options(opt); + ceph_destroy_options(ceph_opts); kfree(rbd_opts); return rbdc; } spin_unlock(&rbd_client_list_lock); - rbdc = rbd_client_create(opt, rbd_opts); + rbdc = rbd_client_create(ceph_opts, rbd_opts); if (IS_ERR(rbdc)) kfree(rbd_opts); @@ -480,46 +480,60 @@ static void rbd_coll_release(struct kref *kref) kfree(coll); } +static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk) +{ + return !memcmp(&ondisk->text, + RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)); +} + /* * Create a new header structure, translate header format from the on-disk * header. */ static int rbd_header_from_disk(struct rbd_image_header *header, struct rbd_image_header_ondisk *ondisk, - u32 allocated_snaps, - gfp_t gfp_flags) + u32 allocated_snaps) { - u32 i, snap_count; + u32 snap_count; - if (memcmp(ondisk, RBD_HEADER_TEXT, sizeof(RBD_HEADER_TEXT))) + if (!rbd_dev_ondisk_valid(ondisk)) return -ENXIO; snap_count = le32_to_cpu(ondisk->snap_count); - if (snap_count > (UINT_MAX - sizeof(struct ceph_snap_context)) - / sizeof (*ondisk)) + if (snap_count > (SIZE_MAX - sizeof(struct ceph_snap_context)) + / sizeof (u64)) return -EINVAL; header->snapc = kmalloc(sizeof(struct ceph_snap_context) + snap_count * sizeof(u64), - gfp_flags); + GFP_KERNEL); if (!header->snapc) return -ENOMEM; - header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); if (snap_count) { + header->snap_names_len = le64_to_cpu(ondisk->snap_names_len); header->snap_names = kmalloc(header->snap_names_len, - gfp_flags); + GFP_KERNEL); if (!header->snap_names) goto err_snapc; header->snap_sizes = kmalloc(snap_count * sizeof(u64), - gfp_flags); + GFP_KERNEL); if (!header->snap_sizes) goto err_names; } else { + WARN_ON(ondisk->snap_names_len); + header->snap_names_len = 0; header->snap_names = NULL; header->snap_sizes = NULL; } - memcpy(header->block_name, ondisk->block_name, + + header->object_prefix = kmalloc(sizeof (ondisk->block_name) + 1, + GFP_KERNEL); + if (!header->object_prefix) + goto err_sizes; + + memcpy(header->object_prefix, ondisk->block_name, sizeof(ondisk->block_name)); + header->object_prefix[sizeof (ondisk->block_name)] = '\0'; header->image_size = le64_to_cpu(ondisk->image_size); header->obj_order = ondisk->options.order; @@ -527,11 +541,13 @@ static int rbd_header_from_disk(struct rbd_image_header *header, header->comp_type = ondisk->options.comp_type; atomic_set(&header->snapc->nref, 1); - header->snap_seq = le64_to_cpu(ondisk->snap_seq); + header->snapc->seq = le64_to_cpu(ondisk->snap_seq); header->snapc->num_snaps = snap_count; header->total_snaps = snap_count; if (snap_count && allocated_snaps == snap_count) { + int i; + for (i = 0; i < snap_count; i++) { header->snapc->snaps[i] = le64_to_cpu(ondisk->snaps[i].id); @@ -540,16 +556,22 @@ static int rbd_header_from_disk(struct rbd_image_header *header, } /* copy snapshot names */ - memcpy(header->snap_names, &ondisk->snaps[i], + memcpy(header->snap_names, &ondisk->snaps[snap_count], header->snap_names_len); } return 0; +err_sizes: + kfree(header->snap_sizes); + header->snap_sizes = NULL; err_names: kfree(header->snap_names); + header->snap_names = NULL; err_snapc: kfree(header->snapc); + header->snapc = NULL; + return -ENOMEM; } @@ -575,52 +597,50 @@ static int snap_by_name(struct rbd_image_header *header, const char *snap_name, return -ENOENT; } -static int rbd_header_set_snap(struct rbd_device *dev, u64 *size) +static int rbd_header_set_snap(struct rbd_device *rbd_dev, u64 *size) { - struct rbd_image_header *header = &dev->header; - struct ceph_snap_context *snapc = header->snapc; - int ret = -ENOENT; - - BUILD_BUG_ON(sizeof (dev->snap_name) < sizeof (RBD_SNAP_HEAD_NAME)); + int ret; - down_write(&dev->header_rwsem); + down_write(&rbd_dev->header_rwsem); - if (!memcmp(dev->snap_name, RBD_SNAP_HEAD_NAME, + if (!memcmp(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME))) { - if (header->total_snaps) - snapc->seq = header->snap_seq; - else - snapc->seq = 0; - dev->snap_id = CEPH_NOSNAP; - dev->read_only = 0; + rbd_dev->snap_id = CEPH_NOSNAP; + rbd_dev->snap_exists = false; + rbd_dev->read_only = 0; if (size) - *size = header->image_size; + *size = rbd_dev->header.image_size; } else { - ret = snap_by_name(header, dev->snap_name, &snapc->seq, size); + u64 snap_id = 0; + + ret = snap_by_name(&rbd_dev->header, rbd_dev->snap_name, + &snap_id, size); if (ret < 0) goto done; - dev->snap_id = snapc->seq; - dev->read_only = 1; + rbd_dev->snap_id = snap_id; + rbd_dev->snap_exists = true; + rbd_dev->read_only = 1; } ret = 0; done: - up_write(&dev->header_rwsem); + up_write(&rbd_dev->header_rwsem); return ret; } static void rbd_header_free(struct rbd_image_header *header) { - kfree(header->snapc); - kfree(header->snap_names); + kfree(header->object_prefix); kfree(header->snap_sizes); + kfree(header->snap_names); + ceph_put_snap_context(header->snapc); } /* * get the actual striped segment name, offset and length */ static u64 rbd_get_segment(struct rbd_image_header *header, - const char *block_name, + const char *object_prefix, u64 ofs, u64 len, char *seg_name, u64 *segofs) { @@ -628,7 +648,7 @@ static u64 rbd_get_segment(struct rbd_image_header *header, if (seg_name) snprintf(seg_name, RBD_MAX_SEG_NAME_LEN, - "%s.%012llx", block_name, seg); + "%s.%012llx", object_prefix, seg); ofs = ofs & ((1 << header->obj_order) - 1); len = min_t(u64, len, (1 << header->obj_order) - ofs); @@ -726,9 +746,8 @@ static struct bio *bio_chain_clone(struct bio **old, struct bio **next, * split_bio will BUG_ON if this is not the case */ dout("bio_chain_clone split! total=%d remaining=%d" - "bi_size=%d\n", - (int)total, (int)len-total, - (int)old_chain->bi_size); + "bi_size=%u\n", + total, len - total, old_chain->bi_size); /* split the bio. We'll release it either in the next call, or it will have to be released outside */ @@ -777,22 +796,24 @@ err_out: /* * helpers for osd request op vectors. */ -static int rbd_create_rw_ops(struct ceph_osd_req_op **ops, - int num_ops, - int opcode, - u32 payload_len) -{ - *ops = kzalloc(sizeof(struct ceph_osd_req_op) * (num_ops + 1), - GFP_NOIO); - if (!*ops) - return -ENOMEM; - (*ops)[0].op = opcode; +static struct ceph_osd_req_op *rbd_create_rw_ops(int num_ops, + int opcode, u32 payload_len) +{ + struct ceph_osd_req_op *ops; + + ops = kzalloc(sizeof (*ops) * (num_ops + 1), GFP_NOIO); + if (!ops) + return NULL; + + ops[0].op = opcode; + /* * op extent offset and length will be set later on * in calc_raw_layout() */ - (*ops)[0].payload_len = payload_len; - return 0; + ops[0].payload_len = payload_len; + + return ops; } static void rbd_destroy_ops(struct ceph_osd_req_op *ops) @@ -808,8 +829,8 @@ static void rbd_coll_end_req_index(struct request *rq, struct request_queue *q; int min, max, i; - dout("rbd_coll_end_req_index %p index %d ret %d len %lld\n", - coll, index, ret, len); + dout("rbd_coll_end_req_index %p index %d ret %d len %llu\n", + coll, index, ret, (unsigned long long) len); if (!rq) return; @@ -848,16 +869,15 @@ static void rbd_coll_end_req(struct rbd_request *req, * Send ceph osd request */ static int rbd_do_request(struct request *rq, - struct rbd_device *dev, + struct rbd_device *rbd_dev, struct ceph_snap_context *snapc, u64 snapid, - const char *obj, u64 ofs, u64 len, + const char *object_name, u64 ofs, u64 len, struct bio *bio, struct page **pages, int num_pages, int flags, struct ceph_osd_req_op *ops, - int num_reply, struct rbd_req_coll *coll, int coll_index, void (*rbd_cb)(struct ceph_osd_request *req, @@ -887,15 +907,13 @@ static int rbd_do_request(struct request *rq, req_data->coll_index = coll_index; } - dout("rbd_do_request obj=%s ofs=%lld len=%lld\n", obj, len, ofs); - - down_read(&dev->header_rwsem); + dout("rbd_do_request object_name=%s ofs=%llu len=%llu\n", object_name, + (unsigned long long) ofs, (unsigned long long) len); - osdc = &dev->rbd_client->client->osdc; + osdc = &rbd_dev->rbd_client->client->osdc; req = ceph_osdc_alloc_request(osdc, flags, snapc, ops, false, GFP_NOIO, pages, bio); if (!req) { - up_read(&dev->header_rwsem); ret = -ENOMEM; goto done_pages; } @@ -912,7 +930,7 @@ static int rbd_do_request(struct request *rq, reqhead = req->r_request->front.iov_base; reqhead->snapid = cpu_to_le64(CEPH_NOSNAP); - strncpy(req->r_oid, obj, sizeof(req->r_oid)); + strncpy(req->r_oid, object_name, sizeof(req->r_oid)); req->r_oid_len = strlen(req->r_oid); layout = &req->r_file_layout; @@ -920,7 +938,7 @@ static int rbd_do_request(struct request *rq, layout->fl_stripe_unit = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); layout->fl_stripe_count = cpu_to_le32(1); layout->fl_object_size = cpu_to_le32(1 << RBD_MAX_OBJ_ORDER); - layout->fl_pg_pool = cpu_to_le32(dev->poolid); + layout->fl_pg_pool = cpu_to_le32(rbd_dev->pool_id); ceph_calc_raw_layout(osdc, layout, snapid, ofs, &len, &bno, req, ops); @@ -929,7 +947,6 @@ static int rbd_do_request(struct request *rq, snapc, &mtime, req->r_oid, req->r_oid_len); - up_read(&dev->header_rwsem); if (linger_req) { ceph_osdc_set_request_linger(osdc, req); @@ -944,8 +961,9 @@ static int rbd_do_request(struct request *rq, ret = ceph_osdc_wait_request(osdc, req); if (ver) *ver = le64_to_cpu(req->r_reassert_version.version); - dout("reassert_ver=%lld\n", - le64_to_cpu(req->r_reassert_version.version)); + dout("reassert_ver=%llu\n", + (unsigned long long) + le64_to_cpu(req->r_reassert_version.version)); ceph_osdc_put_request(req); } return ret; @@ -979,7 +997,8 @@ static void rbd_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg) bytes = le64_to_cpu(op->extent.length); read_op = (le16_to_cpu(op->op) == CEPH_OSD_OP_READ); - dout("rbd_req_cb bytes=%lld readop=%d rc=%d\n", bytes, read_op, rc); + dout("rbd_req_cb bytes=%llu readop=%d rc=%d\n", + (unsigned long long) bytes, read_op, (int) rc); if (rc == -ENOENT && read_op) { zero_bio_chain(req_data->bio, 0); @@ -1006,14 +1025,12 @@ static void rbd_simple_req_cb(struct ceph_osd_request *req, struct ceph_msg *msg /* * Do a synchronous ceph osd operation */ -static int rbd_req_sync_op(struct rbd_device *dev, +static int rbd_req_sync_op(struct rbd_device *rbd_dev, struct ceph_snap_context *snapc, u64 snapid, - int opcode, int flags, - struct ceph_osd_req_op *orig_ops, - int num_reply, - const char *obj, + struct ceph_osd_req_op *ops, + const char *object_name, u64 ofs, u64 len, char *buf, struct ceph_osd_request **linger_req, @@ -1022,45 +1039,28 @@ static int rbd_req_sync_op(struct rbd_device *dev, int ret; struct page **pages; int num_pages; - struct ceph_osd_req_op *ops = orig_ops; - u32 payload_len; + + BUG_ON(ops == NULL); num_pages = calc_pages_for(ofs , len); pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); if (IS_ERR(pages)) return PTR_ERR(pages); - if (!orig_ops) { - payload_len = (flags & CEPH_OSD_FLAG_WRITE ? len : 0); - ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); - if (ret < 0) - goto done; - - if ((flags & CEPH_OSD_FLAG_WRITE) && buf) { - ret = ceph_copy_to_page_vector(pages, buf, ofs, len); - if (ret < 0) - goto done_ops; - } - } - - ret = rbd_do_request(NULL, dev, snapc, snapid, - obj, ofs, len, NULL, + ret = rbd_do_request(NULL, rbd_dev, snapc, snapid, + object_name, ofs, len, NULL, pages, num_pages, flags, ops, - 2, NULL, 0, NULL, linger_req, ver); if (ret < 0) - goto done_ops; + goto done; if ((flags & CEPH_OSD_FLAG_READ) && buf) ret = ceph_copy_from_page_vector(pages, buf, ofs, ret); -done_ops: - if (!orig_ops) - rbd_destroy_ops(ops); done: ceph_release_page_vector(pages, num_pages); return ret; @@ -1070,10 +1070,10 @@ done: * Do an asynchronous ceph osd operation */ static int rbd_do_op(struct request *rq, - struct rbd_device *rbd_dev , + struct rbd_device *rbd_dev, struct ceph_snap_context *snapc, u64 snapid, - int opcode, int flags, int num_reply, + int opcode, int flags, u64 ofs, u64 len, struct bio *bio, struct rbd_req_coll *coll, @@ -1091,14 +1091,15 @@ static int rbd_do_op(struct request *rq, return -ENOMEM; seg_len = rbd_get_segment(&rbd_dev->header, - rbd_dev->header.block_name, + rbd_dev->header.object_prefix, ofs, len, seg_name, &seg_ofs); payload_len = (flags & CEPH_OSD_FLAG_WRITE ? seg_len : 0); - ret = rbd_create_rw_ops(&ops, 1, opcode, payload_len); - if (ret < 0) + ret = -ENOMEM; + ops = rbd_create_rw_ops(1, opcode, payload_len); + if (!ops) goto done; /* we've taken care of segment sizes earlier when we @@ -1112,7 +1113,6 @@ static int rbd_do_op(struct request *rq, NULL, 0, flags, ops, - num_reply, coll, coll_index, rbd_req_cb, 0, NULL); @@ -1136,7 +1136,6 @@ static int rbd_req_write(struct request *rq, return rbd_do_op(rq, rbd_dev, snapc, CEPH_NOSNAP, CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, - 2, ofs, len, bio, coll, coll_index); } @@ -1155,55 +1154,58 @@ static int rbd_req_read(struct request *rq, snapid, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - 2, ofs, len, bio, coll, coll_index); } /* * Request sync osd read */ -static int rbd_req_sync_read(struct rbd_device *dev, - struct ceph_snap_context *snapc, +static int rbd_req_sync_read(struct rbd_device *rbd_dev, u64 snapid, - const char *obj, + const char *object_name, u64 ofs, u64 len, char *buf, u64 *ver) { - return rbd_req_sync_op(dev, NULL, + struct ceph_osd_req_op *ops; + int ret; + + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_READ, 0); + if (!ops) + return -ENOMEM; + + ret = rbd_req_sync_op(rbd_dev, NULL, snapid, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, - NULL, - 1, obj, ofs, len, buf, NULL, ver); + ops, object_name, ofs, len, buf, NULL, ver); + rbd_destroy_ops(ops); + + return ret; } /* * Request sync osd watch */ -static int rbd_req_sync_notify_ack(struct rbd_device *dev, +static int rbd_req_sync_notify_ack(struct rbd_device *rbd_dev, u64 ver, - u64 notify_id, - const char *obj) + u64 notify_id) { struct ceph_osd_req_op *ops; - struct page **pages = NULL; int ret; - ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY_ACK, 0); - if (ret < 0) - return ret; + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY_ACK, 0); + if (!ops) + return -ENOMEM; - ops[0].watch.ver = cpu_to_le64(dev->header.obj_version); + ops[0].watch.ver = cpu_to_le64(ver); ops[0].watch.cookie = notify_id; ops[0].watch.flag = 0; - ret = rbd_do_request(NULL, dev, NULL, CEPH_NOSNAP, - obj, 0, 0, NULL, - pages, 0, + ret = rbd_do_request(NULL, rbd_dev, NULL, CEPH_NOSNAP, + rbd_dev->header_name, 0, 0, NULL, + NULL, 0, CEPH_OSD_FLAG_READ, ops, - 1, NULL, 0, rbd_simple_req_cb, 0, NULL); @@ -1213,54 +1215,53 @@ static int rbd_req_sync_notify_ack(struct rbd_device *dev, static void rbd_watch_cb(u64 ver, u64 notify_id, u8 opcode, void *data) { - struct rbd_device *dev = (struct rbd_device *)data; + struct rbd_device *rbd_dev = (struct rbd_device *)data; + u64 hver; int rc; - if (!dev) + if (!rbd_dev) return; - dout("rbd_watch_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, - notify_id, (int)opcode); - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - rc = __rbd_refresh_header(dev); - mutex_unlock(&ctl_mutex); + dout("rbd_watch_cb %s notify_id=%llu opcode=%u\n", + rbd_dev->header_name, (unsigned long long) notify_id, + (unsigned int) opcode); + rc = rbd_refresh_header(rbd_dev, &hver); if (rc) pr_warning(RBD_DRV_NAME "%d got notification but failed to " - " update snaps: %d\n", dev->major, rc); + " update snaps: %d\n", rbd_dev->major, rc); - rbd_req_sync_notify_ack(dev, ver, notify_id, dev->obj_md_name); + rbd_req_sync_notify_ack(rbd_dev, hver, notify_id); } /* * Request sync osd watch */ -static int rbd_req_sync_watch(struct rbd_device *dev, - const char *obj, - u64 ver) +static int rbd_req_sync_watch(struct rbd_device *rbd_dev) { struct ceph_osd_req_op *ops; - struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; + int ret; - int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0); - if (ret < 0) - return ret; + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); + if (!ops) + return -ENOMEM; ret = ceph_osdc_create_event(osdc, rbd_watch_cb, 0, - (void *)dev, &dev->watch_event); + (void *)rbd_dev, &rbd_dev->watch_event); if (ret < 0) goto fail; - ops[0].watch.ver = cpu_to_le64(ver); - ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie); + ops[0].watch.ver = cpu_to_le64(rbd_dev->header.obj_version); + ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); ops[0].watch.flag = 1; - ret = rbd_req_sync_op(dev, NULL, + ret = rbd_req_sync_op(rbd_dev, NULL, CEPH_NOSNAP, - 0, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, ops, - 1, obj, 0, 0, NULL, - &dev->watch_request, NULL); + rbd_dev->header_name, + 0, 0, NULL, + &rbd_dev->watch_request, NULL); if (ret < 0) goto fail_event; @@ -1269,8 +1270,8 @@ static int rbd_req_sync_watch(struct rbd_device *dev, return 0; fail_event: - ceph_osdc_cancel_event(dev->watch_event); - dev->watch_event = NULL; + ceph_osdc_cancel_event(rbd_dev->watch_event); + rbd_dev->watch_event = NULL; fail: rbd_destroy_ops(ops); return ret; @@ -1279,64 +1280,65 @@ fail: /* * Request sync osd unwatch */ -static int rbd_req_sync_unwatch(struct rbd_device *dev, - const char *obj) +static int rbd_req_sync_unwatch(struct rbd_device *rbd_dev) { struct ceph_osd_req_op *ops; + int ret; - int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_WATCH, 0); - if (ret < 0) - return ret; + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_WATCH, 0); + if (!ops) + return -ENOMEM; ops[0].watch.ver = 0; - ops[0].watch.cookie = cpu_to_le64(dev->watch_event->cookie); + ops[0].watch.cookie = cpu_to_le64(rbd_dev->watch_event->cookie); ops[0].watch.flag = 0; - ret = rbd_req_sync_op(dev, NULL, + ret = rbd_req_sync_op(rbd_dev, NULL, CEPH_NOSNAP, - 0, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, ops, - 1, obj, 0, 0, NULL, NULL, NULL); + rbd_dev->header_name, + 0, 0, NULL, NULL, NULL); + rbd_destroy_ops(ops); - ceph_osdc_cancel_event(dev->watch_event); - dev->watch_event = NULL; + ceph_osdc_cancel_event(rbd_dev->watch_event); + rbd_dev->watch_event = NULL; return ret; } struct rbd_notify_info { - struct rbd_device *dev; + struct rbd_device *rbd_dev; }; static void rbd_notify_cb(u64 ver, u64 notify_id, u8 opcode, void *data) { - struct rbd_device *dev = (struct rbd_device *)data; - if (!dev) + struct rbd_device *rbd_dev = (struct rbd_device *)data; + if (!rbd_dev) return; - dout("rbd_notify_cb %s notify_id=%lld opcode=%d\n", dev->obj_md_name, - notify_id, (int)opcode); + dout("rbd_notify_cb %s notify_id=%llu opcode=%u\n", + rbd_dev->header_name, (unsigned long long) notify_id, + (unsigned int) opcode); } /* * Request sync osd notify */ -static int rbd_req_sync_notify(struct rbd_device *dev, - const char *obj) +static int rbd_req_sync_notify(struct rbd_device *rbd_dev) { struct ceph_osd_req_op *ops; - struct ceph_osd_client *osdc = &dev->rbd_client->client->osdc; + struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc; struct ceph_osd_event *event; struct rbd_notify_info info; int payload_len = sizeof(u32) + sizeof(u32); int ret; - ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_NOTIFY, payload_len); - if (ret < 0) - return ret; + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_NOTIFY, payload_len); + if (!ops) + return -ENOMEM; - info.dev = dev; + info.rbd_dev = rbd_dev; ret = ceph_osdc_create_event(osdc, rbd_notify_cb, 1, (void *)&info, &event); @@ -1349,12 +1351,12 @@ static int rbd_req_sync_notify(struct rbd_device *dev, ops[0].watch.prot_ver = RADOS_NOTIFY_VER; ops[0].watch.timeout = 12; - ret = rbd_req_sync_op(dev, NULL, + ret = rbd_req_sync_op(rbd_dev, NULL, CEPH_NOSNAP, - 0, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, ops, - 1, obj, 0, 0, NULL, NULL, NULL); + rbd_dev->header_name, + 0, 0, NULL, NULL, NULL); if (ret < 0) goto fail_event; @@ -1373,36 +1375,37 @@ fail: /* * Request sync osd read */ -static int rbd_req_sync_exec(struct rbd_device *dev, - const char *obj, - const char *cls, - const char *method, +static int rbd_req_sync_exec(struct rbd_device *rbd_dev, + const char *object_name, + const char *class_name, + const char *method_name, const char *data, int len, u64 *ver) { struct ceph_osd_req_op *ops; - int cls_len = strlen(cls); - int method_len = strlen(method); - int ret = rbd_create_rw_ops(&ops, 1, CEPH_OSD_OP_CALL, - cls_len + method_len + len); - if (ret < 0) - return ret; + int class_name_len = strlen(class_name); + int method_name_len = strlen(method_name); + int ret; - ops[0].cls.class_name = cls; - ops[0].cls.class_len = (__u8)cls_len; - ops[0].cls.method_name = method; - ops[0].cls.method_len = (__u8)method_len; + ops = rbd_create_rw_ops(1, CEPH_OSD_OP_CALL, + class_name_len + method_name_len + len); + if (!ops) + return -ENOMEM; + + ops[0].cls.class_name = class_name; + ops[0].cls.class_len = (__u8) class_name_len; + ops[0].cls.method_name = method_name; + ops[0].cls.method_len = (__u8) method_name_len; ops[0].cls.argc = 0; ops[0].cls.indata = data; ops[0].cls.indata_len = len; - ret = rbd_req_sync_op(dev, NULL, + ret = rbd_req_sync_op(rbd_dev, NULL, CEPH_NOSNAP, - 0, CEPH_OSD_FLAG_WRITE | CEPH_OSD_FLAG_ONDISK, ops, - 1, obj, 0, 0, NULL, NULL, ver); + object_name, 0, 0, NULL, NULL, ver); rbd_destroy_ops(ops); @@ -1437,10 +1440,12 @@ static void rbd_rq_fn(struct request_queue *q) struct bio *bio; struct bio *rq_bio, *next_bio = NULL; bool do_write; - int size, op_size = 0; + unsigned int size; + u64 op_size = 0; u64 ofs; int num_segs, cur_seg = 0; struct rbd_req_coll *coll; + struct ceph_snap_context *snapc; /* peek at request from block layer */ if (!rq) @@ -1467,23 +1472,38 @@ static void rbd_rq_fn(struct request_queue *q) spin_unlock_irq(q->queue_lock); + down_read(&rbd_dev->header_rwsem); + + if (rbd_dev->snap_id != CEPH_NOSNAP && !rbd_dev->snap_exists) { + up_read(&rbd_dev->header_rwsem); + dout("request for non-existent snapshot"); + spin_lock_irq(q->queue_lock); + __blk_end_request_all(rq, -ENXIO); + continue; + } + + snapc = ceph_get_snap_context(rbd_dev->header.snapc); + + up_read(&rbd_dev->header_rwsem); + dout("%s 0x%x bytes at 0x%llx\n", do_write ? "write" : "read", - size, blk_rq_pos(rq) * SECTOR_SIZE); + size, (unsigned long long) blk_rq_pos(rq) * SECTOR_SIZE); num_segs = rbd_get_num_segments(&rbd_dev->header, ofs, size); coll = rbd_alloc_coll(num_segs); if (!coll) { spin_lock_irq(q->queue_lock); __blk_end_request_all(rq, -ENOMEM); + ceph_put_snap_context(snapc); continue; } do { /* a bio clone to be passed down to OSD req */ - dout("rq->bio->bi_vcnt=%d\n", rq->bio->bi_vcnt); + dout("rq->bio->bi_vcnt=%hu\n", rq->bio->bi_vcnt); op_size = rbd_get_segment(&rbd_dev->header, - rbd_dev->header.block_name, + rbd_dev->header.object_prefix, ofs, size, NULL, NULL); kref_get(&coll->kref); @@ -1499,7 +1519,7 @@ static void rbd_rq_fn(struct request_queue *q) /* init OSD command: write or read */ if (do_write) rbd_req_write(rq, rbd_dev, - rbd_dev->header.snapc, + snapc, ofs, op_size, bio, coll, cur_seg); @@ -1522,6 +1542,8 @@ next_seg: if (bp) bio_pair_release(bp); spin_lock_irq(q->queue_lock); + + ceph_put_snap_context(snapc); } } @@ -1592,18 +1614,19 @@ static int rbd_read_header(struct rbd_device *rbd_dev, return -ENOMEM; rc = rbd_req_sync_read(rbd_dev, - NULL, CEPH_NOSNAP, - rbd_dev->obj_md_name, + CEPH_NOSNAP, + rbd_dev->header_name, 0, len, (char *)dh, &ver); if (rc < 0) goto out_dh; - rc = rbd_header_from_disk(header, dh, snap_count, GFP_KERNEL); + rc = rbd_header_from_disk(header, dh, snap_count); if (rc < 0) { if (rc == -ENXIO) pr_warning("unrecognized header format" - " for image %s", rbd_dev->obj); + " for image %s\n", + rbd_dev->image_name); goto out_dh; } @@ -1628,7 +1651,7 @@ out_dh: /* * create a snapshot */ -static int rbd_header_add_snap(struct rbd_device *dev, +static int rbd_header_add_snap(struct rbd_device *rbd_dev, const char *snap_name, gfp_t gfp_flags) { @@ -1636,16 +1659,15 @@ static int rbd_header_add_snap(struct rbd_device *dev, u64 new_snapid; int ret; void *data, *p, *e; - u64 ver; struct ceph_mon_client *monc; /* we should create a snapshot only if we're pointing at the head */ - if (dev->snap_id != CEPH_NOSNAP) + if (rbd_dev->snap_id != CEPH_NOSNAP) return -EINVAL; - monc = &dev->rbd_client->client->monc; - ret = ceph_monc_create_snapid(monc, dev->poolid, &new_snapid); - dout("created snapid=%lld\n", new_snapid); + monc = &rbd_dev->rbd_client->client->monc; + ret = ceph_monc_create_snapid(monc, rbd_dev->pool_id, &new_snapid); + dout("created snapid=%llu\n", (unsigned long long) new_snapid); if (ret < 0) return ret; @@ -1659,19 +1681,13 @@ static int rbd_header_add_snap(struct rbd_device *dev, ceph_encode_string_safe(&p, e, snap_name, name_len, bad); ceph_encode_64_safe(&p, e, new_snapid, bad); - ret = rbd_req_sync_exec(dev, dev->obj_md_name, "rbd", "snap_add", - data, p - data, &ver); + ret = rbd_req_sync_exec(rbd_dev, rbd_dev->header_name, + "rbd", "snap_add", + data, p - data, NULL); kfree(data); - if (ret < 0) - return ret; - - down_write(&dev->header_rwsem); - dev->header.snapc->seq = new_snapid; - up_write(&dev->header_rwsem); - - return 0; + return ret < 0 ? ret : 0; bad: return -ERANGE; } @@ -1679,52 +1695,52 @@ bad: static void __rbd_remove_all_snaps(struct rbd_device *rbd_dev) { struct rbd_snap *snap; + struct rbd_snap *next; - while (!list_empty(&rbd_dev->snaps)) { - snap = list_first_entry(&rbd_dev->snaps, struct rbd_snap, node); - __rbd_remove_snap_dev(rbd_dev, snap); - } + list_for_each_entry_safe(snap, next, &rbd_dev->snaps, node) + __rbd_remove_snap_dev(snap); } /* * only read the first part of the ondisk header, without the snaps info */ -static int __rbd_refresh_header(struct rbd_device *rbd_dev) +static int __rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) { int ret; struct rbd_image_header h; - u64 snap_seq; - int follow_seq = 0; ret = rbd_read_header(rbd_dev, &h); if (ret < 0) return ret; - /* resized? */ - set_capacity(rbd_dev->disk, h.image_size / SECTOR_SIZE); - down_write(&rbd_dev->header_rwsem); - snap_seq = rbd_dev->header.snapc->seq; - if (rbd_dev->header.total_snaps && - rbd_dev->header.snapc->snaps[0] == snap_seq) - /* pointing at the head, will need to follow that - if head moves */ - follow_seq = 1; + /* resized? */ + if (rbd_dev->snap_id == CEPH_NOSNAP) { + sector_t size = (sector_t) h.image_size / SECTOR_SIZE; - kfree(rbd_dev->header.snapc); - kfree(rbd_dev->header.snap_names); + dout("setting size to %llu sectors", (unsigned long long) size); + set_capacity(rbd_dev->disk, size); + } + + /* rbd_dev->header.object_prefix shouldn't change */ kfree(rbd_dev->header.snap_sizes); + kfree(rbd_dev->header.snap_names); + /* osd requests may still refer to snapc */ + ceph_put_snap_context(rbd_dev->header.snapc); + if (hver) + *hver = h.obj_version; + rbd_dev->header.obj_version = h.obj_version; + rbd_dev->header.image_size = h.image_size; rbd_dev->header.total_snaps = h.total_snaps; rbd_dev->header.snapc = h.snapc; rbd_dev->header.snap_names = h.snap_names; rbd_dev->header.snap_names_len = h.snap_names_len; rbd_dev->header.snap_sizes = h.snap_sizes; - if (follow_seq) - rbd_dev->header.snapc->seq = rbd_dev->header.snapc->snaps[0]; - else - rbd_dev->header.snapc->seq = snap_seq; + /* Free the extra copy of the object prefix */ + WARN_ON(strcmp(rbd_dev->header.object_prefix, h.object_prefix)); + kfree(h.object_prefix); ret = __rbd_init_snaps_header(rbd_dev); @@ -1733,6 +1749,17 @@ static int __rbd_refresh_header(struct rbd_device *rbd_dev) return ret; } +static int rbd_refresh_header(struct rbd_device *rbd_dev, u64 *hver) +{ + int ret; + + mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + ret = __rbd_refresh_header(rbd_dev, hver); + mutex_unlock(&ctl_mutex); + + return ret; +} + static int rbd_init_disk(struct rbd_device *rbd_dev) { struct gendisk *disk; @@ -1762,7 +1789,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) goto out; snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d", - rbd_dev->id); + rbd_dev->dev_id); disk->major = rbd_dev->major; disk->first_minor = 0; disk->fops = &rbd_bd_ops; @@ -1819,8 +1846,13 @@ static ssize_t rbd_size_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + sector_t size; + + down_read(&rbd_dev->header_rwsem); + size = get_capacity(rbd_dev->disk); + up_read(&rbd_dev->header_rwsem); - return sprintf(buf, "%llu\n", (unsigned long long)rbd_dev->header.image_size); + return sprintf(buf, "%llu\n", (unsigned long long) size * SECTOR_SIZE); } static ssize_t rbd_major_show(struct device *dev, @@ -1848,12 +1880,20 @@ static ssize_t rbd_pool_show(struct device *dev, return sprintf(buf, "%s\n", rbd_dev->pool_name); } +static ssize_t rbd_pool_id_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); + + return sprintf(buf, "%d\n", rbd_dev->pool_id); +} + static ssize_t rbd_name_show(struct device *dev, struct device_attribute *attr, char *buf) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - return sprintf(buf, "%s\n", rbd_dev->obj); + return sprintf(buf, "%s\n", rbd_dev->image_name); } static ssize_t rbd_snap_show(struct device *dev, @@ -1871,23 +1911,18 @@ static ssize_t rbd_image_refresh(struct device *dev, size_t size) { struct rbd_device *rbd_dev = dev_to_rbd_dev(dev); - int rc; - int ret = size; - - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); + int ret; - rc = __rbd_refresh_header(rbd_dev); - if (rc < 0) - ret = rc; + ret = rbd_refresh_header(rbd_dev, NULL); - mutex_unlock(&ctl_mutex); - return ret; + return ret < 0 ? ret : size; } static DEVICE_ATTR(size, S_IRUGO, rbd_size_show, NULL); static DEVICE_ATTR(major, S_IRUGO, rbd_major_show, NULL); static DEVICE_ATTR(client_id, S_IRUGO, rbd_client_id_show, NULL); static DEVICE_ATTR(pool, S_IRUGO, rbd_pool_show, NULL); +static DEVICE_ATTR(pool_id, S_IRUGO, rbd_pool_id_show, NULL); static DEVICE_ATTR(name, S_IRUGO, rbd_name_show, NULL); static DEVICE_ATTR(refresh, S_IWUSR, NULL, rbd_image_refresh); static DEVICE_ATTR(current_snap, S_IRUGO, rbd_snap_show, NULL); @@ -1898,6 +1933,7 @@ static struct attribute *rbd_attrs[] = { &dev_attr_major.attr, &dev_attr_client_id.attr, &dev_attr_pool.attr, + &dev_attr_pool_id.attr, &dev_attr_name.attr, &dev_attr_current_snap.attr, &dev_attr_refresh.attr, @@ -1977,15 +2013,13 @@ static struct device_type rbd_snap_device_type = { .release = rbd_snap_dev_release, }; -static void __rbd_remove_snap_dev(struct rbd_device *rbd_dev, - struct rbd_snap *snap) +static void __rbd_remove_snap_dev(struct rbd_snap *snap) { list_del(&snap->node); device_unregister(&snap->dev); } -static int rbd_register_snap_dev(struct rbd_device *rbd_dev, - struct rbd_snap *snap, +static int rbd_register_snap_dev(struct rbd_snap *snap, struct device *parent) { struct device *dev = &snap->dev; @@ -2000,29 +2034,36 @@ static int rbd_register_snap_dev(struct rbd_device *rbd_dev, return ret; } -static int __rbd_add_snap_dev(struct rbd_device *rbd_dev, - int i, const char *name, - struct rbd_snap **snapp) +static struct rbd_snap *__rbd_add_snap_dev(struct rbd_device *rbd_dev, + int i, const char *name) { + struct rbd_snap *snap; int ret; - struct rbd_snap *snap = kzalloc(sizeof(*snap), GFP_KERNEL); + + snap = kzalloc(sizeof (*snap), GFP_KERNEL); if (!snap) - return -ENOMEM; + return ERR_PTR(-ENOMEM); + + ret = -ENOMEM; snap->name = kstrdup(name, GFP_KERNEL); + if (!snap->name) + goto err; + snap->size = rbd_dev->header.snap_sizes[i]; snap->id = rbd_dev->header.snapc->snaps[i]; if (device_is_registered(&rbd_dev->dev)) { - ret = rbd_register_snap_dev(rbd_dev, snap, - &rbd_dev->dev); + ret = rbd_register_snap_dev(snap, &rbd_dev->dev); if (ret < 0) goto err; } - *snapp = snap; - return 0; + + return snap; + err: kfree(snap->name); kfree(snap); - return ret; + + return ERR_PTR(ret); } /* @@ -2055,7 +2096,6 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) const char *name, *first_name; int i = rbd_dev->header.total_snaps; struct rbd_snap *snap, *old_snap = NULL; - int ret; struct list_head *p, *n; first_name = rbd_dev->header.snap_names; @@ -2070,8 +2110,15 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) cur_id = rbd_dev->header.snapc->snaps[i - 1]; if (!i || old_snap->id < cur_id) { - /* old_snap->id was skipped, thus was removed */ - __rbd_remove_snap_dev(rbd_dev, old_snap); + /* + * old_snap->id was skipped, thus was + * removed. If this rbd_dev is mapped to + * the removed snapshot, record that it no + * longer exists, to prevent further I/O. + */ + if (rbd_dev->snap_id == old_snap->id) + rbd_dev->snap_exists = false; + __rbd_remove_snap_dev(old_snap); continue; } if (old_snap->id == cur_id) { @@ -2091,9 +2138,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) if (cur_id >= old_snap->id) break; /* a new snapshot */ - ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap); - if (ret < 0) - return ret; + snap = __rbd_add_snap_dev(rbd_dev, i - 1, name); + if (IS_ERR(snap)) + return PTR_ERR(snap); /* note that we add it backward so using n and not p */ list_add(&snap->node, n); @@ -2107,9 +2154,9 @@ static int __rbd_init_snaps_header(struct rbd_device *rbd_dev) WARN_ON(1); return -EINVAL; } - ret = __rbd_add_snap_dev(rbd_dev, i - 1, name, &snap); - if (ret < 0) - return ret; + snap = __rbd_add_snap_dev(rbd_dev, i - 1, name); + if (IS_ERR(snap)) + return PTR_ERR(snap); list_add(&snap->node, &rbd_dev->snaps); } @@ -2129,14 +2176,13 @@ static int rbd_bus_add_dev(struct rbd_device *rbd_dev) dev->type = &rbd_device_type; dev->parent = &rbd_root_dev; dev->release = rbd_dev_release; - dev_set_name(dev, "%d", rbd_dev->id); + dev_set_name(dev, "%d", rbd_dev->dev_id); ret = device_register(dev); if (ret < 0) goto out; list_for_each_entry(snap, &rbd_dev->snaps, node) { - ret = rbd_register_snap_dev(rbd_dev, snap, - &rbd_dev->dev); + ret = rbd_register_snap_dev(snap, &rbd_dev->dev); if (ret < 0) break; } @@ -2155,12 +2201,9 @@ static int rbd_init_watch_dev(struct rbd_device *rbd_dev) int ret, rc; do { - ret = rbd_req_sync_watch(rbd_dev, rbd_dev->obj_md_name, - rbd_dev->header.obj_version); + ret = rbd_req_sync_watch(rbd_dev); if (ret == -ERANGE) { - mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING); - rc = __rbd_refresh_header(rbd_dev); - mutex_unlock(&ctl_mutex); + rc = rbd_refresh_header(rbd_dev, NULL); if (rc < 0) return rc; } @@ -2177,7 +2220,7 @@ static atomic64_t rbd_id_max = ATOMIC64_INIT(0); */ static void rbd_id_get(struct rbd_device *rbd_dev) { - rbd_dev->id = atomic64_inc_return(&rbd_id_max); + rbd_dev->dev_id = atomic64_inc_return(&rbd_id_max); spin_lock(&rbd_dev_list_lock); list_add_tail(&rbd_dev->node, &rbd_dev_list); @@ -2191,7 +2234,7 @@ static void rbd_id_get(struct rbd_device *rbd_dev) static void rbd_id_put(struct rbd_device *rbd_dev) { struct list_head *tmp; - int rbd_id = rbd_dev->id; + int rbd_id = rbd_dev->dev_id; int max_id; BUG_ON(rbd_id < 1); @@ -2282,19 +2325,58 @@ static inline size_t copy_token(const char **buf, } /* - * This fills in the pool_name, obj, obj_len, snap_name, obj_len, + * Finds the next token in *buf, dynamically allocates a buffer big + * enough to hold a copy of it, and copies the token into the new + * buffer. The copy is guaranteed to be terminated with '\0'. Note + * that a duplicate buffer is created even for a zero-length token. + * + * Returns a pointer to the newly-allocated duplicate, or a null + * pointer if memory for the duplicate was not available. If + * the lenp argument is a non-null pointer, the length of the token + * (not including the '\0') is returned in *lenp. + * + * If successful, the *buf pointer will be updated to point beyond + * the end of the found token. + * + * Note: uses GFP_KERNEL for allocation. + */ +static inline char *dup_token(const char **buf, size_t *lenp) +{ + char *dup; + size_t len; + + len = next_token(buf); + dup = kmalloc(len + 1, GFP_KERNEL); + if (!dup) + return NULL; + + memcpy(dup, *buf, len); + *(dup + len) = '\0'; + *buf += len; + + if (lenp) + *lenp = len; + + return dup; +} + +/* + * This fills in the pool_name, image_name, image_name_len, snap_name, * rbd_dev, rbd_md_name, and name fields of the given rbd_dev, based * on the list of monitor addresses and other options provided via * /sys/bus/rbd/add. + * + * Note: rbd_dev is assumed to have been initially zero-filled. */ static int rbd_add_parse_args(struct rbd_device *rbd_dev, const char *buf, const char **mon_addrs, size_t *mon_addrs_size, char *options, - size_t options_size) + size_t options_size) { - size_t len; + size_t len; + int ret; /* The first four tokens are required */ @@ -2310,56 +2392,74 @@ static int rbd_add_parse_args(struct rbd_device *rbd_dev, if (!len || len >= options_size) return -EINVAL; - len = copy_token(&buf, rbd_dev->pool_name, sizeof (rbd_dev->pool_name)); - if (!len || len >= sizeof (rbd_dev->pool_name)) - return -EINVAL; - - len = copy_token(&buf, rbd_dev->obj, sizeof (rbd_dev->obj)); - if (!len || len >= sizeof (rbd_dev->obj)) - return -EINVAL; + ret = -ENOMEM; + rbd_dev->pool_name = dup_token(&buf, NULL); + if (!rbd_dev->pool_name) + goto out_err; - /* We have the object length in hand, save it. */ + rbd_dev->image_name = dup_token(&buf, &rbd_dev->image_name_len); + if (!rbd_dev->image_name) + goto out_err; - rbd_dev->obj_len = len; + /* Create the name of the header object */ - BUILD_BUG_ON(RBD_MAX_MD_NAME_LEN - < RBD_MAX_OBJ_NAME_LEN + sizeof (RBD_SUFFIX)); - sprintf(rbd_dev->obj_md_name, "%s%s", rbd_dev->obj, RBD_SUFFIX); + rbd_dev->header_name = kmalloc(rbd_dev->image_name_len + + sizeof (RBD_SUFFIX), + GFP_KERNEL); + if (!rbd_dev->header_name) + goto out_err; + sprintf(rbd_dev->header_name, "%s%s", rbd_dev->image_name, RBD_SUFFIX); /* - * The snapshot name is optional, but it's an error if it's - * too long. If no snapshot is supplied, fill in the default. + * The snapshot name is optional. If none is is supplied, + * we use the default value. */ - len = copy_token(&buf, rbd_dev->snap_name, sizeof (rbd_dev->snap_name)); - if (!len) + rbd_dev->snap_name = dup_token(&buf, &len); + if (!rbd_dev->snap_name) + goto out_err; + if (!len) { + /* Replace the empty name with the default */ + kfree(rbd_dev->snap_name); + rbd_dev->snap_name + = kmalloc(sizeof (RBD_SNAP_HEAD_NAME), GFP_KERNEL); + if (!rbd_dev->snap_name) + goto out_err; + memcpy(rbd_dev->snap_name, RBD_SNAP_HEAD_NAME, sizeof (RBD_SNAP_HEAD_NAME)); - else if (len >= sizeof (rbd_dev->snap_name)) - return -EINVAL; + } return 0; + +out_err: + kfree(rbd_dev->header_name); + kfree(rbd_dev->image_name); + kfree(rbd_dev->pool_name); + rbd_dev->pool_name = NULL; + + return ret; } static ssize_t rbd_add(struct bus_type *bus, const char *buf, size_t count) { - struct rbd_device *rbd_dev; + char *options; + struct rbd_device *rbd_dev = NULL; const char *mon_addrs = NULL; size_t mon_addrs_size = 0; - char *options = NULL; struct ceph_osd_client *osdc; int rc = -ENOMEM; if (!try_module_get(THIS_MODULE)) return -ENODEV; - rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); - if (!rbd_dev) - goto err_nomem; options = kmalloc(count, GFP_KERNEL); if (!options) goto err_nomem; + rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL); + if (!rbd_dev) + goto err_nomem; /* static rbd_device initialization */ spin_lock_init(&rbd_dev->lock); @@ -2367,15 +2467,13 @@ static ssize_t rbd_add(struct bus_type *bus, INIT_LIST_HEAD(&rbd_dev->snaps); init_rwsem(&rbd_dev->header_rwsem); - init_rwsem(&rbd_dev->header_rwsem); - /* generate unique id: find highest unique id, add one */ rbd_id_get(rbd_dev); /* Fill in the device name, now that we have its id. */ BUILD_BUG_ON(DEV_NAME_LEN < sizeof (RBD_DRV_NAME) + MAX_INT_FORMAT_WIDTH); - sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->id); + sprintf(rbd_dev->name, "%s%d", RBD_DRV_NAME, rbd_dev->dev_id); /* parse add command */ rc = rbd_add_parse_args(rbd_dev, buf, &mon_addrs, &mon_addrs_size, @@ -2395,7 +2493,7 @@ static ssize_t rbd_add(struct bus_type *bus, rc = ceph_pg_poolid_by_name(osdc->osdmap, rbd_dev->pool_name); if (rc < 0) goto err_out_client; - rbd_dev->poolid = rc; + rbd_dev->pool_id = rc; /* register our block device */ rc = register_blkdev(0, rbd_dev->name); @@ -2435,10 +2533,16 @@ err_out_blkdev: err_out_client: rbd_put_client(rbd_dev); err_put_id: + if (rbd_dev->pool_name) { + kfree(rbd_dev->snap_name); + kfree(rbd_dev->header_name); + kfree(rbd_dev->image_name); + kfree(rbd_dev->pool_name); + } rbd_id_put(rbd_dev); err_nomem: - kfree(options); kfree(rbd_dev); + kfree(options); dout("Error adding device %s\n", buf); module_put(THIS_MODULE); @@ -2446,7 +2550,7 @@ err_nomem: return (ssize_t) rc; } -static struct rbd_device *__rbd_get_dev(unsigned long id) +static struct rbd_device *__rbd_get_dev(unsigned long dev_id) { struct list_head *tmp; struct rbd_device *rbd_dev; @@ -2454,7 +2558,7 @@ static struct rbd_device *__rbd_get_dev(unsigned long id) spin_lock(&rbd_dev_list_lock); list_for_each(tmp, &rbd_dev_list) { rbd_dev = list_entry(tmp, struct rbd_device, node); - if (rbd_dev->id == id) { + if (rbd_dev->dev_id == dev_id) { spin_unlock(&rbd_dev_list_lock); return rbd_dev; } @@ -2474,7 +2578,7 @@ static void rbd_dev_release(struct device *dev) rbd_dev->watch_request); } if (rbd_dev->watch_event) - rbd_req_sync_unwatch(rbd_dev, rbd_dev->obj_md_name); + rbd_req_sync_unwatch(rbd_dev); rbd_put_client(rbd_dev); @@ -2483,6 +2587,10 @@ static void rbd_dev_release(struct device *dev) unregister_blkdev(rbd_dev->major, rbd_dev->name); /* done with the id, and with the rbd_dev */ + kfree(rbd_dev->snap_name); + kfree(rbd_dev->header_name); + kfree(rbd_dev->pool_name); + kfree(rbd_dev->image_name); rbd_id_put(rbd_dev); kfree(rbd_dev); @@ -2544,7 +2652,7 @@ static ssize_t rbd_snap_add(struct device *dev, if (ret < 0) goto err_unlock; - ret = __rbd_refresh_header(rbd_dev); + ret = __rbd_refresh_header(rbd_dev, NULL); if (ret < 0) goto err_unlock; @@ -2553,7 +2661,7 @@ static ssize_t rbd_snap_add(struct device *dev, mutex_unlock(&ctl_mutex); /* make a best effort, don't error if failed */ - rbd_req_sync_notify(rbd_dev, rbd_dev->obj_md_name); + rbd_req_sync_notify(rbd_dev); ret = count; kfree(name); diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h index 950708688f17..0924e9e41a60 100644 --- a/drivers/block/rbd_types.h +++ b/drivers/block/rbd_types.h @@ -31,7 +31,6 @@ #define RBD_MIN_OBJ_ORDER 16 #define RBD_MAX_OBJ_ORDER 30 -#define RBD_MAX_OBJ_NAME_LEN 96 #define RBD_MAX_SEG_NAME_LEN 128 #define RBD_COMP_NONE 0 diff --git a/drivers/block/umem.c b/drivers/block/umem.c index 9a72277a31df..eb0d8216f557 100644 --- a/drivers/block/umem.c +++ b/drivers/block/umem.c @@ -513,42 +513,19 @@ static void process_page(unsigned long data) } } -struct mm_plug_cb { - struct blk_plug_cb cb; - struct cardinfo *card; -}; - -static void mm_unplug(struct blk_plug_cb *cb) +static void mm_unplug(struct blk_plug_cb *cb, bool from_schedule) { - struct mm_plug_cb *mmcb = container_of(cb, struct mm_plug_cb, cb); + struct cardinfo *card = cb->data; - spin_lock_irq(&mmcb->card->lock); - activate(mmcb->card); - spin_unlock_irq(&mmcb->card->lock); - kfree(mmcb); + spin_lock_irq(&card->lock); + activate(card); + spin_unlock_irq(&card->lock); + kfree(cb); } static int mm_check_plugged(struct cardinfo *card) { - struct blk_plug *plug = current->plug; - struct mm_plug_cb *mmcb; - - if (!plug) - return 0; - - list_for_each_entry(mmcb, &plug->cb_list, cb.list) { - if (mmcb->cb.callback == mm_unplug && mmcb->card == card) - return 1; - } - /* Not currently on the callback list */ - mmcb = kmalloc(sizeof(*mmcb), GFP_ATOMIC); - if (!mmcb) - return 0; - - mmcb->card = card; - mmcb->cb.callback = mm_unplug; - list_add(&mmcb->cb.list, &plug->cb_list); - return 1; + return !!blk_check_plugged(mm_unplug, card, sizeof(struct blk_plug_cb)); } static void mm_make_request(struct request_queue *q, struct bio *bio) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 693187df7601..c0bbeb470754 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -21,8 +21,6 @@ struct workqueue_struct *virtblk_wq; struct virtio_blk { - spinlock_t lock; - struct virtio_device *vdev; struct virtqueue *vq; @@ -65,7 +63,7 @@ static void blk_done(struct virtqueue *vq) unsigned int len; unsigned long flags; - spin_lock_irqsave(&vblk->lock, flags); + spin_lock_irqsave(vblk->disk->queue->queue_lock, flags); while ((vbr = virtqueue_get_buf(vblk->vq, &len)) != NULL) { int error; @@ -99,7 +97,7 @@ static void blk_done(struct virtqueue *vq) } /* In case queue is stopped waiting for more buffers. */ blk_start_queue(vblk->disk->queue); - spin_unlock_irqrestore(&vblk->lock, flags); + spin_unlock_irqrestore(vblk->disk->queue->queue_lock, flags); } static bool do_req(struct request_queue *q, struct virtio_blk *vblk, @@ -397,6 +395,83 @@ static int virtblk_name_format(char *prefix, int index, char *buf, int buflen) return 0; } +static int virtblk_get_cache_mode(struct virtio_device *vdev) +{ + u8 writeback; + int err; + + err = virtio_config_val(vdev, VIRTIO_BLK_F_CONFIG_WCE, + offsetof(struct virtio_blk_config, wce), + &writeback); + if (err) + writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE); + + return writeback; +} + +static void virtblk_update_cache_mode(struct virtio_device *vdev) +{ + u8 writeback = virtblk_get_cache_mode(vdev); + struct virtio_blk *vblk = vdev->priv; + + if (writeback) + blk_queue_flush(vblk->disk->queue, REQ_FLUSH); + else + blk_queue_flush(vblk->disk->queue, 0); + + revalidate_disk(vblk->disk); +} + +static const char *const virtblk_cache_types[] = { + "write through", "write back" +}; + +static ssize_t +virtblk_cache_type_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + struct virtio_blk *vblk = disk->private_data; + struct virtio_device *vdev = vblk->vdev; + int i; + u8 writeback; + + BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE)); + for (i = ARRAY_SIZE(virtblk_cache_types); --i >= 0; ) + if (sysfs_streq(buf, virtblk_cache_types[i])) + break; + + if (i < 0) + return -EINVAL; + + writeback = i; + vdev->config->set(vdev, + offsetof(struct virtio_blk_config, wce), + &writeback, sizeof(writeback)); + + virtblk_update_cache_mode(vdev); + return count; +} + +static ssize_t +virtblk_cache_type_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + struct virtio_blk *vblk = disk->private_data; + u8 writeback = virtblk_get_cache_mode(vblk->vdev); + + BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types)); + return snprintf(buf, 40, "%s\n", virtblk_cache_types[writeback]); +} + +static const struct device_attribute dev_attr_cache_type_ro = + __ATTR(cache_type, S_IRUGO, + virtblk_cache_type_show, NULL); +static const struct device_attribute dev_attr_cache_type_rw = + __ATTR(cache_type, S_IRUGO|S_IWUSR, + virtblk_cache_type_show, virtblk_cache_type_store); + static int __devinit virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; @@ -431,7 +506,6 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) goto out_free_index; } - spin_lock_init(&vblk->lock); vblk->vdev = vdev; vblk->sg_elems = sg_elems; sg_init_table(vblk->sg, vblk->sg_elems); @@ -456,7 +530,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) goto out_mempool; } - q = vblk->disk->queue = blk_init_queue(do_virtblk_request, &vblk->lock); + q = vblk->disk->queue = blk_init_queue(do_virtblk_request, NULL); if (!q) { err = -ENOMEM; goto out_put_disk; @@ -474,8 +548,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) vblk->index = index; /* configure queue flush support */ - if (virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH)) - blk_queue_flush(q, REQ_FLUSH); + virtblk_update_cache_mode(vdev); /* If disk is read-only in the host, the guest should obey */ if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO)) @@ -553,6 +626,14 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) if (err) goto out_del_disk; + if (virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE)) + err = device_create_file(disk_to_dev(vblk->disk), + &dev_attr_cache_type_rw); + else + err = device_create_file(disk_to_dev(vblk->disk), + &dev_attr_cache_type_ro); + if (err) + goto out_del_disk; return 0; out_del_disk: @@ -576,30 +657,20 @@ static void __devexit virtblk_remove(struct virtio_device *vdev) { struct virtio_blk *vblk = vdev->priv; int index = vblk->index; - struct virtblk_req *vbr; - unsigned long flags; /* Prevent config work handler from accessing the device. */ mutex_lock(&vblk->config_lock); vblk->config_enable = false; mutex_unlock(&vblk->config_lock); + del_gendisk(vblk->disk); + blk_cleanup_queue(vblk->disk->queue); + /* Stop all the virtqueues. */ vdev->config->reset(vdev); flush_work(&vblk->config_work); - del_gendisk(vblk->disk); - - /* Abort requests dispatched to driver. */ - spin_lock_irqsave(&vblk->lock, flags); - while ((vbr = virtqueue_detach_unused_buf(vblk->vq))) { - __blk_end_request_all(vbr->req, -EIO); - mempool_free(vbr, vblk->pool); - } - spin_unlock_irqrestore(&vblk->lock, flags); - - blk_cleanup_queue(vblk->disk->queue); put_disk(vblk->disk); mempool_destroy(vblk->pool); vdev->config->del_vqs(vdev); @@ -655,7 +726,7 @@ static const struct virtio_device_id id_table[] = { static unsigned int features[] = { VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI, - VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY + VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE }; /* diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index e4fb3374dcd2..2c2d2e5c1597 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -888,9 +888,8 @@ static int setup_blkring(struct xenbus_device *dev, if (err) goto fail; - err = bind_evtchn_to_irqhandler(info->evtchn, - blkif_interrupt, - IRQF_SAMPLE_RANDOM, "blkif", info); + err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0, + "blkif", info); if (err <= 0) { xenbus_dev_fatal(dev, err, "bind_evtchn_to_irqhandler failed"); |