diff options
author | Sage Weil <sage@inktank.com> | 2012-06-15 21:32:04 +0200 |
---|---|---|
committer | Sage Weil <sage@inktank.com> | 2012-06-15 21:32:04 +0200 |
commit | 9a64e8e0ace51b309fdcff4b4754b3649250382a (patch) | |
tree | 1f0d75c196c5ab0408c55ed6cf3a152f1f921e15 /drivers/block | |
parent | libceph: flush msgr queue during mon_client shutdown (diff) | |
parent | Linux 3.5-rc1 (diff) | |
download | linux-9a64e8e0ace51b309fdcff4b4754b3649250382a.tar.xz linux-9a64e8e0ace51b309fdcff4b4754b3649250382a.zip |
Merge tag 'v3.5-rc1'
Linux 3.5-rc1
Conflicts:
net/ceph/messenger.c
Diffstat (limited to 'drivers/block')
32 files changed, 2026 insertions, 1920 deletions
diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 8db9089127c5..9a13e889837e 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -6580,24 +6580,21 @@ static const struct file_operations dac960_user_command_proc_fops = { static void DAC960_CreateProcEntries(DAC960_Controller_T *Controller) { - struct proc_dir_entry *StatusProcEntry; struct proc_dir_entry *ControllerProcEntry; - struct proc_dir_entry *UserCommandProcEntry; if (DAC960_ProcDirectoryEntry == NULL) { - DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL); - StatusProcEntry = proc_create("status", 0, - DAC960_ProcDirectoryEntry, - &dac960_proc_fops); + DAC960_ProcDirectoryEntry = proc_mkdir("rd", NULL); + proc_create("status", 0, DAC960_ProcDirectoryEntry, + &dac960_proc_fops); } - sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber); - ControllerProcEntry = proc_mkdir(Controller->ControllerName, - DAC960_ProcDirectoryEntry); - proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller); - proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller); - UserCommandProcEntry = proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller); - Controller->ControllerProcEntry = ControllerProcEntry; + sprintf(Controller->ControllerName, "c%d", Controller->ControllerNumber); + ControllerProcEntry = proc_mkdir(Controller->ControllerName, + DAC960_ProcDirectoryEntry); + proc_create_data("initial_status", 0, ControllerProcEntry, &dac960_initial_status_proc_fops, Controller); + proc_create_data("current_status", 0, ControllerProcEntry, &dac960_current_status_proc_fops, Controller); + proc_create_data("user_command", S_IWUSR | S_IRUSR, ControllerProcEntry, &dac960_user_command_proc_fops, Controller); + Controller->ControllerProcEntry = ControllerProcEntry; } diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index 
4e4c8a4a5fd3..a796407123c7 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -354,7 +354,7 @@ config BLK_DEV_SX8 Use devices /dev/sx8/$N and /dev/sx8/$Np$M. config BLK_DEV_UB - tristate "Low Performance USB Block driver" + tristate "Low Performance USB Block driver (deprecated)" depends on USB help This driver supports certain USB attached storage devices diff --git a/drivers/block/brd.c b/drivers/block/brd.c index ec246437f5a4..531ceb31d0ff 100644 --- a/drivers/block/brd.c +++ b/drivers/block/brd.c @@ -242,9 +242,9 @@ static void copy_to_brd(struct brd_device *brd, const void *src, page = brd_lookup_page(brd, sector); BUG_ON(!page); - dst = kmap_atomic(page, KM_USER1); + dst = kmap_atomic(page); memcpy(dst + offset, src, copy); - kunmap_atomic(dst, KM_USER1); + kunmap_atomic(dst); if (copy < n) { src += copy; @@ -253,9 +253,9 @@ static void copy_to_brd(struct brd_device *brd, const void *src, page = brd_lookup_page(brd, sector); BUG_ON(!page); - dst = kmap_atomic(page, KM_USER1); + dst = kmap_atomic(page); memcpy(dst, src, copy); - kunmap_atomic(dst, KM_USER1); + kunmap_atomic(dst); } } @@ -273,9 +273,9 @@ static void copy_from_brd(void *dst, struct brd_device *brd, copy = min_t(size_t, n, PAGE_SIZE - offset); page = brd_lookup_page(brd, sector); if (page) { - src = kmap_atomic(page, KM_USER1); + src = kmap_atomic(page); memcpy(dst, src + offset, copy); - kunmap_atomic(src, KM_USER1); + kunmap_atomic(src); } else memset(dst, 0, copy); @@ -285,9 +285,9 @@ static void copy_from_brd(void *dst, struct brd_device *brd, copy = n - copy; page = brd_lookup_page(brd, sector); if (page) { - src = kmap_atomic(page, KM_USER1); + src = kmap_atomic(page); memcpy(dst, src, copy); - kunmap_atomic(src, KM_USER1); + kunmap_atomic(src); } else memset(dst, 0, copy); } @@ -309,7 +309,7 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page, goto out; } - mem = kmap_atomic(page, KM_USER0); + mem = kmap_atomic(page); if (rw == READ) { copy_from_brd(mem + 
off, brd, sector, len); flush_dcache_page(page); @@ -317,7 +317,7 @@ static int brd_do_bvec(struct brd_device *brd, struct page *page, flush_dcache_page(page); copy_to_brd(brd, mem + off, sector, len); } - kunmap_atomic(mem, KM_USER0); + kunmap_atomic(mem); out: return err; diff --git a/drivers/block/cciss_scsi.c b/drivers/block/cciss_scsi.c index e820b68d2f6c..acda773b3720 100644 --- a/drivers/block/cciss_scsi.c +++ b/drivers/block/cciss_scsi.c @@ -866,6 +866,7 @@ cciss_scsi_detect(ctlr_info_t *h) sh->can_queue = cciss_tape_cmds; sh->sg_tablesize = h->maxsgentries; sh->max_cmd_len = MAX_COMMAND_SIZE; + sh->max_sectors = h->cciss_max_sectors; ((struct cciss_scsi_adapter_data_t *) h->scsi_ctlr)->scsi_host = sh; @@ -1410,7 +1411,7 @@ static void cciss_scatter_gather(ctlr_info_t *h, CommandList_struct *c, /* track how many SG entries we are using */ if (request_nsgs > h->maxSG) h->maxSG = request_nsgs; - c->Header.SGTotal = (__u8) request_nsgs + chained; + c->Header.SGTotal = (u16) request_nsgs + chained; if (request_nsgs > h->max_cmd_sgentries) c->Header.SGList = h->max_cmd_sgentries; else diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index cf0e63dd97da..e54e31b02b88 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -65,39 +65,80 @@ struct drbd_atodb_wait { int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int); +void *drbd_md_get_buffer(struct drbd_conf *mdev) +{ + int r; + + wait_event(mdev->misc_wait, + (r = atomic_cmpxchg(&mdev->md_io_in_use, 0, 1)) == 0 || + mdev->state.disk <= D_FAILED); + + return r ? 
NULL : page_address(mdev->md_io_page); +} + +void drbd_md_put_buffer(struct drbd_conf *mdev) +{ + if (atomic_dec_and_test(&mdev->md_io_in_use)) + wake_up(&mdev->misc_wait); +} + +static bool md_io_allowed(struct drbd_conf *mdev) +{ + enum drbd_disk_state ds = mdev->state.disk; + return ds >= D_NEGOTIATING || ds == D_ATTACHING; +} + +void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, + unsigned int *done) +{ + long dt = bdev->dc.disk_timeout * HZ / 10; + if (dt == 0) + dt = MAX_SCHEDULE_TIMEOUT; + + dt = wait_event_timeout(mdev->misc_wait, *done || !md_io_allowed(mdev), dt); + if (dt == 0) + dev_err(DEV, "meta-data IO operation timed out\n"); +} + static int _drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, struct page *page, sector_t sector, int rw, int size) { struct bio *bio; - struct drbd_md_io md_io; int ok; - md_io.mdev = mdev; - init_completion(&md_io.event); - md_io.error = 0; + mdev->md_io.done = 0; + mdev->md_io.error = -ENODEV; if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags)) rw |= REQ_FUA | REQ_FLUSH; rw |= REQ_SYNC; - bio = bio_alloc(GFP_NOIO, 1); + bio = bio_alloc_drbd(GFP_NOIO); bio->bi_bdev = bdev->md_bdev; bio->bi_sector = sector; ok = (bio_add_page(bio, page, size, 0) == size); if (!ok) goto out; - bio->bi_private = &md_io; + bio->bi_private = &mdev->md_io; bio->bi_end_io = drbd_md_io_complete; bio->bi_rw = rw; + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n"); + ok = 0; + goto out; + } + + bio_get(bio); /* one bio_put() is in the completion handler */ + atomic_inc(&mdev->md_io_in_use); /* drbd_md_put_buffer() is in the completion handler */ if (drbd_insert_fault(mdev, (rw & WRITE) ? 
DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) bio_endio(bio, -EIO); else submit_bio(rw, bio); - wait_for_completion(&md_io.event); - ok = bio_flagged(bio, BIO_UPTODATE) && md_io.error == 0; + wait_until_done_or_disk_failure(mdev, bdev, &mdev->md_io.done); + ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0; out: bio_put(bio); @@ -111,7 +152,7 @@ int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int offset = 0; struct page *iop = mdev->md_io_page; - D_ASSERT(mutex_is_locked(&mdev->md_io_mutex)); + D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1); BUG_ON(!bdev->md_bdev); @@ -328,8 +369,13 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) return 1; } - mutex_lock(&mdev->md_io_mutex); /* protects md_io_buffer, al_tr_cycle, ... */ - buffer = (struct al_transaction *)page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */ + if (!buffer) { + dev_err(DEV, "disk failed while waiting for md_io buffer\n"); + complete(&((struct update_al_work *)w)->event); + put_ldev(mdev); + return 1; + } buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC); buffer->tr_number = cpu_to_be32(mdev->al_tr_number); @@ -374,7 +420,7 @@ w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused) D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE); mdev->al_tr_number++; - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); complete(&((struct update_al_work *)w)->event); put_ldev(mdev); @@ -443,8 +489,9 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) /* lock out all other meta data io for now, * and make sure the page is mapped. 
*/ - mutex_lock(&mdev->md_io_mutex); - buffer = page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + return 0; /* Find the valid transaction in the log */ for (i = 0; i <= mx; i++) { @@ -452,7 +499,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (rv == 0) continue; if (rv == -1) { - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); return 0; } cnr = be32_to_cpu(buffer->tr_number); @@ -478,7 +525,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (!found_valid) { dev_warn(DEV, "No usable activity log found.\n"); - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); return 1; } @@ -493,7 +540,7 @@ int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) rv = drbd_al_read_tr(mdev, bdev, buffer, i); ERR_IF(rv == 0) goto cancel; if (rv == -1) { - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); return 0; } @@ -534,7 +581,7 @@ cancel: mdev->al_tr_pos = 0; /* ok, we are done with it */ - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n", transactions, active_extents); @@ -671,16 +718,20 @@ static void drbd_try_clear_on_disk_bm(struct drbd_conf *mdev, sector_t sector, else ext->rs_failed += count; if (ext->rs_left < ext->rs_failed) { - dev_err(DEV, "BAD! sector=%llus enr=%u rs_left=%d " - "rs_failed=%d count=%d\n", + dev_warn(DEV, "BAD! sector=%llus enr=%u rs_left=%d " + "rs_failed=%d count=%d cstate=%s\n", (unsigned long long)sector, ext->lce.lc_number, ext->rs_left, - ext->rs_failed, count); - dump_stack(); - - lc_put(mdev->resync, &ext->lce); - drbd_force_state(mdev, NS(conn, C_DISCONNECTING)); - return; + ext->rs_failed, count, + drbd_conn_str(mdev->state.conn)); + + /* We don't expect to be able to clear more bits + * than have been set when we originally counted + * the set bits to cache that value in ext->rs_left. 
+ * Whatever the reason (disconnect during resync, + * delayed local completion of an application write), + * try to fix it up by recounting here. */ + ext->rs_left = drbd_bm_e_weight(mdev, enr); } } else { /* Normally this element should be in the cache, @@ -1192,6 +1243,7 @@ int drbd_rs_del_all(struct drbd_conf *mdev) put_ldev(mdev); } spin_unlock_irq(&mdev->al_lock); + wake_up(&mdev->al_wait); return 0; } diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 912f585a760f..b5c5ff53cb57 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -205,7 +205,7 @@ void drbd_bm_unlock(struct drbd_conf *mdev) static void bm_store_page_idx(struct page *page, unsigned long idx) { BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK)); - page_private(page) |= idx; + set_page_private(page, idx); } static unsigned long bm_page_to_idx(struct page *page) @@ -289,25 +289,25 @@ static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr) return page_nr; } -static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx, const enum km_type km) +static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx) { struct page *page = b->bm_pages[idx]; - return (unsigned long *) kmap_atomic(page, km); + return (unsigned long *) kmap_atomic(page); } static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx) { - return __bm_map_pidx(b, idx, KM_IRQ1); + return __bm_map_pidx(b, idx); } -static void __bm_unmap(unsigned long *p_addr, const enum km_type km) +static void __bm_unmap(unsigned long *p_addr) { - kunmap_atomic(p_addr, km); + kunmap_atomic(p_addr); }; static void bm_unmap(unsigned long *p_addr) { - return __bm_unmap(p_addr, KM_IRQ1); + return __bm_unmap(p_addr); } /* long word offset of _bitmap_ sector */ @@ -543,15 +543,15 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b) /* all but last page */ for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) { - p_addr = 
__bm_map_pidx(b, idx, KM_USER0); + p_addr = __bm_map_pidx(b, idx); for (i = 0; i < LWPP; i++) bits += hweight_long(p_addr[i]); - __bm_unmap(p_addr, KM_USER0); + __bm_unmap(p_addr); cond_resched(); } /* last (or only) page */ last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL; - p_addr = __bm_map_pidx(b, idx, KM_USER0); + p_addr = __bm_map_pidx(b, idx); for (i = 0; i < last_word; i++) bits += hweight_long(p_addr[i]); p_addr[last_word] &= cpu_to_lel(mask); @@ -559,7 +559,7 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b) /* 32bit arch, may have an unused padding long */ if (BITS_PER_LONG == 32 && (last_word & 1) == 0) p_addr[last_word+1] = 0; - __bm_unmap(p_addr, KM_USER0); + __bm_unmap(p_addr); return bits; } @@ -886,12 +886,21 @@ void drbd_bm_clear_all(struct drbd_conf *mdev) struct bm_aio_ctx { struct drbd_conf *mdev; atomic_t in_flight; - struct completion done; + unsigned int done; unsigned flags; #define BM_AIO_COPY_PAGES 1 int error; + struct kref kref; }; +static void bm_aio_ctx_destroy(struct kref *kref) +{ + struct bm_aio_ctx *ctx = container_of(kref, struct bm_aio_ctx, kref); + + put_ldev(ctx->mdev); + kfree(ctx); +} + /* bv_page may be a copy, or may be the original */ static void bm_async_io_complete(struct bio *bio, int error) { @@ -930,20 +939,21 @@ static void bm_async_io_complete(struct bio *bio, int error) bm_page_unlock_io(mdev, idx); - /* FIXME give back to page pool */ if (ctx->flags & BM_AIO_COPY_PAGES) - put_page(bio->bi_io_vec[0].bv_page); + mempool_free(bio->bi_io_vec[0].bv_page, drbd_md_io_page_pool); bio_put(bio); - if (atomic_dec_and_test(&ctx->in_flight)) - complete(&ctx->done); + if (atomic_dec_and_test(&ctx->in_flight)) { + ctx->done = 1; + wake_up(&mdev->misc_wait); + kref_put(&ctx->kref, &bm_aio_ctx_destroy); + } } static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must_hold(local) { - /* we are process context. 
we always get a bio */ - struct bio *bio = bio_alloc(GFP_KERNEL, 1); + struct bio *bio = bio_alloc_drbd(GFP_NOIO); struct drbd_conf *mdev = ctx->mdev; struct drbd_bitmap *b = mdev->bitmap; struct page *page; @@ -966,21 +976,21 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must bm_set_page_unchanged(b->bm_pages[page_nr]); if (ctx->flags & BM_AIO_COPY_PAGES) { - /* FIXME alloc_page is good enough for now, but actually needs - * to use pre-allocated page pool */ void *src, *dest; - page = alloc_page(__GFP_HIGHMEM|__GFP_WAIT); - dest = kmap_atomic(page, KM_USER0); - src = kmap_atomic(b->bm_pages[page_nr], KM_USER1); + page = mempool_alloc(drbd_md_io_page_pool, __GFP_HIGHMEM|__GFP_WAIT); + dest = kmap_atomic(page); + src = kmap_atomic(b->bm_pages[page_nr]); memcpy(dest, src, PAGE_SIZE); - kunmap_atomic(src, KM_USER1); - kunmap_atomic(dest, KM_USER0); + kunmap_atomic(src); + kunmap_atomic(dest); bm_store_page_idx(page, page_nr); } else page = b->bm_pages[page_nr]; bio->bi_bdev = mdev->ldev->md_bdev; bio->bi_sector = on_disk_sector; + /* bio_add_page of a single page to an empty bio will always succeed, + * according to api. Do we want to assert that? */ bio_add_page(bio, page, len, 0); bio->bi_private = ctx; bio->bi_end_io = bm_async_io_complete; @@ -999,14 +1009,9 @@ static void bm_page_io_async(struct bm_aio_ctx *ctx, int page_nr, int rw) __must /* * bm_rw: read/write the whole bitmap from/to its on disk location. */ -static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_idx) __must_hold(local) +static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_writeout_upper_idx) __must_hold(local) { - struct bm_aio_ctx ctx = { - .mdev = mdev, - .in_flight = ATOMIC_INIT(1), - .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done), - .flags = lazy_writeout_upper_idx ? 
BM_AIO_COPY_PAGES : 0, - }; + struct bm_aio_ctx *ctx; struct drbd_bitmap *b = mdev->bitmap; int num_pages, i, count = 0; unsigned long now; @@ -1021,7 +1026,27 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id * For lazy writeout, we don't care for ongoing changes to the bitmap, * as we submit copies of pages anyways. */ - if (!ctx.flags) + + ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); + if (!ctx) + return -ENOMEM; + + *ctx = (struct bm_aio_ctx) { + .mdev = mdev, + .in_flight = ATOMIC_INIT(1), + .done = 0, + .flags = flags, + .error = 0, + .kref = { ATOMIC_INIT(2) }, + }; + + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n"); + kfree(ctx); + return -ENODEV; + } + + if (!ctx->flags) WARN_ON(!(BM_LOCKED_MASK & b->bm_flags)); num_pages = b->bm_number_of_pages; @@ -1046,29 +1071,38 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id continue; } } - atomic_inc(&ctx.in_flight); - bm_page_io_async(&ctx, i, rw); + atomic_inc(&ctx->in_flight); + bm_page_io_async(ctx, i, rw); ++count; cond_resched(); } /* - * We initialize ctx.in_flight to one to make sure bm_async_io_complete - * will not complete() early, and decrement / test it here. If there + * We initialize ctx->in_flight to one to make sure bm_async_io_complete + * will not set ctx->done early, and decrement / test it here. If there * are still some bios in flight, we need to wait for them here. + * If all IO is done already (or nothing had been submitted), there is + * no need to wait. Still, we need to put the kref associated with the + * "in_flight reached zero, all done" event. 
*/ - if (!atomic_dec_and_test(&ctx.in_flight)) - wait_for_completion(&ctx.done); + if (!atomic_dec_and_test(&ctx->in_flight)) + wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); + else + kref_put(&ctx->kref, &bm_aio_ctx_destroy); + dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n", rw == WRITE ? "WRITE" : "READ", count, jiffies - now); - if (ctx.error) { + if (ctx->error) { dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n"); drbd_chk_io_error(mdev, 1, true); - err = -EIO; /* ctx.error ? */ + err = -EIO; /* ctx->error ? */ } + if (atomic_read(&ctx->in_flight)) + err = -EIO; /* Disk failed during IO... */ + now = jiffies; if (rw == WRITE) { drbd_md_flush(mdev); @@ -1082,6 +1116,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n", ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now); + kref_put(&ctx->kref, &bm_aio_ctx_destroy); return err; } @@ -1091,7 +1126,7 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned lazy_writeout_upper_id */ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) { - return bm_rw(mdev, READ, 0); + return bm_rw(mdev, READ, 0, 0); } /** @@ -1102,7 +1137,7 @@ int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local) */ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) { - return bm_rw(mdev, WRITE, 0); + return bm_rw(mdev, WRITE, 0, 0); } /** @@ -1112,7 +1147,23 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local) */ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(local) { - return bm_rw(mdev, WRITE, upper_idx); + return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, upper_idx); +} + +/** + * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location. + * @mdev: DRBD device. + * + * Will only write pages that have changed since last IO. + * In contrast to drbd_bm_write(), this will copy the bitmap pages + * to temporary writeout pages. 
It is intended to trigger a full write-out + * while still allowing the bitmap to change, for example if a resync or online + * verify is aborted due to a failed peer disk, while local IO continues, or + * pending resync acks are still being processed. + */ +int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local) +{ + return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0); } @@ -1130,28 +1181,45 @@ int drbd_bm_write_lazy(struct drbd_conf *mdev, unsigned upper_idx) __must_hold(l */ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local) { - struct bm_aio_ctx ctx = { + struct bm_aio_ctx *ctx; + int err; + + if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) { + dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx); + return 0; + } + + ctx = kmalloc(sizeof(struct bm_aio_ctx), GFP_NOIO); + if (!ctx) + return -ENOMEM; + + *ctx = (struct bm_aio_ctx) { .mdev = mdev, .in_flight = ATOMIC_INIT(1), - .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done), + .done = 0, .flags = BM_AIO_COPY_PAGES, + .error = 0, + .kref = { ATOMIC_INIT(2) }, }; - if (bm_test_page_unchanged(mdev->bitmap->bm_pages[idx])) { - dynamic_dev_dbg(DEV, "skipped bm page write for idx %u\n", idx); - return 0; + if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* put is in bm_aio_ctx_destroy() */ + dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in drbd_bm_write_page()\n"); + kfree(ctx); + return -ENODEV; } - bm_page_io_async(&ctx, idx, WRITE_SYNC); - wait_for_completion(&ctx.done); + bm_page_io_async(ctx, idx, WRITE_SYNC); + wait_until_done_or_disk_failure(mdev, mdev->ldev, &ctx->done); - if (ctx.error) + if (ctx->error) drbd_chk_io_error(mdev, 1, true); /* that should force detach, so the in memory bitmap will be * gone in a moment as well. */ mdev->bm_writ_cnt++; - return ctx.error; + err = atomic_read(&ctx->in_flight) ? 
-EIO : ctx->error; + kref_put(&ctx->kref, &bm_aio_ctx_destroy); + return err; } /* NOTE @@ -1163,7 +1231,7 @@ int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(loc * this returns a bit number, NOT a sector! */ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo, - const int find_zero_bit, const enum km_type km) + const int find_zero_bit) { struct drbd_bitmap *b = mdev->bitmap; unsigned long *p_addr; @@ -1178,7 +1246,7 @@ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo, while (bm_fo < b->bm_bits) { /* bit offset of the first bit in the page */ bit_offset = bm_fo & ~BITS_PER_PAGE_MASK; - p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo), km); + p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo)); if (find_zero_bit) i = find_next_zero_bit_le(p_addr, @@ -1187,7 +1255,7 @@ static unsigned long __bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo, i = find_next_bit_le(p_addr, PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK); - __bm_unmap(p_addr, km); + __bm_unmap(p_addr); if (i < PAGE_SIZE*8) { bm_fo = bit_offset + i; if (bm_fo >= b->bm_bits) @@ -1215,7 +1283,7 @@ static unsigned long bm_find_next(struct drbd_conf *mdev, if (BM_DONT_TEST & b->bm_flags) bm_print_lock_info(mdev); - i = __bm_find_next(mdev, bm_fo, find_zero_bit, KM_IRQ1); + i = __bm_find_next(mdev, bm_fo, find_zero_bit); spin_unlock_irq(&b->bm_lock); return i; @@ -1239,13 +1307,13 @@ unsigned long drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo unsigned long _drbd_bm_find_next(struct drbd_conf *mdev, unsigned long bm_fo) { /* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */ - return __bm_find_next(mdev, bm_fo, 0, KM_USER1); + return __bm_find_next(mdev, bm_fo, 0); } unsigned long _drbd_bm_find_next_zero(struct drbd_conf *mdev, unsigned long bm_fo) { /* WARN_ON(!(BM_DONT_SET & mdev->b->bm_flags)); */ - return __bm_find_next(mdev, bm_fo, 1, KM_USER1); + return __bm_find_next(mdev, bm_fo, 1); } /* 
returns number of bits actually changed. @@ -1273,14 +1341,14 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, unsigned int page_nr = bm_bit_to_page_idx(b, bitnr); if (page_nr != last_page_nr) { if (p_addr) - __bm_unmap(p_addr, KM_IRQ1); + __bm_unmap(p_addr); if (c < 0) bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]); else if (c > 0) bm_set_page_need_writeout(b->bm_pages[last_page_nr]); changed_total += c; c = 0; - p_addr = __bm_map_pidx(b, page_nr, KM_IRQ1); + p_addr = __bm_map_pidx(b, page_nr); last_page_nr = page_nr; } if (val) @@ -1289,7 +1357,7 @@ static int __bm_change_bits_to(struct drbd_conf *mdev, const unsigned long s, c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr)); } if (p_addr) - __bm_unmap(p_addr, KM_IRQ1); + __bm_unmap(p_addr); if (c < 0) bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]); else if (c > 0) @@ -1342,13 +1410,13 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, { int i; int bits; - unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr], KM_IRQ1); + unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); for (i = first_word; i < last_word; i++) { bits = hweight_long(paddr[i]); paddr[i] = ~0UL; b->bm_set += BITS_PER_LONG - bits; } - kunmap_atomic(paddr, KM_IRQ1); + kunmap_atomic(paddr); } /* Same thing as drbd_bm_set_bits, diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 8d680562ba73..02f013a073a7 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -712,7 +712,6 @@ struct drbd_request { struct list_head tl_requests; /* ring list in the transfer log */ struct bio *master_bio; /* master bio pointer */ unsigned long rq_state; /* see comments above _req_mod() */ - int seq_num; unsigned long start_time; }; @@ -851,6 +850,7 @@ enum { NEW_CUR_UUID, /* Create new current UUID when thawing IO */ AL_SUSPENDED, /* Activity logging is currently suspended. 
*/ AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */ + STATE_SENT, /* Do not change state/UUIDs while this is set */ }; struct drbd_bitmap; /* opaque for drbd_conf */ @@ -862,31 +862,30 @@ enum bm_flag { BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */ /* currently locked for bulk operation */ - BM_LOCKED_MASK = 0x7, + BM_LOCKED_MASK = 0xf, /* in detail, that is: */ BM_DONT_CLEAR = 0x1, BM_DONT_SET = 0x2, BM_DONT_TEST = 0x4, + /* so we can mark it locked for bulk operation, + * and still allow all non-bulk operations */ + BM_IS_LOCKED = 0x8, + /* (test bit, count bit) allowed (common case) */ - BM_LOCKED_TEST_ALLOWED = 0x3, + BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED, /* testing bits, as well as setting new bits allowed, but clearing bits * would be unexpected. Used during bitmap receive. Setting new bits * requires sending of "out-of-sync" information, though. */ - BM_LOCKED_SET_ALLOWED = 0x1, + BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED, - /* clear is not expected while bitmap is locked for bulk operation */ + /* for drbd_bm_write_copy_pages, everything is allowed, + * only concurrent bulk operations are locked out. */ + BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED, }; - -/* TODO sort members for performance - * MAYBE group them further */ - -/* THINK maybe we actually want to use the default "event/%s" worker threads - * or similar in linux 2.6, which uses per cpu data and threads. 
- */ struct drbd_work_queue { struct list_head q; struct semaphore s; /* producers up it, worker down()s it */ @@ -938,8 +937,7 @@ struct drbd_backing_dev { }; struct drbd_md_io { - struct drbd_conf *mdev; - struct completion event; + unsigned int done; int error; }; @@ -1022,6 +1020,7 @@ struct drbd_conf { struct drbd_tl_epoch *newest_tle; struct drbd_tl_epoch *oldest_tle; struct list_head out_of_sequence_requests; + struct list_head barrier_acked_requests; struct hlist_head *tl_hash; unsigned int tl_hash_s; @@ -1056,6 +1055,8 @@ struct drbd_conf { struct crypto_hash *csums_tfm; struct crypto_hash *verify_tfm; + unsigned long last_reattach_jif; + unsigned long last_reconnect_jif; struct drbd_thread receiver; struct drbd_thread worker; struct drbd_thread asender; @@ -1094,7 +1095,8 @@ struct drbd_conf { wait_queue_head_t ee_wait; struct page *md_io_page; /* one page buffer for md_io */ struct page *md_io_tmpp; /* for logical_block_size != 512 */ - struct mutex md_io_mutex; /* protects the md_io_buffer */ + struct drbd_md_io md_io; + atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */ spinlock_t al_lock; wait_queue_head_t al_wait; struct lru_cache *act_log; /* activity log */ @@ -1228,8 +1230,8 @@ extern int drbd_send_uuids(struct drbd_conf *mdev); extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev); extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev); extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags); -extern int _drbd_send_state(struct drbd_conf *mdev); -extern int drbd_send_state(struct drbd_conf *mdev); +extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s); +extern int drbd_send_current_state(struct drbd_conf *mdev); extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock, enum drbd_packets cmd, struct p_header80 *h, size_t size, unsigned msg_flags); @@ -1461,6 +1463,7 @@ extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long 
enr); extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local); extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local); extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local); +extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local); extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr); extern size_t drbd_bm_words(struct drbd_conf *mdev); @@ -1493,11 +1496,38 @@ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ extern mempool_t *drbd_request_mempool; extern mempool_t *drbd_ee_mempool; -extern struct page *drbd_pp_pool; /* drbd's page pool */ +/* drbd's page pool, used to buffer data received from the peer, + * or data requested by the peer. + * + * This does not have an emergency reserve. + * + * When allocating from this pool, it first takes pages from the pool. + * Only if the pool is depleted will try to allocate from the system. + * + * The assumption is that pages taken from this pool will be processed, + * and given back, "quickly", and then can be recycled, so we can avoid + * frequent calls to alloc_page(), and still will be able to make progress even + * under memory pressure. + */ +extern struct page *drbd_pp_pool; extern spinlock_t drbd_pp_lock; extern int drbd_pp_vacant; extern wait_queue_head_t drbd_pp_wait; +/* We also need a standard (emergency-reserve backed) page pool + * for meta data IO (activity log, bitmap). + * We can keep it global, as long as it is used as "N pages at a time". + * 128 should be plenty, currently we probably can get away with as few as 1. 
+ */ +#define DRBD_MIN_POOL_PAGES 128 +extern mempool_t *drbd_md_io_page_pool; + +/* We also need to make sure we get a bio + * when we need it for housekeeping purposes */ +extern struct bio_set *drbd_md_io_bio_set; +/* to allocate from that set */ +extern struct bio *bio_alloc_drbd(gfp_t gfp_mask); + extern rwlock_t global_state_lock; extern struct drbd_conf *drbd_new_device(unsigned int minor); @@ -1536,8 +1566,12 @@ extern void resume_next_sg(struct drbd_conf *mdev); extern void suspend_other_sg(struct drbd_conf *mdev); extern int drbd_resync_finished(struct drbd_conf *mdev); /* maybe rather drbd_main.c ? */ +extern void *drbd_md_get_buffer(struct drbd_conf *mdev); +extern void drbd_md_put_buffer(struct drbd_conf *mdev); extern int drbd_md_sync_page_io(struct drbd_conf *mdev, - struct drbd_backing_dev *bdev, sector_t sector, int rw); + struct drbd_backing_dev *bdev, sector_t sector, int rw); +extern void wait_until_done_or_disk_failure(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, + unsigned int *done); extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int); extern void drbd_rs_controller_reset(struct drbd_conf *mdev); @@ -1754,19 +1788,6 @@ static inline struct page *page_chain_next(struct page *page) #define page_chain_for_each_safe(page, n) \ for (; page && ({ n = page_chain_next(page); 1; }); page = n) -static inline int drbd_bio_has_active_page(struct bio *bio) -{ - struct bio_vec *bvec; - int i; - - __bio_for_each_segment(bvec, bio, i, 0) { - if (page_count(bvec->bv_page) > 1) - return 1; - } - - return 0; -} - static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) { struct page *page = e->pages; @@ -1777,7 +1798,6 @@ static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e) return 0; } - static inline void drbd_state_lock(struct drbd_conf *mdev) { wait_event(mdev->misc_wait, @@ -2230,7 +2250,7 @@ static inline void drbd_get_syncer_progress(struct drbd_conf *mdev, * Note: currently we don't support such large 
bitmaps on 32bit * arch anyways, but no harm done to be prepared for it here. */ - unsigned int shift = mdev->rs_total >= (1ULL << 32) ? 16 : 10; + unsigned int shift = mdev->rs_total > UINT_MAX ? 16 : 10; unsigned long left = *bits_left >> shift; unsigned long total = 1UL + (mdev->rs_total >> shift); unsigned long tmp = 1000UL - left * 1000UL/total; @@ -2306,12 +2326,12 @@ static inline int drbd_state_is_stable(struct drbd_conf *mdev) case D_OUTDATED: case D_CONSISTENT: case D_UP_TO_DATE: + case D_FAILED: /* disk state is stable as well. */ break; /* no new io accepted during tansitional states */ case D_ATTACHING: - case D_FAILED: case D_NEGOTIATING: case D_UNKNOWN: case D_MASK: diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 211fc44f84be..920ede2829d6 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -139,6 +139,8 @@ struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */ struct kmem_cache *drbd_al_ext_cache; /* activity log extents */ mempool_t *drbd_request_mempool; mempool_t *drbd_ee_mempool; +mempool_t *drbd_md_io_page_pool; +struct bio_set *drbd_md_io_bio_set; /* I do not use a standard mempool, because: 1) I want to hand out the pre-allocated objects first. 
@@ -159,7 +161,24 @@ static const struct block_device_operations drbd_ops = { .release = drbd_release, }; -#define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) +static void bio_destructor_drbd(struct bio *bio) +{ + bio_free(bio, drbd_md_io_bio_set); +} + +struct bio *bio_alloc_drbd(gfp_t gfp_mask) +{ + struct bio *bio; + + if (!drbd_md_io_bio_set) + return bio_alloc(gfp_mask, 1); + + bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set); + if (!bio) + return NULL; + bio->bi_destructor = bio_destructor_drbd; + return bio; +} #ifdef __CHECKER__ /* When checking with sparse, and this is an inline function, sparse will @@ -208,6 +227,7 @@ static int tl_init(struct drbd_conf *mdev) mdev->oldest_tle = b; mdev->newest_tle = b; INIT_LIST_HEAD(&mdev->out_of_sequence_requests); + INIT_LIST_HEAD(&mdev->barrier_acked_requests); mdev->tl_hash = NULL; mdev->tl_hash_s = 0; @@ -246,9 +266,7 @@ void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new) new->n_writes = 0; newest_before = mdev->newest_tle; - /* never send a barrier number == 0, because that is special-cased - * when using TCQ for our write ordering code */ - new->br_number = (newest_before->br_number+1) ?: 1; + new->br_number = newest_before->br_number+1; if (mdev->newest_tle != new) { mdev->newest_tle->next = new; mdev->newest_tle = new; @@ -311,7 +329,7 @@ void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr, These have been list_move'd to the out_of_sequence_requests list in _req_mod(, barrier_acked) above. */ - list_del_init(&b->requests); + list_splice_init(&b->requests, &mdev->barrier_acked_requests); nob = b->next; if (test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { @@ -411,6 +429,23 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) b = tmp; list_splice(&carry_reads, &b->requests); } + + /* Actions operating on the disk state, also want to work on + requests that got barrier acked. 
*/ + switch (what) { + case fail_frozen_disk_io: + case restart_frozen_disk_io: + list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { + req = list_entry(le, struct drbd_request, tl_requests); + _req_mod(req, what); + } + + case connection_lost_while_pending: + case resend: + break; + default: + dev_err(DEV, "what = %d in _tl_restart()\n", what); + } } @@ -458,6 +493,38 @@ void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what) } /** + * tl_abort_disk_io() - Abort disk I/O for all requests for a certain mdev in the TL + * @mdev: DRBD device. + */ +void tl_abort_disk_io(struct drbd_conf *mdev) +{ + struct drbd_tl_epoch *b; + struct list_head *le, *tle; + struct drbd_request *req; + + spin_lock_irq(&mdev->req_lock); + b = mdev->oldest_tle; + while (b) { + list_for_each_safe(le, tle, &b->requests) { + req = list_entry(le, struct drbd_request, tl_requests); + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + _req_mod(req, abort_disk_io); + } + b = b->next; + } + + list_for_each_safe(le, tle, &mdev->barrier_acked_requests) { + req = list_entry(le, struct drbd_request, tl_requests); + if (!(req->rq_state & RQ_LOCAL_PENDING)) + continue; + _req_mod(req, abort_disk_io); + } + + spin_unlock_irq(&mdev->req_lock); +} + +/** * cl_wide_st_chg() - true if the state change is a cluster wide one * @mdev: DRBD device. * @os: old (current) state. 
@@ -470,7 +537,7 @@ static int cl_wide_st_chg(struct drbd_conf *mdev, ((os.role != R_PRIMARY && ns.role == R_PRIMARY) || (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) || - (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))) || + (os.disk != D_FAILED && ns.disk == D_FAILED))) || (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) || (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S); } @@ -509,8 +576,16 @@ static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state); static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *, union drbd_state, union drbd_state); +enum sanitize_state_warnings { + NO_WARNING, + ABORTED_ONLINE_VERIFY, + ABORTED_RESYNC, + CONNECTION_LOST_NEGOTIATING, + IMPLICITLY_UPGRADED_DISK, + IMPLICITLY_UPGRADED_PDSK, +}; static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, const char **warn_sync_abort); + union drbd_state ns, enum sanitize_state_warnings *warn); int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state); @@ -785,6 +860,13 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) rv = SS_IN_TRANSIENT_STATE; + /* While establishing a connection only allow cstate to change. + Delay/refuse role changes, detach attach etc... 
*/ + if (test_bit(STATE_SENT, &mdev->flags) && + !(os.conn == C_WF_REPORT_PARAMS || + (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) + rv = SS_IN_TRANSIENT_STATE; + if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) rv = SS_NEED_CONNECTION; @@ -803,6 +885,21 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, return rv; } +static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn) +{ + static const char *msg_table[] = { + [NO_WARNING] = "", + [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.", + [ABORTED_RESYNC] = "Resync aborted.", + [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!", + [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk", + [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk", + }; + + if (warn != NO_WARNING) + dev_warn(DEV, "%s\n", msg_table[warn]); +} + /** * sanitize_state() - Resolves implicitly necessary additional changes to a state transition * @mdev: DRBD device. @@ -814,11 +911,14 @@ is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns, * to D_UNKNOWN. This rule and many more along those lines are in this function. */ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os, - union drbd_state ns, const char **warn_sync_abort) + union drbd_state ns, enum sanitize_state_warnings *warn) { enum drbd_fencing_p fp; enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max; + if (warn) + *warn = NO_WARNING; + fp = FP_DONT_CARE; if (get_ldev(mdev)) { fp = mdev->ldev->dc.fencing; @@ -833,18 +933,13 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow. * If you try to go into some Sync* state, that shall fail (elsewhere). 
*/ if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN && - ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_TEAR_DOWN) + ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED) ns.conn = os.conn; /* we cannot fail (again) if we already detached */ if (ns.disk == D_FAILED && os.disk == D_DISKLESS) ns.disk = D_DISKLESS; - /* if we are only D_ATTACHING yet, - * we can (and should) go directly to D_DISKLESS. */ - if (ns.disk == D_FAILED && os.disk == D_ATTACHING) - ns.disk = D_DISKLESS; - /* After C_DISCONNECTING only C_STANDALONE may follow */ if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE) ns.conn = os.conn; @@ -863,10 +958,9 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state /* Abort resync if a disk fails/detaches */ if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) { - if (warn_sync_abort) - *warn_sync_abort = - os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? - "Online-verify" : "Resync"; + if (warn) + *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ? 
+ ABORTED_ONLINE_VERIFY : ABORTED_RESYNC; ns.conn = C_CONNECTED; } @@ -877,7 +971,8 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns.disk = mdev->new_state_tmp.disk; ns.pdsk = mdev->new_state_tmp.pdsk; } else { - dev_alert(DEV, "Connection lost while negotiating, no data!\n"); + if (warn) + *warn = CONNECTION_LOST_NEGOTIATING; ns.disk = D_DISKLESS; ns.pdsk = D_UNKNOWN; } @@ -959,16 +1054,16 @@ static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns.disk = disk_max; if (ns.disk < disk_min) { - dev_warn(DEV, "Implicitly set disk from %s to %s\n", - drbd_disk_str(ns.disk), drbd_disk_str(disk_min)); + if (warn) + *warn = IMPLICITLY_UPGRADED_DISK; ns.disk = disk_min; } if (ns.pdsk > pdsk_max) ns.pdsk = pdsk_max; if (ns.pdsk < pdsk_min) { - dev_warn(DEV, "Implicitly set pdsk from %s to %s\n", - drbd_disk_str(ns.pdsk), drbd_disk_str(pdsk_min)); + if (warn) + *warn = IMPLICITLY_UPGRADED_PDSK; ns.pdsk = pdsk_min; } @@ -1045,12 +1140,12 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, { union drbd_state os; enum drbd_state_rv rv = SS_SUCCESS; - const char *warn_sync_abort = NULL; + enum sanitize_state_warnings ssw; struct after_state_chg_work *ascw; os = mdev->state; - ns = sanitize_state(mdev, os, ns, &warn_sync_abort); + ns = sanitize_state(mdev, os, ns, &ssw); if (ns.i == os.i) return SS_NOTHING_TO_DO; @@ -1076,8 +1171,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, return rv; } - if (warn_sync_abort) - dev_warn(DEV, "%s aborted.\n", warn_sync_abort); + print_sanitize_warnings(mdev, ssw); { char *pbp, pb[300]; @@ -1243,7 +1337,7 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, drbd_thread_stop_nowait(&mdev->receiver); /* Upon network failure, we need to restart the receiver. 
*/ - if (os.conn > C_TEAR_DOWN && + if (os.conn > C_WF_CONNECTION && ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT) drbd_thread_restart_nowait(&mdev->receiver); @@ -1251,6 +1345,15 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns, if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) drbd_resume_al(mdev); + /* remember last connect and attach times so request_timer_fn() won't + * kill newly established sessions while we are still trying to thaw + * previously frozen IO */ + if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS) + mdev->last_reconnect_jif = jiffies; + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + ns.disk > D_NEGOTIATING) + mdev->last_reattach_jif = jiffies; + ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC); if (ascw) { ascw->os = os; @@ -1354,12 +1457,16 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Here we have the actions that are performed after a state change. This function might sleep */ + if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING) + mod_timer(&mdev->request_timer, jiffies + HZ); + nsm.i = -1; if (ns.susp_nod) { if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) what = resend; - if (os.disk == D_ATTACHING && ns.disk > D_ATTACHING) + if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) && + ns.disk > D_NEGOTIATING) what = restart_frozen_disk_io; if (what != nothing) @@ -1408,7 +1515,7 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Do not change the order of the if above and the two below... 
*/ if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */ drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_state(mdev, ns); } /* No point in queuing send_bitmap if we don't have a connection * anymore, so check also the _current_ state, not only the new state @@ -1441,11 +1548,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, } if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) { - if (ns.peer == R_PRIMARY && mdev->ldev->md.uuid[UI_BITMAP] == 0) { + if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && + mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { drbd_uuid_new_current(mdev); drbd_send_uuids(mdev); } - /* D_DISKLESS Peer becomes secondary */ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY) /* We may still be Primary ourselves. @@ -1473,14 +1580,14 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) { drbd_send_sizes(mdev, 0, 0); /* to start sync... */ drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_state(mdev, ns); } /* We want to pause/continue resync, tell peer. */ if (ns.conn >= C_CONNECTED && ((os.aftr_isp != ns.aftr_isp) || (os.user_isp != ns.user_isp))) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); /* In case one of the isp bits got set, suspend other devices. */ if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) && @@ -1490,10 +1597,10 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* Make sure the peer gets informed about eventual state changes (ISP bits) while we were in WFReportParams. */ if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); if (os.conn != C_AHEAD && ns.conn == C_AHEAD) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); /* We are in the progress to start a full sync... 
*/ if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) || @@ -1513,33 +1620,38 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* first half of local IO error, failure to attach, * or administrative detach */ if (os.disk != D_FAILED && ns.disk == D_FAILED) { - enum drbd_io_error_p eh; - int was_io_error; + enum drbd_io_error_p eh = EP_PASS_ON; + int was_io_error = 0; /* corresponding get_ldev was in __drbd_set_state, to serialize - * our cleanup here with the transition to D_DISKLESS, - * so it is safe to dreference ldev here. */ - eh = mdev->ldev->dc.on_io_error; - was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); - - /* current state still has to be D_FAILED, - * there is only one way out: to D_DISKLESS, - * and that may only happen after our put_ldev below. */ - if (mdev->state.disk != D_FAILED) - dev_err(DEV, - "ASSERT FAILED: disk is %s during detach\n", - drbd_disk_str(mdev->state.disk)); - - if (drbd_send_state(mdev)) - dev_warn(DEV, "Notified peer that I am detaching my disk\n"); - else - dev_err(DEV, "Sending state for detaching disk failed\n"); - - drbd_rs_cancel_all(mdev); - - /* In case we want to get something to stable storage still, - * this may be the last chance. - * Following put_ldev may transition to D_DISKLESS. */ - drbd_md_sync(mdev); + * our cleanup here with the transition to D_DISKLESS. + * But is is still not save to dreference ldev here, since + * we might come from an failed Attach before ldev was set. */ + if (mdev->ldev) { + eh = mdev->ldev->dc.on_io_error; + was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags); + + /* Immediately allow completion of all application IO, that waits + for completion from the local disk. */ + tl_abort_disk_io(mdev); + + /* current state still has to be D_FAILED, + * there is only one way out: to D_DISKLESS, + * and that may only happen after our put_ldev below. 
*/ + if (mdev->state.disk != D_FAILED) + dev_err(DEV, + "ASSERT FAILED: disk is %s during detach\n", + drbd_disk_str(mdev->state.disk)); + + if (ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); + + drbd_rs_cancel_all(mdev); + + /* In case we want to get something to stable storage still, + * this may be the last chance. + * Following put_ldev may transition to D_DISKLESS. */ + drbd_md_sync(mdev); + } put_ldev(mdev); if (was_io_error && eh == EP_CALL_HELPER) @@ -1561,16 +1673,17 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, mdev->rs_failed = 0; atomic_set(&mdev->rs_pending_cnt, 0); - if (drbd_send_state(mdev)) - dev_warn(DEV, "Notified peer that I'm now diskless.\n"); + if (ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); + /* corresponding get_ldev in __drbd_set_state * this may finally trigger drbd_ldev_destroy. */ put_ldev(mdev); } /* Notify peer that I had a local IO error, and did not detached.. */ - if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT) - drbd_send_state(mdev); + if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED) + drbd_send_state(mdev, ns); /* Disks got bigger while they were detached */ if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING && @@ -1588,7 +1701,13 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, /* sync target done with resync. Explicitly notify peer, even though * it should (at least for non-empty resyncs) already know itself. 
*/ if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED) - drbd_send_state(mdev); + drbd_send_state(mdev, ns); + + /* Wake up role changes, that were delayed because of connection establishing */ + if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) { + clear_bit(STATE_SENT, &mdev->flags); + wake_up(&mdev->state_wait); + } /* This triggers bitmap writeout of potentially still unwritten pages * if the resync finished cleanly, or aborted because of peer disk @@ -1598,8 +1717,8 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os, * No harm done if some bits change during this phase. */ if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) { - drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL, - "write from resync_finished", BM_LOCKED_SET_ALLOWED); + drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL, + "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED); put_ldev(mdev); } @@ -2057,7 +2176,11 @@ int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev) D_ASSERT(mdev->state.disk == D_UP_TO_DATE); - uuid = mdev->ldev->md.uuid[UI_BITMAP] + UUID_NEW_BM_OFFSET; + uuid = mdev->ldev->md.uuid[UI_BITMAP]; + if (uuid && uuid != UUID_JUST_CREATED) + uuid = uuid + UUID_NEW_BM_OFFSET; + else + get_random_bytes(&uuid, sizeof(u64)); drbd_uuid_set(mdev, UI_BITMAP, uuid); drbd_print_uuids(mdev, "updated sync UUID"); drbd_md_sync(mdev); @@ -2089,6 +2212,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */ } + /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */ + if (mdev->agreed_pro_version <= 94) + max_bio_size = min_t(int, max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + p.d_size = cpu_to_be64(d_size); p.u_size = cpu_to_be64(u_size); p.c_size = cpu_to_be64(trigger_reply ? 
0 : drbd_get_capacity(mdev->this_bdev)); @@ -2102,10 +2229,10 @@ int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags fl } /** - * drbd_send_state() - Sends the drbd state to the peer + * drbd_send_current_state() - Sends the drbd state to the peer * @mdev: DRBD device. */ -int drbd_send_state(struct drbd_conf *mdev) +int drbd_send_current_state(struct drbd_conf *mdev) { struct socket *sock; struct p_state p; @@ -2131,6 +2258,37 @@ int drbd_send_state(struct drbd_conf *mdev) return ok; } +/** + * drbd_send_state() - After a state change, sends the new state to the peer + * @mdev: DRBD device. + * @state: the state to send, not necessarily the current state. + * + * Each state change queues an "after_state_ch" work, which will eventually + * send the resulting new state to the peer. If more state changes happen + * between queuing and processing of the after_state_ch work, we still + * want to send each intermediary state in the order it occurred. + */ +int drbd_send_state(struct drbd_conf *mdev, union drbd_state state) +{ + struct socket *sock; + struct p_state p; + int ok = 0; + + mutex_lock(&mdev->data.mutex); + + p.state = cpu_to_be32(state.i); + sock = mdev->data.socket; + + if (likely(sock != NULL)) { + ok = _drbd_send_cmd(mdev, sock, P_STATE, + (struct p_header80 *)&p, sizeof(p), 0); + } + + mutex_unlock(&mdev->data.mutex); + + return ok; +} + int drbd_send_state_req(struct drbd_conf *mdev, union drbd_state mask, union drbd_state val) { @@ -2615,7 +2773,7 @@ static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio) struct bio_vec *bvec; int i; /* hint all but last page with MSG_MORE */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment(bvec, bio, i) { if (!_drbd_no_send_page(mdev, bvec->bv_page, bvec->bv_offset, bvec->bv_len, i == bio->bi_vcnt -1 ? 
0 : MSG_MORE)) @@ -2629,7 +2787,7 @@ static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio) struct bio_vec *bvec; int i; /* hint all but last page with MSG_MORE */ - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment(bvec, bio, i) { if (!_drbd_send_page(mdev, bvec->bv_page, bvec->bv_offset, bvec->bv_len, i == bio->bi_vcnt -1 ? 0 : MSG_MORE)) @@ -2695,8 +2853,7 @@ int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req) p.sector = cpu_to_be64(req->sector); p.block_id = (unsigned long)req; - p.seq_num = cpu_to_be32(req->seq_num = - atomic_add_return(1, &mdev->packet_seq)); + p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq)); dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw); @@ -2987,8 +3144,8 @@ void drbd_init_set_defaults(struct drbd_conf *mdev) atomic_set(&mdev->rs_sect_in, 0); atomic_set(&mdev->rs_sect_ev, 0); atomic_set(&mdev->ap_in_flight, 0); + atomic_set(&mdev->md_io_in_use, 0); - mutex_init(&mdev->md_io_mutex); mutex_init(&mdev->data.mutex); mutex_init(&mdev->meta.mutex); sema_init(&mdev->data.work.s, 0); @@ -3126,6 +3283,10 @@ static void drbd_destroy_mempools(void) /* D_ASSERT(atomic_read(&drbd_pp_vacant)==0); */ + if (drbd_md_io_bio_set) + bioset_free(drbd_md_io_bio_set); + if (drbd_md_io_page_pool) + mempool_destroy(drbd_md_io_page_pool); if (drbd_ee_mempool) mempool_destroy(drbd_ee_mempool); if (drbd_request_mempool) @@ -3139,6 +3300,8 @@ static void drbd_destroy_mempools(void) if (drbd_al_ext_cache) kmem_cache_destroy(drbd_al_ext_cache); + drbd_md_io_bio_set = NULL; + drbd_md_io_page_pool = NULL; drbd_ee_mempool = NULL; drbd_request_mempool = NULL; drbd_ee_cache = NULL; @@ -3162,6 +3325,8 @@ static int drbd_create_mempools(void) drbd_bm_ext_cache = NULL; drbd_al_ext_cache = NULL; drbd_pp_pool = NULL; + drbd_md_io_page_pool = NULL; + drbd_md_io_bio_set = NULL; /* caches */ drbd_request_cache = kmem_cache_create( @@ -3185,6 +3350,16 @@ static int drbd_create_mempools(void) goto Enomem; /* 
mempools */ +#ifdef COMPAT_HAVE_BIOSET_CREATE + drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_bio_set == NULL) + goto Enomem; +#endif + + drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0); + if (drbd_md_io_page_pool == NULL) + goto Enomem; + drbd_request_mempool = mempool_create(number, mempool_alloc_slab, mempool_free_slab, drbd_request_cache); if (drbd_request_mempool == NULL) @@ -3262,6 +3437,8 @@ static void drbd_delete_device(unsigned int minor) if (!mdev) return; + del_timer_sync(&mdev->request_timer); + /* paranoia asserts */ if (mdev->open_cnt != 0) dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt, @@ -3666,8 +3843,10 @@ void drbd_md_sync(struct drbd_conf *mdev) if (!get_ldev_if_state(mdev, D_FAILED)) return; - mutex_lock(&mdev->md_io_mutex); - buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + goto out; + memset(buffer, 0, 512); buffer->la_size = cpu_to_be64(drbd_get_capacity(mdev->this_bdev)); @@ -3698,7 +3877,8 @@ void drbd_md_sync(struct drbd_conf *mdev) * since we updated it on metadata. 
*/ mdev->ldev->md.la_size_sect = drbd_get_capacity(mdev->this_bdev); - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); +out: put_ldev(mdev); } @@ -3718,8 +3898,9 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) if (!get_ldev_if_state(mdev, D_ATTACHING)) return ERR_IO_MD_DISK; - mutex_lock(&mdev->md_io_mutex); - buffer = (struct meta_data_on_disk *)page_address(mdev->md_io_page); + buffer = drbd_md_get_buffer(mdev); + if (!buffer) + goto out; if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) { /* NOTE: can't do normal error processing here as this is @@ -3780,7 +3961,8 @@ int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev) mdev->sync_conf.al_extents = 127; err: - mutex_unlock(&mdev->md_io_mutex); + drbd_md_put_buffer(mdev); + out: put_ldev(mdev); return rv; @@ -4183,12 +4365,11 @@ const char *drbd_buildtag(void) static char buildtag[38] = "\0uilt-in"; if (buildtag[0] == 0) { -#ifdef CONFIG_MODULES - if (THIS_MODULE != NULL) - sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); - else +#ifdef MODULE + sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion); +#else + buildtag[0] = 'b'; #endif - buildtag[0] = 'b'; } return buildtag; diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index af2a25049bce..6d4de6a72e80 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -179,7 +179,7 @@ int drbd_khelper(struct drbd_conf *mdev, char *cmd) dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb); drbd_bcast_ev_helper(mdev, cmd); - ret = call_usermodehelper(usermode_helper, argv, envp, 1); + ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); if (ret) dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n", usermode_helper, cmd, mb, @@ -289,7 +289,7 @@ static int _try_outdate_peer_async(void *data) */ spin_lock_irq(&mdev->req_lock); ns = mdev->state; - if (ns.conn < C_WF_REPORT_PARAMS) { + if 
(ns.conn < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &mdev->flags)) { ns.pdsk = nps; _drbd_set_state(mdev, ns, CS_VERBOSE, NULL); } @@ -432,7 +432,7 @@ drbd_set_role(struct drbd_conf *mdev, enum drbd_role new_role, int force) /* if this was forced, we should consider sync */ if (forced) drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); } drbd_md_sync(mdev); @@ -845,9 +845,10 @@ void drbd_reconsider_max_bio_size(struct drbd_conf *mdev) Because new from 8.3.8 onwards the peer can use multiple BIOs for a single peer_request */ if (mdev->state.conn >= C_CONNECTED) { - if (mdev->agreed_pro_version < 94) - peer = mdev->peer_max_bio_size; - else if (mdev->agreed_pro_version == 94) + if (mdev->agreed_pro_version < 94) { + peer = min_t(int, mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET); + /* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */ + } else if (mdev->agreed_pro_version == 94) peer = DRBD_MAX_SIZE_H80_PACKET; else /* drbd 8.3.8 onwards */ peer = DRBD_MAX_BIO_SIZE; @@ -1032,7 +1033,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp dev_err(DEV, "max capacity %llu smaller than disk size %llu\n", (unsigned long long) drbd_get_max_capacity(nbc), (unsigned long long) nbc->dc.disk_size); - retcode = ERR_DISK_TO_SMALL; + retcode = ERR_DISK_TOO_SMALL; goto fail; } @@ -1046,7 +1047,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp } if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { - retcode = ERR_MD_DISK_TO_SMALL; + retcode = ERR_MD_DISK_TOO_SMALL; dev_warn(DEV, "refusing attach: md-device too small, " "at least %llu sectors needed for this meta-disk type\n", (unsigned long long) min_md_device_sectors); @@ -1057,7 +1058,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp * (we may currently be R_PRIMARY with no local disk...) 
*/ if (drbd_get_max_capacity(nbc) < drbd_get_capacity(mdev->this_bdev)) { - retcode = ERR_DISK_TO_SMALL; + retcode = ERR_DISK_TOO_SMALL; goto fail; } @@ -1138,7 +1139,7 @@ static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) { dev_warn(DEV, "refusing to truncate a consistent device\n"); - retcode = ERR_DISK_TO_SMALL; + retcode = ERR_DISK_TOO_SMALL; goto force_diskless_dec; } @@ -1336,17 +1337,34 @@ static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, { enum drbd_ret_code retcode; int ret; + struct detach dt = {}; + + if (!detach_from_tags(mdev, nlp->tag_list, &dt)) { + reply->ret_code = ERR_MANDATORY_TAG; + goto out; + } + + if (dt.detach_force) { + drbd_force_state(mdev, NS(disk, D_FAILED)); + reply->ret_code = SS_SUCCESS; + goto out; + } + drbd_suspend_io(mdev); /* so no-one is stuck in drbd_al_begin_io */ + drbd_md_get_buffer(mdev); /* make sure there is no in-flight meta-data IO */ retcode = drbd_request_state(mdev, NS(disk, D_FAILED)); + drbd_md_put_buffer(mdev); /* D_FAILED will transition to DISKLESS. 
*/ ret = wait_event_interruptible(mdev->misc_wait, mdev->state.disk != D_FAILED); drbd_resume_io(mdev); + if ((int)retcode == (int)SS_IS_DISKLESS) retcode = SS_NOTHING_TO_DO; if (ret) retcode = ERR_INTR; reply->ret_code = retcode; +out: return 0; } @@ -1711,7 +1729,7 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, if (rs.no_resync && mdev->agreed_pro_version < 93) { retcode = ERR_NEED_APV_93; - goto fail; + goto fail_ldev; } if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev)) @@ -1738,6 +1756,10 @@ static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, fail: reply->ret_code = retcode; return 0; + + fail_ldev: + put_ldev(mdev); + goto fail; } static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, @@ -1941,6 +1963,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. */ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED); @@ -1959,6 +1982,7 @@ static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nl retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T)); } + drbd_resume_io(mdev); reply->ret_code = retcode; return 0; @@ -1980,6 +2004,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re /* If there is still bitmap IO pending, probably because of a previous * resync just being finished, wait for it before requesting a new resync. 
*/ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED); @@ -1998,6 +2023,7 @@ static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_re } else retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S)); } + drbd_resume_io(mdev); reply->ret_code = retcode; return 0; @@ -2170,11 +2196,13 @@ static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp, /* If there is still bitmap IO pending, e.g. previous resync or verify * just being finished, wait for it before requesting a new resync. */ + drbd_suspend_io(mdev); wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags)); /* w_make_ov_request expects position to be aligned */ mdev->ov_start_sector = args.start_sector & ~BM_SECT_PER_BIT; reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S)); + drbd_resume_io(mdev); return 0; } @@ -2297,7 +2325,7 @@ static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms return; } - if (!cap_raised(current_cap(), CAP_SYS_ADMIN)) { + if (!capable(CAP_SYS_ADMIN)) { retcode = ERR_PERM; goto fail; } @@ -2526,10 +2554,10 @@ void drbd_bcast_ee(struct drbd_conf *mdev, page = e->pages; page_chain_for_each(page) { - void *d = kmap_atomic(page, KM_USER0); + void *d = kmap_atomic(page); unsigned l = min_t(unsigned, len, PAGE_SIZE); memcpy(tl, d, l); - kunmap_atomic(d, KM_USER0); + kunmap_atomic(d); tl = (unsigned short*)((char*)tl + l); len -= l; if (len == 0) diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 2959cdfb77f5..869bada2ed06 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -52,7 +52,7 @@ void seq_printf_with_thousands_grouping(struct seq_file *seq, long v) if (unlikely(v >= 1000000)) { /* cool: > GiByte/s */ seq_printf(seq, "%ld,", v / 1000000); - v /= 1000000; + v %= 1000000; seq_printf(seq, "%03ld,%03ld", v/1000, v % 
1000); } else if (likely(v >= 1000)) seq_printf(seq, "%ld,%03ld", v/1000, v % 1000); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 43beaca53179..ea4836e0ae98 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -466,6 +466,7 @@ static int drbd_accept(struct drbd_conf *mdev, const char **what, goto out; } (*newsock)->ops = sock->ops; + __module_get((*newsock)->ops->owner); out: return err; @@ -664,7 +665,7 @@ static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev) timeo = mdev->net_conf->try_connect_int * HZ; timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */ - s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */ + s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ s_listen->sk->sk_rcvtimeo = timeo; s_listen->sk->sk_sndtimeo = timeo; drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size, @@ -750,6 +751,7 @@ static int drbd_connect(struct drbd_conf *mdev) { struct socket *s, *sock, *msock; int try, h, ok; + enum drbd_state_rv rv; D_ASSERT(!mdev->data.socket); @@ -841,8 +843,8 @@ retry: } } while (1); - msock->sk->sk_reuse = 1; /* SO_REUSEADDR */ - sock->sk->sk_reuse = 1; /* SO_REUSEADDR */ + msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ + sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */ sock->sk->sk_allocation = GFP_NOIO; msock->sk->sk_allocation = GFP_NOIO; @@ -888,25 +890,32 @@ retry: } } - if (drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS)) < SS_SUCCESS) - return 0; - sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; atomic_set(&mdev->packet_seq, 0); mdev->peer_seq = 0; - drbd_thread_start(&mdev->asender); - if (drbd_send_protocol(mdev) == -1) return -1; + set_bit(STATE_SENT, &mdev->flags); drbd_send_sync_param(mdev, &mdev->sync_conf); drbd_send_sizes(mdev, 0, 0); drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); clear_bit(USE_DEGR_WFC_T, 
&mdev->flags); clear_bit(RESIZE_PENDING, &mdev->flags); + + spin_lock_irq(&mdev->req_lock); + rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL); + if (mdev->state.conn != C_WF_REPORT_PARAMS) + clear_bit(STATE_SENT, &mdev->flags); + spin_unlock_irq(&mdev->req_lock); + + if (rv < SS_SUCCESS) + return 0; + + drbd_thread_start(&mdev->asender); mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */ return 1; @@ -957,7 +966,7 @@ static void drbd_flush(struct drbd_conf *mdev) rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_KERNEL, NULL); if (rv) { - dev_err(DEV, "local disk flush failed with status %d\n", rv); + dev_info(DEV, "local disk flush failed with status %d\n", rv); /* would rather check on EOPNOTSUPP, but that is not reliable. * don't try again for ANY return value != 0 * if (rv == -EOPNOTSUPP) */ @@ -1001,13 +1010,14 @@ static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev, if (epoch_size != 0 && atomic_read(&epoch->active) == 0 && - test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) { + (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) { if (!(ev & EV_CLEANUP)) { spin_unlock(&mdev->epoch_lock); drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size); spin_lock(&mdev->epoch_lock); } - dec_unacked(mdev); + if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags)) + dec_unacked(mdev); if (mdev->current_epoch != epoch) { next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list); @@ -1096,7 +1106,11 @@ int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, /* In most cases, we will only need one bio. But in case the lower * level restrictions happen to be different at this offset on this * side than those of the sending peer, we may need to submit the - * request in more than one bio. */ + * request in more than one bio. + * + * Plain bio_alloc is good enough here, this is no DRBD internally + * generated bio, but a bio allocated on behalf of the peer. 
+ */ next_bio: bio = bio_alloc(GFP_NOIO, nr_pages); if (!bio) { @@ -1583,6 +1597,24 @@ static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int u return ok; } +static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e) +{ + + struct drbd_epoch_entry *rs_e; + bool rv = 0; + + spin_lock_irq(&mdev->req_lock); + list_for_each_entry(rs_e, &mdev->sync_ee, w.list) { + if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) { + rv = 1; + break; + } + } + spin_unlock_irq(&mdev->req_lock); + + return rv; +} + /* Called from receive_Data. * Synchronize packets on sock with packets on msock. * @@ -1826,6 +1858,9 @@ static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned list_add(&e->w.list, &mdev->active_ee); spin_unlock_irq(&mdev->req_lock); + if (mdev->state.conn == C_SYNC_TARGET) + wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e)); + switch (mdev->net_conf->wire_protocol) { case DRBD_PROT_C: inc_unacked(mdev); @@ -2420,7 +2455,7 @@ static int drbd_uuid_compare(struct drbd_conf *mdev, int *rule_nr) __must_hold(l mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START]; mdev->p_uuid[UI_HISTORY_START] = mdev->p_uuid[UI_HISTORY_START + 1]; - dev_info(DEV, "Did not got last syncUUID packet, corrected:\n"); + dev_info(DEV, "Lost last syncUUID packet, corrected:\n"); drbd_uuid_dump(mdev, "peer", mdev->p_uuid, mdev->p_uuid[UI_SIZE], mdev->p_uuid[UI_FLAGS]); return -1; @@ -2806,10 +2841,10 @@ static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsi if (apv >= 88) { if (apv == 88) { - if (data_size > SHARED_SECRET_MAX) { - dev_err(DEV, "verify-alg too long, " - "peer wants %u, accepting only %u byte\n", - data_size, SHARED_SECRET_MAX); + if (data_size > SHARED_SECRET_MAX || data_size == 0) { + dev_err(DEV, "verify-alg of wrong size, " + "peer wants %u, accepting only up to %u byte\n", + data_size, SHARED_SECRET_MAX); return false; } @@ -3168,9 +3203,20 @@ 
static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned os = ns = mdev->state; spin_unlock_irq(&mdev->req_lock); - /* peer says his disk is uptodate, while we think it is inconsistent, - * and this happens while we think we have a sync going on. */ - if (os.pdsk == D_INCONSISTENT && real_peer_disk == D_UP_TO_DATE && + /* If some other part of the code (asender thread, timeout) + * already decided to close the connection again, + * we must not "re-establish" it here. */ + if (os.conn <= C_TEAR_DOWN) + return false; + + /* If this is the "end of sync" confirmation, usually the peer disk + * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits + * set) resync started in PausedSyncT, or if the timing of pause-/ + * unpause-sync events has been "just right", the peer disk may + * transition from D_CONSISTENT to D_UP_TO_DATE as well. + */ + if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) && + real_peer_disk == D_UP_TO_DATE && os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) { /* If we are (becoming) SyncSource, but peer is still in sync * preparation, ignore its uptodate-ness to avoid flapping, it @@ -3288,7 +3334,7 @@ static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned /* Nowadays only used when forcing a node into primary role and setting its disk to UpToDate with that */ drbd_send_uuids(mdev); - drbd_send_state(mdev); + drbd_send_current_state(mdev); } } @@ -3776,6 +3822,13 @@ static void drbd_disconnect(struct drbd_conf *mdev) if (mdev->state.conn == C_STANDALONE) return; + /* We are about to start the cleanup after connection loss. + * Make sure drbd_make_request knows about that. + * Usually we should be in some network failure state already, + * but just in case we are not, we fix it up here. + */ + drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE)); + /* asender does not clean up anything. 
it must not interfere, either */ drbd_thread_stop(&mdev->asender); drbd_free_sock(mdev); @@ -3803,8 +3856,6 @@ static void drbd_disconnect(struct drbd_conf *mdev) atomic_set(&mdev->rs_pending_cnt, 0); wake_up(&mdev->misc_wait); - del_timer(&mdev->request_timer); - /* make sure syncer is stopped and w_resume_next_sg queued */ del_timer_sync(&mdev->resync_timer); resync_timer_fn((unsigned long)mdev); @@ -4433,7 +4484,7 @@ static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h) if (mdev->state.conn == C_AHEAD && atomic_read(&mdev->ap_in_flight) == 0 && - !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags)) { + !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) { mdev->start_resync_timer.expires = jiffies + HZ; add_timer(&mdev->start_resync_timer); } diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 4a0f314086e5..9c5c84946b05 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -37,6 +37,7 @@ static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req const int rw = bio_data_dir(bio); int cpu; cpu = part_stat_lock(); + part_round_stats(cpu, &mdev->vdisk->part0); part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]); part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio)); part_inc_in_flight(&mdev->vdisk->part0, rw); @@ -214,8 +215,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) { const unsigned long s = req->rq_state; struct drbd_conf *mdev = req->mdev; - /* only WRITES may end up here without a master bio (on barrier ack) */ - int rw = req->master_bio ? bio_data_dir(req->master_bio) : WRITE; + int rw = req->rq_state & RQ_WRITE ? 
WRITE : READ; /* we must not complete the master bio, while it is * still being processed by _drbd_send_zc_bio (drbd_send_dblock) @@ -230,7 +230,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) return; if (s & RQ_NET_PENDING) return; - if (s & RQ_LOCAL_PENDING) + if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) return; if (req->master_bio) { @@ -277,6 +277,9 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m) req->master_bio = NULL; } + if (s & RQ_LOCAL_PENDING) + return; + if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) { /* this is disconnected (local only) operation, * or protocol C P_WRITE_ACK, @@ -429,7 +432,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, break; case completed_ok: - if (bio_data_dir(req->master_bio) == WRITE) + if (req->rq_state & RQ_WRITE) mdev->writ_cnt += req->size>>9; else mdev->read_cnt += req->size>>9; @@ -438,7 +441,14 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state &= ~RQ_LOCAL_PENDING; _req_may_be_done_not_susp(req, m); - put_ldev(mdev); + break; + + case abort_disk_io: + req->rq_state |= RQ_LOCAL_ABORTED; + if (req->rq_state & RQ_WRITE) + _req_may_be_done_not_susp(req, m); + else + goto goto_queue_for_net_read; break; case write_completed_with_error: @@ -447,7 +457,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, __drbd_chk_io_error(mdev, false); _req_may_be_done_not_susp(req, m); - put_ldev(mdev); break; case read_ahead_completed_with_error: @@ -455,7 +464,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, req->rq_state |= RQ_LOCAL_COMPLETED; req->rq_state &= ~RQ_LOCAL_PENDING; _req_may_be_done_not_susp(req, m); - put_ldev(mdev); break; case read_completed_with_error: @@ -467,7 +475,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, D_ASSERT(!(req->rq_state & RQ_NET_MASK)); __drbd_chk_io_error(mdev, false); - put_ldev(mdev); + + goto_queue_for_net_read: /* 
no point in retrying if there is no good remote data, * or we have no connection. */ @@ -556,10 +565,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, drbd_queue_work(&mdev->data.work, &req->w); break; - case oos_handed_to_network: - /* actually the same */ + case read_retry_remote_canceled: case send_canceled: - /* treat it the same */ case send_failed: /* real cleanup will be done from tl_clear. just update flags * so it is no longer marked as on the worker queue */ @@ -589,17 +596,17 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, } req->rq_state &= ~RQ_NET_QUEUED; req->rq_state |= RQ_NET_SENT; - /* because _drbd_send_zc_bio could sleep, and may want to - * dereference the bio even after the "write_acked_by_peer" and - * "completed_ok" events came in, once we return from - * _drbd_send_zc_bio (drbd_send_dblock), we have to check - * whether it is done already, and end it. */ _req_may_be_done_not_susp(req, m); break; - case read_retry_remote_canceled: + case oos_handed_to_network: + /* Was not set PENDING, no longer QUEUED, so is now DONE + * as far as this connection is concerned. */ req->rq_state &= ~RQ_NET_QUEUED; - /* fall through, in case we raced with drbd_disconnect */ + req->rq_state |= RQ_NET_DONE; + _req_may_be_done_not_susp(req, m); + break; + case connection_lost_while_pending: /* transfer log cleanup after connection loss */ /* assert something? 
*/ @@ -616,8 +623,6 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, _req_may_be_done(req, m); /* Allowed while state.susp */ break; - case write_acked_by_peer_and_sis: - req->rq_state |= RQ_NET_SIS; case conflict_discarded_by_peer: /* for discarded conflicting writes of multiple primaries, * there is no need to keep anything in the tl, potential @@ -628,18 +633,15 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what, (unsigned long long)req->sector, req->size); req->rq_state |= RQ_NET_DONE; /* fall through */ + case write_acked_by_peer_and_sis: case write_acked_by_peer: + if (what == write_acked_by_peer_and_sis) + req->rq_state |= RQ_NET_SIS; /* protocol C; successfully written on peer. - * Nothing to do here. + * Nothing more to do here. * We want to keep the tl in place for all protocols, to cater - * for volatile write-back caches on lower level devices. - * - * A barrier request is expected to have forced all prior - * requests onto stable storage, so completion of a barrier - * request could set NET_DONE right here, and not wait for the - * P_BARRIER_ACK, but that is an unnecessary optimization. */ + * for volatile write-back caches on lower level devices. */ - /* this makes it effectively the same as for: */ case recv_acked_by_peer: /* protocol B; pretends to be successfully written on peer. 
* see also notes above in handed_over_to_network about @@ -773,6 +775,7 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns int local, remote, send_oos = 0; int err = -EIO; int ret = 0; + union drbd_state s; /* allocate outside of all locks; */ req = drbd_req_new(mdev, bio); @@ -834,8 +837,9 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns drbd_al_begin_io(mdev, sector); } - remote = remote && drbd_should_do_remote(mdev->state); - send_oos = rw == WRITE && drbd_should_send_oos(mdev->state); + s = mdev->state; + remote = remote && drbd_should_do_remote(s); + send_oos = rw == WRITE && drbd_should_send_oos(s); D_ASSERT(!(remote && send_oos)); if (!(local || remote) && !is_susp(mdev->state)) { @@ -867,7 +871,7 @@ allocate_barrier: if (is_susp(mdev->state)) { /* If we got suspended, use the retry mechanism of - generic_make_request() to restart processing of this + drbd_make_request() to restart processing of this bio. In the next call to drbd_make_request we sleep in inc_ap_bio() */ ret = 1; @@ -1091,7 +1095,6 @@ void drbd_make_request(struct request_queue *q, struct bio *bio) */ D_ASSERT(bio->bi_size > 0); D_ASSERT((bio->bi_size & 0x1ff) == 0); - D_ASSERT(bio->bi_idx == 0); /* to make some things easier, force alignment of requests within the * granularity of our hash tables */ @@ -1099,8 +1102,9 @@ void drbd_make_request(struct request_queue *q, struct bio *bio) e_enr = (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT; if (likely(s_enr == e_enr)) { - inc_ap_bio(mdev, 1); - drbd_make_request_common(mdev, bio, start_time); + do { + inc_ap_bio(mdev, 1); + } while (drbd_make_request_common(mdev, bio, start_time)); return; } @@ -1196,36 +1200,66 @@ void request_timer_fn(unsigned long data) struct drbd_conf *mdev = (struct drbd_conf *) data; struct drbd_request *req; /* oldest request */ struct list_head *le; - unsigned long et = 0; /* effective timeout = ko_count * timeout */ + unsigned long ent = 0, dt = 
0, et, nt; /* effective timeout = ko_count * timeout */ + unsigned long now; if (get_net_conf(mdev)) { - et = mdev->net_conf->timeout*HZ/10 * mdev->net_conf->ko_count; + if (mdev->state.conn >= C_WF_REPORT_PARAMS) + ent = mdev->net_conf->timeout*HZ/10 + * mdev->net_conf->ko_count; put_net_conf(mdev); } - if (!et || mdev->state.conn < C_WF_REPORT_PARAMS) + if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */ + dt = mdev->ldev->dc.disk_timeout * HZ / 10; + put_ldev(mdev); + } + et = min_not_zero(dt, ent); + + if (!et) return; /* Recurring timer stopped */ + now = jiffies; + spin_lock_irq(&mdev->req_lock); le = &mdev->oldest_tle->requests; if (list_empty(le)) { spin_unlock_irq(&mdev->req_lock); - mod_timer(&mdev->request_timer, jiffies + et); + mod_timer(&mdev->request_timer, now + et); return; } le = le->prev; req = list_entry(le, struct drbd_request, tl_requests); - if (time_is_before_eq_jiffies(req->start_time + et)) { - if (req->rq_state & RQ_NET_PENDING) { - dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); - _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE, NULL); - } else { - dev_warn(DEV, "Local backing block device frozen?\n"); - mod_timer(&mdev->request_timer, jiffies + et); - } - } else { - mod_timer(&mdev->request_timer, req->start_time + et); - } + /* The request is considered timed out, if + * - we have some effective timeout from the configuration, + * with above state restrictions applied, + * - the oldest request is waiting for a response from the network + * resp. the local disk, + * - the oldest request is in fact older than the effective timeout, + * - the connection was established (resp. disk was attached) + * for longer than the timeout already. + * Note that for 32bit jiffies and very stable connections/disks, + * we may have a wrap around, which is catched by + * !time_in_range(now, last_..._jif, last_..._jif + timeout). 
+ * + * Side effect: once per 32bit wrap-around interval, which means every + * ~198 days with 250 HZ, we have a window where the timeout would need + * to expire twice (worst case) to become effective. Good enough. + */ + if (ent && req->rq_state & RQ_NET_PENDING && + time_after(now, req->start_time + ent) && + !time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) { + dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n"); + _drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); + } + if (dt && req->rq_state & RQ_LOCAL_PENDING && + time_after(now, req->start_time + dt) && + !time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) { + dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n"); + __drbd_chk_io_error(mdev, 1); + } + nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et; spin_unlock_irq(&mdev->req_lock); + mod_timer(&mdev->request_timer, nt); } diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 68a234a5fdc5..3d2111919486 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -105,6 +105,7 @@ enum drbd_req_event { read_completed_with_error, read_ahead_completed_with_error, write_completed_with_error, + abort_disk_io, completed_ok, resend, fail_frozen_disk_io, @@ -118,18 +119,21 @@ enum drbd_req_event { * same time, so we should hold the request lock anyways. 
*/ enum drbd_req_state_bits { - /* 210 - * 000: no local possible - * 001: to be submitted + /* 3210 + * 0000: no local possible + * 0001: to be submitted * UNUSED, we could map: 011: submitted, completion still pending - * 110: completed ok - * 010: completed with error + * 0110: completed ok + * 0010: completed with error + * 1001: Aborted (before completion) + * 1x10: Aborted and completed -> free */ __RQ_LOCAL_PENDING, __RQ_LOCAL_COMPLETED, __RQ_LOCAL_OK, + __RQ_LOCAL_ABORTED, - /* 76543 + /* 87654 * 00000: no network possible * 00001: to be send * 00011: to be send, on worker queue @@ -199,8 +203,9 @@ enum drbd_req_state_bits { #define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING) #define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED) #define RQ_LOCAL_OK (1UL << __RQ_LOCAL_OK) +#define RQ_LOCAL_ABORTED (1UL << __RQ_LOCAL_ABORTED) -#define RQ_LOCAL_MASK ((RQ_LOCAL_OK << 1)-1) /* 0x07 */ +#define RQ_LOCAL_MASK ((RQ_LOCAL_ABORTED << 1)-1) #define RQ_NET_PENDING (1UL << __RQ_NET_PENDING) #define RQ_NET_QUEUED (1UL << __RQ_NET_QUEUED) diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 4d3e6f6213ba..620c70ff2231 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -70,11 +70,29 @@ rwlock_t global_state_lock; void drbd_md_io_complete(struct bio *bio, int error) { struct drbd_md_io *md_io; + struct drbd_conf *mdev; md_io = (struct drbd_md_io *)bio->bi_private; + mdev = container_of(md_io, struct drbd_conf, md_io); + md_io->error = error; - complete(&md_io->event); + /* We grabbed an extra reference in _drbd_md_sync_page_io() to be able + * to timeout on the lower level device, and eventually detach from it. + * If this io completion runs after that timeout expired, this + * drbd_md_put_buffer() may allow us to finally try and re-attach. + * During normal operation, this only puts that extra reference + * down to 1 again. 
+ * Make sure we first drop the reference, and only then signal + * completion, or we may (in drbd_al_read_log()) cycle so fast into the + * next drbd_md_sync_page_io(), that we trigger the + * ASSERT(atomic_read(&mdev->md_io_in_use) == 1) there. + */ + drbd_md_put_buffer(mdev); + md_io->done = 1; + wake_up(&mdev->misc_wait); + bio_put(bio); + put_ldev(mdev); } /* reads on behalf of the partner, @@ -226,6 +244,7 @@ void drbd_endio_pri(struct bio *bio, int error) spin_lock_irqsave(&mdev->req_lock, flags); __req_mod(req, what, &m); spin_unlock_irqrestore(&mdev->req_lock, flags); + put_ldev(mdev); if (m.bio) complete_master_bio(mdev, &m); @@ -290,7 +309,7 @@ void drbd_csum_bio(struct drbd_conf *mdev, struct crypto_hash *tfm, struct bio * sg_init_table(&sg, 1); crypto_hash_init(&desc); - __bio_for_each_segment(bvec, bio, i, 0) { + bio_for_each_segment(bvec, bio, i) { sg_set_page(&sg, bvec->bv_page, bvec->bv_len, bvec->bv_offset); crypto_hash_update(&desc, &sg, sg.length); } @@ -728,7 +747,7 @@ int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel) } drbd_start_resync(mdev, C_SYNC_SOURCE); - clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->current_epoch->flags); + clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags); return 1; } @@ -1519,14 +1538,14 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side) } drbd_state_lock(mdev); - + write_lock_irq(&global_state_lock); if (!get_ldev_if_state(mdev, D_NEGOTIATING)) { + write_unlock_irq(&global_state_lock); drbd_state_unlock(mdev); return; } - write_lock_irq(&global_state_lock); - ns = mdev->state; + ns.i = mdev->state.i; ns.aftr_isp = !_drbd_may_sync_now(mdev); diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 744f078f4dd8..cce7df367b79 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -202,7 +202,6 @@ static int slow_floppy; #include <asm/dma.h> #include <asm/irq.h> -#include <asm/system.h> static int FLOPPY_IRQ = 6; static int FLOPPY_DMA = 2; @@ -552,7 +551,7 @@ 
static void floppy_ready(void); static void floppy_start(void); static void process_fd_request(void); static void recalibrate_floppy(void); -static void floppy_shutdown(unsigned long); +static void floppy_shutdown(struct work_struct *); static int floppy_request_regions(int); static void floppy_release_regions(int); @@ -589,6 +588,8 @@ static int buffer_max = -1; static struct floppy_fdc_state fdc_state[N_FDC]; static int fdc; /* current fdc */ +static struct workqueue_struct *floppy_wq; + static struct floppy_struct *_floppy = floppy_type; static unsigned char current_drive; static long current_count_sectors; @@ -630,16 +631,15 @@ static inline void set_debugt(void) { } static inline void debugt(const char *func, const char *msg) { } #endif /* DEBUGT */ -typedef void (*timeout_fn)(unsigned long); -static DEFINE_TIMER(fd_timeout, floppy_shutdown, 0, 0); +static DECLARE_DELAYED_WORK(fd_timeout, floppy_shutdown); static const char *timeout_message; static void is_alive(const char *func, const char *message) { /* this routine checks whether the floppy driver is "alive" */ if (test_bit(0, &fdc_busy) && command_status < 2 && - !timer_pending(&fd_timeout)) { + !delayed_work_pending(&fd_timeout)) { DPRINT("%s: timeout handler died. 
%s\n", func, message); } } @@ -667,15 +667,18 @@ static int output_log_pos; static void __reschedule_timeout(int drive, const char *message) { + unsigned long delay; + if (drive == current_reqD) drive = current_drive; - del_timer(&fd_timeout); + if (drive < 0 || drive >= N_DRIVE) { - fd_timeout.expires = jiffies + 20UL * HZ; + delay = 20UL * HZ; drive = 0; } else - fd_timeout.expires = jiffies + UDP->timeout; - add_timer(&fd_timeout); + delay = UDP->timeout; + + queue_delayed_work(floppy_wq, &fd_timeout, delay); if (UDP->flags & FD_DEBUG) DPRINT("reschedule timeout %s\n", message); timeout_message = message; @@ -873,7 +876,7 @@ static int lock_fdc(int drive, bool interruptible) command_status = FD_COMMAND_NONE; - __reschedule_timeout(drive, "lock fdc"); + reschedule_timeout(drive, "lock fdc"); set_fdc(drive); return 0; } @@ -881,23 +884,15 @@ static int lock_fdc(int drive, bool interruptible) /* unlocks the driver */ static void unlock_fdc(void) { - unsigned long flags; - - raw_cmd = NULL; if (!test_bit(0, &fdc_busy)) DPRINT("FDC access conflict!\n"); - if (do_floppy) - DPRINT("device interrupt still active at FDC release: %pf!\n", - do_floppy); + raw_cmd = NULL; command_status = FD_COMMAND_NONE; - spin_lock_irqsave(&floppy_lock, flags); - del_timer(&fd_timeout); + __cancel_delayed_work(&fd_timeout); + do_floppy = NULL; cont = NULL; clear_bit(0, &fdc_busy); - if (current_req || set_next_request()) - do_fd_request(current_req->q); - spin_unlock_irqrestore(&floppy_lock, flags); wake_up(&fdc_wait); } @@ -969,26 +964,24 @@ static DECLARE_WORK(floppy_work, NULL); static void schedule_bh(void (*handler)(void)) { + WARN_ON(work_pending(&floppy_work)); + PREPARE_WORK(&floppy_work, (work_func_t)handler); - schedule_work(&floppy_work); + queue_work(floppy_wq, &floppy_work); } -static DEFINE_TIMER(fd_timer, NULL, 0, 0); +static DECLARE_DELAYED_WORK(fd_timer, NULL); static void cancel_activity(void) { - unsigned long flags; - - spin_lock_irqsave(&floppy_lock, flags); do_floppy 
= NULL; - PREPARE_WORK(&floppy_work, (work_func_t)empty); - del_timer(&fd_timer); - spin_unlock_irqrestore(&floppy_lock, flags); + cancel_delayed_work_sync(&fd_timer); + cancel_work_sync(&floppy_work); } /* this function makes sure that the disk stays in the drive during the * transfer */ -static void fd_watchdog(void) +static void fd_watchdog(struct work_struct *arg) { debug_dcl(DP->flags, "calling disk change from watchdog\n"); @@ -998,21 +991,20 @@ static void fd_watchdog(void) cont->done(0); reset_fdc(); } else { - del_timer(&fd_timer); - fd_timer.function = (timeout_fn)fd_watchdog; - fd_timer.expires = jiffies + HZ / 10; - add_timer(&fd_timer); + cancel_delayed_work(&fd_timer); + PREPARE_DELAYED_WORK(&fd_timer, fd_watchdog); + queue_delayed_work(floppy_wq, &fd_timer, HZ / 10); } } static void main_command_interrupt(void) { - del_timer(&fd_timer); + cancel_delayed_work(&fd_timer); cont->interrupt(); } /* waits for a delay (spinup or select) to pass */ -static int fd_wait_for_completion(unsigned long delay, timeout_fn function) +static int fd_wait_for_completion(unsigned long expires, work_func_t function) { if (FDCS->reset) { reset_fdc(); /* do the reset during sleep to win time @@ -1021,47 +1013,15 @@ static int fd_wait_for_completion(unsigned long delay, timeout_fn function) return 1; } - if (time_before(jiffies, delay)) { - del_timer(&fd_timer); - fd_timer.function = function; - fd_timer.expires = delay; - add_timer(&fd_timer); + if (time_before(jiffies, expires)) { + cancel_delayed_work(&fd_timer); + PREPARE_DELAYED_WORK(&fd_timer, function); + queue_delayed_work(floppy_wq, &fd_timer, expires - jiffies); return 1; } return 0; } -static DEFINE_SPINLOCK(floppy_hlt_lock); -static int hlt_disabled; -static void floppy_disable_hlt(void) -{ - unsigned long flags; - - WARN_ONCE(1, "floppy_disable_hlt() scheduled for removal in 2012"); - spin_lock_irqsave(&floppy_hlt_lock, flags); - if (!hlt_disabled) { - hlt_disabled = 1; -#ifdef HAVE_DISABLE_HLT - disable_hlt(); 
-#endif - } - spin_unlock_irqrestore(&floppy_hlt_lock, flags); -} - -static void floppy_enable_hlt(void) -{ - unsigned long flags; - - spin_lock_irqsave(&floppy_hlt_lock, flags); - if (hlt_disabled) { - hlt_disabled = 0; -#ifdef HAVE_DISABLE_HLT - enable_hlt(); -#endif - } - spin_unlock_irqrestore(&floppy_hlt_lock, flags); -} - static void setup_DMA(void) { unsigned long f; @@ -1106,7 +1066,6 @@ static void setup_DMA(void) fd_enable_dma(); release_dma_lock(f); #endif - floppy_disable_hlt(); } static void show_floppy(void); @@ -1375,7 +1334,7 @@ static int fdc_dtr(void) */ FDCS->dtr = raw_cmd->rate & 3; return fd_wait_for_completion(jiffies + 2UL * HZ / 100, - (timeout_fn)floppy_ready); + (work_func_t)floppy_ready); } /* fdc_dtr */ static void tell_sector(void) @@ -1480,7 +1439,7 @@ static void setup_rw_floppy(void) int flags; int dflags; unsigned long ready_date; - timeout_fn function; + work_func_t function; flags = raw_cmd->flags; if (flags & (FD_RAW_READ | FD_RAW_WRITE)) @@ -1494,9 +1453,9 @@ static void setup_rw_floppy(void) */ if (time_after(ready_date, jiffies + DP->select_delay)) { ready_date -= DP->select_delay; - function = (timeout_fn)floppy_start; + function = (work_func_t)floppy_start; } else - function = (timeout_fn)setup_rw_floppy; + function = (work_func_t)setup_rw_floppy; /* wait until the floppy is spinning fast enough */ if (fd_wait_for_completion(ready_date, function)) @@ -1526,7 +1485,7 @@ static void setup_rw_floppy(void) inr = result(); cont->interrupt(); } else if (flags & FD_RAW_NEED_DISK) - fd_watchdog(); + fd_watchdog(NULL); } static int blind_seek; @@ -1708,7 +1667,6 @@ irqreturn_t floppy_interrupt(int irq, void *dev_id) fd_disable_dma(); release_dma_lock(f); - floppy_enable_hlt(); do_floppy = NULL; if (fdc >= N_FDC || FDCS->address == -1) { /* we don't even know which FDC is the culprit */ @@ -1836,20 +1794,22 @@ static void show_floppy(void) pr_info("do_floppy=%pf\n", do_floppy); if (work_pending(&floppy_work)) 
pr_info("floppy_work.func=%pf\n", floppy_work.func); - if (timer_pending(&fd_timer)) - pr_info("fd_timer.function=%pf\n", fd_timer.function); - if (timer_pending(&fd_timeout)) { - pr_info("timer_function=%pf\n", fd_timeout.function); - pr_info("expires=%lu\n", fd_timeout.expires - jiffies); - pr_info("now=%lu\n", jiffies); - } + if (delayed_work_pending(&fd_timer)) + pr_info("delayed work.function=%p expires=%ld\n", + fd_timer.work.func, + fd_timer.timer.expires - jiffies); + if (delayed_work_pending(&fd_timeout)) + pr_info("timer_function=%p expires=%ld\n", + fd_timeout.work.func, + fd_timeout.timer.expires - jiffies); + pr_info("cont=%p\n", cont); pr_info("current_req=%p\n", current_req); pr_info("command_status=%d\n", command_status); pr_info("\n"); } -static void floppy_shutdown(unsigned long data) +static void floppy_shutdown(struct work_struct *arg) { unsigned long flags; @@ -1857,8 +1817,6 @@ static void floppy_shutdown(unsigned long data) show_floppy(); cancel_activity(); - floppy_enable_hlt(); - flags = claim_dma_lock(); fd_disable_dma(); release_dma_lock(flags); @@ -1904,7 +1862,7 @@ static int start_motor(void (*function)(void)) /* wait_for_completion also schedules reset if needed. 
*/ return fd_wait_for_completion(DRS->select_date + DP->select_delay, - (timeout_fn)function); + (work_func_t)function); } static void floppy_ready(void) @@ -2857,7 +2815,6 @@ do_request: spin_lock_irq(&floppy_lock); pending = set_next_request(); spin_unlock_irq(&floppy_lock); - if (!pending) { do_floppy = NULL; unlock_fdc(); @@ -2934,13 +2891,15 @@ static void do_fd_request(struct request_queue *q) current_req->cmd_flags)) return; - if (test_bit(0, &fdc_busy)) { + if (test_and_set_bit(0, &fdc_busy)) { /* fdc busy, this new request will be treated when the current one is done */ is_alive(__func__, "old request running"); return; } - lock_fdc(MAXTIMEOUT, false); + command_status = FD_COMMAND_NONE; + __reschedule_timeout(MAXTIMEOUT, "fd_request"); + set_fdc(0); process_fd_request(); is_alive(__func__, ""); } @@ -3648,9 +3607,7 @@ static int floppy_release(struct gendisk *disk, fmode_t mode) mutex_lock(&floppy_mutex); mutex_lock(&open_lock); - if (UDRS->fd_ref < 0) - UDRS->fd_ref = 0; - else if (!UDRS->fd_ref--) { + if (!UDRS->fd_ref--) { DPRINT("floppy_release with fd_ref == 0"); UDRS->fd_ref = 0; } @@ -3686,13 +3643,7 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) set_bit(FD_VERIFY_BIT, &UDRS->flags); } - if (UDRS->fd_ref == -1 || (UDRS->fd_ref && (mode & FMODE_EXCL))) - goto out2; - - if (mode & FMODE_EXCL) - UDRS->fd_ref = -1; - else - UDRS->fd_ref++; + UDRS->fd_ref++; opened_bdev[drive] = bdev; @@ -3755,10 +3706,8 @@ static int floppy_open(struct block_device *bdev, fmode_t mode) mutex_unlock(&floppy_mutex); return 0; out: - if (UDRS->fd_ref < 0) - UDRS->fd_ref = 0; - else - UDRS->fd_ref--; + UDRS->fd_ref--; + if (!UDRS->fd_ref) opened_bdev[drive] = NULL; out2: @@ -4195,10 +4144,16 @@ static int __init floppy_init(void) goto out_put_disk; } + floppy_wq = alloc_ordered_workqueue("floppy", 0); + if (!floppy_wq) { + err = -ENOMEM; + goto out_put_disk; + } + disks[dr]->queue = blk_init_queue(do_fd_request, &floppy_lock); if (!disks[dr]->queue) { 
err = -ENOMEM; - goto out_put_disk; + goto out_destroy_workq; } blk_queue_max_hw_sectors(disks[dr]->queue, 64); @@ -4249,7 +4204,7 @@ static int __init floppy_init(void) use_virtual_dma = can_use_virtual_dma & 1; fdc_state[0].address = FDC1; if (fdc_state[0].address == -1) { - del_timer_sync(&fd_timeout); + cancel_delayed_work(&fd_timeout); err = -ENODEV; goto out_unreg_region; } @@ -4260,7 +4215,7 @@ static int __init floppy_init(void) fdc = 0; /* reset fdc in case of unexpected interrupt */ err = floppy_grab_irq_and_dma(); if (err) { - del_timer_sync(&fd_timeout); + cancel_delayed_work(&fd_timeout); err = -EBUSY; goto out_unreg_region; } @@ -4317,13 +4272,13 @@ static int __init floppy_init(void) user_reset_fdc(-1, FD_RESET_ALWAYS, false); } fdc = 0; - del_timer_sync(&fd_timeout); + cancel_delayed_work(&fd_timeout); current_drive = 0; initialized = true; if (have_no_fdc) { DPRINT("no floppy controllers found\n"); err = have_no_fdc; - goto out_flush_work; + goto out_release_dma; } for (drive = 0; drive < N_DRIVE; drive++) { @@ -4338,7 +4293,7 @@ static int __init floppy_init(void) err = platform_device_register(&floppy_device[drive]); if (err) - goto out_flush_work; + goto out_release_dma; err = device_create_file(&floppy_device[drive].dev, &dev_attr_cmos); @@ -4356,13 +4311,14 @@ static int __init floppy_init(void) out_unreg_platform_dev: platform_device_unregister(&floppy_device[drive]); -out_flush_work: - flush_work_sync(&floppy_work); +out_release_dma: if (atomic_read(&usage_count)) floppy_release_irq_and_dma(); out_unreg_region: blk_unregister_region(MKDEV(FLOPPY_MAJOR, 0), 256); platform_driver_unregister(&floppy_driver); +out_destroy_workq: + destroy_workqueue(floppy_wq); out_unreg_blkdev: unregister_blkdev(FLOPPY_MAJOR, "fd"); out_put_disk: @@ -4433,7 +4389,7 @@ static int floppy_grab_irq_and_dma(void) * We might have scheduled a free_irq(), wait it to * drain first: */ - flush_work_sync(&floppy_work); + flush_workqueue(floppy_wq); if (fd_request_irq()) { 
DPRINT("Unable to grab IRQ%d for the floppy driver\n", @@ -4509,7 +4465,6 @@ static void floppy_release_irq_and_dma(void) #if N_FDC > 1 set_dor(1, ~8, 0); #endif - floppy_enable_hlt(); if (floppy_track_buffer && max_buffer_sectors) { tmpsize = max_buffer_sectors * 1024; @@ -4525,9 +4480,9 @@ static void floppy_release_irq_and_dma(void) pr_info("motor off timer %d still active\n", drive); #endif - if (timer_pending(&fd_timeout)) + if (delayed_work_pending(&fd_timeout)) pr_info("floppy timer still active:%s\n", timeout_message); - if (timer_pending(&fd_timer)) + if (delayed_work_pending(&fd_timer)) pr_info("auxiliary floppy timer still active\n"); if (work_pending(&floppy_work)) pr_info("work still pending\n"); @@ -4597,8 +4552,9 @@ static void __exit floppy_module_exit(void) put_disk(disks[drive]); } - del_timer_sync(&fd_timeout); - del_timer_sync(&fd_timer); + cancel_delayed_work_sync(&fd_timeout); + cancel_delayed_work_sync(&fd_timer); + destroy_workqueue(floppy_wq); if (atomic_read(&usage_count)) floppy_release_irq_and_dma(); diff --git a/drivers/block/hd.c b/drivers/block/hd.c index b52c9ca146fc..bf397bf108b7 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -44,7 +44,6 @@ #define HD_IRQ 14 #define REALLY_SLOW_IO -#include <asm/system.h> #include <asm/io.h> #include <asm/uaccess.h> diff --git a/drivers/block/loop.c b/drivers/block/loop.c index cd504353b278..bbca966f8f66 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -93,16 +93,16 @@ static int transfer_none(struct loop_device *lo, int cmd, struct page *loop_page, unsigned loop_off, int size, sector_t real_block) { - char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off; - char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off; + char *raw_buf = kmap_atomic(raw_page) + raw_off; + char *loop_buf = kmap_atomic(loop_page) + loop_off; if (cmd == READ) memcpy(loop_buf, raw_buf, size); else memcpy(raw_buf, loop_buf, size); - kunmap_atomic(loop_buf, KM_USER1); - 
kunmap_atomic(raw_buf, KM_USER0); + kunmap_atomic(loop_buf); + kunmap_atomic(raw_buf); cond_resched(); return 0; } @@ -112,8 +112,8 @@ static int transfer_xor(struct loop_device *lo, int cmd, struct page *loop_page, unsigned loop_off, int size, sector_t real_block) { - char *raw_buf = kmap_atomic(raw_page, KM_USER0) + raw_off; - char *loop_buf = kmap_atomic(loop_page, KM_USER1) + loop_off; + char *raw_buf = kmap_atomic(raw_page) + raw_off; + char *loop_buf = kmap_atomic(loop_page) + loop_off; char *in, *out, *key; int i, keysize; @@ -130,8 +130,8 @@ static int transfer_xor(struct loop_device *lo, int cmd, for (i = 0; i < size; i++) *out++ = *in++ ^ key[(i & 511) % keysize]; - kunmap_atomic(loop_buf, KM_USER1); - kunmap_atomic(raw_buf, KM_USER0); + kunmap_atomic(loop_buf); + kunmap_atomic(raw_buf); cond_resched(); return 0; } diff --git a/drivers/block/mtip32xx/Kconfig b/drivers/block/mtip32xx/Kconfig index b5dd14e072f2..0ba837fc62a8 100644 --- a/drivers/block/mtip32xx/Kconfig +++ b/drivers/block/mtip32xx/Kconfig @@ -4,6 +4,6 @@ config BLK_DEV_PCIESSD_MTIP32XX tristate "Block Device Driver for Micron PCIe SSDs" - depends on HOTPLUG_PCI_PCIE + depends on PCI help This enables the block driver for Micron PCIe SSDs. 
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 8eb81c96608f..264bc77dcb91 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -36,6 +36,7 @@ #include <linux/idr.h> #include <linux/kthread.h> #include <../drivers/ata/ahci.h> +#include <linux/export.h> #include "mtip32xx.h" #define HW_CMD_SLOT_SZ (MTIP_MAX_COMMAND_SLOTS * 32) @@ -44,6 +45,7 @@ #define HW_PORT_PRIV_DMA_SZ \ (HW_CMD_SLOT_SZ + HW_CMD_TBL_AR_SZ + AHCI_RX_FIS_SZ) +#define HOST_CAP_NZDMA (1 << 19) #define HOST_HSORG 0xFC #define HSORG_DISABLE_SLOTGRP_INTR (1<<24) #define HSORG_DISABLE_SLOTGRP_PXIS (1<<16) @@ -139,6 +141,12 @@ static void mtip_command_cleanup(struct driver_data *dd) int group = 0, commandslot = 0, commandindex = 0; struct mtip_cmd *command; struct mtip_port *port = dd->port; + static int in_progress; + + if (in_progress) + return; + + in_progress = 1; for (group = 0; group < 4; group++) { for (commandslot = 0; commandslot < 32; commandslot++) { @@ -165,7 +173,8 @@ static void mtip_command_cleanup(struct driver_data *dd) up(&port->cmd_slot); - atomic_set(&dd->drv_cleanup_done, true); + set_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag); + in_progress = 0; } /* @@ -262,6 +271,9 @@ static int hba_reset_nosleep(struct driver_data *dd) && time_before(jiffies, timeout)) mdelay(1); + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) + return -1; + if (readl(dd->mmio + HOST_CTL) & HOST_RESET) return -1; @@ -282,18 +294,20 @@ static int hba_reset_nosleep(struct driver_data *dd) */ static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag) { - unsigned long flags = 0; - atomic_set(&port->commands[tag].active, 1); - spin_lock_irqsave(&port->cmd_issue_lock, flags); + spin_lock(&port->cmd_issue_lock); writel((1 << MTIP_TAG_BIT(tag)), port->s_active[MTIP_TAG_INDEX(tag)]); writel((1 << MTIP_TAG_BIT(tag)), port->cmd_issue[MTIP_TAG_INDEX(tag)]); - spin_unlock_irqrestore(&port->cmd_issue_lock, flags); + 
spin_unlock(&port->cmd_issue_lock); + + /* Set the command's timeout value.*/ + port->commands[tag].comp_time = jiffies + msecs_to_jiffies( + MTIP_NCQ_COMMAND_TIMEOUT_MS); } /* @@ -422,6 +436,10 @@ static void mtip_init_port(struct mtip_port *port) /* Clear any pending interrupts for this port */ writel(readl(port->mmio + PORT_IRQ_STAT), port->mmio + PORT_IRQ_STAT); + /* Clear any pending interrupts on the HBA. */ + writel(readl(port->dd->mmio + HOST_IRQ_STAT), + port->dd->mmio + HOST_IRQ_STAT); + /* Enable port interrupts */ writel(DEF_PORT_IRQ, port->mmio + PORT_IRQ_MASK); } @@ -447,6 +465,9 @@ static void mtip_restart_port(struct mtip_port *port) && time_before(jiffies, timeout)) ; + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) + return; + /* * Chip quirk: escalate to hba reset if * PxCMD.CR not clear after 500 ms @@ -475,6 +496,9 @@ static void mtip_restart_port(struct mtip_port *port) while (time_before(jiffies, timeout)) ; + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) + return; + /* Clear PxSCTL.DET */ writel(readl(port->mmio + PORT_SCR_CTL) & ~1, port->mmio + PORT_SCR_CTL); @@ -486,15 +510,35 @@ static void mtip_restart_port(struct mtip_port *port) && time_before(jiffies, timeout)) ; + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) + return; + if ((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0) dev_warn(&port->dd->pdev->dev, "COM reset failed\n"); - /* Clear SError, the PxSERR.DIAG.x should be set so clear it */ - writel(readl(port->mmio + PORT_SCR_ERR), port->mmio + PORT_SCR_ERR); + mtip_init_port(port); + mtip_start_port(port); - /* Enable the DMA engine */ - mtip_enable_engine(port, 1); +} + +/* + * Helper function for tag logging + */ +static void print_tags(struct driver_data *dd, + char *msg, + unsigned long *tagbits, + int cnt) +{ + unsigned char tagmap[128]; + int group, tagmap_len = 0; + + memset(tagmap, 0, sizeof(tagmap)); + for (group = SLOTBITS_IN_LONGS; group > 0; group--) + tagmap_len = 
sprintf(tagmap + tagmap_len, "%016lX ", + tagbits[group-1]); + dev_warn(&dd->pdev->dev, + "%d command(s) %s: tagmap [%s]", cnt, msg, tagmap); } /* @@ -514,15 +558,18 @@ static void mtip_timeout_function(unsigned long int data) int tag, cmdto_cnt = 0; unsigned int bit, group; unsigned int num_command_slots = port->dd->slot_groups * 32; + unsigned long to, tagaccum[SLOTBITS_IN_LONGS]; if (unlikely(!port)) return; - if (atomic_read(&port->dd->resumeflag) == true) { + if (test_bit(MTIP_DDF_RESUME_BIT, &port->dd->dd_flag)) { mod_timer(&port->cmd_timer, jiffies + msecs_to_jiffies(30000)); return; } + /* clear the tag accumulator */ + memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); for (tag = 0; tag < num_command_slots; tag++) { /* @@ -540,12 +587,10 @@ static void mtip_timeout_function(unsigned long int data) command = &port->commands[tag]; fis = (struct host_to_dev_fis *) command->command; - dev_warn(&port->dd->pdev->dev, - "Timeout for command tag %d\n", tag); - + set_bit(tag, tagaccum); cmdto_cnt++; if (cmdto_cnt == 1) - set_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags); + set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); /* * Clear the completed bit. 
This should prevent @@ -578,15 +623,29 @@ static void mtip_timeout_function(unsigned long int data) } } - if (cmdto_cnt) { - dev_warn(&port->dd->pdev->dev, - "%d commands timed out: restarting port", - cmdto_cnt); + if (cmdto_cnt && !test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { + print_tags(port->dd, "timed out", tagaccum, cmdto_cnt); + mtip_restart_port(port); - clear_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags); + clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); wake_up_interruptible(&port->svc_wait); } + if (port->ic_pause_timer) { + to = port->ic_pause_timer + msecs_to_jiffies(1000); + if (time_after(jiffies, to)) { + if (!test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) { + port->ic_pause_timer = 0; + clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); + clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags); + clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); + wake_up_interruptible(&port->svc_wait); + } + + + } + } + /* Restart the timer */ mod_timer(&port->cmd_timer, jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); @@ -681,23 +740,18 @@ static void mtip_completion(struct mtip_port *port, complete(waiting); } -/* - * Helper function for tag logging - */ -static void print_tags(struct driver_data *dd, - char *msg, - unsigned long *tagbits) +static void mtip_null_completion(struct mtip_port *port, + int tag, + void *data, + int status) { - unsigned int tag, count = 0; - - for (tag = 0; tag < (dd->slot_groups) * 32; tag++) { - if (test_bit(tag, tagbits)) - count++; - } - if (count) - dev_info(&dd->pdev->dev, "%s [%i tags]\n", msg, count); + return; } +static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer, + dma_addr_t buffer_dma, unsigned int sectors); +static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id, + struct smart_attr *attrib); /* * Handle an error. 
* @@ -708,12 +762,16 @@ static void print_tags(struct driver_data *dd, */ static void mtip_handle_tfe(struct driver_data *dd) { - int group, tag, bit, reissue; + int group, tag, bit, reissue, rv; struct mtip_port *port; - struct mtip_cmd *command; + struct mtip_cmd *cmd; u32 completed; struct host_to_dev_fis *fis; unsigned long tagaccum[SLOTBITS_IN_LONGS]; + unsigned int cmd_cnt = 0; + unsigned char *buf; + char *fail_reason = NULL; + int fail_all_ncq_write = 0, fail_all_ncq_cmds = 0; dev_warn(&dd->pdev->dev, "Taskfile error\n"); @@ -721,9 +779,23 @@ static void mtip_handle_tfe(struct driver_data *dd) /* Stop the timer to prevent command timeouts. */ del_timer(&port->cmd_timer); + set_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); - /* Set eh_active */ - set_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags); + if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && + test_bit(MTIP_TAG_INTERNAL, port->allocated)) { + cmd = &port->commands[MTIP_TAG_INTERNAL]; + dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n"); + + atomic_inc(&cmd->active); /* active > 1 indicates error */ + if (cmd->comp_data && cmd->comp_func) { + cmd->comp_func(port, MTIP_TAG_INTERNAL, + cmd->comp_data, PORT_IRQ_TF_ERR); + } + goto handle_tfe_exit; + } + + /* clear the tag accumulator */ + memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); /* Loop through all the groups */ for (group = 0; group < dd->slot_groups; group++) { @@ -732,9 +804,6 @@ static void mtip_handle_tfe(struct driver_data *dd) /* clear completed status register in the hardware.*/ writel(completed, port->completed[group]); - /* clear the tag accumulator */ - memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); - /* Process successfully completed commands */ for (bit = 0; bit < 32 && completed; bit++) { if (!(completed & (1<<bit))) @@ -745,13 +814,14 @@ static void mtip_handle_tfe(struct driver_data *dd) if (tag == MTIP_TAG_INTERNAL) continue; - command = &port->commands[tag]; - if (likely(command->comp_func)) { + cmd = 
&port->commands[tag]; + if (likely(cmd->comp_func)) { set_bit(tag, tagaccum); - atomic_set(&port->commands[tag].active, 0); - command->comp_func(port, + cmd_cnt++; + atomic_set(&cmd->active, 0); + cmd->comp_func(port, tag, - command->comp_data, + cmd->comp_data, 0); } else { dev_err(&port->dd->pdev->dev, @@ -765,12 +835,45 @@ static void mtip_handle_tfe(struct driver_data *dd) } } } - print_tags(dd, "TFE tags completed:", tagaccum); + + print_tags(dd, "completed (TFE)", tagaccum, cmd_cnt); /* Restart the port */ mdelay(20); mtip_restart_port(port); + /* Trying to determine the cause of the error */ + rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ, + dd->port->log_buf, + dd->port->log_buf_dma, 1); + if (rv) { + dev_warn(&dd->pdev->dev, + "Error in READ LOG EXT (10h) command\n"); + /* non-critical error, don't fail the load */ + } else { + buf = (unsigned char *)dd->port->log_buf; + if (buf[259] & 0x1) { + dev_info(&dd->pdev->dev, + "Write protect bit is set.\n"); + set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag); + fail_all_ncq_write = 1; + fail_reason = "write protect"; + } + if (buf[288] == 0xF7) { + dev_info(&dd->pdev->dev, + "Exceeded Tmax, drive in thermal shutdown.\n"); + set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag); + fail_all_ncq_cmds = 1; + fail_reason = "thermal shutdown"; + } + if (buf[288] == 0xBF) { + dev_info(&dd->pdev->dev, + "Drive indicates rebuild has failed.\n"); + fail_all_ncq_cmds = 1; + fail_reason = "rebuild failed"; + } + } + /* clear the tag accumulator */ memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long)); @@ -779,32 +882,47 @@ static void mtip_handle_tfe(struct driver_data *dd) for (bit = 0; bit < 32; bit++) { reissue = 1; tag = (group << 5) + bit; + cmd = &port->commands[tag]; /* If the active bit is set re-issue the command */ - if (atomic_read(&port->commands[tag].active) == 0) + if (atomic_read(&cmd->active) == 0) continue; - fis = (struct host_to_dev_fis *) - port->commands[tag].command; + fis = (struct host_to_dev_fis 
*)cmd->command; /* Should re-issue? */ if (tag == MTIP_TAG_INTERNAL || fis->command == ATA_CMD_SET_FEATURES) reissue = 0; + else { + if (fail_all_ncq_cmds || + (fail_all_ncq_write && + fis->command == ATA_CMD_FPDMA_WRITE)) { + dev_warn(&dd->pdev->dev, + " Fail: %s w/tag %d [%s].\n", + fis->command == ATA_CMD_FPDMA_WRITE ? + "write" : "read", + tag, + fail_reason != NULL ? + fail_reason : "unknown"); + atomic_set(&cmd->active, 0); + if (cmd->comp_func) { + cmd->comp_func(port, tag, + cmd->comp_data, + -ENODATA); + } + continue; + } + } /* * First check if this command has * exceeded its retries. */ - if (reissue && - (port->commands[tag].retries-- > 0)) { + if (reissue && (cmd->retries-- > 0)) { set_bit(tag, tagaccum); - /* Update the timeout value. */ - port->commands[tag].comp_time = - jiffies + msecs_to_jiffies( - MTIP_NCQ_COMMAND_TIMEOUT_MS); /* Re-issue the command. */ mtip_issue_ncq_command(port, tag); @@ -814,13 +932,13 @@ static void mtip_handle_tfe(struct driver_data *dd) /* Retire a command that will not be reissued */ dev_warn(&port->dd->pdev->dev, "retiring tag %d\n", tag); - atomic_set(&port->commands[tag].active, 0); + atomic_set(&cmd->active, 0); - if (port->commands[tag].comp_func) - port->commands[tag].comp_func( + if (cmd->comp_func) + cmd->comp_func( port, tag, - port->commands[tag].comp_data, + cmd->comp_data, PORT_IRQ_TF_ERR); else dev_warn(&port->dd->pdev->dev, @@ -828,10 +946,11 @@ static void mtip_handle_tfe(struct driver_data *dd) tag); } } - print_tags(dd, "TFE tags reissued:", tagaccum); + print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt); +handle_tfe_exit: /* clear eh_active */ - clear_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags); + clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags); wake_up_interruptible(&port->svc_wait); mod_timer(&port->cmd_timer, @@ -851,6 +970,8 @@ static inline void mtip_process_sdbf(struct driver_data *dd) /* walk all bits in all slot groups */ for (group = 0; group < dd->slot_groups; group++) { completed = 
readl(port->completed[group]); + if (!completed) + continue; /* clear completed status register in the hardware.*/ writel(completed, port->completed[group]); @@ -899,7 +1020,7 @@ static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) struct mtip_port *port = dd->port; struct mtip_cmd *cmd = &port->commands[MTIP_TAG_INTERNAL]; - if (test_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags) && + if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && (cmd != NULL) && !(readl(port->cmd_issue[MTIP_TAG_INTERNAL]) & (1 << MTIP_TAG_INTERNAL))) { if (cmd->comp_func) { @@ -911,8 +1032,6 @@ static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat) } } - dev_warn(&dd->pdev->dev, "IRQ status 0x%x ignored.\n", port_stat); - return; } @@ -968,6 +1087,9 @@ static inline irqreturn_t mtip_handle_irq(struct driver_data *data) /* don't proceed further */ return IRQ_HANDLED; } + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &dd->dd_flag)) + return rv; mtip_process_errors(dd, port_stat & PORT_IRQ_ERR); } @@ -1015,6 +1137,39 @@ static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag) port->cmd_issue[MTIP_TAG_INDEX(tag)]); } +static bool mtip_pause_ncq(struct mtip_port *port, + struct host_to_dev_fis *fis) +{ + struct host_to_dev_fis *reply; + unsigned long task_file_data; + + reply = port->rxfis + RX_FIS_D2H_REG; + task_file_data = readl(port->mmio+PORT_TFDATA); + + if ((task_file_data & 1) || (fis->command == ATA_CMD_SEC_ERASE_UNIT)) + return false; + + if (fis->command == ATA_CMD_SEC_ERASE_PREP) { + set_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); + port->ic_pause_timer = jiffies; + return true; + } else if ((fis->command == ATA_CMD_DOWNLOAD_MICRO) && + (fis->features == 0x03)) { + set_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags); + port->ic_pause_timer = jiffies; + return true; + } else if ((fis->command == ATA_CMD_SEC_ERASE_UNIT) || + ((fis->command == 0xFC) && + (fis->features == 0x27 || fis->features == 0x72 || + fis->features == 0x62 || 
fis->features == 0x26))) { + /* Com reset after secure erase or lowlevel format */ + mtip_restart_port(port); + return false; + } + + return false; +} + /* * Wait for port to quiesce * @@ -1033,11 +1188,13 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) to = jiffies + msecs_to_jiffies(timeout); do { - if (test_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags) && - test_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags)) { + if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) && + test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { msleep(20); continue; /* svc thd is actively issuing commands */ } + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) + return -EFAULT; /* * Ignore s_active bit 0 of array element 0. * This bit will always be set @@ -1074,7 +1231,7 @@ static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout) * -EAGAIN Time out waiting for command to complete. */ static int mtip_exec_internal_command(struct mtip_port *port, - void *fis, + struct host_to_dev_fis *fis, int fis_len, dma_addr_t buffer, int buf_len, @@ -1084,8 +1241,9 @@ static int mtip_exec_internal_command(struct mtip_port *port, { struct mtip_cmd_sg *command_sg; DECLARE_COMPLETION_ONSTACK(wait); - int rv = 0; + int rv = 0, ready2go = 1; struct mtip_cmd *int_cmd = &port->commands[MTIP_TAG_INTERNAL]; + unsigned long to; /* Make sure the buffer is 8 byte aligned. This is asic specific. 
*/ if (buffer & 0x00000007) { @@ -1094,23 +1252,38 @@ static int mtip_exec_internal_command(struct mtip_port *port, return -EFAULT; } - /* Only one internal command should be running at a time */ - if (test_and_set_bit(MTIP_TAG_INTERNAL, port->allocated)) { + to = jiffies + msecs_to_jiffies(timeout); + do { + ready2go = !test_and_set_bit(MTIP_TAG_INTERNAL, + port->allocated); + if (ready2go) + break; + mdelay(100); + } while (time_before(jiffies, to)); + if (!ready2go) { dev_warn(&port->dd->pdev->dev, - "Internal command already active\n"); + "Internal cmd active. new cmd [%02X]\n", fis->command); return -EBUSY; } - set_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags); + set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); + port->ic_pause_timer = 0; + + if (fis->command == ATA_CMD_SEC_ERASE_UNIT) + clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags); + else if (fis->command == ATA_CMD_DOWNLOAD_MICRO) + clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags); if (atomic == GFP_KERNEL) { - /* wait for io to complete if non atomic */ - if (mtip_quiesce_io(port, 5000) < 0) { - dev_warn(&port->dd->pdev->dev, - "Failed to quiesce IO\n"); - release_slot(port, MTIP_TAG_INTERNAL); - clear_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags); - wake_up_interruptible(&port->svc_wait); - return -EBUSY; + if (fis->command != ATA_CMD_STANDBYNOW1) { + /* wait for io to complete if non atomic */ + if (mtip_quiesce_io(port, 5000) < 0) { + dev_warn(&port->dd->pdev->dev, + "Failed to quiesce IO\n"); + release_slot(port, MTIP_TAG_INTERNAL); + clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); + wake_up_interruptible(&port->svc_wait); + return -EBUSY; + } } /* Set the completion function and data for the command. 
*/ @@ -1120,7 +1293,7 @@ static int mtip_exec_internal_command(struct mtip_port *port, } else { /* Clear completion - we're going to poll */ int_cmd->comp_data = NULL; - int_cmd->comp_func = NULL; + int_cmd->comp_func = mtip_null_completion; } /* Copy the command to the command table */ @@ -1159,38 +1332,60 @@ static int mtip_exec_internal_command(struct mtip_port *port, "Internal command did not complete [%d] " "within timeout of %lu ms\n", atomic, timeout); + if (mtip_check_surprise_removal(port->dd->pdev) || + test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &port->dd->dd_flag)) { + rv = -ENXIO; + goto exec_ic_exit; + } rv = -EAGAIN; } - - if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) - & (1 << MTIP_TAG_INTERNAL)) { - dev_warn(&port->dd->pdev->dev, - "Retiring internal command but CI is 1.\n"); - } - } else { /* Spin for <timeout> checking if command still outstanding */ timeout = jiffies + msecs_to_jiffies(timeout); + while ((readl(port->cmd_issue[MTIP_TAG_INTERNAL]) + & (1 << MTIP_TAG_INTERNAL)) + && time_before(jiffies, timeout)) { + if (mtip_check_surprise_removal(port->dd->pdev)) { + rv = -ENXIO; + goto exec_ic_exit; + } + if ((fis->command != ATA_CMD_STANDBYNOW1) && + test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &port->dd->dd_flag)) { + rv = -ENXIO; + goto exec_ic_exit; + } + if (readl(port->mmio + PORT_IRQ_STAT) & PORT_IRQ_ERR) { + atomic_inc(&int_cmd->active); /* error */ + break; + } + } + } - while ((readl( - port->cmd_issue[MTIP_TAG_INTERNAL]) - & (1 << MTIP_TAG_INTERNAL)) - && time_before(jiffies, timeout)) - ; - - if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) + if (atomic_read(&int_cmd->active) > 1) { + dev_err(&port->dd->pdev->dev, + "Internal command [%02X] failed\n", fis->command); + rv = -EIO; + } + if (readl(port->cmd_issue[MTIP_TAG_INTERNAL]) & (1 << MTIP_TAG_INTERNAL)) { - dev_err(&port->dd->pdev->dev, - "Internal command did not complete [%d]\n", - atomic); + rv = -ENXIO; + if (!test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &port->dd->dd_flag)) { + 
mtip_restart_port(port); rv = -EAGAIN; } } - +exec_ic_exit: /* Clear the allocated and active bits for the internal command. */ atomic_set(&int_cmd->active, 0); release_slot(port, MTIP_TAG_INTERNAL); - clear_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags); + if (rv >= 0 && mtip_pause_ncq(port, fis)) { + /* NCQ paused */ + return rv; + } + clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags); wake_up_interruptible(&port->svc_wait); return rv; @@ -1240,6 +1435,9 @@ static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer) int rv = 0; struct host_to_dev_fis fis; + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag)) + return -EFAULT; + /* Build the FIS. */ memset(&fis, 0, sizeof(struct host_to_dev_fis)); fis.type = 0x27; @@ -1313,6 +1511,7 @@ static int mtip_standby_immediate(struct mtip_port *port) { int rv; struct host_to_dev_fis fis; + unsigned long start; /* Build the FIS. */ memset(&fis, 0, sizeof(struct host_to_dev_fis)); @@ -1320,15 +1519,150 @@ static int mtip_standby_immediate(struct mtip_port *port) fis.opts = 1 << 7; fis.command = ATA_CMD_STANDBYNOW1; - /* Execute the command. Use a 15-second timeout for large drives. */ + start = jiffies; rv = mtip_exec_internal_command(port, &fis, 5, 0, 0, 0, - GFP_KERNEL, + GFP_ATOMIC, 15000); + dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n", + jiffies_to_msecs(jiffies - start)); + if (rv) + dev_warn(&port->dd->pdev->dev, + "STANDBY IMMEDIATE command failed.\n"); + + return rv; +} + +/* + * Issue a READ LOG EXT command to the device. + * + * @port pointer to the port structure. 
+ * @page page number to fetch + * @buffer pointer to buffer + * @buffer_dma dma address corresponding to @buffer + * @sectors page length to fetch, in sectors + * + * return value + * @rv return value from mtip_exec_internal_command() + */ +static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer, + dma_addr_t buffer_dma, unsigned int sectors) +{ + struct host_to_dev_fis fis; + + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = ATA_CMD_READ_LOG_EXT; + fis.sect_count = sectors & 0xFF; + fis.sect_cnt_ex = (sectors >> 8) & 0xFF; + fis.lba_low = page; + fis.lba_mid = 0; + fis.device = ATA_DEVICE_OBS; + + memset(buffer, 0, sectors * ATA_SECT_SIZE); + + return mtip_exec_internal_command(port, + &fis, + 5, + buffer_dma, + sectors * ATA_SECT_SIZE, + 0, + GFP_ATOMIC, + MTIP_INTERNAL_COMMAND_TIMEOUT_MS); +} + +/* + * Issue a SMART READ DATA command to the device. + * + * @port pointer to the port structure. + * @buffer pointer to buffer + * @buffer_dma dma address corresponding to @buffer + * + * return value + * @rv return value from mtip_exec_internal_command() + */ +static int mtip_get_smart_data(struct mtip_port *port, u8 *buffer, + dma_addr_t buffer_dma) +{ + struct host_to_dev_fis fis; + + memset(&fis, 0, sizeof(struct host_to_dev_fis)); + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = ATA_CMD_SMART; + fis.features = 0xD0; + fis.sect_count = 1; + fis.lba_mid = 0x4F; + fis.lba_hi = 0xC2; + fis.device = ATA_DEVICE_OBS; + + return mtip_exec_internal_command(port, + &fis, + 5, + buffer_dma, + ATA_SECT_SIZE, + 0, + GFP_ATOMIC, + 15000); +} + +/* + * Get the value of a smart attribute + * + * @port pointer to the port structure + * @id attribute number + * @attrib pointer to return attrib information corresponding to @id + * + * return value + * -EINVAL NULL buffer passed or unsupported attribute @id. 
+ * -EPERM Identify data not valid, SMART not supported or not enabled + */ +static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id, + struct smart_attr *attrib) +{ + int rv, i; + struct smart_attr *pattr; + + if (!attrib) + return -EINVAL; + + if (!port->identify_valid) { + dev_warn(&port->dd->pdev->dev, "IDENTIFY DATA not valid\n"); + return -EPERM; + } + if (!(port->identify[82] & 0x1)) { + dev_warn(&port->dd->pdev->dev, "SMART not supported\n"); + return -EPERM; + } + if (!(port->identify[85] & 0x1)) { + dev_warn(&port->dd->pdev->dev, "SMART not enabled\n"); + return -EPERM; + } + + memset(port->smart_buf, 0, ATA_SECT_SIZE); + rv = mtip_get_smart_data(port, port->smart_buf, port->smart_buf_dma); + if (rv) { + dev_warn(&port->dd->pdev->dev, "Failed to ge SMART data\n"); + return rv; + } + + pattr = (struct smart_attr *)(port->smart_buf + 2); + for (i = 0; i < 29; i++, pattr++) + if (pattr->attr_id == id) { + memcpy(attrib, pattr, sizeof(struct smart_attr)); + break; + } + + if (i == 29) { + dev_warn(&port->dd->pdev->dev, + "Query for invalid SMART attribute ID\n"); + rv = -EINVAL; + } return rv; } @@ -1504,10 +1838,7 @@ static int exec_drive_task(struct mtip_port *port, u8 *command) fis.cyl_hi = command[5]; fis.device = command[6] & ~0x10; /* Clear the dev bit*/ - - dbg_printk(MTIP_DRV_NAME "%s: User Command: cmd %x, feat %x, " - "nsect %x, sect %x, lcyl %x, " - "hcyl %x, sel %x\n", + dbg_printk(MTIP_DRV_NAME " %s: User Command: cmd %x, feat %x, nsect %x, sect %x, lcyl %x, hcyl %x, sel %x\n", __func__, command[0], command[1], @@ -1534,8 +1865,7 @@ static int exec_drive_task(struct mtip_port *port, u8 *command) command[4] = reply->cyl_low; command[5] = reply->cyl_hi; - dbg_printk(MTIP_DRV_NAME "%s: Completion Status: stat %x, " - "err %x , cyl_lo %x cyl_hi %x\n", + dbg_printk(MTIP_DRV_NAME " %s: Completion Status: stat %x, err %x , cyl_lo %x cyl_hi %x\n", __func__, command[0], command[1], @@ -1562,13 +1892,33 @@ static int 
exec_drive_command(struct mtip_port *port, u8 *command, void __user *user_buffer) { struct host_to_dev_fis fis; - struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG); + struct host_to_dev_fis *reply; + u8 *buf = NULL; + dma_addr_t dma_addr = 0; + int rv = 0, xfer_sz = command[3]; + + if (xfer_sz) { + if (user_buffer) + return -EFAULT; + + buf = dmam_alloc_coherent(&port->dd->pdev->dev, + ATA_SECT_SIZE * xfer_sz, + &dma_addr, + GFP_KERNEL); + if (!buf) { + dev_err(&port->dd->pdev->dev, + "Memory allocation failed (%d bytes)\n", + ATA_SECT_SIZE * xfer_sz); + return -ENOMEM; + } + memset(buf, 0, ATA_SECT_SIZE * xfer_sz); + } /* Build the FIS. */ memset(&fis, 0, sizeof(struct host_to_dev_fis)); - fis.type = 0x27; - fis.opts = 1 << 7; - fis.command = command[0]; + fis.type = 0x27; + fis.opts = 1 << 7; + fis.command = command[0]; fis.features = command[2]; fis.sect_count = command[3]; if (fis.command == ATA_CMD_SMART) { @@ -1577,8 +1927,13 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, fis.cyl_hi = 0xC2; } + if (xfer_sz) + reply = (port->rxfis + RX_FIS_PIO_SETUP); + else + reply = (port->rxfis + RX_FIS_D2H_REG); + dbg_printk(MTIP_DRV_NAME - "%s: User Command: cmd %x, sect %x, " + " %s: User Command: cmd %x, sect %x, " "feat %x, sectcnt %x\n", __func__, command[0], @@ -1586,43 +1941,46 @@ static int exec_drive_command(struct mtip_port *port, u8 *command, command[2], command[3]); - memset(port->sector_buffer, 0x00, ATA_SECT_SIZE); - /* Execute the command. */ if (mtip_exec_internal_command(port, &fis, 5, - port->sector_buffer_dma, - (command[3] != 0) ? ATA_SECT_SIZE : 0, + (xfer_sz ? dma_addr : 0), + (xfer_sz ? ATA_SECT_SIZE * xfer_sz : 0), 0, GFP_KERNEL, MTIP_IOCTL_COMMAND_TIMEOUT_MS) < 0) { - return -1; + rv = -EFAULT; + goto exit_drive_command; } /* Collect the completion status. 
*/ command[0] = reply->command; /* Status*/ command[1] = reply->features; /* Error*/ - command[2] = command[3]; + command[2] = reply->sect_count; dbg_printk(MTIP_DRV_NAME - "%s: Completion Status: stat %x, " - "err %x, cmd %x\n", + " %s: Completion Status: stat %x, " + "err %x, nsect %x\n", __func__, command[0], command[1], command[2]); - if (user_buffer && command[3]) { + if (xfer_sz) { if (copy_to_user(user_buffer, - port->sector_buffer, + buf, ATA_SECT_SIZE * command[3])) { - return -EFAULT; + rv = -EFAULT; + goto exit_drive_command; } } - - return 0; +exit_drive_command: + if (buf) + dmam_free_coherent(&port->dd->pdev->dev, + ATA_SECT_SIZE * xfer_sz, buf, dma_addr); + return rv; } /* @@ -1672,6 +2030,32 @@ static unsigned int implicit_sector(unsigned char command, return rv; } +static void mtip_set_timeout(struct host_to_dev_fis *fis, unsigned int *timeout) +{ + switch (fis->command) { + case ATA_CMD_DOWNLOAD_MICRO: + *timeout = 120000; /* 2 minutes */ + break; + case ATA_CMD_SEC_ERASE_UNIT: + case 0xFC: + *timeout = 240000; /* 4 minutes */ + break; + case ATA_CMD_STANDBYNOW1: + *timeout = 10000; /* 10 seconds */ + break; + case 0xF7: + case 0xFA: + *timeout = 60000; /* 60 seconds */ + break; + case ATA_CMD_SMART: + *timeout = 15000; /* 15 seconds */ + break; + default: + *timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS; + break; + } +} + /* * Executes a taskfile * See ide_taskfile_ioctl() for derivation @@ -1692,7 +2076,7 @@ static int exec_drive_taskfile(struct driver_data *dd, unsigned int taskin = 0; unsigned int taskout = 0; u8 nsect = 0; - unsigned int timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS; + unsigned int timeout; unsigned int force_single_sector; unsigned int transfer_size; unsigned long task_file_data; @@ -1810,9 +2194,10 @@ static int exec_drive_taskfile(struct driver_data *dd, } dbg_printk(MTIP_DRV_NAME - "taskfile: cmd %x, feat %x, nsect %x," + " %s: cmd %x, feat %x, nsect %x," " sect/lbal %x, lcyl/lbam %x, hcyl/lbah %x," " head/dev %x\n", + __func__, 
fis.command, fis.features, fis.sect_count, @@ -1821,32 +2206,7 @@ static int exec_drive_taskfile(struct driver_data *dd, fis.lba_hi, fis.device); - switch (fis.command) { - case ATA_CMD_DOWNLOAD_MICRO: - /* Change timeout for Download Microcode to 60 seconds.*/ - timeout = 60000; - break; - case ATA_CMD_SEC_ERASE_UNIT: - /* Change timeout for Security Erase Unit to 4 minutes.*/ - timeout = 240000; - break; - case ATA_CMD_STANDBYNOW1: - /* Change timeout for standby immediate to 10 seconds.*/ - timeout = 10000; - break; - case 0xF7: - case 0xFA: - /* Change timeout for vendor unique command to 10 secs */ - timeout = 10000; - break; - case ATA_CMD_SMART: - /* Change timeout for vendor unique command to 10 secs */ - timeout = 10000; - break; - default: - timeout = MTIP_IOCTL_COMMAND_TIMEOUT_MS; - break; - } + mtip_set_timeout(&fis, &timeout); /* Determine the correct transfer size.*/ if (force_single_sector) @@ -1903,18 +2263,8 @@ static int exec_drive_taskfile(struct driver_data *dd, req_task->hob_ports[1] = reply->features_ex; req_task->hob_ports[2] = reply->sect_cnt_ex; } - - /* Com rest after secure erase or lowlevel format */ - if (((fis.command == ATA_CMD_SEC_ERASE_UNIT) || - ((fis.command == 0xFC) && - (fis.features == 0x27 || fis.features == 0x72 || - fis.features == 0x62 || fis.features == 0x26))) && - !(reply->command & 1)) { - mtip_restart_port(dd->port); - } - dbg_printk(MTIP_DRV_NAME - "%s: Completion: stat %x," + " %s: Completion: stat %x," "err %x, sect_cnt %x, lbalo %x," "lbamid %x, lbahi %x, dev %x\n", __func__, @@ -1973,13 +2323,12 @@ static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd, { switch (cmd) { case HDIO_GET_IDENTITY: - if (mtip_get_identify(dd->port, (void __user *) arg) < 0) { - dev_warn(&dd->pdev->dev, - "Unable to read identity\n"); - return -EIO; - } - + { + if (copy_to_user((void __user *)arg, dd->port->identify, + sizeof(u16) * ATA_ID_WORDS)) + return -EFAULT; break; + } case HDIO_DRIVE_CMD: { u8 drive_command[4]; @@ 
-2080,14 +2429,10 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t start, struct host_to_dev_fis *fis; struct mtip_port *port = dd->port; struct mtip_cmd *command = &port->commands[tag]; + int dma_dir = (dir == READ) ? DMA_FROM_DEVICE : DMA_TO_DEVICE; /* Map the scatter list for DMA access */ - if (dir == READ) - nents = dma_map_sg(&dd->pdev->dev, command->sg, - nents, DMA_FROM_DEVICE); - else - nents = dma_map_sg(&dd->pdev->dev, command->sg, - nents, DMA_TO_DEVICE); + nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir); command->scatter_ents = nents; @@ -2127,7 +2472,7 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t start, */ command->comp_data = dd; command->comp_func = mtip_async_complete; - command->direction = (dir == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE); + command->direction = dma_dir; /* * Set the completion function and data for the command passed @@ -2140,19 +2485,16 @@ static void mtip_hw_submit_io(struct driver_data *dd, sector_t start, * To prevent this command from being issued * if an internal command is in progress or error handling is active. 
*/ - if (unlikely(test_bit(MTIP_FLAG_IC_ACTIVE_BIT, &port->flags) || - test_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags))) { + if (port->flags & MTIP_PF_PAUSE_IO) { set_bit(tag, port->cmds_to_issue); - set_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags); + set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); return; } /* Issue the command to the hardware */ mtip_issue_ncq_command(port, tag); - /* Set the command's timeout value.*/ - port->commands[tag].comp_time = jiffies + msecs_to_jiffies( - MTIP_NCQ_COMMAND_TIMEOUT_MS); + return; } /* @@ -2191,8 +2533,14 @@ static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, down(&dd->port->cmd_slot); *tag = get_slot(dd->port); - if (unlikely(*tag < 0)) + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { + up(&dd->port->cmd_slot); + return NULL; + } + if (unlikely(*tag < 0)) { + up(&dd->port->cmd_slot); return NULL; + } return dd->port->commands[*tag].sg; } @@ -2207,7 +2555,7 @@ static struct scatterlist *mtip_hw_get_scatterlist(struct driver_data *dd, * return value * The size, in bytes, of the data copied into buf. 
*/ -static ssize_t hw_show_registers(struct device *dev, +static ssize_t mtip_hw_show_registers(struct device *dev, struct device_attribute *attr, char *buf) { @@ -2216,44 +2564,97 @@ static ssize_t hw_show_registers(struct device *dev, int size = 0; int n; - size += sprintf(&buf[size], "%s:\ns_active:\n", __func__); + size += sprintf(&buf[size], "Hardware\n--------\n"); + size += sprintf(&buf[size], "S ACTive : [ 0x"); - for (n = 0; n < dd->slot_groups; n++) - size += sprintf(&buf[size], "0x%08x\n", + for (n = dd->slot_groups-1; n >= 0; n--) + size += sprintf(&buf[size], "%08X ", readl(dd->port->s_active[n])); - size += sprintf(&buf[size], "Command Issue:\n"); + size += sprintf(&buf[size], "]\n"); + size += sprintf(&buf[size], "Command Issue : [ 0x"); - for (n = 0; n < dd->slot_groups; n++) - size += sprintf(&buf[size], "0x%08x\n", + for (n = dd->slot_groups-1; n >= 0; n--) + size += sprintf(&buf[size], "%08X ", readl(dd->port->cmd_issue[n])); - size += sprintf(&buf[size], "Allocated:\n"); + size += sprintf(&buf[size], "]\n"); + size += sprintf(&buf[size], "Completed : [ 0x"); + + for (n = dd->slot_groups-1; n >= 0; n--) + size += sprintf(&buf[size], "%08X ", + readl(dd->port->completed[n])); + + size += sprintf(&buf[size], "]\n"); + size += sprintf(&buf[size], "PORT IRQ STAT : [ 0x%08X ]\n", + readl(dd->port->mmio + PORT_IRQ_STAT)); + size += sprintf(&buf[size], "HOST IRQ STAT : [ 0x%08X ]\n", + readl(dd->mmio + HOST_IRQ_STAT)); + size += sprintf(&buf[size], "\n"); + + size += sprintf(&buf[size], "Local\n-----\n"); + size += sprintf(&buf[size], "Allocated : [ 0x"); - for (n = 0; n < dd->slot_groups; n++) { + for (n = dd->slot_groups-1; n >= 0; n--) { if (sizeof(long) > sizeof(u32)) group_allocated = dd->port->allocated[n/2] >> (32*(n&1)); else group_allocated = dd->port->allocated[n]; - size += sprintf(&buf[size], "0x%08x\n", - group_allocated); + size += sprintf(&buf[size], "%08X ", group_allocated); } + size += sprintf(&buf[size], "]\n"); - size += 
sprintf(&buf[size], "completed:\n"); + size += sprintf(&buf[size], "Commands in Q: [ 0x"); - for (n = 0; n < dd->slot_groups; n++) - size += sprintf(&buf[size], "0x%08x\n", - readl(dd->port->completed[n])); + for (n = dd->slot_groups-1; n >= 0; n--) { + if (sizeof(long) > sizeof(u32)) + group_allocated = + dd->port->cmds_to_issue[n/2] >> (32*(n&1)); + else + group_allocated = dd->port->cmds_to_issue[n]; + size += sprintf(&buf[size], "%08X ", group_allocated); + } + size += sprintf(&buf[size], "]\n"); - size += sprintf(&buf[size], "PORT_IRQ_STAT 0x%08x\n", - readl(dd->port->mmio + PORT_IRQ_STAT)); - size += sprintf(&buf[size], "HOST_IRQ_STAT 0x%08x\n", - readl(dd->mmio + HOST_IRQ_STAT)); + return size; +} + +static ssize_t mtip_hw_show_status(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct driver_data *dd = dev_to_disk(dev)->private_data; + int size = 0; + + if (test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag)) + size += sprintf(buf, "%s", "thermal_shutdown\n"); + else if (test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag)) + size += sprintf(buf, "%s", "write_protect\n"); + else + size += sprintf(buf, "%s", "online\n"); + + return size; +} + +static ssize_t mtip_hw_show_flags(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct driver_data *dd = dev_to_disk(dev)->private_data; + int size = 0; + + size += sprintf(&buf[size], "Flag in port struct : [ %08lX ]\n", + dd->port->flags); + size += sprintf(&buf[size], "Flag in dd struct : [ %08lX ]\n", + dd->dd_flag); return size; } -static DEVICE_ATTR(registers, S_IRUGO, hw_show_registers, NULL); + +static DEVICE_ATTR(registers, S_IRUGO, mtip_hw_show_registers, NULL); +static DEVICE_ATTR(status, S_IRUGO, mtip_hw_show_status, NULL); +static DEVICE_ATTR(flags, S_IRUGO, mtip_hw_show_flags, NULL); /* * Create the sysfs related attributes. 
@@ -2272,7 +2673,13 @@ static int mtip_hw_sysfs_init(struct driver_data *dd, struct kobject *kobj) if (sysfs_create_file(kobj, &dev_attr_registers.attr)) dev_warn(&dd->pdev->dev, - "Error creating registers sysfs entry\n"); + "Error creating 'registers' sysfs entry\n"); + if (sysfs_create_file(kobj, &dev_attr_status.attr)) + dev_warn(&dd->pdev->dev, + "Error creating 'status' sysfs entry\n"); + if (sysfs_create_file(kobj, &dev_attr_flags.attr)) + dev_warn(&dd->pdev->dev, + "Error creating 'flags' sysfs entry\n"); return 0; } @@ -2292,6 +2699,8 @@ static int mtip_hw_sysfs_exit(struct driver_data *dd, struct kobject *kobj) return -EINVAL; sysfs_remove_file(kobj, &dev_attr_registers.attr); + sysfs_remove_file(kobj, &dev_attr_status.attr); + sysfs_remove_file(kobj, &dev_attr_flags.attr); return 0; } @@ -2384,10 +2793,12 @@ static int mtip_ftl_rebuild_poll(struct driver_data *dd) "FTL rebuild in progress. Polling for completion.\n"); start = jiffies; - dd->ftlrebuildflag = 1; timeout = jiffies + msecs_to_jiffies(MTIP_FTL_REBUILD_TIMEOUT_MS); do { + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &dd->dd_flag))) + return -EFAULT; if (mtip_check_surprise_removal(dd->pdev)) return -EFAULT; @@ -2408,22 +2819,17 @@ static int mtip_ftl_rebuild_poll(struct driver_data *dd) dev_warn(&dd->pdev->dev, "FTL rebuild complete (%d secs).\n", jiffies_to_msecs(jiffies - start) / 1000); - dd->ftlrebuildflag = 0; mtip_block_initialize(dd); - break; + return 0; } ssleep(10); } while (time_before(jiffies, timeout)); /* Check for timeout */ - if (dd->ftlrebuildflag) { - dev_err(&dd->pdev->dev, + dev_err(&dd->pdev->dev, "Timed out waiting for FTL rebuild to complete (%d secs).\n", jiffies_to_msecs(jiffies - start) / 1000); - return -EFAULT; - } - - return 0; + return -EFAULT; } /* @@ -2448,14 +2854,17 @@ static int mtip_service_thread(void *data) * is in progress nor error handling is active */ wait_event_interruptible(port->svc_wait, (port->flags) && - !test_bit(MTIP_FLAG_IC_ACTIVE_BIT, 
&port->flags) && - !test_bit(MTIP_FLAG_EH_ACTIVE_BIT, &port->flags)); + !(port->flags & MTIP_PF_PAUSE_IO)); if (kthread_should_stop()) break; - set_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags); - if (test_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags)) { + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &dd->dd_flag))) + break; + + set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); + if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) { slot = 1; /* used to restrict the loop to one iteration */ slot_start = num_cmd_slots; @@ -2480,21 +2889,19 @@ static int mtip_service_thread(void *data) /* Issue the command to the hardware */ mtip_issue_ncq_command(port, slot); - /* Set the command's timeout value.*/ - port->commands[slot].comp_time = jiffies + - msecs_to_jiffies(MTIP_NCQ_COMMAND_TIMEOUT_MS); - clear_bit(slot, port->cmds_to_issue); } - clear_bit(MTIP_FLAG_ISSUE_CMDS_BIT, &port->flags); - } else if (test_bit(MTIP_FLAG_REBUILD_BIT, &port->flags)) { - mtip_ftl_rebuild_poll(dd); - clear_bit(MTIP_FLAG_REBUILD_BIT, &port->flags); + clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags); + } else if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) { + if (!mtip_ftl_rebuild_poll(dd)) + set_bit(MTIP_DDF_REBUILD_FAILED_BIT, + &dd->dd_flag); + clear_bit(MTIP_PF_REBUILD_BIT, &port->flags); } - clear_bit(MTIP_FLAG_SVC_THD_ACTIVE_BIT, &port->flags); + clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags); - if (test_bit(MTIP_FLAG_SVC_THD_SHOULD_STOP_BIT, &port->flags)) + if (test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags)) break; } return 0; @@ -2513,6 +2920,9 @@ static int mtip_hw_init(struct driver_data *dd) int i; int rv; unsigned int num_command_slots; + unsigned long timeout, timetaken; + unsigned char *buf; + struct smart_attr attr242; dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR]; @@ -2547,7 +2957,7 @@ static int mtip_hw_init(struct driver_data *dd) /* Allocate memory for the command list. 
*/ dd->port->command_list = dmam_alloc_coherent(&dd->pdev->dev, - HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2), + HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), &dd->port->command_list_dma, GFP_KERNEL); if (!dd->port->command_list) { @@ -2560,7 +2970,7 @@ static int mtip_hw_init(struct driver_data *dd) /* Clear the memory we have allocated. */ memset(dd->port->command_list, 0, - HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2)); + HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4)); /* Setup the addresse of the RX FIS. */ dd->port->rxfis = dd->port->command_list + HW_CMD_SLOT_SZ; @@ -2576,10 +2986,19 @@ static int mtip_hw_init(struct driver_data *dd) dd->port->identify_dma = dd->port->command_tbl_dma + HW_CMD_TBL_AR_SZ; - /* Setup the address of the sector buffer. */ + /* Setup the address of the sector buffer - for some non-ncq cmds */ dd->port->sector_buffer = (void *) dd->port->identify + ATA_SECT_SIZE; dd->port->sector_buffer_dma = dd->port->identify_dma + ATA_SECT_SIZE; + /* Setup the address of the log buf - for read log command */ + dd->port->log_buf = (void *)dd->port->sector_buffer + ATA_SECT_SIZE; + dd->port->log_buf_dma = dd->port->sector_buffer_dma + ATA_SECT_SIZE; + + /* Setup the address of the smart buf - for smart read data command */ + dd->port->smart_buf = (void *)dd->port->log_buf + ATA_SECT_SIZE; + dd->port->smart_buf_dma = dd->port->log_buf_dma + ATA_SECT_SIZE; + + /* Point the command headers at the command tables. */ for (i = 0; i < num_command_slots; i++) { dd->port->commands[i].command_header = @@ -2623,14 +3042,43 @@ static int mtip_hw_init(struct driver_data *dd) dd->port->mmio + i*0x80 + PORT_SDBV; } - /* Reset the HBA. 
*/ - if (mtip_hba_reset(dd) < 0) { - dev_err(&dd->pdev->dev, - "Card did not reset within timeout\n"); - rv = -EIO; + timetaken = jiffies; + timeout = jiffies + msecs_to_jiffies(30000); + while (((readl(dd->port->mmio + PORT_SCR_STAT) & 0x0F) != 0x03) && + time_before(jiffies, timeout)) { + mdelay(100); + } + if (unlikely(mtip_check_surprise_removal(dd->pdev))) { + timetaken = jiffies - timetaken; + dev_warn(&dd->pdev->dev, + "Surprise removal detected at %u ms\n", + jiffies_to_msecs(timetaken)); + rv = -ENODEV; + goto out2 ; + } + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) { + timetaken = jiffies - timetaken; + dev_warn(&dd->pdev->dev, + "Removal detected at %u ms\n", + jiffies_to_msecs(timetaken)); + rv = -EFAULT; goto out2; } + /* Conditionally reset the HBA. */ + if (!(readl(dd->mmio + HOST_CAP) & HOST_CAP_NZDMA)) { + if (mtip_hba_reset(dd) < 0) { + dev_err(&dd->pdev->dev, + "Card did not reset within timeout\n"); + rv = -EIO; + goto out2; + } + } else { + /* Clear any pending interrupts on the HBA */ + writel(readl(dd->mmio + HOST_IRQ_STAT), + dd->mmio + HOST_IRQ_STAT); + } + mtip_init_port(dd->port); mtip_start_port(dd->port); @@ -2660,6 +3108,12 @@ static int mtip_hw_init(struct driver_data *dd) mod_timer(&dd->port->cmd_timer, jiffies + msecs_to_jiffies(MTIP_TIMEOUT_CHECK_PERIOD)); + + if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) { + rv = -EFAULT; + goto out3; + } + if (mtip_get_identify(dd->port, NULL) < 0) { rv = -EFAULT; goto out3; @@ -2667,10 +3121,47 @@ static int mtip_hw_init(struct driver_data *dd) if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) == MTIP_FTL_REBUILD_MAGIC) { - set_bit(MTIP_FLAG_REBUILD_BIT, &dd->port->flags); + set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags); return MTIP_FTL_REBUILD_MAGIC; } mtip_dump_identify(dd->port); + + /* check write protect, over temp and rebuild statuses */ + rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ, + dd->port->log_buf, + dd->port->log_buf_dma, 1); + if (rv) { + 
dev_warn(&dd->pdev->dev, + "Error in READ LOG EXT (10h) command\n"); + /* non-critical error, don't fail the load */ + } else { + buf = (unsigned char *)dd->port->log_buf; + if (buf[259] & 0x1) { + dev_info(&dd->pdev->dev, + "Write protect bit is set.\n"); + set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag); + } + if (buf[288] == 0xF7) { + dev_info(&dd->pdev->dev, + "Exceeded Tmax, drive in thermal shutdown.\n"); + set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag); + } + if (buf[288] == 0xBF) { + dev_info(&dd->pdev->dev, + "Drive indicates rebuild has failed.\n"); + /* TODO */ + } + } + + /* get write protect progess */ + memset(&attr242, 0, sizeof(struct smart_attr)); + if (mtip_get_smart_attr(dd->port, 242, &attr242)) + dev_warn(&dd->pdev->dev, + "Unable to check write protect progress\n"); + else + dev_info(&dd->pdev->dev, + "Write protect progress: %d%% (%d blocks)\n", + attr242.cur, attr242.data); return rv; out3: @@ -2688,7 +3179,7 @@ out2: /* Free the command/command header memory. */ dmam_free_coherent(&dd->pdev->dev, - HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2), + HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), dd->port->command_list, dd->port->command_list_dma); out1: @@ -2712,9 +3203,12 @@ static int mtip_hw_exit(struct driver_data *dd) * Send standby immediate (E0h) to the drive so that it * saves its state. */ - if (atomic_read(&dd->drv_cleanup_done) != true) { + if (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) { - mtip_standby_immediate(dd->port); + if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags)) + if (mtip_standby_immediate(dd->port)) + dev_warn(&dd->pdev->dev, + "STANDBY IMMEDIATE failed\n"); /* de-initialize the port. */ mtip_deinit_port(dd->port); @@ -2734,7 +3228,7 @@ static int mtip_hw_exit(struct driver_data *dd) /* Free the command/command header memory. 
*/ dmam_free_coherent(&dd->pdev->dev, - HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 2), + HW_PORT_PRIV_DMA_SZ + (ATA_SECT_SIZE * 4), dd->port->command_list, dd->port->command_list_dma); /* Free the memory allocated for the for structure. */ @@ -2892,6 +3386,9 @@ static int mtip_block_ioctl(struct block_device *dev, if (!dd) return -ENOTTY; + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) + return -ENOTTY; + switch (cmd) { case BLKFLSBUF: return -ENOTTY; @@ -2927,6 +3424,9 @@ static int mtip_block_compat_ioctl(struct block_device *dev, if (!dd) return -ENOTTY; + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) + return -ENOTTY; + switch (cmd) { case BLKFLSBUF: return -ENOTTY; @@ -3049,6 +3549,24 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) int nents = 0; int tag = 0; + if (unlikely(dd->dd_flag & MTIP_DDF_STOP_IO)) { + if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, + &dd->dd_flag))) { + bio_endio(bio, -ENXIO); + return; + } + if (unlikely(test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))) { + bio_endio(bio, -ENODATA); + return; + } + if (unlikely(test_bit(MTIP_DDF_WRITE_PROTECT_BIT, + &dd->dd_flag) && + bio_data_dir(bio))) { + bio_endio(bio, -ENODATA); + return; + } + } + if (unlikely(!bio_has_data(bio))) { blk_queue_flush(queue, 0); bio_endio(bio, 0); @@ -3061,7 +3579,7 @@ static void mtip_make_request(struct request_queue *queue, struct bio *bio) if (unlikely((bio)->bi_vcnt > MTIP_MAX_SG)) { dev_warn(&dd->pdev->dev, - "Maximum number of SGL entries exceeded"); + "Maximum number of SGL entries exceeded\n"); bio_io_error(bio); mtip_hw_release_scatterlist(dd, tag); return; @@ -3181,7 +3699,10 @@ skip_create_disk: set_bit(QUEUE_FLAG_NONROT, &dd->queue->queue_flags); blk_queue_max_segments(dd->queue, MTIP_MAX_SG); blk_queue_physical_block_size(dd->queue, 4096); + blk_queue_max_hw_sectors(dd->queue, 0xffff); + blk_queue_max_segment_size(dd->queue, 0x400000); blk_queue_io_min(dd->queue, 4096); + /* * 
write back cache is not supported in the device. FUA depends on * write back cache support, hence setting flush support to zero. @@ -3210,8 +3731,10 @@ skip_create_disk: kobject_put(kobj); } - if (dd->mtip_svc_handler) + if (dd->mtip_svc_handler) { + set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag); return rv; /* service thread created for handling rebuild */ + } start_service_thread: sprintf(thd_name, "mtip_svc_thd_%02d", index); @@ -3220,12 +3743,15 @@ start_service_thread: dd, thd_name); if (IS_ERR(dd->mtip_svc_handler)) { - printk(KERN_ERR "mtip32xx: service thread failed to start\n"); + dev_err(&dd->pdev->dev, "service thread failed to start\n"); dd->mtip_svc_handler = NULL; rv = -EFAULT; goto kthread_run_error; } + if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC) + rv = wait_for_rebuild; + return rv; kthread_run_error: @@ -3266,16 +3792,18 @@ static int mtip_block_remove(struct driver_data *dd) struct kobject *kobj; if (dd->mtip_svc_handler) { - set_bit(MTIP_FLAG_SVC_THD_SHOULD_STOP_BIT, &dd->port->flags); + set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags); wake_up_interruptible(&dd->port->svc_wait); kthread_stop(dd->mtip_svc_handler); } - /* Clean up the sysfs attributes managed by the protocol layer. 
*/ - kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); - if (kobj) { - mtip_hw_sysfs_exit(dd, kobj); - kobject_put(kobj); + /* Clean up the sysfs attributes, if created */ + if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag)) { + kobj = kobject_get(&disk_to_dev(dd->disk)->kobj); + if (kobj) { + mtip_hw_sysfs_exit(dd, kobj); + kobject_put(kobj); + } } /* @@ -3283,6 +3811,11 @@ static int mtip_block_remove(struct driver_data *dd) * from /dev */ del_gendisk(dd->disk); + + spin_lock(&rssd_index_lock); + ida_remove(&rssd_index_ida, dd->index); + spin_unlock(&rssd_index_lock); + blk_cleanup_queue(dd->queue); dd->disk = NULL; dd->queue = NULL; @@ -3312,6 +3845,11 @@ static int mtip_block_shutdown(struct driver_data *dd) /* Delete our gendisk structure, and cleanup the blk queue. */ del_gendisk(dd->disk); + + spin_lock(&rssd_index_lock); + ida_remove(&rssd_index_ida, dd->index); + spin_unlock(&rssd_index_lock); + blk_cleanup_queue(dd->queue); dd->disk = NULL; dd->queue = NULL; @@ -3359,11 +3897,6 @@ static int mtip_pci_probe(struct pci_dev *pdev, return -ENOMEM; } - /* Set the atomic variable as 1 in case of SRSI */ - atomic_set(&dd->drv_cleanup_done, true); - - atomic_set(&dd->resumeflag, false); - /* Attach the private data to this PCI device. */ pci_set_drvdata(pdev, dd); @@ -3420,7 +3953,8 @@ static int mtip_pci_probe(struct pci_dev *pdev, * instance number. 
*/ instance++; - + if (rv != MTIP_FTL_REBUILD_MAGIC) + set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag); goto done; block_initialize_err: @@ -3434,9 +3968,6 @@ iomap_err: pci_set_drvdata(pdev, NULL); return rv; done: - /* Set the atomic variable as 0 in case of SRSI */ - atomic_set(&dd->drv_cleanup_done, true); - return rv; } @@ -3452,8 +3983,10 @@ static void mtip_pci_remove(struct pci_dev *pdev) struct driver_data *dd = pci_get_drvdata(pdev); int counter = 0; + set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag); + if (mtip_check_surprise_removal(pdev)) { - while (atomic_read(&dd->drv_cleanup_done) == false) { + while (!test_bit(MTIP_DDF_CLEANUP_BIT, &dd->dd_flag)) { counter++; msleep(20); if (counter == 10) { @@ -3463,8 +3996,6 @@ static void mtip_pci_remove(struct pci_dev *pdev) } } } - /* Set the atomic variable as 1 in case of SRSI */ - atomic_set(&dd->drv_cleanup_done, true); /* Clean up the block layer. */ mtip_block_remove(dd); @@ -3493,7 +4024,7 @@ static int mtip_pci_suspend(struct pci_dev *pdev, pm_message_t mesg) return -EFAULT; } - atomic_set(&dd->resumeflag, true); + set_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag); /* Disable ports & interrupts then send standby immediate */ rv = mtip_block_suspend(dd); @@ -3559,7 +4090,7 @@ static int mtip_pci_resume(struct pci_dev *pdev) dev_err(&pdev->dev, "Unable to resume\n"); err: - atomic_set(&dd->resumeflag, false); + clear_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag); return rv; } @@ -3608,18 +4139,25 @@ MODULE_DEVICE_TABLE(pci, mtip_pci_tbl); */ static int __init mtip_init(void) { + int error; + printk(KERN_INFO MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n"); /* Allocate a major block device number to use with this driver. */ - mtip_major = register_blkdev(0, MTIP_DRV_NAME); - if (mtip_major < 0) { + error = register_blkdev(0, MTIP_DRV_NAME); + if (error <= 0) { printk(KERN_ERR "Unable to register block device (%d)\n", - mtip_major); + error); return -EBUSY; } + mtip_major = error; /* Register our PCI operations. 
*/ - return pci_register_driver(&mtip_pci_driver); + error = pci_register_driver(&mtip_pci_driver); + if (error) + unregister_blkdev(mtip_major, MTIP_DRV_NAME); + + return error; } /* diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h index e0554a8f2233..b2c88da26b2a 100644 --- a/drivers/block/mtip32xx/mtip32xx.h +++ b/drivers/block/mtip32xx/mtip32xx.h @@ -34,8 +34,8 @@ /* offset of Device Control register in PCIe extended capabilites space */ #define PCIE_CONFIG_EXT_DEVICE_CONTROL_OFFSET 0x48 -/* # of times to retry timed out IOs */ -#define MTIP_MAX_RETRIES 5 +/* # of times to retry timed out/failed IOs */ +#define MTIP_MAX_RETRIES 2 /* Various timeout values in ms */ #define MTIP_NCQ_COMMAND_TIMEOUT_MS 5000 @@ -113,13 +113,44 @@ #define __force_bit2int (unsigned int __force) -/* below are bit numbers in 'flags' defined in mtip_port */ -#define MTIP_FLAG_IC_ACTIVE_BIT 0 -#define MTIP_FLAG_EH_ACTIVE_BIT 1 -#define MTIP_FLAG_SVC_THD_ACTIVE_BIT 2 -#define MTIP_FLAG_ISSUE_CMDS_BIT 4 -#define MTIP_FLAG_REBUILD_BIT 5 -#define MTIP_FLAG_SVC_THD_SHOULD_STOP_BIT 8 +enum { + /* below are bit numbers in 'flags' defined in mtip_port */ + MTIP_PF_IC_ACTIVE_BIT = 0, /* pio/ioctl */ + MTIP_PF_EH_ACTIVE_BIT = 1, /* error handling */ + MTIP_PF_SE_ACTIVE_BIT = 2, /* secure erase */ + MTIP_PF_DM_ACTIVE_BIT = 3, /* download microcde */ + MTIP_PF_PAUSE_IO = ((1 << MTIP_PF_IC_ACTIVE_BIT) | \ + (1 << MTIP_PF_EH_ACTIVE_BIT) | \ + (1 << MTIP_PF_SE_ACTIVE_BIT) | \ + (1 << MTIP_PF_DM_ACTIVE_BIT)), + + MTIP_PF_SVC_THD_ACTIVE_BIT = 4, + MTIP_PF_ISSUE_CMDS_BIT = 5, + MTIP_PF_REBUILD_BIT = 6, + MTIP_PF_SVC_THD_STOP_BIT = 8, + + /* below are bit numbers in 'dd_flag' defined in driver_data */ + MTIP_DDF_REMOVE_PENDING_BIT = 1, + MTIP_DDF_OVER_TEMP_BIT = 2, + MTIP_DDF_WRITE_PROTECT_BIT = 3, + MTIP_DDF_STOP_IO = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) | \ + (1 << MTIP_DDF_OVER_TEMP_BIT) | \ + (1 << MTIP_DDF_WRITE_PROTECT_BIT)), + + MTIP_DDF_CLEANUP_BIT = 5, + 
MTIP_DDF_RESUME_BIT = 6, + MTIP_DDF_INIT_DONE_BIT = 7, + MTIP_DDF_REBUILD_FAILED_BIT = 8, +}; + +__packed struct smart_attr{ + u8 attr_id; + u16 flags; + u8 cur; + u8 worst; + u32 data; + u8 res[3]; +}; /* Register Frame Information Structure (FIS), host to device. */ struct host_to_dev_fis { @@ -345,6 +376,12 @@ struct mtip_port { * when the command slot and all associated data structures * are no longer needed. */ + u16 *log_buf; + dma_addr_t log_buf_dma; + + u8 *smart_buf; + dma_addr_t smart_buf_dma; + unsigned long allocated[SLOTBITS_IN_LONGS]; /* * used to queue commands when an internal command is in progress @@ -368,6 +405,7 @@ struct mtip_port { * Timer used to complete commands that have been active for too long. */ struct timer_list cmd_timer; + unsigned long ic_pause_timer; /* * Semaphore used to block threads if there are no * command slots available. @@ -404,13 +442,9 @@ struct driver_data { unsigned slot_groups; /* number of slot groups the product supports */ - atomic_t drv_cleanup_done; /* Atomic variable for SRSI */ - unsigned long index; /* Index to determine the disk name */ - unsigned int ftlrebuildflag; /* FTL rebuild flag */ - - atomic_t resumeflag; /* Atomic variable to track suspend/resume */ + unsigned long dd_flag; /* NOTE: use atomic bit operations on this */ struct task_struct *mtip_svc_handler; /* task_struct of svc thd */ }; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index c3f0ee16594d..061427a75d37 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -34,12 +34,11 @@ #include <linux/kthread.h> #include <asm/uaccess.h> -#include <asm/system.h> #include <asm/types.h> #include <linux/nbd.h> -#define LO_MAGIC 0x68797548 +#define NBD_MAGIC 0x68797548 #ifdef NDEBUG #define dprintk(flags, fmt...) 
@@ -116,7 +115,7 @@ static void nbd_end_request(struct request *req) spin_unlock_irqrestore(q->queue_lock, flags); } -static void sock_shutdown(struct nbd_device *lo, int lock) +static void sock_shutdown(struct nbd_device *nbd, int lock) { /* Forcibly shutdown the socket causing all listeners * to error @@ -125,14 +124,14 @@ static void sock_shutdown(struct nbd_device *lo, int lock) * there should be a more generic interface rather than * calling socket ops directly here */ if (lock) - mutex_lock(&lo->tx_lock); - if (lo->sock) { - dev_warn(disk_to_dev(lo->disk), "shutting down socket\n"); - kernel_sock_shutdown(lo->sock, SHUT_RDWR); - lo->sock = NULL; + mutex_lock(&nbd->tx_lock); + if (nbd->sock) { + dev_warn(disk_to_dev(nbd->disk), "shutting down socket\n"); + kernel_sock_shutdown(nbd->sock, SHUT_RDWR); + nbd->sock = NULL; } if (lock) - mutex_unlock(&lo->tx_lock); + mutex_unlock(&nbd->tx_lock); } static void nbd_xmit_timeout(unsigned long arg) @@ -147,17 +146,17 @@ static void nbd_xmit_timeout(unsigned long arg) /* * Send or receive packet. */ -static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, +static int sock_xmit(struct nbd_device *nbd, int send, void *buf, int size, int msg_flags) { - struct socket *sock = lo->sock; + struct socket *sock = nbd->sock; int result; struct msghdr msg; struct kvec iov; sigset_t blocked, oldset; if (unlikely(!sock)) { - dev_err(disk_to_dev(lo->disk), + dev_err(disk_to_dev(nbd->disk), "Attempted %s on closed socket in sock_xmit\n", (send ? 
"send" : "recv")); return -EINVAL; @@ -181,15 +180,15 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, if (send) { struct timer_list ti; - if (lo->xmit_timeout) { + if (nbd->xmit_timeout) { init_timer(&ti); ti.function = nbd_xmit_timeout; ti.data = (unsigned long)current; - ti.expires = jiffies + lo->xmit_timeout; + ti.expires = jiffies + nbd->xmit_timeout; add_timer(&ti); } result = kernel_sendmsg(sock, &msg, &iov, 1, size); - if (lo->xmit_timeout) + if (nbd->xmit_timeout) del_timer_sync(&ti); } else result = kernel_recvmsg(sock, &msg, &iov, 1, size, @@ -201,7 +200,7 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, task_pid_nr(current), current->comm, dequeue_signal_lock(current, &current->blocked, &info)); result = -EINTR; - sock_shutdown(lo, !send); + sock_shutdown(nbd, !send); break; } @@ -219,18 +218,19 @@ static int sock_xmit(struct nbd_device *lo, int send, void *buf, int size, return result; } -static inline int sock_send_bvec(struct nbd_device *lo, struct bio_vec *bvec, +static inline int sock_send_bvec(struct nbd_device *nbd, struct bio_vec *bvec, int flags) { int result; void *kaddr = kmap(bvec->bv_page); - result = sock_xmit(lo, 1, kaddr + bvec->bv_offset, bvec->bv_len, flags); + result = sock_xmit(nbd, 1, kaddr + bvec->bv_offset, + bvec->bv_len, flags); kunmap(bvec->bv_page); return result; } /* always call with the tx_lock held */ -static int nbd_send_req(struct nbd_device *lo, struct request *req) +static int nbd_send_req(struct nbd_device *nbd, struct request *req) { int result, flags; struct nbd_request request; @@ -243,14 +243,14 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req) memcpy(request.handle, &req, sizeof(req)); dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n", - lo->disk->disk_name, req, + nbd->disk->disk_name, req, nbdcmd_to_ascii(nbd_cmd(req)), (unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req)); - result = sock_xmit(lo, 1, &request, 
sizeof(request), + result = sock_xmit(nbd, 1, &request, sizeof(request), (nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0); if (result <= 0) { - dev_err(disk_to_dev(lo->disk), + dev_err(disk_to_dev(nbd->disk), "Send control failed (result %d)\n", result); goto error_out; } @@ -267,10 +267,10 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req) if (!rq_iter_last(req, iter)) flags = MSG_MORE; dprintk(DBG_TX, "%s: request %p: sending %d bytes data\n", - lo->disk->disk_name, req, bvec->bv_len); - result = sock_send_bvec(lo, bvec, flags); + nbd->disk->disk_name, req, bvec->bv_len); + result = sock_send_bvec(nbd, bvec, flags); if (result <= 0) { - dev_err(disk_to_dev(lo->disk), + dev_err(disk_to_dev(nbd->disk), "Send data failed (result %d)\n", result); goto error_out; @@ -283,25 +283,25 @@ error_out: return -EIO; } -static struct request *nbd_find_request(struct nbd_device *lo, +static struct request *nbd_find_request(struct nbd_device *nbd, struct request *xreq) { struct request *req, *tmp; int err; - err = wait_event_interruptible(lo->active_wq, lo->active_req != xreq); + err = wait_event_interruptible(nbd->active_wq, nbd->active_req != xreq); if (unlikely(err)) goto out; - spin_lock(&lo->queue_lock); - list_for_each_entry_safe(req, tmp, &lo->queue_head, queuelist) { + spin_lock(&nbd->queue_lock); + list_for_each_entry_safe(req, tmp, &nbd->queue_head, queuelist) { if (req != xreq) continue; list_del_init(&req->queuelist); - spin_unlock(&lo->queue_lock); + spin_unlock(&nbd->queue_lock); return req; } - spin_unlock(&lo->queue_lock); + spin_unlock(&nbd->queue_lock); err = -ENOENT; @@ -309,78 +309,78 @@ out: return ERR_PTR(err); } -static inline int sock_recv_bvec(struct nbd_device *lo, struct bio_vec *bvec) +static inline int sock_recv_bvec(struct nbd_device *nbd, struct bio_vec *bvec) { int result; void *kaddr = kmap(bvec->bv_page); - result = sock_xmit(lo, 0, kaddr + bvec->bv_offset, bvec->bv_len, + result = sock_xmit(nbd, 0, kaddr + bvec->bv_offset, 
bvec->bv_len, MSG_WAITALL); kunmap(bvec->bv_page); return result; } /* NULL returned = something went wrong, inform userspace */ -static struct request *nbd_read_stat(struct nbd_device *lo) +static struct request *nbd_read_stat(struct nbd_device *nbd) { int result; struct nbd_reply reply; struct request *req; reply.magic = 0; - result = sock_xmit(lo, 0, &reply, sizeof(reply), MSG_WAITALL); + result = sock_xmit(nbd, 0, &reply, sizeof(reply), MSG_WAITALL); if (result <= 0) { - dev_err(disk_to_dev(lo->disk), + dev_err(disk_to_dev(nbd->disk), "Receive control failed (result %d)\n", result); goto harderror; } if (ntohl(reply.magic) != NBD_REPLY_MAGIC) { - dev_err(disk_to_dev(lo->disk), "Wrong magic (0x%lx)\n", + dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n", (unsigned long)ntohl(reply.magic)); result = -EPROTO; goto harderror; } - req = nbd_find_request(lo, *(struct request **)reply.handle); + req = nbd_find_request(nbd, *(struct request **)reply.handle); if (IS_ERR(req)) { result = PTR_ERR(req); if (result != -ENOENT) goto harderror; - dev_err(disk_to_dev(lo->disk), "Unexpected reply (%p)\n", + dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%p)\n", reply.handle); result = -EBADR; goto harderror; } if (ntohl(reply.error)) { - dev_err(disk_to_dev(lo->disk), "Other side returned error (%d)\n", + dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n", ntohl(reply.error)); req->errors++; return req; } dprintk(DBG_RX, "%s: request %p: got reply\n", - lo->disk->disk_name, req); + nbd->disk->disk_name, req); if (nbd_cmd(req) == NBD_CMD_READ) { struct req_iterator iter; struct bio_vec *bvec; rq_for_each_segment(bvec, req, iter) { - result = sock_recv_bvec(lo, bvec); + result = sock_recv_bvec(nbd, bvec); if (result <= 0) { - dev_err(disk_to_dev(lo->disk), "Receive data failed (result %d)\n", + dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n", result); req->errors++; return req; } dprintk(DBG_RX, "%s: request %p: got %d bytes data\n", 
- lo->disk->disk_name, req, bvec->bv_len); + nbd->disk->disk_name, req, bvec->bv_len); } } return req; harderror: - lo->harderror = result; + nbd->harderror = result; return NULL; } @@ -398,48 +398,48 @@ static struct device_attribute pid_attr = { .show = pid_show, }; -static int nbd_do_it(struct nbd_device *lo) +static int nbd_do_it(struct nbd_device *nbd) { struct request *req; int ret; - BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(nbd->magic != NBD_MAGIC); - lo->pid = task_pid_nr(current); - ret = device_create_file(disk_to_dev(lo->disk), &pid_attr); + nbd->pid = task_pid_nr(current); + ret = device_create_file(disk_to_dev(nbd->disk), &pid_attr); if (ret) { - dev_err(disk_to_dev(lo->disk), "device_create_file failed!\n"); - lo->pid = 0; + dev_err(disk_to_dev(nbd->disk), "device_create_file failed!\n"); + nbd->pid = 0; return ret; } - while ((req = nbd_read_stat(lo)) != NULL) + while ((req = nbd_read_stat(nbd)) != NULL) nbd_end_request(req); - device_remove_file(disk_to_dev(lo->disk), &pid_attr); - lo->pid = 0; + device_remove_file(disk_to_dev(nbd->disk), &pid_attr); + nbd->pid = 0; return 0; } -static void nbd_clear_que(struct nbd_device *lo) +static void nbd_clear_que(struct nbd_device *nbd) { struct request *req; - BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(nbd->magic != NBD_MAGIC); /* - * Because we have set lo->sock to NULL under the tx_lock, all + * Because we have set nbd->sock to NULL under the tx_lock, all * modifications to the list must have completed by now. For * the same reason, the active_req must be NULL. * * As a consequence, we don't need to take the spin lock while * purging the list here. 
*/ - BUG_ON(lo->sock); - BUG_ON(lo->active_req); + BUG_ON(nbd->sock); + BUG_ON(nbd->active_req); - while (!list_empty(&lo->queue_head)) { - req = list_entry(lo->queue_head.next, struct request, + while (!list_empty(&nbd->queue_head)) { + req = list_entry(nbd->queue_head.next, struct request, queuelist); list_del_init(&req->queuelist); req->errors++; @@ -448,7 +448,7 @@ static void nbd_clear_que(struct nbd_device *lo) } -static void nbd_handle_req(struct nbd_device *lo, struct request *req) +static void nbd_handle_req(struct nbd_device *nbd, struct request *req) { if (req->cmd_type != REQ_TYPE_FS) goto error_out; @@ -456,8 +456,8 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req) nbd_cmd(req) = NBD_CMD_READ; if (rq_data_dir(req) == WRITE) { nbd_cmd(req) = NBD_CMD_WRITE; - if (lo->flags & NBD_READ_ONLY) { - dev_err(disk_to_dev(lo->disk), + if (nbd->flags & NBD_READ_ONLY) { + dev_err(disk_to_dev(nbd->disk), "Write on read-only\n"); goto error_out; } @@ -465,29 +465,29 @@ static void nbd_handle_req(struct nbd_device *lo, struct request *req) req->errors = 0; - mutex_lock(&lo->tx_lock); - if (unlikely(!lo->sock)) { - mutex_unlock(&lo->tx_lock); - dev_err(disk_to_dev(lo->disk), + mutex_lock(&nbd->tx_lock); + if (unlikely(!nbd->sock)) { + mutex_unlock(&nbd->tx_lock); + dev_err(disk_to_dev(nbd->disk), "Attempted send on closed socket\n"); goto error_out; } - lo->active_req = req; + nbd->active_req = req; - if (nbd_send_req(lo, req) != 0) { - dev_err(disk_to_dev(lo->disk), "Request send failed\n"); + if (nbd_send_req(nbd, req) != 0) { + dev_err(disk_to_dev(nbd->disk), "Request send failed\n"); req->errors++; nbd_end_request(req); } else { - spin_lock(&lo->queue_lock); - list_add(&req->queuelist, &lo->queue_head); - spin_unlock(&lo->queue_lock); + spin_lock(&nbd->queue_lock); + list_add(&req->queuelist, &nbd->queue_head); + spin_unlock(&nbd->queue_lock); } - lo->active_req = NULL; - mutex_unlock(&lo->tx_lock); - wake_up_all(&lo->active_wq); + 
nbd->active_req = NULL; + mutex_unlock(&nbd->tx_lock); + wake_up_all(&nbd->active_wq); return; @@ -498,28 +498,28 @@ error_out: static int nbd_thread(void *data) { - struct nbd_device *lo = data; + struct nbd_device *nbd = data; struct request *req; set_user_nice(current, -20); - while (!kthread_should_stop() || !list_empty(&lo->waiting_queue)) { + while (!kthread_should_stop() || !list_empty(&nbd->waiting_queue)) { /* wait for something to do */ - wait_event_interruptible(lo->waiting_wq, + wait_event_interruptible(nbd->waiting_wq, kthread_should_stop() || - !list_empty(&lo->waiting_queue)); + !list_empty(&nbd->waiting_queue)); /* extract request */ - if (list_empty(&lo->waiting_queue)) + if (list_empty(&nbd->waiting_queue)) continue; - spin_lock_irq(&lo->queue_lock); - req = list_entry(lo->waiting_queue.next, struct request, + spin_lock_irq(&nbd->queue_lock); + req = list_entry(nbd->waiting_queue.next, struct request, queuelist); list_del_init(&req->queuelist); - spin_unlock_irq(&lo->queue_lock); + spin_unlock_irq(&nbd->queue_lock); /* handle request */ - nbd_handle_req(lo, req); + nbd_handle_req(nbd, req); } return 0; } @@ -527,7 +527,7 @@ static int nbd_thread(void *data) /* * We always wait for result of write, for now. 
It would be nice to make it optional * in future - * if ((rq_data_dir(req) == WRITE) && (lo->flags & NBD_WRITE_NOCHK)) + * if ((rq_data_dir(req) == WRITE) && (nbd->flags & NBD_WRITE_NOCHK)) * { printk( "Warning: Ignoring result!\n"); nbd_end_request( req ); } */ @@ -536,19 +536,19 @@ static void do_nbd_request(struct request_queue *q) struct request *req; while ((req = blk_fetch_request(q)) != NULL) { - struct nbd_device *lo; + struct nbd_device *nbd; spin_unlock_irq(q->queue_lock); dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%x)\n", req->rq_disk->disk_name, req, req->cmd_type); - lo = req->rq_disk->private_data; + nbd = req->rq_disk->private_data; - BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(nbd->magic != NBD_MAGIC); - if (unlikely(!lo->sock)) { - dev_err(disk_to_dev(lo->disk), + if (unlikely(!nbd->sock)) { + dev_err(disk_to_dev(nbd->disk), "Attempted send on closed socket\n"); req->errors++; nbd_end_request(req); @@ -556,11 +556,11 @@ static void do_nbd_request(struct request_queue *q) continue; } - spin_lock_irq(&lo->queue_lock); - list_add_tail(&req->queuelist, &lo->waiting_queue); - spin_unlock_irq(&lo->queue_lock); + spin_lock_irq(&nbd->queue_lock); + list_add_tail(&req->queuelist, &nbd->waiting_queue); + spin_unlock_irq(&nbd->queue_lock); - wake_up(&lo->waiting_wq); + wake_up(&nbd->waiting_wq); spin_lock_irq(q->queue_lock); } @@ -568,32 +568,32 @@ static void do_nbd_request(struct request_queue *q) /* Must be called with tx_lock held */ -static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, +static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd, unsigned int cmd, unsigned long arg) { switch (cmd) { case NBD_DISCONNECT: { struct request sreq; - dev_info(disk_to_dev(lo->disk), "NBD_DISCONNECT\n"); + dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n"); blk_rq_init(NULL, &sreq); sreq.cmd_type = REQ_TYPE_SPECIAL; nbd_cmd(&sreq) = NBD_CMD_DISC; - if (!lo->sock) + if (!nbd->sock) return -EINVAL; - nbd_send_req(lo, 
&sreq); + nbd_send_req(nbd, &sreq); return 0; } case NBD_CLEAR_SOCK: { struct file *file; - lo->sock = NULL; - file = lo->file; - lo->file = NULL; - nbd_clear_que(lo); - BUG_ON(!list_empty(&lo->queue_head)); + nbd->sock = NULL; + file = nbd->file; + nbd->file = NULL; + nbd_clear_que(nbd); + BUG_ON(!list_empty(&nbd->queue_head)); if (file) fput(file); return 0; @@ -601,14 +601,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, case NBD_SET_SOCK: { struct file *file; - if (lo->file) + if (nbd->file) return -EBUSY; file = fget(arg); if (file) { struct inode *inode = file->f_path.dentry->d_inode; if (S_ISSOCK(inode->i_mode)) { - lo->file = file; - lo->sock = SOCKET_I(inode); + nbd->file = file; + nbd->sock = SOCKET_I(inode); if (max_part > 0) bdev->bd_invalidated = 1; return 0; @@ -620,29 +620,29 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, } case NBD_SET_BLKSIZE: - lo->blksize = arg; - lo->bytesize &= ~(lo->blksize-1); - bdev->bd_inode->i_size = lo->bytesize; - set_blocksize(bdev, lo->blksize); - set_capacity(lo->disk, lo->bytesize >> 9); + nbd->blksize = arg; + nbd->bytesize &= ~(nbd->blksize-1); + bdev->bd_inode->i_size = nbd->bytesize; + set_blocksize(bdev, nbd->blksize); + set_capacity(nbd->disk, nbd->bytesize >> 9); return 0; case NBD_SET_SIZE: - lo->bytesize = arg & ~(lo->blksize-1); - bdev->bd_inode->i_size = lo->bytesize; - set_blocksize(bdev, lo->blksize); - set_capacity(lo->disk, lo->bytesize >> 9); + nbd->bytesize = arg & ~(nbd->blksize-1); + bdev->bd_inode->i_size = nbd->bytesize; + set_blocksize(bdev, nbd->blksize); + set_capacity(nbd->disk, nbd->bytesize >> 9); return 0; case NBD_SET_TIMEOUT: - lo->xmit_timeout = arg * HZ; + nbd->xmit_timeout = arg * HZ; return 0; case NBD_SET_SIZE_BLOCKS: - lo->bytesize = ((u64) arg) * lo->blksize; - bdev->bd_inode->i_size = lo->bytesize; - set_blocksize(bdev, lo->blksize); - set_capacity(lo->disk, lo->bytesize >> 9); + nbd->bytesize = ((u64) arg) * 
nbd->blksize; + bdev->bd_inode->i_size = nbd->bytesize; + set_blocksize(bdev, nbd->blksize); + set_capacity(nbd->disk, nbd->bytesize >> 9); return 0; case NBD_DO_IT: { @@ -650,38 +650,38 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, struct file *file; int error; - if (lo->pid) + if (nbd->pid) return -EBUSY; - if (!lo->file) + if (!nbd->file) return -EINVAL; - mutex_unlock(&lo->tx_lock); + mutex_unlock(&nbd->tx_lock); - thread = kthread_create(nbd_thread, lo, lo->disk->disk_name); + thread = kthread_create(nbd_thread, nbd, nbd->disk->disk_name); if (IS_ERR(thread)) { - mutex_lock(&lo->tx_lock); + mutex_lock(&nbd->tx_lock); return PTR_ERR(thread); } wake_up_process(thread); - error = nbd_do_it(lo); + error = nbd_do_it(nbd); kthread_stop(thread); - mutex_lock(&lo->tx_lock); + mutex_lock(&nbd->tx_lock); if (error) return error; - sock_shutdown(lo, 0); - file = lo->file; - lo->file = NULL; - nbd_clear_que(lo); - dev_warn(disk_to_dev(lo->disk), "queue cleared\n"); + sock_shutdown(nbd, 0); + file = nbd->file; + nbd->file = NULL; + nbd_clear_que(nbd); + dev_warn(disk_to_dev(nbd->disk), "queue cleared\n"); if (file) fput(file); - lo->bytesize = 0; + nbd->bytesize = 0; bdev->bd_inode->i_size = 0; - set_capacity(lo->disk, 0); + set_capacity(nbd->disk, 0); if (max_part > 0) ioctl_by_bdev(bdev, BLKRRPART, 0); - return lo->harderror; + return nbd->harderror; } case NBD_CLEAR_QUE: @@ -689,14 +689,14 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, * This is for compatibility only. The queue is always cleared * by NBD_DO_IT or NBD_CLEAR_SOCK. 
*/ - BUG_ON(!lo->sock && !list_empty(&lo->queue_head)); + BUG_ON(!nbd->sock && !list_empty(&nbd->queue_head)); return 0; case NBD_PRINT_DEBUG: - dev_info(disk_to_dev(lo->disk), + dev_info(disk_to_dev(nbd->disk), "next = %p, prev = %p, head = %p\n", - lo->queue_head.next, lo->queue_head.prev, - &lo->queue_head); + nbd->queue_head.next, nbd->queue_head.prev, + &nbd->queue_head); return 0; } return -ENOTTY; @@ -705,21 +705,21 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, static int nbd_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - struct nbd_device *lo = bdev->bd_disk->private_data; + struct nbd_device *nbd = bdev->bd_disk->private_data; int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - BUG_ON(lo->magic != LO_MAGIC); + BUG_ON(nbd->magic != NBD_MAGIC); /* Anyone capable of this syscall can do *real bad* things */ dprintk(DBG_IOCTL, "%s: nbd_ioctl cmd=%s(0x%x) arg=%lu\n", - lo->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg); + nbd->disk->disk_name, ioctl_cmd_to_ascii(cmd), cmd, arg); - mutex_lock(&lo->tx_lock); - error = __nbd_ioctl(bdev, lo, cmd, arg); - mutex_unlock(&lo->tx_lock); + mutex_lock(&nbd->tx_lock); + error = __nbd_ioctl(bdev, nbd, cmd, arg); + mutex_unlock(&nbd->tx_lock); return error; } @@ -805,7 +805,7 @@ static int __init nbd_init(void) for (i = 0; i < nbds_max; i++) { struct gendisk *disk = nbd_dev[i].disk; nbd_dev[i].file = NULL; - nbd_dev[i].magic = LO_MAGIC; + nbd_dev[i].magic = NBD_MAGIC; nbd_dev[i].flags = 0; INIT_LIST_HEAD(&nbd_dev[i].waiting_queue); spin_lock_init(&nbd_dev[i].queue_lock); diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c index 1f3c1a7d132a..38a2d0631882 100644 --- a/drivers/block/nvme.c +++ b/drivers/block/nvme.c @@ -39,7 +39,6 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/types.h> -#include <linux/version.h> #include <asm-generic/io-64-nonatomic-lo-hi.h> diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c 
index d59edeabd93f..ba66e4445f41 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -987,14 +987,14 @@ static void pkt_copy_bio_data(struct bio *src_bio, int seg, int offs, struct pag while (copy_size > 0) { struct bio_vec *src_bvl = bio_iovec_idx(src_bio, seg); - void *vfrom = kmap_atomic(src_bvl->bv_page, KM_USER0) + + void *vfrom = kmap_atomic(src_bvl->bv_page) + src_bvl->bv_offset + offs; void *vto = page_address(dst_page) + dst_offs; int len = min_t(int, copy_size, src_bvl->bv_len - offs); BUG_ON(len < 0); memcpy(vto, vfrom, len); - kunmap_atomic(vfrom, KM_USER0); + kunmap_atomic(vfrom); seg++; offs = 0; @@ -1019,10 +1019,10 @@ static void pkt_make_local_copy(struct packet_data *pkt, struct bio_vec *bvec) offs = 0; for (f = 0; f < pkt->frames; f++) { if (bvec[f].bv_page != pkt->pages[p]) { - void *vfrom = kmap_atomic(bvec[f].bv_page, KM_USER0) + bvec[f].bv_offset; + void *vfrom = kmap_atomic(bvec[f].bv_page) + bvec[f].bv_offset; void *vto = page_address(pkt->pages[p]) + offs; memcpy(vto, vfrom, CD_FRAMESIZE); - kunmap_atomic(vfrom, KM_USER0); + kunmap_atomic(vfrom); bvec[f].bv_page = pkt->pages[p]; bvec[f].bv_offset = offs; } else { diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 48e8fee9f2d4..9dcf76a10bb6 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -839,10 +839,7 @@ static struct vio_driver vdc_port_driver = { .id_table = vdc_port_match, .probe = vdc_port_probe, .remove = vdc_port_remove, - .driver = { - .name = "vdc_port", - .owner = THIS_MODULE, - } + .name = "vdc_port", }; static int __init vdc_init(void) diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 7333b9e44411..fcec0225ac76 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -119,43 +119,6 @@ /* */ - -/* command block wrapper */ -struct bulk_cb_wrap { - __le32 Signature; /* contains 'USBC' */ - u32 Tag; /* unique per command id */ - __le32 DataTransferLength; /* size of data */ - u8 Flags; /* direction in bit 0 */ - u8 
Lun; /* LUN */ - u8 Length; /* of of the CDB */ - u8 CDB[UB_MAX_CDB_SIZE]; /* max command */ -}; - -#define US_BULK_CB_WRAP_LEN 31 -#define US_BULK_CB_SIGN 0x43425355 /*spells out USBC */ -#define US_BULK_FLAG_IN 1 -#define US_BULK_FLAG_OUT 0 - -/* command status wrapper */ -struct bulk_cs_wrap { - __le32 Signature; /* should = 'USBS' */ - u32 Tag; /* same as original command */ - __le32 Residue; /* amount not transferred */ - u8 Status; /* see below */ -}; - -#define US_BULK_CS_WRAP_LEN 13 -#define US_BULK_CS_SIGN 0x53425355 /* spells out 'USBS' */ -#define US_BULK_STAT_OK 0 -#define US_BULK_STAT_FAIL 1 -#define US_BULK_STAT_PHASE 2 - -/* bulk-only class specific requests */ -#define US_BULK_RESET_REQUEST 0xff -#define US_BULK_GET_MAX_LUN 0xfe - -/* - */ struct ub_dev; #define UB_MAX_REQ_SG 9 /* cdrecord requires 32KB and maybe a header */ @@ -2477,6 +2440,8 @@ static int __init ub_init(void) int rc; int i; + pr_info("'Low Performance USB Block' driver is deprecated. " + "Please switch to usb-storage\n"); for (i = 0; i < UB_QLOCK_NUM; i++) spin_lock_init(&ub_qlockv[i]); diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c deleted file mode 100644 index 9a5b2a2d616d..000000000000 --- a/drivers/block/viodasd.c +++ /dev/null @@ -1,809 +0,0 @@ -/* -*- linux-c -*- - * viodasd.c - * Authors: Dave Boutcher <boutcher@us.ibm.com> - * Ryan Arnold <ryanarn@us.ibm.com> - * Colin Devilbiss <devilbis@us.ibm.com> - * Stephen Rothwell - * - * (C) Copyright 2000-2004 IBM Corporation - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation; either version 2 of the - * License, or (at your option) any later version. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to the Free Software - * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - * - * This routine provides access to disk space (termed "DASD" in historical - * IBM terms) owned and managed by an OS/400 partition running on the - * same box as this Linux partition. - * - * All disk operations are performed by sending messages back and forth to - * the OS/400 partition. - */ - -#define pr_fmt(fmt) "viod: " fmt - -#include <linux/major.h> -#include <linux/fs.h> -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/blkdev.h> -#include <linux/genhd.h> -#include <linux/hdreg.h> -#include <linux/errno.h> -#include <linux/init.h> -#include <linux/string.h> -#include <linux/mutex.h> -#include <linux/dma-mapping.h> -#include <linux/completion.h> -#include <linux/device.h> -#include <linux/scatterlist.h> - -#include <asm/uaccess.h> -#include <asm/vio.h> -#include <asm/iseries/hv_types.h> -#include <asm/iseries/hv_lp_event.h> -#include <asm/iseries/hv_lp_config.h> -#include <asm/iseries/vio.h> -#include <asm/firmware.h> - -MODULE_DESCRIPTION("iSeries Virtual DASD"); -MODULE_AUTHOR("Dave Boutcher"); -MODULE_LICENSE("GPL"); - -/* - * We only support 7 partitions per physical disk....so with minor - * numbers 0-255 we get a maximum of 32 disks. 
- */ -#define VIOD_GENHD_NAME "iseries/vd" - -#define VIOD_VERS "1.64" - -enum { - PARTITION_SHIFT = 3, - MAX_DISKNO = HVMAXARCHITECTEDVIRTUALDISKS, - MAX_DISK_NAME = FIELD_SIZEOF(struct gendisk, disk_name) -}; - -static DEFINE_MUTEX(viodasd_mutex); -static DEFINE_SPINLOCK(viodasd_spinlock); - -#define VIOMAXREQ 16 - -#define DEVICE_NO(cell) ((struct viodasd_device *)(cell) - &viodasd_devices[0]) - -struct viodasd_waitevent { - struct completion com; - int rc; - u16 sub_result; - int max_disk; /* open */ -}; - -static const struct vio_error_entry viodasd_err_table[] = { - { 0x0201, EINVAL, "Invalid Range" }, - { 0x0202, EINVAL, "Invalid Token" }, - { 0x0203, EIO, "DMA Error" }, - { 0x0204, EIO, "Use Error" }, - { 0x0205, EIO, "Release Error" }, - { 0x0206, EINVAL, "Invalid Disk" }, - { 0x0207, EBUSY, "Can't Lock" }, - { 0x0208, EIO, "Already Locked" }, - { 0x0209, EIO, "Already Unlocked" }, - { 0x020A, EIO, "Invalid Arg" }, - { 0x020B, EIO, "Bad IFS File" }, - { 0x020C, EROFS, "Read Only Device" }, - { 0x02FF, EIO, "Internal Error" }, - { 0x0000, 0, NULL }, -}; - -/* - * Figure out the biggest I/O request (in sectors) we can accept - */ -#define VIODASD_MAXSECTORS (4096 / 512 * VIOMAXBLOCKDMA) - -/* - * Number of disk I/O requests we've sent to OS/400 - */ -static int num_req_outstanding; - -/* - * This is our internal structure for keeping track of disk devices - */ -struct viodasd_device { - u16 cylinders; - u16 tracks; - u16 sectors; - u16 bytes_per_sector; - u64 size; - int read_only; - spinlock_t q_lock; - struct gendisk *disk; - struct device *dev; -} viodasd_devices[MAX_DISKNO]; - -/* - * External open entry point. 
- */ -static int viodasd_open(struct block_device *bdev, fmode_t mode) -{ - struct viodasd_device *d = bdev->bd_disk->private_data; - HvLpEvent_Rc hvrc; - struct viodasd_waitevent we; - u16 flags = 0; - - if (d->read_only) { - if (mode & FMODE_WRITE) - return -EROFS; - flags = vioblockflags_ro; - } - - init_completion(&we.com); - - /* Send the open event to OS/400 */ - hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, - HvLpEvent_Type_VirtualIo, - viomajorsubtype_blockio | vioblockopen, - HvLpEvent_AckInd_DoAck, HvLpEvent_AckType_ImmediateAck, - viopath_sourceinst(viopath_hostLp), - viopath_targetinst(viopath_hostLp), - (u64)(unsigned long)&we, VIOVERSION << 16, - ((u64)DEVICE_NO(d) << 48) | ((u64)flags << 32), - 0, 0, 0); - if (hvrc != 0) { - pr_warning("HV open failed %d\n", (int)hvrc); - return -EIO; - } - - wait_for_completion(&we.com); - - /* Check the return code */ - if (we.rc != 0) { - const struct vio_error_entry *err = - vio_lookup_rc(viodasd_err_table, we.sub_result); - - pr_warning("bad rc opening disk: %d:0x%04x (%s)\n", - (int)we.rc, we.sub_result, err->msg); - return -EIO; - } - - return 0; -} - -static int viodasd_unlocked_open(struct block_device *bdev, fmode_t mode) -{ - int ret; - - mutex_lock(&viodasd_mutex); - ret = viodasd_open(bdev, mode); - mutex_unlock(&viodasd_mutex); - - return ret; -} - - -/* - * External release entry point. - */ -static int viodasd_release(struct gendisk *disk, fmode_t mode) -{ - struct viodasd_device *d = disk->private_data; - HvLpEvent_Rc hvrc; - - mutex_lock(&viodasd_mutex); - /* Send the event to OS/400. 
We DON'T expect a response */ - hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, - HvLpEvent_Type_VirtualIo, - viomajorsubtype_blockio | vioblockclose, - HvLpEvent_AckInd_NoAck, HvLpEvent_AckType_ImmediateAck, - viopath_sourceinst(viopath_hostLp), - viopath_targetinst(viopath_hostLp), - 0, VIOVERSION << 16, - ((u64)DEVICE_NO(d) << 48) /* | ((u64)flags << 32) */, - 0, 0, 0); - if (hvrc != 0) - pr_warning("HV close call failed %d\n", (int)hvrc); - - mutex_unlock(&viodasd_mutex); - - return 0; -} - - -/* External ioctl entry point. - */ -static int viodasd_getgeo(struct block_device *bdev, struct hd_geometry *geo) -{ - struct gendisk *disk = bdev->bd_disk; - struct viodasd_device *d = disk->private_data; - - geo->sectors = d->sectors ? d->sectors : 32; - geo->heads = d->tracks ? d->tracks : 64; - geo->cylinders = d->cylinders ? d->cylinders : - get_capacity(disk) / (geo->sectors * geo->heads); - - return 0; -} - -/* - * Our file operations table - */ -static const struct block_device_operations viodasd_fops = { - .owner = THIS_MODULE, - .open = viodasd_unlocked_open, - .release = viodasd_release, - .getgeo = viodasd_getgeo, -}; - -/* - * End a request - */ -static void viodasd_end_request(struct request *req, int error, - int num_sectors) -{ - __blk_end_request(req, error, num_sectors << 9); -} - -/* - * Send an actual I/O request to OS/400 - */ -static int send_request(struct request *req) -{ - u64 start; - int direction; - int nsg; - u16 viocmd; - HvLpEvent_Rc hvrc; - struct vioblocklpevent *bevent; - struct HvLpEvent *hev; - struct scatterlist sg[VIOMAXBLOCKDMA]; - int sgindex; - struct viodasd_device *d; - unsigned long flags; - - start = (u64)blk_rq_pos(req) << 9; - - if (rq_data_dir(req) == READ) { - direction = DMA_FROM_DEVICE; - viocmd = viomajorsubtype_blockio | vioblockread; - } else { - direction = DMA_TO_DEVICE; - viocmd = viomajorsubtype_blockio | vioblockwrite; - } - - d = req->rq_disk->private_data; - - /* Now build the scatter-gather list */ - 
sg_init_table(sg, VIOMAXBLOCKDMA); - nsg = blk_rq_map_sg(req->q, req, sg); - nsg = dma_map_sg(d->dev, sg, nsg, direction); - - spin_lock_irqsave(&viodasd_spinlock, flags); - num_req_outstanding++; - - /* This optimization handles a single DMA block */ - if (nsg == 1) - hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, - HvLpEvent_Type_VirtualIo, viocmd, - HvLpEvent_AckInd_DoAck, - HvLpEvent_AckType_ImmediateAck, - viopath_sourceinst(viopath_hostLp), - viopath_targetinst(viopath_hostLp), - (u64)(unsigned long)req, VIOVERSION << 16, - ((u64)DEVICE_NO(d) << 48), start, - ((u64)sg_dma_address(&sg[0])) << 32, - sg_dma_len(&sg[0])); - else { - bevent = (struct vioblocklpevent *) - vio_get_event_buffer(viomajorsubtype_blockio); - if (bevent == NULL) { - pr_warning("error allocating disk event buffer\n"); - goto error_ret; - } - - /* - * Now build up the actual request. Note that we store - * the pointer to the request in the correlation - * token so we can match the response up later - */ - memset(bevent, 0, sizeof(struct vioblocklpevent)); - hev = &bevent->event; - hev->flags = HV_LP_EVENT_VALID | HV_LP_EVENT_DO_ACK | - HV_LP_EVENT_INT; - hev->xType = HvLpEvent_Type_VirtualIo; - hev->xSubtype = viocmd; - hev->xSourceLp = HvLpConfig_getLpIndex(); - hev->xTargetLp = viopath_hostLp; - hev->xSizeMinus1 = - offsetof(struct vioblocklpevent, u.rw_data.dma_info) + - (sizeof(bevent->u.rw_data.dma_info[0]) * nsg) - 1; - hev->xSourceInstanceId = viopath_sourceinst(viopath_hostLp); - hev->xTargetInstanceId = viopath_targetinst(viopath_hostLp); - hev->xCorrelationToken = (u64)req; - bevent->version = VIOVERSION; - bevent->disk = DEVICE_NO(d); - bevent->u.rw_data.offset = start; - - /* - * Copy just the dma information from the sg list - * into the request - */ - for (sgindex = 0; sgindex < nsg; sgindex++) { - bevent->u.rw_data.dma_info[sgindex].token = - sg_dma_address(&sg[sgindex]); - bevent->u.rw_data.dma_info[sgindex].len = - sg_dma_len(&sg[sgindex]); - } - - /* Send the 
request */ - hvrc = HvCallEvent_signalLpEvent(&bevent->event); - vio_free_event_buffer(viomajorsubtype_blockio, bevent); - } - - if (hvrc != HvLpEvent_Rc_Good) { - pr_warning("error sending disk event to OS/400 (rc %d)\n", - (int)hvrc); - goto error_ret; - } - spin_unlock_irqrestore(&viodasd_spinlock, flags); - return 0; - -error_ret: - num_req_outstanding--; - spin_unlock_irqrestore(&viodasd_spinlock, flags); - dma_unmap_sg(d->dev, sg, nsg, direction); - return -1; -} - -/* - * This is the external request processing routine - */ -static void do_viodasd_request(struct request_queue *q) -{ - struct request *req; - - /* - * If we already have the maximum number of requests - * outstanding to OS/400 just bail out. We'll come - * back later. - */ - while (num_req_outstanding < VIOMAXREQ) { - req = blk_fetch_request(q); - if (req == NULL) - return; - /* check that request contains a valid command */ - if (req->cmd_type != REQ_TYPE_FS) { - viodasd_end_request(req, -EIO, blk_rq_sectors(req)); - continue; - } - /* Try sending the request */ - if (send_request(req) != 0) - viodasd_end_request(req, -EIO, blk_rq_sectors(req)); - } -} - -/* - * Probe a single disk and fill in the viodasd_device structure - * for it. 
- */ -static int probe_disk(struct viodasd_device *d) -{ - HvLpEvent_Rc hvrc; - struct viodasd_waitevent we; - int dev_no = DEVICE_NO(d); - struct gendisk *g; - struct request_queue *q; - u16 flags = 0; - -retry: - init_completion(&we.com); - - /* Send the open event to OS/400 */ - hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, - HvLpEvent_Type_VirtualIo, - viomajorsubtype_blockio | vioblockopen, - HvLpEvent_AckInd_DoAck, HvLpEvent_AckType_ImmediateAck, - viopath_sourceinst(viopath_hostLp), - viopath_targetinst(viopath_hostLp), - (u64)(unsigned long)&we, VIOVERSION << 16, - ((u64)dev_no << 48) | ((u64)flags<< 32), - 0, 0, 0); - if (hvrc != 0) { - pr_warning("bad rc on HV open %d\n", (int)hvrc); - return 0; - } - - wait_for_completion(&we.com); - - if (we.rc != 0) { - if (flags != 0) - return 0; - /* try again with read only flag set */ - flags = vioblockflags_ro; - goto retry; - } - if (we.max_disk > (MAX_DISKNO - 1)) { - printk_once(KERN_INFO pr_fmt("Only examining the first %d of %d disks connected\n"), - MAX_DISKNO, we.max_disk + 1); - } - - /* Send the close event to OS/400. We DON'T expect a response */ - hvrc = HvCallEvent_signalLpEventFast(viopath_hostLp, - HvLpEvent_Type_VirtualIo, - viomajorsubtype_blockio | vioblockclose, - HvLpEvent_AckInd_NoAck, HvLpEvent_AckType_ImmediateAck, - viopath_sourceinst(viopath_hostLp), - viopath_targetinst(viopath_hostLp), - 0, VIOVERSION << 16, - ((u64)dev_no << 48) | ((u64)flags << 32), - 0, 0, 0); - if (hvrc != 0) { - pr_warning("bad rc sending event to OS/400 %d\n", (int)hvrc); - return 0; - } - - if (d->dev == NULL) { - /* this is when we reprobe for new disks */ - if (vio_create_viodasd(dev_no) == NULL) { - pr_warning("cannot allocate virtual device for disk %d\n", - dev_no); - return 0; - } - /* - * The vio_create_viodasd will have recursed into this - * routine with d->dev set to the new vio device and - * will finish the setup of the disk below. 
- */ - return 1; - } - - /* create the request queue for the disk */ - spin_lock_init(&d->q_lock); - q = blk_init_queue(do_viodasd_request, &d->q_lock); - if (q == NULL) { - pr_warning("cannot allocate queue for disk %d\n", dev_no); - return 0; - } - g = alloc_disk(1 << PARTITION_SHIFT); - if (g == NULL) { - pr_warning("cannot allocate disk structure for disk %d\n", - dev_no); - blk_cleanup_queue(q); - return 0; - } - - d->disk = g; - blk_queue_max_segments(q, VIOMAXBLOCKDMA); - blk_queue_max_hw_sectors(q, VIODASD_MAXSECTORS); - g->major = VIODASD_MAJOR; - g->first_minor = dev_no << PARTITION_SHIFT; - if (dev_no >= 26) - snprintf(g->disk_name, sizeof(g->disk_name), - VIOD_GENHD_NAME "%c%c", - 'a' + (dev_no / 26) - 1, 'a' + (dev_no % 26)); - else - snprintf(g->disk_name, sizeof(g->disk_name), - VIOD_GENHD_NAME "%c", 'a' + (dev_no % 26)); - g->fops = &viodasd_fops; - g->queue = q; - g->private_data = d; - g->driverfs_dev = d->dev; - set_capacity(g, d->size >> 9); - - pr_info("disk %d: %lu sectors (%lu MB) CHS=%d/%d/%d sector size %d%s\n", - dev_no, (unsigned long)(d->size >> 9), - (unsigned long)(d->size >> 20), - (int)d->cylinders, (int)d->tracks, - (int)d->sectors, (int)d->bytes_per_sector, - d->read_only ? 
" (RO)" : ""); - - /* register us in the global list */ - add_disk(g); - return 1; -} - -/* returns the total number of scatterlist elements converted */ -static int block_event_to_scatterlist(const struct vioblocklpevent *bevent, - struct scatterlist *sg, int *total_len) -{ - int i, numsg; - const struct rw_data *rw_data = &bevent->u.rw_data; - static const int offset = - offsetof(struct vioblocklpevent, u.rw_data.dma_info); - static const int element_size = sizeof(rw_data->dma_info[0]); - - numsg = ((bevent->event.xSizeMinus1 + 1) - offset) / element_size; - if (numsg > VIOMAXBLOCKDMA) - numsg = VIOMAXBLOCKDMA; - - *total_len = 0; - sg_init_table(sg, VIOMAXBLOCKDMA); - for (i = 0; (i < numsg) && (rw_data->dma_info[i].len > 0); ++i) { - sg_dma_address(&sg[i]) = rw_data->dma_info[i].token; - sg_dma_len(&sg[i]) = rw_data->dma_info[i].len; - *total_len += rw_data->dma_info[i].len; - } - return i; -} - -/* - * Restart all queues, starting with the one _after_ the disk given, - * thus reducing the chance of starvation of higher numbered disks. - */ -static void viodasd_restart_all_queues_starting_from(int first_index) -{ - int i; - - for (i = first_index + 1; i < MAX_DISKNO; ++i) - if (viodasd_devices[i].disk) - blk_run_queue(viodasd_devices[i].disk->queue); - for (i = 0; i <= first_index; ++i) - if (viodasd_devices[i].disk) - blk_run_queue(viodasd_devices[i].disk->queue); -} - -/* - * For read and write requests, decrement the number of outstanding requests, - * Free the DMA buffers we allocated. 
- */ -static int viodasd_handle_read_write(struct vioblocklpevent *bevent) -{ - int num_sg, num_sect, pci_direction, total_len; - struct request *req; - struct scatterlist sg[VIOMAXBLOCKDMA]; - struct HvLpEvent *event = &bevent->event; - unsigned long irq_flags; - struct viodasd_device *d; - int error; - spinlock_t *qlock; - - num_sg = block_event_to_scatterlist(bevent, sg, &total_len); - num_sect = total_len >> 9; - if (event->xSubtype == (viomajorsubtype_blockio | vioblockread)) - pci_direction = DMA_FROM_DEVICE; - else - pci_direction = DMA_TO_DEVICE; - req = (struct request *)bevent->event.xCorrelationToken; - d = req->rq_disk->private_data; - - dma_unmap_sg(d->dev, sg, num_sg, pci_direction); - - /* - * Since this is running in interrupt mode, we need to make sure - * we're not stepping on any global I/O operations - */ - spin_lock_irqsave(&viodasd_spinlock, irq_flags); - num_req_outstanding--; - spin_unlock_irqrestore(&viodasd_spinlock, irq_flags); - - error = (event->xRc == HvLpEvent_Rc_Good) ? 0 : -EIO; - if (error) { - const struct vio_error_entry *err; - err = vio_lookup_rc(viodasd_err_table, bevent->sub_result); - pr_warning("read/write error %d:0x%04x (%s)\n", - event->xRc, bevent->sub_result, err->msg); - num_sect = blk_rq_sectors(req); - } - qlock = req->q->queue_lock; - spin_lock_irqsave(qlock, irq_flags); - viodasd_end_request(req, error, num_sect); - spin_unlock_irqrestore(qlock, irq_flags); - - /* Finally, try to get more requests off of this device's queue */ - viodasd_restart_all_queues_starting_from(DEVICE_NO(d)); - - return 0; -} - -/* This routine handles incoming block LP events */ -static void handle_block_event(struct HvLpEvent *event) -{ - struct vioblocklpevent *bevent = (struct vioblocklpevent *)event; - struct viodasd_waitevent *pwe; - - if (event == NULL) - /* Notification that a partition went away! */ - return; - /* First, we should NEVER get an int here...only acks */ - if (hvlpevent_is_int(event)) { - pr_warning("Yikes! 
got an int in viodasd event handler!\n"); - if (hvlpevent_need_ack(event)) { - event->xRc = HvLpEvent_Rc_InvalidSubtype; - HvCallEvent_ackLpEvent(event); - } - } - - switch (event->xSubtype & VIOMINOR_SUBTYPE_MASK) { - case vioblockopen: - /* - * Handle a response to an open request. We get all the - * disk information in the response, so update it. The - * correlation token contains a pointer to a waitevent - * structure that has a completion in it. update the - * return code in the waitevent structure and post the - * completion to wake up the guy who sent the request - */ - pwe = (struct viodasd_waitevent *)event->xCorrelationToken; - pwe->rc = event->xRc; - pwe->sub_result = bevent->sub_result; - if (event->xRc == HvLpEvent_Rc_Good) { - const struct open_data *data = &bevent->u.open_data; - struct viodasd_device *device = - &viodasd_devices[bevent->disk]; - device->read_only = - bevent->flags & vioblockflags_ro; - device->size = data->disk_size; - device->cylinders = data->cylinders; - device->tracks = data->tracks; - device->sectors = data->sectors; - device->bytes_per_sector = data->bytes_per_sector; - pwe->max_disk = data->max_disk; - } - complete(&pwe->com); - break; - case vioblockclose: - break; - case vioblockread: - case vioblockwrite: - viodasd_handle_read_write(bevent); - break; - - default: - pr_warning("invalid subtype!"); - if (hvlpevent_need_ack(event)) { - event->xRc = HvLpEvent_Rc_InvalidSubtype; - HvCallEvent_ackLpEvent(event); - } - } -} - -/* - * Get the driver to reprobe for more disks. 
- */ -static ssize_t probe_disks(struct device_driver *drv, const char *buf, - size_t count) -{ - struct viodasd_device *d; - - for (d = viodasd_devices; d < &viodasd_devices[MAX_DISKNO]; d++) { - if (d->disk == NULL) - probe_disk(d); - } - return count; -} -static DRIVER_ATTR(probe, S_IWUSR, NULL, probe_disks); - -static int viodasd_probe(struct vio_dev *vdev, const struct vio_device_id *id) -{ - struct viodasd_device *d = &viodasd_devices[vdev->unit_address]; - - d->dev = &vdev->dev; - if (!probe_disk(d)) - return -ENODEV; - return 0; -} - -static int viodasd_remove(struct vio_dev *vdev) -{ - struct viodasd_device *d; - - d = &viodasd_devices[vdev->unit_address]; - if (d->disk) { - del_gendisk(d->disk); - blk_cleanup_queue(d->disk->queue); - put_disk(d->disk); - d->disk = NULL; - } - d->dev = NULL; - return 0; -} - -/** - * viodasd_device_table: Used by vio.c to match devices that we - * support. - */ -static struct vio_device_id viodasd_device_table[] __devinitdata = { - { "block", "IBM,iSeries-viodasd" }, - { "", "" } -}; -MODULE_DEVICE_TABLE(vio, viodasd_device_table); - -static struct vio_driver viodasd_driver = { - .id_table = viodasd_device_table, - .probe = viodasd_probe, - .remove = viodasd_remove, - .driver = { - .name = "viodasd", - .owner = THIS_MODULE, - } -}; - -static int need_delete_probe; - -/* - * Initialize the whole device driver. 
Handle module and non-module - * versions - */ -static int __init viodasd_init(void) -{ - int rc; - - if (!firmware_has_feature(FW_FEATURE_ISERIES)) { - rc = -ENODEV; - goto early_fail; - } - - /* Try to open to our host lp */ - if (viopath_hostLp == HvLpIndexInvalid) - vio_set_hostlp(); - - if (viopath_hostLp == HvLpIndexInvalid) { - pr_warning("invalid hosting partition\n"); - rc = -EIO; - goto early_fail; - } - - pr_info("vers " VIOD_VERS ", hosting partition %d\n", viopath_hostLp); - - /* register the block device */ - rc = register_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME); - if (rc) { - pr_warning("Unable to get major number %d for %s\n", - VIODASD_MAJOR, VIOD_GENHD_NAME); - goto early_fail; - } - /* Actually open the path to the hosting partition */ - rc = viopath_open(viopath_hostLp, viomajorsubtype_blockio, - VIOMAXREQ + 2); - if (rc) { - pr_warning("error opening path to host partition %d\n", - viopath_hostLp); - goto unregister_blk; - } - - /* Initialize our request handler */ - vio_setHandler(viomajorsubtype_blockio, handle_block_event); - - rc = vio_register_driver(&viodasd_driver); - if (rc) { - pr_warning("vio_register_driver failed\n"); - goto unset_handler; - } - - /* - * If this call fails, it just means that we cannot dynamically - * add virtual disks, but the driver will still work fine for - * all existing disk, so ignore the failure. 
- */ - if (!driver_create_file(&viodasd_driver.driver, &driver_attr_probe)) - need_delete_probe = 1; - - return 0; - -unset_handler: - vio_clearHandler(viomajorsubtype_blockio); - viopath_close(viopath_hostLp, viomajorsubtype_blockio, VIOMAXREQ + 2); -unregister_blk: - unregister_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME); -early_fail: - return rc; -} -module_init(viodasd_init); - -void __exit viodasd_exit(void) -{ - if (need_delete_probe) - driver_remove_file(&viodasd_driver.driver, &driver_attr_probe); - vio_unregister_driver(&viodasd_driver); - vio_clearHandler(viomajorsubtype_blockio); - viopath_close(viopath_hostLp, viomajorsubtype_blockio, VIOMAXREQ + 2); - unregister_blkdev(VIODASD_MAJOR, VIOD_GENHD_NAME); -} -module_exit(viodasd_exit); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index c4a60badf252..693187df7601 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -29,9 +29,6 @@ struct virtio_blk /* The disk structure for the kernel. */ struct gendisk *disk; - /* Request tracking. */ - struct list_head reqs; - mempool_t *pool; /* Process context for config space updates */ @@ -55,7 +52,6 @@ struct virtio_blk struct virtblk_req { - struct list_head list; struct request *req; struct virtio_blk_outhdr out_hdr; struct virtio_scsi_inhdr in_hdr; @@ -99,7 +95,6 @@ static void blk_done(struct virtqueue *vq) } __blk_end_request_all(vbr->req, error); - list_del(&vbr->list); mempool_free(vbr, vblk->pool); } /* In case queue is stopped waiting for more buffers. 
*/ @@ -184,7 +179,6 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, return false; } - list_add_tail(&vbr->list, &vblk->reqs); return true; } @@ -351,6 +345,7 @@ static void virtblk_config_changed_work(struct work_struct *work) cap_str_10, cap_str_2); set_capacity(vblk->disk, capacity); + revalidate_disk(vblk->disk); done: mutex_unlock(&vblk->config_lock); } @@ -374,6 +369,34 @@ static int init_vq(struct virtio_blk *vblk) return err; } +/* + * Legacy naming scheme used for virtio devices. We are stuck with it for + * virtio blk but don't ever use it for any new driver. + */ +static int virtblk_name_format(char *prefix, int index, char *buf, int buflen) +{ + const int base = 'z' - 'a' + 1; + char *begin = buf + strlen(prefix); + char *end = buf + buflen; + char *p; + int unit; + + p = end - 1; + *p = '\0'; + unit = base; + do { + if (p == begin) + return -EINVAL; + *--p = 'a' + (index % unit); + index = (index / unit) - 1; + } while (index >= 0); + + memmove(begin, p, end - p); + memcpy(buf, prefix, strlen(prefix)); + + return 0; +} + static int __devinit virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; @@ -408,7 +431,6 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) goto out_free_index; } - INIT_LIST_HEAD(&vblk->reqs); spin_lock_init(&vblk->lock); vblk->vdev = vdev; vblk->sg_elems = sg_elems; @@ -442,18 +464,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) q->queuedata = vblk; - if (index < 26) { - sprintf(vblk->disk->disk_name, "vd%c", 'a' + index % 26); - } else if (index < (26 + 1) * 26) { - sprintf(vblk->disk->disk_name, "vd%c%c", - 'a' + index / 26 - 1, 'a' + index % 26); - } else { - const unsigned int m1 = (index / 26 - 1) / 26 - 1; - const unsigned int m2 = (index / 26 - 1) % 26; - const unsigned int m3 = index % 26; - sprintf(vblk->disk->disk_name, "vd%c%c%c", - 'a' + m1, 'a' + m2, 'a' + m3); - } + virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN); 
vblk->disk->major = major; vblk->disk->first_minor = index_to_minor(index); @@ -565,21 +576,29 @@ static void __devexit virtblk_remove(struct virtio_device *vdev) { struct virtio_blk *vblk = vdev->priv; int index = vblk->index; + struct virtblk_req *vbr; + unsigned long flags; /* Prevent config work handler from accessing the device. */ mutex_lock(&vblk->config_lock); vblk->config_enable = false; mutex_unlock(&vblk->config_lock); - /* Nothing should be pending. */ - BUG_ON(!list_empty(&vblk->reqs)); - /* Stop all the virtqueues. */ vdev->config->reset(vdev); flush_work(&vblk->config_work); del_gendisk(vblk->disk); + + /* Abort requests dispatched to driver. */ + spin_lock_irqsave(&vblk->lock, flags); + while ((vbr = virtqueue_detach_unused_buf(vblk->vq))) { + __blk_end_request_all(vbr->req, -EIO); + mempool_free(vbr, vblk->pool); + } + spin_unlock_irqrestore(&vblk->lock, flags); + blk_cleanup_queue(vblk->disk->queue); put_disk(vblk->disk); mempool_destroy(vblk->pool); diff --git a/drivers/block/xd.c b/drivers/block/xd.c index 51a972704db5..ff540520bada 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c @@ -52,7 +52,6 @@ #include <linux/io.h> #include <linux/gfp.h> -#include <asm/system.h> #include <asm/uaccess.h> #include <asm/dma.h> diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 0088bf60f368..73f196ca713f 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -321,6 +321,7 @@ struct seg_buf { static void xen_blkbk_unmap(struct pending_req *req) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; + struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; unsigned int i, invcount = 0; grant_handle_t handle; int ret; @@ -332,25 +333,12 @@ static void xen_blkbk_unmap(struct pending_req *req) gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i), GNTMAP_host_map, handle); pending_handle(req, i) = BLKBACK_INVALID_HANDLE; + pages[invcount] = virt_to_page(vaddr(req, i)); 
invcount++; } - ret = HYPERVISOR_grant_table_op( - GNTTABOP_unmap_grant_ref, unmap, invcount); + ret = gnttab_unmap_refs(unmap, pages, invcount, false); BUG_ON(ret); - /* - * Note, we use invcount, so nr->pages, so we can't index - * using vaddr(req, i). - */ - for (i = 0; i < invcount; i++) { - ret = m2p_remove_override( - virt_to_page(unmap[i].host_addr), false); - if (ret) { - pr_alert(DRV_PFX "Failed to remove M2P override for %lx\n", - (unsigned long)unmap[i].host_addr); - continue; - } - } } static int xen_blkbk_map(struct blkif_request *req, @@ -378,7 +366,7 @@ static int xen_blkbk_map(struct blkif_request *req, pending_req->blkif->domid); } - ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); + ret = gnttab_map_refs(map, NULL, &blkbk->pending_page(pending_req, 0), nseg); BUG_ON(ret); /* @@ -398,15 +386,6 @@ static int xen_blkbk_map(struct blkif_request *req, if (ret) continue; - ret = m2p_add_override(PFN_DOWN(map[i].dev_bus_addr), - blkbk->pending_page(pending_req, i), NULL); - if (ret) { - pr_alert(DRV_PFX "Failed to install M2P override for %lx (ret: %d)\n", - (unsigned long)map[i].dev_bus_addr, ret); - /* We could switch over to GNTTABOP_copy */ - continue; - } - seg[i].buf = map[i].dev_bus_addr | (req->u.rw.seg[i].first_sect << 9); } @@ -419,21 +398,18 @@ static int dispatch_discard_io(struct xen_blkif *blkif, int err = 0; int status = BLKIF_RSP_OKAY; struct block_device *bdev = blkif->vbd.bdev; + unsigned long secure; blkif->st_ds_req++; xen_blkif_get(blkif); - if (blkif->blk_backend_type == BLKIF_BACKEND_PHY || - blkif->blk_backend_type == BLKIF_BACKEND_FILE) { - unsigned long secure = (blkif->vbd.discard_secure && - (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? - BLKDEV_DISCARD_SECURE : 0; - err = blkdev_issue_discard(bdev, - req->u.discard.sector_number, - req->u.discard.nr_sectors, - GFP_KERNEL, secure); - } else - err = -EOPNOTSUPP; + secure = (blkif->vbd.discard_secure && + (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? 
+ BLKDEV_DISCARD_SECURE : 0; + + err = blkdev_issue_discard(bdev, req->u.discard.sector_number, + req->u.discard.nr_sectors, + GFP_KERNEL, secure); if (err == -EOPNOTSUPP) { pr_debug(DRV_PFX "discard op failed, not supported\n"); @@ -830,7 +806,7 @@ static int __init xen_blkif_init(void) int i, mmap_pages; int rc = 0; - if (!xen_pv_domain()) + if (!xen_domain()) return -ENODEV; blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL); diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index d0ee7edc9be8..773cf27dc23f 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -146,11 +146,6 @@ enum blkif_protocol { BLKIF_PROTOCOL_X86_64 = 3, }; -enum blkif_backend_type { - BLKIF_BACKEND_PHY = 1, - BLKIF_BACKEND_FILE = 2, -}; - struct xen_vbd { /* What the domain refers to this vbd as. */ blkif_vdev_t handle; @@ -177,7 +172,6 @@ struct xen_blkif { unsigned int irq; /* Comms information. */ enum blkif_protocol blk_protocol; - enum blkif_backend_type blk_backend_type; union blkif_back_rings blk_rings; void *blk_ring; /* The VBD attached to this interface. 
*/ diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index 24a2fb57e5d0..4f66171c6683 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -381,72 +381,49 @@ int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, err = xenbus_printf(xbt, dev->nodename, "feature-flush-cache", "%d", state); if (err) - xenbus_dev_fatal(dev, err, "writing feature-flush-cache"); + dev_warn(&dev->dev, "writing feature-flush-cache (%d)", err); return err; } -int xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be) +static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be) { struct xenbus_device *dev = be->dev; struct xen_blkif *blkif = be->blkif; - char *type; int err; int state = 0; + struct block_device *bdev = be->blkif->vbd.bdev; + struct request_queue *q = bdev_get_queue(bdev); - type = xenbus_read(XBT_NIL, dev->nodename, "type", NULL); - if (!IS_ERR(type)) { - if (strncmp(type, "file", 4) == 0) { - state = 1; - blkif->blk_backend_type = BLKIF_BACKEND_FILE; + if (blk_queue_discard(q)) { + err = xenbus_printf(xbt, dev->nodename, + "discard-granularity", "%u", + q->limits.discard_granularity); + if (err) { + dev_warn(&dev->dev, "writing discard-granularity (%d)", err); + return; } - if (strncmp(type, "phy", 3) == 0) { - struct block_device *bdev = be->blkif->vbd.bdev; - struct request_queue *q = bdev_get_queue(bdev); - if (blk_queue_discard(q)) { - err = xenbus_printf(xbt, dev->nodename, - "discard-granularity", "%u", - q->limits.discard_granularity); - if (err) { - xenbus_dev_fatal(dev, err, - "writing discard-granularity"); - goto kfree; - } - err = xenbus_printf(xbt, dev->nodename, - "discard-alignment", "%u", - q->limits.discard_alignment); - if (err) { - xenbus_dev_fatal(dev, err, - "writing discard-alignment"); - goto kfree; - } - state = 1; - blkif->blk_backend_type = BLKIF_BACKEND_PHY; - } - /* Optional. 
*/ - err = xenbus_printf(xbt, dev->nodename, - "discard-secure", "%d", - blkif->vbd.discard_secure); - if (err) { - xenbus_dev_fatal(dev, err, - "writting discard-secure"); - goto kfree; - } + err = xenbus_printf(xbt, dev->nodename, + "discard-alignment", "%u", + q->limits.discard_alignment); + if (err) { + dev_warn(&dev->dev, "writing discard-alignment (%d)", err); + return; + } + state = 1; + /* Optional. */ + err = xenbus_printf(xbt, dev->nodename, + "discard-secure", "%d", + blkif->vbd.discard_secure); + if (err) { + dev_warn(&dev->dev, "writing discard-secure (%d)", err); + return; } - } else { - err = PTR_ERR(type); - xenbus_dev_fatal(dev, err, "reading type"); - goto out; } - err = xenbus_printf(xbt, dev->nodename, "feature-discard", "%d", state); if (err) - xenbus_dev_fatal(dev, err, "writing feature-discard"); -kfree: - kfree(type); -out: - return err; + dev_warn(&dev->dev, "writing feature-discard (%d)", err); } int xen_blkbk_barrier(struct xenbus_transaction xbt, struct backend_info *be, int state) @@ -457,7 +434,7 @@ int xen_blkbk_barrier(struct xenbus_transaction xbt, err = xenbus_printf(xbt, dev->nodename, "feature-barrier", "%d", state); if (err) - xenbus_dev_fatal(dev, err, "writing feature-barrier"); + dev_warn(&dev->dev, "writing feature-barrier (%d)", err); return err; } @@ -689,14 +666,12 @@ again: return; } - err = xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support); - if (err) - goto abort; + /* If we can't advertise it is OK. */ + xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support); - err = xen_blkbk_discard(xbt, be); + xen_blkbk_discard(xbt, be); - /* If we can't advertise it is OK. 
*/ - err = xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support); + xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support); err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu", (unsigned long long)vbd_sz(&be->blkif->vbd)); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2f22874c0a37..60eed4bdd2e4 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -43,6 +43,7 @@ #include <linux/slab.h> #include <linux/mutex.h> #include <linux/scatterlist.h> +#include <linux/bitmap.h> #include <xen/xen.h> #include <xen/xenbus.h> @@ -81,6 +82,7 @@ static const struct block_device_operations xlvbd_block_fops; */ struct blkfront_info { + spinlock_t io_lock; struct mutex mutex; struct xenbus_device *xbdev; struct gendisk *gd; @@ -105,8 +107,6 @@ struct blkfront_info int is_ready; }; -static DEFINE_SPINLOCK(blkif_io_lock); - static unsigned int nr_minors; static unsigned long *minors; static DEFINE_SPINLOCK(minor_lock); @@ -177,8 +177,7 @@ static int xlbd_reserve_minors(unsigned int minor, unsigned int nr) spin_lock(&minor_lock); if (find_next_bit(minors, end, minor) >= end) { - for (; minor < end; ++minor) - __set_bit(minor, minors); + bitmap_set(minors, minor, nr); rc = 0; } else rc = -EBUSY; @@ -193,8 +192,7 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr) BUG_ON(end > nr_minors); spin_lock(&minor_lock); - for (; minor < end; ++minor) - __clear_bit(minor, minors); + bitmap_clear(minors, minor, nr); spin_unlock(&minor_lock); } @@ -419,7 +417,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) struct request_queue *rq; struct blkfront_info *info = gd->private_data; - rq = blk_init_queue(do_blkif_request, &blkif_io_lock); + rq = blk_init_queue(do_blkif_request, &info->io_lock); if (rq == NULL) return -1; @@ -528,6 +526,14 @@ static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) return 0; } +static char *encode_disk_name(char *ptr, unsigned int n) +{ + 
if (n >= 26) + ptr = encode_disk_name(ptr, n / 26 - 1); + *ptr = 'a' + n % 26; + return ptr + 1; +} + static int xlvbd_alloc_gendisk(blkif_sector_t capacity, struct blkfront_info *info, u16 vdisk_info, u16 sector_size) @@ -538,6 +544,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, unsigned int offset; int minor; int nr_parts; + char *ptr; BUG_ON(info->gd != NULL); BUG_ON(info->rq != NULL); @@ -562,7 +569,11 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, "emulated IDE disks,\n\t choose an xvd device name" "from xvde on\n", info->vdevice); } - err = -ENODEV; + if (minor >> MINORBITS) { + pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n", + info->vdevice, minor); + return -ENODEV; + } if ((minor % nr_parts) == 0) nr_minors = nr_parts; @@ -576,23 +587,14 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, if (gd == NULL) goto release; - if (nr_minors > 1) { - if (offset < 26) - sprintf(gd->disk_name, "%s%c", DEV_NAME, 'a' + offset); - else - sprintf(gd->disk_name, "%s%c%c", DEV_NAME, - 'a' + ((offset / 26)-1), 'a' + (offset % 26)); - } else { - if (offset < 26) - sprintf(gd->disk_name, "%s%c%d", DEV_NAME, - 'a' + offset, - minor & (nr_parts - 1)); - else - sprintf(gd->disk_name, "%s%c%c%d", DEV_NAME, - 'a' + ((offset / 26) - 1), - 'a' + (offset % 26), - minor & (nr_parts - 1)); - } + strcpy(gd->disk_name, DEV_NAME); + ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset); + BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN); + if (nr_minors > 1) + *ptr = 0; + else + snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr, + "%d", minor & (nr_parts - 1)); gd->major = XENVBD_MAJOR; gd->first_minor = minor; @@ -636,14 +638,14 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) if (info->rq == NULL) return; - spin_lock_irqsave(&blkif_io_lock, flags); + spin_lock_irqsave(&info->io_lock, flags); /* No more blkif_request(). */ blk_stop_queue(info->rq); /* No more gnttab callback work. 
*/ gnttab_cancel_free_callback(&info->callback); - spin_unlock_irqrestore(&blkif_io_lock, flags); + spin_unlock_irqrestore(&info->io_lock, flags); /* Flush gnttab callback work. Must be done with no locks held. */ flush_work_sync(&info->work); @@ -675,16 +677,16 @@ static void blkif_restart_queue(struct work_struct *work) { struct blkfront_info *info = container_of(work, struct blkfront_info, work); - spin_lock_irq(&blkif_io_lock); + spin_lock_irq(&info->io_lock); if (info->connected == BLKIF_STATE_CONNECTED) kick_pending_request_queues(info); - spin_unlock_irq(&blkif_io_lock); + spin_unlock_irq(&info->io_lock); } static void blkif_free(struct blkfront_info *info, int suspend) { /* Prevent new requests being issued until we fix things up. */ - spin_lock_irq(&blkif_io_lock); + spin_lock_irq(&info->io_lock); info->connected = suspend ? BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; /* No more blkif_request(). */ @@ -692,7 +694,7 @@ static void blkif_free(struct blkfront_info *info, int suspend) blk_stop_queue(info->rq); /* No more gnttab callback work. */ gnttab_cancel_free_callback(&info->callback); - spin_unlock_irq(&blkif_io_lock); + spin_unlock_irq(&info->io_lock); /* Flush gnttab callback work. Must be done with no locks held. 
*/ flush_work_sync(&info->work); @@ -728,10 +730,10 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) struct blkfront_info *info = (struct blkfront_info *)dev_id; int error; - spin_lock_irqsave(&blkif_io_lock, flags); + spin_lock_irqsave(&info->io_lock, flags); if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { - spin_unlock_irqrestore(&blkif_io_lock, flags); + spin_unlock_irqrestore(&info->io_lock, flags); return IRQ_HANDLED; } @@ -816,7 +818,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) kick_pending_request_queues(info); - spin_unlock_irqrestore(&blkif_io_lock, flags); + spin_unlock_irqrestore(&info->io_lock, flags); return IRQ_HANDLED; } @@ -991,6 +993,7 @@ static int blkfront_probe(struct xenbus_device *dev, } mutex_init(&info->mutex); + spin_lock_init(&info->io_lock); info->xbdev = dev; info->vdevice = vdevice; info->connected = BLKIF_STATE_DISCONNECTED; @@ -1068,7 +1071,7 @@ static int blkif_recover(struct blkfront_info *info) xenbus_switch_state(info->xbdev, XenbusStateConnected); - spin_lock_irq(&blkif_io_lock); + spin_lock_irq(&info->io_lock); /* Now safe for us to use the shared ring */ info->connected = BLKIF_STATE_CONNECTED; @@ -1079,7 +1082,7 @@ static int blkif_recover(struct blkfront_info *info) /* Kick any other new requests queued since we resumed */ kick_pending_request_queues(info); - spin_unlock_irq(&blkif_io_lock); + spin_unlock_irq(&info->io_lock); return 0; } @@ -1277,10 +1280,10 @@ static void blkfront_connect(struct blkfront_info *info) xenbus_switch_state(info->xbdev, XenbusStateConnected); /* Kick pending requests. 
*/ - spin_lock_irq(&blkif_io_lock); + spin_lock_irq(&info->io_lock); info->connected = BLKIF_STATE_CONNECTED; kick_pending_request_queues(info); - spin_unlock_irq(&blkif_io_lock); + spin_unlock_irq(&info->io_lock); add_disk(info->gd); @@ -1410,7 +1413,6 @@ static int blkif_release(struct gendisk *disk, fmode_t mode) mutex_lock(&blkfront_mutex); bdev = bdget_disk(disk, 0); - bdput(bdev); if (bdev->bd_openers) goto out; @@ -1441,6 +1443,7 @@ static int blkif_release(struct gendisk *disk, fmode_t mode) } out: + bdput(bdev); mutex_unlock(&blkfront_mutex); return 0; } @@ -1475,6 +1478,9 @@ static int __init xlblk_init(void) if (!xen_domain()) return -ENODEV; + if (xen_hvm_domain() && !xen_platform_pci_unplug) + return -ENODEV; + if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) { printk(KERN_WARNING "xen_blk: can't get major %d with name %s\n", XENVBD_MAJOR, DEV_NAME); @@ -1494,7 +1500,9 @@ module_init(xlblk_init); static void __exit xlblk_exit(void) { - return xenbus_unregister_driver(&blkfront_driver); + xenbus_unregister_driver(&blkfront_driver); + unregister_blkdev(XENVBD_MAJOR, DEV_NAME); + kfree(minors); } module_exit(xlblk_exit); |