diff options
Diffstat (limited to 'drivers/block')
25 files changed, 1374 insertions, 1300 deletions
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 758da2287d9a..5fd50a284168 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1864,7 +1864,6 @@ static int __exit amiga_floppy_remove(struct platform_device *pdev) static struct platform_driver amiga_floppy_driver = { .driver = { .name = "amiga-floppy", - .owner = THIS_MODULE, }, }; diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index dd73e1ff1759..46c282fff104 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -395,7 +395,7 @@ aoeblk_gdalloc(void *vp) WARN_ON(d->flags & DEVFL_TKILL); WARN_ON(d->gd); WARN_ON(d->flags & DEVFL_UP); - blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS); + blk_queue_max_hw_sectors(q, 1024); q->backing_dev_info.name = "aoe"; q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE; d->bufpool = mp; diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index a2dfa169237d..1318e3217cb0 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -827,8 +827,7 @@ static int update_sync_bits(struct drbd_device *device, * */ int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, - enum update_sync_bits_mode mode, - const char *file, const unsigned int line) + enum update_sync_bits_mode mode) { /* Is called from worker and receiver context _only_ */ unsigned long sbnr, ebnr, lbnr; diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index 900d4d3272d1..9a950022ff88 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -419,7 +419,7 @@ static int in_flight_summary_show(struct seq_file *m, void *pos) return 0; } -/* simple_positive(file->f_dentry) respectively debugfs_positive(), +/* simple_positive(file->f_path.dentry) respectively debugfs_positive(), * but neither is "reachable" from here. * So we have our own inline version of it above. 
:-( */ static inline int debugfs_positive(struct dentry *dentry) @@ -437,14 +437,14 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo /* Are we still linked, * or has debugfs_remove() already been called? */ - parent = file->f_dentry->d_parent; + parent = file->f_path.dentry->d_parent; /* not sure if this can happen: */ if (!parent || !parent->d_inode) goto out; /* serialize with d_delete() */ mutex_lock(&parent->d_inode->i_mutex); /* Make sure the object is still alive */ - if (debugfs_positive(file->f_dentry) + if (debugfs_positive(file->f_path.dentry) && kref_get_unless_zero(kref)) ret = 0; mutex_unlock(&parent->d_inode->i_mutex); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index 9b22f8f01b57..b905e9888b88 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -1454,7 +1454,6 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); /* drbd_nl.c */ -extern int drbd_msg_put_info(struct sk_buff *skb, const char *info); extern void drbd_suspend_io(struct drbd_device *device); extern void drbd_resume_io(struct drbd_device *device); extern char *ppsize(char *buf, unsigned long long size); @@ -1558,52 +1557,31 @@ extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled); extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed); extern int drbd_connected(struct drbd_peer_device *); -/* Yes, there is kernel_setsockopt, but only since 2.6.18. - * So we have our own copy of it here. 
*/ -static inline int drbd_setsockopt(struct socket *sock, int level, int optname, - char *optval, int optlen) -{ - mm_segment_t oldfs = get_fs(); - char __user *uoptval; - int err; - - uoptval = (char __user __force *)optval; - - set_fs(KERNEL_DS); - if (level == SOL_SOCKET) - err = sock_setsockopt(sock, level, optname, uoptval, optlen); - else - err = sock->ops->setsockopt(sock, level, optname, uoptval, - optlen); - set_fs(oldfs); - return err; -} - static inline void drbd_tcp_cork(struct socket *sock) { int val = 1; - (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, + (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (char*)&val, sizeof(val)); } static inline void drbd_tcp_uncork(struct socket *sock) { int val = 0; - (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, + (void) kernel_setsockopt(sock, SOL_TCP, TCP_CORK, (char*)&val, sizeof(val)); } static inline void drbd_tcp_nodelay(struct socket *sock) { int val = 1; - (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (void) kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY, (char*)&val, sizeof(val)); } static inline void drbd_tcp_quickack(struct socket *sock) { int val = 2; - (void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK, + (void) kernel_setsockopt(sock, SOL_TCP, TCP_QUICKACK, (char*)&val, sizeof(val)); } @@ -1662,14 +1640,13 @@ extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long stil enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC }; extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size, - enum update_sync_bits_mode mode, - const char *file, const unsigned int line); + enum update_sync_bits_mode mode); #define drbd_set_in_sync(device, sector, size) \ - __drbd_change_sync(device, sector, size, SET_IN_SYNC, __FILE__, __LINE__) + __drbd_change_sync(device, sector, size, SET_IN_SYNC) #define drbd_set_out_of_sync(device, sector, size) \ - __drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC, __FILE__, __LINE__) + 
__drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC) #define drbd_rs_failed_io(device, sector, size) \ - __drbd_change_sync(device, sector, size, RECORD_RS_FAILED, __FILE__, __LINE__) + __drbd_change_sync(device, sector, size, RECORD_RS_FAILED) extern void drbd_al_shrink(struct drbd_device *device); extern int drbd_initialize_al(struct drbd_device *, void *); diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 973c185c9cfe..1fc83427199c 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2532,10 +2532,6 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) return -ENOMEM; - /* - retcode = ERR_NOMEM; - drbd_msg_put_info("unable to allocate cpumask"); - */ /* silently ignore cpu mask on UP kernel */ if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) { @@ -2731,7 +2727,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig device = minor_to_device(minor); if (device) - return ERR_MINOR_EXISTS; + return ERR_MINOR_OR_VOLUME_EXISTS; /* GFP_KERNEL, we are outside of all write-out paths */ device = kzalloc(sizeof(struct drbd_device), GFP_KERNEL); @@ -2793,20 +2789,16 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL); if (id < 0) { - if (id == -ENOSPC) { - err = ERR_MINOR_EXISTS; - drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); - } + if (id == -ENOSPC) + err = ERR_MINOR_OR_VOLUME_EXISTS; goto out_no_minor_idr; } kref_get(&device->kref); id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL); if (id < 0) { - if (id == -ENOSPC) { - err = ERR_MINOR_EXISTS; - drbd_msg_put_info(adm_ctx->reply_skb, "requested minor exists already"); - } + if (id == -ENOSPC) + err = ERR_MINOR_OR_VOLUME_EXISTS; goto out_idr_remove_minor; } kref_get(&device->kref); @@ -2825,10 
+2817,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig id = idr_alloc(&connection->peer_devices, peer_device, vnr, vnr + 1, GFP_KERNEL); if (id < 0) { - if (id == -ENOSPC) { + if (id == -ENOSPC) err = ERR_INVALID_REQUEST; - drbd_msg_put_info(adm_ctx->reply_skb, "requested volume exists already"); - } goto out_idr_remove_from_resource; } kref_get(&connection->kref); @@ -2836,7 +2826,6 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig if (init_submitter(device)) { err = ERR_NOMEM; - drbd_msg_put_info(adm_ctx->reply_skb, "unable to create submit workqueue"); goto out_idr_remove_vol; } diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index 1cd47df44bda..74df8cfad414 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -92,7 +92,7 @@ static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info) /* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only * reason it could fail was no space in skb, and there are 4k available. 
*/ -int drbd_msg_put_info(struct sk_buff *skb, const char *info) +static int drbd_msg_put_info(struct sk_buff *skb, const char *info) { struct nlattr *nla; int err = -EMSGSIZE; @@ -588,7 +588,7 @@ drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int for val.i = 0; val.role = new_role; while (try++ < max_tries) { - rv = _drbd_request_state(device, mask, val, CS_WAIT_COMPLETE); + rv = _drbd_request_state_holding_state_mutex(device, mask, val, CS_WAIT_COMPLETE); /* in case we first succeeded to outdate, * but now suddenly could establish a connection */ @@ -2052,7 +2052,7 @@ check_net_options(struct drbd_connection *connection, struct net_conf *new_net_c rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_net_conf); rcu_read_unlock(); - /* connection->volumes protected by genl_lock() here */ + /* connection->peer_devices protected by genl_lock() here */ idr_for_each_entry(&connection->peer_devices, peer_device, i) { struct drbd_device *device = peer_device->device; if (!device->bitmap) { @@ -3483,7 +3483,7 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) * that first_peer_device(device)->connection and device->vnr match the request. */ if (adm_ctx.device) { if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) - retcode = ERR_MINOR_EXISTS; + retcode = ERR_MINOR_OR_VOLUME_EXISTS; /* else: still NO_ERROR */ goto out; } @@ -3530,6 +3530,27 @@ out: return 0; } +static int adm_del_resource(struct drbd_resource *resource) +{ + struct drbd_connection *connection; + + for_each_connection(connection, resource) { + if (connection->cstate > C_STANDALONE) + return ERR_NET_CONFIGURED; + } + if (!idr_is_empty(&resource->devices)) + return ERR_RES_IN_USE; + + list_del_rcu(&resource->resources); + /* Make sure all threads have actually stopped: state handling only + * does drbd_thread_stop_nowait(). 
*/ + list_for_each_entry(connection, &resource->connections, connections) + drbd_thread_stop(&connection->worker); + synchronize_rcu(); + drbd_free_resource(resource); + return NO_ERROR; +} + int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -3575,14 +3596,6 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) } } - /* If we reach this, all volumes (of this connection) are Secondary, - * Disconnected, Diskless, aka Unconfigured. Make sure all threads have - * actually stopped, state handling only does drbd_thread_stop_nowait(). */ - for_each_connection(connection, resource) - drbd_thread_stop(&connection->worker); - - /* Now, nothing can fail anymore */ - /* delete volumes */ idr_for_each_entry(&resource->devices, device, i) { retcode = adm_del_minor(device); @@ -3593,10 +3606,7 @@ int drbd_adm_down(struct sk_buff *skb, struct genl_info *info) } } - list_del_rcu(&resource->resources); - synchronize_rcu(); - drbd_free_resource(resource); - retcode = NO_ERROR; + retcode = adm_del_resource(resource); out: mutex_unlock(&resource->adm_mutex); finish: @@ -3608,7 +3618,6 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; struct drbd_resource *resource; - struct drbd_connection *connection; enum drbd_ret_code retcode; retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE); @@ -3616,27 +3625,10 @@ int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info) return retcode; if (retcode != NO_ERROR) goto finish; - resource = adm_ctx.resource; - mutex_lock(&resource->adm_mutex); - for_each_connection(connection, resource) { - if (connection->cstate > C_STANDALONE) { - retcode = ERR_NET_CONFIGURED; - goto out; - } - } - if (!idr_is_empty(&resource->devices)) { - retcode = ERR_RES_IN_USE; - goto out; - } - list_del_rcu(&resource->resources); - for_each_connection(connection, resource) - drbd_thread_stop(&connection->worker); - 
synchronize_rcu(); - drbd_free_resource(resource); - retcode = NO_ERROR; -out: + mutex_lock(&resource->adm_mutex); + retcode = adm_del_resource(resource); mutex_unlock(&resource->adm_mutex); finish: drbd_adm_finish(&adm_ctx, info, retcode); diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index 6960fb064731..d169b4a79267 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -2482,7 +2482,7 @@ bool drbd_rs_c_min_rate_throttle(struct drbd_device *device) atomic_read(&device->rs_sect_ev); if (atomic_read(&device->ap_actlog_cnt) - || !device->rs_last_events || curr_events - device->rs_last_events > 64) { + || curr_events - device->rs_last_events > 64) { unsigned long rs_left; int i; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 5a01c53dddeb..34f2f0ba409b 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -36,29 +36,15 @@ static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, /* Update disk stats at start of I/O request */ static void _drbd_start_io_acct(struct drbd_device *device, struct drbd_request *req) { - const int rw = bio_data_dir(req->master_bio); - int cpu; - cpu = part_stat_lock(); - part_round_stats(cpu, &device->vdisk->part0); - part_stat_inc(cpu, &device->vdisk->part0, ios[rw]); - part_stat_add(cpu, &device->vdisk->part0, sectors[rw], req->i.size >> 9); - (void) cpu; /* The macro invocations above want the cpu argument, I do not like - the compiler warning about cpu only assigned but never used... 
*/ - part_inc_in_flight(&device->vdisk->part0, rw); - part_stat_unlock(); + generic_start_io_acct(bio_data_dir(req->master_bio), req->i.size >> 9, + &device->vdisk->part0); } /* Update disk stats when completing request upwards */ static void _drbd_end_io_acct(struct drbd_device *device, struct drbd_request *req) { - int rw = bio_data_dir(req->master_bio); - unsigned long duration = jiffies - req->start_jif; - int cpu; - cpu = part_stat_lock(); - part_stat_add(cpu, &device->vdisk->part0, ticks[rw], duration); - part_round_stats(cpu, &device->vdisk->part0); - part_dec_in_flight(&device->vdisk->part0, rw); - part_stat_unlock(); + generic_end_io_acct(bio_data_dir(req->master_bio), + &device->vdisk->part0, req->start_jif); } static struct drbd_request *drbd_req_new(struct drbd_device *device, @@ -1545,6 +1531,7 @@ int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct struct request_queue * const b = device->ldev->backing_bdev->bd_disk->queue; if (b->merge_bvec_fn) { + bvm->bi_bdev = device->ldev->backing_bdev; backing_limit = b->merge_bvec_fn(b, bvm, bvec); limit = min(limit, backing_limit); } @@ -1628,7 +1615,7 @@ void request_timer_fn(unsigned long data) time_after(now, req_peer->pre_send_jif + ent) && !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); - _drbd_set_state(_NS(device, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL); + _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD); } if (dt && oldest_submit_jif != now && time_after(now, oldest_submit_jif + dt) && diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 84b11f887d73..2d7dd269b6a8 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -215,6 +215,18 @@ static bool no_peer_wf_report_params(struct drbd_connection *connection) return rv; } +static void 
wake_up_all_devices(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + wake_up(&peer_device->device->state_wait); + rcu_read_unlock(); + +} + /** * cl_wide_st_chg() - true if the state change is a cluster wide one @@ -410,6 +422,22 @@ _drbd_request_state(struct drbd_device *device, union drbd_state mask, return rv; } +enum drbd_state_rv +_drbd_request_state_holding_state_mutex(struct drbd_device *device, union drbd_state mask, + union drbd_state val, enum chg_state_flags f) +{ + enum drbd_state_rv rv; + + BUG_ON(f & CS_SERIALIZE); + + wait_event_cmd(device->state_wait, + (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE, + mutex_unlock(device->state_mutex), + mutex_lock(device->state_mutex)); + + return rv; +} + static void print_st(struct drbd_device *device, const char *name, union drbd_state ns) { drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n", @@ -629,14 +657,11 @@ is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_c if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED) rv = SS_IN_TRANSIENT_STATE; - /* if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS) - rv = SS_IN_TRANSIENT_STATE; */ - /* While establishing a connection only allow cstate to change. - Delay/refuse role changes, detach attach etc... */ + Delay/refuse role changes, detach attach etc... 
(they do not touch cstate) */ if (test_bit(STATE_SENT, &connection->flags) && - !(os.conn == C_WF_REPORT_PARAMS || - (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION))) + !((ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION) || + (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS))) rv = SS_IN_TRANSIENT_STATE; if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED) @@ -1032,8 +1057,10 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, /* Wake up role changes, that were delayed because of connection establishing */ if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS && - no_peer_wf_report_params(connection)) + no_peer_wf_report_params(connection)) { clear_bit(STATE_SENT, &connection->flags); + wake_up_all_devices(connection); + } wake_up(&device->misc_wait); wake_up(&device->state_wait); @@ -1072,7 +1099,6 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, set_ov_position(device, ns.conn); device->rs_start = now; - device->rs_last_events = 0; device->rs_last_sect_ev = 0; device->ov_last_oos_size = 0; device->ov_last_oos_start = 0; diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h index cc41605ba21c..7f53c40823cd 100644 --- a/drivers/block/drbd/drbd_state.h +++ b/drivers/block/drbd/drbd_state.h @@ -117,6 +117,11 @@ extern enum drbd_state_rv _drbd_request_state(struct drbd_device *, union drbd_state, union drbd_state, enum chg_state_flags); + +extern enum drbd_state_rv +_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state, + union drbd_state, enum chg_state_flags); + extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state, enum chg_state_flags, struct completion *done); diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index d2d1f97511bd..d0fae55d871d 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -1592,11 +1592,15 @@ 
void drbd_resync_after_changed(struct drbd_device *device) void drbd_rs_controller_reset(struct drbd_device *device) { + struct gendisk *disk = device->ldev->backing_bdev->bd_contains->bd_disk; struct fifo_buffer *plan; atomic_set(&device->rs_sect_in, 0); atomic_set(&device->rs_sect_ev, 0); device->rs_in_flight = 0; + device->rs_last_events = + (int)part_stat_read(&disk->part0, sectors[0]) + + (int)part_stat_read(&disk->part0, sectors[1]); /* Updating the RCU protected object in place is necessary since this function gets called from atomic context. @@ -1743,7 +1747,6 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) device->rs_failed = 0; device->rs_paused = 0; device->rs_same_csum = 0; - device->rs_last_events = 0; device->rs_last_sect_ev = 0; device->rs_total = tw; device->rs_start = now; diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index e352cac707e8..145ce2aa2e78 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -1082,7 +1082,6 @@ static struct platform_driver mg_disk_driver = { .remove = mg_remove, .driver = { .name = MG_DEV_NAME, - .owner = THIS_MODULE, .pm = &mg_pm, } }; diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 1bd5f523f8fd..3bd7ca9853a8 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -3775,9 +3775,10 @@ static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx, return false; } -static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool last) +static int mtip_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { + struct request *rq = bd->rq; int ret; if (unlikely(mtip_check_unal_depth(hctx, rq))) diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 8001e812018b..aa2224aa7caa 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -78,7 +78,33 @@ module_param(home_node, int, S_IRUGO); MODULE_PARM_DESC(home_node, "Home node for 
the device"); static int queue_mode = NULL_Q_MQ; -module_param(queue_mode, int, S_IRUGO); + +static int null_param_store_val(const char *str, int *val, int min, int max) +{ + int ret, new_val; + + ret = kstrtoint(str, 10, &new_val); + if (ret) + return -EINVAL; + + if (new_val < min || new_val > max) + return -EINVAL; + + *val = new_val; + return 0; +} + +static int null_set_queue_mode(const char *str, const struct kernel_param *kp) +{ + return null_param_store_val(str, &queue_mode, NULL_Q_BIO, NULL_Q_MQ); +} + +static struct kernel_param_ops null_queue_mode_param_ops = { + .set = null_set_queue_mode, + .get = param_get_int, +}; + +device_param_cb(queue_mode, &null_queue_mode_param_ops, &queue_mode, S_IRUGO); MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)"); static int gb = 250; @@ -94,7 +120,19 @@ module_param(nr_devices, int, S_IRUGO); MODULE_PARM_DESC(nr_devices, "Number of devices to register"); static int irqmode = NULL_IRQ_SOFTIRQ; -module_param(irqmode, int, S_IRUGO); + +static int null_set_irqmode(const char *str, const struct kernel_param *kp) +{ + return null_param_store_val(str, &irqmode, NULL_IRQ_NONE, + NULL_IRQ_TIMER); +} + +static struct kernel_param_ops null_irqmode_param_ops = { + .set = null_set_irqmode, + .get = param_get_int, +}; + +device_param_cb(irqmode, &null_irqmode_param_ops, &irqmode, S_IRUGO); MODULE_PARM_DESC(irqmode, "IRQ completion handler. 
0-none, 1-softirq, 2-timer"); static int completion_nsec = 10000; @@ -313,15 +351,15 @@ static void null_request_fn(struct request_queue *q) } } -static int null_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool last) +static int null_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq); + struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq); - cmd->rq = rq; + cmd->rq = bd->rq; cmd->nq = hctx->driver_data; - blk_mq_start_request(rq); + blk_mq_start_request(bd->rq); null_handle_cmd(cmd); return BLK_MQ_RQ_QUEUE_OK; @@ -492,7 +530,7 @@ static int null_add_dev(void) goto out_cleanup_queues; nullb->q = blk_mq_init_queue(&nullb->tag_set); - if (!nullb->q) { + if (IS_ERR(nullb->q)) { rv = -ENOMEM; goto out_cleanup_tags; } diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index e2bb8afbeae5..d826bf3e62c8 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -13,9 +13,9 @@ */ #include <linux/nvme.h> -#include <linux/bio.h> #include <linux/bitops.h> #include <linux/blkdev.h> +#include <linux/blk-mq.h> #include <linux/cpu.h> #include <linux/delay.h> #include <linux/errno.h> @@ -33,7 +33,6 @@ #include <linux/module.h> #include <linux/moduleparam.h> #include <linux/pci.h> -#include <linux/percpu.h> #include <linux/poison.h> #include <linux/ptrace.h> #include <linux/sched.h> @@ -42,12 +41,12 @@ #include <scsi/sg.h> #include <asm-generic/io-64-nonatomic-lo-hi.h> -#include <trace/events/block.h> - #define NVME_Q_DEPTH 1024 +#define NVME_AQ_DEPTH 64 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) #define ADMIN_TIMEOUT (admin_timeout * HZ) +#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) #define IOD_TIMEOUT (retry_time * HZ) static unsigned char admin_timeout = 60; @@ -62,6 +61,10 @@ static unsigned char retry_time = 30; module_param(retry_time, byte, 0644); MODULE_PARM_DESC(retry_time, 
"time in seconds to retry failed I/O"); +static unsigned char shutdown_timeout = 5; +module_param(shutdown_timeout, byte, 0644); +MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); + static int nvme_major; module_param(nvme_major, int, 0); @@ -76,10 +79,12 @@ static wait_queue_head_t nvme_kthread_wait; static struct notifier_block nvme_nb; static void nvme_reset_failed_dev(struct work_struct *ws); +static int nvme_process_cq(struct nvme_queue *nvmeq); struct async_cmd_info { struct kthread_work work; struct kthread_worker *worker; + struct request *req; u32 result; int status; void *ctx; @@ -90,7 +95,7 @@ struct async_cmd_info { * commands and one for I/O commands). */ struct nvme_queue { - struct rcu_head r_head; + struct llist_node node; struct device *q_dmadev; struct nvme_dev *dev; char irqname[24]; /* nvme4294967295-65535\0 */ @@ -99,23 +104,17 @@ struct nvme_queue { volatile struct nvme_completion *cqes; dma_addr_t sq_dma_addr; dma_addr_t cq_dma_addr; - wait_queue_head_t sq_full; - wait_queue_t sq_cong_wait; - struct bio_list sq_cong; - struct list_head iod_bio; u32 __iomem *q_db; u16 q_depth; - u16 cq_vector; + s16 cq_vector; u16 sq_head; u16 sq_tail; u16 cq_head; u16 qid; u8 cq_phase; u8 cqe_seen; - u8 q_suspended; - cpumask_var_t cpu_mask; struct async_cmd_info cmdinfo; - unsigned long cmdid_data[]; + struct blk_mq_hw_ctx *hctx; }; /* @@ -143,62 +142,80 @@ typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, struct nvme_cmd_info { nvme_completion_fn fn; void *ctx; - unsigned long timeout; int aborted; + struct nvme_queue *nvmeq; }; -static struct nvme_cmd_info *nvme_cmd_info(struct nvme_queue *nvmeq) +static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) { - return (void *)&nvmeq->cmdid_data[BITS_TO_LONGS(nvmeq->q_depth)]; + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = dev->queues[0]; + + WARN_ON(nvmeq->hctx); + nvmeq->hctx = hctx; + hctx->driver_data = nvmeq; 
+ return 0; } -static unsigned nvme_queue_extra(int depth) +static int nvme_admin_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) { - return DIV_ROUND_UP(depth, 8) + (depth * sizeof(struct nvme_cmd_info)); + struct nvme_dev *dev = data; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = dev->queues[0]; + + BUG_ON(!nvmeq); + cmd->nvmeq = nvmeq; + return 0; } -/** - * alloc_cmdid() - Allocate a Command ID - * @nvmeq: The queue that will be used for this command - * @ctx: A pointer that will be passed to the handler - * @handler: The function to call on completion - * - * Allocate a Command ID for a queue. The data passed in will - * be passed to the completion handler. This is implemented by using - * the bottom two bits of the ctx pointer to store the handler ID. - * Passing in a pointer that's not 4-byte aligned will cause a BUG. - * We can change this if it becomes a problem. - * - * May be called with local interrupts disabled and the q_lock held, - * or with interrupts enabled and no locks held. 
- */ -static int alloc_cmdid(struct nvme_queue *nvmeq, void *ctx, - nvme_completion_fn handler, unsigned timeout) +static void nvme_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { - int depth = nvmeq->q_depth - 1; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - int cmdid; + struct nvme_queue *nvmeq = hctx->driver_data; - do { - cmdid = find_first_zero_bit(nvmeq->cmdid_data, depth); - if (cmdid >= depth) - return -EBUSY; - } while (test_and_set_bit(cmdid, nvmeq->cmdid_data)); + nvmeq->hctx = NULL; +} + +static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int hctx_idx) +{ + struct nvme_dev *dev = data; + struct nvme_queue *nvmeq = dev->queues[ + (hctx_idx % dev->queue_count) + 1]; - info[cmdid].fn = handler; - info[cmdid].ctx = ctx; - info[cmdid].timeout = jiffies + timeout; - info[cmdid].aborted = 0; - return cmdid; + if (!nvmeq->hctx) + nvmeq->hctx = hctx; + + /* nvmeq queues are shared between namespaces. We assume here that + * blk-mq map the tags so they match up with the nvme queue tags. */ + WARN_ON(nvmeq->hctx->tags != hctx->tags); + + hctx->driver_data = nvmeq; + return 0; +} + +static int nvme_init_request(void *data, struct request *req, + unsigned int hctx_idx, unsigned int rq_idx, + unsigned int numa_node) +{ + struct nvme_dev *dev = data; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; + + BUG_ON(!nvmeq); + cmd->nvmeq = nvmeq; + return 0; } -static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, - nvme_completion_fn handler, unsigned timeout) +static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, + nvme_completion_fn handler) { - int cmdid; - wait_event_killable(nvmeq->sq_full, - (cmdid = alloc_cmdid(nvmeq, ctx, handler, timeout)) >= 0); - return (cmdid < 0) ? 
-EINTR : cmdid; + cmd->fn = handler; + cmd->ctx = ctx; + cmd->aborted = 0; + blk_mq_start_request(blk_mq_rq_from_pdu(cmd)); } /* Special values must be less than 0x1000 */ @@ -206,17 +223,12 @@ static int alloc_cmdid_killable(struct nvme_queue *nvmeq, void *ctx, #define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) #define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) #define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) -#define CMD_CTX_ABORT (0x318 + CMD_CTX_BASE) static void special_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { if (ctx == CMD_CTX_CANCELLED) return; - if (ctx == CMD_CTX_ABORT) { - ++nvmeq->dev->abort_limit; - return; - } if (ctx == CMD_CTX_COMPLETED) { dev_warn(nvmeq->q_dmadev, "completed id %d twice on queue %d\n", @@ -229,99 +241,89 @@ static void special_completion(struct nvme_queue *nvmeq, void *ctx, cqe->command_id, le16_to_cpup(&cqe->sq_id)); return; } - dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); } -static void async_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct async_cmd_info *cmdinfo = ctx; - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - queue_kthread_work(cmdinfo->worker, &cmdinfo->work); -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. 
- */ -static void *free_cmdid(struct nvme_queue *nvmeq, int cmdid, - nvme_completion_fn *fn) +static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn) { void *ctx; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (cmdid >= nvmeq->q_depth || !info[cmdid].fn) { - if (fn) - *fn = special_completion; - return CMD_CTX_INVALID; - } if (fn) - *fn = info[cmdid].fn; - ctx = info[cmdid].ctx; - info[cmdid].fn = special_completion; - info[cmdid].ctx = CMD_CTX_COMPLETED; - clear_bit(cmdid, nvmeq->cmdid_data); - wake_up(&nvmeq->sq_full); + *fn = cmd->fn; + ctx = cmd->ctx; + cmd->fn = special_completion; + cmd->ctx = CMD_CTX_CANCELLED; return ctx; } -static void *cancel_cmdid(struct nvme_queue *nvmeq, int cmdid, - nvme_completion_fn *fn) +static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) { - void *ctx; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - if (fn) - *fn = info[cmdid].fn; - ctx = info[cmdid].ctx; - info[cmdid].fn = special_completion; - info[cmdid].ctx = CMD_CTX_CANCELLED; - return ctx; -} + struct request *req = ctx; -static struct nvme_queue *raw_nvmeq(struct nvme_dev *dev, int qid) -{ - return rcu_dereference_raw(dev->queues[qid]); + u32 result = le32_to_cpup(&cqe->result); + u16 status = le16_to_cpup(&cqe->status) >> 1; + + if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) + ++nvmeq->dev->event_limit; + if (status == NVME_SC_SUCCESS) + dev_warn(nvmeq->q_dmadev, + "async event result %08x\n", result); + + blk_mq_free_hctx_request(nvmeq->hctx, req); } -static struct nvme_queue *get_nvmeq(struct nvme_dev *dev) __acquires(RCU) +static void abort_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) { - struct nvme_queue *nvmeq; - unsigned queue_id = get_cpu_var(*dev->io_queue); + struct request *req = ctx; - rcu_read_lock(); - nvmeq = rcu_dereference(dev->queues[queue_id]); - if (nvmeq) - return nvmeq; + u16 status = le16_to_cpup(&cqe->status) >> 
1; + u32 result = le32_to_cpup(&cqe->result); - rcu_read_unlock(); - put_cpu_var(*dev->io_queue); - return NULL; + blk_mq_free_hctx_request(nvmeq->hctx, req); + + dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); + ++nvmeq->dev->abort_limit; } -static void put_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) +static void async_completion(struct nvme_queue *nvmeq, void *ctx, + struct nvme_completion *cqe) { - rcu_read_unlock(); - put_cpu_var(nvmeq->dev->io_queue); + struct async_cmd_info *cmdinfo = ctx; + cmdinfo->result = le32_to_cpup(&cqe->result); + cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; + queue_kthread_work(cmdinfo->worker, &cmdinfo->work); + blk_mq_free_hctx_request(nvmeq->hctx, cmdinfo->req); } -static struct nvme_queue *lock_nvmeq(struct nvme_dev *dev, int q_idx) - __acquires(RCU) +static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, + unsigned int tag) { - struct nvme_queue *nvmeq; + struct blk_mq_hw_ctx *hctx = nvmeq->hctx; + struct request *req = blk_mq_tag_to_rq(hctx->tags, tag); - rcu_read_lock(); - nvmeq = rcu_dereference(dev->queues[q_idx]); - if (nvmeq) - return nvmeq; - - rcu_read_unlock(); - return NULL; + return blk_mq_rq_to_pdu(req); } -static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) +/* + * Called with local interrupts disabled and the q_lock held. May not sleep. 
+ */ +static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag, + nvme_completion_fn *fn) { - rcu_read_unlock(); + struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag); + void *ctx; + if (tag >= nvmeq->q_depth) { + *fn = special_completion; + return CMD_CTX_INVALID; + } + if (fn) + *fn = cmd->fn; + ctx = cmd->ctx; + cmd->fn = special_completion; + cmd->ctx = CMD_CTX_COMPLETED; + return ctx; } /** @@ -331,26 +333,29 @@ static void unlock_nvmeq(struct nvme_queue *nvmeq) __releases(RCU) * * Safe to use from interrupt context */ -static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +static int __nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) { - unsigned long flags; - u16 tail; - spin_lock_irqsave(&nvmeq->q_lock, flags); - if (nvmeq->q_suspended) { - spin_unlock_irqrestore(&nvmeq->q_lock, flags); - return -EBUSY; - } - tail = nvmeq->sq_tail; + u16 tail = nvmeq->sq_tail; + memcpy(&nvmeq->sq_cmds[tail], cmd, sizeof(*cmd)); if (++tail == nvmeq->q_depth) tail = 0; writel(tail, nvmeq->q_db); nvmeq->sq_tail = tail; - spin_unlock_irqrestore(&nvmeq->q_lock, flags); return 0; } +static int nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) +{ + unsigned long flags; + int ret; + spin_lock_irqsave(&nvmeq->q_lock, flags); + ret = __nvme_submit_cmd(nvmeq, cmd); + spin_unlock_irqrestore(&nvmeq->q_lock, flags); + return ret; +} + static __le64 **iod_list(struct nvme_iod *iod) { return ((void *)iod) + iod->offset; @@ -361,17 +366,17 @@ static __le64 **iod_list(struct nvme_iod *iod) * as it only leads to a small amount of wasted memory for the lifetime of * the I/O. 
*/ -static int nvme_npages(unsigned size) +static int nvme_npages(unsigned size, struct nvme_dev *dev) { - unsigned nprps = DIV_ROUND_UP(size + PAGE_SIZE, PAGE_SIZE); - return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); + unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); + return DIV_ROUND_UP(8 * nprps, dev->page_size - 8); } static struct nvme_iod * -nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) +nvme_alloc_iod(unsigned nseg, unsigned nbytes, struct nvme_dev *dev, gfp_t gfp) { struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + - sizeof(__le64 *) * nvme_npages(nbytes) + + sizeof(__le64 *) * nvme_npages(nbytes, dev) + sizeof(struct scatterlist) * nseg, gfp); if (iod) { @@ -380,7 +385,6 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) iod->length = nbytes; iod->nents = 0; iod->first_dma = 0ULL; - iod->start_time = jiffies; } return iod; @@ -388,7 +392,7 @@ nvme_alloc_iod(unsigned nseg, unsigned nbytes, gfp_t gfp) void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { - const int last_prp = PAGE_SIZE / 8 - 1; + const int last_prp = dev->page_size / 8 - 1; int i; __le64 **list = iod_list(iod); dma_addr_t prp_dma = iod->first_dma; @@ -404,65 +408,54 @@ void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) kfree(iod); } -static void nvme_start_io_acct(struct bio *bio) -{ - struct gendisk *disk = bio->bi_bdev->bd_disk; - if (blk_queue_io_stat(disk->queue)) { - const int rw = bio_data_dir(bio); - int cpu = part_stat_lock(); - part_round_stats(cpu, &disk->part0); - part_stat_inc(cpu, &disk->part0, ios[rw]); - part_stat_add(cpu, &disk->part0, sectors[rw], - bio_sectors(bio)); - part_inc_in_flight(&disk->part0, rw); - part_stat_unlock(); - } -} - -static void nvme_end_io_acct(struct bio *bio, unsigned long start_time) +static int nvme_error_status(u16 status) { - struct gendisk *disk = bio->bi_bdev->bd_disk; - if (blk_queue_io_stat(disk->queue)) { - const int rw = bio_data_dir(bio); - unsigned long duration = 
jiffies - start_time; - int cpu = part_stat_lock(); - part_stat_add(cpu, &disk->part0, ticks[rw], duration); - part_round_stats(cpu, &disk->part0); - part_dec_in_flight(&disk->part0, rw); - part_stat_unlock(); + switch (status & 0x7ff) { + case NVME_SC_SUCCESS: + return 0; + case NVME_SC_CAP_EXCEEDED: + return -ENOSPC; + default: + return -EIO; } } -static void bio_completion(struct nvme_queue *nvmeq, void *ctx, +static void req_completion(struct nvme_queue *nvmeq, void *ctx, struct nvme_completion *cqe) { struct nvme_iod *iod = ctx; - struct bio *bio = iod->private; + struct request *req = iod->private; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + u16 status = le16_to_cpup(&cqe->status) >> 1; - int error = 0; if (unlikely(status)) { - if (!(status & NVME_SC_DNR || - bio->bi_rw & REQ_FAILFAST_MASK) && - (jiffies - iod->start_time) < IOD_TIMEOUT) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - list_add_tail(&iod->node, &nvmeq->iod_bio); - wake_up(&nvmeq->sq_full); + if (!(status & NVME_SC_DNR || blk_noretry_request(req)) + && (jiffies - req->start_time) < req->timeout) { + unsigned long flags; + + blk_mq_requeue_request(req); + spin_lock_irqsave(req->q->queue_lock, flags); + if (!blk_queue_stopped(req->q)) + blk_mq_kick_requeue_list(req->q); + spin_unlock_irqrestore(req->q->queue_lock, flags); return; } - error = -EIO; - } - if (iod->nents) { - dma_unmap_sg(nvmeq->q_dmadev, iod->sg, iod->nents, - bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - nvme_end_io_acct(bio, iod->start_time); - } + req->errors = nvme_error_status(status); + } else + req->errors = 0; + + if (cmd_rq->aborted) + dev_warn(&nvmeq->dev->pci_dev->dev, + "completing aborted command with status:%04x\n", + status); + + if (iod->nents) + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents, + rq_data_dir(req) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE); nvme_free_iod(nvmeq->dev, iod); - trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio, error); - bio_endio(bio, error); + blk_mq_complete_request(req); } /* length is in bytes. gfp flags indicates whether we may sleep. */ @@ -479,26 +472,27 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, __le64 **list = iod_list(iod); dma_addr_t prp_dma; int nprps, i; + u32 page_size = dev->page_size; - length -= (PAGE_SIZE - offset); + length -= (page_size - offset); if (length <= 0) return total_len; - dma_len -= (PAGE_SIZE - offset); + dma_len -= (page_size - offset); if (dma_len) { - dma_addr += (PAGE_SIZE - offset); + dma_addr += (page_size - offset); } else { sg = sg_next(sg); dma_addr = sg_dma_address(sg); dma_len = sg_dma_len(sg); } - if (length <= PAGE_SIZE) { + if (length <= page_size) { iod->first_dma = dma_addr; return total_len; } - nprps = DIV_ROUND_UP(length, PAGE_SIZE); + nprps = DIV_ROUND_UP(length, page_size); if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; iod->npages = 0; @@ -511,13 +505,13 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, if (!prp_list) { iod->first_dma = dma_addr; iod->npages = -1; - return (total_len - length) + PAGE_SIZE; + return (total_len - length) + page_size; } list[0] = prp_list; iod->first_dma = prp_dma; i = 0; for (;;) { - if (i == PAGE_SIZE / 8) { + if (i == page_size >> 3) { __le64 *old_prp_list = prp_list; prp_list = dma_pool_alloc(pool, gfp, &prp_dma); if (!prp_list) @@ -528,9 +522,9 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, i = 1; } prp_list[i++] = cpu_to_le64(dma_addr); - dma_len -= PAGE_SIZE; - dma_addr += PAGE_SIZE; - length -= PAGE_SIZE; + dma_len -= page_size; + dma_addr += page_size; + length -= page_size; if (length <= 0) break; if (dma_len > 0) @@ -544,88 +538,25 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, return total_len; } -static 
int nvme_split_and_submit(struct bio *bio, struct nvme_queue *nvmeq, - int len) -{ - struct bio *split = bio_split(bio, len >> 9, GFP_ATOMIC, NULL); - if (!split) - return -ENOMEM; - - trace_block_split(bdev_get_queue(bio->bi_bdev), bio, - split->bi_iter.bi_sector); - bio_chain(split, bio); - - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, split); - bio_list_add(&nvmeq->sq_cong, bio); - wake_up(&nvmeq->sq_full); - - return 0; -} - -/* NVMe scatterlists require no holes in the virtual address */ -#define BIOVEC_NOT_VIRT_MERGEABLE(vec1, vec2) ((vec2)->bv_offset || \ - (((vec1)->bv_offset + (vec1)->bv_len) % PAGE_SIZE)) - -static int nvme_map_bio(struct nvme_queue *nvmeq, struct nvme_iod *iod, - struct bio *bio, enum dma_data_direction dma_dir, int psegs) -{ - struct bio_vec bvec, bvprv; - struct bvec_iter iter; - struct scatterlist *sg = NULL; - int length = 0, nsegs = 0, split_len = bio->bi_iter.bi_size; - int first = 1; - - if (nvmeq->dev->stripe_size) - split_len = nvmeq->dev->stripe_size - - ((bio->bi_iter.bi_sector << 9) & - (nvmeq->dev->stripe_size - 1)); - - sg_init_table(iod->sg, psegs); - bio_for_each_segment(bvec, bio, iter) { - if (!first && BIOVEC_PHYS_MERGEABLE(&bvprv, &bvec)) { - sg->length += bvec.bv_len; - } else { - if (!first && BIOVEC_NOT_VIRT_MERGEABLE(&bvprv, &bvec)) - return nvme_split_and_submit(bio, nvmeq, - length); - - sg = sg ? 
sg + 1 : iod->sg; - sg_set_page(sg, bvec.bv_page, - bvec.bv_len, bvec.bv_offset); - nsegs++; - } - - if (split_len - length < bvec.bv_len) - return nvme_split_and_submit(bio, nvmeq, split_len); - length += bvec.bv_len; - bvprv = bvec; - first = 0; - } - iod->nents = nsegs; - sg_mark_end(sg); - if (dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir) == 0) - return -ENOMEM; - - BUG_ON(length != bio->bi_iter.bi_size); - return length; -} - -static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct bio *bio, struct nvme_iod *iod, int cmdid) +/* + * We reuse the small pool to allocate the 16-byte range here as it is not + * worth having a special pool for these or additional cases to handle freeing + * the iod. + */ +static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct request *req, struct nvme_iod *iod) { struct nvme_dsm_range *range = (struct nvme_dsm_range *)iod_list(iod)[0]; struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; range->cattr = cpu_to_le32(0); - range->nlb = cpu_to_le32(bio->bi_iter.bi_size >> ns->lba_shift); - range->slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); + range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); + range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); memset(cmnd, 0, sizeof(*cmnd)); cmnd->dsm.opcode = nvme_cmd_dsm; - cmnd->dsm.command_id = cmdid; + cmnd->dsm.command_id = req->tag; cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); cmnd->dsm.nr = 0; @@ -634,11 +565,9 @@ static int nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; } -static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, +static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, int cmdid) { struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; @@ 
-651,49 +580,34 @@ static int nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, if (++nvmeq->sq_tail == nvmeq->q_depth) nvmeq->sq_tail = 0; writel(nvmeq->sq_tail, nvmeq->q_db); - - return 0; } -static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) +static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, + struct nvme_ns *ns) { - struct bio *bio = iod->private; - struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; + struct request *req = iod->private; struct nvme_command *cmnd; - int cmdid; - u16 control; - u32 dsmgmt; - - cmdid = alloc_cmdid(nvmeq, iod, bio_completion, NVME_IO_TIMEOUT); - if (unlikely(cmdid < 0)) - return cmdid; + u16 control = 0; + u32 dsmgmt = 0; - if (bio->bi_rw & REQ_DISCARD) - return nvme_submit_discard(nvmeq, ns, bio, iod, cmdid); - if (bio->bi_rw & REQ_FLUSH) - return nvme_submit_flush(nvmeq, ns, cmdid); - - control = 0; - if (bio->bi_rw & REQ_FUA) + if (req->cmd_flags & REQ_FUA) control |= NVME_RW_FUA; - if (bio->bi_rw & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) control |= NVME_RW_LR; - dsmgmt = 0; - if (bio->bi_rw & REQ_RAHEAD) + if (req->cmd_flags & REQ_RAHEAD) dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; memset(cmnd, 0, sizeof(*cmnd)); - cmnd->rw.opcode = bio_data_dir(bio) ? nvme_cmd_write : nvme_cmd_read; - cmnd->rw.command_id = cmdid; + cmnd->rw.opcode = (rq_data_dir(req) ? 
nvme_cmd_write : nvme_cmd_read); + cmnd->rw.command_id = req->tag; cmnd->rw.nsid = cpu_to_le32(ns->ns_id); cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); - cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, bio->bi_iter.bi_sector)); - cmnd->rw.length = - cpu_to_le16((bio->bi_iter.bi_size >> ns->lba_shift) - 1); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); cmnd->rw.control = cpu_to_le16(control); cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); @@ -704,45 +618,26 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod) return 0; } -static int nvme_split_flush_data(struct nvme_queue *nvmeq, struct bio *bio) -{ - struct bio *split = bio_clone(bio, GFP_ATOMIC); - if (!split) - return -ENOMEM; - - split->bi_iter.bi_size = 0; - split->bi_phys_segments = 0; - bio->bi_rw &= ~REQ_FLUSH; - bio_chain(split, bio); - - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, split); - bio_list_add(&nvmeq->sq_cong, bio); - wake_up_process(nvme_thread); - - return 0; -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. - */ -static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct bio *bio) +static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { + struct nvme_ns *ns = hctx->queue->queuedata; + struct nvme_queue *nvmeq = hctx->driver_data; + struct request *req = bd->rq; + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_iod *iod; - int psegs = bio_phys_segments(ns->queue, bio); - int result; - - if ((bio->bi_rw & REQ_FLUSH) && psegs) - return nvme_split_flush_data(nvmeq, bio); + int psegs = req->nr_phys_segments; + enum dma_data_direction dma_dir; + unsigned size = !(req->cmd_flags & REQ_DISCARD) ? 
blk_rq_bytes(req) : + sizeof(struct nvme_dsm_range); - iod = nvme_alloc_iod(psegs, bio->bi_iter.bi_size, GFP_ATOMIC); + iod = nvme_alloc_iod(psegs, size, ns->dev, GFP_ATOMIC); if (!iod) - return -ENOMEM; + return BLK_MQ_RQ_QUEUE_BUSY; + + iod->private = req; - iod->private = bio; - if (bio->bi_rw & REQ_DISCARD) { + if (req->cmd_flags & REQ_DISCARD) { void *range; /* * We reuse the small pool to allocate the 16-byte range here @@ -752,35 +647,48 @@ static int nvme_submit_bio_queue(struct nvme_queue *nvmeq, struct nvme_ns *ns, range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, &iod->first_dma); - if (!range) { - result = -ENOMEM; - goto free_iod; - } + if (!range) + goto retry_cmd; iod_list(iod)[0] = (__le64 *)range; iod->npages = 0; } else if (psegs) { - result = nvme_map_bio(nvmeq, iod, bio, - bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE, - psegs); - if (result <= 0) - goto free_iod; - if (nvme_setup_prps(nvmeq->dev, iod, result, GFP_ATOMIC) != - result) { - result = -ENOMEM; - goto free_iod; + dma_dir = rq_data_dir(req) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE; + + sg_init_table(iod->sg, psegs); + iod->nents = blk_rq_map_sg(req->q, req, iod->sg); + if (!iod->nents) + goto error_cmd; + + if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir)) + goto retry_cmd; + + if (blk_rq_bytes(req) != + nvme_setup_prps(nvmeq->dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) { + dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, + iod->nents, dma_dir); + goto retry_cmd; } - nvme_start_io_acct(bio); } - if (unlikely(nvme_submit_iod(nvmeq, iod))) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - list_add_tail(&iod->node, &nvmeq->iod_bio); - } - return 0; - free_iod: + nvme_set_info(cmd, iod, req_completion); + spin_lock_irq(&nvmeq->q_lock); + if (req->cmd_flags & REQ_DISCARD) + nvme_submit_discard(nvmeq, ns, req, iod); + else if (req->cmd_flags & REQ_FLUSH) + nvme_submit_flush(nvmeq, ns, req->tag); + else + nvme_submit_iod(nvmeq, iod, ns); + + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + return BLK_MQ_RQ_QUEUE_OK; + + error_cmd: nvme_free_iod(nvmeq->dev, iod); - return result; + return BLK_MQ_RQ_QUEUE_ERROR; + retry_cmd: + nvme_free_iod(nvmeq->dev, iod); + return BLK_MQ_RQ_QUEUE_BUSY; } static int nvme_process_cq(struct nvme_queue *nvmeq) @@ -801,8 +709,7 @@ static int nvme_process_cq(struct nvme_queue *nvmeq) head = 0; phase = !phase; } - - ctx = free_cmdid(nvmeq, cqe.command_id, &fn); + ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn); fn(nvmeq, ctx, &cqe); } @@ -823,29 +730,13 @@ static int nvme_process_cq(struct nvme_queue *nvmeq) return 1; } -static void nvme_make_request(struct request_queue *q, struct bio *bio) +/* Admin queue isn't initialized as a request queue. 
If at some point this + * happens anyway, make sure to notify the user */ +static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { - struct nvme_ns *ns = q->queuedata; - struct nvme_queue *nvmeq = get_nvmeq(ns->dev); - int result = -EBUSY; - - if (!nvmeq) { - bio_endio(bio, -EIO); - return; - } - - spin_lock_irq(&nvmeq->q_lock); - if (!nvmeq->q_suspended && bio_list_empty(&nvmeq->sq_cong)) - result = nvme_submit_bio_queue(nvmeq, ns, bio); - if (unlikely(result)) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, &nvmeq->sq_cong_wait); - bio_list_add(&nvmeq->sq_cong, bio); - } - - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); - put_nvmeq(nvmeq); + WARN_ON_ONCE(1); + return BLK_MQ_RQ_QUEUE_ERROR; } static irqreturn_t nvme_irq(int irq, void *data) @@ -869,10 +760,11 @@ static irqreturn_t nvme_irq_check(int irq, void *data) return IRQ_WAKE_THREAD; } -static void nvme_abort_command(struct nvme_queue *nvmeq, int cmdid) +static void nvme_abort_cmd_info(struct nvme_queue *nvmeq, struct nvme_cmd_info * + cmd_info) { spin_lock_irq(&nvmeq->q_lock); - cancel_cmdid(nvmeq, cmdid, NULL); + cancel_cmd_info(cmd_info, NULL); spin_unlock_irq(&nvmeq->q_lock); } @@ -895,47 +787,40 @@ static void sync_completion(struct nvme_queue *nvmeq, void *ctx, * Returns 0 on success. 
If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code */ -static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, - struct nvme_command *cmd, +static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd, u32 *result, unsigned timeout) { - int cmdid, ret; + int ret; struct sync_cmd_info cmdinfo; - struct nvme_queue *nvmeq; - - nvmeq = lock_nvmeq(dev, q_idx); - if (!nvmeq) - return -ENODEV; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd_rq->nvmeq; cmdinfo.task = current; cmdinfo.status = -EINTR; - cmdid = alloc_cmdid(nvmeq, &cmdinfo, sync_completion, timeout); - if (cmdid < 0) { - unlock_nvmeq(nvmeq); - return cmdid; - } - cmd->common.command_id = cmdid; + cmd->common.command_id = req->tag; + + nvme_set_info(cmd_rq, &cmdinfo, sync_completion); set_current_state(TASK_KILLABLE); ret = nvme_submit_cmd(nvmeq, cmd); if (ret) { - free_cmdid(nvmeq, cmdid, NULL); - unlock_nvmeq(nvmeq); + nvme_finish_cmd(nvmeq, req->tag, NULL); set_current_state(TASK_RUNNING); - return ret; } - unlock_nvmeq(nvmeq); - schedule_timeout(timeout); + ret = schedule_timeout(timeout); - if (cmdinfo.status == -EINTR) { - nvmeq = lock_nvmeq(dev, q_idx); - if (nvmeq) { - nvme_abort_command(nvmeq, cmdid); - unlock_nvmeq(nvmeq); - } + /* + * Ensure that sync_completion has either run, or that it will + * never run. 
+ */ + nvme_abort_cmd_info(nvmeq, blk_mq_rq_to_pdu(req)); + + /* + * We never got the completion + */ + if (cmdinfo.status == -EINTR) return -EINTR; - } if (result) *result = cmdinfo.result; @@ -943,59 +828,100 @@ static int nvme_submit_sync_cmd(struct nvme_dev *dev, int q_idx, return cmdinfo.status; } -static int nvme_submit_async_cmd(struct nvme_queue *nvmeq, +static int nvme_submit_async_admin_req(struct nvme_dev *dev) +{ + struct nvme_queue *nvmeq = dev->queues[0]; + struct nvme_command c; + struct nvme_cmd_info *cmd_info; + struct request *req; + + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, false); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->cmd_flags |= REQ_NO_TIMEOUT; + cmd_info = blk_mq_rq_to_pdu(req); + nvme_set_info(cmd_info, req, async_req_completion); + + memset(&c, 0, sizeof(c)); + c.common.opcode = nvme_admin_async_event; + c.common.command_id = req->tag; + + return __nvme_submit_cmd(nvmeq, &c); +} + +static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, struct nvme_command *cmd, struct async_cmd_info *cmdinfo, unsigned timeout) { - int cmdid; + struct nvme_queue *nvmeq = dev->queues[0]; + struct request *req; + struct nvme_cmd_info *cmd_rq; + + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + if (IS_ERR(req)) + return PTR_ERR(req); - cmdid = alloc_cmdid_killable(nvmeq, cmdinfo, async_completion, timeout); - if (cmdid < 0) - return cmdid; + req->timeout = timeout; + cmd_rq = blk_mq_rq_to_pdu(req); + cmdinfo->req = req; + nvme_set_info(cmd_rq, cmdinfo, async_completion); cmdinfo->status = -EINTR; - cmd->common.command_id = cmdid; + + cmd->common.command_id = req->tag; + return nvme_submit_cmd(nvmeq, cmd); } -int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, - u32 *result) +static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, + u32 *result, unsigned timeout) { - return nvme_submit_sync_cmd(dev, 0, cmd, result, ADMIN_TIMEOUT); + int res; + struct 
request *req; + + req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); + if (IS_ERR(req)) + return PTR_ERR(req); + res = nvme_submit_sync_cmd(req, cmd, result, timeout); + blk_mq_free_request(req); + return res; } -int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_command *cmd, +int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, u32 *result) { - return nvme_submit_sync_cmd(dev, smp_processor_id() + 1, cmd, result, - NVME_IO_TIMEOUT); + return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT); } -static int nvme_submit_admin_cmd_async(struct nvme_dev *dev, - struct nvme_command *cmd, struct async_cmd_info *cmdinfo) +int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns, + struct nvme_command *cmd, u32 *result) { - return nvme_submit_async_cmd(raw_nvmeq(dev, 0), cmd, cmdinfo, - ADMIN_TIMEOUT); + int res; + struct request *req; + + req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT), + false); + if (IS_ERR(req)) + return PTR_ERR(req); + res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT); + blk_mq_free_request(req); + return res; } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) { - int status; struct nvme_command c; memset(&c, 0, sizeof(c)); c.delete_queue.opcode = opcode; c.delete_queue.qid = cpu_to_le16(id); - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; + return nvme_submit_admin_cmd(dev, &c, NULL); } static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, struct nvme_queue *nvmeq) { - int status; struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; @@ -1007,16 +933,12 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, c.create_cq.cq_flags = cpu_to_le16(flags); c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; + return nvme_submit_admin_cmd(dev, &c, NULL); } static int 
adapter_alloc_sq(struct nvme_dev *dev, u16 qid, struct nvme_queue *nvmeq) { - int status; struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; @@ -1028,10 +950,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, c.create_sq.sq_flags = cpu_to_le16(flags); c.create_sq.cqid = cpu_to_le16(qid); - status = nvme_submit_admin_cmd(dev, &c, NULL); - if (status) - return -EIO; - return 0; + return nvme_submit_admin_cmd(dev, &c, NULL); } static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) @@ -1087,151 +1006,170 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, } /** - * nvme_abort_cmd - Attempt aborting a command - * @cmdid: Command id of a timed out IO - * @queue: The queue with timed out IO + * nvme_abort_req - Attempt aborting a request * * Schedule controller reset if the command was already aborted once before and * still hasn't been returned to the driver, or if this is the admin queue. */ -static void nvme_abort_cmd(int cmdid, struct nvme_queue *nvmeq) +static void nvme_abort_req(struct request *req) { - int a_cmdid; - struct nvme_command cmd; + struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd_rq->nvmeq; struct nvme_dev *dev = nvmeq->dev; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - struct nvme_queue *adminq; + struct request *abort_req; + struct nvme_cmd_info *abort_cmd; + struct nvme_command cmd; + + if (!nvmeq->qid || cmd_rq->aborted) { + unsigned long flags; - if (!nvmeq->qid || info[cmdid].aborted) { + spin_lock_irqsave(&dev_list_lock, flags); if (work_busy(&dev->reset_work)) - return; + goto out; list_del_init(&dev->node); dev_warn(&dev->pci_dev->dev, - "I/O %d QID %d timeout, reset controller\n", cmdid, - nvmeq->qid); + "I/O %d QID %d timeout, reset controller\n", + req->tag, nvmeq->qid); dev->reset_workfn = nvme_reset_failed_dev; queue_work(nvme_workq, &dev->reset_work); + out: + spin_unlock_irqrestore(&dev_list_lock, flags); return; } 
if (!dev->abort_limit) return; - adminq = rcu_dereference(dev->queues[0]); - a_cmdid = alloc_cmdid(adminq, CMD_CTX_ABORT, special_completion, - ADMIN_TIMEOUT); - if (a_cmdid < 0) + abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_ATOMIC, + false); + if (IS_ERR(abort_req)) return; + abort_cmd = blk_mq_rq_to_pdu(abort_req); + nvme_set_info(abort_cmd, abort_req, abort_completion); + memset(&cmd, 0, sizeof(cmd)); cmd.abort.opcode = nvme_admin_abort_cmd; - cmd.abort.cid = cmdid; + cmd.abort.cid = req->tag; cmd.abort.sqid = cpu_to_le16(nvmeq->qid); - cmd.abort.command_id = a_cmdid; + cmd.abort.command_id = abort_req->tag; --dev->abort_limit; - info[cmdid].aborted = 1; - info[cmdid].timeout = jiffies + ADMIN_TIMEOUT; + cmd_rq->aborted = 1; - dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", cmdid, + dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag, nvmeq->qid); - nvme_submit_cmd(adminq, &cmd); + if (nvme_submit_cmd(dev->queues[0], &cmd) < 0) { + dev_warn(nvmeq->q_dmadev, + "Could not abort I/O %d QID %d", + req->tag, nvmeq->qid); + blk_mq_free_request(abort_req); + } } -/** - * nvme_cancel_ios - Cancel outstanding I/Os - * @queue: The queue to cancel I/Os on - * @timeout: True to only cancel I/Os which have timed out - */ -static void nvme_cancel_ios(struct nvme_queue *nvmeq, bool timeout) +static void nvme_cancel_queue_ios(struct blk_mq_hw_ctx *hctx, + struct request *req, void *data, bool reserved) { - int depth = nvmeq->q_depth - 1; - struct nvme_cmd_info *info = nvme_cmd_info(nvmeq); - unsigned long now = jiffies; - int cmdid; + struct nvme_queue *nvmeq = data; + void *ctx; + nvme_completion_fn fn; + struct nvme_cmd_info *cmd; + struct nvme_completion cqe; - for_each_set_bit(cmdid, nvmeq->cmdid_data, depth) { - void *ctx; - nvme_completion_fn fn; - static struct nvme_completion cqe = { - .status = cpu_to_le16(NVME_SC_ABORT_REQ << 1), - }; + if (!blk_mq_request_started(req)) + return; - if (timeout && !time_after(now, info[cmdid].timeout)) - 
continue; - if (info[cmdid].ctx == CMD_CTX_CANCELLED) - continue; - if (timeout && nvmeq->dev->initialized) { - nvme_abort_cmd(cmdid, nvmeq); - continue; - } - dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", cmdid, - nvmeq->qid); - ctx = cancel_cmdid(nvmeq, cmdid, &fn); - fn(nvmeq, ctx, &cqe); - } + cmd = blk_mq_rq_to_pdu(req); + + if (cmd->ctx == CMD_CTX_CANCELLED) + return; + + if (blk_queue_dying(req->q)) + cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); + else + cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); + + + dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", + req->tag, nvmeq->qid); + ctx = cancel_cmd_info(cmd, &fn); + fn(nvmeq, ctx, &cqe); } -static void nvme_free_queue(struct rcu_head *r) +static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) { - struct nvme_queue *nvmeq = container_of(r, struct nvme_queue, r_head); + struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = cmd->nvmeq; + + /* + * The aborted req will be completed on receiving the abort req. + * We enable the timer again. If hit twice, it'll cause a device reset, + * as the device then is in a faulty state. + */ + int ret = BLK_EH_RESET_TIMER; + + dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, + nvmeq->qid); spin_lock_irq(&nvmeq->q_lock); - while (bio_list_peek(&nvmeq->sq_cong)) { - struct bio *bio = bio_list_pop(&nvmeq->sq_cong); - bio_endio(bio, -EIO); - } - while (!list_empty(&nvmeq->iod_bio)) { - static struct nvme_completion cqe = { - .status = cpu_to_le16( - (NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1), - }; - struct nvme_iod *iod = list_first_entry(&nvmeq->iod_bio, - struct nvme_iod, - node); - list_del(&iod->node); - bio_completion(nvmeq, iod, &cqe); - } + if (!nvmeq->dev->initialized) { + /* + * Force cancelled command frees the request, which requires we + * return BLK_EH_NOT_HANDLED. 
+ */ + nvme_cancel_queue_ios(nvmeq->hctx, req, nvmeq, reserved); + ret = BLK_EH_NOT_HANDLED; + } else + nvme_abort_req(req); spin_unlock_irq(&nvmeq->q_lock); + return ret; +} + +static void nvme_free_queue(struct nvme_queue *nvmeq) +{ dma_free_coherent(nvmeq->q_dmadev, CQ_SIZE(nvmeq->q_depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), nvmeq->sq_cmds, nvmeq->sq_dma_addr); - if (nvmeq->qid) - free_cpumask_var(nvmeq->cpu_mask); kfree(nvmeq); } static void nvme_free_queues(struct nvme_dev *dev, int lowest) { + LLIST_HEAD(q_list); + struct nvme_queue *nvmeq, *next; + struct llist_node *entry; int i; for (i = dev->queue_count - 1; i >= lowest; i--) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, i); - rcu_assign_pointer(dev->queues[i], NULL); - call_rcu(&nvmeq->r_head, nvme_free_queue); + struct nvme_queue *nvmeq = dev->queues[i]; + llist_add(&nvmeq->node, &q_list); dev->queue_count--; + dev->queues[i] = NULL; } + synchronize_rcu(); + entry = llist_del_all(&q_list); + llist_for_each_entry_safe(nvmeq, next, entry, node) + nvme_free_queue(nvmeq); } /** * nvme_suspend_queue - put queue into suspended state * @nvmeq - queue to suspend - * - * Returns 1 if already suspended, 0 otherwise. 
*/ static int nvme_suspend_queue(struct nvme_queue *nvmeq) { - int vector = nvmeq->dev->entry[nvmeq->cq_vector].vector; + int vector; spin_lock_irq(&nvmeq->q_lock); - if (nvmeq->q_suspended) { + if (nvmeq->cq_vector == -1) { spin_unlock_irq(&nvmeq->q_lock); return 1; } - nvmeq->q_suspended = 1; + vector = nvmeq->dev->entry[nvmeq->cq_vector].vector; nvmeq->dev->online_queues--; + nvmeq->cq_vector = -1; spin_unlock_irq(&nvmeq->q_lock); irq_set_affinity_hint(vector, NULL); @@ -1242,15 +1180,18 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) static void nvme_clear_queue(struct nvme_queue *nvmeq) { + struct blk_mq_hw_ctx *hctx = nvmeq->hctx; + spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); - nvme_cancel_ios(nvmeq, false); + if (hctx && hctx->tags) + blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq); spin_unlock_irq(&nvmeq->q_lock); } static void nvme_disable_queue(struct nvme_dev *dev, int qid) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, qid); + struct nvme_queue *nvmeq = dev->queues[qid]; if (!nvmeq) return; @@ -1263,32 +1204,29 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid) adapter_delete_sq(dev, qid); adapter_delete_cq(dev, qid); } + if (!qid && dev->admin_q) + blk_mq_freeze_queue_start(dev->admin_q); nvme_clear_queue(nvmeq); } static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, - int depth, int vector) + int depth) { struct device *dmadev = &dev->pci_dev->dev; - unsigned extra = nvme_queue_extra(depth); - struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq) + extra, GFP_KERNEL); + struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); if (!nvmeq) return NULL; - nvmeq->cqes = dma_alloc_coherent(dmadev, CQ_SIZE(depth), - &nvmeq->cq_dma_addr, GFP_KERNEL); + nvmeq->cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth), + &nvmeq->cq_dma_addr, GFP_KERNEL); if (!nvmeq->cqes) goto free_nvmeq; - memset((void *)nvmeq->cqes, 0, CQ_SIZE(depth)); nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), 
&nvmeq->sq_dma_addr, GFP_KERNEL); if (!nvmeq->sq_cmds) goto free_cqdma; - if (qid && !zalloc_cpumask_var(&nvmeq->cpu_mask, GFP_KERNEL)) - goto free_sqdma; - nvmeq->q_dmadev = dmadev; nvmeq->dev = dev; snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", @@ -1296,23 +1234,14 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, spin_lock_init(&nvmeq->q_lock); nvmeq->cq_head = 0; nvmeq->cq_phase = 1; - init_waitqueue_head(&nvmeq->sq_full); - init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread); - bio_list_init(&nvmeq->sq_cong); - INIT_LIST_HEAD(&nvmeq->iod_bio); nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; nvmeq->q_depth = depth; - nvmeq->cq_vector = vector; nvmeq->qid = qid; - nvmeq->q_suspended = 1; dev->queue_count++; - rcu_assign_pointer(dev->queues[qid], nvmeq); + dev->queues[qid] = nvmeq; return nvmeq; - free_sqdma: - dma_free_coherent(dmadev, SQ_SIZE(depth), (void *)nvmeq->sq_cmds, - nvmeq->sq_dma_addr); free_cqdma: dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); @@ -1335,17 +1264,15 @@ static int queue_request_irq(struct nvme_dev *dev, struct nvme_queue *nvmeq, static void nvme_init_queue(struct nvme_queue *nvmeq, u16 qid) { struct nvme_dev *dev = nvmeq->dev; - unsigned extra = nvme_queue_extra(nvmeq->q_depth); + spin_lock_irq(&nvmeq->q_lock); nvmeq->sq_tail = 0; nvmeq->cq_head = 0; nvmeq->cq_phase = 1; nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; - memset(nvmeq->cmdid_data, 0, extra); memset((void *)nvmeq->cqes, 0, CQ_SIZE(nvmeq->q_depth)); - nvme_cancel_ios(nvmeq, false); - nvmeq->q_suspended = 0; dev->online_queues++; + spin_unlock_irq(&nvmeq->q_lock); } static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) @@ -1353,6 +1280,7 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) struct nvme_dev *dev = nvmeq->dev; int result; + nvmeq->cq_vector = qid - 1; result = adapter_alloc_cq(dev, qid, nvmeq); if (result < 0) return result; @@ -1365,10 +1293,7 @@ 
static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) if (result < 0) goto release_sq; - spin_lock_irq(&nvmeq->q_lock); nvme_init_queue(nvmeq, qid); - spin_unlock_irq(&nvmeq->q_lock); - return result; release_sq: @@ -1408,27 +1333,32 @@ static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) */ static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) { - u32 cc = readl(&dev->bar->cc); + dev->ctrl_config &= ~NVME_CC_SHN_MASK; + dev->ctrl_config &= ~NVME_CC_ENABLE; + writel(dev->ctrl_config, &dev->bar->cc); - if (cc & NVME_CC_ENABLE) - writel(cc & ~NVME_CC_ENABLE, &dev->bar->cc); return nvme_wait_ready(dev, cap, false); } static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) { + dev->ctrl_config &= ~NVME_CC_SHN_MASK; + dev->ctrl_config |= NVME_CC_ENABLE; + writel(dev->ctrl_config, &dev->bar->cc); + return nvme_wait_ready(dev, cap, true); } static int nvme_shutdown_ctrl(struct nvme_dev *dev) { unsigned long timeout; - u32 cc; - cc = (readl(&dev->bar->cc) & ~NVME_CC_SHN_MASK) | NVME_CC_SHN_NORMAL; - writel(cc, &dev->bar->cc); + dev->ctrl_config &= ~NVME_CC_SHN_MASK; + dev->ctrl_config |= NVME_CC_SHN_NORMAL; - timeout = 2 * HZ + jiffies; + writel(dev->ctrl_config, &dev->bar->cc); + + timeout = SHUTDOWN_TIMEOUT + jiffies; while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != NVME_CSTS_SHST_CMPLT) { msleep(100); @@ -1444,20 +1374,93 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev) return 0; } +static struct blk_mq_ops nvme_mq_admin_ops = { + .queue_rq = nvme_admin_queue_rq, + .map_queue = blk_mq_map_queue, + .init_hctx = nvme_admin_init_hctx, + .exit_hctx = nvme_exit_hctx, + .init_request = nvme_admin_init_request, + .timeout = nvme_timeout, +}; + +static struct blk_mq_ops nvme_mq_ops = { + .queue_rq = nvme_queue_rq, + .map_queue = blk_mq_map_queue, + .init_hctx = nvme_init_hctx, + .exit_hctx = nvme_exit_hctx, + .init_request = nvme_init_request, + .timeout = nvme_timeout, +}; + +static void nvme_dev_remove_admin(struct nvme_dev 
*dev) +{ + if (dev->admin_q && !blk_queue_dying(dev->admin_q)) { + blk_cleanup_queue(dev->admin_q); + blk_mq_free_tag_set(&dev->admin_tagset); + } +} + +static int nvme_alloc_admin_tags(struct nvme_dev *dev) +{ + if (!dev->admin_q) { + dev->admin_tagset.ops = &nvme_mq_admin_ops; + dev->admin_tagset.nr_hw_queues = 1; + dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; + dev->admin_tagset.timeout = ADMIN_TIMEOUT; + dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev); + dev->admin_tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->admin_tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->admin_tagset)) + return -ENOMEM; + + dev->admin_q = blk_mq_init_queue(&dev->admin_tagset); + if (IS_ERR(dev->admin_q)) { + blk_mq_free_tag_set(&dev->admin_tagset); + return -ENOMEM; + } + if (!blk_get_queue(dev->admin_q)) { + nvme_dev_remove_admin(dev); + return -ENODEV; + } + } else + blk_mq_unfreeze_queue(dev->admin_q); + + return 0; +} + static int nvme_configure_admin_queue(struct nvme_dev *dev) { int result; u32 aqa; u64 cap = readq(&dev->bar->cap); struct nvme_queue *nvmeq; + unsigned page_shift = PAGE_SHIFT; + unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12; + unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12; + + if (page_shift < dev_page_min) { + dev_err(&dev->pci_dev->dev, + "Minimum device page size (%u) too large for " + "host (%u)\n", 1 << dev_page_min, + 1 << page_shift); + return -ENODEV; + } + if (page_shift > dev_page_max) { + dev_info(&dev->pci_dev->dev, + "Device maximum page size (%u) smaller than " + "host (%u); enabling work-around\n", + 1 << dev_page_max, 1 << page_shift); + page_shift = dev_page_max; + } result = nvme_disable_ctrl(dev, cap); if (result < 0) return result; - nvmeq = raw_nvmeq(dev, 0); + nvmeq = dev->queues[0]; if (!nvmeq) { - nvmeq = nvme_alloc_queue(dev, 0, 64, 0); + nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH); if (!nvmeq) return -ENOMEM; } @@ -1465,27 +1468,30 @@ static int nvme_configure_admin_queue(struct 
nvme_dev *dev) aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; - dev->ctrl_config = NVME_CC_ENABLE | NVME_CC_CSS_NVM; - dev->ctrl_config |= (PAGE_SHIFT - 12) << NVME_CC_MPS_SHIFT; + dev->page_size = 1 << page_shift; + + dev->ctrl_config = NVME_CC_CSS_NVM; + dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; writel(aqa, &dev->bar->aqa); writeq(nvmeq->sq_dma_addr, &dev->bar->asq); writeq(nvmeq->cq_dma_addr, &dev->bar->acq); - writel(dev->ctrl_config, &dev->bar->cc); result = nvme_enable_ctrl(dev, cap); if (result) - return result; + goto free_nvmeq; + nvmeq->cq_vector = 0; result = queue_request_irq(dev, nvmeq, nvmeq->irqname); if (result) - return result; + goto free_nvmeq; - spin_lock_irq(&nvmeq->q_lock); - nvme_init_queue(nvmeq, 0); - spin_unlock_irq(&nvmeq->q_lock); + return result; + + free_nvmeq: + nvme_free_queues(dev, 0); return result; } @@ -1516,7 +1522,7 @@ struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, } err = -ENOMEM; - iod = nvme_alloc_iod(count, length, GFP_KERNEL); + iod = nvme_alloc_iod(count, length, dev, GFP_KERNEL); if (!iod) goto put_pages; @@ -1644,7 +1650,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) if (length != (io.nblocks + 1) << ns->lba_shift) status = -ENOMEM; else - status = nvme_submit_io_cmd(dev, &c, NULL); + status = nvme_submit_io_cmd(dev, ns, &c, NULL); if (meta_len) { if (status == NVME_SC_SUCCESS && !(io.opcode & 1)) { @@ -1676,10 +1682,10 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) return status; } -static int nvme_user_admin_cmd(struct nvme_dev *dev, - struct nvme_admin_cmd __user *ucmd) +static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd) { - struct nvme_admin_cmd cmd; + struct nvme_passthru_cmd cmd; struct nvme_command c; int status, length; struct nvme_iod 
*uninitialized_var(iod); @@ -1716,10 +1722,23 @@ static int nvme_user_admin_cmd(struct nvme_dev *dev, timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) : ADMIN_TIMEOUT; + if (length != cmd.data_len) status = -ENOMEM; - else - status = nvme_submit_sync_cmd(dev, 0, &c, &cmd.result, timeout); + else if (ns) { + struct request *req; + + req = blk_mq_alloc_request(ns->queue, WRITE, + (GFP_KERNEL|__GFP_WAIT), false); + if (IS_ERR(req)) + status = PTR_ERR(req); + else { + status = nvme_submit_sync_cmd(req, &c, &cmd.result, + timeout); + blk_mq_free_request(req); + } + } else + status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout); if (cmd.data_len) { nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); @@ -1743,7 +1762,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, force_successful_syscall_return(); return ns->ns_id; case NVME_IOCTL_ADMIN_CMD: - return nvme_user_admin_cmd(ns->dev, (void __user *)arg); + return nvme_user_cmd(ns->dev, NULL, (void __user *)arg); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->dev, ns, (void __user *)arg); case NVME_IOCTL_SUBMIT_IO: return nvme_submit_io(ns, (void __user *)arg); case SG_GET_VERSION_NUM: @@ -1759,11 +1780,9 @@ static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { - struct nvme_ns *ns = bdev->bd_disk->private_data; - switch (cmd) { case SG_IO: - return nvme_sg_io32(ns, arg); + return -ENOIOCTLCMD; } return nvme_ioctl(bdev, mode, cmd, arg); } @@ -1773,11 +1792,18 @@ static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, static int nvme_open(struct block_device *bdev, fmode_t mode) { - struct nvme_ns *ns = bdev->bd_disk->private_data; - struct nvme_dev *dev = ns->dev; + int ret = 0; + struct nvme_ns *ns; - kref_get(&dev->kref); - return 0; + spin_lock(&dev_list_lock); + ns = bdev->bd_disk->private_data; + if (!ns) + ret = 
-ENXIO; + else if (!kref_get_unless_zero(&ns->dev->kref)) + ret = -ENXIO; + spin_unlock(&dev_list_lock); + + return ret; } static void nvme_free_dev(struct kref *kref); @@ -1799,6 +1825,35 @@ static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) return 0; } +static int nvme_revalidate_disk(struct gendisk *disk) +{ + struct nvme_ns *ns = disk->private_data; + struct nvme_dev *dev = ns->dev; + struct nvme_id_ns *id; + dma_addr_t dma_addr; + int lbaf; + + id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, + GFP_KERNEL); + if (!id) { + dev_warn(&dev->pci_dev->dev, "%s: Memory alocation failure\n", + __func__); + return 0; + } + + if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) + goto free; + + lbaf = id->flbas & 0xf; + ns->lba_shift = id->lbaf[lbaf].ds; + + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + free: + dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); + return 0; +} + static const struct block_device_operations nvme_fops = { .owner = THIS_MODULE, .ioctl = nvme_ioctl, @@ -1806,43 +1861,9 @@ static const struct block_device_operations nvme_fops = { .open = nvme_open, .release = nvme_release, .getgeo = nvme_getgeo, + .revalidate_disk= nvme_revalidate_disk, }; -static void nvme_resubmit_iods(struct nvme_queue *nvmeq) -{ - struct nvme_iod *iod, *next; - - list_for_each_entry_safe(iod, next, &nvmeq->iod_bio, node) { - if (unlikely(nvme_submit_iod(nvmeq, iod))) - break; - list_del(&iod->node); - if (bio_list_empty(&nvmeq->sq_cong) && - list_empty(&nvmeq->iod_bio)) - remove_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - } -} - -static void nvme_resubmit_bios(struct nvme_queue *nvmeq) -{ - while (bio_list_peek(&nvmeq->sq_cong)) { - struct bio *bio = bio_list_pop(&nvmeq->sq_cong); - struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data; - - if (bio_list_empty(&nvmeq->sq_cong) && - list_empty(&nvmeq->iod_bio)) - 
remove_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - if (nvme_submit_bio_queue(nvmeq, ns, bio)) { - if (!waitqueue_active(&nvmeq->sq_full)) - add_wait_queue(&nvmeq->sq_full, - &nvmeq->sq_cong_wait); - bio_list_add_head(&nvmeq->sq_cong, bio); - break; - } - } -} - static int nvme_kthread(void *data) { struct nvme_dev *dev, *next; @@ -1858,28 +1879,26 @@ static int nvme_kthread(void *data) continue; list_del_init(&dev->node); dev_warn(&dev->pci_dev->dev, - "Failed status, reset controller\n"); + "Failed status: %x, reset controller\n", + readl(&dev->bar->csts)); dev->reset_workfn = nvme_reset_failed_dev; queue_work(nvme_workq, &dev->reset_work); continue; } - rcu_read_lock(); for (i = 0; i < dev->queue_count; i++) { - struct nvme_queue *nvmeq = - rcu_dereference(dev->queues[i]); + struct nvme_queue *nvmeq = dev->queues[i]; if (!nvmeq) continue; spin_lock_irq(&nvmeq->q_lock); - if (nvmeq->q_suspended) - goto unlock; nvme_process_cq(nvmeq); - nvme_cancel_ios(nvmeq, true); - nvme_resubmit_bios(nvmeq); - nvme_resubmit_iods(nvmeq); - unlock: + + while ((i == 0) && (dev->event_limit > 0)) { + if (nvme_submit_async_admin_req(dev)) + break; + dev->event_limit--; + } spin_unlock_irq(&nvmeq->q_lock); } - rcu_read_unlock(); } spin_unlock(&dev_list_lock); schedule_timeout(round_jiffies_relative(HZ)); @@ -1902,28 +1921,28 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, { struct nvme_ns *ns; struct gendisk *disk; + int node = dev_to_node(&dev->pci_dev->dev); int lbaf; if (rt->attributes & NVME_LBART_ATTRIB_HIDE) return NULL; - ns = kzalloc(sizeof(*ns), GFP_KERNEL); + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) return NULL; - ns->queue = blk_alloc_queue(GFP_KERNEL); - if (!ns->queue) + ns->queue = blk_mq_init_queue(&dev->tagset); + if (IS_ERR(ns->queue)) goto out_free_ns; - ns->queue->queue_flags = QUEUE_FLAG_DEFAULT; queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); - 
queue_flag_clear_unlocked(QUEUE_FLAG_ADD_RANDOM, ns->queue); - blk_queue_make_request(ns->queue, nvme_make_request); + queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, ns->queue); ns->dev = dev; ns->queue->queuedata = ns; - disk = alloc_disk(0); + disk = alloc_disk_node(0, node); if (!disk) goto out_free_queue; + ns->ns_id = nsid; ns->disk = disk; lbaf = id->flbas & 0xf; @@ -1932,6 +1951,8 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); if (dev->max_hw_sectors) blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); + if (dev->stripe_size) + blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); if (dev->vwc & NVME_CTRL_VWC_PRESENT) blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); @@ -1957,141 +1978,17 @@ static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid, return NULL; } -static int nvme_find_closest_node(int node) -{ - int n, val, min_val = INT_MAX, best_node = node; - - for_each_online_node(n) { - if (n == node) - continue; - val = node_distance(node, n); - if (val < min_val) { - min_val = val; - best_node = n; - } - } - return best_node; -} - -static void nvme_set_queue_cpus(cpumask_t *qmask, struct nvme_queue *nvmeq, - int count) -{ - int cpu; - for_each_cpu(cpu, qmask) { - if (cpumask_weight(nvmeq->cpu_mask) >= count) - break; - if (!cpumask_test_and_set_cpu(cpu, nvmeq->cpu_mask)) - *per_cpu_ptr(nvmeq->dev->io_queue, cpu) = nvmeq->qid; - } -} - -static void nvme_add_cpus(cpumask_t *mask, const cpumask_t *unassigned_cpus, - const cpumask_t *new_mask, struct nvme_queue *nvmeq, int cpus_per_queue) -{ - int next_cpu; - for_each_cpu(next_cpu, new_mask) { - cpumask_or(mask, mask, get_cpu_mask(next_cpu)); - cpumask_or(mask, mask, topology_thread_cpumask(next_cpu)); - cpumask_and(mask, mask, unassigned_cpus); - nvme_set_queue_cpus(mask, nvmeq, cpus_per_queue); - } -} - static void nvme_create_io_queues(struct nvme_dev *dev) { - unsigned i, max; - - max = 
min(dev->max_qid, num_online_cpus()); - for (i = dev->queue_count; i <= max; i++) - if (!nvme_alloc_queue(dev, i, dev->q_depth, i - 1)) - break; + unsigned i; - max = min(dev->queue_count - 1, num_online_cpus()); - for (i = dev->online_queues; i <= max; i++) - if (nvme_create_queue(raw_nvmeq(dev, i), i)) + for (i = dev->queue_count; i <= dev->max_qid; i++) + if (!nvme_alloc_queue(dev, i, dev->q_depth)) break; -} - -/* - * If there are fewer queues than online cpus, this will try to optimally - * assign a queue to multiple cpus by grouping cpus that are "close" together: - * thread siblings, core, socket, closest node, then whatever else is - * available. - */ -static void nvme_assign_io_queues(struct nvme_dev *dev) -{ - unsigned cpu, cpus_per_queue, queues, remainder, i; - cpumask_var_t unassigned_cpus; - - nvme_create_io_queues(dev); - - queues = min(dev->online_queues - 1, num_online_cpus()); - if (!queues) - return; - cpus_per_queue = num_online_cpus() / queues; - remainder = queues - (num_online_cpus() - queues * cpus_per_queue); - - if (!alloc_cpumask_var(&unassigned_cpus, GFP_KERNEL)) - return; - - cpumask_copy(unassigned_cpus, cpu_online_mask); - cpu = cpumask_first(unassigned_cpus); - for (i = 1; i <= queues; i++) { - struct nvme_queue *nvmeq = lock_nvmeq(dev, i); - cpumask_t mask; - - cpumask_clear(nvmeq->cpu_mask); - if (!cpumask_weight(unassigned_cpus)) { - unlock_nvmeq(nvmeq); + for (i = dev->online_queues; i <= dev->queue_count - 1; i++) + if (nvme_create_queue(dev->queues[i], i)) break; - } - - mask = *get_cpu_mask(cpu); - nvme_set_queue_cpus(&mask, nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - topology_thread_cpumask(cpu), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - topology_core_cpumask(cpu), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - 
cpumask_of_node(cpu_to_node(cpu)), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - cpumask_of_node( - nvme_find_closest_node( - cpu_to_node(cpu))), - nvmeq, cpus_per_queue); - if (cpus_weight(mask) < cpus_per_queue) - nvme_add_cpus(&mask, unassigned_cpus, - unassigned_cpus, - nvmeq, cpus_per_queue); - - WARN(cpumask_weight(nvmeq->cpu_mask) != cpus_per_queue, - "nvme%d qid:%d mis-matched queue-to-cpu assignment\n", - dev->instance, i); - - irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, - nvmeq->cpu_mask); - cpumask_andnot(unassigned_cpus, unassigned_cpus, - nvmeq->cpu_mask); - cpu = cpumask_next(cpu, unassigned_cpus); - if (remainder && !--remainder) - cpus_per_queue++; - unlock_nvmeq(nvmeq); - } - WARN(cpumask_weight(unassigned_cpus), "nvme%d unassigned online cpus\n", - dev->instance); - i = 0; - cpumask_andnot(unassigned_cpus, cpu_possible_mask, cpu_online_mask); - for_each_cpu(cpu, unassigned_cpus) - *per_cpu_ptr(dev->io_queue, cpu) = (i++ % queues) + 1; - free_cpumask_var(unassigned_cpus); } static int set_queue_count(struct nvme_dev *dev, int count) @@ -2107,7 +2004,7 @@ static int set_queue_count(struct nvme_dev *dev, int count) if (status > 0) { dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n", status); - return -EBUSY; + return 0; } return min(result & 0xffff, result >> 16) + 1; } @@ -2117,39 +2014,15 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) return 4096 + ((nr_io_queues + 1) * 8 * dev->db_stride); } -static void nvme_cpu_workfn(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, cpu_work); - if (dev->initialized) - nvme_assign_io_queues(dev); -} - -static int nvme_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) -{ - struct nvme_dev *dev; - - switch (action) { - case CPU_ONLINE: - case CPU_DEAD: - spin_lock(&dev_list_lock); - list_for_each_entry(dev, &dev_list, node) - 
schedule_work(&dev->cpu_work); - spin_unlock(&dev_list_lock); - break; - } - return NOTIFY_OK; -} - static int nvme_setup_io_queues(struct nvme_dev *dev) { - struct nvme_queue *adminq = raw_nvmeq(dev, 0); + struct nvme_queue *adminq = dev->queues[0]; struct pci_dev *pdev = dev->pci_dev; int result, i, vecs, nr_io_queues, size; nr_io_queues = num_possible_cpus(); result = set_queue_count(dev, nr_io_queues); - if (result < 0) + if (result <= 0) return result; if (result < nr_io_queues) nr_io_queues = result; @@ -2172,6 +2045,13 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) /* Deregister the admin queue's interrupt */ free_irq(dev->entry[0].vector, adminq); + /* + * If we enable msix early due to not intx, disable it again before + * setting up the full range we need. + */ + if (!pdev->irq) + pci_disable_msix(pdev); + for (i = 0; i < nr_io_queues; i++) dev->entry[i].entry = i; vecs = pci_enable_msix_range(pdev, dev->entry, 1, nr_io_queues); @@ -2195,14 +2075,12 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) dev->max_qid = nr_io_queues; result = queue_request_irq(dev, adminq, adminq->irqname); - if (result) { - adminq->q_suspended = 1; + if (result) goto free_queues; - } /* Free previously allocated queues that are no longer usable */ nvme_free_queues(dev, nr_io_queues + 1); - nvme_assign_io_queues(dev); + nvme_create_io_queues(dev); return 0; @@ -2245,14 +2123,37 @@ static int nvme_dev_add(struct nvme_dev *dev) dev->oncs = le16_to_cpup(&ctrl->oncs); dev->abort_limit = ctrl->acl + 1; dev->vwc = ctrl->vwc; + dev->event_limit = min(ctrl->aerl + 1, 8); memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); if (ctrl->mdts) dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); if ((pdev->vendor == PCI_VENDOR_ID_INTEL) && - (pdev->device == 0x0953) && ctrl->vs[3]) + (pdev->device == 0x0953) && ctrl->vs[3]) { + unsigned int max_hw_sectors; + dev->stripe_size = 
1 << (ctrl->vs[3] + shift); + max_hw_sectors = dev->stripe_size >> (shift - 9); + if (dev->max_hw_sectors) { + dev->max_hw_sectors = min(max_hw_sectors, + dev->max_hw_sectors); + } else + dev->max_hw_sectors = max_hw_sectors; + } + + dev->tagset.ops = &nvme_mq_ops; + dev->tagset.nr_hw_queues = dev->online_queues - 1; + dev->tagset.timeout = NVME_IO_TIMEOUT; + dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev); + dev->tagset.queue_depth = + min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; + dev->tagset.cmd_size = sizeof(struct nvme_cmd_info); + dev->tagset.flags = BLK_MQ_F_SHOULD_MERGE; + dev->tagset.driver_data = dev; + + if (blk_mq_alloc_tag_set(&dev->tagset)) + goto out; id_ns = mem; for (i = 1; i <= nn; i++) { @@ -2293,6 +2194,9 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->entry[0].vector = pdev->irq; pci_set_master(pdev); bars = pci_select_bars(pdev, IORESOURCE_MEM); + if (!bars) + goto disable_pci; + if (pci_request_selected_regions(pdev, bars, "nvme")) goto disable_pci; @@ -2303,10 +2207,22 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); if (!dev->bar) goto disable; + if (readl(&dev->bar->csts) == -1) { result = -ENODEV; goto unmap; } + + /* + * Some devices don't advertse INTx interrupts, pre-enable a single + * MSIX vec for setup. We'll adjust this later. + */ + if (!pdev->irq) { + result = pci_enable_msix(pdev, dev->entry, 1); + if (result < 0) + goto unmap; + } + cap = readq(&dev->bar->cap); dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); dev->db_stride = 1 << NVME_CAP_STRIDE(cap); @@ -2358,13 +2274,18 @@ static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev) break; if (!schedule_timeout(ADMIN_TIMEOUT) || fatal_signal_pending(current)) { + /* + * Disable the controller first since we can't trust it + * at this point, but leave the admin queue enabled + * until all queue deletion requests are flushed. 
+ * FIXME: This may take a while if there are more h/w + * queues than admin tags. + */ set_current_state(TASK_RUNNING); - nvme_disable_ctrl(dev, readq(&dev->bar->cap)); - nvme_disable_queue(dev, 0); - - send_sig(SIGKILL, dq->worker->task, 1); + nvme_clear_queue(dev->queues[0]); flush_kthread_worker(dq->worker); + nvme_disable_queue(dev, 0); return; } } @@ -2402,7 +2323,8 @@ static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, c.delete_queue.qid = cpu_to_le16(nvmeq->qid); init_kthread_work(&nvmeq->cmdinfo.work, fn); - return nvme_submit_admin_cmd_async(nvmeq->dev, &c, &nvmeq->cmdinfo); + return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo, + ADMIN_TIMEOUT); } static void nvme_del_cq_work_handler(struct kthread_work *work) @@ -2440,7 +2362,6 @@ static void nvme_del_queue_start(struct kthread_work *work) { struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, cmdinfo.work); - allow_signal(SIGKILL); if (nvme_delete_sq(nvmeq)) nvme_del_queue_end(nvmeq); } @@ -2465,7 +2386,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) atomic_set(&dq.refcount, 0); dq.worker = &worker; for (i = dev->queue_count - 1; i > 0; i--) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, i); + struct nvme_queue *nvmeq = dev->queues[i]; if (nvme_suspend_queue(nvmeq)) continue; @@ -2498,16 +2419,49 @@ static void nvme_dev_list_remove(struct nvme_dev *dev) kthread_stop(tmp); } +static void nvme_freeze_queues(struct nvme_dev *dev) +{ + struct nvme_ns *ns; + + list_for_each_entry(ns, &dev->namespaces, list) { + blk_mq_freeze_queue_start(ns->queue); + + spin_lock(ns->queue->queue_lock); + queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); + spin_unlock(ns->queue->queue_lock); + + blk_mq_cancel_requeue_work(ns->queue); + blk_mq_stop_hw_queues(ns->queue); + } +} + +static void nvme_unfreeze_queues(struct nvme_dev *dev) +{ + struct nvme_ns *ns; + + list_for_each_entry(ns, &dev->namespaces, list) { + queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); 
+ blk_mq_unfreeze_queue(ns->queue); + blk_mq_start_stopped_hw_queues(ns->queue, true); + blk_mq_kick_requeue_list(ns->queue); + } +} + static void nvme_dev_shutdown(struct nvme_dev *dev) { int i; + u32 csts = -1; dev->initialized = 0; nvme_dev_list_remove(dev); - if (!dev->bar || (dev->bar && readl(&dev->bar->csts) == -1)) { + if (dev->bar) { + nvme_freeze_queues(dev); + csts = readl(&dev->bar->csts); + } + if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { for (i = dev->queue_count - 1; i >= 0; i--) { - struct nvme_queue *nvmeq = raw_nvmeq(dev, i); + struct nvme_queue *nvmeq = dev->queues[i]; nvme_suspend_queue(nvmeq); nvme_clear_queue(nvmeq); } @@ -2526,8 +2480,10 @@ static void nvme_dev_remove(struct nvme_dev *dev) list_for_each_entry(ns, &dev->namespaces, list) { if (ns->disk->flags & GENHD_FL_UP) del_gendisk(ns->disk); - if (!blk_queue_dying(ns->queue)) + if (!blk_queue_dying(ns->queue)) { + blk_mq_abort_requeue_list(ns->queue); blk_cleanup_queue(ns->queue); + } } } @@ -2590,6 +2546,11 @@ static void nvme_free_namespaces(struct nvme_dev *dev) list_for_each_entry_safe(ns, next, &dev->namespaces, list) { list_del(&ns->list); + + spin_lock(&dev_list_lock); + ns->disk->private_data = NULL; + spin_unlock(&dev_list_lock); + put_disk(ns->disk); kfree(ns); } @@ -2599,8 +2560,11 @@ static void nvme_free_dev(struct kref *kref) { struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); + pci_dev_put(dev->pci_dev); nvme_free_namespaces(dev); - free_percpu(dev->io_queue); + nvme_release_instance(dev); + blk_mq_free_tag_set(&dev->tagset); + blk_put_queue(dev->admin_q); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2625,9 +2589,16 @@ static int nvme_dev_release(struct inode *inode, struct file *f) static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) { struct nvme_dev *dev = f->private_data; + struct nvme_ns *ns; + switch (cmd) { case NVME_IOCTL_ADMIN_CMD: - return nvme_user_admin_cmd(dev, (void __user *)arg); + return 
nvme_user_cmd(dev, NULL, (void __user *)arg); + case NVME_IOCTL_IO_CMD: + if (list_empty(&dev->namespaces)) + return -ENOTTY; + ns = list_first_entry(&dev->namespaces, struct nvme_ns, list); + return nvme_user_cmd(dev, ns, (void __user *)arg); default: return -ENOTTY; } @@ -2641,6 +2612,22 @@ static const struct file_operations nvme_dev_fops = { .compat_ioctl = nvme_dev_ioctl, }; +static void nvme_set_irq_hints(struct nvme_dev *dev) +{ + struct nvme_queue *nvmeq; + int i; + + for (i = 0; i < dev->online_queues; i++) { + nvmeq = dev->queues[i]; + + if (!nvmeq->hctx) + continue; + + irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, + nvmeq->hctx->cpumask); + } +} + static int nvme_dev_start(struct nvme_dev *dev) { int result; @@ -2664,7 +2651,7 @@ static int nvme_dev_start(struct nvme_dev *dev) if (start_thread) { nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); - wake_up(&nvme_kthread_wait); + wake_up_all(&nvme_kthread_wait); } else wait_event_killable(nvme_kthread_wait, nvme_thread); @@ -2673,12 +2660,21 @@ static int nvme_dev_start(struct nvme_dev *dev) goto disable; } - result = nvme_setup_io_queues(dev); - if (result && result != -EBUSY) + nvme_init_queue(dev->queues[0], 0); + result = nvme_alloc_admin_tags(dev); + if (result) goto disable; + result = nvme_setup_io_queues(dev); + if (result) + goto free_tags; + + nvme_set_irq_hints(dev); + return result; + free_tags: + nvme_dev_remove_admin(dev); disable: nvme_disable_queue(dev, 0); nvme_dev_list_remove(dev); @@ -2693,7 +2689,7 @@ static int nvme_remove_dead_ctrl(void *arg) struct pci_dev *pdev = dev->pci_dev; if (pci_get_drvdata(pdev)) - pci_stop_and_remove_bus_device(pdev); + pci_stop_and_remove_bus_device_locked(pdev); kref_put(&dev->kref, nvme_free_dev); return 0; } @@ -2702,8 +2698,8 @@ static void nvme_remove_disks(struct work_struct *ws) { struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); - nvme_dev_remove(dev); nvme_free_queues(dev, 1); + nvme_dev_remove(dev); } static 
int nvme_dev_resume(struct nvme_dev *dev) @@ -2711,13 +2707,16 @@ static int nvme_dev_resume(struct nvme_dev *dev) int ret; ret = nvme_dev_start(dev); - if (ret && ret != -EBUSY) + if (ret) return ret; - if (ret == -EBUSY) { + if (dev->online_queues < 2) { spin_lock(&dev_list_lock); dev->reset_workfn = nvme_remove_disks; queue_work(nvme_workq, &dev->reset_work); spin_unlock(&dev_list_lock); + } else { + nvme_unfreeze_queues(dev); + nvme_set_irq_hints(dev); } dev->initialized = 1; return 0; @@ -2727,7 +2726,7 @@ static void nvme_dev_reset(struct nvme_dev *dev) { nvme_dev_shutdown(dev); if (nvme_dev_resume(dev)) { - dev_err(&dev->pci_dev->dev, "Device failed to resume\n"); + dev_warn(&dev->pci_dev->dev, "Device failed to resume\n"); kref_get(&dev->kref); if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", dev->instance))) { @@ -2752,33 +2751,33 @@ static void nvme_reset_workfn(struct work_struct *work) static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { - int result = -ENOMEM; + int node, result = -ENOMEM; struct nvme_dev *dev; - dev = kzalloc(sizeof(*dev), GFP_KERNEL); + node = dev_to_node(&pdev->dev); + if (node == NUMA_NO_NODE) + set_dev_node(&pdev->dev, 0); + + dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) return -ENOMEM; - dev->entry = kcalloc(num_possible_cpus(), sizeof(*dev->entry), - GFP_KERNEL); + dev->entry = kzalloc_node(num_possible_cpus() * sizeof(*dev->entry), + GFP_KERNEL, node); if (!dev->entry) goto free; - dev->queues = kcalloc(num_possible_cpus() + 1, sizeof(void *), - GFP_KERNEL); + dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), + GFP_KERNEL, node); if (!dev->queues) goto free; - dev->io_queue = alloc_percpu(unsigned short); - if (!dev->io_queue) - goto free; INIT_LIST_HEAD(&dev->namespaces); dev->reset_workfn = nvme_reset_failed_dev; INIT_WORK(&dev->reset_work, nvme_reset_workfn); - INIT_WORK(&dev->cpu_work, nvme_cpu_workfn); - dev->pci_dev = pdev; + dev->pci_dev = 
pci_dev_get(pdev); pci_set_drvdata(pdev, dev); result = nvme_set_instance(dev); if (result) - goto free; + goto put_pci; result = nvme_setup_prp_pools(dev); if (result) @@ -2786,17 +2785,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) kref_init(&dev->kref); result = nvme_dev_start(dev); - if (result) { - if (result == -EBUSY) - goto create_cdev; + if (result) goto release_pools; - } - result = nvme_dev_add(dev); + if (dev->online_queues > 1) + result = nvme_dev_add(dev); if (result) goto shutdown; - create_cdev: scnprintf(dev->name, sizeof(dev->name), "nvme%d", dev->instance); dev->miscdev.minor = MISC_DYNAMIC_MINOR; dev->miscdev.parent = &pdev->dev; @@ -2806,11 +2802,14 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (result) goto remove; + nvme_set_irq_hints(dev); + dev->initialized = 1; return 0; remove: nvme_dev_remove(dev); + nvme_dev_remove_admin(dev); nvme_free_namespaces(dev); shutdown: nvme_dev_shutdown(dev); @@ -2819,8 +2818,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) nvme_release_prp_pools(dev); release: nvme_release_instance(dev); + put_pci: + pci_dev_put(dev->pci_dev); free: - free_percpu(dev->io_queue); kfree(dev->queues); kfree(dev->entry); kfree(dev); @@ -2829,12 +2829,12 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) { - struct nvme_dev *dev = pci_get_drvdata(pdev); + struct nvme_dev *dev = pci_get_drvdata(pdev); - if (prepare) - nvme_dev_shutdown(dev); - else - nvme_dev_resume(dev); + if (prepare) + nvme_dev_shutdown(dev); + else + nvme_dev_resume(dev); } static void nvme_shutdown(struct pci_dev *pdev) @@ -2853,13 +2853,11 @@ static void nvme_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); flush_work(&dev->reset_work); - flush_work(&dev->cpu_work); misc_deregister(&dev->miscdev); - nvme_dev_remove(dev); nvme_dev_shutdown(dev); + 
nvme_dev_remove(dev); + nvme_dev_remove_admin(dev); nvme_free_queues(dev, 0); - rcu_barrier(); - nvme_release_instance(dev); nvme_release_prp_pools(dev); kref_put(&dev->kref, nvme_free_dev); } @@ -2942,18 +2940,11 @@ static int __init nvme_init(void) else if (result > 0) nvme_major = result; - nvme_nb.notifier_call = &nvme_cpu_notify; - result = register_hotcpu_notifier(&nvme_nb); - if (result) - goto unregister_blkdev; - result = pci_register_driver(&nvme_driver); if (result) - goto unregister_hotcpu; + goto unregister_blkdev; return 0; - unregister_hotcpu: - unregister_hotcpu_notifier(&nvme_nb); unregister_blkdev: unregister_blkdev(nvme_major, "nvme"); kill_workq: @@ -2973,6 +2964,6 @@ static void __exit nvme_exit(void) MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>"); MODULE_LICENSE("GPL"); -MODULE_VERSION("0.9"); +MODULE_VERSION("1.0"); module_init(nvme_init); module_exit(nvme_exit); diff --git a/drivers/block/nvme-scsi.c b/drivers/block/nvme-scsi.c index a4cd6d691c63..5e78568026c3 100644 --- a/drivers/block/nvme-scsi.c +++ b/drivers/block/nvme-scsi.c @@ -329,7 +329,7 @@ INQUIRY_EVPD_BIT_MASK) ? 1 : 0) (GET_U32_FROM_CDB(cdb, READ_CAP_16_CDB_ALLOC_LENGTH_OFFSET)) #define IS_READ_CAP_16(cdb) \ -((cdb[0] == SERVICE_ACTION_IN && cdb[1] == SAI_READ_CAPACITY_16) ? 1 : 0) +((cdb[0] == SERVICE_ACTION_IN_16 && cdb[1] == SAI_READ_CAPACITY_16) ? 1 : 0) /* Request Sense Helper Macros */ #define GET_REQUEST_SENSE_ALLOC_LENGTH(cdb) \ @@ -2105,7 +2105,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, nvme_offset += unit_num_blocks; - nvme_sc = nvme_submit_io_cmd(dev, &c, NULL); + nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL); if (nvme_sc != NVME_SC_SUCCESS) { nvme_unmap_user_pages(dev, (is_write) ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE, @@ -2658,7 +2658,7 @@ static int nvme_trans_start_stop(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.common.opcode = nvme_cmd_flush; c.common.nsid = cpu_to_le32(ns->ns_id); - nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL); + nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); if (res) goto out; @@ -2686,7 +2686,7 @@ static int nvme_trans_synchronize_cache(struct nvme_ns *ns, c.common.opcode = nvme_cmd_flush; c.common.nsid = cpu_to_le32(ns->ns_id); - nvme_sc = nvme_submit_io_cmd(ns->dev, &c, NULL); + nvme_sc = nvme_submit_io_cmd(ns->dev, ns, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); if (res) @@ -2894,7 +2894,7 @@ static int nvme_trans_unmap(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.dsm.nr = cpu_to_le32(ndesc - 1); c.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); - nvme_sc = nvme_submit_io_cmd(dev, &c, NULL); + nvme_sc = nvme_submit_io_cmd(dev, ns, &c, NULL); res = nvme_trans_status_code(hdr, nvme_sc); dma_free_coherent(&dev->pci_dev->dev, ndesc * sizeof(*range), @@ -2915,6 +2915,14 @@ static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) if (copy_from_user(cmd, hdr->cmdp, hdr->cmd_len)) return -EFAULT; + /* + * Prime the hdr with good status for scsi commands that don't require + * an nvme command for translation. 
+ */ + retcode = nvme_trans_status_code(hdr, NVME_SC_SUCCESS); + if (retcode) + return retcode; + opcode = cmd[0]; switch (opcode) { @@ -2947,7 +2955,7 @@ static int nvme_scsi_translate(struct nvme_ns *ns, struct sg_io_hdr *hdr) case READ_CAPACITY: retcode = nvme_trans_read_capacity(ns, hdr, cmd); break; - case SERVICE_ACTION_IN: + case SERVICE_ACTION_IN_16: if (IS_READ_CAP_16(cmd)) retcode = nvme_trans_read_capacity(ns, hdr, cmd); else @@ -3016,152 +3024,6 @@ int nvme_sg_io(struct nvme_ns *ns, struct sg_io_hdr __user *u_hdr) return retcode; } -#ifdef CONFIG_COMPAT -typedef struct sg_io_hdr32 { - compat_int_t interface_id; /* [i] 'S' for SCSI generic (required) */ - compat_int_t dxfer_direction; /* [i] data transfer direction */ - unsigned char cmd_len; /* [i] SCSI command length ( <= 16 bytes) */ - unsigned char mx_sb_len; /* [i] max length to write to sbp */ - unsigned short iovec_count; /* [i] 0 implies no scatter gather */ - compat_uint_t dxfer_len; /* [i] byte count of data transfer */ - compat_uint_t dxferp; /* [i], [*io] points to data transfer memory - or scatter gather list */ - compat_uptr_t cmdp; /* [i], [*i] points to command to perform */ - compat_uptr_t sbp; /* [i], [*o] points to sense_buffer memory */ - compat_uint_t timeout; /* [i] MAX_UINT->no timeout (unit: millisec) */ - compat_uint_t flags; /* [i] 0 -> default, see SG_FLAG... 
*/ - compat_int_t pack_id; /* [i->o] unused internally (normally) */ - compat_uptr_t usr_ptr; /* [i->o] unused internally */ - unsigned char status; /* [o] scsi status */ - unsigned char masked_status; /* [o] shifted, masked scsi status */ - unsigned char msg_status; /* [o] messaging level data (optional) */ - unsigned char sb_len_wr; /* [o] byte count actually written to sbp */ - unsigned short host_status; /* [o] errors from host adapter */ - unsigned short driver_status; /* [o] errors from software driver */ - compat_int_t resid; /* [o] dxfer_len - actual_transferred */ - compat_uint_t duration; /* [o] time taken by cmd (unit: millisec) */ - compat_uint_t info; /* [o] auxiliary information */ -} sg_io_hdr32_t; /* 64 bytes long (on sparc32) */ - -typedef struct sg_iovec32 { - compat_uint_t iov_base; - compat_uint_t iov_len; -} sg_iovec32_t; - -static int sg_build_iovec(sg_io_hdr_t __user *sgio, void __user *dxferp, u16 iovec_count) -{ - sg_iovec_t __user *iov = (sg_iovec_t __user *) (sgio + 1); - sg_iovec32_t __user *iov32 = dxferp; - int i; - - for (i = 0; i < iovec_count; i++) { - u32 base, len; - - if (get_user(base, &iov32[i].iov_base) || - get_user(len, &iov32[i].iov_len) || - put_user(compat_ptr(base), &iov[i].iov_base) || - put_user(len, &iov[i].iov_len)) - return -EFAULT; - } - - if (put_user(iov, &sgio->dxferp)) - return -EFAULT; - return 0; -} - -int nvme_sg_io32(struct nvme_ns *ns, unsigned long arg) -{ - sg_io_hdr32_t __user *sgio32 = (sg_io_hdr32_t __user *)arg; - sg_io_hdr_t __user *sgio; - u16 iovec_count; - u32 data; - void __user *dxferp; - int err; - int interface_id; - - if (get_user(interface_id, &sgio32->interface_id)) - return -EFAULT; - if (interface_id != 'S') - return -EINVAL; - - if (get_user(iovec_count, &sgio32->iovec_count)) - return -EFAULT; - - { - void __user *top = compat_alloc_user_space(0); - void __user *new = compat_alloc_user_space(sizeof(sg_io_hdr_t) + - (iovec_count * sizeof(sg_iovec_t))); - if (new > top) - return -EINVAL; 
- - sgio = new; - } - - /* Ok, now construct. */ - if (copy_in_user(&sgio->interface_id, &sgio32->interface_id, - (2 * sizeof(int)) + - (2 * sizeof(unsigned char)) + - (1 * sizeof(unsigned short)) + - (1 * sizeof(unsigned int)))) - return -EFAULT; - - if (get_user(data, &sgio32->dxferp)) - return -EFAULT; - dxferp = compat_ptr(data); - if (iovec_count) { - if (sg_build_iovec(sgio, dxferp, iovec_count)) - return -EFAULT; - } else { - if (put_user(dxferp, &sgio->dxferp)) - return -EFAULT; - } - - { - unsigned char __user *cmdp; - unsigned char __user *sbp; - - if (get_user(data, &sgio32->cmdp)) - return -EFAULT; - cmdp = compat_ptr(data); - - if (get_user(data, &sgio32->sbp)) - return -EFAULT; - sbp = compat_ptr(data); - - if (put_user(cmdp, &sgio->cmdp) || - put_user(sbp, &sgio->sbp)) - return -EFAULT; - } - - if (copy_in_user(&sgio->timeout, &sgio32->timeout, - 3 * sizeof(int))) - return -EFAULT; - - if (get_user(data, &sgio32->usr_ptr)) - return -EFAULT; - if (put_user(compat_ptr(data), &sgio->usr_ptr)) - return -EFAULT; - - err = nvme_sg_io(ns, sgio); - if (err >= 0) { - void __user *datap; - - if (copy_in_user(&sgio32->pack_id, &sgio->pack_id, - sizeof(int)) || - get_user(datap, &sgio->usr_ptr) || - put_user((u32)(unsigned long)datap, - &sgio32->usr_ptr) || - copy_in_user(&sgio32->status, &sgio->status, - (4 * sizeof(unsigned char)) + - (2 * sizeof(unsigned short)) + - (3 * sizeof(int)))) - err = -EFAULT; - } - - return err; -} -#endif - int nvme_sg_get_version_num(int __user *ip) { return put_user(sg_version_num, ip); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 27b71a0b72d0..8a86b62466f7 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -2098,32 +2098,26 @@ static void rbd_dev_parent_put(struct rbd_device *rbd_dev) * If an image has a non-zero parent overlap, get a reference to its * parent. 
* - * We must get the reference before checking for the overlap to - * coordinate properly with zeroing the parent overlap in - * rbd_dev_v2_parent_info() when an image gets flattened. We - * drop it again if there is no overlap. - * * Returns true if the rbd device has a parent with a non-zero * overlap and a reference for it was successfully taken, or * false otherwise. */ static bool rbd_dev_parent_get(struct rbd_device *rbd_dev) { - int counter; + int counter = 0; if (!rbd_dev->parent_spec) return false; - counter = atomic_inc_return_safe(&rbd_dev->parent_ref); - if (counter > 0 && rbd_dev->parent_overlap) - return true; - - /* Image was flattened, but parent is not yet torn down */ + down_read(&rbd_dev->header_rwsem); + if (rbd_dev->parent_overlap) + counter = atomic_inc_return_safe(&rbd_dev->parent_ref); + up_read(&rbd_dev->header_rwsem); if (counter < 0) rbd_warn(rbd_dev, "parent reference overflow"); - return false; + return counter > 0; } /* @@ -2370,8 +2364,12 @@ static void rbd_img_obj_request_fill(struct rbd_obj_request *obj_request, opcode = CEPH_OSD_OP_READ; } - osd_req_op_extent_init(osd_request, num_ops, opcode, offset, length, - 0, 0); + if (opcode == CEPH_OSD_OP_DELETE) + osd_req_op_init(osd_request, num_ops, opcode); + else + osd_req_op_extent_init(osd_request, num_ops, opcode, + offset, length, 0, 0); + if (obj_request->type == OBJ_REQUEST_BIO) osd_req_op_extent_osd_data_bio(osd_request, num_ops, obj_request->bio_list, length); @@ -3405,8 +3403,7 @@ err_rq: if (result) rbd_warn(rbd_dev, "%s %llx at %llx result %d", obj_op_name(op_type), length, offset, result); - if (snapc) - ceph_put_snap_context(snapc); + ceph_put_snap_context(snapc); blk_end_request_all(rq, result); } @@ -4236,7 +4233,6 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) */ if (rbd_dev->parent_overlap) { rbd_dev->parent_overlap = 0; - smp_mb(); rbd_dev_parent_put(rbd_dev); pr_info("%s: clone image has been flattened\n", rbd_dev->disk->disk_name); @@ -4282,7 
+4278,6 @@ static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev) * treat it specially. */ rbd_dev->parent_overlap = overlap; - smp_mb(); if (!overlap) { /* A null parent_spec indicates it's the initial probe */ @@ -5111,10 +5106,7 @@ static void rbd_dev_unprobe(struct rbd_device *rbd_dev) { struct rbd_image_header *header; - /* Drop parent reference unless it's already been done (or none) */ - - if (rbd_dev->parent_overlap) - rbd_dev_parent_put(rbd_dev); + rbd_dev_parent_put(rbd_dev); /* Free dynamic fields from the header, then zero it out */ diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c index 40ee7705df63..ac8c62cb4875 100644 --- a/drivers/block/rsxx/dev.c +++ b/drivers/block/rsxx/dev.c @@ -112,37 +112,16 @@ static const struct block_device_operations rsxx_fops = { static void disk_stats_start(struct rsxx_cardinfo *card, struct bio *bio) { - struct hd_struct *part0 = &card->gendisk->part0; - int rw = bio_data_dir(bio); - int cpu; - - cpu = part_stat_lock(); - - part_round_stats(cpu, part0); - part_inc_in_flight(part0, rw); - - part_stat_unlock(); + generic_start_io_acct(bio_data_dir(bio), bio_sectors(bio), + &card->gendisk->part0); } static void disk_stats_complete(struct rsxx_cardinfo *card, struct bio *bio, unsigned long start_time) { - struct hd_struct *part0 = &card->gendisk->part0; - unsigned long duration = jiffies - start_time; - int rw = bio_data_dir(bio); - int cpu; - - cpu = part_stat_lock(); - - part_stat_add(cpu, part0, sectors[rw], bio_sectors(bio)); - part_stat_inc(cpu, part0, ios[rw]); - part_stat_add(cpu, part0, ticks[rw], duration); - - part_round_stats(cpu, part0); - part_dec_in_flight(part0, rw); - - part_stat_unlock(); + generic_end_io_acct(bio_data_dir(bio), &card->gendisk->part0, + start_time); } static void bio_dma_done_cb(struct rsxx_cardinfo *card, diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 0ebadf93b6c5..4b911ed96ea3 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ 
-23,8 +23,8 @@ #define DRV_MODULE_NAME "sunvdc" #define PFX DRV_MODULE_NAME ": " -#define DRV_MODULE_VERSION "1.1" -#define DRV_MODULE_RELDATE "February 13, 2013" +#define DRV_MODULE_VERSION "1.2" +#define DRV_MODULE_RELDATE "November 24, 2014" static char version[] = DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n"; @@ -40,6 +40,8 @@ MODULE_VERSION(DRV_MODULE_VERSION); #define WAITING_FOR_GEN_CMD 0x04 #define WAITING_FOR_ANY -1 +static struct workqueue_struct *sunvdc_wq; + struct vdc_req_entry { struct request *req; }; @@ -60,6 +62,10 @@ struct vdc_port { u64 max_xfer_size; u32 vdisk_block_size; + u64 ldc_timeout; + struct timer_list ldc_reset_timer; + struct work_struct ldc_reset_work; + /* The server fills these in for us in the disk attribute * ACK packet. */ @@ -71,6 +77,10 @@ struct vdc_port { char disk_name[32]; }; +static void vdc_ldc_reset(struct vdc_port *port); +static void vdc_ldc_reset_work(struct work_struct *work); +static void vdc_ldc_reset_timer(unsigned long _arg); + static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio) { return container_of(vio, struct vdc_port, vio); @@ -150,6 +160,21 @@ static const struct block_device_operations vdc_fops = { .ioctl = vdc_ioctl, }; +static void vdc_blk_queue_start(struct vdc_port *port) +{ + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + + /* restart blk queue when ring is half emptied. also called after + * handshake completes, so check for initial handshake before we've + * allocated a disk. 
+ */ + if (port->disk && blk_queue_stopped(port->disk->queue) && + vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50) { + blk_start_queue(port->disk->queue); + } + +} + static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for) { if (vio->cmp && @@ -163,7 +188,11 @@ static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for) static void vdc_handshake_complete(struct vio_driver_state *vio) { + struct vdc_port *port = to_vdc_port(vio); + + del_timer(&port->ldc_reset_timer); vdc_finish(vio, 0, WAITING_FOR_LINK_UP); + vdc_blk_queue_start(port); } static int vdc_handle_unknown(struct vdc_port *port, void *arg) @@ -269,7 +298,7 @@ static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr, ldc_unmap(port->vio.lp, desc->cookies, desc->ncookies); desc->hdr.state = VIO_DESC_FREE; - dr->cons = (index + 1) & (VDC_TX_RING_SIZE - 1); + dr->cons = vio_dring_next(dr, index); req = rqe->req; if (req == NULL) { @@ -281,10 +310,7 @@ static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr, __blk_end_request(req, (desc->status ? 
-EIO : 0), desc->size); - /* restart blk queue when ring is half emptied */ - if (blk_queue_stopped(port->disk->queue) && - vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50) - blk_start_queue(port->disk->queue); + vdc_blk_queue_start(port); } static int vdc_ack(struct vdc_port *port, void *msgbuf) @@ -317,17 +343,20 @@ static void vdc_event(void *arg, int event) spin_lock_irqsave(&vio->lock, flags); - if (unlikely(event == LDC_EVENT_RESET || - event == LDC_EVENT_UP)) { + if (unlikely(event == LDC_EVENT_RESET)) { vio_link_state_change(vio, event); - spin_unlock_irqrestore(&vio->lock, flags); - return; + queue_work(sunvdc_wq, &port->ldc_reset_work); + goto out; + } + + if (unlikely(event == LDC_EVENT_UP)) { + vio_link_state_change(vio, event); + goto out; } if (unlikely(event != LDC_EVENT_DATA_READY)) { - printk(KERN_WARNING PFX "Unexpected LDC event %d\n", event); - spin_unlock_irqrestore(&vio->lock, flags); - return; + pr_warn(PFX "Unexpected LDC event %d\n", event); + goto out; } err = 0; @@ -371,6 +400,7 @@ static void vdc_event(void *arg, int event) } if (err < 0) vdc_finish(&port->vio, err, WAITING_FOR_ANY); +out: spin_unlock_irqrestore(&vio->lock, flags); } @@ -403,6 +433,8 @@ static int __vdc_tx_trigger(struct vdc_port *port) delay = 128; } while (err == -EAGAIN); + if (err == -ENOTCONN) + vdc_ldc_reset(port); return err; } @@ -472,7 +504,7 @@ static int __send_request(struct request *req) printk(KERN_ERR PFX "vdc_tx_trigger() failure, err=%d\n", err); } else { port->req_id++; - dr->prod = (dr->prod + 1) & (VDC_TX_RING_SIZE - 1); + dr->prod = vio_dring_next(dr, dr->prod); } return err; @@ -626,7 +658,7 @@ static int generic_request(struct vdc_port *port, u8 op, void *buf, int len) err = __vdc_tx_trigger(port); if (err >= 0) { port->req_id++; - dr->prod = (dr->prod + 1) & (VDC_TX_RING_SIZE - 1); + dr->prod = vio_dring_next(dr, dr->prod); spin_unlock_irqrestore(&port->vio.lock, flags); wait_for_completion(&comp.com); @@ -690,12 +722,9 @@ static void 
vdc_free_tx_ring(struct vdc_port *port) } } -static int probe_disk(struct vdc_port *port) +static int vdc_port_up(struct vdc_port *port) { struct vio_completion comp; - struct request_queue *q; - struct gendisk *g; - int err; init_completion(&comp.com); comp.err = 0; @@ -703,10 +732,27 @@ static int probe_disk(struct vdc_port *port) port->vio.cmp = &comp; vio_port_up(&port->vio); - wait_for_completion(&comp.com); - if (comp.err) - return comp.err; + return comp.err; +} + +static void vdc_port_down(struct vdc_port *port) +{ + ldc_disconnect(port->vio.lp); + ldc_unbind(port->vio.lp); + vdc_free_tx_ring(port); + vio_ldc_free(&port->vio); +} + +static int probe_disk(struct vdc_port *port) +{ + struct request_queue *q; + struct gendisk *g; + int err; + + err = vdc_port_up(port); + if (err) + return err; if (vdc_version_supported(port, 1, 1)) { /* vdisk_size should be set during the handshake, if it wasn't @@ -819,6 +865,7 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) struct mdesc_handle *hp; struct vdc_port *port; int err; + const u64 *ldc_timeout; print_version(); @@ -848,6 +895,16 @@ static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id) VDCBLK_NAME "%c", 'a' + ((int)vdev->dev_no % 26)); port->vdisk_size = -1; + /* Actual wall time may be double due to do_generic_file_read() doing + * a readahead I/O first, and once that fails it will try to read a + * single page. + */ + ldc_timeout = mdesc_get_property(hp, vdev->mp, "vdc-timeout", NULL); + port->ldc_timeout = ldc_timeout ? 
*ldc_timeout : 0; + setup_timer(&port->ldc_reset_timer, vdc_ldc_reset_timer, + (unsigned long)port); + INIT_WORK(&port->ldc_reset_work, vdc_ldc_reset_work); + err = vio_driver_init(&port->vio, vdev, VDEV_DISK, vdc_versions, ARRAY_SIZE(vdc_versions), &vdc_vio_ops, port->disk_name); @@ -896,8 +953,21 @@ static int vdc_port_remove(struct vio_dev *vdev) struct vdc_port *port = dev_get_drvdata(&vdev->dev); if (port) { + unsigned long flags; + + spin_lock_irqsave(&port->vio.lock, flags); + blk_stop_queue(port->disk->queue); + spin_unlock_irqrestore(&port->vio.lock, flags); + + flush_work(&port->ldc_reset_work); + del_timer_sync(&port->ldc_reset_timer); del_timer_sync(&port->vio.timer); + del_gendisk(port->disk); + blk_cleanup_queue(port->disk->queue); + put_disk(port->disk); + port->disk = NULL; + vdc_free_tx_ring(port); vio_ldc_free(&port->vio); @@ -908,6 +978,102 @@ static int vdc_port_remove(struct vio_dev *vdev) return 0; } +static void vdc_requeue_inflight(struct vdc_port *port) +{ + struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING]; + u32 idx; + + for (idx = dr->cons; idx != dr->prod; idx = vio_dring_next(dr, idx)) { + struct vio_disk_desc *desc = vio_dring_entry(dr, idx); + struct vdc_req_entry *rqe = &port->rq_arr[idx]; + struct request *req; + + ldc_unmap(port->vio.lp, desc->cookies, desc->ncookies); + desc->hdr.state = VIO_DESC_FREE; + dr->cons = vio_dring_next(dr, idx); + + req = rqe->req; + if (req == NULL) { + vdc_end_special(port, desc); + continue; + } + + rqe->req = NULL; + blk_requeue_request(port->disk->queue, req); + } +} + +static void vdc_queue_drain(struct vdc_port *port) +{ + struct request *req; + + while ((req = blk_fetch_request(port->disk->queue)) != NULL) + __blk_end_request_all(req, -EIO); +} + +static void vdc_ldc_reset_timer(unsigned long _arg) +{ + struct vdc_port *port = (struct vdc_port *) _arg; + struct vio_driver_state *vio = &port->vio; + unsigned long flags; + + spin_lock_irqsave(&vio->lock, flags); + if 
(!(port->vio.hs_state & VIO_HS_COMPLETE)) { + pr_warn(PFX "%s ldc down %llu seconds, draining queue\n", + port->disk_name, port->ldc_timeout); + vdc_queue_drain(port); + vdc_blk_queue_start(port); + } + spin_unlock_irqrestore(&vio->lock, flags); +} + +static void vdc_ldc_reset_work(struct work_struct *work) +{ + struct vdc_port *port; + struct vio_driver_state *vio; + unsigned long flags; + + port = container_of(work, struct vdc_port, ldc_reset_work); + vio = &port->vio; + + spin_lock_irqsave(&vio->lock, flags); + vdc_ldc_reset(port); + spin_unlock_irqrestore(&vio->lock, flags); +} + +static void vdc_ldc_reset(struct vdc_port *port) +{ + int err; + + assert_spin_locked(&port->vio.lock); + + pr_warn(PFX "%s ldc link reset\n", port->disk_name); + blk_stop_queue(port->disk->queue); + vdc_requeue_inflight(port); + vdc_port_down(port); + + err = vio_ldc_alloc(&port->vio, &vdc_ldc_cfg, port); + if (err) { + pr_err(PFX "%s vio_ldc_alloc:%d\n", port->disk_name, err); + return; + } + + err = vdc_alloc_tx_ring(port); + if (err) { + pr_err(PFX "%s vio_alloc_tx_ring:%d\n", port->disk_name, err); + goto err_free_ldc; + } + + if (port->ldc_timeout) + mod_timer(&port->ldc_reset_timer, + round_jiffies(jiffies + HZ * port->ldc_timeout)); + mod_timer(&port->vio.timer, round_jiffies(jiffies + HZ)); + return; + +err_free_ldc: + vio_ldc_free(&port->vio); +} + static const struct vio_device_id vdc_port_match[] = { { .type = "vdc-port", @@ -927,9 +1093,13 @@ static int __init vdc_init(void) { int err; + sunvdc_wq = alloc_workqueue("sunvdc", 0, 0); + if (!sunvdc_wq) + return -ENOMEM; + err = register_blkdev(0, VDCBLK_NAME); if (err < 0) - goto out_err; + goto out_free_wq; vdc_major = err; @@ -943,7 +1113,8 @@ out_unregister_blkdev: unregister_blkdev(vdc_major, VDCBLK_NAME); vdc_major = 0; -out_err: +out_free_wq: + destroy_workqueue(sunvdc_wq); return err; } @@ -951,6 +1122,7 @@ static void __exit vdc_exit(void) { vio_unregister_driver(&vdc_port_driver); unregister_blkdev(vdc_major, 
VDCBLK_NAME); + destroy_workqueue(sunvdc_wq); } module_init(vdc_init); diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 6b44bbe528b7..b5afd495d482 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -971,7 +971,6 @@ static struct platform_driver swim_driver = { .remove = swim_remove, .driver = { .name = CARDNAME, - .owner = THIS_MODULE, }, }; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index c6a27d54ad62..cdfbd21e3597 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -80,7 +80,7 @@ static int __virtblk_add_req(struct virtqueue *vq, { struct scatterlist hdr, status, cmd, sense, inhdr, *sgs[6]; unsigned int num_out = 0, num_in = 0; - int type = vbr->out_hdr.type & ~VIRTIO_BLK_T_OUT; + __virtio32 type = vbr->out_hdr.type & ~cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT); sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr)); sgs[num_out++] = &hdr; @@ -91,19 +91,19 @@ static int __virtblk_add_req(struct virtqueue *vq, * block, and before the normal inhdr we put the sense data and the * inhdr with additional status information. 
*/ - if (type == VIRTIO_BLK_T_SCSI_CMD) { + if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) { sg_init_one(&cmd, vbr->req->cmd, vbr->req->cmd_len); sgs[num_out++] = &cmd; } if (have_data) { - if (vbr->out_hdr.type & VIRTIO_BLK_T_OUT) + if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT)) sgs[num_out++] = data_sg; else sgs[num_out + num_in++] = data_sg; } - if (type == VIRTIO_BLK_T_SCSI_CMD) { + if (type == cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_SCSI_CMD)) { sg_init_one(&sense, vbr->req->sense, SCSI_SENSE_BUFFERSIZE); sgs[num_out + num_in++] = &sense; sg_init_one(&inhdr, &vbr->in_hdr, sizeof(vbr->in_hdr)); @@ -119,12 +119,13 @@ static int __virtblk_add_req(struct virtqueue *vq, static inline void virtblk_request_done(struct request *req) { struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); + struct virtio_blk *vblk = req->q->queuedata; int error = virtblk_result(vbr); if (req->cmd_type == REQ_TYPE_BLOCK_PC) { - req->resid_len = vbr->in_hdr.residual; - req->sense_len = vbr->in_hdr.sense_len; - req->errors = vbr->in_hdr.errors; + req->resid_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.residual); + req->sense_len = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.sense_len); + req->errors = virtio32_to_cpu(vblk->vdev, vbr->in_hdr.errors); } else if (req->cmd_type == REQ_TYPE_SPECIAL) { req->errors = (error != 0); } @@ -158,10 +159,11 @@ static void virtblk_done(struct virtqueue *vq) spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); } -static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, - bool last) +static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) { struct virtio_blk *vblk = hctx->queue->queuedata; + struct request *req = bd->rq; struct virtblk_req *vbr = blk_mq_rq_to_pdu(req); unsigned long flags; unsigned int num; @@ -173,25 +175,25 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, vbr->req = req; if (req->cmd_flags & REQ_FLUSH) { - vbr->out_hdr.type 
= VIRTIO_BLK_T_FLUSH; + vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_FLUSH); vbr->out_hdr.sector = 0; - vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); } else { switch (req->cmd_type) { case REQ_TYPE_FS: vbr->out_hdr.type = 0; - vbr->out_hdr.sector = blk_rq_pos(vbr->req); - vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + vbr->out_hdr.sector = cpu_to_virtio64(vblk->vdev, blk_rq_pos(vbr->req)); + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); break; case REQ_TYPE_BLOCK_PC: - vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD; + vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_SCSI_CMD); vbr->out_hdr.sector = 0; - vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); break; case REQ_TYPE_SPECIAL: - vbr->out_hdr.type = VIRTIO_BLK_T_GET_ID; + vbr->out_hdr.type = cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_GET_ID); vbr->out_hdr.sector = 0; - vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); + vbr->out_hdr.ioprio = cpu_to_virtio32(vblk->vdev, req_get_ioprio(vbr->req)); break; default: /* We don't put anything else in the queue. 
*/ @@ -204,9 +206,9 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, num = blk_rq_map_sg(hctx->queue, vbr->req, vbr->sg); if (num) { if (rq_data_dir(vbr->req) == WRITE) - vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; + vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_OUT); else - vbr->out_hdr.type |= VIRTIO_BLK_T_IN; + vbr->out_hdr.type |= cpu_to_virtio32(vblk->vdev, VIRTIO_BLK_T_IN); } spin_lock_irqsave(&vblk->vqs[qid].lock, flags); @@ -222,7 +224,7 @@ static int virtio_queue_rq(struct blk_mq_hw_ctx *hctx, struct request *req, return BLK_MQ_RQ_QUEUE_ERROR; } - if (last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) + if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq)) notify = true; spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags); @@ -331,7 +333,8 @@ static ssize_t virtblk_serial_show(struct device *dev, return err; } -DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL); + +static DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL); static void virtblk_config_changed_work(struct work_struct *work) { @@ -476,7 +479,8 @@ static int virtblk_get_cache_mode(struct virtio_device *vdev) struct virtio_blk_config, wce, &writeback); if (err) - writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE); + writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_WCE) || + virtio_has_feature(vdev, VIRTIO_F_VERSION_1); return writeback; } @@ -634,7 +638,7 @@ static int virtblk_probe(struct virtio_device *vdev) goto out_put_disk; q = vblk->disk->queue = blk_mq_init_queue(&vblk->tag_set); - if (!q) { + if (IS_ERR(q)) { err = -ENOMEM; goto out_free_tags; } @@ -821,25 +825,34 @@ static const struct virtio_device_id id_table[] = { { 0 }, }; -static unsigned int features[] = { +static unsigned int features_legacy[] = { VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, VIRTIO_BLK_F_SCSI, VIRTIO_BLK_F_WCE, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE, VIRTIO_BLK_F_MQ, +} +; +static 
unsigned int features[] = { + VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, + VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, + VIRTIO_BLK_F_TOPOLOGY, + VIRTIO_BLK_F_MQ, }; static struct virtio_driver virtio_blk = { - .feature_table = features, - .feature_table_size = ARRAY_SIZE(features), - .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, - .id_table = id_table, - .probe = virtblk_probe, - .remove = virtblk_remove, - .config_changed = virtblk_config_changed, + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .feature_table_legacy = features_legacy, + .feature_table_size_legacy = ARRAY_SIZE(features_legacy), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtblk_probe, + .remove = virtblk_remove, + .config_changed = virtblk_config_changed, #ifdef CONFIG_PM_SLEEP - .freeze = virtblk_freeze, - .restore = virtblk_restore, + .freeze = virtblk_freeze, + .restore = virtblk_restore, #endif }; @@ -871,8 +884,8 @@ out_destroy_workqueue: static void __exit fini(void) { - unregister_blkdev(major, "virtblk"); unregister_virtio_driver(&virtio_blk); + unregister_blkdev(major, "virtblk"); destroy_workqueue(virtblk_wq); } module_init(init); diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 5ac312f6e0be..2236c6f31608 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -126,7 +126,6 @@ struct blkfront_info unsigned int persistent_gnts_c; unsigned long shadow_free; unsigned int feature_flush; - unsigned int flush_op; unsigned int feature_discard:1; unsigned int feature_secdiscard:1; unsigned int discard_granularity; @@ -479,7 +478,19 @@ static int blkif_queue_request(struct request *req) * way. (It's also a FLUSH+FUA, since it is * guaranteed ordered WRT previous writes.) 
*/ - ring_req->operation = info->flush_op; + switch (info->feature_flush & + ((REQ_FLUSH|REQ_FUA))) { + case REQ_FLUSH|REQ_FUA: + ring_req->operation = + BLKIF_OP_WRITE_BARRIER; + break; + case REQ_FLUSH: + ring_req->operation = + BLKIF_OP_FLUSH_DISKCACHE; + break; + default: + ring_req->operation = 0; + } } ring_req->u.rw.nr_segments = nseg; } @@ -582,12 +593,14 @@ static inline void flush_requests(struct blkfront_info *info) notify_remote_via_irq(info->irq); } -static inline bool blkif_request_flush_valid(struct request *req, - struct blkfront_info *info) +static inline bool blkif_request_flush_invalid(struct request *req, + struct blkfront_info *info) { return ((req->cmd_type != REQ_TYPE_FS) || - ((req->cmd_flags & (REQ_FLUSH | REQ_FUA)) && - !info->flush_op)); + ((req->cmd_flags & REQ_FLUSH) && + !(info->feature_flush & REQ_FLUSH)) || + ((req->cmd_flags & REQ_FUA) && + !(info->feature_flush & REQ_FUA))); } /* @@ -612,8 +625,8 @@ static void do_blkif_request(struct request_queue *rq) blk_start_request(req); - if (blkif_request_flush_valid(req, info)) { - __blk_end_request_all(req, -EIO); + if (blkif_request_flush_invalid(req, info)) { + __blk_end_request_all(req, -EOPNOTSUPP); continue; } @@ -683,20 +696,26 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, return 0; } +static const char *flush_info(unsigned int feature_flush) +{ + switch (feature_flush & ((REQ_FLUSH | REQ_FUA))) { + case REQ_FLUSH|REQ_FUA: + return "barrier: enabled;"; + case REQ_FLUSH: + return "flush diskcache: enabled;"; + default: + return "barrier or flush: disabled;"; + } +} static void xlvbd_flush(struct blkfront_info *info) { blk_queue_flush(info->rq, info->feature_flush); - printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n", - info->gd->disk_name, - info->flush_op == BLKIF_OP_WRITE_BARRIER ? - "barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ? - "flush diskcache" : "barrier or flush"), - info->feature_flush ? 
"enabled;" : "disabled;", - "persistent grants:", - info->feature_persistent ? "enabled;" : "disabled;", - "indirect descriptors:", - info->max_indirect_segments ? "enabled;" : "disabled;"); + pr_info("blkfront: %s: %s %s %s %s %s\n", + info->gd->disk_name, flush_info(info->feature_flush), + "persistent grants:", info->feature_persistent ? + "enabled;" : "disabled;", "indirect descriptors:", + info->max_indirect_segments ? "enabled;" : "disabled;"); } static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset) @@ -1188,7 +1207,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) if (error == -EOPNOTSUPP) error = 0; info->feature_flush = 0; - info->flush_op = 0; xlvbd_flush(info); } /* fall through */ @@ -1808,7 +1826,6 @@ static void blkfront_connect(struct blkfront_info *info) physical_sector_size = sector_size; info->feature_flush = 0; - info->flush_op = 0; err = xenbus_gather(XBT_NIL, info->xbdev->otherend, "feature-barrier", "%d", &barrier, @@ -1821,10 +1838,8 @@ static void blkfront_connect(struct blkfront_info *info) * * If there are barriers, then we use flush. */ - if (!err && barrier) { + if (!err && barrier) info->feature_flush = REQ_FLUSH | REQ_FUA; - info->flush_op = BLKIF_OP_WRITE_BARRIER; - } /* * And if there is "feature-flush-cache" use that above * barriers. 
@@ -1833,10 +1848,8 @@ static void blkfront_connect(struct blkfront_info *info) "feature-flush-cache", "%d", &flush, NULL); - if (!err && flush) { + if (!err && flush) info->feature_flush = REQ_FLUSH; - info->flush_op = BLKIF_OP_FLUSH_DISKCACHE; - } err = xenbus_gather(XBT_NIL, info->xbdev->otherend, "feature-discard", "%d", &discard, diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 3920ee45aa59..bd8bda386e02 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -44,15 +44,14 @@ static const char *default_compressor = "lzo"; static unsigned int num_devices = 1; #define ZRAM_ATTR_RO(name) \ -static ssize_t zram_attr_##name##_show(struct device *d, \ +static ssize_t name##_show(struct device *d, \ struct device_attribute *attr, char *b) \ { \ struct zram *zram = dev_to_zram(d); \ return scnprintf(b, PAGE_SIZE, "%llu\n", \ (u64)atomic64_read(&zram->stats.name)); \ } \ -static struct device_attribute dev_attr_##name = \ - __ATTR(name, S_IRUGO, zram_attr_##name##_show, NULL); +static DEVICE_ATTR_RO(name); static inline int init_done(struct zram *zram) { @@ -287,19 +286,18 @@ static inline int is_partial_io(struct bio_vec *bvec) /* * Check if request is within bounds and aligned on zram logical blocks. 
*/ -static inline int valid_io_request(struct zram *zram, struct bio *bio) +static inline int valid_io_request(struct zram *zram, + sector_t start, unsigned int size) { - u64 start, end, bound; + u64 end, bound; /* unaligned request */ - if (unlikely(bio->bi_iter.bi_sector & - (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) + if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1))) return 0; - if (unlikely(bio->bi_iter.bi_size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) + if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1))) return 0; - start = bio->bi_iter.bi_sector; - end = start + (bio->bi_iter.bi_size >> SECTOR_SHIFT); + end = start + (size >> SECTOR_SHIFT); bound = zram->disksize >> SECTOR_SHIFT; /* out of range range */ if (unlikely(start >= bound || end > bound || start > end)) @@ -453,7 +451,7 @@ static int zram_decompress_page(struct zram *zram, char *mem, u32 index) } static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, - u32 index, int offset, struct bio *bio) + u32 index, int offset) { int ret; struct page *page; @@ -645,14 +643,13 @@ out: } static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, - int offset, struct bio *bio) + int offset, int rw) { int ret; - int rw = bio_data_dir(bio); if (rw == READ) { atomic64_inc(&zram->stats.num_reads); - ret = zram_bvec_read(zram, bvec, index, offset, bio); + ret = zram_bvec_read(zram, bvec, index, offset); } else { atomic64_inc(&zram->stats.num_writes); ret = zram_bvec_write(zram, bvec, index, offset); @@ -853,7 +850,7 @@ out: static void __zram_make_request(struct zram *zram, struct bio *bio) { - int offset; + int offset, rw; u32 index; struct bio_vec bvec; struct bvec_iter iter; @@ -868,6 +865,7 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) return; } + rw = bio_data_dir(bio); bio_for_each_segment(bvec, bio, iter) { int max_transfer_size = PAGE_SIZE - offset; @@ -882,15 +880,15 @@ static void __zram_make_request(struct zram *zram, struct bio *bio) bv.bv_len = 
max_transfer_size; bv.bv_offset = bvec.bv_offset; - if (zram_bvec_rw(zram, &bv, index, offset, bio) < 0) + if (zram_bvec_rw(zram, &bv, index, offset, rw) < 0) goto out; bv.bv_len = bvec.bv_len - max_transfer_size; bv.bv_offset += max_transfer_size; - if (zram_bvec_rw(zram, &bv, index + 1, 0, bio) < 0) + if (zram_bvec_rw(zram, &bv, index + 1, 0, rw) < 0) goto out; } else - if (zram_bvec_rw(zram, &bvec, index, offset, bio) < 0) + if (zram_bvec_rw(zram, &bvec, index, offset, rw) < 0) goto out; update_position(&index, &offset, &bvec); @@ -915,7 +913,8 @@ static void zram_make_request(struct request_queue *queue, struct bio *bio) if (unlikely(!init_done(zram))) goto error; - if (!valid_io_request(zram, bio)) { + if (!valid_io_request(zram, bio->bi_iter.bi_sector, + bio->bi_iter.bi_size)) { atomic64_inc(&zram->stats.invalid_io); goto error; } @@ -945,25 +944,64 @@ static void zram_slot_free_notify(struct block_device *bdev, atomic64_inc(&zram->stats.notify_free); } +static int zram_rw_page(struct block_device *bdev, sector_t sector, + struct page *page, int rw) +{ + int offset, err; + u32 index; + struct zram *zram; + struct bio_vec bv; + + zram = bdev->bd_disk->private_data; + if (!valid_io_request(zram, sector, PAGE_SIZE)) { + atomic64_inc(&zram->stats.invalid_io); + return -EINVAL; + } + + down_read(&zram->init_lock); + if (unlikely(!init_done(zram))) { + err = -EIO; + goto out_unlock; + } + + index = sector >> SECTORS_PER_PAGE_SHIFT; + offset = sector & (SECTORS_PER_PAGE - 1) << SECTOR_SHIFT; + + bv.bv_page = page; + bv.bv_len = PAGE_SIZE; + bv.bv_offset = 0; + + err = zram_bvec_rw(zram, &bv, index, offset, rw); +out_unlock: + up_read(&zram->init_lock); + /* + * If I/O fails, just return error(ie, non-zero) without + * calling page_endio. 
+ * It causes resubmit the I/O with bio request by upper functions + * of rw_page(e.g., swap_readpage, __swap_writepage) and + * bio->bi_end_io does things to handle the error + * (e.g., SetPageError, set_page_dirty and extra works). + */ + if (err == 0) + page_endio(page, rw, 0); + return err; +} + static const struct block_device_operations zram_devops = { .swap_slot_free_notify = zram_slot_free_notify, + .rw_page = zram_rw_page, .owner = THIS_MODULE }; -static DEVICE_ATTR(disksize, S_IRUGO | S_IWUSR, - disksize_show, disksize_store); -static DEVICE_ATTR(initstate, S_IRUGO, initstate_show, NULL); -static DEVICE_ATTR(reset, S_IWUSR, NULL, reset_store); -static DEVICE_ATTR(orig_data_size, S_IRUGO, orig_data_size_show, NULL); -static DEVICE_ATTR(mem_used_total, S_IRUGO, mem_used_total_show, NULL); -static DEVICE_ATTR(mem_limit, S_IRUGO | S_IWUSR, mem_limit_show, - mem_limit_store); -static DEVICE_ATTR(mem_used_max, S_IRUGO | S_IWUSR, mem_used_max_show, - mem_used_max_store); -static DEVICE_ATTR(max_comp_streams, S_IRUGO | S_IWUSR, - max_comp_streams_show, max_comp_streams_store); -static DEVICE_ATTR(comp_algorithm, S_IRUGO | S_IWUSR, - comp_algorithm_show, comp_algorithm_store); +static DEVICE_ATTR_RW(disksize); +static DEVICE_ATTR_RO(initstate); +static DEVICE_ATTR_WO(reset); +static DEVICE_ATTR_RO(orig_data_size); +static DEVICE_ATTR_RO(mem_used_total); +static DEVICE_ATTR_RW(mem_limit); +static DEVICE_ATTR_RW(mem_used_max); +static DEVICE_ATTR_RW(max_comp_streams); +static DEVICE_ATTR_RW(comp_algorithm); ZRAM_ATTR_RO(num_reads); ZRAM_ATTR_RO(num_writes); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index c6ee271317f5..b05a816b09ac 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -66,8 +66,8 @@ static const size_t max_zpage_size = PAGE_SIZE / 4 * 3; /* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { /* Page consists entirely of zeros */ - ZRAM_ZERO = ZRAM_FLAG_SHIFT + 1, - 
ZRAM_ACCESS, /* page in now accessed */ + ZRAM_ZERO = ZRAM_FLAG_SHIFT, + ZRAM_ACCESS, /* page is now accessed */ __NR_ZRAM_PAGEFLAGS, }; |