Diffstat (limited to 'drivers/misc/habanalabs/command_submission.c')
-rw-r--r--    drivers/misc/habanalabs/command_submission.c    385
1 file changed, 354 insertions(+), 31 deletions(-)
diff --git a/drivers/misc/habanalabs/command_submission.c b/drivers/misc/habanalabs/command_submission.c
index 409276b6374d..f82974a916c3 100644
--- a/drivers/misc/habanalabs/command_submission.c
+++ b/drivers/misc/habanalabs/command_submission.c
@@ -11,11 +11,33 @@
 #include <linux/uaccess.h>
 #include <linux/slab.h>
 
+#define HL_CS_FLAGS_SIG_WAIT    (HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT)
+
 static void job_wq_completion(struct work_struct *work);
 static long _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
                 u64 timeout_us, u64 seq);
 static void cs_do_release(struct kref *ref);
 
+static void hl_sob_reset(struct kref *ref)
+{
+        struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
+                                                        kref);
+        struct hl_device *hdev = hw_sob->hdev;
+
+        hdev->asic_funcs->reset_sob(hdev, hw_sob);
+}
+
+void hl_sob_reset_error(struct kref *ref)
+{
+        struct hl_hw_sob *hw_sob = container_of(ref, struct hl_hw_sob,
+                                                        kref);
+        struct hl_device *hdev = hw_sob->hdev;
+
+        dev_crit(hdev->dev,
+                "SOB release shouldn't be called here, q_idx: %d, sob_id: %d\n",
+                hw_sob->q_idx, hw_sob->sob_id);
+}
+
 static const char *hl_fence_get_driver_name(struct dma_fence *fence)
 {
         return "HabanaLabs";
@@ -23,10 +45,10 @@ static const char *hl_fence_get_driver_name(struct dma_fence *fence)
 
 static const char *hl_fence_get_timeline_name(struct dma_fence *fence)
 {
-        struct hl_dma_fence *hl_fence =
-                container_of(fence, struct hl_dma_fence, base_fence);
+        struct hl_cs_compl *hl_cs_compl =
+                container_of(fence, struct hl_cs_compl, base_fence);
 
-        return dev_name(hl_fence->hdev->dev);
+        return dev_name(hl_cs_compl->hdev->dev);
 }
 
 static bool hl_fence_enable_signaling(struct dma_fence *fence)
@@ -36,17 +58,47 @@ static bool hl_fence_enable_signaling(struct dma_fence *fence)
 
 static void hl_fence_release(struct dma_fence *fence)
 {
-        struct hl_dma_fence *hl_fence =
-                container_of(fence, struct hl_dma_fence, base_fence);
+        struct hl_cs_compl *hl_cs_cmpl =
+                container_of(fence, struct hl_cs_compl, base_fence);
+        struct hl_device *hdev = hl_cs_cmpl->hdev;
+
+        if ((hl_cs_cmpl->type == CS_TYPE_SIGNAL) ||
+                        (hl_cs_cmpl->type == CS_TYPE_WAIT)) {
 
-        kfree_rcu(hl_fence, base_fence.rcu);
+                dev_dbg(hdev->dev,
+                        "CS 0x%llx type %d finished, sob_id: %d, sob_val: 0x%x\n",
+                        hl_cs_cmpl->cs_seq,
+                        hl_cs_cmpl->type,
+                        hl_cs_cmpl->hw_sob->sob_id,
+                        hl_cs_cmpl->sob_val);
+
+                /*
+                 * A signal CS can get completion while the corresponding wait
+                 * for signal CS is on its way to the PQ. The wait for signal CS
+                 * will get stuck if the signal CS incremented the SOB to its
+                 * max value and there are no pending (submitted) waits on this
+                 * SOB.
+                 * We do the following to avoid this situation:
+                 * 1. The wait for signal CS must get a ref for the signal CS as
+                 *    soon as possible in cs_ioctl_signal_wait() and put it
+                 *    before being submitted to the PQ but after it incremented
+                 *    the SOB refcnt in init_signal_wait_cs().
+                 * 2. Signal/Wait for signal CS will decrement the SOB refcnt
+                 *    here.
+                 * These two measures guarantee that the wait for signal CS will
+                 * reset the SOB upon completion rather than the signal CS and
+                 * hence the above scenario is avoided.
+                 */
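The comment in the hunk above describes a refcount handoff: the signal CS and the wait CS each hold a reference on the same hardware SOB, and only the final put runs the reset callback, which guarantees the wait CS (the last to complete) is the one that resets it. A minimal sketch of that handoff, using plain C11 atomics as a stand-in for the driver's kref; all types and names below are illustrative, not driver code:

/* Model of the SOB refcount handoff; not the driver's actual types. */
#include <stdatomic.h>
#include <stdio.h>

struct sob {
        atomic_int refs;        /* stands in for the hw_sob kref */
        int sob_id;
};

static void sob_reset(struct sob *s)    /* stands in for hl_sob_reset() */
{
        printf("SOB %d reset by the last reference holder\n", s->sob_id);
}

static void sob_put(struct sob *s)      /* stands in for kref_put() */
{
        if (atomic_fetch_sub(&s->refs, 1) == 1)
                sob_reset(s);
}

int main(void)
{
        /* signal CS and wait CS each hold one reference */
        struct sob s = { .refs = 2, .sob_id = 0 };

        sob_put(&s);    /* signal CS completes first: no reset yet */
        sob_put(&s);    /* wait CS completes last and performs the reset */
        return 0;
}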
+                kref_put(&hl_cs_cmpl->hw_sob->kref, hl_sob_reset);
+        }
+
+        kfree_rcu(hl_cs_cmpl, base_fence.rcu);
 }
 
 static const struct dma_fence_ops hl_fence_ops = {
         .get_driver_name = hl_fence_get_driver_name,
         .get_timeline_name = hl_fence_get_timeline_name,
         .enable_signaling = hl_fence_enable_signaling,
-        .wait = dma_fence_default_wait,
         .release = hl_fence_release
 };
@@ -113,6 +165,7 @@ static int cs_parser(struct hl_fpriv *hpriv, struct hl_cs_job *job)
         if (!rc) {
                 job->patched_cb = parser.patched_cb;
                 job->job_cb_size = parser.patched_cb_size;
+                job->contains_dma_pkt = parser.contains_dma_pkt;
 
                 spin_lock(&job->patched_cb->lock);
                 job->patched_cb->cs_cnt++;
@@ -259,6 +312,12 @@ static void cs_do_release(struct kref *ref)
                         spin_unlock(&hdev->hw_queues_mirror_lock);
                 }
+        } else if (cs->type == CS_TYPE_WAIT) {
+                /*
+                 * In case the wait for signal CS was submitted, the put occurs
+                 * in init_signal_wait_cs() right before hanging on the PQ.
+                 */
+                dma_fence_put(cs->signal_fence);
         }
 
         /*
@@ -312,9 +371,9 @@ static void cs_timedout(struct work_struct *work)
 }
 
 static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
-                        struct hl_cs **cs_new)
+                        enum hl_cs_type cs_type, struct hl_cs **cs_new)
 {
-        struct hl_dma_fence *fence;
+        struct hl_cs_compl *cs_cmpl;
         struct dma_fence *other = NULL;
         struct hl_cs *cs;
         int rc;
@@ -326,25 +385,27 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
         cs->ctx = ctx;
         cs->submitted = false;
         cs->completed = false;
+        cs->type = cs_type;
         INIT_LIST_HEAD(&cs->job_list);
         INIT_DELAYED_WORK(&cs->work_tdr, cs_timedout);
         kref_init(&cs->refcount);
         spin_lock_init(&cs->job_lock);
 
-        fence = kmalloc(sizeof(*fence), GFP_ATOMIC);
-        if (!fence) {
+        cs_cmpl = kmalloc(sizeof(*cs_cmpl), GFP_ATOMIC);
+        if (!cs_cmpl) {
                 rc = -ENOMEM;
                 goto free_cs;
         }
 
-        fence->hdev = hdev;
-        spin_lock_init(&fence->lock);
-        cs->fence = &fence->base_fence;
+        cs_cmpl->hdev = hdev;
+        cs_cmpl->type = cs->type;
+        spin_lock_init(&cs_cmpl->lock);
+        cs->fence = &cs_cmpl->base_fence;
 
         spin_lock(&ctx->cs_lock);
 
-        fence->cs_seq = ctx->cs_sequence;
-        other = ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)];
+        cs_cmpl->cs_seq = ctx->cs_sequence;
+        other = ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)];
         if ((other) && (!dma_fence_is_signaled(other))) {
                 spin_unlock(&ctx->cs_lock);
                 dev_dbg(hdev->dev,
@@ -353,16 +414,16 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
                 goto free_fence;
         }
 
-        dma_fence_init(&fence->base_fence, &hl_fence_ops, &fence->lock,
+        dma_fence_init(&cs_cmpl->base_fence, &hl_fence_ops, &cs_cmpl->lock,
                         ctx->asid, ctx->cs_sequence);
 
-        cs->sequence = fence->cs_seq;
+        cs->sequence = cs_cmpl->cs_seq;
 
-        ctx->cs_pending[fence->cs_seq & (HL_MAX_PENDING_CS - 1)] =
-                                                        &fence->base_fence;
+        ctx->cs_pending[cs_cmpl->cs_seq & (HL_MAX_PENDING_CS - 1)] =
+                                                        &cs_cmpl->base_fence;
         ctx->cs_sequence++;
 
-        dma_fence_get(&fence->base_fence);
+        dma_fence_get(&cs_cmpl->base_fence);
 
         dma_fence_put(other);
 
@@ -373,7 +434,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
         return 0;
 
 free_fence:
-        kfree(fence);
+        kfree(cs_cmpl);
 free_cs:
         kfree(cs);
         return rc;
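allocate_cs() stores each new fence in cs_pending[], a power-of-two ring indexed by the low bits of the sequence number. If the slot still holds an unsignaled fence, the context already has HL_MAX_PENDING_CS submissions in flight and the new CS is rejected; judging by the special-cased rc in hl_cs_ioctl() further down, the error is -EAGAIN so the submitter can retry. A toy model of the slot check, with stand-in names and a made-up ring size:

/* Model of the cs_pending slot check; not driver code. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_PENDING 64                  /* stands in for HL_MAX_PENDING_CS */

struct fence { bool signaled; };

static struct fence *pending[MAX_PENDING];

static int reserve_slot(unsigned long long seq, struct fence *f)
{
        unsigned long long idx = seq & (MAX_PENDING - 1);
        struct fence *other = pending[idx];

        /* slot is reusable only once the fence from MAX_PENDING
         * submissions back has signaled */
        if (other && !other->signaled)
                return -1;      /* ring full: driver rejects the CS */

        pending[idx] = f;
        return 0;
}

int main(void)
{
        struct fence f = { .signaled = false };

        printf("seq 0  -> %d\n", reserve_slot(0, &f));           /* free: 0 */
        printf("seq %d -> %d\n", MAX_PENDING,
                reserve_slot(MAX_PENDING, &f));  /* same slot, busy: -1 */
        return 0;
}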
@@ -499,8 +560,8 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev,
         return job;
 }
 
-static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
-                        u32 num_chunks, u64 *cs_seq)
+static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
+                                u32 num_chunks, u64 *cs_seq)
 {
         struct hl_device *hdev = hpriv->hdev;
         struct hl_cs_chunk *cs_chunk_array;
@@ -538,7 +599,7 @@ static int _hl_cs_ioctl(struct hl_fpriv *hpriv, void __user *chunks,
         /* increment refcnt for context */
         hl_ctx_get(hdev, hpriv->ctx);
 
-        rc = allocate_cs(hdev, hpriv->ctx, &cs);
+        rc = allocate_cs(hdev, hpriv->ctx, CS_TYPE_DEFAULT, &cs);
         if (rc) {
                 hl_ctx_put(hpriv->ctx);
                 goto free_cs_chunk_array;
         }
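Besides the rename, allocate_cs() is now always told which CS type it is creating; hl_cs_ioctl() derives that type from the user's cs_flags, as the hunks further below show. A compact sketch of that mapping, and of why comparing the masked flags against HL_CS_FLAGS_SIG_WAIT catches the "both flags set" case; the flag values here are illustrative, only the names come from the patch:

#include <stdio.h>

#define HL_CS_FLAGS_SIGNAL      0x1     /* illustrative values, not the uapi ones */
#define HL_CS_FLAGS_WAIT        0x2
#define HL_CS_FLAGS_SIG_WAIT    (HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT)

enum hl_cs_type { CS_TYPE_DEFAULT, CS_TYPE_SIGNAL, CS_TYPE_WAIT };

/* mirrors the dispatch added to hl_cs_ioctl() later in this patch */
static int flags_to_type(unsigned int cs_flags, enum hl_cs_type *type)
{
        /* the masked flags equal the combined mask only if both bits are set */
        if ((cs_flags & HL_CS_FLAGS_SIG_WAIT) == HL_CS_FLAGS_SIG_WAIT)
                return -1;      /* mutually exclusive: driver returns -EINVAL */

        if (cs_flags & HL_CS_FLAGS_SIGNAL)
                *type = CS_TYPE_SIGNAL;
        else if (cs_flags & HL_CS_FLAGS_WAIT)
                *type = CS_TYPE_WAIT;
        else
                *type = CS_TYPE_DEFAULT;

        return 0;
}

int main(void)
{
        enum hl_cs_type type = CS_TYPE_DEFAULT;
        int rc;

        rc = flags_to_type(HL_CS_FLAGS_SIG_WAIT, &type);
        printf("signal|wait -> rc %d (rejected)\n", rc);

        rc = flags_to_type(HL_CS_FLAGS_WAIT, &type);
        printf("wait only   -> rc %d, type %d\n", rc, type);
        return 0;
}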
@@ -652,13 +713,230 @@ out:
         return rc;
 }
 
+static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
+                                void __user *chunks, u32 num_chunks,
+                                u64 *cs_seq)
+{
+        struct hl_device *hdev = hpriv->hdev;
+        struct hl_ctx *ctx = hpriv->ctx;
+        struct hl_cs_chunk *cs_chunk_array, *chunk;
+        struct hw_queue_properties *hw_queue_prop;
+        struct dma_fence *sig_fence = NULL;
+        struct hl_cs_job *job;
+        struct hl_cs *cs;
+        struct hl_cb *cb;
+        u64 *signal_seq_arr = NULL, signal_seq;
+        u32 size_to_copy, q_idx, signal_seq_arr_len, cb_size;
+        int rc;
+
+        *cs_seq = ULLONG_MAX;
+
+        if (num_chunks > HL_MAX_JOBS_PER_CS) {
+                dev_err(hdev->dev,
+                        "Number of chunks can NOT be larger than %d\n",
+                        HL_MAX_JOBS_PER_CS);
+                rc = -EINVAL;
+                goto out;
+        }
+
+        cs_chunk_array = kmalloc_array(num_chunks, sizeof(*cs_chunk_array),
+                                        GFP_ATOMIC);
+        if (!cs_chunk_array) {
+                rc = -ENOMEM;
+                goto out;
+        }
+
+        size_to_copy = num_chunks * sizeof(struct hl_cs_chunk);
+        if (copy_from_user(cs_chunk_array, chunks, size_to_copy)) {
+                dev_err(hdev->dev, "Failed to copy cs chunk array from user\n");
+                rc = -EFAULT;
+                goto free_cs_chunk_array;
+        }
+
+        /* currently it is guaranteed to have only one chunk */
+        chunk = &cs_chunk_array[0];
+        q_idx = chunk->queue_index;
+        hw_queue_prop = &hdev->asic_prop.hw_queues_props[q_idx];
+
+        if ((q_idx >= HL_MAX_QUEUES) ||
+                        (hw_queue_prop->type != QUEUE_TYPE_EXT)) {
+                dev_err(hdev->dev, "Queue index %d is invalid\n", q_idx);
+                rc = -EINVAL;
+                goto free_cs_chunk_array;
+        }
+
+        if (cs_type == CS_TYPE_WAIT) {
+                struct hl_cs_compl *sig_waitcs_cmpl;
+
+                signal_seq_arr_len = chunk->num_signal_seq_arr;
+
+                /* currently only one signal seq is supported */
+                if (signal_seq_arr_len != 1) {
+                        dev_err(hdev->dev,
+                                "Wait for signal CS supports only one signal CS seq\n");
+                        rc = -EINVAL;
+                        goto free_cs_chunk_array;
+                }
+
+                signal_seq_arr = kmalloc_array(signal_seq_arr_len,
+                                                sizeof(*signal_seq_arr),
+                                                GFP_ATOMIC);
+                if (!signal_seq_arr) {
+                        rc = -ENOMEM;
+                        goto free_cs_chunk_array;
+                }
+
+                size_to_copy = chunk->num_signal_seq_arr *
+                                sizeof(*signal_seq_arr);
+                if (copy_from_user(signal_seq_arr,
+                                        u64_to_user_ptr(chunk->signal_seq_arr),
+                                        size_to_copy)) {
+                        dev_err(hdev->dev,
+                                "Failed to copy signal seq array from user\n");
+                        rc = -EFAULT;
+                        goto free_signal_seq_array;
+                }
+
+                /* currently it is guaranteed to have only one signal seq */
+                signal_seq = signal_seq_arr[0];
+                sig_fence = hl_ctx_get_fence(ctx, signal_seq);
+                if (IS_ERR(sig_fence)) {
+                        dev_err(hdev->dev,
+                                "Failed to get signal CS with seq 0x%llx\n",
+                                signal_seq);
+                        rc = PTR_ERR(sig_fence);
+                        goto free_signal_seq_array;
+                }
+
+                if (!sig_fence) {
+                        /* signal CS already finished */
+                        rc = 0;
+                        goto free_signal_seq_array;
+                }
+
+                sig_waitcs_cmpl =
+                        container_of(sig_fence, struct hl_cs_compl, base_fence);
+
+                if (sig_waitcs_cmpl->type != CS_TYPE_SIGNAL) {
+                        dev_err(hdev->dev,
+                                "CS seq 0x%llx is not of a signal CS\n",
+                                signal_seq);
+                        dma_fence_put(sig_fence);
+                        rc = -EINVAL;
+                        goto free_signal_seq_array;
+                }
+
+                if (dma_fence_is_signaled(sig_fence)) {
+                        /* signal CS already finished */
+                        dma_fence_put(sig_fence);
+                        rc = 0;
+                        goto free_signal_seq_array;
+                }
+        }
+
+        /* increment refcnt for context */
+        hl_ctx_get(hdev, ctx);
+
+        rc = allocate_cs(hdev, ctx, cs_type, &cs);
+        if (rc) {
+                if (cs_type == CS_TYPE_WAIT)
+                        dma_fence_put(sig_fence);
+                hl_ctx_put(ctx);
+                goto free_signal_seq_array;
+        }
+
+        /*
+         * Save the signal CS fence for later initialization right before
+         * hanging the wait CS on the queue.
+         */
+        if (cs->type == CS_TYPE_WAIT)
+                cs->signal_fence = sig_fence;
+
+        hl_debugfs_add_cs(cs);
+
+        *cs_seq = cs->sequence;
+
+        job = hl_cs_allocate_job(hdev, QUEUE_TYPE_EXT, true);
+        if (!job) {
+                dev_err(hdev->dev, "Failed to allocate a new job\n");
+                rc = -ENOMEM;
+                goto put_cs;
+        }
+
+        cb = hl_cb_kernel_create(hdev, PAGE_SIZE);
+        if (!cb) {
+                kfree(job);
+                rc = -EFAULT;
+                goto put_cs;
+        }
+
+        if (cs->type == CS_TYPE_WAIT)
+                cb_size = hdev->asic_funcs->get_wait_cb_size(hdev);
+        else
+                cb_size = hdev->asic_funcs->get_signal_cb_size(hdev);
+
+        job->id = 0;
+        job->cs = cs;
+        job->user_cb = cb;
+        job->user_cb->cs_cnt++;
+        job->user_cb_size = cb_size;
+        job->hw_queue_id = q_idx;
+
+        /*
+         * No need for parsing since the user CB is the patched CB.
+         * We call hl_cb_destroy() for two reasons - we don't need the CB in
+         * the CB idr anymore, and to decrement its refcount as it was
+         * incremented inside hl_cb_kernel_create().
+         */
+        job->patched_cb = job->user_cb;
+        job->job_cb_size = job->user_cb_size;
+        hl_cb_destroy(hdev, &hdev->kernel_cb_mgr, cb->id << PAGE_SHIFT);
+
+        cs->jobs_in_queue_cnt[job->hw_queue_id]++;
+
+        list_add_tail(&job->cs_node, &cs->job_list);
+
+        /* increment refcount as for external queues we get completion */
+        cs_get(cs);
+
+        hl_debugfs_add_job(hdev, job);
+
+        rc = hl_hw_queue_schedule_cs(cs);
+        if (rc) {
+                if (rc != -EAGAIN)
+                        dev_err(hdev->dev,
+                                "Failed to submit CS %d.%llu to H/W queues, error %d\n",
+                                ctx->asid, cs->sequence, rc);
+                goto free_cs_object;
+        }
+
+        rc = HL_CS_STATUS_SUCCESS;
+        goto put_cs;
+
+free_cs_object:
+        cs_rollback(hdev, cs);
+        *cs_seq = ULLONG_MAX;
+        /* The path below is both for good and erroneous exits */
+put_cs:
+        /* We finished with the CS in this function, so put the ref */
+        cs_put(cs);
+free_signal_seq_array:
+        if (cs_type == CS_TYPE_WAIT)
+                kfree(signal_seq_arr);
+free_cs_chunk_array:
+        kfree(cs_chunk_array);
+out:
+        return rc;
+}
+
 int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 {
         struct hl_device *hdev = hpriv->hdev;
         union hl_cs_args *args = data;
         struct hl_ctx *ctx = hpriv->ctx;
         void __user *chunks_execute, *chunks_restore;
-        u32 num_chunks_execute, num_chunks_restore;
+        enum hl_cs_type cs_type;
+        u32 num_chunks_execute, num_chunks_restore, sig_wait_flags;
         u64 cs_seq = ULONG_MAX;
         int rc, do_ctx_switch;
         bool need_soft_reset = false;
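From userspace, the new path is driven entirely through the existing CS ioctl: a wait CS is a single chunk whose signal_seq_arr points at exactly one sequence number (the patch enforces this) and whose cs_flags carry HL_CS_FLAGS_WAIT. A hedged sketch of such a submission; the field names follow the uapi usage visible in this patch, but the HL_IOCTL_CS request name, the header include path, and the exact structure layout are assumptions:

/* Userspace sketch: submit a wait CS on queue q_idx that blocks until a
 * previously submitted signal CS (sequence sig_seq) fires. Error handling
 * is minimal; layout details are assumptions, not verified uapi. */
#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <misc/habanalabs.h>    /* uapi header; exact path may differ */

static int submit_wait_cs(int fd, uint32_t q_idx, uint64_t sig_seq,
                          uint64_t *out_seq)
{
        struct hl_cs_chunk chunk;
        union hl_cs_args args;

        memset(&chunk, 0, sizeof(chunk));
        chunk.queue_index = q_idx;
        chunk.signal_seq_arr = (uint64_t) (uintptr_t) &sig_seq;
        chunk.num_signal_seq_arr = 1;   /* only one signal seq is supported */

        memset(&args, 0, sizeof(args));
        args.in.chunks_execute = (uint64_t) (uintptr_t) &chunk;
        args.in.num_chunks_execute = 1; /* sync stream CS mandates one chunk */
        args.in.cs_flags = HL_CS_FLAGS_WAIT;

        if (ioctl(fd, HL_IOCTL_CS, &args))      /* assumed request name */
                return -1;

        *out_seq = args.out.seq;        /* sequence for a later wait ioctl */
        return 0;
}

A signal CS would look the same minus the signal_seq_arr fields, with HL_CS_FLAGS_SIGNAL instead of HL_CS_FLAGS_WAIT.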
@@ -671,12 +949,44 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
                 goto out;
         }
 
+        sig_wait_flags = args->in.cs_flags & HL_CS_FLAGS_SIG_WAIT;
+
+        if (unlikely(sig_wait_flags == HL_CS_FLAGS_SIG_WAIT)) {
+                dev_err(hdev->dev,
+                        "Signal and wait CS flags are mutually exclusive, context %d\n",
+                        ctx->asid);
+                rc = -EINVAL;
+                goto out;
+        }
+
+        if (unlikely((sig_wait_flags & HL_CS_FLAGS_SIG_WAIT) &&
+                        (!hdev->supports_sync_stream))) {
+                dev_err(hdev->dev, "Sync stream CS is not supported\n");
+                rc = -EINVAL;
+                goto out;
+        }
+
+        if (args->in.cs_flags & HL_CS_FLAGS_SIGNAL)
+                cs_type = CS_TYPE_SIGNAL;
+        else if (args->in.cs_flags & HL_CS_FLAGS_WAIT)
+                cs_type = CS_TYPE_WAIT;
+        else
+                cs_type = CS_TYPE_DEFAULT;
+
         chunks_execute = (void __user *) (uintptr_t) args->in.chunks_execute;
         num_chunks_execute = args->in.num_chunks_execute;
 
-        if (!num_chunks_execute) {
+        if (cs_type == CS_TYPE_DEFAULT) {
+                if (!num_chunks_execute) {
+                        dev_err(hdev->dev,
+                                "Got execute CS with 0 chunks, context %d\n",
+                                ctx->asid);
+                        rc = -EINVAL;
+                        goto out;
+                }
+        } else if (num_chunks_execute != 1) {
                 dev_err(hdev->dev,
-                        "Got execute CS with 0 chunks, context %d\n",
+                        "Sync stream CS mandates one chunk only, context %d\n",
                         ctx->asid);
                 rc = -EINVAL;
                 goto out;
         }
@@ -722,7 +1032,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
                         "Need to run restore phase but restore CS is empty\n");
                 rc = 0;
         } else {
-                rc = _hl_cs_ioctl(hpriv, chunks_restore,
+                rc = cs_ioctl_default(hpriv, chunks_restore,
                                         num_chunks_restore, &cs_seq);
         }
 
@@ -764,7 +1074,12 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
                 }
         }
 
-        rc = _hl_cs_ioctl(hpriv, chunks_execute, num_chunks_execute, &cs_seq);
+        if (cs_type == CS_TYPE_DEFAULT)
+                rc = cs_ioctl_default(hpriv, chunks_execute, num_chunks_execute,
+                                        &cs_seq);
+        else
+                rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks_execute,
+                                                num_chunks_execute, &cs_seq);
 
 out:
         if (rc != -EAGAIN) {
@@ -796,6 +1111,10 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev,
         fence = hl_ctx_get_fence(ctx, seq);
         if (IS_ERR(fence)) {
                 rc = PTR_ERR(fence);
+                if (rc == -EINVAL)
+                        dev_notice_ratelimited(hdev->dev,
+                                "Can't wait on seq %llu because current CS is at seq %llu\n",
+                                seq, ctx->cs_sequence);
         } else if (fence) {
                 rc = dma_fence_wait_timeout(fence, true, timeout);
                 if (fence->error == -ETIMEDOUT)
@@ -803,8 +1122,12 @@ static long _hl_cs_wait_ioctl(struct hl_device *hdev,
                 else if (fence->error == -EIO)
                         rc = -EIO;
                 dma_fence_put(fence);
-        } else
+        } else {
+                dev_dbg(hdev->dev,
+                        "Can't wait on seq %llu because current CS is at seq %llu (Fence is gone)\n",
+                        seq, ctx->cs_sequence);
                 rc = 1;
+        }
 
         hl_ctx_put(ctx);
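The extra logging in the last two hunks makes the wait-side contract easier to see: a positive return (including the explicit rc = 1 when the fence has already been recycled) means the CS completed, 0 means the wait's own timeout expired, and negative values are errors, with -ETIMEDOUT/-EIO reflecting errors the device reported through the fence and -EINVAL covering a sequence that was never submitted. A small illustrative mapping, assuming dma_fence_wait_timeout()'s usual return convention; this helper is not driver code:

#include <errno.h>
#include <stdio.h>

/* illustrative mapping of the _hl_cs_wait_ioctl() return value */
static const char *wait_rc_to_str(long rc)
{
        if (rc > 0)
                return "completed";     /* includes rc = 1 for a gone fence */
        if (rc == 0)
                return "wait timed out";
        if (rc == -ETIMEDOUT)
                return "device timeout reported via the fence";
        if (rc == -EIO)
                return "device error reported via the fence";
        return "invalid wait (e.g. seq not submitted yet)";
}

int main(void)
{
        printf("1          -> %s\n", wait_rc_to_str(1));
        printf("0          -> %s\n", wait_rc_to_str(0));
        printf("-ETIMEDOUT -> %s\n", wait_rc_to_str(-ETIMEDOUT));
        return 0;
}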