diff options
author | Tomer Tayar <ttayar@habana.ai> | 2020-07-19 20:07:15 +0200 |
---|---|---|
committer | Oded Gabbay <ogabbay@kernel.org> | 2020-11-30 09:47:32 +0100 |
commit | 6de3d769fde0303a746a5e540bf23fa4b07d0ad5 (patch) | |
tree | 8da4fb0c404c65304a8bcd276b4be23bc1c950f2 /drivers/misc | |
parent | habanalabs/gaudi: fetch PLL info from FW (diff) | |
download | linux-6de3d769fde0303a746a5e540bf23fa4b07d0ad5.tar.xz linux-6de3d769fde0303a746a5e540bf23fa4b07d0ad5.zip |
habanalabs: Small refactoring of CS IOCTL handling
Refactor the CS IOCTL handling by gathering common code into
sub-functions, in order to ease future additions of new CS types.
Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
Diffstat (limited to 'drivers/misc')
-rw-r--r-- | drivers/misc/habanalabs/common/command_submission.c | 417 |
1 files changed, 223 insertions, 194 deletions
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index e123101b74d6..fc1586e09ed0 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -11,8 +11,6 @@ #include <linux/uaccess.h> #include <linux/slab.h> -#define HL_CS_FLAGS_SIG_WAIT (HL_CS_FLAGS_SIGNAL | HL_CS_FLAGS_WAIT) - static void job_wq_completion(struct work_struct *work); static long _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, u64 timeout_us, u64 seq); @@ -660,44 +658,114 @@ struct hl_cs_job *hl_cs_allocate_job(struct hl_device *hdev, return job; } -static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, - u32 num_chunks, u64 *cs_seq) +static enum hl_cs_type hl_cs_get_cs_type(u32 cs_type_flags) +{ + if (cs_type_flags & HL_CS_FLAGS_SIGNAL) + return CS_TYPE_SIGNAL; + else if (cs_type_flags & HL_CS_FLAGS_WAIT) + return CS_TYPE_WAIT; + else if (cs_type_flags & HL_CS_FLAGS_COLLECTIVE_WAIT) + return CS_TYPE_COLLECTIVE_WAIT; + else + return CS_TYPE_DEFAULT; +} + +static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args) { struct hl_device *hdev = hpriv->hdev; - struct hl_cs_chunk *cs_chunk_array; - struct hl_cs_counters_atomic *cntr; - struct hl_cs_job *job; - struct hl_cs *cs; - struct hl_cb *cb; - bool int_queues_only = true; - u32 size_to_copy; - int rc, i; + struct hl_ctx *ctx = hpriv->ctx; + u32 cs_type_flags, num_chunks; + enum hl_cs_type cs_type; - cntr = &hdev->aggregated_cs_counters; - *cs_seq = ULLONG_MAX; + if (hl_device_disabled_or_in_reset(hdev)) { + dev_warn_ratelimited(hdev->dev, + "Device is %s. Can't submit new CS\n", + atomic_read(&hdev->in_reset) ? "in_reset" : "disabled"); + return -EBUSY; + } + + cs_type_flags = args->in.cs_flags & ~HL_CS_FLAGS_FORCE_RESTORE; + + if (unlikely(cs_type_flags && !is_power_of_2(cs_type_flags))) { + dev_err(hdev->dev, + "CS type flags are mutually exclusive, context %d\n", + ctx->asid); + return -EINVAL; + } + + cs_type = hl_cs_get_cs_type(cs_type_flags); + num_chunks = args->in.num_chunks_execute; + + if (unlikely((cs_type != CS_TYPE_DEFAULT) && + !hdev->supports_sync_stream)) { + dev_err(hdev->dev, "Sync stream CS is not supported\n"); + return -EINVAL; + } + + if (cs_type == CS_TYPE_DEFAULT) { + if (!num_chunks) { + dev_err(hdev->dev, + "Got execute CS with 0 chunks, context %d\n", + ctx->asid); + return -EINVAL; + } + } else if (num_chunks != 1) { + dev_err(hdev->dev, + "Sync stream CS mandates one chunk only, context %d\n", + ctx->asid); + return -EINVAL; + } + + return 0; +} + +static int hl_cs_copy_chunk_array(struct hl_device *hdev, + struct hl_cs_chunk **cs_chunk_array, + void __user *chunks, u32 num_chunks) +{ + u32 size_to_copy; if (num_chunks > HL_MAX_JOBS_PER_CS) { dev_err(hdev->dev, "Number of chunks can NOT be larger than %d\n", HL_MAX_JOBS_PER_CS); - rc = -EINVAL; - goto out; + return -EINVAL; } - cs_chunk_array = kmalloc_array(num_chunks, sizeof(*cs_chunk_array), + *cs_chunk_array = kmalloc_array(num_chunks, sizeof(**cs_chunk_array), GFP_ATOMIC); - if (!cs_chunk_array) { - rc = -ENOMEM; - goto out; - } + if (!*cs_chunk_array) + return -ENOMEM; size_to_copy = num_chunks * sizeof(struct hl_cs_chunk); - if (copy_from_user(cs_chunk_array, chunks, size_to_copy)) { + if (copy_from_user(*cs_chunk_array, chunks, size_to_copy)) { dev_err(hdev->dev, "Failed to copy cs chunk array from user\n"); - rc = -EFAULT; - goto free_cs_chunk_array; + kfree(*cs_chunk_array); + return -EFAULT; } + return 0; +} + +static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, + u32 num_chunks, u64 *cs_seq) +{ + bool int_queues_only = true; + struct hl_device *hdev = hpriv->hdev; + struct hl_cs_chunk *cs_chunk_array; + struct hl_cs_counters_atomic *cntr; + struct hl_cs_job *job; + struct hl_cs *cs; + struct hl_cb *cb; + int rc, i; + + cntr = &hdev->aggregated_cs_counters; + *cs_seq = ULLONG_MAX; + + rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks); + if (rc) + goto out; + /* increment refcnt for context */ hl_ctx_get(hdev, hpriv->ctx); @@ -828,6 +896,108 @@ out: return rc; } +static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args, + u64 *cs_seq) +{ + struct hl_device *hdev = hpriv->hdev; + struct hl_ctx *ctx = hpriv->ctx; + bool need_soft_reset = false; + int rc = 0, do_ctx_switch; + void __user *chunks; + u32 num_chunks, tmp; + long ret; + + do_ctx_switch = atomic_cmpxchg(&ctx->thread_ctx_switch_token, 1, 0); + + if (do_ctx_switch || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) { + mutex_lock(&hpriv->restore_phase_mutex); + + if (do_ctx_switch) { + rc = hdev->asic_funcs->context_switch(hdev, ctx->asid); + if (rc) { + dev_err_ratelimited(hdev->dev, + "Failed to switch to context %d, rejecting CS! %d\n", + ctx->asid, rc); + /* + * If we timedout, or if the device is not IDLE + * while we want to do context-switch (-EBUSY), + * we need to soft-reset because QMAN is + * probably stuck. However, we can't call to + * reset here directly because of deadlock, so + * need to do it at the very end of this + * function + */ + if ((rc == -ETIMEDOUT) || (rc == -EBUSY)) + need_soft_reset = true; + mutex_unlock(&hpriv->restore_phase_mutex); + goto out; + } + } + + hdev->asic_funcs->restore_phase_topology(hdev); + + chunks = (void __user *) (uintptr_t) args->in.chunks_restore; + num_chunks = args->in.num_chunks_restore; + + if (!num_chunks) { + dev_dbg(hdev->dev, + "Need to run restore phase but restore CS is empty\n"); + rc = 0; + } else { + rc = cs_ioctl_default(hpriv, chunks, num_chunks, + cs_seq); + } + + mutex_unlock(&hpriv->restore_phase_mutex); + + if (rc) { + dev_err(hdev->dev, + "Failed to submit restore CS for context %d (%d)\n", + ctx->asid, rc); + goto out; + } + + /* Need to wait for restore completion before execution phase */ + if (num_chunks) { +wait_again: + ret = _hl_cs_wait_ioctl(hdev, ctx, + jiffies_to_usecs(hdev->timeout_jiffies), + *cs_seq); + if (ret <= 0) { + if (ret == -ERESTARTSYS) { + usleep_range(100, 200); + goto wait_again; + } + + dev_err(hdev->dev, + "Restore CS for context %d failed to complete %ld\n", + ctx->asid, ret); + rc = -ENOEXEC; + goto out; + } + } + + ctx->thread_ctx_switch_wait_token = 1; + + } else if (!ctx->thread_ctx_switch_wait_token) { + rc = hl_poll_timeout_memory(hdev, + &ctx->thread_ctx_switch_wait_token, tmp, (tmp == 1), + 100, jiffies_to_usecs(hdev->timeout_jiffies), false); + + if (rc == -ETIMEDOUT) { + dev_err(hdev->dev, + "context switch phase timeout (%d)\n", tmp); + goto out; + } + } + +out: + if ((rc == -ETIMEDOUT || rc == -EBUSY) && (need_soft_reset)) + hl_device_reset(hdev, false, false); + + return rc; +} + static int cs_ioctl_extract_signal_seq(struct hl_device *hdev, struct hl_cs_chunk *chunk, u64 *signal_seq) { @@ -935,41 +1105,23 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, void __user *chunks, u32 num_chunks, u64 *cs_seq) { - struct hl_device *hdev = hpriv->hdev; - struct hl_ctx *ctx = hpriv->ctx; struct hl_cs_chunk *cs_chunk_array, *chunk; struct hw_queue_properties *hw_queue_prop; - struct hl_fence *sig_fence = NULL; + struct hl_device *hdev = hpriv->hdev; struct hl_cs_compl *sig_waitcs_cmpl; - struct hl_cs *cs; + u32 q_idx, collective_engine_id = 0; + struct hl_fence *sig_fence = NULL; + struct hl_ctx *ctx = hpriv->ctx; enum hl_queue_type q_type; - u32 size_to_copy, q_idx, collective_engine_id = 0; + struct hl_cs *cs; u64 signal_seq; int rc; *cs_seq = ULLONG_MAX; - if (num_chunks > HL_MAX_JOBS_PER_CS) { - dev_err(hdev->dev, - "Number of chunks can NOT be larger than %d\n", - HL_MAX_JOBS_PER_CS); - rc = -EINVAL; - goto out; - } - - cs_chunk_array = kmalloc_array(num_chunks, sizeof(*cs_chunk_array), - GFP_ATOMIC); - if (!cs_chunk_array) { - rc = -ENOMEM; + rc = hl_cs_copy_chunk_array(hdev, &cs_chunk_array, chunks, num_chunks); + if (rc) goto out; - } - - size_to_copy = num_chunks * sizeof(struct hl_cs_chunk); - if (copy_from_user(cs_chunk_array, chunks, size_to_copy)) { - dev_err(hdev->dev, "Failed to copy cs chunk array from user\n"); - rc = -EFAULT; - goto free_cs_chunk_array; - } /* currently it is guaranteed to have only one chunk */ chunk = &cs_chunk_array[0]; @@ -1106,158 +1258,38 @@ out: int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data) { - struct hl_device *hdev = hpriv->hdev; union hl_cs_args *args = data; - struct hl_ctx *ctx = hpriv->ctx; - void __user *chunks_execute, *chunks_restore; enum hl_cs_type cs_type; - u32 num_chunks_execute, num_chunks_restore, sig_wait_flags; u64 cs_seq = ULONG_MAX; - int rc, do_ctx_switch; - bool need_soft_reset = false; - - if (hl_device_disabled_or_in_reset(hdev)) { - dev_warn_ratelimited(hdev->dev, - "Device is %s. Can't submit new CS\n", - atomic_read(&hdev->in_reset) ? "in_reset" : "disabled"); - rc = -EBUSY; - goto out; - } - - sig_wait_flags = args->in.cs_flags & HL_CS_FLAGS_SIG_WAIT; - - if (unlikely(sig_wait_flags == HL_CS_FLAGS_SIG_WAIT)) { - dev_err(hdev->dev, - "Signal and wait CS flags are mutually exclusive, context %d\n", - ctx->asid); - rc = -EINVAL; - goto out; - } + void __user *chunks; + u32 num_chunks; + int rc; - if (unlikely((sig_wait_flags & HL_CS_FLAGS_SIG_WAIT) && - (!hdev->supports_sync_stream))) { - dev_err(hdev->dev, "Sync stream CS is not supported\n"); - rc = -EINVAL; + rc = hl_cs_sanity_checks(hpriv, args); + if (rc) goto out; - } - if (args->in.cs_flags & HL_CS_FLAGS_SIGNAL) - cs_type = CS_TYPE_SIGNAL; - else if (args->in.cs_flags & HL_CS_FLAGS_WAIT) - cs_type = CS_TYPE_WAIT; - else if (args->in.cs_flags & HL_CS_FLAGS_COLLECTIVE_WAIT) - cs_type = CS_TYPE_COLLECTIVE_WAIT; - else - cs_type = CS_TYPE_DEFAULT; - - chunks_execute = (void __user *) (uintptr_t) args->in.chunks_execute; - num_chunks_execute = args->in.num_chunks_execute; - - if (cs_type == CS_TYPE_DEFAULT) { - if (!num_chunks_execute) { - dev_err(hdev->dev, - "Got execute CS with 0 chunks, context %d\n", - ctx->asid); - rc = -EINVAL; - goto out; - } - } else if (num_chunks_execute != 1) { - dev_err(hdev->dev, - "Sync stream CS mandates one chunk only, context %d\n", - ctx->asid); - rc = -EINVAL; + rc = hl_cs_ctx_switch(hpriv, args, &cs_seq); + if (rc) goto out; - } - - do_ctx_switch = atomic_cmpxchg(&ctx->thread_ctx_switch_token, 1, 0); - - if (do_ctx_switch || (args->in.cs_flags & HL_CS_FLAGS_FORCE_RESTORE)) { - long ret; - - chunks_restore = - (void __user *) (uintptr_t) args->in.chunks_restore; - num_chunks_restore = args->in.num_chunks_restore; - - mutex_lock(&hpriv->restore_phase_mutex); - - if (do_ctx_switch) { - rc = hdev->asic_funcs->context_switch(hdev, ctx->asid); - if (rc) { - dev_err_ratelimited(hdev->dev, - "Failed to switch to context %d, rejecting CS! %d\n", - ctx->asid, rc); - /* - * If we timedout, or if the device is not IDLE - * while we want to do context-switch (-EBUSY), - * we need to soft-reset because QMAN is - * probably stuck. However, we can't call to - * reset here directly because of deadlock, so - * need to do it at the very end of this - * function - */ - if ((rc == -ETIMEDOUT) || (rc == -EBUSY)) - need_soft_reset = true; - mutex_unlock(&hpriv->restore_phase_mutex); - goto out; - } - } - - hdev->asic_funcs->restore_phase_topology(hdev); - - if (!num_chunks_restore) { - dev_dbg(hdev->dev, - "Need to run restore phase but restore CS is empty\n"); - rc = 0; - } else { - rc = cs_ioctl_default(hpriv, chunks_restore, - num_chunks_restore, &cs_seq); - } - mutex_unlock(&hpriv->restore_phase_mutex); - - if (rc) { - dev_err(hdev->dev, - "Failed to submit restore CS for context %d (%d)\n", - ctx->asid, rc); - goto out; - } - - /* Need to wait for restore completion before execution phase */ - if (num_chunks_restore) { - ret = _hl_cs_wait_ioctl(hdev, ctx, - jiffies_to_usecs(hdev->timeout_jiffies), - cs_seq); - if (ret <= 0) { - dev_err(hdev->dev, - "Restore CS for context %d failed to complete %ld\n", - ctx->asid, ret); - rc = -ENOEXEC; - goto out; - } - } - - ctx->thread_ctx_switch_wait_token = 1; - } else if (!ctx->thread_ctx_switch_wait_token) { - u32 tmp; - - rc = hl_poll_timeout_memory(hdev, - &ctx->thread_ctx_switch_wait_token, tmp, (tmp == 1), - 100, jiffies_to_usecs(hdev->timeout_jiffies), false); - - if (rc == -ETIMEDOUT) { - dev_err(hdev->dev, - "context switch phase timeout (%d)\n", tmp); - goto out; - } + cs_type = hl_cs_get_cs_type(args->in.cs_flags & + ~HL_CS_FLAGS_FORCE_RESTORE); + chunks = (void __user *) (uintptr_t) args->in.chunks_execute; + num_chunks = args->in.num_chunks_execute; + + switch (cs_type) { + case CS_TYPE_SIGNAL: + case CS_TYPE_WAIT: + case CS_TYPE_COLLECTIVE_WAIT: + rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks, num_chunks, + &cs_seq); + break; + default: + rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq); + break; } - if (cs_type == CS_TYPE_DEFAULT) - rc = cs_ioctl_default(hpriv, chunks_execute, num_chunks_execute, - &cs_seq); - else - rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks_execute, - num_chunks_execute, &cs_seq); - out: if (rc != -EAGAIN) { memset(args, 0, sizeof(*args)); @@ -1265,9 +1297,6 @@ out: args->out.seq = cs_seq; } - if (((rc == -ETIMEDOUT) || (rc == -EBUSY)) && (need_soft_reset)) - hl_device_reset(hdev, false, false); - return rc; } |