diff options
Diffstat (limited to 'drivers/misc/habanalabs/common/command_submission.c')
-rw-r--r-- | drivers/misc/habanalabs/common/command_submission.c | 71 |
1 files changed, 51 insertions, 20 deletions
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c index 7b0516cf808b..91b57544f7c6 100644 --- a/drivers/misc/habanalabs/common/command_submission.c +++ b/drivers/misc/habanalabs/common/command_submission.c @@ -405,7 +405,7 @@ static void staged_cs_put(struct hl_device *hdev, struct hl_cs *cs) static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs) { bool next_entry_found = false; - struct hl_cs *next; + struct hl_cs *next, *first_cs; if (!cs_needs_timeout(cs)) return; @@ -415,9 +415,16 @@ static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs) /* We need to handle tdr only once for the complete staged submission. * Hence, we choose the CS that reaches this function first which is * the CS marked as 'staged_last'. + * In case single staged cs was submitted which has both first and last + * indications, then "cs_find_first" below will return NULL, since we + * removed the cs node from the list before getting here, + * in such cases just continue with the cs to cancel it's TDR work. */ - if (cs->staged_cs && cs->staged_last) - cs = hl_staged_cs_find_first(hdev, cs->staged_sequence); + if (cs->staged_cs && cs->staged_last) { + first_cs = hl_staged_cs_find_first(hdev, cs->staged_sequence); + if (first_cs) + cs = first_cs; + } spin_unlock(&hdev->cs_mirror_lock); @@ -1288,6 +1295,12 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks, if (rc) goto free_cs_object; + /* If this is a staged submission we must return the staged sequence + * rather than the internal CS sequence + */ + if (cs->staged_cs) + *cs_seq = cs->staged_sequence; + /* Validate ALL the CS chunks before submitting the CS */ for (i = 0 ; i < num_chunks ; i++) { struct hl_cs_chunk *chunk = &cs_chunk_array[i]; @@ -1988,6 +2001,15 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, goto free_cs_chunk_array; } + if (!hdev->nic_ports_mask) { + atomic64_inc(&ctx->cs_counters.validation_drop_cnt); + atomic64_inc(&cntr->validation_drop_cnt); + dev_err(hdev->dev, + "Collective operations not supported when NIC ports are disabled"); + rc = -EINVAL; + goto free_cs_chunk_array; + } + collective_engine_id = chunk->collective_engine_id; } @@ -2026,9 +2048,10 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type, spin_unlock(&ctx->sig_mgr.lock); if (!handle_found) { - dev_err(hdev->dev, "Cannot find encapsulated signals handle for seq 0x%llx\n", + /* treat as signal CS already finished */ + dev_dbg(hdev->dev, "Cannot find encapsulated signals handle for seq 0x%llx\n", signal_seq); - rc = -EINVAL; + rc = 0; goto free_cs_chunk_array; } @@ -2613,7 +2636,8 @@ static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data) * completed after the poll function. */ if (!mcs_data.completion_bitmap) { - dev_err(hdev->dev, "Multi-CS got completion on wait but no CS completed\n"); + dev_warn_ratelimited(hdev->dev, + "Multi-CS got completion on wait but no CS completed\n"); rc = -EFAULT; } } @@ -2740,10 +2764,20 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, else interrupt = &hdev->user_interrupt[interrupt_offset]; + /* Add pending user interrupt to relevant list for the interrupt + * handler to monitor + */ + spin_lock_irqsave(&interrupt->wait_list_lock, flags); + list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head); + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); + + /* We check for completion value as interrupt could have been received + * before we added the node to the wait list + */ if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 4)) { dev_err(hdev->dev, "Failed to copy completion value from user\n"); rc = -EFAULT; - goto free_fence; + goto remove_pending_user_interrupt; } if (completion_value >= target_value) @@ -2752,14 +2786,7 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx, *status = CS_WAIT_STATUS_BUSY; if (!timeout_us || (*status == CS_WAIT_STATUS_COMPLETED)) - goto free_fence; - - /* Add pending user interrupt to relevant list for the interrupt - * handler to monitor - */ - spin_lock_irqsave(&interrupt->wait_list_lock, flags); - list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head); - spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); + goto remove_pending_user_interrupt; wait_again: /* Wait for interrupt handler to signal completion */ @@ -2770,6 +2797,15 @@ wait_again: * If comparison fails, keep waiting until timeout expires */ if (completion_rc > 0) { + spin_lock_irqsave(&interrupt->wait_list_lock, flags); + /* reinit_completion must be called before we check for user + * completion value, otherwise, if interrupt is received after + * the comparison and before the next wait_for_completion, + * we will reach timeout and fail + */ + reinit_completion(&pend->fence.completion); + spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); + if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 4)) { dev_err(hdev->dev, "Failed to copy completion value from user\n"); rc = -EFAULT; @@ -2780,11 +2816,7 @@ wait_again: if (completion_value >= target_value) { *status = CS_WAIT_STATUS_COMPLETED; } else { - spin_lock_irqsave(&interrupt->wait_list_lock, flags); - reinit_completion(&pend->fence.completion); timeout = completion_rc; - - spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); goto wait_again; } } else if (completion_rc == -ERESTARTSYS) { @@ -2802,7 +2834,6 @@ remove_pending_user_interrupt: list_del(&pend->wait_list_node); spin_unlock_irqrestore(&interrupt->wait_list_lock, flags); -free_fence: kfree(pend); hl_ctx_put(ctx); |