28 files changed, 2031 insertions, 889 deletions
diff --git a/drivers/misc/habanalabs/common/command_buffer.c b/drivers/misc/habanalabs/common/command_buffer.c
index 8132a84698d5..3c0ae07a2d80 100644
--- a/drivers/misc/habanalabs/common/command_buffer.c
+++ b/drivers/misc/habanalabs/common/command_buffer.c
@@ -57,7 +57,7 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb)
 		}
 
 		va_block->start = virt_addr;
-		va_block->end = virt_addr + page_size;
+		va_block->end = virt_addr + page_size - 1;
 		va_block->size = page_size;
 		list_add_tail(&va_block->node, &cb->va_block_list);
 	}
@@ -80,13 +80,13 @@ static int cb_map_mem(struct hl_ctx *ctx, struct hl_cb *cb)
 		offset += va_block->size;
 	}
 
-	hdev->asic_funcs->mmu_invalidate_cache(hdev, false, VM_TYPE_USERPTR);
+	rc = hl_mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR | MMU_OP_SKIP_LOW_CACHE_INV);
 
 	mutex_unlock(&ctx->mmu_lock);
 
 	cb->is_mmu_mapped = true;
 
-	return 0;
+	return rc;
 
 err_va_umap:
 	list_for_each_entry(va_block, &cb->va_block_list, node) {
@@ -97,7 +97,7 @@ err_va_umap:
 		offset -= va_block->size;
 	}
 
-	hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
+	rc = hl_mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR);
 
 	mutex_unlock(&ctx->mmu_lock);
 
@@ -126,7 +126,7 @@ static void cb_unmap_mem(struct hl_ctx *ctx, struct hl_cb *cb)
 					"Failed to unmap CB's va 0x%llx\n",
 					va_block->start);
 
-	hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
+	hl_mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR);
 
 	mutex_unlock(&ctx->mmu_lock);
 
@@ -250,8 +250,7 @@ int hl_cb_create(struct hl_device *hdev, struct hl_cb_mgr *mgr,
 	 * Can't use generic function to check this because of special case
 	 * where we create a CB as part of the reset process
 	 */
-	if ((hdev->disabled) || ((atomic_read(&hdev->in_reset)) &&
-					(ctx_id != HL_KERNEL_ASID_ID))) {
+	if ((hdev->disabled) || (hdev->reset_info.in_reset && (ctx_id != HL_KERNEL_ASID_ID))) {
 		dev_warn_ratelimited(hdev->dev,
 			"Device is disabled or in reset. Can't create new CBs\n");
 		rc = -EBUSY;
@@ -380,8 +379,9 @@ int hl_cb_destroy(struct hl_device *hdev, struct hl_cb_mgr *mgr, u64 cb_handle)
 }
 
 static int hl_cb_info(struct hl_device *hdev, struct hl_cb_mgr *mgr,
-			u64 cb_handle, u32 *usage_cnt)
+			u64 cb_handle, u32 flags, u32 *usage_cnt, u64 *device_va)
 {
+	struct hl_vm_va_block *va_block;
 	struct hl_cb *cb;
 	u32 handle;
 	int rc = 0;
@@ -402,7 +402,18 @@ static int hl_cb_info(struct hl_device *hdev, struct hl_cb_mgr *mgr,
 		goto out;
 	}
 
-	*usage_cnt = atomic_read(&cb->cs_cnt);
+	if (flags & HL_CB_FLAGS_GET_DEVICE_VA) {
+		va_block = list_first_entry(&cb->va_block_list, struct hl_vm_va_block, node);
+		if (va_block) {
+			*device_va = va_block->start;
+		} else {
+			dev_err(hdev->dev, "CB is not mapped to the device's MMU\n");
+			rc = -EINVAL;
+			goto out;
+		}
+	} else {
+		*usage_cnt = atomic_read(&cb->cs_cnt);
+	}
 
 out:
 	spin_unlock(&mgr->cb_lock);
@@ -414,7 +425,7 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
 	union hl_cb_args *args = data;
 	struct hl_device *hdev = hpriv->hdev;
 	enum hl_device_status status;
-	u64 handle = 0;
+	u64 handle = 0, device_va;
 	u32 usage_cnt = 0;
 	int rc;
 
@@ -450,13 +461,20 @@ int hl_cb_ioctl(struct hl_fpriv *hpriv, void *data)
 
 	case HL_CB_OP_INFO:
 		rc = hl_cb_info(hdev, &hpriv->cb_mgr, args->in.cb_handle,
-				&usage_cnt);
-		memset(args, 0, sizeof(*args));
-		args->out.usage_cnt = usage_cnt;
+				args->in.flags,
+				&usage_cnt,
+				&device_va);
+
+		memset(&args->out, 0, sizeof(args->out));
+
+		if (args->in.flags & HL_CB_FLAGS_GET_DEVICE_VA)
+			args->out.device_va = device_va;
+		else
+			args->out.usage_cnt = usage_cnt;
 		break;
 
 	default:
-		rc = -ENOTTY;
+		rc = -EINVAL;
 		break;
 	}
 
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
index 4c8000fd246c..0a4ef13d9ac4 100644
--- a/drivers/misc/habanalabs/common/command_submission.c
+++ b/drivers/misc/habanalabs/common/command_submission.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /*
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  */
 
@@ -533,8 +533,8 @@ static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
 					mcs_compl->stream_master_qid_map)) {
 			/* extract the timestamp only of first completed CS */
 			if (!mcs_compl->timestamp)
-				mcs_compl->timestamp =
-						ktime_to_ns(fence->timestamp);
+				mcs_compl->timestamp = ktime_to_ns(fence->timestamp);
+
 			complete_all(&mcs_compl->completion);
 
 			/*
@@ -733,6 +733,14 @@ static void cs_timedout(struct work_struct *work)
 
 	hdev = cs->ctx->hdev;
 
+	/* Save only the first CS timeout parameters */
+	rc = atomic_cmpxchg(&hdev->last_error.cs_write_disable, 0, 1);
+	if (!rc) {
+		hdev->last_error.open_dev_timestamp = hdev->last_successful_open_ktime;
+		hdev->last_error.cs_timeout_timestamp = ktime_get();
+		hdev->last_error.cs_timeout_seq = cs->sequence;
+	}
+
 	switch (cs->type) {
 	case CS_TYPE_SIGNAL:
 		dev_err(hdev->dev,
@@ -767,9 +775,9 @@ static void cs_timedout(struct work_struct *work)
 
 	if (likely(!skip_reset_on_timeout)) {
 		if (hdev->reset_on_lockup)
-			hl_device_reset(hdev, HL_RESET_TDR);
+			hl_device_reset(hdev, HL_DRV_RESET_TDR);
 		else
-			hdev->needs_reset = true;
+			hdev->reset_info.needs_reset = true;
 	}
 }
 
@@ -806,7 +814,7 @@ static int allocate_cs(struct hl_device *hdev, struct hl_ctx *ctx,
 	cs->encaps_signals = !!(flags & HL_CS_FLAGS_ENCAP_SIGNALS);
 	cs->timeout_jiffies = timeout;
 	cs->skip_reset_on_timeout =
-		hdev->skip_reset_on_timeout ||
+		hdev->reset_info.skip_reset_on_timeout ||
 		!!(flags & HL_CS_FLAGS_SKIP_RESET_ON_TIMEOUT);
 	cs->submission_time_jiffies = jiffies;
 	INIT_LIST_HEAD(&cs->job_list);
@@ -1131,9 +1139,6 @@ static int hl_cs_sanity_checks(struct hl_fpriv *hpriv, union hl_cs_args *args)
 	enum hl_cs_type cs_type;
 
 	if (!hl_device_operational(hdev, &status)) {
-		dev_warn_ratelimited(hdev->dev,
-			"Device is %s. Can't submit new CS\n",
-			hdev->status[status]);
 		return -EBUSY;
 	}
 
@@ -1262,7 +1267,8 @@ static u32 get_stream_master_qid_mask(struct hl_device *hdev, u32 qid)
 
 static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 				u32 num_chunks, u64 *cs_seq, u32 flags,
-				u32 encaps_signals_handle, u32 timeout)
+				u32 encaps_signals_handle, u32 timeout,
+				u16 *signal_initial_sob_count)
 {
 	bool staged_mid, int_queues_only = true;
 	struct hl_device *hdev = hpriv->hdev;
@@ -1429,6 +1435,8 @@ static int cs_ioctl_default(struct hl_fpriv *hpriv, void __user *chunks,
 		goto free_cs_object;
 	}
 
+	*signal_initial_sob_count = cs->initial_sob_count;
+
 	rc = HL_CS_STATUS_SUCCESS;
 	goto put_cs;
 
@@ -1457,6 +1465,7 @@ static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
 	int rc = 0, do_ctx_switch;
 	void __user *chunks;
 	u32 num_chunks, tmp;
+	u16 sob_count;
 	int ret;
 
 	do_ctx_switch = atomic_cmpxchg(&ctx->thread_ctx_switch_token, 1, 0);
@@ -1497,7 +1506,7 @@ static int hl_cs_ctx_switch(struct hl_fpriv *hpriv, union hl_cs_args *args,
 			rc = 0;
 		} else {
 			rc = cs_ioctl_default(hpriv, chunks, num_chunks,
-					cs_seq, 0, 0, hdev->timeout_jiffies);
+					cs_seq, 0, 0, hdev->timeout_jiffies, &sob_count);
 		}
 
 		mutex_unlock(&hpriv->restore_phase_mutex);
@@ -1813,6 +1822,9 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
 	}
 
 	handle->count = count;
+
+	hl_ctx_get(hdev, hpriv->ctx);
+	handle->ctx = hpriv->ctx;
 	mgr = &hpriv->ctx->sig_mgr;
 
 	spin_lock(&mgr->lock);
@@ -1822,7 +1834,7 @@ static int cs_ioctl_reserve_signals(struct hl_fpriv *hpriv,
 	if (hdl_id < 0) {
 		dev_err(hdev->dev, "Failed to allocate IDR for a new signal reservation\n");
 		rc = -EINVAL;
-		goto out;
+		goto put_ctx;
 	}
 
 	handle->id = hdl_id;
@@ -1875,7 +1887,10 @@ remove_idr:
 	idr_remove(&mgr->handles, hdl_id);
 	spin_unlock(&mgr->lock);
 
+put_ctx:
+	hl_ctx_put(handle->ctx);
 	kfree(handle);
+
 out:
 	return rc;
 }
@@ -1935,6 +1950,7 @@ static int cs_ioctl_unreserve_signals(struct hl_fpriv *hpriv, u32 handle_id)
 
 		/* Release the id and free allocated memory of the handle */
 		idr_remove(&mgr->handles, handle_id);
+		hl_ctx_put(encaps_sig_hdl->ctx);
 		kfree(encaps_sig_hdl);
 	} else {
 		rc = -EINVAL;
@@ -1948,7 +1964,8 @@ out:
 
 static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 				void __user *chunks, u32 num_chunks,
-				u64 *cs_seq, u32 flags, u32 timeout)
+				u64 *cs_seq, u32 flags, u32 timeout,
+				u32 *signal_sob_addr_offset, u16 *signal_initial_sob_count)
 {
 	struct hl_cs_encaps_sig_handle *encaps_sig_hdl = NULL;
 	bool handle_found = false, is_wait_cs = false,
@@ -2180,6 +2197,9 @@ static int cs_ioctl_signal_wait(struct hl_fpriv *hpriv, enum hl_cs_type cs_type,
 		goto free_cs_object;
 	}
 
+	*signal_sob_addr_offset = cs->sob_addr_offset;
+	*signal_initial_sob_count = cs->initial_sob_count;
+
 	rc = HL_CS_STATUS_SUCCESS;
 	if (is_wait_cs)
 		wait_cs_submitted = true;
@@ -2210,6 +2230,7 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 	void __user *chunks;
 	u32 num_chunks, flags, timeout,
 		signals_count = 0, sob_addr = 0, handle_id = 0;
+	u16 sob_initial_count = 0;
 	int rc;
 
 	rc = hl_cs_sanity_checks(hpriv, args);
@@ -2240,7 +2261,8 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 	case CS_TYPE_WAIT:
 	case CS_TYPE_COLLECTIVE_WAIT:
 		rc = cs_ioctl_signal_wait(hpriv, cs_type, chunks, num_chunks,
-					&cs_seq, args->in.cs_flags, timeout);
+					&cs_seq, args->in.cs_flags, timeout,
+					&sob_addr, &sob_initial_count);
 		break;
 	case CS_RESERVE_SIGNALS:
 		rc = cs_ioctl_reserve_signals(hpriv,
@@ -2256,20 +2278,33 @@ int hl_cs_ioctl(struct hl_fpriv *hpriv, void *data)
 		rc = cs_ioctl_default(hpriv, chunks, num_chunks, &cs_seq,
 						args->in.cs_flags,
 						args->in.encaps_sig_handle_id,
-						timeout);
+						timeout, &sob_initial_count);
 		break;
 	}
 out:
 	if (rc != -EAGAIN) {
 		memset(args, 0, sizeof(*args));
 
-		if (cs_type == CS_RESERVE_SIGNALS) {
+		switch (cs_type) {
+		case CS_RESERVE_SIGNALS:
 			args->out.handle_id = handle_id;
 			args->out.sob_base_addr_offset = sob_addr;
 			args->out.count = signals_count;
-		} else {
+			break;
+		case CS_TYPE_SIGNAL:
+			args->out.sob_base_addr_offset = sob_addr;
+			args->out.sob_count_before_submission = sob_initial_count;
+			args->out.seq = cs_seq;
+			break;
+		case CS_TYPE_DEFAULT:
+			args->out.sob_count_before_submission = sob_initial_count;
 			args->out.seq = cs_seq;
+			break;
+		default:
+			args->out.seq = cs_seq;
+			break;
 		}
+
 		args->out.status = rc;
 	}
 
@@ -2334,16 +2369,18 @@ static int hl_wait_for_fence(struct hl_ctx *ctx, u64 seq, struct hl_fence *fence
  * hl_cs_poll_fences - iterate CS fences to check for CS completion
  *
  * @mcs_data: multi-CS internal data
+ * @mcs_compl: multi-CS completion structure
  *
  * @return 0 on success, otherwise non 0 error code
  *
  * The function iterates on all CS sequence in the list and set bit in
  * completion_bitmap for each completed CS.
- * while iterating, the function can extracts the stream map to be later
- * used by the waiting function.
- * this function shall be called after taking context ref
+ * While iterating, the function sets the stream map of each fence in the fence
+ * array in the completion QID stream map to be used by CSs to perform
+ * completion to the multi-CS context.
+ * This function shall be called after taking context ref
  */
-static int hl_cs_poll_fences(struct multi_cs_data *mcs_data)
+static int hl_cs_poll_fences(struct multi_cs_data *mcs_data, struct multi_cs_completion *mcs_compl)
 {
 	struct hl_fence **fence_ptr = mcs_data->fence_arr;
 	struct hl_device *hdev = mcs_data->ctx->hdev;
@@ -2360,6 +2397,15 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data)
 		return rc;
 
 	/*
+	 * re-initialize the completion here to handle 2 possible cases:
+	 * 1. CS will complete the multi-CS prior clearing the completion. in which
+	 *    case the fence iteration is guaranteed to catch the CS completion.
+	 * 2. the completion will occur after re-init of the completion.
+	 *    in which case we will wake up immediately in wait_for_completion.
+	 */
+	reinit_completion(&mcs_compl->completion);
+
+	/*
 	 * set to maximum time to verify timestamp is valid: if at the end
 	 * this value is maintained- no timestamp was updated
 	 */
@@ -2370,6 +2416,21 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data)
 		struct hl_fence *fence = *fence_ptr;
 
 		/*
+		 * In order to prevent case where we wait until timeout even though a CS associated
+		 * with the multi-CS actually completed we do things in the below order:
+		 * 1. for each fence set it's QID map in the multi-CS completion QID map. This way
+		 *    any CS can, potentially, complete the multi CS for the specific QID (note
+		 *    that once completion is initialized, calling complete* and then wait on the
+		 *    completion will cause it to return at once)
+		 * 2. only after allowing multi-CS completion for the specific QID we check whether
+		 *    the specific CS already completed (and thus the wait for completion part will
+		 *    be skipped). if the CS not completed it is guaranteed that completing CS will
+		 *    wake up the completion.
+		 */
+		if (fence)
+			mcs_compl->stream_master_qid_map |= fence->stream_master_qid_map;
+
+		/*
 		 * function won't sleep as it is called with timeout 0 (i.e.
 		 * poll the fence)
 		 */
@@ -2384,9 +2445,7 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data)
 
 		switch (status) {
 		case CS_WAIT_STATUS_BUSY:
-			/* CS did not finished, keep waiting on its QID*/
-			mcs_data->stream_master_qid_map |=
-					fence->stream_master_qid_map;
+			/* CS did not finished, QID to wait on already stored */
 			break;
 		case CS_WAIT_STATUS_COMPLETED:
 			/*
@@ -2394,9 +2453,19 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data)
 			 * returns to user indicating CS completed before it finished
 			 * all of its mcs handling, to avoid race the next time the
 			 * user waits for mcs.
+			 * note: when reaching this case fence is definitely not NULL
+			 *       but NULL check was added to overcome static analysis
 			 */
-			if (!fence->mcs_handling_done)
+			if (fence && !fence->mcs_handling_done) {
+				/*
+				 * in case multi CS is completed but MCS handling not done
+				 * we "complete" the multi CS to prevent it from waiting
+				 * until time-out and the "multi-CS handling done" will have
+				 * another chance at the next iteration
+				 */
+				complete_all(&mcs_compl->completion);
 				break;
+			}
 
 			mcs_data->completion_bitmap |= BIT(i);
 			/*
@@ -2456,6 +2525,21 @@ static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 	return rc;
 }
 
+static inline unsigned long hl_usecs64_to_jiffies(const u64 usecs)
+{
+	if (usecs <= U32_MAX)
+		return usecs_to_jiffies(usecs);
+
+	/*
+	 * If the value in nanoseconds is larger than 64 bit, use the largest
+	 * 64 bit value.
+	 */
+	if (usecs >= ((u64)(U64_MAX / NSEC_PER_USEC)))
+		return nsecs_to_jiffies(U64_MAX);
+
+	return nsecs_to_jiffies(usecs * NSEC_PER_USEC);
+}
+
 /*
  * hl_wait_multi_cs_completion_init - init completion structure
  *
@@ -2469,9 +2553,7 @@ static int _hl_cs_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
  * the function gets the first available completion (by marking it "used")
  * and initialize its values.
  */
-static struct multi_cs_completion *hl_wait_multi_cs_completion_init(
-							struct hl_device *hdev,
-							u8 stream_master_bitmap)
+static struct multi_cs_completion *hl_wait_multi_cs_completion_init(struct hl_device *hdev)
 {
 	struct multi_cs_completion *mcs_compl;
 	int i;
@@ -2483,8 +2565,11 @@ static struct multi_cs_completion *hl_wait_multi_cs_completion_init(
 		if (!mcs_compl->used) {
 			mcs_compl->used = 1;
 			mcs_compl->timestamp = 0;
-			mcs_compl->stream_master_qid_map = stream_master_bitmap;
-			reinit_completion(&mcs_compl->completion);
+			/*
+			 * init QID map to 0 to avoid completion by CSs. the actual QID map
+			 * to multi-CS CSs will be set incrementally at a later stage
+			 */
+			mcs_compl->stream_master_qid_map = 0;
 			spin_unlock(&mcs_compl->lock);
 			break;
 		}
@@ -2492,8 +2577,7 @@ static struct multi_cs_completion *hl_wait_multi_cs_completion_init(
 	}
 
 	if (i == MULTI_CS_MAX_USER_CTX) {
-		dev_err(hdev->dev,
-				"no available multi-CS completion structure\n");
+		dev_err(hdev->dev, "no available multi-CS completion structure\n");
 		return ERR_PTR(-ENOMEM);
 	}
 	return mcs_compl;
@@ -2524,27 +2608,18 @@ static void hl_wait_multi_cs_completion_fini(
  *
  * @return 0 on success, otherwise non 0 error code
  */
-static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data)
+static int hl_wait_multi_cs_completion(struct multi_cs_data *mcs_data,
+						struct multi_cs_completion *mcs_compl)
 {
-	struct hl_device *hdev = mcs_data->ctx->hdev;
-	struct multi_cs_completion *mcs_compl;
 	long completion_rc;
 
-	mcs_compl = hl_wait_multi_cs_completion_init(hdev,
-					mcs_data->stream_master_qid_map);
-	if (IS_ERR(mcs_compl))
-		return PTR_ERR(mcs_compl);
-
-	completion_rc = wait_for_completion_interruptible_timeout(
-					&mcs_compl->completion,
-					usecs_to_jiffies(mcs_data->timeout_us));
+	completion_rc = wait_for_completion_interruptible_timeout(&mcs_compl->completion,
+									mcs_data->timeout_jiffies);
 
 	/* update timestamp */
 	if (completion_rc > 0)
 		mcs_data->timestamp = mcs_compl->timestamp;
 
-	hl_wait_multi_cs_completion_fini(mcs_compl);
-
 	mcs_data->wait_status = completion_rc;
 
 	return 0;
@@ -2577,6 +2652,7 @@ void hl_multi_cs_completion_init(struct hl_device *hdev)
  */
 static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 {
+	struct multi_cs_completion *mcs_compl;
 	struct hl_device *hdev = hpriv->hdev;
 	struct multi_cs_data mcs_data = {0};
 	union hl_wait_cs_args *args = data;
@@ -2631,9 +2707,17 @@ static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 
 	hl_ctx_get(hdev, ctx);
 
+	/* wait (with timeout) for the first CS to be completed */
+	mcs_data.timeout_jiffies = hl_usecs64_to_jiffies(args->in.timeout_us);
+	mcs_compl = hl_wait_multi_cs_completion_init(hdev);
+	if (IS_ERR(mcs_compl)) {
+		rc = PTR_ERR(mcs_compl);
+		goto put_ctx;
+	}
+
 	/* poll all CS fences, extract timestamp */
 	mcs_data.update_ts = true;
-	rc = hl_cs_poll_fences(&mcs_data);
+	rc = hl_cs_poll_fences(&mcs_data, mcs_compl);
 	/*
 	 * skip wait for CS completion when one of the below is true:
 	 * - an error on the poll function
@@ -2641,34 +2725,39 @@ static int hl_multi_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 	 * - the user called ioctl with timeout 0
 	 */
 	if (rc || mcs_data.completion_bitmap || !args->in.timeout_us)
-		goto put_ctx;
+		goto completion_fini;
 
-	/* wait (with timeout) for the first CS to be completed */
-	mcs_data.timeout_us = args->in.timeout_us;
-	rc = hl_wait_multi_cs_completion(&mcs_data);
-	if (rc)
-		goto put_ctx;
+	while (true) {
+		rc = hl_wait_multi_cs_completion(&mcs_data, mcs_compl);
+		if (rc || (mcs_data.wait_status == 0))
+			break;
 
-	if (mcs_data.wait_status > 0) {
 		/*
 		 * poll fences once again to update the CS map.
 		 * no timestamp should be updated this time.
 		 */
 		mcs_data.update_ts = false;
-		rc = hl_cs_poll_fences(&mcs_data);
+		rc = hl_cs_poll_fences(&mcs_data, mcs_compl);
+
+		if (mcs_data.completion_bitmap)
+			break;
 
 		/*
 		 * if hl_wait_multi_cs_completion returned before timeout (i.e.
-		 * it got a completion) we expect to see at least one CS
-		 * completed after the poll function.
+		 * it got a completion) it either got completed by CS in the multi CS list
+		 * (in which case the indication will be non empty completion_bitmap) or it
+		 * got completed by CS submitted to one of the shared stream master but
+		 * not in the multi CS list (in which case we should wait again but modify
+		 * the timeout and set timestamp as zero to let a CS related to the current
+		 * multi-CS set a new, relevant, timestamp)
 		 */
-		if (!mcs_data.completion_bitmap) {
-			dev_warn_ratelimited(hdev->dev,
-				"Multi-CS got completion on wait but no CS completed\n");
-			rc = -EFAULT;
-		}
+		mcs_data.timeout_jiffies = mcs_data.wait_status;
+		mcs_compl->timestamp = 0;
 	}
 
+completion_fini:
+	hl_wait_multi_cs_completion_fini(mcs_compl);
+
 put_ctx:
 	hl_ctx_put(ctx);
 	kfree(fence_arr);
@@ -2699,7 +2788,7 @@ free_seq_arr:
 		}
 
 		/* update if some CS was gone */
-		if (mcs_data.timestamp)
+		if (!mcs_data.timestamp)
 			args->out.flags |= HL_WAIT_CS_STATUS_FLAG_GONE;
 	} else {
 		args->out.status = HL_WAIT_CS_STATUS_BUSY;
@@ -2766,37 +2855,129 @@ static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 }
 
 static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
-				u32 timeout_us, u64 user_address,
-				u64 target_value, u16 interrupt_offset,
-				enum hl_cs_wait_status *status,
+				struct hl_cb_mgr *cb_mgr, u64 timeout_us,
+				u64 cq_counters_handle,	u64 cq_counters_offset,
+				u64 target_value, struct hl_user_interrupt *interrupt,
+				u32 *status,
 				u64 *timestamp)
 {
 	struct hl_user_pending_interrupt *pend;
-	struct hl_user_interrupt *interrupt;
 	unsigned long timeout, flags;
-	u64 completion_value;
 	long completion_rc;
+	struct hl_cb *cb;
 	int rc = 0;
+	u32 handle;
 
-	if (timeout_us == U32_MAX)
-		timeout = timeout_us;
-	else
-		timeout = usecs_to_jiffies(timeout_us);
+	timeout = hl_usecs64_to_jiffies(timeout_us);
 
 	hl_ctx_get(hdev, ctx);
 
-	pend = kmalloc(sizeof(*pend), GFP_KERNEL);
+	cq_counters_handle >>= PAGE_SHIFT;
+	handle = (u32) cq_counters_handle;
+
+	cb = hl_cb_get(hdev, cb_mgr, handle);
+	if (!cb) {
+		hl_ctx_put(ctx);
+		return -EINVAL;
+	}
+
+	pend = kzalloc(sizeof(*pend), GFP_KERNEL);
 	if (!pend) {
+		hl_cb_put(cb);
 		hl_ctx_put(ctx);
 		return -ENOMEM;
 	}
 
 	hl_fence_init(&pend->fence, ULONG_MAX);
 
-	if (interrupt_offset == HL_COMMON_USER_INTERRUPT_ID)
-		interrupt = &hdev->common_user_interrupt;
-	else
-		interrupt = &hdev->user_interrupt[interrupt_offset];
+	pend->cq_kernel_addr = (u64 *) cb->kernel_address + cq_counters_offset;
+	pend->cq_target_value = target_value;
+
+	/* We check for completion value as interrupt could have been received
+	 * before we added the node to the wait list
+	 */
+	if (*pend->cq_kernel_addr >= target_value) {
+		*status = HL_WAIT_CS_STATUS_COMPLETED;
+		/* There was no interrupt, we assume the completion is now. */
+		pend->fence.timestamp = ktime_get();
+	}
+
+	if (!timeout_us || (*status == HL_WAIT_CS_STATUS_COMPLETED))
+		goto set_timestamp;
+
+	/* Add pending user interrupt to relevant list for the interrupt
+	 * handler to monitor
+	 */
+	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
+	list_add_tail(&pend->wait_list_node, &interrupt->wait_list_head);
+	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
+
+	/* Wait for interrupt handler to signal completion */
+	completion_rc = wait_for_completion_interruptible_timeout(&pend->fence.completion,
+								timeout);
+	if (completion_rc > 0) {
+		*status = HL_WAIT_CS_STATUS_COMPLETED;
+	} else {
+		if (completion_rc == -ERESTARTSYS) {
+			dev_err_ratelimited(hdev->dev,
+					"user process got signal while waiting for interrupt ID %d\n",
+					interrupt->interrupt_id);
+			rc = -EINTR;
+			*status = HL_WAIT_CS_STATUS_ABORTED;
+		} else {
+			if (pend->fence.error == -EIO) {
+				dev_err_ratelimited(hdev->dev,
+						"interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n",
+						pend->fence.error);
+				rc = -EIO;
+				*status = HL_WAIT_CS_STATUS_ABORTED;
+			} else {
+				dev_err_ratelimited(hdev->dev, "Waiting for interrupt ID %d timedout\n",
+						interrupt->interrupt_id);
+				rc = -ETIMEDOUT;
+			}
+			*status = HL_WAIT_CS_STATUS_BUSY;
+		}
+	}
+
+	spin_lock_irqsave(&interrupt->wait_list_lock, flags);
+	list_del(&pend->wait_list_node);
+	spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
+
+set_timestamp:
+	*timestamp = ktime_to_ns(pend->fence.timestamp);
+
+	kfree(pend);
+	hl_cb_put(cb);
+	hl_ctx_put(ctx);
+
+	return rc;
+}
+
+static int _hl_interrupt_wait_ioctl_user_addr(struct hl_device *hdev, struct hl_ctx *ctx,
+				u64 timeout_us, u64 user_address,
+				u64 target_value, struct hl_user_interrupt *interrupt,
+
+				u32 *status,
+				u64 *timestamp)
+{
+	struct hl_user_pending_interrupt *pend;
+	unsigned long timeout, flags;
+	u64 completion_value;
+	long completion_rc;
+	int rc = 0;
+
+	timeout = hl_usecs64_to_jiffies(timeout_us);
+
+	hl_ctx_get(hdev, ctx);
+
+	pend = kzalloc(sizeof(*pend), GFP_KERNEL);
+	if (!pend) {
+		hl_ctx_put(ctx);
+		return -ENOMEM;
+	}
+
+	hl_fence_init(&pend->fence, ULONG_MAX);
 
 	/* Add pending user interrupt to relevant list for the interrupt
 	 * handler to monitor
@@ -2815,13 +2996,14 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
 	}
 
 	if (completion_value >= target_value) {
-		*status = CS_WAIT_STATUS_COMPLETED;
+		*status = HL_WAIT_CS_STATUS_COMPLETED;
 		/* There was no interrupt, we assume the completion is now. */
 		pend->fence.timestamp = ktime_get();
-	} else
-		*status = CS_WAIT_STATUS_BUSY;
+	} else {
+		*status = HL_WAIT_CS_STATUS_BUSY;
+	}
 
-	if (!timeout_us || (*status == CS_WAIT_STATUS_COMPLETED))
+	if (!timeout_us || (*status == HL_WAIT_CS_STATUS_COMPLETED))
 		goto remove_pending_user_interrupt;
 
 wait_again:
@@ -2850,7 +3032,13 @@ wait_again:
 		}
 
 		if (completion_value >= target_value) {
-			*status = CS_WAIT_STATUS_COMPLETED;
+			*status = HL_WAIT_CS_STATUS_COMPLETED;
+		} else if (pend->fence.error) {
+			dev_err_ratelimited(hdev->dev,
+				"interrupt based wait ioctl aborted(error:%d) due to a reset cycle initiated\n",
+				pend->fence.error);
+			/* set the command completion status as ABORTED */
+			*status = HL_WAIT_CS_STATUS_ABORTED;
 		} else {
 			timeout = completion_rc;
 			goto wait_again;
@@ -2861,7 +3049,7 @@ wait_again:
 			interrupt->interrupt_id);
 		rc = -EINTR;
 	} else {
-		*status = CS_WAIT_STATUS_BUSY;
+		*status = HL_WAIT_CS_STATUS_BUSY;
 	}
 
 remove_pending_user_interrupt:
@@ -2879,11 +3067,12 @@ remove_pending_user_interrupt:
 
 static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 {
-	u16 interrupt_id, interrupt_offset, first_interrupt, last_interrupt;
+	u16 interrupt_id, first_interrupt, last_interrupt;
 	struct hl_device *hdev = hpriv->hdev;
 	struct asic_fixed_properties *prop;
+	struct hl_user_interrupt *interrupt;
 	union hl_wait_cs_args *args = data;
-	enum hl_cs_wait_status status;
+	u32 status = HL_WAIT_CS_STATUS_BUSY;
 	u64 timestamp;
 	int rc;
 
@@ -2894,8 +3083,7 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 		return -EPERM;
 	}
 
-	interrupt_id =
-		FIELD_GET(HL_WAIT_CS_FLAGS_INTERRUPT_MASK, args->in.flags);
+	interrupt_id = FIELD_GET(HL_WAIT_CS_FLAGS_INTERRUPT_MASK, args->in.flags);
 
 	first_interrupt = prop->first_available_user_msix_interrupt;
 	last_interrupt = prop->first_available_user_msix_interrupt +
@@ -2908,15 +3096,21 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 	}
 
 	if (interrupt_id == HL_COMMON_USER_INTERRUPT_ID)
-		interrupt_offset = HL_COMMON_USER_INTERRUPT_ID;
+		interrupt = &hdev->common_user_interrupt;
 	else
-		interrupt_offset = interrupt_id - first_interrupt;
+		interrupt = &hdev->user_interrupt[interrupt_id - first_interrupt];
 
-	rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx,
+	if (args->in.flags & HL_WAIT_CS_FLAGS_INTERRUPT_KERNEL_CQ)
+		rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx, &hpriv->cb_mgr,
+				args->in.interrupt_timeout_us, args->in.cq_counters_handle,
+				args->in.cq_counters_offset,
+				args->in.target, interrupt, &status,
+				&timestamp);
+	else
+		rc = _hl_interrupt_wait_ioctl_user_addr(hdev, hpriv->ctx,
 				args->in.interrupt_timeout_us, args->in.addr,
-				args->in.target, interrupt_offset, &status,
+				args->in.target, interrupt, &status,
 				&timestamp);
-
 	if (rc) {
 		if (rc != -EINTR)
 			dev_err_ratelimited(hdev->dev,
@@ -2926,22 +3120,13 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 	}
 
 	memset(args, 0, sizeof(*args));
+	args->out.status = status;
 
 	if (timestamp) {
 		args->out.timestamp_nsec = timestamp;
 		args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
 	}
 
-	switch (status) {
-	case CS_WAIT_STATUS_COMPLETED:
-		args->out.status = HL_WAIT_CS_STATUS_COMPLETED;
-		break;
-	case CS_WAIT_STATUS_BUSY:
-	default:
-		args->out.status = HL_WAIT_CS_STATUS_BUSY;
-		break;
-	}
-
 	return 0;
 }
 
@@ -2955,7 +3140,7 @@ int hl_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 	 * user interrupt
 	 */
 	if (!hl_device_operational(hpriv->hdev, NULL))
-		return -EPERM;
+		return -EBUSY;
 
 	if (flags & HL_WAIT_CS_FLAGS_INTERRUPT)
 		rc = hl_interrupt_wait_ioctl(hpriv, data);
diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
index d0aaccd4df2c..c6360e33bce8 100644
--- a/drivers/misc/habanalabs/common/context.c
+++ b/drivers/misc/habanalabs/common/context.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /*
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  */
 
@@ -13,13 +13,13 @@ void hl_encaps_handle_do_release(struct kref *ref)
 {
 	struct hl_cs_encaps_sig_handle *handle =
 		container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
-	struct hl_ctx *ctx = handle->hdev->compute_ctx;
-	struct hl_encaps_signals_mgr *mgr = &ctx->sig_mgr;
+	struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr;
 
 	spin_lock(&mgr->lock);
 	idr_remove(&mgr->handles, handle->id);
 	spin_unlock(&mgr->lock);
 
+	hl_ctx_put(handle->ctx);
 	kfree(handle);
 }
 
@@ -27,8 +27,7 @@ static void hl_encaps_handle_do_release_sob(struct kref *ref)
 {
 	struct hl_cs_encaps_sig_handle *handle =
 		container_of(ref, struct hl_cs_encaps_sig_handle, refcount);
-	struct hl_ctx *ctx = handle->hdev->compute_ctx;
-	struct hl_encaps_signals_mgr *mgr = &ctx->sig_mgr;
+	struct hl_encaps_signals_mgr *mgr = &handle->ctx->sig_mgr;
 
 	/* if we're here, then there was a signals reservation but cs with
 	 * encaps signals wasn't submitted, so need to put refcount
@@ -40,6 +39,7 @@ static void hl_encaps_handle_do_release_sob(struct kref *ref)
 	idr_remove(&mgr->handles, handle->id);
 	spin_unlock(&mgr->lock);
 
+	hl_ctx_put(handle->ctx);
 	kfree(handle);
 }
 
@@ -97,11 +97,9 @@ static void hl_ctx_fini(struct hl_ctx *ctx)
 		/* The engines are stopped as there is no executing CS, but the
 		 * Coresight might be still working by accessing addresses
 		 * related to the stopped engines. Hence stop it explicitly.
-		 * Stop only if this is the compute context, as there can be
-		 * only one compute context
 		 */
-		if ((hdev->in_debug) && (hdev->compute_ctx == ctx))
-			hl_device_set_debug_mode(hdev, false);
+		if (hdev->in_debug)
+			hl_device_set_debug_mode(hdev, ctx, false);
 
 		hdev->asic_funcs->ctx_fini(ctx);
 		hl_cb_va_pool_fini(ctx);
@@ -167,7 +165,7 @@ int hl_ctx_create(struct hl_device *hdev, struct hl_fpriv *hpriv)
 	hpriv->ctx = ctx;
 
 	/* TODO: remove the following line for multiple process support */
-	hdev->compute_ctx = ctx;
+	hdev->is_compute_ctx_active = true;
 
 	return 0;
 
@@ -274,6 +272,27 @@ int hl_ctx_put(struct hl_ctx *ctx)
 	return kref_put(&ctx->refcount, hl_ctx_do_release);
 }
 
+struct hl_ctx *hl_get_compute_ctx(struct hl_device *hdev)
+{
+	struct hl_ctx *ctx = NULL;
+	struct hl_fpriv *hpriv;
+
+	mutex_lock(&hdev->fpriv_list_lock);
+
+	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) {
+		/* There can only be a single user which has opened the compute device, so exit
+		 * immediately once we find him
+		 */
+		ctx = hpriv->ctx;
+		hl_ctx_get(hdev, ctx);
+		break;
+	}
+
+	mutex_unlock(&hdev->fpriv_list_lock);
+
+	return ctx;
+}
+
 /*
  * hl_ctx_get_fence_locked - get CS fence under CS lock
  *
diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
index 1f2a3dc6c4e2..fc084ee5106e 100644
--- a/drivers/misc/habanalabs/common/debugfs.c
+++ b/drivers/misc/habanalabs/common/debugfs.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /*
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  */
 
@@ -15,19 +15,25 @@
 #define MMU_ADDR_BUF_SIZE	40
 #define MMU_ASID_BUF_SIZE	10
 #define MMU_KBUF_SIZE		(MMU_ADDR_BUF_SIZE + MMU_ASID_BUF_SIZE)
+#define I2C_MAX_TRANSACTION_LEN	8
 
 static struct dentry *hl_debug_root;
 
 static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
-				u8 i2c_reg, long *val)
+				u8 i2c_reg, u8 i2c_len, u64 *val)
 {
 	struct cpucp_packet pkt;
-	u64 result;
 	int rc;
 
 	if (!hl_device_operational(hdev, NULL))
 		return -EBUSY;
 
+	if (i2c_len > I2C_MAX_TRANSACTION_LEN) {
+		dev_err(hdev->dev, "I2C transaction length %u, exceeds maximum of %u\n",
+				i2c_len, I2C_MAX_TRANSACTION_LEN);
+		return -EINVAL;
+	}
+
 	memset(&pkt, 0, sizeof(pkt));
 
 	pkt.ctl = cpu_to_le32(CPUCP_PACKET_I2C_RD <<
@@ -35,12 +41,10 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
 	pkt.i2c_bus = i2c_bus;
 	pkt.i2c_addr = i2c_addr;
 	pkt.i2c_reg = i2c_reg;
+	pkt.i2c_len = i2c_len;
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
-						0, &result);
-
-	*val = (long) result;
-
+						0, val);
 	if (rc)
 		dev_err(hdev->dev, "Failed to read from I2C, error %d\n", rc);
 
@@ -48,7 +52,7 @@ static int hl_debugfs_i2c_read(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
 }
 
 static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
-				u8 i2c_reg, u32 val)
+				u8 i2c_reg, u8 i2c_len, u64 val)
 {
 	struct cpucp_packet pkt;
 	int rc;
@@ -56,6 +60,12 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
 	if (!hl_device_operational(hdev, NULL))
 		return -EBUSY;
 
+	if (i2c_len > I2C_MAX_TRANSACTION_LEN) {
+		dev_err(hdev->dev, "I2C transaction length %u, exceeds maximum of %u\n",
+				i2c_len, I2C_MAX_TRANSACTION_LEN);
+		return -EINVAL;
+	}
+
 	memset(&pkt, 0, sizeof(pkt));
 
 	pkt.ctl = cpu_to_le32(CPUCP_PACKET_I2C_WR <<
@@ -63,6 +73,7 @@ static int hl_debugfs_i2c_write(struct hl_device *hdev, u8 i2c_bus, u8 i2c_addr,
 	pkt.i2c_bus = i2c_bus;
 	pkt.i2c_addr = i2c_addr;
 	pkt.i2c_reg = i2c_reg;
+	pkt.i2c_len = i2c_len;
 	pkt.value = cpu_to_le64(val);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
@@ -235,6 +246,8 @@ static int vm_show(struct seq_file *s, void *data)
 	struct hl_vm_hash_node *hnode;
 	struct hl_userptr *userptr;
 	struct hl_vm_phys_pg_pack *phys_pg_pack = NULL;
+	struct hl_va_range *va_range;
+	struct hl_vm_va_block *va_block;
 	enum vm_type *vm_type;
 	bool once = true;
 	u64 j;
@@ -314,6 +327,25 @@ static int vm_show(struct seq_file *s, void *data)
 
 	spin_unlock(&dev_entry->ctx_mem_hash_spinlock);
 
+	ctx = hl_get_compute_ctx(dev_entry->hdev);
+	if (ctx) {
+		seq_puts(s, "\nVA ranges:\n\n");
+		for (i = HL_VA_RANGE_TYPE_HOST ; i < HL_VA_RANGE_TYPE_MAX ; ++i) {
+			va_range = ctx->va_range[i];
+			seq_printf(s, "   va_range %d\n", i);
+			seq_puts(s, "---------------------\n");
+			mutex_lock(&va_range->lock);
+			list_for_each_entry(va_block, &va_range->list, node) {
+				seq_printf(s, "%#16llx - %#16llx (%#llx)\n",
+					   va_block->start, va_block->end,
+					   va_block->size);
+			}
+			mutex_unlock(&va_range->lock);
+			seq_puts(s, "\n");
+		}
+		hl_ctx_put(ctx);
+	}
+
 	if (!once)
 		seq_puts(s, "\n");
 
@@ -407,7 +439,7 @@ static int mmu_show(struct seq_file *s, void *data)
 	if (dev_entry->mmu_asid == HL_KERNEL_ASID_ID)
 		ctx = hdev->kernel_ctx;
 	else
-		ctx = hdev->compute_ctx;
+		ctx = hl_get_compute_ctx(hdev);
 
 	if (!ctx) {
 		dev_err(hdev->dev, "no ctx available\n");
@@ -495,7 +527,7 @@ static int engines_show(struct seq_file *s, void *data)
 	struct hl_dbg_device_entry *dev_entry = entry->dev_entry;
 	struct hl_device *hdev = dev_entry->hdev;
 
-	if (atomic_read(&hdev->in_reset)) {
+	if (hdev->reset_info.in_reset) {
 		dev_warn_ratelimited(hdev->dev,
 				"Can't check device idle during reset\n");
 		return 0;
@@ -560,7 +592,7 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr, u32 size,
 			u64 *phys_addr)
 {
 	struct hl_vm_phys_pg_pack *phys_pg_pack;
-	struct hl_ctx *ctx = hdev->compute_ctx;
+	struct hl_ctx *ctx;
 	struct hl_vm_hash_node *hnode;
 	u64 end_address, range_size;
 	struct hl_userptr *userptr;
@@ -568,6 +600,8 @@ static int device_va_to_pa(struct hl_device *hdev, u64 virt_addr, u32 size,
 	bool valid = false;
 	int i, rc = 0;
 
+	ctx = hl_get_compute_ctx(hdev);
+
 	if (!ctx) {
 		dev_err(hdev->dev, "no ctx available\n");
 		return -EINVAL;
@@ -624,7 +658,7 @@ static ssize_t hl_data_read32(struct file *f, char __user *buf,
 	ssize_t rc;
 	u32 val;
 
-	if (atomic_read(&hdev->in_reset)) {
+	if (hdev->reset_info.in_reset) {
 		dev_warn_ratelimited(hdev->dev, "Can't read during reset\n");
 		return 0;
 	}
@@ -660,7 +694,7 @@ static ssize_t hl_data_write32(struct file *f, const char __user *buf,
 	u32 value;
 	ssize_t rc;
 
-	if (atomic_read(&hdev->in_reset)) {
+	if (hdev->reset_info.in_reset) {
 		dev_warn_ratelimited(hdev->dev, "Can't write during reset\n");
 		return 0;
 	}
@@ -697,7 +731,7 @@ static ssize_t hl_data_read64(struct file *f, char __user *buf,
 	ssize_t rc;
 	u64 val;
 
-	if (atomic_read(&hdev->in_reset)) {
+	if (hdev->reset_info.in_reset) {
 		dev_warn_ratelimited(hdev->dev, "Can't read during reset\n");
 		return 0;
 	}
@@ -733,7 +767,7 @@ static ssize_t hl_data_write64(struct file *f, const char __user *buf,
 	u64 value;
 	ssize_t rc;
 
-	if (atomic_read(&hdev->in_reset)) {
+	if (hdev->reset_info.in_reset) {
 		dev_warn_ratelimited(hdev->dev, "Can't write during reset\n");
 		return 0;
 	}
@@ -768,7 +802,7 @@ static ssize_t hl_dma_size_write(struct file *f, const char __user *buf,
 	ssize_t rc;
 	u32 size;
 
-	if (atomic_read(&hdev->in_reset)) {
+	if (hdev->reset_info.in_reset) {
 		dev_warn_ratelimited(hdev->dev, "Can't DMA during reset\n");
 		return 0;
 	}
@@ -874,22 +908,22 @@ static ssize_t hl_i2c_data_read(struct file *f, char __user *buf,
 	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
 	struct hl_device *hdev = entry->hdev;
 	char tmp_buf[32];
-	long val;
+	u64 val;
 	ssize_t rc;
 
 	if (*ppos)
 		return 0;
 
 	rc = hl_debugfs_i2c_read(hdev, entry->i2c_bus, entry->i2c_addr,
-			entry->i2c_reg, &val);
+			entry->i2c_reg, entry->i2c_len, &val);
 	if (rc) {
 		dev_err(hdev->dev,
-			"Failed to read from I2C bus %d, addr %d, reg %d\n",
-			entry->i2c_bus, entry->i2c_addr, entry->i2c_reg);
+			"Failed to read from I2C bus %d, addr %d, reg %d, len %d\n",
+			entry->i2c_bus, entry->i2c_addr, entry->i2c_reg, entry->i2c_len);
 		return rc;
 	}
 
-	sprintf(tmp_buf, "0x%02lx\n", val);
+	sprintf(tmp_buf, "%#02llx\n", val);
 	rc = simple_read_from_buffer(buf, count, ppos, tmp_buf,
 			strlen(tmp_buf));
 
@@ -901,19 +935,19 @@ static ssize_t hl_i2c_data_write(struct file *f, const char __user *buf,
 {
 	struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
 	struct hl_device *hdev = entry->hdev;
-	u32 value;
+	u64 value;
 	ssize_t rc;
 
-	rc = kstrtouint_from_user(buf, count, 16, &value);
+	rc = kstrtou64_from_user(buf, count, 16, &value);
 	if (rc)
 		return rc;
 
 	rc = hl_debugfs_i2c_write(hdev, entry->i2c_bus, entry->i2c_addr,
-			entry->i2c_reg, value);
+			entry->i2c_reg, entry->i2c_len, value);
 	if (rc) {
 		dev_err(hdev->dev,
-			"Failed to write 0x%02x to I2C bus %d, addr %d, reg %d\n",
-			value, entry->i2c_bus, entry->i2c_addr, entry->i2c_reg);
+			"Failed to write %#02llx to I2C bus %d, addr %d, reg %d, len %d\n",
+			value, entry->i2c_bus, entry->i2c_addr, entry->i2c_reg, entry->i2c_len);
 		return rc;
 	}
 
@@ -1043,7 +1077,7 @@ static ssize_t hl_clk_gate_write(struct file *f, const char __user *buf,
 	u64 value;
 	ssize_t rc;
 
-	if (atomic_read(&hdev->in_reset)) {
+	if (hdev->reset_info.in_reset) {
 		dev_warn_ratelimited(hdev->dev,
 				"Can't change clock gating during reset\n");
 		return 0;
@@ -1085,7 +1119,7 @@ static ssize_t hl_stop_on_err_write(struct file *f, const char __user *buf,
 	u32 value;
 	ssize_t rc;
 
-	if (atomic_read(&hdev->in_reset)) {
+	if (hdev->reset_info.in_reset) {
 		dev_warn_ratelimited(hdev->dev,
 				"Can't change stop on error during reset\n");
 		return 0;
@@ -1396,6 +1430,11 @@ void hl_debugfs_add_device(struct hl_device *hdev)
 				dev_entry->root,
 				&dev_entry->i2c_reg);
 
+	debugfs_create_u8("i2c_len",
+				0644,
+				dev_entry->root,
+				&dev_entry->i2c_len);
+
 	debugfs_create_file("i2c_data",
 				0644,
 				dev_entry->root,
@@ -1458,7 +1497,7 @@ void hl_debugfs_add_device(struct hl_device *hdev)
 	debugfs_create_x8("skip_reset_on_timeout",
 				0644,
 				dev_entry->root,
-				&hdev->skip_reset_on_timeout);
+				&hdev->reset_info.skip_reset_on_timeout);
 
 	debugfs_create_file("state_dump",
 				0600,
diff --git a/drivers/misc/habanalabs/common/device.c b/drivers/misc/habanalabs/common/device.c
index 2022e5d7b3ad..733338ab6f1d 100644
--- a/drivers/misc/habanalabs/common/device.c
+++ b/drivers/misc/habanalabs/common/device.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /*
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  */
 
@@ -17,9 +17,9 @@ enum hl_device_status hl_device_status(struct hl_device *hdev)
 {
 	enum hl_device_status status;
 
-	if (atomic_read(&hdev->in_reset))
+	if (hdev->reset_info.in_reset)
 		status = HL_DEVICE_STATUS_IN_RESET;
-	else if (hdev->needs_reset)
+	else if (hdev->reset_info.needs_reset)
 		status = HL_DEVICE_STATUS_NEEDS_RESET;
 	else if (hdev->disabled)
 		status = HL_DEVICE_STATUS_MALFUNCTION;
@@ -95,14 +95,14 @@ static void hpriv_release(struct kref *ref)
 
 	if ((hdev->reset_if_device_not_idle && !device_is_idle)
 			|| hdev->reset_upon_device_release)
-		hl_device_reset(hdev, HL_RESET_DEVICE_RELEASE);
+		hl_device_reset(hdev, HL_DRV_RESET_DEV_RELEASE);
 
-	/* Now we can mark the compute_ctx as empty. Even if a reset is running in a different
+	/* Now we can mark the compute_ctx as not active. Even if a reset is running in a different
 	 * thread, we don't care because the in_reset is marked so if a user will try to open
-	 * the device it will fail on that, even if compute_ctx is NULL.
+	 * the device it will fail on that, even if compute_ctx is false.
 	 */
 	mutex_lock(&hdev->fpriv_list_lock);
-	hdev->compute_ctx = NULL;
+	hdev->is_compute_ctx_active = false;
 	mutex_unlock(&hdev->fpriv_list_lock);
 
 	kfree(hpriv);
@@ -169,9 +169,9 @@ static int hl_device_release_ctrl(struct inode *inode, struct file *filp)
 		goto out;
 	}
 
-	mutex_lock(&hdev->fpriv_list_lock);
+	mutex_lock(&hdev->fpriv_ctrl_list_lock);
 	list_del(&hpriv->dev_node);
-	mutex_unlock(&hdev->fpriv_list_lock);
+	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
 out:
 	put_pid(hpriv->taskpid);
 
@@ -324,16 +324,12 @@ put_devices:
 static void device_hard_reset_pending(struct work_struct *work)
 {
 	struct hl_device_reset_work *device_reset_work =
-		container_of(work, struct hl_device_reset_work,
-				reset_work.work);
+		container_of(work, struct hl_device_reset_work, reset_work.work);
 	struct hl_device *hdev = device_reset_work->hdev;
 	u32 flags;
 	int rc;
 
-	flags = HL_RESET_HARD | HL_RESET_FROM_RESET_THREAD;
-
-	if (device_reset_work->fw_reset)
-		flags |= HL_RESET_FW;
+	flags = device_reset_work->flags | HL_DRV_RESET_FROM_RESET_THR;
 
 	rc = hl_device_reset(hdev, flags);
 	if ((rc == -EBUSY) && !hdev->device_fini_pending) {
@@ -452,9 +448,12 @@ static int device_early_init(struct hl_device *hdev)
 	mutex_init(&hdev->debug_lock);
 	INIT_LIST_HEAD(&hdev->cs_mirror_list);
 	spin_lock_init(&hdev->cs_mirror_lock);
+	spin_lock_init(&hdev->reset_info.lock);
 	INIT_LIST_HEAD(&hdev->fpriv_list);
+	INIT_LIST_HEAD(&hdev->fpriv_ctrl_list);
 	mutex_init(&hdev->fpriv_list_lock);
-	atomic_set(&hdev->in_reset, 0);
+	mutex_init(&hdev->fpriv_ctrl_list_lock);
+	mutex_init(&hdev->clk_throttling.lock);
 
 	return 0;
 
@@ -494,6 +493,9 @@ static void device_early_fini(struct hl_device *hdev)
 	mutex_destroy(&hdev->send_cpu_message_lock);
 
 	mutex_destroy(&hdev->fpriv_list_lock);
+	mutex_destroy(&hdev->fpriv_ctrl_list_lock);
+
+	mutex_destroy(&hdev->clk_throttling.lock);
 
 	hl_cb_mgr_fini(hdev, &hdev->kernel_cb_mgr);
 
@@ -513,22 +515,6 @@ static void device_early_fini(struct hl_device *hdev)
 		hdev->asic_funcs->early_fini(hdev);
 }
 
-static void set_freq_to_low_job(struct work_struct *work)
-{
-	struct hl_device *hdev = container_of(work, struct hl_device,
-						work_freq.work);
-
-	mutex_lock(&hdev->fpriv_list_lock);
-
-	if (!hdev->compute_ctx)
-		hl_device_set_frequency(hdev, PLL_LOW);
-
-	mutex_unlock(&hdev->fpriv_list_lock);
-
-	schedule_delayed_work(&hdev->work_freq,
-			usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
-}
-
 static void hl_device_heartbeat(struct work_struct *work)
 {
 	struct hl_device *hdev = container_of(work, struct hl_device,
@@ -540,8 +526,10 @@ static void hl_device_heartbeat(struct work_struct *work)
 	if (!hdev->asic_funcs->send_heartbeat(hdev))
 		goto reschedule;
 
-	dev_err(hdev->dev, "Device heartbeat failed!\n");
-	hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_HEARTBEAT);
+	if (hl_device_operational(hdev, NULL))
+		dev_err(hdev->dev, "Device heartbeat failed!\n");
+
+	hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_HEARTBEAT);
 
 	return;
 
@@ -552,12 +540,12 @@ reschedule:
 	 * If control reached here, then at least one heartbeat work has been
 	 * scheduled since last reset/init cycle.
 	 * So if the device is not already in reset cycle, reset the flag
-	 * prev_reset_trigger as no reset occurred with HL_RESET_FW_FATAL_ERR
+	 * prev_reset_trigger as no reset occurred with HL_DRV_RESET_FW_FATAL_ERR
 	 * status for at least one heartbeat. From this point driver restarts
 	 * tracking future consecutive fatal errors.
 	 */
-	if (!(atomic_read(&hdev->in_reset)))
-		hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
+	if (!hdev->reset_info.in_reset)
+		hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
 
 	schedule_delayed_work(&hdev->work_heartbeat,
 			usecs_to_jiffies(HL_HEARTBEAT_PER_USEC));
@@ -586,18 +574,6 @@ static int device_late_init(struct hl_device *hdev)
 
 	hdev->high_pll = hdev->asic_prop.high_pll;
 
-	/* force setting to low frequency */
-	hdev->curr_pll_profile = PLL_LOW;
-
-	if (hdev->pm_mng_profile == PM_AUTO)
-		hdev->asic_funcs->set_pll_profile(hdev, PLL_LOW);
-	else
-		hdev->asic_funcs->set_pll_profile(hdev, PLL_LAST);
-
-	INIT_DELAYED_WORK(&hdev->work_freq, set_freq_to_low_job);
-	schedule_delayed_work(&hdev->work_freq,
-	usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
-
 	if (hdev->heartbeat) {
 		INIT_DELAYED_WORK(&hdev->work_heartbeat, hl_device_heartbeat);
 		schedule_delayed_work(&hdev->work_heartbeat,
@@ -620,7 +596,6 @@ static void device_late_fini(struct hl_device *hdev)
 	if (!hdev->late_init_done)
 		return;
 
-	cancel_delayed_work_sync(&hdev->work_freq);
 	if (hdev->heartbeat)
 		cancel_delayed_work_sync(&hdev->work_heartbeat);
 
@@ -650,36 +625,7 @@ int hl_device_utilization(struct hl_device *hdev, u32 *utilization)
 	return 0;
 }
 
-/*
- * hl_device_set_frequency - set the frequency of the device
- *
- * @hdev: pointer to habanalabs device structure
- * @freq: the new frequency value
- *
- * Change the frequency if needed. This function has no protection against
- * concurrency, therefore it is assumed that the calling function has protected
- * itself against the case of calling this function from multiple threads with
- * different values
- *
- * Returns 0 if no change was done, otherwise returns 1
- */
-int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq)
-{
-	if ((hdev->pm_mng_profile == PM_MANUAL) ||
-			(hdev->curr_pll_profile == freq))
-		return 0;
-
-	dev_dbg(hdev->dev, "Changing device frequency to %s\n",
-		freq == PLL_HIGH ? "high" : "low");
-
-	hdev->asic_funcs->set_pll_profile(hdev, freq);
-
-	hdev->curr_pll_profile = freq;
-
-	return 1;
-}
-
-int hl_device_set_debug_mode(struct hl_device *hdev, bool enable)
+int hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool enable)
 {
 	int rc = 0;
 
@@ -693,12 +639,12 @@ int hl_device_set_debug_mode(struct hl_device *hdev, bool enable)
 			goto out;
 		}
 
-		if (!hdev->hard_reset_pending)
-			hdev->asic_funcs->halt_coresight(hdev);
+		if (!hdev->reset_info.hard_reset_pending)
+			hdev->asic_funcs->halt_coresight(hdev, ctx);
 
 		hdev->in_debug = 0;
 
-		if (!hdev->hard_reset_pending)
+		if (!hdev->reset_info.hard_reset_pending)
 			hdev->asic_funcs->set_clock_gating(hdev);
 
 		goto out;
@@ -735,6 +681,8 @@ static void take_release_locks(struct hl_device *hdev)
 	/* Flush anyone that is inside device open */
 	mutex_lock(&hdev->fpriv_list_lock);
 	mutex_unlock(&hdev->fpriv_list_lock);
+	mutex_lock(&hdev->fpriv_ctrl_list_lock);
+	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
 }
 
 static void cleanup_resources(struct hl_device *hdev, bool hard_reset, bool fw_reset)
@@ -774,11 +722,14 @@ int hl_device_suspend(struct hl_device *hdev)
 	pci_save_state(hdev->pdev);
 
 	/* Block future CS/VM/JOB completion operations */
-	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
-	if (rc) {
+	spin_lock(&hdev->reset_info.lock);
+	if (hdev->reset_info.in_reset) {
+		spin_unlock(&hdev->reset_info.lock);
 		dev_err(hdev->dev, "Can't suspend while in reset\n");
 		return -EIO;
 	}
+	hdev->reset_info.in_reset = 1;
+	spin_unlock(&hdev->reset_info.lock);
 
 	/* This blocks all other stuff that is not blocked by in_reset */
 	hdev->disabled = true;
@@ -828,10 +779,12 @@ int hl_device_resume(struct hl_device *hdev)
 	}
 
 
-	hdev->disabled = false;
-	atomic_set(&hdev->in_reset, 0);
+	/* 'in_reset' was set to true during suspend, now we must clear it in order
+	 * for hard reset to be performed
+	 */
+	hdev->reset_info.in_reset = 0;
 
-	rc = hl_device_reset(hdev, HL_RESET_HARD);
+	rc = hl_device_reset(hdev, HL_DRV_RESET_HARD);
 	if (rc) {
 		dev_err(hdev->dev, "Failed to reset device during resume\n");
 		goto disable_device;
@@ -846,17 +799,21 @@ disable_device:
 	return rc;
 }
 
-static int device_kill_open_processes(struct hl_device *hdev, u32 timeout)
+static int device_kill_open_processes(struct hl_device *hdev, u32 timeout, bool control_dev)
 {
-	struct hl_fpriv	*hpriv;
 	struct task_struct *task = NULL;
+	struct list_head *fd_list;
+	struct hl_fpriv	*hpriv;
+	struct mutex *fd_lock;
 	u32 pending_cnt;
 
+	fd_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock;
+	fd_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list;
 
 	/* Giving time for user to close FD, and for processes that are inside
 	 * hl_device_open to finish
 	 */
-	if (!list_empty(&hdev->fpriv_list))
+	if (!list_empty(fd_list))
 		ssleep(1);
 
 	if (timeout) {
@@ -872,12 +829,12 @@ static int device_kill_open_processes(struct hl_device *hdev, u32 timeout)
 		}
 	}
 
-	mutex_lock(&hdev->fpriv_list_lock);
+	mutex_lock(fd_lock);
 
 	/* This section must be protected because we are dereferencing
 	 * pointers that are freed if the process exits
 	 */
-	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node) {
+	list_for_each_entry(hpriv, fd_list, dev_node) {
 		task = get_pid_task(hpriv->taskpid, PIDTYPE_PID);
 		if (task) {
 			dev_info(hdev->dev, "Killing user process pid=%d\n",
@@ -889,12 +846,12 @@ static int device_kill_open_processes(struct hl_device *hdev, u32 timeout)
 		} else {
 			dev_warn(hdev->dev,
 				"Can't get task struct for PID so giving up on killing process\n");
-			mutex_unlock(&hdev->fpriv_list_lock);
+			mutex_unlock(fd_lock);
 			return -ETIME;
 		}
 	}
 
-	mutex_unlock(&hdev->fpriv_list_lock);
+	mutex_unlock(fd_lock);
 
 	/*
 	 * We killed the open users, but that doesn't mean they are closed.
@@ -906,7 +863,7 @@ static int device_kill_open_processes(struct hl_device *hdev, u32 timeout)
 	 */
 
 wait_for_processes:
-	while ((!list_empty(&hdev->fpriv_list)) && (pending_cnt)) {
+	while ((!list_empty(fd_list)) && (pending_cnt)) {
 		dev_dbg(hdev->dev,
 			"Waiting for all unmap operations to finish before hard reset\n");
 
@@ -916,7 +873,7 @@ wait_for_processes:
 	}
 
 	/* All processes exited successfully */
-	if (list_empty(&hdev->fpriv_list))
+	if (list_empty(fd_list))
 		return 0;
 
 	/* Give up waiting for processes to exit */
@@ -928,14 +885,19 @@ wait_for_processes:
 	return -EBUSY;
 }
 
-static void device_disable_open_processes(struct hl_device *hdev)
+static void device_disable_open_processes(struct hl_device *hdev, bool control_dev)
 {
+	struct list_head *fd_list;
 	struct hl_fpriv *hpriv;
+	struct mutex *fd_lock;
 
-	mutex_lock(&hdev->fpriv_list_lock);
-	list_for_each_entry(hpriv, &hdev->fpriv_list, dev_node)
+	fd_lock = control_dev ? &hdev->fpriv_ctrl_list_lock : &hdev->fpriv_list_lock;
+	fd_list = control_dev ? &hdev->fpriv_ctrl_list : &hdev->fpriv_list;
+
+	mutex_lock(fd_lock);
+	list_for_each_entry(hpriv, fd_list, dev_node)
 		hpriv->hdev = NULL;
-	mutex_unlock(&hdev->fpriv_list_lock);
+	mutex_unlock(fd_lock);
 }
 
 static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
@@ -948,17 +910,17 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
 	 * ('in_reset' makes sure of it). This makes sure that
 	 * 'reset_cause' will continue holding its 1st recorded reason!
 	 */
-	if (flags & HL_RESET_HEARTBEAT) {
-		hdev->curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
-		cur_reset_trigger = HL_RESET_HEARTBEAT;
-	} else if (flags & HL_RESET_TDR) {
-		hdev->curr_reset_cause = HL_RESET_CAUSE_TDR;
-		cur_reset_trigger = HL_RESET_TDR;
-	} else if (flags & HL_RESET_FW_FATAL_ERR) {
-		hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
-		cur_reset_trigger = HL_RESET_FW_FATAL_ERR;
+	if (flags & HL_DRV_RESET_HEARTBEAT) {
+		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_HEARTBEAT;
+		cur_reset_trigger = HL_DRV_RESET_HEARTBEAT;
+	} else if (flags & HL_DRV_RESET_TDR) {
+		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_TDR;
+		cur_reset_trigger = HL_DRV_RESET_TDR;
+	} else if (flags & HL_DRV_RESET_FW_FATAL_ERR) {
+		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+		cur_reset_trigger = HL_DRV_RESET_FW_FATAL_ERR;
 	} else {
-		hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
 	}
 
 	/*
@@ -966,11 +928,11 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
 	 * is set and if this reset is due to a fatal FW error
 	 * device is set to an unstable state.
 	 */
-	if (hdev->prev_reset_trigger != cur_reset_trigger) {
-		hdev->prev_reset_trigger = cur_reset_trigger;
-		hdev->reset_trigger_repeated = 0;
+	if (hdev->reset_info.prev_reset_trigger != cur_reset_trigger) {
+		hdev->reset_info.prev_reset_trigger = cur_reset_trigger;
+		hdev->reset_info.reset_trigger_repeated = 0;
 	} else {
-		hdev->reset_trigger_repeated = 1;
+		hdev->reset_info.reset_trigger_repeated = 1;
 	}
 
 	/* If reset is due to heartbeat, device CPU is no responsive in
@@ -979,8 +941,8 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
 	 * If F/W is performing the reset, no need to send it a message to disable
 	 * PCI access
 	 */
-	if ((flags & HL_RESET_HARD) &&
-			!(flags & (HL_RESET_HEARTBEAT | HL_RESET_FW))) {
+	if ((flags & HL_DRV_RESET_HARD) &&
+			!(flags & (HL_DRV_RESET_HEARTBEAT | HL_DRV_RESET_BYPASS_REQ_TO_FW))) {
 		/* Disable PCI access from device F/W so he won't send
 		 * us additional interrupts. We disable MSI/MSI-X at
 		 * the halt_engines function and we can't have the F/W
@@ -1015,34 +977,39 @@ static void handle_reset_trigger(struct hl_device *hdev, u32 flags)
  */
 int hl_device_reset(struct hl_device *hdev, u32 flags)
 {
+	bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false,
+			reset_upon_device_release = false, schedule_hard_reset = false;
 	u64 idle_mask[HL_BUSY_ENGINES_MASK_EXT_SIZE] = {0};
-	bool hard_reset, from_hard_reset_thread, fw_reset, hard_instead_soft = false;
+	struct hl_ctx *ctx;
 	int i, rc;
 
 	if (!hdev->init_done) {
-		dev_err(hdev->dev,
-			"Can't reset before initialization is done\n");
+		dev_err(hdev->dev, "Can't reset before initialization is done\n");
 		return 0;
 	}
 
-	hard_reset = !!(flags & HL_RESET_HARD);
-	from_hard_reset_thread = !!(flags & HL_RESET_FROM_RESET_THREAD);
-	fw_reset = !!(flags & HL_RESET_FW);
+	hard_reset = !!(flags & HL_DRV_RESET_HARD);
+	from_hard_reset_thread = !!(flags & HL_DRV_RESET_FROM_RESET_THR);
+	fw_reset = !!(flags & HL_DRV_RESET_BYPASS_REQ_TO_FW);
 
-	if (!hard_reset && !hdev->supports_soft_reset) {
+	if (!hard_reset && !hdev->asic_prop.supports_soft_reset) {
 		hard_instead_soft = true;
 		hard_reset = true;
 	}
 
-	if (hdev->reset_upon_device_release &&
-			(flags & HL_RESET_DEVICE_RELEASE)) {
-		dev_dbg(hdev->dev,
-			"Perform %s-reset upon device release\n",
-			hard_reset ? "hard" : "soft");
+	if (hdev->reset_upon_device_release && (flags & HL_DRV_RESET_DEV_RELEASE)) {
+		if (hard_reset) {
+			dev_crit(hdev->dev,
+				"Aborting reset because hard-reset is mutually exclusive with reset-on-device-release\n");
+			return -EINVAL;
+		}
+
+		reset_upon_device_release = true;
+
 		goto do_reset;
 	}
 
-	if (!hard_reset && !hdev->allow_inference_soft_reset) {
+	if (!hard_reset && !hdev->asic_prop.allow_inference_soft_reset) {
 		hard_instead_soft = true;
 		hard_reset = true;
 	}
@@ -1062,12 +1029,22 @@ do_reset:
 	 */
 	if (!from_hard_reset_thread) {
 		/* Block future CS/VM/JOB completion operations */
-		rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
-		if (rc)
+		spin_lock(&hdev->reset_info.lock);
+		if (hdev->reset_info.in_reset) {
+			/* We only allow scheduling of a hard reset during soft reset */
+			if (hard_reset && hdev->reset_info.is_in_soft_reset)
+				hdev->reset_info.hard_reset_schedule_flags = flags;
+			spin_unlock(&hdev->reset_info.lock);
 			return 0;
+		}
+		hdev->reset_info.in_reset = 1;
+		spin_unlock(&hdev->reset_info.lock);
 
 		handle_reset_trigger(hdev, flags);
 
+		/* This still allows the completion of some KDMA ops */
+		hdev->reset_info.is_in_soft_reset = !hard_reset;
+
 		/* This also blocks future CS/VM/JOB completion operations */
 		hdev->disabled = true;
 
@@ -1075,21 +1052,19 @@ do_reset:
 
 		if (hard_reset)
 			dev_info(hdev->dev, "Going to reset device\n");
-		else if (flags & HL_RESET_DEVICE_RELEASE)
-			dev_info(hdev->dev,
-				"Going to reset device after it was released by user\n");
+		else if (reset_upon_device_release)
+			dev_info(hdev->dev, "Going to reset device after release by user\n");
 		else
-			dev_info(hdev->dev,
-				"Going to reset compute engines of inference device\n");
+			dev_info(hdev->dev, "Going to reset engines of inference device\n");
 	}
 
 again:
 	if ((hard_reset) && (!from_hard_reset_thread)) {
-		hdev->hard_reset_pending = true;
+		hdev->reset_info.hard_reset_pending = true;
 
 		hdev->process_kill_trial_cnt = 0;
 
-		hdev->device_reset_work.fw_reset = fw_reset;
+		hdev->device_reset_work.flags = flags;
 
 		/*
 		 * Because the reset function can't run from heartbeat work,
@@ -1109,7 +1084,7 @@ kill_processes:
 		 * process can't really exit until all its CSs are done, which
 		 * is what we do in cs rollback
 		 */
-		rc = device_kill_open_processes(hdev, 0);
+		rc = device_kill_open_processes(hdev, 0, false);
 
 		if (rc == -EBUSY) {
 			if (hdev->device_fini_pending) {
@@ -1138,7 +1113,7 @@ kill_processes:
 	hdev->asic_funcs->hw_fini(hdev, hard_reset, fw_reset);
 
 	if (hard_reset) {
-		hdev->fw_loader.linux_loaded = false;
+		hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
 
 		/* Release kernel context */
 		if (hdev->kernel_ctx && hl_ctx_put(hdev->kernel_ctx) == 1)
@@ -1154,24 +1129,23 @@ kill_processes:
 	for (i = 0 ; i < hdev->asic_prop.completion_queues_count ; i++)
 		hl_cq_reset(hdev, &hdev->completion_queue[i]);
 
-	mutex_lock(&hdev->fpriv_list_lock);
-
 	/* Make sure the context switch phase will run again */
-	if (hdev->compute_ctx) {
-		atomic_set(&hdev->compute_ctx->thread_ctx_switch_token, 1);
-		hdev->compute_ctx->thread_ctx_switch_wait_token = 0;
+	ctx = hl_get_compute_ctx(hdev);
+	if (ctx) {
+		atomic_set(&ctx->thread_ctx_switch_token, 1);
+		ctx->thread_ctx_switch_wait_token = 0;
+		hl_ctx_put(ctx);
 	}
 
-	mutex_unlock(&hdev->fpriv_list_lock);
-
 	/* Finished tear-down, starting to re-initialize */
 
 	if (hard_reset) {
 		hdev->device_cpu_disabled = false;
-		hdev->hard_reset_pending = false;
+		hdev->reset_info.hard_reset_pending = false;
 
-		if (hdev->reset_trigger_repeated &&
-				(hdev->prev_reset_trigger == HL_RESET_FW_FATAL_ERR)) {
+		if (hdev->reset_info.reset_trigger_repeated &&
+				(hdev->reset_info.prev_reset_trigger ==
+						HL_DRV_RESET_FW_FATAL_ERR)) {
 			/* if there 2 back to back resets from FW,
 			 * ensure driver puts the driver in a unusable state
 			 */
@@ -1204,7 +1178,7 @@ kill_processes:
 			goto out_err;
 		}
 
-		hdev->compute_ctx = NULL;
+		hdev->is_compute_ctx_active = false;
 
 		rc = hl_ctx_init(hdev, hdev->kernel_ctx, true);
 		if (rc) {
@@ -1225,16 +1199,14 @@ kill_processes:
 
 	rc = hdev->asic_funcs->hw_init(hdev);
 	if (rc) {
-		dev_err(hdev->dev,
-			"failed to initialize the H/W after reset\n");
+		dev_err(hdev->dev, "failed to initialize the H/W after reset\n");
 		goto out_err;
 	}
 
 	/* If device is not idle fail the reset process */
 	if (!hdev->asic_funcs->is_device_idle(hdev, idle_mask,
 			HL_BUSY_ENGINES_MASK_EXT_SIZE, NULL)) {
-		dev_err(hdev->dev,
-			"device is not idle (mask 0x%llx_%llx) after reset\n",
+		dev_err(hdev->dev, "device is not idle (mask 0x%llx_%llx) after reset\n",
 			idle_mask[1], idle_mask[0]);
 		rc = -EIO;
 		goto out_err;
@@ -1243,43 +1215,56 @@ kill_processes:
 	/* Check that the communication with the device is working */
 	rc = hdev->asic_funcs->test_queues(hdev);
 	if (rc) {
-		dev_err(hdev->dev,
-			"Failed to detect if device is alive after reset\n");
+		dev_err(hdev->dev, "Failed to detect if device is alive after reset\n");
 		goto out_err;
 	}
 
 	if (hard_reset) {
 		rc = device_late_init(hdev);
 		if (rc) {
-			dev_err(hdev->dev,
-				"Failed late init after hard reset\n");
+			dev_err(hdev->dev, "Failed late init after hard reset\n");
 			goto out_err;
 		}
 
 		rc = hl_vm_init(hdev);
 		if (rc) {
-			dev_err(hdev->dev,
-				"Failed to init memory module after hard reset\n");
+			dev_err(hdev->dev, "Failed to init memory module after hard reset\n");
 			goto out_err;
 		}
 
 		hl_set_max_power(hdev);
 	} else {
-		rc = hdev->asic_funcs->soft_reset_late_init(hdev);
+		rc = hdev->asic_funcs->non_hard_reset_late_init(hdev);
 		if (rc) {
-			dev_err(hdev->dev,
-				"Failed late init after soft reset\n");
+			if (reset_upon_device_release)
+				dev_err(hdev->dev,
+					"Failed late init in reset after device release\n");
+			else
+				dev_err(hdev->dev, "Failed late init after soft reset\n");
 			goto out_err;
 		}
 	}
 
-	atomic_set(&hdev->in_reset, 0);
-	hdev->needs_reset = false;
+	spin_lock(&hdev->reset_info.lock);
+	hdev->reset_info.is_in_soft_reset = false;
+
+	/* Schedule hard reset only if requested and if not already in hard reset.
+	 * We keep 'in_reset' enabled, so no other reset can go in during the hard
+	 * reset schedule
+	 */
+	if (!hard_reset && hdev->reset_info.hard_reset_schedule_flags)
+		schedule_hard_reset = true;
+	else
+		hdev->reset_info.in_reset = 0;
+
+	spin_unlock(&hdev->reset_info.lock);
+
+	hdev->reset_info.needs_reset = false;
 
 	dev_notice(hdev->dev, "Successfully finished resetting the device\n");
 
 	if (hard_reset) {
-		hdev->hard_reset_cnt++;
+		hdev->reset_info.hard_reset_cnt++;
 
 		/* After reset is done, we are ready to receive events from
 		 * the F/W. We can't do it before because we will ignore events
@@ -1287,28 +1272,41 @@ kill_processes:
 		 * the device will be operational although it shouldn't be
 		 */
 		hdev->asic_funcs->enable_events_from_fw(hdev);
-	} else {
-		hdev->soft_reset_cnt++;
+	} else if (!reset_upon_device_release) {
+		hdev->reset_info.soft_reset_cnt++;
+	}
+
+	if (schedule_hard_reset) {
+		dev_info(hdev->dev, "Performing hard reset scheduled during soft reset\n");
+		flags = hdev->reset_info.hard_reset_schedule_flags;
+		hdev->reset_info.hard_reset_schedule_flags = 0;
+		hdev->disabled = true;
+		hard_reset = true;
+		handle_reset_trigger(hdev, flags);
+		goto again;
 	}
 
 	return 0;
 
 out_err:
 	hdev->disabled = true;
+	hdev->reset_info.is_in_soft_reset = false;
 
 	if (hard_reset) {
-		dev_err(hdev->dev,
-			"Failed to reset! Device is NOT usable\n");
-		hdev->hard_reset_cnt++;
+		dev_err(hdev->dev, "Failed to reset! Device is NOT usable\n");
+		hdev->reset_info.hard_reset_cnt++;
+	} else if (reset_upon_device_release) {
+		dev_err(hdev->dev, "Failed to reset device after user release\n");
+		hard_reset = true;
+		goto again;
 	} else {
-		dev_err(hdev->dev,
-			"Failed to do soft-reset, trying hard reset\n");
-		hdev->soft_reset_cnt++;
+		dev_err(hdev->dev, "Failed to do soft-reset\n");
+		hdev->reset_info.soft_reset_cnt++;
 		hard_reset = true;
 		goto again;
 	}
 
-	atomic_set(&hdev->in_reset, 0);
+	hdev->reset_info.in_reset = 0;
 
 	return rc;
 }
@@ -1455,7 +1453,7 @@ int hl_device_init(struct hl_device *hdev, struct class *hclass)
 		goto mmu_fini;
 	}
 
-	hdev->compute_ctx = NULL;
+	hdev->is_compute_ctx_active = false;
 
 	hdev->asic_funcs->state_dump_init(hdev);
 
@@ -1619,6 +1617,7 @@ out_disabled:
  */
 void hl_device_fini(struct hl_device *hdev)
 {
+	bool device_in_reset;
 	ktime_t timeout;
 	u64 reset_sec;
 	int i, rc;
@@ -1642,10 +1641,22 @@ void hl_device_fini(struct hl_device *hdev)
 	 */
 
 	timeout = ktime_add_us(ktime_get(), reset_sec * 1000 * 1000);
-	rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
-	while (rc) {
+
+	spin_lock(&hdev->reset_info.lock);
+	device_in_reset = !!hdev->reset_info.in_reset;
+	if (!device_in_reset)
+		hdev->reset_info.in_reset = 1;
+	spin_unlock(&hdev->reset_info.lock);
+
+	while (device_in_reset) {
 		usleep_range(50, 200);
-		rc = atomic_cmpxchg(&hdev->in_reset, 0, 1);
+
+		spin_lock(&hdev->reset_info.lock);
+		device_in_reset = !!hdev->reset_info.in_reset;
+		if (!device_in_reset)
+			hdev->reset_info.in_reset = 1;
+		spin_unlock(&hdev->reset_info.lock);
+
 		if (ktime_compare(ktime_get(), timeout) > 0) {
 			dev_crit(hdev->dev,
 				"Failed to remove device because reset function did not finish\n");
@@ -1667,7 +1678,7 @@ void hl_device_fini(struct hl_device *hdev)
 
 	take_release_locks(hdev);
 
-	hdev->hard_reset_pending = true;
+	hdev->reset_info.hard_reset_pending = true;
 
 	hl_hwmon_fini(hdev);
 
@@ -1681,10 +1692,16 @@ void hl_device_fini(struct hl_device *hdev)
 		"Waiting for all processes to exit (timeout of %u seconds)",
 		HL_PENDING_RESET_LONG_SEC);
 
-	rc = device_kill_open_processes(hdev, HL_PENDING_RESET_LONG_SEC);
+	rc = device_kill_open_processes(hdev, HL_PENDING_RESET_LONG_SEC, false);
 	if (rc) {
 		dev_crit(hdev->dev, "Failed to kill all open processes\n");
-		device_disable_open_processes(hdev);
+		device_disable_open_processes(hdev, false);
+	}
+
+	rc = device_kill_open_processes(hdev, 0, true);
+	if (rc) {
+		dev_crit(hdev->dev, "Failed to kill all control device open processes\n");
+		device_disable_open_processes(hdev, true);
 	}
 
 	hl_cb_pool_fini(hdev);
@@ -1692,7 +1709,7 @@ void hl_device_fini(struct hl_device *hdev)
 	/* Reset the H/W. It will be in idle state after this returns */
 	hdev->asic_funcs->hw_fini(hdev, true, false);
 
-	hdev->fw_loader.linux_loaded = false;
+	hdev->fw_loader.fw_comp_loaded = FW_TYPE_NONE;
 
 	/* Release kernel context */
 	if ((hdev->kernel_ctx) && (hl_ctx_put(hdev->kernel_ctx) != 1))
diff --git a/drivers/misc/habanalabs/common/firmware_if.c b/drivers/misc/habanalabs/common/firmware_if.c
index 4e68fb9d2a6b..6775c5c3166b 100644
--- a/drivers/misc/habanalabs/common/firmware_if.c
+++ b/drivers/misc/habanalabs/common/firmware_if.c
@@ -15,8 +15,6 @@
 
 #define FW_FILE_MAX_SIZE		0x1400000 /* maximum size of 20MB */
 
-#define FW_CPU_STATUS_POLL_INTERVAL_USEC	10000
-
 static char *extract_fw_ver_from_str(const char *fw_str)
 {
 	char *str, *fw_ver, *whitespace;
@@ -214,7 +212,8 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	struct cpucp_packet *pkt;
 	dma_addr_t pkt_dma_addr;
-	u32 tmp, expected_ack_val;
+	struct hl_bd *sent_bd;
+	u32 tmp, expected_ack_val, pi;
 	int rc = 0;
 
 	pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len,
@@ -239,6 +238,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 
 	/* set fence to a non valid value */
 	pkt->fence = cpu_to_le32(UINT_MAX);
+	pi = queue->pi;
 
 	/*
 	 * The CPU queue is a synchronous queue with an effective depth of
@@ -248,7 +248,7 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 	 * Which means that we don't need to lock the access to the entire H/W
 	 * queues module when submitting a JOB to the CPU queue.
 	 */
-	hl_hw_queue_submit_bd(hdev, queue, 0, len, pkt_dma_addr);
+	hl_hw_queue_submit_bd(hdev, queue, hl_queue_inc_ptr(queue->pi), len, pkt_dma_addr);
 
 	if (prop->fw_app_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN)
 		expected_ack_val = queue->pi;
@@ -280,6 +280,14 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
 		*result = le64_to_cpu(pkt->result);
 	}
 
+	/* Scrub previous buffer descriptor 'ctl' field which contains the
+	 * previous PI value written during packet submission.
+	 * We must do this or else F/W can read an old value upon queue wraparound.
+	 */
+	sent_bd = queue->kernel_address;
+	sent_bd += hl_pi_2_offset(pi);
+	sent_bd->ctl = cpu_to_le32(UINT_MAX);
+
 out:
 	mutex_unlock(&hdev->send_cpu_message_lock);
 
@@ -445,15 +453,6 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
 		err_exists = true;
 	}
 
-	if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) {
-		dev_warn(hdev->dev,
-			"Device boot warning - Skipped DRAM initialization\n");
-		/* This is a warning so we don't want it to disable the
-		 * device
-		 */
-		err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED;
-	}
-
 	if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) {
 		if (hdev->bmc_enable) {
 			dev_err(hdev->dev,
@@ -497,15 +496,6 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
 		err_exists = true;
 	}
 
-	if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) {
-		dev_warn(hdev->dev,
-			"Device boot warning - Failed to load preboot primary image\n");
-		/* This is a warning so we don't want it to disable the
-		 * device as we have a secondary preboot image
-		 */
-		err_val &= ~CPU_BOOT_ERR0_PRI_IMG_VER_FAIL;
-	}
-
 	if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL) {
 		dev_err(hdev->dev, "Device boot error - Failed to load preboot secondary image\n");
 		err_exists = true;
@@ -525,6 +515,34 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
 	if (sts_val & CPU_BOOT_DEV_STS0_ENABLED)
 		dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val);
 
+	/* All warnings should go here in order not to reach the unknown error validation */
+	if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) {
+		dev_warn(hdev->dev,
+			"Device boot warning - Skipped DRAM initialization\n");
+		/* This is a warning so we don't want it to disable the
+		 * device
+		 */
+		err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED;
+	}
+
+	if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) {
+		dev_warn(hdev->dev,
+			"Device boot warning - Failed to load preboot primary image\n");
+		/* This is a warning so we don't want it to disable the
+		 * device as we have a secondary preboot image
+		 */
+		err_val &= ~CPU_BOOT_ERR0_PRI_IMG_VER_FAIL;
+	}
+
+	if (err_val & CPU_BOOT_ERR0_TPM_FAIL) {
+		dev_warn(hdev->dev,
+			"Device boot warning - TPM failure\n");
+		/* This is a warning so we don't want it to disable the
+		 * device
+		 */
+		err_val &= ~CPU_BOOT_ERR0_TPM_FAIL;
+	}
+
 	if (!err_exists && (err_val & ~CPU_BOOT_ERR0_ENABLED)) {
 		dev_err(hdev->dev,
 			"Device boot error - unknown ERR0 error 0x%08x\n", err_val);
@@ -961,6 +979,7 @@ int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power)
 
 	pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_GET <<
 				CPUCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.type = cpu_to_le16(CPUCP_POWER_INPUT);
 
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
 			HL_CPUCP_INFO_TIMEOUT_USEC, &result);
@@ -974,6 +993,92 @@ int hl_fw_cpucp_power_get(struct hl_device *hdev, u64 *power)
 	return rc;
 }
 
+int hl_fw_dram_replaced_row_get(struct hl_device *hdev,
+				struct cpucp_hbm_row_info *info)
+{
+	struct cpucp_hbm_row_info *cpucp_repl_rows_info_cpu_addr;
+	dma_addr_t cpucp_repl_rows_info_dma_addr;
+	struct cpucp_packet pkt = {};
+	u64 result;
+	int rc;
+
+	cpucp_repl_rows_info_cpu_addr =
+			hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev,
+					sizeof(struct cpucp_hbm_row_info),
+					&cpucp_repl_rows_info_dma_addr);
+	if (!cpucp_repl_rows_info_cpu_addr) {
+		dev_err(hdev->dev,
+			"Failed to allocate DMA memory for CPU-CP replaced rows info packet\n");
+		return -ENOMEM;
+	}
+
+	memset(cpucp_repl_rows_info_cpu_addr, 0, sizeof(struct cpucp_hbm_row_info));
+
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET <<
+					CPUCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.addr = cpu_to_le64(cpucp_repl_rows_info_dma_addr);
+	pkt.data_max_size = cpu_to_le32(sizeof(struct cpucp_hbm_row_info));
+
+	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+					HL_CPUCP_INFO_TIMEOUT_USEC, &result);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to handle CPU-CP replaced rows info pkt, error %d\n", rc);
+		goto out;
+	}
+
+	memcpy(info, cpucp_repl_rows_info_cpu_addr, sizeof(*info));
+
+out:
+	hdev->asic_funcs->cpu_accessible_dma_pool_free(hdev,
+					sizeof(struct cpucp_hbm_row_info),
+					cpucp_repl_rows_info_cpu_addr);
+
+	return rc;
+}
+
+int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num)
+{
+	struct cpucp_packet pkt;
+	u64 result;
+	int rc;
+
+	memset(&pkt, 0, sizeof(pkt));
+
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_HBM_PENDING_ROWS_STATUS << CPUCP_PKT_CTL_OPCODE_SHIFT);
+
+	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt), 0, &result);
+	if (rc) {
+		dev_err(hdev->dev,
+				"Failed to handle CPU-CP pending rows info pkt, error %d\n", rc);
+		goto out;
+	}
+
+	*pend_rows_num = (u32) result;
+out:
+	return rc;
+}
+
+int hl_fw_cpucp_engine_core_asid_set(struct hl_device *hdev, u32 asid)
+{
+	struct cpucp_packet pkt;
+	int rc;
+
+	memset(&pkt, 0, sizeof(pkt));
+
+	pkt.ctl = cpu_to_le32(CPUCP_PACKET_ENGINE_CORE_ASID_SET << CPUCP_PKT_CTL_OPCODE_SHIFT);
+	pkt.value = cpu_to_le64(asid);
+
+	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
+						HL_CPUCP_INFO_TIMEOUT_USEC, NULL);
+	if (rc)
+		dev_err(hdev->dev,
+			"Failed on ASID configuration request for engine core, error %d\n",
+			rc);
+
+	return rc;
+}
+
 void hl_fw_ask_hard_reset_without_linux(struct hl_device *hdev)
 {
 	struct static_fw_load_mgr *static_loader =
@@ -1028,7 +1133,7 @@ static void detect_cpu_boot_status(struct hl_device *hdev, u32 status)
 	switch (status) {
 	case CPU_BOOT_STATUS_NA:
 		dev_err(hdev->dev,
-			"Device boot progress - BTL did NOT run\n");
+			"Device boot progress - BTL/ROM did NOT run\n");
 		break;
 	case CPU_BOOT_STATUS_IN_WFE:
 		dev_err(hdev->dev,
@@ -1101,9 +1206,8 @@ static int hl_fw_read_preboot_caps(struct hl_device *hdev,
 		(status == CPU_BOOT_STATUS_DRAM_RDY) ||
 		(status == CPU_BOOT_STATUS_NIC_FW_RDY) ||
 		(status == CPU_BOOT_STATUS_READY_TO_BOOT) ||
-		(status == CPU_BOOT_STATUS_SRAM_AVAIL) ||
 		(status == CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT),
-		FW_CPU_STATUS_POLL_INTERVAL_USEC,
+		hdev->fw_poll_interval_usec,
 		timeout);
 
 	if (rc) {
@@ -1250,8 +1354,7 @@ static void hl_fw_preboot_update_state(struct hl_device *hdev)
 	 * 3. FW application - a. Fetch fw application security status
 	 *                     b. Check whether hard reset is done by fw app
 	 */
-	prop->hard_reset_done_by_fw =
-		!!(cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_FW_HARD_RST_EN);
+	prop->hard_reset_done_by_fw = !!(cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_FW_HARD_RST_EN);
 
 	dev_dbg(hdev->dev, "Firmware preboot boot device status0 %#x\n",
 							cpu_boot_dev_sts0);
@@ -1287,11 +1390,7 @@ int hl_fw_read_preboot_status(struct hl_device *hdev, u32 cpu_boot_status_reg,
 {
 	int rc;
 
-	/* pldm was added for cases in which we use preboot on pldm and want
-	 * to load boot fit, but we can't wait for preboot because it runs
-	 * very slowly
-	 */
-	if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU) || hdev->pldm)
+	if (!(hdev->fw_components & FW_TYPE_PREBOOT_CPU))
 		return 0;
 
 	/*
@@ -1437,7 +1536,7 @@ static int hl_fw_dynamic_wait_for_status(struct hl_device *hdev,
 		le32_to_cpu(dyn_regs->cpu_cmd_status_to_host),
 		status,
 		FIELD_GET(COMMS_STATUS_STATUS_MASK, status) == expected_status,
-		FW_CPU_STATUS_POLL_INTERVAL_USEC,
+		hdev->fw_poll_interval_usec,
 		timeout);
 
 	if (rc) {
@@ -1703,6 +1802,9 @@ static int hl_fw_dynamic_validate_descriptor(struct hl_device *hdev,
 		return rc;
 	}
 
+	/* here we can mark the descriptor as valid as the content has been validated */
+	fw_loader->dynamic_loader.fw_desc_valid = true;
+
 	return 0;
 }
 
@@ -1759,7 +1861,13 @@ static int hl_fw_dynamic_read_and_validate_descriptor(struct hl_device *hdev,
 		return rc;
 	}
 
-	/* extract address copy the descriptor from */
+	/*
+	 * extract address to copy the descriptor from
+	 * in addition, as the descriptor value is going to be over-ridden by new data- we mark it
+	 * as invalid.
+	 * it will be marked again as valid once validated
+	 */
+	fw_loader->dynamic_loader.fw_desc_valid = false;
 	src = hdev->pcie_bar[region->bar_id] + region->offset_in_bar +
 							response->ram_offset;
 	memcpy_fromio(fw_desc, src, sizeof(struct lkd_fw_comms_desc));
@@ -1920,17 +2028,15 @@ static void hl_fw_boot_fit_update_state(struct hl_device *hdev,
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 
-	/* Clear reset status since we need to read it again from boot CPU */
-	prop->hard_reset_done_by_fw = false;
+	hdev->fw_loader.fw_comp_loaded |= FW_TYPE_BOOT_CPU;
 
 	/* Read boot_cpu status bits */
 	if (prop->fw_preboot_cpu_boot_dev_sts0 & CPU_BOOT_DEV_STS0_ENABLED) {
 		prop->fw_bootfit_cpu_boot_dev_sts0 =
 				RREG32(cpu_boot_dev_sts0_reg);
 
-		if (prop->fw_bootfit_cpu_boot_dev_sts0 &
-				CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
-			prop->hard_reset_done_by_fw = true;
+		prop->hard_reset_done_by_fw = !!(prop->fw_bootfit_cpu_boot_dev_sts0 &
+							CPU_BOOT_DEV_STS0_FW_HARD_RST_EN);
 
 		dev_dbg(hdev->dev, "Firmware boot CPU status0 %#x\n",
 					prop->fw_bootfit_cpu_boot_dev_sts0);
@@ -2055,14 +2161,21 @@ static int hl_fw_dynamic_wait_for_boot_fit_active(struct hl_device *hdev,
 
 	dyn_loader = &fw_loader->dynamic_loader;
 
-	/* Make sure CPU boot-loader is running */
+	/*
+	 * Make sure CPU boot-loader is running
+	 * Note that the CPU_BOOT_STATUS_SRAM_AVAIL is generally set by Linux
+	 * yet there is a debug scenario in which we loading uboot (without Linux)
+	 * which at later stage is relocated to DRAM. In this case we expect
+	 * uboot to set the CPU_BOOT_STATUS_SRAM_AVAIL and so we add it to the
+	 * poll flags
+	 */
 	rc = hl_poll_timeout(
 		hdev,
 		le32_to_cpu(dyn_loader->comm_desc.cpu_dyn_regs.cpu_boot_status),
 		status,
-		(status == CPU_BOOT_STATUS_NIC_FW_RDY) ||
-		(status == CPU_BOOT_STATUS_READY_TO_BOOT),
-		FW_CPU_STATUS_POLL_INTERVAL_USEC,
+		(status == CPU_BOOT_STATUS_READY_TO_BOOT) ||
+		(status == CPU_BOOT_STATUS_SRAM_AVAIL),
+		hdev->fw_poll_interval_usec,
 		dyn_loader->wait_for_bl_timeout);
 	if (rc) {
 		dev_err(hdev->dev, "failed to wait for boot\n");
@@ -2082,14 +2195,14 @@ static int hl_fw_dynamic_wait_for_linux_active(struct hl_device *hdev,
 
 	dyn_loader = &fw_loader->dynamic_loader;
 
-	/* Make sure CPU boot-loader is running */
+	/* Make sure CPU linux is running */
 
 	rc = hl_poll_timeout(
 		hdev,
 		le32_to_cpu(dyn_loader->comm_desc.cpu_dyn_regs.cpu_boot_status),
 		status,
 		(status == CPU_BOOT_STATUS_SRAM_AVAIL),
-		FW_CPU_STATUS_POLL_INTERVAL_USEC,
+		hdev->fw_poll_interval_usec,
 		fw_loader->cpu_timeout);
 	if (rc) {
 		dev_err(hdev->dev, "failed to wait for Linux\n");
@@ -2121,18 +2234,14 @@ static void hl_fw_linux_update_state(struct hl_device *hdev,
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
 
-	hdev->fw_loader.linux_loaded = true;
-
-	/* Clear reset status since we need to read again from app */
-	prop->hard_reset_done_by_fw = false;
+	hdev->fw_loader.fw_comp_loaded |= FW_TYPE_LINUX;
 
 	/* Read FW application security bits */
 	if (prop->fw_cpu_boot_dev_sts0_valid) {
 		prop->fw_app_cpu_boot_dev_sts0 = RREG32(cpu_boot_dev_sts0_reg);
 
-		if (prop->fw_app_cpu_boot_dev_sts0 &
-				CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
-			prop->hard_reset_done_by_fw = true;
+		prop->hard_reset_done_by_fw = !!(prop->fw_app_cpu_boot_dev_sts0 &
+							CPU_BOOT_DEV_STS0_FW_HARD_RST_EN);
 
 		if (prop->fw_app_cpu_boot_dev_sts0 &
 				CPU_BOOT_DEV_STS0_GIC_PRIVILEGED_EN)
@@ -2247,6 +2356,9 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 	dev_info(hdev->dev,
 		"Loading firmware to device, may take some time...\n");
 
+	/* initialize FW descriptor as invalid */
+	fw_loader->dynamic_loader.fw_desc_valid = false;
+
 	/*
 	 * In this stage, "cpu_dyn_regs" contains only LKD's hard coded values!
 	 * It will be updated from FW after hl_fw_dynamic_request_descriptor().
@@ -2259,14 +2371,14 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 	if (rc)
 		goto protocol_err;
 
-	if (hdev->curr_reset_cause) {
+	if (hdev->reset_info.curr_reset_cause) {
 		rc = hl_fw_dynamic_send_msg(hdev, fw_loader,
-				HL_COMMS_RESET_CAUSE_TYPE, &hdev->curr_reset_cause);
+				HL_COMMS_RESET_CAUSE_TYPE, &hdev->reset_info.curr_reset_cause);
 		if (rc)
 			goto protocol_err;
 
 		/* Clear current reset cause */
-		hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+		hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
 	}
 
 	if (!(hdev->fw_components & FW_TYPE_BOOT_CPU)) {
@@ -2288,6 +2400,15 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 		goto protocol_err;
 	}
 
+	/*
+	 * when testing FW load (without Linux) on PLDM we don't want to
+	 * wait until boot fit is active as it may take several hours.
+	 * instead, we load the bootfit and let it do all initializations in
+	 * the background.
+	 */
+	if (hdev->pldm && !(hdev->fw_components & FW_TYPE_LINUX))
+		return 0;
+
 	rc = hl_fw_dynamic_wait_for_boot_fit_active(hdev, fw_loader);
 	if (rc)
 		goto protocol_err;
@@ -2333,7 +2454,8 @@ static int hl_fw_dynamic_init_cpu(struct hl_device *hdev,
 	return 0;
 
 protocol_err:
-	fw_read_errors(hdev, le32_to_cpu(dyn_regs->cpu_boot_err0),
+	if (fw_loader->dynamic_loader.fw_desc_valid)
+		fw_read_errors(hdev, le32_to_cpu(dyn_regs->cpu_boot_err0),
 				le32_to_cpu(dyn_regs->cpu_boot_err1),
 				le32_to_cpu(dyn_regs->cpu_boot_dev_sts0),
 				le32_to_cpu(dyn_regs->cpu_boot_dev_sts1));
@@ -2380,7 +2502,7 @@ static int hl_fw_static_init_cpu(struct hl_device *hdev,
 		cpu_boot_status_reg,
 		status,
 		status == CPU_BOOT_STATUS_WAITING_FOR_BOOT_FIT,
-		FW_CPU_STATUS_POLL_INTERVAL_USEC,
+		hdev->fw_poll_interval_usec,
 		fw_loader->boot_fit_timeout);
 
 	if (rc) {
@@ -2403,7 +2525,7 @@ static int hl_fw_static_init_cpu(struct hl_device *hdev,
 			cpu_msg_status_reg,
 			status,
 			status == CPU_MSG_OK,
-			FW_CPU_STATUS_POLL_INTERVAL_USEC,
+			hdev->fw_poll_interval_usec,
 			fw_loader->boot_fit_timeout);
 
 		if (rc) {
@@ -2416,7 +2538,14 @@ static int hl_fw_static_init_cpu(struct hl_device *hdev,
 		WREG32(msg_to_cpu_reg, KMD_MSG_NA);
 	}
 
-	/* Make sure CPU boot-loader is running */
+	/*
+	 * Make sure CPU boot-loader is running
+	 * Note that the CPU_BOOT_STATUS_SRAM_AVAIL is generally set by Linux
+	 * yet there is a debug scenario in which we loading uboot (without Linux)
+	 * which at later stage is relocated to DRAM. In this case we expect
+	 * uboot to set the CPU_BOOT_STATUS_SRAM_AVAIL and so we add it to the
+	 * poll flags
+	 */
 	rc = hl_poll_timeout(
 		hdev,
 		cpu_boot_status_reg,
@@ -2425,7 +2554,7 @@ static int hl_fw_static_init_cpu(struct hl_device *hdev,
 		(status == CPU_BOOT_STATUS_NIC_FW_RDY) ||
 		(status == CPU_BOOT_STATUS_READY_TO_BOOT) ||
 		(status == CPU_BOOT_STATUS_SRAM_AVAIL),
-		FW_CPU_STATUS_POLL_INTERVAL_USEC,
+		hdev->fw_poll_interval_usec,
 		cpu_timeout);
 
 	dev_dbg(hdev->dev, "uboot status = %d\n", status);
@@ -2474,7 +2603,7 @@ static int hl_fw_static_init_cpu(struct hl_device *hdev,
 			cpu_boot_status_reg,
 			status,
 			(status == CPU_BOOT_STATUS_BMC_WAITING_SKIPPED),
-			FW_CPU_STATUS_POLL_INTERVAL_USEC,
+			hdev->fw_poll_interval_usec,
 			cpu_timeout);
 
 		if (rc) {
@@ -2494,7 +2623,7 @@ static int hl_fw_static_init_cpu(struct hl_device *hdev,
 		cpu_boot_status_reg,
 		status,
 		(status == CPU_BOOT_STATUS_SRAM_AVAIL),
-		FW_CPU_STATUS_POLL_INTERVAL_USEC,
+		hdev->fw_poll_interval_usec,
 		cpu_timeout);
 
 	/* Clear message */
diff --git a/drivers/misc/habanalabs/common/habanalabs.h b/drivers/misc/habanalabs/common/habanalabs.h
index a2002cbf794b..cb710fd478b6 100644
--- a/drivers/misc/habanalabs/common/habanalabs.h
+++ b/drivers/misc/habanalabs/common/habanalabs.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0
  *
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  *
  */
@@ -61,6 +61,8 @@
 #define HL_CPUCP_INFO_TIMEOUT_USEC	10000000 /* 10s */
 #define HL_CPUCP_EEPROM_TIMEOUT_USEC	10000000 /* 10s */
 
+#define HL_FW_STATUS_POLL_INTERVAL_USEC		10000 /* 10ms */
+
 #define HL_PCI_ELBI_TIMEOUT_MSEC	10 /* 10ms */
 
 #define HL_SIM_MAX_TIMEOUT_US		10000000 /* 10s */
@@ -117,37 +119,37 @@ enum hl_mmu_page_table_location {
 /*
  * Reset Flags
  *
- * - HL_RESET_HARD
+ * - HL_DRV_RESET_HARD
  *       If set do hard reset to all engines. If not set reset just
  *       compute/DMA engines.
  *
- * - HL_RESET_FROM_RESET_THREAD
+ * - HL_DRV_RESET_FROM_RESET_THR
  *       Set if the caller is the hard-reset thread
  *
- * - HL_RESET_HEARTBEAT
+ * - HL_DRV_RESET_HEARTBEAT
  *       Set if reset is due to heartbeat
  *
- * - HL_RESET_TDR
+ * - HL_DRV_RESET_TDR
  *       Set if reset is due to TDR
  *
- * - HL_RESET_DEVICE_RELEASE
+ * - HL_DRV_RESET_DEV_RELEASE
  *       Set if reset is due to device release
  *
- * - HL_RESET_FW
+ * - HL_DRV_RESET_BYPASS_REQ_TO_FW
  *       F/W will perform the reset. No need to ask it to reset the device. This is relevant
  *       only when running with secured f/w
  *
- * - HL_RESET_FW_FATAL_ERR
+ * - HL_DRV_RESET_FW_FATAL_ERR
  *       Set if reset is due to a fatal error from FW
  */
 
-#define HL_RESET_HARD			(1 << 0)
-#define HL_RESET_FROM_RESET_THREAD	(1 << 1)
-#define HL_RESET_HEARTBEAT		(1 << 2)
-#define HL_RESET_TDR			(1 << 3)
-#define HL_RESET_DEVICE_RELEASE		(1 << 4)
-#define HL_RESET_FW			(1 << 5)
-#define HL_RESET_FW_FATAL_ERR		(1 << 6)
+#define HL_DRV_RESET_HARD		(1 << 0)
+#define HL_DRV_RESET_FROM_RESET_THR	(1 << 1)
+#define HL_DRV_RESET_HEARTBEAT		(1 << 2)
+#define HL_DRV_RESET_TDR		(1 << 3)
+#define HL_DRV_RESET_DEV_RELEASE	(1 << 4)
+#define HL_DRV_RESET_BYPASS_REQ_TO_FW	(1 << 5)
+#define HL_DRV_RESET_FW_FATAL_ERR	(1 << 6)
 
 #define HL_MAX_SOBS_PER_MONITOR	8
 
@@ -219,6 +221,7 @@ enum hl_fw_component {
 
 /**
  * enum hl_fw_types - F/W types present in the system
+ * @FW_TYPE_NONE: no FW component indication
  * @FW_TYPE_LINUX: Linux image for device CPU
  * @FW_TYPE_BOOT_CPU: Boot image for device CPU
  * @FW_TYPE_PREBOOT_CPU: Indicates pre-loaded CPUs are present in the system
@@ -226,6 +229,7 @@ enum hl_fw_component {
  * @FW_TYPE_ALL_TYPES: Mask for all types
  */
 enum hl_fw_types {
+	FW_TYPE_NONE = 0x0,
 	FW_TYPE_LINUX = 0x1,
 	FW_TYPE_BOOT_CPU = 0x2,
 	FW_TYPE_PREBOOT_CPU = 0x4,
@@ -353,6 +357,21 @@ enum vm_type {
 };
 
 /**
+ * enum mmu_op_flags - mmu operation relevant information.
+ * @MMU_OP_USERPTR: operation on user memory (host resident).
+ * @MMU_OP_PHYS_PACK: operation on DRAM (device resident).
+ * @MMU_OP_CLEAR_MEMCACHE: operation has to clear memcache.
+ * @MMU_OP_SKIP_LOW_CACHE_INV: operation is allowed to skip parts of cache invalidation.
+ */
+enum mmu_op_flags {
+	MMU_OP_USERPTR = 0x1,
+	MMU_OP_PHYS_PACK = 0x2,
+	MMU_OP_CLEAR_MEMCACHE = 0x4,
+	MMU_OP_SKIP_LOW_CACHE_INV = 0x8,
+};
+
+
+/**
  * enum hl_device_hw_state - H/W device state. use this to understand whether
  *                           to do reset before hw_init or not
  * @HL_DEVICE_HW_STATE_CLEAN: H/W state is clean. i.e. after hard reset
@@ -382,6 +401,7 @@ enum hl_device_hw_state {
  * @hop3_mask: mask to get the PTE address in hop 3.
  * @hop4_mask: mask to get the PTE address in hop 4.
  * @hop5_mask: mask to get the PTE address in hop 5.
+ * @last_mask: mask to get the bit indicating this is the last hop.
  * @page_size: default page size used to allocate memory.
  * @num_hops: The amount of hops supported by the translation table.
  * @host_resident: Should the MMU page table reside in host memory or in the
@@ -402,6 +422,7 @@ struct hl_mmu_properties {
 	u64	hop3_mask;
 	u64	hop4_mask;
 	u64	hop5_mask;
+	u64	last_mask;
 	u32	page_size;
 	u32	num_hops;
 	u8	host_resident;
@@ -524,6 +545,15 @@ struct hl_hints_range {
  * @dynamic_fw_load: is dynamic FW load is supported.
  * @gic_interrupts_enable: true if FW is not blocking GIC controller,
  *                         false otherwise.
+ * @use_get_power_for_reset_history: To support backward compatibility for Goya
+ *                                   and Gaudi
+ * @supports_soft_reset: is soft reset supported.
+ * @allow_inference_soft_reset: true if the ASIC supports soft reset that is
+ *                              initiated by user or TDR. This is only true
+ *                              in inference ASICs, as there is no real-world
+ *                              use-case of doing soft-reset in training (due
+ *                              to the fact that training runs on multiple
+ *                              devices)
  */
 struct asic_fixed_properties {
 	struct hw_queue_properties	*hw_queues_props;
@@ -604,6 +634,9 @@ struct asic_fixed_properties {
 	u8				iatu_done_by_fw;
 	u8				dynamic_fw_load;
 	u8				gic_interrupts_enable;
+	u8				use_get_power_for_reset_history;
+	u8				supports_soft_reset;
+	u8				allow_inference_soft_reset;
 };
 
 /**
@@ -852,10 +885,15 @@ struct hl_user_interrupt {
  *                                    pending on an interrupt
  * @wait_list_node: node in the list of user threads pending on an interrupt
  * @fence: hl fence object for interrupt completion
+ * @cq_target_value: CQ target value
+ * @cq_kernel_addr: CQ kernel address, to be used in the cq interrupt
+ *                  handler for taget value comparison
  */
 struct hl_user_pending_interrupt {
 	struct list_head	wait_list_node;
 	struct hl_fence		fence;
+	u64			cq_target_value;
+	u64			*cq_kernel_addr;
 };
 
 /**
@@ -1010,6 +1048,7 @@ struct fw_response {
  * @image_region: region to copy the FW image to
  * @fw_image_size: size of FW image to load
  * @wait_for_bl_timeout: timeout for waiting for boot loader to respond
+ * @fw_desc_valid: true if FW descriptor has been validated and hence the data can be used
  */
 struct dynamic_fw_load_mgr {
 	struct fw_response response;
@@ -1017,6 +1056,7 @@ struct dynamic_fw_load_mgr {
 	struct pci_mem_region *image_region;
 	size_t fw_image_size;
 	u32 wait_for_bl_timeout;
+	bool fw_desc_valid;
 };
 
 /**
@@ -1042,7 +1082,8 @@ struct fw_image_props {
  * @skip_bmc: should BMC be skipped
  * @sram_bar_id: SRAM bar ID
  * @dram_bar_id: DRAM bar ID
- * @linux_loaded: true if linux was loaded so far
+ * @fw_comp_loaded: bitmask of loaded FW components. set bit meaning loaded
+ *                  component. values are set according to enum hl_fw_types.
  */
 struct fw_load_mgr {
 	union {
@@ -1056,7 +1097,7 @@ struct fw_load_mgr {
 	u8 skip_bmc;
 	u8 sram_bar_id;
 	u8 dram_bar_id;
-	u8 linux_loaded;
+	u8 fw_comp_loaded;
 };
 
 /**
@@ -1128,7 +1169,7 @@ struct fw_load_mgr {
  * @disable_clock_gating: disable clock gating completely
  * @debug_coresight: perform certain actions on Coresight for debugging.
  * @is_device_idle: return true if device is idle, false otherwise.
- * @soft_reset_late_init: perform certain actions needed after soft reset.
+ * @non_hard_reset_late_init: perform certain actions needed after a reset which is not hard-reset
  * @hw_queues_lock: acquire H/W queues lock.
  * @hw_queues_unlock: release H/W queues lock.
  * @get_pci_id: retrieve PCI ID.
@@ -1261,10 +1302,10 @@ struct hl_asic_funcs {
 	int (*send_heartbeat)(struct hl_device *hdev);
 	void (*set_clock_gating)(struct hl_device *hdev);
 	void (*disable_clock_gating)(struct hl_device *hdev);
-	int (*debug_coresight)(struct hl_device *hdev, void *data);
+	int (*debug_coresight)(struct hl_device *hdev, struct hl_ctx *ctx, void *data);
 	bool (*is_device_idle)(struct hl_device *hdev, u64 *mask_arr,
 					u8 mask_len, struct seq_file *s);
-	int (*soft_reset_late_init)(struct hl_device *hdev);
+	int (*non_hard_reset_late_init)(struct hl_device *hdev);
 	void (*hw_queues_lock)(struct hl_device *hdev);
 	void (*hw_queues_unlock)(struct hl_device *hdev);
 	u32 (*get_pci_id)(struct hl_device *hdev);
@@ -1276,7 +1317,7 @@ struct hl_asic_funcs {
 	int (*init_iatu)(struct hl_device *hdev);
 	u32 (*rreg)(struct hl_device *hdev, u32 reg);
 	void (*wreg)(struct hl_device *hdev, u32 reg, u32 val);
-	void (*halt_coresight)(struct hl_device *hdev);
+	void (*halt_coresight)(struct hl_device *hdev, struct hl_ctx *ctx);
 	int (*ctx_init)(struct hl_ctx *ctx);
 	void (*ctx_fini)(struct hl_ctx *ctx);
 	int (*get_clk_rate)(struct hl_device *hdev, u32 *cur_clk, u32 *max_clk);
@@ -1518,6 +1559,9 @@ struct hl_userptr {
  * @submission_time_jiffies: submission time of the cs
  * @type: CS_TYPE_*.
  * @encaps_sig_hdl_id: encaps signals handle id, set for the first staged cs.
+ * @sob_addr_offset: sob offset from the configuration base address.
+ * @initial_sob_count: count of completed signals in SOB before current submission of signal or
+ *                     cs with encaps signals.
  * @submitted: true if CS was submitted to H/W.
  * @completed: true if CS was completed by device.
  * @timedout : true if CS was timedout.
@@ -1553,6 +1597,8 @@ struct hl_cs {
 	u64			submission_time_jiffies;
 	enum hl_cs_type		type;
 	u32			encaps_sig_hdl_id;
+	u32			sob_addr_offset;
+	u16			initial_sob_count;
 	u8			submitted;
 	u8			completed;
 	u8			timedout;
@@ -1792,7 +1838,6 @@ struct hl_debug_params {
  * @dev_node: node in the device list of file private data
  * @refcount: number of related contexts.
  * @restore_phase_mutex: lock for context switch and restore phase.
- * @is_control: true for control device, false otherwise
  */
 struct hl_fpriv {
 	struct hl_device	*hdev;
@@ -1805,7 +1850,6 @@ struct hl_fpriv {
 	struct list_head	dev_node;
 	struct kref		refcount;
 	struct mutex		restore_phase_mutex;
-	u8			is_control;
 };
 
 
@@ -1864,6 +1908,7 @@ struct hl_debugfs_entry {
  * @i2c_bus: generic u8 debugfs file for bus value to use in i2c_data_read.
  * @i2c_addr: generic u8 debugfs file for address value to use in i2c_data_read.
  * @i2c_reg: generic u8 debugfs file for register value to use in i2c_data_read.
+ * @i2c_len: generic u8 debugfs file for length value to use in i2c_data_read.
  */
 struct hl_dbg_device_entry {
 	struct dentry			*root;
@@ -1892,6 +1937,7 @@ struct hl_dbg_device_entry {
 	u8				i2c_bus;
 	u8				i2c_addr;
 	u8				i2c_reg;
+	u8				i2c_len;
 };
 
 /**
@@ -2180,13 +2226,13 @@ struct hwmon_chip_info;
  * @wq: work queue for device reset procedure.
  * @reset_work: reset work to be done.
  * @hdev: habanalabs device structure.
- * @fw_reset: whether f/w will do the reset without us sending them a message to do it.
+ * @flags: reset flags.
  */
 struct hl_device_reset_work {
 	struct workqueue_struct		*wq;
 	struct delayed_work		reset_work;
 	struct hl_device		*hdev;
-	bool				fw_reset;
+	u32				flags;
 };
 
 /**
@@ -2328,12 +2374,10 @@ struct multi_cs_completion {
  * @ctx: pointer to the context structure
  * @fence_arr: array of fences of all CSs
  * @seq_arr: array of CS sequence numbers
- * @timeout_us: timeout in usec for waiting for CS to complete
+ * @timeout_jiffies: timeout in jiffies for waiting for CS to complete
  * @timestamp: timestamp of first completed CS
  * @wait_status: wait for CS status
  * @completion_bitmap: bitmap of completed CSs (1- completed, otherwise 0)
- * @stream_master_qid_map: bitmap of all stream master QIDs on which the
- *                         multi-CS is waiting
  * @arr_len: fence_arr and seq_arr array length
  * @gone_cs: indication of gone CS (1- there was gone CS, otherwise 0)
  * @update_ts: update timestamp. 1- update the timestamp, otherwise 0.
@@ -2342,17 +2386,114 @@ struct multi_cs_data {
 	struct hl_ctx	*ctx;
 	struct hl_fence	**fence_arr;
 	u64		*seq_arr;
-	s64		timeout_us;
+	s64		timeout_jiffies;
 	s64		timestamp;
 	long		wait_status;
 	u32		completion_bitmap;
-	u32		stream_master_qid_map;
 	u8		arr_len;
 	u8		gone_cs;
 	u8		update_ts;
 };
 
 /**
+ * struct hl_clk_throttle_timestamp - current/last clock throttling timestamp
+ * @start: timestamp taken when 'start' event is received in driver
+ * @end: timestamp taken when 'end' event is received in driver
+ */
+struct hl_clk_throttle_timestamp {
+	ktime_t		start;
+	ktime_t		end;
+};
+
+/**
+ * struct hl_clk_throttle - keeps current/last clock throttling timestamps
+ * @timestamp: timestamp taken by driver and firmware, index 0 refers to POWER
+ *             index 1 refers to THERMAL
+ * @lock: protects this structure as it can be accessed from both event queue
+ *        context and info_ioctl context
+ * @current_reason: bitmask represents the current clk throttling reasons
+ * @aggregated_reason: bitmask represents aggregated clk throttling reasons since driver load
+ */
+struct hl_clk_throttle {
+	struct hl_clk_throttle_timestamp timestamp[HL_CLK_THROTTLE_TYPE_MAX];
+	struct mutex	lock;
+	u32		current_reason;
+	u32		aggregated_reason;
+};
+
+/**
+ * struct last_error_session_info - info about last session in which CS timeout or
+ *                                    razwi error occurred.
+ * @open_dev_timestamp: device open timestamp.
+ * @cs_timeout_timestamp: CS timeout timestamp.
+ * @razwi_timestamp: razwi timestamp.
+ * @cs_write_disable: if set writing to CS parameters in the structure is disabled so the
+ *                    first (root cause) CS timeout will not be overwritten.
+ * @razwi_write_disable: if set writing to razwi parameters in the structure is disabled so the
+ *                       first (root cause) razwi will not be overwritten.
+ * @cs_timeout_seq: CS timeout sequence number.
+ * @razwi_addr: address that caused razwi.
+ * @razwi_engine_id_1: engine id of the razwi initiator, if it was initiated by engine that does
+ *                     not have engine id it will be set to U16_MAX.
+ * @razwi_engine_id_2: second engine id of razwi initiator. Might happen that razwi have 2 possible
+ *                     engines which one them caused the razwi. In that case, it will contain the
+ *                     second possible engine id, otherwise it will be set to U16_MAX.
+ * @razwi_non_engine_initiator: in case the initiator of the razwi does not have engine id.
+ * @razwi_type: cause of razwi, page fault or access error, otherwise it will be set to U8_MAX.
+ */
+struct last_error_session_info {
+	ktime_t		open_dev_timestamp;
+	ktime_t		cs_timeout_timestamp;
+	ktime_t		razwi_timestamp;
+	atomic_t	cs_write_disable;
+	atomic_t	razwi_write_disable;
+	u64		cs_timeout_seq;
+	u64		razwi_addr;
+	u16		razwi_engine_id_1;
+	u16		razwi_engine_id_2;
+	u8		razwi_non_engine_initiator;
+	u8		razwi_type;
+};
+
+/**
+ * struct hl_reset_info - holds current device reset information.
+ * @lock: lock to protect critical reset flows.
+ * @soft_reset_cnt: number of soft reset since the driver was loaded.
+ * @hard_reset_cnt: number of hard reset since the driver was loaded.
+ * @hard_reset_schedule_flags: hard reset is scheduled to after current soft reset,
+ *                             here we hold the hard reset flags.
+ * @in_reset: is device in reset flow.
+ * @is_in_soft_reset: Device is currently in soft reset process.
+ * @needs_reset: true if reset_on_lockup is false and device should be reset
+ *               due to lockup.
+ * @hard_reset_pending: is there a hard reset work pending.
+ * @curr_reset_cause: saves an enumerated reset cause when a hard reset is
+ *                    triggered, and cleared after it is shared with preboot.
+ * @prev_reset_trigger: saves the previous trigger which caused a reset, overidden
+ *                      with a new value on next reset
+ * @reset_trigger_repeated: set if device reset is triggered more than once with
+ *                          same cause.
+ * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to
+ *                         complete instead.
+ */
+struct hl_reset_info {
+	spinlock_t	lock;
+	u32		soft_reset_cnt;
+	u32		hard_reset_cnt;
+	u32		hard_reset_schedule_flags;
+	u8		in_reset;
+	u8		is_in_soft_reset;
+	u8		needs_reset;
+	u8		hard_reset_pending;
+
+	u8		curr_reset_cause;
+	u8		prev_reset_trigger;
+	u8		reset_trigger_repeated;
+
+	u8		skip_reset_on_timeout;
+};
+
+/**
  * struct hl_device - habanalabs device structure.
  * @pdev: pointer to PCI device, can be NULL in case of simulator device.
  * @pcie_bar_phys: array of available PCIe bars physical addresses.
@@ -2363,7 +2504,6 @@ struct multi_cs_data {
  * @cdev_ctrl: char device for control operations only (INFO IOCTL)
  * @dev: related kernel basic device structure.
  * @dev_ctrl: related kernel device structure for the control device
- * @work_freq: delayed work to lower device frequency if possible.
  * @work_heartbeat: delayed work for CPU-CP is-alive check.
  * @device_reset_work: delayed work which performs hard reset
  * @asic_name: ASIC specific name.
@@ -2398,7 +2538,6 @@ struct multi_cs_data {
  * @asic_specific: ASIC specific information to use only from ASIC files.
  * @vm: virtual memory manager for MMU.
  * @hwmon_dev: H/W monitor device.
- * @pm_mng_profile: current power management profile.
  * @hl_chip_info: ASIC's sensors information.
  * @device_status_description: device status description.
  * @hl_debugfs: device's debugfs manager.
@@ -2410,8 +2549,10 @@ struct multi_cs_data {
  * @internal_cb_va_base: internal cb pool mmu virtual address base
  * @fpriv_list: list of file private data structures. Each structure is created
  *              when a user opens the device
+ * @fpriv_ctrl_list: list of file private data structures. Each structure is created
+ *              when a user opens the control device
  * @fpriv_list_lock: protects the fpriv_list
- * @compute_ctx: current compute context executing.
+ * @fpriv_ctrl_list_lock: protects the fpriv_ctrl_list
  * @aggregated_cs_counters: aggregated cs counters among all contexts
  * @mmu_priv: device-specific MMU data.
  * @mmu_func: device-related MMU functions.
@@ -2419,6 +2560,10 @@ struct multi_cs_data {
  * @pci_mem_region: array of memory regions in the PCI
  * @state_dump_specs: constants and dictionaries needed to dump system state.
  * @multi_cs_completion: array of multi-CS completion.
+ * @clk_throttling: holds information about current/previous clock throttling events
+ * @reset_info: holds current device reset information.
+ * @last_error: holds information about last session in which CS timeout or razwi error occurred.
+ * @stream_master_qid_arr: pointer to array with QIDs of master streams.
  * @dram_used_mem: current DRAM memory consumption.
  * @timeout_jiffies: device CS timeout value.
  * @max_power: the max power of the device, as configured by the sysadmin. This
@@ -2434,20 +2579,17 @@ struct multi_cs_data {
  *                          device initialization. Mainly used to debug and
  *                          workaround firmware bugs
  * @dram_pci_bar_start: start bus address of PCIe bar towards DRAM.
+ * @last_successful_open_ktime: timestamp (ktime) of the last successful device open.
  * @last_successful_open_jif: timestamp (jiffies) of the last successful
  *                            device open.
  * @last_open_session_duration_jif: duration (jiffies) of the last device open
  *                                  session.
  * @open_counter: number of successful device open operations.
- * @in_reset: is device in reset flow.
- * @curr_pll_profile: current PLL profile.
+ * @fw_poll_interval_usec: FW status poll interval in usec.
  * @card_type: Various ASICs have several card types. This indicates the card
  *             type of the current device.
  * @major: habanalabs kernel driver major.
  * @high_pll: high PLL profile frequency.
- * @soft_reset_cnt: number of soft reset since the driver was loaded.
- * @hard_reset_cnt: number of hard reset since the driver was loaded.
- * @clk_throttling_reason: bitmask represents the current clk throttling reasons
  * @id: device minor.
  * @id_control: minor of the control device
  * @cpu_pci_msb_addr: 50-bit extension bits for the device CPU's 40-bit
@@ -2455,7 +2597,6 @@ struct multi_cs_data {
  * @disabled: is device disabled.
  * @late_init_done: is late init stage was done during initialization.
  * @hwmon_initialized: is H/W monitor sensors was initialized.
- * @hard_reset_pending: is there a hard reset work pending.
  * @heartbeat: is heartbeat sanity check towards CPU-CP enabled.
  * @reset_on_lockup: true if a reset should be done in case of stuck CS, false
  *                   otherwise.
@@ -2467,8 +2608,9 @@ struct multi_cs_data {
  * @init_done: is the initialization of the device done.
  * @device_cpu_disabled: is the device CPU disabled (due to timeouts)
  * @dma_mask: the dma mask that was set for this device
- * @in_debug: is device under debug. This, together with fpriv_list, enforces
- *            that only a single user is configuring the debug infrastructure.
+ * @in_debug: whether the device is in a state where the profiling/tracing infrastructure
+ *            can be used. This indication is needed because in some ASICs we need to do
+ *            specific operations to enable that infrastructure.
  * @power9_64bit_dma_enable: true to enable 64-bit DMA mask support. Relevant
  *                           only to POWER9 machines.
  * @cdev_sysfs_created: were char devices and sysfs nodes created.
@@ -2477,34 +2619,18 @@ struct multi_cs_data {
  * @sync_stream_queue_idx: helper index for sync stream queues initialization.
  * @collective_mon_idx: helper index for collective initialization
  * @supports_coresight: is CoreSight supported.
- * @supports_soft_reset: is soft reset supported.
- * @allow_inference_soft_reset: true if the ASIC supports soft reset that is
- *                              initiated by user or TDR. This is only true
- *                              in inference ASICs, as there is no real-world
- *                              use-case of doing soft-reset in training (due
- *                              to the fact that training runs on multiple
- *                              devices)
  * @supports_cb_mapping: is mapping a CB to the device's MMU supported.
- * @needs_reset: true if reset_on_lockup is false and device should be reset
- *               due to lockup.
  * @process_kill_trial_cnt: number of trials reset thread tried killing
  *                          user processes
  * @device_fini_pending: true if device_fini was called and might be
  *                       waiting for the reset thread to finish
  * @supports_staged_submission: true if staged submissions are supported
- * @curr_reset_cause: saves an enumerated reset cause when a hard reset is
- *                    triggered, and cleared after it is shared with preboot.
- * @prev_reset_trigger: saves the previous trigger which caused a reset, overidden
- *                      with a new value on next reset
- * @reset_trigger_repeated: set if device reset is triggered more than once with
- *                          same cause.
- * @skip_reset_on_timeout: Skip device reset if CS has timed out, wait for it to
- *                         complete instead.
  * @device_cpu_is_halted: Flag to indicate whether the device CPU was already
  *                        halted. We can't halt it again because the COMMS
  *                        protocol will throw an error. Relevant only for
  *                        cases where Linux was not loaded to device CPU
  * @supports_wait_for_multi_cs: true if wait for multi CS is supported
+ * @is_compute_ctx_active: Whether there is an active compute context executing.
  */
 struct hl_device {
 	struct pci_dev			*pdev;
@@ -2515,7 +2641,6 @@ struct hl_device {
 	struct cdev			cdev_ctrl;
 	struct device			*dev;
 	struct device			*dev_ctrl;
-	struct delayed_work		work_freq;
 	struct delayed_work		work_heartbeat;
 	struct hl_device_reset_work	device_reset_work;
 	char				asic_name[HL_STR_MAX];
@@ -2546,7 +2671,6 @@ struct hl_device {
 	void				*asic_specific;
 	struct hl_vm			vm;
 	struct device			*hwmon_dev;
-	enum hl_pm_mng_profile		pm_mng_profile;
 	struct hwmon_chip_info		*hl_chip_info;
 
 	struct hl_dbg_device_entry	hl_debugfs;
@@ -2560,9 +2684,9 @@ struct hl_device {
 	u64				internal_cb_va_base;
 
 	struct list_head		fpriv_list;
+	struct list_head		fpriv_ctrl_list;
 	struct mutex			fpriv_list_lock;
-
-	struct hl_ctx			*compute_ctx;
+	struct mutex			fpriv_ctrl_list_lock;
 
 	struct hl_cs_counters_atomic	aggregated_cs_counters;
 
@@ -2577,6 +2701,11 @@ struct hl_device {
 
 	struct multi_cs_completion	multi_cs_completion[
 							MULTI_CS_MAX_USER_CTX];
+	struct hl_clk_throttle		clk_throttling;
+	struct last_error_session_info	last_error;
+
+	struct hl_reset_info		reset_info;
+
 	u32				*stream_master_qid_arr;
 	atomic64_t			dram_used_mem;
 	u64				timeout_jiffies;
@@ -2587,21 +2716,17 @@ struct hl_device {
 	u64				last_successful_open_jif;
 	u64				last_open_session_duration_jif;
 	u64				open_counter;
-	atomic_t			in_reset;
-	enum hl_pll_frequency		curr_pll_profile;
+	u64				fw_poll_interval_usec;
+	ktime_t				last_successful_open_ktime;
 	enum cpucp_card_types		card_type;
 	u32				major;
 	u32				high_pll;
-	u32				soft_reset_cnt;
-	u32				hard_reset_cnt;
-	u32				clk_throttling_reason;
 	u16				id;
 	u16				id_control;
 	u16				cpu_pci_msb_addr;
 	u8				disabled;
 	u8				late_init_done;
 	u8				hwmon_initialized;
-	u8				hard_reset_pending;
 	u8				heartbeat;
 	u8				reset_on_lockup;
 	u8				dram_default_page_mapping;
@@ -2618,20 +2743,14 @@ struct hl_device {
 	u8				sync_stream_queue_idx;
 	u8				collective_mon_idx;
 	u8				supports_coresight;
-	u8				supports_soft_reset;
-	u8				allow_inference_soft_reset;
 	u8				supports_cb_mapping;
-	u8				needs_reset;
 	u8				process_kill_trial_cnt;
 	u8				device_fini_pending;
 	u8				supports_staged_submission;
-	u8				curr_reset_cause;
-	u8				prev_reset_trigger;
-	u8				reset_trigger_repeated;
-	u8				skip_reset_on_timeout;
 	u8				device_cpu_is_halted;
 	u8				supports_wait_for_multi_cs;
 	u8				stream_master_qid_arr_size;
+	u8				is_compute_ctx_active;
 
 	/* Parameters for bring-up */
 	u64				nic_ports_mask;
@@ -2659,6 +2778,7 @@ struct hl_device {
  *            wait cs are used to wait of the reserved encaps signals.
  * @hdev: pointer to habanalabs device structure.
  * @hw_sob: pointer to  H/W SOB used in the reservation.
+ * @ctx: pointer to the user's context data structure
  * @cs_seq: staged cs sequence which contains encapsulated signals
  * @id: idr handler id to be used to fetch the handler info
  * @q_idx: stream queue index
@@ -2669,6 +2789,7 @@ struct hl_cs_encaps_sig_handle {
 	struct kref refcount;
 	struct hl_device *hdev;
 	struct hl_hw_sob *hw_sob;
+	struct hl_ctx *ctx;
 	u64  cs_seq;
 	u32  id;
 	u32  q_idx;
@@ -2757,21 +2878,9 @@ static inline bool hl_mem_area_inside_range(u64 address, u64 size,
 static inline bool hl_mem_area_crosses_range(u64 address, u32 size,
 				u64 range_start_address, u64 range_end_address)
 {
-	u64 end_address = address + size;
+	u64 end_address = address + size - 1;
 
-	if ((address >= range_start_address) &&
-			(address < range_end_address))
-		return true;
-
-	if ((end_address >= range_start_address) &&
-			(end_address < range_end_address))
-		return true;
-
-	if ((address < range_start_address) &&
-			(end_address >= range_end_address))
-		return true;
-
-	return false;
+	return ((address <= range_end_address) && (range_start_address <= end_address));
 }
 
 int hl_device_open(struct inode *inode, struct file *filp);
@@ -2779,10 +2888,7 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp);
 bool hl_device_operational(struct hl_device *hdev,
 		enum hl_device_status *status);
 enum hl_device_status hl_device_status(struct hl_device *hdev);
-int hl_device_set_debug_mode(struct hl_device *hdev, bool enable);
-int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
-		enum hl_asic_type asic_type, int minor);
-void destroy_hdev(struct hl_device *hdev);
+int hl_device_set_debug_mode(struct hl_device *hdev, struct hl_ctx *ctx, bool enable);
 int hl_hw_queues_create(struct hl_device *hdev);
 void hl_hw_queues_destroy(struct hl_device *hdev);
 int hl_hw_queue_send_cb_no_cmpl(struct hl_device *hdev, u32 hw_queue_id,
@@ -2821,6 +2927,7 @@ int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx);
 void hl_ctx_do_release(struct kref *ref);
 void hl_ctx_get(struct hl_device *hdev,	struct hl_ctx *ctx);
 int hl_ctx_put(struct hl_ctx *ctx);
+struct hl_ctx *hl_get_compute_ctx(struct hl_device *hdev);
 struct hl_fence *hl_ctx_get_fence(struct hl_ctx *ctx, u64 seq);
 int hl_ctx_get_fences(struct hl_ctx *ctx, u64 *seq_arr,
 				struct hl_fence **fence, u32 arr_len);
@@ -2834,7 +2941,6 @@ int hl_device_resume(struct hl_device *hdev);
 int hl_device_reset(struct hl_device *hdev, u32 flags);
 void hl_hpriv_get(struct hl_fpriv *hpriv);
 int hl_hpriv_put(struct hl_fpriv *hpriv);
-int hl_device_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq);
 int hl_device_utilization(struct hl_device *hdev, u32 *utilization);
 
 int hl_build_hwmon_channel_info(struct hl_device *hdev,
@@ -2915,6 +3021,9 @@ int hl_mmu_unmap_page(struct hl_ctx *ctx, u64 virt_addr, u32 page_size,
 int hl_mmu_map_contiguous(struct hl_ctx *ctx, u64 virt_addr,
 					u64 phys_addr, u32 size);
 int hl_mmu_unmap_contiguous(struct hl_ctx *ctx, u64 virt_addr, u32 size);
+int hl_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, u32 flags);
+int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard,
+					u32 flags, u32 asid, u64 va, u64 size);
 void hl_mmu_swap_out(struct hl_ctx *ctx);
 void hl_mmu_swap_in(struct hl_ctx *ctx);
 int hl_mmu_if_set_funcs(struct hl_device *hdev);
@@ -2969,6 +3078,10 @@ int hl_fw_dynamic_send_protocol_cmd(struct hl_device *hdev,
 				struct fw_load_mgr *fw_loader,
 				enum comms_cmd cmd, unsigned int size,
 				bool wait_ok, u32 timeout);
+int hl_fw_dram_replaced_row_get(struct hl_device *hdev,
+				struct cpucp_hbm_row_info *info);
+int hl_fw_dram_pending_row_get(struct hl_device *hdev, u32 *pend_rows_num);
+int hl_fw_cpucp_engine_core_asid_set(struct hl_device *hdev, u32 asid);
 int hl_pci_bars_map(struct hl_device *hdev, const char * const name[3],
 			bool is_wc[3]);
 int hl_pci_elbi_read(struct hl_device *hdev, u64 addr, u32 *data);
diff --git a/drivers/misc/habanalabs/common/habanalabs_drv.c b/drivers/misc/habanalabs/common/habanalabs_drv.c
index 949d1b5c5c41..690b763c7a95 100644
--- a/drivers/misc/habanalabs/common/habanalabs_drv.c
+++ b/drivers/misc/habanalabs/common/habanalabs_drv.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /*
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  *
  */
@@ -153,15 +153,7 @@ int hl_device_open(struct inode *inode, struct file *filp)
 		goto out_err;
 	}
 
-	if (hdev->in_debug) {
-		dev_err_ratelimited(hdev->dev,
-			"Can't open %s because it is being debugged by another user\n",
-			dev_name(hdev->dev));
-		rc = -EPERM;
-		goto out_err;
-	}
-
-	if (hdev->compute_ctx) {
+	if (hdev->is_compute_ctx_active) {
 		dev_dbg_ratelimited(hdev->dev,
 			"Can't open %s because another user is working on it\n",
 			dev_name(hdev->dev));
@@ -175,20 +167,17 @@ int hl_device_open(struct inode *inode, struct file *filp)
 		goto out_err;
 	}
 
-	/* Device is IDLE at this point so it is legal to change PLLs.
-	 * There is no need to check anything because if the PLL is
-	 * already HIGH, the set function will return without doing
-	 * anything
-	 */
-	hl_device_set_frequency(hdev, PLL_HIGH);
-
 	list_add(&hpriv->dev_node, &hdev->fpriv_list);
 	mutex_unlock(&hdev->fpriv_list_lock);
 
 	hl_debugfs_add_file(hpriv);
 
+	atomic_set(&hdev->last_error.cs_write_disable, 0);
+	atomic_set(&hdev->last_error.razwi_write_disable, 0);
+
 	hdev->open_counter++;
 	hdev->last_successful_open_jif = jiffies;
+	hdev->last_successful_open_ktime = ktime_get();
 
 	return 0;
 
@@ -231,12 +220,11 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp)
 	hpriv->hdev = hdev;
 	filp->private_data = hpriv;
 	hpriv->filp = filp;
-	hpriv->is_control = true;
 	nonseekable_open(inode, filp);
 
 	hpriv->taskpid = find_get_pid(current->pid);
 
-	mutex_lock(&hdev->fpriv_list_lock);
+	mutex_lock(&hdev->fpriv_ctrl_list_lock);
 
 	if (!hl_device_operational(hdev, NULL)) {
 		dev_err_ratelimited(hdev->dev_ctrl,
@@ -246,13 +234,13 @@ int hl_device_open_ctrl(struct inode *inode, struct file *filp)
 		goto out_err;
 	}
 
-	list_add(&hpriv->dev_node, &hdev->fpriv_list);
-	mutex_unlock(&hdev->fpriv_list_lock);
+	list_add(&hpriv->dev_node, &hdev->fpriv_ctrl_list);
+	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
 
 	return 0;
 
 out_err:
-	mutex_unlock(&hdev->fpriv_list_lock);
+	mutex_unlock(&hdev->fpriv_ctrl_list_lock);
 	filp->private_data = NULL;
 	put_pid(hpriv->taskpid);
 
@@ -263,6 +251,7 @@ out_err:
 
 static void set_driver_behavior_per_device(struct hl_device *hdev)
 {
+	hdev->pldm = 0;
 	hdev->fw_components = FW_TYPE_ALL_TYPES;
 	hdev->cpu_queues_enable = 1;
 	hdev->heartbeat = 1;
@@ -279,23 +268,53 @@ static void set_driver_behavior_per_device(struct hl_device *hdev)
 	hdev->axi_drain = 0;
 }
 
-/*
+static void copy_kernel_module_params_to_device(struct hl_device *hdev)
+{
+	hdev->major = hl_major;
+	hdev->memory_scrub = memory_scrub;
+	hdev->reset_on_lockup = reset_on_lockup;
+	hdev->boot_error_status_mask = boot_error_status_mask;
+
+	if (timeout_locked)
+		hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
+	else
+		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
+
+}
+
+static int fixup_device_params(struct hl_device *hdev)
+{
+	hdev->asic_prop.fw_security_enabled = is_asic_secured(hdev->asic_type);
+
+	hdev->fw_poll_interval_usec = HL_FW_STATUS_POLL_INTERVAL_USEC;
+
+	hdev->stop_on_err = true;
+	hdev->reset_info.curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
+	hdev->reset_info.prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
+
+	/* Enable only after the initialization of the device */
+	hdev->disabled = true;
+
+	/* Set default DMA mask to 32 bits */
+	hdev->dma_mask = 32;
+
+	return 0;
+}
+
+/**
  * create_hdev - create habanalabs device instance
  *
  * @dev: will hold the pointer to the new habanalabs device structure
  * @pdev: pointer to the pci device
- * @asic_type: in case of simulator device, which device is it
- * @minor: in case of simulator device, the minor of the device
  *
  * Allocate memory for habanalabs device and initialize basic fields
  * Identify the ASIC type
  * Allocate ID (minor) for the device (only for real devices)
  */
-int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
-		enum hl_asic_type asic_type, int minor)
+static int create_hdev(struct hl_device **dev, struct pci_dev *pdev)
 {
+	int main_id, ctrl_id = 0, rc = 0;
 	struct hl_device *hdev;
-	int rc, main_id, ctrl_id = 0;
 
 	*dev = NULL;
 
@@ -303,69 +322,39 @@ int create_hdev(struct hl_device **dev, struct pci_dev *pdev,
 	if (!hdev)
 		return -ENOMEM;
 
-	/* First, we must find out which ASIC are we handling. This is needed
-	 * to configure the behavior of the driver (kernel parameters)
-	 */
-	if (pdev) {
-		hdev->asic_type = get_asic_type(pdev->device);
-		if (hdev->asic_type == ASIC_INVALID) {
-			dev_err(&pdev->dev, "Unsupported ASIC\n");
-			rc = -ENODEV;
-			goto free_hdev;
-		}
-	} else {
-		hdev->asic_type = asic_type;
-	}
-
-	if (pdev)
-		hdev->asic_prop.fw_security_enabled =
-					is_asic_secured(hdev->asic_type);
-	else
-		hdev->asic_prop.fw_security_enabled = false;
+	/* can be NULL in case of simulator device */
+	hdev->pdev = pdev;
 
 	/* Assign status description string */
-	strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL],
-					"operational", HL_STR_MAX);
-	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET],
-					"in reset", HL_STR_MAX);
-	strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION],
-					"disabled", HL_STR_MAX);
-	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET],
-					"needs reset", HL_STR_MAX);
+	strncpy(hdev->status[HL_DEVICE_STATUS_OPERATIONAL], "operational", HL_STR_MAX);
+	strncpy(hdev->status[HL_DEVICE_STATUS_IN_RESET], "in reset", HL_STR_MAX);
+	strncpy(hdev->status[HL_DEVICE_STATUS_MALFUNCTION], "disabled", HL_STR_MAX);
+	strncpy(hdev->status[HL_DEVICE_STATUS_NEEDS_RESET], "needs reset", HL_STR_MAX);
 	strncpy(hdev->status[HL_DEVICE_STATUS_IN_DEVICE_CREATION],
 					"in device creation", HL_STR_MAX);
 
-	hdev->major = hl_major;
-	hdev->reset_on_lockup = reset_on_lockup;
-	hdev->memory_scrub = memory_scrub;
-	hdev->boot_error_status_mask = boot_error_status_mask;
-	hdev->stop_on_err = true;
+	/* First, we must find out which ASIC are we handling. This is needed
+	 * to configure the behavior of the driver (kernel parameters)
+	 */
+	hdev->asic_type = get_asic_type(pdev->device);
+	if (hdev->asic_type == ASIC_INVALID) {
+		dev_err(&pdev->dev, "Unsupported ASIC\n");
+		rc = -ENODEV;
+		goto free_hdev;
+	}
 
-	hdev->pldm = 0;
+	copy_kernel_module_params_to_device(hdev);
 
 	set_driver_behavior_per_device(hdev);
 
-	hdev->curr_reset_cause = HL_RESET_CAUSE_UNKNOWN;
-	hdev->prev_reset_trigger = HL_RESET_TRIGGER_DEFAULT;
-
-	if (timeout_locked)
-		hdev->timeout_jiffies = msecs_to_jiffies(timeout_locked * 1000);
-	else
-		hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
-
-	hdev->disabled = true;
-	hdev->pdev = pdev; /* can be NULL in case of simulator device */
-
-	/* Set default DMA mask to 32 bits */
-	hdev->dma_mask = 32;
+	fixup_device_params(hdev);
 
 	mutex_lock(&hl_devs_idr_lock);
 
 	/* Always save 2 numbers, 1 for main device and 1 for control.
 	 * They must be consecutive
 	 */
-	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS,
-				GFP_KERNEL);
+	main_id = idr_alloc(&hl_devs_idr, hdev, 0, HL_MAX_MINORS, GFP_KERNEL);
 
 	if (main_id >= 0)
 		ctrl_id = idr_alloc(&hl_devs_idr, hdev, main_id + 1,
@@ -405,7 +394,7 @@ free_hdev:
  * @dev: pointer to the habanalabs device structure
  *
  */
-void destroy_hdev(struct hl_device *hdev)
+static void destroy_hdev(struct hl_device *hdev)
 {
 	/* Remove device from the device list */
 	mutex_lock(&hl_devs_idr_lock);
@@ -444,7 +433,7 @@ static int hl_pmops_resume(struct device *dev)
 	return hl_device_resume(hdev);
 }
 
-/*
+/**
  * hl_pci_probe - probe PCI habanalabs devices
  *
  * @pdev: pointer to pci device
@@ -454,8 +443,7 @@ static int hl_pmops_resume(struct device *dev)
  * Create a new habanalabs device and initialize it according to the
  * device's type
  */
-static int hl_pci_probe(struct pci_dev *pdev,
-				const struct pci_device_id *id)
+static int hl_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id)
 {
 	struct hl_device *hdev;
 	int rc;
@@ -464,7 +452,7 @@ static int hl_pci_probe(struct pci_dev *pdev,
 		 " device found [%04x:%04x] (rev %x)\n",
 		 (int)pdev->vendor, (int)pdev->device, (int)pdev->revision);
 
-	rc = create_hdev(&hdev, pdev, ASIC_INVALID, -1);
+	rc = create_hdev(&hdev, pdev);
 	if (rc)
 		return rc;
 
diff --git a/drivers/misc/habanalabs/common/habanalabs_ioctl.c b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
index 86c3257d9ae1..3ba3a8ffda3e 100644
--- a/drivers/misc/habanalabs/common/habanalabs_ioctl.c
+++ b/drivers/misc/habanalabs/common/habanalabs_ioctl.c
@@ -158,7 +158,7 @@ static int hw_idle(struct hl_device *hdev, struct hl_info_args *args)
 		min((size_t) max_size, sizeof(hw_idle))) ? -EFAULT : 0;
 }
 
-static int debug_coresight(struct hl_device *hdev, struct hl_debug_args *args)
+static int debug_coresight(struct hl_device *hdev, struct hl_ctx *ctx, struct hl_debug_args *args)
 {
 	struct hl_debug_params *params;
 	void *input = NULL, *output = NULL;
@@ -200,7 +200,7 @@ static int debug_coresight(struct hl_device *hdev, struct hl_debug_args *args)
 		params->output_size = args->output_size;
 	}
 
-	rc = hdev->asic_funcs->debug_coresight(hdev, params);
+	rc = hdev->asic_funcs->debug_coresight(hdev, ctx, params);
 	if (rc) {
 		dev_err(hdev->dev,
 			"debug coresight operation failed %d\n", rc);
@@ -269,8 +269,8 @@ static int get_reset_count(struct hl_device *hdev, struct hl_info_args *args)
 	if ((!max_size) || (!out))
 		return -EINVAL;
 
-	reset_count.hard_reset_cnt = hdev->hard_reset_cnt;
-	reset_count.soft_reset_cnt = hdev->soft_reset_cnt;
+	reset_count.hard_reset_cnt = hdev->reset_info.hard_reset_cnt;
+	reset_count.soft_reset_cnt = hdev->reset_info.soft_reset_cnt;
 
 	return copy_to_user(out, &reset_count,
 		min((size_t) max_size, sizeof(reset_count))) ? -EFAULT : 0;
@@ -313,15 +313,38 @@ static int pci_counters_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 
 static int clk_throttle_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 {
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
 	struct hl_device *hdev = hpriv->hdev;
 	struct hl_info_clk_throttle clk_throttle = {0};
+	ktime_t end_time, zero_time = ktime_set(0, 0);
 	u32 max_size = args->return_size;
-	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	int i;
 
 	if ((!max_size) || (!out))
 		return -EINVAL;
 
-	clk_throttle.clk_throttling_reason = hdev->clk_throttling_reason;
+	mutex_lock(&hdev->clk_throttling.lock);
+
+	clk_throttle.clk_throttling_reason = hdev->clk_throttling.current_reason;
+
+	for (i = 0 ; i < HL_CLK_THROTTLE_TYPE_MAX ; i++) {
+		if (!(hdev->clk_throttling.aggregated_reason & BIT(i)))
+			continue;
+
+		clk_throttle.clk_throttling_timestamp_us[i] =
+			ktime_to_us(hdev->clk_throttling.timestamp[i].start);
+
+		if (ktime_compare(hdev->clk_throttling.timestamp[i].end, zero_time))
+			end_time = hdev->clk_throttling.timestamp[i].end;
+		else
+			end_time = ktime_get();
+
+		clk_throttle.clk_throttling_duration_ns[i] =
+			ktime_to_ns(ktime_sub(end_time,
+				hdev->clk_throttling.timestamp[i].start));
+
+	}
+	mutex_unlock(&hdev->clk_throttling.lock);
 
 	return copy_to_user(out, &clk_throttle,
 		min((size_t) max_size, sizeof(clk_throttle))) ? -EFAULT : 0;
@@ -480,6 +503,94 @@ static int open_stats_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
 		min((size_t) max_size, sizeof(open_stats_info))) ? -EFAULT : 0;
 }
 
+static int dram_pending_rows_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	u32 max_size = args->return_size;
+	u32 pend_rows_num = 0;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	int rc;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	rc = hl_fw_dram_pending_row_get(hdev, &pend_rows_num);
+	if (rc)
+		return rc;
+
+	return copy_to_user(out, &pend_rows_num,
+			min_t(size_t, max_size, sizeof(pend_rows_num))) ? -EFAULT : 0;
+}
+
+static int dram_replaced_rows_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	u32 max_size = args->return_size;
+	struct cpucp_hbm_row_info info = {0};
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+	int rc;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	rc = hl_fw_dram_replaced_row_get(hdev, &info);
+	if (rc)
+		return rc;
+
+	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
+}
+
+static int last_err_open_dev_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	struct hl_info_last_err_open_dev_time info = {0};
+	struct hl_device *hdev = hpriv->hdev;
+	u32 max_size = args->return_size;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	info.timestamp = ktime_to_ns(hdev->last_error.open_dev_timestamp);
+
+	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
+}
+
+static int cs_timeout_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	struct hl_info_cs_timeout_event info = {0};
+	struct hl_device *hdev = hpriv->hdev;
+	u32 max_size = args->return_size;
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	info.seq = hdev->last_error.cs_timeout_seq;
+	info.timestamp = ktime_to_ns(hdev->last_error.cs_timeout_timestamp);
+
+	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
+}
+
+static int razwi_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
+{
+	struct hl_device *hdev = hpriv->hdev;
+	u32 max_size = args->return_size;
+	struct hl_info_razwi_event info = {0};
+	void __user *out = (void __user *) (uintptr_t) args->return_pointer;
+
+	if ((!max_size) || (!out))
+		return -EINVAL;
+
+	info.timestamp = ktime_to_ns(hdev->last_error.razwi_timestamp);
+	info.addr = hdev->last_error.razwi_addr;
+	info.engine_id_1 = hdev->last_error.razwi_engine_id_1;
+	info.engine_id_2 = hdev->last_error.razwi_engine_id_2;
+	info.no_engine_id = hdev->last_error.razwi_non_engine_initiator;
+	info.error_type = hdev->last_error.razwi_type;
+
+	return copy_to_user(out, &info, min_t(size_t, max_size, sizeof(info))) ? -EFAULT : 0;
+}
+
 static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 				struct device *dev)
 {
@@ -503,6 +614,33 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_RESET_COUNT:
 		return get_reset_count(hdev, args);
 
+	case HL_INFO_HW_EVENTS:
+		return hw_events_info(hdev, false, args);
+
+	case HL_INFO_HW_EVENTS_AGGREGATE:
+		return hw_events_info(hdev, true, args);
+
+	case HL_INFO_CS_COUNTERS:
+		return cs_counters_info(hpriv, args);
+
+	case HL_INFO_CLK_THROTTLE_REASON:
+		return clk_throttle_info(hpriv, args);
+
+	case HL_INFO_SYNC_MANAGER:
+		return sync_manager_info(hpriv, args);
+
+	case HL_INFO_OPEN_STATS:
+		return open_stats_info(hpriv, args);
+
+	case HL_INFO_LAST_ERR_OPEN_DEV_TIME:
+		return last_err_open_dev_info(hpriv, args);
+
+	case HL_INFO_CS_TIMEOUT_EVENT:
+		return cs_timeout_info(hpriv, args);
+
+	case HL_INFO_RAZWI_EVENT:
+		return razwi_info(hpriv, args);
+
 	default:
 		break;
 	}
@@ -515,10 +653,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	}
 
 	switch (args->op) {
-	case HL_INFO_HW_EVENTS:
-		rc = hw_events_info(hdev, false, args);
-		break;
-
 	case HL_INFO_DRAM_USAGE:
 		rc = dram_usage_info(hpriv, args);
 		break;
@@ -531,10 +665,6 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 		rc = device_utilization(hdev, args);
 		break;
 
-	case HL_INFO_HW_EVENTS_AGGREGATE:
-		rc = hw_events_info(hdev, true, args);
-		break;
-
 	case HL_INFO_CLK_RATE:
 		rc = get_clk_rate(hdev, args);
 		break;
@@ -542,18 +672,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_TIME_SYNC:
 		return time_sync_info(hdev, args);
 
-	case HL_INFO_CS_COUNTERS:
-		return cs_counters_info(hpriv, args);
-
 	case HL_INFO_PCI_COUNTERS:
 		return pci_counters_info(hpriv, args);
 
-	case HL_INFO_CLK_THROTTLE_REASON:
-		return clk_throttle_info(hpriv, args);
-
-	case HL_INFO_SYNC_MANAGER:
-		return sync_manager_info(hpriv, args);
-
 	case HL_INFO_TOTAL_ENERGY:
 		return total_energy_consumption_info(hpriv, args);
 
@@ -563,12 +684,16 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
 	case HL_INFO_POWER:
 		return power_info(hpriv, args);
 
-	case HL_INFO_OPEN_STATS:
-		return open_stats_info(hpriv, args);
+
+	case HL_INFO_DRAM_REPLACED_ROWS:
+		return dram_replaced_rows_info(hpriv, args);
+
+	case HL_INFO_DRAM_PENDING_ROWS:
+		return dram_pending_rows_info(hpriv, args);
 
 	default:
 		dev_err(dev, "Invalid request %d\n", args->op);
-		rc = -ENOTTY;
+		rc = -EINVAL;
 		break;
 	}
 
@@ -613,16 +738,17 @@ static int hl_debug_ioctl(struct hl_fpriv *hpriv, void *data)
 				"Rejecting debug configuration request because device not in debug mode\n");
 			return -EFAULT;
 		}
-		args->input_size =
-			min(args->input_size, hl_debug_struct_size[args->op]);
-		rc = debug_coresight(hdev, args);
+		args->input_size = min(args->input_size, hl_debug_struct_size[args->op]);
+		rc = debug_coresight(hdev, hpriv->ctx, args);
 		break;
+
 	case HL_DEBUG_OP_SET_MODE:
-		rc = hl_device_set_debug_mode(hdev, (bool) args->enable);
+		rc = hl_device_set_debug_mode(hdev, hpriv->ctx, (bool) args->enable);
 		break;
+
 	default:
 		dev_err(hdev->dev, "Invalid request %d\n", args->op);
-		rc = -ENOTTY;
+		rc = -EINVAL;
 		break;
 	}
 
@@ -649,7 +775,6 @@ static long _hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg,
 		const struct hl_ioctl_desc *ioctl, struct device *dev)
 {
 	struct hl_fpriv *hpriv = filep->private_data;
-	struct hl_device *hdev = hpriv->hdev;
 	unsigned int nr = _IOC_NR(cmd);
 	char stack_kdata[128] = {0};
 	char *kdata = NULL;
@@ -658,12 +783,6 @@ static long _hl_ioctl(struct file *filep, unsigned int cmd, unsigned long arg,
 	u32 hl_size;
 	int retcode;
 
-	if (hdev->hard_reset_pending) {
-		dev_crit_ratelimited(dev,
-			"Device HARD reset pending! Please close FD\n");
-		return -ENODEV;
-	}
-
 	/* Do not trust userspace, use our own definition */
 	func = ioctl->func;
 
diff --git a/drivers/misc/habanalabs/common/hw_queue.c b/drivers/misc/habanalabs/common/hw_queue.c
index 0743319b10c7..6103e479e855 100644
--- a/drivers/misc/habanalabs/common/hw_queue.c
+++ b/drivers/misc/habanalabs/common/hw_queue.c
@@ -429,6 +429,9 @@ static int init_signal_cs(struct hl_device *hdev,
 	rc = hl_cs_signal_sob_wraparound_handler(hdev, q_idx, &hw_sob, 1,
 								false);
 
+	job->cs->sob_addr_offset = hw_sob->sob_addr;
+	job->cs->initial_sob_count = prop->next_sob_val - 1;
+
 	return rc;
 }
 
@@ -571,7 +574,7 @@ static int encaps_sig_first_staged_cs_handler
 	struct hl_encaps_signals_mgr *mgr;
 	int rc = 0;
 
-	mgr = &hdev->compute_ctx->sig_mgr;
+	mgr = &cs->ctx->sig_mgr;
 
 	spin_lock(&mgr->lock);
 	encaps_sig_hdl = idr_find(&mgr->handles, cs->encaps_sig_hdl_id);
diff --git a/drivers/misc/habanalabs/common/hwmon.c b/drivers/misc/habanalabs/common/hwmon.c
index e33f65be8a00..57f5d2c48330 100644
--- a/drivers/misc/habanalabs/common/hwmon.c
+++ b/drivers/misc/habanalabs/common/hwmon.c
@@ -10,17 +10,148 @@
 #include <linux/pci.h>
 #include <linux/hwmon.h>
 
-#define HWMON_NR_SENSOR_TYPES		(hwmon_pwm + 1)
+#define HWMON_NR_SENSOR_TYPES		(hwmon_max)
 
-int hl_build_hwmon_channel_info(struct hl_device *hdev,
-				struct cpucp_sensor *sensors_arr)
+#ifdef _HAS_HWMON_HWMON_T_ENABLE
+
+static u32 fixup_flags_legacy_fw(struct hl_device *hdev, enum hwmon_sensor_types type,
+					u32 cpucp_flags)
 {
-	u32 counts[HWMON_NR_SENSOR_TYPES] = {0};
-	u32 *sensors_by_type[HWMON_NR_SENSOR_TYPES] = {NULL};
+	u32 flags;
+
+	switch (type) {
+	case hwmon_temp:
+		flags = (cpucp_flags << 1) | HWMON_T_ENABLE;
+		break;
+
+	case hwmon_in:
+		flags = (cpucp_flags << 1) | HWMON_I_ENABLE;
+		break;
+
+	case hwmon_curr:
+		flags = (cpucp_flags << 1) | HWMON_C_ENABLE;
+		break;
+
+	case hwmon_fan:
+		flags = (cpucp_flags << 1) | HWMON_F_ENABLE;
+		break;
+
+	case hwmon_power:
+		flags = (cpucp_flags << 1) | HWMON_P_ENABLE;
+		break;
+
+	case hwmon_pwm:
+		/* enable bit was here from day 1, so no need to adjust */
+		flags = cpucp_flags;
+		break;
+
+	default:
+		dev_err(hdev->dev, "unsupported h/w sensor type %d\n", type);
+		flags = cpucp_flags;
+		break;
+	}
+
+	return flags;
+}
+
+static u32 fixup_attr_legacy_fw(u32 attr)
+{
+	return (attr - 1);
+}
+
+#else
+
+static u32 fixup_flags_legacy_fw(struct hl_device *hdev, enum hwmon_sensor_types type,
+						u32 cpucp_flags)
+{
+	return cpucp_flags;
+}
+
+static u32 fixup_attr_legacy_fw(u32 attr)
+{
+	return attr;
+}
+
+#endif /* !_HAS_HWMON_HWMON_T_ENABLE */
+
+static u32 adjust_hwmon_flags(struct hl_device *hdev, enum hwmon_sensor_types type, u32 cpucp_flags)
+{
+	u32 flags, cpucp_input_val;
+	bool use_cpucp_enum;
+
+	use_cpucp_enum = (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 &
+					CPU_BOOT_DEV_STS0_MAP_HWMON_EN) ? true : false;
+
+	/* If f/w is using it's own enum, we need to check if the properties values are aligned.
+	 * If not, it means we need to adjust the values to the new format that is used in the
+	 * kernel since 5.6 (enum values were incremented by 1 by adding a new enable value).
+	 */
+	if (use_cpucp_enum) {
+		switch (type) {
+		case hwmon_temp:
+			cpucp_input_val = cpucp_temp_input;
+			if (cpucp_input_val == hwmon_temp_input)
+				flags = cpucp_flags;
+			else
+				flags = (cpucp_flags << 1) | HWMON_T_ENABLE;
+			break;
+
+		case hwmon_in:
+			cpucp_input_val = cpucp_in_input;
+			if (cpucp_input_val == hwmon_in_input)
+				flags = cpucp_flags;
+			else
+				flags = (cpucp_flags << 1) | HWMON_I_ENABLE;
+			break;
+
+		case hwmon_curr:
+			cpucp_input_val = cpucp_curr_input;
+			if (cpucp_input_val == hwmon_curr_input)
+				flags = cpucp_flags;
+			else
+				flags = (cpucp_flags << 1) | HWMON_C_ENABLE;
+			break;
+
+		case hwmon_fan:
+			cpucp_input_val = cpucp_fan_input;
+			if (cpucp_input_val == hwmon_fan_input)
+				flags = cpucp_flags;
+			else
+				flags = (cpucp_flags << 1) | HWMON_F_ENABLE;
+			break;
+
+		case hwmon_pwm:
+			/* enable bit was here from day 1, so no need to adjust */
+			flags = cpucp_flags;
+			break;
+
+		case hwmon_power:
+			cpucp_input_val = CPUCP_POWER_INPUT;
+			if (cpucp_input_val == hwmon_power_input)
+				flags = cpucp_flags;
+			else
+				flags = (cpucp_flags << 1) | HWMON_P_ENABLE;
+			break;
+
+		default:
+			dev_err(hdev->dev, "unsupported h/w sensor type %d\n", type);
+			flags = cpucp_flags;
+			break;
+		}
+	} else {
+		flags = fixup_flags_legacy_fw(hdev, type, cpucp_flags);
+	}
+
+	return flags;
+}
+
+int hl_build_hwmon_channel_info(struct hl_device *hdev, struct cpucp_sensor *sensors_arr)
+{
+	u32 num_sensors_for_type, flags, num_active_sensor_types = 0, arr_size = 0, *curr_arr;
 	u32 sensors_by_type_next_index[HWMON_NR_SENSOR_TYPES] = {0};
+	u32 *sensors_by_type[HWMON_NR_SENSOR_TYPES] = {NULL};
 	struct hwmon_channel_info **channels_info;
-	u32 num_sensors_for_type, num_active_sensor_types = 0,
-			arr_size = 0, *curr_arr;
+	u32 counts[HWMON_NR_SENSOR_TYPES] = {0};
 	enum hwmon_sensor_types type;
 	int rc, i, j;
 
@@ -31,8 +162,7 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev,
 			break;
 
 		if (type >= HWMON_NR_SENSOR_TYPES) {
-			dev_err(hdev->dev,
-				"Got wrong sensor type %d from device\n", type);
+			dev_err(hdev->dev, "Got wrong sensor type %d from device\n", type);
 			return -EINVAL;
 		}
 
@@ -45,8 +175,9 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev,
 			continue;
 
 		num_sensors_for_type = counts[i] + 1;
-		curr_arr = kcalloc(num_sensors_for_type, sizeof(*curr_arr),
-				GFP_KERNEL);
+		dev_dbg(hdev->dev, "num_sensors_for_type %d = %d\n", i, num_sensors_for_type);
+
+		curr_arr = kcalloc(num_sensors_for_type, sizeof(*curr_arr), GFP_KERNEL);
 		if (!curr_arr) {
 			rc = -ENOMEM;
 			goto sensors_type_err;
@@ -59,20 +190,18 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev,
 	for (i = 0 ; i < arr_size ; i++) {
 		type = le32_to_cpu(sensors_arr[i].type);
 		curr_arr = sensors_by_type[type];
-		curr_arr[sensors_by_type_next_index[type]++] =
-				le32_to_cpu(sensors_arr[i].flags);
+		flags = adjust_hwmon_flags(hdev, type, le32_to_cpu(sensors_arr[i].flags));
+		curr_arr[sensors_by_type_next_index[type]++] = flags;
 	}
 
-	channels_info = kcalloc(num_active_sensor_types + 1,
-			sizeof(*channels_info), GFP_KERNEL);
+	channels_info = kcalloc(num_active_sensor_types + 1, sizeof(*channels_info), GFP_KERNEL);
 	if (!channels_info) {
 		rc = -ENOMEM;
 		goto channels_info_array_err;
 	}
 
 	for (i = 0 ; i < num_active_sensor_types ; i++) {
-		channels_info[i] = kzalloc(sizeof(*channels_info[i]),
-				GFP_KERNEL);
+		channels_info[i] = kzalloc(sizeof(*channels_info[i]), GFP_KERNEL);
 		if (!channels_info[i]) {
 			rc = -ENOMEM;
 			goto channel_info_err;
@@ -88,18 +217,19 @@ int hl_build_hwmon_channel_info(struct hl_device *hdev,
 		j++;
 	}
 
-	hdev->hl_chip_info->info =
-			(const struct hwmon_channel_info **)channels_info;
+	hdev->hl_chip_info->info = (const struct hwmon_channel_info **)channels_info;
 
 	return 0;
 
 channel_info_err:
-	for (i = 0 ; i < num_active_sensor_types ; i++)
+	for (i = 0 ; i < num_active_sensor_types ; i++) {
 		if (channels_info[i]) {
 			kfree(channels_info[i]->config);
 			kfree(channels_info[i]);
 		}
+	}
 	kfree(channels_info);
+
 channels_info_array_err:
 sensors_type_err:
 	for (i = 0 ; i < HWMON_NR_SENSOR_TYPES ; i++)
@@ -112,14 +242,16 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 			u32 attr, int channel, long *val)
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
-	int rc;
+	bool use_cpucp_enum;
 	u32 cpucp_attr;
-	bool use_cpucp_enum = (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 &
-				CPU_BOOT_DEV_STS0_MAP_HWMON_EN) ? true : false;
+	int rc;
 
 	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
+	use_cpucp_enum = (hdev->asic_prop.fw_app_cpu_boot_dev_sts0 &
+					CPU_BOOT_DEV_STS0_MAP_HWMON_EN) ? true : false;
+
 	switch (type) {
 	case hwmon_temp:
 		switch (attr) {
@@ -151,7 +283,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		if (use_cpucp_enum)
 			rc = hl_get_temperature(hdev, channel, cpucp_attr, val);
 		else
-			rc = hl_get_temperature(hdev, channel, attr, val);
+			rc = hl_get_temperature(hdev, channel, fixup_attr_legacy_fw(attr), val);
 		break;
 	case hwmon_in:
 		switch (attr) {
@@ -174,7 +306,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		if (use_cpucp_enum)
 			rc = hl_get_voltage(hdev, channel, cpucp_attr, val);
 		else
-			rc = hl_get_voltage(hdev, channel, attr, val);
+			rc = hl_get_voltage(hdev, channel, fixup_attr_legacy_fw(attr), val);
 		break;
 	case hwmon_curr:
 		switch (attr) {
@@ -197,7 +329,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		if (use_cpucp_enum)
 			rc = hl_get_current(hdev, channel, cpucp_attr, val);
 		else
-			rc = hl_get_current(hdev, channel, attr, val);
+			rc = hl_get_current(hdev, channel, fixup_attr_legacy_fw(attr), val);
 		break;
 	case hwmon_fan:
 		switch (attr) {
@@ -217,7 +349,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		if (use_cpucp_enum)
 			rc = hl_get_fan_speed(hdev, channel, cpucp_attr, val);
 		else
-			rc = hl_get_fan_speed(hdev, channel, attr, val);
+			rc = hl_get_fan_speed(hdev, channel, fixup_attr_legacy_fw(attr), val);
 		break;
 	case hwmon_pwm:
 		switch (attr) {
@@ -234,6 +366,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		if (use_cpucp_enum)
 			rc = hl_get_pwm_info(hdev, channel, cpucp_attr, val);
 		else
+			/* no need for fixup as pwm was aligned from day 1 */
 			rc = hl_get_pwm_info(hdev, channel, attr, val);
 		break;
 	case hwmon_power:
@@ -251,7 +384,7 @@ static int hl_read(struct device *dev, enum hwmon_sensor_types type,
 		if (use_cpucp_enum)
 			rc = hl_get_power(hdev, channel, cpucp_attr, val);
 		else
-			rc = hl_get_power(hdev, channel, attr, val);
+			rc = hl_get_power(hdev, channel, fixup_attr_legacy_fw(attr), val);
 		break;
 	default:
 		return -EINVAL;
@@ -286,7 +419,7 @@ static int hl_write(struct device *dev, enum hwmon_sensor_types type,
 		if (use_cpucp_enum)
 			hl_set_temperature(hdev, channel, cpucp_attr, val);
 		else
-			hl_set_temperature(hdev, channel, attr, val);
+			hl_set_temperature(hdev, channel, fixup_attr_legacy_fw(attr), val);
 		break;
 	case hwmon_pwm:
 		switch (attr) {
@@ -303,6 +436,7 @@ static int hl_write(struct device *dev, enum hwmon_sensor_types type,
 		if (use_cpucp_enum)
 			hl_set_pwm_info(hdev, channel, cpucp_attr, val);
 		else
+			/* no need for fixup as pwm was aligned from day 1 */
 			hl_set_pwm_info(hdev, channel, attr, val);
 		break;
 	case hwmon_in:
@@ -317,7 +451,7 @@ static int hl_write(struct device *dev, enum hwmon_sensor_types type,
 		if (use_cpucp_enum)
 			hl_set_voltage(hdev, channel, cpucp_attr, val);
 		else
-			hl_set_voltage(hdev, channel, attr, val);
+			hl_set_voltage(hdev, channel, fixup_attr_legacy_fw(attr), val);
 		break;
 	case hwmon_curr:
 		switch (attr) {
@@ -331,7 +465,7 @@ static int hl_write(struct device *dev, enum hwmon_sensor_types type,
 		if (use_cpucp_enum)
 			hl_set_current(hdev, channel, cpucp_attr, val);
 		else
-			hl_set_current(hdev, channel, attr, val);
+			hl_set_current(hdev, channel, fixup_attr_legacy_fw(attr), val);
 		break;
 	case hwmon_power:
 		switch (attr) {
@@ -345,7 +479,7 @@ static int hl_write(struct device *dev, enum hwmon_sensor_types type,
 		if (use_cpucp_enum)
 			hl_set_power(hdev, channel, cpucp_attr, val);
 		else
-			hl_set_power(hdev, channel, attr, val);
+			hl_set_power(hdev, channel, fixup_attr_legacy_fw(attr), val);
 		break;
 	default:
 		return -EINVAL;
@@ -444,6 +578,9 @@ int hl_get_temperature(struct hl_device *hdev,
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
 
+	dev_dbg(hdev->dev, "get temp, ctl 0x%x, sensor %d, type %d\n",
+		pkt.ctl, pkt.sensor_index, pkt.type);
+
 	rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
 						0, &result);
 
@@ -677,12 +814,18 @@ int hl_set_power(struct hl_device *hdev,
 			int sensor_index, u32 attr, long value)
 {
 	struct cpucp_packet pkt;
+	struct asic_fixed_properties *prop = &hdev->asic_prop;
 	int rc;
 
 	memset(&pkt, 0, sizeof(pkt));
 
-	pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_GET <<
+	if (prop->use_get_power_for_reset_history)
+		pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_GET <<
 				CPUCP_PKT_CTL_OPCODE_SHIFT);
+	else
+		pkt.ctl = cpu_to_le32(CPUCP_PACKET_POWER_SET <<
+				CPUCP_PKT_CTL_OPCODE_SHIFT);
+
 	pkt.sensor_index = __cpu_to_le16(sensor_index);
 	pkt.type = __cpu_to_le16(attr);
 	pkt.value = __cpu_to_le64(value);
diff --git a/drivers/misc/habanalabs/common/irq.c b/drivers/misc/habanalabs/common/irq.c
index 96d82b682674..1b6bdc900c26 100644
--- a/drivers/misc/habanalabs/common/irq.c
+++ b/drivers/misc/habanalabs/common/irq.c
@@ -145,8 +145,12 @@ static void handle_user_cq(struct hl_device *hdev,
 
 	spin_lock(&user_cq->wait_list_lock);
 	list_for_each_entry(pend, &user_cq->wait_list_head, wait_list_node) {
-		pend->fence.timestamp = now;
-		complete_all(&pend->fence.completion);
+		if ((pend->cq_kernel_addr &&
+				*(pend->cq_kernel_addr) >= pend->cq_target_value) ||
+				!pend->cq_kernel_addr) {
+			pend->fence.timestamp = now;
+			complete_all(&pend->fence.completion);
+		}
 	}
 	spin_unlock(&user_cq->wait_list_lock);
 }
@@ -245,10 +249,8 @@ irqreturn_t hl_irq_handler_eq(int irq, void *arg)
 		 */
 		dma_rmb();
 
-		if (hdev->disabled) {
-			dev_warn(hdev->dev,
-				"Device disabled but received IRQ %d for EQ\n",
-					irq);
+		if (hdev->disabled && !hdev->reset_info.is_in_soft_reset) {
+			dev_warn(hdev->dev, "Device disabled but received an EQ event\n");
 			goto skip_irq;
 		}
 
diff --git a/drivers/misc/habanalabs/common/memory.c b/drivers/misc/habanalabs/common/memory.c
index 9bd626a00de3..c1eefaebacb6 100644
--- a/drivers/misc/habanalabs/common/memory.c
+++ b/drivers/misc/habanalabs/common/memory.c
@@ -316,7 +316,7 @@ static int free_phys_pg_pack(struct hl_device *hdev,
 	}
 
 	if (rc && !hdev->disabled)
-		hl_device_reset(hdev, HL_RESET_HARD);
+		hl_device_reset(hdev, HL_DRV_RESET_HARD);
 
 end:
 	kvfree(phys_pg_pack->pages);
@@ -477,7 +477,7 @@ static int add_va_block_locked(struct hl_device *hdev,
 		struct list_head *va_list, u64 start, u64 end)
 {
 	struct hl_vm_va_block *va_block, *res = NULL;
-	u64 size = end - start;
+	u64 size = end - start + 1;
 
 	print_va_list_locked(hdev, va_list);
 
@@ -518,7 +518,7 @@ static int add_va_block_locked(struct hl_device *hdev,
 /**
  * add_va_block() - wrapper for add_va_block_locked.
  * @hdev: pointer to the habanalabs device structure.
- * @va_list: pointer to the virtual addresses block list.
+ * @va_range: pointer to the virtual addresses range object.
  * @start: start virtual address.
  * @end: end virtual address.
  *
@@ -538,8 +538,11 @@ static inline int add_va_block(struct hl_device *hdev,
 }
 
 /**
- * is_hint_crossing_range() - check if hint address crossing specified reserved
- * range.
+ * is_hint_crossing_range() - check if hint address crossing specified reserved.
+ * @range_type: virtual space range type.
+ * @start_addr: start virtual address.
+ * @size: block size.
+ * @prop: asic properties structure to retrieve reserved ranges from.
  */
 static inline bool is_hint_crossing_range(enum hl_va_range_type range_type,
 		u64 start_addr, u32 size, struct asic_fixed_properties *prop) {
@@ -644,7 +647,7 @@ static u64 get_va_block(struct hl_device *hdev,
 				continue;
 		}
 
-		valid_size = va_block->end - valid_start;
+		valid_size = va_block->end - valid_start + 1;
 		if (valid_size < size)
 			continue;
 
@@ -707,7 +710,7 @@ static u64 get_va_block(struct hl_device *hdev,
 
 	if (new_va_block->size > size) {
 		new_va_block->start += size;
-		new_va_block->size = new_va_block->end - new_va_block->start;
+		new_va_block->size = new_va_block->end - new_va_block->start + 1;
 	} else {
 		list_del(&new_va_block->node);
 		kfree(new_va_block);
@@ -749,6 +752,7 @@ u64 hl_reserve_va_block(struct hl_device *hdev, struct hl_ctx *ctx,
 
 /**
  * hl_get_va_range_type() - get va_range type for the given address and size.
+ * @ctx: context to fetch va_range from.
  * @address: the start address of the area we want to validate.
  * @size: the size in bytes of the area we want to validate.
  * @type: returned va_range type.
@@ -776,8 +780,8 @@ static int hl_get_va_range_type(struct hl_ctx *ctx, u64 address, u64 size,
  * hl_unreserve_va_block() - wrapper for add_va_block to unreserve a va block.
  * @hdev: pointer to the habanalabs device structure
  * @ctx: pointer to the context structure.
- * @start: start virtual address.
- * @end: end virtual address.
+ * @start_addr: start virtual address.
+ * @size: number of bytes to unreserve.
  *
  * This function does the following:
  * - Takes the list lock and calls add_va_block_locked.
@@ -1201,17 +1205,13 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 		goto map_err;
 	}
 
-	rc = hdev->asic_funcs->mmu_invalidate_cache_range(hdev, false,
-		*vm_type, ctx->asid, ret_vaddr, phys_pg_pack->total_size);
+	rc = hl_mmu_invalidate_cache_range(hdev, false, *vm_type | MMU_OP_SKIP_LOW_CACHE_INV,
+				ctx->asid, ret_vaddr, phys_pg_pack->total_size);
 
 	mutex_unlock(&ctx->mmu_lock);
 
-	if (rc) {
-		dev_err(hdev->dev,
-			"mapping handle %u failed due to MMU cache invalidation\n",
-			handle);
+	if (rc)
 		goto map_err;
-	}
 
 	ret_vaddr += phys_pg_pack->offset;
 
@@ -1349,9 +1349,8 @@ static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 	 * at the loop end rather than for each iteration
 	 */
 	if (!ctx_free)
-		rc = hdev->asic_funcs->mmu_invalidate_cache_range(hdev, true,
-				*vm_type, ctx->asid, vaddr,
-				phys_pg_pack->total_size);
+		rc = hl_mmu_invalidate_cache_range(hdev, true, *vm_type, ctx->asid, vaddr,
+							phys_pg_pack->total_size);
 
 	mutex_unlock(&ctx->mmu_lock);
 
@@ -1364,11 +1363,6 @@ static int unmap_device_va(struct hl_ctx *ctx, struct hl_mem_in *args,
 	if (!ctx_free) {
 		int tmp_rc;
 
-		if (rc)
-			dev_err(hdev->dev,
-				"unmapping vaddr 0x%llx failed due to MMU cache invalidation\n",
-				vaddr);
-
 		tmp_rc = add_va_block(hdev, va_range, vaddr,
 					vaddr + phys_pg_pack->total_size - 1);
 		if (tmp_rc) {
@@ -2037,7 +2031,7 @@ static int mem_ioctl_no_mmu(struct hl_fpriv *hpriv, union hl_mem_args *args)
 
 	default:
 		dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
-		rc = -ENOTTY;
+		rc = -EINVAL;
 		break;
 	}
 
@@ -2162,7 +2156,7 @@ int hl_mem_ioctl(struct hl_fpriv *hpriv, void *data)
 
 	default:
 		dev_err(hdev->dev, "Unknown opcode for memory IOCTL\n");
-		rc = -ENOTTY;
+		rc = -EINVAL;
 		break;
 	}
 
@@ -2339,6 +2333,8 @@ void hl_userptr_delete_list(struct hl_device *hdev,
 /**
  * hl_userptr_is_pinned() - returns whether the given userptr is pinned.
  * @hdev: pointer to the habanalabs device structure.
+ * @addr: user address to check.
+ * @size: user block size to check.
  * @userptr_list: pointer to the list to clear.
  * @userptr: pointer to userptr to check.
  *
@@ -2361,9 +2357,10 @@ bool hl_userptr_is_pinned(struct hl_device *hdev, u64 addr,
 /**
  * va_range_init() - initialize virtual addresses range.
  * @hdev: pointer to the habanalabs device structure.
- * @va_range: pointer to the range to initialize.
+ * @va_ranges: pointer to va_ranges array.
  * @start: range start address.
  * @end: range end address.
+ * @page_size: page size for this va_range.
  *
  * This function does the following:
  * - Initializes the virtual addresses list of the given range with the given
@@ -2388,8 +2385,14 @@ static int va_range_init(struct hl_device *hdev, struct hl_va_range *va_range,
 			start += PAGE_SIZE;
 		}
 
-		if (end & (PAGE_SIZE - 1))
-			end &= PAGE_MASK;
+		/*
+		 * The end of the range is inclusive, hence we need to align it
+		 * to the end of the last full page in the range. For example if
+		 * end = 0x3ff5 with page size 0x1000, we need to align it to
+		 * 0x2fff. The remainig 0xff5 bytes do not form a full page.
+		 */
+		if ((end + 1) & (PAGE_SIZE - 1))
+			end = ((end + 1) & PAGE_MASK) - 1;
 	}
 
 	if (start >= end) {
@@ -2414,7 +2417,7 @@ static int va_range_init(struct hl_device *hdev, struct hl_va_range *va_range,
 /**
  * va_range_fini() - clear a virtual addresses range.
  * @hdev: pointer to the habanalabs structure.
- * va_range: pointer to virtual addresses rang.e
+ * @va_range: pointer to virtual addresses range.
  *
  * This function does the following:
  * - Frees the virtual addresses block list and its lock.
@@ -2434,12 +2437,15 @@ static void va_range_fini(struct hl_device *hdev, struct hl_va_range *va_range)
  * @ctx: pointer to the habanalabs context structure.
  * @host_range_start: host virtual addresses range start.
  * @host_range_end: host virtual addresses range end.
+ * @host_page_size: host page size.
  * @host_huge_range_start: host virtual addresses range start for memory
  *                         allocated with huge pages.
  * @host_huge_range_end: host virtual addresses range end for memory allocated
  *                        with huge pages.
+ * @host_huge_page_size: host huge page size.
  * @dram_range_start: dram virtual addresses range start.
  * @dram_range_end: dram virtual addresses range end.
+ * @dram_page_size: dram page size.
  *
  * This function initializes the following:
  * - MMU for context.
@@ -2564,14 +2570,14 @@ int hl_vm_ctx_init(struct hl_ctx *ctx)
 		return 0;
 
 	dram_range_start = prop->dmmu.start_addr;
-	dram_range_end = prop->dmmu.end_addr;
+	dram_range_end = prop->dmmu.end_addr - 1;
 	dram_page_size = prop->dram_page_size ?
 				prop->dram_page_size : prop->dmmu.page_size;
 	host_range_start = prop->pmmu.start_addr;
-	host_range_end = prop->pmmu.end_addr;
+	host_range_end = prop->pmmu.end_addr - 1;
 	host_page_size = prop->pmmu.page_size;
 	host_huge_range_start = prop->pmmu_huge.start_addr;
-	host_huge_range_end = prop->pmmu_huge.end_addr;
+	host_huge_range_end = prop->pmmu_huge.end_addr - 1;
 	host_huge_page_size = prop->pmmu_huge.page_size;
 
 	return vm_ctx_init_with_ranges(ctx, host_range_start, host_range_end,
@@ -2618,7 +2624,7 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx)
 	 * Clearly something went wrong on hard reset so no point in printing
 	 * another side effect error
 	 */
-	if (!hdev->hard_reset_pending && !hash_empty(ctx->mem_hash))
+	if (!hdev->reset_info.hard_reset_pending && !hash_empty(ctx->mem_hash))
 		dev_dbg(hdev->dev,
 			"user released device without removing its memory mappings\n");
 
@@ -2633,8 +2639,8 @@ void hl_vm_ctx_fini(struct hl_ctx *ctx)
 	mutex_lock(&ctx->mmu_lock);
 
 	/* invalidate the cache once after the unmapping loop */
-	hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
-	hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_PHYS_PACK);
+	hl_mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR);
+	hl_mmu_invalidate_cache(hdev, true, MMU_OP_PHYS_PACK);
 
 	mutex_unlock(&ctx->mmu_lock);
 
diff --git a/drivers/misc/habanalabs/common/mmu/mmu.c b/drivers/misc/habanalabs/common/mmu/mmu.c
index aa96917f62e5..9153a1f55175 100644
--- a/drivers/misc/habanalabs/common/mmu/mmu.c
+++ b/drivers/misc/habanalabs/common/mmu/mmu.c
@@ -637,3 +637,28 @@ u64 hl_mmu_descramble_addr(struct hl_device *hdev, u64 addr)
 {
 	return addr;
 }
+
+int hl_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard, u32 flags)
+{
+	int rc;
+
+	rc = hdev->asic_funcs->mmu_invalidate_cache(hdev, is_hard, flags);
+	if (rc)
+		dev_err_ratelimited(hdev->dev, "MMU cache invalidation failed\n");
+
+	return rc;
+}
+
+int hl_mmu_invalidate_cache_range(struct hl_device *hdev, bool is_hard,
+					u32 flags, u32 asid, u64 va, u64 size)
+{
+	int rc;
+
+	rc = hdev->asic_funcs->mmu_invalidate_cache_range(hdev, is_hard, flags,
+								asid, va, size);
+	if (rc)
+		dev_err_ratelimited(hdev->dev, "MMU cache range invalidation failed\n");
+
+	return rc;
+}
+
diff --git a/drivers/misc/habanalabs/common/mmu/mmu_v1.c b/drivers/misc/habanalabs/common/mmu/mmu_v1.c
index 0f536f79dd9c..6134b6ae7615 100644
--- a/drivers/misc/habanalabs/common/mmu/mmu_v1.c
+++ b/drivers/misc/habanalabs/common/mmu/mmu_v1.c
@@ -269,7 +269,7 @@ static int dram_default_mapping_init(struct hl_ctx *ctx)
 
 	num_of_hop3 = prop->dram_size_for_default_page_mapping;
 	do_div(num_of_hop3, prop->dram_page_size);
-	do_div(num_of_hop3, PTE_ENTRIES_IN_HOP);
+	do_div(num_of_hop3, HOP_PTE_ENTRIES_512);
 
 	/* add hop1 and hop2 */
 	total_hops = num_of_hop3 + 2;
@@ -330,7 +330,7 @@ static int dram_default_mapping_init(struct hl_ctx *ctx)
 
 	for (i = 0 ; i < num_of_hop3 ; i++) {
 		hop3_pte_addr = ctx->dram_default_hops[i];
-		for (j = 0 ; j < PTE_ENTRIES_IN_HOP ; j++) {
+		for (j = 0 ; j < HOP_PTE_ENTRIES_512 ; j++) {
 			write_final_pte(ctx, hop3_pte_addr, pte_val);
 			get_pte(ctx, ctx->dram_default_hops[i]);
 			hop3_pte_addr += HL_PTE_SIZE;
@@ -369,7 +369,7 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx)
 
 	num_of_hop3 = prop->dram_size_for_default_page_mapping;
 	do_div(num_of_hop3, prop->dram_page_size);
-	do_div(num_of_hop3, PTE_ENTRIES_IN_HOP);
+	do_div(num_of_hop3, HOP_PTE_ENTRIES_512);
 
 	hop0_addr = get_hop0_addr(ctx);
 	/* add hop1 and hop2 */
@@ -379,7 +379,7 @@ static void dram_default_mapping_fini(struct hl_ctx *ctx)
 
 	for (i = 0 ; i < num_of_hop3 ; i++) {
 		hop3_pte_addr = ctx->dram_default_hops[i];
-		for (j = 0 ; j < PTE_ENTRIES_IN_HOP ; j++) {
+		for (j = 0 ; j < HOP_PTE_ENTRIES_512 ; j++) {
 			clear_pte(ctx, hop3_pte_addr);
 			put_pte(ctx, ctx->dram_default_hops[i]);
 			hop3_pte_addr += HL_PTE_SIZE;
@@ -573,7 +573,7 @@ static int _hl_mmu_v1_unmap(struct hl_ctx *ctx,
 
 	curr_pte = *(u64 *) (uintptr_t) hop3_pte_addr;
 
-	is_huge = curr_pte & LAST_MASK;
+	is_huge = curr_pte & mmu_prop->last_mask;
 
 	if (is_dram_addr && !is_huge) {
 		dev_err(hdev->dev,
@@ -597,7 +597,7 @@ static int _hl_mmu_v1_unmap(struct hl_ctx *ctx,
 
 	if (hdev->dram_default_page_mapping && is_dram_addr) {
 		u64 default_pte = (prop->mmu_dram_default_page_addr &
-				HOP_PHYS_ADDR_MASK) | LAST_MASK |
+				HOP_PHYS_ADDR_MASK) | mmu_prop->last_mask |
 					PAGE_PRESENT_MASK;
 		if (curr_pte == default_pte) {
 			dev_err(hdev->dev,
@@ -729,7 +729,7 @@ static int _hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 
 	if (hdev->dram_default_page_mapping && is_dram_addr) {
 		u64 default_pte = (prop->mmu_dram_default_page_addr &
-					HOP_PHYS_ADDR_MASK) | LAST_MASK |
+					HOP_PHYS_ADDR_MASK) | mmu_prop->last_mask |
 						PAGE_PRESENT_MASK;
 
 		if (curr_pte != default_pte) {
@@ -769,7 +769,7 @@ static int _hl_mmu_v1_map(struct hl_ctx *ctx, u64 virt_addr, u64 phys_addr,
 		goto err;
 	}
 
-	curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | LAST_MASK
+	curr_pte = (phys_addr & HOP_PHYS_ADDR_MASK) | mmu_prop->last_mask
 			| PAGE_PRESENT_MASK;
 
 	if (is_huge)
@@ -930,7 +930,7 @@ static int hl_mmu_v1_get_tlb_info(struct hl_ctx *ctx, u64 virt_addr,
 		if (!(hops->hop_info[i].hop_pte_val & PAGE_PRESENT_MASK))
 			return -EFAULT;
 
-		if (hops->hop_info[i].hop_pte_val & LAST_MASK)
+		if (hops->hop_info[i].hop_pte_val & mmu_prop->last_mask)
 			break;
 	}
 
diff --git a/drivers/misc/habanalabs/common/sysfs.c b/drivers/misc/habanalabs/common/sysfs.c
index 42c1769ad25d..45c715325e2a 100644
--- a/drivers/misc/habanalabs/common/sysfs.c
+++ b/drivers/misc/habanalabs/common/sysfs.c
@@ -139,7 +139,7 @@ static ssize_t cpld_ver_show(struct device *dev, struct device_attribute *attr,
 	struct hl_device *hdev = dev_get_drvdata(dev);
 
 	return sprintf(buf, "0x%08x\n",
-			hdev->asic_prop.cpucp_info.cpld_version);
+			le32_to_cpu(hdev->asic_prop.cpucp_info.cpld_version));
 }
 
 static ssize_t cpucp_kernel_ver_show(struct device *dev,
@@ -163,8 +163,13 @@ static ssize_t infineon_ver_show(struct device *dev,
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
 
-	return sprintf(buf, "0x%04x\n",
-			hdev->asic_prop.cpucp_info.infineon_version);
+	if (hdev->asic_prop.cpucp_info.infineon_second_stage_version)
+		return sprintf(buf, "%#04x %#04x\n",
+			le32_to_cpu(hdev->asic_prop.cpucp_info.infineon_version),
+			le32_to_cpu(hdev->asic_prop.cpucp_info.infineon_second_stage_version));
+	else
+		return sprintf(buf, "%#04x\n",
+			le32_to_cpu(hdev->asic_prop.cpucp_info.infineon_version));
 }
 
 static ssize_t fuse_ver_show(struct device *dev, struct device_attribute *attr,
@@ -206,7 +211,7 @@ static ssize_t soft_reset_store(struct device *dev,
 		goto out;
 	}
 
-	if (!hdev->allow_inference_soft_reset) {
+	if (!hdev->asic_prop.allow_inference_soft_reset) {
 		dev_err(hdev->dev, "Device does not support inference soft-reset\n");
 		goto out;
 	}
@@ -236,7 +241,7 @@ static ssize_t hard_reset_store(struct device *dev,
 
 	dev_warn(hdev->dev, "Hard-Reset requested through sysfs\n");
 
-	hl_device_reset(hdev, HL_RESET_HARD);
+	hl_device_reset(hdev, HL_DRV_RESET_HARD);
 
 out:
 	return count;
@@ -298,7 +303,7 @@ static ssize_t soft_reset_cnt_show(struct device *dev,
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
 
-	return sprintf(buf, "%d\n", hdev->soft_reset_cnt);
+	return sprintf(buf, "%d\n", hdev->reset_info.soft_reset_cnt);
 }
 
 static ssize_t hard_reset_cnt_show(struct device *dev,
@@ -306,7 +311,7 @@ static ssize_t hard_reset_cnt_show(struct device *dev,
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
 
-	return sprintf(buf, "%d\n", hdev->hard_reset_cnt);
+	return sprintf(buf, "%d\n", hdev->reset_info.hard_reset_cnt);
 }
 
 static ssize_t max_power_show(struct device *dev, struct device_attribute *attr,
@@ -419,8 +424,6 @@ static struct attribute *hl_dev_attrs[] = {
 	&dev_attr_max_power.attr,
 	&dev_attr_pci_addr.attr,
 	&dev_attr_preboot_btl_ver.attr,
-	&dev_attr_soft_reset.attr,
-	&dev_attr_soft_reset_cnt.attr,
 	&dev_attr_status.attr,
 	&dev_attr_thermal_ver.attr,
 	&dev_attr_uboot_ver.attr,
@@ -445,15 +448,25 @@ static const struct attribute_group *hl_dev_attr_groups[] = {
 	NULL,
 };
 
+static struct attribute *hl_dev_inference_attrs[] = {
+	&dev_attr_soft_reset.attr,
+	&dev_attr_soft_reset_cnt.attr,
+	NULL,
+};
+
+static struct attribute_group hl_dev_inference_attr_group = {
+	.attrs = hl_dev_inference_attrs,
+};
+
+static const struct attribute_group *hl_dev_inference_attr_groups[] = {
+	&hl_dev_inference_attr_group,
+	NULL,
+};
+
 int hl_sysfs_init(struct hl_device *hdev)
 {
 	int rc;
 
-	if (hdev->asic_type == ASIC_GOYA)
-		hdev->pm_mng_profile = PM_AUTO;
-	else
-		hdev->pm_mng_profile = PM_MANUAL;
-
 	hdev->max_power = hdev->asic_prop.max_power_default;
 
 	hdev->asic_funcs->add_device_attr(hdev, &hl_dev_clks_attr_group);
@@ -465,10 +478,25 @@ int hl_sysfs_init(struct hl_device *hdev)
 		return rc;
 	}
 
+	if (!hdev->asic_prop.allow_inference_soft_reset)
+		return 0;
+
+	rc = device_add_groups(hdev->dev, hl_dev_inference_attr_groups);
+	if (rc) {
+		dev_err(hdev->dev,
+			"Failed to add groups to device, error %d\n", rc);
+		return rc;
+	}
+
 	return 0;
 }
 
 void hl_sysfs_fini(struct hl_device *hdev)
 {
 	device_remove_groups(hdev->dev, hl_dev_attr_groups);
+
+	if (!hdev->asic_prop.allow_inference_soft_reset)
+		return;
+
+	device_remove_groups(hdev->dev, hl_dev_inference_attr_groups);
 }
diff --git a/drivers/misc/habanalabs/gaudi/gaudi.c b/drivers/misc/habanalabs/gaudi/gaudi.c
index 825737dfe381..013c6da2e3ca 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /*
- * Copyright 2016-2020 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  */
 
@@ -593,26 +593,27 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
 	else
 		prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE;
 	prop->mmu_pte_size = HL_PTE_SIZE;
-	prop->mmu_hop_table_size = HOP_TABLE_SIZE;
-	prop->mmu_hop0_tables_total_size = HOP0_TABLES_TOTAL_SIZE;
+	prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE;
+	prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE;
 	prop->dram_page_size = PAGE_SIZE_2MB;
 	prop->dram_supports_virtual_memory = false;
 
-	prop->pmmu.hop0_shift = HOP0_SHIFT;
-	prop->pmmu.hop1_shift = HOP1_SHIFT;
-	prop->pmmu.hop2_shift = HOP2_SHIFT;
-	prop->pmmu.hop3_shift = HOP3_SHIFT;
-	prop->pmmu.hop4_shift = HOP4_SHIFT;
-	prop->pmmu.hop0_mask = HOP0_MASK;
-	prop->pmmu.hop1_mask = HOP1_MASK;
-	prop->pmmu.hop2_mask = HOP2_MASK;
-	prop->pmmu.hop3_mask = HOP3_MASK;
-	prop->pmmu.hop4_mask = HOP4_MASK;
+	prop->pmmu.hop0_shift = MMU_V1_1_HOP0_SHIFT;
+	prop->pmmu.hop1_shift = MMU_V1_1_HOP1_SHIFT;
+	prop->pmmu.hop2_shift = MMU_V1_1_HOP2_SHIFT;
+	prop->pmmu.hop3_shift = MMU_V1_1_HOP3_SHIFT;
+	prop->pmmu.hop4_shift = MMU_V1_1_HOP4_SHIFT;
+	prop->pmmu.hop0_mask = MMU_V1_1_HOP0_MASK;
+	prop->pmmu.hop1_mask = MMU_V1_1_HOP1_MASK;
+	prop->pmmu.hop2_mask = MMU_V1_1_HOP2_MASK;
+	prop->pmmu.hop3_mask = MMU_V1_1_HOP3_MASK;
+	prop->pmmu.hop4_mask = MMU_V1_1_HOP4_MASK;
 	prop->pmmu.start_addr = VA_HOST_SPACE_START;
 	prop->pmmu.end_addr =
 			(VA_HOST_SPACE_START + VA_HOST_SPACE_SIZE / 2) - 1;
 	prop->pmmu.page_size = PAGE_SIZE_4KB;
 	prop->pmmu.num_hops = MMU_ARCH_5_HOPS;
+	prop->pmmu.last_mask = LAST_MASK;
 
 	/* PMMU and HPMMU are the same except of page size */
 	memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu));
@@ -664,6 +665,8 @@ static int gaudi_set_fixed_properties(struct hl_device *hdev)
 	prop->clk_pll_index = HL_GAUDI_MME_PLL;
 	prop->max_freq_value = GAUDI_MAX_CLK_FREQ;
 
+	prop->use_get_power_for_reset_history = true;
+
 	return 0;
 }
 
@@ -878,6 +881,11 @@ static int gaudi_fetch_psoc_frequency(struct hl_device *hdev)
 	int rc;
 
 	if (hdev->asic_prop.fw_security_enabled) {
+		struct gaudi_device *gaudi = hdev->asic_specific;
+
+		if (!(gaudi->hw_cap_initialized & HW_CAP_CPU_Q))
+			return 0;
+
 		rc = hl_fw_cpucp_pll_info_get(hdev, HL_GAUDI_CPU_PLL, pll_freq_arr);
 
 		if (rc)
@@ -1273,6 +1281,7 @@ static int gaudi_collective_wait_init_cs(struct hl_cs *cs)
 		container_of(cs->signal_fence, struct hl_cs_compl, base_fence);
 	struct hl_cs_compl *cs_cmpl =
 		container_of(cs->fence, struct hl_cs_compl, base_fence);
+	struct hl_cs_encaps_sig_handle *handle = cs->encaps_sig_hdl;
 	struct gaudi_collective_properties *cprop;
 	u32 stream, queue_id, sob_group_offset;
 	struct gaudi_device *gaudi;
@@ -1285,10 +1294,16 @@ static int gaudi_collective_wait_init_cs(struct hl_cs *cs)
 	gaudi = hdev->asic_specific;
 	cprop = &gaudi->collective_props;
 
-	/* In encaps signals case the SOB info will be retrieved from
-	 * the handle in gaudi_collective_slave_init_job.
-	 */
-	if (!cs->encaps_signals) {
+	if (cs->encaps_signals) {
+		cs_cmpl->hw_sob = handle->hw_sob;
+		/* at this checkpoint we only need the hw_sob pointer
+		 * for the completion check before start going over the jobs
+		 * of the master/slaves, the sob_value will be taken later on
+		 * in gaudi_collective_slave_init_job depends on each
+		 * job wait offset value.
+		 */
+		cs_cmpl->sob_val = 0;
+	} else {
 		/* copy the SOB id and value of the signal CS */
 		cs_cmpl->hw_sob = signal_cs_cmpl->hw_sob;
 		cs_cmpl->sob_val = signal_cs_cmpl->sob_val;
@@ -1621,6 +1636,8 @@ static int gaudi_late_init(struct hl_device *hdev)
 	 */
 	gaudi_mmu_prepare(hdev, 1);
 
+	hdev->asic_funcs->set_pll_profile(hdev, PLL_LAST);
+
 	return 0;
 
 disable_pci_access:
@@ -4006,7 +4023,7 @@ static void gaudi_init_firmware_loader(struct hl_device *hdev)
 	struct fw_load_mgr *fw_loader = &hdev->fw_loader;
 
 	/* fill common fields */
-	fw_loader->linux_loaded = false;
+	fw_loader->fw_comp_loaded = FW_TYPE_NONE;
 	fw_loader->boot_fit_img.image_name = GAUDI_BOOT_FIT_FILE;
 	fw_loader->linux_img.image_name = GAUDI_LINUX_FW_FILE;
 	fw_loader->cpu_timeout = GAUDI_CPU_TIMEOUT_USEC;
@@ -4289,13 +4306,31 @@ static void gaudi_hw_fini(struct hl_device *hdev, bool hard_reset, bool fw_reset
 	 * via the GIC. Otherwise, we need to use COMMS or the MSG_TO_CPU
 	 * registers in case of old F/Ws
 	 */
-	if (hdev->fw_loader.linux_loaded) {
+	if (hdev->fw_loader.fw_comp_loaded & FW_TYPE_LINUX) {
 		irq_handler_offset = hdev->asic_prop.gic_interrupts_enable ?
 				mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR :
 				le32_to_cpu(dyn_regs->gic_host_halt_irq);
 
 		WREG32(irq_handler_offset,
 			gaudi_irq_map_table[GAUDI_EVENT_HALT_MACHINE].cpu_id);
+
+		/* This is a hail-mary attempt to revive the card in the small chance that the
+		 * f/w has experienced a watchdog event, which caused it to return back to preboot.
+		 * In that case, triggering reset through GIC won't help. We need to trigger the
+		 * reset as if Linux wasn't loaded.
+		 *
+		 * We do it only if the reset cause was HB, because that would be the indication
+		 * of such an event.
+		 *
+		 * In case watchdog hasn't expired but we still got HB, then this won't do any
+		 * damage.
+		 */
+		if (hdev->reset_info.curr_reset_cause == HL_RESET_CAUSE_HEARTBEAT) {
+			if (hdev->asic_prop.hard_reset_done_by_fw)
+				hl_fw_ask_hard_reset_without_linux(hdev);
+			else
+				hl_fw_ask_halt_machine_without_linux(hdev);
+		}
 	} else {
 		if (hdev->asic_prop.hard_reset_done_by_fw)
 			hl_fw_ask_hard_reset_without_linux(hdev);
@@ -6412,6 +6447,7 @@ static int gaudi_debugfs_read_dma(struct hl_device *hdev, u64 addr, u32 size,
 {
 	u32 dma_core_sts0, err_cause, cfg1, size_left, pos, size_to_dma;
 	struct gaudi_device *gaudi = hdev->asic_specific;
+	u32 qm_glbl_sts0, qm_cgm_sts;
 	u64 dma_offset, qm_offset;
 	dma_addr_t dma_addr;
 	void *kernel_addr;
@@ -6436,14 +6472,20 @@ static int gaudi_debugfs_read_dma(struct hl_device *hdev, u64 addr, u32 size,
 	dma_offset = dma_id * DMA_CORE_OFFSET;
 	qm_offset = dma_id * DMA_QMAN_OFFSET;
 	dma_core_sts0 = RREG32(mmDMA0_CORE_STS0 + dma_offset);
-	is_eng_idle = IS_DMA_IDLE(dma_core_sts0);
+	qm_glbl_sts0 = RREG32(mmDMA0_QM_GLBL_STS0 + qm_offset);
+	qm_cgm_sts = RREG32(mmDMA0_QM_CGM_STS + qm_offset);
+	is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts) &&
+		      IS_DMA_IDLE(dma_core_sts0);
 
 	if (!is_eng_idle) {
 		dma_id = gaudi_dma_assignment[GAUDI_PCI_DMA_2];
 		dma_offset = dma_id * DMA_CORE_OFFSET;
 		qm_offset = dma_id * DMA_QMAN_OFFSET;
 		dma_core_sts0 = RREG32(mmDMA0_CORE_STS0 + dma_offset);
-		is_eng_idle = IS_DMA_IDLE(dma_core_sts0);
+		qm_glbl_sts0 = RREG32(mmDMA0_QM_GLBL_STS0 + qm_offset);
+		qm_cgm_sts = RREG32(mmDMA0_QM_CGM_STS + qm_offset);
+		is_eng_idle = IS_QM_IDLE(qm_glbl_sts0, qm_cgm_sts) &&
+			      IS_DMA_IDLE(dma_core_sts0);
 
 		if (!is_eng_idle) {
 			dev_err_ratelimited(hdev->dev,
@@ -6522,7 +6564,7 @@ static u64 gaudi_read_pte(struct hl_device *hdev, u64 addr)
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
 
-	if (hdev->hard_reset_pending)
+	if (hdev->reset_info.hard_reset_pending)
 		return U64_MAX;
 
 	return readq(hdev->pcie_bar[HBM_BAR_ID] +
@@ -6533,7 +6575,7 @@ static void gaudi_write_pte(struct hl_device *hdev, u64 addr, u64 val)
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
 
-	if (hdev->hard_reset_pending)
+	if (hdev->reset_info.hard_reset_pending)
 		return;
 
 	writeq(val, hdev->pcie_bar[HBM_BAR_ID] +
@@ -6935,8 +6977,9 @@ event_not_supported:
 	snprintf(desc, size, "N/A");
 }
 
-static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev,
-							u32 x_y, bool is_write)
+static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev, u32 x_y,
+							bool is_write, s32 *engine_id_1,
+							s32 *engine_id_2)
 {
 	u32 dma_id[2], dma_offset, err_cause[2], mask, i;
 
@@ -6976,44 +7019,64 @@ static const char *gaudi_get_razwi_initiator_dma_name(struct hl_device *hdev,
 	switch (x_y) {
 	case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_S_0:
 	case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_S_1:
-		if ((err_cause[0] & mask) && !(err_cause[1] & mask))
+		if ((err_cause[0] & mask) && !(err_cause[1] & mask)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_DMA_0;
 			return "DMA0";
-		else if (!(err_cause[0] & mask) && (err_cause[1] & mask))
+		} else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_DMA_2;
 			return "DMA2";
-		else
+		} else {
+			*engine_id_1 = GAUDI_ENGINE_ID_DMA_0;
+			*engine_id_2 = GAUDI_ENGINE_ID_DMA_2;
 			return "DMA0 or DMA2";
+		}
 	case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_S_0:
 	case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_S_1:
-		if ((err_cause[0] & mask) && !(err_cause[1] & mask))
+		if ((err_cause[0] & mask) && !(err_cause[1] & mask)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_DMA_1;
 			return "DMA1";
-		else if (!(err_cause[0] & mask) && (err_cause[1] & mask))
+		} else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_DMA_3;
 			return "DMA3";
-		else
+		} else {
+			*engine_id_1 = GAUDI_ENGINE_ID_DMA_1;
+			*engine_id_2 = GAUDI_ENGINE_ID_DMA_3;
 			return "DMA1 or DMA3";
+		}
 	case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_0:
 	case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_1:
-		if ((err_cause[0] & mask) && !(err_cause[1] & mask))
+		if ((err_cause[0] & mask) && !(err_cause[1] & mask)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_DMA_4;
 			return "DMA4";
-		else if (!(err_cause[0] & mask) && (err_cause[1] & mask))
+		} else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_DMA_6;
 			return "DMA6";
-		else
+		} else {
+			*engine_id_1 = GAUDI_ENGINE_ID_DMA_4;
+			*engine_id_2 = GAUDI_ENGINE_ID_DMA_6;
 			return "DMA4 or DMA6";
+		}
 	case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_0:
 	case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_1:
-		if ((err_cause[0] & mask) && !(err_cause[1] & mask))
+		if ((err_cause[0] & mask) && !(err_cause[1] & mask)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_DMA_5;
 			return "DMA5";
-		else if (!(err_cause[0] & mask) && (err_cause[1] & mask))
+		} else if (!(err_cause[0] & mask) && (err_cause[1] & mask)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_DMA_7;
 			return "DMA7";
-		else
+		} else {
+			*engine_id_1 = GAUDI_ENGINE_ID_DMA_5;
+			*engine_id_2 = GAUDI_ENGINE_ID_DMA_7;
 			return "DMA5 or DMA7";
+		}
 	}
 
 unknown_initiator:
 	return "unknown initiator";
 }
 
-static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev,
-							bool is_write)
+static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev, bool is_write,
+							u32 *engine_id_1, u32 *engine_id_2)
 {
 	u32 val, x_y, axi_id;
 
@@ -7026,24 +7089,35 @@ static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev,
 
 	switch (x_y) {
 	case RAZWI_INITIATOR_ID_X_Y_TPC0_NIC0:
-		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC))
+		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_TPC_0;
 			return "TPC0";
-		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC))
+		}
+		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_NIC_0;
 			return "NIC0";
+		}
 		break;
 	case RAZWI_INITIATOR_ID_X_Y_TPC1:
+		*engine_id_1 = GAUDI_ENGINE_ID_TPC_1;
 		return "TPC1";
 	case RAZWI_INITIATOR_ID_X_Y_MME0_0:
 	case RAZWI_INITIATOR_ID_X_Y_MME0_1:
+		*engine_id_1 = GAUDI_ENGINE_ID_MME_0;
 		return "MME0";
 	case RAZWI_INITIATOR_ID_X_Y_MME1_0:
 	case RAZWI_INITIATOR_ID_X_Y_MME1_1:
+		*engine_id_1 = GAUDI_ENGINE_ID_MME_1;
 		return "MME1";
 	case RAZWI_INITIATOR_ID_X_Y_TPC2:
+		*engine_id_1 = GAUDI_ENGINE_ID_TPC_2;
 		return "TPC2";
 	case RAZWI_INITIATOR_ID_X_Y_TPC3_PCI_CPU_PSOC:
-		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC))
+		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_TPC_3;
 			return "TPC3";
+		}
+		/* PCI, CPU or PSOC does not have engine id*/
 		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_PCI))
 			return "PCI";
 		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_CPU))
@@ -7059,32 +7133,49 @@ static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev,
 	case RAZWI_INITIATOR_ID_X_Y_DMA_IF_W_N_1:
 	case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_0:
 	case RAZWI_INITIATOR_ID_X_Y_DMA_IF_E_N_1:
-		return gaudi_get_razwi_initiator_dma_name(hdev, x_y, is_write);
+		return gaudi_get_razwi_initiator_dma_name(hdev, x_y, is_write,
+				engine_id_1, engine_id_2);
 	case RAZWI_INITIATOR_ID_X_Y_TPC4_NIC1_NIC2:
-		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC))
+		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_TPC_4;
 			return "TPC4";
-		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC))
+		}
+		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_NIC_1;
 			return "NIC1";
-		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT))
+		}
+		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_NIC_2;
 			return "NIC2";
+		}
 		break;
 	case RAZWI_INITIATOR_ID_X_Y_TPC5:
+		*engine_id_1 = GAUDI_ENGINE_ID_TPC_5;
 		return "TPC5";
 	case RAZWI_INITIATOR_ID_X_Y_MME2_0:
 	case RAZWI_INITIATOR_ID_X_Y_MME2_1:
+		*engine_id_1 = GAUDI_ENGINE_ID_MME_2;
 		return "MME2";
 	case RAZWI_INITIATOR_ID_X_Y_MME3_0:
 	case RAZWI_INITIATOR_ID_X_Y_MME3_1:
+		*engine_id_1 = GAUDI_ENGINE_ID_MME_3;
 		return "MME3";
 	case RAZWI_INITIATOR_ID_X_Y_TPC6:
+		*engine_id_1 = GAUDI_ENGINE_ID_TPC_6;
 		return "TPC6";
 	case RAZWI_INITIATOR_ID_X_Y_TPC7_NIC4_NIC5:
-		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC))
+		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_TPC)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_TPC_7;
 			return "TPC7";
-		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC))
+		}
+		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_NIC_4;
 			return "NIC4";
-		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT))
+		}
+		if (axi_id == RAZWI_INITIATOR_ID_AXI_ID(AXI_ID_NIC_FT)) {
+			*engine_id_1 = GAUDI_ENGINE_ID_NIC_5;
 			return "NIC5";
+		}
 		break;
 	default:
 		break;
@@ -7101,27 +7192,28 @@ static const char *gaudi_get_razwi_initiator_name(struct hl_device *hdev,
 	return "unknown initiator";
 }
 
-static void gaudi_print_razwi_info(struct hl_device *hdev)
+static void gaudi_print_and_get_razwi_info(struct hl_device *hdev, u32 *engine_id_1,
+						u32 *engine_id_2)
 {
+
 	if (RREG32(mmMMU_UP_RAZWI_WRITE_VLD)) {
 		dev_err_ratelimited(hdev->dev,
 			"RAZWI event caused by illegal write of %s\n",
-			gaudi_get_razwi_initiator_name(hdev, true));
+			gaudi_get_razwi_initiator_name(hdev, true, engine_id_1, engine_id_2));
 		WREG32(mmMMU_UP_RAZWI_WRITE_VLD, 0);
 	}
 
 	if (RREG32(mmMMU_UP_RAZWI_READ_VLD)) {
 		dev_err_ratelimited(hdev->dev,
 			"RAZWI event caused by illegal read of %s\n",
-			gaudi_get_razwi_initiator_name(hdev, false));
+			gaudi_get_razwi_initiator_name(hdev, false, engine_id_1, engine_id_2));
 		WREG32(mmMMU_UP_RAZWI_READ_VLD, 0);
 	}
 }
 
-static void gaudi_print_mmu_error_info(struct hl_device *hdev)
+static void gaudi_print_and_get_mmu_error_info(struct hl_device *hdev, u64 *addr, u8 *type)
 {
 	struct gaudi_device *gaudi = hdev->asic_specific;
-	u64 addr;
 	u32 val;
 
 	if (!(gaudi->hw_cap_initialized & HW_CAP_MMU))
@@ -7129,24 +7221,24 @@ static void gaudi_print_mmu_error_info(struct hl_device *hdev)
 
 	val = RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE);
 	if (val & MMU_UP_PAGE_ERROR_CAPTURE_ENTRY_VALID_MASK) {
-		addr = val & MMU_UP_PAGE_ERROR_CAPTURE_VA_49_32_MASK;
-		addr <<= 32;
-		addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA);
+		*addr = val & MMU_UP_PAGE_ERROR_CAPTURE_VA_49_32_MASK;
+		*addr <<= 32;
+		*addr |= RREG32(mmMMU_UP_PAGE_ERROR_CAPTURE_VA);
 
-		dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n",
-					addr);
+		dev_err_ratelimited(hdev->dev, "MMU page fault on va 0x%llx\n", *addr);
+		*type = HL_RAZWI_PAGE_FAULT;
 
 		WREG32(mmMMU_UP_PAGE_ERROR_CAPTURE, 0);
 	}
 
 	val = RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE);
 	if (val & MMU_UP_ACCESS_ERROR_CAPTURE_ENTRY_VALID_MASK) {
-		addr = val & MMU_UP_ACCESS_ERROR_CAPTURE_VA_49_32_MASK;
-		addr <<= 32;
-		addr |= RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE_VA);
+		*addr = val & MMU_UP_ACCESS_ERROR_CAPTURE_VA_49_32_MASK;
+		*addr <<= 32;
+		*addr |= RREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE_VA);
 
-		dev_err_ratelimited(hdev->dev,
-				"MMU access error on va 0x%llx\n", addr);
+		dev_err_ratelimited(hdev->dev, "MMU access error on va 0x%llx\n", *addr);
+		*type = HL_RAZWI_MMU_ACCESS_ERROR;
 
 		WREG32(mmMMU_UP_ACCESS_ERROR_CAPTURE, 0);
 	}
@@ -7665,15 +7757,46 @@ static void gaudi_handle_qman_err(struct hl_device *hdev, u16 event_type)
 static void gaudi_print_irq_info(struct hl_device *hdev, u16 event_type,
 					bool razwi)
 {
+	u32 engine_id_1, engine_id_2;
 	char desc[64] = "";
+	u64 razwi_addr = 0;
+	u8 razwi_type;
+	int rc;
+
+	/*
+	 * Init engine id by default as not valid and only if razwi initiated from engine with
+	 * engine id it will get valid value.
+	 * Init razwi type to default, will be changed only if razwi caused by page fault of
+	 * MMU access error
+	 */
+	engine_id_1 = U16_MAX;
+	engine_id_2 = U16_MAX;
+	razwi_type = U8_MAX;
 
 	gaudi_get_event_desc(event_type, desc, sizeof(desc));
 	dev_err_ratelimited(hdev->dev, "Received H/W interrupt %d [\"%s\"]\n",
 		event_type, desc);
 
 	if (razwi) {
-		gaudi_print_razwi_info(hdev);
-		gaudi_print_mmu_error_info(hdev);
+		gaudi_print_and_get_razwi_info(hdev, &engine_id_1, &engine_id_2);
+		gaudi_print_and_get_mmu_error_info(hdev, &razwi_addr, &razwi_type);
+
+		/* In case it's the first razwi, save its parameters*/
+		rc = atomic_cmpxchg(&hdev->last_error.razwi_write_disable, 0, 1);
+		if (!rc) {
+			hdev->last_error.open_dev_timestamp = hdev->last_successful_open_ktime;
+			hdev->last_error.razwi_timestamp = ktime_get();
+			hdev->last_error.razwi_addr = razwi_addr;
+			hdev->last_error.razwi_engine_id_1 = engine_id_1;
+			hdev->last_error.razwi_engine_id_2 = engine_id_2;
+			/*
+			 * If first engine id holds non valid value the razwi initiator
+			 * does not have engine id
+			 */
+			hdev->last_error.razwi_non_engine_initiator = (engine_id_1 == U16_MAX);
+			hdev->last_error.razwi_type = razwi_type;
+
+		}
 	}
 }
 
@@ -7696,14 +7819,10 @@ static void gaudi_print_fw_alive_info(struct hl_device *hdev,
 		fw_alive->thread_id, fw_alive->uptime_seconds);
 }
 
-static int gaudi_soft_reset_late_init(struct hl_device *hdev)
+static int gaudi_non_hard_reset_late_init(struct hl_device *hdev)
 {
-	struct gaudi_device *gaudi = hdev->asic_specific;
-
-	/* Unmask all IRQs since some could have been received
-	 * during the soft reset
-	 */
-	return hl_fw_unmask_irq_arr(hdev, gaudi->events, sizeof(gaudi->events));
+	/* GAUDI doesn't support any reset except hard-reset */
+	return -EPERM;
 }
 
 static int gaudi_hbm_read_interrupts(struct hl_device *hdev, int device,
@@ -7897,27 +8016,39 @@ static int tpc_krn_event_to_tpc_id(u16 tpc_dec_event_type)
 static void gaudi_print_clk_change_info(struct hl_device *hdev,
 					u16 event_type)
 {
+	ktime_t zero_time = ktime_set(0, 0);
+
+	mutex_lock(&hdev->clk_throttling.lock);
+
 	switch (event_type) {
 	case GAUDI_EVENT_FIX_POWER_ENV_S:
-		hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get();
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time;
 		dev_info_ratelimited(hdev->dev,
 			"Clock throttling due to power consumption\n");
 		break;
 
 	case GAUDI_EVENT_FIX_POWER_ENV_E:
-		hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get();
 		dev_info_ratelimited(hdev->dev,
 			"Power envelop is safe, back to optimal clock\n");
 		break;
 
 	case GAUDI_EVENT_FIX_THERMAL_ENV_S:
-		hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
 		dev_info_ratelimited(hdev->dev,
 			"Clock throttling due to overheating\n");
 		break;
 
 	case GAUDI_EVENT_FIX_THERMAL_ENV_E:
-		hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
 		dev_info_ratelimited(hdev->dev,
 			"Thermal envelop is safe, back to optimal clock\n");
 		break;
@@ -7927,6 +8058,8 @@ static void gaudi_print_clk_change_info(struct hl_device *hdev,
 			event_type);
 		break;
 	}
+
+	mutex_unlock(&hdev->clk_throttling.lock);
 }
 
 static void gaudi_handle_eqe(struct hl_device *hdev,
@@ -7975,7 +8108,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	case GAUDI_EVENT_NIC0_CS_DBG_DERR ... GAUDI_EVENT_NIC4_CS_DBG_DERR:
 		gaudi_print_irq_info(hdev, event_type, true);
 		gaudi_handle_ecc_event(hdev, event_type, &eq_entry->ecc_data);
-		fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
+		fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
 		goto reset_device;
 
 	case GAUDI_EVENT_GIC500:
@@ -7983,7 +8116,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 	case GAUDI_EVENT_L2_RAM_ECC:
 	case GAUDI_EVENT_PLL0 ... GAUDI_EVENT_PLL17:
 		gaudi_print_irq_info(hdev, event_type, false);
-		fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
+		fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
 		goto reset_device;
 
 	case GAUDI_EVENT_HBM0_SPI_0:
@@ -7994,7 +8127,7 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 		gaudi_hbm_read_interrupts(hdev,
 				gaudi_hbm_event_to_dev(event_type),
 				&eq_entry->hbm_ecc_data);
-		fw_fatal_err_flag = HL_RESET_FW_FATAL_ERR;
+		fw_fatal_err_flag = HL_DRV_RESET_FW_FATAL_ERR;
 		goto reset_device;
 
 	case GAUDI_EVENT_HBM0_SPI_1:
@@ -8177,9 +8310,11 @@ static void gaudi_handle_eqe(struct hl_device *hdev,
 
 reset_device:
 	if (hdev->asic_prop.fw_security_enabled)
-		hl_device_reset(hdev, HL_RESET_HARD | HL_RESET_FW | fw_fatal_err_flag);
+		hl_device_reset(hdev, HL_DRV_RESET_HARD
+					| HL_DRV_RESET_BYPASS_REQ_TO_FW
+					| fw_fatal_err_flag);
 	else if (hdev->hard_reset_on_fw_events)
-		hl_device_reset(hdev, HL_RESET_HARD | fw_fatal_err_flag);
+		hl_device_reset(hdev, HL_DRV_RESET_HARD | fw_fatal_err_flag);
 	else
 		hl_fw_unmask_irq(hdev, event_type);
 }
@@ -8206,7 +8341,7 @@ static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 	int rc;
 
 	if (!(gaudi->hw_cap_initialized & HW_CAP_MMU) ||
-		hdev->hard_reset_pending)
+		hdev->reset_info.hard_reset_pending)
 		return 0;
 
 	if (hdev->pldm)
@@ -8229,12 +8364,6 @@ static int gaudi_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 
 	WREG32(mmSTLB_INV_SET, 0);
 
-	if (rc) {
-		dev_err_ratelimited(hdev->dev,
-					"MMU cache invalidation timeout\n");
-		hl_device_reset(hdev, HL_RESET_HARD);
-	}
-
 	return rc;
 }
 
@@ -8662,7 +8791,7 @@ static int gaudi_internal_cb_pool_init(struct hl_device *hdev,
 			hdev->internal_cb_pool_dma_addr,
 			HOST_SPACE_INTERNAL_CB_SZ);
 
-	hdev->asic_funcs->mmu_invalidate_cache(hdev, false, VM_TYPE_USERPTR);
+	hdev->asic_funcs->mmu_invalidate_cache(hdev, false, MMU_OP_USERPTR);
 	mutex_unlock(&ctx->mmu_lock);
 
 	if (rc)
@@ -8697,7 +8826,7 @@ static void gaudi_internal_cb_pool_fini(struct hl_device *hdev,
 			HOST_SPACE_INTERNAL_CB_SZ);
 	hl_unreserve_va_block(hdev, ctx, hdev->internal_cb_va_base,
 			HOST_SPACE_INTERNAL_CB_SZ);
-	hdev->asic_funcs->mmu_invalidate_cache(hdev, true, VM_TYPE_USERPTR);
+	hdev->asic_funcs->mmu_invalidate_cache(hdev, true, MMU_OP_USERPTR);
 	mutex_unlock(&ctx->mmu_lock);
 
 	gen_pool_destroy(hdev->internal_cb_pool);
@@ -9458,7 +9587,7 @@ static const struct hl_asic_funcs gaudi_funcs = {
 	.disable_clock_gating = gaudi_disable_clock_gating,
 	.debug_coresight = gaudi_debug_coresight,
 	.is_device_idle = gaudi_is_device_idle,
-	.soft_reset_late_init = gaudi_soft_reset_late_init,
+	.non_hard_reset_late_init = gaudi_non_hard_reset_late_init,
 	.hw_queues_lock = gaudi_hw_queues_lock,
 	.hw_queues_unlock = gaudi_hw_queues_unlock,
 	.get_pci_id = gaudi_get_pci_id,
diff --git a/drivers/misc/habanalabs/gaudi/gaudiP.h b/drivers/misc/habanalabs/gaudi/gaudiP.h
index f325e36a71e6..8ac16a9b7d15 100644
--- a/drivers/misc/habanalabs/gaudi/gaudiP.h
+++ b/drivers/misc/habanalabs/gaudi/gaudiP.h
@@ -357,8 +357,8 @@ void gaudi_init_security(struct hl_device *hdev);
 void gaudi_ack_protection_bits_errors(struct hl_device *hdev);
 void gaudi_add_device_attr(struct hl_device *hdev,
 			struct attribute_group *dev_attr_grp);
-int gaudi_debug_coresight(struct hl_device *hdev, void *data);
-void gaudi_halt_coresight(struct hl_device *hdev);
+int gaudi_debug_coresight(struct hl_device *hdev, struct hl_ctx *ctx, void *data);
+void gaudi_halt_coresight(struct hl_device *hdev, struct hl_ctx *ctx);
 void gaudi_mmu_prepare_reg(struct hl_device *hdev, u64 reg, u32 asid);
 
 #endif /* GAUDIP_H_ */
diff --git a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c
index 5349c1be13f9..08108f5fed67 100644
--- a/drivers/misc/habanalabs/gaudi/gaudi_coresight.c
+++ b/drivers/misc/habanalabs/gaudi/gaudi_coresight.c
@@ -848,7 +848,7 @@ static int gaudi_config_spmu(struct hl_device *hdev,
 	return 0;
 }
 
-int gaudi_debug_coresight(struct hl_device *hdev, void *data)
+int gaudi_debug_coresight(struct hl_device *hdev, struct hl_ctx *ctx, void *data)
 {
 	struct hl_debug_params *params = data;
 	int rc = 0;
@@ -887,7 +887,7 @@ int gaudi_debug_coresight(struct hl_device *hdev, void *data)
 	return rc;
 }
 
-void gaudi_halt_coresight(struct hl_device *hdev)
+void gaudi_halt_coresight(struct hl_device *hdev, struct hl_ctx *ctx)
 {
 	struct hl_debug_params params = {};
 	int i, rc;
diff --git a/drivers/misc/habanalabs/goya/goya.c b/drivers/misc/habanalabs/goya/goya.c
index 5536e8c27bd5..fbcc7bbf44b3 100644
--- a/drivers/misc/habanalabs/goya/goya.c
+++ b/drivers/misc/habanalabs/goya/goya.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /*
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  */
 
@@ -410,25 +410,26 @@ int goya_set_fixed_properties(struct hl_device *hdev)
 	else
 		prop->mmu_pgt_size = MMU_PAGE_TABLES_SIZE;
 	prop->mmu_pte_size = HL_PTE_SIZE;
-	prop->mmu_hop_table_size = HOP_TABLE_SIZE;
-	prop->mmu_hop0_tables_total_size = HOP0_TABLES_TOTAL_SIZE;
+	prop->mmu_hop_table_size = HOP_TABLE_SIZE_512_PTE;
+	prop->mmu_hop0_tables_total_size = HOP0_512_PTE_TABLES_TOTAL_SIZE;
 	prop->dram_page_size = PAGE_SIZE_2MB;
 	prop->dram_supports_virtual_memory = true;
 
-	prop->dmmu.hop0_shift = HOP0_SHIFT;
-	prop->dmmu.hop1_shift = HOP1_SHIFT;
-	prop->dmmu.hop2_shift = HOP2_SHIFT;
-	prop->dmmu.hop3_shift = HOP3_SHIFT;
-	prop->dmmu.hop4_shift = HOP4_SHIFT;
-	prop->dmmu.hop0_mask = HOP0_MASK;
-	prop->dmmu.hop1_mask = HOP1_MASK;
-	prop->dmmu.hop2_mask = HOP2_MASK;
-	prop->dmmu.hop3_mask = HOP3_MASK;
-	prop->dmmu.hop4_mask = HOP4_MASK;
+	prop->dmmu.hop0_shift = MMU_V1_0_HOP0_SHIFT;
+	prop->dmmu.hop1_shift = MMU_V1_0_HOP1_SHIFT;
+	prop->dmmu.hop2_shift = MMU_V1_0_HOP2_SHIFT;
+	prop->dmmu.hop3_shift = MMU_V1_0_HOP3_SHIFT;
+	prop->dmmu.hop4_shift = MMU_V1_0_HOP4_SHIFT;
+	prop->dmmu.hop0_mask = MMU_V1_0_HOP0_MASK;
+	prop->dmmu.hop1_mask = MMU_V1_0_HOP1_MASK;
+	prop->dmmu.hop2_mask = MMU_V1_0_HOP2_MASK;
+	prop->dmmu.hop3_mask = MMU_V1_0_HOP3_MASK;
+	prop->dmmu.hop4_mask = MMU_V1_0_HOP4_MASK;
 	prop->dmmu.start_addr = VA_DDR_SPACE_START;
 	prop->dmmu.end_addr = VA_DDR_SPACE_END;
 	prop->dmmu.page_size = PAGE_SIZE_2MB;
 	prop->dmmu.num_hops = MMU_ARCH_5_HOPS;
+	prop->dmmu.last_mask = LAST_MASK;
 
 	/* shifts and masks are the same in PMMU and DMMU */
 	memcpy(&prop->pmmu, &prop->dmmu, sizeof(prop->dmmu));
@@ -436,6 +437,7 @@ int goya_set_fixed_properties(struct hl_device *hdev)
 	prop->pmmu.end_addr = VA_HOST_SPACE_END;
 	prop->pmmu.page_size = PAGE_SIZE_4KB;
 	prop->pmmu.num_hops = MMU_ARCH_5_HOPS;
+	prop->pmmu.last_mask = LAST_MASK;
 
 	/* PMMU and HPMMU are the same except of page size */
 	memcpy(&prop->pmmu_huge, &prop->pmmu, sizeof(prop->pmmu));
@@ -473,6 +475,8 @@ int goya_set_fixed_properties(struct hl_device *hdev)
 
 	prop->clk_pll_index = HL_GOYA_MME_PLL;
 
+	prop->use_get_power_for_reset_history = true;
+
 	return 0;
 }
 
@@ -735,6 +739,11 @@ static void goya_fetch_psoc_frequency(struct hl_device *hdev)
 	int rc;
 
 	if (hdev->asic_prop.fw_security_enabled) {
+		struct goya_device *goya = hdev->asic_specific;
+
+		if (!(goya->hw_cap_initialized & HW_CAP_CPU_Q))
+			return;
+
 		rc = hl_fw_cpucp_pll_info_get(hdev, HL_GOYA_PCI_PLL,
 				pll_freq_arr);
 
@@ -778,9 +787,59 @@ static void goya_fetch_psoc_frequency(struct hl_device *hdev)
 	prop->psoc_pci_pll_div_factor = div_fctr;
 }
 
+/*
+ * goya_set_frequency - set the frequency of the device
+ *
+ * @hdev: pointer to habanalabs device structure
+ * @freq: the new frequency value
+ *
+ * Change the frequency if needed. This function has no protection against
+ * concurrency, therefore it is assumed that the calling function has protected
+ * itself against the case of calling this function from multiple threads with
+ * different values
+ *
+ * Returns 0 if no change was done, otherwise returns 1
+ */
+int goya_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq)
+{
+	struct goya_device *goya = hdev->asic_specific;
+
+	if ((goya->pm_mng_profile == PM_MANUAL) ||
+			(goya->curr_pll_profile == freq))
+		return 0;
+
+	dev_dbg(hdev->dev, "Changing device frequency to %s\n",
+		freq == PLL_HIGH ? "high" : "low");
+
+	goya_set_pll_profile(hdev, freq);
+
+	goya->curr_pll_profile = freq;
+
+	return 1;
+}
+
+static void goya_set_freq_to_low_job(struct work_struct *work)
+{
+	struct goya_work_freq *goya_work = container_of(work,
+						struct goya_work_freq,
+						work_freq.work);
+	struct hl_device *hdev = goya_work->hdev;
+
+	mutex_lock(&hdev->fpriv_list_lock);
+
+	if (!hdev->is_compute_ctx_active)
+		goya_set_frequency(hdev, PLL_LOW);
+
+	mutex_unlock(&hdev->fpriv_list_lock);
+
+	schedule_delayed_work(&goya_work->work_freq,
+			usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
+}
+
 int goya_late_init(struct hl_device *hdev)
 {
 	struct asic_fixed_properties *prop = &hdev->asic_prop;
+	struct goya_device *goya = hdev->asic_specific;
 	int rc;
 
 	goya_fetch_psoc_frequency(hdev);
@@ -829,6 +888,16 @@ int goya_late_init(struct hl_device *hdev)
 		return rc;
 	}
 
+	/* force setting to low frequency */
+	goya->curr_pll_profile = PLL_LOW;
+
+	goya->pm_mng_profile = PM_AUTO;
+
+	hdev->asic_funcs->set_pll_profile(hdev, PLL_LOW);
+
+	schedule_delayed_work(&goya->goya_work->work_freq,
+		usecs_to_jiffies(HL_PLL_LOW_JOB_FREQ_USEC));
+
 	return 0;
 }
 
@@ -842,8 +911,11 @@ int goya_late_init(struct hl_device *hdev)
 void goya_late_fini(struct hl_device *hdev)
 {
 	const struct hwmon_channel_info **channel_info_arr;
+	struct goya_device *goya = hdev->asic_specific;
 	int i = 0;
 
+	cancel_delayed_work_sync(&goya->goya_work->work_freq);
+
 	if (!hdev->hl_chip_info->info)
 		return;
 
@@ -961,12 +1033,21 @@ static int goya_sw_init(struct hl_device *hdev)
 
 	spin_lock_init(&goya->hw_queues_lock);
 	hdev->supports_coresight = true;
-	hdev->supports_soft_reset = true;
-	hdev->allow_inference_soft_reset = true;
+	hdev->asic_prop.supports_soft_reset = true;
+	hdev->asic_prop.allow_inference_soft_reset = true;
 	hdev->supports_wait_for_multi_cs = false;
 
 	hdev->asic_funcs->set_pci_memory_regions(hdev);
 
+	goya->goya_work = kmalloc(sizeof(struct goya_work_freq), GFP_KERNEL);
+	if (!goya->goya_work) {
+		rc = -ENOMEM;
+		goto free_cpu_accessible_dma_pool;
+	}
+
+	goya->goya_work->hdev = hdev;
+	INIT_DELAYED_WORK(&goya->goya_work->work_freq, goya_set_freq_to_low_job);
+
 	return 0;
 
 free_cpu_accessible_dma_pool:
@@ -1003,6 +1084,7 @@ static int goya_sw_fini(struct hl_device *hdev)
 
 	dma_pool_destroy(hdev->dma_pool);
 
+	kfree(goya->goya_work);
 	kfree(goya);
 
 	return 0;
@@ -2502,7 +2584,7 @@ static void goya_init_firmware_loader(struct hl_device *hdev)
 	struct fw_load_mgr *fw_loader = &hdev->fw_loader;
 
 	/* fill common fields */
-	fw_loader->linux_loaded = false;
+	fw_loader->fw_comp_loaded = FW_TYPE_NONE;
 	fw_loader->boot_fit_img.image_name = GOYA_BOOT_FIT_FILE;
 	fw_loader->linux_img.image_name = GOYA_LINUX_FW_FILE;
 	fw_loader->cpu_timeout = GOYA_CPU_TIMEOUT_USEC;
@@ -2619,7 +2701,7 @@ int goya_mmu_init(struct hl_device *hdev)
 			(~STLB_STLB_FEATURE_EN_FOLLOWER_EN_MASK));
 
 	hdev->asic_funcs->mmu_invalidate_cache(hdev, true,
-					VM_TYPE_USERPTR | VM_TYPE_PHYS_PACK);
+					MMU_OP_USERPTR | MMU_OP_PHYS_PACK);
 
 	WREG32(mmMMU_MMU_ENABLE, 1);
 	WREG32(mmMMU_SPI_MASK, 0xF);
@@ -4395,7 +4477,7 @@ static u64 goya_read_pte(struct hl_device *hdev, u64 addr)
 {
 	struct goya_device *goya = hdev->asic_specific;
 
-	if (hdev->hard_reset_pending)
+	if (hdev->reset_info.hard_reset_pending)
 		return U64_MAX;
 
 	return readq(hdev->pcie_bar[DDR_BAR_ID] +
@@ -4406,7 +4488,7 @@ static void goya_write_pte(struct hl_device *hdev, u64 addr, u64 val)
 {
 	struct goya_device *goya = hdev->asic_specific;
 
-	if (hdev->hard_reset_pending)
+	if (hdev->reset_info.hard_reset_pending)
 		return;
 
 	writeq(val, hdev->pcie_bar[DDR_BAR_ID] +
@@ -4731,7 +4813,7 @@ static int goya_unmask_irq_arr(struct hl_device *hdev, u32 *irq_arr,
 	return rc;
 }
 
-static int goya_soft_reset_late_init(struct hl_device *hdev)
+static int goya_non_hard_reset_late_init(struct hl_device *hdev)
 {
 	/*
 	 * Unmask all IRQs since some could have been received
@@ -4764,24 +4846,39 @@ static int goya_unmask_irq(struct hl_device *hdev, u16 event_type)
 
 static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type)
 {
+	ktime_t zero_time = ktime_set(0, 0);
+
+	mutex_lock(&hdev->clk_throttling.lock);
+
 	switch (event_type) {
 	case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_S:
-		hdev->clk_throttling_reason |= HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].start = ktime_get();
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = zero_time;
 		dev_info_ratelimited(hdev->dev,
 			"Clock throttling due to power consumption\n");
 		break;
+
 	case GOYA_ASYNC_EVENT_ID_FIX_POWER_ENV_E:
-		hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_POWER;
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_POWER].end = ktime_get();
 		dev_info_ratelimited(hdev->dev,
 			"Power envelop is safe, back to optimal clock\n");
 		break;
+
 	case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_S:
-		hdev->clk_throttling_reason |= HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.current_reason |= HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.aggregated_reason |= HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].start = ktime_get();
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = zero_time;
 		dev_info_ratelimited(hdev->dev,
 			"Clock throttling due to overheating\n");
 		break;
+
 	case GOYA_ASYNC_EVENT_ID_FIX_THERMAL_ENV_E:
-		hdev->clk_throttling_reason &= ~HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.current_reason &= ~HL_CLK_THROTTLE_THERMAL;
+		hdev->clk_throttling.timestamp[HL_CLK_THROTTLE_TYPE_THERMAL].end = ktime_get();
 		dev_info_ratelimited(hdev->dev,
 			"Thermal envelop is safe, back to optimal clock\n");
 		break;
@@ -4791,6 +4888,8 @@ static void goya_print_clk_change_info(struct hl_device *hdev, u16 event_type)
 			event_type);
 		break;
 	}
+
+	mutex_unlock(&hdev->clk_throttling.lock);
 }
 
 void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
@@ -4834,14 +4933,14 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
 	case GOYA_ASYNC_EVENT_ID_L2_RAM_ECC:
 		goya_print_irq_info(hdev, event_type, false);
 		if (hdev->hard_reset_on_fw_events)
-			hl_device_reset(hdev, (HL_RESET_HARD |
-						HL_RESET_FW_FATAL_ERR));
+			hl_device_reset(hdev, (HL_DRV_RESET_HARD |
+						HL_DRV_RESET_FW_FATAL_ERR));
 		break;
 
 	case GOYA_ASYNC_EVENT_ID_PSOC_GPIO_05_SW_RESET:
 		goya_print_irq_info(hdev, event_type, false);
 		if (hdev->hard_reset_on_fw_events)
-			hl_device_reset(hdev, HL_RESET_HARD);
+			hl_device_reset(hdev, HL_DRV_RESET_HARD);
 		break;
 
 	case GOYA_ASYNC_EVENT_ID_PCIE_DEC:
@@ -4901,7 +5000,7 @@ void goya_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_entry)
 		goya_print_irq_info(hdev, event_type, false);
 		goya_print_out_of_sync_info(hdev, &eq_entry->pkt_sync_err);
 		if (hdev->hard_reset_on_fw_events)
-			hl_device_reset(hdev, HL_RESET_HARD);
+			hl_device_reset(hdev, HL_DRV_RESET_HARD);
 		else
 			hl_fw_unmask_irq(hdev, event_type);
 		break;
@@ -5209,7 +5308,7 @@ static int goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 	int rc;
 
 	if (!(goya->hw_cap_initialized & HW_CAP_MMU) ||
-		hdev->hard_reset_pending)
+		hdev->reset_info.hard_reset_pending)
 		return 0;
 
 	/* no need in L1 only invalidation in Goya */
@@ -5232,12 +5331,6 @@ static int goya_mmu_invalidate_cache(struct hl_device *hdev, bool is_hard,
 		1000,
 		timeout_usec);
 
-	if (rc) {
-		dev_err_ratelimited(hdev->dev,
-					"MMU cache invalidation timeout\n");
-		hl_device_reset(hdev, HL_RESET_HARD);
-	}
-
 	return rc;
 }
 
@@ -5645,7 +5738,7 @@ static const struct hl_asic_funcs goya_funcs = {
 	.disable_clock_gating = goya_disable_clock_gating,
 	.debug_coresight = goya_debug_coresight,
 	.is_device_idle = goya_is_device_idle,
-	.soft_reset_late_init = goya_soft_reset_late_init,
+	.non_hard_reset_late_init = goya_non_hard_reset_late_init,
 	.hw_queues_lock = goya_hw_queues_lock,
 	.hw_queues_unlock = goya_hw_queues_unlock,
 	.get_pci_id = goya_get_pci_id,
diff --git a/drivers/misc/habanalabs/goya/goyaP.h b/drivers/misc/habanalabs/goya/goyaP.h
index 97add7b04f82..3740fd25bf84 100644
--- a/drivers/misc/habanalabs/goya/goyaP.h
+++ b/drivers/misc/habanalabs/goya/goyaP.h
@@ -153,9 +153,15 @@
 #define HW_CAP_GOLDEN		0x00000400
 #define HW_CAP_TPC		0x00000800
 
+struct goya_work_freq {
+	struct hl_device *hdev;
+	struct delayed_work work_freq;
+};
+
 struct goya_device {
 	/* TODO: remove hw_queues_lock after moving to scheduler code */
 	spinlock_t	hw_queues_lock;
+	struct goya_work_freq	*goya_work;
 
 	u64		mme_clk;
 	u64		tpc_clk;
@@ -166,6 +172,9 @@ struct goya_device {
 	u32		events_stat_aggregate[GOYA_ASYNC_EVENT_ID_SIZE];
 	u32		hw_cap_initialized;
 	u8		device_cpu_mmu_mappings_done;
+
+	enum hl_pll_frequency		curr_pll_profile;
+	enum hl_pm_mng_profile		pm_mng_profile;
 };
 
 int goya_set_fixed_properties(struct hl_device *hdev);
@@ -211,8 +220,8 @@ void goya_set_pll_profile(struct hl_device *hdev, enum hl_pll_frequency freq);
 void goya_add_device_attr(struct hl_device *hdev,
 			struct attribute_group *dev_attr_grp);
 int goya_cpucp_info_get(struct hl_device *hdev);
-int goya_debug_coresight(struct hl_device *hdev, void *data);
-void goya_halt_coresight(struct hl_device *hdev);
+int goya_debug_coresight(struct hl_device *hdev, struct hl_ctx *ctx, void *data);
+void goya_halt_coresight(struct hl_device *hdev, struct hl_ctx *ctx);
 
 int goya_suspend(struct hl_device *hdev);
 int goya_resume(struct hl_device *hdev);
@@ -237,5 +246,6 @@ void goya_mmu_remove_device_cpu_mappings(struct hl_device *hdev);
 
 u32 goya_get_queue_id_for_cq(struct hl_device *hdev, u32 cq_idx);
 u64 goya_get_device_time(struct hl_device *hdev);
+int goya_set_frequency(struct hl_device *hdev, enum hl_pll_frequency freq);
 
 #endif /* GOYAP_H_ */
diff --git a/drivers/misc/habanalabs/goya/goya_coresight.c b/drivers/misc/habanalabs/goya/goya_coresight.c
index c55c100fdd24..2c5133cfae65 100644
--- a/drivers/misc/habanalabs/goya/goya_coresight.c
+++ b/drivers/misc/habanalabs/goya/goya_coresight.c
@@ -652,7 +652,7 @@ static int goya_config_spmu(struct hl_device *hdev,
 	return 0;
 }
 
-int goya_debug_coresight(struct hl_device *hdev, void *data)
+int goya_debug_coresight(struct hl_device *hdev, struct hl_ctx *ctx, void *data)
 {
 	struct hl_debug_params *params = data;
 	int rc = 0;
@@ -691,7 +691,7 @@ int goya_debug_coresight(struct hl_device *hdev, void *data)
 	return rc;
 }
 
-void goya_halt_coresight(struct hl_device *hdev)
+void goya_halt_coresight(struct hl_device *hdev, struct hl_ctx *ctx)
 {
 	struct hl_debug_params params = {};
 	int i, rc;
diff --git a/drivers/misc/habanalabs/goya/goya_hwmgr.c b/drivers/misc/habanalabs/goya/goya_hwmgr.c
index 59b2624ff81a..76b47749affe 100644
--- a/drivers/misc/habanalabs/goya/goya_hwmgr.c
+++ b/drivers/misc/habanalabs/goya/goya_hwmgr.c
@@ -1,7 +1,7 @@
 // SPDX-License-Identifier: GPL-2.0
 
 /*
- * Copyright 2016-2019 HabanaLabs, Ltd.
+ * Copyright 2016-2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  */
 
@@ -62,7 +62,7 @@ static ssize_t mme_clk_store(struct device *dev, struct device_attribute *attr,
 		goto fail;
 	}
 
-	if (hdev->pm_mng_profile == PM_AUTO) {
+	if (goya->pm_mng_profile == PM_AUTO) {
 		count = -EPERM;
 		goto fail;
 	}
@@ -111,7 +111,7 @@ static ssize_t tpc_clk_store(struct device *dev, struct device_attribute *attr,
 		goto fail;
 	}
 
-	if (hdev->pm_mng_profile == PM_AUTO) {
+	if (goya->pm_mng_profile == PM_AUTO) {
 		count = -EPERM;
 		goto fail;
 	}
@@ -160,7 +160,7 @@ static ssize_t ic_clk_store(struct device *dev, struct device_attribute *attr,
 		goto fail;
 	}
 
-	if (hdev->pm_mng_profile == PM_AUTO) {
+	if (goya->pm_mng_profile == PM_AUTO) {
 		count = -EPERM;
 		goto fail;
 	}
@@ -234,13 +234,14 @@ static ssize_t pm_mng_profile_show(struct device *dev,
 				struct device_attribute *attr, char *buf)
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
+	struct goya_device *goya = hdev->asic_specific;
 
 	if (!hl_device_operational(hdev, NULL))
 		return -ENODEV;
 
 	return sprintf(buf, "%s\n",
-			(hdev->pm_mng_profile == PM_AUTO) ? "auto" :
-			(hdev->pm_mng_profile == PM_MANUAL) ? "manual" :
+			(goya->pm_mng_profile == PM_AUTO) ? "auto" :
+			(goya->pm_mng_profile == PM_MANUAL) ? "manual" :
 			"unknown");
 }
 
@@ -248,6 +249,7 @@ static ssize_t pm_mng_profile_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
 	struct hl_device *hdev = dev_get_drvdata(dev);
+	struct goya_device *goya = hdev->asic_specific;
 
 	if (!hl_device_operational(hdev, NULL)) {
 		count = -ENODEV;
@@ -256,7 +258,7 @@ static ssize_t pm_mng_profile_store(struct device *dev,
 
 	mutex_lock(&hdev->fpriv_list_lock);
 
-	if (hdev->compute_ctx) {
+	if (hdev->is_compute_ctx_active) {
 		dev_err(hdev->dev,
 			"Can't change PM profile while compute context is opened on the device\n");
 		count = -EPERM;
@@ -265,26 +267,27 @@ static ssize_t pm_mng_profile_store(struct device *dev,
 
 	if (strncmp("auto", buf, strlen("auto")) == 0) {
 		/* Make sure we are in LOW PLL when changing modes */
-		if (hdev->pm_mng_profile == PM_MANUAL) {
-			hdev->curr_pll_profile = PLL_HIGH;
-			hdev->pm_mng_profile = PM_AUTO;
-			hl_device_set_frequency(hdev, PLL_LOW);
+		if (goya->pm_mng_profile == PM_MANUAL) {
+			goya->curr_pll_profile = PLL_HIGH;
+			goya->pm_mng_profile = PM_AUTO;
+			goya_set_frequency(hdev, PLL_LOW);
 		}
 	} else if (strncmp("manual", buf, strlen("manual")) == 0) {
-		if (hdev->pm_mng_profile == PM_AUTO) {
+		if (goya->pm_mng_profile == PM_AUTO) {
 			/* Must release the lock because the work thread also
 			 * takes this lock. But before we release it, set
 			 * the mode to manual so nothing will change if a user
 			 * suddenly opens the device
 			 */
-			hdev->pm_mng_profile = PM_MANUAL;
+			goya->pm_mng_profile = PM_MANUAL;
 
 			mutex_unlock(&hdev->fpriv_list_lock);
 
 			/* Flush the current work so we can return to the user
 			 * knowing that he is the only one changing frequencies
 			 */
-			flush_delayed_work(&hdev->work_freq);
+			if (goya->goya_work)
+				flush_delayed_work(&goya->goya_work->work_freq);
 
 			return count;
 		}
diff --git a/drivers/misc/habanalabs/include/common/cpucp_if.h b/drivers/misc/habanalabs/include/common/cpucp_if.h
index ae13231fda94..737c39f33f05 100644
--- a/drivers/misc/habanalabs/include/common/cpucp_if.h
+++ b/drivers/misc/habanalabs/include/common/cpucp_if.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0
  *
- * Copyright 2020 HabanaLabs, Ltd.
+ * Copyright 2020-2021 HabanaLabs, Ltd.
  * All Rights Reserved.
  *
  */
@@ -376,6 +376,19 @@ enum pq_init_status {
  *       and QMANs. The f/w will return a bitmask where each bit represents
  *       a different engine or QMAN according to enum cpucp_idle_mask.
  *       The bit will be 1 if the engine is NOT idle.
+ *
+ * CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET -
+ *       Fetch all HBM replaced-rows and prending to be replaced rows data.
+ *
+ * CPUCP_PACKET_HBM_PENDING_ROWS_STATUS -
+ *       Fetch status of HBM rows pending replacement and need a reboot to
+ *       be replaced.
+ *
+ * CPUCP_PACKET_POWER_SET -
+ *       Resets power history of device to 0
+ *
+ * CPUCP_PACKET_ENGINE_CORE_ASID_SET -
+ *       Packet to perform engine core ASID configuration
  */
 
 enum cpucp_packet_id {
@@ -421,6 +434,11 @@ enum cpucp_packet_id {
 	CPUCP_PACKET_NIC_STAT_REGS_CLR,		/* internal */
 	CPUCP_PACKET_NIC_STAT_REGS_ALL_GET,	/* internal */
 	CPUCP_PACKET_IS_IDLE_CHECK,		/* internal */
+	CPUCP_PACKET_HBM_REPLACED_ROWS_INFO_GET,/* internal */
+	CPUCP_PACKET_HBM_PENDING_ROWS_STATUS,	/* internal */
+	CPUCP_PACKET_POWER_SET,			/* internal */
+	CPUCP_PACKET_RESERVED,			/* not used */
+	CPUCP_PACKET_ENGINE_CORE_ASID_SET,	/* internal */
 };
 
 #define CPUCP_PACKET_FENCE_VAL	0xFE8CE7A5
@@ -480,7 +498,14 @@ struct cpucp_packet {
 			__u8 i2c_bus;
 			__u8 i2c_addr;
 			__u8 i2c_reg;
-			__u8 pad; /* unused */
+			/*
+			 * In legacy implemetations, i2c_len was not present,
+			 * was unused and just added as pad.
+			 * So if i2c_len is 0, it is treated as legacy
+			 * and r/w 1 Byte, else if i2c_len is specified,
+			 * its treated as new multibyte r/w support.
+			 */
+			__u8 i2c_len;
 		};
 
 		struct {/* For PLL info fetch */
@@ -688,6 +713,7 @@ struct eq_generic_event {
 #define CPUCP_MAX_NIC_LANES		(CPUCP_MAX_NICS * CPUCP_LANES_PER_NIC)
 #define CPUCP_NIC_MASK_ARR_LEN		((CPUCP_MAX_NICS + 63) / 64)
 #define CPUCP_NIC_POLARITY_ARR_LEN	((CPUCP_MAX_NIC_LANES + 63) / 64)
+#define CPUCP_HBM_ROW_REPLACE_MAX	32
 
 struct cpucp_sensor {
 	__le32 type;
@@ -740,6 +766,7 @@ struct cpucp_security_info {
  * @fuse_version: silicon production FUSE information.
  * @thermal_version: thermald S/W version.
  * @cpucp_version: CpuCP S/W version.
+ * @infineon_second_stage_version: Infineon 2nd stage DC-DC version.
  * @dram_size: available DRAM size.
  * @card_name: card name that will be displayed in HWMON subsystem on the host
  * @sec_info: security information
@@ -749,6 +776,10 @@ struct cpucp_security_info {
  * @dram_binning_mask: DRAM binning mask, 1 bit per dram instance
  *                     (0 = functional 1 = binned)
  * @memory_repair_flag: eFuse flag indicating memory repair
+ * @edma_binning_mask: EDMA binning mask, 1 bit per EDMA instance
+ *                     (0 = functional 1 = binned)
+ * @xbar_binning_mask: Xbar binning mask, 1 bit per Xbar instance
+ *                     (0 = functional 1 = binned)
  */
 struct cpucp_info {
 	struct cpucp_sensor sensors[CPUCP_MAX_SENSORS];
@@ -761,7 +792,7 @@ struct cpucp_info {
 	__u8 fuse_version[VERSION_MAX_LEN];
 	__u8 thermal_version[VERSION_MAX_LEN];
 	__u8 cpucp_version[VERSION_MAX_LEN];
-	__le32 reserved2;
+	__le32 infineon_second_stage_version;
 	__le64 dram_size;
 	char card_name[CARD_NAME_MAX_LEN];
 	__le64 reserved3;
@@ -769,7 +800,9 @@ struct cpucp_info {
 	__u8 reserved5;
 	__u8 dram_binning_mask;
 	__u8 memory_repair_flag;
-	__u8 pad[5];
+	__u8 edma_binning_mask;
+	__u8 xbar_binning_mask;
+	__u8 pad[3];
 	struct cpucp_security_info sec_info;
 	__le32 reserved6;
 	__u8 pll_map[PLL_MAP_LEN];
@@ -833,4 +866,25 @@ struct cpucp_nic_status {
 	__le32 high_ber_cnt;
 };
 
+enum cpucp_hbm_row_replace_cause {
+	REPLACE_CAUSE_DOUBLE_ECC_ERR,
+	REPLACE_CAUSE_MULTI_SINGLE_ECC_ERR,
+};
+
+struct cpucp_hbm_row_info {
+	__u8 hbm_idx;
+	__u8 pc;
+	__u8 sid;
+	__u8 bank_idx;
+	__le16 row_addr;
+	__u8 replaced_row_cause; /* enum cpucp_hbm_row_replace_cause */
+	__u8 pad;
+};
+
+struct cpucp_hbm_row_replaced_rows_info {
+	__le16 num_replaced_rows;
+	__u8 pad[6];
+	struct cpucp_hbm_row_info replaced_rows[CPUCP_HBM_ROW_REPLACE_MAX];
+};
+
 #endif /* CPUCP_IF_H */
diff --git a/drivers/misc/habanalabs/include/common/hl_boot_if.h b/drivers/misc/habanalabs/include/common/hl_boot_if.h
index 2626df6ef3ef..135e21d6edc9 100644
--- a/drivers/misc/habanalabs/include/common/hl_boot_if.h
+++ b/drivers/misc/habanalabs/include/common/hl_boot_if.h
@@ -32,6 +32,7 @@ enum cpu_boot_err {
 	CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL = 13,
 	CPU_BOOT_ERR_BOOT_FW_CRIT_ERR = 18,
 	CPU_BOOT_ERR_BINNING_FAIL = 19,
+	CPU_BOOT_ERR_TPM_FAIL = 20,
 	CPU_BOOT_ERR_ENABLED = 31,
 	CPU_BOOT_ERR_SCND_EN = 63,
 	CPU_BOOT_ERR_LAST = 64 /* we have 2 registers of 32 bits */
@@ -108,6 +109,8 @@ enum cpu_boot_err {
  *					malfunctioning components might still be
  *					in use.
  *
+ * CPU_BOOT_ERR0_TPM_FAIL		TPM verification flow failed.
+ *
  * CPU_BOOT_ERR0_ENABLED		Error registers enabled.
  *					This is a main indication that the
  *					running FW populates the error
@@ -130,6 +133,7 @@ enum cpu_boot_err {
 #define CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL	(1 << CPU_BOOT_ERR_DEVICE_UNUSABLE_FAIL)
 #define CPU_BOOT_ERR0_BOOT_FW_CRIT_ERR		(1 << CPU_BOOT_ERR_BOOT_FW_CRIT_ERR)
 #define CPU_BOOT_ERR0_BINNING_FAIL		(1 << CPU_BOOT_ERR_BINNING_FAIL)
+#define CPU_BOOT_ERR0_TPM_FAIL			(1 << CPU_BOOT_ERR_TPM_FAIL)
 #define CPU_BOOT_ERR0_ENABLED			(1 << CPU_BOOT_ERR_ENABLED)
 #define CPU_BOOT_ERR1_ENABLED			(1 << CPU_BOOT_ERR_ENABLED)
 
diff --git a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h
index dedf20e8f956..758f246627f8 100644
--- a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h
+++ b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_general.h
@@ -16,27 +16,18 @@
 #define PAGE_PRESENT_MASK		0x0000000000001ull
 #define SWAP_OUT_MASK			0x0000000000004ull
 #define LAST_MASK			0x0000000000800ull
-#define HOP0_MASK			0x3000000000000ull
-#define HOP1_MASK			0x0FF8000000000ull
-#define HOP2_MASK			0x0007FC0000000ull
-#define HOP3_MASK			0x000003FE00000ull
-#define HOP4_MASK			0x00000001FF000ull
 #define FLAGS_MASK			0x0000000000FFFull
 
-#define HOP0_SHIFT			48
-#define HOP1_SHIFT			39
-#define HOP2_SHIFT			30
-#define HOP3_SHIFT			21
-#define HOP4_SHIFT			12
-
 #define MMU_ARCH_5_HOPS			5
 
 #define HOP_PHYS_ADDR_MASK		(~FLAGS_MASK)
 
 #define HL_PTE_SIZE			sizeof(u64)
-#define HOP_TABLE_SIZE			PAGE_SIZE_4KB
-#define PTE_ENTRIES_IN_HOP		(HOP_TABLE_SIZE / HL_PTE_SIZE)
-#define HOP0_TABLES_TOTAL_SIZE		(HOP_TABLE_SIZE * MAX_ASID)
+
+/* definitions for HOP with 512 PTE entries */
+#define HOP_PTE_ENTRIES_512		512
+#define HOP_TABLE_SIZE_512_PTE		(HOP_PTE_ENTRIES_512 * HL_PTE_SIZE)
+#define HOP0_512_PTE_TABLES_TOTAL_SIZE	(HOP_TABLE_SIZE_512_PTE * MAX_ASID)
 
 #define MMU_HOP0_PA43_12_SHIFT		12
 #define MMU_HOP0_PA49_44_SHIFT		(12 + 32)
diff --git a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h
index 8539dd041f2c..86511002e367 100644
--- a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h
+++ b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_0.h
@@ -8,8 +8,20 @@
 #ifndef INCLUDE_MMU_V1_0_H_
 #define INCLUDE_MMU_V1_0_H_
 
-#define MMU_HOP0_PA43_12	0x490004
-#define MMU_HOP0_PA49_44	0x490008
-#define MMU_ASID_BUSY		0x490000
+#define MMU_V1_0_HOP0_MASK		0x3000000000000ull
+#define MMU_V1_0_HOP1_MASK		0x0FF8000000000ull
+#define MMU_V1_0_HOP2_MASK		0x0007FC0000000ull
+#define MMU_V1_0_HOP3_MASK		0x000003FE00000ull
+#define MMU_V1_0_HOP4_MASK		0x00000001FF000ull
+
+#define MMU_V1_0_HOP0_SHIFT		48
+#define MMU_V1_0_HOP1_SHIFT		39
+#define MMU_V1_0_HOP2_SHIFT		30
+#define MMU_V1_0_HOP3_SHIFT		21
+#define MMU_V1_0_HOP4_SHIFT		12
+
+#define MMU_HOP0_PA43_12		0x490004
+#define MMU_HOP0_PA49_44		0x490008
+#define MMU_ASID_BUSY			0x490000
 
 #endif /* INCLUDE_MMU_V1_0_H_ */
diff --git a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_1.h b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_1.h
index b2a9570583ac..9c727a5d47b4 100644
--- a/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_1.h
+++ b/drivers/misc/habanalabs/include/hw_ip/mmu/mmu_v1_1.h
@@ -8,9 +8,21 @@
 #ifndef INCLUDE_MMU_V1_1_H_
 #define INCLUDE_MMU_V1_1_H_
 
-#define MMU_ASID		0xC12004
-#define MMU_HOP0_PA43_12	0xC12008
-#define MMU_HOP0_PA49_44	0xC1200C
-#define MMU_BUSY		0xC12000
+#define MMU_V1_1_HOP0_MASK		0x3000000000000ull
+#define MMU_V1_1_HOP1_MASK		0x0FF8000000000ull
+#define MMU_V1_1_HOP2_MASK		0x0007FC0000000ull
+#define MMU_V1_1_HOP3_MASK		0x000003FE00000ull
+#define MMU_V1_1_HOP4_MASK		0x00000001FF000ull
+
+#define MMU_V1_1_HOP0_SHIFT		48
+#define MMU_V1_1_HOP1_SHIFT		39
+#define MMU_V1_1_HOP2_SHIFT		30
+#define MMU_V1_1_HOP3_SHIFT		21
+#define MMU_V1_1_HOP4_SHIFT		12
+
+#define MMU_ASID			0xC12004
+#define MMU_HOP0_PA43_12		0xC12008
+#define MMU_HOP0_PA49_44		0xC1200C
+#define MMU_BUSY			0xC12000
 
 #endif /* INCLUDE_MMU_V1_1_H_ */