summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYiPeng Chai <YiPeng.Chai@amd.com>2024-03-22 05:06:01 +0100
committerAlex Deucher <alexander.deucher@amd.com>2024-04-26 23:22:41 +0200
commite74313be5a71df63e307ad98b6ab202b8a222817 (patch)
tree591d702bd77608d20ecef578938f24b2bbe000cf
parentdrm/amdgpu: Add delay work to retire bad pages (diff)
downloadlinux-e74313be5a71df63e307ad98b6ab202b8a222817.tar.xz
linux-e74313be5a71df63e307ad98b6ab202b8a222817.zip
drm/amdgpu: add condition check for amdgpu_umc_fill_error_record
Add condition check for amdgpu_umc_fill_error_record. Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com> Reviewed-by: Tao Zhou <tao.zhou1@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c20
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h2
3 files changed, 19 insertions, 4 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index cb5a0f31d201..c8980d5f6540 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -579,6 +579,7 @@ struct ras_err_data {
unsigned long de_count;
unsigned long err_addr_cnt;
struct eeprom_table_record *err_addr;
+ unsigned long err_addr_len;
u32 err_list_count;
struct list_head err_node_list;
};
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
index 2bd88218c20e..dcda3d24bee3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.c
@@ -66,6 +66,8 @@ int amdgpu_umc_page_retirement_mca(struct amdgpu_device *adev,
goto out_fini_err_data;
}
+ err_data.err_addr_len = adev->umc.max_ras_err_cnt_per_query;
+
/*
* Translate UMC channel address to Physical address
*/
@@ -121,6 +123,8 @@ void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
if(!err_data->err_addr)
dev_warn(adev->dev, "Failed to alloc memory for "
"umc error address record!\n");
+ else
+ err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
/* umc query_ras_error_address is also responsible for clearing
* error status
@@ -146,6 +150,8 @@ void amdgpu_umc_handle_bad_pages(struct amdgpu_device *adev,
if(!err_data->err_addr)
dev_warn(adev->dev, "Failed to alloc memory for "
"umc error address record!\n");
+ else
+ err_data->err_addr_len = adev->umc.max_ras_err_cnt_per_query;
/* umc query_ras_error_address is also responsible for clearing
* error status
@@ -389,14 +395,20 @@ int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
return 0;
}
-void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
+int amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
uint64_t err_addr,
uint64_t retired_page,
uint32_t channel_index,
uint32_t umc_inst)
{
- struct eeprom_table_record *err_rec =
- &err_data->err_addr[err_data->err_addr_cnt];
+ struct eeprom_table_record *err_rec;
+
+ if (!err_data ||
+ !err_data->err_addr ||
+ (err_data->err_addr_cnt >= err_data->err_addr_len))
+ return -EINVAL;
+
+ err_rec = &err_data->err_addr[err_data->err_addr_cnt];
err_rec->address = err_addr;
/* page frame address is saved */
@@ -408,6 +420,8 @@ void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
err_rec->mcumc_id = umc_inst;
err_data->err_addr_cnt++;
+
+ return 0;
}
int amdgpu_umc_loop_channels(struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
index 2d08d076f7c9..9e77e6d48e3b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_umc.h
@@ -109,7 +109,7 @@ int amdgpu_umc_poison_handler(struct amdgpu_device *adev,
int amdgpu_umc_process_ecc_irq(struct amdgpu_device *adev,
struct amdgpu_irq_src *source,
struct amdgpu_iv_entry *entry);
-void amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
+int amdgpu_umc_fill_error_record(struct ras_err_data *err_data,
uint64_t err_addr,
uint64_t retired_page,
uint32_t channel_index,