drm/amdgpu: Add helper to initialize badpage info

Add a separate function to read bad page data during initialization.
Reading bad pages needs hardware access and cannot be done while the
device is in reset. Hence, in cases where the device needs a full reset
during init itself, attempting the read would cause a deadlock.
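
A sketch of the resulting call pattern (the reset-path caller below is
illustrative; this patch only converts the existing init-time call site):

	/* normal device init: hardware is accessible, read bad pages inline */
	r = amdgpu_ras_recovery_init(adev, true);

	/* a reset-style path could skip the hardware-touching read ... */
	r = amdgpu_ras_recovery_init(adev, false);

	/* ... and perform it separately once hardware access is safe again */
	r = amdgpu_ras_init_badpage_info(adev);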

Signed-off-by: Lijo Lazar <lijo.lazar@amd.com>
Reviewed-by: Feifei Xu <Feifei.Xu@amd.com>
Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
commit b17f87329d (parent 0ee2399116), authored 2024-08-30 11:21:43 +05:30
---
 3 files changed, 41 insertions(+), 21 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -2953,7 +2953,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 	 * Note: theoretically, this should be called before all vram allocations
 	 * to protect retired page from abusing
 	 */
-	r = amdgpu_ras_recovery_init(adev);
+	r = amdgpu_ras_recovery_init(adev, true);
 	if (r)
 		goto init_failed;

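Passing true here preserves the existing behavior of amdgpu_device_ip_init():
during normal device init the hardware is accessible, so the EEPROM read is
safe to do inline. A condensed view of the call site with a descriptive
comment added:

	/* hw access is guaranteed at this point, so the bad-page table is
	 * read inline; failures take amdgpu_ras_recovery_init()'s existing
	 * error paths
	 */
	r = amdgpu_ras_recovery_init(adev, true);
	if (r)
		goto init_failed;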
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -3146,7 +3146,42 @@ static int amdgpu_ras_page_retirement_thread(void *param)
 	return 0;
 }
 
-int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	int ret;
+
+	if (!con || amdgpu_sriov_vf(adev))
+		return 0;
+
+	ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
+	if (ret)
+		return ret;
+
+	/* HW not usable */
+	if (amdgpu_ras_is_rma(adev))
+		return -EHWPOISON;
+
+	if (con->eeprom_control.ras_num_recs) {
+		ret = amdgpu_ras_load_bad_pages(adev);
+		if (ret)
+			return ret;
+
+		amdgpu_dpm_send_hbm_bad_pages_num(
+			adev, con->eeprom_control.ras_num_recs);
+
+		if (con->update_channel_flag == true) {
+			amdgpu_dpm_send_hbm_bad_channel_flag(
+				adev, con->eeprom_control.bad_channel_bitmap);
+			con->update_channel_flag = false;
+		}
+	}
+
+	return ret;
+}
+
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data;
@@ -3187,25 +3222,10 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	 */
 	if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
 		return 0;
 
-	ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
-	/*
-	 * This calling fails when is_rma is true or
-	 * ret != 0.
-	 */
-	if (amdgpu_ras_is_rma(adev) || ret)
-		goto free;
-
-	if (con->eeprom_control.ras_num_recs) {
-		ret = amdgpu_ras_load_bad_pages(adev);
+	if (init_bp_info) {
+		ret = amdgpu_ras_init_badpage_info(adev);
 		if (ret)
 			goto free;
-
-		amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
-
-		if (con->update_channel_flag == true) {
-			amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
-			con->update_channel_flag = false;
-		}
 	}
 
 	mutex_init(&con->page_rsv_lock);
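The distinct -EHWPOISON return lets a caller of the new helper tell a
retired (RMA) device apart from an ordinary read failure; for SR-IOV VFs
the helper returns 0 without touching the EEPROM. An illustrative caller
(not part of this patch):

	ret = amdgpu_ras_init_badpage_info(adev);
	if (ret == -EHWPOISON) {
		/* amdgpu_ras_is_rma() was true: bad-page threshold exceeded,
		 * the device should be retired rather than retried
		 */
		dev_err(adev->dev, "RAS: device is in RMA state\n");
	} else if (ret) {
		/* EEPROM init or bad-page load failed */
		dev_warn(adev->dev, "RAS: bad page init failed (%d)\n", ret);
	}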

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -736,8 +736,9 @@ struct amdgpu_ras_block_hw_ops {
  * 8: feature disable
  */
 
-int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
+int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev);
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info);
 
 void amdgpu_ras_resume(struct amdgpu_device *adev);
 void amdgpu_ras_suspend(struct amdgpu_device *adev);
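
For reference, kernel-doc style summaries of the two entry points (these
comments are descriptive only, derived from the patch; they are not part
of the commit):

	/**
	 * amdgpu_ras_init_badpage_info - read retired-page records from EEPROM
	 * @adev: amdgpu device pointer
	 *
	 * Needs hardware access, so it must not run while the device is in
	 * reset. Returns 0 on success (or for SR-IOV VFs, where it is a
	 * no-op), -EHWPOISON if the device is in an RMA state, or another
	 * negative error code if the EEPROM read or bad-page load fails.
	 */
	int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev);

	/**
	 * amdgpu_ras_recovery_init - set up RAS bad-page recovery
	 * @adev: amdgpu device pointer
	 * @init_bp_info: also read the bad-page table (requires hw access)
	 *
	 * Pass init_bp_info = false in paths where hardware cannot be
	 * touched (e.g. during reset) and call amdgpu_ras_init_badpage_info()
	 * separately once access is restored.
	 */
	int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info);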