mirror of
https://github.com/torvalds/linux.git
synced 2025-11-02 17:49:03 +02:00
drm/amdgpu: Add helper to initialize badpage info
Add a separate function to read badpage data during initialization. Reading bad pages will need hardware access and cannot be done during reset. Hence in cases where device needs a full reset during init itself, attempting to read will cause a deadlock. Signed-off-by: Lijo Lazar <lijo.lazar@amd.com> Reviewed-by: Feifei Xu <Feifei.Xu@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Acked-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com> Tested-by: Rajneesh Bhardwaj <rajneesh.bhardwaj@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
parent
0ee2399116
commit
b17f87329d
3 changed files with 41 additions and 21 deletions
|
|
@ -2953,7 +2953,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
|
|||
* Note: theoretically, this should be called before all vram allocations
|
||||
* to protect retired page from abusing
|
||||
*/
|
||||
r = amdgpu_ras_recovery_init(adev);
|
||||
r = amdgpu_ras_recovery_init(adev, true);
|
||||
if (r)
|
||||
goto init_failed;
|
||||
|
||||
|
|
|
|||
|
|
@ -3146,7 +3146,42 @@ static int amdgpu_ras_page_retirement_thread(void *param)
|
|||
return 0;
|
||||
}
|
||||
|
||||
int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
||||
int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
int ret;
|
||||
|
||||
if (!con || amdgpu_sriov_vf(adev))
|
||||
return 0;
|
||||
|
||||
ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
|
||||
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
/* HW not usable */
|
||||
if (amdgpu_ras_is_rma(adev))
|
||||
return -EHWPOISON;
|
||||
|
||||
if (con->eeprom_control.ras_num_recs) {
|
||||
ret = amdgpu_ras_load_bad_pages(adev);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
amdgpu_dpm_send_hbm_bad_pages_num(
|
||||
adev, con->eeprom_control.ras_num_recs);
|
||||
|
||||
if (con->update_channel_flag == true) {
|
||||
amdgpu_dpm_send_hbm_bad_channel_flag(
|
||||
adev, con->eeprom_control.bad_channel_bitmap);
|
||||
con->update_channel_flag = false;
|
||||
}
|
||||
}
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info)
|
||||
{
|
||||
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
|
||||
struct ras_err_handler_data **data;
|
||||
|
|
@ -3187,25 +3222,10 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
|
|||
*/
|
||||
if (adev->init_lvl->level == AMDGPU_INIT_LEVEL_MINIMAL_XGMI)
|
||||
return 0;
|
||||
ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
|
||||
/*
|
||||
* This calling fails when is_rma is true or
|
||||
* ret != 0.
|
||||
*/
|
||||
if (amdgpu_ras_is_rma(adev) || ret)
|
||||
goto free;
|
||||
|
||||
if (con->eeprom_control.ras_num_recs) {
|
||||
ret = amdgpu_ras_load_bad_pages(adev);
|
||||
if (init_bp_info) {
|
||||
ret = amdgpu_ras_init_badpage_info(adev);
|
||||
if (ret)
|
||||
goto free;
|
||||
|
||||
amdgpu_dpm_send_hbm_bad_pages_num(adev, con->eeprom_control.ras_num_recs);
|
||||
|
||||
if (con->update_channel_flag == true) {
|
||||
amdgpu_dpm_send_hbm_bad_channel_flag(adev, con->eeprom_control.bad_channel_bitmap);
|
||||
con->update_channel_flag = false;
|
||||
}
|
||||
}
|
||||
|
||||
mutex_init(&con->page_rsv_lock);
|
||||
|
|
|
|||
|
|
@ -736,8 +736,8 @@ struct amdgpu_ras_block_hw_ops {
|
|||
* 8: feature disable
|
||||
*/
|
||||
|
||||
|
||||
int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
|
||||
int amdgpu_ras_init_badpage_info(struct amdgpu_device *adev);
|
||||
int amdgpu_ras_recovery_init(struct amdgpu_device *adev, bool init_bp_info);
|
||||
|
||||
void amdgpu_ras_resume(struct amdgpu_device *adev);
|
||||
void amdgpu_ras_suspend(struct amdgpu_device *adev);
|
||||
|
|
|
|||
Loading…
Reference in a new issue