mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 10:40:15 +02:00 
			
		
		
		
	drm/amdgpu: Refactor mode2 reset logic for v13.0.2
Use IP version and refactor reset logic to apply to a list of devices. Signed-off-by: Lijo Lazar <lijo.lazar@amd.com> Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com> Reviewed-by: Le Ma <Le.Ma@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
		
							parent
							
								
									90d282582a
								
							
						
					
					
						commit
						9e08564727
					
				
					 2 changed files with 54 additions and 20 deletions
				
			
		| 
						 | 
				
			
			@ -31,6 +31,17 @@
 | 
			
		|||
#include "amdgpu_psp.h"
 | 
			
		||||
#include "amdgpu_xgmi.h"
 | 
			
		||||
 | 
			
		||||
static bool aldebaran_is_mode2_default(struct amdgpu_reset_control *reset_ctl)
 | 
			
		||||
{
 | 
			
		||||
	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
 | 
			
		||||
 | 
			
		||||
	if ((adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
 | 
			
		||||
	     adev->gmc.xgmi.connected_to_cpu))
 | 
			
		||||
		return true;
 | 
			
		||||
 | 
			
		||||
	return false;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static struct amdgpu_reset_handler *
 | 
			
		||||
aldebaran_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
 | 
			
		||||
			    struct amdgpu_reset_context *reset_context)
 | 
			
		||||
| 
						 | 
				
			
			@ -48,7 +59,7 @@ aldebaran_get_reset_handler(struct amdgpu_reset_control *reset_ctl,
 | 
			
		|||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	if (adev->gmc.xgmi.connected_to_cpu) {
 | 
			
		||||
	if (aldebaran_is_mode2_default(reset_ctl)) {
 | 
			
		||||
		list_for_each_entry(handler, &reset_ctl->reset_handlers,
 | 
			
		||||
				     handler_list) {
 | 
			
		||||
			if (handler->reset_method == AMD_RESET_METHOD_MODE2) {
 | 
			
		||||
| 
						 | 
				
			
			@ -136,18 +147,31 @@ static int
 | 
			
		|||
aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
 | 
			
		||||
			      struct amdgpu_reset_context *reset_context)
 | 
			
		||||
{
 | 
			
		||||
	struct amdgpu_device *tmp_adev = NULL;
 | 
			
		||||
	struct amdgpu_device *adev = (struct amdgpu_device *)reset_ctl->handle;
 | 
			
		||||
	struct amdgpu_device *tmp_adev = NULL;
 | 
			
		||||
	struct list_head reset_device_list;
 | 
			
		||||
	int r = 0;
 | 
			
		||||
 | 
			
		||||
	dev_dbg(adev->dev, "aldebaran perform hw reset\n");
 | 
			
		||||
	if (reset_context->hive == NULL) {
 | 
			
		||||
	if (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2) &&
 | 
			
		||||
	    reset_context->hive == NULL) {
 | 
			
		||||
		/* Wrong context, return error */
 | 
			
		||||
		return -EINVAL;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	list_for_each_entry(tmp_adev, &reset_context->hive->device_list,
 | 
			
		||||
			     gmc.xgmi.head) {
 | 
			
		||||
	INIT_LIST_HEAD(&reset_device_list);
 | 
			
		||||
	if (reset_context->hive) {
 | 
			
		||||
		list_for_each_entry (tmp_adev,
 | 
			
		||||
				     &reset_context->hive->device_list,
 | 
			
		||||
				     gmc.xgmi.head)
 | 
			
		||||
			list_add_tail(&tmp_adev->reset_list,
 | 
			
		||||
				      &reset_device_list);
 | 
			
		||||
	} else {
 | 
			
		||||
		list_add_tail(&reset_context->reset_req_dev->reset_list,
 | 
			
		||||
			      &reset_device_list);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
 | 
			
		||||
		mutex_lock(&tmp_adev->reset_cntl->reset_lock);
 | 
			
		||||
		tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_MODE2;
 | 
			
		||||
	}
 | 
			
		||||
| 
						 | 
				
			
			@ -155,8 +179,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
 | 
			
		|||
	 * Mode2 reset doesn't need any sync between nodes in XGMI hive, instead launch
 | 
			
		||||
	 * them together so that they can be completed asynchronously on multiple nodes
 | 
			
		||||
	 */
 | 
			
		||||
	list_for_each_entry(tmp_adev, &reset_context->hive->device_list,
 | 
			
		||||
			     gmc.xgmi.head) {
 | 
			
		||||
	list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
 | 
			
		||||
		/* For XGMI run all resets in parallel to speed up the process */
 | 
			
		||||
		if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
 | 
			
		||||
			if (!queue_work(system_unbound_wq,
 | 
			
		||||
| 
						 | 
				
			
			@ -174,9 +197,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
 | 
			
		|||
 | 
			
		||||
	/* For XGMI wait for all resets to complete before proceed */
 | 
			
		||||
	if (!r) {
 | 
			
		||||
		list_for_each_entry(tmp_adev,
 | 
			
		||||
				     &reset_context->hive->device_list,
 | 
			
		||||
				     gmc.xgmi.head) {
 | 
			
		||||
		list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
 | 
			
		||||
			if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
 | 
			
		||||
				flush_work(&tmp_adev->reset_cntl->reset_work);
 | 
			
		||||
				r = tmp_adev->asic_reset_res;
 | 
			
		||||
| 
						 | 
				
			
			@ -186,8 +207,7 @@ aldebaran_mode2_perform_reset(struct amdgpu_reset_control *reset_ctl,
 | 
			
		|||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	list_for_each_entry(tmp_adev, &reset_context->hive->device_list,
 | 
			
		||||
			     gmc.xgmi.head) {
 | 
			
		||||
	list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
 | 
			
		||||
		mutex_unlock(&tmp_adev->reset_cntl->reset_lock);
 | 
			
		||||
		tmp_adev->reset_cntl->active_reset = AMD_RESET_METHOD_NONE;
 | 
			
		||||
	}
 | 
			
		||||
| 
						 | 
				
			
			@ -319,16 +339,30 @@ static int
 | 
			
		|||
aldebaran_mode2_restore_hwcontext(struct amdgpu_reset_control *reset_ctl,
 | 
			
		||||
				  struct amdgpu_reset_context *reset_context)
 | 
			
		||||
{
 | 
			
		||||
	int r;
 | 
			
		||||
	struct amdgpu_device *tmp_adev = NULL;
 | 
			
		||||
	struct list_head reset_device_list;
 | 
			
		||||
	int r;
 | 
			
		||||
 | 
			
		||||
	if (reset_context->hive == NULL) {
 | 
			
		||||
	if (reset_context->reset_req_dev->ip_versions[MP1_HWIP][0] ==
 | 
			
		||||
		    IP_VERSION(13, 0, 2) &&
 | 
			
		||||
	    reset_context->hive == NULL) {
 | 
			
		||||
		/* Wrong context, return error */
 | 
			
		||||
		return -EINVAL;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	list_for_each_entry(tmp_adev, &reset_context->hive->device_list,
 | 
			
		||||
			     gmc.xgmi.head) {
 | 
			
		||||
	INIT_LIST_HEAD(&reset_device_list);
 | 
			
		||||
	if (reset_context->hive) {
 | 
			
		||||
		list_for_each_entry (tmp_adev,
 | 
			
		||||
				     &reset_context->hive->device_list,
 | 
			
		||||
				     gmc.xgmi.head)
 | 
			
		||||
			list_add_tail(&tmp_adev->reset_list,
 | 
			
		||||
				      &reset_device_list);
 | 
			
		||||
	} else {
 | 
			
		||||
		list_add_tail(&reset_context->reset_req_dev->reset_list,
 | 
			
		||||
			      &reset_device_list);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	list_for_each_entry (tmp_adev, &reset_device_list, reset_list) {
 | 
			
		||||
		dev_info(tmp_adev->dev,
 | 
			
		||||
			 "GPU reset succeeded, trying to resume\n");
 | 
			
		||||
		r = aldebaran_mode2_restore_ip(tmp_adev);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -36,8 +36,8 @@ int amdgpu_reset_init(struct amdgpu_device *adev)
 | 
			
		|||
{
 | 
			
		||||
	int ret = 0;
 | 
			
		||||
 | 
			
		||||
	switch (adev->asic_type) {
 | 
			
		||||
	case CHIP_ALDEBARAN:
 | 
			
		||||
	switch (adev->ip_versions[MP1_HWIP][0]) {
 | 
			
		||||
	case IP_VERSION(13, 0, 2):
 | 
			
		||||
		ret = aldebaran_reset_init(adev);
 | 
			
		||||
		break;
 | 
			
		||||
	default:
 | 
			
		||||
| 
						 | 
				
			
			@ -51,8 +51,8 @@ int amdgpu_reset_fini(struct amdgpu_device *adev)
 | 
			
		|||
{
 | 
			
		||||
	int ret = 0;
 | 
			
		||||
 | 
			
		||||
	switch (adev->asic_type) {
 | 
			
		||||
	case CHIP_ALDEBARAN:
 | 
			
		||||
	switch (adev->ip_versions[MP1_HWIP][0]) {
 | 
			
		||||
	case IP_VERSION(13, 0, 2):
 | 
			
		||||
		ret = aldebaran_reset_fini(adev);
 | 
			
		||||
		break;
 | 
			
		||||
	default:
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue