forked from mirrors/linux
		
	drm/amdgpu: fix vf error handling
The error handling for virtual functions assumed a single vf per VM and didn't properly account for bare metal. Make the error arrays per device and add locking.

Reviewed-by: Gavin Wan <gavin.wan@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
		
							parent
							
								
									6f87a89570
								
							
						
					
					
						commit
						e23b74aab5
					
				
					 4 changed files with 54 additions and 41 deletions
				
			
		| 
						 | 
					@ -2040,6 +2040,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 | 
				
			||||||
	mutex_init(&adev->srbm_mutex);
 | 
						mutex_init(&adev->srbm_mutex);
 | 
				
			||||||
	mutex_init(&adev->grbm_idx_mutex);
 | 
						mutex_init(&adev->grbm_idx_mutex);
 | 
				
			||||||
	mutex_init(&adev->mn_lock);
 | 
						mutex_init(&adev->mn_lock);
 | 
				
			||||||
 | 
						mutex_init(&adev->virt.vf_errors.lock);
 | 
				
			||||||
	hash_init(adev->mn_hash);
 | 
						hash_init(adev->mn_hash);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	amdgpu_check_arguments(adev);
 | 
						amdgpu_check_arguments(adev);
 | 
				
			||||||
| 
						 | 
					@ -2125,7 +2126,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 | 
				
			||||||
	r = amdgpu_atombios_init(adev);
 | 
						r = amdgpu_atombios_init(adev);
 | 
				
			||||||
	if (r) {
 | 
						if (r) {
 | 
				
			||||||
		dev_err(adev->dev, "amdgpu_atombios_init failed\n");
 | 
							dev_err(adev->dev, "amdgpu_atombios_init failed\n");
 | 
				
			||||||
		amdgpu_vf_error_put(AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
 | 
							amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
 | 
				
			||||||
		goto failed;
 | 
							goto failed;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2136,7 +2137,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 | 
				
			||||||
	if (amdgpu_vpost_needed(adev)) {
 | 
						if (amdgpu_vpost_needed(adev)) {
 | 
				
			||||||
		if (!adev->bios) {
 | 
							if (!adev->bios) {
 | 
				
			||||||
			dev_err(adev->dev, "no vBIOS found\n");
 | 
								dev_err(adev->dev, "no vBIOS found\n");
 | 
				
			||||||
			amdgpu_vf_error_put(AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
 | 
								amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
 | 
				
			||||||
			r = -EINVAL;
 | 
								r = -EINVAL;
 | 
				
			||||||
			goto failed;
 | 
								goto failed;
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
| 
						 | 
					@ -2144,7 +2145,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 | 
				
			||||||
		r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
 | 
							r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
 | 
				
			||||||
		if (r) {
 | 
							if (r) {
 | 
				
			||||||
			dev_err(adev->dev, "gpu post error!\n");
 | 
								dev_err(adev->dev, "gpu post error!\n");
 | 
				
			||||||
			amdgpu_vf_error_put(AMDGIM_ERROR_VF_GPU_POST_ERROR, 0, 0);
 | 
								amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_POST_ERROR, 0, 0);
 | 
				
			||||||
			goto failed;
 | 
								goto failed;
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
	} else {
 | 
						} else {
 | 
				
			||||||
| 
						 | 
					@ -2156,7 +2157,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 | 
				
			||||||
		r = amdgpu_atomfirmware_get_clock_info(adev);
 | 
							r = amdgpu_atomfirmware_get_clock_info(adev);
 | 
				
			||||||
		if (r) {
 | 
							if (r) {
 | 
				
			||||||
			dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
 | 
								dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
 | 
				
			||||||
			amdgpu_vf_error_put(AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
 | 
								amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
 | 
				
			||||||
			goto failed;
 | 
								goto failed;
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
	} else {
 | 
						} else {
 | 
				
			||||||
| 
						 | 
					@ -2164,7 +2165,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 | 
				
			||||||
		r = amdgpu_atombios_get_clock_info(adev);
 | 
							r = amdgpu_atombios_get_clock_info(adev);
 | 
				
			||||||
		if (r) {
 | 
							if (r) {
 | 
				
			||||||
			dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
 | 
								dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
 | 
				
			||||||
			amdgpu_vf_error_put(AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
 | 
								amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
 | 
				
			||||||
			goto failed;
 | 
								goto failed;
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
		/* init i2c buses */
 | 
							/* init i2c buses */
 | 
				
			||||||
| 
						 | 
					@ -2175,7 +2176,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 | 
				
			||||||
	r = amdgpu_fence_driver_init(adev);
 | 
						r = amdgpu_fence_driver_init(adev);
 | 
				
			||||||
	if (r) {
 | 
						if (r) {
 | 
				
			||||||
		dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
 | 
							dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
 | 
				
			||||||
		amdgpu_vf_error_put(AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
 | 
							amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
 | 
				
			||||||
		goto failed;
 | 
							goto failed;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2185,7 +2186,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 | 
				
			||||||
	r = amdgpu_init(adev);
 | 
						r = amdgpu_init(adev);
 | 
				
			||||||
	if (r) {
 | 
						if (r) {
 | 
				
			||||||
		dev_err(adev->dev, "amdgpu_init failed\n");
 | 
							dev_err(adev->dev, "amdgpu_init failed\n");
 | 
				
			||||||
		amdgpu_vf_error_put(AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
 | 
							amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
 | 
				
			||||||
		amdgpu_fini(adev);
 | 
							amdgpu_fini(adev);
 | 
				
			||||||
		goto failed;
 | 
							goto failed;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
| 
						 | 
					@ -2205,7 +2206,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 | 
				
			||||||
	r = amdgpu_ib_pool_init(adev);
 | 
						r = amdgpu_ib_pool_init(adev);
 | 
				
			||||||
	if (r) {
 | 
						if (r) {
 | 
				
			||||||
		dev_err(adev->dev, "IB initialization failed (%d).\n", r);
 | 
							dev_err(adev->dev, "IB initialization failed (%d).\n", r);
 | 
				
			||||||
		amdgpu_vf_error_put(AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
 | 
							amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
 | 
				
			||||||
		goto failed;
 | 
							goto failed;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2254,7 +2255,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 | 
				
			||||||
	r = amdgpu_late_init(adev);
 | 
						r = amdgpu_late_init(adev);
 | 
				
			||||||
	if (r) {
 | 
						if (r) {
 | 
				
			||||||
		dev_err(adev->dev, "amdgpu_late_init failed\n");
 | 
							dev_err(adev->dev, "amdgpu_late_init failed\n");
 | 
				
			||||||
		amdgpu_vf_error_put(AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
 | 
							amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
 | 
				
			||||||
		goto failed;
 | 
							goto failed;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2936,7 +2937,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 | 
				
			||||||
		}
 | 
							}
 | 
				
			||||||
	} else {
 | 
						} else {
 | 
				
			||||||
		dev_err(adev->dev, "asic resume failed (%d).\n", r);
 | 
							dev_err(adev->dev, "asic resume failed (%d).\n", r);
 | 
				
			||||||
		amdgpu_vf_error_put(AMDGIM_ERROR_VF_ASIC_RESUME_FAIL, 0, r);
 | 
							amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ASIC_RESUME_FAIL, 0, r);
 | 
				
			||||||
		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 | 
							for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 | 
				
			||||||
			if (adev->rings[i] && adev->rings[i]->sched.thread) {
 | 
								if (adev->rings[i] && adev->rings[i]->sched.thread) {
 | 
				
			||||||
				kthread_unpark(adev->rings[i]->sched.thread);
 | 
									kthread_unpark(adev->rings[i]->sched.thread);
 | 
				
			||||||
| 
						 | 
					@ -2950,7 +2951,7 @@ int amdgpu_gpu_reset(struct amdgpu_device *adev)
 | 
				
			||||||
	if (r) {
 | 
						if (r) {
 | 
				
			||||||
		/* bad news, how to tell it to userspace ? */
 | 
							/* bad news, how to tell it to userspace ? */
 | 
				
			||||||
		dev_info(adev->dev, "GPU reset failed\n");
 | 
							dev_info(adev->dev, "GPU reset failed\n");
 | 
				
			||||||
		amdgpu_vf_error_put(AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
 | 
							amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
	else {
 | 
						else {
 | 
				
			||||||
		dev_info(adev->dev, "GPU reset successed!\n");
 | 
							dev_info(adev->dev, "GPU reset successed!\n");
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -25,30 +25,21 @@
 | 
				
			||||||
#include "amdgpu_vf_error.h"
 | 
					#include "amdgpu_vf_error.h"
 | 
				
			||||||
#include "mxgpu_ai.h"
 | 
					#include "mxgpu_ai.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define AMDGPU_VF_ERROR_ENTRY_SIZE    16 
 | 
					void amdgpu_vf_error_put(struct amdgpu_device *adev,
 | 
				
			||||||
 | 
								 uint16_t sub_error_code,
 | 
				
			||||||
/* struct error_entry - amdgpu VF error information. */
 | 
								 uint16_t error_flags,
 | 
				
			||||||
struct amdgpu_vf_error_buffer {
 | 
								 uint64_t error_data)
 | 
				
			||||||
	int read_count;
 | 
					 | 
				
			||||||
	int write_count;
 | 
					 | 
				
			||||||
	uint16_t code[AMDGPU_VF_ERROR_ENTRY_SIZE];
 | 
					 | 
				
			||||||
	uint16_t flags[AMDGPU_VF_ERROR_ENTRY_SIZE];
 | 
					 | 
				
			||||||
	uint64_t data[AMDGPU_VF_ERROR_ENTRY_SIZE];
 | 
					 | 
				
			||||||
};
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
struct amdgpu_vf_error_buffer admgpu_vf_errors;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
void amdgpu_vf_error_put(uint16_t sub_error_code, uint16_t error_flags, uint64_t error_data)
 | 
					 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	int index;
 | 
						int index;
 | 
				
			||||||
	uint16_t error_code = AMDGIM_ERROR_CODE(AMDGIM_ERROR_CATEGORY_VF, sub_error_code);
 | 
						uint16_t error_code = AMDGIM_ERROR_CODE(AMDGIM_ERROR_CATEGORY_VF, sub_error_code);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	index = admgpu_vf_errors.write_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
 | 
						mutex_lock(&adev->virt.vf_errors.lock);
 | 
				
			||||||
	admgpu_vf_errors.code [index] = error_code;
 | 
						index = adev->virt.vf_errors.write_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
 | 
				
			||||||
	admgpu_vf_errors.flags [index] = error_flags;
 | 
						adev->virt.vf_errors.code [index] = error_code;
 | 
				
			||||||
	admgpu_vf_errors.data [index] = error_data;
 | 
						adev->virt.vf_errors.flags [index] = error_flags;
 | 
				
			||||||
	admgpu_vf_errors.write_count ++;
 | 
						adev->virt.vf_errors.data [index] = error_data;
 | 
				
			||||||
 | 
						adev->virt.vf_errors.write_count ++;
 | 
				
			||||||
 | 
						mutex_unlock(&adev->virt.vf_errors.lock);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -58,7 +49,8 @@ void amdgpu_vf_error_trans_all(struct amdgpu_device *adev)
 | 
				
			||||||
	u32 data1, data2, data3;
 | 
						u32 data1, data2, data3;
 | 
				
			||||||
	int index;
 | 
						int index;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if ((NULL == adev) || (!amdgpu_sriov_vf(adev)) || (!adev->virt.ops) || (!adev->virt.ops->trans_msg)) {
 | 
						if ((NULL == adev) || (!amdgpu_sriov_vf(adev)) ||
 | 
				
			||||||
 | 
						    (!adev->virt.ops) || (!adev->virt.ops->trans_msg)) {
 | 
				
			||||||
		return;
 | 
							return;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
/*
 | 
					/*
 | 
				
			||||||
| 
						 | 
					@ -68,18 +60,22 @@ void amdgpu_vf_error_trans_all(struct amdgpu_device *adev)
 | 
				
			||||||
		return;
 | 
							return;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
*/
 | 
					*/
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						mutex_lock(&adev->virt.vf_errors.lock);
 | 
				
			||||||
	/* The errors are overlay of array, correct read_count as full. */
 | 
						/* The errors are overlay of array, correct read_count as full. */
 | 
				
			||||||
	if (admgpu_vf_errors.write_count - admgpu_vf_errors.read_count > AMDGPU_VF_ERROR_ENTRY_SIZE) {
 | 
						if (adev->virt.vf_errors.write_count - adev->virt.vf_errors.read_count > AMDGPU_VF_ERROR_ENTRY_SIZE) {
 | 
				
			||||||
		admgpu_vf_errors.read_count = admgpu_vf_errors.write_count - AMDGPU_VF_ERROR_ENTRY_SIZE;
 | 
							adev->virt.vf_errors.read_count = adev->virt.vf_errors.write_count - AMDGPU_VF_ERROR_ENTRY_SIZE;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	while (admgpu_vf_errors.read_count < admgpu_vf_errors.write_count) {
 | 
						while (adev->virt.vf_errors.read_count < adev->virt.vf_errors.write_count) {
 | 
				
			||||||
		index =admgpu_vf_errors.read_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
 | 
							index =adev->virt.vf_errors.read_count % AMDGPU_VF_ERROR_ENTRY_SIZE;
 | 
				
			||||||
		data1 = AMDGIM_ERROR_CODE_FLAGS_TO_MAILBOX (admgpu_vf_errors.code[index], admgpu_vf_errors.flags[index]);
 | 
							data1 = AMDGIM_ERROR_CODE_FLAGS_TO_MAILBOX(adev->virt.vf_errors.code[index],
 | 
				
			||||||
		data2 = admgpu_vf_errors.data[index] & 0xFFFFFFFF;
 | 
												   adev->virt.vf_errors.flags[index]);
 | 
				
			||||||
		data3 = (admgpu_vf_errors.data[index] >> 32) & 0xFFFFFFFF;
 | 
							data2 = adev->virt.vf_errors.data[index] & 0xFFFFFFFF;
 | 
				
			||||||
 | 
							data3 = (adev->virt.vf_errors.data[index] >> 32) & 0xFFFFFFFF;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
		adev->virt.ops->trans_msg(adev, IDH_LOG_VF_ERROR, data1, data2, data3);
 | 
							adev->virt.ops->trans_msg(adev, IDH_LOG_VF_ERROR, data1, data2, data3);
 | 
				
			||||||
		admgpu_vf_errors.read_count ++;
 | 
							adev->virt.vf_errors.read_count ++;
 | 
				
			||||||
	}
 | 
						}
 | 
				
			||||||
 | 
						mutex_unlock(&adev->virt.vf_errors.lock);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -56,7 +56,10 @@ enum AMDGIM_ERROR_CATEGORY {
 | 
				
			||||||
	AMDGIM_ERROR_CATEGORY_MAX
 | 
						AMDGIM_ERROR_CATEGORY_MAX
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
void amdgpu_vf_error_put(uint16_t sub_error_code, uint16_t error_flags, uint64_t error_data);
 | 
					void amdgpu_vf_error_put(struct amdgpu_device *adev,
 | 
				
			||||||
 | 
								 uint16_t sub_error_code,
 | 
				
			||||||
 | 
								 uint16_t error_flags,
 | 
				
			||||||
 | 
								 uint64_t error_data);
 | 
				
			||||||
void amdgpu_vf_error_trans_all (struct amdgpu_device *adev);
 | 
					void amdgpu_vf_error_trans_all (struct amdgpu_device *adev);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#endif /* __VF_ERROR_H__ */
 | 
					#endif /* __VF_ERROR_H__ */
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -36,6 +36,18 @@ struct amdgpu_mm_table {
 | 
				
			||||||
	uint64_t		gpu_addr;
 | 
						uint64_t		gpu_addr;
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#define AMDGPU_VF_ERROR_ENTRY_SIZE    16
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* struct error_entry - amdgpu VF error information. */
 | 
				
			||||||
 | 
					struct amdgpu_vf_error_buffer {
 | 
				
			||||||
 | 
						struct mutex lock;
 | 
				
			||||||
 | 
						int read_count;
 | 
				
			||||||
 | 
						int write_count;
 | 
				
			||||||
 | 
						uint16_t code[AMDGPU_VF_ERROR_ENTRY_SIZE];
 | 
				
			||||||
 | 
						uint16_t flags[AMDGPU_VF_ERROR_ENTRY_SIZE];
 | 
				
			||||||
 | 
						uint64_t data[AMDGPU_VF_ERROR_ENTRY_SIZE];
 | 
				
			||||||
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/**
 | 
					/**
 | 
				
			||||||
 * struct amdgpu_virt_ops - amdgpu device virt operations
 | 
					 * struct amdgpu_virt_ops - amdgpu device virt operations
 | 
				
			||||||
 */
 | 
					 */
 | 
				
			||||||
| 
						 | 
					@ -59,6 +71,7 @@ struct amdgpu_virt {
 | 
				
			||||||
	struct work_struct		flr_work;
 | 
						struct work_struct		flr_work;
 | 
				
			||||||
	struct amdgpu_mm_table		mm_table;
 | 
						struct amdgpu_mm_table		mm_table;
 | 
				
			||||||
	const struct amdgpu_virt_ops	*ops;
 | 
						const struct amdgpu_virt_ops	*ops;
 | 
				
			||||||
 | 
						struct amdgpu_vf_error_buffer   vf_errors;
 | 
				
			||||||
};
 | 
					};
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#define AMDGPU_CSA_SIZE    (8 * 1024)
 | 
					#define AMDGPU_CSA_SIZE    (8 * 1024)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue