mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 10:40:15 +02:00 
			
		
		
		
	drm/amdgpu: Implement DPC recovery
Add PCI Downstream Port Containment (DPC) support with basic recovery functionality. v2: Remove pci_save_state to avoid breaking suspend/resume. v3: Fix style comments. v4: Improve description. Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Reviewed-by: Alex Deucher <alexander.deucher@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
		
							parent
							
								
									2a9787dcf5
								
							
						
					
					
						commit
						c9a6b82f45
					
				
					 3 changed files with 177 additions and 1 deletions
				
			
		| 
						 | 
				
			
			@ -49,6 +49,8 @@
 | 
			
		|||
#include <linux/rbtree.h>
 | 
			
		||||
#include <linux/hashtable.h>
 | 
			
		||||
#include <linux/dma-fence.h>
 | 
			
		||||
#include <linux/pci.h>
 | 
			
		||||
#include <linux/aer.h>
 | 
			
		||||
 | 
			
		||||
#include <drm/ttm/ttm_bo_api.h>
 | 
			
		||||
#include <drm/ttm/ttm_bo_driver.h>
 | 
			
		||||
| 
						 | 
				
			
			@ -1260,6 +1262,12 @@ static inline int amdgpu_dm_display_resume(struct amdgpu_device *adev) { return
 | 
			
		|||
void amdgpu_register_gpu_instance(struct amdgpu_device *adev);
 | 
			
		||||
void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev);
 | 
			
		||||
 | 
			
		||||
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev,
 | 
			
		||||
					   pci_channel_state_t state);
 | 
			
		||||
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev);
 | 
			
		||||
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev);
 | 
			
		||||
void amdgpu_pci_resume(struct pci_dev *pdev);
 | 
			
		||||
 | 
			
		||||
#include "amdgpu_object.h"
 | 
			
		||||
 | 
			
		||||
/* used by df_v3_6.c and amdgpu_pmu.c */
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -2999,6 +2999,7 @@ static const struct attribute *amdgpu_dev_attributes[] = {
 | 
			
		|||
	NULL
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * amdgpu_device_init - initialize the driver
 | 
			
		||||
 *
 | 
			
		||||
| 
						 | 
				
			
			@ -3217,6 +3218,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 | 
			
		|||
		}
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	pci_enable_pcie_error_reporting(adev->ddev.pdev);
 | 
			
		||||
 | 
			
		||||
	/* Post card if necessary */
 | 
			
		||||
	if (amdgpu_device_need_post(adev)) {
 | 
			
		||||
		if (!adev->bios) {
 | 
			
		||||
| 
						 | 
				
			
			@ -4705,3 +4708,161 @@ int amdgpu_device_baco_exit(struct drm_device *dev)
 | 
			
		|||
 | 
			
		||||
	return 0;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * amdgpu_pci_error_detected - Called when a PCI error is detected.
 | 
			
		||||
 * @pdev: PCI device struct
 | 
			
		||||
 * @state: PCI channel state
 | 
			
		||||
 *
 | 
			
		||||
 * Description: Called when a PCI error is detected.
 | 
			
		||||
 *
 | 
			
		||||
 * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
 | 
			
		||||
 */
 | 
			
		||||
pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
 | 
			
		||||
{
 | 
			
		||||
	struct drm_device *dev = pci_get_drvdata(pdev);
 | 
			
		||||
	struct amdgpu_device *adev = drm_to_adev(dev);
 | 
			
		||||
 | 
			
		||||
	DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
 | 
			
		||||
 | 
			
		||||
	switch (state) {
 | 
			
		||||
	case pci_channel_io_normal:
 | 
			
		||||
		return PCI_ERS_RESULT_CAN_RECOVER;
 | 
			
		||||
	case pci_channel_io_frozen:
 | 
			
		||||
		/* Fatal error, prepare for slot reset */
 | 
			
		||||
		amdgpu_device_lock_adev(adev);
 | 
			
		||||
		return PCI_ERS_RESULT_NEED_RESET;
 | 
			
		||||
	case pci_channel_io_perm_failure:
 | 
			
		||||
		/* Permanent error, prepare for device removal */
 | 
			
		||||
		return PCI_ERS_RESULT_DISCONNECT;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return PCI_ERS_RESULT_NEED_RESET;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
 | 
			
		||||
 * @pdev: pointer to PCI device
 | 
			
		||||
 */
 | 
			
		||||
pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
 | 
			
		||||
{
 | 
			
		||||
 | 
			
		||||
	DRM_INFO("PCI error: mmio enabled callback!!\n");
 | 
			
		||||
 | 
			
		||||
	/* TODO - dump whatever for debugging purposes */
 | 
			
		||||
 | 
			
		||||
	/* This called only if amdgpu_pci_error_detected returns
 | 
			
		||||
	 * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
 | 
			
		||||
	 * works, no need to reset slot.
 | 
			
		||||
	 */
 | 
			
		||||
 | 
			
		||||
	return PCI_ERS_RESULT_RECOVERED;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
 | 
			
		||||
 * @pdev: PCI device struct
 | 
			
		||||
 *
 | 
			
		||||
 * Description: This routine is called by the pci error recovery
 | 
			
		||||
 * code after the PCI slot has been reset, just before we
 | 
			
		||||
 * should resume normal operations.
 | 
			
		||||
 */
 | 
			
		||||
pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
 | 
			
		||||
{
 | 
			
		||||
	struct drm_device *dev = pci_get_drvdata(pdev);
 | 
			
		||||
	struct amdgpu_device *adev = drm_to_adev(dev);
 | 
			
		||||
	int r;
 | 
			
		||||
	bool vram_lost;
 | 
			
		||||
 | 
			
		||||
	DRM_INFO("PCI error: slot reset callback!!\n");
 | 
			
		||||
 | 
			
		||||
	pci_restore_state(pdev);
 | 
			
		||||
 | 
			
		||||
	r = amdgpu_device_ip_suspend(adev);
 | 
			
		||||
	if (r)
 | 
			
		||||
		goto out;
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	/* post card */
 | 
			
		||||
	r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
 | 
			
		||||
	if (r)
 | 
			
		||||
		goto out;
 | 
			
		||||
 | 
			
		||||
	r = amdgpu_device_ip_resume_phase1(adev);
 | 
			
		||||
	if (r)
 | 
			
		||||
		goto out;
 | 
			
		||||
 | 
			
		||||
	vram_lost = amdgpu_device_check_vram_lost(adev);
 | 
			
		||||
	if (vram_lost) {
 | 
			
		||||
		DRM_INFO("VRAM is lost due to GPU reset!\n");
 | 
			
		||||
		amdgpu_inc_vram_lost(adev);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	r = amdgpu_gtt_mgr_recover(
 | 
			
		||||
		&adev->mman.bdev.man[TTM_PL_TT]);
 | 
			
		||||
	if (r)
 | 
			
		||||
		goto out;
 | 
			
		||||
 | 
			
		||||
	r = amdgpu_device_fw_loading(adev);
 | 
			
		||||
	if (r)
 | 
			
		||||
		return r;
 | 
			
		||||
 | 
			
		||||
	r = amdgpu_device_ip_resume_phase2(adev);
 | 
			
		||||
	if (r)
 | 
			
		||||
		goto out;
 | 
			
		||||
 | 
			
		||||
	if (vram_lost)
 | 
			
		||||
		amdgpu_device_fill_reset_magic(adev);
 | 
			
		||||
 | 
			
		||||
	/*
 | 
			
		||||
	 * Add this ASIC as tracked as reset was already
 | 
			
		||||
	 * complete successfully.
 | 
			
		||||
	 */
 | 
			
		||||
	amdgpu_register_gpu_instance(adev);
 | 
			
		||||
 | 
			
		||||
	r = amdgpu_device_ip_late_init(adev);
 | 
			
		||||
	if (r)
 | 
			
		||||
		goto out;
 | 
			
		||||
 | 
			
		||||
	amdgpu_fbdev_set_suspend(adev, 0);
 | 
			
		||||
 | 
			
		||||
	/* must succeed. */
 | 
			
		||||
	amdgpu_ras_resume(adev);
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
	amdgpu_irq_gpu_reset_resume_helper(adev);
 | 
			
		||||
	r = amdgpu_ib_ring_tests(adev);
 | 
			
		||||
	if (r)
 | 
			
		||||
		goto out;
 | 
			
		||||
 | 
			
		||||
	r = amdgpu_device_recover_vram(adev);
 | 
			
		||||
 | 
			
		||||
out:
 | 
			
		||||
 | 
			
		||||
	if (!r) {
 | 
			
		||||
		DRM_INFO("PCIe error recovery succeeded\n");
 | 
			
		||||
	} else {
 | 
			
		||||
		DRM_ERROR("PCIe error recovery failed, err:%d", r);
 | 
			
		||||
		amdgpu_device_unlock_adev(adev);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 * amdgpu_pci_resume() - resume normal ops after PCI reset
 * @pdev: pointer to PCI device
 *
 * Called when the error recovery driver tells us that it is
 * OK to resume normal operation. Releases the device lock through
 * amdgpu_device_unlock_adev() and logs that the callback ran.
 *
 * NOTE(review): the unlock is unconditional, while
 * amdgpu_pci_error_detected() only takes the lock for
 * pci_channel_io_frozen - confirm this callback cannot run on the
 * PCI_ERS_RESULT_CAN_RECOVER path without the lock being held.
 */
void amdgpu_pci_resume(struct pci_dev *pdev)
{
	struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));

	amdgpu_device_unlock_adev(adev);

	DRM_INFO("PCI error: resume callback!!\n");
}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -32,7 +32,6 @@
 | 
			
		|||
#include <drm/drm_pciids.h>
 | 
			
		||||
#include <linux/console.h>
 | 
			
		||||
#include <linux/module.h>
 | 
			
		||||
#include <linux/pci.h>
 | 
			
		||||
#include <linux/pm_runtime.h>
 | 
			
		||||
#include <linux/vga_switcheroo.h>
 | 
			
		||||
#include <drm/drm_probe_helper.h>
 | 
			
		||||
| 
						 | 
				
			
			@ -1528,6 +1527,13 @@ static struct drm_driver kms_driver = {
 | 
			
		|||
	.patchlevel = KMS_DRIVER_PATCHLEVEL,
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
static struct pci_error_handlers amdgpu_pci_err_handler = {
 | 
			
		||||
	.error_detected	= amdgpu_pci_error_detected,
 | 
			
		||||
	.mmio_enabled	= amdgpu_pci_mmio_enabled,
 | 
			
		||||
	.slot_reset	= amdgpu_pci_slot_reset,
 | 
			
		||||
	.resume		= amdgpu_pci_resume,
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
static struct pci_driver amdgpu_kms_pci_driver = {
 | 
			
		||||
	.name = DRIVER_NAME,
 | 
			
		||||
	.id_table = pciidlist,
 | 
			
		||||
| 
						 | 
				
			
			@ -1535,6 +1541,7 @@ static struct pci_driver amdgpu_kms_pci_driver = {
 | 
			
		|||
	.remove = amdgpu_pci_remove,
 | 
			
		||||
	.shutdown = amdgpu_pci_shutdown,
 | 
			
		||||
	.driver.pm = &amdgpu_pm_ops,
 | 
			
		||||
	.err_handler = &amdgpu_pci_err_handler,
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
static int __init amdgpu_init(void)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue