mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 02:30:34 +02:00 
			
		
		
		
	Adjust removal control flow for smu v13_0_2:
   During amdgpu uninstallation, when removing the first
device, the kernel needs to first send a mode1reset message
to all gpu devices. Otherwise, smu initialization will fail
the next time amdgpu is installed.
V2:
1. Update commit comments.
2. Remove the global variable amdgpu_device_remove_cnt
   and add a variable to the structure amdgpu_hive_info.
3. Use hive to detect the first removed device instead of
   a global variable.
V3:
 1. Update commit comments.
 2. Split a patch into multiple patches.
 3. The current patch does:
    a. Add an AMDGPU_RESET_FOR_DEVICE_REMOVE work mode to
       the existing gpu recover path, which makes all devices
       in the hive list undergo a HW reset without a resume
       (except for the base IP blocks).
    b. Call AMDGPU_RESET_FOR_DEVICE_REMOVE and
       AMDGPU_NEED_FULL_RESET mode of amdgpu_device_gpu_recover
       in amdgpu_pci_remove when removing the first device in
       hive list.
    c. When removing the first device, the key IP block
       callbacks are invoked in the following sequence:
.suspend->mode1reset->.resume(basic ip)->.hw_fini->.early_fini->.sw_fini.
   ^                           |
   |-<----------<---------<----|
	The first three steps result from the call to
        amdgpu_device_gpu_recover. These three steps are
        executed in a loop until all devices in the hive list
        have been iterated.
        The steps starting from .hw_fini apply only to the
        first device. Since .suspend has already been called,
        the .hw_fini of every ip block of the current device,
        except the resumed phase1 basic ip blocks, will do nothing.
     d. When removing the other devices, the calling sequence is the
        same as the legacy one:
	   .hw_fini -> .early_fini -> .sw_fini.
	Since .suspend was already called when the first device was
        removed, the .hw_fini of every ip block of the current device,
        except the resumed phase1 basic ip blocks, will do nothing.
Signed-off-by: YiPeng Chai <YiPeng.Chai@amd.com>
Reviewed-by: Hawking Zhang <Hawking.Zhang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
		
	
			
		
			
				
	
	
		
			77 lines
		
	
	
	
		
			2.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
			
		
		
	
	
			77 lines
		
	
	
	
		
			2.7 KiB
		
	
	
	
		
			C
		
	
	
	
	
	
/*
 * Copyright 2016 Advanced Micro Devices, Inc.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
 | 
						|
#ifndef __AMDGPU_XGMI_H__
 | 
						|
#define __AMDGPU_XGMI_H__
 | 
						|
 | 
						|
#include <drm/task_barrier.h>
 | 
						|
#include "amdgpu_psp.h"
 | 
						|
#include "amdgpu_ras.h"
 | 
						|
 | 
						|
struct amdgpu_hive_info {
 | 
						|
	struct kobject kobj;
 | 
						|
	uint64_t hive_id;
 | 
						|
	struct list_head device_list;
 | 
						|
	struct list_head node;
 | 
						|
	atomic_t number_devices;
 | 
						|
	struct mutex hive_lock;
 | 
						|
	int hi_req_count;
 | 
						|
	struct amdgpu_device *hi_req_gpu;
 | 
						|
	struct task_barrier tb;
 | 
						|
	enum {
 | 
						|
		AMDGPU_XGMI_PSTATE_MIN,
 | 
						|
		AMDGPU_XGMI_PSTATE_MAX_VEGA20,
 | 
						|
		AMDGPU_XGMI_PSTATE_UNKNOWN
 | 
						|
	} pstate;
 | 
						|
 | 
						|
	struct amdgpu_reset_domain *reset_domain;
 | 
						|
	uint32_t device_remove_count;
 | 
						|
};
 | 
						|
 | 
						|
/*
 * Describes one named PCS error field by its mask and shift.
 *
 * NOTE(review): how the mask and shift are applied to a register value is
 * not visible in this header; confirm against the users of this struct.
 */
struct amdgpu_pcs_ras_field {
	const char *err_name;		/* name of the error field */
	uint32_t pcs_err_mask;		/* mask associated with the field */
	uint32_t pcs_err_shift;		/* shift associated with the field */
};
 | 
						|
 | 
						|
/* XGMI RAS object; defined outside this header. */
extern struct amdgpu_xgmi_ras xgmi_ras;

/*
 * Hive reference handling.
 * NOTE(review): the get/put naming suggests reference counting on the hive;
 * confirm in the implementation.
 */
struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev);
void amdgpu_put_xgmi_hive(struct amdgpu_hive_info *hive);

/* Hive membership and topology maintenance. */
int amdgpu_xgmi_update_topology(struct amdgpu_hive_info *hive, struct amdgpu_device *adev);
int amdgpu_xgmi_add_device(struct amdgpu_device *adev);
int amdgpu_xgmi_remove_device(struct amdgpu_device *adev);
int amdgpu_xgmi_set_pstate(struct amdgpu_device *adev, int pstate);

/* Queries relating a device to a peer device. */
int amdgpu_xgmi_get_hops_count(struct amdgpu_device *adev,
		struct amdgpu_device *peer_adev);
int amdgpu_xgmi_get_num_links(struct amdgpu_device *adev,
		struct amdgpu_device *peer_adev);
uint64_t amdgpu_xgmi_get_relative_phy_addr(struct amdgpu_device *adev,
					   uint64_t addr);
 | 
						|
static inline bool amdgpu_xgmi_same_hive(struct amdgpu_device *adev,
 | 
						|
		struct amdgpu_device *bo_adev)
 | 
						|
{
 | 
						|
	return (amdgpu_use_xgmi_p2p &&
 | 
						|
		adev != bo_adev &&
 | 
						|
		adev->gmc.xgmi.hive_id &&
 | 
						|
		adev->gmc.xgmi.hive_id == bo_adev->gmc.xgmi.hive_id);
 | 
						|
}
 | 
						|
 | 
						|
#endif
 |