mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-04 02:30:34 +02:00 
			
		
		
		
	RAS/AMD/ATL: Add MI300 row retirement support
DRAM row retirement depends on model-specific information that is best done within the AMD Address Translation Library. Export a generic wrapper function for other modules to use. Add any model-specific helpers here. Signed-off-by: Yazen Ghannam <yazen.ghannam@amd.com> Signed-off-by: Borislav Petkov (AMD) <bp@alien8.de> Link: https://lore.kernel.org/r/20240214033516.1344948-2-yazen.ghannam@amd.com
This commit is contained in:
		
							parent
							
								
									0e4fd816b0
								
							
						
					
					
						commit
						3b566b30b4
					
				
					 3 changed files with 54 additions and 0 deletions
				
			
		| 
						 | 
					@ -10,6 +10,7 @@
 | 
				
			||||||
config AMD_ATL
 | 
					config AMD_ATL
 | 
				
			||||||
	tristate "AMD Address Translation Library"
 | 
						tristate "AMD Address Translation Library"
 | 
				
			||||||
	depends on AMD_NB && X86_64 && RAS
 | 
						depends on AMD_NB && X86_64 && RAS
 | 
				
			||||||
 | 
						depends on MEMORY_FAILURE
 | 
				
			||||||
	default N
 | 
						default N
 | 
				
			||||||
	help
 | 
						help
 | 
				
			||||||
	  This library includes support for implementation-specific
 | 
						  This library includes support for implementation-specific
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -239,6 +239,57 @@ static unsigned long convert_dram_to_norm_addr_mi300(unsigned long addr)
 | 
				
			||||||
	return addr;
 | 
						return addr;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/*
 | 
				
			||||||
 | 
					 * When a DRAM ECC error occurs on MI300 systems, it is recommended to retire
 | 
				
			||||||
 | 
					 * all memory within that DRAM row. This applies to the memory with a DRAM
 | 
				
			||||||
 | 
					 * bank.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * To find the memory addresses, loop through permutations of the DRAM column
 | 
				
			||||||
 | 
					 * bits and find the System Physical address of each. The column bits are used
 | 
				
			||||||
 | 
					 * to calculate the intermediate Normalized address, so all permutations should
 | 
				
			||||||
 | 
					 * be checked.
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * See amd_atl::convert_dram_to_norm_addr_mi300() for MI300 address formats.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					#define MI300_NUM_COL		BIT(HWEIGHT(MI300_UMC_MCA_COL))
 | 
				
			||||||
 | 
					static void retire_row_mi300(struct atl_err *a_err)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						unsigned long addr;
 | 
				
			||||||
 | 
						struct page *p;
 | 
				
			||||||
 | 
						u8 col;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						for (col = 0; col < MI300_NUM_COL; col++) {
 | 
				
			||||||
 | 
							a_err->addr &= ~MI300_UMC_MCA_COL;
 | 
				
			||||||
 | 
							a_err->addr |= FIELD_PREP(MI300_UMC_MCA_COL, col);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							addr = amd_convert_umc_mca_addr_to_sys_addr(a_err);
 | 
				
			||||||
 | 
							if (IS_ERR_VALUE(addr))
 | 
				
			||||||
 | 
								continue;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							addr = PHYS_PFN(addr);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							/*
 | 
				
			||||||
 | 
							 * Skip invalid or already poisoned pages to avoid unnecessary
 | 
				
			||||||
 | 
							 * error messages from memory_failure().
 | 
				
			||||||
 | 
							 */
 | 
				
			||||||
 | 
							p = pfn_to_online_page(addr);
 | 
				
			||||||
 | 
							if (!p)
 | 
				
			||||||
 | 
								continue;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							if (PageHWPoison(p))
 | 
				
			||||||
 | 
								continue;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							memory_failure(addr, 0);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void amd_retire_dram_row(struct atl_err *a_err)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
 | 
				
			||||||
 | 
							return retire_row_mi300(a_err);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					EXPORT_SYMBOL_GPL(amd_retire_dram_row);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static unsigned long get_addr(unsigned long addr)
 | 
					static unsigned long get_addr(unsigned long addr)
 | 
				
			||||||
{
 | 
					{
 | 
				
			||||||
	if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
 | 
						if (df_cfg.rev == DF4p5 && df_cfg.flags.heterogeneous)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -45,8 +45,10 @@ struct atl_err {
 | 
				
			||||||
#if IS_ENABLED(CONFIG_AMD_ATL)
 | 
					#if IS_ENABLED(CONFIG_AMD_ATL)
 | 
				
			||||||
void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *));
 | 
					void amd_atl_register_decoder(unsigned long (*f)(struct atl_err *));
 | 
				
			||||||
void amd_atl_unregister_decoder(void);
 | 
					void amd_atl_unregister_decoder(void);
 | 
				
			||||||
 | 
					void amd_retire_dram_row(struct atl_err *err);
 | 
				
			||||||
unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
 | 
					unsigned long amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err);
 | 
				
			||||||
#else
 | 
					#else
 | 
				
			||||||
 | 
					static inline void amd_retire_dram_row(struct atl_err *err) { }
 | 
				
			||||||
static inline unsigned long
 | 
					static inline unsigned long
 | 
				
			||||||
amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; }
 | 
					amd_convert_umc_mca_addr_to_sys_addr(struct atl_err *err) { return -EINVAL; }
 | 
				
			||||||
#endif /* CONFIG_AMD_ATL */
 | 
					#endif /* CONFIG_AMD_ATL */
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue