mirror of
				https://github.com/torvalds/linux.git
				synced 2025-11-03 18:20:25 +02:00 
			
		
		
		
	drm: Create a task info option for wedge events
When a device gets wedged, it might be caused by a guilty application. For userspace, knowing which task was involved can be useful in some situations, such as implementing a policy, logging, or giving the compositor a chance to let the user know which task was involved in the problem. This is an optional argument: when the task info is not available, the PID and TASK strings won't appear in the event string. Sometimes just the PID isn't enough, given that the task might already be dead by the time userspace tries to look up the name for this PID, so to make life easier, also report the task's name in the user event. Acked-by: Rodrigo Vivi <rodrigo.vivi@intel.com> Reviewed-by: Krzysztof Karas <krzysztof.karas@intel.com> Reviewed-by: Raag Jadav <raag.jadav@intel.com> Acked-by: Christian König <christian.koenig@amd.com> Link: https://lore.kernel.org/r/20250617124949.2151549-4-andrealmeid@igalia.com Signed-off-by: André Almeida <andrealmeid@igalia.com>
This commit is contained in:
		
							parent
							
								
									3bfd1af74a
								
							
						
					
					
						commit
						183bccafa1
					
				
					 7 changed files with 34 additions and 9 deletions
				
			
		| 
						 | 
				
			
			@ -6364,7 +6364,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 | 
			
		|||
	atomic_set(&adev->reset_domain->reset_res, r);
 | 
			
		||||
 | 
			
		||||
	if (!r)
 | 
			
		||||
		drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
 | 
			
		||||
		drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
 | 
			
		||||
 | 
			
		||||
	return r;
 | 
			
		||||
}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -164,7 +164,7 @@ static enum drm_gpu_sched_stat amdgpu_job_timedout(struct drm_sched_job *s_job)
 | 
			
		|||
			if (amdgpu_ring_sched_ready(ring))
 | 
			
		||||
				drm_sched_start(&ring->sched, 0);
 | 
			
		||||
			dev_err(adev->dev, "Ring %s reset succeeded\n", ring->sched.name);
 | 
			
		||||
			drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
 | 
			
		||||
			drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE, NULL);
 | 
			
		||||
			goto exit;
 | 
			
		||||
		}
 | 
			
		||||
		dev_err(adev->dev, "Ring %s reset failure\n", ring->sched.name);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -35,6 +35,7 @@
 | 
			
		|||
#include <linux/moduleparam.h>
 | 
			
		||||
#include <linux/mount.h>
 | 
			
		||||
#include <linux/pseudo_fs.h>
 | 
			
		||||
#include <linux/sched.h>
 | 
			
		||||
#include <linux/slab.h>
 | 
			
		||||
#include <linux/sprintf.h>
 | 
			
		||||
#include <linux/srcu.h>
 | 
			
		||||
| 
						 | 
				
			
			@ -539,10 +540,15 @@ static const char *drm_get_wedge_recovery(unsigned int opt)
 | 
			
		|||
	}
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
#define WEDGE_STR_LEN	32
 | 
			
		||||
#define PID_STR_LEN	15
 | 
			
		||||
#define COMM_STR_LEN	(TASK_COMM_LEN + 5)
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * drm_dev_wedged_event - generate a device wedged uevent
 | 
			
		||||
 * @dev: DRM device
 | 
			
		||||
 * @method: method(s) to be used for recovery
 | 
			
		||||
 * @info: optional information about the guilty task
 | 
			
		||||
 *
 | 
			
		||||
 * This generates a device wedged uevent for the DRM device specified by @dev.
 | 
			
		||||
 * Recovery @method\(s) of choice will be sent in the uevent environment as
 | 
			
		||||
| 
						 | 
				
			
			@ -555,13 +561,13 @@ static const char *drm_get_wedge_recovery(unsigned int opt)
 | 
			
		|||
 *
 | 
			
		||||
 * Returns: 0 on success, negative error code otherwise.
 | 
			
		||||
 */
 | 
			
		||||
int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
 | 
			
		||||
int drm_dev_wedged_event(struct drm_device *dev, unsigned long method,
 | 
			
		||||
			 struct drm_wedge_task_info *info)
 | 
			
		||||
{
 | 
			
		||||
	char event_string[WEDGE_STR_LEN], pid_string[PID_STR_LEN], comm_string[COMM_STR_LEN];
 | 
			
		||||
	char *envp[] = { event_string, NULL, NULL, NULL };
 | 
			
		||||
	const char *recovery = NULL;
 | 
			
		||||
	unsigned int len, opt;
 | 
			
		||||
	/* Event string length up to 28+ characters with available methods */
 | 
			
		||||
	char event_string[32];
 | 
			
		||||
	char *envp[] = { event_string, NULL };
 | 
			
		||||
 | 
			
		||||
	len = scnprintf(event_string, sizeof(event_string), "%s", "WEDGED=");
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -583,6 +589,13 @@ int drm_dev_wedged_event(struct drm_device *dev, unsigned long method)
 | 
			
		|||
	drm_info(dev, "device wedged, %s\n", method == DRM_WEDGE_RECOVERY_NONE ?
 | 
			
		||||
		 "but recovered through reset" : "needs recovery");
 | 
			
		||||
 | 
			
		||||
	if (info && (info->comm[0] != '\0') && (info->pid >= 0)) {
 | 
			
		||||
		snprintf(pid_string, sizeof(pid_string), "PID=%u", info->pid);
 | 
			
		||||
		snprintf(comm_string, sizeof(comm_string), "TASK=%s", info->comm);
 | 
			
		||||
		envp[1] = pid_string;
 | 
			
		||||
		envp[2] = comm_string;
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	return kobject_uevent_env(&dev->primary->kdev->kobj, KOBJ_CHANGE, envp);
 | 
			
		||||
}
 | 
			
		||||
EXPORT_SYMBOL(drm_dev_wedged_event);
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1448,7 +1448,8 @@ static void intel_gt_reset_global(struct intel_gt *gt,
 | 
			
		|||
		kobject_uevent_env(kobj, KOBJ_CHANGE, reset_done_event);
 | 
			
		||||
	else
 | 
			
		||||
		drm_dev_wedged_event(&gt->i915->drm,
 | 
			
		||||
				     DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET);
 | 
			
		||||
				     DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET,
 | 
			
		||||
				     NULL);
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -1168,7 +1168,8 @@ void xe_device_declare_wedged(struct xe_device *xe)
 | 
			
		|||
 | 
			
		||||
		/* Notify userspace of wedged device */
 | 
			
		||||
		drm_dev_wedged_event(&xe->drm,
 | 
			
		||||
				     DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET);
 | 
			
		||||
				     DRM_WEDGE_RECOVERY_REBIND | DRM_WEDGE_RECOVERY_BUS_RESET,
 | 
			
		||||
				     NULL);
 | 
			
		||||
	}
 | 
			
		||||
 | 
			
		||||
	for_each_gt(gt, xe, id)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -5,6 +5,7 @@
 | 
			
		|||
#include <linux/kref.h>
 | 
			
		||||
#include <linux/mutex.h>
 | 
			
		||||
#include <linux/idr.h>
 | 
			
		||||
#include <linux/sched.h>
 | 
			
		||||
 | 
			
		||||
#include <drm/drm_mode_config.h>
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -30,6 +31,14 @@ struct pci_controller;
 | 
			
		|||
#define DRM_WEDGE_RECOVERY_REBIND	BIT(1)	/* unbind + bind driver */
 | 
			
		||||
#define DRM_WEDGE_RECOVERY_BUS_RESET	BIT(2)	/* unbind + reset bus device + bind */
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * struct drm_wedge_task_info - information about the guilty task of a wedged device
 | 
			
		||||
 */
 | 
			
		||||
struct drm_wedge_task_info {
 | 
			
		||||
	pid_t pid;
 | 
			
		||||
	char comm[TASK_COMM_LEN];
 | 
			
		||||
};
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * enum switch_power_state - power state of drm device
 | 
			
		||||
 */
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -487,7 +487,8 @@ void drm_put_dev(struct drm_device *dev);
 | 
			
		|||
bool drm_dev_enter(struct drm_device *dev, int *idx);
 | 
			
		||||
void drm_dev_exit(int idx);
 | 
			
		||||
void drm_dev_unplug(struct drm_device *dev);
 | 
			
		||||
int drm_dev_wedged_event(struct drm_device *dev, unsigned long method);
 | 
			
		||||
int drm_dev_wedged_event(struct drm_device *dev, unsigned long method,
 | 
			
		||||
			 struct drm_wedge_task_info *info);
 | 
			
		||||
 | 
			
		||||
/**
 | 
			
		||||
 * drm_dev_is_unplugged - is a DRM device unplugged
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
		Reference in a new issue