linux/drivers/gpu/drm/scheduler/tests/mock_scheduler.c
Tvrtko Ursulin 2650bc4007 drm/sched: Fix a race in DRM_GPU_SCHED_STAT_NO_HANG test
The "skip reset" test waits for the timeout handler to run for the
duration of 2 * MOCK_TIMEOUT, and because the mock scheduler opted to
remove the "skip reset" flag once it fires, this gives opportunity for the
timeout handler to run twice. Second time the job will be removed from the
mock scheduler job list and the drm_mock_sched_advance() call in the test
will fail.

Fix it by making the "don't reset" flag persist for the lifetime of the
job and add a new flag to verify that the code path had executed as
expected.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@igalia.com>
Fixes: 1472e7549f ("drm/sched: Add new test for DRM_GPU_SCHED_STAT_NO_HANG")
Cc: Maíra Canal <mcanal@igalia.com>
Cc: Philipp Stanner <phasta@kernel.org>
Reviewed-by: Maíra Canal <mcanal@igalia.com>
Signed-off-by: Philipp Stanner <phasta@kernel.org>
Link: https://lore.kernel.org/r/20250716084817.56797-1-tvrtko.ursulin@igalia.com
2025-07-17 11:39:19 +02:00

370 lines
10 KiB
C

// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2025 Valve Corporation */
#include "sched_tests.h"
/*
* Here we implement the mock "GPU" (or the scheduler backend) which is used by
* the DRM scheduler unit tests in order to exercise the core functionality.
*
* Test cases are implemented in a separate file.
*/
/**
 * drm_mock_sched_entity_new - Create a new mock scheduler entity
 *
 * @test: KUnit test owning the entity
 * @priority: Scheduling priority
 * @sched: Mock scheduler on which the entity can be scheduled
 *
 * The entity is allocated with kunit_kzalloc() and initialised with the base
 * scheduler of @sched as the single entry of its scheduler list. Both the
 * allocation and the initialisation are checked with KUnit assertions.
 *
 * Returns: New mock scheduler entity with allocation managed by the test
 */
struct drm_mock_sched_entity *
drm_mock_sched_entity_new(struct kunit *test,
			  enum drm_sched_priority priority,
			  struct drm_mock_scheduler *sched)
{
	struct drm_gpu_scheduler *backend = &sched->base;
	struct drm_mock_sched_entity *mock_entity;
	int err;

	mock_entity = kunit_kzalloc(test, sizeof(*mock_entity), GFP_KERNEL);
	KUNIT_ASSERT_NOT_NULL(test, mock_entity);

	/* One-element scheduler list: just the mock scheduler's base. */
	err = drm_sched_entity_init(&mock_entity->base, priority,
				    &backend, 1, NULL);
	KUNIT_ASSERT_EQ(test, err, 0);

	mock_entity->test = test;

	return mock_entity;
}
/**
 * drm_mock_sched_entity_free - Destroys a mock scheduler entity
 *
 * @entity: Entity to destroy
 *
 * Hands the embedded scheduler entity to drm_sched_entity_destroy(). To be
 * called by test cases when they are finished with the entity.
 */
void drm_mock_sched_entity_free(struct drm_mock_sched_entity *entity)
{
	struct drm_sched_entity *base = &entity->base;

	drm_sched_entity_destroy(base);
}
/*
 * Complete a mock job: flag it as done, unlink it from the mock scheduler's
 * job list, signal its hardware fence and wake anybody waiting on job->done.
 *
 * The caller must hold the mock scheduler lock.
 */
static void drm_mock_sched_job_complete(struct drm_mock_sched_job *job)
{
	struct drm_mock_scheduler *mock =
		drm_sched_to_mock_sched(job->base.sched);

	lockdep_assert_held(&mock->lock);

	job->flags |= DRM_MOCK_SCHED_JOB_DONE;
	list_del(&job->link);

	/* The lock asserted above is held, so use the _locked variant. */
	dma_fence_signal_locked(&job->hw_fence);
	complete(&job->done);
}
/*
 * hrtimer callback armed for jobs which have a finish time set.
 *
 * Walks the mock scheduler's job list from the head and completes every job
 * whose finish time has been reached, advancing the hardware timeline's
 * current seqno to each completed job's fence seqno. The walk stops at the
 * first job which has no duration set or whose finish time is still in the
 * future, so jobs are only ever completed in list order.
 *
 * Always returns HRTIMER_NORESTART.
 */
static enum hrtimer_restart
drm_mock_sched_job_signal_timer(struct hrtimer *hrtimer)
{
	/* The job owning the timer is only needed to find the scheduler. */
	struct drm_mock_sched_job *timer_job =
		container_of(hrtimer, typeof(*timer_job), timer);
	struct drm_mock_scheduler *sched =
		drm_sched_to_mock_sched(timer_job->base.sched);
	struct drm_mock_sched_job *job, *next;
	ktime_t now = ktime_get();
	unsigned long flags;

	spin_lock_irqsave(&sched->lock, flags);
	/* _safe iteration: completing a job unlinks it from the list. */
	list_for_each_entry_safe(job, next, &sched->job_list, link) {
		if (!job->duration_us)
			break;

		if (ktime_before(now, job->finish_at))
			break;

		sched->hw_timeline.cur_seqno = job->hw_fence.seqno;
		drm_mock_sched_job_complete(job);
	}
	spin_unlock_irqrestore(&sched->lock, flags);

	return HRTIMER_NORESTART;
}
/**
 * drm_mock_sched_job_new - Create a new mock scheduler job
 *
 * @test: KUnit test owning the job
 * @entity: Scheduler entity of the job
 *
 * The job is allocated with kunit_kzalloc() and its embedded scheduler job is
 * initialised against @entity. The completion, the list link and the signal
 * timer of the mock job are set up as well; the timer is not started here.
 * Allocation and initialisation are checked with KUnit assertions.
 *
 * Returns: New mock scheduler job with allocation managed by the test
 */
struct drm_mock_sched_job *
drm_mock_sched_job_new(struct kunit *test,
		       struct drm_mock_sched_entity *entity)
{
	struct drm_mock_sched_job *mock_job;
	int err;

	mock_job = kunit_kzalloc(test, sizeof(*mock_job), GFP_KERNEL);
	KUNIT_ASSERT_NOT_NULL(test, mock_job);

	err = drm_sched_job_init(&mock_job->base, &entity->base, 1, NULL, 1);
	KUNIT_ASSERT_EQ(test, err, 0);

	/* Mock-side state, independent of the scheduler core. */
	INIT_LIST_HEAD(&mock_job->link);
	init_completion(&mock_job->done);
	hrtimer_setup(&mock_job->timer, drm_mock_sched_job_signal_timer,
		      CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
	mock_job->test = test;

	return mock_job;
}
/* dma_fence_ops.get_driver_name: same fixed name for every mock fence. */
static const char *drm_mock_sched_hw_fence_driver_name(struct dma_fence *fence)
{
	static const char driver_name[] = "drm_mock_sched";

	return driver_name;
}
static const char *
drm_mock_sched_hw_fence_timeline_name(struct dma_fence *fence)
{
struct drm_mock_sched_job *job =
container_of(fence, typeof(*job), hw_fence);
return (const char *)job->base.sched->name;
}
static void drm_mock_sched_hw_fence_release(struct dma_fence *fence)
{
struct drm_mock_sched_job *job =
container_of(fence, typeof(*job), hw_fence);
hrtimer_cancel(&job->timer);
/* Containing job is freed by the kunit framework */
}
/* Fence callbacks used for the mock "hardware" fences set up in run_job. */
static const struct dma_fence_ops drm_mock_sched_hw_fence_ops = {
	.get_driver_name	= drm_mock_sched_hw_fence_driver_name,
	.get_timeline_name	= drm_mock_sched_hw_fence_timeline_name,
	.release		= drm_mock_sched_hw_fence_release,
};
static struct dma_fence *mock_sched_run_job(struct drm_sched_job *sched_job)
{
struct drm_mock_scheduler *sched =
drm_sched_to_mock_sched(sched_job->sched);
struct drm_mock_sched_job *job = drm_sched_job_to_mock_job(sched_job);
dma_fence_init(&job->hw_fence,
&drm_mock_sched_hw_fence_ops,
&sched->lock,
sched->hw_timeline.context,
atomic_inc_return(&sched->hw_timeline.next_seqno));
dma_fence_get(&job->hw_fence); /* Reference for the job_list */
spin_lock_irq(&sched->lock);
if (job->duration_us) {
ktime_t prev_finish_at = 0;
if (!list_empty(&sched->job_list)) {
struct drm_mock_sched_job *prev =
list_last_entry(&sched->job_list, typeof(*prev),
link);
prev_finish_at = prev->finish_at;
}
if (!prev_finish_at)
prev_finish_at = ktime_get();
job->finish_at = ktime_add_us(prev_finish_at, job->duration_us);
}
list_add_tail(&job->link, &sched->job_list);
if (job->finish_at)
hrtimer_start(&job->timer, job->finish_at, HRTIMER_MODE_ABS);
spin_unlock_irq(&sched->lock);
return &job->hw_fence;
}
/*
 * Normally, drivers would take appropriate measures in this callback, such as
 * killing the entity the faulty job is associated with, resetting the hardware
 * and / or resubmitting non-faulty jobs.
 *
 * For the mock scheduler, there are no hardware rings to be reset nor jobs to
 * be resubmitted. Thus, this function merely ensures that
 *  a) timed out fences get signaled properly and removed from the pending list
 *  b) the mock scheduler framework gets informed about the timeout via a flag
 *  c) the drm_sched_job, no longer needed, gets freed
 *
 * The exception are jobs flagged with DRM_MOCK_SCHED_JOB_DONT_RESET. For those
 * none of the above is done: the skipped reset is recorded by setting
 * DRM_MOCK_SCHED_JOB_RESET_SKIPPED and DRM_GPU_SCHED_STAT_NO_HANG is returned.
 */
static enum drm_gpu_sched_stat
mock_sched_timedout_job(struct drm_sched_job *sched_job)
{
	struct drm_mock_sched_job *job = drm_sched_job_to_mock_job(sched_job);
	struct drm_mock_scheduler *sched =
		drm_sched_to_mock_sched(sched_job->sched);
	struct dma_fence *fence = &job->hw_fence;
	unsigned long irqflags;

	if (job->flags & DRM_MOCK_SCHED_JOB_DONT_RESET) {
		/*
		 * NOTE(review): this flag is updated without holding
		 * sched->lock, unlike the other job->flags updates in this
		 * file - confirm that this is intentional.
		 */
		job->flags |= DRM_MOCK_SCHED_JOB_RESET_SKIPPED;
		return DRM_GPU_SCHED_STAT_NO_HANG;
	}

	spin_lock_irqsave(&sched->lock, irqflags);
	if (!dma_fence_is_signaled_locked(fence)) {
		/* Still pending: unlink it and signal with a timeout error. */
		list_del(&job->link);
		job->flags |= DRM_MOCK_SCHED_JOB_TIMEDOUT;
		dma_fence_set_error(fence, -ETIMEDOUT);
		dma_fence_signal_locked(fence);
	}
	spin_unlock_irqrestore(&sched->lock, irqflags);

	dma_fence_put(fence);
	drm_sched_job_cleanup(sched_job);
	/* Mock job itself is freed by the kunit framework. */

	return DRM_GPU_SCHED_STAT_RESET;
}
static void mock_sched_free_job(struct drm_sched_job *sched_job)
{
struct drm_mock_sched_job *job = drm_sched_job_to_mock_job(sched_job);
dma_fence_put(&job->hw_fence);
drm_sched_job_cleanup(sched_job);
/* Mock job itself is freed by the kunit framework. */
}
/*
 * drm_sched_backend_ops.cancel_job: stop the job's signal timer and, unless
 * its hardware fence has signaled already, unlink the job from the mock
 * scheduler's job list and signal the fence with -ECANCELED.
 */
static void mock_sched_cancel_job(struct drm_sched_job *sched_job)
{
	struct drm_mock_sched_job *job = drm_sched_job_to_mock_job(sched_job);
	struct drm_mock_scheduler *sched =
		drm_sched_to_mock_sched(sched_job->sched);
	struct dma_fence *fence = &job->hw_fence;
	unsigned long irqflags;

	hrtimer_cancel(&job->timer);

	spin_lock_irqsave(&sched->lock, irqflags);
	if (!dma_fence_is_signaled_locked(fence)) {
		list_del(&job->link);
		dma_fence_set_error(fence, -ECANCELED);
		dma_fence_signal_locked(fence);
	}
	spin_unlock_irqrestore(&sched->lock, irqflags);

	/*
	 * The GPU Scheduler will call drm_sched_backend_ops.free_job(), still.
	 * Mock job itself is freed by the kunit framework.
	 */
}
/* Backend callbacks which make up the mock "GPU" driver. */
static const struct drm_sched_backend_ops drm_mock_scheduler_ops = {
	.run_job	= mock_sched_run_job,
	.timedout_job	= mock_sched_timedout_job,
	.free_job	= mock_sched_free_job,
	.cancel_job	= mock_sched_cancel_job,
};
/**
* drm_mock_sched_new - Create a new mock scheduler
*
* @test: KUnit test owning the job
* @timeout: Job timeout to set
*
* Returns: New mock scheduler with allocation managed by the test
*/
struct drm_mock_scheduler *drm_mock_sched_new(struct kunit *test, long timeout)
{
struct drm_sched_init_args args = {
.ops = &drm_mock_scheduler_ops,
.num_rqs = DRM_SCHED_PRIORITY_COUNT,
.credit_limit = U32_MAX,
.hang_limit = 1,
.timeout = timeout,
.name = "drm-mock-scheduler",
};
struct drm_mock_scheduler *sched;
int ret;
sched = kunit_kzalloc(test, sizeof(*sched), GFP_KERNEL);
KUNIT_ASSERT_NOT_NULL(test, sched);
ret = drm_sched_init(&sched->base, &args);
KUNIT_ASSERT_EQ(test, ret, 0);
sched->test = test;
sched->hw_timeline.context = dma_fence_context_alloc(1);
atomic_set(&sched->hw_timeline.next_seqno, 0);
INIT_LIST_HEAD(&sched->job_list);
spin_lock_init(&sched->lock);
return sched;
}
/**
 * drm_mock_sched_fini - Destroys a mock scheduler
 *
 * @sched: Scheduler to destroy
 *
 * Tears down the embedded base scheduler with drm_sched_fini(). To be called
 * by test cases when they are finished with the scheduler.
 */
void drm_mock_sched_fini(struct drm_mock_scheduler *sched)
{
	struct drm_gpu_scheduler *base = &sched->base;

	drm_sched_fini(base);
}
/**
 * drm_mock_sched_advance - Advances the mock scheduler timeline
 *
 * @sched: Scheduler timeline to advance
 * @num: By how many jobs to advance
 *
 * Advancing the scheduler timeline by a number of seqnos will trigger
 * signalling of the hardware fences and unlinking the jobs from the internal
 * scheduler tracking.
 *
 * This can be used from test cases which want complete control of the simulated
 * job execution timing. For example, a job submitted with no set duration will
 * never complete before the test case advances the timeline by one.
 *
 * Returns: Number of jobs completed by this call. Zero is returned, after a
 * one-time warning, if advancing by @num would wrap the current seqno; the
 * timeline is left untouched in that case.
 */
unsigned int drm_mock_sched_advance(struct drm_mock_scheduler *sched,
				    unsigned int num)
{
	struct drm_mock_sched_job *job, *next;
	unsigned int found = 0;
	unsigned long flags;

	spin_lock_irqsave(&sched->lock, flags);

	/* Refuse to advance if the addition would wrap around. */
	if (WARN_ON_ONCE(sched->hw_timeline.cur_seqno + num <
			 sched->hw_timeline.cur_seqno))
		goto unlock;

	sched->hw_timeline.cur_seqno += num;

	/* _safe iteration: completing a job unlinks it from the list. */
	list_for_each_entry_safe(job, next, &sched->job_list, link) {
		/* Stop at the first job which is beyond the new seqno. */
		if (sched->hw_timeline.cur_seqno < job->hw_fence.seqno)
			break;

		drm_mock_sched_job_complete(job);
		found++;
	}
unlock:
	spin_unlock_irqrestore(&sched->lock, flags);

	return found;
}
MODULE_DESCRIPTION("DRM mock scheduler and tests");
MODULE_LICENSE("GPL");