Commit cab92f16 authored by Ashley Smith


Attempt to fix scheduling issues in panthor/drm_sched. Patch from bbrezillon with a few edits by me. My edits restart the timeout in queue_run_job: queuing a new job means the queue is making progress, so this should keep some long-running tasks from timing out.
parent 8af49358
Branches fix_flakes
Pipeline #126624 failed
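
To make the queue_run_job change below easier to follow, here is a minimal userspace C sketch of the idea: whenever a new job is queued and the queue's timeout is not suspended, the watchdog deadline is pushed back, so a queue that keeps making progress is not flagged as hung. Everything in this sketch is invented for illustration (watchdog, watchdog_restart, queue_push_job, and the 500 ms value); in the actual patch the restart is drm_sched_start_timeout_unlocked() on queue->scheduler, the flag is queue->timeout_suspended, and the timeout is JOB_TIMEOUT_MS.

/* Illustrative sketch only, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define JOB_TIMEOUT_MS 500	/* stand-in value for the example */

struct watchdog {
	bool suspended;		/* mirrors queue->timeout_suspended */
	long expires_ms;	/* absolute deadline, -1 if disarmed */
};

static void watchdog_restart(struct watchdog *wd, long now_ms)
{
	/* Models drm_sched_start_timeout_unlocked(): re-arm the timeout
	 * a full JOB_TIMEOUT_MS from now. */
	wd->expires_ms = now_ms + JOB_TIMEOUT_MS;
}

static void queue_push_job(struct watchdog *wd, long now_ms)
{
	/* Models the addition to queue_run_job(): a new job means the
	 * queue is making progress, so push the deadline back unless the
	 * timeout is currently suspended (e.g. waiting for a CSG slot). */
	if (!wd->suspended)
		watchdog_restart(wd, now_ms);
}

int main(void)
{
	struct watchdog wd = { .suspended = false, .expires_ms = -1 };

	watchdog_restart(&wd, 0);	/* first job arms the timeout */
	queue_push_job(&wd, 400);	/* progress at t = 400 ms */
	printf("deadline now %ld ms\n", wd.expires_ms);	/* 900, not 500 */

	return 0;
}
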
@@ -1066,6 +1066,46 @@ cs_slot_prog_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs_id)
 	}
 }
 
+static bool
+group_is_idle(struct panthor_group *group)
+{
+	struct panthor_device *ptdev = group->ptdev;
+	u32 inactive_queues;
+
+	if (group->csg_id >= 0)
+		return ptdev->scheduler->csg_slots[group->csg_id].idle;
+
+	inactive_queues = group->idle_queues | group->blocked_queues;
+	return hweight32(inactive_queues) == group->queue_count;
+}
+
+static void
+suspend_queue_timeout(struct panthor_group *group, u32 queue_idx)
+{
+	struct panthor_queue *queue = group->queues[queue_idx];
+
+	/* The queue timeout is already suspended. */
+	if (queue->timeout_suspended)
+		return;
+
+	/* If the queue is idle, all jobs have landed or should land soon, and
+	 * there's no reason to suspend the timer.
+	 */
+	if (group->idle_queues & BIT(queue_idx))
+		return;
+
+	/* If the group is idle and our queue isn't, that means our queue is
+	 * blocked, in which case we want to keep the timeout active so we
+	 * can detect unbounded waits.
+	 */
+	if (group_is_idle(group))
+		return;
+
+	queue->remaining_time = drm_sched_suspend_timeout(&queue->scheduler);
+	WARN_ON(queue->remaining_time > msecs_to_jiffies(JOB_TIMEOUT_MS));
+	queue->timeout_suspended = true;
+}
+
 /**
  * cs_slot_reset_locked() - Reset a queue slot
  * @ptdev: Device.
@@ -1082,7 +1122,6 @@ cs_slot_reset_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs_id)
 {
 	struct panthor_fw_cs_iface *cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
 	struct panthor_group *group = ptdev->scheduler->csg_slots[csg_id].group;
-	struct panthor_queue *queue = group->queues[cs_id];
 
 	lockdep_assert_held(&ptdev->scheduler->lock);
 
@@ -1090,14 +1129,7 @@ cs_slot_reset_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs_id)
 			       CS_STATE_STOP,
 			       CS_STATE_MASK);
 
-	/* If the queue is blocked, we want to keep the timeout running, so
-	 * we can detect unbounded waits and kill the group when that happens.
-	 */
-	if (!(group->blocked_queues & BIT(cs_id)) && !queue->timeout_suspended) {
-		queue->remaining_time = drm_sched_suspend_timeout(&queue->scheduler);
-		queue->timeout_suspended = true;
-		WARN_ON(queue->remaining_time > msecs_to_jiffies(JOB_TIMEOUT_MS));
-	}
+	suspend_queue_timeout(group, cs_id);
 
 	return 0;
 }
@@ -1879,19 +1911,6 @@ tick_ctx_is_full(const struct panthor_scheduler *sched,
 	return ctx->group_count == sched->csg_slot_count;
 }
 
-static bool
-group_is_idle(struct panthor_group *group)
-{
-	struct panthor_device *ptdev = group->ptdev;
-	u32 inactive_queues;
-
-	if (group->csg_id >= 0)
-		return ptdev->scheduler->csg_slots[group->csg_id].idle;
-
-	inactive_queues = group->idle_queues | group->blocked_queues;
-	return hweight32(inactive_queues) == group->queue_count;
-}
-
 static bool
 group_can_run(struct panthor_group *group)
 {
@@ -3156,17 +3175,15 @@ queue_run_job(struct drm_sched_job *sched_job)
 	queue->iface.input->extract = queue->iface.output->extract;
 	queue->iface.input->insert = job->ringbuf.end;
 
+	queue->scheduler.timeout = msecs_to_jiffies(JOB_TIMEOUT_MS);
+
 	if (group->csg_id < 0) {
 		/* If the queue is blocked, we want to keep the timeout running, so we
 		 * can detect unbounded waits and kill the group when that happens.
 		 * Otherwise, we suspend the timeout so the time we spend waiting for
 		 * a CSG slot is not counted.
 		 */
-		if (!(group->blocked_queues & BIT(job->queue_idx)) &&
-		    !queue->timeout_suspended) {
-			queue->remaining_time = drm_sched_suspend_timeout(&queue->scheduler);
-			queue->timeout_suspended = true;
-		}
+		suspend_queue_timeout(group, job->queue_idx);
 
 		group_schedule_locked(group, BIT(job->queue_idx));
 	} else {
@@ -3183,6 +3200,9 @@ queue_run_job(struct drm_sched_job *sched_job)
 	dma_fence_put(queue->fence_ctx.last_fence);
 	queue->fence_ctx.last_fence = dma_fence_get(job->done_fence);
 
+	if (!queue->timeout_suspended)
+		drm_sched_start_timeout_unlocked(&queue->scheduler);
+
 	done_fence = dma_fence_get(job->done_fence);
 
 out_unlock:
......
@@ -449,12 +449,13 @@ static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
 	mod_delayed_work(sched->timeout_wq, &sched->work_tdr, sched->timeout);
 }
 
-static void drm_sched_start_timeout_unlocked(struct drm_gpu_scheduler *sched)
+void drm_sched_start_timeout_unlocked(struct drm_gpu_scheduler *sched)
 {
 	spin_lock(&sched->job_list_lock);
 	drm_sched_start_timeout(sched);
 	spin_unlock(&sched->job_list_lock);
 }
+EXPORT_SYMBOL(drm_sched_start_timeout_unlocked);
 
 /**
  * drm_sched_tdr_queue_imm: - immediately start job timeout handler
@@ -500,6 +501,7 @@ EXPORT_SYMBOL(drm_sched_fault);
  */
 unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
 {
+	spin_lock(&sched->job_list_lock);
 	unsigned long sched_timeout, now = jiffies;
 
 	sched_timeout = sched->work_tdr.timer.expires;
@@ -510,10 +512,16 @@ unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
 	 */
 	if (mod_delayed_work(sched->timeout_wq, &sched->work_tdr, MAX_SCHEDULE_TIMEOUT)
 		&& time_after(sched_timeout, now))
+	{
+		spin_unlock(&sched->job_list_lock);
 		return sched_timeout - now;
+	}
 	else
+	{
+		spin_unlock(&sched->job_list_lock);
 		return sched->timeout;
+	}
 }
 EXPORT_SYMBOL(drm_sched_suspend_timeout);
 
 /**
......
@@ -636,6 +636,7 @@ void drm_sched_fence_finished(struct drm_sched_fence *fence, int result);
 unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched);
 void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
 			      unsigned long remaining);
+void drm_sched_start_timeout_unlocked(struct drm_gpu_scheduler *sched);
 
 struct drm_gpu_scheduler *
 drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
 		    unsigned int num_sched_list);
......