Commit cab92f16 authored by Ashley Smith


Attempt to fix scheduling issues in panthor/drm_sched. Patch from bbrezillon with a few edits by me. My edits restart the timeout in queue_run_job: queuing a new job means the queue is making progress, so this should keep some long-running tasks from timing out.
parent 8af49358
Branches fix_flakes
Pipeline #126624 failed
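
To make the queue_run_job change below easier to follow, here is a minimal userspace C sketch of the idea: whenever a new job is queued and the queue's timeout is not suspended, the watchdog deadline is pushed back, so a queue that keeps making progress is not flagged as hung. Everything in this sketch is invented for illustration (watchdog, watchdog_restart, queue_push_job, and the 500 ms value); in the actual patch the restart is drm_sched_start_timeout_unlocked() on queue->scheduler, the flag is queue->timeout_suspended, and the timeout is JOB_TIMEOUT_MS.

/* Illustrative sketch only, not kernel code. */
#include <stdbool.h>
#include <stdio.h>

#define JOB_TIMEOUT_MS 500	/* stand-in value for the example */

struct watchdog {
	bool suspended;		/* mirrors queue->timeout_suspended */
	long expires_ms;	/* absolute deadline, -1 if disarmed */
};

static void watchdog_restart(struct watchdog *wd, long now_ms)
{
	/* Models drm_sched_start_timeout_unlocked(): re-arm the timeout
	 * a full JOB_TIMEOUT_MS from now. */
	wd->expires_ms = now_ms + JOB_TIMEOUT_MS;
}

static void queue_push_job(struct watchdog *wd, long now_ms)
{
	/* Models the addition to queue_run_job(): a new job means the
	 * queue is making progress, so push the deadline back unless the
	 * timeout is currently suspended (e.g. waiting for a CSG slot). */
	if (!wd->suspended)
		watchdog_restart(wd, now_ms);
}

int main(void)
{
	struct watchdog wd = { .suspended = false, .expires_ms = -1 };

	watchdog_restart(&wd, 0);	/* first job arms the timeout */
	queue_push_job(&wd, 400);	/* progress at t = 400 ms */
	printf("deadline now %ld ms\n", wd.expires_ms);	/* 900, not 500 */

	return 0;
}
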
@@ -1066,6 +1066,46 @@ cs_slot_prog_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs_id)
 	}
 }
 
+static bool
+group_is_idle(struct panthor_group *group)
+{
+	struct panthor_device *ptdev = group->ptdev;
+	u32 inactive_queues;
+
+	if (group->csg_id >= 0)
+		return ptdev->scheduler->csg_slots[group->csg_id].idle;
+
+	inactive_queues = group->idle_queues | group->blocked_queues;
+	return hweight32(inactive_queues) == group->queue_count;
+}
+
+static void
+suspend_queue_timeout(struct panthor_group *group, u32 queue_idx)
+{
+	struct panthor_queue *queue = group->queues[queue_idx];
+
+	/* The queue timeout is already suspended. */
+	if (queue->timeout_suspended)
+		return;
+
+	/* If the queue is idle, all jobs have landed or should land soon, and
+	 * there's no reason to suspend the timer.
+	 */
+	if (group->idle_queues & BIT(queue_idx))
+		return;
+
+	/* If the group is idle and our queue isn't, that means our queue is
+	 * blocked, in which case we want to keep the timeout active so we
+	 * can detect unbounded waits.
+	 */
+	if (group_is_idle(group))
+		return;
+
+	queue->remaining_time = drm_sched_suspend_timeout(&queue->scheduler);
+	WARN_ON(queue->remaining_time > msecs_to_jiffies(JOB_TIMEOUT_MS));
+	queue->timeout_suspended = true;
+}
+
 /**
  * cs_slot_reset_locked() - Reset a queue slot
  * @ptdev: Device.
@@ -1082,7 +1122,6 @@ cs_slot_reset_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs_id)
 {
 	struct panthor_fw_cs_iface *cs_iface = panthor_fw_get_cs_iface(ptdev, csg_id, cs_id);
 	struct panthor_group *group = ptdev->scheduler->csg_slots[csg_id].group;
-	struct panthor_queue *queue = group->queues[cs_id];
 
 	lockdep_assert_held(&ptdev->scheduler->lock);
 
@@ -1090,14 +1129,7 @@ cs_slot_reset_locked(struct panthor_device *ptdev, u32 csg_id, u32 cs_id)
 			       CS_STATE_STOP,
 			       CS_STATE_MASK);
 
-	/* If the queue is blocked, we want to keep the timeout running, so
-	 * we can detect unbounded waits and kill the group when that happens.
-	 */
-	if (!(group->blocked_queues & BIT(cs_id)) && !queue->timeout_suspended) {
-		queue->remaining_time = drm_sched_suspend_timeout(&queue->scheduler);
-		queue->timeout_suspended = true;
-		WARN_ON(queue->remaining_time > msecs_to_jiffies(JOB_TIMEOUT_MS));
-	}
+	suspend_queue_timeout(group, cs_id);
 
 	return 0;
 }
@@ -1879,19 +1911,6 @@ tick_ctx_is_full(const struct panthor_scheduler *sched,
 	return ctx->group_count == sched->csg_slot_count;
 }
 
-static bool
-group_is_idle(struct panthor_group *group)
-{
-	struct panthor_device *ptdev = group->ptdev;
-	u32 inactive_queues;
-
-	if (group->csg_id >= 0)
-		return ptdev->scheduler->csg_slots[group->csg_id].idle;
-
-	inactive_queues = group->idle_queues | group->blocked_queues;
-	return hweight32(inactive_queues) == group->queue_count;
-}
-
 static bool
 group_can_run(struct panthor_group *group)
 {
@@ -3156,17 +3175,15 @@ queue_run_job(struct drm_sched_job *sched_job)
 	queue->iface.input->extract = queue->iface.output->extract;
 	queue->iface.input->insert = job->ringbuf.end;
 
+	queue->scheduler.timeout = msecs_to_jiffies(JOB_TIMEOUT_MS);
+
 	if (group->csg_id < 0) {
 		/* If the queue is blocked, we want to keep the timeout running, so we
 		 * can detect unbounded waits and kill the group when that happens.
 		 * Otherwise, we suspend the timeout so the time we spend waiting for
 		 * a CSG slot is not counted.
 		 */
-		if (!(group->blocked_queues & BIT(job->queue_idx)) &&
-		    !queue->timeout_suspended) {
-			queue->remaining_time = drm_sched_suspend_timeout(&queue->scheduler);
-			queue->timeout_suspended = true;
-		}
+		suspend_queue_timeout(group, job->queue_idx);
 
 		group_schedule_locked(group, BIT(job->queue_idx));
 	} else {
@@ -3183,6 +3200,9 @@ queue_run_job(struct drm_sched_job *sched_job)
 	dma_fence_put(queue->fence_ctx.last_fence);
 	queue->fence_ctx.last_fence = dma_fence_get(job->done_fence);
 
+	if (!queue->timeout_suspended)
+		drm_sched_start_timeout_unlocked(&queue->scheduler);
+
 	done_fence = dma_fence_get(job->done_fence);
 
 out_unlock:
......
@@ -449,12 +449,13 @@ static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
 	mod_delayed_work(sched->timeout_wq, &sched->work_tdr, sched->timeout);
 }
 
-static void drm_sched_start_timeout_unlocked(struct drm_gpu_scheduler *sched)
+void drm_sched_start_timeout_unlocked(struct drm_gpu_scheduler *sched)
 {
 	spin_lock(&sched->job_list_lock);
 	drm_sched_start_timeout(sched);
 	spin_unlock(&sched->job_list_lock);
 }
+EXPORT_SYMBOL(drm_sched_start_timeout_unlocked);
 
 /**
  * drm_sched_tdr_queue_imm: - immediately start job timeout handler
@@ -500,6 +501,7 @@ EXPORT_SYMBOL(drm_sched_fault);
  */
 unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
 {
+	spin_lock(&sched->job_list_lock);
 	unsigned long sched_timeout, now = jiffies;
 
 	sched_timeout = sched->work_tdr.timer.expires;
@@ -510,10 +512,16 @@ unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched)
 	 */
 	if (mod_delayed_work(sched->timeout_wq, &sched->work_tdr, MAX_SCHEDULE_TIMEOUT)
 		&& time_after(sched_timeout, now))
+	{
+		spin_unlock(&sched->job_list_lock);
 		return sched_timeout - now;
+	}
 	else
+	{
+		spin_unlock(&sched->job_list_lock);
 		return sched->timeout;
+	}
 }
 EXPORT_SYMBOL(drm_sched_suspend_timeout);
 
 /**
......
@@ -636,6 +636,7 @@ void drm_sched_fence_finished(struct drm_sched_fence *fence, int result);
 unsigned long drm_sched_suspend_timeout(struct drm_gpu_scheduler *sched);
 void drm_sched_resume_timeout(struct drm_gpu_scheduler *sched,
 			      unsigned long remaining);
+void drm_sched_start_timeout_unlocked(struct drm_gpu_scheduler *sched);
 
 struct drm_gpu_scheduler *
 drm_sched_pick_best(struct drm_gpu_scheduler **sched_list,
 		    unsigned int num_sched_list);
......