diff --git a/drivers/gpu/drm/i915/Kconfig.profile b/drivers/gpu/drm/i915/Kconfig.profile index 48df8889a88a7fecc31a5e0979c39761c697d57d..3a3881d5e44ba1388594fc30b71e98b1b542de77 100644 --- a/drivers/gpu/drm/i915/Kconfig.profile +++ b/drivers/gpu/drm/i915/Kconfig.profile @@ -25,3 +25,14 @@ config DRM_I915_SPIN_REQUEST May be 0 to disable the initial spin. In practice, we estimate the cost of enabling the interrupt (if currently disabled) to be a few microseconds. + +config DRM_I915_STOP_TIMEOUT + int "How long to wait for an engine to quiesce gracefully before reset (ms)" + default 100 # milliseconds + help + By stopping submission and sleeping for a short time before resetting + the GPU, we allow the innocent contexts also on the system to quiesce. + It is then less likely for a hanging context to cause collateral + damage as the system is reset in order to recover. The corollary is + that the reset itself may take longer and so be more disruptive to + interactive or low latency workloads. diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index 0e20713603ec1f6d3f1b4368d7f3ac1c667b7d10..e4203eb4413924730c9156e0dd9384b2c478023c 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -308,6 +308,9 @@ static int intel_engine_setup(struct intel_gt *gt, enum intel_engine_id id) engine->instance = info->instance; __sprint_engine_name(engine); + engine->props.stop_timeout_ms = + CONFIG_DRM_I915_STOP_TIMEOUT; + /* * To be overridden by the backend on setup. However to facilitate * cleanup on error during setup, we always provide the destroy vfunc. @@ -875,6 +878,21 @@ u64 intel_engine_get_last_batch_head(const struct intel_engine_cs *engine) return bbaddr; } +static unsigned long stop_timeout(const struct intel_engine_cs *engine) +{ + if (in_atomic() || irqs_disabled()) /* inside atomic preempt-reset? */ + return 0; + + /* + * If we are doing a normal GPU reset, we can take our time and allow + * the engine to quiesce. We've stopped submission to the engine, and + * if we wait long enough an innocent context should complete and + * leave the engine idle. So they should not be caught unaware by + * the forthcoming GPU reset (which usually follows the stop_cs)! + */ + return READ_ONCE(engine->props.stop_timeout_ms); +} + int intel_engine_stop_cs(struct intel_engine_cs *engine) { struct intel_uncore *uncore = engine->uncore; @@ -892,7 +910,7 @@ int intel_engine_stop_cs(struct intel_engine_cs *engine) err = 0; if (__intel_wait_for_register_fw(uncore, mode, MODE_IDLE, MODE_IDLE, - 1000, 0, + 1000, stop_timeout(engine), NULL)) { GEM_TRACE("%s: timed out on STOP_RING -> IDLE\n", engine->name); err = -ETIMEDOUT; diff --git a/drivers/gpu/drm/i915/gt/intel_engine_types.h b/drivers/gpu/drm/i915/gt/intel_engine_types.h index 3451be034caf4c3b6396dea6249c1c0f577ac249..87d5c4ef3ae7754ac79363077a2c2aba83047cef 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_types.h +++ b/drivers/gpu/drm/i915/gt/intel_engine_types.h @@ -542,6 +542,10 @@ struct intel_engine_cs { */ ktime_t total; } stats; + + struct { + unsigned long stop_timeout_ms; + } props; }; static inline bool