Commit 8d286e2f authored by Dave Airlie

Merge tag 'drm-intel-next-fixes-2020-06-04' of git://anongit.freedesktop.org/drm/drm-intel into drm-next

- Includes gvt-next-fixes-2020-05-28
- Use after free fix for display global state.
- Whitelisting context-local timestamp on Gen9
  and two scheduler fixes with deps (Cc: stable)
- Removal of write flag from sysfs files where
  ineffective
Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20200604150454.GA59322@jlahtine-desk.ger.corp.intel.com
parents fa3fa222 f8665d79
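
For orientation before the diff: the display global-state fix below makes intel_global_state reference counted with a struct kref, so a committed old state stays alive until every holder has dropped its reference. The sketch that follows is a minimal user-space illustration of that get/put lifetime pattern, not i915 code; every name in it (demo_state, demo_state_get, and so on) is invented for the example.

```c
/*
 * Minimal user-space sketch of the refcounting pattern the patch applies to
 * intel_global_state: every holder takes a reference, and the object is only
 * freed when the last reference is dropped.  Names are illustrative only.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct demo_state {
	atomic_int ref;		/* stands in for struct kref */
	int value;
};

static struct demo_state *demo_state_create(int value)
{
	struct demo_state *st = malloc(sizeof(*st));

	if (!st)
		return NULL;
	atomic_init(&st->ref, 1);	/* like kref_init() */
	st->value = value;
	return st;
}

static struct demo_state *demo_state_get(struct demo_state *st)
{
	atomic_fetch_add(&st->ref, 1);	/* like kref_get() */
	return st;
}

static void demo_state_put(struct demo_state *st)
{
	/* like kref_put(): free only when the last reference goes away */
	if (atomic_fetch_sub(&st->ref, 1) == 1)
		free(st);
}

int main(void)
{
	struct demo_state *cur = demo_state_create(42);
	struct demo_state *old = demo_state_get(cur);	/* old-state holder */

	demo_state_put(cur);		/* swapping drops the owner's ref */
	printf("old state still valid: %d\n", old->value);
	demo_state_put(old);		/* last put frees the state */
	return 0;
}
```

The actual patch does the equivalent with kref_init()/kref_get()/kref_put() and frees the state through obj->funcs->atomic_destroy_state(), as the first file in the diff shows.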
@@ -10,6 +10,28 @@
 #include "intel_display_types.h"
 #include "intel_global_state.h"
 
+static void __intel_atomic_global_state_free(struct kref *kref)
+{
+	struct intel_global_state *obj_state =
+		container_of(kref, struct intel_global_state, ref);
+	struct intel_global_obj *obj = obj_state->obj;
+
+	obj->funcs->atomic_destroy_state(obj, obj_state);
+}
+
+static void intel_atomic_global_state_put(struct intel_global_state *obj_state)
+{
+	kref_put(&obj_state->ref, __intel_atomic_global_state_free);
+}
+
+static struct intel_global_state *
+intel_atomic_global_state_get(struct intel_global_state *obj_state)
+{
+	kref_get(&obj_state->ref);
+
+	return obj_state;
+}
+
 void intel_atomic_global_obj_init(struct drm_i915_private *dev_priv,
 				  struct intel_global_obj *obj,
 				  struct intel_global_state *state,
@@ -17,6 +39,10 @@ void intel_atomic_global_obj_init(struct drm_i915_private *dev_priv,
 {
 	memset(obj, 0, sizeof(*obj));
 
+	state->obj = obj;
+
+	kref_init(&state->ref);
+
 	obj->state = state;
 	obj->funcs = funcs;
 	list_add_tail(&obj->head, &dev_priv->global_obj_list);
@@ -28,7 +54,9 @@ void intel_atomic_global_obj_cleanup(struct drm_i915_private *dev_priv)
 	list_for_each_entry_safe(obj, next, &dev_priv->global_obj_list, head) {
 		list_del(&obj->head);
 
-		obj->funcs->atomic_destroy_state(obj, obj->state);
+		drm_WARN_ON(&dev_priv->drm, kref_read(&obj->state->ref) != 1);
+		intel_atomic_global_state_put(obj->state);
 	}
 }
@@ -97,10 +125,14 @@ intel_atomic_get_global_obj_state(struct intel_atomic_state *state,
 	if (!obj_state)
 		return ERR_PTR(-ENOMEM);
 
+	obj_state->obj = obj;
 	obj_state->changed = false;
 
+	kref_init(&obj_state->ref);
+
 	state->global_objs[index].state = obj_state;
-	state->global_objs[index].old_state = obj->state;
+	state->global_objs[index].old_state =
+		intel_atomic_global_state_get(obj->state);
 	state->global_objs[index].new_state = obj_state;
 	state->global_objs[index].ptr = obj;
 	obj_state->state = state;
@@ -163,7 +195,9 @@ void intel_atomic_swap_global_state(struct intel_atomic_state *state)
 		new_obj_state->state = NULL;
 
 		state->global_objs[i].state = old_obj_state;
-		obj->state = new_obj_state;
+
+		intel_atomic_global_state_put(obj->state);
+		obj->state = intel_atomic_global_state_get(new_obj_state);
 	}
 }
@@ -172,10 +206,9 @@ void intel_atomic_clear_global_state(struct intel_atomic_state *state)
 	int i;
 
 	for (i = 0; i < state->num_global_objs; i++) {
-		struct intel_global_obj *obj = state->global_objs[i].ptr;
-
-		obj->funcs->atomic_destroy_state(obj,
-						 state->global_objs[i].state);
+		intel_atomic_global_state_put(state->global_objs[i].old_state);
+		intel_atomic_global_state_put(state->global_objs[i].new_state);
+
 		state->global_objs[i].ptr = NULL;
 		state->global_objs[i].state = NULL;
 		state->global_objs[i].old_state = NULL;
...
@@ -6,6 +6,7 @@
 #ifndef __INTEL_GLOBAL_STATE_H__
 #define __INTEL_GLOBAL_STATE_H__
 
+#include <linux/kref.h>
 #include <linux/list.h>
 
 struct drm_i915_private;
@@ -54,7 +55,9 @@ struct intel_global_obj {
 		for_each_if(obj)
 
 struct intel_global_state {
+	struct intel_global_obj *obj;
 	struct intel_atomic_state *state;
+	struct kref ref;
 	bool changed;
 };
...
@@ -230,7 +230,7 @@ static void intel_context_set_gem(struct intel_context *ce,
 		ce->timeline = intel_timeline_get(ctx->timeline);
 
 	if (ctx->sched.priority >= I915_PRIORITY_NORMAL &&
-	    intel_engine_has_semaphores(ce->engine))
+	    intel_engine_has_timeslices(ce->engine))
 		__set_bit(CONTEXT_USE_SEMAPHORES, &ce->flags);
 }
@@ -1969,7 +1969,7 @@ static int __apply_priority(struct intel_context *ce, void *arg)
 {
 	struct i915_gem_context *ctx = arg;
 
-	if (!intel_engine_has_semaphores(ce->engine))
+	if (!intel_engine_has_timeslices(ce->engine))
 		return 0;
 
 	if (ctx->sched.priority >= I915_PRIORITY_NORMAL)
...
@@ -39,7 +39,6 @@ static int shmem_get_pages(struct drm_i915_gem_object *obj)
 	unsigned long last_pfn = 0; /* suppress gcc warning */
 	unsigned int max_segment = i915_sg_segment_size();
 	unsigned int sg_page_sizes;
-	struct pagevec pvec;
 	gfp_t noreclaim;
 	int ret;
@@ -192,6 +191,9 @@ static int shmem_get_pages(struct drm_i915_gem_object *obj)
 	sg_mark_end(sg);
 err_pages:
 	mapping_clear_unevictable(mapping);
+	if (sg != st->sgl) {
+		struct pagevec pvec;
+
 		pagevec_init(&pvec);
 		for_each_sgt_page(page, sgt_iter, st) {
 			if (!pagevec_add(&pvec, page))
@@ -199,6 +201,7 @@ static int shmem_get_pages(struct drm_i915_gem_object *obj)
 		}
 		if (pagevec_count(&pvec))
 			check_release_pagevec(&pvec);
+	}
 	sg_free_table(st);
 	kfree(st);
...
@@ -97,8 +97,6 @@ int __intel_context_do_pin(struct intel_context *ce)
 {
 	int err;
 
-	GEM_BUG_ON(intel_context_is_closed(ce));
-
 	if (unlikely(!test_bit(CONTEXT_ALLOC_BIT, &ce->flags))) {
 		err = intel_context_alloc_state(ce);
 		if (err)
...
@@ -124,7 +124,7 @@ int intel_gvt_init_vgpu_types(struct intel_gvt *gvt)
 	 */
 	low_avail = gvt_aperture_sz(gvt) - HOST_LOW_GM_SIZE;
 	high_avail = gvt_hidden_sz(gvt) - HOST_HIGH_GM_SIZE;
-	num_types = sizeof(vgpu_types) / sizeof(vgpu_types[0]);
+	num_types = ARRAY_SIZE(vgpu_types);
 
 	gvt->types = kcalloc(num_types, sizeof(struct intel_vgpu_type),
 			     GFP_KERNEL);
...
@@ -572,6 +572,9 @@ struct drm_i915_reg_descriptor {
 #define REG32(_reg, ...) \
 	{ .addr = (_reg), __VA_ARGS__ }
 
+#define REG32_IDX(_reg, idx) \
+	{ .addr = _reg(idx) }
+
 /*
  * Convenience macro for adding 64-bit registers.
  *
@@ -669,6 +672,7 @@ static const struct drm_i915_reg_descriptor gen9_blt_regs[] = {
 	REG64_IDX(RING_TIMESTAMP, BSD_RING_BASE),
 	REG32(BCS_SWCTRL),
 	REG64_IDX(RING_TIMESTAMP, BLT_RING_BASE),
+	REG32_IDX(RING_CTX_TIMESTAMP, BLT_RING_BASE),
 	REG64_IDX(BCS_GPR, 0),
 	REG64_IDX(BCS_GPR, 1),
 	REG64_IDX(BCS_GPR, 2),
...
@@ -173,7 +173,7 @@ i915_param_named(enable_gvt, bool, 0400,
 #endif
 
 #if IS_ENABLED(CONFIG_DRM_I915_UNSTABLE_FAKE_LMEM)
-i915_param_named_unsafe(fake_lmem_start, ulong, 0600,
+i915_param_named_unsafe(fake_lmem_start, ulong, 0400,
 	"Fake LMEM start offset (default: 0)");
 #endif
...
@@ -64,7 +64,7 @@ struct drm_printer;
 	param(int, mmio_debug, -IS_ENABLED(CONFIG_DRM_I915_DEBUG_MMIO), 0600) \
 	param(int, edp_vswing, 0, 0400) \
 	param(unsigned int, reset, 3, 0600) \
-	param(unsigned int, inject_probe_failure, 0, 0600) \
+	param(unsigned int, inject_probe_failure, 0, 0) \
 	param(int, fastboot, -1, 0600) \
 	param(int, enable_dpcd_backlight, -1, 0600) \
 	param(char *, force_probe, CONFIG_DRM_I915_FORCE_PROBE, 0400) \
...
@@ -121,8 +121,39 @@ static void i915_fence_release(struct dma_fence *fence)
 	i915_sw_fence_fini(&rq->submit);
 	i915_sw_fence_fini(&rq->semaphore);
 
-	/* Keep one request on each engine for reserved use under mempressure */
-	if (!cmpxchg(&rq->engine->request_pool, NULL, rq))
+	/*
+	 * Keep one request on each engine for reserved use under mempressure
+	 *
+	 * We do not hold a reference to the engine here and so have to be
+	 * very careful in what rq->engine we poke. The virtual engine is
+	 * referenced via the rq->context and we released that ref during
+	 * i915_request_retire(), ergo we must not dereference a virtual
+	 * engine here. Not that we would want to, as the only consumer of
+	 * the reserved engine->request_pool is the power management parking,
+	 * which must-not-fail, and that is only run on the physical engines.
+	 *
+	 * Since the request must have been executed to be have completed,
+	 * we know that it will have been processed by the HW and will
+	 * not be unsubmitted again, so rq->engine and rq->execution_mask
+	 * at this point is stable. rq->execution_mask will be a single
+	 * bit if the last and _only_ engine it could execution on was a
+	 * physical engine, if it's multiple bits then it started on and
+	 * could still be on a virtual engine. Thus if the mask is not a
+	 * power-of-two we assume that rq->engine may still be a virtual
+	 * engine and so a dangling invalid pointer that we cannot dereference
+	 *
+	 * For example, consider the flow of a bonded request through a virtual
+	 * engine. The request is created with a wide engine mask (all engines
+	 * that we might execute on). On processing the bond, the request mask
+	 * is reduced to one or more engines. If the request is subsequently
+	 * bound to a single engine, it will then be constrained to only
+	 * execute on that engine and never returned to the virtual engine
+	 * after timeslicing away, see __unwind_incomplete_requests(). Thus we
+	 * know that if the rq->execution_mask is a single bit, rq->engine
+	 * can be a physical engine with the exact corresponding mask.
+	 */
+	if (is_power_of_2(rq->execution_mask) &&
+	    !cmpxchg(&rq->engine->request_pool, NULL, rq))
 		return;
 
 	kmem_cache_free(global.slab_requests, rq);
@@ -326,6 +357,53 @@ void i915_request_retire_upto(struct i915_request *rq)
 	} while (i915_request_retire(tmp) && tmp != rq);
 }
 
+static struct i915_request * const *
+__engine_active(struct intel_engine_cs *engine)
+{
+	return READ_ONCE(engine->execlists.active);
+}
+
+static bool __request_in_flight(const struct i915_request *signal)
+{
+	struct i915_request * const *port, *rq;
+	bool inflight = false;
+
+	if (!i915_request_is_ready(signal))
+		return false;
+
+	/*
+	 * Even if we have unwound the request, it may still be on
+	 * the GPU (preempt-to-busy). If that request is inside an
+	 * unpreemptible critical section, it will not be removed. Some
+	 * GPU functions may even be stuck waiting for the paired request
+	 * (__await_execution) to be submitted and cannot be preempted
+	 * until the bond is executing.
+	 *
+	 * As we know that there are always preemption points between
+	 * requests, we know that only the currently executing request
+	 * may be still active even though we have cleared the flag.
+	 * However, we can't rely on our tracking of ELSP[0] to known
+	 * which request is currently active and so maybe stuck, as
+	 * the tracking maybe an event behind. Instead assume that
+	 * if the context is still inflight, then it is still active
+	 * even if the active flag has been cleared.
+	 */
+	if (!intel_context_inflight(signal->context))
+		return false;
+
+	rcu_read_lock();
+	for (port = __engine_active(signal->engine); (rq = *port); port++) {
+		if (rq->context == signal->context) {
+			inflight = i915_seqno_passed(rq->fence.seqno,
+						     signal->fence.seqno);
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return inflight;
+}
+
 static int
 __await_execution(struct i915_request *rq,
 		  struct i915_request *signal,
@@ -356,7 +434,7 @@ __await_execution(struct i915_request *rq,
 	}
 
 	spin_lock_irq(&signal->lock);
-	if (i915_request_is_active(signal)) {
+	if (i915_request_is_active(signal) || __request_in_flight(signal)) {
 		if (hook) {
 			hook(rq, &signal->fence);
 			i915_request_put(signal);
@@ -1022,37 +1100,91 @@ emit_semaphore_wait(struct i915_request *to,
					       I915_FENCE_GFP);
 }
 
+static bool intel_timeline_sync_has_start(struct intel_timeline *tl,
+					  struct dma_fence *fence)
+{
+	return __intel_timeline_sync_is_later(tl,
+					      fence->context,
+					      fence->seqno - 1);
+}
+
+static int intel_timeline_sync_set_start(struct intel_timeline *tl,
+					 const struct dma_fence *fence)
+{
+	return __intel_timeline_sync_set(tl, fence->context, fence->seqno - 1);
+}
+
 static int
-i915_request_await_request(struct i915_request *to, struct i915_request *from)
+__i915_request_await_execution(struct i915_request *to,
+			       struct i915_request *from,
+			       void (*hook)(struct i915_request *rq,
+					    struct dma_fence *signal))
 {
-	int ret;
-
-	GEM_BUG_ON(to == from);
-	GEM_BUG_ON(to->timeline == from->timeline);
-
-	if (i915_request_completed(from)) {
-		i915_sw_fence_set_error_once(&to->submit, from->fence.error);
+	int err;
+
+	GEM_BUG_ON(intel_context_is_barrier(from->context));
+
+	/* Submit both requests at the same time */
+	err = __await_execution(to, from, hook, I915_FENCE_GFP);
+	if (err)
+		return err;
+
+	/* Squash repeated depenendices to the same timelines */
+	if (intel_timeline_sync_has_start(i915_request_timeline(to),
+					  &from->fence))
 		return 0;
+
+	/*
+	 * Wait until the start of this request.
+	 *
+	 * The execution cb fires when we submit the request to HW. But in
+	 * many cases this may be long before the request itself is ready to
+	 * run (consider that we submit 2 requests for the same context, where
+	 * the request of interest is behind an indefinite spinner). So we hook
+	 * up to both to reduce our queues and keep the execution lag minimised
+	 * in the worst case, though we hope that the await_start is elided.
+	 */
+	err = i915_request_await_start(to, from);
+	if (err < 0)
+		return err;
+
+	/*
+	 * Ensure both start together [after all semaphores in signal]
+	 *
+	 * Now that we are queued to the HW at roughly the same time (thanks
+	 * to the execute cb) and are ready to run at roughly the same time
+	 * (thanks to the await start), our signaler may still be indefinitely
+	 * delayed by waiting on a semaphore from a remote engine. If our
+	 * signaler depends on a semaphore, so indirectly do we, and we do not
+	 * want to start our payload until our signaler also starts theirs.
+	 * So we wait.
+	 *
+	 * However, there is also a second condition for which we need to wait
+	 * for the precise start of the signaler. Consider that the signaler
+	 * was submitted in a chain of requests following another context
+	 * (with just an ordinary intra-engine fence dependency between the
+	 * two). In this case the signaler is queued to HW, but not for
+	 * immediate execution, and so we must wait until it reaches the
+	 * active slot.
+	 */
+	if (intel_engine_has_semaphores(to->engine) &&
+	    !i915_request_has_initial_breadcrumb(to)) {
+		err = __emit_semaphore_wait(to, from, from->fence.seqno - 1);
+		if (err < 0)
+			return err;
 	}
 
+	/* Couple the dependency tree for PI on this exposed to->fence */
 	if (to->engine->schedule) {
-		ret = i915_sched_node_add_dependency(&to->sched,
+		err = i915_sched_node_add_dependency(&to->sched,
 						     &from->sched,
-						     I915_DEPENDENCY_EXTERNAL);
-		if (ret < 0)
-			return ret;
+						     I915_DEPENDENCY_WEAK);
+		if (err < 0)
+			return err;
 	}
 
-	if (to->engine == from->engine)
-		ret = i915_sw_fence_await_sw_fence_gfp(&to->submit,
-						       &from->submit,
-						       I915_FENCE_GFP);
-	else
-		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
-	if (ret < 0)
-		return ret;
-
-	return 0;
+	return intel_timeline_sync_set_start(i915_request_timeline(to),
+					     &from->fence);
 }
 
 static void mark_external(struct i915_request *rq)
@@ -1105,23 +1237,20 @@ i915_request_await_external(struct i915_request *rq, struct dma_fence *fence)
 }
 
 int
-i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
+i915_request_await_execution(struct i915_request *rq,
+			     struct dma_fence *fence,
+			     void (*hook)(struct i915_request *rq,
+					  struct dma_fence *signal))
 {
 	struct dma_fence **child = &fence;
 	unsigned int nchild = 1;
 	int ret;
 
-	/*
-	 * Note that if the fence-array was created in signal-on-any mode,
-	 * we should *not* decompose it into its individual fences. However,
-	 * we don't currently store which mode the fence-array is operating
-	 * in. Fortunately, the only user of signal-on-any is private to
-	 * amdgpu and we should not see any incoming fence-array from
-	 * sync-file being in signal-on-any mode.
-	 */
 	if (dma_fence_is_array(fence)) {
 		struct dma_fence_array *array = to_dma_fence_array(fence);
 
+		/* XXX Error for signal-on-any fence arrays */
+
 		child = array->fences;
 		nchild = array->num_fences;
 		GEM_BUG_ON(!nchild);
@@ -1134,138 +1263,95 @@ i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
 			continue;
 		}
 
-		/*
-		 * Requests on the same timeline are explicitly ordered, along
-		 * with their dependencies, by i915_request_add() which ensures
-		 * that requests are submitted in-order through each ring.
-		 */
 		if (fence->context == rq->fence.context)
 			continue;
 
-		/* Squash repeated waits to the same timelines */
-		if (fence->context &&
-		    intel_timeline_sync_is_later(i915_request_timeline(rq),
-						 fence))
-			continue;
+		/*
+		 * We don't squash repeated fence dependencies here as we
+		 * want to run our callback in all cases.
+		 */
 
 		if (dma_fence_is_i915(fence))
-			ret = i915_request_await_request(rq, to_request(fence));
+			ret = __i915_request_await_execution(rq,
+							     to_request(fence),
+							     hook);
 		else
 			ret = i915_request_await_external(rq, fence);
 		if (ret < 0)
 			return ret;
-
-		/* Record the latest fence used against each timeline */
-		if (fence->context)
-			intel_timeline_sync_set(i915_request_timeline(rq),
-						fence);
 	} while (--nchild);
 
 	return 0;
 }
 
-static bool intel_timeline_sync_has_start(struct intel_timeline *tl,
-					  struct dma_fence *fence)
-{
-	return __intel_timeline_sync_is_later(tl,
-					      fence->context,
-					      fence->seqno - 1);
-}
-
-static int intel_timeline_sync_set_start(struct intel_timeline *tl,
-					 const struct dma_fence *fence)
+static int
+await_request_submit(struct i915_request *to, struct i915_request *from)
 {
-	return __intel_timeline_sync_set(tl, fence->context, fence->seqno - 1);
+	/*
+	 * If we are waiting on a virtual engine, then it may be
+	 * constrained to execute on a single engine *prior* to submission.
+	 * When it is submitted, it will be first submitted to the virtual
+	 * engine and then passed to the physical engine. We cannot allow
+	 * the waiter to be submitted immediately to the physical engine
+	 * as it may then bypass the virtual request.
+	 */
+	if (to->engine == READ_ONCE(from->engine))
+		return i915_sw_fence_await_sw_fence_gfp(&to->submit,
+							&from->submit,
+							I915_FENCE_GFP);
+	else
+		return __i915_request_await_execution(to, from, NULL);
 }
 
 static int
-__i915_request_await_execution(struct i915_request *to,
-			       struct i915_request *from,
-			       void (*hook)(struct i915_request *rq,
-					    struct dma_fence *signal))
+i915_request_await_request(struct i915_request *to, struct i915_request *from)
 {
-	int err;
-
-	GEM_BUG_ON(intel_context_is_barrier(from->context));
-
-	/* Submit both requests at the same time */
-	err = __await_execution(to, from, hook, I915_FENCE_GFP);
-	if (err)
-		return err;
-
-	/* Squash repeated depenendices to the same timelines */
-	if (intel_timeline_sync_has_start(i915_request_timeline(to),
-					  &from->fence))
+	int ret;
+
+	GEM_BUG_ON(to == from);
+	GEM_BUG_ON(to->timeline == from->timeline);
+
+	if (i915_request_completed(from)) {
+		i915_sw_fence_set_error_once(&to->submit, from->fence.error);
 		return 0;
-
-	/*
-	 * Wait until the start of this request.
-	 *
-	 * The execution cb fires when we submit the request to HW. But in
-	 * many cases this may be long before the request itself is ready to
-	 * run (consider that we submit 2 requests for the same context, where
-	 * the request of interest is behind an indefinite spinner). So we hook
-	 * up to both to reduce our queues and keep the execution lag minimised
-	 * in the worst case, though we hope that the await_start is elided.
-	 */
-	err = i915_request_await_start(to, from);
-	if (err < 0)
-		return err;
-
-	/*
-	 * Ensure both start together [after all semaphores in signal]
-	 *
-	 * Now that we are queued to the HW at roughly the same time (thanks
-	 * to the execute cb) and are ready to run at roughly the same time
-	 * (thanks to the await start), our signaler may still be indefinitely
-	 * delayed by waiting on a semaphore from a remote engine. If our
-	 * signaler depends on a semaphore, so indirectly do we, and we do not
-	 * want to start our payload until our signaler also starts theirs.
-	 * So we wait.
-	 *
-	 * However, there is also a second condition for which we need to wait
-	 * for the precise start of the signaler. Consider that the signaler
-	 * was submitted in a chain of requests following another context
-	 * (with just an ordinary intra-engine fence dependency between the
-	 * two). In this case the signaler is queued to HW, but not for
-	 * immediate execution, and so we must wait until it reaches the
-	 * active slot.
-	 */
-	if (intel_engine_has_semaphores(to->engine) &&
-	    !i915_request_has_initial_breadcrumb(to)) {
-		err = __emit_semaphore_wait(to, from, from->fence.seqno - 1);
-		if (err < 0)
-			return err;
 	}
 
-	/* Couple the dependency tree for PI on this exposed to->fence */
 	if (to->engine->schedule) {
-		err = i915_sched_node_add_dependency(&to->sched,
+		ret = i915_sched_node_add_dependency(&to->sched,
 						     &from->sched,
-						     I915_DEPENDENCY_WEAK);
-		if (err < 0)
-			return err;
+						     I915_DEPENDENCY_EXTERNAL);
+		if (ret < 0)
+			return ret;
 	}
 
-	return intel_timeline_sync_set_start(i915_request_timeline(to),
-					     &from->fence);
+	if (is_power_of_2(to->execution_mask | READ_ONCE(from->execution_mask)))
+		ret = await_request_submit(to, from);
+	else
+		ret = emit_semaphore_wait(to, from, I915_FENCE_GFP);
+	if (ret < 0)
+		return ret;
+
+	return 0;
 }
 
 int
-i915_request_await_execution(struct i915_request *rq,
-			     struct dma_fence *fence,
-			     void (*hook)(struct i915_request *rq,
-					  struct dma_fence *signal))
+i915_request_await_dma_fence(struct i915_request *rq, struct dma_fence *fence)
 {
 	struct dma_fence **child = &fence;
 	unsigned int nchild = 1;
 	int ret;
 
+	/*
+	 * Note that if the fence-array was created in signal-on-any mode,
+	 * we should *not* decompose it into its individual fences. However,
+	 * we don't currently store which mode the fence-array is operating
+	 * in. Fortunately, the only user of signal-on-any is private to
+	 * amdgpu and we should not see any incoming fence-array from
+	 * sync-file being in signal-on-any mode.
+	 */
 	if (dma_fence_is_array(fence)) {
 		struct dma_fence_array *array = to_dma_fence_array(fence);
 
-		/* XXX Error for signal-on-any fence arrays */
-
 		child = array->fences;
 		nchild = array->num_fences;
 		GEM_BUG_ON(!nchild);
@@ -1278,22 +1364,31 @@ i915_request_await_execution(struct i915_request *rq,
 			continue;
 		}
 
+		/*
+		 * Requests on the same timeline are explicitly ordered, along
+		 * with their dependencies, by i915_request_add() which ensures
+		 * that requests are submitted in-order through each ring.
+		 */
 		if (fence->context == rq->fence.context)
 			continue;
 
-		/*
-		 * We don't squash repeated fence dependencies here as we
-		 * want to run our callback in all cases.
-		 */
+		/* Squash repeated waits to the same timelines */
+		if (fence->context &&
+		    intel_timeline_sync_is_later(i915_request_timeline(rq),
+						 fence))
+			continue;
 
 		if (dma_fence_is_i915(fence))
-			ret = __i915_request_await_execution(rq,
-							     to_request(fence),
-							     hook);
+			ret = i915_request_await_request(rq, to_request(fence));
 		else
 			ret = i915_request_await_external(rq, fence);
 		if (ret < 0)
 			return ret;
+
+		/* Record the latest fence used against each timeline */
+		if (fence->context)
+			intel_timeline_sync_set(i915_request_timeline(rq),
+						fence);
 	} while (--nchild);
 
 	return 0;
...
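
A side note on the request changes above: they treat rq->execution_mask as a bitmask of engines the request may still run on, and use is_power_of_2() as a cheap test for "exactly one engine", i.e. the request can no longer be sitting on a virtual engine. A small stand-alone illustration of that bit trick follows; it is plain C written for this page, not i915 code.

```c
/*
 * Illustration of the power-of-two test used above: a request bound to a
 * single physical engine has exactly one bit set in its execution mask.
 * Plain C for demonstration only, not part of the i915 patch.
 */
#include <assert.h>
#include <stdbool.h>

static bool is_power_of_2(unsigned long n)
{
	/* same idiom as the kernel helper: non-zero and only one bit set */
	return n != 0 && (n & (n - 1)) == 0;
}

int main(void)
{
	assert(is_power_of_2(0x4));	/* one engine: rq->engine is stable */
	assert(!is_power_of_2(0x6));	/* several candidates: may be virtual */
	return 0;
}
```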
@@ -209,14 +209,6 @@ static void kick_submission(struct intel_engine_cs *engine,
 	if (!inflight)
 		goto unlock;
 
-	ENGINE_TRACE(engine,
-		     "bumping queue-priority-hint:%d for rq:%llx:%lld, inflight:%llx:%lld prio %d\n",
-		     prio,
-		     rq->fence.context, rq->fence.seqno,
-		     inflight->fence.context, inflight->fence.seqno,
-		     inflight->sched.attr.priority);
-	engine->execlists.queue_priority_hint = prio;
-
 	/*
 	 * If we are already the currently executing context, don't
 	 * bother evaluating if we should preempt ourselves.
@@ -224,6 +216,14 @@ static void kick_submission(struct intel_engine_cs *engine,
 	if (inflight->context == rq->context)
 		goto unlock;
 
+	ENGINE_TRACE(engine,
+		     "bumping queue-priority-hint:%d for rq:%llx:%lld, inflight:%llx:%lld prio %d\n",
+		     prio,
+		     rq->fence.context, rq->fence.seqno,
+		     inflight->fence.context, inflight->fence.seqno,
+		     inflight->sched.attr.priority);
+	engine->execlists.queue_priority_hint = prio;
+
 	if (need_preempt(prio, rq_prio(inflight)))
 		tasklet_hi_schedule(&engine->execlists.tasklet);
...