drm/i915: Defer breadcrumb emission

Move the actual emission of the breadcrumb for closing the request from i915_add_request() to the submit callback. (It can be moved later when required.) This allows us to defer the allocation of the global_seqno from request construction to actual submission, allowing us to emit the requests out of order (wrt to the order of their construction, they still will only be executed one all of their dependencies are resolved including that all earlier requests on their timeline have been submitted.) We have to specialise how we then emit the request in order to write into the preallocated space, rather than at the tail of the ringbuffer (which will have been advanced by the addition of new requests). Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-29-chris@chris-wilson.co.uk

drm/i915: Defer breadcrumb emission
Move the actual emission of the breadcrumb for closing the request from i915_add_request() to the submit callback. (It can be moved later when required.) This allows us to defer the allocation of the global_seqno from request construction to actual submission, allowing us to emit the requests out of order (wrt to the order of their construction, they still will only be executed one all of their dependencies are resolved including that all earlier requests on their timeline have been submitted.) We have to specialise how we then emit the request in order to write into the preallocated space, rather than at the tail of the ringbuffer (which will have been advanced by the addition of new requests). Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Reviewed-by: Joonas Lahtinen <joonas.lahtinen@linux.intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20161028125858.23563-29-chris@chris-wilson.co.uk
caddfe71 · Chris Wilson · 98f29e8d · caddfe71 · caddfe71 · caddfe71
Commit caddfe71 authored Oct 28, 2016 by Chris Wilson
4 changed files
--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -318,17 +318,16 @@ submit_notify(struct i915_sw_fence *fence, enum i915_sw_fence_notify state)
 		container_of(fence, typeof(*request), submit);
 	struct intel_engine_cs *engine = request->engine;

+	if (state != FENCE_COMPLETE)
+		return NOTIFY_DONE;
+
 	/* Will be called from irq-context when using foreign DMA fences */

-	switch (state) {
-	case FENCE_COMPLETE:
-		engine->timeline->last_submitted_seqno = request->fence.seqno;
-		engine->submit_request(request);
-		break;
+	engine->timeline->last_submitted_seqno = request->fence.seqno;

-	case FENCE_FREE:
-		break;
-	}
+	engine->emit_breadcrumb(request,
+				request->ring->vaddr + request->postfix);
+	engine->submit_request(request);

 	return NOTIFY_DONE;
 }
@@ -648,9 +647,7 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 	struct intel_ring *ring = request->ring;
 	struct intel_timeline *timeline = request->timeline;
 	struct drm_i915_gem_request *prev;
-	u32 request_start;
-	u32 reserved_tail;
-	int ret;
+	int err;

 	lockdep_assert_held(&request->i915->drm.struct_mutex);
 	trace_i915_gem_request_add(request);
@@ -660,8 +657,6 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 	 * should already have been reserved in the ring buffer. Let the ring
 	 * know that it is time to use that space up.
 	 */
-	request_start = ring->tail;
-	reserved_tail = request->reserved_space;
 	request->reserved_space = 0;

 	/*
@@ -672,10 +667,10 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 	 * what.
 	 */
 	if (flush_caches) {
-		ret = engine->emit_flush(request, EMIT_FLUSH);
+		err = engine->emit_flush(request, EMIT_FLUSH);

 		/* Not allowed to fail! */
-		WARN(ret, "engine->emit_flush() failed: %d!\n", ret);
+		WARN(err, "engine->emit_flush() failed: %d!\n", err);
 	}

 	/* Record the position of the start of the breadcrumb so that
@@ -683,20 +678,10 @@ void __i915_add_request(struct drm_i915_gem_request *request, bool flush_caches)
 	 * GPU processing the request, we never over-estimate the
 	 * position of the ring's HEAD.
 	 */
+	err = intel_ring_begin(request, engine->emit_breadcrumb_sz);
+	GEM_BUG_ON(err);
 	request->postfix = ring->tail;
-
-	/* Not allowed to fail! */
-	ret = engine->emit_breadcrumb(request);
-	WARN(ret, "(%s)->emit_breadcrumb failed: %d!\n", engine->name, ret);
-
-	/* Sanity check that the reserved size was large enough. */
-	ret = ring->tail - request_start;
-	if (ret < 0)
-		ret += ring->size;
-	WARN_ONCE(ret > reserved_tail,
-		  "Not enough space reserved (%d bytes) "
-		  "for adding the request (%d bytes)\n",
-		  reserved_tail, ret);
+	ring->tail += engine->emit_breadcrumb_sz * sizeof(u32);

 	/* Seal the request and mark it as pending execution. Note that
 	 * we may inspect this state, without holding any locks, during

--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -365,7 +365,7 @@ static u64 execlists_update_context(struct drm_i915_gem_request *rq)
 	struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
 	u32 *reg_state = ce->lrc_reg_state;

-	reg_state[CTX_RING_TAIL+1] = intel_ring_offset(rq->ring, rq->tail);
+	reg_state[CTX_RING_TAIL+1] = rq->tail;

 	/* True 32b PPGTT with dynamic page allocation: update PDP
 	 * registers and point the unallocated PDPs to scratch page.
@@ -599,6 +599,15 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)

 	spin_lock_irqsave(&engine->execlist_lock, flags);

+	/* We keep the previous context alive until we retire the following
+	 * request. This ensures that any the context object is still pinned
+	 * for any residual writes the HW makes into it on the context switch
+	 * into the next object following the breadcrumb. Otherwise, we may
+	 * retire the context too early.
+	 */
+	request->previous_context = engine->last_context;
+	engine->last_context = request->ctx;
+
 	list_add_tail(&request->execlist_link, &engine->execlist_queue);
 	if (execlists_elsp_idle(engine))
 		tasklet_hi_schedule(&engine->irq_tasklet);
@@ -671,46 +680,6 @@ int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request
 	return ret;
 }

-/*
- * intel_logical_ring_advance() - advance the tail and prepare for submission
- * @request: Request to advance the logical ringbuffer of.
- *
- * The tail is updated in our logical ringbuffer struct, not in the actual context. What
- * really happens during submission is that the context and current tail will be placed
- * on a queue waiting for the ELSP to be ready to accept a new context submission. At that
- * point, the tail *inside* the context is updated and the ELSP written to.
- */
-static int
-intel_logical_ring_advance(struct drm_i915_gem_request *request)
-{
-	struct intel_ring *ring = request->ring;
-	struct intel_engine_cs *engine = request->engine;
-
-	intel_ring_advance(ring);
-	request->tail = ring->tail;
-
-	/*
-	 * Here we add two extra NOOPs as padding to avoid
-	 * lite restore of a context with HEAD==TAIL.
-	 *
-	 * Caller must reserve WA_TAIL_DWORDS for us!
-	 */
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
-	request->wa_tail = ring->tail;
-
-	/* We keep the previous context alive until we retire the following
-	 * request. This ensures that any the context object is still pinned
-	 * for any residual writes the HW makes into it on the context switch
-	 * into the next object following the breadcrumb. Otherwise, we may
-	 * retire the context too early.
-	 */
-	request->previous_context = engine->last_context;
-	engine->last_context = request->ctx;
-	return 0;
-}
-
 static int intel_lr_context_pin(struct i915_gem_context *ctx,
 				struct intel_engine_cs *engine)
 {
@@ -1566,41 +1535,35 @@ static void bxt_a_seqno_barrier(struct intel_engine_cs *engine)
 * used as a workaround for not being allowed to do lite
 * restore with HEAD==TAIL (WaIdleLiteRestore).
 */
-
-static int gen8_emit_breadcrumb(struct drm_i915_gem_request *request)
+static void gen8_emit_wa_tail(struct drm_i915_gem_request *request, u32 *out)
 {
-	struct intel_ring *ring = request->ring;
-	int ret;
-
-	ret = intel_ring_begin(request, 6 + WA_TAIL_DWORDS);
-	if (ret)
-		return ret;
+	*out++ = MI_NOOP;
+	*out++ = MI_NOOP;
+	request->wa_tail = intel_ring_offset(request->ring, out);
+}

+static void gen8_emit_breadcrumb(struct drm_i915_gem_request *request,
+				 u32 *out)
+{
 	/* w/a: bit 5 needs to be zero for MI_FLUSH_DW address. */
 	BUILD_BUG_ON(I915_GEM_HWS_INDEX_ADDR & (1 << 5));

-	intel_ring_emit(ring, (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW);
-	intel_ring_emit(ring,
-			intel_hws_seqno_address(request->engine) |
-			MI_FLUSH_DW_USE_GTT);
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, request->global_seqno);
-	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_emit(ring, MI_NOOP);
-	return intel_logical_ring_advance(request);
+	*out++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
+	*out++ = intel_hws_seqno_address(request->engine) | MI_FLUSH_DW_USE_GTT;
+	*out++ = 0;
+	*out++ = request->global_seqno;
+	*out++ = MI_USER_INTERRUPT;
+	*out++ = MI_NOOP;
+	request->tail = intel_ring_offset(request->ring, out);
+
+	gen8_emit_wa_tail(request, out);
 }

 static const int gen8_emit_breadcrumb_sz = 6 + WA_TAIL_DWORDS;

-static int gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request)
+static void gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request,
+					u32 *out)
 {
-	struct intel_ring *ring = request->ring;
-	int ret;
-
-	ret = intel_ring_begin(request, 8 + WA_TAIL_DWORDS);
-	if (ret)
-		return ret;
-
 	/* We're using qword write, seqno should be aligned to 8 bytes. */
 	BUILD_BUG_ON(I915_GEM_HWS_INDEX & 1);

@@ -1608,19 +1571,20 @@ static int gen8_emit_breadcrumb_render(struct drm_i915_gem_request *request)
 	 * need a prior CS_STALL, which is emitted by the flush
 	 * following the batch.
 	 */
-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-	intel_ring_emit(ring,
-			(PIPE_CONTROL_GLOBAL_GTT_IVB |
-			 PIPE_CONTROL_CS_STALL |
-			 PIPE_CONTROL_QW_WRITE));
-	intel_ring_emit(ring, intel_hws_seqno_address(request->engine));
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, request->global_seqno);
+	*out++ = GFX_OP_PIPE_CONTROL(6);
+	*out++ = (PIPE_CONTROL_GLOBAL_GTT_IVB |
+		  PIPE_CONTROL_CS_STALL |
+		  PIPE_CONTROL_QW_WRITE);
+	*out++ = intel_hws_seqno_address(request->engine);
+	*out++ = 0;
+	*out++ = request->global_seqno;
 	/* We're thrashing one dword of HWS. */
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_emit(ring, MI_NOOP);
-	return intel_logical_ring_advance(request);
+	*out++ = 0;
+	*out++ = MI_USER_INTERRUPT;
+	*out++ = MI_NOOP;
+	request->tail = intel_ring_offset(request->ring, out);
+
+	gen8_emit_wa_tail(request, out);
 }

 static const int gen8_emit_breadcrumb_render_sz = 8 + WA_TAIL_DWORDS;

--- a/drivers/gpu/drm/i915/intel_ringbuffer.c
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.c
@@ -1213,90 +1213,62 @@ static void render_ring_cleanup(struct intel_engine_cs *engine)
 	i915_vma_unpin_and_release(&dev_priv->semaphore);
 }

-static int gen8_rcs_signal(struct drm_i915_gem_request *req)
+static u32 *gen8_rcs_signal(struct drm_i915_gem_request *req, u32 *out)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = req->i915;
 	struct intel_engine_cs *waiter;
 	enum intel_engine_id id;
-	int ret, num_rings;
-
-	num_rings = INTEL_INFO(dev_priv)->num_rings;
-	ret = intel_ring_begin(req, (num_rings-1) * 8);
-	if (ret)
-		return ret;

 	for_each_engine(waiter, dev_priv, id) {
 		u64 gtt_offset = req->engine->semaphore.signal_ggtt[id];
 		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
 			continue;

-		intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-		intel_ring_emit(ring,
-				PIPE_CONTROL_GLOBAL_GTT_IVB |
-				PIPE_CONTROL_QW_WRITE |
-				PIPE_CONTROL_CS_STALL);
-		intel_ring_emit(ring, lower_32_bits(gtt_offset));
-		intel_ring_emit(ring, upper_32_bits(gtt_offset));
-		intel_ring_emit(ring, req->global_seqno);
-		intel_ring_emit(ring, 0);
-		intel_ring_emit(ring,
-				MI_SEMAPHORE_SIGNAL |
-				MI_SEMAPHORE_TARGET(waiter->hw_id));
-		intel_ring_emit(ring, 0);
+		*out++ = GFX_OP_PIPE_CONTROL(6);
+		*out++ = (PIPE_CONTROL_GLOBAL_GTT_IVB |
+			  PIPE_CONTROL_QW_WRITE |
+			  PIPE_CONTROL_CS_STALL);
+		*out++ = lower_32_bits(gtt_offset);
+		*out++ = upper_32_bits(gtt_offset);
+		*out++ = req->global_seqno;
+		*out++ = 0;
+		*out++ = (MI_SEMAPHORE_SIGNAL |
+			  MI_SEMAPHORE_TARGET(waiter->hw_id));
+		*out++ = 0;
 	}
-	intel_ring_advance(ring);

-	return 0;
+	return out;
 }

-static int gen8_xcs_signal(struct drm_i915_gem_request *req)
+static u32 *gen8_xcs_signal(struct drm_i915_gem_request *req, u32 *out)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = req->i915;
 	struct intel_engine_cs *waiter;
 	enum intel_engine_id id;
-	int ret, num_rings;
-
-	num_rings = INTEL_INFO(dev_priv)->num_rings;
-	ret = intel_ring_begin(req, (num_rings-1) * 6);
-	if (ret)
-		return ret;

 	for_each_engine(waiter, dev_priv, id) {
 		u64 gtt_offset = req->engine->semaphore.signal_ggtt[id];
 		if (gtt_offset == MI_SEMAPHORE_SYNC_INVALID)
 			continue;

-		intel_ring_emit(ring,
-				(MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW);
-		intel_ring_emit(ring,
-				lower_32_bits(gtt_offset) |
-				MI_FLUSH_DW_USE_GTT);
-		intel_ring_emit(ring, upper_32_bits(gtt_offset));
-		intel_ring_emit(ring, req->global_seqno);
-		intel_ring_emit(ring,
-				MI_SEMAPHORE_SIGNAL |
-				MI_SEMAPHORE_TARGET(waiter->hw_id));
-		intel_ring_emit(ring, 0);
+		*out++ = (MI_FLUSH_DW + 1) | MI_FLUSH_DW_OP_STOREDW;
+		*out++ = lower_32_bits(gtt_offset) | MI_FLUSH_DW_USE_GTT;
+		*out++ = upper_32_bits(gtt_offset);
+		*out++ = req->global_seqno;
+		*out++ = (MI_SEMAPHORE_SIGNAL |
+			  MI_SEMAPHORE_TARGET(waiter->hw_id));
+		*out++ = 0;
 	}
-	intel_ring_advance(ring);

-	return 0;
+	return out;
 }

-static int gen6_signal(struct drm_i915_gem_request *req)
+static u32 *gen6_signal(struct drm_i915_gem_request *req, u32 *out)
 {
-	struct intel_ring *ring = req->ring;
 	struct drm_i915_private *dev_priv = req->i915;
 	struct intel_engine_cs *engine;
 	enum intel_engine_id id;
-	int ret, num_rings;
-
-	num_rings = INTEL_INFO(dev_priv)->num_rings;
-	ret = intel_ring_begin(req, round_up((num_rings-1) * 3, 2));
-	if (ret)
-		return ret;
+	int num_rings = 0;

 	for_each_engine(engine, dev_priv, id) {
 		i915_reg_t mbox_reg;
@@ -1306,46 +1278,34 @@ static int gen6_signal(struct drm_i915_gem_request *req)

 		mbox_reg = req->engine->semaphore.mbox.signal[engine->hw_id];
 		if (i915_mmio_reg_valid(mbox_reg)) {
-			intel_ring_emit(ring, MI_LOAD_REGISTER_IMM(1));
-			intel_ring_emit_reg(ring, mbox_reg);
-			intel_ring_emit(ring, req->global_seqno);
+			*out++ = MI_LOAD_REGISTER_IMM(1);
+			*out++ = i915_mmio_reg_offset(mbox_reg);
+			*out++ = req->global_seqno;
+			num_rings++;
 		}
 	}
+	if (num_rings & 1)
+		*out++ = MI_NOOP;

-	/* If num_dwords was rounded, make sure the tail pointer is correct */
-	if (num_rings % 2 == 0)
-		intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
-
-	return 0;
+	return out;
 }

 static void i9xx_submit_request(struct drm_i915_gem_request *request)
 {
 	struct drm_i915_private *dev_priv = request->i915;

-	I915_WRITE_TAIL(request->engine,
-			intel_ring_offset(request->ring, request->tail));
+	I915_WRITE_TAIL(request->engine, request->tail);
 }

-static int i9xx_emit_breadcrumb(struct drm_i915_gem_request *req)
+static void i9xx_emit_breadcrumb(struct drm_i915_gem_request *req,
+				 u32 *out)
 {
-	struct intel_ring *ring = req->ring;
-	int ret;
-
-	ret = intel_ring_begin(req, 4);
-	if (ret)
-		return ret;
-
-	intel_ring_emit(ring, MI_STORE_DWORD_INDEX);
-	intel_ring_emit(ring, I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT);
-	intel_ring_emit(ring, req->global_seqno);
-	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_advance(ring);
-
-	req->tail = ring->tail;
+	*out++ = MI_STORE_DWORD_INDEX;
+	*out++ = I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT;
+	*out++ = req->global_seqno;
+	*out++ = MI_USER_INTERRUPT;

-	return 0;
+	req->tail = intel_ring_offset(req->ring, out);
 }

 static const int i9xx_emit_breadcrumb_sz = 4;
@@ -1358,49 +1318,34 @@ static const int i9xx_emit_breadcrumb_sz = 4;
 * Update the mailbox registers in the *other* rings with the current seqno.
 * This acts like a signal in the canonical semaphore.
 */
-static int gen6_sema_emit_breadcrumb(struct drm_i915_gem_request *req)
+static void gen6_sema_emit_breadcrumb(struct drm_i915_gem_request *req,
+				      u32 *out)
 {
-	int ret;
-
-	ret = req->engine->semaphore.signal(req);
-	if (ret)
-		return ret;
-
-	return i9xx_emit_breadcrumb(req);
+	return i9xx_emit_breadcrumb(req,
+				    req->engine->semaphore.signal(req, out));
 }

-static int gen8_render_emit_breadcrumb(struct drm_i915_gem_request *req)
+static void gen8_render_emit_breadcrumb(struct drm_i915_gem_request *req,
+					u32 *out)
 {
 	struct intel_engine_cs *engine = req->engine;
-	struct intel_ring *ring = req->ring;
-	int ret;

-	if (engine->semaphore.signal) {
-		ret = engine->semaphore.signal(req);
-		if (ret)
-			return ret;
-	}
-
-	ret = intel_ring_begin(req, 8);
-	if (ret)
-		return ret;
+	if (engine->semaphore.signal)
+		out = engine->semaphore.signal(req, out);

-	intel_ring_emit(ring, GFX_OP_PIPE_CONTROL(6));
-	intel_ring_emit(ring, (PIPE_CONTROL_GLOBAL_GTT_IVB |
+	*out++ = GFX_OP_PIPE_CONTROL(6);
+	*out++ = (PIPE_CONTROL_GLOBAL_GTT_IVB |
 			       PIPE_CONTROL_CS_STALL |
-			       PIPE_CONTROL_QW_WRITE));
-	intel_ring_emit(ring, intel_hws_seqno_address(engine));
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, req->global_seqno);
+			       PIPE_CONTROL_QW_WRITE);
+	*out++ = intel_hws_seqno_address(engine);
+	*out++ = 0;
+	*out++ = req->global_seqno;
 	/* We're thrashing one dword of HWS. */
-	intel_ring_emit(ring, 0);
-	intel_ring_emit(ring, MI_USER_INTERRUPT);
-	intel_ring_emit(ring, MI_NOOP);
-	intel_ring_advance(ring);
-
-	req->tail = ring->tail;
+	*out++ = 0;
+	*out++ = MI_USER_INTERRUPT;
+	*out++ = MI_NOOP;

-	return 0;
+	req->tail = intel_ring_offset(req->ring, out);
 }

 static const int gen8_render_emit_breadcrumb_sz = 8;

--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -255,7 +255,8 @@ struct intel_engine_cs {
 #define I915_DISPATCH_SECURE BIT(0)
 #define I915_DISPATCH_PINNED BIT(1)
 #define I915_DISPATCH_RS     BIT(2)
-	int		(*emit_breadcrumb)(struct drm_i915_gem_request *req);
+	void		(*emit_breadcrumb)(struct drm_i915_gem_request *req,
+					   u32 *out);
 	int		emit_breadcrumb_sz;

 	/* Pass the request to the hardware queue (e.g. directly into
@@ -331,7 +332,7 @@ struct intel_engine_cs {
 		/* AKA wait() */
 		int	(*sync_to)(struct drm_i915_gem_request *req,
 				   struct drm_i915_gem_request *signal);
-		int	(*signal)(struct drm_i915_gem_request *req);
+		u32	*(*signal)(struct drm_i915_gem_request *req, u32 *out);
 	} semaphore;

 	/* Execlists */
@@ -487,10 +488,11 @@ static inline void intel_ring_advance(struct intel_ring *ring)
 	 */
 }

-static inline u32 intel_ring_offset(struct intel_ring *ring, u32 value)
+static inline u32 intel_ring_offset(struct intel_ring *ring, void *addr)
 {
 	/* Don't write ring->size (equivalent to 0) as that hangs some GPUs. */
-	return value & (ring->size - 1);
+	u32 offset = addr - ring->vaddr;
+	return offset & (ring->size - 1);
 }

 int __intel_ring_space(int head, int tail, int size);