drm/i915: Simplify ELSP queue request tracking

Emulate HW to track and manage the ELSP queue. A set of SW ports is defined, and requests are assigned to these ports before submitting them to HW. This makes cleaning up incomplete requests during reset recovery easier, especially after an engine reset, by decoupling ELSP queue management. This will become clearer in the next patch. In the engine reset case we want to resume where we left off after skipping the incomplete batch, which requires checking the ELSP queue, removing elements and fixing elsp_submitted counts in some cases. Instead of directly manipulating the ELSP queue from the reset path, we can examine these ports, fix up ringbuffer pointers using the incomplete request and restart submissions after reset. Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Mika Kuoppala <mika.kuoppala@intel.com> Cc: Arun Siluvery <arun.siluvery@linux.intel.com> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Link: http://patchwork.freedesktop.org/patch/msgid/1470414607-32453-3-git-send-email-arun.siluvery@linux.intel.com Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20160909131201.16673-6-chris@chris-wilson.co.uk

drm/i915: Simplify ELSP queue request tracking
Emulate HW to track and manage the ELSP queue. A set of SW ports is defined, and requests are assigned to these ports before submitting them to HW. This makes cleaning up incomplete requests during reset recovery easier, especially after an engine reset, by decoupling ELSP queue management. This will become clearer in the next patch. In the engine reset case we want to resume where we left off after skipping the incomplete batch, which requires checking the ELSP queue, removing elements and fixing elsp_submitted counts in some cases. Instead of directly manipulating the ELSP queue from the reset path, we can examine these ports, fix up ringbuffer pointers using the incomplete request and restart submissions after reset. Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com> Cc: Mika Kuoppala <mika.kuoppala@intel.com> Cc: Arun Siluvery <arun.siluvery@linux.intel.com> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Link: http://patchwork.freedesktop.org/patch/msgid/1470414607-32453-3-git-send-email-arun.siluvery@linux.intel.com Reviewed-by: Mika Kuoppala <mika.kuoppala@intel.com> Link: http://patchwork.freedesktop.org/patch/msgid/20160909131201.16673-6-chris@chris-wilson.co.uk
70c2a24d · Chris Wilson · bbd6c47e · 70c2a24d · 70c2a24d · 70c2a24d
Commit 70c2a24d authored Sep 09, 2016 by Chris Wilson
7 changed files
--- a/drivers/gpu/drm/i915/i915_debugfs.c
+++ b/drivers/gpu/drm/i915/i915_debugfs.c
@@ -2051,7 +2051,7 @@ static int i915_execlists(struct seq_file *m, void *data)
 		status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(engine));
 		seq_printf(m, "\tStatus pointer: 0x%08X\n", status_pointer);
-		read_pointer = engine->next_context_status_buffer;
+		read_pointer = GEN8_CSB_READ_PTR(status_pointer);
 		write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
 		if (read_pointer > write_pointer)
 			write_pointer += GEN8_CSB_ENTRIES;

--- a/drivers/gpu/drm/i915/i915_gem.c
+++ b/drivers/gpu/drm/i915/i915_gem.c
@@ -2575,6 +2575,9 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
 	struct drm_i915_gem_request *request;
 	struct intel_ring *ring;
+	/* Ensure irq handler finishes, and not run again. */
+	tasklet_kill(&engine->irq_tasklet);
 	/* Mark all pending requests as complete so that any concurrent
 	 * (lockless) lookup doesn't try and wait upon the request as we
 	 * reset it.
@@ -2588,10 +2591,12 @@ static void i915_gem_reset_engine_cleanup(struct intel_engine_cs *engine)
 	 */
 	if (i915.enable_execlists) {
-		/* Ensure irq handler finishes or is cancelled. */
+		spin_lock(&engine->execlist_lock);
-		tasklet_kill(&engine->irq_tasklet);
+		INIT_LIST_HEAD(&engine->execlist_queue);
+		i915_gem_request_put(engine->execlist_port[0].request);
-		intel_execlists_cancel_requests(engine);
+		i915_gem_request_put(engine->execlist_port[1].request);
+		memset(engine->execlist_port, 0, sizeof(engine->execlist_port));
+		spin_unlock(&engine->execlist_lock);
 	}
 	/*

--- a/drivers/gpu/drm/i915/i915_gem_request.c
+++ b/drivers/gpu/drm/i915/i915_gem_request.c
@@ -402,7 +402,6 @@ i915_gem_request_alloc(struct intel_engine_cs *engine,
 	req->previous_context = NULL;
 	req->file_priv = NULL;
 	req->batch = NULL;
-	req->elsp_submitted = 0;
 	/*
 	 * Reserve space in the ring buffer for all the commands required to

--- a/drivers/gpu/drm/i915/i915_gem_request.h
+++ b/drivers/gpu/drm/i915/i915_gem_request.h
@@ -137,27 +137,8 @@ struct drm_i915_gem_request {
 	/** file_priv list entry for this request */
 	struct list_head client_list;
-	/**
+	/** Link in the execlist submission queue, guarded by execlist_lock. */
-	 * The ELSP only accepts two elements at a time, so we queue
-	 * context/tail pairs on a given queue (ring->execlist_queue) until the
-	 * hardware is available. The queue serves a double purpose: we also use
-	 * it to keep track of the up to 2 contexts currently in the hardware
-	 * (usually one in execution and the other queued up by the GPU): We
-	 * only remove elements from the head of the queue when the hardware
-	 * informs us that an element has been completed.
-	 *
-	 * All accesses to the queue are mediated by a spinlock
-	 * (ring->execlist_lock).
-	 */
-	/** Execlist link in the submission queue.*/
 	struct list_head execlist_link;
-	/** Execlists no. of times this request has been sent to the ELSP */
-	int elsp_submitted;
-	/** Execlists context hardware id. */
-	unsigned int ctx_hw_id;
 };
 extern const struct fence_ops i915_fence_ops;

--- a/drivers/gpu/drm/i915/intel_lrc.c
+++ b/drivers/gpu/drm/i915/intel_lrc.c
@@ -156,6 +156,11 @@
 #define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
 #define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)
+#define GEN8_CTX_STATUS_COMPLETED_MASK \
+	 (GEN8_CTX_STATUS_ACTIVE_IDLE | \
+	  GEN8_CTX_STATUS_PREEMPTED | \
+	  GEN8_CTX_STATUS_ELEMENT_SWITCH)
 #define CTX_LRI_HEADER_0		0x01
 #define CTX_CONTEXT_CONTROL		0x02
 #define CTX_RING_HEAD			0x04
@@ -263,10 +268,8 @@ logical_ring_init_platform_invariants(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
-	if (IS_GEN8(dev_priv) || IS_GEN9(dev_priv))
+	engine->disable_lite_restore_wa =
-		engine->idle_lite_restore_wa = ~0;
+		(IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
-	engine->disable_lite_restore_wa = (IS_SKL_REVID(dev_priv, 0, SKL_REVID_B0) ||
 		 IS_BXT_REVID(dev_priv, 0, BXT_REVID_A1)) &&
 		(engine->id == VCS || engine->id == VCS2);
@@ -351,11 +354,11 @@ execlists_update_context_pdps(struct i915_hw_ppgtt *ppgtt, u32 *reg_state)
 	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
 }
-static void execlists_update_context(struct drm_i915_gem_request *rq)
+static u64 execlists_update_context(struct drm_i915_gem_request *rq)
 {
-	struct intel_engine_cs *engine = rq->engine;
+	struct intel_context *ce = &rq->ctx->engine[rq->engine->id];
 	struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
-	uint32_t *reg_state = rq->ctx->engine[engine->id].lrc_reg_state;
+	u32 *reg_state = ce->lrc_reg_state;
 	reg_state[CTX_RING_TAIL+1] = intel_ring_offset(rq->ring, rq->tail);
@@ -366,26 +369,34 @@ static void execlists_update_context(struct drm_i915_gem_request *rq)
 	 */
 	if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev))
 		execlists_update_context_pdps(ppgtt, reg_state);
+	return ce->lrc_desc;
 }
-static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
+static void execlists_submit_ports(struct intel_engine_cs *engine)
-				 struct drm_i915_gem_request *rq1)
 {
-	struct intel_engine_cs *engine = rq0->engine;
+	struct drm_i915_private *dev_priv = engine->i915;
-	struct drm_i915_private *dev_priv = rq0->i915;
+	struct execlist_port *port = engine->execlist_port;
 	u32 __iomem *elsp =
 		dev_priv->regs + i915_mmio_reg_offset(RING_ELSP(engine));
 	u64 desc[2];
-	if (rq1) {
+	if (!port[0].count)
-		desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->engine);
+		execlists_context_status_change(port[0].request,
-		rq1->elsp_submitted++;
+						INTEL_CONTEXT_SCHEDULE_IN);
+	desc[0] = execlists_update_context(port[0].request);
+	engine->preempt_wa = port[0].count++; /* bdw only? fixed on skl? */
+	if (port[1].request) {
+		GEM_BUG_ON(port[1].count);
+		execlists_context_status_change(port[1].request,
+						INTEL_CONTEXT_SCHEDULE_IN);
+		desc[1] = execlists_update_context(port[1].request);
+		port[1].count = 1;
 	} else {
 		desc[1] = 0;
 	}
+	GEM_BUG_ON(desc[0] == desc[1]);
-	desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->engine);
-	rq0->elsp_submitted++;
 	/* You must always write both descriptors in the order below. */
 	writel(upper_32_bits(desc[1]), elsp);
@@ -396,141 +407,125 @@ static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
 	writel(lower_32_bits(desc[0]), elsp);
 }
-static void execlists_elsp_submit_contexts(struct drm_i915_gem_request *rq0,
+static bool ctx_single_port_submission(const struct i915_gem_context *ctx)
-					   struct drm_i915_gem_request *rq1)
 {
-	struct drm_i915_private *dev_priv = rq0->i915;
+	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
-	unsigned int fw_domains = rq0->engine->fw_domains;
+		ctx->execlists_force_single_submission);
+}
-	execlists_update_context(rq0);
-	if (rq1)
-		execlists_update_context(rq1);
-	spin_lock_irq(&dev_priv->uncore.lock);
+static bool can_merge_ctx(const struct i915_gem_context *prev,
-	intel_uncore_forcewake_get__locked(dev_priv, fw_domains);
+			  const struct i915_gem_context *next)
+{
+	if (prev != next)
+		return false;
-	execlists_elsp_write(rq0, rq1);
+	if (ctx_single_port_submission(prev))
+		return false;
-	intel_uncore_forcewake_put__locked(dev_priv, fw_domains);
+	return true;
-	spin_unlock_irq(&dev_priv->uncore.lock);
 }
-static void execlists_unqueue(struct intel_engine_cs *engine)
+static void execlists_dequeue(struct intel_engine_cs *engine)
 {
-	struct drm_i915_gem_request *req0 = NULL, *req1 = NULL;
+	struct drm_i915_gem_request *cursor, *last;
-	struct drm_i915_gem_request *cursor, *tmp;
+	struct execlist_port *port = engine->execlist_port;
+	bool submit = false;
+	last = port->request;
+	if (last)
+		/* WaIdleLiteRestore:bdw,skl
+		 * Apply the wa NOOPs to prevent ring:HEAD == req:TAIL
+		 * as we resubmit the request. See gen8_emit_request()
+		 * for where we prepare the padding after the end of the
+		 * request.
+		 */
+		last->tail = last->wa_tail;
-	assert_spin_locked(&engine->execlist_lock);
+	GEM_BUG_ON(port[1].request);
-	/*
+	/* Hardware submission is through 2 ports. Conceptually each port
-	 * If irqs are not active generate a warning as batches that finish
+	 * has a (RING_START, RING_HEAD, RING_TAIL) tuple. RING_START is
-	 * without the irqs may get lost and a GPU Hang may occur.
+	 * static for a context, and unique to each, so we only execute
+	 * requests belonging to a single context from each ring. RING_HEAD
+	 * is maintained by the CS in the context image, it marks the place
+	 * where it got up to last time, and through RING_TAIL we tell the CS
+	 * where we want to execute up to this time.
+	 *
+	 * In this list the requests are in order of execution. Consecutive
+	 * requests from the same context are adjacent in the ringbuffer. We
+	 * can combine these requests into a single RING_TAIL update:
+	 *
+	 *              RING_HEAD...req1...req2
+	 *                                    ^- RING_TAIL
+	 * since to execute req2 the CS must first execute req1.
+	 *
+	 * Our goal then is to point each port to the end of a consecutive
+	 * sequence of requests as being the most optimal (fewest wake ups
+	 * and context switches) submission.
 	 */
-	WARN_ON(!intel_irqs_enabled(engine->i915));
+	spin_lock(&engine->execlist_lock);
-	/* Try to read in pairs */
+	list_for_each_entry(cursor, &engine->execlist_queue, execlist_link) {
-	list_for_each_entry_safe(cursor, tmp, &engine->execlist_queue,
+		/* Can we combine this request with the current port? It has to
-				 execlist_link) {
+		 * be the same context/ringbuffer and not have any exceptions
-		if (!req0) {
+		 * (e.g. GVT saying never to combine contexts).
-			req0 = cursor;
+		 *
-		} else if (req0->ctx == cursor->ctx) {
+		 * If we can combine the requests, we can execute both by
-			/* Same ctx: ignore first request, as second request
+		 * updating the RING_TAIL to point to the end of the second
-			 * will update tail past first request's workload */
+		 * request, and so we never need to tell the hardware about
-			cursor->elsp_submitted = req0->elsp_submitted;
+		 * the first.
-			list_del(&req0->execlist_link);
-			i915_gem_request_put(req0);
-			req0 = cursor;
-		} else {
-			if (IS_ENABLED(CONFIG_DRM_I915_GVT)) {
-				/*
-				 * req0 (after merged) ctx requires single
-				 * submission, stop picking
 		 */
-				if (req0->ctx->execlists_force_single_submission)
+		if (last && !can_merge_ctx(cursor->ctx, last->ctx)) {
-					break;
+			/* If we are on the second port and cannot combine
-				/*
+			 * this request with the last, then we are done.
-				 * req0 ctx doesn't require single submission,
-				 * but next req ctx requires, stop picking
 			 */
-				if (cursor->ctx->execlists_force_single_submission)
+			if (port != engine->execlist_port)
-					break;
-			}
-			req1 = cursor;
-			WARN_ON(req1->elsp_submitted);
 				break;
-		}
-	}
-	if (unlikely(!req0))
+			/* If GVT overrides us we only ever submit port[0],
-		return;
+			 * leaving port[1] empty. Note that we also have
+			 * to be careful that we don't queue the same
+			 * context (even though a different request) to
+			 * the second port.
+			 */
+			if (ctx_single_port_submission(cursor->ctx))
+				break;
-	execlists_context_status_change(req0, INTEL_CONTEXT_SCHEDULE_IN);
+			GEM_BUG_ON(last->ctx == cursor->ctx);
-	if (req1)
+			i915_gem_request_assign(&port->request, last);
-		execlists_context_status_change(req1,
+			port++;
-						INTEL_CONTEXT_SCHEDULE_IN);
+		}
+		last = cursor;
+		submit = true;
+	}
+	if (submit) {
+		/* Decouple all the requests submitted from the queue */
+		engine->execlist_queue.next = &cursor->execlist_link;
+		cursor->execlist_link.prev = &engine->execlist_queue;
-	if (req0->elsp_submitted & engine->idle_lite_restore_wa) {
+		i915_gem_request_assign(&port->request, last);
-		/*
-		 * WaIdleLiteRestore: make sure we never cause a lite restore
-		 * with HEAD==TAIL.
-		 *
-		 * Apply the wa NOOPS to prevent ring:HEAD == req:TAIL as we
-		 * resubmit the request. See gen8_emit_request() for where we
-		 * prepare the padding after the end of the request.
-		 */
-		req0->tail = req0->wa_tail;
 	}
+	spin_unlock(&engine->execlist_lock);
-	execlists_elsp_submit_contexts(req0, req1);
+	if (submit)
+		execlists_submit_ports(engine);
 }
-static unsigned int
+static bool execlists_elsp_idle(struct intel_engine_cs *engine)
-execlists_check_remove_request(struct intel_engine_cs *engine, u32 ctx_id)
 {
-	struct drm_i915_gem_request *head_req;
+	return !engine->execlist_port[0].request;
-	assert_spin_locked(&engine->execlist_lock);
-	head_req = list_first_entry_or_null(&engine->execlist_queue,
-					    struct drm_i915_gem_request,
-					    execlist_link);
-	if (WARN_ON(!head_req || (head_req->ctx_hw_id != ctx_id)))
-               return 0;
-	WARN(head_req->elsp_submitted == 0, "Never submitted head request\n");
-	if (--head_req->elsp_submitted > 0)
-		return 0;
-	execlists_context_status_change(head_req, INTEL_CONTEXT_SCHEDULE_OUT);
-	list_del(&head_req->execlist_link);
-	i915_gem_request_put(head_req);
-	return 1;
 }
-static u32
+static bool execlists_elsp_ready(struct intel_engine_cs *engine)
-get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
-		   u32 *context_id)
 {
-	struct drm_i915_private *dev_priv = engine->i915;
+	int port;
-	u32 status;
-	read_pointer %= GEN8_CSB_ENTRIES;
-	status = I915_READ_FW(RING_CONTEXT_STATUS_BUF_LO(engine, read_pointer));
-	if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
-		return 0;
-	*context_id = I915_READ_FW(RING_CONTEXT_STATUS_BUF_HI(engine,
+	port = 1; /* wait for a free slot */
-							      read_pointer));
+	if (engine->disable_lite_restore_wa || engine->preempt_wa)
+		port = 0; /* wait for GPU to be idle before continuing */
-	return status;
+	return !engine->execlist_port[port].request;
 }
 /*
@@ -540,67 +535,56 @@ get_context_status(struct intel_engine_cs *engine, unsigned int read_pointer,
 static void intel_lrc_irq_handler(unsigned long data)
 {
 	struct intel_engine_cs *engine = (struct intel_engine_cs *)data;
+	struct execlist_port *port = engine->execlist_port;
 	struct drm_i915_private *dev_priv = engine->i915;
-	u32 status_pointer;
-	unsigned int read_pointer, write_pointer;
-	u32 csb[GEN8_CSB_ENTRIES][2];
-	unsigned int csb_read = 0, i;
-	unsigned int submit_contexts = 0;
 	intel_uncore_forcewake_get(dev_priv, engine->fw_domains);
-	status_pointer = I915_READ_FW(RING_CONTEXT_STATUS_PTR(engine));
+	if (!execlists_elsp_idle(engine)) {
+		u32 __iomem *csb_mmio =
-	read_pointer = engine->next_context_status_buffer;
+			dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_PTR(engine));
-	write_pointer = GEN8_CSB_WRITE_PTR(status_pointer);
+		u32 __iomem *buf =
-	if (read_pointer > write_pointer)
+			dev_priv->regs + i915_mmio_reg_offset(RING_CONTEXT_STATUS_BUF_LO(engine, 0));
-		write_pointer += GEN8_CSB_ENTRIES;
+		unsigned int csb, head, tail;
-	while (read_pointer < write_pointer) {
+		csb = readl(csb_mmio);
-		if (WARN_ON_ONCE(csb_read == GEN8_CSB_ENTRIES))
+		head = GEN8_CSB_READ_PTR(csb);
-			break;
+		tail = GEN8_CSB_WRITE_PTR(csb);
-		csb[csb_read][0] = get_context_status(engine, ++read_pointer,
+		if (tail < head)
-						      &csb[csb_read][1]);
+			tail += GEN8_CSB_ENTRIES;
-		csb_read++;
+		while (head < tail) {
-	}
+			unsigned int idx = ++head % GEN8_CSB_ENTRIES;
+			unsigned int status = readl(buf + 2 * idx);
-	engine->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;
+			if (!(status & GEN8_CTX_STATUS_COMPLETED_MASK))
+				continue;
-	/* Update the read pointer to the old write pointer. Manual ringbuffer
+			GEM_BUG_ON(port[0].count == 0);
-	 * management ftw </sarcasm> */
+			if (--port[0].count == 0) {
-	I915_WRITE_FW(RING_CONTEXT_STATUS_PTR(engine),
+				GEM_BUG_ON(status & GEN8_CTX_STATUS_PREEMPTED);
-		      _MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
+				execlists_context_status_change(port[0].request,
-				    engine->next_context_status_buffer << 8));
+								INTEL_CONTEXT_SCHEDULE_OUT);
-	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
+				i915_gem_request_put(port[0].request);
+				port[0] = port[1];
+				memset(&port[1], 0, sizeof(port[1]));
-	spin_lock(&engine->execlist_lock);
+				engine->preempt_wa = false;
-	for (i = 0; i < csb_read; i++) {
-		if (unlikely(csb[i][0] & GEN8_CTX_STATUS_PREEMPTED)) {
-			if (csb[i][0] & GEN8_CTX_STATUS_LITE_RESTORE) {
-				if (execlists_check_remove_request(engine, csb[i][1]))
-					WARN(1, "Lite Restored request removed from queue\n");
-			} else
-				WARN(1, "Preemption without Lite Restore\n");
 			}
-		if (csb[i][0] & (GEN8_CTX_STATUS_ACTIVE_IDLE |
+			GEM_BUG_ON(port[0].count == 0 &&
-		    GEN8_CTX_STATUS_ELEMENT_SWITCH))
+				   !(status & GEN8_CTX_STATUS_ACTIVE_IDLE));
-			submit_contexts +=
-				execlists_check_remove_request(engine, csb[i][1]);
 		}
-	if (submit_contexts) {
+		writel(_MASKED_FIELD(GEN8_CSB_READ_PTR_MASK,
-		if (!engine->disable_lite_restore_wa ||
+				     GEN8_CSB_WRITE_PTR(csb) << 8),
-		    (csb[i][0] & GEN8_CTX_STATUS_ACTIVE_IDLE))
+		       csb_mmio);
-			execlists_unqueue(engine);
 	}
-	spin_unlock(&engine->execlist_lock);
+	if (execlists_elsp_ready(engine))
+		execlists_dequeue(engine);
-	if (unlikely(submit_contexts > 2))
+	intel_uncore_forcewake_put(dev_priv, engine->fw_domains);
-		DRM_ERROR("More than two context complete events?\n");
 }
 static void execlists_submit_request(struct drm_i915_gem_request *request)
@@ -609,12 +593,9 @@ static void execlists_submit_request(struct drm_i915_gem_request *request)
 	spin_lock_bh(&engine->execlist_lock);
-	i915_gem_request_get(request);
-	request->ctx_hw_id = request->ctx->hw_id;
-	if (list_empty(&engine->execlist_queue))
-		tasklet_hi_schedule(&engine->irq_tasklet);
 	list_add_tail(&request->execlist_link, &engine->execlist_queue);
+	if (execlists_elsp_idle(engine))
+		tasklet_hi_schedule(&engine->irq_tasklet);
 	spin_unlock_bh(&engine->execlist_lock);
 }
@@ -721,23 +702,6 @@ intel_logical_ring_advance(struct drm_i915_gem_request *request)
 	return 0;
 }
-void intel_execlists_cancel_requests(struct intel_engine_cs *engine)
-{
-	struct drm_i915_gem_request *req, *tmp;
-	LIST_HEAD(cancel_list);
-	WARN_ON(!mutex_is_locked(&engine->i915->drm.struct_mutex));
-	spin_lock_bh(&engine->execlist_lock);
-	list_replace_init(&engine->execlist_queue, &cancel_list);
-	spin_unlock_bh(&engine->execlist_lock);
-	list_for_each_entry_safe(req, tmp, &cancel_list, execlist_link) {
-		list_del(&req->execlist_link);
-		i915_gem_request_put(req);
-	}
-}
 static int intel_lr_context_pin(struct i915_gem_context *ctx,
 				struct intel_engine_cs *engine)
 {
@@ -1258,7 +1222,6 @@ static void lrc_init_hws(struct intel_engine_cs *engine)
 static int gen8_init_common_ring(struct intel_engine_cs *engine)
 {
 	struct drm_i915_private *dev_priv = engine->i915;
-	unsigned int next_context_status_buffer_hw;
 	lrc_init_hws(engine);
@@ -1269,32 +1232,7 @@ static int gen8_init_common_ring(struct intel_engine_cs *engine)
 	I915_WRITE(RING_MODE_GEN7(engine),
 		   _MASKED_BIT_DISABLE(GFX_REPLAY_MODE) |
 		   _MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
-	POSTING_READ(RING_MODE_GEN7(engine));
-	/*
-	 * Instead of resetting the Context Status Buffer (CSB) read pointer to
-	 * zero, we need to read the write pointer from hardware and use its
-	 * value because "this register is power context save restored".
-	 * Effectively, these states have been observed:
-	 *
-	 *      | Suspend-to-idle (freeze) | Suspend-to-RAM (mem) |
-	 * BDW  | CSB regs not reset       | CSB regs reset       |
-	 * CHT  | CSB regs not reset       | CSB regs not reset   |
-	 * SKL  |         ?                |         ?            |
-	 * BXT  |         ?                |         ?            |
-	 */
-	next_context_status_buffer_hw =
-		GEN8_CSB_WRITE_PTR(I915_READ(RING_CONTEXT_STATUS_PTR(engine)));
-	/*
-	 * When the CSB registers are reset (also after power-up / gpu reset),
-	 * CSB write pointer is set to all 1's, which is not valid, use '5' in
-	 * this special case, so the first element read is CSB[0].
-	 */
-	if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK)
-		next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1);
-	engine->next_context_status_buffer = next_context_status_buffer_hw;
 	DRM_DEBUG_DRIVER("Execlists enabled for %s\n", engine->name);
 	intel_engine_init_hangcheck(engine);
@@ -1680,10 +1618,6 @@ void intel_logical_ring_cleanup(struct intel_engine_cs *engine)
 	}
 	intel_lr_context_unpin(dev_priv->kernel_context, engine);
-	engine->idle_lite_restore_wa = 0;
-	engine->disable_lite_restore_wa = false;
-	engine->ctx_desc_template = 0;
 	lrc_destroy_wa_ctx_obj(engine);
 	engine->i915 = NULL;
 }

--- a/drivers/gpu/drm/i915/intel_lrc.h
+++ b/drivers/gpu/drm/i915/intel_lrc.h
@@ -97,6 +97,4 @@ int intel_sanitize_enable_execlists(struct drm_i915_private *dev_priv,
 				    int enable_execlists);
 void intel_execlists_enable_submission(struct drm_i915_private *dev_priv);
-void intel_execlists_cancel_requests(struct intel_engine_cs *engine);
 #endif /* _INTEL_LRC_H_ */
--- a/drivers/gpu/drm/i915/intel_ringbuffer.h
+++ b/drivers/gpu/drm/i915/intel_ringbuffer.h
@@ -298,11 +298,14 @@ struct intel_engine_cs {
 	/* Execlists */
 	struct tasklet_struct irq_tasklet;
 	spinlock_t execlist_lock; /* used inside tasklet, use spin_lock_bh */
+	struct execlist_port {
+		struct drm_i915_gem_request *request;
+		unsigned int count;
+	} execlist_port[2];
 	struct list_head execlist_queue;
 	unsigned int fw_domains;
-	unsigned int next_context_status_buffer;
-	unsigned int idle_lite_restore_wa;
 	bool disable_lite_restore_wa;
+	bool preempt_wa;
 	u32 ctx_desc_template;
 	/**