drm/i915/perf: execute OA configuration from command stream

We haven't run into issues with programming the global OA/NOA registers configuration from CPU so far, but HW engineers actually recommend doing this from the command streamer. On TGL in particular one of the clock domain in which some of that programming goes might not be powered when we poke things from the CPU. Since we have a command buffer prepared for the execbuffer side of things, we can reuse that approach here too. This also allows us to significantly reduce the amount of time we hold the main lock. v2: Drop the global lock as much as possible v3: Take global lock to pin global v4: Create i915 request in emit_oa_config() to avoid deadlocks (Lionel) v5: Move locking to the stream (Lionel) v6: Move active reconfiguration request into i915_perf_stream (Lionel) v7: Pin VMA outside request creation (Chris) Lock VMA before move to active (Chris) v8: Fix double free on stream->initial_oa_config_bo (Lionel) Don't allow interruption when waiting on active config request (Lionel) Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Link: https://patchwork.freedesktop.org/patch/msgid/20191012072308.30312-3-chris@chris-wilson.co.uk

drm/i915/perf: execute OA configuration from command stream
We haven't run into issues with programming the global OA/NOA registers configuration from CPU so far, but HW engineers actually recommend doing this from the command streamer. On TGL in particular one of the clock domain in which some of that programming goes might not be powered when we poke things from the CPU. Since we have a command buffer prepared for the execbuffer side of things, we can reuse that approach here too. This also allows us to significantly reduce the amount of time we hold the main lock. v2: Drop the global lock as much as possible v3: Take global lock to pin global v4: Create i915 request in emit_oa_config() to avoid deadlocks (Lionel) v5: Move locking to the stream (Lionel) v6: Move active reconfiguration request into i915_perf_stream (Lionel) v7: Pin VMA outside request creation (Chris) Lock VMA before move to active (Chris) v8: Fix double free on stream->initial_oa_config_bo (Lionel) Don't allow interruption when waiting on active config request (Lionel) Signed-off-by: Lionel Landwerlin <lionel.g.landwerlin@intel.com> Reviewed-by: Chris Wilson <chris@chris-wilson.co.uk> Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk> Link: https://patchwork.freedesktop.org/patch/msgid/20191012072308.30312-3-chris@chris-wilson.co.uk
15d0ace1 · Lionel Landwerlin · Chris Wilson · daed3e44 · 15d0ace1
Commit 15d0ace1 authored Oct 12, 2019 by Lionel Landwerlin Committed by Chris Wilson Oct 12, 2019
Show whitespace changes
Inline Side-by-side

Showing with 156 additions and 43 deletions

drivers/gpu/drm/i915/i915_perf.c drivers/gpu/drm/i915/i915_perf.c +156 -43

No files found.
--- a/drivers/gpu/drm/i915/i915_perf.c
+++ b/drivers/gpu/drm/i915/i915_perf.c
@@ -1731,56 +1731,181 @@ static int alloc_noa_wait(struct i915_perf_stream *stream)
 	return 0;
 err_unpin:
-	__i915_vma_unpin(vma);
+	i915_vma_unpin_and_release(&vma, 0);
 err_unref:
 	i915_gem_object_put(bo);
 	return ret;
 }
-static void config_oa_regs(struct intel_uncore *uncore,
+static u32 *write_cs_mi_lri(u32 *cs,
-			   const struct i915_oa_reg *regs,
+			    const struct i915_oa_reg *reg_data,
 			    u32 n_regs)
 {
 	u32 i;
 	for (i = 0; i < n_regs; i++) {
-		const struct i915_oa_reg *reg = regs + i;
+		if ((i % MI_LOAD_REGISTER_IMM_MAX_REGS) == 0) {
+			u32 n_lri = min_t(u32,
+					  n_regs - i,
+					  MI_LOAD_REGISTER_IMM_MAX_REGS);
+			*cs++ = MI_LOAD_REGISTER_IMM(n_lri);
+		}
+		*cs++ = i915_mmio_reg_offset(reg_data[i].addr);
+		*cs++ = reg_data[i].value;
+	}
+	return cs;
+}
+static int num_lri_dwords(int num_regs)
+{
+	int count = 0;
+	if (num_regs > 0) {
+		count += DIV_ROUND_UP(num_regs, MI_LOAD_REGISTER_IMM_MAX_REGS);
+		count += num_regs * 2;
+	}
+	return count;
+}
+static struct i915_oa_config_bo *
+alloc_oa_config_buffer(struct i915_perf_stream *stream,
+		       struct i915_oa_config *oa_config)
+{
+	struct drm_i915_gem_object *obj;
+	struct i915_oa_config_bo *oa_bo;
+	size_t config_length = 0;
+	u32 *cs;
+	int err;
+	oa_bo = kzalloc(sizeof(*oa_bo), GFP_KERNEL);
+	if (!oa_bo)
+		return ERR_PTR(-ENOMEM);
-		intel_uncore_write(uncore, reg->addr, reg->value);
+	config_length += num_lri_dwords(oa_config->mux_regs_len);
+	config_length += num_lri_dwords(oa_config->b_counter_regs_len);
+	config_length += num_lri_dwords(oa_config->flex_regs_len);
+	config_length++; /* MI_BATCH_BUFFER_END */
+	config_length = ALIGN(sizeof(u32) * config_length, I915_GTT_PAGE_SIZE);
+	obj = i915_gem_object_create_shmem(stream->perf->i915, config_length);
+	if (IS_ERR(obj)) {
+		err = PTR_ERR(obj);
+		goto err_free;
 	}
+	cs = i915_gem_object_pin_map(obj, I915_MAP_WB);
+	if (IS_ERR(cs)) {
+		err = PTR_ERR(cs);
+		goto err_oa_bo;
+	}
+	cs = write_cs_mi_lri(cs,
+			     oa_config->mux_regs,
+			     oa_config->mux_regs_len);
+	cs = write_cs_mi_lri(cs,
+			     oa_config->b_counter_regs,
+			     oa_config->b_counter_regs_len);
+	cs = write_cs_mi_lri(cs,
+			     oa_config->flex_regs,
+			     oa_config->flex_regs_len);
+	*cs++ = MI_BATCH_BUFFER_END;
+	i915_gem_object_flush_map(obj);
+	i915_gem_object_unpin_map(obj);
+	oa_bo->vma = i915_vma_instance(obj,
+				       &stream->engine->gt->ggtt->vm,
+				       NULL);
+	if (IS_ERR(oa_bo->vma)) {
+		err = PTR_ERR(oa_bo->vma);
+		goto err_oa_bo;
+	}
+	oa_bo->oa_config = i915_oa_config_get(oa_config);
+	llist_add(&oa_bo->node, &stream->oa_config_bos);
+	return oa_bo;
+err_oa_bo:
+	i915_gem_object_put(obj);
+err_free:
+	kfree(oa_bo);
+	return ERR_PTR(err);
 }
-static void delay_after_mux(void)
+static struct i915_vma *
+get_oa_vma(struct i915_perf_stream *stream, struct i915_oa_config *oa_config)
 {
+	struct i915_oa_config_bo *oa_bo;
 	/*
-	 * It apparently takes a fairly long time for a new MUX
+	 * Look for the buffer in the already allocated BOs attached
-	 * configuration to be be applied after these register writes.
+	 * to the stream.
-	 * This delay duration was derived empirically based on the
-	 * render_basic config but hopefully it covers the maximum
-	 * configuration latency.
-	 *
-	 * As a fallback, the checks in _append_oa_reports() to skip
-	 * invalid OA reports do also seem to work to discard reports
-	 * generated before this config has completed - albeit not
-	 * silently.
-	 *
-	 * Unfortunately this is essentially a magic number, since we
-	 * don't currently know of a reliable mechanism for predicting
-	 * how long the MUX config will take to apply and besides
-	 * seeing invalid reports we don't know of a reliable way to
-	 * explicitly check that the MUX config has landed.
-	 *
-	 * It's even possible we've miss characterized the underlying
-	 * problem - it just seems like the simplest explanation why
-	 * a delay at this location would mitigate any invalid reports.
 	 */
-	usleep_range(15000, 20000);
+	llist_for_each_entry(oa_bo, stream->oa_config_bos.first, node) {
+		if (oa_bo->oa_config == oa_config &&
+		    memcmp(oa_bo->oa_config->uuid,
+			   oa_config->uuid,
+			   sizeof(oa_config->uuid)) == 0)
+			goto out;
+	}
+	oa_bo = alloc_oa_config_buffer(stream, oa_config);
+	if (IS_ERR(oa_bo))
+		return ERR_CAST(oa_bo);
+out:
+	return i915_vma_get(oa_bo->vma);
+}
+static int emit_oa_config(struct i915_perf_stream *stream,
+			  struct intel_context *ce)
+{
+	struct i915_request *rq;
+	struct i915_vma *vma;
+	int err;
+	vma = get_oa_vma(stream, stream->oa_config);
+	if (IS_ERR(vma))
+		return PTR_ERR(vma);
+	err = i915_vma_pin(vma, 0, 0, PIN_GLOBAL | PIN_HIGH);
+	if (err)
+		goto err_vma_put;
+	rq = i915_request_create(ce);
+	if (IS_ERR(rq)) {
+		err = PTR_ERR(rq);
+		goto err_vma_unpin;
+	}
+	i915_vma_lock(vma);
+	err = i915_request_await_object(rq, vma->obj, 0);
+	if (!err)
+		err = i915_vma_move_to_active(vma, rq, 0);
+	i915_vma_unlock(vma);
+	if (err)
+		goto err_add_request;
+	err = rq->engine->emit_bb_start(rq,
+					vma->node.start, 0,
+					I915_DISPATCH_SECURE);
+err_add_request:
+	i915_request_add(rq);
+err_vma_unpin:
+	i915_vma_unpin(vma);
+err_vma_put:
+	i915_vma_put(vma);
+	return err;
 }
 static int hsw_enable_metric_set(struct i915_perf_stream *stream)
 {
 	struct intel_uncore *uncore = stream->uncore;
-	const struct i915_oa_config *oa_config = stream->oa_config;
 	/*
 	 * PRM:
@@ -1797,13 +1922,7 @@ static int hsw_enable_metric_set(struct i915_perf_stream *stream)
 	intel_uncore_rmw(uncore, GEN6_UCGCTL1,
 			 0, GEN6_CSUNIT_CLOCK_GATE_DISABLE);
-	config_oa_regs(uncore, oa_config->mux_regs, oa_config->mux_regs_len);
+	return emit_oa_config(stream, stream->engine->kernel_context);
-	delay_after_mux();
-	config_oa_regs(uncore, oa_config->b_counter_regs,
-		       oa_config->b_counter_regs_len);
-	return 0;
 }
 static void hsw_disable_metric_set(struct i915_perf_stream *stream)
@@ -2167,13 +2286,7 @@ static int gen8_enable_metric_set(struct i915_perf_stream *stream)
 	if (ret)
 		return ret;
-	config_oa_regs(uncore, oa_config->mux_regs, oa_config->mux_regs_len);
+	return emit_oa_config(stream, stream->engine->kernel_context);
-	delay_after_mux();
-	config_oa_regs(uncore, oa_config->b_counter_regs,
-		       oa_config->b_counter_regs_len);
-	return 0;
 }
 static void gen8_disable_metric_set(struct i915_perf_stream *stream)