Commit eebfb32e authored by Chris Wilson

drm/i915/gt: Limit VFE threads based on GT

MEDIA_VFE_STATE only accepts the 'maximum number of threads' in the
range [0, n-1] where n is #EU * (#threads/EU) with the number of threads
based on platform and the number of EU based on the number of slices and
subslices. This is a fixed number per platform/gt, so appropriately
limit the number of threads we spawn to match the device.

v2: Oversaturate the system with tasks to force execution on every HW
thread; if the thread idles it is returned to the pool and may be reused
again before an unused thread.

v3: Fix more state commands, which was causing Baytrail to barf.
v4: STATE_CACHE_INVALIDATE requires a stall on Ivybridge

Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/2024
Fixes: 47f8253d ("drm/i915/gen7: Clear all EU/L3 residual contexts")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Mika Kuoppala <mika.kuoppala@linux.intel.com>
Cc: Prathap Kumar Valsan <prathap.kumar.valsan@intel.com>
Cc: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
Cc: Jon Bloomfield <jon.bloomfield@intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Randy Wright <rwright@hpe.com>
Cc: stable@vger.kernel.org # v5.7+
Reviewed-by: Akeem G Abodunrin <akeem.g.abodunrin@intel.com>
Reviewed-by: Rodrigo Vivi <rodrigo.vivi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20210111225220.3483-1-chris@chris-wilson.co.uk
parent cd7a214f
...@@ -7,8 +7,6 @@ ...@@ -7,8 +7,6 @@
#include "i915_drv.h" #include "i915_drv.h"
#include "intel_gpu_commands.h" #include "intel_gpu_commands.h"
#define MAX_URB_ENTRIES 64
#define STATE_SIZE (4 * 1024)
#define GT3_INLINE_DATA_DELAYS 0x1E00 #define GT3_INLINE_DATA_DELAYS 0x1E00
#define batch_advance(Y, CS) GEM_BUG_ON((Y)->end != (CS)) #define batch_advance(Y, CS) GEM_BUG_ON((Y)->end != (CS))
...@@ -34,38 +32,59 @@ struct batch_chunk { ...@@ -34,38 +32,59 @@ struct batch_chunk {
}; };
struct batch_vals { struct batch_vals {
u32 max_primitives; u32 max_threads;
u32 max_urb_entries;
u32 cmd_size;
u32 state_size;
u32 state_start; u32 state_start;
u32 batch_size; u32 surface_start;
u32 surface_height; u32 surface_height;
u32 surface_width; u32 surface_width;
u32 scratch_size; u32 size;
u32 max_size;
}; };
/*
 * num_primitives - how many primitives to account for in the batch.
 * @bv: batch parameters; only max_threads is read.
 *
 * Returns bv->max_threads unchanged, i.e. one primitive per HW thread.
 */
static inline int num_primitives(const struct batch_vals *bv)
{
	/*
	 * Every HW thread has to run the shader so that its thread-local
	 * registers get cleared. That only happens if the GPU is kept
	 * saturated: work must be dispatched faster than the shaders
	 * retire, so the EUs fill up and each HW thread is occupied.
	 */
	const int count = bv->max_threads;

	return count;
}
static void static void
batch_get_defaults(struct drm_i915_private *i915, struct batch_vals *bv) batch_get_defaults(struct drm_i915_private *i915, struct batch_vals *bv)
{ {
if (IS_HASWELL(i915)) { if (IS_HASWELL(i915)) {
bv->max_primitives = 280; switch (INTEL_INFO(i915)->gt) {
bv->max_urb_entries = MAX_URB_ENTRIES; default:
case 1:
bv->max_threads = 70;
break;
case 2:
bv->max_threads = 140;
break;
case 3:
bv->max_threads = 280;
break;
}
bv->surface_height = 16 * 16; bv->surface_height = 16 * 16;
bv->surface_width = 32 * 2 * 16; bv->surface_width = 32 * 2 * 16;
} else { } else {
bv->max_primitives = 128; switch (INTEL_INFO(i915)->gt) {
bv->max_urb_entries = MAX_URB_ENTRIES / 2; default:
case 1: /* including vlv */
bv->max_threads = 36;
break;
case 2:
bv->max_threads = 128;
break;
}
bv->surface_height = 16 * 8; bv->surface_height = 16 * 8;
bv->surface_width = 32 * 16; bv->surface_width = 32 * 16;
} }
bv->cmd_size = bv->max_primitives * 4096; bv->state_start = round_up(SZ_1K + num_primitives(bv) * 64, SZ_4K);
bv->state_size = STATE_SIZE; bv->surface_start = bv->state_start + SZ_4K;
bv->state_start = bv->cmd_size; bv->size = bv->surface_start + bv->surface_height * bv->surface_width;
bv->batch_size = bv->cmd_size + bv->state_size;
bv->scratch_size = bv->surface_height * bv->surface_width;
bv->max_size = bv->batch_size + bv->scratch_size;
} }
static void batch_init(struct batch_chunk *bc, static void batch_init(struct batch_chunk *bc,
...@@ -155,7 +174,8 @@ static u32 ...@@ -155,7 +174,8 @@ static u32
gen7_fill_binding_table(struct batch_chunk *state, gen7_fill_binding_table(struct batch_chunk *state,
const struct batch_vals *bv) const struct batch_vals *bv)
{ {
u32 surface_start = gen7_fill_surface_state(state, bv->batch_size, bv); u32 surface_start =
gen7_fill_surface_state(state, bv->surface_start, bv);
u32 *cs = batch_alloc_items(state, 32, 8); u32 *cs = batch_alloc_items(state, 32, 8);
u32 offset = batch_offset(state, cs); u32 offset = batch_offset(state, cs);
...@@ -214,9 +234,9 @@ static void ...@@ -214,9 +234,9 @@ static void
gen7_emit_state_base_address(struct batch_chunk *batch, gen7_emit_state_base_address(struct batch_chunk *batch,
u32 surface_state_base) u32 surface_state_base)
{ {
u32 *cs = batch_alloc_items(batch, 0, 12); u32 *cs = batch_alloc_items(batch, 0, 10);
*cs++ = STATE_BASE_ADDRESS | (12 - 2); *cs++ = STATE_BASE_ADDRESS | (10 - 2);
/* general */ /* general */
*cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY; *cs++ = batch_addr(batch) | BASE_ADDRESS_MODIFY;
/* surface */ /* surface */
...@@ -233,8 +253,6 @@ gen7_emit_state_base_address(struct batch_chunk *batch, ...@@ -233,8 +253,6 @@ gen7_emit_state_base_address(struct batch_chunk *batch,
*cs++ = BASE_ADDRESS_MODIFY; *cs++ = BASE_ADDRESS_MODIFY;
*cs++ = 0; *cs++ = 0;
*cs++ = BASE_ADDRESS_MODIFY; *cs++ = BASE_ADDRESS_MODIFY;
*cs++ = 0;
*cs++ = 0;
batch_advance(batch, cs); batch_advance(batch, cs);
} }
...@@ -244,8 +262,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch, ...@@ -244,8 +262,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
u32 urb_size, u32 curbe_size, u32 urb_size, u32 curbe_size,
u32 mode) u32 mode)
{ {
u32 urb_entries = bv->max_urb_entries; u32 threads = bv->max_threads - 1;
u32 threads = bv->max_primitives - 1;
u32 *cs = batch_alloc_items(batch, 32, 8); u32 *cs = batch_alloc_items(batch, 32, 8);
*cs++ = MEDIA_VFE_STATE | (8 - 2); *cs++ = MEDIA_VFE_STATE | (8 - 2);
...@@ -254,7 +271,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch, ...@@ -254,7 +271,7 @@ gen7_emit_vfe_state(struct batch_chunk *batch,
*cs++ = 0; *cs++ = 0;
/* number of threads & urb entries for GPGPU vs Media Mode */ /* number of threads & urb entries for GPGPU vs Media Mode */
*cs++ = threads << 16 | urb_entries << 8 | mode << 2; *cs++ = threads << 16 | 1 << 8 | mode << 2;
*cs++ = 0; *cs++ = 0;
...@@ -293,17 +310,12 @@ gen7_emit_media_object(struct batch_chunk *batch, ...@@ -293,17 +310,12 @@ gen7_emit_media_object(struct batch_chunk *batch,
{ {
unsigned int x_offset = (media_object_index % 16) * 64; unsigned int x_offset = (media_object_index % 16) * 64;
unsigned int y_offset = (media_object_index / 16) * 16; unsigned int y_offset = (media_object_index / 16) * 16;
unsigned int inline_data_size; unsigned int pkt = 6 + 3;
unsigned int media_batch_size;
unsigned int i;
u32 *cs; u32 *cs;
inline_data_size = 112 * 8; cs = batch_alloc_items(batch, 8, pkt);
media_batch_size = inline_data_size + 6;
cs = batch_alloc_items(batch, 8, media_batch_size);
*cs++ = MEDIA_OBJECT | (media_batch_size - 2); *cs++ = MEDIA_OBJECT | (pkt - 2);
/* interface descriptor offset */ /* interface descriptor offset */
*cs++ = 0; *cs++ = 0;
...@@ -317,25 +329,44 @@ gen7_emit_media_object(struct batch_chunk *batch, ...@@ -317,25 +329,44 @@ gen7_emit_media_object(struct batch_chunk *batch,
*cs++ = 0; *cs++ = 0;
/* inline */ /* inline */
*cs++ = (y_offset << 16) | (x_offset); *cs++ = y_offset << 16 | x_offset;
*cs++ = 0; *cs++ = 0;
*cs++ = GT3_INLINE_DATA_DELAYS; *cs++ = GT3_INLINE_DATA_DELAYS;
for (i = 3; i < inline_data_size; i++)
*cs++ = 0;
batch_advance(batch, cs); batch_advance(batch, cs);
} }
static void gen7_emit_pipeline_flush(struct batch_chunk *batch) static void gen7_emit_pipeline_flush(struct batch_chunk *batch)
{ {
u32 *cs = batch_alloc_items(batch, 0, 5); u32 *cs = batch_alloc_items(batch, 0, 4);
*cs++ = GFX_OP_PIPE_CONTROL(5); *cs++ = GFX_OP_PIPE_CONTROL(4);
*cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE | *cs++ = PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH |
PIPE_CONTROL_GLOBAL_GTT_IVB; PIPE_CONTROL_DEPTH_CACHE_FLUSH |
PIPE_CONTROL_DC_FLUSH_ENABLE |
PIPE_CONTROL_CS_STALL;
*cs++ = 0; *cs++ = 0;
*cs++ = 0; *cs++ = 0;
batch_advance(batch, cs);
}
static void gen7_emit_pipeline_invalidate(struct batch_chunk *batch)
{
u32 *cs = batch_alloc_items(batch, 0, 8);
/* ivb: Stall before STATE_CACHE_INVALIDATE */
*cs++ = GFX_OP_PIPE_CONTROL(4);
*cs++ = PIPE_CONTROL_STALL_AT_SCOREBOARD |
PIPE_CONTROL_CS_STALL;
*cs++ = 0;
*cs++ = 0;
*cs++ = GFX_OP_PIPE_CONTROL(4);
*cs++ = PIPE_CONTROL_STATE_CACHE_INVALIDATE;
*cs++ = 0; *cs++ = 0;
*cs++ = 0;
batch_advance(batch, cs); batch_advance(batch, cs);
} }
...@@ -344,34 +375,34 @@ static void emit_batch(struct i915_vma * const vma, ...@@ -344,34 +375,34 @@ static void emit_batch(struct i915_vma * const vma,
const struct batch_vals *bv) const struct batch_vals *bv)
{ {
struct drm_i915_private *i915 = vma->vm->i915; struct drm_i915_private *i915 = vma->vm->i915;
unsigned int desc_count = 64; const unsigned int desc_count = 1;
const u32 urb_size = 112; const unsigned int urb_size = 1;
struct batch_chunk cmds, state; struct batch_chunk cmds, state;
u32 interface_descriptor; u32 descriptors;
unsigned int i; unsigned int i;
batch_init(&cmds, vma, start, 0, bv->cmd_size); batch_init(&cmds, vma, start, 0, bv->state_start);
batch_init(&state, vma, start, bv->state_start, bv->state_size); batch_init(&state, vma, start, bv->state_start, SZ_4K);
interface_descriptor = descriptors = gen7_fill_interface_descriptor(&state, bv,
gen7_fill_interface_descriptor(&state, bv, IS_HASWELL(i915) ?
IS_HASWELL(i915) ? &cb_kernel_hsw :
&cb_kernel_hsw : &cb_kernel_ivb,
&cb_kernel_ivb, desc_count);
desc_count);
gen7_emit_pipeline_flush(&cmds); gen7_emit_pipeline_invalidate(&cmds);
batch_add(&cmds, PIPELINE_SELECT | PIPELINE_SELECT_MEDIA); batch_add(&cmds, PIPELINE_SELECT | PIPELINE_SELECT_MEDIA);
batch_add(&cmds, MI_NOOP); batch_add(&cmds, MI_NOOP);
gen7_emit_state_base_address(&cmds, interface_descriptor); gen7_emit_pipeline_invalidate(&cmds);
gen7_emit_pipeline_flush(&cmds); gen7_emit_pipeline_flush(&cmds);
gen7_emit_state_base_address(&cmds, descriptors);
gen7_emit_pipeline_invalidate(&cmds);
gen7_emit_vfe_state(&cmds, bv, urb_size - 1, 0, 0); gen7_emit_vfe_state(&cmds, bv, urb_size - 1, 0, 0);
gen7_emit_interface_descriptor_load(&cmds, descriptors, desc_count);
gen7_emit_interface_descriptor_load(&cmds, for (i = 0; i < num_primitives(bv); i++)
interface_descriptor,
desc_count);
for (i = 0; i < bv->max_primitives; i++)
gen7_emit_media_object(&cmds, i); gen7_emit_media_object(&cmds, i);
batch_add(&cmds, MI_BATCH_BUFFER_END); batch_add(&cmds, MI_BATCH_BUFFER_END);
...@@ -385,15 +416,15 @@ int gen7_setup_clear_gpr_bb(struct intel_engine_cs * const engine, ...@@ -385,15 +416,15 @@ int gen7_setup_clear_gpr_bb(struct intel_engine_cs * const engine,
batch_get_defaults(engine->i915, &bv); batch_get_defaults(engine->i915, &bv);
if (!vma) if (!vma)
return bv.max_size; return bv.size;
GEM_BUG_ON(vma->obj->base.size < bv.max_size); GEM_BUG_ON(vma->obj->base.size < bv.size);
batch = i915_gem_object_pin_map(vma->obj, I915_MAP_WC); batch = i915_gem_object_pin_map(vma->obj, I915_MAP_WC);
if (IS_ERR(batch)) if (IS_ERR(batch))
return PTR_ERR(batch); return PTR_ERR(batch);
emit_batch(vma, memset(batch, 0, bv.max_size), &bv); emit_batch(vma, memset(batch, 0, bv.size), &bv);
i915_gem_object_flush_map(vma->obj); i915_gem_object_flush_map(vma->obj);
__i915_gem_object_release_map(vma->obj); __i915_gem_object_release_map(vma->obj);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment