Commit b4716185 authored by Chris Wilson's avatar Chris Wilson Committed by Daniel Vetter

drm/i915: Implement inter-engine read-read optimisations

Currently, we only track the last request globally across all engines.
This prevents us from issuing concurrent read requests on e.g. the RCS
and BCS engines (or more likely the render and media engines). Without
semaphores, we incur costly stalls as we synchronise between rings -
greatly impacting the current performance of Broadwell versus Haswell in
certain workloads (like video decode). With the introduction of
reference counted requests, it is much easier to track the last request
per ring, as well as the last global write request so that we can
optimise inter-engine read read requests (as well as better optimise
certain CPU waits).

v2: Fix inverted readonly condition for nonblocking waits.
v3: Handle non-continguous engine array after waits
v4: Rebase, tidy, rewrite ring list debugging
v5: Use obj->active as a bitfield, it looks cool
v6: Micro-optimise, mostly involving moving code around
v7: Fix retire-requests-upto for execlists (and multiple rq->ringbuf)
v8: Rebase
v9: Refactor i915_gem_object_sync() to allow the compiler to better
optimise it.

Benchmark: igt/gem_read_read_speed
hsw:gt3e (with semaphores):
Before: Time to read-read 1024k:		275.794µs
After:  Time to read-read 1024k:		123.260µs

hsw:gt3e (w/o semaphores):
Before: Time to read-read 1024k:		230.433µs
After:  Time to read-read 1024k:		124.593µs

bdw-u (w/o semaphores):             Before          After
Time to read-read 1x1:            26.274µs       10.350µs
Time to read-read 128x128:        40.097µs       21.366µs
Time to read-read 256x256:        77.087µs       42.608µs
Time to read-read 512x512:       281.999µs      181.155µs
Time to read-read 1024x1024:    1196.141µs     1118.223µs
Time to read-read 2048x2048:    5639.072µs     5225.837µs
Time to read-read 4096x4096:   22401.662µs    21137.067µs
Time to read-read 8192x8192:   89617.735µs    85637.681µs

Testcase: igt/gem_concurrent_blit (read-read and friends)
Cc: Lionel Landwerlin <lionel.g.landwerlin@linux.intel.com>
Signed-off-by: default avatarChris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com> [v8]
[danvet: s/\<rq\>/req/g]
Signed-off-by: default avatarDaniel Vetter <daniel.vetter@ffwll.ch>
parent eed29a5b
......@@ -120,10 +120,13 @@ static inline const char *get_global_flag(struct drm_i915_gem_object *obj)
static void
describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
{
struct drm_i915_private *dev_priv = to_i915(obj->base.dev);
struct intel_engine_cs *ring;
struct i915_vma *vma;
int pin_count = 0;
int i;
seq_printf(m, "%pK: %s%s%s%s %8zdKiB %02x %02x %x %x %x%s%s%s",
seq_printf(m, "%pK: %s%s%s%s %8zdKiB %02x %02x [ ",
&obj->base,
obj->active ? "*" : " ",
get_pin_flag(obj),
......@@ -131,8 +134,11 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
get_global_flag(obj),
obj->base.size / 1024,
obj->base.read_domains,
obj->base.write_domain,
i915_gem_request_get_seqno(obj->last_read_req),
obj->base.write_domain);
for_each_ring(ring, dev_priv, i)
seq_printf(m, "%x ",
i915_gem_request_get_seqno(obj->last_read_req[i]));
seq_printf(m, "] %x %x%s%s%s",
i915_gem_request_get_seqno(obj->last_write_req),
i915_gem_request_get_seqno(obj->last_fenced_req),
i915_cache_level_str(to_i915(obj->base.dev), obj->cache_level),
......@@ -169,9 +175,9 @@ describe_obj(struct seq_file *m, struct drm_i915_gem_object *obj)
*t = '\0';
seq_printf(m, " (%s mappable)", s);
}
if (obj->last_read_req != NULL)
if (obj->last_write_req != NULL)
seq_printf(m, " (%s)",
i915_gem_request_get_ring(obj->last_read_req)->name);
i915_gem_request_get_ring(obj->last_write_req)->name);
if (obj->frontbuffer_bits)
seq_printf(m, " (frontbuffer: 0x%03x)", obj->frontbuffer_bits);
}
......
......@@ -508,7 +508,7 @@ struct drm_i915_error_state {
struct drm_i915_error_buffer {
u32 size;
u32 name;
u32 rseqno, wseqno;
u32 rseqno[I915_NUM_RINGS], wseqno;
u32 gtt_offset;
u32 read_domains;
u32 write_domain;
......@@ -1939,7 +1939,7 @@ struct drm_i915_gem_object {
struct drm_mm_node *stolen;
struct list_head global_list;
struct list_head ring_list;
struct list_head ring_list[I915_NUM_RINGS];
/** Used in execbuf to temporarily hold a ref */
struct list_head obj_exec_link;
......@@ -1950,7 +1950,7 @@ struct drm_i915_gem_object {
* rendering and so a non-zero seqno), and is not set if it i s on
* inactive (ready to be unbound) list.
*/
unsigned int active:1;
unsigned int active:I915_NUM_RINGS;
/**
* This is set if the object has been written to since last bound
......@@ -2021,8 +2021,17 @@ struct drm_i915_gem_object {
void *dma_buf_vmapping;
int vmapping_count;
/** Breadcrumb of last rendering to the buffer. */
struct drm_i915_gem_request *last_read_req;
/** Breadcrumb of last rendering to the buffer.
* There can only be one writer, but we allow for multiple readers.
* If there is a writer that necessarily implies that all other
* read requests are complete - but we may only be lazily clearing
* the read requests. A read request is naturally the most recent
* request on a ring, so we may have two different write and read
* requests on one ring where the write request is older than the
* read request. This allows for the CPU to read from an active
* buffer by only waiting for the write to complete.
* */
struct drm_i915_gem_request *last_read_req[I915_NUM_RINGS];
struct drm_i915_gem_request *last_write_req;
/** Breadcrumb of last fenced GPU access to the buffer. */
struct drm_i915_gem_request *last_fenced_req;
......
This diff is collapsed.
......@@ -753,8 +753,6 @@ static int do_switch(struct intel_engine_cs *ring,
* swapped, but there is no way to do that yet.
*/
from->legacy_hw_ctx.rcs_state->dirty = 1;
BUG_ON(i915_gem_request_get_ring(
from->legacy_hw_ctx.rcs_state->last_read_req) != ring);
/* obj is kept alive until the next request by its active ref */
i915_gem_object_ggtt_unpin(from->legacy_hw_ctx.rcs_state);
......
......@@ -34,83 +34,35 @@ int
i915_verify_lists(struct drm_device *dev)
{
static int warned;
struct drm_i915_private *dev_priv = dev->dev_private;
struct drm_i915_private *dev_priv = to_i915(dev);
struct drm_i915_gem_object *obj;
struct intel_engine_cs *ring;
int err = 0;
int i;
if (warned)
return 0;
list_for_each_entry(obj, &dev_priv->render_ring.active_list, list) {
for_each_ring(ring, dev_priv, i) {
list_for_each_entry(obj, &ring->active_list, ring_list[ring->id]) {
if (obj->base.dev != dev ||
!atomic_read(&obj->base.refcount.refcount)) {
DRM_ERROR("freed render active %p\n", obj);
DRM_ERROR("%s: freed active obj %p\n",
ring->name, obj);
err++;
break;
} else if (!obj->active ||
(obj->base.read_domains & I915_GEM_GPU_DOMAINS) == 0) {
DRM_ERROR("invalid render active %p (a %d r %x)\n",
obj,
obj->active,
obj->base.read_domains);
obj->last_read_req[ring->id] == NULL) {
DRM_ERROR("%s: invalid active obj %p\n",
ring->name, obj);
err++;
} else if (obj->base.write_domain && list_empty(&obj->gpu_write_list)) {
DRM_ERROR("invalid render active %p (w %x, gwl %d)\n",
obj,
obj->base.write_domain,
!list_empty(&obj->gpu_write_list));
} else if (obj->base.write_domain) {
DRM_ERROR("%s: invalid write obj %p (w %x)\n",
ring->name,
obj, obj->base.write_domain);
err++;
}
}
list_for_each_entry(obj, &dev_priv->mm.flushing_list, list) {
if (obj->base.dev != dev ||
!atomic_read(&obj->base.refcount.refcount)) {
DRM_ERROR("freed flushing %p\n", obj);
err++;
break;
} else if (!obj->active ||
(obj->base.write_domain & I915_GEM_GPU_DOMAINS) == 0 ||
list_empty(&obj->gpu_write_list)) {
DRM_ERROR("invalid flushing %p (a %d w %x gwl %d)\n",
obj,
obj->active,
obj->base.write_domain,
!list_empty(&obj->gpu_write_list));
err++;
}
}
list_for_each_entry(obj, &dev_priv->mm.gpu_write_list, gpu_write_list) {
if (obj->base.dev != dev ||
!atomic_read(&obj->base.refcount.refcount)) {
DRM_ERROR("freed gpu write %p\n", obj);
err++;
break;
} else if (!obj->active ||
(obj->base.write_domain & I915_GEM_GPU_DOMAINS) == 0) {
DRM_ERROR("invalid gpu write %p (a %d w %x)\n",
obj,
obj->active,
obj->base.write_domain);
err++;
}
}
list_for_each_entry(obj, &i915_gtt_vm->inactive_list, list) {
if (obj->base.dev != dev ||
!atomic_read(&obj->base.refcount.refcount)) {
DRM_ERROR("freed inactive %p\n", obj);
err++;
break;
} else if (obj->pin_count || obj->active ||
(obj->base.write_domain & I915_GEM_GPU_DOMAINS)) {
DRM_ERROR("invalid inactive %p (p %d a %d w %x)\n",
obj,
obj->pin_count, obj->active,
obj->base.write_domain);
err++;
}
}
return warned = err;
......
......@@ -192,15 +192,20 @@ static void print_error_buffers(struct drm_i915_error_state_buf *m,
struct drm_i915_error_buffer *err,
int count)
{
int i;
err_printf(m, " %s [%d]:\n", name, count);
while (count--) {
err_printf(m, " %08x %8u %02x %02x %x %x",
err_printf(m, " %08x %8u %02x %02x [ ",
err->gtt_offset,
err->size,
err->read_domains,
err->write_domain,
err->rseqno, err->wseqno);
err->write_domain);
for (i = 0; i < I915_NUM_RINGS; i++)
err_printf(m, "%02x ", err->rseqno[i]);
err_printf(m, "] %02x", err->wseqno);
err_puts(m, pin_flag(err->pinned));
err_puts(m, tiling_flag(err->tiling));
err_puts(m, dirty_flag(err->dirty));
......@@ -681,10 +686,12 @@ static void capture_bo(struct drm_i915_error_buffer *err,
struct i915_vma *vma)
{
struct drm_i915_gem_object *obj = vma->obj;
int i;
err->size = obj->base.size;
err->name = obj->base.name;
err->rseqno = i915_gem_request_get_seqno(obj->last_read_req);
for (i = 0; i < I915_NUM_RINGS; i++)
err->rseqno[i] = i915_gem_request_get_seqno(obj->last_read_req[i]);
err->wseqno = i915_gem_request_get_seqno(obj->last_write_req);
err->gtt_offset = vma->node.start;
err->read_domains = obj->base.read_domains;
......@@ -697,8 +704,8 @@ static void capture_bo(struct drm_i915_error_buffer *err,
err->dirty = obj->dirty;
err->purgeable = obj->madv != I915_MADV_WILLNEED;
err->userptr = obj->userptr.mm != NULL;
err->ring = obj->last_read_req ?
i915_gem_request_get_ring(obj->last_read_req)->id : -1;
err->ring = obj->last_write_req ?
i915_gem_request_get_ring(obj->last_write_req)->id : -1;
err->cache_level = obj->cache_level;
}
......
......@@ -10682,7 +10682,7 @@ static bool use_mmio_flip(struct intel_engine_cs *ring,
else if (i915.enable_execlists)
return true;
else
return ring != i915_gem_request_get_ring(obj->last_read_req);
return ring != i915_gem_request_get_ring(obj->last_write_req);
}
static void skl_do_mmio_flip(struct intel_crtc *intel_crtc)
......@@ -10998,7 +10998,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
} else if (IS_IVYBRIDGE(dev) || IS_HASWELL(dev)) {
ring = &dev_priv->ring[BCS];
} else if (INTEL_INFO(dev)->gen >= 7) {
ring = i915_gem_request_get_ring(obj->last_read_req);
ring = i915_gem_request_get_ring(obj->last_write_req);
if (ring == NULL || ring->id != RCS)
ring = &dev_priv->ring[BCS];
} else {
......@@ -11014,7 +11014,7 @@ static int intel_crtc_page_flip(struct drm_crtc *crtc,
*/
ret = intel_pin_and_fence_fb_obj(crtc->primary, fb,
crtc->primary->state,
mmio_flip ? i915_gem_request_get_ring(obj->last_read_req) : ring);
mmio_flip ? i915_gem_request_get_ring(obj->last_write_req) : ring);
if (ret)
goto cleanup_pending;
......
......@@ -679,7 +679,8 @@ static int logical_ring_wait_for_space(struct intel_ringbuffer *ringbuf,
{
struct intel_engine_cs *ring = ringbuf->ring;
struct drm_i915_gem_request *request;
int ret, new_space;
unsigned space;
int ret;
if (intel_ring_space(ringbuf) >= bytes)
return 0;
......@@ -690,14 +691,13 @@ static int logical_ring_wait_for_space(struct intel_ringbuffer *ringbuf,
* from multiple ringbuffers. Here, we must ignore any that
* aren't from the ringbuffer we're considering.
*/
struct intel_context *ctx = request->ctx;
if (ctx->engine[ring->id].ringbuf != ringbuf)
if (request->ringbuf != ringbuf)
continue;
/* Would completion of this request free enough space? */
new_space = __intel_ring_space(request->postfix, ringbuf->tail,
space = __intel_ring_space(request->postfix, ringbuf->tail,
ringbuf->size);
if (new_space >= bytes)
if (space >= bytes)
break;
}
......@@ -708,11 +708,8 @@ static int logical_ring_wait_for_space(struct intel_ringbuffer *ringbuf,
if (ret)
return ret;
i915_gem_retire_requests_ring(ring);
WARN_ON(intel_ring_space(ringbuf) < new_space);
return intel_ring_space(ringbuf) >= bytes ? 0 : -ENOSPC;
ringbuf->space = space;
return 0;
}
/*
......
......@@ -228,7 +228,6 @@ static int intel_overlay_do_wait_request(struct intel_overlay *overlay,
ret = i915_wait_request(overlay->last_flip_req);
if (ret)
return ret;
i915_gem_retire_requests(dev);
i915_gem_request_assign(&overlay->last_flip_req, NULL);
return 0;
......@@ -376,7 +375,6 @@ static int intel_overlay_recover_from_interrupt(struct intel_overlay *overlay)
ret = i915_wait_request(overlay->last_flip_req);
if (ret)
return ret;
i915_gem_retire_requests(overlay->dev);
if (overlay->flip_tail)
overlay->flip_tail(overlay);
......
......@@ -2103,15 +2103,16 @@ static int ring_wait_for_space(struct intel_engine_cs *ring, int n)
{
struct intel_ringbuffer *ringbuf = ring->buffer;
struct drm_i915_gem_request *request;
int ret, new_space;
unsigned space;
int ret;
if (intel_ring_space(ringbuf) >= n)
return 0;
list_for_each_entry(request, &ring->request_list, list) {
new_space = __intel_ring_space(request->postfix, ringbuf->tail,
space = __intel_ring_space(request->postfix, ringbuf->tail,
ringbuf->size);
if (new_space >= n)
if (space >= n)
break;
}
......@@ -2122,10 +2123,7 @@ static int ring_wait_for_space(struct intel_engine_cs *ring, int n)
if (ret)
return ret;
i915_gem_retire_requests_ring(ring);
WARN_ON(intel_ring_space(ringbuf) < new_space);
ringbuf->space = space;
return 0;
}
......@@ -2172,7 +2170,11 @@ int intel_ring_idle(struct intel_engine_cs *ring)
struct drm_i915_gem_request,
list);
return i915_wait_request(req);
/* Make sure we do not trigger any retires */
return __i915_wait_request(req,
atomic_read(&to_i915(ring->dev)->gpu_error.reset_counter),
to_i915(ring->dev)->mm.interruptible,
NULL, NULL);
}
int intel_ring_alloc_request_extras(struct drm_i915_gem_request *request)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment