Commit df9f85d8 authored by Chris Wilson

drm/i915: Serialise i915_active_fence_set() with itself

The expected downside to commit 58b4c1a0 ("drm/i915: Reduce nested
prepare_remote_context() to a trylock") was that it would need to return
-EAGAIN to userspace in order to resolve potential mutex inversion. Such
an unsightly round trip is unnecessary if we could atomically insert a
barrier into the i915_active_fence, so make it happen.

Currently, we use the timeline->mutex (or some other named outer lock)
to order insertion into the i915_active_fence (and so individual nodes
of i915_active). Inside __i915_active_fence_set, we then only need to
serialise with the interrupt handler in order to claim the timeline for
ourselves.

However, if we remove the outer lock, we need to ensure the order is
intact between not only multiple threads trying to insert themselves
into the timeline, but also with the interrupt handler completing the
previous occupant. We use xchg() on insert so that we have an ordered
sequence of insertions (and each caller knows the previous fence on
which to wait, preserving the chain of all fences in the timeline), but
we then have to cmpxchg() in the interrupt handler to avoid overwriting
the new occupant. The only nasty side-effect is having to temporarily
strip off the RCU-annotations to apply the atomic operations, otherwise
the rules are much more conventional!
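
As an illustration of the scheme described above (not part of the patch
itself), the handover can be modelled in plain userspace C, with C11
atomics standing in for the kernel's xchg()/cmpxchg(); every name in the
sketch (struct fence, struct active_slot, slot_set, slot_retire) is
invented for the example:

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

struct fence { int seqno; };

struct active_slot {
        _Atomic(struct fence *) fence;  /* last fence on this "timeline" */
};

/* Insertion path: claim the slot and learn which fence we must wait on. */
static struct fence *slot_set(struct active_slot *slot, struct fence *f)
{
        /* xchg(): each caller sees exactly one previous occupant. */
        return atomic_exchange(&slot->fence, f);
}

/* "Interrupt" path: retire a fence without clobbering a newer occupant. */
static bool slot_retire(struct active_slot *slot, struct fence *f)
{
        struct fence *expected = f;

        /* cmpxchg(): only clear the slot if we are still the occupant. */
        return atomic_compare_exchange_strong(&slot->fence, &expected,
                                              (struct fence *)NULL);
}

int main(void)
{
        struct active_slot slot;
        struct fence a = { 1 }, b = { 2 };

        atomic_init(&slot.fence, NULL);

        assert(slot_set(&slot, &a) == NULL); /* A is first, waits on no one */
        assert(slot_set(&slot, &b) == &a);   /* B must wait on A */
        assert(!slot_retire(&slot, &a));     /* A's retire loses: B owns the slot */
        assert(slot_retire(&slot, &b));      /* B's retire clears the slot */
        printf("handover ordered correctly\n");
        return 0;
}

The compare-and-swap in slot_retire() is what lets the real retirement
callback run from interrupt context without the timeline mutex: if a newer
fence has already taken the slot, the callback simply loses the race rather
than overwriting the new occupant, mirroring active_fence_cb() below.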

Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=112402
Fixes: 58b4c1a0 ("drm/i915: Reduce nested prepare_remote_context() to a trylock")
Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20191127134527.3438410-1-chris@chris-wilson.co.uk
parent 730eaeb5
@@ -310,27 +310,8 @@ int intel_context_prepare_remote_request(struct intel_context *ce,
 	GEM_BUG_ON(rq->hw_context == ce);
 
 	if (rcu_access_pointer(rq->timeline) != tl) { /* timeline sharing! */
-		/*
-		 * Ideally, we just want to insert our foreign fence as
-		 * a barrier into the remove context, such that this operation
-		 * occurs after all current operations in that context, and
-		 * all future operations must occur after this.
-		 *
-		 * Currently, the timeline->last_request tracking is guarded
-		 * by its mutex and so we must obtain that to atomically
-		 * insert our barrier. However, since we already hold our
-		 * timeline->mutex, we must be careful against potential
-		 * inversion if we are the kernel_context as the remote context
-		 * will itself poke at the kernel_context when it needs to
-		 * unpin. Ergo, if already locked, we drop both locks and
-		 * try again (through the magic of userspace repeating EAGAIN).
-		 */
-		if (!mutex_trylock(&tl->mutex))
-			return -EAGAIN;
-
 		/* Queue this switch after current activity by this context. */
 		err = i915_active_fence_set(&tl->last_request, rq);
-		mutex_unlock(&tl->mutex);
 		if (err)
 			return err;
 	}
@@ -183,7 +183,7 @@ static void call_idle_barriers(struct intel_engine_cs *engine)
 			container_of((struct list_head *)node,
 				     typeof(*cb), node);
 
-		cb->func(NULL, cb);
+		cb->func(ERR_PTR(-EAGAIN), cb);
 	}
 }
@@ -254,7 +254,7 @@ int intel_timeline_init(struct intel_timeline *timeline,
 	mutex_init(&timeline->mutex);
 
-	INIT_ACTIVE_FENCE(&timeline->last_request, &timeline->mutex);
+	INIT_ACTIVE_FENCE(&timeline->last_request);
 	INIT_LIST_HEAD(&timeline->requests);
 
 	i915_syncmap_init(&timeline->sync);
@@ -15,7 +15,7 @@ void mock_timeline_init(struct intel_timeline *timeline, u64 context)
 	mutex_init(&timeline->mutex);
 
-	INIT_ACTIVE_FENCE(&timeline->last_request, &timeline->mutex);
+	INIT_ACTIVE_FENCE(&timeline->last_request);
 	INIT_LIST_HEAD(&timeline->requests);
 
 	i915_syncmap_init(&timeline->sync);
@@ -186,18 +186,33 @@ active_retire(struct i915_active *ref)
 	__active_retire(ref);
 }
 
+static inline struct dma_fence **
+__active_fence_slot(struct i915_active_fence *active)
+{
+	return (struct dma_fence ** __force)&active->fence;
+}
+
+static inline bool
+active_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
+{
+	struct i915_active_fence *active =
+		container_of(cb, typeof(*active), cb);
+
+	return cmpxchg(__active_fence_slot(active), fence, NULL) == fence;
+}
+
 static void
 node_retire(struct dma_fence *fence, struct dma_fence_cb *cb)
 {
-	i915_active_fence_cb(fence, cb);
-	active_retire(container_of(cb, struct active_node, base.cb)->ref);
+	if (active_fence_cb(fence, cb))
+		active_retire(container_of(cb, struct active_node, base.cb)->ref);
 }
 
 static void
 excl_retire(struct dma_fence *fence, struct dma_fence_cb *cb)
 {
-	i915_active_fence_cb(fence, cb);
-	active_retire(container_of(cb, struct i915_active, excl.cb));
+	if (active_fence_cb(fence, cb))
+		active_retire(container_of(cb, struct i915_active, excl.cb));
 }
 
 static struct i915_active_fence *
@@ -244,7 +259,7 @@ active_instance(struct i915_active *ref, struct intel_timeline *tl)
 	}
 
 	node = prealloc;
-	__i915_active_fence_init(&node->base, &tl->mutex, NULL, node_retire);
+	__i915_active_fence_init(&node->base, NULL, node_retire);
 	node->ref = ref;
 	node->timeline = idx;
@@ -281,7 +296,7 @@ void __i915_active_init(struct i915_active *ref,
 	init_llist_head(&ref->preallocated_barriers);
 	atomic_set(&ref->count, 0);
 	__mutex_init(&ref->mutex, "i915_active", key);
-	__i915_active_fence_init(&ref->excl, &ref->mutex, NULL, excl_retire);
+	__i915_active_fence_init(&ref->excl, NULL, excl_retire);
 	INIT_WORK(&ref->work, active_work);
 }
@@ -376,15 +391,8 @@ void i915_active_set_exclusive(struct i915_active *ref, struct dma_fence *f)
 	/* We expect the caller to manage the exclusive timeline ordering */
 	GEM_BUG_ON(i915_active_is_idle(ref));
 
-	/*
-	 * As we don't know which mutex the caller is using, we told a small
-	 * lie to the debug code that it is using the i915_active.mutex;
-	 * and now we must stick to that lie.
-	 */
-	mutex_acquire(&ref->mutex.dep_map, 0, 0, _THIS_IP_);
 	if (!__i915_active_fence_set(&ref->excl, f))
 		atomic_inc(&ref->count);
-	mutex_release(&ref->mutex.dep_map, 0, _THIS_IP_);
 }
 
 bool i915_active_acquire_if_busy(struct i915_active *ref)
@@ -615,10 +623,6 @@ int i915_active_acquire_preallocate_barrier(struct i915_active *ref,
 			goto unwind;
 		}
 
-#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
-		node->base.lock =
-			&engine->kernel_context->timeline->mutex;
-#endif
 		RCU_INIT_POINTER(node->base.fence, NULL);
 		node->base.cb.func = node_retire;
 		node->timeline = idx;
@@ -639,6 +643,7 @@ int i915_active_acquire_preallocate_barrier(struct i915_active *ref,
 		node->base.cb.node.prev = (void *)engine;
 		atomic_inc(&ref->count);
 	}
+	GEM_BUG_ON(rcu_access_pointer(node->base.fence) != ERR_PTR(-EAGAIN));
 
 	GEM_BUG_ON(barrier_to_engine(node) != engine);
 	llist_add(barrier_to_ll(node), &ref->preallocated_barriers);
@@ -702,6 +707,11 @@ void i915_active_acquire_barrier(struct i915_active *ref)
 	}
 }
 
+static struct dma_fence **ll_to_fence_slot(struct llist_node *node)
+{
+	return __active_fence_slot(&barrier_from_ll(node)->base);
+}
+
 void i915_request_add_active_barriers(struct i915_request *rq)
 {
 	struct intel_engine_cs *engine = rq->engine;
@@ -721,19 +731,13 @@ void i915_request_add_active_barriers(struct i915_request *rq)
 	 */
 	spin_lock_irqsave(&rq->lock, flags);
 	llist_for_each_safe(node, next, node) {
-		RCU_INIT_POINTER(barrier_from_ll(node)->base.fence, &rq->fence);
-		smp_wmb(); /* serialise with reuse_idle_barrier */
+		/* serialise with reuse_idle_barrier */
+		smp_store_mb(*ll_to_fence_slot(node), &rq->fence);
 		list_add_tail((struct list_head *)node, &rq->fence.cb_list);
 	}
 	spin_unlock_irqrestore(&rq->lock, flags);
 }
 
-#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
-#define active_is_held(active) lockdep_is_held((active)->lock)
-#else
-#define active_is_held(active) true
-#endif
-
 /*
  * __i915_active_fence_set: Update the last active fence along its timeline
  * @active: the active tracker
@@ -744,7 +748,7 @@ void i915_request_add_active_barriers(struct i915_request *rq)
  * fence onto this one. Returns the previous fence (if not already completed),
  * which the caller must ensure is executed before the new fence. To ensure
  * that the order of fences within the timeline of the i915_active_fence is
- * maintained, it must be locked by the caller.
+ * understood, it should be locked by the caller.
  */
 struct dma_fence *
 __i915_active_fence_set(struct i915_active_fence *active,
@@ -753,34 +757,41 @@ __i915_active_fence_set(struct i915_active_fence *active,
 	struct dma_fence *prev;
 	unsigned long flags;
 
-	/* NB: must be serialised by an outer timeline mutex (active->lock) */
-	spin_lock_irqsave(fence->lock, flags);
+	if (fence == rcu_access_pointer(active->fence))
+		return fence;
+
 	GEM_BUG_ON(test_bit(DMA_FENCE_FLAG_SIGNALED_BIT, &fence->flags));
 
-	prev = rcu_dereference_protected(active->fence, active_is_held(active));
+	/*
+	 * Consider that we have two threads arriving (A and B), with
+	 * C already resident as the active->fence.
+	 *
+	 * A does the xchg first, and so it sees C or NULL depending
+	 * on the timing of the interrupt handler. If it is NULL, the
+	 * previous fence must have been signaled and we know that
+	 * we are first on the timeline. If it is still present,
+	 * we acquire the lock on that fence and serialise with the interrupt
+	 * handler, in the process removing it from any future interrupt
+	 * callback. A will then wait on C before executing (if present).
+	 *
+	 * As B is second, it sees A as the previous fence and so waits for
+	 * it to complete its transition and takes over the occupancy for
+	 * itself -- remembering that it needs to wait on A before executing.
+	 *
+	 * Note the strong ordering of the timeline also provides consistent
+	 * nesting rules for the fence->lock; the inner lock is always the
+	 * older lock.
+	 */
+	spin_lock_irqsave(fence->lock, flags);
+	prev = xchg(__active_fence_slot(active), fence);
 	if (prev) {
 		GEM_BUG_ON(prev == fence);
 		spin_lock_nested(prev->lock, SINGLE_DEPTH_NESTING);
 		__list_del_entry(&active->cb.node);
 		spin_unlock(prev->lock); /* serialise with prev->cb_list */
-
-		/*
-		 * active->fence is reset by the callback from inside
-		 * interrupt context. We need to serialise our list
-		 * manipulation with the fence->lock to prevent the prev
-		 * being lost inside an interrupt (it can't be replaced as
-		 * no other caller is allowed to enter __i915_active_fence_set
-		 * as we hold the timeline lock). After serialising with
-		 * the callback, we need to double check which ran first,
-		 * our list_del() [decoupling prev from the callback] or
-		 * the callback...
-		 */
-		prev = rcu_access_pointer(active->fence);
 	}
-
-	rcu_assign_pointer(active->fence, fence);
+	GEM_BUG_ON(rcu_access_pointer(active->fence) != fence);
 	list_add_tail(&active->cb.node, &fence->cb_list);
 	spin_unlock_irqrestore(fence->lock, flags);
 
 	return prev;
@@ -792,10 +803,6 @@ int i915_active_fence_set(struct i915_active_fence *active,
 	struct dma_fence *fence;
 	int err = 0;
 
-#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
-	lockdep_assert_held(active->lock);
-#endif
-
 	/* Must maintain timeline ordering wrt previous active requests */
 	rcu_read_lock();
 	fence = __i915_active_fence_set(active, &rq->fence);
@@ -812,7 +819,7 @@ int i915_active_fence_set(struct i915_active_fence *active,
 
 void i915_active_noop(struct dma_fence *fence, struct dma_fence_cb *cb)
 {
-	i915_active_fence_cb(fence, cb);
+	active_fence_cb(fence, cb);
 }
 
 #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
@@ -61,19 +61,15 @@ void i915_active_noop(struct dma_fence *fence, struct dma_fence_cb *cb);
  */
 static inline void
 __i915_active_fence_init(struct i915_active_fence *active,
-			 struct mutex *lock,
 			 void *fence,
 			 dma_fence_func_t fn)
 {
 	RCU_INIT_POINTER(active->fence, fence);
 	active->cb.func = fn ?: i915_active_noop;
-#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
-	active->lock = lock;
-#endif
 }
 
-#define INIT_ACTIVE_FENCE(A, LOCK) \
-	__i915_active_fence_init((A), (LOCK), NULL, NULL)
+#define INIT_ACTIVE_FENCE(A) \
+	__i915_active_fence_init((A), NULL, NULL)
 
 struct dma_fence *
 __i915_active_fence_set(struct i915_active_fence *active,
@@ -127,15 +123,6 @@ i915_active_fence_isset(const struct i915_active_fence *active)
 	return rcu_access_pointer(active->fence);
 }
 
-static inline void
-i915_active_fence_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
-{
-	struct i915_active_fence *active =
-		container_of(cb, typeof(*active), cb);
-
-	RCU_INIT_POINTER(active->fence, NULL);
-}
-
 /*
  * GPU activity tracking
  *
@@ -20,21 +20,6 @@
 struct i915_active_fence {
 	struct dma_fence __rcu *fence;
 	struct dma_fence_cb cb;
-#if IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM)
-	/*
-	 * Incorporeal!
-	 *
-	 * Updates to the i915_active_request must be serialised under a lock
-	 * to ensure that the timeline is ordered. Normally, this is the
-	 * timeline->mutex, but another mutex may be used so long as it is
-	 * done so consistently.
-	 *
-	 * For lockdep tracking of the above, we store the lock we intend
-	 * to always use for updates of this i915_active_request during
-	 * construction and assert that is held on every update.
-	 */
-	struct mutex *lock;
-#endif
 };
 
 struct active_node;