Commit c9358de1 authored by Brendan Cunningham, committed by Jason Gunthorpe

IB/hfi1: Fix wrong mmu_node used for user SDMA packet after invalidate

The hfi1 user SDMA pinned-page cache will leave a stale cache entry when
the cache-entry's virtual address range is invalidated but that cache
entry is in-use by an outstanding SDMA request.

Subsequent user SDMA requests with buffers in or spanning the virtual
address range of the stale cache entry will result in packets constructed
from the wrong memory, the physical pages pointed to by the stale cache
entry.

To fix this, remove mmu_rb_node cache entries from the mmu_rb_handler
cache independent of the cache entry's refcount. Add 'struct kref
refcount' to struct mmu_rb_node and manage mmu_rb_node lifetime with
kref_get() and kref_put().

mmu_rb_node.refcount makes sdma_mmu_node.refcount redundant. Remove
'atomic_t refcount' from struct sdma_mmu_node and change sdma_mmu_node
code to use mmu_rb_node.refcount.

Move the mmu_rb_handler destructor call after a
wait-for-SDMA-request-completion call so mmu_rb_nodes that need
mmu_rb_handler's workqueue to queue themselves up for destruction from an
interrupt context may do so.

Fixes: f48ad614 ("IB/hfi1: Move driver out of staging")
Fixes: 00cbce5c ("IB/hfi1: Fix bugs with non-PAGE_SIZE-end multi-iovec user SDMA requests")
Link: https://lore.kernel.org/r/168451393605.3700681.13493776139032178861.stgit@awfm-02.cornelisnetworks.com
Reviewed-by: Dean Luick <dean.luick@cornelisnetworks.com>
Signed-off-by: Brendan Cunningham <bcunningham@cornelisnetworks.com>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@cornelisnetworks.com>
Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
parent 1cc625ce
...@@ -215,11 +215,11 @@ static int hfi1_ipoib_build_ulp_payload(struct ipoib_txreq *tx, ...@@ -215,11 +215,11 @@ static int hfi1_ipoib_build_ulp_payload(struct ipoib_txreq *tx,
const skb_frag_t *frag = &skb_shinfo(skb)->frags[i]; const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
ret = sdma_txadd_page(dd, ret = sdma_txadd_page(dd,
NULL,
txreq, txreq,
skb_frag_page(frag), skb_frag_page(frag),
frag->bv_offset, frag->bv_offset,
skb_frag_size(frag)); skb_frag_size(frag),
NULL, NULL, NULL);
if (unlikely(ret)) if (unlikely(ret))
break; break;
} }
......
...@@ -19,8 +19,7 @@ static int mmu_notifier_range_start(struct mmu_notifier *, ...@@ -19,8 +19,7 @@ static int mmu_notifier_range_start(struct mmu_notifier *,
const struct mmu_notifier_range *); const struct mmu_notifier_range *);
static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *, static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *,
unsigned long, unsigned long); unsigned long, unsigned long);
static void do_remove(struct mmu_rb_handler *handler, static void release_immediate(struct kref *refcount);
struct list_head *del_list);
static void handle_remove(struct work_struct *work); static void handle_remove(struct work_struct *work);
static const struct mmu_notifier_ops mn_opts = { static const struct mmu_notifier_ops mn_opts = {
...@@ -106,7 +105,11 @@ void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler) ...@@ -106,7 +105,11 @@ void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler)
} }
spin_unlock_irqrestore(&handler->lock, flags); spin_unlock_irqrestore(&handler->lock, flags);
do_remove(handler, &del_list); while (!list_empty(&del_list)) {
rbnode = list_first_entry(&del_list, struct mmu_rb_node, list);
list_del(&rbnode->list);
kref_put(&rbnode->refcount, release_immediate);
}
/* Now the mm may be freed. */ /* Now the mm may be freed. */
mmdrop(handler->mn.mm); mmdrop(handler->mn.mm);
...@@ -134,12 +137,6 @@ int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, ...@@ -134,12 +137,6 @@ int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler,
} }
__mmu_int_rb_insert(mnode, &handler->root); __mmu_int_rb_insert(mnode, &handler->root);
list_add_tail(&mnode->list, &handler->lru_list); list_add_tail(&mnode->list, &handler->lru_list);
ret = handler->ops->insert(handler->ops_arg, mnode);
if (ret) {
__mmu_int_rb_remove(mnode, &handler->root);
list_del(&mnode->list); /* remove from LRU list */
}
mnode->handler = handler; mnode->handler = handler;
unlock: unlock:
spin_unlock_irqrestore(&handler->lock, flags); spin_unlock_irqrestore(&handler->lock, flags);
...@@ -183,6 +180,48 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler, ...@@ -183,6 +180,48 @@ static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *handler,
return node; return node;
} }
/*
* Must NOT call while holding mnode->handler->lock.
* mnode->handler->ops->remove() may sleep and mnode->handler->lock is a
* spinlock.
*/
static void release_immediate(struct kref *refcount)
{
struct mmu_rb_node *mnode =
container_of(refcount, struct mmu_rb_node, refcount);
mnode->handler->ops->remove(mnode->handler->ops_arg, mnode);
}
/* Caller must hold mnode->handler->lock */
static void release_nolock(struct kref *refcount)
{
struct mmu_rb_node *mnode =
container_of(refcount, struct mmu_rb_node, refcount);
list_move(&mnode->list, &mnode->handler->del_list);
queue_work(mnode->handler->wq, &mnode->handler->del_work);
}
/*
* struct mmu_rb_node->refcount kref_put() callback.
* Adds mmu_rb_node to mmu_rb_node->handler->del_list and queues
* handler->del_work on handler->wq.
* Does not remove mmu_rb_node from handler->lru_list or handler->rb_root.
* Acquires mmu_rb_node->handler->lock; do not call while already holding
* handler->lock.
*/
void hfi1_mmu_rb_release(struct kref *refcount)
{
struct mmu_rb_node *mnode =
container_of(refcount, struct mmu_rb_node, refcount);
struct mmu_rb_handler *handler = mnode->handler;
unsigned long flags;
spin_lock_irqsave(&handler->lock, flags);
list_move(&mnode->list, &mnode->handler->del_list);
spin_unlock_irqrestore(&handler->lock, flags);
queue_work(handler->wq, &handler->del_work);
}
void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg)
{ {
struct mmu_rb_node *rbnode, *ptr; struct mmu_rb_node *rbnode, *ptr;
...@@ -197,6 +236,10 @@ void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) ...@@ -197,6 +236,10 @@ void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg)
spin_lock_irqsave(&handler->lock, flags); spin_lock_irqsave(&handler->lock, flags);
list_for_each_entry_safe(rbnode, ptr, &handler->lru_list, list) { list_for_each_entry_safe(rbnode, ptr, &handler->lru_list, list) {
/* refcount == 1 implies mmu_rb_handler has only rbnode ref */
if (kref_read(&rbnode->refcount) > 1)
continue;
if (handler->ops->evict(handler->ops_arg, rbnode, evict_arg, if (handler->ops->evict(handler->ops_arg, rbnode, evict_arg,
&stop)) { &stop)) {
__mmu_int_rb_remove(rbnode, &handler->root); __mmu_int_rb_remove(rbnode, &handler->root);
...@@ -209,7 +252,7 @@ void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg) ...@@ -209,7 +252,7 @@ void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg)
spin_unlock_irqrestore(&handler->lock, flags); spin_unlock_irqrestore(&handler->lock, flags);
list_for_each_entry_safe(rbnode, ptr, &del_list, list) { list_for_each_entry_safe(rbnode, ptr, &del_list, list) {
handler->ops->remove(handler->ops_arg, rbnode); kref_put(&rbnode->refcount, release_immediate);
} }
} }
...@@ -221,7 +264,6 @@ static int mmu_notifier_range_start(struct mmu_notifier *mn, ...@@ -221,7 +264,6 @@ static int mmu_notifier_range_start(struct mmu_notifier *mn,
struct rb_root_cached *root = &handler->root; struct rb_root_cached *root = &handler->root;
struct mmu_rb_node *node, *ptr = NULL; struct mmu_rb_node *node, *ptr = NULL;
unsigned long flags; unsigned long flags;
bool added = false;
spin_lock_irqsave(&handler->lock, flags); spin_lock_irqsave(&handler->lock, flags);
for (node = __mmu_int_rb_iter_first(root, range->start, range->end-1); for (node = __mmu_int_rb_iter_first(root, range->start, range->end-1);
...@@ -230,38 +272,16 @@ static int mmu_notifier_range_start(struct mmu_notifier *mn, ...@@ -230,38 +272,16 @@ static int mmu_notifier_range_start(struct mmu_notifier *mn,
ptr = __mmu_int_rb_iter_next(node, range->start, ptr = __mmu_int_rb_iter_next(node, range->start,
range->end - 1); range->end - 1);
trace_hfi1_mmu_mem_invalidate(node->addr, node->len); trace_hfi1_mmu_mem_invalidate(node->addr, node->len);
if (handler->ops->invalidate(handler->ops_arg, node)) { /* Remove from rb tree and lru_list. */
__mmu_int_rb_remove(node, root); __mmu_int_rb_remove(node, root);
/* move from LRU list to delete list */ list_del_init(&node->list);
list_move(&node->list, &handler->del_list); kref_put(&node->refcount, release_nolock);
added = true;
}
} }
spin_unlock_irqrestore(&handler->lock, flags); spin_unlock_irqrestore(&handler->lock, flags);
if (added)
queue_work(handler->wq, &handler->del_work);
return 0; return 0;
} }
/*
* Call the remove function for the given handler and the list. This
* is expected to be called with a delete list extracted from handler.
* The caller should not be holding the handler lock.
*/
static void do_remove(struct mmu_rb_handler *handler,
struct list_head *del_list)
{
struct mmu_rb_node *node;
while (!list_empty(del_list)) {
node = list_first_entry(del_list, struct mmu_rb_node, list);
list_del(&node->list);
handler->ops->remove(handler->ops_arg, node);
}
}
/* /*
* Work queue function to remove all nodes that have been queued up to * Work queue function to remove all nodes that have been queued up to
* be removed. The key feature is that mm->mmap_lock is not being held * be removed. The key feature is that mm->mmap_lock is not being held
...@@ -274,11 +294,16 @@ static void handle_remove(struct work_struct *work) ...@@ -274,11 +294,16 @@ static void handle_remove(struct work_struct *work)
del_work); del_work);
struct list_head del_list; struct list_head del_list;
unsigned long flags; unsigned long flags;
struct mmu_rb_node *node;
/* remove anything that is queued to get removed */ /* remove anything that is queued to get removed */
spin_lock_irqsave(&handler->lock, flags); spin_lock_irqsave(&handler->lock, flags);
list_replace_init(&handler->del_list, &del_list); list_replace_init(&handler->del_list, &del_list);
spin_unlock_irqrestore(&handler->lock, flags); spin_unlock_irqrestore(&handler->lock, flags);
do_remove(handler, &del_list); while (!list_empty(&del_list)) {
node = list_first_entry(&del_list, struct mmu_rb_node, list);
list_del(&node->list);
handler->ops->remove(handler->ops_arg, node);
}
} }
...@@ -16,6 +16,7 @@ struct mmu_rb_node { ...@@ -16,6 +16,7 @@ struct mmu_rb_node {
struct rb_node node; struct rb_node node;
struct mmu_rb_handler *handler; struct mmu_rb_handler *handler;
struct list_head list; struct list_head list;
struct kref refcount;
}; };
/* /*
...@@ -61,6 +62,8 @@ int hfi1_mmu_rb_register(void *ops_arg, ...@@ -61,6 +62,8 @@ int hfi1_mmu_rb_register(void *ops_arg,
void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler); void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler);
int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler, int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler,
struct mmu_rb_node *mnode); struct mmu_rb_node *mnode);
void hfi1_mmu_rb_release(struct kref *refcount);
void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg); void hfi1_mmu_rb_evict(struct mmu_rb_handler *handler, void *evict_arg);
struct mmu_rb_node *hfi1_mmu_rb_get_first(struct mmu_rb_handler *handler, struct mmu_rb_node *hfi1_mmu_rb_get_first(struct mmu_rb_handler *handler,
unsigned long addr, unsigned long addr,
......
...@@ -1593,7 +1593,20 @@ static inline void sdma_unmap_desc( ...@@ -1593,7 +1593,20 @@ static inline void sdma_unmap_desc(
struct hfi1_devdata *dd, struct hfi1_devdata *dd,
struct sdma_desc *descp) struct sdma_desc *descp)
{ {
system_descriptor_complete(dd, descp); switch (sdma_mapping_type(descp)) {
case SDMA_MAP_SINGLE:
dma_unmap_single(&dd->pcidev->dev, sdma_mapping_addr(descp),
sdma_mapping_len(descp), DMA_TO_DEVICE);
break;
case SDMA_MAP_PAGE:
dma_unmap_page(&dd->pcidev->dev, sdma_mapping_addr(descp),
sdma_mapping_len(descp), DMA_TO_DEVICE);
break;
}
if (descp->pinning_ctx && descp->ctx_put)
descp->ctx_put(descp->pinning_ctx);
descp->pinning_ctx = NULL;
} }
/* /*
...@@ -3113,8 +3126,8 @@ int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx, ...@@ -3113,8 +3126,8 @@ int ext_coal_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx,
/* Add descriptor for coalesce buffer */ /* Add descriptor for coalesce buffer */
tx->desc_limit = MAX_DESC; tx->desc_limit = MAX_DESC;
return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, NULL, tx, return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx,
addr, tx->tlen); addr, tx->tlen, NULL, NULL, NULL);
} }
return 1; return 1;
...@@ -3157,9 +3170,9 @@ int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx) ...@@ -3157,9 +3170,9 @@ int _pad_sdma_tx_descs(struct hfi1_devdata *dd, struct sdma_txreq *tx)
make_tx_sdma_desc( make_tx_sdma_desc(
tx, tx,
SDMA_MAP_NONE, SDMA_MAP_NONE,
NULL,
dd->sdma_pad_phys, dd->sdma_pad_phys,
sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1))); sizeof(u32) - (tx->packet_len & (sizeof(u32) - 1)),
NULL, NULL, NULL);
tx->num_desc++; tx->num_desc++;
_sdma_close_tx(dd, tx); _sdma_close_tx(dd, tx);
return rval; return rval;
......
...@@ -594,9 +594,11 @@ static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d) ...@@ -594,9 +594,11 @@ static inline dma_addr_t sdma_mapping_addr(struct sdma_desc *d)
static inline void make_tx_sdma_desc( static inline void make_tx_sdma_desc(
struct sdma_txreq *tx, struct sdma_txreq *tx,
int type, int type,
void *pinning_ctx,
dma_addr_t addr, dma_addr_t addr,
size_t len) size_t len,
void *pinning_ctx,
void (*ctx_get)(void *),
void (*ctx_put)(void *))
{ {
struct sdma_desc *desc = &tx->descp[tx->num_desc]; struct sdma_desc *desc = &tx->descp[tx->num_desc];
...@@ -613,7 +615,11 @@ static inline void make_tx_sdma_desc( ...@@ -613,7 +615,11 @@ static inline void make_tx_sdma_desc(
<< SDMA_DESC0_PHY_ADDR_SHIFT) | << SDMA_DESC0_PHY_ADDR_SHIFT) |
(((u64)len & SDMA_DESC0_BYTE_COUNT_MASK) (((u64)len & SDMA_DESC0_BYTE_COUNT_MASK)
<< SDMA_DESC0_BYTE_COUNT_SHIFT); << SDMA_DESC0_BYTE_COUNT_SHIFT);
desc->pinning_ctx = pinning_ctx; desc->pinning_ctx = pinning_ctx;
desc->ctx_put = ctx_put;
if (pinning_ctx && ctx_get)
ctx_get(pinning_ctx);
} }
/* helper to extend txreq */ /* helper to extend txreq */
...@@ -645,18 +651,20 @@ static inline void _sdma_close_tx(struct hfi1_devdata *dd, ...@@ -645,18 +651,20 @@ static inline void _sdma_close_tx(struct hfi1_devdata *dd,
static inline int _sdma_txadd_daddr( static inline int _sdma_txadd_daddr(
struct hfi1_devdata *dd, struct hfi1_devdata *dd,
int type, int type,
void *pinning_ctx,
struct sdma_txreq *tx, struct sdma_txreq *tx,
dma_addr_t addr, dma_addr_t addr,
u16 len) u16 len,
void *pinning_ctx,
void (*ctx_get)(void *),
void (*ctx_put)(void *))
{ {
int rval = 0; int rval = 0;
make_tx_sdma_desc( make_tx_sdma_desc(
tx, tx,
type, type,
pinning_ctx, addr, len,
addr, len); pinning_ctx, ctx_get, ctx_put);
WARN_ON(len > tx->tlen); WARN_ON(len > tx->tlen);
tx->num_desc++; tx->num_desc++;
tx->tlen -= len; tx->tlen -= len;
...@@ -676,11 +684,18 @@ static inline int _sdma_txadd_daddr( ...@@ -676,11 +684,18 @@ static inline int _sdma_txadd_daddr(
/** /**
* sdma_txadd_page() - add a page to the sdma_txreq * sdma_txadd_page() - add a page to the sdma_txreq
* @dd: the device to use for mapping * @dd: the device to use for mapping
* @pinning_ctx: context to be released at descriptor retirement
* @tx: tx request to which the page is added * @tx: tx request to which the page is added
* @page: page to map * @page: page to map
* @offset: offset within the page * @offset: offset within the page
* @len: length in bytes * @len: length in bytes
* @pinning_ctx: context to be stored on struct sdma_desc .pinning_ctx. Not
* added if coalesce buffer is used. E.g. pointer to pinned-page
* cache entry for the sdma_desc.
* @ctx_get: optional function to take reference to @pinning_ctx. Not called if
* @pinning_ctx is NULL.
* @ctx_put: optional function to release reference to @pinning_ctx after
* sdma_desc completes. May be called in interrupt context so must
* not sleep. Not called if @pinning_ctx is NULL.
* *
* This is used to add a page/offset/length descriptor. * This is used to add a page/offset/length descriptor.
* *
...@@ -692,11 +707,13 @@ static inline int _sdma_txadd_daddr( ...@@ -692,11 +707,13 @@ static inline int _sdma_txadd_daddr(
*/ */
static inline int sdma_txadd_page( static inline int sdma_txadd_page(
struct hfi1_devdata *dd, struct hfi1_devdata *dd,
void *pinning_ctx,
struct sdma_txreq *tx, struct sdma_txreq *tx,
struct page *page, struct page *page,
unsigned long offset, unsigned long offset,
u16 len) u16 len,
void *pinning_ctx,
void (*ctx_get)(void *),
void (*ctx_put)(void *))
{ {
dma_addr_t addr; dma_addr_t addr;
int rval; int rval;
...@@ -720,7 +737,8 @@ static inline int sdma_txadd_page( ...@@ -720,7 +737,8 @@ static inline int sdma_txadd_page(
return -ENOSPC; return -ENOSPC;
} }
return _sdma_txadd_daddr(dd, SDMA_MAP_PAGE, pinning_ctx, tx, addr, len); return _sdma_txadd_daddr(dd, SDMA_MAP_PAGE, tx, addr, len,
pinning_ctx, ctx_get, ctx_put);
} }
/** /**
...@@ -754,8 +772,8 @@ static inline int sdma_txadd_daddr( ...@@ -754,8 +772,8 @@ static inline int sdma_txadd_daddr(
return rval; return rval;
} }
return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, NULL, tx, return _sdma_txadd_daddr(dd, SDMA_MAP_NONE, tx, addr, len,
addr, len); NULL, NULL, NULL);
} }
/** /**
...@@ -801,7 +819,8 @@ static inline int sdma_txadd_kvaddr( ...@@ -801,7 +819,8 @@ static inline int sdma_txadd_kvaddr(
return -ENOSPC; return -ENOSPC;
} }
return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, NULL, tx, addr, len); return _sdma_txadd_daddr(dd, SDMA_MAP_SINGLE, tx, addr, len,
NULL, NULL, NULL);
} }
struct iowait_work; struct iowait_work;
...@@ -1034,6 +1053,4 @@ u16 sdma_get_descq_cnt(void); ...@@ -1034,6 +1053,4 @@ u16 sdma_get_descq_cnt(void);
extern uint mod_num_sdma; extern uint mod_num_sdma;
void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid); void sdma_update_lmc(struct hfi1_devdata *dd, u64 mask, u32 lid);
void system_descriptor_complete(struct hfi1_devdata *dd, struct sdma_desc *descp);
#endif #endif
...@@ -20,6 +20,8 @@ struct sdma_desc { ...@@ -20,6 +20,8 @@ struct sdma_desc {
/* private: don't use directly */ /* private: don't use directly */
u64 qw[2]; u64 qw[2];
void *pinning_ctx; void *pinning_ctx;
/* Release reference to @pinning_ctx. May be called in interrupt context. Must not sleep. */
void (*ctx_put)(void *ctx);
}; };
/** /**
......
...@@ -62,18 +62,14 @@ static int defer_packet_queue( ...@@ -62,18 +62,14 @@ static int defer_packet_queue(
static void activate_packet_queue(struct iowait *wait, int reason); static void activate_packet_queue(struct iowait *wait, int reason);
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
unsigned long len); unsigned long len);
static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
void *arg2, bool *stop); void *arg2, bool *stop);
static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode); static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode);
static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode);
static struct mmu_rb_ops sdma_rb_ops = { static struct mmu_rb_ops sdma_rb_ops = {
.filter = sdma_rb_filter, .filter = sdma_rb_filter,
.insert = sdma_rb_insert,
.evict = sdma_rb_evict, .evict = sdma_rb_evict,
.remove = sdma_rb_remove, .remove = sdma_rb_remove,
.invalidate = sdma_rb_invalidate
}; };
static int add_system_pages_to_sdma_packet(struct user_sdma_request *req, static int add_system_pages_to_sdma_packet(struct user_sdma_request *req,
...@@ -247,14 +243,14 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd, ...@@ -247,14 +243,14 @@ int hfi1_user_sdma_free_queues(struct hfi1_filedata *fd,
spin_unlock(&fd->pq_rcu_lock); spin_unlock(&fd->pq_rcu_lock);
synchronize_srcu(&fd->pq_srcu); synchronize_srcu(&fd->pq_srcu);
/* at this point there can be no more new requests */ /* at this point there can be no more new requests */
if (pq->handler)
hfi1_mmu_rb_unregister(pq->handler);
iowait_sdma_drain(&pq->busy); iowait_sdma_drain(&pq->busy);
/* Wait until all requests have been freed. */ /* Wait until all requests have been freed. */
wait_event_interruptible( wait_event_interruptible(
pq->wait, pq->wait,
!atomic_read(&pq->n_reqs)); !atomic_read(&pq->n_reqs));
kfree(pq->reqs); kfree(pq->reqs);
if (pq->handler)
hfi1_mmu_rb_unregister(pq->handler);
bitmap_free(pq->req_in_use); bitmap_free(pq->req_in_use);
kmem_cache_destroy(pq->txreq_cache); kmem_cache_destroy(pq->txreq_cache);
flush_pq_iowait(pq); flush_pq_iowait(pq);
...@@ -1275,25 +1271,17 @@ static void free_system_node(struct sdma_mmu_node *node) ...@@ -1275,25 +1271,17 @@ static void free_system_node(struct sdma_mmu_node *node)
kfree(node); kfree(node);
} }
static inline void acquire_node(struct sdma_mmu_node *node) /*
{ * kref_get()'s an additional kref on the returned rb_node to prevent rb_node
atomic_inc(&node->refcount); * from being released until after rb_node is assigned to an SDMA descriptor
WARN_ON(atomic_read(&node->refcount) < 0); * (struct sdma_desc) under add_system_iovec_to_sdma_packet(), even if the
} * virtual address range for rb_node is invalidated between now and then.
*/
static inline void release_node(struct mmu_rb_handler *handler,
struct sdma_mmu_node *node)
{
atomic_dec(&node->refcount);
WARN_ON(atomic_read(&node->refcount) < 0);
}
static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler, static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler,
unsigned long start, unsigned long start,
unsigned long end) unsigned long end)
{ {
struct mmu_rb_node *rb_node; struct mmu_rb_node *rb_node;
struct sdma_mmu_node *node;
unsigned long flags; unsigned long flags;
spin_lock_irqsave(&handler->lock, flags); spin_lock_irqsave(&handler->lock, flags);
...@@ -1302,11 +1290,12 @@ static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler, ...@@ -1302,11 +1290,12 @@ static struct sdma_mmu_node *find_system_node(struct mmu_rb_handler *handler,
spin_unlock_irqrestore(&handler->lock, flags); spin_unlock_irqrestore(&handler->lock, flags);
return NULL; return NULL;
} }
node = container_of(rb_node, struct sdma_mmu_node, rb);
acquire_node(node); /* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */
kref_get(&rb_node->refcount);
spin_unlock_irqrestore(&handler->lock, flags); spin_unlock_irqrestore(&handler->lock, flags);
return node; return container_of(rb_node, struct sdma_mmu_node, rb);
} }
static int pin_system_pages(struct user_sdma_request *req, static int pin_system_pages(struct user_sdma_request *req,
...@@ -1355,6 +1344,13 @@ static int pin_system_pages(struct user_sdma_request *req, ...@@ -1355,6 +1344,13 @@ static int pin_system_pages(struct user_sdma_request *req,
return 0; return 0;
} }
/*
* kref refcount on *node_p will be 2 on successful addition: one kref from
* kref_init() for mmu_rb_handler and one kref to prevent *node_p from being
* released until after *node_p is assigned to an SDMA descriptor (struct
* sdma_desc) under add_system_iovec_to_sdma_packet(), even if the virtual
* address range for *node_p is invalidated between now and then.
*/
static int add_system_pinning(struct user_sdma_request *req, static int add_system_pinning(struct user_sdma_request *req,
struct sdma_mmu_node **node_p, struct sdma_mmu_node **node_p,
unsigned long start, unsigned long len) unsigned long start, unsigned long len)
...@@ -1368,6 +1364,12 @@ static int add_system_pinning(struct user_sdma_request *req, ...@@ -1368,6 +1364,12 @@ static int add_system_pinning(struct user_sdma_request *req,
if (!node) if (!node)
return -ENOMEM; return -ENOMEM;
/* First kref "moves" to mmu_rb_handler */
kref_init(&node->rb.refcount);
/* "safety" kref to prevent release before add_system_iovec_to_sdma_packet() */
kref_get(&node->rb.refcount);
node->pq = pq; node->pq = pq;
ret = pin_system_pages(req, start, len, node, PFN_DOWN(len)); ret = pin_system_pages(req, start, len, node, PFN_DOWN(len));
if (ret == 0) { if (ret == 0) {
...@@ -1431,15 +1433,15 @@ static int get_system_cache_entry(struct user_sdma_request *req, ...@@ -1431,15 +1433,15 @@ static int get_system_cache_entry(struct user_sdma_request *req,
return 0; return 0;
} }
SDMA_DBG(req, "prepend: node->rb.addr %lx, node->refcount %d", SDMA_DBG(req, "prepend: node->rb.addr %lx, node->rb.refcount %d",
node->rb.addr, atomic_read(&node->refcount)); node->rb.addr, kref_read(&node->rb.refcount));
prepend_len = node->rb.addr - start; prepend_len = node->rb.addr - start;
/* /*
* This node will not be returned, instead a new node * This node will not be returned, instead a new node
* will be. So release the reference. * will be. So release the reference.
*/ */
release_node(handler, node); kref_put(&node->rb.refcount, hfi1_mmu_rb_release);
/* Prepend a node to cover the beginning of the allocation */ /* Prepend a node to cover the beginning of the allocation */
ret = add_system_pinning(req, node_p, start, prepend_len); ret = add_system_pinning(req, node_p, start, prepend_len);
...@@ -1451,6 +1453,20 @@ static int get_system_cache_entry(struct user_sdma_request *req, ...@@ -1451,6 +1453,20 @@ static int get_system_cache_entry(struct user_sdma_request *req,
} }
} }
static void sdma_mmu_rb_node_get(void *ctx)
{
struct mmu_rb_node *node = ctx;
kref_get(&node->refcount);
}
static void sdma_mmu_rb_node_put(void *ctx)
{
struct sdma_mmu_node *node = ctx;
kref_put(&node->rb.refcount, hfi1_mmu_rb_release);
}
static int add_mapping_to_sdma_packet(struct user_sdma_request *req, static int add_mapping_to_sdma_packet(struct user_sdma_request *req,
struct user_sdma_txreq *tx, struct user_sdma_txreq *tx,
struct sdma_mmu_node *cache_entry, struct sdma_mmu_node *cache_entry,
...@@ -1494,9 +1510,12 @@ static int add_mapping_to_sdma_packet(struct user_sdma_request *req, ...@@ -1494,9 +1510,12 @@ static int add_mapping_to_sdma_packet(struct user_sdma_request *req,
ctx = cache_entry; ctx = cache_entry;
} }
ret = sdma_txadd_page(pq->dd, ctx, &tx->txreq, ret = sdma_txadd_page(pq->dd, &tx->txreq,
cache_entry->pages[page_index], cache_entry->pages[page_index],
page_offset, from_this_page); page_offset, from_this_page,
ctx,
sdma_mmu_rb_node_get,
sdma_mmu_rb_node_put);
if (ret) { if (ret) {
/* /*
* When there's a failure, the entire request is freed by * When there's a failure, the entire request is freed by
...@@ -1518,8 +1537,6 @@ static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req, ...@@ -1518,8 +1537,6 @@ static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req,
struct user_sdma_iovec *iovec, struct user_sdma_iovec *iovec,
size_t from_this_iovec) size_t from_this_iovec)
{ {
struct mmu_rb_handler *handler = req->pq->handler;
while (from_this_iovec > 0) { while (from_this_iovec > 0) {
struct sdma_mmu_node *cache_entry; struct sdma_mmu_node *cache_entry;
size_t from_this_cache_entry; size_t from_this_cache_entry;
...@@ -1540,15 +1557,15 @@ static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req, ...@@ -1540,15 +1557,15 @@ static int add_system_iovec_to_sdma_packet(struct user_sdma_request *req,
ret = add_mapping_to_sdma_packet(req, tx, cache_entry, start, ret = add_mapping_to_sdma_packet(req, tx, cache_entry, start,
from_this_cache_entry); from_this_cache_entry);
/*
* Done adding cache_entry to zero or more sdma_desc. Can
* kref_put() the "safety" kref taken under
* get_system_cache_entry().
*/
kref_put(&cache_entry->rb.refcount, hfi1_mmu_rb_release);
if (ret) { if (ret) {
/*
* We're guaranteed that there will be no descriptor
* completion callback that releases this node
* because only the last descriptor referencing it
* has a context attached, and a failure means the
* last descriptor was never added.
*/
release_node(handler, cache_entry);
SDMA_DBG(req, "add system segment failed %d", ret); SDMA_DBG(req, "add system segment failed %d", ret);
return ret; return ret;
} }
...@@ -1599,42 +1616,12 @@ static int add_system_pages_to_sdma_packet(struct user_sdma_request *req, ...@@ -1599,42 +1616,12 @@ static int add_system_pages_to_sdma_packet(struct user_sdma_request *req,
return 0; return 0;
} }
void system_descriptor_complete(struct hfi1_devdata *dd,
struct sdma_desc *descp)
{
switch (sdma_mapping_type(descp)) {
case SDMA_MAP_SINGLE:
dma_unmap_single(&dd->pcidev->dev, sdma_mapping_addr(descp),
sdma_mapping_len(descp), DMA_TO_DEVICE);
break;
case SDMA_MAP_PAGE:
dma_unmap_page(&dd->pcidev->dev, sdma_mapping_addr(descp),
sdma_mapping_len(descp), DMA_TO_DEVICE);
break;
}
if (descp->pinning_ctx) {
struct sdma_mmu_node *node = descp->pinning_ctx;
release_node(node->rb.handler, node);
}
}
static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr, static bool sdma_rb_filter(struct mmu_rb_node *node, unsigned long addr,
unsigned long len) unsigned long len)
{ {
return (bool)(node->addr == addr); return (bool)(node->addr == addr);
} }
static int sdma_rb_insert(void *arg, struct mmu_rb_node *mnode)
{
struct sdma_mmu_node *node =
container_of(mnode, struct sdma_mmu_node, rb);
atomic_inc(&node->refcount);
return 0;
}
/* /*
* Return 1 to remove the node from the rb tree and call the remove op. * Return 1 to remove the node from the rb tree and call the remove op.
* *
...@@ -1647,10 +1634,6 @@ static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode, ...@@ -1647,10 +1634,6 @@ static int sdma_rb_evict(void *arg, struct mmu_rb_node *mnode,
container_of(mnode, struct sdma_mmu_node, rb); container_of(mnode, struct sdma_mmu_node, rb);
struct evict_data *evict_data = evict_arg; struct evict_data *evict_data = evict_arg;
/* is this node still being used? */
if (atomic_read(&node->refcount))
return 0; /* keep this node */
/* this node will be evicted, add its pages to our count */ /* this node will be evicted, add its pages to our count */
evict_data->cleared += node->npages; evict_data->cleared += node->npages;
...@@ -1668,13 +1651,3 @@ static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode) ...@@ -1668,13 +1651,3 @@ static void sdma_rb_remove(void *arg, struct mmu_rb_node *mnode)
free_system_node(node); free_system_node(node);
} }
static int sdma_rb_invalidate(void *arg, struct mmu_rb_node *mnode)
{
struct sdma_mmu_node *node =
container_of(mnode, struct sdma_mmu_node, rb);
if (!atomic_read(&node->refcount))
return 1;
return 0;
}
...@@ -104,7 +104,6 @@ struct hfi1_user_sdma_comp_q { ...@@ -104,7 +104,6 @@ struct hfi1_user_sdma_comp_q {
struct sdma_mmu_node { struct sdma_mmu_node {
struct mmu_rb_node rb; struct mmu_rb_node rb;
struct hfi1_user_sdma_pkt_q *pq; struct hfi1_user_sdma_pkt_q *pq;
atomic_t refcount;
struct page **pages; struct page **pages;
unsigned int npages; unsigned int npages;
}; };
......
...@@ -64,11 +64,11 @@ static noinline int build_vnic_ulp_payload(struct sdma_engine *sde, ...@@ -64,11 +64,11 @@ static noinline int build_vnic_ulp_payload(struct sdma_engine *sde,
/* combine physically continuous fragments later? */ /* combine physically continuous fragments later? */
ret = sdma_txadd_page(sde->dd, ret = sdma_txadd_page(sde->dd,
NULL,
&tx->txreq, &tx->txreq,
skb_frag_page(frag), skb_frag_page(frag),
skb_frag_off(frag), skb_frag_off(frag),
skb_frag_size(frag)); skb_frag_size(frag),
NULL, NULL, NULL);
if (unlikely(ret)) if (unlikely(ret))
goto bail_txadd; goto bail_txadd;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment