Commit 4809bba7 authored by David S. Miller

Merge branch 'net-rds-RDMA-fixes'

Gerd Rausch says:

====================
net/rds: RDMA fixes

A number of net/rds fixes necessary to make "rds_rdma.ko"
pass some basic Oracle internal tests.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents f11fe1da aa494893
@@ -156,6 +156,7 @@ struct rds_ib_connection {
 
 	/* To control the number of wrs from fastreg */
 	atomic_t i_fastreg_wrs;
+	atomic_t i_fastreg_inuse_count;
 
 	/* interrupt handling */
 	struct tasklet_struct i_send_tasklet;
...
@@ -40,6 +40,7 @@
 #include "rds_single_path.h"
 #include "rds.h"
 #include "ib.h"
+#include "ib_mr.h"
 
 /*
  * Set the selected protocol version
@@ -526,7 +527,6 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	attr.qp_type = IB_QPT_RC;
 	attr.send_cq = ic->i_send_cq;
 	attr.recv_cq = ic->i_recv_cq;
-	atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
 
 	/*
 	 * XXX this can fail if max_*_wr is too large? Are we supposed
@@ -993,6 +993,11 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 				 ic->i_cm_id, err);
 		}
 
+		/* kick off "flush_worker" for all pools in order to reap
+		 * all FRMR registrations that are still marked "FRMR_IS_INUSE"
+		 */
+		rds_ib_flush_mrs();
+
 		/*
 		 * We want to wait for tx and rx completion to finish
 		 * before we tear down the connection, but we have to be
@@ -1005,6 +1010,7 @@ void rds_ib_conn_path_shutdown(struct rds_conn_path *cp)
 		wait_event(rds_ib_ring_empty_wait,
 			   rds_ib_ring_empty(&ic->i_recv_ring) &&
 			   (atomic_read(&ic->i_signaled_sends) == 0) &&
+			   (atomic_read(&ic->i_fastreg_inuse_count) == 0) &&
 			   (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
 		tasklet_kill(&ic->i_send_tasklet);
 		tasklet_kill(&ic->i_recv_tasklet);
@@ -1132,6 +1138,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 	spin_lock_init(&ic->i_ack_lock);
 #endif
 	atomic_set(&ic->i_signaled_sends, 0);
+	atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
 
 	/*
 	 * rds_ib_conn_shutdown() waits for these to be emptied so they
...
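Illustration (not part of the commit): the ib.h and ib_cm.c hunks above add a per-connection in-use counter, i_fastreg_inuse_count, so that rds_ib_conn_path_shutdown() waits until every fast-registration work request has completed before tearing the connection down. A minimal userspace sketch of that quiescing pattern, assuming C11 atomics and a pthread condition variable in place of the kernel's atomic_t, wait_event() and wake_up(); all names below are illustrative and not taken from the patch:

#include <pthread.h>
#include <stdatomic.h>

static atomic_int inuse_count;			/* role of ic->i_fastreg_inuse_count */
static pthread_mutex_t quiesce_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t quiesce_wake = PTHREAD_COND_INITIALIZER;

static void reg_start(void)
{
	atomic_fetch_add(&inuse_count, 1);	/* a registration was posted */
}

static void reg_complete(void)
{
	/* release ordering: the registration's stores become visible before
	 * the counter drop, in the spirit of smp_mb__before_atomic()
	 */
	if (atomic_fetch_sub_explicit(&inuse_count, 1, memory_order_release) == 1) {
		pthread_mutex_lock(&quiesce_lock);
		pthread_cond_broadcast(&quiesce_wake);	/* role of wake_up() */
		pthread_mutex_unlock(&quiesce_lock);
	}
}

static void shutdown_wait_quiesced(void)
{
	/* role of wait_event(rds_ib_ring_empty_wait, ... inuse_count == 0 ...) */
	pthread_mutex_lock(&quiesce_lock);
	while (atomic_load(&inuse_count) != 0)
		pthread_cond_wait(&quiesce_wake, &quiesce_lock);
	pthread_mutex_unlock(&quiesce_lock);
}

The rds_ib_flush_mrs() call added to the shutdown path plays the producer-side role here: per the diff's own comment it kicks the pool flush workers so that MRs still marked "FRMR_IS_INUSE" are reaped and the counter can drain to zero.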
@@ -32,6 +32,24 @@
 
 #include "ib_mr.h"
 
+static inline void
+rds_transition_frwr_state(struct rds_ib_mr *ibmr,
+			  enum rds_ib_fr_state old_state,
+			  enum rds_ib_fr_state new_state)
+{
+	if (cmpxchg(&ibmr->u.frmr.fr_state,
+		    old_state, new_state) == old_state &&
+	    old_state == FRMR_IS_INUSE) {
+		/* enforce order of ibmr->u.frmr.fr_state update
+		 * before decrementing i_fastreg_inuse_count
+		 */
+		smp_mb__before_atomic();
+		atomic_dec(&ibmr->ic->i_fastreg_inuse_count);
+		if (waitqueue_active(&rds_ib_ring_empty_wait))
+			wake_up(&rds_ib_ring_empty_wait);
+	}
+}
+
 static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
 					   int npages)
 {
@@ -75,6 +93,8 @@ static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
 	pool->max_items_soft = pool->max_items;
 
 	frmr->fr_state = FRMR_IS_FREE;
+	init_waitqueue_head(&frmr->fr_inv_done);
+	init_waitqueue_head(&frmr->fr_reg_done);
 	return ibmr;
 
 out_no_cigar:
@@ -116,13 +136,19 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
 	if (unlikely(ret != ibmr->sg_len))
 		return ret < 0 ? ret : -EINVAL;
 
+	if (cmpxchg(&frmr->fr_state,
+		    FRMR_IS_FREE, FRMR_IS_INUSE) != FRMR_IS_FREE)
+		return -EBUSY;
+
+	atomic_inc(&ibmr->ic->i_fastreg_inuse_count);
+
 	/* Perform a WR for the fast_reg_mr. Each individual page
 	 * in the sg list is added to the fast reg page list and placed
 	 * inside the fast_reg_mr WR. The key used is a rolling 8bit
 	 * counter, which should guarantee uniqueness.
 	 */
 	ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++);
-	frmr->fr_state = FRMR_IS_INUSE;
+	frmr->fr_reg = true;
 
 	memset(&reg_wr, 0, sizeof(reg_wr));
 	reg_wr.wr.wr_id = (unsigned long)(void *)ibmr;
@@ -138,12 +164,23 @@ static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
 
 	ret = ib_post_send(ibmr->ic->i_cm_id->qp, &reg_wr.wr, NULL);
 	if (unlikely(ret)) {
 		/* Failure here can be because of -ENOMEM as well */
-		frmr->fr_state = FRMR_IS_STALE;
+		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);
 		atomic_inc(&ibmr->ic->i_fastreg_wrs);
 		if (printk_ratelimit())
 			pr_warn("RDS/IB: %s returned error(%d)\n",
 				__func__, ret);
+		goto out;
 	}
+
+	/* Wait for the registration to complete in order to prevent an invalid
+	 * access error resulting from a race between the memory region already
+	 * being accessed while registration is still pending.
+	 */
+	wait_event(frmr->fr_reg_done, !frmr->fr_reg);
+
+out:
 	return ret;
 }
@@ -255,12 +292,29 @@ static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
 
 	ret = ib_post_send(i_cm_id->qp, s_wr, NULL);
 	if (unlikely(ret)) {
-		frmr->fr_state = FRMR_IS_STALE;
+		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);
 		frmr->fr_inv = false;
+		/* enforce order of frmr->fr_inv update
+		 * before incrementing i_fastreg_wrs
+		 */
+		smp_mb__before_atomic();
 		atomic_inc(&ibmr->ic->i_fastreg_wrs);
 		pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
 		goto out;
 	}
+
+	/* Wait for the FRMR_IS_FREE (or FRMR_IS_STALE) transition in order to
+	 * 1) avoid a silly bouncing between "clean_list" and "drop_list"
+	 *    triggered by function "rds_ib_reg_frmr" as it is releases frmr
+	 *    regions whose state is not "FRMR_IS_FREE" right away.
+	 * 2) prevents an invalid access error in a race
+	 *    from a pending "IB_WR_LOCAL_INV" operation
+	 *    with a teardown ("dma_unmap_sg", "put_page")
+	 *    and de-registration ("ib_dereg_mr") of the corresponding
+	 *    memory region.
+	 */
+	wait_event(frmr->fr_inv_done, frmr->fr_state != FRMR_IS_INUSE);
+
 out:
 	return ret;
 }
@@ -271,7 +325,7 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 	struct rds_ib_frmr *frmr = &ibmr->u.frmr;
 
 	if (wc->status != IB_WC_SUCCESS) {
-		frmr->fr_state = FRMR_IS_STALE;
+		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_STALE);
 		if (rds_conn_up(ic->conn))
 			rds_ib_conn_error(ic->conn,
 					  "frmr completion <%pI4,%pI4> status %u(%s), vendor_err 0x%x, disconnecting and reconnecting\n",
@@ -283,10 +337,20 @@ void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
 	}
 
 	if (frmr->fr_inv) {
-		frmr->fr_state = FRMR_IS_FREE;
+		rds_transition_frwr_state(ibmr, FRMR_IS_INUSE, FRMR_IS_FREE);
 		frmr->fr_inv = false;
+		wake_up(&frmr->fr_inv_done);
 	}
 
+	if (frmr->fr_reg) {
+		frmr->fr_reg = false;
+		wake_up(&frmr->fr_reg_done);
+	}
+
+	/* enforce order of frmr->{fr_reg,fr_inv} update
+	 * before incrementing i_fastreg_wrs
+	 */
+	smp_mb__before_atomic();
 	atomic_inc(&ic->i_fastreg_wrs);
 }
@@ -295,14 +359,18 @@ void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
 {
 	struct rds_ib_mr *ibmr, *next;
 	struct rds_ib_frmr *frmr;
-	int ret = 0;
+	int ret = 0, ret2;
 	unsigned int freed = *nfreed;
 
 	/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
 	list_for_each_entry(ibmr, list, unmap_list) {
-		if (ibmr->sg_dma_len)
-			ret |= rds_ib_post_inv(ibmr);
+		if (ibmr->sg_dma_len) {
+			ret2 = rds_ib_post_inv(ibmr);
+			if (ret2 && !ret)
+				ret = ret2;
+		}
 	}
 	if (ret)
 		pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, ret);
...
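Illustration (not part of the commit): the ib_frmr.c hunks above turn registration and invalidation into a post-then-wait handshake. The posting side marks the work as pending (fr_reg = true, or fr_state == FRMR_IS_INUSE for invalidations), sleeps on fr_reg_done/fr_inv_done, and rds_ib_mr_cqe_handler() clears the flag and wakes it, so teardown (dma_unmap_sg, put_page, ib_dereg_mr) can no longer race with an in-flight work request. A minimal userspace sketch of that handshake, assuming a mutex/condvar pair in place of the kernel waitqueue; all names below are illustrative and not taken from the patch:

#include <pthread.h>
#include <stdbool.h>

static bool reg_pending;			/* role of frmr->fr_reg */
static pthread_mutex_t done_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done_wake = PTHREAD_COND_INITIALIZER;	/* role of fr_reg_done */

/* posting path, role of rds_ib_post_reg_frmr(): post, then wait for the CQE */
static int post_and_wait(int (*post_wr)(void))
{
	int ret;

	pthread_mutex_lock(&done_lock);
	reg_pending = true;			/* like frmr->fr_reg = true */
	pthread_mutex_unlock(&done_lock);

	ret = post_wr();			/* like ib_post_send() */
	if (ret) {
		pthread_mutex_lock(&done_lock);
		reg_pending = false;		/* nothing in flight, skip the wait */
		pthread_mutex_unlock(&done_lock);
		return ret;
	}

	pthread_mutex_lock(&done_lock);
	while (reg_pending)			/* like wait_event(fr_reg_done, !frmr->fr_reg) */
		pthread_cond_wait(&done_wake, &done_lock);
	pthread_mutex_unlock(&done_lock);
	return 0;
}

/* completion path, role of rds_ib_mr_cqe_handler() */
static void completion(void)
{
	pthread_mutex_lock(&done_lock);
	reg_pending = false;			/* like frmr->fr_reg = false */
	pthread_cond_broadcast(&done_wake);	/* like wake_up(&frmr->fr_reg_done) */
	pthread_mutex_unlock(&done_lock);
}

The failure branch mirrors the diff's "goto out": when the post fails there is nothing in flight, so the pending state is rolled back and the wait is skipped.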
@@ -57,6 +57,9 @@ struct rds_ib_frmr {
 	struct ib_mr *mr;
 	enum rds_ib_fr_state fr_state;
 	bool fr_inv;
+	wait_queue_head_t fr_inv_done;
+	bool fr_reg;
+	wait_queue_head_t fr_reg_done;
 	struct ib_send_wr fr_wr;
 	unsigned int dma_npages;
 	unsigned int sg_byte_len;
@@ -97,6 +100,7 @@ struct rds_ib_mr_pool {
 	struct llist_head free_list;	/* unused MRs */
 	struct llist_head clean_list;	/* unused & unmapped MRs */
 	wait_queue_head_t flush_wait;
+	spinlock_t clean_lock;		/* "clean_list" concurrency */
 	atomic_t free_pinned;		/* memory pinned by free MRs */
 	unsigned long max_items;
...
@@ -40,9 +40,6 @@
 
 struct workqueue_struct *rds_ib_mr_wq;
 
-static DEFINE_PER_CPU(unsigned long, clean_list_grace);
-#define CLEAN_LIST_BUSY_BIT 0
-
 static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
 {
 	struct rds_ib_device *rds_ibdev;
@@ -195,12 +192,11 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
 {
 	struct rds_ib_mr *ibmr = NULL;
 	struct llist_node *ret;
-	unsigned long *flag;
+	unsigned long flags;
 
-	preempt_disable();
-	flag = this_cpu_ptr(&clean_list_grace);
-	set_bit(CLEAN_LIST_BUSY_BIT, flag);
+	spin_lock_irqsave(&pool->clean_lock, flags);
 	ret = llist_del_first(&pool->clean_list);
+	spin_unlock_irqrestore(&pool->clean_lock, flags);
 	if (ret) {
 		ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
 		if (pool->pool_type == RDS_IB_MR_8K_POOL)
@@ -209,23 +205,9 @@ struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
 			rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
 	}
 
-	clear_bit(CLEAN_LIST_BUSY_BIT, flag);
-	preempt_enable();
 	return ibmr;
 }
 
-static inline void wait_clean_list_grace(void)
-{
-	int cpu;
-	unsigned long *flag;
-
-	for_each_online_cpu(cpu) {
-		flag = &per_cpu(clean_list_grace, cpu);
-		while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
-			cpu_relax();
-	}
-}
-
 void rds_ib_sync_mr(void *trans_private, int direction)
 {
 	struct rds_ib_mr *ibmr = trans_private;
@@ -324,8 +306,7 @@ static unsigned int llist_append_to_list(struct llist_head *llist,
  * of clusters.  Each cluster has linked llist nodes of
  * MR_CLUSTER_SIZE mrs that are ready for reuse.
  */
-static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
-				struct list_head *list,
+static void list_to_llist_nodes(struct list_head *list,
 				struct llist_node **nodes_head,
 				struct llist_node **nodes_tail)
 {
@@ -402,8 +383,13 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 	 */
 	dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list);
 	dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list);
-	if (free_all)
+	if (free_all) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&pool->clean_lock, flags);
 		llist_append_to_list(&pool->clean_list, &unmap_list);
+		spin_unlock_irqrestore(&pool->clean_lock, flags);
+	}
 
 	free_goal = rds_ib_flush_goal(pool, free_all);
@@ -416,27 +402,20 @@ int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
 		rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);
 
 	if (!list_empty(&unmap_list)) {
-		/* we have to make sure that none of the things we're about
-		 * to put on the clean list would race with other cpus trying
-		 * to pull items off.  The llist would explode if we managed to
-		 * remove something from the clean list and then add it back again
-		 * while another CPU was spinning on that same item in llist_del_first.
-		 *
-		 * This is pretty unlikely, but just in case wait for an llist grace period
-		 * here before adding anything back into the clean list.
-		 */
-		wait_clean_list_grace();
+		unsigned long flags;
 
-		list_to_llist_nodes(pool, &unmap_list, &clean_nodes, &clean_tail);
+		list_to_llist_nodes(&unmap_list, &clean_nodes, &clean_tail);
 		if (ibmr_ret) {
 			*ibmr_ret = llist_entry(clean_nodes, struct rds_ib_mr, llnode);
 			clean_nodes = clean_nodes->next;
 		}
 		/* more than one entry in llist nodes */
-		if (clean_nodes)
+		if (clean_nodes) {
+			spin_lock_irqsave(&pool->clean_lock, flags);
 			llist_add_batch(clean_nodes, clean_tail,
 					&pool->clean_list);
+			spin_unlock_irqrestore(&pool->clean_lock, flags);
+		}
 	}
 
 	atomic_sub(unpinned, &pool->free_pinned);
@@ -471,7 +450,7 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
 			rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
 		else
 			rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
-		return ERR_PTR(-EAGAIN);
+		break;
 	}
/* We do have some empty MRs. Flush them out. */ /* We do have some empty MRs. Flush them out. */
...@@ -485,7 +464,7 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool) ...@@ -485,7 +464,7 @@ struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
return ibmr; return ibmr;
} }
return ibmr; return NULL;
} }
static void rds_ib_mr_pool_flush_worker(struct work_struct *work) static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
@@ -610,6 +589,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
 	init_llist_head(&pool->free_list);
 	init_llist_head(&pool->drop_list);
 	init_llist_head(&pool->clean_list);
+	spin_lock_init(&pool->clean_lock);
 	mutex_init(&pool->flush_lock);
 	init_waitqueue_head(&pool->flush_wait);
 	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
...
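Illustration (not part of the commit): the ib_rdma.c hunks replace the per-CPU clean_list_grace busy-bit and wait_clean_list_grace() with pool->clean_lock. Producers keep adding to the llist lock-free, but llist_del_first() and the batch re-insertion in rds_ib_flush_mr_pool() are now serialized by the lock, which is the external synchronization llist_del_first() needs to avoid the ABA reuse problem the deleted comment described. A minimal userspace sketch of that producer/consumer split, assuming C11 atomics for the lock-free push and a mutex for the pop; all names below are illustrative and not taken from the patch:

#include <pthread.h>
#include <stdatomic.h>
#include <stddef.h>

struct node {
	struct node *next;
};

static _Atomic(struct node *) clean_list;	/* role of pool->clean_list */
static pthread_mutex_t clean_lock = PTHREAD_MUTEX_INITIALIZER;	/* role of pool->clean_lock */

/* lock-free push, role of llist_add()/llist_add_batch() on the producer side */
static void clean_push(struct node *n)
{
	struct node *head = atomic_load(&clean_list);

	do {
		n->next = head;
	} while (!atomic_compare_exchange_weak(&clean_list, &head, n));
}

/* pop under clean_lock, role of llist_del_first() under pool->clean_lock:
 * the CAS tolerates concurrent pushes, while the lock keeps two poppers from
 * racing over the same head (the ABA case the old busy-bit guarded against)
 */
static struct node *clean_pop(void)
{
	struct node *head;

	pthread_mutex_lock(&clean_lock);
	head = atomic_load(&clean_list);
	while (head &&
	       !atomic_compare_exchange_weak(&clean_list, &head, head->next))
		;
	pthread_mutex_unlock(&clean_lock);
	return head;
}

In the patch the same lock also wraps the llist_add_batch() of recycled MRs and the clean_list drain in the free_all case, so an entry that was just popped and recycled cannot reappear underneath a concurrent llist_del_first().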