Commit fcb3f55f authored by David S. Miller

Merge branch 'rds-support-FRMR-and-cleanups'

Santosh Shilimkar says:

====================
RDS: Major clean-up with couple of new features for 4.6

v3:
Re-generated the same series, omitting the "-D" option from the git format-patch
command. Since the first patch has file removals, git apply/am can't deal
with it when formatted with the '-D' option.

v2:
Dropped module parameter from [PATCH 11/13] as suggested by David Miller

Series is generated against net-next but also applies against Linus's tip
cleanly. Entire patchset is available at below git tree:

 git://git.kernel.org/pub/scm/linux/kernel/git/ssantosh/linux.git for_4.6/net-next/rds_v2

The diff-stat looks a bit scary since almost 4K lines of code are
getting removed. Brief summary of the series:

- Drop the stale iWARP support:
	RDS iWARP support code has become stale and untestable for
	some time.  As discussed and agreed earlier on the list, I am dropping
	its support for good. If new iWARP user(s) show up in the future,
	the plan is to adapt the existing IB RDMA with a special sink case.
- RDS gets SO_TIMESTAMP support
- Long overdue RDS maintainer entry gets updated
- Some RDS IB code refactoring towards new FastReg Memory registration (FRMR)
- Lastly the initial support for FRMR

RDS IB RDMA performance with FRMR is not yet as good as FMR and I do have
some patches in progress to address that. But they are not ready for 4.6
so I left them out of this series.

I am also keeping an eye on the new CQ API adaptations that other ULPs are
doing and will try to adapt RDS for the same, most likely in the 4.7+ timeframe.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents afc3de92 1659185f
......@@ -19,9 +19,7 @@ to N*N if you use a connection-oriented socket transport like TCP.
RDS is not Infiniband-specific; it was designed to support different
transports. The current implementation used to support RDS over TCP as well
as IB. Work is in progress to support RDS over iWARP, and using DCE to
guarantee no dropped packets on Ethernet, it may be possible to use RDS over
UDP in the future.
as IB.
The high-level semantics of RDS from the application's point of view are
......
......@@ -9076,10 +9076,14 @@ S: Maintained
F: drivers/net/ethernet/rdc/r6040.c
RDS - RELIABLE DATAGRAM SOCKETS
M: Chien Yen <chien.yen@oracle.com>
M: Santosh Shilimkar <santosh.shilimkar@oracle.com>
L: netdev@vger.kernel.org
L: linux-rdma@vger.kernel.org
L: rds-devel@oss.oracle.com (moderated for non-subscribers)
W: https://oss.oracle.com/projects/rds/
S: Supported
F: net/rds/
F: Documentation/networking/rds.txt
READ-COPY UPDATE (RCU)
M: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
......
......@@ -4,14 +4,13 @@ config RDS
depends on INET
---help---
The RDS (Reliable Datagram Sockets) protocol provides reliable,
sequenced delivery of datagrams over Infiniband, iWARP,
or TCP.
sequenced delivery of datagrams over Infiniband or TCP.
config RDS_RDMA
tristate "RDS over Infiniband and iWARP"
tristate "RDS over Infiniband"
depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
---help---
Allow RDS to use Infiniband and iWARP as a transport.
Allow RDS to use Infiniband as a transport.
This transport supports RDMA operations.
config RDS_TCP
......
......@@ -6,9 +6,7 @@ rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
rds_rdma-y := rdma_transport.o \
ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
ib_sysctl.o ib_rdma.o \
iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
iw_sysctl.o iw_rdma.o
ib_sysctl.o ib_rdma.o ib_fmr.o ib_frmr.o
obj-$(CONFIG_RDS_TCP) += rds_tcp.o
......
......@@ -277,6 +277,27 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval,
return rs->rs_transport ? 0 : -ENOPROTOOPT;
}
/*
 * Turn receive timestamping on or off for an RDS socket.
 *
 * @sk:     socket whose SOCK_RCVTSTAMP flag is updated
 * @optval: user pointer to an int; non-zero enables, zero disables
 * @optlen: size of the user buffer; must be exactly sizeof(int)
 *
 * Returns 0 on success, or -EFAULT when the length is wrong or the value
 * cannot be copied in from user space.
 */
static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
				 int optlen)
{
	int requested;

	if (optlen != sizeof(int))
		return -EFAULT;

	if (get_user(requested, (int __user *)optval))
		return -EFAULT;

	/* Any non-zero value counts as "enable". */
	if (requested != 0)
		sock_set_flag(sk, SOCK_RCVTSTAMP);
	else
		sock_reset_flag(sk, SOCK_RCVTSTAMP);

	return 0;
}
static int rds_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
......@@ -312,6 +333,11 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
ret = rds_set_transport(rs, optval, optlen);
release_sock(sock->sk);
break;
case SO_TIMESTAMP:
lock_sock(sock->sk);
ret = rds_enable_recvtstamp(sock->sk, optval, optlen);
release_sock(sock->sk);
break;
default:
ret = -ENOPROTOOPT;
}
......
......@@ -42,15 +42,16 @@
#include "rds.h"
#include "ib.h"
#include "ib_mr.h"
unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE;
unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE;
unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE;
unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE;
unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
module_param(rds_ib_fmr_1m_pool_size, int, 0444);
MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1M fmr per HCA");
module_param(rds_ib_fmr_8k_pool_size, int, 0444);
MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8K fmr per HCA");
module_param(rds_ib_mr_1m_pool_size, int, 0444);
MODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA");
module_param(rds_ib_mr_8k_pool_size, int, 0444);
MODULE_PARM_DESC(rds_ib_mr_8k_pool_size, " Max number of 8K mr per HCA");
module_param(rds_ib_retry_count, int, 0444);
MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
......@@ -139,14 +140,20 @@ static void rds_ib_add_one(struct ib_device *device)
rds_ibdev->max_wrs = device->attrs.max_qp_wr;
rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE);
rds_ibdev->has_fr = (device->attrs.device_cap_flags &
IB_DEVICE_MEM_MGT_EXTENSIONS);
rds_ibdev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
device->map_phys_fmr && device->unmap_fmr);
rds_ibdev->use_fastreg = (rds_ibdev->has_fr && !rds_ibdev->has_fmr);
rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
rds_ibdev->max_1m_fmrs = device->attrs.max_mr ?
rds_ibdev->max_1m_mrs = device->attrs.max_mr ?
min_t(unsigned int, (device->attrs.max_mr / 2),
rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size;
rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size;
rds_ibdev->max_8k_fmrs = device->attrs.max_mr ?
rds_ibdev->max_8k_mrs = device->attrs.max_mr ?
min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE),
rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size;
rds_ib_mr_8k_pool_size) : rds_ib_mr_8k_pool_size;
rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
......@@ -172,10 +179,14 @@ static void rds_ib_add_one(struct ib_device *device)
goto put_dev;
}
rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n",
rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_mrs = %d, max_8k_mrs = %d\n",
device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs,
rds_ibdev->max_8k_fmrs);
rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_mrs,
rds_ibdev->max_8k_mrs);
pr_info("RDS/IB: %s: %s supported and preferred\n",
device->name,
rds_ibdev->use_fastreg ? "FRMR" : "FMR");
INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
INIT_LIST_HEAD(&rds_ibdev->conn_list);
......@@ -364,7 +375,7 @@ void rds_ib_exit(void)
rds_ib_sysctl_exit();
rds_ib_recv_exit();
rds_trans_unregister(&rds_ib_transport);
rds_ib_fmr_exit();
rds_ib_mr_exit();
}
struct rds_transport rds_ib_transport = {
......@@ -400,13 +411,13 @@ int rds_ib_init(void)
INIT_LIST_HEAD(&rds_ib_devices);
ret = rds_ib_fmr_init();
ret = rds_ib_mr_init();
if (ret)
goto out;
ret = ib_register_client(&rds_ib_client);
if (ret)
goto out_fmr_exit;
goto out_mr_exit;
ret = rds_ib_sysctl_init();
if (ret)
......@@ -430,8 +441,8 @@ int rds_ib_init(void)
rds_ib_sysctl_exit();
out_ibreg:
rds_ib_unregister_client();
out_fmr_exit:
rds_ib_fmr_exit();
out_mr_exit:
rds_ib_mr_exit();
out:
return ret;
}
......
......@@ -9,17 +9,12 @@
#include "rds.h"
#include "rdma_transport.h"
#define RDS_FMR_1M_POOL_SIZE (8192 / 2)
#define RDS_FMR_1M_MSG_SIZE 256
#define RDS_FMR_8K_MSG_SIZE 2
#define RDS_MR_8K_SCALE (256 / (RDS_FMR_8K_MSG_SIZE + 1))
#define RDS_FMR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2))
#define RDS_IB_MAX_SGE 8
#define RDS_IB_RECV_SGE 2
#define RDS_IB_DEFAULT_RECV_WR 1024
#define RDS_IB_DEFAULT_SEND_WR 256
#define RDS_IB_DEFAULT_FR_WR 512
#define RDS_IB_DEFAULT_RETRY_COUNT 2
......@@ -28,7 +23,6 @@
#define RDS_IB_RECYCLE_BATCH_COUNT 32
#define RDS_IB_WC_MAX 32
#define RDS_IB_SEND_OP BIT_ULL(63)
extern struct rw_semaphore rds_ib_devices_lock;
extern struct list_head rds_ib_devices;
......@@ -129,6 +123,9 @@ struct rds_ib_connection {
struct ib_wc i_send_wc[RDS_IB_WC_MAX];
struct ib_wc i_recv_wc[RDS_IB_WC_MAX];
/* To control the number of wrs from fastreg */
atomic_t i_fastreg_wrs;
/* interrupt handling */
struct tasklet_struct i_send_tasklet;
struct tasklet_struct i_recv_tasklet;
......@@ -207,12 +204,16 @@ struct rds_ib_device {
struct list_head conn_list;
struct ib_device *dev;
struct ib_pd *pd;
unsigned int max_fmrs;
bool has_fmr;
bool has_fr;
bool use_fastreg;
unsigned int max_mrs;
struct rds_ib_mr_pool *mr_1m_pool;
struct rds_ib_mr_pool *mr_8k_pool;
unsigned int fmr_max_remaps;
unsigned int max_8k_fmrs;
unsigned int max_1m_fmrs;
unsigned int max_8k_mrs;
unsigned int max_1m_mrs;
int max_sge;
unsigned int max_wrs;
unsigned int max_initiator_depth;
......@@ -266,6 +267,8 @@ struct rds_ib_statistics {
uint64_t s_ib_rdma_mr_1m_pool_flush;
uint64_t s_ib_rdma_mr_1m_pool_wait;
uint64_t s_ib_rdma_mr_1m_pool_depleted;
uint64_t s_ib_rdma_mr_8k_reused;
uint64_t s_ib_rdma_mr_1m_reused;
uint64_t s_ib_atomic_cswp;
uint64_t s_ib_atomic_fadd;
};
......@@ -317,8 +320,6 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
extern struct ib_client rds_ib_client;
extern unsigned int rds_ib_fmr_1m_pool_size;
extern unsigned int rds_ib_fmr_8k_pool_size;
extern unsigned int rds_ib_retry_count;
extern spinlock_t ib_nodev_conns_lock;
......@@ -348,17 +349,7 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_destroy_nodev_conns(void);
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
int npages);
void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
struct rds_sock *rs, u32 *key_ret);
void rds_ib_sync_mr(void *trans_private, int dir);
void rds_ib_free_mr(void *trans_private, int invalidate);
void rds_ib_flush_mrs(void);
int rds_ib_fmr_init(void);
void rds_ib_fmr_exit(void);
void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
/* ib_recv.c */
int rds_ib_recv_init(void);
......
......@@ -236,12 +236,10 @@ static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context)
tasklet_schedule(&ic->i_recv_tasklet);
}
static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq,
struct ib_wc *wcs,
struct rds_ib_ack_state *ack_state)
static void poll_scq(struct rds_ib_connection *ic, struct ib_cq *cq,
struct ib_wc *wcs)
{
int nr;
int i;
int nr, i;
struct ib_wc *wc;
while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
......@@ -251,10 +249,12 @@ static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq,
(unsigned long long)wc->wr_id, wc->status,
wc->byte_len, be32_to_cpu(wc->ex.imm_data));
if (wc->wr_id & RDS_IB_SEND_OP)
if (wc->wr_id <= ic->i_send_ring.w_nr ||
wc->wr_id == RDS_IB_ACK_WR_ID)
rds_ib_send_cqe_handler(ic, wc);
else
rds_ib_recv_cqe_handler(ic, wc, ack_state);
rds_ib_mr_cqe_handler(ic, wc);
}
}
}
......@@ -263,14 +263,12 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
{
struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
struct rds_connection *conn = ic->conn;
struct rds_ib_ack_state state;
rds_ib_stats_inc(s_ib_tasklet_call);
memset(&state, 0, sizeof(state));
poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state);
poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state);
poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
if (rds_conn_up(conn) &&
(!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
......@@ -278,6 +276,25 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
rds_send_xmit(ic->conn);
}
static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq,
struct ib_wc *wcs,
struct rds_ib_ack_state *ack_state)
{
int nr, i;
struct ib_wc *wc;
while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
for (i = 0; i < nr; i++) {
wc = wcs + i;
rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
(unsigned long long)wc->wr_id, wc->status,
wc->byte_len, be32_to_cpu(wc->ex.imm_data));
rds_ib_recv_cqe_handler(ic, wc, ack_state);
}
}
}
static void rds_ib_tasklet_fn_recv(unsigned long data)
{
struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
......@@ -291,9 +308,9 @@ static void rds_ib_tasklet_fn_recv(unsigned long data)
rds_ib_stats_inc(s_ib_tasklet_call);
memset(&state, 0, sizeof(state));
poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
if (state.ack_next_valid)
rds_ib_set_ack(ic, state.ack_next, state.ack_required);
......@@ -351,7 +368,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
struct ib_qp_init_attr attr;
struct ib_cq_init_attr cq_attr = {};
struct rds_ib_device *rds_ibdev;
int ret;
int ret, fr_queue_space;
/*
* It's normal to see a null device if an incoming connection races
......@@ -361,6 +378,12 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
if (!rds_ibdev)
return -EOPNOTSUPP;
/* The fr_queue_space is currently set to 512, to add extra space on
* completion queue and send queue. This extra space is used for FRMR
* registration and invalidation work requests
*/
fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0);
/* add the conn now so that connection establishment has the dev */
rds_ib_add_conn(rds_ibdev, conn);
......@@ -372,7 +395,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
/* Protection domain and memory range */
ic->i_pd = rds_ibdev->pd;
cq_attr.cqe = ic->i_send_ring.w_nr + 1;
cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
rds_ib_cq_event_handler, conn,
......@@ -412,7 +435,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
attr.event_handler = rds_ib_qp_event_handler;
attr.qp_context = conn;
/* + 1 to allow for the single ack message */
attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
attr.cap.max_send_wr = ic->i_send_ring.w_nr + fr_queue_space + 1;
attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
attr.cap.max_send_sge = rds_ibdev->max_sge;
attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
......@@ -420,6 +443,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
attr.qp_type = IB_QPT_RC;
attr.send_cq = ic->i_send_cq;
attr.recv_cq = ic->i_recv_cq;
atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
/*
* XXX this can fail if max_*_wr is too large? Are we supposed
......@@ -739,7 +763,8 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
*/
wait_event(rds_ib_ring_empty_wait,
rds_ib_ring_empty(&ic->i_recv_ring) &&
(atomic_read(&ic->i_signaled_sends) == 0));
(atomic_read(&ic->i_signaled_sends) == 0) &&
(atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
tasklet_kill(&ic->i_send_tasklet);
tasklet_kill(&ic->i_recv_tasklet);
......
/*
* Copyright (c) 2016 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#include "ib_mr.h"
/*
 * Get an FMR-backed MR descriptor for a mapping of @npages pages.
 *
 * Requests of at most RDS_MR_8K_MSG_SIZE pages are served from the 8K pool,
 * everything else from the 1M pool.  Whatever rds_ib_try_reuse_ibmr() hands
 * back (non-NULL) is returned unchanged; otherwise a new descriptor and a
 * new hardware FMR are allocated.
 *
 * Returns the MR, or an ERR_PTR() on allocation failure.  The failure path
 * decrements pool->item_count -- presumably undoing a reservation taken by
 * rds_ib_try_reuse_ibmr(); confirm against that helper.
 */
struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages)
{
	struct rds_ib_mr_pool *pool;
	struct rds_ib_mr *ibmr;
	struct ib_fmr *hw_fmr;
	int err;

	pool = (npages <= RDS_MR_8K_MSG_SIZE) ? rds_ibdev->mr_8k_pool
					      : rds_ibdev->mr_1m_pool;

	ibmr = rds_ib_try_reuse_ibmr(pool);
	if (ibmr)
		return ibmr;

	ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL,
			    rdsibdev_to_node(rds_ibdev));
	if (!ibmr) {
		err = -ENOMEM;
		goto out_drop_count;
	}

	hw_fmr = ib_alloc_fmr(rds_ibdev->pd,
			      (IB_ACCESS_LOCAL_WRITE |
			       IB_ACCESS_REMOTE_READ |
			       IB_ACCESS_REMOTE_WRITE |
			       IB_ACCESS_REMOTE_ATOMIC),
			      &pool->fmr_attr);
	if (IS_ERR(hw_fmr)) {
		err = PTR_ERR(hw_fmr);
		pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, err);
		/* No hardware FMR exists yet, only the descriptor to free. */
		kfree(ibmr);
		goto out_drop_count;
	}

	ibmr->u.fmr.fmr = hw_fmr;
	ibmr->pool = pool;

	if (pool->pool_type == RDS_IB_MR_8K_POOL)
		rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
	else
		rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);

	return ibmr;

out_drop_count:
	atomic_dec(&pool->item_count);
	return ERR_PTR(err);
}
int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
struct scatterlist *sg, unsigned int nents)
{
struct ib_device *dev = rds_ibdev->dev;
struct rds_ib_fmr *fmr = &ibmr->u.fmr;
struct scatterlist *scat = sg;
u64 io_addr = 0;
u64 *dma_pages;
u32 len;
int page_cnt, sg_dma_len;
int i, j;
int ret;
sg_dma_len = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
if (unlikely(!sg_dma_len)) {
pr_warn("RDS/IB: %s failed!\n", __func__);
return -EBUSY;
}
len = 0;
page_cnt = 0;
for (i = 0; i < sg_dma_len; ++i) {
unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
if (dma_addr & ~PAGE_MASK) {
if (i > 0)
return -EINVAL;
else
++page_cnt;
}
if ((dma_addr + dma_len) & ~PAGE_MASK) {
if (i < sg_dma_len - 1)
return -EINVAL;
else
++page_cnt;
}
len += dma_len;
}
page_cnt += len >> PAGE_SHIFT;
if (page_cnt > ibmr->pool->fmr_attr.max_pages)
return -EINVAL;
dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
rdsibdev_to_node(rds_ibdev));
if (!dma_pages)
return -ENOMEM;
page_cnt = 0;
for (i = 0; i < sg_dma_len; ++i) {
unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
for (j = 0; j < dma_len; j += PAGE_SIZE)
dma_pages[page_cnt++] =
(dma_addr & PAGE_MASK) + j;
}
ret = ib_map_phys_fmr(fmr->fmr, dma_pages, page_cnt, io_addr);
if (ret)
goto out;
/* Success - we successfully remapped the MR, so we can
* safely tear down the old mapping.
*/
rds_ib_teardown_mr(ibmr);
ibmr->sg = scat;
ibmr->sg_len = nents;
ibmr->sg_dma_len = sg_dma_len;
ibmr->remap_count++;
if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
else
rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
ret = 0;
out:
kfree(dma_pages);
return ret;
}
/*
 * Allocate (or reuse) an FMR-backed MR and map @sg into it.
 *
 * @rds_ibdev: device to register the memory on
 * @sg:        scatterlist describing the buffer
 * @nents:     number of entries in @sg
 * @key:       on success, receives the rkey of the mapped FMR
 *
 * Returns the MR on success.  On failure an ERR_PTR() is returned and @key
 * is left untouched.  If the mapping step fails, the MR is released through
 * rds_ib_free_mr() first and the mapping error is propagated, so the caller
 * never receives a pointer to an MR it does not own.
 */
struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *rds_ibdev,
				 struct scatterlist *sg,
				 unsigned long nents,
				 u32 *key)
{
	struct rds_ib_mr *ibmr;
	int ret;

	ibmr = rds_ib_alloc_fmr(rds_ibdev, nents);
	if (IS_ERR(ibmr))
		return ibmr;

	ibmr->device = rds_ibdev;

	ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
	if (ret) {
		/* Release the MR and report the failure instead of
		 * returning the released pointer as if it were valid.
		 */
		rds_ib_free_mr(ibmr, 0);
		return ERR_PTR(ret);
	}

	*key = ibmr->u.fmr.fmr->rkey;
	return ibmr;
}
void rds_ib_unreg_fmr(struct list_head *list, unsigned int *nfreed,
unsigned long *unpinned, unsigned int goal)
{
struct rds_ib_mr *ibmr, *next;
struct rds_ib_fmr *fmr;
LIST_HEAD(fmr_list);
int ret = 0;
unsigned int freed = *nfreed;
/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
list_for_each_entry(ibmr, list, unmap_list) {
fmr = &ibmr->u.fmr;
list_add(&fmr->fmr->list, &fmr_list);
}
ret = ib_unmap_fmr(&fmr_list);
if (ret)
pr_warn("RDS/IB: FMR invalidation failed (err=%d)\n", ret);
/* Now we can destroy the DMA mapping and unpin any pages */
list_for_each_entry_safe(ibmr, next, list, unmap_list) {
fmr = &ibmr->u.fmr;
*unpinned += ibmr->sg_len;
__rds_ib_teardown_mr(ibmr);
if (freed < goal ||
ibmr->remap_count >= ibmr->pool->fmr_attr.max_maps) {
if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
else
rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
list_del(&ibmr->unmap_list);
ib_dealloc_fmr(fmr->fmr);
kfree(ibmr);
freed++;
}
}
*nfreed = freed;
}
/*
 * Queue a released FMR-backed MR on the right pool list: the drop list once
 * it has been remapped max_maps times or more, the free list otherwise.
 */
void rds_ib_free_fmr_list(struct rds_ib_mr *ibmr)
{
	struct rds_ib_mr_pool *pool = ibmr->pool;
	struct llist_head *target;

	target = (ibmr->remap_count >= pool->fmr_attr.max_maps) ?
			&pool->drop_list : &pool->free_list;

	llist_add(&ibmr->llnode, target);
}
This diff is collapsed.
/*
* Copyright (c) 2016 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
#ifndef _RDS_IB_MR_H
#define _RDS_IB_MR_H
#include <linux/kernel.h>
#include "rds.h"
#include "ib.h"
/* Default value of the rds_ib_mr_1m_pool_size module parameter. */
#define RDS_MR_1M_POOL_SIZE (8192 / 2)
/* NOTE(review): presumably the page limit of a 1M-pool MR -- confirm. */
#define RDS_MR_1M_MSG_SIZE 256
/* Requests of at most this many pages are served from the 8K pool. */
#define RDS_MR_8K_MSG_SIZE 2
/* Multiplier applied when sizing the 8K pool relative to the 1M pool. */
#define RDS_MR_8K_SCALE (256 / (RDS_MR_8K_MSG_SIZE + 1))
/* Default value of the rds_ib_mr_8k_pool_size module parameter. */
#define RDS_MR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2))
/* FMR-specific part of an RDS IB memory region (see rds_ib_mr.u). */
struct rds_ib_fmr {
/* Hardware FMR obtained from ib_alloc_fmr(). */
struct ib_fmr *fmr;
/* NOTE(review): not referenced by the FMR code visible here -- confirm use. */
u64 *dma;
};
/* State of a fast-registration MR, stored in rds_ib_frmr.fr_state. */
enum rds_ib_fr_state {
FRMR_IS_FREE, /* mr invalidated & ready for use */
FRMR_IS_INUSE, /* mr is in use or used & can be invalidated */
FRMR_IS_STALE, /* Stale MR and needs to be dropped */
};
/*
 * Fast-registration (FRMR) specific part of an RDS IB memory region
 * (see rds_ib_mr.u).  The code using these fields is not visible here, so
 * the per-field notes below are assumptions to be confirmed.
 */
struct rds_ib_frmr {
/* Underlying verbs MR. */
struct ib_mr *mr;
/* Current state; one of enum rds_ib_fr_state. */
enum rds_ib_fr_state fr_state;
/* NOTE(review): presumably flags an invalidation in flight -- confirm. */
bool fr_inv;
/* NOTE(review): presumably the work request posted for this MR -- confirm. */
struct ib_send_wr fr_wr;
/* NOTE(review): presumably number of DMA pages mapped -- confirm. */
unsigned int dma_npages;
/* NOTE(review): presumably total byte length of the mapped sg -- confirm. */
unsigned int sg_byte_len;
};
/*
 * One RDS IB memory region, backed by either an FMR or an FRMR.
 * This is stored as mr->r_trans_private.
 */
struct rds_ib_mr {
/* Device the MR was registered on. */
struct rds_ib_device *device;
/* Pool (8K or 1M) this MR belongs to. */
struct rds_ib_mr_pool *pool;
/* NOTE(review): presumably the connection used for FRMR work requests -- confirm. */
struct rds_ib_connection *ic;
/* Links the MR into one of the pool's lockless lists. */
struct llist_node llnode;
/* unmap_list is for freeing */
struct list_head unmap_list;
/* Number of times the MR has been mapped; compared against max_maps. */
unsigned int remap_count;
/* Scatterlist currently mapped, its entry count, and the DMA-mapped count. */
struct scatterlist *sg;
unsigned int sg_len;
int sg_dma_len;
/* Registration-method specific state. */
union {
struct rds_ib_fmr fmr;
struct rds_ib_frmr frmr;
} u;
};
/* Our own little MR pool */
struct rds_ib_mr_pool {
/* Pool flavour, e.g. RDS_IB_MR_8K_POOL; also selects which stats are bumped. */
unsigned int pool_type;
struct mutex flush_lock; /* serialize fmr invalidate */
struct delayed_work flush_worker; /* flush worker */
atomic_t item_count; /* total # of MRs */
atomic_t dirty_count; /* # of dirty MRs */
struct llist_head drop_list; /* MRs that have reached max_maps */
struct llist_head free_list; /* unused MRs */
struct llist_head clean_list; /* unused & unmapped MRs */
wait_queue_head_t flush_wait;
atomic_t free_pinned; /* memory pinned by free MRs */
unsigned long max_items;
unsigned long max_items_soft;
unsigned long max_free_pinned;
/* Attributes passed to ib_alloc_fmr(); max_pages and max_maps bound each MR. */
struct ib_fmr_attr fmr_attr;
/* NOTE(review): presumably mirrors rds_ib_device.use_fastreg -- confirm. */
bool use_fastreg;
};
/* Globals shared by the MR code; the pool sizes are read-only module parameters. */
extern struct workqueue_struct *rds_ib_mr_wq;
extern unsigned int rds_ib_mr_1m_pool_size;
extern unsigned int rds_ib_mr_8k_pool_size;
/* NOTE(review): no definition of prefer_frmr is visible here -- confirm it exists. */
extern bool prefer_frmr;
/* Pool management and the transport-facing MR interface. */
struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
int npages);
void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
struct rds_info_rdma_connection *iinfo);
void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
struct rds_sock *rs, u32 *key_ret);
void rds_ib_sync_mr(void *trans_private, int dir);
void rds_ib_free_mr(void *trans_private, int invalidate);
void rds_ib_flush_mrs(void);
int rds_ib_mr_init(void);
void rds_ib_mr_exit(void);
/* Helpers shared between the registration back ends, plus the FMR back end. */
void __rds_ib_teardown_mr(struct rds_ib_mr *);
void rds_ib_teardown_mr(struct rds_ib_mr *);
struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *, int);
int rds_ib_map_fmr(struct rds_ib_device *, struct rds_ib_mr *,
struct scatterlist *, unsigned int);
struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *);
int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *, int, struct rds_ib_mr **);
struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *, struct scatterlist *,
unsigned long, u32 *);
struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *);
void rds_ib_unreg_fmr(struct list_head *, unsigned int *,
unsigned long *, unsigned int);
void rds_ib_free_fmr_list(struct rds_ib_mr *);
/* Fast-registration (FRMR) back end; mirrors the FMR entry points above. */
struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
struct rds_ib_connection *ic,
struct scatterlist *sg,
unsigned long nents, u32 *key);
void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
unsigned long *unpinned, unsigned int goal);
void rds_ib_free_frmr_list(struct rds_ib_mr *);
#endif
This diff is collapsed.
......@@ -195,7 +195,7 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
send->s_op = NULL;
send->s_wr.wr_id = i | RDS_IB_SEND_OP;
send->s_wr.wr_id = i;
send->s_wr.sg_list = send->s_sge;
send->s_wr.ex.imm_data = 0;
......@@ -263,9 +263,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
oldest = rds_ib_ring_oldest(&ic->i_send_ring);
completed = rds_ib_ring_completed(&ic->i_send_ring,
(wc->wr_id & ~RDS_IB_SEND_OP),
oldest);
completed = rds_ib_ring_completed(&ic->i_send_ring, wc->wr_id, oldest);
for (i = 0; i < completed; i++) {
send = &ic->i_sends[oldest];
......
......@@ -73,6 +73,8 @@ static const char *const rds_ib_stat_names[] = {
"ib_rdma_mr_1m_pool_flush",
"ib_rdma_mr_1m_pool_wait",
"ib_rdma_mr_1m_pool_depleted",
"ib_rdma_mr_8k_reused",
"ib_rdma_mr_1m_reused",
"ib_atomic_cswp",
"ib_atomic_fadd",
};
......
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/in.h>
#include <linux/if.h>
#include <linux/netdevice.h>
#include <linux/inetdevice.h>
#include <linux/if_arp.h>
#include <linux/delay.h>
#include <linux/slab.h>
#include <linux/module.h>
#include "rds.h"
#include "iw.h"
/* Read-only (0444) module parameters sizing the fastreg MR pool. */
unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE;
unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */
module_param(fastreg_pool_size, int, 0444);
MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device");
module_param(fastreg_message_size, int, 0444);
MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)");
/* All iWARP devices registered through rds_iw_add_one(). */
struct list_head rds_iw_devices;
/* NOTE: if also grabbing iwdev lock, grab this first */
DEFINE_SPINLOCK(iw_nodev_conns_lock);
LIST_HEAD(iw_nodev_conns);
static void rds_iw_add_one(struct ib_device *device)
{
struct rds_iw_device *rds_iwdev;
/* Only handle iwarp devices */
if (device->node_type != RDMA_NODE_RNIC)
return;
rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
if (!rds_iwdev)
return;
spin_lock_init(&rds_iwdev->spinlock);
rds_iwdev->dma_local_lkey = !!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
rds_iwdev->max_wrs = device->attrs.max_qp_wr;
rds_iwdev->max_sge = min(device->attrs.max_sge, RDS_IW_MAX_SGE);
rds_iwdev->dev = device;
rds_iwdev->pd = ib_alloc_pd(device);
if (IS_ERR(rds_iwdev->pd))
goto free_dev;
if (!rds_iwdev->dma_local_lkey) {
rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
IB_ACCESS_REMOTE_READ |
IB_ACCESS_REMOTE_WRITE |
IB_ACCESS_LOCAL_WRITE);
if (IS_ERR(rds_iwdev->mr))
goto err_pd;
} else
rds_iwdev->mr = NULL;
rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev);
if (IS_ERR(rds_iwdev->mr_pool)) {
rds_iwdev->mr_pool = NULL;
goto err_mr;
}
INIT_LIST_HEAD(&rds_iwdev->cm_id_list);
INIT_LIST_HEAD(&rds_iwdev->conn_list);
list_add_tail(&rds_iwdev->list, &rds_iw_devices);
ib_set_client_data(device, &rds_iw_client, rds_iwdev);
return;
err_mr:
if (rds_iwdev->mr)
ib_dereg_mr(rds_iwdev->mr);
err_pd:
ib_dealloc_pd(rds_iwdev->pd);
free_dev:
kfree(rds_iwdev);
}
/*
 * ib_client "remove" callback: release everything rds_iw_add_one() set up.
 *
 * Does nothing when no client data was registered for the device.
 * Otherwise frees every cm_id entry under the device spinlock, destroys the
 * device's connections, then releases the MR pool, the DMA MR (if any) and
 * the protection domain before unlinking and freeing the device structure.
 */
static void rds_iw_remove_one(struct ib_device *device, void *client_data)
{
	struct rds_iw_device *iwdev = client_data;
	struct rds_iw_cm_id *entry, *tmp;

	if (!iwdev)
		return;

	spin_lock_irq(&iwdev->spinlock);
	list_for_each_entry_safe(entry, tmp, &iwdev->cm_id_list, list) {
		list_del(&entry->list);
		kfree(entry);
	}
	spin_unlock_irq(&iwdev->spinlock);

	rds_iw_destroy_conns(iwdev);

	if (iwdev->mr_pool)
		rds_iw_destroy_mr_pool(iwdev->mr_pool);

	if (iwdev->mr)
		ib_dereg_mr(iwdev->mr);

	ib_dealloc_pd(iwdev->pd);

	list_del(&iwdev->list);
	kfree(iwdev);
}
/* IB client descriptor wiring the per-device add/remove callbacks. */
struct ib_client rds_iw_client = {
.name = "rds_iw",
.add = rds_iw_add_one,
.remove = rds_iw_remove_one
};
/*
 * Fill one rds_info_rdma_connection record for @conn.
 *
 * Connections that do not use the iWARP transport are skipped (returns 0).
 * For the others the addresses are always reported and the GIDs zeroed;
 * GIDs, ring sizes, max SGE and MR info are filled in only while the
 * connection is up.  Returns 1 when a record was produced.
 */
static int rds_iw_conn_info_visitor(struct rds_connection *conn,
				    void *buffer)
{
	struct rds_info_rdma_connection *info = buffer;
	struct rds_iw_connection *ic;
	struct rds_iw_device *iwdev;
	struct rdma_dev_addr *addr;

	/* Only connections on the iWARP transport are reported here */
	if (conn->c_trans != &rds_iw_transport)
		return 0;

	info->src_addr = conn->c_laddr;
	info->dst_addr = conn->c_faddr;

	memset(&info->src_gid, 0, sizeof(info->src_gid));
	memset(&info->dst_gid, 0, sizeof(info->dst_gid));

	if (rds_conn_state(conn) != RDS_CONN_UP)
		return 1;

	ic = conn->c_transport_data;
	addr = &ic->i_cm_id->route.addr.dev_addr;

	rdma_addr_get_sgid(addr, (union ib_gid *) &info->src_gid);
	rdma_addr_get_dgid(addr, (union ib_gid *) &info->dst_gid);

	iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
	info->max_send_wr = ic->i_send_ring.w_nr;
	info->max_recv_wr = ic->i_recv_ring.w_nr;
	info->max_send_sge = iwdev->max_sge;
	rds_iw_get_mr_info(iwdev, info);

	return 1;
}
/*
 * RDS info callback registered for RDS_INFO_IWARP_CONNECTIONS in
 * rds_iw_init().  It forwards all of its arguments to
 * rds_for_each_conn_info(), supplying rds_iw_conn_info_visitor as the
 * per-connection visitor and sizeof(struct rds_info_rdma_connection) as the
 * size of each record.
 */
static void rds_iw_ic_info(struct socket *sock, unsigned int len,
struct rds_info_iterator *iter,
struct rds_info_lengths *lens)
{
rds_for_each_conn_info(sock, len, iter, lens,
rds_iw_conn_info_visitor,
sizeof(struct rds_info_rdma_connection));
}
/*
* Early RDS/IB was built to only bind to an address if there is an IPoIB
* device with that address set.
*
* If it were me, I'd advocate for something more flexible. Sending and
* receiving should be device-agnostic. Transports would try and maintain
* connections between peers who have messages queued. Userspace would be
* allowed to influence which paths have priority. We could call userspace
* asserting this policy "routing".
*/
/*
 * Transport laddr_check hook: report whether @addr can be bound on an
 * iWARP (RNIC) device.
 *
 * A throw-away RDMA CM id is created and bound to @addr.  The bind alone
 * succeeds for both IB and iWARP capable NICs, so the node type of the
 * device the id ended up on is checked as well; without that check IB
 * devices would be claimed too.
 *
 * @net:  caller's network namespace (not consulted; the CM id is created
 *        in init_net).
 * @addr: IPv4 address to test, as a __be32.
 *
 * Returns 0 when the address is usable by this transport, -EADDRNOTAVAIL
 * when it is not, or the PTR_ERR() of rdma_create_id() if no CM id could
 * be created.
 */
static int rds_iw_laddr_check(struct net *net, __be32 addr)
{
	struct sockaddr_in sin;
	struct rdma_cm_id *cm_id;
	int ret;

	cm_id = rdma_create_id(&init_net, NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
	if (IS_ERR(cm_id))
		return PTR_ERR(cm_id);

	memset(&sin, 0, sizeof(sin));
	sin.sin_family = AF_INET;
	sin.sin_addr.s_addr = addr;

	/* rdma_bind_addr will only succeed for IB & iWARP devices */
	ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);

	/* Anything that is not a successfully bound RNIC is "not ours". */
	if (ret != 0)
		ret = -EADDRNOTAVAIL;
	else if (cm_id->device == NULL)
		ret = -EADDRNOTAVAIL;
	else if (cm_id->device->node_type != RDMA_NODE_RNIC)
		ret = -EADDRNOTAVAIL;

	rdsdebug("addr %pI4 ret %d node type %d\n",
		 &addr, ret,
		 cm_id->device ? cm_id->device->node_type : -1);

	rdma_destroy_id(cm_id);
	return ret;
}
/*
 * Tear down the RDS/iWARP transport.  It deregisters the
 * RDS_INFO_IWARP_CONNECTIONS info callback, destroys connections that have
 * no device (rds_iw_destroy_nodev_conns), unregisters the IB client, and
 * then shuts down the sysctl and receive-side state before unregistering
 * the transport itself.  Also installed as the .exit hook of
 * rds_iw_transport.
 */
void rds_iw_exit(void)
{
rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
rds_iw_destroy_nodev_conns();
ib_unregister_client(&rds_iw_client);
rds_iw_sysctl_exit();
rds_iw_recv_exit();
rds_trans_unregister(&rds_iw_transport);
}
/*
 * RDS transport operations table for iWARP.  rds_iw_init() registers it
 * with rds_trans_register() and rds_iw_exit() unregisters it.  The
 * transport is named "iwarp", has type RDS_TRANS_IWARP and sets
 * t_prefer_loopback.
 */
struct rds_transport rds_iw_transport = {
/* address check and data path */
.laddr_check = rds_iw_laddr_check,
.xmit_complete = rds_iw_xmit_complete,
.xmit = rds_iw_xmit,
.xmit_rdma = rds_iw_xmit_rdma,
.recv = rds_iw_recv,
/* connection lifecycle */
.conn_alloc = rds_iw_conn_alloc,
.conn_free = rds_iw_conn_free,
.conn_connect = rds_iw_conn_connect,
.conn_shutdown = rds_iw_conn_shutdown,
/* incoming message handling */
.inc_copy_to_user = rds_iw_inc_copy_to_user,
.inc_free = rds_iw_inc_free,
/* RDMA CM hooks */
.cm_initiate_connect = rds_iw_cm_initiate_connect,
.cm_handle_connect = rds_iw_cm_handle_connect,
.cm_connect_complete = rds_iw_cm_connect_complete,
/* statistics and module teardown */
.stats_info_copy = rds_iw_stats_info_copy,
.exit = rds_iw_exit,
/* memory registration */
.get_mr = rds_iw_get_mr,
.sync_mr = rds_iw_sync_mr,
.free_mr = rds_iw_free_mr,
.flush_mrs = rds_iw_flush_mrs,
/* identity */
.t_owner = THIS_MODULE,
.t_name = "iwarp",
.t_type = RDS_TRANS_IWARP,
.t_prefer_loopback = 1,
};
/*
 * Bring up the RDS/iWARP transport.
 *
 * Initialises the device list, then registers in order: the IB client, the
 * sysctl table, the receive-side state and the transport.  Once all of
 * those succeed the RDS_INFO_IWARP_CONNECTIONS info callback is installed.
 * If a step fails, the steps that already succeeded are undone in reverse
 * order.
 *
 * Returns 0 on success or the error code of the step that failed.
 */
int rds_iw_init(void)
{
	int ret;

	INIT_LIST_HEAD(&rds_iw_devices);

	ret = ib_register_client(&rds_iw_client);
	if (ret)
		return ret;

	ret = rds_iw_sysctl_init();
	if (ret)
		goto out_ibreg;

	ret = rds_iw_recv_init();
	if (ret)
		goto out_sysctl;

	ret = rds_trans_register(&rds_iw_transport);
	if (ret)
		goto out_recv;

	rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
	return ret;

out_recv:
	rds_iw_recv_exit();
out_sysctl:
	rds_iw_sysctl_exit();
out_ibreg:
	ib_unregister_client(&rds_iw_client);
	return ret;
}
/* Declares the license string for this module as "GPL". */
MODULE_LICENSE("GPL");
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include "rds.h"
#include "iw.h"
/*
* Locking for IB rings.
* We assume that allocation is always protected by a mutex
* in the caller (this is a valid assumption for the current
* implementation).
*
* Freeing always happens in an interrupt, and hence only
* races with allocations, but not with other free()s.
*
* The interaction between allocation and freeing is that
* the alloc code has to determine the number of free entries.
* To this end, we maintain two counters; an allocation counter
* and a free counter. Both are allowed to run freely, and wrap
* around.
* The number of used entries is always (alloc_ctr - free_ctr) % NR.
*
* The current implementation makes free_ctr atomic. When the
* caller finds an allocation fails, it should set an "alloc fail"
* bit and retry the allocation. The "alloc fail" bit essentially tells
* the CQ completion handlers to wake it up after freeing some
* more entries.
*/
/*
 * Wait queue that rds_iw_ring_free() wakes once a ring has no entries in
 * use and somebody is waiting on it.
 *
 * This only happens on shutdown.
 */
DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait);
/*
 * Reset a work ring to its initial state.
 *
 * @ring: ring to initialise; every field is zeroed first.
 * @nr:   number of entries the ring is to hold (stored in w_nr).
 */
void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr)
{
	memset(ring, 0, sizeof(*ring));
	ring->w_nr = nr;

	rdsdebug("ring %p nr %u\n", ring, nr);
}
/*
 * Number of ring entries currently in use: the allocation counter minus
 * the (atomic) free counter, computed in u32 so that counter wrap-around
 * cancels out.  Triggers BUG_ON() if the result exceeds the ring size.
 */
static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring)
{
	u32 in_use;

	/* This assumes that atomic_t has at least as many bits as u32 */
	in_use = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
	BUG_ON(in_use > ring->w_nr);

	return in_use;
}
/*
 * Change the number of entries in a ring.
 *
 * We only ever get called from the connection setup code, prior to
 * creating the QP, so the ring must be completely unused; BUG_ON()
 * enforces that.
 *
 * @ring: ring to resize.
 * @nr:   new number of entries.
 */
void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr)
{
	u32 in_use = __rds_iw_ring_used(ring);

	BUG_ON(in_use != 0);
	ring->w_nr = nr;
}
/* Returns 1 when no ring entries are in use, 0 otherwise. */
static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring)
{
	return !__rds_iw_ring_used(ring);
}
/*
 * Allocate up to @val consecutive entries from the ring.
 *
 * @ring: ring to allocate from.
 * @val:  number of entries wanted.
 * @pos:  on a non-zero return, receives the index of the first entry
 *        handed out; left untouched when nothing is allocated.
 *
 * Returns the number of entries actually allocated: the smaller of @val
 * and the number of free entries, or 0 if either of those is 0.
 */
u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos)
{
	u32 avail = ring->w_nr - __rds_iw_ring_used(ring);
	u32 granted;

	rdsdebug("ring %p val %u next %u free %u\n", ring, val,
		 ring->w_alloc_ptr, avail);

	if (val == 0 || avail == 0)
		return 0;

	granted = min(val, avail);
	*pos = ring->w_alloc_ptr;

	/* Advance the allocation pointer (wrapping) and the running counter. */
	ring->w_alloc_ptr = (ring->w_alloc_ptr + granted) % ring->w_nr;
	ring->w_alloc_ctr += granted;

	return granted;
}
/*
 * Return @val entries to the ring.
 *
 * Advances the free pointer (wrapping at w_nr), adds @val to the atomic
 * free counter and, if that leaves the ring empty while somebody is
 * sleeping on rds_iw_ring_empty_wait, wakes the waiter.
 */
void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val)
{
	ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
	atomic_add(val, &ring->w_free_ctr);

	if (!__rds_iw_ring_empty(ring))
		return;

	if (waitqueue_active(&rds_iw_ring_empty_wait))
		wake_up(&rds_iw_ring_empty_wait);
}
/*
 * Undo the allocation of the @val most recently allocated entries.
 *
 * @ring: ring the entries were allocated from.
 * @val:  number of entries to hand back.
 *
 * The allocation pointer is stepped backwards modulo w_nr.  The step is
 * done without ever forming a "negative" u32: a plain
 * (w_alloc_ptr - val) % w_nr wraps modulo 2^32 first whenever
 * val > w_alloc_ptr, which only yields the right slot if w_nr happens to
 * be a power of two.
 */
void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val)
{
	u32 back = val % ring->w_nr;
	u32 ptr = ring->w_alloc_ptr;

	if (ptr >= back)
		ptr -= back;
	else
		ptr += ring->w_nr - back;	/* step back across the wrap point */

	ring->w_alloc_ptr = ptr % ring->w_nr;
	ring->w_alloc_ctr -= val;
}
/* Public wrapper: returns 1 when the ring has no entries in use, else 0. */
int rds_iw_ring_empty(struct rds_iw_work_ring *ring)
{
	int empty = __rds_iw_ring_empty(ring);

	return empty;
}
int rds_iw_ring_low(struct rds_iw_work_ring *ring)
{
return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1);
}
/*
 * Returns the index of the oldest allocated ring entry, i.e. the current
 * free pointer.  This will be the next entry freed.  This can't be called
 * if there are none allocated.
 */
u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring)
{
	u32 oldest = ring->w_free_ptr;

	return oldest;
}
/*
 * Returns the number of completed work requests: the count of ring slots
 * from @oldest up to and including @wr_id, taking a wrap past the end of
 * the ring (w_nr) into account when @wr_id is below @oldest.
 */
u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest)
{
	u32 ret;

	if (wr_id >= oldest)
		ret = wr_id - oldest + 1;
	else
		ret = ring->w_nr - oldest + wr_id + 1;

	rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
		 wr_id, oldest);

	return ret;
}
This diff is collapsed.
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/percpu.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include "rds.h"
#include "iw.h"
/*
 * Per-CPU instance of the RDS/iWARP statistics counters.
 * rds_iw_stats_info_copy() sums the copies of all online CPUs.
 */
DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
/*
 * Names reported for the RDS/iWARP statistics.  rds_iw_stats_info_copy()
 * passes this table to rds_stats_info_copy() together with the summed
 * counters viewed as an array of uint64_t.
 * NOTE(review): presumably entry i names the i-th uint64_t counter of
 * struct rds_iw_statistics, so the order here must follow the struct
 * layout -- confirm against iw.h before reordering or adding entries.
 */
static const char *const rds_iw_stat_names[] = {
/* connection setup */
"iw_connect_raced",
"iw_listen_closed_stale",
/* transmit path */
"iw_tx_cq_call",
"iw_tx_cq_event",
"iw_tx_ring_full",
"iw_tx_throttle",
"iw_tx_sg_mapping_failure",
"iw_tx_stalled",
"iw_tx_credit_updates",
/* receive path */
"iw_rx_cq_call",
"iw_rx_cq_event",
"iw_rx_ring_empty",
"iw_rx_refill_from_cq",
"iw_rx_refill_from_thread",
"iw_rx_alloc_limit",
"iw_rx_credit_updates",
/* acknowledgements */
"iw_ack_sent",
"iw_ack_send_failure",
"iw_ack_send_delayed",
"iw_ack_send_piggybacked",
"iw_ack_received",
/* RDMA memory registration */
"iw_rdma_mr_alloc",
"iw_rdma_mr_free",
"iw_rdma_mr_used",
"iw_rdma_mr_pool_flush",
"iw_rdma_mr_pool_wait",
"iw_rdma_mr_pool_depleted",
};
/*
 * Transport stats_info_copy hook: sum the per-CPU RDS/iWARP counters over
 * all online CPUs and hand the totals, with their names, to
 * rds_stats_info_copy().
 *
 * @iter:  info iterator the totals are copied to.
 * @avail: number of statistics entries the caller has room for; if this is
 *         smaller than the number of names, nothing is copied.
 *
 * Returns the number of statistics this transport exposes
 * (ARRAY_SIZE(rds_iw_stat_names)), whether or not anything was copied.
 */
unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
				    unsigned int avail)
{
	struct rds_iw_statistics stats = {0, };
	uint64_t *total = (uint64_t *)&stats;
	const size_t nr_counters = sizeof(stats) / sizeof(uint64_t);
	int cpu;

	if (avail < ARRAY_SIZE(rds_iw_stat_names))
		return ARRAY_SIZE(rds_iw_stat_names);

	/* Treat each per-CPU struct as a flat array of u64 counters. */
	for_each_online_cpu(cpu) {
		uint64_t *percpu = (uint64_t *)&(per_cpu(rds_iw_stats, cpu));
		size_t i;

		for (i = 0; i < nr_counters; i++)
			total[i] += percpu[i];
	}

	rds_stats_info_copy(iter, total, rds_iw_stat_names,
			    ARRAY_SIZE(rds_iw_stat_names));

	return ARRAY_SIZE(rds_iw_stat_names);
}
/*
* Copyright (c) 2006 Oracle. All rights reserved.
*
* This software is available to you under a choice of one of two
* licenses. You may choose to be licensed under the terms of the GNU
* General Public License (GPL) Version 2, available from the file
* COPYING in the main directory of this source tree, or the
* OpenIB.org BSD license below:
*
* Redistribution and use in source and binary forms, with or
* without modification, are permitted provided that the following
* conditions are met:
*
* - Redistributions of source code must retain the above
* copyright notice, this list of conditions and the following
* disclaimer.
*
* - Redistributions in binary form must reproduce the above
* copyright notice, this list of conditions and the following
* disclaimer in the documentation and/or other materials
* provided with the distribution.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*/
#include <linux/kernel.h>
#include <linux/sysctl.h>
#include <linux/proc_fs.h>
#include "iw.h"
/* Handle returned by register_net_sysctl(); released in rds_iw_sysctl_exit(). */
static struct ctl_table_header *rds_iw_sysctl_hdr;
/* Tunables behind "max_send_wr" and "max_recv_wr", with their defaults. */
unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR;
unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR;
/* Tunable behind "max_recv_allocation"; default is 128 MiB divided by RDS_FRAG_SIZE. */
unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
/* Lower/upper bounds shared by the max_send_wr and max_recv_wr entries. */
static unsigned long rds_iw_sysctl_max_wr_min = 1;
/* hardware will fail CQ creation long before this */
static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0;
/* Tunable behind "max_unsignaled_wr" (default 16) and its 1..64 bounds. */
unsigned long rds_iw_sysctl_max_unsig_wrs = 16;
static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1;
static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64;
/* Tunable behind "max_unsignaled_bytes" (default 16 << 20) and its bounds. */
unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20);
static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1;
static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
/* Tunable behind "flow_control"; defaults to 1. */
unsigned int rds_iw_sysctl_flow_control = 1;
/*
 * sysctl entries registered under "net/rds/iw" by rds_iw_sysctl_init().
 * All entries are mode 0644.  The unsigned long tunables use
 * proc_doulongvec_minmax; where extra1/extra2 are set they point at the
 * minimum and maximum variables defined for that tunable.  The table ends
 * with an empty entry.
 */
static struct ctl_table rds_iw_sysctl_table[] = {
{
/* bounded by rds_iw_sysctl_max_wr_min / rds_iw_sysctl_max_wr_max */
.procname = "max_send_wr",
.data = &rds_iw_sysctl_max_send_wr,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
.extra1 = &rds_iw_sysctl_max_wr_min,
.extra2 = &rds_iw_sysctl_max_wr_max,
},
{
/* shares its bounds with max_send_wr */
.procname = "max_recv_wr",
.data = &rds_iw_sysctl_max_recv_wr,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
.extra1 = &rds_iw_sysctl_max_wr_min,
.extra2 = &rds_iw_sysctl_max_wr_max,
},
{
.procname = "max_unsignaled_wr",
.data = &rds_iw_sysctl_max_unsig_wrs,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
.extra1 = &rds_iw_sysctl_max_unsig_wr_min,
.extra2 = &rds_iw_sysctl_max_unsig_wr_max,
},
{
.procname = "max_unsignaled_bytes",
.data = &rds_iw_sysctl_max_unsig_bytes,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
.extra1 = &rds_iw_sysctl_max_unsig_bytes_min,
.extra2 = &rds_iw_sysctl_max_unsig_bytes_max,
},
{
/* no extra1/extra2 bounds are supplied for this entry */
.procname = "max_recv_allocation",
.data = &rds_iw_sysctl_max_recv_allocation,
.maxlen = sizeof(unsigned long),
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
{
/* the only unsigned int tunable; handled by proc_dointvec */
.procname = "flow_control",
.data = &rds_iw_sysctl_flow_control,
.maxlen = sizeof(rds_iw_sysctl_flow_control),
.mode = 0644,
.proc_handler = proc_dointvec,
},
{ }
};
/*
 * Unregister the sysctl table whose header rds_iw_sysctl_init() stored in
 * rds_iw_sysctl_hdr.
 */
void rds_iw_sysctl_exit(void)
{
unregister_net_sysctl_table(rds_iw_sysctl_hdr);
}
/*
 * Register rds_iw_sysctl_table under "net/rds/iw" in init_net and remember
 * the returned header in rds_iw_sysctl_hdr.
 *
 * Returns 0 on success, -ENOMEM if registration returned no header.
 */
int rds_iw_sysctl_init(void)
{
	rds_iw_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/iw", rds_iw_sysctl_table);

	return rds_iw_sysctl_hdr ? 0 : -ENOMEM;
}
......@@ -49,9 +49,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
event->event, rdma_event_msg(event->event));
if (cm_id->device->node_type == RDMA_NODE_RNIC)
trans = &rds_iw_transport;
else
if (cm_id->device->node_type == RDMA_NODE_IB_CA)
trans = &rds_ib_transport;
/* Prevent shutdown from tearing down the connection
......@@ -119,6 +117,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
rds_conn_drop(conn);
break;
case RDMA_CM_EVENT_TIMEWAIT_EXIT:
if (conn) {
pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n",
&conn->c_laddr, &conn->c_faddr);
rds_conn_drop(conn);
}
break;
default:
/* things like device disconnect? */
printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
......@@ -200,10 +206,6 @@ static int rds_rdma_init(void)
if (ret)
goto out;
ret = rds_iw_init();
if (ret)
goto err_iw_init;
ret = rds_ib_init();
if (ret)
goto err_ib_init;
......@@ -211,8 +213,6 @@ static int rds_rdma_init(void)
goto out;
err_ib_init:
rds_iw_exit();
err_iw_init:
rds_rdma_listen_stop();
out:
return ret;
......@@ -224,11 +224,10 @@ static void rds_rdma_exit(void)
/* stop listening first to ensure no new connections are attempted */
rds_rdma_listen_stop();
rds_ib_exit();
rds_iw_exit();
}
module_exit(rds_rdma_exit);
MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
MODULE_DESCRIPTION("RDS: IB/iWARP transport");
MODULE_DESCRIPTION("RDS: IB transport");
MODULE_LICENSE("Dual BSD/GPL");
......@@ -16,9 +16,4 @@ extern struct rds_transport rds_ib_transport;
int rds_ib_init(void);
void rds_ib_exit(void);
/* from iw.c */
extern struct rds_transport rds_iw_transport;
int rds_iw_init(void);
void rds_iw_exit(void);
#endif
......@@ -222,6 +222,7 @@ struct rds_incoming {
__be32 i_saddr;
rds_rdma_cookie_t i_rdma_cookie;
struct timeval i_rx_tstamp;
};
struct rds_mr {
......
......@@ -35,6 +35,8 @@
#include <net/sock.h>
#include <linux/in.h>
#include <linux/export.h>
#include <linux/time.h>
#include <linux/rds.h>
#include "rds.h"
......@@ -46,6 +48,8 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
inc->i_conn = conn;
inc->i_saddr = saddr;
inc->i_rdma_cookie = 0;
inc->i_rx_tstamp.tv_sec = 0;
inc->i_rx_tstamp.tv_usec = 0;
}
EXPORT_SYMBOL_GPL(rds_inc_init);
......@@ -228,6 +232,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
be32_to_cpu(inc->i_hdr.h_len),
inc->i_hdr.h_dport);
if (sock_flag(sk, SOCK_RCVTSTAMP))
do_gettimeofday(&inc->i_rx_tstamp);
rds_inc_addref(inc);
list_add_tail(&inc->i_item, &rs->rs_recv_queue);
__rds_wake_sk_sleep(sk);
......@@ -381,7 +387,8 @@ static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
/*
* Receive any control messages.
*/
static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
struct rds_sock *rs)
{
int ret = 0;
......@@ -392,6 +399,15 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
return ret;
}
if ((inc->i_rx_tstamp.tv_sec != 0) &&
sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) {
ret = put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
sizeof(struct timeval),
&inc->i_rx_tstamp);
if (ret)
return ret;
}
return 0;
}
......@@ -474,7 +490,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
msg->msg_flags |= MSG_TRUNC;
}
if (rds_cmsg_recv(inc, msg)) {
if (rds_cmsg_recv(inc, msg, rs)) {
ret = -EFAULT;
goto out;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment