Commit f38ba179, authored by Ursula Braun, committed by David S. Miller

smc: work request (WR) base for use by LLC and CDC

The base containers for RDMA transport are work requests and completion
queue entries processed through Infiniband verbs:
* allocate and initialize these areas
* map these areas to DMA
* implement the basic communication consisting of work request posting
  and receipt of completion queue events
Signed-off-by: Ursula Braun <ubraun@linux.vnet.ibm.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent cd6851f3
obj-$(CONFIG_SMC) += smc.o
smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o
smc-y := af_smc.o smc_pnet.o smc_ib.o smc_clc.o smc_core.o smc_wr.o
......@@ -12,6 +12,7 @@
#include <linux/socket.h>
#include <linux/types.h>
#include <linux/compiler.h> /* __aligned */
#include <net/sock.h>
#include "smc_ib.h"
......@@ -29,6 +30,10 @@ enum smc_state { /* possible states of an SMC socket */
struct smc_link_group;
/* Single-byte header shared by LLC and CDC messages. It is declared with
 * __aligned(1), so it places no alignment requirement on the buffer it
 * overlays.
 */
struct smc_wr_rx_hdr { /* common prefix part of LLC and CDC to demultiplex */
u8 type; /* message type used for demultiplexing */
} __aligned(1);
struct smc_connection {
struct rb_node alert_node;
struct smc_link_group *lgr; /* link group of connection */
......
......@@ -20,6 +20,7 @@
#include "smc_clc.h"
#include "smc_core.h"
#include "smc_ib.h"
#include "smc_wr.h"
#define SMC_LGR_FREE_DELAY (600 * HZ)
......@@ -161,12 +162,20 @@ static int smc_lgr_create(struct smc_sock *smc, __be32 peer_in_addr,
lnk->path_mtu = smcibdev->pattr[ibport - 1].active_mtu;
get_random_bytes(rndvec, sizeof(rndvec));
lnk->psn_initial = rndvec[0] + (rndvec[1] << 8) + (rndvec[2] << 16);
rc = smc_wr_alloc_link_mem(lnk);
if (rc)
goto free_lgr;
init_waitqueue_head(&lnk->wr_tx_wait);
smc->conn.lgr = lgr;
rwlock_init(&lgr->conns_lock);
spin_lock_bh(&smc_lgr_list.lock);
list_add(&lgr->list, &smc_lgr_list.list);
spin_unlock_bh(&smc_lgr_list.lock);
return 0;
free_lgr:
kfree(lgr);
out:
return rc;
}
......@@ -202,6 +211,8 @@ void smc_conn_free(struct smc_connection *conn)
/* Reset a link: forget the peer's QP number, then hand the link to the
 * smc_wr layer to release its work request state and memory.
 * NOTE(review): the smc_wr helpers are implemented in smc_wr.c, which is not
 * visible here; the call order is kept exactly as written.
 */
static void smc_link_clear(struct smc_link *lnk)
{
lnk->peer_qpn = 0;
smc_wr_free_link(lnk);
/* presumably the counterpart of smc_wr_alloc_link_mem() - confirm */
smc_wr_free_link_mem(lnk);
}
static void smc_lgr_free_sndbufs(struct smc_link_group *lgr)
......
......@@ -11,6 +11,7 @@
#ifndef _SMC_CORE_H
#define _SMC_CORE_H
#include <linux/atomic.h>
#include <rdma/ib_verbs.h>
#include "smc.h"
......@@ -30,11 +31,40 @@ enum smc_lgr_role { /* possible roles of a link group */
SMC_SERV /* server */
};
#define SMC_WR_BUF_SIZE 48 /* size of work request buffer */
/* One work request payload buffer: SMC_WR_BUF_SIZE raw bytes. */
struct smc_wr_buf {
u8 raw[SMC_WR_BUF_SIZE];
};
struct smc_link {
struct smc_ib_device *smcibdev; /* ib-device */
u8 ibport; /* port - values 1 | 2 */
struct ib_pd *roce_pd; /* IB protection domain,
* unique for every RoCE QP
*/
struct ib_qp *roce_qp; /* IB queue pair */
struct ib_qp_attr qp_attr; /* IB queue pair attributes */
struct smc_wr_buf *wr_tx_bufs; /* WR send payload buffers */
struct ib_send_wr *wr_tx_ibs; /* WR send meta data */
struct ib_sge *wr_tx_sges; /* WR send gather meta data */
struct smc_wr_tx_pend *wr_tx_pends; /* WR send waiting for CQE */
/* above four vectors have wr_tx_cnt elements and use the same index */
dma_addr_t wr_tx_dma_addr; /* DMA address of wr_tx_bufs */
atomic_long_t wr_tx_id; /* seq # of last sent WR */
unsigned long *wr_tx_mask; /* bit mask of used indexes */
u32 wr_tx_cnt; /* number of WR send buffers */
wait_queue_head_t wr_tx_wait; /* wait for free WR send buf */
struct smc_wr_buf *wr_rx_bufs; /* WR recv payload buffers */
struct ib_recv_wr *wr_rx_ibs; /* WR recv meta data */
struct ib_sge *wr_rx_sges; /* WR recv scatter meta data */
/* above three vectors have wr_rx_cnt elements and use the same index */
dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */
u64 wr_rx_id; /* seq # of last recv WR */
u32 wr_rx_cnt; /* number of WR recv buffers */
union ib_gid gid; /* gid matching used vlan id */
u32 peer_qpn; /* QP number of peer */
enum ib_mtu path_mtu; /* used mtu */
......
......@@ -17,6 +17,7 @@
#include "smc_pnet.h"
#include "smc_ib.h"
#include "smc_core.h"
#include "smc_wr.h"
#include "smc.h"
struct smc_ib_devices smc_ib_devices = { /* smc-registered ib devices */
......@@ -30,6 +31,78 @@ u8 local_systemid[SMC_SYSTEMID_LEN] = SMC_LOCAL_SYSTEMID_RESET; /* unique system
* identifier
*/
/* Release the IB protection domain of a link.
 *
 * smc_ib_create_protection_domain() leaves lnk->roce_pd NULL when the
 * allocation failed, so a link may reach teardown without a PD. A NULL
 * pointer is therefore skipped instead of being passed to ib_dealloc_pd().
 * lnk->roce_pd is always NULL on return, which makes repeated calls harmless.
 */
void smc_ib_dealloc_protection_domain(struct smc_link *lnk)
{
	if (lnk->roce_pd)
		ib_dealloc_pd(lnk->roce_pd);
	lnk->roce_pd = NULL;
}
/* Allocate an IB protection domain on the link's IB device.
 *
 * Returns 0 and stores the PD in lnk->roce_pd on success. On failure the
 * error code carried by the error pointer is returned and lnk->roce_pd is
 * left NULL, never an ERR_PTR value.
 */
int smc_ib_create_protection_domain(struct smc_link *lnk)
{
	struct ib_pd *pd = ib_alloc_pd(lnk->smcibdev->ibdev, 0);
	int rc = PTR_ERR_OR_ZERO(pd);

	lnk->roce_pd = IS_ERR(pd) ? NULL : pd;
	return rc;
}
/* Event callback installed as .event_handler of every queue pair created by
 * smc_ib_create_queue_pair(). It is currently a placeholder: the listed
 * events are recognised but no action is taken for any event.
 * priv is unused here (it is the qp_context set at QP creation).
 */
static void smc_ib_qp_event_handler(struct ib_event *ibevent, void *priv)
{
switch (ibevent->event) {
case IB_EVENT_DEVICE_FATAL:
case IB_EVENT_GID_CHANGE:
case IB_EVENT_PORT_ERR:
case IB_EVENT_QP_ACCESS_ERR:
/* tbd in follow-on patch:
* abnormal close of corresponding connections
*/
break;
default:
/* all other events are ignored */
break;
}
}
/* Destroy the IB queue pair of a link.
 *
 * smc_ib_create_queue_pair() leaves lnk->roce_qp NULL when QP creation
 * failed, so a NULL pointer is skipped instead of being passed to
 * ib_destroy_qp(). lnk->roce_qp is always NULL on return, which makes
 * repeated calls harmless.
 */
void smc_ib_destroy_queue_pair(struct smc_link *lnk)
{
	if (lnk->roce_qp)
		ib_destroy_qp(lnk->roce_qp);
	lnk->roce_qp = NULL;
}
/* create a queue pair within the protection domain for a link
 *
 * The QP is a reliable-connected QP (IB_QPT_RC) bound to the device-wide
 * send and receive completion queues; every send WR must request its own
 * completion (IB_SIGNAL_REQ_WR).
 *
 * Returns 0 on success, in which case lnk->roce_qp holds the QP and its
 * attributes are recorded via smc_wr_remember_qp_attr(). On failure the
 * error code carried by the error pointer is returned and lnk->roce_qp is
 * left NULL.
 */
int smc_ib_create_queue_pair(struct smc_link *lnk)
{
	struct ib_qp_init_attr qp_attr = {
		.event_handler = smc_ib_qp_event_handler,
		.qp_context = lnk,
		.send_cq = lnk->smcibdev->roce_cq_send,
		.recv_cq = lnk->smcibdev->roce_cq_recv,
		.srq = NULL,
		.cap = {
				/* include unsolicited rdma_writes as well,
				 * there are max. 2 RDMA_WRITE per 1 WR_SEND;
				 * RDMA writes are posted on the send queue, so
				 * the send queue is the one needing the headroom
				 */
			.max_send_wr = SMC_WR_BUF_CNT * 3,
				/* receive queue depth is kept as before */
			.max_recv_wr = SMC_WR_BUF_CNT * 3,
			.max_send_sge = SMC_IB_MAX_SEND_SGE,
			.max_recv_sge = 1,
			.max_inline_data = SMC_WR_TX_SIZE,
		},
		.sq_sig_type = IB_SIGNAL_REQ_WR,
		.qp_type = IB_QPT_RC,
	};
	int rc;

	lnk->roce_qp = ib_create_qp(lnk->roce_pd, &qp_attr);
	rc = PTR_ERR_OR_ZERO(lnk->roce_qp);
	if (IS_ERR(lnk->roce_qp))
		lnk->roce_qp = NULL;	/* never leave an ERR_PTR behind */
	else
		smc_wr_remember_qp_attr(lnk);
	return rc;
}
/* map a new TX or RX buffer to DMA */
int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
struct smc_buf_desc *buf_slot,
......
......@@ -16,6 +16,8 @@
#define SMC_MAX_PORTS 2 /* Max # of ports */
#define SMC_GID_SIZE sizeof(union ib_gid)
#define SMC_IB_MAX_SEND_SGE 2
struct smc_ib_devices { /* list of smc ib devices definition */
struct list_head list;
spinlock_t lock; /* protects list of smc ib devices */
......@@ -27,12 +29,17 @@ struct smc_ib_device { /* ib-device infos for smc */
struct list_head list;
struct ib_device *ibdev;
struct ib_port_attr pattr[SMC_MAX_PORTS]; /* ib dev. port attrs */
struct ib_cq *roce_cq_send; /* send completion queue */
struct ib_cq *roce_cq_recv; /* recv completion queue */
struct tasklet_struct send_tasklet; /* called by send cq handler */
struct tasklet_struct recv_tasklet; /* called by recv cq handler */
char mac[SMC_MAX_PORTS][6]; /* mac address per port*/
union ib_gid gid[SMC_MAX_PORTS]; /* gid per port */
u8 initialized : 1; /* ib dev CQ, evthdl done */
};
struct smc_buf_desc;
struct smc_link;
int smc_ib_register_client(void) __init;
void smc_ib_unregister_client(void);
......@@ -41,5 +48,9 @@ int smc_ib_remember_port_attr(struct smc_ib_device *smcibdev, u8 ibport);
int smc_ib_buf_map(struct smc_ib_device *smcibdev, int buf_size,
struct smc_buf_desc *buf_slot,
enum dma_data_direction data_direction);
/* protection domain of a link: release / allocate (0 or error code) */
void smc_ib_dealloc_protection_domain(struct smc_link *lnk);
int smc_ib_create_protection_domain(struct smc_link *lnk);
/* queue pair of a link: destroy / create (0 or error code) */
void smc_ib_destroy_queue_pair(struct smc_link *lnk);
int smc_ib_create_queue_pair(struct smc_link *lnk);
#endif
This diff is collapsed.
/*
* Shared Memory Communications over RDMA (SMC-R) and RoCE
*
* Work Requests exploiting Infiniband API
*
* Copyright IBM Corp. 2016
*
* Author(s): Steffen Maier <maier@linux.vnet.ibm.com>
*/
#ifndef SMC_WR_H
#define SMC_WR_H
#include <linux/atomic.h>
#include <rdma/ib_verbs.h>
#include <asm/div64.h>
#include "smc.h"
#include "smc_core.h"
#define SMC_WR_MAX_CQE 32768 /* max. # of completion queue elements */
#define SMC_WR_BUF_CNT 16 /* # of ctrl buffers per link */
#define SMC_WR_TX_WAIT_FREE_SLOT_TIME (10 * HZ)
#define SMC_WR_TX_WAIT_PENDING_TIME (5 * HZ)
#define SMC_WR_TX_SIZE 44 /* actual size of wr_send data (<=SMC_WR_BUF_SIZE) */
#define SMC_WR_TX_PEND_PRIV_SIZE 32
/* Opaque per-send private area of SMC_WR_TX_PEND_PRIV_SIZE bytes. A pointer
 * to it is returned by smc_wr_tx_get_free_slot() and is later passed to
 * smc_wr_tx_put_slot(), smc_wr_tx_send() and the smc_wr_tx_handler callback.
 */
struct smc_wr_tx_pend_priv {
u8 priv[SMC_WR_TX_PEND_PRIV_SIZE];
};
/* Callback type supplied to smc_wr_tx_get_free_slot(). It receives the
 * private area of the send, the link, and an IB work completion status.
 * NOTE(review): presumably invoked when the send completes - confirm in
 * smc_wr.c.
 */
typedef void (*smc_wr_tx_handler)(struct smc_wr_tx_pend_priv *,
struct smc_link *,
enum ib_wc_status);
/* Receive handler descriptor, registered with smc_wr_rx_register_handler(). */
struct smc_wr_rx_handler {
struct hlist_node list; /* hash table collision resolution */
/* callback taking the work completion and an untyped pointer
 * (NOTE(review): presumably the received buffer - confirm in smc_wr.c)
 */
void (*handler)(struct ib_wc *, void *);
/* presumably matched against smc_wr_rx_hdr.type - confirm */
u8 type;
};
/* Atomically advance the link's send WR sequence number and return the new
 * value.
 *
 * Only used by RDMA write WRs.
 * All other WRs (CDC/LLC) use smc_wr_tx_send handling WR_ID implicitly
 */
static inline long smc_wr_tx_get_next_wr_id(struct smc_link *link)
{
	long next_id = atomic_long_inc_return(&link->wr_tx_id);

	return next_id;
}
/* Atomically overwrite a send WR sequence counter with val. */
static inline void smc_wr_tx_set_wr_id(atomic_long_t *wr_tx_id, long val)
{
atomic_long_set(wr_tx_id, val);
}
/* post a new receive work request to fill a completed old work request entry
 *
 * The link's receive sequence number is incremented and used as the WR id;
 * the receive slot is chosen as (sequence number modulo wr_rx_cnt).
 * Returns the result of ib_post_recv().
 */
static inline int smc_wr_rx_post(struct smc_link *link)
{
	struct ib_recv_wr *bad_recv_wr = NULL;
	u64 seq, quotient;
	u32 slot;

	seq = ++link->wr_rx_id; /* tasklet context, thus not atomic */
	/* do_div() divides its first argument in place and yields the
	 * remainder, so work on a copy to keep seq intact
	 */
	quotient = seq;
	slot = do_div(quotient, link->wr_rx_cnt);
	link->wr_rx_ibs[slot].wr_id = seq;
	return ib_post_recv(link->roce_qp, &link->wr_rx_ibs[slot],
			    &bad_recv_wr);
}
/* Entry points of the work request layer.
 * NOTE(review): the implementations are in smc_wr.c, which is not visible
 * here; the grouping comments below are derived from names and signatures
 * only and should be confirmed against the implementation.
 */

/* per-link and per-device setup / teardown */
int smc_wr_create_link(struct smc_link *lnk);
int smc_wr_alloc_link_mem(struct smc_link *lnk);
void smc_wr_free_link(struct smc_link *lnk);
void smc_wr_free_link_mem(struct smc_link *lnk);
void smc_wr_remember_qp_attr(struct smc_link *lnk);
void smc_wr_remove_dev(struct smc_ib_device *smcibdev);
void smc_wr_add_dev(struct smc_ib_device *smcibdev);
/* send side: obtain a slot (buffer + private area), release or send it */
int smc_wr_tx_get_free_slot(struct smc_link *link, smc_wr_tx_handler handler,
struct smc_wr_buf **wr_buf,
struct smc_wr_tx_pend_priv **wr_pend_priv);
int smc_wr_tx_put_slot(struct smc_link *link,
struct smc_wr_tx_pend_priv *wr_pend_priv);
int smc_wr_tx_send(struct smc_link *link,
struct smc_wr_tx_pend_priv *wr_pend_priv);
void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
/* receive side: handler registration, posting, completion callback */
int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler);
int smc_wr_rx_post_init(struct smc_link *link);
void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context);
#endif /* SMC_WR_H */
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment