Commit dd5a477c authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma

Pull rdma fixes from Doug Ledford:
 "Round three of 4.8 rc fixes.

  This is likely the last rdma pull request this cycle.  The new rxe
  driver had a few issues (you probably saw the boot bot bug report) and
  they should be addressed now.  There are a couple other fixes here,
  mainly mlx4.  There are still two outstanding issues that need
  resolved but I don't think their fix will make this kernel cycle.

  Summary:

   - Various fixes to rdmavt, ipoib, mlx5, mlx4, rxe"

* tag 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/dledford/rdma:
  IB/rdmavt: Don't vfree a kzalloc'ed memory region
  IB/rxe: Fix kmem_cache leak
  IB/rxe: Fix race condition between requester and completer
  IB/rxe: Fix duplicate atomic request handling
  IB/rxe: Fix kernel panic in udp_setup_tunnel
  IB/mlx5: Set source mac address in FTE
  IB/mlx5: Enable MAD_IFC commands for IB ports only
  IB/mlx4: Diagnostic HW counters are not supported in slave mode
  IB/mlx4: Use correct subnet-prefix in QP1 mads under SR-IOV
  IB/mlx4: Fix code indentation in QP1 MAD flow
  IB/mlx4: Fix incorrect MC join state bit-masking on SR-IOV
  IB/ipoib: Don't allow MC joins during light MC flush
  IB/rxe: fix GFP_KERNEL in spinlock context
parents 008f08d6 e4618d40
......@@ -1128,6 +1128,27 @@ void handle_port_mgmt_change_event(struct work_struct *work)
/* Generate GUID changed event */
if (changed_attr & MLX4_EQ_PORT_INFO_GID_PFX_CHANGE_MASK) {
if (mlx4_is_master(dev->dev)) {
union ib_gid gid;
int err = 0;
if (!eqe->event.port_mgmt_change.params.port_info.gid_prefix)
err = __mlx4_ib_query_gid(&dev->ib_dev, port, 0, &gid, 1);
else
gid.global.subnet_prefix =
eqe->event.port_mgmt_change.params.port_info.gid_prefix;
if (err) {
pr_warn("Could not change QP1 subnet prefix for port %d: query_gid error (%d)\n",
port, err);
} else {
pr_debug("Changing QP1 subnet prefix for port %d. old=0x%llx. new=0x%llx\n",
port,
(u64)atomic64_read(&dev->sriov.demux[port - 1].subnet_prefix),
be64_to_cpu(gid.global.subnet_prefix));
atomic64_set(&dev->sriov.demux[port - 1].subnet_prefix,
be64_to_cpu(gid.global.subnet_prefix));
}
}
mlx4_ib_dispatch_event(dev, port, IB_EVENT_GID_CHANGE);
/*if master, notify all slaves*/
if (mlx4_is_master(dev->dev))
......@@ -2202,6 +2223,8 @@ int mlx4_ib_init_sriov(struct mlx4_ib_dev *dev)
if (err)
goto demux_err;
dev->sriov.demux[i].guid_cache[0] = gid.global.interface_id;
atomic64_set(&dev->sriov.demux[i].subnet_prefix,
be64_to_cpu(gid.global.subnet_prefix));
err = alloc_pv_object(dev, mlx4_master_func_num(dev->dev), i + 1,
&dev->sriov.sqps[i]);
if (err)
......
......@@ -2202,6 +2202,9 @@ static int mlx4_ib_alloc_diag_counters(struct mlx4_ib_dev *ibdev)
bool per_port = !!(ibdev->dev->caps.flags2 &
MLX4_DEV_CAP_FLAG2_DIAG_PER_PORT);
if (mlx4_is_slave(ibdev->dev))
return 0;
for (i = 0; i < MLX4_DIAG_COUNTERS_TYPES; i++) {
/* i == 1 means we are building port counters */
if (i && !per_port)
......
......@@ -489,7 +489,7 @@ static u8 get_leave_state(struct mcast_group *group)
if (!group->members[i])
leave_state |= (1 << i);
return leave_state & (group->rec.scope_join_state & 7);
return leave_state & (group->rec.scope_join_state & 0xf);
}
static int join_group(struct mcast_group *group, int slave, u8 join_mask)
......@@ -564,8 +564,8 @@ static void mlx4_ib_mcg_timeout_handler(struct work_struct *work)
} else
mcg_warn_group(group, "DRIVER BUG\n");
} else if (group->state == MCAST_LEAVE_SENT) {
if (group->rec.scope_join_state & 7)
group->rec.scope_join_state &= 0xf8;
if (group->rec.scope_join_state & 0xf)
group->rec.scope_join_state &= 0xf0;
group->state = MCAST_IDLE;
mutex_unlock(&group->lock);
if (release_group(group, 1))
......@@ -605,7 +605,7 @@ static int handle_leave_req(struct mcast_group *group, u8 leave_mask,
static int handle_join_req(struct mcast_group *group, u8 join_mask,
struct mcast_req *req)
{
u8 group_join_state = group->rec.scope_join_state & 7;
u8 group_join_state = group->rec.scope_join_state & 0xf;
int ref = 0;
u16 status;
struct ib_sa_mcmember_data *sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
......@@ -690,8 +690,8 @@ static void mlx4_ib_mcg_work_handler(struct work_struct *work)
u8 cur_join_state;
resp_join_state = ((struct ib_sa_mcmember_data *)
group->response_sa_mad.data)->scope_join_state & 7;
cur_join_state = group->rec.scope_join_state & 7;
group->response_sa_mad.data)->scope_join_state & 0xf;
cur_join_state = group->rec.scope_join_state & 0xf;
if (method == IB_MGMT_METHOD_GET_RESP) {
/* successfull join */
......@@ -710,7 +710,7 @@ static void mlx4_ib_mcg_work_handler(struct work_struct *work)
req = list_first_entry(&group->pending_list, struct mcast_req,
group_list);
sa_data = (struct ib_sa_mcmember_data *)req->sa_mad.data;
req_join_state = sa_data->scope_join_state & 0x7;
req_join_state = sa_data->scope_join_state & 0xf;
/* For a leave request, we will immediately answer the VF, and
* update our internal counters. The actual leave will be sent
......
......@@ -448,7 +448,7 @@ struct mlx4_ib_demux_ctx {
struct workqueue_struct *wq;
struct workqueue_struct *ud_wq;
spinlock_t ud_lock;
__be64 subnet_prefix;
atomic64_t subnet_prefix;
__be64 guid_cache[128];
struct mlx4_ib_dev *dev;
/* the following lock protects both mcg_table and mcg_mgid0_list */
......
......@@ -2493,24 +2493,27 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr,
sqp->ud_header.grh.flow_label =
ah->av.ib.sl_tclass_flowlabel & cpu_to_be32(0xfffff);
sqp->ud_header.grh.hop_limit = ah->av.ib.hop_limit;
if (is_eth)
if (is_eth) {
memcpy(sqp->ud_header.grh.source_gid.raw, sgid.raw, 16);
else {
if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
/* When multi-function is enabled, the ib_core gid
* indexes don't necessarily match the hw ones, so
* we must use our own cache */
sqp->ud_header.grh.source_gid.global.subnet_prefix =
to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
subnet_prefix;
sqp->ud_header.grh.source_gid.global.interface_id =
to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
guid_cache[ah->av.ib.gid_index];
} else
ib_get_cached_gid(ib_dev,
be32_to_cpu(ah->av.ib.port_pd) >> 24,
ah->av.ib.gid_index,
&sqp->ud_header.grh.source_gid, NULL);
} else {
if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) {
/* When multi-function is enabled, the ib_core gid
* indexes don't necessarily match the hw ones, so
* we must use our own cache
*/
sqp->ud_header.grh.source_gid.global.subnet_prefix =
cpu_to_be64(atomic64_read(&(to_mdev(ib_dev)->sriov.
demux[sqp->qp.port - 1].
subnet_prefix)));
sqp->ud_header.grh.source_gid.global.interface_id =
to_mdev(ib_dev)->sriov.demux[sqp->qp.port - 1].
guid_cache[ah->av.ib.gid_index];
} else {
ib_get_cached_gid(ib_dev,
be32_to_cpu(ah->av.ib.port_pd) >> 24,
ah->av.ib.gid_index,
&sqp->ud_header.grh.source_gid, NULL);
}
}
memcpy(sqp->ud_header.grh.destination_gid.raw,
ah->av.ib.dgid, 16);
......
......@@ -288,7 +288,9 @@ __be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num,
static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev)
{
return !MLX5_CAP_GEN(dev->mdev, ib_virt);
if (MLX5_CAP_GEN(dev->mdev, port_type) == MLX5_CAP_PORT_TYPE_IB)
return !MLX5_CAP_GEN(dev->mdev, ib_virt);
return 0;
}
enum {
......@@ -1428,6 +1430,13 @@ static int parse_flow_attr(u32 *match_c, u32 *match_v,
dmac_47_16),
ib_spec->eth.val.dst_mac);
ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_c,
smac_47_16),
ib_spec->eth.mask.src_mac);
ether_addr_copy(MLX5_ADDR_OF(fte_match_set_lyr_2_4, outer_headers_v,
smac_47_16),
ib_spec->eth.val.src_mac);
if (ib_spec->eth.mask.vlan_tag) {
MLX5_SET(fte_match_set_lyr_2_4, outer_headers_c,
vlan_tag, 1);
......
......@@ -294,7 +294,7 @@ static void __rvt_free_mr(struct rvt_mr *mr)
{
rvt_deinit_mregion(&mr->mr);
rvt_free_lkey(&mr->mr);
vfree(mr);
kfree(mr);
}
/**
......
......@@ -362,15 +362,34 @@ static int __init rxe_module_init(void)
return err;
}
err = rxe_net_init();
err = rxe_net_ipv4_init();
if (err) {
pr_err("rxe: unable to init\n");
pr_err("rxe: unable to init ipv4 tunnel\n");
rxe_cache_exit();
return err;
goto exit;
}
err = rxe_net_ipv6_init();
if (err) {
pr_err("rxe: unable to init ipv6 tunnel\n");
rxe_cache_exit();
goto exit;
}
err = register_netdevice_notifier(&rxe_net_notifier);
if (err) {
pr_err("rxe: Failed to rigister netdev notifier\n");
goto exit;
}
pr_info("rxe: loaded\n");
return 0;
exit:
rxe_release_udp_tunnel(recv_sockets.sk4);
rxe_release_udp_tunnel(recv_sockets.sk6);
return err;
}
static void __exit rxe_module_exit(void)
......
......@@ -689,7 +689,14 @@ int rxe_completer(void *arg)
qp->req.need_retry = 1;
rxe_run_task(&qp->req.task, 1);
}
if (pkt) {
rxe_drop_ref(pkt->qp);
kfree_skb(skb);
}
goto exit;
} else {
wqe->status = IB_WC_RETRY_EXC_ERR;
state = COMPST_ERROR;
......@@ -716,6 +723,12 @@ int rxe_completer(void *arg)
case COMPST_ERROR:
do_complete(qp, wqe);
rxe_qp_error(qp);
if (pkt) {
rxe_drop_ref(pkt->qp);
kfree_skb(skb);
}
goto exit;
}
}
......
......@@ -275,9 +275,10 @@ static struct socket *rxe_setup_udp_tunnel(struct net *net, __be16 port,
return sock;
}
static void rxe_release_udp_tunnel(struct socket *sk)
void rxe_release_udp_tunnel(struct socket *sk)
{
udp_tunnel_sock_release(sk);
if (sk)
udp_tunnel_sock_release(sk);
}
static void prepare_udp_hdr(struct sk_buff *skb, __be16 src_port,
......@@ -658,51 +659,45 @@ static int rxe_notify(struct notifier_block *not_blk,
return NOTIFY_OK;
}
static struct notifier_block rxe_net_notifier = {
struct notifier_block rxe_net_notifier = {
.notifier_call = rxe_notify,
};
int rxe_net_init(void)
int rxe_net_ipv4_init(void)
{
int err;
spin_lock_init(&dev_list_lock);
recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net,
htons(ROCE_V2_UDP_DPORT), true);
if (IS_ERR(recv_sockets.sk6)) {
recv_sockets.sk6 = NULL;
pr_err("rxe: Failed to create IPv6 UDP tunnel\n");
return -1;
}
recv_sockets.sk4 = rxe_setup_udp_tunnel(&init_net,
htons(ROCE_V2_UDP_DPORT), false);
htons(ROCE_V2_UDP_DPORT), false);
if (IS_ERR(recv_sockets.sk4)) {
rxe_release_udp_tunnel(recv_sockets.sk6);
recv_sockets.sk4 = NULL;
recv_sockets.sk6 = NULL;
pr_err("rxe: Failed to create IPv4 UDP tunnel\n");
return -1;
}
err = register_netdevice_notifier(&rxe_net_notifier);
if (err) {
rxe_release_udp_tunnel(recv_sockets.sk6);
rxe_release_udp_tunnel(recv_sockets.sk4);
pr_err("rxe: Failed to rigister netdev notifier\n");
}
return err;
return 0;
}
void rxe_net_exit(void)
int rxe_net_ipv6_init(void)
{
if (recv_sockets.sk6)
rxe_release_udp_tunnel(recv_sockets.sk6);
#if IS_ENABLED(CONFIG_IPV6)
if (recv_sockets.sk4)
rxe_release_udp_tunnel(recv_sockets.sk4);
spin_lock_init(&dev_list_lock);
recv_sockets.sk6 = rxe_setup_udp_tunnel(&init_net,
htons(ROCE_V2_UDP_DPORT), true);
if (IS_ERR(recv_sockets.sk6)) {
recv_sockets.sk6 = NULL;
pr_err("rxe: Failed to create IPv6 UDP tunnel\n");
return -1;
}
#endif
return 0;
}
void rxe_net_exit(void)
{
rxe_release_udp_tunnel(recv_sockets.sk6);
rxe_release_udp_tunnel(recv_sockets.sk4);
unregister_netdevice_notifier(&rxe_net_notifier);
}
......@@ -44,10 +44,13 @@ struct rxe_recv_sockets {
};
extern struct rxe_recv_sockets recv_sockets;
extern struct notifier_block rxe_net_notifier;
void rxe_release_udp_tunnel(struct socket *sk);
struct rxe_dev *rxe_net_add(struct net_device *ndev);
int rxe_net_init(void);
int rxe_net_ipv4_init(void);
int rxe_net_ipv6_init(void);
void rxe_net_exit(void);
#endif /* RXE_NET_H */
......@@ -312,7 +312,7 @@ static void rxe_rcv_mcast_pkt(struct rxe_dev *rxe, struct sk_buff *skb)
* make a copy of the skb to post to the next qp
*/
skb_copy = (mce->qp_list.next != &mcg->qp_list) ?
skb_clone(skb, GFP_KERNEL) : NULL;
skb_clone(skb, GFP_ATOMIC) : NULL;
pkt->qp = qp;
rxe_add_ref(qp);
......
......@@ -511,24 +511,21 @@ static int fill_packet(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
}
static void update_wqe_state(struct rxe_qp *qp,
struct rxe_send_wqe *wqe,
struct rxe_pkt_info *pkt,
enum wqe_state *prev_state)
struct rxe_send_wqe *wqe,
struct rxe_pkt_info *pkt)
{
enum wqe_state prev_state_ = wqe->state;
if (pkt->mask & RXE_END_MASK) {
if (qp_type(qp) == IB_QPT_RC)
wqe->state = wqe_state_pending;
} else {
wqe->state = wqe_state_processing;
}
*prev_state = prev_state_;
}
static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
struct rxe_pkt_info *pkt, int payload)
static void update_wqe_psn(struct rxe_qp *qp,
struct rxe_send_wqe *wqe,
struct rxe_pkt_info *pkt,
int payload)
{
/* number of packets left to send including current one */
int num_pkt = (wqe->dma.resid + payload + qp->mtu - 1) / qp->mtu;
......@@ -546,9 +543,34 @@ static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
qp->req.psn = (wqe->first_psn + num_pkt) & BTH_PSN_MASK;
else
qp->req.psn = (qp->req.psn + 1) & BTH_PSN_MASK;
}
qp->req.opcode = pkt->opcode;
static void save_state(struct rxe_send_wqe *wqe,
struct rxe_qp *qp,
struct rxe_send_wqe *rollback_wqe,
struct rxe_qp *rollback_qp)
{
rollback_wqe->state = wqe->state;
rollback_wqe->first_psn = wqe->first_psn;
rollback_wqe->last_psn = wqe->last_psn;
rollback_qp->req.psn = qp->req.psn;
}
static void rollback_state(struct rxe_send_wqe *wqe,
struct rxe_qp *qp,
struct rxe_send_wqe *rollback_wqe,
struct rxe_qp *rollback_qp)
{
wqe->state = rollback_wqe->state;
wqe->first_psn = rollback_wqe->first_psn;
wqe->last_psn = rollback_wqe->last_psn;
qp->req.psn = rollback_qp->req.psn;
}
static void update_state(struct rxe_qp *qp, struct rxe_send_wqe *wqe,
struct rxe_pkt_info *pkt, int payload)
{
qp->req.opcode = pkt->opcode;
if (pkt->mask & RXE_END_MASK)
qp->req.wqe_index = next_index(qp->sq.queue, qp->req.wqe_index);
......@@ -571,7 +593,8 @@ int rxe_requester(void *arg)
int mtu;
int opcode;
int ret;
enum wqe_state prev_state;
struct rxe_qp rollback_qp;
struct rxe_send_wqe rollback_wqe;
next_wqe:
if (unlikely(!qp->valid || qp->req.state == QP_STATE_ERROR))
......@@ -688,13 +711,21 @@ int rxe_requester(void *arg)
goto err;
}
update_wqe_state(qp, wqe, &pkt, &prev_state);
/*
* To prevent a race on wqe access between requester and completer,
* wqe members state and psn need to be set before calling
* rxe_xmit_packet().
* Otherwise, completer might initiate an unjustified retry flow.
*/
save_state(wqe, qp, &rollback_wqe, &rollback_qp);
update_wqe_state(qp, wqe, &pkt);
update_wqe_psn(qp, wqe, &pkt, payload);
ret = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp, &pkt, skb);
if (ret) {
qp->need_req_skb = 1;
kfree_skb(skb);
wqe->state = prev_state;
rollback_state(wqe, qp, &rollback_wqe, &rollback_qp);
if (ret == -EAGAIN) {
rxe_run_task(&qp->req.task, 1);
......
......@@ -972,11 +972,13 @@ static int send_atomic_ack(struct rxe_qp *qp, struct rxe_pkt_info *pkt,
free_rd_atomic_resource(qp, res);
rxe_advance_resp_resource(qp);
memcpy(SKB_TO_PKT(skb), &ack_pkt, sizeof(skb->cb));
res->type = RXE_ATOMIC_MASK;
res->atomic.skb = skb;
res->first_psn = qp->resp.psn;
res->last_psn = qp->resp.psn;
res->cur_psn = qp->resp.psn;
res->first_psn = ack_pkt.psn;
res->last_psn = ack_pkt.psn;
res->cur_psn = ack_pkt.psn;
rc = rxe_xmit_packet(rxe, qp, &ack_pkt, skb_copy);
if (rc) {
......@@ -1116,8 +1118,7 @@ static enum resp_states duplicate_request(struct rxe_qp *qp,
rc = RESPST_CLEANUP;
goto out;
}
bth_set_psn(SKB_TO_PKT(skb_copy),
qp->resp.psn - 1);
/* Resend the result. */
rc = rxe_xmit_packet(to_rdev(qp->ibqp.device), qp,
pkt, skb_copy);
......
......@@ -1161,8 +1161,17 @@ static void __ipoib_ib_dev_flush(struct ipoib_dev_priv *priv,
}
if (level == IPOIB_FLUSH_LIGHT) {
int oper_up;
ipoib_mark_paths_invalid(dev);
/* Set IPoIB operation as down to prevent races between:
* the flush flow which leaves MCG and on the fly joins
* which can happen during that time. mcast restart task
* should deal with join requests we missed.
*/
oper_up = test_and_clear_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
ipoib_mcast_dev_flush(dev);
if (oper_up)
set_bit(IPOIB_FLAG_OPER_UP, &priv->flags);
ipoib_flush_ah(dev);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment