Commit a88eb6be authored by David S. Miller

Merge branch 'tipc-link-starvation'

Jon Maloy says:

====================
tipc: improve interaction socket-link

We fix a very real starvation problem that may occur when a link
encounters send buffer congestion. At the same time we make the
interaction between the socket and link layer simpler and more
consistent.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents aa276dd7 365ad353
......@@ -174,7 +174,7 @@ static void tipc_bcbase_xmit(struct net *net, struct sk_buff_head *xmitq)
* and to identified node local sockets
* @net: the applicable net namespace
* @list: chain of buffers containing message
* Consumes the buffer chain, except when returning -ELINKCONG
* Consumes the buffer chain.
* Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
*/
int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
......@@ -197,7 +197,7 @@ int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
tipc_bcast_unlock(net);
/* Don't send to local node if adding to link failed */
if (unlikely(rc)) {
if (unlikely(rc && (rc != -ELINKCONG))) {
__skb_queue_purge(&rcvq);
return rc;
}
......@@ -206,7 +206,7 @@ int tipc_bcast_xmit(struct net *net, struct sk_buff_head *list)
tipc_bcbase_xmit(net, &xmitq);
tipc_sk_mcast_rcv(net, &rcvq, &inputq);
__skb_queue_purge(list);
return 0;
return rc;
}
/* tipc_bcast_rcv - receive a broadcast packet, and deliver to rcv link
......
......@@ -776,60 +776,47 @@ int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq)
/**
* link_schedule_user - schedule a message sender for wakeup after congestion
* @link: congested link
* @list: message that was attempted sent
* @l: congested link
* @hdr: header of message that is being sent
* Create pseudo msg to send back to user when congestion abates
* Does not consume buffer list
*/
static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
static int link_schedule_user(struct tipc_link *l, struct tipc_msg *hdr)
{
struct tipc_msg *msg = buf_msg(skb_peek(list));
int imp = msg_importance(msg);
u32 oport = msg_origport(msg);
u32 addr = tipc_own_addr(link->net);
u32 dnode = tipc_own_addr(l->net);
u32 dport = msg_origport(hdr);
struct sk_buff *skb;
/* This really cannot happen... */
if (unlikely(imp > TIPC_CRITICAL_IMPORTANCE)) {
pr_warn("%s<%s>, send queue full", link_rst_msg, link->name);
return -ENOBUFS;
}
/* Non-blocking sender: */
if (TIPC_SKB_CB(skb_peek(list))->wakeup_pending)
return -ELINKCONG;
/* Create and schedule wakeup pseudo message */
skb = tipc_msg_create(SOCK_WAKEUP, 0, INT_H_SIZE, 0,
addr, addr, oport, 0, 0);
dnode, l->addr, dport, 0, 0);
if (!skb)
return -ENOBUFS;
TIPC_SKB_CB(skb)->chain_sz = skb_queue_len(list);
TIPC_SKB_CB(skb)->chain_imp = imp;
skb_queue_tail(&link->wakeupq, skb);
link->stats.link_congs++;
msg_set_dest_droppable(buf_msg(skb), true);
TIPC_SKB_CB(skb)->chain_imp = msg_importance(hdr);
skb_queue_tail(&l->wakeupq, skb);
l->stats.link_congs++;
return -ELINKCONG;
}
/**
* link_prepare_wakeup - prepare users for wakeup after congestion
* @link: congested link
* Move a number of waiting users, as permitted by available space in
* the send queue, from link wait queue to node wait queue for wakeup
* @l: congested link
* Wake up a number of waiting users, as permitted by available space
* in the send queue
*/
void link_prepare_wakeup(struct tipc_link *l)
{
int pnd[TIPC_SYSTEM_IMPORTANCE + 1] = {0,};
int imp, lim;
struct sk_buff *skb, *tmp;
int imp, i = 0;
skb_queue_walk_safe(&l->wakeupq, skb, tmp) {
imp = TIPC_SKB_CB(skb)->chain_imp;
lim = l->backlog[imp].limit;
pnd[imp] += TIPC_SKB_CB(skb)->chain_sz;
if ((pnd[imp] + l->backlog[imp].len) >= lim)
if (l->backlog[imp].len < l->backlog[imp].limit) {
skb_unlink(skb, &l->wakeupq);
skb_queue_tail(l->inputq, skb);
} else if (i++ > 10) {
break;
skb_unlink(skb, &l->wakeupq);
skb_queue_tail(l->inputq, skb);
}
}
}
......@@ -869,8 +856,7 @@ void tipc_link_reset(struct tipc_link *l)
* @list: chain of buffers containing message
* @xmitq: returned list of packets to be sent by caller
*
* Consumes the buffer chain, except when returning -ELINKCONG,
* since the caller then may want to make more send attempts.
* Consumes the buffer chain.
* Returns 0 if success, or errno: -ELINKCONG, -EMSGSIZE or -ENOBUFS
* Messages at TIPC_SYSTEM_IMPORTANCE are always accepted
*/
......@@ -879,7 +865,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
{
struct tipc_msg *hdr = buf_msg(skb_peek(list));
unsigned int maxwin = l->window;
unsigned int i, imp = msg_importance(hdr);
int imp = msg_importance(hdr);
unsigned int mtu = l->mtu;
u16 ack = l->rcv_nxt - 1;
u16 seqno = l->snd_nxt;
......@@ -888,19 +874,22 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
struct sk_buff_head *backlogq = &l->backlogq;
struct sk_buff *skb, *_skb, *bskb;
int pkt_cnt = skb_queue_len(list);
int rc = 0;
/* Match msg importance against this and all higher backlog limits: */
if (!skb_queue_empty(backlogq)) {
for (i = imp; i <= TIPC_SYSTEM_IMPORTANCE; i++) {
if (unlikely(l->backlog[i].len >= l->backlog[i].limit))
return link_schedule_user(l, list);
}
}
if (unlikely(msg_size(hdr) > mtu)) {
skb_queue_purge(list);
return -EMSGSIZE;
}
/* Allow oversubscription of one data msg per source at congestion */
if (unlikely(l->backlog[imp].len >= l->backlog[imp].limit)) {
if (imp == TIPC_SYSTEM_IMPORTANCE) {
pr_warn("%s<%s>, link overflow", link_rst_msg, l->name);
return -ENOBUFS;
}
rc = link_schedule_user(l, hdr);
}
if (pkt_cnt > 1) {
l->stats.sent_fragmented++;
l->stats.sent_fragments += pkt_cnt;
......@@ -946,7 +935,7 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
skb_queue_splice_tail_init(list, backlogq);
}
l->snd_nxt = seqno;
return 0;
return rc;
}
void tipc_link_advance_backlog(struct tipc_link *l, struct sk_buff_head *xmitq)
......
......@@ -98,8 +98,6 @@ struct tipc_skb_cb {
u32 bytes_read;
struct sk_buff *tail;
bool validated;
bool wakeup_pending;
u16 chain_sz;
u16 chain_imp;
u16 ackers;
};
......
......@@ -608,7 +608,7 @@ u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance,
* Returns non-zero if any off-node ports overlap
*/
int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
u32 limit, struct tipc_plist *dports)
u32 limit, struct list_head *dports)
{
struct name_seq *seq;
struct sub_seq *sseq;
......@@ -633,7 +633,7 @@ int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
info = sseq->info;
list_for_each_entry(publ, &info->node_list, node_list) {
if (publ->scope <= limit)
tipc_plist_push(dports, publ->ref);
u32_push(dports, publ->ref);
}
if (info->cluster_list_size != info->node_list_size)
......@@ -1022,40 +1022,84 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
void tipc_plist_push(struct tipc_plist *pl, u32 port)
/* struct u32_item - one entry in a linked list of u32 values
 * @list: linkage into the list whose head is passed to the u32_*() helpers
 * @value: the u32 value carried by this entry
 */
struct u32_item {
struct list_head list;
u32 value;
};
bool u32_find(struct list_head *l, u32 value)
{
struct tipc_plist *nl;
struct u32_item *item;
if (likely(!pl->port)) {
pl->port = port;
return;
list_for_each_entry(item, l, list) {
if (item->value == value)
return true;
}
if (pl->port == port)
return;
list_for_each_entry(nl, &pl->list, list) {
if (nl->port == port)
return;
return false;
}
/**
 * u32_push - add a value to a list of u32 items unless already present
 * @l: head of the list
 * @value: value to add
 *
 * The new item is allocated with GFP_ATOMIC and inserted with list_add().
 * Returns true if a new item was added. Returns false both when @value is
 * already in the list and when the allocation fails, so callers cannot
 * tell those two cases apart from the return value alone.
 */
bool u32_push(struct list_head *l, u32 value)
{
	struct u32_item *cur;
	struct u32_item *entry;

	/* Refuse duplicates: each value is stored at most once */
	list_for_each_entry(cur, l, list)
		if (cur->value == value)
			return false;

	entry = kmalloc(sizeof(*entry), GFP_ATOMIC);
	if (unlikely(!entry))
		return false;

	entry->value = value;
	list_add(&entry->list, l);

	return true;
}
u32 u32_pop(struct list_head *l)
{
struct u32_item *item;
u32 value = 0;
if (list_empty(l))
return 0;
item = list_first_entry(l, typeof(*item), list);
value = item->value;
list_del(&item->list);
kfree(item);
return value;
}
bool u32_del(struct list_head *l, u32 value)
{
struct u32_item *item, *tmp;
list_for_each_entry_safe(item, tmp, l, list) {
if (item->value != value)
continue;
list_del(&item->list);
kfree(item);
return true;
}
nl = kmalloc(sizeof(*nl), GFP_ATOMIC);
if (nl) {
nl->port = port;
list_add(&nl->list, &pl->list);
return false;
}
/**
 * u32_list_purge - unlink and free every item of a u32 list
 * @l: head of the list
 *
 * Uses the _safe iterator because each item is freed while walking.
 */
void u32_list_purge(struct list_head *l)
{
	struct u32_item *cur, *next;

	list_for_each_entry_safe(cur, next, l, list) {
		list_del(&cur->list);
		kfree(cur);
	}
}
u32 tipc_plist_pop(struct tipc_plist *pl)
int u32_list_len(struct list_head *l)
{
struct tipc_plist *nl;
u32 port = 0;
struct u32_item *item;
int i = 0;
if (likely(list_empty(&pl->list))) {
port = pl->port;
pl->port = 0;
return port;
list_for_each_entry(item, l, list) {
i++;
}
nl = list_first_entry(&pl->list, typeof(*nl), list);
port = nl->port;
list_del(&nl->list);
kfree(nl);
return port;
return i;
}
......@@ -99,7 +99,7 @@ int tipc_nl_name_table_dump(struct sk_buff *skb, struct netlink_callback *cb);
u32 tipc_nametbl_translate(struct net *net, u32 type, u32 instance, u32 *node);
int tipc_nametbl_mc_translate(struct net *net, u32 type, u32 lower, u32 upper,
u32 limit, struct tipc_plist *dports);
u32 limit, struct list_head *dports);
struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
u32 upper, u32 scope, u32 port_ref,
u32 key);
......@@ -116,18 +116,11 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s);
int tipc_nametbl_init(struct net *net);
void tipc_nametbl_stop(struct net *net);
struct tipc_plist {
struct list_head list;
u32 port;
};
static inline void tipc_plist_init(struct tipc_plist *pl)
{
INIT_LIST_HEAD(&pl->list);
pl->port = 0;
}
void tipc_plist_push(struct tipc_plist *pl, u32 port);
u32 tipc_plist_pop(struct tipc_plist *pl);
bool u32_push(struct list_head *l, u32 value);
u32 u32_pop(struct list_head *l);
bool u32_find(struct list_head *l, u32 value);
bool u32_del(struct list_head *l, u32 value);
void u32_list_purge(struct list_head *l);
int u32_list_len(struct list_head *l);
#endif
......@@ -1167,7 +1167,7 @@ static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node)
* @list: chain of buffers containing message
* @dnode: address of destination node
* @selector: a number used for deterministic link selection
* Consumes the buffer chain, except when returning -ELINKCONG
* Consumes the buffer chain.
* Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUFS
*/
int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
......@@ -1206,10 +1206,10 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
spin_unlock_bh(&le->lock);
tipc_node_read_unlock(n);
if (likely(rc == 0))
tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
else if (rc == -ENOBUFS)
if (unlikely(rc == -ENOBUFS))
tipc_node_link_down(n, bearer_id, false);
else
tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
tipc_node_put(n);
......@@ -1221,20 +1221,15 @@ int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
* messages, which will not be rejected
* The only exception is datagram messages rerouted after secondary
* lookup, which are rare and safe to dispose of anyway.
* TODO: Return real return value, and let callers use
* tipc_wait_for_sendpkt() where applicable
*/
int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
u32 selector)
{
struct sk_buff_head head;
int rc;
skb_queue_head_init(&head);
__skb_queue_tail(&head, skb);
rc = tipc_node_xmit(net, &head, dnode, selector);
if (rc == -ELINKCONG)
kfree_skb(skb);
tipc_node_xmit(net, &head, dnode, selector);
return 0;
}
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment