Commit 89ed5b51 authored by Neil Horman's avatar Neil Horman Committed by David S. Miller

af_packet: Block execution of tasks waiting for transmit to complete in AF_PACKET

When an application is run that:
a) Sets its scheduler to be SCHED_FIFO
and
b) Opens a memory mapped AF_PACKET socket, and sends frames with the
MSG_DONTWAIT flag cleared, its possible for the application to hang
forever in the kernel.  This occurs because when waiting, the code in
tpacket_snd calls schedule, which under normal circumstances allows
other tasks to run, including ksoftirqd, which in some cases is
responsible for freeing the transmitted skb (which in AF_PACKET calls a
destructor that flips the status bit of the transmitted frame back to
available, allowing the transmitting task to complete).

However, when the calling application is SCHED_FIFO, its priority is
such that the schedule call immediately places the task back on the cpu,
preventing ksoftirqd from freeing the skb, which in turn prevents the
transmitting task from detecting that the transmission is complete.

We can fix this by converting the schedule call to a completion
mechanism.  By using a completion queue, we force the calling task, when
it detects there are no more frames to send, to schedule itself off the
cpu until such time as the last transmitted skb is freed, allowing
forward progress to be made.

Tested by myself and the reporter, with good results

Change Notes:

V1->V2:
	Enhance the sleep logic to support being interruptible and
allowing for honoring to SK_SNDTIMEO (Willem de Bruijn)

V2->V3:
	Rearrage the point at which we wait for the completion queue, to
avoid needing to check for ph/skb being null at the end of the loop.
Also move the complete call to the skb destructor to avoid needing to
modify __packet_set_status.  Also gate calling complete on
packet_read_pending returning zero to avoid multiple calls to complete.
(Willem de Bruijn)

	Move timeo computation within loop, to re-fetch the socket
timeout since we also use the timeo variable to record the return code
from the wait_for_complete call (Neil Horman)

V3->V4:
	Willem has requested that the control flow be restored to the
previous state.  Doing so lets us eliminate the need for the
po->wait_on_complete flag variable, and lets us get rid of the
packet_next_frame function, but introduces another complexity.
Specifically, but using the packet pending count, we can, if an
applications calls sendmsg multiple times with MSG_DONTWAIT set, each
set of transmitted frames, when complete, will cause
tpacket_destruct_skb to issue a complete call, for which there will
never be a wait_on_completion call.  This imbalance will lead to any
future call to wait_for_completion here to return early, when the frames
they sent may not have completed.  To correct this, we need to re-init
the completion queue on every call to tpacket_snd before we enter the
loop so as to ensure we wait properly for the frames we send in this
iteration.

	Change the timeout and interrupted gotos to out_put rather than
out_status so that we don't try to free a non-existant skb
	Clean up some extra newlines (Willem de Bruijn)
Reviewed-by: default avatarWillem de Bruijn <willemb@google.com>
Signed-off-by: default avatarNeil Horman <nhorman@tuxdriver.com>
Reported-by: default avatarMatteo Croce <mcroce@redhat.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 25bff6d5
...@@ -2401,6 +2401,9 @@ static void tpacket_destruct_skb(struct sk_buff *skb) ...@@ -2401,6 +2401,9 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
ts = __packet_set_timestamp(po, ph, skb); ts = __packet_set_timestamp(po, ph, skb);
__packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts); __packet_set_status(po, ph, TP_STATUS_AVAILABLE | ts);
if (!packet_read_pending(&po->tx_ring))
complete(&po->skb_completion);
} }
sock_wfree(skb); sock_wfree(skb);
...@@ -2585,7 +2588,7 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame, ...@@ -2585,7 +2588,7 @@ static int tpacket_parse_header(struct packet_sock *po, void *frame,
static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{ {
struct sk_buff *skb; struct sk_buff *skb = NULL;
struct net_device *dev; struct net_device *dev;
struct virtio_net_hdr *vnet_hdr = NULL; struct virtio_net_hdr *vnet_hdr = NULL;
struct sockcm_cookie sockc; struct sockcm_cookie sockc;
...@@ -2600,6 +2603,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) ...@@ -2600,6 +2603,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
int len_sum = 0; int len_sum = 0;
int status = TP_STATUS_AVAILABLE; int status = TP_STATUS_AVAILABLE;
int hlen, tlen, copylen = 0; int hlen, tlen, copylen = 0;
long timeo = 0;
mutex_lock(&po->pg_vec_lock); mutex_lock(&po->pg_vec_lock);
...@@ -2646,12 +2650,21 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg) ...@@ -2646,12 +2650,21 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr) if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
size_max = dev->mtu + reserve + VLAN_HLEN; size_max = dev->mtu + reserve + VLAN_HLEN;
reinit_completion(&po->skb_completion);
do { do {
ph = packet_current_frame(po, &po->tx_ring, ph = packet_current_frame(po, &po->tx_ring,
TP_STATUS_SEND_REQUEST); TP_STATUS_SEND_REQUEST);
if (unlikely(ph == NULL)) { if (unlikely(ph == NULL)) {
if (need_wait && need_resched()) if (need_wait && skb) {
schedule(); timeo = sock_sndtimeo(&po->sk, msg->msg_flags & MSG_DONTWAIT);
timeo = wait_for_completion_interruptible_timeout(&po->skb_completion, timeo);
if (timeo <= 0) {
err = !timeo ? -ETIMEDOUT : -ERESTARTSYS;
goto out_put;
}
}
/* check for additional frames */
continue; continue;
} }
...@@ -3207,6 +3220,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol, ...@@ -3207,6 +3220,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
sock_init_data(sock, sk); sock_init_data(sock, sk);
po = pkt_sk(sk); po = pkt_sk(sk);
init_completion(&po->skb_completion);
sk->sk_family = PF_PACKET; sk->sk_family = PF_PACKET;
po->num = proto; po->num = proto;
po->xmit = dev_queue_xmit; po->xmit = dev_queue_xmit;
......
...@@ -128,6 +128,7 @@ struct packet_sock { ...@@ -128,6 +128,7 @@ struct packet_sock {
unsigned int tp_hdrlen; unsigned int tp_hdrlen;
unsigned int tp_reserve; unsigned int tp_reserve;
unsigned int tp_tstamp; unsigned int tp_tstamp;
struct completion skb_completion;
struct net_device __rcu *cached_dev; struct net_device __rcu *cached_dev;
int (*xmit)(struct sk_buff *skb); int (*xmit)(struct sk_buff *skb);
struct packet_type prot_hook ____cacheline_aligned_in_smp; struct packet_type prot_hook ____cacheline_aligned_in_smp;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment