Commit 00483690 authored by Jon Maxwell's avatar Jon Maxwell Committed by David S. Miller

tcp: Add mark for TIMEWAIT sockets

This version has some suggestions by Eric Dumazet:

- Use a local variable for the mark in IPv6 instead of ctl_sk to avoid SMP
races.
- Use the more elegant "IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark"
statement.
- Factorize code as sk_fullsock() check is not necessary.

Aidan McGurn from Openwave Mobility systems reported the following bug:

"Marked routing is broken on customer deployment. Its effects are large
increase in Uplink retransmissions caused by the client never receiving
the final ACK to their FINACK - this ACK misses the mark and routes out
of the incorrect route."

Currently marks are added to sk_buffs for replies when the "fwmark_reflect"
sysctl is enabled. But not for TW sockets that had sk->sk_mark set via
setsockopt(SO_MARK..).

Fix this in IPv4/v6 by adding tw->tw_mark for TIME_WAIT sockets. Copy the the
original sk->sk_mark in __inet_twsk_hashdance() to the new tw->tw_mark location.
Then progate this so that the skb gets sent with the correct mark. Do the same
for resets. Give the "fwmark_reflect" sysctl precedence over sk->sk_mark so that
netfilter rules are still honored.
Signed-off-by: default avatarJon Maxwell <jmaxwell37@gmail.com>
Reviewed-by: default avatarEric Dumazet <edumazet@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 03bdfc00
...@@ -62,6 +62,7 @@ struct inet_timewait_sock { ...@@ -62,6 +62,7 @@ struct inet_timewait_sock {
#define tw_dr __tw_common.skc_tw_dr #define tw_dr __tw_common.skc_tw_dr
int tw_timeout; int tw_timeout;
__u32 tw_mark;
volatile unsigned char tw_substate; volatile unsigned char tw_substate;
unsigned char tw_rcv_wscale; unsigned char tw_rcv_wscale;
......
...@@ -1561,7 +1561,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, ...@@ -1561,7 +1561,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
oif = skb->skb_iif; oif = skb->skb_iif;
flowi4_init_output(&fl4, oif, flowi4_init_output(&fl4, oif,
IP4_REPLY_MARK(net, skb->mark), IP4_REPLY_MARK(net, skb->mark) ?: sk->sk_mark,
RT_TOS(arg->tos), RT_TOS(arg->tos),
RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol, RT_SCOPE_UNIVERSE, ip_hdr(skb)->protocol,
ip_reply_arg_flowi_flags(arg), ip_reply_arg_flowi_flags(arg),
......
...@@ -621,6 +621,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) ...@@ -621,6 +621,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
struct sock *sk1 = NULL; struct sock *sk1 = NULL;
#endif #endif
struct net *net; struct net *net;
struct sock *ctl_sk;
/* Never send a reset in response to a reset. */ /* Never send a reset in response to a reset. */
if (th->rst) if (th->rst)
...@@ -723,11 +724,16 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) ...@@ -723,11 +724,16 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
arg.tos = ip_hdr(skb)->tos; arg.tos = ip_hdr(skb)->tos;
arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); arg.uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
local_bh_disable(); local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
if (sk)
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt, skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len); &arg, arg.iov[0].iov_len);
ctl_sk->sk_mark = 0;
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS); __TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
local_bh_enable(); local_bh_enable();
...@@ -759,6 +765,7 @@ static void tcp_v4_send_ack(const struct sock *sk, ...@@ -759,6 +765,7 @@ static void tcp_v4_send_ack(const struct sock *sk,
} rep; } rep;
struct net *net = sock_net(sk); struct net *net = sock_net(sk);
struct ip_reply_arg arg; struct ip_reply_arg arg;
struct sock *ctl_sk;
memset(&rep.th, 0, sizeof(struct tcphdr)); memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg)); memset(&arg, 0, sizeof(arg));
...@@ -809,11 +816,16 @@ static void tcp_v4_send_ack(const struct sock *sk, ...@@ -809,11 +816,16 @@ static void tcp_v4_send_ack(const struct sock *sk,
arg.tos = tos; arg.tos = tos;
arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL); arg.uid = sock_net_uid(net, sk_fullsock(sk) ? sk : NULL);
local_bh_disable(); local_bh_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk), ctl_sk = *this_cpu_ptr(net->ipv4.tcp_sk);
if (sk)
ctl_sk->sk_mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
ip_send_unicast_reply(ctl_sk,
skb, &TCP_SKB_CB(skb)->header.h4.opt, skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len); &arg, arg.iov[0].iov_len);
ctl_sk->sk_mark = 0;
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS); __TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
local_bh_enable(); local_bh_enable();
} }
......
...@@ -263,6 +263,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) ...@@ -263,6 +263,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
struct inet_sock *inet = inet_sk(sk); struct inet_sock *inet = inet_sk(sk);
tw->tw_transparent = inet->transparent; tw->tw_transparent = inet->transparent;
tw->tw_mark = sk->sk_mark;
tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale;
tcptw->tw_rcv_nxt = tp->rcv_nxt; tcptw->tw_rcv_nxt = tp->rcv_nxt;
tcptw->tw_snd_nxt = tp->snd_nxt; tcptw->tw_snd_nxt = tp->snd_nxt;
......
...@@ -803,6 +803,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 ...@@ -803,6 +803,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
unsigned int tot_len = sizeof(struct tcphdr); unsigned int tot_len = sizeof(struct tcphdr);
struct dst_entry *dst; struct dst_entry *dst;
__be32 *topt; __be32 *topt;
__u32 mark = 0;
if (tsecr) if (tsecr)
tot_len += TCPOLEN_TSTAMP_ALIGNED; tot_len += TCPOLEN_TSTAMP_ALIGNED;
...@@ -871,7 +872,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 ...@@ -871,7 +872,10 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
fl6.flowi6_oif = oif; fl6.flowi6_oif = oif;
} }
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); if (sk)
mark = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_mark : sk->sk_mark;
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
fl6.fl6_dport = t1->dest; fl6.fl6_dport = t1->dest;
fl6.fl6_sport = t1->source; fl6.fl6_sport = t1->source;
fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL); fl6.flowi6_uid = sock_net_uid(net, sk && sk_fullsock(sk) ? sk : NULL);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment