Commit c3fc7ac9 authored by David S. Miller

Merge branch 'tcp-lockless-listener'

Eric Dumazet says:

====================
tcp/dccp: lockless listener

TCP listener refactoring: this is becoming interesting!

This patch series takes the steps needed to use the normal TCP/DCCP ehash
table to store SYN_RECV requests, instead of the private per-listener
hash table we had until now.

SYNACK skbs are now attached to their SYN_RECV request socket,
so that we no longer heavily modify the listener's sk_wmem_alloc.

The listener lock is no longer held in the fast path, including
in SYNCOOKIE mode.

During my tests, my server was able to process 3,500,000
SYN packets per second on one listener and still had available
cpu cycles.

That is about 2 to 3 orders of magnitude more than what we had with older kernels.

This effort started two years ago, and I am pleased to see it meet expectations.

We'll probably extend SO_REUSEPORT to add proper CPU/NUMA affinities,
so that heavy-duty TCP servers can get proper siloing thanks to multi-queue
NICs.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents f6d3125f e994b2f0
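
A quick way to see the accounting change the diff below makes to struct
request_sock_queue: the per-listener listen_sock counters, which needed the
listener lock, become plain atomic counters. The following is a minimal
user-space sketch of that idea (an illustration only, not the kernel code;
the standalone main() and user-space types are assumptions made so the
snippet compiles on its own):

/* Sketch: lock-free request accounting with atomics, modelled on the
 * new reqsk_queue_added()/reqsk_queue_removed()/reqsk_queue_len()
 * helpers in include/net/request_sock.h (simplified, user space).
 */
#include <stdatomic.h>
#include <stdio.h>

struct request_sock_queue {
	atomic_int qlen;   /* total SYN_RECV requests */
	atomic_int young;  /* requests whose SYNACK was never retransmitted */
};

static void reqsk_queue_added(struct request_sock_queue *q)
{
	atomic_fetch_add(&q->young, 1);
	atomic_fetch_add(&q->qlen, 1);
}

static void reqsk_queue_removed(struct request_sock_queue *q, int num_timeout)
{
	if (num_timeout == 0)
		atomic_fetch_sub(&q->young, 1);
	atomic_fetch_sub(&q->qlen, 1);
}

static int reqsk_queue_len(const struct request_sock_queue *q)
{
	return atomic_load(&q->qlen);
}

int main(void)
{
	struct request_sock_queue q = { 0 };

	reqsk_queue_added(&q);      /* a SYN created a request */
	reqsk_queue_removed(&q, 0); /* the request completed or was dropped */
	printf("qlen=%d young=%d\n", reqsk_queue_len(&q), atomic_load(&q.young));
	return 0;
}

Because the counters are atomic, adding or dropping a request can happen from
the request timer and the receive path without taking the listener lock,
which is what lets the SYN fast path go lockless.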
...@@ -28,15 +28,6 @@ int inet6_csk_bind_conflict(const struct sock *sk, ...@@ -28,15 +28,6 @@ int inet6_csk_bind_conflict(const struct sock *sk,
struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6, struct dst_entry *inet6_csk_route_req(const struct sock *sk, struct flowi6 *fl6,
const struct request_sock *req, u8 proto); const struct request_sock *req, u8 proto);
struct request_sock *inet6_csk_search_req(struct sock *sk,
const __be16 rport,
const struct in6_addr *raddr,
const struct in6_addr *laddr,
const int iif);
void inet6_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
const unsigned long timeout);
void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr);
int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl); int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl);
......
...@@ -258,10 +258,6 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk, ...@@ -258,10 +258,6 @@ inet_csk_rto_backoff(const struct inet_connection_sock *icsk,
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err); struct sock *inet_csk_accept(struct sock *sk, int flags, int *err);
struct request_sock *inet_csk_search_req(struct sock *sk,
const __be16 rport,
const __be32 raddr,
const __be32 laddr);
int inet_csk_bind_conflict(const struct sock *sk, int inet_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb, bool relax); const struct inet_bind_bucket *tb, bool relax);
int inet_csk_get_port(struct sock *sk, unsigned short snum); int inet_csk_get_port(struct sock *sk, unsigned short snum);
...@@ -282,8 +278,7 @@ static inline void inet_csk_reqsk_queue_add(struct sock *sk, ...@@ -282,8 +278,7 @@ static inline void inet_csk_reqsk_queue_add(struct sock *sk,
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
unsigned long timeout); unsigned long timeout);
static inline void inet_csk_reqsk_queue_added(struct sock *sk, static inline void inet_csk_reqsk_queue_added(struct sock *sk)
const unsigned long timeout)
{ {
reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue); reqsk_queue_added(&inet_csk(sk)->icsk_accept_queue);
} }
...@@ -300,7 +295,7 @@ static inline int inet_csk_reqsk_queue_young(const struct sock *sk) ...@@ -300,7 +295,7 @@ static inline int inet_csk_reqsk_queue_young(const struct sock *sk)
static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk) static inline int inet_csk_reqsk_queue_is_full(const struct sock *sk)
{ {
return reqsk_queue_is_full(&inet_csk(sk)->icsk_accept_queue); return inet_csk_reqsk_queue_len(sk) >= sk->sk_max_ack_backlog;
} }
void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req); void inet_csk_reqsk_queue_drop(struct sock *sk, struct request_sock *req);
......
...@@ -205,6 +205,7 @@ void inet_put_port(struct sock *sk); ...@@ -205,6 +205,7 @@ void inet_put_port(struct sock *sk);
void inet_hashinfo_init(struct inet_hashinfo *h); void inet_hashinfo_init(struct inet_hashinfo *h);
int inet_ehash_insert(struct sock *sk, struct sock *osk);
void __inet_hash_nolisten(struct sock *sk, struct sock *osk); void __inet_hash_nolisten(struct sock *sk, struct sock *osk);
void __inet_hash(struct sock *sk, struct sock *osk); void __inet_hash(struct sock *sk, struct sock *osk);
void inet_hash(struct sock *sk); void inet_hash(struct sock *sk);
......
...@@ -69,6 +69,16 @@ struct request_sock { ...@@ -69,6 +69,16 @@ struct request_sock {
u32 peer_secid; u32 peer_secid;
}; };
static inline struct request_sock *inet_reqsk(struct sock *sk)
{
return (struct request_sock *)sk;
}
static inline struct sock *req_to_sk(struct request_sock *req)
{
return (struct sock *)req;
}
static inline struct request_sock * static inline struct request_sock *
reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener) reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener)
{ {
...@@ -78,6 +88,8 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener) ...@@ -78,6 +88,8 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener)
req->rsk_ops = ops; req->rsk_ops = ops;
sock_hold(sk_listener); sock_hold(sk_listener);
req->rsk_listener = sk_listener; req->rsk_listener = sk_listener;
req_to_sk(req)->sk_prot = sk_listener->sk_prot;
sk_node_init(&req_to_sk(req)->sk_node);
req->saved_syn = NULL; req->saved_syn = NULL;
/* Following is temporary. It is coupled with debugging /* Following is temporary. It is coupled with debugging
* helpers in reqsk_put() & reqsk_free() * helpers in reqsk_put() & reqsk_free()
...@@ -87,16 +99,6 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener) ...@@ -87,16 +99,6 @@ reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener)
return req; return req;
} }
static inline struct request_sock *inet_reqsk(struct sock *sk)
{
return (struct request_sock *)sk;
}
static inline struct sock *req_to_sk(struct request_sock *req)
{
return (struct sock *)req;
}
static inline void reqsk_free(struct request_sock *req) static inline void reqsk_free(struct request_sock *req)
{ {
/* temporary debugging */ /* temporary debugging */
...@@ -117,25 +119,6 @@ static inline void reqsk_put(struct request_sock *req) ...@@ -117,25 +119,6 @@ static inline void reqsk_put(struct request_sock *req)
extern int sysctl_max_syn_backlog; extern int sysctl_max_syn_backlog;
/** struct listen_sock - listen state
*
* @max_qlen_log - log_2 of maximal queued SYNs/REQUESTs
*/
struct listen_sock {
int qlen_inc; /* protected by listener lock */
int young_inc;/* protected by listener lock */
/* following fields can be updated by timer */
atomic_t qlen_dec; /* qlen = qlen_inc - qlen_dec */
atomic_t young_dec;
u32 max_qlen_log ____cacheline_aligned_in_smp;
u32 synflood_warned;
u32 hash_rnd;
u32 nr_table_entries;
struct request_sock *syn_table[0];
};
/* /*
* For a TCP Fast Open listener - * For a TCP Fast Open listener -
* lock - protects the access to all the reqsk, which is co-owned by * lock - protects the access to all the reqsk, which is co-owned by
...@@ -169,43 +152,29 @@ struct fastopen_queue { ...@@ -169,43 +152,29 @@ struct fastopen_queue {
* @rskq_accept_head - FIFO head of established children * @rskq_accept_head - FIFO head of established children
* @rskq_accept_tail - FIFO tail of established children * @rskq_accept_tail - FIFO tail of established children
* @rskq_defer_accept - User waits for some data after accept() * @rskq_defer_accept - User waits for some data after accept()
* @syn_wait_lock - serializer
*
* %syn_wait_lock is necessary only to avoid proc interface having to grab the main
* lock sock while browsing the listening hash (otherwise it's deadlock prone).
* *
*/ */
struct request_sock_queue { struct request_sock_queue {
spinlock_t rskq_lock;
u8 rskq_defer_accept;
u32 synflood_warned;
atomic_t qlen;
atomic_t young;
struct request_sock *rskq_accept_head; struct request_sock *rskq_accept_head;
struct request_sock *rskq_accept_tail; struct request_sock *rskq_accept_tail;
u8 rskq_defer_accept;
struct listen_sock *listen_opt;
struct fastopen_queue fastopenq; /* Check max_qlen != 0 to determine struct fastopen_queue fastopenq; /* Check max_qlen != 0 to determine
* if TFO is enabled. * if TFO is enabled.
*/ */
/* temporary alignment, our goal is to get rid of this lock */
spinlock_t syn_wait_lock ____cacheline_aligned_in_smp;
}; };
int reqsk_queue_alloc(struct request_sock_queue *queue, void reqsk_queue_alloc(struct request_sock_queue *queue);
unsigned int nr_table_entries);
void __reqsk_queue_destroy(struct request_sock_queue *queue);
void reqsk_queue_destroy(struct request_sock_queue *queue);
void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req, void reqsk_fastopen_remove(struct sock *sk, struct request_sock *req,
bool reset); bool reset);
static inline struct request_sock * static inline bool reqsk_queue_empty(const struct request_sock_queue *queue)
reqsk_queue_yank_acceptq(struct request_sock_queue *queue)
{
struct request_sock *req = queue->rskq_accept_head;
queue->rskq_accept_head = NULL;
return req;
}
static inline int reqsk_queue_empty(struct request_sock_queue *queue)
{ {
return queue->rskq_accept_head == NULL; return queue->rskq_accept_head == NULL;
} }
...@@ -215,6 +184,7 @@ static inline void reqsk_queue_add(struct request_sock_queue *queue, ...@@ -215,6 +184,7 @@ static inline void reqsk_queue_add(struct request_sock_queue *queue,
struct sock *parent, struct sock *parent,
struct sock *child) struct sock *child)
{ {
spin_lock(&queue->rskq_lock);
req->sk = child; req->sk = child;
sk_acceptq_added(parent); sk_acceptq_added(parent);
...@@ -225,68 +195,48 @@ static inline void reqsk_queue_add(struct request_sock_queue *queue, ...@@ -225,68 +195,48 @@ static inline void reqsk_queue_add(struct request_sock_queue *queue,
queue->rskq_accept_tail = req; queue->rskq_accept_tail = req;
req->dl_next = NULL; req->dl_next = NULL;
spin_unlock(&queue->rskq_lock);
} }
static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue) static inline struct request_sock *reqsk_queue_remove(struct request_sock_queue *queue,
struct sock *parent)
{ {
struct request_sock *req = queue->rskq_accept_head; struct request_sock *req;
WARN_ON(req == NULL);
queue->rskq_accept_head = req->dl_next;
if (queue->rskq_accept_head == NULL)
queue->rskq_accept_tail = NULL;
spin_lock_bh(&queue->rskq_lock);
req = queue->rskq_accept_head;
if (req) {
sk_acceptq_removed(parent);
queue->rskq_accept_head = req->dl_next;
if (queue->rskq_accept_head == NULL)
queue->rskq_accept_tail = NULL;
}
spin_unlock_bh(&queue->rskq_lock);
return req; return req;
} }
static inline void reqsk_queue_removed(struct request_sock_queue *queue, static inline void reqsk_queue_removed(struct request_sock_queue *queue,
const struct request_sock *req) const struct request_sock *req)
{ {
struct listen_sock *lopt = queue->listen_opt;
if (req->num_timeout == 0) if (req->num_timeout == 0)
atomic_inc(&lopt->young_dec); atomic_dec(&queue->young);
atomic_inc(&lopt->qlen_dec); atomic_dec(&queue->qlen);
} }
static inline void reqsk_queue_added(struct request_sock_queue *queue) static inline void reqsk_queue_added(struct request_sock_queue *queue)
{ {
struct listen_sock *lopt = queue->listen_opt; atomic_inc(&queue->young);
atomic_inc(&queue->qlen);
lopt->young_inc++;
lopt->qlen_inc++;
}
static inline int listen_sock_qlen(const struct listen_sock *lopt)
{
return lopt->qlen_inc - atomic_read(&lopt->qlen_dec);
}
static inline int listen_sock_young(const struct listen_sock *lopt)
{
return lopt->young_inc - atomic_read(&lopt->young_dec);
} }
static inline int reqsk_queue_len(const struct request_sock_queue *queue) static inline int reqsk_queue_len(const struct request_sock_queue *queue)
{ {
const struct listen_sock *lopt = queue->listen_opt; return atomic_read(&queue->qlen);
return lopt ? listen_sock_qlen(lopt) : 0;
} }
static inline int reqsk_queue_len_young(const struct request_sock_queue *queue) static inline int reqsk_queue_len_young(const struct request_sock_queue *queue)
{ {
return listen_sock_young(queue->listen_opt); return atomic_read(&queue->young);
} }
static inline int reqsk_queue_is_full(const struct request_sock_queue *queue)
{
return reqsk_queue_len(queue) >> queue->listen_opt->max_qlen_log;
}
void reqsk_queue_hash_req(struct request_sock_queue *queue,
u32 hash, struct request_sock *req,
unsigned long timeout);
#endif /* _REQUEST_SOCK_H */ #endif /* _REQUEST_SOCK_H */
...@@ -462,7 +462,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len); ...@@ -462,7 +462,8 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len);
int tcp_connect(struct sock *sk); int tcp_connect(struct sock *sk);
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req, struct request_sock *req,
struct tcp_fastopen_cookie *foc); struct tcp_fastopen_cookie *foc,
bool attach_req);
int tcp_disconnect(struct sock *sk, int flags); int tcp_disconnect(struct sock *sk, int flags);
void tcp_finish_connect(struct sock *sk, struct sk_buff *skb); void tcp_finish_connect(struct sock *sk, struct sk_buff *skb);
...@@ -1618,7 +1619,6 @@ static inline bool tcp_stream_is_thin(struct tcp_sock *tp) ...@@ -1618,7 +1619,6 @@ static inline bool tcp_stream_is_thin(struct tcp_sock *tp)
/* /proc */ /* /proc */
enum tcp_seq_states { enum tcp_seq_states {
TCP_SEQ_STATE_LISTENING, TCP_SEQ_STATE_LISTENING,
TCP_SEQ_STATE_OPENREQ,
TCP_SEQ_STATE_ESTABLISHED, TCP_SEQ_STATE_ESTABLISHED,
}; };
...@@ -1637,7 +1637,6 @@ struct tcp_iter_state { ...@@ -1637,7 +1637,6 @@ struct tcp_iter_state {
enum tcp_seq_states state; enum tcp_seq_states state;
struct sock *syn_wait_sk; struct sock *syn_wait_sk;
int bucket, offset, sbucket, num; int bucket, offset, sbucket, num;
kuid_t uid;
loff_t last_pos; loff_t last_pos;
}; };
...@@ -1717,9 +1716,8 @@ struct tcp_request_sock_ops { ...@@ -1717,9 +1716,8 @@ struct tcp_request_sock_ops {
__u32 (*init_seq)(const struct sk_buff *skb); __u32 (*init_seq)(const struct sk_buff *skb);
int (*send_synack)(const struct sock *sk, struct dst_entry *dst, int (*send_synack)(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl, struct request_sock *req, struct flowi *fl, struct request_sock *req,
u16 queue_mapping, struct tcp_fastopen_cookie *foc); u16 queue_mapping, struct tcp_fastopen_cookie *foc,
void (*queue_hash_add)(struct sock *sk, struct request_sock *req, bool attach_req);
const unsigned long timeout);
}; };
#ifdef CONFIG_SYN_COOKIES #ifdef CONFIG_SYN_COOKIES
......
...@@ -37,28 +37,9 @@ ...@@ -37,28 +37,9 @@
int sysctl_max_syn_backlog = 256; int sysctl_max_syn_backlog = 256;
EXPORT_SYMBOL(sysctl_max_syn_backlog); EXPORT_SYMBOL(sysctl_max_syn_backlog);
int reqsk_queue_alloc(struct request_sock_queue *queue, void reqsk_queue_alloc(struct request_sock_queue *queue)
unsigned int nr_table_entries)
{ {
size_t lopt_size = sizeof(struct listen_sock); spin_lock_init(&queue->rskq_lock);
struct listen_sock *lopt = NULL;
nr_table_entries = min_t(u32, nr_table_entries, sysctl_max_syn_backlog);
nr_table_entries = max_t(u32, nr_table_entries, 8);
nr_table_entries = roundup_pow_of_two(nr_table_entries + 1);
lopt_size += nr_table_entries * sizeof(struct request_sock *);
if (lopt_size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
lopt = kzalloc(lopt_size, GFP_KERNEL |
__GFP_NOWARN |
__GFP_NORETRY);
if (!lopt)
lopt = vzalloc(lopt_size);
if (!lopt)
return -ENOMEM;
get_random_bytes(&lopt->hash_rnd, sizeof(lopt->hash_rnd));
spin_lock_init(&queue->syn_wait_lock);
spin_lock_init(&queue->fastopenq.lock); spin_lock_init(&queue->fastopenq.lock);
queue->fastopenq.rskq_rst_head = NULL; queue->fastopenq.rskq_rst_head = NULL;
...@@ -67,67 +48,6 @@ int reqsk_queue_alloc(struct request_sock_queue *queue, ...@@ -67,67 +48,6 @@ int reqsk_queue_alloc(struct request_sock_queue *queue,
queue->fastopenq.max_qlen = 0; queue->fastopenq.max_qlen = 0;
queue->rskq_accept_head = NULL; queue->rskq_accept_head = NULL;
lopt->nr_table_entries = nr_table_entries;
lopt->max_qlen_log = ilog2(nr_table_entries);
spin_lock_bh(&queue->syn_wait_lock);
queue->listen_opt = lopt;
spin_unlock_bh(&queue->syn_wait_lock);
return 0;
}
void __reqsk_queue_destroy(struct request_sock_queue *queue)
{
/* This is an error recovery path only, no locking needed */
kvfree(queue->listen_opt);
}
static inline struct listen_sock *reqsk_queue_yank_listen_sk(
struct request_sock_queue *queue)
{
struct listen_sock *lopt;
spin_lock_bh(&queue->syn_wait_lock);
lopt = queue->listen_opt;
queue->listen_opt = NULL;
spin_unlock_bh(&queue->syn_wait_lock);
return lopt;
}
void reqsk_queue_destroy(struct request_sock_queue *queue)
{
/* make all the listen_opt local to us */
struct listen_sock *lopt = reqsk_queue_yank_listen_sk(queue);
if (listen_sock_qlen(lopt) != 0) {
unsigned int i;
for (i = 0; i < lopt->nr_table_entries; i++) {
struct request_sock *req;
spin_lock_bh(&queue->syn_wait_lock);
while ((req = lopt->syn_table[i]) != NULL) {
lopt->syn_table[i] = req->dl_next;
/* Because of following del_timer_sync(),
* we must release the spinlock here
* or risk a dead lock.
*/
spin_unlock_bh(&queue->syn_wait_lock);
atomic_inc(&lopt->qlen_dec);
if (del_timer_sync(&req->rsk_timer))
reqsk_put(req);
reqsk_put(req);
spin_lock_bh(&queue->syn_wait_lock);
}
spin_unlock_bh(&queue->syn_wait_lock);
}
}
if (WARN_ON(listen_sock_qlen(lopt) != 0))
pr_err("qlen %u\n", listen_sock_qlen(lopt));
kvfree(lopt);
} }
/* /*
......
...@@ -444,36 +444,6 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, ...@@ -444,36 +444,6 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk,
} }
EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock);
static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
{
const struct dccp_hdr *dh = dccp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
struct sock *nsk;
/* Find possible connection requests. */
struct request_sock *req = inet_csk_search_req(sk, dh->dccph_sport,
iph->saddr, iph->daddr);
if (req) {
nsk = dccp_check_req(sk, skb, req);
if (!nsk)
reqsk_put(req);
return nsk;
}
nsk = inet_lookup_established(sock_net(sk), &dccp_hashinfo,
iph->saddr, dh->dccph_sport,
iph->daddr, dh->dccph_dport,
inet_iif(skb));
if (nsk != NULL) {
if (nsk->sk_state != DCCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
inet_twsk_put(inet_twsk(nsk));
return NULL;
}
return sk;
}
static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk, static struct dst_entry* dccp_v4_route_skb(struct net *net, struct sock *sk,
struct sk_buff *skb) struct sk_buff *skb)
{ {
...@@ -705,18 +675,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) ...@@ -705,18 +675,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
* NOTE: the check for the packet types is done in * NOTE: the check for the packet types is done in
* dccp_rcv_state_process * dccp_rcv_state_process
*/ */
if (sk->sk_state == DCCP_LISTEN) {
struct sock *nsk = dccp_v4_hnd_req(sk, skb);
if (nsk == NULL)
goto discard;
if (nsk != sk) {
if (dccp_child_process(sk, nsk, skb))
goto reset;
return 0;
}
}
if (dccp_rcv_state_process(sk, skb, dh, skb->len)) if (dccp_rcv_state_process(sk, skb, dh, skb->len))
goto reset; goto reset;
...@@ -724,7 +682,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) ...@@ -724,7 +682,6 @@ int dccp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
reset: reset:
dccp_v4_ctl_send_reset(sk, skb); dccp_v4_ctl_send_reset(sk, skb);
discard:
kfree_skb(skb); kfree_skb(skb);
return 0; return 0;
} }
...@@ -868,6 +825,27 @@ static int dccp_v4_rcv(struct sk_buff *skb) ...@@ -868,6 +825,27 @@ static int dccp_v4_rcv(struct sk_buff *skb)
goto no_dccp_socket; goto no_dccp_socket;
} }
if (sk->sk_state == DCCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
struct sock *nsk = NULL;
sk = req->rsk_listener;
if (sk->sk_state == DCCP_LISTEN)
nsk = dccp_check_req(sk, skb, req);
if (!nsk) {
reqsk_put(req);
goto discard_it;
}
if (nsk == sk) {
sock_hold(sk);
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
dccp_v4_ctl_send_reset(sk, skb);
goto discard_it;
} else {
return 0;
}
}
/* /*
* RFC 4340, sec. 9.2.1: Minimum Checksum Coverage * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
* o if MinCsCov = 0, only packets with CsCov = 0 are accepted * o if MinCsCov = 0, only packets with CsCov = 0 are accepted
......
...@@ -290,37 +290,6 @@ static struct request_sock_ops dccp6_request_sock_ops = { ...@@ -290,37 +290,6 @@ static struct request_sock_ops dccp6_request_sock_ops = {
.syn_ack_timeout = dccp_syn_ack_timeout, .syn_ack_timeout = dccp_syn_ack_timeout,
}; };
static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb)
{
const struct dccp_hdr *dh = dccp_hdr(skb);
const struct ipv6hdr *iph = ipv6_hdr(skb);
struct request_sock *req;
struct sock *nsk;
req = inet6_csk_search_req(sk, dh->dccph_sport, &iph->saddr,
&iph->daddr, inet6_iif(skb));
if (req) {
nsk = dccp_check_req(sk, skb, req);
if (!nsk)
reqsk_put(req);
return nsk;
}
nsk = __inet6_lookup_established(sock_net(sk), &dccp_hashinfo,
&iph->saddr, dh->dccph_sport,
&iph->daddr, ntohs(dh->dccph_dport),
inet6_iif(skb));
if (nsk != NULL) {
if (nsk->sk_state != DCCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
inet_twsk_put(inet_twsk(nsk));
return NULL;
}
return sk;
}
static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
{ {
struct request_sock *req; struct request_sock *req;
...@@ -398,7 +367,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) ...@@ -398,7 +367,7 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
if (dccp_v6_send_response(sk, req)) if (dccp_v6_send_response(sk, req))
goto drop_and_free; goto drop_and_free;
inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT);
return 0; return 0;
drop_and_free: drop_and_free:
...@@ -641,24 +610,6 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) ...@@ -641,24 +610,6 @@ static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
* NOTE: the check for the packet types is done in * NOTE: the check for the packet types is done in
* dccp_rcv_state_process * dccp_rcv_state_process
*/ */
if (sk->sk_state == DCCP_LISTEN) {
struct sock *nsk = dccp_v6_hnd_req(sk, skb);
if (nsk == NULL)
goto discard;
/*
* Queue it on the new socket if the new socket is active,
* otherwise we just shortcircuit this and continue with
* the new socket..
*/
if (nsk != sk) {
if (dccp_child_process(sk, nsk, skb))
goto reset;
if (opt_skb != NULL)
__kfree_skb(opt_skb);
return 0;
}
}
if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len))
goto reset; goto reset;
...@@ -732,6 +683,27 @@ static int dccp_v6_rcv(struct sk_buff *skb) ...@@ -732,6 +683,27 @@ static int dccp_v6_rcv(struct sk_buff *skb)
goto no_dccp_socket; goto no_dccp_socket;
} }
if (sk->sk_state == DCCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
struct sock *nsk = NULL;
sk = req->rsk_listener;
if (sk->sk_state == DCCP_LISTEN)
nsk = dccp_check_req(sk, skb, req);
if (!nsk) {
reqsk_put(req);
goto discard_it;
}
if (nsk == sk) {
sock_hold(sk);
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
dccp_v6_ctl_send_reset(sk, skb);
goto discard_it;
} else {
return 0;
}
}
/* /*
* RFC 4340, sec. 9.2.1: Minimum Checksum Coverage * RFC 4340, sec. 9.2.1: Minimum Checksum Coverage
* o if MinCsCov = 0, only packets with CsCov = 0 are accepted * o if MinCsCov = 0, only packets with CsCov = 0 are accepted
......
...@@ -330,10 +330,9 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err) ...@@ -330,10 +330,9 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
if (error) if (error)
goto out_err; goto out_err;
} }
req = reqsk_queue_remove(queue); req = reqsk_queue_remove(queue, sk);
newsk = req->sk; newsk = req->sk;
sk_acceptq_removed(sk);
if (sk->sk_protocol == IPPROTO_TCP && if (sk->sk_protocol == IPPROTO_TCP &&
tcp_rsk(req)->tfo_listener) { tcp_rsk(req)->tfo_listener) {
spin_lock_bh(&queue->fastopenq.lock); spin_lock_bh(&queue->fastopenq.lock);
...@@ -477,65 +476,12 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk, ...@@ -477,65 +476,12 @@ struct dst_entry *inet_csk_route_child_sock(const struct sock *sk,
} }
EXPORT_SYMBOL_GPL(inet_csk_route_child_sock); EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
static inline u32 inet_synq_hash(const __be32 raddr, const __be16 rport,
const u32 rnd, const u32 synq_hsize)
{
return jhash_2words((__force u32)raddr, (__force u32)rport, rnd) & (synq_hsize - 1);
}
#if IS_ENABLED(CONFIG_IPV6) #if IS_ENABLED(CONFIG_IPV6)
#define AF_INET_FAMILY(fam) ((fam) == AF_INET) #define AF_INET_FAMILY(fam) ((fam) == AF_INET)
#else #else
#define AF_INET_FAMILY(fam) true #define AF_INET_FAMILY(fam) true
#endif #endif
/* Note: this is temporary :
* req sock will no longer be in listener hash table
*/
struct request_sock *inet_csk_search_req(struct sock *sk,
const __be16 rport,
const __be32 raddr,
const __be32 laddr)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
struct request_sock *req;
u32 hash = inet_synq_hash(raddr, rport, lopt->hash_rnd,
lopt->nr_table_entries);
spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) {
const struct inet_request_sock *ireq = inet_rsk(req);
if (ireq->ir_rmt_port == rport &&
ireq->ir_rmt_addr == raddr &&
ireq->ir_loc_addr == laddr &&
AF_INET_FAMILY(req->rsk_ops->family)) {
atomic_inc(&req->rsk_refcnt);
WARN_ON(req->sk);
break;
}
}
spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
return req;
}
EXPORT_SYMBOL_GPL(inet_csk_search_req);
void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
unsigned long timeout)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
const u32 h = inet_synq_hash(inet_rsk(req)->ir_rmt_addr,
inet_rsk(req)->ir_rmt_port,
lopt->hash_rnd, lopt->nr_table_entries);
reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
inet_csk_reqsk_queue_added(sk, timeout);
}
EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
/* Only thing we need from tcp.h */ /* Only thing we need from tcp.h */
extern int sysctl_tcp_synack_retries; extern int sysctl_tcp_synack_retries;
...@@ -572,26 +518,20 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req) ...@@ -572,26 +518,20 @@ int inet_rtx_syn_ack(const struct sock *parent, struct request_sock *req)
} }
EXPORT_SYMBOL(inet_rtx_syn_ack); EXPORT_SYMBOL(inet_rtx_syn_ack);
/* return true if req was found in the syn_table[] */ /* return true if req was found in the ehash table */
static bool reqsk_queue_unlink(struct request_sock_queue *queue, static bool reqsk_queue_unlink(struct request_sock_queue *queue,
struct request_sock *req) struct request_sock *req)
{ {
struct listen_sock *lopt = queue->listen_opt; struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo;
struct request_sock **prev; spinlock_t *lock;
bool found = false; bool found;
spin_lock(&queue->syn_wait_lock); lock = inet_ehash_lockp(hashinfo, req->rsk_hash);
for (prev = &lopt->syn_table[req->rsk_hash]; *prev != NULL; spin_lock(lock);
prev = &(*prev)->dl_next) { found = __sk_nulls_del_node_init_rcu(req_to_sk(req));
if (*prev == req) { spin_unlock(lock);
*prev = req->dl_next;
found = true;
break;
}
}
spin_unlock(&queue->syn_wait_lock);
if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer))
reqsk_put(req); reqsk_put(req);
return found; return found;
...@@ -612,15 +552,12 @@ static void reqsk_timer_handler(unsigned long data) ...@@ -612,15 +552,12 @@ static void reqsk_timer_handler(unsigned long data)
struct sock *sk_listener = req->rsk_listener; struct sock *sk_listener = req->rsk_listener;
struct inet_connection_sock *icsk = inet_csk(sk_listener); struct inet_connection_sock *icsk = inet_csk(sk_listener);
struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct listen_sock *lopt = queue->listen_opt;
int qlen, expire = 0, resend = 0; int qlen, expire = 0, resend = 0;
int max_retries, thresh; int max_retries, thresh;
u8 defer_accept; u8 defer_accept;
if (sk_listener->sk_state != TCP_LISTEN || !lopt) { if (sk_listener->sk_state != TCP_LISTEN)
reqsk_put(req); goto drop;
return;
}
max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries; max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
thresh = max_retries; thresh = max_retries;
...@@ -641,9 +578,9 @@ static void reqsk_timer_handler(unsigned long data) ...@@ -641,9 +578,9 @@ static void reqsk_timer_handler(unsigned long data)
* embrions; and abort old ones without pity, if old * embrions; and abort old ones without pity, if old
* ones are about to clog our table. * ones are about to clog our table.
*/ */
qlen = listen_sock_qlen(lopt); qlen = reqsk_queue_len(queue);
if (qlen >> (lopt->max_qlen_log - 1)) { if ((qlen << 1) > sk_listener->sk_max_ack_backlog) {
int young = listen_sock_young(lopt) << 1; int young = reqsk_queue_len_young(queue) << 1;
while (thresh > 2) { while (thresh > 2) {
if (qlen < young) if (qlen < young)
...@@ -665,41 +602,41 @@ static void reqsk_timer_handler(unsigned long data) ...@@ -665,41 +602,41 @@ static void reqsk_timer_handler(unsigned long data)
unsigned long timeo; unsigned long timeo;
if (req->num_timeout++ == 0) if (req->num_timeout++ == 0)
atomic_inc(&lopt->young_dec); atomic_dec(&queue->young);
timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX); timeo = min(TCP_TIMEOUT_INIT << req->num_timeout, TCP_RTO_MAX);
mod_timer_pinned(&req->rsk_timer, jiffies + timeo); mod_timer_pinned(&req->rsk_timer, jiffies + timeo);
return; return;
} }
drop:
inet_csk_reqsk_queue_drop(sk_listener, req); inet_csk_reqsk_queue_drop(sk_listener, req);
reqsk_put(req); reqsk_put(req);
} }
void reqsk_queue_hash_req(struct request_sock_queue *queue, static void reqsk_queue_hash_req(struct request_sock *req,
u32 hash, struct request_sock *req, unsigned long timeout)
unsigned long timeout)
{ {
struct listen_sock *lopt = queue->listen_opt;
req->num_retrans = 0; req->num_retrans = 0;
req->num_timeout = 0; req->num_timeout = 0;
req->sk = NULL; req->sk = NULL;
setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req); setup_timer(&req->rsk_timer, reqsk_timer_handler, (unsigned long)req);
mod_timer_pinned(&req->rsk_timer, jiffies + timeout); mod_timer_pinned(&req->rsk_timer, jiffies + timeout);
req->rsk_hash = hash;
inet_ehash_insert(req_to_sk(req), NULL);
/* before letting lookups find us, make sure all req fields /* before letting lookups find us, make sure all req fields
* are committed to memory and refcnt initialized. * are committed to memory and refcnt initialized.
*/ */
smp_wmb(); smp_wmb();
atomic_set(&req->rsk_refcnt, 2); atomic_set(&req->rsk_refcnt, 2 + 1);
}
spin_lock(&queue->syn_wait_lock); void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req,
req->dl_next = lopt->syn_table[hash]; unsigned long timeout)
lopt->syn_table[hash] = req; {
spin_unlock(&queue->syn_wait_lock); reqsk_queue_hash_req(req, timeout);
inet_csk_reqsk_queue_added(sk);
} }
EXPORT_SYMBOL(reqsk_queue_hash_req); EXPORT_SYMBOL_GPL(inet_csk_reqsk_queue_hash_add);
/** /**
* inet_csk_clone_lock - clone an inet socket, and lock its clone * inet_csk_clone_lock - clone an inet socket, and lock its clone
...@@ -792,12 +729,10 @@ EXPORT_SYMBOL(inet_csk_prepare_forced_close); ...@@ -792,12 +729,10 @@ EXPORT_SYMBOL(inet_csk_prepare_forced_close);
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{ {
struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
int rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries); struct inet_sock *inet = inet_sk(sk);
if (rc != 0) reqsk_queue_alloc(&icsk->icsk_accept_queue);
return rc;
sk->sk_max_ack_backlog = 0; sk->sk_max_ack_backlog = 0;
sk->sk_ack_backlog = 0; sk->sk_ack_backlog = 0;
...@@ -819,7 +754,6 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries) ...@@ -819,7 +754,6 @@ int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
} }
sk->sk_state = TCP_CLOSE; sk->sk_state = TCP_CLOSE;
__reqsk_queue_destroy(&icsk->icsk_accept_queue);
return -EADDRINUSE; return -EADDRINUSE;
} }
EXPORT_SYMBOL_GPL(inet_csk_listen_start); EXPORT_SYMBOL_GPL(inet_csk_listen_start);
...@@ -832,11 +766,7 @@ void inet_csk_listen_stop(struct sock *sk) ...@@ -832,11 +766,7 @@ void inet_csk_listen_stop(struct sock *sk)
{ {
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
struct request_sock_queue *queue = &icsk->icsk_accept_queue; struct request_sock_queue *queue = &icsk->icsk_accept_queue;
struct request_sock *acc_req; struct request_sock *next, *req;
struct request_sock *req;
/* make all the listen_opt local to us */
acc_req = reqsk_queue_yank_acceptq(queue);
/* Following specs, it would be better either to send FIN /* Following specs, it would be better either to send FIN
* (and enter FIN-WAIT-1, it is normal close) * (and enter FIN-WAIT-1, it is normal close)
...@@ -846,13 +776,9 @@ void inet_csk_listen_stop(struct sock *sk) ...@@ -846,13 +776,9 @@ void inet_csk_listen_stop(struct sock *sk)
* To be honest, we are not able to make either * To be honest, we are not able to make either
* of the variants now. --ANK * of the variants now. --ANK
*/ */
reqsk_queue_destroy(queue); while ((req = reqsk_queue_remove(queue, sk)) != NULL) {
while ((req = acc_req) != NULL) {
struct sock *child = req->sk; struct sock *child = req->sk;
acc_req = req->dl_next;
local_bh_disable(); local_bh_disable();
bh_lock_sock(child); bh_lock_sock(child);
WARN_ON(sock_owned_by_user(child)); WARN_ON(sock_owned_by_user(child));
...@@ -882,18 +808,19 @@ void inet_csk_listen_stop(struct sock *sk) ...@@ -882,18 +808,19 @@ void inet_csk_listen_stop(struct sock *sk)
local_bh_enable(); local_bh_enable();
sock_put(child); sock_put(child);
sk_acceptq_removed(sk);
reqsk_put(req); reqsk_put(req);
cond_resched();
} }
if (queue->fastopenq.rskq_rst_head) { if (queue->fastopenq.rskq_rst_head) {
/* Free all the reqs queued in rskq_rst_head. */ /* Free all the reqs queued in rskq_rst_head. */
spin_lock_bh(&queue->fastopenq.lock); spin_lock_bh(&queue->fastopenq.lock);
acc_req = queue->fastopenq.rskq_rst_head; req = queue->fastopenq.rskq_rst_head;
queue->fastopenq.rskq_rst_head = NULL; queue->fastopenq.rskq_rst_head = NULL;
spin_unlock_bh(&queue->fastopenq.lock); spin_unlock_bh(&queue->fastopenq.lock);
while ((req = acc_req) != NULL) { while (req != NULL) {
acc_req = req->dl_next; next = req->dl_next;
reqsk_put(req); reqsk_put(req);
req = next;
} }
} }
WARN_ON(sk->sk_ack_backlog); WARN_ON(sk->sk_ack_backlog);
......
...@@ -730,91 +730,21 @@ static void twsk_build_assert(void) ...@@ -730,91 +730,21 @@ static void twsk_build_assert(void)
#endif #endif
} }
static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk,
struct netlink_callback *cb,
const struct inet_diag_req_v2 *r,
const struct nlattr *bc)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet = inet_sk(sk);
struct inet_diag_entry entry;
int j, s_j, reqnum, s_reqnum;
struct listen_sock *lopt;
int err = 0;
s_j = cb->args[3];
s_reqnum = cb->args[4];
if (s_j > 0)
s_j--;
entry.family = sk->sk_family;
spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
lopt = icsk->icsk_accept_queue.listen_opt;
if (!lopt || !listen_sock_qlen(lopt))
goto out;
if (bc) {
entry.sport = inet->inet_num;
entry.userlocks = sk->sk_userlocks;
}
for (j = s_j; j < lopt->nr_table_entries; j++) {
struct request_sock *req, *head = lopt->syn_table[j];
reqnum = 0;
for (req = head; req; reqnum++, req = req->dl_next) {
struct inet_request_sock *ireq = inet_rsk(req);
if (reqnum < s_reqnum)
continue;
if (r->id.idiag_dport != ireq->ir_rmt_port &&
r->id.idiag_dport)
continue;
if (bc) {
/* Note: entry.sport and entry.userlocks are already set */
entry_fill_addrs(&entry, req_to_sk(req));
entry.dport = ntohs(ireq->ir_rmt_port);
if (!inet_diag_bc_run(bc, &entry))
continue;
}
err = inet_req_diag_fill(req_to_sk(req), skb,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
NLM_F_MULTI, cb->nlh);
if (err < 0) {
cb->args[3] = j + 1;
cb->args[4] = reqnum;
goto out;
}
}
s_reqnum = 0;
}
out:
spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
return err;
}
void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
struct netlink_callback *cb, struct netlink_callback *cb,
const struct inet_diag_req_v2 *r, struct nlattr *bc) const struct inet_diag_req_v2 *r, struct nlattr *bc)
{ {
struct net *net = sock_net(skb->sk); struct net *net = sock_net(skb->sk);
int i, num, s_i, s_num; int i, num, s_i, s_num;
u32 idiag_states = r->idiag_states;
if (idiag_states & TCPF_SYN_RECV)
idiag_states |= TCPF_NEW_SYN_RECV;
s_i = cb->args[1]; s_i = cb->args[1];
s_num = num = cb->args[2]; s_num = num = cb->args[2];
if (cb->args[0] == 0) { if (cb->args[0] == 0) {
if (!(r->idiag_states & (TCPF_LISTEN | TCPF_SYN_RECV))) if (!(idiag_states & TCPF_LISTEN))
goto skip_listen_ht; goto skip_listen_ht;
for (i = s_i; i < INET_LHTABLE_SIZE; i++) { for (i = s_i; i < INET_LHTABLE_SIZE; i++) {
...@@ -844,21 +774,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, ...@@ -844,21 +774,11 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
r->id.idiag_sport) r->id.idiag_sport)
goto next_listen; goto next_listen;
if (!(r->idiag_states & TCPF_LISTEN) || if (r->id.idiag_dport ||
r->id.idiag_dport ||
cb->args[3] > 0) cb->args[3] > 0)
goto syn_recv;
if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
spin_unlock_bh(&ilb->lock);
goto done;
}
syn_recv:
if (!(r->idiag_states & TCPF_SYN_RECV))
goto next_listen; goto next_listen;
if (inet_diag_dump_reqs(skb, sk, cb, r, bc) < 0) { if (inet_csk_diag_dump(sk, skb, cb, r, bc) < 0) {
spin_unlock_bh(&ilb->lock); spin_unlock_bh(&ilb->lock);
goto done; goto done;
} }
...@@ -879,7 +799,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, ...@@ -879,7 +799,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
s_i = num = s_num = 0; s_i = num = s_num = 0;
} }
if (!(r->idiag_states & ~(TCPF_LISTEN | TCPF_SYN_RECV))) if (!(idiag_states & ~TCPF_LISTEN))
goto out; goto out;
for (i = s_i; i <= hashinfo->ehash_mask; i++) { for (i = s_i; i <= hashinfo->ehash_mask; i++) {
...@@ -906,7 +826,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb, ...@@ -906,7 +826,7 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
goto next_normal; goto next_normal;
state = (sk->sk_state == TCP_TIME_WAIT) ? state = (sk->sk_state == TCP_TIME_WAIT) ?
inet_twsk(sk)->tw_substate : sk->sk_state; inet_twsk(sk)->tw_substate : sk->sk_state;
if (!(r->idiag_states & (1 << state))) if (!(idiag_states & (1 << state)))
goto next_normal; goto next_normal;
if (r->sdiag_family != AF_UNSPEC && if (r->sdiag_family != AF_UNSPEC &&
sk->sk_family != r->sdiag_family) sk->sk_family != r->sdiag_family)
......
...@@ -398,14 +398,18 @@ static u32 inet_sk_port_offset(const struct sock *sk) ...@@ -398,14 +398,18 @@ static u32 inet_sk_port_offset(const struct sock *sk)
inet->inet_dport); inet->inet_dport);
} }
void __inet_hash_nolisten(struct sock *sk, struct sock *osk) /* insert a socket into ehash, and eventually remove another one
* (The another one can be a SYN_RECV or TIMEWAIT
*/
int inet_ehash_insert(struct sock *sk, struct sock *osk)
{ {
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct hlist_nulls_head *list; struct hlist_nulls_head *list;
struct inet_ehash_bucket *head; struct inet_ehash_bucket *head;
spinlock_t *lock; spinlock_t *lock;
int ret = 0;
WARN_ON(!sk_unhashed(sk)); WARN_ON_ONCE(!sk_unhashed(sk));
sk->sk_hash = sk_ehashfn(sk); sk->sk_hash = sk_ehashfn(sk);
head = inet_ehash_bucket(hashinfo, sk->sk_hash); head = inet_ehash_bucket(hashinfo, sk->sk_hash);
...@@ -419,6 +423,12 @@ void __inet_hash_nolisten(struct sock *sk, struct sock *osk) ...@@ -419,6 +423,12 @@ void __inet_hash_nolisten(struct sock *sk, struct sock *osk)
sk_nulls_del_node_init_rcu(osk); sk_nulls_del_node_init_rcu(osk);
} }
spin_unlock(lock); spin_unlock(lock);
return ret;
}
void __inet_hash_nolisten(struct sock *sk, struct sock *osk)
{
inet_ehash_insert(sk, osk);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
} }
EXPORT_SYMBOL_GPL(__inet_hash_nolisten); EXPORT_SYMBOL_GPL(__inet_hash_nolisten);
......
...@@ -284,6 +284,10 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt, ...@@ -284,6 +284,10 @@ bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
} }
EXPORT_SYMBOL(cookie_ecn_ok); EXPORT_SYMBOL(cookie_ecn_ok);
/* On input, sk is a listener.
* Output is listener if incoming packet would not create a child
* NULL if memory could not be allocated.
*/
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
{ {
struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt; struct ip_options *opt = &TCP_SKB_CB(skb)->header.h4.opt;
......
...@@ -161,13 +161,13 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk, ...@@ -161,13 +161,13 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
tp->snd_wnd = ntohs(tcp_hdr(skb)->window); tp->snd_wnd = ntohs(tcp_hdr(skb)->window);
/* Activate the retrans timer so that SYNACK can be retransmitted. /* Activate the retrans timer so that SYNACK can be retransmitted.
* The request socket is not added to the SYN table of the parent * The request socket is not added to the ehash
* because it's been added to the accept queue directly. * because it's been added to the accept queue directly.
*/ */
inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS, inet_csk_reset_xmit_timer(child, ICSK_TIME_RETRANS,
TCP_TIMEOUT_INIT, TCP_RTO_MAX); TCP_TIMEOUT_INIT, TCP_RTO_MAX);
atomic_set(&req->rsk_refcnt, 1); atomic_set(&req->rsk_refcnt, 2);
/* Add the child socket directly into the accept queue */ /* Add the child socket directly into the accept queue */
inet_csk_reqsk_queue_add(sk, req, child); inet_csk_reqsk_queue_add(sk, req, child);
......
...@@ -6068,9 +6068,9 @@ static bool tcp_syn_flood_action(const struct sock *sk, ...@@ -6068,9 +6068,9 @@ static bool tcp_syn_flood_action(const struct sock *sk,
const struct sk_buff *skb, const struct sk_buff *skb,
const char *proto) const char *proto)
{ {
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = "Dropping request"; const char *msg = "Dropping request";
bool want_cookie = false; bool want_cookie = false;
struct listen_sock *lopt;
#ifdef CONFIG_SYN_COOKIES #ifdef CONFIG_SYN_COOKIES
if (sysctl_tcp_syncookies) { if (sysctl_tcp_syncookies) {
...@@ -6081,10 +6081,9 @@ static bool tcp_syn_flood_action(const struct sock *sk, ...@@ -6081,10 +6081,9 @@ static bool tcp_syn_flood_action(const struct sock *sk,
#endif #endif
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
lopt = inet_csk(sk)->icsk_accept_queue.listen_opt; if (!queue->synflood_warned &&
if (!lopt->synflood_warned &&
sysctl_tcp_syncookies != 2 && sysctl_tcp_syncookies != 2 &&
xchg(&lopt->synflood_warned, 1) == 0) xchg(&queue->synflood_warned, 1) == 0)
pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n", pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
proto, ntohs(tcp_hdr(skb)->dest), msg); proto, ntohs(tcp_hdr(skb)->dest), msg);
...@@ -6121,8 +6120,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, ...@@ -6121,8 +6120,6 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
struct request_sock *req; struct request_sock *req;
bool want_cookie = false; bool want_cookie = false;
struct flowi fl; struct flowi fl;
int err;
/* TW buckets are converted to open requests without /* TW buckets are converted to open requests without
* limitations, they conserve resources and peer is * limitations, they conserve resources and peer is
...@@ -6231,21 +6228,24 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, ...@@ -6231,21 +6228,24 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst); tcp_openreq_init_rwin(req, sk, dst);
if (!want_cookie) if (!want_cookie) {
fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst); fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
err = af_ops->send_synack(fastopen_sk ?: sk, dst, &fl, req, tcp_reqsk_record_syn(sk, req, skb);
skb_get_queue_mapping(skb), &foc); }
if (fastopen_sk) { if (fastopen_sk) {
af_ops->send_synack(fastopen_sk, dst, &fl, req,
skb_get_queue_mapping(skb), &foc, false);
sock_put(fastopen_sk); sock_put(fastopen_sk);
} else { } else {
if (err || want_cookie)
goto drop_and_free;
tcp_rsk(req)->tfo_listener = false; tcp_rsk(req)->tfo_listener = false;
af_ops->queue_hash_add(sk, req, TCP_TIMEOUT_INIT); if (!want_cookie)
inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
af_ops->send_synack(sk, dst, &fl, req,
skb_get_queue_mapping(skb), &foc, !want_cookie);
if (want_cookie)
goto drop_and_free;
} }
tcp_reqsk_record_syn(sk, req, skb); reqsk_put(req);
return 0; return 0;
drop_and_release: drop_and_release:
......
...@@ -822,7 +822,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -822,7 +822,8 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl, struct flowi *fl,
struct request_sock *req, struct request_sock *req,
u16 queue_mapping, u16 queue_mapping,
struct tcp_fastopen_cookie *foc) struct tcp_fastopen_cookie *foc,
bool attach_req)
{ {
const struct inet_request_sock *ireq = inet_rsk(req); const struct inet_request_sock *ireq = inet_rsk(req);
struct flowi4 fl4; struct flowi4 fl4;
...@@ -833,7 +834,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -833,7 +834,7 @@ static int tcp_v4_send_synack(const struct sock *sk, struct dst_entry *dst,
if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL) if (!dst && (dst = inet_csk_route_req(sk, &fl4, req)) == NULL)
return -1; return -1;
skb = tcp_make_synack(sk, dst, req, foc); skb = tcp_make_synack(sk, dst, req, foc, attach_req);
if (skb) { if (skb) {
__tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr); __tcp_v4_send_check(skb, ireq->ir_loc_addr, ireq->ir_rmt_addr);
...@@ -1112,10 +1113,13 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key, ...@@ -1112,10 +1113,13 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
} }
EXPORT_SYMBOL(tcp_v4_md5_hash_skb); EXPORT_SYMBOL(tcp_v4_md5_hash_skb);
#endif
/* Called with rcu_read_lock() */ /* Called with rcu_read_lock() */
static bool tcp_v4_inbound_md5_hash(struct sock *sk, static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
const struct sk_buff *skb) const struct sk_buff *skb)
{ {
#ifdef CONFIG_TCP_MD5SIG
/* /*
* This gets called for each TCP segment that arrives * This gets called for each TCP segment that arrives
* so we want to be efficient. * so we want to be efficient.
...@@ -1165,8 +1169,9 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk, ...@@ -1165,8 +1169,9 @@ static bool tcp_v4_inbound_md5_hash(struct sock *sk,
return true; return true;
} }
return false; return false;
}
#endif #endif
return false;
}
static void tcp_v4_init_req(struct request_sock *req, static void tcp_v4_init_req(struct request_sock *req,
const struct sock *sk_listener, const struct sock *sk_listener,
...@@ -1220,7 +1225,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = { ...@@ -1220,7 +1225,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv4_ops = {
.route_req = tcp_v4_route_req, .route_req = tcp_v4_route_req,
.init_seq = tcp_v4_init_sequence, .init_seq = tcp_v4_init_sequence,
.send_synack = tcp_v4_send_synack, .send_synack = tcp_v4_send_synack,
.queue_hash_add = inet_csk_reqsk_queue_hash_add,
}; };
int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
...@@ -1339,34 +1343,11 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, ...@@ -1339,34 +1343,11 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
} }
EXPORT_SYMBOL(tcp_v4_syn_recv_sock); EXPORT_SYMBOL(tcp_v4_syn_recv_sock);
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) static struct sock *tcp_v4_cookie_check(struct sock *sk, struct sk_buff *skb)
{ {
#ifdef CONFIG_SYN_COOKIES
const struct tcphdr *th = tcp_hdr(skb); const struct tcphdr *th = tcp_hdr(skb);
const struct iphdr *iph = ip_hdr(skb);
struct request_sock *req;
struct sock *nsk;
req = inet_csk_search_req(sk, th->source, iph->saddr, iph->daddr);
if (req) {
nsk = tcp_check_req(sk, skb, req, false);
if (!nsk || nsk == sk)
reqsk_put(req);
return nsk;
}
nsk = inet_lookup_established(sock_net(sk), &tcp_hashinfo, iph->saddr,
th->source, iph->daddr, th->dest, inet_iif(skb));
if (nsk) {
if (nsk->sk_state != TCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
inet_twsk_put(inet_twsk(nsk));
return NULL;
}
#ifdef CONFIG_SYN_COOKIES
if (!th->syn) if (!th->syn)
sk = cookie_v4_check(sk, skb); sk = cookie_v4_check(sk, skb);
#endif #endif
...@@ -1374,7 +1355,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) ...@@ -1374,7 +1355,7 @@ static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb)
} }
/* The socket must have it's spinlock held when we get /* The socket must have it's spinlock held when we get
* here. * here, unless it is a TCP_LISTEN socket.
* *
* We have a potential double-lock case here, so even when * We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme. * doing backlog processing we use the BH locking scheme.
...@@ -1405,13 +1386,13 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) ...@@ -1405,13 +1386,13 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
goto csum_err; goto csum_err;
if (sk->sk_state == TCP_LISTEN) { if (sk->sk_state == TCP_LISTEN) {
struct sock *nsk = tcp_v4_hnd_req(sk, skb); struct sock *nsk = tcp_v4_cookie_check(sk, skb);
if (!nsk) if (!nsk)
goto discard; goto discard;
if (nsk != sk) { if (nsk != sk) {
sock_rps_save_rxhash(nsk, skb); sock_rps_save_rxhash(nsk, skb);
sk_mark_napi_id(sk, skb); sk_mark_napi_id(nsk, skb);
if (tcp_child_process(sk, nsk, skb)) { if (tcp_child_process(sk, nsk, skb)) {
rsk = nsk; rsk = nsk;
goto reset; goto reset;
...@@ -1599,6 +1580,29 @@ int tcp_v4_rcv(struct sk_buff *skb) ...@@ -1599,6 +1580,29 @@ int tcp_v4_rcv(struct sk_buff *skb)
if (sk->sk_state == TCP_TIME_WAIT) if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait; goto do_time_wait;
if (sk->sk_state == TCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
struct sock *nsk = NULL;
sk = req->rsk_listener;
if (tcp_v4_inbound_md5_hash(sk, skb))
goto discard_and_relse;
if (sk->sk_state == TCP_LISTEN)
nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
goto discard_it;
}
if (nsk == sk) {
sock_hold(sk);
reqsk_put(req);
} else if (tcp_child_process(sk, nsk, skb)) {
tcp_v4_send_reset(nsk, skb);
goto discard_it;
} else {
return 0;
}
}
if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) { if (unlikely(iph->ttl < inet_sk(sk)->min_ttl)) {
NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse; goto discard_and_relse;
...@@ -1607,25 +1611,23 @@ int tcp_v4_rcv(struct sk_buff *skb) ...@@ -1607,25 +1611,23 @@ int tcp_v4_rcv(struct sk_buff *skb)
if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb))
goto discard_and_relse; goto discard_and_relse;
#ifdef CONFIG_TCP_MD5SIG
/*
* We really want to reject the packet as early as possible
* if:
* o We're expecting an MD5'd packet and this is no MD5 tcp option
* o There is an MD5 option and we're not expecting one
*/
if (tcp_v4_inbound_md5_hash(sk, skb)) if (tcp_v4_inbound_md5_hash(sk, skb))
goto discard_and_relse; goto discard_and_relse;
#endif
nf_reset(skb); nf_reset(skb);
if (sk_filter(sk, skb)) if (sk_filter(sk, skb))
goto discard_and_relse; goto discard_and_relse;
sk_incoming_cpu_update(sk);
skb->dev = NULL; skb->dev = NULL;
if (sk->sk_state == TCP_LISTEN) {
ret = tcp_v4_do_rcv(sk, skb);
goto put_and_return;
}
sk_incoming_cpu_update(sk);
bh_lock_sock_nested(sk); bh_lock_sock_nested(sk);
tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
ret = 0; ret = 0;
...@@ -1640,6 +1642,7 @@ int tcp_v4_rcv(struct sk_buff *skb) ...@@ -1640,6 +1642,7 @@ int tcp_v4_rcv(struct sk_buff *skb)
} }
bh_unlock_sock(sk); bh_unlock_sock(sk);
put_and_return:
sock_put(sk); sock_put(sk);
return ret; return ret;
...@@ -1834,35 +1837,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) ...@@ -1834,35 +1837,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
++st->num; ++st->num;
++st->offset; ++st->offset;
if (st->state == TCP_SEQ_STATE_OPENREQ) { sk = sk_nulls_next(sk);
struct request_sock *req = cur;
icsk = inet_csk(st->syn_wait_sk);
req = req->dl_next;
while (1) {
while (req) {
if (req->rsk_ops->family == st->family) {
cur = req;
goto out;
}
req = req->dl_next;
}
if (++st->sbucket >= icsk->icsk_accept_queue.listen_opt->nr_table_entries)
break;
get_req:
req = icsk->icsk_accept_queue.listen_opt->syn_table[st->sbucket];
}
sk = sk_nulls_next(st->syn_wait_sk);
st->state = TCP_SEQ_STATE_LISTENING;
spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
} else {
icsk = inet_csk(sk);
spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
if (reqsk_queue_len(&icsk->icsk_accept_queue))
goto start_req;
spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
sk = sk_nulls_next(sk);
}
get_sk: get_sk:
sk_nulls_for_each_from(sk, node) { sk_nulls_for_each_from(sk, node) {
if (!net_eq(sock_net(sk), net)) if (!net_eq(sock_net(sk), net))
...@@ -1872,16 +1847,6 @@ static void *listening_get_next(struct seq_file *seq, void *cur) ...@@ -1872,16 +1847,6 @@ static void *listening_get_next(struct seq_file *seq, void *cur)
goto out; goto out;
} }
icsk = inet_csk(sk); icsk = inet_csk(sk);
spin_lock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
if (reqsk_queue_len(&icsk->icsk_accept_queue)) {
start_req:
st->uid = sock_i_uid(sk);
st->syn_wait_sk = sk;
st->state = TCP_SEQ_STATE_OPENREQ;
st->sbucket = 0;
goto get_req;
}
spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
} }
spin_unlock_bh(&ilb->lock); spin_unlock_bh(&ilb->lock);
st->offset = 0; st->offset = 0;
...@@ -2013,7 +1978,6 @@ static void *tcp_seek_last_pos(struct seq_file *seq) ...@@ -2013,7 +1978,6 @@ static void *tcp_seek_last_pos(struct seq_file *seq)
void *rc = NULL; void *rc = NULL;
switch (st->state) { switch (st->state) {
case TCP_SEQ_STATE_OPENREQ:
case TCP_SEQ_STATE_LISTENING: case TCP_SEQ_STATE_LISTENING:
if (st->bucket >= INET_LHTABLE_SIZE) if (st->bucket >= INET_LHTABLE_SIZE)
break; break;
...@@ -2072,7 +2036,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos) ...@@ -2072,7 +2036,6 @@ static void *tcp_seq_next(struct seq_file *seq, void *v, loff_t *pos)
} }
switch (st->state) { switch (st->state) {
case TCP_SEQ_STATE_OPENREQ:
case TCP_SEQ_STATE_LISTENING: case TCP_SEQ_STATE_LISTENING:
rc = listening_get_next(seq, v); rc = listening_get_next(seq, v);
if (!rc) { if (!rc) {
...@@ -2097,11 +2060,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v) ...@@ -2097,11 +2060,6 @@ static void tcp_seq_stop(struct seq_file *seq, void *v)
struct tcp_iter_state *st = seq->private; struct tcp_iter_state *st = seq->private;
switch (st->state) { switch (st->state) {
case TCP_SEQ_STATE_OPENREQ:
if (v) {
struct inet_connection_sock *icsk = inet_csk(st->syn_wait_sk);
spin_unlock_bh(&icsk->icsk_accept_queue.syn_wait_lock);
}
case TCP_SEQ_STATE_LISTENING: case TCP_SEQ_STATE_LISTENING:
if (v != SEQ_START_TOKEN) if (v != SEQ_START_TOKEN)
spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock); spin_unlock_bh(&tcp_hashinfo.listening_hash[st->bucket].lock);
...@@ -2155,7 +2113,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo) ...@@ -2155,7 +2113,7 @@ void tcp_proc_unregister(struct net *net, struct tcp_seq_afinfo *afinfo)
EXPORT_SYMBOL(tcp_proc_unregister); EXPORT_SYMBOL(tcp_proc_unregister);
static void get_openreq4(const struct request_sock *req, static void get_openreq4(const struct request_sock *req,
struct seq_file *f, int i, kuid_t uid) struct seq_file *f, int i)
{ {
const struct inet_request_sock *ireq = inet_rsk(req); const struct inet_request_sock *ireq = inet_rsk(req);
long delta = req->rsk_timer.expires - jiffies; long delta = req->rsk_timer.expires - jiffies;
...@@ -2172,7 +2130,8 @@ static void get_openreq4(const struct request_sock *req, ...@@ -2172,7 +2130,8 @@ static void get_openreq4(const struct request_sock *req,
1, /* timers active (only the expire timer) */ 1, /* timers active (only the expire timer) */
jiffies_delta_to_clock_t(delta), jiffies_delta_to_clock_t(delta),
req->num_timeout, req->num_timeout,
from_kuid_munged(seq_user_ns(f), uid), from_kuid_munged(seq_user_ns(f),
sock_i_uid(req->rsk_listener)),
0, /* non standard timer */ 0, /* non standard timer */
0, /* open_requests have no inode */ 0, /* open_requests have no inode */
0, 0,
...@@ -2273,18 +2232,12 @@ static int tcp4_seq_show(struct seq_file *seq, void *v) ...@@ -2273,18 +2232,12 @@ static int tcp4_seq_show(struct seq_file *seq, void *v)
} }
st = seq->private; st = seq->private;
switch (st->state) { if (sk->sk_state == TCP_TIME_WAIT)
case TCP_SEQ_STATE_LISTENING: get_timewait4_sock(v, seq, st->num);
case TCP_SEQ_STATE_ESTABLISHED: else if (sk->sk_state == TCP_NEW_SYN_RECV)
if (sk->sk_state == TCP_TIME_WAIT) get_openreq4(v, seq, st->num);
get_timewait4_sock(v, seq, st->num); else
else get_tcp4_sock(v, seq, st->num);
get_tcp4_sock(v, seq, st->num);
break;
case TCP_SEQ_STATE_OPENREQ:
get_openreq4(v, seq, st->num, st->uid);
break;
}
out: out:
seq_pad(seq, '\n'); seq_pad(seq, '\n');
return 0; return 0;
......
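With TCP_SEQ_STATE_OPENREQ removed, pending connection requests are no longer walked through a private per-listener table: the /proc/net/tcp iterator now meets them as TCP_NEW_SYN_RECV sockets during the normal hash walk and prints them via get_openreq4(), taking the UID from the listening socket. From userspace the output format is unchanged; the sketch below (illustrative only, not part of the patch) counts those entries by matching the state column, which get_openreq4() still emits as TCP_SYN_RECV (0x03):

    /* Minimal reader for /proc/net/tcp: count entries whose state column
     * is TCP_SYN_RECV (0x03), i.e. pending connection requests.
     * Assumed field layout: "sl local_address rem_address st ...".
     */
    #include <stdio.h>

    int main(void)
    {
    	FILE *f = fopen("/proc/net/tcp", "r");
    	char line[512];
    	unsigned int count = 0;

    	if (!f) {
    		perror("/proc/net/tcp");
    		return 1;
    	}
    	if (!fgets(line, sizeof(line), f))	/* skip the header line */
    		goto out;
    	while (fgets(line, sizeof(line), f)) {
    		char local[64], remote[64];
    		unsigned int st;

    		if (sscanf(line, "%*d: %63s %63s %x", local, remote, &st) == 3 &&
    		    st == 0x03)
    			count++;
    	}
    out:
    	fclose(f);
    	printf("SYN_RECV entries: %u\n", count);
    	return 0;
    }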
...@@ -578,8 +578,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb, ...@@ -578,8 +578,6 @@ struct sock *tcp_check_req(struct sock *sk, struct sk_buff *skb,
__be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); __be32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK);
bool paws_reject = false; bool paws_reject = false;
BUG_ON(fastopen == (sk->sk_state == TCP_LISTEN));
tmp_opt.saw_tstamp = 0; tmp_opt.saw_tstamp = 0;
if (th->doff > (sizeof(struct tcphdr)>>2)) { if (th->doff > (sizeof(struct tcphdr)>>2)) {
tcp_parse_options(skb, &tmp_opt, 0, NULL); tcp_parse_options(skb, &tmp_opt, 0, NULL);
......
...@@ -2947,7 +2947,8 @@ int tcp_send_synack(struct sock *sk) ...@@ -2947,7 +2947,8 @@ int tcp_send_synack(struct sock *sk)
*/ */
struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
struct request_sock *req, struct request_sock *req,
struct tcp_fastopen_cookie *foc) struct tcp_fastopen_cookie *foc,
bool attach_req)
{ {
struct inet_request_sock *ireq = inet_rsk(req); struct inet_request_sock *ireq = inet_rsk(req);
const struct tcp_sock *tp = tcp_sk(sk); const struct tcp_sock *tp = tcp_sk(sk);
...@@ -2959,11 +2960,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -2959,11 +2960,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
u16 user_mss; u16 user_mss;
int mss; int mss;
/* sk is a const pointer, because we want to express multiple cpus skb = alloc_skb(MAX_TCP_HEADER, GFP_ATOMIC);
* might call us concurrently.
* sock_wmalloc() will change sk->sk_wmem_alloc in an atomic way.
*/
skb = sock_wmalloc((struct sock *)sk, MAX_TCP_HEADER, 1, GFP_ATOMIC);
if (unlikely(!skb)) { if (unlikely(!skb)) {
dst_release(dst); dst_release(dst);
return NULL; return NULL;
...@@ -2971,6 +2968,17 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -2971,6 +2968,17 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
/* Reserve space for headers. */ /* Reserve space for headers. */
skb_reserve(skb, MAX_TCP_HEADER); skb_reserve(skb, MAX_TCP_HEADER);
if (attach_req) {
skb->destructor = sock_edemux;
sock_hold(req_to_sk(req));
skb->sk = req_to_sk(req);
} else {
/* sk is a const pointer, because we want to express that multiple
* CPUs might call us concurrently.
* sk->sk_wmem_alloc is atomic, so we can safely promote sk to rw here.
*/
skb_set_owner_w(skb, (struct sock *)sk);
}
skb_dst_set(skb, dst); skb_dst_set(skb, dst);
mss = dst_metric_advmss(dst); mss = dst_metric_advmss(dst);
...@@ -3510,7 +3518,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req) ...@@ -3510,7 +3518,7 @@ int tcp_rtx_synack(const struct sock *sk, struct request_sock *req)
int res; int res;
tcp_rsk(req)->txhash = net_tx_rndhash(); tcp_rsk(req)->txhash = net_tx_rndhash();
res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL); res = af_ops->send_synack(sk, NULL, &fl, req, 0, NULL, true);
if (!res) { if (!res) {
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS); TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_RETRANSSEGS);
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPSYNRETRANS);
......
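In the attach_req case, the SYNACK no longer charges the listener via sock_wmalloc(): the skb takes its own reference on the request socket (sock_hold()) and releases it from its destructor (sock_edemux), leaving the listener's sk_wmem_alloc untouched. The toy userspace model below (names are illustrative, not kernel APIs) just walks through that hold-on-attach / release-in-destructor lifecycle:

    #include <stdio.h>

    /* Toy model: the SYNACK skb pins the request socket with its own
     * reference and releases it when the skb is destroyed.
     */
    struct toy_req {
    	int refcnt;
    };

    struct toy_skb {
    	struct toy_req *owner;
    	void (*destructor)(struct toy_skb *skb);
    };

    static void toy_edemux(struct toy_skb *skb)
    {
    	/* freeing the skb drops the reference taken when it was attached */
    	if (--skb->owner->refcnt == 0)
    		printf("request socket can now be freed\n");
    }

    int main(void)
    {
    	struct toy_req req = { .refcnt = 1 };		/* ref held by the hash table */
    	struct toy_skb synack = { .owner = &req, .destructor = toy_edemux };

    	req.refcnt++;					/* "sock_hold()" at attach time */
    	printf("SYNACK in flight, refcnt=%d\n", req.refcnt);

    	synack.destructor(&synack);			/* TX completion frees the skb */
    	printf("after skb free, refcnt=%d\n", req.refcnt);
    	return 0;
    }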
...@@ -94,73 +94,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk, ...@@ -94,73 +94,6 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
} }
EXPORT_SYMBOL(inet6_csk_route_req); EXPORT_SYMBOL(inet6_csk_route_req);
/*
* request_sock (formerly open request) hash tables.
*/
static u32 inet6_synq_hash(const struct in6_addr *raddr, const __be16 rport,
const u32 rnd, const u32 synq_hsize)
{
u32 c;
c = jhash_3words((__force u32)raddr->s6_addr32[0],
(__force u32)raddr->s6_addr32[1],
(__force u32)raddr->s6_addr32[2],
rnd);
c = jhash_2words((__force u32)raddr->s6_addr32[3],
(__force u32)rport,
c);
return c & (synq_hsize - 1);
}
struct request_sock *inet6_csk_search_req(struct sock *sk,
const __be16 rport,
const struct in6_addr *raddr,
const struct in6_addr *laddr,
const int iif)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
struct request_sock *req;
u32 hash = inet6_synq_hash(raddr, rport, lopt->hash_rnd,
lopt->nr_table_entries);
spin_lock(&icsk->icsk_accept_queue.syn_wait_lock);
for (req = lopt->syn_table[hash]; req != NULL; req = req->dl_next) {
const struct inet_request_sock *ireq = inet_rsk(req);
if (ireq->ir_rmt_port == rport &&
req->rsk_ops->family == AF_INET6 &&
ipv6_addr_equal(&ireq->ir_v6_rmt_addr, raddr) &&
ipv6_addr_equal(&ireq->ir_v6_loc_addr, laddr) &&
(!ireq->ir_iif || ireq->ir_iif == iif)) {
atomic_inc(&req->rsk_refcnt);
WARN_ON(req->sk != NULL);
break;
}
}
spin_unlock(&icsk->icsk_accept_queue.syn_wait_lock);
return req;
}
EXPORT_SYMBOL_GPL(inet6_csk_search_req);
void inet6_csk_reqsk_queue_hash_add(struct sock *sk,
struct request_sock *req,
const unsigned long timeout)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt;
const u32 h = inet6_synq_hash(&inet_rsk(req)->ir_v6_rmt_addr,
inet_rsk(req)->ir_rmt_port,
lopt->hash_rnd, lopt->nr_table_entries);
reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout);
inet_csk_reqsk_queue_added(sk, timeout);
}
EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add);
void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
{ {
struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr;
......
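The deleted inet6_synq_hash() selected a bucket in the per-listener SYN table by jhashing the peer address and port and masking with (nr_table_entries - 1), which only works because the table size is a power of two; that private table is gone now that requests live in the regular hash. A standalone sketch of the mask-as-modulo idiom, with a toy mixer standing in for jhash (only the bucket selection is the point here, not the real hash function):

    #include <stdio.h>
    #include <stdint.h>

    /* Toy mixer in place of jhash_3words()/jhash_2words(). */
    static uint32_t toy_mix(uint32_t a, uint32_t b, uint32_t seed)
    {
    	uint32_t h = seed ^ a;

    	h ^= b + 0x9e3779b9u + (h << 6) + (h >> 2);
    	return h;
    }

    int main(void)
    {
    	const uint32_t nr_table_entries = 512;	/* always a power of two */
    	uint32_t h = toy_mix(0xfe800001u, 443, 0x12345678u);

    	/* for power-of-two sizes, h & (size - 1) == h % size */
    	printf("bucket via mask:   %u\n", h & (nr_table_entries - 1));
    	printf("bucket via modulo: %u\n", h % nr_table_entries);
    	return 0;
    }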
...@@ -438,7 +438,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -438,7 +438,8 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
struct flowi *fl, struct flowi *fl,
struct request_sock *req, struct request_sock *req,
u16 queue_mapping, u16 queue_mapping,
struct tcp_fastopen_cookie *foc) struct tcp_fastopen_cookie *foc,
bool attach_req)
{ {
struct inet_request_sock *ireq = inet_rsk(req); struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk);
...@@ -451,7 +452,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -451,7 +452,7 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
IPPROTO_TCP)) == NULL) IPPROTO_TCP)) == NULL)
goto done; goto done;
skb = tcp_make_synack(sk, dst, req, foc); skb = tcp_make_synack(sk, dst, req, foc, attach_req);
if (skb) { if (skb) {
__tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr, __tcp_v6_send_check(skb, &ireq->ir_v6_loc_addr,
...@@ -622,8 +623,12 @@ static int tcp_v6_md5_hash_skb(char *md5_hash, ...@@ -622,8 +623,12 @@ static int tcp_v6_md5_hash_skb(char *md5_hash,
return 1; return 1;
} }
static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) #endif
static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
const struct sk_buff *skb)
{ {
#ifdef CONFIG_TCP_MD5SIG
const __u8 *hash_location = NULL; const __u8 *hash_location = NULL;
struct tcp_md5sig_key *hash_expected; struct tcp_md5sig_key *hash_expected;
const struct ipv6hdr *ip6h = ipv6_hdr(skb); const struct ipv6hdr *ip6h = ipv6_hdr(skb);
...@@ -660,9 +665,9 @@ static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb) ...@@ -660,9 +665,9 @@ static bool tcp_v6_inbound_md5_hash(struct sock *sk, const struct sk_buff *skb)
&ip6h->daddr, ntohs(th->dest)); &ip6h->daddr, ntohs(th->dest));
return true; return true;
} }
#endif
return false; return false;
} }
#endif
static void tcp_v6_init_req(struct request_sock *req, static void tcp_v6_init_req(struct request_sock *req,
const struct sock *sk_listener, const struct sock *sk_listener,
...@@ -723,7 +728,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = { ...@@ -723,7 +728,6 @@ static const struct tcp_request_sock_ops tcp_request_sock_ipv6_ops = {
.route_req = tcp_v6_route_req, .route_req = tcp_v6_route_req,
.init_seq = tcp_v6_init_sequence, .init_seq = tcp_v6_init_sequence,
.send_synack = tcp_v6_send_synack, .send_synack = tcp_v6_send_synack,
.queue_hash_add = inet6_csk_reqsk_queue_hash_add,
}; };
static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq, static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 seq,
...@@ -934,37 +938,11 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb, ...@@ -934,37 +938,11 @@ static void tcp_v6_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
} }
static struct sock *tcp_v6_hnd_req(struct sock *sk, struct sk_buff *skb) static struct sock *tcp_v6_cookie_check(struct sock *sk, struct sk_buff *skb)
{ {
#ifdef CONFIG_SYN_COOKIES
const struct tcphdr *th = tcp_hdr(skb); const struct tcphdr *th = tcp_hdr(skb);
struct request_sock *req;
struct sock *nsk;
/* Find possible connection requests. */
req = inet6_csk_search_req(sk, th->source,
&ipv6_hdr(skb)->saddr,
&ipv6_hdr(skb)->daddr, tcp_v6_iif(skb));
if (req) {
nsk = tcp_check_req(sk, skb, req, false);
if (!nsk || nsk == sk)
reqsk_put(req);
return nsk;
}
nsk = __inet6_lookup_established(sock_net(sk), &tcp_hashinfo,
&ipv6_hdr(skb)->saddr, th->source,
&ipv6_hdr(skb)->daddr, ntohs(th->dest),
tcp_v6_iif(skb));
if (nsk) {
if (nsk->sk_state != TCP_TIME_WAIT) {
bh_lock_sock(nsk);
return nsk;
}
inet_twsk_put(inet_twsk(nsk));
return NULL;
}
#ifdef CONFIG_SYN_COOKIES
if (!th->syn) if (!th->syn)
sk = cookie_v6_check(sk, skb); sk = cookie_v6_check(sk, skb);
#endif #endif
...@@ -1183,7 +1161,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * ...@@ -1183,7 +1161,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
} }
/* The socket must have its spinlock held when we get /* The socket must have its spinlock held when we get
* here. * here, unless it is a TCP_LISTEN socket.
* *
* We have a potential double-lock case here, so even when * We have a potential double-lock case here, so even when
* doing backlog processing we use the BH locking scheme. * doing backlog processing we use the BH locking scheme.
...@@ -1254,18 +1232,14 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) ...@@ -1254,18 +1232,14 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
goto csum_err; goto csum_err;
if (sk->sk_state == TCP_LISTEN) { if (sk->sk_state == TCP_LISTEN) {
struct sock *nsk = tcp_v6_hnd_req(sk, skb); struct sock *nsk = tcp_v6_cookie_check(sk, skb);
if (!nsk) if (!nsk)
goto discard; goto discard;
/*
* Queue it on the new socket if the new socket is active,
* otherwise we just shortcircuit this and continue with
* the new socket..
*/
if (nsk != sk) { if (nsk != sk) {
sock_rps_save_rxhash(nsk, skb); sock_rps_save_rxhash(nsk, skb);
sk_mark_napi_id(sk, skb); sk_mark_napi_id(nsk, skb);
if (tcp_child_process(sk, nsk, skb)) if (tcp_child_process(sk, nsk, skb))
goto reset; goto reset;
if (opt_skb) if (opt_skb)
...@@ -1398,6 +1372,33 @@ static int tcp_v6_rcv(struct sk_buff *skb) ...@@ -1398,6 +1372,33 @@ static int tcp_v6_rcv(struct sk_buff *skb)
if (sk->sk_state == TCP_TIME_WAIT) if (sk->sk_state == TCP_TIME_WAIT)
goto do_time_wait; goto do_time_wait;
if (sk->sk_state == TCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
struct sock *nsk = NULL;
sk = req->rsk_listener;
tcp_v6_fill_cb(skb, hdr, th);
if (tcp_v6_inbound_md5_hash(sk, skb)) {
reqsk_put(req);
goto discard_it;
}
if (sk->sk_state == TCP_LISTEN)
nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
goto discard_it;
}
if (nsk == sk) {
sock_hold(sk);
reqsk_put(req);
tcp_v6_restore_cb(skb);
} else if (tcp_child_process(sk, nsk, skb)) {
tcp_v6_send_reset(nsk, skb);
goto discard_it;
} else {
return 0;
}
}
if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) { if (hdr->hop_limit < inet6_sk(sk)->min_hopcount) {
NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP); NET_INC_STATS_BH(net, LINUX_MIB_TCPMINTTLDROP);
goto discard_and_relse; goto discard_and_relse;
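Taken together with the LISTEN fast path added in the next hunk, the receive path now branches on sk->sk_state before doing any per-socket locking: TIME_WAIT and NEW_SYN_RECV get dedicated handling, listeners are processed without bh_lock_sock(), and only full sockets fall through to the locked path. The toy program below (state values and handler strings are illustrative only, not the kernel's) mirrors that dispatch order:

    #include <stdio.h>

    /* Toy mirror of the receive-path dispatch order. */
    enum toy_state { TOY_ESTABLISHED, TOY_TIME_WAIT, TOY_NEW_SYN_RECV, TOY_LISTEN };

    static const char *toy_dispatch(enum toy_state st)
    {
    	switch (st) {
    	case TOY_TIME_WAIT:
    		return "timewait handling";
    	case TOY_NEW_SYN_RECV:
    		return "validate against rsk_listener, maybe create a child socket";
    	case TOY_LISTEN:
    		return "lockless listener path (no bh_lock_sock)";
    	default:
    		return "locked per-socket receive";
    	}
    }

    int main(void)
    {
    	const enum toy_state samples[] = {
    		TOY_TIME_WAIT, TOY_NEW_SYN_RECV, TOY_LISTEN, TOY_ESTABLISHED,
    	};
    	unsigned int i;

    	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
    		printf("state %u -> %s\n", (unsigned int)samples[i],
    		       toy_dispatch(samples[i]));
    	return 0;
    }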
...@@ -1408,17 +1409,21 @@ static int tcp_v6_rcv(struct sk_buff *skb) ...@@ -1408,17 +1409,21 @@ static int tcp_v6_rcv(struct sk_buff *skb)
tcp_v6_fill_cb(skb, hdr, th); tcp_v6_fill_cb(skb, hdr, th);
#ifdef CONFIG_TCP_MD5SIG
if (tcp_v6_inbound_md5_hash(sk, skb)) if (tcp_v6_inbound_md5_hash(sk, skb))
goto discard_and_relse; goto discard_and_relse;
#endif
if (sk_filter(sk, skb)) if (sk_filter(sk, skb))
goto discard_and_relse; goto discard_and_relse;
sk_incoming_cpu_update(sk);
skb->dev = NULL; skb->dev = NULL;
if (sk->sk_state == TCP_LISTEN) {
ret = tcp_v6_do_rcv(sk, skb);
goto put_and_return;
}
sk_incoming_cpu_update(sk);
bh_lock_sock_nested(sk); bh_lock_sock_nested(sk);
tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs); tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
ret = 0; ret = 0;
...@@ -1433,6 +1438,7 @@ static int tcp_v6_rcv(struct sk_buff *skb) ...@@ -1433,6 +1438,7 @@ static int tcp_v6_rcv(struct sk_buff *skb)
} }
bh_unlock_sock(sk); bh_unlock_sock(sk);
put_and_return:
sock_put(sk); sock_put(sk);
return ret ? -1 : 0; return ret ? -1 : 0;
...@@ -1633,7 +1639,7 @@ static void tcp_v6_destroy_sock(struct sock *sk) ...@@ -1633,7 +1639,7 @@ static void tcp_v6_destroy_sock(struct sock *sk)
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
/* Proc filesystem TCPv6 sock list dumping. */ /* Proc filesystem TCPv6 sock list dumping. */
static void get_openreq6(struct seq_file *seq, static void get_openreq6(struct seq_file *seq,
struct request_sock *req, int i, kuid_t uid) const struct request_sock *req, int i)
{ {
long ttd = req->rsk_timer.expires - jiffies; long ttd = req->rsk_timer.expires - jiffies;
const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr; const struct in6_addr *src = &inet_rsk(req)->ir_v6_loc_addr;
...@@ -1657,7 +1663,8 @@ static void get_openreq6(struct seq_file *seq, ...@@ -1657,7 +1663,8 @@ static void get_openreq6(struct seq_file *seq,
1, /* timers active (only the expire timer) */ 1, /* timers active (only the expire timer) */
jiffies_to_clock_t(ttd), jiffies_to_clock_t(ttd),
req->num_timeout, req->num_timeout,
from_kuid_munged(seq_user_ns(seq), uid), from_kuid_munged(seq_user_ns(seq),
sock_i_uid(req->rsk_listener)),
0, /* non standard timer */ 0, /* non standard timer */
0, /* open_requests have no inode */ 0, /* open_requests have no inode */
0, req); 0, req);
...@@ -1762,18 +1769,12 @@ static int tcp6_seq_show(struct seq_file *seq, void *v) ...@@ -1762,18 +1769,12 @@ static int tcp6_seq_show(struct seq_file *seq, void *v)
} }
st = seq->private; st = seq->private;
switch (st->state) { if (sk->sk_state == TCP_TIME_WAIT)
case TCP_SEQ_STATE_LISTENING: get_timewait6_sock(seq, v, st->num);
case TCP_SEQ_STATE_ESTABLISHED: else if (sk->sk_state == TCP_NEW_SYN_RECV)
if (sk->sk_state == TCP_TIME_WAIT) get_openreq6(seq, v, st->num);
get_timewait6_sock(seq, v, st->num); else
else get_tcp6_sock(seq, v, st->num);
get_tcp6_sock(seq, v, st->num);
break;
case TCP_SEQ_STATE_OPENREQ:
get_openreq6(seq, v, st->num, st->uid);
break;
}
out: out:
return 0; return 0;
} }
......
...@@ -224,13 +224,15 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) ...@@ -224,13 +224,15 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL)) if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
return &q->internal; return &q->internal;
/* SYNACK messages are attached to a listener socket. /* SYNACK messages are attached to a TCP_NEW_SYN_RECV request socket
* 1) They are not part of a 'flow' yet * 1) request sockets are not full blown,
* 2) We do not want to rate limit them (eg SYNFLOOD attack), * they do not contain sk_pacing_rate
* 2) They are not part of a 'flow' yet
* 3) We do not want to rate limit them (eg SYNFLOOD attack),
* especially if the listener set SO_MAX_PACING_RATE * especially if the listener set SO_MAX_PACING_RATE
* 3) We pretend they are orphaned * 4) We pretend they are orphaned
*/ */
if (!sk || sk->sk_state == TCP_LISTEN) { if (!sk || sk->sk_state == TCP_NEW_SYN_RECV) {
unsigned long hash = skb_get_hash(skb) & q->orphan_mask; unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
/* By forcing low order bit to 1, we make sure to not /* By forcing low order bit to 1, we make sure to not
......
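With SYNACKs now carrying a TCP_NEW_SYN_RECV skb->sk, fq classifies them like orphans: the flow key comes from skb_get_hash() masked by orphan_mask. The hunk is cut off before the rest of the comment, but the idea it leads into is to fold that bucket index into a fake, odd "socket pointer" that can never collide with a real, word-aligned struct sock address. A self-contained sketch of that trick (the exact expression and the 1024-bucket default are assumptions, not quoted from the patch):

    #include <stdio.h>

    int main(void)
    {
    	/* 1024 orphan buckets assumed here (mask = 1023); the real value
    	 * comes from the qdisc configuration.
    	 */
    	const unsigned long orphan_mask = 1023;
    	const unsigned long hash = 0x9e3779b9ul;	/* stand-in for skb_get_hash() */
    	unsigned long fake_key;

    	/* shift the bucket index left and force the low bit to 1: real
    	 * struct sock pointers are word aligned (low bits zero), so this
    	 * fake key can never alias one of them.
    	 */
    	fake_key = ((hash & orphan_mask) << 1) | 1ul;

    	printf("bucket=%lu fake_key=%#lx low_bit=%lu\n",
    	       hash & orphan_mask, fake_key, fake_key & 1ul);
    	return 0;
    }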