Commit 4373a5e2 authored by David S. Miller

Merge branch 'packet-DDOS'

Eric Dumazet says:

====================
net/packet: better behavior under DDOS

Using tcpdump (or other af_packet user) on a busy host can lead to
catastrophic consequences, because suddenly, potentially all cpus
are spinning on a contended spinlock.

Both packet_rcv() and tpacket_rcv() grab the spinlock
to eventually find there is no room for an additional packet.

This patch series aligns packet_rcv() and tpacket_rcv(): both now
check whether the queue is full before grabbing the spinlock.

If the queue is full, they both increment a new atomic counter
placed on a separate cache line to let readers drain the queue faster.

There is still false sharing on this new atomic counter;
we might make it per-cpu in the future if there is interest.
====================
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parents f30e33bc 9bb6cd65
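The pattern the cover letter describes can be shown with a short, self-contained C11 sketch before reading the diff. This is not kernel code: the ring layout and the names ring_has_room() and ring_produce() are illustrative only, with a pthread mutex standing in for the receive-queue spinlock and C11 atomics standing in for atomic_t and READ_ONCE().

#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>

struct rx_ring {
        pthread_mutex_t lock;   /* stands in for sk_receive_queue.lock */
        atomic_uint head;       /* producer index */
        atomic_uint tail;       /* consumer index */
        unsigned int size;
        /* The drop counter gets its own cache line, as tp_drops does via
         * ____cacheline_aligned_in_smp, so flooded producers do not keep
         * dirtying the line that holds the indices the reader polls. */
        _Alignas(64) atomic_uint drops;
};

/* Lockless room check, in the spirit of __packet_rcv_has_room(): the
 * loads may observe stale values, but a stale "full" only drops a packet
 * that was almost certainly doomed anyway, and a stale "room" is
 * re-checked under the lock before any slot is consumed. */
static bool ring_has_room(struct rx_ring *r)
{
        return atomic_load(&r->head) - atomic_load(&r->tail) < r->size;
}

static bool ring_produce(struct rx_ring *r, unsigned int *slot)
{
        /* Fast path: when flooded, give up before touching the lock, so
         * the reader is not slowed down by a convoy of spinning writers. */
        if (!ring_has_room(r)) {
                atomic_fetch_add(&r->drops, 1);
                return false;
        }
        pthread_mutex_lock(&r->lock);
        if (!ring_has_room(r)) {        /* re-check under the lock */
                pthread_mutex_unlock(&r->lock);
                atomic_fetch_add(&r->drops, 1);
                return false;
        }
        *slot = atomic_fetch_add(&r->head, 1) % r->size;
        pthread_mutex_unlock(&r->lock);
        return true;
}

In the patches below, the same shape appears as a __packet_rcv_has_room() call ahead of spin_lock() in tpacket_rcv(), with failures accounted in po->tp_drops via atomic_inc().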
net/packet/af_packet.c
@@ -384,7 +384,7 @@ static void __packet_set_status(struct packet_sock *po, void *frame, int status)
        smp_wmb();
 }
 
-static int __packet_get_status(struct packet_sock *po, void *frame)
+static int __packet_get_status(const struct packet_sock *po, void *frame)
 {
        union tpacket_uhdr h;
@@ -460,10 +460,10 @@ static __u32 __packet_set_timestamp(struct packet_sock *po, void *frame,
        return ts_status;
 }
 
-static void *packet_lookup_frame(struct packet_sock *po,
-                                 struct packet_ring_buffer *rb,
-                                 unsigned int position,
-                                 int status)
+static void *packet_lookup_frame(const struct packet_sock *po,
+                                 const struct packet_ring_buffer *rb,
+                                 unsigned int position,
+                                 int status)
 {
        unsigned int pg_vec_pos, frame_offset;
        union tpacket_uhdr h;
@@ -758,7 +758,7 @@ static void prb_close_block(struct tpacket_kbdq_core *pkc1,
        struct tpacket_hdr_v1 *h1 = &pbd1->hdr.bh1;
        struct sock *sk = &po->sk;
 
-       if (po->stats.stats3.tp_drops)
+       if (atomic_read(&po->tp_drops))
                status |= TP_STATUS_LOSING;
 
        last_pkt = (struct tpacket3_hdr *)pkc1->prev;
@@ -1082,10 +1082,10 @@ static void *packet_current_rx_frame(struct packet_sock *po,
        }
 }
 
-static void *prb_lookup_block(struct packet_sock *po,
-                              struct packet_ring_buffer *rb,
-                              unsigned int idx,
-                              int status)
+static void *prb_lookup_block(const struct packet_sock *po,
+                              const struct packet_ring_buffer *rb,
+                              unsigned int idx,
+                              int status)
 {
        struct tpacket_kbdq_core *pkc = GET_PBDQC_FROM_RB(rb);
        struct tpacket_block_desc *pbd = GET_PBLOCK_DESC(pkc, idx);
@@ -1198,12 +1198,12 @@ static void packet_free_pending(struct packet_sock *po)
 #define ROOM_LOW       0x1
 #define ROOM_NORMAL    0x2
 
-static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
+static bool __tpacket_has_room(const struct packet_sock *po, int pow_off)
 {
        int idx, len;
 
-       len = po->rx_ring.frame_max + 1;
-       idx = po->rx_ring.head;
+       len = READ_ONCE(po->rx_ring.frame_max) + 1;
+       idx = READ_ONCE(po->rx_ring.head);
        if (pow_off)
                idx += len >> pow_off;
        if (idx >= len)
@@ -1211,12 +1211,12 @@ static bool __tpacket_has_room(struct packet_sock *po, int pow_off)
        return packet_lookup_frame(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
 }
 
-static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
+static bool __tpacket_v3_has_room(const struct packet_sock *po, int pow_off)
 {
        int idx, len;
 
-       len = po->rx_ring.prb_bdqc.knum_blocks;
-       idx = po->rx_ring.prb_bdqc.kactive_blk_num;
+       len = READ_ONCE(po->rx_ring.prb_bdqc.knum_blocks);
+       idx = READ_ONCE(po->rx_ring.prb_bdqc.kactive_blk_num);
        if (pow_off)
                idx += len >> pow_off;
        if (idx >= len)
@@ -1224,15 +1224,18 @@ static bool __tpacket_v3_has_room(struct packet_sock *po, int pow_off)
        return prb_lookup_block(po, &po->rx_ring, idx, TP_STATUS_KERNEL);
 }
 
-static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
+static int __packet_rcv_has_room(const struct packet_sock *po,
+                                 const struct sk_buff *skb)
 {
-       struct sock *sk = &po->sk;
+       const struct sock *sk = &po->sk;
        int ret = ROOM_NONE;
 
        if (po->prot_hook.func != tpacket_rcv) {
-               int avail = sk->sk_rcvbuf - atomic_read(&sk->sk_rmem_alloc)
-                                         - (skb ? skb->truesize : 0);
-               if (avail > (sk->sk_rcvbuf >> ROOM_POW_OFF))
+               int rcvbuf = READ_ONCE(sk->sk_rcvbuf);
+               int avail = rcvbuf - atomic_read(&sk->sk_rmem_alloc)
+                                  - (skb ? skb->truesize : 0);
+
+               if (avail > (rcvbuf >> ROOM_POW_OFF))
                        return ROOM_NORMAL;
                else if (avail > 0)
                        return ROOM_LOW;
@@ -1257,19 +1260,24 @@ static int __packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
 
 static int packet_rcv_has_room(struct packet_sock *po, struct sk_buff *skb)
 {
-       int ret;
-       bool has_room;
+       int pressure, ret;
 
-       spin_lock_bh(&po->sk.sk_receive_queue.lock);
        ret = __packet_rcv_has_room(po, skb);
-       has_room = ret == ROOM_NORMAL;
-       if (po->pressure == has_room)
-               po->pressure = !has_room;
-       spin_unlock_bh(&po->sk.sk_receive_queue.lock);
+       pressure = ret != ROOM_NORMAL;
+
+       if (READ_ONCE(po->pressure) != pressure)
+               WRITE_ONCE(po->pressure, pressure);
 
        return ret;
 }
 
+static void packet_rcv_try_clear_pressure(struct packet_sock *po)
+{
+       if (READ_ONCE(po->pressure) &&
+           __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
+               WRITE_ONCE(po->pressure, 0);
+}
+
 static void packet_sock_destruct(struct sock *sk)
 {
        skb_queue_purge(&sk->sk_error_queue);
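The rewritten packet_rcv_has_room() above also stops taking the receive-queue lock just to maintain po->pressure: the flag is read first and written only when it actually changes, so the steady state never dirties the cache line. A minimal sketch of this check-before-write idiom, with relaxed C11 atomics standing in for the kernel's READ_ONCE()/WRITE_ONCE() (function name is illustrative):

#include <stdatomic.h>

/* Write the flag only on change: an unconditional store would move the
 * cache line to "modified" on every call and force other CPUs to refetch
 * it, even when the stored value is identical. */
static void update_pressure(atomic_int *pressure, int new_val)
{
        if (atomic_load_explicit(pressure, memory_order_relaxed) != new_val)
                atomic_store_explicit(pressure, new_val, memory_order_relaxed);
}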
@@ -1350,7 +1358,7 @@ static unsigned int fanout_demux_rollover(struct packet_fanout *f,
        i = j = min_t(int, po->rollover->sock, num - 1);
        do {
                po_next = pkt_sk(f->arr[i]);
-               if (po_next != po_skip && !po_next->pressure &&
+               if (po_next != po_skip && !READ_ONCE(po_next->pressure) &&
                    packet_rcv_has_room(po_next, skb) == ROOM_NORMAL) {
                        if (i != j)
                                po->rollover->sock = i;
@@ -2125,10 +2133,8 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
 
 drop_n_acct:
        is_drop_n_account = true;
-       spin_lock(&sk->sk_receive_queue.lock);
-       po->stats.stats1.tp_drops++;
+       atomic_inc(&po->tp_drops);
        atomic_inc(&sk->sk_drops);
-       spin_unlock(&sk->sk_receive_queue.lock);
 
 drop_n_restore:
        if (skb_head != skb->data && skb_shared(skb)) {
@@ -2192,6 +2198,12 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
        if (!res)
                goto drop_n_restore;
 
+       /* If we are flooded, just give up */
+       if (__packet_rcv_has_room(po, skb) == ROOM_NONE) {
+               atomic_inc(&po->tp_drops);
+               goto drop_n_restore;
+       }
+
        if (skb->ip_summed == CHECKSUM_PARTIAL)
                status |= TP_STATUS_CSUMNOTREADY;
        else if (skb->pkt_type != PACKET_OUTGOING &&
@@ -2262,7 +2274,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
         * Anyways, moving it for V1/V2 only as V3 doesn't need this
         * at packet level.
         */
-               if (po->stats.stats1.tp_drops)
+               if (atomic_read(&po->tp_drops))
                        status |= TP_STATUS_LOSING;
        }
@@ -2378,9 +2390,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
        return 0;
 
 drop_n_account:
-       is_drop_n_account = true;
-       po->stats.stats1.tp_drops++;
        spin_unlock(&sk->sk_receive_queue.lock);
+       atomic_inc(&po->tp_drops);
 
+       is_drop_n_account = true;
        sk->sk_data_ready(sk);
        kfree_skb(copy_skb);
@@ -3303,8 +3315,7 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
        if (skb == NULL)
                goto out;
 
-       if (pkt_sk(sk)->pressure)
-               packet_rcv_has_room(pkt_sk(sk), NULL);
+       packet_rcv_try_clear_pressure(pkt_sk(sk));
 
        if (pkt_sk(sk)->has_vnet_hdr) {
                err = packet_rcv_vnet(msg, skb, &len);
@@ -3876,6 +3887,7 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
        void *data = &val;
        union tpacket_stats_u st;
        struct tpacket_rollover_stats rstats;
+       int drops;
 
        if (level != SOL_PACKET)
                return -ENOPROTOOPT;
@@ -3892,14 +3904,17 @@ static int packet_getsockopt(struct socket *sock, int level, int optname,
                memcpy(&st, &po->stats, sizeof(st));
                memset(&po->stats, 0, sizeof(po->stats));
                spin_unlock_bh(&sk->sk_receive_queue.lock);
+               drops = atomic_xchg(&po->tp_drops, 0);
 
                if (po->tp_version == TPACKET_V3) {
                        lv = sizeof(struct tpacket_stats_v3);
-                       st.stats3.tp_packets += st.stats3.tp_drops;
+                       st.stats3.tp_drops = drops;
+                       st.stats3.tp_packets += drops;
                        data = &st.stats3;
                } else {
                        lv = sizeof(struct tpacket_stats);
-                       st.stats1.tp_packets += st.stats1.tp_drops;
+                       st.stats1.tp_drops = drops;
+                       st.stats1.tp_packets += drops;
                        data = &st.stats1;
                }
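The statistics read-out above (getsockopt PACKET_STATISTICS) moves drop accounting out of the locked section by swapping the counter to zero with atomic_xchg(). Because the read and the reset happen in one atomic step, an increment that races with getsockopt() lands either in this sample or in the next one; none is lost. A user-space analogue (function name is illustrative):

#include <stdatomic.h>

/* Equivalent of "drops = atomic_xchg(&po->tp_drops, 0)": return the
 * accumulated count and reset it in a single indivisible operation. */
static unsigned int drops_read_and_reset(atomic_uint *drops)
{
        return atomic_exchange(drops, 0);
}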
@@ -4118,8 +4133,7 @@ static __poll_t packet_poll(struct file *file, struct socket *sock,
                                                TP_STATUS_KERNEL))
                        mask |= EPOLLIN | EPOLLRDNORM;
        }
-       if (po->pressure && __packet_rcv_has_room(po, NULL) == ROOM_NORMAL)
-               po->pressure = 0;
+       packet_rcv_try_clear_pressure(po);
        spin_unlock_bh(&sk->sk_receive_queue.lock);
        spin_lock_bh(&sk->sk_write_queue.lock);
        if (po->tx_ring.pg_vec) {
net/packet/internal.h
@@ -131,6 +131,7 @@ struct packet_sock {
        struct net_device __rcu *cached_dev;
        int                     (*xmit)(struct sk_buff *skb);
        struct packet_type      prot_hook ____cacheline_aligned_in_smp;
+       atomic_t                tp_drops ____cacheline_aligned_in_smp;
 };
 
 static struct packet_sock *pkt_sk(struct sock *sk)
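____cacheline_aligned_in_smp places tp_drops at the start of its own cache line, so CPUs that are dropping packets contend on that line alone and never invalidate the line holding prot_hook. The cover letter's caveat still applies: all dropping CPUs share this one line among themselves, which a per-cpu counter would avoid. A quick user-space way to see the effect of such alignment (64-byte line assumed; struct and field names are illustrative, not the kernel's):

#include <stdalign.h>
#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

struct pkt_sock_like {
        long hot_read_mostly;                   /* fields readers touch */
        alignas(64) atomic_uint tp_drops;       /* pushed to its own line */
};

int main(void)
{
        /* Prints 64: the counter starts a fresh cache line, so updates
         * to it cannot falsely invalidate hot_read_mostly's line. */
        printf("tp_drops offset = %zu\n",
               offsetof(struct pkt_sock_like, tp_drops));
        return 0;
}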