Commit f1fdee33 authored by Andrii Nakryiko's avatar Andrii Nakryiko

Merge branch 'sockmap fixes picked up by stress tests'

John Fastabend says:

====================

Running stress tests with recent patch to remove an extra lock in sockmap
resulted in a couple new issues popping up. It seems only one of them
is actually related to the patch:

799aa7f9 ("skmsg: Avoid lock_sock() in sk_psock_backlog()")

The other two issues had existed long before, but I guess the timing
with the serialization we had before was too tight to get any of
our tests or deployments to hit it.

With attached series stress testing sockmap+TCP with workloads that
create lots of short-lived connections no more splats like below were
seen on upstream bpf branch.

[224913.935822] WARNING: CPU: 3 PID: 32100 at net/core/stream.c:208 sk_stream_kill_queues+0x212/0x220
[224913.935841] Modules linked in: fuse overlay bpf_preload x86_pkg_temp_thermal intel_uncore wmi_bmof squashfs sch_fq_codel efivarfs ip_tables x_tables uas xhci_pci ixgbe mdio xfrm_algo xhci_hcd wmi
[224913.935897] CPU: 3 PID: 32100 Comm: fgs-bench Tainted: G          I       5.14.0-rc1alu+ #181
[224913.935908] Hardware name: Dell Inc. Precision 5820 Tower/002KVM, BIOS 1.9.2 01/24/2019
[224913.935914] RIP: 0010:sk_stream_kill_queues+0x212/0x220
[224913.935923] Code: 8b 83 20 02 00 00 85 c0 75 20 5b 5d 41 5c 41 5d 41 5e 41 5f c3 48 89 df e8 2b 11 fe ff eb c3 0f 0b e9 7c ff ff ff 0f 0b eb ce <0f> 0b 5b 5d 41 5c 41 5d 41 5e 41 5f c3 90 0f 1f 44 00 00 41 57 41
[224913.935932] RSP: 0018:ffff88816271fd38 EFLAGS: 00010206
[224913.935941] RAX: 0000000000000ae8 RBX: ffff88815acd5240 RCX: dffffc0000000000
[224913.935948] RDX: 0000000000000003 RSI: 0000000000000ae8 RDI: ffff88815acd5460
[224913.935954] RBP: ffff88815acd5460 R08: ffffffff955c0ae8 R09: fffffbfff2e6f543
[224913.935961] R10: ffffffff9737aa17 R11: fffffbfff2e6f542 R12: ffff88815acd5390
[224913.935967] R13: ffff88815acd5480 R14: ffffffff98d0c080 R15: ffffffff96267500
[224913.935974] FS:  00007f86e6bd1700(0000) GS:ffff888451cc0000(0000) knlGS:0000000000000000
[224913.935981] CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[224913.935988] CR2: 000000c0008eb000 CR3: 00000001020e0005 CR4: 00000000003706e0
[224913.935994] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
[224913.936000] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
[224913.936007] Call Trace:
[224913.936016]  inet_csk_destroy_sock+0xba/0x1f0
[224913.936033]  __tcp_close+0x620/0x790
[224913.936047]  tcp_close+0x20/0x80
[224913.936056]  inet_release+0x8f/0xf0
[224913.936070]  __sock_release+0x72/0x120

v3: make sock_drop inline in skmsg.h
v2: init skb to null and fix a space/tab issue. Added Jakub's acks.
====================
Signed-off-by: default avatarAndrii Nakryiko <andrii@kernel.org>
parents d6371c76 9635720b
......@@ -285,11 +285,45 @@ static inline struct sk_psock *sk_psock(const struct sock *sk)
return rcu_dereference_sk_user_data(sk);
}
static inline void sk_psock_set_state(struct sk_psock *psock,
enum sk_psock_state_bits bit)
{
set_bit(bit, &psock->state);
}
static inline void sk_psock_clear_state(struct sk_psock *psock,
enum sk_psock_state_bits bit)
{
clear_bit(bit, &psock->state);
}
static inline bool sk_psock_test_state(const struct sk_psock *psock,
enum sk_psock_state_bits bit)
{
return test_bit(bit, &psock->state);
}
static inline void sock_drop(struct sock *sk, struct sk_buff *skb)
{
sk_drops_add(sk, skb);
kfree_skb(skb);
}
static inline void drop_sk_msg(struct sk_psock *psock, struct sk_msg *msg)
{
if (msg->skb)
sock_drop(psock->sk, msg->skb);
kfree(msg);
}
static inline void sk_psock_queue_msg(struct sk_psock *psock,
struct sk_msg *msg)
{
spin_lock_bh(&psock->ingress_lock);
list_add_tail(&msg->list, &psock->ingress_msg);
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED))
list_add_tail(&msg->list, &psock->ingress_msg);
else
drop_sk_msg(psock, msg);
spin_unlock_bh(&psock->ingress_lock);
}
......@@ -406,24 +440,6 @@ static inline void sk_psock_restore_proto(struct sock *sk,
psock->psock_update_sk_prot(sk, psock, true);
}
static inline void sk_psock_set_state(struct sk_psock *psock,
enum sk_psock_state_bits bit)
{
set_bit(bit, &psock->state);
}
static inline void sk_psock_clear_state(struct sk_psock *psock,
enum sk_psock_state_bits bit)
{
clear_bit(bit, &psock->state);
}
static inline bool sk_psock_test_state(const struct sk_psock *psock,
enum sk_psock_state_bits bit)
{
return test_bit(bit, &psock->state);
}
static inline struct sk_psock *sk_psock_get(struct sock *sk)
{
struct sk_psock *psock;
......
......@@ -584,29 +584,42 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb,
return sk_psock_skb_ingress(psock, skb);
}
static void sock_drop(struct sock *sk, struct sk_buff *skb)
static void sk_psock_skb_state(struct sk_psock *psock,
struct sk_psock_work_state *state,
struct sk_buff *skb,
int len, int off)
{
sk_drops_add(sk, skb);
kfree_skb(skb);
spin_lock_bh(&psock->ingress_lock);
if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) {
state->skb = skb;
state->len = len;
state->off = off;
} else {
sock_drop(psock->sk, skb);
}
spin_unlock_bh(&psock->ingress_lock);
}
static void sk_psock_backlog(struct work_struct *work)
{
struct sk_psock *psock = container_of(work, struct sk_psock, work);
struct sk_psock_work_state *state = &psock->work_state;
struct sk_buff *skb;
struct sk_buff *skb = NULL;
bool ingress;
u32 len, off;
int ret;
mutex_lock(&psock->work_mutex);
if (state->skb) {
if (unlikely(state->skb)) {
spin_lock_bh(&psock->ingress_lock);
skb = state->skb;
len = state->len;
off = state->off;
state->skb = NULL;
goto start;
spin_unlock_bh(&psock->ingress_lock);
}
if (skb)
goto start;
while ((skb = skb_dequeue(&psock->ingress_skb))) {
len = skb->len;
......@@ -621,9 +634,8 @@ static void sk_psock_backlog(struct work_struct *work)
len, ingress);
if (ret <= 0) {
if (ret == -EAGAIN) {
state->skb = skb;
state->len = len;
state->off = off;
sk_psock_skb_state(psock, state, skb,
len, off);
goto end;
}
/* Hard errors break pipe and stop xmit. */
......@@ -722,6 +734,11 @@ static void __sk_psock_zap_ingress(struct sk_psock *psock)
skb_bpf_redirect_clear(skb);
sock_drop(psock->sk, skb);
}
kfree_skb(psock->work_state.skb);
/* We null the skb here to ensure that calls to sk_psock_backlog
* do not pick up the free'd skb.
*/
psock->work_state.skb = NULL;
__sk_psock_purge_ingress_msg(psock);
}
......@@ -773,8 +790,6 @@ static void sk_psock_destroy(struct work_struct *work)
void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
{
sk_psock_stop(psock, false);
write_lock_bh(&sk->sk_callback_lock);
sk_psock_restore_proto(sk, psock);
rcu_assign_sk_user_data(sk, NULL);
......@@ -784,6 +799,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
sk_psock_stop_verdict(sk, psock);
write_unlock_bh(&sk->sk_callback_lock);
sk_psock_stop(psock, false);
INIT_RCU_WORK(&psock->rwork, sk_psock_destroy);
queue_rcu_work(system_wq, &psock->rwork);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment