Commit 978f4175 authored by Jakub Kicinski

Merge branch 'mptcp-prepare-mptcp-packet-scheduler-for-bpf-extension'

Mat Martineau says:

====================
mptcp: Prepare MPTCP packet scheduler for BPF extension

The kernel's MPTCP packet scheduler has, to date, been a hard-coded,
one-size-fits-all algorithm. It attempts to balance latency and
throughput when transmitting data across multiple TCP subflows, and has
some limited tunability through sysctls. It has been a long-term goal of
the Linux MPTCP community to support customizable packet schedulers for
use cases that need to make different trade-offs regarding latency,
throughput, redundancy, and other metrics. BPF is well-suited for
configuring customized, per-packet scheduling decisions without having
to modify the kernel or manage out-of-tree kernel modules.

The first steps toward implementing BPF packet schedulers are to update
the existing MPTCP transmit loops to allow more flexible scheduling
decisions, and to add infrastructure for swappable packet schedulers
(a sketch of such a scheduler follows below). The existing scheduling
algorithm remains the default. BPF-related changes will be in a future
patch series.

This code has been in the MPTCP development tree for quite a while,
undergoing testing in our CI and in the community.

Patches 1 and 2 refactor the transmit code and do some related cleanup.

Patches 3-9 add infrastructure for registering and calling multiple
schedulers.

Patch 10 connects the in-kernel default scheduler to the new
infrastructure.
====================

Link: https://lore.kernel.org/r/20230821-upstream-net-next-20230818-v1-0-0c860fb256a8@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 98173633 ed1ad86b
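
To make the new hooks concrete, here is a minimal sketch of a custom
scheduler built on the ops this series introduces. It is hypothetical and
not part of the patches: the module, the scheduler name "first", and an
in-tree build under net/mptcp/ (so that protocol.h is reachable for
mptcp_for_each_subflow() and the function declarations) are all
assumptions. The contract it illustrates is the real one, though:
get_subflow() marks the chosen subflow(s) with
mptcp_subflow_set_scheduled() and returns 0, or returns a negative value
when nothing can transmit.

// SPDX-License-Identifier: GPL-2.0
/* Hypothetical example, not part of this series: a scheduler that
 * always picks the first subflow on the conn_list. Assumes an in-tree
 * build under net/mptcp/ so that protocol.h is available.
 */
#include <linux/module.h>
#include <net/mptcp.h>
#include "protocol.h"

static int mptcp_sched_first_get_subflow(struct mptcp_sock *msk,
					 struct mptcp_sched_data *data)
{
	struct mptcp_subflow_context *subflow;

	/* Mark exactly one subflow; the reworked transmit loops push
	 * data on every subflow whose ->scheduled flag is set.
	 */
	mptcp_for_each_subflow(msk, subflow) {
		mptcp_subflow_set_scheduled(subflow, true);
		return 0;
	}

	return -EINVAL;	/* no subflow available */
}

static struct mptcp_sched_ops mptcp_sched_first = {
	.get_subflow	= mptcp_sched_first_get_subflow,
	.name		= "first",
	.owner		= THIS_MODULE,
};

static int __init mptcp_sched_first_init(void)
{
	return mptcp_register_scheduler(&mptcp_sched_first);
}

static void __exit mptcp_sched_first_exit(void)
{
	mptcp_unregister_scheduler(&mptcp_sched_first);
}

module_init(mptcp_sched_first_init);
module_exit(mptcp_sched_first_exit);
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Hypothetical first-subflow MPTCP scheduler");

Once registered, the name becomes selectable through the new
net.mptcp.scheduler sysctl documented in the first hunk below.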
Documentation/networking/mptcp-sysctl.rst
@@ -74,3 +74,11 @@ stale_loss_cnt - INTEGER
 	This is a per-namespace sysctl.
 
 	Default: 4
+
+scheduler - STRING
+	Select the scheduler of your choice.
+
+	Support for selection of different schedulers. This is a per-namespace
+	sysctl.
+
+	Default: "default"
include/net/mptcp.h
@@ -96,6 +96,27 @@ struct mptcp_out_options {
 #endif
 };
 
+#define MPTCP_SCHED_NAME_MAX	16
+#define MPTCP_SUBFLOWS_MAX	8
+
+struct mptcp_sched_data {
+	bool	reinject;
+	u8	subflows;
+	struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX];
+};
+
+struct mptcp_sched_ops {
+	int (*get_subflow)(struct mptcp_sock *msk,
+			   struct mptcp_sched_data *data);
+
+	char			name[MPTCP_SCHED_NAME_MAX];
+	struct module		*owner;
+	struct list_head	list;
+
+	void (*init)(struct mptcp_sock *msk);
+	void (*release)(struct mptcp_sock *msk);
+} ____cacheline_aligned_in_smp;
+
 #ifdef CONFIG_MPTCP
 void mptcp_init(void);
net/mptcp/Makefile
@@ -2,7 +2,7 @@
 obj-$(CONFIG_MPTCP) += mptcp.o
 
 mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
-	   mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o
+	   mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o sched.o
 
 obj-$(CONFIG_SYN_COOKIES) += syncookies.o
 obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
net/mptcp/ctrl.c
@@ -32,6 +32,7 @@ struct mptcp_pernet {
 	u8 checksum_enabled;
 	u8 allow_join_initial_addr_port;
 	u8 pm_type;
+	char scheduler[MPTCP_SCHED_NAME_MAX];
 };
 
 static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
@@ -69,6 +70,11 @@ int mptcp_get_pm_type(const struct net *net)
 	return mptcp_get_pernet(net)->pm_type;
 }
 
+const char *mptcp_get_scheduler(const struct net *net)
+{
+	return mptcp_get_pernet(net)->scheduler;
+}
+
 static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
 {
 	pernet->mptcp_enabled = 1;
@@ -77,6 +83,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
 	pernet->allow_join_initial_addr_port = 1;
 	pernet->stale_loss_cnt = 4;
 	pernet->pm_type = MPTCP_PM_TYPE_KERNEL;
+	strcpy(pernet->scheduler, "default");
 }
 
 #ifdef CONFIG_SYSCTL
@@ -128,6 +135,12 @@ static struct ctl_table mptcp_sysctl_table[] = {
 		.extra1 = SYSCTL_ZERO,
 		.extra2 = &mptcp_pm_type_max
 	},
+	{
+		.procname = "scheduler",
+		.maxlen	= MPTCP_SCHED_NAME_MAX,
+		.mode = 0644,
+		.proc_handler = proc_dostring,
+	},
 	{}
 };
 
@@ -149,6 +162,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
 	table[3].data = &pernet->allow_join_initial_addr_port;
 	table[4].data = &pernet->stale_loss_cnt;
 	table[5].data = &pernet->pm_type;
+	table[6].data = &pernet->scheduler;
 
 	hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
 	if (!hdr)
net/mptcp/pm.c
@@ -299,15 +299,8 @@ void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup)
 
 	pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
 	msk = mptcp_sk(sk);
-	if (subflow->backup != bkup) {
+	if (subflow->backup != bkup)
 		subflow->backup = bkup;
-		mptcp_data_lock(sk);
-		if (!sock_owned_by_user(sk))
-			msk->last_snd = NULL;
-		else
-			__set_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags);
-		mptcp_data_unlock(sk);
-	}
 
 	mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC);
 }
net/mptcp/pm_netlink.c
@@ -472,9 +472,6 @@ static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_con
 
 	slow = lock_sock_fast(ssk);
 	if (prio) {
-		if (subflow->backup != backup)
-			msk->last_snd = NULL;
-
 		subflow->send_mp_prio = 1;
 		subflow->backup = backup;
 		subflow->request_bkup = backup;
net/mptcp/protocol.c
@@ -1366,7 +1366,7 @@ bool mptcp_subflow_active(struct mptcp_subflow_context *subflow)
  * returns the subflow that will transmit the next DSS
  * additionally updates the rtx timeout
  */
-static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
+struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 {
 	struct subflow_send_info send_info[SSK_MODE_MAX];
 	struct mptcp_subflow_context *subflow;
@@ -1377,23 +1377,6 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 	u64 linger_time;
 	long tout = 0;
 
-	msk_owned_by_me(msk);
-
-	if (__mptcp_check_fallback(msk)) {
-		if (!msk->first)
-			return NULL;
-		return __tcp_can_send(msk->first) &&
-		       sk_stream_memory_free(msk->first) ? msk->first : NULL;
-	}
-
-	/* re-use last subflow, if the burst allow that */
-	if (msk->last_snd && msk->snd_burst > 0 &&
-	    sk_stream_memory_free(msk->last_snd) &&
-	    mptcp_subflow_active(mptcp_subflow_ctx(msk->last_snd))) {
-		mptcp_set_timeout(sk);
-		return msk->last_snd;
-	}
-
 	/* pick the subflow with the lower wmem/wspace ratio */
 	for (i = 0; i < SSK_MODE_MAX; ++i) {
 		send_info[i].ssk = NULL;
@@ -1446,16 +1429,13 @@ static struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk)
 	burst = min_t(int, MPTCP_SEND_BURST_SIZE, mptcp_wnd_end(msk) - msk->snd_nxt);
 	wmem = READ_ONCE(ssk->sk_wmem_queued);
-	if (!burst) {
-		msk->last_snd = NULL;
+	if (!burst)
 		return ssk;
-	}
 
 	subflow = mptcp_subflow_ctx(ssk);
 	subflow->avg_pacing_rate = div_u64((u64)subflow->avg_pacing_rate * wmem +
 					   READ_ONCE(ssk->sk_pacing_rate) * burst,
 					   burst + wmem);
-	msk->last_snd = ssk;
 	msk->snd_burst = burst;
 	return ssk;
 }
@@ -1499,64 +1479,106 @@ void mptcp_check_and_set_pending(struct sock *sk)
 		mptcp_sk(sk)->push_pending |= BIT(MPTCP_PUSH_PENDING);
 }
 
-void __mptcp_push_pending(struct sock *sk, unsigned int flags)
+static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
+				  struct mptcp_sendmsg_info *info)
 {
-	struct sock *prev_ssk = NULL, *ssk = NULL;
 	struct mptcp_sock *msk = mptcp_sk(sk);
-	struct mptcp_sendmsg_info info = {
-		.flags = flags,
-	};
-	bool do_check_data_fin = false;
 	struct mptcp_data_frag *dfrag;
-	int len;
+	int len, copied = 0, err = 0;
 
 	while ((dfrag = mptcp_send_head(sk))) {
-		info.sent = dfrag->already_sent;
-		info.limit = dfrag->data_len;
+		info->sent = dfrag->already_sent;
+		info->limit = dfrag->data_len;
 		len = dfrag->data_len - dfrag->already_sent;
 		while (len > 0) {
 			int ret = 0;
 
-			prev_ssk = ssk;
-			ssk = mptcp_subflow_get_send(msk);
-
-			/* First check. If the ssk has changed since
-			 * the last round, release prev_ssk
-			 */
-			if (ssk != prev_ssk && prev_ssk)
-				mptcp_push_release(prev_ssk, &info);
-			if (!ssk)
-				goto out;
-
-			/* Need to lock the new subflow only if different
-			 * from the previous one, otherwise we are still
-			 * helding the relevant lock
-			 */
-			if (ssk != prev_ssk)
-				lock_sock(ssk);
-
-			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, info);
 			if (ret <= 0) {
-				if (ret == -EAGAIN)
-					continue;
-				mptcp_push_release(ssk, &info);
+				err = copied ? : ret;
 				goto out;
 			}
 
-			do_check_data_fin = true;
-			info.sent += ret;
+			info->sent += ret;
+			copied += ret;
 			len -= ret;
 
 			mptcp_update_post_push(msk, dfrag, ret);
 		}
 		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+
+		if (msk->snd_burst <= 0 ||
+		    !sk_stream_memory_free(ssk) ||
+		    !mptcp_subflow_active(mptcp_subflow_ctx(ssk))) {
+			err = copied;
+			goto out;
+		}
+		mptcp_set_timeout(sk);
+	}
+	err = copied;
+
+out:
+	return err;
+}
+
+void __mptcp_push_pending(struct sock *sk, unsigned int flags)
+{
+	struct sock *prev_ssk = NULL, *ssk = NULL;
+	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_sendmsg_info info = {
+		.flags = flags,
+	};
+	bool do_check_data_fin = false;
+	int push_count = 1;
+
+	while (mptcp_send_head(sk) && (push_count > 0)) {
+		struct mptcp_subflow_context *subflow;
+		int ret = 0;
+
+		if (mptcp_sched_get_send(msk))
+			break;
+
+		push_count = 0;
+
+		mptcp_for_each_subflow(msk, subflow) {
+			if (READ_ONCE(subflow->scheduled)) {
+				mptcp_subflow_set_scheduled(subflow, false);
+
+				prev_ssk = ssk;
+				ssk = mptcp_subflow_tcp_sock(subflow);
+				if (ssk != prev_ssk) {
+					/* First check. If the ssk has changed since
+					 * the last round, release prev_ssk
+					 */
+					if (prev_ssk)
+						mptcp_push_release(prev_ssk, &info);
+
+					/* Need to lock the new subflow only if different
+					 * from the previous one, otherwise we are still
+					 * helding the relevant lock
+					 */
+					lock_sock(ssk);
+				}
+
+				push_count++;
+
+				ret = __subflow_push_pending(sk, ssk, &info);
+				if (ret <= 0) {
+					if (ret != -EAGAIN ||
+					    (1 << ssk->sk_state) &
+					     (TCPF_FIN_WAIT1 | TCPF_FIN_WAIT2 | TCPF_CLOSE))
+						push_count--;
+					continue;
+				}
+				do_check_data_fin = true;
+			}
+		}
 	}
 
 	/* at this point we held the socket lock for the last subflow we used */
 	if (ssk)
 		mptcp_push_release(ssk, &info);
 
-out:
 	/* ensure the rtx timer is running */
 	if (!mptcp_timer_pending(sk))
 		mptcp_reset_timer(sk);
@@ -1570,42 +1592,49 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool
 	struct mptcp_sendmsg_info info = {
 		.data_lock_held = true,
 	};
-	struct mptcp_data_frag *dfrag;
+	bool keep_pushing = true;
 	struct sock *xmit_ssk;
-	int len, copied = 0;
+	int copied = 0;
 
 	info.flags = 0;
-	while ((dfrag = mptcp_send_head(sk))) {
-		info.sent = dfrag->already_sent;
-		info.limit = dfrag->data_len;
-		len = dfrag->data_len - dfrag->already_sent;
-		while (len > 0) {
-			int ret = 0;
-
-			/* check for a different subflow usage only after
-			 * spooling the first chunk of data
-			 */
-			xmit_ssk = first ? ssk : mptcp_subflow_get_send(msk);
-			if (!xmit_ssk)
-				goto out;
-			if (xmit_ssk != ssk) {
-				mptcp_subflow_delegate(mptcp_subflow_ctx(xmit_ssk),
-						       MPTCP_DELEGATE_SEND);
-				goto out;
-			}
+	while (mptcp_send_head(sk) && keep_pushing) {
+		struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+		int ret = 0;
 
-			ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+		/* check for a different subflow usage only after
+		 * spooling the first chunk of data
+		 */
+		if (first) {
+			mptcp_subflow_set_scheduled(subflow, false);
+			ret = __subflow_push_pending(sk, ssk, &info);
+			first = false;
 			if (ret <= 0)
-				goto out;
+				break;
+			copied += ret;
+			continue;
+		}
+
+		if (mptcp_sched_get_send(msk))
+			goto out;
 
-			info.sent += ret;
+		if (READ_ONCE(subflow->scheduled)) {
+			mptcp_subflow_set_scheduled(subflow, false);
+			ret = __subflow_push_pending(sk, ssk, &info);
+			if (ret <= 0)
+				keep_pushing = false;
 			copied += ret;
-			len -= ret;
-			first = false;
+		}
 
-			mptcp_update_post_push(msk, dfrag, ret);
+		mptcp_for_each_subflow(msk, subflow) {
+			if (READ_ONCE(subflow->scheduled)) {
+				xmit_ssk = mptcp_subflow_tcp_sock(subflow);
+				if (xmit_ssk != ssk) {
+					mptcp_subflow_delegate(subflow,
+							       MPTCP_DELEGATE_SEND);
+					keep_pushing = false;
+				}
+			}
 		}
-		WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
 	}
 
 out:
@@ -2198,17 +2227,12 @@ static void mptcp_timeout_timer(struct timer_list *t)
  *
  * A backup subflow is returned only if that is the only kind available.
  */
-static struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
+struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk)
 {
 	struct sock *backup = NULL, *pick = NULL;
 	struct mptcp_subflow_context *subflow;
 	int min_stale_count = INT_MAX;
 
-	msk_owned_by_me(msk);
-
-	if (__mptcp_check_fallback(msk))
-		return NULL;
-
 	mptcp_for_each_subflow(msk, subflow) {
 		struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
 
@@ -2370,9 +2394,6 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk,
 		WRITE_ONCE(msk->first, NULL);
 
 out:
-	if (ssk == msk->last_snd)
-		msk->last_snd = NULL;
-
 	if (need_push)
 		__mptcp_push_pending(sk, 0);
 }
@@ -2489,16 +2510,17 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk)
 static void __mptcp_retrans(struct sock *sk)
 {
 	struct mptcp_sock *msk = mptcp_sk(sk);
+	struct mptcp_subflow_context *subflow;
 	struct mptcp_sendmsg_info info = {};
 	struct mptcp_data_frag *dfrag;
-	size_t copied = 0;
 	struct sock *ssk;
-	int ret;
+	int ret, err;
+	u16 len = 0;
 
 	mptcp_clean_una_wakeup(sk);
 
 	/* first check ssk: need to kick "stale" logic */
-	ssk = mptcp_subflow_get_retrans(msk);
+	err = mptcp_sched_get_retrans(msk);
 	dfrag = mptcp_rtx_head(sk);
 	if (!dfrag) {
 		if (mptcp_data_fin_enabled(msk)) {
@@ -2517,32 +2539,45 @@ static void __mptcp_retrans(struct sock *sk)
 		goto reset_timer;
 	}
 
-	if (!ssk)
+	if (err)
 		goto reset_timer;
 
-	lock_sock(ssk);
+	mptcp_for_each_subflow(msk, subflow) {
+		if (READ_ONCE(subflow->scheduled)) {
+			u16 copied = 0;
 
-	/* limit retransmission to the bytes already sent on some subflows */
-	info.sent = 0;
-	info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent;
-	while (info.sent < info.limit) {
-		ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
-		if (ret <= 0)
-			break;
+			mptcp_subflow_set_scheduled(subflow, false);
 
-		MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
-		copied += ret;
-		info.sent += ret;
-	}
-	if (copied) {
-		dfrag->already_sent = max(dfrag->already_sent, info.sent);
-		msk->bytes_retrans += copied;
-		tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
-			 info.size_goal);
-		WRITE_ONCE(msk->allow_infinite_fallback, false);
+			ssk = mptcp_subflow_tcp_sock(subflow);
+
+			lock_sock(ssk);
+
+			/* limit retransmission to the bytes already sent on some subflows */
+			info.sent = 0;
+			info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len :
+								    dfrag->already_sent;
+			while (info.sent < info.limit) {
+				ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info);
+				if (ret <= 0)
+					break;
+
+				MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_RETRANSSEGS);
+				copied += ret;
+				info.sent += ret;
+			}
+			if (copied) {
+				len = max(copied, len);
+				tcp_push(ssk, 0, info.mss_now, tcp_sk(ssk)->nonagle,
+					 info.size_goal);
+				WRITE_ONCE(msk->allow_infinite_fallback, false);
+			}
+
+			release_sock(ssk);
+		}
 	}
 
-	release_sock(ssk);
+	msk->bytes_retrans += len;
+	dfrag->already_sent = max(dfrag->already_sent, len);
 
 reset_timer:
 	mptcp_check_and_set_pending(sk);
@@ -2694,6 +2729,7 @@ static void mptcp_ca_reset(struct sock *sk)
 static int mptcp_init_sock(struct sock *sk)
 {
 	struct net *net = sock_net(sk);
+	int ret;
 
 	__mptcp_init_sock(sk);
 
@@ -2703,6 +2739,11 @@ static int mptcp_init_sock(struct sock *sk)
 	if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net))
 		return -ENOMEM;
 
+	ret = mptcp_init_sched(mptcp_sk(sk),
+			       mptcp_sched_find(mptcp_get_scheduler(net)));
+	if (ret)
+		return ret;
+
 	set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags);
 
 	/* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will
@@ -2848,6 +2889,7 @@ static void __mptcp_destroy_sock(struct sock *sk)
 	mptcp_stop_timer(sk);
 	sk_stop_timer(sk, &sk->sk_timer);
 	msk->pm.status = 0;
+	mptcp_release_sched(msk);
 
 	sk->sk_prot->destroy(sk);
 
@@ -3037,7 +3079,6 @@ static int mptcp_disconnect(struct sock *sk, int flags)
 	 * subflow
 	 */
 	mptcp_destroy_common(msk, MPTCP_CF_FASTCLOSE);
-	msk->last_snd = NULL;
 	WRITE_ONCE(msk->flags, 0);
 	msk->cb_flags = 0;
 	msk->push_pending = 0;
@@ -3103,6 +3144,7 @@ struct sock *mptcp_sk_clone_init(const struct sock *sk,
 	msk->snd_una = msk->write_seq;
 	msk->wnd_end = msk->snd_nxt + req->rsk_rcv_wnd;
 	msk->setsockopt_seq = mptcp_sk(sk)->setsockopt_seq;
+	mptcp_init_sched(msk, mptcp_sk(sk)->sched);
 
 	/* passive msk is created after the first/MPC subflow */
 	msk->subflow_id = 2;
@@ -3307,8 +3349,6 @@ static void mptcp_release_cb(struct sock *sk)
 			__mptcp_set_connected(sk);
 		if (__test_and_clear_bit(MPTCP_ERROR_REPORT, &msk->cb_flags))
 			__mptcp_error_report(sk);
-		if (__test_and_clear_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags))
-			msk->last_snd = NULL;
 	}
 
 	__mptcp_update_rmem(sk);
@@ -3925,6 +3965,7 @@ void __init mptcp_proto_init(void)
 
 	mptcp_subflow_init();
 	mptcp_pm_init();
+	mptcp_sched_init();
 	mptcp_token_init();
 
 	if (proto_register(&mptcp_prot, 1) != 0)
net/mptcp/protocol.h
@@ -123,7 +123,6 @@
 #define MPTCP_RETRANSMIT	4
 #define MPTCP_FLUSH_JOIN_LIST	5
 #define MPTCP_CONNECTED		6
-#define MPTCP_RESET_SCHEDULER	7
 
 struct mptcp_skb_cb {
 	u64 map_seq;
@@ -269,7 +268,6 @@ struct mptcp_sock {
 	u64		rcv_data_fin_seq;
 	u64		bytes_retrans;
 	int		rmem_fwd_alloc;
-	struct sock	*last_snd;
 	int		snd_burst;
 	int		old_wspace;
 	u64		recovery_snd_nxt;	/* in recovery mode accept up to this seq;
@@ -314,6 +312,7 @@ struct mptcp_sock {
 					 * lock as such sock is freed after close().
 					 */
 	struct mptcp_pm_data	pm;
+	struct mptcp_sched_ops	*sched;
 	struct {
 		u32	space;	/* bytes copied in last measurement window */
 		u32	copied; /* bytes copied in this measurement window */
@@ -492,6 +491,7 @@ struct mptcp_subflow_context {
 		is_mptfo : 1,	/* subflow is doing TFO */
 		__unused : 9;
 	enum mptcp_data_avail data_avail;
+	bool	scheduled;
 	u32	remote_nonce;
 	u64	thmac;
 	u32	local_nonce;
@@ -625,6 +625,7 @@ int mptcp_is_checksum_enabled(const struct net *net);
 int mptcp_allow_join_id0(const struct net *net);
 unsigned int mptcp_stale_loss_cnt(const struct net *net);
 int mptcp_get_pm_type(const struct net *net);
+const char *mptcp_get_scheduler(const struct net *net);
 void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
 				     const struct mptcp_options_received *mp_opt);
 bool __mptcp_retransmit_pending_data(struct sock *sk);
@@ -657,6 +658,19 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
 void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
 			 struct sockaddr_storage *addr,
 			 unsigned short family);
+struct mptcp_sched_ops *mptcp_sched_find(const char *name);
+int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
+void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
+void mptcp_sched_init(void);
+int mptcp_init_sched(struct mptcp_sock *msk,
+		     struct mptcp_sched_ops *sched);
+void mptcp_release_sched(struct mptcp_sock *msk);
+void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
+				 bool scheduled);
+struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk);
+struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk);
+int mptcp_sched_get_send(struct mptcp_sock *msk);
+int mptcp_sched_get_retrans(struct mptcp_sock *msk);
 
 static inline bool __tcp_can_send(const struct sock *ssk)
 {
net/mptcp/sched.c (new file)
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2022, SUSE.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include "protocol.h"

static DEFINE_SPINLOCK(mptcp_sched_list_lock);
static LIST_HEAD(mptcp_sched_list);

static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk,
					   struct mptcp_sched_data *data)
{
	struct sock *ssk;

	ssk = data->reinject ? mptcp_subflow_get_retrans(msk) :
			       mptcp_subflow_get_send(msk);
	if (!ssk)
		return -EINVAL;

	mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true);
	return 0;
}

static struct mptcp_sched_ops mptcp_sched_default = {
	.get_subflow	= mptcp_sched_default_get_subflow,
	.name		= "default",
	.owner		= THIS_MODULE,
};

/* Must be called with rcu read lock held */
struct mptcp_sched_ops *mptcp_sched_find(const char *name)
{
	struct mptcp_sched_ops *sched, *ret = NULL;

	list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
		if (!strcmp(sched->name, name)) {
			ret = sched;
			break;
		}
	}

	return ret;
}

int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
{
	if (!sched->get_subflow)
		return -EINVAL;

	spin_lock(&mptcp_sched_list_lock);
	if (mptcp_sched_find(sched->name)) {
		spin_unlock(&mptcp_sched_list_lock);
		return -EEXIST;
	}
	list_add_tail_rcu(&sched->list, &mptcp_sched_list);
	spin_unlock(&mptcp_sched_list_lock);

	pr_debug("%s registered", sched->name);
	return 0;
}

void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
{
	if (sched == &mptcp_sched_default)
		return;

	spin_lock(&mptcp_sched_list_lock);
	list_del_rcu(&sched->list);
	spin_unlock(&mptcp_sched_list_lock);
}

void mptcp_sched_init(void)
{
	mptcp_register_scheduler(&mptcp_sched_default);
}

int mptcp_init_sched(struct mptcp_sock *msk,
		     struct mptcp_sched_ops *sched)
{
	if (!sched)
		sched = &mptcp_sched_default;

	if (!bpf_try_module_get(sched, sched->owner))
		return -EBUSY;

	msk->sched = sched;
	if (msk->sched->init)
		msk->sched->init(msk);

	pr_debug("sched=%s", msk->sched->name);

	return 0;
}

void mptcp_release_sched(struct mptcp_sock *msk)
{
	struct mptcp_sched_ops *sched = msk->sched;

	if (!sched)
		return;

	msk->sched = NULL;
	if (sched->release)
		sched->release(msk);

	bpf_module_put(sched, sched->owner);
}

void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
				 bool scheduled)
{
	WRITE_ONCE(subflow->scheduled, scheduled);
}

int mptcp_sched_get_send(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sched_data data;

	msk_owned_by_me(msk);

	/* the following check is moved out of mptcp_subflow_get_send */
	if (__mptcp_check_fallback(msk)) {
		if (msk->first &&
		    __tcp_can_send(msk->first) &&
		    sk_stream_memory_free(msk->first)) {
			mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true);
			return 0;
		}
		return -EINVAL;
	}

	mptcp_for_each_subflow(msk, subflow) {
		if (READ_ONCE(subflow->scheduled))
			return 0;
	}

	data.reinject = false;
	if (msk->sched == &mptcp_sched_default || !msk->sched)
		return mptcp_sched_default_get_subflow(msk, &data);
	return msk->sched->get_subflow(msk, &data);
}

int mptcp_sched_get_retrans(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sched_data data;

	msk_owned_by_me(msk);

	/* the following check is moved out of mptcp_subflow_get_retrans */
	if (__mptcp_check_fallback(msk))
		return -EINVAL;

	mptcp_for_each_subflow(msk, subflow) {
		if (READ_ONCE(subflow->scheduled))
			return 0;
	}

	data.reinject = true;
	if (msk->sched == &mptcp_sched_default || !msk->sched)
		return mptcp_sched_default_get_subflow(msk, &data);
	return msk->sched->get_subflow(msk, &data);
}
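
Two design notes on sched.c: mptcp_sched_get_send() and
mptcp_sched_get_retrans() return early when some subflow is already
marked as scheduled, and they call mptcp_sched_default_get_subflow()
directly when msk->sched is NULL or the built-in default rather than
going through the ops pointer, presumably to keep an indirect call off
the common transmit path. The fallback-socket checks also move out of
mptcp_subflow_get_send()/mptcp_subflow_get_retrans() into these
wrappers, so a custom get_subflow() callback is never invoked for
fallback connections.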