Commit 978f4175 authored by Jakub Kicinski

Merge branch 'mptcp-prepare-mptcp-packet-scheduler-for-bpf-extension'

Mat Martineau says:

====================
mptcp: Prepare MPTCP packet scheduler for BPF extension

The kernel's MPTCP packet scheduler has, to date, been a one-size-fits-all
algorithm that is hard-coded. It attempts to balance latency and
throughput when transmitting data across multiple TCP subflows, and has
some limited tunability through sysctls. It has been a long-term goal of
the Linux MPTCP community to support customizable packet schedulers for
use cases that need to make different trade-offs regarding latency,
throughput, redundancy, and other metrics. BPF is well-suited for
configuring customized, per-packet scheduling decisions without having
to modify the kernel or manage out-of-tree kernel modules.

The first steps toward implementing BPF packet schedulers are to update
the existing MPTCP transmit loops to allow more flexible scheduling
decisions, and to add infrastructure for swappable packet schedulers.
The existing scheduling algorithm remains the default. BPF-related
changes will be in a future patch series.

This code has been in the MPTCP development tree for quite a while,
undergoing testing in our CI and community.

Patches 1 and 2 refactor the transmit code and do some related cleanup.

Patches 3-9 add infrastructure for registering and calling multiple
schedulers.

Patch 10 connects the in-kernel default scheduler to the new
infrastructure.
====================

Link: https://lore.kernel.org/r/20230821-upstream-net-next-20230818-v1-0-0c860fb256a8@kernel.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 98173633 ed1ad86b
......@@ -74,3 +74,11 @@ stale_loss_cnt - INTEGER
	This is a per-namespace sysctl.

	Default: 4

scheduler - STRING
	Select the scheduler of your choice.

	Support for selection of different schedulers. This is a per-namespace
	sysctl.

	Default: "default"
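[Editor's note: with this knob in place, the active scheduler can be read or changed per network namespace through the usual sysctl interface, i.e. /proc/sys/net/mptcp/scheduler or "sysctl net.mptcp.scheduler". A name that does not match a registered scheduler is accepted by proc_dostring() but falls back to the built-in "default" at socket creation time, since mptcp_sched_find() returns NULL and mptcp_init_sched() substitutes the default.]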
......@@ -96,6 +96,27 @@ struct mptcp_out_options {
#endif
};
#define MPTCP_SCHED_NAME_MAX	16
#define MPTCP_SUBFLOWS_MAX	8

struct mptcp_sched_data {
	bool	reinject;
	u8	subflows;
	struct mptcp_subflow_context *contexts[MPTCP_SUBFLOWS_MAX];
};

struct mptcp_sched_ops {
	int (*get_subflow)(struct mptcp_sock *msk,
			   struct mptcp_sched_data *data);

	char			name[MPTCP_SCHED_NAME_MAX];
	struct module		*owner;
	struct list_head	list;

	void (*init)(struct mptcp_sock *msk);
	void (*release)(struct mptcp_sock *msk);
} ____cacheline_aligned_in_smp;
#ifdef CONFIG_MPTCP
void mptcp_init(void);
......
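[Editor's note: to make the ops table above concrete, here is a minimal sketch of what an in-tree scheduler built on this API could look like. Everything named "first", including the pick-the-first-sendable-subflow policy, is hypothetical and not part of this series; only struct mptcp_sched_ops, mptcp_subflow_set_scheduled() and the register/unregister calls come from these patches. The helpers used (mptcp_for_each_subflow(), mptcp_subflow_tcp_sock(), __tcp_can_send()) live in the private net/mptcp/protocol.h, so this would have to be built as part of net/mptcp.]

/* Hypothetical "first" scheduler: always transmit on the first subflow
 * that is currently able to send.
 */
#include <linux/module.h>
#include <net/mptcp.h>
#include "protocol.h"

static int mptcp_sched_first_get_subflow(struct mptcp_sock *msk,
					 struct mptcp_sched_data *data)
{
	struct mptcp_subflow_context *subflow;

	mptcp_for_each_subflow(msk, subflow) {
		if (!__tcp_can_send(mptcp_subflow_tcp_sock(subflow)))
			continue;
		/* flag the chosen subflow; the transmit loop clears it */
		mptcp_subflow_set_scheduled(subflow, true);
		return 0;
	}
	return -EINVAL;
}

static struct mptcp_sched_ops mptcp_sched_first = {
	.get_subflow	= mptcp_sched_first_get_subflow,
	.name		= "first",
	.owner		= THIS_MODULE,
};

static int __init mptcp_sched_first_init(void)
{
	return mptcp_register_scheduler(&mptcp_sched_first);
}

static void __exit mptcp_sched_first_exit(void)
{
	mptcp_unregister_scheduler(&mptcp_sched_first);
}

module_init(mptcp_sched_first_init);
module_exit(mptcp_sched_first_exit);
MODULE_DESCRIPTION("Hypothetical MPTCP first-subflow scheduler");
MODULE_LICENSE("GPL");

[Once registered and selected through the new sysctl, every mptcp_sched_get_send() call on a newly created MPTCP socket in that namespace would land in mptcp_sched_first_get_subflow().]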
......@@ -2,7 +2,7 @@
obj-$(CONFIG_MPTCP) += mptcp.o
 mptcp-y := protocol.o subflow.o options.o token.o crypto.o ctrl.o pm.o diag.o \
-	   mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o
+	   mib.o pm_netlink.o sockopt.o pm_userspace.o fastopen.o sched.o
obj-$(CONFIG_SYN_COOKIES) += syncookies.o
obj-$(CONFIG_INET_MPTCP_DIAG) += mptcp_diag.o
......
......@@ -32,6 +32,7 @@ struct mptcp_pernet {
u8 checksum_enabled;
u8 allow_join_initial_addr_port;
u8 pm_type;
char scheduler[MPTCP_SCHED_NAME_MAX];
};
static struct mptcp_pernet *mptcp_get_pernet(const struct net *net)
......@@ -69,6 +70,11 @@ int mptcp_get_pm_type(const struct net *net)
return mptcp_get_pernet(net)->pm_type;
}
const char *mptcp_get_scheduler(const struct net *net)
{
return mptcp_get_pernet(net)->scheduler;
}
static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
{
pernet->mptcp_enabled = 1;
......@@ -77,6 +83,7 @@ static void mptcp_pernet_set_defaults(struct mptcp_pernet *pernet)
pernet->allow_join_initial_addr_port = 1;
pernet->stale_loss_cnt = 4;
pernet->pm_type = MPTCP_PM_TYPE_KERNEL;
strcpy(pernet->scheduler, "default");
}
#ifdef CONFIG_SYSCTL
......@@ -128,6 +135,12 @@ static struct ctl_table mptcp_sysctl_table[] = {
.extra1 = SYSCTL_ZERO,
.extra2 = &mptcp_pm_type_max
},
{
.procname = "scheduler",
.maxlen = MPTCP_SCHED_NAME_MAX,
.mode = 0644,
.proc_handler = proc_dostring,
},
{}
};
......@@ -149,6 +162,7 @@ static int mptcp_pernet_new_table(struct net *net, struct mptcp_pernet *pernet)
table[3].data = &pernet->allow_join_initial_addr_port;
table[4].data = &pernet->stale_loss_cnt;
table[5].data = &pernet->pm_type;
table[6].data = &pernet->scheduler;
hdr = register_net_sysctl(net, MPTCP_SYSCTL_PATH, table);
if (!hdr)
......
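[Editor's note: the consumer of both this pernet string and the registration API sits in the collapsed protocol.c diff. Based on the helpers this series adds, the lookup at socket init time plausibly looks like the sketch below; the wrapper name mptcp_sk_init_sched is invented here for illustration.]

/* Sketch: resolve the per-namespace scheduler name into an ops table
 * when an MPTCP socket is created. An approximation, not a verbatim
 * quote of the collapsed protocol.c hunk.
 */
static int mptcp_sk_init_sched(struct mptcp_sock *msk, const struct net *net)
{
	struct mptcp_sched_ops *sched;
	int ret;

	rcu_read_lock();	/* mptcp_sched_find() requires the RCU read lock */
	sched = mptcp_sched_find(mptcp_get_scheduler(net));
	ret = mptcp_init_sched(msk, sched);	/* NULL falls back to "default" */
	rcu_read_unlock();

	return ret;
}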
......@@ -299,15 +299,8 @@ void mptcp_pm_mp_prio_received(struct sock *ssk, u8 bkup)
pr_debug("subflow->backup=%d, bkup=%d\n", subflow->backup, bkup);
msk = mptcp_sk(sk);
-	if (subflow->backup != bkup) {
+	if (subflow->backup != bkup)
 		subflow->backup = bkup;
-		mptcp_data_lock(sk);
-		if (!sock_owned_by_user(sk))
-			msk->last_snd = NULL;
-		else
-			__set_bit(MPTCP_RESET_SCHEDULER, &msk->cb_flags);
-		mptcp_data_unlock(sk);
-	}
mptcp_event(MPTCP_EVENT_SUB_PRIORITY, msk, ssk, GFP_ATOMIC);
}
......
......@@ -472,9 +472,6 @@ static void __mptcp_pm_send_ack(struct mptcp_sock *msk, struct mptcp_subflow_con
slow = lock_sock_fast(ssk);
if (prio) {
-		if (subflow->backup != backup)
-			msk->last_snd = NULL;
subflow->send_mp_prio = 1;
subflow->backup = backup;
subflow->request_bkup = backup;
......
...... (one large diff collapsed by the viewer; given the file ordering this is most likely the net/mptcp/protocol.c transmit-path rework)
......@@ -123,7 +123,6 @@
#define MPTCP_RETRANSMIT 4
#define MPTCP_FLUSH_JOIN_LIST 5
#define MPTCP_CONNECTED 6
-#define MPTCP_RESET_SCHEDULER 7
struct mptcp_skb_cb {
u64 map_seq;
......@@ -269,7 +268,6 @@ struct mptcp_sock {
u64 rcv_data_fin_seq;
u64 bytes_retrans;
int rmem_fwd_alloc;
-	struct sock *last_snd;
int snd_burst;
int old_wspace;
u64 recovery_snd_nxt; /* in recovery mode accept up to this seq;
......@@ -314,6 +312,7 @@ struct mptcp_sock {
* lock as such sock is freed after close().
*/
struct mptcp_pm_data pm;
struct mptcp_sched_ops *sched;
struct {
u32 space; /* bytes copied in last measurement window */
u32 copied; /* bytes copied in this measurement window */
......@@ -492,6 +491,7 @@ struct mptcp_subflow_context {
is_mptfo : 1, /* subflow is doing TFO */
__unused : 9;
enum mptcp_data_avail data_avail;
bool scheduled;
u32 remote_nonce;
u64 thmac;
u32 local_nonce;
......@@ -625,6 +625,7 @@ int mptcp_is_checksum_enabled(const struct net *net);
int mptcp_allow_join_id0(const struct net *net);
unsigned int mptcp_stale_loss_cnt(const struct net *net);
int mptcp_get_pm_type(const struct net *net);
const char *mptcp_get_scheduler(const struct net *net);
void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow,
const struct mptcp_options_received *mp_opt);
bool __mptcp_retransmit_pending_data(struct sock *sk);
......@@ -657,6 +658,19 @@ int mptcp_subflow_create_socket(struct sock *sk, unsigned short family,
void mptcp_info2sockaddr(const struct mptcp_addr_info *info,
struct sockaddr_storage *addr,
unsigned short family);
struct mptcp_sched_ops *mptcp_sched_find(const char *name);
int mptcp_register_scheduler(struct mptcp_sched_ops *sched);
void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched);
void mptcp_sched_init(void);
int mptcp_init_sched(struct mptcp_sock *msk,
struct mptcp_sched_ops *sched);
void mptcp_release_sched(struct mptcp_sock *msk);
void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
bool scheduled);
struct sock *mptcp_subflow_get_send(struct mptcp_sock *msk);
struct sock *mptcp_subflow_get_retrans(struct mptcp_sock *msk);
int mptcp_sched_get_send(struct mptcp_sock *msk);
int mptcp_sched_get_retrans(struct mptcp_sock *msk);
static inline bool __tcp_can_send(const struct sock *ssk)
{
......
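[Editor's note: the declarations above are what the reworked transmit loops in the collapsed protocol.c diff are built around: the scheduler flags subflows via subflow->scheduled, and the push loop walks the flagged subflows and clears the flag after sending. A rough sketch of that pattern follows; it is not a verbatim quote, and the retransmit path consumes mptcp_sched_get_retrans() the same way.]

/* Sketch of the send-path pattern: let the scheduler flag subflows,
 * push data on each flagged subflow, then clear the flag.
 */
while (mptcp_send_head(sk)) {
	if (mptcp_sched_get_send(msk))
		break;		/* no subflow can send right now */

	mptcp_for_each_subflow(msk, subflow) {
		if (READ_ONCE(subflow->scheduled)) {
			/* ... transmit pending data on this subflow ... */
			mptcp_subflow_set_scheduled(subflow, false);
		}
	}
}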
// SPDX-License-Identifier: GPL-2.0
/* Multipath TCP
 *
 * Copyright (c) 2022, SUSE.
 */

#define pr_fmt(fmt) "MPTCP: " fmt

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/spinlock.h>
#include "protocol.h"

/* Registered schedulers; the lock serializes writers, readers walk the
 * list under the RCU read lock.
 */
static DEFINE_SPINLOCK(mptcp_sched_list_lock);
static LIST_HEAD(mptcp_sched_list);

static int mptcp_sched_default_get_subflow(struct mptcp_sock *msk,
					   struct mptcp_sched_data *data)
{
	struct sock *ssk;

	ssk = data->reinject ? mptcp_subflow_get_retrans(msk) :
			       mptcp_subflow_get_send(msk);
	if (!ssk)
		return -EINVAL;

	mptcp_subflow_set_scheduled(mptcp_subflow_ctx(ssk), true);
	return 0;
}

static struct mptcp_sched_ops mptcp_sched_default = {
	.get_subflow	= mptcp_sched_default_get_subflow,
	.name		= "default",
	.owner		= THIS_MODULE,
};

/* Must be called with rcu read lock held */
struct mptcp_sched_ops *mptcp_sched_find(const char *name)
{
	struct mptcp_sched_ops *sched, *ret = NULL;

	list_for_each_entry_rcu(sched, &mptcp_sched_list, list) {
		if (!strcmp(sched->name, name)) {
			ret = sched;
			break;
		}
	}

	return ret;
}

int mptcp_register_scheduler(struct mptcp_sched_ops *sched)
{
	if (!sched->get_subflow)
		return -EINVAL;

	spin_lock(&mptcp_sched_list_lock);
	if (mptcp_sched_find(sched->name)) {
		spin_unlock(&mptcp_sched_list_lock);
		return -EEXIST;
	}
	list_add_tail_rcu(&sched->list, &mptcp_sched_list);
	spin_unlock(&mptcp_sched_list_lock);

	pr_debug("%s registered", sched->name);
	return 0;
}

void mptcp_unregister_scheduler(struct mptcp_sched_ops *sched)
{
	if (sched == &mptcp_sched_default)
		return;

	spin_lock(&mptcp_sched_list_lock);
	list_del_rcu(&sched->list);
	spin_unlock(&mptcp_sched_list_lock);
}

void mptcp_sched_init(void)
{
	mptcp_register_scheduler(&mptcp_sched_default);
}

int mptcp_init_sched(struct mptcp_sock *msk,
		     struct mptcp_sched_ops *sched)
{
	if (!sched)
		sched = &mptcp_sched_default;

	if (!bpf_try_module_get(sched, sched->owner))
		return -EBUSY;

	msk->sched = sched;
	if (msk->sched->init)
		msk->sched->init(msk);

	pr_debug("sched=%s", msk->sched->name);

	return 0;
}

void mptcp_release_sched(struct mptcp_sock *msk)
{
	struct mptcp_sched_ops *sched = msk->sched;

	if (!sched)
		return;

	msk->sched = NULL;
	if (sched->release)
		sched->release(msk);

	bpf_module_put(sched, sched->owner);
}

void mptcp_subflow_set_scheduled(struct mptcp_subflow_context *subflow,
				 bool scheduled)
{
	WRITE_ONCE(subflow->scheduled, scheduled);
}

/* Ask the scheduler to pick subflow(s) for the next transmit; returns 0
 * when at least one subflow is flagged as scheduled.
 */
int mptcp_sched_get_send(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sched_data data;

	msk_owned_by_me(msk);

	/* the following check is moved out of mptcp_subflow_get_send */
	if (__mptcp_check_fallback(msk)) {
		if (msk->first &&
		    __tcp_can_send(msk->first) &&
		    sk_stream_memory_free(msk->first)) {
			mptcp_subflow_set_scheduled(mptcp_subflow_ctx(msk->first), true);
			return 0;
		}
		return -EINVAL;
	}

	mptcp_for_each_subflow(msk, subflow) {
		if (READ_ONCE(subflow->scheduled))
			return 0;
	}

	data.reinject = false;
	if (msk->sched == &mptcp_sched_default || !msk->sched)
		return mptcp_sched_default_get_subflow(msk, &data);
	return msk->sched->get_subflow(msk, &data);
}

/* As above, but picking subflow(s) for the retransmit path. */
int mptcp_sched_get_retrans(struct mptcp_sock *msk)
{
	struct mptcp_subflow_context *subflow;
	struct mptcp_sched_data data;

	msk_owned_by_me(msk);

	/* the following check is moved out of mptcp_subflow_get_retrans */
	if (__mptcp_check_fallback(msk))
		return -EINVAL;

	mptcp_for_each_subflow(msk, subflow) {
		if (READ_ONCE(subflow->scheduled))
			return 0;
	}

	data.reinject = true;
	if (msk->sched == &mptcp_sched_default || !msk->sched)
		return mptcp_sched_default_get_subflow(msk, &data);
	return msk->sched->get_subflow(msk, &data);
}
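[Editor's note: one detail worth noting in the two wrappers above is that when msk->sched is NULL or points at mptcp_sched_default, the code calls mptcp_sched_default_get_subflow() directly instead of going through the ops pointer. This keeps the common case working before any scheduler has been attached, and presumably also sidesteps an indirect call, and its retpoline cost, on the default path.]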