Commit 96f84061 authored by Jason Wang, committed by David S. Miller

tun: add eBPF based queue selection method

This patch introduces an eBPF based queue selection method. With this,
the policy could be offloaded to userspace completely through a new
ioctl TUNSETSTEERINGEBPF.
Signed-off-by: Jason Wang <jasowang@redhat.com>
Acked-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent f520957d
...@@ -195,6 +195,11 @@ struct tun_flow_entry { ...@@ -195,6 +195,11 @@ struct tun_flow_entry {
#define TUN_NUM_FLOW_ENTRIES 1024 #define TUN_NUM_FLOW_ENTRIES 1024
/*
 * Wrapper around the eBPF program used for TX queue steering.
 *
 * tun_struct holds an RCU-protected pointer to one of these; the wrapper is
 * replaced as a whole and the old one is released through call_rcu().
 */
struct tun_steering_prog {
/* Handed to call_rcu(); the callback recovers the wrapper via container_of(). */
struct rcu_head rcu;
/* Program run on each skb to pick a queue; destroyed when the wrapper is freed. */
struct bpf_prog *prog;
};
/* Since the socket were moved to tun_file, to preserve the behavior of persist /* Since the socket were moved to tun_file, to preserve the behavior of persist
* device, socket filter, sndbuf and vnet header size were restore when the * device, socket filter, sndbuf and vnet header size were restore when the
* file were attached to a persist device. * file were attached to a persist device.
...@@ -232,6 +237,7 @@ struct tun_struct { ...@@ -232,6 +237,7 @@ struct tun_struct {
u32 rx_batched; u32 rx_batched;
struct tun_pcpu_stats __percpu *pcpu_stats; struct tun_pcpu_stats __percpu *pcpu_stats;
struct bpf_prog __rcu *xdp_prog; struct bpf_prog __rcu *xdp_prog;
struct tun_steering_prog __rcu *steering_prog;
}; };
static int tun_napi_receive(struct napi_struct *napi, int budget) static int tun_napi_receive(struct napi_struct *napi, int budget)
...@@ -537,15 +543,12 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash) ...@@ -537,15 +543,12 @@ static inline void tun_flow_save_rps_rxhash(struct tun_flow_entry *e, u32 hash)
* different rxq no. here. If we could not get rxhash, then we would * different rxq no. here. If we could not get rxhash, then we would
* hope the rxq no. may help here. * hope the rxq no. may help here.
*/ */
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, static u16 tun_automq_select_queue(struct tun_struct *tun, struct sk_buff *skb)
void *accel_priv, select_queue_fallback_t fallback)
{ {
struct tun_struct *tun = netdev_priv(dev);
struct tun_flow_entry *e; struct tun_flow_entry *e;
u32 txq = 0; u32 txq = 0;
u32 numqueues = 0; u32 numqueues = 0;
rcu_read_lock();
numqueues = READ_ONCE(tun->numqueues); numqueues = READ_ONCE(tun->numqueues);
txq = __skb_get_hash_symmetric(skb); txq = __skb_get_hash_symmetric(skb);
...@@ -563,10 +566,37 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb, ...@@ -563,10 +566,37 @@ static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
txq -= numqueues; txq -= numqueues;
} }
rcu_read_unlock();
return txq; return txq;
} }
static u16 tun_ebpf_select_queue(struct tun_struct *tun, struct sk_buff *skb)
{
struct tun_steering_prog *prog;
u16 ret = 0;
prog = rcu_dereference(tun->steering_prog);
if (prog)
ret = bpf_prog_run_clear_cb(prog->prog, skb);
return ret % tun->numqueues;
}
/*
 * Queue selection entry point for the tun net_device.
 *
 * Uses the eBPF steering program when one is attached to the device and the
 * automatic flow-based selection otherwise. The whole decision runs under
 * rcu_read_lock(). @accel_priv and @fallback are not used.
 */
static u16 tun_select_queue(struct net_device *dev, struct sk_buff *skb,
			    void *accel_priv, select_queue_fallback_t fallback)
{
	struct tun_struct *tun = netdev_priv(dev);
	u16 queue;

	rcu_read_lock();
	queue = rcu_dereference(tun->steering_prog) ?
		tun_ebpf_select_queue(tun, skb) :
		tun_automq_select_queue(tun, skb);
	rcu_read_unlock();

	return queue;
}
static inline bool tun_not_capable(struct tun_struct *tun) static inline bool tun_not_capable(struct tun_struct *tun)
{ {
const struct cred *cred = current_cred(); const struct cred *cred = current_cred();
...@@ -933,23 +963,10 @@ static int tun_net_close(struct net_device *dev) ...@@ -933,23 +963,10 @@ static int tun_net_close(struct net_device *dev)
} }
/* Net device start xmit */ /* Net device start xmit */
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) static void tun_automq_xmit(struct tun_struct *tun, struct sk_buff *skb)
{ {
struct tun_struct *tun = netdev_priv(dev);
int txq = skb->queue_mapping;
struct tun_file *tfile;
u32 numqueues = 0;
rcu_read_lock();
tfile = rcu_dereference(tun->tfiles[txq]);
numqueues = READ_ONCE(tun->numqueues);
/* Drop packet if interface is not attached */
if (txq >= numqueues)
goto drop;
#ifdef CONFIG_RPS #ifdef CONFIG_RPS
if (numqueues == 1 && static_key_false(&rps_needed)) { if (tun->numqueues == 1 && static_key_false(&rps_needed)) {
/* Select queue was not called for the skbuff, so we extract the /* Select queue was not called for the skbuff, so we extract the
* RPS hash and save it into the flow_table here. * RPS hash and save it into the flow_table here.
*/ */
...@@ -965,6 +982,26 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev) ...@@ -965,6 +982,26 @@ static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
} }
} }
#endif #endif
}
/* Net device start xmit */
static netdev_tx_t tun_net_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct tun_struct *tun = netdev_priv(dev);
int txq = skb->queue_mapping;
struct tun_file *tfile;
u32 numqueues = 0;
rcu_read_lock();
tfile = rcu_dereference(tun->tfiles[txq]);
numqueues = READ_ONCE(tun->numqueues);
/* Drop packet if interface is not attached */
if (txq >= numqueues)
goto drop;
if (!rcu_dereference(tun->steering_prog))
tun_automq_xmit(tun, skb);
tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len); tun_debug(KERN_INFO, tun, "tun_net_xmit %d\n", skb->len);
...@@ -1547,7 +1584,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, ...@@ -1547,7 +1584,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
int copylen; int copylen;
bool zerocopy = false; bool zerocopy = false;
int err; int err;
u32 rxhash; u32 rxhash = 0;
int skb_xdp = 1; int skb_xdp = 1;
bool frags = tun_napi_frags_enabled(tun); bool frags = tun_napi_frags_enabled(tun);
...@@ -1735,7 +1772,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, ...@@ -1735,7 +1772,10 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
rcu_read_unlock(); rcu_read_unlock();
} }
rcu_read_lock();
if (!rcu_dereference(tun->steering_prog))
rxhash = __skb_get_hash_symmetric(skb); rxhash = __skb_get_hash_symmetric(skb);
rcu_read_unlock();
if (frags) { if (frags) {
/* Exercise flow dissector code path. */ /* Exercise flow dissector code path. */
...@@ -1779,7 +1819,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile, ...@@ -1779,7 +1819,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
u64_stats_update_end(&stats->syncp); u64_stats_update_end(&stats->syncp);
put_cpu_ptr(stats); put_cpu_ptr(stats);
if (rxhash)
tun_flow_update(tun, rxhash, tfile); tun_flow_update(tun, rxhash, tfile);
return total_len; return total_len;
} }
...@@ -1987,6 +2029,36 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) ...@@ -1987,6 +2029,36 @@ static ssize_t tun_chr_read_iter(struct kiocb *iocb, struct iov_iter *to)
return ret; return ret;
} }
/*
 * RCU callback that releases a steering program wrapper.
 *
 * @rcu is the rcu_head embedded in a struct tun_steering_prog. The wrapped
 * BPF program is destroyed first, then the wrapper itself is freed.
 */
static void tun_steering_prog_free(struct rcu_head *rcu)
{
	struct tun_steering_prog *steering;

	steering = container_of(rcu, struct tun_steering_prog, rcu);
	bpf_prog_destroy(steering->prog);
	kfree(steering);
}
/*
 * Install @prog as the device's steering program, or detach the current one
 * when @prog is NULL.
 *
 * The current pointer is read with rtnl_dereference(). The previous wrapper,
 * if any, is released through call_rcu() rather than freed immediately.
 *
 * Returns 0 on success, or -ENOMEM if the wrapper cannot be allocated. On
 * -ENOMEM this function does not release @prog and the installed program is
 * left unchanged.
 */
static int __tun_set_steering_ebpf(struct tun_struct *tun,
				   struct bpf_prog *prog)
{
	struct tun_steering_prog *replacement = NULL;
	struct tun_steering_prog *previous;

	if (prog) {
		replacement = kmalloc(sizeof(*replacement), GFP_KERNEL);
		if (!replacement)
			return -ENOMEM;
		replacement->prog = prog;
	}

	previous = rtnl_dereference(tun->steering_prog);
	rcu_assign_pointer(tun->steering_prog, replacement);
	if (previous)
		call_rcu(&previous->rcu, tun_steering_prog_free);

	return 0;
}
static void tun_free_netdev(struct net_device *dev) static void tun_free_netdev(struct net_device *dev)
{ {
struct tun_struct *tun = netdev_priv(dev); struct tun_struct *tun = netdev_priv(dev);
...@@ -1995,6 +2067,9 @@ static void tun_free_netdev(struct net_device *dev) ...@@ -1995,6 +2067,9 @@ static void tun_free_netdev(struct net_device *dev)
free_percpu(tun->pcpu_stats); free_percpu(tun->pcpu_stats);
tun_flow_uninit(tun); tun_flow_uninit(tun);
security_tun_dev_free_security(tun->security); security_tun_dev_free_security(tun->security);
rtnl_lock();
__tun_set_steering_ebpf(tun, NULL);
rtnl_unlock();
} }
static void tun_setup(struct net_device *dev) static void tun_setup(struct net_device *dev)
...@@ -2283,6 +2358,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr) ...@@ -2283,6 +2358,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
tun->filter_attached = false; tun->filter_attached = false;
tun->sndbuf = tfile->socket.sk->sk_sndbuf; tun->sndbuf = tfile->socket.sk->sk_sndbuf;
tun->rx_batched = 0; tun->rx_batched = 0;
RCU_INIT_POINTER(tun->steering_prog, NULL);
tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats); tun->pcpu_stats = netdev_alloc_pcpu_stats(struct tun_pcpu_stats);
if (!tun->pcpu_stats) { if (!tun->pcpu_stats) {
...@@ -2475,6 +2551,25 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr) ...@@ -2475,6 +2551,25 @@ static int tun_set_queue(struct file *file, struct ifreq *ifr)
return ret; return ret;
} }
/*
 * TUNSETSTEERINGEBPF handler: attach or detach the steering eBPF program.
 *
 * @data points to a user-space int holding a BPF program file descriptor.
 * A value of -1 detaches the current program; any other value is looked up
 * as a BPF_PROG_TYPE_SOCKET_FILTER program.
 *
 * Returns 0 on success, -EFAULT if the descriptor cannot be copied from user
 * space, the error from bpf_prog_get_type() if the lookup fails, or the error
 * from __tun_set_steering_ebpf() if installation fails.
 */
static int tun_set_steering_ebpf(struct tun_struct *tun, void __user *data)
{
	struct bpf_prog *prog;
	int fd;
	int ret;

	if (copy_from_user(&fd, data, sizeof(fd)))
		return -EFAULT;

	if (fd == -1) {
		prog = NULL;
	} else {
		prog = bpf_prog_get_type(fd, BPF_PROG_TYPE_SOCKET_FILTER);
		if (IS_ERR(prog))
			return PTR_ERR(prog);
	}

	ret = __tun_set_steering_ebpf(tun, prog);
	/*
	 * If installation failed, the program was never handed to the device,
	 * so drop the reference taken by bpf_prog_get_type() to avoid leaking
	 * it.
	 */
	if (ret && prog)
		bpf_prog_put(prog);

	return ret;
}
static long __tun_chr_ioctl(struct file *file, unsigned int cmd, static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
unsigned long arg, int ifreq_len) unsigned long arg, int ifreq_len)
{ {
...@@ -2751,6 +2846,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd, ...@@ -2751,6 +2846,10 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
ret = 0; ret = 0;
break; break;
case TUNSETSTEERINGEBPF:
ret = tun_set_steering_ebpf(tun, argp);
break;
default: default:
ret = -EINVAL; ret = -EINVAL;
break; break;
......
...@@ -57,6 +57,7 @@ ...@@ -57,6 +57,7 @@
*/ */
#define TUNSETVNETBE _IOW('T', 222, int) #define TUNSETVNETBE _IOW('T', 222, int)
#define TUNGETVNETBE _IOR('T', 223, int) #define TUNGETVNETBE _IOR('T', 223, int)
#define TUNSETSTEERINGEBPF _IOR('T', 224, int)
/* TUNSETIFF ifr flags */ /* TUNSETIFF ifr flags */
#define IFF_TUN 0x0001 #define IFF_TUN 0x0001
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment