Commit 5bc1421e authored by Neil Horman, committed by David S. Miller

net: add network priority cgroup infrastructure (v4)

This patch adds in the infrastructure code to create the network priority
cgroup.  The cgroup, in addition to the standard processes file creates two
control files:

1) prioidx - This is a read-only file that exports the index of this cgroup.
This is a value that is both arbitrary and unique to a cgroup in this subsystem,
and is used to index the per-device priority map

2) priomap - This is a writeable file.  On read it reports a table of 2-tuples
<name:priority> where name is the name of a network interface and priority
indicates the priority assigned to frames egressing on the named interface and
originating from a pid in this cgroup

This cgroup allows for skb priority to be set prior to a root qdisc getting
selected. This is beneficial for DCB enabled systems, in that it allows any
application to use dcb configured priorities without application modification
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Signed-off-by: John Fastabend <john.r.fastabend@intel.com>
CC: Robert Love <robert.w.love@intel.com>
CC: "David S. Miller" <davem@davemloft.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 202ff1c2
...@@ -59,8 +59,16 @@ SUBSYS(net_cls) ...@@ -59,8 +59,16 @@ SUBSYS(net_cls)
SUBSYS(blkio) SUBSYS(blkio)
#endif #endif
/* */
#ifdef CONFIG_CGROUP_PERF #ifdef CONFIG_CGROUP_PERF
SUBSYS(perf) SUBSYS(perf)
#endif #endif
/* */ /* */
#ifdef CONFIG_NETPRIO_CGROUP
SUBSYS(net_prio)
#endif
/* */
...@@ -50,6 +50,7 @@ ...@@ -50,6 +50,7 @@
#ifdef CONFIG_DCB #ifdef CONFIG_DCB
#include <net/dcbnl.h> #include <net/dcbnl.h>
#endif #endif
#include <net/netprio_cgroup.h>
#include <linux/netdev_features.h> #include <linux/netdev_features.h>
...@@ -1244,6 +1245,9 @@ struct net_device { ...@@ -1244,6 +1245,9 @@ struct net_device {
#if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE) #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
/* max exchange id for FCoE LRO by ddp */ /* max exchange id for FCoE LRO by ddp */
unsigned int fcoe_ddp_xid; unsigned int fcoe_ddp_xid;
#endif
#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
struct netprio_map __rcu *priomap;
#endif #endif
/* phy device may attach itself for hardware timestamping */ /* phy device may attach itself for hardware timestamping */
struct phy_device *phydev; struct phy_device *phydev;
......
/*
* netprio_cgroup.h Control Group Priority set
*
*
* Authors: Neil Horman <nhorman@tuxdriver.com>
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
*/
#ifndef _NETPRIO_CGROUP_H
#define _NETPRIO_CGROUP_H
#include <linux/module.h>
#include <linux/cgroup.h>
#include <linux/hardirq.h>
#include <linux/rcupdate.h>
/*
 * Per-cgroup state of the net_prio subsystem.
 */
struct cgroup_netprio_state {
	/* generic cgroup subsystem state; container_of() is applied to it */
	struct cgroup_subsys_state css;
	/* index of this cgroup's slot in each device's priomap[] */
	u32 prioidx;
};
/*
 * Per-device priority table, published through net_device->priomap.
 * priomap[] is a flexible array of priomap_len entries indexed by a
 * cgroup's prioidx; rcu is the head handed to kfree_rcu() when a table is
 * replaced or dropped.
 */
struct netprio_map {
struct rcu_head rcu;
u32 priomap_len;
u32 priomap[];
};
#ifdef CONFIG_CGROUPS
#ifndef CONFIG_NETPRIO_CGROUP
extern int net_prio_subsys_id;
#endif
extern void sock_update_netprioidx(struct sock *sk);
/*
 * task_netprio_state - look up the net_prio cgroup state of a task
 * @p: task to inspect
 *
 * Returns the cgroup_netprio_state that embeds the task's net_prio
 * subsystem state, or NULL when the subsystem is compiled out or its
 * subsystem id is not (yet) valid.
 */
static inline struct cgroup_netprio_state
*task_netprio_state(struct task_struct *p)
{
#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
	int id = net_prio_subsys_id;

	/*
	 * When net_prio_subsys_id is a variable it holds -1 until the
	 * subsystem has been loaded and again after it is unloaded; a
	 * negative value must never be used as a subsystem index.
	 */
	if (id < 0)
		return NULL;

	return container_of(task_subsys_state(p, id),
			    struct cgroup_netprio_state, css);
#else
	return NULL;
#endif
}
#else
#define sock_update_netprioidx(sk)
#define skb_update_prio(skb)
/* Stub used when CONFIG_CGROUPS is off: no task has net_prio state. */
static inline struct cgroup_netprio_state *
task_netprio_state(struct task_struct *p)
{
	return NULL;
}
#endif
#endif /* _NETPRIO_CGROUP_H */
...@@ -320,6 +320,9 @@ struct sock { ...@@ -320,6 +320,9 @@ struct sock {
unsigned short sk_ack_backlog; unsigned short sk_ack_backlog;
unsigned short sk_max_ack_backlog; unsigned short sk_max_ack_backlog;
__u32 sk_priority; __u32 sk_priority;
#ifdef CONFIG_CGROUPS
__u32 sk_cgrp_prioidx;
#endif
struct pid *sk_peer_pid; struct pid *sk_peer_pid;
const struct cred *sk_peer_cred; const struct cred *sk_peer_cred;
long sk_rcvtimeo; long sk_rcvtimeo;
......
...@@ -232,6 +232,13 @@ config XPS ...@@ -232,6 +232,13 @@ config XPS
depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS depends on SMP && SYSFS && USE_GENERIC_SMP_HELPERS
default y default y
config NETPRIO_CGROUP
tristate "Network priority cgroup"
depends on CGROUPS
---help---
Cgroup subsystem for use in assigning processes to network priorities on
a per-interface basis
config HAVE_BPF_JIT config HAVE_BPF_JIT
bool bool
......
...@@ -19,3 +19,4 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o ...@@ -19,3 +19,4 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o
obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o
obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o
...@@ -2449,6 +2449,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, ...@@ -2449,6 +2449,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
return rc; return rc;
} }
#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
static void skb_update_prio(struct sk_buff *skb)
{
struct netprio_map *map = rcu_dereference(skb->dev->priomap);
if ((!skb->priority) && (skb->sk) && map)
skb->priority = map->priomap[skb->sk->sk_cgrp_prioidx];
}
#else
#define skb_update_prio(skb)
#endif
static DEFINE_PER_CPU(int, xmit_recursion); static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10 #define RECURSION_LIMIT 10
...@@ -2489,6 +2501,8 @@ int dev_queue_xmit(struct sk_buff *skb) ...@@ -2489,6 +2501,8 @@ int dev_queue_xmit(struct sk_buff *skb)
*/ */
rcu_read_lock_bh(); rcu_read_lock_bh();
skb_update_prio(skb);
txq = dev_pick_tx(dev, skb); txq = dev_pick_tx(dev, skb);
q = rcu_dereference_bh(txq->qdisc); q = rcu_dereference_bh(txq->qdisc);
......
/*
* net/core/netprio_cgroup.c Priority Control Group
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
* Authors: Neil Horman <nhorman@tuxdriver.com>
*/
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <linux/cgroup.h>
#include <linux/rcupdate.h>
#include <linux/atomic.h>
#include <net/rtnetlink.h>
#include <net/pkt_cls.h>
#include <net/sock.h>
#include <net/netprio_cgroup.h>
/*
 * Forward declarations of the cgroup callbacks referenced by the
 * net_prio_subsys initializer; the definitions follow later in this file.
 */
static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
struct cgroup *cgrp);
static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
/*
 * Descriptor of the "net_prio" cgroup subsystem.  The subsystem id is
 * filled in here only when CONFIG_NETPRIO_CGROUP is defined; otherwise
 * init_cgroup_netprio() copies net_prio_subsys.subsys_id into
 * net_prio_subsys_id after cgroup_load_subsys().
 */
struct cgroup_subsys net_prio_subsys = {
	.name		= "net_prio",
	.module		= THIS_MODULE,
	.create		= cgrp_create,
	.destroy	= cgrp_destroy,
	.populate	= cgrp_populate,
#ifdef CONFIG_NETPRIO_CGROUP
	.subsys_id	= net_prio_subsys_id,
#endif
};
/* Number of unsigned long words backing the priority-index bitmap. */
#define PRIOIDX_SZ 128
/* Bitmap of priority indices; a set bit marks an index that is in use. */
static unsigned long prioidx_map[PRIOIDX_SZ];
/* Taken (irqsave) around every search/set/clear of prioidx_map. */
static DEFINE_SPINLOCK(prioidx_map_lock);
/*
 * Index value recorded by get_prioidx(); read when sizing the per-device
 * priority tables.
 */
static atomic_t max_prioidx = ATOMIC_INIT(0);
/* Map a cgroup to the net_prio state that embeds its subsystem state. */
static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp)
{
	struct cgroup_subsys_state *subsys_state;

	subsys_state = cgroup_subsys_state(cgrp, net_prio_subsys_id);

	return container_of(subsys_state, struct cgroup_netprio_state, css);
}
/*
 * get_prioidx - allocate a free priority index
 * @prio: where the allocated index is stored on success
 *
 * Returns 0 on success or -ENOSPC when no free index is left.  On failure
 * neither the bitmap nor max_prioidx is modified.
 */
static int get_prioidx(u32 *prio)
{
	/*
	 * NOTE(review): the search limit is computed from sizeof(), i.e. in
	 * bytes rather than bits, so only part of prioidx_map is ever used.
	 * Kept as is so the number of available indices does not change.
	 */
	const unsigned long limit = sizeof(unsigned long) * PRIOIDX_SZ;
	unsigned long flags;
	u32 prioidx;

	spin_lock_irqsave(&prioidx_map_lock, flags);
	prioidx = find_first_zero_bit(prioidx_map, limit);
	if (prioidx == limit) {
		/* Bitmap exhausted: do not mark the out-of-range bit. */
		spin_unlock_irqrestore(&prioidx_map_lock, flags);
		return -ENOSPC;
	}
	set_bit(prioidx, prioidx_map);
	/*
	 * max_prioidx must only grow: a freed low index may be handed out
	 * again while a higher index is still in use.
	 */
	if (atomic_read(&max_prioidx) < prioidx)
		atomic_set(&max_prioidx, prioidx);
	spin_unlock_irqrestore(&prioidx_map_lock, flags);

	*prio = prioidx;
	return 0;
}
/*
 * put_prioidx - release a priority index
 * @idx: index whose bit is cleared in prioidx_map
 */
static void put_prioidx(u32 idx)
{
	unsigned long irqflags;

	spin_lock_irqsave(&prioidx_map_lock, irqflags);
	clear_bit(idx, prioidx_map);
	spin_unlock_irqrestore(&prioidx_map_lock, irqflags);
}
/*
 * extend_netdev_table - replace a device's priority table with a new one
 * @dev: device whose priomap is replaced
 * @new_len: number of entries in the new table
 *
 * Allocates a zeroed table of @new_len entries, copies over the existing
 * entries that fit, publishes the new table and frees the old one after a
 * grace period.  Uses rtnl_dereference(), so the caller must hold RTNL.
 * On allocation failure a warning is printed and the old table is kept.
 */
static void extend_netdev_table(struct net_device *dev, u32 new_len)
{
	size_t new_size = sizeof(struct netprio_map) +
			  ((sizeof(u32) * new_len));
	struct netprio_map *new_priomap = kzalloc(new_size, GFP_KERNEL);
	struct netprio_map *old_priomap;
	u32 copy_len;
	u32 i;

	if (!new_priomap) {
		printk(KERN_WARNING "Unable to alloc new priomap!\n");
		return;
	}

	old_priomap = rtnl_dereference(dev->priomap);

	/* Never copy more entries than the new table can hold. */
	copy_len = old_priomap ? old_priomap->priomap_len : 0;
	if (copy_len > new_len)
		copy_len = new_len;

	for (i = 0; i < copy_len; i++)
		new_priomap->priomap[i] = old_priomap->priomap[i];

	new_priomap->priomap_len = new_len;

	rcu_assign_pointer(dev->priomap, new_priomap);
	if (old_priomap)
		kfree_rcu(old_priomap, rcu);
}
/*
 * update_netdev_tables - make every device table cover all priority indices
 *
 * Walks the devices of init_net under RTNL and (re)allocates the priority
 * table of each device that has none or whose table is too short.
 */
static void update_netdev_tables(void)
{
	struct net_device *dev;
	/*
	 * max_prioidx is itself a valid index, so a table needs
	 * max_prioidx + 1 entries to hold slots 0..max_prioidx.
	 */
	u32 max_len = atomic_read(&max_prioidx) + 1;
	struct netprio_map *map;

	rtnl_lock();
	for_each_netdev(&init_net, dev) {
		map = rtnl_dereference(dev->priomap);
		if ((!map) ||
		    (map->priomap_len < max_len))
			extend_netdev_table(dev, max_len);
	}
	rtnl_unlock();
}
/*
 * cgrp_create - allocate the net_prio state for a new cgroup
 * @ss: subsystem descriptor (unused here)
 * @cgrp: cgroup being created
 *
 * Returns the embedded css on success, or an ERR_PTR: -ENOMEM when the
 * state cannot be allocated, -EINVAL when the parent cgroup has a non-zero
 * prioidx, or the error reported by get_prioidx().
 */
static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
					       struct cgroup *cgrp)
{
	struct cgroup_netprio_state *state;
	int err;

	state = kzalloc(sizeof(*state), GFP_KERNEL);
	if (!state)
		return ERR_PTR(-ENOMEM);

	/* Refuse children of any cgroup that carries a non-zero index. */
	err = -EINVAL;
	if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx)
		goto out_free;

	err = get_prioidx(&state->prioidx);
	if (err) {
		printk(KERN_WARNING "No space in priority index array\n");
		goto out_free;
	}

	return &state->css;

out_free:
	kfree(state);
	return ERR_PTR(err);
}
/*
 * cgrp_destroy - tear down the net_prio state of a cgroup
 * @ss: subsystem descriptor (unused here)
 * @cgrp: cgroup being destroyed
 *
 * Resets this cgroup's slot in every init_net device table, releases its
 * priority index and frees the state.
 */
static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
	struct cgroup_netprio_state *cs;
	struct net_device *dev;
	struct netprio_map *map;

	cs = cgrp_netprio_state(cgrp);
	rtnl_lock();
	for_each_netdev(&init_net, dev) {
		map = rtnl_dereference(dev->priomap);
		/*
		 * Device tables are only grown on demand, so this cgroup's
		 * slot may lie beyond the end of a given table.
		 */
		if (map && cs->prioidx < map->priomap_len)
			map->priomap[cs->prioidx] = 0;
	}
	rtnl_unlock();
	put_prioidx(cs->prioidx);
	kfree(cs);
}
static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft)
{
return (u64)cgrp_netprio_state(cgrp)->prioidx;
}
/*
 * read_priomap - read_map handler of the "ifpriomap" file
 * @cont: cgroup being read
 * @cft: file descriptor entry (unused here)
 * @cb: map callback used to emit one <device name, priority> pair per device
 *
 * Reports, for every device in init_net, the priority stored in this
 * cgroup's slot, or 0 when the device has no table or the table does not
 * reach this cgroup's index.  Always returns 0.
 */
static int read_priomap(struct cgroup *cont, struct cftype *cft,
			struct cgroup_map_cb *cb)
{
	struct net_device *dev;
	u32 prioidx = cgrp_netprio_state(cont)->prioidx;
	u32 priority;
	struct netprio_map *map;

	rcu_read_lock();
	for_each_netdev_rcu(&init_net, dev) {
		map = rcu_dereference(dev->priomap);
		/* Never read past the end of a table that is too short. */
		priority = 0;
		if (map && prioidx < map->priomap_len)
			priority = map->priomap[prioidx];
		cb->fill(cb, dev->name, priority);
	}
	rcu_read_unlock();
	return 0;
}
/*
 * write_priomap - write_string handler of the "ifpriomap" file
 * @cgrp: cgroup being written
 * @cft: file descriptor entry (unused here)
 * @buffer: "<devname> <priority>", priority in decimal
 *
 * Stores @priority in this cgroup's slot of the named init_net device.
 *
 * Returns 0 on success, -ENOMEM when the input cannot be duplicated or the
 * device table cannot hold this cgroup's slot, -EINVAL for a malformed
 * string, the kstrtoul() error for a bad number, or -ENODEV when the
 * device does not exist.
 */
static int write_priomap(struct cgroup *cgrp, struct cftype *cft,
			 const char *buffer)
{
	char *devname = kstrdup(buffer, GFP_KERNEL);
	int ret = -EINVAL;
	u32 prioidx = cgrp_netprio_state(cgrp)->prioidx;
	unsigned long priority;
	char *priostr;
	struct net_device *dev;
	struct netprio_map *map;

	if (!devname)
		return -ENOMEM;

	/*
	 * Minimally sized valid priomap string
	 */
	if (strlen(devname) < 3)
		goto out_free_devname;

	priostr = strstr(devname, " ");
	if (!priostr)
		goto out_free_devname;

	/*
	 * Separate the devname from the associated priority
	 * and advance the priostr pointer to the priority value
	 */
	*priostr = '\0';
	priostr++;

	/*
	 * If priostr points at the terminating NUL, we're at the end of the
	 * passed in string, and it's not a valid write
	 */
	if (*priostr == '\0')
		goto out_free_devname;

	ret = kstrtoul(priostr, 10, &priority);
	if (ret < 0)
		goto out_free_devname;

	ret = -ENODEV;

	dev = dev_get_by_name(&init_net, devname);
	if (!dev)
		goto out_free_devname;

	update_netdev_tables();

	/*
	 * Modify the table under RTNL, the same lock under which tables are
	 * replaced and cleared, so the store cannot land in a table that is
	 * concurrently being swapped out.
	 */
	rtnl_lock();
	map = rtnl_dereference(dev->priomap);
	/*
	 * An existing table that does not reach this cgroup's slot is grown
	 * here.  A NULL table is left alone: update_netdev_tables() gives one
	 * to every device it walks, so NULL means that allocation failed or
	 * the device was not on the init_net list.
	 */
	if (map && map->priomap_len <= prioidx) {
		extend_netdev_table(dev, prioidx + 1);
		map = rtnl_dereference(dev->priomap);
	}

	/* Report failure instead of silently dropping the value. */
	ret = -ENOMEM;
	if (map && prioidx < map->priomap_len) {
		map->priomap[prioidx] = priority;
		ret = 0;
	}
	rtnl_unlock();

	dev_put(dev);

out_free_devname:
	kfree(devname);
	return ret;
}
/* Control files this subsystem adds to each cgroup (see cgrp_populate()). */
static struct cftype ss_files[] = {
	{	/* read-only: the cgroup's priority index */
		.name		= "prioidx",
		.read_u64	= read_prioidx,
	},
	{	/* per-interface priority map, readable and writable */
		.name		= "ifpriomap",
		.read_map	= read_priomap,
		.write_string	= write_priomap,
	},
};
static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
{
return cgroup_add_files(cgrp, ss, ss_files, ARRAY_SIZE(ss_files));
}
static int netprio_device_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
struct net_device *dev = ptr;
struct netprio_map *old;
u32 max_len = atomic_read(&max_prioidx);
/*
* Note this is called with rtnl_lock held so we have update side
* protection on our rcu assignments
*/
switch (event) {
case NETDEV_REGISTER:
if (max_len)
extend_netdev_table(dev, max_len);
break;
case NETDEV_UNREGISTER:
old = rtnl_dereference(dev->priomap);
rcu_assign_pointer(dev->priomap, NULL);
if (old)
kfree_rcu(old, rcu);
break;
}
return NOTIFY_DONE;
}
/* Notifier block that routes netdevice events to netprio_device_event(). */
static struct notifier_block netprio_device_notifier = {
	.notifier_call	= netprio_device_event,
};
/*
 * init_cgroup_netprio - module entry point
 *
 * Loads the net_prio subsystem, publishes its id when the id is kept in a
 * variable, and registers the netdevice notifier.  Returns the result of
 * cgroup_load_subsys().
 */
static int __init init_cgroup_netprio(void)
{
	int err = cgroup_load_subsys(&net_prio_subsys);

	if (err)
		return err;

#ifndef CONFIG_NETPRIO_CGROUP
	/* Write barrier issued before the subsystem id is published. */
	smp_wmb();
	net_prio_subsys_id = net_prio_subsys.subsys_id;
#endif

	register_netdevice_notifier(&netprio_device_notifier);

	return err;
}
/*
 * exit_cgroup_netprio - module exit point
 *
 * Unregisters the notifier and the subsystem, invalidates the published
 * subsystem id when it is kept in a variable, then detaches and frees the
 * priority table of every device in init_net.
 */
static void __exit exit_cgroup_netprio(void)
{
	struct net_device *netdev;
	struct netprio_map *stale;

	unregister_netdevice_notifier(&netprio_device_notifier);
	cgroup_unload_subsys(&net_prio_subsys);

#ifndef CONFIG_NETPRIO_CGROUP
	net_prio_subsys_id = -1;
	synchronize_rcu();
#endif

	rtnl_lock();
	for_each_netdev(&init_net, netdev) {
		stale = rtnl_dereference(netdev->priomap);
		rcu_assign_pointer(netdev->priomap, NULL);
		if (!stale)
			continue;
		kfree_rcu(stale, rcu);
	}
	rtnl_unlock();
}
/* Module entry/exit hooks and license declaration. */
module_init(init_cgroup_netprio);
module_exit(exit_cgroup_netprio);
MODULE_LICENSE("GPL v2");
...@@ -125,6 +125,7 @@ ...@@ -125,6 +125,7 @@
#include <net/xfrm.h> #include <net/xfrm.h>
#include <linux/ipsec.h> #include <linux/ipsec.h>
#include <net/cls_cgroup.h> #include <net/cls_cgroup.h>
#include <net/netprio_cgroup.h>
#include <linux/filter.h> #include <linux/filter.h>
...@@ -221,10 +222,16 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX; ...@@ -221,10 +222,16 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512); int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max); EXPORT_SYMBOL(sysctl_optmem_max);
#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP) #if defined(CONFIG_CGROUPS)
#if !defined(CONFIG_NET_CLS_CGROUP)
int net_cls_subsys_id = -1; int net_cls_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_cls_subsys_id); EXPORT_SYMBOL_GPL(net_cls_subsys_id);
#endif #endif
#if !defined(CONFIG_NETPRIO_CGROUP)
int net_prio_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_prio_subsys_id);
#endif
#endif
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen) static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{ {
...@@ -1120,6 +1127,18 @@ void sock_update_classid(struct sock *sk) ...@@ -1120,6 +1127,18 @@ void sock_update_classid(struct sock *sk)
sk->sk_classid = classid; sk->sk_classid = classid;
} }
EXPORT_SYMBOL(sock_update_classid); EXPORT_SYMBOL(sock_update_classid);
void sock_update_netprioidx(struct sock *sk)
{
struct cgroup_netprio_state *state;
if (in_interrupt())
return;
rcu_read_lock();
state = task_netprio_state(current);
sk->sk_cgrp_prioidx = state ? state->prioidx : 0;
rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif #endif
/** /**
...@@ -1147,6 +1166,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, ...@@ -1147,6 +1166,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
atomic_set(&sk->sk_wmem_alloc, 1); atomic_set(&sk->sk_wmem_alloc, 1);
sock_update_classid(sk); sock_update_classid(sk);
sock_update_netprioidx(sk);
} }
return sk; return sk;
......
...@@ -551,6 +551,8 @@ static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock, ...@@ -551,6 +551,8 @@ static inline int __sock_sendmsg_nosec(struct kiocb *iocb, struct socket *sock,
sock_update_classid(sock->sk); sock_update_classid(sock->sk);
sock_update_netprioidx(sock->sk);
si->sock = sock; si->sock = sock;
si->scm = NULL; si->scm = NULL;
si->msg = msg; si->msg = msg;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment