Commit 1c0d32fd authored by Eric Dumazet's avatar Eric Dumazet Committed by David S. Miller

net_sched: gen_estimator: complete rewrite of rate estimators

1) Old code was hard to maintain, due to complex lock chains.
   (We probably will be able to remove some kfree_rcu() in callers)

2) Using a single timer to update all estimators does not scale.

3) Code was buggy on 32bit kernel (WRITE_ONCE() on 64bit quantity
   is not supposed to work well)

In this rewrite :

- I removed the RB tree that had to be scanned in
  gen_estimator_active(). qdisc dumps should be much faster.

- Each estimator has its own timer.

- Estimations are maintained in net_rate_estimator structure,
  instead of dirtying the qdisc. Minor, but part of the simplification.

- Reading the estimator uses RCU and a seqcount to provide proper
  support for 32bit kernels.

- We reduce memory need when estimators are not used, since
  we store a pointer, instead of the bytes/packets counters.

- xt_rateest_mt() no longer has to grab a spinlock.
  (In the future, xt_rateest_tg() could be switched to per cpu counters)
Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent a6e16931
......@@ -36,7 +36,7 @@ struct tc_action {
struct tcf_t tcfa_tm;
struct gnet_stats_basic_packed tcfa_bstats;
struct gnet_stats_queue tcfa_qstats;
struct gnet_stats_rate_est64 tcfa_rate_est;
struct net_rate_estimator __rcu *tcfa_rate_est;
spinlock_t tcfa_lock;
struct rcu_head tcfa_rcu;
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
......
......@@ -11,6 +11,8 @@ struct gnet_stats_basic_cpu {
struct u64_stats_sync syncp;
};
struct net_rate_estimator;
struct gnet_dump {
spinlock_t * lock;
struct sk_buff * skb;
......@@ -42,8 +44,7 @@ void __gnet_stats_copy_basic(const seqcount_t *running,
struct gnet_stats_basic_cpu __percpu *cpu,
struct gnet_stats_basic_packed *b);
int gnet_stats_copy_rate_est(struct gnet_dump *d,
const struct gnet_stats_basic_packed *b,
struct gnet_stats_rate_est64 *r);
struct net_rate_estimator __rcu **ptr);
int gnet_stats_copy_queue(struct gnet_dump *d,
struct gnet_stats_queue __percpu *cpu_q,
struct gnet_stats_queue *q, __u32 qlen);
......@@ -53,16 +54,16 @@ int gnet_stats_finish_copy(struct gnet_dump *d);
int gen_new_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct gnet_stats_rate_est64 *rate_est,
struct net_rate_estimator __rcu **rate_est,
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt);
void gen_kill_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_rate_est64 *rate_est);
void gen_kill_estimator(struct net_rate_estimator __rcu **ptr);
int gen_replace_estimator(struct gnet_stats_basic_packed *bstats,
struct gnet_stats_basic_cpu __percpu *cpu_bstats,
struct gnet_stats_rate_est64 *rate_est,
struct net_rate_estimator __rcu **ptr,
spinlock_t *stats_lock,
seqcount_t *running, struct nlattr *opt);
bool gen_estimator_active(const struct gnet_stats_basic_packed *bstats,
const struct gnet_stats_rate_est64 *rate_est);
bool gen_estimator_active(struct net_rate_estimator __rcu **ptr);
bool gen_estimator_read(struct net_rate_estimator __rcu **ptr,
struct gnet_stats_rate_est64 *sample);
#endif
#ifndef _XT_RATEEST_H
#define _XT_RATEEST_H
#include <net/gen_stats.h>
struct xt_rateest {
/* keep lock and bstats on same cache line to speedup xt_rateest_tg() */
struct gnet_stats_basic_packed bstats;
spinlock_t lock;
/* keep rstats and lock on same cache line to speedup xt_rateest_mt() */
struct gnet_stats_rate_est64 rstats;
/* following fields not accessed in hot path */
unsigned int refcnt;
struct hlist_node list;
char name[IFNAMSIZ];
unsigned int refcnt;
struct gnet_estimator params;
struct rcu_head rcu;
/* keep this field far away to speedup xt_rateest_mt() */
struct net_rate_estimator __rcu *rate_est;
};
struct xt_rateest *xt_rateest_lookup(const char *name);
......
......@@ -76,7 +76,7 @@ struct Qdisc {
struct netdev_queue *dev_queue;
struct gnet_stats_rate_est64 rate_est;
struct net_rate_estimator __rcu *rate_est;
struct gnet_stats_basic_cpu __percpu *cpu_bstats;
struct gnet_stats_queue __percpu *cpu_qstats;
......
This diff is collapsed.
......@@ -194,8 +194,7 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
/**
* gnet_stats_copy_rate_est - copy rate estimator statistics into statistics TLV
* @d: dumping handle
* @b: basic statistics
* @r: rate estimator statistics
* @rate_est: rate estimator
*
* Appends the rate estimator statistics to the top level TLV created by
* gnet_stats_start_copy().
......@@ -205,18 +204,17 @@ EXPORT_SYMBOL(gnet_stats_copy_basic);
*/
int
gnet_stats_copy_rate_est(struct gnet_dump *d,
const struct gnet_stats_basic_packed *b,
struct gnet_stats_rate_est64 *r)
struct net_rate_estimator __rcu **rate_est)
{
struct gnet_stats_rate_est64 sample;
struct gnet_stats_rate_est est;
int res;
if (b && !gen_estimator_active(b, r))
if (!gen_estimator_read(rate_est, &sample))
return 0;
est.bps = min_t(u64, UINT_MAX, r->bps);
est.bps = min_t(u64, UINT_MAX, sample.bps);
/* we have some time before reaching 2^32 packets per second */
est.pps = r->pps;
est.pps = sample.pps;
if (d->compat_tc_stats) {
d->tc_stats.bps = est.bps;
......@@ -226,11 +224,11 @@ gnet_stats_copy_rate_est(struct gnet_dump *d,
if (d->tail) {
res = gnet_stats_copy(d, TCA_STATS_RATE_EST, &est, sizeof(est),
TCA_STATS_PAD);
if (res < 0 || est.bps == r->bps)
if (res < 0 || est.bps == sample.bps)
return res;
/* emit 64bit stats only if needed */
return gnet_stats_copy(d, TCA_STATS_RATE_EST64, r, sizeof(*r),
TCA_STATS_PAD);
return gnet_stats_copy(d, TCA_STATS_RATE_EST64, &sample,
sizeof(sample), TCA_STATS_PAD);
}
return 0;
......
......@@ -63,7 +63,7 @@ void xt_rateest_put(struct xt_rateest *est)
mutex_lock(&xt_rateest_mutex);
if (--est->refcnt == 0) {
hlist_del(&est->list);
gen_kill_estimator(&est->bstats, &est->rstats);
gen_kill_estimator(&est->rate_est);
/*
* gen_estimator est_timer() might access est->lock or bstats,
* wait a RCU grace period before freeing 'est'
......@@ -132,7 +132,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par)
cfg.est.interval = info->interval;
cfg.est.ewma_log = info->ewma_log;
ret = gen_new_estimator(&est->bstats, NULL, &est->rstats,
ret = gen_new_estimator(&est->bstats, NULL, &est->rate_est,
&est->lock, NULL, &cfg.opt);
if (ret < 0)
goto err2;
......
......@@ -18,35 +18,33 @@ static bool
xt_rateest_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_rateest_match_info *info = par->matchinfo;
struct gnet_stats_rate_est64 *r;
struct gnet_stats_rate_est64 sample = {0};
u_int32_t bps1, bps2, pps1, pps2;
bool ret = true;
spin_lock_bh(&info->est1->lock);
r = &info->est1->rstats;
gen_estimator_read(&info->est1->rate_est, &sample);
if (info->flags & XT_RATEEST_MATCH_DELTA) {
bps1 = info->bps1 >= r->bps ? info->bps1 - r->bps : 0;
pps1 = info->pps1 >= r->pps ? info->pps1 - r->pps : 0;
bps1 = info->bps1 >= sample.bps ? info->bps1 - sample.bps : 0;
pps1 = info->pps1 >= sample.pps ? info->pps1 - sample.pps : 0;
} else {
bps1 = r->bps;
pps1 = r->pps;
bps1 = sample.bps;
pps1 = sample.pps;
}
spin_unlock_bh(&info->est1->lock);
if (info->flags & XT_RATEEST_MATCH_ABS) {
bps2 = info->bps2;
pps2 = info->pps2;
} else {
spin_lock_bh(&info->est2->lock);
r = &info->est2->rstats;
gen_estimator_read(&info->est2->rate_est, &sample);
if (info->flags & XT_RATEEST_MATCH_DELTA) {
bps2 = info->bps2 >= r->bps ? info->bps2 - r->bps : 0;
pps2 = info->pps2 >= r->pps ? info->pps2 - r->pps : 0;
bps2 = info->bps2 >= sample.bps ? info->bps2 - sample.bps : 0;
pps2 = info->pps2 >= sample.pps ? info->pps2 - sample.pps : 0;
} else {
bps2 = r->bps;
pps2 = r->pps;
bps2 = sample.bps;
pps2 = sample.pps;
}
spin_unlock_bh(&info->est2->lock);
}
switch (info->mode) {
......
......@@ -41,8 +41,7 @@ static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *p)
spin_lock_bh(&hinfo->lock);
hlist_del(&p->tcfa_head);
spin_unlock_bh(&hinfo->lock);
gen_kill_estimator(&p->tcfa_bstats,
&p->tcfa_rate_est);
gen_kill_estimator(&p->tcfa_rate_est);
/*
* gen_estimator est_timer() might access p->tcfa_lock
* or bstats, wait a RCU grace period before freeing p
......@@ -237,8 +236,7 @@ EXPORT_SYMBOL(tcf_hash_check);
void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
{
if (est)
gen_kill_estimator(&a->tcfa_bstats,
&a->tcfa_rate_est);
gen_kill_estimator(&a->tcfa_rate_est);
call_rcu(&a->tcfa_rcu, free_tcf);
}
EXPORT_SYMBOL(tcf_hash_cleanup);
......@@ -670,8 +668,7 @@ int tcf_action_copy_stats(struct sk_buff *skb, struct tc_action *p,
goto errout;
if (gnet_stats_copy_basic(NULL, &d, p->cpu_bstats, &p->tcfa_bstats) < 0 ||
gnet_stats_copy_rate_est(&d, &p->tcfa_bstats,
&p->tcfa_rate_est) < 0 ||
gnet_stats_copy_rate_est(&d, &p->tcfa_rate_est) < 0 ||
gnet_stats_copy_queue(&d, p->cpu_qstats,
&p->tcfa_qstats,
p->tcfa_qstats.qlen) < 0)
......
......@@ -142,8 +142,7 @@ static int tcf_act_police_init(struct net *net, struct nlattr *nla,
goto failure_unlock;
} else if (tb[TCA_POLICE_AVRATE] &&
(ret == ACT_P_CREATED ||
!gen_estimator_active(&police->tcf_bstats,
&police->tcf_rate_est))) {
!gen_estimator_active(&police->tcf_rate_est))) {
err = -EINVAL;
goto failure_unlock;
}
......@@ -216,13 +215,17 @@ static int tcf_act_police(struct sk_buff *skb, const struct tc_action *a,
bstats_update(&police->tcf_bstats, skb);
tcf_lastuse_update(&police->tcf_tm);
if (police->tcfp_ewma_rate &&
police->tcf_rate_est.bps >= police->tcfp_ewma_rate) {
police->tcf_qstats.overlimits++;
if (police->tcf_action == TC_ACT_SHOT)
police->tcf_qstats.drops++;
spin_unlock(&police->tcf_lock);
return police->tcf_action;
if (police->tcfp_ewma_rate) {
struct gnet_stats_rate_est64 sample;
if (!gen_estimator_read(&police->tcf_rate_est, &sample) ||
sample.bps >= police->tcfp_ewma_rate) {
police->tcf_qstats.overlimits++;
if (police->tcf_action == TC_ACT_SHOT)
police->tcf_qstats.drops++;
spin_unlock(&police->tcf_lock);
return police->tcf_action;
}
}
if (qdisc_pkt_len(skb) <= police->tcfp_mtu) {
......
......@@ -1395,7 +1395,7 @@ static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
&d, cpu_bstats, &q->bstats) < 0 ||
gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
goto nla_put_failure;
......
......@@ -122,7 +122,7 @@ struct cbq_class {
psched_time_t penalized;
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
struct gnet_stats_rate_est64 rate_est;
struct net_rate_estimator __rcu *rate_est;
struct tc_cbq_xstats xstats;
struct tcf_proto __rcu *filter_list;
......@@ -1346,7 +1346,7 @@ cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
return -1;
......@@ -1405,7 +1405,7 @@ static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
tcf_destroy_chain(&cl->filter_list);
qdisc_destroy(cl->q);
qdisc_put_rtab(cl->R_tab);
gen_kill_estimator(&cl->bstats, &cl->rate_est);
gen_kill_estimator(&cl->rate_est);
if (cl != &q->link)
kfree(cl);
}
......
......@@ -25,7 +25,7 @@ struct drr_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
struct gnet_stats_rate_est64 rate_est;
struct net_rate_estimator __rcu *rate_est;
struct list_head alist;
struct Qdisc *qdisc;
......@@ -142,7 +142,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
static void drr_destroy_class(struct Qdisc *sch, struct drr_class *cl)
{
gen_kill_estimator(&cl->bstats, &cl->rate_est);
gen_kill_estimator(&cl->rate_est);
qdisc_destroy(cl->qdisc);
kfree(cl);
}
......@@ -283,7 +283,7 @@ static int drr_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qdisc->qstats, qlen) < 0)
return -1;
......
......@@ -709,7 +709,7 @@ void qdisc_destroy(struct Qdisc *qdisc)
qdisc_put_stab(rtnl_dereference(qdisc->stab));
#endif
gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est);
gen_kill_estimator(&qdisc->rate_est);
if (ops->reset)
ops->reset(qdisc);
if (ops->destroy)
......
......@@ -114,7 +114,7 @@ struct hfsc_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
struct gnet_stats_rate_est64 rate_est;
struct net_rate_estimator __rcu *rate_est;
struct tcf_proto __rcu *filter_list; /* filter list */
unsigned int filter_cnt; /* filter count */
unsigned int level; /* class level in hierarchy */
......@@ -1091,7 +1091,7 @@ hfsc_destroy_class(struct Qdisc *sch, struct hfsc_class *cl)
tcf_destroy_chain(&cl->filter_list);
qdisc_destroy(cl->qdisc);
gen_kill_estimator(&cl->bstats, &cl->rate_est);
gen_kill_estimator(&cl->rate_est);
if (cl != &q->root)
kfree(cl);
}
......@@ -1348,7 +1348,7 @@ hfsc_dump_class_stats(struct Qdisc *sch, unsigned long arg,
xstats.rtwork = cl->cl_cumul;
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch), d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->qdisc->q.qlen) < 0)
return -1;
......
......@@ -111,7 +111,7 @@ struct htb_class {
unsigned int children;
struct htb_class *parent; /* parent class */
struct gnet_stats_rate_est64 rate_est;
struct net_rate_estimator __rcu *rate_est;
/*
* Written often fields
......@@ -1145,7 +1145,7 @@ htb_dump_class_stats(struct Qdisc *sch, unsigned long arg, struct gnet_dump *d)
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL, &qs, qlen) < 0)
return -1;
......@@ -1228,7 +1228,7 @@ static void htb_destroy_class(struct Qdisc *sch, struct htb_class *cl)
WARN_ON(!cl->un.leaf.q);
qdisc_destroy(cl->un.leaf.q);
}
gen_kill_estimator(&cl->bstats, &cl->rate_est);
gen_kill_estimator(&cl->rate_est);
tcf_destroy_chain(&cl->filter_list);
kfree(cl);
}
......
......@@ -137,7 +137,7 @@ struct qfq_class {
struct gnet_stats_basic_packed bstats;
struct gnet_stats_queue qstats;
struct gnet_stats_rate_est64 rate_est;
struct net_rate_estimator __rcu *rate_est;
struct Qdisc *qdisc;
struct list_head alist; /* Link for active-classes list. */
struct qfq_aggregate *agg; /* Parent aggregate. */
......@@ -508,7 +508,7 @@ static int qfq_change_class(struct Qdisc *sch, u32 classid, u32 parentid,
new_agg = kzalloc(sizeof(*new_agg), GFP_KERNEL);
if (new_agg == NULL) {
err = -ENOBUFS;
gen_kill_estimator(&cl->bstats, &cl->rate_est);
gen_kill_estimator(&cl->rate_est);
goto destroy_class;
}
sch_tree_lock(sch);
......@@ -533,7 +533,7 @@ static void qfq_destroy_class(struct Qdisc *sch, struct qfq_class *cl)
struct qfq_sched *q = qdisc_priv(sch);
qfq_rm_from_agg(q, cl);
gen_kill_estimator(&cl->bstats, &cl->rate_est);
gen_kill_estimator(&cl->rate_est);
qdisc_destroy(cl->qdisc);
kfree(cl);
}
......@@ -667,7 +667,7 @@ static int qfq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
d, NULL, &cl->bstats) < 0 ||
gnet_stats_copy_rate_est(d, &cl->bstats, &cl->rate_est) < 0 ||
gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
gnet_stats_copy_queue(d, NULL,
&cl->qdisc->qstats, cl->qdisc->q.qlen) < 0)
return -1;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment