Commit cd2809cc authored by David S. Miller's avatar David S. Miller

Merge branch 'Implement-classifier-action-terse-dump-mode'

Vlad Buslov says:

====================
Implement classifier-action terse dump mode

Output rate of current upstream kernel TC filter dump implementation if
relatively low (~100k rules/sec depending on configuration). This
constraint impacts performance of software switch implementation that
rely on TC for their datapath implementation and periodically call TC
filter dump to update rules stats. Moreover, TC filter dump output a lot
of static data that don't change during the filter lifecycle (filter
key, specific action details, etc.) which constitutes significant
portion of payload on resulting netlink packets and increases amount of
syscalls necessary to dump all filters on particular Qdisc. In order to
significantly improve filter dump rate this patch sets implement new
mode of TC filter dump operation named "terse dump" mode. In this mode
only parameters necessary to identify the filter (handle, action cookie,
etc.) and data that can change during filter lifecycle (filter flags,
action stats, etc.) are preserved in dump output while everything else
is omitted.

Userspace API is implemented using new TCA_DUMP_FLAGS tlv with only
available flag value TCA_DUMP_FLAGS_TERSE. Internally, new API requires
individual classifier support (new tcf_proto_ops->terse_dump()
callback). Support for action terse dump is implemented in act API and
don't require changing individual action implementations.

The following table provides performance comparison between regular
filter dump and new terse dump mode for two classifier-action profiles:
one minimal config with L2 flower classifier and single gact action and
another heavier config with L2+5tuple flower classifier with
tunnel_key+mirred actions.

 Classifier-action type      |        dump |  terse dump | X improvement
                             | (rules/sec) | (rules/sec) |
-----------------------------+-------------+-------------+---------------
 L2 with gact                |       141.8 |       293.2 |          2.07
 L2+5tuple tunnel_key+mirred |        76.4 |       198.8 |          2.60

Benchmark details: to measure the rate tc filter dump and terse dump
commands are invoked on ingress Qdisc that have one million filters
configured using following commands.

> time sudo tc -s filter show dev ens1f0 ingress >/dev/null

> time sudo tc -s filter show terse dev ens1f0 ingress >/dev/null

Value in results table is calculated by dividing 1000000 total rules by
"real" time reported by time command.

Setup details: 2x Intel(R) Xeon(R) CPU E5-2620 v3 @ 2.40GHz, 32GB memory
====================
Reviewed-by: default avatarJakub Kicinski <kuba@kernel.org>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 2e186a2c e7534fd4
......@@ -193,7 +193,7 @@ struct tc_action *tcf_action_init_1(struct net *net, struct tcf_proto *tp,
bool rtnl_held,
struct netlink_ext_ack *extack);
int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[], int bind,
int ref);
int ref, bool terse);
int tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int, int);
int tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int, int);
......
......@@ -325,6 +325,7 @@ int tcf_exts_validate(struct net *net, struct tcf_proto *tp,
void tcf_exts_destroy(struct tcf_exts *exts);
void tcf_exts_change(struct tcf_exts *dst, struct tcf_exts *src);
int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts);
int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts);
int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts);
/**
......
......@@ -330,6 +330,10 @@ struct tcf_proto_ops {
int (*dump)(struct net*, struct tcf_proto*, void *,
struct sk_buff *skb, struct tcmsg*,
bool);
int (*terse_dump)(struct net *net,
struct tcf_proto *tp, void *fh,
struct sk_buff *skb,
struct tcmsg *t, bool rtnl_held);
int (*tmplt_dump)(struct sk_buff *skb,
struct net *net,
void *tmplt_priv);
......
......@@ -609,11 +609,17 @@ enum {
TCA_HW_OFFLOAD,
TCA_INGRESS_BLOCK,
TCA_EGRESS_BLOCK,
TCA_DUMP_FLAGS,
__TCA_MAX
};
#define TCA_MAX (__TCA_MAX - 1)
#define TCA_DUMP_FLAGS_TERSE (1 << 0) /* Means that in dump user gets only basic
* data necessary to identify the objects
* (handle, cookie, etc.) and stats.
*/
#define TCA_RTA(r) ((struct rtattr*)(((char*)(r)) + NLMSG_ALIGN(sizeof(struct tcmsg))))
#define TCA_PAYLOAD(n) NLMSG_PAYLOAD(n,sizeof(struct tcmsg))
......
......@@ -766,12 +766,10 @@ tcf_action_dump_old(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
return a->ops->dump(skb, a, bind, ref);
}
int
tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
static int
tcf_action_dump_terse(struct sk_buff *skb, struct tc_action *a)
{
int err = -EINVAL;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
struct tc_cookie *cookie;
if (nla_put_string(skb, TCA_KIND, a->ops->kind))
......@@ -789,6 +787,23 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
}
rcu_read_unlock();
return 0;
nla_put_failure:
nlmsg_trim(skb, b);
return -1;
}
int
tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
{
int err = -EINVAL;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
if (tcf_action_dump_terse(skb, a))
goto nla_put_failure;
if (a->hw_stats != TCA_ACT_HW_STATS_ANY &&
nla_put_bitfield32(skb, TCA_ACT_HW_STATS,
a->hw_stats, TCA_ACT_HW_STATS_ANY))
......@@ -820,7 +835,7 @@ tcf_action_dump_1(struct sk_buff *skb, struct tc_action *a, int bind, int ref)
EXPORT_SYMBOL(tcf_action_dump_1);
int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[],
int bind, int ref)
int bind, int ref, bool terse)
{
struct tc_action *a;
int err = -EINVAL, i;
......@@ -831,7 +846,8 @@ int tcf_action_dump(struct sk_buff *skb, struct tc_action *actions[],
nest = nla_nest_start_noflag(skb, i + 1);
if (nest == NULL)
goto nla_put_failure;
err = tcf_action_dump_1(skb, a, bind, ref);
err = terse ? tcf_action_dump_terse(skb, a) :
tcf_action_dump_1(skb, a, bind, ref);
if (err < 0)
goto errout;
nla_nest_end(skb, nest);
......@@ -1133,7 +1149,7 @@ static int tca_get_fill(struct sk_buff *skb, struct tc_action *actions[],
if (!nest)
goto out_nlmsg_trim;
if (tcf_action_dump(skb, actions, bind, ref) < 0)
if (tcf_action_dump(skb, actions, bind, ref, false) < 0)
goto out_nlmsg_trim;
nla_nest_end(skb, nest);
......
......@@ -1851,7 +1851,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb,
struct tcf_proto *tp, struct tcf_block *block,
struct Qdisc *q, u32 parent, void *fh,
u32 portid, u32 seq, u16 flags, int event,
bool rtnl_held)
bool terse_dump, bool rtnl_held)
{
struct tcmsg *tcm;
struct nlmsghdr *nlh;
......@@ -1878,6 +1878,14 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb,
goto nla_put_failure;
if (!fh) {
tcm->tcm_handle = 0;
} else if (terse_dump) {
if (tp->ops->terse_dump) {
if (tp->ops->terse_dump(net, tp, fh, skb, tcm,
rtnl_held) < 0)
goto nla_put_failure;
} else {
goto cls_op_not_supp;
}
} else {
if (tp->ops->dump &&
tp->ops->dump(net, tp, fh, skb, tcm, rtnl_held) < 0)
......@@ -1888,6 +1896,7 @@ static int tcf_fill_node(struct net *net, struct sk_buff *skb,
out_nlmsg_trim:
nla_put_failure:
cls_op_not_supp:
nlmsg_trim(skb, b);
return -1;
}
......@@ -1908,7 +1917,7 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
n->nlmsg_seq, n->nlmsg_flags, event,
rtnl_held) <= 0) {
false, rtnl_held) <= 0) {
kfree_skb(skb);
return -EINVAL;
}
......@@ -1940,7 +1949,7 @@ static int tfilter_del_notify(struct net *net, struct sk_buff *oskb,
if (tcf_fill_node(net, skb, tp, block, q, parent, fh, portid,
n->nlmsg_seq, n->nlmsg_flags, RTM_DELTFILTER,
rtnl_held) <= 0) {
false, rtnl_held) <= 0) {
NL_SET_ERR_MSG(extack, "Failed to build del event notification");
kfree_skb(skb);
return -EINVAL;
......@@ -2501,6 +2510,7 @@ struct tcf_dump_args {
struct tcf_block *block;
struct Qdisc *q;
u32 parent;
bool terse_dump;
};
static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
......@@ -2511,12 +2521,12 @@ static int tcf_node_dump(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
return tcf_fill_node(net, a->skb, tp, a->block, a->q, a->parent,
n, NETLINK_CB(a->cb->skb).portid,
a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
RTM_NEWTFILTER, true);
RTM_NEWTFILTER, a->terse_dump, true);
}
static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
struct sk_buff *skb, struct netlink_callback *cb,
long index_start, long *p_index)
long index_start, long *p_index, bool terse)
{
struct net *net = sock_net(skb->sk);
struct tcf_block *block = chain->block;
......@@ -2545,7 +2555,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
if (tcf_fill_node(net, skb, tp, block, q, parent, NULL,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
RTM_NEWTFILTER, true) <= 0)
RTM_NEWTFILTER, false, true) <= 0)
goto errout;
cb->args[1] = 1;
}
......@@ -2561,6 +2571,7 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
arg.w.skip = cb->args[1] - 1;
arg.w.count = 0;
arg.w.cookie = cb->args[2];
arg.terse_dump = terse;
tp->ops->walk(tp, &arg.w, true);
cb->args[2] = arg.w.cookie;
cb->args[1] = arg.w.count + 1;
......@@ -2574,6 +2585,10 @@ static bool tcf_chain_dump(struct tcf_chain *chain, struct Qdisc *q, u32 parent,
return false;
}
static const struct nla_policy tcf_tfilter_dump_policy[TCA_MAX + 1] = {
[TCA_DUMP_FLAGS] = NLA_POLICY_BITFIELD32(TCA_DUMP_FLAGS_TERSE),
};
/* called with RTNL */
static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
{
......@@ -2583,6 +2598,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
struct Qdisc *q = NULL;
struct tcf_block *block;
struct tcmsg *tcm = nlmsg_data(cb->nlh);
bool terse_dump = false;
long index_start;
long index;
u32 parent;
......@@ -2592,10 +2608,17 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
err = nlmsg_parse_deprecated(cb->nlh, sizeof(*tcm), tca, TCA_MAX,
NULL, cb->extack);
tcf_tfilter_dump_policy, cb->extack);
if (err)
return err;
if (tca[TCA_DUMP_FLAGS]) {
struct nla_bitfield32 flags =
nla_get_bitfield32(tca[TCA_DUMP_FLAGS]);
terse_dump = flags.value & TCA_DUMP_FLAGS_TERSE;
}
if (tcm->tcm_ifindex == TCM_IFINDEX_MAGIC_BLOCK) {
block = tcf_block_refcnt_get(net, tcm->tcm_block_index);
if (!block)
......@@ -2653,7 +2676,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb)
nla_get_u32(tca[TCA_CHAIN]) != chain->index)
continue;
if (!tcf_chain_dump(chain, q, parent, skb, cb,
index_start, &index)) {
index_start, &index, terse_dump)) {
tcf_chain_put(chain);
err = -EMSGSIZE;
break;
......@@ -3156,7 +3179,8 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
if (nest == NULL)
goto nla_put_failure;
if (tcf_action_dump(skb, exts->actions, 0, 0) < 0)
if (tcf_action_dump(skb, exts->actions, 0, 0, false)
< 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
} else if (exts->police) {
......@@ -3180,6 +3204,31 @@ int tcf_exts_dump(struct sk_buff *skb, struct tcf_exts *exts)
}
EXPORT_SYMBOL(tcf_exts_dump);
int tcf_exts_terse_dump(struct sk_buff *skb, struct tcf_exts *exts)
{
#ifdef CONFIG_NET_CLS_ACT
struct nlattr *nest;
if (!exts->action || !tcf_exts_has_actions(exts))
return 0;
nest = nla_nest_start_noflag(skb, exts->action);
if (!nest)
goto nla_put_failure;
if (tcf_action_dump(skb, exts->actions, 0, 0, true) < 0)
goto nla_put_failure;
nla_nest_end(skb, nest);
return 0;
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
#else
return 0;
#endif
}
EXPORT_SYMBOL(tcf_exts_terse_dump);
int tcf_exts_dump_stats(struct sk_buff *skb, struct tcf_exts *exts)
{
......
......@@ -2768,6 +2768,48 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh,
return -1;
}
static int fl_terse_dump(struct net *net, struct tcf_proto *tp, void *fh,
struct sk_buff *skb, struct tcmsg *t, bool rtnl_held)
{
struct cls_fl_filter *f = fh;
struct nlattr *nest;
bool skip_hw;
if (!f)
return skb->len;
t->tcm_handle = f->handle;
nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
if (!nest)
goto nla_put_failure;
spin_lock(&tp->lock);
skip_hw = tc_skip_hw(f->flags);
if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
goto nla_put_failure_locked;
spin_unlock(&tp->lock);
if (!skip_hw)
fl_hw_update_stats(tp, f, rtnl_held);
if (tcf_exts_terse_dump(skb, &f->exts))
goto nla_put_failure;
nla_nest_end(skb, nest);
return skb->len;
nla_put_failure_locked:
spin_unlock(&tp->lock);
nla_put_failure:
nla_nest_cancel(skb, nest);
return -1;
}
static int fl_tmplt_dump(struct sk_buff *skb, struct net *net, void *tmplt_priv)
{
struct fl_flow_tmplt *tmplt = tmplt_priv;
......@@ -2832,6 +2874,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = {
.hw_add = fl_hw_add,
.hw_del = fl_hw_del,
.dump = fl_dump,
.terse_dump = fl_terse_dump,
.bind_class = fl_bind_class,
.tmplt_create = fl_tmplt_create,
.tmplt_destroy = fl_tmplt_destroy,
......
......@@ -87,5 +87,43 @@
"teardown": [
"$TC qdisc del dev $DEV2 ingress"
]
},
{
"id": "7c65",
"name": "Add flower filter and then terse dump it",
"category": [
"filter",
"flower"
],
"setup": [
"$TC qdisc add dev $DEV2 ingress"
],
"cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress flower dst_mac e4:11:22:11:4a:51 action drop",
"expExitCode": "0",
"verifyCmd": "$TC filter show terse dev $DEV2 ingress",
"matchPattern": "filter protocol ip pref 1 flower.*handle",
"matchCount": "1",
"teardown": [
"$TC qdisc del dev $DEV2 ingress"
]
},
{
"id": "d45e",
"name": "Add flower filter and verify that terse dump doesn't output filter key",
"category": [
"filter",
"flower"
],
"setup": [
"$TC qdisc add dev $DEV2 ingress"
],
"cmdUnderTest": "$TC filter add dev $DEV2 protocol ip pref 1 ingress flower dst_mac e4:11:22:11:4a:51 action drop",
"expExitCode": "0",
"verifyCmd": "$TC filter show terse dev $DEV2 ingress",
"matchPattern": " dst_mac e4:11:22:11:4a:51",
"matchCount": "0",
"teardown": [
"$TC qdisc del dev $DEV2 ingress"
]
}
]
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment