Commit 6648bd7e authored by Alexander Duyck, committed by David S. Miller

ipv4: Add sysctl knob to control early socket demux

This change is meant to add a control for disabling early socket demux.
The main motivation behind this patch is to provide an option to disable
the feature as it adds an additional cost to routing that reduces overall
throughput by up to 5%.  For example, one of my systems went from 12.1Mpps
to 11.6Mpps after the early socket demux was added.  It looks like the reason
for the regression is that we are now having to perform two lookups, first
the one for an established socket, and then the one for the routing table.

By adding this patch and toggling the value for ip_early_demux to 0 I am
able to get back to the 12.1Mpps I was previously seeing.

[ Move local variables in ip_rcv_finish() down into the basic
  block in which they are actually used.  -DaveM ]
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 8e27628e
...@@ -425,6 +425,7 @@ enum ...@@ -425,6 +425,7 @@ enum
NET_TCP_ALLOWED_CONG_CONTROL=123, NET_TCP_ALLOWED_CONG_CONTROL=123,
NET_TCP_MAX_SSTHRESH=124, NET_TCP_MAX_SSTHRESH=124,
NET_TCP_FRTO_RESPONSE=125, NET_TCP_FRTO_RESPONSE=125,
NET_IPV4_EARLY_DEMUX=126,
}; };
enum { enum {
......
...@@ -210,6 +210,9 @@ extern int inet_peer_threshold; ...@@ -210,6 +210,9 @@ extern int inet_peer_threshold;
extern int inet_peer_minttl; extern int inet_peer_minttl;
extern int inet_peer_maxttl; extern int inet_peer_maxttl;
/* From ip_input.c */
extern int sysctl_ip_early_demux;
/* From ip_output.c */ /* From ip_output.c */
extern int sysctl_ip_dynaddr; extern int sysctl_ip_dynaddr;
......
...@@ -415,6 +415,8 @@ static const struct bin_table bin_net_ipv4_table[] = { ...@@ -415,6 +415,8 @@ static const struct bin_table bin_net_ipv4_table[] = {
{ CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" }, { CTL_INT, NET_IPV4_IPFRAG_SECRET_INTERVAL, "ipfrag_secret_interval" },
/* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */ /* NET_IPV4_IPFRAG_MAX_DIST "ipfrag_max_dist" no longer used */
{ CTL_INT, NET_IPV4_EARLY_DEMUX, "ip_early_demux" },
{ CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" }, { CTL_INT, 2088 /* NET_IPQ_QMAX */, "ip_queue_maxlen" },
/* NET_TCP_DEFAULT_WIN_SCALE unused */ /* NET_TCP_DEFAULT_WIN_SCALE unused */
......
...@@ -313,6 +313,8 @@ static inline bool ip_rcv_options(struct sk_buff *skb) ...@@ -313,6 +313,8 @@ static inline bool ip_rcv_options(struct sk_buff *skb)
return true; return true;
} }
int sysctl_ip_early_demux __read_mostly = 1;
static int ip_rcv_finish(struct sk_buff *skb) static int ip_rcv_finish(struct sk_buff *skb)
{ {
const struct iphdr *iph = ip_hdr(skb); const struct iphdr *iph = ip_hdr(skb);
...@@ -323,16 +325,18 @@ static int ip_rcv_finish(struct sk_buff *skb) ...@@ -323,16 +325,18 @@ static int ip_rcv_finish(struct sk_buff *skb)
* how the packet travels inside Linux networking. * how the packet travels inside Linux networking.
*/ */
if (skb_dst(skb) == NULL) { if (skb_dst(skb) == NULL) {
int err = -ENOENT;
if (sysctl_ip_early_demux) {
const struct net_protocol *ipprot; const struct net_protocol *ipprot;
int protocol = iph->protocol; int protocol = iph->protocol;
int err;
rcu_read_lock(); rcu_read_lock();
ipprot = rcu_dereference(inet_protos[protocol]); ipprot = rcu_dereference(inet_protos[protocol]);
err = -ENOENT;
if (ipprot && ipprot->early_demux) if (ipprot && ipprot->early_demux)
err = ipprot->early_demux(skb); err = ipprot->early_demux(skb);
rcu_read_unlock(); rcu_read_unlock();
}
if (err) { if (err) {
err = ip_route_input_noref(skb, iph->daddr, iph->saddr, err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
......
...@@ -300,6 +300,13 @@ static struct ctl_table ipv4_table[] = { ...@@ -300,6 +300,13 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec .proc_handler = proc_dointvec
}, },
{
.procname = "ip_early_demux",
.data = &sysctl_ip_early_demux,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
{ {
.procname = "ip_dynaddr", .procname = "ip_dynaddr",
.data = &sysctl_ip_dynaddr, .data = &sysctl_ip_dynaddr,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment