Commit 9ab948a9 authored by David Ahern, committed by David S. Miller

ipv4: Allow amount of dirty memory from fib resizing to be controllable

The fib_trie implementation calls synchronize_rcu when a certain number of
pages are dirty from freed entries. The number of pages was determined
experimentally in 2009 (commit c3059477).

At the current setting, synchronize_rcu is called often -- 51 times in one
second in one test, with an average 8 msec delay when adding a fib entry.
The net effect is a significant slowdown when modifying the fib. This shows
up in the output of 'time' as the difference between real time and sys+user.
For example, using 720,022 single path routes and 'ip -batch'[1]:

    $ time ./ip -batch ipv4/routes-1-hops
    real    0m14.214s
    user    0m2.513s
    sys     0m6.783s

So roughly 35% of the actual time to install the routes is from the ip
command getting scheduled out, most notably due to synchronize_rcu (this
is observed using 'perf sched timehist').
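(Dead time here is real - (user + sys) = 14.214s - (2.513s + 6.783s) ≈ 4.9s,
i.e. roughly 35% of the 14.2s wall clock time.)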

This patch makes the amount of dirty memory configurable, from 64kB, where
synchronize_rcu is called often (small, low end systems that are memory
sensitive), to 64MB, where synchronize_rcu is called rarely during a large
FIB change (for high end systems with lots of memory). The default is 512kB,
which corresponds to the current setting of 128 pages with a 4kB page size.
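For example, a route server with ample memory could raise the threshold at
runtime via the new sysctl (value in bytes; the 16MB figure here is only
illustrative):

    $ sysctl -w net.ipv4.fib_sync_mem=16777216
    net.ipv4.fib_sync_mem = 16777216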

As an example, at 16MB the worst interval shows 4 calls to synchronize_rcu
in a second blocking for up to 30 msec in a single instance, and a total
of almost 100 msec across the 4 calls in that second. The trade-off is
allowing FIB entries to consume more memory in a given time window, but
with much better fib insertion rates (~30% increase in prefixes/sec).
With this patch and net.ipv4.fib_sync_mem set to 16MB, the same batch
file runs in:

    $ time ./ip -batch ipv4/routes-1-hops
    real    0m9.692s
    user    0m2.491s
    sys     0m6.769s

So the dead time is reduced to about 1/2 second or <5% of the real time.

[1] 'ip' modified to not request ACK messages which improves route
    insertion times by about 20%
Signed-off-by: David Ahern <dsahern@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
@@ -81,6 +81,11 @@ fib_multipath_hash_policy - INTEGER
 	0 - Layer 3
 	1 - Layer 4
 
+fib_sync_mem - UNSIGNED INTEGER
+	Amount of dirty memory from fib entries that can be backlogged before
+	synchronize_rcu is forced.
+	Default: 512kB   Minimum: 64kB   Maximum: 64MB
+
 ip_forward_update_priority - INTEGER
 	Whether to update SKB priority from "TOS" field in IPv4 header after it
 	is forwarded. The new SKB priority is mapped from TOS field value
......
@@ -38,6 +38,10 @@
 #define IPV4_MAX_PMTU		65535U		/* RFC 2675, Section 5.1 */
 #define IPV4_MIN_MTU		68		/* RFC 791 */
 
+extern unsigned int sysctl_fib_sync_mem;
+extern unsigned int sysctl_fib_sync_mem_min;
+extern unsigned int sysctl_fib_sync_mem_max;
+
 struct sock;
 
 struct inet_skb_parm {
......
@@ -183,14 +183,16 @@ struct trie {
 };
 
 static struct key_vector *resize(struct trie *t, struct key_vector *tn);
-static size_t tnode_free_size;
+static unsigned int tnode_free_size;
 
 /*
- * synchronize_rcu after call_rcu for that many pages; it should be especially
- * useful before resizing the root node with PREEMPT_NONE configs; the value was
- * obtained experimentally, aiming to avoid visible slowdown.
+ * synchronize_rcu after call_rcu for outstanding dirty memory; it should be
+ * especially useful before resizing the root node with PREEMPT_NONE configs;
+ * the value was obtained experimentally, aiming to avoid visible slowdown.
  */
-static const int sync_pages = 128;
+unsigned int sysctl_fib_sync_mem = 512 * 1024;
+unsigned int sysctl_fib_sync_mem_min = 64 * 1024;
+unsigned int sysctl_fib_sync_mem_max = 64 * 1024 * 1024;
 
 static struct kmem_cache *fn_alias_kmem __ro_after_init;
 static struct kmem_cache *trie_leaf_kmem __ro_after_init;

@@ -504,7 +506,7 @@ static void tnode_free(struct key_vector *tn)
 		tn = container_of(head, struct tnode, rcu)->kv;
 	}
 
-	if (tnode_free_size >= PAGE_SIZE * sync_pages) {
+	if (tnode_free_size >= sysctl_fib_sync_mem) {
 		tnode_free_size = 0;
 		synchronize_rcu();
 	}
......
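For readers unfamiliar with the pattern behind the check above, here is a
minimal, self-contained userspace sketch of the same idea: deferred frees
accumulate a dirty-byte count and, once that count crosses a configurable
threshold, the writer pays for one expensive, blocking flush. All names are
hypothetical; this is an illustration of the accounting, not kernel code:

    #include <stdio.h>
    #include <stdlib.h>

    /* Threshold mirroring the sysctl default: flush once this many bytes
     * of freed-but-not-yet-reclaimed memory have accumulated. */
    static unsigned int sync_mem_threshold = 512 * 1024;
    static unsigned int dirty_bytes;

    /* Stand-in for call_rcu(): the object counts as "dirty" until a flush. */
    static void defer_free(void *obj, unsigned int size)
    {
            free(obj);              /* the sketch reclaims immediately ...   */
            dirty_bytes += size;    /* ... but still accounts for the backlog */
    }

    /* Stand-in for synchronize_rcu(): an expensive, blocking flush. */
    static void flush_deferred(void)
    {
            printf("flush: reclaiming %u dirty bytes\n", dirty_bytes);
            dirty_bytes = 0;
    }

    int main(void)
    {
            /* Simulate a large batch of route inserts, each replacing a
             * ~2kB trie node.  The flush runs only when the backlog is
             * large, not on every insert. */
            for (int i = 0; i < 1000; i++) {
                    defer_free(malloc(2048), 2048);
                    if (dirty_bytes >= sync_mem_threshold)
                            flush_deferred();
            }
            printf("done, %u bytes still dirty\n", dirty_bytes);
            return 0;
    }

The kernel differs in that freed nodes are not actually reclaimed until the
RCU grace period ends; the sketch only models the byte accounting and the
cost of the occasional synchronous flush that this patch makes tunable.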
@@ -549,6 +549,15 @@ static struct ctl_table ipv4_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
 	},
+	{
+		.procname	= "fib_sync_mem",
+		.data		= &sysctl_fib_sync_mem,
+		.maxlen		= sizeof(sysctl_fib_sync_mem),
+		.mode		= 0644,
+		.proc_handler	= proc_douintvec_minmax,
+		.extra1		= &sysctl_fib_sync_mem_min,
+		.extra2		= &sysctl_fib_sync_mem_max,
+	},
 	{ }
 };
......