Commit fec65bd4 authored by David S. Miller

Merge branch 'ila-early-demux'

Tom Herbert says:

====================
ila: Optimization to preserve value of early demux

In the current implementation of ILA, LWT is used to perform
translation on both the input and output paths. This is functional,
however there is a big performance hit in the receive path. Early
demux occurs before the routing lookup (a hit actually obviates the
route lookup). Therefore the stack currently performs early
demux before translation so that a local connection with ILA
addresses is never matched. Note that this issue is not just
with ILA, but pretty much any translated or encapsulated packet
handled by LWT would miss the opportunity for early demux. Solving
the general problem seems non-trivial since we would need to move
the route lookup before early demux, thereby mitigating the value.

This patch set addresses the issue for ILA by adding a fast locator
lookup that occurs before early demux. This is done by hooking into
NF_INET_PRE_ROUTING.

For the backend we implement an rhashtable that contains identifier
to locator mappings. The table also allows more specific matches
that include original locator and interface.

This patch set:
 - Add an rhashtable function to atomically replace an element.
   This is useful to implement sub-trees from a table entry
   without needing to use a special anchor structure as the
   table entry.
 - Add a start callback for starting a netlink dump.
 - Creates an ila directory under net/ipv6 and moves ila.c to it.
   ila.c is split into ila_common.c and ila_lwt.c.
 - Implement a table to do identifier->locator mapping. This is
   an rhashtable (in ila_xlat.c).
 - Configuration for the table with netlink.
 - Add a hook into NF_INET_PRE_ROUTING to perform ILA translation
   before early demux.

Changes in v2:
 - Use iptables targets instead of a new xfrm function

Changes in v3:
 - Add __rcu to next pointer in struct ila_map

Changes in v4:
 - Use hook for NF_INET_PRE_ROUTING

Changed in v5:
 - Register hooks per namespace using nf_register_net_hooks
 - Only register hooks when first mapping is actually added

Changed in v6:
  - Remove gfp argument in alloc_ila_locks, it is unnecessary
  - Set registered_hooks properly when hooks are registered

Testing:
   Running 200 netperf TCP_RR streams

No ILA, baseline
   79.26% CPU utilization
   1678282 tps
   104/189/390 50/90/99% latencies

ILA before fix (LWT on both input and output)
   81.91% CPU utilization
   1464723 tps (-14.5% from baseline)
   121/215/411 50/90/99% latencies

ILA after fix
   80.62% CPU utilization
   1622985 tps (-3.4% from baseline)
   110/191/347 50/90/99% latencies
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 3026043d 7f00feaf
...@@ -131,6 +131,7 @@ netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask) ...@@ -131,6 +131,7 @@ netlink_skb_clone(struct sk_buff *skb, gfp_t gfp_mask)
struct netlink_callback { struct netlink_callback {
struct sk_buff *skb; struct sk_buff *skb;
const struct nlmsghdr *nlh; const struct nlmsghdr *nlh;
int (*start)(struct netlink_callback *);
int (*dump)(struct sk_buff * skb, int (*dump)(struct sk_buff * skb,
struct netlink_callback *cb); struct netlink_callback *cb);
int (*done)(struct netlink_callback *cb); int (*done)(struct netlink_callback *cb);
...@@ -153,6 +154,7 @@ struct nlmsghdr * ...@@ -153,6 +154,7 @@ struct nlmsghdr *
__nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags); __nlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, int type, int len, int flags);
struct netlink_dump_control { struct netlink_dump_control {
int (*start)(struct netlink_callback *);
int (*dump)(struct sk_buff *skb, struct netlink_callback *); int (*dump)(struct sk_buff *skb, struct netlink_callback *);
int (*done)(struct netlink_callback *); int (*done)(struct netlink_callback *);
void *data; void *data;
......
...@@ -819,4 +819,86 @@ static inline int rhashtable_remove_fast( ...@@ -819,4 +819,86 @@ static inline int rhashtable_remove_fast(
return err; return err;
} }
/* Internal function, please use rhashtable_replace_fast() instead.
 *
 * Attempts the replacement within the single bucket table @tbl.
 * Returns 0 when @obj_old was found on its bucket chain and swapped for
 * @obj_new, -EINVAL when the two objects hash to different buckets of @tbl,
 * and -ENOENT when @obj_old is not on the chain.
 */
static inline int __rhashtable_replace_fast(
struct rhashtable *ht, struct bucket_table *tbl,
struct rhash_head *obj_old, struct rhash_head *obj_new,
const struct rhashtable_params params)
{
struct rhash_head __rcu **pprev;
struct rhash_head *he;
spinlock_t *lock;
unsigned int hash;
int err = -ENOENT;
/* Minimally, the old and new objects must have same hash
* (which should mean identifiers are the same).
*/
hash = rht_head_hashfn(ht, tbl, obj_old, params);
if (hash != rht_head_hashfn(ht, tbl, obj_new, params))
return -EINVAL;
/* The bucket lock is held across the chain walk and the relink. */
lock = rht_bucket_lock(tbl, hash);
spin_lock_bh(lock);
/* pprev tracks the link that points at the entry being examined. */
pprev = &tbl->buckets[hash];
rht_for_each(he, tbl, hash) {
if (he != obj_old) {
pprev = &he->next;
continue;
}
/* Give the new object its successor before publishing it through
 * *pprev, so the chain is complete at the moment it becomes reachable.
 */
rcu_assign_pointer(obj_new->next, obj_old->next);
rcu_assign_pointer(*pprev, obj_new);
err = 0;
break;
}
spin_unlock_bh(lock);
return err;
}
/**
* rhashtable_replace_fast - replace an object in hash table
* @ht: hash table
* @obj_old: pointer to hash head inside object being replaced
* @obj_new: pointer to hash head inside object which is new
* @params: hash table parameters
*
* Replacing an object doesn't affect the number of elements in the hash table
* or bucket, so we don't need to worry about shrinking or expanding the
* table here.
*
* Returns zero on success, -ENOENT if the entry could not be found,
* -EINVAL if hash is not the same for the old and new objects.
*/
static inline int rhashtable_replace_fast(
struct rhashtable *ht, struct rhash_head *obj_old,
struct rhash_head *obj_new,
const struct rhashtable_params params)
{
struct bucket_table *tbl;
int err;
rcu_read_lock();
tbl = rht_dereference_rcu(ht->tbl, ht);
/* Because we have already taken (and released) the bucket
* lock in old_tbl, if we find that future_tbl is not yet
* visible then that guarantees the entry to still be in
* the old tbl if it exists.
*/
/* NOTE(review): there is no variable named old_tbl here; the comment above
 * presumably refers to the table tried on the previous loop iteration.
 * The loop retries in tbl->future_tbl after any non-zero result (-EINVAL
 * as well as -ENOENT) and stops on success or when no further table exists;
 * the last error is what gets returned.
 */
while ((err = __rhashtable_replace_fast(ht, tbl, obj_old,
obj_new, params)) &&
(tbl = rht_dereference_rcu(tbl->future_tbl, ht)))
;
rcu_read_unlock();
return err;
}
#endif /* _LINUX_RHASHTABLE_H */ #endif /* _LINUX_RHASHTABLE_H */
...@@ -114,6 +114,7 @@ static inline void genl_info_net_set(struct genl_info *info, struct net *net) ...@@ -114,6 +114,7 @@ static inline void genl_info_net_set(struct genl_info *info, struct net *net)
* @flags: flags * @flags: flags
* @policy: attribute validation policy * @policy: attribute validation policy
* @doit: standard command callback * @doit: standard command callback
* @start: start callback for dumps
* @dumpit: callback for dumpers * @dumpit: callback for dumpers
* @done: completion callback for dumps * @done: completion callback for dumps
* @ops_list: operations list * @ops_list: operations list
...@@ -122,6 +123,7 @@ struct genl_ops { ...@@ -122,6 +123,7 @@ struct genl_ops {
const struct nla_policy *policy; const struct nla_policy *policy;
int (*doit)(struct sk_buff *skb, int (*doit)(struct sk_buff *skb,
struct genl_info *info); struct genl_info *info);
int (*start)(struct netlink_callback *cb);
int (*dumpit)(struct sk_buff *skb, int (*dumpit)(struct sk_buff *skb,
struct netlink_callback *cb); struct netlink_callback *cb);
int (*done)(struct netlink_callback *cb); int (*done)(struct netlink_callback *cb);
......
/*
* ILA kernel interface
*
* Copyright (c) 2015 Tom Herbert <tom@herbertland.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*/
#ifndef _NET_ILA_H
#define _NET_ILA_H

/* Only pointers to sk_buff appear below, so a forward declaration suffices
 * and lets this header be included without a prior <linux/skbuff.h>.
 */
struct sk_buff;

/* Apply an ILA destination-address translation to @skb for the outgoing /
 * incoming direction respectively.
 */
int ila_xlat_outgoing(struct sk_buff *skb);
int ila_xlat_incoming(struct sk_buff *skb);

#endif /* _NET_ILA_H */
...@@ -3,13 +3,35 @@ ...@@ -3,13 +3,35 @@
#ifndef _UAPI_LINUX_ILA_H #ifndef _UAPI_LINUX_ILA_H
#define _UAPI_LINUX_ILA_H #define _UAPI_LINUX_ILA_H
/* NETLINK_GENERIC related info */
#define ILA_GENL_NAME "ila"
#define ILA_GENL_VERSION 0x1
enum { enum {
ILA_ATTR_UNSPEC, ILA_ATTR_UNSPEC,
ILA_ATTR_LOCATOR, /* u64 */ ILA_ATTR_LOCATOR, /* u64 */
ILA_ATTR_IDENTIFIER, /* u64 */
ILA_ATTR_LOCATOR_MATCH, /* u64 */
ILA_ATTR_IFINDEX, /* s32 */
ILA_ATTR_DIR, /* u32 */
__ILA_ATTR_MAX, __ILA_ATTR_MAX,
}; };
#define ILA_ATTR_MAX (__ILA_ATTR_MAX - 1) #define ILA_ATTR_MAX (__ILA_ATTR_MAX - 1)
enum {
ILA_CMD_UNSPEC,
ILA_CMD_ADD,
ILA_CMD_DEL,
ILA_CMD_GET,
__ILA_CMD_MAX,
};
#define ILA_CMD_MAX (__ILA_CMD_MAX - 1)
#define ILA_DIR_IN (1 << 0)
#define ILA_DIR_OUT (1 << 1)
#endif /* _UAPI_LINUX_ILA_H */ #endif /* _UAPI_LINUX_ILA_H */
...@@ -34,7 +34,7 @@ obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o ...@@ -34,7 +34,7 @@ obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o
obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o
obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o
obj-$(CONFIG_IPV6_MIP6) += mip6.o obj-$(CONFIG_IPV6_MIP6) += mip6.o
obj-$(CONFIG_IPV6_ILA) += ila.o obj-$(CONFIG_IPV6_ILA) += ila/
obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_IPV6_VTI) += ip6_vti.o obj-$(CONFIG_IPV6_VTI) += ip6_vti.o
......
#
# Makefile for ILA module
#
obj-$(CONFIG_IPV6_ILA) += ila.o
ila-objs := ila_common.o ila_lwt.o ila_xlat.o
/*
* Copyright (c) 2015 Tom Herbert <tom@herbertland.com>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
* published by the Free Software Foundation; either version 2 of
* the License, or (at your option) any later version.
*
*/
#ifndef __ILA_H
#define __ILA_H
#include <linux/errno.h>
#include <linux/ip.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/socket.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <net/protocol.h>
#include <uapi/linux/ila.h>
/* Parameters for rewriting the first 64 bits (the locator) of an IPv6
 * destination address.
 */
struct ila_params {
/* Value stored over the first 8 bytes of the destination address. */
__be64 locator;
/* Locator for which csum_diff is valid; compared against the packet's
 * current locator before the precomputed delta is used.
 */
__be64 locator_match;
/* Checksum delta returned when the packet's locator equals locator_match. */
__wsum csum_diff;
};
/* Checksum delta for replacing the 8 bytes at @from with the 8 bytes at @to:
 * csum_partial() over the complemented old words followed by the new words.
 */
static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to)
{
	__be32 words[4];

	words[0] = ~from[0];
	words[1] = ~from[1];
	words[2] = to[0];
	words[3] = to[1];

	return csum_partial(words, sizeof(words), 0);
}
void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p);
int ila_lwt_init(void);
void ila_lwt_fini(void);
int ila_xlat_init(void);
void ila_xlat_fini(void);
#endif /* __ILA_H */
#include <linux/errno.h>
#include <linux/ip.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/socket.h>
#include <linux/types.h>
#include <net/checksum.h>
#include <net/ip.h>
#include <net/ip6_fib.h>
#include <net/lwtunnel.h>
#include <net/protocol.h>
#include <uapi/linux/ila.h>
#include "ila.h"
static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p)
{
if (*(__be64 *)&ip6h->daddr == p->locator_match)
return p->csum_diff;
else
return compute_csum_diff8((__be32 *)&ip6h->daddr,
(__be32 *)&p->locator);
}
/* Rewrite the locator (first 8 bytes) of the IPv6 destination address of @skb
 * to @p->locator, first adjusting the TCP, UDP or ICMPv6 checksum when that
 * header directly follows the IPv6 header and can be pulled.
 *
 * pskb_may_pull() may reallocate the skb header, so the IPv6 header pointer
 * is re-read from the skb after every pull and before the final store rather
 * than reusing the value fetched on entry.
 */
void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p)
{
	__wsum diff;
	struct ipv6hdr *ip6h = ipv6_hdr(skb);
	size_t nhoff = sizeof(struct ipv6hdr);

	/* First update checksum */
	switch (ip6h->nexthdr) {
	case NEXTHDR_TCP:
		if (likely(pskb_may_pull(skb, nhoff + sizeof(struct tcphdr)))) {
			struct tcphdr *th;

			ip6h = ipv6_hdr(skb);
			th = (struct tcphdr *)
					(skb_network_header(skb) + nhoff);

			diff = get_csum_diff(ip6h, p);
			inet_proto_csum_replace_by_diff(&th->check, skb,
							diff, true);
		}
		break;
	case NEXTHDR_UDP:
		if (likely(pskb_may_pull(skb, nhoff + sizeof(struct udphdr)))) {
			struct udphdr *uh;

			ip6h = ipv6_hdr(skb);
			uh = (struct udphdr *)
					(skb_network_header(skb) + nhoff);

			/* A zero UDP checksum is left alone unless the
			 * checksum is still to be filled in (CHECKSUM_PARTIAL).
			 */
			if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
				diff = get_csum_diff(ip6h, p);
				inet_proto_csum_replace_by_diff(&uh->check, skb,
								diff, true);
				if (!uh->check)
					uh->check = CSUM_MANGLED_0;
			}
		}
		break;
	case NEXTHDR_ICMP:
		if (likely(pskb_may_pull(skb,
					 nhoff + sizeof(struct icmp6hdr)))) {
			struct icmp6hdr *ih;

			ip6h = ipv6_hdr(skb);
			ih = (struct icmp6hdr *)
					(skb_network_header(skb) + nhoff);

			diff = get_csum_diff(ip6h, p);
			inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb,
							diff, true);
		}
		break;
	}

	/* Now change destination address (header pointer reloaded in case a
	 * pull above moved the header).
	 */
	ip6h = ipv6_hdr(skb);
	*(__be64 *)&ip6h->daddr = p->locator;
}
/* Module init: bring up the LWT part, then the translation table part.
 * If the second step fails the first is torn down again.
 */
static int __init ila_init(void)
{
	int ret = ila_lwt_init();

	if (ret)
		return ret;

	ret = ila_xlat_init();
	if (ret)
		ila_lwt_fini();

	return ret;
}
/* Module exit: tear down in the reverse order of ila_init(). */
static void __exit ila_fini(void)
{
ila_xlat_fini();
ila_lwt_fini();
}
module_init(ila_init);
module_exit(ila_fini);
MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
MODULE_LICENSE("GPL");
...@@ -11,12 +11,7 @@ ...@@ -11,12 +11,7 @@
#include <net/lwtunnel.h> #include <net/lwtunnel.h>
#include <net/protocol.h> #include <net/protocol.h>
#include <uapi/linux/ila.h> #include <uapi/linux/ila.h>
#include "ila.h"
/* Locator rewrite parameters: the locator to install, the locator for which
 * csum_diff applies, and that checksum delta.
 */
struct ila_params {
__be64 locator;
__be64 locator_match;
__wsum csum_diff;
};
static inline struct ila_params *ila_params_lwtunnel( static inline struct ila_params *ila_params_lwtunnel(
struct lwtunnel_state *lwstate) struct lwtunnel_state *lwstate)
...@@ -24,73 +19,6 @@ static inline struct ila_params *ila_params_lwtunnel( ...@@ -24,73 +19,6 @@ static inline struct ila_params *ila_params_lwtunnel(
return (struct ila_params *)lwstate->data; return (struct ila_params *)lwstate->data;
} }
/* Checksum delta for replacing the 8 bytes at @from with the 8 bytes at @to:
 * csum_partial() over the complemented old words followed by the new words.
 */
static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to)
{
__be32 diff[] = {
~from[0], ~from[1], to[0], to[1],
};
return csum_partial(diff, sizeof(diff), 0);
}
/* Return the checksum delta for rewriting the packet's locator to p->locator:
 * the precomputed p->csum_diff when the current locator equals
 * p->locator_match, otherwise a delta computed from the actual locator.
 */
static inline __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p)
{
if (*(__be64 *)&ip6h->daddr == p->locator_match)
return p->csum_diff;
else
return compute_csum_diff8((__be32 *)&ip6h->daddr,
(__be32 *)&p->locator);
}
/* Rewrite the locator (first 8 bytes) of the IPv6 destination address of @skb
 * to @p->locator, first adjusting the TCP, UDP or ICMPv6 checksum when that
 * header directly follows the IPv6 header and can be pulled.
 *
 * pskb_may_pull() may reallocate the skb header, so the IPv6 header pointer
 * is re-read after every pull and before the final store.
 */
static void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p)
{
	__wsum diff;
	struct ipv6hdr *ip6h = ipv6_hdr(skb);
	size_t nhoff = sizeof(struct ipv6hdr);

	/* First update checksum */
	switch (ip6h->nexthdr) {
	case NEXTHDR_TCP:
		if (likely(pskb_may_pull(skb, nhoff + sizeof(struct tcphdr)))) {
			struct tcphdr *th;

			ip6h = ipv6_hdr(skb);
			th = (struct tcphdr *)
					(skb_network_header(skb) + nhoff);

			diff = get_csum_diff(ip6h, p);
			inet_proto_csum_replace_by_diff(&th->check, skb,
							diff, true);
		}
		break;
	case NEXTHDR_UDP:
		if (likely(pskb_may_pull(skb, nhoff + sizeof(struct udphdr)))) {
			struct udphdr *uh;

			ip6h = ipv6_hdr(skb);
			uh = (struct udphdr *)
					(skb_network_header(skb) + nhoff);

			if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
				diff = get_csum_diff(ip6h, p);
				inet_proto_csum_replace_by_diff(&uh->check, skb,
								diff, true);
				if (!uh->check)
					uh->check = CSUM_MANGLED_0;
			}
		}
		break;
	case NEXTHDR_ICMP:
		if (likely(pskb_may_pull(skb,
					 nhoff + sizeof(struct icmp6hdr)))) {
			struct icmp6hdr *ih;

			ip6h = ipv6_hdr(skb);
			ih = (struct icmp6hdr *)
					(skb_network_header(skb) + nhoff);

			diff = get_csum_diff(ip6h, p);
			inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb,
							diff, true);
		}
		break;
	}

	/* Now change destination address (header pointer reloaded in case a
	 * pull above moved the header).
	 */
	ip6h = ipv6_hdr(skb);
	*(__be64 *)&ip6h->daddr = p->locator;
}
static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb) static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{ {
struct dst_entry *dst = skb_dst(skb); struct dst_entry *dst = skb_dst(skb);
...@@ -213,17 +141,12 @@ static const struct lwtunnel_encap_ops ila_encap_ops = { ...@@ -213,17 +141,12 @@ static const struct lwtunnel_encap_ops ila_encap_ops = {
.cmp_encap = ila_encap_cmp, .cmp_encap = ila_encap_cmp,
}; };
static int __init ila_init(void) int ila_lwt_init(void)
{ {
return lwtunnel_encap_add_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA); return lwtunnel_encap_add_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA);
} }
static void __exit ila_fini(void) void ila_lwt_fini(void)
{ {
lwtunnel_encap_del_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA); lwtunnel_encap_del_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA);
} }
module_init(ila_init);
module_exit(ila_fini);
MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
MODULE_LICENSE("GPL");
#include <linux/jhash.h>
#include <linux/netfilter.h>
#include <linux/rcupdate.h>
#include <linux/rhashtable.h>
#include <linux/vmalloc.h>
#include <net/genetlink.h>
#include <net/ila.h>
#include <net/netns/generic.h>
#include <uapi/linux/genetlink.h>
#include "ila.h"
/* One translation rule as configured over netlink. */
struct ila_xlat_params {
/* Locator rewrite parameters (new locator, optional match, csum delta). */
struct ila_params ip;
/* Hash table key: compared against the last 8 bytes of the destination. */
__be64 identifier;
/* Interface restriction; 0 acts as a wildcard in ila_cmp_wildcards(). */
int ifindex;
/* Bitmask of ILA_DIR_* values this rule applies to. */
unsigned int dir;
};
/* Table entry. Entries sharing an identifier are chained through @next;
 * only the first entry of such a chain is linked into the rhashtable.
 */
struct ila_map {
struct ila_xlat_params p;
/* rhashtable linkage (see rht_params.head_offset). */
struct rhash_head node;
/* Next mapping with the same identifier, or NULL. */
struct ila_map __rcu *next;
/* Used by kfree_rcu() in ila_release(). */
struct rcu_head rcu;
};
static unsigned int ila_net_id;
/* Per network namespace state, obtained with net_generic(net, ila_net_id). */
struct ila_net {
struct rhashtable rhash_table;
spinlock_t *locks; /* Bucket locks for entry manipulation */
/* Number of locks minus one; used to mask the identifier hash. */
unsigned int locks_mask;
/* Set once ila_nf_hook_ops have been registered for this namespace. */
bool hooks_registered;
};
#define LOCKS_PER_CPU 10
/* Allocate and initialise the spinlock array used to serialise updates of
 * mappings. The array size is LOCKS_PER_CPU times the number of possible
 * CPUs (capped at 32), rounded up to a power of two so that a hash can be
 * reduced with locks_mask.
 *
 * Returns 0 on success or -ENOMEM if the array cannot be allocated.
 */
static int alloc_ila_locks(struct ila_net *ilan)
{
unsigned int i, size;
unsigned int nr_pcpus = num_possible_cpus();
nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU);
/* Nothing is allocated when spinlock_t has zero size. */
if (sizeof(spinlock_t) != 0) {
#ifdef CONFIG_NUMA
/* Arrays larger than a page come from vmalloc(); note the "else" below
 * pairs with this "if" only when CONFIG_NUMA is set.
 */
if (size * sizeof(spinlock_t) > PAGE_SIZE)
ilan->locks = vmalloc(size * sizeof(spinlock_t));
else
#endif
ilan->locks = kmalloc_array(size, sizeof(spinlock_t),
GFP_KERNEL);
if (!ilan->locks)
return -ENOMEM;
for (i = 0; i < size; i++)
spin_lock_init(&ilan->locks[i]);
}
ilan->locks_mask = size - 1;
return 0;
}
static u32 hashrnd __read_mostly;
/* Fill hashrnd with random bytes via net_get_random_once().
 * NOTE(review): no caller of this helper is visible in this file, so hashrnd
 * appears to keep its initial value of zero — confirm.
 */
static __always_inline void __ila_hash_secret_init(void)
{
net_get_random_once(&hashrnd, sizeof(hashrnd));
}
/* Hash a 64-bit identifier to 32 bits, keyed with hashrnd.
 *
 * The secret is seeded here on first use; without this call nothing in the
 * file ever initialises hashrnd and the hash would be keyed with zero.
 */
static inline u32 ila_identifier_hash(__be64 identifier)
{
	u32 *v = (u32 *)&identifier;

	__ila_hash_secret_init();

	return jhash_2words(v[0], v[1], hashrnd);
}
/* Pick the update lock for @identifier: its hash masked by locks_mask indexes
 * the per-namespace lock array.
 */
static inline spinlock_t *ila_get_lock(struct ila_net *ilan, __be64 identifier)
{
	u32 slot = ila_identifier_hash(identifier) & ilan->locks_mask;

	return &ilan->locks[slot];
}
/* Match a packet against a mapping, treating a zero locator_match or ifindex
 * in the mapping as "any". Returns 0 when the mapping applies and non-zero
 * when it does not.
 */
static inline int ila_cmp_wildcards(struct ila_map *ila, __be64 loc,
				    int ifindex, unsigned int dir)
{
	if (ila->p.ip.locator_match && ila->p.ip.locator_match != loc)
		return 1;

	if (ila->p.ifindex && ila->p.ifindex != ifindex)
		return 1;

	/* At least one requested direction bit must be set in the mapping. */
	return !(ila->p.dir & dir);
}
/* Exact comparison of a mapping's locator_match, ifindex and dir with @p.
 * Returns 0 when all three are equal and 1 otherwise.
 */
static inline int ila_cmp_params(struct ila_map *ila, struct ila_xlat_params *p)
{
	if (ila->p.ip.locator_match != p->ip.locator_match)
		return 1;

	if (ila->p.ifindex != p->ifindex)
		return 1;

	return ila->p.dir != p->dir;
}
static int ila_cmpfn(struct rhashtable_compare_arg *arg,
const void *obj)
{
const struct ila_map *ila = obj;
return (ila->p.identifier != *(__be64 *)arg->key);
}
/* Specificity score used to order mappings that share an identifier:
 * bit 0 is set when a locator match is configured, bit 1 when an ifindex is.
 */
static inline int ila_order(struct ila_map *ila)
{
	int has_locator = ila->p.ip.locator_match ? (1 << 0) : 0;
	int has_ifindex = ila->p.ifindex ? (1 << 1) : 0;

	return has_locator + has_ifindex;
}
/* rhashtable configuration: entries are struct ila_map keyed by the 64-bit
 * identifier and compared with ila_cmpfn().
 */
static const struct rhashtable_params rht_params = {
.nelem_hint = 1024,
.head_offset = offsetof(struct ila_map, node),
.key_offset = offsetof(struct ila_map, p.identifier),
.key_len = sizeof(u64), /* identifier */
.max_size = 1048576,
.min_size = 256,
.automatic_shrinking = true,
.obj_cmpfn = ila_cmpfn,
};
/* Generic netlink family "ila". It is usable from any network namespace
 * (netnsok) and its operations are flagged to run in parallel
 * (parallel_ops).
 */
static struct genl_family ila_nl_family = {
.id = GENL_ID_GENERATE,
.hdrsize = 0,
.name = ILA_GENL_NAME,
.version = ILA_GENL_VERSION,
.maxattr = ILA_ATTR_MAX,
.netnsok = true,
.parallel_ops = true,
};
static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
[ILA_ATTR_IDENTIFIER] = { .type = NLA_U64, },
[ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
[ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, },
[ILA_ATTR_IFINDEX] = { .type = NLA_U32, },
[ILA_ATTR_DIR] = { .type = NLA_U32, },
};
/* Fill @p from the attributes of a netlink request. @p is zeroed first, so
 * every attribute that is absent leaves its field at 0. Always returns 0.
 */
static int parse_nl_config(struct genl_info *info,
			   struct ila_xlat_params *p)
{
	struct nlattr **attrs = info->attrs;
	struct nlattr *attr;

	memset(p, 0, sizeof(*p));

	attr = attrs[ILA_ATTR_IDENTIFIER];
	if (attr)
		p->identifier = (__force __be64)nla_get_u64(attr);

	attr = attrs[ILA_ATTR_LOCATOR];
	if (attr)
		p->ip.locator = (__force __be64)nla_get_u64(attr);

	attr = attrs[ILA_ATTR_LOCATOR_MATCH];
	if (attr)
		p->ip.locator_match = (__force __be64)nla_get_u64(attr);

	attr = attrs[ILA_ATTR_IFINDEX];
	if (attr)
		p->ifindex = nla_get_s32(attr);

	attr = attrs[ILA_ATTR_DIR];
	if (attr)
		p->dir = nla_get_u32(attr);

	return 0;
}
/* Must be called with rcu readlock.
 *
 * Find the first mapping for identifier @id that applies to locator @loc,
 * interface @ifindex and direction @dir, treating unset fields in a mapping
 * as wildcards. Returns the mapping or NULL.
 *
 * The chain is walked with rcu_dereference() because every fetched pointer
 * is dereferenced; rcu_access_pointer() is only meant for pointers whose
 * value is inspected but not followed.
 */
static inline struct ila_map *ila_lookup_wildcards(__be64 id, __be64 loc,
						   int ifindex,
						   unsigned int dir,
						   struct ila_net *ilan)
{
	struct ila_map *ila;

	ila = rhashtable_lookup_fast(&ilan->rhash_table, &id, rht_params);
	while (ila) {
		if (!ila_cmp_wildcards(ila, loc, ifindex, dir))
			return ila;
		ila = rcu_dereference(ila->next);
	}

	return NULL;
}
/* Must be called with rcu readlock.
 *
 * Find the mapping for @p->identifier whose locator_match, ifindex and dir
 * are exactly those in @p. Returns the mapping or NULL.
 *
 * The chain is walked with rcu_dereference() because every fetched pointer
 * is dereferenced; rcu_access_pointer() is only meant for pointers whose
 * value is inspected but not followed.
 */
static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *p,
						   struct ila_net *ilan)
{
	struct ila_map *ila;

	ila = rhashtable_lookup_fast(&ilan->rhash_table, &p->identifier,
				     rht_params);
	while (ila) {
		if (!ila_cmp_params(ila, p))
			return ila;
		ila = rcu_dereference(ila->next);
	}

	return NULL;
}
/* Free a mapping through kfree_rcu(), using the rcu_head embedded in it. */
static inline void ila_release(struct ila_map *ila)
{
kfree_rcu(ila, rcu);
}
/* Callback for rhashtable_free_and_destroy(): release the table entry @ptr
 * together with every mapping chained behind it. @arg is unused.
 */
static void ila_free_cb(void *ptr, void *arg)
{
	struct ila_map *cur, *following;

	/* Assume rcu_readlock held */
	for (cur = (struct ila_map *)ptr; cur; cur = following) {
		/* Fetch the successor before the current entry is released. */
		following = rcu_access_pointer(cur->next);
		ila_release(cur);
	}
}
static int ila_xlat_addr(struct sk_buff *skb, int dir);
/* Netfilter hook function: attempt an ILA translation of the packet for the
 * incoming direction and always let the packet continue (NF_ACCEPT). The
 * result of ila_xlat_addr() is not examined.
 */
static unsigned int
ila_nf_input(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
ila_xlat_addr(skb, ILA_DIR_IN);
return NF_ACCEPT;
}
/* Hooks registered per namespace by ila_add_mapping(): a single IPv6
 * NF_INET_PRE_ROUTING hook with priority -1 that runs ila_nf_input().
 */
static struct nf_hook_ops ila_nf_hook_ops[] __read_mostly = {
{
.hook = ila_nf_input,
.pf = NFPROTO_IPV6,
.hooknum = NF_INET_PRE_ROUTING,
.priority = -1,
},
};
/* Add the mapping described by @p to the table of namespace @net.
 *
 * Mappings with the same identifier form a singly linked chain ordered by
 * descending ila_order() score; only the chain head is linked into the
 * rhashtable. The per-identifier lock from ila_get_lock() serialises updates.
 *
 * Returns 0 on success, -EEXIST if a mapping with identical match parameters
 * exists, -ENOMEM on allocation failure, or the error from hook registration
 * or from the rhashtable insert/replace.
 */
static int ila_add_mapping(struct net *net, struct ila_xlat_params *p)
{
struct ila_net *ilan = net_generic(net, ila_net_id);
struct ila_map *ila, *head;
spinlock_t *lock = ila_get_lock(ilan, p->identifier);
int err = 0, order;
/* NOTE(review): this check-then-set runs without a lock while the genl
 * family sets parallel_ops; two concurrent first adds could presumably both
 * register the hooks — confirm.
 */
if (!ilan->hooks_registered) {
/* We defer registering net hooks in the namespace until the
* first mapping is added.
*/
err = nf_register_net_hooks(net, ila_nf_hook_ops,
ARRAY_SIZE(ila_nf_hook_ops));
if (err)
return err;
ilan->hooks_registered = true;
}
ila = kzalloc(sizeof(*ila), GFP_KERNEL);
if (!ila)
return -ENOMEM;
ila->p = *p;
if (p->ip.locator_match) {
/* Precompute checksum difference for translation since we
* know both the old locator and the new one.
*/
ila->p.ip.csum_diff = compute_csum_diff8(
(__be32 *)&p->ip.locator_match,
(__be32 *)&p->ip.locator);
}
order = ila_order(ila);
spin_lock(lock);
head = rhashtable_lookup_fast(&ilan->rhash_table, &p->identifier,
rht_params);
if (!head) {
/* New entry for the rhash_table */
err = rhashtable_lookup_insert_fast(&ilan->rhash_table,
&ila->node, rht_params);
} else {
struct ila_map *tila = head, *prev = NULL;
/* Walk the chain to find the insertion point: stop before the first
 * entry with a lower score, rejecting exact duplicates on the way.
 */
do {
if (!ila_cmp_params(tila, p)) {
err = -EEXIST;
goto out;
}
if (order > ila_order(tila))
break;
prev = tila;
tila = rcu_dereference_protected(tila->next,
lockdep_is_held(lock));
} while (tila);
if (prev) {
/* Insert in sub list of head */
RCU_INIT_POINTER(ila->next, tila);
rcu_assign_pointer(prev->next, ila);
} else {
/* Make this ila new head */
RCU_INIT_POINTER(ila->next, head);
err = rhashtable_replace_fast(&ilan->rhash_table,
&head->node,
&ila->node, rht_params);
if (err)
goto out;
}
}
out:
spin_unlock(lock);
/* On any failure the new entry was never published, so plain kfree(). */
if (err)
kfree(ila);
return err;
}
/* Remove the mapping whose identifier and match parameters equal @p from the
 * table of namespace @net.
 *
 * Returns 0 on success, -ENOENT if no such mapping exists, or the error from
 * the rhashtable replace/remove when the chain head is being deleted.
 */
static int ila_del_mapping(struct net *net, struct ila_xlat_params *p)
{
struct ila_net *ilan = net_generic(net, ila_net_id);
struct ila_map *ila, *head, *prev;
spinlock_t *lock = ila_get_lock(ilan, p->identifier);
int err = -ENOENT;
spin_lock(lock);
head = rhashtable_lookup_fast(&ilan->rhash_table,
&p->identifier, rht_params);
ila = head;
prev = NULL;
while (ila) {
/* Skip entries whose parameters differ, remembering the predecessor. */
if (ila_cmp_params(ila, p)) {
prev = ila;
ila = rcu_dereference_protected(ila->next,
lockdep_is_held(lock));
continue;
}
err = 0;
if (prev) {
/* Not head, just delete from list */
rcu_assign_pointer(prev->next, ila->next);
} else {
/* It is the head. If there is something in the
* sublist we need to make a new head.
*/
head = rcu_dereference_protected(ila->next,
lockdep_is_held(lock));
if (head) {
/* Put first entry in the sublist into the
* table
*/
err = rhashtable_replace_fast(
&ilan->rhash_table, &ila->node,
&head->node, rht_params);
/* On failure the entry is left in place and not released. */
if (err)
goto out;
} else {
/* Entry no longer used */
err = rhashtable_remove_fast(&ilan->rhash_table,
&ila->node,
rht_params);
}
}
ila_release(ila);
break;
}
out:
spin_unlock(lock);
return err;
}
static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = genl_info_net(info);
struct ila_xlat_params p;
int err;
err = parse_nl_config(info, &p);
if (err)
return err;
return ila_add_mapping(net, &p);
}
/* ILA_CMD_DEL handler: parse the request attributes and delete the matching
 * mapping from the requester's network namespace.
 *
 * NOTE(review): the return value of ila_del_mapping() is discarded, so the
 * request reports success even when nothing matched — confirm this is
 * intentional.
 */
static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = genl_info_net(info);
struct ila_xlat_params p;
int err;
err = parse_nl_config(info, &p);
if (err)
return err;
ila_del_mapping(net, &p);
return 0;
}
static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg)
{
if (nla_put_u64(msg, ILA_ATTR_IDENTIFIER,
(__force u64)ila->p.identifier) ||
nla_put_u64(msg, ILA_ATTR_LOCATOR,
(__force u64)ila->p.ip.locator) ||
nla_put_u64(msg, ILA_ATTR_LOCATOR_MATCH,
(__force u64)ila->p.ip.locator_match) ||
nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->p.ifindex) ||
nla_put_u32(msg, ILA_ATTR_DIR, ila->p.dir))
return -1;
return 0;
}
/* Write one complete generic netlink message describing @ila into @skb.
 * Returns 0 on success, -ENOMEM if the message header cannot be added, or
 * -EMSGSIZE if the attributes do not fit (the partial message is cancelled).
 */
static int ila_dump_info(struct ila_map *ila,
			 u32 portid, u32 seq, u32 flags,
			 struct sk_buff *skb, u8 cmd)
{
	void *hdr = genlmsg_put(skb, portid, seq, &ila_nl_family, flags, cmd);

	if (!hdr)
		return -ENOMEM;

	if (ila_fill_info(ila, skb) < 0) {
		genlmsg_cancel(skb, hdr);
		return -EMSGSIZE;
	}

	genlmsg_end(skb, hdr);
	return 0;
}
static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info)
{
struct net *net = genl_info_net(info);
struct ila_net *ilan = net_generic(net, ila_net_id);
struct sk_buff *msg;
struct ila_xlat_params p;
struct ila_map *ila;
int ret;
ret = parse_nl_config(info, &p);
if (ret)
return ret;
msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
if (!msg)
return -ENOMEM;
rcu_read_lock();
ila = ila_lookup_by_params(&p, ilan);
if (ila) {
ret = ila_dump_info(ila,
info->snd_portid,
info->snd_seq, 0, msg,
info->genlhdr->cmd);
}
rcu_read_unlock();
if (ret < 0)
goto out_free;
return genlmsg_reply(msg, info);
out_free:
nlmsg_free(msg);
return ret;
}
/* Dump state overlaid on netlink_callback::args by the dump callbacks. */
struct ila_dump_iter {
struct rhashtable_iter rhiter;
};
/* Dump start callback: initialise the rhashtable walker kept in cb->args for
 * the table of the requesting socket's network namespace. Returns the result
 * of rhashtable_walk_init().
 */
static int ila_nl_dump_start(struct netlink_callback *cb)
{
	struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args;
	struct ila_net *ilan = net_generic(sock_net(cb->skb->sk), ila_net_id);

	return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter);
}
/* Dump done callback: release the rhashtable walker kept in cb->args.
 * Always returns 0.
 */
static int ila_nl_dump_done(struct netlink_callback *cb)
{
struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args;
rhashtable_walk_exit(&iter->rhiter);
return 0;
}
/* Dump callback: walk the table with the iterator stored in cb->args and
 * emit one NLM_F_MULTI message per mapping, including the mappings chained
 * behind each table entry.
 *
 * Returns skb->len when the walk reaches the end of the table, otherwise the
 * error that stopped it.
 *
 * NOTE(review): any non-zero result of ila_dump_info(), including -EMSGSIZE,
 * ends the dump with that error instead of returning skb->len — presumably a
 * full skb therefore aborts the dump rather than continuing it; confirm.
 */
static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args;
struct rhashtable_iter *rhiter = &iter->rhiter;
struct ila_map *ila;
int ret;
/* -EAGAIN from walk_start is tolerated; other errors end the dump. */
ret = rhashtable_walk_start(rhiter);
if (ret && ret != -EAGAIN)
goto done;
for (;;) {
ila = rhashtable_walk_next(rhiter);
if (IS_ERR(ila)) {
/* -EAGAIN from walk_next: keep walking. */
if (PTR_ERR(ila) == -EAGAIN)
continue;
ret = PTR_ERR(ila);
goto done;
} else if (!ila) {
break;
}
/* Emit the table entry and every mapping chained behind it. */
while (ila) {
ret = ila_dump_info(ila, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, NLM_F_MULTI,
skb, ILA_CMD_GET);
if (ret)
goto done;
ila = rcu_access_pointer(ila->next);
}
}
ret = skb->len;
done:
/* walk_stop is called on every path, including early errors. */
rhashtable_walk_stop(rhiter);
return ret;
}
/* Operations of the "ila" family. ADD and DEL carry GENL_ADMIN_PERM; GET has
 * no such flag and supports both a single lookup (doit) and a full dump
 * (start/dumpit/done).
 */
static const struct genl_ops ila_nl_ops[] = {
{
.cmd = ILA_CMD_ADD,
.doit = ila_nl_cmd_add_mapping,
.policy = ila_nl_policy,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = ILA_CMD_DEL,
.doit = ila_nl_cmd_del_mapping,
.policy = ila_nl_policy,
.flags = GENL_ADMIN_PERM,
},
{
.cmd = ILA_CMD_GET,
.doit = ila_nl_cmd_get_mapping,
.start = ila_nl_dump_start,
.dumpit = ila_nl_dump,
.done = ila_nl_dump_done,
.policy = ila_nl_policy,
},
};
#define ILA_HASH_TABLE_SIZE 1024
static __net_init int ila_init_net(struct net *net)
{
int err;
struct ila_net *ilan = net_generic(net, ila_net_id);
err = alloc_ila_locks(ilan);
if (err)
return err;
rhashtable_init(&ilan->rhash_table, &rht_params);
return 0;
}
static __net_exit void ila_exit_net(struct net *net)
{
struct ila_net *ilan = net_generic(net, ila_net_id);
rhashtable_free_and_destroy(&ilan->rhash_table, ila_free_cb, NULL);
kvfree(ilan->locks);
if (ilan->hooks_registered)
nf_unregister_net_hooks(net, ila_nf_hook_ops,
ARRAY_SIZE(ila_nf_hook_ops));
}
/* Per-namespace registration: each namespace gets a struct ila_net of .size
 * bytes, reachable through net_generic() with the id stored in ila_net_id.
 */
static struct pernet_operations ila_net_ops = {
.init = ila_init_net,
.exit = ila_exit_net,
.id = &ila_net_id,
.size = sizeof(struct ila_net),
};
/* Translate the destination address of @skb if a mapping applies.
 *
 * The last 8 bytes of the destination address are used as the identifier and
 * the first 8 bytes as the locator to match; the lookup also considers the
 * receiving device's ifindex and @dir (ILA_DIR_*). When a mapping is found
 * the locator is rewritten with update_ipv6_locator().
 *
 * Always returns 0, whether or not a translation took place.
 */
static int ila_xlat_addr(struct sk_buff *skb, int dir)
{
	struct ila_map *ila;
	struct ipv6hdr *ip6h = ipv6_hdr(skb);
	struct net *net = dev_net(skb->dev);
	struct ila_net *ilan = net_generic(net, ila_net_id);
	__be64 identifier, locator_match;

	/* Assumes skb contains a valid IPv6 header that is pulled */

	identifier = *(__be64 *)&ip6h->daddr.in6_u.u6_addr8[8];
	locator_match = *(__be64 *)&ip6h->daddr.in6_u.u6_addr8[0];

	rcu_read_lock();

	ila = ila_lookup_wildcards(identifier, locator_match,
				   skb->dev->ifindex, dir, ilan);
	if (ila)
		update_ipv6_locator(skb, &ila->p.ip);

	rcu_read_unlock();

	return 0;
}
/* Exported entry point: translate @skb using mappings valid for ILA_DIR_IN.
 * Returns the result of ila_xlat_addr().
 */
int ila_xlat_incoming(struct sk_buff *skb)
{
return ila_xlat_addr(skb, ILA_DIR_IN);
}
EXPORT_SYMBOL(ila_xlat_incoming);
/* Exported entry point: translate @skb using mappings valid for ILA_DIR_OUT.
 * Returns the result of ila_xlat_addr().
 */
int ila_xlat_outgoing(struct sk_buff *skb)
{
return ila_xlat_addr(skb, ILA_DIR_OUT);
}
EXPORT_SYMBOL(ila_xlat_outgoing);
/* Register the per-namespace operations and then the generic netlink family.
 * If the family cannot be registered the per-namespace operations are removed
 * again. Returns 0 on success or the error of the failing step.
 */
int ila_xlat_init(void)
{
	int ret = register_pernet_device(&ila_net_ops);

	if (ret)
		return ret;

	ret = genl_register_family_with_ops(&ila_nl_family,
					    ila_nl_ops);
	if (ret < 0) {
		unregister_pernet_device(&ila_net_ops);
		return ret;
	}

	return 0;
}
/* Undo ila_xlat_init(): the netlink family is unregistered before the
 * per-namespace operations.
 */
void ila_xlat_fini(void)
{
genl_unregister_family(&ila_nl_family);
unregister_pernet_device(&ila_net_ops);
}
...@@ -2915,6 +2915,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, ...@@ -2915,6 +2915,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
cb = &nlk->cb; cb = &nlk->cb;
memset(cb, 0, sizeof(*cb)); memset(cb, 0, sizeof(*cb));
cb->start = control->start;
cb->dump = control->dump; cb->dump = control->dump;
cb->done = control->done; cb->done = control->done;
cb->nlh = nlh; cb->nlh = nlh;
...@@ -2927,6 +2928,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb, ...@@ -2927,6 +2928,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
mutex_unlock(nlk->cb_mutex); mutex_unlock(nlk->cb_mutex);
if (cb->start)
cb->start(cb);
ret = netlink_dump(sk); ret = netlink_dump(sk);
sock_put(sk); sock_put(sk);
......
...@@ -513,6 +513,20 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq, ...@@ -513,6 +513,20 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
} }
EXPORT_SYMBOL(genlmsg_put); EXPORT_SYMBOL(genlmsg_put);
/* Dump start wrapper: run the operation's ->start callback, if it has one,
 * with genl_lock() held. Returns the callback's result, or 0 when the
 * operation defines no ->start.
 */
static int genl_lock_start(struct netlink_callback *cb)
{
	/* our ops are always const - netlink API doesn't propagate that */
	const struct genl_ops *ops = cb->data;
	int rc;

	if (!ops->start)
		return 0;

	genl_lock();
	rc = ops->start(cb);
	genl_unlock();

	return rc;
}
static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb) static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{ {
/* our ops are always const - netlink API doesn't propagate that */ /* our ops are always const - netlink API doesn't propagate that */
...@@ -577,6 +591,7 @@ static int genl_family_rcv_msg(struct genl_family *family, ...@@ -577,6 +591,7 @@ static int genl_family_rcv_msg(struct genl_family *family,
.module = family->module, .module = family->module,
/* we have const, but the netlink API doesn't */ /* we have const, but the netlink API doesn't */
.data = (void *)ops, .data = (void *)ops,
.start = genl_lock_start,
.dump = genl_lock_dumpit, .dump = genl_lock_dumpit,
.done = genl_lock_done, .done = genl_lock_done,
}; };
...@@ -588,6 +603,7 @@ static int genl_family_rcv_msg(struct genl_family *family, ...@@ -588,6 +603,7 @@ static int genl_family_rcv_msg(struct genl_family *family,
} else { } else {
struct netlink_dump_control c = { struct netlink_dump_control c = {
.module = family->module, .module = family->module,
.start = ops->start,
.dump = ops->dumpit, .dump = ops->dumpit,
.done = ops->done, .done = ops->done,
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment