Commit dc3d807d authored by David S. Miller's avatar David S. Miller

openvswitch: gre tunneling support.

Pravin B Shelar says:

====================
Following patch series adds support for gre tunneling.
First six patches extend kernel gre and ip_tunnel modules
api so that there is more code sharing between gre modules
and ovs. Rest of patches adds ovs tunneling infrastructre
and gre protocol vport.

V2 fixes two patches according to comments from Jesse.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents ac8025a6 aa310701
......@@ -1021,7 +1021,6 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
struct vxlan_dev *vxlan = netdev_priv(dev);
struct rtable *rt;
const struct iphdr *old_iph;
struct iphdr *iph;
struct vxlanhdr *vxh;
struct udphdr *uh;
struct flowi4 fl4;
......@@ -1030,6 +1029,7 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
u32 vni;
__be16 df = 0;
__u8 tos, ttl;
int err;
dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
vni = rdst->remote_vni;
......@@ -1097,13 +1097,6 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
vxlan_encap_bypass(skb, vxlan, dst_vxlan);
return NETDEV_TX_OK;
}
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
IPSKB_REROUTED);
skb_dst_drop(skb);
skb_dst_set(skb, &rt->dst);
vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
vxh->vx_flags = htonl(VXLAN_FLAGS);
vxh->vx_vni = htonl(vni << 8);
......@@ -1118,27 +1111,18 @@ static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
uh->len = htons(skb->len);
uh->check = 0;
__skb_push(skb, sizeof(*iph));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = sizeof(struct iphdr) >> 2;
iph->frag_off = df;
iph->protocol = IPPROTO_UDP;
iph->tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
iph->daddr = dst;
iph->saddr = fl4.saddr;
iph->ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
tunnel_ip_select_ident(skb, old_iph, &rt->dst);
nf_reset(skb);
vxlan_set_owner(dev, skb);
if (handle_offloads(skb))
goto drop;
iptunnel_xmit(skb, dev);
tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, dst,
IPPROTO_UDP, tos, ttl, df);
iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
return NETDEV_TX_OK;
drop:
......
......@@ -7,6 +7,7 @@
#define GREPROTO_CISCO 0
#define GREPROTO_PPTP 1
#define GREPROTO_MAX 2
#define GRE_IP_PROTO_MAX 2
struct gre_protocol {
int (*handler)(struct sk_buff *skb);
......@@ -22,6 +23,32 @@ struct gre_base_hdr {
int gre_add_protocol(const struct gre_protocol *proto, u8 version);
int gre_del_protocol(const struct gre_protocol *proto, u8 version);
struct gre_cisco_protocol {
int (*handler)(struct sk_buff *skb, const struct tnl_ptk_info *tpi);
int (*err_handler)(struct sk_buff *skb, u32 info,
const struct tnl_ptk_info *tpi);
u8 priority;
};
int gre_cisco_register(struct gre_cisco_protocol *proto);
int gre_cisco_unregister(struct gre_cisco_protocol *proto);
void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
int hdr_len);
struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum);
static inline int ip_gre_calc_hlen(__be16 o_flags)
{
int addend = 4;
if (o_flags&TUNNEL_CSUM)
addend += 4;
if (o_flags&TUNNEL_KEY)
addend += 4;
if (o_flags&TUNNEL_SEQ)
addend += 4;
return addend;
}
static inline __be16 gre_flags_to_tnl_flags(__be16 flags)
{
__be16 tflags = 0;
......
......@@ -73,6 +73,7 @@ struct ip_tunnel {
#define TUNNEL_REC __cpu_to_be16(0x20)
#define TUNNEL_VERSION __cpu_to_be16(0x40)
#define TUNNEL_NO_KEY __cpu_to_be16(0x80)
#define TUNNEL_DONT_FRAGMENT __cpu_to_be16(0x0100)
struct tnl_ptk_info {
__be16 flags;
......@@ -155,23 +156,28 @@ static inline void tunnel_ip_select_ident(struct sk_buff *skb,
(skb_shinfo(skb)->gso_segs ?: 1) - 1);
}
static inline void iptunnel_xmit(struct sk_buff *skb, struct net_device *dev)
{
int err;
int pkt_len = skb->len - skb_transport_offset(skb);
struct pcpu_tstats *tstats = this_cpu_ptr(dev->tstats);
int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto);
int iptunnel_xmit(struct net *net, struct rtable *rt,
struct sk_buff *skb,
__be32 src, __be32 dst, __u8 proto,
__u8 tos, __u8 ttl, __be16 df);
nf_reset(skb);
static inline void iptunnel_xmit_stats(int err,
struct net_device_stats *err_stats,
struct pcpu_tstats __percpu *stats)
{
if (err > 0) {
struct pcpu_tstats *tstats = this_cpu_ptr(stats);
err = ip_local_out(skb);
if (likely(net_xmit_eval(err) == 0)) {
u64_stats_update_begin(&tstats->syncp);
tstats->tx_bytes += pkt_len;
tstats->tx_bytes += err;
tstats->tx_packets++;
u64_stats_update_end(&tstats->syncp);
} else if (err < 0) {
err_stats->tx_errors++;
err_stats->tx_aborted_errors++;
} else {
dev->stats.tx_errors++;
dev->stats.tx_aborted_errors++;
err_stats->tx_dropped++;
}
}
#endif /* __NET_IP_TUNNELS_H */
......@@ -164,6 +164,7 @@ enum ovs_vport_type {
OVS_VPORT_TYPE_UNSPEC,
OVS_VPORT_TYPE_NETDEV, /* network device */
OVS_VPORT_TYPE_INTERNAL, /* network device implemented by datapath */
OVS_VPORT_TYPE_GRE, /* GRE tunnel. */
__OVS_VPORT_TYPE_MAX
};
......@@ -246,11 +247,29 @@ enum ovs_key_attr {
OVS_KEY_ATTR_ARP, /* struct ovs_key_arp */
OVS_KEY_ATTR_ND, /* struct ovs_key_nd */
OVS_KEY_ATTR_SKB_MARK, /* u32 skb mark */
OVS_KEY_ATTR_TUNNEL, /* Nested set of ovs_tunnel attributes */
#ifdef __KERNEL__
OVS_KEY_ATTR_IPV4_TUNNEL, /* struct ovs_key_ipv4_tunnel */
#endif
__OVS_KEY_ATTR_MAX
};
#define OVS_KEY_ATTR_MAX (__OVS_KEY_ATTR_MAX - 1)
enum ovs_tunnel_key_attr {
OVS_TUNNEL_KEY_ATTR_ID, /* be64 Tunnel ID */
OVS_TUNNEL_KEY_ATTR_IPV4_SRC, /* be32 src IP address. */
OVS_TUNNEL_KEY_ATTR_IPV4_DST, /* be32 dst IP address. */
OVS_TUNNEL_KEY_ATTR_TOS, /* u8 Tunnel IP ToS. */
OVS_TUNNEL_KEY_ATTR_TTL, /* u8 Tunnel IP TTL. */
OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT, /* No argument, set DF. */
OVS_TUNNEL_KEY_ATTR_CSUM, /* No argument. CSUM packet. */
__OVS_TUNNEL_KEY_ATTR_MAX
};
#define OVS_TUNNEL_KEY_ATTR_MAX (__OVS_TUNNEL_KEY_ATTR_MAX - 1)
/**
* enum ovs_frag_type - IPv4 and IPv6 fragment type
* @OVS_FRAG_TYPE_NONE: Packet is not a fragment.
......
......@@ -11,7 +11,7 @@ obj-y := route.o inetpeer.o protocol.o \
tcp_offload.o datagram.o raw.o udp.o udplite.o \
udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \
fib_frontend.o fib_semantics.o fib_trie.o \
inet_fragment.o ping.o
inet_fragment.o ping.o ip_tunnel_core.o
obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
......
......@@ -13,6 +13,8 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/if.h>
#include <linux/icmp.h>
#include <linux/kernel.h>
#include <linux/kmod.h>
#include <linux/skbuff.h>
......@@ -24,51 +26,270 @@
#include <net/protocol.h>
#include <net/gre.h>
#include <net/icmp.h>
#include <net/route.h>
#include <net/xfrm.h>
static const struct gre_protocol __rcu *gre_proto[GREPROTO_MAX] __read_mostly;
static DEFINE_SPINLOCK(gre_proto_lock);
static struct gre_cisco_protocol __rcu *gre_cisco_proto_list[GRE_IP_PROTO_MAX];
int gre_add_protocol(const struct gre_protocol *proto, u8 version)
{
if (version >= GREPROTO_MAX)
goto err_out;
spin_lock(&gre_proto_lock);
if (gre_proto[version])
goto err_out_unlock;
RCU_INIT_POINTER(gre_proto[version], proto);
spin_unlock(&gre_proto_lock);
return 0;
return -EINVAL;
err_out_unlock:
spin_unlock(&gre_proto_lock);
err_out:
return -1;
return (cmpxchg((const struct gre_protocol **)&gre_proto[version], NULL, proto) == NULL) ?
0 : -EBUSY;
}
EXPORT_SYMBOL_GPL(gre_add_protocol);
int gre_del_protocol(const struct gre_protocol *proto, u8 version)
{
int ret;
if (version >= GREPROTO_MAX)
goto err_out;
spin_lock(&gre_proto_lock);
if (rcu_dereference_protected(gre_proto[version],
lockdep_is_held(&gre_proto_lock)) != proto)
goto err_out_unlock;
RCU_INIT_POINTER(gre_proto[version], NULL);
spin_unlock(&gre_proto_lock);
return -EINVAL;
ret = (cmpxchg((const struct gre_protocol **)&gre_proto[version], proto, NULL) == proto) ?
0 : -EBUSY;
if (ret)
return ret;
synchronize_rcu();
return 0;
err_out_unlock:
spin_unlock(&gre_proto_lock);
err_out:
return -1;
}
EXPORT_SYMBOL_GPL(gre_del_protocol);
void gre_build_header(struct sk_buff *skb, const struct tnl_ptk_info *tpi,
int hdr_len)
{
struct gre_base_hdr *greh;
skb_push(skb, hdr_len);
greh = (struct gre_base_hdr *)skb->data;
greh->flags = tnl_flags_to_gre_flags(tpi->flags);
greh->protocol = tpi->proto;
if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
if (tpi->flags&TUNNEL_SEQ) {
*ptr = tpi->seq;
ptr--;
}
if (tpi->flags&TUNNEL_KEY) {
*ptr = tpi->key;
ptr--;
}
if (tpi->flags&TUNNEL_CSUM &&
!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) {
*ptr = 0;
*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
skb->len, 0));
}
}
}
EXPORT_SYMBOL_GPL(gre_build_header);
struct sk_buff *gre_handle_offloads(struct sk_buff *skb, bool gre_csum)
{
int err;
if (likely(!skb->encapsulation)) {
skb_reset_inner_headers(skb);
skb->encapsulation = 1;
}
if (skb_is_gso(skb)) {
err = skb_unclone(skb, GFP_ATOMIC);
if (unlikely(err))
goto error;
skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
return skb;
} else if (skb->ip_summed == CHECKSUM_PARTIAL && gre_csum) {
err = skb_checksum_help(skb);
if (unlikely(err))
goto error;
} else if (skb->ip_summed != CHECKSUM_PARTIAL)
skb->ip_summed = CHECKSUM_NONE;
return skb;
error:
kfree_skb(skb);
return ERR_PTR(err);
}
EXPORT_SYMBOL_GPL(gre_handle_offloads);
static __sum16 check_checksum(struct sk_buff *skb)
{
__sum16 csum = 0;
switch (skb->ip_summed) {
case CHECKSUM_COMPLETE:
csum = csum_fold(skb->csum);
if (!csum)
break;
/* Fall through. */
case CHECKSUM_NONE:
skb->csum = 0;
csum = __skb_checksum_complete(skb);
skb->ip_summed = CHECKSUM_COMPLETE;
break;
}
return csum;
}
static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
bool *csum_err)
{
unsigned int ip_hlen = ip_hdrlen(skb);
const struct gre_base_hdr *greh;
__be32 *options;
int hdr_len;
if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
return -EINVAL;
greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
return -EINVAL;
tpi->flags = gre_flags_to_tnl_flags(greh->flags);
hdr_len = ip_gre_calc_hlen(tpi->flags);
if (!pskb_may_pull(skb, hdr_len))
return -EINVAL;
greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
tpi->proto = greh->protocol;
options = (__be32 *)(greh + 1);
if (greh->flags & GRE_CSUM) {
if (check_checksum(skb)) {
*csum_err = true;
return -EINVAL;
}
options++;
}
if (greh->flags & GRE_KEY) {
tpi->key = *options;
options++;
} else
tpi->key = 0;
if (unlikely(greh->flags & GRE_SEQ)) {
tpi->seq = *options;
options++;
} else
tpi->seq = 0;
/* WCCP version 1 and 2 protocol decoding.
* - Change protocol to IP
* - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
*/
if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
tpi->proto = htons(ETH_P_IP);
if ((*(u8 *)options & 0xF0) != 0x40) {
hdr_len += 4;
if (!pskb_may_pull(skb, hdr_len))
return -EINVAL;
}
}
return iptunnel_pull_header(skb, hdr_len, tpi->proto);
}
static int gre_cisco_rcv(struct sk_buff *skb)
{
struct tnl_ptk_info tpi;
int i;
bool csum_err = false;
if (parse_gre_header(skb, &tpi, &csum_err) < 0)
goto drop;
rcu_read_lock();
for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
struct gre_cisco_protocol *proto;
int ret;
proto = rcu_dereference(gre_cisco_proto_list[i]);
if (!proto)
continue;
ret = proto->handler(skb, &tpi);
if (ret == PACKET_RCVD) {
rcu_read_unlock();
return 0;
}
}
rcu_read_unlock();
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
kfree_skb(skb);
return 0;
}
static void gre_cisco_err(struct sk_buff *skb, u32 info)
{
/* All the routers (except for Linux) return only
* 8 bytes of packet payload. It means, that precise relaying of
* ICMP in the real Internet is absolutely infeasible.
*
* Moreover, Cisco "wise men" put GRE key to the third word
* in GRE header. It makes impossible maintaining even soft
* state for keyed
* GRE tunnels with enabled checksum. Tell them "thank you".
*
* Well, I wonder, rfc1812 was written by Cisco employee,
* what the hell these idiots break standards established
* by themselves???
*/
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct tnl_ptk_info tpi;
bool csum_err = false;
int i;
if (parse_gre_header(skb, &tpi, &csum_err)) {
if (!csum_err) /* ignore csum errors. */
return;
}
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, dev_net(skb->dev), info,
skb->dev->ifindex, 0, IPPROTO_GRE, 0);
return;
}
if (type == ICMP_REDIRECT) {
ipv4_redirect(skb, dev_net(skb->dev), skb->dev->ifindex, 0,
IPPROTO_GRE, 0);
return;
}
rcu_read_lock();
for (i = 0; i < GRE_IP_PROTO_MAX; i++) {
struct gre_cisco_protocol *proto;
proto = rcu_dereference(gre_cisco_proto_list[i]);
if (!proto)
continue;
if (proto->err_handler(skb, info, &tpi) == PACKET_RCVD)
goto out;
}
out:
rcu_read_unlock();
}
static int gre_rcv(struct sk_buff *skb)
{
const struct gre_protocol *proto;
......@@ -220,27 +441,68 @@ static const struct net_offload gre_offload = {
},
};
static const struct gre_protocol ipgre_protocol = {
.handler = gre_cisco_rcv,
.err_handler = gre_cisco_err,
};
int gre_cisco_register(struct gre_cisco_protocol *newp)
{
struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
&gre_cisco_proto_list[newp->priority];
return (cmpxchg(proto, NULL, newp) == NULL) ? 0 : -EBUSY;
}
EXPORT_SYMBOL_GPL(gre_cisco_register);
int gre_cisco_unregister(struct gre_cisco_protocol *del_proto)
{
struct gre_cisco_protocol **proto = (struct gre_cisco_protocol **)
&gre_cisco_proto_list[del_proto->priority];
int ret;
ret = (cmpxchg(proto, del_proto, NULL) == del_proto) ? 0 : -EINVAL;
if (ret)
return ret;
synchronize_net();
return 0;
}
EXPORT_SYMBOL_GPL(gre_cisco_unregister);
static int __init gre_init(void)
{
pr_info("GRE over IPv4 demultiplexor driver\n");
if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) {
pr_err("can't add protocol\n");
return -EAGAIN;
goto err;
}
if (gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) {
pr_info("%s: can't add ipgre handler\n", __func__);
goto err_gre;
}
if (inet_add_offload(&gre_offload, IPPROTO_GRE)) {
pr_err("can't add protocol offload\n");
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
return -EAGAIN;
goto err_gso;
}
return 0;
err_gso:
gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
err_gre:
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
err:
return -EAGAIN;
}
static void __exit gre_exit(void)
{
inet_del_offload(&gre_offload, IPPROTO_GRE);
gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
inet_del_protocol(&net_gre_protocol, IPPROTO_GRE);
}
......@@ -250,4 +512,3 @@ module_exit(gre_exit);
MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver");
MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)");
MODULE_LICENSE("GPL");
......@@ -121,103 +121,8 @@ static int ipgre_tunnel_init(struct net_device *dev);
static int ipgre_net_id __read_mostly;
static int gre_tap_net_id __read_mostly;
static __sum16 check_checksum(struct sk_buff *skb)
{
__sum16 csum = 0;
switch (skb->ip_summed) {
case CHECKSUM_COMPLETE:
csum = csum_fold(skb->csum);
if (!csum)
break;
/* Fall through. */
case CHECKSUM_NONE:
skb->csum = 0;
csum = __skb_checksum_complete(skb);
skb->ip_summed = CHECKSUM_COMPLETE;
break;
}
return csum;
}
static int ip_gre_calc_hlen(__be16 o_flags)
{
int addend = 4;
if (o_flags&TUNNEL_CSUM)
addend += 4;
if (o_flags&TUNNEL_KEY)
addend += 4;
if (o_flags&TUNNEL_SEQ)
addend += 4;
return addend;
}
static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
bool *csum_err, int *hdr_len)
{
unsigned int ip_hlen = ip_hdrlen(skb);
const struct gre_base_hdr *greh;
__be32 *options;
if (unlikely(!pskb_may_pull(skb, sizeof(struct gre_base_hdr))))
return -EINVAL;
greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
if (unlikely(greh->flags & (GRE_VERSION | GRE_ROUTING)))
return -EINVAL;
tpi->flags = gre_flags_to_tnl_flags(greh->flags);
*hdr_len = ip_gre_calc_hlen(tpi->flags);
if (!pskb_may_pull(skb, *hdr_len))
return -EINVAL;
greh = (struct gre_base_hdr *)(skb_network_header(skb) + ip_hlen);
tpi->proto = greh->protocol;
options = (__be32 *)(greh + 1);
if (greh->flags & GRE_CSUM) {
if (check_checksum(skb)) {
*csum_err = true;
return -EINVAL;
}
options++;
}
if (greh->flags & GRE_KEY) {
tpi->key = *options;
options++;
} else
tpi->key = 0;
if (unlikely(greh->flags & GRE_SEQ)) {
tpi->seq = *options;
options++;
} else
tpi->seq = 0;
/* WCCP version 1 and 2 protocol decoding.
* - Change protocol to IP
* - When dealing with WCCPv2, Skip extra 4 bytes in GRE header
*/
if (greh->flags == 0 && tpi->proto == htons(ETH_P_WCCP)) {
tpi->proto = htons(ETH_P_IP);
if ((*(u8 *)options & 0xF0) != 0x40) {
*hdr_len += 4;
if (!pskb_may_pull(skb, *hdr_len))
return -EINVAL;
}
}
return 0;
}
static void ipgre_err(struct sk_buff *skb, u32 info)
static int ipgre_err(struct sk_buff *skb, u32 info,
const struct tnl_ptk_info *tpi)
{
/* All the routers (except for Linux) return only
......@@ -239,26 +144,18 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
const int type = icmp_hdr(skb)->type;
const int code = icmp_hdr(skb)->code;
struct ip_tunnel *t;
struct tnl_ptk_info tpi;
int hdr_len;
bool csum_err = false;
if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len)) {
if (!csum_err) /* ignore csum errors. */
return;
}
switch (type) {
default:
case ICMP_PARAMETERPROB:
return;
return PACKET_RCVD;
case ICMP_DEST_UNREACH:
switch (code) {
case ICMP_SR_FAILED:
case ICMP_PORT_UNREACH:
/* Impossible event. */
return;
return PACKET_RCVD;
default:
/* All others are translated to HOST_UNREACH.
rfc2003 contains "deep thoughts" about NET_UNREACH,
......@@ -269,138 +166,61 @@ static void ipgre_err(struct sk_buff *skb, u32 info)
break;
case ICMP_TIME_EXCEEDED:
if (code != ICMP_EXC_TTL)
return;
return PACKET_RCVD;
break;
case ICMP_REDIRECT:
break;
}
if (tpi.proto == htons(ETH_P_TEB))
if (tpi->proto == htons(ETH_P_TEB))
itn = net_generic(net, gre_tap_net_id);
else
itn = net_generic(net, ipgre_net_id);
iph = (const struct iphdr *)skb->data;
t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
iph->daddr, iph->saddr, tpi.key);
t = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
iph->daddr, iph->saddr, tpi->key);
if (t == NULL)
return;
return PACKET_REJECT;
if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
ipv4_update_pmtu(skb, dev_net(skb->dev), info,
t->parms.link, 0, IPPROTO_GRE, 0);
return;
}
if (type == ICMP_REDIRECT) {
ipv4_redirect(skb, dev_net(skb->dev), t->parms.link, 0,
IPPROTO_GRE, 0);
return;
}
if (t->parms.iph.daddr == 0 ||
ipv4_is_multicast(t->parms.iph.daddr))
return;
return PACKET_RCVD;
if (t->parms.iph.ttl == 0 && type == ICMP_TIME_EXCEEDED)
return;
return PACKET_RCVD;
if (time_before(jiffies, t->err_time + IPTUNNEL_ERR_TIMEO))
t->err_count++;
else
t->err_count = 1;
t->err_time = jiffies;
return PACKET_RCVD;
}
static int ipgre_rcv(struct sk_buff *skb)
static int ipgre_rcv(struct sk_buff *skb, const struct tnl_ptk_info *tpi)
{
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn;
const struct iphdr *iph;
struct ip_tunnel *tunnel;
struct tnl_ptk_info tpi;
int hdr_len;
bool csum_err = false;
if (parse_gre_header(skb, &tpi, &csum_err, &hdr_len) < 0)
goto drop;
if (tpi.proto == htons(ETH_P_TEB))
if (tpi->proto == htons(ETH_P_TEB))
itn = net_generic(net, gre_tap_net_id);
else
itn = net_generic(net, ipgre_net_id);
iph = ip_hdr(skb);
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi.flags,
iph->saddr, iph->daddr, tpi.key);
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, tpi->flags,
iph->saddr, iph->daddr, tpi->key);
if (tunnel) {
ip_tunnel_rcv(tunnel, skb, &tpi, log_ecn_error);
return 0;
}
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
drop:
kfree_skb(skb);
return 0;
}
static struct sk_buff *handle_offloads(struct ip_tunnel *tunnel, struct sk_buff *skb)
{
int err;
if (skb_is_gso(skb)) {
err = skb_unclone(skb, GFP_ATOMIC);
if (unlikely(err))
goto error;
skb_shinfo(skb)->gso_type |= SKB_GSO_GRE;
return skb;
} else if (skb->ip_summed == CHECKSUM_PARTIAL &&
tunnel->parms.o_flags&TUNNEL_CSUM) {
err = skb_checksum_help(skb);
if (unlikely(err))
goto error;
} else if (skb->ip_summed != CHECKSUM_PARTIAL)
skb->ip_summed = CHECKSUM_NONE;
return skb;
error:
kfree_skb(skb);
return ERR_PTR(err);
}
static struct sk_buff *gre_build_header(struct sk_buff *skb,
const struct tnl_ptk_info *tpi,
int hdr_len)
{
struct gre_base_hdr *greh;
skb_push(skb, hdr_len);
greh = (struct gre_base_hdr *)skb->data;
greh->flags = tnl_flags_to_gre_flags(tpi->flags);
greh->protocol = tpi->proto;
if (tpi->flags&(TUNNEL_KEY|TUNNEL_CSUM|TUNNEL_SEQ)) {
__be32 *ptr = (__be32 *)(((u8 *)greh) + hdr_len - 4);
if (tpi->flags&TUNNEL_SEQ) {
*ptr = tpi->seq;
ptr--;
}
if (tpi->flags&TUNNEL_KEY) {
*ptr = tpi->key;
ptr--;
}
if (tpi->flags&TUNNEL_CSUM &&
!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE)) {
*(__sum16 *)ptr = 0;
*(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
skb->len, 0));
ip_tunnel_rcv(tunnel, skb, tpi, log_ecn_error);
return PACKET_RCVD;
}
}
return skb;
return PACKET_REJECT;
}
static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
......@@ -410,11 +230,6 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
struct ip_tunnel *tunnel = netdev_priv(dev);
struct tnl_ptk_info tpi;
if (likely(!skb->encapsulation)) {
skb_reset_inner_headers(skb);
skb->encapsulation = 1;
}
tpi.flags = tunnel->parms.o_flags;
tpi.proto = proto;
tpi.key = tunnel->parms.o_key;
......@@ -423,11 +238,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
tpi.seq = htonl(tunnel->o_seqno);
/* Push GRE header. */
skb = gre_build_header(skb, &tpi, tunnel->hlen);
if (unlikely(!skb)) {
dev->stats.tx_dropped++;
return;
}
gre_build_header(skb, &tpi, tunnel->hlen);
ip_tunnel_xmit(skb, dev, tnl_params, tnl_params->protocol);
}
......@@ -438,7 +249,7 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb,
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *tnl_params;
skb = handle_offloads(tunnel, skb);
skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
if (IS_ERR(skb))
goto out;
......@@ -477,7 +288,7 @@ static netdev_tx_t gre_tap_xmit(struct sk_buff *skb,
{
struct ip_tunnel *tunnel = netdev_priv(dev);
skb = handle_offloads(tunnel, skb);
skb = gre_handle_offloads(skb, !!(tunnel->parms.o_flags&TUNNEL_CSUM));
if (IS_ERR(skb))
goto out;
......@@ -708,9 +519,10 @@ static int ipgre_tunnel_init(struct net_device *dev)
return ip_tunnel_init(dev);
}
static const struct gre_protocol ipgre_protocol = {
static struct gre_cisco_protocol ipgre_protocol = {
.handler = ipgre_rcv,
.err_handler = ipgre_err,
.priority = 0,
};
static int __net_init ipgre_init_net(struct net *net)
......@@ -978,7 +790,7 @@ static int __init ipgre_init(void)
if (err < 0)
goto pnet_tap_faied;
err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO);
err = gre_cisco_register(&ipgre_protocol);
if (err < 0) {
pr_info("%s: can't add protocol\n", __func__);
goto add_proto_failed;
......@@ -997,7 +809,7 @@ static int __init ipgre_init(void)
tap_ops_failed:
rtnl_link_unregister(&ipgre_link_ops);
rtnl_link_failed:
gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO);
gre_cisco_unregister(&ipgre_protocol);
add_proto_failed:
unregister_pernet_device(&ipgre_tap_net_ops);
pnet_tap_faied:
......@@ -1009,8 +821,7 @@ static void __exit ipgre_fini(void)
{
rtnl_link_unregister(&ipgre_tap_ops);
rtnl_link_unregister(&ipgre_link_ops);
if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0)
pr_info("%s: can't remove protocol\n", __func__);
gre_cisco_unregister(&ipgre_protocol);
unregister_pernet_device(&ipgre_tap_net_ops);
unregister_pernet_device(&ipgre_net_ops);
}
......
......@@ -408,13 +408,6 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
const struct iphdr *iph = ip_hdr(skb);
int err;
secpath_reset(skb);
skb->protocol = tpi->proto;
skb->mac_header = skb->network_header;
__pskb_pull(skb, tunnel->hlen);
skb_postpull_rcsum(skb, skb_transport_header(skb), tunnel->hlen);
#ifdef CONFIG_NET_IPGRE_BROADCAST
if (ipv4_is_multicast(iph->daddr)) {
/* Looped back packet, drop it! */
......@@ -442,23 +435,6 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
tunnel->i_seqno = ntohl(tpi->seq) + 1;
}
/* Warning: All skb pointers will be invalidated! */
if (tunnel->dev->type == ARPHRD_ETHER) {
if (!pskb_may_pull(skb, ETH_HLEN)) {
tunnel->dev->stats.rx_length_errors++;
tunnel->dev->stats.rx_errors++;
goto drop;
}
iph = ip_hdr(skb);
skb->protocol = eth_type_trans(skb, tunnel->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
}
skb->pkt_type = PACKET_HOST;
__skb_tunnel_rx(skb, tunnel->dev);
skb_reset_network_header(skb);
err = IP_ECN_decapsulate(iph, skb);
if (unlikely(err)) {
if (log_ecn_error)
......@@ -477,6 +453,12 @@ int ip_tunnel_rcv(struct ip_tunnel *tunnel, struct sk_buff *skb,
tstats->rx_bytes += skb->len;
u64_stats_update_end(&tstats->syncp);
if (tunnel->dev->type == ARPHRD_ETHER) {
skb->protocol = eth_type_trans(skb, tunnel->dev);
skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
} else {
skb->dev = tunnel->dev;
}
gro_cells_receive(&tunnel->gro_cells, skb);
return 0;
......@@ -491,19 +473,17 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
{
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *inner_iph;
struct iphdr *iph;
struct flowi4 fl4;
u8 tos, ttl;
__be16 df;
struct rtable *rt; /* Route to the other host */
struct net_device *tdev; /* Device to other host */
unsigned int max_headroom; /* The extra header space needed */
__be32 dst;
int mtu;
int err;
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
dst = tnl_params->daddr;
if (dst == 0) {
/* NBMA tunnel */
......@@ -571,14 +551,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
dev->stats.tx_carrier_errors++;
goto tx_error;
}
tdev = rt->dst.dev;
if (tdev == dev) {
if (rt->dst.dev == dev) {
ip_rt_put(rt);
dev->stats.collisions++;
goto tx_error;
}
df = tnl_params->frag_off;
if (df)
......@@ -596,6 +573,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
if (!skb_is_gso(skb) &&
(inner_iph->frag_off&htons(IP_DF)) &&
mtu < ntohs(inner_iph->tot_len)) {
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
ip_rt_put(rt);
goto tx_error;
......@@ -646,7 +624,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
ttl = ip4_dst_hoplimit(&rt->dst);
}
max_headroom = LL_RESERVED_SPACE(tdev) + sizeof(struct iphdr)
max_headroom = LL_RESERVED_SPACE(rt->dst.dev) + sizeof(struct iphdr)
+ rt->dst.header_len;
if (max_headroom > dev->needed_headroom) {
dev->needed_headroom = max_headroom;
......@@ -657,27 +635,11 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
}
}
skb_dst_drop(skb);
skb_dst_set(skb, &rt->dst);
/* Push down and install the IP header. */
skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
err = iptunnel_xmit(dev_net(dev), rt, skb,
fl4.saddr, fl4.daddr, protocol,
ip_tunnel_ecn_encap(tos, inner_iph, skb), ttl, df);
iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
iph = ip_hdr(skb);
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
iph->version = 4;
iph->ihl = sizeof(struct iphdr) >> 2;
iph->frag_off = df;
iph->protocol = protocol;
iph->tos = ip_tunnel_ecn_encap(tos, inner_iph, skb);
iph->daddr = fl4.daddr;
iph->saddr = fl4.saddr;
iph->ttl = ttl;
tunnel_ip_select_ident(skb, inner_iph, &rt->dst);
iptunnel_xmit(skb, dev);
return;
#if IS_ENABLED(CONFIG_IPV6)
......
/*
* Copyright (c) 2013 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA
*/
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/if_arp.h>
#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
#include <linux/netfilter_ipv4.h>
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/if_vlan.h>
#include <net/ip.h>
#include <net/icmp.h>
#include <net/protocol.h>
#include <net/ip_tunnels.h>
#include <net/arp.h>
#include <net/checksum.h>
#include <net/dsfield.h>
#include <net/inet_ecn.h>
#include <net/xfrm.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/rtnetlink.h>
int iptunnel_xmit(struct net *net, struct rtable *rt,
struct sk_buff *skb,
__be32 src, __be32 dst, __u8 proto,
__u8 tos, __u8 ttl, __be16 df)
{
int pkt_len = skb->len;
struct iphdr *iph;
int err;
nf_reset(skb);
secpath_reset(skb);
skb->rxhash = 0;
skb_dst_drop(skb);
skb_dst_set(skb, &rt->dst);
memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
/* Push down and install the IP header. */
__skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = sizeof(struct iphdr) >> 2;
iph->frag_off = df;
iph->protocol = proto;
iph->tos = tos;
iph->daddr = dst;
iph->saddr = src;
iph->ttl = ttl;
tunnel_ip_select_ident(skb,
(const struct iphdr *)skb_inner_network_header(skb),
&rt->dst);
err = ip_local_out(skb);
if (unlikely(net_xmit_eval(err)))
pkt_len = 0;
return pkt_len;
}
EXPORT_SYMBOL_GPL(iptunnel_xmit);
int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
{
if (unlikely(!pskb_may_pull(skb, hdr_len)))
return -ENOMEM;
skb_pull_rcsum(skb, hdr_len);
if (inner_proto == htons(ETH_P_TEB)) {
struct ethhdr *eh = (struct ethhdr *)skb->data;
if (unlikely(!pskb_may_pull(skb, ETH_HLEN)))
return -ENOMEM;
if (likely(ntohs(eh->h_proto) >= ETH_P_802_3_MIN))
skb->protocol = eh->h_proto;
else
skb->protocol = htons(ETH_P_802_2);
} else {
skb->protocol = inner_proto;
}
nf_reset(skb);
secpath_reset(skb);
if (!skb->l4_rxhash)
skb->rxhash = 0;
skb_dst_drop(skb);
skb->vlan_tci = 0;
skb_set_queue_mapping(skb, 0);
skb->pkt_type = PACKET_HOST;
return 0;
}
EXPORT_SYMBOL_GPL(iptunnel_pull_header);
......@@ -188,8 +188,12 @@ static int ipip_rcv(struct sk_buff *skb)
struct net *net = dev_net(skb->dev);
struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
struct ip_tunnel *tunnel;
const struct iphdr *iph = ip_hdr(skb);
const struct iphdr *iph;
if (iptunnel_pull_header(skb, 0, tpi.proto))
goto drop;
iph = ip_hdr(skb);
tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
iph->saddr, iph->daddr, 0);
if (tunnel) {
......
......@@ -640,9 +640,14 @@ static const struct tnl_ptk_info tpi = {
static int ipip_rcv(struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
const struct iphdr *iph;
struct ip_tunnel *tunnel;
if (iptunnel_pull_header(skb, 0, tpi.proto))
goto drop;
iph = ip_hdr(skb);
tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev,
iph->saddr, iph->daddr);
if (tunnel != NULL) {
......@@ -723,13 +728,14 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
__be16 df = tiph->frag_off;
struct rtable *rt; /* Route to the other host */
struct net_device *tdev; /* Device to other host */
struct iphdr *iph; /* Our new IP header */
unsigned int max_headroom; /* The extra header space needed */
__be32 dst = tiph->daddr;
struct flowi4 fl4;
int mtu;
const struct in6_addr *addr6;
int addr_type;
u8 ttl;
int err;
if (skb->protocol != htons(ETH_P_IPV6))
goto tx_error;
......@@ -872,34 +878,14 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
skb = new_skb;
iph6 = ipv6_hdr(skb);
}
ttl = tiph->ttl;
if (ttl == 0)
ttl = iph6->hop_limit;
tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6));
skb->transport_header = skb->network_header;
skb_push(skb, sizeof(struct iphdr));
skb_reset_network_header(skb);
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
IPCB(skb)->flags = 0;
skb_dst_drop(skb);
skb_dst_set(skb, &rt->dst);
/*
* Push down and install the IPIP header.
*/
iph = ip_hdr(skb);
iph->version = 4;
iph->ihl = sizeof(struct iphdr)>>2;
iph->frag_off = df;
iph->protocol = IPPROTO_IPV6;
iph->tos = INET_ECN_encapsulate(tos, ipv6_get_dsfield(iph6));
iph->daddr = fl4.daddr;
iph->saddr = fl4.saddr;
if ((iph->ttl = tiph->ttl) == 0)
iph->ttl = iph6->hop_limit;
skb->ip_summed = CHECKSUM_NONE;
ip_select_ident(iph, skb_dst(skb), NULL);
iptunnel_xmit(skb, dev);
err = iptunnel_xmit(dev_net(dev), rt, skb, fl4.saddr, fl4.daddr,
IPPROTO_IPV6, tos, ttl, df);
iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
return NETDEV_TX_OK;
tx_error_icmp:
......
......@@ -19,6 +19,8 @@ config OPENVSWITCH
which is able to accept configuration from a variety of sources and
translate it into packet processing rules.
Open vSwitch GRE support depends on CONFIG_NET_IPGRE_DEMUX.
See http://openvswitch.org for more information and userspace
utilities.
......
......@@ -10,5 +10,6 @@ openvswitch-y := \
dp_notify.o \
flow.o \
vport.o \
vport-gre.o \
vport-internal_dev.o \
vport-netdev.o \
vport-netdev.o
......@@ -436,6 +436,10 @@ static int execute_set_action(struct sk_buff *skb,
skb->mark = nla_get_u32(nested_attr);
break;
case OVS_KEY_ATTR_IPV4_TUNNEL:
OVS_CB(skb)->tun_key = nla_data(nested_attr);
break;
case OVS_KEY_ATTR_ETHERNET:
err = set_eth_addr(skb, nla_data(nested_attr));
break;
......
......@@ -362,6 +362,14 @@ static int queue_gso_packets(struct net *net, int dp_ifindex,
static size_t key_attr_size(void)
{
return nla_total_size(4) /* OVS_KEY_ATTR_PRIORITY */
+ nla_total_size(0) /* OVS_KEY_ATTR_TUNNEL */
+ nla_total_size(8) /* OVS_TUNNEL_KEY_ATTR_ID */
+ nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_SRC */
+ nla_total_size(4) /* OVS_TUNNEL_KEY_ATTR_IPV4_DST */
+ nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TOS */
+ nla_total_size(1) /* OVS_TUNNEL_KEY_ATTR_TTL */
+ nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT */
+ nla_total_size(0) /* OVS_TUNNEL_KEY_ATTR_CSUM */
+ nla_total_size(4) /* OVS_KEY_ATTR_IN_PORT */
+ nla_total_size(4) /* OVS_KEY_ATTR_SKB_MARK */
+ nla_total_size(12) /* OVS_KEY_ATTR_ETHERNET */
......@@ -464,16 +472,89 @@ static int flush_flows(struct datapath *dp)
return 0;
}
static int validate_actions(const struct nlattr *attr,
const struct sw_flow_key *key, int depth);
static struct nlattr *reserve_sfa_size(struct sw_flow_actions **sfa, int attr_len)
{
struct sw_flow_actions *acts;
int new_acts_size;
int req_size = NLA_ALIGN(attr_len);
int next_offset = offsetof(struct sw_flow_actions, actions) +
(*sfa)->actions_len;
if (req_size <= (ksize(*sfa) - next_offset))
goto out;
new_acts_size = ksize(*sfa) * 2;
if (new_acts_size > MAX_ACTIONS_BUFSIZE) {
if ((MAX_ACTIONS_BUFSIZE - next_offset) < req_size)
return ERR_PTR(-EMSGSIZE);
new_acts_size = MAX_ACTIONS_BUFSIZE;
}
acts = ovs_flow_actions_alloc(new_acts_size);
if (IS_ERR(acts))
return (void *)acts;
memcpy(acts->actions, (*sfa)->actions, (*sfa)->actions_len);
acts->actions_len = (*sfa)->actions_len;
kfree(*sfa);
*sfa = acts;
static int validate_sample(const struct nlattr *attr,
const struct sw_flow_key *key, int depth)
out:
(*sfa)->actions_len += req_size;
return (struct nlattr *) ((unsigned char *)(*sfa) + next_offset);
}
static int add_action(struct sw_flow_actions **sfa, int attrtype, void *data, int len)
{
struct nlattr *a;
a = reserve_sfa_size(sfa, nla_attr_size(len));
if (IS_ERR(a))
return PTR_ERR(a);
a->nla_type = attrtype;
a->nla_len = nla_attr_size(len);
if (data)
memcpy(nla_data(a), data, len);
memset((unsigned char *) a + a->nla_len, 0, nla_padlen(len));
return 0;
}
static inline int add_nested_action_start(struct sw_flow_actions **sfa, int attrtype)
{
int used = (*sfa)->actions_len;
int err;
err = add_action(sfa, attrtype, NULL, 0);
if (err)
return err;
return used;
}
static inline void add_nested_action_end(struct sw_flow_actions *sfa, int st_offset)
{
struct nlattr *a = (struct nlattr *) ((unsigned char *)sfa->actions + st_offset);
a->nla_len = sfa->actions_len - st_offset;
}
static int validate_and_copy_actions(const struct nlattr *attr,
const struct sw_flow_key *key, int depth,
struct sw_flow_actions **sfa);
static int validate_and_copy_sample(const struct nlattr *attr,
const struct sw_flow_key *key, int depth,
struct sw_flow_actions **sfa)
{
const struct nlattr *attrs[OVS_SAMPLE_ATTR_MAX + 1];
const struct nlattr *probability, *actions;
const struct nlattr *a;
int rem;
int rem, start, err, st_acts;
memset(attrs, 0, sizeof(attrs));
nla_for_each_nested(a, attr, rem) {
......@@ -492,7 +573,26 @@ static int validate_sample(const struct nlattr *attr,
actions = attrs[OVS_SAMPLE_ATTR_ACTIONS];
if (!actions || (nla_len(actions) && nla_len(actions) < NLA_HDRLEN))
return -EINVAL;
return validate_actions(actions, key, depth + 1);
/* validation done, copy sample action. */
start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SAMPLE);
if (start < 0)
return start;
err = add_action(sfa, OVS_SAMPLE_ATTR_PROBABILITY, nla_data(probability), sizeof(u32));
if (err)
return err;
st_acts = add_nested_action_start(sfa, OVS_SAMPLE_ATTR_ACTIONS);
if (st_acts < 0)
return st_acts;
err = validate_and_copy_actions(actions, key, depth + 1, sfa);
if (err)
return err;
add_nested_action_end(*sfa, st_acts);
add_nested_action_end(*sfa, start);
return 0;
}
static int validate_tp_port(const struct sw_flow_key *flow_key)
......@@ -508,8 +608,30 @@ static int validate_tp_port(const struct sw_flow_key *flow_key)
return -EINVAL;
}
static int validate_and_copy_set_tun(const struct nlattr *attr,
struct sw_flow_actions **sfa)
{
struct ovs_key_ipv4_tunnel tun_key;
int err, start;
err = ovs_ipv4_tun_from_nlattr(nla_data(attr), &tun_key);
if (err)
return err;
start = add_nested_action_start(sfa, OVS_ACTION_ATTR_SET);
if (start < 0)
return start;
err = add_action(sfa, OVS_KEY_ATTR_IPV4_TUNNEL, &tun_key, sizeof(tun_key));
add_nested_action_end(*sfa, start);
return err;
}
static int validate_set(const struct nlattr *a,
const struct sw_flow_key *flow_key)
const struct sw_flow_key *flow_key,
struct sw_flow_actions **sfa,
bool *set_tun)
{
const struct nlattr *ovs_key = nla_data(a);
int key_type = nla_type(ovs_key);
......@@ -519,18 +641,27 @@ static int validate_set(const struct nlattr *a,
return -EINVAL;
if (key_type > OVS_KEY_ATTR_MAX ||
nla_len(ovs_key) != ovs_key_lens[key_type])
(ovs_key_lens[key_type] != nla_len(ovs_key) &&
ovs_key_lens[key_type] != -1))
return -EINVAL;
switch (key_type) {
const struct ovs_key_ipv4 *ipv4_key;
const struct ovs_key_ipv6 *ipv6_key;
int err;
case OVS_KEY_ATTR_PRIORITY:
case OVS_KEY_ATTR_SKB_MARK:
case OVS_KEY_ATTR_ETHERNET:
break;
case OVS_KEY_ATTR_TUNNEL:
*set_tun = true;
err = validate_and_copy_set_tun(a, sfa);
if (err)
return err;
break;
case OVS_KEY_ATTR_IPV4:
if (flow_key->eth.type != htons(ETH_P_IP))
return -EINVAL;
......@@ -606,8 +737,24 @@ static int validate_userspace(const struct nlattr *attr)
return 0;
}
static int validate_actions(const struct nlattr *attr,
const struct sw_flow_key *key, int depth)
static int copy_action(const struct nlattr *from,
struct sw_flow_actions **sfa)
{
int totlen = NLA_ALIGN(from->nla_len);
struct nlattr *to;
to = reserve_sfa_size(sfa, from->nla_len);
if (IS_ERR(to))
return PTR_ERR(to);
memcpy(to, from, totlen);
return 0;
}
static int validate_and_copy_actions(const struct nlattr *attr,
const struct sw_flow_key *key,
int depth,
struct sw_flow_actions **sfa)
{
const struct nlattr *a;
int rem, err;
......@@ -627,12 +774,14 @@ static int validate_actions(const struct nlattr *attr,
};
const struct ovs_action_push_vlan *vlan;
int type = nla_type(a);
bool skip_copy;
if (type > OVS_ACTION_ATTR_MAX ||
(action_lens[type] != nla_len(a) &&
action_lens[type] != (u32)-1))
return -EINVAL;
skip_copy = false;
switch (type) {
case OVS_ACTION_ATTR_UNSPEC:
return -EINVAL;
......@@ -661,20 +810,26 @@ static int validate_actions(const struct nlattr *attr,
break;
case OVS_ACTION_ATTR_SET:
err = validate_set(a, key);
err = validate_set(a, key, sfa, &skip_copy);
if (err)
return err;
break;
case OVS_ACTION_ATTR_SAMPLE:
err = validate_sample(a, key, depth);
err = validate_and_copy_sample(a, key, depth, sfa);
if (err)
return err;
skip_copy = true;
break;
default:
return -EINVAL;
}
if (!skip_copy) {
err = copy_action(a, sfa);
if (err)
return err;
}
}
if (rem > 0)
......@@ -739,21 +894,18 @@ static int ovs_packet_cmd_execute(struct sk_buff *skb, struct genl_info *info)
if (err)
goto err_flow_free;
err = ovs_flow_metadata_from_nlattrs(flow, a[OVS_PACKET_ATTR_KEY]);
if (err)
goto err_flow_free;
err = validate_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0);
err = ovs_flow_metadata_from_nlattrs(flow, key_len, a[OVS_PACKET_ATTR_KEY]);
if (err)
goto err_flow_free;
flow->hash = ovs_flow_hash(&flow->key, key_len);
acts = ovs_flow_actions_alloc(a[OVS_PACKET_ATTR_ACTIONS]);
acts = ovs_flow_actions_alloc(nla_len(a[OVS_PACKET_ATTR_ACTIONS]));
err = PTR_ERR(acts);
if (IS_ERR(acts))
goto err_flow_free;
err = validate_and_copy_actions(a[OVS_PACKET_ATTR_ACTIONS], &flow->key, 0, &acts);
rcu_assign_pointer(flow->sf_acts, acts);
if (err)
goto err_flow_free;
OVS_CB(packet)->flow = flow;
packet->priority = flow->key.phy.priority;
......@@ -843,6 +995,99 @@ static struct genl_multicast_group ovs_dp_flow_multicast_group = {
.name = OVS_FLOW_MCGROUP
};
static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb);
static int sample_action_to_attr(const struct nlattr *attr, struct sk_buff *skb)
{
const struct nlattr *a;
struct nlattr *start;
int err = 0, rem;
start = nla_nest_start(skb, OVS_ACTION_ATTR_SAMPLE);
if (!start)
return -EMSGSIZE;
nla_for_each_nested(a, attr, rem) {
int type = nla_type(a);
struct nlattr *st_sample;
switch (type) {
case OVS_SAMPLE_ATTR_PROBABILITY:
if (nla_put(skb, OVS_SAMPLE_ATTR_PROBABILITY, sizeof(u32), nla_data(a)))
return -EMSGSIZE;
break;
case OVS_SAMPLE_ATTR_ACTIONS:
st_sample = nla_nest_start(skb, OVS_SAMPLE_ATTR_ACTIONS);
if (!st_sample)
return -EMSGSIZE;
err = actions_to_attr(nla_data(a), nla_len(a), skb);
if (err)
return err;
nla_nest_end(skb, st_sample);
break;
}
}
nla_nest_end(skb, start);
return err;
}
static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
{
const struct nlattr *ovs_key = nla_data(a);
int key_type = nla_type(ovs_key);
struct nlattr *start;
int err;
switch (key_type) {
case OVS_KEY_ATTR_IPV4_TUNNEL:
start = nla_nest_start(skb, OVS_ACTION_ATTR_SET);
if (!start)
return -EMSGSIZE;
err = ovs_ipv4_tun_to_nlattr(skb, nla_data(ovs_key));
if (err)
return err;
nla_nest_end(skb, start);
break;
default:
if (nla_put(skb, OVS_ACTION_ATTR_SET, nla_len(a), ovs_key))
return -EMSGSIZE;
break;
}
return 0;
}
static int actions_to_attr(const struct nlattr *attr, int len, struct sk_buff *skb)
{
const struct nlattr *a;
int rem, err;
nla_for_each_attr(a, attr, len, rem) {
int type = nla_type(a);
switch (type) {
case OVS_ACTION_ATTR_SET:
err = set_action_to_attr(a, skb);
if (err)
return err;
break;
case OVS_ACTION_ATTR_SAMPLE:
err = sample_action_to_attr(a, skb);
if (err)
return err;
break;
default:
if (nla_put(skb, type, nla_len(a), nla_data(a)))
return -EMSGSIZE;
break;
}
}
return 0;
}
static size_t ovs_flow_cmd_msg_size(const struct sw_flow_actions *acts)
{
return NLMSG_ALIGN(sizeof(struct ovs_header))
......@@ -860,6 +1105,7 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
{
const int skb_orig_len = skb->len;
const struct sw_flow_actions *sf_acts;
struct nlattr *start;
struct ovs_flow_stats stats;
struct ovs_header *ovs_header;
struct nlattr *nla;
......@@ -913,11 +1159,20 @@ static int ovs_flow_cmd_fill_info(struct sw_flow *flow, struct datapath *dp,
* This can only fail for dump operations because the skb is always
* properly sized for single flows.
*/
err = nla_put(skb, OVS_FLOW_ATTR_ACTIONS, sf_acts->actions_len,
sf_acts->actions);
if (err < 0 && skb_orig_len)
start = nla_nest_start(skb, OVS_FLOW_ATTR_ACTIONS);
if (start) {
err = actions_to_attr(sf_acts->actions, sf_acts->actions_len, skb);
if (!err)
nla_nest_end(skb, start);
else {
if (skb_orig_len)
goto error;
nla_nest_cancel(skb, start);
}
} else if (skb_orig_len)
goto nla_put_failure;
return genlmsg_end(skb, ovs_header);
nla_put_failure:
......@@ -961,6 +1216,7 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
struct sk_buff *reply;
struct datapath *dp;
struct flow_table *table;
struct sw_flow_actions *acts = NULL;
int error;
int key_len;
......@@ -974,9 +1230,14 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
/* Validate actions. */
if (a[OVS_FLOW_ATTR_ACTIONS]) {
error = validate_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0);
if (error)
acts = ovs_flow_actions_alloc(nla_len(a[OVS_FLOW_ATTR_ACTIONS]));
error = PTR_ERR(acts);
if (IS_ERR(acts))
goto error;
error = validate_and_copy_actions(a[OVS_FLOW_ATTR_ACTIONS], &key, 0, &acts);
if (error)
goto err_kfree;
} else if (info->genlhdr->cmd == OVS_FLOW_CMD_NEW) {
error = -EINVAL;
goto error;
......@@ -991,8 +1252,6 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
table = ovsl_dereference(dp->table);
flow = ovs_flow_tbl_lookup(table, &key, key_len);
if (!flow) {
struct sw_flow_actions *acts;
/* Bail out if we're not allowed to create a new flow. */
error = -ENOENT;
if (info->genlhdr->cmd == OVS_FLOW_CMD_SET)
......@@ -1016,19 +1275,12 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
error = PTR_ERR(flow);
goto err_unlock_ovs;
}
flow->key = key;
clear_stats(flow);
/* Obtain actions. */
acts = ovs_flow_actions_alloc(a[OVS_FLOW_ATTR_ACTIONS]);
error = PTR_ERR(acts);
if (IS_ERR(acts))
goto error_free_flow;
rcu_assign_pointer(flow->sf_acts, acts);
/* Put flow in bucket. */
flow->hash = ovs_flow_hash(&key, key_len);
ovs_flow_tbl_insert(table, flow);
ovs_flow_tbl_insert(table, flow, &key, key_len);
reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid,
info->snd_seq,
......@@ -1036,7 +1288,6 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
} else {
/* We found a matching flow. */
struct sw_flow_actions *old_acts;
struct nlattr *acts_attrs;
/* Bail out if we're not allowed to modify an existing flow.
* We accept NLM_F_CREATE in place of the intended NLM_F_EXCL
......@@ -1051,21 +1302,8 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
/* Update actions. */
old_acts = ovsl_dereference(flow->sf_acts);
acts_attrs = a[OVS_FLOW_ATTR_ACTIONS];
if (acts_attrs &&
(old_acts->actions_len != nla_len(acts_attrs) ||
memcmp(old_acts->actions, nla_data(acts_attrs),
old_acts->actions_len))) {
struct sw_flow_actions *new_acts;
new_acts = ovs_flow_actions_alloc(acts_attrs);
error = PTR_ERR(new_acts);
if (IS_ERR(new_acts))
goto err_unlock_ovs;
rcu_assign_pointer(flow->sf_acts, new_acts);
rcu_assign_pointer(flow->sf_acts, acts);
ovs_flow_deferred_free_acts(old_acts);
}
reply = ovs_flow_cmd_build_info(flow, dp, info->snd_portid,
info->snd_seq, OVS_FLOW_CMD_NEW);
......@@ -1086,10 +1324,10 @@ static int ovs_flow_cmd_new_or_set(struct sk_buff *skb, struct genl_info *info)
ovs_dp_flow_multicast_group.id, PTR_ERR(reply));
return 0;
error_free_flow:
ovs_flow_free(flow);
err_unlock_ovs:
ovs_unlock();
err_kfree:
kfree(acts);
error:
return error;
}
......@@ -1866,8 +2104,8 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
goto exit_unlock;
}
reply = ovs_vport_cmd_build_info(vport, info->snd_portid, info->snd_seq,
OVS_VPORT_CMD_DEL);
reply = ovs_vport_cmd_build_info(vport, info->snd_portid,
info->snd_seq, OVS_VPORT_CMD_DEL);
err = PTR_ERR(reply);
if (IS_ERR(reply))
goto exit_unlock;
......@@ -1896,8 +2134,8 @@ static int ovs_vport_cmd_get(struct sk_buff *skb, struct genl_info *info)
if (IS_ERR(vport))
goto exit_unlock;
reply = ovs_vport_cmd_build_info(vport, info->snd_portid, info->snd_seq,
OVS_VPORT_CMD_NEW);
reply = ovs_vport_cmd_build_info(vport, info->snd_portid,
info->snd_seq, OVS_VPORT_CMD_NEW);
err = PTR_ERR(reply);
if (IS_ERR(reply))
goto exit_unlock;
......
......@@ -88,9 +88,12 @@ struct datapath {
/**
* struct ovs_skb_cb - OVS data in skb CB
* @flow: The flow associated with this packet. May be %NULL if no flow.
* @tun_key: Key for the tunnel that encapsulated this packet. NULL if the
* packet is not being tunneled.
*/
struct ovs_skb_cb {
struct sw_flow *flow;
struct ovs_key_ipv4_tunnel *tun_key;
};
#define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
......@@ -119,6 +122,7 @@ struct dp_upcall_info {
struct ovs_net {
struct list_head dps;
struct work_struct dp_notify_work;
struct vport_net vport_net;
};
extern int ovs_net_id;
......
......@@ -40,6 +40,7 @@
#include <linux/icmpv6.h>
#include <linux/rculist.h>
#include <net/ip.h>
#include <net/ip_tunnels.h>
#include <net/ipv6.h>
#include <net/ndisc.h>
......@@ -198,20 +199,18 @@ void ovs_flow_used(struct sw_flow *flow, struct sk_buff *skb)
spin_unlock(&flow->lock);
}
struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *actions)
struct sw_flow_actions *ovs_flow_actions_alloc(int size)
{
int actions_len = nla_len(actions);
struct sw_flow_actions *sfa;
if (actions_len > MAX_ACTIONS_BUFSIZE)
if (size > MAX_ACTIONS_BUFSIZE)
return ERR_PTR(-EINVAL);
sfa = kmalloc(sizeof(*sfa) + actions_len, GFP_KERNEL);
sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL);
if (!sfa)
return ERR_PTR(-ENOMEM);
sfa->actions_len = actions_len;
nla_memcpy(sfa->actions, actions, actions_len);
sfa->actions_len = 0;
return sfa;
}
......@@ -354,6 +353,14 @@ struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *la
return NULL;
}
static void __flow_tbl_insert(struct flow_table *table, struct sw_flow *flow)
{
struct hlist_head *head;
head = find_bucket(table, flow->hash);
hlist_add_head_rcu(&flow->hash_node[table->node_ver], head);
table->count++;
}
static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new)
{
int old_ver;
......@@ -370,7 +377,7 @@ static void flow_table_copy_flows(struct flow_table *old, struct flow_table *new
head = flex_array_get(old->buckets, i);
hlist_for_each_entry(flow, head, hash_node[old_ver])
ovs_flow_tbl_insert(new, flow);
__flow_tbl_insert(new, flow);
}
old->keep_flows = true;
}
......@@ -605,6 +612,8 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
memset(key, 0, sizeof(*key));
key->phy.priority = skb->priority;
if (OVS_CB(skb)->tun_key)
memcpy(&key->tun_key, OVS_CB(skb)->tun_key, sizeof(key->tun_key));
key->phy.in_port = in_port;
key->phy.skb_mark = skb->mark;
......@@ -762,9 +771,18 @@ int ovs_flow_extract(struct sk_buff *skb, u16 in_port, struct sw_flow_key *key,
return error;
}
u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len)
static u32 ovs_flow_hash(const struct sw_flow_key *key, int key_start, int key_len)
{
return jhash2((u32 *)key, DIV_ROUND_UP(key_len, sizeof(u32)), 0);
return jhash2((u32 *)((u8 *)key + key_start),
DIV_ROUND_UP(key_len - key_start, sizeof(u32)), 0);
}
static int flow_key_start(struct sw_flow_key *key)
{
if (key->tun_key.ipv4_dst)
return 0;
else
return offsetof(struct sw_flow_key, phy);
}
struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
......@@ -772,28 +790,31 @@ struct sw_flow *ovs_flow_tbl_lookup(struct flow_table *table,
{
struct sw_flow *flow;
struct hlist_head *head;
u8 *_key;
int key_start;
u32 hash;
hash = ovs_flow_hash(key, key_len);
key_start = flow_key_start(key);
hash = ovs_flow_hash(key, key_start, key_len);
_key = (u8 *) key + key_start;
head = find_bucket(table, hash);
hlist_for_each_entry_rcu(flow, head, hash_node[table->node_ver]) {
if (flow->hash == hash &&
!memcmp(&flow->key, key, key_len)) {
!memcmp((u8 *)&flow->key + key_start, _key, key_len - key_start)) {
return flow;
}
}
return NULL;
}
void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow)
void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
struct sw_flow_key *key, int key_len)
{
struct hlist_head *head;
head = find_bucket(table, flow->hash);
hlist_add_head_rcu(&flow->hash_node[table->node_ver], head);
table->count++;
flow->hash = ovs_flow_hash(key, flow_key_start(key), key_len);
memcpy(&flow->key, key, sizeof(flow->key));
__flow_tbl_insert(table, flow);
}
void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow)
......@@ -820,6 +841,7 @@ const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1] = {
[OVS_KEY_ATTR_ICMPV6] = sizeof(struct ovs_key_icmpv6),
[OVS_KEY_ATTR_ARP] = sizeof(struct ovs_key_arp),
[OVS_KEY_ATTR_ND] = sizeof(struct ovs_key_nd),
[OVS_KEY_ATTR_TUNNEL] = -1,
};
static int ipv4_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_len,
......@@ -957,6 +979,105 @@ static int parse_flow_nlattrs(const struct nlattr *attr,
return 0;
}
int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr,
struct ovs_key_ipv4_tunnel *tun_key)
{
struct nlattr *a;
int rem;
bool ttl = false;
memset(tun_key, 0, sizeof(*tun_key));
nla_for_each_nested(a, attr, rem) {
int type = nla_type(a);
static const u32 ovs_tunnel_key_lens[OVS_TUNNEL_KEY_ATTR_MAX + 1] = {
[OVS_TUNNEL_KEY_ATTR_ID] = sizeof(u64),
[OVS_TUNNEL_KEY_ATTR_IPV4_SRC] = sizeof(u32),
[OVS_TUNNEL_KEY_ATTR_IPV4_DST] = sizeof(u32),
[OVS_TUNNEL_KEY_ATTR_TOS] = 1,
[OVS_TUNNEL_KEY_ATTR_TTL] = 1,
[OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT] = 0,
[OVS_TUNNEL_KEY_ATTR_CSUM] = 0,
};
if (type > OVS_TUNNEL_KEY_ATTR_MAX ||
ovs_tunnel_key_lens[type] != nla_len(a))
return -EINVAL;
switch (type) {
case OVS_TUNNEL_KEY_ATTR_ID:
tun_key->tun_id = nla_get_be64(a);
tun_key->tun_flags |= TUNNEL_KEY;
break;
case OVS_TUNNEL_KEY_ATTR_IPV4_SRC:
tun_key->ipv4_src = nla_get_be32(a);
break;
case OVS_TUNNEL_KEY_ATTR_IPV4_DST:
tun_key->ipv4_dst = nla_get_be32(a);
break;
case OVS_TUNNEL_KEY_ATTR_TOS:
tun_key->ipv4_tos = nla_get_u8(a);
break;
case OVS_TUNNEL_KEY_ATTR_TTL:
tun_key->ipv4_ttl = nla_get_u8(a);
ttl = true;
break;
case OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT:
tun_key->tun_flags |= TUNNEL_DONT_FRAGMENT;
break;
case OVS_TUNNEL_KEY_ATTR_CSUM:
tun_key->tun_flags |= TUNNEL_CSUM;
break;
default:
return -EINVAL;
}
}
if (rem > 0)
return -EINVAL;
if (!tun_key->ipv4_dst)
return -EINVAL;
if (!ttl)
return -EINVAL;
return 0;
}
int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
const struct ovs_key_ipv4_tunnel *tun_key)
{
struct nlattr *nla;
nla = nla_nest_start(skb, OVS_KEY_ATTR_TUNNEL);
if (!nla)
return -EMSGSIZE;
if (tun_key->tun_flags & TUNNEL_KEY &&
nla_put_be64(skb, OVS_TUNNEL_KEY_ATTR_ID, tun_key->tun_id))
return -EMSGSIZE;
if (tun_key->ipv4_src &&
nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_SRC, tun_key->ipv4_src))
return -EMSGSIZE;
if (nla_put_be32(skb, OVS_TUNNEL_KEY_ATTR_IPV4_DST, tun_key->ipv4_dst))
return -EMSGSIZE;
if (tun_key->ipv4_tos &&
nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TOS, tun_key->ipv4_tos))
return -EMSGSIZE;
if (nla_put_u8(skb, OVS_TUNNEL_KEY_ATTR_TTL, tun_key->ipv4_ttl))
return -EMSGSIZE;
if ((tun_key->tun_flags & TUNNEL_DONT_FRAGMENT) &&
nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_DONT_FRAGMENT))
return -EMSGSIZE;
if ((tun_key->tun_flags & TUNNEL_CSUM) &&
nla_put_flag(skb, OVS_TUNNEL_KEY_ATTR_CSUM))
return -EMSGSIZE;
nla_nest_end(skb, nla);
return 0;
}
/**
* ovs_flow_from_nlattrs - parses Netlink attributes into a flow key.
* @swkey: receives the extracted flow key.
......@@ -999,6 +1120,14 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
attrs &= ~(1 << OVS_KEY_ATTR_SKB_MARK);
}
if (attrs & (1 << OVS_KEY_ATTR_TUNNEL)) {
err = ovs_ipv4_tun_from_nlattr(a[OVS_KEY_ATTR_TUNNEL], &swkey->tun_key);
if (err)
return err;
attrs &= ~(1 << OVS_KEY_ATTR_TUNNEL);
}
/* Data attributes. */
if (!(attrs & (1 << OVS_KEY_ATTR_ETHERNET)))
return -EINVAL;
......@@ -1126,6 +1255,7 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
/**
* ovs_flow_metadata_from_nlattrs - parses Netlink attributes into a flow key.
* @flow: Receives extracted in_port, priority, tun_key and skb_mark.
* @key_len: Length of key in @flow. Used for calculating flow hash.
* @attr: Netlink attribute holding nested %OVS_KEY_ATTR_* Netlink attribute
* sequence.
*
......@@ -1134,20 +1264,24 @@ int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
* get the metadata, that is, the parts of the flow key that cannot be
* extracted from the packet itself.
*/
int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len,
const struct nlattr *attr)
{
struct ovs_key_ipv4_tunnel *tun_key = &flow->key.tun_key;
const struct nlattr *nla;
int rem;
flow->key.phy.in_port = DP_MAX_PORTS;
flow->key.phy.priority = 0;
flow->key.phy.skb_mark = 0;
memset(tun_key, 0, sizeof(flow->key.tun_key));
nla_for_each_nested(nla, attr, rem) {
int type = nla_type(nla);
if (type <= OVS_KEY_ATTR_MAX && ovs_key_lens[type] > 0) {
int err;
if (nla_len(nla) != ovs_key_lens[type])
return -EINVAL;
......@@ -1156,6 +1290,12 @@ int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
flow->key.phy.priority = nla_get_u32(nla);
break;
case OVS_KEY_ATTR_TUNNEL:
err = ovs_ipv4_tun_from_nlattr(nla, tun_key);
if (err)
return err;
break;
case OVS_KEY_ATTR_IN_PORT:
if (nla_get_u32(nla) >= DP_MAX_PORTS)
return -EINVAL;
......@@ -1170,6 +1310,10 @@ int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
}
if (rem)
return -EINVAL;
flow->hash = ovs_flow_hash(&flow->key,
flow_key_start(&flow->key), key_len);
return 0;
}
......@@ -1182,6 +1326,10 @@ int ovs_flow_to_nlattrs(const struct sw_flow_key *swkey, struct sk_buff *skb)
nla_put_u32(skb, OVS_KEY_ATTR_PRIORITY, swkey->phy.priority))
goto nla_put_failure;
if (swkey->tun_key.ipv4_dst &&
ovs_ipv4_tun_to_nlattr(skb, &swkey->tun_key))
goto nla_put_failure;
if (swkey->phy.in_port != DP_MAX_PORTS &&
nla_put_u32(skb, OVS_KEY_ATTR_IN_PORT, swkey->phy.in_port))
goto nla_put_failure;
......
......@@ -40,7 +40,38 @@ struct sw_flow_actions {
struct nlattr actions[];
};
/* Used to memset ovs_key_ipv4_tunnel padding. */
#define OVS_TUNNEL_KEY_SIZE \
(offsetof(struct ovs_key_ipv4_tunnel, ipv4_ttl) + \
FIELD_SIZEOF(struct ovs_key_ipv4_tunnel, ipv4_ttl))
struct ovs_key_ipv4_tunnel {
__be64 tun_id;
__be32 ipv4_src;
__be32 ipv4_dst;
__be16 tun_flags;
u8 ipv4_tos;
u8 ipv4_ttl;
};
static inline void ovs_flow_tun_key_init(struct ovs_key_ipv4_tunnel *tun_key,
const struct iphdr *iph, __be64 tun_id,
__be16 tun_flags)
{
tun_key->tun_id = tun_id;
tun_key->ipv4_src = iph->saddr;
tun_key->ipv4_dst = iph->daddr;
tun_key->ipv4_tos = iph->tos;
tun_key->ipv4_ttl = iph->ttl;
tun_key->tun_flags = tun_flags;
/* clear struct padding. */
memset((unsigned char *) tun_key + OVS_TUNNEL_KEY_SIZE, 0,
sizeof(*tun_key) - OVS_TUNNEL_KEY_SIZE);
}
struct sw_flow_key {
struct ovs_key_ipv4_tunnel tun_key; /* Encapsulating tunnel key. */
struct {
u32 priority; /* Packet QoS priority. */
u32 skb_mark; /* SKB mark. */
......@@ -130,7 +161,7 @@ struct sw_flow *ovs_flow_alloc(void);
void ovs_flow_deferred_free(struct sw_flow *);
void ovs_flow_free(struct sw_flow *flow);
struct sw_flow_actions *ovs_flow_actions_alloc(const struct nlattr *);
struct sw_flow_actions *ovs_flow_actions_alloc(int actions_len);
void ovs_flow_deferred_free_acts(struct sw_flow_actions *);
int ovs_flow_extract(struct sk_buff *, u16 in_port, struct sw_flow_key *,
......@@ -141,10 +172,10 @@ u64 ovs_flow_used_time(unsigned long flow_jiffies);
int ovs_flow_to_nlattrs(const struct sw_flow_key *, struct sk_buff *);
int ovs_flow_from_nlattrs(struct sw_flow_key *swkey, int *key_lenp,
const struct nlattr *);
int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow,
int ovs_flow_metadata_from_nlattrs(struct sw_flow *flow, int key_len,
const struct nlattr *attr);
#define MAX_ACTIONS_BUFSIZE (16 * 1024)
#define MAX_ACTIONS_BUFSIZE (32 * 1024)
#define TBL_MIN_BUCKETS 1024
struct flow_table {
......@@ -173,11 +204,15 @@ void ovs_flow_tbl_deferred_destroy(struct flow_table *table);
struct flow_table *ovs_flow_tbl_alloc(int new_size);
struct flow_table *ovs_flow_tbl_expand(struct flow_table *table);
struct flow_table *ovs_flow_tbl_rehash(struct flow_table *table);
void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow);
void ovs_flow_tbl_insert(struct flow_table *table, struct sw_flow *flow,
struct sw_flow_key *key, int key_len);
void ovs_flow_tbl_remove(struct flow_table *table, struct sw_flow *flow);
u32 ovs_flow_hash(const struct sw_flow_key *key, int key_len);
struct sw_flow *ovs_flow_tbl_next(struct flow_table *table, u32 *bucket, u32 *idx);
extern const int ovs_key_lens[OVS_KEY_ATTR_MAX + 1];
int ovs_ipv4_tun_from_nlattr(const struct nlattr *attr,
struct ovs_key_ipv4_tunnel *tun_key);
int ovs_ipv4_tun_to_nlattr(struct sk_buff *skb,
const struct ovs_key_ipv4_tunnel *tun_key);
#endif /* flow.h */
/*
* Copyright (c) 2007-2013 Nicira, Inc.
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
* 02110-1301, USA
*/
#ifdef CONFIG_NET_IPGRE_DEMUX
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/if.h>
#include <linux/skbuff.h>
#include <linux/ip.h>
#include <linux/if_tunnel.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/if_vlan.h>
#include <linux/in.h>
#include <linux/in_route.h>
#include <linux/inetdevice.h>
#include <linux/jhash.h>
#include <linux/list.h>
#include <linux/kernel.h>
#include <linux/workqueue.h>
#include <linux/rculist.h>
#include <net/route.h>
#include <net/xfrm.h>
#include <net/icmp.h>
#include <net/ip.h>
#include <net/ip_tunnels.h>
#include <net/gre.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/protocol.h>
#include "datapath.h"
#include "vport.h"
/* Returns the least-significant 32 bits of a __be64. */
static __be32 be64_get_low32(__be64 x)
{
#ifdef __BIG_ENDIAN
return (__force __be32)x;
#else
return (__force __be32)((__force u64)x >> 32);
#endif
}
static __be16 filter_tnl_flags(__be16 flags)
{
return flags & (TUNNEL_CSUM | TUNNEL_KEY);
}
static struct sk_buff *__build_header(struct sk_buff *skb,
int tunnel_hlen)
{
const struct ovs_key_ipv4_tunnel *tun_key = OVS_CB(skb)->tun_key;
struct tnl_ptk_info tpi;
skb = gre_handle_offloads(skb, !!(tun_key->tun_flags & TUNNEL_CSUM));
if (IS_ERR(skb))
return NULL;
tpi.flags = filter_tnl_flags(tun_key->tun_flags);
tpi.proto = htons(ETH_P_TEB);
tpi.key = be64_get_low32(tun_key->tun_id);
tpi.seq = 0;
gre_build_header(skb, &tpi, tunnel_hlen);
return skb;
}
static __be64 key_to_tunnel_id(__be32 key, __be32 seq)
{
#ifdef __BIG_ENDIAN
return (__force __be64)((__force u64)seq << 32 | (__force u32)key);
#else
return (__force __be64)((__force u64)key << 32 | (__force u32)seq);
#endif
}
/* Called with rcu_read_lock and BH disabled. */
static int gre_rcv(struct sk_buff *skb,
const struct tnl_ptk_info *tpi)
{
struct ovs_key_ipv4_tunnel tun_key;
struct ovs_net *ovs_net;
struct vport *vport;
__be64 key;
ovs_net = net_generic(dev_net(skb->dev), ovs_net_id);
vport = rcu_dereference(ovs_net->vport_net.gre_vport);
if (unlikely(!vport))
return PACKET_REJECT;
key = key_to_tunnel_id(tpi->key, tpi->seq);
ovs_flow_tun_key_init(&tun_key, ip_hdr(skb), key,
filter_tnl_flags(tpi->flags));
ovs_vport_receive(vport, skb, &tun_key);
return PACKET_RCVD;
}
static int gre_tnl_send(struct vport *vport, struct sk_buff *skb)
{
struct net *net = ovs_dp_get_net(vport->dp);
struct flowi4 fl;
struct rtable *rt;
int min_headroom;
int tunnel_hlen;
__be16 df;
int err;
if (unlikely(!OVS_CB(skb)->tun_key)) {
err = -EINVAL;
goto error;
}
/* Route lookup */
memset(&fl, 0, sizeof(fl));
fl.daddr = OVS_CB(skb)->tun_key->ipv4_dst;
fl.saddr = OVS_CB(skb)->tun_key->ipv4_src;
fl.flowi4_tos = RT_TOS(OVS_CB(skb)->tun_key->ipv4_tos);
fl.flowi4_mark = skb->mark;
fl.flowi4_proto = IPPROTO_GRE;
rt = ip_route_output_key(net, &fl);
if (IS_ERR(rt))
return PTR_ERR(rt);
tunnel_hlen = ip_gre_calc_hlen(OVS_CB(skb)->tun_key->tun_flags);
min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len
+ tunnel_hlen + sizeof(struct iphdr)
+ (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0);
if (skb_headroom(skb) < min_headroom || skb_header_cloned(skb)) {
int head_delta = SKB_DATA_ALIGN(min_headroom -
skb_headroom(skb) +
16);
err = pskb_expand_head(skb, max_t(int, head_delta, 0),
0, GFP_ATOMIC);
if (unlikely(err))
goto err_free_rt;
}
if (vlan_tx_tag_present(skb)) {
if (unlikely(!__vlan_put_tag(skb,
skb->vlan_proto,
vlan_tx_tag_get(skb)))) {
err = -ENOMEM;
goto err_free_rt;
}
skb->vlan_tci = 0;
}
/* Push Tunnel header. */
skb = __build_header(skb, tunnel_hlen);
if (unlikely(!skb)) {
err = 0;
goto err_free_rt;
}
df = OVS_CB(skb)->tun_key->tun_flags & TUNNEL_DONT_FRAGMENT ?
htons(IP_DF) : 0;
skb->local_df = 1;
return iptunnel_xmit(net, rt, skb, fl.saddr,
OVS_CB(skb)->tun_key->ipv4_dst, IPPROTO_GRE,
OVS_CB(skb)->tun_key->ipv4_tos,
OVS_CB(skb)->tun_key->ipv4_ttl, df);
err_free_rt:
ip_rt_put(rt);
error:
return err;
}
static struct gre_cisco_protocol gre_protocol = {
.handler = gre_rcv,
.priority = 1,
};
static int gre_ports;
static int gre_init(void)
{
int err;
gre_ports++;
if (gre_ports > 1)
return 0;
err = gre_cisco_register(&gre_protocol);
if (err)
pr_warn("cannot register gre protocol handler\n");
return err;
}
static void gre_exit(void)
{
gre_ports--;
if (gre_ports > 0)
return;
gre_cisco_unregister(&gre_protocol);
}
static const char *gre_get_name(const struct vport *vport)
{
return vport_priv(vport);
}
static struct vport *gre_create(const struct vport_parms *parms)
{
struct net *net = ovs_dp_get_net(parms->dp);
struct ovs_net *ovs_net;
struct vport *vport;
int err;
err = gre_init();
if (err)
return ERR_PTR(err);
ovs_net = net_generic(net, ovs_net_id);
if (ovsl_dereference(ovs_net->vport_net.gre_vport)) {
vport = ERR_PTR(-EEXIST);
goto error;
}
vport = ovs_vport_alloc(IFNAMSIZ, &ovs_gre_vport_ops, parms);
if (IS_ERR(vport))
goto error;
strncpy(vport_priv(vport), parms->name, IFNAMSIZ);
rcu_assign_pointer(ovs_net->vport_net.gre_vport, vport);
return vport;
error:
gre_exit();
return vport;
}
static void gre_tnl_destroy(struct vport *vport)
{
struct net *net = ovs_dp_get_net(vport->dp);
struct ovs_net *ovs_net;
ovs_net = net_generic(net, ovs_net_id);
rcu_assign_pointer(ovs_net->vport_net.gre_vport, NULL);
ovs_vport_deferred_free(vport);
gre_exit();
}
const struct vport_ops ovs_gre_vport_ops = {
.type = OVS_VPORT_TYPE_GRE,
.create = gre_create,
.destroy = gre_tnl_destroy,
.get_name = gre_get_name,
.send = gre_tnl_send,
};
#endif
......@@ -67,7 +67,7 @@ static struct rtnl_link_stats64 *internal_dev_get_stats(struct net_device *netde
static int internal_dev_xmit(struct sk_buff *skb, struct net_device *netdev)
{
rcu_read_lock();
ovs_vport_receive(internal_dev_priv(netdev)->vport, skb);
ovs_vport_receive(internal_dev_priv(netdev)->vport, skb, NULL);
rcu_read_unlock();
return 0;
}
......
......@@ -51,7 +51,7 @@ static void netdev_port_receive(struct vport *vport, struct sk_buff *skb)
skb_push(skb, ETH_HLEN);
ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
ovs_vport_receive(vport, skb);
ovs_vport_receive(vport, skb, NULL);
return;
error:
......
......@@ -38,6 +38,10 @@
static const struct vport_ops *vport_ops_list[] = {
&ovs_netdev_vport_ops,
&ovs_internal_vport_ops,
#ifdef CONFIG_NET_IPGRE_DEMUX
&ovs_gre_vport_ops,
#endif
};
/* Protected by RCU read lock for reading, ovs_mutex for writing. */
......@@ -325,7 +329,8 @@ int ovs_vport_get_options(const struct vport *vport, struct sk_buff *skb)
* Must be called with rcu_read_lock. The packet cannot be shared and
* skb->data should point to the Ethernet header.
*/
void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
void ovs_vport_receive(struct vport *vport, struct sk_buff *skb,
struct ovs_key_ipv4_tunnel *tun_key)
{
struct pcpu_tstats *stats;
......@@ -335,6 +340,7 @@ void ovs_vport_receive(struct vport *vport, struct sk_buff *skb)
stats->rx_bytes += skb->len;
u64_stats_update_end(&stats->syncp);
OVS_CB(skb)->tun_key = tun_key;
ovs_dp_process_received_packet(vport, skb);
}
......@@ -402,3 +408,18 @@ void ovs_vport_record_error(struct vport *vport, enum vport_err_type err_type)
spin_unlock(&vport->stats_lock);
}
static void free_vport_rcu(struct rcu_head *rcu)
{
struct vport *vport = container_of(rcu, struct vport, rcu);
ovs_vport_free(vport);
}
void ovs_vport_deferred_free(struct vport *vport)
{
if (!vport)
return;
call_rcu(&vport->rcu, free_vport_rcu);
}
......@@ -34,6 +34,11 @@ struct vport_parms;
/* The following definitions are for users of the vport subsytem: */
/* The following definitions are for users of the vport subsytem: */
struct vport_net {
struct vport __rcu *gre_vport;
};
int ovs_vport_init(void);
void ovs_vport_exit(void);
......@@ -152,6 +157,7 @@ enum vport_err_type {
struct vport *ovs_vport_alloc(int priv_size, const struct vport_ops *,
const struct vport_parms *);
void ovs_vport_free(struct vport *);
void ovs_vport_deferred_free(struct vport *vport);
#define VPORT_ALIGN 8
......@@ -184,13 +190,15 @@ static inline struct vport *vport_from_priv(const void *priv)
return (struct vport *)(priv - ALIGN(sizeof(struct vport), VPORT_ALIGN));
}
void ovs_vport_receive(struct vport *, struct sk_buff *);
void ovs_vport_receive(struct vport *, struct sk_buff *,
struct ovs_key_ipv4_tunnel *);
void ovs_vport_record_error(struct vport *, enum vport_err_type err_type);
/* List of statically compiled vport implementations. Don't forget to also
* add yours to the list at the top of vport.c. */
extern const struct vport_ops ovs_netdev_vport_ops;
extern const struct vport_ops ovs_internal_vport_ops;
extern const struct vport_ops ovs_gre_vport_ops;
static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
const void *start, unsigned int len)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment