Commit 4d0312e0 authored by David S. Miller

Merge branch 'net-Add-address-attribute-to-control-metric-of-prefix-route'

David Ahern says:

====================
net: Add address attribute to control metric of prefix route

For use cases such as VRR (Virtual Router Redundancy), interface managers
want efficient control over the order of prefix routes when multiple
interfaces have addresses with overlapping/duplicate subnets.

Currently, if two interfaces have addresses in the same subnet, the order
of the prefix route entries is determined by the order in which the
addresses are assigned or the links brought up. Any actions like cycling
an interface up and down changes that order. This set adds a new attribute
for addresses to allow a user to specify the metric of the prefix route
associated with an address giving interface managers better and more
efficient control of the order of prefix routes.

Patches 1-3 refactor IPv6 address add functions to pass an ifa6_config
struct. The functions currently have a long list of arguments and adding
the metric just makes it worse. Because of the overall diff size in
moving the arguments to a struct, the change is done in stages to make
it easier to review starting with the bottom function and pushing the
struct up to callers in each successive patch.

Patch 4 introduces the new attribute.

Patches 5 and 6 add support for the new attribute to IPv4 and IPv6
addresses.

Patch 7 adds a set of test cases.

Patch 8 adds support to iproute2.

Changes since RFC
- collapsed patches 1 and 3 into patch 2
- simplified stack variables in fib_modify_prefix_metric in patch 5
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 49fb6fe3 d69faad7
......@@ -139,6 +139,7 @@ struct in_ifaddr {
__be32 ifa_local;
__be32 ifa_address;
__be32 ifa_mask;
__u32 ifa_rt_priority;
__be32 ifa_broadcast;
unsigned char ifa_scope;
unsigned char ifa_prefixlen;
......
......@@ -59,6 +59,19 @@ struct in6_validator_info {
struct netlink_ext_ack *extack;
};
/*
 * Bundle of the parameters that describe one IPv6 address configuration
 * request, so they can be handed around as a single pointer instead of a
 * long argument list.
 */
struct ifa6_config {
/* NOTE(review): presumably the address/prefix being configured -- confirm */
const struct in6_addr *pfx;
/* NOTE(review): presumably the prefix length that goes with pfx -- confirm */
unsigned int plen;
/* NOTE(review): presumably the peer address, if any -- confirm NULL rules */
const struct in6_addr *peer_pfx;
/* NOTE(review): looks like the metric for the address's prefix route
 * (cf. IFA_RT_PRIORITY); meaning of 0 is not visible here -- confirm
 */
u32 rt_priority;
/* NOTE(review): presumably IFA_F_* flags -- confirm */
u32 ifa_flags;
/* NOTE(review): lifetimes; units are not visible here -- confirm */
u32 preferred_lft;
u32 valid_lft;
u16 scope;
};
int addrconf_init(void);
void addrconf_cleanup(void);
......
......@@ -42,6 +42,7 @@ enum {
struct inet6_ifaddr {
struct in6_addr addr;
__u32 prefix_len;
__u32 rt_priority;
/* In seconds, relative to tstamp. Expiry is at tstamp + HZ * lft. */
__u32 valid_lft;
......
......@@ -225,6 +225,7 @@ struct rtable *rt_dst_alloc(struct net_device *dev,
struct in_ifaddr;
void fib_add_ifaddr(struct in_ifaddr *);
void fib_del_ifaddr(struct in_ifaddr *, struct in_ifaddr *);
void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric);
void rt_add_uncached_list(struct rtable *rt);
void rt_del_uncached_list(struct rtable *rt);
......
......@@ -33,6 +33,7 @@ enum {
IFA_CACHEINFO,
IFA_MULTICAST,
IFA_FLAGS,
IFA_RT_PRIORITY, /* u32, priority/metric for prefix route */
__IFA_MAX,
};
......
......@@ -99,6 +99,7 @@ static const struct nla_policy ifa_ipv4_policy[IFA_MAX+1] = {
[IFA_LABEL] = { .type = NLA_STRING, .len = IFNAMSIZ - 1 },
[IFA_CACHEINFO] = { .len = sizeof(struct ifa_cacheinfo) },
[IFA_FLAGS] = { .type = NLA_U32 },
[IFA_RT_PRIORITY] = { .type = NLA_U32 },
};
#define IN4_ADDR_HSIZE_SHIFT 8
......@@ -835,6 +836,9 @@ static struct in_ifaddr *rtm_to_ifaddr(struct net *net, struct nlmsghdr *nlh,
else
memcpy(ifa->ifa_label, dev->name, IFNAMSIZ);
if (tb[IFA_RT_PRIORITY])
ifa->ifa_rt_priority = nla_get_u32(tb[IFA_RT_PRIORITY]);
if (tb[IFA_CACHEINFO]) {
struct ifa_cacheinfo *ci;
......@@ -906,12 +910,20 @@ static int inet_rtm_newaddr(struct sk_buff *skb, struct nlmsghdr *nlh,
return __inet_insert_ifa(ifa, nlh, NETLINK_CB(skb).portid,
extack);
} else {
u32 new_metric = ifa->ifa_rt_priority;
inet_free_ifa(ifa);
if (nlh->nlmsg_flags & NLM_F_EXCL ||
!(nlh->nlmsg_flags & NLM_F_REPLACE))
return -EEXIST;
ifa = ifa_existing;
if (ifa->ifa_rt_priority != new_metric) {
fib_modify_prefix_metric(ifa, new_metric);
ifa->ifa_rt_priority = new_metric;
}
set_ifa_lifetime(ifa, valid_lft, prefered_lft);
cancel_delayed_work(&check_lifetime_work);
queue_delayed_work(system_power_efficient_wq,
......@@ -1549,6 +1561,7 @@ static size_t inet_nlmsg_size(void)
+ nla_total_size(4) /* IFA_BROADCAST */
+ nla_total_size(IFNAMSIZ) /* IFA_LABEL */
+ nla_total_size(4) /* IFA_FLAGS */
+ nla_total_size(4) /* IFA_RT_PRIORITY */
+ nla_total_size(sizeof(struct ifa_cacheinfo)); /* IFA_CACHEINFO */
}
......@@ -1618,6 +1631,8 @@ static int inet_fill_ifaddr(struct sk_buff *skb, struct in_ifaddr *ifa,
(ifa->ifa_label[0] &&
nla_put_string(skb, IFA_LABEL, ifa->ifa_label)) ||
nla_put_u32(skb, IFA_FLAGS, ifa->ifa_flags) ||
(ifa->ifa_rt_priority &&
nla_put_u32(skb, IFA_RT_PRIORITY, ifa->ifa_rt_priority)) ||
put_cacheinfo(skb, ifa->ifa_cstamp, ifa->ifa_tstamp,
preferred, valid))
goto nla_put_failure;
......
......@@ -847,7 +847,8 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
* to fib engine. It is legal, because all events occur
* only when netlink is already locked.
*/
static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa)
static void fib_magic(int cmd, int type, __be32 dst, int dst_len,
struct in_ifaddr *ifa, u32 rt_priority)
{
struct net *net = dev_net(ifa->ifa_dev->dev);
u32 tb_id = l3mdev_fib_table(ifa->ifa_dev->dev);
......@@ -857,6 +858,7 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad
.fc_type = type,
.fc_dst = dst,
.fc_dst_len = dst_len,
.fc_priority = rt_priority,
.fc_prefsrc = ifa->ifa_local,
.fc_oif = ifa->ifa_dev->dev->ifindex,
.fc_nlflags = NLM_F_CREATE | NLM_F_APPEND,
......@@ -902,31 +904,57 @@ void fib_add_ifaddr(struct in_ifaddr *ifa)
}
}
fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim);
fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, 0);
if (!(dev->flags & IFF_UP))
return;
/* Add broadcast address, if it is explicitly assigned. */
if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF))
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
prim, 0);
if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) &&
(prefix != addr || ifa->ifa_prefixlen < 32)) {
if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
fib_magic(RTM_NEWROUTE,
dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
prefix, ifa->ifa_prefixlen, prim);
prefix, ifa->ifa_prefixlen, prim,
ifa->ifa_rt_priority);
/* Add network specific broadcasts, when it takes a sense */
if (ifa->ifa_prefixlen < 31) {
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim);
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32,
prim, 0);
fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask,
32, prim);
32, prim, 0);
}
}
}
/*
 * fib_modify_prefix_metric - re-install an address's prefix route with a
 * new metric.
 * @ifa:        address whose prefix route is moved; ifa->ifa_rt_priority must
 *              still hold the OLD metric when this is called, because it is
 *              used to select the route to delete.
 * @new_metric: metric the prefix route is re-added with.
 *
 * The function does not update ifa->ifa_rt_priority; the caller does that
 * after this returns.
 *
 * Nothing is done when the device is down, or when the address is one for
 * which fib_add_ifaddr() would not have installed a prefix route.
 */
void fib_modify_prefix_metric(struct in_ifaddr *ifa, u32 new_metric)
{
	__be32 prefix = ifa->ifa_address & ifa->ifa_mask;
	struct in_device *in_dev = ifa->ifa_dev;
	struct net_device *dev = in_dev->dev;
	int rt_type = dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST;

	/*
	 * This must be the exact logical negation of the test under which
	 * fib_add_ifaddr() installs the prefix route:
	 *     prefix != addr || ifa_prefixlen < 32
	 * so the last clause is "prefix == local AND prefixlen == 32".
	 * Joining the two with OR wrongly skipped the update for addresses
	 * equal to their own prefix (e.g. 192.168.1.0/24) and for /32
	 * addresses with a peer, although both do own a prefix route.
	 */
	if (!(dev->flags & IFF_UP) ||
	    ifa->ifa_flags & (IFA_F_SECONDARY | IFA_F_NOPREFIXROUTE) ||
	    ipv4_is_zeronet(prefix) ||
	    (prefix == ifa->ifa_local && ifa->ifa_prefixlen == 32))
		return;

	/* add the new */
	fib_magic(RTM_NEWROUTE, rt_type,
		  prefix, ifa->ifa_prefixlen, ifa, new_metric);

	/* delete the old */
	fib_magic(RTM_DELROUTE, rt_type,
		  prefix, ifa->ifa_prefixlen, ifa, ifa->ifa_rt_priority);
}
/* Delete primary or secondary address.
* Optionally, on secondary address promotion consider the addresses
* from subnet iprim as deleted, even if they are in device list.
......@@ -968,7 +996,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
if (!(ifa->ifa_flags & IFA_F_NOPREFIXROUTE))
fib_magic(RTM_DELROUTE,
dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST,
any, ifa->ifa_prefixlen, prim);
any, ifa->ifa_prefixlen, prim, 0);
subnet = 1;
}
......@@ -1052,17 +1080,20 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
no_promotions:
if (!(ok & BRD_OK))
fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32,
prim, 0);
if (subnet && ifa->ifa_prefixlen < 31) {
if (!(ok & BRD1_OK))
fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim);
fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32,
prim, 0);
if (!(ok & BRD0_OK))
fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim);
fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32,
prim, 0);
}
if (!(ok & LOCAL_OK)) {
unsigned int addr_type;
fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim);
fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim, 0);
/* Check, that this local address finally disappeared. */
addr_type = inet_addr_type_dev_table(dev_net(dev), dev,
......
This diff is collapsed.
......@@ -6,7 +6,8 @@
ret=0
TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt"
# all tests in this script. Can be overridden with -t option
TESTS="unregister down carrier nexthop ipv6_rt ipv4_rt ipv6_addr_metric ipv4_addr_metric"
VERBOSE=0
PAUSE_ON_FAIL=no
PAUSE=no
......@@ -642,6 +643,8 @@ check_route6()
local rc=0
out=$($IP -6 ro ls match ${pfx} | sed -e 's/ pref medium//')
[ "${out}" = "${expected}" ] && return 0
if [ -z "${out}" ]; then
if [ "$VERBOSE" = "1" ]; then
printf "\nNo route entry found\n"
......@@ -911,6 +914,98 @@ ipv6_route_test()
route_cleanup
}
# Probe the installed iproute2 for address "metric" support by searching the
# output of "ip addr help". Returns 0 when the keyword is present; otherwise
# prints a skip notice and returns 1.
ip_addr_metric_check()
{
	if ip addr help 2>&1 | grep -q metric; then
		return 0
	fi

	echo "iproute2 command does not support metric for addresses. Skipping test"
	return 1
}
# Check that the "metric" given with an IPv6 address shows up as the metric
# of that address's prefix route, using two dummy devices that share the
# 2001:db8:104::/64 subnet. Each step is reported through log_test.
# Returns 1 without testing anything when ip_addr_metric_check fails.
ipv6_addr_metric_test()
{
local rc
echo
echo "IPv6 prefix route tests"
ip_addr_metric_check || return 1
setup
# errexit is on while devices/addresses are created and switched off around
# each check_route6 so that its status reaches log_test via $?
set -e
$IP li add dummy1 type dummy
$IP li add dummy2 type dummy
$IP li set dummy1 up
$IP li set dummy2 up
# default entry is metric 256
run_cmd "$IP -6 addr add dev dummy1 2001:db8:104::1/64"
run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::2/64"
set +e
check_route6 "2001:db8:104::/64 dev dummy1 proto kernel metric 256 2001:db8:104::/64 dev dummy2 proto kernel metric 256"
log_test $? 0 "Default metric"
# re-add dummy1's address with metric 257; expected listing puts it after
# dummy2's metric-256 route
set -e
run_cmd "$IP -6 addr flush dev dummy1"
run_cmd "$IP -6 addr add dev dummy1 2001:db8:104::1/64 metric 257"
set +e
check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 256 2001:db8:104::/64 dev dummy1 proto kernel metric 257"
log_test $? 0 "User specified metric on first device"
# re-add dummy2's address with metric 258; expected listing is now dummy1
# (257) followed by dummy2 (258)
set -e
run_cmd "$IP -6 addr flush dev dummy2"
run_cmd "$IP -6 addr add dev dummy2 2001:db8:104::2/64 metric 258"
set +e
check_route6 "2001:db8:104::/64 dev dummy1 proto kernel metric 257 2001:db8:104::/64 dev dummy2 proto kernel metric 258"
log_test $? 0 "User specified metric on second device"
# errexit stays off from here on; each step captures its own status in rc
# and only checks the route table when the command itself succeeded
run_cmd "$IP -6 addr del dev dummy1 2001:db8:104::1/64 metric 257"
rc=$?
if [ $rc -eq 0 ]; then
check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 258"
rc=$?
fi
log_test $rc 0 "Delete of address on first device"
# changing the metric of an existing address must move its prefix route
run_cmd "$IP -6 addr change dev dummy2 2001:db8:104::2/64 metric 259"
rc=$?
if [ $rc -eq 0 ]; then
check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 259"
rc=$?
fi
log_test $rc 0 "Modify metric of address"
# verify prefix route removed on down
# NOTE(review): keep_addr_on_down=1 is presumably set so the address survives
# the down/up cycle and the link-up check below can see its route come back;
# the namespace name "testns" is hard-coded here -- confirm it matches setup
run_cmd "ip netns exec testns sysctl -qw net.ipv6.conf.all.keep_addr_on_down=1"
run_cmd "$IP li set dev dummy2 down"
rc=$?
if [ $rc -eq 0 ]; then
check_route6 ""
rc=$?
fi
log_test $rc 0 "Prefix route removed on link down"
# verify prefix route re-inserted with assigned metric
run_cmd "$IP li set dev dummy2 up"
rc=$?
if [ $rc -eq 0 ]; then
check_route6 "2001:db8:104::/64 dev dummy2 proto kernel metric 259"
rc=$?
fi
log_test $rc 0 "Prefix route with metric on link up"
$IP li del dummy1
$IP li del dummy2
cleanup
}
# add route for a prefix, flushing any existing routes first
# expected to be the first step of a test
add_route()
......@@ -955,6 +1050,8 @@ check_route()
local rc=0
out=$($IP ro ls match ${pfx})
[ "${out}" = "${expected}" ] && return 0
if [ -z "${out}" ]; then
if [ "$VERBOSE" = "1" ]; then
printf "\nNo route entry found\n"
......@@ -1181,6 +1278,86 @@ ipv4_route_test()
route_cleanup
}
# Check that the "metric" given with an IPv4 address shows up as the metric
# of that address's prefix route, using two dummy devices that share the
# 172.16.104.0/24 subnet. Each step is reported through log_test.
# Returns 1 without testing anything when ip_addr_metric_check fails.
ipv4_addr_metric_test()
{
local rc
echo
echo "IPv4 prefix route tests"
ip_addr_metric_check || return 1
setup
# errexit is on while devices/addresses are created and switched off around
# each check_route so that its status reaches log_test via $?
set -e
$IP li add dummy1 type dummy
$IP li add dummy2 type dummy
$IP li set dummy1 up
$IP li set dummy2 up
# no metric requested: the expected routes below carry no "metric" attribute
run_cmd "$IP addr add dev dummy1 172.16.104.1/24"
run_cmd "$IP addr add dev dummy2 172.16.104.2/24"
set +e
check_route "172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2"
log_test $? 0 "Default metric"
# re-add dummy1's address with metric 257; expected listing puts it after
# dummy2's route, which has no metric
set -e
run_cmd "$IP addr flush dev dummy1"
run_cmd "$IP addr add dev dummy1 172.16.104.1/24 metric 257"
set +e
check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 metric 257"
log_test $? 0 "User specified metric on first device"
# re-add dummy2's address with metric 258; expected listing is now dummy1
# (257) followed by dummy2 (258)
set -e
run_cmd "$IP addr flush dev dummy2"
run_cmd "$IP addr add dev dummy2 172.16.104.2/24 metric 258"
set +e
check_route "172.16.104.0/24 dev dummy1 proto kernel scope link src 172.16.104.1 metric 257 172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 258"
log_test $? 0 "User specified metric on second device"
# errexit stays off from here on; each step captures its own status in rc
# and only checks the route table when the command itself succeeded
run_cmd "$IP addr del dev dummy1 172.16.104.1/24 metric 257"
rc=$?
if [ $rc -eq 0 ]; then
check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 258"
rc=$?
fi
log_test $rc 0 "Delete of address on first device"
# changing the metric of an existing address must move its prefix route
run_cmd "$IP addr change dev dummy2 172.16.104.2/24 metric 259"
rc=$?
if [ $rc -eq 0 ]; then
check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 259"
rc=$?
fi
log_test $rc 0 "Modify metric of address"
# verify prefix route removed on down
run_cmd "$IP li set dev dummy2 down"
rc=$?
if [ $rc -eq 0 ]; then
check_route ""
rc=$?
fi
log_test $rc 0 "Prefix route removed on link down"
# verify prefix route re-inserted with assigned metric
run_cmd "$IP li set dev dummy2 up"
rc=$?
if [ $rc -eq 0 ]; then
check_route "172.16.104.0/24 dev dummy2 proto kernel scope link src 172.16.104.2 metric 259"
rc=$?
fi
log_test $rc 0 "Prefix route with metric on link up"
$IP li del dummy1
$IP li del dummy2
cleanup
}
################################################################################
# usage
......@@ -1245,6 +1422,8 @@ do
fib_nexthop_test|nexthop) fib_nexthop_test;;
ipv6_route_test|ipv6_rt) ipv6_route_test;;
ipv4_route_test|ipv4_rt) ipv4_route_test;;
ipv6_addr_metric) ipv6_addr_metric_test;;
ipv4_addr_metric) ipv4_addr_metric_test;;
help) echo "Test names: $TESTS"; exit 0;;
esac
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment