Commit 6cd54fc6 authored by Pablo Neira Ayuso

Merge tag 'ipvs-for-v4.7' of https://git.kernel.org/pub/scm/linux/kernel/git/horms/ipvs-next

Simon Horman says:

====================
IPVS Updates for v4.7

Please consider these enhancements to IPVS. They allow SIP connections
originating from real-servers to be load balanced by the SIP persistence
engine, as is already implemented in the other direction, and they improve
one-packet scheduling (OPS) performance.
====================
Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
parents d2b484b5 8fb04d9f
...@@ -731,6 +731,12 @@ struct ip_vs_pe { ...@@ -731,6 +731,12 @@ struct ip_vs_pe {
u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval, u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval,
bool inverse); bool inverse);
int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf); int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf);
/* create connections for real-server outgoing packets */
struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc,
struct ip_vs_dest *dest,
struct sk_buff *skb,
const struct ip_vs_iphdr *iph,
__be16 dport, __be16 cport);
}; };
/* The application module object (a.k.a. app incarnation) */ /* The application module object (a.k.a. app incarnation) */
...@@ -874,6 +880,7 @@ struct netns_ipvs { ...@@ -874,6 +880,7 @@ struct netns_ipvs {
/* Service counters */ /* Service counters */
atomic_t ftpsvc_counter; atomic_t ftpsvc_counter;
atomic_t nullsvc_counter; atomic_t nullsvc_counter;
atomic_t conn_out_counter;
#ifdef CONFIG_SYSCTL #ifdef CONFIG_SYSCTL
/* 1/rate drop and drop-entry variables */ /* 1/rate drop and drop-entry variables */
...@@ -1147,6 +1154,12 @@ static inline int sysctl_cache_bypass(struct netns_ipvs *ipvs) ...@@ -1147,6 +1154,12 @@ static inline int sysctl_cache_bypass(struct netns_ipvs *ipvs)
*/ */
const char *ip_vs_proto_name(unsigned int proto); const char *ip_vs_proto_name(unsigned int proto);
void ip_vs_init_hash_table(struct list_head *table, int rows); void ip_vs_init_hash_table(struct list_head *table, int rows);
/* Create a connection for a packet sent out by a real server.
 * Returns the new connection, or NULL when none is created (fwmark based
 * service, zero service port or zero @dport, or a creation failure).
 */
struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
struct ip_vs_dest *dest,
struct sk_buff *skb,
const struct ip_vs_iphdr *iph,
__be16 dport,
__be16 cport);
#define IP_VS_INIT_HASH_TABLE(t) ip_vs_init_hash_table((t), ARRAY_SIZE((t))) #define IP_VS_INIT_HASH_TABLE(t) ip_vs_init_hash_table((t), ARRAY_SIZE((t)))
#define IP_VS_APP_TYPE_FTP 1 #define IP_VS_APP_TYPE_FTP 1
...@@ -1378,6 +1391,10 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol ...@@ -1378,6 +1391,10 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol
bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
const union nf_inet_addr *daddr, __be16 dport); const union nf_inet_addr *daddr, __be16 dport);
/* Find the first real server record matching <protocol, daddr, dport>.
 * Returns NULL when there is no match.  To be called under RCU lock.
 */
struct ip_vs_dest *
ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
const union nf_inet_addr *daddr, __be16 dport);
int ip_vs_use_count_inc(void); int ip_vs_use_count_inc(void);
void ip_vs_use_count_dec(void); void ip_vs_use_count_dec(void);
int ip_vs_register_nl_ioctl(void); int ip_vs_register_nl_ioctl(void);
......
...@@ -104,6 +104,7 @@ static inline void ct_write_unlock_bh(unsigned int key) ...@@ -104,6 +104,7 @@ static inline void ct_write_unlock_bh(unsigned int key)
spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l); spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
} }
static void ip_vs_conn_expire(unsigned long data);
/* /*
* Returns hash value for IPVS connection entry * Returns hash value for IPVS connection entry
...@@ -453,10 +454,16 @@ ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af, ...@@ -453,10 +454,16 @@ ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af,
} }
EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
/*
 * Put back the conn and expire it right away, without going through
 * its timer: ip_vs_conn_expire() is invoked directly on @cp.
 * NOTE(review): presumably only valid when the caller holds the last
 * reference and no timer is pending, which is what ip_vs_conn_put()
 * checks before calling this - confirm.
 */
static void __ip_vs_conn_put_notimer(struct ip_vs_conn *cp)
{
__ip_vs_conn_put(cp);
/* the expire handler receives the connection as an unsigned long */
ip_vs_conn_expire((unsigned long)cp);
}
/* /*
* Put back the conn and restart its timer with its timeout * Put back the conn and restart its timer with its timeout
*/ */
void ip_vs_conn_put(struct ip_vs_conn *cp) static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp)
{ {
unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ? unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
0 : cp->timeout; 0 : cp->timeout;
...@@ -465,6 +472,16 @@ void ip_vs_conn_put(struct ip_vs_conn *cp) ...@@ -465,6 +472,16 @@ void ip_vs_conn_put(struct ip_vs_conn *cp)
__ip_vs_conn_put(cp); __ip_vs_conn_put(cp);
} }
/*
 * Put back the conn.
 *
 * A one-packet (OPS) connection whose refcnt reads 1 and which has no
 * pending timer is expired immediately; any other connection is put
 * back with its timer restarted.
 */
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
	/* common case: not eligible for immediate expiry */
	if (!(cp->flags & IP_VS_CONN_F_ONE_PACKET) ||
	    atomic_read(&cp->refcnt) != 1 ||
	    timer_pending(&cp->timer)) {
		__ip_vs_conn_put_timer(cp);
		return;
	}

	/* expire connection immediately */
	__ip_vs_conn_put_notimer(cp);
}
/* /*
* Fill a no_client_port connection with a client port number * Fill a no_client_port connection with a client port number
...@@ -819,7 +836,8 @@ static void ip_vs_conn_expire(unsigned long data) ...@@ -819,7 +836,8 @@ static void ip_vs_conn_expire(unsigned long data)
if (cp->control) if (cp->control)
ip_vs_control_del(cp); ip_vs_control_del(cp);
if (cp->flags & IP_VS_CONN_F_NFCT) { if ((cp->flags & IP_VS_CONN_F_NFCT) &&
!(cp->flags & IP_VS_CONN_F_ONE_PACKET)) {
/* Do not access conntracks during subsys cleanup /* Do not access conntracks during subsys cleanup
* because nf_conntrack_find_get can not be used after * because nf_conntrack_find_get can not be used after
* conntrack cleanup for the net. * conntrack cleanup for the net.
...@@ -834,7 +852,10 @@ static void ip_vs_conn_expire(unsigned long data) ...@@ -834,7 +852,10 @@ static void ip_vs_conn_expire(unsigned long data)
ip_vs_unbind_dest(cp); ip_vs_unbind_dest(cp);
if (cp->flags & IP_VS_CONN_F_NO_CPORT) if (cp->flags & IP_VS_CONN_F_NO_CPORT)
atomic_dec(&ip_vs_conn_no_cport_cnt); atomic_dec(&ip_vs_conn_no_cport_cnt);
call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free); if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
ip_vs_conn_rcu_free(&cp->rcu_head);
else
call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
atomic_dec(&ipvs->conn_count); atomic_dec(&ipvs->conn_count);
return; return;
} }
...@@ -850,7 +871,7 @@ static void ip_vs_conn_expire(unsigned long data) ...@@ -850,7 +871,7 @@ static void ip_vs_conn_expire(unsigned long data)
if (ipvs->sync_state & IP_VS_STATE_MASTER) if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs)); ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs));
ip_vs_conn_put(cp); __ip_vs_conn_put_timer(cp);
} }
/* Modify timer, so that it expires as soon as possible. /* Modify timer, so that it expires as soon as possible.
......
...@@ -68,6 +68,7 @@ EXPORT_SYMBOL(ip_vs_conn_put); ...@@ -68,6 +68,7 @@ EXPORT_SYMBOL(ip_vs_conn_put);
#ifdef CONFIG_IP_VS_DEBUG #ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL(ip_vs_get_debug_level); EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif #endif
EXPORT_SYMBOL(ip_vs_new_conn_out);
static int ip_vs_net_id __read_mostly; static int ip_vs_net_id __read_mostly;
/* netns cnt used for uniqueness */ /* netns cnt used for uniqueness */
...@@ -1100,6 +1101,143 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp, ...@@ -1100,6 +1101,143 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
} }
} }
/* Generic creation of a connection for a packet sent by a real server (RS).
 *
 * The packet source address and @dport describe the real server, the
 * packet destination address and @cport describe the client, and the
 * address/port of @svc describe the virtual side.  For a persistent
 * service a connection template is looked up (or created) first and is
 * set as the controller of the new connection.
 *
 * Nothing is created, and NULL is returned, when:
 * 1) the virtual service is fwmark based:
 *    in a fwmark-VS the actual vaddr and vport are unknown to IPVS
 * 2) the virtual service or the real server was configured without port:
 *    ports are required to allow match of different VS to the same
 *    RS ip-addr
 *
 * Returns the new connection (to be used for handling the outgoing
 * packet), or NULL on failure.
 */
struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
				      struct ip_vs_dest *dest,
				      struct sk_buff *skb,
				      const struct ip_vs_iphdr *iph,
				      __be16 dport,
				      __be16 cport)
{
	const union nf_inet_addr *vaddr = &svc->addr;
	const union nf_inet_addr *rs_addr = &iph->saddr;
	const union nf_inet_addr *cl_addr = &iph->daddr;
	__be16 vport = svc->port;
	struct ip_vs_conn_param param;
	struct ip_vs_conn *tmpl = NULL;
	struct ip_vs_conn *conn;
	union nf_inet_addr snet;
	unsigned int flags = 0;

	EnterFunction(12);

	/* pre-requisites: no fwmark service, both ports known */
	if (svc->fwmark || !vport || !dport)
		return NULL;

	/* a persistent service needs its connection template first */
	if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
		/* mask the client address the same way the ingress side does */
#ifdef CONFIG_IP_VS_IPV6
		if (svc->af == AF_INET6)
			ipv6_addr_prefix(&snet.in6, &cl_addr->in6,
					 (__force __u32)svc->netmask);
		else
#endif
			snet.ip = cl_addr->ip & svc->netmask;

		/* fill params, then reuse or create the template */
		if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol,
						  &snet, 0, vaddr,
						  vport, &param) < 0)
			return NULL;

		tmpl = ip_vs_ct_in_get(&param);
		if (tmpl) {
			/* template already exists: pe_data is not needed */
			kfree(param.pe_data);
		} else {
			tmpl = ip_vs_conn_new(&param, dest->af, rs_addr, dport,
					      IP_VS_CONN_F_TEMPLATE, dest, 0);
			if (!tmpl) {
				kfree(param.pe_data);
				return NULL;
			}
			tmpl->timeout = svc->timeout;
		}
	}

	/* one-packet flag only applies to UDP on an OPS service */
	if ((svc->flags & IP_VS_SVC_F_ONEPACKET) &&
	    iph->protocol == IPPROTO_UDP)
		flags = IP_VS_CONN_F_ONE_PACKET;

	/* create the connection itself */
	ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
			      cl_addr, cport, vaddr, vport, &param);
	conn = ip_vs_conn_new(&param, dest->af, rs_addr, dport, flags, dest, 0);

	/* the template is put back whether or not the creation succeeded */
	if (tmpl) {
		if (conn)
			ip_vs_control_add(conn, tmpl);
		ip_vs_conn_put(tmpl);
	}
	if (!conn)
		return NULL;

	ip_vs_conn_stats(conn, svc);

	IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u "
		      "d:%s:%u conn->flags:%X conn->refcnt:%d\n",
		      ip_vs_fwd_tag(conn),
		      IP_VS_DBG_ADDR(conn->af, &conn->caddr),
		      ntohs(conn->cport),
		      IP_VS_DBG_ADDR(conn->af, &conn->vaddr),
		      ntohs(conn->vport),
		      IP_VS_DBG_ADDR(conn->af, &conn->daddr),
		      ntohs(conn->dport),
		      conn->flags, atomic_read(&conn->refcnt));
	LeaveFunction(12);

	return conn;
}
/* Try to create a connection for an outgoing packet that is treated as a
 * request initiated by a real server, so that later responses from the
 * external client can be routed to the right real server.
 * Used also for outgoing responses in OPS mode.
 *
 * The real server is looked up by the packet's protocol, source address
 * and first transport port; the connection itself is created by the
 * conn_out callback of the persistence engine bound to its service.
 *
 * Returns the connection produced by that callback, or NULL when the
 * hook is NF_INET_LOCAL_IN, the ports cannot be read, or no real server,
 * service or conn_out callback is found.
 */
static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum,
					      struct netns_ipvs *ipvs,
					      int af, struct sk_buff *skb,
					      const struct ip_vs_iphdr *iph)
{
	struct ip_vs_conn *conn = NULL;
	struct ip_vs_service *svc;
	struct ip_vs_dest *rs;
	struct ip_vs_pe *pe;
	__be16 _ports[2], *pptr;

	if (hooknum == NF_INET_LOCAL_IN)
		return NULL;

	pptr = frag_safe_skb_hp(skb, iph->len,
				sizeof(_ports), _ports, iph);
	if (!pptr)
		return NULL;

	rcu_read_lock();

	rs = ip_vs_find_real_service(ipvs, af, iph->protocol,
				     &iph->saddr, pptr[0]);
	if (!rs)
		goto unlock;

	svc = rcu_dereference(rs->svc);
	if (!svc)
		goto unlock;

	/* connection management is delegated to the persistence engine */
	pe = rcu_dereference(svc->pe);
	if (pe && pe->conn_out)
		conn = pe->conn_out(svc, rs, skb, iph,
				    pptr[0], pptr[1]);

unlock:
	rcu_read_unlock();

	return conn;
}
/* Handle response packets: rewrite addresses and send away... /* Handle response packets: rewrite addresses and send away...
*/ */
static unsigned int static unsigned int
...@@ -1245,6 +1383,22 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in ...@@ -1245,6 +1383,22 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
if (likely(cp)) if (likely(cp))
return handle_response(af, skb, pd, cp, &iph, hooknum); return handle_response(af, skb, pd, cp, &iph, hooknum);
/* Check for real-server-started requests */
if (atomic_read(&ipvs->conn_out_counter)) {
/* Currently only for UDP:
* connection oriented protocols typically use
* ephemeral ports for outgoing connections, so
* related incoming responses would not match any VS
*/
if (pp->protocol == IPPROTO_UDP) {
cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph);
if (likely(cp))
return handle_response(af, skb, pd, cp, &iph,
hooknum);
}
}
if (sysctl_nat_icmp_send(ipvs) && if (sysctl_nat_icmp_send(ipvs) &&
(pp->protocol == IPPROTO_TCP || (pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP || pp->protocol == IPPROTO_UDP ||
......
...@@ -567,6 +567,36 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol, ...@@ -567,6 +567,36 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
return false; return false;
} }
/* Look up a real service record by <proto,addr,port>.
 *
 * An entry matches when its port, address family and address are equal
 * to the requested ones and either its protocol equals @protocol or it
 * has vfwmark set.  If several records share the same <proto,addr,port>,
 * only the first one found is returned.
 *
 * Returns the matching record, or NULL if there is none.
 * To be called under RCU lock.
 */
struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
					   __u16 protocol,
					   const union nf_inet_addr *daddr,
					   __be16 dport)
{
	struct ip_vs_dest *dest;
	unsigned int bucket;

	/* Check for "full" addressed entries */
	bucket = ip_vs_rs_hashkey(af, daddr, dport);

	hlist_for_each_entry_rcu(dest, &ipvs->rs_table[bucket], d_list) {
		if (dest->port != dport || dest->af != af)
			continue;
		if (!ip_vs_addr_equal(af, &dest->addr, daddr))
			continue;
		if (dest->protocol != protocol && !dest->vfwmark)
			continue;

		/* HIT */
		return dest;
	}

	return NULL;
}
/* Lookup destination by {addr,port} in the given service /* Lookup destination by {addr,port} in the given service
* Called under RCU lock. * Called under RCU lock.
*/ */
...@@ -1253,6 +1283,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, ...@@ -1253,6 +1283,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
atomic_inc(&ipvs->ftpsvc_counter); atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0) else if (svc->port == 0)
atomic_inc(&ipvs->nullsvc_counter); atomic_inc(&ipvs->nullsvc_counter);
if (svc->pe && svc->pe->conn_out)
atomic_inc(&ipvs->conn_out_counter);
ip_vs_start_estimator(ipvs, &svc->stats); ip_vs_start_estimator(ipvs, &svc->stats);
...@@ -1293,6 +1325,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) ...@@ -1293,6 +1325,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
struct ip_vs_scheduler *sched = NULL, *old_sched; struct ip_vs_scheduler *sched = NULL, *old_sched;
struct ip_vs_pe *pe = NULL, *old_pe = NULL; struct ip_vs_pe *pe = NULL, *old_pe = NULL;
int ret = 0; int ret = 0;
bool new_pe_conn_out, old_pe_conn_out;
/* /*
* Lookup the scheduler, by 'u->sched_name' * Lookup the scheduler, by 'u->sched_name'
...@@ -1355,8 +1388,16 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) ...@@ -1355,8 +1388,16 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
svc->netmask = u->netmask; svc->netmask = u->netmask;
old_pe = rcu_dereference_protected(svc->pe, 1); old_pe = rcu_dereference_protected(svc->pe, 1);
if (pe != old_pe) if (pe != old_pe) {
rcu_assign_pointer(svc->pe, pe); rcu_assign_pointer(svc->pe, pe);
/* check for optional methods in new pe */
new_pe_conn_out = (pe && pe->conn_out) ? true : false;
old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false;
if (new_pe_conn_out && !old_pe_conn_out)
atomic_inc(&svc->ipvs->conn_out_counter);
if (old_pe_conn_out && !new_pe_conn_out)
atomic_dec(&svc->ipvs->conn_out_counter);
}
out: out:
ip_vs_scheduler_put(old_sched); ip_vs_scheduler_put(old_sched);
...@@ -1389,6 +1430,8 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup) ...@@ -1389,6 +1430,8 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
/* Unbind persistence engine, keep svc->pe */ /* Unbind persistence engine, keep svc->pe */
old_pe = rcu_dereference_protected(svc->pe, 1); old_pe = rcu_dereference_protected(svc->pe, 1);
if (old_pe && old_pe->conn_out)
atomic_dec(&ipvs->conn_out_counter);
ip_vs_pe_put(old_pe); ip_vs_pe_put(old_pe);
/* /*
...@@ -3957,6 +4000,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs) ...@@ -3957,6 +4000,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
(unsigned long) ipvs); (unsigned long) ipvs);
atomic_set(&ipvs->ftpsvc_counter, 0); atomic_set(&ipvs->ftpsvc_counter, 0);
atomic_set(&ipvs->nullsvc_counter, 0); atomic_set(&ipvs->nullsvc_counter, 0);
atomic_set(&ipvs->conn_out_counter, 0);
/* procfs stats */ /* procfs stats */
ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats); ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
......
...@@ -93,6 +93,10 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) ...@@ -93,6 +93,10 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
return; return;
/* Never alter conntrack for OPS conns (no reply is expected) */
if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
return;
/* Alter reply only in original direction */ /* Alter reply only in original direction */
if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
return; return;
......
...@@ -143,6 +143,20 @@ static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf) ...@@ -143,6 +143,20 @@ static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
return cp->pe_data_len; return cp->pe_data_len;
} }
/* conn_out callback of the SIP persistence engine.
 *
 * For UDP packets the work is delegated to ip_vs_new_conn_out(); for any
 * other protocol no connection is created and NULL is returned.
 */
static struct ip_vs_conn *
ip_vs_sip_conn_out(struct ip_vs_service *svc,
		   struct ip_vs_dest *dest,
		   struct sk_buff *skb,
		   const struct ip_vs_iphdr *iph,
		   __be16 dport,
		   __be16 cport)
{
	/* currently no need to handle other than UDP */
	struct ip_vs_conn *conn = NULL;

	if (likely(iph->protocol == IPPROTO_UDP))
		conn = ip_vs_new_conn_out(svc, dest, skb, iph, dport, cport);

	return conn;
}
static struct ip_vs_pe ip_vs_sip_pe = static struct ip_vs_pe ip_vs_sip_pe =
{ {
.name = "sip", .name = "sip",
...@@ -153,6 +167,7 @@ static struct ip_vs_pe ip_vs_sip_pe = ...@@ -153,6 +167,7 @@ static struct ip_vs_pe ip_vs_sip_pe =
.ct_match = ip_vs_sip_ct_match, .ct_match = ip_vs_sip_ct_match,
.hashkey_raw = ip_vs_sip_hashkey_raw, .hashkey_raw = ip_vs_sip_hashkey_raw,
.show_pe_data = ip_vs_sip_show_pe_data, .show_pe_data = ip_vs_sip_show_pe_data,
.conn_out = ip_vs_sip_conn_out,
}; };
static int __init ip_vs_sip_init(void) static int __init ip_vs_sip_init(void)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment