Commit 54f47c5d authored by Julian Anastasov's avatar Julian Anastasov Committed by Arnaldo Carvalho de Melo

[IPVS]: Properly handle non-linear skbs.

Most of the changes come from Paul `Rusty' Russell. Now we
modify the skbs only for IPVS packets.
parent 3a9a3e7d
......@@ -8,7 +8,7 @@
#include <asm/types.h> /* For __uXX types */
#define IP_VS_VERSION_CODE 0x010107
#define IP_VS_VERSION_CODE 0x010108
#define NVERSION(version) \
(version >> 16) & 0xFF, \
(version >> 8) & 0xFF, \
......@@ -272,22 +272,22 @@ extern int ip_vs_get_debug_level(void);
if (net_ratelimit()) \
printk(KERN_DEBUG "IPVS: " msg); \
} while (0)
#define IP_VS_DBG_PKT(level, pp, iph, msg) \
#define IP_VS_DBG_PKT(level, pp, skb, ofs, msg) \
do { \
if (level <= ip_vs_get_debug_level()) \
pp->debug_packet(pp, iph, msg); \
pp->debug_packet(pp, skb, ofs, msg); \
} while (0)
#define IP_VS_DBG_RL_PKT(level, pp, iph, msg) \
#define IP_VS_DBG_RL_PKT(level, pp, skb, ofs, msg) \
do { \
if (level <= ip_vs_get_debug_level() && \
net_ratelimit()) \
pp->debug_packet(pp, iph, msg); \
pp->debug_packet(pp, skb, ofs, msg); \
} while (0)
#else /* NO DEBUGGING at ALL */
#define IP_VS_DBG(level, msg...) do {} while (0)
#define IP_VS_DBG_RL(msg...) do {} while (0)
#define IP_VS_DBG_PKT(level, pp, iph, msg) do {} while (0)
#define IP_VS_DBG_RL_PKT(level, pp, iph, msg) do {} while (0)
#define IP_VS_DBG_PKT(level, pp, skb, ofs, msg) do {} while (0)
#define IP_VS_DBG_RL_PKT(level, pp, skb, ofs, msg) do {} while (0)
#endif
#define IP_VS_BUG() BUG()
......@@ -395,18 +395,6 @@ enum {
IP_VS_ICMP_S_LAST,
};
/*
* Transport protocol header
*/
union ip_vs_tphdr {
unsigned char *raw;
struct udphdr *uh;
struct tcphdr *th;
struct icmphdr *icmph;
__u16 *portp;
};
/*
* Delta sequence info structure
* Each ip_vs_conn has 2 (output AND input seq. changes).
......@@ -459,36 +447,36 @@ struct ip_vs_protocol {
void (*exit)(struct ip_vs_protocol *pp);
int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h,
int (*conn_schedule)(struct sk_buff *skb,
struct ip_vs_protocol *pp,
int *verdict, struct ip_vs_conn **cpp);
struct ip_vs_conn *
(*conn_in_get)(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct iphdr *iph,
union ip_vs_tphdr h, int inverse);
(*conn_in_get)(const struct sk_buff *skb,
struct ip_vs_protocol *pp,
const struct iphdr *iph,
unsigned int proto_off,
int inverse);
struct ip_vs_conn *
(*conn_out_get)(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct iphdr *iph,
union ip_vs_tphdr h, int inverse);
(*conn_out_get)(const struct sk_buff *skb,
struct ip_vs_protocol *pp,
const struct iphdr *iph,
unsigned int proto_off,
int inverse);
int (*snat_handler)(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
struct iphdr *iph, union ip_vs_tphdr h, int size);
int (*snat_handler)(struct sk_buff **pskb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp);
int (*dnat_handler)(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
struct iphdr *iph, union ip_vs_tphdr h, int size);
int (*dnat_handler)(struct sk_buff **pskb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp);
int (*csum_check)(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct iphdr *iph,
union ip_vs_tphdr h, int size);
int (*csum_check)(struct sk_buff *skb, struct ip_vs_protocol *pp);
const char *(*state_name)(int state);
int (*state_transition)(struct ip_vs_conn *cp, int direction,
struct iphdr *iph, union ip_vs_tphdr h,
const struct sk_buff *skb,
struct ip_vs_protocol *pp);
int (*register_app)(struct ip_vs_app *inc);
......@@ -497,8 +485,10 @@ struct ip_vs_protocol {
int (*app_conn_bind)(struct ip_vs_conn *cp);
void (*debug_packet)(struct ip_vs_protocol *pp, struct iphdr *iph,
char *msg);
void (*debug_packet)(struct ip_vs_protocol *pp,
const struct sk_buff *skb,
int offset,
const char *msg);
void (*timeout_change)(struct ip_vs_protocol *pp, int flags);
......@@ -638,7 +628,7 @@ struct ip_vs_scheduler {
/* selecting a server from the given service */
struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc,
struct iphdr *iph);
const struct sk_buff *skb);
};
......@@ -660,13 +650,13 @@ struct ip_vs_app
__u16 port; /* port number in net order */
atomic_t usecnt; /* usage counter */
/* output hook */
/* output hook: return false if can't linearize. diff set for TCP. */
int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *,
struct sk_buff *);
struct sk_buff **, int *diff);
/* input hook */
/* input hook: return false if can't linearize. diff set for TCP. */
int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *,
struct sk_buff *);
struct sk_buff **, int *diff);
/* ip_vs_app initializer */
int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *);
......@@ -686,20 +676,21 @@ struct ip_vs_app
int timeouts_size;
int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_app *app,
struct iphdr *iph, union ip_vs_tphdr h,
int *verdict, struct ip_vs_conn **cpp);
struct ip_vs_conn *
(*conn_in_get)(struct sk_buff *skb, struct ip_vs_app *app,
struct iphdr *iph, union ip_vs_tphdr h, int inverse);
(*conn_in_get)(const struct sk_buff *skb, struct ip_vs_app *app,
const struct iphdr *iph, unsigned int proto_off,
int inverse);
struct ip_vs_conn *
(*conn_out_get)(struct sk_buff *skb, struct ip_vs_app *app,
struct iphdr *iph, union ip_vs_tphdr h, int inverse);
(*conn_out_get)(const struct sk_buff *skb, struct ip_vs_app *app,
const struct iphdr *iph, unsigned int proto_off,
int inverse);
int (*state_transition)(struct ip_vs_conn *cp, int direction,
struct iphdr *iph,
union ip_vs_tphdr h, struct ip_vs_app *app);
const struct sk_buff *skb,
struct ip_vs_app *app);
void (*timeout_change)(struct ip_vs_app *app, int flags);
};
......@@ -839,8 +830,8 @@ register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port);
extern int ip_vs_app_inc_get(struct ip_vs_app *inc);
extern void ip_vs_app_inc_put(struct ip_vs_app *inc);
extern int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb);
extern int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb);
extern int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff **pskb);
extern int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff **pskb);
extern int ip_vs_skb_replace(struct sk_buff *skb, int pri,
char *o_buf, int o_len, char *n_buf, int n_len);
extern int ip_vs_app_init(void);
......@@ -856,6 +847,10 @@ extern void ip_vs_protocol_timeout_change(int flags);
extern int *ip_vs_create_timeout_table(int *table, int size);
extern int
ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to);
extern void
ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
int offset, const char *msg);
extern struct ip_vs_protocol ip_vs_protocol_tcp;
extern struct ip_vs_protocol ip_vs_protocol_udp;
extern struct ip_vs_protocol ip_vs_protocol_icmp;
......@@ -875,9 +870,9 @@ extern int ip_vs_unbind_scheduler(struct ip_vs_service *svc);
extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name);
extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler);
extern struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph);
ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb);
extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
struct ip_vs_protocol *pp, union ip_vs_tphdr h);
struct ip_vs_protocol *pp);
/*
......@@ -940,7 +935,7 @@ extern int ip_vs_tunnel_xmit
extern int ip_vs_dr_xmit
(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
extern int ip_vs_icmp_xmit
(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, int offset);
extern void ip_vs_dst_reset(struct ip_vs_dest *dest);
......@@ -986,6 +981,11 @@ extern __inline__ char ip_vs_fwd_tag(struct ip_vs_conn *cp)
return fwd;
}
extern int ip_vs_make_skb_writable(struct sk_buff **pskb, int len);
extern void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct ip_vs_conn *cp, int dir);
extern u16 ip_vs_checksum_complete(struct sk_buff *skb, int offset);
static inline u16 ip_vs_check_diff(u32 old, u32 new, u16 oldsum)
{
......
......@@ -362,29 +362,18 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
spin_unlock(&cp->lock);
}
/*
* Output pkt hook. Will call bound ip_vs_app specific function
* called by ipvs packet handler, assumes previously checked cp!=NULL
* returns (new - old) skb->len diff.
*/
int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb,
struct ip_vs_app *app)
{
struct ip_vs_app *app;
int diff;
struct iphdr *iph;
unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
struct tcphdr *th;
__u32 seq;
/*
* check if application module is bound to
* this ip_vs_conn.
*/
if ((app = cp->app) == NULL)
if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
return 0;
iph = skb->nh.iph;
th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);
/*
* Remember seq number in case this pkt gets resized
......@@ -394,54 +383,72 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
/*
* Fix seq stuff if flagged as so.
*/
if (cp->protocol == IPPROTO_TCP) {
if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
vs_fix_seq(&cp->out_seq, th);
if (cp->flags & IP_VS_CONN_F_IN_SEQ)
vs_fix_ack_seq(&cp->in_seq, th);
}
/*
* Call private output hook function
*/
if (app->pkt_out == NULL)
return 0;
return 1;
diff = app->pkt_out(app, cp, skb);
if (!app->pkt_out(app, cp, pskb, &diff))
return 0;
/*
* Update ip_vs seq stuff if len has changed.
*/
if (diff != 0 && cp->protocol == IPPROTO_TCP)
if (diff != 0)
vs_seq_update(cp, &cp->out_seq,
IP_VS_CONN_F_OUT_SEQ, seq, diff);
return diff;
return 1;
}
/*
* Input pkt hook. Will call bound ip_vs_app specific function
* called by ipvs packet handler, assumes previously checked cp!=NULL.
* returns (new - old) skb->len diff.
* Output pkt hook. Will call bound ip_vs_app specific function
* called by ipvs packet handler, assumes previously checked cp!=NULL
* returns false if it can't handle packet (oom)
*/
int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb)
{
struct ip_vs_app *app;
int diff;
struct iphdr *iph;
struct tcphdr *th;
__u32 seq;
/*
* check if application module is bound to
* this ip_vs_conn.
*/
if ((app = cp->app) == NULL)
return 1;
/* TCP is complicated */
if (cp->protocol == IPPROTO_TCP)
return app_tcp_pkt_out(cp, pskb, app);
/*
* Call private output hook function
*/
if (app->pkt_out == NULL)
return 1;
return app->pkt_out(app, cp, pskb, NULL);
}
static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb,
struct ip_vs_app *app)
{
int diff;
unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
struct tcphdr *th;
__u32 seq;
if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
return 0;
iph = skb->nh.iph;
th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);
/*
* Remember seq number in case this pkt gets resized
......@@ -451,29 +458,57 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
/*
* Fix seq stuff if flagged as so.
*/
if (cp->protocol == IPPROTO_TCP) {
if (cp->flags & IP_VS_CONN_F_IN_SEQ)
vs_fix_seq(&cp->in_seq, th);
if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
vs_fix_ack_seq(&cp->out_seq, th);
}
/*
* Call private input hook function
*/
if (app->pkt_in == NULL)
return 0;
return 1;
diff = app->pkt_in(app, cp, skb);
if (!app->pkt_in(app, cp, pskb, &diff))
return 0;
/*
* Update ip_vs seq stuff if len has changed.
*/
if (diff != 0 && cp->protocol == IPPROTO_TCP)
if (diff != 0)
vs_seq_update(cp, &cp->in_seq,
IP_VS_CONN_F_IN_SEQ, seq, diff);
return diff;
return 1;
}
/*
* Input pkt hook. Will call bound ip_vs_app specific function
* called by ipvs packet handler, assumes previously checked cp!=NULL.
* returns false if can't handle packet (oom).
*/
int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb)
{
	struct ip_vs_app *app = cp->app;

	/* No application helper bound to this connection: nothing to do. */
	if (app == NULL)
		return 1;

	/* TCP needs the seq-number bookkeeping wrapped around the hook. */
	if (cp->protocol == IPPROTO_TCP)
		return app_tcp_pkt_in(cp, pskb, app);

	/*
	 * Non-TCP: invoke the helper's private input hook directly,
	 * if it provides one; diff tracking is not needed.
	 */
	return app->pkt_in == NULL ? 1 : app->pkt_in(app, cp, pskb, NULL);
}
......
......@@ -21,6 +21,7 @@
* and others.
*
* Changes:
* Paul `Rusty' Russell properly handle non-linear skbs
*
*/
......@@ -61,10 +62,11 @@ EXPORT_SYMBOL(ip_vs_conn_put);
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
EXPORT_SYMBOL(check_for_ip_vs_out);
EXPORT_SYMBOL(ip_vs_make_skb_writable);
/* ID used in ICMP lookups */
#define icmp_id(icmph) ((icmph->un).echo.id)
#define icmp_id(icmph) (((icmph)->un).echo.id)
const char *ip_vs_proto_name(unsigned proto)
{
......@@ -156,15 +158,51 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
static inline int
ip_vs_set_state(struct ip_vs_conn *cp, int direction,
struct iphdr *iph, union ip_vs_tphdr h,
const struct sk_buff *skb,
struct ip_vs_protocol *pp)
{
if (unlikely(!pp->state_transition))
return 0;
return pp->state_transition(cp, direction, iph, h, pp);
return pp->state_transition(cp, direction, skb, pp);
}
/*
 * Make the first writable_len bytes of *pskb safe to modify in place.
 * Returns 1 on success, 0 on failure (truncated packet or allocation
 * failure).  May replace *pskb with a fresh copy, so callers must
 * reload any cached pointers into the skb afterwards.
 */
int ip_vs_make_skb_writable(struct sk_buff **pskb, int writable_len)
{
struct sk_buff *skb = *pskb;
/* skb is already used, better copy skb and its payload */
if (unlikely(skb_shared(skb) || skb->sk))
goto copy_skb;
/* skb data is already used, copy it */
if (unlikely(skb_cloned(skb)))
goto copy_data;
/* Exclusive owner of header and data: just ensure the bytes are linear. */
return pskb_may_pull(skb, writable_len);
copy_data:
/* Refuse if the packet is shorter than the region we must write. */
if (unlikely(writable_len > skb->len))
return 0;
/* Unshare the data area; header stays ours.  0 on expand failure. */
return !pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
copy_skb:
if (unlikely(writable_len > skb->len))
return 0;
/* Duplicate skb head and payload; result is always linear. */
skb = skb_copy(skb, GFP_ATOMIC);
if (!skb)
return 0;
BUG_ON(skb_is_nonlinear(skb));
/* Rest of kernel will get very unhappy if we pass it a
suddenly-orphaned skbuff */
if ((*pskb)->sk)
skb_set_owner_w(skb, (*pskb)->sk);
/* Drop the original and hand the private copy back to the caller. */
kfree_skb(*pskb);
*pskb = skb;
return 1;
}
/*
* IPVS persistent scheduling function
* It creates a connection entry according to its template if exists,
......@@ -173,24 +211,24 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
* Protocols supported: TCP, UDP
*/
static struct ip_vs_conn *
ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
ip_vs_sched_persist(struct ip_vs_service *svc,
const struct sk_buff *skb,
__u16 ports[2])
{
struct ip_vs_conn *cp = NULL;
struct iphdr *iph = skb->nh.iph;
struct ip_vs_dest *dest;
const __u16 *portp;
struct ip_vs_conn *ct;
__u16 dport; /* destination port to forward */
__u32 snet; /* source network of the client, after masking */
portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
/* Mask saddr with the netmask to adjust template granularity */
snet = iph->saddr & svc->netmask;
IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
"mnet %u.%u.%u.%u\n",
NIPQUAD(iph->saddr), ntohs(portp[0]),
NIPQUAD(iph->daddr), ntohs(portp[1]),
NIPQUAD(iph->saddr), ntohs(ports[0]),
NIPQUAD(iph->daddr), ntohs(ports[1]),
NIPQUAD(snet));
/*
......@@ -206,11 +244,11 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
* service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
* is created for other persistent services.
*/
if (portp[1] == svc->port) {
if (ports[1] == svc->port) {
/* Check if a template already exists */
if (svc->port != FTPPORT)
ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
iph->daddr, portp[1]);
iph->daddr, ports[1]);
else
ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
iph->daddr, 0);
......@@ -220,7 +258,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
* No template found or the dest of the connection
* template is not available.
*/
dest = svc->scheduler->schedule(svc, iph);
dest = svc->scheduler->schedule(svc, skb);
if (dest == NULL) {
IP_VS_DBG(1, "p-schedule: no dest found.\n");
return NULL;
......@@ -235,7 +273,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
if (svc->port != FTPPORT)
ct = ip_vs_conn_new(iph->protocol,
snet, 0,
iph->daddr, portp[1],
iph->daddr,
ports[1],
dest->addr, dest->port,
0,
dest);
......@@ -277,7 +316,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
if (svc->port)
return NULL;
dest = svc->scheduler->schedule(svc, iph);
dest = svc->scheduler->schedule(svc, skb);
if (dest == NULL) {
IP_VS_DBG(1, "p-schedule: no dest found.\n");
return NULL;
......@@ -308,15 +347,15 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
/* set destination with the found template */
dest = ct->dest;
}
dport = portp[1];
dport = ports[1];
}
/*
* Create a new connection according to the template
*/
cp = ip_vs_conn_new(iph->protocol,
iph->saddr, portp[0],
iph->daddr, portp[1],
iph->saddr, ports[0],
iph->daddr, ports[1],
dest->addr, dport,
0,
dest);
......@@ -343,23 +382,26 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
* Protocols supported: TCP, UDP
*/
struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph)
ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_conn *cp = NULL;
struct iphdr *iph = skb->nh.iph;
struct ip_vs_dest *dest;
const __u16 *portp;
__u16 ports[2];
if (skb_copy_bits(skb, iph->ihl*4, ports, sizeof(ports)) < 0)
return NULL;
/*
* Persistent service
*/
if (svc->flags & IP_VS_SVC_F_PERSISTENT)
return ip_vs_sched_persist(svc, iph);
return ip_vs_sched_persist(svc, skb, ports);
/*
* Non-persistent service
*/
portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
if (!svc->fwmark && portp[1] != svc->port) {
if (!svc->fwmark && ports[1] != svc->port) {
if (!svc->port)
IP_VS_ERR("Schedule: port zero only supported "
"in persistent services, "
......@@ -367,7 +409,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph)
return NULL;
}
dest = svc->scheduler->schedule(svc, iph);
dest = svc->scheduler->schedule(svc, skb);
if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n");
return NULL;
......@@ -377,9 +419,9 @@ ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph)
* Create a connection entry.
*/
cp = ip_vs_conn_new(iph->protocol,
iph->saddr, portp[0],
iph->daddr, portp[1],
dest->addr, dest->port?dest->port:portp[1],
iph->saddr, ports[0],
iph->daddr, ports[1],
dest->addr, dest->port?dest->port:ports[1],
0,
dest);
if (cp == NULL)
......@@ -404,10 +446,16 @@ ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph)
* no destination is available for a new connection.
*/
int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
struct ip_vs_protocol *pp, union ip_vs_tphdr h)
struct ip_vs_protocol *pp)
{
__u16 ports[2];
struct iphdr *iph = skb->nh.iph;
if (skb_copy_bits(skb, iph->ihl*4, ports, sizeof(ports)) < 0) {
ip_vs_service_put(svc);
return NF_DROP;
}
/* if it is fwmark-based service, the cache_bypass sysctl is up
and the destination is RTN_UNICAST (and not local), then create
a cache_bypass connection entry */
......@@ -421,21 +469,19 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
/* create a new connection entry */
IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
cp = ip_vs_conn_new(iph->protocol,
iph->saddr, h.portp[0],
iph->daddr, h.portp[1],
iph->saddr, ports[0],
iph->daddr, ports[1],
0, 0,
IP_VS_CONN_F_BYPASS,
NULL);
if (cp == NULL) {
kfree_skb(skb);
return NF_STOLEN;
}
if (cp == NULL)
return NF_DROP;
/* statistics */
ip_vs_in_stats(cp, skb);
/* set state */
cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, iph, h, pp);
cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
/* transmit the first SYN packet */
ret = cp->packet_xmit(skb, cp, pp);
......@@ -451,7 +497,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
* listed in the ipvs table), pass the packets, because it is
* not ipvs job to decide to drop the packets.
*/
if ((svc->port == FTPPORT) && (h.portp[1] != FTPPORT)) {
if ((svc->port == FTPPORT) && (ports[1] != FTPPORT)) {
ip_vs_service_put(svc);
return NF_ACCEPT;
}
......@@ -466,8 +512,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
* ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
*/
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
kfree_skb(skb);
return NF_STOLEN;
return NF_DROP;
}
......@@ -479,22 +524,80 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
* for VS/NAT.
*/
static unsigned int ip_vs_post_routing(unsigned int hooknum,
struct sk_buff **skb_p,
struct sk_buff **pskb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
struct sk_buff *skb = *skb_p;
if (!(skb->nfcache & NFC_IPVS_PROPERTY))
if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY))
return NF_ACCEPT;
/* The packet was sent from IPVS, exit this chain */
(*okfn)(skb);
(*okfn)(*pskb);
return NF_STOLEN;
}
u16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
{
return (u16) csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
}
static inline struct sk_buff *
ip_vs_gather_frags(struct sk_buff *skb)
{
skb = ip_defrag(skb);
if (skb)
ip_send_check(skb->nh.iph);
return skb;
}
/*
* Packet has been made sufficiently writable in caller
* - inout: 1=in->out, 0=out->in
*/
/*
 * Rewrite an ICMP error packet and the IP/transport header embedded in
 * its payload for VS/NAT, then recompute all affected checksums.
 * inout: 1 = in->out (hide real server behind vaddr),
 *        0 = out->in (redirect to the real server's daddr).
 * Caller must have already made the skb writable through the mangled
 * region (see ip_vs_make_skb_writable).
 */
void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct ip_vs_conn *cp, int inout)
{
struct iphdr *iph = skb->nh.iph;
unsigned int icmp_offset = iph->ihl*4;
struct icmphdr *icmph = (struct icmphdr *)(skb->nh.raw + icmp_offset);
/* The IP header of the original datagram follows the ICMP header. */
struct iphdr *ciph = (struct iphdr *)(icmph + 1);
if (inout) {
/* Outgoing: outer source and embedded dest become the virtual addr. */
iph->saddr = cp->vaddr;
ip_send_check(iph);
ciph->daddr = cp->vaddr;
ip_send_check(ciph);
} else {
/* Incoming: outer dest and embedded source become the real server. */
iph->daddr = cp->daddr;
ip_send_check(iph);
ciph->saddr = cp->daddr;
ip_send_check(ciph);
}
/* the TCP/UDP port */
if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
/* ports[0] = embedded source port, ports[1] = embedded dest port */
__u16 *ports = (void *)ciph + ciph->ihl*4;
if (inout)
ports[1] = cp->vport;
else
ports[0] = cp->dport;
}
/* And finally the ICMP checksum */
/* Must come last: covers the embedded header bytes just rewritten. */
icmph->checksum = 0;
icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
skb->ip_summed = CHECKSUM_UNNECESSARY;
if (inout)
IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
"Forwarding altered outgoing ICMP");
else
IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
"Forwarding altered incoming ICMP");
}
/*
* Handle ICMP messages in the inside-to-outside direction (outgoing).
......@@ -503,44 +606,33 @@ static unsigned int ip_vs_post_routing(unsigned int hooknum,
* Currently handles error types - unreachable, quench, ttl exceeded.
* (Only used in VS/NAT)
*/
static int ip_vs_out_icmp(struct sk_buff **skb_p, int *related)
static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
{
struct sk_buff *skb = *skb_p;
struct sk_buff *skb = *pskb;
struct iphdr *iph;
struct icmphdr *icmph;
struct iphdr *ciph; /* The ip header contained within the ICMP */
unsigned short ihl;
unsigned short len;
unsigned short clen, cihl;
struct icmphdr icmph;
struct iphdr ciph; /* The ip header contained within the ICMP */
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
union ip_vs_tphdr h;
unsigned int offset, ihl, verdict;
*related = 1;
/* reassemble IP fragments, but will it happen in ICMP packets?? */
/* reassemble IP fragments */
if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
skb = ip_defrag(skb);
skb = ip_vs_gather_frags(skb);
if (!skb)
return NF_STOLEN;
*skb_p = skb;
}
if (skb_is_nonlinear(skb)) {
if (skb_linearize(skb, GFP_ATOMIC) != 0)
return NF_DROP;
ip_send_check(skb->nh.iph);
*pskb = skb;
}
iph = skb->nh.iph;
ihl = iph->ihl << 2;
icmph = (struct icmphdr *)((char *)iph + ihl);
len = ntohs(iph->tot_len) - ihl;
if (len < sizeof(struct icmphdr))
offset = ihl = iph->ihl * 4;
if (skb_copy_bits(skb, offset, &icmph, sizeof(icmph)) < 0)
return NF_DROP;
IP_VS_DBG(12, "outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
icmph->type, ntohs(icmp_id(icmph)),
IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
icmph.type, ntohs(icmp_id(&icmph)),
NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
/*
......@@ -550,86 +642,80 @@ static int ip_vs_out_icmp(struct sk_buff **skb_p, int *related)
* this means that some packets will manage to get a long way
* down this stack and then be rejected, but that's life.
*/
if ((icmph->type != ICMP_DEST_UNREACH) &&
(icmph->type != ICMP_SOURCE_QUENCH) &&
(icmph->type != ICMP_TIME_EXCEEDED)) {
if ((icmph.type != ICMP_DEST_UNREACH) &&
(icmph.type != ICMP_SOURCE_QUENCH) &&
(icmph.type != ICMP_TIME_EXCEEDED)) {
*related = 0;
return NF_ACCEPT;
}
/* Now find the contained IP header */
clen = len - sizeof(struct icmphdr);
if (clen < sizeof(struct iphdr))
return NF_DROP;
ciph = (struct iphdr *) (icmph + 1);
cihl = ciph->ihl << 2;
if (clen < cihl)
return NF_DROP;
offset += sizeof(icmph);
if (skb_copy_bits(skb, offset, &ciph, sizeof(ciph)) < 0)
return NF_ACCEPT; /* The packet looks wrong, ignore */
pp = ip_vs_proto_get(ciph->protocol);
pp = ip_vs_proto_get(ciph.protocol);
if (!pp)
return NF_ACCEPT;
/* Is the embedded protocol header present? */
if (unlikely(ciph->frag_off & __constant_htons(IP_OFFSET) &&
if (unlikely(ciph.frag_off & __constant_htons(IP_OFFSET) &&
(pp->minhlen || pp->dont_defrag)))
return NF_ACCEPT;
/* We need at least TCP/UDP ports here */
if (clen < cihl + pp->minhlen_icmp)
return NF_DROP;
h.raw = (char *) ciph + cihl;
/* Ensure the checksum is correct */
if (ip_compute_csum((unsigned char *) icmph, len)) {
/* Failed checksum! */
IP_VS_DBG(1, "forward ICMP: failed checksum from %d.%d.%d.%d!\n",
NIPQUAD(iph->saddr));
return NF_DROP;
}
IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
IP_VS_DBG_PKT(11, pp, ciph, "Handling outgoing ICMP for");
offset += ciph.ihl * 4;
/* ciph content is actually <protocol, caddr, cport, daddr, dport> */
cp = pp->conn_out_get(skb, pp, ciph, h, 1);
/* The embedded headers contain source and dest in reverse order */
cp = pp->conn_out_get(skb, pp, &ciph, offset, 1);
if (!cp)
return NF_ACCEPT;
verdict = NF_DROP;
if (IP_VS_FWD_METHOD(cp) != 0) {
IP_VS_ERR("shouldn't reach here, because the box is on the"
"half connection in the tun/dr module.\n");
}
/* Now we do real damage to this packet...! */
/* First change the source IP address, and recalc checksum */
iph->saddr = cp->vaddr;
ip_send_check(iph);
/* Now change the *dest* address in the contained IP */
ciph->daddr = cp->vaddr;
ip_send_check(ciph);
/* Ensure the checksum is correct */
if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
ip_vs_checksum_complete(skb, ihl)) {
/* Failed checksum! */
IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
NIPQUAD(iph->saddr));
goto out;
}
/* the TCP/UDP dest port - cannot redo check */
if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol)
h.portp[1] = cp->vport;
if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol)
offset += 2 * sizeof(__u16);
if (!ip_vs_make_skb_writable(pskb, offset))
goto out;
skb = *pskb;
/* And finally the ICMP checksum */
icmph->checksum = 0;
icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
skb->ip_summed = CHECKSUM_UNNECESSARY;
ip_vs_nat_icmp(skb, pp, cp, 1);
/* do the statistics and put it back */
ip_vs_out_stats(cp, skb);
__ip_vs_conn_put(cp);
IP_VS_DBG_PKT(11, pp, ciph, "Forwarding correct outgoing ICMP");
skb->nfcache |= NFC_IPVS_PROPERTY;
verdict = NF_ACCEPT;
return NF_ACCEPT;
out:
__ip_vs_conn_put(cp);
return verdict;
}
static inline int is_tcp_reset(const struct sk_buff *skb)
{
struct tcphdr tcph;
if (skb_copy_bits(skb, skb->nh.iph->ihl * 4, &tcph, sizeof(tcph)) < 0)
return 0;
return tcph.rst;
}
/*
* It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
......@@ -637,16 +723,15 @@ static int ip_vs_out_icmp(struct sk_buff **skb_p, int *related)
* rewrite addresses of the packet and send it on its way...
*/
static unsigned int
ip_vs_out(unsigned int hooknum, struct sk_buff **skb_p,
ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
struct sk_buff *skb = *skb_p;
struct sk_buff *skb = *pskb;
struct iphdr *iph;
struct ip_vs_protocol *pp;
union ip_vs_tphdr h;
struct ip_vs_conn *cp;
int size, ihl, firstfrag;
int ihl;
EnterFunction(11);
......@@ -655,10 +740,12 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **skb_p,
iph = skb->nh.iph;
if (unlikely(iph->protocol == IPPROTO_ICMP)) {
int related, verdict = ip_vs_out_icmp(skb_p, &related);
int related, verdict = ip_vs_out_icmp(pskb, &related);
if (related)
return verdict;
skb = *pskb;
iph = skb->nh.iph;
}
pp = ip_vs_proto_get(iph->protocol);
......@@ -668,105 +755,74 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **skb_p,
/* reassemble IP fragments */
if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) &&
!pp->dont_defrag)) {
skb = ip_defrag(skb);
skb = ip_vs_gather_frags(skb);
if (!skb)
return NF_STOLEN;
iph = skb->nh.iph;
*skb_p = skb;
*pskb = skb;
}
/* make sure that protocol header is available in skb data area,
note that skb data area may be reallocated. */
ihl = iph->ihl << 2;
firstfrag = !(iph->frag_off & __constant_htons(IP_OFFSET));
/*
* WARNING: we can work with !firstfrag packets, make sure
* each protocol handler checks for firstfrag
*/
if (firstfrag &&
!pskb_may_pull(skb, ihl+pp->minhlen))
return NF_DROP;
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
/*
* Check if the packet belongs to an existing entry
*/
cp = pp->conn_out_get(skb, pp, iph, h, 0);
cp = pp->conn_out_get(skb, pp, iph, ihl, 0);
if (unlikely(!cp)) {
if (sysctl_ip_vs_nat_icmp_send &&
(pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP) &&
ip_vs_lookup_real_service(iph->protocol,
iph->saddr, h.portp[0])) {
pp->protocol == IPPROTO_UDP)) {
__u16 ports[2];
if (skb_copy_bits(skb, ihl, ports, sizeof(ports)) < 0)
return NF_ACCEPT; /* Not for me */
if (ip_vs_lookup_real_service(iph->protocol,
iph->saddr, ports[0])) {
/*
* Notify the real server: there is no existing
* entry if it is not RST packet or not TCP packet.
* Notify the real server: there is no
* existing entry if it is not RST
* packet or not TCP packet.
*/
if (!h.th->rst || iph->protocol != IPPROTO_TCP) {
icmp_send(skb, ICMP_DEST_UNREACH,
if (iph->protocol != IPPROTO_TCP
|| !is_tcp_reset(skb)) {
icmp_send(skb,ICMP_DEST_UNREACH,
ICMP_PORT_UNREACH, 0);
kfree_skb(skb);
return NF_STOLEN;
return NF_DROP;
}
}
IP_VS_DBG_PKT(12, pp, iph,
}
IP_VS_DBG_PKT(12, pp, skb, 0,
"packet continues traversal as normal");
if (!pp->dont_defrag)
ip_send_check(iph);
return NF_ACCEPT;
}
/*
* If it has ip_vs_app helper, the helper may change the payload,
* so it needs full checksum checking and checksum calculation.
* If not, only the header (addr/port) is changed, so it is fast
* to do incremental checksum update, and let the destination host
* do final checksum checking.
*/
if (unlikely(cp->app && !pp->slave && skb_is_nonlinear(skb))) {
if (skb_linearize(skb, GFP_ATOMIC) != 0) {
ip_vs_conn_put(cp);
return NF_DROP;
}
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
}
size = skb->len - ihl;
IP_VS_DBG(11, "O-pkt: %s size=%d\n", pp->name, size);
/* do TCP/UDP checksum checking if it has application helper */
if (unlikely(cp->app && pp->csum_check && !pp->slave)) {
if (!pp->csum_check(skb, pp, iph, h, size)) {
ip_vs_conn_put(cp);
return NF_DROP;
}
}
IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
IP_VS_DBG_PKT(11, pp, iph, "Outgoing packet");
if (!ip_vs_make_skb_writable(pskb, ihl))
goto drop;
/* mangle the packet */
iph->saddr = cp->vaddr;
if (pp->snat_handler) {
pp->snat_handler(skb, pp, cp, iph, h, size);
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
}
ip_send_check(iph);
if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
goto drop;
skb = *pskb;
skb->nh.iph->saddr = cp->vaddr;
ip_send_check(skb->nh.iph);
IP_VS_DBG_PKT(10, pp, iph, "After SNAT");
IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
ip_vs_out_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, iph, h, pp);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
ip_vs_conn_put(cp);
skb->nfcache |= NFC_IPVS_PROPERTY;
LeaveFunction(11);
return NF_ACCEPT;
drop:
ip_vs_conn_put(cp);
return NF_DROP;
}
......@@ -777,198 +833,185 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **skb_p,
* they are changed by ipchains masquerading code.
*/
unsigned int
check_for_ip_vs_out(struct sk_buff **skb_p, int (*okfn)(struct sk_buff *))
check_for_ip_vs_out(struct sk_buff **pskb, int (*okfn)(struct sk_buff *))
{
unsigned int ret;
ret = ip_vs_out(NF_IP_FORWARD, skb_p, NULL, NULL, NULL);
ret = ip_vs_out(NF_IP_FORWARD, pskb, NULL, NULL, NULL);
if (ret != NF_ACCEPT) {
return ret;
} else {
/* send the packet immediately if it is already mangled
by ip_vs_out */
if ((*skb_p)->nfcache & NFC_IPVS_PROPERTY) {
(*okfn)(*skb_p);
if ((*pskb)->nfcache & NFC_IPVS_PROPERTY) {
(*okfn)(*pskb);
return NF_STOLEN;
}
}
return NF_ACCEPT;
}
/*
* Handle ICMP messages in the outside-to-inside direction (incoming).
* Find any that might be relevant, check against existing connections,
* forward to the right destination host if relevant.
* Currently handles error types - unreachable, quench, ttl exceeded
* Currently handles error types - unreachable, quench, ttl exceeded.
*/
static int ip_vs_in_icmp(struct sk_buff **skb_p, int *related)
static int ip_vs_in_icmp(struct sk_buff **pskb, int *related)
{
struct sk_buff *skb = *skb_p;
struct sk_buff *skb = *pskb;
struct iphdr *iph;
struct icmphdr *icmph;
struct iphdr *ciph; /* The ip header contained within the ICMP */
unsigned short len;
unsigned short clen, cihl;
struct icmphdr icmph;
struct iphdr ciph; /* The ip header contained within the ICMP */
struct ip_vs_conn *cp;
struct ip_vs_protocol *pp;
union ip_vs_tphdr h;
int rc;
unsigned int offset, ihl, verdict;
*related = 1;
if (skb_is_nonlinear(skb)) {
if (skb_linearize(skb, GFP_ATOMIC) != 0)
return NF_DROP;
ip_send_check(skb->nh.iph);
/* reassemble IP fragments */
if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
skb = ip_vs_gather_frags(skb);
if (!skb)
return NF_STOLEN;
*pskb = skb;
}
iph = skb->nh.iph;
icmph = (struct icmphdr *)((char *)iph+(iph->ihl<<2));
len = ntohs(iph->tot_len) - (iph->ihl<<2);
if (len < sizeof(struct icmphdr))
offset = ihl = iph->ihl * 4;
if (skb_copy_bits(skb, offset, &icmph, sizeof(icmph)) < 0)
return NF_DROP;
IP_VS_DBG(12, "icmp in (%d,%d) %u.%u.%u.%u -> %u.%u.%u.%u\n",
icmph->type, ntohs(icmp_id(icmph)),
IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
icmph.type, ntohs(icmp_id(&icmph)),
NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
if ((icmph->type != ICMP_DEST_UNREACH) &&
(icmph->type != ICMP_SOURCE_QUENCH) &&
(icmph->type != ICMP_TIME_EXCEEDED)) {
/*
* Work through seeing if this is for us.
* These checks are supposed to be in an order that means easy
* things are checked first to speed up processing.... however
* this means that some packets will manage to get a long way
* down this stack and then be rejected, but that's life.
*/
if ((icmph.type != ICMP_DEST_UNREACH) &&
(icmph.type != ICMP_SOURCE_QUENCH) &&
(icmph.type != ICMP_TIME_EXCEEDED)) {
*related = 0;
return NF_ACCEPT;
}
/*
* If we get here we have an ICMP error of one of the above 3 types
* Now find the contained IP header
*/
clen = len - sizeof(struct icmphdr);
if (clen < sizeof(struct iphdr))
return NF_DROP;
ciph = (struct iphdr *) (icmph + 1);
cihl = ciph->ihl << 2;
if (clen < cihl)
return NF_DROP;
/* Now find the contained IP header */
offset += sizeof(icmph);
if (skb_copy_bits(skb, offset, &ciph, sizeof(ciph)) < 0)
return NF_ACCEPT; /* The packet looks wrong, ignore */
pp = ip_vs_proto_get(ciph->protocol);
pp = ip_vs_proto_get(ciph.protocol);
if (!pp)
return NF_ACCEPT;
/* Is the embedded protocol header present? */
if (unlikely(ciph->frag_off & __constant_htons(IP_OFFSET) &&
if (unlikely(ciph.frag_off & __constant_htons(IP_OFFSET) &&
(pp->minhlen || pp->dont_defrag)))
return NF_ACCEPT;
/* We need at least TCP/UDP ports here */
if (clen < cihl + pp->minhlen_icmp)
return NF_DROP;
IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
/* Ensure the checksum is correct */
if (ip_compute_csum((unsigned char *) icmph, len)) {
/* Failed checksum! */
IP_VS_ERR_RL("incoming ICMP: failed checksum from "
"%d.%d.%d.%d!\n", NIPQUAD(iph->saddr));
return NF_DROP;
}
offset += ciph.ihl * 4;
h.raw = (char *) ciph + cihl;
/* The embedded headers contain source and dest in reverse order */
cp = pp->conn_in_get(skb, pp, &ciph, offset, 1);
if (!cp)
return NF_ACCEPT;
IP_VS_DBG_PKT(11, pp, ciph, "Handling incoming ICMP for");
verdict = NF_DROP;
/* This is pretty much what ip_vs_conn_in_get() does,
except parameters are in the reverse order */
cp = pp->conn_in_get(skb, pp, ciph, h, 1);
if (cp == NULL)
return NF_ACCEPT;
/* Ensure the checksum is correct */
if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
ip_vs_checksum_complete(skb, ihl)) {
/* Failed checksum! */
IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
NIPQUAD(iph->saddr));
goto out;
}
/* do the statistics and put it back */
ip_vs_in_stats(cp, skb);
rc = ip_vs_icmp_xmit(skb, cp, pp);
if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol)
offset += 2 * sizeof(__u16);
verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
out:
__ip_vs_conn_put(cp);
return rc;
}
return verdict;
}
/*
* Check if it's for virtual services, look it up,
* and send it on its way...
*/
static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff **skb_p,
ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
struct sk_buff *skb = *skb_p;
struct iphdr *iph = skb->nh.iph;
struct ip_vs_protocol *pp = ip_vs_proto_get(iph->protocol);
union ip_vs_tphdr h;
struct sk_buff *skb = *pskb;
struct iphdr *iph;
struct ip_vs_protocol *pp;
struct ip_vs_conn *cp;
int ihl, ret, restart;
int firstfrag;
int ret, restart;
int ihl;
/*
* Big tappo: only PACKET_HOST (neither loopback nor mcasts)
* ... don't know why 1st test DOES NOT include 2nd (?)
*/
if (unlikely(skb->pkt_type != PACKET_HOST || skb->dev == &loopback_dev)) {
if (unlikely(skb->pkt_type != PACKET_HOST
|| skb->dev == &loopback_dev || skb->sk)) {
IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
skb->pkt_type,
iph->protocol,
NIPQUAD(iph->daddr));
skb->nh.iph->protocol,
NIPQUAD(skb->nh.iph->daddr));
return NF_ACCEPT;
}
iph = skb->nh.iph;
if (unlikely(iph->protocol == IPPROTO_ICMP)) {
int related, verdict = ip_vs_in_icmp(skb_p, &related);
int related, verdict = ip_vs_in_icmp(pskb, &related);
if (related)
return verdict;
skb = *pskb;
iph = skb->nh.iph;
}
/* Protocol supported? */
pp = ip_vs_proto_get(iph->protocol);
if (unlikely(!pp))
return NF_ACCEPT;
/* make sure that protocol header is available in skb data area,
note that skb data area may be reallocated. */
ihl = iph->ihl << 2;
#if 0
/* Enable this when not in LOCAL_IN */
firstfrag = !(iph->frag_off & __constant_htons(IP_OFFSET));
/*
* WARNING: we can work with !firstfrag packets, make sure
* each protocol handler checks for firstfrag
*/
#else
firstfrag = 1;
#endif
if (firstfrag &&
!pskb_may_pull(skb, ihl+pp->minhlen))
return NF_DROP;
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
/*
* Check if the packet belongs to an existing connection entry
*/
cp = pp->conn_in_get(skb, pp, iph, h, 0);
cp = pp->conn_in_get(skb, pp, iph, ihl, 0);
if (unlikely(!cp)) {
int v;
if (!pp->conn_schedule(skb, pp, iph, h, &v, &cp)) {
if (!pp->conn_schedule(skb, pp, &v, &cp))
return v;
}
}
if (unlikely(!cp)) {
/* sorry, all this trouble for a no-hit :) */
IP_VS_DBG_PKT(12, pp, iph,
IP_VS_DBG_PKT(12, pp, skb, 0,
"packet continues traversal as normal");
return NF_ACCEPT;
}
IP_VS_DBG_PKT(11, pp, iph, "Incoming packet");
IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
/* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
......@@ -986,7 +1029,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **skb_p,
}
ip_vs_in_stats(cp, skb);
restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, iph, h, pp);
restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
if (cp->packet_xmit)
ret = cp->packet_xmit(skb, cp, pp);
else {
......@@ -1011,7 +1054,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **skb_p,
/*
* It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
* packets destined for 0.0.0.0/0.
* related packets destined for 0.0.0.0/0.
* When fwmark-based virtual service is used, such as transparent
* cache cluster, TCP packets can be marked and routed to ip_vs_in,
* but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
......@@ -1019,25 +1062,16 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **skb_p,
* and send them to ip_vs_in_icmp.
*/
static unsigned int
ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **skb_p,
ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb,
const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
struct sk_buff *skb = *skb_p;
struct iphdr *iph = skb->nh.iph;
int r;
if (iph->protocol != IPPROTO_ICMP)
if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP)
return NF_ACCEPT;
if (iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
skb = ip_defrag(skb);
if (!skb)
return NF_STOLEN;
*skb_p = skb;
}
return ip_vs_in_icmp(skb_p, &r);
return ip_vs_in_icmp(pskb, &r);
}
......
......@@ -202,10 +202,11 @@ static inline int is_overloaded(struct ip_vs_dest *dest)
* Destination hashing scheduling
*/
static struct ip_vs_dest *
ip_vs_dh_schedule(struct ip_vs_service *svc, struct iphdr *iph)
ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest;
struct ip_vs_dh_bucket *tbl;
struct iphdr *iph = skb->nh.iph;
IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
......
......@@ -87,39 +87,46 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
__u32 *addr, __u16 *port,
char **start, char **end)
{
unsigned char p1,p2,p3,p4,p5,p6;
unsigned char p[6];
int i = 0;
if (data_limit - data < plen) {
/* check if there is partial match */
if (strnicmp(data, pattern, data_limit - data) == 0)
return -1;
else
return 0;
}
while (data < data_limit) {
if (strnicmp(data, pattern, plen) != 0) {
data++;
continue;
return 0;
}
*start = data+plen;
p1 = simple_strtoul(data+plen, &data, 10);
if (*data != ',')
continue;
p2 = simple_strtoul(data+1, &data, 10);
if (*data != ',')
continue;
p3 = simple_strtoul(data+1, &data, 10);
if (*data != ',')
continue;
p4 = simple_strtoul(data+1, &data, 10);
if (*data != ',')
continue;
p5 = simple_strtoul(data+1, &data, 10);
if (*data != ',')
continue;
p6 = simple_strtoul(data+1, &data, 10);
if (*data != term)
continue;
*start = data + plen;
for (data = *start; *data != term; data++) {
if (data == data_limit)
return -1;
}
*end = data;
*addr = (p4<<24) | (p3<<16) | (p2<<8) | p1;
*port = (p6<<8) | p5;
return 1;
memset(p, 0, sizeof(p));
for (data = *start; data != *end; data++) {
if (*data >= '0' && *data <= '9') {
p[i] = p[i]*10 + *data - '0';
} else if (*data == ',' && i < 5) {
i++;
} else {
/* unexpected character */
return -1;
}
return 0;
}
if (i != 5)
return -1;
*addr = (p[3]<<24) | (p[2]<<16) | (p[1]<<8) | p[0];
*port = (p[5]<<8) | p[4];
return 1;
}
......@@ -136,8 +143,8 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
* "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
* xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
*/
static int ip_vs_ftp_out(struct ip_vs_app *app,
struct ip_vs_conn *cp, struct sk_buff *skb)
static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
struct sk_buff **pskb, int *diff)
{
struct iphdr *iph;
struct tcphdr *th;
......@@ -148,24 +155,30 @@ static int ip_vs_ftp_out(struct ip_vs_app *app,
struct ip_vs_conn *n_cp;
char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
unsigned buf_len;
int diff;
int ret;
*diff = 0;
/* Only useful for established sessions */
if (cp->state != IP_VS_TCP_S_ESTABLISHED)
return 1;
/* Linear packets are much easier to deal with. */
if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
return 0;
if (cp->app_data == &ip_vs_ftp_pasv) {
iph = skb->nh.iph;
iph = (*pskb)->nh.iph;
th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
data = (char *)th + (th->doff << 2);
data_limit = skb->tail;
data_limit = (*pskb)->tail;
if (ip_vs_ftp_get_addrport(data, data_limit,
SERVER_STRING,
sizeof(SERVER_STRING)-1, ')',
&from, &port,
&start, &end) == 0)
return 0;
&start, &end) != 1)
return 1;
IP_VS_DBG(1-debug, "PASV response (%u.%u.%u.%u:%d) -> "
"%u.%u.%u.%u:%d detected\n",
......@@ -196,29 +209,29 @@ static int ip_vs_ftp_out(struct ip_vs_app *app,
from = n_cp->vaddr;
port = n_cp->vport;
sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from),
port&255, port>>8&255);
port&255, (port>>8)&255);
buf_len = strlen(buf);
/*
* Calculate required delta-offset to keep TCP happy
*/
diff = buf_len - (end-start);
*diff = buf_len - (end-start);
if (diff == 0) {
if (*diff == 0) {
/* simply replace it with new passive address */
memcpy(start, buf, buf_len);
ret = 1;
} else {
/* fixme: return value isn't checked here */
ip_vs_skb_replace(skb, GFP_ATOMIC, start,
ret = !ip_vs_skb_replace(*pskb, GFP_ATOMIC, start,
end-start, buf, buf_len);
}
cp->app_data = NULL;
ip_vs_tcp_conn_listen(n_cp);
ip_vs_conn_put(n_cp);
return diff;
return ret;
}
return 0;
return 1;
}
......@@ -233,8 +246,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app,
* port, so that the active ftp data connection from the server can reach
* the client.
*/
static int ip_vs_ftp_in(struct ip_vs_app *app,
struct ip_vs_conn *cp, struct sk_buff *skb)
static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
struct sk_buff **pskb, int *diff)
{
struct iphdr *iph;
struct tcphdr *th;
......@@ -244,29 +257,37 @@ static int ip_vs_ftp_in(struct ip_vs_app *app,
__u16 port;
struct ip_vs_conn *n_cp;
/* no diff required for incoming packets */
*diff = 0;
/* Only useful for established sessions */
if (cp->state != IP_VS_TCP_S_ESTABLISHED)
return 1;
/* Linear packets are much easier to deal with. */
if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
return 0;
/*
* Detecting whether it is passive
*/
iph = skb->nh.iph;
iph = (*pskb)->nh.iph;
th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
/* Since there may be OPTIONS in the TCP packet and the HLEN is
the length of the header in 32-bit multiples, it is accurate
to calculate data address by th+HLEN*4 */
data = data_start = (char *)th + (th->doff << 2);
data_limit = skb->tail;
data_limit = (*pskb)->tail;
while (data < data_limit) {
while (data <= data_limit - 6) {
if (strnicmp(data, "PASV\r\n", 6) == 0) {
/* Passive mode on */
IP_VS_DBG(1-debug, "got PASV at %d of %d\n",
data - data_start,
data_limit - data_start);
cp->app_data = &ip_vs_ftp_pasv;
return 0;
return 1;
}
data++;
}
......@@ -278,28 +299,28 @@ static int ip_vs_ftp_in(struct ip_vs_app *app,
* then create a new connection entry for the coming data
* connection.
*/
data = data_start;
data_limit = skb->h.raw + skb->len - 18;
if (ip_vs_ftp_get_addrport(data, data_limit,
if (ip_vs_ftp_get_addrport(data_start, data_limit,
CLIENT_STRING, sizeof(CLIENT_STRING)-1,
'\r', &to, &port,
&start, &end) == 0)
return 0;
&start, &end) != 1)
return 1;
IP_VS_DBG(1-debug, "PORT %u.%u.%u.%u:%d detected\n",
NIPQUAD(to), ntohs(port));
/* Passive mode off */
cp->app_data = NULL;
/*
* Now update or create a connection entry for it
*/
IP_VS_DBG(1-debug, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
ip_vs_proto_name(iph->protocol),
NIPQUAD(to), ntohs(port), NIPQUAD(iph->daddr), 0);
NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0);
n_cp = ip_vs_conn_in_get(iph->protocol,
to, port,
iph->daddr, htons(ntohs(cp->vport)-1));
cp->vaddr, htons(ntohs(cp->vport)-1));
if (!n_cp) {
n_cp = ip_vs_conn_new(IPPROTO_TCP,
to, port,
......@@ -320,8 +341,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app,
ip_vs_tcp_conn_listen(n_cp);
ip_vs_conn_put(n_cp);
/* no diff required for incoming packets */
return 0;
return 1;
}
......
......@@ -523,11 +523,12 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
* Locality-Based (weighted) Least-Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest;
struct ip_vs_lblc_table *tbl;
struct ip_vs_lblc_entry *en;
struct iphdr *iph = skb->nh.iph;
IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
......
......@@ -777,11 +777,12 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
* Locality-Based (weighted) Least-Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest;
struct ip_vs_lblcr_table *tbl;
struct ip_vs_lblcr_entry *en;
struct iphdr *iph = skb->nh.iph;
IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
......
......@@ -63,7 +63,7 @@ ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
* Least Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_lc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest, *least = NULL;
unsigned int loh = 0, doh;
......
......@@ -79,7 +79,7 @@ ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
* Weighted Least Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_nq_schedule(struct ip_vs_service *svc, struct iphdr *iph)
ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest, *least = NULL;
unsigned int loh = 0, doh;
......
......@@ -164,22 +164,33 @@ const char * ip_vs_state_name(__u16 proto, int state)
void
tcpudp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg)
ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
const struct sk_buff *skb,
int offset,
const char *msg)
{
char buf[128];
union ip_vs_tphdr h;
__u16 ports[2];
struct iphdr iph;
h.raw = (char *) iph + iph->ihl * 4;
if (iph->frag_off & __constant_htons(IP_OFFSET))
if (skb_copy_bits(skb, offset, &iph, sizeof(iph)) < 0)
sprintf(buf, "%s TRUNCATED", pp->name);
else if (iph.frag_off & __constant_htons(IP_OFFSET))
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
pp->name, NIPQUAD(iph.saddr),
NIPQUAD(iph.daddr));
else if (skb_copy_bits(skb, offset + iph.ihl*4, ports, sizeof(ports)) < 0)
sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
pp->name,
NIPQUAD(iph.saddr),
NIPQUAD(iph.daddr));
else
sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
pp->name,
NIPQUAD(iph->saddr),
ntohs(h.portp[0]),
NIPQUAD(iph->daddr),
ntohs(h.portp[1]));
NIPQUAD(iph.saddr),
ntohs(ports[0]),
NIPQUAD(iph.daddr),
ntohs(ports[1]));
printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}
......
......@@ -44,8 +44,11 @@ struct isakmp_hdr {
static struct ip_vs_conn *
ah_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
ah_conn_in_get(const struct sk_buff *skb,
struct ip_vs_protocol *pp,
const struct iphdr *iph,
unsigned int proto_off,
int inverse)
{
struct ip_vs_conn *cp;
......@@ -81,8 +84,8 @@ ah_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
static struct ip_vs_conn *
ah_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
const struct iphdr *iph, unsigned int proto_off, int inverse)
{
struct ip_vs_conn *cp;
......@@ -119,8 +122,8 @@ ah_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
static int
ah_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h,
ah_conn_schedule(struct sk_buff *skb,
struct ip_vs_protocol *pp,
int *verdict, struct ip_vs_conn **cpp)
{
/*
......@@ -132,12 +135,18 @@ ah_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
static void
ah_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg)
ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
int offset, const char *msg)
{
char buf[256];
struct iphdr iph;
if (skb_copy_bits(skb, offset, &iph, sizeof(iph)) < 0)
sprintf(buf, "%s TRUNCATED", pp->name);
else
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
pp->name, NIPQUAD(iph.saddr),
NIPQUAD(iph.daddr));
printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}
......
......@@ -44,8 +44,11 @@ struct isakmp_hdr {
static struct ip_vs_conn *
esp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
esp_conn_in_get(const struct sk_buff *skb,
struct ip_vs_protocol *pp,
const struct iphdr *iph,
unsigned int proto_off,
int inverse)
{
struct ip_vs_conn *cp;
......@@ -81,8 +84,8 @@ esp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
static struct ip_vs_conn *
esp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
const struct iphdr *iph, unsigned int proto_off, int inverse)
{
struct ip_vs_conn *cp;
......@@ -120,7 +123,6 @@ esp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
static int
esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h,
int *verdict, struct ip_vs_conn **cpp)
{
/*
......@@ -132,12 +134,18 @@ esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
static void
esp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg)
esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
int offset, const char *msg)
{
char buf[256];
struct iphdr iph;
if (skb_copy_bits(skb, offset, &iph, sizeof(iph)) < 0)
sprintf(buf, "%s TRUNCATED", pp->name);
else
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
pp->name, NIPQUAD(iph.saddr),
NIPQUAD(iph.daddr));
printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}
......
......@@ -28,8 +28,11 @@ static int icmp_timeouts[1] = { 1*60*HZ };
static char * icmp_state_name_table[1] = { "ICMP" };
struct ip_vs_conn *
icmp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
icmp_conn_in_get(const struct sk_buff *skb,
struct ip_vs_protocol *pp,
const struct iphdr *iph,
unsigned int proto_off,
int inverse)
{
#if 0
struct ip_vs_conn *cp;
......@@ -52,8 +55,11 @@ icmp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
}
struct ip_vs_conn *
icmp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
icmp_conn_out_get(const struct sk_buff *skb,
struct ip_vs_protocol *pp,
const struct iphdr *iph,
unsigned int proto_off,
int inverse)
{
#if 0
struct ip_vs_conn *cp;
......@@ -76,7 +82,6 @@ icmp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
static int
icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h,
int *verdict, struct ip_vs_conn **cpp)
{
*verdict = NF_ACCEPT;
......@@ -84,41 +89,51 @@ icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
}
static int
icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int size)
icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
{
if (!(iph->frag_off & __constant_htons(IP_OFFSET))) {
if (ip_compute_csum(h.raw, size)) {
IP_VS_DBG_RL_PKT(0, pp, iph, "Failed checksum for");
if (!(skb->nh.iph->frag_off & __constant_htons(IP_OFFSET))) {
if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
if (ip_vs_checksum_complete(skb, skb->nh.iph->ihl * 4)) {
IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for");
return 0;
}
}
}
return 1;
}
static void
icmp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg)
icmp_debug_packet(struct ip_vs_protocol *pp,
const struct sk_buff *skb,
int offset,
const char *msg)
{
char buf[256];
union ip_vs_tphdr h;
struct iphdr iph;
struct icmphdr icmph;
h.raw = (char *) iph + iph->ihl * 4;
if (iph->frag_off & __constant_htons(IP_OFFSET))
if (skb_copy_bits(skb, offset, &iph, sizeof(iph)) < 0)
sprintf(buf, "%s TRUNCATED", pp->name);
else if (iph.frag_off & __constant_htons(IP_OFFSET))
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
pp->name, NIPQUAD(iph.saddr),
NIPQUAD(iph.daddr));
else if (skb_copy_bits(skb, offset + iph.ihl*4, &icmph, sizeof(icmph)) < 0)
sprintf(buf, "%s TRUNCATED to %u bytes\n",
pp->name, skb->len - offset);
else
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d",
pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr),
h.icmph->type, h.icmph->code);
pp->name, NIPQUAD(iph.saddr),
NIPQUAD(iph.daddr),
icmph.type, icmph.code);
printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
}
static int
icmp_state_transition(struct ip_vs_conn *cp,
int direction, struct iphdr *iph,
union ip_vs_tphdr h, struct ip_vs_protocol *pp)
icmp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
struct ip_vs_protocol *pp)
{
cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL];
return 1;
......
......@@ -21,52 +21,68 @@
#include <linux/tcp.h> /* for tcphdr */
#include <net/ip.h>
#include <net/tcp.h> /* for csum_tcpudp_magic */
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/ip_vs.h>
static struct ip_vs_conn *
tcp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
const struct iphdr *iph, unsigned int proto_off, int inverse)
{
__u16 ports[2];
if (skb_copy_bits(skb, proto_off, ports, sizeof(ports)) < 0)
return NULL;
if (likely(!inverse)) {
return ip_vs_conn_in_get(iph->protocol,
iph->saddr, h.th->source,
iph->daddr, h.th->dest);
iph->saddr, ports[0],
iph->daddr, ports[1]);
} else {
return ip_vs_conn_in_get(iph->protocol,
iph->daddr, h.th->dest,
iph->saddr, h.th->source);
iph->daddr, ports[1],
iph->saddr, ports[0]);
}
}
static struct ip_vs_conn *
tcp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
const struct iphdr *iph, unsigned int proto_off, int inverse)
{
__u16 ports[2];
if (skb_copy_bits(skb, proto_off, ports, sizeof(ports)) < 0)
return NULL;
if (likely(!inverse)) {
return ip_vs_conn_out_get(iph->protocol,
iph->saddr, h.th->source,
iph->daddr, h.th->dest);
iph->saddr, ports[0],
iph->daddr, ports[1]);
} else {
return ip_vs_conn_out_get(iph->protocol,
iph->daddr, h.th->dest,
iph->saddr, h.th->source);
iph->daddr, ports[1],
iph->saddr, ports[0]);
}
}
static int
tcp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h,
tcp_conn_schedule(struct sk_buff *skb,
struct ip_vs_protocol *pp,
int *verdict, struct ip_vs_conn **cpp)
{
struct ip_vs_service *svc;
struct tcphdr tcph;
if (h.th->syn &&
(svc = ip_vs_service_get(skb->nfmark, iph->protocol,
iph->daddr, h.portp[1]))) {
if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &tcph, sizeof(tcph)) < 0) {
*verdict = NF_DROP;
return 0;
}
if (tcph.syn &&
(svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
skb->nh.iph->daddr, tcph.dest))) {
if (ip_vs_todrop()) {
/*
* It seems that we are very loaded.
......@@ -81,9 +97,9 @@ tcp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the
* incoming connection, and create a connection entry.
*/
*cpp = ip_vs_schedule(svc, iph);
*cpp = ip_vs_schedule(svc, skb);
if (!*cpp) {
*verdict = ip_vs_leave(svc, skb, pp, h);
*verdict = ip_vs_leave(svc, skb, pp);
return 0;
}
ip_vs_service_put(svc);
......@@ -93,111 +109,128 @@ tcp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
static inline void
tcp_fast_csum_update(union ip_vs_tphdr *h, u32 oldip, u32 newip,
tcp_fast_csum_update(struct tcphdr *tcph, u32 oldip, u32 newip,
u16 oldport, u16 newport)
{
h->th->check =
tcph->check =
ip_vs_check_diff(~oldip, newip,
ip_vs_check_diff(oldport ^ 0xFFFF,
newport, h->th->check));
newport, tcph->check));
}
static int
tcp_snat_handler(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
struct iphdr *iph, union ip_vs_tphdr h, int size)
tcp_snat_handler(struct sk_buff **pskb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
int ihl = (char *) h.raw - (char *) iph;
struct tcphdr *tcph;
unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
/* We are sure that we work on first fragment */
/* csum_check requires unshared skb */
if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
return 0;
h.th->source = cp->vport;
if (unlikely(cp->app != NULL)) {
/* Some checks before mangling */
if (pp->csum_check && !pp->slave && !pp->csum_check(*pskb, pp))
return 0;
/* Call application helper if needed */
if (ip_vs_app_pkt_out(cp, skb) != 0) {
/* skb data has probably changed, update pointers */
iph = skb->nh.iph;
h.raw = (char*)iph + ihl;
size = skb->len - ihl;
if (!ip_vs_app_pkt_out(cp, pskb))
return 0;
}
tcph = (void *)(*pskb)->nh.iph + tcphoff;
tcph->source = cp->vport;
/* Adjust TCP checksums */
if (!cp->app) {
/* Only port and addr are changed, do fast csum update */
tcp_fast_csum_update(&h, cp->daddr, cp->vaddr,
tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr,
cp->dport, cp->vport);
if (skb->ip_summed == CHECKSUM_HW)
skb->ip_summed = CHECKSUM_NONE;
if ((*pskb)->ip_summed == CHECKSUM_HW)
(*pskb)->ip_summed = CHECKSUM_NONE;
} else {
/* full checksum calculation */
h.th->check = 0;
skb->csum = csum_partial(h.raw, size, 0);
h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
size, iph->protocol,
skb->csum);
tcph->check = 0;
(*pskb)->csum = skb_checksum(*pskb, tcphoff,
(*pskb)->len - tcphoff, 0);
tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
(*pskb)->len - tcphoff,
cp->protocol,
(*pskb)->csum);
IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%d)\n",
pp->name, h.th->check,
(char*)&(h.th->check) - (char*)h.raw);
pp->name, tcph->check,
(char*)&(tcph->check) - (char*)tcph);
}
return 1;
}
static int
tcp_dnat_handler(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
struct iphdr *iph, union ip_vs_tphdr h, int size)
tcp_dnat_handler(struct sk_buff **pskb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
int ihl = (char *) h.raw - (char *) iph;
struct tcphdr *tcph;
unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
/* We are sure that we work on first fragment */
/* csum_check requires unshared skb */
if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
return 0;
h.th->dest = cp->dport;
if (unlikely(cp->app != NULL)) {
/* Some checks before mangling */
if (pp->csum_check && !pp->slave && !pp->csum_check(*pskb, pp))
return 0;
/*
* Attempt ip_vs_app call.
* It will fix ip_vs_conn and iph ack_seq stuff
*/
if (ip_vs_app_pkt_in(cp, skb) != 0) {
/* skb data has probably changed, update pointers */
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
size = skb->len - ihl;
if (!ip_vs_app_pkt_in(cp, pskb))
return 0;
}
tcph = (void *)(*pskb)->nh.iph + tcphoff;
tcph->dest = cp->dport;
/*
* Adjust TCP/UDP checksums
* Adjust TCP checksums
*/
if (!cp->app) {
/* Only port and addr are changed, do fast csum update */
tcp_fast_csum_update(&h, cp->vaddr, cp->daddr,
tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr,
cp->vport, cp->dport);
if (skb->ip_summed == CHECKSUM_HW)
skb->ip_summed = CHECKSUM_NONE;
if ((*pskb)->ip_summed == CHECKSUM_HW)
(*pskb)->ip_summed = CHECKSUM_NONE;
} else {
/* full checksum calculation */
h.th->check = 0;
h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
size, iph->protocol,
csum_partial(h.raw, size, 0));
skb->ip_summed = CHECKSUM_UNNECESSARY;
tcph->check = 0;
(*pskb)->csum = skb_checksum(*pskb, tcphoff,
(*pskb)->len - tcphoff, 0);
tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
(*pskb)->len - tcphoff,
cp->protocol,
(*pskb)->csum);
(*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
}
return 1;
}
static int
tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int size)
tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
{
unsigned int tcphoff = skb->nh.iph->ihl*4;
switch (skb->ip_summed) {
case CHECKSUM_NONE:
skb->csum = csum_partial(h.raw, size, 0);
skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
case CHECKSUM_HW:
if (csum_tcpudp_magic(iph->saddr, iph->daddr, size,
iph->protocol, skb->csum)) {
IP_VS_DBG_RL_PKT(0, pp, iph,
if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
skb->len - tcphoff,
skb->nh.iph->protocol, skb->csum)) {
IP_VS_DBG_RL_PKT(0, pp, skb, 0,
"Failed checksum for");
return 0;
}
......@@ -383,10 +416,9 @@ static inline int tcp_state_idx(struct tcphdr *th)
static inline void
set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
int direction, union ip_vs_tphdr h)
int direction, struct tcphdr *th)
{
int state_idx;
struct tcphdr *th = h.th;
int new_state = IP_VS_TCP_S_CLOSE;
int state_off = tcp_state_off[direction];
......@@ -448,12 +480,17 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
* Handle state transitions
*/
static int
tcp_state_transition(struct ip_vs_conn *cp,
int direction, struct iphdr *iph,
union ip_vs_tphdr h, struct ip_vs_protocol *pp)
tcp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
struct ip_vs_protocol *pp)
{
struct tcphdr tcph;
if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &tcph, sizeof(tcph)) < 0)
return 0;
spin_lock(&cp->lock);
set_tcp_state(pp, cp, direction, h);
set_tcp_state(pp, cp, direction, &tcph);
spin_unlock(&cp->lock);
return 1;
......@@ -574,9 +611,6 @@ static void tcp_exit(struct ip_vs_protocol *pp)
}
extern void
tcpudp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg);
struct ip_vs_protocol ip_vs_protocol_tcp = {
.name = "TCP",
.protocol = IPPROTO_TCP,
......@@ -599,7 +633,7 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
.state_name = tcp_state_name,
.state_transition = tcp_state_transition,
.app_conn_bind = tcp_app_conn_bind,
.debug_packet = tcpudp_debug_packet,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = tcp_timeout_change,
.set_state_timeout = tcp_set_state_timeout,
};
......@@ -16,25 +16,29 @@
*/
#include <linux/kernel.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#include <net/ip_vs.h>
static struct ip_vs_conn *
udp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
const struct iphdr *iph, unsigned int proto_off, int inverse)
{
struct ip_vs_conn *cp;
__u16 ports[2];
if (skb_copy_bits(skb, proto_off, ports, sizeof(ports)) < 0)
return NULL;
if (likely(!inverse)) {
cp = ip_vs_conn_in_get(iph->protocol,
iph->saddr, h.portp[0],
iph->daddr, h.portp[1]);
iph->saddr, ports[0],
iph->daddr, ports[1]);
} else {
cp = ip_vs_conn_in_get(iph->protocol,
iph->daddr, h.portp[1],
iph->saddr, h.portp[0]);
iph->daddr, ports[1],
iph->saddr, ports[0]);
}
return cp;
......@@ -42,19 +46,23 @@ udp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
static struct ip_vs_conn *
udp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse)
udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
const struct iphdr *iph, unsigned int proto_off, int inverse)
{
struct ip_vs_conn *cp;
__u16 ports[2];
if (skb_copy_bits(skb, skb->nh.iph->ihl*4, ports, sizeof(ports)) < 0)
return NULL;
if (likely(!inverse)) {
cp = ip_vs_conn_out_get(iph->protocol,
iph->saddr, h.portp[0],
iph->daddr, h.portp[1]);
iph->saddr, ports[0],
iph->daddr, ports[1]);
} else {
cp = ip_vs_conn_out_get(iph->protocol,
iph->daddr, h.portp[1],
iph->saddr, h.portp[0]);
iph->daddr, ports[1],
iph->saddr, ports[0]);
}
return cp;
......@@ -63,13 +71,18 @@ udp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
/*
 *	Schedule a new IPVS connection for an incoming UDP packet: look up
 *	the virtual service by fwmark/protocol/daddr/dport, let it pick a
 *	real server, and create the connection entry in *cpp.
 *	Returns 1 to continue processing, 0 with *verdict set otherwise.
 *	NOTE(review): the ip_vs_todrop() branch interior was elided in the
 *	diff view; reconstructed from the TCP counterpart — confirm against
 *	the committed ip_vs_proto_udp.c.
 */
static int
udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
		  int *verdict, struct ip_vs_conn **cpp)
{
	struct ip_vs_service *svc;
	struct udphdr udph;

	/* non-linear safe read of the UDP header */
	if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &udph, sizeof(udph)) < 0) {
		*verdict = NF_DROP;
		return 0;
	}

	if ((svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
				     skb->nh.iph->daddr, udph.dest))) {
		if (ip_vs_todrop()) {
			/*
			 * It seems that we are very loaded.
			 * We have to drop this packet :(
			 */
			ip_vs_service_put(svc);
			*verdict = NF_DROP;
			return 0;
		}

		/*
		 * Let the virtual server select a real server for the
		 * incoming connection, and create a connection entry.
		 */
		*cpp = ip_vs_schedule(svc, skb);
		if (!*cpp) {
			*verdict = ip_vs_leave(svc, skb, pp);
			return 0;
		}
		ip_vs_service_put(svc);
	}
	return 1;
}
......@@ -96,121 +109,145 @@ udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
static inline void
udp_fast_csum_update(union ip_vs_tphdr *h, u32 oldip, u32 newip,
udp_fast_csum_update(struct udphdr *uhdr, u32 oldip, u32 newip,
u16 oldport, u16 newport)
{
h->uh->check =
uhdr->check =
ip_vs_check_diff(~oldip, newip,
ip_vs_check_diff(oldport ^ 0xFFFF,
newport, h->uh->check));
if (!h->uh->check)
h->uh->check = 0xFFFF;
newport, uhdr->check));
if (!uhdr->check)
uhdr->check = 0xFFFF;
}
static int
udp_snat_handler(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
struct iphdr *iph, union ip_vs_tphdr h, int size)
udp_snat_handler(struct sk_buff **pskb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
int ihl = (char *) h.raw - (char *) iph;
struct udphdr *udph;
unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
/* We are sure that we work on first fragment */
/* csum_check requires unshared skb */
if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
return 0;
h.portp[0] = cp->vport;
if (unlikely(cp->app != NULL)) {
/* Some checks before mangling */
if (pp->csum_check && !pp->slave && !pp->csum_check(*pskb, pp))
return 0;
/*
* Call application helper if needed
*/
if (ip_vs_app_pkt_out(cp, skb) != 0) {
/* skb data has probably changed, update pointers */
iph = skb->nh.iph;
h.raw = (char*)iph + ihl;
size = skb->len - ihl;
if (!ip_vs_app_pkt_out(cp, pskb))
return 0;
}
udph = (void *)(*pskb)->nh.iph + udphoff;
udph->source = cp->vport;
/*
* Adjust UDP checksums
*/
if (!cp->app && (h.uh->check != 0)) {
if (!cp->app && (udph->check != 0)) {
/* Only port and addr are changed, do fast csum update */
udp_fast_csum_update(&h, cp->daddr, cp->vaddr,
udp_fast_csum_update(udph, cp->daddr, cp->vaddr,
cp->dport, cp->vport);
if (skb->ip_summed == CHECKSUM_HW)
skb->ip_summed = CHECKSUM_NONE;
if ((*pskb)->ip_summed == CHECKSUM_HW)
(*pskb)->ip_summed = CHECKSUM_NONE;
} else {
/* full checksum calculation */
h.uh->check = 0;
skb->csum = csum_partial(h.raw, size, 0);
h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
size, iph->protocol,
skb->csum);
if (h.uh->check == 0)
h.uh->check = 0xFFFF;
udph->check = 0;
(*pskb)->csum = skb_checksum(*pskb, udphoff,
(*pskb)->len - udphoff, 0);
udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
(*pskb)->len - udphoff,
cp->protocol,
(*pskb)->csum);
if (udph->check == 0)
udph->check = 0xFFFF;
IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%d)\n",
pp->name, h.uh->check,
(char*)&(h.uh->check) - (char*)h.raw);
pp->name, udph->check,
(char*)&(udph->check) - (char*)udph);
}
return 1;
}
static int
udp_dnat_handler(struct sk_buff *skb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
struct iphdr *iph, union ip_vs_tphdr h, int size)
udp_dnat_handler(struct sk_buff **pskb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
{
int ihl = (char *) h.raw - (char *) iph;
struct udphdr *udph;
unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
/* We are sure that we work on first fragment */
/* csum_check requires unshared skb */
if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
return 0;
h.portp[1] = cp->dport;
if (unlikely(cp->app != NULL)) {
/* Some checks before mangling */
if (pp->csum_check && !pp->slave && !pp->csum_check(*pskb, pp))
return 0;
/*
* Attempt ip_vs_app call.
* will fix ip_vs_conn and iph ack_seq stuff
* It will fix ip_vs_conn
*/
if (ip_vs_app_pkt_in(cp, skb) != 0) {
/* skb data has probably changed, update pointers */
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
size = skb->len - ihl;
if (!ip_vs_app_pkt_in(cp, pskb))
return 0;
}
udph = (void *)(*pskb)->nh.iph + udphoff;
udph->dest = cp->dport;
/*
* Adjust UDP checksums
*/
if (!cp->app && (h.uh->check != 0)) {
if (!cp->app && (udph->check != 0)) {
/* Only port and addr are changed, do fast csum update */
udp_fast_csum_update(&h, cp->vaddr, cp->daddr,
udp_fast_csum_update(udph, cp->vaddr, cp->daddr,
cp->vport, cp->dport);
if (skb->ip_summed == CHECKSUM_HW)
skb->ip_summed = CHECKSUM_NONE;
if ((*pskb)->ip_summed == CHECKSUM_HW)
(*pskb)->ip_summed = CHECKSUM_NONE;
} else {
/* full checksum calculation */
h.uh->check = 0;
h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
size, iph->protocol,
csum_partial(h.raw, size, 0));
if (h.uh->check == 0)
h.uh->check = 0xFFFF;
skb->ip_summed = CHECKSUM_UNNECESSARY;
udph->check = 0;
(*pskb)->csum = skb_checksum(*pskb, udphoff,
(*pskb)->len - udphoff, 0);
udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
(*pskb)->len - udphoff,
cp->protocol,
(*pskb)->csum);
if (udph->check == 0)
udph->check = 0xFFFF;
(*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
}
return 1;
}
static int
udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int size)
udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
{
if (h.uh->check != 0) {
struct udphdr udph;
unsigned int udphoff = skb->nh.iph->ihl*4;
if (skb_copy_bits(skb, udphoff, &udph, sizeof(udph)) < 0)
return 0;
if (udph.check != 0) {
switch (skb->ip_summed) {
case CHECKSUM_NONE:
skb->csum = csum_partial(h.raw, size, 0);
skb->csum = skb_checksum(skb, udphoff,
skb->len - udphoff, 0);
case CHECKSUM_HW:
if (csum_tcpudp_magic(iph->saddr, iph->daddr, size,
iph->protocol, skb->csum)) {
IP_VS_DBG_RL_PKT(0, pp, iph,
if (csum_tcpudp_magic(skb->nh.iph->saddr,
skb->nh.iph->daddr,
skb->len - udphoff,
skb->nh.iph->protocol,
skb->csum)) {
IP_VS_DBG_RL_PKT(0, pp, skb, 0,
"Failed checksum for");
return 0;
}
......@@ -342,9 +379,9 @@ static const char * udp_state_name(int state)
}
static int
udp_state_transition(struct ip_vs_conn *cp,
int direction, struct iphdr *iph,
union ip_vs_tphdr h, struct ip_vs_protocol *pp)
udp_state_transition(struct ip_vs_conn *cp, int direction,
const struct sk_buff *skb,
struct ip_vs_protocol *pp)
{
cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
return 1;
......@@ -361,9 +398,6 @@ static void udp_exit(struct ip_vs_protocol *pp)
}
extern void
tcpudp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg);
struct ip_vs_protocol ip_vs_protocol_udp = {
.name = "UDP",
.protocol = IPPROTO_UDP,
......@@ -385,7 +419,7 @@ struct ip_vs_protocol ip_vs_protocol_udp = {
.register_app = udp_register_app,
.unregister_app = udp_unregister_app,
.app_conn_bind = udp_app_conn_bind,
.debug_packet = tcpudp_debug_packet,
.debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = NULL,
.set_state_timeout = udp_set_state_timeout,
};
......@@ -55,7 +55,7 @@ static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
* Round-Robin Scheduling
*/
static struct ip_vs_dest *
ip_vs_rr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct list_head *p, *q;
struct ip_vs_dest *dest;
......
......@@ -83,7 +83,7 @@ ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
* Weighted Least Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_sed_schedule(struct ip_vs_service *svc, struct iphdr *iph)
ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest, *least;
unsigned int loh, doh;
......
......@@ -199,10 +199,11 @@ static inline int is_overloaded(struct ip_vs_dest *dest)
* Source Hashing scheduling
*/
static struct ip_vs_dest *
ip_vs_sh_schedule(struct ip_vs_service *svc, struct iphdr *iph)
ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest;
struct ip_vs_sh_bucket *tbl;
struct iphdr *iph = skb->nh.iph;
IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
......
......@@ -71,7 +71,7 @@ ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
* Weighted Least Connection scheduling
*/
static struct ip_vs_dest *
ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph)
ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest, *least;
unsigned int loh, doh;
......
......@@ -138,7 +138,7 @@ static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
* Weighted Round-Robin Scheduling
*/
static struct ip_vs_dest *
ip_vs_wrr_schedule(struct ip_vs_service *svc, struct iphdr *iph)
ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{
struct ip_vs_dest *dest;
struct ip_vs_wrr_mark *mark = svc->sched_data;
......
......@@ -128,32 +128,11 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
}
static inline int
ip_vs_skb_cow(struct sk_buff *skb, unsigned int headroom,
struct iphdr **iph_p, unsigned char **t_p)
{
int delta = (headroom > 16 ? headroom : 16) - skb_headroom(skb);
if (delta < 0)
delta = 0;
if (delta ||skb_cloned(skb)) {
if (pskb_expand_head(skb, (delta+15)&~15, 0, GFP_ATOMIC))
return -ENOMEM;
/* skb data changed, update pointers */
*iph_p = skb->nh.iph;
*t_p = (char*) (*iph_p) + (*iph_p)->ihl * 4;
}
return 0;
}
#define IP_VS_XMIT(skb, rt) \
do { \
skb->nfcache |= NFC_IPVS_PROPERTY; \
NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, \
rt->u.dst.dev, dst_output); \
(skb)->nfcache |= NFC_IPVS_PROPERTY; \
NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
(rt)->u.dst.dev, dst_output); \
} while (0)
......@@ -188,7 +167,6 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
.daddr = iph->daddr,
.saddr = 0,
.tos = RT_TOS(tos), } },
.proto = iph->protocol,
};
EnterFunction(10);
......@@ -208,21 +186,23 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
goto tx_error;
}
if (skb_is_nonlinear(skb) && skb->len <= mtu)
ip_send_check(iph);
if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
/*
* Call ip_send_check because we are not sure it is called
* after ip_defrag. Is copy-on-write needed?
*/
if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
ip_rt_put(rt);
IP_VS_ERR_RL("ip_vs_bypass_xmit(): no memory\n");
goto tx_error;
}
return NF_STOLEN;
}
ip_send_check(skb->nh.iph);
/* drop old route */
dst_release(skb->dst);
skb->dst = &rt->u.dst;
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
#ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */
......@@ -234,8 +214,8 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp:
dst_link_failure(skb);
tx_error:
kfree_skb(skb);
return NF_STOLEN;
LeaveFunction(10);
return NF_DROP;
}
......@@ -248,45 +228,18 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp)
{
struct rtable *rt; /* Route to the other host */
struct iphdr *iph;
union ip_vs_tphdr h;
int ihl;
unsigned short size;
int mtu;
struct iphdr *iph = skb->nh.iph;
EnterFunction(10);
/*
* If it has ip_vs_app helper, the helper may change the payload,
* so it needs full checksum checking and checksum calculation.
* If not, only the header (such as IP address and port number)
* will be changed, so it is fast to do incremental checksum update,
* and let the destination host do final checksum checking.
*/
if (unlikely(cp->app && !pp->slave)) {
if (skb_is_nonlinear(skb) &&
skb_linearize(skb, GFP_ATOMIC) != 0)
return NF_DROP;
}
iph = skb->nh.iph;
ihl = iph->ihl << 2;
h.raw = (char*) iph + ihl;
size = ntohs(iph->tot_len) - ihl;
/* do TCP/UDP checksum checking if it has application helper */
if (unlikely(cp->app && pp->csum_check && !pp->slave)) {
if (!pp->csum_check(skb, pp, iph, h, size))
goto tx_error;
}
/*
* Check if it is no clinet port connection ...
*/
/* check if it is a connection of no-client-port */
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
ip_vs_conn_fill_cport(cp, h.portp[0]);
IP_VS_DBG(10, "filled cport=%d\n", ntohs(cp->dport));
__u16 pt;
if (skb_copy_bits(skb, iph->ihl*4, &pt, sizeof(pt)) < 0)
goto tx_error;
ip_vs_conn_fill_cport(cp, pt);
IP_VS_DBG(10, "filled cport=%d\n", ntohs(pt));
}
if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
......@@ -297,33 +250,36 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL_PKT(0, pp, iph, "ip_vs_nat_xmit(): frag needed for");
IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
goto tx_error;
}
/* copy-on-write the packet before mangling it */
if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr)))
goto tx_error_put;
if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
goto tx_error_put;
/* drop old route */
dst_release(skb->dst);
skb->dst = &rt->u.dst;
/* copy-on-write the packet before mangling it */
if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, &iph, &h.raw))
return NF_DROP;
/* mangle the packet */
iph->daddr = cp->daddr;
if (pp->dnat_handler) {
pp->dnat_handler(skb, pp, cp, iph, h, size);
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
}
ip_send_check(iph);
if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp))
goto tx_error;
skb->nh.iph->daddr = cp->daddr;
ip_send_check(skb->nh.iph);
IP_VS_DBG_PKT(10, pp, iph, "After DNAT");
IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still
MTU problem. */
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
#ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */
......@@ -335,8 +291,11 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp:
dst_link_failure(skb);
tx_error:
kfree_skb(skb);
return NF_STOLEN;
LeaveFunction(10);
return NF_DROP;
tx_error_put:
ip_rt_put(rt);
goto tx_error;
}
......@@ -405,11 +364,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
goto tx_error;
}
if (skb_is_nonlinear(skb))
ip_send_check(old_iph);
skb->h.raw = skb->nh.raw;
/*
* Okay, now see if we can stuff it in the buffer as-is.
*/
......@@ -421,14 +375,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
skb_realloc_headroom(skb, max_headroom);
if (!new_skb) {
ip_rt_put(rt);
kfree_skb(skb);
IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
return -EINVAL;
return NF_DROP;
}
kfree_skb(skb);
skb = new_skb;
old_iph = skb->nh.iph;
}
skb->h.raw = (void *) old_iph;
/* fix old IP header checksum */
ip_send_check(old_iph);
skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
......@@ -453,9 +412,14 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_send_check(iph);
skb->ip_summed = CHECKSUM_NONE;
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
#ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */
IP_VS_XMIT(skb, rt);
LeaveFunction(10);
......@@ -465,8 +429,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp:
dst_link_failure(skb);
tx_error:
kfree_skb(skb);
return NF_STOLEN;
LeaveFunction(10);
return NF_DROP;
}
......@@ -496,21 +460,23 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
goto tx_error;
}
if (skb_is_nonlinear(skb) && skb->len <= mtu)
ip_send_check(iph);
if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) {
if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) {
/*
* Call ip_send_check because we are not sure it is called
* after ip_defrag. Is copy-on-write needed?
*/
if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
ip_rt_put(rt);
IP_VS_ERR_RL("ip_vs_dr_xmit(): no memory\n");
goto tx_error;
}
return NF_STOLEN;
}
ip_send_check(skb->nh.iph);
/* drop old route */
dst_release(skb->dst);
skb->dst = &rt->u.dst;
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
#ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */
......@@ -522,8 +488,8 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp:
dst_link_failure(skb);
tx_error:
kfree_skb(skb);
return NF_STOLEN;
LeaveFunction(10);
return NF_DROP;
}
......@@ -533,14 +499,9 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
*/
int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp)
struct ip_vs_protocol *pp, int offset)
{
struct rtable *rt; /* Route to the other host */
struct iphdr *iph;
struct icmphdr *icmph;
struct iphdr *ciph; /* The ip header contained within the ICMP */
unsigned short len;
union ip_vs_tphdr h;
int mtu;
int rc;
......@@ -559,60 +520,37 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
goto out;
}
iph = skb->nh.iph;
icmph = (struct icmphdr *)((char *)iph+(iph->ihl<<2));
len = ntohs(iph->tot_len) - (iph->ihl<<2);
/*
* mangle and send the packet here (only for VS/NAT)
*/
if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
goto tx_error_icmp;
/* MTU checking */
mtu = dst_pmtu(&rt->u.dst);
if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
goto tx_error;
}
/* drop old route */
dst_release(skb->dst);
skb->dst = &rt->u.dst;
/* copy-on-write the packet before mangling it */
if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len,
&iph, (unsigned char**)&icmph)) {
rc = NF_DROP;
goto out;
}
ciph = (struct iphdr *) (icmph + 1);
h.raw = (char *) ciph + (ciph->ihl << 2);
/* The ICMP packet for VS/NAT must be written to correct addresses
before being forwarded to the right server */
if (!ip_vs_make_skb_writable(&skb, offset))
goto tx_error_put;
/* First change the dest IP address, and recalc checksum */
iph->daddr = cp->daddr;
ip_send_check(iph);
/* Now change the *source* address in the contained IP */
ciph->saddr = cp->daddr;
ip_send_check(ciph);
if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
goto tx_error_put;
/* the TCP/UDP source port - cannot redo check */
if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol)
h.portp[0] = cp->dport;
/* drop the old route when skb is not shared */
dst_release(skb->dst);
skb->dst = &rt->u.dst;
/* And finally the ICMP checksum */
icmph->checksum = 0;
icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
skb->ip_summed = CHECKSUM_UNNECESSARY;
ip_vs_nat_icmp(skb, pp, cp, 0);
IP_VS_DBG_PKT(11, pp, ciph, "Forwarding incoming ICMP");
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
#ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 0;
......@@ -630,4 +568,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
out:
LeaveFunction(10);
return rc;
tx_error_put:
ip_rt_put(rt);
goto tx_error;
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment