Commit 54f47c5d authored by Julian Anastasov, committed by Arnaldo Carvalho de Melo

[IPVS]: Properly handle non-linear skbs.

Most of the changes come from Paul `Rusty' Russell. Now we
modify the skbs only for IPVS packets.
parent 3a9a3e7d
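
The core of the change: transport headers are no longer assumed to sit in the skb's linear data area, so reads go through skb_copy_bits()/pskb_may_pull() and in-place modifications are preceded by ip_vs_make_skb_writable(). A minimal reader sketch of that access pattern (not part of the patch; example_get_ports is a made-up name):

#include <linux/skbuff.h>
#include <linux/ip.h>

/* The old code cast a pointer into the header area:
 *	portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
 * which breaks once the transport header may live in a paged fragment.
 * skb_copy_bits() copies the bytes out regardless of skb layout. */
static int example_get_ports(const struct sk_buff *skb, __u16 ports[2])
{
	const struct iphdr *iph = skb->nh.iph;

	if (skb_copy_bits(skb, iph->ihl * 4, ports, 2 * sizeof(__u16)) < 0)
		return 0;	/* transport header missing or truncated */
	return 1;		/* ports[0]=source, ports[1]=dest, net order */
}
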
@@ -8,7 +8,7 @@
#include <asm/types.h> /* For __uXX types */ #include <asm/types.h> /* For __uXX types */
#define IP_VS_VERSION_CODE 0x010107 #define IP_VS_VERSION_CODE 0x010108
#define NVERSION(version) \ #define NVERSION(version) \
(version >> 16) & 0xFF, \ (version >> 16) & 0xFF, \
(version >> 8) & 0xFF, \ (version >> 8) & 0xFF, \
@@ -272,22 +272,22 @@ extern int ip_vs_get_debug_level(void);
if (net_ratelimit()) \ if (net_ratelimit()) \
printk(KERN_DEBUG "IPVS: " msg); \ printk(KERN_DEBUG "IPVS: " msg); \
} while (0) } while (0)
#define IP_VS_DBG_PKT(level, pp, iph, msg) \ #define IP_VS_DBG_PKT(level, pp, skb, ofs, msg) \
do { \ do { \
if (level <= ip_vs_get_debug_level()) \ if (level <= ip_vs_get_debug_level()) \
pp->debug_packet(pp, iph, msg); \ pp->debug_packet(pp, skb, ofs, msg); \
} while (0) } while (0)
#define IP_VS_DBG_RL_PKT(level, pp, iph, msg) \ #define IP_VS_DBG_RL_PKT(level, pp, skb, ofs, msg) \
do { \ do { \
if (level <= ip_vs_get_debug_level() && \ if (level <= ip_vs_get_debug_level() && \
net_ratelimit()) \ net_ratelimit()) \
pp->debug_packet(pp, iph, msg); \ pp->debug_packet(pp, skb, ofs, msg); \
} while (0) } while (0)
#else /* NO DEBUGGING at ALL */ #else /* NO DEBUGGING at ALL */
#define IP_VS_DBG(level, msg...) do {} while (0) #define IP_VS_DBG(level, msg...) do {} while (0)
#define IP_VS_DBG_RL(msg...) do {} while (0) #define IP_VS_DBG_RL(msg...) do {} while (0)
#define IP_VS_DBG_PKT(level, pp, iph, msg) do {} while (0) #define IP_VS_DBG_PKT(level, pp, skb, ofs, msg) do {} while (0)
#define IP_VS_DBG_RL_PKT(level, pp, iph, msg) do {} while (0) #define IP_VS_DBG_RL_PKT(level, pp, skb, ofs, msg) do {} while (0)
#endif #endif
#define IP_VS_BUG() BUG() #define IP_VS_BUG() BUG()
@@ -395,18 +395,6 @@ enum {
IP_VS_ICMP_S_LAST, IP_VS_ICMP_S_LAST,
}; };
/*
* Transport protocol header
*/
union ip_vs_tphdr {
unsigned char *raw;
struct udphdr *uh;
struct tcphdr *th;
struct icmphdr *icmph;
__u16 *portp;
};
/* /*
* Delta sequence info structure * Delta sequence info structure
* Each ip_vs_conn has 2 (output AND input seq. changes). * Each ip_vs_conn has 2 (output AND input seq. changes).
@@ -459,36 +447,36 @@ struct ip_vs_protocol {
void (*exit)(struct ip_vs_protocol *pp); void (*exit)(struct ip_vs_protocol *pp);
int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_protocol *pp, int (*conn_schedule)(struct sk_buff *skb,
struct iphdr *iph, union ip_vs_tphdr h, struct ip_vs_protocol *pp,
int *verdict, struct ip_vs_conn **cpp); int *verdict, struct ip_vs_conn **cpp);
struct ip_vs_conn * struct ip_vs_conn *
(*conn_in_get)(struct sk_buff *skb, (*conn_in_get)(const struct sk_buff *skb,
struct ip_vs_protocol *pp, struct iphdr *iph, struct ip_vs_protocol *pp,
union ip_vs_tphdr h, int inverse); const struct iphdr *iph,
unsigned int proto_off,
int inverse);
struct ip_vs_conn * struct ip_vs_conn *
(*conn_out_get)(struct sk_buff *skb, (*conn_out_get)(const struct sk_buff *skb,
struct ip_vs_protocol *pp, struct iphdr *iph, struct ip_vs_protocol *pp,
union ip_vs_tphdr h, int inverse); const struct iphdr *iph,
unsigned int proto_off,
int inverse);
int (*snat_handler)(struct sk_buff *skb, int (*snat_handler)(struct sk_buff **pskb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_conn *cp);
struct iphdr *iph, union ip_vs_tphdr h, int size);
int (*dnat_handler)(struct sk_buff *skb, int (*dnat_handler)(struct sk_buff **pskb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_conn *cp);
struct iphdr *iph, union ip_vs_tphdr h, int size);
int (*csum_check)(struct sk_buff *skb, int (*csum_check)(struct sk_buff *skb, struct ip_vs_protocol *pp);
struct ip_vs_protocol *pp, struct iphdr *iph,
union ip_vs_tphdr h, int size);
const char *(*state_name)(int state); const char *(*state_name)(int state);
int (*state_transition)(struct ip_vs_conn *cp, int direction, int (*state_transition)(struct ip_vs_conn *cp, int direction,
struct iphdr *iph, union ip_vs_tphdr h, const struct sk_buff *skb,
struct ip_vs_protocol *pp); struct ip_vs_protocol *pp);
int (*register_app)(struct ip_vs_app *inc); int (*register_app)(struct ip_vs_app *inc);
@@ -497,8 +485,10 @@ struct ip_vs_protocol {
int (*app_conn_bind)(struct ip_vs_conn *cp); int (*app_conn_bind)(struct ip_vs_conn *cp);
void (*debug_packet)(struct ip_vs_protocol *pp, struct iphdr *iph, void (*debug_packet)(struct ip_vs_protocol *pp,
char *msg); const struct sk_buff *skb,
int offset,
const char *msg);
void (*timeout_change)(struct ip_vs_protocol *pp, int flags); void (*timeout_change)(struct ip_vs_protocol *pp, int flags);
@@ -638,7 +628,7 @@ struct ip_vs_scheduler {
/* selecting a server from the given service */ /* selecting a server from the given service */
struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc, struct ip_vs_dest* (*schedule)(struct ip_vs_service *svc,
struct iphdr *iph); const struct sk_buff *skb);
}; };
@@ -660,13 +650,13 @@ struct ip_vs_app
__u16 port; /* port number in net order */ __u16 port; /* port number in net order */
atomic_t usecnt; /* usage counter */ atomic_t usecnt; /* usage counter */
/* output hook */ /* output hook: return false if can't linearize. diff set for TCP. */
int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *, int (*pkt_out)(struct ip_vs_app *, struct ip_vs_conn *,
struct sk_buff *); struct sk_buff **, int *diff);
/* input hook */ /* input hook: return false if can't linearize. diff set for TCP. */
int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *, int (*pkt_in)(struct ip_vs_app *, struct ip_vs_conn *,
struct sk_buff *); struct sk_buff **, int *diff);
/* ip_vs_app initializer */ /* ip_vs_app initializer */
int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *); int (*init_conn)(struct ip_vs_app *, struct ip_vs_conn *);
@@ -686,20 +676,21 @@ struct ip_vs_app
int timeouts_size; int timeouts_size;
int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_app *app, int (*conn_schedule)(struct sk_buff *skb, struct ip_vs_app *app,
struct iphdr *iph, union ip_vs_tphdr h,
int *verdict, struct ip_vs_conn **cpp); int *verdict, struct ip_vs_conn **cpp);
struct ip_vs_conn * struct ip_vs_conn *
(*conn_in_get)(struct sk_buff *skb, struct ip_vs_app *app, (*conn_in_get)(const struct sk_buff *skb, struct ip_vs_app *app,
struct iphdr *iph, union ip_vs_tphdr h, int inverse); const struct iphdr *iph, unsigned int proto_off,
int inverse);
struct ip_vs_conn * struct ip_vs_conn *
(*conn_out_get)(struct sk_buff *skb, struct ip_vs_app *app, (*conn_out_get)(const struct sk_buff *skb, struct ip_vs_app *app,
struct iphdr *iph, union ip_vs_tphdr h, int inverse); const struct iphdr *iph, unsigned int proto_off,
int inverse);
int (*state_transition)(struct ip_vs_conn *cp, int direction, int (*state_transition)(struct ip_vs_conn *cp, int direction,
struct iphdr *iph, const struct sk_buff *skb,
union ip_vs_tphdr h, struct ip_vs_app *app); struct ip_vs_app *app);
void (*timeout_change)(struct ip_vs_app *app, int flags); void (*timeout_change)(struct ip_vs_app *app, int flags);
}; };
@@ -839,8 +830,8 @@ register_ip_vs_app_inc(struct ip_vs_app *app, __u16 proto, __u16 port);
extern int ip_vs_app_inc_get(struct ip_vs_app *inc); extern int ip_vs_app_inc_get(struct ip_vs_app *inc);
extern void ip_vs_app_inc_put(struct ip_vs_app *inc); extern void ip_vs_app_inc_put(struct ip_vs_app *inc);
extern int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff *skb); extern int ip_vs_app_pkt_out(struct ip_vs_conn *, struct sk_buff **pskb);
extern int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff *skb); extern int ip_vs_app_pkt_in(struct ip_vs_conn *, struct sk_buff **pskb);
extern int ip_vs_skb_replace(struct sk_buff *skb, int pri, extern int ip_vs_skb_replace(struct sk_buff *skb, int pri,
char *o_buf, int o_len, char *n_buf, int n_len); char *o_buf, int o_len, char *n_buf, int n_len);
extern int ip_vs_app_init(void); extern int ip_vs_app_init(void);
@@ -856,6 +847,10 @@ extern void ip_vs_protocol_timeout_change(int flags);
extern int *ip_vs_create_timeout_table(int *table, int size); extern int *ip_vs_create_timeout_table(int *table, int size);
extern int extern int
ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to); ip_vs_set_state_timeout(int *table, int num, char **names, char *name, int to);
extern void
ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
int offset, const char *msg);
extern struct ip_vs_protocol ip_vs_protocol_tcp; extern struct ip_vs_protocol ip_vs_protocol_tcp;
extern struct ip_vs_protocol ip_vs_protocol_udp; extern struct ip_vs_protocol ip_vs_protocol_udp;
extern struct ip_vs_protocol ip_vs_protocol_icmp; extern struct ip_vs_protocol ip_vs_protocol_icmp;
@@ -875,9 +870,9 @@ extern int ip_vs_unbind_scheduler(struct ip_vs_service *svc);
extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name); extern struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name);
extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler); extern void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler);
extern struct ip_vs_conn * extern struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph); ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb);
extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, extern int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
struct ip_vs_protocol *pp, union ip_vs_tphdr h); struct ip_vs_protocol *pp);
/* /*
@@ -940,7 +935,7 @@ extern int ip_vs_tunnel_xmit
extern int ip_vs_dr_xmit extern int ip_vs_dr_xmit
(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp); (struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp);
extern int ip_vs_icmp_xmit extern int ip_vs_icmp_xmit
(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp); (struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, int offset);
extern void ip_vs_dst_reset(struct ip_vs_dest *dest); extern void ip_vs_dst_reset(struct ip_vs_dest *dest);
@@ -986,6 +981,11 @@ extern __inline__ char ip_vs_fwd_tag(struct ip_vs_conn *cp)
return fwd; return fwd;
} }
extern int ip_vs_make_skb_writable(struct sk_buff **pskb, int len);
extern void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct ip_vs_conn *cp, int dir);
extern u16 ip_vs_checksum_complete(struct sk_buff *skb, int offset);
static inline u16 ip_vs_check_diff(u32 old, u32 new, u16 oldsum) static inline u16 ip_vs_check_diff(u32 old, u32 new, u16 oldsum)
{ {
......
@@ -362,29 +362,18 @@ static inline void vs_seq_update(struct ip_vs_conn *cp, struct ip_vs_seq *vseq,
spin_unlock(&cp->lock); spin_unlock(&cp->lock);
} }
static inline int app_tcp_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb,
/* struct ip_vs_app *app)
* Output pkt hook. Will call bound ip_vs_app specific function
* called by ipvs packet handler, assumes previously checked cp!=NULL
* returns (new - old) skb->len diff.
*/
int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
{ {
struct ip_vs_app *app;
int diff; int diff;
struct iphdr *iph; unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
struct tcphdr *th; struct tcphdr *th;
__u32 seq; __u32 seq;
/* if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
* check if application module is bound to
* this ip_vs_conn.
*/
if ((app = cp->app) == NULL)
return 0; return 0;
iph = skb->nh.iph; th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);
th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
/* /*
* Remember seq number in case this pkt gets resized * Remember seq number in case this pkt gets resized
@@ -394,54 +383,72 @@ int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff *skb)
/* /*
* Fix seq stuff if flagged as so. * Fix seq stuff if flagged as so.
*/ */
if (cp->protocol == IPPROTO_TCP) { if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
if (cp->flags & IP_VS_CONN_F_OUT_SEQ) vs_fix_seq(&cp->out_seq, th);
vs_fix_seq(&cp->out_seq, th); if (cp->flags & IP_VS_CONN_F_IN_SEQ)
if (cp->flags & IP_VS_CONN_F_IN_SEQ) vs_fix_ack_seq(&cp->in_seq, th);
vs_fix_ack_seq(&cp->in_seq, th);
}
/* /*
* Call private output hook function * Call private output hook function
*/ */
if (app->pkt_out == NULL) if (app->pkt_out == NULL)
return 0; return 1;
diff = app->pkt_out(app, cp, skb); if (!app->pkt_out(app, cp, pskb, &diff))
return 0;
/* /*
* Update ip_vs seq stuff if len has changed. * Update ip_vs seq stuff if len has changed.
*/ */
if (diff != 0 && cp->protocol == IPPROTO_TCP) if (diff != 0)
vs_seq_update(cp, &cp->out_seq, vs_seq_update(cp, &cp->out_seq,
IP_VS_CONN_F_OUT_SEQ, seq, diff); IP_VS_CONN_F_OUT_SEQ, seq, diff);
return diff; return 1;
} }
/* /*
* Input pkt hook. Will call bound ip_vs_app specific function * Output pkt hook. Will call bound ip_vs_app specific function
* called by ipvs packet handler, assumes previously checked cp!=NULL. * called by ipvs packet handler, assumes previously checked cp!=NULL
* returns (new - old) skb->len diff. * returns false if it can't handle packet (oom)
*/ */
int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb) int ip_vs_app_pkt_out(struct ip_vs_conn *cp, struct sk_buff **pskb)
{ {
struct ip_vs_app *app; struct ip_vs_app *app;
int diff;
struct iphdr *iph;
struct tcphdr *th;
__u32 seq;
/* /*
* check if application module is bound to * check if application module is bound to
* this ip_vs_conn. * this ip_vs_conn.
*/ */
if ((app = cp->app) == NULL) if ((app = cp->app) == NULL)
return 1;
/* TCP is complicated */
if (cp->protocol == IPPROTO_TCP)
return app_tcp_pkt_out(cp, pskb, app);
/*
* Call private output hook function
*/
if (app->pkt_out == NULL)
return 1;
return app->pkt_out(app, cp, pskb, NULL);
}
static inline int app_tcp_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb,
struct ip_vs_app *app)
{
int diff;
unsigned int tcp_offset = (*pskb)->nh.iph->ihl*4;
struct tcphdr *th;
__u32 seq;
if (!ip_vs_make_skb_writable(pskb, tcp_offset + sizeof(*th)))
return 0; return 0;
iph = skb->nh.iph; th = (struct tcphdr *)((*pskb)->nh.raw + tcp_offset);
th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
/* /*
* Remember seq number in case this pkt gets resized * Remember seq number in case this pkt gets resized
@@ -451,29 +458,57 @@ int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff *skb)
/* /*
* Fix seq stuff if flagged as so. * Fix seq stuff if flagged as so.
*/ */
if (cp->protocol == IPPROTO_TCP) { if (cp->flags & IP_VS_CONN_F_IN_SEQ)
if (cp->flags & IP_VS_CONN_F_IN_SEQ) vs_fix_seq(&cp->in_seq, th);
vs_fix_seq(&cp->in_seq, th); if (cp->flags & IP_VS_CONN_F_OUT_SEQ)
if (cp->flags & IP_VS_CONN_F_OUT_SEQ) vs_fix_ack_seq(&cp->out_seq, th);
vs_fix_ack_seq(&cp->out_seq, th);
}
/* /*
* Call private input hook function * Call private input hook function
*/ */
if (app->pkt_in == NULL) if (app->pkt_in == NULL)
return 0; return 1;
diff = app->pkt_in(app, cp, skb); if (!app->pkt_in(app, cp, pskb, &diff))
return 0;
/* /*
* Update ip_vs seq stuff if len has changed. * Update ip_vs seq stuff if len has changed.
*/ */
if (diff != 0 && cp->protocol == IPPROTO_TCP) if (diff != 0)
vs_seq_update(cp, &cp->in_seq, vs_seq_update(cp, &cp->in_seq,
IP_VS_CONN_F_IN_SEQ, seq, diff); IP_VS_CONN_F_IN_SEQ, seq, diff);
return diff; return 1;
}
/*
* Input pkt hook. Will call bound ip_vs_app specific function
* called by ipvs packet handler, assumes previously checked cp!=NULL.
* returns false if can't handle packet (oom).
*/
int ip_vs_app_pkt_in(struct ip_vs_conn *cp, struct sk_buff **pskb)
{
struct ip_vs_app *app;
/*
* check if application module is bound to
* this ip_vs_conn.
*/
if ((app = cp->app) == NULL)
return 1;
/* TCP is complicated */
if (cp->protocol == IPPROTO_TCP)
return app_tcp_pkt_in(cp, pskb, app);
/*
* Call private input hook function
*/
if (app->pkt_in == NULL)
return 1;
return app->pkt_in(app, cp, pskb, NULL);
} }
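
For reference, a hypothetical application hook honouring the new pkt_out contract ("return false if can't linearize, diff set for TCP"); a real helper such as ip_vs_ftp would mangle the payload here. This is only a sketch, example_pkt_out is a made-up name:

#include <net/ip_vs.h>

static int example_pkt_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
			   struct sk_buff **pskb, int *diff)
{
	/* for TCP, the core has already made the TCP header writable */
	if (diff)
		*diff = 0;	/* diff is NULL for non-TCP connections */
	return 1;		/* 0 would mean "could not handle, drop" */
}
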
@@ -490,7 +525,7 @@ static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
list_for_each_entry(inc, &app->incs_list, a_list) { list_for_each_entry(inc, &app->incs_list, a_list) {
if (pos-- == 0) if (pos-- == 0)
return inc; return inc;
} }
} }
return NULL; return NULL;
@@ -499,7 +534,7 @@ static struct ip_vs_app *ip_vs_app_idx(loff_t pos)
static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos) static void *ip_vs_app_seq_start(struct seq_file *seq, loff_t *pos)
{ {
down(&__ip_vs_app_mutex); down(&__ip_vs_app_mutex);
return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN; return *pos ? ip_vs_app_idx(*pos - 1) : SEQ_START_TOKEN;
} }
@@ -511,7 +546,7 @@ static void *ip_vs_app_seq_next(struct seq_file *seq, void *v, loff_t *pos)
++*pos; ++*pos;
if (v == SEQ_START_TOKEN) if (v == SEQ_START_TOKEN)
return ip_vs_app_idx(0); return ip_vs_app_idx(0);
inc = v; inc = v;
app = inc->app; app = inc->app;
@@ -563,8 +598,8 @@ static int ip_vs_app_open(struct inode *inode, struct file *file)
static struct file_operations ip_vs_app_fops = { static struct file_operations ip_vs_app_fops = {
.owner = THIS_MODULE, .owner = THIS_MODULE,
.open = ip_vs_app_open, .open = ip_vs_app_open,
.read = seq_read, .read = seq_read,
.llseek = seq_lseek, .llseek = seq_lseek,
.release = seq_release, .release = seq_release,
}; };
......
@@ -21,6 +21,7 @@
* and others. * and others.
* *
* Changes: * Changes:
* Paul `Rusty' Russell properly handle non-linear skbs
* *
*/ */
@@ -61,10 +62,11 @@ EXPORT_SYMBOL(ip_vs_conn_put);
EXPORT_SYMBOL(ip_vs_get_debug_level); EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif #endif
EXPORT_SYMBOL(check_for_ip_vs_out); EXPORT_SYMBOL(check_for_ip_vs_out);
EXPORT_SYMBOL(ip_vs_make_skb_writable);
/* ID used in ICMP lookups */ /* ID used in ICMP lookups */
#define icmp_id(icmph) ((icmph->un).echo.id) #define icmp_id(icmph) (((icmph)->un).echo.id)
const char *ip_vs_proto_name(unsigned proto) const char *ip_vs_proto_name(unsigned proto)
{ {
@@ -156,15 +158,51 @@ ip_vs_conn_stats(struct ip_vs_conn *cp, struct ip_vs_service *svc)
static inline int static inline int
ip_vs_set_state(struct ip_vs_conn *cp, int direction, ip_vs_set_state(struct ip_vs_conn *cp, int direction,
struct iphdr *iph, union ip_vs_tphdr h, const struct sk_buff *skb,
struct ip_vs_protocol *pp) struct ip_vs_protocol *pp)
{ {
if (unlikely(!pp->state_transition)) if (unlikely(!pp->state_transition))
return 0; return 0;
return pp->state_transition(cp, direction, iph, h, pp); return pp->state_transition(cp, direction, skb, pp);
} }
int ip_vs_make_skb_writable(struct sk_buff **pskb, int writable_len)
{
struct sk_buff *skb = *pskb;
/* skb is already used, better copy skb and its payload */
if (unlikely(skb_shared(skb) || skb->sk))
goto copy_skb;
/* skb data is already used, copy it */
if (unlikely(skb_cloned(skb)))
goto copy_data;
return pskb_may_pull(skb, writable_len);
copy_data:
if (unlikely(writable_len > skb->len))
return 0;
return !pskb_expand_head(skb, 0, 0, GFP_ATOMIC);
copy_skb:
if (unlikely(writable_len > skb->len))
return 0;
skb = skb_copy(skb, GFP_ATOMIC);
if (!skb)
return 0;
BUG_ON(skb_is_nonlinear(skb));
/* Rest of kernel will get very unhappy if we pass it a
suddenly-orphaned skbuff */
if ((*pskb)->sk)
skb_set_owner_w(skb, (*pskb)->sk);
kfree_skb(*pskb);
*pskb = skb;
return 1;
}
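
A hypothetical caller sketch (example_rewrite_saddr is a made-up name) mirroring how ip_vs_out() later in this patch uses the helper: make the headers writable before mangling them, then reload any header pointers because the skb may have been copied:

#include <net/ip.h>	/* ip_send_check() */

static int example_rewrite_saddr(struct sk_buff **pskb, __u32 new_saddr)
{
	struct iphdr *iph;

	if (!ip_vs_make_skb_writable(pskb, (*pskb)->nh.iph->ihl * 4))
		return 0;		/* OOM: caller drops the packet */

	iph = (*pskb)->nh.iph;		/* re-read after a possible copy */
	iph->saddr = new_saddr;
	ip_send_check(iph);		/* recompute the IP header checksum */
	return 1;
}
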
/* /*
* IPVS persistent scheduling function * IPVS persistent scheduling function
* It creates a connection entry according to its template if exists, * It creates a connection entry according to its template if exists,
@@ -173,24 +211,24 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction,
* Protocols supported: TCP, UDP * Protocols supported: TCP, UDP
*/ */
static struct ip_vs_conn * static struct ip_vs_conn *
ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph) ip_vs_sched_persist(struct ip_vs_service *svc,
const struct sk_buff *skb,
__u16 ports[2])
{ {
struct ip_vs_conn *cp = NULL; struct ip_vs_conn *cp = NULL;
struct iphdr *iph = skb->nh.iph;
struct ip_vs_dest *dest; struct ip_vs_dest *dest;
const __u16 *portp;
struct ip_vs_conn *ct; struct ip_vs_conn *ct;
__u16 dport; /* destination port to forward */ __u16 dport; /* destination port to forward */
__u32 snet; /* source network of the client, after masking */ __u32 snet; /* source network of the client, after masking */
portp = (__u16 *)&(((char *)iph)[iph->ihl*4]);
/* Mask saddr with the netmask to adjust template granularity */ /* Mask saddr with the netmask to adjust template granularity */
snet = iph->saddr & svc->netmask; snet = iph->saddr & svc->netmask;
IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u " IP_VS_DBG(6, "p-schedule: src %u.%u.%u.%u:%u dest %u.%u.%u.%u:%u "
"mnet %u.%u.%u.%u\n", "mnet %u.%u.%u.%u\n",
NIPQUAD(iph->saddr), ntohs(portp[0]), NIPQUAD(iph->saddr), ntohs(ports[0]),
NIPQUAD(iph->daddr), ntohs(portp[1]), NIPQUAD(iph->daddr), ntohs(ports[1]),
NIPQUAD(snet)); NIPQUAD(snet));
/* /*
@@ -206,11 +244,11 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
* service, and a template like <caddr, 0, vaddr, vport, daddr, dport> * service, and a template like <caddr, 0, vaddr, vport, daddr, dport>
* is created for other persistent services. * is created for other persistent services.
*/ */
if (portp[1] == svc->port) { if (ports[1] == svc->port) {
/* Check if a template already exists */ /* Check if a template already exists */
if (svc->port != FTPPORT) if (svc->port != FTPPORT)
ct = ip_vs_conn_in_get(iph->protocol, snet, 0, ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
iph->daddr, portp[1]); iph->daddr, ports[1]);
else else
ct = ip_vs_conn_in_get(iph->protocol, snet, 0, ct = ip_vs_conn_in_get(iph->protocol, snet, 0,
iph->daddr, 0); iph->daddr, 0);
@@ -220,7 +258,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
* No template found or the dest of the connection * No template found or the dest of the connection
* template is not available. * template is not available.
*/ */
dest = svc->scheduler->schedule(svc, iph); dest = svc->scheduler->schedule(svc, skb);
if (dest == NULL) { if (dest == NULL) {
IP_VS_DBG(1, "p-schedule: no dest found.\n"); IP_VS_DBG(1, "p-schedule: no dest found.\n");
return NULL; return NULL;
@@ -235,7 +273,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
if (svc->port != FTPPORT) if (svc->port != FTPPORT)
ct = ip_vs_conn_new(iph->protocol, ct = ip_vs_conn_new(iph->protocol,
snet, 0, snet, 0,
iph->daddr, portp[1], iph->daddr,
ports[1],
dest->addr, dest->port, dest->addr, dest->port,
0, 0,
dest); dest);
@@ -277,7 +316,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
if (svc->port) if (svc->port)
return NULL; return NULL;
dest = svc->scheduler->schedule(svc, iph); dest = svc->scheduler->schedule(svc, skb);
if (dest == NULL) { if (dest == NULL) {
IP_VS_DBG(1, "p-schedule: no dest found.\n"); IP_VS_DBG(1, "p-schedule: no dest found.\n");
return NULL; return NULL;
@@ -308,15 +347,15 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
/* set destination with the found template */ /* set destination with the found template */
dest = ct->dest; dest = ct->dest;
} }
dport = portp[1]; dport = ports[1];
} }
/* /*
* Create a new connection according to the template * Create a new connection according to the template
*/ */
cp = ip_vs_conn_new(iph->protocol, cp = ip_vs_conn_new(iph->protocol,
iph->saddr, portp[0], iph->saddr, ports[0],
iph->daddr, portp[1], iph->daddr, ports[1],
dest->addr, dport, dest->addr, dport,
0, 0,
dest); dest);
@@ -343,23 +382,26 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct iphdr *iph)
* Protocols supported: TCP, UDP * Protocols supported: TCP, UDP
*/ */
struct ip_vs_conn * struct ip_vs_conn *
ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph) ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{ {
struct ip_vs_conn *cp = NULL; struct ip_vs_conn *cp = NULL;
struct iphdr *iph = skb->nh.iph;
struct ip_vs_dest *dest; struct ip_vs_dest *dest;
const __u16 *portp; __u16 ports[2];
if (skb_copy_bits(skb, iph->ihl*4, ports, sizeof(ports)) < 0)
return NULL;
/* /*
* Persistent service * Persistent service
*/ */
if (svc->flags & IP_VS_SVC_F_PERSISTENT) if (svc->flags & IP_VS_SVC_F_PERSISTENT)
return ip_vs_sched_persist(svc, iph); return ip_vs_sched_persist(svc, skb, ports);
/* /*
* Non-persistent service * Non-persistent service
*/ */
portp = (__u16 *)&(((char *)iph)[iph->ihl*4]); if (!svc->fwmark && ports[1] != svc->port) {
if (!svc->fwmark && portp[1] != svc->port) {
if (!svc->port) if (!svc->port)
IP_VS_ERR("Schedule: port zero only supported " IP_VS_ERR("Schedule: port zero only supported "
"in persistent services, " "in persistent services, "
@@ -367,7 +409,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph)
return NULL; return NULL;
} }
dest = svc->scheduler->schedule(svc, iph); dest = svc->scheduler->schedule(svc, skb);
if (dest == NULL) { if (dest == NULL) {
IP_VS_DBG(1, "Schedule: no dest found.\n"); IP_VS_DBG(1, "Schedule: no dest found.\n");
return NULL; return NULL;
@@ -377,9 +419,9 @@ ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph)
* Create a connection entry. * Create a connection entry.
*/ */
cp = ip_vs_conn_new(iph->protocol, cp = ip_vs_conn_new(iph->protocol,
iph->saddr, portp[0], iph->saddr, ports[0],
iph->daddr, portp[1], iph->daddr, ports[1],
dest->addr, dest->port?dest->port:portp[1], dest->addr, dest->port?dest->port:ports[1],
0, 0,
dest); dest);
if (cp == NULL) if (cp == NULL)
@@ -404,10 +446,16 @@ ip_vs_schedule(struct ip_vs_service *svc, struct iphdr *iph)
* no destination is available for a new connection. * no destination is available for a new connection.
*/ */
int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
struct ip_vs_protocol *pp, union ip_vs_tphdr h) struct ip_vs_protocol *pp)
{ {
__u16 ports[2];
struct iphdr *iph = skb->nh.iph; struct iphdr *iph = skb->nh.iph;
if (skb_copy_bits(skb, iph->ihl*4, ports, sizeof(ports)) < 0) {
ip_vs_service_put(svc);
return NF_DROP;
}
/* if it is fwmark-based service, the cache_bypass sysctl is up /* if it is fwmark-based service, the cache_bypass sysctl is up
and the destination is RTN_UNICAST (and not local), then create and the destination is RTN_UNICAST (and not local), then create
a cache_bypass connection entry */ a cache_bypass connection entry */
@@ -421,21 +469,19 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
/* create a new connection entry */ /* create a new connection entry */
IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n"); IP_VS_DBG(6, "ip_vs_leave: create a cache_bypass entry\n");
cp = ip_vs_conn_new(iph->protocol, cp = ip_vs_conn_new(iph->protocol,
iph->saddr, h.portp[0], iph->saddr, ports[0],
iph->daddr, h.portp[1], iph->daddr, ports[1],
0, 0, 0, 0,
IP_VS_CONN_F_BYPASS, IP_VS_CONN_F_BYPASS,
NULL); NULL);
if (cp == NULL) { if (cp == NULL)
kfree_skb(skb); return NF_DROP;
return NF_STOLEN;
}
/* statistics */ /* statistics */
ip_vs_in_stats(cp, skb); ip_vs_in_stats(cp, skb);
/* set state */ /* set state */
cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, iph, h, pp); cs = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
/* transmit the first SYN packet */ /* transmit the first SYN packet */
ret = cp->packet_xmit(skb, cp, pp); ret = cp->packet_xmit(skb, cp, pp);
@@ -451,7 +497,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
* listed in the ipvs table), pass the packets, because it is * listed in the ipvs table), pass the packets, because it is
* not ipvs job to decide to drop the packets. * not ipvs job to decide to drop the packets.
*/ */
if ((svc->port == FTPPORT) && (h.portp[1] != FTPPORT)) { if ((svc->port == FTPPORT) && (ports[1] != FTPPORT)) {
ip_vs_service_put(svc); ip_vs_service_put(svc);
return NF_ACCEPT; return NF_ACCEPT;
} }
@@ -466,8 +512,7 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
* ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ
*/ */
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
kfree_skb(skb); return NF_DROP;
return NF_STOLEN;
} }
@@ -479,22 +524,80 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
* for VS/NAT. * for VS/NAT.
*/ */
static unsigned int ip_vs_post_routing(unsigned int hooknum, static unsigned int ip_vs_post_routing(unsigned int hooknum,
struct sk_buff **skb_p, struct sk_buff **pskb,
const struct net_device *in, const struct net_device *in,
const struct net_device *out, const struct net_device *out,
int (*okfn)(struct sk_buff *)) int (*okfn)(struct sk_buff *))
{ {
struct sk_buff *skb = *skb_p; if (!((*pskb)->nfcache & NFC_IPVS_PROPERTY))
if (!(skb->nfcache & NFC_IPVS_PROPERTY))
return NF_ACCEPT; return NF_ACCEPT;
/* The packet was sent from IPVS, exit this chain */ /* The packet was sent from IPVS, exit this chain */
(*okfn)(skb); (*okfn)(*pskb);
return NF_STOLEN; return NF_STOLEN;
} }
u16 ip_vs_checksum_complete(struct sk_buff *skb, int offset)
{
return (u16) csum_fold(skb_checksum(skb, offset, skb->len - offset, 0));
}
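
A small usage sketch (example_icmp_csum_ok is a made-up name) matching the checks in ip_vs_out_icmp()/ip_vs_in_icmp() below; skb_checksum() walks the linear area and the paged fragments alike, so no linearization is needed to verify a checksum:

static int example_icmp_csum_ok(struct sk_buff *skb)
{
	unsigned int ihl = skb->nh.iph->ihl * 4;

	/* hardware verified it already, or the folded sum over the
	 * whole ICMP message (which includes its checksum) is zero */
	return skb->ip_summed == CHECKSUM_UNNECESSARY ||
	       ip_vs_checksum_complete(skb, ihl) == 0;
}
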
static inline struct sk_buff *
ip_vs_gather_frags(struct sk_buff *skb)
{
skb = ip_defrag(skb);
if (skb)
ip_send_check(skb->nh.iph);
return skb;
}
/*
* Packet has been made sufficiently writable in caller
* - inout: 1=in->out, 0=out->in
*/
void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct ip_vs_conn *cp, int inout)
{
struct iphdr *iph = skb->nh.iph;
unsigned int icmp_offset = iph->ihl*4;
struct icmphdr *icmph = (struct icmphdr *)(skb->nh.raw + icmp_offset);
struct iphdr *ciph = (struct iphdr *)(icmph + 1);
if (inout) {
iph->saddr = cp->vaddr;
ip_send_check(iph);
ciph->daddr = cp->vaddr;
ip_send_check(ciph);
} else {
iph->daddr = cp->daddr;
ip_send_check(iph);
ciph->saddr = cp->daddr;
ip_send_check(ciph);
}
/* the TCP/UDP port */
if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) {
__u16 *ports = (void *)ciph + ciph->ihl*4;
if (inout)
ports[1] = cp->vport;
else
ports[0] = cp->dport;
}
/* And finally the ICMP checksum */
icmph->checksum = 0;
icmph->checksum = ip_vs_checksum_complete(skb, icmp_offset);
skb->ip_summed = CHECKSUM_UNNECESSARY;
if (inout)
IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
"Forwarding altered outgoing ICMP");
else
IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph,
"Forwarding altered incoming ICMP");
}
/* /*
* Handle ICMP messages in the inside-to-outside direction (outgoing). * Handle ICMP messages in the inside-to-outside direction (outgoing).
@@ -503,44 +606,33 @@ static unsigned int ip_vs_post_routing(unsigned int hooknum,
* Currently handles error types - unreachable, quench, ttl exceeded. * Currently handles error types - unreachable, quench, ttl exceeded.
* (Only used in VS/NAT) * (Only used in VS/NAT)
*/ */
static int ip_vs_out_icmp(struct sk_buff **skb_p, int *related) static int ip_vs_out_icmp(struct sk_buff **pskb, int *related)
{ {
struct sk_buff *skb = *skb_p; struct sk_buff *skb = *pskb;
struct iphdr *iph; struct iphdr *iph;
struct icmphdr *icmph; struct icmphdr icmph;
struct iphdr *ciph; /* The ip header contained within the ICMP */ struct iphdr ciph; /* The ip header contained within the ICMP */
unsigned short ihl;
unsigned short len;
unsigned short clen, cihl;
struct ip_vs_conn *cp; struct ip_vs_conn *cp;
struct ip_vs_protocol *pp; struct ip_vs_protocol *pp;
union ip_vs_tphdr h; unsigned int offset, ihl, verdict;
*related = 1; *related = 1;
/* reassemble IP fragments, but will it happen in ICMP packets?? */ /* reassemble IP fragments */
if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
skb = ip_defrag(skb); skb = ip_vs_gather_frags(skb);
if (!skb) if (!skb)
return NF_STOLEN; return NF_STOLEN;
*skb_p = skb; *pskb = skb;
}
if (skb_is_nonlinear(skb)) {
if (skb_linearize(skb, GFP_ATOMIC) != 0)
return NF_DROP;
ip_send_check(skb->nh.iph);
} }
iph = skb->nh.iph; iph = skb->nh.iph;
ihl = iph->ihl << 2; offset = ihl = iph->ihl * 4;
icmph = (struct icmphdr *)((char *)iph + ihl); if (skb_copy_bits(skb, offset, &icmph, sizeof(icmph)) < 0)
len = ntohs(iph->tot_len) - ihl;
if (len < sizeof(struct icmphdr))
return NF_DROP; return NF_DROP;
IP_VS_DBG(12, "outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n", IP_VS_DBG(12, "Outgoing ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
icmph->type, ntohs(icmp_id(icmph)), icmph.type, ntohs(icmp_id(&icmph)),
NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
/* /*
@@ -550,86 +642,80 @@ static int ip_vs_out_icmp(struct sk_buff **skb_p, int *related)
* this means that some packets will manage to get a long way * this means that some packets will manage to get a long way
* down this stack and then be rejected, but that's life. * down this stack and then be rejected, but that's life.
*/ */
if ((icmph->type != ICMP_DEST_UNREACH) && if ((icmph.type != ICMP_DEST_UNREACH) &&
(icmph->type != ICMP_SOURCE_QUENCH) && (icmph.type != ICMP_SOURCE_QUENCH) &&
(icmph->type != ICMP_TIME_EXCEEDED)) { (icmph.type != ICMP_TIME_EXCEEDED)) {
*related = 0; *related = 0;
return NF_ACCEPT; return NF_ACCEPT;
} }
/* Now find the contained IP header */ /* Now find the contained IP header */
clen = len - sizeof(struct icmphdr); offset += sizeof(icmph);
if (clen < sizeof(struct iphdr)) if (skb_copy_bits(skb, offset, &ciph, sizeof(ciph)) < 0)
return NF_DROP; return NF_ACCEPT; /* The packet looks wrong, ignore */
ciph = (struct iphdr *) (icmph + 1);
cihl = ciph->ihl << 2;
if (clen < cihl)
return NF_DROP;
pp = ip_vs_proto_get(ciph->protocol); pp = ip_vs_proto_get(ciph.protocol);
if (!pp) if (!pp)
return NF_ACCEPT; return NF_ACCEPT;
/* Is the embedded protocol header present? */ /* Is the embedded protocol header present? */
if (unlikely(ciph->frag_off & __constant_htons(IP_OFFSET) && if (unlikely(ciph.frag_off & __constant_htons(IP_OFFSET) &&
(pp->minhlen || pp->dont_defrag))) (pp->minhlen || pp->dont_defrag)))
return NF_ACCEPT; return NF_ACCEPT;
/* We need at least TCP/UDP ports here */ IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for");
if (clen < cihl + pp->minhlen_icmp)
return NF_DROP;
h.raw = (char *) ciph + cihl;
/* Ensure the checksum is correct */
if (ip_compute_csum((unsigned char *) icmph, len)) {
/* Failed checksum! */
IP_VS_DBG(1, "forward ICMP: failed checksum from %d.%d.%d.%d!\n",
NIPQUAD(iph->saddr));
return NF_DROP;
}
IP_VS_DBG_PKT(11, pp, ciph, "Handling outgoing ICMP for"); offset += ciph.ihl * 4;
/* ciph content is actually <protocol, caddr, cport, daddr, dport> */ /* The embedded headers contain source and dest in reverse order */
cp = pp->conn_out_get(skb, pp, ciph, h, 1); cp = pp->conn_out_get(skb, pp, &ciph, offset, 1);
if (!cp) if (!cp)
return NF_ACCEPT; return NF_ACCEPT;
verdict = NF_DROP;
if (IP_VS_FWD_METHOD(cp) != 0) { if (IP_VS_FWD_METHOD(cp) != 0) {
IP_VS_ERR("shouldn't reach here, because the box is on the" IP_VS_ERR("shouldn't reach here, because the box is on the"
"half connection in the tun/dr module.\n"); "half connection in the tun/dr module.\n");
} }
/* Now we do real damage to this packet...! */ /* Ensure the checksum is correct */
/* First change the source IP address, and recalc checksum */ if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
iph->saddr = cp->vaddr; ip_vs_checksum_complete(skb, ihl)) {
ip_send_check(iph); /* Failed checksum! */
IP_VS_DBG(1, "Forward ICMP: failed checksum from %d.%d.%d.%d!\n",
/* Now change the *dest* address in the contained IP */ NIPQUAD(iph->saddr));
ciph->daddr = cp->vaddr; goto out;
ip_send_check(ciph); }
/* the TCP/UDP dest port - cannot redo check */ if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol)
if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol) offset += 2 * sizeof(__u16);
h.portp[1] = cp->vport; if (!ip_vs_make_skb_writable(pskb, offset))
goto out;
skb = *pskb;
/* And finally the ICMP checksum */ ip_vs_nat_icmp(skb, pp, cp, 1);
icmph->checksum = 0;
icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
skb->ip_summed = CHECKSUM_UNNECESSARY;
/* do the statistics and put it back */ /* do the statistics and put it back */
ip_vs_out_stats(cp, skb); ip_vs_out_stats(cp, skb);
__ip_vs_conn_put(cp);
IP_VS_DBG_PKT(11, pp, ciph, "Forwarding correct outgoing ICMP");
skb->nfcache |= NFC_IPVS_PROPERTY; skb->nfcache |= NFC_IPVS_PROPERTY;
verdict = NF_ACCEPT;
return NF_ACCEPT; out:
__ip_vs_conn_put(cp);
return verdict;
} }
static inline int is_tcp_reset(const struct sk_buff *skb)
{
struct tcphdr tcph;
if (skb_copy_bits(skb, skb->nh.iph->ihl * 4, &tcph, sizeof(tcph)) < 0)
return 0;
return tcph.rst;
}
/* /*
* It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT. * It is hooked at the NF_IP_FORWARD chain, used only for VS/NAT.
@@ -637,16 +723,15 @@ static int ip_vs_out_icmp(struct sk_buff **skb_p, int *related)
* rewrite addresses of the packet and send it on its way... * rewrite addresses of the packet and send it on its way...
*/ */
static unsigned int static unsigned int
ip_vs_out(unsigned int hooknum, struct sk_buff **skb_p, ip_vs_out(unsigned int hooknum, struct sk_buff **pskb,
const struct net_device *in, const struct net_device *out, const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *)) int (*okfn)(struct sk_buff *))
{ {
struct sk_buff *skb = *skb_p; struct sk_buff *skb = *pskb;
struct iphdr *iph; struct iphdr *iph;
struct ip_vs_protocol *pp; struct ip_vs_protocol *pp;
union ip_vs_tphdr h;
struct ip_vs_conn *cp; struct ip_vs_conn *cp;
int size, ihl, firstfrag; int ihl;
EnterFunction(11); EnterFunction(11);
@@ -655,10 +740,12 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **skb_p,
iph = skb->nh.iph; iph = skb->nh.iph;
if (unlikely(iph->protocol == IPPROTO_ICMP)) { if (unlikely(iph->protocol == IPPROTO_ICMP)) {
int related, verdict = ip_vs_out_icmp(skb_p, &related); int related, verdict = ip_vs_out_icmp(pskb, &related);
if (related) if (related)
return verdict; return verdict;
skb = *pskb;
iph = skb->nh.iph;
} }
pp = ip_vs_proto_get(iph->protocol); pp = ip_vs_proto_get(iph->protocol);
@@ -668,105 +755,74 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **skb_p,
/* reassemble IP fragments */ /* reassemble IP fragments */
if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) && if (unlikely(iph->frag_off & __constant_htons(IP_MF|IP_OFFSET) &&
!pp->dont_defrag)) { !pp->dont_defrag)) {
skb = ip_defrag(skb); skb = ip_vs_gather_frags(skb);
if (!skb) if (!skb)
return NF_STOLEN; return NF_STOLEN;
iph = skb->nh.iph; iph = skb->nh.iph;
*skb_p = skb; *pskb = skb;
} }
/* make sure that protocol header is available in skb data area,
note that skb data area may be reallocated. */
ihl = iph->ihl << 2; ihl = iph->ihl << 2;
firstfrag = !(iph->frag_off & __constant_htons(IP_OFFSET));
/*
* WARNING: we can work with !firstfrag packets, make sure
* each protocol handler checks for firstfrag
*/
if (firstfrag &&
!pskb_may_pull(skb, ihl+pp->minhlen))
return NF_DROP;
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
/* /*
* Check if the packet belongs to an existing entry * Check if the packet belongs to an existing entry
*/ */
cp = pp->conn_out_get(skb, pp, iph, h, 0); cp = pp->conn_out_get(skb, pp, iph, ihl, 0);
if (unlikely(!cp)) { if (unlikely(!cp)) {
if (sysctl_ip_vs_nat_icmp_send && if (sysctl_ip_vs_nat_icmp_send &&
(pp->protocol == IPPROTO_TCP || (pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP) && pp->protocol == IPPROTO_UDP)) {
ip_vs_lookup_real_service(iph->protocol, __u16 ports[2];
iph->saddr, h.portp[0])) {
/* if (skb_copy_bits(skb, ihl, ports, sizeof(ports)) < 0)
* Notify the real server: there is no existing return NF_ACCEPT; /* Not for me */
* entry if it is not RST packet or not TCP packet. if (ip_vs_lookup_real_service(iph->protocol,
*/ iph->saddr, ports[0])) {
if (!h.th->rst || iph->protocol != IPPROTO_TCP) { /*
icmp_send(skb, ICMP_DEST_UNREACH, * Notify the real server: there is no
ICMP_PORT_UNREACH, 0); * existing entry if it is not RST
kfree_skb(skb); * packet or not TCP packet.
return NF_STOLEN; */
if (iph->protocol != IPPROTO_TCP
|| !is_tcp_reset(skb)) {
icmp_send(skb,ICMP_DEST_UNREACH,
ICMP_PORT_UNREACH, 0);
return NF_DROP;
}
} }
} }
IP_VS_DBG_PKT(12, pp, iph, IP_VS_DBG_PKT(12, pp, skb, 0,
"packet continues traversal as normal"); "packet continues traversal as normal");
if (!pp->dont_defrag)
ip_send_check(iph);
return NF_ACCEPT; return NF_ACCEPT;
} }
/* IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet");
* If it has ip_vs_app helper, the helper may change the payload,
* so it needs full checksum checking and checksum calculation.
* If not, only the header (addr/port) is changed, so it is fast
* to do incremental checksum update, and let the destination host
* do final checksum checking.
*/
if (unlikely(cp->app && !pp->slave && skb_is_nonlinear(skb))) {
if (skb_linearize(skb, GFP_ATOMIC) != 0) {
ip_vs_conn_put(cp);
return NF_DROP;
}
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
}
size = skb->len - ihl;
IP_VS_DBG(11, "O-pkt: %s size=%d\n", pp->name, size);
/* do TCP/UDP checksum checking if it has application helper */
if (unlikely(cp->app && pp->csum_check && !pp->slave)) {
if (!pp->csum_check(skb, pp, iph, h, size)) {
ip_vs_conn_put(cp);
return NF_DROP;
}
}
IP_VS_DBG_PKT(11, pp, iph, "Outgoing packet"); if (!ip_vs_make_skb_writable(pskb, ihl))
goto drop;
/* mangle the packet */ /* mangle the packet */
iph->saddr = cp->vaddr; if (pp->snat_handler && !pp->snat_handler(pskb, pp, cp))
if (pp->snat_handler) { goto drop;
pp->snat_handler(skb, pp, cp, iph, h, size); skb = *pskb;
iph = skb->nh.iph; skb->nh.iph->saddr = cp->vaddr;
h.raw = (char*) iph + ihl; ip_send_check(skb->nh.iph);
}
ip_send_check(iph);
IP_VS_DBG_PKT(10, pp, iph, "After SNAT"); IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT");
ip_vs_out_stats(cp, skb); ip_vs_out_stats(cp, skb);
ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, iph, h, pp); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp);
ip_vs_conn_put(cp); ip_vs_conn_put(cp);
skb->nfcache |= NFC_IPVS_PROPERTY; skb->nfcache |= NFC_IPVS_PROPERTY;
LeaveFunction(11); LeaveFunction(11);
return NF_ACCEPT; return NF_ACCEPT;
drop:
ip_vs_conn_put(cp);
return NF_DROP;
} }
@@ -777,198 +833,185 @@ ip_vs_out(unsigned int hooknum, struct sk_buff **skb_p,
* they are changed by ipchains masquerading code. * they are changed by ipchains masquerading code.
*/ */
unsigned int unsigned int
check_for_ip_vs_out(struct sk_buff **skb_p, int (*okfn)(struct sk_buff *)) check_for_ip_vs_out(struct sk_buff **pskb, int (*okfn)(struct sk_buff *))
{ {
unsigned int ret; unsigned int ret;
ret = ip_vs_out(NF_IP_FORWARD, skb_p, NULL, NULL, NULL); ret = ip_vs_out(NF_IP_FORWARD, pskb, NULL, NULL, NULL);
if (ret != NF_ACCEPT) { if (ret != NF_ACCEPT) {
return ret; return ret;
} else { } else {
/* send the packet immediately if it is already mangled /* send the packet immediately if it is already mangled
by ip_vs_out */ by ip_vs_out */
if ((*skb_p)->nfcache & NFC_IPVS_PROPERTY) { if ((*pskb)->nfcache & NFC_IPVS_PROPERTY) {
(*okfn)(*skb_p); (*okfn)(*pskb);
return NF_STOLEN; return NF_STOLEN;
} }
} }
return NF_ACCEPT; return NF_ACCEPT;
} }
/* /*
* Handle ICMP messages in the outside-to-inside direction (incoming). * Handle ICMP messages in the outside-to-inside direction (incoming).
* Find any that might be relevant, check against existing connections, * Find any that might be relevant, check against existing connections,
* forward to the right destination host if relevant. * forward to the right destination host if relevant.
* Currently handles error types - unreachable, quench, ttl exceeded * Currently handles error types - unreachable, quench, ttl exceeded.
*/ */
static int ip_vs_in_icmp(struct sk_buff **skb_p, int *related) static int ip_vs_in_icmp(struct sk_buff **pskb, int *related)
{ {
struct sk_buff *skb = *skb_p; struct sk_buff *skb = *pskb;
struct iphdr *iph; struct iphdr *iph;
struct icmphdr *icmph; struct icmphdr icmph;
struct iphdr *ciph; /* The ip header contained within the ICMP */ struct iphdr ciph; /* The ip header contained within the ICMP */
unsigned short len;
unsigned short clen, cihl;
struct ip_vs_conn *cp; struct ip_vs_conn *cp;
struct ip_vs_protocol *pp; struct ip_vs_protocol *pp;
union ip_vs_tphdr h; unsigned int offset, ihl, verdict;
int rc;
*related = 1; *related = 1;
if (skb_is_nonlinear(skb)) {
if (skb_linearize(skb, GFP_ATOMIC) != 0) /* reassemble IP fragments */
return NF_DROP; if (skb->nh.iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) {
ip_send_check(skb->nh.iph); skb = ip_vs_gather_frags(skb);
if (!skb)
return NF_STOLEN;
*pskb = skb;
} }
iph = skb->nh.iph; iph = skb->nh.iph;
icmph = (struct icmphdr *)((char *)iph+(iph->ihl<<2)); offset = ihl = iph->ihl * 4;
len = ntohs(iph->tot_len) - (iph->ihl<<2); if (skb_copy_bits(skb, offset, &icmph, sizeof(icmph)) < 0)
if (len < sizeof(struct icmphdr))
return NF_DROP; return NF_DROP;
IP_VS_DBG(12, "icmp in (%d,%d) %u.%u.%u.%u -> %u.%u.%u.%u\n", IP_VS_DBG(12, "Incoming ICMP (%d,%d) %u.%u.%u.%u->%u.%u.%u.%u\n",
icmph->type, ntohs(icmp_id(icmph)), icmph.type, ntohs(icmp_id(&icmph)),
NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); NIPQUAD(iph->saddr), NIPQUAD(iph->daddr));
if ((icmph->type != ICMP_DEST_UNREACH) && /*
(icmph->type != ICMP_SOURCE_QUENCH) && * Work through seeing if this is for us.
(icmph->type != ICMP_TIME_EXCEEDED)) { * These checks are supposed to be in an order that means easy
* things are checked first to speed up processing.... however
* this means that some packets will manage to get a long way
* down this stack and then be rejected, but that's life.
*/
if ((icmph.type != ICMP_DEST_UNREACH) &&
(icmph.type != ICMP_SOURCE_QUENCH) &&
(icmph.type != ICMP_TIME_EXCEEDED)) {
*related = 0; *related = 0;
return NF_ACCEPT; return NF_ACCEPT;
} }
/* /* Now find the contained IP header */
* If we get here we have an ICMP error of one of the above 3 types offset += sizeof(icmph);
* Now find the contained IP header if (skb_copy_bits(skb, offset, &ciph, sizeof(ciph)) < 0)
*/ return NF_ACCEPT; /* The packet looks wrong, ignore */
clen = len - sizeof(struct icmphdr);
if (clen < sizeof(struct iphdr))
return NF_DROP;
ciph = (struct iphdr *) (icmph + 1);
cihl = ciph->ihl << 2;
if (clen < cihl)
return NF_DROP;
pp = ip_vs_proto_get(ciph->protocol); pp = ip_vs_proto_get(ciph.protocol);
if (!pp) if (!pp)
return NF_ACCEPT; return NF_ACCEPT;
/* Is the embedded protocol header present? */ /* Is the embedded protocol header present? */
if (unlikely(ciph->frag_off & __constant_htons(IP_OFFSET) && if (unlikely(ciph.frag_off & __constant_htons(IP_OFFSET) &&
(pp->minhlen || pp->dont_defrag))) (pp->minhlen || pp->dont_defrag)))
return NF_ACCEPT; return NF_ACCEPT;
/* We need at least TCP/UDP ports here */ IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for");
if (clen < cihl + pp->minhlen_icmp)
return NF_DROP;
/* Ensure the checksum is correct */ offset += ciph.ihl * 4;
if (ip_compute_csum((unsigned char *) icmph, len)) {
/* Failed checksum! */
IP_VS_ERR_RL("incoming ICMP: failed checksum from "
"%d.%d.%d.%d!\n", NIPQUAD(iph->saddr));
return NF_DROP;
}
h.raw = (char *) ciph + cihl; /* The embedded headers contain source and dest in reverse order */
cp = pp->conn_in_get(skb, pp, &ciph, offset, 1);
if (!cp)
return NF_ACCEPT;
IP_VS_DBG_PKT(11, pp, ciph, "Handling incoming ICMP for"); verdict = NF_DROP;
/* This is pretty much what ip_vs_conn_in_get() does, /* Ensure the checksum is correct */
except parameters are in the reverse order */ if (skb->ip_summed != CHECKSUM_UNNECESSARY &&
cp = pp->conn_in_get(skb, pp, ciph, h, 1); ip_vs_checksum_complete(skb, ihl)) {
if (cp == NULL) /* Failed checksum! */
return NF_ACCEPT; IP_VS_DBG(1, "Incoming ICMP: failed checksum from %d.%d.%d.%d!\n",
NIPQUAD(iph->saddr));
goto out;
}
/* do the statistics and put it back */
ip_vs_in_stats(cp, skb); ip_vs_in_stats(cp, skb);
rc = ip_vs_icmp_xmit(skb, cp, pp); if (IPPROTO_TCP == ciph.protocol || IPPROTO_UDP == ciph.protocol)
offset += 2 * sizeof(__u16);
verdict = ip_vs_icmp_xmit(skb, cp, pp, offset);
out:
__ip_vs_conn_put(cp); __ip_vs_conn_put(cp);
return rc;
}
return verdict;
}
/* /*
* Check if it's for virtual services, look it up, * Check if it's for virtual services, look it up,
* and send it on its way... * and send it on its way...
*/ */
static unsigned int static unsigned int
ip_vs_in(unsigned int hooknum, struct sk_buff **skb_p, ip_vs_in(unsigned int hooknum, struct sk_buff **pskb,
const struct net_device *in, const struct net_device *out, const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *)) int (*okfn)(struct sk_buff *))
{ {
struct sk_buff *skb = *skb_p; struct sk_buff *skb = *pskb;
struct iphdr *iph = skb->nh.iph; struct iphdr *iph;
struct ip_vs_protocol *pp = ip_vs_proto_get(iph->protocol); struct ip_vs_protocol *pp;
union ip_vs_tphdr h;
struct ip_vs_conn *cp; struct ip_vs_conn *cp;
int ihl, ret, restart; int ret, restart;
int firstfrag; int ihl;
/* /*
* Big tappo: only PACKET_HOST (neither loopback nor mcasts) * Big tappo: only PACKET_HOST (neither loopback nor mcasts)
* ... don't know why 1st test DOES NOT include 2nd (?) * ... don't know why 1st test DOES NOT include 2nd (?)
*/ */
if (unlikely(skb->pkt_type != PACKET_HOST || skb->dev == &loopback_dev)) { if (unlikely(skb->pkt_type != PACKET_HOST
|| skb->dev == &loopback_dev || skb->sk)) {
IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n", IP_VS_DBG(12, "packet type=%d proto=%d daddr=%d.%d.%d.%d ignored\n",
skb->pkt_type, skb->pkt_type,
iph->protocol, skb->nh.iph->protocol,
NIPQUAD(iph->daddr)); NIPQUAD(skb->nh.iph->daddr));
return NF_ACCEPT; return NF_ACCEPT;
} }
iph = skb->nh.iph;
if (unlikely(iph->protocol == IPPROTO_ICMP)) { if (unlikely(iph->protocol == IPPROTO_ICMP)) {
int related, verdict = ip_vs_in_icmp(skb_p, &related); int related, verdict = ip_vs_in_icmp(pskb, &related);
if (related) if (related)
return verdict; return verdict;
skb = *pskb;
iph = skb->nh.iph;
} }
/* Protocol supported? */ /* Protocol supported? */
pp = ip_vs_proto_get(iph->protocol);
if (unlikely(!pp)) if (unlikely(!pp))
return NF_ACCEPT; return NF_ACCEPT;
/* make sure that protocol header is available in skb data area,
note that skb data area may be reallocated. */
ihl = iph->ihl << 2; ihl = iph->ihl << 2;
#if 0
/* Enable this when not in LOCAL_IN */
firstfrag = !(iph->frag_off & __constant_htons(IP_OFFSET));
/*
* WARNING: we can work with !firstfrag packets, make sure
* each protocol handler checks for firstfrag
*/
#else
firstfrag = 1;
#endif
if (firstfrag &&
!pskb_may_pull(skb, ihl+pp->minhlen))
return NF_DROP;
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
/* /*
* Check if the packet belongs to an existing connection entry * Check if the packet belongs to an existing connection entry
*/ */
cp = pp->conn_in_get(skb, pp, iph, h, 0); cp = pp->conn_in_get(skb, pp, iph, ihl, 0);
if (unlikely(!cp)) { if (unlikely(!cp)) {
int v; int v;
if (!pp->conn_schedule(skb, pp, iph, h, &v, &cp)) { if (!pp->conn_schedule(skb, pp, &v, &cp))
return v; return v;
}
} }
if (unlikely(!cp)) { if (unlikely(!cp)) {
/* sorry, all this trouble for a no-hit :) */ /* sorry, all this trouble for a no-hit :) */
IP_VS_DBG_PKT(12, pp, iph, IP_VS_DBG_PKT(12, pp, skb, 0,
"packet continues traversal as normal"); "packet continues traversal as normal");
return NF_ACCEPT; return NF_ACCEPT;
} }
IP_VS_DBG_PKT(11, pp, iph, "Incoming packet"); IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet");
/* Check the server status */ /* Check the server status */
if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) {
@@ -986,7 +1029,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **skb_p,
} }
ip_vs_in_stats(cp, skb); ip_vs_in_stats(cp, skb);
restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, iph, h, pp); restart = ip_vs_set_state(cp, IP_VS_DIR_INPUT, skb, pp);
if (cp->packet_xmit) if (cp->packet_xmit)
ret = cp->packet_xmit(skb, cp, pp); ret = cp->packet_xmit(skb, cp, pp);
else { else {
@@ -1011,7 +1054,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **skb_p,
/* /*
* It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP * It is hooked at the NF_IP_FORWARD chain, in order to catch ICMP
* packets destined for 0.0.0.0/0. * related packets destined for 0.0.0.0/0.
* When fwmark-based virtual service is used, such as transparent * When fwmark-based virtual service is used, such as transparent
* cache cluster, TCP packets can be marked and routed to ip_vs_in, * cache cluster, TCP packets can be marked and routed to ip_vs_in,
* but ICMP destined for 0.0.0.0/0 cannot not be easily marked and * but ICMP destined for 0.0.0.0/0 cannot not be easily marked and
@@ -1019,25 +1062,16 @@ ip_vs_in(unsigned int hooknum, struct sk_buff **skb_p,
* and send them to ip_vs_in_icmp. * and send them to ip_vs_in_icmp.
*/ */
static unsigned int static unsigned int
ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **skb_p, ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff **pskb,
const struct net_device *in, const struct net_device *out, const struct net_device *in, const struct net_device *out,
int (*okfn)(struct sk_buff *)) int (*okfn)(struct sk_buff *))
{ {
struct sk_buff *skb = *skb_p;
struct iphdr *iph = skb->nh.iph;
int r; int r;
if (iph->protocol != IPPROTO_ICMP) if ((*pskb)->nh.iph->protocol != IPPROTO_ICMP)
return NF_ACCEPT; return NF_ACCEPT;
if (iph->frag_off & __constant_htons(IP_MF|IP_OFFSET)) { return ip_vs_in_icmp(pskb, &r);
skb = ip_defrag(skb);
if (!skb)
return NF_STOLEN;
*skb_p = skb;
}
return ip_vs_in_icmp(skb_p, &r);
} }
......
...@@ -202,10 +202,11 @@ static inline int is_overloaded(struct ip_vs_dest *dest) ...@@ -202,10 +202,11 @@ static inline int is_overloaded(struct ip_vs_dest *dest)
* Destination hashing scheduling * Destination hashing scheduling
*/ */
static struct ip_vs_dest * static struct ip_vs_dest *
ip_vs_dh_schedule(struct ip_vs_service *svc, struct iphdr *iph) ip_vs_dh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{ {
struct ip_vs_dest *dest; struct ip_vs_dest *dest;
struct ip_vs_dh_bucket *tbl; struct ip_vs_dh_bucket *tbl;
struct iphdr *iph = skb->nh.iph;
IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n"); IP_VS_DBG(6, "ip_vs_dh_schedule(): Scheduling...\n");
......
...@@ -87,39 +87,46 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, ...@@ -87,39 +87,46 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
__u32 *addr, __u16 *port, __u32 *addr, __u16 *port,
char **start, char **end) char **start, char **end)
{ {
unsigned char p1,p2,p3,p4,p5,p6; unsigned char p[6];
int i = 0;
if (data_limit - data < plen) {
/* check if there is partial match */
if (strnicmp(data, pattern, data_limit - data) == 0)
return -1;
else
return 0;
}
while (data < data_limit) { if (strnicmp(data, pattern, plen) != 0) {
if (strnicmp(data, pattern, plen) != 0) { return 0;
data++; }
continue; *start = data + plen;
}
*start = data+plen;
p1 = simple_strtoul(data+plen, &data, 10);
if (*data != ',')
continue;
p2 = simple_strtoul(data+1, &data, 10);
if (*data != ',')
continue;
p3 = simple_strtoul(data+1, &data, 10);
if (*data != ',')
continue;
p4 = simple_strtoul(data+1, &data, 10);
if (*data != ',')
continue;
p5 = simple_strtoul(data+1, &data, 10);
if (*data != ',')
continue;
p6 = simple_strtoul(data+1, &data, 10);
if (*data != term)
continue;
*end = data; for (data = *start; *data != term; data++) {
*addr = (p4<<24) | (p3<<16) | (p2<<8) | p1; if (data == data_limit)
*port = (p6<<8) | p5; return -1;
return 1;
} }
return 0; *end = data;
memset(p, 0, sizeof(p));
for (data = *start; data != *end; data++) {
if (*data >= '0' && *data <= '9') {
p[i] = p[i]*10 + *data - '0';
} else if (*data == ',' && i < 5) {
i++;
} else {
/* unexpected character */
return -1;
}
}
if (i != 5)
return -1;
*addr = (p[3]<<24) | (p[2]<<16) | (p[1]<<8) | p[0];
*port = (p[5]<<8) | p[4];
return 1;
} }
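The rewritten helper above locates the terminator first and then walks the digits and commas into six octets, rather than chaining simple_strtoul() calls that could run past the buffer. Below is a minimal standalone user-space sketch of the same parsing idea; the function name, sample reply and host-byte-order result are illustrative only and are not taken from the patch.

#include <stdio.h>
#include <string.h>

/* Parse "h1,h2,h3,h4,p1,p2" (the address/port sextet used in FTP PASV
 * and PORT exchanges) up to a terminator character.  Returns 1 on
 * success and fills the four address octets and the port, -1 on
 * malformed or truncated input. */
static int parse_sextet(const char *s, const char *limit, char term,
                        unsigned char addr[4], unsigned short *port)
{
        unsigned int p[6] = { 0 };
        int i = 0;
        const char *d;

        /* locate the terminator first, so the digit loop never overruns */
        for (d = s; d != limit && *d != term; d++)
                ;
        if (d == limit)
                return -1;

        for (; s != d; s++) {
                if (*s >= '0' && *s <= '9')
                        p[i] = p[i] * 10 + (*s - '0');
                else if (*s == ',' && i < 5)
                        i++;
                else
                        return -1;      /* unexpected character */
        }
        if (i != 5)
                return -1;

        addr[0] = p[0]; addr[1] = p[1]; addr[2] = p[2]; addr[3] = p[3];
        *port = (unsigned short)(p[4] * 256 + p[5]);
        return 1;
}

int main(void)
{
        const char *reply = "192,168,0,10,31,144)";
        unsigned char addr[4];
        unsigned short port;

        if (parse_sextet(reply, reply + strlen(reply), ')', addr, &port) == 1)
                printf("%u.%u.%u.%u:%u\n",
                       addr[0], addr[1], addr[2], addr[3], port);
        return 0;
}

For the sample reply this prints 192.168.0.10:8080.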
...@@ -136,8 +143,8 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, ...@@ -136,8 +143,8 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit,
* "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)". * "227 Entering Passive Mode (xxx,xxx,xxx,xxx,ppp,ppp)".
* xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number. * xxx,xxx,xxx,xxx is the server address, ppp,ppp is the server port number.
*/ */
static int ip_vs_ftp_out(struct ip_vs_app *app, static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp,
struct ip_vs_conn *cp, struct sk_buff *skb) struct sk_buff **pskb, int *diff)
{ {
struct iphdr *iph; struct iphdr *iph;
struct tcphdr *th; struct tcphdr *th;
...@@ -148,24 +155,30 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, ...@@ -148,24 +155,30 @@ static int ip_vs_ftp_out(struct ip_vs_app *app,
struct ip_vs_conn *n_cp; struct ip_vs_conn *n_cp;
char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */ char buf[24]; /* xxx.xxx.xxx.xxx,ppp,ppp\000 */
unsigned buf_len; unsigned buf_len;
int diff; int ret;
*diff = 0;
/* Only useful for established sessions */ /* Only useful for established sessions */
if (cp->state != IP_VS_TCP_S_ESTABLISHED) if (cp->state != IP_VS_TCP_S_ESTABLISHED)
return 1;
/* Linear packets are much easier to deal with. */
if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
return 0; return 0;
if (cp->app_data == &ip_vs_ftp_pasv) { if (cp->app_data == &ip_vs_ftp_pasv) {
iph = skb->nh.iph; iph = (*pskb)->nh.iph;
th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
data = (char *)th + (th->doff << 2); data = (char *)th + (th->doff << 2);
data_limit = skb->tail; data_limit = (*pskb)->tail;
if (ip_vs_ftp_get_addrport(data, data_limit, if (ip_vs_ftp_get_addrport(data, data_limit,
SERVER_STRING, SERVER_STRING,
sizeof(SERVER_STRING)-1, ')', sizeof(SERVER_STRING)-1, ')',
&from, &port, &from, &port,
&start, &end) == 0) &start, &end) != 1)
return 0; return 1;
IP_VS_DBG(1-debug, "PASV response (%u.%u.%u.%u:%d) -> " IP_VS_DBG(1-debug, "PASV response (%u.%u.%u.%u:%d) -> "
"%u.%u.%u.%u:%d detected\n", "%u.%u.%u.%u:%d detected\n",
...@@ -196,29 +209,29 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, ...@@ -196,29 +209,29 @@ static int ip_vs_ftp_out(struct ip_vs_app *app,
from = n_cp->vaddr; from = n_cp->vaddr;
port = n_cp->vport; port = n_cp->vport;
sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from), sprintf(buf,"%d,%d,%d,%d,%d,%d", NIPQUAD(from),
port&255, port>>8&255); port&255, (port>>8)&255);
buf_len = strlen(buf); buf_len = strlen(buf);
/* /*
* Calculate required delta-offset to keep TCP happy * Calculate required delta-offset to keep TCP happy
*/ */
diff = buf_len - (end-start); *diff = buf_len - (end-start);
if (diff == 0) { if (*diff == 0) {
/* simply replace it with new passive address */ /* simply replace it with new passive address */
memcpy(start, buf, buf_len); memcpy(start, buf, buf_len);
ret = 1;
} else { } else {
/* fixme: return value isn't checked here */ ret = !ip_vs_skb_replace(*pskb, GFP_ATOMIC, start,
ip_vs_skb_replace(skb, GFP_ATOMIC, start,
end-start, buf, buf_len); end-start, buf, buf_len);
} }
cp->app_data = NULL; cp->app_data = NULL;
ip_vs_tcp_conn_listen(n_cp); ip_vs_tcp_conn_listen(n_cp);
ip_vs_conn_put(n_cp); ip_vs_conn_put(n_cp);
return diff; return ret;
} }
return 0; return 1;
} }
...@@ -233,8 +246,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, ...@@ -233,8 +246,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app,
* port, so that the active ftp data connection from the server can reach * port, so that the active ftp data connection from the server can reach
* the client. * the client.
*/ */
static int ip_vs_ftp_in(struct ip_vs_app *app, static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp,
struct ip_vs_conn *cp, struct sk_buff *skb) struct sk_buff **pskb, int *diff)
{ {
struct iphdr *iph; struct iphdr *iph;
struct tcphdr *th; struct tcphdr *th;
...@@ -244,29 +257,37 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, ...@@ -244,29 +257,37 @@ static int ip_vs_ftp_in(struct ip_vs_app *app,
__u16 port; __u16 port;
struct ip_vs_conn *n_cp; struct ip_vs_conn *n_cp;
/* no diff required for incoming packets */
*diff = 0;
/* Only useful for established sessions */ /* Only useful for established sessions */
if (cp->state != IP_VS_TCP_S_ESTABLISHED) if (cp->state != IP_VS_TCP_S_ESTABLISHED)
return 1;
/* Linear packets are much easier to deal with. */
if (!ip_vs_make_skb_writable(pskb, (*pskb)->len))
return 0; return 0;
/* /*
* Detecting whether it is passive * Detecting whether it is passive
*/ */
iph = skb->nh.iph; iph = (*pskb)->nh.iph;
th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]); th = (struct tcphdr *)&(((char *)iph)[iph->ihl*4]);
/* Since there may be OPTIONS in the TCP packet and the HLEN is /* Since there may be OPTIONS in the TCP packet and the HLEN is
the length of the header in 32-bit multiples, it is accurate the length of the header in 32-bit multiples, it is accurate
to calculate data address by th+HLEN*4 */ to calculate data address by th+HLEN*4 */
data = data_start = (char *)th + (th->doff << 2); data = data_start = (char *)th + (th->doff << 2);
data_limit = skb->tail; data_limit = (*pskb)->tail;
while (data < data_limit) { while (data <= data_limit - 6) {
if (strnicmp(data, "PASV\r\n", 6) == 0) { if (strnicmp(data, "PASV\r\n", 6) == 0) {
/* Passive mode on */
IP_VS_DBG(1-debug, "got PASV at %d of %d\n", IP_VS_DBG(1-debug, "got PASV at %d of %d\n",
data - data_start, data - data_start,
data_limit - data_start); data_limit - data_start);
cp->app_data = &ip_vs_ftp_pasv; cp->app_data = &ip_vs_ftp_pasv;
return 0; return 1;
} }
data++; data++;
} }
...@@ -278,28 +299,28 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, ...@@ -278,28 +299,28 @@ static int ip_vs_ftp_in(struct ip_vs_app *app,
* then create a new connection entry for the coming data * then create a new connection entry for the coming data
* connection. * connection.
*/ */
data = data_start; if (ip_vs_ftp_get_addrport(data_start, data_limit,
data_limit = skb->h.raw + skb->len - 18;
if (ip_vs_ftp_get_addrport(data, data_limit,
CLIENT_STRING, sizeof(CLIENT_STRING)-1, CLIENT_STRING, sizeof(CLIENT_STRING)-1,
'\r', &to, &port, '\r', &to, &port,
&start, &end) == 0) &start, &end) != 1)
return 0; return 1;
IP_VS_DBG(1-debug, "PORT %u.%u.%u.%u:%d detected\n", IP_VS_DBG(1-debug, "PORT %u.%u.%u.%u:%d detected\n",
NIPQUAD(to), ntohs(port)); NIPQUAD(to), ntohs(port));
/* Passive mode off */
cp->app_data = NULL;
/* /*
* Now update or create a connection entry for it * Now update or create a connection entry for it
*/ */
IP_VS_DBG(1-debug, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n", IP_VS_DBG(1-debug, "protocol %s %u.%u.%u.%u:%d %u.%u.%u.%u:%d\n",
ip_vs_proto_name(iph->protocol), ip_vs_proto_name(iph->protocol),
NIPQUAD(to), ntohs(port), NIPQUAD(iph->daddr), 0); NIPQUAD(to), ntohs(port), NIPQUAD(cp->vaddr), 0);
n_cp = ip_vs_conn_in_get(iph->protocol, n_cp = ip_vs_conn_in_get(iph->protocol,
to, port, to, port,
iph->daddr, htons(ntohs(cp->vport)-1)); cp->vaddr, htons(ntohs(cp->vport)-1));
if (!n_cp) { if (!n_cp) {
n_cp = ip_vs_conn_new(IPPROTO_TCP, n_cp = ip_vs_conn_new(IPPROTO_TCP,
to, port, to, port,
...@@ -320,8 +341,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, ...@@ -320,8 +341,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app,
ip_vs_tcp_conn_listen(n_cp); ip_vs_tcp_conn_listen(n_cp);
ip_vs_conn_put(n_cp); ip_vs_conn_put(n_cp);
/* no diff required for incoming packets */ return 1;
return 0;
} }
......
...@@ -523,11 +523,12 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) ...@@ -523,11 +523,12 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
* Locality-Based (weighted) Least-Connection scheduling * Locality-Based (weighted) Least-Connection scheduling
*/ */
static struct ip_vs_dest * static struct ip_vs_dest *
ip_vs_lblc_schedule(struct ip_vs_service *svc, struct iphdr *iph) ip_vs_lblc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{ {
struct ip_vs_dest *dest; struct ip_vs_dest *dest;
struct ip_vs_lblc_table *tbl; struct ip_vs_lblc_table *tbl;
struct ip_vs_lblc_entry *en; struct ip_vs_lblc_entry *en;
struct iphdr *iph = skb->nh.iph;
IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n"); IP_VS_DBG(6, "ip_vs_lblc_schedule(): Scheduling...\n");
......
...@@ -777,11 +777,12 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc) ...@@ -777,11 +777,12 @@ is_overloaded(struct ip_vs_dest *dest, struct ip_vs_service *svc)
* Locality-Based (weighted) Least-Connection scheduling * Locality-Based (weighted) Least-Connection scheduling
*/ */
static struct ip_vs_dest * static struct ip_vs_dest *
ip_vs_lblcr_schedule(struct ip_vs_service *svc, struct iphdr *iph) ip_vs_lblcr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{ {
struct ip_vs_dest *dest; struct ip_vs_dest *dest;
struct ip_vs_lblcr_table *tbl; struct ip_vs_lblcr_table *tbl;
struct ip_vs_lblcr_entry *en; struct ip_vs_lblcr_entry *en;
struct iphdr *iph = skb->nh.iph;
IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n"); IP_VS_DBG(6, "ip_vs_lblcr_schedule(): Scheduling...\n");
......
...@@ -63,7 +63,7 @@ ip_vs_lc_dest_overhead(struct ip_vs_dest *dest) ...@@ -63,7 +63,7 @@ ip_vs_lc_dest_overhead(struct ip_vs_dest *dest)
* Least Connection scheduling * Least Connection scheduling
*/ */
static struct ip_vs_dest * static struct ip_vs_dest *
ip_vs_lc_schedule(struct ip_vs_service *svc, struct iphdr *iph) ip_vs_lc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{ {
struct ip_vs_dest *dest, *least = NULL; struct ip_vs_dest *dest, *least = NULL;
unsigned int loh = 0, doh; unsigned int loh = 0, doh;
......
...@@ -79,7 +79,7 @@ ip_vs_nq_dest_overhead(struct ip_vs_dest *dest) ...@@ -79,7 +79,7 @@ ip_vs_nq_dest_overhead(struct ip_vs_dest *dest)
* Weighted Least Connection scheduling * Weighted Least Connection scheduling
*/ */
static struct ip_vs_dest * static struct ip_vs_dest *
ip_vs_nq_schedule(struct ip_vs_service *svc, struct iphdr *iph) ip_vs_nq_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{ {
struct ip_vs_dest *dest, *least = NULL; struct ip_vs_dest *dest, *least = NULL;
unsigned int loh = 0, doh; unsigned int loh = 0, doh;
......
...@@ -164,22 +164,33 @@ const char * ip_vs_state_name(__u16 proto, int state) ...@@ -164,22 +164,33 @@ const char * ip_vs_state_name(__u16 proto, int state)
void void
tcpudp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg) ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp,
const struct sk_buff *skb,
int offset,
const char *msg)
{ {
char buf[128]; char buf[128];
union ip_vs_tphdr h; __u16 ports[2];
struct iphdr iph;
h.raw = (char *) iph + iph->ihl * 4; if (skb_copy_bits(skb, offset, &iph, sizeof(iph)) < 0)
if (iph->frag_off & __constant_htons(IP_OFFSET)) sprintf(buf, "%s TRUNCATED", pp->name);
else if (iph.frag_off & __constant_htons(IP_OFFSET))
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); pp->name, NIPQUAD(iph.saddr),
NIPQUAD(iph.daddr));
else if (skb_copy_bits(skb, offset + iph.ihl*4, ports, sizeof(ports)) < 0)
sprintf(buf, "%s TRUNCATED %u.%u.%u.%u->%u.%u.%u.%u",
pp->name,
NIPQUAD(iph.saddr),
NIPQUAD(iph.daddr));
else else
sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u", sprintf(buf, "%s %u.%u.%u.%u:%u->%u.%u.%u.%u:%u",
pp->name, pp->name,
NIPQUAD(iph->saddr), NIPQUAD(iph.saddr),
ntohs(h.portp[0]), ntohs(ports[0]),
NIPQUAD(iph->daddr), NIPQUAD(iph.daddr),
ntohs(h.portp[1])); ntohs(ports[1]));
printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
} }
......
...@@ -44,8 +44,11 @@ struct isakmp_hdr { ...@@ -44,8 +44,11 @@ struct isakmp_hdr {
static struct ip_vs_conn * static struct ip_vs_conn *
ah_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp, ah_conn_in_get(const struct sk_buff *skb,
struct iphdr *iph, union ip_vs_tphdr h, int inverse) struct ip_vs_protocol *pp,
const struct iphdr *iph,
unsigned int proto_off,
int inverse)
{ {
struct ip_vs_conn *cp; struct ip_vs_conn *cp;
...@@ -81,8 +84,8 @@ ah_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp, ...@@ -81,8 +84,8 @@ ah_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
static struct ip_vs_conn * static struct ip_vs_conn *
ah_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp, ah_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse) const struct iphdr *iph, unsigned int proto_off, int inverse)
{ {
struct ip_vs_conn *cp; struct ip_vs_conn *cp;
...@@ -119,8 +122,8 @@ ah_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp, ...@@ -119,8 +122,8 @@ ah_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
static int static int
ah_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, ah_conn_schedule(struct sk_buff *skb,
struct iphdr *iph, union ip_vs_tphdr h, struct ip_vs_protocol *pp,
int *verdict, struct ip_vs_conn **cpp) int *verdict, struct ip_vs_conn **cpp)
{ {
/* /*
...@@ -132,12 +135,18 @@ ah_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, ...@@ -132,12 +135,18 @@ ah_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
static void static void
ah_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg) ah_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
int offset, const char *msg)
{ {
char buf[256]; char buf[256];
struct iphdr iph;
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); if (skb_copy_bits(skb, offset, &iph, sizeof(iph)) < 0)
sprintf(buf, "%s TRUNCATED", pp->name);
else
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
pp->name, NIPQUAD(iph.saddr),
NIPQUAD(iph.daddr));
printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
} }
......
...@@ -44,8 +44,11 @@ struct isakmp_hdr { ...@@ -44,8 +44,11 @@ struct isakmp_hdr {
static struct ip_vs_conn * static struct ip_vs_conn *
esp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp, esp_conn_in_get(const struct sk_buff *skb,
struct iphdr *iph, union ip_vs_tphdr h, int inverse) struct ip_vs_protocol *pp,
const struct iphdr *iph,
unsigned int proto_off,
int inverse)
{ {
struct ip_vs_conn *cp; struct ip_vs_conn *cp;
...@@ -81,8 +84,8 @@ esp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp, ...@@ -81,8 +84,8 @@ esp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
static struct ip_vs_conn * static struct ip_vs_conn *
esp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp, esp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse) const struct iphdr *iph, unsigned int proto_off, int inverse)
{ {
struct ip_vs_conn *cp; struct ip_vs_conn *cp;
...@@ -120,7 +123,6 @@ esp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp, ...@@ -120,7 +123,6 @@ esp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
static int static int
esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h,
int *verdict, struct ip_vs_conn **cpp) int *verdict, struct ip_vs_conn **cpp)
{ {
/* /*
...@@ -132,12 +134,18 @@ esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, ...@@ -132,12 +134,18 @@ esp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
static void static void
esp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg) esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb,
int offset, const char *msg)
{ {
char buf[256]; char buf[256];
struct iphdr iph;
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); if (skb_copy_bits(skb, offset, &iph, sizeof(iph)) < 0)
sprintf(buf, "%s TRUNCATED", pp->name);
else
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u",
pp->name, NIPQUAD(iph.saddr),
NIPQUAD(iph.daddr));
printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
} }
......
...@@ -28,8 +28,11 @@ static int icmp_timeouts[1] = { 1*60*HZ }; ...@@ -28,8 +28,11 @@ static int icmp_timeouts[1] = { 1*60*HZ };
static char * icmp_state_name_table[1] = { "ICMP" }; static char * icmp_state_name_table[1] = { "ICMP" };
struct ip_vs_conn * struct ip_vs_conn *
icmp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp, icmp_conn_in_get(const struct sk_buff *skb,
struct iphdr *iph, union ip_vs_tphdr h, int inverse) struct ip_vs_protocol *pp,
const struct iphdr *iph,
unsigned int proto_off,
int inverse)
{ {
#if 0 #if 0
struct ip_vs_conn *cp; struct ip_vs_conn *cp;
...@@ -52,8 +55,11 @@ icmp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp, ...@@ -52,8 +55,11 @@ icmp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
} }
struct ip_vs_conn * struct ip_vs_conn *
icmp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp, icmp_conn_out_get(const struct sk_buff *skb,
struct iphdr *iph, union ip_vs_tphdr h, int inverse) struct ip_vs_protocol *pp,
const struct iphdr *iph,
unsigned int proto_off,
int inverse)
{ {
#if 0 #if 0
struct ip_vs_conn *cp; struct ip_vs_conn *cp;
...@@ -76,7 +82,6 @@ icmp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp, ...@@ -76,7 +82,6 @@ icmp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
static int static int
icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h,
int *verdict, struct ip_vs_conn **cpp) int *verdict, struct ip_vs_conn **cpp)
{ {
*verdict = NF_ACCEPT; *verdict = NF_ACCEPT;
...@@ -84,41 +89,51 @@ icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, ...@@ -84,41 +89,51 @@ icmp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
} }
static int static int
icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp, icmp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
struct iphdr *iph, union ip_vs_tphdr h, int size)
{ {
if (!(iph->frag_off & __constant_htons(IP_OFFSET))) { if (!(skb->nh.iph->frag_off & __constant_htons(IP_OFFSET))) {
if (ip_compute_csum(h.raw, size)) { if (skb->ip_summed != CHECKSUM_UNNECESSARY) {
IP_VS_DBG_RL_PKT(0, pp, iph, "Failed checksum for"); if (ip_vs_checksum_complete(skb, skb->nh.iph->ihl * 4)) {
return 0; IP_VS_DBG_RL_PKT(0, pp, skb, 0, "Failed checksum for");
return 0;
}
} }
} }
return 1; return 1;
} }
static void static void
icmp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg) icmp_debug_packet(struct ip_vs_protocol *pp,
const struct sk_buff *skb,
int offset,
const char *msg)
{ {
char buf[256]; char buf[256];
union ip_vs_tphdr h; struct iphdr iph;
struct icmphdr icmph;
h.raw = (char *) iph + iph->ihl * 4; if (skb_copy_bits(skb, offset, &iph, sizeof(iph)) < 0)
if (iph->frag_off & __constant_htons(IP_OFFSET)) sprintf(buf, "%s TRUNCATED", pp->name);
else if (iph.frag_off & __constant_htons(IP_OFFSET))
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag", sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u frag",
pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr)); pp->name, NIPQUAD(iph.saddr),
NIPQUAD(iph.daddr));
else if (skb_copy_bits(skb, offset + iph.ihl*4, &icmph, sizeof(icmph)) < 0)
sprintf(buf, "%s TRUNCATED to %u bytes\n",
pp->name, skb->len - offset);
else else
sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d", sprintf(buf, "%s %u.%u.%u.%u->%u.%u.%u.%u T:%d C:%d",
pp->name, NIPQUAD(iph->saddr), NIPQUAD(iph->daddr), pp->name, NIPQUAD(iph.saddr),
h.icmph->type, h.icmph->code); NIPQUAD(iph.daddr),
icmph.type, icmph.code);
printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf); printk(KERN_DEBUG "IPVS: %s: %s\n", msg, buf);
} }
static int static int
icmp_state_transition(struct ip_vs_conn *cp, icmp_state_transition(struct ip_vs_conn *cp, int direction,
int direction, struct iphdr *iph, const struct sk_buff *skb,
union ip_vs_tphdr h, struct ip_vs_protocol *pp) struct ip_vs_protocol *pp)
{ {
cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL]; cp->timeout = pp->timeout_table[IP_VS_ICMP_S_NORMAL];
return 1; return 1;
......
...@@ -21,52 +21,68 @@ ...@@ -21,52 +21,68 @@
#include <linux/tcp.h> /* for tcphdr */ #include <linux/tcp.h> /* for tcphdr */
#include <net/ip.h> #include <net/ip.h>
#include <net/tcp.h> /* for csum_tcpudp_magic */ #include <net/tcp.h> /* for csum_tcpudp_magic */
#include <linux/netfilter.h> #include <linux/netfilter_ipv4.h>
#include <net/ip_vs.h> #include <net/ip_vs.h>
static struct ip_vs_conn * static struct ip_vs_conn *
tcp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp, tcp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse) const struct iphdr *iph, unsigned int proto_off, int inverse)
{ {
__u16 ports[2];
if (skb_copy_bits(skb, proto_off, ports, sizeof(ports)) < 0)
return NULL;
if (likely(!inverse)) { if (likely(!inverse)) {
return ip_vs_conn_in_get(iph->protocol, return ip_vs_conn_in_get(iph->protocol,
iph->saddr, h.th->source, iph->saddr, ports[0],
iph->daddr, h.th->dest); iph->daddr, ports[1]);
} else { } else {
return ip_vs_conn_in_get(iph->protocol, return ip_vs_conn_in_get(iph->protocol,
iph->daddr, h.th->dest, iph->daddr, ports[1],
iph->saddr, h.th->source); iph->saddr, ports[0]);
} }
} }
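The conn_in_get/conn_out_get changes above are the heart of the patch: instead of dereferencing a transport-header pointer that may not sit in the linear part of a non-linear skb, the ports are copied out with skb_copy_bits() and the lookup bails out (returns NULL) if the packet is too short. The following user-space sketch illustrates the same gather-copy idea over a fragmented buffer; the struct frag type, the copy_bits() helper and the sample data are inventions for this illustration, not kernel interfaces.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* A toy "non-linear" packet: the bytes live in several fragments, so a
 * header is not guaranteed to be contiguous in memory. */
struct frag { const uint8_t *data; size_t len; };

/* Gather len bytes starting at offset into dst, walking the fragments.
 * Returns 0 on success, -1 if the packet is too short -- the same
 * contract skb_copy_bits() offers for skbs. */
static int copy_bits(const struct frag *frags, size_t nfrags,
                     size_t offset, void *dst, size_t len)
{
        uint8_t *out = dst;
        for (size_t i = 0; i < nfrags && len > 0; i++) {
                if (offset >= frags[i].len) {
                        offset -= frags[i].len;
                        continue;
                }
                size_t n = frags[i].len - offset;
                if (n > len)
                        n = len;
                memcpy(out, frags[i].data + offset, n);
                out += n;
                len -= n;
                offset = 0;
        }
        return len ? -1 : 0;
}

int main(void)
{
        /* TCP source/destination ports split across a fragment boundary. */
        const uint8_t a[] = { 0x00, 0x50 };          /* src port 80 ...   */
        const uint8_t b[] = { 0x1f, 0x90, 0xde };    /* ... dst port 8080 */
        struct frag frags[] = { { a, sizeof(a) }, { b, sizeof(b) } };

        uint8_t ports[4];
        if (copy_bits(frags, 2, 0, ports, sizeof(ports)) == 0)
                printf("src=%u dst=%u\n",
                       (unsigned)(ports[0] << 8 | ports[1]),
                       (unsigned)(ports[2] << 8 | ports[3]));
        return 0;
}

The point is that the caller always reads from a private, contiguous copy, so it no longer matters how the original bytes are scattered.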
static struct ip_vs_conn * static struct ip_vs_conn *
tcp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp, tcp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse) const struct iphdr *iph, unsigned int proto_off, int inverse)
{ {
__u16 ports[2];
if (skb_copy_bits(skb, proto_off, ports, sizeof(ports)) < 0)
return NULL;
if (likely(!inverse)) { if (likely(!inverse)) {
return ip_vs_conn_out_get(iph->protocol, return ip_vs_conn_out_get(iph->protocol,
iph->saddr, h.th->source, iph->saddr, ports[0],
iph->daddr, h.th->dest); iph->daddr, ports[1]);
} else { } else {
return ip_vs_conn_out_get(iph->protocol, return ip_vs_conn_out_get(iph->protocol,
iph->daddr, h.th->dest, iph->daddr, ports[1],
iph->saddr, h.th->source); iph->saddr, ports[0]);
} }
} }
static int static int
tcp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, tcp_conn_schedule(struct sk_buff *skb,
struct iphdr *iph, union ip_vs_tphdr h, struct ip_vs_protocol *pp,
int *verdict, struct ip_vs_conn **cpp) int *verdict, struct ip_vs_conn **cpp)
{ {
struct ip_vs_service *svc; struct ip_vs_service *svc;
struct tcphdr tcph;
if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &tcph, sizeof(tcph)) < 0) {
*verdict = NF_DROP;
return 0;
}
if (h.th->syn && if (tcph.syn &&
(svc = ip_vs_service_get(skb->nfmark, iph->protocol, (svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
iph->daddr, h.portp[1]))) { skb->nh.iph->daddr, tcph.dest))) {
if (ip_vs_todrop()) { if (ip_vs_todrop()) {
/* /*
* It seems that we are very loaded. * It seems that we are very loaded.
...@@ -81,9 +97,9 @@ tcp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, ...@@ -81,9 +97,9 @@ tcp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the * Let the virtual server select a real server for the
* incoming connection, and create a connection entry. * incoming connection, and create a connection entry.
*/ */
*cpp = ip_vs_schedule(svc, iph); *cpp = ip_vs_schedule(svc, skb);
if (!*cpp) { if (!*cpp) {
*verdict = ip_vs_leave(svc, skb, pp, h); *verdict = ip_vs_leave(svc, skb, pp);
return 0; return 0;
} }
ip_vs_service_put(svc); ip_vs_service_put(svc);
...@@ -93,111 +109,128 @@ tcp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, ...@@ -93,111 +109,128 @@ tcp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
static inline void static inline void
tcp_fast_csum_update(union ip_vs_tphdr *h, u32 oldip, u32 newip, tcp_fast_csum_update(struct tcphdr *tcph, u32 oldip, u32 newip,
u16 oldport, u16 newport) u16 oldport, u16 newport)
{ {
h->th->check = tcph->check =
ip_vs_check_diff(~oldip, newip, ip_vs_check_diff(~oldip, newip,
ip_vs_check_diff(oldport ^ 0xFFFF, ip_vs_check_diff(oldport ^ 0xFFFF,
newport, h->th->check)); newport, tcph->check));
} }
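tcp_fast_csum_update() above (and its UDP counterpart later in the patch) patches the existing checksum incrementally when only an address and a port change, instead of re-summing the whole segment. A standalone sketch of that incremental update in the spirit of RFC 1624, using plain 16-bit one's-complement arithmetic; the helper names and sample packet are illustrative and are not the kernel's ip_vs_check_diff():

#include <stdint.h>
#include <stdio.h>

/* Internet checksum: one's-complement of the one's-complement sum of
 * 16-bit words (even-length buffer assumed for brevity). */
static uint16_t csum16(const uint16_t *p, size_t nwords)
{
        uint32_t sum = 0;
        while (nwords--) {
                sum += *p++;
                sum = (sum & 0xffff) + (sum >> 16);
        }
        return (uint16_t)~sum;
}

/* Incremental update: when one 16-bit word changes from old_word to
 * new_word, the stored checksum can be patched without touching the
 * rest of the packet (applied once per changed word for addresses
 * and ports). */
static uint16_t csum16_update(uint16_t check, uint16_t old_word,
                              uint16_t new_word)
{
        uint32_t sum = (uint16_t)~check;
        sum += (uint16_t)~old_word;
        sum += new_word;
        sum = (sum & 0xffff) + (sum >> 16);
        sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

int main(void)
{
        uint16_t pkt[8] = { 0x1234, 0x0050, 0x1f90, 0xabcd, 1, 2, 3, 4 };
        uint16_t check = csum16(pkt, 8);
        uint16_t old_word = pkt[1], new_word = 0x0051;

        pkt[1] = new_word;              /* rewrite a port, as NAT would */
        printf("full recompute: 0x%04x\n", (unsigned)csum16(pkt, 8));
        printf("incremental:    0x%04x\n",
               (unsigned)csum16_update(check, old_word, new_word));
        return 0;
}

Both lines print the same value, which is why the fast path needs only the old and new words, never the payload.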
static int static int
tcp_snat_handler(struct sk_buff *skb, tcp_snat_handler(struct sk_buff **pskb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
struct iphdr *iph, union ip_vs_tphdr h, int size)
{ {
int ihl = (char *) h.raw - (char *) iph; struct tcphdr *tcph;
unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
/* We are sure that we work on first fragment */ /* csum_check requires unshared skb */
if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
return 0;
h.th->source = cp->vport; if (unlikely(cp->app != NULL)) {
/* Some checks before mangling */
if (pp->csum_check && !pp->slave && !pp->csum_check(*pskb, pp))
return 0;
/* Call application helper if needed */ /* Call application helper if needed */
if (ip_vs_app_pkt_out(cp, skb) != 0) { if (!ip_vs_app_pkt_out(cp, pskb))
/* skb data has probably changed, update pointers */ return 0;
iph = skb->nh.iph;
h.raw = (char*)iph + ihl;
size = skb->len - ihl;
} }
tcph = (void *)(*pskb)->nh.iph + tcphoff;
tcph->source = cp->vport;
/* Adjust TCP checksums */ /* Adjust TCP checksums */
if (!cp->app) { if (!cp->app) {
/* Only port and addr are changed, do fast csum update */ /* Only port and addr are changed, do fast csum update */
tcp_fast_csum_update(&h, cp->daddr, cp->vaddr, tcp_fast_csum_update(tcph, cp->daddr, cp->vaddr,
cp->dport, cp->vport); cp->dport, cp->vport);
if (skb->ip_summed == CHECKSUM_HW) if ((*pskb)->ip_summed == CHECKSUM_HW)
skb->ip_summed = CHECKSUM_NONE; (*pskb)->ip_summed = CHECKSUM_NONE;
} else { } else {
/* full checksum calculation */ /* full checksum calculation */
h.th->check = 0; tcph->check = 0;
skb->csum = csum_partial(h.raw, size, 0); (*pskb)->csum = skb_checksum(*pskb, tcphoff,
h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr, (*pskb)->len - tcphoff, 0);
size, iph->protocol, tcph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
skb->csum); (*pskb)->len - tcphoff,
cp->protocol,
(*pskb)->csum);
IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%d)\n", IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%d)\n",
pp->name, h.th->check, pp->name, tcph->check,
(char*)&(h.th->check) - (char*)h.raw); (char*)&(tcph->check) - (char*)tcph);
} }
return 1; return 1;
} }
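When an application helper has rewritten the payload, the handler above falls back to a full recomputation: skb_checksum() sums the TCP data and csum_tcpudp_magic() folds in the IPv4 pseudo-header (source and destination addresses, protocol and transport length). A rough user-space equivalent of that pseudo-header construction is sketched below; the 8-byte buffer is a toy stand-in for a TCP header plus payload with its checksum field zeroed, and the helper names are made up for the example.

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* One's-complement sum over a byte buffer, big-endian 16-bit words
 * (pads an odd trailing byte with zero). */
static uint32_t csum_add(uint32_t sum, const void *buf, size_t len)
{
        const uint8_t *p = buf;
        while (len > 1) {
                sum += (uint32_t)p[0] << 8 | p[1];
                p += 2;
                len -= 2;
        }
        if (len)
                sum += (uint32_t)p[0] << 8;
        return sum;
}

static uint16_t csum_fold(uint32_t sum)
{
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);
        return (uint16_t)~sum;
}

/* TCP/UDP checksum over the IPv4 pseudo-header followed by the
 * transport header and payload -- the same quantities the kernel
 * combines via csum_tcpudp_magic(). */
static uint16_t tcpudp_check(uint32_t saddr, uint32_t daddr, uint8_t proto,
                             const void *seg, size_t len)
{
        uint8_t ph[12];

        memcpy(ph, &saddr, 4);          /* addresses already in network order */
        memcpy(ph + 4, &daddr, 4);
        ph[8] = 0;
        ph[9] = proto;
        ph[10] = (uint8_t)(len >> 8);
        ph[11] = (uint8_t)len;

        return csum_fold(csum_add(csum_add(0, ph, sizeof(ph)), seg, len));
}

int main(void)
{
        uint8_t seg[8] = { 0x00, 0x50, 0x1f, 0x90, 'h', 'i', 0, 0 };
        uint16_t c = tcpudp_check(inet_addr("10.0.0.1"), inet_addr("10.0.0.2"),
                                  6 /* IPPROTO_TCP */, seg, sizeof(seg));

        printf("pseudo-header checksum: 0x%04x\n", (unsigned)c);
        return 0;
}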
static int static int
tcp_dnat_handler(struct sk_buff *skb, tcp_dnat_handler(struct sk_buff **pskb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
struct iphdr *iph, union ip_vs_tphdr h, int size)
{ {
int ihl = (char *) h.raw - (char *) iph; struct tcphdr *tcph;
unsigned int tcphoff = (*pskb)->nh.iph->ihl * 4;
/* We are sure that we work on first fragment */ /* csum_check requires unshared skb */
if (!ip_vs_make_skb_writable(pskb, tcphoff+sizeof(*tcph)))
return 0;
h.th->dest = cp->dport; if (unlikely(cp->app != NULL)) {
/* Some checks before mangling */
if (pp->csum_check && !pp->slave && !pp->csum_check(*pskb, pp))
return 0;
/* /*
* Attempt ip_vs_app call. * Attempt ip_vs_app call.
* It will fix ip_vs_conn and iph ack_seq stuff * It will fix ip_vs_conn and iph ack_seq stuff
*/ */
if (ip_vs_app_pkt_in(cp, skb) != 0) { if (!ip_vs_app_pkt_in(cp, pskb))
/* skb data has probably changed, update pointers */ return 0;
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
size = skb->len - ihl;
} }
tcph = (void *)(*pskb)->nh.iph + tcphoff;
tcph->dest = cp->dport;
/* /*
* Adjust TCP/UDP checksums * Adjust TCP checksums
*/ */
if (!cp->app) { if (!cp->app) {
/* Only port and addr are changed, do fast csum update */ /* Only port and addr are changed, do fast csum update */
tcp_fast_csum_update(&h, cp->vaddr, cp->daddr, tcp_fast_csum_update(tcph, cp->vaddr, cp->daddr,
cp->vport, cp->dport); cp->vport, cp->dport);
if (skb->ip_summed == CHECKSUM_HW) if ((*pskb)->ip_summed == CHECKSUM_HW)
skb->ip_summed = CHECKSUM_NONE; (*pskb)->ip_summed = CHECKSUM_NONE;
} else { } else {
/* full checksum calculation */ /* full checksum calculation */
h.th->check = 0; tcph->check = 0;
h.th->check = csum_tcpudp_magic(iph->saddr, iph->daddr, (*pskb)->csum = skb_checksum(*pskb, tcphoff,
size, iph->protocol, (*pskb)->len - tcphoff, 0);
csum_partial(h.raw, size, 0)); tcph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
skb->ip_summed = CHECKSUM_UNNECESSARY; (*pskb)->len - tcphoff,
cp->protocol,
(*pskb)->csum);
(*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
} }
return 1; return 1;
} }
static int static int
tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp, tcp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
struct iphdr *iph, union ip_vs_tphdr h, int size)
{ {
unsigned int tcphoff = skb->nh.iph->ihl*4;
switch (skb->ip_summed) { switch (skb->ip_summed) {
case CHECKSUM_NONE: case CHECKSUM_NONE:
skb->csum = csum_partial(h.raw, size, 0); skb->csum = skb_checksum(skb, tcphoff, skb->len - tcphoff, 0);
case CHECKSUM_HW: case CHECKSUM_HW:
if (csum_tcpudp_magic(iph->saddr, iph->daddr, size, if (csum_tcpudp_magic(skb->nh.iph->saddr, skb->nh.iph->daddr,
iph->protocol, skb->csum)) { skb->len - tcphoff,
IP_VS_DBG_RL_PKT(0, pp, iph, skb->nh.iph->protocol, skb->csum)) {
IP_VS_DBG_RL_PKT(0, pp, skb, 0,
"Failed checksum for"); "Failed checksum for");
return 0; return 0;
} }
...@@ -383,10 +416,9 @@ static inline int tcp_state_idx(struct tcphdr *th) ...@@ -383,10 +416,9 @@ static inline int tcp_state_idx(struct tcphdr *th)
static inline void static inline void
set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
int direction, union ip_vs_tphdr h) int direction, struct tcphdr *th)
{ {
int state_idx; int state_idx;
struct tcphdr *th = h.th;
int new_state = IP_VS_TCP_S_CLOSE; int new_state = IP_VS_TCP_S_CLOSE;
int state_off = tcp_state_off[direction]; int state_off = tcp_state_off[direction];
...@@ -448,12 +480,17 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, ...@@ -448,12 +480,17 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp,
* Handle state transitions * Handle state transitions
*/ */
static int static int
tcp_state_transition(struct ip_vs_conn *cp, tcp_state_transition(struct ip_vs_conn *cp, int direction,
int direction, struct iphdr *iph, const struct sk_buff *skb,
union ip_vs_tphdr h, struct ip_vs_protocol *pp) struct ip_vs_protocol *pp)
{ {
struct tcphdr tcph;
if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &tcph, sizeof(tcph)) < 0)
return 0;
spin_lock(&cp->lock); spin_lock(&cp->lock);
set_tcp_state(pp, cp, direction, h); set_tcp_state(pp, cp, direction, &tcph);
spin_unlock(&cp->lock); spin_unlock(&cp->lock);
return 1; return 1;
...@@ -574,9 +611,6 @@ static void tcp_exit(struct ip_vs_protocol *pp) ...@@ -574,9 +611,6 @@ static void tcp_exit(struct ip_vs_protocol *pp)
} }
extern void
tcpudp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg);
struct ip_vs_protocol ip_vs_protocol_tcp = { struct ip_vs_protocol ip_vs_protocol_tcp = {
.name = "TCP", .name = "TCP",
.protocol = IPPROTO_TCP, .protocol = IPPROTO_TCP,
...@@ -599,7 +633,7 @@ struct ip_vs_protocol ip_vs_protocol_tcp = { ...@@ -599,7 +633,7 @@ struct ip_vs_protocol ip_vs_protocol_tcp = {
.state_name = tcp_state_name, .state_name = tcp_state_name,
.state_transition = tcp_state_transition, .state_transition = tcp_state_transition,
.app_conn_bind = tcp_app_conn_bind, .app_conn_bind = tcp_app_conn_bind,
.debug_packet = tcpudp_debug_packet, .debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = tcp_timeout_change, .timeout_change = tcp_timeout_change,
.set_state_timeout = tcp_set_state_timeout, .set_state_timeout = tcp_set_state_timeout,
}; };
...@@ -16,25 +16,29 @@ ...@@ -16,25 +16,29 @@
*/ */
#include <linux/kernel.h> #include <linux/kernel.h>
#include <linux/netfilter.h> #include <linux/netfilter_ipv4.h>
#include <net/ip_vs.h> #include <net/ip_vs.h>
static struct ip_vs_conn * static struct ip_vs_conn *
udp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp, udp_conn_in_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse) const struct iphdr *iph, unsigned int proto_off, int inverse)
{ {
struct ip_vs_conn *cp; struct ip_vs_conn *cp;
__u16 ports[2];
if (skb_copy_bits(skb, proto_off, ports, sizeof(ports)) < 0)
return NULL;
if (likely(!inverse)) { if (likely(!inverse)) {
cp = ip_vs_conn_in_get(iph->protocol, cp = ip_vs_conn_in_get(iph->protocol,
iph->saddr, h.portp[0], iph->saddr, ports[0],
iph->daddr, h.portp[1]); iph->daddr, ports[1]);
} else { } else {
cp = ip_vs_conn_in_get(iph->protocol, cp = ip_vs_conn_in_get(iph->protocol,
iph->daddr, h.portp[1], iph->daddr, ports[1],
iph->saddr, h.portp[0]); iph->saddr, ports[0]);
} }
return cp; return cp;
...@@ -42,19 +46,23 @@ udp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp, ...@@ -42,19 +46,23 @@ udp_conn_in_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
static struct ip_vs_conn * static struct ip_vs_conn *
udp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp, udp_conn_out_get(const struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h, int inverse) const struct iphdr *iph, unsigned int proto_off, int inverse)
{ {
struct ip_vs_conn *cp; struct ip_vs_conn *cp;
__u16 ports[2];
if (skb_copy_bits(skb, skb->nh.iph->ihl*4, ports, sizeof(ports)) < 0)
return NULL;
if (likely(!inverse)) { if (likely(!inverse)) {
cp = ip_vs_conn_out_get(iph->protocol, cp = ip_vs_conn_out_get(iph->protocol,
iph->saddr, h.portp[0], iph->saddr, ports[0],
iph->daddr, h.portp[1]); iph->daddr, ports[1]);
} else { } else {
cp = ip_vs_conn_out_get(iph->protocol, cp = ip_vs_conn_out_get(iph->protocol,
iph->daddr, h.portp[1], iph->daddr, ports[1],
iph->saddr, h.portp[0]); iph->saddr, ports[0]);
} }
return cp; return cp;
...@@ -63,13 +71,18 @@ udp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp, ...@@ -63,13 +71,18 @@ udp_conn_out_get(struct sk_buff *skb, struct ip_vs_protocol *pp,
static int static int
udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
struct iphdr *iph, union ip_vs_tphdr h,
int *verdict, struct ip_vs_conn **cpp) int *verdict, struct ip_vs_conn **cpp)
{ {
struct ip_vs_service *svc; struct ip_vs_service *svc;
struct udphdr udph;
if ((svc = ip_vs_service_get(skb->nfmark, iph->protocol, if (skb_copy_bits(skb, skb->nh.iph->ihl*4, &udph, sizeof(udph)) < 0) {
iph->daddr, h.portp[1]))) { *verdict = NF_DROP;
return 0;
}
if ((svc = ip_vs_service_get(skb->nfmark, skb->nh.iph->protocol,
skb->nh.iph->daddr, udph.dest))) {
if (ip_vs_todrop()) { if (ip_vs_todrop()) {
/* /*
* It seems that we are very loaded. * It seems that we are very loaded.
...@@ -84,9 +97,9 @@ udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, ...@@ -84,9 +97,9 @@ udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
* Let the virtual server select a real server for the * Let the virtual server select a real server for the
* incoming connection, and create a connection entry. * incoming connection, and create a connection entry.
*/ */
*cpp = ip_vs_schedule(svc, iph); *cpp = ip_vs_schedule(svc, skb);
if (!*cpp) { if (!*cpp) {
*verdict = ip_vs_leave(svc, skb, pp, h); *verdict = ip_vs_leave(svc, skb, pp);
return 0; return 0;
} }
ip_vs_service_put(svc); ip_vs_service_put(svc);
...@@ -96,121 +109,145 @@ udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp, ...@@ -96,121 +109,145 @@ udp_conn_schedule(struct sk_buff *skb, struct ip_vs_protocol *pp,
static inline void static inline void
udp_fast_csum_update(union ip_vs_tphdr *h, u32 oldip, u32 newip, udp_fast_csum_update(struct udphdr *uhdr, u32 oldip, u32 newip,
u16 oldport, u16 newport) u16 oldport, u16 newport)
{ {
h->uh->check = uhdr->check =
ip_vs_check_diff(~oldip, newip, ip_vs_check_diff(~oldip, newip,
ip_vs_check_diff(oldport ^ 0xFFFF, ip_vs_check_diff(oldport ^ 0xFFFF,
newport, h->uh->check)); newport, uhdr->check));
if (!h->uh->check) if (!uhdr->check)
h->uh->check = 0xFFFF; uhdr->check = 0xFFFF;
} }
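The trailing test in udp_fast_csum_update() exists because a UDP checksum of zero on the wire means "sender did not compute a checksum", so a result that genuinely computes to zero must be transmitted as 0xFFFF, the alternative one's-complement encoding of zero. A tiny standalone illustration of that folding, not taken from the kernel source:

#include <stdint.h>
#include <stdio.h>

/* Fold a computed UDP checksum for transmission: zero is reserved for
 * "checksum disabled", so a genuine zero is sent as 0xFFFF. */
static uint16_t udp_fold_check(uint16_t csum)
{
        return csum ? csum : 0xFFFF;
}

int main(void)
{
        printf("0x1234 -> 0x%04x\n", (unsigned)udp_fold_check(0x1234));
        printf("0x0000 -> 0x%04x\n", (unsigned)udp_fold_check(0x0000));
        return 0;
}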
static int static int
udp_snat_handler(struct sk_buff *skb, udp_snat_handler(struct sk_buff **pskb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
struct iphdr *iph, union ip_vs_tphdr h, int size)
{ {
int ihl = (char *) h.raw - (char *) iph; struct udphdr *udph;
unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
/* We are sure that we work on first fragment */ /* csum_check requires unshared skb */
if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
return 0;
h.portp[0] = cp->vport; if (unlikely(cp->app != NULL)) {
/* Some checks before mangling */
if (pp->csum_check && !pp->slave && !pp->csum_check(*pskb, pp))
return 0;
/* /*
* Call application helper if needed * Call application helper if needed
*/ */
if (ip_vs_app_pkt_out(cp, skb) != 0) { if (!ip_vs_app_pkt_out(cp, pskb))
/* skb data has probably changed, update pointers */ return 0;
iph = skb->nh.iph;
h.raw = (char*)iph + ihl;
size = skb->len - ihl;
} }
udph = (void *)(*pskb)->nh.iph + udphoff;
udph->source = cp->vport;
/* /*
* Adjust UDP checksums * Adjust UDP checksums
*/ */
if (!cp->app && (h.uh->check != 0)) { if (!cp->app && (udph->check != 0)) {
/* Only port and addr are changed, do fast csum update */ /* Only port and addr are changed, do fast csum update */
udp_fast_csum_update(&h, cp->daddr, cp->vaddr, udp_fast_csum_update(udph, cp->daddr, cp->vaddr,
cp->dport, cp->vport); cp->dport, cp->vport);
if (skb->ip_summed == CHECKSUM_HW) if ((*pskb)->ip_summed == CHECKSUM_HW)
skb->ip_summed = CHECKSUM_NONE; (*pskb)->ip_summed = CHECKSUM_NONE;
} else { } else {
/* full checksum calculation */ /* full checksum calculation */
h.uh->check = 0; udph->check = 0;
skb->csum = csum_partial(h.raw, size, 0); (*pskb)->csum = skb_checksum(*pskb, udphoff,
h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr, (*pskb)->len - udphoff, 0);
size, iph->protocol, udph->check = csum_tcpudp_magic(cp->vaddr, cp->caddr,
skb->csum); (*pskb)->len - udphoff,
if (h.uh->check == 0) cp->protocol,
h.uh->check = 0xFFFF; (*pskb)->csum);
if (udph->check == 0)
udph->check = 0xFFFF;
IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%d)\n", IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%d)\n",
pp->name, h.uh->check, pp->name, udph->check,
(char*)&(h.uh->check) - (char*)h.raw); (char*)&(udph->check) - (char*)udph);
} }
return 1; return 1;
} }
static int static int
udp_dnat_handler(struct sk_buff *skb, udp_dnat_handler(struct sk_buff **pskb,
struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_protocol *pp, struct ip_vs_conn *cp)
struct iphdr *iph, union ip_vs_tphdr h, int size)
{ {
int ihl = (char *) h.raw - (char *) iph; struct udphdr *udph;
unsigned int udphoff = (*pskb)->nh.iph->ihl * 4;
/* We are sure that we work on first fragment */ /* csum_check requires unshared skb */
if (!ip_vs_make_skb_writable(pskb, udphoff+sizeof(*udph)))
return 0;
h.portp[1] = cp->dport; if (unlikely(cp->app != NULL)) {
/* Some checks before mangling */
if (pp->csum_check && !pp->slave && !pp->csum_check(*pskb, pp))
return 0;
/* /*
* Attempt ip_vs_app call. * Attempt ip_vs_app call.
* will fix ip_vs_conn and iph ack_seq stuff * It will fix ip_vs_conn
*/ */
if (ip_vs_app_pkt_in(cp, skb) != 0) { if (!ip_vs_app_pkt_in(cp, pskb))
/* skb data has probably changed, update pointers */ return 0;
iph = skb->nh.iph;
h.raw = (char*) iph + ihl;
size = skb->len - ihl;
} }
udph = (void *)(*pskb)->nh.iph + udphoff;
udph->dest = cp->dport;
/* /*
* Adjust UDP checksums * Adjust UDP checksums
*/ */
if (!cp->app && (h.uh->check != 0)) { if (!cp->app && (udph->check != 0)) {
/* Only port and addr are changed, do fast csum update */ /* Only port and addr are changed, do fast csum update */
udp_fast_csum_update(&h, cp->vaddr, cp->daddr, udp_fast_csum_update(udph, cp->vaddr, cp->daddr,
cp->vport, cp->dport); cp->vport, cp->dport);
if (skb->ip_summed == CHECKSUM_HW) if ((*pskb)->ip_summed == CHECKSUM_HW)
skb->ip_summed = CHECKSUM_NONE; (*pskb)->ip_summed = CHECKSUM_NONE;
} else { } else {
/* full checksum calculation */ /* full checksum calculation */
h.uh->check = 0; udph->check = 0;
h.uh->check = csum_tcpudp_magic(iph->saddr, iph->daddr, (*pskb)->csum = skb_checksum(*pskb, udphoff,
size, iph->protocol, (*pskb)->len - udphoff, 0);
csum_partial(h.raw, size, 0)); udph->check = csum_tcpudp_magic(cp->caddr, cp->daddr,
if (h.uh->check == 0) (*pskb)->len - udphoff,
h.uh->check = 0xFFFF; cp->protocol,
skb->ip_summed = CHECKSUM_UNNECESSARY; (*pskb)->csum);
if (udph->check == 0)
udph->check = 0xFFFF;
(*pskb)->ip_summed = CHECKSUM_UNNECESSARY;
} }
return 1; return 1;
} }
static int static int
udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp, udp_csum_check(struct sk_buff *skb, struct ip_vs_protocol *pp)
struct iphdr *iph, union ip_vs_tphdr h, int size)
{ {
if (h.uh->check != 0) { struct udphdr udph;
unsigned int udphoff = skb->nh.iph->ihl*4;
if (skb_copy_bits(skb, udphoff, &udph, sizeof(udph)) < 0)
return 0;
if (udph.check != 0) {
switch (skb->ip_summed) { switch (skb->ip_summed) {
case CHECKSUM_NONE: case CHECKSUM_NONE:
skb->csum = csum_partial(h.raw, size, 0); skb->csum = skb_checksum(skb, udphoff,
skb->len - udphoff, 0);
case CHECKSUM_HW: case CHECKSUM_HW:
if (csum_tcpudp_magic(iph->saddr, iph->daddr, size, if (csum_tcpudp_magic(skb->nh.iph->saddr,
iph->protocol, skb->csum)) { skb->nh.iph->daddr,
IP_VS_DBG_RL_PKT(0, pp, iph, skb->len - udphoff,
skb->nh.iph->protocol,
skb->csum)) {
IP_VS_DBG_RL_PKT(0, pp, skb, 0,
"Failed checksum for"); "Failed checksum for");
return 0; return 0;
} }
...@@ -342,9 +379,9 @@ static const char * udp_state_name(int state) ...@@ -342,9 +379,9 @@ static const char * udp_state_name(int state)
} }
static int static int
udp_state_transition(struct ip_vs_conn *cp, udp_state_transition(struct ip_vs_conn *cp, int direction,
int direction, struct iphdr *iph, const struct sk_buff *skb,
union ip_vs_tphdr h, struct ip_vs_protocol *pp) struct ip_vs_protocol *pp)
{ {
cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL]; cp->timeout = pp->timeout_table[IP_VS_UDP_S_NORMAL];
return 1; return 1;
...@@ -361,9 +398,6 @@ static void udp_exit(struct ip_vs_protocol *pp) ...@@ -361,9 +398,6 @@ static void udp_exit(struct ip_vs_protocol *pp)
} }
extern void
tcpudp_debug_packet(struct ip_vs_protocol *pp, struct iphdr *iph, char *msg);
struct ip_vs_protocol ip_vs_protocol_udp = { struct ip_vs_protocol ip_vs_protocol_udp = {
.name = "UDP", .name = "UDP",
.protocol = IPPROTO_UDP, .protocol = IPPROTO_UDP,
...@@ -385,7 +419,7 @@ struct ip_vs_protocol ip_vs_protocol_udp = { ...@@ -385,7 +419,7 @@ struct ip_vs_protocol ip_vs_protocol_udp = {
.register_app = udp_register_app, .register_app = udp_register_app,
.unregister_app = udp_unregister_app, .unregister_app = udp_unregister_app,
.app_conn_bind = udp_app_conn_bind, .app_conn_bind = udp_app_conn_bind,
.debug_packet = tcpudp_debug_packet, .debug_packet = ip_vs_tcpudp_debug_packet,
.timeout_change = NULL, .timeout_change = NULL,
.set_state_timeout = udp_set_state_timeout, .set_state_timeout = udp_set_state_timeout,
}; };
...@@ -55,7 +55,7 @@ static int ip_vs_rr_update_svc(struct ip_vs_service *svc) ...@@ -55,7 +55,7 @@ static int ip_vs_rr_update_svc(struct ip_vs_service *svc)
* Round-Robin Scheduling * Round-Robin Scheduling
*/ */
static struct ip_vs_dest * static struct ip_vs_dest *
ip_vs_rr_schedule(struct ip_vs_service *svc, struct iphdr *iph) ip_vs_rr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{ {
struct list_head *p, *q; struct list_head *p, *q;
struct ip_vs_dest *dest; struct ip_vs_dest *dest;
......
...@@ -83,7 +83,7 @@ ip_vs_sed_dest_overhead(struct ip_vs_dest *dest) ...@@ -83,7 +83,7 @@ ip_vs_sed_dest_overhead(struct ip_vs_dest *dest)
* Weighted Least Connection scheduling * Weighted Least Connection scheduling
*/ */
static struct ip_vs_dest * static struct ip_vs_dest *
ip_vs_sed_schedule(struct ip_vs_service *svc, struct iphdr *iph) ip_vs_sed_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{ {
struct ip_vs_dest *dest, *least; struct ip_vs_dest *dest, *least;
unsigned int loh, doh; unsigned int loh, doh;
......
...@@ -199,10 +199,11 @@ static inline int is_overloaded(struct ip_vs_dest *dest) ...@@ -199,10 +199,11 @@ static inline int is_overloaded(struct ip_vs_dest *dest)
* Source Hashing scheduling * Source Hashing scheduling
*/ */
static struct ip_vs_dest * static struct ip_vs_dest *
ip_vs_sh_schedule(struct ip_vs_service *svc, struct iphdr *iph) ip_vs_sh_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{ {
struct ip_vs_dest *dest; struct ip_vs_dest *dest;
struct ip_vs_sh_bucket *tbl; struct ip_vs_sh_bucket *tbl;
struct iphdr *iph = skb->nh.iph;
IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n"); IP_VS_DBG(6, "ip_vs_sh_schedule(): Scheduling...\n");
......
...@@ -71,7 +71,7 @@ ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest) ...@@ -71,7 +71,7 @@ ip_vs_wlc_dest_overhead(struct ip_vs_dest *dest)
* Weighted Least Connection scheduling * Weighted Least Connection scheduling
*/ */
static struct ip_vs_dest * static struct ip_vs_dest *
ip_vs_wlc_schedule(struct ip_vs_service *svc, struct iphdr *iph) ip_vs_wlc_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{ {
struct ip_vs_dest *dest, *least; struct ip_vs_dest *dest, *least;
unsigned int loh, doh; unsigned int loh, doh;
......
...@@ -138,7 +138,7 @@ static int ip_vs_wrr_update_svc(struct ip_vs_service *svc) ...@@ -138,7 +138,7 @@ static int ip_vs_wrr_update_svc(struct ip_vs_service *svc)
* Weighted Round-Robin Scheduling * Weighted Round-Robin Scheduling
*/ */
static struct ip_vs_dest * static struct ip_vs_dest *
ip_vs_wrr_schedule(struct ip_vs_service *svc, struct iphdr *iph) ip_vs_wrr_schedule(struct ip_vs_service *svc, const struct sk_buff *skb)
{ {
struct ip_vs_dest *dest; struct ip_vs_dest *dest;
struct ip_vs_wrr_mark *mark = svc->sched_data; struct ip_vs_wrr_mark *mark = svc->sched_data;
......
...@@ -128,32 +128,11 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) ...@@ -128,32 +128,11 @@ ip_vs_dst_reset(struct ip_vs_dest *dest)
} }
static inline int
ip_vs_skb_cow(struct sk_buff *skb, unsigned int headroom,
struct iphdr **iph_p, unsigned char **t_p)
{
int delta = (headroom > 16 ? headroom : 16) - skb_headroom(skb);
if (delta < 0)
delta = 0;
if (delta ||skb_cloned(skb)) {
if (pskb_expand_head(skb, (delta+15)&~15, 0, GFP_ATOMIC))
return -ENOMEM;
/* skb data changed, update pointers */
*iph_p = skb->nh.iph;
*t_p = (char*) (*iph_p) + (*iph_p)->ihl * 4;
}
return 0;
}
#define IP_VS_XMIT(skb, rt) \ #define IP_VS_XMIT(skb, rt) \
do { \ do { \
skb->nfcache |= NFC_IPVS_PROPERTY; \ (skb)->nfcache |= NFC_IPVS_PROPERTY; \
NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, \ NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, (skb), NULL, \
rt->u.dst.dev, dst_output); \ (rt)->u.dst.dev, dst_output); \
} while (0) } while (0)
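The only change to IP_VS_XMIT above is wrapping its skb and rt arguments in parentheses, so that an expression passed as an argument cannot re-associate with the operators inside the macro body. A minimal standalone C illustration of the hazard, using a made-up macro rather than anything from the kernel:

#include <stdio.h>

#define SQUARE_BAD(x)  (x * x)
#define SQUARE_GOOD(x) ((x) * (x))

int main(void)
{
        /* With an expression argument, the unparenthesized form expands
         * to 1 + 2 * 1 + 2 == 5 instead of (1 + 2) * (1 + 2) == 9. */
        printf("bad:  %d\n", SQUARE_BAD(1 + 2));
        printf("good: %d\n", SQUARE_GOOD(1 + 2));
        return 0;
}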
...@@ -188,7 +167,6 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ...@@ -188,7 +167,6 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
.daddr = iph->daddr, .daddr = iph->daddr,
.saddr = 0, .saddr = 0,
.tos = RT_TOS(tos), } }, .tos = RT_TOS(tos), } },
.proto = iph->protocol,
}; };
EnterFunction(10); EnterFunction(10);
...@@ -208,21 +186,23 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ...@@ -208,21 +186,23 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
goto tx_error; goto tx_error;
} }
if (skb_is_nonlinear(skb) && skb->len <= mtu) /*
ip_send_check(iph); * Call ip_send_check because we are not sure it is called
* after ip_defrag. Is copy-on-write needed?
if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) { */
if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) { if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
ip_rt_put(rt); ip_rt_put(rt);
IP_VS_ERR_RL("ip_vs_bypass_xmit(): no memory\n"); return NF_STOLEN;
goto tx_error;
}
} }
ip_send_check(skb->nh.iph);
/* drop old route */ /* drop old route */
dst_release(skb->dst); dst_release(skb->dst);
skb->dst = &rt->u.dst; skb->dst = &rt->u.dst;
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
#ifdef CONFIG_NETFILTER_DEBUG #ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 0; skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */ #endif /* CONFIG_NETFILTER_DEBUG */
...@@ -234,8 +214,8 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ...@@ -234,8 +214,8 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp: tx_error_icmp:
dst_link_failure(skb); dst_link_failure(skb);
tx_error: tx_error:
kfree_skb(skb); LeaveFunction(10);
return NF_STOLEN; return NF_DROP;
} }
...@@ -248,45 +228,18 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ...@@ -248,45 +228,18 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp) struct ip_vs_protocol *pp)
{ {
struct rtable *rt; /* Route to the other host */ struct rtable *rt; /* Route to the other host */
struct iphdr *iph;
union ip_vs_tphdr h;
int ihl;
unsigned short size;
int mtu; int mtu;
struct iphdr *iph = skb->nh.iph;
EnterFunction(10); EnterFunction(10);
/* /* check if it is a connection of no-client-port */
* If it has ip_vs_app helper, the helper may change the payload,
* so it needs full checksum checking and checksum calculation.
* If not, only the header (such as IP address and port number)
* will be changed, so it is fast to do incremental checksum update,
* and let the destination host do final checksum checking.
*/
if (unlikely(cp->app && !pp->slave)) {
if (skb_is_nonlinear(skb) &&
skb_linearize(skb, GFP_ATOMIC) != 0)
return NF_DROP;
}
iph = skb->nh.iph;
ihl = iph->ihl << 2;
h.raw = (char*) iph + ihl;
size = ntohs(iph->tot_len) - ihl;
/* do TCP/UDP checksum checking if it has application helper */
if (unlikely(cp->app && pp->csum_check && !pp->slave)) {
if (!pp->csum_check(skb, pp, iph, h, size))
goto tx_error;
}
/*
* Check if it is no clinet port connection ...
*/
if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) { if (unlikely(cp->flags & IP_VS_CONN_F_NO_CPORT)) {
ip_vs_conn_fill_cport(cp, h.portp[0]); __u16 pt;
IP_VS_DBG(10, "filled cport=%d\n", ntohs(cp->dport)); if (skb_copy_bits(skb, iph->ihl*4, &pt, sizeof(pt)) < 0)
goto tx_error;
ip_vs_conn_fill_cport(cp, pt);
IP_VS_DBG(10, "filled cport=%d\n", ntohs(pt));
} }
if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos))))
...@@ -297,33 +250,36 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ...@@ -297,33 +250,36 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) {
ip_rt_put(rt); ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL_PKT(0, pp, iph, "ip_vs_nat_xmit(): frag needed for"); IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for");
goto tx_error; goto tx_error;
} }
/* copy-on-write the packet before mangling it */
if (!ip_vs_make_skb_writable(&skb, sizeof(struct iphdr)))
goto tx_error_put;
if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
goto tx_error_put;
/* drop old route */ /* drop old route */
dst_release(skb->dst); dst_release(skb->dst);
skb->dst = &rt->u.dst; skb->dst = &rt->u.dst;
/* copy-on-write the packet before mangling it */
if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, &iph, &h.raw))
return NF_DROP;
/* mangle the packet */ /* mangle the packet */
iph->daddr = cp->daddr; if (pp->dnat_handler && !pp->dnat_handler(&skb, pp, cp))
if (pp->dnat_handler) { goto tx_error;
pp->dnat_handler(skb, pp, cp, iph, h, size); skb->nh.iph->daddr = cp->daddr;
iph = skb->nh.iph; ip_send_check(skb->nh.iph);
h.raw = (char*) iph + ihl;
}
ip_send_check(iph);
IP_VS_DBG_PKT(10, pp, iph, "After DNAT"); IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT");
/* FIXME: when application helper enlarges the packet and the length /* FIXME: when application helper enlarges the packet and the length
is larger than the MTU of outgoing device, there will be still is larger than the MTU of outgoing device, there will be still
MTU problem. */ MTU problem. */
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
#ifdef CONFIG_NETFILTER_DEBUG #ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 0; skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */ #endif /* CONFIG_NETFILTER_DEBUG */
...@@ -335,8 +291,11 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ...@@ -335,8 +291,11 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp: tx_error_icmp:
dst_link_failure(skb); dst_link_failure(skb);
tx_error: tx_error:
kfree_skb(skb); LeaveFunction(10);
return NF_STOLEN; return NF_DROP;
tx_error_put:
ip_rt_put(rt);
goto tx_error;
} }
...@@ -405,11 +364,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ...@@ -405,11 +364,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
goto tx_error; goto tx_error;
} }
if (skb_is_nonlinear(skb))
ip_send_check(old_iph);
skb->h.raw = skb->nh.raw;
/* /*
* Okay, now see if we can stuff it in the buffer as-is. * Okay, now see if we can stuff it in the buffer as-is.
*/ */
...@@ -421,14 +375,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ...@@ -421,14 +375,19 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
skb_realloc_headroom(skb, max_headroom); skb_realloc_headroom(skb, max_headroom);
if (!new_skb) { if (!new_skb) {
ip_rt_put(rt); ip_rt_put(rt);
kfree_skb(skb);
IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n"); IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
return -EINVAL; return NF_DROP;
} }
kfree_skb(skb); kfree_skb(skb);
skb = new_skb; skb = new_skb;
old_iph = skb->nh.iph;
} }
skb->h.raw = (void *) old_iph;
/* fix old IP header checksum */
ip_send_check(old_iph);
skb->nh.raw = skb_push(skb, sizeof(struct iphdr)); skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
...@@ -453,9 +412,14 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ...@@ -453,9 +412,14 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
ip_send_check(iph); ip_send_check(iph);
skb->ip_summed = CHECKSUM_NONE; skb->ip_summed = CHECKSUM_NONE;
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
#ifdef CONFIG_NETFILTER_DEBUG #ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 0; skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */ #endif /* CONFIG_NETFILTER_DEBUG */
IP_VS_XMIT(skb, rt); IP_VS_XMIT(skb, rt);
LeaveFunction(10); LeaveFunction(10);
...@@ -465,8 +429,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ...@@ -465,8 +429,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp: tx_error_icmp:
dst_link_failure(skb); dst_link_failure(skb);
tx_error: tx_error:
kfree_skb(skb); LeaveFunction(10);
return NF_STOLEN; return NF_DROP;
} }
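The tunnelling changes assemble into the following sketch of the new headroom and inner-checksum handling in ip_vs_tunnel_xmit(). The guard condition is simplified here; the full function also re-checks for cloned/shared skbs, which is an assumption not visible in this hunk.

        if (skb_headroom(skb) < max_headroom /* real test also covers
                                              * cloned/shared skbs */) {
                struct sk_buff *new_skb =
                        skb_realloc_headroom(skb, max_headroom);
                if (!new_skb) {
                        ip_rt_put(rt);
                        IP_VS_ERR_RL("ip_vs_tunnel_xmit(): no memory\n");
                        return NF_DROP;  /* no kfree_skb(): caller keeps the skb */
                }
                kfree_skb(skb);
                skb = new_skb;
                old_iph = skb->nh.iph;   /* headers may have moved */
        }

        skb->h.raw = (void *) old_iph;

        /* fix old IP header checksum before it becomes the inner header */
        ip_send_check(old_iph);

        /* prepend the outer IPIP header */
        skb->nh.raw = skb_push(skb, sizeof(struct iphdr));
        memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));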
...@@ -496,21 +460,23 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ...@@ -496,21 +460,23 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
goto tx_error; goto tx_error;
} }
if (skb_is_nonlinear(skb) && skb->len <= mtu) /*
ip_send_check(iph); * Call ip_send_check because we are not sure it is called
* after ip_defrag. Is copy-on-write needed?
if (unlikely(skb_headroom(skb) < rt->u.dst.dev->hard_header_len)) { */
if (skb_cow(skb, rt->u.dst.dev->hard_header_len)) { if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
ip_rt_put(rt); ip_rt_put(rt);
IP_VS_ERR_RL("ip_vs_dr_xmit(): no memory\n"); return NF_STOLEN;
goto tx_error;
}
} }
ip_send_check(skb->nh.iph);
/* drop old route */ /* drop old route */
dst_release(skb->dst); dst_release(skb->dst);
skb->dst = &rt->u.dst; skb->dst = &rt->u.dst;
/* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
#ifdef CONFIG_NETFILTER_DEBUG #ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 0; skb->nf_debug = 0;
#endif /* CONFIG_NETFILTER_DEBUG */ #endif /* CONFIG_NETFILTER_DEBUG */
...@@ -522,8 +488,8 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ...@@ -522,8 +488,8 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
tx_error_icmp: tx_error_icmp:
dst_link_failure(skb); dst_link_failure(skb);
tx_error: tx_error:
kfree_skb(skb); LeaveFunction(10);
return NF_STOLEN; return NF_DROP;
} }
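The direct-routing changes are small enough to read as one piece; this sketch strings the new lines of ip_vs_dr_xmit() together, with the IP_VS_XMIT() call and error labels omitted.

        /*
         * Call ip_send_check because we are not sure it is called
         * after ip_defrag. Is copy-on-write needed?
         */
        if (unlikely((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL)) {
                ip_rt_put(rt);
                return NF_STOLEN;
        }
        ip_send_check(skb->nh.iph);

        /* drop the old route, attach the direct route to the real server */
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

        /* ... IP_VS_XMIT(skb, rt) follows; errors fall through to
         *     tx_error and now return NF_DROP ... */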
...@@ -533,14 +499,9 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ...@@ -533,14 +499,9 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
*/ */
int int
ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
struct ip_vs_protocol *pp) struct ip_vs_protocol *pp, int offset)
{ {
struct rtable *rt; /* Route to the other host */ struct rtable *rt; /* Route to the other host */
struct iphdr *iph;
struct icmphdr *icmph;
struct iphdr *ciph; /* The ip header contained within the ICMP */
unsigned short len;
union ip_vs_tphdr h;
int mtu; int mtu;
int rc; int rc;
...@@ -559,60 +520,37 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ...@@ -559,60 +520,37 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
goto out; goto out;
} }
iph = skb->nh.iph;
icmph = (struct icmphdr *)((char *)iph+(iph->ihl<<2));
len = ntohs(iph->tot_len) - (iph->ihl<<2);
/* /*
* mangle and send the packet here (only for VS/NAT) * mangle and send the packet here (only for VS/NAT)
*/ */
if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(skb->nh.iph->tos))))
goto tx_error_icmp; goto tx_error_icmp;
/* MTU checking */ /* MTU checking */
mtu = dst_pmtu(&rt->u.dst); mtu = dst_pmtu(&rt->u.dst);
if ((skb->len > mtu) && (iph->frag_off&__constant_htons(IP_DF))) { if ((skb->len > mtu) && (skb->nh.iph->frag_off&__constant_htons(IP_DF))) {
ip_rt_put(rt); ip_rt_put(rt);
icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu));
IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n"); IP_VS_DBG_RL("ip_vs_in_icmp(): frag needed\n");
goto tx_error; goto tx_error;
} }
/* drop old route */
dst_release(skb->dst);
skb->dst = &rt->u.dst;
/* copy-on-write the packet before mangling it */ /* copy-on-write the packet before mangling it */
if (ip_vs_skb_cow(skb, rt->u.dst.dev->hard_header_len, if (!ip_vs_make_skb_writable(&skb, offset))
&iph, (unsigned char**)&icmph)) { goto tx_error_put;
rc = NF_DROP;
goto out;
}
ciph = (struct iphdr *) (icmph + 1);
h.raw = (char *) ciph + (ciph->ihl << 2);
/* The ICMP packet for VS/NAT must be written to correct addresses if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
before being forwarded to the right server */ goto tx_error_put;
/* First change the dest IP address, and recalc checksum */ /* drop the old route when skb is not shared */
iph->daddr = cp->daddr; dst_release(skb->dst);
ip_send_check(iph); skb->dst = &rt->u.dst;
/* Now change the *source* address in the contained IP */
ciph->saddr = cp->daddr;
ip_send_check(ciph);
/* the TCP/UDP source port - cannot redo check */
if (IPPROTO_TCP == ciph->protocol || IPPROTO_UDP == ciph->protocol)
h.portp[0] = cp->dport;
/* And finally the ICMP checksum */ ip_vs_nat_icmp(skb, pp, cp, 0);
icmph->checksum = 0;
icmph->checksum = ip_compute_csum((unsigned char *) icmph, len);
skb->ip_summed = CHECKSUM_UNNECESSARY;
IP_VS_DBG_PKT(11, pp, ciph, "Forwarding incoming ICMP"); /* Another hack: avoid icmp_send in ip_fragment */
skb->local_df = 1;
#ifdef CONFIG_NETFILTER_DEBUG #ifdef CONFIG_NETFILTER_DEBUG
skb->nf_debug = 0; skb->nf_debug = 0;
...@@ -630,4 +568,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, ...@@ -630,4 +568,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
out: out:
LeaveFunction(10); LeaveFunction(10);
return rc; return rc;
tx_error_put:
ip_rt_put(rt);
goto tx_error;
} }
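Finally, the VS/NAT branch of ip_vs_icmp_xmit() now defers all header rewriting to ip_vs_nat_icmp(); the new-column lines above combine into roughly this sketch (transmission and the final return are omitted).

        /* copy-on-write the first 'offset' bytes of the packet
         * before mangling it */
        if (!ip_vs_make_skb_writable(&skb, offset))
                goto tx_error_put;
        if (skb_cow(skb, rt->u.dst.dev->hard_header_len))
                goto tx_error_put;

        /* drop the old route when skb is not shared */
        dst_release(skb->dst);
        skb->dst = &rt->u.dst;

        /* address, port and checksum rewriting now lives in ip_vs_nat_icmp() */
        ip_vs_nat_icmp(skb, pp, cp, 0);

        /* Another hack: avoid icmp_send in ip_fragment */
        skb->local_df = 1;

        /* ... IP_VS_XMIT(skb, rt) and the normal return follow here ... */

tx_error_put:
        ip_rt_put(rt);
        goto tx_error;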