Commit aba5acdf authored by osdl.org!shemminger's avatar osdl.org!shemminger

(Logical change 1.3)

parent 86fdf0e4
# Build-time feature switches, both disabled here ("n").
# NOTE(review): presumably consumed by tc/Makefile to enable the Diffserv and
# ATM qdisc support respectively - confirm against the tc build rules.
TC_CONFIG_DIFFSERV=n
TC_CONFIG_ATM=n
# Installation layout. DESTDIR is an optional staging prefix that the install
# target prepends to every destination path; empty means install to the live
# filesystem.
DESTDIR=
SBINDIR=/sbin
CONFDIR=/etc/iproute2
DOCDIR=/usr/doc/iproute2
# Path to parent kernel include files directory. It is added to CFLAGS with -I
# and check-kernel requires linux/autoconf.h to be readable beneath it.
KERNEL_INCLUDE=/usr/src/linux/include
# libc header directory; probed below to select the glibc workaround flags.
LIBC_INCLUDE=/usr/include
# Extra preprocessor defines appended to CFLAGS.
DEFINES= -DRESOLVE_HOSTNAMES
#options if you have a bind>=4.9.4 libresolv (or, maybe, glibc)
LDLIBS=-lresolv
ADDLIB=
#options if you compile with libc5, and without a bind>=4.9.4 libresolv
#LDLIBS=
#ADDLIB=inet_ntop.o inet_pton.o
#options for decnet
ADDLIB+=dnet_ntop.o dnet_pton.o
#options for ipx
ADDLIB+=ipx_ntop.o ipx_pton.o
# glibc workaround, case 1: when both socketbits.h and net/if_packet.h exist
# under LIBC_INCLUDE, search ../include-glibc first and force-include
# glibc-bugs.h into every compilation unit.
ifeq ($(LIBC_INCLUDE)/socketbits.h,$(wildcard $(LIBC_INCLUDE)/socketbits.h))
ifeq ($(LIBC_INCLUDE)/net/if_packet.h,$(wildcard $(LIBC_INCLUDE)/net/if_packet.h))
GLIBCFIX=-I../include-glibc -include ../include-glibc/glibc-bugs.h
endif
endif
# glibc workaround, case 2: when bits/socket.h exists, use the same flags plus
# -I/usr/include/db3. This assignment comes last, so it replaces the value set
# by case 1 if both conditions hold.
ifeq ($(LIBC_INCLUDE)/bits/socket.h,$(wildcard $(LIBC_INCLUDE)/bits/socket.h))
GLIBCFIX=-I../include-glibc -I/usr/include/db3 -include ../include-glibc/glibc-bugs.h
endif
CC = gcc
CCOPTS = -D_GNU_SOURCE -O2 -Wstrict-prototypes -Wall -g
# The ../-relative paths below are written from the point of view of the
# subdirectories entered with $(MAKE) -C, not of this top-level directory.
CFLAGS = $(CCOPTS) $(GLIBCFIX) -I$(KERNEL_INCLUDE) -I../include $(DEFINES)
LDLIBS += -L../lib -lnetlink -lutil
# Subdirectories visited, in this order, by the all/install/clean loops
# (install and clean additionally visit doc).
SUBDIRS=lib ip tc misc
# Static libraries from lib/.
# NOTE(review): presumably used as link prerequisites by the subdirectory
# Makefiles - confirm there.
LIBNETLINK=../lib/libnetlink.a ../lib/libutil.a
# Default target: once check-kernel has passed, run $(MAKE) in each directory
# listed in SUBDIRS, in order. `set -e` makes the shell loop stop at the first
# sub-make that fails, so the failure propagates to this make.
all: check-kernel
@set -e; \
for i in $(SUBDIRS); \
do $(MAKE) -C $$i; done
# Sanity check run before building. The ifeq is evaluated when make parses the
# file, so exactly one of the two recipes below is kept:
#  - KERNEL_INCLUDE empty: print a hint and fail.
#  - otherwise: fail unless $(KERNEL_INCLUDE)/linux/autoconf.h is readable,
#    i.e. the kernel tree has not been configured/compiled yet.
check-kernel:
ifeq ($(KERNEL_INCLUDE),)
@echo "Please, set correct KERNEL_INCLUDE"; false
else
@set -e; \
if [ ! -r $(KERNEL_INCLUDE)/linux/autoconf.h ]; then \
echo "Please, compile the kernel first"; false; fi
endif
# Install binaries, documentation examples and default configuration files
# under $(DESTDIR).
#
#  1. Create the sbin, configuration and documentation directories.
#  2. Copy README.iproute2+tc and the regular files directly inside examples/
#     (not its subdirectories) to $(DOCDIR)/examples, and everything in
#     examples/diffserv to $(DOCDIR)/examples/diffserv.
#  3. Run "make install" in every SUBDIRS directory and in doc; `set -e`
#     aborts at the first failing sub-make instead of silently carrying on,
#     matching the behaviour of the "all" target.
#  4. Copy each file from etc/iproute2 into $(CONFDIR) only if no file of that
#     name already exists there, so local configuration is never overwritten.
#     `set -e` also stops the recipe if the cd fails, rather than running the
#     loop in the wrong directory.
#
# -maxdepth is given before -type because GNU find treats it as a global
# option and warns when it follows a test.
install: all
	install -m 0755 -d $(DESTDIR)$(SBINDIR)
	install -m 0755 -d $(DESTDIR)$(CONFDIR)
	install -m 0755 -d $(DESTDIR)$(DOCDIR)/examples
	install -m 0755 -d $(DESTDIR)$(DOCDIR)/examples/diffserv
	install -m 0644 README.iproute2+tc $(shell find examples -maxdepth 1 -type f) $(DESTDIR)$(DOCDIR)/examples
	install -m 0644 $(shell echo examples/diffserv/*) $(DESTDIR)$(DOCDIR)/examples/diffserv
	@set -e; for i in $(SUBDIRS) doc; do $(MAKE) -C $$i install; done
	@set -e; cd etc/iproute2; for i in *; do \
		if [ ! -e $(DESTDIR)$(CONFDIR)/$$i ]; then \
			echo install -m 0644 $$i $(DESTDIR)$(CONFDIR); \
			install -m 0644 $$i $(DESTDIR)$(CONFDIR); \
		fi; \
	done
# Run "make clean" in every SUBDIRS directory and in doc. There is no `set -e`
# here, so the loop continues past a failing sub-make and the recipe's exit
# status is that of the last directory processed.
clean:
for i in $(SUBDIRS) doc; \
do $(MAKE) -C $$i clean; done
# Export every variable defined above (CC, CFLAGS, LDLIBS, ADDLIB, ...) to the
# environment of the sub-makes started by the recipes in this file.
.EXPORT_ALL_VARIABLES:
File: tcp_diag.c
Status: desired for kernels < 2.4.17
not needed for kernels >= 2.4.17
Description: adds tcpdiag facility to kernel to accelerate ss utility
and pidentd
Side effects: none
\ No newline at end of file
This diff is collapsed.
File: rt_cache_stat.dif
Apply to: kernel < 2.4.7
Status: recommended for kernels < 2.4.7.
already present in >= 2.4.7
Description: tracing efficiency of routing cache
Side effects: none
File: pidentd-3.0.12.dif
Apply to: pidentd-3.0.12 tree, e.g. from a Red Hat RPM
Status: highly recommended
Description: Patch to pidentd that lets it use the tcpdiag facility and fixes
some bugs in the original pidentd.
Side effects: none. Does not break anything, regardless of kernel version,
even if tcpdiag is absent.
Advice: not related to this patch, but it should be said anyway.
Do NOT configure pidentd to use threads! Use the option
"--without-threads" when running "configure".
pidentd is a typical example of an application where
threading results in nothing but a collapse of performance.
Apparently the author learned thread programming and decided
to apply the new knowledge to the first victim.
File: symbol_exports.dif
Apply to: kernel < 2.4.17
Status: desired for kernels < 2.4.17
not needed for kernels >= 2.4.17
Description: exports symbols required to load tcpdiag module
tcpdiag is builtin since 2.4.17, hence the exports
are redundant.
Side effects: none
File: af_unix.dif
Apply to: kernel
Status: recommended
Description: implements fragmented skbs for unix sockets, reducing
VM pressure for datagram sockets, and adds columns to /proc/net/unix
that allow monitoring recv/send memory and identifying the
peer of connected sockets.
Side effects: "lsof" complains about something regarding unix sockets.
Not a big loss: lsof is not able to tell anything more
clever than "can't identify protocol" for sockets anyway.
Note: the patch touches an area where one or two lines have changed
several times during 2.4. It does not depend on those changes,
but unfortunately some hunks may be rejected. It applies cleanly to
2.4.17.
diff -ur ../vger3-011229/linux/net/unix/af_unix.c linux/net/unix/af_unix.c
--- ../vger3-011229/linux/net/unix/af_unix.c Mon Dec 3 20:24:03 2001
+++ linux/net/unix/af_unix.c Sat Jan 5 04:30:19 2002
@@ -112,6 +112,7 @@
#include <asm/checksum.h>
int sysctl_unix_max_dgram_qlen = 10;
+int sysctl_unix_stream_pages = MAX_SKB_FRAGS;
unix_socket *unix_socket_table[UNIX_HASH_SIZE+1];
rwlock_t unix_table_lock = RW_LOCK_UNLOCKED;
@@ -1123,9 +1124,6 @@
struct scm_cookie scm;
memset(&scm, 0, sizeof(scm));
unix_detach_fds(&scm, skb);
-
- /* Alas, it calls VFS */
- /* So fscking what? fput() had been SMP-safe since the last Summer */
scm_destroy(&scm);
sock_wfree(skb);
}
@@ -1140,6 +1138,67 @@
scm->fp = NULL;
}
+int datagram_copy_fromiovec(struct iovec *iov, struct sk_buff *skb, int size)
+{
+ struct sock *sk;
+ struct sk_buff **tail, *skb1;
+ int copy = min_t(int, size, skb_tailroom(skb));
+
+ if (memcpy_fromiovec(skb_put(skb, copy), iov, copy))
+ goto do_fault;
+
+ if ((size -= copy) == 0)
+ return 0;
+
+ sk = skb->sk;
+ skb1 = skb;
+ tail = &skb_shinfo(skb)->frag_list;
+
+ do {
+ struct page *page;
+ int i = skb_shinfo(skb1)->nr_frags;
+
+ if (i == MAX_SKB_FRAGS) {
+ skb1 = alloc_skb(0, sk->allocation);
+ if (skb1 == NULL)
+ goto do_oom;
+ *tail = skb1;
+ tail = &skb1->next;
+ i = 0;
+ skb->truesize += skb1->truesize;
+ atomic_add(skb1->truesize, &sk->wmem_alloc);
+ }
+
+ page = alloc_pages(sk->allocation, 0);
+ if (page == NULL)
+ goto do_oom;
+
+ copy = min_t(int, size, PAGE_SIZE);
+ skb_shinfo(skb1)->nr_frags=i+1;
+ skb_shinfo(skb1)->frags[i].page = page;
+ skb_shinfo(skb1)->frags[i].page_offset = 0;
+ skb_shinfo(skb1)->frags[i].size = copy;
+
+ skb1->len += copy;
+ skb1->data_len += copy;
+ if (skb != skb1) {
+ skb->len += copy;
+ skb->data_len += copy;
+ }
+ skb->truesize += PAGE_SIZE;
+ atomic_add(PAGE_SIZE, &sk->wmem_alloc);
+ if (memcpy_fromiovec(page_address(page), iov, copy))
+ goto do_fault;
+ } while ((size -= copy) > 0);
+ return 0;
+
+do_oom:
+ return -ENOMEM;
+
+do_fault:
+ return -EFAULT;
+}
+
/*
* Send AF_UNIX data.
*/
@@ -1155,6 +1214,7 @@
unsigned hash;
struct sk_buff *skb;
long timeo;
+ int alloc;
err = -EOPNOTSUPP;
if (msg->msg_flags&MSG_OOB)
@@ -1178,10 +1238,14 @@
goto out;
err = -EMSGSIZE;
- if ((unsigned)len > sk->sndbuf - 32)
+ if ((unsigned)len > sk->sndbuf)
goto out;
- skb = sock_alloc_send_skb(sk, len, msg->msg_flags&MSG_DONTWAIT, &err);
+ alloc = len;
+ if (alloc > SKB_MAX_HEAD(0))
+ alloc = SKB_MAX_HEAD(0);
+
+ skb = sock_alloc_send_skb(sk, alloc, msg->msg_flags&MSG_DONTWAIT, &err);
if (skb==NULL)
goto out;
@@ -1190,7 +1254,7 @@
unix_attach_fds(scm, skb);
skb->h.raw = skb->data;
- err = memcpy_fromiovec(skb_put(skb,len), msg->msg_iov, len);
+ err = datagram_copy_fromiovec(msg->msg_iov, skb, len);
if (err)
goto out_free;
@@ -1275,74 +1339,57 @@
return err;
}
-
static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, int len,
struct scm_cookie *scm)
{
struct sock *sk = sock->sk;
unix_socket *other = NULL;
- struct sockaddr_un *sunaddr=msg->msg_name;
- int err,size;
struct sk_buff *skb;
+ int err;
int sent=0;
err = -EOPNOTSUPP;
if (msg->msg_flags&MSG_OOB)
goto out_err;
- if (msg->msg_namelen) {
- err = (sk->state==TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP);
+ err = -ENOTCONN;
+ other = unix_peer_get(sk);
+ if (!other)
goto out_err;
- } else {
- sunaddr = NULL;
- err = -ENOTCONN;
- other = unix_peer_get(sk);
- if (!other)
- goto out_err;
- }
if (sk->shutdown&SEND_SHUTDOWN)
goto pipe_err;
- while(sent < len)
- {
- /*
- * Optimisation for the fact that under 0.01% of X messages typically
- * need breaking up.
- */
+ while(sent < len) {
+ int size, alloc;
- size=len-sent;
+ size = len-sent;
/* Keep two messages in the pipe so it schedules better */
- if (size > sk->sndbuf/2 - 64)
- size = sk->sndbuf/2 - 64;
+ if (size > sk->sndbuf/2)
+ size = sk->sndbuf/2;
- if (size > SKB_MAX_ALLOC)
- size = SKB_MAX_ALLOC;
-
/*
* Grab a buffer
*/
-
- skb=sock_alloc_send_skb(sk,size,msg->msg_flags&MSG_DONTWAIT, &err);
+ alloc = size;
+
+ if (size > SKB_MAX_HEAD(0)) {
+ alloc = SKB_MAX_HEAD(0);
+ if (size > alloc + sysctl_unix_stream_pages*PAGE_SIZE)
+ size = alloc + sysctl_unix_stream_pages*PAGE_SIZE;
+ }
+
+ skb=sock_alloc_send_skb(sk,alloc,msg->msg_flags&MSG_DONTWAIT, &err);
if (skb==NULL)
goto out_err;
- /*
- * If you pass two values to the sock_alloc_send_skb
- * it tries to grab the large buffer with GFP_NOFS
- * (which can fail easily), and if it fails grab the
- * fallback size buffer which is under a page and will
- * succeed. [Alan]
- */
- size = min_t(int, size, skb_tailroom(skb));
-
memcpy(UNIXCREDS(skb), &scm->creds, sizeof(struct ucred));
if (scm->fp)
unix_attach_fds(scm, skb);
- if ((err = memcpy_fromiovec(skb_put(skb,size), msg->msg_iov, size)) != 0) {
+ if ((err = datagram_copy_fromiovec(msg->msg_iov, skb, size)) != 0) {
kfree_skb(skb);
goto out_err;
}
@@ -1418,13 +1465,10 @@
scm->creds = *UNIXCREDS(skb);
- if (!(flags & MSG_PEEK))
- {
+ if (!(flags & MSG_PEEK)) {
if (UNIXCB(skb).fp)
unix_detach_fds(scm, skb);
- }
- else
- {
+ } else {
/* It is questionable: on PEEK we could:
- do not return fds - good, but too simple 8)
- return fds, and do not return them on read (old strategy,
@@ -1483,13 +1527,10 @@
return timeo;
}
-
-
static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, int size,
int flags, struct scm_cookie *scm)
{
struct sock *sk = sock->sk;
- struct sockaddr_un *sunaddr=msg->msg_name;
int copied = 0;
int check_creds = 0;
int target;
@@ -1515,21 +1556,18 @@
down(&sk->protinfo.af_unix.readsem);
- do
- {
+ do {
int chunk;
struct sk_buff *skb;
skb=skb_dequeue(&sk->receive_queue);
- if (skb==NULL)
- {
+ if (skb==NULL) {
if (copied >= target)
break;
/*
* POSIX 1003.1g mandates this order.
*/
-
if ((err = sock_error(sk)) != 0)
break;
if (sk->shutdown & RCV_SHUTDOWN)
@@ -1551,60 +1589,44 @@
if (check_creds) {
/* Never glue messages from different writers */
- if (memcmp(UNIXCREDS(skb), &scm->creds, sizeof(scm->creds)) != 0) {
- skb_queue_head(&sk->receive_queue, skb);
- break;
- }
+ if (memcmp(UNIXCREDS(skb), &scm->creds, sizeof(scm->creds)) != 0)
+ goto out_put_back;
} else {
/* Copy credentials */
scm->creds = *UNIXCREDS(skb);
check_creds = 1;
}
- /* Copy address just once */
- if (sunaddr)
- {
- unix_copy_addr(msg, skb->sk);
- sunaddr = NULL;
- }
+ chunk = min_t(int, skb->len - sk->protinfo.af_unix.copied, size);
+ err = skb_copy_datagram_iovec(skb, sk->protinfo.af_unix.copied, msg->msg_iov, chunk);
+ if (err)
+ goto out_put_back;
- chunk = min_t(unsigned int, skb->len, size);
- if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) {
- skb_queue_head(&sk->receive_queue, skb);
- if (copied == 0)
- copied = -EFAULT;
- break;
- }
copied += chunk;
size -= chunk;
/* Mark read part of skb as used */
- if (!(flags & MSG_PEEK))
- {
- skb_pull(skb, chunk);
-
+ if (!(flags & MSG_PEEK)) {
if (UNIXCB(skb).fp)
unix_detach_fds(scm, skb);
/* put the skb back if we didn't use it up.. */
- if (skb->len)
- {
- skb_queue_head(&sk->receive_queue, skb);
- break;
- }
+ if ((sk->protinfo.af_unix.copied += chunk) < skb->len)
+ goto out_put_back;
+
+ sk->protinfo.af_unix.copied = 0;
kfree_skb(skb);
if (scm->fp)
break;
- }
- else
- {
+ } else {
/* It is questionable, see note in unix_dgram_recvmsg.
*/
if (UNIXCB(skb).fp)
scm->fp = scm_fp_dup(UNIXCB(skb).fp);
+out_put_back:
/* put message back and return */
skb_queue_head(&sk->receive_queue, skb);
break;
@@ -1676,10 +1698,12 @@
break;
}
+ down(&sk->protinfo.af_unix.readsem);
spin_lock(&sk->receive_queue.lock);
if((skb=skb_peek(&sk->receive_queue))!=NULL)
- amount=skb->len;
+ amount=skb->len - sk->protinfo.af_unix.copied;
spin_unlock(&sk->receive_queue.lock);
+ up(&sk->protinfo.af_unix.readsem);
err = put_user(amount, (int *)arg);
break;
}
@@ -1734,7 +1758,7 @@
int i;
unix_socket *s;
- len+= sprintf(buffer,"Num RefCount Protocol Flags Type St "
+ len+= sprintf(buffer,"Peer RcvQueue WMem Flags Type St "
"Inode Path\n");
read_lock(&unix_table_lock);
@@ -1742,10 +1766,10 @@
{
unix_state_rlock(s);
- len+=sprintf(buffer+len,"%p: %08X %08X %08X %04X %02X %5ld",
- s,
- atomic_read(&s->refcnt),
- 0,
+ len+=sprintf(buffer+len,"%08lX: %08X %08X %08X %04X %02X %5ld",
+ unix_peer(s) ? sock_i_ino(unix_peer(s)) : 0,
+ skb_queue_len(&s->receive_queue),
+ atomic_read(&s->wmem_alloc),
s->state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
s->type,
s->socket ?
diff -ur ../vger3-011229/linux/net/unix/sysctl_net_unix.c linux/net/unix/sysctl_net_unix.c
--- ../vger3-011229/linux/net/unix/sysctl_net_unix.c Tue Jan 30 21:20:16 2001
+++ linux/net/unix/sysctl_net_unix.c Sat Jan 5 04:10:58 2002
@@ -13,10 +13,14 @@
#include <linux/sysctl.h>
extern int sysctl_unix_max_dgram_qlen;
+extern int sysctl_unix_stream_pages;
ctl_table unix_table[] = {
{NET_UNIX_MAX_DGRAM_QLEN, "max_dgram_qlen",
&sysctl_unix_max_dgram_qlen, sizeof(int), 0600, NULL,
+ &proc_dointvec },
+ {NET_UNIX_STREAM_PAGES, "stream_pages",
+ &sysctl_unix_stream_pages, sizeof(int), 0600, NULL,
&proc_dointvec },
{0}
};
diff -ur ../pidentd-3.0.12-orig/src/k_linux.c ./src/k_linux.c
--- ../pidentd-3.0.12-orig/src/k_linux.c Sat Jan 12 00:44:05 2002
+++ ./src/k_linux.c Sat Nov 3 07:51:28 2001
@@ -26,12 +26,65 @@
#include "pidentd.h"
+#define NETLINK_TCPDIAG 4
+#define TCPDIAG_GETSOCK 18
+
+#include <linux/uio.h>
+#include <linux/netlink.h>
+
+/* Socket identity */
+struct tcpdiag_sockid
+{
+ __u16 tcpdiag_sport;
+ __u16 tcpdiag_dport;
+ __u32 tcpdiag_src[4];
+ __u32 tcpdiag_dst[4];
+ __u32 tcpdiag_if;
+ __u32 tcpdiag_cookie[2];
+#define TCPDIAG_NOCOOKIE (~0U)
+};
+
+/* Request structure */
+
+struct tcpdiagreq
+{
+ __u8 tcpdiag_family; /* Family of addresses. */
+ __u8 tcpdiag_src_len;
+ __u8 tcpdiag_dst_len;
+ __u8 tcpdiag_ext; /* Query extended information */
+
+ struct tcpdiag_sockid id;
+
+ __u32 tcpdiag_states; /* States to dump */
+ __u32 tcpdiag_dbs; /* Tables to dump (NI) */
+};
+
+struct tcpdiagmsg
+{
+ __u8 tcpdiag_family;
+ __u8 tcpdiag_state;
+ __u8 tcpdiag_timer;
+ __u8 tcpdiag_retrans;
+
+ struct tcpdiag_sockid id;
+
+ __u32 tcpdiag_expires;
+ __u32 tcpdiag_rqueue;
+ __u32 tcpdiag_wqueue;
+ __u32 tcpdiag_uid;
+ __u32 tcpdiag_inode;
+};
+
+
+int tcpdiag_fd = -1;
+
/*
** Make sure we are running on a supported OS version
*/
int
ka_init(void)
{
+ tcpdiag_fd = socket(AF_NETLINK, SOCK_DGRAM, NETLINK_TCPDIAG);
return 0; /* We always succeed */
}
@@ -56,6 +109,144 @@
}
+
+int k_lookup_tcpdiag(struct kernel *kp)
+{
+ struct sockaddr_nl nladdr;
+ struct {
+ struct nlmsghdr nlh;
+ struct tcpdiagreq r;
+ } req;
+ struct msghdr msg;
+ char buf[8192];
+ struct iovec iov[1];
+ struct tcpdiagmsg *r;
+ static unsigned seqno = 123456;
+
+ memset(&nladdr, 0, sizeof(nladdr));
+ nladdr.nl_family = AF_NETLINK;
+
+ req.nlh.nlmsg_len = sizeof(req);
+ req.nlh.nlmsg_type = TCPDIAG_GETSOCK;
+ req.nlh.nlmsg_flags = NLM_F_REQUEST;
+ req.nlh.nlmsg_pid = 0;
+ req.nlh.nlmsg_seq = ++seqno;
+ memset(&req.r, 0, sizeof(req.r));
+ req.r.tcpdiag_family = AF_INET;
+ req.r.tcpdiag_states = ~0;
+
+ req.r.id.tcpdiag_dport = kp->remote.sin_port;
+ req.r.id.tcpdiag_sport = kp->local.sin_port;
+ req.r.id.tcpdiag_dst[0] = kp->remote.sin_addr.s_addr;
+ req.r.id.tcpdiag_src[0] = kp->local.sin_addr.s_addr;
+ req.r.id.tcpdiag_cookie[0] = TCPDIAG_NOCOOKIE;
+ req.r.id.tcpdiag_cookie[1] = TCPDIAG_NOCOOKIE;
+ kp->ruid = NO_UID;
+
+ iov[0] = (struct iovec){ &req, sizeof(req) };
+
+ msg = (struct msghdr) {
+ (void*)&nladdr, sizeof(nladdr),
+ iov, 1,
+ NULL, 0,
+ 0
+ };
+
+ if (sendmsg(tcpdiag_fd, &msg, 0) < 0) {
+ if (errno == ECONNREFUSED) {
+ close(tcpdiag_fd);
+ tcpdiag_fd = -1;
+ return 0;
+ }
+ syslog(LOG_ERR, "system error on tcpdiag sendmsg: %m");
+ return -1;
+ }
+
+ iov[0] = (struct iovec){ buf, sizeof(buf) };
+
+ while (1) {
+ int status;
+ struct nlmsghdr *h;
+
+ msg = (struct msghdr) {
+ (void*)&nladdr, sizeof(nladdr),
+ iov, 1,
+ NULL, 0,
+ 0
+ };
+
+ status = recvmsg(tcpdiag_fd, &msg, 0);
+
+ if (status < 0) {
+ if (errno == EINTR || errno == EAGAIN)
+ continue;
+ return -1;
+ }
+ if (status == 0) {
+ return -1;
+ }
+
+ h = (struct nlmsghdr*)buf;
+ while (NLMSG_OK(h, status)) {
+ int err;
+
+ if (/*h->nlmsg_pid != rth->local.nl_pid ||*/
+ h->nlmsg_seq != seqno)
+ goto skip_it;
+
+ if (h->nlmsg_type == NLMSG_DONE)
+ return -1;
+ if (h->nlmsg_type == NLMSG_ERROR) {
+ struct nlmsgerr *err = (struct nlmsgerr*)NLMSG_DATA(h);
+ if (h->nlmsg_len < NLMSG_LENGTH(sizeof(struct nlmsgerr))) {
+ return -1;
+ } else {
+ errno = -err->error;
+ if (errno == ECONNREFUSED) {
+ close(tcpdiag_fd);
+ tcpdiag_fd = -1;
+ return 0;
+ }
+ if (errno != ENOENT)
+ syslog(LOG_ERR, "tcpdiag answers: %m");
+ }
+ return -1;
+ }
+
+ r = NLMSG_DATA(h);
+
+ /* Lookup _may_ return listening socket, if no
+ * better matches are found. */
+ if (r->id.tcpdiag_dport == kp->remote.sin_port &&
+ r->id.tcpdiag_dst[0] == kp->remote.sin_addr.s_addr) {
+ kp->ruid = r->tcpdiag_uid;
+ if (!r->tcpdiag_inode && !r->tcpdiag_uid) {
+ /* _NEVER_ return "root" for closed
+ * sockets. Otherwise people think
+ * that it is sysadmin who abuses their
+ * poor ircd. :-) */
+ syslog(LOG_NOTICE,
+ "Req for stale socket(%d) %d from %x/%d",
+ r->tcpdiag_state, ntohs(r->id.tcpdiag_sport),
+ r->id.tcpdiag_dst[0], ntohs(r->id.tcpdiag_dport));
+ return -1;
+ }
+ return 1;
+ }
+
+ return -1;
+
+skip_it:
+ h = NLMSG_NEXT(h, status);
+ }
+ if ((msg.msg_flags & MSG_TRUNC) || status) {
+ syslog(LOG_ERR, "truncated tcp_diag message");
+ return -1;
+ }
+ }
+}
+
+
int
ka_lookup(void *vp, struct kernel *kp)
{
@@ -64,16 +255,23 @@
long r_laddr, r_raddr, myladdr, myraddr;
int r_lport, r_rport, mylport, myrport;
int euid;
-
-
+
+ if (tcpdiag_fd >= 0) {
+ int res;
+ if ((res = k_lookup_tcpdiag(kp)) != 0)
+ return res;
+ syslog(LOG_ERR, "tcp_diag is not loaded, fallback to proc");
+ }
+
+
r_rport = ntohs(kp->remote.sin_port);
r_lport = ntohs(kp->local.sin_port);
r_raddr = kp->remote.sin_addr.s_addr;
r_laddr = kp->local.sin_addr.s_addr;
+ kp->ruid = NO_UID;
fp = (FILE *) vp;
- kp->ruid = NO_UID;
rewind(fp);
/* eat header */
@@ -82,13 +280,26 @@
while (fgets(buf, sizeof(buf)-1, fp) != NULL)
{
- if (sscanf(buf, "%*d: %lx:%x %lx:%x %*x %*x:%*x %*x:%*x %*x %d %*d %*d",
- &myladdr, &mylport, &myraddr, &myrport, &euid) == 5)
+ int state, ino;
+ if (sscanf(buf, "%*d: %x:%x %x:%x %x %*x:%*x %*x:%*x %*x %d %*d %u",
+ &myladdr, &mylport, &myraddr, &myrport,
+ &state, &euid, &ino) == 7)
{
if (myladdr == r_laddr && mylport == r_lport &&
myraddr == r_raddr && myrport == r_rport)
{
kp->euid = euid;
+ if (ino == 0 && euid == 0)
+ {
+ /* _NEVER_ return "root" for closed
+ * sockets. Otherwise people think
+ * that it is sysadmin who abuses their
+ * poor ircd. :-) */
+ syslog(LOG_NOTICE,
+ "Req for stale socket(%d) %d from %x/%d",
+ state, r_rport, r_raddr, r_lport);
+ return -1;
+ }
return 1;
}
}
--- linux/include/net/route.h.orig Tue Apr 17 07:25:48 2001
+++ linux/include/net/route.h Tue Jul 10 23:35:18 2001
@@ -14,6 +14,7 @@
* Alan Cox : Support for TCP parameters.
* Alexey Kuznetsov: Major changes for new routing code.
* Mike McLagan : Routing by source
+ * Robert Olsson : Added rt_cache statistics
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -90,6 +91,20 @@
__u32 o_packets;
__u32 i_bytes;
__u32 i_packets;
+};
+
+struct rt_cache_stat
+{
+ unsigned in_hit;
+ unsigned in_slow_tot;
+ unsigned in_slow_mc;
+ unsigned in_no_route;
+ unsigned in_brd;
+ unsigned in_martian_dst;
+ unsigned in_martian_src;
+ unsigned out_hit;
+ unsigned out_slow_tot;
+ unsigned out_slow_mc;
};
extern struct ip_rt_acct *ip_rt_acct;
--- linux/net/ipv4/route.c.orig Wed Mar 28 22:01:15 2001
+++ linux/net/ipv4/route.c Tue Jul 10 23:27:51 2001
@@ -52,6 +52,7 @@
* Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
* Vladimir V. Ivanov : IP rule info (flowid) is really useful.
* Marc Boucher : routing by fwmark
+ * Robert Olsson : Added rt_cache statistics
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
@@ -201,6 +202,8 @@
static unsigned rt_hash_mask;
static int rt_hash_log;
+struct rt_cache_stat rt_cache_stat[NR_CPUS];
+
static int rt_intern_hash(unsigned hash, struct rtable * rth, struct rtable ** res);
static __inline__ unsigned rt_hash_code(u32 daddr, u32 saddr, u8 tos)
@@ -270,6 +273,44 @@
len = length;
return len;
}
+
+
+#ifdef CONFIG_PROC_FS
+static int rt_cache_stat_get_info(char *buffer, char **start, off_t offset, int length)
+{
+ int i, lcpu;
+ int len=0;
+ unsigned int dst_entries = atomic_read(&ipv4_dst_ops.entries);
+
+ for (lcpu=0; lcpu<smp_num_cpus; lcpu++) {
+ i = cpu_logical_map(lcpu);
+
+ len += sprintf(buffer+len, "%08x %08x %08x %08x %08x %08x %08x %08x %08x %08x %08x\n",
+ dst_entries,
+ rt_cache_stat[i].in_hit,
+ rt_cache_stat[i].in_slow_tot,
+ rt_cache_stat[i].in_slow_mc,
+ rt_cache_stat[i].in_no_route,
+ rt_cache_stat[i].in_brd,
+ rt_cache_stat[i].in_martian_dst,
+ rt_cache_stat[i].in_martian_src,
+
+ rt_cache_stat[i].out_hit,
+ rt_cache_stat[i].out_slow_tot,
+ rt_cache_stat[i].out_slow_mc
+ );
+ }
+ len -= offset;
+
+ if (len > length)
+ len = length;
+ if (len < 0)
+ len = 0;
+
+ *start = buffer + offset;
+ return len;
+}
+#endif
static __inline__ void rt_free(struct rtable *rt)
{
@@ -1163,6 +1204,8 @@
u32 spec_dst;
struct in_device *in_dev = in_dev_get(dev);
u32 itag = 0;
+ int cpu = smp_processor_id();
+
/* Primary sanity checks. */
@@ -1221,6 +1264,7 @@
if (!LOCAL_MCAST(daddr) && IN_DEV_MFORWARD(in_dev))
rth->u.dst.input = ip_mr_input;
#endif
+ rt_cache_stat[cpu].in_slow_mc++;
in_dev_put(in_dev);
hash = rt_hash_code(daddr, saddr^(dev->ifindex<<5), tos);
@@ -1259,6 +1303,7 @@
u32 spec_dst;
int err = -EINVAL;
int free_res = 0;
+ int cpu = smp_processor_id();
/*
* IP on this device is disabled.
@@ -1308,6 +1353,8 @@
}
free_res = 1;
+ rt_cache_stat[cpu].in_slow_tot++;
+
#ifdef CONFIG_IP_ROUTE_NAT
/* Policy is applied before mapping destination,
but rerouting after map should be made with old source.
@@ -1455,6 +1502,7 @@
}
flags |= RTCF_BROADCAST;
res.type = RTN_BROADCAST;
+ rt_cache_stat[cpu].in_brd++;
local_input:
rth = dst_alloc(&ipv4_dst_ops);
@@ -1498,6 +1546,7 @@
goto intern;
no_route:
+ rt_cache_stat[cpu].in_no_route++;
spec_dst = inet_select_addr(dev, 0, RT_SCOPE_UNIVERSE);
res.type = RTN_UNREACHABLE;
goto local_input;
@@ -1506,6 +1555,7 @@
* Do not cache martian addresses: they should be logged (RFC1812)
*/
martian_destination:
+ rt_cache_stat[cpu].in_martian_dst++;
#ifdef CONFIG_IP_ROUTE_VERBOSE
if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
printk(KERN_WARNING "martian destination %u.%u.%u.%u from %u.%u.%u.%u, dev %s\n",
@@ -1520,6 +1570,8 @@
goto done;
martian_source:
+
+ rt_cache_stat[cpu].in_martian_src++;
#ifdef CONFIG_IP_ROUTE_VERBOSE
if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
/*
@@ -1550,6 +1602,7 @@
struct rtable * rth;
unsigned hash;
int iif = dev->ifindex;
+ int cpu = smp_processor_id();
tos &= IPTOS_RT_MASK;
hash = rt_hash_code(daddr, saddr^(iif<<5), tos);
@@ -1567,6 +1620,7 @@
rth->u.dst.lastuse = jiffies;
dst_hold(&rth->u.dst);
rth->u.dst.__use++;
+ rt_cache_stat[cpu].in_hit++;
read_unlock(&rt_hash_table[hash].lock);
skb->dst = (struct dst_entry*)rth;
return 0;
@@ -1621,6 +1675,7 @@
int free_res = 0;
int err;
u32 tos;
+ int cpu = smp_processor_id();
tos = oldkey->tos & (IPTOS_RT_MASK|RTO_ONLINK);
key.dst = oldkey->dst;
@@ -1847,14 +1902,18 @@
rth->u.dst.output=ip_output;
+ rt_cache_stat[cpu].out_slow_tot++;
+
if (flags&RTCF_LOCAL) {
rth->u.dst.input = ip_local_deliver;
rth->rt_spec_dst = key.dst;
}
if (flags&(RTCF_BROADCAST|RTCF_MULTICAST)) {
rth->rt_spec_dst = key.src;
- if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK))
+ if (flags&RTCF_LOCAL && !(dev_out->flags&IFF_LOOPBACK)) {
rth->u.dst.output = ip_mc_output;
+ rt_cache_stat[cpu].out_slow_mc++;
+ }
#ifdef CONFIG_IP_MROUTE
if (res.type == RTN_MULTICAST) {
struct in_device *in_dev = in_dev_get(dev_out);
@@ -1894,6 +1953,7 @@
{
unsigned hash;
struct rtable *rth;
+ int cpu = smp_processor_id();
hash = rt_hash_code(key->dst, key->src^(key->oif<<5), key->tos);
@@ -1912,6 +1972,7 @@
rth->u.dst.lastuse = jiffies;
dst_hold(&rth->u.dst);
rth->u.dst.__use++;
+ rt_cache_stat[cpu].out_hit++;
read_unlock_bh(&rt_hash_table[hash].lock);
*rp = rth;
return 0;
@@ -2339,6 +2400,7 @@
add_timer(&rt_periodic_timer);
proc_net_create ("rt_cache", 0, rt_cache_get_info);
+ proc_net_create ("rt_cache_stat", 0, rt_cache_stat_get_info);
#ifdef CONFIG_NET_CLS_ROUTE
create_proc_read_entry("net/rt_acct", 0, 0, ip_rt_acct_read, NULL);
#endif
diff -ur ../vger3-010830/linux/net/ipv6/tcp_ipv6.c linux/net/ipv6/tcp_ipv6.c
--- ../vger3-010830/linux/net/ipv6/tcp_ipv6.c Wed Jun 13 21:14:05 2001
+++ linux/net/ipv6/tcp_ipv6.c Fri Oct 12 06:59:07 2001
@@ -339,13 +339,18 @@
return tcp_v6_lookup_listener(daddr, hnum, dif);
}
-#define tcp_v6_lookup(sa, sp, da, dp, dif) \
-({ struct sock *___sk; \
- local_bh_disable(); \
- ___sk = __tcp_v6_lookup((sa),(sp),(da),ntohs(dp),(dif)); \
- local_bh_enable(); \
- ___sk; \
-})
+__inline__ struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
+ struct in6_addr *daddr, u16 dport,
+ int dif)
+{
+ struct sock *sk;
+
+ local_bh_disable();
+ sk = __tcp_v6_lookup(saddr, sport, daddr, ntohs(dport), dif);
+ local_bh_enable();
+
+ return sk;
+}
/*
diff -ur ../vger3-010830/linux/net/netsyms.c linux/net/netsyms.c
--- ../vger3-010830/linux/net/netsyms.c Sun Aug 19 22:01:45 2001
+++ linux/net/netsyms.c Fri Oct 12 07:59:17 2001
@@ -72,6 +72,11 @@
extern int netdev_finish_unregister(struct net_device *dev);
+extern struct sock *tcp_v6_lookup(struct in6_addr *saddr, u16 sport,
+ struct in6_addr *daddr, u16 dport,
+ int dif);
+extern struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, u16 dport, int dif);
+
#include <linux/rtnetlink.h>
#ifdef CONFIG_IPX_MODULE
@@ -284,7 +289,11 @@
EXPORT_SYMBOL(ndisc_mc_map);
EXPORT_SYMBOL(register_inet6addr_notifier);
EXPORT_SYMBOL(unregister_inet6addr_notifier);
+EXPORT_SYMBOL(tcp_v6_lookup);
#endif
+EXPORT_SYMBOL(tcp_v4_lookup);
+EXPORT_SYMBOL(tcp_timewait_cachep);
+EXPORT_SYMBOL(tcp_hashinfo);
#if defined (CONFIG_IPV6_MODULE) || defined (CONFIG_KHTTPD) || defined (CONFIG_KHTTPD_MODULE)
/* inet functions common to v4 and v6 */
EXPORT_SYMBOL(inet_release);
Primary FTP site is:
ftp://ftp.inr.ac.ru/ip-routing/
Mirrors are:
ftp://linux.wauug.org/pub/net
ftp://ftp.nc.ras.ru/pub/mirrors/ftp.inr.ac.ru/ip-routing/
ftp://ftp.gts.cz/MIRRORS/ftp.inr.ac.ru/
ftp://ftp.funet.fi/pub/mirrors/ftp.inr.ac.ru/ip-routing/ (STM1 to USA)
ftp://sunsite.icm.edu.pl/pub/Linux/iproute/
ftp://ftp.sunet.se/pub/Linux/ip-routing/
ftp://ftp.nvg.ntnu.no/pub/linux/ip-routing/
ftp://ftp.crc.ca/pub/systems/linux/ip-routing/
ftp://ftp.proxad.net/mirrors/ftp.inr.ac.ru/ip-routing/
ftp://donlug.dn.ua/pub/mirrors/ip-routing/
ftp://omni.rk.tusur.ru/mirrors/ftp.inr.ac.ru/ip-routing/
ftp://ftp.src.uchicago.edu/pub/linux/ip-routing/
http://www.asit.ro/ip-routing/
ftp://ftp.infoscience.co.jp/pub/linux/ip-routing/ (Japan)
ftp://ftp.sucs.swan.ac.uk/pub/mirrors/ftp.inr.ac.ru/ip-routing
http://mirror.schell.de/ftp.inr.ac.ru/ip-routing/ (Germany)
ftp://ftp.gin.cz/MIRRORS/ftp.inr.ac.ru/ip-routing
ftp://mirror.aarnet.edu.au/pub/ip-routing/ (Australia)
http://mirror.aarnet.edu.au/pub/ip-routing/ (Australia)
RPMs are available at:
ftp://omni.rk.tusur.ru/Tango/
ftp://ftp4.dgtu.donetsk.ua/pub/BlackCat/6.0/contrib/SRPMS/i[35]86/
How to compile this.
--------------------
1. Look at start of Makefile and set correct values for:
KERNEL_INCLUDE should point to correct linux kernel include directory.
Default (/usr/src/linux/include) is right as a rule.
ADDLIB should contain inet_* functions, if your libc contains
obsolete resolver library (<4.9.4) and you have no correct libresolv.
ADDLIB should also contain dnet_* functions if you don't have a
libdnet with support for them. If your libdnet does have support,
then comment out that line and uncomment the line to add -ldnet to
LDLIBS.
LDLIBS should be empty, if you have no libresolv.
2. make
Utilities "ip" and "rtmon" are in ip/ directory now,
"tc" is in tc/. That's all.
3. To make the documentation, cd to the doc/ directory, then
look at the start of the Makefile and set correct values for
PAGESIZE=a4 , i.e.: a4 , letter ... (string)
PAGESPERPAGE=2 , i.e.: 1 , 2 ... (numeric)
and run make there. It assumes that latex, dvips and psnup
are in your path.
Alexey Kuznetsov
kuznet@ms2.inr.ac.ru
Here are a few quick points about DECnet support...
o No name resolution is available as yet, all addresses must be
entered numerically.
o The neighbour cache may well list every entry as having the address
0.170. This is due to a problem that I need to sort out kernel side.
It is harmless (but don't try and use neigh add yet) just look in
/proc/net/decnet_neigh to see the real addresses for now.
o The rtnetlink support in the kernel is rather experimental, expect a
few odd things to happen for the next few DECnet kernel releases.
o Whilst you can use ip addr add to add more than one DECnet address to an
interface, don't expect addresses which are not the same as the
kernel's node address to work properly. i.e. You will break the DECnet
protocol if you do add anything other than the automatically generated
interface addresses to ethernet cards. This option is there for future
link layer support, where the device will have to be configured for
DECnet explicitly.
o The DECnet support is currently self contained. You do not need the
libdnet library to use it. In fact until I've sent the dnet_pton and
dnet_ntop functions to Patrick to add, you can't use libdnet.
o If you are not using the very latest 2.3.xx series kernels, don't
try and list DECnet routes if you've got IPv6 compiled into the
kernel. It will oops.
o My main reason for writing the DECnet support for iproute2 was to
check out the DECnet routing code, so the route get and
route show cache commands are likely to be the most debugged out of
all of them.
o If you find bugs in the DECnet support, please send them to me in the
first instance, and then I'll send Alexey a patch to fix it. IPv4/6
bugs should be sent to Alexey as before.
Steve Whitehouse <SteveW@ACM.org>
iproute2+tc*
It's the first release of Linux traffic control engine.
NOTES.
* csz scheduler is inoperational at the moment, and probably
never will be repaired but replaced with h-pfq scheduler.
* To use "fw" classifier you will need ipfwchains patch.
* No manual available. Ask me, if you have problems (only try to guess
answer yourself at first 8)).
Micro-manual how to start it the first time
-------------------------------------------
A. Attach CBQ to eth1:
tc qdisc add dev eth1 root handle 1: cbq bandwidth 10Mbit allot 1514 cell 8 \
avpkt 1000 mpu 64
B. Add root class:
tc class add dev eth1 parent 1:0 classid 1:1 cbq bandwidth 10Mbit rate 10Mbit \
allot 1514 cell 8 weight 1Mbit prio 8 maxburst 20 avpkt 1000
C. Add default interactive class:
tc class add dev eth1 parent 1:1 classid 1:2 cbq bandwidth 10Mbit rate 1Mbit \
allot 1514 cell 8 weight 100Kbit prio 3 maxburst 20 avpkt 1000 split 1:0 \
defmap c0
D. Add default class:
tc class add dev eth1 parent 1:1 classid 1:3 cbq bandwidth 10Mbit rate 8Mbit \
allot 1514 cell 8 weight 800Kbit prio 7 maxburst 20 avpkt 1000 split 1:0 \
defmap 3f
etc. etc. etc. Well, it is enough to start 8) The rest can be guessed 8)
Look also at more elaborated example, ready to start rsvpd,
in rsvp/cbqinit.eth1.
Terminology and advice about setting CBQ parameters may be found in Sally Floyd's
papers.
Pairs X:Y are class handles, X:0 are qdisc handles.
weight should be proportional to rate for leaf classes
(I chose it ten times less, but it is not necessary)
defmap is bitmap of logical priorities served by this class.
E. Other qdiscs are simpler. For example, let's attach TBF to class 1:2
tc qdisc add dev eth1 parent 1:2 tbf rate 64Kbit buffer 5Kb/8 limit 10Kb
F. Look at all that we created:
tc qdisc ls dev eth1
tc class ls dev eth1
G. Install "route" classifier on root of cbq and map destination from realm
1 to class 1:2
tc filter add dev eth1 parent 1:0 protocol ip prio 100 route to 1 classid 1:2
H. Assign routes to 10.11.12.0/24 to realm 1
ip route add 10.11.12.0/24 dev eth1 via whatever realm 1
etc. The same thing can be made with rules.
I still did not test ipchains, but they should work too.
Setup of rsvp and u32 classifiers is more hairy.
If you read RSVP specs, you will understand how rsvp classifier
works easily. What's about u32... That's example:
#! /bin/sh
TC=/home/root/tc
# Setup classifier root on eth1 root (it is cbq)
$TC filter add dev eth1 parent 1:0 prio 5 protocol ip u32
# Create hash table of 256 slots with ID 1:
$TC filter add dev eth1 parent 1:0 prio 5 handle 1: u32 divisor 256
# Add to 6th slot of hash table a rule to select tcp/telnet to 193.233.7.75,
# direct it to class 1:4 and prescribe to fall to best effort
# if traffic violates TBF (32kbit,5K)
$TC filter add dev eth1 parent 1:0 prio 5 u32 ht 1:6: \
match ip dst 193.233.7.75 \
match tcp dst 0x17 0xffff \
flowid 1:4 \
police rate 32kbit buffer 5kb/8 mpu 64 mtu 1514 index 1
# Add to 1st slot of hash table a rule to select icmp to 193.233.7.75,
# direct it to class 1:4 and prescribe to fall to best effort
# if traffic violates TBF (10kbit,5K)
$TC filter add dev eth1 parent 1:0 prio 5 u32 ht 1:: \
sample ip protocol 1 0xff \
match ip dst 193.233.7.75 \
flowid 1:4 \
police rate 10kbit buffer 5kb/8 mpu 64 mtu 1514 index 2
# Lookup hash table, if it is not fragmented frame
# Use protocol as hash key
$TC filter add dev eth1 parent 1:0 prio 5 handle ::1 u32 ht 800:: \
match ip nofrag \
offset mask 0x0F00 shift 6 \
hashkey mask 0x00ff0000 at 8 \
link 1:
Alexey Kuznetsov
kuznet@ms2.inr.ac.ru
[020116]
! 1. Compile with rh-7.2
! 2. What the hell some people blame on socklen_t defined in unistd.h? Check.
* Kim Woelders <kim@woelders.dk>, various useful fixups: compilation
with old kernels, cross-compiling, "all" == "any" in prefix spec.
* Collected from my disk, cleaned and packed to directory iproute2/misc/
several utilities: ss, nstat, ifstat, rtacct, arpd and module tcp_diag.
Writing some docs. me.
* prepared patchlet for pidentd to use tcp_diag.
* David Miller: 64bit (and even worse 64bit kernel/32 bit user :-) fixes
to above. tcp_diag is merged to main tree.
* Alexandr D. Kanevskiy <kad@blackcatlinux.com>: various flaws in ss
* Alexandr D. Kanevskiy <kad@blackcatlinux.com>: oops, more aggressive caching
of names opened old bugs: ip started to print garbage in some places.
* Robert Olsson, rt_cache_stat. Renamed to rtstat.
* An old bug in "ip maddr ls": redundant empty lines in output.
Seeing this crap for ages but lucky match of desire/ability to repair
and a huff about this happened only today. :-)
* "Mr. James W. Laferriere" <babydr@baby-dragons.com>
doc: option to produce ps output for non-a4 and not only 2 pages/sheet.
* Jamal's patch for ingress qdisc.
* Bernd Eckenfels <ecki@lina.inka.de>: deleted orphaned bogus #include
in include/utils.h.
* Julian Anastasov <ja@ssi.bg>: uninitialized fields in nexthop
producing funny "dead" nexthops in multipath routes.
Stupid me, look at the first line in [010803]... Was it difficult to guess
this that time? People blame for several months. :-)
Special thanks to bert hubert <ahu@ds9a.nl> who raised the issue in netdev.
Thanks and apologies to Terry Schmidt <terry@nycwireless.net>,
Ruben Puettmann <ruben.puettmann@freenet-ag.de>,
Mark Ivens <mivens@clara.net>.
* willy tarreau <wtarreau@yahoo.fr>: "make install" target.
* Tunable limit for sch_sfq. Patch to kernel activating this
is about to be submitted. Reminded by Adi Nugroho <Adi@iNterNUX.co.id>.
[010824]
* ip address add sets scope of loopback addresses to "host".
Advised by David Miller.
* ZIP! <zip@killerlabs.com> and David Ford <david@blue-labs.org>
Some strcpy's changed to strncpy's.
* David Ford <david@blue-labs.org>, test for compilation with gcc3.
* David Ford <david@blue-labs.org>. Damn, I broke rtnl_talk in previous
snapshot.
[010803]
* If "dev" is not specified in multipath route, ifindex remained
uninitialized. Grr. Thanks to Kunihiro Ishiguro <kunihiro@zebra.org>.
* Rafal Maszkowski <rzm@icm.edu.pl>, batch mode tc. The most old patch.
* Updates list of data protocol ids.
Lots of reporters. I bring my apologies.
* Jan Rekorajski <baggins@sith.mimuw.edu.pl>. Updated list of datalink types.
* Christina Chen <chenchristina@cwc.nus.edu.sg>. Bug in parsing IPv6 address match in u32.
* Pekka Savola <pekkas@netcore.fi>. ip -6 route flush dev lo stuck
on deleting root of the table.
* Werner. dsmark fixes.
* Alexander Demenshin <aldem-reply@aldem.net>. Old miraculous bug
in ip monitor. It was puzzle, people permanently blame that
it prints some crap.
* Rui Prior <rprior@inescporto.pt>. f_route failed to resolve fromif.
Werner also noticed this and sent patch. Bad place... [RETHINK]
* Kim Woelders <kim@woelders.dk>.
- changes in Makefile for cross-compile
- understand "all" as alias for "any"
- bug in iprule.c
! [ NB. Also he sent patch for kernel. Do not forget! ]
* Werner. Fix to tc core files: wrong exits etc.
* Bernd Jendrissek <berndj@prism.co.za>. Some sanitizations of tc.c
!* Marian Jancar <marian.jancar@infonet.cz>. He say q_tbf prints wrong latency!
! Seems, he is wrong.
* Werner (and Nikolai Vladychevski <niko@isl.net.mx>) check ->print_copts
to avoid segfault.
[001007]
* Compiles under rh-7.0
[000928]
* Sorry. I have lost all the CVS with changes made since 000305.
If someone sent me a patch after this date, please, resubmit.
Restored from the last backup and mailboxes:
* Edit ip-cref.tex by raf <raf2@zip.com.au>.
* RTAX_REORDERING support.
* IFLA_MASTER support.
* Bug in rtnl_talk(), libnetlink.c. Reported by David P. Olshfski
<olshef@us.ibm.com>
[000305]
* Bugs in RESOLVE_HOSTNAMES. Bratislav Ilich <bilik@@zepter.ru>
* ARPHRD_IEEE802_TR
[000225]
* ECN in q_red.c.
[000221]
* diffserv update from Jamal Hadi Salim
* Some bits of IPX from Steve Whitehouse.
* ATM qdisc from Werner Almesberger
* Support for new attributes on routes in linux-2.3.
[991023]
No news, only several bugs are fixed.
* Since ss990630 "ip rule list" printed wrong prefix length.
Vladimir V. Ivanov <vlad@alis.tusur.ru>
* "ip rule" parsed >INT_MAX values of metric incorrectly.
Matthew G. Marsh <mgm@paktronix.com>
* Some improvements in doc/Makefile advised by
Andi Kleen and Werner Almesberger.
[990824]
* new attributes in "ip route": rtt, rttvar, cwnd, ssthresh and advmss.
* some updates in documentation to reflect new status.
[990630]
* DiffServ support.
Werner Almesberger <almesber@lrc.di.epfl.ch>
Jamal Hadi Salim <hadi@nortelnetworks.com>
* DECnet support.
Steve Whitehouse <SteveW@ACM.org>
* Some minor tweaks in docs and code.
[990530]
* routel script. Stephen R. van den Berg <srb@cuci.nl>
* Bug in tc/q_prio.c resetting priomap. Reported by
Ole Husgaard <sparre@login.dknet.dk> and
Jan Kasprzak <kas@informatics.muni.cz>
* IP command reference manual is published (ip-cref.tex).
I am sorry, but tc-cref.tex is still not ready, to be more
exact the draft does not describe current tc 8-)
* ip, rtmon, rtacct utilities are updated according to manual 8-)
Lots of changes:
- (MAIN) "flush" command for addr, neigh and route.
- error messages are sanitized; now it does not print
usage() page on each error.
- output format is improved.
- "oneline" mode is added.
- etc.
* Name databases; resolution ascii <-> numeric is split out to lib/*
* scripts ifcfg, ifone and rtpr.
* examples/dhcp-client-script is copied from my patch to ISC dhcp.
* Makefile in doc/ directory.
[990417]
* "pmtudisc" flag to "ip tunnel". Phil Karn <karn@ka9q.ampr.org>
* bug in tc/q_tbf.c preventing setting peak_rate, Martin Mares <mj@ucw.cz>
* doc/flowlabels.tex
[990329]
* This snapshot fixes some compatibility problems, which I introduced
occasionally to previous snapshots.
* Namely, "allot" to "tc qdisc add ... cbq" is accepted but ignored.
* Another changes are supposed to be shown in the next snapshot, but
because of troubles with "allot" I am forced to release premature
version. Namely, "cell", "prio", "weight" etc. are optional now.
* doc/ip-tunnels.tex
[990327]
* History was not recorded.
[981002]
* Rani Assaf <rani@magic.metawire.com> contributed resolving
addresses to names.
BEWARE! DO NOT USE THIS OPTION, WHEN REPORTING BUGS IN
IPROUTE OR IN KERNEL. ALL THE BUG REPORTS MUST CONTAIN
ONLY NUMERIC ADDRESSES.
[981101]
* now it should compile for any libc.
# Makefile for the documentation directory: turns the .tex and .sgml sources
# into DVI, PostScript (several pages per sheet) or HTML.
#
# PostScript documents produced by the default target.
PSFILES=ip-cref.ps ip-tunnels.ps api-ip6-flowlabels.ps ss.ps nstat.ps arpd.ps rtstat.ps
# Documents that are currently left out of PSFILES:
# tc-cref.ps
# api-rtnl.tex api-pmtudisc.tex api-news.tex
# iki-netdev.ps iki-neighdst.ps
# External tools used by the rules below.
LATEX=latex
DVIPS=dvips
SGML2DVI=sgml2latex --output=dvi
SGML2HTML=sgml2html -s 0
LPR=lpr -Zsduplex
# Recipes run under bash: the %.dvi rule uses the bash-only $[...] arithmetic.
SHELL=bash
# Paper size and number of pages per sheet; both are handed to ./do-psnup
# by the %.ps rule.
PAGESIZE=a4
PAGESPERPAGE=2
# One .html per .sgml file found in this directory, one .dvi per entry of PSFILES.
HTMLFILES=$(subst .sgml,.html,$(shell echo *.sgml))
DVIFILES=$(subst .ps,.dvi,$(PSFILES))
# Default target: build every PostScript document.
all: pstwocol
pstwocol: $(PSFILES)
html: $(HTMLFILES)
dvi: $(DVIFILES)
# Send all PostScript documents to the printer.
print: $(PSFILES)
$(LPR) $(PSFILES)
# SGML sources are converted to DVI by sgml2latex.
%.dvi: %.sgml
$(SGML2DVI) $<
# LaTeX sources: LaTeX is run in the loop condition and re-run for as long as
# its output still contains a "Label(s) may ..." warning, a "No file" message
# or "! Emergency stop". The rule fails once the pass counter exceeds 3.
%.dvi: %.tex
@set -e; pass=2; echo "Running LaTeX $<"; \
while [ `$(LATEX) $< </dev/null 2>&1 | \
grep -c '^\(LaTeX Warning: Label(s) may\|No file \|! Emergency stop\)'` -ge 1 ]; do \
if [ $$pass -gt 3 ]; then \
echo "Seems, something is wrong. Try by hands." ; exit 1 ; \
fi; \
echo "Re-running LaTeX $<, $${pass}d pass"; pass=$$[$$pass + 1]; \
done
# DVI -> PostScript: dvips writes to a temporary file, ./do-psnup produces the
# final file from it, then the temporary file is removed.
%.ps: %.dvi
$(DVIPS) $< -o $@.tmp
./do-psnup $@.tmp $@ $(PAGESIZE) $(PAGESPERPAGE)
rm -f $@.tmp
%.html: %.sgml
$(SGML2HTML) $<
# Install the document sources (not the generated files).
# NOTE(review): DESTDIR and DOCDIR are not set in this file; presumably they
# are inherited from the top-level Makefile or the environment - confirm.
install:
install -m 0644 $(shell echo *.tex) $(DESTDIR)$(DOCDIR)
install -m 0644 $(shell echo *.sgml) $(DESTDIR)$(DOCDIR)
# Remove LaTeX by-products and every generated document.
clean:
rm -f *.aux *.log *.toc $(PSFILES) $(DVIFILES) *.html
Partially finished work.
1. User Reference manuals.
1.1 IP Command reference (ip-cref.tex, published)
1.2 TC Command reference (tc-cref.tex)
1.3 IP tunnels (ip-tunnels.tex, published)
2. Linux-2.2 Networking API
2.1 RTNETLINK (api-rtnl.tex)
2.2 Path MTU Discovery (api-pmtudisc.tex)
2.3 IPv6 Flow Labels (api-ip6-flowlabels.tex, published)
2.4 Miscellaneous extensions (api-misc.tex)
3. Linux-2.2 Networking Intra-Kernel Interfaces
3.1 NetDev --- Networking Devices and netdev... (iki-netdev.tex)
3.2 Neighbour cache and destination cache. (iki-neighdst.tex)
This diff is collapsed.
<!doctype linuxdoc system>
<article>
<title>ARPD Daemon
<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/
<date>some_negative_number, 20 Sep 2001
<abstract>
<tt/arpd/ is daemon collecting gratuitous ARP information, saving
it on local disk and feeding it to kernel on demand to avoid
redundant broadcasting due to limited size of kernel ARP cache.
</abstract>
<p><bf/Description/
<p>The format of the command is:
<tscreen><verb>
arpd OPTIONS [ INTERFACE [ INTERFACE ... ] ]
</verb></tscreen>
<p> <tt/OPTIONS/ are:
<itemize>
<item><tt/-l/ - dump <tt/arpd/ database to stdout and exit. Output consists
of three columns: interface index, IP address and MAC address.
Negative entries for dead hosts are also shown, in this case MAC address
is replaced by word <tt/FAILED/ followed by colon and time when the fact
that host is dead was proven the last time.
<item><tt/-f FILE/ - read and load <tt/arpd/ database from <tt/FILE/
in text format similar to that dumped by option <tt/-l/. Exit after load,
probably listing resulting database, if option <tt/-l/ is also given.
If <tt/FILE/ is <tt/-/, <tt/stdin/ is read to get ARP table.
<item><tt/-b DATABASE/ - location of database file. Default location is
<tt>/var/lib/arpd/arpd.db</tt>.
<item><tt/-a NUMBER/ - <tt/arpd/ not only passively listens to ARP on the wire, but
also sends broadcast queries itself. <tt/NUMBER/ is number of such queries
to make before destination is considered as dead. When <tt/arpd/ is started
as kernel helper (i.e. with <tt/app_solicit/ enabled in <tt/sysctl/
or even with option <tt/-k/) without this option and still did not learn enough
information, you can observe 1 second gaps in service. Not fatal, but
not good.
<item><tt/-k/ - suppress sending broadcast queries by kernel. It makes
sense together with option <tt/-a/.
<item><tt/-n TIME/ - timeout of negative cache. When resolution fails <tt/arpd/
suppresses further attempts to resolve for this period. It makes sense
only together with option <tt/-k/. This timeout should not be too much
longer than boot time of a typical host not supporting gratuitous ARP.
Default value is 60 seconds.
<item><tt/-R RATE/ - maximal steady rate of broadcasts sent by <tt/arpd/
in packets per second. Default value is 1.
<item><tt/-B NUMBER/ - number of broadcasts sent by <tt/arpd/ back to back.
Default value is 3. Together with option <tt/-R/ this option allows
to police broadcasting not to exceed <tt/B+R*T/ over any interval
of time <tt/T/.
</itemize>
<p><tt/INTERFACE/ is name of networking interface to watch.
If no interfaces given, <tt/arpd/ monitors all the interfaces.
In this case <tt/arpd/ does not adjust <tt/sysctl/ parameters,
it is supposed user does this himself after <tt/arpd/ is started.
<p> Signals
<p> <tt/arpd/ exits gracefully syncing database and restoring adjusted
<tt/sysctl/ parameters, when receives <tt/SIGINT/ or <tt/SIGTERM/.
<tt/SIGHUP/ syncs database to disk. <tt/SIGUSR1/ sends some statistics
to <tt/syslog/. Effect of other signals is undefined, they may corrupt
database and leave <tt/sysctl/ parameters in an unpredictable state.
<p> Note
<p> In order for <tt/arpd/ to be able to serve as ARP resolver, kernel must be
compiled with the option <tt/CONFIG_ARPD/ and, in the case when interface list
is not given on command line, variable <tt/app_solicit/
on interfaces of interest should be set in <tt>/proc/sys/net/ipv4/neigh/*</tt>.
If this is not made <tt/arpd/ still collects gratuitous ARP information
in its database.
<p> Examples
<enum>
<item> Start <tt/arpd/ to collect gratuitous ARP, but not messing
with kernel functionality:
<tscreen><verb>
arpd -b /var/tmp/arpd.db
</verb></tscreen>
<item> Look at result after some time:
<tscreen><verb>
killall arpd
arpd -l -b /var/tmp/arpd.db
</verb></tscreen>
<item> To enable kernel helper, leaving leading role to kernel:
<tscreen><verb>
arpd -b /var/tmp/arpd.db -a 1 eth0 eth1
</verb></tscreen>
<item> Completely replace kernel resolution on interfaces <tt/eth0/
and <tt/eth1/. In this case kernel still does unicast probing to
validate entries, but all the broadcast activity is suppressed
and made under authority of <tt/arpd/:
<tscreen><verb>
arpd -b /var/tmp/arpd.db -a 3 -k eth0 eth1
</verb></tscreen>
This is the mode in which <tt/arpd/ is supposed to work normally.
It is not the default, just to prevent accidental enabling of a too aggressive
mode.
</enum>
</article>
#! /bin/bash
# Put several logical pages onto each sheet of a PostScript file.
#
# $1 = Input file (the caller's temporary file) . "string"
# $2 = Output file to create . "string"
# $3 = Page size . ie: a4 , letter ... "string"
# $4 = Number of pages to fit on a single sheet . "numeric"
#
# psnup is used when it is installed, psmulti is the fallback, and when
# neither is available the input is simply copied to the output.
#
# Every argument is quoted when it is passed on, so that file names
# containing whitespace or glob characters reach the tool as a single word.
if type psnup >/dev/null 2>&1; then
    echo "psnup -$4 -p$3 $1 $2"
    psnup "-$4" "-p$3" "$1" "$2"
elif type psmulti >/dev/null 2>&1; then
    echo "psmulti $1 > $2"
    psmulti "$1" > "$2"
else
    echo "cp $1 $2"
    cp "$1" "$2"
fi
This diff is collapsed.
This diff is collapsed.
<!doctype linuxdoc system>
<article>
<title>NSTAT, IFSTAT and RTACCT Utilities
<author>Alexey Kuznetsov, <tt/kuznet@ms2.inr.ac.ru/
<date>some_negative_number, 20 Sep 2001
<abstract>
<tt/nstat/, <tt/ifstat/ and <tt/rtacct/ are simple tools helping
to monitor kernel snmp counters and network interface statistics.
</abstract>
<p> These utilities are very similar, so that I describe
them simultaneously, using name <tt/Xstat/ in the places which apply
to all of them.
<p>The format of the command is:
<tscreen><verb>
Xstat [ OPTIONS ] [ PATTERN [ PATTERN ... ] ]
</verb></tscreen>
<p>
<tt/PATTERN/ is shell style pattern, selecting identifier
of SNMP variables or interfaces to show. Variable is displayed
if one of patterns matches its name. If no patterns are given,
<tt/Xstat/ assumes that user wants to see all the variables.
<p> <tt/OPTIONS/ is list of single letter options, using common unix
conventions.
<itemize>
<item><tt/-h/ - show help page
<item><tt/-?/ - the same, of course
<item><tt/-v/, <tt/-V/ - print version of <tt/Xstat/ and exit
<item><tt/-z/ - dump zero counters too. By default they are not shown.
<item><tt/-a/ - dump absolute values of counters. By default <tt/Xstat/
calculates increments since the previous use.
<item><tt/-s/ - do not update history, so that the next time you will
see counters including values accumulated to the moment
of this measurement too.
<item><tt/-n/ - do not display anything, only update history.
<item><tt/-r/ - reset history.
<item><tt/-d INTERVAL/ - <tt/Xstat/ is run in daemon mode collecting
statistics. <tt/INTERVAL/ is interval between measurements
in seconds.
<item><tt/-t INTERVAL/ - time interval to average rates. Default value
is 60 seconds.
<item><tt/-e/ - display extended information about errors (<tt/ifstat/ only).
</itemize>
<p>
History is just dump saved in file <tt>/tmp/.Xstat.uUID</tt>
or in file given by environment variables <tt/NSTAT_HISTORY/,
<tt/IFSTAT_HISTORY/ and <tt/RTACCT_HISTORY/.
Each time you use <tt/Xstat/, the values stored there are updated.
If you use patterns, only the values which you _really_ see
are updated. If you want to skip an uninteresting period,
use option <tt/-n/, or just redirect output to <tt>/dev/null</tt>.
<p>
<tt/Xstat/ understands when history is invalidated by system reboot
or source of information switched between different instances
of daemonic <tt/Xstat/ and kernel SNMP tables and does not
use invalid history.
<p> Beware, <tt/Xstat/ will not produce sane output,
when many processes use it simultaneously. If several processes
under single user need this utility they should use environment
variables to put their history in safe places
or to use it with options <tt/-a -s/.
<p>
Well, that's all. The utility is very simple, but nevertheless
very handy.
<p> <bf/Output of XSTAT/
<p> The first line of output is <tt/#/ followed by identifier
of source of information, it may be word <tt/kernel/, when <tt/Xstat/
gets information from kernel or some dotted decimal number followed
by parameters, when it obtains information from running <tt/Xstat/ daemon.
<p>In the case of <tt/nstat/ the rest of output consists of three columns:
SNMP MIB identifier,
its value (or increment since previous measurement) and average
rate of increase of the counter per second. <tt/ifstat/ outputs
interface name followed by pairs of counter and rate of its change.
<p> <bf/Daemonic Xstat/
<p> <tt/Xstat/ may be started as daemon by any user. This makes sense
to avoid wrapped counters and to obtain reasonable long counters
for large time. Also <tt/Xstat/ daemon calculates average rates.
For the first goal sampling interval (option <tt/-d/) may be large enough,
f.e. for gigabit rates byte counters overflow not more frequently than
each 40 seconds and you may select interval of 20 seconds.
On the other hand, when <tt/Xstat/ is used for estimating rates
interval should be less than averaging period (option <tt/-t/), otherwise
estimation loses in quality.
Client <tt/Xstat/, before trying to get information from the kernel,
contacts daemon started by this user, then it tries system wide
daemon, which is supposed to be started by superuser. And only if
none of them replied it gets information from kernel.
<p> <bf/Environment/
<p> <tt/NSTAT_HISTORY/ - name of history file for <tt/nstat/.
<p> <tt/IFSTAT_HISTORY/ - name of history file for <tt/ifstat/.
<p> <tt/RTACCT_HISTORY/ - name of history file for <tt/rtacct/.
</article>
% Shared preamble: page geometry, running heads, helper macros and the
% copyright footnote.
% Size of the text area.
\textwidth 6.0in
\textheight 8.5in
% NOTE(review): SNAPSHOT presumably defines \Draft (the snapshot tag used in
% the running heads below) - confirm.
\input SNAPSHOT
% Running heads; \TITLE has to be defined before this point by whoever
% includes this preamble.
\pagestyle{myheadings}
\markboth{\protect\TITLE}{}
\markright{{\protect\sc iproute2-ss\Draft}}
% To print it in compact form: both sides on one sheet (psnup -2)
% Odd and even pages get the same side margin.
\evensidemargin=\oddsidemargin
% NB environment: an indented note in \footnotesize, introduced by "NB."
\newenvironment{NB}{\bgroup \vskip 1mm\leftskip 1cm \footnotesize \noindent NB.
}{\par\egroup \vskip 1mm}
% Marker for features that exist only from kernel 2.3.15 on.
\def\threeonly{[2.3.15+ only] }
\begin{document}
% Redefine the odd-page head: snapshot tag on the left, page number on the
% right. \makeatletter is required because the macro name contains '@'.
\makeatletter
\renewcommand{\@oddhead}{{\protect\sc iproute2-ss\Draft} \hfill \protect\arabic{page}}
\makeatother
% Keep the original footnote mark in \oldthefootnote, then empty it so that
% the copyright footnote is printed without a mark.
\let\oldthefootnote\thefootnote
\def\thefootnote{}
\footnotetext{Copyright \copyright~1999 A.N.Kuznetsov}
<!doctype linuxdoc system>
<article>
<title>RTSTAT Utility
<author>Robert Olsson
<date>some_negative_number, 20 Dec 2001
<p>
Here is some code for monitoring the route cache. For systems handling high
network load, servers, routers, firewalls etc the route cache and its garbage
collection is crucial. Linux has a solid implementation.
<p>
The kernel patch (not required since linux-2.4.7) adds statistics counters
from route cache process into
/proc/net/rt_cache_stat. A companion user mode program presents the statistics
in a vmstat or iostat manner. The ratio between cache hits and misses gives
the flow length.
<p>
Hopefully it can help understanding performance and DoS and other related
issues.
<p> An URL where newer versions of this utility can be (probably) found
is ftp://robur.slu.se/pub/Linux/net-development/rt_cache_stat/
<p><bf/Description/
<p>The format of the command is:
<tscreen><verb>
rtstat [ OPTIONS ]
</verb></tscreen>
<p> <tt/OPTIONS/ are:
<itemize>
<item><tt/-h/, <tt/-help/ - show help page and version of the utility.
<item><tt/-i INTERVAL/ - interval between snapshots, default value is
2 seconds.
<item><tt/-s NUMBER/ - whether to print header line. 0 inhibits header line,
1 prescribes to print it once and 2 (this is default setting) forces header
line each 20 lines.
</itemize>
</article>
This diff is collapsed.
0x10 lowdelay
0x08 throughput
0x04 reliability
# This value overlaps with ECT, do not use it!
0x02 mincost
# These values do not seem to want to die; Cisco likes them for some strange reason.
0x20 priority
0x40 immediate
0x60 flash
0x80 flash-override
0xa0 critical
0xc0 internet
0xe0 network
#
# Reserved protocols.
#
0 unspec
1 redirect
2 kernel
3 boot
4 static
8 gated
9 ra
10 mrt
11 zebra
12 bird
#
# Used by me for gated
#
254 gated/aggr
253 gated/bgp
252 gated/ospf
251 gated/ospfase
250 gated/rip
249 gated/static
248 gated/conn
247 gated/inet
246 gated/default
#
# reserved values
#
0 cosmos
#
# local
#
#1 inr.ac
#2 inr.ruhep
#3 freenet
#4 radio-msu
#5 russia
#6 internet
#
# reserved values
#
0 global
255 nowhere
254 host
253 link
#
# pseudo-reserved
#
200 site
#
# reserved values
#
255 local
254 main
253 default
0 unspec
#
# local
#
#1 inr.ruhep
#! /bin/sh -x
#
# Sample script on using the ingress capabilities.
# This script shows how one can rate limit incoming SYNs.
# Useful for TCP-SYN attack protection. You can use
# ipchains to make the SYN match more specific (e.g.
# by additionally matching on the subnet).
#
# Paths to various utilities;
# change to reflect yours.
# (IP is defined for convenience but is not used below.)
#
IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
TC=$IPROUTE/tc/tc
IP=$IPROUTE/ip/ip
IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains
INDEV=eth2
#
# Tag all incoming SYN packets through $INDEV with mark value 1.
############################################################
$IPCHAINS -A input -i $INDEV -y -m 1
############################################################
#
# Install the ingress qdisc on the ingress interface.
############################################################
$TC qdisc add dev $INDEV handle ffff: ingress
############################################################
#
#
# SYN packets are 40 bytes (320 bits), so three SYNs equal
# 960 bits (approximately 1kbit); so we rate limit below
# the incoming SYNs to 3/sec (not very useful really, but
# it serves to show the point) - JHS
# The fw filter with handle 1 selects the packets marked 1 above.
############################################################
$TC filter add dev $INDEV parent ffff: protocol ip prio 50 handle 1 fw \
police rate 1kbit burst 40 mtu 9k drop flowid :1
############################################################
#
# Show what has been installed on $INDEV.
echo "---- qdisc parameters Ingress ----------"
$TC qdisc ls dev $INDEV
echo "---- Class parameters Ingress ----------"
$TC class ls dev $INDEV
echo "---- filter parameters Ingress ----------"
$TC filter ls dev $INDEV parent ffff:
# Deleting the ingress qdisc (left disabled):
#$TC qdisc del $INDEV ingress
#! /bin/sh
# Example CBQ setup: one root class, three best-effort classes (bulk,
# interactive, background) with sfq attached, and two classes for RSVP.
# TC and IP are the paths of the utilities; IP is not used below.
TC=/home/root/tc
IP=/home/root/ip
DEVICE=eth1
BANDWIDTH="bandwidth 10Mbit"
# Attach CBQ on $DEVICE. It will have handle 1:.
# $BANDWIDTH is real $DEVICE bandwidth (10Mbit).
# avpkt is average packet size.
# mpu is minimal packet size.
$TC qdisc add dev $DEVICE root handle 1: cbq \
$BANDWIDTH avpkt 1000 mpu 64
# Create root class with classid 1:1. This step is not necessary.
# bandwidth is the same as on CBQ itself.
# rate == all the bandwidth
# allot is MTU + MAC header
# maxburst measures allowed class burstiness (please, read S.Floyd and VJ papers)
# est 1sec 8sec means that kernel will evaluate average rate
# on this class with period 1sec and time constant 8sec.
# This rate is viewed with "tc -s class ls dev $DEVICE"
$TC class add dev $DEVICE parent 1:0 classid :1 est 1sec 8sec cbq \
$BANDWIDTH rate 10Mbit allot 1514 maxburst 50 avpkt 1000
# Bulk.
# New parameters are:
# weight, which is set to be proportional to
# "rate". It is not necessary, weight=1 will work as well.
# defmap and split say that best effort traffic, not classified
# by other means, will fall to this class.
$TC class add dev $DEVICE parent 1:1 classid :2 est 1sec 8sec cbq \
$BANDWIDTH rate 4Mbit allot 1514 weight 500Kbit \
prio 6 maxburst 50 avpkt 1000 split 1:0 defmap ff3d
# OPTIONAL.
# Attach "sfq" qdisc to this class, quantum is MTU, perturb
# gives period of hash function perturbation in seconds.
#
$TC qdisc add dev $DEVICE parent 1:2 sfq quantum 1514b perturb 15
# Interactive-burst class
$TC class add dev $DEVICE parent 1:1 classid :3 est 2sec 16sec cbq \
$BANDWIDTH rate 1Mbit allot 1514 weight 100Kbit \
prio 2 maxburst 100 avpkt 1000 split 1:0 defmap c0
$TC qdisc add dev $DEVICE parent 1:3 sfq quantum 1514b perturb 15
# Background.
$TC class add dev $DEVICE parent 1:1 classid :4 est 1sec 8sec cbq \
$BANDWIDTH rate 100Kbit allot 1514 weight 10Mbit \
prio 7 maxburst 10 avpkt 1000 split 1:0 defmap 2
$TC qdisc add dev $DEVICE parent 1:4 sfq quantum 1514b perturb 15
# Realtime class for RSVP
$TC class add dev $DEVICE parent 1:1 classid 1:7FFE cbq \
rate 5Mbit $BANDWIDTH allot 1514b avpkt 1000 \
maxburst 20
# Reclassified realtime traffic
#
# New element: split is not 1:0, but 1:7FFE. It means
# that only real-time packets which violated policing filters
# or exceeded reshaping buffers will fall to it.
$TC class add dev $DEVICE parent 1:7FFE classid 1:7FFF est 4sec 32sec cbq \
rate 1Mbit $BANDWIDTH allot 1514b avpkt 1000 weight 10Kbit \
prio 6 maxburst 10 split 1:7FFE defmap ffff
This diff is collapsed.
#! /bin/sh -x
#
# Sample script on using the ingress capabilities.
# This script just tags packets on the ingress interface using ipchains;
# the result is used for fast classification and re-marking
# on the egress interface.
#
# Paths to various utilities;
# change to reflect yours.
# (IP is defined for convenience but is not used below.)
#
IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
TC=$IPROUTE/tc/tc
IP=$IPROUTE/ip/ip
IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains
INDEV=eth2
# EGDEV holds two words and is always expanded unquoted, so that it supplies
# both the "dev" keyword and the device name to tc.
EGDEV="dev eth1"
#
# tag all incoming packets from host 10.2.0.24 to value 1
# tag all incoming packets from host 10.2.0.3 to value 2
# tag the rest of incoming packets from subnet 10.2.0.0/24 to value 3
# These values are used on the egress side.
# NOTE(review): the first rule is written as 10.2.0.4/24, not 10.2.0.0/24 as
# described above; presumably the /24 mask makes both equivalent - confirm.
#
############################################################
$IPCHAINS -A input -s 10.2.0.4/24 -m 3
$IPCHAINS -A input -i $INDEV -s 10.2.0.24 -m 1
$IPCHAINS -A input -i $INDEV -s 10.2.0.3 -m 2
######################## Egress side ########################
# attach a dsmarker
#
$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64 set_tc_index
#
# values of the DSCP to change depending on the class
#
# class 1:1 becomes EF
$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \
value 0xb8
# class 1:2 becomes AF11
$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \
value 0x28
# class 1:3 becomes AF21
$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
value 0x48
#
#
# The class mapping: packets marked 1, 2 and 3 by ipchains above go to
# classes 1:1, 1:2 and 1:3 respectively.
#
$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 1 fw classid 1:1
$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 2 fw classid 1:2
$TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle 3 fw classid 1:3
#
#
# Show the resulting configuration.
# NOTE(review): this script attaches nothing to $INDEV, so the three
# "Ingress" listings are presumably expected to come back empty - confirm.
echo "---- qdisc parameters Ingress ----------"
$TC qdisc ls dev $INDEV
echo "---- Class parameters Ingress ----------"
$TC class ls dev $INDEV
echo "---- filter parameters Ingress ----------"
$TC filter ls dev $INDEV parent 1:0
echo "---- qdisc parameters Egress ----------"
$TC qdisc ls $EGDEV
echo "---- Class parameters Egress ----------"
$TC class ls $EGDEV
echo "---- filter parameters Egress ----------"
$TC filter ls $EGDEV parent 1:0
#! /bin/sh -x
#
# Sample script demonstrating the ingress qdisc together with ipchains.
# ipchains stamps an fwmark on packets arriving at the box; that mark is
# used first to police traffic on the ingress interface and afterwards
# for fast classification and re-marking on the egress interface.
#
# Locations of the utilities -- adjust them to match your system.
#
IPROUTE=/root/DS-6-beta/iproute2-990530-dsing
TC=$IPROUTE/tc/tc
IP=$IPROUTE/ip/ip
IPCHAINS=/root/DS-6-beta/ipchains-1.3.9/ipchains
INDEV=eth2
# EGDEV deliberately carries the "dev" keyword, so it must stay unquoted
# wherever it is used below.
EGDEV="dev eth1"

############################################################
# fwmark assignment (the marks are consumed again on the egress side):
#   mark 3 - any packet from subnet 10.2.0.0/24
#   mark 1 - packets from host 10.2.0.24 arriving on $INDEV
#   mark 2 - packets from host 10.2.0.3  arriving on $INDEV
# The subnet-wide rule is appended first and the host rules after it.
############################################################
$IPCHAINS -A input -s 10.2.0.0/24 -m 3
$IPCHAINS -A input -i $INDEV -s 10.2.0.24 -m 1
$IPCHAINS -A input -i $INDEV -s 10.2.0.3 -m 2

############################################################
# Install the ingress qdisc on the ingress interface.
############################################################
$TC qdisc add dev $INDEV handle ffff: ingress

############################################################
# Attach a fw classifier to the ingress qdisc. It polices everything
# ipchains tagged with mark 3 (the rest of the subnet, i.e. neither
# mark 1 nor mark 2) so that it does not go beyond 1.5Mbps.
# Assuming a maximum packet size of 1.5 KB, this allows bursts of at
# least 60 packets in the long run and up to about 6 packets in the
# short run.
############################################################
$TC filter add dev $INDEV parent ffff: protocol ip prio 50 handle 3 fw police rate 1500kbit burst 90k mtu 9k drop flowid :1

######################## Egress side ########################
# Attach a dsmarker as the root qdisc of the egress device.
#
$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64
#
# DSCP value to write for each class.
#
$TC class change $EGDEV classid 1:1 dsmark mask 0x3 value 0xb8
$TC class change $EGDEV classid 1:2 dsmark mask 0x3 value 0x28
$TC class change $EGDEV classid 1:3 dsmark mask 0x3 value 0x48
#
# The class mapping: fwmark N selects class 1:N.
#
for mark in 1 2 3; do
    $TC filter add $EGDEV parent 1:0 protocol ip prio 4 handle $mark fw classid 1:$mark
done

#
# Dump the resulting configuration of both interfaces.
#
echo "---- qdisc parameters Ingress ----------"
$TC qdisc ls dev $INDEV
echo "---- Class parameters Ingress ----------"
$TC class ls dev $INDEV
echo "---- filter parameters Ingress ----------"
$TC filter ls dev $INDEV parent ffff:
echo "---- qdisc parameters Egress ----------"
$TC qdisc ls $EGDEV
echo "---- Class parameters Egress ----------"
$TC class ls $EGDEV
echo "---- filter parameters Egress ----------"
$TC filter ls $EGDEV parent 1:0
#
# To delete the ingress qdisc again:
#$TC qdisc del dev $INDEV ingress
#! /bin/sh -x
#
# Sample script on using the ingress capabilities with the u32 classifier.
# Packets are metered on the ingress interface and tagged with a tcindex
# value; that tag is then used for fast classification and re-marking
# on the egress interface.
# This is an example of a color-aware marker with PIR configured,
# based on draft-wahjak-mcm-00.txt (section 3.1).
#
# The colors are defined using the Diffserv fields.
#
# Paths to various utilities; change them to reflect yours.
#
IPROUTE=/usr/src/iproute2-current
TC=$IPROUTE/tc/tc
IP=$IPROUTE/ip/ip
INDEV=eth0
# EGDEV includes the "dev" keyword, so it is used unquoted below.
EGDEV="dev eth1"
CIR1=1500kbit
CIR2=1000kbit
# The CBS is about 60 MTU-sized packets
CBS1=90k
CBS2=90k
############################################################
#
# install the ingress qdisc on the ingress interface
$TC qdisc add dev $INDEV handle ffff: ingress
############################################################
#
# Create u32 filters
$TC filter add dev $INDEV parent ffff: protocol ip prio 4 handle 1: u32 \
    divisor 1
############################################################
# The meters: note that we have shared meters in this case, as identified
# by the index parameter.
# Each meter string starts and ends with a blank so that it can be
# spliced into a filter command line unquoted.
meter1=" police index 1 rate $CIR1 burst $CBS1 "
meter2=" police index 2 rate $CIR2 burst $CBS1 "
meter3=" police index 3 rate $CIR2 burst $CBS2 "
meter4=" police index 4 rate $CIR1 burst $CBS2 "
# NOTE(review): meter5 is defined but not referenced anywhere in this script.
meter5=" police index 5 rate $CIR1 burst $CBS2 "
# All packets are marked with a tcindex value which is used on the egress
# tcindex 1 maps to AF41, 2->AF42, 3->AF43, 4->BE
# *********************** AF41 ***************************
# AF41 (DSCP 0x22) is passed on with a tcindex value 1
# if it doesn't exceed its CIR/CBS;
# policer 1 is used.
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 4 u32 \
    match ip tos 0x88 0xfc \
    $meter1 \
    continue flowid :1
#
# if it exceeds the above but not the extra rate/burst below, it gets a
# tcindex value of 2
# policer 2 is used
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 5 u32 \
    match ip tos 0x88 0xfc \
    $meter2 \
    continue flowid :2
#
# if it exceeds the above but not the rule below, it gets a tcindex value
# of 3 (policer 3)
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \
    match ip tos 0x88 0xfc \
    $meter3 \
    drop flowid :3
#
# *********************** AF42 ***************************
# AF42 (DSCP 0x24) is passed on with a tcindex value 2
# if it doesn't exceed its CIR/CBS;
# policer 2 is used. Note that this is shared with the AF41
#
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 5 u32 \
    match ip tos 0x90 0xfc \
    $meter2 \
    continue flowid :2
#
# if it exceeds the above but not the rule below, it gets a tcindex value
# of 3 (policer 3)
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \
    match ip tos 0x90 0xfc \
    $meter3 \
    drop flowid :3
#
# *********************** AF43 ***************************
#
# AF43 (DSCP 0x26) is passed on with a tcindex value 3
# if it doesn't exceed its CIR/CBS;
# policer 3 is used. Note that this is shared with the AF41 and AF42
#
$TC filter add dev $INDEV parent ffff: protocol ip prio 6 u32 \
    match ip tos 0x98 0xfc \
    $meter3 \
    drop flowid :3
#
# *********************** BE ***************************
#
# Anything else (not from the AF4*) gets discarded if it
# exceeds its rate and by default goes to BE if it doesn't.
# Note that the BE class is also used by the AF4* in the worst
# case.
# NOTE(review): the original comment gave the limit as 1Mbps, but meter4
# is built from CIR1 (1500kbit) -- confirm which one is intended.
#
# A blank is kept before the line continuation so that the match
# argument and the meter never depend on the meter string's own
# leading blank to stay separate words.
$TC filter add dev $INDEV parent ffff: protocol ip prio 7 u32 \
    match ip src 0/0 \
    $meter4 \
    drop flowid :4
######################## Egress side ########################
# attach a dsmarker
#
$TC qdisc add $EGDEV handle 1:0 root dsmark indices 64
#
# values of the DSCP to change depending on the class
# note that the ECN bits are masked out
#
# AF41 (0x88 is 0x22 shifted to the left by two bits)
#
$TC class change $EGDEV classid 1:1 dsmark mask 0x3 \
    value 0x88
# AF42
$TC class change $EGDEV classid 1:2 dsmark mask 0x3 \
    value 0x90
# AF43
$TC class change $EGDEV classid 1:3 dsmark mask 0x3 \
    value 0x98
# BE
# tcindex 4 is mapped to class 1:4 by the filters below, so the BE value
# has to be set on 1:4. Setting it on 1:3 would overwrite the AF43 value
# configured just above.
$TC class change $EGDEV classid 1:4 dsmark mask 0x3 \
    value 0x0
#
#
# The class mapping: tcindex N selects class 1:N
#
$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
    handle 1 tcindex classid 1:1
$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
    handle 2 tcindex classid 1:2
$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
    handle 3 tcindex classid 1:3
$TC filter add $EGDEV parent 1:0 protocol ip prio 1 \
    handle 4 tcindex classid 1:4
#
#
echo "---- qdisc parameters Ingress ----------"
$TC qdisc ls dev $INDEV
echo "---- Class parameters Ingress ----------"
$TC class ls dev $INDEV
echo "---- filter parameters Ingress ----------"
$TC filter ls dev $INDEV parent ffff:
echo "---- qdisc parameters Egress ----------"
$TC qdisc ls $EGDEV
echo "---- Class parameters Egress ----------"
$TC class ls $EGDEV
echo "---- filter parameters Egress ----------"
$TC filter ls $EGDEV parent 1:0
#
# deleting the ingress qdisc
#$TC qdisc del dev $INDEV ingress
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
#!/usr/bin/perl
#
# Prints (does not execute) the tc commands for an EF/BE setup built on
# a dsmark root qdisc with a prio qdisc underneath:
#   - tcindex 0x2e (EF) goes to band 2:1, which is shaped by a tbf
#   - everything else   goes to band 2:2, which is managed by RED
#
$TC = "/root/DS-6-beta/iproute2-990530-dsing/tc/tc";
$DEV = "dev eth1";
$efrate = "1.5Mbit";
$MTU = "1.5kB";

@commands = (
    # Root dsmark qdisc; the tcindex filter keeps the upper six bits of
    # the DS field (mask 0xfc) and shifts them down by two.
    "$TC qdisc add $DEV handle 1:0 root dsmark indices 64 set_tc_index\n",
    "$TC filter add $DEV parent 1:0 protocol ip prio 1 tcindex ".
        "mask 0xfc shift 2\n",
    "$TC qdisc add $DEV parent 1:0 handle 2:0 prio\n",

    # EF class: at most about one MTU-sized packet is allowed on the queue.
    "$TC qdisc add $DEV parent 2:1 tbf rate $efrate burst $MTU limit 1.6kB\n",
    "$TC filter add $DEV parent 2:0 protocol ip prio 1 ".
        "handle 0x2e tcindex classid 2:1 pass_on\n",

    # BE class: a RED queue, plus a catch-all filter (mask 0 / handle 0).
    "#BE class(2:2) \n",
    "$TC qdisc add $DEV parent 2:2 red limit 60KB ".
        "min 15KB max 45KB burst 20 avpkt 1000 bandwidth 10Mbit ".
        "probability 0.4\n",
    "$TC filter add $DEV parent 2:0 protocol ip prio 2 ".
        "handle 0 tcindex mask 0 classid 2:2 pass_on\n",
);

# Emit the commands in order on standard output.
print @commands;
#!/usr/bin/perl
#
# Prints (does not execute) the tc commands for an EF/BE setup built on
# a dsmark root qdisc with a cbq qdisc underneath:
#   - tcindex 0x2e (EF) goes to cbq class 2:1, served by a short pfifo
#   - everything else   goes to cbq class 2:2, served by a RED queue
#
$TC = "/root/DS-6-beta/iproute2-990530-dsing/tc/tc";
$DEV = "dev eth1";

@commands = (
    # Root dsmark qdisc; the tcindex filter keeps the upper six bits of
    # the DS field (mask 0xfc) and shifts them down by two.
    "$TC qdisc add $DEV handle 1:0 root dsmark indices 64 set_tc_index\n",
    "$TC filter add $DEV parent 1:0 protocol ip prio 1 tcindex ".
        "mask 0xfc shift 2\n",
    "$TC qdisc add $DEV parent 1:0 handle 2:0 cbq bandwidth ".
        "10Mbit cell 8 avpkt 1000 mpu 64\n",

    # EF class: bounded and isolated, 1500Kbit.
    "$TC class add $DEV parent 2:0 classid 2:1 cbq bandwidth ".
        "10Mbit rate 1500Kbit avpkt 1000 prio 1 bounded isolated ".
        "allot 1514 weight 1 maxburst 10 \n",
    # packet fifo for EF?
    "$TC qdisc add $DEV parent 2:1 pfifo limit 5\n",
    "$TC filter add $DEV parent 2:0 protocol ip prio 1 ".
        "handle 0x2e tcindex classid 2:1 pass_on\n",

    # BE class: may borrow, uses RED, and gets a catch-all filter
    # (mask 0 / handle 0).
    "#BE class(2:2) \n",
    "$TC class add $DEV parent 2:0 classid 2:2 cbq bandwidth ".
        "10Mbit rate 5Mbit avpkt 1000 prio 7 allot 1514 weight 1 ".
        "maxburst 21 borrow split 2:0 defmap 0xffff \n",
    "$TC qdisc add $DEV parent 2:2 red limit 60KB ".
        "min 15KB max 45KB burst 20 avpkt 1000 bandwidth 10Mbit ".
        "probability 0.4\n",
    "$TC filter add $DEV parent 2:0 protocol ip prio 2 ".
        "handle 0 tcindex mask 0 classid 2:2 pass_on\n",
);

# Emit the commands in order on standard output.
print @commands;
This diff is collapsed.
/* I cannot describe, how I laughed, when saw, that now sys/socket.h
includes ALL OF networking include files. 8)8)8)
Bravo! Aah, they forgot sockaddr_ll, sockaddr_pkt and sockaddr_nl...
Not a big problem, we only start the way to single UNIVERSAL include file:
#include <GNU-Gnu_is_Not_Unix.h>.
Jokes apart, it is full crap. Removed.
--ANK
*/
/* Union of all sockaddr types (required by IPv6 Basic API). This is
somewhat evil. */
/* 8)8) Well, ipngwg really does strange things sometimes, but
not in such extent! It is removed long ago --ANK
*/
/*
 * Overlays a generic struct sockaddr with a 128-byte character array,
 * so the union as a whole occupies at least 128 bytes.
 */
union sockaddr_union {
	struct sockaddr sa;	/* the address viewed as a generic sockaddr */
	char __maxsize[128];	/* filler that fixes the minimum size */
};
/* Mess with various libdb in various glibcs is something...
* Crooked hands of hackers can result in amazing results making
* incompatibility at all the levels without any reasons.
*
* The simplest trick which I was able to invent is to write fake
* db.h including db_185.h and adding -I/usr/include/db3 to CFLAGS.
* Looks ugly but compiles everywhere.
*/
#include <db_185.h>
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
/* Snapshot tag string; presumably a YYMMDD release date -- TODO confirm. */
static char SNAPSHOT[] = "020116";
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment