1. 10 Jun, 2013 40 commits
    • Weiping Pan's avatar
      rds: set correct msg_namelen · a86c9b39
      Weiping Pan authored
      commit 06b6a1cf upstream
      
      Jay Fenlason (fenlason@redhat.com) found a bug,
      that recvfrom() on an RDS socket can return the contents of random kernel
      memory to userspace if it was called with a address length larger than
      sizeof(struct sockaddr_in).
      rds_recvmsg() also fails to set the addr_len paramater properly before
      returning, but that's just a bug.
      There are also a number of cases wher recvfrom() can return an entirely bogus
      address. Anything in rds_recvmsg() that returns a non-negative value but does
      not go through the "sin = (struct sockaddr_in *)msg->msg_name;" code path
      at the end of the while(1) loop will return up to 128 bytes of kernel memory
      to userspace.
      
      And I write two test programs to reproduce this bug, you will see that in
      rds_server, fromAddr will be overwritten and the following sock_fd will be
      destroyed.
      Yes, it is the programmer's fault to set msg_namelen incorrectly, but it is
      better to make the kernel copy the real length of address to user space in
      such case.
      
      How to run the test programs ?
      I test them on 32bit x86 system, 3.5.0-rc7.
      
      1 compile
      gcc -o rds_client rds_client.c
      gcc -o rds_server rds_server.c
      
      2 run ./rds_server on one console
      
      3 run ./rds_client on another console
      
      4 you will see something like:
      server is waiting to receive data...
      old socket fd=3
      server received data from client:data from client
      msg.msg_namelen=32
      new socket fd=-1067277685
      sendmsg()
      : Bad file descriptor
      
      /***************** rds_client.c ********************/
      
      int main(void)
      {
      	int sock_fd;
      	struct sockaddr_in serverAddr;
      	struct sockaddr_in toAddr;
      	char recvBuffer[128] = "data from client";
      	struct msghdr msg;
      	struct iovec iov;
      
      	sock_fd = socket(AF_RDS, SOCK_SEQPACKET, 0);
      	if (sock_fd < 0) {
      		perror("create socket error\n");
      		exit(1);
      	}
      
      	memset(&serverAddr, 0, sizeof(serverAddr));
      	serverAddr.sin_family = AF_INET;
      	serverAddr.sin_addr.s_addr = inet_addr("127.0.0.1");
      	serverAddr.sin_port = htons(4001);
      
      	if (bind(sock_fd, (struct sockaddr*)&serverAddr, sizeof(serverAddr)) < 0) {
      		perror("bind() error\n");
      		close(sock_fd);
      		exit(1);
      	}
      
      	memset(&toAddr, 0, sizeof(toAddr));
      	toAddr.sin_family = AF_INET;
      	toAddr.sin_addr.s_addr = inet_addr("127.0.0.1");
      	toAddr.sin_port = htons(4000);
      	msg.msg_name = &toAddr;
      	msg.msg_namelen = sizeof(toAddr);
      	msg.msg_iov = &iov;
      	msg.msg_iovlen = 1;
      	msg.msg_iov->iov_base = recvBuffer;
      	msg.msg_iov->iov_len = strlen(recvBuffer) + 1;
      	msg.msg_control = 0;
      	msg.msg_controllen = 0;
      	msg.msg_flags = 0;
      
      	if (sendmsg(sock_fd, &msg, 0) == -1) {
      		perror("sendto() error\n");
      		close(sock_fd);
      		exit(1);
      	}
      
      	printf("client send data:%s\n", recvBuffer);
      
      	memset(recvBuffer, '\0', 128);
      
      	msg.msg_name = &toAddr;
      	msg.msg_namelen = sizeof(toAddr);
      	msg.msg_iov = &iov;
      	msg.msg_iovlen = 1;
      	msg.msg_iov->iov_base = recvBuffer;
      	msg.msg_iov->iov_len = 128;
      	msg.msg_control = 0;
      	msg.msg_controllen = 0;
      	msg.msg_flags = 0;
      	if (recvmsg(sock_fd, &msg, 0) == -1) {
      		perror("recvmsg() error\n");
      		close(sock_fd);
      		exit(1);
      	}
      
      	printf("receive data from server:%s\n", recvBuffer);
      
      	close(sock_fd);
      
      	return 0;
      }
      
      /***************** rds_server.c ********************/
      
      int main(void)
      {
      	struct sockaddr_in fromAddr;
      	int sock_fd;
      	struct sockaddr_in serverAddr;
      	unsigned int addrLen;
      	char recvBuffer[128];
      	struct msghdr msg;
      	struct iovec iov;
      
      	sock_fd = socket(AF_RDS, SOCK_SEQPACKET, 0);
      	if(sock_fd < 0) {
      		perror("create socket error\n");
      		exit(0);
      	}
      
      	memset(&serverAddr, 0, sizeof(serverAddr));
      	serverAddr.sin_family = AF_INET;
      	serverAddr.sin_addr.s_addr = inet_addr("127.0.0.1");
      	serverAddr.sin_port = htons(4000);
      	if (bind(sock_fd, (struct sockaddr*)&serverAddr, sizeof(serverAddr)) < 0) {
      		perror("bind error\n");
      		close(sock_fd);
      		exit(1);
      	}
      
      	printf("server is waiting to receive data...\n");
      	msg.msg_name = &fromAddr;
      
      	/*
      	 * I add 16 to sizeof(fromAddr), ie 32,
      	 * and pay attention to the definition of fromAddr,
      	 * recvmsg() will overwrite sock_fd,
      	 * since kernel will copy 32 bytes to userspace.
      	 *
      	 * If you just use sizeof(fromAddr), it works fine.
      	 * */
      	msg.msg_namelen = sizeof(fromAddr) + 16;
      	/* msg.msg_namelen = sizeof(fromAddr); */
      	msg.msg_iov = &iov;
      	msg.msg_iovlen = 1;
      	msg.msg_iov->iov_base = recvBuffer;
      	msg.msg_iov->iov_len = 128;
      	msg.msg_control = 0;
      	msg.msg_controllen = 0;
      	msg.msg_flags = 0;
      
      	while (1) {
      		printf("old socket fd=%d\n", sock_fd);
      		if (recvmsg(sock_fd, &msg, 0) == -1) {
      			perror("recvmsg() error\n");
      			close(sock_fd);
      			exit(1);
      		}
      		printf("server received data from client:%s\n", recvBuffer);
      		printf("msg.msg_namelen=%d\n", msg.msg_namelen);
      		printf("new socket fd=%d\n", sock_fd);
      		strcat(recvBuffer, "--data from server");
      		if (sendmsg(sock_fd, &msg, 0) == -1) {
      			perror("sendmsg()\n");
      			close(sock_fd);
      			exit(1);
      		}
      	}
      
      	close(sock_fd);
      	return 0;
      }
      Signed-off-by: default avatarWeiping Pan <wpan@redhat.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      [dannf: Adjusted to apply to Debian's 2.6.32]
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      a86c9b39
    • Mathias Krause's avatar
      llc: Fix missing msg_namelen update in llc_ui_recvmsg() · 5f527802
      Mathias Krause authored
      [ Upstream commit c77a4b9c ]
      
      For stream sockets the code misses to update the msg_namelen member
      to 0 and therefore makes net/socket.c leak the local, uninitialized
      sockaddr_storage variable to userland -- 128 bytes of kernel stack
      memory. The msg_namelen update is also missing for datagram sockets
      in case the socket is shutting down during receive.
      
      Fix both issues by setting msg_namelen to 0 early. It will be
      updated later if we're going to fill the msg_name member.
      Signed-off-by: default avatarMathias Krause <minipli@googlemail.com>
      Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      5f527802
    • Mathias Krause's avatar
      llc: fix info leak via getsockname() · 3c445af8
      Mathias Krause authored
      [ Upstream commit 3592aaeb ]
      
      The LLC code wrongly returns 0, i.e. "success", when the socket is
      zapped. Together with the uninitialized uaddrlen pointer argument from
      sys_getsockname this leads to an arbitrary memory leak of up to 128
      bytes kernel stack via the getsockname() syscall.
      
      Return an error instead when the socket is zapped to prevent the info
      leak. Also remove the unnecessary memset(0). We don't directly write to
      the memory pointed by uaddr but memcpy() a local structure at the end of
      the function that is properly initialized.
      Signed-off-by: default avatarMathias Krause <minipli@googlemail.com>
      Cc: Arnaldo Carvalho de Melo <acme@ghostprotocols.net>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      3c445af8
    • Mathias Krause's avatar
      iucv: Fix missing msg_namelen update in iucv_sock_recvmsg() · 775ad2a8
      Mathias Krause authored
      [ Upstream commit a5598bd9 ]
      
      The current code does not fill the msg_name member in case it is set.
      It also does not set the msg_namelen member to 0 and therefore makes
      net/socket.c leak the local, uninitialized sockaddr_storage variable
      to userland -- 128 bytes of kernel stack memory.
      
      Fix that by simply setting msg_namelen to 0 as obviously nobody cared
      about iucv_sock_recvmsg() not filling the msg_name in case it was set.
      Signed-off-by: default avatarMathias Krause <minipli@googlemail.com>
      Cc: Ursula Braun <ursula.braun@de.ibm.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      775ad2a8
    • Wu Fengguang's avatar
      isdnloop: fix and simplify isdnloop_init() · 3822b3c5
      Wu Fengguang authored
      [ Upstream commit 77f00f63 ]
      
      Fix a buffer overflow bug by removing the revision and printk.
      
      [   22.016214] isdnloop-ISDN-driver Rev 1.11.6.7
      [   22.097508] isdnloop: (loop0) virtual card added
      [   22.174400] Kernel panic - not syncing: stack-protector: Kernel stack is corrupted in: ffffffff83244972
      [   22.174400]
      [   22.436157] Pid: 1, comm: swapper Not tainted 3.5.0-bisect-00018-gfa8bbb13-dirty #129
      [   22.624071] Call Trace:
      [   22.720558]  [<ffffffff832448c3>] ? CallcNew+0x56/0x56
      [   22.815248]  [<ffffffff8222b623>] panic+0x110/0x329
      [   22.914330]  [<ffffffff83244972>] ? isdnloop_init+0xaf/0xb1
      [   23.014800]  [<ffffffff832448c3>] ? CallcNew+0x56/0x56
      [   23.090763]  [<ffffffff8108e24b>] __stack_chk_fail+0x2b/0x30
      [   23.185748]  [<ffffffff83244972>] isdnloop_init+0xaf/0xb1
      Signed-off-by: default avatarFengguang Wu <fengguang.wu@intel.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      3822b3c5
    • Mathias Krause's avatar
      ax25: fix info leak via msg_name in ax25_recvmsg() · 92df2f60
      Mathias Krause authored
      [ Upstream commit ef3313e8 ]
      
      When msg_namelen is non-zero the sockaddr info gets filled out, as
      requested, but the code fails to initialize the padding bytes of struct
      sockaddr_ax25 inserted by the compiler for alignment. Additionally the
      msg_namelen value is updated to sizeof(struct full_sockaddr_ax25) but is
      not always filled up to this size.
      
      Both issues lead to the fact that the code will leak uninitialized
      kernel stack bytes in net/socket.c.
      
      Fix both issues by initializing the memory with memset(0).
      Signed-off-by: default avatarMathias Krause <minipli@googlemail.com>
      Cc: Ralf Baechle <ralf@linux-mips.org>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      92df2f60
    • Mathias Krause's avatar
      atm: fix info leak in getsockopt(SO_ATMPVC) · 55dde8cf
      Mathias Krause authored
      commit e862f1a9 upstream.
      
      The ATM code fails to initialize the two padding bytes of struct
      sockaddr_atmpvc inserted for alignment. Add an explicit memset(0)
      before filling the structure to avoid the info leak.
      Signed-off-by: default avatarMathias Krause <minipli@googlemail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      [bwh: Backported to 2.6.32: adjust context, indentation]
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      55dde8cf
    • Mathias Krause's avatar
      atm: fix info leak via getsockname() · dde45d39
      Mathias Krause authored
      commit 3c0c5cfd upstream.
      
      The ATM code fails to initialize the two padding bytes of struct
      sockaddr_atmpvc inserted for alignment. Add an explicit memset(0)
      before filling the structure to avoid the info leak.
      Signed-off-by: default avatarMathias Krause <minipli@googlemail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      [bwh: Backported to 2.6.32: adjust context]
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      dde45d39
    • Mathias Krause's avatar
      atm: update msg_namelen in vcc_recvmsg() · 531539ab
      Mathias Krause authored
      [ Upstream commit 9b3e617f ]
      
      The current code does not fill the msg_name member in case it is set.
      It also does not set the msg_namelen member to 0 and therefore makes
      net/socket.c leak the local, uninitialized sockaddr_storage variable
      to userland -- 128 bytes of kernel stack memory.
      
      Fix that by simply setting msg_namelen to 0 as obviously nobody cared
      about vcc_recvmsg() not filling the msg_name in case it was set.
      Signed-off-by: default avatarMathias Krause <minipli@googlemail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      531539ab
    • Mathias Krause's avatar
      ipvs: fix info leak in getsockopt(IP_VS_SO_GET_TIMEOUT) · 75ca2088
      Mathias Krause authored
      commit 2d8a041b upstream.
      
      If at least one of CONFIG_IP_VS_PROTO_TCP or CONFIG_IP_VS_PROTO_UDP is
      not set, __ip_vs_get_timeouts() does not fully initialize the structure
      that gets copied to userland and that for leaks up to 12 bytes of kernel
      stack. Add an explicit memset(0) before passing the structure to
      __ip_vs_get_timeouts() to avoid the info leak.
      Signed-off-by: default avatarMathias Krause <minipli@googlemail.com>
      Cc: Wensong Zhang <wensong@linux-vs.org>
      Cc: Simon Horman <horms@verge.net.au>
      Cc: Julian Anastasov <ja@ssi.bg>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      [bwh: Backported to 2.6.32: adjust context]
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      75ca2088
    • Jesper Dangaard Brouer's avatar
      ipvs: IPv6 MTU checking cleanup and bugfix · 9df2c9ad
      Jesper Dangaard Brouer authored
      Cleaning up the IPv6 MTU checking in the IPVS xmit code, by using
      a common helper function __mtu_check_toobig_v6().
      
      The MTU check for tunnel mode can also use this helper as
      ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr) is qual to
      skb->len.  And the 'mtu' variable have been adjusted before
      calling helper.
      
      Notice, this also fixes a bug, as the the MTU check in ip_vs_dr_xmit_v6()
      were missing a check for skb_is_gso().
      
      This bug e.g. caused issues for KVM IPVS setups, where different
      Segmentation Offloading techniques are utilized, between guests,
      via the virtio driver.  This resulted in very bad performance,
      due to the ICMPv6 "too big" messages didn't affect the sender.
      Signed-off-by: default avatarJesper Dangaard Brouer <brouer@redhat.com>
      Signed-off-by: default avatarPatrick McHardy <kaber@trash.net>
      Signed-off-by: default avatarPablo Neira Ayuso <pablo@netfilter.org>
      (cherry picked from commit 590e3f79)
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      9df2c9ad
    • Simon Horman's avatar
      ipvs: allow transmit of GRO aggregated skbs · ec3dc8cd
      Simon Horman authored
      Attempt at allowing LVS to transmit skbs of greater than MTU length that
      have been aggregated by GRO and can thus be deaggregated by GSO.
      
      Cc: Julian Anastasov <ja@ssi.bg>
      Cc: Herbert Xu <herbert@gondor.apana.org.au>
      Signed-off-by: default avatarSimon Horman <horms@verge.net.au>
      (cherry picked from commit 8f1b03a4)
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      ec3dc8cd
    • Jozsef Kadlecsik's avatar
      netfilter: nf_ct_ipv4: packets with wrong ihl are invalid · df7753cf
      Jozsef Kadlecsik authored
      commit 07153c6e upstream.
      
      It was reported that the Linux kernel sometimes logs:
      
      klogd: [2629147.402413] kernel BUG at net / netfilter /
      nf_conntrack_proto_tcp.c: 447!
      klogd: [1072212.887368] kernel BUG at net / netfilter /
      nf_conntrack_proto_tcp.c: 392
      
      ipv4_get_l4proto() in nf_conntrack_l3proto_ipv4.c and tcp_error() in
      nf_conntrack_proto_tcp.c should catch malformed packets, so the errors
      at the indicated lines - TCP options parsing - should not happen.
      However, tcp_error() relies on the "dataoff" offset to the TCP header,
      calculated by ipv4_get_l4proto().  But ipv4_get_l4proto() does not check
      bogus ihl values in IPv4 packets, which then can slip through tcp_error()
      and get caught at the TCP options parsing routines.
      
      The patch fixes ipv4_get_l4proto() by invalidating packets with bogus
      ihl value.
      
      The patch closes netfilter bugzilla id 771.
      Signed-off-by: default avatarJozsef Kadlecsik <kadlec@blackhole.kfki.hu>
      Signed-off-by: default avatarPablo Neira Ayuso <pablo@netfilter.org>
      Acked-by: default avatarDavid Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      df7753cf
    • Eric Dumazet's avatar
      ipv6: make fragment identifications less predictable · c023a0b4
      Eric Dumazet authored
      [ Backport of upstream commit 87c48fa3 ]
      
      Fernando Gont reported current IPv6 fragment identification generation
      was not secure, because using a very predictable system-wide generator,
      allowing various attacks.
      
      IPv4 uses inetpeer cache to address this problem and to get good
      performance. We'll use this mechanism when IPv6 inetpeer is stable
      enough in linux-3.1
      
      For the time being, we use jhash on destination address to provide less
      predictable identifications. Also remove a spinlock and use cmpxchg() to
      get better SMP performance.
      Reported-by: default avatarFernando Gont <fernando@gont.com.ar>
      Signed-off-by: default avatarEric Dumazet <eric.dumazet@gmail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@suse.de>
      [bwh: Backport further to 2.6.32]
      Signed-off-by: default avatarBen Hutchings <ben@decadent.org.uk>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      c023a0b4
    • Nicolas Dichtel's avatar
      ipv6: discard overlapping fragment · b1a1c38d
      Nicolas Dichtel authored
      commit 70789d70 upstream
      
      RFC5722 prohibits reassembling fragments when some data overlaps.
      
      Bug spotted by Zhang Zuotao <zuotao.zhang@6wind.com>.
      Signed-off-by: default avatarNicolas Dichtel <nicolas.dichtel@6wind.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      [dannf: backported to Debian's 2.6.32]
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      b1a1c38d
    • Daniel Borkmann's avatar
      net: sctp: sctp_auth_key_put: use kzfree instead of kfree · 9c51a966
      Daniel Borkmann authored
      [ Upstream commit 586c31f3 ]
      
      For sensitive data like keying material, it is common practice to zero
      out keys before returning the memory back to the allocator. Thus, use
      kzfree instead of kfree.
      Signed-off-by: default avatarDaniel Borkmann <dborkman@redhat.com>
      Acked-by: default avatarNeil Horman <nhorman@tuxdriver.com>
      Acked-by: default avatarVlad Yasevich <vyasevich@gmail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      9c51a966
    • Daniel Borkmann's avatar
      net: sctp: sctp_endpoint_free: zero out secret key data · 57201215
      Daniel Borkmann authored
      [ Upstream commit b5c37fe6 ]
      
      On sctp_endpoint_destroy, previously used sensitive keying material
      should be zeroed out before the memory is returned, as we already do
      with e.g. auth keys when released.
      Signed-off-by: default avatarDaniel Borkmann <dborkman@redhat.com>
      Acked-by: default avatarVlad Yasevich <vyasevic@redhat.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      57201215
    • Daniel Borkmann's avatar
      net: sctp: sctp_setsockopt_auth_key: use kzfree instead of kfree · bae3ff4a
      Daniel Borkmann authored
      [ Upstream commit 6ba542a2 ]
      
      In sctp_setsockopt_auth_key, we create a temporary copy of the user
      passed shared auth key for the endpoint or association and after
      internal setup, we free it right away. Since it's sensitive data, we
      should zero out the key before returning the memory back to the
      allocator. Thus, use kzfree instead of kfree, just as we do in
      sctp_auth_key_put().
      Signed-off-by: default avatarDaniel Borkmann <dborkman@redhat.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      bae3ff4a
    • Tommi Rantala's avatar
      sctp: fix memory leak in sctp_datamsg_from_user() when copy from user space fails · 5e4b9c85
      Tommi Rantala authored
      [ Upstream commit be364c8c ]
      
      Trinity (the syscall fuzzer) discovered a memory leak in SCTP,
      reproducible e.g. with the sendto() syscall by passing invalid
      user space pointer in the second argument:
      
       #include <string.h>
       #include <arpa/inet.h>
       #include <sys/socket.h>
      
       int main(void)
       {
               int fd;
               struct sockaddr_in sa;
      
               fd = socket(AF_INET, SOCK_STREAM, 132 /*IPPROTO_SCTP*/);
               if (fd < 0)
                       return 1;
      
               memset(&sa, 0, sizeof(sa));
               sa.sin_family = AF_INET;
               sa.sin_addr.s_addr = inet_addr("127.0.0.1");
               sa.sin_port = htons(11111);
      
               sendto(fd, NULL, 1, 0, (struct sockaddr *)&sa, sizeof(sa));
      
               return 0;
       }
      
      As far as I can tell, the leak has been around since ~2003.
      Signed-off-by: default avatarTommi Rantala <tt.rantala@gmail.com>
      Acked-by: default avatarVlad Yasevich <vyasevich@gmail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      5e4b9c85
    • Mathias Krause's avatar
      dcbnl: fix various netlink info leaks · 0e03cad4
      Mathias Krause authored
      commit 29cd8ae0 upstream.
      
      The dcb netlink interface leaks stack memory in various places:
      * perm_addr[] buffer is only filled at max with 12 of the 32 bytes but
        copied completely,
      * no in-kernel driver fills all fields of an IEEE 802.1Qaz subcommand,
        so we're leaking up to 58 bytes for ieee_ets structs, up to 136 bytes
        for ieee_pfc structs, etc.,
      * the same is true for CEE -- no in-kernel driver fills the whole
        struct,
      
      Prevent all of the above stack info leaks by properly initializing the
      buffers/structures involved.
      Signed-off-by: default avatarMathias Krause <minipli@googlemail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      [bwh: Backported to 2.6.32: no support for IEEE or CEE commands, so only
       deal with perm_addr]
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      0e03cad4
    • Paul Moore's avatar
      unix: fix a race condition in unix_release() · 480cabcc
      Paul Moore authored
      [ Upstream commit ded34e0f ]
      
      As reported by Jan, and others over the past few years, there is a
      race condition caused by unix_release setting the sock->sk pointer
      to NULL before properly marking the socket as dead/orphaned.  This
      can cause a problem with the LSM hook security_unix_may_send() if
      there is another socket attempting to write to this partially
      released socket in between when sock->sk is set to NULL and it is
      marked as dead/orphaned.  This patch fixes this by only setting
      sock->sk to NULL after the socket has been marked as dead; I also
      take the opportunity to make unix_release_sock() a void function
      as it only ever returned 0/success.
      
      Dave, I think this one should go on the -stable pile.
      
      Special thanks to Jan for coming up with a reproducer for this
      problem.
      Reported-by: default avatarJan Stancek <jan.stancek@gmail.com>
      Signed-off-by: default avatarPaul Moore <pmoore@redhat.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      480cabcc
    • Eric Dumazet's avatar
      tcp: preserve ACK clocking in TSO · 0d99f344
      Eric Dumazet authored
      [ Upstream commit f4541d60 ]
      
      A long standing problem with TSO is the fact that tcp_tso_should_defer()
      rearms the deferred timer, while it should not.
      
      Current code leads to following bad bursty behavior :
      
      20:11:24.484333 IP A > B: . 297161:316921(19760) ack 1 win 119
      20:11:24.484337 IP B > A: . ack 263721 win 1117
      20:11:24.485086 IP B > A: . ack 265241 win 1117
      20:11:24.485925 IP B > A: . ack 266761 win 1117
      20:11:24.486759 IP B > A: . ack 268281 win 1117
      20:11:24.487594 IP B > A: . ack 269801 win 1117
      20:11:24.488430 IP B > A: . ack 271321 win 1117
      20:11:24.489267 IP B > A: . ack 272841 win 1117
      20:11:24.490104 IP B > A: . ack 274361 win 1117
      20:11:24.490939 IP B > A: . ack 275881 win 1117
      20:11:24.491775 IP B > A: . ack 277401 win 1117
      20:11:24.491784 IP A > B: . 316921:332881(15960) ack 1 win 119
      20:11:24.492620 IP B > A: . ack 278921 win 1117
      20:11:24.493448 IP B > A: . ack 280441 win 1117
      20:11:24.494286 IP B > A: . ack 281961 win 1117
      20:11:24.495122 IP B > A: . ack 283481 win 1117
      20:11:24.495958 IP B > A: . ack 285001 win 1117
      20:11:24.496791 IP B > A: . ack 286521 win 1117
      20:11:24.497628 IP B > A: . ack 288041 win 1117
      20:11:24.498459 IP B > A: . ack 289561 win 1117
      20:11:24.499296 IP B > A: . ack 291081 win 1117
      20:11:24.500133 IP B > A: . ack 292601 win 1117
      20:11:24.500970 IP B > A: . ack 294121 win 1117
      20:11:24.501388 IP B > A: . ack 295641 win 1117
      20:11:24.501398 IP A > B: . 332881:351881(19000) ack 1 win 119
      
      While the expected behavior is more like :
      
      20:19:49.259620 IP A > B: . 197601:202161(4560) ack 1 win 119
      20:19:49.260446 IP B > A: . ack 154281 win 1212
      20:19:49.261282 IP B > A: . ack 155801 win 1212
      20:19:49.262125 IP B > A: . ack 157321 win 1212
      20:19:49.262136 IP A > B: . 202161:206721(4560) ack 1 win 119
      20:19:49.262958 IP B > A: . ack 158841 win 1212
      20:19:49.263795 IP B > A: . ack 160361 win 1212
      20:19:49.264628 IP B > A: . ack 161881 win 1212
      20:19:49.264637 IP A > B: . 206721:211281(4560) ack 1 win 119
      20:19:49.265465 IP B > A: . ack 163401 win 1212
      20:19:49.265886 IP B > A: . ack 164921 win 1212
      20:19:49.266722 IP B > A: . ack 166441 win 1212
      20:19:49.266732 IP A > B: . 211281:215841(4560) ack 1 win 119
      20:19:49.267559 IP B > A: . ack 167961 win 1212
      20:19:49.268394 IP B > A: . ack 169481 win 1212
      20:19:49.269232 IP B > A: . ack 171001 win 1212
      20:19:49.269241 IP A > B: . 215841:221161(5320) ack 1 win 119
      Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
      Cc: Yuchung Cheng <ycheng@google.com>
      Cc: Van Jacobson <vanj@google.com>
      Cc: Neal Cardwell <ncardwell@google.com>
      Cc: Nandita Dukkipati <nanditad@google.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      0d99f344
    • Eric Dumazet's avatar
      tcp: fix MSG_SENDPAGE_NOTLAST logic · ef32163f
      Eric Dumazet authored
      [ Upstream commit ae62ca7b ]
      
      commit 35f9c09f (tcp: tcp_sendpages() should call tcp_push() once)
      added an internal flag : MSG_SENDPAGE_NOTLAST meant to be set on all
      frags but the last one for a splice() call.
      
      The condition used to set the flag in pipe_to_sendpage() relied on
      splice() user passing the exact number of bytes present in the pipe,
      or a smaller one.
      
      But some programs pass an arbitrary high value, and the test fails.
      
      The effect of this bug is a lack of tcp_push() at the end of a
      splice(pipe -> socket) call, and possibly very slow or erratic TCP
      sessions.
      
      We should both test sd->total_len and fact that another fragment
      is in the pipe (pipe->nrbufs > 1)
      
      Many thanks to Willy for providing very clear bug report, bisection
      and test programs.
      Reported-by: default avatarWilly Tarreau <w@1wt.eu>
      Bisected-by: default avatarWilly Tarreau <w@1wt.eu>
      Tested-by: default avatarWilly Tarreau <w@1wt.eu>
      Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      ef32163f
    • Eric Dumazet's avatar
      tcp: allow splice() to build full TSO packets · 2ed8840b
      Eric Dumazet authored
      [ This combines upstream commit
        2f533844 and the follow-on bug fix
        commit 35f9c09f ]
      
      vmsplice()/splice(pipe, socket) call do_tcp_sendpages() one page at a
      time, adding at most 4096 bytes to an skb. (assuming PAGE_SIZE=4096)
      
      The call to tcp_push() at the end of do_tcp_sendpages() forces an
      immediate xmit when pipe is not already filled, and tso_fragment() try
      to split these skb to MSS multiples.
      
      4096 bytes are usually split in a skb with 2 MSS, and a remaining
      sub-mss skb (assuming MTU=1500)
      
      This makes slow start suboptimal because many small frames are sent to
      qdisc/driver layers instead of big ones (constrained by cwnd and packets
      in flight of course)
      
      In fact, applications using sendmsg() (adding an additional memory copy)
      instead of vmsplice()/splice()/sendfile() are a bit faster because of
      this anomaly, especially if serving small files in environments with
      large initial [c]wnd.
      
      Call tcp_push() only if MSG_MORE is not set in the flags parameter.
      
      This bit is automatically provided by splice() internals but for the
      last page, or on all pages if user specified SPLICE_F_MORE splice()
      flag.
      
      In some workloads, this can reduce number of sent logical packets by an
      order of magnitude, making zero-copy TCP actually faster than
      one-copy :)
      Reported-by: default avatarTom Herbert <therbert@google.com>
      Cc: Nandita Dukkipati <nanditad@google.com>
      Cc: Neal Cardwell <ncardwell@google.com>
      Cc: Tom Herbert <therbert@google.com>
      Cc: Yuchung Cheng <ycheng@google.com>
      Cc: H.K. Jerry Chu <hkchu@google.com>
      Cc: Maciej Żenczykowski <maze@google.com>
      Cc: Mahesh Bandewar <maheshb@google.com>
      Cc: Ilpo Järvinen <ilpo.jarvinen@helsinki.fi>
      Signed-off-by: default avatarEric Dumazet <eric.dumazet@gmail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      2ed8840b
    • Eric Dumazet's avatar
      inet: add RCU protection to inet->opt · b8710128
      Eric Dumazet authored
      commit f6d8bd05 upstream.
      
      We lack proper synchronization to manipulate inet->opt ip_options
      
      Problem is ip_make_skb() calls ip_setup_cork() and
      ip_setup_cork() possibly makes a copy of ipc->opt (struct ip_options),
      without any protection against another thread manipulating inet->opt.
      
      Another thread can change inet->opt pointer and free old one under us.
      
      Use RCU to protect inet->opt (changed to inet->inet_opt).
      
      Instead of handling atomic refcounts, just copy ip_options when
      necessary, to avoid cache line dirtying.
      
      We cant insert an rcu_head in struct ip_options since its included in
      skb->cb[], so this patch is large because I had to introduce a new
      ip_options_rcu structure.
      Signed-off-by: default avatarEric Dumazet <eric.dumazet@gmail.com>
      Cc: Herbert Xu <herbert@gondor.apana.org.au>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      [dannf/bwh: backported to Debian's 2.6.32]
      Signed-off-by: default avatarBen Hutchings <ben@decadent.org.uk>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      b8710128
    • Mathias Krause's avatar
      net: fix info leak in compat dev_ifconf() · 66096e89
      Mathias Krause authored
      commit 43da5f2e upstream.
      
      The implementation of dev_ifconf() for the compat ioctl interface uses
      an intermediate ifc structure allocated in userland for the duration of
      the syscall. Though, it fails to initialize the padding bytes inserted
      for alignment and that for leaks four bytes of kernel stack. Add an
      explicit memset(0) before filling the structure to avoid the info leak.
      Signed-off-by: default avatarMathias Krause <minipli@googlemail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      [bwh: Backported to 2.6.32: adjust filename, context]
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      66096e89
    • Eric Dumazet's avatar
      net: guard tcp_set_keepalive() to tcp sockets · f2e244fa
      Eric Dumazet authored
      [ Upstream commit 3e10986d ]
      
      Its possible to use RAW sockets to get a crash in
      tcp_set_keepalive() / sk_reset_timer()
      
      Fix is to make sure socket is a SOCK_STREAM one.
      Reported-by: default avatarDave Jones <davej@redhat.com>
      Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      f2e244fa
    • Jesper Dangaard Brouer's avatar
      net: fix divide by zero in tcp algorithm illinois · 81821315
      Jesper Dangaard Brouer authored
      commit 8f363b77 upstream
      
      Reading TCP stats when using TCP Illinois congestion control algorithm
      can cause a divide by zero kernel oops.
      
      The division by zero occur in tcp_illinois_info() at:
       do_div(t, ca->cnt_rtt);
      where ca->cnt_rtt can become zero (when rtt_reset is called)
      
      Steps to Reproduce:
       1. Register tcp_illinois:
           # sysctl -w net.ipv4.tcp_congestion_control=illinois
       2. Monitor internal TCP information via command "ss -i"
           # watch -d ss -i
       3. Establish new TCP conn to machine
      
      Either it fails at the initial conn, or else it needs to wait
      for a loss or a reset.
      
      This is only related to reading stats.  The function avg_delay() also
      performs the same divide, but is guarded with a (ca->cnt_rtt > 0) at its
      calling point in update_params().  Thus, simply fix tcp_illinois_info().
      
      Function tcp_illinois_info() / get_info() is called without
      socket lock.  Thus, eliminate any race condition on ca->cnt_rtt
      by using a local stack variable.  Simply reuse info.tcpv_rttcnt,
      as its already set to ca->cnt_rtt.
      Function avg_delay() is not affected by this race condition, as
      its called with the socket lock.
      
      Cc: Petr Matousek <pmatouse@redhat.com>
      Signed-off-by: default avatarJesper Dangaard Brouer <brouer@redhat.com>
      Acked-by: default avatarEric Dumazet <edumazet@google.com>
      Acked-by: default avatarStephen Hemminger <shemminger@vyatta.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      81821315
    • Cong Wang's avatar
      net: prevent setting ttl=0 via IP_TTL · 495ba7ee
      Cong Wang authored
      [ Upstream commit c9be4a5c ]
      
      A regression is introduced by the following commit:
      
      	commit 4d52cfbe
      	Author: Eric Dumazet <eric.dumazet@gmail.com>
      	Date:   Tue Jun 2 00:42:16 2009 -0700
      
      	    net: ipv4/ip_sockglue.c cleanups
      
      	    Pure cleanups
      
      but it is not a pure cleanup...
      
      	-               if (val != -1 && (val < 1 || val>255))
      	+               if (val != -1 && (val < 0 || val > 255))
      
      Since there is no reason provided to allow ttl=0, change it back.
      Reported-by: default avatarnitin padalia <padalia.nitin@gmail.com>
      Cc: nitin padalia <padalia.nitin@gmail.com>
      Cc: Eric Dumazet <eric.dumazet@gmail.com>
      Cc: David S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarCong Wang <xiyou.wangcong@gmail.com>
      Acked-by: default avatarEric Dumazet <edumazet@google.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      495ba7ee
    • Stefan Hasko's avatar
      net: sched: integer overflow fix · 6e9a4199
      Stefan Hasko authored
      [ Upstream commit d2fe85da ]
      
      Fixed integer overflow in function htb_dequeue
      Signed-off-by: default avatarStefan Hasko <hasko.stevo@gmail.com>
      Acked-by: default avatarEric Dumazet <edumazet@google.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      6e9a4199
    • Hiroaki SHIMODA's avatar
      net_sched: gact: Fix potential panic in tcf_gact(). · 0b42ca0d
      Hiroaki SHIMODA authored
      [ Upstream commit 696ecdc1 ]
      
      gact_rand array is accessed by gact->tcfg_ptype whose value
      is assumed to less than MAX_RAND, but any range checks are
      not performed.
      
      So add a check in tcf_gact_init(). And in tcf_gact(), we can
      reduce a branch.
      Signed-off-by: default avatarHiroaki SHIMODA <shimoda.hiroaki@gmail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      0b42ca0d
    • Benjamin LaHaise's avatar
      ipv4: check rt_genid in dst_check · 32ed4a99
      Benjamin LaHaise authored
      commit d11a4dc1
      Author: Timo Teräs <timo.teras@iki.fi>
      Date:   Thu Mar 18 23:20:20 2010 +0000
      
          ipv4: check rt_genid in dst_check
      
          Xfrm_dst keeps a reference to ipv4 rtable entries on each
          cached bundle. The only way to renew xfrm_dst when the underlying
          route has changed, is to implement dst_check for this. This is
          what ipv6 side does too.
      
          The problems started after 87c1e12b
          ("ipsec: Fix bogus bundle flowi") which fixed a bug causing xfrm_dst
          to not get reused, until that all lookups always generated new
          xfrm_dst with new route reference and path mtu worked. But after the
          fix, the old routes started to get reused even after they were expired
          causing pmtu to break (well it would occationally work if the rtable
          gc had run recently and marked the route obsolete causing dst_check to
          get called).
      Signed-off-by: default avatarTimo Teras <timo.teras@iki.fi>
      Acked-by: default avatarHerbert Xu <herbert@gondor.apana.org.au>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      
      This commit is based on the above, with the addition of verifying blackhole
      routes in the same manner.
      
      Fixing the issue with blackhole routes as it was accomplished in mainline
      would require pulling in a lot more code, and people were not interested
      in pulling in all of the dependencies given the much higher risk of trying
      to select the right subset of changes to include.  The addition of the
      single line of "dst->obsolete = -1;" in ipv4_dst_blackhole() was much
      easier to verify, and is in the spirit of the patch in question.
      This is the minimal set of changes to fix the bug in question.
      
      A test case is available here :
        http://marc.info/?l=linux-netdev&m=135015076708950&w=2Signed-off-by: default avatarBenjamin LaHaise <bcrl@kvack.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      32ed4a99
    • Hillf Danton's avatar
      bonding: Fix slave selection bug. · 8500db68
      Hillf Danton authored
      The returned slave is incorrect, if the net device under check is not
      charged yet by the master.
      Signed-off-by: default avatarHillf Danton <dhillf@gmail.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      (cherry picked from commit af3e5bd5)
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      8500db68
    • Stephen Hemminger's avatar
      bridge: set priority of STP packets · 48bff07f
      Stephen Hemminger authored
      Spanning Tree Protocol packets should have always been marked as
      control packets, this causes them to get queued in the high prirority
      FIFO. As Radia Perlman mentioned in her LCA talk, STP dies if bridge
      gets overloaded and can't communicate. This is a long-standing bug back
      to the first versions of Linux bridge.
      Signed-off-by: default avatarStephen Hemminger <stephen@networkplumber.org>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      (cherry picked from commit 547b4e71)
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      48bff07f
    • danborkmann@iogearbox.net's avatar
      af_packet: remove BUG statement in tpacket_destruct_skb · 6f96a94d
      danborkmann@iogearbox.net authored
      [ Upstream commit 7f5c3e3a ]
      
      Here's a quote of the comment about the BUG macro from asm-generic/bug.h:
      
       Don't use BUG() or BUG_ON() unless there's really no way out; one
       example might be detecting data structure corruption in the middle
       of an operation that can't be backed out of.  If the (sub)system
       can somehow continue operating, perhaps with reduced functionality,
       it's probably not BUG-worthy.
      
       If you're tempted to BUG(), think again:  is completely giving up
       really the *only* solution?  There are usually better options, where
       users don't need to reboot ASAP and can mostly shut down cleanly.
      
      In our case, the status flag of a ring buffer slot is managed from both sides,
      the kernel space and the user space. This means that even though the kernel
      side might work as expected, the user space screws up and changes this flag
      right between the send(2) is triggered when the flag is changed to
      TP_STATUS_SENDING and a given skb is destructed after some time. Then, this
      will hit the BUG macro. As David suggested, the best solution is to simply
      remove this statement since it cannot be used for kernel side internal
      consistency checks. I've tested it and the system still behaves /stable/ in
      this case, so in accordance with the above comment, we should rather remove it.
      Signed-off-by: default avatarDaniel Borkmann <daniel.borkmann@tik.ee.ethz.ch>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      6f96a94d
    • Eric Dumazet's avatar
      softirq: reduce latencies · 3003ed64
      Eric Dumazet authored
      In various network workloads, __do_softirq() latencies can be up
      to 20 ms if HZ=1000, and 200 ms if HZ=100.
      
      This is because we iterate 10 times in the softirq dispatcher,
      and some actions can consume a lot of cycles.
      
      This patch changes the fallback to ksoftirqd condition to :
      
      - A time limit of 2 ms.
      - need_resched() being set on current task
      
      When one of this condition is met, we wakeup ksoftirqd for further
      softirq processing if we still have pending softirqs.
      
      Using need_resched() as the only condition can trigger RCU stalls,
      as we can keep BH disabled for too long.
      
      I ran several benchmarks and got no significant difference in
      throughput, but a very significant reduction of latencies (one order
      of magnitude) :
      
      In following bench, 200 antagonist "netperf -t TCP_RR" are started in
      background, using all available cpus.
      
      Then we start one "netperf -t TCP_RR", bound to the cpu handling the NIC
      IRQ (hard+soft)
      
      Before patch :
      
      RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY
      MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET
      to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind
      RT_LATENCY=550110.424
      MIN_LATENCY=146858
      MAX_LATENCY=997109
      P50_LATENCY=305000
      P90_LATENCY=550000
      P99_LATENCY=710000
      MEAN_LATENCY=376989.12
      STDDEV_LATENCY=184046.92
      
      After patch :
      
      RT_LATENCY,MIN_LATENCY,MAX_LATENCY,P50_LATENCY,P90_LATENCY,P99_LATENCY,MEAN_LATENCY,STDDEV_LATENCY
      MIGRATED TCP REQUEST/RESPONSE TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET
      to 7.7.7.84 () port 0 AF_INET : first burst 0 : cpu bind
      RT_LATENCY=40545.492
      MIN_LATENCY=9834
      MAX_LATENCY=78366
      P50_LATENCY=33583
      P90_LATENCY=59000
      P99_LATENCY=69000
      MEAN_LATENCY=38364.67
      STDDEV_LATENCY=12865.26
      Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
      Cc: David Miller <davem@davemloft.net>
      Cc: Tom Herbert <therbert@google.com>
      Cc: Ben Hutchings <bhutchings@solarflare.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      (cherry picked from commit c10d7367)
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      3003ed64
    • Eric Dumazet's avatar
      net: reduce net_rx_action() latency to 2 HZ · e072f1c6
      Eric Dumazet authored
      We should use time_after_eq() to get maximum latency of two ticks,
      instead of three.
      
      Bug added in commit 24f8b238 (net: increase receive packet quantum)
      Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      (cherry picked from commit d1f41b67)
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      e072f1c6
    • Alexey Khoroshilov's avatar
      net/core: Fix potential memory leak in dev_set_alias() · b680135d
      Alexey Khoroshilov authored
      [ Upstream commit 7364e445 ]
      
      Do not leak memory by updating pointer with potentially NULL realloc return value.
      
      Found by Linux Driver Verification project (linuxtesting.org).
      Signed-off-by: default avatarAlexey Khoroshilov <khoroshilov@ispras.ru>
      Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      b680135d
    • J. Bruce Fields's avatar
      nfsd4: fix oops on unusual readlike compound · be1a41d7
      J. Bruce Fields authored
      commit d5f50b0c upstream.
      
      If the argument and reply together exceed the maximum payload size, then
      a reply with a read-like operation can overlow the rq_pages array.
      Signed-off-by: default avatarJ. Bruce Fields <bfields@redhat.com>
      Signed-off-by: default avatarGreg Kroah-Hartman <gregkh@linuxfoundation.org>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      be1a41d7
    • Trond Myklebust's avatar
      kernel panic when mount NFSv4 · fc9e795a
      Trond Myklebust authored
      On Tue, 2010-12-14 at 16:58 +0800, Mi Jinlong wrote:
      > Hi,
      >
      > When testing NFSv4 at RHEL6 with kernel 2.6.32, I got a kernel panic
      > at NFS client's __rpc_create_common function.
      >
      > The panic place is:
      >   rpc_mkpipe
      >     __rpc_lookup_create()          <=== find pipefile *idmap*
      >     __rpc_mkpipe()                 <=== pipefile is *idmap*
      >       __rpc_create_common()
      >        ******  BUG_ON(!d_unhashed(dentry)); ******    *panic*
      >
      > It means that the dentry's d_flags have be set DCACHE_UNHASHED,
      > but it should not be set here.
      >
      > Is someone known this bug? or give me some idea?
      >
      > A reproduce program is append, but it can't reproduce the bug every time.
      > the export is: "/nfsroot       *(rw,no_root_squash,fsid=0,insecure)"
      >
      > And the panic message is append.
      >
      > ============================================================================
      > #!/bin/sh
      >
      > LOOPTOTAL=768
      > LOOPCOUNT=0
      > ret=0
      >
      > while [ $LOOPCOUNT -ne $LOOPTOTAL ]
      > do
      > 	((LOOPCOUNT += 1))
      > 	service nfs restart
      > 	/usr/sbin/rpc.idmapd
      > 	mount -t nfs4 127.0.0.1:/ /mnt|| return 1;
      > 	ls -l /var/lib/nfs/rpc_pipefs/nfs/*/
      > 	umount /mnt
      > 	echo $LOOPCOUNT
      > done
      >
      > ===============================================================================
      > Code: af 60 01 00 00 89 fa 89 f0 e8 64 cf 89 f0 e8 5c 7c 64 cf 31 c0 8b 5c 24 10 8b
      > 74 24 14 8b 7c 24 18 8b 6c 24 1c 83 c4 20 c3 <0f> 0b eb fc 8b 46 28 c7 44 24 08 20
      > de ee f0 c7 44 24 04 56 ea
      > EIP:[<f0ee92ea>] __rpc_create_common+0x8a/0xc0 [sunrpc] SS:ESP 0068:eccb5d28
      > ---[ end trace 8f5606cd08928ed2]---
      > Kernel panic - not syncing: Fatal exception
      > Pid:7131, comm: mount.nfs4 Tainted: G     D   -------------------2.6.32 #1
      > Call Trace:
      >  [<c080ad18>] ? panic+0x42/0xed
      >  [<c080e42c>] ? oops_end+0xbc/0xd0
      >  [<c040b090>] ? do_invalid_op+0x0/0x90
      >  [<c040b10f>] ? do_invalid_op+0x7f/0x90
      >  [<f0ee92ea>] ? __rpc_create_common+0x8a/0xc0[sunrpc]
      >  [<f0edc433>] ? rpc_free_task+0x33/0x70[sunrpc]
      >  [<f0ed6508>] ? prc_call_sync+0x48/0x60[sunrpc]
      >  [<f0ed656e>] ? rpc_ping+0x4e/0x60[sunrpc]
      >  [<f0ed6eaf>] ? rpc_create+0x38f/0x4f0[sunrpc]
      >  [<c080d80b>] ? error_code+0x73/0x78
      >  [<f0ee92ea>] ? __rpc_create_common+0x8a/0xc0[sunrpc]
      >  [<c0532bda>] ? d_lookup+0x2a/0x40
      >  [<f0ee94b1>] ? rpc_mkpipe+0x111/0x1b0[sunrpc]
      >  [<f10a59f4>] ? nfs_create_rpc_client+0xb4/0xf0[nfs]
      >  [<f10d6c6d>] ? nfs_fscache_get_client_cookie+0x1d/0x50[nfs]
      >  [<f10d3fcb>] ? nfs_idmap_new+0x7b/0x140[nfs]
      >  [<c05e76aa>] ? strlcpy+0x3a/0x60
      >  [<f10a60ca>] ? nfs4_set_client+0xea/0x2b0[nfs]
      >  [<f10a6d0c>] ? nfs4_create_server+0xac/0x1b0[nfs]
      >  [<c04f1400>] ? krealloc+0x40/0x50
      >  [<f10b0e8b>] ? nfs4_remote_get_sb+0x6b/0x250[nfs]
      >  [<c04f14ec>] ? kstrdup+0x3c/0x60
      >  [<c0520739>] ? vfs_kern_mount+0x69/0x170
      >  [<f10b1a3c>] ? nfs_do_root_mount+0x6c/0xa0[nfs]
      >  [<f10b1b47>] ? nfs4_try_mount+0x37/0xa0[nfs]
      >  [<f10afe6d>] ? nfs4_validate_text_mount_data+-x7d/0xf0[nfs]
      >  [<f10b1c42>] ? nfs4_get_sb+0x92/0x2f0
      >  [<c0520739>] ? vfs_kern_mount+0x69/0x170
      >  [<c05366d2>] ? get_fs_type+0x32/0xb0
      >  [<c052089f>] ? do_kern_mount+0x3f/0xe0
      >  [<c053954f>] ? do_mount+0x2ef/0x740
      >  [<c0537740>] ? copy_mount_options+0xb0/0x120
      >  [<c0539a0e>] ? sys_mount+0x6e/0xa0
      
      Hi,
      
      Does the following patch fix the problem?
      
      Cheers
        Trond
      
      --------------------------
      SUNRPC: Fix a BUG in __rpc_create_common
      
      From: Trond Myklebust <Trond.Myklebust@netapp.com>
      
      Mi Jinlong reports:
      
      When testing NFSv4 at RHEL6 with kernel 2.6.32, I got a kernel panic
      at NFS client's __rpc_create_common function.
      
      The panic place is:
        rpc_mkpipe
            __rpc_lookup_create()          <=== find pipefile *idmap*
            __rpc_mkpipe()                 <=== pipefile is *idmap*
              __rpc_create_common()
               ******  BUG_ON(!d_unhashed(dentry)); ****** *panic*
      
      The test is wrong: we can find ourselves with a hashed negative dentry here
      if the idmapper tried to look up the file before we got round to creating
      it.
      
      Just replace the BUG_ON() with a d_drop(dentry).
      
      [2.6.32 background info from Jonathan below]
      > Hi Willy et al,
      >
      > Please consider
      >
      >   beb0f0a9 kernel panic when mount NFSv4, 2010-12-20
      >
      > for application to kernel.org's 2.6.32.y and 2.6.34.y trees.  The
      > patch was applied upstream during the 2.6.38 merge window, so newer
      > kernels don't need it.
      >
      > (Context: <http://bugs.debian.org/695872>.)  Tom Downes (cc-ed)
      > experienced the bug on a Debian kernel close to 2.6.32.58 and
      > confirmed that the patch doesn't seem to hurt.
      >
      > The patch is part of Fedora 13's 2.6.34-based and Fedora 14's
      > 2.6.35-based kernels[1].  It was also included in the RHEL kernel at
      > some point between 2.6.32-71.29.1.el6 and 2.6.32-131.0.15.el6[2].
      >
      > Thoughts of all kinds welcome, as always.
      >
      > Regards,
      > Jonathan
      >
      > [1] https://bugzilla.redhat.com/673207
      > [2] https://oss.oracle.com/git/?p=redpatch.git;a=commit;h=8028cccdc4b1Reported-by: default avatarMi Jinlong <mijinlong@cn.fujitsu.com>
      Signed-off-by: default avatarTrond Myklebust <Trond.Myklebust@netapp.com>
      (cherry picked from commit beb0f0a9)
      Cc: Jonathan Nieder <jrnieder@gmail.com>
      Signed-off-by: default avatarWilly Tarreau <w@1wt.eu>
      fc9e795a