Commit 06021292 authored by Eliezer Tamir, committed by David S. Miller

net: add low latency socket poll

Adds an ndo_ll_poll method and the code that supports it.
This method can be used by low latency applications to busy-poll
Ethernet device queues directly from the socket code.
sysctl_net_ll_poll controls how many microseconds to poll.
Default is zero (disabled).
Individual protocol support will be added by subsequent patches.
Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Signed-off-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
Signed-off-by: Eliezer Tamir <eliezer.tamir@linux.intel.com>
Acked-by: Eric Dumazet <edumazet@google.com>
Tested-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent af12fa6e
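The driver side of this interface lands in separate patches. As a rough sketch only (not from this series; the mydrv_* queue type and helpers are hypothetical), an Ethernet driver would implement the new ndo along these lines:

	/* Hypothetical sketch of a driver's ndo_ll_poll handler.
	 * mydrv_try_lock_napi()/mydrv_unlock_napi() and
	 * mydrv_clean_rx_irq() are assumed driver internals.
	 */
	static int mydrv_ll_poll(struct napi_struct *napi)
	{
		struct mydrv_queue *q = container_of(napi, struct mydrv_queue, napi);
		int cleaned;

		/* back off if the normal NAPI softirq path owns the queue */
		if (!mydrv_try_lock_napi(q))
			return LL_FLUSH_BUSY;

		/* flush pending RX descriptors straight into the stack;
		 * the RX routine tags each skb with skb_mark_ll(skb, napi)
		 * so the receiving socket can find this queue again
		 */
		cleaned = mydrv_clean_rx_irq(q, 4);

		mydrv_unlock_napi(q);
		return cleaned;
	}

The handler's return value feeds the accounting in sk_poll_ll() below: a positive count of packets flushed, or one of the LL_FLUSH_* codes.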
--- a/Documentation/sysctl/net.txt
+++ b/Documentation/sysctl/net.txt
@@ -50,6 +50,13 @@ The maximum number of packets that kernel can handle on a NAPI interrupt,
 it's a Per-CPU variable.
 Default: 64
 
+low_latency_poll
+----------------
+Low latency busy poll timeout. (needs CONFIG_NET_LL_RX_POLL)
+Approximate time in us to spin waiting for packets on the device queue.
+Recommended value is 50. May increase power usage.
+Default: 0 (off)
+
 rmem_default
 ------------
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -971,6 +971,9 @@ struct net_device_ops {
 						     struct netpoll_info *info,
 						     gfp_t gfp);
 	void			(*ndo_netpoll_cleanup)(struct net_device *dev);
+#endif
+#ifdef CONFIG_NET_LL_RX_POLL
+	int			(*ndo_ll_poll)(struct napi_struct *dev);
 #endif
 	int			(*ndo_set_vf_mac)(struct net_device *dev,
 						  int queue, u8 *mac);
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -386,6 +386,7 @@ typedef unsigned char *sk_buff_data_t;
 *	@no_fcs: Request NIC to treat last 4 bytes as Ethernet FCS
 *	@dma_cookie: a cookie to one of several possible DMA operations
 *		done by skb DMA functions
+ *	@napi_id: id of the NAPI struct this skb came from
 *	@secmark: security marking
 *	@mark: Generic packet mark
 *	@dropcount: total number of sk_receive_queue overflows
@@ -500,8 +501,11 @@ struct sk_buff {
 	/* 7/9 bit hole (depending on ndisc_nodetype presence) */
 	kmemcheck_bitfield_end(flags2);
 
-#ifdef CONFIG_NET_DMA
-	dma_cookie_t		dma_cookie;
+#if defined CONFIG_NET_DMA || defined CONFIG_NET_LL_RX_POLL
+	union {
+		unsigned int	napi_id;
+		dma_cookie_t	dma_cookie;
+	};
 #endif
 #ifdef CONFIG_NETWORK_SECMARK
 	__u32			secmark;
--- /dev/null
+++ b/include/net/ll_poll.h
+/*
+ * Low Latency Sockets
+ * Copyright(c) 2013 Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Author: Eliezer Tamir
+ *
+ * Contact Information:
+ * e1000-devel Mailing List <e1000-devel@lists.sourceforge.net>
+ */
+
+/*
+ * For now this depends on CONFIG_X86_TSC
+ */
+
+#ifndef _LINUX_NET_LL_POLL_H
+#define _LINUX_NET_LL_POLL_H
+
+#include <linux/netdevice.h>
+#include <net/ip.h>
+
+#ifdef CONFIG_NET_LL_RX_POLL
+
+struct napi_struct;
+extern unsigned long sysctl_net_ll_poll __read_mostly;
+
+/* return values from ndo_ll_poll */
+#define LL_FLUSH_FAILED		-1
+#define LL_FLUSH_BUSY		-2
+
+/* we don't mind a ~2.5% imprecision (tsc_khz >> 10 divides by 1024, not 1000) */
+#define TSC_MHZ (tsc_khz >> 10)
+
+static inline cycles_t ll_end_time(void)
+{
+	return TSC_MHZ * ACCESS_ONCE(sysctl_net_ll_poll) + get_cycles();
+}
+
+static inline bool sk_valid_ll(struct sock *sk)
+{
+	return sysctl_net_ll_poll && sk->sk_napi_id &&
+	       !need_resched() && !signal_pending(current);
+}
+
+static inline bool can_poll_ll(cycles_t end_time)
+{
+	return !time_after((unsigned long)get_cycles(),
+			   (unsigned long)end_time);
+}
+
+static inline bool sk_poll_ll(struct sock *sk, int nonblock)
+{
+	cycles_t end_time = ll_end_time();
+	const struct net_device_ops *ops;
+	struct napi_struct *napi;
+	int rc = false;
+
+	/*
+	 * rcu read lock for napi hash
+	 * bh so we don't race with net_rx_action
+	 */
+	rcu_read_lock_bh();
+
+	napi = napi_by_id(sk->sk_napi_id);
+	if (!napi)
+		goto out;
+
+	ops = napi->dev->netdev_ops;
+	if (!ops->ndo_ll_poll)
+		goto out;
+
+	do {
+		rc = ops->ndo_ll_poll(napi);
+
+		if (rc == LL_FLUSH_FAILED)
+			break; /* permanent failure */
+
+		if (rc > 0)
+			/* local bh is disabled, so it is ok to use _BH */
+			NET_ADD_STATS_BH(sock_net(sk),
+					 LINUX_MIB_LOWLATENCYRXPACKETS, rc);
+
+	} while (skb_queue_empty(&sk->sk_receive_queue)
+		 && can_poll_ll(end_time) && !nonblock);
+
+	rc = !skb_queue_empty(&sk->sk_receive_queue);
+out:
+	rcu_read_unlock_bh();
+	return rc;
+}
+
+/* used in the NIC receive handler to mark the skb */
+static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
+{
+	skb->napi_id = napi->napi_id;
+}
+
+/* used in the protocol handler to propagate the napi_id to the socket */
+static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
+{
+	sk->sk_napi_id = skb->napi_id;
+}
+
+#else /* CONFIG_NET_LL_RX_POLL */
+
+static inline cycles_t ll_end_time(void)
+{
+	return 0;
+}
+
+static inline bool sk_valid_ll(struct sock *sk)
+{
+	return false;
+}
+
+static inline bool sk_poll_ll(struct sock *sk, int nonblock)
+{
+	return false;
+}
+
+static inline void skb_mark_ll(struct sk_buff *skb, struct napi_struct *napi)
+{
+}
+
+static inline void sk_mark_ll(struct sock *sk, struct sk_buff *skb)
+{
+}
+
+static inline bool can_poll_ll(cycles_t end_time)
+{
+	return false;
+}
+
+#endif /* CONFIG_NET_LL_RX_POLL */
+#endif /* _LINUX_NET_LL_POLL_H */
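As the commit message notes, individual protocol support follows in subsequent patches. Roughly, a protocol receive path would gate on sk_valid_ll() and spin before sleeping; a minimal sketch of the expected call site (e.g. in a recvmsg implementation, just before blocking):

	/* sketch: busy-poll the device queue before going to sleep */
	if (sk_valid_ll(sk) && skb_queue_empty(&sk->sk_receive_queue))
		sk_poll_ll(sk, nonblock);

where nonblock reflects MSG_DONTWAIT; sk_poll_ll() returns once data arrives in the receive queue, the sysctl budget computed by ll_end_time() expires, or the driver reports LL_FLUSH_FAILED.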
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -229,6 +229,7 @@ struct cg_proto;
 *	@sk_omem_alloc: "o" is "option" or "other"
 *	@sk_wmem_queued: persistent queue size
 *	@sk_forward_alloc: space allocated forward
+ *	@sk_napi_id: id of the last napi context to receive data for sk
 *	@sk_allocation: allocation mode
 *	@sk_sndbuf: size of send buffer in bytes
 *	@sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
@@ -324,6 +325,9 @@ struct sock {
 	int			sk_forward_alloc;
 #ifdef CONFIG_RPS
 	__u32			sk_rxhash;
+#endif
+#ifdef CONFIG_NET_LL_RX_POLL
+	unsigned int		sk_napi_id;
 #endif
 	atomic_t		sk_drops;
 	int			sk_rcvbuf;
--- a/include/uapi/linux/snmp.h
+++ b/include/uapi/linux/snmp.h
@@ -253,6 +253,7 @@ enum
 	LINUX_MIB_TCPFASTOPENLISTENOVERFLOW,	/* TCPFastOpenListenOverflow */
 	LINUX_MIB_TCPFASTOPENCOOKIEREQD,	/* TCPFastOpenCookieReqd */
 	LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES,	/* TCPSpuriousRtxHostQueues */
+	LINUX_MIB_LOWLATENCYRXPACKETS,		/* LowLatencyRxPackets */
 	__LINUX_MIB_MAX
 };
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -243,6 +243,18 @@ config NETPRIO_CGROUP
 	  Cgroup subsystem for use in assigning processes to network priorities on
 	  a per-interface basis
 
+config NET_LL_RX_POLL
+	bool "Low Latency Receive Poll"
+	depends on X86_TSC
+	default n
+	---help---
+	  Support Low Latency Receive Queue Poll.
+	  (For network card drivers which support this option.)
+	  When waiting for data in read or poll calls, the socket code calls
+	  directly into the device driver to flush packets which may be
+	  pending on the device queues into the stack.
+
+	  If unsure, say N.
+
 config BQL
 	boolean
 	depends on SYSFS
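To actually use the feature, one would build the kernel with the new option enabled and raise the sysctl from its default of 0 at runtime; for example, using the value recommended by the documentation above:

	CONFIG_NET_LL_RX_POLL=y
	# then, at runtime:
	sysctl -w net.core.low_latency_poll=50

The net.core.low_latency_poll name follows from the sysctl table entry registered in the sysctl_net_core.c hunk below.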
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -733,6 +733,10 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
 	new->vlan_tci		= old->vlan_tci;
 
 	skb_copy_secmark(new, old);
+
+#ifdef CONFIG_NET_LL_RX_POLL
+	new->napi_id	= old->napi_id;
+#endif
 }
 
 /*
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -139,6 +139,8 @@
 #include <net/tcp.h>
 #endif
 
+#include <net/ll_poll.h>
+
 static DEFINE_MUTEX(proto_list_mutex);
 static LIST_HEAD(proto_list);
@@ -2284,6 +2286,10 @@ void sock_init_data(struct socket *sock, struct sock *sk)
 	sk->sk_stamp = ktime_set(-1L, 0);
 
+#ifdef CONFIG_NET_LL_RX_POLL
+	sk->sk_napi_id	= 0;
+#endif
+
 	/*
 	 * Before updating sk_refcnt, we must commit prior changes to memory
 	 * (Documentation/RCU/rculist_nulls.txt for details)
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -19,6 +19,7 @@
 #include <net/ip.h>
 #include <net/sock.h>
 #include <net/net_ratelimit.h>
+#include <net/ll_poll.h>
 
 static int one = 1;
@@ -284,6 +285,15 @@ static struct ctl_table net_core_table[] = {
 		.proc_handler	= flow_limit_table_len_sysctl
 	},
 #endif /* CONFIG_NET_FLOW_LIMIT */
+#ifdef CONFIG_NET_LL_RX_POLL
+	{
+		.procname	= "low_latency_poll",
+		.data		= &sysctl_net_ll_poll,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644,
+		.proc_handler	= proc_doulongvec_minmax
+	},
+#endif
 #endif /* CONFIG_NET */
 	{
 		.procname	= "netdev_budget",
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -273,6 +273,7 @@ static const struct snmp_mib snmp4_net_list[] = {
 	SNMP_MIB_ITEM("TCPFastOpenListenOverflow", LINUX_MIB_TCPFASTOPENLISTENOVERFLOW),
 	SNMP_MIB_ITEM("TCPFastOpenCookieReqd", LINUX_MIB_TCPFASTOPENCOOKIEREQD),
 	SNMP_MIB_ITEM("TCPSpuriousRtxHostQueues", LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES),
+	SNMP_MIB_ITEM("LowLatencyRxPackets", LINUX_MIB_LOWLATENCYRXPACKETS),
 	SNMP_MIB_SENTINEL
 };
--- a/net/socket.c
+++ b/net/socket.c
@@ -104,6 +104,12 @@
 #include <linux/route.h>
 #include <linux/sockios.h>
 #include <linux/atalk.h>
+#include <net/ll_poll.h>
+
+#ifdef CONFIG_NET_LL_RX_POLL
+unsigned long sysctl_net_ll_poll __read_mostly;
+EXPORT_SYMBOL_GPL(sysctl_net_ll_poll);
+#endif
 
 static int sock_no_open(struct inode *irrelevant, struct file *dontcare);
 static ssize_t sock_aio_read(struct kiocb *iocb, const struct iovec *iov,