Commit 7c951caf authored by Björn Töpel's avatar Björn Töpel Committed by Daniel Borkmann

net: Add SO_BUSY_POLL_BUDGET socket option

This option lets a user set a per-socket NAPI budget for
busy-polling. If the option is not set, it will use the default of 8.
Signed-off-by: Björn Töpel <bjorn.topel@intel.com>
Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Reviewed-by: Jakub Kicinski <kuba@kernel.org>
Link: https://lore.kernel.org/bpf/20201130185205.196029-3-bjorn.topel@gmail.com
parent 7fd3253a
...@@ -125,6 +125,7 @@ ...@@ -125,6 +125,7 @@
#define SO_DETACH_REUSEPORT_BPF 68 #define SO_DETACH_REUSEPORT_BPF 68
#define SO_PREFER_BUSY_POLL 69 #define SO_PREFER_BUSY_POLL 69
#define SO_BUSY_POLL_BUDGET 70
#if !defined(__KERNEL__) #if !defined(__KERNEL__)
......
...@@ -136,6 +136,7 @@ ...@@ -136,6 +136,7 @@
#define SO_DETACH_REUSEPORT_BPF 68 #define SO_DETACH_REUSEPORT_BPF 68
#define SO_PREFER_BUSY_POLL 69 #define SO_PREFER_BUSY_POLL 69
#define SO_BUSY_POLL_BUDGET 70
#if !defined(__KERNEL__) #if !defined(__KERNEL__)
......
...@@ -117,6 +117,7 @@ ...@@ -117,6 +117,7 @@
#define SO_DETACH_REUSEPORT_BPF 0x4042 #define SO_DETACH_REUSEPORT_BPF 0x4042
#define SO_PREFER_BUSY_POLL 0x4043 #define SO_PREFER_BUSY_POLL 0x4043
#define SO_BUSY_POLL_BUDGET 0x4044
#if !defined(__KERNEL__) #if !defined(__KERNEL__)
......
...@@ -118,6 +118,7 @@ ...@@ -118,6 +118,7 @@
#define SO_DETACH_REUSEPORT_BPF 0x0047 #define SO_DETACH_REUSEPORT_BPF 0x0047
#define SO_PREFER_BUSY_POLL 0x0048 #define SO_PREFER_BUSY_POLL 0x0048
#define SO_BUSY_POLL_BUDGET 0x0049
#if !defined(__KERNEL__) #if !defined(__KERNEL__)
......
...@@ -397,7 +397,8 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock) ...@@ -397,7 +397,8 @@ static void ep_busy_loop(struct eventpoll *ep, int nonblock)
unsigned int napi_id = READ_ONCE(ep->napi_id); unsigned int napi_id = READ_ONCE(ep->napi_id);
if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on()) if ((napi_id >= MIN_NAPI_ID) && net_busy_loop_on())
napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false); napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, ep, false,
BUSY_POLL_BUDGET);
} }
static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep) static inline void ep_reset_busy_poll_napi_id(struct eventpoll *ep)
......
...@@ -23,6 +23,8 @@ ...@@ -23,6 +23,8 @@
*/ */
#define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1)) #define MIN_NAPI_ID ((unsigned int)(NR_CPUS + 1))
#define BUSY_POLL_BUDGET 8
#ifdef CONFIG_NET_RX_BUSY_POLL #ifdef CONFIG_NET_RX_BUSY_POLL
struct napi_struct; struct napi_struct;
...@@ -43,7 +45,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time); ...@@ -43,7 +45,7 @@ bool sk_busy_loop_end(void *p, unsigned long start_time);
void napi_busy_loop(unsigned int napi_id, void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long), bool (*loop_end)(void *, unsigned long),
void *loop_end_arg, bool prefer_busy_poll); void *loop_end_arg, bool prefer_busy_poll, u16 budget);
#else /* CONFIG_NET_RX_BUSY_POLL */ #else /* CONFIG_NET_RX_BUSY_POLL */
static inline unsigned long net_busy_loop_on(void) static inline unsigned long net_busy_loop_on(void)
...@@ -106,7 +108,8 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock) ...@@ -106,7 +108,8 @@ static inline void sk_busy_loop(struct sock *sk, int nonblock)
if (napi_id >= MIN_NAPI_ID) if (napi_id >= MIN_NAPI_ID)
napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk, napi_busy_loop(napi_id, nonblock ? NULL : sk_busy_loop_end, sk,
READ_ONCE(sk->sk_prefer_busy_poll)); READ_ONCE(sk->sk_prefer_busy_poll),
READ_ONCE(sk->sk_busy_poll_budget) ?: BUSY_POLL_BUDGET);
#endif #endif
} }
......
...@@ -302,6 +302,7 @@ struct bpf_local_storage; ...@@ -302,6 +302,7 @@ struct bpf_local_storage;
* @sk_max_ack_backlog: listen backlog set in listen() * @sk_max_ack_backlog: listen backlog set in listen()
* @sk_uid: user id of owner * @sk_uid: user id of owner
* @sk_prefer_busy_poll: prefer busypolling over softirq processing * @sk_prefer_busy_poll: prefer busypolling over softirq processing
* @sk_busy_poll_budget: napi processing budget when busypolling
* @sk_priority: %SO_PRIORITY setting * @sk_priority: %SO_PRIORITY setting
* @sk_type: socket type (%SOCK_STREAM, etc) * @sk_type: socket type (%SOCK_STREAM, etc)
* @sk_protocol: which protocol this socket belongs in this network family * @sk_protocol: which protocol this socket belongs in this network family
...@@ -482,6 +483,7 @@ struct sock { ...@@ -482,6 +483,7 @@ struct sock {
kuid_t sk_uid; kuid_t sk_uid;
#ifdef CONFIG_NET_RX_BUSY_POLL #ifdef CONFIG_NET_RX_BUSY_POLL
u8 sk_prefer_busy_poll; u8 sk_prefer_busy_poll;
u16 sk_busy_poll_budget;
#endif #endif
struct pid *sk_peer_pid; struct pid *sk_peer_pid;
const struct cred *sk_peer_cred; const struct cred *sk_peer_cred;
......
...@@ -120,6 +120,7 @@ ...@@ -120,6 +120,7 @@
#define SO_DETACH_REUSEPORT_BPF 68 #define SO_DETACH_REUSEPORT_BPF 68
#define SO_PREFER_BUSY_POLL 69 #define SO_PREFER_BUSY_POLL 69
#define SO_BUSY_POLL_BUDGET 70
#if !defined(__KERNEL__) #if !defined(__KERNEL__)
......
...@@ -6496,8 +6496,6 @@ static struct napi_struct *napi_by_id(unsigned int napi_id) ...@@ -6496,8 +6496,6 @@ static struct napi_struct *napi_by_id(unsigned int napi_id)
#if defined(CONFIG_NET_RX_BUSY_POLL) #if defined(CONFIG_NET_RX_BUSY_POLL)
#define BUSY_POLL_BUDGET 8
static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
{ {
if (!skip_schedule) { if (!skip_schedule) {
...@@ -6517,7 +6515,8 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule) ...@@ -6517,7 +6515,8 @@ static void __busy_poll_stop(struct napi_struct *napi, bool skip_schedule)
clear_bit(NAPI_STATE_SCHED, &napi->state); clear_bit(NAPI_STATE_SCHED, &napi->state);
} }
static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll) static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool prefer_busy_poll,
u16 budget)
{ {
bool skip_schedule = false; bool skip_schedule = false;
unsigned long timeout; unsigned long timeout;
...@@ -6549,21 +6548,21 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool ...@@ -6549,21 +6548,21 @@ static void busy_poll_stop(struct napi_struct *napi, void *have_poll_lock, bool
/* All we really want here is to re-enable device interrupts. /* All we really want here is to re-enable device interrupts.
* Ideally, a new ndo_busy_poll_stop() could avoid another round. * Ideally, a new ndo_busy_poll_stop() could avoid another round.
*/ */
rc = napi->poll(napi, BUSY_POLL_BUDGET); rc = napi->poll(napi, budget);
/* We can't gro_normal_list() here, because napi->poll() might have /* We can't gro_normal_list() here, because napi->poll() might have
* rearmed the napi (napi_complete_done()) in which case it could * rearmed the napi (napi_complete_done()) in which case it could
* already be running on another CPU. * already be running on another CPU.
*/ */
trace_napi_poll(napi, rc, BUSY_POLL_BUDGET); trace_napi_poll(napi, rc, budget);
netpoll_poll_unlock(have_poll_lock); netpoll_poll_unlock(have_poll_lock);
if (rc == BUSY_POLL_BUDGET) if (rc == budget)
__busy_poll_stop(napi, skip_schedule); __busy_poll_stop(napi, skip_schedule);
local_bh_enable(); local_bh_enable();
} }
void napi_busy_loop(unsigned int napi_id, void napi_busy_loop(unsigned int napi_id,
bool (*loop_end)(void *, unsigned long), bool (*loop_end)(void *, unsigned long),
void *loop_end_arg, bool prefer_busy_poll) void *loop_end_arg, bool prefer_busy_poll, u16 budget)
{ {
unsigned long start_time = loop_end ? busy_loop_current_time() : 0; unsigned long start_time = loop_end ? busy_loop_current_time() : 0;
int (*napi_poll)(struct napi_struct *napi, int budget); int (*napi_poll)(struct napi_struct *napi, int budget);
...@@ -6606,8 +6605,8 @@ void napi_busy_loop(unsigned int napi_id, ...@@ -6606,8 +6605,8 @@ void napi_busy_loop(unsigned int napi_id,
have_poll_lock = netpoll_poll_lock(napi); have_poll_lock = netpoll_poll_lock(napi);
napi_poll = napi->poll; napi_poll = napi->poll;
} }
work = napi_poll(napi, BUSY_POLL_BUDGET); work = napi_poll(napi, budget);
trace_napi_poll(napi, work, BUSY_POLL_BUDGET); trace_napi_poll(napi, work, budget);
gro_normal_list(napi); gro_normal_list(napi);
count: count:
if (work > 0) if (work > 0)
...@@ -6620,7 +6619,7 @@ void napi_busy_loop(unsigned int napi_id, ...@@ -6620,7 +6619,7 @@ void napi_busy_loop(unsigned int napi_id,
if (unlikely(need_resched())) { if (unlikely(need_resched())) {
if (napi_poll) if (napi_poll)
busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable(); preempt_enable();
rcu_read_unlock(); rcu_read_unlock();
cond_resched(); cond_resched();
...@@ -6631,7 +6630,7 @@ void napi_busy_loop(unsigned int napi_id, ...@@ -6631,7 +6630,7 @@ void napi_busy_loop(unsigned int napi_id,
cpu_relax(); cpu_relax();
} }
if (napi_poll) if (napi_poll)
busy_poll_stop(napi, have_poll_lock, prefer_busy_poll); busy_poll_stop(napi, have_poll_lock, prefer_busy_poll, budget);
preempt_enable(); preempt_enable();
out: out:
rcu_read_unlock(); rcu_read_unlock();
......
...@@ -1165,6 +1165,16 @@ int sock_setsockopt(struct socket *sock, int level, int optname, ...@@ -1165,6 +1165,16 @@ int sock_setsockopt(struct socket *sock, int level, int optname,
else else
WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); WRITE_ONCE(sk->sk_prefer_busy_poll, valbool);
break; break;
case SO_BUSY_POLL_BUDGET:
if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) {
ret = -EPERM;
} else {
if (val < 0 || val > U16_MAX)
ret = -EINVAL;
else
WRITE_ONCE(sk->sk_busy_poll_budget, val);
}
break;
#endif #endif
case SO_MAX_PACING_RATE: case SO_MAX_PACING_RATE:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment