Commit 4ab6c99d, authored by Willem de Bruijn, committed by David S. Miller

sock: MSG_ZEROCOPY notification coalescing

In the simple case, each sendmsg() call generates data and eventually
a zerocopy ready notification N, where N indicates the Nth successful
invocation of sendmsg() with the MSG_ZEROCOPY flag on this socket.

TCP and corked sockets can cause send() calls to append new data to an
existing sk_buff and, thus, ubuf_info. In that case the notification
must hold a range. Modify ubuf_info to store an inclusive range [N..N+m]
and add sock_zerocopy_realloc() to optionally extend an existing range.

Also coalesce notifications in this common case: if a notification
[1, 1] is about to be queued while [0, 0] is the queue tail, just modify
the tail of the queue to read [0, 1].

Coalescing is limited to a few TSO frames worth of data to bound
notification latency.
Signed-off-by: Willem de Bruijn <willemb@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 1f8b977a
......@@ -444,15 +444,26 @@ enum {
*/
struct ubuf_info {
	/* Completion hook; receives this ubuf_info and a success flag. */
	void (*callback)(struct ubuf_info *, bool zerocopy_success);
	union {
		/*
		 * Descriptor/context view. ctx is declared only here: members
		 * of an anonymous struct share the enclosing struct's
		 * namespace, so a second top-level ctx would be a duplicate
		 * member.
		 */
		struct {
			unsigned long desc;
			void *ctx;
		};
		/*
		 * MSG_ZEROCOPY view: one notification range per ubuf_info,
		 * covering ids [id, id + len - 1].
		 */
		struct {
			u32 id;		/* first notification id of the range */
			u16 len;	/* number of ids in the range */
			u16 zerocopy:1;	/* set to 1 by sock_zerocopy_alloc() */
			u32 bytelen;	/* bytes accounted to this range */
		};
	};
	atomic_t refcnt;
};
/* Fetch the ubuf_info stored in the skb's shared-info destructor_arg. */
#define skb_uarg(SKB) ((struct ubuf_info *)(skb_shinfo(SKB)->destructor_arg))
/* Set up a new notification on @sk: a single id (len 1) covering @size bytes. */
struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size);
/*
 * Extend @uarg by one more send of @size bytes when it can be coalesced;
 * otherwise fall back to sock_zerocopy_alloc(). May return NULL.
 */
struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
struct ubuf_info *uarg);
static inline void sock_zerocopy_get(struct ubuf_info *uarg)
{
......
......@@ -915,7 +915,9 @@ struct ubuf_info *sock_zerocopy_alloc(struct sock *sk, size_t size)
uarg = (void *)skb->cb;
uarg->callback = sock_zerocopy_callback;
uarg->desc = atomic_inc_return(&sk->sk_zckey) - 1;
uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1;
uarg->len = 1;
uarg->bytelen = size;
uarg->zerocopy = 1;
atomic_set(&uarg->refcnt, 0);
sock_hold(sk);
......@@ -929,26 +931,101 @@ static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg)
return container_of((void *)uarg, struct sk_buff, cb);
}
/*
 * sock_zerocopy_realloc - account one more zerocopy send of @size bytes
 * @sk:   socket issuing the send
 * @size: number of bytes in this send
 * @uarg: notification state to extend, or NULL to start a new one
 *
 * When @uarg is given and its id range ends right before the socket's next
 * zerocopy key, the range grows by one id and @size bytes and @uarg itself
 * is returned. Otherwise a new ubuf_info is obtained from
 * sock_zerocopy_alloc().
 *
 * Returns NULL when the socket is not owned by the caller, or when @uarg is
 * full (id count or byte limit reached) on a non-stream socket.
 */
struct ubuf_info *sock_zerocopy_realloc(struct sock *sk, size_t size,
					struct ubuf_info *uarg)
{
	const u32 max_coalesced = 1 << 19;	/* limit to a few TSO */
	u32 total, zckey;

	/* Nothing to extend: start a fresh notification. */
	if (!uarg)
		return sock_zerocopy_alloc(sk, size);

	/* realloc only when socket is locked (TCP, UDP cork),
	 * so uarg->len and sk_zckey access is serialized
	 */
	if (!sock_owned_by_user(sk)) {
		WARN_ON_ONCE(1);
		return NULL;
	}

	total = uarg->bytelen + size;
	if (uarg->len == USHRT_MAX - 1 || total > max_coalesced) {
		/* Only TCP can create a new skb to attach a new uarg to. */
		if (sk->sk_type != SOCK_STREAM)
			return NULL;
		return sock_zerocopy_alloc(sk, size);
	}

	/* Coalesce only if this range is the most recently issued one. */
	zckey = (u32)atomic_read(&sk->sk_zckey);
	if ((u32)(uarg->id + uarg->len) != zckey)
		return sock_zerocopy_alloc(sk, size);

	uarg->len++;
	uarg->bytelen = total;
	atomic_set(&sk->sk_zckey, zckey + 1);
	return uarg;
}
EXPORT_SYMBOL_GPL(sock_zerocopy_realloc);
/*
 * Try to fold the id range [lo, lo + len - 1] into the notification already
 * stored in @skb's extended error block (ee_info = low id, ee_data = high id).
 *
 * The merge is refused when the combined range would hold 2^32 ids or more,
 * or when @lo does not directly follow the stored high id.
 *
 * Returns true if the stored high id was advanced by @len, false otherwise.
 */
static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len)
{
	struct sock_exterr_skb *queued = SKB_EXT_ERR(skb);
	const u32 first = queued->ee.ee_info;
	const u32 last = queued->ee.ee_data;
	const u32 span = last - first;		/* 32-bit wraparound intended */
	const u64 merged = span + 1ULL + len;	/* id count after merging */

	if (merged >= (1ULL << 32) || lo != last + 1)
		return false;

	queued->ee.ee_data += len;
	return true;
}
void sock_zerocopy_callback(struct ubuf_info *uarg, bool success)
{
struct sk_buff *skb = skb_from_uarg(uarg);
struct sk_buff *tail, *skb = skb_from_uarg(uarg);
struct sock_exterr_skb *serr;
struct sock *sk = skb->sk;
u16 id = uarg->desc;
struct sk_buff_head *q;
unsigned long flags;
u32 lo, hi;
u16 len;
if (sock_flag(sk, SOCK_DEAD))
/* if !len, there was only 1 call, and it was aborted
* so do not queue a completion notification
*/
if (!uarg->len || sock_flag(sk, SOCK_DEAD))
goto release;
len = uarg->len;
lo = uarg->id;
hi = uarg->id + len - 1;
serr = SKB_EXT_ERR(skb);
memset(serr, 0, sizeof(*serr));
serr->ee.ee_errno = 0;
serr->ee.ee_origin = SO_EE_ORIGIN_ZEROCOPY;
serr->ee.ee_data = id;
serr->ee.ee_data = hi;
serr->ee.ee_info = lo;
if (!success)
serr->ee.ee_code |= SO_EE_CODE_ZEROCOPY_COPIED;
skb_queue_tail(&sk->sk_error_queue, skb);
q = &sk->sk_error_queue;
spin_lock_irqsave(&q->lock, flags);
tail = skb_peek_tail(q);
if (!tail || SKB_EXT_ERR(tail)->ee.ee_origin != SO_EE_ORIGIN_ZEROCOPY ||
!skb_zerocopy_notify_extend(tail, lo, len)) {
__skb_queue_tail(q, skb);
skb = NULL;
}
spin_unlock_irqrestore(&q->lock, flags);
sk->sk_error_report(sk);
......@@ -975,6 +1052,7 @@ void sock_zerocopy_put_abort(struct ubuf_info *uarg)
struct sock *sk = skb_from_uarg(uarg)->sk;
atomic_dec(&sk->sk_zckey);
uarg->len--;
/* sock_zerocopy_put expects a ref. Most sockets take one per
* skb, which is zero on abort. tcp_sendmsg holds one extra, to
......@@ -995,9 +1073,16 @@ int skb_zerocopy_iter_stream(struct sock *sk, struct sk_buff *skb,
struct msghdr *msg, int len,
struct ubuf_info *uarg)
{
struct ubuf_info *orig_uarg = skb_zcopy(skb);
struct iov_iter orig_iter = msg->msg_iter;
int err, orig_len = skb->len;
/* An skb can only point to one uarg. This edge case happens when
* TCP appends to an skb, but zerocopy_realloc triggered a new alloc.
*/
if (orig_uarg && uarg != orig_uarg)
return -EEXIST;
err = __zerocopy_sg_from_iter(sk, skb, &msg->msg_iter, len);
if (err == -EFAULT || (err == -EMSGSIZE && skb->len == orig_len)) {
/* Streams do not free skb on error. Reset to prev state. */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment