Merge branch 'perf-optimizations-for-tcp-recv-zerocopy'

Arjun Roy says: ==================== Perf. optimizations for TCP Recv. Zerocopy This patchset contains several optimizations for TCP Recv. Zerocopy. Summarized: 1. It is possible that a read payload is not exactly page aligned - that there may exist "straggler" bytes that we cannot map into the caller's address space cleanly. For this, we allow the caller to provide as argument a "hybrid copy buffer", turning getsockopt(TCP_ZEROCOPY_RECEIVE) into a "hybrid" operation that allows the caller to avoid a subsequent recvmsg() call to read the stragglers. 2. Similarly, for "small" read payloads that are either below the size of a page, or small enough that remapping pages is not a performance win - we allow the user to short-circuit the remapping operations entirely and simply copy into the buffer provided. Some of the patches in the middle of this set are refactors to support this "short-circuiting" optimization. 3. We allow the user to provide a hint that performing a page zap operation (and the accompanying TLB shootdown) may not be necessary, for the provided region that the kernel will attempt to map pages into. This allows us to avoid this expensive operation while holding the socket lock, which provides a significant performance advantage. With all of these changes combined, "medium" sized receive traffic (multiple tens to few hundreds of KB) see significant efficiency gains when using TCP receive zerocopy instead of regular recvmsg(). For example, with RPC-style traffic with 32KB messages, there is a roughly 15% efficiency improvement when using zerocopy. Without these changes, there is a roughly 60-70% efficiency reduction with such messages when employing zerocopy. ==================== Link: https://lore.kernel.org/r/20201202225349.935284-1-arjunroy.kdev@gmail.comSigned-off-by: Jakub Kicinski <kuba@kernel.org>

Merge branch 'perf-optimizations-for-tcp-recv-zerocopy'
Arjun Roy says: ==================== Perf. optimizations for TCP Recv. Zerocopy This patchset contains several optimizations for TCP Recv. Zerocopy. Summarized: 1. It is possible that a read payload is not exactly page aligned - that there may exist "straggler" bytes that we cannot map into the caller's address space cleanly. For this, we allow the caller to provide as argument a "hybrid copy buffer", turning getsockopt(TCP_ZEROCOPY_RECEIVE) into a "hybrid" operation that allows the caller to avoid a subsequent recvmsg() call to read the stragglers. 2. Similarly, for "small" read payloads that are either below the size of a page, or small enough that remapping pages is not a performance win - we allow the user to short-circuit the remapping operations entirely and simply copy into the buffer provided. Some of the patches in the middle of this set are refactors to support this "short-circuiting" optimization. 3. We allow the user to provide a hint that performing a page zap operation (and the accompanying TLB shootdown) may not be necessary, for the provided region that the kernel will attempt to map pages into. This allows us to avoid this expensive operation while holding the socket lock, which provides a significant performance advantage. With all of these changes combined, "medium" sized receive traffic (multiple tens to few hundreds of KB) see significant efficiency gains when using TCP receive zerocopy instead of regular recvmsg(). For example, with RPC-style traffic with 32KB messages, there is a roughly 15% efficiency improvement when using zerocopy. Without these changes, there is a roughly 60-70% efficiency reduction with such messages when employing zerocopy. ==================== Link: https://lore.kernel.org/r/20201202225349.935284-1-arjunroy.kdev@gmail.comSigned-off-by: Jakub Kicinski <kuba@kernel.org>
43be3a3c · Jakub Kicinski · 4be986c8 · 94ab9eb9 · 43be3a3c · 43be3a3c
Commit 43be3a3c authored Dec 04, 2020 by Jakub Kicinski
Hide whitespace changes
Inline Side-by-side

Showing with 343 additions and 107 deletions

include/uapi/linux/tcp.h include/uapi/linux/tcp.h +4 -0

net/ipv4/tcp.c net/ipv4/tcp.c +339 -107

No files found.
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -343,11 +343,15 @@ struct tcp_diag_md5sig {
 /* setsockopt(fd, IPPROTO_TCP, TCP_ZEROCOPY_RECEIVE, ...) */
+#define TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT 0x1
 struct tcp_zerocopy_receive {
 	__u64 address;		/* in: address of mapping */
 	__u32 length;		/* in/out: number of bytes to map/mapped */
 	__u32 recv_skip_hint;	/* out: amount of bytes to skip */
 	__u32 inq; /* out: amount of bytes in read queue */
 	__s32 err; /* out: socket error */
+	__u64 copybuf_address;	/* in: copybuf address (small reads) */
+	__s32 copybuf_len; /* in/out: copybuf bytes avail/used or error */
+	__u32 flags; /* in: flags */
 };
 #endif /* _UAPI_LINUX_TCP_H */
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -1758,52 +1758,272 @@ int tcp_mmap(struct file *file, struct socket *sock,
 }
 EXPORT_SYMBOL(tcp_mmap);
+static skb_frag_t *skb_advance_to_frag(struct sk_buff *skb, u32 offset_skb,
+				       u32 *offset_frag)
+{
+	skb_frag_t *frag;
+	offset_skb -= skb_headlen(skb);
+	if ((int)offset_skb < 0 || skb_has_frag_list(skb))
+		return NULL;
+	frag = skb_shinfo(skb)->frags;
+	while (offset_skb) {
+		if (skb_frag_size(frag) > offset_skb) {
+			*offset_frag = offset_skb;
+			return frag;
+		}
+		offset_skb -= skb_frag_size(frag);
+		++frag;
+	}
+	*offset_frag = 0;
+	return frag;
+}
+static bool can_map_frag(const skb_frag_t *frag)
+{
+	return skb_frag_size(frag) == PAGE_SIZE && !skb_frag_off(frag);
+}
+static int find_next_mappable_frag(const skb_frag_t *frag,
+				   int remaining_in_skb)
+{
+	int offset = 0;
+	if (likely(can_map_frag(frag)))
+		return 0;
+	while (offset < remaining_in_skb && !can_map_frag(frag)) {
+		offset += skb_frag_size(frag);
+		++frag;
+	}
+	return offset;
+}
+static void tcp_zerocopy_set_hint_for_skb(struct sock *sk,
+					  struct tcp_zerocopy_receive *zc,
+					  struct sk_buff *skb, u32 offset)
+{
+	u32 frag_offset, partial_frag_remainder = 0;
+	int mappable_offset;
+	skb_frag_t *frag;
+	/* worst case: skip to next skb. try to improve on this case below */
+	zc->recv_skip_hint = skb->len - offset;
+	/* Find the frag containing this offset (and how far into that frag) */
+	frag = skb_advance_to_frag(skb, offset, &frag_offset);
+	if (!frag)
+		return;
+	if (frag_offset) {
+		struct skb_shared_info *info = skb_shinfo(skb);
+		/* We read part of the last frag, must recvmsg() rest of skb. */
+		if (frag == &info->frags[info->nr_frags - 1])
+			return;
+		/* Else, we must at least read the remainder in this frag. */
+		partial_frag_remainder = skb_frag_size(frag) - frag_offset;
+		zc->recv_skip_hint -= partial_frag_remainder;
+		++frag;
+	}
+	/* partial_frag_remainder: If part way through a frag, must read rest.
+	 * mappable_offset: Bytes till next mappable frag, *not* counting bytes
+	 * in partial_frag_remainder.
+	 */
+	mappable_offset = find_next_mappable_frag(frag, zc->recv_skip_hint);
+	zc->recv_skip_hint = mappable_offset + partial_frag_remainder;
+}
+static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
+			      int nonblock, int flags,
+			      struct scm_timestamping_internal *tss,
+			      int *cmsg_flags);
+static int receive_fallback_to_copy(struct sock *sk,
+				    struct tcp_zerocopy_receive *zc, int inq)
+{
+	unsigned long copy_address = (unsigned long)zc->copybuf_address;
+	struct scm_timestamping_internal tss_unused;
+	int err, cmsg_flags_unused;
+	struct msghdr msg = {};
+	struct iovec iov;
+	zc->length = 0;
+	zc->recv_skip_hint = 0;
+	if (copy_address != zc->copybuf_address)
+		return -EINVAL;
+	err = import_single_range(READ, (void __user *)copy_address,
+				  inq, &iov, &msg.msg_iter);
+	if (err)
+		return err;
+	err = tcp_recvmsg_locked(sk, &msg, inq, /*nonblock=*/1, /*flags=*/0,
+				 &tss_unused, &cmsg_flags_unused);
+	if (err < 0)
+		return err;
+	zc->copybuf_len = err;
+	if (likely(zc->copybuf_len)) {
+		struct sk_buff *skb;
+		u32 offset;
+		skb = tcp_recv_skb(sk, tcp_sk(sk)->copied_seq, &offset);
+		if (skb)
+			tcp_zerocopy_set_hint_for_skb(sk, zc, skb, offset);
+	}
+	return 0;
+}
+static int tcp_copy_straggler_data(struct tcp_zerocopy_receive *zc,
+				   struct sk_buff *skb, u32 copylen,
+				   u32 *offset, u32 *seq)
+{
+	unsigned long copy_address = (unsigned long)zc->copybuf_address;
+	struct msghdr msg = {};
+	struct iovec iov;
+	int err;
+	if (copy_address != zc->copybuf_address)
+		return -EINVAL;
+	err = import_single_range(READ, (void __user *)copy_address,
+				  copylen, &iov, &msg.msg_iter);
+	if (err)
+		return err;
+	err = skb_copy_datagram_msg(skb, *offset, &msg, copylen);
+	if (err)
+		return err;
+	zc->recv_skip_hint -= copylen;
+	*offset += copylen;
+	*seq += copylen;
+	return (__s32)copylen;
+}
+static int tcp_zerocopy_handle_leftover_data(struct tcp_zerocopy_receive *zc,
+					     struct sock *sk,
+					     struct sk_buff *skb,
+					     u32 *seq,
+					     s32 copybuf_len)
+{
+	u32 offset, copylen = min_t(u32, copybuf_len, zc->recv_skip_hint);
+	if (!copylen)
+		return 0;
+	/* skb is null if inq < PAGE_SIZE. */
+	if (skb)
+		offset = *seq - TCP_SKB_CB(skb)->seq;
+	else
+		skb = tcp_recv_skb(sk, *seq, &offset);
+	zc->copybuf_len = tcp_copy_straggler_data(zc, skb, copylen, &offset,
+						  seq);
+	return zc->copybuf_len < 0 ? 0 : copylen;
+}
+static int tcp_zerocopy_vm_insert_batch_error(struct vm_area_struct *vma,
+					      struct page **pending_pages,
+					      unsigned long pages_remaining,
+					      unsigned long *address,
+					      u32 *length,
+					      u32 *seq,
+					      struct tcp_zerocopy_receive *zc,
+					      u32 total_bytes_to_map,
+					      int err)
+{
+	/* At least one page did not map. Try zapping if we skipped earlier. */
+	if (err == -EBUSY &&
+	    zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT) {
+		u32 maybe_zap_len;
+		maybe_zap_len = total_bytes_to_map -  /* All bytes to map */
+				*length + /* Mapped or pending */
+				(pages_remaining * PAGE_SIZE); /* Failed map. */
+		zap_page_range(vma, *address, maybe_zap_len);
+		err = 0;
+	}
+	if (!err) {
+		unsigned long leftover_pages = pages_remaining;
+		int bytes_mapped;
+		/* We called zap_page_range, try to reinsert. */
+		err = vm_insert_pages(vma, *address,
+				      pending_pages,
+				      &pages_remaining);
+		bytes_mapped = PAGE_SIZE * (leftover_pages - pages_remaining);
+		*seq += bytes_mapped;
+		*address += bytes_mapped;
+	}
+	if (err) {
+		/* Either we were unable to zap, OR we zapped, retried an
+		 * insert, and still had an issue. Either ways, pages_remaining
+		 * is the number of pages we were unable to map, and we unroll
+		 * some state we speculatively touched before.
+		 */
+		const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
+		*length -= bytes_not_mapped;
+		zc->recv_skip_hint += bytes_not_mapped;
+	}
+	return err;
+}
 static int tcp_zerocopy_vm_insert_batch(struct vm_area_struct *vma,
 					struct page **pages,
-					unsigned long pages_to_map,
+					unsigned int pages_to_map,
-					unsigned long *insert_addr,
+					unsigned long *address,
-					u32 *length_with_pending,
+					u32 *length,
 					u32 *seq,
-					struct tcp_zerocopy_receive *zc)
+					struct tcp_zerocopy_receive *zc,
+					u32 total_bytes_to_map)
 {
 	unsigned long pages_remaining = pages_to_map;
-	int bytes_mapped;
+	unsigned int pages_mapped;
-	int ret;
+	unsigned int bytes_mapped;
+	int err;
-	ret = vm_insert_pages(vma, *insert_addr, pages, &pages_remaining);
+	err = vm_insert_pages(vma, *address, pages, &pages_remaining);
-	bytes_mapped = PAGE_SIZE * (pages_to_map - pages_remaining);
+	pages_mapped = pages_to_map - (unsigned int)pages_remaining;
+	bytes_mapped = PAGE_SIZE * pages_mapped;
 	/* Even if vm_insert_pages fails, it may have partially succeeded in
 	 * mapping (some but not all of the pages).
 	 */
 	*seq += bytes_mapped;
-	*insert_addr += bytes_mapped;
+	*address += bytes_mapped;
-	if (ret) {
-		/* But if vm_insert_pages did fail, we have to unroll some state
+	if (likely(!err))
-		 * we speculatively touched before.
+		return 0;
-		 */
-		const int bytes_not_mapped = PAGE_SIZE * pages_remaining;
+	/* Error: maybe zap and retry + rollback state for failed inserts. */
-		*length_with_pending -= bytes_not_mapped;
+	return tcp_zerocopy_vm_insert_batch_error(vma, pages + pages_mapped,
-		zc->recv_skip_hint += bytes_not_mapped;
+		pages_remaining, address, length, seq, zc, total_bytes_to_map,
-	}
+		err);
-	return ret;
 }
+#define TCP_ZEROCOPY_PAGE_BATCH_SIZE 32
 static int tcp_zerocopy_receive(struct sock *sk,
 				struct tcp_zerocopy_receive *zc)
 {
+	u32 length = 0, offset, vma_len, avail_len, copylen = 0;
 	unsigned long address = (unsigned long)zc->address;
-	u32 length = 0, seq, offset, zap_len;
+	struct page *pages[TCP_ZEROCOPY_PAGE_BATCH_SIZE];
-	#define PAGE_BATCH_SIZE 8
+	s32 copybuf_len = zc->copybuf_len;
-	struct page *pages[PAGE_BATCH_SIZE];
+	struct tcp_sock *tp = tcp_sk(sk);
 	const skb_frag_t *frags = NULL;
+	unsigned int pages_to_map = 0;
 	struct vm_area_struct *vma;
 	struct sk_buff *skb = NULL;
-	unsigned long pg_idx = 0;
+	u32 seq = tp->copied_seq;
-	unsigned long curr_addr;
+	u32 total_bytes_to_map;
-	struct tcp_sock *tp;
+	int inq = tcp_inq(sk);
-	int inq;
 	int ret;
+	zc->copybuf_len = 0;
 	if (address & (PAGE_SIZE - 1) || address != zc->address)
 		return -EINVAL;
@@ -1812,7 +2032,16 @@ static int tcp_zerocopy_receive(struct sock *sk,
 	sock_rps_record_flow(sk);
-	tp = tcp_sk(sk);
+	if (inq && inq <= copybuf_len)
+		return receive_fallback_to_copy(sk, zc, inq);
+	if (inq < PAGE_SIZE) {
+		zc->length = 0;
+		zc->recv_skip_hint = inq;
+		if (!inq && sock_flag(sk, SOCK_DONE))
+			return -EIO;
+		return 0;
+	}
 	mmap_read_lock(current->mm);
@@ -1821,33 +2050,26 @@ static int tcp_zerocopy_receive(struct sock *sk,
 		mmap_read_unlock(current->mm);
 		return -EINVAL;
 	}
-	zc->length = min_t(unsigned long, zc->length, vma->vm_end - address);
+	vma_len = min_t(unsigned long, zc->length, vma->vm_end - address);
+	avail_len = min_t(u32, vma_len, inq);
-	seq = tp->copied_seq;
+	total_bytes_to_map = avail_len & ~(PAGE_SIZE - 1);
-	inq = tcp_inq(sk);
+	if (total_bytes_to_map) {
-	zc->length = min_t(u32, zc->length, inq);
+		if (!(zc->flags & TCP_RECEIVE_ZEROCOPY_FLAG_TLB_CLEAN_HINT))
-	zap_len = zc->length & ~(PAGE_SIZE - 1);
+			zap_page_range(vma, address, total_bytes_to_map);
-	if (zap_len) {
+		zc->length = total_bytes_to_map;
-		zap_page_range(vma, address, zap_len);
 		zc->recv_skip_hint = 0;
 	} else {
-		zc->recv_skip_hint = zc->length;
+		zc->length = avail_len;
+		zc->recv_skip_hint = avail_len;
 	}
 	ret = 0;
-	curr_addr = address;
 	while (length + PAGE_SIZE <= zc->length) {
+		int mappable_offset;
+		struct page *page;
 		if (zc->recv_skip_hint < PAGE_SIZE) {
-			/* If we're here, finish the current batch. */
+			u32 offset_frag;
-			if (pg_idx) {
-				ret = tcp_zerocopy_vm_insert_batch(vma, pages,
-								   pg_idx,
-								   &curr_addr,
-								   &length,
-								   &seq, zc);
-				if (ret)
-					goto out;
-				pg_idx = 0;
-			}
 			if (skb) {
 				if (zc->recv_skip_hint > 0)
 					break;
@@ -1857,56 +2079,57 @@ static int tcp_zerocopy_receive(struct sock *sk,
 				skb = tcp_recv_skb(sk, seq, &offset);
 			}
 			zc->recv_skip_hint = skb->len - offset;
-			offset -= skb_headlen(skb);
+			frags = skb_advance_to_frag(skb, offset, &offset_frag);
-			if ((int)offset < 0 || skb_has_frag_list(skb))
+			if (!frags || offset_frag)
 				break;
-			frags = skb_shinfo(skb)->frags;
-			while (offset) {
-				if (skb_frag_size(frags) > offset)
-					goto out;
-				offset -= skb_frag_size(frags);
-				frags++;
-			}
 		}
-		if (skb_frag_size(frags) != PAGE_SIZE || skb_frag_off(frags)) {
-			int remaining = zc->recv_skip_hint;
-			while (remaining && (skb_frag_size(frags) != PAGE_SIZE ||
+		mappable_offset = find_next_mappable_frag(frags,
-					     skb_frag_off(frags))) {
+							  zc->recv_skip_hint);
-				remaining -= skb_frag_size(frags);
+		if (mappable_offset) {
-				frags++;
+			zc->recv_skip_hint = mappable_offset;
-			}
-			zc->recv_skip_hint -= remaining;
 			break;
 		}
-		pages[pg_idx] = skb_frag_page(frags);
+		page = skb_frag_page(frags);
-		pg_idx++;
+		prefetchw(page);
+		pages[pages_to_map++] = page;
 		length += PAGE_SIZE;
 		zc->recv_skip_hint -= PAGE_SIZE;
 		frags++;
-		if (pg_idx == PAGE_BATCH_SIZE) {
+		if (pages_to_map == TCP_ZEROCOPY_PAGE_BATCH_SIZE ||
-			ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
+		    zc->recv_skip_hint < PAGE_SIZE) {
-							   &curr_addr, &length,
+			/* Either full batch, or we're about to go to next skb
-							   &seq, zc);
+			 * (and we cannot unroll failed ops across skbs).
+			 */
+			ret = tcp_zerocopy_vm_insert_batch(vma, pages,
+							   pages_to_map,
+							   &address, &length,
+							   &seq, zc,
+							   total_bytes_to_map);
 			if (ret)
 				goto out;
-			pg_idx = 0;
+			pages_to_map = 0;
 		}
 	}
-	if (pg_idx) {
+	if (pages_to_map) {
-		ret = tcp_zerocopy_vm_insert_batch(vma, pages, pg_idx,
+		ret = tcp_zerocopy_vm_insert_batch(vma, pages, pages_to_map,
-						   &curr_addr, &length, &seq,
+						   &address, &length, &seq,
-						   zc);
+						   zc, total_bytes_to_map);
 	}
 out:
 	mmap_read_unlock(current->mm);
-	if (length) {
+	/* Try to copy straggler data. */
+	if (!ret)
+		copylen = tcp_zerocopy_handle_leftover_data(zc, sk, skb, &seq,
+							    copybuf_len);
+	if (length + copylen) {
 		WRITE_ONCE(tp->copied_seq, seq);
 		tcp_rcv_space_adjust(sk);
 		/* Clean up data we have read: This will do ACK frames. */
 		tcp_recv_skb(sk, seq, &offset);
-		tcp_cleanup_rbuf(sk, length);
+		tcp_cleanup_rbuf(sk, length + copylen);
 		ret = 0;
 		if (length == zc->length)
 			zc->recv_skip_hint = 0;
@@ -2028,36 +2251,28 @@ static int tcp_inq_hint(struct sock *sk)
 *	Probably, code can be easily improved even more.
 */
-int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
+static int tcp_recvmsg_locked(struct sock *sk, struct msghdr *msg, size_t len,
-		int flags, int *addr_len)
+			      int nonblock, int flags,
+			      struct scm_timestamping_internal *tss,
+			      int *cmsg_flags)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	int copied = 0;
 	u32 peek_seq;
 	u32 *seq;
 	unsigned long used;
-	int err, inq;
+	int err;
 	int target;		/* Read at least this many bytes */
 	long timeo;
 	struct sk_buff *skb, *last;
 	u32 urg_hole = 0;
-	struct scm_timestamping_internal tss;
-	int cmsg_flags;
-	if (unlikely(flags & MSG_ERRQUEUE))
-		return inet_recv_error(sk, msg, len, addr_len);
-	if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue) &&
-	    (sk->sk_state == TCP_ESTABLISHED))
-		sk_busy_loop(sk, nonblock);
-	lock_sock(sk);
 	err = -ENOTCONN;
 	if (sk->sk_state == TCP_LISTEN)
 		goto out;
-	cmsg_flags = tp->recvmsg_inq ? 1 : 0;
+	if (tp->recvmsg_inq)
+		*cmsg_flags = 1;
 	timeo = sock_rcvtimeo(sk, nonblock);
 	/* Urgent data needs to be handled specially. */
@@ -2237,8 +2452,8 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 		}
 		if (TCP_SKB_CB(skb)->has_rxtstamp) {
-			tcp_update_recv_tstamps(skb, &tss);
+			tcp_update_recv_tstamps(skb, tss);
-			cmsg_flags |= 2;
+			*cmsg_flags |= 2;
 		}
 		if (used + offset < skb->len)
@@ -2264,22 +2479,9 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	/* Clean up data we have read: This will do ACK frames. */
 	tcp_cleanup_rbuf(sk, copied);
-	release_sock(sk);
-	if (cmsg_flags) {
-		if (cmsg_flags & 2)
-			tcp_recv_timestamp(msg, sk, &tss);
-		if (cmsg_flags & 1) {
-			inq = tcp_inq_hint(sk);
-			put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
-		}
-	}
 	return copied;
 out:
-	release_sock(sk);
 	return err;
 recv_urg:
@@ -2290,6 +2492,36 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
 	err = tcp_peek_sndq(sk, msg, len);
 	goto out;
 }
+int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
+		int flags, int *addr_len)
+{
+	int cmsg_flags = 0, ret, inq;
+	struct scm_timestamping_internal tss;
+	if (unlikely(flags & MSG_ERRQUEUE))
+		return inet_recv_error(sk, msg, len, addr_len);
+	if (sk_can_busy_loop(sk) &&
+	    skb_queue_empty_lockless(&sk->sk_receive_queue) &&
+	    sk->sk_state == TCP_ESTABLISHED)
+		sk_busy_loop(sk, nonblock);
+	lock_sock(sk);
+	ret = tcp_recvmsg_locked(sk, msg, len, nonblock, flags, &tss,
+				 &cmsg_flags);
+	release_sock(sk);
+	if (cmsg_flags && ret >= 0) {
+		if (cmsg_flags & 2)
+			tcp_recv_timestamp(msg, sk, &tss);
+		if (cmsg_flags & 1) {
+			inq = tcp_inq_hint(sk);
+			put_cmsg(msg, SOL_TCP, TCP_CM_INQ, sizeof(inq), &inq);
+		}
+	}
+	return ret;
+}
 EXPORT_SYMBOL(tcp_recvmsg);
 void tcp_set_state(struct sock *sk, int state)