Commit 07e1d6b3 authored by David S. Miller

Merge branch 'skb_expand_head'

Vasily Averin says:

====================
skbuff: introduce skb_expand_head()

Currently, if an skb does not have enough headroom, skb_realloc_headroom()
is called. This is not optimal because it creates a new skb.

This patch set introduces a new helper, skb_expand_head().
Unlike skb_realloc_headroom(), it does not allocate a new skb if possible;
it copies skb->sk to the new skb as needed and frees the original skb in
case of failure.

This helps to simplify ip[6]_finish_output2(), ip6_xmit() and a few other
functions in vrf, ax25 and bpf.

There are a few other places where this helper could be used,
but they require additional investigation.

v3 changes:
 - ax25 compilation warning fixed
 - v5.14-rc4 rebase
 - now it does not depend on uncommitted patches

v2 changes:
 - helper's name was changed to skb_expand_head
 - fixed a few mistakes inside skb_expand_head():
    skb_set_owner_w should set sk on nskb
    kfree was replaced by kfree_skb()
    improved warning message
 - added minor refactoring in changed functions in vrf and bpf patches
 - removed kfree_skb() in ax25_rt_build_path caller ax25_ip_xmit
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents fa976624 a1e975e1
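
To make the pattern change concrete before the diffs, here is a minimal
caller-side sketch. The function my_xmit() and its use of dev_queue_xmit()
are illustrative assumptions, not part of this series; the helper semantics
are those described in the cover letter above. The old code had to allocate
a second skb, transfer socket ownership, and free the original by hand; the
new helper collapses all of that into one call whose failure path has
already freed the skb.

	/* Hypothetical transmit helper illustrating the conversion. */
	#include <linux/skbuff.h>
	#include <linux/netdevice.h>

	static int my_xmit(struct sk_buff *skb, struct net_device *dev)
	{
		unsigned int hh_len = LL_RESERVED_SPACE(dev);

		/*
		 * Old pattern, as removed by the patches below:
		 *
		 *	struct sk_buff *skb2 = skb_realloc_headroom(skb, hh_len);
		 *	if (!skb2) {
		 *		kfree_skb(skb);
		 *		return -ENOMEM;
		 *	}
		 *	if (skb->sk)
		 *		skb_set_owner_w(skb2, skb->sk);
		 *	consume_skb(skb);
		 *	skb = skb2;
		 */

		/* New pattern: on failure the original skb is already freed. */
		if (unlikely(skb_headroom(skb) < hh_len)) {
			skb = skb_expand_head(skb, hh_len);
			if (!skb)
				return -ENOMEM;
		}

		return dev_queue_xmit(skb);
	}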
--- a/drivers/net/vrf.c
+++ b/drivers/net/vrf.c
@@ -857,30 +857,24 @@ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
 	struct neighbour *neigh;
 	bool is_v6gw = false;
-	int ret = -EINVAL;
 
 	nf_reset_ct(skb);
 
 	/* Be paranoid, rather than too clever. */
 	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
-		struct sk_buff *skb2;
-
-		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
-		if (!skb2) {
-			ret = -ENOMEM;
-			goto err;
+		skb = skb_expand_head(skb, hh_len);
+		if (!skb) {
+			dev->stats.tx_errors++;
+			return -ENOMEM;
 		}
-
-		if (skb->sk)
-			skb_set_owner_w(skb2, skb->sk);
-
-		consume_skb(skb);
-		skb = skb2;
 	}
 
 	rcu_read_lock_bh();
 
 	neigh = ip_neigh_for_gw(rt, skb, &is_v6gw);
 	if (!IS_ERR(neigh)) {
 		int ret;
 
 		sock_confirm_neigh(skb, neigh);
 		/* if crossing protocols, can not use the cached header */
 		ret = neigh_output(neigh, skb, is_v6gw);
@@ -889,9 +883,8 @@ static int vrf_finish_output(struct net *net, struct sock *sk, struct sk_buff *skb)
 	}
 
 	rcu_read_unlock_bh();
-err:
 	vrf_tx_error(skb->dev, skb);
-	return ret;
+	return -EINVAL;
 }
 
 static int vrf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1183,6 +1183,7 @@ static inline struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom,
 int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask);
 struct sk_buff *skb_realloc_headroom(struct sk_buff *skb,
 				     unsigned int headroom);
+struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom);
 struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int newheadroom,
 				int newtailroom, gfp_t priority);
 int __must_check skb_to_sgvec_nomark(struct sk_buff *skb, struct scatterlist *sg,
--- a/net/ax25/ax25_ip.c
+++ b/net/ax25/ax25_ip.c
@@ -193,10 +193,8 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb)
 	skb_pull(skb, AX25_KISS_HEADER_LEN);
 
 	if (digipeat != NULL) {
-		if ((ourskb = ax25_rt_build_path(skb, src, dst, route->digipeat)) == NULL) {
-			kfree_skb(skb);
+		if ((ourskb = ax25_rt_build_path(skb, src, dst, route->digipeat)) == NULL)
 			goto put;
-		}
 
 		skb = ourskb;
 	}
--- a/net/ax25/ax25_out.c
+++ b/net/ax25/ax25_out.c
@@ -325,7 +325,6 @@ void ax25_kick(ax25_cb *ax25)
 void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type)
 {
-	struct sk_buff *skbn;
 	unsigned char *ptr;
 	int headroom;
@@ -336,18 +335,12 @@ void ax25_transmit_buffer(ax25_cb *ax25, struct sk_buff *skb, int type)
 	headroom = ax25_addr_size(ax25->digipeat);
 
-	if (skb_headroom(skb) < headroom) {
-		if ((skbn = skb_realloc_headroom(skb, headroom)) == NULL) {
+	if (unlikely(skb_headroom(skb) < headroom)) {
+		skb = skb_expand_head(skb, headroom);
+		if (!skb) {
 			printk(KERN_CRIT "AX.25: ax25_transmit_buffer - out of memory\n");
-			kfree_skb(skb);
 			return;
 		}
-
-		if (skb->sk != NULL)
-			skb_set_owner_w(skbn, skb->sk);
-
-		consume_skb(skb);
-		skb = skbn;
 	}
 
 	ptr = skb_push(skb, headroom);
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -441,24 +441,17 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
 struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src,
 	ax25_address *dest, ax25_digi *digi)
 {
-	struct sk_buff *skbn;
 	unsigned char *bp;
 	int len;
 
 	len = digi->ndigi * AX25_ADDR_LEN;
 
-	if (skb_headroom(skb) < len) {
-		if ((skbn = skb_realloc_headroom(skb, len)) == NULL) {
+	if (unlikely(skb_headroom(skb) < len)) {
+		skb = skb_expand_head(skb, len);
+		if (!skb) {
 			printk(KERN_CRIT "AX.25: ax25_dg_build_path - out of memory\n");
 			return NULL;
 		}
-
-		if (skb->sk != NULL)
-			skb_set_owner_w(skbn, skb->sk);
-
-		consume_skb(skb);
-		skb = skbn;
 	}
 
 	bp = skb_push(skb, len);
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -2180,17 +2180,9 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
 	skb->tstamp = 0;
 
 	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
-		struct sk_buff *skb2;
-
-		skb2 = skb_realloc_headroom(skb, hh_len);
-		if (unlikely(!skb2)) {
-			kfree_skb(skb);
+		skb = skb_expand_head(skb, hh_len);
+		if (!skb)
 			return -ENOMEM;
-		}
-		if (skb->sk)
-			skb_set_owner_w(skb2, skb->sk);
-		consume_skb(skb);
-		skb = skb2;
 	}
 
 	rcu_read_lock_bh();
@@ -2214,8 +2206,7 @@ static int bpf_out_neigh_v6(struct net *net, struct sk_buff *skb,
 	}
 	rcu_read_unlock_bh();
 	if (dst)
-		IP6_INC_STATS(dev_net(dst->dev),
-			      ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
 out_drop:
 	kfree_skb(skb);
 	return -ENETDOWN;
@@ -2287,17 +2278,9 @@ static int bpf_out_neigh_v4(struct net *net, struct sk_buff *skb,
 	skb->tstamp = 0;
 
 	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
-		struct sk_buff *skb2;
-
-		skb2 = skb_realloc_headroom(skb, hh_len);
-		if (unlikely(!skb2)) {
-			kfree_skb(skb);
+		skb = skb_expand_head(skb, hh_len);
+		if (!skb)
 			return -ENOMEM;
-		}
-		if (skb->sk)
-			skb_set_owner_w(skb2, skb->sk);
-		consume_skb(skb);
-		skb = skb2;
 	}
 
 	rcu_read_lock_bh();
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -1789,6 +1789,48 @@ struct sk_buff *skb_realloc_headroom(struct sk_buff *skb, unsigned int headroom)
 }
 EXPORT_SYMBOL(skb_realloc_headroom);
 
+/**
+ *	skb_expand_head - reallocate header of &sk_buff
+ *	@skb: buffer to reallocate
+ *	@headroom: needed headroom
+ *
+ *	Unlike skb_realloc_headroom, this one does not allocate a new skb
+ *	if possible; it copies skb->sk to the new skb as needed
+ *	and frees the original skb in case of failure.
+ *
+ *	It expects an increase in headroom and generates a warning otherwise.
+ */
+struct sk_buff *skb_expand_head(struct sk_buff *skb, unsigned int headroom)
+{
+	int delta = headroom - skb_headroom(skb);
+
+	if (WARN_ONCE(delta <= 0,
+		      "%s is expecting an increase in the headroom", __func__))
+		return skb;
+
+	/* pskb_expand_head() might crash, if skb is shared */
+	if (skb_shared(skb)) {
+		struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
+
+		if (likely(nskb)) {
+			if (skb->sk)
+				skb_set_owner_w(nskb, skb->sk);
+			consume_skb(skb);
+		} else {
+			kfree_skb(skb);
+		}
+		skb = nskb;
+	}
+	if (skb &&
+	    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
+		kfree_skb(skb);
+		skb = NULL;
+	}
+	return skb;
+}
+EXPORT_SYMBOL(skb_expand_head);
+
 /**
  *	skb_copy_expand - copy and expand sk_buff
  *	@skb: buffer to copy
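
Two properties of the helper above are easy to miss, so a small sketch may
help. The wrapper prepare_headroom() is hypothetical, not part of the
series: it shows that callers should keep the skb_headroom() guard, because
the helper WARNs and returns the skb untouched when no increase is needed,
and that on any failure the original skb has already been freed, so the old
pointer must not be used again.

	/* Hypothetical wrapper showing the calling convention. */
	static struct sk_buff *prepare_headroom(struct sk_buff *skb,
						unsigned int needed)
	{
		/* Guard first: skb_expand_head() WARNs if headroom is
		 * already sufficient (delta <= 0) and returns the skb
		 * unchanged.
		 */
		if (skb_headroom(skb) >= needed)
			return skb;

		/* For a shared skb (skb->users > 1) the helper clones it,
		 * moves skb->sk ownership to the clone via skb_set_owner_w(),
		 * and releases our reference with consume_skb(). On failure
		 * it frees the skb and returns NULL, so no cleanup is left
		 * to the caller.
		 */
		return skb_expand_head(skb, needed);
	}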
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -198,19 +198,10 @@ static int ip_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
 	} else if (rt->rt_type == RTN_BROADCAST)
 		IP_UPD_PO_STATS(net, IPSTATS_MIB_OUTBCAST, skb->len);
 
 	/* Be paranoid, rather than too clever. */
 	if (unlikely(skb_headroom(skb) < hh_len && dev->header_ops)) {
-		struct sk_buff *skb2;
-
-		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
-		if (!skb2) {
-			kfree_skb(skb);
+		skb = skb_expand_head(skb, hh_len);
+		if (!skb)
 			return -ENOMEM;
-		}
-		if (skb->sk)
-			skb_set_owner_w(skb2, skb->sk);
-		consume_skb(skb);
-		skb = skb2;
 	}
 
 	if (lwtunnel_xmit_redirect(dst->lwtstate)) {
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -60,46 +60,29 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
 	struct net_device *dev = dst->dev;
+	struct inet6_dev *idev = ip6_dst_idev(dst);
 	unsigned int hh_len = LL_RESERVED_SPACE(dev);
-	int delta = hh_len - skb_headroom(skb);
-	const struct in6_addr *nexthop;
+	const struct in6_addr *daddr, *nexthop;
+	struct ipv6hdr *hdr;
 	struct neighbour *neigh;
 	int ret;
 
 	/* Be paranoid, rather than too clever. */
-	if (unlikely(delta > 0) && dev->header_ops) {
-		/* pskb_expand_head() might crash, if skb is shared */
-		if (skb_shared(skb)) {
-			struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC);
-
-			if (likely(nskb)) {
-				if (skb->sk)
-					skb_set_owner_w(nskb, skb->sk);
-				consume_skb(skb);
-			} else {
-				kfree_skb(skb);
-			}
-			skb = nskb;
-		}
-		if (skb &&
-		    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) {
-			kfree_skb(skb);
-			skb = NULL;
-		}
+	if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
+		skb = skb_expand_head(skb, hh_len);
 		if (!skb) {
-			IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
+			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 			return -ENOMEM;
 		}
 	}
 
-	if (ipv6_addr_is_multicast(&ipv6_hdr(skb)->daddr)) {
-		struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb));
-
+	hdr = ipv6_hdr(skb);
+	daddr = &hdr->daddr;
+	if (ipv6_addr_is_multicast(daddr)) {
 		if (!(dev->flags & IFF_LOOPBACK) && sk_mc_loop(sk) &&
 		    ((mroute6_is_socket(net, skb) &&
 		     !(IP6CB(skb)->flags & IP6SKB_FORWARDED)) ||
-		     ipv6_chk_mcast_addr(dev, &ipv6_hdr(skb)->daddr,
-					 &ipv6_hdr(skb)->saddr))) {
+		     ipv6_chk_mcast_addr(dev, daddr, &hdr->saddr))) {
 			struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
 
 			/* Do not check for IFF_ALLMULTI; multicast routing
@@ -110,7 +93,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
 				net, sk, newskb, NULL, newskb->dev,
 				dev_loopback_xmit);
 
-			if (ipv6_hdr(skb)->hop_limit == 0) {
+			if (hdr->hop_limit == 0) {
 				IP6_INC_STATS(net, idev,
 					      IPSTATS_MIB_OUTDISCARDS);
 				kfree_skb(skb);
@@ -119,9 +102,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
 		}
 
 		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, skb->len);
-		if (IPV6_ADDR_MC_SCOPE(&ipv6_hdr(skb)->daddr) <=
-		    IPV6_ADDR_SCOPE_NODELOCAL &&
+		if (IPV6_ADDR_MC_SCOPE(daddr) <= IPV6_ADDR_SCOPE_NODELOCAL &&
 		    !(dev->flags & IFF_LOOPBACK)) {
 			kfree_skb(skb);
 			return 0;
@@ -136,10 +117,10 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
 	}
 
 	rcu_read_lock_bh();
-	nexthop = rt6_nexthop((struct rt6_info *)dst, &ipv6_hdr(skb)->daddr);
-	neigh = __ipv6_neigh_lookup_noref(dst->dev, nexthop);
+	nexthop = rt6_nexthop((struct rt6_info *)dst, daddr);
+	neigh = __ipv6_neigh_lookup_noref(dev, nexthop);
 	if (unlikely(!neigh))
-		neigh = __neigh_create(&nd_tbl, nexthop, dst->dev, false);
+		neigh = __neigh_create(&nd_tbl, nexthop, dev, false);
 	if (!IS_ERR(neigh)) {
 		sock_confirm_neigh(skb, neigh);
 		ret = neigh_output(neigh, skb, false);
@@ -148,7 +129,7 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *skb)
 	}
 	rcu_read_unlock_bh();
 
-	IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTNOROUTES);
+	IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTNOROUTES);
 	kfree_skb(skb);
 	return -EINVAL;
 }
@@ -268,6 +249,8 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 	const struct ipv6_pinfo *np = inet6_sk(sk);
 	struct in6_addr *first_hop = &fl6->daddr;
 	struct dst_entry *dst = skb_dst(skb);
+	struct net_device *dev = dst->dev;
+	struct inet6_dev *idev = ip6_dst_idev(dst);
 	unsigned int head_room;
 	struct ipv6hdr *hdr;
 	u8 proto = fl6->flowi6_proto;
@@ -275,22 +258,16 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 	int hlimit = -1;
 	u32 mtu;
 
-	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dst->dev);
+	head_room = sizeof(struct ipv6hdr) + LL_RESERVED_SPACE(dev);
 	if (opt)
 		head_room += opt->opt_nflen + opt->opt_flen;
 
-	if (unlikely(skb_headroom(skb) < head_room)) {
-		struct sk_buff *skb2 = skb_realloc_headroom(skb, head_room);
-		if (!skb2) {
-			IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)),
-				      IPSTATS_MIB_OUTDISCARDS);
-			kfree_skb(skb);
+	if (unlikely(head_room > skb_headroom(skb))) {
+		skb = skb_expand_head(skb, head_room);
+		if (!skb) {
+			IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
 			return -ENOBUFS;
 		}
-		if (skb->sk)
-			skb_set_owner_w(skb2, skb->sk);
-		consume_skb(skb);
-		skb = skb2;
 	}
 
 	if (opt) {
@@ -332,8 +309,7 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 	mtu = dst_mtu(dst);
 	if ((skb->len <= mtu) || skb->ignore_df || skb_is_gso(skb)) {
-		IP6_UPD_PO_STATS(net, ip6_dst_idev(skb_dst(skb)),
-				 IPSTATS_MIB_OUT, skb->len);
+		IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUT, skb->len);
 
 		/* if egress device is enslaved to an L3 master device pass the
 		 * skb to its handler for processing
@@ -346,17 +322,17 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
 		 * we promote our socket to non const
 		 */
 		return NF_HOOK(NFPROTO_IPV6, NF_INET_LOCAL_OUT,
-			       net, (struct sock *)sk, skb, NULL, dst->dev,
+			       net, (struct sock *)sk, skb, NULL, dev,
 			       dst_output);
 	}
 
-	skb->dev = dst->dev;
+	skb->dev = dev;
	/* ipv6_local_error() does not require socket lock,
	 * we promote our socket to non const
	 */
 	ipv6_local_error((struct sock *)sk, EMSGSIZE, fl6, mtu);
 
-	IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_FRAGFAILS);
+	IP6_INC_STATS(net, idev, IPSTATS_MIB_FRAGFAILS);
 	kfree_skb(skb);
 	return -EMSGSIZE;
 }