Merge branch '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue

Jeff Kirsher says: ==================== 100GbE Intel Wired LAN Driver Updates 2019-03-25 This series contains updates to the ice driver only. Victor updates the ice driver to be able to update the VSI queue configuration dynamically, by providing the ability to increase or decrease the VSI's number of queues. Michal fixes an issue when the VM starts or the VF driver is reloaded, the VLAN switch rule was lost (i.e. not added), so ensure it gets added in these cases. Brett updates the driver to support link events over the admin receive queue, instead of polling link events. Maciej refactors the code a bit to introduce a new function to fetch the receiver buffer and do the DMA synchronization to reduce the code duplication. Also added ice_can_reuse_rx_page() to verify whether the page can be reused so that in the future, we can use this check elsewhere in the driver. Additional driver optimizations so that we can drop the ice_pull_tail() altogether. Added support for bulk updates of refcount instead of doing it one by one. Refactored the page counting and buffer recycling so that we can use this code to clean up receive buffers when there is no skb allocated, like XDP. Added DMA_ATTR_WEAK_ORDERING and DMA_ATTR_SKIP_CPU_SYNC attributes to the DMA API during the mapping operations on the receive side, so that nonx86 platforms will be able to sync with what is being used (2k buffers) instead of the entire page. Dave fixes the driver to perform the most intrusive of the resets requested and clear the other request bits so that we do not end up with repeated reset, after reset. Bruce adds a iterator macro to clean up several for() loops. Chinh modifies the packet flags to be more generic so that they can be used for both receive and transmit. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>

Merge branch '100GbE' of git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue
Jeff Kirsher says: ==================== 100GbE Intel Wired LAN Driver Updates 2019-03-25 This series contains updates to the ice driver only. Victor updates the ice driver to be able to update the VSI queue configuration dynamically, by providing the ability to increase or decrease the VSI's number of queues. Michal fixes an issue when the VM starts or the VF driver is reloaded, the VLAN switch rule was lost (i.e. not added), so ensure it gets added in these cases. Brett updates the driver to support link events over the admin receive queue, instead of polling link events. Maciej refactors the code a bit to introduce a new function to fetch the receiver buffer and do the DMA synchronization to reduce the code duplication. Also added ice_can_reuse_rx_page() to verify whether the page can be reused so that in the future, we can use this check elsewhere in the driver. Additional driver optimizations so that we can drop the ice_pull_tail() altogether. Added support for bulk updates of refcount instead of doing it one by one. Refactored the page counting and buffer recycling so that we can use this code to clean up receive buffers when there is no skb allocated, like XDP. Added DMA_ATTR_WEAK_ORDERING and DMA_ATTR_SKIP_CPU_SYNC attributes to the DMA API during the mapping operations on the receive side, so that nonx86 platforms will be able to sync with what is being used (2k buffers) instead of the entire page. Dave fixes the driver to perform the most intrusive of the resets requested and clear the other request bits so that we do not end up with repeated reset, after reset. Bruce adds a iterator macro to clean up several for() loops. Chinh modifies the packet flags to be more generic so that they can be used for both receive and transmit. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
be67101f · David S. Miller · 68cc2999 · 86e81794 · be67101f · be67101f
Commit be67101f authored Mar 26, 2019 by David S. Miller
10 changed files
--- a/drivers/net/ethernet/intel/ice/ice_common.c
+++ b/drivers/net/ethernet/intel/ice/ice_common.c
@@ -262,7 +262,7 @@ static enum ice_media_type ice_get_media_type(struct ice_port_info *pi)
 *
 * Get Link Status (0x607). Returns the link status of the adapter.
 */
-static enum ice_status
+enum ice_status
 ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse,
 		     struct ice_link_status *link, struct ice_sq_cd *cd)
 {
@@ -358,22 +358,22 @@ static void ice_init_flex_flags(struct ice_hw *hw, enum ice_rxdid prof_id)
 	 */
 	case ICE_RXDID_FLEX_NIC:
 	case ICE_RXDID_FLEX_NIC_2:
-		ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_RXFLG_PKT_FRG,
-				   ICE_RXFLG_UDP_GRE, ICE_RXFLG_PKT_DSI,
-				   ICE_RXFLG_FIN, idx++);
+		ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_PKT_FRG,
+				   ICE_FLG_UDP_GRE, ICE_FLG_PKT_DSI,
+				   ICE_FLG_FIN, idx++);
 		/* flex flag 1 is not used for flexi-flag programming, skipping
 		 * these four FLG64 bits.
 		 */
-		ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_RXFLG_SYN, ICE_RXFLG_RST,
-				   ICE_RXFLG_PKT_DSI, ICE_RXFLG_PKT_DSI, idx++);
-		ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_RXFLG_PKT_DSI,
-				   ICE_RXFLG_PKT_DSI, ICE_RXFLG_EVLAN_x8100,
-				   ICE_RXFLG_EVLAN_x9100, idx++);
-		ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_RXFLG_VLAN_x8100,
-				   ICE_RXFLG_TNL_VLAN, ICE_RXFLG_TNL_MAC,
-				   ICE_RXFLG_TNL0, idx++);
-		ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_RXFLG_TNL1, ICE_RXFLG_TNL2,
-				   ICE_RXFLG_PKT_DSI, ICE_RXFLG_PKT_DSI, idx);
+		ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_SYN, ICE_FLG_RST,
+				   ICE_FLG_PKT_DSI, ICE_FLG_PKT_DSI, idx++);
+		ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_PKT_DSI,
+				   ICE_FLG_PKT_DSI, ICE_FLG_EVLAN_x8100,
+				   ICE_FLG_EVLAN_x9100, idx++);
+		ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_VLAN_x8100,
+				   ICE_FLG_TNL_VLAN, ICE_FLG_TNL_MAC,
+				   ICE_FLG_TNL0, idx++);
+		ICE_PROG_FLG_ENTRY(hw, prof_id, ICE_FLG_TNL1, ICE_FLG_TNL2,
+				   ICE_FLG_PKT_DSI, ICE_FLG_PKT_DSI, idx);
 		break;

 	default:
@@ -2150,6 +2150,32 @@ ice_aq_set_link_restart_an(struct ice_port_info *pi, bool ena_link,
 	return ice_aq_send_cmd(pi->hw, &desc, NULL, 0, cd);
 }

+/**
+ * ice_aq_set_event_mask
+ * @hw: pointer to the HW struct
+ * @port_num: port number of the physical function
+ * @mask: event mask to be set
+ * @cd: pointer to command details structure or NULL
+ *
+ * Set event mask (0x0613)
+ */
+enum ice_status
+ice_aq_set_event_mask(struct ice_hw *hw, u8 port_num, u16 mask,
+		      struct ice_sq_cd *cd)
+{
+	struct ice_aqc_set_event_mask *cmd;
+	struct ice_aq_desc desc;
+
+	cmd = &desc.params.set_event_mask;
+
+	ice_fill_dflt_direct_cmd_desc(&desc, ice_aqc_opc_set_event_mask);
+
+	cmd->lport_num = port_num;
+
+	cmd->event_mask = cpu_to_le16(mask);
+	return ice_aq_send_cmd(hw, &desc, NULL, 0, cd);
+}
+
 /**
 * ice_aq_set_port_id_led
 * @pi: pointer to the port information
@@ -2923,7 +2949,7 @@ ice_cfg_vsi_qs(struct ice_port_info *pi, u16 vsi_handle, u8 tc_bitmap,

 	mutex_lock(&pi->sched_lock);

-	for (i = 0; i < ICE_MAX_TRAFFIC_CLASS; i++) {
+	ice_for_each_traffic_class(i) {
 		/* configuration is possible only if TC node is present */
 		if (!ice_sched_get_tc_node(pi, i))
 			continue;

--- a/drivers/net/ethernet/intel/ice/ice_common.h
+++ b/drivers/net/ethernet/intel/ice/ice_common.h
@@ -89,6 +89,12 @@ enum ice_status
 ice_aq_set_link_restart_an(struct ice_port_info *pi, bool ena_link,
 			   struct ice_sq_cd *cd);
 enum ice_status
+ice_aq_get_link_info(struct ice_port_info *pi, bool ena_lse,
+		     struct ice_link_status *link, struct ice_sq_cd *cd);
+enum ice_status
+ice_aq_set_event_mask(struct ice_hw *hw, u8 port_num, u16 mask,
+		      struct ice_sq_cd *cd);
+enum ice_status
 ice_aq_set_port_id_led(struct ice_port_info *pi, bool is_orig_mode,
 		       struct ice_sq_cd *cd);


--- a/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
+++ b/drivers/net/ethernet/intel/ice/ice_lan_tx_rx.h
@@ -208,23 +208,23 @@ enum ice_flex_rx_mdid {
 	ICE_RX_MDID_HASH_HIGH,
 };

-/* Rx Flag64 packet flag bits */
-enum ice_rx_flg64_bits {
-	ICE_RXFLG_PKT_DSI	= 0,
-	ICE_RXFLG_EVLAN_x8100	= 15,
-	ICE_RXFLG_EVLAN_x9100,
-	ICE_RXFLG_VLAN_x8100,
-	ICE_RXFLG_TNL_MAC	= 22,
-	ICE_RXFLG_TNL_VLAN,
-	ICE_RXFLG_PKT_FRG,
-	ICE_RXFLG_FIN		= 32,
-	ICE_RXFLG_SYN,
-	ICE_RXFLG_RST,
-	ICE_RXFLG_TNL0		= 38,
-	ICE_RXFLG_TNL1,
-	ICE_RXFLG_TNL2,
-	ICE_RXFLG_UDP_GRE,
-	ICE_RXFLG_RSVD		= 63
+/* RX/TX Flag64 packet flag bits */
+enum ice_flg64_bits {
+	ICE_FLG_PKT_DSI		= 0,
+	ICE_FLG_EVLAN_x8100	= 15,
+	ICE_FLG_EVLAN_x9100,
+	ICE_FLG_VLAN_x8100,
+	ICE_FLG_TNL_MAC		= 22,
+	ICE_FLG_TNL_VLAN,
+	ICE_FLG_PKT_FRG,
+	ICE_FLG_FIN		= 32,
+	ICE_FLG_SYN,
+	ICE_FLG_RST,
+	ICE_FLG_TNL0		= 38,
+	ICE_FLG_TNL1,
+	ICE_FLG_TNL2,
+	ICE_FLG_UDP_GRE,
+	ICE_FLG_RSVD		= 63
 };

 /* for ice_32byte_rx_flex_desc.ptype_flexi_flags0 member */

--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -856,7 +856,7 @@ static void ice_vsi_setup_q_map(struct ice_vsi *vsi, struct ice_vsi_ctx *ctxt)
 	/* find the (rounded up) power-of-2 of qcount */
 	pow = order_base_2(qcount_rx);

-	for (i = 0; i < ICE_MAX_TRAFFIC_CLASS; i++) {
+	ice_for_each_traffic_class(i) {
 		if (!(vsi->tc_cfg.ena_tc & BIT(i))) {
 			/* TC is not enabled */
 			vsi->tc_cfg.tc_info[i].qoffset = 0;
@@ -1689,7 +1689,7 @@ ice_vsi_cfg_txqs(struct ice_vsi *vsi, struct ice_ring **rings, int offset)
 	num_q_grps = 1;

 	/* set up and configure the Tx queues for each enabled TC */
-	for (tc = 0; tc < ICE_MAX_TRAFFIC_CLASS; tc++) {
+	ice_for_each_traffic_class(tc) {
 		if (!(vsi->tc_cfg.ena_tc & BIT(tc)))
 			break;


--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -478,8 +478,14 @@ static void ice_reset_subtask(struct ice_pf *pf)
 	 * for the reset now), poll for reset done, rebuild and return.
 	 */
 	if (test_bit(__ICE_RESET_OICR_RECV, pf->state)) {
-		clear_bit(__ICE_GLOBR_RECV, pf->state);
-		clear_bit(__ICE_CORER_RECV, pf->state);
+		/* Perform the largest reset requested */
+		if (test_and_clear_bit(__ICE_CORER_RECV, pf->state))
+			reset_type = ICE_RESET_CORER;
+		if (test_and_clear_bit(__ICE_GLOBR_RECV, pf->state))
+			reset_type = ICE_RESET_GLOBR;
+		/* return if no valid reset type requested */
+		if (reset_type == ICE_RESET_INVAL)
+			return;
 		if (!test_bit(__ICE_PREPARED_FOR_RESET, pf->state))
 			ice_prepare_for_reset(pf);

@@ -698,9 +704,6 @@ static void ice_watchdog_subtask(struct ice_pf *pf)

 	pf->serv_tmr_prev = jiffies;

-	if (ice_link_event(pf, pf->hw.port_info))
-		dev_dbg(&pf->pdev->dev, "ice_link_event failed\n");
-
 	/* Update the stats for active netdevs so the network stack
 	 * can look at updated numbers whenever it cares to
 	 */
@@ -710,6 +713,60 @@ static void ice_watchdog_subtask(struct ice_pf *pf)
 			ice_update_vsi_stats(pf->vsi[i]);
 }

+/**
+ * ice_init_link_events - enable/initialize link events
+ * @pi: pointer to the port_info instance
+ *
+ * Returns -EIO on failure, 0 on success
+ */
+static int ice_init_link_events(struct ice_port_info *pi)
+{
+	u16 mask;
+
+	mask = ~((u16)(ICE_AQ_LINK_EVENT_UPDOWN | ICE_AQ_LINK_EVENT_MEDIA_NA |
+		       ICE_AQ_LINK_EVENT_MODULE_QUAL_FAIL));
+
+	if (ice_aq_set_event_mask(pi->hw, pi->lport, mask, NULL)) {
+		dev_dbg(ice_hw_to_dev(pi->hw),
+			"Failed to set link event mask for port %d\n",
+			pi->lport);
+		return -EIO;
+	}
+
+	if (ice_aq_get_link_info(pi, true, NULL, NULL)) {
+		dev_dbg(ice_hw_to_dev(pi->hw),
+			"Failed to enable link events for port %d\n",
+			pi->lport);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/**
+ * ice_handle_link_event - handle link event via ARQ
+ * @pf: pf that the link event is associated with
+ *
+ * Return -EINVAL if port_info is null
+ * Return status on success
+ */
+static int ice_handle_link_event(struct ice_pf *pf)
+{
+	struct ice_port_info *port_info;
+	int status;
+
+	port_info = pf->hw.port_info;
+	if (!port_info)
+		return -EINVAL;
+
+	status = ice_link_event(pf, port_info);
+	if (status)
+		dev_dbg(&pf->pdev->dev,
+			"Could not process link event, error %d\n", status);
+
+	return status;
+}
+
 /**
 * __ice_clean_ctrlq - helper function to clean controlq rings
 * @pf: ptr to struct ice_pf
@@ -813,6 +870,11 @@ static int __ice_clean_ctrlq(struct ice_pf *pf, enum ice_ctl_q q_type)
 		opcode = le16_to_cpu(event.desc.opcode);

 		switch (opcode) {
+		case ice_aqc_opc_get_link_status:
+			if (ice_handle_link_event(pf))
+				dev_err(&pf->pdev->dev,
+					"Could not handle link event\n");
+			break;
 		case ice_mbx_opc_send_msg_to_pf:
 			ice_vc_process_vf_msg(pf, &event);
 			break;
@@ -1248,10 +1310,9 @@ static int ice_vsi_req_irq_msix(struct ice_vsi *vsi, char *basename)
 			/* skip this unused q_vector */
 			continue;
 		}
-		err = devm_request_irq(&pf->pdev->dev,
-				       pf->msix_entries[base + vector].vector,
-				       vsi->irq_handler, 0, q_vector->name,
-				       q_vector);
+		err = devm_request_irq(&pf->pdev->dev, irq_num,
+				       vsi->irq_handler, 0,
+				       q_vector->name, q_vector);
 		if (err) {
 			netdev_err(vsi->netdev,
 				   "MSIX request_irq failed, error: %d\n", err);
@@ -2268,6 +2329,12 @@ ice_probe(struct pci_dev *pdev, const struct pci_device_id __always_unused *ent)
 	/* since everything is good, start the service timer */
 	mod_timer(&pf->serv_tmr, round_jiffies(jiffies + pf->serv_tmr_period));

+	err = ice_init_link_events(pf->hw.port_info);
+	if (err) {
+		dev_err(dev, "ice_init_link_events failed: %d\n", err);
+		goto err_alloc_sw_unroll;
+	}
+
 	ice_verify_cacheline_size(pf);

 	return 0;

--- a/drivers/net/ethernet/intel/ice/ice_sched.c
+++ b/drivers/net/ethernet/intel/ice/ice_sched.c
@@ -1268,42 +1268,6 @@ ice_sched_add_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle,
 	return 0;
 }

-/**
- * ice_sched_rm_vsi_child_nodes - remove VSI child nodes from the tree
- * @pi: port information structure
- * @vsi_node: pointer to the VSI node
- * @num_nodes: pointer to the num nodes that needs to be removed per layer
- * @owner: node owner (lan or rdma)
- *
- * This function removes the VSI child nodes from the tree. It gets called for
- * lan and rdma separately.
- */
-static void
-ice_sched_rm_vsi_child_nodes(struct ice_port_info *pi,
-			     struct ice_sched_node *vsi_node, u16 *num_nodes,
-			     u8 owner)
-{
-	struct ice_sched_node *node, *next;
-	u8 i, qgl, vsil;
-	u16 num;
-
-	qgl = ice_sched_get_qgrp_layer(pi->hw);
-	vsil = ice_sched_get_vsi_layer(pi->hw);
-
-	for (i = qgl; i > vsil; i--) {
-		num = num_nodes[i];
-		node = ice_sched_get_first_node(pi->hw, vsi_node, i);
-		while (node && num) {
-			next = node->sibling;
-			if (node->owner == owner && !node->num_children) {
-				ice_free_sched_node(pi, node);
-				num--;
-			}
-			node = next;
-		}
-	}
-}
-
 /**
 * ice_sched_calc_vsi_support_nodes - calculate number of VSI support nodes
 * @hw: pointer to the hw struct
@@ -1444,7 +1408,6 @@ static enum ice_status
 ice_sched_update_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle,
 				 u8 tc, u16 new_numqs, u8 owner)
 {
-	u16 prev_num_nodes[ICE_AQC_TOPO_MAX_LEVEL_NUM] = { 0 };
 	u16 new_num_nodes[ICE_AQC_TOPO_MAX_LEVEL_NUM] = { 0 };
 	struct ice_sched_node *vsi_node;
 	struct ice_sched_node *tc_node;
@@ -1452,7 +1415,6 @@ ice_sched_update_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle,
 	enum ice_status status = 0;
 	struct ice_hw *hw = pi->hw;
 	u16 prev_numqs;
-	u8 i;

 	tc_node = ice_sched_get_tc_node(pi, tc);
 	if (!tc_node)
@@ -1471,33 +1433,22 @@ ice_sched_update_vsi_child_nodes(struct ice_port_info *pi, u16 vsi_handle,
 	else
 		return ICE_ERR_PARAM;

-	/* num queues are not changed */
-	if (prev_numqs == new_numqs)
+	/* num queues are not changed or less than the previous number */
+	if (new_numqs <= prev_numqs)
 		return status;
-
-	/* calculate number of nodes based on prev/new number of qs */
-	if (prev_numqs)
-		ice_sched_calc_vsi_child_nodes(hw, prev_numqs, prev_num_nodes);
-
 	if (new_numqs)
 		ice_sched_calc_vsi_child_nodes(hw, new_numqs, new_num_nodes);
-
-	if (prev_numqs > new_numqs) {
-		for (i = 0; i < ICE_AQC_TOPO_MAX_LEVEL_NUM; i++)
-			new_num_nodes[i] = prev_num_nodes[i] - new_num_nodes[i];
-
-		ice_sched_rm_vsi_child_nodes(pi, vsi_node, new_num_nodes,
-					     owner);
-	} else {
-		for (i = 0; i < ICE_AQC_TOPO_MAX_LEVEL_NUM; i++)
-			new_num_nodes[i] -= prev_num_nodes[i];
-
-		status = ice_sched_add_vsi_child_nodes(pi, vsi_handle, tc_node,
-						       new_num_nodes, owner);
-		if (status)
-			return status;
-	}
-
+	/* Keep the max number of queue configuration all the time. Update the
+	 * tree only if number of queues > previous number of queues. This may
+	 * leave some extra nodes in the tree if number of queues < previous
+	 * number but that wouldn't harm anything. Removing those extra nodes
+	 * may complicate the code if those nodes are part of SRL or
+	 * individually rate limited.
+	 */
+	status = ice_sched_add_vsi_child_nodes(pi, vsi_handle, tc_node,
+					       new_num_nodes, owner);
+	if (status)
+		return status;
 	vsi_ctx->sched.max_lanq[tc] = new_numqs;

 	return 0;
@@ -1655,7 +1606,7 @@ ice_sched_rm_vsi_cfg(struct ice_port_info *pi, u16 vsi_handle, u8 owner)
 	if (!vsi_ctx)
 		goto exit_sched_rm_vsi_cfg;

-	for (i = 0; i < ICE_MAX_TRAFFIC_CLASS; i++) {
+	ice_for_each_traffic_class(i) {
 		struct ice_sched_node *vsi_node, *tc_node;
 		u8 j = 0;


--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
@@ -282,8 +282,17 @@ void ice_clean_rx_ring(struct ice_ring *rx_ring)
 		if (!rx_buf->page)
 			continue;

-		dma_unmap_page(dev, rx_buf->dma, PAGE_SIZE, DMA_FROM_DEVICE);
-		__free_pages(rx_buf->page, 0);
+		/* Invalidate cache lines that may have been written to by
+		 * device so that we avoid corrupting memory.
+		 */
+		dma_sync_single_range_for_cpu(dev, rx_buf->dma,
+					      rx_buf->page_offset,
+					      ICE_RXBUF_2048, DMA_FROM_DEVICE);
+
+		/* free resources associated with mapping */
+		dma_unmap_page_attrs(dev, rx_buf->dma, PAGE_SIZE,
+				     DMA_FROM_DEVICE, ICE_RX_DMA_ATTR);
+		__page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias);

 		rx_buf->page = NULL;
 		rx_buf->page_offset = 0;
@@ -409,7 +418,8 @@ ice_alloc_mapped_page(struct ice_ring *rx_ring, struct ice_rx_buf *bi)
 	}

 	/* map page for use */
-	dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
+	dma = dma_map_page_attrs(rx_ring->dev, page, 0, PAGE_SIZE,
+				 DMA_FROM_DEVICE, ICE_RX_DMA_ATTR);

 	/* if mapping failed free memory back to system since
 	 * there isn't much point in holding memory we can't use
@@ -423,6 +433,8 @@ ice_alloc_mapped_page(struct ice_ring *rx_ring, struct ice_rx_buf *bi)
 	bi->dma = dma;
 	bi->page = page;
 	bi->page_offset = 0;
+	page_ref_add(page, USHRT_MAX - 1);
+	bi->pagecnt_bias = USHRT_MAX;

 	return true;
 }
@@ -452,6 +464,12 @@ bool ice_alloc_rx_bufs(struct ice_ring *rx_ring, u16 cleaned_count)
 		if (!ice_alloc_mapped_page(rx_ring, bi))
 			goto no_bufs;

+		/* sync the buffer for use by the device */
+		dma_sync_single_range_for_device(rx_ring->dev, bi->dma,
+						 bi->page_offset,
+						 ICE_RXBUF_2048,
+						 DMA_FROM_DEVICE);
+
 		/* Refresh the desc even if buffer_addrs didn't change
 		 * because each write-back erases this info.
 		 */
@@ -497,61 +515,43 @@ static bool ice_page_is_reserved(struct page *page)
 }

 /**
- * ice_add_rx_frag - Add contents of Rx buffer to sk_buff
- * @rx_buf: buffer containing page to add
- * @rx_desc: descriptor containing length of buffer written by hardware
- * @skb: sk_buf to place the data into
- *
- * This function will add the data contained in rx_buf->page to the skb.
- * This is done either through a direct copy if the data in the buffer is
- * less than the skb header size, otherwise it will just attach the page as
- * a frag to the skb.
+ * ice_rx_buf_adjust_pg_offset - Prepare Rx buffer for reuse
+ * @rx_buf: Rx buffer to adjust
+ * @size: Size of adjustment
 *
- * The function will then update the page offset if necessary and return
- * true if the buffer can be reused by the adapter.
+ * Update the offset within page so that Rx buf will be ready to be reused.
+ * For systems with PAGE_SIZE < 8192 this function will flip the page offset
+ * so the second half of page assigned to Rx buffer will be used, otherwise
+ * the offset is moved by the @size bytes
 */
-static bool
-ice_add_rx_frag(struct ice_rx_buf *rx_buf, union ice_32b_rx_flex_desc *rx_desc,
-		struct sk_buff *skb)
+static void
+ice_rx_buf_adjust_pg_offset(struct ice_rx_buf *rx_buf, unsigned int size)
 {
 #if (PAGE_SIZE < 8192)
-	unsigned int truesize = ICE_RXBUF_2048;
+	/* flip page offset to other buffer */
+	rx_buf->page_offset ^= size;
 #else
-	unsigned int last_offset = PAGE_SIZE - ICE_RXBUF_2048;
-	unsigned int truesize;
-#endif /* PAGE_SIZE < 8192) */
-
-	struct page *page;
-	unsigned int size;
-
-	size = le16_to_cpu(rx_desc->wb.pkt_len) &
-		ICE_RX_FLX_DESC_PKT_LEN_M;
-
-	page = rx_buf->page;
+	/* move offset up to the next cache line */
+	rx_buf->page_offset += size;
+#endif
+}

+/**
+ * ice_can_reuse_rx_page - Determine if page can be reused for another Rx
+ * @rx_buf: buffer containing the page
+ *
+ * If page is reusable, we have a green light for calling ice_reuse_rx_page,
+ * which will assign the current buffer to the buffer that next_to_alloc is
+ * pointing to; otherwise, the DMA mapping needs to be destroyed and
+ * page freed
+ */
+static bool ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf)
+{
 #if (PAGE_SIZE >= 8192)
-	truesize = ALIGN(size, L1_CACHE_BYTES);
-#endif /* PAGE_SIZE >= 8192) */
-
-	/* will the data fit in the skb we allocated? if so, just
-	 * copy it as it is pretty small anyway
-	 */
-	if (size <= ICE_RX_HDR_SIZE && !skb_is_nonlinear(skb)) {
-		unsigned char *va = page_address(page) + rx_buf->page_offset;
-
-		memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
-
-		/* page is not reserved, we can reuse buffer as-is */
-		if (likely(!ice_page_is_reserved(page)))
-			return true;
-
-		/* this page cannot be reused so discard it */
-		__free_pages(page, 0);
-		return false;
-	}
-
-	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
-			rx_buf->page_offset, size, truesize);
+	unsigned int last_offset = PAGE_SIZE - ICE_RXBUF_2048;
+#endif
+	unsigned int pagecnt_bias = rx_buf->pagecnt_bias;
+	struct page *page = rx_buf->page;

 	/* avoid re-using remote pages */
 	if (unlikely(ice_page_is_reserved(page)))
@@ -559,27 +559,52 @@ ice_add_rx_frag(struct ice_rx_buf *rx_buf, union ice_32b_rx_flex_desc *rx_desc,

 #if (PAGE_SIZE < 8192)
 	/* if we are only owner of page we can reuse it */
-	if (unlikely(page_count(page) != 1))
+	if (unlikely((page_count(page) - pagecnt_bias) > 1))
 		return false;
-
-	/* flip page offset to other buffer */
-	rx_buf->page_offset ^= truesize;
 #else
-	/* move offset up to the next cache line */
-	rx_buf->page_offset += truesize;
-
 	if (rx_buf->page_offset > last_offset)
 		return false;
 #endif /* PAGE_SIZE < 8192) */

-	/* Even if we own the page, we are not allowed to use atomic_set()
-	 * This would break get_page_unless_zero() users.
+	/* If we have drained the page fragment pool we need to update
+	 * the pagecnt_bias and page count so that we fully restock the
+	 * number of references the driver holds.
 	 */
-	get_page(rx_buf->page);
+	if (unlikely(pagecnt_bias == 1)) {
+		page_ref_add(page, USHRT_MAX - 1);
+		rx_buf->pagecnt_bias = USHRT_MAX;
+	}

 	return true;
 }

+/**
+ * ice_add_rx_frag - Add contents of Rx buffer to sk_buff as a frag
+ * @rx_buf: buffer containing page to add
+ * @skb: sk_buff to place the data into
+ * @size: packet length from rx_desc
+ *
+ * This function will add the data contained in rx_buf->page to the skb.
+ * It will just attach the page as a frag to the skb.
+ * The function will then update the page offset.
+ */
+static void
+ice_add_rx_frag(struct ice_rx_buf *rx_buf, struct sk_buff *skb,
+		unsigned int size)
+{
+#if (PAGE_SIZE >= 8192)
+	unsigned int truesize = SKB_DATA_ALIGN(size);
+#else
+	unsigned int truesize = ICE_RXBUF_2048;
+#endif
+
+	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rx_buf->page,
+			rx_buf->page_offset, size, truesize);
+
+	/* page is being used so we must update the page offset */
+	ice_rx_buf_adjust_pg_offset(rx_buf, truesize);
+}
+
 /**
 * ice_reuse_rx_page - page flip buffer and store it back on the ring
 * @rx_ring: Rx descriptor ring to store buffers on
@@ -599,121 +624,132 @@ ice_reuse_rx_page(struct ice_ring *rx_ring, struct ice_rx_buf *old_buf)
 	nta++;
 	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;

-	/* transfer page from old buffer to new buffer */
-	*new_buf = *old_buf;
+	/* Transfer page from old buffer to new buffer.
+	 * Move each member individually to avoid possible store
+	 * forwarding stalls and unnecessary copy of skb.
+	 */
+	new_buf->dma = old_buf->dma;
+	new_buf->page = old_buf->page;
+	new_buf->page_offset = old_buf->page_offset;
+	new_buf->pagecnt_bias = old_buf->pagecnt_bias;
 }

 /**
- * ice_fetch_rx_buf - Allocate skb and populate it
+ * ice_get_rx_buf - Fetch Rx buffer and synchronize data for use
 * @rx_ring: Rx descriptor ring to transact packets on
- * @rx_desc: descriptor containing info written by hardware
+ * @skb: skb to be used
+ * @size: size of buffer to add to skb
 *
- * This function allocates an skb on the fly, and populates it with the page
- * data from the current receive descriptor, taking care to set up the skb
- * correctly, as well as handling calling the page recycle function if
- * necessary.
+ * This function will pull an Rx buffer from the ring and synchronize it
+ * for use by the CPU.
 */
-static struct sk_buff *
-ice_fetch_rx_buf(struct ice_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc)
+static struct ice_rx_buf *
+ice_get_rx_buf(struct ice_ring *rx_ring, struct sk_buff **skb,
+	       const unsigned int size)
 {
 	struct ice_rx_buf *rx_buf;
-	struct sk_buff *skb;
-	struct page *page;

 	rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
-	page = rx_buf->page;
-	prefetchw(page);
+	prefetchw(rx_buf->page);
+	*skb = rx_buf->skb;

-	skb = rx_buf->skb;
+	/* we are reusing so sync this buffer for CPU use */
+	dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma,
+				      rx_buf->page_offset, size,
+				      DMA_FROM_DEVICE);
+
+	/* We have pulled a buffer for use, so decrement pagecnt_bias */
+	rx_buf->pagecnt_bias--;
+
+	return rx_buf;
+}

-	if (likely(!skb)) {
-		u8 *page_addr = page_address(page) + rx_buf->page_offset;
+/**
+ * ice_construct_skb - Allocate skb and populate it
+ * @rx_ring: Rx descriptor ring to transact packets on
+ * @rx_buf: Rx buffer to pull data from
+ * @size: the length of the packet
+ *
+ * This function allocates an skb. It then populates it with the page
+ * data from the current receive descriptor, taking care to set up the
+ * skb correctly.
+ */
+static struct sk_buff *
+ice_construct_skb(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf,
+		  unsigned int size)
+{
+	void *va = page_address(rx_buf->page) + rx_buf->page_offset;
+	unsigned int headlen;
+	struct sk_buff *skb;

-		/* prefetch first cache line of first page */
-		prefetch(page_addr);
+	/* prefetch first cache line of first page */
+	prefetch(va);
 #if L1_CACHE_BYTES < 128
-		prefetch((void *)(page_addr + L1_CACHE_BYTES));
+	prefetch((u8 *)va + L1_CACHE_BYTES);
 #endif /* L1_CACHE_BYTES */

-		/* allocate a skb to store the frags */
-		skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
-				       ICE_RX_HDR_SIZE,
-				       GFP_ATOMIC | __GFP_NOWARN);
-		if (unlikely(!skb)) {
-			rx_ring->rx_stats.alloc_buf_failed++;
-			return NULL;
-		}
+	/* allocate a skb to store the frags */
+	skb = __napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE,
+			       GFP_ATOMIC | __GFP_NOWARN);
+	if (unlikely(!skb))
+		return NULL;

-		/* we will be copying header into skb->data in
-		 * pskb_may_pull so it is in our interest to prefetch
-		 * it now to avoid a possible cache miss
-		 */
-		prefetchw(skb->data);
+	skb_record_rx_queue(skb, rx_ring->q_index);
+	/* Determine available headroom for copy */
+	headlen = size;
+	if (headlen > ICE_RX_HDR_SIZE)
+		headlen = eth_get_headlen(va, ICE_RX_HDR_SIZE);

-		skb_record_rx_queue(skb, rx_ring->q_index);
-	} else {
-		/* we are reusing so sync this buffer for CPU use */
-		dma_sync_single_range_for_cpu(rx_ring->dev, rx_buf->dma,
-					      rx_buf->page_offset,
-					      ICE_RXBUF_2048,
-					      DMA_FROM_DEVICE);
-
-		rx_buf->skb = NULL;
-	}
+	/* align pull length to size of long to optimize memcpy performance */
+	memcpy(__skb_put(skb, headlen), va, ALIGN(headlen, sizeof(long)));

-	/* pull page into skb */
-	if (ice_add_rx_frag(rx_buf, rx_desc, skb)) {
-		/* hand second half of page back to the ring */
-		ice_reuse_rx_page(rx_ring, rx_buf);
-		rx_ring->rx_stats.page_reuse_count++;
+	/* if we exhaust the linear part then add what is left as a frag */
+	size -= headlen;
+	if (size) {
+#if (PAGE_SIZE >= 8192)
+		unsigned int truesize = SKB_DATA_ALIGN(size);
+#else
+		unsigned int truesize = ICE_RXBUF_2048;
+#endif
+		skb_add_rx_frag(skb, 0, rx_buf->page,
+				rx_buf->page_offset + headlen, size, truesize);
+		/* buffer is used by skb, update page_offset */
+		ice_rx_buf_adjust_pg_offset(rx_buf, truesize);
 	} else {
-		/* we are not reusing the buffer so unmap it */
-		dma_unmap_page(rx_ring->dev, rx_buf->dma, PAGE_SIZE,
-			       DMA_FROM_DEVICE);
+		/* buffer is unused, reset bias back to rx_buf; data was copied
+		 * onto skb's linear part so there's no need for adjusting
+		 * page offset and we can reuse this buffer as-is
+		 */
+		rx_buf->pagecnt_bias++;
 	}

-	/* clear contents of buffer_info */
-	rx_buf->page = NULL;
-
 	return skb;
 }

 /**
- * ice_pull_tail - ice specific version of skb_pull_tail
- * @skb: pointer to current skb being adjusted
+ * ice_put_rx_buf - Clean up used buffer and either recycle or free
+ * @rx_ring: Rx descriptor ring to transact packets on
+ * @rx_buf: Rx buffer to pull data from
 *
- * This function is an ice specific version of __pskb_pull_tail. The
- * main difference between this version and the original function is that
- * this function can make several assumptions about the state of things
- * that allow for significant optimizations versus the standard function.
- * As a result we can do things like drop a frag and maintain an accurate
- * truesize for the skb.
+ * This function will  clean up the contents of the rx_buf. It will
+ * either recycle the buffer or unmap it and free the associated resources.
 */
-static void ice_pull_tail(struct sk_buff *skb)
+static void ice_put_rx_buf(struct ice_ring *rx_ring, struct ice_rx_buf *rx_buf)
 {
-	struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
-	unsigned int pull_len;
-	unsigned char *va;
-
-	/* it is valid to use page_address instead of kmap since we are
-	 * working with pages allocated out of the lomem pool per
-	 * alloc_page(GFP_ATOMIC)
-	 */
-	va = skb_frag_address(frag);
-
-	/* we need the header to contain the greater of either ETH_HLEN or
-	 * 60 bytes if the skb->len is less than 60 for skb_pad.
-	 */
-	pull_len = eth_get_headlen(va, ICE_RX_HDR_SIZE);
-
-	/* align pull length to size of long to optimize memcpy performance */
-	skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));
+		/* hand second half of page back to the ring */
+	if (ice_can_reuse_rx_page(rx_buf)) {
+		ice_reuse_rx_page(rx_ring, rx_buf);
+		rx_ring->rx_stats.page_reuse_count++;
+	} else {
+		/* we are not reusing the buffer so unmap it */
+		dma_unmap_page_attrs(rx_ring->dev, rx_buf->dma, PAGE_SIZE,
+				     DMA_FROM_DEVICE, ICE_RX_DMA_ATTR);
+		__page_frag_cache_drain(rx_buf->page, rx_buf->pagecnt_bias);
+	}

-	/* update all of the pointers */
-	skb_frag_size_sub(frag, pull_len);
-	frag->page_offset += pull_len;
-	skb->data_len -= pull_len;
-	skb->tail += pull_len;
+	/* clear contents of buffer_info */
+	rx_buf->page = NULL;
+	rx_buf->skb = NULL;
 }

 /**
@@ -730,10 +766,6 @@ static void ice_pull_tail(struct sk_buff *skb)
 */
 static bool ice_cleanup_headers(struct sk_buff *skb)
 {
-	/* place header in linear portion of buffer */
-	if (skb_is_nonlinear(skb))
-		ice_pull_tail(skb);
-
 	/* if eth_skb_pad returns an error the skb was freed */
 	if (eth_skb_pad(skb))
 		return true;
@@ -963,7 +995,9 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget)
 	/* start the loop to process RX packets bounded by 'budget' */
 	while (likely(total_rx_pkts < (unsigned int)budget)) {
 		union ice_32b_rx_flex_desc *rx_desc;
+		struct ice_rx_buf *rx_buf;
 		struct sk_buff *skb;
+		unsigned int size;
 		u16 stat_err_bits;
 		u16 vlan_tag = 0;
 		u8 rx_ptype;
@@ -993,11 +1027,24 @@ static int ice_clean_rx_irq(struct ice_ring *rx_ring, int budget)
 		 */
 		dma_rmb();

+		size = le16_to_cpu(rx_desc->wb.pkt_len) &
+			ICE_RX_FLX_DESC_PKT_LEN_M;
+
+		rx_buf = ice_get_rx_buf(rx_ring, &skb, size);
 		/* allocate (if needed) and populate skb */
-		skb = ice_fetch_rx_buf(rx_ring, rx_desc);
-		if (!skb)
+		if (skb)
+			ice_add_rx_frag(rx_buf, skb, size);
+		else
+			skb = ice_construct_skb(rx_ring, rx_buf, size);
+
+		/* exit if we failed to retrieve a buffer */
+		if (!skb) {
+			rx_ring->rx_stats.alloc_buf_failed++;
+			rx_buf->pagecnt_bias++;
 			break;
+		}

+		ice_put_rx_buf(rx_ring, rx_buf);
 		cleaned_count++;

 		/* skip if it is NOP desc */

--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
+++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
@@ -47,6 +47,9 @@
 #define ICE_TX_FLAGS_VLAN_M	0xffff0000
 #define ICE_TX_FLAGS_VLAN_S	16

+#define ICE_RX_DMA_ATTR \
+	(DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)
+
 struct ice_tx_buf {
 	struct ice_tx_desc *next_to_watch;
 	struct sk_buff *skb;
@@ -73,6 +76,7 @@ struct ice_rx_buf {
 	dma_addr_t dma;
 	struct page *page;
 	unsigned int page_offset;
+	u16 pagecnt_bias;
 };

 struct ice_q_stats {

--- a/drivers/net/ethernet/intel/ice/ice_type.h
+++ b/drivers/net/ethernet/intel/ice/ice_type.h
@@ -210,6 +210,9 @@ struct ice_nvm_info {
 #define ICE_MAX_TRAFFIC_CLASS 8
 #define ICE_TXSCHED_MAX_BRANCHES ICE_MAX_TRAFFIC_CLASS

+#define ice_for_each_traffic_class(_i)	\
+	for ((_i) = 0; (_i) < ICE_MAX_TRAFFIC_CLASS; (_i)++)
+
 struct ice_sched_node {
 	struct ice_sched_node *parent;
 	struct ice_sched_node *sibling; /* next sibling in the same layer */

--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
@@ -457,8 +457,10 @@ static int ice_alloc_vsi_res(struct ice_vf *vf)
 	vsi->hw_base_vector += 1;

 	/* Check if port VLAN exist before, and restore it accordingly */
-	if (vf->port_vlan_id)
+	if (vf->port_vlan_id) {
 		ice_vsi_manage_pvid(vsi, vf->port_vlan_id, true);
+		ice_vsi_add_vlan(vsi, vf->port_vlan_id & ICE_VLAN_M);
+	}

 	eth_broadcast_addr(broadcast);

@@ -1925,6 +1927,9 @@ static int ice_vc_cfg_qs_msg(struct ice_vf *vf, u8 *msg)
 	 */
 	vsi->num_txq = qci->num_queue_pairs;
 	vsi->num_rxq = qci->num_queue_pairs;
+	/* All queues of VF VSI are in TC 0 */
+	vsi->tc_cfg.tc_info[0].qcount_tx = qci->num_queue_pairs;
+	vsi->tc_cfg.tc_info[0].qcount_rx = qci->num_queue_pairs;

 	if (!ice_vsi_cfg_lan_txqs(vsi) && !ice_vsi_cfg_rxqs(vsi))
 		aq_ret = 0;