ice: convert VF storage to hash table with krefs and RCU

The ice driver stores VF structures in a simple array which is allocated once at the time of VF creation. The VF structures are then accessed from the array by their VF ID. The ID must be between 0 and the number of allocated VFs. Multiple threads can access this table: * .ndo operations such as .ndo_get_vf_cfg or .ndo_set_vf_trust * interrupts, such as due to messages from the VF using the virtchnl communication * processing such as device reset * commands to add or remove VFs The current implementation does not keep track of when all threads are done operating on a VF and can potentially result in use-after-free issues caused by one thread accessing a VF structure after it has been released when removing VFs. Some of these are prevented with various state flags and checks. In addition, this structure is quite static and does not support a planned future where virtualization can be more dynamic. As we begin to look at supporting Scalable IOV with the ice driver (as opposed to just supporting Single Root IOV), this structure is not sufficient. In the future, VFs will be able to be added and removed individually and dynamically. To allow for this, and to better protect against a whole class of use-after-free bugs, replace the VF storage with a combination of a hash table and krefs to reference track all of the accesses to VFs through the hash table. A hash table still allows efficient look up of the VF given its ID, but also allows adding and removing VFs. It does not require contiguous VF IDs. The use of krefs allows the cleanup of the VF memory to be delayed until after all threads have released their reference (by calling ice_put_vf). To prevent corruption of the hash table, a combination of RCU and the mutex table_lock are used. Addition and removal from the hash table use the RCU-aware hash macros. This allows simple read-only look ups that iterate to locate a single VF can be fast using RCU. Accesses which modify the hash table, or which can't take RCU because they sleep, will hold the mutex lock. By using this design, we have a stronger guarantee that the VF structure can't be released until after all threads are finished operating on it. We also pave the way for the more dynamic Scalable IOV implementation in the future. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>

ice: convert VF storage to hash table with krefs and RCU
The ice driver stores VF structures in a simple array which is allocated once at the time of VF creation. The VF structures are then accessed from the array by their VF ID. The ID must be between 0 and the number of allocated VFs. Multiple threads can access this table: * .ndo operations such as .ndo_get_vf_cfg or .ndo_set_vf_trust * interrupts, such as due to messages from the VF using the virtchnl communication * processing such as device reset * commands to add or remove VFs The current implementation does not keep track of when all threads are done operating on a VF and can potentially result in use-after-free issues caused by one thread accessing a VF structure after it has been released when removing VFs. Some of these are prevented with various state flags and checks. In addition, this structure is quite static and does not support a planned future where virtualization can be more dynamic. As we begin to look at supporting Scalable IOV with the ice driver (as opposed to just supporting Single Root IOV), this structure is not sufficient. In the future, VFs will be able to be added and removed individually and dynamically. To allow for this, and to better protect against a whole class of use-after-free bugs, replace the VF storage with a combination of a hash table and krefs to reference track all of the accesses to VFs through the hash table. A hash table still allows efficient look up of the VF given its ID, but also allows adding and removing VFs. It does not require contiguous VF IDs. The use of krefs allows the cleanup of the VF memory to be delayed until after all threads have released their reference (by calling ice_put_vf). To prevent corruption of the hash table, a combination of RCU and the mutex table_lock are used. Addition and removal from the hash table use the RCU-aware hash macros. This allows simple read-only look ups that iterate to locate a single VF can be fast using RCU. Accesses which modify the hash table, or which can't take RCU because they sleep, will hold the mutex lock. By using this design, we have a stronger guarantee that the VF structure can't be released until after all threads are finished operating on it. We also pave the way for the more dynamic Scalable IOV implementation in the future. Signed-off-by: Jacob Keller <jacob.e.keller@intel.com> Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com> Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
3d5985a1 · Jacob Keller · Tony Nguyen · fb916db1 · 3d5985a1 · 3d5985a1
Commit 3d5985a1 authored Feb 16, 2022 by Jacob Keller Committed by Tony Nguyen Mar 03, 2022
8 changed files
--- a/drivers/net/ethernet/intel/ice/ice_eswitch.c
+++ b/drivers/net/ethernet/intel/ice/ice_eswitch.c
@@ -209,6 +209,8 @@ static void ice_eswitch_remap_rings_to_vectors(struct ice_pf *pf)
 		rx_ring->q_vector = q_vector;
 		rx_ring->next = NULL;
 		rx_ring->netdev = repr->netdev;
+
+		ice_put_vf(vf);
 	}
 }

@@ -223,6 +225,8 @@ ice_eswitch_release_reprs(struct ice_pf *pf, struct ice_vsi *ctrl_vsi)
 	struct ice_vf *vf;
 	unsigned int bkt;

+	lockdep_assert_held(&pf->vfs.table_lock);
+
 	ice_for_each_vf(pf, bkt, vf) {
 		struct ice_vsi *vsi = vf->repr->src_vsi;

@@ -251,6 +255,8 @@ static int ice_eswitch_setup_reprs(struct ice_pf *pf)
 	struct ice_vf *vf;
 	unsigned int bkt;

+	lockdep_assert_held(&pf->vfs.table_lock);
+
 	ice_for_each_vf(pf, bkt, vf) {
 		struct ice_vsi *vsi = vf->repr->src_vsi;

@@ -430,6 +436,8 @@ static void ice_eswitch_napi_del(struct ice_pf *pf)
 	struct ice_vf *vf;
 	unsigned int bkt;

+	lockdep_assert_held(&pf->vfs.table_lock);
+
 	ice_for_each_vf(pf, bkt, vf)
 		netif_napi_del(&vf->repr->q_vector->napi);
 }
@@ -443,6 +451,8 @@ static void ice_eswitch_napi_enable(struct ice_pf *pf)
 	struct ice_vf *vf;
 	unsigned int bkt;

+	lockdep_assert_held(&pf->vfs.table_lock);
+
 	ice_for_each_vf(pf, bkt, vf)
 		napi_enable(&vf->repr->q_vector->napi);
 }
@@ -456,6 +466,8 @@ static void ice_eswitch_napi_disable(struct ice_pf *pf)
 	struct ice_vf *vf;
 	unsigned int bkt;

+	lockdep_assert_held(&pf->vfs.table_lock);
+
 	ice_for_each_vf(pf, bkt, vf)
 		napi_disable(&vf->repr->q_vector->napi);
 }
@@ -629,6 +641,8 @@ static void ice_eswitch_start_all_tx_queues(struct ice_pf *pf)
 	struct ice_vf *vf;
 	unsigned int bkt;

+	lockdep_assert_held(&pf->vfs.table_lock);
+
 	if (test_bit(ICE_DOWN, pf->state))
 		return;

@@ -647,6 +661,8 @@ void ice_eswitch_stop_all_tx_queues(struct ice_pf *pf)
 	struct ice_vf *vf;
 	unsigned int bkt;

+	lockdep_assert_held(&pf->vfs.table_lock);
+
 	if (test_bit(ICE_DOWN, pf->state))
 		return;


--- a/drivers/net/ethernet/intel/ice/ice_ethtool.c
+++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c
@@ -316,15 +316,20 @@ ice_get_eeprom(struct net_device *netdev, struct ethtool_eeprom *eeprom,
 */
 static bool ice_active_vfs(struct ice_pf *pf)
 {
+	bool active = false;
 	struct ice_vf *vf;
 	unsigned int bkt;

-	ice_for_each_vf(pf, bkt, vf) {
-		if (test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states))
-			return true;
+	rcu_read_lock();
+	ice_for_each_vf_rcu(pf, bkt, vf) {
+		if (test_bit(ICE_VF_STATE_ACTIVE, vf->vf_states)) {
+			active = true;
+			break;
+		}
 	}
+	rcu_read_unlock();

-	return false;
+	return active;
 }

 /**

--- a/drivers/net/ethernet/intel/ice/ice_lib.c
+++ b/drivers/net/ethernet/intel/ice/ice_lib.c
@@ -439,8 +439,10 @@ static irqreturn_t ice_eswitch_msix_clean_rings(int __always_unused irq, void *d
 	if (!q_vector->tx.tx_ring && !q_vector->rx.rx_ring)
 		return IRQ_HANDLED;

-	ice_for_each_vf(pf, bkt, vf)
+	rcu_read_lock();
+	ice_for_each_vf_rcu(pf, bkt, vf)
 		napi_schedule(&vf->repr->q_vector->napi);
+	rcu_read_unlock();

 	return IRQ_HANDLED;
 }
@@ -1345,11 +1347,17 @@ static int ice_get_vf_ctrl_res(struct ice_pf *pf, struct ice_vsi *vsi)
 {
 	struct ice_vf *vf;
 	unsigned int bkt;
+	int base;

-	ice_for_each_vf(pf, bkt, vf) {
-		if (vf != vsi->vf && vf->ctrl_vsi_idx != ICE_NO_VSI)
-			return pf->vsi[vf->ctrl_vsi_idx]->base_vector;
+	rcu_read_lock();
+	ice_for_each_vf_rcu(pf, bkt, vf) {
+		if (vf != vsi->vf && vf->ctrl_vsi_idx != ICE_NO_VSI) {
+			base = pf->vsi[vf->ctrl_vsi_idx]->base_vector;
+			rcu_read_unlock();
+			return base;
+		}
 	}
+	rcu_read_unlock();

 	return ice_get_res(pf, pf->irq_tracker, vsi->num_q_vectors,
 			   ICE_RES_VF_CTRL_VEC_ID);
@@ -2894,10 +2902,14 @@ static void ice_free_vf_ctrl_res(struct ice_pf *pf,  struct ice_vsi *vsi)
 	struct ice_vf *vf;
 	unsigned int bkt;

-	ice_for_each_vf(pf, bkt, vf) {
-		if (vf != vsi->vf && vf->ctrl_vsi_idx != ICE_NO_VSI)
+	rcu_read_lock();
+	ice_for_each_vf_rcu(pf, bkt, vf) {
+		if (vf != vsi->vf && vf->ctrl_vsi_idx != ICE_NO_VSI) {
+			rcu_read_unlock();
 			return;
+		}
 	}
+	rcu_read_unlock();

 	/* No other VFs left that have control VSI. It is now safe to reclaim
 	 * SW interrupts back to the common pool.

--- a/drivers/net/ethernet/intel/ice/ice_main.c
+++ b/drivers/net/ethernet/intel/ice/ice_main.c
@@ -521,8 +521,10 @@ ice_prepare_for_reset(struct ice_pf *pf, enum ice_reset_req reset_type)
 		ice_vc_notify_reset(pf);

 	/* Disable VFs until reset is completed */
+	mutex_lock(&pf->vfs.table_lock);
 	ice_for_each_vf(pf, bkt, vf)
 		ice_set_vf_state_qs_dis(vf);
+	mutex_unlock(&pf->vfs.table_lock);

 	if (ice_is_eswitch_mode_switchdev(pf)) {
 		if (reset_type != ICE_RESET_PFR)
@@ -1756,6 +1758,7 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
 	/* Check to see if one of the VFs caused an MDD event, and then
 	 * increment counters and set print pending
 	 */
+	mutex_lock(&pf->vfs.table_lock);
 	ice_for_each_vf(pf, bkt, vf) {
 		reg = rd32(hw, VP_MDET_TX_PQM(vf->vf_id));
 		if (reg & VP_MDET_TX_PQM_VALID_M) {
@@ -1811,6 +1814,7 @@ static void ice_handle_mdd_event(struct ice_pf *pf)
 			}
 		}
 	}
+	mutex_unlock(&pf->vfs.table_lock);

 	ice_print_vfs_mdd_events(pf);
 }
@@ -3680,6 +3684,7 @@ static void ice_deinit_pf(struct ice_pf *pf)
 	mutex_destroy(&pf->sw_mutex);
 	mutex_destroy(&pf->tc_mutex);
 	mutex_destroy(&pf->avail_q_mutex);
+	mutex_destroy(&pf->vfs.table_lock);

 	if (pf->avail_txqs) {
 		bitmap_free(pf->avail_txqs);
@@ -3779,6 +3784,9 @@ static int ice_init_pf(struct ice_pf *pf)
 		return -ENOMEM;
 	}

+	mutex_init(&pf->vfs.table_lock);
+	hash_init(pf->vfs.table);
+
 	return 0;
 }


--- a/drivers/net/ethernet/intel/ice/ice_repr.c
+++ b/drivers/net/ethernet/intel/ice/ice_repr.c
@@ -341,6 +341,8 @@ void ice_repr_rem_from_all_vfs(struct ice_pf *pf)
 	struct ice_vf *vf;
 	unsigned int bkt;

+	lockdep_assert_held(&pf->vfs.table_lock);
+
 	ice_for_each_vf(pf, bkt, vf)
 		ice_repr_rem(vf);
 }
@@ -355,6 +357,8 @@ int ice_repr_add_for_all_vfs(struct ice_pf *pf)
 	unsigned int bkt;
 	int err;

+	lockdep_assert_held(&pf->vfs.table_lock);
+
 	ice_for_each_vf(pf, bkt, vf) {
 		err = ice_repr_add(vf);
 		if (err)

--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_fdir.c
@@ -1578,6 +1578,7 @@ void ice_flush_fdir_ctx(struct ice_pf *pf)
 	if (!test_and_clear_bit(ICE_FD_VF_FLUSH_CTX, pf->state))
 		return;

+	mutex_lock(&pf->vfs.table_lock);
 	ice_for_each_vf(pf, bkt, vf) {
 		struct device *dev = ice_pf_to_dev(pf);
 		enum virtchnl_fdir_prgm_status status;
@@ -1634,6 +1635,7 @@ void ice_flush_fdir_ctx(struct ice_pf *pf)
 		ctx->flags &= ~ICE_VF_FDIR_CTX_VALID;
 		spin_unlock_irqrestore(&vf->fdir.ctx_lock, flags);
 	}
+	mutex_unlock(&pf->vfs.table_lock);
 }

 /**

--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.c
--- a/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h
+++ b/drivers/net/ethernet/intel/ice/ice_virtchnl_pf.h
@@ -44,22 +44,45 @@
 * These functions provide abstraction for interacting with the VF hash table.
 * In general, direct access to the hash table should be avoided outside of
 * these functions where possible.
+ *
+ * The VF entries in the hash table are protected by reference counting to
+ * track lifetime of accesses from the table. The ice_get_vf_by_id() function
+ * obtains a reference to the VF structure which must be dropped by using
+ * ice_put_vf().
 */

 /**
 * ice_for_each_vf - Iterate over each VF entry
 * @pf: pointer to the PF private structure
 * @bkt: bucket index used for iteration
- * @entry: pointer to the VF entry currently being processed in the loop.
+ * @vf: pointer to the VF entry currently being processed in the loop.
 *
 * The bkt variable is an unsigned integer iterator used to traverse the VF
 * entries. It is *not* guaranteed to be the VF's vf_id. Do not assume it is.
 * Use vf->vf_id to get the id number if needed.
+ *
+ * The caller is expected to be under the table_lock mutex for the entire
+ * loop. Use this iterator if your loop is long or if it might sleep.
 */
-#define ice_for_each_vf(pf, bkt, entry)					\
-	for ((bkt) = 0, (entry) = &(pf)->vfs.table[0];			\
-	     (bkt) < (pf)->vfs.num_alloc;				\
-	     (bkt)++, (entry)++)
+#define ice_for_each_vf(pf, bkt, vf) \
+	hash_for_each((pf)->vfs.table, (bkt), (vf), entry)
+
+/**
+ * ice_for_each_vf_rcu - Iterate over each VF entry protected by RCU
+ * @pf: pointer to the PF private structure
+ * @bkt: bucket index used for iteration
+ * @vf: pointer to the VF entry currently being processed in the loop.
+ *
+ * The bkt variable is an unsigned integer iterator used to traverse the VF
+ * entries. It is *not* guaranteed to be the VF's vf_id. Do not assume it is.
+ * Use vf->vf_id to get the id number if needed.
+ *
+ * The caller is expected to be under rcu_read_lock() for the entire loop.
+ * Only use this iterator if your loop is short and you can guarantee it does
+ * not sleep.
+ */
+#define ice_for_each_vf_rcu(pf, bkt, vf) \
+	hash_for_each_rcu((pf)->vfs.table, (bkt), (vf), entry)

 /* Specific VF states */
 enum ice_vf_states {
@@ -125,8 +148,8 @@ struct ice_vc_vf_ops {

 /* Virtchnl/SR-IOV config info */
 struct ice_vfs {
-	struct ice_vf *table;		/* table of VF entries */
-	u16 num_alloc;			/* number of allocated VFs */
+	DECLARE_HASHTABLE(table, 8);	/* table of VF entries */
+	struct mutex table_lock;	/* Lock for protecting the hash table */
 	u16 num_supported;		/* max supported VFs on this PF */
 	u16 num_qps_per;		/* number of queue pairs per VF */
 	u16 num_msix_per;		/* number of MSI-X vectors per VF */
@@ -136,6 +159,9 @@ struct ice_vfs {

 /* VF information structure */
 struct ice_vf {
+	struct hlist_node entry;
+	struct rcu_head rcu;
+	struct kref refcnt;
 	struct ice_pf *pf;

 	/* Used during virtchnl message handling and NDO ops against the VF
@@ -193,6 +219,7 @@ struct ice_vf {

 #ifdef CONFIG_PCI_IOV
 struct ice_vf *ice_get_vf_by_id(struct ice_pf *pf, u16 vf_id);
+void ice_put_vf(struct ice_vf *vf);
 bool ice_has_vfs(struct ice_pf *pf);
 u16 ice_get_num_vfs(struct ice_pf *pf);
 struct ice_vsi *ice_get_vf_vsi(struct ice_vf *vf);
@@ -259,6 +286,10 @@ static inline struct ice_vf *ice_get_vf_by_id(struct ice_pf *pf, u16 vf_id)
 	return NULL;
 }

+static inline void ice_put_vf(struct ice_vf *vf)
+{
+}
+
 static inline bool ice_has_vfs(struct ice_pf *pf)
 {
 	return false;