Commit fc2e6b3b authored by Slawomir Laba, committed by Tony Nguyen

iavf: Rework mutexes for better synchronisation

The driver used to crash in multiple spots when its init, reset and
remove paths were put under stress testing.

The user would experience call traces or hangs when creating,
resetting, or removing VFs. Depending on the machine, the call traces
happen in random spots, for example when reset restoring resources
races with driver remove.

Make adapter->crit_lock mutex a mandatory lock for guarding the
operations performed on all workqueues and functions dealing with
resource allocation and disposal.

Make __IAVF_REMOVE a final state of the driver, respected by the
workqueue tasks: they shall not requeue themselves when they fail to
obtain the crit_lock while that state is set.

Make the IRQ handler not queue new work for adminq_task
when the __IAVF_REMOVE state is set.

Fixes: 5ac49f3c ("iavf: use mutexes for locking of critical sections")
Signed-off-by: Slawomir Laba <slawomirx.laba@intel.com>
Signed-off-by: Phani Burra <phani.r.burra@intel.com>
Signed-off-by: Jacob Keller <jacob.e.keller@intel.com>
Signed-off-by: Mateusz Palczewski <mateusz.palczewski@intel.com>
Tested-by: Konrad Jankowski <konrad0.jankowski@intel.com>
Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
parent e01b042e
...@@ -246,7 +246,6 @@ struct iavf_adapter { ...@@ -246,7 +246,6 @@ struct iavf_adapter {
struct list_head mac_filter_list; struct list_head mac_filter_list;
struct mutex crit_lock; struct mutex crit_lock;
struct mutex client_lock; struct mutex client_lock;
struct mutex remove_lock;
/* Lock to protect accesses to MAC and VLAN lists */ /* Lock to protect accesses to MAC and VLAN lists */
spinlock_t mac_vlan_list_lock; spinlock_t mac_vlan_list_lock;
char misc_vector_name[IFNAMSIZ + 9]; char misc_vector_name[IFNAMSIZ + 9];
......
...@@ -302,6 +302,7 @@ static irqreturn_t iavf_msix_aq(int irq, void *data) ...@@ -302,6 +302,7 @@ static irqreturn_t iavf_msix_aq(int irq, void *data)
rd32(hw, IAVF_VFINT_ICR01); rd32(hw, IAVF_VFINT_ICR01);
rd32(hw, IAVF_VFINT_ICR0_ENA1); rd32(hw, IAVF_VFINT_ICR0_ENA1);
if (adapter->state != __IAVF_REMOVE)
/* schedule work on the private workqueue */ /* schedule work on the private workqueue */
queue_work(iavf_wq, &adapter->adminq_task); queue_work(iavf_wq, &adapter->adminq_task);
...@@ -2374,8 +2375,12 @@ static void iavf_watchdog_task(struct work_struct *work) ...@@ -2374,8 +2375,12 @@ static void iavf_watchdog_task(struct work_struct *work)
struct iavf_hw *hw = &adapter->hw; struct iavf_hw *hw = &adapter->hw;
u32 reg_val; u32 reg_val;
if (!mutex_trylock(&adapter->crit_lock)) if (!mutex_trylock(&adapter->crit_lock)) {
if (adapter->state == __IAVF_REMOVE)
return;
goto restart_watchdog; goto restart_watchdog;
}
if (adapter->flags & IAVF_FLAG_PF_COMMS_FAILED) if (adapter->flags & IAVF_FLAG_PF_COMMS_FAILED)
iavf_change_state(adapter, __IAVF_COMM_FAILED); iavf_change_state(adapter, __IAVF_COMM_FAILED);
...@@ -2601,13 +2606,13 @@ static void iavf_reset_task(struct work_struct *work) ...@@ -2601,13 +2606,13 @@ static void iavf_reset_task(struct work_struct *work)
/* When device is being removed it doesn't make sense to run the reset /* When device is being removed it doesn't make sense to run the reset
* task, just return in such a case. * task, just return in such a case.
*/ */
if (mutex_is_locked(&adapter->remove_lock)) if (!mutex_trylock(&adapter->crit_lock)) {
return; if (adapter->state != __IAVF_REMOVE)
queue_work(iavf_wq, &adapter->reset_task);
if (iavf_lock_timeout(&adapter->crit_lock, 200)) {
schedule_work(&adapter->reset_task);
return; return;
} }
while (!mutex_trylock(&adapter->client_lock)) while (!mutex_trylock(&adapter->client_lock))
usleep_range(500, 1000); usleep_range(500, 1000);
if (CLIENT_ENABLED(adapter)) { if (CLIENT_ENABLED(adapter)) {
...@@ -2826,13 +2831,19 @@ static void iavf_adminq_task(struct work_struct *work) ...@@ -2826,13 +2831,19 @@ static void iavf_adminq_task(struct work_struct *work)
if (adapter->flags & IAVF_FLAG_PF_COMMS_FAILED) if (adapter->flags & IAVF_FLAG_PF_COMMS_FAILED)
goto out; goto out;
if (!mutex_trylock(&adapter->crit_lock)) {
if (adapter->state == __IAVF_REMOVE)
return;
queue_work(iavf_wq, &adapter->adminq_task);
goto out;
}
event.buf_len = IAVF_MAX_AQ_BUF_SIZE; event.buf_len = IAVF_MAX_AQ_BUF_SIZE;
event.msg_buf = kzalloc(event.buf_len, GFP_KERNEL); event.msg_buf = kzalloc(event.buf_len, GFP_KERNEL);
if (!event.msg_buf) if (!event.msg_buf)
goto out; goto out;
if (iavf_lock_timeout(&adapter->crit_lock, 200))
goto freedom;
do { do {
ret = iavf_clean_arq_element(hw, &event, &pending); ret = iavf_clean_arq_element(hw, &event, &pending);
v_op = (enum virtchnl_ops)le32_to_cpu(event.desc.cookie_high); v_op = (enum virtchnl_ops)le32_to_cpu(event.desc.cookie_high);
...@@ -3800,11 +3811,12 @@ static int iavf_close(struct net_device *netdev) ...@@ -3800,11 +3811,12 @@ static int iavf_close(struct net_device *netdev)
struct iavf_adapter *adapter = netdev_priv(netdev); struct iavf_adapter *adapter = netdev_priv(netdev);
int status; int status;
if (adapter->state <= __IAVF_DOWN_PENDING) mutex_lock(&adapter->crit_lock);
return 0;
while (!mutex_trylock(&adapter->crit_lock)) if (adapter->state <= __IAVF_DOWN_PENDING) {
usleep_range(500, 1000); mutex_unlock(&adapter->crit_lock);
return 0;
}
set_bit(__IAVF_VSI_DOWN, adapter->vsi.state); set_bit(__IAVF_VSI_DOWN, adapter->vsi.state);
if (CLIENT_ENABLED(adapter)) if (CLIENT_ENABLED(adapter))
...@@ -4431,7 +4443,6 @@ static int iavf_probe(struct pci_dev *pdev, const struct pci_device_id *ent) ...@@ -4431,7 +4443,6 @@ static int iavf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
*/ */
mutex_init(&adapter->crit_lock); mutex_init(&adapter->crit_lock);
mutex_init(&adapter->client_lock); mutex_init(&adapter->client_lock);
mutex_init(&adapter->remove_lock);
mutex_init(&hw->aq.asq_mutex); mutex_init(&hw->aq.asq_mutex);
mutex_init(&hw->aq.arq_mutex); mutex_init(&hw->aq.arq_mutex);
...@@ -4556,11 +4567,7 @@ static void iavf_remove(struct pci_dev *pdev) ...@@ -4556,11 +4567,7 @@ static void iavf_remove(struct pci_dev *pdev)
struct iavf_cloud_filter *cf, *cftmp; struct iavf_cloud_filter *cf, *cftmp;
struct iavf_hw *hw = &adapter->hw; struct iavf_hw *hw = &adapter->hw;
int err; int err;
/* Indicate we are in remove and not to run reset_task */
mutex_lock(&adapter->remove_lock);
cancel_work_sync(&adapter->reset_task);
cancel_delayed_work_sync(&adapter->watchdog_task);
cancel_delayed_work_sync(&adapter->client_task);
if (adapter->netdev_registered) { if (adapter->netdev_registered) {
unregister_netdev(netdev); unregister_netdev(netdev);
adapter->netdev_registered = false; adapter->netdev_registered = false;
...@@ -4572,6 +4579,10 @@ static void iavf_remove(struct pci_dev *pdev) ...@@ -4572,6 +4579,10 @@ static void iavf_remove(struct pci_dev *pdev)
err); err);
} }
mutex_lock(&adapter->crit_lock);
dev_info(&adapter->pdev->dev, "Remove device\n");
iavf_change_state(adapter, __IAVF_REMOVE);
iavf_request_reset(adapter); iavf_request_reset(adapter);
msleep(50); msleep(50);
/* If the FW isn't responding, kick it once, but only once. */ /* If the FW isn't responding, kick it once, but only once. */
...@@ -4579,18 +4590,19 @@ static void iavf_remove(struct pci_dev *pdev) ...@@ -4579,18 +4590,19 @@ static void iavf_remove(struct pci_dev *pdev)
iavf_request_reset(adapter); iavf_request_reset(adapter);
msleep(50); msleep(50);
} }
if (iavf_lock_timeout(&adapter->crit_lock, 5000))
dev_warn(&adapter->pdev->dev, "failed to acquire crit_lock in %s\n", __FUNCTION__);
dev_info(&adapter->pdev->dev, "Removing device\n"); iavf_misc_irq_disable(adapter);
/* Shut down all the garbage mashers on the detention level */ /* Shut down all the garbage mashers on the detention level */
iavf_change_state(adapter, __IAVF_REMOVE); cancel_work_sync(&adapter->reset_task);
cancel_delayed_work_sync(&adapter->watchdog_task);
cancel_work_sync(&adapter->adminq_task);
cancel_delayed_work_sync(&adapter->client_task);
adapter->aq_required = 0; adapter->aq_required = 0;
adapter->flags &= ~IAVF_FLAG_REINIT_ITR_NEEDED; adapter->flags &= ~IAVF_FLAG_REINIT_ITR_NEEDED;
iavf_free_all_tx_resources(adapter); iavf_free_all_tx_resources(adapter);
iavf_free_all_rx_resources(adapter); iavf_free_all_rx_resources(adapter);
iavf_misc_irq_disable(adapter);
iavf_free_misc_irq(adapter); iavf_free_misc_irq(adapter);
/* In case we enter iavf_remove from erroneous state, free traffic irqs /* In case we enter iavf_remove from erroneous state, free traffic irqs
...@@ -4606,10 +4618,6 @@ static void iavf_remove(struct pci_dev *pdev) ...@@ -4606,10 +4618,6 @@ static void iavf_remove(struct pci_dev *pdev)
iavf_reset_interrupt_capability(adapter); iavf_reset_interrupt_capability(adapter);
iavf_free_q_vectors(adapter); iavf_free_q_vectors(adapter);
cancel_delayed_work_sync(&adapter->watchdog_task);
cancel_work_sync(&adapter->adminq_task);
iavf_free_rss(adapter); iavf_free_rss(adapter);
if (hw->aq.asq.count) if (hw->aq.asq.count)
...@@ -4621,8 +4629,6 @@ static void iavf_remove(struct pci_dev *pdev) ...@@ -4621,8 +4629,6 @@ static void iavf_remove(struct pci_dev *pdev)
mutex_destroy(&adapter->client_lock); mutex_destroy(&adapter->client_lock);
mutex_unlock(&adapter->crit_lock); mutex_unlock(&adapter->crit_lock);
mutex_destroy(&adapter->crit_lock); mutex_destroy(&adapter->crit_lock);
mutex_unlock(&adapter->remove_lock);
mutex_destroy(&adapter->remove_lock);
iounmap(hw->hw_addr); iounmap(hw->hw_addr);
pci_release_regions(pdev); pci_release_regions(pdev);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment