Commit 175ac6ec authored by Zhigang Luo, committed by Alex Deucher

drm/amdgpu: skip reset other device in the same hive if it's SRIOV VF

On SRIOV, the host driver can support FLR (function level reset) on an individual
VF within the hive, which might bring that device back to normal without the
need to execute a hive reset. If the FLR fails, the host driver will trigger
the hive reset, and each guest VF will get a reset notification before the
real hive reset is executed. The VF device can handle the reset request
individually in its reset work handler.

This change updates the GPU recovery sequence to skip resetting the other
devices in the same hive when the device is an SRIOV VF.
Signed-off-by: Zhigang Luo <zhigang.luo@amd.com>
Reviewed-by: Shaoyun Liu <shaoyun.liu@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 12320274
...@@ -4747,7 +4747,7 @@ static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgp ...@@ -4747,7 +4747,7 @@ static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgp
{ {
struct amdgpu_device *tmp_adev = NULL; struct amdgpu_device *tmp_adev = NULL;
if (adev->gmc.xgmi.num_physical_nodes > 1) { if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
if (!hive) { if (!hive) {
dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes"); dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
return -ENODEV; return -ENODEV;
...@@ -4959,6 +4959,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -4959,6 +4959,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
* We always reset all schedulers for device and all devices for XGMI * We always reset all schedulers for device and all devices for XGMI
* hive so that should take care of them too. * hive so that should take care of them too.
*/ */
if (!amdgpu_sriov_vf(adev))
hive = amdgpu_get_xgmi_hive(adev); hive = amdgpu_get_xgmi_hive(adev);
if (hive) { if (hive) {
if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) { if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
...@@ -5000,7 +5001,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -5000,7 +5001,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
* to put adev in the 1st position. * to put adev in the 1st position.
*/ */
INIT_LIST_HEAD(&device_list); INIT_LIST_HEAD(&device_list);
if (adev->gmc.xgmi.num_physical_nodes > 1) { if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
list_add_tail(&tmp_adev->reset_list, &device_list); list_add_tail(&tmp_adev->reset_list, &device_list);
if (!list_is_first(&adev->reset_list, &device_list)) if (!list_is_first(&adev->reset_list, &device_list))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment