Commit cfbb6b00 authored by Andrey Grodzovsky's avatar Andrey Grodzovsky

drm/amdgpu: Rework reset domain to be refcounted.

The reset domain contains register access semaphor
now and so needs to be present as long as each device
in a hive needs it and so it cannot be binded to XGMI
hive life cycle.
Adress this by making reset domain refcounted and pointed
by each member of the hive and the hive itself.

v4:

Fix crash on boot witrh XGMI hive by adding type to reset_domain.
XGMI will only create a new reset_domain if prevoius was of single
device type meaning it's first boot. Otherwsie it will take a
refocunt to exsiting reset_domain from the amdgou device.

Add a wrapper around reset_domain->refcount get/put
and a wrapper around send to reset wq (Lijo)
Signed-off-by: default avatarAndrey Grodzovsky <andrey.grodzovsky@amd.com>
Acked-by: default avatarChristian König <christian.koenig@amd.com>
Link: https://www.spinics.net/lists/amd-gfx/msg74121.html
parent f287a3c5
...@@ -813,9 +813,7 @@ struct amd_powerplay { ...@@ -813,9 +813,7 @@ struct amd_powerplay {
#define AMDGPU_RESET_MAGIC_NUM 64 #define AMDGPU_RESET_MAGIC_NUM 64
#define AMDGPU_MAX_DF_PERFMONS 4 #define AMDGPU_MAX_DF_PERFMONS 4
#define AMDGPU_PRODUCT_NAME_LEN 64 #define AMDGPU_PRODUCT_NAME_LEN 64
struct amdgpu_reset_domain { struct amdgpu_reset_domain;
struct workqueue_struct *wq;
};
struct amdgpu_device { struct amdgpu_device {
struct device *dev; struct device *dev;
...@@ -1104,7 +1102,7 @@ struct amdgpu_device { ...@@ -1104,7 +1102,7 @@ struct amdgpu_device {
uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE]; uint32_t ip_versions[MAX_HWIP][HWIP_MAX_INSTANCE];
bool ram_is_direct_mapped; bool ram_is_direct_mapped;
struct amdgpu_reset_domain reset_domain; struct amdgpu_reset_domain *reset_domain;
}; };
static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev) static inline struct amdgpu_device *drm_to_adev(struct drm_device *ddev)
......
...@@ -2316,7 +2316,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev) ...@@ -2316,7 +2316,7 @@ static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
r = drm_sched_init(&ring->sched, &amdgpu_sched_ops, r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
ring->num_hw_submission, amdgpu_job_hang_limit, ring->num_hw_submission, amdgpu_job_hang_limit,
timeout, adev->reset_domain.wq, ring->sched_score, ring->name); timeout, adev->reset_domain->wq, ring->sched_score, ring->name);
if (r) { if (r) {
DRM_ERROR("Failed to create scheduler on ring %s.\n", DRM_ERROR("Failed to create scheduler on ring %s.\n",
ring->name); ring->name);
...@@ -2439,24 +2439,22 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) ...@@ -2439,24 +2439,22 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
if (r) if (r)
goto init_failed; goto init_failed;
/**
* In case of XGMI grab extra reference for reset domain for this device
*/
if (adev->gmc.xgmi.num_physical_nodes > 1) { if (adev->gmc.xgmi.num_physical_nodes > 1) {
struct amdgpu_hive_info *hive; if (amdgpu_xgmi_add_device(adev) == 0) {
struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
amdgpu_xgmi_add_device(adev); if (!hive->reset_domain ||
!amdgpu_reset_get_reset_domain(hive->reset_domain)) {
r = -ENOENT;
goto init_failed;
}
hive = amdgpu_get_xgmi_hive(adev); /* Drop the early temporary reset domain we created for device */
if (!hive || !hive->reset_domain.wq) { amdgpu_reset_put_reset_domain(adev->reset_domain);
DRM_ERROR("Failed to obtain reset domain info for XGMI hive:%llx", hive->hive_id); adev->reset_domain = hive->reset_domain;
r = -EINVAL;
goto init_failed;
}
adev->reset_domain.wq = hive->reset_domain.wq;
} else {
adev->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-dev", 0);
if (!adev->reset_domain.wq) {
r = -ENOMEM;
goto init_failed;
} }
} }
...@@ -3640,6 +3638,15 @@ int amdgpu_device_init(struct amdgpu_device *adev, ...@@ -3640,6 +3638,15 @@ int amdgpu_device_init(struct amdgpu_device *adev,
return r; return r;
} }
/*
* Reset domain needs to be present early, before XGMI hive discovered
* (if any) and intitialized to use reset sem and in_gpu reset flag
* early on during init.
*/
adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE ,"amdgpu-reset-dev");
if (!adev->reset_domain)
return -ENOMEM;
/* early init functions */ /* early init functions */
r = amdgpu_device_ip_early_init(adev); r = amdgpu_device_ip_early_init(adev);
if (r) if (r)
...@@ -4016,6 +4023,9 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) ...@@ -4016,6 +4023,9 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
if (adev->mman.discovery_bin) if (adev->mman.discovery_bin)
amdgpu_discovery_fini(adev); amdgpu_discovery_fini(adev);
amdgpu_reset_put_reset_domain(adev->reset_domain);
adev->reset_domain = NULL;
kfree(adev->pci_state); kfree(adev->pci_state);
} }
...@@ -5241,7 +5251,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -5241,7 +5251,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work); INIT_WORK(&work.base, amdgpu_device_queue_gpu_recover_work);
if (!queue_work(adev->reset_domain.wq, &work.base)) if (!amdgpu_reset_domain_schedule(adev->reset_domain, &work.base))
return -EAGAIN; return -EAGAIN;
flush_work(&work.base); flush_work(&work.base);
......
...@@ -96,3 +96,43 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev, ...@@ -96,3 +96,43 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
return reset_handler->restore_hwcontext(adev->reset_cntl, return reset_handler->restore_hwcontext(adev->reset_cntl,
reset_context); reset_context);
} }
void amdgpu_reset_destroy_reset_domain(struct kref *ref)
{
struct amdgpu_reset_domain *reset_domain = container_of(ref,
struct amdgpu_reset_domain,
refcount);
if (reset_domain->wq)
destroy_workqueue(reset_domain->wq);
kvfree(reset_domain);
}
struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type,
char *wq_name)
{
struct amdgpu_reset_domain *reset_domain;
reset_domain = kvzalloc(sizeof(struct amdgpu_reset_domain), GFP_KERNEL);
if (!reset_domain) {
DRM_ERROR("Failed to allocate amdgpu_reset_domain!");
return NULL;
}
reset_domain->type = type;
kref_init(&reset_domain->refcount);
reset_domain->wq = create_singlethread_workqueue(wq_name);
if (!reset_domain->wq) {
DRM_ERROR("Failed to allocate wq for amdgpu_reset_domain!");
amdgpu_reset_put_reset_domain(reset_domain);
return NULL;
}
return reset_domain;
}
...@@ -70,6 +70,19 @@ struct amdgpu_reset_control { ...@@ -70,6 +70,19 @@ struct amdgpu_reset_control {
void (*async_reset)(struct work_struct *work); void (*async_reset)(struct work_struct *work);
}; };
enum amdgpu_reset_domain_type {
SINGLE_DEVICE,
XGMI_HIVE
};
struct amdgpu_reset_domain {
struct kref refcount;
struct workqueue_struct *wq;
enum amdgpu_reset_domain_type type;
};
int amdgpu_reset_init(struct amdgpu_device *adev); int amdgpu_reset_init(struct amdgpu_device *adev);
int amdgpu_reset_fini(struct amdgpu_device *adev); int amdgpu_reset_fini(struct amdgpu_device *adev);
...@@ -82,4 +95,26 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev, ...@@ -82,4 +95,26 @@ int amdgpu_reset_perform_reset(struct amdgpu_device *adev,
int amdgpu_reset_add_handler(struct amdgpu_reset_control *reset_ctl, int amdgpu_reset_add_handler(struct amdgpu_reset_control *reset_ctl,
struct amdgpu_reset_handler *handler); struct amdgpu_reset_handler *handler);
struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_domain_type type,
char *wq_name);
void amdgpu_reset_destroy_reset_domain(struct kref *ref);
static inline bool amdgpu_reset_get_reset_domain(struct amdgpu_reset_domain *domain)
{
return kref_get_unless_zero(&domain->refcount) != 0;
}
static inline void amdgpu_reset_put_reset_domain(struct amdgpu_reset_domain *domain)
{
kref_put(&domain->refcount, amdgpu_reset_destroy_reset_domain);
}
static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *domain,
struct work_struct *work)
{
return queue_work(domain->wq, work);
}
#endif #endif
...@@ -32,6 +32,8 @@ ...@@ -32,6 +32,8 @@
#include "wafl/wafl2_4_0_0_smn.h" #include "wafl/wafl2_4_0_0_smn.h"
#include "wafl/wafl2_4_0_0_sh_mask.h" #include "wafl/wafl2_4_0_0_sh_mask.h"
#include "amdgpu_reset.h"
#define smnPCS_XGMI23_PCS_ERROR_STATUS 0x11a01210 #define smnPCS_XGMI23_PCS_ERROR_STATUS 0x11a01210
#define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c #define smnPCS_XGMI3X16_PCS_ERROR_STATUS 0x11a0020c
#define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210 #define smnPCS_GOPX1_PCS_ERROR_STATUS 0x12200210
...@@ -227,6 +229,9 @@ static void amdgpu_xgmi_hive_release(struct kobject *kobj) ...@@ -227,6 +229,9 @@ static void amdgpu_xgmi_hive_release(struct kobject *kobj)
struct amdgpu_hive_info *hive = container_of( struct amdgpu_hive_info *hive = container_of(
kobj, struct amdgpu_hive_info, kobj); kobj, struct amdgpu_hive_info, kobj);
amdgpu_reset_put_reset_domain(hive->reset_domain);
hive->reset_domain = NULL;
mutex_destroy(&hive->hive_lock); mutex_destroy(&hive->hive_lock);
kfree(hive); kfree(hive);
} }
...@@ -398,12 +403,24 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev) ...@@ -398,12 +403,24 @@ struct amdgpu_hive_info *amdgpu_get_xgmi_hive(struct amdgpu_device *adev)
goto pro_end; goto pro_end;
} }
hive->reset_domain.wq = alloc_ordered_workqueue("amdgpu-reset-hive", 0); /**
if (!hive->reset_domain.wq) { * Avoid recreating reset domain when hive is reconstructed for the case
dev_err(adev->dev, "XGMI: failed allocating wq for reset domain!\n"); * of reset the devices in the XGMI hive during probe for SRIOV
kfree(hive); * See https://www.spinics.net/lists/amd-gfx/msg58836.html
hive = NULL; */
goto pro_end; if (adev->reset_domain->type != XGMI_HIVE) {
hive->reset_domain = amdgpu_reset_create_reset_domain(XGMI_HIVE, "amdgpu-reset-hive");
if (!hive->reset_domain) {
dev_err(adev->dev, "XGMI: failed initializing reset domain for xgmi hive\n");
ret = -ENOMEM;
kobject_put(&hive->kobj);
kfree(hive);
hive = NULL;
goto pro_end;
}
} else {
amdgpu_reset_get_reset_domain(adev->reset_domain);
hive->reset_domain = adev->reset_domain;
} }
hive->hive_id = adev->gmc.xgmi.hive_id; hive->hive_id = adev->gmc.xgmi.hive_id;
......
...@@ -42,7 +42,7 @@ struct amdgpu_hive_info { ...@@ -42,7 +42,7 @@ struct amdgpu_hive_info {
AMDGPU_XGMI_PSTATE_UNKNOWN AMDGPU_XGMI_PSTATE_UNKNOWN
} pstate; } pstate;
struct amdgpu_reset_domain reset_domain; struct amdgpu_reset_domain *reset_domain;
}; };
struct amdgpu_pcs_ras_field { struct amdgpu_pcs_ras_field {
......
...@@ -32,6 +32,8 @@ ...@@ -32,6 +32,8 @@
#include "soc15_common.h" #include "soc15_common.h"
#include "mxgpu_ai.h" #include "mxgpu_ai.h"
#include "amdgpu_reset.h"
static void xgpu_ai_mailbox_send_ack(struct amdgpu_device *adev) static void xgpu_ai_mailbox_send_ack(struct amdgpu_device *adev)
{ {
WREG8(AI_MAIBOX_CONTROL_RCV_OFFSET_BYTE, 2); WREG8(AI_MAIBOX_CONTROL_RCV_OFFSET_BYTE, 2);
...@@ -308,8 +310,8 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev, ...@@ -308,8 +310,8 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev,
switch (event) { switch (event) {
case IDH_FLR_NOTIFICATION: case IDH_FLR_NOTIFICATION:
if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
WARN_ONCE(!queue_work(adev->reset_domain.wq, WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
&adev->virt.flr_work), &adev->virt.flr_work),
"Failed to queue work! at %s", "Failed to queue work! at %s",
__func__); __func__);
break; break;
......
...@@ -31,6 +31,8 @@ ...@@ -31,6 +31,8 @@
#include "soc15_common.h" #include "soc15_common.h"
#include "mxgpu_nv.h" #include "mxgpu_nv.h"
#include "amdgpu_reset.h"
static void xgpu_nv_mailbox_send_ack(struct amdgpu_device *adev) static void xgpu_nv_mailbox_send_ack(struct amdgpu_device *adev)
{ {
WREG8(NV_MAIBOX_CONTROL_RCV_OFFSET_BYTE, 2); WREG8(NV_MAIBOX_CONTROL_RCV_OFFSET_BYTE, 2);
...@@ -338,8 +340,8 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev, ...@@ -338,8 +340,8 @@ static int xgpu_nv_mailbox_rcv_irq(struct amdgpu_device *adev,
switch (event) { switch (event) {
case IDH_FLR_NOTIFICATION: case IDH_FLR_NOTIFICATION:
if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev))
WARN_ONCE(!queue_work(adev->reset_domain.wq, WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
&adev->virt.flr_work), &adev->virt.flr_work),
"Failed to queue work! at %s", "Failed to queue work! at %s",
__func__); __func__);
break; break;
......
...@@ -42,6 +42,8 @@ ...@@ -42,6 +42,8 @@
#include "smu/smu_7_1_3_d.h" #include "smu/smu_7_1_3_d.h"
#include "mxgpu_vi.h" #include "mxgpu_vi.h"
#include "amdgpu_reset.h"
/* VI golden setting */ /* VI golden setting */
static const u32 xgpu_fiji_mgcg_cgcg_init[] = { static const u32 xgpu_fiji_mgcg_cgcg_init[] = {
mmRLC_CGTT_MGCG_OVERRIDE, 0xffffffff, 0xffffffff, mmRLC_CGTT_MGCG_OVERRIDE, 0xffffffff, 0xffffffff,
...@@ -551,8 +553,8 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev, ...@@ -551,8 +553,8 @@ static int xgpu_vi_mailbox_rcv_irq(struct amdgpu_device *adev,
/* only handle FLR_NOTIFY now */ /* only handle FLR_NOTIFY now */
if (!r && !amdgpu_in_reset(adev)) if (!r && !amdgpu_in_reset(adev))
WARN_ONCE(!queue_work(adev->reset_domain.wq, WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain,
&adev->virt.flr_work), &adev->virt.flr_work),
"Failed to queue work! at %s", "Failed to queue work! at %s",
__func__); __func__);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment