Commit 55bf196f authored by Christian König, committed by Alex Deucher

drm/amdgpu: reset VM when an error is detected

When a problem with the page table updates is detected, reset the
state machine of the VM and re-create all page tables from scratch.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Luben Tuikov <luben.tuikov@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent e84e697d
...@@ -266,6 +266,32 @@ static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo) ...@@ -266,6 +266,32 @@ static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo)
spin_unlock(&vm_bo->vm->status_lock); spin_unlock(&vm_bo->vm->status_lock);
} }
/**
* amdgpu_vm_bo_reset_state_machine - reset the vm_bo state machine
* @vm: the VM which state machine to reset
*
* Move all vm_bo object in the VM into a state where they will be updated
* again during validation.
*/
static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm)
{
struct amdgpu_vm_bo_base *vm_bo, *tmp;
spin_lock(&vm->status_lock);
list_splice_init(&vm->done, &vm->invalidated);
list_for_each_entry(vm_bo, &vm->invalidated, vm_status)
vm_bo->moved = true;
list_for_each_entry_safe(vm_bo, tmp, &vm->idle, vm_status) {
struct amdgpu_bo *bo = vm_bo->bo;
if (!bo || bo->tbo.type != ttm_bo_type_kernel)
list_move(&vm_bo->vm_status, &vm_bo->vm->moved);
else if (bo->parent)
list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
}
spin_unlock(&vm->status_lock);
}
/** /**
* amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
* *
...@@ -351,6 +377,34 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev, ...@@ -351,6 +377,34 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
spin_unlock(&adev->mman.bdev.lru_lock); spin_unlock(&adev->mman.bdev.lru_lock);
} }
/**
 * amdgpu_vm_init_entities - create scheduler entities for page table updates
 * @adev: device whose vm_manager provides the PTE scheduler list
 * @vm: VM whose immediate and delayed entities get initialized
 *
 * Initializes vm->immediate first and vm->delayed second, both with
 * DRM_SCHED_PRIORITY_NORMAL. When the second initialization fails, the
 * already initialized immediate entity is destroyed again, so on error
 * neither entity is left initialized by this function.
 *
 * Returns:
 * 0 on success, otherwise the error code from drm_sched_entity_init().
 */
static int amdgpu_vm_init_entities(struct amdgpu_device *adev,
				   struct amdgpu_vm *vm)
{
	int r;

	r = drm_sched_entity_init(&vm->immediate, DRM_SCHED_PRIORITY_NORMAL,
				  adev->vm_manager.vm_pte_scheds,
				  adev->vm_manager.vm_pte_num_scheds, NULL);
	/* Nothing has been set up yet, so there is nothing to undo. */
	if (r)
		return r;

	r = drm_sched_entity_init(&vm->delayed, DRM_SCHED_PRIORITY_NORMAL,
				  adev->vm_manager.vm_pte_scheds,
				  adev->vm_manager.vm_pte_num_scheds, NULL);
	if (r)
		goto error_free_immediate;

	return 0;

error_free_immediate:
	/* Only the immediate entity was initialized successfully. */
	drm_sched_entity_destroy(&vm->immediate);
	return r;
}
/**
 * amdgpu_vm_fini_entities - destroy the page table update entities
 * @vm: VM whose scheduler entities are torn down
 *
 * Destroys vm->immediate first and vm->delayed second.
 */
static void amdgpu_vm_fini_entities(struct amdgpu_vm *vm)
{
	struct drm_sched_entity *immediate = &vm->immediate;
	struct drm_sched_entity *delayed = &vm->delayed;

	drm_sched_entity_destroy(immediate);
	drm_sched_entity_destroy(delayed);
}
/** /**
* amdgpu_vm_validate_pt_bos - validate the page table BOs * amdgpu_vm_validate_pt_bos - validate the page table BOs
* *
...@@ -373,6 +427,14 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm, ...@@ -373,6 +427,14 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
struct amdgpu_bo *bo; struct amdgpu_bo *bo;
int r; int r;
if (drm_sched_entity_error(&vm->delayed)) {
amdgpu_vm_bo_reset_state_machine(vm);
amdgpu_vm_fini_entities(vm);
r = amdgpu_vm_init_entities(adev, vm);
if (r)
return r;
}
spin_lock(&vm->status_lock); spin_lock(&vm->status_lock);
while (!list_empty(&vm->evicted)) { while (!list_empty(&vm->evicted)) {
bo_base = list_first_entry(&vm->evicted, bo_base = list_first_entry(&vm->evicted,
...@@ -2048,19 +2110,10 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm) ...@@ -2048,19 +2110,10 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
INIT_LIST_HEAD(&vm->pt_freed); INIT_LIST_HEAD(&vm->pt_freed);
INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work); INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
/* create scheduler entities for page table updates */ r = amdgpu_vm_init_entities(adev, vm);
r = drm_sched_entity_init(&vm->immediate, DRM_SCHED_PRIORITY_NORMAL,
adev->vm_manager.vm_pte_scheds,
adev->vm_manager.vm_pte_num_scheds, NULL);
if (r) if (r)
return r; return r;
r = drm_sched_entity_init(&vm->delayed, DRM_SCHED_PRIORITY_NORMAL,
adev->vm_manager.vm_pte_scheds,
adev->vm_manager.vm_pte_num_scheds, NULL);
if (r)
goto error_free_immediate;
vm->pte_support_ats = false; vm->pte_support_ats = false;
vm->is_compute_context = false; vm->is_compute_context = false;
...@@ -2121,10 +2174,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm) ...@@ -2121,10 +2174,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
error_free_delayed: error_free_delayed:
dma_fence_put(vm->last_tlb_flush); dma_fence_put(vm->last_tlb_flush);
dma_fence_put(vm->last_unlocked); dma_fence_put(vm->last_unlocked);
drm_sched_entity_destroy(&vm->delayed); amdgpu_vm_fini_entities(vm);
error_free_immediate:
drm_sched_entity_destroy(&vm->immediate);
return r; return r;
} }
...@@ -2277,8 +2327,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm) ...@@ -2277,8 +2327,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
amdgpu_bo_unref(&root); amdgpu_bo_unref(&root);
WARN_ON(vm->root.bo); WARN_ON(vm->root.bo);
drm_sched_entity_destroy(&vm->immediate); amdgpu_vm_fini_entities(vm);
drm_sched_entity_destroy(&vm->delayed);
if (!RB_EMPTY_ROOT(&vm->va.rb_root)) { if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
dev_err(adev->dev, "still active bo inside vm\n"); dev_err(adev->dev, "still active bo inside vm\n");
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment