drm/amdkfd: Handle errors from svm validate and map

If new range is splited to multiple pranges with max_svm_range_pages alignment and added to update_list, svm validate and map should keep going after error to make sure prange->mapped_to_gpu flag is up to date for the whole range. svm validate and map update set prange->mapped_to_gpu after mapping to GPUs successfully, otherwise clear prange->mapped_to_gpu flag (for update mapping case) instead of setting error flag, we can remove the redundant error flag to simpliy code. Refactor to remove goto and update prange->mapped_to_gpu flag inside svm_range_lock, to guarant we always evict queues or unmap from GPUs if there are invalid ranges. After svm validate and map return error -EAGIN, the caller retry will update the mapping for the whole range again. Fixes: c22b0440 ("drm/amdkfd: flag added to handle errors from svm validate and map") Signed-off-by: Philip Yang <Philip.Yang@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Tested-by: James Zhu <james.zhu@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

drm/amdkfd: Handle errors from svm validate and map
If new range is splited to multiple pranges with max_svm_range_pages alignment and added to update_list, svm validate and map should keep going after error to make sure prange->mapped_to_gpu flag is up to date for the whole range. svm validate and map update set prange->mapped_to_gpu after mapping to GPUs successfully, otherwise clear prange->mapped_to_gpu flag (for update mapping case) instead of setting error flag, we can remove the redundant error flag to simpliy code. Refactor to remove goto and update prange->mapped_to_gpu flag inside svm_range_lock, to guarant we always evict queues or unmap from GPUs if there are invalid ranges. After svm validate and map return error -EAGIN, the caller retry will update the mapping for the whole range again. Fixes: c22b0440 ("drm/amdkfd: flag added to handle errors from svm validate and map") Signed-off-by: Philip Yang <Philip.Yang@amd.com> Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com> Tested-by: James Zhu <james.zhu@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
eb3c357b · Philip Yang · Alex Deucher · 21e43386 · eb3c357b · eb3c357b
Commit eb3c357b authored Sep 13, 2023 by Philip Yang Committed by Alex Deucher Sep 26, 2023
Hide whitespace changes
Inline Side-by-side

Showing with 39 additions and 44 deletions

drivers/gpu/drm/amd/amdkfd/kfd_svm.c drivers/gpu/drm/amd/amdkfd/kfd_svm.c +39 -43

drivers/gpu/drm/amd/amdkfd/kfd_svm.h drivers/gpu/drm/amd/amdkfd/kfd_svm.h +0 -1

No files found.
--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.c
@@ -829,7 +829,7 @@ svm_range_is_same_attrs(struct kfd_process *p, struct svm_range *prange,
 		}
 	}

-	return !prange->is_error_flag;
+	return true;
 }

 /**
@@ -1679,71 +1679,66 @@ static int svm_range_validate_and_map(struct mm_struct *mm,

 	start = prange->start << PAGE_SHIFT;
 	end = (prange->last + 1) << PAGE_SHIFT;
-	for (addr = start; addr < end && !r; ) {
+	for (addr = start; !r && addr < end; ) {
 		struct hmm_range *hmm_range;
 		struct vm_area_struct *vma;
-		unsigned long next;
+		unsigned long next = 0;
 		unsigned long offset;
 		unsigned long npages;
 		bool readonly;

 		vma = vma_lookup(mm, addr);
-		if (!vma) {
+		if (vma) {
+			readonly = !(vma->vm_flags & VM_WRITE);
+
+			next = min(vma->vm_end, end);
+			npages = (next - addr) >> PAGE_SHIFT;
+			WRITE_ONCE(p->svms.faulting_task, current);
+			r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
+						       readonly, owner, NULL,
+						       &hmm_range);
+			WRITE_ONCE(p->svms.faulting_task, NULL);
+			if (r) {
+				pr_debug("failed %d to get svm range pages\n", r);
+				if (r == -EBUSY)
+					r = -EAGAIN;
+			}
+		} else {
 			r = -EFAULT;
-			goto unreserve_out;
-		}
-		readonly = !(vma->vm_flags & VM_WRITE);
-
-		next = min(vma->vm_end, end);
-		npages = (next - addr) >> PAGE_SHIFT;
-		WRITE_ONCE(p->svms.faulting_task, current);
-		r = amdgpu_hmm_range_get_pages(&prange->notifier, addr, npages,
-					       readonly, owner, NULL,
-					       &hmm_range);
-		WRITE_ONCE(p->svms.faulting_task, NULL);
-		if (r) {
-			pr_debug("failed %d to get svm range pages\n", r);
-			if (r == -EBUSY)
-				r = -EAGAIN;
-			goto unreserve_out;
 		}

-		offset = (addr - start) >> PAGE_SHIFT;
-		r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
-				      hmm_range->hmm_pfns);
-		if (r) {
-			pr_debug("failed %d to dma map range\n", r);
-			goto unreserve_out;
+		if (!r) {
+			offset = (addr - start) >> PAGE_SHIFT;
+			r = svm_range_dma_map(prange, ctx->bitmap, offset, npages,
+					      hmm_range->hmm_pfns);
+			if (r)
+				pr_debug("failed %d to dma map range\n", r);
 		}

 		svm_range_lock(prange);
-		if (amdgpu_hmm_range_get_pages_done(hmm_range)) {
+		if (!r && amdgpu_hmm_range_get_pages_done(hmm_range)) {
 			pr_debug("hmm update the range, need validate again\n");
 			r = -EAGAIN;
-			goto unlock_out;
 		}
-		if (!list_empty(&prange->child_list)) {
+
+		if (!r && !list_empty(&prange->child_list)) {
 			pr_debug("range split by unmap in parallel, validate again\n");
 			r = -EAGAIN;
-			goto unlock_out;
 		}

-		r = svm_range_map_to_gpus(prange, offset, npages, readonly,
-					  ctx->bitmap, wait, flush_tlb);
+		if (!r)
+			r = svm_range_map_to_gpus(prange, offset, npages, readonly,
+						  ctx->bitmap, wait, flush_tlb);
+
+		if (!r && next == end)
+			prange->mapped_to_gpu = true;

-unlock_out:
 		svm_range_unlock(prange);

 		addr = next;
 	}

-	if (addr == end)
-		prange->mapped_to_gpu = true;
-
-unreserve_out:
 	svm_range_unreserve_bos(ctx);
-
-	prange->is_error_flag = !!r;
 	if (!r)
 		prange->validate_timestamp = ktime_get_boottime();

@@ -2112,7 +2107,8 @@ svm_range_add(struct kfd_process *p, uint64_t start, uint64_t size,
 		next = interval_tree_iter_next(node, start, last);
 		next_start = min(node->last, last) + 1;

-		if (svm_range_is_same_attrs(p, prange, nattr, attrs)) {
+		if (svm_range_is_same_attrs(p, prange, nattr, attrs) &&
+		    prange->mapped_to_gpu) {
 			/* nothing to do */
 		} else if (node->start < start || node->last > last) {
 			/* node intersects the update range and its attributes
@@ -3525,7 +3521,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
 	struct svm_range *next;
 	bool update_mapping = false;
 	bool flush_tlb;
-	int r = 0;
+	int r, ret = 0;

 	pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] pages 0x%llx\n",
 		 p->pasid, &p->svms, start, start + size - 1, size);
@@ -3613,7 +3609,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
 out_unlock_range:
 		mutex_unlock(&prange->migrate_mutex);
 		if (r)
-			break;
+			ret = r;
 	}

 	dynamic_svm_range_dump(svms);
@@ -3626,7 +3622,7 @@ svm_range_set_attr(struct kfd_process *p, struct mm_struct *mm,
 	pr_debug("pasid 0x%x svms 0x%p [0x%llx 0x%llx] done, r=%d\n", p->pasid,
 		 &p->svms, start, start + size - 1, r);

-	return r;
+	return ret ? ret : r;
 }

 static int

--- a/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_svm.h
@@ -133,7 +133,6 @@ struct svm_range {
 	DECLARE_BITMAP(bitmap_access, MAX_GPU_INSTANCE);
 	DECLARE_BITMAP(bitmap_aip, MAX_GPU_INSTANCE);
 	bool				mapped_to_gpu;
-	bool				is_error_flag;
 };

 static inline void svm_range_lock(struct svm_range *prange)