forked from mirrors/linux
		
	drm/amdgpu: reset VM when an error is detected
When some problem with the updates of page tables is detected, reset the state machine of the VM and re-create all page tables from scratch.

Signed-off-by: Christian König <christian.koenig@amd.com>
Reviewed-by: Luben Tuikov <luben.tuikov@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
		
							parent
							
								
									e84e697d92
								
							
						
					
					
						commit
						55bf196f60
					
				
					 1 changed files with 65 additions and 16 deletions
				
			
		| 
						 | 
					@ -266,6 +266,32 @@ static void amdgpu_vm_bo_done(struct amdgpu_vm_bo_base *vm_bo)
 | 
				
			||||||
	spin_unlock(&vm_bo->vm->status_lock);
 | 
						spin_unlock(&vm_bo->vm->status_lock);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/**
 | 
				
			||||||
 | 
					 * amdgpu_vm_bo_reset_state_machine - reset the vm_bo state machine
 | 
				
			||||||
 | 
					 * @vm: the VM which state machine to reset
 | 
				
			||||||
 | 
					 *
 | 
				
			||||||
 | 
					 * Move all vm_bo object in the VM into a state where they will be updated
 | 
				
			||||||
 | 
					 * again during validation.
 | 
				
			||||||
 | 
					 */
 | 
				
			||||||
 | 
					static void amdgpu_vm_bo_reset_state_machine(struct amdgpu_vm *vm)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						struct amdgpu_vm_bo_base *vm_bo, *tmp;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						spin_lock(&vm->status_lock);
 | 
				
			||||||
 | 
						list_splice_init(&vm->done, &vm->invalidated);
 | 
				
			||||||
 | 
						list_for_each_entry(vm_bo, &vm->invalidated, vm_status)
 | 
				
			||||||
 | 
							vm_bo->moved = true;
 | 
				
			||||||
 | 
						list_for_each_entry_safe(vm_bo, tmp, &vm->idle, vm_status) {
 | 
				
			||||||
 | 
							struct amdgpu_bo *bo = vm_bo->bo;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
							if (!bo || bo->tbo.type != ttm_bo_type_kernel)
 | 
				
			||||||
 | 
								list_move(&vm_bo->vm_status, &vm_bo->vm->moved);
 | 
				
			||||||
 | 
							else if (bo->parent)
 | 
				
			||||||
 | 
								list_move(&vm_bo->vm_status, &vm_bo->vm->relocated);
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
						spin_unlock(&vm->status_lock);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/**
 | 
					/**
 | 
				
			||||||
 * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
 | 
					 * amdgpu_vm_bo_base_init - Adds bo to the list of bos associated with the vm
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
| 
						 | 
					@ -351,6 +377,34 @@ void amdgpu_vm_move_to_lru_tail(struct amdgpu_device *adev,
 | 
				
			||||||
	spin_unlock(&adev->mman.bdev.lru_lock);
 | 
						spin_unlock(&adev->mman.bdev.lru_lock);
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Create scheduler entities for page table updates */
 | 
				
			||||||
 | 
					static int amdgpu_vm_init_entities(struct amdgpu_device *adev,
 | 
				
			||||||
 | 
									   struct amdgpu_vm *vm)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						int r;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						r = drm_sched_entity_init(&vm->immediate, DRM_SCHED_PRIORITY_NORMAL,
 | 
				
			||||||
 | 
									  adev->vm_manager.vm_pte_scheds,
 | 
				
			||||||
 | 
									  adev->vm_manager.vm_pte_num_scheds, NULL);
 | 
				
			||||||
 | 
						if (r)
 | 
				
			||||||
 | 
							goto error;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						return drm_sched_entity_init(&vm->delayed, DRM_SCHED_PRIORITY_NORMAL,
 | 
				
			||||||
 | 
									     adev->vm_manager.vm_pte_scheds,
 | 
				
			||||||
 | 
									     adev->vm_manager.vm_pte_num_scheds, NULL);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					error:
 | 
				
			||||||
 | 
						drm_sched_entity_destroy(&vm->immediate);
 | 
				
			||||||
 | 
						return r;
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					/* Destroy the entities for page table updates again */
 | 
				
			||||||
 | 
					static void amdgpu_vm_fini_entities(struct amdgpu_vm *vm)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						drm_sched_entity_destroy(&vm->immediate);
 | 
				
			||||||
 | 
						drm_sched_entity_destroy(&vm->delayed);
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
/**
 | 
					/**
 | 
				
			||||||
 * amdgpu_vm_validate_pt_bos - validate the page table BOs
 | 
					 * amdgpu_vm_validate_pt_bos - validate the page table BOs
 | 
				
			||||||
 *
 | 
					 *
 | 
				
			||||||
| 
						 | 
					@ -373,6 +427,14 @@ int amdgpu_vm_validate_pt_bos(struct amdgpu_device *adev, struct amdgpu_vm *vm,
 | 
				
			||||||
	struct amdgpu_bo *bo;
 | 
						struct amdgpu_bo *bo;
 | 
				
			||||||
	int r;
 | 
						int r;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
						if (drm_sched_entity_error(&vm->delayed)) {
 | 
				
			||||||
 | 
							amdgpu_vm_bo_reset_state_machine(vm);
 | 
				
			||||||
 | 
							amdgpu_vm_fini_entities(vm);
 | 
				
			||||||
 | 
							r = amdgpu_vm_init_entities(adev, vm);
 | 
				
			||||||
 | 
							if (r)
 | 
				
			||||||
 | 
								return r;
 | 
				
			||||||
 | 
						}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	spin_lock(&vm->status_lock);
 | 
						spin_lock(&vm->status_lock);
 | 
				
			||||||
	while (!list_empty(&vm->evicted)) {
 | 
						while (!list_empty(&vm->evicted)) {
 | 
				
			||||||
		bo_base = list_first_entry(&vm->evicted,
 | 
							bo_base = list_first_entry(&vm->evicted,
 | 
				
			||||||
| 
						 | 
					@ -2048,19 +2110,10 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 | 
				
			||||||
	INIT_LIST_HEAD(&vm->pt_freed);
 | 
						INIT_LIST_HEAD(&vm->pt_freed);
 | 
				
			||||||
	INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
 | 
						INIT_WORK(&vm->pt_free_work, amdgpu_vm_pt_free_work);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	/* create scheduler entities for page table updates */
 | 
						r = amdgpu_vm_init_entities(adev, vm);
 | 
				
			||||||
	r = drm_sched_entity_init(&vm->immediate, DRM_SCHED_PRIORITY_NORMAL,
 | 
					 | 
				
			||||||
				  adev->vm_manager.vm_pte_scheds,
 | 
					 | 
				
			||||||
				  adev->vm_manager.vm_pte_num_scheds, NULL);
 | 
					 | 
				
			||||||
	if (r)
 | 
						if (r)
 | 
				
			||||||
		return r;
 | 
							return r;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	r = drm_sched_entity_init(&vm->delayed, DRM_SCHED_PRIORITY_NORMAL,
 | 
					 | 
				
			||||||
				  adev->vm_manager.vm_pte_scheds,
 | 
					 | 
				
			||||||
				  adev->vm_manager.vm_pte_num_scheds, NULL);
 | 
					 | 
				
			||||||
	if (r)
 | 
					 | 
				
			||||||
		goto error_free_immediate;
 | 
					 | 
				
			||||||
 | 
					 | 
				
			||||||
	vm->pte_support_ats = false;
 | 
						vm->pte_support_ats = false;
 | 
				
			||||||
	vm->is_compute_context = false;
 | 
						vm->is_compute_context = false;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					@ -2121,10 +2174,7 @@ int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 | 
				
			||||||
error_free_delayed:
 | 
					error_free_delayed:
 | 
				
			||||||
	dma_fence_put(vm->last_tlb_flush);
 | 
						dma_fence_put(vm->last_tlb_flush);
 | 
				
			||||||
	dma_fence_put(vm->last_unlocked);
 | 
						dma_fence_put(vm->last_unlocked);
 | 
				
			||||||
	drm_sched_entity_destroy(&vm->delayed);
 | 
						amdgpu_vm_fini_entities(vm);
 | 
				
			||||||
 | 
					 | 
				
			||||||
error_free_immediate:
 | 
					 | 
				
			||||||
	drm_sched_entity_destroy(&vm->immediate);
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	return r;
 | 
						return r;
 | 
				
			||||||
}
 | 
					}
 | 
				
			||||||
| 
						 | 
					@ -2277,8 +2327,7 @@ void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm *vm)
 | 
				
			||||||
	amdgpu_bo_unref(&root);
 | 
						amdgpu_bo_unref(&root);
 | 
				
			||||||
	WARN_ON(vm->root.bo);
 | 
						WARN_ON(vm->root.bo);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
	drm_sched_entity_destroy(&vm->immediate);
 | 
						amdgpu_vm_fini_entities(vm);
 | 
				
			||||||
	drm_sched_entity_destroy(&vm->delayed);
 | 
					 | 
				
			||||||
 | 
					
 | 
				
			||||||
	if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
 | 
						if (!RB_EMPTY_ROOT(&vm->va.rb_root)) {
 | 
				
			||||||
		dev_err(adev->dev, "still active bo inside vm\n");
 | 
							dev_err(adev->dev, "still active bo inside vm\n");
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
		Reference in a new issue