From f4322b9f8ad5f9f62add288c785d2e10bb6a5efe Mon Sep 17 00:00:00 2001 From: Yunxiang Li Date: Mon, 22 Apr 2024 14:59:02 -0400 Subject: drm/amdgpu: Fix two reset triggered in a row Some times a hang GPU causes multiple reset sources to schedule resets. The second source will be able to trigger an unnecessary reset if they schedule after we call amdgpu_device_stop_pending_resets. Move amdgpu_device_stop_pending_resets to after the reset is done. Since at this point the GPU is supposedly in a good state, any reset scheduled after this point would be a legitimate reset. Remove unnecessary and incorrect checks for amdgpu_in_reset that was kinda serving this purpose. Signed-off-by: Yunxiang Li Reviewed-by: Lijo Lazar Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c') diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 0c7275bca8f7..c5ba9c4757a8 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -319,7 +319,7 @@ static int xgpu_ai_mailbox_rcv_irq(struct amdgpu_device *adev, switch (event) { case IDH_FLR_NOTIFICATION: - if (amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) + if (amdgpu_sriov_runtime(adev)) WARN_ONCE(!amdgpu_reset_domain_schedule(adev->reset_domain, &adev->virt.flr_work), "Failed to queue work! at %s", -- cgit v1.2.3 From 25c01191c2555351922e5515b6b6d31357975031 Mon Sep 17 00:00:00 2001 From: Yunxiang Li Date: Mon, 22 Apr 2024 14:44:38 -0400 Subject: drm/amdgpu: Add reset_context flag for host FLR There are other reset sources that pass NULL as the job pointer, such as amdgpu_amdkfd_reset_work. Therefore, using the job pointer to check if the FLR comes from the host does not work. Add a flag in reset_context to explicitly mark host triggered reset, and set this flag when we receive host reset notification. Signed-off-by: Yunxiang Li Reviewed-by: Emily Deng Reviewed-by: Zhigang Luo Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 13 ++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 1 + drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 1 + drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 1 + drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c | 1 + 5 files changed, 12 insertions(+), 5 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 8befd10bf007..33c889c027a5 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -5055,13 +5055,13 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev) * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf * * @adev: amdgpu_device pointer - * @from_hypervisor: request from hypervisor + * @reset_context: amdgpu reset context pointer * * do VF FLR and reinitialize Asic * return 0 means succeeded otherwise failed */ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, - bool from_hypervisor) + struct amdgpu_reset_context *reset_context) { int r; struct amdgpu_hive_info *hive = NULL; @@ -5070,12 +5070,15 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, retry: amdgpu_amdkfd_pre_reset(adev); - if (from_hypervisor) + if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) { + clear_bit(AMDGPU_HOST_FLR, &reset_context->flags); r = amdgpu_virt_request_full_gpu(adev, true); - else + } else { r = amdgpu_virt_reset_gpu(adev); + } if (r) return r; + amdgpu_ras_set_fed(adev, false); amdgpu_irq_gpu_reset_resume_helper(adev); @@ -5826,7 +5829,7 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ /* Actual ASIC resets if needed.*/ /* Host driver will handle XGMI hive reset for SRIOV */ if (amdgpu_sriov_vf(adev)) { - r = amdgpu_device_reset_sriov(adev, job ? false : true); + r = amdgpu_device_reset_sriov(adev, reset_context); if (r) adev->asic_reset_res = r; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h index b11d190ece53..5a9cc043b858 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h @@ -33,6 +33,7 @@ enum AMDGPU_RESET_FLAGS { AMDGPU_NEED_FULL_RESET = 0, AMDGPU_SKIP_HW_RESET = 1, AMDGPU_SKIP_COREDUMP = 2, + AMDGPU_HOST_FLR = 3, }; struct amdgpu_reset_context { diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index c5ba9c4757a8..f4c47492e0cd 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -292,6 +292,7 @@ flr_done: reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + set_bit(AMDGPU_HOST_FLR, &reset_context.flags); amdgpu_device_gpu_recover(adev, NULL, &reset_context); } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index fb7cf4214e3a..37b49a5ed2a1 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -328,6 +328,7 @@ flr_done: reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + set_bit(AMDGPU_HOST_FLR, &reset_context.flags); amdgpu_device_gpu_recover(adev, NULL, &reset_context); } diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c index 14a065516ae4..78cd07744ebe 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_vi.c @@ -529,6 +529,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work) reset_context.method = AMD_RESET_METHOD_NONE; reset_context.reset_req_dev = adev; clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); + set_bit(AMDGPU_HOST_FLR, &reset_context.flags); amdgpu_device_gpu_recover(adev, NULL, &reset_context); } -- cgit v1.2.3