summaryrefslogtreecommitdiffstats
path: root/drivers/gpu
diff options
context:
space:
mode:
authorChristian König <christian.koenig@amd.com>2026-04-20 16:08:35 +0200
committerAlex Deucher <alexander.deucher@amd.com>2026-05-11 17:47:11 -0400
commit0071e01c61aa73f22edd5252f0a3e2c2eff744d6 (patch)
tree3d16edeb186cf6efb929db9c657c02b93078a649 /drivers/gpu
parentd0053441ad7eaf7920b71e2263097ece53c5af34 (diff)
downloadlinux-0071e01c61aa73f22edd5252f0a3e2c2eff744d6.tar.gz
linux-0071e01c61aa73f22edd5252f0a3e2c2eff744d6.zip
drm/amdgpu: fix userq hang detection and reset
Fix lock inversions pointed out by Prike and Sunil. The hang detection timeout *CAN'T* grab locks under which we wait for fences, especially not the userq_mutex lock. Then instead of this completely broken handling with the hang_detect_fence just cancel the work when fences are processed and re-start if necessary. Signed-off-by: Christian König <christian.koenig@amd.com> Reviewed-by: Sunil Khatri <sunil.khatri@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com> (cherry picked from commit 1b62077f045ac6ffde7c97005c6659569ac5c1ec)
Diffstat (limited to 'drivers/gpu')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c65
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h1
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c17
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h2
4 files changed, 40 insertions, 45 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index ba03d2a42e1e..70d74f04d2dd 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -106,9 +106,6 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
int r = 0;
int i;
- /* Warning if current process mutex is not held */
- WARN_ON(!mutex_is_locked(&uq_mgr->userq_mutex));
-
if (unlikely(adev->debug_disable_gpu_ring_reset)) {
dev_err(adev->dev, "userq reset disabled by debug mask\n");
return 0;
@@ -127,9 +124,11 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
*/
for (i = 0; i < num_queue_types; i++) {
int ring_type = queue_types[i];
- const struct amdgpu_userq_funcs *funcs = adev->userq_funcs[ring_type];
+ const struct amdgpu_userq_funcs *funcs =
+ adev->userq_funcs[ring_type];
- if (!amdgpu_userq_is_reset_type_supported(adev, ring_type, AMDGPU_RESET_TYPE_PER_QUEUE))
+ if (!amdgpu_userq_is_reset_type_supported(adev, ring_type,
+ AMDGPU_RESET_TYPE_PER_QUEUE))
continue;
if (atomic_read(&uq_mgr->userq_count[ring_type]) > 0 &&
@@ -150,38 +149,22 @@ amdgpu_userq_detect_and_reset_queues(struct amdgpu_userq_mgr *uq_mgr)
static void amdgpu_userq_hang_detect_work(struct work_struct *work)
{
- struct amdgpu_usermode_queue *queue = container_of(work,
- struct amdgpu_usermode_queue,
- hang_detect_work.work);
- struct dma_fence *fence;
- struct amdgpu_userq_mgr *uq_mgr;
-
- if (!queue->userq_mgr)
- return;
-
- uq_mgr = queue->userq_mgr;
- fence = READ_ONCE(queue->hang_detect_fence);
- /* Fence already signaled – no action needed */
- if (!fence || dma_fence_is_signaled(fence))
- return;
+ struct amdgpu_usermode_queue *queue =
+ container_of(work, struct amdgpu_usermode_queue,
+ hang_detect_work.work);
- mutex_lock(&uq_mgr->userq_mutex);
- amdgpu_userq_detect_and_reset_queues(uq_mgr);
- mutex_unlock(&uq_mgr->userq_mutex);
+ amdgpu_userq_detect_and_reset_queues(queue->userq_mgr);
}
/*
* Start hang detection for a user queue fence. A delayed work will be scheduled
- * to check if the fence is still pending after the timeout period.
-*/
+ * to reset the queues when the fence doesn't signal in time.
+ */
void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue)
{
struct amdgpu_device *adev;
unsigned long timeout_ms;
- if (!queue || !queue->userq_mgr || !queue->userq_mgr->adev)
- return;
-
adev = queue->userq_mgr->adev;
/* Determine timeout based on queue type */
switch (queue->queue_type) {
@@ -199,8 +182,6 @@ void amdgpu_userq_start_hang_detect_work(struct amdgpu_usermode_queue *queue)
break;
}
- /* Store the fence to monitor and schedule hang detection */
- WRITE_ONCE(queue->hang_detect_fence, queue->last_fence);
schedule_delayed_work(&queue->hang_detect_work,
msecs_to_jiffies(timeout_ms));
}
@@ -210,18 +191,24 @@ void amdgpu_userq_process_fence_irq(struct amdgpu_device *adev, u32 doorbell)
struct xarray *xa = &adev->userq_doorbell_xa;
struct amdgpu_usermode_queue *queue;
unsigned long flags;
+ int r;
xa_lock_irqsave(xa, flags);
queue = xa_load(xa, doorbell);
- if (queue)
- amdgpu_userq_fence_driver_process(queue->fence_drv);
- xa_unlock_irqrestore(xa, flags);
-}
+ if (queue) {
+ r = amdgpu_userq_fence_driver_process(queue->fence_drv);
+ /*
+ * We are in interrupt context here, this *can't* wait for
+ * reset work to finish.
+ */
+ if (r >= 0)
+ cancel_delayed_work(&queue->hang_detect_work);
-static void amdgpu_userq_init_hang_detect_work(struct amdgpu_usermode_queue *queue)
-{
- INIT_DELAYED_WORK(&queue->hang_detect_work, amdgpu_userq_hang_detect_work);
- queue->hang_detect_fence = NULL;
+ /* Restart the timer when there are still fences pending */
+ if (r == 1)
+ amdgpu_userq_start_hang_detect_work(queue);
+ }
+ xa_unlock_irqrestore(xa, flags);
}
static int amdgpu_userq_buffer_va_list_add(struct amdgpu_usermode_queue *queue,
@@ -640,7 +627,6 @@ amdgpu_userq_destroy(struct amdgpu_userq_mgr *uq_mgr, struct amdgpu_usermode_que
amdgpu_bo_unreserve(vm->root.bo);
mutex_lock(&uq_mgr->userq_mutex);
- queue->hang_detect_fence = NULL;
amdgpu_userq_wait_for_last_fence(queue);
#if defined(CONFIG_DEBUG_FS)
@@ -847,7 +833,8 @@ amdgpu_userq_create(struct drm_file *filp, union drm_amdgpu_userq *args)
up_read(&adev->reset_domain->sem);
amdgpu_debugfs_userq_init(filp, queue, qid);
- amdgpu_userq_init_hang_detect_work(queue);
+ INIT_DELAYED_WORK(&queue->hang_detect_work,
+ amdgpu_userq_hang_detect_work);
args->out.queue_id = qid;
atomic_inc(&uq_mgr->userq_count[queue->queue_type]);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 843ea8ecc5d7..85f460e7c31b 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -85,7 +85,6 @@ struct amdgpu_usermode_queue {
int priority;
struct dentry *debugfs_queue;
struct delayed_work hang_detect_work;
- struct dma_fence *hang_detect_fence;
struct kref refcount;
struct list_head userq_va_list;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
index c9dbf03c4eea..53a8944bab05 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.c
@@ -135,7 +135,14 @@ amdgpu_userq_fence_put_fence_drv_array(struct amdgpu_userq_fence *userq_fence)
userq_fence->fence_drv_array_count = 0;
}
-void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv)
+/*
+ * Returns:
+ * -ENOENT when no fences were processes
+ * 1 when more fences are pending
+ * 0 when no fences are pending any more
+ */
+int
+amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv)
{
struct amdgpu_userq_fence *userq_fence, *tmp;
LIST_HEAD(to_be_signaled);
@@ -143,9 +150,6 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
unsigned long flags;
u64 rptr;
- if (!fence_drv)
- return;
-
spin_lock_irqsave(&fence_drv->fence_list_lock, flags);
rptr = amdgpu_userq_fence_read(fence_drv);
@@ -158,6 +162,9 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
&userq_fence->link);
spin_unlock_irqrestore(&fence_drv->fence_list_lock, flags);
+ if (list_empty(&to_be_signaled))
+ return -ENOENT;
+
list_for_each_entry_safe(userq_fence, tmp, &to_be_signaled, link) {
fence = &userq_fence->base;
list_del_init(&userq_fence->link);
@@ -169,6 +176,8 @@ void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_d
dma_fence_put(fence);
}
+ /* That doesn't need to be accurate so no locking */
+ return list_empty(&fence_drv->fences) ? 0 : 1;
}
void amdgpu_userq_fence_driver_destroy(struct kref *ref)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h
index d355a0eecc07..0bd51616cef1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq_fence.h
@@ -63,7 +63,7 @@ void amdgpu_userq_fence_driver_put(struct amdgpu_userq_fence_driver *fence_drv);
int amdgpu_userq_fence_driver_alloc(struct amdgpu_device *adev,
struct amdgpu_userq_fence_driver **fence_drv_req);
void amdgpu_userq_fence_driver_free(struct amdgpu_usermode_queue *userq);
-void amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv);
+int amdgpu_userq_fence_driver_process(struct amdgpu_userq_fence_driver *fence_drv);
void amdgpu_userq_fence_driver_force_completion(struct amdgpu_usermode_queue *userq);
void amdgpu_userq_fence_driver_destroy(struct kref *ref);
int amdgpu_userq_signal_ioctl(struct drm_device *dev, void *data,