drm/sched: Avoid lockdep spalt on killing a processes

Probelm: Singlaning one sched fence from within another's sched fence singal callback generates lockdep splat because the both have same lockdep class of their fence->lock Fix: Fix bellow stack by rescheduling to irq work of signaling and killing of jobs that left when entity is killed. [11176.741181] dump_stack+0x10/0x12 [11176.741186] __lock_acquire.cold+0x208/0x2df [11176.741197] lock_acquire+0xc6/0x2d0 [11176.741204] ? dma_fence_signal+0x28/0x80 [11176.741212] _raw_spin_lock_irqsave+0x4d/0x70 [11176.741219] ? dma_fence_signal+0x28/0x80 [11176.741225] dma_fence_signal+0x28/0x80 [11176.741230] drm_sched_fence_finished+0x12/0x20 [gpu_sched] [11176.741240] drm_sched_entity_kill_jobs_cb+0x1c/0x50 [gpu_sched] [11176.741248] dma_fence_signal_timestamp_locked+0xac/0x1a0 [11176.741254] dma_fence_signal+0x3b/0x80 [11176.741260] drm_sched_fence_finished+0x12/0x20 [gpu_sched] [11176.741268] drm_sched_job_done.isra.0+0x7f/0x1a0 [gpu_sched] [11176.741277] drm_sched_job_done_cb+0x12/0x20 [gpu_sched] [11176.741284] dma_fence_signal_timestamp_locked+0xac/0x1a0 [11176.741290] dma_fence_signal+0x3b/0x80 [11176.741296] amdgpu_fence_process+0xd1/0x140 [amdgpu] [11176.741504] sdma_v4_0_process_trap_irq+0x8c/0xb0 [amdgpu] [11176.741731] amdgpu_irq_dispatch+0xce/0x250 [amdgpu] [11176.741954] amdgpu_ih_process+0x81/0x100 [amdgpu] [11176.742174] amdgpu_irq_handler+0x26/0xa0 [amdgpu] [11176.742393] __handle_irq_event_percpu+0x4f/0x2c0 [11176.742402] handle_irq_event_percpu+0x33/0x80 [11176.742408] handle_irq_event+0x39/0x60 [11176.742414] handle_edge_irq+0x93/0x1d0 [11176.742419] __common_interrupt+0x50/0xe0 [11176.742426] common_interrupt+0x80/0x90 Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Suggested-by: Daniel Vetter <daniel.vetter@ffwll.ch> Suggested-by: Christian König <christian.koenig@amd.com> Tested-by: Christian König <christian.koenig@amd.com> Reviewed-by: Christian König <christian.koenig@amd.com> Link: https://www.spinics.net/lists/dri-devel/msg321250.html
author: Andrey Grodzovsky <andrey.grodzovsky@amd.com> 2021-10-28 18:24:03 +0200
committer: Andrey Grodzovsky <andrey.grodzovsky@amd.com> 2021-11-01 16:08:21 +0100
commit: 542cff7893a37445f98ece26aeb3c9c1055e9ea4 (patch)
tree: 99236b129149a7482c2ae3c247c1af2d9411bc7f /include/drm/gpu_scheduler.h
parent: drm/ingenic: Remove bogus register write (diff)
download: linux-542cff7893a37445f98ece26aeb3c9c1055e9ea4.tar.xz
linux-542cff7893a37445f98ece26aeb3c9c1055e9ea4.zip
1 files changed, 11 insertions, 1 deletions
diff --git a/include/drm/gpu_scheduler.h b/include/drm/gpu_scheduler.h
index f011e4c407f2..bbc22fad8d80 100644
--- a/include/drm/gpu_scheduler.h
+++ b/include/drm/gpu_scheduler.h
@@ -28,6 +28,7 @@
 #include <linux/dma-fence.h>
 #include <linux/completion.h>
 #include <linux/xarray.h>
+#include <linux/irq_work.h>
 
 #define MAX_WAIT_SCHED_ENTITY_Q_EMPTY msecs_to_jiffies(1000)
 
@@ -286,7 +287,16 @@ struct drm_sched_job {
 	struct list_head		list;
 	struct drm_gpu_scheduler	*sched;
 	struct drm_sched_fence		*s_fence;
-	struct dma_fence_cb		finish_cb;
+
+	/*
+	 * work is used only after finish_cb has been used and will not be
+	 * accessed anymore.
+	 */
+	union {
+		struct dma_fence_cb		finish_cb;
+		struct irq_work 		work;
+	};
+
 	uint64_t			id;
 	atomic_t			karma;
 	enum drm_sched_priority		s_priority;
author	Andrey Grodzovsky <andrey.grodzovsky@amd.com>	2021-10-28 18:24:03 +0200
committer	Andrey Grodzovsky <andrey.grodzovsky@amd.com>	2021-11-01 16:08:21 +0100
commit	542cff7893a37445f98ece26aeb3c9c1055e9ea4 (patch)
tree	99236b129149a7482c2ae3c247c1af2d9411bc7f /include/drm/gpu_scheduler.h
parent	drm/ingenic: Remove bogus register write (diff)
download	linux-542cff7893a37445f98ece26aeb3c9c1055e9ea4.tar.xz linux-542cff7893a37445f98ece26aeb3c9c1055e9ea4.zip