sched: Fix delayed_dequeue vs switched_from_fair()

Commit 2e0199df252a ("sched/fair: Prepare exit/cleanup paths for delayed_dequeue") and its follow up fixes try to deal with a rather unfortunate situation where is task is enqueued in a new class, even though it shouldn't have been. Mostly because the existing ->switched_to/from() hooks are in the wrong place for this case. This all led to Paul being able to trigger failures at something like once per 10k CPU hours of RCU torture. For now, do the ugly thing and move the code to the right place by ignoring the switch hooks. Note: Clean up the whole sched_class::switch*_{to,from}() thing. Fixes: 2e0199df252a ("sched/fair: Prepare exit/cleanup paths for delayed_dequeue") Reported-by: Paul E. McKenney <paulmck@kernel.org> Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> Link: https://lkml.kernel.org/r/20241003185037.GA5594@noisy.programming.kicks-ass.net
author: Peter Zijlstra <peterz@infradead.org> 2024-10-10 11:54:38 +0200
committer: Peter Zijlstra <peterz@infradead.org> 2024-10-11 10:49:32 +0200
commit: 98442f0ccd828ac42e89281a815e9e7a97533822 (patch)
tree: 17881a78dd58ce6979825e4f4b3c8f4906b3f4f3 /kernel
parent: sched/core: Disable page allocation in task_tick_mm_cid() (diff)
download: linux-98442f0ccd828ac42e89281a815e9e7a97533822.tar.xz
linux-98442f0ccd828ac42e89281a815e9e7a97533822.zip
5 files changed, 30 insertions, 34 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0259301e572e..a860996622a6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7010,20 +7010,20 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag
 }
 EXPORT_SYMBOL(default_wake_function);
 
-void __setscheduler_prio(struct task_struct *p, int prio)
+const struct sched_class *__setscheduler_class(struct task_struct *p, int prio)
 {
 	if (dl_prio(prio))
-		p->sched_class = &dl_sched_class;
-	else if (rt_prio(prio))
-		p->sched_class = &rt_sched_class;
+		return &dl_sched_class;
+
+	if (rt_prio(prio))
+		return &rt_sched_class;
+
 #ifdef CONFIG_SCHED_CLASS_EXT
-	else if (task_should_scx(p))
-		p->sched_class = &ext_sched_class;
+	if (task_should_scx(p))
+		return &ext_sched_class;
 #endif
-	else
-		p->sched_class = &fair_sched_class;
 
-	p->prio = prio;
+	return &fair_sched_class;
 }
 
 #ifdef CONFIG_RT_MUTEXES
@@ -7069,7 +7069,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 {
 	int prio, oldprio, queued, running, queue_flag =
 		DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
-	const struct sched_class *prev_class;
+	const struct sched_class *prev_class, *next_class;
 	struct rq_flags rf;
 	struct rq *rq;
 
@@ -7127,6 +7127,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 		queue_flag &= ~DEQUEUE_MOVE;
 
 	prev_class = p->sched_class;
+	next_class = __setscheduler_class(p, prio);
+
+	if (prev_class != next_class && p->se.sched_delayed)
+		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
@@ -7164,7 +7169,9 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
 			p->rt.timeout = 0;
 	}
 
-	__setscheduler_prio(p, prio);
+	p->sched_class = next_class;
+	p->prio = prio;
+
 	check_class_changing(rq, p, prev_class);
 
 	if (queued)
diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c
index 3cd7c50a51c5..6f9de573ee93 100644
--- a/kernel/sched/ext.c
+++ b/kernel/sched/ext.c
@@ -4471,7 +4471,7 @@ static void scx_ops_disable_workfn(struct kthread_work *work)
 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
 		p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL);
-		__setscheduler_prio(p, p->prio);
+		p->sched_class = __setscheduler_class(p, p->prio);
 		check_class_changing(task_rq(p), p, old_class);
 
 		sched_enq_and_set_task(&ctx);
@@ -5186,7 +5186,7 @@ static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link)
 
 		sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx);
 
-		__setscheduler_prio(p, p->prio);
+		p->sched_class = __setscheduler_class(p, p->prio);
 		check_class_changing(task_rq(p), p, old_class);
 
 		sched_enq_and_set_task(&ctx);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index ab497fafa7be..c157d4860a3b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -13177,22 +13177,6 @@ static void attach_task_cfs_rq(struct task_struct *p)
 static void switched_from_fair(struct rq *rq, struct task_struct *p)
 {
 	detach_task_cfs_rq(p);
-	/*
-	 * Since this is called after changing class, this is a little weird
-	 * and we cannot use DEQUEUE_DELAYED.
-	 */
-	if (p->se.sched_delayed) {
-		/* First, dequeue it from its new class' structures */
-		dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP);
-		/*
-		 * Now, clean up the fair_sched_class side of things
-		 * related to sched_delayed being true and that wasn't done
-		 * due to the generic dequeue not using DEQUEUE_DELAYED.
-		 */
-		finish_delayed_dequeue_entity(&p->se);
-		p->se.rel_deadline = 0;
-		__block_task(rq, p);
-	}
 }
 
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b1c3588a8f00..fba524c81c63 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -3797,7 +3797,7 @@ static inline int rt_effective_prio(struct task_struct *p, int prio)
 
 extern int __sched_setscheduler(struct task_struct *p, const struct sched_attr *attr, bool user, bool pi);
 extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *ctx);
-extern void __setscheduler_prio(struct task_struct *p, int prio);
+extern const struct sched_class *__setscheduler_class(struct task_struct *p, int prio);
 extern void set_load_weight(struct task_struct *p, bool update_load);
 extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags);
 extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags);
diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c
index aa70beee9895..0470bcc3d204 100644
--- a/kernel/sched/syscalls.c
+++ b/kernel/sched/syscalls.c
@@ -529,7 +529,7 @@ int __sched_setscheduler(struct task_struct *p,
 {
 	int oldpolicy = -1, policy = attr->sched_policy;
 	int retval, oldprio, newprio, queued, running;
-	const struct sched_class *prev_class;
+	const struct sched_class *prev_class, *next_class;
 	struct balance_callback *head;
 	struct rq_flags rf;
 	int reset_on_fork;
@@ -706,6 +706,12 @@ change:
 			queue_flags &= ~DEQUEUE_MOVE;
 	}
 
+	prev_class = p->sched_class;
+	next_class = __setscheduler_class(p, newprio);
+
+	if (prev_class != next_class && p->se.sched_delayed)
+		dequeue_task(rq, p, DEQUEUE_SLEEP | DEQUEUE_DELAYED | DEQUEUE_NOCLOCK);
+
 	queued = task_on_rq_queued(p);
 	running = task_current(rq, p);
 	if (queued)
@@ -713,11 +719,10 @@ change:
 	if (running)
 		put_prev_task(rq, p);
 
-	prev_class = p->sched_class;
-
 	if (!(attr->sched_flags & SCHED_FLAG_KEEP_PARAMS)) {
 		__setscheduler_params(p, attr);
-		__setscheduler_prio(p, newprio);
+		p->sched_class = next_class;
+		p->prio = newprio;
 	}
 	__setscheduler_uclamp(p, attr);
 	check_class_changing(rq, p, prev_class);
author	Peter Zijlstra <peterz@infradead.org>	2024-10-10 11:54:38 +0200
committer	Peter Zijlstra <peterz@infradead.org>	2024-10-11 10:49:32 +0200
commit	98442f0ccd828ac42e89281a815e9e7a97533822 (patch)
tree	17881a78dd58ce6979825e4f4b3c8f4906b3f4f3 /kernel
parent	sched/core: Disable page allocation in task_tick_mm_cid() (diff)
download	linux-98442f0ccd828ac42e89281a815e9e7a97533822.tar.xz linux-98442f0ccd828ac42e89281a815e9e7a97533822.zip