
Commit 21c8a13

Committed Mar 27, 2025
WIP: scx_bpfland: Introduce per-CPU DSQs
Make the scheduler very conservative about reusing the same CPU. This can improve latency-sensitive workloads at the cost of reducing system responsiveness.

Signed-off-by: Andrea Righi <arighi@nvidia.com>
1 parent 95618f7 commit 21c8a13
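In short, the commit replaces the per-node shared DSQs with one DSQ per CPU, keyed by the CPU id: the DSQs are created in ops.init(), ops.enqueue() queues each task on the DSQ of the CPU chosen for it (falling back to the CPU it last ran on), and ops.dispatch() consumes only the DSQ matching the local CPU. The fragment below is a minimal sketch of that flow, not scx_bpfland itself: the op names are placeholders, the deadline/slice logic is omitted, and it assumes the scheduler's own globals (nr_cpu_ids, slice_max) plus the scx/compat helpers already used in this file.

/* Minimal per-CPU DSQ sketch (placeholder ops, details omitted). */

s32 BPF_STRUCT_OPS_SLEEPABLE(percpu_dsq_init)
{
        s32 cpu;
        int err;

        /* One DSQ per CPU, DSQ id == CPU id, placed on the CPU's NUMA node. */
        bpf_for(cpu, 0, nr_cpu_ids) {
                err = scx_bpf_create_dsq(cpu, __COMPAT_scx_bpf_cpu_node(cpu));
                if (err)
                        return err;
        }
        return 0;
}

void BPF_STRUCT_OPS(percpu_dsq_enqueue, struct task_struct *p, u64 enq_flags)
{
        /* Queue on the DSQ of the CPU the task last ran on. */
        s32 cpu = scx_bpf_task_cpu(p);

        /* vtime 0 for simplicity; scx_bpfland passes a deadline here. */
        scx_bpf_dsq_insert_vtime(p, cpu, slice_max, 0, enq_flags);
        scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}

void BPF_STRUCT_OPS(percpu_dsq_dispatch, s32 cpu, struct task_struct *prev)
{
        /* Each CPU consumes only from its own DSQ. */
        scx_bpf_dsq_move_to_local(cpu);
}

Keeping one DSQ per CPU makes the backlog check (scx_bpf_dsq_nr_queued(cpu), see nr_tasks_waiting() in the diff) purely local, which is what lets the enqueue path stay conservative about moving tasks away from the CPU they were queued on.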

File tree: 1 file changed (+58 -228 lines)


scheds/rust/scx_bpfland/src/bpf/main.bpf.c

+58 -228
@@ -252,9 +252,9 @@ static int calloc_cpumask(struct bpf_cpumask **p_cpumask)
 /*
  * Return the total amount of tasks that are currently waiting to be scheduled.
  */
-static u64 nr_tasks_waiting(int node)
+static u64 nr_tasks_waiting(s32 cpu)
 {
-        return scx_bpf_dsq_nr_queued(node) + 1;
+        return scx_bpf_dsq_nr_queued(cpu) + 1;
 }
 
 /*
@@ -388,7 +388,6 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
         const struct cpumask *primary, *p_mask, *l2_mask, *l3_mask;
         struct task_struct *current = (void *)bpf_get_current_task_btf();
         struct task_ctx *tctx;
-        bool is_prev_llc_affine = false;
         int node;
         s32 cpu;
 
@@ -400,6 +399,11 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
         if (!tctx)
                 return -ENOENT;
 
+        /*
+         * Refresh task domain based on the previously used cpu.
+         */
+        task_set_domain(p, prev_cpu, p->cpus_ptr);
+
         /*
          * Task's scheduling domains.
          */
@@ -464,13 +468,6 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
                 goto out_put_cpumask;
         }
 
-        /*
-         * Migrate the wakee to the same domain as the waker in case of
-         * a sync wakeup.
-         */
-        if (!share_llc)
-                task_set_domain(p, cpu, p->cpus_ptr);
-
         /*
          * If the waker's L3 domain is not saturated attempt to migrate
         * the wakee on the same CPU as the waker (since it's going to
@@ -486,27 +483,19 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
         }
 
         /*
-         * Check if the previously used CPU is still in the L3 task domain. If
-         * not, we may want to move the task back to its original L3 domain.
+         * If the task can still run on the previously used CPU, keep using
+         * it.
          */
-        is_prev_llc_affine = bpf_cpumask_test_cpu(prev_cpu, l3_mask);
+        if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
+                cpu = prev_cpu;
+                *is_idle = true;
+                goto out_put_cpumask;
+        }
 
         /*
          * Find the best idle CPU, prioritizing full idle cores in SMT systems.
          */
         if (smt_enabled) {
-                /*
-                 * If the task can still run on the previously used CPU and
-                 * it's a full-idle core, keep using it.
-                 */
-                if (is_prev_llc_affine &&
-                    bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) &&
-                    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
-                        cpu = prev_cpu;
-                        *is_idle = true;
-                        goto out_put_cpumask;
-                }
-
                 /*
                  * Search for any full-idle CPU in the primary domain that
                  * shares the same L2 cache.
@@ -530,37 +519,17 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
                 }
 
                 /*
-                 * Search for any other full-idle core in the primary domain.
+                 * Search for any other full-idle core in the same node and
+                 * primary domain.
                  */
                 cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p_mask, node,
-                                                          SCX_PICK_IDLE_CORE);
-                if (cpu >= 0) {
-                        *is_idle = true;
-                        goto out_put_cpumask;
-                }
-
-                /*
-                 * Search for any full-idle core usable by the task.
-                 */
-                cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node,
-                                                          SCX_PICK_IDLE_CORE);
+                                SCX_PICK_IDLE_CORE | __COMPAT_SCX_PICK_IDLE_IN_NODE);
                 if (cpu >= 0) {
                         *is_idle = true;
                         goto out_put_cpumask;
                 }
         }
 
-        /*
-         * If a full-idle core can't be found (or if this is not an SMT system)
-         * try to re-use the same CPU, even if it's not in a full-idle core.
-         */
-        if (is_prev_llc_affine &&
-            scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
-                cpu = prev_cpu;
-                *is_idle = true;
-                goto out_put_cpumask;
-        }
-
         /*
          * Search for any idle CPU in the primary domain that shares the same
          * L2 cache.
@@ -601,23 +570,6 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
                 goto out_put_cpumask;
         }
 
-        /*
-         * We couldn't find any idle CPU, return the previous CPU if it is in
-         * the task's L3 domain.
-         */
-        if (is_prev_llc_affine) {
-                cpu = prev_cpu;
-                goto out_put_cpumask;
-        }
-
-        /*
-         * Otherwise, return a random CPU in the task's L3 domain (if
-         * available).
-         */
-        cpu = bpf_cpumask_any_distribute(l3_mask);
-        if (cpu >= nr_cpu_ids)
-                cpu = prev_cpu;
-
 out_put_cpumask:
         scx_bpf_put_cpumask(idle_cpumask);
         scx_bpf_put_cpumask(idle_smtmask);
@@ -646,9 +598,7 @@ s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p,
 
         cpu = pick_idle_cpu(p, prev_cpu, wake_flags, &is_idle);
         if (is_idle) {
-                int node = __COMPAT_scx_bpf_cpu_node(cpu);
-
-                if (local_pcpu || !scx_bpf_dsq_nr_queued(node)) {
+                if (local_pcpu || !scx_bpf_dsq_nr_queued(cpu)) {
                         scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_max, 0);
                         __sync_fetch_and_add(&nr_direct_dispatches, 1);
                 }
@@ -658,166 +608,54 @@ s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p,
 }
 
 /*
- * Try to wake up an idle CPU that can immediately process the task.
+ * Try to migrate a task on a different CPU.
  *
- * Return true if a CPU has been kicked, false otherwise.
+ * Return a CPU where the task can be migrated.
  */
-static bool kick_idle_cpu(const struct task_struct *p, const struct task_ctx *tctx,
-                          s32 prev_cpu, bool idle_smt)
+static s32 try_migrate(const struct task_struct *p, const struct task_ctx *tctx,
+                       s32 prev_cpu, bool idle_smt)
 {
         const struct cpumask *mask;
         u64 flags = idle_smt ? SCX_PICK_IDLE_CORE : 0;
-        s32 cpu = scx_bpf_task_cpu(p);
-        int node = __COMPAT_scx_bpf_cpu_node(cpu);
+        s32 cpu;
 
         /*
          * No need to look for full-idle SMT cores if SMT is disabled.
          */
         if (idle_smt && !smt_enabled)
-                return false;
+                return -ENOENT;
+
+        /*
+         * Don't try to migrate if the task can only run on one CPU.
+         */
+        if (p->nr_cpus_allowed == 1 || is_migration_disabled(p))
+                return prev_cpu;
 
         /*
          * Try to reuse the same CPU if idle.
          */
-        if (!idle_smt || (idle_smt && is_fully_idle(prev_cpu))) {
-                if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
-                        scx_bpf_kick_cpu(prev_cpu, SCX_KICK_IDLE);
-                        return true;
-                }
-        }
+        if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
+                return prev_cpu;
 
         /*
          * Look for any idle CPU usable by the task that can immediately
          * execute the task, prioritizing SMT isolation and cache locality.
          */
         mask = cast_mask(tctx->l2_cpumask);
         if (mask) {
-                cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(mask, node,
-                                flags | __COMPAT_SCX_PICK_IDLE_IN_NODE);
-                if (cpu >= 0) {
-                        scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
-                        return true;
-                }
+                cpu = scx_bpf_pick_idle_cpu(mask, flags);
+                if (cpu >= 0)
+                        return cpu;
         }
+
         mask = cast_mask(tctx->l3_cpumask);
         if (mask) {
-                cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(mask, node,
-                                flags | __COMPAT_SCX_PICK_IDLE_IN_NODE);
-                if (cpu >= 0) {
-                        scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
-                        return true;
-                }
-        }
-        mask = cast_mask(tctx->cpumask);
-        if (mask) {
-                cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(mask, node,
-                                flags | __COMPAT_SCX_PICK_IDLE_IN_NODE);
-                if (cpu >= 0) {
-                        scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
-                        return true;
-                }
+                cpu = scx_bpf_pick_idle_cpu(mask, flags);
+                if (cpu >= 0)
+                        return cpu;
         }
 
-        return false;
-}
-
-/*
- * Attempt to dispatch a task directly to its assigned CPU.
- *
- * Return true if the task is dispatched, false otherwise.
- */
-static bool try_direct_dispatch(struct task_struct *p, struct task_ctx *tctx,
-                                s32 prev_cpu, u64 slice, u64 enq_flags)
-{
-        /*
-         * If a task has been re-enqueued because its assigned CPU has been
-         * taken by a higher priority scheduling class, force it to follow
-         * the regular scheduling path and give it a chance to run on a
-         * different CPU.
-         *
-         * However, if the task can only run on a single CPU, re-scheduling
-         * is unnecessary, as it can only be dispatched on that specific
-         * CPU. In this case, dispatch it immediately to maximize its
-         * chances of reclaiming the CPU quickly and avoiding stalls.
-         *
-         * This approach will be effective once dl_server support is added
-         * to the sched_ext core.
-         */
-        if (enq_flags & SCX_ENQ_REENQ) {
-                if (p->nr_cpus_allowed == 1) {
-                        scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_max, enq_flags);
-                        __sync_fetch_and_add(&nr_kthread_dispatches, 1);
-
-                        return true;
-                }
-                return false;
-        }
-
-        /*
-         * If local_kthread is specified dispatch per-CPU kthreads
-         * directly on their assigned CPU.
-         */
-        if (local_kthreads && is_kthread(p)) {
-                scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_max, enq_flags);
-                __sync_fetch_and_add(&nr_kthread_dispatches, 1);
-
-                return true;
-        }
-
-        /*
-         * If ops.select_cpu() has been skipped, try direct dispatch.
-         */
-        if (!__COMPAT_is_enq_cpu_selected(enq_flags)) {
-                int node = __COMPAT_scx_bpf_cpu_node(prev_cpu);
-                struct rq *rq = scx_bpf_cpu_rq(prev_cpu);
-
-                /*
-                 * Allow to preempt the task currently running on the
-                 * assigned CPU if our deadline is earlier.
-                 */
-                if (!no_preempt && tctx->deadline < rq->curr->scx.dsq_vtime) {
-                        scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | prev_cpu,
-                                           slice, enq_flags | SCX_ENQ_PREEMPT);
-                        __sync_fetch_and_add(&nr_direct_dispatches, 1);
-
-                        return true;
-                }
-
-                /*
-                 * If local_pcpu is enabled always dispatch tasks that can only run
-                 * on one CPU directly.
-                 *
-                 * This can help to improve I/O workloads (like large parallel
-                 * builds).
-                 */
-                if (local_pcpu && p->nr_cpus_allowed == 1) {
-                        scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice, enq_flags);
-                        __sync_fetch_and_add(&nr_direct_dispatches, 1);
-
-                        return true;
-                }
-
-                /*
-                 * If the local DSQ and the shared DSQ have no task waiting
-                 * and the CPU is still a full-idle SMT core, perform a
-                 * direct dispatch.
-                 */
-                if (!scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | prev_cpu) &&
-                    (local_pcpu || !scx_bpf_dsq_nr_queued(node)) &&
-                    is_fully_idle(prev_cpu)) {
-                        scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | prev_cpu,
-                                           slice_max, enq_flags);
-                        __sync_fetch_and_add(&nr_direct_dispatches, 1);
-
-                        return true;
-                }
-        }
-
-        /*
-         * Direct dispatch not possible, follow the regular scheduling
-         * path.
-         */
-        return false;
+        return -EBUSY;
 }
 
 /*
@@ -829,38 +667,32 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
         const struct cpumask *idle_cpumask;
         struct task_ctx *tctx;
         u64 slice, deadline;
-        s32 prev_cpu = scx_bpf_task_cpu(p);
-        int node = __COMPAT_scx_bpf_cpu_node(prev_cpu);
+        s32 prev_cpu = scx_bpf_task_cpu(p), cpu = -ENOENT;
 
         /*
          * Dispatch regular tasks to the shared DSQ.
          */
         tctx = try_lookup_task_ctx(p);
         if (!tctx)
                 return;
-        deadline = task_deadline(p, tctx);
-        slice = CLAMP(slice_max / nr_tasks_waiting(node), slice_min, slice_max);
 
         /*
-         * Try to dispatch the task directly, if possible.
+         * Allow tasks to migrate from ops.enqueue() if ops.select_cpu()
+         * was skipped and the current CPU is busy.
          */
-        if (try_direct_dispatch(p, tctx, prev_cpu, slice, enq_flags))
-                return;
+        cpu = try_migrate(p, tctx, prev_cpu, true);
+        if (cpu < 0)
+                cpu = try_migrate(p, tctx, prev_cpu, false);
+        if (cpu < 0)
+                cpu = prev_cpu;
 
-        scx_bpf_dsq_insert_vtime(p, node, slice, deadline, enq_flags);
+        deadline = task_deadline(p, tctx);
+        slice = CLAMP(slice_max / nr_tasks_waiting(cpu), slice_min, slice_max);
+
+        scx_bpf_dsq_insert_vtime(p, cpu, slice, deadline, enq_flags);
         __sync_fetch_and_add(&nr_shared_dispatches, 1);
 
-        /*
-         * If there are idle CPUs in the system try to proactively wake up
-         * one, so that it can immediately execute the task in case its
-         * current CPU is busy (always prioritizing full-idle SMT cores
-         * first, if present).
-         */
-        idle_cpumask = __COMPAT_scx_bpf_get_idle_cpumask_node(node);
-        if (!bpf_cpumask_empty(idle_cpumask))
-                if (!kick_idle_cpu(p, tctx, prev_cpu, true))
-                        kick_idle_cpu(p, tctx, prev_cpu, false);
-        scx_bpf_put_cpumask(idle_cpumask);
+        scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
 }
 
 static bool keep_running(const struct task_struct *p, s32 cpu)
@@ -912,13 +744,11 @@ static bool keep_running(const struct task_struct *p, s32 cpu)
 
 void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
 {
-        int node = __COMPAT_scx_bpf_cpu_node(cpu);
-
         /*
          * Consume regular tasks from the shared DSQ, transferring them to the
          * local CPU DSQ.
          */
-        if (scx_bpf_dsq_move_to_local(node))
+        if (scx_bpf_dsq_move_to_local(cpu))
                 return;
 
         /*
@@ -1256,7 +1086,7 @@ static void init_cpuperf_target(void)
 
 s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
 {
-        int err, node;
+        int err, cpu;
 
         /* Initialize amount of online and possible CPUs */
         nr_online_cpus = get_nr_online_cpus();
@@ -1268,10 +1098,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
         /*
          * Create the global shared DSQ.
          */
-        bpf_for(node, 0, __COMPAT_scx_bpf_nr_node_ids()) {
-                err = scx_bpf_create_dsq(node, node);
+        bpf_for(cpu, 0, nr_cpu_ids) {
+                err = scx_bpf_create_dsq(cpu, __COMPAT_scx_bpf_cpu_node(cpu));
                 if (err) {
-                        scx_bpf_error("failed to create DSQ %d: %d", node, err);
+                        scx_bpf_error("failed to create DSQ %d: %d", cpu, err);
                         return err;
                 }
         }
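A note on the slice computation in the new enqueue path: the time slice scales with the backlog of the chosen CPU's DSQ, slice = CLAMP(slice_max / nr_tasks_waiting(cpu), slice_min, slice_max), where nr_tasks_waiting(cpu) is scx_bpf_dsq_nr_queued(cpu) + 1. As a purely illustrative example (the values are hypothetical, not the scheduler's defaults): with slice_max = 20ms, slice_min = 1ms, and 4 tasks already queued on the target CPU's DSQ, the newly enqueued task gets 20ms / 5 = 4ms, which falls inside the [1ms, 20ms] clamp.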
