@@ -252,9 +252,9 @@ static int calloc_cpumask(struct bpf_cpumask **p_cpumask)
/*
 * Return the total amount of tasks that are currently waiting to be scheduled.
 */
-static u64 nr_tasks_waiting(int node)
+static u64 nr_tasks_waiting(s32 cpu)
{
-	return scx_bpf_dsq_nr_queued(node) + 1;
+	return scx_bpf_dsq_nr_queued(cpu) + 1;
}
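
The "+ 1" above matters because this count later becomes a divisor when the time slice is scaled in bpfland_enqueue() (see that hunk further down): an empty per-CPU DSQ yields the full slice, a busy one shrinks it. A standalone plain-C sketch of that arithmetic, with slice values chosen purely for illustration (they are not taken from this patch):

#include <stdio.h>

#define SLICE_MIN_US	 1000ULL	/* assumed tunable, for illustration only */
#define SLICE_MAX_US	20000ULL	/* assumed tunable, for illustration only */

/* Mirrors CLAMP(slice_max / nr_tasks_waiting(cpu), slice_min, slice_max). */
static unsigned long long scaled_slice(unsigned long long nr_queued)
{
	unsigned long long waiting = nr_queued + 1;	/* +1 avoids a zero divisor */
	unsigned long long slice = SLICE_MAX_US / waiting;

	if (slice < SLICE_MIN_US)
		slice = SLICE_MIN_US;
	if (slice > SLICE_MAX_US)
		slice = SLICE_MAX_US;
	return slice;
}

int main(void)
{
	for (unsigned long long q = 0; q <= 8; q++)
		printf("queued=%llu -> slice=%llu us\n", q, scaled_slice(q));
	return 0;
}
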
/*
@@ -388,7 +388,6 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
	const struct cpumask *primary, *p_mask, *l2_mask, *l3_mask;
	struct task_struct *current = (void *)bpf_get_current_task_btf();
	struct task_ctx *tctx;
-	bool is_prev_llc_affine = false;
	int node;
	s32 cpu;
@@ -400,6 +399,11 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
	if (!tctx)
		return -ENOENT;

+	/*
+	 * Refresh task domain based on the previously used cpu.
+	 */
+	task_set_domain(p, prev_cpu, p->cpus_ptr);
+
	/*
	 * Task's scheduling domains.
	 */
@@ -464,13 +468,6 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
		goto out_put_cpumask;
	}

-	/*
-	 * Migrate the wakee to the same domain as the waker in case of
-	 * a sync wakeup.
-	 */
-	if (!share_llc)
-		task_set_domain(p, cpu, p->cpus_ptr);
-
	/*
	 * If the waker's L3 domain is not saturated attempt to migrate
	 * the wakee on the same CPU as the waker (since it's going to
@@ -486,27 +483,19 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
	}

	/*
-	 * Check if the previously used CPU is still in the L3 task domain. If
-	 * not, we may want to move the task back to its original L3 domain.
+	 * If the task can still run on the previously used CPU, keep using
+	 * it.
	 */
-	is_prev_llc_affine = bpf_cpumask_test_cpu(prev_cpu, l3_mask);
+	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
+		cpu = prev_cpu;
+		*is_idle = true;
+		goto out_put_cpumask;
+	}

	/*
	 * Find the best idle CPU, prioritizing full idle cores in SMT systems.
	 */
	if (smt_enabled) {
-		/*
-		 * If the task can still run on the previously used CPU and
-		 * it's a full-idle core, keep using it.
-		 */
-		if (is_prev_llc_affine &&
-		    bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) &&
-		    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
-			cpu = prev_cpu;
-			*is_idle = true;
-			goto out_put_cpumask;
-		}
-
		/*
		 * Search for any full-idle CPU in the primary domain that
		 * shares the same L2 cache.
@@ -530,37 +519,17 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
		}

		/*
-		 * Search for any other full-idle core in the primary domain.
+		 * Search for any other full-idle core in the same node and
+		 * primary domain.
		 */
		cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p_mask, node,
-							  SCX_PICK_IDLE_CORE);
-		if (cpu >= 0) {
-			*is_idle = true;
-			goto out_put_cpumask;
-		}
-
-		/*
-		 * Search for any full-idle core usable by the task.
-		 */
-		cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(p->cpus_ptr, node,
-							  SCX_PICK_IDLE_CORE);
+							  SCX_PICK_IDLE_CORE | __COMPAT_SCX_PICK_IDLE_IN_NODE);
		if (cpu >= 0) {
			*is_idle = true;
			goto out_put_cpumask;
		}
	}

-	/*
-	 * If a full-idle core can't be found (or if this is not an SMT system)
-	 * try to re-use the same CPU, even if it's not in a full-idle core.
-	 */
-	if (is_prev_llc_affine &&
-	    scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
-		cpu = prev_cpu;
-		*is_idle = true;
-		goto out_put_cpumask;
-	}
-
	/*
	 * Search for any idle CPU in the primary domain that shares the same
	 * L2 cache.
@@ -601,23 +570,6 @@ static s32 pick_idle_cpu(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bo
		goto out_put_cpumask;
	}

-	/*
-	 * We couldn't find any idle CPU, return the previous CPU if it is in
-	 * the task's L3 domain.
-	 */
-	if (is_prev_llc_affine) {
-		cpu = prev_cpu;
-		goto out_put_cpumask;
-	}
-
-	/*
-	 * Otherwise, return a random CPU in the task's L3 domain (if
-	 * available).
-	 */
-	cpu = bpf_cpumask_any_distribute(l3_mask);
-	if (cpu >= nr_cpu_ids)
-		cpu = prev_cpu;
-
out_put_cpumask:
	scx_bpf_put_cpumask(idle_cpumask);
	scx_bpf_put_cpumask(idle_smtmask);
@@ -646,9 +598,7 @@ s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p,

	cpu = pick_idle_cpu(p, prev_cpu, wake_flags, &is_idle);
	if (is_idle) {
-		int node = __COMPAT_scx_bpf_cpu_node(cpu);
-
-		if (local_pcpu || !scx_bpf_dsq_nr_queued(node)) {
+		if (local_pcpu || !scx_bpf_dsq_nr_queued(cpu)) {
			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_max, 0);
			__sync_fetch_and_add(&nr_direct_dispatches, 1);
		}
@@ -658,166 +608,54 @@ s32 BPF_STRUCT_OPS(bpfland_select_cpu, struct task_struct *p,
}

/*
- * Try to wake up an idle CPU that can immediately process the task.
+ * Try to migrate a task on a different CPU.
 *
- * Return true if a CPU has been kicked, false otherwise.
+ * Return a CPU where the task can be migrated.
 */
-static bool kick_idle_cpu(const struct task_struct *p, const struct task_ctx *tctx,
-			  s32 prev_cpu, bool idle_smt)
+static s32 try_migrate(const struct task_struct *p, const struct task_ctx *tctx,
+		       s32 prev_cpu, bool idle_smt)
{
	const struct cpumask *mask;
	u64 flags = idle_smt ? SCX_PICK_IDLE_CORE : 0;
-	s32 cpu = scx_bpf_task_cpu(p);
-	int node = __COMPAT_scx_bpf_cpu_node(cpu);
+	s32 cpu;

	/*
	 * No need to look for full-idle SMT cores if SMT is disabled.
	 */
	if (idle_smt && !smt_enabled)
-		return false;
+		return -ENOENT;
+
+	/*
+	 * Don't try to migrate if the task can only run on one CPU.
+	 */
+	if (p->nr_cpus_allowed == 1 || is_migration_disabled(p))
+		return prev_cpu;

	/*
	 * Try to reuse the same CPU if idle.
	 */
-	if (!idle_smt || (idle_smt && is_fully_idle(prev_cpu))) {
-		if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) {
-			scx_bpf_kick_cpu(prev_cpu, SCX_KICK_IDLE);
-			return true;
-		}
-	}
+	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
+		return prev_cpu;

	/*
	 * Look for any idle CPU usable by the task that can immediately
	 * execute the task, prioritizing SMT isolation and cache locality.
	 */
	mask = cast_mask(tctx->l2_cpumask);
	if (mask) {
-		cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(mask, node,
-							  flags | __COMPAT_SCX_PICK_IDLE_IN_NODE);
-		if (cpu >= 0) {
-			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
-			return true;
-		}
+		cpu = scx_bpf_pick_idle_cpu(mask, flags);
+		if (cpu >= 0)
+			return cpu;
	}
+
	mask = cast_mask(tctx->l3_cpumask);
	if (mask) {
-		cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(mask, node,
-							  flags | __COMPAT_SCX_PICK_IDLE_IN_NODE);
-		if (cpu >= 0) {
-			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
-			return true;
-		}
-	}
-	mask = cast_mask(tctx->cpumask);
-	if (mask) {
-		cpu = __COMPAT_scx_bpf_pick_idle_cpu_node(mask, node,
-							  flags | __COMPAT_SCX_PICK_IDLE_IN_NODE);
-		if (cpu >= 0) {
-			scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
-			return true;
-		}
+		cpu = scx_bpf_pick_idle_cpu(mask, flags);
+		if (cpu >= 0)
+			return cpu;
	}

-	return false;
-}
-
-/*
- * Attempt to dispatch a task directly to its assigned CPU.
- *
- * Return true if the task is dispatched, false otherwise.
- */
-static bool try_direct_dispatch(struct task_struct *p, struct task_ctx *tctx,
-				s32 prev_cpu, u64 slice, u64 enq_flags)
-{
-	/*
-	 * If a task has been re-enqueued because its assigned CPU has been
-	 * taken by a higher priority scheduling class, force it to follow
-	 * the regular scheduling path and give it a chance to run on a
-	 * different CPU.
-	 *
-	 * However, if the task can only run on a single CPU, re-scheduling
-	 * is unnecessary, as it can only be dispatched on that specific
-	 * CPU. In this case, dispatch it immediately to maximize its
-	 * chances of reclaiming the CPU quickly and avoiding stalls.
-	 *
-	 * This approach will be effective once dl_server support is added
-	 * to the sched_ext core.
-	 */
-	if (enq_flags & SCX_ENQ_REENQ) {
-		if (p->nr_cpus_allowed == 1) {
-			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_max, enq_flags);
-			__sync_fetch_and_add(&nr_kthread_dispatches, 1);
-
-			return true;
-		}
-		return false;
-	}
-
-	/*
-	 * If local_kthread is specified dispatch per-CPU kthreads
-	 * directly on their assigned CPU.
-	 */
-	if (local_kthreads && is_kthread(p)) {
-		scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice_max, enq_flags);
-		__sync_fetch_and_add(&nr_kthread_dispatches, 1);
-
-		return true;
-	}
-
-	/*
-	 * If ops.select_cpu() has been skipped, try direct dispatch.
-	 */
-	if (!__COMPAT_is_enq_cpu_selected(enq_flags)) {
-		int node = __COMPAT_scx_bpf_cpu_node(prev_cpu);
-		struct rq *rq = scx_bpf_cpu_rq(prev_cpu);
-
-		/*
-		 * Allow to preempt the task currently running on the
-		 * assigned CPU if our deadline is earlier.
-		 */
-		if (!no_preempt && tctx->deadline < rq->curr->scx.dsq_vtime) {
-			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | prev_cpu,
-					   slice, enq_flags | SCX_ENQ_PREEMPT);
-			__sync_fetch_and_add(&nr_direct_dispatches, 1);
-
-			return true;
-		}
-
-		/*
-		 * If local_pcpu is enabled always dispatch tasks that can only run
-		 * on one CPU directly.
-		 *
-		 * This can help to improve I/O workloads (like large parallel
-		 * builds).
-		 */
-		if (local_pcpu && p->nr_cpus_allowed == 1) {
-			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL, slice, enq_flags);
-			__sync_fetch_and_add(&nr_direct_dispatches, 1);
-
-			return true;
-		}
-
-		/*
-		 * If the local DSQ and the shared DSQ have no task waiting
-		 * and the CPU is still a full-idle SMT core, perform a
-		 * direct dispatch.
-		 */
-		if (!scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | prev_cpu) &&
-		    (local_pcpu || !scx_bpf_dsq_nr_queued(node)) &&
-		    is_fully_idle(prev_cpu)) {
-			scx_bpf_dsq_insert(p, SCX_DSQ_LOCAL_ON | prev_cpu,
-					   slice_max, enq_flags);
-			__sync_fetch_and_add(&nr_direct_dispatches, 1);
-
-			return true;
-		}
-	}
-
-	/*
-	 * Direct dispatch not possible, follow the regular scheduling
-	 * path.
-	 */
-	return false;
+	return -EBUSY;
}
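
The new helper's return contract: prev_cpu when migration is pointless (single-CPU affinity, migration disabled) or the previous CPU is already idle; an idle CPU from the task's L2/L3 cpumask otherwise; a negative error (-ENOENT or -EBUSY) when no idle candidate exists. A minimal caller sketch, mirroring how bpfland_enqueue() consumes it in the hunk below (all identifiers are from this patch, nothing new is introduced):

	s32 cpu;

	/* Prefer a full-idle SMT core first, then accept any idle CPU. */
	cpu = try_migrate(p, tctx, prev_cpu, true);
	if (cpu < 0)
		cpu = try_migrate(p, tctx, prev_cpu, false);
	if (cpu < 0)
		cpu = prev_cpu;		/* nothing idle: stay on the previous CPU */
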
/*
@@ -829,38 +667,32 @@ void BPF_STRUCT_OPS(bpfland_enqueue, struct task_struct *p, u64 enq_flags)
	const struct cpumask *idle_cpumask;
	struct task_ctx *tctx;
	u64 slice, deadline;
-	s32 prev_cpu = scx_bpf_task_cpu(p);
-	int node = __COMPAT_scx_bpf_cpu_node(prev_cpu);
+	s32 prev_cpu = scx_bpf_task_cpu(p), cpu = -ENOENT;

	/*
	 * Dispatch regular tasks to the shared DSQ.
	 */
	tctx = try_lookup_task_ctx(p);
	if (!tctx)
		return;
-	deadline = task_deadline(p, tctx);
-	slice = CLAMP(slice_max / nr_tasks_waiting(node), slice_min, slice_max);

	/*
-	 * Try to dispatch the task directly, if possible.
+	 * Allow tasks to migrate from ops.enqueue() if ops.select_cpu()
+	 * was skipped and the current CPU is busy.
	 */
-	if (try_direct_dispatch(p, tctx, prev_cpu, slice, enq_flags))
-		return;
+	cpu = try_migrate(p, tctx, prev_cpu, true);
+	if (cpu < 0)
+		cpu = try_migrate(p, tctx, prev_cpu, false);
+	if (cpu < 0)
+		cpu = prev_cpu;

-	scx_bpf_dsq_insert_vtime(p, node, slice, deadline, enq_flags);
+	deadline = task_deadline(p, tctx);
+	slice = CLAMP(slice_max / nr_tasks_waiting(cpu), slice_min, slice_max);
+
+	scx_bpf_dsq_insert_vtime(p, cpu, slice, deadline, enq_flags);
	__sync_fetch_and_add(&nr_shared_dispatches, 1);

-	/*
-	 * If there are idle CPUs in the system try to proactively wake up
-	 * one, so that it can immediately execute the task in case its
-	 * current CPU is busy (always prioritizing full-idle SMT cores
-	 * first, if present).
-	 */
-	idle_cpumask = __COMPAT_scx_bpf_get_idle_cpumask_node(node);
-	if (!bpf_cpumask_empty(idle_cpumask))
-		if (!kick_idle_cpu(p, tctx, prev_cpu, true))
-			kick_idle_cpu(p, tctx, prev_cpu, false);
-	scx_bpf_put_cpumask(idle_cpumask);
+	scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);
}

static bool keep_running(const struct task_struct *p, s32 cpu)
@@ -912,13 +744,11 @@ static bool keep_running(const struct task_struct *p, s32 cpu)

void BPF_STRUCT_OPS(bpfland_dispatch, s32 cpu, struct task_struct *prev)
{
-	int node = __COMPAT_scx_bpf_cpu_node(cpu);
-
	/*
	 * Consume regular tasks from the shared DSQ, transferring them to the
	 * local CPU DSQ.
	 */
-	if (scx_bpf_dsq_move_to_local(node))
+	if (scx_bpf_dsq_move_to_local(cpu))
		return;

	/*
@@ -1256,7 +1086,7 @@ static void init_cpuperf_target(void)

s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
{
-	int err, node;
+	int err, cpu;

	/* Initialize amount of online and possible CPUs */
	nr_online_cpus = get_nr_online_cpus();
@@ -1268,10 +1098,10 @@ s32 BPF_STRUCT_OPS_SLEEPABLE(bpfland_init)
	/*
	 * Create the global shared DSQ.
	 */
-	bpf_for(node, 0, __COMPAT_scx_bpf_nr_node_ids()) {
-		err = scx_bpf_create_dsq(node, node);
+	bpf_for(cpu, 0, nr_cpu_ids) {
+		err = scx_bpf_create_dsq(cpu, __COMPAT_scx_bpf_cpu_node(cpu));
		if (err) {
-			scx_bpf_error("failed to create DSQ %d: %d", node, err);
+			scx_bpf_error("failed to create DSQ %d: %d", cpu, err);
			return err;
		}
	}
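
Taken together, the hunks above switch scx_bpfland from one shared DSQ per NUMA node to one shared DSQ per CPU, with the DSQ id equal to the CPU id. A condensed sketch of the resulting flow, collecting only calls already shown in this diff (not compilable on its own, error handling omitted):

	/* init: one DSQ per CPU, backed by memory on that CPU's NUMA node */
	bpf_for(cpu, 0, nr_cpu_ids)
		scx_bpf_create_dsq(cpu, __COMPAT_scx_bpf_cpu_node(cpu));

	/* enqueue: pick a target CPU, queue on its DSQ ordered by deadline */
	scx_bpf_dsq_insert_vtime(p, cpu, slice, deadline, enq_flags);
	scx_bpf_kick_cpu(cpu, SCX_KICK_IDLE);

	/* dispatch: each CPU consumes only its own per-CPU DSQ */
	if (scx_bpf_dsq_move_to_local(cpu))
		return;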