@@ -491,6 +491,10 @@ private[spark] class ExecutorAllocationManager(
     numExecutorsToAdd = 1
   }

+  private case class StageAttempt(stageId: Int, stageAttemptId: Int) {
+    override def toString: String = s"Stage $stageId (Attempt $stageAttemptId)"
+  }
+
   /**
    * A listener that notifies the given allocation manager of when to add and remove executors.
    *
@@ -499,29 +503,32 @@ private[spark] class ExecutorAllocationManager(
    */
   private[spark] class ExecutorAllocationListener extends SparkListener {

-    private val stageIdToNumTasks = new mutable.HashMap[Int, Int]
-    // Number of running tasks per stage including speculative tasks.
+    private val stageAttemptToNumTasks = new mutable.HashMap[StageAttempt, Int]
+    // Number of running tasks per stageAttempt including speculative tasks.
     // Should be 0 when no stages are active.
-    private val stageIdToNumRunningTask = new mutable.HashMap[Int, Int]
-    private val stageIdToTaskIndices = new mutable.HashMap[Int, mutable.HashSet[Int]]
-    // Number of speculative tasks to be scheduled in each stage
-    private val stageIdToNumSpeculativeTasks = new mutable.HashMap[Int, Int]
-    // The speculative tasks started in each stage
-    private val stageIdToSpeculativeTaskIndices = new mutable.HashMap[Int, mutable.HashSet[Int]]
-
-    // stageId to tuple (the number of task with locality preferences, a map where each pair is a
-    // node and the number of tasks that would like to be scheduled on that node) map,
-    // maintain the executor placement hints for each stage Id used by resource framework to better
-    // place the executors.
-    private val stageIdToExecutorPlacementHints = new mutable.HashMap[Int, (Int, Map[String, Int])]
+    private val stageAttemptToNumRunningTask = new mutable.HashMap[StageAttempt, Int]
+    private val stageAttemptToTaskIndices = new mutable.HashMap[StageAttempt, mutable.HashSet[Int]]
+    // Number of speculative tasks to be scheduled in each stageAttempt
+    private val stageAttemptToNumSpeculativeTasks = new mutable.HashMap[StageAttempt, Int]
+    // The speculative tasks started in each stageAttempt
+    private val stageAttemptToSpeculativeTaskIndices =
+      new mutable.HashMap[StageAttempt, mutable.HashSet[Int]]
+
+    // stageAttempt to tuple (the number of task with locality preferences, a map where each pair
+    // is a node and the number of tasks that would like to be scheduled on that node) map,
+    // maintain the executor placement hints for each stageAttempt used by resource framework
+    // to better place the executors.
+    private val stageAttemptToExecutorPlacementHints =
+      new mutable.HashMap[StageAttempt, (Int, Map[String, Int])]

     override def onStageSubmitted(stageSubmitted: SparkListenerStageSubmitted): Unit = {
       initializing = false
       val stageId = stageSubmitted.stageInfo.stageId
+      val stageAttemptId = stageSubmitted.stageInfo.attemptNumber()
+      val stageAttempt = StageAttempt(stageId, stageAttemptId)
       val numTasks = stageSubmitted.stageInfo.numTasks
       allocationManager.synchronized {
-        stageIdToNumTasks(stageId) = numTasks
-        stageIdToNumRunningTask(stageId) = 0
+        stageAttemptToNumTasks(stageAttempt) = numTasks
         allocationManager.onSchedulerBacklogged()

         // Compute the number of tasks requested by the stage on each host
@@ -536,7 +543,7 @@ private[spark] class ExecutorAllocationManager(
             }
           }
         }
-        stageIdToExecutorPlacementHints.put(stageId,
+        stageAttemptToExecutorPlacementHints.put(stageAttempt,
           (numTasksPending, hostToLocalTaskCountPerStage.toMap))

         // Update the executor placement hints
@@ -546,40 +553,44 @@ private[spark] class ExecutorAllocationManager(

     override def onStageCompleted(stageCompleted: SparkListenerStageCompleted): Unit = {
       val stageId = stageCompleted.stageInfo.stageId
+      val stageAttemptId = stageCompleted.stageInfo.attemptNumber()
+      val stageAttempt = StageAttempt(stageId, stageAttemptId)
       allocationManager.synchronized {
-        stageIdToNumTasks -= stageId
-        stageIdToNumRunningTask -= stageId
-        stageIdToNumSpeculativeTasks -= stageId
-        stageIdToTaskIndices -= stageId
-        stageIdToSpeculativeTaskIndices -= stageId
-        stageIdToExecutorPlacementHints -= stageId
+        // do NOT remove stageAttempt from stageAttemptToNumRunningTasks,
+        // because the attempt may still have running tasks,
+        // even after another attempt for the stage is submitted.
+        stageAttemptToNumTasks -= stageAttempt
+        stageAttemptToNumSpeculativeTasks -= stageAttempt
+        stageAttemptToTaskIndices -= stageAttempt
+        stageAttemptToSpeculativeTaskIndices -= stageAttempt
+        stageAttemptToExecutorPlacementHints -= stageAttempt

         // Update the executor placement hints
         updateExecutorPlacementHints()

         // If this is the last stage with pending tasks, mark the scheduler queue as empty
         // This is needed in case the stage is aborted for any reason
-        if (stageIdToNumTasks.isEmpty && stageIdToNumSpeculativeTasks.isEmpty) {
+        if (stageAttemptToNumTasks.isEmpty && stageAttemptToNumSpeculativeTasks.isEmpty) {
           allocationManager.onSchedulerQueueEmpty()
         }
       }
     }

     override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = {
       val stageId = taskStart.stageId
+      val stageAttemptId = taskStart.stageAttemptId
+      val stageAttempt = StageAttempt(stageId, stageAttemptId)
       val taskIndex = taskStart.taskInfo.index
-
       allocationManager.synchronized {
-        if (stageIdToNumRunningTask.contains(stageId)) {
-          stageIdToNumRunningTask(stageId) += 1
-        }
-
+        stageAttemptToNumRunningTask(stageAttempt) =
+          stageAttemptToNumRunningTask.getOrElse(stageAttempt, 0) + 1
         // If this is the last pending task, mark the scheduler queue as empty
         if (taskStart.taskInfo.speculative) {
-          stageIdToSpeculativeTaskIndices.getOrElseUpdate(stageId, new mutable.HashSet[Int]) +=
-            taskIndex
+          stageAttemptToSpeculativeTaskIndices.getOrElseUpdate(stageAttempt,
+            new mutable.HashSet[Int]) += taskIndex
         } else {
-          stageIdToTaskIndices.getOrElseUpdate(stageId, new mutable.HashSet[Int]) += taskIndex
+          stageAttemptToTaskIndices.getOrElseUpdate(stageAttempt,
+            new mutable.HashSet[Int]) += taskIndex
         }
         if (totalPendingTasks() == 0) {
           allocationManager.onSchedulerQueueEmpty()
@@ -588,13 +599,17 @@ private[spark] class ExecutorAllocationManager(
     }

     override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = {
-      val taskIndex = taskEnd.taskInfo.index
       val stageId = taskEnd.stageId
+      val stageAttemptId = taskEnd.stageAttemptId
+      val stageAttempt = StageAttempt(stageId, stageAttemptId)
+      val taskIndex = taskEnd.taskInfo.index
       allocationManager.synchronized {
-        if (stageIdToNumRunningTask.contains(stageId)) {
-          stageIdToNumRunningTask(stageId) -= 1
+        if (stageAttemptToNumRunningTask.contains(stageAttempt)) {
+          stageAttemptToNumRunningTask(stageAttempt) -= 1
+          if (stageAttemptToNumRunningTask(stageAttempt) == 0) {
+            stageAttemptToNumRunningTask -= stageAttempt
+          }
         }
-
         // If the task failed, we expect it to be resubmitted later. To ensure we have
         // enough resources to run the resubmitted task, we need to mark the scheduler
         // as backlogged again if it's not already marked as such (SPARK-8366)
@@ -603,21 +618,22 @@ private[spark] class ExecutorAllocationManager(
             allocationManager.onSchedulerBacklogged()
           }
           if (taskEnd.taskInfo.speculative) {
-            stageIdToSpeculativeTaskIndices.get(stageId).foreach {_.remove(taskIndex)}
+            stageAttemptToSpeculativeTaskIndices.get(stageAttempt).foreach {_.remove(taskIndex)}
           } else {
-            stageIdToTaskIndices.get(stageId).foreach {_.remove(taskIndex)}
+            stageAttemptToTaskIndices.get(stageAttempt).foreach {_.remove(taskIndex)}
           }
         }
       }
     }

     override def onSpeculativeTaskSubmitted(speculativeTask: SparkListenerSpeculativeTaskSubmitted)
       : Unit = {
-      val stageId = speculativeTask.stageId
-
+      val stageId = speculativeTask.stageId
+      val stageAttemptId = speculativeTask.stageAttemptId
+      val stageAttempt = StageAttempt(stageId, stageAttemptId)
       allocationManager.synchronized {
-        stageIdToNumSpeculativeTasks(stageId) =
-          stageIdToNumSpeculativeTasks.getOrElse(stageId, 0) + 1
+        stageAttemptToNumSpeculativeTasks(stageAttempt) =
+          stageAttemptToNumSpeculativeTasks.getOrElse(stageAttempt, 0) + 1
         allocationManager.onSchedulerBacklogged()
       }
     }
@@ -629,14 +645,14 @@ private[spark] class ExecutorAllocationManager(
      * Note: This is not thread-safe without the caller owning the `allocationManager` lock.
      */
     def pendingTasks(): Int = {
-      stageIdToNumTasks.map { case (stageId, numTasks) =>
-        numTasks - stageIdToTaskIndices.get(stageId).map(_.size).getOrElse(0)
+      stageAttemptToNumTasks.map { case (stageAttempt, numTasks) =>
+        numTasks - stageAttemptToTaskIndices.get(stageAttempt).map(_.size).getOrElse(0)
       }.sum
     }

     def pendingSpeculativeTasks(): Int = {
-      stageIdToNumSpeculativeTasks.map { case (stageId, numTasks) =>
-        numTasks - stageIdToSpeculativeTaskIndices.get(stageId).map(_.size).getOrElse(0)
+      stageAttemptToNumSpeculativeTasks.map { case (stageAttempt, numTasks) =>
+        numTasks - stageAttemptToSpeculativeTaskIndices.get(stageAttempt).map(_.size).getOrElse(0)
       }.sum
     }

@@ -646,9 +662,10 @@ private[spark] class ExecutorAllocationManager(

     /**
      * The number of tasks currently running across all stages.
+     * Include running-but-zombie stage attempts
      */
     def totalRunningTasks(): Int = {
-      stageIdToNumRunningTask.values.sum
+      stageAttemptToNumRunningTask.values.sum
     }

     /**
@@ -662,7 +679,7 @@ private[spark] class ExecutorAllocationManager(
     def updateExecutorPlacementHints(): Unit = {
       var localityAwareTasks = 0
       val localityToCount = new mutable.HashMap[String, Int]()
-      stageIdToExecutorPlacementHints.values.foreach { case (numTasksPending, localities) =>
+      stageAttemptToExecutorPlacementHints.values.foreach { case (numTasksPending, localities) =>
         localityAwareTasks += numTasksPending
         localities.foreach { case (hostname, count) =>
           val updatedCount = localityToCount.getOrElse(hostname, 0) + count
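
For context, here is a minimal, self-contained sketch of the idea behind this change. It is not part of the patch; the object, method, and map names are invented for illustration. Keying the bookkeeping by (stageId, stageAttemptId) lets a zombie attempt that still has running tasks coexist with a newly submitted attempt of the same stage, instead of the two overwriting each other's counts.

import scala.collection.mutable

object StageAttemptTrackingSketch {
  // Same shape as the StageAttempt case class introduced by the patch.
  private case class StageAttempt(stageId: Int, stageAttemptId: Int) {
    override def toString: String = s"Stage $stageId (Attempt $stageAttemptId)"
  }

  // Running-task counts per attempt; a retried stage gets its own entry, so a zombie
  // attempt that still has tasks running does not collide with the new attempt.
  private val runningTasks = new mutable.HashMap[StageAttempt, Int]

  def onTaskStart(stageId: Int, attemptId: Int): Unit = {
    val key = StageAttempt(stageId, attemptId)
    runningTasks(key) = runningTasks.getOrElse(key, 0) + 1
  }

  def onTaskEnd(stageId: Int, attemptId: Int): Unit = {
    val key = StageAttempt(stageId, attemptId)
    if (runningTasks.contains(key)) {
      runningTasks(key) -= 1
      // Drop the entry once the attempt (possibly a zombie one) has fully drained.
      if (runningTasks(key) == 0) runningTasks -= key
    }
  }

  // Sums over all attempts, so tasks still running in zombie attempts are counted too.
  def totalRunningTasks(): Int = runningTasks.values.sum

  def main(args: Array[String]): Unit = {
    onTaskStart(stageId = 1, attemptId = 0) // task from the original attempt
    onTaskStart(stageId = 1, attemptId = 1) // task from the retried attempt
    onTaskEnd(stageId = 1, attemptId = 0)
    println(totalRunningTasks()) // prints 1: the retry's task is tracked separately
  }
}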