Skip to content

Commit

Permalink
Add job watcher
Browse files (browse the repository at this point in the history)
  • Loading branch information
DrJosh9000 committed Dec 5, 2024
1 parent 4df35d2 commit 1ea1b55
Show file tree
Hide file tree
Showing 12 changed files with 522 additions and 16 deletions.
6 changes: 6 additions & 0 deletions .buildkite/rbac.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,12 @@ rules:
- pods/eviction
verbs:
- create
- apiGroups:
- ""
resources:
- events
verbs:
- list
---
apiVersion: v1
kind: ServiceAccount
Expand Down
6 changes: 6 additions & 0 deletions charts/agent-stack-k8s/templates/rbac.yaml.tpl
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,12 @@ rules:
- pods/eviction
verbs:
- create
- apiGroups:
- ""
resources:
- events
verbs:
- list
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
Expand Down
5 changes: 5 additions & 0 deletions cmd/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,11 @@ func AddConfigFlags(cmd *cobra.Command) {
config.DefaultJobCancelCheckerPollInterval,
"Controls the interval between job state queries while a pod is still Pending",
)
cmd.Flags().Duration(
"empty-job-grace-period",
config.DefaultEmptyJobGracePeriod,
"Duration after starting a Kubernetes job that the controller will wait before considering failing the job due to a missing pod (e.g. when the podSpec specifies a missing service account)",
)
cmd.Flags().Bool(
"prohibit-kubernetes-plugin",
false,
Expand Down
1 change: 1 addition & 0 deletions cmd/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ func TestReadAndParseConfig(t *testing.T) {
JobTTL: 300 * time.Second,
ImagePullBackOffGracePeriod: 60 * time.Second,
JobCancelCheckerPollInterval: 10 * time.Second,
EmptyJobGracePeriod: 50 * time.Second,
PollInterval: 5 * time.Second,
StaleJobDataTimeout: 10 * time.Second,
JobCreationConcurrency: 5,
Expand Down
1 change: 1 addition & 0 deletions examples/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ image: my.registry.dev/buildkite-agent:latest
job-ttl: 5m
image-pull-backoff-grace-period: 60s
job-cancel-checker-poll-interval: 10s
empty-job-grace-period: 50s
poll-interval: 5s
stale-job-data-timeout: 10s
job-creation-concurrency: 5
Expand Down
2 changes: 2 additions & 0 deletions internal/controller/config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ const (
DefaultStaleJobDataTimeout = 10 * time.Second
DefaultImagePullBackOffGracePeriod = 30 * time.Second
DefaultJobCancelCheckerPollInterval = 5 * time.Second
DefaultEmptyJobGracePeriod = 30 * time.Second
DefaultJobCreationConcurrency = 5
)

Expand Down Expand Up @@ -51,6 +52,7 @@ type Config struct {
PodSpecPatch *corev1.PodSpec `json:"pod-spec-patch" validate:"omitempty"`
ImagePullBackOffGracePeriod time.Duration `json:"image-pull-backoff-grace-period" validate:"omitempty"`
JobCancelCheckerPollInterval time.Duration `json:"job-cancel-checker-poll-interval" validate:"omitempty"`
EmptyJobGracePeriod time.Duration `json:"empty-job-grace-period" validate:"omitempty"`

// WorkspaceVolume allows supplying a volume for /workspace. By default
// an EmptyDir volume is created for it.
Expand Down
13 changes: 13 additions & 0 deletions internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,20 @@ func Run(
logger.Fatal("failed to register completions informer", zap.Error(err))
}

// JobWatcher watches for jobs that end up in bad conditions and need cleaning up:
// * Jobs that fail without ever creating a pod
// * Jobs that stall forever without ever creating a pod
jobWatcher := scheduler.NewJobWatcher(
logger.Named("jobWatcher"),
k8sClient,
cfg,
)
if err := jobWatcher.RegisterInformer(ctx, informerFactory); err != nil {
logger.Fatal("failed to register jobWatcher informer", zap.Error(err))
}

// PodWatcher watches for other conditions to clean up pods:
// * Pods where an init container failed for any reason
// * Pods where a container is in ImagePullBackOff for too long
// * Pods that are still pending, but the Buildkite job has been cancelled
podWatcher := scheduler.NewPodWatcher(
Expand Down
Loading

0 comments on commit 1ea1b55

Please sign in to comment.