Skip to content

Commit 7a60a5d

Browse files
committed
Implement the PodTermination failure recovery action
1 parent 07f16a7 commit 7a60a5d

File tree

18 files changed

+453
-539
lines changed

18 files changed

+453
-539
lines changed

internal/mocks/controller/jobframework/interface.go

Lines changed: 0 additions & 344 deletions
This file was deleted.

pkg/constants/constants.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,4 +44,9 @@ const (
4444
// ManagedByKueueLabelKey is the label that signals that an object is managed by Kueue.
4545
ManagedByKueueLabelKey = "kueue.x-k8s.io/managed"
4646
ManagedByKueueLabelValue = "true"
47+
48+
// PodSetLabel is a label set on the Job's PodTemplate to indicate the name
49+
// of the PodSet of the admitted Workload corresponding to the PodTemplate.
50+
// The label is set when starting the Job, and removed on stopping the Job.
51+
PodSetLabel = "kueue.x-k8s.io/podset"
4752
)

pkg/controller/constants/constants.go

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -43,9 +43,4 @@ const (
4343

4444
// MaxExecTimeSecondsLabel is the label key in the job that holds the maximum execution time.
4545
MaxExecTimeSecondsLabel = `kueue.x-k8s.io/max-exec-time-seconds`
46-
47-
// PodSetLabel is a label set on the Job's PodTemplate to indicate the name
48-
// of the PodSet of the admitted Workload corresponding to the PodTemplate.
49-
// The label is set when starting the Job, and removed on stopping the Job.
50-
PodSetLabel = "kueue.x-k8s.io/podset"
5146
)

pkg/controller/core/core.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
qcache "sigs.k8s.io/kueue/pkg/cache/queue"
2626
schdcache "sigs.k8s.io/kueue/pkg/cache/scheduler"
2727
"sigs.k8s.io/kueue/pkg/constants"
28+
"sigs.k8s.io/kueue/pkg/controller/failurerecovery"
2829
"sigs.k8s.io/kueue/pkg/features"
2930
)
3031

@@ -63,6 +64,13 @@ func SetupControllers(mgr ctrl.Manager, qManager *qcache.Manager, cc *schdcache.
6364
watchers = append(watchers, cohortRec)
6465
}
6566

67+
if features.Enabled(features.ZombiePodTermination) {
68+
zpRec := failurerecovery.NewTerminatingPodReconciler(mgr.GetClient())
69+
if err := zpRec.SetupWithManager(mgr); err != nil {
70+
return "ZombiePodTermination", err
71+
}
72+
}
73+
6674
cqRec := NewClusterQueueReconciler(
6775
mgr.GetClient(),
6876
qManager,

0 commit comments

Comments
 (0)