Skip to content

Commit 2e3bd8a

Browse files
authored
Implement the PodTermination controller to gracefully handle "stuck" pods (#7312)
* Add `FailureRecoveryPolicy` to `Configuration` Validate `FailureRecoveryPolicy` configuration * Implement the `PodTermination` failure recovery action * Add `failurerecovery` integration test suite * Use patch instead of update to terminate stuck pods * Fix indendation in example failure recovery policy * Realign the configuration with KEP changes * Validate that `terminatePod` is set on failure policy * Fix invalid label selector in config test * Remove the `FailureRecoveryPolicy` API * Add `FailureRecoveryPolicy` feature gate * Configure shorter termination grace period in test * Rename `gracePeriodLeft` to `remainingTime` * Fix option not being set on struct * Remove `util.node` in favor of `util.taints` * Add missing copyright headers * Fix feature gate being true by default * Adjust timeouts in integration test * Consolidate unhappy path test cases into a single test * Move annotation costants to the `constants` package * Remove helper methods from tests * Fix counting deletion grace period twice * Use event filter to unburden the reconciler * Adjust unit test * Move termination threshold check above node taint check * Ignore node not found errors * Emit an event upon forceful pod termination * Add `KueueFailureRecovery` condition after forceful pod termination * Add missing event filter unit tests * Simplify deletion update filter condition * Rename integration test timeout variable * Fix extra lines at the start of block
1 parent 6ff0042 commit 2e3bd8a

File tree

10 files changed

+796
-6
lines changed

10 files changed

+796
-6
lines changed

pkg/constants/constants.go

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -21,11 +21,12 @@ import (
2121
)
2222

2323
const (
24-
KueueName = "kueue"
25-
JobControllerName = KueueName + "-job-controller"
26-
WorkloadControllerName = KueueName + "-workload-controller"
27-
AdmissionName = KueueName + "-admission"
28-
ReclaimablePodsMgr = KueueName + "-reclaimable-pods"
24+
KueueName = "kueue"
25+
JobControllerName = KueueName + "-job-controller"
26+
WorkloadControllerName = KueueName + "-workload-controller"
27+
PodTerminationControllerName = KueueName + "-pod-termination-controller"
28+
AdmissionName = KueueName + "-admission"
29+
ReclaimablePodsMgr = KueueName + "-reclaimable-pods"
2930

3031
// UpdatesBatchPeriod is the batch period to hold workload updates
3132
// before syncing a Queue and ClusterQueue objects.

pkg/controller/constants/constants.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,4 +43,9 @@ const (
4343

4444
// MaxExecTimeSecondsLabel is the label key in the job that holds the maximum execution time.
4545
MaxExecTimeSecondsLabel = `kueue.x-k8s.io/max-exec-time-seconds`
46+
47+
// SafeToForcefullyTerminateAnnotationKey is the annotation key that controls whether a pod opted in to FailureRecoveryPolicy.
48+
SafeToForcefullyTerminateAnnotationKey = "kueue.x-k8s.io/safe-to-forcefully-terminate"
49+
// SafeToForcefullyTerminateAnnotationValue is the value of that annotation that enables FailureRecoveryPolicy for that pod.
50+
SafeToForcefullyTerminateAnnotationValue = "true"
4651
)

pkg/controller/core/core.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
qcache "sigs.k8s.io/kueue/pkg/cache/queue"
2626
schdcache "sigs.k8s.io/kueue/pkg/cache/scheduler"
2727
"sigs.k8s.io/kueue/pkg/constants"
28+
"sigs.k8s.io/kueue/pkg/controller/failurerecovery"
2829
"sigs.k8s.io/kueue/pkg/features"
2930
"sigs.k8s.io/kueue/pkg/scheduler/preemption/fairsharing"
3031
"sigs.k8s.io/kueue/pkg/util/waitforpodsready"
@@ -62,6 +63,20 @@ func SetupControllers(mgr ctrl.Manager, qManager *qcache.Manager, cc *schdcache.
6263
watchers = append(watchers, cohortRec)
6364
}
6465

66+
if features.Enabled(features.FailureRecoveryPolicy) {
67+
tpRec, err := failurerecovery.NewTerminatingPodReconciler(
68+
mgr.GetClient(),
69+
mgr.GetEventRecorderFor(constants.PodTerminationControllerName),
70+
)
71+
if err != nil {
72+
return "FailureRecoveryPolicy", err
73+
}
74+
75+
if err := tpRec.SetupWithManager(mgr); err != nil {
76+
return "FailureRecoveryPolicy", err
77+
}
78+
}
79+
6580
cqRec := NewClusterQueueReconciler(
6681
mgr.GetClient(),
6782
qManager,
Lines changed: 199 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,199 @@
1+
/*
2+
Copyright The Kubernetes Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package failurerecovery
18+
19+
import (
20+
"context"
21+
"fmt"
22+
"time"
23+
24+
corev1 "k8s.io/api/core/v1"
25+
"k8s.io/apimachinery/pkg/types"
26+
"k8s.io/client-go/tools/record"
27+
"k8s.io/utils/clock"
28+
"k8s.io/utils/ptr"
29+
ctrl "sigs.k8s.io/controller-runtime"
30+
"sigs.k8s.io/controller-runtime/pkg/client"
31+
"sigs.k8s.io/controller-runtime/pkg/event"
32+
33+
"sigs.k8s.io/kueue/pkg/controller/constants"
34+
utilpod "sigs.k8s.io/kueue/pkg/util/pod"
35+
utiltaints "sigs.k8s.io/kueue/pkg/util/taints"
36+
)
37+
38+
// +kubebuilder:rbac:groups="",resources=pods,verbs=get;list;watch
39+
// +kubebuilder:rbac:groups="",resources=pods/status,verbs=get;patch
40+
// +kubebuilder:rbac:groups="",resources=nodes,verbs=get;list;watch
41+
42+
var realClock = clock.RealClock{}
43+
44+
const (
45+
KueueFailureRecoveryConditionType = "KueueFailureRecovery"
46+
KueueForcefulTerminationReason = "KueueForcefullyTerminated"
47+
)
48+
49+
type TerminatingPodReconciler struct {
50+
client client.Client
51+
clock clock.Clock
52+
forcefulTerminationGracePeriod time.Duration
53+
recorder record.EventRecorder
54+
}
55+
56+
type TerminatingPodReconcilerOptions struct {
57+
clock clock.Clock
58+
forcefulTerminationGracePeriod time.Duration
59+
}
60+
61+
type TerminatingPodReconcilerOption func(*TerminatingPodReconcilerOptions)
62+
63+
func WithClock(c clock.Clock) TerminatingPodReconcilerOption {
64+
return func(o *TerminatingPodReconcilerOptions) {
65+
o.clock = c
66+
}
67+
}
68+
69+
func WithForcefulTerminationGracePeriod(t time.Duration) TerminatingPodReconcilerOption {
70+
return func(o *TerminatingPodReconcilerOptions) {
71+
o.forcefulTerminationGracePeriod = t
72+
}
73+
}
74+
75+
var defaultOptions = TerminatingPodReconcilerOptions{
76+
clock: realClock,
77+
forcefulTerminationGracePeriod: time.Minute,
78+
}
79+
80+
func NewTerminatingPodReconciler(
81+
client client.Client,
82+
recorder record.EventRecorder,
83+
opts ...TerminatingPodReconcilerOption,
84+
) (*TerminatingPodReconciler, error) {
85+
options := defaultOptions
86+
for _, opt := range opts {
87+
opt(&options)
88+
}
89+
90+
return &TerminatingPodReconciler{
91+
client: client,
92+
clock: options.clock,
93+
forcefulTerminationGracePeriod: options.forcefulTerminationGracePeriod,
94+
recorder: recorder,
95+
}, nil
96+
}
97+
98+
func (r *TerminatingPodReconciler) Generic(event.GenericEvent) bool {
99+
return false
100+
}
101+
102+
func (r *TerminatingPodReconciler) Create(e event.CreateEvent) bool {
103+
pod := e.Object.(*corev1.Pod)
104+
105+
if !podOptedInToFailurePolicy(pod) {
106+
return false
107+
}
108+
109+
if pod.DeletionTimestamp.IsZero() {
110+
return false
111+
}
112+
113+
return true
114+
}
115+
116+
func (r *TerminatingPodReconciler) Update(u event.UpdateEvent) bool {
117+
oldPod := u.ObjectOld.(*corev1.Pod)
118+
newPod := u.ObjectNew.(*corev1.Pod)
119+
120+
if !podOptedInToFailurePolicy(newPod) {
121+
return false
122+
}
123+
124+
// Pod was not marked for deletion in the update
125+
if !oldPod.DeletionTimestamp.IsZero() || newPod.DeletionTimestamp.IsZero() {
126+
return false
127+
}
128+
129+
return true
130+
}
131+
132+
func (r *TerminatingPodReconciler) Delete(event.DeleteEvent) bool {
133+
return false
134+
}
135+
136+
func podOptedInToFailurePolicy(p *corev1.Pod) bool {
137+
annotationValue, hasAnnotation := p.Annotations[constants.SafeToForcefullyTerminateAnnotationKey]
138+
return hasAnnotation && annotationValue == constants.SafeToForcefullyTerminateAnnotationValue
139+
}
140+
141+
func (r *TerminatingPodReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
142+
pod := &corev1.Pod{}
143+
if err := r.client.Get(ctx, req.NamespacedName, pod); err != nil {
144+
return ctrl.Result{}, client.IgnoreNotFound(err)
145+
}
146+
147+
// Pod was already terminated
148+
if utilpod.IsTerminated(pod) {
149+
return ctrl.Result{}, nil
150+
}
151+
152+
// Forceful termination threshold not reached
153+
now := r.clock.Now()
154+
forcefulTerminationThreshold := pod.DeletionTimestamp.Add(r.forcefulTerminationGracePeriod)
155+
if now.Before(forcefulTerminationThreshold) {
156+
remainingTime := forcefulTerminationThreshold.Sub(now)
157+
return ctrl.Result{RequeueAfter: remainingTime}, nil
158+
}
159+
160+
node := &corev1.Node{}
161+
nodeKey := types.NamespacedName{Name: pod.Spec.NodeName}
162+
if err := r.client.Get(ctx, nodeKey, node); err != nil {
163+
return ctrl.Result{}, client.IgnoreNotFound(err)
164+
}
165+
// Pod is not scheduled on an unreachable node
166+
if !utiltaints.TaintKeyExists(node.Spec.Taints, corev1.TaintNodeUnreachable) {
167+
return ctrl.Result{}, nil
168+
}
169+
170+
totalDeletionGracePeriod := time.Duration(ptr.Deref(pod.DeletionGracePeriodSeconds, 0)) + r.forcefulTerminationGracePeriod
171+
eventMessage := fmt.Sprintf(
172+
"Pod forcefully terminated after %s grace period due to unreachable node `%s` (triggered by `%s` annotation)",
173+
totalDeletionGracePeriod,
174+
node.Name,
175+
constants.SafeToForcefullyTerminateAnnotationKey,
176+
)
177+
178+
podPatch := pod.DeepCopy()
179+
podPatch.Status.Phase = corev1.PodFailed
180+
podPatch.Status.Conditions = append(podPatch.Status.Conditions, corev1.PodCondition{
181+
Type: KueueFailureRecoveryConditionType,
182+
Status: corev1.ConditionTrue,
183+
Reason: KueueForcefulTerminationReason,
184+
Message: eventMessage,
185+
})
186+
if err := r.client.Status().Patch(ctx, podPatch, client.MergeFrom(pod)); err != nil {
187+
return ctrl.Result{}, err
188+
}
189+
r.recorder.Event(pod, corev1.EventTypeWarning, KueueForcefulTerminationReason, eventMessage)
190+
191+
return ctrl.Result{}, nil
192+
}
193+
194+
func (r *TerminatingPodReconciler) SetupWithManager(mgr ctrl.Manager) error {
195+
return ctrl.NewControllerManagedBy(mgr).
196+
For(&corev1.Pod{}).
197+
WithEventFilter(r).
198+
Complete(r)
199+
}

0 commit comments

Comments
 (0)