Skip to content

Commit

Permalink
fix(hatchery:k8s): add delay before pending workers cleanup (#6107)
Browse files Browse the repository at this point in the history
  • Loading branch information
richardlt authored Mar 10, 2022
1 parent f0e0d92 commit 0030f53
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 12 deletions.
28 changes: 16 additions & 12 deletions engine/hatchery/kubernetes/kill_workers.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,26 +40,29 @@ func (h *HatcheryKubernetes) killAwolWorkers(ctx context.Context) error {
continue
}

var toDelete, found bool
for _, w := range workers {
if workerName, ok := labels[LABEL_WORKER_NAME]; ok && workerName == w.Name {
found = true
var toDelete bool
for _, container := range pod.Status.ContainerStatuses {
terminated := (container.State.Terminated != nil && (container.State.Terminated.Reason == "Completed" || container.State.Terminated.Reason == "Error"))
errImagePull := (container.State.Waiting != nil && container.State.Waiting.Reason == "ErrImagePull")
if terminated || errImagePull {
toDelete = true
log.Debug(ctx, "pod %s/%s is terminated or in error", pod.Namespace, pod.Name)
break
}
}
if !found {
toDelete = true
}

if !toDelete {
for _, container := range pod.Status.ContainerStatuses {
terminated := (container.State.Terminated != nil && (container.State.Terminated.Reason == "Completed" || container.State.Terminated.Reason == "Error"))
errImagePull := (container.State.Waiting != nil && container.State.Waiting.Reason == "ErrImagePull")
if terminated || errImagePull {
toDelete = true
var found bool
for _, w := range workers {
if workerName, ok := labels[LABEL_WORKER_NAME]; ok && workerName == w.Name {
found = true
break
}
}
if !found && time.Since(pod.CreationTimestamp.Time) > 3*time.Minute {
toDelete = true
log.Debug(ctx, "pod %s/%s didn't match a registered worker and was started since %v", pod.Namespace, pod.Name, pod.CreationTimestamp.Time)
}
}

if toDelete {
Expand Down Expand Up @@ -130,6 +133,7 @@ func (h *HatcheryKubernetes) killAwolWorkers(ctx context.Context) error {
globalErr = err
log.Error(ctx, "hatchery:kubernetes> killAwolWorkers> Cannot delete pod %s (%s)", pod.Name, err)
}
log.Debug(ctx, "pod %s/%s killed", pod.Namespace, pod.Name)
}
}
return globalErr
Expand Down
11 changes: 11 additions & 0 deletions engine/hatchery/kubernetes/kill_workers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,17 @@ func TestHatcheryKubernetes_KillAwolWorkers(t *testing.T) {
},
},
},
{
ObjectMeta: metav1.ObjectMeta{
Name: "worker-6",
Namespace: "cds-workers",
Labels: map[string]string{
LABEL_HATCHERY_NAME: "my-hatchery",
LABEL_WORKER_NAME: "worker-6",
},
CreationTimestamp: metav1.Now(),
},
},
},
}
gock.New("http://lolcat.kube").Get("/api/v1/namespaces/cds-workers/pods").Reply(http.StatusOK).JSON(podsList)
Expand Down

0 comments on commit 0030f53

Please sign in to comment.