diff --git a/engine/hatchery/kubernetes/kill_workers.go b/engine/hatchery/kubernetes/kill_workers.go index fe0e10c13e..6ad02b311f 100644 --- a/engine/hatchery/kubernetes/kill_workers.go +++ b/engine/hatchery/kubernetes/kill_workers.go @@ -40,26 +40,29 @@ func (h *HatcheryKubernetes) killAwolWorkers(ctx context.Context) error { continue } - var toDelete, found bool - for _, w := range workers { - if workerName, ok := labels[LABEL_WORKER_NAME]; ok && workerName == w.Name { - found = true + var toDelete bool + for _, container := range pod.Status.ContainerStatuses { + terminated := (container.State.Terminated != nil && (container.State.Terminated.Reason == "Completed" || container.State.Terminated.Reason == "Error")) + errImagePull := (container.State.Waiting != nil && container.State.Waiting.Reason == "ErrImagePull") + if terminated || errImagePull { + toDelete = true + log.Debug(ctx, "pod %s/%s is terminated or in error", pod.Namespace, pod.Name) break } } - if !found { - toDelete = true - } if !toDelete { - for _, container := range pod.Status.ContainerStatuses { - terminated := (container.State.Terminated != nil && (container.State.Terminated.Reason == "Completed" || container.State.Terminated.Reason == "Error")) - errImagePull := (container.State.Waiting != nil && container.State.Waiting.Reason == "ErrImagePull") - if terminated || errImagePull { - toDelete = true + var found bool + for _, w := range workers { + if workerName, ok := labels[LABEL_WORKER_NAME]; ok && workerName == w.Name { + found = true break } } + if !found && time.Since(pod.CreationTimestamp.Time) > 3*time.Minute { + toDelete = true + log.Debug(ctx, "pod %s/%s didn't match a registered worker and was started since %v", pod.Namespace, pod.Name, pod.CreationTimestamp.Time) + } } if toDelete { @@ -130,6 +133,7 @@ func (h *HatcheryKubernetes) killAwolWorkers(ctx context.Context) error { globalErr = err log.Error(ctx, "hatchery:kubernetes> killAwolWorkers> Cannot delete pod %s (%s)", pod.Name, err) } + log.Debug(ctx, "pod %s/%s killed", pod.Namespace, pod.Name) } } return globalErr diff --git a/engine/hatchery/kubernetes/kill_workers_test.go b/engine/hatchery/kubernetes/kill_workers_test.go index 8888bff74e..be0b714247 100644 --- a/engine/hatchery/kubernetes/kill_workers_test.go +++ b/engine/hatchery/kubernetes/kill_workers_test.go @@ -103,6 +103,17 @@ func TestHatcheryKubernetes_KillAwolWorkers(t *testing.T) { }, }, }, + { + ObjectMeta: metav1.ObjectMeta{ + Name: "worker-6", + Namespace: "cds-workers", + Labels: map[string]string{ + LABEL_HATCHERY_NAME: "my-hatchery", + LABEL_WORKER_NAME: "worker-6", + }, + CreationTimestamp: metav1.Now(), + }, + }, }, } gock.New("http://lolcat.kube").Get("/api/v1/namespaces/cds-workers/pods").Reply(http.StatusOK).JSON(podsList)