From 7e6382f0671c3776e31381a12da31f9754222438 Mon Sep 17 00:00:00 2001 From: Tedi Mitiku Date: Wed, 22 Nov 2023 12:33:03 -0500 Subject: [PATCH] fix: always restart logs aggregator (#1841) ## Description: This change strengthens the restart policy for the logs aggregator. Prior to this, the restart only occurred on failure. Now, we make docker attempt to always restart the logs aggregator. This should help address https://github.com/kurtosis-tech/kurtosis/issues/1832 where the logs aggregator was stopped with a `137` status code but wasn't restarted. This change also addresses a `Propagate must be provided with a cause` panic occurred here: https://github.com/kurtosis-tech/kurtosis/issues/1832. This was caused by nil err's being propagated in the create logs collector code. This change fixes that issue. ## Is this change user facing? NO ## References: https://github.com/kurtosis-tech/kurtosis/issues/1832 https://github.com/kurtosis-tech/kurtosis/issues/1311 --- .../docker_kurtosis_backend/docker_kurtosis_backend.go | 6 +++--- .../vector/vector_container_config_provider.go | 2 +- .../backend_impls/docker/docker_manager/docker_manager.go | 1 + 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/docker_kurtosis_backend.go b/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/docker_kurtosis_backend.go index dae8ce3784..a82360c4f6 100644 --- a/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/docker_kurtosis_backend.go +++ b/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/docker_kurtosis_backend.go @@ -415,14 +415,14 @@ func (backend *DockerKurtosisBackend) CreateLogsCollectorForEnclave( if maybeLogsAggregator == nil { logrus.Warnf("Logs aggregator container does not exist. This is unexpected as docker should have restarted the container automatically.") logrus.Warnf("This can be fixed by restarting the engine using `kurtosis engine restart` and attempting to create the enclave again.") - return nil, stacktrace.Propagate(err, "No logs aggregator container exists. The logs collector cannot be run without a logs aggregator.") + return nil, stacktrace.NewError("No logs aggregator container exists. The logs collector cannot be run without a logs aggregator.") } if maybeLogsAggregator.GetStatus() != container.ContainerStatus_Running { logrus.Warnf("Logs aggregator exists but is not running. Instead container status is '%v'. This is unexpected as docker should have restarted the container automatically.", maybeLogsAggregator.GetStatus()) logrus.Warnf("This can be fixed by restarting the engine using `kurtosis engine restart` and attempting to create the enclave again.") - return nil, stacktrace.Propagate(err, - "The logs aggregator container exists but is not running. Instead container status is '%v'. The logs collector cannot be run without a logs aggregator.", + return nil, stacktrace.NewError( + "The logs aggregator container exists but is not running. Instead logs aggregator container status is '%v'. The logs collector cannot be run without a logs aggregator.", maybeLogsAggregator.GetStatus(), ) } diff --git a/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/logs_aggregator_functions/implementations/vector/vector_container_config_provider.go b/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/logs_aggregator_functions/implementations/vector/vector_container_config_provider.go index 9698cb2fd1..31d42b6456 100644 --- a/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/logs_aggregator_functions/implementations/vector/vector_container_config_provider.go +++ b/container-engine-lib/lib/backend_impls/docker/docker_kurtosis_backend/logs_aggregator_functions/implementations/vector/vector_container_config_provider.go @@ -54,7 +54,7 @@ func (vector *vectorContainerConfigProvider) GetContainerArgs( // The logs aggregator should ALWAYS be running to ensure that no logs are lost for services in enclaves // Thus, instruct docker to restart the container if it exits with non-zero status code for whatever reason - restartPolicy := docker_manager.RestartPolicy("on-failure") + restartPolicy := docker_manager.RestartPolicy(docker_manager.RestartAlways) createAndStartArgs := docker_manager.NewCreateAndStartContainerArgsBuilder( containerImage, diff --git a/container-engine-lib/lib/backend_impls/docker/docker_manager/docker_manager.go b/container-engine-lib/lib/backend_impls/docker/docker_manager/docker_manager.go index f17e9f7c73..f550a5e2e7 100644 --- a/container-engine-lib/lib/backend_impls/docker/docker_manager/docker_manager.go +++ b/container-engine-lib/lib/backend_impls/docker/docker_manager/docker_manager.go @@ -142,6 +142,7 @@ const ( type RestartPolicy string const ( + RestartAlways = "always" RestartOnFailure = "on-failure" NoRestart = "" )