Skip to content

Commit

Permalink
feat: adding error logs when K8s resource creation fails in the creat…
Browse files Browse the repository at this point in the history
…e enclave workflow (#2074)

## Description:
Add error logs when the creation of a K8s resource fails in the create
enclave workflow.

## Is this change user facing?
NO

## References (if applicable):
This change was motivated by a user-support case: the engine logs we received
showed that something failed while creating the APIC's pod, but we couldn't
see the error itself because the subsequent removal of the other k8s
resources also failed.

These new log lines should show us the error from the APIC's pod creation
in this scenario.

Logs:
``` 
2024-01-24T19:03:09.828984663Z �[37mDEBU�[0m[2024-01-24T19:03:09Z][kubernetes_manager.go:CreatePod] Going to start pod using the following JSON: {"metadata":{"name":"kurtosis-api","creationTimestamp":null,"labels":{"kurtosistech.com/app-id":"kurtosis","kurtosistech.com/enclave-id":"a1367260310749f9a9406218a09fd9ae","kurtosistech.com/resource-type":"api-container"}},"spec":{"volumes":[{"name":"enclave-data","persistentVolumeClaim":{"claimName":"enclave-data"}}],"containers":[{"name":"kurtosis-core-api","image":"kurtosistech/core:0.86.8","ports":[{"name":"grpc","containerPort":7443,"protocol":"TCP"}],"env":[{"name":"OWN_IP_ADDRESS","value":"10.98.123.85"},{"name":"SERIALIZED_ARGS","value":"{\"version\":\"0.86.8\",\"logLevel\":\"debug\",\"grpcListenPortNum\":7443,\"enclaveUuid\":\"a1367260310749f9a9406218a09fd9ae\",\"enclaveDataVolume\":\"/kurtosis-data\",\"kurtosisBackendType\":\"kubernetes\",\"kurtosisBackendConfig\":{\"StorageClass\":\"vultr-block-storage-hdd\"},\"enclaveEnvVars\":\"{}\",\"isProductionEnclave\":false,\"metricsUserID\":\"3dd3097f9bb3c18f44b93677edc2b20a3deee4368817088b715b50d70398fa04\",\"didUserAcceptSendingMetrics\":false,\"is_ci\":false,\"cloud_user_id\":\"\",\"cloud_instance_id\":\"\"}"},{"name":"API_CONTAINER_OWN_NAMESPACE_NAME","valueFrom":{"fieldRef":{"fieldPath":"metadata.namespace"}}}],"resources":{},"volumeMounts":[{"name":"enclave-data","mountPath":"/kurtosis-data"}]}],"restartPolicy":"OnFailure","serviceAccountName":"kurtosis-api"},"status":{}} 
2024-01-24T19:04:12.551574597Z �[31mERRO�[0m[2024-01-24T19:04:12Z][kubernetes_kurtosis_backend_api_container_functions.go:func6] Creating the API container didn't complete successfully, so we tried to delete role binding 'kurtosis-api' in namespace 'kt-rusty-forest' that we created but an error was thrown:
2024-01-24T19:04:12.551599695Z Failed to delete role bindings with name 'kurtosis-api' in namespace 'kt-rusty-forest'
2024-01-24T19:04:12.551606898Z  --- at /home/circleci/project/container-engine-lib/lib/backend_impls/kubernetes/kubernetes_manager/kubernetes_manager.go:856 (KubernetesManager.RemoveRoleBindings) ---
2024-01-24T19:04:12.551613080Z Caused by: client rate limiter Wait returned an error: context canceled 
2024-01-24T19:04:12.551619231Z �[31mERRO�[0m[2024-01-24T19:04:12Z][kubernetes_kurtosis_backend_api_container_functions.go:func6] ACTION REQUIRED: You'll need to manually remove role binding with name 'kurtosis-api'!!!!!!! 
2024-01-24T19:04:12.551773996Z �[31mERRO�[0m[2024-01-24T19:04:12Z][kubernetes_kurtosis_backend_api_container_functions.go:func5] Creating the API container didn't complete successfully, so we tried to delete role 'kurtosis-api' in namespace 'kt-rusty-forest' that we created but an error was thrown:
2024-01-24T19:04:12.551803382Z Failed to delete role with name 'kurtosis-api' in namespace 'kt-rusty-forest'
2024-01-24T19:04:12.551809623Z  --- at /home/circleci/project/container-engine-lib/lib/backend_impls/kubernetes/kubernetes_manager/kubernetes_manager.go:767 (KubernetesManager.RemoveRole) ---
2024-01-24T19:04:12.551847205Z Caused by: client rate limiter Wait returned an error: context canceled 
2024-01-24T19:04:12.551953247Z �[31mERRO�[0m[2024-01-24T19:04:12Z][kubernetes_kurtosis_backend_api_container_functions.go:func5] ACTION REQUIRED: You'll need to manually remove role with name 'kurtosis-api'!!!!!!! 
2024-01-24T19:04:12.552051153Z �[31mERRO�[0m[2024-01-24T19:04:12Z][kubernetes_kurtosis_backend_api_container_functions.go:func4] Creating the API container didn't complete successfully, so we tried to delete cluster role binding 'kurtosis-api-a1367260310749f9a9406218a09fd9ae' that we created but an error was thrown:
2024-01-24T19:04:12.552056072Z Failed to delete cluster role binding with name 'kurtosis-api-a1367260310749f9a9406218a09fd9ae'
2024-01-24T19:04:12.552060330Z  --- at /home/circleci/project/container-engine-lib/lib/backend_impls/kubernetes/kubernetes_manager/kubernetes_manager.go:1032 (KubernetesManager.RemoveClusterRoleBindings) ---
2024-01-24T19:04:12.552063136Z Caused by: client rate limiter Wait returned an error: context canceled 
2024-01-24T19:04:12.552087923Z �[31mERRO�[0m[2024-01-24T19:04:12Z][kubernetes_kurtosis_backend_api_container_functions.go:func4] ACTION REQUIRED: You'll need to manually remove cluster role binding with name 'kurtosis-api-a1367260310749f9a9406218a09fd9ae'!!!!!!! 
2024-01-24T19:04:12.552206007Z �[31mERRO�[0m[2024-01-24T19:04:12Z][kubernetes_kurtosis_backend_api_container_functions.go:func3] Creating the API container didn't complete successfully, so we tried to delete cluster role 'kurtosis-api-a1367260310749f9a9406218a09fd9ae' that we created but an error was thrown:
2024-01-24T19:04:12.552211899Z Failed to delete cluster role with name 'kurtosis-api-a1367260310749f9a9406218a09fd9ae'
2024-01-24T19:04:12.552214745Z  --- at /home/circleci/project/container-engine-lib/lib/backend_impls/kubernetes/kubernetes_manager/kubernetes_manager.go:944 (KubernetesManager.RemoveClusterRole) ---
2024-01-24T19:04:12.552217460Z Caused by: client rate limiter Wait returned an error: context canceled 
2024-01-24T19:04:12.552261513Z �[31mERRO�[0m[2024-01-24T19:04:12Z][kubernetes_kurtosis_backend_api_container_functions.go:func3] ACTION REQUIRED: You'll need to manually remove cluster role with name 'kurtosis-api-a1367260310749f9a9406218a09fd9ae'!!!!!!! 
2024-01-24T19:04:12.552359049Z �[31mERRO�[0m[2024-01-24T19:04:12Z][kubernetes_kurtosis_backend_api_container_functions.go:func2] Creating the API container didn't complete successfully, so we tried to delete service account 'kurtosis-api' in namespace 'kt-rusty-forest' that we created but an error was thrown:
2024-01-24T19:04:12.552364299Z Failed to delete service account with name 'kurtosis-api' in namespace 'kt-rusty-forest'
2024-01-24T19:04:12.552367154Z  --- at /home/circleci/project/container-engine-lib/lib/backend_impls/kubernetes/kubernetes_manager/kubernetes_manager.go:679 (KubernetesManager.RemoveServiceAccount) ---
2024-01-24T19:04:12.552369870Z Caused by: client rate limiter Wait returned an error: context canceled 
2024-01-24T19:04:12.552413432Z �[31mERRO�[0m[2024-01-24T19:04:12Z][kubernetes_kurtosis_backend_api_container_functions.go:func2] ACTION REQUIRED: You'll need to manually remove service account with name 'kurtosis-api'!!!!!!! 
2024-01-24T19:04:12.552500759Z �[31mERRO�[0m[2024-01-24T19:04:12Z][kubernetes_kurtosis_backend_api_container_functions.go:func1] Creating the api container didn't complete successfully, so we tried to delete Kubernetes service 'kurtosis-api' that we created but an error was thrown:
2024-01-24T19:04:12.552507441Z Failed to delete service 'kurtosis-api' with delete options '{TypeMeta:{Kind: APIVersion:} GracePeriodSeconds:<nil> Preconditions:nil OrphanDependents:<nil> PropagationPolicy:0x31a42d0 DryRun:[]}' in namespace 'kt-rusty-forest'
2024-01-24T19:04:12.552510908Z  --- at /home/circleci/project/container-engine-lib/lib/backend_impls/kubernetes/kubernetes_manager/kubernetes_manager.go:234 (KubernetesManager.RemoveService) ---
2024-01-24T19:04:12.552514114Z Caused by: client rate limiter Wait returned an error: context canceled 
2024-01-24T19:04:12.552562716Z �[31mERRO�[0m[2024-01-24T19:04:12Z][kubernetes_kurtosis_backend_api_container_functions.go:func1] ACTION REQUIRED: You'll need to manually remove Kubernetes service with name 'kurtosis-api'!!!!!!! 

```
  • Loading branch information
leoporoli authored Jan 24, 2024
1 parent d546905 commit a35e0a2
Showing 1 changed file with 25 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package kubernetes_kurtosis_backend

import (
"context"
"fmt"
"github.com/kurtosis-tech/kurtosis/container-engine-lib/lib/backend_impls/kubernetes/kubernetes_kurtosis_backend/consts"
"github.com/kurtosis-tech/kurtosis/container-engine-lib/lib/backend_impls/kubernetes/kubernetes_kurtosis_backend/shared_helpers"
kubernetes_manager_consts "github.com/kurtosis-tech/kurtosis/container-engine-lib/lib/backend_impls/kubernetes/kubernetes_manager/consts"
Expand Down Expand Up @@ -162,7 +163,9 @@ func (backend *KubernetesKurtosisBackend) CreateAPIContainer(
servicePorts,
)
if err != nil {
return nil, stacktrace.Propagate(err, "An error occurred while creating the service with name '%s' in namespace '%s' with ports '%v'", apiContainerServiceName, enclaveNamespaceName, grpcPortInt32)
errMsg := fmt.Sprintf("An error occurred while creating the service with name '%s' in namespace '%s' with ports '%v'", apiContainerServiceName, enclaveNamespaceName, grpcPortInt32)
logrus.Errorf("%s. Error was:\n%s", errMsg, err)
return nil, stacktrace.Propagate(err, errMsg)
}
var shouldRemoveService = true
defer func() {
Expand Down Expand Up @@ -198,7 +201,9 @@ func (backend *KubernetesKurtosisBackend) CreateAPIContainer(
serviceAccountLabels := shared_helpers.GetStringMapFromLabelMap(serviceAccountAttributes.GetLabels())
apiContainerServiceAccount, err := backend.kubernetesManager.CreateServiceAccount(ctx, serviceAccountName, enclaveNamespaceName, serviceAccountLabels)
if err != nil {
return nil, stacktrace.Propagate(err, "An error occurred creating service account '%v' with labels '%+v' in namespace '%v'", serviceAccountName, serviceAccountLabels, enclaveNamespaceName)
errMsg := fmt.Sprintf("An error occurred creating service account '%v' with labels '%+v' in namespace '%v'", serviceAccountName, serviceAccountLabels, enclaveNamespaceName)
logrus.Errorf("%s. Error was:\n%s", errMsg, err)
return nil, stacktrace.Propagate(err, errMsg)
}
apiContainerServiceAccountName := apiContainerServiceAccount.GetName()
shouldRemoveServiceAccount := true
Expand Down Expand Up @@ -255,8 +260,10 @@ func (backend *KubernetesKurtosisBackend) CreateAPIContainer(

apiContainerClusterRole, err := backend.kubernetesManager.CreateClusterRoles(ctx, clusterRoleName, clusterRolePolicyRules, clusterRoleLabels)
if err != nil {
return nil, stacktrace.Propagate(err, "An error occurred creating cluster role '%v' with policy rules '%+v' "+
errMsg := fmt.Sprintf("An error occurred creating cluster role '%v' with policy rules '%+v' "+
"and labels '%+v' in namespace '%v'", clusterRoleName, clusterRolePolicyRules, clusterRoleLabels, enclaveNamespaceName)
logrus.Errorf("%s. Error was:\n%s", errMsg, err)
return nil, stacktrace.Propagate(err, errMsg)
}
shouldRemoveClusterRole := true
defer func() {
Expand Down Expand Up @@ -296,8 +303,10 @@ func (backend *KubernetesKurtosisBackend) CreateAPIContainer(

apiContainerClusterRoleBinding, err := backend.kubernetesManager.CreateClusterRoleBindings(ctx, clusterRoleBindingName, clusterRoleBindingsSubjects, clusterRoleBindingsRoleRef, clusterRoleBindingsLabels)
if err != nil {
return nil, stacktrace.Propagate(err, "An error occurred creating cluster role bindings '%v' with subjects "+
errMsg := fmt.Sprintf("An error occurred creating cluster role bindings '%v' with subjects "+
"'%+v' and role ref '%+v' in namespace '%v'", clusterRoleBindingName, clusterRoleBindingsSubjects, clusterRoleBindingsRoleRef, enclaveNamespaceName)
logrus.Errorf("%s. Error was:\n%s", errMsg, err)
return nil, stacktrace.Propagate(err, errMsg)
}
shouldRemoveClusterRoleBinding := true
defer func() {
Expand Down Expand Up @@ -353,8 +362,10 @@ func (backend *KubernetesKurtosisBackend) CreateAPIContainer(

apiContainerRole, err := backend.kubernetesManager.CreateRole(ctx, roleName, enclaveNamespaceName, rolePolicyRules, roleLabels)
if err != nil {
return nil, stacktrace.Propagate(err, "An error occurred creating role '%v' with policy rules '%+v' "+
errMsg := fmt.Sprintf("An error occurred creating role '%v' with policy rules '%+v' "+
"and labels '%+v' in namespace '%v'", roleName, rolePolicyRules, roleLabels, enclaveNamespaceName)
logrus.Errorf("%s. Error was:\n%s", errMsg, err)
return nil, stacktrace.Propagate(err, errMsg)
}
shouldRemoveRole := true
defer func() {
Expand Down Expand Up @@ -394,8 +405,10 @@ func (backend *KubernetesKurtosisBackend) CreateAPIContainer(

apiContainerRoleBinding, err := backend.kubernetesManager.CreateRoleBindings(ctx, roleBindingName, enclaveNamespaceName, roleBindingsSubjects, roleBindingsRoleRef, roleBindingsLabels)
if err != nil {
return nil, stacktrace.Propagate(err, "An error occurred creating role bindings '%v' with subjects "+
errMsg := fmt.Sprintf("An error occurred creating role bindings '%v' with subjects "+
"'%+v' and role ref '%+v' in namespace '%v'", roleBindingName, roleBindingsSubjects, roleBindingsRoleRef, enclaveNamespaceName)
logrus.Errorf("%s. Error was:\n%s", errMsg, err)
return nil, stacktrace.Propagate(err, errMsg)
}
shouldRemoveRoleBinding := true
defer func() {
Expand All @@ -422,7 +435,9 @@ func (backend *KubernetesKurtosisBackend) CreateAPIContainer(
volumeLabelsStrs[key.GetString()] = value.GetString()
}
if _, err = backend.kubernetesManager.CreatePersistentVolumeClaim(ctx, enclaveNamespaceName, enclaveDataDirVolumeName, volumeLabelsStrs, enclaveDataDirVolumeSize); err != nil {
return nil, stacktrace.Propagate(err, "An error occurred creating the persistent volume claim for enclave data dir volume for enclave '%s'", enclaveDataDirVolumeName)
errMsg := fmt.Sprintf("An error occurred creating the persistent volume claim for enclave data dir volume for enclave '%s'", enclaveDataDirVolumeName)
logrus.Errorf("%s. Error was:\n%s", errMsg, err)
return nil, stacktrace.Propagate(err, errMsg)
}
shouldDeleteVolumeClaim := true

Expand Down Expand Up @@ -464,7 +479,9 @@ func (backend *KubernetesKurtosisBackend) CreateAPIContainer(
apiContainerRestartPolicy,
)
if err != nil {
return nil, stacktrace.Propagate(err, "An error occurred while creating the pod with name '%s' in namespace '%s' with image '%s'", apiContainerPodName, enclaveNamespaceName, image)
errMsg := fmt.Sprintf("An error occurred while creating the pod with name '%s' in namespace '%s' with image '%s'", apiContainerPodName, enclaveNamespaceName, image)
logrus.Errorf("%s. Error was:\n%s", errMsg, err)
return nil, stacktrace.Propagate(err, errMsg)
}
var shouldRemovePod = true
defer func() {
Expand Down

0 comments on commit a35e0a2

Please sign in to comment.