Breaking change: Stop deleting failed GKE clusters, taint instead (#8978) (#6301)

Signed-off-by: Modular Magician <[email protected]>
modular-magician authored Sep 18, 2023
1 parent 6fadf39 commit 6c8ef79
Showing 3 changed files with 14 additions and 63 deletions.
3 changes: 3 additions & 0 deletions .changelog/8978.txt
@@ -0,0 +1,3 @@
+```release-note:breaking-change
+container: removed the behaviour where `google_container_cluster` would delete the cluster if it was created in an error state. Instead, the provider marks the cluster as tainted, allowing manual inspection and intervention. To proceed with deletion, run another `terraform apply`.
+```
69 changes: 7 additions & 62 deletions google-beta/services/container/resource_container_cluster.go
@@ -2319,29 +2319,26 @@ func resourceContainerClusterCreate(d *schema.ResourceData, meta interface{}) error {
 		if err := d.Set("operation", op.Name); err != nil {
 			return fmt.Errorf("Error setting operation: %s", err)
 		}
 		return nil
 	default:
 		// leaving default case to ensure this is non blocking
 	}

 	// Try a GET on the cluster so we can see the state in debug logs. This will help classify error states.
 	clusterGetCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.Get(containerClusterFullName(project, location, clusterName))
 	if config.UserProjectOverride {
 		clusterGetCall.Header().Add("X-Goog-User-Project", project)
 	}

 	_, getErr := clusterGetCall.Do()
 	if getErr != nil {
 		log.Printf("[WARN] Cluster %s was created in an error state and not found", clusterName)
 		d.SetId("")
 	}

-	if deleteErr := cleanFailedContainerCluster(d, meta); deleteErr != nil {
-		log.Printf("[WARN] Unable to clean up cluster from failed creation: %s", deleteErr)
-		// Leave ID set as the cluster likely still exists and should not be removed from state yet.
-	} else {
-		log.Printf("[WARN] Verified failed creation of cluster %s was cleaned up", d.Id())
-		d.SetId("")
-	}
+	// The resource didn't actually create
+	// Don't clear cluster id, this will taint the resource
+	log.Printf("[WARN] GKE cluster %s was created in an error state, and has been marked as tainted", clusterName)
 	return waitErr
 }
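The mechanism behind this hunk is a terraform-plugin-sdk convention: if a Create function returns an error while the resource ID is still set, Terraform persists the partial state and marks the resource as tainted, so the next apply plans a destroy-and-recreate. Below is a minimal sketch of that pattern, assuming SDK v2; the names `resourceExampleCreate`, `waitForCreation`, and the ID string are hypothetical, not taken from this provider.

```go
package example

import (
	"fmt"

	"github.com/hashicorp/terraform-plugin-sdk/v2/helper/schema"
)

// resourceExampleCreate shows the taint-on-error pattern: set the ID as soon
// as the object exists server-side, and on a later failure return the error
// without clearing the ID.
func resourceExampleCreate(d *schema.ResourceData, meta interface{}) error {
	// After issuing the create call, record the identity immediately.
	d.SetId("projects/my-proj/locations/us-central1/clusters/my-cluster") // hypothetical ID

	if waitErr := waitForCreation(); waitErr != nil {
		// Returning an error while d.Id() is non-empty makes Terraform
		// keep the resource in state and mark it tainted; the next apply
		// plans a replacement instead of the provider deleting the
		// cluster itself.
		return fmt.Errorf("waiting for cluster creation: %w", waitErr)
	}
	return nil
}

// waitForCreation stands in for the provider's operation polling; it always
// fails here to exercise the error path.
func waitForCreation() error {
	return fmt.Errorf("cluster entered an error state")
}
```

If inspection shows the cluster is actually healthy, `terraform untaint` clears the taint without forcing a replacement.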

@@ -2486,14 +2483,8 @@ func resourceContainerClusterRead(d *schema.ResourceData, meta interface{}) error {
 		d.SetId("")
 	}

-	if deleteErr := cleanFailedContainerCluster(d, meta); deleteErr != nil {
-		log.Printf("[WARN] Unable to clean up cluster from failed creation: %s", deleteErr)
-		// Leave ID set as the cluster likely still exists and should not be removed from state yet.
-	} else {
-		log.Printf("[WARN] Verified failed creation of cluster %s was cleaned up", d.Id())
-		d.SetId("")
-	}
+	// The resource didn't actually create
+	// Don't clear cluster id, this will taint the resource
+	log.Printf("[WARN] GKE cluster %s was created in an error state, and has been marked as tainted", clusterName)
 	return waitErr
 }
 }

@@ -4079,52 +4070,6 @@ func resourceContainerClusterDelete(d *schema.ResourceData, meta interface{}) error {
 	return nil
 }

-// cleanFailedContainerCluster deletes clusters that failed but were
-// created in an error state. Similar to resourceContainerClusterDelete
-// but implemented in separate function as it doesn't try to lock already
-// locked cluster state, does different error handling, and doesn't do retries.
-func cleanFailedContainerCluster(d *schema.ResourceData, meta interface{}) error {
-	config := meta.(*transport_tpg.Config)
-
-	project, err := tpgresource.GetProject(d, config)
-	if err != nil {
-		return err
-	}
-
-	location, err := tpgresource.GetLocation(d, config)
-	if err != nil {
-		return err
-	}
-
-	userAgent, err := tpgresource.GenerateUserAgentString(d, config.UserAgent)
-	if err != nil {
-		return err
-	}
-
-	clusterName := d.Get("name").(string)
-	fullName := containerClusterFullName(project, location, clusterName)
-
-	log.Printf("[DEBUG] Cleaning up failed GKE cluster %s", d.Get("name").(string))
-	clusterDeleteCall := config.NewContainerClient(userAgent).Projects.Locations.Clusters.Delete(fullName)
-	if config.UserProjectOverride {
-		clusterDeleteCall.Header().Add("X-Goog-User-Project", project)
-	}
-	op, err := clusterDeleteCall.Do()
-	if err != nil {
-		return transport_tpg.HandleNotFoundError(err, d, fmt.Sprintf("Container Cluster %q", d.Get("name").(string)))
-	}
-
-	// Wait until it's deleted
-	waitErr := ContainerOperationWait(config, op, project, location, "deleting GKE cluster", userAgent, d.Timeout(schema.TimeoutDelete))
-	if waitErr != nil {
-		return waitErr
-	}
-
-	log.Printf("[INFO] GKE cluster %s has been deleted", d.Id())
-	d.SetId("")
-	return nil
-}
-
 var containerClusterRestingStates = RestingStates{
 	"RUNNING":  ReadyState,
 	"DEGRADED": ErrorState,
5 changes: 4 additions & 1 deletion google-beta/services/container/resource_container_cluster_test.go
@@ -3330,6 +3330,9 @@ func TestAccContainerCluster_autoprovisioningDefaultsManagement(t *testing.T) {
 	})
 }

+// This resource originally cleaned up the dangling cluster directly, but now
+// taints it, having Terraform clean it up during the next apply. This test
+// name is now inexact, but is being preserved to maintain the test history.
 func TestAccContainerCluster_errorCleanDanglingCluster(t *testing.T) {
 	t.Parallel()

@@ -3358,7 +3361,7 @@ func TestAccContainerCluster_errorCleanDanglingCluster(t *testing.T) {
 				Config:      overlapConfig,
 				ExpectError: regexp.MustCompile("Error waiting for creating GKE cluster"),
 			},
-			// If dangling cluster wasn't deleted, this plan will return an error
+			// If the tainted cluster isn't deleted, this step will return an error
 			{
 				Config:   overlapConfig,
 				PlanOnly: true,
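The two steps above follow a common acceptance-test shape: force a failed create, then check what the follow-up plan does with the leftover resource. Below is a hedged sketch of those steps using SDK v2's helper/resource types; `overlapConfigStub` is a placeholder for the real config, and `ExpectNonEmptyPlan` is an assumption about how the truncated step is completed, since a tainted resource should produce a non-empty (replacement) plan.

```go
package example_test

import (
	"regexp"

	"github.com/hashicorp/terraform-plugin-sdk/v2/helper/resource"
)

// overlapConfigStub stands in for the real test configuration, which
// collides with an already-existing cluster to force a creation failure.
const overlapConfigStub = `# HCL config that overlaps an existing cluster`

// testSteps sketches the step sequence described above.
var testSteps = []resource.TestStep{
	{
		// Creation fails and, under the new behavior, leaves the
		// cluster tainted rather than deleting it.
		Config:      overlapConfigStub,
		ExpectError: regexp.MustCompile("Error waiting for creating GKE cluster"),
	},
	{
		// A plan against the tainted resource should propose a
		// replacement; expecting a non-empty plan here is an
		// assumption, as the original step is truncated above.
		Config:             overlapConfigStub,
		PlanOnly:           true,
		ExpectNonEmptyPlan: true,
	},
}
```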
