Skip to content

Commit

Permalink
Fix GPU cluster creation and resize (#182)
Browse files Browse the repository at this point in the history
* Fix GPU cluster creation and resize

* Update gcore_ai_cluster docs

* Prevent resize to 0 instances

* Update gcorelabscloud-go to v0.11.0
  • Loading branch information
pedrodeoliveira authored Feb 13, 2025
1 parent 56b2429 commit ff6f966
Show file tree
Hide file tree
Showing 4 changed files with 36 additions and 111 deletions.
4 changes: 3 additions & 1 deletion docs/resources/ai_cluster.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,8 @@ resource "gcore_ai_cluster" "cluster1" {
image_id = "f6aa6e75-ab88-4c19-889d-79133366cb83"
cluster_name = "cluster1"
keypair_name = "front"
instances_count = 1
volume {
source = "image"
image_id = "f6aa6e75-ab88-4c19-889d-79133366cb83"
Expand Down Expand Up @@ -69,6 +70,7 @@ resource "gcore_ai_cluster" "cluster1" {
- `user_data` (String) String in base64 format. Must not be passed together with 'username' or 'password'. Examples of user_data: https://cloudinit.readthedocs.io/en/latest/topics/examples.html
- `username` (String) Name of a new user to create in the Linux instance. May be passed together with the 'password' parameter
- `volume` (Block Set) List of volumes attached to the cluster (see [below for nested schema](#nestedblock--volume))
- `instances_count` (Number) Number of instances in the cluster

### Read-Only

Expand Down
87 changes: 30 additions & 57 deletions gcore/resource_gcore_ai_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ const (
AIClusterCreatingTimeout int = 1200
AIClusterSuspendTimeout int = 300

AIClusterPoint = "ai/clusters"
TaskPoint = "tasks"
AIClusterPoint = "ai/clusters"
AIClusterGPUPoint = "ai/clusters/gpu"
TaskPoint = "tasks"
)

const (
Expand Down Expand Up @@ -109,6 +110,11 @@ func resourceAICluster() *schema.Resource {
Description: "AI Cluster Name",
Required: true,
},
"instances_count": {
Type: schema.TypeInt,
Description: "Number of instances to create",
Optional: true,
},
"cluster_status": {
Type: schema.TypeString,
Description: "AI Cluster status",
Expand Down Expand Up @@ -190,7 +196,6 @@ func resourceAICluster() *schema.Resource {
"size": {
Type: schema.TypeInt,
Description: "Volume size, GiB",
Computed: true,
Optional: true,
},
"created_at": {
Expand All @@ -212,7 +217,7 @@ func resourceAICluster() *schema.Resource {
"image_id": {
Type: schema.TypeString,
Description: "Volume ID. Mandatory if volume is pre-existing volume",
Optional: true,
Required: true,
},
"attachments": {
Type: schema.TypeSet,
Expand Down Expand Up @@ -538,9 +543,6 @@ func checkAIClusterStatus(client *gcorecloud.ServiceClient, clusterID string, de
}

func validateCreateOpts(createOpts *ai.CreateOpts) error {
if isBmFlavor(createOpts.Flavor) && len(createOpts.Volumes) > 0 {
return errors.New("volumes are not supported for baremetal poplar servers")
}
if !isBmFlavor(createOpts.Flavor) && len(createOpts.Volumes) == 0 {
return errors.New("at least one image volume is required for vm poplar cluster")
}
Expand Down Expand Up @@ -590,7 +592,7 @@ func resourceAIClusterCreate(ctx context.Context, d *schema.ResourceData, m inte
config := m.(*Config)
provider := config.Provider

client, err := CreateClient(provider, d, AIClusterPoint, versionPointV1)
client, err := CreateClient(provider, d, AIClusterGPUPoint, versionPointV1)
if err != nil {
return diag.FromErr(err)
}
Expand All @@ -609,6 +611,10 @@ func resourceAIClusterCreate(ctx context.Context, d *schema.ResourceData, m inte
createOpts.Keypair = d.Get("keypair_name").(string)
createOpts.ImageID = d.Get("image_id").(string)

if instancesCount, ok := d.GetOk("instances_count"); ok {
createOpts.InstancesCount = instancesCount.(int)
}

if userData, ok := d.GetOk("userdata"); ok {
createOpts.UserData = userData.(string)
}
Expand Down Expand Up @@ -658,10 +664,8 @@ func resourceAIClusterCreate(ctx context.Context, d *schema.ResourceData, m inte
if err != nil {
return nil, fmt.Errorf("cannot get task with ID: %s. Error: %w", task, err)
}
clusterID, err := ai.ExtractAIClusterIDFromTask(taskInfo)
if err != nil {
return nil, fmt.Errorf("cannot retrieve AI cluster ID from task info: %w", err)
}
// on create task, the cluster_id matches the task_id
clusterID := taskInfo.ID
return clusterID, nil
},
)
Expand Down Expand Up @@ -805,7 +809,7 @@ func getDetachOptions(instanceInterfaces []instances.Interface, detachIface ai.A
}
}

var IsResize bool = false
var IsResize = false

func resourceAIClusterUpdate(ctx context.Context, d *schema.ResourceData, m interface{}) diag.Diagnostics {
log.Println("[DEBUG] Start AI cluster updating")
Expand All @@ -815,6 +819,10 @@ func resourceAIClusterUpdate(ctx context.Context, d *schema.ResourceData, m inte
if err != nil {
return diag.FromErr(err)
}
clientV1GPU, err := CreateClient(provider, d, AIClusterGPUPoint, versionPointV1)
if err != nil {
return diag.FromErr(err)
}
clientV2, err := CreateClient(provider, d, AIClusterPoint, versionPointV2)
if err != nil {
return diag.FromErr(err)
Expand Down Expand Up @@ -866,61 +874,26 @@ func resourceAIClusterUpdate(ctx context.Context, d *schema.ResourceData, m inte
}

// Make resize
if d.HasChanges("flavor", "image_id", "keypair_name", "user_data", "username", "password") || (d.HasChanges("interface") && isBmFlavor(d.Get("flavor").(string))) {
if d.HasChanges("instances_count") {
IsResize = true
_, newSGs := d.GetChange("security_group")
securityGroupList := newSGs.(*schema.Set).List()
securityGroupIDs := make([]gcorecloud.ItemID, len(securityGroupList))
for sgIndex, sgID := range securityGroupList {
securityGroupIDs[sgIndex] = gcorecloud.ItemID{ID: sgID.(map[string]interface{})["id"].(string)}
}
_, flavor := d.GetChange("flavor")
_, image_id := d.GetChange("image_id")
_, keypairName := d.GetChange("keypair_name")
_, userData := d.GetChange("user_data")
_, username := d.GetChange("username")
_, password := d.GetChange("password")

resizeOpts := ai.ResizeAIClusterOpts{
Flavor: flavor.(string),
ImageID: image_id.(string),
Interfaces: []instances.InterfaceInstanceCreateOpts{},
Volumes: []instances.CreateVolumeOpts{},
SecurityGroups: securityGroupIDs,
Keypair: keypairName.(string),
Password: password.(string),
Username: username.(string),
UserData: userData.(string),
Metadata: map[string]string{},
instancesCount, ok := d.GetOk("instances_count")
if !ok || instancesCount.(int) == 0 {
// if the number of instances has been specified before, then it cannot be removed or set to 0
return diag.FromErr(errors.New("cannot resize cluster to 0 instances"))
}
_, newVolumes := d.GetChange("volume")

volumeList := newVolumes.([]interface{})
if len(volumeList) > 0 {
vs, err := extractVolumesMap(volumeList)
if err != nil {
return diag.FromErr(err)
}
resizeOpts.Volumes = vs
resizeOpts := ai.ResizeGPUAIClusterOpts{
InstancesCount: instancesCount.(int),
}

_, newIface := d.GetChange("interface")
interfaceList := newIface.([]interface{})
if len(interfaceList) > 0 {
ifaces, err := extractAIClusterInterfacesMap(interfaceList)
if err != nil {
return diag.FromErr(err)
}
resizeOpts.Interfaces = ifaces
}
_, newMetadata := d.GetChange("cluster_metadata")

for metaKey, metaValue := range newMetadata.(map[string]interface{}) {
resizeOpts.Metadata[metaKey] = metaValue.(string)
}

log.Printf("[DEBUG] AI cluster resize options: %+v", resizeOpts)
results, err := ai.Resize(clientV1, clusterID, resizeOpts).Extract()
results, err := ai.Resize(clientV1GPU, clusterID, resizeOpts).Extract()
if err != nil {
return diag.FromErr(err)
}
Expand All @@ -943,7 +916,7 @@ func resourceAIClusterUpdate(ctx context.Context, d *schema.ResourceData, m inte
oldVolumeList := extractInstanceVolumesMap(oldVolumes.(*schema.Set).List())
newVolumeList := extractInstanceVolumesMap(newVolumes.(*schema.Set).List())
if isBmFlavor(d.Get("flavor").(string)) && len(newVolumeList) > 0 {
return diag.FromErr(errors.New("baremetal servers don't support external voluems"))
return diag.FromErr(errors.New("baremetal servers don't support external volumes"))
}
poplarInstances := d.Get("poplar_servers").([]interface{})
if len(poplarInstances) > 1 {
Expand Down
3 changes: 1 addition & 2 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ require (
github.com/G-Core/gcore-dns-sdk-go v0.2.9
github.com/G-Core/gcore-storage-sdk-go v0.1.34
github.com/G-Core/gcorelabscdn-go v1.0.25
github.com/G-Core/gcorelabscloud-go v0.10.1
github.com/G-Core/gcorelabscloud-go v0.11.0
github.com/hashicorp/go-cty v1.4.1-0.20200414143053-d3edf31b6320
github.com/hashicorp/terraform-plugin-sdk/v2 v2.27.0
github.com/mitchellh/mapstructure v1.5.0
Expand Down Expand Up @@ -53,7 +53,6 @@ require (
github.com/hashicorp/hcl/v2 v2.17.0 // indirect
github.com/hashicorp/logutils v1.0.0 // indirect
github.com/hashicorp/terraform-json v0.23.0 // indirect
github.com/hashicorp/terraform-plugin-docs v0.20.1 // indirect
github.com/hashicorp/terraform-plugin-go v0.16.0 // indirect
github.com/hashicorp/terraform-plugin-log v0.9.0 // indirect
github.com/hashicorp/terraform-registry-address v0.2.1 // indirect
Expand Down
Loading

0 comments on commit ff6f966

Please sign in to comment.