Skip to content

Commit

Permalink
This release adds APIs for new features for SageMaker endpoint to scale down to zero instances, native support for multi-adapter inference, and endpoint scaling improvements.
Browse files Browse the repository at this point in the history
  • Loading branch information
aws-sdk-dotnet-automation committed Nov 22, 2024
1 parent ef3f3cd commit 649eebe
Show file tree
Hide file tree
Showing 24 changed files with 716 additions and 40 deletions.
60 changes: 49 additions & 11 deletions generator/ServiceModels/sagemaker/sagemaker-2017-07-24.api.json
Original file line number Diff line number Diff line change
Expand Up @@ -5998,7 +5998,8 @@
"ExecutionRole":{"shape":"RoleArn"},
"ThreadsPerCore":{"shape":"ClusterThreadsPerCore"},
"InstanceStorageConfigs":{"shape":"ClusterInstanceStorageConfigs"},
"OnStartDeepHealthChecks":{"shape":"OnStartDeepHealthChecks"}
"OnStartDeepHealthChecks":{"shape":"OnStartDeepHealthChecks"},
"OverrideVpcConfig":{"shape":"VpcConfig"}
}
},
"ClusterInstanceGroupDetailsList":{
Expand Down Expand Up @@ -6028,7 +6029,8 @@
"ExecutionRole":{"shape":"RoleArn"},
"ThreadsPerCore":{"shape":"ClusterThreadsPerCore"},
"InstanceStorageConfigs":{"shape":"ClusterInstanceStorageConfigs"},
"OnStartDeepHealthChecks":{"shape":"OnStartDeepHealthChecks"}
"OnStartDeepHealthChecks":{"shape":"OnStartDeepHealthChecks"},
"OverrideVpcConfig":{"shape":"VpcConfig"}
}
},
"ClusterInstanceGroupSpecifications":{
Expand Down Expand Up @@ -6174,6 +6176,7 @@
"InstanceType":{"shape":"ClusterInstanceType"},
"LaunchTime":{"shape":"Timestamp"},
"LifeCycleConfig":{"shape":"ClusterLifeCycleConfig"},
"OverrideVpcConfig":{"shape":"VpcConfig"},
"ThreadsPerCore":{"shape":"ClusterThreadsPerCore"},
"InstanceStorageConfigs":{"shape":"ClusterInstanceStorageConfigs"},
"PrivatePrimaryIp":{"shape":"ClusterPrivatePrimaryIp"},
Expand Down Expand Up @@ -7382,9 +7385,7 @@
"required":[
"InferenceComponentName",
"EndpointName",
"VariantName",
"Specification",
"RuntimeConfig"
"Specification"
],
"members":{
"InferenceComponentName":{"shape":"InferenceComponentName"},
Expand Down Expand Up @@ -13726,7 +13727,7 @@
"ImageVersionArn":{
"type":"string",
"max":256,
"pattern":"^arn:aws(-[\\w]+)*:sagemaker:.+:[0-9]{12}:image-version/[a-z0-9]([-.]?[a-z0-9])*/[0-9]+$"
"pattern":"^(arn:aws(-[\\w]+)*:sagemaker:.+:[0-9]{12}:image-version/[a-z0-9]([-.]?[a-z0-9])*/[0-9]+|None)$"
},
"ImageVersionNumber":{
"type":"integer",
Expand Down Expand Up @@ -13868,12 +13869,12 @@
},
"InferenceComponentSpecification":{
"type":"structure",
"required":["ComputeResourceRequirements"],
"members":{
"ModelName":{"shape":"ModelName"},
"Container":{"shape":"InferenceComponentContainerSpecification"},
"StartupParameters":{"shape":"InferenceComponentStartupParameters"},
"ComputeResourceRequirements":{"shape":"InferenceComponentComputeResourceRequirements"}
"ComputeResourceRequirements":{"shape":"InferenceComponentComputeResourceRequirements"},
"BaseInferenceComponentName":{"shape":"InferenceComponentName"}
}
},
"InferenceComponentSpecificationSummary":{
Expand All @@ -13882,7 +13883,8 @@
"ModelName":{"shape":"ModelName"},
"Container":{"shape":"InferenceComponentContainerSpecificationSummary"},
"StartupParameters":{"shape":"InferenceComponentStartupParameters"},
"ComputeResourceRequirements":{"shape":"InferenceComponentComputeResourceRequirements"}
"ComputeResourceRequirements":{"shape":"InferenceComponentComputeResourceRequirements"},
"BaseInferenceComponentName":{"shape":"InferenceComponentName"}
}
},
"InferenceComponentStartupParameters":{
Expand Down Expand Up @@ -16627,7 +16629,7 @@
},
"ManagedInstanceScalingMinInstanceCount":{
"type":"integer",
"min":1
"min":0
},
"ManagedInstanceScalingStatus":{
"type":"string",
Expand Down Expand Up @@ -17745,6 +17747,13 @@
"type":"integer",
"min":0
},
"ModelShardingConfig":{
"type":"structure",
"members":{
"Image":{"shape":"OptimizationContainerImage"},
"OverrideEnvironment":{"shape":"OptimizationJobEnvironmentVariables"}
}
},
"ModelSortKey":{
"type":"string",
"enum":[
Expand Down Expand Up @@ -18693,7 +18702,8 @@
"type":"structure",
"members":{
"ModelQuantizationConfig":{"shape":"ModelQuantizationConfig"},
"ModelCompilationConfig":{"shape":"ModelCompilationConfig"}
"ModelCompilationConfig":{"shape":"ModelCompilationConfig"},
"ModelShardingConfig":{"shape":"ModelShardingConfig"}
},
"union":true
},
Expand Down Expand Up @@ -19774,6 +19784,24 @@
"ml.c6i.16xlarge",
"ml.c6i.24xlarge",
"ml.c6i.32xlarge",
"ml.m6i.large",
"ml.m6i.xlarge",
"ml.m6i.2xlarge",
"ml.m6i.4xlarge",
"ml.m6i.8xlarge",
"ml.m6i.12xlarge",
"ml.m6i.16xlarge",
"ml.m6i.24xlarge",
"ml.m6i.32xlarge",
"ml.r6i.large",
"ml.r6i.xlarge",
"ml.r6i.2xlarge",
"ml.r6i.4xlarge",
"ml.r6i.8xlarge",
"ml.r6i.12xlarge",
"ml.r6i.16xlarge",
"ml.r6i.24xlarge",
"ml.r6i.32xlarge",
"ml.g5.xlarge",
"ml.g5.2xlarge",
"ml.g5.4xlarge",
Expand All @@ -19790,6 +19818,14 @@
"ml.g6.16xlarge",
"ml.g6.24xlarge",
"ml.g6.48xlarge",
"ml.g6e.xlarge",
"ml.g6e.2xlarge",
"ml.g6e.4xlarge",
"ml.g6e.8xlarge",
"ml.g6e.12xlarge",
"ml.g6e.16xlarge",
"ml.g6e.24xlarge",
"ml.g6e.48xlarge",
"ml.p4d.24xlarge",
"ml.c7g.large",
"ml.c7g.xlarge",
Expand Down Expand Up @@ -19851,11 +19887,13 @@
"ml.trn1.2xlarge",
"ml.trn1.32xlarge",
"ml.trn1n.32xlarge",
"ml.trn2.48xlarge",
"ml.inf2.xlarge",
"ml.inf2.8xlarge",
"ml.inf2.24xlarge",
"ml.inf2.48xlarge",
"ml.p5.48xlarge",
"ml.p5e.48xlarge",
"ml.m7i.large",
"ml.m7i.xlarge",
"ml.m7i.2xlarge",
Expand Down
21 changes: 17 additions & 4 deletions generator/ServiceModels/sagemaker/sagemaker-2017-07-24.docs.json
Original file line number Diff line number Diff line change
Expand Up @@ -7655,10 +7655,10 @@
}
},
"InferenceComponentComputeResourceRequirements": {
"base": "<p>Defines the compute resources to allocate to run a model that you assign to an inference component. These resources include CPU cores, accelerators, and memory.</p>",
"base": "<p>Defines the compute resources to allocate to run a model, plus any adapter models, that you assign to an inference component. These resources include CPU cores, accelerators, and memory.</p>",
"refs": {
"InferenceComponentSpecification$ComputeResourceRequirements": "<p>The compute resources allocated to run the model assigned to the inference component.</p>",
"InferenceComponentSpecificationSummary$ComputeResourceRequirements": "<p>The compute resources allocated to run the model assigned to the inference component.</p>"
"InferenceComponentSpecification$ComputeResourceRequirements": "<p>The compute resources allocated to run the model, plus any adapter models, that you assign to the inference component.</p> <p>Omit this parameter if your request is meant to create an adapter inference component. An adapter inference component is loaded by a base inference component, and it uses the compute resources of the base inference component.</p>",
"InferenceComponentSpecificationSummary$ComputeResourceRequirements": "<p>The compute resources allocated to run the model, plus any adapter models, that you assign to the inference component.</p>"
}
},
"InferenceComponentContainerSpecification": {
Expand Down Expand Up @@ -7688,6 +7688,8 @@
"DeleteInferenceComponentInput$InferenceComponentName": "<p>The name of the inference component to delete.</p>",
"DescribeInferenceComponentInput$InferenceComponentName": "<p>The name of the inference component.</p>",
"DescribeInferenceComponentOutput$InferenceComponentName": "<p>The name of the inference component.</p>",
"InferenceComponentSpecification$BaseInferenceComponentName": "<p>The name of an existing inference component that is to contain the inference component that you're creating with your request.</p> <p>Specify this parameter only if your request is meant to create an adapter inference component. An adapter inference component contains the path to an adapter model. The purpose of the adapter model is to tailor the inference output of a base foundation model, which is hosted by the base inference component. The adapter inference component uses the compute resources that you assigned to the base inference component.</p> <p>When you create an adapter inference component, use the <code>Container</code> parameter to specify the location of the adapter artifacts. In the parameter value, use the <code>ArtifactUrl</code> parameter of the <code>InferenceComponentContainerSpecification</code> data type.</p> <p>Before you can create an adapter inference component, you must have an existing inference component that contains the foundation model that you want to adapt.</p>",
"InferenceComponentSpecificationSummary$BaseInferenceComponentName": "<p>The name of the base inference component that contains this inference component.</p>",
"InferenceComponentSummary$InferenceComponentName": "<p>The name of the inference component.</p>",
"UpdateInferenceComponentInput$InferenceComponentName": "<p>The name of the inference component.</p>",
"UpdateInferenceComponentRuntimeConfigInput$InferenceComponentName": "<p>The name of the inference component to update.</p>"
Expand Down Expand Up @@ -10613,6 +10615,12 @@
"RecommendationMetrics$ModelSetupTime": "<p>The time it takes to launch new compute resources for a serverless endpoint. The time can vary depending on the model size, how long it takes to download the model, and the start-up time of the container.</p> <p> <code>NaN</code> indicates that the value is not available.</p>"
}
},
"ModelShardingConfig": {
"base": "<p>Settings for the model sharding technique that's applied by a model optimization job.</p>",
"refs": {
"OptimizationConfig$ModelShardingConfig": "<p>Settings for the model sharding technique that's applied by a model optimization job.</p>"
}
},
"ModelSortKey": {
"base": null,
"refs": {
Expand Down Expand Up @@ -11765,6 +11773,7 @@
"refs": {
"ModelCompilationConfig$Image": "<p>The URI of an LMI DLC in Amazon ECR. SageMaker uses this image to run the optimization.</p>",
"ModelQuantizationConfig$Image": "<p>The URI of an LMI DLC in Amazon ECR. SageMaker uses this image to run the optimization.</p>",
"ModelShardingConfig$Image": "<p>The URI of an LMI DLC in Amazon ECR. SageMaker uses this image to run the optimization.</p>",
"OptimizationOutput$RecommendedInferenceImage": "<p>The image that SageMaker recommends that you use to host the optimized model that you created with an optimization job.</p>"
}
},
Expand All @@ -11790,7 +11799,8 @@
"CreateOptimizationJobRequest$OptimizationEnvironment": "<p>The environment variables to set in the model container.</p>",
"DescribeOptimizationJobResponse$OptimizationEnvironment": "<p>The environment variables to set in the model container.</p>",
"ModelCompilationConfig$OverrideEnvironment": "<p>Environment variables that override the default ones in the model container.</p>",
"ModelQuantizationConfig$OverrideEnvironment": "<p>Environment variables that override the default ones in the model container.</p>"
"ModelQuantizationConfig$OverrideEnvironment": "<p>Environment variables that override the default ones in the model container.</p>",
"ModelShardingConfig$OverrideEnvironment": "<p>Environment variables that override the default ones in the model container.</p>"
}
},
"OptimizationJobModelSource": {
Expand Down Expand Up @@ -17339,6 +17349,9 @@
"base": "<p>Specifies an Amazon Virtual Private Cloud (VPC) that your SageMaker jobs, hosted models, and compute resources have access to. You can control access to and from your resources by configuring a VPC. For more information, see <a href=\"https://docs.aws.amazon.com/sagemaker/latest/dg/infrastructure-give-access.html\">Give SageMaker Access to Resources in your Amazon VPC</a>. </p>",
"refs": {
"AutoMLSecurityConfig$VpcConfig": "<p>The VPC configuration.</p>",
"ClusterInstanceGroupDetails$OverrideVpcConfig": null,
"ClusterInstanceGroupSpecification$OverrideVpcConfig": null,
"ClusterNodeDetails$OverrideVpcConfig": null,
"CreateClusterRequest$VpcConfig": null,
"CreateEndpointConfigInput$VpcConfig": null,
"CreateModelInput$VpcConfig": "<p>A <a href=\"https://docs.aws.amazon.com/sagemaker/latest/APIReference/API_VpcConfig.html\">VpcConfig</a> object that specifies the VPC that you want your model to connect to. Control access to and from your model container by configuring the VPC. <code>VpcConfig</code> is used in hosting services and in batch transform. For more information, see <a href=\"https://docs.aws.amazon.com/sagemaker/latest/dg/host-vpc.html\">Protect Endpoints by Using an Amazon Virtual Private Cloud</a> and <a href=\"https://docs.aws.amazon.com/sagemaker/latest/dg/batch-vpc.html\">Protect Data in Batch Transform Jobs by Using an Amazon Virtual Private Cloud</a>.</p>",
Expand Down
Loading

0 comments on commit 649eebe

Please sign in to comment.