Skip to content

Commit

Permalink
Revert "[FG:InPlacePodVerticalScaling] kubelet: Propagate error in do…
Browse files Browse the repository at this point in the history
…PodResizeAction() to the caller"
  • Loading branch information
tallclair authored Nov 8, 2024
1 parent 6db9477 commit bf8354d
Show file tree
Hide file tree
Showing 5 changed files with 32 additions and 254 deletions.
12 changes: 0 additions & 12 deletions pkg/kubelet/container/sync_result.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,12 +45,6 @@ var (
ErrConfigPodSandbox = errors.New("ConfigPodSandboxError")
// ErrKillPodSandbox returned when runtime failed to stop pod's sandbox.
ErrKillPodSandbox = errors.New("KillPodSandboxError")
// ErrUpdatePodSandbox returned when runtime failed to update the pod's sandbox config.
ErrUpdatePodSandbox = errors.New("UpdatePodSandboxError")
// ErrUpdateContainerMemory returned when runtime failed to update the pod's container config.
ErrUpdateContainerMemory = errors.New("UpdateContainerMemoryError")
// ErrUpdateContainerCPU returned when runtime failed to update the pod's container config.
ErrUpdateContainerCPU = errors.New("UpdateContainerCPUError")
)

// SyncAction indicates different kind of actions in SyncPod() and KillPod(). Now there are only actions
Expand All @@ -74,12 +68,6 @@ const (
ConfigPodSandbox SyncAction = "ConfigPodSandbox"
// KillPodSandbox action
KillPodSandbox SyncAction = "KillPodSandbox"
// UpdatePodSandbox action
UpdatePodSandbox SyncAction = "UpdatePodSandbox"
// UpdateContainerMemory action
UpdateContainerMemory SyncAction = "UpdateContainerMemory"
// UpdateContainerCPU action
UpdateContainerCPU SyncAction = "UpdateContainerCPU"
)

// SyncResult is the result of sync action.
Expand Down
3 changes: 2 additions & 1 deletion pkg/kubelet/kuberuntime/kuberuntime_container.go
Original file line number Diff line number Diff line change
Expand Up @@ -392,11 +392,12 @@ func (m *kubeGenericRuntimeManager) generateContainerConfig(ctx context.Context,
return config, cleanupAction, nil
}

func (m *kubeGenericRuntimeManager) updateContainerResources(ctx context.Context, pod *v1.Pod, container *v1.Container, containerID kubecontainer.ContainerID) error {
func (m *kubeGenericRuntimeManager) updateContainerResources(pod *v1.Pod, container *v1.Container, containerID kubecontainer.ContainerID) error {
containerResources := m.generateContainerResources(pod, container)
if containerResources == nil {
return fmt.Errorf("container %q updateContainerResources failed: cannot generate resources config", containerID.String())
}
ctx := context.Background()
err := m.runtimeService.UpdateContainerResources(ctx, containerID.ID, containerResources)
if err != nil {
klog.ErrorS(err, "UpdateContainerResources failed", "container", containerID.String())
Expand Down
2 changes: 1 addition & 1 deletion pkg/kubelet/kuberuntime/kuberuntime_container_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -963,7 +963,7 @@ func TestUpdateContainerResources(t *testing.T) {
assert.NoError(t, err)
containerID := cStatus[0].ID

err = m.updateContainerResources(ctx, pod, &pod.Spec.Containers[0], containerID)
err = m.updateContainerResources(pod, &pod.Spec.Containers[0], containerID)
assert.NoError(t, err)

// Verify container is updated
Expand Down
64 changes: 27 additions & 37 deletions pkg/kubelet/kuberuntime/kuberuntime_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -666,14 +666,13 @@ func (m *kubeGenericRuntimeManager) computePodResizeAction(pod *v1.Pod, containe
return true
}

func (m *kubeGenericRuntimeManager) doPodResizeAction(ctx context.Context, pod *v1.Pod, podStatus *kubecontainer.PodStatus, podContainerChanges podActions) (result kubecontainer.PodSyncResult) {
updatePodResult := kubecontainer.NewSyncResult(kubecontainer.UpdatePodSandbox, pod.UID)
result.AddSyncResult(updatePodResult)
func (m *kubeGenericRuntimeManager) doPodResizeAction(pod *v1.Pod, podStatus *kubecontainer.PodStatus, podContainerChanges podActions, result kubecontainer.PodSyncResult) {
pcm := m.containerManager.NewPodContainerManager()
//TODO(vinaykul,InPlacePodVerticalScaling): Figure out best way to get enforceMemoryQoS value (parameter #4 below) in platform-agnostic way
podResources := cm.ResourceConfigForPod(pod, m.cpuCFSQuota, uint64((m.cpuCFSQuotaPeriod.Duration)/time.Microsecond), false)
if podResources == nil {
result.Fail(fmt.Errorf("unable to get resource configuration processing resize for pod %s", format.Pod(pod)))
klog.ErrorS(nil, "Unable to get resource configuration", "pod", pod.Name)
result.Fail(fmt.Errorf("Unable to get resource configuration processing resize for pod %s", pod.Name))
return
}
setPodCgroupConfig := func(rName v1.ResourceName, setLimitValue bool) error {
Expand All @@ -691,7 +690,10 @@ func (m *kubeGenericRuntimeManager) doPodResizeAction(ctx context.Context, pod *
case v1.ResourceMemory:
err = pcm.SetPodCgroupConfig(pod, podResources)
}
return fmt.Errorf("failed to set cgroup config for %s of pod %s: %w", rName, format.Pod(pod), err)
if err != nil {
klog.ErrorS(err, "Failed to set cgroup config", "resource", rName, "pod", pod.Name)
}
return err
}
// Memory and CPU are updated separately because memory resizes may be ordered differently than CPU resizes.
// If resize results in net pod resource increase, set pod cgroup config before resizing containers.
Expand All @@ -711,12 +713,9 @@ func (m *kubeGenericRuntimeManager) doPodResizeAction(ctx context.Context, pod *
}
}
if len(podContainerChanges.ContainersToUpdate[rName]) > 0 {
updateContainerResults, errUpdate := m.updatePodContainerResources(ctx, pod, rName, podContainerChanges.ContainersToUpdate[rName])
for _, containerResult := range updateContainerResults {
result.AddSyncResult(containerResult)
}
if errUpdate != nil {
return errUpdate
if err = m.updatePodContainerResources(pod, rName, podContainerChanges.ContainersToUpdate[rName]); err != nil {
klog.ErrorS(err, "updatePodContainerResources failed", "pod", format.Pod(pod), "resource", rName)
return err
}
}
// At downsizing, requests should shrink prior to limits in order to keep "requests <= limits".
Expand All @@ -734,21 +733,25 @@ func (m *kubeGenericRuntimeManager) doPodResizeAction(ctx context.Context, pod *
}
if len(podContainerChanges.ContainersToUpdate[v1.ResourceMemory]) > 0 || podContainerChanges.UpdatePodResources {
if podResources.Memory == nil {
result.Fail(fmt.Errorf("podResources.Memory is nil for pod %s", format.Pod(pod)))
klog.ErrorS(nil, "podResources.Memory is nil", "pod", pod.Name)
result.Fail(fmt.Errorf("podResources.Memory is nil for pod %s", pod.Name))
return
}
currentPodMemoryConfig, err := pcm.GetPodCgroupConfig(pod, v1.ResourceMemory)
if err != nil {
result.Fail(fmt.Errorf("GetPodCgroupConfig for memory failed for pod %s: %w", format.Pod(pod), err))
klog.ErrorS(err, "GetPodCgroupConfig for memory failed", "pod", pod.Name)
result.Fail(err)
return
}
currentPodMemoryUsage, err := pcm.GetPodCgroupMemoryUsage(pod)
if err != nil {
result.Fail(fmt.Errorf("GetPodCgroupMemoryUsage failed for pod %s", format.Pod(pod)))
klog.ErrorS(err, "GetPodCgroupMemoryUsage failed", "pod", pod.Name)
result.Fail(err)
return
}
if currentPodMemoryUsage >= uint64(*podResources.Memory) {
result.Fail(fmt.Errorf("aborting attempt to set pod memory limit less than current memory usage for pod %s", format.Pod(pod)))
klog.ErrorS(nil, "Aborting attempt to set pod memory limit less than current memory usage", "pod", pod.Name)
result.Fail(fmt.Errorf("Aborting attempt to set pod memory limit less than current memory usage for pod %s", pod.Name))
return
}
if errResize := resizeContainers(v1.ResourceMemory, int64(*currentPodMemoryConfig.Memory), *podResources.Memory, 0, 0); errResize != nil {
Expand All @@ -758,12 +761,14 @@ func (m *kubeGenericRuntimeManager) doPodResizeAction(ctx context.Context, pod *
}
if len(podContainerChanges.ContainersToUpdate[v1.ResourceCPU]) > 0 || podContainerChanges.UpdatePodResources {
if podResources.CPUQuota == nil || podResources.CPUShares == nil {
result.Fail(fmt.Errorf("podResources.CPUQuota or podResources.CPUShares is nil for pod %s", format.Pod(pod)))
klog.ErrorS(nil, "podResources.CPUQuota or podResources.CPUShares is nil", "pod", pod.Name)
result.Fail(fmt.Errorf("podResources.CPUQuota or podResources.CPUShares is nil for pod %s", pod.Name))
return
}
currentPodCpuConfig, err := pcm.GetPodCgroupConfig(pod, v1.ResourceCPU)
if err != nil {
result.Fail(fmt.Errorf("GetPodCgroupConfig for CPU failed for pod %s", format.Pod(pod)))
klog.ErrorS(err, "GetPodCgroupConfig for CPU failed", "pod", pod.Name)
result.Fail(err)
return
}
if errResize := resizeContainers(v1.ResourceCPU, *currentPodCpuConfig.CPUQuota, *podResources.CPUQuota,
Expand All @@ -772,20 +777,17 @@ func (m *kubeGenericRuntimeManager) doPodResizeAction(ctx context.Context, pod *
return
}
}
return
}

func (m *kubeGenericRuntimeManager) updatePodContainerResources(ctx context.Context, pod *v1.Pod, resourceName v1.ResourceName, containersToUpdate []containerToUpdateInfo) (updateResults []*kubecontainer.SyncResult, err error) {
func (m *kubeGenericRuntimeManager) updatePodContainerResources(pod *v1.Pod, resourceName v1.ResourceName, containersToUpdate []containerToUpdateInfo) error {
klog.V(5).InfoS("Updating container resources", "pod", klog.KObj(pod))

for _, cInfo := range containersToUpdate {
var updateContainerResult *kubecontainer.SyncResult
container := pod.Spec.Containers[cInfo.apiContainerIdx].DeepCopy()
// If updating memory limit, use most recently configured CPU request and limit values.
// If updating CPU request and limit, use most recently configured memory request and limit values.
switch resourceName {
case v1.ResourceMemory:
updateContainerResult = kubecontainer.NewSyncResult(kubecontainer.UpdateContainerMemory, container.Name)
container.Resources.Limits = v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(cInfo.currentContainerResources.cpuLimit, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(cInfo.desiredContainerResources.memoryLimit, resource.BinarySI),
Expand All @@ -795,7 +797,6 @@ func (m *kubeGenericRuntimeManager) updatePodContainerResources(ctx context.Cont
v1.ResourceMemory: *resource.NewQuantity(cInfo.desiredContainerResources.memoryRequest, resource.BinarySI),
}
case v1.ResourceCPU:
updateContainerResult = kubecontainer.NewSyncResult(kubecontainer.UpdateContainerCPU, container.Name)
container.Resources.Limits = v1.ResourceList{
v1.ResourceCPU: *resource.NewMilliQuantity(cInfo.desiredContainerResources.cpuLimit, resource.DecimalSI),
v1.ResourceMemory: *resource.NewQuantity(cInfo.currentContainerResources.memoryLimit, resource.BinarySI),
Expand All @@ -805,19 +806,12 @@ func (m *kubeGenericRuntimeManager) updatePodContainerResources(ctx context.Cont
v1.ResourceMemory: *resource.NewQuantity(cInfo.currentContainerResources.memoryRequest, resource.BinarySI),
}
}
updateResults = append(updateResults, updateContainerResult)
if err = m.updateContainerResources(ctx, pod, container, cInfo.kubeContainerID); err != nil {
if err := m.updateContainerResources(pod, container, cInfo.kubeContainerID); err != nil {
// Log error and abort as container updates need to succeed in the order determined by computePodResizeAction.
// The recovery path is for SyncPod to keep retrying at later times until it succeeds.
klog.ErrorS(err, "updateContainerResources failed", "container", container.Name, "cID", cInfo.kubeContainerID,
"pod", format.Pod(pod), "resourceName", resourceName)
switch resourceName {
case v1.ResourceMemory:
updateContainerResult.Fail(kubecontainer.ErrUpdateContainerMemory, err.Error())
case v1.ResourceCPU:
updateContainerResult.Fail(kubecontainer.ErrUpdateContainerCPU, err.Error())
}
return
return err
}
resizeKey := fmt.Sprintf("%s:resize:%s", container.Name, resourceName)

Expand Down Expand Up @@ -857,7 +851,7 @@ func (m *kubeGenericRuntimeManager) updatePodContainerResources(ctx context.Cont
cInfo.currentContainerResources.cpuRequest = cInfo.desiredContainerResources.cpuRequest
}
}
return
return nil
}

// computePodActions checks whether the pod spec has changed and returns the changes if true.
Expand Down Expand Up @@ -1357,11 +1351,7 @@ func (m *kubeGenericRuntimeManager) SyncPod(ctx context.Context, pod *v1.Pod, po
// Step 7: For containers in podContainerChanges.ContainersToUpdate[CPU,Memory] list, invoke UpdateContainerResources
if IsInPlacePodVerticalScalingAllowed(pod) {
if len(podContainerChanges.ContainersToUpdate) > 0 || podContainerChanges.UpdatePodResources {
resizeResult := m.doPodResizeAction(ctx, pod, podStatus, podContainerChanges)
result.AddPodSyncResult(resizeResult)
if resizeResult.Error() != nil {
klog.ErrorS(resizeResult.Error(), "doPodResizeAction failed")
}
m.doPodResizeAction(pod, podStatus, podContainerChanges, result)
}
}

Expand Down
Loading

0 comments on commit bf8354d

Please sign in to comment.