Skip to content

Commit

Permalink
Emit events for resize error states
Browse files Browse the repository at this point in the history
  • Loading branch information
tallclair committed Nov 9, 2024
1 parent 61e6242 commit 3a2555e
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 12 deletions.
2 changes: 2 additions & 0 deletions pkg/kubelet/events/event.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ const (
FailedToCreatePodContainer = "FailedCreatePodContainer"
FailedToMakePodDataDirectories = "Failed"
NetworkNotReady = "NetworkNotReady"
ResizeDeferred = "ResizeDeferred"
ResizeInfeasible = "ResizeInfeasible"
)

// Image event reason list
Expand Down
42 changes: 30 additions & 12 deletions pkg/kubelet/kubelet.go
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,7 @@ import (
kubetypes "k8s.io/kubernetes/pkg/kubelet/types"
"k8s.io/kubernetes/pkg/kubelet/userns"
"k8s.io/kubernetes/pkg/kubelet/util"
"k8s.io/kubernetes/pkg/kubelet/util/format"
"k8s.io/kubernetes/pkg/kubelet/util/manager"
"k8s.io/kubernetes/pkg/kubelet/util/queue"
"k8s.io/kubernetes/pkg/kubelet/util/sliceutils"
Expand Down Expand Up @@ -2829,38 +2830,47 @@ func isPodResizeInProgress(pod *v1.Pod, podStatus *kubecontainer.PodStatus) bool
// canResizePod determines if the requested resize is currently feasible.
// pod should hold the desired (pre-allocated) spec.
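// Returns true if the resize can proceed, together with the pod resize status
// to record and, when the resize cannot proceed, a message explaining why.
// Both the status and the message are empty if the node cannot be retrieved.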
func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, v1.PodResizeStatus) {
func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, v1.PodResizeStatus, string) {
if goos == "windows" {
return false, v1.PodResizeStatusInfeasible
return false, v1.PodResizeStatusInfeasible, "Resizing Windows pods is not supported"
}

if v1qos.GetPodQOS(pod) == v1.PodQOSGuaranteed && !utilfeature.DefaultFeatureGate.Enabled(features.InPlacePodVerticalScalingExclusiveCPUs) {
if utilfeature.DefaultFeatureGate.Enabled(features.CPUManager) {
if kl.containerManager.GetNodeConfig().CPUManagerPolicy == "static" {
klog.V(3).InfoS("Resize is infeasible for Guaranteed Pods alongside CPU Manager static policy")
return false, v1.PodResizeStatusInfeasible
msg := "Resize is infeasible for Guaranteed Pods alongside CPU Manager static policy"
klog.V(3).InfoS(msg, "pod", format.Pod(pod))
return false, v1.PodResizeStatusInfeasible, msg
}
}
if utilfeature.DefaultFeatureGate.Enabled(features.MemoryManager) {
if kl.containerManager.GetNodeConfig().ExperimentalMemoryManagerPolicy == "static" {
klog.V(3).InfoS("Resize is infeasible for Guaranteed Pods alongside Memory Manager static policy")
return false, v1.PodResizeStatusInfeasible
msg := "Resize is infeasible for Guaranteed Pods alongside Memory Manager static policy"
klog.V(3).InfoS(msg, "pod", format.Pod(pod))
return false, v1.PodResizeStatusInfeasible, msg
}
}
}

node, err := kl.getNodeAnyWay()
if err != nil {
klog.ErrorS(err, "getNodeAnyway function failed")
return false, ""
return false, "", ""
}
cpuAvailable := node.Status.Allocatable.Cpu().MilliValue()
memAvailable := node.Status.Allocatable.Memory().Value()
cpuRequests := resource.GetResourceRequest(pod, v1.ResourceCPU)
memRequests := resource.GetResourceRequest(pod, v1.ResourceMemory)
if cpuRequests > cpuAvailable || memRequests > memAvailable {
klog.V(3).InfoS("Resize is not feasible as request exceeds allocatable node resources", "pod", klog.KObj(pod))
return false, v1.PodResizeStatusInfeasible
var msg string
if memRequests > memAvailable {
msg = fmt.Sprintf("memory, requested: %d, capacity: %d", memRequests, memAvailable)
} else {
msg = fmt.Sprintf("cpu, requested: %d, capacity: %d", cpuRequests, cpuAvailable)
}
msg = "Node didn't have enough capacity: " + msg
klog.V(3).InfoS(msg, "pod", klog.KObj(pod))
return false, v1.PodResizeStatusInfeasible, msg
}

// Treat the existing pod needing resize as a new pod with desired resources seeking admit.
Expand All @@ -2871,10 +2881,10 @@ func (kl *Kubelet) canResizePod(pod *v1.Pod) (bool, v1.PodResizeStatus) {
if ok, failReason, failMessage := kl.canAdmitPod(allocatedPods, pod); !ok {
// Log reason and return. Let the next sync iteration retry the resize
klog.V(3).InfoS("Resize cannot be accommodated", "pod", klog.KObj(pod), "reason", failReason, "message", failMessage)
return false, v1.PodResizeStatusDeferred
return false, v1.PodResizeStatusDeferred, failMessage
}

return true, v1.PodResizeStatusInProgress
return true, v1.PodResizeStatusInProgress, ""
}

// handlePodResourcesResize returns the "allocated pod", which should be used for all resource
Expand All @@ -2899,7 +2909,7 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine
kl.podResizeMutex.Lock()
defer kl.podResizeMutex.Unlock()
// Desired resources != allocated resources. Can we update the allocation to the desired resources?
fit, resizeStatus := kl.canResizePod(pod)
fit, resizeStatus, resizeMsg := kl.canResizePod(pod)
if fit {
// Update pod resource allocation checkpoint
if err := kl.statusManager.SetPodAllocation(pod); err != nil {
Expand All @@ -2925,6 +2935,14 @@ func (kl *Kubelet) handlePodResourcesResize(pod *v1.Pod, podStatus *kubecontaine
}
if resizeStatus != "" {
kl.statusManager.SetPodResizeStatus(pod.UID, resizeStatus)
if resizeMsg != "" {
switch resizeStatus {
case v1.PodResizeStatusDeferred:
kl.recorder.Eventf(pod, v1.EventTypeWarning, events.ResizeDeferred, resizeMsg)
case v1.PodResizeStatusInfeasible:
kl.recorder.Eventf(pod, v1.EventTypeWarning, events.ResizeInfeasible, resizeMsg)
}
}
}
return allocatedPod, nil
}
Expand Down

0 comments on commit 3a2555e

Please sign in to comment.