-
Notifications
You must be signed in to change notification settings - Fork 1.6k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Cleaner kubectl port-forward
retry logic
#2593
Changes from 9 commits
60b1bde
861ccf4
869eef0
8935ad7
0b3c285
6bb2078
60f4c15
cfdcb3c
fde0886
1aebb92
392fc97
1543e10
7a990b5
76a8616
c145f2b
becb9c5
1d5d3b6
e7653cb
f2defe8
0e69e4c
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -20,79 +20,99 @@ import ( | |
"bytes" | ||
"context" | ||
"fmt" | ||
"net" | ||
"io" | ||
"os/exec" | ||
"strings" | ||
"time" | ||
|
||
"github.com/pkg/errors" | ||
"github.com/GoogleContainerTools/skaffold/pkg/skaffold/color" | ||
|
||
"github.com/GoogleContainerTools/skaffold/pkg/skaffold/util" | ||
|
||
"github.com/sirupsen/logrus" | ||
"k8s.io/apimachinery/pkg/util/wait" | ||
|
||
"github.com/GoogleContainerTools/skaffold/pkg/skaffold/kubectl" | ||
"github.com/GoogleContainerTools/skaffold/pkg/skaffold/util" | ||
) | ||
|
||
type EntryForwarder interface { | ||
Forward(parentCtx context.Context, pfe *portForwardEntry) error | ||
Forward(parentCtx context.Context, pfe *portForwardEntry) | ||
Terminate(p *portForwardEntry) | ||
Monitor(*portForwardEntry, func()) | ||
} | ||
|
||
type KubectlForwarder struct { | ||
kubectl *kubectl.CLI | ||
out io.Writer | ||
} | ||
|
||
// Forward port-forwards a pod using kubectl port-forward | ||
// It returns an error only if the process fails or was terminated by a signal other than SIGTERM | ||
func (k *KubectlForwarder) Forward(parentCtx context.Context, pfe *portForwardEntry) error { | ||
ctx, cancel := context.WithCancel(parentCtx) | ||
// when retrying a portforwarding entry, it might already have a context running | ||
if pfe.cancel != nil { | ||
pfe.cancel() | ||
} | ||
pfe.cancel = cancel | ||
|
||
cmd := k.kubectl.Command(ctx, | ||
"port-forward", | ||
"--pod-running-timeout", "5s", | ||
fmt.Sprintf("%s/%s", pfe.resource.Type, pfe.resource.Name), | ||
fmt.Sprintf("%d:%d", pfe.localPort, pfe.resource.Port), | ||
"--namespace", pfe.resource.Namespace, | ||
) | ||
pfe.logBuffer = &bytes.Buffer{} | ||
cmd.Stdout = pfe.logBuffer | ||
cmd.Stderr = pfe.logBuffer | ||
|
||
if err := cmd.Start(); err != nil { | ||
if errors.Cause(err) == context.Canceled { | ||
return nil | ||
// Forward port-forwards a pod using kubectl port-forward in the background | ||
// It kills the command on errors in the kubectl port-forward log | ||
// It restarts the command if it was not cancelled by skaffold | ||
// It retries in case the port is taken | ||
func (k *KubectlForwarder) Forward(parentCtx context.Context, pfe *portForwardEntry) { | ||
go k.forward(parentCtx, pfe) | ||
} | ||
|
||
func (k *KubectlForwarder) forward(parentCtx context.Context, pfe *portForwardEntry) { | ||
var notifiedUser bool | ||
for { | ||
if parentCtx.Err() == context.Canceled { | ||
logrus.Debugf("port forwarding %v cancelled...", pfe) | ||
return | ||
} | ||
if !util.IsPortFree(pfe.localPort) { | ||
//assuming that Skaffold brokered ports don't overlap, this has to be an external process that started | ||
//since the dev loop kicked off. We are notifying the user in the hope that they can fix it | ||
color.Red.Fprintf(k.out, "failed to port forward %v, port %d is taken, retrying...\n", pfe, pfe.localPort) | ||
notifiedUser = true | ||
time.Sleep(5 * time.Second) | ||
continue | ||
} | ||
return errors.Wrapf(err, "port forwarding %s/%s, port: %d to local port: %d, err: %s", pfe.resource.Type, pfe.resource.Name, pfe.resource.Port, pfe.localPort, pfe.logBuffer.String()) | ||
} | ||
|
||
resultChan := make(chan error, 1) | ||
go func() { | ||
err := cmd.Wait() | ||
if err != nil { | ||
logrus.Debugf("port forwarding %v terminated: %s, output: %s", pfe, err, pfe.logBuffer.String()) | ||
resultChan <- err | ||
if notifiedUser { | ||
color.Green.Fprintf(k.out, "port forwarding %v recovered on port %d\n", pfe, pfe.localPort) | ||
notifiedUser = false | ||
} | ||
}() | ||
|
||
go func() { | ||
err := wait.PollImmediate(200*time.Millisecond, 5*time.Second, func() (bool, error) { | ||
// creating a listening port should not succeed | ||
if ln, err := net.Listen("tcp", fmt.Sprintf("%s:%d", util.Loopback, pfe.localPort)); err == nil { | ||
ln.Close() | ||
return false, nil | ||
|
||
ctx, cancel := context.WithCancel(parentCtx) | ||
// when retrying a portforwarding entry, it might already have a context running | ||
if pfe.cancel != nil { | ||
pfe.cancel() | ||
} | ||
pfe.cancel = cancel | ||
cmd := k.kubectl.Command(ctx, | ||
"port-forward", | ||
"--pod-running-timeout", "1s", | ||
fmt.Sprintf("%s/%s", pfe.resource.Type, pfe.resource.Name), | ||
fmt.Sprintf("%d:%d", pfe.localPort, pfe.resource.Port), | ||
"--namespace", pfe.resource.Namespace, | ||
) | ||
buf := &bytes.Buffer{} | ||
cmd.Stdout = buf | ||
cmd.Stderr = buf | ||
|
||
if err := cmd.Start(); err != nil { | ||
if ctx.Err() == context.Canceled { | ||
logrus.Debugf("couldn't start %v due to context cancellation", pfe) | ||
return | ||
} | ||
return true, nil | ||
}) | ||
resultChan <- err | ||
}() | ||
//retry on exit at Start() | ||
logrus.Debugf("error starting port forwarding %v: %s, output: %s", pfe, err, buf.String()) | ||
time.Sleep(500 * time.Millisecond) | ||
continue | ||
} | ||
|
||
err := <-resultChan | ||
return err | ||
//kill kubectl on port forwarding error logs | ||
go k.monitorErrorLogs(ctx, buf, cmd, pfe) | ||
|
||
if err := cmd.Wait(); err != nil { | ||
if ctx.Err() == context.Canceled { | ||
logrus.Debugf("terminated %v due to context cancellation", pfe) | ||
return | ||
} | ||
logrus.Debugf("port forwarding %v got terminated: %s, output: %s", pfe, err, buf.String()) | ||
time.Sleep(500 * time.Millisecond) | ||
} | ||
} | ||
} | ||
|
||
// Terminate terminates an existing kubectl port-forward command using SIGTERM | ||
|
@@ -107,18 +127,29 @@ func (*KubectlForwarder) Terminate(p *portForwardEntry) { | |
// Monitor monitors the logs for a kubectl port forward command | ||
// If it sees an error, it calls back to the EntryManager to | ||
// retry the entire port forward operation. | ||
func (*KubectlForwarder) Monitor(p *portForwardEntry, retryFunc func()) { | ||
func (*KubectlForwarder) monitorErrorLogs(ctx context.Context, buf *bytes.Buffer, cmd *exec.Cmd, p *portForwardEntry) { | ||
for { | ||
time.Sleep(1 * time.Second) | ||
s, _ := p.logBuffer.ReadString(byte('\n')) | ||
if s != "" { | ||
logrus.Tracef("[port-forward] %s", s) | ||
if strings.Contains(s, "error forwarding port") || strings.Contains(s, "unable to forward") { | ||
// kubectl is having an error. retry the command | ||
logrus.Infof("retrying kubectl port-forward due to error: %s", s) | ||
go retryFunc() | ||
return | ||
select { | ||
case <-ctx.Done(): | ||
return | ||
default: | ||
time.Sleep(1 * time.Second) | ||
s, _ := buf.ReadString(byte('\n')) | ||
if s != "" { | ||
logrus.Tracef("[port-forward] %s", s) | ||
|
||
if strings.Contains(s, "error forwarding port") || | ||
strings.Contains(s, "unable to forward") || | ||
strings.Contains(s, "error upgrading connection") { | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it make sense to log a warning or something so the user knows the retry is happening and why?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it is there in trace mode
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I keep thinking about what you mentioned: "error upgrading connection" is from port forwarding in client-go: https://github.com/kubernetes/client-go/blob/master/tools/portforward/portforward.go#L194 - I think it is fine to retry on it...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm pretty sure Cloud Code doesn't show trace — it would be good for users to at least be able to discover that there was a failure. I think restarting on error makes a lot of sense: it's essentially turning |
||
// kubectl is having an error. retry the command | ||
logrus.Tracef("killing port forwarding %v", p) | ||
if err := cmd.Process.Kill(); err != nil { | ||
logrus.Tracef("failed to kill port forwarding %v, err: %s", p, err) | ||
balopat marked this conversation as resolved.
Show resolved
Hide resolved
|
||
} | ||
return | ||
} | ||
} | ||
} | ||
|
||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
when reading this, it looks like this
forward
runs forever?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
it does, until the pfe gets cancelled (there are 3 return statements in the body, all around cancellation!)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
got lost in reading and saw only 1 return statement.
I still feel, all the 3 return statements are waiting for conditions that are
skaffold dev.
Will it keep trying forever until the above two conditions are satisfied?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
well, if things are going well, it is going to wait in
cmd.Wait()
as long as kubectl port-forward is running ...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
but yes, it does retry unless it is an explicit cancel from the skaffold process...
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Something like this:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
btw this was generated on planttext.com :)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wish I could put labels on the arrows - but the key thing is: non-cancellation error scenarios go back to start, cancellation exits, and no error continues to the next step.