Skip to content

Commit

Permalink
apimachinery runtime: support contextual logging
Browse files Browse the repository at this point in the history
In contrast to the original HandleError and HandleCrash, the new
HandleErrorInContext and HandleCrashInContext functions properly do contextual
logging, so if a problem occurs while e.g. dealing with a certain request and
WithValues was used for that request, then the error log entry will also
contain information about it.

The output changes from unstructured to structured, which might be a breaking
change for users who grep for panics. Care was taken to format panics
as similar as possible to the original output.

For errors, a message string gets added. There was none before, which made it
impossible to find all error output coming from HandleError.

Keeping HandleError and HandleCrash around without deprecating while changing
the signature of callbacks is a compromise between not breaking existing code
and not adding too many special cases that need to be supported. There is some
code which uses PanicHandlers or ErrorHandlers, but less than code that uses
the Handle* calls.

In Kubernetes, we want to replace the calls. golangci-lint now warns about
using the old functions as a hint. Because it contains a test that checks the
output and that check no longer passed, k8s.io/apiserver/pkg/server/filters
gets updated to use HandleErrorInContext.
  • Loading branch information
pohly committed Nov 22, 2023
1 parent 46f4248 commit 700db5b
Show file tree
Hide file tree
Showing 11 changed files with 193 additions and 34 deletions.
6 changes: 6 additions & 0 deletions hack/golangci-hints.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -107,6 +107,12 @@ linters-settings: # please keep this alphabetized
- p: \.Extract
pkg: ^k8s\.io/client-go/applyconfigurations/
msg: should not be used because managedFields was removed
- p: \.HandleError$
pkg: ^k8s\.io/apimachinery/pkg/util/runtime
msg: should consider HandleErrorInContext in new or modified code
- p: \.HandleCrash$
pkg: ^k8s\.io/apimachinery/pkg/util/runtime
msg: should consider HandleCrashInContext in new or modified code
revive:
# Only these rules are enabled.
rules:
Expand Down
8 changes: 8 additions & 0 deletions hack/golangci.yaml.in
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,14 @@ linters-settings: # please keep this alphabetized
- p: \.Extract
pkg: ^k8s\.io/client-go/applyconfigurations/
msg: should not be used because managedFields was removed
{{- if .Hints}}
- p: \.HandleError$
pkg: ^k8s\.io/apimachinery/pkg/util/runtime
msg: should consider HandleErrorInContext in new or modified code
- p: \.HandleCrash$
pkg: ^k8s\.io/apimachinery/pkg/util/runtime
msg: should consider HandleCrashInContext in new or modified code
{{- end}}
{{- if .Base }}
gocritic:
enabled-checks:
Expand Down
1 change: 1 addition & 0 deletions hack/logcheck.conf
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ structured k8s.io/apiserver/pkg/server/options/encryptionconfig/.*
# The following packages have been migrated to contextual logging.
# Packages matched here do not have to be listed above because
# "contextual" implies "structured".
contextual k8s.io/apimachinery/pkg/util/runtime/.*
contextual k8s.io/client-go/tools/events/.*
contextual k8s.io/client-go/tools/record/.*
contextual k8s.io/dynamic-resource-allocation/.*
Expand Down
106 changes: 88 additions & 18 deletions staging/src/k8s.io/apimachinery/pkg/util/runtime/runtime.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ limitations under the License.
package runtime

import (
"context"
"fmt"
"net/http"
"runtime"
Expand All @@ -35,7 +36,7 @@ var (
)

// PanicHandlers is a list of functions which will be invoked when a panic happens.
var PanicHandlers = []func(interface{}){logPanic}
var PanicHandlers = []func(context.Context, interface{}){logPanic}

// HandleCrash simply catches a crash and logs an error. Meant to be called via
// defer. Additional context-specific handlers can be provided, and will be
Expand All @@ -46,7 +47,7 @@ var PanicHandlers = []func(interface{}){logPanic}
func HandleCrash(additionalHandlers ...func(interface{})) {
if r := recover(); r != nil {
for _, fn := range PanicHandlers {
fn(r)
fn(context.Background(), r)
}
for _, fn := range additionalHandlers {
fn(r)
Expand All @@ -58,8 +59,31 @@ func HandleCrash(additionalHandlers ...func(interface{})) {
}
}

// HandleCrash simply catches a crash and logs an error. Meant to be called via
// defer. Additional context-specific handlers can be provided, and will be
// called in case of panic. HandleCrash actually crashes, after calling the
// handlers and logging the panic message.
//
// E.g., you can provide one or more additional handlers for something like shutting down go routines gracefully.
//
// The context is used to determine how to log. Should be used instead of HandleCrash.
func HandleCrashInContext(ctx context.Context, additionalHandlers ...func(context.Context, interface{})) {
if r := recover(); r != nil {
for _, fn := range PanicHandlers {
fn(ctx, r)
}
for _, fn := range additionalHandlers {
fn(ctx, r)
}
if ReallyCrash {
// Actually proceed to panic.
panic(r)
}
}
}

// logPanic logs the caller tree when a panic occurs (except in the special case of http.ErrAbortHandler).
func logPanic(r interface{}) {
func logPanic(ctx context.Context, r interface{}) {
if r == http.ErrAbortHandler {
// honor the http.ErrAbortHandler sentinel panic value:
// ErrAbortHandler is a sentinel panic value to abort a handler.
Expand All @@ -73,46 +97,92 @@ func logPanic(r interface{}) {
const size = 64 << 10
stacktrace := make([]byte, size)
stacktrace = stacktrace[:runtime.Stack(stacktrace, false)]

// We don't really know how many call frames to skip because the Go
// panic handler is between us and the code where the panic occurred.
// If it's one function (as in Go 1.21), then skipping three levels
// gets us to the function which called the `defer HandleCrashInContext(...)`.
logger := klog.FromContext(ctx).WithCallDepth(3)

// For backwards compatibility, conversion to string
// is handled here instead of defering to the logging
// backend.
if _, ok := r.(string); ok {
klog.Errorf("Observed a panic: %s\n%s", r, stacktrace)
logger.Error(nil, "Observed a panic", "panic", r, "stacktrace", string(stacktrace))
} else {
klog.Errorf("Observed a panic: %#v (%v)\n%s", r, r, stacktrace)
logger.Error(nil, "Observed a panic", "panic", fmt.Sprintf("%v", r), "panicGoValue", fmt.Sprintf("%#v", r), "stacktrace", string(stacktrace))
}
}

// ErrorHandlers is a list of functions which will be invoked when a nonreturnable
// error occurs.
// TODO(lavalamp): for testability, this and the below HandleError function
// should be packaged up into a testable and reusable object.
var ErrorHandlers = []func(error){
var ErrorHandlers = []ErrorHandler{
logError,
(&rudimentaryErrorBackoff{
lastErrorTime: time.Now(),
// 1ms was the number folks were able to stomach as a global rate limit.
// If you need to log errors more than 1000 times a second you
// should probably consider fixing your code instead. :)
minPeriod: time.Millisecond,
}).OnError,
func(_ context.Context, _ error, _ string, _ ...interface{}) {
(&rudimentaryErrorBackoff{
lastErrorTime: time.Now(),
// 1ms was the number folks were able to stomach as a global rate limit.
// If you need to log errors more than 1000 times a second you
// should probably consider fixing your code instead. :)
minPeriod: time.Millisecond,
}).OnError()
},
}

type ErrorHandler func(ctx context.Context, err error, msg string, keysAndValues ...interface{})

// HandlerError is a method to invoke when a non-user facing piece of code cannot
// return an error and needs to indicate it has been ignored. Invoking this method
// is preferable to logging the error - the default behavior is to log but the
// errors may be sent to a remote server for analysis.
//
// HandleErrorInContext is a better alternative.
func HandleError(err error) {
// this is sometimes called with a nil error. We probably shouldn't fail and should do nothing instead
if err == nil {
return
}

for _, fn := range ErrorHandlers {
fn(err)
fn(context.Background(), err, "Unhandled Error")
}
}

// HandlerError is a method to invoke when a non-user facing piece of code cannot
// return an error and needs to indicate it has been ignored. Invoking this method
// is preferable to logging the error - the default behavior is to log but the
// errors may be sent to a remote server for analysis. The context is used to
// determine how to log the error.
//
// If contextual logging is enabled, the default log output is equivalent to
//
// logr.FromContext(ctx).WithName("UnhandledError").Error(err, msg, keysAndValues...)
//
// Without contextual logging, it is equivalent to:
//
// klog.ErrorS(err, msg, keysAndValues...)
//
// In contrast to HandleError, passing nil for the error is still going to
// trigger a log entry. Don't construct a new error or wrap an error
// with fmt.Errorf. Instead, add additional information via the mssage
// and key/value pairs.
//
// This variant should be used instead of HandleError because it supports
// structured, contextual logging.
func HandleErrorInContext(ctx context.Context, err error, msg string, keysAndValues ...interface{}) {
for _, fn := range ErrorHandlers {
fn(ctx, err, msg, keysAndValues...)
}
}

// logError prints an error with the call stack of the location it was reported
func logError(err error) {
klog.ErrorDepth(2, err)
// logError prints an error with the call stack of the location it was reported.
// It expects to be called as <caller> -> HandleError[InContext] -> logError.
func logError(ctx context.Context, err error, msg string, keysAndValues ...interface{}) {
logger := klog.FromContext(ctx).WithCallDepth(2)
logger = klog.LoggerWithName(logger, "UnhandledError")
logger.Error(err, msg, keysAndValues...) //nolint:logcheck // logcheck complains about unknown key/value pairs.
}

type rudimentaryErrorBackoff struct {
Expand All @@ -125,7 +195,7 @@ type rudimentaryErrorBackoff struct {

// OnError will block if it is called more often than the embedded period time.
// This will prevent overly tight hot error loops.
func (r *rudimentaryErrorBackoff) OnError(error) {
func (r *rudimentaryErrorBackoff) OnError() {
now := time.Now() // start the timer before acquiring the lock
r.lastErrorTimeLock.Lock()
d := now.Sub(r.lastErrorTime)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
/*
Copyright 2014 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package runtime

import (
"bytes"
"context"
"errors"
"flag"
"fmt"
"regexp"

"k8s.io/klog/v2"
)

//nolint:logcheck // Several functions are normally not okay in a package.
func ExampleHandleErrorInContext() {
state := klog.CaptureState()
defer state.Restore()
var fs flag.FlagSet
klog.InitFlags(&fs)
for flag, value := range map[string]string{
"one_output": "true",
"logtostderr": "false",
} {
if err := fs.Set(flag, value); err != nil {
fmt.Printf("Unexpected error configuring klog: %v", err)
return
}
}
var buffer bytes.Buffer
klog.SetOutput(&buffer)

logger := klog.Background()
logger = klog.LoggerWithValues(logger, "request", 42)
ctx := klog.NewContext(context.Background(), logger)

// The line number of the next call must be at line 60. Here are some
// blank lines that can be removed to keep the line unchanged.
//
//
//
//
//
//
HandleErrorInContext(ctx, errors.New("fake error"), "test")

klog.Flush()
// Strip varying header. Code location should be constant and something
// that needs to be tested.
output := buffer.String()
output = regexp.MustCompile(`^.* ([^[:space:]]*.go:[[:digit:]]*)\] `).ReplaceAllString(output, `xxx $1] `)
fmt.Print(output)

// Output:
// xxx runtime_stack_test.go:60] "test" err="fake error" logger="UnhandledError" request=42
}
14 changes: 8 additions & 6 deletions staging/src/k8s.io/apimachinery/pkg/util/runtime/runtime_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package runtime

import (
"bytes"
"context"
"fmt"
"io"
"net/http"
Expand All @@ -43,8 +44,8 @@ func TestCustomHandleCrash(t *testing.T) {
old := PanicHandlers
defer func() { PanicHandlers = old }()
var result interface{}
PanicHandlers = []func(interface{}){
func(r interface{}) {
PanicHandlers = []func(context.Context, interface{}){
func(_ context.Context, r interface{}) {
result = r
},
}
Expand All @@ -66,8 +67,8 @@ func TestCustomHandleError(t *testing.T) {
old := ErrorHandlers
defer func() { ErrorHandlers = old }()
var result error
ErrorHandlers = []func(error){
func(err error) {
ErrorHandlers = []ErrorHandler{
func(_ context.Context, err error, msg string, keysAndValues ...interface{}) {
result = err
},
}
Expand Down Expand Up @@ -101,7 +102,8 @@ func TestHandleCrashLog(t *testing.T) {
if len(lines) < 4 {
t.Fatalf("panic log should have 1 line of message, 1 line per goroutine and 2 lines per function call")
}
if match, _ := regexp.MatchString("Observed a panic: test panic", lines[0]); !match {
t.Logf("Got log output:\n%s", strings.Join(lines, "\n"))
if match, _ := regexp.MatchString(`"Observed a panic" panic="test panic"`, lines[0]); !match {
t.Errorf("mismatch panic message: %s", lines[0])
}
// The following regexp's verify that Kubernetes panic log matches Golang stdlib
Expand Down Expand Up @@ -170,7 +172,7 @@ func Test_rudimentaryErrorBackoff_OnError_ParallelSleep(t *testing.T) {
wg.Add(1)
go func() {
<-start
r.OnError(nil) // input error is ignored
r.OnError()
wg.Done()
}()
}
Expand Down
4 changes: 2 additions & 2 deletions staging/src/k8s.io/apimachinery/pkg/util/wait/wait_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -194,8 +194,8 @@ func TestJitterUntilRecoversPanic(t *testing.T) {

// Hook up a custom crash handler to ensure it is called when a jitter function panics
runtime.ReallyCrash = false
runtime.PanicHandlers = []func(interface{}){
func(p interface{}) {
runtime.PanicHandlers = []func(context.Context, interface{}){
func(_ context.Context, p interface{}) {
handled++
},
}
Expand Down
4 changes: 1 addition & 3 deletions staging/src/k8s.io/apiserver/pkg/server/filters/timeout.go
Original file line number Diff line number Diff line change
Expand Up @@ -137,9 +137,7 @@ func (t *timeoutHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
}

metrics.RecordRequestPostTimeout(metrics.PostTimeoutSourceTimeoutHandler, status)
err := fmt.Errorf("post-timeout activity - time-elapsed: %s, %v %q result: %v",
time.Since(timedOutAt), r.Method, r.URL.Path, res)
utilruntime.HandleError(err)
utilruntime.HandleErrorInContext(r.Context(), nil, "Post-timeout activity", "timeElapsed", time.Since(timedOutAt), "method", r.Method, "path", r.URL.Path, "result", res)
}()
}()
httplog.SetStacktracePredicate(r.Context(), func(status int) bool {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,9 @@ func TestErrConnKilled(t *testing.T) {
if isStackTraceLoggedByRuntime(capturedOutput) {
t.Errorf("unexpected stack trace in log, actual = %v", capturedOutput)
}
if !strings.Contains(capturedOutput, `timeout or abort while handling: method=GET URI="/" audit-ID=""`) {
// For the sake of simplicity and clarity this matches the full log line.
// This is not part of the Kubernetes API and could change.
if !strings.Contains(capturedOutput, `"Timeout or abort while handling" logger="UnhandledError" method="GET" URI="/" auditID=""`) {
t.Errorf("unexpected output captured actual = %v", capturedOutput)
}
}
Expand Down Expand Up @@ -501,7 +503,9 @@ func TestErrConnKilledHTTP2(t *testing.T) {
if isStackTraceLoggedByRuntime(capturedOutput) {
t.Errorf("unexpected stack trace in log, actual = %v", capturedOutput)
}
if !strings.Contains(capturedOutput, `timeout or abort while handling: method=GET URI="/" audit-ID=""`) {
// For the sake of simplicity and clarity this matches the full log line.
// This is not part of the Kubernetes API and could change.
if !strings.Contains(capturedOutput, `"Timeout or abort while handling" logger="UnhandledError" method="GET" URI="/" auditID=""`) {
t.Errorf("unexpected output captured actual = %v", capturedOutput)
}

Expand Down
3 changes: 1 addition & 2 deletions staging/src/k8s.io/apiserver/pkg/server/filters/wrap.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@ limitations under the License.
package filters

import (
"fmt"
"net/http"

"k8s.io/apimachinery/pkg/util/runtime"
Expand Down Expand Up @@ -51,7 +50,7 @@ func WithPanicRecovery(handler http.Handler, resolver request.RequestInfoResolve
// This call can have different handlers, but the default chain rate limits. Call it after the metrics are updated
// in case the rate limit delays it. If you outrun the rate for this one timed out requests, something has gone
// seriously wrong with your server, but generally having a logging signal for timeouts is useful.
runtime.HandleError(fmt.Errorf("timeout or abort while handling: method=%v URI=%q audit-ID=%q", req.Method, req.RequestURI, audit.GetAuditIDTruncated(req.Context())))
runtime.HandleErrorInContext(req.Context(), nil, "Timeout or abort while handling", "method", req.Method, "URI", req.RequestURI, "auditID", audit.GetAuditIDTruncated(req.Context()))
return
}
http.Error(w, "This request caused apiserver to panic. Look in the logs for details.", http.StatusInternalServerError)
Expand Down
Loading

0 comments on commit 700db5b

Please sign in to comment.