Skip to content

Commit

Permalink
Decouple TaskRun startTime from pod start time ⌚
Browse files Browse the repository at this point in the history
Our TaskRun timeout end to end test has been intermittently failing
against PRs. After adding a lot of (terrible) log messages, I found out
that the reason TaskRuns weren't timing out was b/c the go routine
checking for a timeout was considering them timed out several
milliseconds before the reconcile loop would.

So what would happen was:

1. The timeout handler decided the run timed out, and triggered a
   reconcile
2. The reconcile checked if the run timed out, decided the run had a few
   more milliseconds of execution time and let it go
3. And the TaskRun would just keep running

It looks like the root cause is that when the go routine starts,
it uses a `StartTime` that has been set by `TaskRun.InitializeConditions`,
but after that, the pod is started and the `TaskRun.Status` is updated to use _the pod's_
`StartTime` instead - which is what the Reconcile loop will use from
that point forward, causing the slight drift.

This is fixed by no longer tying the start time of the TaskRun to the
pod: the TaskRun will be considered started as soon as the reconciler
starts to act on it, which is probably the functionality the user would
expect anyway (e.g. if the pod was delayed in being started, this delay
should be subtracted from the timeout, and in tektoncd#734 we are looking to be
more tolerant of pods not immediately scheduling anyway).

Fixes tektoncd#731

Co-authored-by: Nader Ziada <[email protected]>
  • Loading branch information
2 people authored and wlynch committed May 20, 2019
1 parent d0dc4a0 commit 03adc3b
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 16 deletions.
7 changes: 4 additions & 3 deletions pkg/reconciler/v1alpha1/pipelinerun/pipelinerun.go
Original file line number Diff line number Diff line change
Expand Up @@ -167,11 +167,12 @@ func (c *Reconciler) Reconcile(ctx context.Context, key string) error {
pr := original.DeepCopy()
c.timeoutHandler.StatusLock(pr)
if !pr.HasStarted() {
pr.Status.InitializeConditions()
// start goroutine to track pipelinerun timeout only startTime is not set
go c.timeoutHandler.WaitPipelineRun(pr)
go c.timeoutHandler.WaitPipelineRun(pr, pr.Status.StartTime)
} else {
pr.Status.InitializeConditions()
}
pr.Status.InitializeConditions()

c.timeoutHandler.StatusUnlock(original)

if pr.IsDone() {
Expand Down
8 changes: 6 additions & 2 deletions pkg/reconciler/v1alpha1/taskrun/taskrun.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ func (c *Reconciler) Reconcile(ctx context.Context, key string) error {

// Don't modify the informer's copy.
tr := original.DeepCopy()

// If the TaskRun is just starting, this will also set the starttime,
// from which the timeout will immediately begin counting down.
tr.Status.InitializeConditions()

if tr.IsDone() {
Expand Down Expand Up @@ -303,7 +306,7 @@ func (c *Reconciler) reconcile(ctx context.Context, tr *v1alpha1.TaskRun) error
c.Logger.Errorf("Failed to create build pod for task %q :%v", err, tr.Name)
return nil
}
go c.timeoutHandler.WaitTaskRun(tr)
go c.timeoutHandler.WaitTaskRun(tr, tr.Status.StartTime)
}
if err := c.tracker.Track(tr.GetBuildPodRef(), tr); err != nil {
c.Logger.Errorf("Failed to create tracker for build pod %q for taskrun %q: %v", tr.Name, tr.Name, err)
Expand Down Expand Up @@ -336,7 +339,6 @@ func updateStatusFromPod(taskRun *v1alpha1.TaskRun, pod *corev1.Pod) {
})
}

taskRun.Status.StartTime = &pod.CreationTimestamp
taskRun.Status.PodName = pod.Name

taskRun.Status.Steps = []v1alpha1.StepState{}
Expand Down Expand Up @@ -535,6 +537,8 @@ func (c *Reconciler) checkTimeout(tr *v1alpha1.TaskRun, ts *v1alpha1.TaskSpec, d
if tr.Spec.Timeout != nil {
timeout := tr.Spec.Timeout.Duration
runtime := time.Since(tr.Status.StartTime.Time)

c.Logger.Infof("Checking timeout for TaskRun %q (startTime %s, timeout %s, runtime %s)", tr.Name, tr.Status.StartTime, timeout, runtime)
if runtime > timeout {
c.Logger.Infof("TaskRun %q is timeout (runtime %s over %s), deleting pod", tr.Name, runtime, timeout)
if err := dp(tr.Status.PodName, &metav1.DeleteOptions{}); err != nil && !errors.IsNotFound(err) {
Expand Down
72 changes: 62 additions & 10 deletions pkg/reconciler/v1alpha1/taskrun/taskrun_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -978,6 +978,55 @@ func TestReconcile(t *testing.T) {
}
}

func TestReconcile_SetsStartTime(t *testing.T) {
taskRun := tb.TaskRun("test-taskrun", "foo", tb.TaskRunSpec(
tb.TaskRunTaskRef(simpleTask.Name),
))
d := test.Data{
TaskRuns: []*v1alpha1.TaskRun{taskRun},
Tasks: []*v1alpha1.Task{simpleTask},
}
testAssets := getTaskRunController(d)

if err := testAssets.Controller.Reconciler.Reconcile(context.Background(), getRunName(taskRun)); err != nil {
t.Errorf("expected no error reconciling valid TaskRun but got %v", err)
}

if taskRun.Status.StartTime == nil || taskRun.Status.StartTime.IsZero() {
t.Errorf("expected startTime to be set by reconcile but was %q", taskRun.Status.StartTime)
}
}

func TestReconcile_DoesntChangeStartTime(t *testing.T) {
startTime := time.Date(2000, 1, 1, 1, 1, 1, 1, time.UTC)
taskRun := tb.TaskRun("test-taskrun", "foo", tb.TaskRunSpec(
tb.TaskRunTaskRef(simpleTask.Name)),
tb.TaskRunStatus(
tb.TaskRunStartTime(startTime),
tb.PodName("the-pod"),
),
)
d := test.Data{
TaskRuns: []*v1alpha1.TaskRun{taskRun},
Tasks: []*v1alpha1.Task{simpleTask},
Pods: []*corev1.Pod{{
ObjectMeta: metav1.ObjectMeta{
Namespace: "foo",
Name: "the-pod",
},
}},
}
testAssets := getTaskRunController(d)

if err := testAssets.Controller.Reconciler.Reconcile(context.Background(), getRunName(taskRun)); err != nil {
t.Errorf("expected no error reconciling valid TaskRun but got %v", err)
}

if taskRun.Status.StartTime.Time != startTime {
t.Errorf("expected startTime %q to be preserved by reconcile but was %q", startTime, taskRun.Status.StartTime)
}
}

func TestReconcileInvalidTaskRuns(t *testing.T) {
noTaskRun := tb.TaskRun("notaskrun", "foo", tb.TaskRunSpec(tb.TaskRunTaskRef("notask")))
withWrongRef := tb.TaskRun("taskrun-with-wrong-ref", "foo", tb.TaskRunSpec(
Expand Down Expand Up @@ -1288,13 +1337,6 @@ func TestUpdateStatusFromPod(t *testing.T) {
Status: corev1.ConditionUnknown,
Reason: "Building",
}
// ignoreFields := cmpopts.IgnoreFields(v1alpha1.TaskRunStatus{}, "CompletionTime")
compareCompletionTime := cmp.Comparer(func(x, y *metav1.Time) bool {
if x == nil {
return y == nil
}
return y != nil
})
for _, c := range []struct {
desc string
podStatus corev1.PodStatus
Expand Down Expand Up @@ -1579,16 +1621,26 @@ func TestUpdateStatusFromPod(t *testing.T) {
},
Status: c.podStatus,
}
tr := tb.TaskRun("taskRun", "foo")
startTime := time.Date(2010, 1, 1, 1, 1, 1, 1, time.UTC)
tr := tb.TaskRun("taskRun", "foo", tb.TaskRunStatus(tb.TaskRunStartTime(startTime)))
updateStatusFromPod(tr, p)

// Common traits, set for test case brevity.
c.want.PodName = "pod"
c.want.StartTime = &now
c.want.StartTime = &metav1.Time{Time: startTime}

if d := cmp.Diff(tr.Status, c.want, ignoreVolatileTime, compareCompletionTime); d != "" {
ensureTimeNotNil := cmp.Comparer(func(x, y *metav1.Time) bool {
if x == nil {
return y == nil
}
return y != nil
})
if d := cmp.Diff(tr.Status, c.want, ignoreVolatileTime, ensureTimeNotNil); d != "" {
t.Errorf("Diff:\n%s", d)
}
if tr.Status.StartTime.Time != c.want.StartTime.Time {
t.Errorf("Expected TaskRun startTime to be unchanged but was %s", tr.Status.StartTime)
}
})
}
}
1 change: 0 additions & 1 deletion test/timeout_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,6 @@ func TestPipelineRunTimeout(t *testing.T) {

// TestTaskRunTimeout is an integration test that will verify a TaskRun can be timed out.
func TestTaskRunTimeout(t *testing.T) {
t.Skip("Flakey test, tracked in https://github.com/tektoncd/pipeline/issues/731")
c, namespace := setup(t)
t.Parallel()

Expand Down

0 comments on commit 03adc3b

Please sign in to comment.