Skip to content

Commit

Permalink
feat(hatchery): allow to disable or change max job start attempts count (#5914)
Browse files Browse the repository at this point in the history
  • Loading branch information
richardlt authored Sep 1, 2021
1 parent de71b22 commit 8dc52d6
Show file tree
Hide file tree
Showing 6 changed files with 40 additions and 31 deletions.
1 change: 1 addition & 0 deletions engine/service/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ type HatcheryCommonConfiguration struct {
ExtraValue string `toml:"extraValue" comment:"value for extraKey field. For many keys: valueaaa,valuebbb" json:"-"`
} `toml:"graylog" json:"graylog"`
} `toml:"workerLogsOptions" comment:"Worker Log Configuration" json:"workerLogsOptions"`
MaxAttemptsNumberBeforeFailure int `toml:"maxAttemptsNumberBeforeFailure" default:"5" commented:"true" comment:"Maximum attempts to start a same job. -1 to disable failing jobs when to many attempts" json:"maxAttemptsNumberBeforeFailure"`
} `toml:"provision" json:"provision"`
LogOptions struct {
SpawnOptions struct {
Expand Down
2 changes: 2 additions & 0 deletions sdk/error.go
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,7 @@ var (
ErrConflictData = Error{ID: 192, Status: http.StatusConflict}
ErrWebsocketUpgrade = Error{ID: 193, Status: http.StatusUpgradeRequired}
ErrMFARequired = Error{ID: 194, Status: http.StatusForbidden}
ErrHatcheryNoResourceAvailable = Error{ID: 195, Status: http.StatusInternalServerError}
)

var errorsAmericanEnglish = map[int]string{
Expand Down Expand Up @@ -383,6 +384,7 @@ var errorsAmericanEnglish = map[int]string{
ErrConflictData.ID: "Data conflict",
ErrWebsocketUpgrade.ID: "Websocket upgrade required",
ErrMFARequired.ID: "Multi factor authentication is required",
ErrHatcheryNoResourceAvailable.ID: "No enough resource available to start worker",
}

// Error type.
Expand Down
60 changes: 33 additions & 27 deletions sdk/hatchery/hatchery.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ import (

var (
// Client is a CDS Client
Client cdsclient.HTTPClient
defaultMaxProvisioning = 10
models []sdk.Model
MaxAttemptsNumberBeforeFailure = 5
CacheSpawnIDsTTL = 10 * time.Second
CacheNbAttemptsIDsTTL = 1 * time.Hour
Client cdsclient.HTTPClient
defaultMaxProvisioning = 10
models []sdk.Model
defaultMaxAttemptsNumberBeforeFailure = 5
CacheSpawnIDsTTL = 10 * time.Second
CacheNbAttemptsIDsTTL = 1 * time.Hour
)

type CacheNbAttemptsJobIDs struct {
Expand Down Expand Up @@ -87,7 +87,7 @@ func Create(ctx context.Context, h Interface) error {
// Create a cache to keep in memory the jobID processed in the last 10s.
cacheSpawnIDs := cache.New(CacheSpawnIDsTTL, 2*CacheSpawnIDsTTL)

// Create a cache to only process each jobID only a number of attenmpts before force to fail the job
// Create a cache to only process each jobID only a number of attempts before force to fail the job
cacheNbAttemptsIDs := &CacheNbAttemptsJobIDs{
cache: cache.New(CacheNbAttemptsIDsTTL, 2*CacheNbAttemptsIDsTTL),
}
Expand Down Expand Up @@ -226,26 +226,6 @@ func Create(ctx context.Context, h Interface) error {
continue
}

//Check if we already try to start a worker for this job
nbAttempts := cacheNbAttemptsIDs.NewAttempt(j.ID)
if nbAttempts > MaxAttemptsNumberBeforeFailure {
if err := h.CDSClient().
QueueSendResult(ctx,
j.ID,
sdk.Result{
ID: j.ID,
BuildID: j.ID,
Status: sdk.StatusFail,
RemoteTime: time.Now(),
Reason: fmt.Sprintf("hatchery %q failed to start worker after %d attempts", h.Configuration().Name, MaxAttemptsNumberBeforeFailure),
}); err != nil {
log.ErrorWithStackTrace(ctx, err)
}
log.Info(ctx, "hatchery %q failed to start worker after %d attempts", h.Configuration().Name, MaxAttemptsNumberBeforeFailure)
endTrace("maximum attempts")
continue
}

workerRequest := workerStarterRequest{
ctx: currentCtx,
cancel: endTrace,
Expand Down Expand Up @@ -313,6 +293,32 @@ func Create(ctx context.Context, h Interface) error {
}
}

// Check whether we have already tried to start a worker for this job
maxAttemptsNumberBeforeFailure := h.Configuration().Provision.MaxAttemptsNumberBeforeFailure
if maxAttemptsNumberBeforeFailure > -1 {
nbAttempts := cacheNbAttemptsIDs.NewAttempt(j.ID)
if maxAttemptsNumberBeforeFailure == 0 {
maxAttemptsNumberBeforeFailure = defaultMaxAttemptsNumberBeforeFailure
}
if nbAttempts > maxAttemptsNumberBeforeFailure {
if err := h.CDSClient().
QueueSendResult(ctx,
j.ID,
sdk.Result{
ID: j.ID,
BuildID: j.ID,
Status: sdk.StatusFail,
RemoteTime: time.Now(),
Reason: fmt.Sprintf("hatchery %q failed to start worker after %d attempts", h.Configuration().Name, maxAttemptsNumberBeforeFailure),
}); err != nil {
log.ErrorWithStackTrace(ctx, err)
}
log.Info(ctx, "hatchery %q failed to start worker after %d attempts", h.Configuration().Name, maxAttemptsNumberBeforeFailure)
endTrace("maximum attempts")
continue
}
}

//Ask to start
log.Debug(ctx, "hatchery> Request a worker for job %d (%.3f seconds elapsed)", j.ID, time.Since(t0).Seconds())
workersStartChan <- workerRequest
Expand Down
4 changes: 2 additions & 2 deletions sdk/hatchery/hatchery_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@ func TestCreate(t *testing.T) {
}
hatcheryConfig.Provision.MaxWorker = 1

hatchery.CacheSpawnIDsTTL = 2 * time.Second // decrease this cache TTL to speedup the test
hatchery.MaxAttemptsNumberBeforeFailure = 2 // decrease this value to speedup the test
hatchery.CacheSpawnIDsTTL = 2 * time.Second // decrease this cache TTL to speedup the test
hatcheryConfig.Provision.MaxAttemptsNumberBeforeFailure = 2 // decrease this value to speedup the test

mockHatchery.EXPECT().Name().Return(t.Name()).AnyTimes()
mockHatchery.EXPECT().Type().Return(sdk.TypeHatchery).AnyTimes()
Expand Down
2 changes: 1 addition & 1 deletion sdk/hatchery/starter.go
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,7 @@ func spawnWorkerForJob(ctx context.Context, h Interface, j workerStarterRequest)
ID: sdk.MsgSpawnInfoHatcheryErrorSpawn.ID,
Args: []interface{}{h.Service().Name, modelName, sdk.Round(time.Since(start), time.Second).String(), sdk.ExtractHTTPError(errSpawn).Error()},
})
log.Error(ctx, "hatchery %s cannot spawn worker %s for job %d: %v", h.Service().Name, modelName, j.id, errSpawn)
log.ErrorWithStackTrace(ctx, sdk.WrapError(errSpawn, "hatchery %s cannot spawn worker %s for job %d", h.Service().Name, modelName, j.id))
next()
return false
}
Expand Down
2 changes: 1 addition & 1 deletion sdk/messages.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ var (
MsgEnvironmentDetached = &Message{"MsgEnvironmentDetached", trad{FR: "L'environnement %s est détaché du repository %s", EN: "The environment %s is detached from repository %s"}, nil, RunInfoTypInfo}
MsgWorkflowDetached = &Message{"MsgWorkflowDetached", trad{FR: "Le workflow %s est détaché du repository %s", EN: "The workflow %s is detached from repository %s"}, nil, RunInfoTypInfo}
MsgSpawnInfoHatcheryStarts = &Message{"MsgSpawnInfoHatcheryStarts", trad{FR: "La Hatchery %s a démarré le lancement du worker avec le modèle %s", EN: "Hatchery %s starts spawn worker with model %s"}, nil, RunInfoTypInfo}
MsgSpawnInfoHatcheryErrorSpawn = &Message{"MsgSpawnInfoHatcheryErrorSpawn", trad{FR: "Une erreur est survenue lorsque la Hatchery %s a démarré un worker avec le modèle %s après %s, err:%s", EN: "Error while Hatchery %s spawns worker with model %s after %s, err:%s"}, nil, RunInfoTypeError}
MsgSpawnInfoHatcheryErrorSpawn = &Message{"MsgSpawnInfoHatcheryErrorSpawn", trad{FR: "Une erreur est survenue lorsque la Hatchery %s a démarré un worker avec le modèle %s après %s, err: %s", EN: "Error while Hatchery %s spawns worker with model %s after %s, err: %s"}, nil, RunInfoTypeError}
MsgSpawnInfoHatcheryStartDockerPull = &Message{"MsgSpawnInfoHatcheryStartDockerPull", trad{FR: "La Hatchery %s a démarré le docker pull de l'image %s...", EN: "Hatchery %s starts docker pull %s..."}, nil, RunInfoTypInfo}
MsgSpawnInfoHatcheryEndDockerPull = &Message{"MsgSpawnInfoHatcheryEndDockerPull", trad{FR: "La Hatchery %s a terminé le docker pull de l'image %s", EN: "Hatchery %s docker pull %s done"}, nil, RunInfoTypInfo}
MsgSpawnInfoHatcheryEndDockerPullErr = &Message{"MsgSpawnInfoHatcheryEndDockerPullErr", trad{FR: "⚠ La Hatchery %s a terminé le docker pull de l'image %s en erreur: %s", EN: "⚠ Hatchery %s - docker pull %s done with error: %v"}, nil, RunInfoTypeError}
Expand Down

0 comments on commit 8dc52d6

Please sign in to comment.