Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(hatchery): do not add same job into workersStarter #6722

Merged
merged 7 commits into from
Dec 7, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions engine/hatchery/serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ type Common struct {
Clientv2 cdsclient.HatcheryServiceClient
mapServiceNextLineNumberMutex sync.Mutex
mapServiceNextLineNumber map[string]int64
mapSpawnJobRequest map[string]bool
yesnault marked this conversation as resolved.
Show resolved Hide resolved
mapSpawnJobRequestMutex *sync.Mutex
}

func (c *Common) MaxHeartbeat() int {
Expand Down Expand Up @@ -65,6 +67,25 @@ func (c *Common) GetGoRoutines() *sdk.GoRoutines {
return c.GoRoutines
}

func (c *Common) SetJobInPendingWorkerCreation(id string) {
c.mapSpawnJobRequestMutex.Lock()
c.mapSpawnJobRequest[id] = true
yesnault marked this conversation as resolved.
Show resolved Hide resolved
c.mapSpawnJobRequestMutex.Unlock()
}

// RemoveJobFromPendingWorkerCreation drops id from mapSpawnJobRequest while
// holding mapSpawnJobRequestMutex. Removing an id that is not in the map is a
// no-op.
func (c *Common) RemoveJobFromPendingWorkerCreation(id string) {
	mu := c.mapSpawnJobRequestMutex
	mu.Lock()
	defer mu.Unlock()

	delete(c.mapSpawnJobRequest, id)
}

// IsJobAlreadyPendingWorkerCreation reports whether id is currently flagged in
// mapSpawnJobRequest. The lookup is done while holding
// mapSpawnJobRequestMutex; an unknown id yields false.
func (c *Common) IsJobAlreadyPendingWorkerCreation(id string) bool {
	mu := c.mapSpawnJobRequestMutex
	mu.Lock()
	defer mu.Unlock()

	return c.mapSpawnJobRequest[id]
}

// CommonServe starts the HatcheryLocal server
func (c *Common) CommonServe(ctx context.Context, h hatchery.Interface) error {
log.Info(ctx, "%s> Starting service %s (%s)...", c.Name(), h.Configuration().Name, sdk.VERSION)
Expand Down Expand Up @@ -231,6 +252,9 @@ func (c *Common) Init(ctx context.Context, h hatchery.Interface) error {
}
}

c.mapSpawnJobRequest = make(map[string]bool)
c.mapSpawnJobRequestMutex = new(sync.Mutex)

return c.initServiceLogger(ctx)
}

Expand Down
25 changes: 13 additions & 12 deletions sdk/hatchery/hatchery.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ var (
defaultMaxProvisioning = 10
models []sdk.Model
defaultMaxAttemptsNumberBeforeFailure = 5
CacheSpawnIDsTTL = 10 * time.Second
CacheNbAttemptsIDsTTL = 1 * time.Hour
)

Expand Down Expand Up @@ -86,9 +85,6 @@ func Create(ctx context.Context, h Interface) error {
v2Runjobs := make(chan sdk.V2WorkflowRunJob, h.Configuration().Provision.MaxConcurrentProvisioning)
errs := make(chan error, 1)

// Create a cache to keep in memory the jobID processed in the last 10s.
cacheSpawnIDs := cache.New(CacheSpawnIDsTTL, 2*CacheSpawnIDsTTL)

// Create a cache to only process each jobID only a number of attempts before force to fail the job
cacheNbAttemptsIDs := &CacheNbAttemptsJobIDs{
cache: cache.New(CacheNbAttemptsIDsTTL, 2*CacheNbAttemptsIDsTTL),
Expand Down Expand Up @@ -154,7 +150,7 @@ func Create(ctx context.Context, h Interface) error {
log.Error(ctx, "error on h.WorkerModelsEnabled(): %v", errwm)
}
case j := <-v2Runjobs:
if err := handleJobV2(ctx, h, j, cacheNbAttemptsIDs, workersStartChan); err != nil {
if err := handleJobV2(h, j, cacheNbAttemptsIDs, workersStartChan); err != nil {
log.ErrorWithStackTrace(ctx, err)
}
case j := <-wjobs:
Expand Down Expand Up @@ -216,15 +212,12 @@ func Create(ctx context.Context, h Interface) error {
}

// Check whether the job is already concerned by a pending worker creation
if _, exist := cacheSpawnIDs.Get(strconv.FormatInt(j.ID, 10)); exist {
if h.IsJobAlreadyPendingWorkerCreation(strconv.FormatInt(j.ID, 10)) {
log.Debug(currentCtx, "job %d already spawned in previous routine", j.ID)
endTrace("already spawned")
endTrace("already in worker creation process")
continue
}

//Before doing anything, push in cache
cacheSpawnIDs.SetDefault(strconv.FormatInt(j.ID, 10), j.ID)

//Check bookedBy current hatchery
if j.BookedBy.ID != 0 {
log.Debug(currentCtx, "hatchery> job %d is already booked", j.ID)
Expand Down Expand Up @@ -270,7 +263,6 @@ func Create(ctx context.Context, h Interface) error {
log.Debug(currentCtx, "cannot launch this job because it does not contains a region prerequisite and IgnoreJobWithNoRegion=true in hatchery configuration")
canTakeJob = false
} else if isWithModels {

// Test ascode model
modelPath := strings.Split(jobModel, "/")
if len(modelPath) >= 5 {
Expand Down Expand Up @@ -349,6 +341,7 @@ func Create(ctx context.Context, h Interface) error {
}

logStepInfo(currentCtx, "processed", j.Queued)
h.SetJobInPendingWorkerCreation(strconv.FormatInt(j.ID, 10))
workersStartChan <- workerRequest
case <-chanRegister:
if err := workerRegister(ctx, hWithModels, workersStartChan); err != nil {
Expand All @@ -360,7 +353,7 @@ func Create(ctx context.Context, h Interface) error {
return nil
}

func handleJobV2(ctx context.Context, h Interface, j sdk.V2WorkflowRunJob, cacheAttempts *CacheNbAttemptsJobIDs, workersStartChan chan<- workerStarterRequest) error {
func handleJobV2(h Interface, j sdk.V2WorkflowRunJob, cacheAttempts *CacheNbAttemptsJobIDs, workersStartChan chan<- workerStarterRequest) error {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
ctx = telemetry.New(ctx, h, "hatchery.V2JobReceive", trace.AlwaysSample(), trace.SpanKindServer)
ctx, end := telemetry.Span(ctx, "hatchery.V2JobReceive", telemetry.Tag(telemetry.TagWorkflow, j.WorkflowName),
Expand Down Expand Up @@ -403,6 +396,13 @@ func handleJobV2(ctx context.Context, h Interface, j sdk.V2WorkflowRunJob, cache
endTrace("no capacities")
}

// Check whether the job is already concerned by a pending worker creation
if h.IsJobAlreadyPendingWorkerCreation(j.ID) {
log.Debug(ctx, "job %d already spawned in previous routine", j.ID)
endTrace("already in worker creation process")
return nil
}

workerRequest := workerStarterRequest{
ctx: ctx,
cancel: cancel,
Expand Down Expand Up @@ -453,6 +453,7 @@ func handleJobV2(ctx context.Context, h Interface, j sdk.V2WorkflowRunJob, cache
}

logStepInfo(ctx, "processed", j.Queued)
h.SetJobInPendingWorkerCreation(j.ID)
workersStartChan <- workerRequest
return nil
}
Expand Down
95 changes: 83 additions & 12 deletions sdk/hatchery/hatchery_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,80 @@ import (
"github.com/ovh/cds/sdk/jws"
)

// TestCreateOneJob runs hatchery.Create against mocked hatchery and CDS client
// interfaces. The mocked QueuePolling pushes a single job (ID 777) into the
// jobs channel, and the test expects exactly one call each to
// IsJobAlreadyPendingWorkerCreation, SetJobInPendingWorkerCreation,
// RemoveJobFromPendingWorkerCreation and SpawnWorker before the 7-second
// context expires.
func TestCreateOneJob(t *testing.T) {
log.Factory = log.NewTestingWrapper(t)
ctx := context.TODO()
// The timeout bounds the whole test: the function blocks on ctx.Done() at the end.
ctx, cancel := context.WithTimeout(ctx, 7*time.Second)
defer cancel()
// One gomock controller per mocked interface.
ctrl1 := gomock.NewController(t)
ctrl2 := gomock.NewController(t)

t.Cleanup(func() {
ctrl1.Finish()
ctrl2.Finish()
})

mockHatchery := mock_hatchery.NewMockInterface(ctrl1)
mockCDSClient := mock_cdsclient.NewMockInterface(ctrl2)

grtn := sdk.NewGoRoutines(ctx)
hatcheryConfig := service.HatcheryCommonConfiguration{
Name: t.Name(),
}
hatcheryConfig.Provision.MaxWorker = 1

hatcheryConfig.Provision.MaxAttemptsNumberBeforeFailure = 2 // decrease this value to speedup the test

// Accessor expectations: may be called any number of times, except
// InitHatchery which is expected exactly once (gomock default).
mockHatchery.EXPECT().Name().Return(t.Name()).AnyTimes()
mockHatchery.EXPECT().Type().Return(sdk.TypeHatchery).AnyTimes()
mockHatchery.EXPECT().Service().Return(&sdk.Service{}).AnyTimes()
mockHatchery.EXPECT().InitHatchery(gomock.Any()).Return(nil)
mockHatchery.EXPECT().Configuration().Return(hatcheryConfig).AnyTimes()
mockHatchery.EXPECT().GetGoRoutines().Return(grtn).AnyTimes()
mockHatchery.EXPECT().CDSClient().Return(mockCDSClient).AnyTimes()
mockHatchery.EXPECT().CDSClientV2().Return(nil).AnyTimes()
// The mocked QueuePolling sends one job, then blocks until its context is done.
mockCDSClient.EXPECT().QueuePolling(gomock.Any(), grtn, gomock.Any(), gomock.Any(), gomock.Any(), gomock.Any()).DoAndReturn(
func(ctx context.Context, goRoutines *sdk.GoRoutines, jobs chan<- sdk.WorkflowNodeJobRun, errs chan<- error, delay time.Duration, ms ...cdsclient.RequestModifier) error {
j := sdk.WorkflowNodeJobRun{
ProjectID: 1,
ID: 777,
WorkflowNodeRunID: 1,
Status: sdk.StatusWaiting,
Job: sdk.ExecutedJob{
Job: sdk.Job{},
},
Start: time.Now(),
}

jobs <- j // Send the job a first time, it will trigger the first call on SpawnWorker

<-ctx.Done()
return ctx.Err()
},
)

// The pending-worker-creation bookkeeping must be checked, set and cleared
// exactly once for the single job. No Return is configured on
// IsJobAlreadyPendingWorkerCreation, so the mock yields the zero value (false).
mockHatchery.EXPECT().IsJobAlreadyPendingWorkerCreation(gomock.Any()).Times(1)
mockHatchery.EXPECT().SetJobInPendingWorkerCreation(gomock.Any()).Times(1)
mockHatchery.EXPECT().RemoveJobFromPendingWorkerCreation(gomock.Any()).Times(1)

// These calls are expected for each job received in the channel
mockCDSClient.EXPECT().WorkerList(gomock.Any()).Return(nil, nil).AnyTimes()
mockHatchery.EXPECT().WorkersStarted(gomock.Any()).Return(nil, nil).AnyTimes()
mockHatchery.EXPECT().CanSpawn(gomock.Any(), gomock.Any(), "777", gomock.Any()).Return(true).AnyTimes()
mockCDSClient.EXPECT().QueueJobBook(gomock.Any(), "777").Return(sdk.WorkflowNodeJobRunBooked{}, nil).AnyTimes()
mockCDSClient.EXPECT().QueueJobSendSpawnInfo(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes()
privateKey, err := jws.NewRandomRSAKey()
require.NoError(t, err)
mockHatchery.EXPECT().GetPrivateKey().Return(privateKey).AnyTimes()

// Exactly one call to SpawnWorker is expected, matching the single job sent in "QueuePolling"
mockHatchery.EXPECT().SpawnWorker(gomock.Any(), gomock.Any()).Return(nil).Times(1)

// NOTE(review): the value returned by hatchery.Create is discarded here —
// confirm whether the test should assert on it.
hatchery.Create(ctx, mockHatchery)

// Wait for the timeout so the expected calls have time to happen.
<-ctx.Done()
}

func TestCreate(t *testing.T) {
log.Factory = log.NewTestingWrapper(t)
ctx := context.TODO()
Expand All @@ -41,7 +115,6 @@ func TestCreate(t *testing.T) {
}
hatcheryConfig.Provision.MaxWorker = 1

hatchery.CacheSpawnIDsTTL = 2 * time.Second // decrease this cache TTL to speedup the test
hatcheryConfig.Provision.MaxAttemptsNumberBeforeFailure = 2 // decrease this value to speedup the test

mockHatchery.EXPECT().Name().Return(t.Name()).AnyTimes()
Expand All @@ -67,22 +140,26 @@ func TestCreate(t *testing.T) {

jobs <- j // Send the job a first time, it will trigger the first call on SpawnWorker
time.Sleep(1 * time.Second) // Wait
jobs <- j // This one must be ignored with a log "already spawned in previous routine"
jobs <- j // This will start the workerStarter, but failed on book in the real life
time.Sleep(2 * time.Second) // Wait
jobs <- j // This will trigger a second call on SpawnWorker
jobs <- j // This will trigger a second call on SpawnWorker should fail the job (nbAttempts: > 2) and call QueueSendResult
time.Sleep(3 * time.Second) // Wait
jobs <- j // This shoud not trigger the call on SpawnWorker but should fail the job
jobs <- j // This shoud not trigger the call on SpawnWorker but should fail the job (nbAttempts: > 2) and call QueueSendResult

<-ctx.Done()
return ctx.Err()
},
)

mockHatchery.EXPECT().IsJobAlreadyPendingWorkerCreation(gomock.Any()).Times(4)
mockHatchery.EXPECT().SetJobInPendingWorkerCreation(gomock.Any()).Times(2)
mockHatchery.EXPECT().RemoveJobFromPendingWorkerCreation(gomock.Any()).Times(2)

// These calls are expected for each job received in the channel
mockCDSClient.EXPECT().WorkerList(gomock.Any()).Return(nil, nil).AnyTimes()
mockHatchery.EXPECT().WorkersStarted(gomock.Any()).Return(nil, nil).AnyTimes()
mockHatchery.EXPECT().CanSpawn(gomock.Any(), gomock.Any(), "666", gomock.Any()).Return(true).AnyTimes()
mockCDSClient.EXPECT().QueueJobBook(gomock.Any(), "666").Return(sdk.WorkflowNodeJobRunBooked{}, nil).AnyTimes()
mockCDSClient.EXPECT().QueueJobBook(gomock.Any(), "666").Return(sdk.WorkflowNodeJobRunBooked{}, nil).Times(2)
mockCDSClient.EXPECT().QueueJobSendSpawnInfo(gomock.Any(), gomock.Any(), gomock.Any()).Return(nil).AnyTimes()
privateKey, err := jws.NewRandomRSAKey()
require.NoError(t, err)
Expand All @@ -92,20 +169,14 @@ func TestCreate(t *testing.T) {
mockHatchery.EXPECT().SpawnWorker(gomock.Any(), gomock.Any()).Return(nil).Times(2)

// Expecting a call to QueueSendResult
mockCDSClient.EXPECT().QueueSendResult(gomock.Any(), int64(666), gomock.Any()).Return(nil)
mockCDSClient.EXPECT().QueueSendResult(gomock.Any(), int64(666), gomock.Any()).Return(nil).Times(2)

hatchery.Create(ctx, mockHatchery)

<-ctx.Done()

}

// getMockLogger returns a new logrus logger with a *HookMock registered as a
// hook.
func getMockLogger() *logrus.Logger {
	// The local is deliberately not called "log", so it does not share the
	// name of the log package used elsewhere in this test file.
	logger := logrus.New()
	logger.AddHook(&HookMock{})
	return logger
}

// HookMock is an empty struct type; getMockLogger registers a *HookMock as a
// hook on the logrus logger it builds.
type HookMock struct{}

func (h *HookMock) Levels() []logrus.Level {
Expand Down
76 changes: 76 additions & 0 deletions sdk/hatchery/mock_hatchery/interface_mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading