Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: allow stopping services in "starting" state #503

Merged
merged 14 commits into from
Oct 10, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 62 additions & 2 deletions internals/daemon/daemon_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import (
"github.com/canonical/pebble/internals/overlord"
"github.com/canonical/pebble/internals/overlord/patch"
"github.com/canonical/pebble/internals/overlord/restart"
"github.com/canonical/pebble/internals/overlord/servstate"
"github.com/canonical/pebble/internals/overlord/standby"
"github.com/canonical/pebble/internals/overlord/state"
"github.com/canonical/pebble/internals/reaper"
Expand Down Expand Up @@ -1148,7 +1149,7 @@ services:
// We have to wait for it be in running state.
for i := 0; ; i++ {
if i >= 25 {
c.Fatalf("timed out waiting or service to start")
c.Fatalf("timed out waiting for service to start")
}
d.state.Lock()
change := d.state.Change(rsp.Change)
Expand Down Expand Up @@ -1283,7 +1284,7 @@ services:
// We have to wait for it be in running state for StopRunning to stop it.
for i := 0; ; i++ {
if i >= 25 {
c.Fatalf("timed out waiting or service to start")
c.Fatalf("timed out waiting for service to start")
}
d.state.Lock()
change := d.state.Change(rsp.Change)
Expand Down Expand Up @@ -1317,6 +1318,65 @@ services:
c.Check(tasks[0].Kind(), Equals, "stop")
}

func (s *daemonSuite) TestStopWithinOkayDelay(c *C) {
// Start the daemon.
writeTestLayer(s.pebbleDir, `
services:
test1:
override: replace
command: sleep 10
`)
d := s.newDaemon(c)
err := d.Init()
c.Assert(err, IsNil)
c.Assert(d.Start(), IsNil)

// Start the test service.
payload := bytes.NewBufferString(`{"action": "start", "services": ["test1"]}`)
req, err := http.NewRequest("POST", "/v1/services", payload)
c.Assert(err, IsNil)
rsp := v1PostServices(apiCmd("/v1/services"), req, nil).(*resp)
rec := httptest.NewRecorder()
rsp.ServeHTTP(rec, req)
c.Check(rec.Result().StatusCode, Equals, 202)

// Waiting for the change to be in doing state cannot guarantee the service is
// in the starting state, so here we wait until the service is in the starting
// state. We wait up to 25*20=500ms to make sure there is still half a second
// left to stop the service before okayDelay.
for i := 0; i < 25; i++ {
svcInfo, err := d.overlord.ServiceManager().Services([]string{"test1"})
c.Assert(err, IsNil)
if len(svcInfo) > 0 && svcInfo[0].Current == servstate.StatusActive {
break
}
time.Sleep(20 * time.Millisecond)
}

// Stop the daemon, which should stop services in starting state. At this point,
// it should still be within the okayDelay.
err = d.Stop(nil)
c.Assert(err, IsNil)

// Ensure a "stop" change is created, along with its "stop" tasks.
d.state.Lock()
defer d.state.Unlock()
changes := d.state.Changes()
var change *state.Change
for _, ch := range changes {
if ch.Kind() == "stop" {
change = ch
}
}
if change == nil {
c.Fatalf("stop change not found")
}
c.Check(change.Status(), Equals, state.DoneStatus)
tasks := change.Tasks()
c.Assert(tasks, HasLen, 1)
c.Check(tasks[0].Kind(), Equals, "stop")
}

func (s *daemonSuite) TestWritesRequireAdminAccess(c *C) {
for _, cmd := range API {
if cmd.Path == "/v1/notices" {
Expand Down
4 changes: 4 additions & 0 deletions internals/overlord/servstate/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,10 @@ func (s *serviceData) stop() error {
defer s.manager.servicesLock.Unlock()

switch s.state {
case stateStarting:
s.started <- fmt.Errorf("stopped before the %s okay delay", okayDelay)
fallthrough

case stateRunning:
logger.Debugf("Attempting to stop service %q by sending SIGTERM", s.config.Name)
// First send SIGTERM to try to terminate it gracefully.
Expand Down
4 changes: 2 additions & 2 deletions internals/overlord/servstate/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -367,15 +367,15 @@ func servicesToStop(m *ServiceManager) ([][]string, error) {
return nil, err
}

// Filter down to only those that are running or in backoff
// Filter down to only those that are starting, running or in backoff
m.servicesLock.Lock()
defer m.servicesLock.Unlock()
var result [][]string
for _, services := range stop {
var notStopped []string
for _, name := range services {
s := m.services[name]
if s != nil && (s.state == stateRunning || s.state == stateBackoff) {
if s != nil && (s.state == stateStarting || s.state == stateRunning || s.state == stateBackoff) {
notStopped = append(notStopped, name)
}
}
Expand Down
52 changes: 52 additions & 0 deletions internals/overlord/servstate/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,58 @@ services:
c.Assert(s.manager.StopTimeout(), Equals, time.Minute*60+time.Millisecond*200)
}

func (s *S) TestStopServiceWithinOkayDelay(c *C) {
// A longer okayDelay is used so that the change for starting the service won't
// quickly transition into the running state.
fakeOkayDelay := 20 * shortOkayDelay
servstate.FakeOkayWait(fakeOkayDelay)

s.newServiceManager(c)
serviceName := "test-stop-within-okaywait"
// The service sleeps for fakeOkayDelay second then creates a side effect (a file at donePath).
layer := `
services:
%s:
override: replace
command: /bin/sh -c "sleep %g; {{.NotifyDoneCheck}}"
`
s.planAddLayer(c, fmt.Sprintf(layer, serviceName, fakeOkayDelay.Seconds()))
s.planChanged(c)

// Start the service without waiting for change ready.
s.st.Lock()
ts, err := servstate.Start(s.st, [][]string{{serviceName}})
c.Check(err, IsNil)
chgStart := s.st.NewChange("test", "Start test")
chgStart.AddAll(ts)
s.st.Unlock()
s.runner.Ensure()

// Wait until the service is in the starting state.
s.waitUntilService(c, serviceName, func(service *servstate.ServiceInfo) bool {
return service.Current == servstate.StatusActive
})

// Stop the service within okayDelay.
chg := s.stopServices(c, [][]string{{serviceName}})
s.st.Lock()
c.Assert(chg.Err(), IsNil)
s.st.Unlock()

waitChangeReady(c, s.runner, chgStart, "Start test")

s.st.Lock()
c.Check(chgStart.Status(), Equals, state.ErrorStatus)
c.Check(chgStart.Err(), ErrorMatches, fmt.Sprintf(`(?s).*stopped before the %s okay delay.*`, fakeOkayDelay))
s.st.Unlock()

donePath := filepath.Join(s.dir, serviceName)
// If the service is stopped within okayDelay and is indeed terminated, donePath should not exist.
if _, err := os.Stat(donePath); err == nil {
c.Fatalf("service %s waiting for service output", serviceName)
}
}

func (s *S) TestKillDelayIsUsed(c *C) {
s.newServiceManager(c)
s.planAddLayer(c, testPlanLayer)
Expand Down
1 change: 1 addition & 0 deletions internals/overlord/servstate/state-diagram.dot
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ digraph service_state_machine {
node [penwidth=1]
initial -> starting [label="start"]
starting -> running [label="okay wait\nelapsed"]
starting -> terminating [label="stop (before\nokay wait elapses)"]
running -> terminating [label="stop"]
running -> terminating [label="check failed\n(action \"restart\")"]
terminating -> killing [label="terminate time\nelapsed"]
Expand Down
Loading
Loading