Skip to content

Commit

Permalink
fix: allow stopping services in the starting state (#512)
Browse files Browse the repository at this point in the history
Previously, we introduced [a fix to allow stopping services in the
starting state](#503).

Because of this fix, we discovered [another deadlock
issue](#508), so we rolled it
back. Now that the deadlock issue is [fixed by this
PR](https://github.com/canonical/pebble/pull/511/files), we are
reintroducing the fix about "allowing stopping services in the starting
state".

Benchmark test, manual test to reproduce the 3-way deadlock issue, and
race test are all done and passed.
  • Loading branch information
IronCore864 authored Oct 21, 2024
1 parent ae91836 commit fc1d18b
Show file tree
Hide file tree
Showing 7 changed files with 250 additions and 129 deletions.
64 changes: 62 additions & 2 deletions internals/daemon/daemon_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import (
"github.com/canonical/pebble/internals/overlord"
"github.com/canonical/pebble/internals/overlord/patch"
"github.com/canonical/pebble/internals/overlord/restart"
"github.com/canonical/pebble/internals/overlord/servstate"
"github.com/canonical/pebble/internals/overlord/standby"
"github.com/canonical/pebble/internals/overlord/state"
"github.com/canonical/pebble/internals/reaper"
Expand Down Expand Up @@ -1148,7 +1149,7 @@ services:
// We have to wait for it be in running state.
for i := 0; ; i++ {
if i >= 25 {
c.Fatalf("timed out waiting or service to start")
c.Fatalf("timed out waiting for service to start")
}
d.state.Lock()
change := d.state.Change(rsp.Change)
Expand Down Expand Up @@ -1283,7 +1284,7 @@ services:
// We have to wait for it be in running state for StopRunning to stop it.
for i := 0; ; i++ {
if i >= 25 {
c.Fatalf("timed out waiting or service to start")
c.Fatalf("timed out waiting for service to start")
}
d.state.Lock()
change := d.state.Change(rsp.Change)
Expand Down Expand Up @@ -1317,6 +1318,65 @@ services:
c.Check(tasks[0].Kind(), Equals, "stop")
}

func (s *daemonSuite) TestStopWithinOkayDelay(c *C) {
// Start the daemon.
writeTestLayer(s.pebbleDir, `
services:
test1:
override: replace
command: sleep 10
`)
d := s.newDaemon(c)
err := d.Init()
c.Assert(err, IsNil)
c.Assert(d.Start(), IsNil)

// Start the test service.
payload := bytes.NewBufferString(`{"action": "start", "services": ["test1"]}`)
req, err := http.NewRequest("POST", "/v1/services", payload)
c.Assert(err, IsNil)
rsp := v1PostServices(apiCmd("/v1/services"), req, nil).(*resp)
rec := httptest.NewRecorder()
rsp.ServeHTTP(rec, req)
c.Check(rec.Result().StatusCode, Equals, 202)

// Waiting for the change to be in doing state cannot guarantee the service is
// in the starting state, so here we wait until the service is in the starting
// state. We wait up to 25*20=500ms to make sure there is still half a second
// left to stop the service before okayDelay.
for i := 0; i < 25; i++ {
svcInfo, err := d.overlord.ServiceManager().Services([]string{"test1"})
c.Assert(err, IsNil)
if len(svcInfo) > 0 && svcInfo[0].Current == servstate.StatusActive {
break
}
time.Sleep(20 * time.Millisecond)
}

// Stop the daemon, which should stop services in starting state. At this point,
// it should still be within the okayDelay.
err = d.Stop(nil)
c.Assert(err, IsNil)

// Ensure a "stop" change is created, along with its "stop" tasks.
d.state.Lock()
defer d.state.Unlock()
changes := d.state.Changes()
var change *state.Change
for _, ch := range changes {
if ch.Kind() == "stop" {
change = ch
}
}
if change == nil {
c.Fatalf("stop change not found")
}
c.Check(change.Status(), Equals, state.DoneStatus)
tasks := change.Tasks()
c.Assert(tasks, HasLen, 1)
c.Check(tasks[0].Kind(), Equals, "stop")
}

func (s *daemonSuite) TestWritesRequireAdminAccess(c *C) {
for _, cmd := range API {
if cmd.Path == "/v1/notices" {
Expand Down
4 changes: 4 additions & 0 deletions internals/overlord/servstate/handlers.go
Original file line number Diff line number Diff line change
Expand Up @@ -682,6 +682,10 @@ func (s *serviceData) stop() error {
defer s.manager.servicesLock.Unlock()

switch s.state {
case stateStarting:
s.started <- fmt.Errorf("stopped before the %s okay delay", okayDelay)
fallthrough

case stateRunning:
logger.Debugf("Attempting to stop service %q by sending SIGTERM", s.config.Name)
// First send SIGTERM to try to terminate it gracefully.
Expand Down
4 changes: 2 additions & 2 deletions internals/overlord/servstate/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -367,15 +367,15 @@ func servicesToStop(m *ServiceManager) ([][]string, error) {
return nil, err
}

// Filter down to only those that are running or in backoff
// Filter down to only those that are starting, running or in backoff
m.servicesLock.Lock()
defer m.servicesLock.Unlock()
var result [][]string
for _, services := range stop {
var notStopped []string
for _, name := range services {
s := m.services[name]
if s != nil && (s.state == stateRunning || s.state == stateBackoff) {
if s != nil && (s.state == stateStarting || s.state == stateRunning || s.state == stateBackoff) {
notStopped = append(notStopped, name)
}
}
Expand Down
52 changes: 52 additions & 0 deletions internals/overlord/servstate/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,58 @@ services:
c.Assert(s.manager.StopTimeout(), Equals, time.Minute*60+time.Millisecond*200)
}

func (s *S) TestStopServiceWithinOkayDelay(c *C) {
// A longer okayDelay is used so that the change for starting the service won't
// quickly transition into the running state.
fakeOkayDelay := 20 * shortOkayDelay
servstate.FakeOkayWait(fakeOkayDelay)

s.newServiceManager(c)
serviceName := "test-stop-within-okaywait"
// The service sleeps for fakeOkayDelay second then creates a side effect (a file at donePath).
layer := `
services:
%s:
override: replace
command: /bin/sh -c "sleep %g; {{.NotifyDoneCheck}}"
`
s.planAddLayer(c, fmt.Sprintf(layer, serviceName, fakeOkayDelay.Seconds()))
s.planChanged(c)

// Start the service without waiting for change ready.
s.st.Lock()
ts, err := servstate.Start(s.st, [][]string{{serviceName}})
c.Check(err, IsNil)
chgStart := s.st.NewChange("test", "Start test")
chgStart.AddAll(ts)
s.st.Unlock()
s.runner.Ensure()

// Wait until the service is in the starting state.
s.waitUntilService(c, serviceName, func(service *servstate.ServiceInfo) bool {
return service.Current == servstate.StatusActive
})

// Stop the service within okayDelay.
chg := s.stopServices(c, [][]string{{serviceName}})
s.st.Lock()
c.Assert(chg.Err(), IsNil)
s.st.Unlock()

waitChangeReady(c, s.runner, chgStart, "Start test")

s.st.Lock()
c.Check(chgStart.Status(), Equals, state.ErrorStatus)
c.Check(chgStart.Err(), ErrorMatches, fmt.Sprintf(`(?s).*stopped before the %s okay delay.*`, fakeOkayDelay))
s.st.Unlock()

donePath := filepath.Join(s.dir, serviceName)
// If the service is stopped within okayDelay and is indeed terminated, donePath should not exist.
if _, err := os.Stat(donePath); err == nil {
c.Fatalf("service %s waiting for service output", serviceName)
}
}

func (s *S) TestKillDelayIsUsed(c *C) {
s.newServiceManager(c)
s.planAddLayer(c, testPlanLayer)
Expand Down
1 change: 1 addition & 0 deletions internals/overlord/servstate/state-diagram.dot
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ digraph service_state_machine {
node [penwidth=1]
initial -> starting [label="start"]
starting -> running [label="okay wait\nelapsed"]
starting -> terminating [label="stop (before\nokay wait elapses)"]
running -> terminating [label="stop"]
running -> terminating [label="check failed\n(action \"restart\")"]
terminating -> killing [label="terminate time\nelapsed"]
Expand Down
Loading

0 comments on commit fc1d18b

Please sign in to comment.