up: fix various race/deadlock conditions on exit (#10934)

When running `up` in foreground mode (i.e. without `-d`),
exiting via `Ctrl-C` makes Compose stop all the services
it launched directly as part of that `up` command.

In one of the E2E tests (`TestUpDependenciesNotStopped`),
this occasionally flaked because the stop behavior was
racy: the return might not block on the stop operation,
since it was added to the error group from a separate
goroutine. As a result, it was possible for no services
to be terminated on exit.
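
A minimal standalone sketch of that racy shape (toy names,
not the Compose code): the stop work is only handed to the
group from inside another goroutine, so `Wait` can return
before it is ever registered.

```go
package main

import (
    "fmt"
    "time"

    "golang.org/x/sync/errgroup"
)

func main() {
    signalChan := make(chan struct{}, 1)
    stopped := make(chan struct{})

    var eg errgroup.Group

    // stand-in for the signal handler: it only registers the
    // stop with the group once a "signal" arrives
    go func() {
        <-signalChan
        // calling Go concurrently with Wait is exactly the hazard:
        // Wait may already have returned and won't block on this
        eg.Go(func() error {
            defer close(stopped)
            fmt.Println("stopping services...")
            time.Sleep(100 * time.Millisecond)
            return nil
        })
    }()

    // stand-in for the attach/start work; a "Ctrl-C" arrives
    // while it is still running
    eg.Go(func() error {
        signalChan <- struct{}{}
        return nil
    })

    _ = eg.Wait() // often returns before the stop above is registered

    select {
    case <-stopped:
        fmt.Println("stop completed before exit")
    default:
        fmt.Println("exiting without waiting for stop") // the flake
    }
}
```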

There were a few other related pieces here that
I uncovered and tried to fix while stressing this.
For example, the printer could cause a deadlock if
an event was sent to it after it stopped.
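
That deadlock reduces to a plain unbuffered-channel hazard.
A minimal standalone sketch (toy types, not the printer
code): once the consumer loop has exited, a late send
blocks forever.

```go
package main

import "fmt"

type event struct{ line string }

func main() {
    queue := make(chan event) // unbuffered, like the printer's queue
    stop := make(chan struct{})
    done := make(chan struct{})

    // consumer loop: exits as soon as it is told to stop
    go func() {
        defer close(done)
        for {
            select {
            case <-stop:
                return
            case ev := <-queue:
                fmt.Println(ev.line)
            }
        }
    }()

    stop <- struct{}{} // the printer stops...
    <-done

    // ...but a straggling container event is still handed to it;
    // with no reader left, this send blocks forever (the Go runtime
    // aborts with "all goroutines are asleep - deadlock!")
    queue <- event{line: "late event"}
}
```

The printer change below guards sends with an `atomic.Bool`
and closes the queue instead, so late events are dropped.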

Also, an error group wasn't really appropriate here;
each goroutine is a different operation for printing,
signal-handling, etc. If one part fails, we don't
actually want printing to stop, for example. This has
been switched to a `multierror.Group`, which has the
same API but coalesces errors, instead of canceling a
shared context and returning only the first error the
moment one goroutine fails.
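
A minimal sketch of that difference (toy errors, not the
real goroutines), using `github.com/hashicorp/go-multierror`:

```go
package main

import (
    "errors"
    "fmt"

    "github.com/hashicorp/go-multierror"
)

func main() {
    var eg multierror.Group

    // same Go/Wait shape as errgroup.Group, but no shared context
    // is canceled when one goroutine fails, and Wait keeps every error
    eg.Go(func() error { return errors.New("signal handler failed") })
    eg.Go(func() error { return nil }) // e.g. printing keeps running fine
    eg.Go(func() error { return errors.New("start failed") })

    err := eg.Wait().ErrorOrNil() // nil only if every goroutine returned nil
    fmt.Println(err)              // both failures are reported, coalesced
}
```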

Signed-off-by: Milas Bowman <milas.bowman@docker.com>
Milas Bowman 2023-08-31 10:47:14 -04:00 committed by GitHub
parent d0dfb848df
commit 407a0d5b53
4 changed files with 105 additions and 82 deletions


@@ -18,6 +18,7 @@ package compose
 import (
     "fmt"
+    "sync/atomic"

     "github.com/docker/compose/v2/pkg/api"
 )
@@ -33,32 +34,37 @@ type logPrinter interface {
 type printer struct {
     queue    chan api.ContainerEvent
     consumer api.LogConsumer
-    stopCh   chan struct{}
+    stopped  atomic.Bool
 }

 // newLogPrinter builds a LogPrinter passing containers logs to LogConsumer
 func newLogPrinter(consumer api.LogConsumer) logPrinter {
     queue := make(chan api.ContainerEvent)
-    stopCh := make(chan struct{}, 1) // printer MAY stop on his own, so Stop MUST not be blocking
     printer := printer{
         consumer: consumer,
         queue:    queue,
-        stopCh:   stopCh,
     }
     return &printer
 }

 func (p *printer) Cancel() {
-    p.queue <- api.ContainerEvent{
-        Type: api.UserCancel,
-    }
+    // note: HandleEvent is used to ensure this doesn't deadlock
+    p.HandleEvent(api.ContainerEvent{Type: api.UserCancel})
 }

 func (p *printer) Stop() {
-    p.stopCh <- struct{}{}
+    if p.stopped.CompareAndSwap(false, true) {
+        // only close if this is the first call to stop
+        close(p.queue)
+    }
 }

 func (p *printer) HandleEvent(event api.ContainerEvent) {
+    // prevent deadlocking, if the printer is done, there's no reader for
+    // queue, so this write could block indefinitely
+    if p.stopped.Load() {
+        return
+    }
     p.queue <- event
 }
@@ -69,61 +75,57 @@ func (p *printer) Run(cascadeStop bool, exitCodeFrom string, stopFn func() error
         exitCode int
     )
     containers := map[string]struct{}{}
-    for {
-        select {
-        case <-p.stopCh:
-            return exitCode, nil
-        case event := <-p.queue:
-            container, id := event.Container, event.ID
-            switch event.Type {
-            case api.UserCancel:
-                aborting = true
-            case api.ContainerEventAttach:
-                if _, ok := containers[id]; ok {
-                    continue
-                }
-                containers[id] = struct{}{}
-                p.consumer.Register(container)
-            case api.ContainerEventExit, api.ContainerEventStopped, api.ContainerEventRecreated:
-                if !event.Restarting {
-                    delete(containers, id)
-                }
-                if !aborting {
-                    p.consumer.Status(container, fmt.Sprintf("exited with code %d", event.ExitCode))
-                    if event.Type == api.ContainerEventRecreated {
-                        p.consumer.Status(container, "has been recreated")
-                    }
-                }
-                if cascadeStop {
-                    if !aborting {
-                        aborting = true
-                        err := stopFn()
-                        if err != nil {
-                            return 0, err
-                        }
-                    }
-                    if event.Type == api.ContainerEventExit {
-                        if exitCodeFrom == "" {
-                            exitCodeFrom = event.Service
-                        }
-                        if exitCodeFrom == event.Service {
-                            exitCode = event.ExitCode
-                        }
-                    }
-                }
-                if len(containers) == 0 {
-                    // Last container terminated, done
-                    return exitCode, nil
-                }
-            case api.ContainerEventLog:
-                if !aborting {
-                    p.consumer.Log(container, event.Line)
-                }
-            case api.ContainerEventErr:
-                if !aborting {
-                    p.consumer.Err(container, event.Line)
-                }
-            }
-        }
-    }
+    for event := range p.queue {
+        container, id := event.Container, event.ID
+        switch event.Type {
+        case api.UserCancel:
+            aborting = true
+        case api.ContainerEventAttach:
+            if _, ok := containers[id]; ok {
+                continue
+            }
+            containers[id] = struct{}{}
+            p.consumer.Register(container)
+        case api.ContainerEventExit, api.ContainerEventStopped, api.ContainerEventRecreated:
+            if !event.Restarting {
+                delete(containers, id)
+            }
+            if !aborting {
+                p.consumer.Status(container, fmt.Sprintf("exited with code %d", event.ExitCode))
+                if event.Type == api.ContainerEventRecreated {
+                    p.consumer.Status(container, "has been recreated")
+                }
+            }
+            if cascadeStop {
+                if !aborting {
+                    aborting = true
+                    err := stopFn()
+                    if err != nil {
+                        return 0, err
+                    }
+                }
+                if event.Type == api.ContainerEventExit {
+                    if exitCodeFrom == "" {
+                        exitCodeFrom = event.Service
+                    }
+                    if exitCodeFrom == event.Service {
+                        exitCode = event.ExitCode
+                    }
+                }
+            }
+            if len(containers) == 0 {
+                // Last container terminated, done
+                return exitCode, nil
+            }
+        case api.ContainerEventLog:
+            if !aborting {
+                p.consumer.Log(container, event.Line)
+            }
+        case api.ContainerEventErr:
+            if !aborting {
+                p.consumer.Err(container, event.Line)
+            }
+        }
+    }
+    return exitCode, nil
 }


@@ -21,15 +21,15 @@ import (
     "fmt"
     "os"
     "os/signal"
+    "sync"
     "syscall"

-    "github.com/docker/compose/v2/internal/tracing"
-
     "github.com/compose-spec/compose-go/types"
     "github.com/docker/cli/cli"
+    "github.com/docker/compose/v2/internal/tracing"
     "github.com/docker/compose/v2/pkg/api"
     "github.com/docker/compose/v2/pkg/progress"
-    "golang.org/x/sync/errgroup"
+    "github.com/hashicorp/go-multierror"
 )

 func (s *composeService) Up(ctx context.Context, project *types.Project, options api.UpOptions) error {
@@ -55,39 +55,60 @@ func (s *composeService) Up(ctx context.Context, project *types.Project, options
         return err
     }

-    printer := newLogPrinter(options.Start.Attach)
-
-    signalChan := make(chan os.Signal, 1)
+    // if we get a second signal during shutdown, we kill the services
+    // immediately, so the channel needs to have sufficient capacity or
+    // we might miss a signal while setting up the second channel read
+    // (this is also why signal.Notify is used vs signal.NotifyContext)
+    signalChan := make(chan os.Signal, 2)
     signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM)
+    signalCancel := sync.OnceFunc(func() {
+        signal.Stop(signalChan)
+        close(signalChan)
+    })
+    defer signalCancel()
+
+    printer := newLogPrinter(options.Start.Attach)

     stopFunc := func() error {
         fmt.Fprintln(s.stdinfo(), "Aborting on container exit...")
         ctx := context.Background()
         return progress.Run(ctx, func(ctx context.Context) error {
+            // race two goroutines - one that blocks until another signal is received
+            // and then does a Kill() and one that immediately starts a friendly Stop()
+            errCh := make(chan error, 1)
             go func() {
-                <-signalChan
-                s.Kill(ctx, project.Name, api.KillOptions{ //nolint:errcheck
+                if _, ok := <-signalChan; !ok {
+                    // channel closed, so the outer function is done, which
+                    // means the other goroutine (calling Stop()) finished
+                    return
+                }
+                errCh <- s.Kill(ctx, project.Name, api.KillOptions{
                     Services: options.Create.Services,
                     Project:  project,
                 })
             }()
-            return s.Stop(ctx, project.Name, api.StopOptions{
-                Services: options.Create.Services,
-                Project:  project,
-            })
+            go func() {
+                errCh <- s.Stop(ctx, project.Name, api.StopOptions{
+                    Services: options.Create.Services,
+                    Project:  project,
+                })
+            }()
+            return <-errCh
         }, s.stdinfo())
     }

     var isTerminated bool
-    eg, ctx := errgroup.WithContext(ctx)
-    go func() {
-        <-signalChan
+    var eg multierror.Group
+    eg.Go(func() error {
+        if _, ok := <-signalChan; !ok {
+            // function finished without receiving a signal
+            return nil
+        }
         isTerminated = true
         printer.Cancel()
         fmt.Fprintln(s.stdinfo(), "Gracefully stopping... (press Ctrl+C again to force)")
-        eg.Go(stopFunc)
-    }()
+        return stopFunc()
+    })

     var exitCode int
     eg.Go(func() error {
@@ -101,8 +122,10 @@ func (s *composeService) Up(ctx context.Context, project *types.Project, options
         return err
     }

+    // signal for the goroutines to stop & wait for them to finish any remaining work
+    signalCancel()
     printer.Stop()
-    err = eg.Wait()
+    err = eg.Wait().ErrorOrNil()
     if exitCode != 0 {
         errMsg := ""
         if err != nil {


@@ -28,10 +28,11 @@ import (
 // (running or exited).
 func RequireServiceState(t testing.TB, cli *CLI, service string, state string) {
     t.Helper()
-    psRes := cli.RunDockerComposeCmd(t, "ps", "--format=json", service)
+    psRes := cli.RunDockerComposeCmd(t, "ps", "--all", "--format=json", service)
     var svc map[string]interface{}
     require.NoError(t, json.Unmarshal([]byte(psRes.Stdout()), &svc),
-        "Invalid `compose ps` JSON output")
+        "Invalid `compose ps` JSON: command output: %s",
+        psRes.Combined())

     require.Equal(t, service, svc["Service"],
         "Found ps output for unexpected service")


@@ -21,7 +21,6 @@ package e2e
 import (
     "context"
-    "os"
     "os/exec"
     "strings"
     "syscall"
@@ -45,9 +44,6 @@ func TestUpServiceUnhealthy(t *testing.T) {
 }

 func TestUpDependenciesNotStopped(t *testing.T) {
-    if _, ok := os.LookupEnv("CI"); ok {
-        t.Skip("Skipping test on CI... flaky")
-    }
     c := NewParallelCLI(t, WithEnv(
         "COMPOSE_PROJECT_NAME=up-deps-stop",
     ))
@@ -76,8 +72,8 @@ func TestUpDependenciesNotStopped(t *testing.T) {
         "app",
     )

-    ctx, cancel := context.WithCancel(context.Background())
-    defer cancel()
+    ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
+    t.Cleanup(cancel)

     cmd, err := StartWithNewGroupID(ctx, testCmd, upOut, nil)
     assert.NilError(t, err, "Failed to run compose up")
@@ -91,12 +87,13 @@ func TestUpDependenciesNotStopped(t *testing.T) {
     require.NoError(t, syscall.Kill(-cmd.Process.Pid, syscall.SIGINT),
         "Failed to send SIGINT to compose up process")

-    time.AfterFunc(5*time.Second, cancel)
-
     t.Log("Waiting for `compose up` to exit")
     err = cmd.Wait()
     if err != nil {
         exitErr := err.(*exec.ExitError)
+        if exitErr.ExitCode() == -1 {
+            t.Fatalf("`compose up` was killed: %v", err)
+        }
         require.EqualValues(t, exitErr.ExitCode(), 130)
     }