mirror of https://github.com/docker/compose.git
up: fix various race/deadlock conditions on exit (#10934)
If running `up` in foreground mode (i.e. not `-d`), when exiting via `Ctrl-C`, Compose stops all the services it launched directly as part of that `up` command.

In one of the E2E tests (`TestUpDependenciesNotStopped`), this was occasionally flaking because the stop behavior was racy: the return might not block on the stop operation because it gets added to the error group in a goroutine. As a result, it was possible for no services to get terminated on exit.

There were a few other related pieces here that I uncovered and tried to fix while stressing this. For example, the printer could cause a deadlock if an event was sent to it after it stopped. Also, an error group wasn't really appropriate here; each goroutine is a different operation for printing, signal-handling, etc. If one part fails, we don't actually want printing to stop, for example. This has been switched to a `multierror.Group`, which has the same API but coalesces errors instead of canceling a context the moment the first one fails and returning that single error.

Signed-off-by: Milas Bowman <milas.bowman@docker.com>
parent d0dfb848df
commit 407a0d5b53
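The commit message above describes swapping `errgroup.Group` for `multierror.Group` from `github.com/hashicorp/go-multierror`. As a minimal sketch of that difference (illustration only, not code from this commit): a group created with `errgroup.WithContext` cancels its shared context as soon as one goroutine returns an error and `Wait` surfaces only that first error, while `multierror.Group` exposes the same `Go`/`Wait` shape but lets every goroutine run to completion and coalesces whatever errors occurred; `ErrorOrNil()` collapses the no-error case to nil, matching the `err = eg.Wait().ErrorOrNil()` call in the diff below.

package main

import (
	"errors"
	"fmt"

	"github.com/hashicorp/go-multierror"
)

func main() {
	var eg multierror.Group

	// Unlike errgroup.WithContext, there is no shared context that gets
	// canceled when the first function fails; each one runs to completion.
	eg.Go(func() error { return errors.New("printer: simulated failure") })
	eg.Go(func() error { return nil })

	// Wait returns a *multierror.Error; ErrorOrNil turns "no errors" into nil,
	// which is why the diff below calls eg.Wait().ErrorOrNil().
	if err := eg.Wait().ErrorOrNil(); err != nil {
		fmt.Println(err)
	}
}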
@@ -18,6 +18,7 @@ package compose
 
 import (
 	"fmt"
+	"sync/atomic"
 
 	"github.com/docker/compose/v2/pkg/api"
 )
@@ -33,32 +34,37 @@ type logPrinter interface {
 type printer struct {
 	queue    chan api.ContainerEvent
 	consumer api.LogConsumer
-	stopCh   chan struct{}
+	stopped  atomic.Bool
 }
 
 // newLogPrinter builds a LogPrinter passing containers logs to LogConsumer
 func newLogPrinter(consumer api.LogConsumer) logPrinter {
 	queue := make(chan api.ContainerEvent)
-	stopCh := make(chan struct{}, 1) // printer MAY stop on his own, so Stop MUST not be blocking
 	printer := printer{
 		consumer: consumer,
 		queue:    queue,
-		stopCh:   stopCh,
 	}
 	return &printer
 }
 
 func (p *printer) Cancel() {
-	p.queue <- api.ContainerEvent{
-		Type: api.UserCancel,
-	}
+	// note: HandleEvent is used to ensure this doesn't deadlock
+	p.HandleEvent(api.ContainerEvent{Type: api.UserCancel})
 }
 
 func (p *printer) Stop() {
-	p.stopCh <- struct{}{}
+	if p.stopped.CompareAndSwap(false, true) {
+		// only close if this is the first call to stop
+		close(p.queue)
+	}
 }
 
 func (p *printer) HandleEvent(event api.ContainerEvent) {
+	// prevent deadlocking, if the printer is done, there's no reader for
+	// queue, so this write could block indefinitely
+	if p.stopped.Load() {
+		return
+	}
 	p.queue <- event
 }
 
@@ -69,61 +75,57 @@ func (p *printer) Run(cascadeStop bool, exitCodeFrom string, stopFn func() error
 		exitCode int
 	)
 	containers := map[string]struct{}{}
-	for {
-		select {
-		case <-p.stopCh:
-			return exitCode, nil
-		case event := <-p.queue:
-			container, id := event.Container, event.ID
-			switch event.Type {
-			case api.UserCancel:
-				aborting = true
-			case api.ContainerEventAttach:
-				if _, ok := containers[id]; ok {
-					continue
-				}
-				containers[id] = struct{}{}
-				p.consumer.Register(container)
-			case api.ContainerEventExit, api.ContainerEventStopped, api.ContainerEventRecreated:
-				if !event.Restarting {
-					delete(containers, id)
-				}
-				if !aborting {
-					p.consumer.Status(container, fmt.Sprintf("exited with code %d", event.ExitCode))
-					if event.Type == api.ContainerEventRecreated {
-						p.consumer.Status(container, "has been recreated")
-					}
-				}
-				if cascadeStop {
-					if !aborting {
-						aborting = true
-						err := stopFn()
-						if err != nil {
-							return 0, err
-						}
-					}
-					if event.Type == api.ContainerEventExit {
-						if exitCodeFrom == "" {
-							exitCodeFrom = event.Service
-						}
-						if exitCodeFrom == event.Service {
-							exitCode = event.ExitCode
-						}
-					}
-				}
-				if len(containers) == 0 {
-					// Last container terminated, done
-					return exitCode, nil
-				}
-			case api.ContainerEventLog:
-				if !aborting {
-					p.consumer.Log(container, event.Line)
-				}
-			case api.ContainerEventErr:
-				if !aborting {
-					p.consumer.Err(container, event.Line)
-				}
-			}
-		}
-	}
+	for event := range p.queue {
+		container, id := event.Container, event.ID
+		switch event.Type {
+		case api.UserCancel:
+			aborting = true
+		case api.ContainerEventAttach:
+			if _, ok := containers[id]; ok {
+				continue
+			}
+			containers[id] = struct{}{}
+			p.consumer.Register(container)
+		case api.ContainerEventExit, api.ContainerEventStopped, api.ContainerEventRecreated:
+			if !event.Restarting {
+				delete(containers, id)
+			}
+			if !aborting {
+				p.consumer.Status(container, fmt.Sprintf("exited with code %d", event.ExitCode))
+				if event.Type == api.ContainerEventRecreated {
+					p.consumer.Status(container, "has been recreated")
+				}
+			}
+			if cascadeStop {
+				if !aborting {
+					aborting = true
+					err := stopFn()
+					if err != nil {
+						return 0, err
+					}
+				}
+				if event.Type == api.ContainerEventExit {
+					if exitCodeFrom == "" {
+						exitCodeFrom = event.Service
+					}
+					if exitCodeFrom == event.Service {
+						exitCode = event.ExitCode
+					}
+				}
+			}
+			if len(containers) == 0 {
+				// Last container terminated, done
+				return exitCode, nil
+			}
+		case api.ContainerEventLog:
+			if !aborting {
+				p.consumer.Log(container, event.Line)
+			}
+		case api.ContainerEventErr:
+			if !aborting {
+				p.consumer.Err(container, event.Line)
+			}
+		}
+	}
+	return exitCode, nil
 }
@@ -21,15 +21,15 @@ import (
 	"fmt"
 	"os"
 	"os/signal"
+	"sync"
 	"syscall"
 
-	"github.com/docker/compose/v2/internal/tracing"
-
 	"github.com/compose-spec/compose-go/types"
 	"github.com/docker/cli/cli"
+	"github.com/docker/compose/v2/internal/tracing"
 	"github.com/docker/compose/v2/pkg/api"
 	"github.com/docker/compose/v2/pkg/progress"
-	"golang.org/x/sync/errgroup"
+	"github.com/hashicorp/go-multierror"
 )
 
 func (s *composeService) Up(ctx context.Context, project *types.Project, options api.UpOptions) error {
@@ -55,39 +55,60 @@ func (s *composeService) Up(ctx context.Context, project *types.Project, options
 		return err
 	}
 
-	printer := newLogPrinter(options.Start.Attach)
-
-	signalChan := make(chan os.Signal, 1)
+	// if we get a second signal during shutdown, we kill the services
+	// immediately, so the channel needs to have sufficient capacity or
+	// we might miss a signal while setting up the second channel read
+	// (this is also why signal.Notify is used vs signal.NotifyContext)
+	signalChan := make(chan os.Signal, 2)
 	signal.Notify(signalChan, syscall.SIGINT, syscall.SIGTERM)
+	signalCancel := sync.OnceFunc(func() {
+		signal.Stop(signalChan)
+		close(signalChan)
+	})
+	defer signalCancel()
 
+	printer := newLogPrinter(options.Start.Attach)
 	stopFunc := func() error {
 		fmt.Fprintln(s.stdinfo(), "Aborting on container exit...")
 		ctx := context.Background()
 		return progress.Run(ctx, func(ctx context.Context) error {
+			// race two goroutines - one that blocks until another signal is received
+			// and then does a Kill() and one that immediately starts a friendly Stop()
+			errCh := make(chan error, 1)
 			go func() {
-				<-signalChan
-				s.Kill(ctx, project.Name, api.KillOptions{ //nolint:errcheck
+				if _, ok := <-signalChan; !ok {
+					// channel closed, so the outer function is done, which
+					// means the other goroutine (calling Stop()) finished
+					return
+				}
+				errCh <- s.Kill(ctx, project.Name, api.KillOptions{
 					Services: options.Create.Services,
 					Project:  project,
 				})
 			}()
 
-			return s.Stop(ctx, project.Name, api.StopOptions{
-				Services: options.Create.Services,
-				Project:  project,
-			})
+			go func() {
+				errCh <- s.Stop(ctx, project.Name, api.StopOptions{
+					Services: options.Create.Services,
+					Project:  project,
+				})
+			}()
+			return <-errCh
 		}, s.stdinfo())
 	}
 
 	var isTerminated bool
-	eg, ctx := errgroup.WithContext(ctx)
-	go func() {
-		<-signalChan
+	var eg multierror.Group
+	eg.Go(func() error {
+		if _, ok := <-signalChan; !ok {
+			// function finished without receiving a signal
+			return nil
+		}
 		isTerminated = true
 		printer.Cancel()
 		fmt.Fprintln(s.stdinfo(), "Gracefully stopping... (press Ctrl+C again to force)")
-		eg.Go(stopFunc)
-	}()
+		return stopFunc()
+	})
 
 	var exitCode int
 	eg.Go(func() error {
@@ -101,8 +122,10 @@ func (s *composeService) Up(ctx context.Context, project *types.Project, options
 		return err
 	}
 
+	// signal for the goroutines to stop & wait for them to finish any remaining work
+	signalCancel()
 	printer.Stop()
-	err = eg.Wait()
+	err = eg.Wait().ErrorOrNil()
 	if exitCode != 0 {
 		errMsg := ""
 		if err != nil {
@@ -28,10 +28,11 @@ import (
 // (running or exited).
 func RequireServiceState(t testing.TB, cli *CLI, service string, state string) {
 	t.Helper()
-	psRes := cli.RunDockerComposeCmd(t, "ps", "--format=json", service)
+	psRes := cli.RunDockerComposeCmd(t, "ps", "--all", "--format=json", service)
 	var svc map[string]interface{}
 	require.NoError(t, json.Unmarshal([]byte(psRes.Stdout()), &svc),
-		"Invalid `compose ps` JSON output")
+		"Invalid `compose ps` JSON: command output: %s",
+		psRes.Combined())
 
 	require.Equal(t, service, svc["Service"],
 		"Found ps output for unexpected service")
@@ -21,7 +21,6 @@ package e2e
 
 import (
 	"context"
-	"os"
 	"os/exec"
 	"strings"
 	"syscall"
@@ -45,9 +44,6 @@ func TestUpServiceUnhealthy(t *testing.T) {
 }
 
 func TestUpDependenciesNotStopped(t *testing.T) {
-	if _, ok := os.LookupEnv("CI"); ok {
-		t.Skip("Skipping test on CI... flaky")
-	}
 	c := NewParallelCLI(t, WithEnv(
 		"COMPOSE_PROJECT_NAME=up-deps-stop",
 	))
@@ -76,8 +72,8 @@ func TestUpDependenciesNotStopped(t *testing.T) {
 		"app",
 	)
 
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
+	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
+	t.Cleanup(cancel)
 
 	cmd, err := StartWithNewGroupID(ctx, testCmd, upOut, nil)
 	assert.NilError(t, err, "Failed to run compose up")
@@ -91,12 +87,13 @@ func TestUpDependenciesNotStopped(t *testing.T) {
 	require.NoError(t, syscall.Kill(-cmd.Process.Pid, syscall.SIGINT),
 		"Failed to send SIGINT to compose up process")
 
-	time.AfterFunc(5*time.Second, cancel)
-
 	t.Log("Waiting for `compose up` to exit")
 	err = cmd.Wait()
 	if err != nil {
 		exitErr := err.(*exec.ExitError)
+		if exitErr.ExitCode() == -1 {
+			t.Fatalf("`compose up` was killed: %v", err)
+		}
 		require.EqualValues(t, exitErr.ExitCode(), 130)
 	}
 