Add pprof labels in processes and for lifecycles (#19202)

Use pprof labelling to help identify goroutines with stacks.

Signed-off-by: Andrew Thornton <art27@cantab.net>
This commit is contained in:
zeripath 2022-03-25 12:47:12 +00:00 committed by GitHub
parent e48f3b0527
commit 5fe764b1eb
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 55 additions and 48 deletions

View File

@ -6,6 +6,7 @@ package graceful
import ( import (
"context" "context"
"runtime/pprof"
"sync" "sync"
"time" "time"
@ -62,7 +63,6 @@ type WithCallback func(callback func())
// Similarly the callback function provided to atTerminate must return once termination is complete. // Similarly the callback function provided to atTerminate must return once termination is complete.
// Please note that use of the atShutdown and atTerminate callbacks will create go-routines that will wait till their respective signals // Please note that use of the atShutdown and atTerminate callbacks will create go-routines that will wait till their respective signals
// - users must therefore be careful to only call these as necessary. // - users must therefore be careful to only call these as necessary.
// If run is not expected to run indefinitely RunWithShutdownChan is likely to be more appropriate.
type RunnableWithShutdownFns func(atShutdown, atTerminate func(func())) type RunnableWithShutdownFns func(atShutdown, atTerminate func(func()))
// RunWithShutdownFns takes a function that has both atShutdown and atTerminate callbacks // RunWithShutdownFns takes a function that has both atShutdown and atTerminate callbacks
@ -70,7 +70,6 @@ type RunnableWithShutdownFns func(atShutdown, atTerminate func(func()))
// Similarly the callback function provided to atTerminate must return once termination is complete. // Similarly the callback function provided to atTerminate must return once termination is complete.
// Please note that use of the atShutdown and atTerminate callbacks will create go-routines that will wait till their respective signals // Please note that use of the atShutdown and atTerminate callbacks will create go-routines that will wait till their respective signals
// - users must therefore be careful to only call these as necessary. // - users must therefore be careful to only call these as necessary.
// If run is not expected to run indefinitely RunWithShutdownChan is likely to be more appropriate.
func (g *Manager) RunWithShutdownFns(run RunnableWithShutdownFns) { func (g *Manager) RunWithShutdownFns(run RunnableWithShutdownFns) {
g.runningServerWaitGroup.Add(1) g.runningServerWaitGroup.Add(1)
defer g.runningServerWaitGroup.Done() defer g.runningServerWaitGroup.Done()
@ -98,32 +97,6 @@ func (g *Manager) RunWithShutdownFns(run RunnableWithShutdownFns) {
}) })
} }
// RunnableWithShutdownChan is a runnable with functions to run at shutdown and terminate.
// After the atShutdown channel is closed, the main function must return once shutdown is complete.
// (Optionally IsHammer may be waited for instead however, this should be avoided if possible.)
// The callback function provided to atTerminate must return once termination is complete.
// Please note that use of the atTerminate function will create a go-routine that will wait till terminate - users must therefore be careful to only call this as necessary.
type RunnableWithShutdownChan func(atShutdown <-chan struct{}, atTerminate WithCallback)
// RunWithShutdownChan takes a function that has channel to watch for shutdown and atTerminate callbacks
// After the atShutdown channel is closed, the main function must return once shutdown is complete.
// (Optionally IsHammer may be waited for instead however, this should be avoided if possible.)
// The callback function provided to atTerminate must return once termination is complete.
// Please note that use of the atTerminate function will create a go-routine that will wait till terminate - users must therefore be careful to only call this as necessary.
func (g *Manager) RunWithShutdownChan(run RunnableWithShutdownChan) {
g.runningServerWaitGroup.Add(1)
defer g.runningServerWaitGroup.Done()
defer func() {
if err := recover(); err != nil {
log.Critical("PANIC during RunWithShutdownChan: %v\nStacktrace: %s", err, log.Stack(2))
g.doShutdown()
}
}()
run(g.IsShutdown(), func(atTerminate func()) {
g.RunAtTerminate(atTerminate)
})
}
// RunWithShutdownContext takes a function that has a context to watch for shutdown. // RunWithShutdownContext takes a function that has a context to watch for shutdown.
// After the provided context is Done(), the main function must return once shutdown is complete. // After the provided context is Done(), the main function must return once shutdown is complete.
// (Optionally the HammerContext may be obtained and waited for however, this should be avoided if possible.) // (Optionally the HammerContext may be obtained and waited for however, this should be avoided if possible.)
@ -136,7 +109,9 @@ func (g *Manager) RunWithShutdownContext(run func(context.Context)) {
g.doShutdown() g.doShutdown()
} }
}() }()
run(g.ShutdownContext()) ctx := g.ShutdownContext()
pprof.SetGoroutineLabels(ctx) // We don't have a label to restore back to but I think this is fine
run(ctx)
} }
// RunAtTerminate adds to the terminate wait group and creates a go-routine to run the provided function at termination // RunAtTerminate adds to the terminate wait group and creates a go-routine to run the provided function at termination
@ -198,6 +173,8 @@ func (g *Manager) doShutdown() {
} }
g.lock.Lock() g.lock.Lock()
g.shutdownCtxCancel() g.shutdownCtxCancel()
atShutdownCtx := pprof.WithLabels(g.hammerCtx, pprof.Labels("graceful-lifecycle", "post-shutdown"))
pprof.SetGoroutineLabels(atShutdownCtx)
for _, fn := range g.toRunAtShutdown { for _, fn := range g.toRunAtShutdown {
go fn() go fn()
} }
@ -214,7 +191,7 @@ func (g *Manager) doShutdown() {
g.doTerminate() g.doTerminate()
g.WaitForTerminate() g.WaitForTerminate()
g.lock.Lock() g.lock.Lock()
g.doneCtxCancel() g.managerCtxCancel()
g.lock.Unlock() g.lock.Unlock()
}() }()
} }
@ -227,6 +204,8 @@ func (g *Manager) doHammerTime(d time.Duration) {
default: default:
log.Warn("Setting Hammer condition") log.Warn("Setting Hammer condition")
g.hammerCtxCancel() g.hammerCtxCancel()
atHammerCtx := pprof.WithLabels(g.terminateCtx, pprof.Labels("graceful-lifecycle", "post-hammer"))
pprof.SetGoroutineLabels(atHammerCtx)
for _, fn := range g.toRunAtHammer { for _, fn := range g.toRunAtHammer {
go fn() go fn()
} }
@ -244,6 +223,9 @@ func (g *Manager) doTerminate() {
default: default:
log.Warn("Terminating") log.Warn("Terminating")
g.terminateCtxCancel() g.terminateCtxCancel()
atTerminateCtx := pprof.WithLabels(g.managerCtx, pprof.Labels("graceful-lifecycle", "post-terminate"))
pprof.SetGoroutineLabels(atTerminateCtx)
for _, fn := range g.toRunAtTerminate { for _, fn := range g.toRunAtTerminate {
go fn() go fn()
} }
@ -331,20 +313,20 @@ func (g *Manager) InformCleanup() {
// Done allows the manager to be viewed as a context.Context, it returns a channel that is closed when the server is finished terminating // Done allows the manager to be viewed as a context.Context, it returns a channel that is closed when the server is finished terminating
func (g *Manager) Done() <-chan struct{} { func (g *Manager) Done() <-chan struct{} {
return g.doneCtx.Done() return g.managerCtx.Done()
} }
// Err allows the manager to be viewed as a context.Context done at Terminate // Err allows the manager to be viewed as a context.Context done at Terminate
func (g *Manager) Err() error { func (g *Manager) Err() error {
return g.doneCtx.Err() return g.managerCtx.Err()
} }
// Value allows the manager to be viewed as a context.Context done at Terminate // Value allows the manager to be viewed as a context.Context done at Terminate
func (g *Manager) Value(key interface{}) interface{} { func (g *Manager) Value(key interface{}) interface{} {
return g.doneCtx.Value(key) return g.managerCtx.Value(key)
} }
// Deadline returns nil as there is no fixed Deadline for the manager, it allows the manager to be viewed as a context.Context // Deadline returns nil as there is no fixed Deadline for the manager, it allows the manager to be viewed as a context.Context
func (g *Manager) Deadline() (deadline time.Time, ok bool) { func (g *Manager) Deadline() (deadline time.Time, ok bool) {
return g.doneCtx.Deadline() return g.managerCtx.Deadline()
} }

View File

@ -12,6 +12,7 @@ import (
"errors" "errors"
"os" "os"
"os/signal" "os/signal"
"runtime/pprof"
"sync" "sync"
"syscall" "syscall"
"time" "time"
@ -29,11 +30,11 @@ type Manager struct {
shutdownCtx context.Context shutdownCtx context.Context
hammerCtx context.Context hammerCtx context.Context
terminateCtx context.Context terminateCtx context.Context
doneCtx context.Context managerCtx context.Context
shutdownCtxCancel context.CancelFunc shutdownCtxCancel context.CancelFunc
hammerCtxCancel context.CancelFunc hammerCtxCancel context.CancelFunc
terminateCtxCancel context.CancelFunc terminateCtxCancel context.CancelFunc
doneCtxCancel context.CancelFunc managerCtxCancel context.CancelFunc
runningServerWaitGroup sync.WaitGroup runningServerWaitGroup sync.WaitGroup
createServerWaitGroup sync.WaitGroup createServerWaitGroup sync.WaitGroup
terminateWaitGroup sync.WaitGroup terminateWaitGroup sync.WaitGroup
@ -58,7 +59,17 @@ func (g *Manager) start(ctx context.Context) {
g.terminateCtx, g.terminateCtxCancel = context.WithCancel(ctx) g.terminateCtx, g.terminateCtxCancel = context.WithCancel(ctx)
g.shutdownCtx, g.shutdownCtxCancel = context.WithCancel(ctx) g.shutdownCtx, g.shutdownCtxCancel = context.WithCancel(ctx)
g.hammerCtx, g.hammerCtxCancel = context.WithCancel(ctx) g.hammerCtx, g.hammerCtxCancel = context.WithCancel(ctx)
g.doneCtx, g.doneCtxCancel = context.WithCancel(ctx) g.managerCtx, g.managerCtxCancel = context.WithCancel(ctx)
// Next add pprof labels to these contexts
g.terminateCtx = pprof.WithLabels(g.terminateCtx, pprof.Labels("graceful-lifecycle", "with-terminate"))
g.shutdownCtx = pprof.WithLabels(g.shutdownCtx, pprof.Labels("graceful-lifecycle", "with-shutdown"))
g.hammerCtx = pprof.WithLabels(g.hammerCtx, pprof.Labels("graceful-lifecycle", "with-hammer"))
g.managerCtx = pprof.WithLabels(g.managerCtx, pprof.Labels("graceful-lifecycle", "with-manager"))
// Now label this and all goroutines created by this goroutine with the graceful-lifecycle manager
pprof.SetGoroutineLabels(g.managerCtx)
defer pprof.SetGoroutineLabels(ctx)
// Set the running state & handle signals // Set the running state & handle signals
g.setState(stateRunning) g.setState(stateRunning)

View File

@ -11,6 +11,7 @@ package graceful
import ( import (
"context" "context"
"os" "os"
"runtime/pprof"
"strconv" "strconv"
"sync" "sync"
"time" "time"
@ -40,11 +41,11 @@ type Manager struct {
shutdownCtx context.Context shutdownCtx context.Context
hammerCtx context.Context hammerCtx context.Context
terminateCtx context.Context terminateCtx context.Context
doneCtx context.Context managerCtx context.Context
shutdownCtxCancel context.CancelFunc shutdownCtxCancel context.CancelFunc
hammerCtxCancel context.CancelFunc hammerCtxCancel context.CancelFunc
terminateCtxCancel context.CancelFunc terminateCtxCancel context.CancelFunc
doneCtxCancel context.CancelFunc managerCtxCancel context.CancelFunc
runningServerWaitGroup sync.WaitGroup runningServerWaitGroup sync.WaitGroup
createServerWaitGroup sync.WaitGroup createServerWaitGroup sync.WaitGroup
terminateWaitGroup sync.WaitGroup terminateWaitGroup sync.WaitGroup
@ -71,7 +72,17 @@ func (g *Manager) start() {
g.terminateCtx, g.terminateCtxCancel = context.WithCancel(g.ctx) g.terminateCtx, g.terminateCtxCancel = context.WithCancel(g.ctx)
g.shutdownCtx, g.shutdownCtxCancel = context.WithCancel(g.ctx) g.shutdownCtx, g.shutdownCtxCancel = context.WithCancel(g.ctx)
g.hammerCtx, g.hammerCtxCancel = context.WithCancel(g.ctx) g.hammerCtx, g.hammerCtxCancel = context.WithCancel(g.ctx)
g.doneCtx, g.doneCtxCancel = context.WithCancel(g.ctx) g.managerCtx, g.managerCtxCancel = context.WithCancel(g.ctx)
// Next add pprof labels to these contexts
g.terminateCtx = pprof.WithLabels(g.terminateCtx, pprof.Labels("graceful-lifecycle", "with-terminate"))
g.shutdownCtx = pprof.WithLabels(g.shutdownCtx, pprof.Labels("graceful-lifecycle", "with-shutdown"))
g.hammerCtx = pprof.WithLabels(g.hammerCtx, pprof.Labels("graceful-lifecycle", "with-hammer"))
g.managerCtx = pprof.WithLabels(g.managerCtx, pprof.Labels("graceful-lifecycle", "with-manager"))
// Now label this and all goroutines created by this goroutine with the graceful-lifecycle manager
pprof.SetGoroutineLabels(g.managerCtx)
defer pprof.SetGoroutineLabels(g.ctx)
// Make channels // Make channels
g.shutdownRequested = make(chan struct{}) g.shutdownRequested = make(chan struct{})

View File

@ -11,6 +11,7 @@ import (
"fmt" "fmt"
"io" "io"
"os/exec" "os/exec"
"runtime/pprof"
"sort" "sort"
"strconv" "strconv"
"sync" "sync"
@ -66,11 +67,9 @@ func GetManager() *Manager {
// Most processes will not need to use the cancel function but there will be cases whereby you want to cancel the process but not immediately remove it from the // Most processes will not need to use the cancel function but there will be cases whereby you want to cancel the process but not immediately remove it from the
// process table. // process table.
func (pm *Manager) AddContext(parent context.Context, description string) (ctx context.Context, cancel context.CancelFunc, finished FinishedFunc) { func (pm *Manager) AddContext(parent context.Context, description string) (ctx context.Context, cancel context.CancelFunc, finished FinishedFunc) {
parentPID := GetParentPID(parent)
ctx, cancel = context.WithCancel(parent) ctx, cancel = context.WithCancel(parent)
pid, finished := pm.Add(parentPID, description, cancel) ctx, pid, finished := pm.Add(ctx, description, cancel)
return &Context{ return &Context{
Context: ctx, Context: ctx,
@ -87,11 +86,9 @@ func (pm *Manager) AddContext(parent context.Context, description string) (ctx c
// Most processes will not need to use the cancel function but there will be cases whereby you want to cancel the process but not immediately remove it from the // Most processes will not need to use the cancel function but there will be cases whereby you want to cancel the process but not immediately remove it from the
// process table. // process table.
func (pm *Manager) AddContextTimeout(parent context.Context, timeout time.Duration, description string) (ctx context.Context, cancel context.CancelFunc, finshed FinishedFunc) { func (pm *Manager) AddContextTimeout(parent context.Context, timeout time.Duration, description string) (ctx context.Context, cancel context.CancelFunc, finshed FinishedFunc) {
parentPID := GetParentPID(parent)
ctx, cancel = context.WithTimeout(parent, timeout) ctx, cancel = context.WithTimeout(parent, timeout)
pid, finshed := pm.Add(parentPID, description, cancel) ctx, pid, finshed := pm.Add(ctx, description, cancel)
return &Context{ return &Context{
Context: ctx, Context: ctx,
@ -100,7 +97,9 @@ func (pm *Manager) AddContextTimeout(parent context.Context, timeout time.Durati
} }
// Add create a new process // Add create a new process
func (pm *Manager) Add(parentPID IDType, description string, cancel context.CancelFunc) (IDType, FinishedFunc) { func (pm *Manager) Add(ctx context.Context, description string, cancel context.CancelFunc) (context.Context, IDType, FinishedFunc) {
parentPID := GetParentPID(ctx)
pm.mutex.Lock() pm.mutex.Lock()
start, pid := pm.nextPID() start, pid := pm.nextPID()
@ -120,6 +119,7 @@ func (pm *Manager) Add(parentPID IDType, description string, cancel context.Canc
finished := func() { finished := func() {
cancel() cancel()
pm.remove(process) pm.remove(process)
pprof.SetGoroutineLabels(ctx)
} }
if parent != nil { if parent != nil {
@ -128,7 +128,10 @@ func (pm *Manager) Add(parentPID IDType, description string, cancel context.Canc
pm.processes[pid] = process pm.processes[pid] = process
pm.mutex.Unlock() pm.mutex.Unlock()
return pid, finished pprofCtx := pprof.WithLabels(ctx, pprof.Labels("process-description", description, "ppid", string(parentPID), "pid", string(pid)))
pprof.SetGoroutineLabels(pprofCtx)
return pprofCtx, pid, finished
} }
// nextPID will return the next available PID. pm.mutex should already be locked. // nextPID will return the next available PID. pm.mutex should already be locked.