From 5fe764b1eb7bf4da752b047c25794316f31b68d6 Mon Sep 17 00:00:00 2001
From: zeripath <art27@cantab.net>
Date: Fri, 25 Mar 2022 12:47:12 +0000
Subject: [PATCH] Add pprof labels in processes and for lifecycles (#19202)

Use pprof labels on process contexts and on the graceful manager's lifecycle
contexts so that goroutines can be identified when inspecting stacks and profiles.

Signed-off-by: Andrew Thornton <art27@cantab.net>
---
 modules/graceful/manager.go         | 50 +++++++++--------------------
 modules/graceful/manager_unix.go    | 17 ++++++++--
 modules/graceful/manager_windows.go | 17 ++++++++--
 modules/process/manager.go          | 19 ++++++-----
 4 files changed, 55 insertions(+), 48 deletions(-)

diff --git a/modules/graceful/manager.go b/modules/graceful/manager.go
index f78357360..8766cfca0 100644
--- a/modules/graceful/manager.go
+++ b/modules/graceful/manager.go
@@ -6,6 +6,7 @@ package graceful
 
 import (
 	"context"
+	"runtime/pprof"
 	"sync"
 	"time"
 
@@ -62,7 +63,6 @@ type WithCallback func(callback func())
 // Similarly the callback function provided to atTerminate must return once termination is complete.
 // Please note that use of the atShutdown and atTerminate callbacks will create go-routines that will wait till their respective signals
 // - users must therefore be careful to only call these as necessary.
-// If run is not expected to run indefinitely RunWithShutdownChan is likely to be more appropriate.
 type RunnableWithShutdownFns func(atShutdown, atTerminate func(func()))
 
 // RunWithShutdownFns takes a function that has both atShutdown and atTerminate callbacks
@@ -70,7 +70,6 @@ type RunnableWithShutdownFns func(atShutdown, atTerminate func(func()))
 // Similarly the callback function provided to atTerminate must return once termination is complete.
 // Please note that use of the atShutdown and atTerminate callbacks will create go-routines that will wait till their respective signals
 // - users must therefore be careful to only call these as necessary.
-// If run is not expected to run indefinitely RunWithShutdownChan is likely to be more appropriate.
 func (g *Manager) RunWithShutdownFns(run RunnableWithShutdownFns) {
 	g.runningServerWaitGroup.Add(1)
 	defer g.runningServerWaitGroup.Done()
@@ -98,32 +97,6 @@ func (g *Manager) RunWithShutdownFns(run RunnableWithShutdownFns) {
 	})
 }
 
-// RunnableWithShutdownChan is a runnable with functions to run at shutdown and terminate.
-// After the atShutdown channel is closed, the main function must return once shutdown is complete.
-// (Optionally IsHammer may be waited for instead however, this should be avoided if possible.)
-// The callback function provided to atTerminate must return once termination is complete.
-// Please note that use of the atTerminate function will create a go-routine that will wait till terminate - users must therefore be careful to only call this as necessary.
-type RunnableWithShutdownChan func(atShutdown <-chan struct{}, atTerminate WithCallback)
-
-// RunWithShutdownChan takes a function that has channel to watch for shutdown and atTerminate callbacks
-// After the atShutdown channel is closed, the main function must return once shutdown is complete.
-// (Optionally IsHammer may be waited for instead however, this should be avoided if possible.)
-// The callback function provided to atTerminate must return once termination is complete.
-// Please note that use of the atTerminate function will create a go-routine that will wait till terminate - users must therefore be careful to only call this as necessary.
-func (g *Manager) RunWithShutdownChan(run RunnableWithShutdownChan) {
-	g.runningServerWaitGroup.Add(1)
-	defer g.runningServerWaitGroup.Done()
-	defer func() {
-		if err := recover(); err != nil {
-			log.Critical("PANIC during RunWithShutdownChan: %v\nStacktrace: %s", err, log.Stack(2))
-			g.doShutdown()
-		}
-	}()
-	run(g.IsShutdown(), func(atTerminate func()) {
-		g.RunAtTerminate(atTerminate)
-	})
-}
-
 // RunWithShutdownContext takes a function that has a context to watch for shutdown.
 // After the provided context is Done(), the main function must return once shutdown is complete.
 // (Optionally the HammerContext may be obtained and waited for however, this should be avoided if possible.)
@@ -136,7 +109,9 @@ func (g *Manager) RunWithShutdownContext(run func(context.Context)) {
 			g.doShutdown()
 		}
 	}()
-	run(g.ShutdownContext())
+	ctx := g.ShutdownContext()
+	pprof.SetGoroutineLabels(ctx) // We don't have a label to restore back to but I think this is fine
+	run(ctx)
 }
 
 // RunAtTerminate adds to the terminate wait group and creates a go-routine to run the provided function at termination
@@ -198,6 +173,8 @@ func (g *Manager) doShutdown() {
 	}
 	g.lock.Lock()
 	g.shutdownCtxCancel()
+	atShutdownCtx := pprof.WithLabels(g.hammerCtx, pprof.Labels("graceful-lifecycle", "post-shutdown"))
+	pprof.SetGoroutineLabels(atShutdownCtx)
 	for _, fn := range g.toRunAtShutdown {
 		go fn()
 	}
@@ -214,7 +191,7 @@ func (g *Manager) doShutdown() {
 		g.doTerminate()
 		g.WaitForTerminate()
 		g.lock.Lock()
-		g.doneCtxCancel()
+		g.managerCtxCancel()
 		g.lock.Unlock()
 	}()
 }
@@ -227,6 +204,8 @@ func (g *Manager) doHammerTime(d time.Duration) {
 	default:
 		log.Warn("Setting Hammer condition")
 		g.hammerCtxCancel()
+		atHammerCtx := pprof.WithLabels(g.terminateCtx, pprof.Labels("graceful-lifecycle", "post-hammer"))
+		pprof.SetGoroutineLabels(atHammerCtx)
 		for _, fn := range g.toRunAtHammer {
 			go fn()
 		}
@@ -244,6 +223,9 @@ func (g *Manager) doTerminate() {
 	default:
 		log.Warn("Terminating")
 		g.terminateCtxCancel()
+		atTerminateCtx := pprof.WithLabels(g.managerCtx, pprof.Labels("graceful-lifecycle", "post-terminate"))
+		pprof.SetGoroutineLabels(atTerminateCtx)
+
 		for _, fn := range g.toRunAtTerminate {
 			go fn()
 		}
@@ -331,20 +313,20 @@ func (g *Manager) InformCleanup() {
 
 // Done allows the manager to be viewed as a context.Context, it returns a channel that is closed when the server is finished terminating
 func (g *Manager) Done() <-chan struct{} {
-	return g.doneCtx.Done()
+	return g.managerCtx.Done()
 }
 
 // Err allows the manager to be viewed as a context.Context done at Terminate
 func (g *Manager) Err() error {
-	return g.doneCtx.Err()
+	return g.managerCtx.Err()
 }
 
 // Value allows the manager to be viewed as a context.Context done at Terminate
 func (g *Manager) Value(key interface{}) interface{} {
-	return g.doneCtx.Value(key)
+	return g.managerCtx.Value(key)
 }
 
 // Deadline returns nil as there is no fixed Deadline for the manager, it allows the manager to be viewed as a context.Context
 func (g *Manager) Deadline() (deadline time.Time, ok bool) {
-	return g.doneCtx.Deadline()
+	return g.managerCtx.Deadline()
 }
diff --git a/modules/graceful/manager_unix.go b/modules/graceful/manager_unix.go
index 99e84d73e..6fbb2bda2 100644
--- a/modules/graceful/manager_unix.go
+++ b/modules/graceful/manager_unix.go
@@ -12,6 +12,7 @@ import (
 	"errors"
 	"os"
 	"os/signal"
+	"runtime/pprof"
 	"sync"
 	"syscall"
 	"time"
@@ -29,11 +30,11 @@ type Manager struct {
 	shutdownCtx            context.Context
 	hammerCtx              context.Context
 	terminateCtx           context.Context
-	doneCtx                context.Context
+	managerCtx             context.Context
 	shutdownCtxCancel      context.CancelFunc
 	hammerCtxCancel        context.CancelFunc
 	terminateCtxCancel     context.CancelFunc
-	doneCtxCancel          context.CancelFunc
+	managerCtxCancel       context.CancelFunc
 	runningServerWaitGroup sync.WaitGroup
 	createServerWaitGroup  sync.WaitGroup
 	terminateWaitGroup     sync.WaitGroup
@@ -58,7 +59,17 @@ func (g *Manager) start(ctx context.Context) {
 	g.terminateCtx, g.terminateCtxCancel = context.WithCancel(ctx)
 	g.shutdownCtx, g.shutdownCtxCancel = context.WithCancel(ctx)
 	g.hammerCtx, g.hammerCtxCancel = context.WithCancel(ctx)
-	g.doneCtx, g.doneCtxCancel = context.WithCancel(ctx)
+	g.managerCtx, g.managerCtxCancel = context.WithCancel(ctx)
+
+	// Next add pprof labels to these contexts
+	g.terminateCtx = pprof.WithLabels(g.terminateCtx, pprof.Labels("graceful-lifecycle", "with-terminate"))
+	g.shutdownCtx = pprof.WithLabels(g.shutdownCtx, pprof.Labels("graceful-lifecycle", "with-shutdown"))
+	g.hammerCtx = pprof.WithLabels(g.hammerCtx, pprof.Labels("graceful-lifecycle", "with-hammer"))
+	g.managerCtx = pprof.WithLabels(g.managerCtx, pprof.Labels("graceful-lifecycle", "with-manager"))
+
+	// Now label this and all goroutines created by this goroutine with the graceful-lifecycle manager
+	pprof.SetGoroutineLabels(g.managerCtx)
+	defer pprof.SetGoroutineLabels(ctx)
 
 	// Set the running state & handle signals
 	g.setState(stateRunning)
diff --git a/modules/graceful/manager_windows.go b/modules/graceful/manager_windows.go
index e5f5541ed..66baddfa3 100644
--- a/modules/graceful/manager_windows.go
+++ b/modules/graceful/manager_windows.go
@@ -11,6 +11,7 @@ package graceful
 import (
 	"context"
 	"os"
+	"runtime/pprof"
 	"strconv"
 	"sync"
 	"time"
@@ -40,11 +41,11 @@ type Manager struct {
 	shutdownCtx            context.Context
 	hammerCtx              context.Context
 	terminateCtx           context.Context
-	doneCtx                context.Context
+	managerCtx             context.Context
 	shutdownCtxCancel      context.CancelFunc
 	hammerCtxCancel        context.CancelFunc
 	terminateCtxCancel     context.CancelFunc
-	doneCtxCancel          context.CancelFunc
+	managerCtxCancel       context.CancelFunc
 	runningServerWaitGroup sync.WaitGroup
 	createServerWaitGroup  sync.WaitGroup
 	terminateWaitGroup     sync.WaitGroup
@@ -71,7 +72,17 @@ func (g *Manager) start() {
 	g.terminateCtx, g.terminateCtxCancel = context.WithCancel(g.ctx)
 	g.shutdownCtx, g.shutdownCtxCancel = context.WithCancel(g.ctx)
 	g.hammerCtx, g.hammerCtxCancel = context.WithCancel(g.ctx)
-	g.doneCtx, g.doneCtxCancel = context.WithCancel(g.ctx)
+	g.managerCtx, g.managerCtxCancel = context.WithCancel(g.ctx)
+
+	// Next add pprof labels to these contexts
+	g.terminateCtx = pprof.WithLabels(g.terminateCtx, pprof.Labels("graceful-lifecycle", "with-terminate"))
+	g.shutdownCtx = pprof.WithLabels(g.shutdownCtx, pprof.Labels("graceful-lifecycle", "with-shutdown"))
+	g.hammerCtx = pprof.WithLabels(g.hammerCtx, pprof.Labels("graceful-lifecycle", "with-hammer"))
+	g.managerCtx = pprof.WithLabels(g.managerCtx, pprof.Labels("graceful-lifecycle", "with-manager"))
+
+	// Now label this and all goroutines created by this goroutine with the graceful-lifecycle manager
+	pprof.SetGoroutineLabels(g.managerCtx)
+	defer pprof.SetGoroutineLabels(g.ctx)
 
 	// Make channels
 	g.shutdownRequested = make(chan struct{})
diff --git a/modules/process/manager.go b/modules/process/manager.go
index d9d2f8c3e..50dbbbe6c 100644
--- a/modules/process/manager.go
+++ b/modules/process/manager.go
@@ -11,6 +11,7 @@ import (
 	"fmt"
 	"io"
 	"os/exec"
+	"runtime/pprof"
 	"sort"
 	"strconv"
 	"sync"
@@ -66,11 +67,9 @@ func GetManager() *Manager {
 // Most processes will not need to use the cancel function but there will be cases whereby you want to cancel the process but not immediately remove it from the
 // process table.
 func (pm *Manager) AddContext(parent context.Context, description string) (ctx context.Context, cancel context.CancelFunc, finished FinishedFunc) {
-	parentPID := GetParentPID(parent)
-
 	ctx, cancel = context.WithCancel(parent)
 
-	pid, finished := pm.Add(parentPID, description, cancel)
+	ctx, pid, finished := pm.Add(ctx, description, cancel)
 
 	return &Context{
 		Context: ctx,
@@ -87,11 +86,9 @@ func (pm *Manager) AddContext(parent context.Context, description string) (ctx c
 // Most processes will not need to use the cancel function but there will be cases whereby you want to cancel the process but not immediately remove it from the
 // process table.
 func (pm *Manager) AddContextTimeout(parent context.Context, timeout time.Duration, description string) (ctx context.Context, cancel context.CancelFunc, finshed FinishedFunc) {
-	parentPID := GetParentPID(parent)
-
 	ctx, cancel = context.WithTimeout(parent, timeout)
 
-	pid, finshed := pm.Add(parentPID, description, cancel)
+	ctx, pid, finshed := pm.Add(ctx, description, cancel)
 
 	return &Context{
 		Context: ctx,
@@ -100,7 +97,9 @@ func (pm *Manager) AddContextTimeout(parent context.Context, timeout time.Durati
 }
 
 // Add create a new process
-func (pm *Manager) Add(parentPID IDType, description string, cancel context.CancelFunc) (IDType, FinishedFunc) {
+func (pm *Manager) Add(ctx context.Context, description string, cancel context.CancelFunc) (context.Context, IDType, FinishedFunc) {
+	parentPID := GetParentPID(ctx)
+
 	pm.mutex.Lock()
 	start, pid := pm.nextPID()
 
@@ -120,6 +119,7 @@ func (pm *Manager) Add(parentPID IDType, description string, cancel context.Canc
 	finished := func() {
 		cancel()
 		pm.remove(process)
+		pprof.SetGoroutineLabels(ctx)
 	}
 
 	if parent != nil {
@@ -128,7 +128,10 @@ func (pm *Manager) Add(parentPID IDType, description string, cancel context.Canc
 	pm.processes[pid] = process
 	pm.mutex.Unlock()
 
-	return pid, finished
+	pprofCtx := pprof.WithLabels(ctx, pprof.Labels("process-description", description, "ppid", string(parentPID), "pid", string(pid)))
+	pprof.SetGoroutineLabels(pprofCtx)
+
+	return pprofCtx, pid, finished
 }
 
 // nextPID will return the next available PID. pm.mutex should already be locked.