foundationdb/fdbkubernetesmonitor/monitor.go

653 lines
21 KiB
Go

// monitor.go
//
// This source file is part of the FoundationDB open source project
//
// Copyright 2021-2024 Apple Inc. and the FoundationDB project authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package main
import (
"bufio"
"context"
"crypto/tls"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"net/http/pprof"
"os"
"os/exec"
"os/signal"
"path"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/apple/foundationdb/fdbkubernetesmonitor/api"
"github.com/apple/foundationdb/fdbkubernetesmonitor/internal/certloader"
"github.com/fsnotify/fsnotify"
"github.com/go-logr/logr"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/collectors"
"github.com/prometheus/client_golang/prometheus/promhttp"
"k8s.io/apimachinery/pkg/api/equality"
"k8s.io/utils/pointer"
)
const (
// maxErrorBackoffSeconds is the maximum time to wait after a process fails before starting another process.
// The actual delay will be based on the observed errors and will increase until maxErrorBackoffSeconds is hit.
maxErrorBackoffSeconds = 60 * time.Second
// fdbClusterFilePath defines the default path to the fdb cluster file that contains the current connection string.
// This file is managed by the fdbserver processes itself and they will automatically update the file if the
// coordinators have changed.
fdbClusterFilePath = "/var/fdb/data/fdb.cluster"
)
// monitor provides the main monitor loop
type monitor struct {
// configFile defines the path to the config file to load.
configFile string
// currentContainerVersion defines the version of the container. This will be the same as the fdbserver version.
currentContainerVersion api.Version
// customEnvironment defines the custom environment variables to use when
// interpreting the monitor configuration.
customEnvironment map[string]string
// activeConfiguration defines the active process configuration.
activeConfiguration *api.ProcessConfiguration
// activeConfigurationBytes defines the source data for the active process
// configuration.
activeConfigurationBytes []byte
// lastConfigurationTime is the last time we successfully reloaded the
// configuration file.
lastConfigurationTime time.Time
// processCount defines how many processes the
processCount int
// processIDs stores the PIDs of the processes that are running. A PID of
// zero will indicate that a process does not have a run loop. A PID of -1
// will indicate that a process has a run loop but is not currently running
// the subprocess.
processIDs []int
// mutex defines a mutex around working with configuration.
// This is used to synchronize access to local state like the active
// configuration and the process IDs from multiple goroutines.
mutex sync.Mutex
// podClient is a client for posting updates about this pod to
// Kubernetes.
podClient *kubernetesClient
// logger is the logger instance for this monitor.
logger logr.Logger
// metrics represents the prometheus monitor metrics.
metrics *metrics
}
type httpConfig struct {
listenAddr, certPath, keyPath, rootCaPath string
}
// startMonitor starts the monitor loop.
func startMonitor(ctx context.Context, logger logr.Logger, configFile string, customEnvironment map[string]string, processCount int, promConfig httpConfig, enableDebug bool, currentContainerVersion api.Version, enableNodeWatcher bool) {
client, err := createPodClient(ctx, logger, enableNodeWatcher, setupCache)
if err != nil {
logger.Error(err, "could not create Pod client")
os.Exit(1)
}
mon := &monitor{
configFile: configFile,
podClient: client,
logger: logger,
customEnvironment: customEnvironment,
processCount: processCount,
processIDs: make([]int, processCount+1),
currentContainerVersion: currentContainerVersion,
}
go func() { mon.watchPodTimestamps() }()
mux := http.NewServeMux()
// Enable pprof endpoints for debugging purposes.
if enableDebug {
mux.Handle("/debug/pprof/heap", pprof.Handler("heap"))
mux.Handle("/debug/pprof/goroutine", pprof.Handler("goroutine"))
mux.Handle("/debug/pprof/threadcreate", pprof.Handler("threadcreate"))
mux.Handle("/debug/pprof/allocs", pprof.Handler("allocs"))
mux.Handle("/debug/pprof/block", pprof.Handler("block"))
mux.Handle("/debug/pprof/mutex", pprof.Handler("mutex"))
mux.HandleFunc("/debug/pprof/", pprof.Index)
mux.HandleFunc("/debug/pprof/profile", pprof.Profile)
mux.HandleFunc("/debug/pprof/symbol", pprof.Symbol)
mux.HandleFunc("/debug/pprof/cmdline", pprof.Cmdline)
mux.HandleFunc("/debug/pprof/trace", pprof.Trace)
}
reg := prometheus.NewRegistry()
// Enable the default go metrics.
reg.MustRegister(collectors.NewGoCollector())
monitorMetrics := registerMetrics(reg)
mon.metrics = monitorMetrics
promHandler := promhttp.HandlerFor(reg, promhttp.HandlerOpts{})
// Add Prometheus support
mux.Handle("/metrics", promHandler)
go func() {
if promConfig.keyPath != "" || promConfig.certPath != "" {
certLoader := certloader.NewCertLoader(logger, promConfig.certPath, promConfig.keyPath)
tlsConfig := &tls.Config{
GetCertificate: certLoader.GetCertificate,
}
server := &http.Server{
Addr: promConfig.listenAddr,
Handler: mux,
TLSConfig: tlsConfig,
}
err = server.ListenAndServeTLS("", "")
if err != nil {
logger.Error(err, "could not start HTTPS server")
os.Exit(1)
}
}
err := http.ListenAndServe(promConfig.listenAddr, mux)
if err != nil {
logger.Error(err, "could not start HTTP server")
os.Exit(1)
}
}()
mon.run()
}
// updateCustomEnvironment will add the node labels and their values to the custom environment map. All the generated
// environment variables will start with NODE_LABEL and "/" and "." will be replaced in the key as "_", e.g. from the
// label "foundationdb.org/testing = awesome" the env variables NODE_LABEL_FOUNDATIONDB_ORG_TESTING = awesome" will be
// generated.
func (monitor *monitor) updateCustomEnvironmentFromNodeMetadata() {
if monitor.podClient.nodeMetadata == nil {
return
}
nodeLabels := monitor.podClient.nodeMetadata.Labels
for key, value := range nodeLabels {
sanitizedKey := strings.ReplaceAll(key, "/", "_")
sanitizedKey = strings.ReplaceAll(sanitizedKey, ".", "_")
envKey := "NODE_LABEL_" + strings.ToUpper(sanitizedKey)
currentValue, ok := monitor.customEnvironment[envKey]
if !ok {
monitor.logger.Info("adding new custom environment variable from node labels", "key", envKey, "value", value)
monitor.customEnvironment[envKey] = value
continue
}
if currentValue == value {
continue
}
monitor.logger.Info("update custom environment variable from node labels", "key", envKey, "newValue", value, "currentValue", currentValue)
monitor.customEnvironment[envKey] = value
continue
}
}
// readConfiguration reads the latest configuration from the monitor file.
func (monitor *monitor) readConfiguration() (*api.ProcessConfiguration, []byte) {
file, err := os.Open(monitor.configFile)
if err != nil {
monitor.logger.Error(err, "Error reading monitor config file", "monitorConfigPath", monitor.configFile)
return nil, nil
}
defer func() {
err := file.Close()
monitor.logger.Error(err, "Error could not close file", "monitorConfigPath", monitor.configFile)
}()
configuration := &api.ProcessConfiguration{}
configurationBytes, err := io.ReadAll(file)
if err != nil {
monitor.logger.Error(err, "Error reading monitor configuration", "monitorConfigPath", monitor.configFile)
}
err = json.Unmarshal(configurationBytes, configuration)
if err != nil {
monitor.logger.Error(err, "Error parsing monitor configuration", "rawConfiguration", string(configurationBytes))
return nil, nil
}
if configuration.Version == nil {
monitor.logger.Error(err, "Error could not parse configured version", "rawConfiguration", string(configurationBytes))
return nil, nil
}
// If the versions are protocol compatible don't try to point to another binary path. Otherwise, the processes will
// cannot restart when a process crashes during a patch upgrade.
if monitor.currentContainerVersion.IsProtocolCompatible(*configuration.Version) {
configuration.BinaryPath = fdbserverPath
} else {
configuration.BinaryPath = path.Join(sharedBinaryDir, configuration.Version.String(), "fdbserver")
}
err = checkOwnerExecutable(configuration.BinaryPath)
if err != nil {
monitor.logger.Error(err, "Error with binary path for latest configuration", "configuration", configuration, "binaryPath", configuration.BinaryPath)
return nil, nil
}
monitor.updateCustomEnvironmentFromNodeMetadata()
_, err = configuration.GenerateArguments(1, monitor.customEnvironment)
if err != nil {
monitor.logger.Error(err, "Error generating arguments for latest configuration", "configuration", configuration, "binaryPath", configuration.BinaryPath)
return nil, nil
}
if configuration.ShouldRunServers() {
// In case that the process is isolated we don't want to start the servers and we should terminate the running fdbserver
// instances.
if monitor.processIsIsolated() {
configuration.RunServers = pointer.Bool(false)
}
}
return configuration, configurationBytes
}
// loadConfiguration loads the latest configuration from the config file.
func (monitor *monitor) loadConfiguration() {
configuration, configurationBytes := monitor.readConfiguration()
if configuration == nil || len(configurationBytes) == 0 {
return
}
monitor.acceptConfiguration(configuration, configurationBytes)
}
// checkOwnerExecutable validates that a path is a file that exists and is
// executable by its owner.
func checkOwnerExecutable(path string) error {
binaryStat, err := os.Stat(path)
if err != nil {
return err
}
if binaryStat.Mode()&0o100 == 0 {
return fmt.Errorf("binary is not executable")
}
return nil
}
// acceptConfiguration is called when the monitor process parses and accepts
// a configuration from the local config file.
func (monitor *monitor) acceptConfiguration(configuration *api.ProcessConfiguration, configurationBytes []byte) {
monitor.mutex.Lock()
defer monitor.mutex.Unlock()
// If the configuration hasn't changed ignore those events to prevent noisy logging.
if equality.Semantic.DeepEqual(monitor.activeConfiguration, configuration) {
return
}
monitor.logger.Info("Received new configuration file", "configuration", configuration)
monitor.activeConfiguration = configuration
monitor.activeConfigurationBytes = configurationBytes
monitor.lastConfigurationTime = time.Now()
// Update the prometheus metrics.
monitor.metrics.registerConfigurationChange(configuration.Version.String())
var hasRunningProcesses bool
for processNumber := 1; processNumber <= monitor.processCount; processNumber++ {
if monitor.processIDs[processNumber] == 0 {
monitor.processIDs[processNumber] = -1
tempNumber := processNumber
go func() { monitor.runProcess(tempNumber) }()
continue
}
hasRunningProcesses = true
}
// If the monitor has running processes but the processes shouldn't be running, kill them with SIGTERM.
if hasRunningProcesses && !monitor.activeConfiguration.ShouldRunServers() {
monitor.sendSignalToProcesses(syscall.SIGTERM)
}
err := monitor.podClient.updateAnnotations(monitor)
if err != nil {
monitor.logger.Error(err, "Error updating pod annotations")
}
}
// getBackoffDuration returns the backoff duration. The backoff time will increase exponential with a maximum of 60 seconds.
func getBackoffDuration(errorCounter int) time.Duration {
timeToBackoff := time.Duration(errorCounter*errorCounter) * time.Second
if timeToBackoff > maxErrorBackoffSeconds {
return maxErrorBackoffSeconds
}
return timeToBackoff
}
// runProcess runs a loop to continually start and watch a process.
func (monitor *monitor) runProcess(processNumber int) {
pid := 0
logger := monitor.logger.WithValues("processNumber", processNumber, "area", "runProcess")
logger.Info("Starting run loop")
startTime := time.Now()
// Counts the successive errors that occurred during process start up. Based on the error count the backoff time
// will be calculated.
var errorCounter int
for {
if !monitor.processRequired(processNumber) {
return
}
durationSinceLastStart := time.Since(startTime)
// If for more than 5 minutes no error have occurred we reset the error counter to reset the backoff time.
if durationSinceLastStart > 5*time.Minute {
errorCounter = 0
}
arguments, err := monitor.activeConfiguration.GenerateArguments(processNumber, monitor.customEnvironment)
if err != nil {
backoffDuration := getBackoffDuration(errorCounter)
logger.Error(err, "Error generating arguments for subprocess", "configuration", monitor.activeConfiguration, "errorCounter", errorCounter, "backoffDuration", backoffDuration.String())
time.Sleep(backoffDuration)
errorCounter++
continue
}
cmd := exec.Cmd{
Path: arguments[0],
Args: arguments,
}
logger.Info("Starting subprocess", "arguments", arguments)
stdout, err := cmd.StdoutPipe()
if err != nil {
logger.Error(err, "Error getting stdout from subprocess")
}
stderr, err := cmd.StderrPipe()
if err != nil {
logger.Error(err, "Error getting stderr from subprocess")
}
err = cmd.Start()
if err != nil {
backoffDuration := getBackoffDuration(errorCounter)
logger.Error(err, "Error starting subprocess", "backoffDuration", backoffDuration.String())
time.Sleep(backoffDuration)
errorCounter++
continue
}
// Update the prometheus metrics for the process.
monitor.metrics.registerProcessStartup(processNumber, monitor.activeConfiguration.Version.String())
if cmd.Process != nil {
pid = cmd.Process.Pid
} else {
logger.Error(nil, "No Process information available for subprocess")
}
startTime = time.Now()
logger.Info("Subprocess started", "PID", pid)
monitor.updateProcessID(processNumber, pid)
if stdout != nil {
stdoutScanner := bufio.NewScanner(stdout)
go func() {
for stdoutScanner.Scan() {
logger.Info("Subprocess output", "msg", stdoutScanner.Text(), "PID", pid)
}
}()
}
if stderr != nil {
stderrScanner := bufio.NewScanner(stderr)
go func() {
for stderrScanner.Scan() {
logger.Error(nil, "Subprocess error log", "msg", stderrScanner.Text(), "PID", pid)
}
}()
}
err = cmd.Wait()
if err != nil {
logger.Error(err, "Error from subprocess", "PID", pid)
}
exitCode := -1
if cmd.ProcessState != nil {
exitCode = cmd.ProcessState.ExitCode()
}
processDuration := time.Since(startTime)
logger.Info("Subprocess terminated", "exitCode", exitCode, "PID", pid, "lastExecutionDurationSeconds", processDuration.String())
monitor.updateProcessID(processNumber, -1)
// Only backoff if the exit code is non-zero.
if exitCode != 0 {
backoffDuration := getBackoffDuration(errorCounter)
logger.Info("Backing off from restarting subprocess", "backoffDuration", backoffDuration.String(), "lastExecutionDurationSeconds", processDuration.String(), "errorCounter", errorCounter, "exitCode", exitCode)
time.Sleep(backoffDuration)
errorCounter++
}
}
}
// processRequired determines if the latest configuration requires that a
// process stay running.
// If the process is no longer desired, this will remove it from the process ID
// list and return false. If the process is still desired, this will return
// true.
func (monitor *monitor) processRequired(processNumber int) bool {
monitor.mutex.Lock()
defer monitor.mutex.Unlock()
logger := monitor.logger.WithValues("processNumber", processNumber, "area", "processRequired")
if monitor.processCount < processNumber || !monitor.activeConfiguration.ShouldRunServers() {
if monitor.processIDs[processNumber] != 0 {
logger.Info("Terminating run loop")
monitor.processIDs[processNumber] = 0
}
return false
}
return true
}
// processIsIsolated returns true if the IsolateProcessGroupAnnotation is set to "true".
func (monitor *monitor) processIsIsolated() bool {
if monitor.podClient.podMetadata == nil {
return false
}
if monitor.podClient.podMetadata.Annotations == nil {
return false
}
val, ok := monitor.podClient.podMetadata.Annotations[api.IsolateProcessGroupAnnotation]
if !ok {
return false
}
isolated, err := strconv.ParseBool(val)
if err != nil {
monitor.logger.Error(err, "could not parse the value of the %s annotation", api.IsolateProcessGroupAnnotation)
return false
}
return isolated
}
// updateProcessID records a new Process ID from a newly launched process.
func (monitor *monitor) updateProcessID(processNumber int, pid int) {
monitor.mutex.Lock()
defer monitor.mutex.Unlock()
monitor.processIDs[processNumber] = pid
}
// watchConfiguration detects changes to the monitor configuration file.
func (monitor *monitor) watchConfiguration(watcher *fsnotify.Watcher) {
for {
select {
case event, ok := <-watcher.Events:
if !ok {
return
}
monitor.logger.Info("Detected event on monitor conf file or cluster file", "event", event)
if event.Op&fsnotify.Write == fsnotify.Write || event.Op&fsnotify.Create == fsnotify.Create {
monitor.handleFileChange(event.Name)
} else if event.Op&fsnotify.Remove == fsnotify.Remove {
err := watcher.Add(event.Name)
if err != nil {
panic(err)
}
monitor.handleFileChange(event.Name)
}
case err, ok := <-watcher.Errors:
if !ok {
return
}
monitor.logger.Error(err, "Error watching for file system events")
}
}
}
// handleFileChange will perform the required action based on the changed/modified file.
func (monitor *monitor) handleFileChange(changedFile string) {
if changedFile == fdbClusterFilePath {
err := monitor.podClient.updateFdbClusterTimestampAnnotation()
if err != nil {
monitor.logger.Error(err, fmt.Sprintf("could not update %s annotation", api.ClusterFileChangeDetectedAnnotation))
}
return
}
monitor.loadConfiguration()
}
func (monitor *monitor) sendSignalToProcesses(signal os.Signal) {
for processNumber, processID := range monitor.processIDs {
if processID <= 0 {
continue
}
subprocessLogger := monitor.logger.WithValues("processNumber", processNumber, "PID", processID)
process, err := os.FindProcess(processID)
if err != nil {
subprocessLogger.Error(err, "Error finding subprocess")
continue
}
subprocessLogger.Info("Sending signal to subprocess", "signal", signal)
err = process.Signal(signal)
if err != nil {
subprocessLogger.Error(err, "Error signaling subprocess")
continue
}
}
}
// run runs the monitor loop.
func (monitor *monitor) run() {
done := make(chan bool, 1)
signals := make(chan os.Signal, 1)
signal.Notify(signals, syscall.SIGINT, syscall.SIGTERM)
go func() {
latestSignal := <-signals
monitor.logger.Info("Received system signal", "signal", latestSignal)
// Reset the processCount to 0 to make sure the monitor doesn't try to restart the processes.
monitor.processCount = 0
monitor.sendSignalToProcesses(latestSignal)
annotations := monitor.podClient.podMetadata.Annotations
if len(annotations) > 0 {
delayValue, ok := annotations[api.DelayShutdownAnnotation]
if ok {
delay, err := time.ParseDuration(delayValue)
if err == nil {
time.Sleep(delay)
}
}
}
done <- true
}()
monitor.loadConfiguration()
watcher, err := fsnotify.NewWatcher()
if err != nil {
panic(err)
}
monitor.logger.Info("adding watch for file", "path", path.Base(monitor.configFile))
err = watcher.Add(monitor.configFile)
if err != nil {
panic(err)
}
defer func(watcher *fsnotify.Watcher) {
err := watcher.Close()
if err != nil {
monitor.logger.Error(err, "could not close watcher")
}
}(watcher)
go func() { monitor.watchConfiguration(watcher) }()
// The cluster file will be created and managed by the fdbserver processes, so we have to wait until the fdbserver
// processes have been started. Except for the initial cluster creation this file should be present as soon as the
// monitor starts the processes.
for {
_, err = os.Stat(fdbClusterFilePath)
if errors.Is(err, os.ErrNotExist) {
monitor.logger.Info("waiting for file to be created", "path", fdbClusterFilePath)
time.Sleep(5 * time.Second)
continue
}
monitor.logger.Info("adding watch for file", "path", fdbClusterFilePath)
err = watcher.Add(fdbClusterFilePath)
if err != nil {
panic(err)
}
break
}
<-done
}
// watchPodTimestamps watches the timestamp feed to reload the configuration.
func (monitor *monitor) watchPodTimestamps() {
for timestamp := range monitor.podClient.TimestampFeed {
if timestamp > monitor.lastConfigurationTime.Unix() {
monitor.loadConfiguration()
}
}
}