Adds structured logging in fdb-kubernetes-monitor.

Adds a backoff window when restarting processes in fdb-kubernetes-monitor.
This commit is contained in:
John Brownlee 2021-08-22 21:19:10 -07:00
parent 95ad5854be
commit 7c36123cf8
11 changed files with 197 additions and 66 deletions

View File

@ -0,0 +1 @@
test:test@127.0.0.1:4501

View File

@ -0,0 +1,7 @@
export FDB_PUBLIC_IP=127.0.0.1
export FDB_POD_IP=127.0.0.1
export FDB_ZONE_ID=localhost
export FDB_MACHINE_ID=localhost
export FDB_INSTANCE_ID=storage-1
export KUBERNETES_SERVICE_HOST=kubernetes.docker.internal
export KUBERNETES_SERVICE_PORT=6443

View File

@ -5,21 +5,24 @@ To test this, run the following commands from the root of the FoundationDB
repository:
docker build -t foundationdb/foundationdb-kubernetes:latest --build-arg FDB_VERSION=6.3.15 --build-arg FDB_LIBRARY_VERSIONS="6.3.15 6.2.30 6.1.13" -f packaging/docker/kubernetes/Dockerfile .
kubectl apply -f packaging/docker/kubernetes/config.yaml
kubectl apply -f packaging/docker/kubernetes/test_config.yaml
# Wait for the pods to become ready
ips=$(kubectl get pod -l app=fdb-kubernetes-example -o json | jq -j '[[.items|.[]|select(.status.podIP!="")]|limit(3;.[])|.status.podIP+":4501"]|join(",")')
cat packaging/docker/kubernetes/config.yaml | sed -e "s/fdb.cluster: \"\"/fdb.cluster: \"test:test@$ips\"/" -e "s/\"serverCount\": 0/\"serverCount\": 1/" | kubectl apply -f -
cat packaging/docker/kubernetes/test_config.yaml | sed -e "s/fdb.cluster: \"\"/fdb.cluster: \"test:test@$ips\"/" -e "s/\"serverCount\": 0/\"serverCount\": 1/" | kubectl apply -f -
kubectl get pod -l app=fdb-kubernetes-example -o name | xargs -I {} kubectl annotate {} foundationdb.org/outdated-config-map-seen=$(date +%s) --overwrite
# Watch the logs for the fdb-kubernetes-example pods to confirm that they have launched the fdbserver processes.
kubectl exec -it sts/fdb-kubernetes-example -- fdbcli --exec "configure new double ssd"
You can then make changes to the data in the config map and update the fdbserver processes:
kubectl apply -f packaging/docker/kubernetes/config.yaml
cat packaging/docker/kubernetes/test_config.yaml | sed -e "s/fdb.cluster: \"\"/fdb.cluster: \"test:test@$ips\"/" -e "s/\"serverCount\": 0/\"serverCount\": 1/" | kubectl apply -f -
# You can apply an annotation to speed up the propagation of config
kubectl get pod -l app=fdb-kubernetes-example -o name | xargs -I {} kubectl annotate {} foundationdb.org/outdated-config-map-seen=$(date +%s) --overwrite
# Watch the logs for the fdb-kubernetes-example pods to confirm that they have launched the fdbserver processes.
# Watch the logs for the fdb-kubernetes-example pods to confirm that they have reloaded their configuration, and then do a bounce.
kubectl exec -it sts/fdb-kubernetes-example -- fdbcli --exec "kill; kill all; status"
Once you are done, you can tear down the example with the following command:
kubectl delete -f packaging/docker/kubernetes/config.yaml; kubectl delete pvc -l app=fdb-kubernetes-example
kubectl delete -f packaging/docker/kubernetes/test_config.yaml; kubectl delete pvc -l app=fdb-kubernetes-example

View File

@ -24,6 +24,9 @@ import (
"os"
"reflect"
"testing"
"github.com/go-logr/zapr"
"go.uber.org/zap"
)
func loadConfigFromFile(path string) (*ProcessConfiguration, error) {
@ -99,4 +102,13 @@ func TestGeneratingArgumentForEnvironmentVariable(t *testing.T) {
t.Fail()
return
}
// Smoke-check that a zapr-backed logger can be built and used. A failure to
// build the logger is reported through the test instead of panicking, and the
// test no longer fails unconditionally once this point is reached.
zapLogger, err := zap.NewDevelopment()
if err != nil {
	t.Fatalf("creating development logger: %v", err)
}
log := zapr.NewLogger(zapLogger)
log.Info("JPB test", "key", "value")
}

View File

@ -23,7 +23,10 @@ go 1.16
require (
github.com/fsnotify/fsnotify v1.5.0
github.com/go-logr/logr v0.4.0
github.com/go-logr/zapr v0.4.0
github.com/spf13/pflag v1.0.5
go.uber.org/zap v1.19.0
k8s.io/api v0.20.2
k8s.io/apimachinery v0.20.2
k8s.io/client-go v0.20.2

View File

@ -36,6 +36,8 @@ github.com/NYTimes/gziphandler v0.0.0-20170623195520-56545f4a5d46/go.mod h1:3wb0
github.com/PuerkitoBio/purell v1.1.1/go.mod h1:c11w/QuzBsJSee3cPx9rAFu61PvFxuPbtSwDGJws/X0=
github.com/PuerkitoBio/urlesc v0.0.0-20170810143723-de5bf2ad4578/go.mod h1:uGdkoq3SwY9Y+13GIhn11/XLaGBb4BfwItxLd5jeuXE=
github.com/asaskevich/govalidator v0.0.0-20190424111038-f61b66f89f4a/go.mod h1:lB+ZfQJz7igIIfQNfa7Ml4HSf2uFQQRzpGGRXenZAgY=
github.com/benbjohnson/clock v1.1.0 h1:Q92kusRqC1XV2MjkWETPvjJVqKetz1OzxZB7mHJLju8=
github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/chzyer/logex v1.1.10/go.mod h1:+Ywpsq7O8HXn0nuIou7OrIPyXbp3wmkHB+jjWRnGsAI=
github.com/chzyer/readline v0.0.0-20180603132655-2972be24d48e/go.mod h1:nSuG5e5PlCu98SY8svDHJxuZscDgtXS6KTTbou5AhLI=
@ -62,8 +64,11 @@ github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9
github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8=
github.com/go-gl/glfw/v3.3/glfw v0.0.0-20200222043503-6f7a984d4dc4/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8=
github.com/go-logr/logr v0.1.0/go.mod h1:ixOQHD9gLJUVQQ2ZOR7zLEifBX6tGkNJF4QyIY7sIas=
github.com/go-logr/logr v0.2.0 h1:QvGt2nLcHH0WK9orKa+ppBPAxREcH364nPUedEpK0TY=
github.com/go-logr/logr v0.2.0/go.mod h1:z6/tIYblkpsD+a4lm/fGIIU9mZ+XfAiaFtq7xTgseGU=
github.com/go-logr/logr v0.4.0 h1:K7/B1jt6fIBQVd4Owv2MqGQClcgf0R266+7C/QjRcLc=
github.com/go-logr/logr v0.4.0/go.mod h1:z6/tIYblkpsD+a4lm/fGIIU9mZ+XfAiaFtq7xTgseGU=
github.com/go-logr/zapr v0.4.0 h1:uc1uML3hRYL9/ZZPdgHS/n8Nzo+eaYL/Efxkkamf7OM=
github.com/go-logr/zapr v0.4.0/go.mod h1:tabnROwaDl0UNxkVeFRbY8bwB37GwRv0P8lg6aAiEnk=
github.com/go-openapi/jsonpointer v0.19.2/go.mod h1:3akKfEdA7DF1sugOqz1dVQHBcuDBPKZGEoHC/NkiQRg=
github.com/go-openapi/jsonpointer v0.19.3/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg=
github.com/go-openapi/jsonreference v0.19.2/go.mod h1:jMjeRr2HHw6nAVajTXJ4eiUwohSTlpa0o73RUL1owJc=
@ -157,6 +162,8 @@ github.com/onsi/ginkgo v1.11.0/go.mod h1:lLunBs/Ym6LB5Z9jYTR76FiuTmxDTDusOGeTQH+
github.com/onsi/gomega v0.0.0-20170829124025-dcabb60a477c/go.mod h1:C1qb7wdrVGGVU+Z6iS04AVkA3Q65CEZX59MT0QO5uiA=
github.com/onsi/gomega v1.7.0/go.mod h1:ex+gbHU/CVuBBDIJjb2X0qEXbFg53c61hWP/1CpauHY=
github.com/peterbourgon/diskv v2.0.1+incompatible/go.mod h1:uqqh8zWWbv1HBMNONnaR/tNboyR3/BZd58JJSHlUSCU=
github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4=
github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
@ -170,12 +177,21 @@ github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+
github.com/stretchr/objx v0.2.0/go.mod h1:qt09Ya8vawLte6SNmTgCsAVtYtaKzEcn8ATUoHMkEqE=
github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI=
github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4=
github.com/stretchr/testify v1.6.1 h1:hDPOHmpOpP40lSULcqw7IrRb/u7w6RpDC9399XyoNd0=
github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
go.opencensus.io v0.21.0/go.mod h1:mSImk1erAIZhrmZN+AvHh14ztQfjbGwt4TtuofqLduU=
go.opencensus.io v0.22.0/go.mod h1:+kGneAE2xo2IficOXnaByMWTGM9T73dGwxeWcUqIpI8=
go.opencensus.io v0.22.2/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
go.opencensus.io v0.22.3/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw=
go.uber.org/atomic v1.7.0 h1:ADUqmZGgLDDfbSL9ZmPxKTybcoEYHgpYfELNoN+7hsw=
go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc=
go.uber.org/goleak v1.1.10 h1:z+mqJhf6ss6BSfSM671tgKyZBFPTTJM+HLxnhPC3wu0=
go.uber.org/goleak v1.1.10/go.mod h1:8a7PlsEVH3e/a/GLqe5IIrQx6GzcnRmZEufDUTk4A7A=
go.uber.org/multierr v1.6.0 h1:y6IPFStTAIT5Ytl7/XYmHvzXQ7S3g/IeZW9hyZ5thw4=
go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU=
go.uber.org/zap v1.19.0 h1:mZQZefskPPCMIBCSEH0v2/iUqqLrYtaeqwD6FUGUnFE=
go.uber.org/zap v1.19.0/go.mod h1:xg/QME4nWcxGxrpdeYfq7UvYrLh66cuVKdrbD1XF/NI=
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w=
golang.org/x/crypto v0.0.0-20190510104115-cbcb75029529/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
golang.org/x/crypto v0.0.0-20190605123033-f99c8df09eb5/go.mod h1:yigFU9vqHzYiE8UmvKecakEJjdnWj3jj499lnFckfCI=
@ -205,6 +221,7 @@ golang.org/x/lint v0.0.0-20190909230951-414d861bb4ac/go.mod h1:6SW0HCj/g11FgYtHl
golang.org/x/lint v0.0.0-20190930215403-16217165b5de/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc=
golang.org/x/lint v0.0.0-20191125180803-fdd1cda4f05f/go.mod h1:5qLYkcX4OjUUV8bRuDixDT3tpyyb+LUpUlRWLxfhWrs=
golang.org/x/lint v0.0.0-20200130185559-910be7a94367/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY=
golang.org/x/lint v0.0.0-20200302205851-738671d3881b h1:Wh+f8QHJXR411sJR8/vRBTZ7YapZaRvUcLFFJhusH0k=
golang.org/x/lint v0.0.0-20200302205851-738671d3881b/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY=
golang.org/x/mobile v0.0.0-20190312151609-d3739f865fa6/go.mod h1:z+o9i4GpDbdi3rU15maQ/Ox0txvL9dWGYEHz965HBQE=
golang.org/x/mobile v0.0.0-20190719004257-d2bd2a29d028/go.mod h1:E/iHnbuqvinMTCcRqshq8CkpyQDoeVncDDYHnLhea+o=
@ -303,6 +320,7 @@ golang.org/x/tools v0.0.0-20190628153133-6cdbf07be9d0/go.mod h1:/rFqwRUd4F7ZHNgw
golang.org/x/tools v0.0.0-20190816200558-6889da9d5479/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20190911174233-4f2ddba30aff/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191012152004-8de300cfc20a/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191108193012-7d206e10da11/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191113191852-77e3bb0ad9e7/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191115202509-3a792d9c32b2/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo=
@ -317,6 +335,7 @@ golang.org/x/tools v0.0.0-20200204074204-1cc6d1ef6c74/go.mod h1:TB2adYChydJhpapK
golang.org/x/tools v0.0.0-20200207183749-b753a1ba74fa/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.0.0-20200212150539-ea181f53ac56/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.0.0-20200224181240-023911ca70b2/go.mod h1:TB2adYChydJhpapKDTa4BR/hXlZSLoq2Wpct/0txZ28=
golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb h1:iKlO7ROJc6SttHKlxzwGytRtBUqX4VARrNTgP2YLX5M=
golang.org/x/tools v0.0.0-20200304193943-95d2e580d8eb/go.mod h1:o4KQGtdN14AW+yjsvvwRTJJuXz8XRtIHtEnmAXLyFUw=
golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0=
@ -388,8 +407,9 @@ gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.8 h1:obN1ZagJSUGI0Ek/LBmuj4SNLPfIny3KsKFopxRdj10=
gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b h1:h8qDotaEPuJATrMmW04NCwg7v22aHH28wwpauUhK9Oo=
gopkg.in/yaml.v3 v3.0.0-20210107192922-496545a6307b/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190106161140-3f1c8253044a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=
honnef.co/go/tools v0.0.0-20190418001031-e561f6794a2a/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4=

View File

@ -23,10 +23,10 @@ import (
"context"
"encoding/json"
"fmt"
"log"
"os"
"strconv"
"github.com/go-logr/logr"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
@ -61,6 +61,9 @@ type PodClient struct {
// TimestampFeed is a channel where the pod client will send updates with
// the values from OutdatedConfigMapAnnotation.
TimestampFeed chan int64
// Logger is the logger we use for this client.
Logger logr.Logger
}
// CreatePodClient creates a new client for working with the pod object.
@ -155,7 +158,7 @@ func (client *PodClient) watchPod() error {
if event.Type == watch.Modified {
pod, valid := event.Object.(*corev1.Pod)
if !valid {
	// The watch delivered something other than a *corev1.Pod, so pod is nil
	// here. Report the event and skip the update rather than handing a nil
	// pod to processPodUpdate.
	client.Logger.Error(nil, "Error getting pod information from watch", "event", event)
} else {
	client.processPodUpdate(pod)
}
}
@ -177,7 +180,7 @@ func (client *PodClient) processPodUpdate(pod *corev1.Pod) {
}
timestamp, err := strconv.ParseInt(annotation, 10, 64)
if err != nil {
	// Key/value arguments must come in pairs. The error is already passed
	// as the first argument, so it is not repeated at the end of the list.
	client.Logger.Error(err, "Error parsing annotation", "key", OutdatedConfigMapAnnotation, "rawAnnotation", annotation)
	return
}

View File

@ -22,20 +22,34 @@ package main
import (
"fmt"
"github.com/go-logr/zapr"
"github.com/spf13/pflag"
"go.uber.org/zap"
)
var (
inputDir string
fdbserverPath string
monitorConfFile string
logPath string
)
// main parses the command-line flags, builds the zap-backed logger, and starts
// the monitor with that logger.
func main() {
	pflag.StringVar(&fdbserverPath, "fdbserver-path", "/usr/bin/fdbserver", "Path to the fdbserver binary")
	pflag.StringVar(&inputDir, "input-dir", ".", "Directory containing input files")
	pflag.StringVar(&monitorConfFile, "input-monitor-conf", "config.json", "Name of the file in the input directory that contains the monitor configuration")
	// NOTE(review): zap's production config appears to write to stderr by
	// default, while this help text says stdout — confirm and align.
	pflag.StringVar(&logPath, "log-path", "", "Name of a file to send logs to. Logs will be sent to stdout in addition to the file you pass in this argument. If this is blank, logs will only be sent to stdout")
	pflag.Parse()

	// Start from zap's production configuration and, when a log path was
	// given, add it as an extra output next to the default one.
	zapConfig := zap.NewProductionConfig()
	if logPath != "" {
		zapConfig.OutputPaths = append(zapConfig.OutputPaths, logPath)
	}
	zapLogger, err := zapConfig.Build()
	if err != nil {
		panic(err)
	}

	logger := zapr.NewLogger(zapLogger)
	StartMonitor(logger, fmt.Sprintf("%s/%s", inputDir, monitorConfFile), fdbserverPath)
}

View File

@ -20,9 +20,9 @@
package main
import (
"bufio"
"encoding/json"
"io"
"log"
"os"
"os/exec"
"os/signal"
@ -31,11 +31,14 @@ import (
"time"
"github.com/fsnotify/fsnotify"
"github.com/go-logr/logr"
)
// errorBackoffSeconds is the time to wait after a process fails before starting
// another process.
const errorBackoffSeconds = 5
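// After a process exits, this delay is only applied if the process ran for
// less than this many seconds. Failures to generate arguments or to start the
// process always wait this long before retrying.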
const errorBackoffSeconds = 60
// Monitor provides the main monitor loop
type Monitor struct {
@ -60,7 +63,7 @@ type Monitor struct {
// zero will indicate that a process does not have a run loop. A PID of -1
// will indicate that a process has a run loop but is not currently running
// the subprocess.
ProcessesIDs []int
ProcessIDs []int
// Mutex defines a mutex around working with configuration.
Mutex sync.Mutex
@ -68,10 +71,13 @@ type Monitor struct {
// PodClient is a client for posting updates about this pod to
// Kubernetes.
PodClient *PodClient
// Logger is the logger instance for this monitor.
Logger logr.Logger
}
// StartMonitor starts the monitor loop.
func StartMonitor(configFile string, fdbserverPath string) {
func StartMonitor(logger logr.Logger, configFile string, fdbserverPath string) {
podClient, err := CreatePodClient()
if err != nil {
panic(err)
@ -81,6 +87,7 @@ func StartMonitor(configFile string, fdbserverPath string) {
ConfigFile: configFile,
FDBServerPath: fdbserverPath,
PodClient: podClient,
Logger: logger,
}
go func() { monitor.WatchPodTimestamps() }()
@ -91,36 +98,36 @@ func StartMonitor(configFile string, fdbserverPath string) {
func (monitor *Monitor) LoadConfiguration() {
file, err := os.Open(monitor.ConfigFile)
if err != nil {
log.Print(err.Error())
monitor.Logger.Error(err, "Error reading monitor config file", "monitorConfigPath", monitor.ConfigFile)
return
}
defer file.Close()
configuration := &ProcessConfiguration{}
configurationBytes, err := io.ReadAll(file)
if err != nil {
	// Stop here: the bytes read so far may be incomplete, so parsing them
	// would only produce a second, misleading error.
	monitor.Logger.Error(err, "Error reading monitor configuration", "monitorConfigPath", monitor.ConfigFile)
	return
}
err = json.Unmarshal(configurationBytes, configuration)
if err != nil {
log.Print(err)
monitor.Logger.Error(err, "Error parsing monitor configuration", "rawConfiguration", string(configurationBytes))
return
}
_, err = configuration.GenerateArguments(1, nil)
if err != nil {
log.Print(err)
monitor.Logger.Error(err, "Error generating arguments for latest configuration", "configuration", configuration)
return
}
log.Printf("Received new configuration file")
monitor.Logger.Info("Received new configuration file", "configuration", configuration)
monitor.Mutex.Lock()
defer monitor.Mutex.Unlock()
if monitor.ProcessesIDs == nil {
monitor.ProcessesIDs = make([]int, configuration.ServerCount+1)
if monitor.ProcessIDs == nil {
monitor.ProcessIDs = make([]int, configuration.ServerCount+1)
} else {
for len(monitor.ProcessesIDs) <= configuration.ServerCount {
monitor.ProcessesIDs = append(monitor.ProcessesIDs, 0)
for len(monitor.ProcessIDs) <= configuration.ServerCount {
monitor.ProcessIDs = append(monitor.ProcessIDs, 0)
}
}
@ -129,8 +136,8 @@ func (monitor *Monitor) LoadConfiguration() {
monitor.LastConfigurationTime = time.Now()
for processNumber := 1; processNumber <= configuration.ServerCount; processNumber++ {
if monitor.ProcessesIDs[processNumber] == 0 {
monitor.ProcessesIDs[processNumber] = -1
if monitor.ProcessIDs[processNumber] == 0 {
monitor.ProcessIDs[processNumber] = -1
tempNumber := processNumber
go func() { monitor.RunProcess(tempNumber) }()
}
@ -138,18 +145,20 @@ func (monitor *Monitor) LoadConfiguration() {
err = monitor.PodClient.UpdateAnnotations(monitor)
if err != nil {
log.Printf("Error updating pod annotations: %s", err)
monitor.Logger.Error(err, "Error updating pod annotations")
}
}
// RunProcess runs a loop to continually start and watch a process.
func (monitor *Monitor) RunProcess(processNumber int) {
log.Printf("Starting run loop for subprocess %d", processNumber)
pid := 0
logger := monitor.Logger.WithValues("processNumber", processNumber, "area", "RunProcess")
logger.Info("Starting run loop")
for {
monitor.Mutex.Lock()
if monitor.ActiveConfiguration.ServerCount < processNumber {
log.Printf("Terminating run loop for subprocess %d", processNumber)
monitor.ProcessesIDs[processNumber] = 0
logger.Info("Terminating run loop")
monitor.ProcessIDs[processNumber] = 0
monitor.Mutex.Unlock()
return
}
@ -158,42 +167,85 @@ func (monitor *Monitor) RunProcess(processNumber int) {
arguments, err := monitor.ActiveConfiguration.GenerateArguments(processNumber, nil)
if err != nil {
	// Without a valid argument list there is nothing sensible to launch.
	// Back off and go around the run loop again instead of starting the
	// binary with no arguments.
	logger.Error(err, "Error generating arguments for subprocess", "configuration", monitor.ActiveConfiguration)
	time.Sleep(errorBackoffSeconds * time.Second)
	continue
}
// The binary path is also the first element of the subprocess argument list.
arguments = append([]string{monitor.FDBServerPath}, arguments...)
cmd := exec.Cmd{
Path: arguments[0],
Args: arguments,
Stdout: os.Stdout,
Stderr: os.Stderr,
Path: arguments[0],
Args: arguments,
}
logger.Info("Starting subprocess", "arguments", arguments)
stdout, err := cmd.StdoutPipe()
if err != nil {
logger.Error(err, "Error getting stdout from subprocess")
}
stderr, err := cmd.StderrPipe()
if err != nil {
logger.Error(err, "Error getting stderr from subprocess")
}
log.Printf("Starting subprocess #%d: %v", processNumber, arguments)
err = cmd.Start()
if err != nil {
log.Printf("Error from subprocess %d: %s", processNumber, err.Error())
log.Printf("Subprocess #%d will restart in %d seconds", processNumber, errorBackoffSeconds)
logger.Error(err, "Error starting subprocess")
time.Sleep(errorBackoffSeconds * time.Second)
continue
}
monitor.Mutex.Lock()
monitor.ProcessesIDs[processNumber] = cmd.Process.Pid
monitor.Mutex.Unlock()
err = cmd.Wait()
log.Printf("Subprocess #%d terminated", processNumber)
if err != nil {
log.Printf("Error from subprocess #%d: %s", processNumber, err.Error())
// Record the PID of the started subprocess; pid keeps its previous value if
// no process information is attached to the command.
if cmd.Process != nil {
	pid = cmd.Process.Pid
} else {
	logger.Error(nil, "No Process information available for subprocess")
}
startTime := time.Now()
logger.Info("Subprocess started", "PID", pid)
monitor.Mutex.Lock()
monitor.ProcessesIDs[processNumber] = -1
monitor.ProcessIDs[processNumber] = pid
monitor.Mutex.Unlock()
log.Printf("Subprocess #%d will restart in %d seconds", processNumber, errorBackoffSeconds)
time.Sleep(errorBackoffSeconds * time.Second)
// Forward the subprocess output line by line into the structured log. The
// readers are tracked so that cmd.Wait is only called after both pipes have
// been read to the end: Wait closes the pipes once the process exits, so
// output that has not been read by then would be lost. Waiting for the
// readers also guarantees they have stopped using pid before the next pass
// through the loop assigns it again.
var outputReaders sync.WaitGroup

if stdout != nil {
	outputReaders.Add(1)
	go func() {
		defer outputReaders.Done()
		stdoutScanner := bufio.NewScanner(stdout)
		for stdoutScanner.Scan() {
			logger.Info("Subprocess output", "msg", stdoutScanner.Text(), "PID", pid)
		}
		if scanErr := stdoutScanner.Err(); scanErr != nil {
			logger.Error(scanErr, "Error reading subprocess output", "PID", pid)
		}
	}()
}

if stderr != nil {
	outputReaders.Add(1)
	go func() {
		defer outputReaders.Done()
		stderrScanner := bufio.NewScanner(stderr)
		for stderrScanner.Scan() {
			logger.Error(nil, "Subprocess error log", "msg", stderrScanner.Text(), "PID", pid)
		}
		if scanErr := stderrScanner.Err(); scanErr != nil {
			logger.Error(scanErr, "Error reading subprocess error log", "PID", pid)
		}
	}()
}

outputReaders.Wait()

err = cmd.Wait()
if err != nil {
	logger.Error(err, "Error from subprocess", "PID", pid)
}
exitCode := -1
if cmd.ProcessState != nil {
exitCode = cmd.ProcessState.ExitCode()
}
logger.Info("Subprocess terminated", "exitCode", exitCode, "PID", pid)
endTime := time.Now()
monitor.Mutex.Lock()
monitor.ProcessIDs[processNumber] = -1
monitor.Mutex.Unlock()
processDuration := endTime.Sub(startTime)
if processDuration.Seconds() < errorBackoffSeconds {
logger.Info("Backing off from restarting subprocess", "backOffTimeSeconds", errorBackoffSeconds, "lastExecutionDurationSeconds", processDuration)
time.Sleep(errorBackoffSeconds * time.Second)
}
}
}
@ -205,7 +257,7 @@ func (monitor *Monitor) WatchConfiguration(watcher *fsnotify.Watcher) {
if !ok {
return
}
log.Printf("Detected event on monitor conf file: %v", event)
monitor.Logger.Info("Detected event on monitor conf file", "event", event)
if event.Op&fsnotify.Write == fsnotify.Write || event.Op&fsnotify.Create == fsnotify.Create {
monitor.LoadConfiguration()
} else if event.Op&fsnotify.Remove == fsnotify.Remove {
@ -219,7 +271,7 @@ func (monitor *Monitor) WatchConfiguration(watcher *fsnotify.Watcher) {
if !ok {
return
}
log.Print(err)
monitor.Logger.Error(err, "Error watching for file system events")
}
}
}
@ -232,18 +284,19 @@ func (monitor *Monitor) Run() {
go func() {
latestSignal := <-signals
log.Printf("Received signal %v", latestSignal)
for processNumber, processID := range monitor.ProcessesIDs {
monitor.Logger.Info("Received system signal", "signal", latestSignal)
for processNumber, processID := range monitor.ProcessIDs {
if processID > 0 {
subprocessLogger := monitor.Logger.WithValues("processNumber", processNumber, "PID", processID)
process, err := os.FindProcess(processID)
if err != nil {
log.Printf("Error finding subprocess #%d (PID %d): %s", processNumber, processID, err.Error())
subprocessLogger.Error(err, "Error finding subprocess")
continue
}
log.Printf("Sending signal %v to subprocess #%d (PID %d)", latestSignal, processNumber, processID)
subprocessLogger.Info("Sending signal to subprocess", "signal", latestSignal)
err = process.Signal(latestSignal)
if err != nil {
log.Printf("Error signaling subprocess #%d (PID %d): %s", processNumber, processID, err.Error())
subprocessLogger.Error(err, "Error signaling subprocess")
continue
}
}

View File

@ -69,8 +69,15 @@ RUN rm -rf /mnt/website && rm -r /var/fdb/tmp
# Install the kubernetes monitor binary
COPY --from=go-build /fdb-kubernetes-monitor /usr/bin/
VOLUME /var/fdb/data
# Set up a non-root user
RUN groupadd --gid 4059 fdb && \
useradd --gid 4059 --uid 4059 --no-create-home --shell /bin/bash fdb && \
chown -R fdb:fdb /var/fdb
# Runtime Configuration Options
USER fdb
WORKDIR /var/fdb
ENTRYPOINT ["/usr/bin/fdb-kubernetes-monitor"]
VOLUME /var/fdb/data

View File

@ -23,7 +23,7 @@
# This is not a recommended way to run production clusters, but it can be useful
# to test the image in development.
#
# For more information on using this file, see fdbkubernetesmonitor/doc.go
# For more information on using this file, see fdbkubernetesmonitor/README.md
apiVersion: apps/v1
kind: StatefulSet
metadata:
@ -45,6 +45,11 @@ spec:
- name: foundationdb
image: foundationdb/foundationdb-kubernetes:latest
imagePullPolicy: IfNotPresent
args:
- --input-dir
- /var/fdb/dynamic-conf
- --log-path
- /var/fdb/logs/monitor.log
env:
- name: FDB_POD_NAME
valueFrom:
@ -76,19 +81,20 @@ spec:
valueFrom:
fieldRef:
fieldPath: metadata.name
args:
- --input-dir
- /var/fdb/dynamic-conf
volumeMounts:
- name: dynamic-conf
mountPath: /var/fdb/dynamic-conf
- name: data
mountPath: /var/fdb/data
- name: logs
mountPath: /var/fdb/logs
serviceAccountName: fdb-kubernetes-example
volumes:
- name: dynamic-conf
configMap:
name: fdb-kubernetes-example-config
- name: logs
emptyDir: {}
volumeClaimTemplates:
- metadata:
name: data
@ -142,7 +148,11 @@ data:
{"type": "Environment", "source": "FDB_INSTANCE_ID"},
{"value": "-"},
{"type": "ProcessNumber"}
]}
]},
{"value": "--logdir"},
{"value": "/var/fdb/logs"},
{"value": "--trace_format"},
{"value": "json"}
]
}
---
@ -176,5 +186,3 @@ roleRef:
subjects:
- kind: ServiceAccount
name: fdb-kubernetes-example