// This program is a Prometheus exporter that publishes the state of Docker
// containers (health, status, OOM kills, start/finish times and restart count)
// over HTTP.
package main

import (
	"context"
	"flag"
	"fmt"
	"log/slog"
	"net/http"
	"os"
	"os/signal"
	"regexp"
	"strings"
	"sync"
	"syscall"
	"time"

	"github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/container"
	"github.com/docker/docker/client"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
)

const (
	// cachePeriod is how long the collector reuses the results of docker inspect.
	cachePeriod = 1 * time.Second

	// namespace is the metric-name prefix shared by the container state metrics.
	namespace = "container_state_"

	// readHeaderTimeout bounds how long the HTTP server waits for request headers.
	readHeaderTimeout = 10 * time.Second

	// shutdownTimeout bounds the graceful HTTP server shutdown.
	shutdownTimeout = 5 * time.Second
)

// ContainerClient is the subset of the Docker client used by the collector.
type ContainerClient interface {
	ContainerList(ctx context.Context, options container.ListOptions) ([]types.Container, error)
	ContainerInspect(ctx context.Context, containerID string) (types.ContainerJSON, error)
}

// dockerHealthCollector is a prometheus.Collector that reports container state.
// Inspect results are cached for cachePeriod; mu serialises Collect calls and
// guards the cache fields.
type dockerHealthCollector struct {
	mu                 sync.Mutex
	containerClient    ContainerClient
	containerInfoCache []types.ContainerJSON
	lastseen           time.Time
}

// descSource holds the fixed name and help text of a metric whose constant
// labels are only known at collection time.
type descSource struct {
	name string
	help string
}

// Desc builds a descriptor for the metric with the given constant labels.
func (desc *descSource) Desc(labels prometheus.Labels) *prometheus.Desc {
	return prometheus.NewDesc(desc.name, desc.help, nil, labels)
}

var (
	healthStatusDesc = descSource{namespace + "health_status", "Container health status."}
	statusDesc       = descSource{namespace + "status", "Container status."}
	oomkilledDesc    = descSource{namespace + "oomkilled", "Container was killed by OOMKiller."}
	startedatDesc    = descSource{namespace + "startedat", "Time when the Container started."}
	finishedatDesc   = descSource{namespace + "finishedat", "Time when the Container finished."}
	// restartcountDesc keeps its historical name, which does not carry the
	// namespace prefix.
	restartcountDesc = descSource{"container_restartcount", "Number of times the container has been restarted"}

	// invalidLabelChars matches every character that is replaced by "_" when a
	// container label key is turned into a metric label name.
	invalidLabelChars = regexp.MustCompile("[^a-zA-Z0-9_]")

	// healthStatuses are the values emitted for the "status" label of the
	// health status metric.
	healthStatuses = []string{"none", "starting", "healthy", "unhealthy"}

	// containerStatuses are the values emitted for the "status" label of the
	// container status metric.
	containerStatuses = []string{"paused", "restarting", "running", "removing", "dead", "created", "exited"}
)

// Describe sends the label-less descriptor of every metric to ch.
func (c *dockerHealthCollector) Describe(ch chan<- *prometheus.Desc) {
	ch <- healthStatusDesc.Desc(nil)
	ch <- statusDesc.Desc(nil)
	ch <- oomkilledDesc.Desc(nil)
	ch <- startedatDesc.Desc(nil)
	ch <- finishedatDesc.Desc(nil)
	ch <- restartcountDesc.Desc(nil)
}

// Collect refreshes the container cache when it is at least cachePeriod old
// and then sends the metrics of every cached container to ch.
func (c *dockerHealthCollector) Collect(ch chan<- prometheus.Metric) {
	c.mu.Lock()
	defer c.mu.Unlock()

	now := time.Now()
	if now.Sub(c.lastseen) >= cachePeriod {
		c.collectContainer()
		c.lastseen = now
	}
	c.collectMetrics(ch)
}

// boolToFloat maps true to 1 and false to 0.
func boolToFloat(b bool) float64 {
	if b {
		return 1
	}
	return 0
}

// labelsWith returns a copy of base with key set to value; base is not modified.
func labelsWith(base prometheus.Labels, key, value string) prometheus.Labels {
	out := make(prometheus.Labels, len(base)+1)
	for k, v := range base {
		out[k] = v
	}
	out[key] = value
	return out
}

// containerLabels builds the constant labels shared by all metrics of one
// container: one "container_label_<key>" label per container label (lower-cased,
// with characters outside [a-zA-Z0-9_] replaced by "_"), plus id, image and name.
func containerLabels(info types.ContainerJSON) prometheus.Labels {
	labels := make(prometheus.Labels, len(info.Config.Labels)+3)
	for k, v := range info.Config.Labels {
		name := strings.ToLower("container_label_" + k)
		labels[invalidLabelChars.ReplaceAllLiteralString(name, "_")] = v
	}
	labels["id"] = "/docker/" + info.ID
	labels["image"] = info.Config.Image
	labels["name"] = strings.TrimPrefix(info.Name, "/")
	return labels
}

// collectMetrics sends the metrics of every cached container to ch. The two
// status metrics are emitted once per known status value, with value 1 for the
// status the container is currently in and 0 for the others.
func (c *dockerHealthCollector) collectMetrics(ch chan<- prometheus.Metric) {
	for _, info := range c.containerInfoCache {
		labels := containerLabels(info)

		for _, status := range healthStatuses {
			ch <- prometheus.MustNewConstMetric(
				healthStatusDesc.Desc(labelsWith(labels, "status", status)),
				prometheus.GaugeValue,
				boolToFloat(info.State.Health.Status == status),
			)
		}
		for _, status := range containerStatuses {
			ch <- prometheus.MustNewConstMetric(
				statusDesc.Desc(labelsWith(labels, "status", status)),
				prometheus.GaugeValue,
				boolToFloat(info.State.Status == status),
			)
		}

		ch <- prometheus.MustNewConstMetric(oomkilledDesc.Desc(labels), prometheus.GaugeValue, boolToFloat(info.State.OOMKilled))

		startedAt, err := time.Parse(time.RFC3339Nano, info.State.StartedAt)
		errCheck(err)
		finishedAt, err := time.Parse(time.RFC3339Nano, info.State.FinishedAt)
		errCheck(err)

		ch <- prometheus.MustNewConstMetric(startedatDesc.Desc(labels), prometheus.GaugeValue, float64(startedAt.Unix()))
		ch <- prometheus.MustNewConstMetric(finishedatDesc.Desc(labels), prometheus.GaugeValue, float64(finishedAt.Unix()))
		ch <- prometheus.MustNewConstMetric(restartcountDesc.Desc(labels), prometheus.GaugeValue, float64(info.RestartCount))
	}
}

// collectContainer lists all containers (including stopped ones), inspects each
// of them and replaces the cache with the results. Any client error terminates
// the process through errCheck.
func (c *dockerHealthCollector) collectContainer() {
	ctx := context.Background()

	containers, err := c.containerClient.ContainerList(ctx, container.ListOptions{All: true})
	errCheck(err)

	cache := make([]types.ContainerJSON, 0, len(containers))
	for _, ct := range containers {
		info, err := c.containerClient.ContainerInspect(ctx, ct.ID)
		errCheck(err)

		// Fill in the defaults BEFORE storing the value: Config is a field of
		// the struct value itself, so setting it after append would only change
		// the local copy and leave a nil Config in the cache.
		if info.Config == nil {
			info.Config = &container.Config{Labels: map[string]string{}}
		}
		if info.State.Health == nil {
			info.State.Health = &types.Health{Status: "none"}
		}
		cache = append(cache, info)
	}
	c.containerInfoCache = cache
}

// logger is the process-wide structured logger; it is set up in init.
var logger *slog.Logger

// errCheck logs err and exits the process with status 1 when err is non-nil.
func errCheck(err error) {
	if err != nil {
		logger.Error("error occurred", "err", err)
		os.Exit(1)
	}
}

// Define flags.
var (
	address = flag.String("listen-address", ":8080", "The address to listen on for HTTP requests.")
)

// init creates the JSON logger writing to stdout and registers the build info
// collector with the default Prometheus registry.
func init() {
	opts := &slog.HandlerOptions{
		Level: slog.LevelInfo,
	}
	logger = slog.New(slog.NewJSONHandler(os.Stdout, opts))
	prometheus.MustRegister(prometheus.NewBuildInfoCollector())
}

// main connects to the Docker daemon configured through the environment,
// registers the collector, serves the index page, /-/healthy and /metrics on
// the listen address, and shuts the server down on SIGTERM or interrupt.
func main() {
	flag.Parse()

	// NewClientWithOpts(FromEnv) is the replacement for the deprecated
	// NewEnvClient; the variable is named cli so it does not shadow the package.
	cli, err := client.NewClientWithOpts(client.FromEnv)
	errCheck(err)
	defer cli.Close()

	_, err = cli.Ping(context.Background())
	errCheck(err)

	prometheus.MustRegister(&dockerHealthCollector{containerClient: cli})

	mux := http.NewServeMux()
	mux.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		// NOTE(review): the page body is reproduced exactly as it appears in
		// the file; any markup it once had may have been lost — confirm.
		fmt.Fprint(w, "\n\ndocker state exporter\n\n")
	})
	mux.HandleFunc("/-/healthy", func(w http.ResponseWriter, r *http.Request) {
		fmt.Fprint(w, "up")
	})
	mux.Handle("/metrics", promhttp.HandlerFor(
		prometheus.DefaultGatherer,
		promhttp.HandlerOpts{EnableOpenMetrics: true}))

	logger.Info("Server listening", "address", *address)
	server := &http.Server{
		Addr:              *address,
		Handler:           mux,
		ReadHeaderTimeout: readHeaderTimeout,
	}

	go func() {
		// Use a goroutine-local error so this does not write to main's err.
		if err := server.ListenAndServe(); err != http.ErrServerClosed {
			errCheck(err)
		}
	}()

	quit := make(chan os.Signal, 1)
	signal.Notify(quit, syscall.SIGTERM, os.Interrupt)
	<-quit
	logger.Info("Server shutting down")

	ctx, cancel := context.WithTimeout(context.Background(), shutdownTimeout)
	defer cancel()
	if err := server.Shutdown(ctx); err != nil {
		logger.Error("Failed to gracefully shutdown", "err", err)
	}
	logger.Info("Server shutdown")
}