drive-health-metrics/main.go
James Coleman ddafa90a02
Some checks failed
Go package / build (push) Has been cancelled
first commit
2026-06-22 17:16:34 -05:00

137 lines
3.8 KiB
Go

// Command drive-health-metrics collects per-drive SMART health from every physical
// drive on a host — direct SATA/SAS, NVMe, and drives hidden behind a RAID
// controller (MegaCLI / storcli / perccli) — scores each drive, and exports the
// result. By default it emits CSV or InfluxDB line protocol once and exits;
// with -server it runs as a service exposing a Prometheus endpoint and pushing to
// InfluxDB/Kafka on a schedule.
package main
import (
	"context"
	"fmt"
	"log"
	"os"
	"os/signal"
	"sync"
	"syscall"

	"github.com/prometheus/client_golang/prometheus"
)
// Basic application info. namespace is the Prometheus metric prefix and matches
// the InfluxDB measurement name.
const (
// serviceName is the program's name.
serviceName = "drive-health-metrics"
// serviceDescription is a one-line, human-readable summary of the program.
serviceDescription = "Collects and exports per-drive SMART health metrics"
// namespace is the Prometheus metric prefix; it is kept identical to the
// InfluxDB measurement name.
namespace = "drive_health"
)
// App holds the shared application state: parsed flags, configuration, the
// Prometheus registry, and the exporter/outputs.
//
// flags and config are needed in both run modes; the remaining fields are only
// assigned by runServer.
type App struct {
// flags holds the parsed command-line flags (presumably populated by
// ParseFlags — defined outside this view).
flags *Flags
// config holds the loaded configuration (presumably populated by
// ReadConfig — defined outside this view); runServer reads its HTTP and
// Influx sections.
config *Config
// registry is the Prometheus registry that driveExporter is registered with.
registry *prometheus.Registry
// driveExporter is the collector built by NewDriveExporter.
driveExporter *DriveExporter
// httpOutput is the HTTP output built by NewHTTPOutput.
httpOutput *HTTPOutput
// influxOutput is the InfluxDB output built by NewInfluxOutput.
influxOutput *InfluxOutput
}
// app is the global application state. main allocates it before anything else
// runs; runOneShot and runServer read and populate it.
var app *App
func main() {
app = new(App)
app.ParseFlags()
app.ReadConfig()
switch {
case app.flags.Server:
runServer()
default:
runOneShot()
}
}
// runOneShot performs a single collection and prints the result to stdout in
// the format selected by the Format flag ("csv" or "influx").
//
// An unrecognised format is rejected before any collection happens: a message
// is written to stderr and the process exits with status 2. When collection
// yields no records, a warning is written to stderr and nothing is printed.
func runOneShot() {
	format := app.flags.Format
	if format != "csv" && format != "influx" {
		fmt.Fprintf(os.Stderr, "invalid --format %q (want csv|influx)\n", format)
		os.Exit(2)
	}

	records, timestampNs := collect()
	if len(records) == 0 {
		fmt.Fprintln(os.Stderr, "WARNING: no drive records collected")
		return
	}

	// format was validated above, so anything that is not "csv" is "influx".
	switch format {
	case "csv":
		fmt.Println(recordsToCSV(records))
	default:
		fmt.Println(recordsToInflux(records, timestampNs))
	}
}
// runServer runs the long-lived service: a Prometheus HTTP endpoint plus the
// scheduled InfluxDB output, reloading configuration on SIGHUP and shutting down
// on SIGINT/SIGTERM.
//
// The output services run on a cancellable context. Every time that context is
// cancelled — for a restart or for shutdown — runServer waits for both Start
// calls to return before going on, so a restarted HTTP output does not race
// the previous one for the same listen address and shutdown does not abandon
// services that are still stopping.
//
// NOTE(review): this assumes HTTPOutput.Start and InfluxOutput.Start return
// once their context is cancelled — confirm against their implementations.
//
// It exits the process via log.Fatalln if neither output is enabled at startup.
func runServer() {
	// Build the exporter and registry.
	app.driveExporter = NewDriveExporter()
	reg := prometheus.NewRegistry()
	reg.MustRegister(app.driveExporter)
	app.registry = reg

	// Build the outputs.
	app.httpOutput = NewHTTPOutput()
	app.influxOutput = NewInfluxOutput()
	if !app.httpOutput.OutputEnabled() && !app.influxOutput.OutputEnabled() {
		log.Fatalln("No output services are enabled (set http_output.enabled or configure influx_output).")
	}

	// Monitor signals.
	c := make(chan os.Signal, 1)
	signal.Notify(c, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP)

	// Each outer iteration owns one background context for the output services;
	// the inner loop applies SIGHUP reloads in place and only breaks out (to
	// recreate the context and restart the services) when a config change
	// requires it.
	for {
		ctx, cancel := context.WithCancel(context.Background())

		// Track both service goroutines so they can be awaited after cancel().
		httpOut, influxOut := app.httpOutput, app.influxOutput
		var wg sync.WaitGroup
		wg.Add(2)
		go func() {
			defer wg.Done()
			httpOut.Start(ctx)
		}()
		go func() {
			defer wg.Done()
			influxOut.Start(ctx)
		}()

		restart := false
		for !restart {
			sig := <-c
			if sig != syscall.SIGHUP {
				// Termination/interruption: stop the services and exit.
				// Unregister first so that a second SIGINT/SIGTERM gets the
				// default (terminate) behaviour if a service is slow to stop.
				signal.Stop(c)
				cancel()
				wg.Wait()
				return
			}

			log.Println("Reloading configurations")

			// Snapshot the compared values, not the *Config pointer: if
			// ReadConfig refreshes the struct in place, a retained pointer
			// would alias the new values and no change would ever be seen.
			oldBindAddr := app.config.HTTP.BindAddr
			oldPort := app.config.HTTP.Port
			oldHTTPEnabled := app.config.HTTP.Enabled
			oldFrequency := app.config.Influx.Frequency
			influxWasEnabled := app.influxOutput.OutputEnabled()

			app.ReadConfig()
			app.httpOutput.Reload()
			app.influxOutput.Reload()

			httpNeedsRestart := oldBindAddr != app.config.HTTP.BindAddr ||
				oldPort != app.config.HTTP.Port ||
				oldHTTPEnabled != app.config.HTTP.Enabled
			influxNeedsRestart := app.influxOutput.OutputEnabled() != influxWasEnabled ||
				oldFrequency != app.config.Influx.Frequency
			restart = httpNeedsRestart || influxNeedsRestart
		}

		// A restart-worthy change occurred: stop the current services, wait
		// until they have actually returned, then loop to start them on a
		// fresh context.
		cancel()
		wg.Wait()
	}
}