// Command drive-health-metrics collects per-drive SMART health from every physical // drive on a host — direct SATA/SAS, NVMe, and drives hidden behind a RAID // controller (MegaCLI / storcli / perccli) — scores each drive, and exports the // result. By default it emits CSV or InfluxDB line protocol once and exits; // with -server it runs as a service exposing a Prometheus endpoint and pushing to // InfluxDB/Kafka on a schedule. package main import ( "context" "fmt" "log" "os" "os/signal" "syscall" "github.com/prometheus/client_golang/prometheus" ) // Basic application info. namespace is the Prometheus metric prefix and matches // the InfluxDB measurement name. const ( serviceName = "drive-health-metrics" serviceDescription = "Collects and exports per-drive SMART health metrics" namespace = "drive_health" ) // App holds the shared application state: parsed flags, configuration, the // Prometheus registry, and the exporter/outputs. type App struct { flags *Flags config *Config registry *prometheus.Registry driveExporter *DriveExporter httpOutput *HTTPOutput influxOutput *InfluxOutput } // app is the global application state. var app *App func main() { app = new(App) app.ParseFlags() app.ReadConfig() switch { case app.flags.Server: runServer() default: runOneShot() } } // runOneShot collects once and writes CSV or InfluxDB line protocol to stdout. func runOneShot() { switch app.flags.Format { case "csv", "influx": default: fmt.Fprintf(os.Stderr, "invalid --format %q (want csv|influx)\n", app.flags.Format) os.Exit(2) } drives, tsNs := collect() if len(drives) == 0 { fmt.Fprintln(os.Stderr, "WARNING: no drive records collected") return } if app.flags.Format == "csv" { fmt.Println(recordsToCSV(drives)) } else { fmt.Println(recordsToInflux(drives, tsNs)) } } // runServer runs the long-lived service: a Prometheus HTTP endpoint plus the // scheduled InfluxDB output, reloading configuration on SIGHUP and shutting down // on SIGINT/SIGTERM. func runServer() { // Build the exporter and registry. app.driveExporter = NewDriveExporter() reg := prometheus.NewRegistry() reg.MustRegister(app.driveExporter) app.registry = reg // Build the outputs. app.httpOutput = NewHTTPOutput() app.influxOutput = NewInfluxOutput() if !app.httpOutput.OutputEnabled() && !app.influxOutput.OutputEnabled() { log.Fatalln("No output services are enabled (set http_output.enabled or configure influx_output).") } // Monitor signals. c := make(chan os.Signal, 1) signal.Notify(c, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP) // Each outer iteration owns one background context for the output services; // the inner loop applies SIGHUP reloads in place and only breaks out (to // recreate the context and restart the services) when a config change // requires it. for { ctx, cancel := context.WithCancel(context.Background()) go app.httpOutput.Start(ctx) go app.influxOutput.Start(ctx) restart := false for !restart { sig := <-c if sig != syscall.SIGHUP { // Termination/interruption: stop the services and exit. cancel() return } log.Println("Reloading configurations") oldConfig := app.config influxWasEnabled := app.influxOutput.OutputEnabled() app.ReadConfig() app.httpOutput.Reload() app.influxOutput.Reload() httpNeedsRestart := oldConfig.HTTP.BindAddr != app.config.HTTP.BindAddr || oldConfig.HTTP.Port != app.config.HTTP.Port || oldConfig.HTTP.Enabled != app.config.HTTP.Enabled influxNeedsRestart := app.influxOutput.OutputEnabled() != influxWasEnabled || oldConfig.Influx.Frequency != app.config.Influx.Frequency restart = httpNeedsRestart || influxNeedsRestart } // A restart-worthy change occurred: stop the current services and loop // to start them on a fresh context. cancel() } }