// Command drive-health-metrics collects per-drive SMART health from every physical
// drive on a host — direct SATA/SAS, NVMe, and drives hidden behind a RAID
// controller (MegaCLI / storcli / perccli) — scores each drive, and exports the
// result. By default it emits CSV or InfluxDB line protocol once and exits;
// with -server it runs as a service exposing a Prometheus endpoint and pushing to
// InfluxDB/Kafka on a schedule.
package main
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"log"
|
|
"os"
|
|
"os/signal"
|
|
"syscall"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
)
|
|
|
|
// Basic, compile-time application identity.
const (
	// serviceName is the program's name.
	serviceName = "drive-health-metrics"

	// serviceDescription is a one-line summary of what the program does.
	serviceDescription = "Collects and exports per-drive SMART health metrics"

	// namespace is the Prometheus metric prefix; it matches the InfluxDB
	// measurement name.
	namespace = "drive_health"
)
|
|
|
|
// App holds the shared application state: parsed flags, configuration, the
|
|
// Prometheus registry, and the exporter/outputs.
|
|
type App struct {
|
|
flags *Flags
|
|
config *Config
|
|
registry *prometheus.Registry
|
|
driveExporter *DriveExporter
|
|
httpOutput *HTTPOutput
|
|
influxOutput *InfluxOutput
|
|
}
|
|
|
|
// app is the global application state.
|
|
var app *App
|
|
|
|
func main() {
|
|
app = new(App)
|
|
app.ParseFlags()
|
|
app.ReadConfig()
|
|
|
|
switch {
|
|
case app.flags.Server:
|
|
runServer()
|
|
default:
|
|
runOneShot()
|
|
}
|
|
}
|
|
|
|
// runOneShot collects once and writes CSV or InfluxDB line protocol to stdout.
|
|
func runOneShot() {
|
|
switch app.flags.Format {
|
|
case "csv", "influx":
|
|
default:
|
|
fmt.Fprintf(os.Stderr, "invalid --format %q (want csv|influx)\n", app.flags.Format)
|
|
os.Exit(2)
|
|
}
|
|
|
|
drives, tsNs := collect()
|
|
if len(drives) == 0 {
|
|
fmt.Fprintln(os.Stderr, "WARNING: no drive records collected")
|
|
return
|
|
}
|
|
|
|
if app.flags.Format == "csv" {
|
|
fmt.Println(recordsToCSV(drives))
|
|
} else {
|
|
fmt.Println(recordsToInflux(drives, tsNs))
|
|
}
|
|
}
|
|
|
|
// runServer runs the long-lived service: a Prometheus HTTP endpoint plus the
|
|
// scheduled InfluxDB output, reloading configuration on SIGHUP and shutting down
|
|
// on SIGINT/SIGTERM.
|
|
func runServer() {
|
|
// Build the exporter and registry.
|
|
app.driveExporter = NewDriveExporter()
|
|
reg := prometheus.NewRegistry()
|
|
reg.MustRegister(app.driveExporter)
|
|
app.registry = reg
|
|
|
|
// Build the outputs.
|
|
app.httpOutput = NewHTTPOutput()
|
|
app.influxOutput = NewInfluxOutput()
|
|
|
|
if !app.httpOutput.OutputEnabled() && !app.influxOutput.OutputEnabled() {
|
|
log.Fatalln("No output services are enabled (set http_output.enabled or configure influx_output).")
|
|
}
|
|
|
|
// Monitor signals.
|
|
c := make(chan os.Signal, 1)
|
|
signal.Notify(c, syscall.SIGINT, syscall.SIGTERM, syscall.SIGHUP)
|
|
|
|
// Each outer iteration owns one background context for the output services;
|
|
// the inner loop applies SIGHUP reloads in place and only breaks out (to
|
|
// recreate the context and restart the services) when a config change
|
|
// requires it.
|
|
for {
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
go app.httpOutput.Start(ctx)
|
|
go app.influxOutput.Start(ctx)
|
|
|
|
restart := false
|
|
for !restart {
|
|
sig := <-c
|
|
if sig != syscall.SIGHUP {
|
|
// Termination/interruption: stop the services and exit.
|
|
cancel()
|
|
return
|
|
}
|
|
|
|
log.Println("Reloading configurations")
|
|
oldConfig := app.config
|
|
influxWasEnabled := app.influxOutput.OutputEnabled()
|
|
|
|
app.ReadConfig()
|
|
app.httpOutput.Reload()
|
|
app.influxOutput.Reload()
|
|
|
|
httpNeedsRestart := oldConfig.HTTP.BindAddr != app.config.HTTP.BindAddr ||
|
|
oldConfig.HTTP.Port != app.config.HTTP.Port ||
|
|
oldConfig.HTTP.Enabled != app.config.HTTP.Enabled
|
|
influxNeedsRestart := app.influxOutput.OutputEnabled() != influxWasEnabled ||
|
|
oldConfig.Influx.Frequency != app.config.Influx.Frequency
|
|
restart = httpNeedsRestart || influxNeedsRestart
|
|
}
|
|
|
|
// A restart-worthy change occurred: stop the current services and loop
|
|
// to start them on a fresh context.
|
|
cancel()
|
|
}
|
|
}
|