drive-health-metrics/collect.go
James Coleman ddafa90a02
Some checks failed
Go package / build (push) Has been cancelled
first commit
2026-06-22 17:16:34 -05:00

189 lines
5.2 KiB
Go

package main
import (
"fmt"
"sort"
"strings"
"time"
)
// collect discovers every drive, queries SMART, attaches controller data, and
// scores it.
//
// It returns the finished drive records together with the collection time as
// nanoseconds since the Unix epoch. The nanosecond value is derived from whole
// seconds, so it carries the same one-second resolution as each record's
// CollectedAt string.
func collect() ([]*Drive, int64) {
	host := hostname()
	if app != nil && app.config != nil && app.config.Hostname != "" {
		// A configured hostname overrides the detected one.
		host = app.config.Hostname
	}

	// Read the clock exactly once so the formatted stamp and the numeric stamp
	// always describe the same second, even when the call lands on a second
	// boundary.
	now := time.Now().UTC()
	collectedAt := now.Format("2006-01-02T15:04:05Z")
	tsNs := now.Unix() * 1e9

	st := newSmartTool()
	ctrl := controllerIndex()
	devices := st.scan()

	// Controller IDs in sorted order. Map iteration order is random, so sorting
	// keeps both the fallback probe list and the controller-only output
	// deterministic. Built once and shared by both users below.
	ctrlIDs := make([]string, 0, len(ctrl))
	for id := range ctrl {
		ctrlIDs = append(ctrlIDs, id)
	}
	sort.Strings(ctrlIDs)

	// Fallback: no scan-open results but we do have controller drives -> probe
	// a base device by megaraid index.
	//
	// NOTE(review): this queues one probe per (id, dtype template) pair and the
	// loop below turns every non-pseudo probe into a Drive, so with more than
	// one entry in megaraidDtypes a controller drive is emitted once per
	// template. Confirm that this is intended or de-duplicated downstream.
	if len(devices) == 0 && len(ctrlIDs) > 0 {
		base := findBaseDev()
		for _, id := range ctrlIDs {
			for _, tmpl := range megaraidDtypes {
				devices = append(devices, scanned{
					path: base, dtype: fmt.Sprintf(tmpl, id), megaraidN: id,
				})
			}
		}
	}

	var drives []*Drive
	matched := map[string]bool{} // Controller IDs covered by a smartctl device.
	for _, sc := range devices {
		// NOTE(review): an ID counts as covered as soon as a device names it,
		// before the SMART query result is known.
		if sc.megaraidN != "" {
			matched[sc.megaraidN] = true
		}
		d := &Drive{
			CollectedAt: collectedAt,
			Hostname:    host,
			DeviceID:    sc.megaraidN,
		}
		ok := st.querySmart(sc.path, sc.dtype, d)
		// Skip iSCSI LUNs and RAID virtual disks; they are not physical drives.
		if isPseudoDevice(d) {
			continue
		}
		// Attach controller-side data by megaraid index == DeviceID.
		if sc.megaraidN != "" {
			if cd, found := ctrl[sc.megaraidN]; found {
				applyController(d, cd)
			}
		}
		// Determine whether real SMART attribute data was obtained: the query
		// must have succeeded, a model must be known, and at least one health
		// indicator must be present.
		d.HaveSmart = ok && d.Model != "" && (d.PowerOnHours != nil ||
			d.WearPctRemaining != nil ||
			d.SmartHealth == "PASSED" || d.SmartHealth == "FAILED" ||
			d.SmartHealth == "PASSED_BY_ATTR")
		finalizeDerived(d)
		drives = append(drives, d)
	}

	// Emit controller-only drives: physical drives the controller reports but
	// smartctl cannot reach (e.g. NVMe behind a PERC). Health comes entirely
	// from the controller (Status, predictive-failure, media/other counters).
	for _, id := range ctrlIDs {
		if matched[id] {
			continue
		}
		d := &Drive{CollectedAt: collectedAt, Hostname: host, DeviceID: id}
		applyController(d, ctrl[id])
		if isPseudoDevice(d) {
			continue
		}
		d.HaveSmart = false
		finalizeDerived(d)
		drives = append(drives, d)
	}
	return drives, tsNs
}
// applyController copies the controller's view of a drive onto d.
//
// Location, error counters, predictive-failure, SMART-alert and firmware-state
// fields are always overwritten from cd. Identity fields (serial, model,
// firmware), rotation and temperature are only filled in when d does not
// already carry a value, so data obtained earlier (for example from smartctl)
// is kept.
func applyController(d *Drive, cd ctrlDrive) {
	// Controller-owned fields: unconditional copy.
	d.Enclosure, d.Slot = cd.Enclosure, cd.Slot
	d.MediaErrCtrl, d.OtherErrCtrl = cd.MediaErr, cd.OtherErr
	d.PredictiveFailureCtrl = cd.Predictive
	d.SmartAlertCtrl = cd.SmartAlert
	d.FwState = cd.FwState

	// fillBlank stores val in *dst only when *dst is still empty.
	fillBlank := func(dst *string, val string) {
		if *dst == "" {
			*dst = val
		}
	}

	// Identity fallback. Structured fields (perccli2) win whenever any of them
	// is present; the legacy single-line Inquiry string is consulted only when
	// all three structured fields are empty.
	structured := cd.Model != "" || cd.Serial != "" || cd.Firmware != ""
	switch {
	case structured:
		fillBlank(&d.Serial, cd.Serial)
		fillBlank(&d.Model, cd.Model)
		fillBlank(&d.Firmware, cd.Firmware)
	case cd.Inquiry != "":
		// Legacy MegaCLI "Inquiry Data" is "<serial> <model...> <firmware>" on
		// one line. The model may itself contain spaces, so anchor on the first
		// token (serial) and the last token (firmware) and join whatever lies
		// between them as the model.
		tokens := strings.Fields(cd.Inquiry)
		last := len(tokens) - 1
		if last >= 0 {
			fillBlank(&d.Serial, tokens[0])
		}
		if last >= 1 {
			fillBlank(&d.Firmware, tokens[last])
		}
		if last >= 2 {
			fillBlank(&d.Model, strings.Join(tokens[1:last], " "))
		}
	}

	// Rotation and temperature: keep existing values, otherwise take the
	// controller's.
	if d.Rotation == "" {
		d.Rotation = cd.Rotation
	}
	if d.TempC == nil {
		d.TempC = cd.TempC
	}
}
// finalizeDerived fills in the values that are computed from other fields of
// d: the aggregate defect count, the power-on time in years, and the risk
// score with its recommendation and reasons.
func finalizeDerived(d *Drive) {
	// Sum the drive-attributable defect counters. DefectTotal is left unset
	// when none of them was readable, so rows without data stay blank instead
	// of showing a misleading 0.
	var (
		total int
		seen  bool
	)
	for _, counter := range [...]*int{
		d.Reallocated, d.Pending, d.Uncorrectable,
		d.ReportedUncorrect, d.RuntimeBadBlocks, d.EndToEnd,
	} {
		if counter == nil {
			continue
		}
		seen = true
		total += *counter
	}
	if seen {
		d.DefectTotal = pInt(total)
	}

	// Power-on years (8760 hours per year), rounded to two decimals. Only set
	// for a known, positive hour count.
	if hrs := d.PowerOnHours; hrs != nil && *hrs > 0 {
		years := float64(*hrs) / 8760.0
		hundredths := int(years*100 + 0.5)
		d.PowerOnYears = pF(float64(hundredths) / 100)
	}

	// Scoring runs last so it sees the derived fields set above.
	d.RiskScore, d.Recommendation, d.RiskReasons = scoreDrive(d)
}
// findBaseDev picks the block device used to anchor the megaraid passthrough
// fallback probe. It walks the output of `lsblk -dno NAME` line by line and
// returns the first name, prefixed with /dev/, that is non-empty, does not
// contain "loop", and does not start with "md". When no line qualifies it
// falls back to /dev/sda.
func findBaseDev() string {
	const fallback = "/dev/sda"

	listing := run("lsblk", "-dno", "NAME")
	for _, raw := range strings.Split(listing, "\n") {
		dev := strings.TrimSpace(raw)
		switch {
		case dev == "":
			continue // Blank line.
		case strings.Contains(dev, "loop"):
			continue // Loop device.
		case strings.HasPrefix(dev, "md"):
			continue // Software-RAID (md) device.
		}
		return "/dev/" + dev
	}
	return fallback
}