drive-health-metrics/score.go
James Coleman ddafa90a02
Some checks failed
Go package / build (push) Has been cancelled
first commit
2026-06-22 17:16:34 -05:00

173 lines
5.5 KiB
Go

package main
import (
"fmt"
"strings"
)
// scoreDrive turns the health signals collected for one drive into a numeric
// risk score, a recommendation, and a human-readable explanation.
//
// Scoring policy:
//
//   - Defects attributable to the drive itself carry the most weight.
//   - Missing or unreadable data is never counted as a failure (no points).
//   - Wear and age add graded points to nudge toward planned replacement.
//
// The accumulated score selects the recommendation:
//
//	>= 100  REPLACE_NOW   (hard defect: drive is failing or failed)
//	>= 50   REPLACE_SOON  (serious wear or accumulating defects)
//	>= 20   MONITOR       (early warning signs)
//	<  20   OK
//
// A drive with no SMART data, no controller error signals, and no controller
// firmware state is reported as (0, "NO_DATA", ...) regardless of any points
// gathered, meaning "re-collect" rather than "replace".
//
// The third return value lists every contributing signal in the order it was
// checked, joined by "; ", or "no defects detected" when nothing scored.
func scoreDrive(d *Drive) (int, string, string) {
	total := 0
	var notes []string

	// add records one contributing signal: its points and its explanation.
	add := func(points int, note string) {
		total += points
		notes = append(notes, note)
	}

	// graded scores a counter-style attribute: nothing when the count is not
	// positive, otherwise base + count*step clamped to ceiling. format must
	// take the count and the awarded points, in that order.
	graded := func(count, base, step, ceiling int, format string) {
		if count <= 0 {
			return
		}
		points := base + count*step
		if points > ceiling {
			points = ceiling
		}
		add(points, fmt.Sprintf(format, count, points))
	}

	// ---- Hard physical defects (drive-attributable) ----
	graded(iv(d.Reallocated), 40, 5, 100, "reallocated=%d(+%d)")
	graded(iv(d.Pending), 50, 5, 100, "pending=%d(+%d)")
	graded(iv(d.Uncorrectable), 60, 5, 100, "uncorrectable=%d(+%d)")
	graded(iv(d.ReportedUncorrect), 0, 10, 60, "reported_uncorrect=%d(+%d)")
	graded(iv(d.EndToEnd), 0, 20, 80, "end_to_end_err=%d(+%d)")
	graded(iv(d.RuntimeBadBlocks), 0, 5, 40, "runtime_bad_blocks=%d(+%d)")

	// ---- SMART self-assessment: only an EXPLICIT failure is penalized ----
	if d.SmartHealth == "FAILED" {
		add(100, "SMART_health=FAILED(+100)")
	}

	// ---- SATA link quality (cabling/backplane, not the NAND) ----
	graded(iv(d.UdmaCrc), 0, 3, 25, "udma_crc=%d(+%d)")

	// ---- RAID-controller signals ----
	if d.PredictiveFailureCtrl > 0 {
		add(70, "ctrl_predictive_failure(+70)")
	}
	if d.SmartAlertCtrl {
		add(50, "ctrl_smart_alert(+50)")
	}

	// A controller-reported firmware state costs points only when it is
	// non-empty and matches none of the healthy markers. MegaCLI spells states
	// out ("Online, Spun Up", "Hotspare") while storcli abbreviates ("Onln",
	// "GHS"/"DHS" for hot spares, "JBOD"); the match is a case-insensitive
	// substring test so both spellings are accepted.
	// NOTE(review): every other non-empty state is penalized, which may include
	// benign ones (e.g. an unconfigured-good drive) — confirm that is intended.
	state := strings.ToLower(d.FwState)
	penalizeState := state != ""
	for _, marker := range []string{"online", "onln", "hotspare", "ghs", "dhs", "jbod"} {
		if strings.Contains(state, marker) {
			penalizeState = false
			break
		}
	}
	if penalizeState {
		// Report the state exactly as the controller printed it.
		add(40, fmt.Sprintf("fw_state=%s(+40)", d.FwState))
	}

	// Controller media errors: a soft signal, graded gently in three tiers.
	mediaErrs := d.MediaErrCtrl
	if mediaErrs >= 100 {
		add(30, fmt.Sprintf("ctrl_media_errors=%d(+30)", mediaErrs))
	} else if mediaErrs >= 20 {
		add(15, fmt.Sprintf("ctrl_media_errors=%d(+15)", mediaErrs))
	} else if mediaErrs > 0 {
		add(5, fmt.Sprintf("ctrl_media_errors=%d(+5)", mediaErrs))
	}

	// ---- NVMe critical warning bitmask (any bit set is a real alert) ----
	if cw := d.NvmeCriticalWarning; cw != nil && *cw > 0 {
		add(60, fmt.Sprintf("nvme_critical_warning=0x%02x(+60)", *cw))
	}

	// NVMe available spare at or below its threshold -> reserve exhaustion.
	// Both values must be present for the comparison to be made.
	if spare, thresh := d.NvmeAvailSpare, d.NvmeAvailSpareThresh; spare != nil && thresh != nil {
		if *spare <= *thresh {
			add(40, fmt.Sprintf("nvme_avail_spare<=thresh(%d<=%d)(+40)",
				*spare, *thresh))
		}
	}

	// ---- Wear (graded; skipped entirely when no wear figure is present) ----
	if d.WearPctConsumed != nil {
		worn := *d.WearPctConsumed
		if worn >= 95 {
			add(80, fmt.Sprintf("wear_consumed=%d%%(+80)", worn))
		} else if worn >= 90 {
			add(55, fmt.Sprintf("wear_consumed=%d%%(+55)", worn))
		} else if worn >= 80 {
			add(30, fmt.Sprintf("wear_consumed=%d%%(+30)", worn))
		} else if worn >= 70 {
			add(15, fmt.Sprintf("wear_consumed=%d%%(+15)", worn))
		} else if worn >= 60 {
			add(8, fmt.Sprintf("wear_consumed=%d%%(+8)", worn))
		}
	}

	// ---- Reserve-block exhaustion (Micron ID180 VALUE -> remaining %) ----
	if reserve := d.UnusedReservePct; reserve != nil && *reserve <= 10 {
		add(30, fmt.Sprintf("reserve_blocks_low(val=%d)(+30)", *reserve))
	}

	// ---- Age (gentle nudge only); thresholds are multiples of 8760 h ----
	poweredOn := iv(d.PowerOnHours)
	if poweredOn >= 61320 { // 7 years or more of power-on time.
		add(15, fmt.Sprintf("age=%dh(+15)", poweredOn))
	} else if poweredOn >= 52560 { // 6 years or more.
		add(8, fmt.Sprintf("age=%dh(+8)", poweredOn))
	} else if poweredOn >= 43800 { // 5 years or more.
		add(4, fmt.Sprintf("age=%dh(+4)", poweredOn))
	}

	// ---- Decide recommendation ----
	// NO_DATA applies only when nothing observed the drive: no SMART, no
	// controller error signals, and no controller state. A controller-only
	// drive (e.g. NVMe behind a PERC) reports a FwState, so it is scored on
	// controller evidence instead.
	observed := d.HaveSmart ||
		mediaErrs != 0 ||
		d.PredictiveFailureCtrl != 0 ||
		d.SmartAlertCtrl ||
		d.FwState != ""
	if !observed {
		return 0, "NO_DATA", "smartctl returned no usable SMART data; re-collect"
	}

	verdict := "OK"
	if total >= 100 {
		verdict = "REPLACE_NOW"
	} else if total >= 50 {
		verdict = "REPLACE_SOON"
	} else if total >= 20 {
		verdict = "MONITOR"
	}

	explanation := "no defects detected"
	if len(notes) > 0 {
		explanation = strings.Join(notes, "; ")
	}
	return total, verdict, explanation
}