173 lines
5.5 KiB
Go
173 lines
5.5 KiB
Go
package main
|
|
|
|
import (
|
|
"fmt"
|
|
"strings"
|
|
)
|
|
|
|
// scoreDrive computes a drive's risk score, recommendation, and the reason
|
|
// string behind them. The scoring rules are deliberate:
|
|
//
|
|
// - Only real, drive-attributable defects add meaningful score.
|
|
// - Missing or unreadable data is never treated as a failure (no points).
|
|
// - Wear and age are graded to nudge toward planned replacement.
|
|
//
|
|
// The score maps to a recommendation:
|
|
//
|
|
// >= 100 -> REPLACE_NOW (hard defect: drive is failing or failed)
|
|
// >= 50 -> REPLACE_SOON (serious wear or accumulating defects)
|
|
// >= 20 -> MONITOR (early warning signs)
|
|
// < 20 -> OK
|
|
//
|
|
// A drive with no SMART data and no controller red flags scores NO_DATA, meaning
|
|
// re-collect rather than replace.
|
|
func scoreDrive(d *Drive) (int, string, string) {
|
|
score := 0
|
|
var reasons []string
|
|
add := func(pts int, msg string) {
|
|
score += pts
|
|
reasons = append(reasons, msg)
|
|
}
|
|
min := func(a, b int) int {
|
|
if a < b {
|
|
return a
|
|
}
|
|
return b
|
|
}
|
|
|
|
// ---- Hard physical defects (drive-attributable) ----
|
|
if realloc := iv(d.Reallocated); realloc > 0 {
|
|
pts := min(40+realloc*5, 100)
|
|
add(pts, fmt.Sprintf("reallocated=%d(+%d)", realloc, pts))
|
|
}
|
|
if pending := iv(d.Pending); pending > 0 {
|
|
pts := min(50+pending*5, 100)
|
|
add(pts, fmt.Sprintf("pending=%d(+%d)", pending, pts))
|
|
}
|
|
if uncorr := iv(d.Uncorrectable); uncorr > 0 {
|
|
pts := min(60+uncorr*5, 100)
|
|
add(pts, fmt.Sprintf("uncorrectable=%d(+%d)", uncorr, pts))
|
|
}
|
|
if reported := iv(d.ReportedUncorrect); reported > 0 {
|
|
pts := min(reported*10, 60)
|
|
add(pts, fmt.Sprintf("reported_uncorrect=%d(+%d)", reported, pts))
|
|
}
|
|
if e2e := iv(d.EndToEnd); e2e > 0 {
|
|
pts := min(e2e*20, 80)
|
|
add(pts, fmt.Sprintf("end_to_end_err=%d(+%d)", e2e, pts))
|
|
}
|
|
if badblk := iv(d.RuntimeBadBlocks); badblk > 0 {
|
|
pts := min(badblk*5, 40)
|
|
add(pts, fmt.Sprintf("runtime_bad_blocks=%d(+%d)", badblk, pts))
|
|
}
|
|
|
|
// ---- SMART self-assessment: only penalize an EXPLICIT failure ----
|
|
if d.SmartHealth == "FAILED" {
|
|
add(100, "SMART_health=FAILED(+100)")
|
|
}
|
|
|
|
// ---- SATA link quality (cabling/backplane, not the NAND) ----
|
|
if crc := iv(d.UdmaCrc); crc > 0 {
|
|
pts := min(crc*3, 25)
|
|
add(pts, fmt.Sprintf("udma_crc=%d(+%d)", crc, pts))
|
|
}
|
|
|
|
// ---- RAID-controller signals ----
|
|
if d.PredictiveFailureCtrl > 0 {
|
|
add(70, "ctrl_predictive_failure(+70)")
|
|
}
|
|
if d.SmartAlertCtrl {
|
|
add(50, "ctrl_smart_alert(+50)")
|
|
}
|
|
// Penalize a controller-reported state only when it is not a healthy one.
|
|
// MegaCLI spells these out ("Online, Spun Up", "Hotspare") while storcli
|
|
// abbreviates ("Onln", "GHS"/"DHS" for hot spares, "JBOD"); all are fine.
|
|
fw := strings.ToLower(d.FwState)
|
|
fwHealthy := fw == "" ||
|
|
strings.Contains(fw, "online") || strings.Contains(fw, "onln") ||
|
|
strings.Contains(fw, "hotspare") || strings.Contains(fw, "ghs") ||
|
|
strings.Contains(fw, "dhs") || strings.Contains(fw, "jbod")
|
|
if !fwHealthy {
|
|
add(40, fmt.Sprintf("fw_state=%s(+40)", d.FwState))
|
|
}
|
|
|
|
// MegaCLI/storcli media errors: soft signal, graded gently and capped.
|
|
me := d.MediaErrCtrl
|
|
switch {
|
|
case me >= 100:
|
|
add(30, fmt.Sprintf("ctrl_media_errors=%d(+30)", me))
|
|
case me >= 20:
|
|
add(15, fmt.Sprintf("ctrl_media_errors=%d(+15)", me))
|
|
case me > 0:
|
|
add(5, fmt.Sprintf("ctrl_media_errors=%d(+5)", me))
|
|
}
|
|
|
|
// ---- NVMe critical warning bitmask (any bit set is a real alert) ----
|
|
if d.NvmeCriticalWarning != nil && *d.NvmeCriticalWarning > 0 {
|
|
add(60, fmt.Sprintf("nvme_critical_warning=0x%02x(+60)", *d.NvmeCriticalWarning))
|
|
}
|
|
// NVMe spare below threshold -> reserve exhaustion.
|
|
if d.NvmeAvailSpare != nil && d.NvmeAvailSpareThresh != nil &&
|
|
*d.NvmeAvailSpare <= *d.NvmeAvailSpareThresh {
|
|
add(40, fmt.Sprintf("nvme_avail_spare<=thresh(%d<=%d)(+40)",
|
|
*d.NvmeAvailSpare, *d.NvmeAvailSpareThresh))
|
|
}
|
|
|
|
// ---- Wear (graded; only meaningful with real SMART data) ----
|
|
if d.WearPctConsumed != nil {
|
|
wc := *d.WearPctConsumed
|
|
switch {
|
|
case wc >= 95:
|
|
add(80, fmt.Sprintf("wear_consumed=%d%%(+80)", wc))
|
|
case wc >= 90:
|
|
add(55, fmt.Sprintf("wear_consumed=%d%%(+55)", wc))
|
|
case wc >= 80:
|
|
add(30, fmt.Sprintf("wear_consumed=%d%%(+30)", wc))
|
|
case wc >= 70:
|
|
add(15, fmt.Sprintf("wear_consumed=%d%%(+15)", wc))
|
|
case wc >= 60:
|
|
add(8, fmt.Sprintf("wear_consumed=%d%%(+8)", wc))
|
|
}
|
|
}
|
|
|
|
// ---- Reserve-block exhaustion (Micron ID180 VALUE -> remaining %) ----
|
|
if d.UnusedReservePct != nil && *d.UnusedReservePct <= 10 {
|
|
add(30, fmt.Sprintf("reserve_blocks_low(val=%d)(+30)", *d.UnusedReservePct))
|
|
}
|
|
|
|
// ---- Age (gentle nudge only) ----
|
|
hours := iv(d.PowerOnHours)
|
|
switch {
|
|
case hours >= 61320: // Older than 7 years.
|
|
add(15, fmt.Sprintf("age=%dh(+15)", hours))
|
|
case hours >= 52560: // Older than 6 years.
|
|
add(8, fmt.Sprintf("age=%dh(+8)", hours))
|
|
case hours >= 43800: // Older than 5 years.
|
|
add(4, fmt.Sprintf("age=%dh(+4)", hours))
|
|
}
|
|
|
|
// ---- Decide recommendation ----
|
|
// NO_DATA only when nothing observed the drive: no SMART, no controller error
|
|
// signals, and no controller state. A controller-only drive (e.g. NVMe behind
|
|
// a PERC) reports a FwState, so it is scored on controller evidence instead.
|
|
if !d.HaveSmart && me == 0 && d.PredictiveFailureCtrl == 0 && !d.SmartAlertCtrl && d.FwState == "" {
|
|
return 0, "NO_DATA", "smartctl returned no usable SMART data; re-collect"
|
|
}
|
|
|
|
var rec string
|
|
switch {
|
|
case score >= 100:
|
|
rec = "REPLACE_NOW"
|
|
case score >= 50:
|
|
rec = "REPLACE_SOON"
|
|
case score >= 20:
|
|
rec = "MONITOR"
|
|
default:
|
|
rec = "OK"
|
|
}
|
|
|
|
if len(reasons) == 0 {
|
|
return score, rec, "no defects detected"
|
|
}
|
|
return score, rec, strings.Join(reasons, "; ")
|
|
}
|